tulu2-7b-cost-UI-nojudge-5e-7 / trainer_state.json
just1nseo's picture
Model save
bb7c140 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 2430,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 0.439453125,
"learning_rate": 2.05761316872428e-09,
"logits/chosen": -0.12849420309066772,
"logits/rejected": 0.32615596055984497,
"logps/chosen": -277.55615234375,
"logps/rejected": -196.8867950439453,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/margins_max": 0.0,
"rewards/margins_min": 0.0,
"rewards/margins_std": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 0.390625,
"learning_rate": 2.0576131687242796e-08,
"logits/chosen": -0.10881485790014267,
"logits/rejected": -0.0043433839455246925,
"logps/chosen": -201.5467529296875,
"logps/rejected": -227.90283203125,
"loss": 0.6936,
"rewards/accuracies": 0.3888888955116272,
"rewards/chosen": -0.00041578023228794336,
"rewards/margins": -0.0005395316984504461,
"rewards/margins_max": 0.0015237904153764248,
"rewards/margins_min": -0.002602853812277317,
"rewards/margins_std": 0.0029179779812693596,
"rewards/rejected": 0.00012375140795484185,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 0.53515625,
"learning_rate": 4.115226337448559e-08,
"logits/chosen": -0.09926486760377884,
"logits/rejected": 0.07927028834819794,
"logps/chosen": -214.200927734375,
"logps/rejected": -222.8786163330078,
"loss": 0.6928,
"rewards/accuracies": 0.5,
"rewards/chosen": -6.019688589731231e-05,
"rewards/margins": 0.00048257355228997767,
"rewards/margins_max": 0.003266632091253996,
"rewards/margins_min": -0.0023014850448817015,
"rewards/margins_std": 0.003937253262847662,
"rewards/rejected": -0.0005427704309113324,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 0.44140625,
"learning_rate": 6.172839506172839e-08,
"logits/chosen": -0.09111969918012619,
"logits/rejected": 0.19229279458522797,
"logps/chosen": -243.2583465576172,
"logps/rejected": -227.18716430664062,
"loss": 0.6931,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.0003833642113022506,
"rewards/margins": 0.0004754146502818912,
"rewards/margins_max": 0.003324592486023903,
"rewards/margins_min": -0.0023737631272524595,
"rewards/margins_std": 0.004029345698654652,
"rewards/rejected": -9.20505408430472e-05,
"step": 30
},
{
"epoch": 0.02,
"grad_norm": 0.51953125,
"learning_rate": 8.230452674897118e-08,
"logits/chosen": -0.19453440606594086,
"logits/rejected": 0.028327126055955887,
"logps/chosen": -221.5961151123047,
"logps/rejected": -247.38626098632812,
"loss": 0.6931,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.0006131277186796069,
"rewards/margins": -0.00020894096815027297,
"rewards/margins_max": 0.0022117348853498697,
"rewards/margins_min": -0.0026296167634427547,
"rewards/margins_std": 0.0034233524929732084,
"rewards/rejected": -0.0004041867796331644,
"step": 40
},
{
"epoch": 0.02,
"grad_norm": 0.423828125,
"learning_rate": 1.02880658436214e-07,
"logits/chosen": -0.15002524852752686,
"logits/rejected": 0.0392913818359375,
"logps/chosen": -179.68746948242188,
"logps/rejected": -196.04762268066406,
"loss": 0.6933,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.0005380930379033089,
"rewards/margins": -0.0005233940901234746,
"rewards/margins_max": 0.0024810037575662136,
"rewards/margins_min": -0.0035277921706438065,
"rewards/margins_std": 0.004248860292136669,
"rewards/rejected": -1.4698971426696517e-05,
"step": 50
},
{
"epoch": 0.02,
"grad_norm": 0.46484375,
"learning_rate": 1.2345679012345677e-07,
"logits/chosen": -0.10200711339712143,
"logits/rejected": 0.0844399482011795,
"logps/chosen": -211.0288543701172,
"logps/rejected": -226.002685546875,
"loss": 0.693,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.0015867957845330238,
"rewards/margins": -0.0001534456678200513,
"rewards/margins_max": 0.003357082139700651,
"rewards/margins_min": -0.003663973417133093,
"rewards/margins_std": 0.004964635707437992,
"rewards/rejected": -0.0014333500294014812,
"step": 60
},
{
"epoch": 0.03,
"grad_norm": 0.369140625,
"learning_rate": 1.4403292181069958e-07,
"logits/chosen": -0.07794054597616196,
"logits/rejected": 0.18598364293575287,
"logps/chosen": -198.8175048828125,
"logps/rejected": -218.30746459960938,
"loss": 0.6928,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0009691319428384304,
"rewards/margins": 0.000832684978377074,
"rewards/margins_max": 0.002835240215063095,
"rewards/margins_min": -0.001169870374724269,
"rewards/margins_std": 0.0028320408891886473,
"rewards/rejected": -0.0018018169794231653,
"step": 70
},
{
"epoch": 0.03,
"grad_norm": 0.4921875,
"learning_rate": 1.6460905349794237e-07,
"logits/chosen": -0.12472915649414062,
"logits/rejected": 0.1473190039396286,
"logps/chosen": -239.3271942138672,
"logps/rejected": -240.5177459716797,
"loss": 0.6919,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.00019421194156166166,
"rewards/margins": 0.002306972863152623,
"rewards/margins_max": 0.0059133050963282585,
"rewards/margins_min": -0.0012993593700230122,
"rewards/margins_std": 0.005100123584270477,
"rewards/rejected": -0.0021127606742084026,
"step": 80
},
{
"epoch": 0.04,
"grad_norm": 0.458984375,
"learning_rate": 1.8518518518518516e-07,
"logits/chosen": -0.10487590730190277,
"logits/rejected": 0.12480039894580841,
"logps/chosen": -207.2847137451172,
"logps/rejected": -217.23159790039062,
"loss": 0.6927,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.0011818298371508718,
"rewards/margins": 0.00026366618112660944,
"rewards/margins_max": 0.003499214071780443,
"rewards/margins_min": -0.002971881767734885,
"rewards/margins_std": 0.004575755912810564,
"rewards/rejected": -0.0014454961055889726,
"step": 90
},
{
"epoch": 0.04,
"grad_norm": 0.455078125,
"learning_rate": 2.05761316872428e-07,
"logits/chosen": -0.13099896907806396,
"logits/rejected": 0.1086927056312561,
"logps/chosen": -225.9371795654297,
"logps/rejected": -218.25045776367188,
"loss": 0.6922,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.0010145825799554586,
"rewards/margins": 0.0017641137819737196,
"rewards/margins_max": 0.004758741240948439,
"rewards/margins_min": -0.0012305134441703558,
"rewards/margins_std": 0.004235042724758387,
"rewards/rejected": -0.002778696594759822,
"step": 100
},
{
"epoch": 0.05,
"grad_norm": 0.453125,
"learning_rate": 2.2633744855967078e-07,
"logits/chosen": -0.05056118965148926,
"logits/rejected": 0.2104618102312088,
"logps/chosen": -216.330078125,
"logps/rejected": -227.7096405029297,
"loss": 0.6921,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.0010366869391873479,
"rewards/margins": 0.001504565472714603,
"rewards/margins_max": 0.004530596546828747,
"rewards/margins_min": -0.0015214652521535754,
"rewards/margins_std": 0.004279454238712788,
"rewards/rejected": -0.002541252411901951,
"step": 110
},
{
"epoch": 0.05,
"grad_norm": 0.490234375,
"learning_rate": 2.4691358024691354e-07,
"logits/chosen": -0.06995914876461029,
"logits/rejected": 0.1886831820011139,
"logps/chosen": -231.55233764648438,
"logps/rejected": -236.476806640625,
"loss": 0.6918,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.0009716992499306798,
"rewards/margins": 0.0028838925063610077,
"rewards/margins_max": 0.005884943995624781,
"rewards/margins_min": -0.00011715893924701959,
"rewards/margins_std": 0.004244127310812473,
"rewards/rejected": -0.003855592105537653,
"step": 120
},
{
"epoch": 0.05,
"grad_norm": 0.455078125,
"learning_rate": 2.6748971193415635e-07,
"logits/chosen": -0.1631317138671875,
"logits/rejected": 0.07130730152130127,
"logps/chosen": -209.4818115234375,
"logps/rejected": -209.85134887695312,
"loss": 0.6916,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.0010341443121433258,
"rewards/margins": 0.0034868132788687944,
"rewards/margins_max": 0.0071867769584059715,
"rewards/margins_min": -0.0002131500223185867,
"rewards/margins_std": 0.005232538096606731,
"rewards/rejected": -0.004520958289504051,
"step": 130
},
{
"epoch": 0.06,
"grad_norm": 0.4921875,
"learning_rate": 2.8806584362139917e-07,
"logits/chosen": -0.09385956078767776,
"logits/rejected": 0.1125023365020752,
"logps/chosen": -211.3016357421875,
"logps/rejected": -221.1059112548828,
"loss": 0.6914,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.0018074193503707647,
"rewards/margins": 0.0037975527811795473,
"rewards/margins_max": 0.006774452514946461,
"rewards/margins_min": 0.0008206538623198867,
"rewards/margins_std": 0.004209971055388451,
"rewards/rejected": -0.005604972131550312,
"step": 140
},
{
"epoch": 0.06,
"grad_norm": 0.453125,
"learning_rate": 3.086419753086419e-07,
"logits/chosen": -0.15894190967082977,
"logits/rejected": 0.14013248682022095,
"logps/chosen": -206.8115234375,
"logps/rejected": -206.34732055664062,
"loss": 0.691,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.001109520555473864,
"rewards/margins": 0.004203209187835455,
"rewards/margins_max": 0.0073582506738603115,
"rewards/margins_min": 0.001048167236149311,
"rewards/margins_std": 0.004461902659386396,
"rewards/rejected": -0.0053127300925552845,
"step": 150
},
{
"epoch": 0.07,
"grad_norm": 0.3984375,
"learning_rate": 3.2921810699588474e-07,
"logits/chosen": -0.16771957278251648,
"logits/rejected": 0.03995511680841446,
"logps/chosen": -192.62110900878906,
"logps/rejected": -207.176025390625,
"loss": 0.6907,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.0027189915999770164,
"rewards/margins": 0.005160785745829344,
"rewards/margins_max": 0.008886894211173058,
"rewards/margins_min": 0.0014346761163324118,
"rewards/margins_std": 0.00526951439678669,
"rewards/rejected": -0.007879776880145073,
"step": 160
},
{
"epoch": 0.07,
"grad_norm": 0.388671875,
"learning_rate": 3.4979423868312755e-07,
"logits/chosen": -0.12725508213043213,
"logits/rejected": 0.13450825214385986,
"logps/chosen": -229.01223754882812,
"logps/rejected": -225.0189971923828,
"loss": 0.6899,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.0019386851927265525,
"rewards/margins": 0.0065582552924752235,
"rewards/margins_max": 0.010505530051887035,
"rewards/margins_min": 0.0026109800674021244,
"rewards/margins_std": 0.005582289770245552,
"rewards/rejected": -0.00849694013595581,
"step": 170
},
{
"epoch": 0.07,
"grad_norm": 0.43359375,
"learning_rate": 3.703703703703703e-07,
"logits/chosen": -0.11525268852710724,
"logits/rejected": 0.029247064143419266,
"logps/chosen": -222.8477325439453,
"logps/rejected": -272.62139892578125,
"loss": 0.69,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.002625895431265235,
"rewards/margins": 0.0063320668414235115,
"rewards/margins_max": 0.010786894708871841,
"rewards/margins_min": 0.0018772392068058252,
"rewards/margins_std": 0.006300077773630619,
"rewards/rejected": -0.008957963436841965,
"step": 180
},
{
"epoch": 0.08,
"grad_norm": 0.515625,
"learning_rate": 3.909465020576131e-07,
"logits/chosen": -0.09508004784584045,
"logits/rejected": 0.12103313207626343,
"logps/chosen": -209.8905792236328,
"logps/rejected": -215.1144561767578,
"loss": 0.6894,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.00340329110622406,
"rewards/margins": 0.0074819354340434074,
"rewards/margins_max": 0.01133053284138441,
"rewards/margins_min": 0.003633336629718542,
"rewards/margins_std": 0.0054427399300038815,
"rewards/rejected": -0.010885225608944893,
"step": 190
},
{
"epoch": 0.08,
"grad_norm": 0.384765625,
"learning_rate": 4.11522633744856e-07,
"logits/chosen": -0.13388411700725555,
"logits/rejected": 0.08588583767414093,
"logps/chosen": -222.95236206054688,
"logps/rejected": -217.00717163085938,
"loss": 0.6893,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.005303654354065657,
"rewards/margins": 0.0068459659814834595,
"rewards/margins_max": 0.01136021874845028,
"rewards/margins_min": 0.002331711584702134,
"rewards/margins_std": 0.006384119391441345,
"rewards/rejected": -0.012149619869887829,
"step": 200
},
{
"epoch": 0.09,
"grad_norm": 0.447265625,
"learning_rate": 4.320987654320987e-07,
"logits/chosen": -0.04203199967741966,
"logits/rejected": 0.1425343006849289,
"logps/chosen": -206.8761444091797,
"logps/rejected": -228.0805206298828,
"loss": 0.6887,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.004312532022595406,
"rewards/margins": 0.008573906496167183,
"rewards/margins_max": 0.013535317964851856,
"rewards/margins_min": 0.003612496657297015,
"rewards/margins_std": 0.00701649347320199,
"rewards/rejected": -0.012886440381407738,
"step": 210
},
{
"epoch": 0.09,
"grad_norm": 0.4921875,
"learning_rate": 4.5267489711934156e-07,
"logits/chosen": -0.09616607427597046,
"logits/rejected": 0.13670727610588074,
"logps/chosen": -203.38113403320312,
"logps/rejected": -206.166259765625,
"loss": 0.6884,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.003925122786313295,
"rewards/margins": 0.010814773850142956,
"rewards/margins_max": 0.01591581106185913,
"rewards/margins_min": 0.005713737104088068,
"rewards/margins_std": 0.007213953882455826,
"rewards/rejected": -0.01473989523947239,
"step": 220
},
{
"epoch": 0.09,
"grad_norm": 0.4453125,
"learning_rate": 4.732510288065844e-07,
"logits/chosen": -0.045470915734767914,
"logits/rejected": 0.10779553651809692,
"logps/chosen": -189.78988647460938,
"logps/rejected": -224.5202178955078,
"loss": 0.6874,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.0036609836388379335,
"rewards/margins": 0.011346762999892235,
"rewards/margins_max": 0.018040811643004417,
"rewards/margins_min": 0.004652712494134903,
"rewards/margins_std": 0.009466813877224922,
"rewards/rejected": -0.015007746405899525,
"step": 230
},
{
"epoch": 0.1,
"grad_norm": 0.474609375,
"learning_rate": 4.938271604938271e-07,
"logits/chosen": -0.19475580751895905,
"logits/rejected": 0.12157033383846283,
"logps/chosen": -208.2560577392578,
"logps/rejected": -196.8409881591797,
"loss": 0.687,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.005628727376461029,
"rewards/margins": 0.012200703844428062,
"rewards/margins_max": 0.01744863949716091,
"rewards/margins_min": 0.0069527653977274895,
"rewards/margins_std": 0.007421704940497875,
"rewards/rejected": -0.01782943308353424,
"step": 240
},
{
"epoch": 0.1,
"grad_norm": 0.416015625,
"learning_rate": 4.999873612357511e-07,
"logits/chosen": -0.2264724224805832,
"logits/rejected": 0.03944239020347595,
"logps/chosen": -222.04580688476562,
"logps/rejected": -220.2604522705078,
"loss": 0.6866,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.00675012543797493,
"rewards/margins": 0.013585137203335762,
"rewards/margins_max": 0.020951781421899796,
"rewards/margins_min": 0.006218491587787867,
"rewards/margins_std": 0.010418008081614971,
"rewards/rejected": -0.020335260778665543,
"step": 250
},
{
"epoch": 0.11,
"grad_norm": 0.55859375,
"learning_rate": 4.999254601606523e-07,
"logits/chosen": -0.11498390138149261,
"logits/rejected": 0.15362046658992767,
"logps/chosen": -192.99081420898438,
"logps/rejected": -203.53524780273438,
"loss": 0.6854,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.005103799514472485,
"rewards/margins": 0.015132298693060875,
"rewards/margins_max": 0.02228725515305996,
"rewards/margins_min": 0.007977343164384365,
"rewards/margins_std": 0.0101186353713274,
"rewards/rejected": -0.020236099138855934,
"step": 260
},
{
"epoch": 0.11,
"grad_norm": 0.5078125,
"learning_rate": 4.998119881260575e-07,
"logits/chosen": -0.19641172885894775,
"logits/rejected": 0.07705807685852051,
"logps/chosen": -220.6441650390625,
"logps/rejected": -218.833984375,
"loss": 0.6845,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.0068857064470648766,
"rewards/margins": 0.017990007996559143,
"rewards/margins_max": 0.02593587338924408,
"rewards/margins_min": 0.010044138878583908,
"rewards/margins_std": 0.011237152852118015,
"rewards/rejected": -0.024875711649656296,
"step": 270
},
{
"epoch": 0.12,
"grad_norm": 0.39453125,
"learning_rate": 4.996469685463948e-07,
"logits/chosen": -0.17675986886024475,
"logits/rejected": 0.053129892796278,
"logps/chosen": -230.238525390625,
"logps/rejected": -226.4733123779297,
"loss": 0.6847,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.008551741018891335,
"rewards/margins": 0.017831740900874138,
"rewards/margins_max": 0.025221537798643112,
"rewards/margins_min": 0.010441945865750313,
"rewards/margins_std": 0.010450749658048153,
"rewards/rejected": -0.02638348378241062,
"step": 280
},
{
"epoch": 0.12,
"grad_norm": 0.462890625,
"learning_rate": 4.994304354726891e-07,
"logits/chosen": -0.11211202293634415,
"logits/rejected": 0.17212675511837006,
"logps/chosen": -240.7134552001953,
"logps/rejected": -235.8301239013672,
"loss": 0.683,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.008315947838127613,
"rewards/margins": 0.018191199749708176,
"rewards/margins_max": 0.024264231324195862,
"rewards/margins_min": 0.012118167243897915,
"rewards/margins_std": 0.008588564582169056,
"rewards/rejected": -0.026507148519158363,
"step": 290
},
{
"epoch": 0.12,
"grad_norm": 0.64453125,
"learning_rate": 4.991624335855357e-07,
"logits/chosen": -0.16157573461532593,
"logits/rejected": -0.003342109965160489,
"logps/chosen": -192.2877655029297,
"logps/rejected": -198.39122009277344,
"loss": 0.6826,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.00918472371995449,
"rewards/margins": 0.0205369982868433,
"rewards/margins_max": 0.028297582641243935,
"rewards/margins_min": 0.012776409275829792,
"rewards/margins_std": 0.010975128039717674,
"rewards/rejected": -0.029721718281507492,
"step": 300
},
{
"epoch": 0.13,
"grad_norm": 0.4609375,
"learning_rate": 4.988430181858809e-07,
"logits/chosen": -0.20107969641685486,
"logits/rejected": 0.04424827918410301,
"logps/chosen": -200.2167510986328,
"logps/rejected": -193.597900390625,
"loss": 0.6816,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.009901036508381367,
"rewards/margins": 0.02123742178082466,
"rewards/margins_max": 0.030068615451455116,
"rewards/margins_min": 0.01240622065961361,
"rewards/margins_std": 0.012489198707044125,
"rewards/rejected": -0.031138455495238304,
"step": 310
},
{
"epoch": 0.13,
"grad_norm": 0.412109375,
"learning_rate": 4.984722551836112e-07,
"logits/chosen": -0.08247671276330948,
"logits/rejected": 0.09890026599168777,
"logps/chosen": -214.65396118164062,
"logps/rejected": -244.7481231689453,
"loss": 0.6807,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.01232508011162281,
"rewards/margins": 0.022393036633729935,
"rewards/margins_max": 0.032050006091594696,
"rewards/margins_min": 0.012736069969832897,
"rewards/margins_std": 0.01365701388567686,
"rewards/rejected": -0.034718118607997894,
"step": 320
},
{
"epoch": 0.14,
"grad_norm": 0.45703125,
"learning_rate": 4.980502210839523e-07,
"logits/chosen": -0.19298240542411804,
"logits/rejected": 0.05112982913851738,
"logps/chosen": -216.5214385986328,
"logps/rejected": -205.87411499023438,
"loss": 0.6798,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.011784660629928112,
"rewards/margins": 0.02403593435883522,
"rewards/margins_max": 0.035633157938718796,
"rewards/margins_min": 0.012438705191016197,
"rewards/margins_std": 0.016400957480072975,
"rewards/rejected": -0.03582059592008591,
"step": 330
},
{
"epoch": 0.14,
"grad_norm": 0.423828125,
"learning_rate": 4.975770029716832e-07,
"logits/chosen": -0.16089969873428345,
"logits/rejected": 0.06704260408878326,
"logps/chosen": -198.7475128173828,
"logps/rejected": -214.4049835205078,
"loss": 0.6812,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.01207501720637083,
"rewards/margins": 0.027419626712799072,
"rewards/margins_max": 0.038728706538677216,
"rewards/margins_min": 0.016110548749566078,
"rewards/margins_std": 0.015993457287549973,
"rewards/rejected": -0.03949464112520218,
"step": 340
},
{
"epoch": 0.14,
"grad_norm": 0.3984375,
"learning_rate": 4.970526984931663e-07,
"logits/chosen": -0.12522733211517334,
"logits/rejected": 0.02256820723414421,
"logps/chosen": -189.53872680664062,
"logps/rejected": -237.49057006835938,
"loss": 0.6788,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.014810247346758842,
"rewards/margins": 0.031076129525899887,
"rewards/margins_max": 0.04324505478143692,
"rewards/margins_min": 0.018907207995653152,
"rewards/margins_std": 0.017209455370903015,
"rewards/rejected": -0.04588637501001358,
"step": 350
},
{
"epoch": 0.15,
"grad_norm": 0.466796875,
"learning_rate": 4.96477415836199e-07,
"logits/chosen": -0.12060017883777618,
"logits/rejected": 0.15660127997398376,
"logps/chosen": -207.26455688476562,
"logps/rejected": -198.59371948242188,
"loss": 0.68,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.01682819053530693,
"rewards/margins": 0.025025326758623123,
"rewards/margins_max": 0.037398561835289,
"rewards/margins_min": 0.012652089819312096,
"rewards/margins_std": 0.017498398199677467,
"rewards/rejected": -0.041853513568639755,
"step": 360
},
{
"epoch": 0.15,
"grad_norm": 0.404296875,
"learning_rate": 4.958512737076895e-07,
"logits/chosen": -0.1564178466796875,
"logits/rejected": 0.09775165468454361,
"logps/chosen": -199.6991424560547,
"logps/rejected": -201.5712432861328,
"loss": 0.6774,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.0173592958599329,
"rewards/margins": 0.029393130913376808,
"rewards/margins_max": 0.04372996464371681,
"rewards/margins_min": 0.015056299977004528,
"rewards/margins_std": 0.020275337621569633,
"rewards/rejected": -0.04675242677330971,
"step": 370
},
{
"epoch": 0.16,
"grad_norm": 0.423828125,
"learning_rate": 4.951744013091616e-07,
"logits/chosen": -0.06468039751052856,
"logits/rejected": 0.11344078928232193,
"logps/chosen": -202.63296508789062,
"logps/rejected": -218.40939331054688,
"loss": 0.677,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.014664116315543652,
"rewards/margins": 0.035016417503356934,
"rewards/margins_max": 0.04858310893177986,
"rewards/margins_min": 0.021449726074934006,
"rewards/margins_std": 0.019186200574040413,
"rewards/rejected": -0.04968053475022316,
"step": 380
},
{
"epoch": 0.16,
"grad_norm": 0.52734375,
"learning_rate": 4.944469383100954e-07,
"logits/chosen": -0.16045762598514557,
"logits/rejected": 0.10485055297613144,
"logps/chosen": -204.5330810546875,
"logps/rejected": -205.8036651611328,
"loss": 0.6759,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.02098998799920082,
"rewards/margins": 0.03199451044201851,
"rewards/margins_max": 0.048134349286556244,
"rewards/margins_min": 0.015854666009545326,
"rewards/margins_std": 0.02282518334686756,
"rewards/rejected": -0.05298449844121933,
"step": 390
},
{
"epoch": 0.16,
"grad_norm": 0.494140625,
"learning_rate": 4.936690348191063e-07,
"logits/chosen": -0.1567983776330948,
"logits/rejected": 0.08766036480665207,
"logps/chosen": -227.46139526367188,
"logps/rejected": -232.83517456054688,
"loss": 0.6753,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.013347646221518517,
"rewards/margins": 0.03657007962465286,
"rewards/margins_max": 0.05410366132855415,
"rewards/margins_min": 0.01903649792075157,
"rewards/margins_std": 0.024796226993203163,
"rewards/rejected": -0.04991772025823593,
"step": 400
},
{
"epoch": 0.17,
"grad_norm": 0.46875,
"learning_rate": 4.928408513529719e-07,
"logits/chosen": -0.1951916515827179,
"logits/rejected": -0.047125209122896194,
"logps/chosen": -199.21746826171875,
"logps/rejected": -224.54483032226562,
"loss": 0.6743,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.01675131730735302,
"rewards/margins": 0.03562582656741142,
"rewards/margins_max": 0.047902870923280716,
"rewards/margins_min": 0.023348785936832428,
"rewards/margins_std": 0.017362359911203384,
"rewards/rejected": -0.05237714573740959,
"step": 410
},
{
"epoch": 0.17,
"grad_norm": 0.46484375,
"learning_rate": 4.919625588035091e-07,
"logits/chosen": -0.12850052118301392,
"logits/rejected": 0.22448399662971497,
"logps/chosen": -234.625732421875,
"logps/rejected": -220.24868774414062,
"loss": 0.6743,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.014748236164450645,
"rewards/margins": 0.039627060294151306,
"rewards/margins_max": 0.05790294334292412,
"rewards/margins_min": 0.021351177245378494,
"rewards/margins_std": 0.025846004486083984,
"rewards/rejected": -0.0543752983212471,
"step": 420
},
{
"epoch": 0.18,
"grad_norm": 0.470703125,
"learning_rate": 4.910343384023118e-07,
"logits/chosen": -0.08536979556083679,
"logits/rejected": 0.1372174322605133,
"logps/chosen": -233.83938598632812,
"logps/rejected": -246.7130584716797,
"loss": 0.6725,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.019801601767539978,
"rewards/margins": 0.03844233602285385,
"rewards/margins_max": 0.053383953869342804,
"rewards/margins_min": 0.0235007144510746,
"rewards/margins_std": 0.02113064005970955,
"rewards/rejected": -0.05824393779039383,
"step": 430
},
{
"epoch": 0.18,
"grad_norm": 0.431640625,
"learning_rate": 4.900563816833543e-07,
"logits/chosen": -0.01743602380156517,
"logits/rejected": 0.18241354823112488,
"logps/chosen": -205.63961791992188,
"logps/rejected": -239.1418914794922,
"loss": 0.6707,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -0.018751228228211403,
"rewards/margins": 0.04704046994447708,
"rewards/margins_max": 0.06214872747659683,
"rewards/margins_min": 0.03193220496177673,
"rewards/margins_std": 0.021366309374570847,
"rewards/rejected": -0.06579168885946274,
"step": 440
},
{
"epoch": 0.19,
"grad_norm": 0.40625,
"learning_rate": 4.890288904434699e-07,
"logits/chosen": -0.13453516364097595,
"logits/rejected": 0.11062689125537872,
"logps/chosen": -203.70687866210938,
"logps/rejected": -222.5287322998047,
"loss": 0.6692,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.020012306049466133,
"rewards/margins": 0.04848041012883186,
"rewards/margins_max": 0.0692964643239975,
"rewards/margins_min": 0.02766435220837593,
"rewards/margins_std": 0.029438350349664688,
"rewards/rejected": -0.06849271804094315,
"step": 450
},
{
"epoch": 0.19,
"grad_norm": 0.515625,
"learning_rate": 4.8795207670071e-07,
"logits/chosen": -0.14224310219287872,
"logits/rejected": 0.06176813691854477,
"logps/chosen": -207.9291229248047,
"logps/rejected": -238.2221221923828,
"loss": 0.6675,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.02270520105957985,
"rewards/margins": 0.05173413082957268,
"rewards/margins_max": 0.0734233409166336,
"rewards/margins_min": 0.0300449226051569,
"rewards/margins_std": 0.03067317232489586,
"rewards/rejected": -0.07443933188915253,
"step": 460
},
{
"epoch": 0.19,
"grad_norm": 0.486328125,
"learning_rate": 4.868261626505958e-07,
"logits/chosen": -0.09490348398685455,
"logits/rejected": 0.11677880585193634,
"logps/chosen": -215.65878295898438,
"logps/rejected": -227.8511199951172,
"loss": 0.6686,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.026142066344618797,
"rewards/margins": 0.049620382487773895,
"rewards/margins_max": 0.06977085769176483,
"rewards/margins_min": 0.029469912871718407,
"rewards/margins_std": 0.028497066348791122,
"rewards/rejected": -0.07576245814561844,
"step": 470
},
{
"epoch": 0.2,
"grad_norm": 0.4140625,
"learning_rate": 4.856513806202697e-07,
"logits/chosen": -0.17281684279441833,
"logits/rejected": 0.06971795111894608,
"logps/chosen": -210.6538543701172,
"logps/rejected": -217.97451782226562,
"loss": 0.6666,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.020999779924750328,
"rewards/margins": 0.05904274061322212,
"rewards/margins_max": 0.08474047482013702,
"rewards/margins_min": 0.03334500640630722,
"rewards/margins_std": 0.036342088133096695,
"rewards/rejected": -0.0800425186753273,
"step": 480
},
{
"epoch": 0.2,
"grad_norm": 0.46484375,
"learning_rate": 4.844279730205544e-07,
"logits/chosen": -0.10984311252832413,
"logits/rejected": 0.0616462342441082,
"logps/chosen": -230.95059204101562,
"logps/rejected": -258.0708312988281,
"loss": 0.6673,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.027716726064682007,
"rewards/margins": 0.051834236830472946,
"rewards/margins_max": 0.07494363188743591,
"rewards/margins_min": 0.028724845498800278,
"rewards/margins_std": 0.03268161416053772,
"rewards/rejected": -0.07955096662044525,
"step": 490
},
{
"epoch": 0.21,
"grad_norm": 0.404296875,
"learning_rate": 4.831561922959338e-07,
"logits/chosen": -0.1495492160320282,
"logits/rejected": 0.1394021064043045,
"logps/chosen": -210.3924560546875,
"logps/rejected": -208.85018920898438,
"loss": 0.6652,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.022129181772470474,
"rewards/margins": 0.060523491352796555,
"rewards/margins_max": 0.08514624834060669,
"rewards/margins_min": 0.03590074181556702,
"rewards/margins_std": 0.034821830689907074,
"rewards/rejected": -0.08265267312526703,
"step": 500
},
{
"epoch": 0.21,
"grad_norm": 0.5078125,
"learning_rate": 4.818363008724618e-07,
"logits/chosen": -0.1585562378168106,
"logits/rejected": 0.060911018401384354,
"logps/chosen": -212.550537109375,
"logps/rejected": -239.1901397705078,
"loss": 0.6665,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.031042397022247314,
"rewards/margins": 0.055754829198122025,
"rewards/margins_max": 0.07818542420864105,
"rewards/margins_min": 0.0333242304623127,
"rewards/margins_std": 0.03172165900468826,
"rewards/rejected": -0.08679722249507904,
"step": 510
},
{
"epoch": 0.21,
"grad_norm": 0.48046875,
"learning_rate": 4.804685711036113e-07,
"logits/chosen": -0.16393280029296875,
"logits/rejected": 0.08432348072528839,
"logps/chosen": -221.3743896484375,
"logps/rejected": -245.53262329101562,
"loss": 0.6648,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.030147483572363853,
"rewards/margins": 0.058864571154117584,
"rewards/margins_max": 0.07852182537317276,
"rewards/margins_min": 0.039207302033901215,
"rewards/margins_std": 0.027799565345048904,
"rewards/rejected": -0.08901204913854599,
"step": 520
},
{
"epoch": 0.22,
"grad_norm": 0.44921875,
"learning_rate": 4.790532852140767e-07,
"logits/chosen": -0.15655621886253357,
"logits/rejected": 0.1627272069454193,
"logps/chosen": -224.1035919189453,
"logps/rejected": -224.27749633789062,
"loss": 0.6639,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.03070450760424137,
"rewards/margins": 0.059945207089185715,
"rewards/margins_max": 0.08565831184387207,
"rewards/margins_min": 0.03423209488391876,
"rewards/margins_std": 0.03636383265256882,
"rewards/rejected": -0.09064970910549164,
"step": 530
},
{
"epoch": 0.22,
"grad_norm": 0.44921875,
"learning_rate": 4.775907352415367e-07,
"logits/chosen": -0.20449629426002502,
"logits/rejected": 0.07528124749660492,
"logps/chosen": -225.3180694580078,
"logps/rejected": -227.5177001953125,
"loss": 0.6623,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.03224950283765793,
"rewards/margins": 0.06405209004878998,
"rewards/margins_max": 0.09230367094278336,
"rewards/margins_min": 0.0358005091547966,
"rewards/margins_std": 0.039953768253326416,
"rewards/rejected": -0.0963016003370285,
"step": 540
},
{
"epoch": 0.23,
"grad_norm": 0.447265625,
"learning_rate": 4.760812229763944e-07,
"logits/chosen": -0.14346732199192047,
"logits/rejected": 0.15249694883823395,
"logps/chosen": -230.9304656982422,
"logps/rejected": -227.08364868164062,
"loss": 0.6633,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.03136484697461128,
"rewards/margins": 0.0606791190803051,
"rewards/margins_max": 0.09166625887155533,
"rewards/margins_min": 0.029691975563764572,
"rewards/margins_std": 0.043822430074214935,
"rewards/rejected": -0.09204395860433578,
"step": 550
},
{
"epoch": 0.23,
"grad_norm": 0.45703125,
"learning_rate": 4.7452505989950455e-07,
"logits/chosen": -0.14922063052654266,
"logits/rejected": 0.08096525818109512,
"logps/chosen": -227.1333770751953,
"logps/rejected": -244.99362182617188,
"loss": 0.6612,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.02952210046350956,
"rewards/margins": 0.0689021423459053,
"rewards/margins_max": 0.09765410423278809,
"rewards/margins_min": 0.04015018790960312,
"rewards/margins_std": 0.04066140204668045,
"rewards/rejected": -0.09842424839735031,
"step": 560
},
{
"epoch": 0.23,
"grad_norm": 0.44140625,
"learning_rate": 4.729225671179e-07,
"logits/chosen": -0.16192954778671265,
"logits/rejected": 0.16453325748443604,
"logps/chosen": -222.29306030273438,
"logps/rejected": -210.7808074951172,
"loss": 0.6608,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.0332382507622242,
"rewards/margins": 0.06441639363765717,
"rewards/margins_max": 0.09518542140722275,
"rewards/margins_min": 0.03364737331867218,
"rewards/margins_std": 0.04351397603750229,
"rewards/rejected": -0.09765465557575226,
"step": 570
},
{
"epoch": 0.24,
"grad_norm": 0.5078125,
"learning_rate": 4.712740752985337e-07,
"logits/chosen": -0.03260333463549614,
"logits/rejected": 0.15674880146980286,
"logps/chosen": -225.6151580810547,
"logps/rejected": -230.8135223388672,
"loss": 0.6668,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.0472460612654686,
"rewards/margins": 0.05132395029067993,
"rewards/margins_max": 0.07761866599321365,
"rewards/margins_min": 0.02502923086285591,
"rewards/margins_std": 0.03718634322285652,
"rewards/rejected": -0.09857000410556793,
"step": 580
},
{
"epoch": 0.24,
"grad_norm": 0.49609375,
"learning_rate": 4.695799246000464e-07,
"logits/chosen": -0.16538329422473907,
"logits/rejected": 0.07633324712514877,
"logps/chosen": -208.8240203857422,
"logps/rejected": -231.292724609375,
"loss": 0.6592,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.04221474379301071,
"rewards/margins": 0.07459871470928192,
"rewards/margins_max": 0.10690847784280777,
"rewards/margins_min": 0.04228895902633667,
"rewards/margins_std": 0.04569289833307266,
"rewards/rejected": -0.11681344360113144,
"step": 590
},
{
"epoch": 0.25,
"grad_norm": 0.453125,
"learning_rate": 4.6784046460257694e-07,
"logits/chosen": -0.1475997418165207,
"logits/rejected": 0.10680235922336578,
"logps/chosen": -234.5234375,
"logps/rejected": -233.68527221679688,
"loss": 0.6617,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.041780535131692886,
"rewards/margins": 0.059098441153764725,
"rewards/margins_max": 0.08605752140283585,
"rewards/margins_min": 0.0321393683552742,
"rewards/margins_std": 0.03812588378787041,
"rewards/rejected": -0.10087897628545761,
"step": 600
},
{
"epoch": 0.25,
"grad_norm": 0.4609375,
"learning_rate": 4.660560542356278e-07,
"logits/chosen": -0.14557047188282013,
"logits/rejected": 0.11930598318576813,
"logps/chosen": -230.1462860107422,
"logps/rejected": -230.4649200439453,
"loss": 0.6581,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.039344362914562225,
"rewards/margins": 0.0685037225484848,
"rewards/margins_max": 0.09414297342300415,
"rewards/margins_min": 0.04286447912454605,
"rewards/margins_std": 0.03625936806201935,
"rewards/rejected": -0.10784808546304703,
"step": 610
},
{
"epoch": 0.26,
"grad_norm": 0.4609375,
"learning_rate": 4.6422706170400175e-07,
"logits/chosen": -0.18832182884216309,
"logits/rejected": 0.102397121489048,
"logps/chosen": -221.0863037109375,
"logps/rejected": -231.1044464111328,
"loss": 0.6598,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.03650045394897461,
"rewards/margins": 0.0659482330083847,
"rewards/margins_max": 0.09619072079658508,
"rewards/margins_min": 0.03570573776960373,
"rewards/margins_std": 0.04276934266090393,
"rewards/rejected": -0.10244867950677872,
"step": 620
},
{
"epoch": 0.26,
"grad_norm": 0.50390625,
"learning_rate": 4.6235386441182434e-07,
"logits/chosen": -0.12858518958091736,
"logits/rejected": 0.04834365099668503,
"logps/chosen": -221.7181854248047,
"logps/rejected": -244.804443359375,
"loss": 0.6565,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.04021327942609787,
"rewards/margins": 0.07245869934558868,
"rewards/margins_max": 0.10741807520389557,
"rewards/margins_min": 0.03749933838844299,
"rewards/margins_std": 0.049440011382102966,
"rewards/rejected": -0.11267199367284775,
"step": 630
},
{
"epoch": 0.26,
"grad_norm": 0.51171875,
"learning_rate": 4.604368488846686e-07,
"logits/chosen": -0.14893962442874908,
"logits/rejected": 0.021318774670362473,
"logps/chosen": -199.5926513671875,
"logps/rejected": -227.07766723632812,
"loss": 0.6571,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.041552286595106125,
"rewards/margins": 0.07733511924743652,
"rewards/margins_max": 0.11415763199329376,
"rewards/margins_min": 0.04051261395215988,
"rewards/margins_std": 0.052074890583753586,
"rewards/rejected": -0.11888740956783295,
"step": 640
},
{
"epoch": 0.27,
"grad_norm": 0.51171875,
"learning_rate": 4.58476410689797e-07,
"logits/chosen": -0.10416440665721893,
"logits/rejected": 0.03748173266649246,
"logps/chosen": -217.5540771484375,
"logps/rejected": -244.89523315429688,
"loss": 0.6574,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.039487071335315704,
"rewards/margins": 0.07235467433929443,
"rewards/margins_max": 0.10398920625448227,
"rewards/margins_min": 0.040720134973526,
"rewards/margins_std": 0.04473799094557762,
"rewards/rejected": -0.11184175312519073,
"step": 650
},
{
"epoch": 0.27,
"grad_norm": 0.5078125,
"learning_rate": 4.5647295435453817e-07,
"logits/chosen": -0.13231520354747772,
"logits/rejected": 0.022942349314689636,
"logps/chosen": -237.333740234375,
"logps/rejected": -246.2041778564453,
"loss": 0.6593,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.05165981128811836,
"rewards/margins": 0.06908587366342545,
"rewards/margins_max": 0.10448731482028961,
"rewards/margins_min": 0.033684439957141876,
"rewards/margins_std": 0.05006518214941025,
"rewards/rejected": -0.12074568122625351,
"step": 660
},
{
"epoch": 0.28,
"grad_norm": 0.44921875,
"learning_rate": 4.544268932828144e-07,
"logits/chosen": -0.17096921801567078,
"logits/rejected": 0.009365534409880638,
"logps/chosen": -211.588623046875,
"logps/rejected": -257.00726318359375,
"loss": 0.6546,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.040882695466279984,
"rewards/margins": 0.081394262611866,
"rewards/margins_max": 0.11418579518795013,
"rewards/margins_min": 0.048602718859910965,
"rewards/margins_std": 0.04637424275279045,
"rewards/rejected": -0.12227696180343628,
"step": 670
},
{
"epoch": 0.28,
"grad_norm": 0.4140625,
"learning_rate": 4.523386496698376e-07,
"logits/chosen": -0.22352655231952667,
"logits/rejected": 0.08650527149438858,
"logps/chosen": -225.86953735351562,
"logps/rejected": -221.8078155517578,
"loss": 0.6565,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.04626091942191124,
"rewards/margins": 0.07565927505493164,
"rewards/margins_max": 0.10747319459915161,
"rewards/margins_min": 0.043845366686582565,
"rewards/margins_std": 0.044991664588451385,
"rewards/rejected": -0.12192019075155258,
"step": 680
},
{
"epoch": 0.28,
"grad_norm": 0.451171875,
"learning_rate": 4.502086544149918e-07,
"logits/chosen": -0.1821189820766449,
"logits/rejected": 0.08161283284425735,
"logps/chosen": -209.0590057373047,
"logps/rejected": -238.8436737060547,
"loss": 0.6541,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.04758086055517197,
"rewards/margins": 0.08119155466556549,
"rewards/margins_max": 0.11566410213708878,
"rewards/margins_min": 0.0467190146446228,
"rewards/margins_std": 0.04875154048204422,
"rewards/rejected": -0.12877242267131805,
"step": 690
},
{
"epoch": 0.29,
"grad_norm": 0.5390625,
"learning_rate": 4.4803734703291845e-07,
"logits/chosen": -0.17991140484809875,
"logits/rejected": 0.10490355640649796,
"logps/chosen": -228.5984649658203,
"logps/rejected": -212.1452178955078,
"loss": 0.654,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.043927937746047974,
"rewards/margins": 0.07159805297851562,
"rewards/margins_max": 0.10323642194271088,
"rewards/margins_min": 0.039959684014320374,
"rewards/margins_std": 0.044743407517671585,
"rewards/rejected": -0.1155259981751442,
"step": 700
},
{
"epoch": 0.29,
"grad_norm": 0.431640625,
"learning_rate": 4.4582517556282474e-07,
"logits/chosen": -0.18320028483867645,
"logits/rejected": 0.013312360271811485,
"logps/chosen": -206.8184814453125,
"logps/rejected": -252.7014923095703,
"loss": 0.6506,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.04943935200572014,
"rewards/margins": 0.08512347936630249,
"rewards/margins_max": 0.12365541607141495,
"rewards/margins_min": 0.04659154266119003,
"rewards/margins_std": 0.054492391645908356,
"rewards/rejected": -0.13456283509731293,
"step": 710
},
{
"epoch": 0.3,
"grad_norm": 0.466796875,
"learning_rate": 4.435725964760331e-07,
"logits/chosen": -0.1689848005771637,
"logits/rejected": 0.07571324706077576,
"logps/chosen": -207.1726531982422,
"logps/rejected": -228.133056640625,
"loss": 0.6553,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.04197344928979874,
"rewards/margins": 0.07342037558555603,
"rewards/margins_max": 0.10903932899236679,
"rewards/margins_min": 0.037801433354616165,
"rewards/margins_std": 0.05037280172109604,
"rewards/rejected": -0.11539383232593536,
"step": 720
},
{
"epoch": 0.3,
"grad_norm": 0.5234375,
"learning_rate": 4.412800745817901e-07,
"logits/chosen": -0.16182328760623932,
"logits/rejected": 0.07639019191265106,
"logps/chosen": -229.777587890625,
"logps/rejected": -233.11746215820312,
"loss": 0.6582,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.04751753434538841,
"rewards/margins": 0.07238186150789261,
"rewards/margins_max": 0.10116531699895859,
"rewards/margins_min": 0.04359840601682663,
"rewards/margins_std": 0.040705952793359756,
"rewards/rejected": -0.11989939212799072,
"step": 730
},
{
"epoch": 0.3,
"grad_norm": 0.486328125,
"learning_rate": 4.3894808293135526e-07,
"logits/chosen": -0.11428213119506836,
"logits/rejected": 0.13538585603237152,
"logps/chosen": -223.481689453125,
"logps/rejected": -238.7801055908203,
"loss": 0.6537,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.04576458781957626,
"rewards/margins": 0.08550850301980972,
"rewards/margins_max": 0.11636098474264145,
"rewards/margins_min": 0.0546560175716877,
"rewards/margins_std": 0.04363200441002846,
"rewards/rejected": -0.131273090839386,
"step": 740
},
{
"epoch": 0.31,
"grad_norm": 0.41796875,
"learning_rate": 4.365771027203896e-07,
"logits/chosen": -0.14200684428215027,
"logits/rejected": 0.09855206310749054,
"logps/chosen": -213.05581665039062,
"logps/rejected": -232.3656463623047,
"loss": 0.6507,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.03935530036687851,
"rewards/margins": 0.09331099689006805,
"rewards/margins_max": 0.1294441670179367,
"rewards/margins_min": 0.0571778230369091,
"rewards/margins_std": 0.051100023090839386,
"rewards/rejected": -0.13266630470752716,
"step": 750
},
{
"epoch": 0.31,
"grad_norm": 0.494140625,
"learning_rate": 4.3416762318966236e-07,
"logits/chosen": -0.10148487240076065,
"logits/rejected": 0.1706521213054657,
"logps/chosen": -225.71914672851562,
"logps/rejected": -217.4258270263672,
"loss": 0.6512,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.04290129616856575,
"rewards/margins": 0.07890793681144714,
"rewards/margins_max": 0.10971459001302719,
"rewards/margins_min": 0.0481012761592865,
"rewards/margins_std": 0.04356719180941582,
"rewards/rejected": -0.12180924415588379,
"step": 760
},
{
"epoch": 0.32,
"grad_norm": 0.52734375,
"learning_rate": 4.317201415240992e-07,
"logits/chosen": -0.049225617200136185,
"logits/rejected": 0.1614571064710617,
"logps/chosen": -219.39144897460938,
"logps/rejected": -221.80313110351562,
"loss": 0.6539,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.06035063415765762,
"rewards/margins": 0.07851085066795349,
"rewards/margins_max": 0.12034505605697632,
"rewards/margins_min": 0.03667663782835007,
"rewards/margins_std": 0.05916251987218857,
"rewards/rejected": -0.13886147737503052,
"step": 770
},
{
"epoch": 0.32,
"grad_norm": 0.75390625,
"learning_rate": 4.2923516275018974e-07,
"logits/chosen": -0.14591281116008759,
"logits/rejected": 0.050023000687360764,
"logps/chosen": -237.15017700195312,
"logps/rejected": -258.215576171875,
"loss": 0.6514,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.05067404359579086,
"rewards/margins": 0.092898890376091,
"rewards/margins_max": 0.12995196878910065,
"rewards/margins_min": 0.055845797061920166,
"rewards/margins_std": 0.05240098387002945,
"rewards/rejected": -0.14357292652130127,
"step": 780
},
{
"epoch": 0.33,
"grad_norm": 0.388671875,
"learning_rate": 4.267131996317781e-07,
"logits/chosen": -0.13919471204280853,
"logits/rejected": 0.15345291793346405,
"logps/chosen": -203.8855743408203,
"logps/rejected": -212.9399871826172,
"loss": 0.6524,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.04519537463784218,
"rewards/margins": 0.08230610936880112,
"rewards/margins_max": 0.11458753049373627,
"rewards/margins_min": 0.05002468824386597,
"rewards/margins_std": 0.045652832835912704,
"rewards/rejected": -0.1275014877319336,
"step": 790
},
{
"epoch": 0.33,
"grad_norm": 0.5390625,
"learning_rate": 4.2415477256425634e-07,
"logits/chosen": -0.17129512131214142,
"logits/rejected": 0.020465224981307983,
"logps/chosen": -208.89291381835938,
"logps/rejected": -218.3040771484375,
"loss": 0.6527,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.05368215963244438,
"rewards/margins": 0.07769324630498886,
"rewards/margins_max": 0.11353801190853119,
"rewards/margins_min": 0.04184848070144653,
"rewards/margins_std": 0.050692152231931686,
"rewards/rejected": -0.13137540221214294,
"step": 800
},
{
"epoch": 0.33,
"grad_norm": 0.48828125,
"learning_rate": 4.2156040946718343e-07,
"logits/chosen": -0.10920798778533936,
"logits/rejected": 0.08190996944904327,
"logps/chosen": -204.00723266601562,
"logps/rejected": -246.7364501953125,
"loss": 0.6493,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.04529748111963272,
"rewards/margins": 0.08185549080371857,
"rewards/margins_max": 0.11279450356960297,
"rewards/margins_min": 0.05091645568609238,
"rewards/margins_std": 0.043754395097494125,
"rewards/rejected": -0.1271529644727707,
"step": 810
},
{
"epoch": 0.34,
"grad_norm": 0.5,
"learning_rate": 4.189306456753511e-07,
"logits/chosen": -0.08927767723798752,
"logits/rejected": 0.16780522465705872,
"logps/chosen": -216.26211547851562,
"logps/rejected": -223.88839721679688,
"loss": 0.6505,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.04724791273474693,
"rewards/margins": 0.08486290276050568,
"rewards/margins_max": 0.1154739111661911,
"rewards/margins_min": 0.05425189062952995,
"rewards/margins_std": 0.043290503323078156,
"rewards/rejected": -0.1321108192205429,
"step": 820
},
{
"epoch": 0.34,
"grad_norm": 0.408203125,
"learning_rate": 4.1626602382832044e-07,
"logits/chosen": -0.11900673061609268,
"logits/rejected": 0.10318160057067871,
"logps/chosen": -221.74072265625,
"logps/rejected": -248.9683837890625,
"loss": 0.6488,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.04588564485311508,
"rewards/margins": 0.08740084618330002,
"rewards/margins_max": 0.11989488452672958,
"rewards/margins_min": 0.05490681529045105,
"rewards/margins_std": 0.045953501015901566,
"rewards/rejected": -0.1332865059375763,
"step": 830
},
{
"epoch": 0.35,
"grad_norm": 0.455078125,
"learning_rate": 4.1356709375845046e-07,
"logits/chosen": -0.1787930279970169,
"logits/rejected": 0.05423184484243393,
"logps/chosen": -203.95889282226562,
"logps/rejected": -226.582763671875,
"loss": 0.6485,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.05451052263379097,
"rewards/margins": 0.10125939548015594,
"rewards/margins_max": 0.14539141952991486,
"rewards/margins_min": 0.057127393782138824,
"rewards/margins_std": 0.0624120831489563,
"rewards/rejected": -0.1557699292898178,
"step": 840
},
{
"epoch": 0.35,
"grad_norm": 0.5078125,
"learning_rate": 4.1083441237744285e-07,
"logits/chosen": -0.07366688549518585,
"logits/rejected": 0.04007618874311447,
"logps/chosen": -222.0310516357422,
"logps/rejected": -271.1426696777344,
"loss": 0.6536,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.05247300863265991,
"rewards/margins": 0.09073988348245621,
"rewards/margins_max": 0.1382082849740982,
"rewards/margins_min": 0.043271489441394806,
"rewards/margins_std": 0.06713045388460159,
"rewards/rejected": -0.14321288466453552,
"step": 850
},
{
"epoch": 0.35,
"grad_norm": 0.46484375,
"learning_rate": 4.0806854356142597e-07,
"logits/chosen": -0.11528744548559189,
"logits/rejected": 0.17569738626480103,
"logps/chosen": -233.05810546875,
"logps/rejected": -237.28506469726562,
"loss": 0.6505,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.058273959904909134,
"rewards/margins": 0.08572045713663101,
"rewards/margins_max": 0.12326614558696747,
"rewards/margins_min": 0.048174768686294556,
"rewards/margins_std": 0.05309762433171272,
"rewards/rejected": -0.14399442076683044,
"step": 860
},
{
"epoch": 0.36,
"grad_norm": 0.44921875,
"learning_rate": 4.052700580346011e-07,
"logits/chosen": -0.1549403816461563,
"logits/rejected": 0.08772721141576767,
"logps/chosen": -223.6434783935547,
"logps/rejected": -235.5021514892578,
"loss": 0.6499,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.061686016619205475,
"rewards/margins": 0.08836236596107483,
"rewards/margins_max": 0.13330301642417908,
"rewards/margins_min": 0.04342171922326088,
"rewards/margins_std": 0.06355567276477814,
"rewards/rejected": -0.1500483751296997,
"step": 870
},
{
"epoch": 0.36,
"grad_norm": 0.4375,
"learning_rate": 4.024395332514768e-07,
"logits/chosen": -0.1552925854921341,
"logits/rejected": 0.08893848955631256,
"logps/chosen": -224.326171875,
"logps/rejected": -226.89663696289062,
"loss": 0.6486,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.04646015912294388,
"rewards/margins": 0.09568478912115097,
"rewards/margins_max": 0.13710376620292664,
"rewards/margins_min": 0.0542658269405365,
"rewards/margins_std": 0.058575280010700226,
"rewards/rejected": -0.14214494824409485,
"step": 880
},
{
"epoch": 0.37,
"grad_norm": 0.4765625,
"learning_rate": 3.9957755327771357e-07,
"logits/chosen": -0.1727294921875,
"logits/rejected": -0.014213940128684044,
"logps/chosen": -200.10498046875,
"logps/rejected": -253.10232543945312,
"loss": 0.6517,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.060309626162052155,
"rewards/margins": 0.08810073137283325,
"rewards/margins_max": 0.12239007651805878,
"rewards/margins_min": 0.05381138250231743,
"rewards/margins_std": 0.04849245399236679,
"rewards/rejected": -0.1484103500843048,
"step": 890
},
{
"epoch": 0.37,
"grad_norm": 0.45703125,
"learning_rate": 3.966847086696045e-07,
"logits/chosen": -0.14839962124824524,
"logits/rejected": 0.09455759823322296,
"logps/chosen": -228.38784790039062,
"logps/rejected": -245.74832153320312,
"loss": 0.6496,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.04339542239904404,
"rewards/margins": 0.1028498187661171,
"rewards/margins_max": 0.14537741243839264,
"rewards/margins_min": 0.06032223626971245,
"rewards/margins_std": 0.060143083333969116,
"rewards/rejected": -0.14624525606632233,
"step": 900
},
{
"epoch": 0.37,
"grad_norm": 0.51953125,
"learning_rate": 3.937615963522166e-07,
"logits/chosen": -0.11433364450931549,
"logits/rejected": 0.19467870891094208,
"logps/chosen": -223.18154907226562,
"logps/rejected": -213.2982940673828,
"loss": 0.6523,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.058440931141376495,
"rewards/margins": 0.08149015158414841,
"rewards/margins_max": 0.12004747241735458,
"rewards/margins_min": 0.04293282702565193,
"rewards/margins_std": 0.054528284817934036,
"rewards/rejected": -0.1399310827255249,
"step": 910
},
{
"epoch": 0.38,
"grad_norm": 0.5,
"learning_rate": 3.9080881949621884e-07,
"logits/chosen": -0.14250853657722473,
"logits/rejected": 0.048256054520606995,
"logps/chosen": -207.46792602539062,
"logps/rejected": -238.5066680908203,
"loss": 0.6476,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.06639896333217621,
"rewards/margins": 0.09544476121664047,
"rewards/margins_max": 0.1366080939769745,
"rewards/margins_min": 0.05428142473101616,
"rewards/margins_std": 0.05821375176310539,
"rewards/rejected": -0.16184373199939728,
"step": 920
},
{
"epoch": 0.38,
"grad_norm": 0.42578125,
"learning_rate": 3.878269873934197e-07,
"logits/chosen": -0.13014793395996094,
"logits/rejected": 0.1661478579044342,
"logps/chosen": -212.73648071289062,
"logps/rejected": -206.10086059570312,
"loss": 0.6479,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.06022537872195244,
"rewards/margins": 0.08250834047794342,
"rewards/margins_max": 0.12295888364315033,
"rewards/margins_min": 0.042057789862155914,
"rewards/margins_std": 0.05720571428537369,
"rewards/rejected": -0.14273372292518616,
"step": 930
},
{
"epoch": 0.39,
"grad_norm": 0.4765625,
"learning_rate": 3.848167153310432e-07,
"logits/chosen": -0.08797403424978256,
"logits/rejected": 0.062249403446912766,
"logps/chosen": -190.28912353515625,
"logps/rejected": -227.6543426513672,
"loss": 0.6461,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.05042362958192825,
"rewards/margins": 0.09770865738391876,
"rewards/margins_max": 0.13965031504631042,
"rewards/margins_min": 0.055767010897397995,
"rewards/margins_std": 0.05931444838643074,
"rewards/rejected": -0.1481322944164276,
"step": 940
},
{
"epoch": 0.39,
"grad_norm": 0.455078125,
"learning_rate": 3.817786244647671e-07,
"logits/chosen": -0.1679493486881256,
"logits/rejected": 0.10348667949438095,
"logps/chosen": -212.8212432861328,
"logps/rejected": -224.01748657226562,
"loss": 0.6443,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.05369574576616287,
"rewards/margins": 0.09801065921783447,
"rewards/margins_max": 0.14342837035655975,
"rewards/margins_min": 0.052592933177948,
"rewards/margins_std": 0.06423036009073257,
"rewards/rejected": -0.15170639753341675,
"step": 950
},
{
"epoch": 0.4,
"grad_norm": 0.53515625,
"learning_rate": 3.787133416905504e-07,
"logits/chosen": -0.11426540464162827,
"logits/rejected": 0.06826993077993393,
"logps/chosen": -224.8111114501953,
"logps/rejected": -261.72686767578125,
"loss": 0.6455,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.06286445260047913,
"rewards/margins": 0.10180015861988068,
"rewards/margins_max": 0.14011150598526,
"rewards/margins_min": 0.06348879635334015,
"rewards/margins_std": 0.054180435836315155,
"rewards/rejected": -0.1646645963191986,
"step": 960
},
{
"epoch": 0.4,
"grad_norm": 0.458984375,
"learning_rate": 3.7562149951527614e-07,
"logits/chosen": -0.16833610832691193,
"logits/rejected": 0.05001373961567879,
"logps/chosen": -193.0453643798828,
"logps/rejected": -209.85385131835938,
"loss": 0.6465,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.05429987236857414,
"rewards/margins": 0.09220142662525177,
"rewards/margins_max": 0.1386091709136963,
"rewards/margins_min": 0.04579367861151695,
"rewards/margins_std": 0.0656304582953453,
"rewards/rejected": -0.14650128781795502,
"step": 970
},
{
"epoch": 0.4,
"grad_norm": 0.4609375,
"learning_rate": 3.7250373592623654e-07,
"logits/chosen": -0.15326061844825745,
"logits/rejected": 0.10640069097280502,
"logps/chosen": -200.30386352539062,
"logps/rejected": -214.9497833251953,
"loss": 0.6469,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.04672769457101822,
"rewards/margins": 0.10547290742397308,
"rewards/margins_max": 0.15153531730175018,
"rewards/margins_min": 0.05941050127148628,
"rewards/margins_std": 0.06514209508895874,
"rewards/rejected": -0.1522006094455719,
"step": 980
},
{
"epoch": 0.41,
"grad_norm": 0.439453125,
"learning_rate": 3.693606942594872e-07,
"logits/chosen": -0.12123314291238785,
"logits/rejected": 0.036874063313007355,
"logps/chosen": -208.25991821289062,
"logps/rejected": -238.3395233154297,
"loss": 0.6472,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.053240977227687836,
"rewards/margins": 0.09769239276647568,
"rewards/margins_max": 0.1466548591852188,
"rewards/margins_min": 0.04872991517186165,
"rewards/margins_std": 0.06924339383840561,
"rewards/rejected": -0.15093335509300232,
"step": 990
},
{
"epoch": 0.41,
"grad_norm": 0.515625,
"learning_rate": 3.661930230670982e-07,
"logits/chosen": -0.2230033129453659,
"logits/rejected": 0.09608611464500427,
"logps/chosen": -267.37774658203125,
"logps/rejected": -247.0121307373047,
"loss": 0.6453,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.04814675822854042,
"rewards/margins": 0.10081255435943604,
"rewards/margins_max": 0.1392827332019806,
"rewards/margins_min": 0.06234236806631088,
"rewards/margins_std": 0.05440504476428032,
"rewards/rejected": -0.14895930886268616,
"step": 1000
},
{
"epoch": 0.42,
"grad_norm": 0.470703125,
"learning_rate": 3.6300137598332745e-07,
"logits/chosen": -0.1410978138446808,
"logits/rejected": 0.1180311068892479,
"logps/chosen": -248.922607421875,
"logps/rejected": -238.7857208251953,
"loss": 0.6517,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.0673481673002243,
"rewards/margins": 0.08777225762605667,
"rewards/margins_max": 0.13753345608711243,
"rewards/margins_min": 0.03801106661558151,
"rewards/margins_std": 0.07037295401096344,
"rewards/rejected": -0.15512043237686157,
"step": 1010
},
{
"epoch": 0.42,
"grad_norm": 0.51953125,
"learning_rate": 3.5978641158974746e-07,
"logits/chosen": -0.2249755561351776,
"logits/rejected": 0.06120014935731888,
"logps/chosen": -230.6334686279297,
"logps/rejected": -235.4031982421875,
"loss": 0.6465,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.057920414954423904,
"rewards/margins": 0.1049647331237793,
"rewards/margins_max": 0.14824087917804718,
"rewards/margins_min": 0.06168859079480171,
"rewards/margins_std": 0.06120172142982483,
"rewards/rejected": -0.1628851592540741,
"step": 1020
},
{
"epoch": 0.42,
"grad_norm": 0.50390625,
"learning_rate": 3.565487932793489e-07,
"logits/chosen": -0.17121955752372742,
"logits/rejected": 0.12345151603221893,
"logps/chosen": -200.95846557617188,
"logps/rejected": -201.8996124267578,
"loss": 0.6484,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.053819794207811356,
"rewards/margins": 0.09484090656042099,
"rewards/margins_max": 0.13828353583812714,
"rewards/margins_min": 0.05139826610684395,
"rewards/margins_std": 0.06143715977668762,
"rewards/rejected": -0.14866070449352264,
"step": 1030
},
{
"epoch": 0.43,
"grad_norm": 0.421875,
"learning_rate": 3.5328918911965344e-07,
"logits/chosen": -0.13216093182563782,
"logits/rejected": 0.06172620505094528,
"logps/chosen": -219.61856079101562,
"logps/rejected": -247.0506591796875,
"loss": 0.6429,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.0629672110080719,
"rewards/margins": 0.09889484941959381,
"rewards/margins_max": 0.1449834108352661,
"rewards/margins_min": 0.05280628800392151,
"rewards/margins_std": 0.06517906486988068,
"rewards/rejected": -0.1618620604276657,
"step": 1040
},
{
"epoch": 0.43,
"grad_norm": 0.390625,
"learning_rate": 3.500082717148606e-07,
"logits/chosen": -0.17225618660449982,
"logits/rejected": 0.060709256678819656,
"logps/chosen": -204.1416015625,
"logps/rejected": -227.08505249023438,
"loss": 0.6464,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.055190980434417725,
"rewards/margins": 0.09612666070461273,
"rewards/margins_max": 0.143478125333786,
"rewards/margins_min": 0.048775214701890945,
"rewards/margins_std": 0.06696505844593048,
"rewards/rejected": -0.15131765604019165,
"step": 1050
},
{
"epoch": 0.44,
"grad_norm": 0.4609375,
"learning_rate": 3.4670671806705946e-07,
"logits/chosen": -0.10742886364459991,
"logits/rejected": 0.12255527079105377,
"logps/chosen": -227.4519500732422,
"logps/rejected": -244.0882110595703,
"loss": 0.6449,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.06157033517956734,
"rewards/margins": 0.09928043186664581,
"rewards/margins_max": 0.14087219536304474,
"rewards/margins_min": 0.057688675820827484,
"rewards/margins_std": 0.05881963297724724,
"rewards/rejected": -0.16085079312324524,
"step": 1060
},
{
"epoch": 0.44,
"grad_norm": 0.43359375,
"learning_rate": 3.433852094365318e-07,
"logits/chosen": -0.14116446673870087,
"logits/rejected": 0.18814103305339813,
"logps/chosen": -214.08493041992188,
"logps/rejected": -220.2405548095703,
"loss": 0.6448,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.053702183067798615,
"rewards/margins": 0.09958843886852264,
"rewards/margins_max": 0.13894253969192505,
"rewards/margins_min": 0.06023435667157173,
"rewards/margins_std": 0.055655092000961304,
"rewards/rejected": -0.15329062938690186,
"step": 1070
},
{
"epoch": 0.44,
"grad_norm": 0.470703125,
"learning_rate": 3.400444312011776e-07,
"logits/chosen": -0.11480595916509628,
"logits/rejected": 0.08137498050928116,
"logps/chosen": -205.24142456054688,
"logps/rejected": -236.08151245117188,
"loss": 0.6491,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.060475945472717285,
"rewards/margins": 0.08602551370859146,
"rewards/margins_max": 0.12408311665058136,
"rewards/margins_min": 0.047967903316020966,
"rewards/margins_std": 0.05382157489657402,
"rewards/rejected": -0.14650145173072815,
"step": 1080
},
{
"epoch": 0.45,
"grad_norm": 0.458984375,
"learning_rate": 3.3668507271509057e-07,
"logits/chosen": -0.11696537584066391,
"logits/rejected": 0.15271435678005219,
"logps/chosen": -206.26171875,
"logps/rejected": -234.06503295898438,
"loss": 0.6474,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.05601676180958748,
"rewards/margins": 0.09412574768066406,
"rewards/margins_max": 0.1347564160823822,
"rewards/margins_min": 0.05349506065249443,
"rewards/margins_std": 0.05746046453714371,
"rewards/rejected": -0.15014250576496124,
"step": 1090
},
{
"epoch": 0.45,
"grad_norm": 0.47265625,
"learning_rate": 3.333078271663128e-07,
"logits/chosen": -0.16351190209388733,
"logits/rejected": 0.14817702770233154,
"logps/chosen": -247.7356719970703,
"logps/rejected": -209.0570068359375,
"loss": 0.6458,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.050507061183452606,
"rewards/margins": 0.10204179584980011,
"rewards/margins_max": 0.14540044963359833,
"rewards/margins_min": 0.0586831197142601,
"rewards/margins_std": 0.06131840869784355,
"rewards/rejected": -0.1525488644838333,
"step": 1100
},
{
"epoch": 0.46,
"grad_norm": 0.56640625,
"learning_rate": 3.299133914337989e-07,
"logits/chosen": -0.16040000319480896,
"logits/rejected": 0.1360020935535431,
"logps/chosen": -238.80850219726562,
"logps/rejected": -253.80984497070312,
"loss": 0.6424,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.055037401616573334,
"rewards/margins": 0.11829885095357895,
"rewards/margins_max": 0.1685524433851242,
"rewards/margins_min": 0.0680452212691307,
"rewards/margins_std": 0.07106934487819672,
"rewards/rejected": -0.1733362376689911,
"step": 1110
},
{
"epoch": 0.46,
"grad_norm": 0.455078125,
"learning_rate": 3.265024659436183e-07,
"logits/chosen": -0.14927372336387634,
"logits/rejected": 0.09233134239912033,
"logps/chosen": -236.8211212158203,
"logps/rejected": -256.06243896484375,
"loss": 0.6464,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.06843633949756622,
"rewards/margins": 0.10929034650325775,
"rewards/margins_max": 0.15579745173454285,
"rewards/margins_min": 0.06278324127197266,
"rewards/margins_std": 0.06577096879482269,
"rewards/rejected": -0.17772668600082397,
"step": 1120
},
{
"epoch": 0.47,
"grad_norm": 0.5,
"learning_rate": 3.230757545244251e-07,
"logits/chosen": -0.16293886303901672,
"logits/rejected": 0.12322285026311874,
"logps/chosen": -208.5253448486328,
"logps/rejected": -223.73648071289062,
"loss": 0.6461,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.05477524548768997,
"rewards/margins": 0.09718328714370728,
"rewards/margins_max": 0.14000651240348816,
"rewards/margins_min": 0.0543600432574749,
"rewards/margins_std": 0.06056120991706848,
"rewards/rejected": -0.15195852518081665,
"step": 1130
},
{
"epoch": 0.47,
"grad_norm": 0.4609375,
"learning_rate": 3.196339642622269e-07,
"logits/chosen": -0.04625851660966873,
"logits/rejected": 0.06496497243642807,
"logps/chosen": -208.4033660888672,
"logps/rejected": -250.89321899414062,
"loss": 0.6425,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.06276147067546844,
"rewards/margins": 0.10249187797307968,
"rewards/margins_max": 0.15007010102272034,
"rewards/margins_min": 0.054913658648729324,
"rewards/margins_std": 0.06728576868772507,
"rewards/rejected": -0.16525335609912872,
"step": 1140
},
{
"epoch": 0.47,
"grad_norm": 0.427734375,
"learning_rate": 3.1617780535448053e-07,
"logits/chosen": -0.09090803563594818,
"logits/rejected": 0.13836640119552612,
"logps/chosen": -224.8212890625,
"logps/rejected": -236.4337158203125,
"loss": 0.6485,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.06297770887613297,
"rewards/margins": 0.08947329223155975,
"rewards/margins_max": 0.12863615155220032,
"rewards/margins_min": 0.0503104105591774,
"rewards/margins_std": 0.055384665727615356,
"rewards/rejected": -0.15245100855827332,
"step": 1150
},
{
"epoch": 0.48,
"grad_norm": 0.498046875,
"learning_rate": 3.127079909635462e-07,
"logits/chosen": -0.1454629898071289,
"logits/rejected": 0.09381814301013947,
"logps/chosen": -210.1219940185547,
"logps/rejected": -247.29519653320312,
"loss": 0.6413,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -0.05111883953213692,
"rewards/margins": 0.11492305994033813,
"rewards/margins_max": 0.15684781968593597,
"rewards/margins_min": 0.07299830764532089,
"rewards/margins_std": 0.0592905655503273,
"rewards/rejected": -0.16604191064834595,
"step": 1160
},
{
"epoch": 0.48,
"grad_norm": 0.462890625,
"learning_rate": 3.0922523706952976e-07,
"logits/chosen": -0.12923486530780792,
"logits/rejected": 0.00274011492729187,
"logps/chosen": -192.57505798339844,
"logps/rejected": -229.5814971923828,
"loss": 0.6417,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.06112251803278923,
"rewards/margins": 0.11229976266622543,
"rewards/margins_max": 0.15655739605426788,
"rewards/margins_min": 0.06804212182760239,
"rewards/margins_std": 0.06258974969387054,
"rewards/rejected": -0.17342229187488556,
"step": 1170
},
{
"epoch": 0.49,
"grad_norm": 0.427734375,
"learning_rate": 3.057302623225434e-07,
"logits/chosen": -0.08845367282629013,
"logits/rejected": 0.15953145921230316,
"logps/chosen": -224.7268524169922,
"logps/rejected": -228.22573852539062,
"loss": 0.6472,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.06610189378261566,
"rewards/margins": 0.09007870405912399,
"rewards/margins_max": 0.1336970031261444,
"rewards/margins_min": 0.046460412442684174,
"rewards/margins_std": 0.06168559193611145,
"rewards/rejected": -0.15618060529232025,
"step": 1180
},
{
"epoch": 0.49,
"grad_norm": 0.5234375,
"learning_rate": 3.0222378789441585e-07,
"logits/chosen": -0.13951900601387024,
"logits/rejected": 0.21554584801197052,
"logps/chosen": -241.9784698486328,
"logps/rejected": -247.6939239501953,
"loss": 0.6468,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.0631137415766716,
"rewards/margins": 0.10374633967876434,
"rewards/margins_max": 0.15403683483600616,
"rewards/margins_min": 0.05345584824681282,
"rewards/margins_std": 0.07112149894237518,
"rewards/rejected": -0.16686007380485535,
"step": 1190
},
{
"epoch": 0.49,
"grad_norm": 0.474609375,
"learning_rate": 2.9870653732988137e-07,
"logits/chosen": -0.15974149107933044,
"logits/rejected": 0.055376578122377396,
"logps/chosen": -207.2960662841797,
"logps/rejected": -221.9731903076172,
"loss": 0.6474,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.0634765774011612,
"rewards/margins": 0.08953996747732162,
"rewards/margins_max": 0.12941452860832214,
"rewards/margins_min": 0.04966540262103081,
"rewards/margins_std": 0.056391142308712006,
"rewards/rejected": -0.15301653742790222,
"step": 1200
},
{
"epoch": 0.5,
"grad_norm": 0.57421875,
"learning_rate": 2.951792363972804e-07,
"logits/chosen": -0.13279682397842407,
"logits/rejected": 0.10459411144256592,
"logps/chosen": -224.2257080078125,
"logps/rejected": -227.02102661132812,
"loss": 0.6466,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.06181638687849045,
"rewards/margins": 0.10990460962057114,
"rewards/margins_max": 0.15748900175094604,
"rewards/margins_min": 0.062320221215486526,
"rewards/margins_std": 0.06729448586702347,
"rewards/rejected": -0.17172099649906158,
"step": 1210
},
{
"epoch": 0.5,
"grad_norm": 0.47265625,
"learning_rate": 2.9164261293879984e-07,
"logits/chosen": -0.1285082995891571,
"logits/rejected": 0.08084109425544739,
"logps/chosen": -206.4801788330078,
"logps/rejected": -229.85586547851562,
"loss": 0.6483,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.06227404996752739,
"rewards/margins": 0.09520912915468216,
"rewards/margins_max": 0.1408213973045349,
"rewards/margins_min": 0.04959685727953911,
"rewards/margins_std": 0.06450549513101578,
"rewards/rejected": -0.15748317539691925,
"step": 1220
},
{
"epoch": 0.51,
"grad_norm": 0.4375,
"learning_rate": 2.8809739672028677e-07,
"logits/chosen": -0.12225770950317383,
"logits/rejected": 0.03362155705690384,
"logps/chosen": -217.4171905517578,
"logps/rejected": -241.16458129882812,
"loss": 0.6507,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.07442399859428406,
"rewards/margins": 0.08279917389154434,
"rewards/margins_max": 0.11553443968296051,
"rewards/margins_min": 0.050063878297805786,
"rewards/margins_std": 0.0462946780025959,
"rewards/rejected": -0.1572231650352478,
"step": 1230
},
{
"epoch": 0.51,
"grad_norm": 0.458984375,
"learning_rate": 2.845443192806644e-07,
"logits/chosen": -0.18313539028167725,
"logits/rejected": 0.1276620626449585,
"logps/chosen": -229.458740234375,
"logps/rejected": -231.79013061523438,
"loss": 0.6471,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.06001945212483406,
"rewards/margins": 0.08784504234790802,
"rewards/margins_max": 0.1322353482246399,
"rewards/margins_min": 0.043454740196466446,
"rewards/margins_std": 0.06277737021446228,
"rewards/rejected": -0.14786449074745178,
"step": 1240
},
{
"epoch": 0.51,
"grad_norm": 0.4375,
"learning_rate": 2.809841137809825e-07,
"logits/chosen": -0.17046763002872467,
"logits/rejected": 0.05243430286645889,
"logps/chosen": -207.2881317138672,
"logps/rejected": -244.39028930664062,
"loss": 0.6446,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.05877915769815445,
"rewards/margins": 0.10812593996524811,
"rewards/margins_max": 0.14523643255233765,
"rewards/margins_min": 0.07101544737815857,
"rewards/margins_std": 0.05248216539621353,
"rewards/rejected": -0.16690510511398315,
"step": 1250
},
{
"epoch": 0.52,
"grad_norm": 0.5625,
"learning_rate": 2.774175148531329e-07,
"logits/chosen": -0.18015912175178528,
"logits/rejected": 0.0010575338965281844,
"logps/chosen": -221.28512573242188,
"logps/rejected": -250.00222778320312,
"loss": 0.6442,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.05312822386622429,
"rewards/margins": 0.11167536675930023,
"rewards/margins_max": 0.15390248596668243,
"rewards/margins_min": 0.06944824755191803,
"rewards/margins_std": 0.05971817299723625,
"rewards/rejected": -0.16480359435081482,
"step": 1260
},
{
"epoch": 0.52,
"grad_norm": 0.474609375,
"learning_rate": 2.738452584482617e-07,
"logits/chosen": -0.1896088570356369,
"logits/rejected": 0.06044679880142212,
"logps/chosen": -195.58468627929688,
"logps/rejected": -227.13525390625,
"loss": 0.6419,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.052375711500644684,
"rewards/margins": 0.11357314884662628,
"rewards/margins_max": 0.15889129042625427,
"rewards/margins_min": 0.0682549923658371,
"rewards/margins_std": 0.06408955156803131,
"rewards/rejected": -0.16594885289669037,
"step": 1270
},
{
"epoch": 0.53,
"grad_norm": 0.47265625,
"learning_rate": 2.702680816849091e-07,
"logits/chosen": -0.15573439002037048,
"logits/rejected": 0.09830964356660843,
"logps/chosen": -217.51754760742188,
"logps/rejected": -240.4334259033203,
"loss": 0.6443,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.06747113913297653,
"rewards/margins": 0.10181452333927155,
"rewards/margins_max": 0.14106084406375885,
"rewards/margins_min": 0.06256819516420364,
"rewards/margins_std": 0.055502694100141525,
"rewards/rejected": -0.1692856401205063,
"step": 1280
},
{
"epoch": 0.53,
"grad_norm": 0.5234375,
"learning_rate": 2.666867226969087e-07,
"logits/chosen": -0.15920008718967438,
"logits/rejected": 0.07830671966075897,
"logps/chosen": -216.7744903564453,
"logps/rejected": -231.097412109375,
"loss": 0.6481,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.06382476538419724,
"rewards/margins": 0.08724324405193329,
"rewards/margins_max": 0.1375073939561844,
"rewards/margins_min": 0.03697910159826279,
"rewards/margins_std": 0.07108423113822937,
"rewards/rejected": -0.15106801688671112,
"step": 1290
},
{
"epoch": 0.53,
"grad_norm": 0.51171875,
"learning_rate": 2.631019204810763e-07,
"logits/chosen": -0.15687043964862823,
"logits/rejected": -0.0006535470602102578,
"logps/chosen": -218.9697265625,
"logps/rejected": -246.04977416992188,
"loss": 0.6462,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.061477065086364746,
"rewards/margins": 0.08852384984493256,
"rewards/margins_max": 0.1352422535419464,
"rewards/margins_min": 0.0418054573237896,
"rewards/margins_std": 0.06606978923082352,
"rewards/rejected": -0.1500009298324585,
"step": 1300
},
{
"epoch": 0.54,
"grad_norm": 0.5078125,
"learning_rate": 2.5951441474472206e-07,
"logits/chosen": -0.19294488430023193,
"logits/rejected": 0.11017533391714096,
"logps/chosen": -235.78421020507812,
"logps/rejected": -242.24685668945312,
"loss": 0.6419,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.05141814798116684,
"rewards/margins": 0.11200641095638275,
"rewards/margins_max": 0.1588469296693802,
"rewards/margins_min": 0.06516589224338531,
"rewards/margins_std": 0.06624249368906021,
"rewards/rejected": -0.163424551486969,
"step": 1310
},
{
"epoch": 0.54,
"grad_norm": 0.5546875,
"learning_rate": 2.5592494575301533e-07,
"logits/chosen": -0.1245236024260521,
"logits/rejected": 0.016547679901123047,
"logps/chosen": -225.09353637695312,
"logps/rejected": -267.2383117675781,
"loss": 0.6414,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.053593188524246216,
"rewards/margins": 0.10814990848302841,
"rewards/margins_max": 0.14369070529937744,
"rewards/margins_min": 0.07260910421609879,
"rewards/margins_std": 0.050262290984392166,
"rewards/rejected": -0.16174308955669403,
"step": 1320
},
{
"epoch": 0.55,
"grad_norm": 0.51171875,
"learning_rate": 2.523342541762335e-07,
"logits/chosen": -0.20755529403686523,
"logits/rejected": 0.07851056009531021,
"logps/chosen": -198.47854614257812,
"logps/rejected": -213.96701049804688,
"loss": 0.6438,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.05272821709513664,
"rewards/margins": 0.11356940120458603,
"rewards/margins_max": 0.1586931347846985,
"rewards/margins_min": 0.06844566762447357,
"rewards/margins_std": 0.06381459534168243,
"rewards/rejected": -0.16629762947559357,
"step": 1330
},
{
"epoch": 0.55,
"grad_norm": 0.5234375,
"learning_rate": 2.487430809369293e-07,
"logits/chosen": -0.048032622784376144,
"logits/rejected": 0.1702868640422821,
"logps/chosen": -214.9596405029297,
"logps/rejected": -229.90847778320312,
"loss": 0.6458,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.058893002569675446,
"rewards/margins": 0.10679246485233307,
"rewards/margins_max": 0.15617462992668152,
"rewards/margins_min": 0.057410307228565216,
"rewards/margins_std": 0.06983692944049835,
"rewards/rejected": -0.1656854748725891,
"step": 1340
},
{
"epoch": 0.56,
"grad_norm": 0.53515625,
"learning_rate": 2.4515216705704393e-07,
"logits/chosen": -0.21574148535728455,
"logits/rejected": 0.07479486614465714,
"logps/chosen": -235.31396484375,
"logps/rejected": -240.1641845703125,
"loss": 0.6435,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.057291556149721146,
"rewards/margins": 0.11093351989984512,
"rewards/margins_max": 0.15275637805461884,
"rewards/margins_min": 0.06911066174507141,
"rewards/margins_std": 0.0591464526951313,
"rewards/rejected": -0.16822507977485657,
"step": 1350
},
{
"epoch": 0.56,
"grad_norm": 0.56640625,
"learning_rate": 2.415622535050009e-07,
"logits/chosen": -0.17604181170463562,
"logits/rejected": 0.05615830421447754,
"logps/chosen": -204.60494995117188,
"logps/rejected": -226.7333984375,
"loss": 0.6454,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.05647622421383858,
"rewards/margins": 0.09753639996051788,
"rewards/margins_max": 0.1477840393781662,
"rewards/margins_min": 0.04728874936699867,
"rewards/margins_std": 0.0710608959197998,
"rewards/rejected": -0.15401262044906616,
"step": 1360
},
{
"epoch": 0.56,
"grad_norm": 0.4453125,
"learning_rate": 2.379740810428111e-07,
"logits/chosen": -0.1344299018383026,
"logits/rejected": 0.13296538591384888,
"logps/chosen": -205.1040496826172,
"logps/rejected": -212.88705444335938,
"loss": 0.6458,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.059041477739810944,
"rewards/margins": 0.09553243964910507,
"rewards/margins_max": 0.14307790994644165,
"rewards/margins_min": 0.0479869581758976,
"rewards/margins_std": 0.06723945587873459,
"rewards/rejected": -0.15457391738891602,
"step": 1370
},
{
"epoch": 0.57,
"grad_norm": 0.486328125,
"learning_rate": 2.3438839007321936e-07,
"logits/chosen": -0.11420653760433197,
"logits/rejected": 0.11103509366512299,
"logps/chosen": -222.06143188476562,
"logps/rejected": -249.2257843017578,
"loss": 0.6423,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.04806683585047722,
"rewards/margins": 0.1108899936079979,
"rewards/margins_max": 0.1588352620601654,
"rewards/margins_min": 0.06294471770524979,
"rewards/margins_std": 0.06780485063791275,
"rewards/rejected": -0.15895681083202362,
"step": 1380
},
{
"epoch": 0.57,
"grad_norm": 0.494140625,
"learning_rate": 2.3080592048692593e-07,
"logits/chosen": -0.18425148725509644,
"logits/rejected": -0.025905439630150795,
"logps/chosen": -223.4422149658203,
"logps/rejected": -251.49514770507812,
"loss": 0.6462,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.06482952833175659,
"rewards/margins": 0.09514541923999786,
"rewards/margins_max": 0.1500011533498764,
"rewards/margins_min": 0.04028966277837753,
"rewards/margins_std": 0.07757773995399475,
"rewards/rejected": -0.15997494757175446,
"step": 1390
},
{
"epoch": 0.58,
"grad_norm": 0.49609375,
"learning_rate": 2.2722741150991376e-07,
"logits/chosen": -0.16047583520412445,
"logits/rejected": 0.10932193696498871,
"logps/chosen": -214.70315551757812,
"logps/rejected": -212.7730712890625,
"loss": 0.649,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.0567842535674572,
"rewards/margins": 0.09505367279052734,
"rewards/margins_max": 0.13385465741157532,
"rewards/margins_min": 0.05625268071889877,
"rewards/margins_std": 0.05487288907170296,
"rewards/rejected": -0.15183793008327484,
"step": 1400
},
{
"epoch": 0.58,
"grad_norm": 0.4609375,
"learning_rate": 2.2365360155091238e-07,
"logits/chosen": -0.11961637437343597,
"logits/rejected": 0.0894673764705658,
"logps/chosen": -209.631103515625,
"logps/rejected": -264.8625183105469,
"loss": 0.6374,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.05508565157651901,
"rewards/margins": 0.12142980098724365,
"rewards/margins_max": 0.17250314354896545,
"rewards/margins_min": 0.07035643607378006,
"rewards/margins_std": 0.07222862541675568,
"rewards/rejected": -0.17651543021202087,
"step": 1410
},
{
"epoch": 0.58,
"grad_norm": 0.515625,
"learning_rate": 2.2008522804903062e-07,
"logits/chosen": -0.0964190810918808,
"logits/rejected": 0.10529766976833344,
"logps/chosen": -212.2164306640625,
"logps/rejected": -232.51168823242188,
"loss": 0.6436,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.04842360317707062,
"rewards/margins": 0.10283006727695465,
"rewards/margins_max": 0.14770516753196716,
"rewards/margins_min": 0.05795495584607124,
"rewards/margins_std": 0.063462994992733,
"rewards/rejected": -0.15125367045402527,
"step": 1420
},
{
"epoch": 0.59,
"grad_norm": 0.494140625,
"learning_rate": 2.1652302732158988e-07,
"logits/chosen": -0.15599027276039124,
"logits/rejected": -0.01368700247257948,
"logps/chosen": -196.22361755371094,
"logps/rejected": -246.3559112548828,
"loss": 0.6417,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.06172167509794235,
"rewards/margins": 0.10635526478290558,
"rewards/margins_max": 0.1531476378440857,
"rewards/margins_min": 0.05956289917230606,
"rewards/margins_std": 0.06617439538240433,
"rewards/rejected": -0.16807694733142853,
"step": 1430
},
{
"epoch": 0.59,
"grad_norm": 0.54296875,
"learning_rate": 2.1296773441218785e-07,
"logits/chosen": -0.11377346515655518,
"logits/rejected": 0.15727418661117554,
"logps/chosen": -188.34353637695312,
"logps/rejected": -210.76950073242188,
"loss": 0.6412,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.05929671972990036,
"rewards/margins": 0.10617075115442276,
"rewards/margins_max": 0.15472009778022766,
"rewards/margins_min": 0.05762138217687607,
"rewards/margins_std": 0.06865915656089783,
"rewards/rejected": -0.16546745598316193,
"step": 1440
},
{
"epoch": 0.6,
"grad_norm": 0.50390625,
"learning_rate": 2.094200829390262e-07,
"logits/chosen": -0.09401213377714157,
"logits/rejected": 0.049455929547548294,
"logps/chosen": -235.5387725830078,
"logps/rejected": -273.2581787109375,
"loss": 0.644,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.0620415136218071,
"rewards/margins": 0.10562853515148163,
"rewards/margins_max": 0.1506195366382599,
"rewards/margins_min": 0.06063752621412277,
"rewards/margins_std": 0.06362690031528473,
"rewards/rejected": -0.16767004132270813,
"step": 1450
},
{
"epoch": 0.6,
"grad_norm": 0.52734375,
"learning_rate": 2.0588080494353172e-07,
"logits/chosen": -0.17110076546669006,
"logits/rejected": 0.08025307953357697,
"logps/chosen": -227.67440795898438,
"logps/rejected": -259.4073486328125,
"loss": 0.641,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.05723772570490837,
"rewards/margins": 0.10702015459537506,
"rewards/margins_max": 0.14093990623950958,
"rewards/margins_min": 0.07310040295124054,
"rewards/margins_std": 0.04796977713704109,
"rewards/rejected": -0.16425786912441254,
"step": 1460
},
{
"epoch": 0.6,
"grad_norm": 0.52734375,
"learning_rate": 2.0235063073930276e-07,
"logits/chosen": -0.09825171530246735,
"logits/rejected": 0.13015155494213104,
"logps/chosen": -212.82754516601562,
"logps/rejected": -232.27713012695312,
"loss": 0.6429,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.0580797903239727,
"rewards/margins": 0.10242825746536255,
"rewards/margins_max": 0.13443560898303986,
"rewards/margins_min": 0.07042091339826584,
"rewards/margins_std": 0.045265212655067444,
"rewards/rejected": -0.16050805151462555,
"step": 1470
},
{
"epoch": 0.61,
"grad_norm": 0.466796875,
"learning_rate": 1.9883028876141266e-07,
"logits/chosen": -0.15827712416648865,
"logits/rejected": 0.0378829650580883,
"logps/chosen": -208.70877075195312,
"logps/rejected": -225.761474609375,
"loss": 0.6474,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.06385836750268936,
"rewards/margins": 0.09914499521255493,
"rewards/margins_max": 0.14492857456207275,
"rewards/margins_min": 0.05336139351129532,
"rewards/margins_std": 0.06474778801202774,
"rewards/rejected": -0.1630033552646637,
"step": 1480
},
{
"epoch": 0.61,
"grad_norm": 0.4765625,
"learning_rate": 1.9532050541610058e-07,
"logits/chosen": -0.08087868988513947,
"logits/rejected": 0.025525391101837158,
"logps/chosen": -202.79037475585938,
"logps/rejected": -251.94650268554688,
"loss": 0.6458,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.06054714322090149,
"rewards/margins": 0.1071493998169899,
"rewards/margins_max": 0.15883831679821014,
"rewards/margins_min": 0.055460475385189056,
"rewards/margins_std": 0.07309918105602264,
"rewards/rejected": -0.16769655048847198,
"step": 1490
},
{
"epoch": 0.62,
"grad_norm": 0.37890625,
"learning_rate": 1.9182200493088052e-07,
"logits/chosen": -0.16447165608406067,
"logits/rejected": 0.11063267290592194,
"logps/chosen": -216.5113067626953,
"logps/rejected": -230.7987823486328,
"loss": 0.6439,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.0644717812538147,
"rewards/margins": 0.09193485975265503,
"rewards/margins_max": 0.1328367441892624,
"rewards/margins_min": 0.05103297159075737,
"rewards/margins_std": 0.057844001799821854,
"rewards/rejected": -0.15640662610530853,
"step": 1500
},
{
"epoch": 0.62,
"grad_norm": 0.423828125,
"learning_rate": 1.883355092051009e-07,
"logits/chosen": -0.07868603616952896,
"logits/rejected": 0.05034567043185234,
"logps/chosen": -218.24722290039062,
"logps/rejected": -260.0995788574219,
"loss": 0.6433,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.052408766001462936,
"rewards/margins": 0.11046520620584488,
"rewards/margins_max": 0.15805724263191223,
"rewards/margins_min": 0.06287316977977753,
"rewards/margins_std": 0.0673053115606308,
"rewards/rejected": -0.1628739833831787,
"step": 1510
},
{
"epoch": 0.63,
"grad_norm": 0.5078125,
"learning_rate": 1.8486173766098362e-07,
"logits/chosen": -0.14829647541046143,
"logits/rejected": 0.0757719874382019,
"logps/chosen": -204.81362915039062,
"logps/rejected": -235.73849487304688,
"loss": 0.6435,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.04796791076660156,
"rewards/margins": 0.09321852028369904,
"rewards/margins_max": 0.12461745738983154,
"rewards/margins_min": 0.061819594353437424,
"rewards/margins_std": 0.044404786080121994,
"rewards/rejected": -0.1411864459514618,
"step": 1520
},
{
"epoch": 0.63,
"grad_norm": 0.490234375,
"learning_rate": 1.8140140709517465e-07,
"logits/chosen": -0.07676380127668381,
"logits/rejected": 0.15969929099082947,
"logps/chosen": -228.54879760742188,
"logps/rejected": -246.28616333007812,
"loss": 0.6461,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.0546930655837059,
"rewards/margins": 0.0984901636838913,
"rewards/margins_max": 0.13989897072315216,
"rewards/margins_min": 0.057081352919340134,
"rewards/margins_std": 0.058560896664857864,
"rewards/rejected": -0.1531832069158554,
"step": 1530
},
{
"epoch": 0.63,
"grad_norm": 0.5078125,
"learning_rate": 1.7795523153083653e-07,
"logits/chosen": -0.08178448677062988,
"logits/rejected": 0.004074615426361561,
"logps/chosen": -195.57810974121094,
"logps/rejected": -247.9669647216797,
"loss": 0.6498,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.05888283997774124,
"rewards/margins": 0.09247585386037827,
"rewards/margins_max": 0.14980639517307281,
"rewards/margins_min": 0.035145316272974014,
"rewards/margins_std": 0.08107762038707733,
"rewards/rejected": -0.1513586789369583,
"step": 1540
},
{
"epoch": 0.64,
"grad_norm": 0.51171875,
"learning_rate": 1.7452392207031286e-07,
"logits/chosen": -0.10405842959880829,
"logits/rejected": -0.0477495901286602,
"logps/chosen": -199.43121337890625,
"logps/rejected": -295.82403564453125,
"loss": 0.6435,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.04707425460219383,
"rewards/margins": 0.11261805146932602,
"rewards/margins_max": 0.15691080689430237,
"rewards/margins_min": 0.06832531839609146,
"rewards/margins_std": 0.06263939291238785,
"rewards/rejected": -0.15969231724739075,
"step": 1550
},
{
"epoch": 0.64,
"grad_norm": 0.474609375,
"learning_rate": 1.7110818674839563e-07,
"logits/chosen": -0.08448558300733566,
"logits/rejected": 0.16416208446025848,
"logps/chosen": -202.5157012939453,
"logps/rejected": -211.07785034179688,
"loss": 0.6466,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.059366196393966675,
"rewards/margins": 0.08933896571397781,
"rewards/margins_max": 0.13254401087760925,
"rewards/margins_min": 0.046133920550346375,
"rewards/margins_std": 0.06110116094350815,
"rewards/rejected": -0.1487051546573639,
"step": 1560
},
{
"epoch": 0.65,
"grad_norm": 0.404296875,
"learning_rate": 1.6770873038622562e-07,
"logits/chosen": -0.12733057141304016,
"logits/rejected": 0.0979214534163475,
"logps/chosen": -226.008056640625,
"logps/rejected": -237.6015167236328,
"loss": 0.6494,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.06018907576799393,
"rewards/margins": 0.08392681181430817,
"rewards/margins_max": 0.11851127445697784,
"rewards/margins_min": 0.049342334270477295,
"rewards/margins_std": 0.04890982061624527,
"rewards/rejected": -0.1441158801317215,
"step": 1570
},
{
"epoch": 0.65,
"grad_norm": 0.5,
"learning_rate": 1.643262544458558e-07,
"logits/chosen": -0.1266459971666336,
"logits/rejected": 0.1119358167052269,
"logps/chosen": -248.8621826171875,
"logps/rejected": -259.1594543457031,
"loss": 0.6477,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.06370563060045242,
"rewards/margins": 0.09041455388069153,
"rewards/margins_max": 0.12537343800067902,
"rewards/margins_min": 0.05545566603541374,
"rewards/margins_std": 0.04943932592868805,
"rewards/rejected": -0.15412016212940216,
"step": 1580
},
{
"epoch": 0.65,
"grad_norm": 0.490234375,
"learning_rate": 1.6096145688550772e-07,
"logits/chosen": -0.13403485715389252,
"logits/rejected": 0.11216270923614502,
"logps/chosen": -197.7595672607422,
"logps/rejected": -214.38229370117188,
"loss": 0.649,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.06467537581920624,
"rewards/margins": 0.08191889524459839,
"rewards/margins_max": 0.12750712037086487,
"rewards/margins_min": 0.03633067384362221,
"rewards/margins_std": 0.06447147578001022,
"rewards/rejected": -0.14659425616264343,
"step": 1590
},
{
"epoch": 0.66,
"grad_norm": 0.478515625,
"learning_rate": 1.5761503201555138e-07,
"logits/chosen": -0.14292378723621368,
"logits/rejected": 0.06406668573617935,
"logps/chosen": -210.74838256835938,
"logps/rejected": -227.1605987548828,
"loss": 0.6452,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.0532594732940197,
"rewards/margins": 0.09269430488348007,
"rewards/margins_max": 0.1299545019865036,
"rewards/margins_min": 0.05543411523103714,
"rewards/margins_std": 0.05269387364387512,
"rewards/rejected": -0.14595378935337067,
"step": 1600
},
{
"epoch": 0.66,
"grad_norm": 0.4765625,
"learning_rate": 1.542876703552372e-07,
"logits/chosen": -0.10188720375299454,
"logits/rejected": 0.10029338300228119,
"logps/chosen": -222.6920928955078,
"logps/rejected": -241.28536987304688,
"loss": 0.6436,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.06753533333539963,
"rewards/margins": 0.10507749021053314,
"rewards/margins_max": 0.1470262110233307,
"rewards/margins_min": 0.06312878429889679,
"rewards/margins_std": 0.05932443216443062,
"rewards/rejected": -0.17261283099651337,
"step": 1610
},
{
"epoch": 0.67,
"grad_norm": 0.453125,
"learning_rate": 1.5098005849021078e-07,
"logits/chosen": -0.14857852458953857,
"logits/rejected": 0.04540370777249336,
"logps/chosen": -213.944580078125,
"logps/rejected": -238.8897247314453,
"loss": 0.6391,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.05818655341863632,
"rewards/margins": 0.10537393391132355,
"rewards/margins_max": 0.14596855640411377,
"rewards/margins_min": 0.06477929651737213,
"rewards/margins_std": 0.057409483939409256,
"rewards/rejected": -0.16356047987937927,
"step": 1620
},
{
"epoch": 0.67,
"grad_norm": 0.51171875,
"learning_rate": 1.4769287893083905e-07,
"logits/chosen": -0.20055250823497772,
"logits/rejected": 0.12372901290655136,
"logps/chosen": -214.1376190185547,
"logps/rejected": -244.47225952148438,
"loss": 0.6379,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.05139562487602234,
"rewards/margins": 0.1305696666240692,
"rewards/margins_max": 0.1713367998600006,
"rewards/margins_min": 0.08980251848697662,
"rewards/margins_std": 0.05765343829989433,
"rewards/rejected": -0.18196527659893036,
"step": 1630
},
{
"epoch": 0.67,
"grad_norm": 0.474609375,
"learning_rate": 1.444268099713775e-07,
"logits/chosen": -0.17057690024375916,
"logits/rejected": 0.061030395328998566,
"logps/chosen": -211.06912231445312,
"logps/rejected": -238.83682250976562,
"loss": 0.6464,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.0555347204208374,
"rewards/margins": 0.097480908036232,
"rewards/margins_max": 0.15012948215007782,
"rewards/margins_min": 0.044832345098257065,
"rewards/margins_std": 0.07445631921291351,
"rewards/rejected": -0.1530156284570694,
"step": 1640
},
{
"epoch": 0.68,
"grad_norm": 0.45703125,
"learning_rate": 1.411825255500071e-07,
"logits/chosen": -0.13909710943698883,
"logits/rejected": 0.012036198750138283,
"logps/chosen": -192.96359252929688,
"logps/rejected": -238.0941925048828,
"loss": 0.6473,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.05943521112203598,
"rewards/margins": 0.09797738492488861,
"rewards/margins_max": 0.13452477753162384,
"rewards/margins_min": 0.06142998859286308,
"rewards/margins_std": 0.051685821264982224,
"rewards/rejected": -0.1574126034975052,
"step": 1650
},
{
"epoch": 0.68,
"grad_norm": 0.470703125,
"learning_rate": 1.379606951097705e-07,
"logits/chosen": -0.14935798943042755,
"logits/rejected": 0.03216198831796646,
"logps/chosen": -193.1946563720703,
"logps/rejected": -228.28125,
"loss": 0.643,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.05348304659128189,
"rewards/margins": 0.10333843529224396,
"rewards/margins_max": 0.14901982247829437,
"rewards/margins_min": 0.057657063007354736,
"rewards/margins_std": 0.06460321694612503,
"rewards/rejected": -0.15682148933410645,
"step": 1660
},
{
"epoch": 0.69,
"grad_norm": 0.455078125,
"learning_rate": 1.3476198346043553e-07,
"logits/chosen": -0.18985338509082794,
"logits/rejected": 0.12341825664043427,
"logps/chosen": -224.2623748779297,
"logps/rejected": -220.1488037109375,
"loss": 0.6435,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.059168003499507904,
"rewards/margins": 0.09694478660821915,
"rewards/margins_max": 0.14475250244140625,
"rewards/margins_min": 0.049137067049741745,
"rewards/margins_std": 0.06761031597852707,
"rewards/rejected": -0.15611279010772705,
"step": 1670
},
{
"epoch": 0.69,
"grad_norm": 0.412109375,
"learning_rate": 1.3158705064131477e-07,
"logits/chosen": -0.13782618939876556,
"logits/rejected": 0.008584958501160145,
"logps/chosen": -205.80709838867188,
"logps/rejected": -230.9507293701172,
"loss": 0.6432,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.05955750495195389,
"rewards/margins": 0.10197343677282333,
"rewards/margins_max": 0.1481003761291504,
"rewards/margins_min": 0.05584648996591568,
"rewards/margins_std": 0.06523334980010986,
"rewards/rejected": -0.16153094172477722,
"step": 1680
},
{
"epoch": 0.7,
"grad_norm": 0.50390625,
"learning_rate": 1.2843655178506943e-07,
"logits/chosen": -0.18119294941425323,
"logits/rejected": 0.03453055024147034,
"logps/chosen": -224.9883270263672,
"logps/rejected": -257.1782531738281,
"loss": 0.642,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.05576961115002632,
"rewards/margins": 0.10703034698963165,
"rewards/margins_max": 0.15220855176448822,
"rewards/margins_min": 0.06185212731361389,
"rewards/margins_std": 0.06389166414737701,
"rewards/rejected": -0.16279995441436768,
"step": 1690
},
{
"epoch": 0.7,
"grad_norm": 0.52734375,
"learning_rate": 1.2531113698252565e-07,
"logits/chosen": -0.2272913008928299,
"logits/rejected": 0.0765170305967331,
"logps/chosen": -208.4404296875,
"logps/rejected": -214.04281616210938,
"loss": 0.645,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.0450308695435524,
"rewards/margins": 0.10445519536733627,
"rewards/margins_max": 0.14625009894371033,
"rewards/margins_min": 0.06266029924154282,
"rewards/margins_std": 0.05910690873861313,
"rewards/rejected": -0.14948606491088867,
"step": 1700
},
{
"epoch": 0.7,
"grad_norm": 0.478515625,
"learning_rate": 1.222114511485317e-07,
"logits/chosen": -0.12483775615692139,
"logits/rejected": 0.09466644376516342,
"logps/chosen": -208.94430541992188,
"logps/rejected": -255.8892059326172,
"loss": 0.6405,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.05880628898739815,
"rewards/margins": 0.11126656830310822,
"rewards/margins_max": 0.16315510869026184,
"rewards/margins_min": 0.059378039091825485,
"rewards/margins_std": 0.0733814612030983,
"rewards/rejected": -0.17007283866405487,
"step": 1710
},
{
"epoch": 0.71,
"grad_norm": 0.51171875,
"learning_rate": 1.191381338888825e-07,
"logits/chosen": -0.13245120644569397,
"logits/rejected": 0.10843801498413086,
"logps/chosen": -204.6431121826172,
"logps/rejected": -226.485595703125,
"loss": 0.6498,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.05664772912859917,
"rewards/margins": 0.09273017942905426,
"rewards/margins_max": 0.1266176402568817,
"rewards/margins_min": 0.0588427297770977,
"rewards/margins_std": 0.04792410135269165,
"rewards/rejected": -0.14937791228294373,
"step": 1720
},
{
"epoch": 0.71,
"grad_norm": 0.443359375,
"learning_rate": 1.1609181936833965e-07,
"logits/chosen": -0.09875744581222534,
"logits/rejected": 0.03166166692972183,
"logps/chosen": -204.11947631835938,
"logps/rejected": -255.87258911132812,
"loss": 0.6417,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.05774586275219917,
"rewards/margins": 0.10740862786769867,
"rewards/margins_max": 0.1617855578660965,
"rewards/margins_min": 0.053031690418720245,
"rewards/margins_std": 0.07690059393644333,
"rewards/rejected": -0.16515448689460754,
"step": 1730
},
{
"epoch": 0.72,
"grad_norm": 0.41796875,
"learning_rate": 1.1307313617977512e-07,
"logits/chosen": -0.1509208381175995,
"logits/rejected": 0.09706972539424896,
"logps/chosen": -201.7398681640625,
"logps/rejected": -210.61923217773438,
"loss": 0.6443,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.06123855710029602,
"rewards/margins": 0.09280522167682648,
"rewards/margins_max": 0.14272518455982208,
"rewards/margins_min": 0.04288526624441147,
"rewards/margins_std": 0.07059746980667114,
"rewards/rejected": -0.1540437638759613,
"step": 1740
},
{
"epoch": 0.72,
"grad_norm": 0.43359375,
"learning_rate": 1.1008270721446358e-07,
"logits/chosen": -0.15949265658855438,
"logits/rejected": 0.1050555557012558,
"logps/chosen": -207.06884765625,
"logps/rejected": -211.36892700195312,
"loss": 0.6447,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.05307169631123543,
"rewards/margins": 0.10218322277069092,
"rewards/margins_max": 0.14148741960525513,
"rewards/margins_min": 0.06287900358438492,
"rewards/margins_std": 0.05558454990386963,
"rewards/rejected": -0.15525491535663605,
"step": 1750
},
{
"epoch": 0.72,
"grad_norm": 0.46875,
"learning_rate": 1.071211495335518e-07,
"logits/chosen": -0.09772120416164398,
"logits/rejected": 0.08581139892339706,
"logps/chosen": -212.9148712158203,
"logps/rejected": -235.0979461669922,
"loss": 0.6422,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.064021997153759,
"rewards/margins": 0.10210440307855606,
"rewards/margins_max": 0.14792828261852264,
"rewards/margins_min": 0.05628051236271858,
"rewards/margins_std": 0.06480477005243301,
"rewards/rejected": -0.16612640023231506,
"step": 1760
},
{
"epoch": 0.73,
"grad_norm": 0.498046875,
"learning_rate": 1.0418907424073081e-07,
"logits/chosen": -0.1678512841463089,
"logits/rejected": 0.04902017116546631,
"logps/chosen": -200.6660614013672,
"logps/rejected": -223.4239959716797,
"loss": 0.6436,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.05520619824528694,
"rewards/margins": 0.0991910919547081,
"rewards/margins_max": 0.13650527596473694,
"rewards/margins_min": 0.06187691539525986,
"rewards/margins_std": 0.052770208567380905,
"rewards/rejected": -0.15439727902412415,
"step": 1770
},
{
"epoch": 0.73,
"grad_norm": 0.4765625,
"learning_rate": 1.012870863561377e-07,
"logits/chosen": -0.10301700979471207,
"logits/rejected": 0.13756810128688812,
"logps/chosen": -206.919677734375,
"logps/rejected": -227.0911865234375,
"loss": 0.6498,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.06356682628393173,
"rewards/margins": 0.07415839284658432,
"rewards/margins_max": 0.11350224167108536,
"rewards/margins_min": 0.03481454402208328,
"rewards/margins_std": 0.055640608072280884,
"rewards/rejected": -0.13772521913051605,
"step": 1780
},
{
"epoch": 0.74,
"grad_norm": 0.416015625,
"learning_rate": 9.84157846915124e-08,
"logits/chosen": -0.19909122586250305,
"logits/rejected": 0.05684971809387207,
"logps/chosen": -213.03915405273438,
"logps/rejected": -251.5517120361328,
"loss": 0.639,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.05604202672839165,
"rewards/margins": 0.11602386087179184,
"rewards/margins_max": 0.16751208901405334,
"rewards/margins_min": 0.06453560292720795,
"rewards/margins_std": 0.07281537353992462,
"rewards/rejected": -0.1720658838748932,
"step": 1790
},
{
"epoch": 0.74,
"grad_norm": 0.51953125,
"learning_rate": 9.557576172663575e-08,
"logits/chosen": -0.11178640276193619,
"logits/rejected": 0.04836495593190193,
"logps/chosen": -208.6399688720703,
"logps/rejected": -242.5208740234375,
"loss": 0.6439,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.06215709447860718,
"rewards/margins": 0.1025010198354721,
"rewards/margins_max": 0.1522974669933319,
"rewards/margins_min": 0.0527045838534832,
"rewards/margins_std": 0.07042279839515686,
"rewards/rejected": -0.16465812921524048,
"step": 1800
},
{
"epoch": 0.74,
"grad_norm": 0.419921875,
"learning_rate": 9.276760348707389e-08,
"logits/chosen": -0.20817360281944275,
"logits/rejected": 0.15238900482654572,
"logps/chosen": -236.2836456298828,
"logps/rejected": -212.127685546875,
"loss": 0.6415,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.06291679292917252,
"rewards/margins": 0.1133137121796608,
"rewards/margins_max": 0.15874022245407104,
"rewards/margins_min": 0.06788720935583115,
"rewards/margins_std": 0.06424277275800705,
"rewards/rejected": -0.17623049020767212,
"step": 1810
},
{
"epoch": 0.75,
"grad_norm": 0.466796875,
"learning_rate": 8.999188942325475e-08,
"logits/chosen": -0.10632741451263428,
"logits/rejected": 0.12391182035207748,
"logps/chosen": -215.11575317382812,
"logps/rejected": -262.51446533203125,
"loss": 0.6436,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.05760825797915459,
"rewards/margins": 0.10929699242115021,
"rewards/margins_max": 0.15289412438869476,
"rewards/margins_min": 0.06569983065128326,
"rewards/margins_std": 0.0616556778550148,
"rewards/rejected": -0.1669052541255951,
"step": 1820
},
{
"epoch": 0.75,
"grad_norm": 0.5,
"learning_rate": 8.724919229090094e-08,
"logits/chosen": -0.17001786828041077,
"logits/rejected": 0.017346305772662163,
"logps/chosen": -218.0846710205078,
"logps/rejected": -244.435546875,
"loss": 0.6445,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.059084583073854446,
"rewards/margins": 0.0965479165315628,
"rewards/margins_max": 0.14651286602020264,
"rewards/margins_min": 0.04658297449350357,
"rewards/margins_std": 0.07066110521554947,
"rewards/rejected": -0.15563251078128815,
"step": 1830
},
{
"epoch": 0.76,
"grad_norm": 0.458984375,
"learning_rate": 8.454007803284452e-08,
"logits/chosen": -0.10606809705495834,
"logits/rejected": 0.05775570124387741,
"logps/chosen": -199.2654266357422,
"logps/rejected": -245.3488311767578,
"loss": 0.6463,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.051395904272794724,
"rewards/margins": 0.10947608947753906,
"rewards/margins_max": 0.15262414515018463,
"rewards/margins_min": 0.0663280338048935,
"rewards/margins_std": 0.061020564287900925,
"rewards/rejected": -0.16087199747562408,
"step": 1840
},
{
"epoch": 0.76,
"grad_norm": 0.451171875,
"learning_rate": 8.186510566224725e-08,
"logits/chosen": -0.15745623409748077,
"logits/rejected": 0.06878992170095444,
"logps/chosen": -212.1492462158203,
"logps/rejected": -233.72598266601562,
"loss": 0.6457,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -0.050053782761096954,
"rewards/margins": 0.11182372272014618,
"rewards/margins_max": 0.1652067005634308,
"rewards/margins_min": 0.05844072625041008,
"rewards/margins_std": 0.07549495995044708,
"rewards/rejected": -0.16187749803066254,
"step": 1850
},
{
"epoch": 0.77,
"grad_norm": 0.447265625,
"learning_rate": 7.922482714725065e-08,
"logits/chosen": -0.12382777780294418,
"logits/rejected": 0.10572858899831772,
"logps/chosen": -218.09371948242188,
"logps/rejected": -235.0614013671875,
"loss": 0.6419,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.058691591024398804,
"rewards/margins": 0.10704471170902252,
"rewards/margins_max": 0.14886514842510223,
"rewards/margins_min": 0.06522427499294281,
"rewards/margins_std": 0.05914302542805672,
"rewards/rejected": -0.16573630273342133,
"step": 1860
},
{
"epoch": 0.77,
"grad_norm": 0.486328125,
"learning_rate": 7.661978729708013e-08,
"logits/chosen": -0.1819213330745697,
"logits/rejected": 0.13269567489624023,
"logps/chosen": -210.587158203125,
"logps/rejected": -200.8339385986328,
"loss": 0.6497,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.0586087591946125,
"rewards/margins": 0.08718381822109222,
"rewards/margins_max": 0.12654808163642883,
"rewards/margins_min": 0.04781955108046532,
"rewards/margins_std": 0.05566948652267456,
"rewards/rejected": -0.14579257369041443,
"step": 1870
},
{
"epoch": 0.77,
"grad_norm": 0.5078125,
"learning_rate": 7.405052364962603e-08,
"logits/chosen": -0.13600589334964752,
"logits/rejected": 0.14749519526958466,
"logps/chosen": -207.6585693359375,
"logps/rejected": -238.1978759765625,
"loss": 0.64,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.05971568822860718,
"rewards/margins": 0.11025593429803848,
"rewards/margins_max": 0.15179547667503357,
"rewards/margins_min": 0.0687163919210434,
"rewards/margins_std": 0.058745790272951126,
"rewards/rejected": -0.16997162997722626,
"step": 1880
},
{
"epoch": 0.78,
"grad_norm": 0.640625,
"learning_rate": 7.151756636052527e-08,
"logits/chosen": -0.15209710597991943,
"logits/rejected": 0.15300539135932922,
"logps/chosen": -214.08145141601562,
"logps/rejected": -228.1832733154297,
"loss": 0.6377,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.05681489780545235,
"rewards/margins": 0.114329494535923,
"rewards/margins_max": 0.16191419959068298,
"rewards/margins_min": 0.06674476712942123,
"rewards/margins_std": 0.06729496270418167,
"rewards/rejected": -0.17114439606666565,
"step": 1890
},
{
"epoch": 0.78,
"grad_norm": 0.447265625,
"learning_rate": 6.902143809376593e-08,
"logits/chosen": -0.02033737674355507,
"logits/rejected": 0.07773645222187042,
"logps/chosen": -199.2742462158203,
"logps/rejected": -251.02969360351562,
"loss": 0.6436,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.055648110806941986,
"rewards/margins": 0.10049048811197281,
"rewards/margins_max": 0.13645590841770172,
"rewards/margins_min": 0.06452508270740509,
"rewards/margins_std": 0.050862766802310944,
"rewards/rejected": -0.156138613820076,
"step": 1900
},
{
"epoch": 0.79,
"grad_norm": 0.4765625,
"learning_rate": 6.656265391383834e-08,
"logits/chosen": -0.18140563368797302,
"logits/rejected": 0.11698174476623535,
"logps/chosen": -219.529541015625,
"logps/rejected": -219.55859375,
"loss": 0.6417,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.053447991609573364,
"rewards/margins": 0.11373796314001083,
"rewards/margins_max": 0.15492601692676544,
"rewards/margins_min": 0.07254988700151443,
"rewards/margins_std": 0.058248721063137054,
"rewards/rejected": -0.1671859472990036,
"step": 1910
},
{
"epoch": 0.79,
"grad_norm": 0.486328125,
"learning_rate": 6.414172117945363e-08,
"logits/chosen": -0.12691111862659454,
"logits/rejected": -0.011705311946570873,
"logps/chosen": -214.7986297607422,
"logps/rejected": -251.7666015625,
"loss": 0.6415,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.04762512072920799,
"rewards/margins": 0.10763095319271088,
"rewards/margins_max": 0.15741634368896484,
"rewards/margins_min": 0.05784556269645691,
"rewards/margins_std": 0.07040717452764511,
"rewards/rejected": -0.15525606274604797,
"step": 1920
},
{
"epoch": 0.79,
"grad_norm": 0.48828125,
"learning_rate": 6.175913943885275e-08,
"logits/chosen": -0.21015481650829315,
"logits/rejected": 0.04729234799742699,
"logps/chosen": -219.0660400390625,
"logps/rejected": -230.38601684570312,
"loss": 0.6465,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.06361646950244904,
"rewards/margins": 0.0891466736793518,
"rewards/margins_max": 0.1280238926410675,
"rewards/margins_min": 0.05026944726705551,
"rewards/margins_std": 0.05498070642352104,
"rewards/rejected": -0.15276315808296204,
"step": 1930
},
{
"epoch": 0.8,
"grad_norm": 0.42578125,
"learning_rate": 5.941540032672695e-08,
"logits/chosen": -0.1390005648136139,
"logits/rejected": 0.1156986802816391,
"logps/chosen": -203.31509399414062,
"logps/rejected": -213.8002166748047,
"loss": 0.645,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.05469059199094772,
"rewards/margins": 0.09125205874443054,
"rewards/margins_max": 0.12813611328601837,
"rewards/margins_min": 0.05436800792813301,
"rewards/margins_std": 0.05216192454099655,
"rewards/rejected": -0.14594264328479767,
"step": 1940
},
{
"epoch": 0.8,
"grad_norm": 0.52734375,
"learning_rate": 5.711098746277135e-08,
"logits/chosen": -0.18979230523109436,
"logits/rejected": 0.02134443074464798,
"logps/chosen": -241.7399444580078,
"logps/rejected": -279.031005859375,
"loss": 0.6456,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.06025727838277817,
"rewards/margins": 0.10567016899585724,
"rewards/margins_max": 0.14187535643577576,
"rewards/margins_min": 0.06946493685245514,
"rewards/margins_std": 0.05120190232992172,
"rewards/rejected": -0.1659274399280548,
"step": 1950
},
{
"epoch": 0.81,
"grad_norm": 0.5078125,
"learning_rate": 5.484637635189185e-08,
"logits/chosen": -0.18471169471740723,
"logits/rejected": -0.013999777846038342,
"logps/chosen": -213.14492797851562,
"logps/rejected": -244.3299560546875,
"loss": 0.6453,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.06022089719772339,
"rewards/margins": 0.0926375538110733,
"rewards/margins_max": 0.14156688749790192,
"rewards/margins_min": 0.0437081977725029,
"rewards/margins_std": 0.06919653713703156,
"rewards/rejected": -0.1528584361076355,
"step": 1960
},
{
"epoch": 0.81,
"grad_norm": 0.470703125,
"learning_rate": 5.262203428608755e-08,
"logits/chosen": -0.1870705783367157,
"logits/rejected": 0.11638101190328598,
"logps/chosen": -220.24398803710938,
"logps/rejected": -216.3998260498047,
"loss": 0.6448,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.061370521783828735,
"rewards/margins": 0.10007043182849884,
"rewards/margins_max": 0.1472715139389038,
"rewards/margins_min": 0.05286933854222298,
"rewards/margins_std": 0.06675241887569427,
"rewards/rejected": -0.16144093871116638,
"step": 1970
},
{
"epoch": 0.81,
"grad_norm": 0.466796875,
"learning_rate": 5.0438420248026745e-08,
"logits/chosen": -0.12526465952396393,
"logits/rejected": 0.0398729033768177,
"logps/chosen": -209.6382293701172,
"logps/rejected": -229.21401977539062,
"loss": 0.6452,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.06217757612466812,
"rewards/margins": 0.09538298845291138,
"rewards/margins_max": 0.13602666556835175,
"rewards/margins_min": 0.054739318788051605,
"rewards/margins_std": 0.05747883766889572,
"rewards/rejected": -0.1575605720281601,
"step": 1980
},
{
"epoch": 0.82,
"grad_norm": 0.42578125,
"learning_rate": 4.829598481633812e-08,
"logits/chosen": -0.200607568025589,
"logits/rejected": 0.017670905217528343,
"logps/chosen": -221.0457000732422,
"logps/rejected": -239.06143188476562,
"loss": 0.6431,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.06344129145145416,
"rewards/margins": 0.10496222972869873,
"rewards/margins_max": 0.14306050539016724,
"rewards/margins_min": 0.06686393916606903,
"rewards/margins_std": 0.053879112005233765,
"rewards/rejected": -0.1684035062789917,
"step": 1990
},
{
"epoch": 0.82,
"grad_norm": 0.490234375,
"learning_rate": 4.619517007263596e-08,
"logits/chosen": -0.12676799297332764,
"logits/rejected": 0.16620242595672607,
"logps/chosen": -207.8087615966797,
"logps/rejected": -226.5636749267578,
"loss": 0.6455,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.04663520306348801,
"rewards/margins": 0.09131400287151337,
"rewards/margins_max": 0.13556495308876038,
"rewards/margins_min": 0.04706304520368576,
"rewards/margins_std": 0.06258030235767365,
"rewards/rejected": -0.13794919848442078,
"step": 2000
},
{
"epoch": 0.83,
"grad_norm": 0.431640625,
"learning_rate": 4.413640951029849e-08,
"logits/chosen": -0.19667062163352966,
"logits/rejected": 0.08986136317253113,
"logps/chosen": -210.84982299804688,
"logps/rejected": -226.7263641357422,
"loss": 0.6395,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.0630938857793808,
"rewards/margins": 0.11106900870800018,
"rewards/margins_max": 0.1641812026500702,
"rewards/margins_min": 0.05795680359005928,
"rewards/margins_std": 0.075111985206604,
"rewards/rejected": -0.17416289448738098,
"step": 2010
},
{
"epoch": 0.83,
"grad_norm": 0.396484375,
"learning_rate": 4.212012794501888e-08,
"logits/chosen": -0.1816846877336502,
"logits/rejected": 0.028480231761932373,
"logps/chosen": -219.585693359375,
"logps/rejected": -236.0741729736328,
"loss": 0.6452,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.058341920375823975,
"rewards/margins": 0.09123341739177704,
"rewards/margins_max": 0.13824895024299622,
"rewards/margins_min": 0.04421788826584816,
"rewards/margins_std": 0.06649000942707062,
"rewards/rejected": -0.1495753526687622,
"step": 2020
},
{
"epoch": 0.84,
"grad_norm": 0.494140625,
"learning_rate": 4.014674142714605e-08,
"logits/chosen": -0.22851577401161194,
"logits/rejected": -0.011786893010139465,
"logps/chosen": -220.0074920654297,
"logps/rejected": -241.3643798828125,
"loss": 0.6461,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.058182261884212494,
"rewards/margins": 0.09729903191328049,
"rewards/margins_max": 0.13660015165805817,
"rewards/margins_min": 0.05799790471792221,
"rewards/margins_std": 0.05558018758893013,
"rewards/rejected": -0.15548129379749298,
"step": 2030
},
{
"epoch": 0.84,
"grad_norm": 0.423828125,
"learning_rate": 3.821665715583508e-08,
"logits/chosen": -0.16805607080459595,
"logits/rejected": 0.06839191168546677,
"logps/chosen": -205.09890747070312,
"logps/rejected": -233.2451934814453,
"loss": 0.6455,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.05809404328465462,
"rewards/margins": 0.09838706254959106,
"rewards/margins_max": 0.15165673196315765,
"rewards/margins_min": 0.04511738568544388,
"rewards/margins_std": 0.07533469051122665,
"rewards/rejected": -0.15648110210895538,
"step": 2040
},
{
"epoch": 0.84,
"grad_norm": 0.455078125,
"learning_rate": 3.633027339502318e-08,
"logits/chosen": -0.12912589311599731,
"logits/rejected": 0.10515755414962769,
"logps/chosen": -203.35064697265625,
"logps/rejected": -230.3003387451172,
"loss": 0.6433,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.06046416610479355,
"rewards/margins": 0.0981229841709137,
"rewards/margins_max": 0.1455102413892746,
"rewards/margins_min": 0.05073573440313339,
"rewards/margins_std": 0.06701570004224777,
"rewards/rejected": -0.15858715772628784,
"step": 2050
},
{
"epoch": 0.85,
"grad_norm": 0.474609375,
"learning_rate": 3.448797939124992e-08,
"logits/chosen": -0.18330267071723938,
"logits/rejected": 0.10012233257293701,
"logps/chosen": -236.8273162841797,
"logps/rejected": -239.4166259765625,
"loss": 0.6421,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.06569047272205353,
"rewards/margins": 0.10959680378437042,
"rewards/margins_max": 0.1525718718767166,
"rewards/margins_min": 0.06662173569202423,
"rewards/margins_std": 0.060775917023420334,
"rewards/rejected": -0.17528727650642395,
"step": 2060
},
{
"epoch": 0.85,
"grad_norm": 0.46875,
"learning_rate": 3.269015529333805e-08,
"logits/chosen": -0.14039239287376404,
"logits/rejected": 0.017039867118000984,
"logps/chosen": -220.995849609375,
"logps/rejected": -262.5998840332031,
"loss": 0.6412,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.06117891147732735,
"rewards/margins": 0.11040657758712769,
"rewards/margins_max": 0.16529114544391632,
"rewards/margins_min": 0.05552203208208084,
"rewards/margins_std": 0.07761847972869873,
"rewards/rejected": -0.17158548533916473,
"step": 2070
},
{
"epoch": 0.86,
"grad_norm": 0.474609375,
"learning_rate": 3.0937172073951525e-08,
"logits/chosen": -0.1851753294467926,
"logits/rejected": 0.10401411354541779,
"logps/chosen": -227.9702606201172,
"logps/rejected": -212.72158813476562,
"loss": 0.6477,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.056978046894073486,
"rewards/margins": 0.08857695013284683,
"rewards/margins_max": 0.12364324182271957,
"rewards/margins_min": 0.05351065471768379,
"rewards/margins_std": 0.049591224640607834,
"rewards/rejected": -0.14555500447750092,
"step": 2080
},
{
"epoch": 0.86,
"grad_norm": 0.53515625,
"learning_rate": 2.9229391453046814e-08,
"logits/chosen": -0.10515342652797699,
"logits/rejected": 0.07552903145551682,
"logps/chosen": -205.2806396484375,
"logps/rejected": -247.0736541748047,
"loss": 0.6459,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.05599958822131157,
"rewards/margins": 0.10005147755146027,
"rewards/margins_max": 0.14496475458145142,
"rewards/margins_min": 0.055138200521469116,
"rewards/margins_std": 0.06351695954799652,
"rewards/rejected": -0.15605106949806213,
"step": 2090
},
{
"epoch": 0.86,
"grad_norm": 0.54296875,
"learning_rate": 2.756716582323407e-08,
"logits/chosen": -0.18915608525276184,
"logits/rejected": 0.10052738338708878,
"logps/chosen": -214.53921508789062,
"logps/rejected": -217.12191772460938,
"loss": 0.6424,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.055889736860990524,
"rewards/margins": 0.10114419460296631,
"rewards/margins_max": 0.13807928562164307,
"rewards/margins_min": 0.06420911848545074,
"rewards/margins_std": 0.052234094589948654,
"rewards/rejected": -0.15703395009040833,
"step": 2100
},
{
"epoch": 0.87,
"grad_norm": 0.392578125,
"learning_rate": 2.5950838177062255e-08,
"logits/chosen": -0.18466925621032715,
"logits/rejected": -0.0018462598090991378,
"logps/chosen": -188.68309020996094,
"logps/rejected": -227.0560302734375,
"loss": 0.6371,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.04883568361401558,
"rewards/margins": 0.11194157600402832,
"rewards/margins_max": 0.151218444108963,
"rewards/margins_min": 0.07266470044851303,
"rewards/margins_std": 0.05554589629173279,
"rewards/rejected": -0.1607772707939148,
"step": 2110
},
{
"epoch": 0.87,
"grad_norm": 0.451171875,
"learning_rate": 2.438074203624424e-08,
"logits/chosen": -0.18436864018440247,
"logits/rejected": 0.04486365243792534,
"logps/chosen": -236.3048553466797,
"logps/rejected": -249.7228546142578,
"loss": 0.6426,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.05570978671312332,
"rewards/margins": 0.09718596935272217,
"rewards/margins_max": 0.13665816187858582,
"rewards/margins_min": 0.057713788002729416,
"rewards/margins_std": 0.055822111666202545,
"rewards/rejected": -0.1528957486152649,
"step": 2120
},
{
"epoch": 0.88,
"grad_norm": 0.54296875,
"learning_rate": 2.2857201382836282e-08,
"logits/chosen": -0.14316371083259583,
"logits/rejected": 0.12006983906030655,
"logps/chosen": -209.4042510986328,
"logps/rejected": -232.90353393554688,
"loss": 0.641,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.05837802216410637,
"rewards/margins": 0.1196541041135788,
"rewards/margins_max": 0.1695931851863861,
"rewards/margins_min": 0.06971500813961029,
"rewards/margins_std": 0.07062454521656036,
"rewards/rejected": -0.17803213000297546,
"step": 2130
},
{
"epoch": 0.88,
"grad_norm": 0.5,
"learning_rate": 2.138053059238573e-08,
"logits/chosen": -0.16864949464797974,
"logits/rejected": 0.02723456546664238,
"logps/chosen": -236.72793579101562,
"logps/rejected": -249.60092163085938,
"loss": 0.648,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.06903600692749023,
"rewards/margins": 0.08803755789995193,
"rewards/margins_max": 0.1324770301580429,
"rewards/margins_min": 0.043598074465990067,
"rewards/margins_std": 0.06284691393375397,
"rewards/rejected": -0.15707355737686157,
"step": 2140
},
{
"epoch": 0.88,
"grad_norm": 0.54296875,
"learning_rate": 1.9951034369060952e-08,
"logits/chosen": -0.16016066074371338,
"logits/rejected": 0.05830240249633789,
"logps/chosen": -234.17642211914062,
"logps/rejected": -261.7164001464844,
"loss": 0.6444,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.07695670425891876,
"rewards/margins": 0.10259735584259033,
"rewards/margins_max": 0.15329495072364807,
"rewards/margins_min": 0.05189976841211319,
"rewards/margins_std": 0.07169721275568008,
"rewards/rejected": -0.1795540750026703,
"step": 2150
},
{
"epoch": 0.89,
"grad_norm": 0.443359375,
"learning_rate": 1.8569007682777415e-08,
"logits/chosen": -0.12369527667760849,
"logits/rejected": 0.0878855437040329,
"logps/chosen": -222.7164306640625,
"logps/rejected": -236.99679565429688,
"loss": 0.6479,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.06640324741601944,
"rewards/margins": 0.09582678228616714,
"rewards/margins_max": 0.14717131853103638,
"rewards/margins_min": 0.04448222368955612,
"rewards/margins_std": 0.07261216640472412,
"rewards/rejected": -0.16223004460334778,
"step": 2160
},
{
"epoch": 0.89,
"grad_norm": 0.48046875,
"learning_rate": 1.7234735708331673e-08,
"logits/chosen": -0.0934344008564949,
"logits/rejected": 0.15260164439678192,
"logps/chosen": -228.0308837890625,
"logps/rejected": -225.59909057617188,
"loss": 0.6476,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.06557970494031906,
"rewards/margins": 0.09566263109445572,
"rewards/margins_max": 0.13359490036964417,
"rewards/margins_min": 0.05773034691810608,
"rewards/margins_std": 0.05364434793591499,
"rewards/rejected": -0.16124233603477478,
"step": 2170
},
{
"epoch": 0.9,
"grad_norm": 0.5625,
"learning_rate": 1.594849376655702e-08,
"logits/chosen": -0.15817022323608398,
"logits/rejected": 0.05617784336209297,
"logps/chosen": -214.3910675048828,
"logps/rejected": -237.107177734375,
"loss": 0.6453,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.05829305574297905,
"rewards/margins": 0.1066914051771164,
"rewards/margins_max": 0.1624380648136139,
"rewards/margins_min": 0.05094476789236069,
"rewards/margins_std": 0.07883764803409576,
"rewards/rejected": -0.16498446464538574,
"step": 2180
},
{
"epoch": 0.9,
"grad_norm": 0.5234375,
"learning_rate": 1.4710547267512253e-08,
"logits/chosen": -0.14244134724140167,
"logits/rejected": 0.0684979110956192,
"logps/chosen": -204.86105346679688,
"logps/rejected": -225.93289184570312,
"loss": 0.6479,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.05910477787256241,
"rewards/margins": 0.0981929823756218,
"rewards/margins_max": 0.15315920114517212,
"rewards/margins_min": 0.04322676360607147,
"rewards/margins_std": 0.07773397862911224,
"rewards/rejected": -0.1572977602481842,
"step": 2190
},
{
"epoch": 0.91,
"grad_norm": 0.52734375,
"learning_rate": 1.3521151655715602e-08,
"logits/chosen": -0.08284337818622589,
"logits/rejected": 0.15880750119686127,
"logps/chosen": -228.5080108642578,
"logps/rejected": -237.3908233642578,
"loss": 0.6435,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.0625772625207901,
"rewards/margins": 0.09644486010074615,
"rewards/margins_max": 0.14257869124412537,
"rewards/margins_min": 0.05031103640794754,
"rewards/margins_std": 0.06524308770895004,
"rewards/rejected": -0.15902213752269745,
"step": 2200
},
{
"epoch": 0.91,
"grad_norm": 0.44921875,
"learning_rate": 1.2380552357434932e-08,
"logits/chosen": -0.12101718038320541,
"logits/rejected": 0.08898299932479858,
"logps/chosen": -206.19943237304688,
"logps/rejected": -222.4971923828125,
"loss": 0.6423,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.0597076490521431,
"rewards/margins": 0.09617511183023453,
"rewards/margins_max": 0.13126114010810852,
"rewards/margins_min": 0.06108907610177994,
"rewards/margins_std": 0.049619145691394806,
"rewards/rejected": -0.15588276088237762,
"step": 2210
},
{
"epoch": 0.91,
"grad_norm": 0.46484375,
"learning_rate": 1.1288984730044998e-08,
"logits/chosen": -0.15863756835460663,
"logits/rejected": 0.09168653935194016,
"logps/chosen": -225.25711059570312,
"logps/rejected": -244.12844848632812,
"loss": 0.6429,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.05879504233598709,
"rewards/margins": 0.105413056910038,
"rewards/margins_max": 0.14556556940078735,
"rewards/margins_min": 0.06526056677103043,
"rewards/margins_std": 0.05678422003984451,
"rewards/rejected": -0.16420809924602509,
"step": 2220
},
{
"epoch": 0.92,
"grad_norm": 0.4765625,
"learning_rate": 1.0246674013462852e-08,
"logits/chosen": -0.1646089255809784,
"logits/rejected": 0.06766968220472336,
"logps/chosen": -235.84341430664062,
"logps/rejected": -237.8670196533203,
"loss": 0.6474,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.060948483645915985,
"rewards/margins": 0.09341399371623993,
"rewards/margins_max": 0.14046376943588257,
"rewards/margins_min": 0.046364206820726395,
"rewards/margins_std": 0.06653843820095062,
"rewards/rejected": -0.15436246991157532,
"step": 2230
},
{
"epoch": 0.92,
"grad_norm": 0.48828125,
"learning_rate": 9.253835283670381e-09,
"logits/chosen": -0.13539066910743713,
"logits/rejected": 0.1348244845867157,
"logps/chosen": -216.82565307617188,
"logps/rejected": -221.89682006835938,
"loss": 0.6441,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.04900694265961647,
"rewards/margins": 0.11065097898244858,
"rewards/margins_max": 0.16241362690925598,
"rewards/margins_min": 0.058888327330350876,
"rewards/margins_std": 0.07320345193147659,
"rewards/rejected": -0.15965792536735535,
"step": 2240
},
{
"epoch": 0.93,
"grad_norm": 0.423828125,
"learning_rate": 8.310673408334496e-09,
"logits/chosen": -0.1519135981798172,
"logits/rejected": 0.039234552532434464,
"logps/chosen": -198.13656616210938,
"logps/rejected": -219.90493774414062,
"loss": 0.646,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.05604385584592819,
"rewards/margins": 0.10299549251794815,
"rewards/margins_max": 0.13935771584510803,
"rewards/margins_min": 0.06663324683904648,
"rewards/margins_std": 0.05142395943403244,
"rewards/rejected": -0.15903934836387634,
"step": 2250
},
{
"epoch": 0.93,
"grad_norm": 0.458984375,
"learning_rate": 7.417383004533567e-09,
"logits/chosen": -0.18010476231575012,
"logits/rejected": 0.00459135789424181,
"logps/chosen": -200.7607421875,
"logps/rejected": -229.78955078125,
"loss": 0.6464,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.06125596910715103,
"rewards/margins": 0.08764808624982834,
"rewards/margins_max": 0.1255454272031784,
"rewards/margins_min": 0.04975075647234917,
"rewards/margins_std": 0.0535949282348156,
"rewards/rejected": -0.14890405535697937,
"step": 2260
},
{
"epoch": 0.93,
"grad_norm": 0.69921875,
"learning_rate": 6.574148398599183e-09,
"logits/chosen": -0.10086911916732788,
"logits/rejected": 0.08481906354427338,
"logps/chosen": -219.78976440429688,
"logps/rejected": -238.8533477783203,
"loss": 0.6423,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.057928215712308884,
"rewards/margins": 0.10362323373556137,
"rewards/margins_max": 0.15504048764705658,
"rewards/margins_min": 0.05220597982406616,
"rewards/margins_std": 0.07271497696638107,
"rewards/rejected": -0.16155146062374115,
"step": 2270
},
{
"epoch": 0.94,
"grad_norm": 0.458984375,
"learning_rate": 5.7811435880811e-09,
"logits/chosen": -0.17055755853652954,
"logits/rejected": 0.11661942303180695,
"logps/chosen": -214.5771942138672,
"logps/rejected": -227.61026000976562,
"loss": 0.6366,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.04704277589917183,
"rewards/margins": 0.14319387078285217,
"rewards/margins_max": 0.18986010551452637,
"rewards/margins_min": 0.09652762115001678,
"rewards/margins_std": 0.06599602103233337,
"rewards/rejected": -0.1902366429567337,
"step": 2280
},
{
"epoch": 0.94,
"grad_norm": 0.51953125,
"learning_rate": 5.03853220584391e-09,
"logits/chosen": -0.13923177123069763,
"logits/rejected": 0.1401730477809906,
"logps/chosen": -230.38186645507812,
"logps/rejected": -243.49148559570312,
"loss": 0.6444,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.06614328920841217,
"rewards/margins": 0.10860340297222137,
"rewards/margins_max": 0.15587784349918365,
"rewards/margins_min": 0.0613289400935173,
"rewards/margins_std": 0.06685616821050644,
"rewards/rejected": -0.17474667727947235,
"step": 2290
},
{
"epoch": 0.95,
"grad_norm": 0.4296875,
"learning_rate": 4.346467486301881e-09,
"logits/chosen": -0.1736827790737152,
"logits/rejected": 0.1112961396574974,
"logps/chosen": -211.4966278076172,
"logps/rejected": -222.78640747070312,
"loss": 0.6448,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.053413182497024536,
"rewards/margins": 0.09374302625656128,
"rewards/margins_max": 0.1378607153892517,
"rewards/margins_min": 0.04962532967329025,
"rewards/margins_std": 0.062391847372055054,
"rewards/rejected": -0.14715620875358582,
"step": 2300
},
{
"epoch": 0.95,
"grad_norm": 0.49609375,
"learning_rate": 3.7050922338e-09,
"logits/chosen": -0.1892717033624649,
"logits/rejected": 0.03548423945903778,
"logps/chosen": -221.7522735595703,
"logps/rejected": -260.614501953125,
"loss": 0.6469,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.06389190256595612,
"rewards/margins": 0.09883169084787369,
"rewards/margins_max": 0.14532563090324402,
"rewards/margins_min": 0.05233774706721306,
"rewards/margins_std": 0.06575236469507217,
"rewards/rejected": -0.162723571062088,
"step": 2310
},
{
"epoch": 0.95,
"grad_norm": 0.53515625,
"learning_rate": 3.1145387931467705e-09,
"logits/chosen": -0.16101527214050293,
"logits/rejected": 0.12721626460552216,
"logps/chosen": -215.1056365966797,
"logps/rejected": -217.00265502929688,
"loss": 0.6464,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.06273301690816879,
"rewards/margins": 0.09039122611284256,
"rewards/margins_max": 0.13275966048240662,
"rewards/margins_min": 0.04802277684211731,
"rewards/margins_std": 0.059918034821748734,
"rewards/rejected": -0.15312424302101135,
"step": 2320
},
{
"epoch": 0.96,
"grad_norm": 0.451171875,
"learning_rate": 2.5749290223055498e-09,
"logits/chosen": -0.0827338844537735,
"logits/rejected": 0.14059332013130188,
"logps/chosen": -216.60971069335938,
"logps/rejected": -236.35986328125,
"loss": 0.6445,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.061508674174547195,
"rewards/margins": 0.10081305354833603,
"rewards/margins_max": 0.13673627376556396,
"rewards/margins_min": 0.0648898258805275,
"rewards/margins_std": 0.05080310255289078,
"rewards/rejected": -0.16232173144817352,
"step": 2330
},
{
"epoch": 0.96,
"grad_norm": 0.51953125,
"learning_rate": 2.086374267249724e-09,
"logits/chosen": -0.16651371121406555,
"logits/rejected": 0.09923712909221649,
"logps/chosen": -231.68002319335938,
"logps/rejected": -244.4033203125,
"loss": 0.645,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.056234586983919144,
"rewards/margins": 0.10171985626220703,
"rewards/margins_max": 0.14425083994865417,
"rewards/margins_min": 0.05918886512517929,
"rewards/margins_std": 0.06014790013432503,
"rewards/rejected": -0.15795443952083588,
"step": 2340
},
{
"epoch": 0.97,
"grad_norm": 0.46484375,
"learning_rate": 1.6489753389869742e-09,
"logits/chosen": -0.1336033046245575,
"logits/rejected": 0.05479846149682999,
"logps/chosen": -219.4292449951172,
"logps/rejected": -253.1160125732422,
"loss": 0.6424,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.05078500509262085,
"rewards/margins": 0.11668237298727036,
"rewards/margins_max": 0.16264045238494873,
"rewards/margins_min": 0.07072430849075317,
"rewards/margins_std": 0.06499452143907547,
"rewards/rejected": -0.1674673855304718,
"step": 2350
},
{
"epoch": 0.97,
"grad_norm": 0.41015625,
"learning_rate": 1.262822492757415e-09,
"logits/chosen": -0.1401694118976593,
"logits/rejected": 0.09876145422458649,
"logps/chosen": -195.66046142578125,
"logps/rejected": -218.9545135498047,
"loss": 0.6463,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.05541166663169861,
"rewards/margins": 0.103818379342556,
"rewards/margins_max": 0.15159131586551666,
"rewards/margins_min": 0.05604543536901474,
"rewards/margins_std": 0.06756114214658737,
"rewards/rejected": -0.159230038523674,
"step": 2360
},
{
"epoch": 0.98,
"grad_norm": 0.484375,
"learning_rate": 9.279954094097709e-10,
"logits/chosen": -0.18476589024066925,
"logits/rejected": 0.04040500894188881,
"logps/chosen": -211.14932250976562,
"logps/rejected": -221.921875,
"loss": 0.6431,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.06176955625414848,
"rewards/margins": 0.09228489547967911,
"rewards/margins_max": 0.14073041081428528,
"rewards/margins_min": 0.04383937641978264,
"rewards/margins_std": 0.06851230561733246,
"rewards/rejected": -0.1540544331073761,
"step": 2370
},
{
"epoch": 0.98,
"grad_norm": 0.498046875,
"learning_rate": 6.445631789597228e-10,
"logits/chosen": -0.1372963935136795,
"logits/rejected": 0.2152913510799408,
"logps/chosen": -240.0201873779297,
"logps/rejected": -232.8347625732422,
"loss": 0.646,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.0573449544608593,
"rewards/margins": 0.09935126453638077,
"rewards/margins_max": 0.15012647211551666,
"rewards/margins_min": 0.04857606440782547,
"rewards/margins_std": 0.07180698215961456,
"rewards/rejected": -0.15669623017311096,
"step": 2380
},
{
"epoch": 0.98,
"grad_norm": 0.400390625,
"learning_rate": 4.1258428633339503e-10,
"logits/chosen": -0.11732598394155502,
"logits/rejected": 0.09575303643941879,
"logps/chosen": -200.75241088867188,
"logps/rejected": -224.1798553466797,
"loss": 0.6445,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.05652853846549988,
"rewards/margins": 0.11160199344158173,
"rewards/margins_max": 0.16687549650669098,
"rewards/margins_min": 0.056328482925891876,
"rewards/margins_std": 0.07816854864358902,
"rewards/rejected": -0.1681305170059204,
"step": 2390
},
{
"epoch": 0.99,
"grad_norm": 0.515625,
"learning_rate": 2.3210659929931432e-10,
"logits/chosen": -0.1350639909505844,
"logits/rejected": 0.05836183577775955,
"logps/chosen": -198.38735961914062,
"logps/rejected": -229.34182739257812,
"loss": 0.6426,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.052884865552186966,
"rewards/margins": 0.10049891471862793,
"rewards/margins_max": 0.15602007508277893,
"rewards/margins_min": 0.04497777670621872,
"rewards/margins_std": 0.07851874828338623,
"rewards/rejected": -0.1533837914466858,
"step": 2400
},
{
"epoch": 0.99,
"grad_norm": 0.41015625,
"learning_rate": 1.0316735859111636e-10,
"logits/chosen": -0.18329744040966034,
"logits/rejected": 0.12713107466697693,
"logps/chosen": -213.11398315429688,
"logps/rejected": -212.9873504638672,
"loss": 0.6424,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.056636691093444824,
"rewards/margins": 0.1115182638168335,
"rewards/margins_max": 0.17401501536369324,
"rewards/margins_min": 0.04902151972055435,
"rewards/margins_std": 0.088383749127388,
"rewards/rejected": -0.16815496981143951,
"step": 2410
},
{
"epoch": 1.0,
"grad_norm": 0.50390625,
"learning_rate": 2.5793170223026295e-11,
"logits/chosen": -0.09687203168869019,
"logits/rejected": 0.1417822688817978,
"logps/chosen": -213.21426391601562,
"logps/rejected": -236.4486541748047,
"loss": 0.6433,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.06115274503827095,
"rewards/margins": 0.10435257852077484,
"rewards/margins_max": 0.14675767719745636,
"rewards/margins_min": 0.06194749474525452,
"rewards/margins_std": 0.05996985360980034,
"rewards/rejected": -0.1655053198337555,
"step": 2420
},
{
"epoch": 1.0,
"grad_norm": 0.431640625,
"learning_rate": 0.0,
"logits/chosen": -0.09515249729156494,
"logits/rejected": 0.09782592952251434,
"logps/chosen": -215.6335906982422,
"logps/rejected": -256.457763671875,
"loss": 0.6454,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.053227148950099945,
"rewards/margins": 0.10610984265804291,
"rewards/margins_max": 0.14640632271766663,
"rewards/margins_min": 0.06581337004899979,
"rewards/margins_std": 0.05698782205581665,
"rewards/rejected": -0.15933698415756226,
"step": 2430
},
{
"epoch": 1.0,
"eval_logits/chosen": 0.7168996334075928,
"eval_logits/rejected": 0.8648675084114075,
"eval_logps/chosen": -339.363525390625,
"eval_logps/rejected": -325.4676208496094,
"eval_loss": 0.6905444264411926,
"eval_rewards/accuracies": 0.578000009059906,
"eval_rewards/chosen": -0.05384029448032379,
"eval_rewards/margins": 0.0059645208530128,
"eval_rewards/margins_max": 0.06618467718362808,
"eval_rewards/margins_min": -0.05284303426742554,
"eval_rewards/margins_std": 0.038610368967056274,
"eval_rewards/rejected": -0.05980480834841728,
"eval_runtime": 835.6041,
"eval_samples_per_second": 4.787,
"eval_steps_per_second": 0.299,
"step": 2430
},
{
"epoch": 1.0,
"step": 2430,
"total_flos": 0.0,
"train_loss": 0.6545025648894134,
"train_runtime": 24035.423,
"train_samples_per_second": 1.618,
"train_steps_per_second": 0.101
}
],
"logging_steps": 10,
"max_steps": 2430,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}