tulu2-7b-cost-UF-UI-judge13b-5e-7 / trainer_state.json
just1nseo's picture
Model save
50831b9 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 1724,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 0.43359375,
"learning_rate": 2.890173410404624e-09,
"logits/chosen": 0.1325806975364685,
"logits/rejected": 0.3077998757362366,
"logps/chosen": -239.35935974121094,
"logps/rejected": -304.581298828125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/margins_max": 0.0,
"rewards/margins_min": 0.0,
"rewards/margins_std": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.01,
"grad_norm": 0.416015625,
"learning_rate": 2.890173410404624e-08,
"logits/chosen": -0.010774746537208557,
"logits/rejected": 0.23452165722846985,
"logps/chosen": -243.3074493408203,
"logps/rejected": -304.1199035644531,
"loss": 0.6932,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.00028879166347905993,
"rewards/margins": 0.0006378353573381901,
"rewards/margins_max": 0.0028404404874891043,
"rewards/margins_min": -0.0015647696563974023,
"rewards/margins_std": 0.0031149541027843952,
"rewards/rejected": -0.00034904375206679106,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 0.400390625,
"learning_rate": 5.780346820809248e-08,
"logits/chosen": -0.05719061568379402,
"logits/rejected": 0.5148837566375732,
"logps/chosen": -272.7169494628906,
"logps/rejected": -216.58859252929688,
"loss": 0.6931,
"rewards/accuracies": 0.4000000059604645,
"rewards/chosen": -0.0008704366046003997,
"rewards/margins": 0.0001740378502290696,
"rewards/margins_max": 0.0022189407609403133,
"rewards/margins_min": -0.0018708650022745132,
"rewards/margins_std": 0.002891929354518652,
"rewards/rejected": -0.0010444745421409607,
"step": 20
},
{
"epoch": 0.02,
"grad_norm": 0.4921875,
"learning_rate": 8.670520231213872e-08,
"logits/chosen": 0.05507341027259827,
"logits/rejected": 0.5646872520446777,
"logps/chosen": -272.96728515625,
"logps/rejected": -252.10733032226562,
"loss": 0.6932,
"rewards/accuracies": 0.4000000059604645,
"rewards/chosen": -0.0014279346214607358,
"rewards/margins": -0.001033178297802806,
"rewards/margins_max": 0.002007028553634882,
"rewards/margins_min": -0.004073385149240494,
"rewards/margins_std": 0.00429950188845396,
"rewards/rejected": -0.00039475635276176035,
"step": 30
},
{
"epoch": 0.02,
"grad_norm": 0.447265625,
"learning_rate": 1.1560693641618496e-07,
"logits/chosen": -0.08530770242214203,
"logits/rejected": 0.37523841857910156,
"logps/chosen": -256.03692626953125,
"logps/rejected": -224.8648223876953,
"loss": 0.6932,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.0013576907804235816,
"rewards/margins": -0.0014004515251144767,
"rewards/margins_max": 0.0015217246254906058,
"rewards/margins_min": -0.004322628024965525,
"rewards/margins_std": 0.0041325814090669155,
"rewards/rejected": 4.276079198461957e-05,
"step": 40
},
{
"epoch": 0.03,
"grad_norm": 0.45703125,
"learning_rate": 1.445086705202312e-07,
"logits/chosen": 0.10976707935333252,
"logits/rejected": 0.40187758207321167,
"logps/chosen": -205.61318969726562,
"logps/rejected": -214.9802703857422,
"loss": 0.693,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.0007841205224394798,
"rewards/margins": 0.0018329259473830462,
"rewards/margins_max": 0.004336017183959484,
"rewards/margins_min": -0.0006701658712700009,
"rewards/margins_std": 0.0035399063490331173,
"rewards/rejected": -0.0010488051921129227,
"step": 50
},
{
"epoch": 0.03,
"grad_norm": 0.39453125,
"learning_rate": 1.7341040462427744e-07,
"logits/chosen": 0.2901094853878021,
"logits/rejected": 0.4794164299964905,
"logps/chosen": -207.44509887695312,
"logps/rejected": -231.39382934570312,
"loss": 0.693,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -0.001270442851819098,
"rewards/margins": -0.0007280521094799042,
"rewards/margins_max": 0.0019893264397978783,
"rewards/margins_min": -0.0034454308915883303,
"rewards/margins_std": 0.0038429535925388336,
"rewards/rejected": -0.0005423908005468547,
"step": 60
},
{
"epoch": 0.04,
"grad_norm": 0.435546875,
"learning_rate": 2.023121387283237e-07,
"logits/chosen": 0.035371266305446625,
"logits/rejected": 0.4755796492099762,
"logps/chosen": -259.833740234375,
"logps/rejected": -226.2167205810547,
"loss": 0.6929,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.0010710505302995443,
"rewards/margins": 0.0011786860413849354,
"rewards/margins_max": 0.004792899824678898,
"rewards/margins_min": -0.002435527741909027,
"rewards/margins_std": 0.005111270118504763,
"rewards/rejected": -0.0022497368045151234,
"step": 70
},
{
"epoch": 0.05,
"grad_norm": 0.4609375,
"learning_rate": 2.3121387283236991e-07,
"logits/chosen": 0.27303510904312134,
"logits/rejected": 0.7382463216781616,
"logps/chosen": -217.78671264648438,
"logps/rejected": -208.35910034179688,
"loss": 0.6928,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.2639263988821767e-05,
"rewards/margins": 0.0014770211419090629,
"rewards/margins_max": 0.0042491876520216465,
"rewards/margins_min": -0.0012951450189575553,
"rewards/margins_std": 0.003920434974133968,
"rewards/rejected": -0.0014996604295447469,
"step": 80
},
{
"epoch": 0.05,
"grad_norm": 0.6640625,
"learning_rate": 2.601156069364162e-07,
"logits/chosen": -0.20650863647460938,
"logits/rejected": 0.17405006289482117,
"logps/chosen": -226.12808227539062,
"logps/rejected": -233.56381225585938,
"loss": 0.692,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.000633719377219677,
"rewards/margins": 0.0017947215819731355,
"rewards/margins_max": 0.004501459188759327,
"rewards/margins_min": -0.0009120159666053951,
"rewards/margins_std": 0.0038279048167169094,
"rewards/rejected": -0.0011610020883381367,
"step": 90
},
{
"epoch": 0.06,
"grad_norm": 0.431640625,
"learning_rate": 2.890173410404624e-07,
"logits/chosen": -0.019260473549365997,
"logits/rejected": 0.5504380464553833,
"logps/chosen": -292.51995849609375,
"logps/rejected": -235.86843872070312,
"loss": 0.6919,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.001650218851864338,
"rewards/margins": 0.002649242291226983,
"rewards/margins_max": 0.005218566861003637,
"rewards/margins_min": 7.99179106252268e-05,
"rewards/margins_std": 0.0036335731856524944,
"rewards/rejected": -0.0009990233229473233,
"step": 100
},
{
"epoch": 0.06,
"grad_norm": 0.53125,
"learning_rate": 3.1791907514450865e-07,
"logits/chosen": -0.06840448081493378,
"logits/rejected": 0.6899427175521851,
"logps/chosen": -252.0308380126953,
"logps/rejected": -199.84799194335938,
"loss": 0.6918,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.0018273231107741594,
"rewards/margins": 0.00415054801851511,
"rewards/margins_max": 0.0076604606583714485,
"rewards/margins_min": 0.0006406344473361969,
"rewards/margins_std": 0.004963767249137163,
"rewards/rejected": -0.0023232249077409506,
"step": 110
},
{
"epoch": 0.07,
"grad_norm": 0.36328125,
"learning_rate": 3.468208092485549e-07,
"logits/chosen": 0.09203040599822998,
"logits/rejected": 0.5125548243522644,
"logps/chosen": -256.213623046875,
"logps/rejected": -232.49942016601562,
"loss": 0.6915,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.0007183876005001366,
"rewards/margins": 0.004233072511851788,
"rewards/margins_max": 0.007029411382973194,
"rewards/margins_min": 0.0014367332914844155,
"rewards/margins_std": 0.003954620566219091,
"rewards/rejected": -0.0035146852023899555,
"step": 120
},
{
"epoch": 0.08,
"grad_norm": 0.462890625,
"learning_rate": 3.757225433526011e-07,
"logits/chosen": -0.027632858604192734,
"logits/rejected": 0.39557844400405884,
"logps/chosen": -266.2771911621094,
"logps/rejected": -271.76116943359375,
"loss": 0.6907,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 0.002352924318984151,
"rewards/margins": 0.005208231043070555,
"rewards/margins_max": 0.008825947530567646,
"rewards/margins_min": 0.001590514904819429,
"rewards/margins_std": 0.005116222891956568,
"rewards/rejected": -0.00285530649125576,
"step": 130
},
{
"epoch": 0.08,
"grad_norm": 0.40625,
"learning_rate": 4.046242774566474e-07,
"logits/chosen": 0.06764040887355804,
"logits/rejected": 0.3966519236564636,
"logps/chosen": -178.83749389648438,
"logps/rejected": -188.39877319335938,
"loss": 0.6908,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.0029165446758270264,
"rewards/margins": 0.006306161172688007,
"rewards/margins_max": 0.009462257847189903,
"rewards/margins_min": 0.0031500644981861115,
"rewards/margins_std": 0.004463394172489643,
"rewards/rejected": -0.0033896160311996937,
"step": 140
},
{
"epoch": 0.09,
"grad_norm": 0.447265625,
"learning_rate": 4.3352601156069365e-07,
"logits/chosen": 0.011811649426817894,
"logits/rejected": 0.4984157979488373,
"logps/chosen": -268.1231994628906,
"logps/rejected": -223.78799438476562,
"loss": 0.6899,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.002369340742006898,
"rewards/margins": 0.006674068979918957,
"rewards/margins_max": 0.013764929957687855,
"rewards/margins_min": -0.0004167918232269585,
"rewards/margins_std": 0.010027991607785225,
"rewards/rejected": -0.0043047284707427025,
"step": 150
},
{
"epoch": 0.09,
"grad_norm": 0.322265625,
"learning_rate": 4.6242774566473983e-07,
"logits/chosen": -0.03828499838709831,
"logits/rejected": 0.3794795870780945,
"logps/chosen": -245.52865600585938,
"logps/rejected": -234.1727752685547,
"loss": 0.689,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 0.004552280530333519,
"rewards/margins": 0.008487861603498459,
"rewards/margins_max": 0.012918056920170784,
"rewards/margins_min": 0.004057666752487421,
"rewards/margins_std": 0.006265241652727127,
"rewards/rejected": -0.003935581538826227,
"step": 160
},
{
"epoch": 0.1,
"grad_norm": 0.49609375,
"learning_rate": 4.913294797687861e-07,
"logits/chosen": -0.0168992280960083,
"logits/rejected": 0.500325620174408,
"logps/chosen": -296.49517822265625,
"logps/rejected": -248.3328094482422,
"loss": 0.6887,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.003083079354837537,
"rewards/margins": 0.006065175868570805,
"rewards/margins_max": 0.011483820155262947,
"rewards/margins_min": 0.0006465300684794784,
"rewards/margins_std": 0.0076631223782896996,
"rewards/rejected": -0.002982096979394555,
"step": 170
},
{
"epoch": 0.1,
"grad_norm": 0.40625,
"learning_rate": 4.999748710138438e-07,
"logits/chosen": 0.14815935492515564,
"logits/rejected": 0.5510139465332031,
"logps/chosen": -233.9811553955078,
"logps/rejected": -228.5449676513672,
"loss": 0.688,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.003167560789734125,
"rewards/margins": 0.007796141318976879,
"rewards/margins_max": 0.012642833404242992,
"rewards/margins_min": 0.002949449699372053,
"rewards/margins_std": 0.006854257546365261,
"rewards/rejected": -0.004628580994904041,
"step": 180
},
{
"epoch": 0.11,
"grad_norm": 0.416015625,
"learning_rate": 4.998518024263461e-07,
"logits/chosen": 0.19040322303771973,
"logits/rejected": 0.6236617565155029,
"logps/chosen": -230.96762084960938,
"logps/rejected": -211.4745330810547,
"loss": 0.6871,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.006373309530317783,
"rewards/margins": 0.012960617430508137,
"rewards/margins_max": 0.01996336504817009,
"rewards/margins_min": 0.0059578740037977695,
"rewards/margins_std": 0.0099033759906888,
"rewards/rejected": -0.006587309297174215,
"step": 190
},
{
"epoch": 0.12,
"grad_norm": 0.416015625,
"learning_rate": 4.996262291366814e-07,
"logits/chosen": 0.054732900112867355,
"logits/rejected": 0.22424785792827606,
"logps/chosen": -210.0012664794922,
"logps/rejected": -233.76388549804688,
"loss": 0.6873,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.004412280861288309,
"rewards/margins": 0.011961949989199638,
"rewards/margins_max": 0.017657486721873283,
"rewards/margins_min": 0.006266415119171143,
"rewards/margins_std": 0.0080547034740448,
"rewards/rejected": -0.007549669593572617,
"step": 200
},
{
"epoch": 0.12,
"grad_norm": 0.498046875,
"learning_rate": 4.992982436890003e-07,
"logits/chosen": 0.09016792476177216,
"logits/rejected": 0.45956069231033325,
"logps/chosen": -226.3985595703125,
"logps/rejected": -221.092529296875,
"loss": 0.6868,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.005489318631589413,
"rewards/margins": 0.013238553889095783,
"rewards/margins_max": 0.018587926402688026,
"rewards/margins_min": 0.00788918323814869,
"rewards/margins_std": 0.007565152831375599,
"rewards/rejected": -0.007749234326183796,
"step": 210
},
{
"epoch": 0.13,
"grad_norm": 0.458984375,
"learning_rate": 4.988679806432711e-07,
"logits/chosen": -0.08951343595981598,
"logits/rejected": 0.46994414925575256,
"logps/chosen": -264.4379577636719,
"logps/rejected": -236.77346801757812,
"loss": 0.6853,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.007678179536014795,
"rewards/margins": 0.01784335821866989,
"rewards/margins_max": 0.025632936507463455,
"rewards/margins_min": 0.010053779929876328,
"rewards/margins_std": 0.011016124859452248,
"rewards/rejected": -0.010165175423026085,
"step": 220
},
{
"epoch": 0.13,
"grad_norm": 0.474609375,
"learning_rate": 4.983356165200751e-07,
"logits/chosen": 0.07358375936746597,
"logits/rejected": 0.617803692817688,
"logps/chosen": -276.56536865234375,
"logps/rejected": -237.3117218017578,
"loss": 0.6848,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.0074386284686625,
"rewards/margins": 0.01824963092803955,
"rewards/margins_max": 0.026552444323897362,
"rewards/margins_min": 0.00994681753218174,
"rewards/margins_std": 0.01174195110797882,
"rewards/rejected": -0.010811002925038338,
"step": 230
},
{
"epoch": 0.14,
"grad_norm": 0.4296875,
"learning_rate": 4.977013697281864e-07,
"logits/chosen": 0.23069170117378235,
"logits/rejected": 0.546830952167511,
"logps/chosen": -229.92764282226562,
"logps/rejected": -231.63357543945312,
"loss": 0.6848,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.005361995659768581,
"rewards/margins": 0.015256190672516823,
"rewards/margins_max": 0.022752556949853897,
"rewards/margins_min": 0.007759819272905588,
"rewards/margins_std": 0.010601467452943325,
"rewards/rejected": -0.009894194081425667,
"step": 240
},
{
"epoch": 0.15,
"grad_norm": 0.412109375,
"learning_rate": 4.969655004749673e-07,
"logits/chosen": 0.05646086856722832,
"logits/rejected": 0.3687281012535095,
"logps/chosen": -203.8467559814453,
"logps/rejected": -216.0234375,
"loss": 0.6846,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.002810864243656397,
"rewards/margins": 0.014029537327587605,
"rewards/margins_max": 0.019475888460874557,
"rewards/margins_min": 0.008583188988268375,
"rewards/margins_std": 0.007702300790697336,
"rewards/rejected": -0.011218673549592495,
"step": 250
},
{
"epoch": 0.15,
"grad_norm": 0.490234375,
"learning_rate": 4.961283106596155e-07,
"logits/chosen": 0.1512751430273056,
"logits/rejected": 0.5323320627212524,
"logps/chosen": -256.96673583984375,
"logps/rejected": -265.65509033203125,
"loss": 0.6829,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.011281570419669151,
"rewards/margins": 0.0202823244035244,
"rewards/margins_max": 0.02979358099400997,
"rewards/margins_min": 0.010771063156425953,
"rewards/margins_std": 0.013450953178107738,
"rewards/rejected": -0.009000752121210098,
"step": 260
},
{
"epoch": 0.16,
"grad_norm": 0.447265625,
"learning_rate": 4.951901437493054e-07,
"logits/chosen": 0.08749596029520035,
"logits/rejected": 0.47565847635269165,
"logps/chosen": -252.97323608398438,
"logps/rejected": -220.1329803466797,
"loss": 0.6826,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.005718126427382231,
"rewards/margins": 0.019988398998975754,
"rewards/margins_max": 0.025959456339478493,
"rewards/margins_min": 0.014017338864505291,
"rewards/margins_std": 0.008444352075457573,
"rewards/rejected": -0.014270270243287086,
"step": 270
},
{
"epoch": 0.16,
"grad_norm": 0.453125,
"learning_rate": 4.941513846382779e-07,
"logits/chosen": 0.31170374155044556,
"logits/rejected": 0.6478020548820496,
"logps/chosen": -207.89794921875,
"logps/rejected": -225.51791381835938,
"loss": 0.6828,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.010051739402115345,
"rewards/margins": 0.019436318427324295,
"rewards/margins_max": 0.025176430121064186,
"rewards/margins_min": 0.013696206733584404,
"rewards/margins_std": 0.008117742836475372,
"rewards/rejected": -0.009384581819176674,
"step": 280
},
{
"epoch": 0.17,
"grad_norm": 0.431640625,
"learning_rate": 4.930124594899313e-07,
"logits/chosen": 0.14136287569999695,
"logits/rejected": 0.5530031323432922,
"logps/chosen": -244.9897918701172,
"logps/rejected": -244.90457153320312,
"loss": 0.6814,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.0166664756834507,
"rewards/margins": 0.02829556167125702,
"rewards/margins_max": 0.037106942385435104,
"rewards/margins_min": 0.019484177231788635,
"rewards/margins_std": 0.012461178004741669,
"rewards/rejected": -0.011629085056483746,
"step": 290
},
{
"epoch": 0.17,
"grad_norm": 0.494140625,
"learning_rate": 4.917738355619842e-07,
"logits/chosen": 0.2040259838104248,
"logits/rejected": 0.6138412356376648,
"logps/chosen": -193.21507263183594,
"logps/rejected": -194.8699188232422,
"loss": 0.6796,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.012191513553261757,
"rewards/margins": 0.026244569569826126,
"rewards/margins_max": 0.036748819053173065,
"rewards/margins_min": 0.015740320086479187,
"rewards/margins_std": 0.014855247922241688,
"rewards/rejected": -0.01405305415391922,
"step": 300
},
{
"epoch": 0.18,
"grad_norm": 0.453125,
"learning_rate": 4.904360210147762e-07,
"logits/chosen": 0.1507195234298706,
"logits/rejected": 0.5720406174659729,
"logps/chosen": -242.0141143798828,
"logps/rejected": -216.76132202148438,
"loss": 0.6791,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.010296806693077087,
"rewards/margins": 0.02473880909383297,
"rewards/margins_max": 0.036660365760326385,
"rewards/margins_min": 0.012817250564694405,
"rewards/margins_std": 0.0168596301227808,
"rewards/rejected": -0.014442001469433308,
"step": 310
},
{
"epoch": 0.19,
"grad_norm": 0.41796875,
"learning_rate": 4.8899956470279e-07,
"logits/chosen": -0.03488525375723839,
"logits/rejected": 0.40159520506858826,
"logps/chosen": -218.23812866210938,
"logps/rejected": -190.8876953125,
"loss": 0.679,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.014135973528027534,
"rewards/margins": 0.02363484725356102,
"rewards/margins_max": 0.036806877702474594,
"rewards/margins_min": 0.010462815873324871,
"rewards/margins_std": 0.018628064543008804,
"rewards/rejected": -0.00949887465685606,
"step": 320
},
{
"epoch": 0.19,
"grad_norm": 0.4375,
"learning_rate": 4.874650559494765e-07,
"logits/chosen": 0.10674601793289185,
"logits/rejected": 0.5667238831520081,
"logps/chosen": -242.5848388671875,
"logps/rejected": -212.60922241210938,
"loss": 0.6782,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.008991287089884281,
"rewards/margins": 0.02689727023243904,
"rewards/margins_max": 0.03854988515377045,
"rewards/margins_min": 0.015244655311107635,
"rewards/margins_std": 0.016479285433888435,
"rewards/rejected": -0.017905984073877335,
"step": 330
},
{
"epoch": 0.2,
"grad_norm": 0.357421875,
"learning_rate": 4.858331243054782e-07,
"logits/chosen": 0.09378918260335922,
"logits/rejected": 0.42793530225753784,
"logps/chosen": -282.80413818359375,
"logps/rejected": -245.1541748046875,
"loss": 0.6796,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.004886592272669077,
"rewards/margins": 0.021504424512386322,
"rewards/margins_max": 0.03542860597372055,
"rewards/margins_min": 0.007580241654068232,
"rewards/margins_std": 0.019691769033670425,
"rewards/rejected": -0.016617832705378532,
"step": 340
},
{
"epoch": 0.2,
"grad_norm": 0.486328125,
"learning_rate": 4.841044392903481e-07,
"logits/chosen": 0.1290682703256607,
"logits/rejected": 0.6047347784042358,
"logps/chosen": -232.40908813476562,
"logps/rejected": -181.57228088378906,
"loss": 0.6783,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.008800150826573372,
"rewards/margins": 0.028118547052145004,
"rewards/margins_max": 0.04057111591100693,
"rewards/margins_min": 0.015665989369153976,
"rewards/margins_std": 0.0176105834543705,
"rewards/rejected": -0.01931839995086193,
"step": 350
},
{
"epoch": 0.21,
"grad_norm": 0.435546875,
"learning_rate": 4.822797101178718e-07,
"logits/chosen": -0.10504484176635742,
"logits/rejected": 0.437595933675766,
"logps/chosen": -256.3827209472656,
"logps/rejected": -231.28836059570312,
"loss": 0.6777,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.014989467337727547,
"rewards/margins": 0.03444572165608406,
"rewards/margins_max": 0.04873298108577728,
"rewards/margins_min": 0.02015846036374569,
"rewards/margins_std": 0.020205235108733177,
"rewards/rejected": -0.019456254318356514,
"step": 360
},
{
"epoch": 0.21,
"grad_norm": 0.390625,
"learning_rate": 4.803596854051038e-07,
"logits/chosen": -0.0018104672199115157,
"logits/rejected": 0.5270112752914429,
"logps/chosen": -251.33740234375,
"logps/rejected": -203.73886108398438,
"loss": 0.6749,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.010898159816861153,
"rewards/margins": 0.02897489070892334,
"rewards/margins_max": 0.041702691465616226,
"rewards/margins_min": 0.016247089952230453,
"rewards/margins_std": 0.01799982599914074,
"rewards/rejected": -0.018076732754707336,
"step": 370
},
{
"epoch": 0.22,
"grad_norm": 0.3671875,
"learning_rate": 4.783451528652382e-07,
"logits/chosen": 0.03281222656369209,
"logits/rejected": 0.3939230740070343,
"logps/chosen": -203.0167694091797,
"logps/rejected": -197.302490234375,
"loss": 0.6775,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.01019463874399662,
"rewards/margins": 0.030594149604439735,
"rewards/margins_max": 0.041967082768678665,
"rewards/margins_min": 0.019221220165491104,
"rewards/margins_std": 0.01608375459909439,
"rewards/rejected": -0.020399510860443115,
"step": 380
},
{
"epoch": 0.23,
"grad_norm": 0.4140625,
"learning_rate": 4.7623693898443963e-07,
"logits/chosen": 0.06993720680475235,
"logits/rejected": 0.44206172227859497,
"logps/chosen": -185.37237548828125,
"logps/rejected": -187.4385986328125,
"loss": 0.6751,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.009011445567011833,
"rewards/margins": 0.03231946378946304,
"rewards/margins_max": 0.04668620228767395,
"rewards/margins_min": 0.017952727153897285,
"rewards/margins_std": 0.02031763456761837,
"rewards/rejected": -0.02330802008509636,
"step": 390
},
{
"epoch": 0.23,
"grad_norm": 0.44140625,
"learning_rate": 4.740359086827685e-07,
"logits/chosen": -0.0161175187677145,
"logits/rejected": 0.4163980484008789,
"logps/chosen": -239.71432495117188,
"logps/rejected": -241.2501678466797,
"loss": 0.6737,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.018473349511623383,
"rewards/margins": 0.04534245282411575,
"rewards/margins_max": 0.06162145733833313,
"rewards/margins_min": 0.02906343713402748,
"rewards/margins_std": 0.0230219978839159,
"rewards/rejected": -0.026869099587202072,
"step": 400
},
{
"epoch": 0.24,
"grad_norm": 0.359375,
"learning_rate": 4.7174296495933593e-07,
"logits/chosen": -0.04076371714472771,
"logits/rejected": 0.20715077221393585,
"logps/chosen": -188.3863525390625,
"logps/rejected": -203.01266479492188,
"loss": 0.6749,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.011351143009960651,
"rewards/margins": 0.03776105120778084,
"rewards/margins_max": 0.05341630056500435,
"rewards/margins_min": 0.022105801850557327,
"rewards/margins_std": 0.022139865905046463,
"rewards/rejected": -0.026409905403852463,
"step": 410
},
{
"epoch": 0.24,
"grad_norm": 0.478515625,
"learning_rate": 4.6935904852183805e-07,
"logits/chosen": 0.29291218519210815,
"logits/rejected": 0.5505505800247192,
"logps/chosen": -203.9456024169922,
"logps/rejected": -217.8910369873047,
"loss": 0.6712,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.012085825204849243,
"rewards/margins": 0.038635291159152985,
"rewards/margins_max": 0.059398896992206573,
"rewards/margins_min": 0.017871689051389694,
"rewards/margins_std": 0.029364168643951416,
"rewards/rejected": -0.02654946781694889,
"step": 420
},
{
"epoch": 0.25,
"grad_norm": 0.431640625,
"learning_rate": 4.6688513740061965e-07,
"logits/chosen": 0.12483358383178711,
"logits/rejected": 0.46587473154067993,
"logps/chosen": -264.0867004394531,
"logps/rejected": -292.27685546875,
"loss": 0.6731,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.019537176936864853,
"rewards/margins": 0.040542975068092346,
"rewards/margins_max": 0.05839340761303902,
"rewards/margins_min": 0.022692536935210228,
"rewards/margins_std": 0.02524433098733425,
"rewards/rejected": -0.021005798131227493,
"step": 430
},
{
"epoch": 0.26,
"grad_norm": 0.4296875,
"learning_rate": 4.6432224654742475e-07,
"logits/chosen": -0.0027520388830453157,
"logits/rejected": 0.48325324058532715,
"logps/chosen": -231.2857208251953,
"logps/rejected": -221.3975372314453,
"loss": 0.6719,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.017787110060453415,
"rewards/margins": 0.04569714143872261,
"rewards/margins_max": 0.06507585942745209,
"rewards/margins_min": 0.026318421587347984,
"rewards/margins_std": 0.027405640110373497,
"rewards/rejected": -0.027910029515624046,
"step": 440
},
{
"epoch": 0.26,
"grad_norm": 0.4375,
"learning_rate": 4.616714274190011e-07,
"logits/chosen": 0.3332589566707611,
"logits/rejected": 0.5584608316421509,
"logps/chosen": -211.74325561523438,
"logps/rejected": -225.31689453125,
"loss": 0.6705,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.010198825970292091,
"rewards/margins": 0.04217001795768738,
"rewards/margins_max": 0.0582113042473793,
"rewards/margins_min": 0.026128727942705154,
"rewards/margins_std": 0.022685810923576355,
"rewards/rejected": -0.031971193850040436,
"step": 450
},
{
"epoch": 0.27,
"grad_norm": 0.435546875,
"learning_rate": 4.589337675457273e-07,
"logits/chosen": 0.10014849901199341,
"logits/rejected": 0.564907431602478,
"logps/chosen": -217.19985961914062,
"logps/rejected": -214.29440307617188,
"loss": 0.6713,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.018607165664434433,
"rewards/margins": 0.05433148890733719,
"rewards/margins_max": 0.07488565146923065,
"rewards/margins_min": 0.033777330070734024,
"rewards/margins_std": 0.02906796894967556,
"rewards/rejected": -0.03572431951761246,
"step": 460
},
{
"epoch": 0.27,
"grad_norm": 0.4609375,
"learning_rate": 4.5611039008544007e-07,
"logits/chosen": 0.13153567910194397,
"logits/rejected": 0.652635931968689,
"logps/chosen": -261.8456726074219,
"logps/rejected": -231.66531372070312,
"loss": 0.671,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.013766567222774029,
"rewards/margins": 0.04572372883558273,
"rewards/margins_max": 0.06320376694202423,
"rewards/margins_min": 0.028243690729141235,
"rewards/margins_std": 0.024720508605241776,
"rewards/rejected": -0.03195716068148613,
"step": 470
},
{
"epoch": 0.28,
"grad_norm": 0.419921875,
"learning_rate": 4.532024533626457e-07,
"logits/chosen": 0.0050893365405499935,
"logits/rejected": 0.3075583577156067,
"logps/chosen": -214.87033081054688,
"logps/rejected": -231.591064453125,
"loss": 0.6694,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.012458743527531624,
"rewards/margins": 0.046287618577480316,
"rewards/margins_max": 0.06574501842260361,
"rewards/margins_min": 0.026830215007066727,
"rewards/margins_std": 0.02751692570745945,
"rewards/rejected": -0.03382887691259384,
"step": 480
},
{
"epoch": 0.28,
"grad_norm": 0.435546875,
"learning_rate": 4.502111503933032e-07,
"logits/chosen": 0.16573339700698853,
"logits/rejected": 0.5059231519699097,
"logps/chosen": -214.00900268554688,
"logps/rejected": -226.75070190429688,
"loss": 0.6705,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.011546745896339417,
"rewards/margins": 0.03893359750509262,
"rewards/margins_max": 0.0571872778236866,
"rewards/margins_min": 0.020679913461208344,
"rewards/margins_std": 0.0258146021515131,
"rewards/rejected": -0.027386849746108055,
"step": 490
},
{
"epoch": 0.29,
"grad_norm": 0.42578125,
"learning_rate": 4.471377083953753e-07,
"logits/chosen": 0.19767063856124878,
"logits/rejected": 0.6161295175552368,
"logps/chosen": -211.5915985107422,
"logps/rejected": -231.336669921875,
"loss": 0.6672,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.021602794528007507,
"rewards/margins": 0.05690021067857742,
"rewards/margins_max": 0.08022460341453552,
"rewards/margins_min": 0.03357581049203873,
"rewards/margins_std": 0.032985687255859375,
"rewards/rejected": -0.03529741242527962,
"step": 500
},
{
"epoch": 0.3,
"grad_norm": 0.4609375,
"learning_rate": 4.4398338828534766e-07,
"logits/chosen": 0.051334965974092484,
"logits/rejected": 0.5114815831184387,
"logps/chosen": -252.36349487304688,
"logps/rejected": -253.6934051513672,
"loss": 0.67,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.021400339901447296,
"rewards/margins": 0.05237139016389847,
"rewards/margins_max": 0.07569600641727448,
"rewards/margins_min": 0.029046764597296715,
"rewards/margins_std": 0.03298599272966385,
"rewards/rejected": -0.030971046537160873,
"step": 510
},
{
"epoch": 0.3,
"grad_norm": 0.40234375,
"learning_rate": 4.407494841609224e-07,
"logits/chosen": 0.16097505390644073,
"logits/rejected": 0.503351092338562,
"logps/chosen": -187.7499542236328,
"logps/rejected": -182.64669799804688,
"loss": 0.6691,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.015485493466258049,
"rewards/margins": 0.039487432688474655,
"rewards/margins_max": 0.0597788468003273,
"rewards/margins_min": 0.019196024164557457,
"rewards/margins_std": 0.028696388006210327,
"rewards/rejected": -0.024001937359571457,
"step": 520
},
{
"epoch": 0.31,
"grad_norm": 0.462890625,
"learning_rate": 4.374373227700993e-07,
"logits/chosen": 0.03560265153646469,
"logits/rejected": 0.5799299478530884,
"logps/chosen": -273.8843688964844,
"logps/rejected": -234.033935546875,
"loss": 0.6673,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.007162511348724365,
"rewards/margins": 0.0483052022755146,
"rewards/margins_max": 0.06804867088794708,
"rewards/margins_min": 0.028561726212501526,
"rewards/margins_std": 0.027921488508582115,
"rewards/rejected": -0.04114269092679024,
"step": 530
},
{
"epoch": 0.31,
"grad_norm": 0.408203125,
"learning_rate": 4.340482629668615e-07,
"logits/chosen": 0.027306120842695236,
"logits/rejected": 0.671806812286377,
"logps/chosen": -259.85015869140625,
"logps/rejected": -201.55807495117188,
"loss": 0.6673,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.02854643389582634,
"rewards/margins": 0.0538957342505455,
"rewards/margins_max": 0.0864059180021286,
"rewards/margins_min": 0.0213855542242527,
"rewards/margins_std": 0.045976340770721436,
"rewards/rejected": -0.025349300354719162,
"step": 540
},
{
"epoch": 0.32,
"grad_norm": 0.3515625,
"learning_rate": 4.30583695153689e-07,
"logits/chosen": 0.04380347207188606,
"logits/rejected": 0.4509994089603424,
"logps/chosen": -273.69775390625,
"logps/rejected": -259.96966552734375,
"loss": 0.6693,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.022089816629886627,
"rewards/margins": 0.056071024388074875,
"rewards/margins_max": 0.08100839704275131,
"rewards/margins_min": 0.031133651733398438,
"rewards/margins_std": 0.035266775637865067,
"rewards/rejected": -0.033981211483478546,
"step": 550
},
{
"epoch": 0.32,
"grad_norm": 0.4140625,
"learning_rate": 4.2704504071112986e-07,
"logits/chosen": 0.10579466819763184,
"logits/rejected": 0.5407041311264038,
"logps/chosen": -240.98483276367188,
"logps/rejected": -211.9040985107422,
"loss": 0.6687,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.017832906916737556,
"rewards/margins": 0.05916459485888481,
"rewards/margins_max": 0.08200596272945404,
"rewards/margins_min": 0.036323241889476776,
"rewards/margins_std": 0.03230256214737892,
"rewards/rejected": -0.041331697255373,
"step": 560
},
{
"epoch": 0.33,
"grad_norm": 0.376953125,
"learning_rate": 4.234337514146612e-07,
"logits/chosen": 0.11410923302173615,
"logits/rejected": 0.6912606954574585,
"logps/chosen": -251.16793823242188,
"logps/rejected": -229.26553344726562,
"loss": 0.6663,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.019808156415820122,
"rewards/margins": 0.05665863677859306,
"rewards/margins_max": 0.08191566169261932,
"rewards/margins_min": 0.0314016118645668,
"rewards/margins_std": 0.03571882098913193,
"rewards/rejected": -0.036850474774837494,
"step": 570
},
{
"epoch": 0.34,
"grad_norm": 0.357421875,
"learning_rate": 4.197513088390813e-07,
"logits/chosen": -0.013543277978897095,
"logits/rejected": 0.37492939829826355,
"logps/chosen": -232.13333129882812,
"logps/rejected": -223.6721954345703,
"loss": 0.6657,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.014923980459570885,
"rewards/margins": 0.05013802647590637,
"rewards/margins_max": 0.07493571937084198,
"rewards/margins_min": 0.025340333580970764,
"rewards/margins_std": 0.03506923094391823,
"rewards/rejected": -0.03521404415369034,
"step": 580
},
{
"epoch": 0.34,
"grad_norm": 0.51171875,
"learning_rate": 4.1599922375067554e-07,
"logits/chosen": -0.03167729452252388,
"logits/rejected": 0.535004734992981,
"logps/chosen": -325.4375915527344,
"logps/rejected": -253.494873046875,
"loss": 0.6668,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.01660420373082161,
"rewards/margins": 0.059089016169309616,
"rewards/margins_max": 0.08827444911003113,
"rewards/margins_min": 0.029903585091233253,
"rewards/margins_std": 0.041274432092905045,
"rewards/rejected": -0.04248481243848801,
"step": 590
},
{
"epoch": 0.35,
"grad_norm": 0.380859375,
"learning_rate": 4.121790354874065e-07,
"logits/chosen": 0.05303360894322395,
"logits/rejected": 0.40770038962364197,
"logps/chosen": -202.06549072265625,
"logps/rejected": -214.628173828125,
"loss": 0.6649,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.005082354880869389,
"rewards/margins": 0.05396551638841629,
"rewards/margins_max": 0.07737747579813004,
"rewards/margins_min": 0.03055354580283165,
"rewards/margins_std": 0.03310951590538025,
"rewards/rejected": -0.04888315126299858,
"step": 600
},
{
"epoch": 0.35,
"grad_norm": 0.369140625,
"learning_rate": 4.082923113273822e-07,
"logits/chosen": 0.11870566755533218,
"logits/rejected": 0.464911550283432,
"logps/chosen": -231.35336303710938,
"logps/rejected": -234.9374237060547,
"loss": 0.6666,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.01106190960854292,
"rewards/margins": 0.0625653862953186,
"rewards/margins_max": 0.08917935192584991,
"rewards/margins_min": 0.03595142811536789,
"rewards/margins_std": 0.037637822329998016,
"rewards/rejected": -0.05150347948074341,
"step": 610
},
{
"epoch": 0.36,
"grad_norm": 0.443359375,
"learning_rate": 4.043406458458609e-07,
"logits/chosen": 0.09034819900989532,
"logits/rejected": 0.5873952507972717,
"logps/chosen": -265.25396728515625,
"logps/rejected": -214.2862548828125,
"loss": 0.6628,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0020419310312718153,
"rewards/margins": 0.06574475765228271,
"rewards/margins_max": 0.08710642158985138,
"rewards/margins_min": 0.04438310116529465,
"rewards/margins_std": 0.030209947377443314,
"rewards/rejected": -0.06370283663272858,
"step": 620
},
{
"epoch": 0.37,
"grad_norm": 0.4921875,
"learning_rate": 4.0032566026105806e-07,
"logits/chosen": 0.008516276255249977,
"logits/rejected": 0.6535265445709229,
"logps/chosen": -260.87298583984375,
"logps/rejected": -267.5401916503906,
"loss": 0.663,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.03661227226257324,
"rewards/margins": 0.07144369184970856,
"rewards/margins_max": 0.09834811091423035,
"rewards/margins_min": 0.044539276510477066,
"rewards/margins_std": 0.03804859146475792,
"rewards/rejected": -0.03483142331242561,
"step": 630
},
{
"epoch": 0.37,
"grad_norm": 0.474609375,
"learning_rate": 3.9624900176902184e-07,
"logits/chosen": 0.013054514303803444,
"logits/rejected": 0.3652392029762268,
"logps/chosen": -235.1199493408203,
"logps/rejected": -248.31411743164062,
"loss": 0.6656,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.014549237675964832,
"rewards/margins": 0.05561714246869087,
"rewards/margins_max": 0.08446307480335236,
"rewards/margins_min": 0.026771211996674538,
"rewards/margins_std": 0.040794309228658676,
"rewards/rejected": -0.041067905724048615,
"step": 640
},
{
"epoch": 0.38,
"grad_norm": 0.41015625,
"learning_rate": 3.921123428678511e-07,
"logits/chosen": 0.022506317123770714,
"logits/rejected": 0.6284270882606506,
"logps/chosen": -305.97674560546875,
"logps/rejected": -239.0786590576172,
"loss": 0.666,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.020474497228860855,
"rewards/margins": 0.06788565218448639,
"rewards/margins_max": 0.09115969389677048,
"rewards/margins_min": 0.044611603021621704,
"rewards/margins_std": 0.03291446715593338,
"rewards/rejected": -0.047411151230335236,
"step": 650
},
{
"epoch": 0.38,
"grad_norm": 0.478515625,
"learning_rate": 3.8791738067153314e-07,
"logits/chosen": 0.07077694684267044,
"logits/rejected": 0.5682755708694458,
"logps/chosen": -231.22695922851562,
"logps/rejected": -227.6490478515625,
"loss": 0.6622,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.03146480768918991,
"rewards/margins": 0.06544210761785507,
"rewards/margins_max": 0.0967545360326767,
"rewards/margins_min": 0.034129686653614044,
"rewards/margins_std": 0.044282447546720505,
"rewards/rejected": -0.03397729992866516,
"step": 660
},
{
"epoch": 0.39,
"grad_norm": 0.41796875,
"learning_rate": 3.83665836213682e-07,
"logits/chosen": 0.12142015993595123,
"logits/rejected": 0.5390751957893372,
"logps/chosen": -207.6114501953125,
"logps/rejected": -215.29849243164062,
"loss": 0.6636,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.011886438354849815,
"rewards/margins": 0.05365458130836487,
"rewards/margins_max": 0.07296213507652283,
"rewards/margins_min": 0.03434702754020691,
"rewards/margins_std": 0.027305006980895996,
"rewards/rejected": -0.0417681448161602,
"step": 670
},
{
"epoch": 0.39,
"grad_norm": 0.46875,
"learning_rate": 3.7935945374146417e-07,
"logits/chosen": 0.007061509881168604,
"logits/rejected": 0.3642507493495941,
"logps/chosen": -236.29788208007812,
"logps/rejected": -242.33544921875,
"loss": 0.6631,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.02563950978219509,
"rewards/margins": 0.05955478549003601,
"rewards/margins_max": 0.08539506047964096,
"rewards/margins_min": 0.03371449559926987,
"rewards/margins_std": 0.036543674767017365,
"rewards/rejected": -0.03391526639461517,
"step": 680
},
{
"epoch": 0.4,
"grad_norm": 0.5234375,
"learning_rate": 3.75e-07,
"logits/chosen": 0.08328167349100113,
"logits/rejected": 0.5527598857879639,
"logps/chosen": -239.66159057617188,
"logps/rejected": -235.6712188720703,
"loss": 0.6622,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.023291967809200287,
"rewards/margins": 0.07459411025047302,
"rewards/margins_max": 0.1087113469839096,
"rewards/margins_min": 0.04047687351703644,
"rewards/margins_std": 0.04824905842542648,
"rewards/rejected": -0.051302142441272736,
"step": 690
},
{
"epoch": 0.41,
"grad_norm": 0.40625,
"learning_rate": 3.7058926350753517e-07,
"logits/chosen": 0.04602205008268356,
"logits/rejected": 0.6276509165763855,
"logps/chosen": -247.14205932617188,
"logps/rejected": -208.6519775390625,
"loss": 0.6614,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.022474488243460655,
"rewards/margins": 0.07001164555549622,
"rewards/margins_max": 0.09704446792602539,
"rewards/margins_min": 0.04297882691025734,
"rewards/margins_std": 0.038230184465646744,
"rewards/rejected": -0.04753715917468071,
"step": 700
},
{
"epoch": 0.41,
"grad_norm": 0.4453125,
"learning_rate": 3.661290538216798e-07,
"logits/chosen": 0.291398823261261,
"logits/rejected": 0.6808168292045593,
"logps/chosen": -224.65090942382812,
"logps/rejected": -205.6571807861328,
"loss": 0.6632,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.0020084187854081392,
"rewards/margins": 0.05480460077524185,
"rewards/margins_max": 0.0770978108048439,
"rewards/margins_min": 0.0325113907456398,
"rewards/margins_std": 0.031527359038591385,
"rewards/rejected": -0.05279617756605148,
"step": 710
},
{
"epoch": 0.42,
"grad_norm": 0.4375,
"learning_rate": 3.616212007970159e-07,
"logits/chosen": 0.05395558476448059,
"logits/rejected": 0.29135066270828247,
"logps/chosen": -189.52139282226562,
"logps/rejected": -215.48080444335938,
"loss": 0.6633,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.008078034035861492,
"rewards/margins": 0.05178927257657051,
"rewards/margins_max": 0.0689278393983841,
"rewards/margins_min": 0.034650713205337524,
"rewards/margins_std": 0.024237588047981262,
"rewards/rejected": -0.043711237609386444,
"step": 720
},
{
"epoch": 0.42,
"grad_norm": 0.4609375,
"learning_rate": 3.5706755383437703e-07,
"logits/chosen": 0.09721295535564423,
"logits/rejected": 0.5186147689819336,
"logps/chosen": -302.69482421875,
"logps/rejected": -258.5033874511719,
"loss": 0.6646,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.020449183881282806,
"rewards/margins": 0.052381712943315506,
"rewards/margins_max": 0.07583948969841003,
"rewards/margins_min": 0.02892393246293068,
"rewards/margins_std": 0.0331743024289608,
"rewards/rejected": -0.0319325253367424,
"step": 730
},
{
"epoch": 0.43,
"grad_norm": 0.443359375,
"learning_rate": 3.5246998112210993e-07,
"logits/chosen": 0.13969309628009796,
"logits/rejected": 0.6499422192573547,
"logps/chosen": -262.07000732421875,
"logps/rejected": -253.33364868164062,
"loss": 0.6583,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.020577292889356613,
"rewards/margins": 0.08194496482610703,
"rewards/margins_max": 0.10924677550792694,
"rewards/margins_min": 0.05464313551783562,
"rewards/margins_std": 0.038610607385635376,
"rewards/rejected": -0.061367668211460114,
"step": 740
},
{
"epoch": 0.44,
"grad_norm": 0.39453125,
"learning_rate": 3.4783036886962736e-07,
"logits/chosen": 0.15751202404499054,
"logits/rejected": 0.583830714225769,
"logps/chosen": -232.4749298095703,
"logps/rejected": -251.43881225585938,
"loss": 0.6642,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.013448268175125122,
"rewards/margins": 0.06021388620138168,
"rewards/margins_max": 0.08211688697338104,
"rewards/margins_min": 0.03831087797880173,
"rewards/margins_std": 0.030975526198744774,
"rewards/rejected": -0.04676561802625656,
"step": 750
},
{
"epoch": 0.44,
"grad_norm": 0.451171875,
"learning_rate": 3.4315062053356847e-07,
"logits/chosen": -0.02616945281624794,
"logits/rejected": 0.5470731854438782,
"logps/chosen": -247.7039031982422,
"logps/rejected": -204.8767547607422,
"loss": 0.6635,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.02075277827680111,
"rewards/margins": 0.06478811800479889,
"rewards/margins_max": 0.09738490730524063,
"rewards/margins_min": 0.03219131752848625,
"rewards/margins_std": 0.04609883576631546,
"rewards/rejected": -0.04403533786535263,
"step": 760
},
{
"epoch": 0.45,
"grad_norm": 0.515625,
"learning_rate": 3.384326560368826e-07,
"logits/chosen": 0.040539853274822235,
"logits/rejected": 0.5014762878417969,
"logps/chosen": -249.2455596923828,
"logps/rejected": -242.47781372070312,
"loss": 0.662,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.02245604246854782,
"rewards/margins": 0.05939044803380966,
"rewards/margins_max": 0.08405659347772598,
"rewards/margins_min": 0.03472430631518364,
"rewards/margins_std": 0.03488319739699364,
"rewards/rejected": -0.03693440556526184,
"step": 770
},
{
"epoch": 0.45,
"grad_norm": 0.5,
"learning_rate": 3.3367841098115777e-07,
"logits/chosen": 0.05805939435958862,
"logits/rejected": 0.47922706604003906,
"logps/chosen": -286.8292541503906,
"logps/rejected": -230.5067138671875,
"loss": 0.6653,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.004244116134941578,
"rewards/margins": 0.0571456179022789,
"rewards/margins_max": 0.08360336720943451,
"rewards/margins_min": 0.030687877908349037,
"rewards/margins_std": 0.03741690143942833,
"rewards/rejected": -0.052901506423950195,
"step": 780
},
{
"epoch": 0.46,
"grad_norm": 0.40234375,
"learning_rate": 3.2888983585251713e-07,
"logits/chosen": 0.11492130905389786,
"logits/rejected": 0.3956727087497711,
"logps/chosen": -204.6266632080078,
"logps/rejected": -208.7443084716797,
"loss": 0.6606,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.011013984680175781,
"rewards/margins": 0.057107020169496536,
"rewards/margins_max": 0.07711775600910187,
"rewards/margins_min": 0.037096280604600906,
"rewards/margins_std": 0.02829946205019951,
"rewards/rejected": -0.046093035489320755,
"step": 790
},
{
"epoch": 0.46,
"grad_norm": 0.466796875,
"learning_rate": 3.240688952214085e-07,
"logits/chosen": -0.019520867615938187,
"logits/rejected": 0.34635210037231445,
"logps/chosen": -278.4693298339844,
"logps/rejected": -257.54986572265625,
"loss": 0.6607,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.020895112305879593,
"rewards/margins": 0.08000204712152481,
"rewards/margins_max": 0.1040647029876709,
"rewards/margins_min": 0.05593939870595932,
"rewards/margins_std": 0.034029725939035416,
"rewards/rejected": -0.05910693481564522,
"step": 800
},
{
"epoch": 0.47,
"grad_norm": 0.365234375,
"learning_rate": 3.192175669366156e-07,
"logits/chosen": 0.08061734586954117,
"logits/rejected": 0.440199077129364,
"logps/chosen": -216.41323852539062,
"logps/rejected": -240.26333618164062,
"loss": 0.6611,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.011639273725450039,
"rewards/margins": 0.061767347157001495,
"rewards/margins_max": 0.09113974124193192,
"rewards/margins_min": 0.03239493444561958,
"rewards/margins_std": 0.04153885692358017,
"rewards/rejected": -0.050128065049648285,
"step": 810
},
{
"epoch": 0.48,
"grad_norm": 0.435546875,
"learning_rate": 3.14337841313822e-07,
"logits/chosen": 0.2162504643201828,
"logits/rejected": 0.6251672506332397,
"logps/chosen": -249.9015655517578,
"logps/rejected": -198.54403686523438,
"loss": 0.6629,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.008589675650000572,
"rewards/margins": 0.05789928883314133,
"rewards/margins_max": 0.07874341309070587,
"rewards/margins_min": 0.03705517202615738,
"rewards/margins_std": 0.029478034004569054,
"rewards/rejected": -0.0493096187710762,
"step": 820
},
{
"epoch": 0.48,
"grad_norm": 0.443359375,
"learning_rate": 3.094317203190603e-07,
"logits/chosen": -0.0029448375571519136,
"logits/rejected": 0.4555005431175232,
"logps/chosen": -240.8060760498047,
"logps/rejected": -222.56246948242188,
"loss": 0.6561,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.022363275289535522,
"rewards/margins": 0.08168495446443558,
"rewards/margins_max": 0.11077789962291718,
"rewards/margins_min": 0.052591998130083084,
"rewards/margins_std": 0.04114364832639694,
"rewards/rejected": -0.059321679174900055,
"step": 830
},
{
"epoch": 0.49,
"grad_norm": 0.38671875,
"learning_rate": 3.045012167473814e-07,
"logits/chosen": 0.1808149516582489,
"logits/rejected": 0.5233570337295532,
"logps/chosen": -263.43255615234375,
"logps/rejected": -270.8913269042969,
"loss": 0.6616,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.02442259155213833,
"rewards/margins": 0.0733276903629303,
"rewards/margins_max": 0.104800745844841,
"rewards/margins_min": 0.041854631155729294,
"rewards/margins_std": 0.04450962692499161,
"rewards/rejected": -0.04890510439872742,
"step": 840
},
{
"epoch": 0.49,
"grad_norm": 0.4140625,
"learning_rate": 2.995483533970809e-07,
"logits/chosen": 0.2622363269329071,
"logits/rejected": 0.7754552960395813,
"logps/chosen": -228.362060546875,
"logps/rejected": -187.44383239746094,
"loss": 0.6618,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.011710538528859615,
"rewards/margins": 0.06277038902044296,
"rewards/margins_max": 0.08341649174690247,
"rewards/margins_min": 0.04212428256869316,
"rewards/margins_std": 0.029198000207543373,
"rewards/rejected": -0.05105985328555107,
"step": 850
},
{
"epoch": 0.5,
"grad_norm": 0.453125,
"learning_rate": 2.9457516223982235e-07,
"logits/chosen": 0.11260411888360977,
"logits/rejected": 0.47127556800842285,
"logps/chosen": -251.4638214111328,
"logps/rejected": -251.6316680908203,
"loss": 0.6609,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.009782608598470688,
"rewards/margins": 0.07295442372560501,
"rewards/margins_max": 0.10423107445240021,
"rewards/margins_min": 0.04167778044939041,
"rewards/margins_std": 0.044231854379177094,
"rewards/rejected": -0.06317181885242462,
"step": 860
},
{
"epoch": 0.5,
"grad_norm": 0.44921875,
"learning_rate": 2.895836835869962e-07,
"logits/chosen": 0.03560788184404373,
"logits/rejected": 0.4069921374320984,
"logps/chosen": -228.38876342773438,
"logps/rejected": -221.29638671875,
"loss": 0.662,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.009866083040833473,
"rewards/margins": 0.06033489108085632,
"rewards/margins_max": 0.09506522119045258,
"rewards/margins_min": 0.02560456469655037,
"rewards/margins_std": 0.0491160973906517,
"rewards/rejected": -0.050468809902668,
"step": 870
},
{
"epoch": 0.51,
"grad_norm": 0.48046875,
"learning_rate": 2.845759652526574e-07,
"logits/chosen": 0.07124204933643341,
"logits/rejected": 0.5192992687225342,
"logps/chosen": -234.10836791992188,
"logps/rejected": -189.55230712890625,
"loss": 0.66,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.01570773683488369,
"rewards/margins": 0.05234966799616814,
"rewards/margins_max": 0.07433562725782394,
"rewards/margins_min": 0.030363699421286583,
"rewards/margins_std": 0.031092852354049683,
"rewards/rejected": -0.036641925573349,
"step": 880
},
{
"epoch": 0.52,
"grad_norm": 0.427734375,
"learning_rate": 2.795540617133853e-07,
"logits/chosen": 0.24306873977184296,
"logits/rejected": 0.4881308674812317,
"logps/chosen": -233.5541534423828,
"logps/rejected": -271.29119873046875,
"loss": 0.6601,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.0062574222683906555,
"rewards/margins": 0.06694331020116806,
"rewards/margins_max": 0.0913429707288742,
"rewards/margins_min": 0.04254365712404251,
"rewards/margins_std": 0.03450632840394974,
"rewards/rejected": -0.060685895383358,
"step": 890
},
{
"epoch": 0.52,
"grad_norm": 0.40234375,
"learning_rate": 2.7452003326540995e-07,
"logits/chosen": 0.1885126382112503,
"logits/rejected": 0.6096329689025879,
"logps/chosen": -223.55380249023438,
"logps/rejected": -210.834716796875,
"loss": 0.6613,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.01565275713801384,
"rewards/margins": 0.0681251734495163,
"rewards/margins_max": 0.0929432287812233,
"rewards/margins_min": 0.043307114392519,
"rewards/margins_std": 0.035098038613796234,
"rewards/rejected": -0.05247241258621216,
"step": 900
},
{
"epoch": 0.53,
"grad_norm": 0.369140625,
"learning_rate": 2.694759451793508e-07,
"logits/chosen": 0.3056187033653259,
"logits/rejected": 0.5238193273544312,
"logps/chosen": -180.62220764160156,
"logps/rejected": -202.76705932617188,
"loss": 0.6628,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.005610722117125988,
"rewards/margins": 0.053133320063352585,
"rewards/margins_max": 0.0700041875243187,
"rewards/margins_min": 0.03626246377825737,
"rewards/margins_std": 0.023858997970819473,
"rewards/rejected": -0.04752260446548462,
"step": 910
},
{
"epoch": 0.53,
"grad_norm": 0.48828125,
"learning_rate": 2.644238668529146e-07,
"logits/chosen": 0.21234102547168732,
"logits/rejected": 0.48591142892837524,
"logps/chosen": -223.54971313476562,
"logps/rejected": -248.9346466064453,
"loss": 0.6607,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.017756493762135506,
"rewards/margins": 0.07771660387516022,
"rewards/margins_max": 0.11433382332324982,
"rewards/margins_min": 0.04109939560294151,
"rewards/margins_std": 0.05178455635905266,
"rewards/rejected": -0.05996011570096016,
"step": 920
},
{
"epoch": 0.54,
"grad_norm": 0.396484375,
"learning_rate": 2.593658709619001e-07,
"logits/chosen": 0.11299429088830948,
"logits/rejected": 0.5906545519828796,
"logps/chosen": -222.49609375,
"logps/rejected": -204.37290954589844,
"loss": 0.6601,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.02080368809401989,
"rewards/margins": 0.07051359862089157,
"rewards/margins_max": 0.10480418056249619,
"rewards/margins_min": 0.03622300922870636,
"rewards/margins_std": 0.048494212329387665,
"rewards/rejected": -0.04970990866422653,
"step": 930
},
{
"epoch": 0.55,
"grad_norm": 0.423828125,
"learning_rate": 2.5430403260985807e-07,
"logits/chosen": 0.11868913471698761,
"logits/rejected": 0.5508742332458496,
"logps/chosen": -212.3166961669922,
"logps/rejected": -219.1356658935547,
"loss": 0.6583,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.021529385820031166,
"rewards/margins": 0.06332559883594513,
"rewards/margins_max": 0.0937047004699707,
"rewards/margins_min": 0.03294649347662926,
"rewards/margins_std": 0.042962536215782166,
"rewards/rejected": -0.04179621487855911,
"step": 940
},
{
"epoch": 0.55,
"grad_norm": 0.470703125,
"learning_rate": 2.4924042847675503e-07,
"logits/chosen": 0.06126406043767929,
"logits/rejected": 0.5420705080032349,
"logps/chosen": -294.85845947265625,
"logps/rejected": -215.2727813720703,
"loss": 0.661,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.007373870350420475,
"rewards/margins": 0.05419896915555,
"rewards/margins_max": 0.08067617565393448,
"rewards/margins_min": 0.02772175334393978,
"rewards/margins_std": 0.03744443506002426,
"rewards/rejected": -0.0468250997364521,
"step": 950
},
{
"epoch": 0.56,
"grad_norm": 0.47265625,
"learning_rate": 2.441771359669902e-07,
"logits/chosen": 0.13893456757068634,
"logits/rejected": 0.4921324849128723,
"logps/chosen": -235.5193634033203,
"logps/rejected": -225.794189453125,
"loss": 0.6607,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.012106789276003838,
"rewards/margins": 0.06842382997274399,
"rewards/margins_max": 0.100715771317482,
"rewards/margins_min": 0.03613189607858658,
"rewards/margins_std": 0.045667704194784164,
"rewards/rejected": -0.056317038834095,
"step": 960
},
{
"epoch": 0.56,
"grad_norm": 0.443359375,
"learning_rate": 2.391162323571161e-07,
"logits/chosen": 0.07089251279830933,
"logits/rejected": 0.48170119524002075,
"logps/chosen": -230.9342498779297,
"logps/rejected": -226.3340301513672,
"loss": 0.6617,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.010878843255341053,
"rewards/margins": 0.06217268109321594,
"rewards/margins_max": 0.08883620798587799,
"rewards/margins_min": 0.03550915792584419,
"rewards/margins_std": 0.037707917392253876,
"rewards/rejected": -0.051293838769197464,
"step": 970
},
{
"epoch": 0.57,
"grad_norm": 0.42578125,
"learning_rate": 2.340597939436097e-07,
"logits/chosen": 0.03681742399930954,
"logits/rejected": 0.5955736041069031,
"logps/chosen": -234.0045166015625,
"logps/rejected": -216.2124786376953,
"loss": 0.6614,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.0253006462007761,
"rewards/margins": 0.06550078094005585,
"rewards/margins_max": 0.0953935831785202,
"rewards/margins_min": 0.035607993602752686,
"rewards/margins_std": 0.04227479174733162,
"rewards/rejected": -0.0402001328766346,
"step": 980
},
{
"epoch": 0.57,
"grad_norm": 0.42578125,
"learning_rate": 2.2900989519104796e-07,
"logits/chosen": 0.1664225161075592,
"logits/rejected": 0.4196982979774475,
"logps/chosen": -182.28829956054688,
"logps/rejected": -211.08865356445312,
"loss": 0.6625,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.0058049350045621395,
"rewards/margins": 0.06564933061599731,
"rewards/margins_max": 0.09529349207878113,
"rewards/margins_min": 0.036005161702632904,
"rewards/margins_std": 0.04192318022251129,
"rewards/rejected": -0.05984439328312874,
"step": 990
},
{
"epoch": 0.58,
"grad_norm": 0.4375,
"learning_rate": 2.2396860788103353e-07,
"logits/chosen": -0.04069889336824417,
"logits/rejected": 0.4455093741416931,
"logps/chosen": -208.73477172851562,
"logps/rejected": -199.85501098632812,
"loss": 0.6608,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.015201890841126442,
"rewards/margins": 0.08097913861274719,
"rewards/margins_max": 0.11325138807296753,
"rewards/margins_min": 0.04870688170194626,
"rewards/margins_std": 0.04563985764980316,
"rewards/rejected": -0.0657772421836853,
"step": 1000
},
{
"epoch": 0.59,
"grad_norm": 0.451171875,
"learning_rate": 2.1893800026222083e-07,
"logits/chosen": 0.24370861053466797,
"logits/rejected": 0.655241847038269,
"logps/chosen": -239.9451446533203,
"logps/rejected": -255.0171356201172,
"loss": 0.6612,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.01818387396633625,
"rewards/margins": 0.06645138561725616,
"rewards/margins_max": 0.0944729745388031,
"rewards/margins_min": 0.03842979669570923,
"rewards/margins_std": 0.039628516882658005,
"rewards/rejected": -0.048267509788274765,
"step": 1010
},
{
"epoch": 0.59,
"grad_norm": 0.376953125,
"learning_rate": 2.1392013620179336e-07,
"logits/chosen": -0.15726599097251892,
"logits/rejected": 0.27727076411247253,
"logps/chosen": -208.62881469726562,
"logps/rejected": -205.62429809570312,
"loss": 0.6593,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.012712801806628704,
"rewards/margins": 0.07130307704210281,
"rewards/margins_max": 0.09740529954433441,
"rewards/margins_min": 0.04520086199045181,
"rewards/margins_std": 0.03691411018371582,
"rewards/rejected": -0.05859028175473213,
"step": 1020
},
{
"epoch": 0.6,
"grad_norm": 0.373046875,
"learning_rate": 2.0891707433873623e-07,
"logits/chosen": 0.2577076256275177,
"logits/rejected": 0.5587279796600342,
"logps/chosen": -232.6507568359375,
"logps/rejected": -236.791015625,
"loss": 0.6608,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.007417677901685238,
"rewards/margins": 0.06323407590389252,
"rewards/margins_max": 0.09169165790081024,
"rewards/margins_min": 0.03477650135755539,
"rewards/margins_std": 0.040245089679956436,
"rewards/rejected": -0.055816400796175,
"step": 1030
},
{
"epoch": 0.6,
"grad_norm": 0.4609375,
"learning_rate": 2.039308672392556e-07,
"logits/chosen": 0.09692186862230301,
"logits/rejected": 0.5365327000617981,
"logps/chosen": -220.7172393798828,
"logps/rejected": -204.85055541992188,
"loss": 0.6567,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.016125962138175964,
"rewards/margins": 0.06824339926242828,
"rewards/margins_max": 0.10508973896503448,
"rewards/margins_min": 0.03139704838395119,
"rewards/margins_std": 0.052108604460954666,
"rewards/rejected": -0.05211742967367172,
"step": 1040
},
{
"epoch": 0.61,
"grad_norm": 0.36328125,
"learning_rate": 1.9896356055468845e-07,
"logits/chosen": 0.24312233924865723,
"logits/rejected": 0.5007752180099487,
"logps/chosen": -217.9171600341797,
"logps/rejected": -255.72866821289062,
"loss": 0.6605,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.015429767780005932,
"rewards/margins": 0.06471355259418488,
"rewards/margins_max": 0.09141434729099274,
"rewards/margins_min": 0.03801275044679642,
"rewards/margins_std": 0.03776064142584801,
"rewards/rejected": -0.04928378015756607,
"step": 1050
},
{
"epoch": 0.61,
"grad_norm": 0.359375,
"learning_rate": 1.940171921822496e-07,
"logits/chosen": 0.007707296404987574,
"logits/rejected": 0.3314017653465271,
"logps/chosen": -218.86654663085938,
"logps/rejected": -214.7074737548828,
"loss": 0.6625,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.010595353320240974,
"rewards/margins": 0.05604109913110733,
"rewards/margins_max": 0.08353577554225922,
"rewards/margins_min": 0.028546428307890892,
"rewards/margins_std": 0.03888333961367607,
"rewards/rejected": -0.045445747673511505,
"step": 1060
},
{
"epoch": 0.62,
"grad_norm": 0.421875,
"learning_rate": 1.8909379142895977e-07,
"logits/chosen": 0.08975931257009506,
"logits/rejected": 0.49662691354751587,
"logps/chosen": -243.73941040039062,
"logps/rejected": -218.0565643310547,
"loss": 0.6628,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.017341626808047295,
"rewards/margins": 0.06548301875591278,
"rewards/margins_max": 0.10044316947460175,
"rewards/margins_min": 0.030522847548127174,
"rewards/margins_std": 0.0494411401450634,
"rewards/rejected": -0.04814138263463974,
"step": 1070
},
{
"epoch": 0.63,
"grad_norm": 0.419921875,
"learning_rate": 1.841953781790983e-07,
"logits/chosen": 0.14877240359783173,
"logits/rejected": 0.32807669043540955,
"logps/chosen": -201.35398864746094,
"logps/rejected": -237.98403930664062,
"loss": 0.6614,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.011331291869282722,
"rewards/margins": 0.05169866234064102,
"rewards/margins_max": 0.08101126551628113,
"rewards/margins_min": 0.02238604798913002,
"rewards/margins_std": 0.041454292833805084,
"rewards/rejected": -0.04036737233400345,
"step": 1080
},
{
"epoch": 0.63,
"grad_norm": 0.5234375,
"learning_rate": 1.793239620655211e-07,
"logits/chosen": 0.10640072822570801,
"logits/rejected": 0.5526248812675476,
"logps/chosen": -198.35403442382812,
"logps/rejected": -196.8388671875,
"loss": 0.6604,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.0263301283121109,
"rewards/margins": 0.07441949844360352,
"rewards/margins_max": 0.1034015566110611,
"rewards/margins_min": 0.045437444001436234,
"rewards/margins_std": 0.040986817330121994,
"rewards/rejected": -0.04808937385678291,
"step": 1090
},
{
"epoch": 0.64,
"grad_norm": 0.390625,
"learning_rate": 1.744815416451847e-07,
"logits/chosen": 0.1694943606853485,
"logits/rejected": 0.6004883050918579,
"logps/chosen": -255.3223114013672,
"logps/rejected": -243.01541137695312,
"loss": 0.6625,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.01719365268945694,
"rewards/margins": 0.06180461123585701,
"rewards/margins_max": 0.08655586838722229,
"rewards/margins_min": 0.03705335780978203,
"rewards/margins_std": 0.03500355780124664,
"rewards/rejected": -0.04461096227169037,
"step": 1100
},
{
"epoch": 0.64,
"grad_norm": 0.4453125,
"learning_rate": 1.6967010357921446e-07,
"logits/chosen": 0.11355743557214737,
"logits/rejected": 0.4874862730503082,
"logps/chosen": -210.58767700195312,
"logps/rejected": -219.46701049804688,
"loss": 0.6618,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.005143271759152412,
"rewards/margins": 0.061519283801317215,
"rewards/margins_max": 0.0864943265914917,
"rewards/margins_min": 0.036544252187013626,
"rewards/margins_std": 0.035320036113262177,
"rewards/rejected": -0.05637601017951965,
"step": 1110
},
{
"epoch": 0.65,
"grad_norm": 0.439453125,
"learning_rate": 1.6489162181785255e-07,
"logits/chosen": 0.15795719623565674,
"logits/rejected": 0.5425394773483276,
"logps/chosen": -245.29562377929688,
"logps/rejected": -233.9000244140625,
"loss": 0.6602,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.021811651065945625,
"rewards/margins": 0.07487231492996216,
"rewards/margins_max": 0.09871380031108856,
"rewards/margins_min": 0.051030855625867844,
"rewards/margins_std": 0.03371693566441536,
"rewards/rejected": -0.05306067317724228,
"step": 1120
},
{
"epoch": 0.66,
"grad_norm": 0.361328125,
"learning_rate": 1.6014805679062183e-07,
"logits/chosen": -0.04248831048607826,
"logits/rejected": 0.36503881216049194,
"logps/chosen": -204.58383178710938,
"logps/rejected": -203.0003204345703,
"loss": 0.6607,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.020199652761220932,
"rewards/margins": 0.08475508540868759,
"rewards/margins_max": 0.11757893860340118,
"rewards/margins_min": 0.051931243389844894,
"rewards/margins_std": 0.046419933438301086,
"rewards/rejected": -0.06455543637275696,
"step": 1130
},
{
"epoch": 0.66,
"grad_norm": 0.482421875,
"learning_rate": 1.5544135460203527e-07,
"logits/chosen": 0.250204861164093,
"logits/rejected": 0.5448838472366333,
"logps/chosen": -212.43508911132812,
"logps/rejected": -247.50747680664062,
"loss": 0.6601,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.013406927697360516,
"rewards/margins": 0.07055126130580902,
"rewards/margins_max": 0.09891954064369202,
"rewards/margins_min": 0.04218297451734543,
"rewards/margins_std": 0.04011881351470947,
"rewards/rejected": -0.05714433267712593,
"step": 1140
},
{
"epoch": 0.67,
"grad_norm": 0.408203125,
"learning_rate": 1.5077344623318388e-07,
"logits/chosen": 0.08146306127309799,
"logits/rejected": 0.5028539896011353,
"logps/chosen": -244.5470733642578,
"logps/rejected": -203.9750213623047,
"loss": 0.6622,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.00543981185182929,
"rewards/margins": 0.0606420524418354,
"rewards/margins_max": 0.09149619191884995,
"rewards/margins_min": 0.029787922278046608,
"rewards/margins_std": 0.043634332716464996,
"rewards/rejected": -0.05520225316286087,
"step": 1150
},
{
"epoch": 0.67,
"grad_norm": 0.4921875,
"learning_rate": 1.461462467495284e-07,
"logits/chosen": 0.09238779544830322,
"logits/rejected": 0.5282326340675354,
"logps/chosen": -239.08853149414062,
"logps/rejected": -234.31228637695312,
"loss": 0.6582,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.020727628841996193,
"rewards/margins": 0.07139938324689865,
"rewards/margins_max": 0.0972491055727005,
"rewards/margins_min": 0.045549679547548294,
"rewards/margins_std": 0.036557018756866455,
"rewards/rejected": -0.0506717674434185,
"step": 1160
},
{
"epoch": 0.68,
"grad_norm": 0.400390625,
"learning_rate": 1.4156165451522028e-07,
"logits/chosen": 0.08472833782434464,
"logits/rejected": 0.5027869939804077,
"logps/chosen": -205.4404754638672,
"logps/rejected": -202.98440551757812,
"loss": 0.663,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.011948509141802788,
"rewards/margins": 0.06199117749929428,
"rewards/margins_max": 0.08956360816955566,
"rewards/margins_min": 0.03441876173019409,
"rewards/margins_std": 0.038993291556835175,
"rewards/rejected": -0.05004267022013664,
"step": 1170
},
{
"epoch": 0.68,
"grad_norm": 0.470703125,
"learning_rate": 1.3702155041427543e-07,
"logits/chosen": 0.1654224544763565,
"logits/rejected": 0.39103928208351135,
"logps/chosen": -221.5464630126953,
"logps/rejected": -246.1484832763672,
"loss": 0.6611,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.008782127872109413,
"rewards/margins": 0.05567712336778641,
"rewards/margins_max": 0.07324758917093277,
"rewards/margins_min": 0.038106657564640045,
"rewards/margins_std": 0.024848390370607376,
"rewards/rejected": -0.046894993633031845,
"step": 1180
},
{
"epoch": 0.69,
"grad_norm": 0.4375,
"learning_rate": 1.3252779707891902e-07,
"logits/chosen": 0.009541223756968975,
"logits/rejected": 0.48217493295669556,
"logps/chosen": -272.9510192871094,
"logps/rejected": -204.46435546875,
"loss": 0.6611,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.009134182706475258,
"rewards/margins": 0.05944829061627388,
"rewards/margins_max": 0.08002766221761703,
"rewards/margins_min": 0.03886892646551132,
"rewards/margins_std": 0.02910362184047699,
"rewards/rejected": -0.05031410977244377,
"step": 1190
},
{
"epoch": 0.7,
"grad_norm": 0.462890625,
"learning_rate": 1.2808223812541774e-07,
"logits/chosen": 0.07254563271999359,
"logits/rejected": 0.47662535309791565,
"logps/chosen": -241.54336547851562,
"logps/rejected": -211.88424682617188,
"loss": 0.6606,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0020990788470953703,
"rewards/margins": 0.05149079114198685,
"rewards/margins_max": 0.08034542202949524,
"rewards/margins_min": 0.022636160254478455,
"rewards/margins_std": 0.040806613862514496,
"rewards/rejected": -0.04939170926809311,
"step": 1200
},
{
"epoch": 0.7,
"grad_norm": 0.4375,
"learning_rate": 1.2368669739771469e-07,
"logits/chosen": 0.07886068522930145,
"logits/rejected": 0.4947189390659332,
"logps/chosen": -206.33993530273438,
"logps/rejected": -212.7965850830078,
"loss": 0.6578,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.009903495199978352,
"rewards/margins": 0.0682389959692955,
"rewards/margins_max": 0.09637950360774994,
"rewards/margins_min": 0.04009848088026047,
"rewards/margins_std": 0.03979669511318207,
"rewards/rejected": -0.058335501700639725,
"step": 1210
},
{
"epoch": 0.71,
"grad_norm": 0.439453125,
"learning_rate": 1.1934297821917497e-07,
"logits/chosen": -0.18527595698833466,
"logits/rejected": 0.35417476296424866,
"logps/chosen": -271.8248291015625,
"logps/rejected": -208.87966918945312,
"loss": 0.6619,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.014687316492199898,
"rewards/margins": 0.05254317447543144,
"rewards/margins_max": 0.0765123963356018,
"rewards/margins_min": 0.028573954477906227,
"rewards/margins_std": 0.03389759734272957,
"rewards/rejected": -0.03785586357116699,
"step": 1220
},
{
"epoch": 0.71,
"grad_norm": 0.40234375,
"learning_rate": 1.1505286265275094e-07,
"logits/chosen": 0.09351782500743866,
"logits/rejected": 0.5304566621780396,
"logps/chosen": -217.6367645263672,
"logps/rejected": -209.18603515625,
"loss": 0.666,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.01146542839705944,
"rewards/margins": 0.07028119266033173,
"rewards/margins_max": 0.10538403689861298,
"rewards/margins_min": 0.03517835959792137,
"rewards/margins_std": 0.0496429018676281,
"rewards/rejected": -0.05881576985120773,
"step": 1230
},
{
"epoch": 0.72,
"grad_norm": 0.390625,
"learning_rate": 1.1081811076986963e-07,
"logits/chosen": 0.026241421699523926,
"logits/rejected": 0.6041153073310852,
"logps/chosen": -228.3728790283203,
"logps/rejected": -190.1019287109375,
"loss": 0.6596,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.016418198123574257,
"rewards/margins": 0.0706411749124527,
"rewards/margins_max": 0.09941698610782623,
"rewards/margins_min": 0.041865330189466476,
"rewards/margins_std": 0.04069516435265541,
"rewards/rejected": -0.054222963750362396,
"step": 1240
},
{
"epoch": 0.73,
"grad_norm": 0.427734375,
"learning_rate": 1.0664045992834184e-07,
"logits/chosen": 0.19840288162231445,
"logits/rejected": 0.5584182143211365,
"logps/chosen": -254.10147094726562,
"logps/rejected": -256.0483703613281,
"loss": 0.6583,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.012557362206280231,
"rewards/margins": 0.06964166462421417,
"rewards/margins_max": 0.09085742384195328,
"rewards/margins_min": 0.04842590540647507,
"rewards/margins_std": 0.030003610998392105,
"rewards/rejected": -0.057084303349256516,
"step": 1250
},
{
"epoch": 0.73,
"grad_norm": 0.484375,
"learning_rate": 1.0252162405959042e-07,
"logits/chosen": -0.029180001467466354,
"logits/rejected": 0.4648149609565735,
"logps/chosen": -273.28375244140625,
"logps/rejected": -244.730712890625,
"loss": 0.6602,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.02007482200860977,
"rewards/margins": 0.06700652837753296,
"rewards/margins_max": 0.10410724580287933,
"rewards/margins_min": 0.029905814677476883,
"rewards/margins_std": 0.05246833711862564,
"rewards/rejected": -0.04693170636892319,
"step": 1260
},
{
"epoch": 0.74,
"grad_norm": 0.494140625,
"learning_rate": 9.846329296548963e-08,
"logits/chosen": -0.017562460154294968,
"logits/rejected": 0.4763096868991852,
"logps/chosen": -269.8515625,
"logps/rejected": -263.83148193359375,
"loss": 0.6598,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.010733803734183311,
"rewards/margins": 0.07448114454746246,
"rewards/margins_max": 0.10118886083364487,
"rewards/margins_min": 0.04777342826128006,
"rewards/margins_std": 0.03777041286230087,
"rewards/rejected": -0.0637473464012146,
"step": 1270
},
{
"epoch": 0.74,
"grad_norm": 0.486328125,
"learning_rate": 9.446713162510341e-08,
"logits/chosen": 0.22771111130714417,
"logits/rejected": 0.7621752023696899,
"logps/chosen": -266.06390380859375,
"logps/rejected": -250.635498046875,
"loss": 0.6584,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.030348753556609154,
"rewards/margins": 0.07343067973852158,
"rewards/margins_max": 0.10677297413349152,
"rewards/margins_min": 0.040088407695293427,
"rewards/margins_std": 0.04715309664607048,
"rewards/rejected": -0.04308192804455757,
"step": 1280
},
{
"epoch": 0.75,
"grad_norm": 0.515625,
"learning_rate": 9.053477951160737e-08,
"logits/chosen": 0.015399669297039509,
"logits/rejected": 0.7483765482902527,
"logps/chosen": -276.5067443847656,
"logps/rejected": -227.33761596679688,
"loss": 0.6579,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.026790842413902283,
"rewards/margins": 0.08279003202915192,
"rewards/margins_max": 0.11221597343683243,
"rewards/margins_min": 0.05336407572031021,
"rewards/margins_std": 0.04161457344889641,
"rewards/rejected": -0.05599917098879814,
"step": 1290
},
{
"epoch": 0.75,
"grad_norm": 0.396484375,
"learning_rate": 8.666784991967596e-08,
"logits/chosen": 0.010845961980521679,
"logits/rejected": 0.42500224709510803,
"logps/chosen": -213.1592254638672,
"logps/rejected": -199.2817840576172,
"loss": 0.6613,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.014592917636036873,
"rewards/margins": 0.0668349340558052,
"rewards/margins_max": 0.09872870147228241,
"rewards/margins_min": 0.03494114801287651,
"rewards/margins_std": 0.04510461539030075,
"rewards/rejected": -0.05224201828241348,
"step": 1300
},
{
"epoch": 0.76,
"grad_norm": 0.4921875,
"learning_rate": 8.286792930360823e-08,
"logits/chosen": 0.25165149569511414,
"logits/rejected": 0.6992672681808472,
"logps/chosen": -217.0974884033203,
"logps/rejected": -202.47030639648438,
"loss": 0.6599,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.011730032041668892,
"rewards/margins": 0.0590001717209816,
"rewards/margins_max": 0.07914995402097702,
"rewards/margins_min": 0.03885037824511528,
"rewards/margins_std": 0.02849610149860382,
"rewards/rejected": -0.04727013781666756,
"step": 1310
},
{
"epoch": 0.77,
"grad_norm": 0.52734375,
"learning_rate": 7.91365766264665e-08,
"logits/chosen": 0.20514824986457825,
"logits/rejected": 0.5356392860412598,
"logps/chosen": -248.6316680908203,
"logps/rejected": -240.5338134765625,
"loss": 0.6591,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.010535435751080513,
"rewards/margins": 0.06282900273799896,
"rewards/margins_max": 0.09407368302345276,
"rewards/margins_min": 0.031584326177835464,
"rewards/margins_std": 0.04418665170669556,
"rewards/rejected": -0.052293576300144196,
"step": 1320
},
{
"epoch": 0.77,
"grad_norm": 0.455078125,
"learning_rate": 7.547532272049264e-08,
"logits/chosen": 0.25605538487434387,
"logits/rejected": 0.6374403238296509,
"logps/chosen": -255.80410766601562,
"logps/rejected": -255.73764038085938,
"loss": 0.6619,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.013418711721897125,
"rewards/margins": 0.06125851348042488,
"rewards/margins_max": 0.08139893412590027,
"rewards/margins_min": 0.04111810773611069,
"rewards/margins_std": 0.028482843190431595,
"rewards/rejected": -0.047839801758527756,
"step": 1330
},
{
"epoch": 0.78,
"grad_norm": 0.4140625,
"learning_rate": 7.188566965906584e-08,
"logits/chosen": 0.10137088596820831,
"logits/rejected": 0.5515474081039429,
"logps/chosen": -271.2210693359375,
"logps/rejected": -272.3622131347656,
"loss": 0.6598,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.00015007219917606562,
"rewards/margins": 0.06623668223619461,
"rewards/margins_max": 0.10004226863384247,
"rewards/margins_min": 0.03243108466267586,
"rewards/margins_std": 0.04780833050608635,
"rewards/rejected": -0.06638675183057785,
"step": 1340
},
{
"epoch": 0.78,
"grad_norm": 0.412109375,
"learning_rate": 6.836909014045924e-08,
"logits/chosen": 0.005819192621856928,
"logits/rejected": 0.38501212000846863,
"logps/chosen": -247.23056030273438,
"logps/rejected": -238.4652557373047,
"loss": 0.6607,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.01672416180372238,
"rewards/margins": 0.07304920256137848,
"rewards/margins_max": 0.10092739760875702,
"rewards/margins_min": 0.04517098516225815,
"rewards/margins_std": 0.039425741881132126,
"rewards/rejected": -0.0563250370323658,
"step": 1350
},
{
"epoch": 0.79,
"grad_norm": 0.4609375,
"learning_rate": 6.492702688364737e-08,
"logits/chosen": -0.07613168656826019,
"logits/rejected": 0.20295462012290955,
"logps/chosen": -203.92233276367188,
"logps/rejected": -247.69277954101562,
"loss": 0.6604,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.014894701540470123,
"rewards/margins": 0.06641440093517303,
"rewards/margins_max": 0.09283626079559326,
"rewards/margins_min": 0.039992526173591614,
"rewards/margins_std": 0.037366170436143875,
"rewards/rejected": -0.05151969939470291,
"step": 1360
},
{
"epoch": 0.79,
"grad_norm": 0.4375,
"learning_rate": 6.156089203641373e-08,
"logits/chosen": -0.014948748052120209,
"logits/rejected": 0.4398605227470398,
"logps/chosen": -247.429931640625,
"logps/rejected": -251.06826782226562,
"loss": 0.6571,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0333079919219017,
"rewards/margins": 0.08266235888004303,
"rewards/margins_max": 0.10667815059423447,
"rewards/margins_min": 0.0586465522646904,
"rewards/margins_std": 0.03396347165107727,
"rewards/rejected": -0.04935435950756073,
"step": 1370
},
{
"epoch": 0.8,
"grad_norm": 0.427734375,
"learning_rate": 5.827206659599987e-08,
"logits/chosen": 0.28106218576431274,
"logits/rejected": 0.7749143242835999,
"logps/chosen": -222.03665161132812,
"logps/rejected": -200.11221313476562,
"loss": 0.6576,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.017674388363957405,
"rewards/margins": 0.07599468529224396,
"rewards/margins_max": 0.11385379731655121,
"rewards/margins_min": 0.038135576993227005,
"rewards/margins_std": 0.05354086682200432,
"rewards/rejected": -0.058320302516222,
"step": 1380
},
{
"epoch": 0.81,
"grad_norm": 0.50390625,
"learning_rate": 5.506189984253501e-08,
"logits/chosen": 0.16949541866779327,
"logits/rejected": 0.4548502564430237,
"logps/chosen": -205.447265625,
"logps/rejected": -221.4696044921875,
"loss": 0.6611,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.003050294006243348,
"rewards/margins": 0.06650832295417786,
"rewards/margins_max": 0.09234586358070374,
"rewards/margins_min": 0.040670786052942276,
"rewards/margins_std": 0.036539800465106964,
"rewards/rejected": -0.06345803290605545,
"step": 1390
},
{
"epoch": 0.81,
"grad_norm": 0.482421875,
"learning_rate": 5.1931708785477506e-08,
"logits/chosen": 0.11355874687433243,
"logits/rejected": 0.6481127738952637,
"logps/chosen": -216.15432739257812,
"logps/rejected": -187.30389404296875,
"loss": 0.6592,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.015445582568645477,
"rewards/margins": 0.05808136984705925,
"rewards/margins_max": 0.08922155201435089,
"rewards/margins_min": 0.026941198855638504,
"rewards/margins_std": 0.04403885826468468,
"rewards/rejected": -0.04263579100370407,
"step": 1400
},
{
"epoch": 0.82,
"grad_norm": 0.380859375,
"learning_rate": 4.888277762329582e-08,
"logits/chosen": 0.11872565746307373,
"logits/rejected": 0.5771151185035706,
"logps/chosen": -215.25442504882812,
"logps/rejected": -214.4876251220703,
"loss": 0.6619,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.01657172292470932,
"rewards/margins": 0.06676243245601654,
"rewards/margins_max": 0.0983147844672203,
"rewards/margins_min": 0.03521009162068367,
"rewards/margins_std": 0.04462175816297531,
"rewards/rejected": -0.05019070953130722,
"step": 1410
},
{
"epoch": 0.82,
"grad_norm": 0.439453125,
"learning_rate": 4.591635721661072e-08,
"logits/chosen": 0.1136382669210434,
"logits/rejected": 0.5482941269874573,
"logps/chosen": -243.9540557861328,
"logps/rejected": -231.51473999023438,
"loss": 0.6606,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.01714186929166317,
"rewards/margins": 0.07303180545568466,
"rewards/margins_max": 0.10039409250020981,
"rewards/margins_min": 0.045669522136449814,
"rewards/margins_std": 0.03869611397385597,
"rewards/rejected": -0.05588993430137634,
"step": 1420
},
{
"epoch": 0.83,
"grad_norm": 0.431640625,
"learning_rate": 4.3033664575015005e-08,
"logits/chosen": 0.24127981066703796,
"logits/rejected": 0.6273223161697388,
"logps/chosen": -258.4788818359375,
"logps/rejected": -255.1360321044922,
"loss": 0.6591,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.0254741869866848,
"rewards/margins": 0.0617264024913311,
"rewards/margins_max": 0.08791927993297577,
"rewards/margins_min": 0.035533517599105835,
"rewards/margins_std": 0.03704233095049858,
"rewards/rejected": -0.036252211779356,
"step": 1430
},
{
"epoch": 0.84,
"grad_norm": 0.4453125,
"learning_rate": 4.023588235778019e-08,
"logits/chosen": 0.048088885843753815,
"logits/rejected": 0.4085961878299713,
"logps/chosen": -235.32763671875,
"logps/rejected": -246.94937133789062,
"loss": 0.6625,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.017656199634075165,
"rewards/margins": 0.07100087404251099,
"rewards/margins_max": 0.09923985600471497,
"rewards/margins_min": 0.042761895805597305,
"rewards/margins_std": 0.039935946464538574,
"rewards/rejected": -0.05334467440843582,
"step": 1440
},
{
"epoch": 0.84,
"grad_norm": 0.4609375,
"learning_rate": 3.752415838865664e-08,
"logits/chosen": -0.09887398779392242,
"logits/rejected": 0.5310045480728149,
"logps/chosen": -245.59951782226562,
"logps/rejected": -266.8290100097656,
"loss": 0.6586,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.018602244555950165,
"rewards/margins": 0.08193326741456985,
"rewards/margins_max": 0.11139090359210968,
"rewards/margins_min": 0.05247562378644943,
"rewards/margins_std": 0.041659384965896606,
"rewards/rejected": -0.06333102285861969,
"step": 1450
},
{
"epoch": 0.85,
"grad_norm": 0.439453125,
"learning_rate": 3.4899605184965206e-08,
"logits/chosen": 0.03019891306757927,
"logits/rejected": 0.44324207305908203,
"logps/chosen": -225.20443725585938,
"logps/rejected": -183.06094360351562,
"loss": 0.6609,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.0028962846845388412,
"rewards/margins": 0.0560896173119545,
"rewards/margins_max": 0.07679092139005661,
"rewards/margins_min": 0.035388313233852386,
"rewards/margins_std": 0.02927606739103794,
"rewards/rejected": -0.05319333076477051,
"step": 1460
},
{
"epoch": 0.85,
"grad_norm": 0.439453125,
"learning_rate": 3.23632995011732e-08,
"logits/chosen": -0.06648756563663483,
"logits/rejected": 0.29680854082107544,
"logps/chosen": -226.04983520507812,
"logps/rejected": -258.3298034667969,
"loss": 0.6587,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.03215508535504341,
"rewards/margins": 0.08979654312133789,
"rewards/margins_max": 0.12097585201263428,
"rewards/margins_min": 0.058617234230041504,
"rewards/margins_std": 0.044094208627939224,
"rewards/rejected": -0.057641465216875076,
"step": 1470
},
{
"epoch": 0.86,
"grad_norm": 0.455078125,
"learning_rate": 2.991628188714351e-08,
"logits/chosen": 0.00623916694894433,
"logits/rejected": 0.48251962661743164,
"logps/chosen": -313.39935302734375,
"logps/rejected": -245.91720581054688,
"loss": 0.6596,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.004381291568279266,
"rewards/margins": 0.07124367356300354,
"rewards/margins_max": 0.09969727694988251,
"rewards/margins_min": 0.04279007390141487,
"rewards/margins_std": 0.04023946821689606,
"rewards/rejected": -0.06686238944530487,
"step": 1480
},
{
"epoch": 0.86,
"grad_norm": 0.4375,
"learning_rate": 2.755955626123596e-08,
"logits/chosen": 0.12439896166324615,
"logits/rejected": 0.6011586785316467,
"logps/chosen": -250.7643585205078,
"logps/rejected": -217.0757293701172,
"loss": 0.6624,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.018308712169528008,
"rewards/margins": 0.05787688493728638,
"rewards/margins_max": 0.09185748547315598,
"rewards/margins_min": 0.023896273225545883,
"rewards/margins_std": 0.04805583506822586,
"rewards/rejected": -0.03956816717982292,
"step": 1490
},
{
"epoch": 0.87,
"grad_norm": 0.42578125,
"learning_rate": 2.5294089498438225e-08,
"logits/chosen": 0.024487819522619247,
"logits/rejected": 0.5533932447433472,
"logps/chosen": -245.57492065429688,
"logps/rejected": -220.93258666992188,
"loss": 0.6584,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.010946778580546379,
"rewards/margins": 0.06493957340717316,
"rewards/margins_max": 0.0981217697262764,
"rewards/margins_min": 0.03175736218690872,
"rewards/margins_std": 0.046926725655794144,
"rewards/rejected": -0.05399278551340103,
"step": 1500
},
{
"epoch": 0.88,
"grad_norm": 0.48046875,
"learning_rate": 2.312081103369354e-08,
"logits/chosen": 0.10629892349243164,
"logits/rejected": 0.5729449987411499,
"logps/chosen": -227.0969696044922,
"logps/rejected": -209.62841796875,
"loss": 0.659,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.013625606894493103,
"rewards/margins": 0.05797373503446579,
"rewards/margins_max": 0.0893624946475029,
"rewards/margins_min": 0.02658497728407383,
"rewards/margins_std": 0.04439040273427963,
"rewards/rejected": -0.04434812813997269,
"step": 1510
},
{
"epoch": 0.88,
"grad_norm": 0.48046875,
"learning_rate": 2.104061248058872e-08,
"logits/chosen": 0.10214777290821075,
"logits/rejected": 0.4200982451438904,
"logps/chosen": -213.7083740234375,
"logps/rejected": -225.8516845703125,
"loss": 0.6666,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.018484923988580704,
"rewards/margins": 0.058260779827833176,
"rewards/margins_max": 0.08636601269245148,
"rewards/margins_min": 0.030155545100569725,
"rewards/margins_std": 0.03974680230021477,
"rewards/rejected": -0.03977585583925247,
"step": 1520
},
{
"epoch": 0.89,
"grad_norm": 0.44921875,
"learning_rate": 1.9054347265559213e-08,
"logits/chosen": 0.1583404242992401,
"logits/rejected": 0.6649370193481445,
"logps/chosen": -259.9563903808594,
"logps/rejected": -223.4931640625,
"loss": 0.6565,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.014935478568077087,
"rewards/margins": 0.07356850802898407,
"rewards/margins_max": 0.10868100821971893,
"rewards/margins_min": 0.0384560152888298,
"rewards/margins_std": 0.049656566232442856,
"rewards/rejected": -0.058633022010326385,
"step": 1530
},
{
"epoch": 0.89,
"grad_norm": 0.498046875,
"learning_rate": 1.716283027776061e-08,
"logits/chosen": 0.2019151747226715,
"logits/rejected": 0.8282853364944458,
"logps/chosen": -291.37066650390625,
"logps/rejected": -222.61831665039062,
"loss": 0.6634,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.016527246683835983,
"rewards/margins": 0.07255034148693085,
"rewards/margins_max": 0.1086968407034874,
"rewards/margins_min": 0.036403849720954895,
"rewards/margins_std": 0.05111886188387871,
"rewards/rejected": -0.05602309852838516,
"step": 1540
},
{
"epoch": 0.9,
"grad_norm": 0.4296875,
"learning_rate": 1.536683753475043e-08,
"logits/chosen": 0.22870250046253204,
"logits/rejected": 0.4174967408180237,
"logps/chosen": -219.11306762695312,
"logps/rejected": -241.36563110351562,
"loss": 0.6615,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.0025456459261476994,
"rewards/margins": 0.059264473617076874,
"rewards/margins_max": 0.08250005543231964,
"rewards/margins_min": 0.036028891801834106,
"rewards/margins_std": 0.032860077917575836,
"rewards/rejected": -0.061810124665498734,
"step": 1550
},
{
"epoch": 0.9,
"grad_norm": 0.390625,
"learning_rate": 1.3667105864117873e-08,
"logits/chosen": 0.21612632274627686,
"logits/rejected": 0.39824485778808594,
"logps/chosen": -200.84498596191406,
"logps/rejected": -228.2679901123047,
"loss": 0.6605,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.008642548695206642,
"rewards/margins": 0.0651601254940033,
"rewards/margins_max": 0.10423406213521957,
"rewards/margins_min": 0.026086175814270973,
"rewards/margins_std": 0.05525890737771988,
"rewards/rejected": -0.0565175786614418,
"step": 1560
},
{
"epoch": 0.91,
"grad_norm": 0.41796875,
"learning_rate": 1.2064332601191163e-08,
"logits/chosen": -0.04893340915441513,
"logits/rejected": 0.339263916015625,
"logps/chosen": -222.4666748046875,
"logps/rejected": -217.02999877929688,
"loss": 0.6612,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.0008535057422704995,
"rewards/margins": 0.05954117700457573,
"rewards/margins_max": 0.0829622894525528,
"rewards/margins_min": 0.03612007200717926,
"rewards/margins_std": 0.03312245011329651,
"rewards/rejected": -0.06039468199014664,
"step": 1570
},
{
"epoch": 0.92,
"grad_norm": 0.39453125,
"learning_rate": 1.0559175302947476e-08,
"logits/chosen": 0.012552693486213684,
"logits/rejected": 0.5173078775405884,
"logps/chosen": -260.0834045410156,
"logps/rejected": -247.43447875976562,
"loss": 0.6595,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.011661765165627003,
"rewards/margins": 0.06366874277591705,
"rewards/margins_max": 0.09778660535812378,
"rewards/margins_min": 0.029550885781645775,
"rewards/margins_std": 0.04824993759393692,
"rewards/rejected": -0.052006978541612625,
"step": 1580
},
{
"epoch": 0.92,
"grad_norm": 0.349609375,
"learning_rate": 9.152251478242417e-09,
"logits/chosen": -0.02594194933772087,
"logits/rejected": 0.4399421215057373,
"logps/chosen": -212.4099578857422,
"logps/rejected": -199.73458862304688,
"loss": 0.6594,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.007081738207489252,
"rewards/margins": 0.06215248256921768,
"rewards/margins_max": 0.08854631334543228,
"rewards/margins_min": 0.03575865179300308,
"rewards/margins_std": 0.03732650727033615,
"rewards/rejected": -0.055070746690034866,
"step": 1590
},
{
"epoch": 0.93,
"grad_norm": 0.427734375,
"learning_rate": 7.844138334469425e-09,
"logits/chosen": 0.4558231234550476,
"logits/rejected": 0.8965223431587219,
"logps/chosen": -201.3118438720703,
"logps/rejected": -192.5732421875,
"loss": 0.6628,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.009340132586658001,
"rewards/margins": 0.0616113655269146,
"rewards/margins_max": 0.09181926399469376,
"rewards/margins_min": 0.03140346333384514,
"rewards/margins_std": 0.04272041842341423,
"rewards/rejected": -0.05227123573422432,
"step": 1600
},
{
"epoch": 0.93,
"grad_norm": 0.37109375,
"learning_rate": 6.635372540753498e-09,
"logits/chosen": 0.11258337646722794,
"logits/rejected": 0.6999211311340332,
"logps/chosen": -240.33975219726562,
"logps/rejected": -214.0699920654297,
"loss": 0.6577,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.027147358283400536,
"rewards/margins": 0.0817473754286766,
"rewards/margins_max": 0.12004182487726212,
"rewards/margins_min": 0.0434529110789299,
"rewards/margins_std": 0.05415653437376022,
"rewards/rejected": -0.05460001155734062,
"step": 1610
},
{
"epoch": 0.94,
"grad_norm": 0.4609375,
"learning_rate": 5.526450007776435e-09,
"logits/chosen": 0.1300087720155716,
"logits/rejected": 0.5238357782363892,
"logps/chosen": -292.7140197753906,
"logps/rejected": -246.2644805908203,
"loss": 0.6611,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.0037552430294454098,
"rewards/margins": 0.05609096214175224,
"rewards/margins_max": 0.07447664439678192,
"rewards/margins_min": 0.03770528361201286,
"rewards/margins_std": 0.026001274585723877,
"rewards/rejected": -0.052335720509290695,
"step": 1620
},
{
"epoch": 0.95,
"grad_norm": 0.431640625,
"learning_rate": 4.517825684323323e-09,
"logits/chosen": 0.18602465093135834,
"logits/rejected": 0.5172281861305237,
"logps/chosen": -223.3422088623047,
"logps/rejected": -241.034912109375,
"loss": 0.6596,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.00845097191631794,
"rewards/margins": 0.06410791724920273,
"rewards/margins_max": 0.09119440615177155,
"rewards/margins_min": 0.037021439522504807,
"rewards/margins_std": 0.03830606862902641,
"rewards/rejected": -0.05565694719552994,
"step": 1630
},
{
"epoch": 0.95,
"grad_norm": 0.474609375,
"learning_rate": 3.6099133706344044e-09,
"logits/chosen": 0.13008326292037964,
"logits/rejected": 0.6074930429458618,
"logps/chosen": -223.1219940185547,
"logps/rejected": -207.696044921875,
"loss": 0.6569,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.02304968610405922,
"rewards/margins": 0.07380314916372299,
"rewards/margins_max": 0.09590893238782883,
"rewards/margins_min": 0.05169736221432686,
"rewards/margins_std": 0.03126230835914612,
"rewards/rejected": -0.05075346678495407,
"step": 1640
},
{
"epoch": 0.96,
"grad_norm": 0.4921875,
"learning_rate": 2.8030855486386174e-09,
"logits/chosen": 0.28828924894332886,
"logits/rejected": 0.6710017919540405,
"logps/chosen": -256.94903564453125,
"logps/rejected": -281.40411376953125,
"loss": 0.6586,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.023295782506465912,
"rewards/margins": 0.071876659989357,
"rewards/margins_max": 0.09554243832826614,
"rewards/margins_min": 0.048210885375738144,
"rewards/margins_std": 0.03346845880150795,
"rewards/rejected": -0.04858088120818138,
"step": 1650
},
{
"epoch": 0.96,
"grad_norm": 0.515625,
"learning_rate": 2.097673229138286e-09,
"logits/chosen": 0.16988131403923035,
"logits/rejected": 0.47897881269454956,
"logps/chosen": -224.6415557861328,
"logps/rejected": -232.2594451904297,
"loss": 0.6587,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.012618700973689556,
"rewards/margins": 0.07099349051713943,
"rewards/margins_max": 0.10776303708553314,
"rewards/margins_min": 0.03422392159700394,
"rewards/margins_std": 0.0520000159740448,
"rewards/rejected": -0.05837478116154671,
"step": 1660
},
{
"epoch": 0.97,
"grad_norm": 0.44921875,
"learning_rate": 1.493965816008136e-09,
"logits/chosen": -0.009510600939393044,
"logits/rejected": 0.3807966113090515,
"logps/chosen": -211.14254760742188,
"logps/rejected": -236.635498046875,
"loss": 0.6601,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.00740268686786294,
"rewards/margins": 0.07398059964179993,
"rewards/margins_max": 0.10376466810703278,
"rewards/margins_min": 0.04419652372598648,
"rewards/margins_std": 0.0421210452914238,
"rewards/rejected": -0.06657791137695312,
"step": 1670
},
{
"epoch": 0.97,
"grad_norm": 0.447265625,
"learning_rate": 9.922109874636875e-10,
"logits/chosen": 0.19054090976715088,
"logits/rejected": 0.557522177696228,
"logps/chosen": -233.7532501220703,
"logps/rejected": -239.6273651123047,
"loss": 0.6579,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.015365364961326122,
"rewards/margins": 0.08128596842288971,
"rewards/margins_max": 0.11999186128377914,
"rewards/margins_min": 0.04258008301258087,
"rewards/margins_std": 0.05473839119076729,
"rewards/rejected": -0.06592060625553131,
"step": 1680
},
{
"epoch": 0.98,
"grad_norm": 0.416015625,
"learning_rate": 5.926145944483984e-10,
"logits/chosen": 0.04970569908618927,
"logits/rejected": 0.41454869508743286,
"logps/chosen": -197.70941162109375,
"logps/rejected": -207.9854278564453,
"loss": 0.6625,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.009294511750340462,
"rewards/margins": 0.05480729788541794,
"rewards/margins_max": 0.08153598010540009,
"rewards/margins_min": 0.02807862125337124,
"rewards/margins_std": 0.03780006244778633,
"rewards/rejected": -0.04551279544830322,
"step": 1690
},
{
"epoch": 0.99,
"grad_norm": 0.42578125,
"learning_rate": 2.9534057618091356e-10,
"logits/chosen": 0.1366875320672989,
"logits/rejected": 0.4813140034675598,
"logps/chosen": -195.55368041992188,
"logps/rejected": -211.63711547851562,
"loss": 0.6599,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.014302869327366352,
"rewards/margins": 0.0652112141251564,
"rewards/margins_max": 0.09685875475406647,
"rewards/margins_min": 0.03356366977095604,
"rewards/margins_std": 0.04475637897849083,
"rewards/rejected": -0.05090833827853203,
"step": 1700
},
{
"epoch": 0.99,
"grad_norm": 0.4453125,
"learning_rate": 1.0051089289686565e-10,
"logits/chosen": 0.20965194702148438,
"logits/rejected": 0.5980690121650696,
"logps/chosen": -218.3548583984375,
"logps/rejected": -252.60159301757812,
"loss": 0.6601,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.01929156482219696,
"rewards/margins": 0.06570716202259064,
"rewards/margins_max": 0.09711313247680664,
"rewards/margins_min": 0.03430120274424553,
"rewards/margins_std": 0.044414736330509186,
"rewards/rejected": -0.04641559720039368,
"step": 1710
},
{
"epoch": 1.0,
"grad_norm": 0.404296875,
"learning_rate": 8.205475813372054e-12,
"logits/chosen": 0.07036467641592026,
"logits/rejected": 0.6885267496109009,
"logps/chosen": -334.186279296875,
"logps/rejected": -232.6072998046875,
"loss": 0.6604,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.015851657837629318,
"rewards/margins": 0.06690393388271332,
"rewards/margins_max": 0.0959465354681015,
"rewards/margins_min": 0.037861332297325134,
"rewards/margins_std": 0.041072435677051544,
"rewards/rejected": -0.0510522723197937,
"step": 1720
},
{
"epoch": 1.0,
"eval_logits/chosen": 0.7297662496566772,
"eval_logits/rejected": 0.8997808694839478,
"eval_logps/chosen": -337.8507080078125,
"eval_logps/rejected": -318.01556396484375,
"eval_loss": 0.6928703784942627,
"eval_rewards/accuracies": 0.5364999771118164,
"eval_rewards/chosen": 0.002909434260800481,
"eval_rewards/margins": 0.0005662557086907327,
"eval_rewards/margins_max": 0.07228709012269974,
"eval_rewards/margins_min": -0.08225506544113159,
"eval_rewards/margins_std": 0.050406549125909805,
"eval_rewards/rejected": 0.002343178726732731,
"eval_runtime": 864.7602,
"eval_samples_per_second": 9.251,
"eval_steps_per_second": 0.289,
"step": 1724
},
{
"epoch": 1.0,
"step": 1724,
"total_flos": 0.0,
"train_loss": 0.6676546893927447,
"train_runtime": 9120.8228,
"train_samples_per_second": 3.024,
"train_steps_per_second": 0.189
}
],
"logging_steps": 10,
"max_steps": 1724,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}