ziansu's picture
Training in progress, step 600, checkpoint
1aae851 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5156854318865493,
"eval_steps": 50,
"global_step": 600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008594757198109154,
"grad_norm": 0.05167795345187187,
"learning_rate": 4.999451708687114e-06,
"logits/chosen": 15.084823608398438,
"logits/rejected": 15.218259811401367,
"logps/chosen": -0.3124043345451355,
"logps/rejected": -0.31854626536369324,
"loss": 0.9405,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.46860653162002563,
"rewards/margins": 0.009212849661707878,
"rewards/rejected": -0.47781938314437866,
"step": 10
},
{
"epoch": 0.017189514396218308,
"grad_norm": 0.06444549560546875,
"learning_rate": 4.997807075247147e-06,
"logits/chosen": 14.565855026245117,
"logits/rejected": 14.914319038391113,
"logps/chosen": -0.28220412135124207,
"logps/rejected": -0.3605547249317169,
"loss": 0.9294,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.4233061671257019,
"rewards/margins": 0.11752591282129288,
"rewards/rejected": -0.5408320426940918,
"step": 20
},
{
"epoch": 0.02578427159432746,
"grad_norm": 0.059900399297475815,
"learning_rate": 4.9950668210706795e-06,
"logits/chosen": 14.878230094909668,
"logits/rejected": 15.334558486938477,
"logps/chosen": -0.2837519347667694,
"logps/rejected": -0.320808470249176,
"loss": 0.9338,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.4256278872489929,
"rewards/margins": 0.05558476969599724,
"rewards/rejected": -0.48121267557144165,
"step": 30
},
{
"epoch": 0.034379028792436615,
"grad_norm": 0.05459418520331383,
"learning_rate": 4.9912321481237616e-06,
"logits/chosen": 14.800946235656738,
"logits/rejected": 15.134121894836426,
"logps/chosen": -0.2971518635749817,
"logps/rejected": -0.3476788401603699,
"loss": 0.9202,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.4457278251647949,
"rewards/margins": 0.07579050213098526,
"rewards/rejected": -0.521518349647522,
"step": 40
},
{
"epoch": 0.042973785990545764,
"grad_norm": 0.05792691186070442,
"learning_rate": 4.986304738420684e-06,
"logits/chosen": 14.62980842590332,
"logits/rejected": 14.848493576049805,
"logps/chosen": -0.27511823177337646,
"logps/rejected": -0.32557612657546997,
"loss": 0.9213,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.4126773774623871,
"rewards/margins": 0.07568677514791489,
"rewards/rejected": -0.48836421966552734,
"step": 50
},
{
"epoch": 0.042973785990545764,
"eval_logits/chosen": 14.195974349975586,
"eval_logits/rejected": 15.046167373657227,
"eval_logps/chosen": -0.27934810519218445,
"eval_logps/rejected": -0.3643363118171692,
"eval_loss": 0.9250189065933228,
"eval_rewards/accuracies": 0.557894766330719,
"eval_rewards/chosen": -0.4190221428871155,
"eval_rewards/margins": 0.1274823397397995,
"eval_rewards/rejected": -0.5465044379234314,
"eval_runtime": 26.0506,
"eval_samples_per_second": 28.905,
"eval_steps_per_second": 3.647,
"step": 50
},
{
"epoch": 0.05156854318865492,
"grad_norm": 0.08806851506233215,
"learning_rate": 4.980286753286196e-06,
"logits/chosen": 14.311370849609375,
"logits/rejected": 15.19476318359375,
"logps/chosen": -0.26153135299682617,
"logps/rejected": -0.34108471870422363,
"loss": 0.9255,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.39229699969291687,
"rewards/margins": 0.11933007091283798,
"rewards/rejected": -0.5116270780563354,
"step": 60
},
{
"epoch": 0.060163300386764075,
"grad_norm": 0.10536951571702957,
"learning_rate": 4.973180832407471e-06,
"logits/chosen": 14.646909713745117,
"logits/rejected": 15.134190559387207,
"logps/chosen": -0.2928832173347473,
"logps/rejected": -0.37275972962379456,
"loss": 0.9155,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.4393247961997986,
"rewards/margins": 0.11981481313705444,
"rewards/rejected": -0.559139609336853,
"step": 70
},
{
"epoch": 0.06875805758487323,
"grad_norm": 0.07452531903982162,
"learning_rate": 4.964990092676263e-06,
"logits/chosen": 14.383807182312012,
"logits/rejected": 14.806958198547363,
"logps/chosen": -0.2724239230155945,
"logps/rejected": -0.33048146963119507,
"loss": 0.9191,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.4086359143257141,
"rewards/margins": 0.08708634227514267,
"rewards/rejected": -0.495722234249115,
"step": 80
},
{
"epoch": 0.07735281478298238,
"grad_norm": 0.06996195018291473,
"learning_rate": 4.9557181268217225e-06,
"logits/chosen": 14.557902336120605,
"logits/rejected": 15.043550491333008,
"logps/chosen": -0.3053165078163147,
"logps/rejected": -0.36941051483154297,
"loss": 0.9255,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.45797473192214966,
"rewards/margins": 0.0961410254240036,
"rewards/rejected": -0.5541157126426697,
"step": 90
},
{
"epoch": 0.08594757198109153,
"grad_norm": 0.09053988754749298,
"learning_rate": 4.9453690018345144e-06,
"logits/chosen": 13.747509956359863,
"logits/rejected": 14.678106307983398,
"logps/chosen": -0.2453141212463379,
"logps/rejected": -0.36430835723876953,
"loss": 0.9022,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.36797118186950684,
"rewards/margins": 0.17849135398864746,
"rewards/rejected": -0.5464625358581543,
"step": 100
},
{
"epoch": 0.08594757198109153,
"eval_logits/chosen": 14.017444610595703,
"eval_logits/rejected": 14.885564804077148,
"eval_logps/chosen": -0.2685285806655884,
"eval_logps/rejected": -0.3654690384864807,
"eval_loss": 0.9166209697723389,
"eval_rewards/accuracies": 0.557894766330719,
"eval_rewards/chosen": -0.4027928411960602,
"eval_rewards/margins": 0.14541073143482208,
"eval_rewards/rejected": -0.5482036471366882,
"eval_runtime": 26.0431,
"eval_samples_per_second": 28.914,
"eval_steps_per_second": 3.648,
"step": 100
},
{
"epoch": 0.09454232917920069,
"grad_norm": 0.07788874208927155,
"learning_rate": 4.933947257182901e-06,
"logits/chosen": 14.805160522460938,
"logits/rejected": 14.767298698425293,
"logps/chosen": -0.30586495995521545,
"logps/rejected": -0.3159794211387634,
"loss": 0.9128,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -0.45879751443862915,
"rewards/margins": 0.015171671286225319,
"rewards/rejected": -0.47396916151046753,
"step": 110
},
{
"epoch": 0.10313708637730984,
"grad_norm": 0.07691823691129684,
"learning_rate": 4.921457902821578e-06,
"logits/chosen": 13.761972427368164,
"logits/rejected": 14.64726448059082,
"logps/chosen": -0.2784760296344757,
"logps/rejected": -0.34076255559921265,
"loss": 0.9179,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.41771402955055237,
"rewards/margins": 0.09342982620000839,
"rewards/rejected": -0.5111438632011414,
"step": 120
},
{
"epoch": 0.11173184357541899,
"grad_norm": 0.08534488826990128,
"learning_rate": 4.907906416994146e-06,
"logits/chosen": 13.837780952453613,
"logits/rejected": 14.767657279968262,
"logps/chosen": -0.26367664337158203,
"logps/rejected": -0.3845904469490051,
"loss": 0.8978,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.39551490545272827,
"rewards/margins": 0.18137072026729584,
"rewards/rejected": -0.5768855810165405,
"step": 130
},
{
"epoch": 0.12032660077352815,
"grad_norm": 0.08117899298667908,
"learning_rate": 4.893298743830168e-06,
"logits/chosen": 13.270025253295898,
"logits/rejected": 14.128207206726074,
"logps/chosen": -0.24728116393089294,
"logps/rejected": -0.3510771095752716,
"loss": 0.9117,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.370921790599823,
"rewards/margins": 0.1556939035654068,
"rewards/rejected": -0.5266156196594238,
"step": 140
},
{
"epoch": 0.1289213579716373,
"grad_norm": 0.1263500601053238,
"learning_rate": 4.8776412907378845e-06,
"logits/chosen": 13.525009155273438,
"logits/rejected": 14.163309097290039,
"logps/chosen": -0.24874648451805115,
"logps/rejected": -0.38132259249687195,
"loss": 0.9007,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.3731197714805603,
"rewards/margins": 0.1988641768693924,
"rewards/rejected": -0.5719839334487915,
"step": 150
},
{
"epoch": 0.1289213579716373,
"eval_logits/chosen": 12.438652992248535,
"eval_logits/rejected": 13.519843101501465,
"eval_logps/chosen": -0.2689361274242401,
"eval_logps/rejected": -0.3897271454334259,
"eval_loss": 0.8991575241088867,
"eval_rewards/accuracies": 0.5894736647605896,
"eval_rewards/chosen": -0.40340420603752136,
"eval_rewards/margins": 0.1811865121126175,
"eval_rewards/rejected": -0.5845907330513,
"eval_runtime": 26.0482,
"eval_samples_per_second": 28.908,
"eval_steps_per_second": 3.647,
"step": 150
},
{
"epoch": 0.13751611516974646,
"grad_norm": 0.11390316486358643,
"learning_rate": 4.860940925593703e-06,
"logits/chosen": 12.494891166687012,
"logits/rejected": 13.346384048461914,
"logps/chosen": -0.26858460903167725,
"logps/rejected": -0.4170496463775635,
"loss": 0.8854,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.4028768539428711,
"rewards/margins": 0.22269758582115173,
"rewards/rejected": -0.6255744695663452,
"step": 160
},
{
"epoch": 0.1461108723678556,
"grad_norm": 0.14250700175762177,
"learning_rate": 4.84320497372973e-06,
"logits/chosen": 11.637483596801758,
"logits/rejected": 12.72177505493164,
"logps/chosen": -0.2967775762081146,
"logps/rejected": -0.440357506275177,
"loss": 0.8884,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.4451664090156555,
"rewards/margins": 0.21536986529827118,
"rewards/rejected": -0.6605362892150879,
"step": 170
},
{
"epoch": 0.15470562956596476,
"grad_norm": 0.174351766705513,
"learning_rate": 4.824441214720629e-06,
"logits/chosen": 11.577589988708496,
"logits/rejected": 12.179681777954102,
"logps/chosen": -0.29397666454315186,
"logps/rejected": -0.4009665548801422,
"loss": 0.8756,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.44096502661705017,
"rewards/margins": 0.16048480570316315,
"rewards/rejected": -0.6014498472213745,
"step": 180
},
{
"epoch": 0.1633003867640739,
"grad_norm": 0.22877676784992218,
"learning_rate": 4.804657878971252e-06,
"logits/chosen": 9.352752685546875,
"logits/rejected": 10.27645206451416,
"logps/chosen": -0.30452457070350647,
"logps/rejected": -0.4765443205833435,
"loss": 0.8781,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.4567868113517761,
"rewards/margins": 0.25802966952323914,
"rewards/rejected": -0.7148164510726929,
"step": 190
},
{
"epoch": 0.17189514396218306,
"grad_norm": 0.2517675459384918,
"learning_rate": 4.783863644106502e-06,
"logits/chosen": 8.136419296264648,
"logits/rejected": 9.26432991027832,
"logps/chosen": -0.3416380286216736,
"logps/rejected": -0.4680122435092926,
"loss": 0.8531,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.5124570727348328,
"rewards/margins": 0.18956127762794495,
"rewards/rejected": -0.7020183801651001,
"step": 200
},
{
"epoch": 0.17189514396218306,
"eval_logits/chosen": 7.26609992980957,
"eval_logits/rejected": 8.391904830932617,
"eval_logps/chosen": -0.31862083077430725,
"eval_logps/rejected": -0.5189473032951355,
"eval_loss": 0.8484573364257812,
"eval_rewards/accuracies": 0.6315789222717285,
"eval_rewards/chosen": -0.47793126106262207,
"eval_rewards/margins": 0.30048972368240356,
"eval_rewards/rejected": -0.7784210443496704,
"eval_runtime": 26.0496,
"eval_samples_per_second": 28.906,
"eval_steps_per_second": 3.647,
"step": 200
},
{
"epoch": 0.18048990116029223,
"grad_norm": 0.28971683979034424,
"learning_rate": 4.762067631165049e-06,
"logits/chosen": 7.321592807769775,
"logits/rejected": 7.871228218078613,
"logps/chosen": -0.3311695158481598,
"logps/rejected": -0.4879254400730133,
"loss": 0.8211,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.4967542588710785,
"rewards/margins": 0.23513388633728027,
"rewards/rejected": -0.7318881750106812,
"step": 210
},
{
"epoch": 0.18908465835840138,
"grad_norm": 0.568050742149353,
"learning_rate": 4.7392794005985324e-06,
"logits/chosen": 5.077876091003418,
"logits/rejected": 5.706583499908447,
"logps/chosen": -0.3127230405807495,
"logps/rejected": -0.5744297504425049,
"loss": 0.8331,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.46908459067344666,
"rewards/margins": 0.39256006479263306,
"rewards/rejected": -0.8616446256637573,
"step": 220
},
{
"epoch": 0.19767941555651053,
"grad_norm": 0.32453760504722595,
"learning_rate": 4.715508948078037e-06,
"logits/chosen": 4.265925407409668,
"logits/rejected": 4.2006964683532715,
"logps/chosen": -0.4032830595970154,
"logps/rejected": -0.6459742784500122,
"loss": 0.7986,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.6049246191978455,
"rewards/margins": 0.3640367388725281,
"rewards/rejected": -0.9689614176750183,
"step": 230
},
{
"epoch": 0.20627417275461968,
"grad_norm": 0.448809951543808,
"learning_rate": 4.690766700109659e-06,
"logits/chosen": 3.3534884452819824,
"logits/rejected": 3.4250903129577637,
"logps/chosen": -0.3817242383956909,
"logps/rejected": -0.7190496921539307,
"loss": 0.7708,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.5725863575935364,
"rewards/margins": 0.5059882402420044,
"rewards/rejected": -1.078574538230896,
"step": 240
},
{
"epoch": 0.21486892995272883,
"grad_norm": 0.4277574419975281,
"learning_rate": 4.665063509461098e-06,
"logits/chosen": 3.151397228240967,
"logits/rejected": 2.8183228969573975,
"logps/chosen": -0.44173598289489746,
"logps/rejected": -0.8323748707771301,
"loss": 0.7722,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6626039743423462,
"rewards/margins": 0.5859583616256714,
"rewards/rejected": -1.248562216758728,
"step": 250
},
{
"epoch": 0.21486892995272883,
"eval_logits/chosen": 2.520007848739624,
"eval_logits/rejected": 1.9197090864181519,
"eval_logps/chosen": -0.4703753888607025,
"eval_logps/rejected": -0.90553879737854,
"eval_loss": 0.7410055994987488,
"eval_rewards/accuracies": 0.6631578803062439,
"eval_rewards/chosen": -0.7055630087852478,
"eval_rewards/margins": 0.6527453064918518,
"eval_rewards/rejected": -1.3583083152770996,
"eval_runtime": 26.0441,
"eval_samples_per_second": 28.912,
"eval_steps_per_second": 3.648,
"step": 250
},
{
"epoch": 0.22346368715083798,
"grad_norm": 0.5626497268676758,
"learning_rate": 4.638410650401267e-06,
"logits/chosen": 1.2351257801055908,
"logits/rejected": 0.5925868153572083,
"logps/chosen": -0.46581563353538513,
"logps/rejected": -0.9673674702644348,
"loss": 0.6933,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.6987233757972717,
"rewards/margins": 0.7523276209831238,
"rewards/rejected": -1.451051115989685,
"step": 260
},
{
"epoch": 0.23205844434894715,
"grad_norm": 0.7433231472969055,
"learning_rate": 4.610819813755038e-06,
"logits/chosen": 3.1690659523010254,
"logits/rejected": 2.0423803329467773,
"logps/chosen": -0.506645679473877,
"logps/rejected": -1.0180162191390991,
"loss": 0.7265,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7599684596061707,
"rewards/margins": 0.767055869102478,
"rewards/rejected": -1.527024507522583,
"step": 270
},
{
"epoch": 0.2406532015470563,
"grad_norm": 1.4220589399337769,
"learning_rate": 4.582303101775249e-06,
"logits/chosen": 2.8173985481262207,
"logits/rejected": 1.5537467002868652,
"logps/chosen": -0.5869659185409546,
"logps/rejected": -1.1085975170135498,
"loss": 0.6725,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.8804486989974976,
"rewards/margins": 0.7824474573135376,
"rewards/rejected": -1.6628963947296143,
"step": 280
},
{
"epoch": 0.24924795874516545,
"grad_norm": 0.6397098898887634,
"learning_rate": 4.55287302283426e-06,
"logits/chosen": 2.734229564666748,
"logits/rejected": 1.9948323965072632,
"logps/chosen": -0.6540845036506653,
"logps/rejected": -1.451608419418335,
"loss": 0.571,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.9811266660690308,
"rewards/margins": 1.1962860822677612,
"rewards/rejected": -2.177412748336792,
"step": 290
},
{
"epoch": 0.2578427159432746,
"grad_norm": 0.4591177701950073,
"learning_rate": 4.522542485937369e-06,
"logits/chosen": 2.2491040229797363,
"logits/rejected": 1.345014214515686,
"logps/chosen": -0.6877793073654175,
"logps/rejected": -1.6054528951644897,
"loss": 0.5782,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.0316689014434814,
"rewards/margins": 1.3765103816986084,
"rewards/rejected": -2.408179521560669,
"step": 300
},
{
"epoch": 0.2578427159432746,
"eval_logits/chosen": 1.661840796470642,
"eval_logits/rejected": 0.6246702671051025,
"eval_logps/chosen": -0.7322248816490173,
"eval_logps/rejected": -2.272771120071411,
"eval_loss": 0.563686728477478,
"eval_rewards/accuracies": 0.7157894968986511,
"eval_rewards/chosen": -1.0983372926712036,
"eval_rewards/margins": 2.310819387435913,
"eval_rewards/rejected": -3.409156560897827,
"eval_runtime": 26.0455,
"eval_samples_per_second": 28.911,
"eval_steps_per_second": 3.647,
"step": 300
},
{
"epoch": 0.2664374731413838,
"grad_norm": 0.786809504032135,
"learning_rate": 4.491324795060491e-06,
"logits/chosen": 1.3445788621902466,
"logits/rejected": 0.4989510178565979,
"logps/chosen": -0.7276264429092407,
"logps/rejected": -2.3235878944396973,
"loss": 0.5253,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0914397239685059,
"rewards/margins": 2.393942356109619,
"rewards/rejected": -3.485382080078125,
"step": 310
},
{
"epoch": 0.2750322303394929,
"grad_norm": 0.3913320004940033,
"learning_rate": 4.4592336433146e-06,
"logits/chosen": 2.61965012550354,
"logits/rejected": 1.9477211236953735,
"logps/chosen": -0.7146936655044556,
"logps/rejected": -1.9647115468978882,
"loss": 0.5294,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.0720404386520386,
"rewards/margins": 1.8750267028808594,
"rewards/rejected": -2.9470672607421875,
"step": 320
},
{
"epoch": 0.28362698753760207,
"grad_norm": 0.4867005944252014,
"learning_rate": 4.426283106939474e-06,
"logits/chosen": 2.500439167022705,
"logits/rejected": 1.6413562297821045,
"logps/chosen": -0.8710287809371948,
"logps/rejected": -2.36894154548645,
"loss": 0.548,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.306543231010437,
"rewards/margins": 2.246868848800659,
"rewards/rejected": -3.5534119606018066,
"step": 330
},
{
"epoch": 0.2922217447357112,
"grad_norm": 0.8009849786758423,
"learning_rate": 4.3924876391293915e-06,
"logits/chosen": 1.3847177028656006,
"logits/rejected": 0.8994542360305786,
"logps/chosen": -0.8447234034538269,
"logps/rejected": -2.800283908843994,
"loss": 0.4797,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.2670851945877075,
"rewards/margins": 2.9333412647247314,
"rewards/rejected": -4.2004265785217285,
"step": 340
},
{
"epoch": 0.30081650193382037,
"grad_norm": 2.0202796459198,
"learning_rate": 4.357862063693486e-06,
"logits/chosen": 2.3197357654571533,
"logits/rejected": 1.37326180934906,
"logps/chosen": -0.8590717315673828,
"logps/rejected": -2.1532845497131348,
"loss": 0.5126,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.2886077165603638,
"rewards/margins": 1.941319465637207,
"rewards/rejected": -3.2299270629882812,
"step": 350
},
{
"epoch": 0.30081650193382037,
"eval_logits/chosen": 2.0864102840423584,
"eval_logits/rejected": 1.2036340236663818,
"eval_logps/chosen": -0.9554746150970459,
"eval_logps/rejected": -3.0601954460144043,
"eval_loss": 0.5108997821807861,
"eval_rewards/accuracies": 0.7368420958518982,
"eval_rewards/chosen": -1.4332119226455688,
"eval_rewards/margins": 3.15708065032959,
"eval_rewards/rejected": -4.590292930603027,
"eval_runtime": 26.0503,
"eval_samples_per_second": 28.906,
"eval_steps_per_second": 3.647,
"step": 350
},
{
"epoch": 0.3094112591319295,
"grad_norm": 1.0668681859970093,
"learning_rate": 4.322421568553529e-06,
"logits/chosen": 1.6770871877670288,
"logits/rejected": 1.073407530784607,
"logps/chosen": -1.1393296718597412,
"logps/rejected": -2.886169910430908,
"loss": 0.5031,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.7089945077896118,
"rewards/margins": 2.620260238647461,
"rewards/rejected": -4.329255104064941,
"step": 360
},
{
"epoch": 0.31800601633003867,
"grad_norm": 0.5015287399291992,
"learning_rate": 4.286181699082008e-06,
"logits/chosen": 2.156587600708008,
"logits/rejected": 1.371209979057312,
"logps/chosen": -0.9851818084716797,
"logps/rejected": -3.2286324501037598,
"loss": 0.4662,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.47777259349823,
"rewards/margins": 3.3651764392852783,
"rewards/rejected": -4.842948913574219,
"step": 370
},
{
"epoch": 0.3266007735281478,
"grad_norm": 0.9893808960914612,
"learning_rate": 4.249158351283414e-06,
"logits/chosen": 2.6184191703796387,
"logits/rejected": 2.212998390197754,
"logps/chosen": -0.9414733052253723,
"logps/rejected": -2.940886974334717,
"loss": 0.4829,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.4122098684310913,
"rewards/margins": 2.9991202354431152,
"rewards/rejected": -4.411330223083496,
"step": 380
},
{
"epoch": 0.33519553072625696,
"grad_norm": 0.7588702440261841,
"learning_rate": 4.211367764821722e-06,
"logits/chosen": 3.257941484451294,
"logits/rejected": 2.5362088680267334,
"logps/chosen": -1.182255744934082,
"logps/rejected": -2.8621151447296143,
"loss": 0.4538,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.7733834981918335,
"rewards/margins": 2.5197887420654297,
"rewards/rejected": -4.293172359466553,
"step": 390
},
{
"epoch": 0.3437902879243661,
"grad_norm": 0.6317985653877258,
"learning_rate": 4.172826515897146e-06,
"logits/chosen": 3.057791233062744,
"logits/rejected": 2.4121367931365967,
"logps/chosen": -1.0847463607788086,
"logps/rejected": -3.3152599334716797,
"loss": 0.4847,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.6271196603775024,
"rewards/margins": 3.3457705974578857,
"rewards/rejected": -4.9728899002075195,
"step": 400
},
{
"epoch": 0.3437902879243661,
"eval_logits/chosen": 2.9584426879882812,
"eval_logits/rejected": 2.292771577835083,
"eval_logps/chosen": -1.202886939048767,
"eval_logps/rejected": -3.6770312786102295,
"eval_loss": 0.47303518652915955,
"eval_rewards/accuracies": 0.7473683953285217,
"eval_rewards/chosen": -1.8043304681777954,
"eval_rewards/margins": 3.711216688156128,
"eval_rewards/rejected": -5.515547275543213,
"eval_runtime": 26.0247,
"eval_samples_per_second": 28.934,
"eval_steps_per_second": 3.65,
"step": 400
},
{
"epoch": 0.3523850451224753,
"grad_norm": 1.0523916482925415,
"learning_rate": 4.133551509975264e-06,
"logits/chosen": 2.9360365867614746,
"logits/rejected": 2.330521583557129,
"logps/chosen": -1.3002166748046875,
"logps/rejected": -3.2887542247772217,
"loss": 0.4398,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.9503250122070312,
"rewards/margins": 2.9828057289123535,
"rewards/rejected": -4.933130741119385,
"step": 410
},
{
"epoch": 0.36097980232058446,
"grad_norm": 0.6079875826835632,
"learning_rate": 4.093559974371725e-06,
"logits/chosen": 3.1500794887542725,
"logits/rejected": 2.329282283782959,
"logps/chosen": -1.23466157913208,
"logps/rejected": -3.291548252105713,
"loss": 0.4774,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.8519923686981201,
"rewards/margins": 3.085329532623291,
"rewards/rejected": -4.93732213973999,
"step": 420
},
{
"epoch": 0.3695745595186936,
"grad_norm": 1.3175437450408936,
"learning_rate": 4.052869450695776e-06,
"logits/chosen": 3.4488296508789062,
"logits/rejected": 2.6282899379730225,
"logps/chosen": -1.380877137184143,
"logps/rejected": -4.005017280578613,
"loss": 0.4158,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.0713157653808594,
"rewards/margins": 3.9362099170684814,
"rewards/rejected": -6.007525444030762,
"step": 430
},
{
"epoch": 0.37816931671680276,
"grad_norm": 3.7249863147735596,
"learning_rate": 4.011497787155938e-06,
"logits/chosen": 2.5173678398132324,
"logits/rejected": 1.943926215171814,
"logps/chosen": -1.7800304889678955,
"logps/rejected": -4.422289848327637,
"loss": 0.3916,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -2.6700453758239746,
"rewards/margins": 3.9633898735046387,
"rewards/rejected": -6.633435249328613,
"step": 440
},
{
"epoch": 0.3867640739149119,
"grad_norm": 2.9776103496551514,
"learning_rate": 3.969463130731183e-06,
"logits/chosen": 3.2318034172058105,
"logits/rejected": 2.5253517627716064,
"logps/chosen": -2.309701442718506,
"logps/rejected": -4.725776672363281,
"loss": 0.368,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -3.464552640914917,
"rewards/margins": 3.624112606048584,
"rewards/rejected": -7.0886640548706055,
"step": 450
},
{
"epoch": 0.3867640739149119,
"eval_logits/chosen": 2.397157907485962,
"eval_logits/rejected": 2.0492196083068848,
"eval_logps/chosen": -2.6244213581085205,
"eval_logps/rejected": -5.247391700744629,
"eval_loss": 0.3982011079788208,
"eval_rewards/accuracies": 0.8842105269432068,
"eval_rewards/chosen": -3.936631917953491,
"eval_rewards/margins": 3.934455633163452,
"eval_rewards/rejected": -7.87108850479126,
"eval_runtime": 26.0501,
"eval_samples_per_second": 28.906,
"eval_steps_per_second": 3.647,
"step": 450
},
{
"epoch": 0.39535883111302106,
"grad_norm": 2.3925623893737793,
"learning_rate": 3.92678391921108e-06,
"logits/chosen": 3.0329971313476562,
"logits/rejected": 2.67683482170105,
"logps/chosen": -2.4644994735717773,
"logps/rejected": -4.755246162414551,
"loss": 0.3584,
"rewards/accuracies": 0.8125,
"rewards/chosen": -3.696749210357666,
"rewards/margins": 3.436119794845581,
"rewards/rejected": -7.132868766784668,
"step": 460
},
{
"epoch": 0.4039535883111302,
"grad_norm": 3.1981327533721924,
"learning_rate": 3.88347887310836e-06,
"logits/chosen": 2.219741106033325,
"logits/rejected": 1.8649622201919556,
"logps/chosen": -2.2890329360961914,
"logps/rejected": -5.124932289123535,
"loss": 0.3709,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -3.433549404144287,
"rewards/margins": 4.253849029541016,
"rewards/rejected": -7.687398433685303,
"step": 470
},
{
"epoch": 0.41254834550923936,
"grad_norm": 2.0272741317749023,
"learning_rate": 3.839566987447492e-06,
"logits/chosen": 3.6659038066864014,
"logits/rejected": 3.202749252319336,
"logps/chosen": -2.5729193687438965,
"logps/rejected": -4.992354393005371,
"loss": 0.3837,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -3.859379529953003,
"rewards/margins": 3.629152297973633,
"rewards/rejected": -7.488531589508057,
"step": 480
},
{
"epoch": 0.4211431027073485,
"grad_norm": 2.5182268619537354,
"learning_rate": 3.795067523432826e-06,
"logits/chosen": 3.327012538909912,
"logits/rejected": 3.1205530166625977,
"logps/chosen": -3.016247510910034,
"logps/rejected": -5.566779136657715,
"loss": 0.3112,
"rewards/accuracies": 0.875,
"rewards/chosen": -4.524371147155762,
"rewards/margins": 3.8257980346679688,
"rewards/rejected": -8.35016918182373,
"step": 490
},
{
"epoch": 0.42973785990545765,
"grad_norm": 2.990694046020508,
"learning_rate": 3.7500000000000005e-06,
"logits/chosen": 2.7793381214141846,
"logits/rejected": 2.7330098152160645,
"logps/chosen": -2.7836732864379883,
"logps/rejected": -5.60109806060791,
"loss": 0.3069,
"rewards/accuracies": 0.875,
"rewards/chosen": -4.175509929656982,
"rewards/margins": 4.226136684417725,
"rewards/rejected": -8.401647567749023,
"step": 500
},
{
"epoch": 0.42973785990545765,
"eval_logits/chosen": 2.5767242908477783,
"eval_logits/rejected": 2.1918540000915527,
"eval_logps/chosen": -3.1751770973205566,
"eval_logps/rejected": -6.361191749572754,
"eval_loss": 0.35469338297843933,
"eval_rewards/accuracies": 0.9157894849777222,
"eval_rewards/chosen": -4.762764930725098,
"eval_rewards/margins": 4.779022693634033,
"eval_rewards/rejected": -9.541787147521973,
"eval_runtime": 26.0483,
"eval_samples_per_second": 28.908,
"eval_steps_per_second": 3.647,
"step": 500
},
{
"epoch": 0.4383326171035668,
"grad_norm": 3.1177096366882324,
"learning_rate": 3.7043841852542884e-06,
"logits/chosen": 3.4840216636657715,
"logits/rejected": 2.871774196624756,
"logps/chosen": -2.739344596862793,
"logps/rejected": -5.363945960998535,
"loss": 0.3468,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -4.1090168952941895,
"rewards/margins": 3.9369025230407715,
"rewards/rejected": -8.045918464660645,
"step": 510
},
{
"epoch": 0.44692737430167595,
"grad_norm": 2.212597131729126,
"learning_rate": 3.658240087799655e-06,
"logits/chosen": 2.8667449951171875,
"logits/rejected": 2.463776111602783,
"logps/chosen": -3.17940092086792,
"logps/rejected": -6.375420570373535,
"loss": 0.3092,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -4.769101619720459,
"rewards/margins": 4.794029235839844,
"rewards/rejected": -9.563131332397461,
"step": 520
},
{
"epoch": 0.45552213149978515,
"grad_norm": 4.475163459777832,
"learning_rate": 3.611587947962319e-06,
"logits/chosen": 3.234764814376831,
"logits/rejected": 2.6656813621520996,
"logps/chosen": -3.0503814220428467,
"logps/rejected": -5.525468826293945,
"loss": 0.3044,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -4.5755720138549805,
"rewards/margins": 3.7126305103302,
"rewards/rejected": -8.288202285766602,
"step": 530
},
{
"epoch": 0.4641168886978943,
"grad_norm": 1.8678548336029053,
"learning_rate": 3.564448228912682e-06,
"logits/chosen": 2.1433145999908447,
"logits/rejected": 2.1412692070007324,
"logps/chosen": -2.6177189350128174,
"logps/rejected": -5.8179192543029785,
"loss": 0.3376,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -3.9265785217285156,
"rewards/margins": 4.800299644470215,
"rewards/rejected": -8.72687816619873,
"step": 540
},
{
"epoch": 0.47271164589600345,
"grad_norm": 2.3289716243743896,
"learning_rate": 3.516841607689501e-06,
"logits/chosen": 2.7216885089874268,
"logits/rejected": 2.549870729446411,
"logps/chosen": -2.7370285987854004,
"logps/rejected": -5.929703712463379,
"loss": 0.2937,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -4.1055426597595215,
"rewards/margins": 4.7890119552612305,
"rewards/rejected": -8.894556045532227,
"step": 550
},
{
"epoch": 0.47271164589600345,
"eval_logits/chosen": 2.7431576251983643,
"eval_logits/rejected": 2.386326789855957,
"eval_logps/chosen": -3.3791866302490234,
"eval_logps/rejected": -6.955687999725342,
"eval_loss": 0.33076339960098267,
"eval_rewards/accuracies": 0.9157894849777222,
"eval_rewards/chosen": -5.068779945373535,
"eval_rewards/margins": 5.364751815795898,
"eval_rewards/rejected": -10.433531761169434,
"eval_runtime": 26.0558,
"eval_samples_per_second": 28.899,
"eval_steps_per_second": 3.646,
"step": 550
},
{
"epoch": 0.4813064030941126,
"grad_norm": 2.7705740928649902,
"learning_rate": 3.4687889661302577e-06,
"logits/chosen": 2.2392983436584473,
"logits/rejected": 1.9859422445297241,
"logps/chosen": -3.14917254447937,
"logps/rejected": -6.809067726135254,
"loss": 0.2983,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -4.723758697509766,
"rewards/margins": 5.489841938018799,
"rewards/rejected": -10.213602066040039,
"step": 560
},
{
"epoch": 0.48990116029222175,
"grad_norm": 2.1203205585479736,
"learning_rate": 3.4203113817116955e-06,
"logits/chosen": 2.5817489624023438,
"logits/rejected": 2.54498291015625,
"logps/chosen": -3.4195308685302734,
"logps/rejected": -7.411266326904297,
"loss": 0.3014,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -5.129295349121094,
"rewards/margins": 5.987602710723877,
"rewards/rejected": -11.116899490356445,
"step": 570
},
{
"epoch": 0.4984959174903309,
"grad_norm": 1.7489718198776245,
"learning_rate": 3.3714301183045382e-06,
"logits/chosen": 2.1257646083831787,
"logits/rejected": 2.1210994720458984,
"logps/chosen": -2.9680445194244385,
"logps/rejected": -6.824588775634766,
"loss": 0.2752,
"rewards/accuracies": 0.9375,
"rewards/chosen": -4.452066898345947,
"rewards/margins": 5.784815788269043,
"rewards/rejected": -10.236883163452148,
"step": 580
},
{
"epoch": 0.50709067468844,
"grad_norm": 2.1680099964141846,
"learning_rate": 3.3221666168464584e-06,
"logits/chosen": 2.5764970779418945,
"logits/rejected": 2.2523038387298584,
"logps/chosen": -3.667435884475708,
"logps/rejected": -7.162708282470703,
"loss": 0.2968,
"rewards/accuracies": 0.9375,
"rewards/chosen": -5.501153945922852,
"rewards/margins": 5.242908954620361,
"rewards/rejected": -10.744061470031738,
"step": 590
},
{
"epoch": 0.5156854318865493,
"grad_norm": 1.7536494731903076,
"learning_rate": 3.272542485937369e-06,
"logits/chosen": 2.2658116817474365,
"logits/rejected": 1.980126142501831,
"logps/chosen": -3.5995922088623047,
"logps/rejected": -7.158552646636963,
"loss": 0.2971,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -5.399388313293457,
"rewards/margins": 5.338440418243408,
"rewards/rejected": -10.737829208374023,
"step": 600
},
{
"epoch": 0.5156854318865493,
"eval_logits/chosen": 2.6781415939331055,
"eval_logits/rejected": 2.508939027786255,
"eval_logps/chosen": -3.80741548538208,
"eval_logps/rejected": -7.577634334564209,
"eval_loss": 0.3210188150405884,
"eval_rewards/accuracies": 0.9368420839309692,
"eval_rewards/chosen": -5.711122989654541,
"eval_rewards/margins": 5.655328273773193,
"eval_rewards/rejected": -11.366451263427734,
"eval_runtime": 26.0494,
"eval_samples_per_second": 28.907,
"eval_steps_per_second": 3.647,
"step": 600
}
],
"logging_steps": 10,
"max_steps": 1500,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.4077101809126605e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}