zephyr-7b-dpo-full / trainer_state.json
wzhouad's picture
Model save
2490fbf verified
raw
history blame
No virus
51.3 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9968602825745683,
"eval_steps": 100,
"global_step": 954,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02,
"learning_rate": 1.0416666666666667e-07,
"logits/chosen": 0.2709607779979706,
"logits/rejected": 0.36084669828414917,
"logps/chosen": -304.1212463378906,
"logps/rejected": -281.92694091796875,
"loss": 0.1836,
"rewards/accuracies": 0.4000000059604645,
"rewards/chosen": -0.00025563794770278037,
"rewards/margins": -4.445898957783356e-05,
"rewards/rejected": -0.00021117893629707396,
"step": 10
},
{
"epoch": 0.04,
"learning_rate": 2.0833333333333333e-07,
"logits/chosen": 0.3387250602245331,
"logits/rejected": 0.365884006023407,
"logps/chosen": -287.37677001953125,
"logps/rejected": -261.12213134765625,
"loss": 0.1853,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": -0.0006877075065858662,
"rewards/margins": -0.0008078098180703819,
"rewards/rejected": 0.00012010247155558318,
"step": 20
},
{
"epoch": 0.06,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": 0.19636110961437225,
"logits/rejected": 0.2971157133579254,
"logps/chosen": -355.48052978515625,
"logps/rejected": -307.60101318359375,
"loss": 0.1858,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0008693916606716812,
"rewards/margins": 0.0018057005945593119,
"rewards/rejected": -0.002675092313438654,
"step": 30
},
{
"epoch": 0.08,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": 0.2009788304567337,
"logits/rejected": 0.2732384204864502,
"logps/chosen": -320.2412414550781,
"logps/rejected": -295.5198059082031,
"loss": 0.1786,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.0026089001912623644,
"rewards/margins": 0.005245196167379618,
"rewards/rejected": -0.007854094728827477,
"step": 40
},
{
"epoch": 0.1,
"learning_rate": 5.208333333333334e-07,
"logits/chosen": 0.30105799436569214,
"logits/rejected": 0.3381732106208801,
"logps/chosen": -329.18377685546875,
"logps/rejected": -330.98297119140625,
"loss": 0.1792,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.004522019065916538,
"rewards/margins": 0.024444926530122757,
"rewards/rejected": -0.02896694466471672,
"step": 50
},
{
"epoch": 0.13,
"learning_rate": 6.249999999999999e-07,
"logits/chosen": 0.2789975106716156,
"logits/rejected": 0.37255367636680603,
"logps/chosen": -289.91961669921875,
"logps/rejected": -293.2933044433594,
"loss": 0.1882,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.022119298577308655,
"rewards/margins": 0.04484058916568756,
"rewards/rejected": -0.06695988774299622,
"step": 60
},
{
"epoch": 0.15,
"learning_rate": 7.291666666666666e-07,
"logits/chosen": 0.40268006920814514,
"logits/rejected": 0.4395596981048584,
"logps/chosen": -272.30242919921875,
"logps/rejected": -299.4195861816406,
"loss": 0.1725,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.05775861814618111,
"rewards/margins": 0.07561491429805756,
"rewards/rejected": -0.13337352871894836,
"step": 70
},
{
"epoch": 0.17,
"learning_rate": 8.333333333333333e-07,
"logits/chosen": 0.38377270102500916,
"logits/rejected": 0.427955687046051,
"logps/chosen": -305.79461669921875,
"logps/rejected": -306.46527099609375,
"loss": 0.1546,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.11612454801797867,
"rewards/margins": 0.15051202476024628,
"rewards/rejected": -0.26663655042648315,
"step": 80
},
{
"epoch": 0.19,
"learning_rate": 9.374999999999999e-07,
"logits/chosen": 0.40008336305618286,
"logits/rejected": 0.5159471035003662,
"logps/chosen": -304.7159423828125,
"logps/rejected": -290.49200439453125,
"loss": 0.1296,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.20290033519268036,
"rewards/margins": 0.20165178179740906,
"rewards/rejected": -0.4045521318912506,
"step": 90
},
{
"epoch": 0.21,
"learning_rate": 9.999463737538052e-07,
"logits/chosen": 0.3695070147514343,
"logits/rejected": 0.4569215774536133,
"logps/chosen": -343.1157531738281,
"logps/rejected": -321.3959655761719,
"loss": 0.111,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.3429110646247864,
"rewards/margins": 0.19276174902915955,
"rewards/rejected": -0.5356727838516235,
"step": 100
},
{
"epoch": 0.21,
"eval_logits/chosen": 0.39762556552886963,
"eval_logits/rejected": 0.4520445764064789,
"eval_logps/chosen": -336.5851135253906,
"eval_logps/rejected": -375.96063232421875,
"eval_loss": 0.10800629109144211,
"eval_rewards/accuracies": 0.71484375,
"eval_rewards/chosen": -0.3300043046474457,
"eval_rewards/margins": 0.3133509159088135,
"eval_rewards/rejected": -0.6433552503585815,
"eval_runtime": 74.5651,
"eval_samples_per_second": 26.822,
"eval_steps_per_second": 0.429,
"step": 100
},
{
"epoch": 0.23,
"learning_rate": 9.993432105822034e-07,
"logits/chosen": 0.29598233103752136,
"logits/rejected": 0.3596528172492981,
"logps/chosen": -346.12603759765625,
"logps/rejected": -347.3135681152344,
"loss": 0.1045,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.37233370542526245,
"rewards/margins": 0.2192118912935257,
"rewards/rejected": -0.5915456414222717,
"step": 110
},
{
"epoch": 0.25,
"learning_rate": 9.980706626858607e-07,
"logits/chosen": 0.22473303973674774,
"logits/rejected": 0.2900647521018982,
"logps/chosen": -369.64788818359375,
"logps/rejected": -380.33404541015625,
"loss": 0.0935,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.4193580746650696,
"rewards/margins": 0.33016690611839294,
"rewards/rejected": -0.7495249509811401,
"step": 120
},
{
"epoch": 0.27,
"learning_rate": 9.961304359538434e-07,
"logits/chosen": 0.28685927391052246,
"logits/rejected": 0.34184715151786804,
"logps/chosen": -355.6005554199219,
"logps/rejected": -348.94989013671875,
"loss": 0.0953,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.49181294441223145,
"rewards/margins": 0.21913418173789978,
"rewards/rejected": -0.7109471559524536,
"step": 130
},
{
"epoch": 0.29,
"learning_rate": 9.935251313189563e-07,
"logits/chosen": 0.24738028645515442,
"logits/rejected": 0.3477911353111267,
"logps/chosen": -349.7526550292969,
"logps/rejected": -339.65081787109375,
"loss": 0.0842,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.5085957646369934,
"rewards/margins": 0.2822812795639038,
"rewards/rejected": -0.7908770442008972,
"step": 140
},
{
"epoch": 0.31,
"learning_rate": 9.902582412711118e-07,
"logits/chosen": 0.23527678847312927,
"logits/rejected": 0.297168105840683,
"logps/chosen": -385.29278564453125,
"logps/rejected": -388.8484191894531,
"loss": 0.0829,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.5571666955947876,
"rewards/margins": 0.3536582887172699,
"rewards/rejected": -0.9108250737190247,
"step": 150
},
{
"epoch": 0.33,
"learning_rate": 9.86334145175542e-07,
"logits/chosen": 0.2127149999141693,
"logits/rejected": 0.3407444953918457,
"logps/chosen": -391.6556701660156,
"logps/rejected": -385.64373779296875,
"loss": 0.0831,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.6610974073410034,
"rewards/margins": 0.4304705560207367,
"rewards/rejected": -1.0915679931640625,
"step": 160
},
{
"epoch": 0.36,
"learning_rate": 9.817581034021272e-07,
"logits/chosen": 0.10886111110448837,
"logits/rejected": 0.23057182133197784,
"logps/chosen": -425.691650390625,
"logps/rejected": -426.1463928222656,
"loss": 0.0777,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.6238452792167664,
"rewards/margins": 0.35001182556152344,
"rewards/rejected": -0.9738571047782898,
"step": 170
},
{
"epoch": 0.38,
"learning_rate": 9.765362502737097e-07,
"logits/chosen": 0.28310832381248474,
"logits/rejected": 0.34999170899391174,
"logps/chosen": -386.44647216796875,
"logps/rejected": -417.8492736816406,
"loss": 0.0723,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.615284264087677,
"rewards/margins": 0.5002199411392212,
"rewards/rejected": -1.115504264831543,
"step": 180
},
{
"epoch": 0.4,
"learning_rate": 9.706755858428485e-07,
"logits/chosen": 0.2382032573223114,
"logits/rejected": 0.3148980438709259,
"logps/chosen": -376.91082763671875,
"logps/rejected": -408.2652282714844,
"loss": 0.0709,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.6280232667922974,
"rewards/margins": 0.39571380615234375,
"rewards/rejected": -1.0237369537353516,
"step": 190
},
{
"epoch": 0.42,
"learning_rate": 9.641839665080363e-07,
"logits/chosen": 0.2947675287723541,
"logits/rejected": 0.44535762071609497,
"logps/chosen": -357.99951171875,
"logps/rejected": -366.5459899902344,
"loss": 0.0697,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.5609619617462158,
"rewards/margins": 0.4246044158935547,
"rewards/rejected": -0.9855663180351257,
"step": 200
},
{
"epoch": 0.42,
"eval_logits/chosen": 0.3266603350639343,
"eval_logits/rejected": 0.4101351499557495,
"eval_logps/chosen": -362.02423095703125,
"eval_logps/rejected": -433.75665283203125,
"eval_loss": 0.07280407100915909,
"eval_rewards/accuracies": 0.7421875,
"eval_rewards/chosen": -0.5843959450721741,
"eval_rewards/margins": 0.6369195580482483,
"eval_rewards/rejected": -1.221315622329712,
"eval_runtime": 75.214,
"eval_samples_per_second": 26.591,
"eval_steps_per_second": 0.425,
"step": 200
},
{
"epoch": 0.44,
"learning_rate": 9.570700944819582e-07,
"logits/chosen": 0.29511094093322754,
"logits/rejected": 0.4236629605293274,
"logps/chosen": -369.3717346191406,
"logps/rejected": -388.28240966796875,
"loss": 0.075,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.6854770183563232,
"rewards/margins": 0.5033689141273499,
"rewards/rejected": -1.1888458728790283,
"step": 210
},
{
"epoch": 0.46,
"learning_rate": 9.493435061259129e-07,
"logits/chosen": 0.24741777777671814,
"logits/rejected": 0.3947208523750305,
"logps/chosen": -389.66033935546875,
"logps/rejected": -376.7333984375,
"loss": 0.0737,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.6762610077857971,
"rewards/margins": 0.43233251571655273,
"rewards/rejected": -1.1085935831069946,
"step": 220
},
{
"epoch": 0.48,
"learning_rate": 9.4101455916603e-07,
"logits/chosen": 0.20855531096458435,
"logits/rejected": 0.32445111870765686,
"logps/chosen": -379.79962158203125,
"logps/rejected": -424.62109375,
"loss": 0.0673,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.7091845273971558,
"rewards/margins": 0.5963308215141296,
"rewards/rejected": -1.3055154085159302,
"step": 230
},
{
"epoch": 0.5,
"learning_rate": 9.320944188084241e-07,
"logits/chosen": 0.1453891396522522,
"logits/rejected": 0.3285972774028778,
"logps/chosen": -462.5914001464844,
"logps/rejected": -419.4541015625,
"loss": 0.0613,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.7956157922744751,
"rewards/margins": 0.49665746092796326,
"rewards/rejected": -1.2922732830047607,
"step": 240
},
{
"epoch": 0.52,
"learning_rate": 9.225950427718974e-07,
"logits/chosen": 0.19092246890068054,
"logits/rejected": 0.27189213037490845,
"logps/chosen": -375.6587219238281,
"logps/rejected": -393.5346984863281,
"loss": 0.0586,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.6252821683883667,
"rewards/margins": 0.4895978569984436,
"rewards/rejected": -1.114880084991455,
"step": 250
},
{
"epoch": 0.54,
"learning_rate": 9.125291652582547e-07,
"logits/chosen": 0.2094411551952362,
"logits/rejected": 0.25953131914138794,
"logps/chosen": -379.9848937988281,
"logps/rejected": -423.44891357421875,
"loss": 0.0649,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.7326905727386475,
"rewards/margins": 0.668548047542572,
"rewards/rejected": -1.4012387990951538,
"step": 260
},
{
"epoch": 0.57,
"learning_rate": 9.019102798817195e-07,
"logits/chosen": 0.20080764591693878,
"logits/rejected": 0.2263043224811554,
"logps/chosen": -374.03680419921875,
"logps/rejected": -410.4359436035156,
"loss": 0.0584,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.8592589497566223,
"rewards/margins": 0.481633722782135,
"rewards/rejected": -1.3408926725387573,
"step": 270
},
{
"epoch": 0.59,
"learning_rate": 8.90752621580335e-07,
"logits/chosen": 0.16465748846530914,
"logits/rejected": 0.2313542366027832,
"logps/chosen": -432.99884033203125,
"logps/rejected": -439.2601013183594,
"loss": 0.0529,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.04286527633667,
"rewards/margins": 0.4970209002494812,
"rewards/rejected": -1.5398861169815063,
"step": 280
},
{
"epoch": 0.61,
"learning_rate": 8.79071147533597e-07,
"logits/chosen": 0.17371919751167297,
"logits/rejected": 0.25447744131088257,
"logps/chosen": -404.51141357421875,
"logps/rejected": -469.1675720214844,
"loss": 0.0592,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.7758570313453674,
"rewards/margins": 0.7041818499565125,
"rewards/rejected": -1.4800388813018799,
"step": 290
},
{
"epoch": 0.63,
"learning_rate": 8.668815171119019e-07,
"logits/chosen": 0.20719440281391144,
"logits/rejected": 0.22149357199668884,
"logps/chosen": -375.27703857421875,
"logps/rejected": -434.1705627441406,
"loss": 0.055,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.7744671106338501,
"rewards/margins": 0.6087144613265991,
"rewards/rejected": -1.3831814527511597,
"step": 300
},
{
"epoch": 0.63,
"eval_logits/chosen": 0.245079904794693,
"eval_logits/rejected": 0.2779832184314728,
"eval_logps/chosen": -383.036865234375,
"eval_logps/rejected": -465.8376159667969,
"eval_loss": 0.06104155629873276,
"eval_rewards/accuracies": 0.7265625,
"eval_rewards/chosen": -0.7945222854614258,
"eval_rewards/margins": 0.7476030588150024,
"eval_rewards/rejected": -1.5421253442764282,
"eval_runtime": 75.177,
"eval_samples_per_second": 26.604,
"eval_steps_per_second": 0.426,
"step": 300
},
{
"epoch": 0.65,
"learning_rate": 8.54200070884685e-07,
"logits/chosen": 0.2089851200580597,
"logits/rejected": 0.27757978439331055,
"logps/chosen": -364.8197021484375,
"logps/rejected": -426.005615234375,
"loss": 0.0519,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.8241626620292664,
"rewards/margins": 0.7324913740158081,
"rewards/rejected": -1.5566540956497192,
"step": 310
},
{
"epoch": 0.67,
"learning_rate": 8.410438087153911e-07,
"logits/chosen": 0.19529737532138824,
"logits/rejected": 0.22135886549949646,
"logps/chosen": -390.7099304199219,
"logps/rejected": -456.1578063964844,
"loss": 0.0485,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.9577051401138306,
"rewards/margins": 0.7674695253372192,
"rewards/rejected": -1.7251746654510498,
"step": 320
},
{
"epoch": 0.69,
"learning_rate": 8.274303669726426e-07,
"logits/chosen": 0.10426706075668335,
"logits/rejected": 0.09881766140460968,
"logps/chosen": -433.2498474121094,
"logps/rejected": -481.43218994140625,
"loss": 0.0523,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0801985263824463,
"rewards/margins": 0.6202031373977661,
"rewards/rejected": -1.7004016637802124,
"step": 330
},
{
"epoch": 0.71,
"learning_rate": 8.133779948881513e-07,
"logits/chosen": 0.18205437064170837,
"logits/rejected": 0.2083740234375,
"logps/chosen": -362.75665283203125,
"logps/rejected": -403.1224060058594,
"loss": 0.058,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.767187237739563,
"rewards/margins": 0.5591500401496887,
"rewards/rejected": -1.3263373374938965,
"step": 340
},
{
"epoch": 0.73,
"learning_rate": 7.989055300930704e-07,
"logits/chosen": 0.17904943227767944,
"logits/rejected": 0.1928117871284485,
"logps/chosen": -392.50665283203125,
"logps/rejected": -460.57049560546875,
"loss": 0.0581,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.9408512115478516,
"rewards/margins": 0.6105338335037231,
"rewards/rejected": -1.5513849258422852,
"step": 350
},
{
"epoch": 0.75,
"learning_rate": 7.840323733655778e-07,
"logits/chosen": 0.11943835020065308,
"logits/rejected": 0.14566612243652344,
"logps/chosen": -408.4542541503906,
"logps/rejected": -477.89556884765625,
"loss": 0.0586,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.9370508193969727,
"rewards/margins": 0.6635792851448059,
"rewards/rejected": -1.6006300449371338,
"step": 360
},
{
"epoch": 0.77,
"learning_rate": 7.687784626235447e-07,
"logits/chosen": 0.08894483745098114,
"logits/rejected": 0.21349970996379852,
"logps/chosen": -427.32806396484375,
"logps/rejected": -448.68115234375,
"loss": 0.0583,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.9731753468513489,
"rewards/margins": 0.594713032245636,
"rewards/rejected": -1.5678884983062744,
"step": 370
},
{
"epoch": 0.8,
"learning_rate": 7.531642461971514e-07,
"logits/chosen": 0.1658913791179657,
"logits/rejected": 0.1944103091955185,
"logps/chosen": -356.7992858886719,
"logps/rejected": -428.2513732910156,
"loss": 0.0597,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.8277195692062378,
"rewards/margins": 0.6664320826530457,
"rewards/rejected": -1.4941515922546387,
"step": 380
},
{
"epoch": 0.82,
"learning_rate": 7.372106554172801e-07,
"logits/chosen": 0.17445510625839233,
"logits/rejected": 0.24218544363975525,
"logps/chosen": -364.82818603515625,
"logps/rejected": -411.1913146972656,
"loss": 0.0667,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.7496371865272522,
"rewards/margins": 0.4808691143989563,
"rewards/rejected": -1.2305063009262085,
"step": 390
},
{
"epoch": 0.84,
"learning_rate": 7.209390765564318e-07,
"logits/chosen": 0.21217799186706543,
"logits/rejected": 0.19523081183433533,
"logps/chosen": -384.03082275390625,
"logps/rejected": -439.64697265625,
"loss": 0.0573,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.9415151476860046,
"rewards/margins": 0.5813928842544556,
"rewards/rejected": -1.5229079723358154,
"step": 400
},
{
"epoch": 0.84,
"eval_logits/chosen": 0.23484691977500916,
"eval_logits/rejected": 0.25605612993240356,
"eval_logps/chosen": -386.639404296875,
"eval_logps/rejected": -471.1476745605469,
"eval_loss": 0.05661754682660103,
"eval_rewards/accuracies": 0.73828125,
"eval_rewards/chosen": -0.8305472135543823,
"eval_rewards/margins": 0.7646786570549011,
"eval_rewards/rejected": -1.5952258110046387,
"eval_runtime": 73.9447,
"eval_samples_per_second": 27.047,
"eval_steps_per_second": 0.433,
"step": 400
},
{
"epoch": 0.86,
"learning_rate": 7.043713221597773e-07,
"logits/chosen": 0.13619688153266907,
"logits/rejected": 0.24737751483917236,
"logps/chosen": -439.60595703125,
"logps/rejected": -467.98974609375,
"loss": 0.0498,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.987277626991272,
"rewards/margins": 0.6264899373054504,
"rewards/rejected": -1.6137676239013672,
"step": 410
},
{
"epoch": 0.88,
"learning_rate": 6.875296018047809e-07,
"logits/chosen": 0.08895771205425262,
"logits/rejected": 0.10534010827541351,
"logps/chosen": -437.58575439453125,
"logps/rejected": -504.79962158203125,
"loss": 0.0508,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.197588562965393,
"rewards/margins": 0.6411014795303345,
"rewards/rejected": -1.8386898040771484,
"step": 420
},
{
"epoch": 0.9,
"learning_rate": 6.704364923285857e-07,
"logits/chosen": 0.13031154870986938,
"logits/rejected": 0.22820834815502167,
"logps/chosen": -406.86474609375,
"logps/rejected": -403.8560485839844,
"loss": 0.0488,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.0081294775009155,
"rewards/margins": 0.5856183171272278,
"rewards/rejected": -1.593747854232788,
"step": 430
},
{
"epoch": 0.92,
"learning_rate": 6.531149075630796e-07,
"logits/chosen": 0.06417986750602722,
"logits/rejected": 0.1644040048122406,
"logps/chosen": -426.49859619140625,
"logps/rejected": -444.4884338378906,
"loss": 0.0587,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0372142791748047,
"rewards/margins": 0.5977233052253723,
"rewards/rejected": -1.6349375247955322,
"step": 440
},
{
"epoch": 0.94,
"learning_rate": 6.355880676182085e-07,
"logits/chosen": 0.1278570294380188,
"logits/rejected": 0.10543633997440338,
"logps/chosen": -385.71832275390625,
"logps/rejected": -487.64495849609375,
"loss": 0.0594,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.880386471748352,
"rewards/margins": 0.8574458360671997,
"rewards/rejected": -1.7378323078155518,
"step": 450
},
{
"epoch": 0.96,
"learning_rate": 6.178794677547137e-07,
"logits/chosen": 0.14232680201530457,
"logits/rejected": 0.19507645070552826,
"logps/chosen": -429.20794677734375,
"logps/rejected": -448.43621826171875,
"loss": 0.0586,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.8559433817863464,
"rewards/margins": 0.6609224677085876,
"rewards/rejected": -1.5168659687042236,
"step": 460
},
{
"epoch": 0.98,
"learning_rate": 6.000128468880222e-07,
"logits/chosen": 0.17209979891777039,
"logits/rejected": 0.20294690132141113,
"logps/chosen": -434.7554626464844,
"logps/rejected": -495.654541015625,
"loss": 0.0564,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.0413404703140259,
"rewards/margins": 0.7894097566604614,
"rewards/rejected": -1.8307502269744873,
"step": 470
},
{
"epoch": 1.0,
"learning_rate": 5.820121557655108e-07,
"logits/chosen": 0.1130753755569458,
"logits/rejected": 0.17763587832450867,
"logps/chosen": -447.583251953125,
"logps/rejected": -536.4083862304688,
"loss": 0.042,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.053792953491211,
"rewards/margins": 0.9330266714096069,
"rewards/rejected": -1.9868196249008179,
"step": 480
},
{
"epoch": 1.03,
"learning_rate": 5.639015248598023e-07,
"logits/chosen": 0.173425555229187,
"logits/rejected": 0.1563911885023117,
"logps/chosen": -396.56976318359375,
"logps/rejected": -509.23760986328125,
"loss": 0.0272,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.0132352113723755,
"rewards/margins": 1.2193310260772705,
"rewards/rejected": -2.2325661182403564,
"step": 490
},
{
"epoch": 1.05,
"learning_rate": 5.457052320211339e-07,
"logits/chosen": 0.1425987184047699,
"logits/rejected": 0.1953365057706833,
"logps/chosen": -486.52484130859375,
"logps/rejected": -575.6693115234375,
"loss": 0.0215,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.450165033340454,
"rewards/margins": 1.2124006748199463,
"rewards/rejected": -2.6625657081604004,
"step": 500
},
{
"epoch": 1.05,
"eval_logits/chosen": 0.22210484743118286,
"eval_logits/rejected": 0.2419252097606659,
"eval_logps/chosen": -465.0879821777344,
"eval_logps/rejected": -598.30078125,
"eval_loss": 0.03267505019903183,
"eval_rewards/accuracies": 0.73046875,
"eval_rewards/chosen": -1.6150331497192383,
"eval_rewards/margins": 1.2517237663269043,
"eval_rewards/rejected": -2.8667569160461426,
"eval_runtime": 74.1502,
"eval_samples_per_second": 26.972,
"eval_steps_per_second": 0.432,
"step": 500
},
{
"epoch": 1.07,
"learning_rate": 5.274476699321637e-07,
"logits/chosen": 0.10247495025396347,
"logits/rejected": 0.18925973773002625,
"logps/chosen": -478.4522399902344,
"logps/rejected": -538.3016967773438,
"loss": 0.019,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.6425974369049072,
"rewards/margins": 0.9050165414810181,
"rewards/rejected": -2.5476138591766357,
"step": 510
},
{
"epoch": 1.09,
"learning_rate": 5.091533134088387e-07,
"logits/chosen": 0.15197055041790009,
"logits/rejected": 0.21326705813407898,
"logps/chosen": -460.1466369628906,
"logps/rejected": -564.4393310546875,
"loss": 0.0199,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.4866502285003662,
"rewards/margins": 1.329404354095459,
"rewards/rejected": -2.8160548210144043,
"step": 520
},
{
"epoch": 1.11,
"learning_rate": 4.908466865911614e-07,
"logits/chosen": 0.13690608739852905,
"logits/rejected": 0.24023446440696716,
"logps/chosen": -495.3097229003906,
"logps/rejected": -585.8782958984375,
"loss": 0.0185,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.5163190364837646,
"rewards/margins": 1.2400487661361694,
"rewards/rejected": -2.7563679218292236,
"step": 530
},
{
"epoch": 1.13,
"learning_rate": 4.7255233006783624e-07,
"logits/chosen": 0.21338143944740295,
"logits/rejected": 0.24035005271434784,
"logps/chosen": -469.50958251953125,
"logps/rejected": -574.6529541015625,
"loss": 0.0153,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.6547876596450806,
"rewards/margins": 1.1869592666625977,
"rewards/rejected": -2.8417468070983887,
"step": 540
},
{
"epoch": 1.15,
"learning_rate": 4.5429476797886617e-07,
"logits/chosen": 0.17387095093727112,
"logits/rejected": 0.20873236656188965,
"logps/chosen": -454.8916015625,
"logps/rejected": -569.7607421875,
"loss": 0.0162,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4260327816009521,
"rewards/margins": 1.1895036697387695,
"rewards/rejected": -2.6155364513397217,
"step": 550
},
{
"epoch": 1.17,
"learning_rate": 4.3609847514019763e-07,
"logits/chosen": 0.03245037421584129,
"logits/rejected": 0.15740999579429626,
"logps/chosen": -468.34393310546875,
"logps/rejected": -583.3796997070312,
"loss": 0.0162,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.6325418949127197,
"rewards/margins": 1.2155063152313232,
"rewards/rejected": -2.848047971725464,
"step": 560
},
{
"epoch": 1.19,
"learning_rate": 4.179878442344892e-07,
"logits/chosen": 0.19224026799201965,
"logits/rejected": 0.25054025650024414,
"logps/chosen": -458.1896057128906,
"logps/rejected": -582.621337890625,
"loss": 0.0151,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -1.6655352115631104,
"rewards/margins": 1.4305517673492432,
"rewards/rejected": -3.0960865020751953,
"step": 570
},
{
"epoch": 1.21,
"learning_rate": 3.9998715311197783e-07,
"logits/chosen": 0.11674971878528595,
"logits/rejected": 0.1621953547000885,
"logps/chosen": -469.97454833984375,
"logps/rejected": -602.2510375976562,
"loss": 0.0159,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.8368819952011108,
"rewards/margins": 1.2762649059295654,
"rewards/rejected": -3.113147258758545,
"step": 580
},
{
"epoch": 1.23,
"learning_rate": 3.821205322452863e-07,
"logits/chosen": 0.20661136507987976,
"logits/rejected": 0.2854346036911011,
"logps/chosen": -462.8573303222656,
"logps/rejected": -592.9844970703125,
"loss": 0.0131,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.7403570413589478,
"rewards/margins": 1.424896478652954,
"rewards/rejected": -3.1652536392211914,
"step": 590
},
{
"epoch": 1.26,
"learning_rate": 3.6441193238179146e-07,
"logits/chosen": 0.2347058355808258,
"logits/rejected": 0.30280107259750366,
"logps/chosen": -493.71484375,
"logps/rejected": -621.8485107421875,
"loss": 0.0139,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.6602065563201904,
"rewards/margins": 1.4890201091766357,
"rewards/rejected": -3.149226665496826,
"step": 600
},
{
"epoch": 1.26,
"eval_logits/chosen": 0.2600603699684143,
"eval_logits/rejected": 0.2915884852409363,
"eval_logps/chosen": -484.3870849609375,
"eval_logps/rejected": -620.5768432617188,
"eval_loss": 0.025975177064538002,
"eval_rewards/accuracies": 0.72265625,
"eval_rewards/chosen": -1.8080239295959473,
"eval_rewards/margins": 1.2814933061599731,
"eval_rewards/rejected": -3.08951735496521,
"eval_runtime": 74.1333,
"eval_samples_per_second": 26.978,
"eval_steps_per_second": 0.432,
"step": 600
},
{
"epoch": 1.28,
"learning_rate": 3.4688509243692034e-07,
"logits/chosen": 0.11437401920557022,
"logits/rejected": 0.2602505087852478,
"logps/chosen": -514.426513671875,
"logps/rejected": -637.3177490234375,
"loss": 0.0139,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.734244704246521,
"rewards/margins": 1.4157501459121704,
"rewards/rejected": -3.1499948501586914,
"step": 610
},
{
"epoch": 1.3,
"learning_rate": 3.295635076714144e-07,
"logits/chosen": 0.07244641333818436,
"logits/rejected": 0.16178789734840393,
"logps/chosen": -518.953857421875,
"logps/rejected": -616.7998657226562,
"loss": 0.0141,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.6754181385040283,
"rewards/margins": 1.4035594463348389,
"rewards/rejected": -3.078977346420288,
"step": 620
},
{
"epoch": 1.32,
"learning_rate": 3.12470398195219e-07,
"logits/chosen": 0.1367851048707962,
"logits/rejected": 0.3572950065135956,
"logps/chosen": -585.3605346679688,
"logps/rejected": -622.2881469726562,
"loss": 0.0145,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.1314330101013184,
"rewards/margins": 1.1666393280029297,
"rewards/rejected": -3.298072099685669,
"step": 630
},
{
"epoch": 1.34,
"learning_rate": 2.956286778402226e-07,
"logits/chosen": 0.23215535283088684,
"logits/rejected": 0.308633416891098,
"logps/chosen": -464.2928771972656,
"logps/rejected": -603.6142578125,
"loss": 0.0119,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.798147201538086,
"rewards/margins": 1.2844369411468506,
"rewards/rejected": -3.0825843811035156,
"step": 640
},
{
"epoch": 1.36,
"learning_rate": 2.7906092344356826e-07,
"logits/chosen": 0.19899992644786835,
"logits/rejected": 0.3110192120075226,
"logps/chosen": -524.2318115234375,
"logps/rejected": -611.1939697265625,
"loss": 0.0135,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.8059593439102173,
"rewards/margins": 1.16557776927948,
"rewards/rejected": -2.9715373516082764,
"step": 650
},
{
"epoch": 1.38,
"learning_rate": 2.6278934458271996e-07,
"logits/chosen": 0.11150866746902466,
"logits/rejected": 0.20857281982898712,
"logps/chosen": -505.08135986328125,
"logps/rejected": -640.5789184570312,
"loss": 0.0138,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.8031425476074219,
"rewards/margins": 1.359779953956604,
"rewards/rejected": -3.1629223823547363,
"step": 660
},
{
"epoch": 1.4,
"learning_rate": 2.468357538028487e-07,
"logits/chosen": 0.09730945527553558,
"logits/rejected": 0.19464361667633057,
"logps/chosen": -528.5618896484375,
"logps/rejected": -603.7526245117188,
"loss": 0.0133,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.9411073923110962,
"rewards/margins": 1.2485072612762451,
"rewards/rejected": -3.189614772796631,
"step": 670
},
{
"epoch": 1.42,
"learning_rate": 2.312215373764551e-07,
"logits/chosen": 0.14483553171157837,
"logits/rejected": 0.1836375743150711,
"logps/chosen": -516.8216552734375,
"logps/rejected": -620.4552001953125,
"loss": 0.0138,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.9225925207138062,
"rewards/margins": 1.22107994556427,
"rewards/rejected": -3.143672466278076,
"step": 680
},
{
"epoch": 1.44,
"learning_rate": 2.1596762663442213e-07,
"logits/chosen": 0.16792774200439453,
"logits/rejected": 0.24698173999786377,
"logps/chosen": -488.84234619140625,
"logps/rejected": -614.4656372070312,
"loss": 0.0129,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.8360084295272827,
"rewards/margins": 1.531054973602295,
"rewards/rejected": -3.367063045501709,
"step": 690
},
{
"epoch": 1.47,
"learning_rate": 2.0109446990692963e-07,
"logits/chosen": 0.1680019199848175,
"logits/rejected": 0.29429227113723755,
"logps/chosen": -520.464599609375,
"logps/rejected": -605.1278076171875,
"loss": 0.0125,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.9141952991485596,
"rewards/margins": 1.3153671026229858,
"rewards/rejected": -3.229562282562256,
"step": 700
},
{
"epoch": 1.47,
"eval_logits/chosen": 0.26143062114715576,
"eval_logits/rejected": 0.29468628764152527,
"eval_logps/chosen": -494.7950134277344,
"eval_logps/rejected": -630.4850463867188,
"eval_loss": 0.02471703477203846,
"eval_rewards/accuracies": 0.73046875,
"eval_rewards/chosen": -1.9121036529541016,
"eval_rewards/margins": 1.2764959335327148,
"eval_rewards/rejected": -3.188599109649658,
"eval_runtime": 74.3821,
"eval_samples_per_second": 26.888,
"eval_steps_per_second": 0.43,
"step": 700
},
{
"epoch": 1.49,
"learning_rate": 1.8662200511184872e-07,
"logits/chosen": 0.12966138124465942,
"logits/rejected": 0.19625753164291382,
"logps/chosen": -487.907470703125,
"logps/rejected": -615.3556518554688,
"loss": 0.0139,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.759558081626892,
"rewards/margins": 1.376070261001587,
"rewards/rejected": -3.1356282234191895,
"step": 710
},
{
"epoch": 1.51,
"learning_rate": 1.725696330273575e-07,
"logits/chosen": 0.1202569380402565,
"logits/rejected": 0.177236407995224,
"logps/chosen": -529.9605712890625,
"logps/rejected": -626.4281005859375,
"loss": 0.0139,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.9097445011138916,
"rewards/margins": 1.0962101221084595,
"rewards/rejected": -3.0059542655944824,
"step": 720
},
{
"epoch": 1.53,
"learning_rate": 1.589561912846089e-07,
"logits/chosen": 0.19967588782310486,
"logits/rejected": 0.3316526710987091,
"logps/chosen": -481.29779052734375,
"logps/rejected": -603.3109130859375,
"loss": 0.0123,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.8021833896636963,
"rewards/margins": 1.3467209339141846,
"rewards/rejected": -3.148904323577881,
"step": 730
},
{
"epoch": 1.55,
"learning_rate": 1.4579992911531496e-07,
"logits/chosen": 0.20930282771587372,
"logits/rejected": 0.27201521396636963,
"logps/chosen": -432.78070068359375,
"logps/rejected": -560.7802124023438,
"loss": 0.0121,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.5994914770126343,
"rewards/margins": 1.341997742652893,
"rewards/rejected": -2.9414889812469482,
"step": 740
},
{
"epoch": 1.57,
"learning_rate": 1.3311848288809813e-07,
"logits/chosen": 0.10060323774814606,
"logits/rejected": 0.20884795486927032,
"logps/chosen": -488.33123779296875,
"logps/rejected": -580.7650756835938,
"loss": 0.0137,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.6071643829345703,
"rewards/margins": 1.2730271816253662,
"rewards/rejected": -2.8801915645599365,
"step": 750
},
{
"epoch": 1.59,
"learning_rate": 1.209288524664029e-07,
"logits/chosen": 0.13590273261070251,
"logits/rejected": 0.2782810628414154,
"logps/chosen": -511.728271484375,
"logps/rejected": -611.2279052734375,
"loss": 0.0122,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.938084363937378,
"rewards/margins": 1.1513842344284058,
"rewards/rejected": -3.0894687175750732,
"step": 760
},
{
"epoch": 1.61,
"learning_rate": 1.0924737841966497e-07,
"logits/chosen": 0.2217625081539154,
"logits/rejected": 0.317230761051178,
"logps/chosen": -485.08624267578125,
"logps/rejected": -596.3556518554688,
"loss": 0.0113,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.7839618921279907,
"rewards/margins": 1.4122188091278076,
"rewards/rejected": -3.196180820465088,
"step": 770
},
{
"epoch": 1.63,
"learning_rate": 9.808972011828054e-08,
"logits/chosen": 0.15653935074806213,
"logits/rejected": 0.288215696811676,
"logps/chosen": -521.5089111328125,
"logps/rejected": -634.9436645507812,
"loss": 0.0125,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.080339193344116,
"rewards/margins": 1.2963542938232422,
"rewards/rejected": -3.3766937255859375,
"step": 780
},
{
"epoch": 1.65,
"learning_rate": 8.747083474174527e-08,
"logits/chosen": 0.18242642283439636,
"logits/rejected": 0.33485209941864014,
"logps/chosen": -511.490966796875,
"logps/rejected": -598.623046875,
"loss": 0.0118,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.951550841331482,
"rewards/margins": 1.2288819551467896,
"rewards/rejected": -3.1804327964782715,
"step": 790
},
{
"epoch": 1.67,
"learning_rate": 7.740495722810269e-08,
"logits/chosen": 0.21551553905010223,
"logits/rejected": 0.30063092708587646,
"logps/chosen": -491.99810791015625,
"logps/rejected": -625.0213012695312,
"loss": 0.0107,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.9443299770355225,
"rewards/margins": 1.5316195487976074,
"rewards/rejected": -3.47594952583313,
"step": 800
},
{
"epoch": 1.67,
"eval_logits/chosen": 0.28409868478775024,
"eval_logits/rejected": 0.3196317255496979,
"eval_logps/chosen": -503.05755615234375,
"eval_logps/rejected": -641.1343994140625,
"eval_loss": 0.022644678130745888,
"eval_rewards/accuracies": 0.71875,
"eval_rewards/chosen": -1.9947288036346436,
"eval_rewards/margins": 1.3003644943237305,
"eval_rewards/rejected": -3.295093536376953,
"eval_runtime": 75.2508,
"eval_samples_per_second": 26.578,
"eval_steps_per_second": 0.425,
"step": 800
},
{
"epoch": 1.7,
"learning_rate": 6.790558119157597e-08,
"logits/chosen": 0.12448444217443466,
"logits/rejected": 0.15070387721061707,
"logps/chosen": -556.6755981445312,
"logps/rejected": -664.4570922851562,
"loss": 0.012,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.042440414428711,
"rewards/margins": 1.2739028930664062,
"rewards/rejected": -3.316342830657959,
"step": 810
},
{
"epoch": 1.72,
"learning_rate": 5.898544083397e-08,
"logits/chosen": 0.16522815823554993,
"logits/rejected": 0.24165570735931396,
"logps/chosen": -488.497314453125,
"logps/rejected": -626.2318115234375,
"loss": 0.0114,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.8827340602874756,
"rewards/margins": 1.2979196310043335,
"rewards/rejected": -3.1806535720825195,
"step": 820
},
{
"epoch": 1.74,
"learning_rate": 5.065649387408705e-08,
"logits/chosen": 0.1387493908405304,
"logits/rejected": 0.18901556730270386,
"logps/chosen": -501.7972717285156,
"logps/rejected": -649.2764892578125,
"loss": 0.0109,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.9313652515411377,
"rewards/margins": 1.6679942607879639,
"rewards/rejected": -3.5993595123291016,
"step": 830
},
{
"epoch": 1.76,
"learning_rate": 4.292990551804171e-08,
"logits/chosen": 0.12224831432104111,
"logits/rejected": 0.2817748785018921,
"logps/chosen": -532.9338989257812,
"logps/rejected": -598.0217895507812,
"loss": 0.0119,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.9113423824310303,
"rewards/margins": 1.2608517408370972,
"rewards/rejected": -3.172194242477417,
"step": 840
},
{
"epoch": 1.78,
"learning_rate": 3.581603349196371e-08,
"logits/chosen": 0.13969172537326813,
"logits/rejected": 0.24790000915527344,
"logps/chosen": -473.35235595703125,
"logps/rejected": -589.666259765625,
"loss": 0.0108,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.7080204486846924,
"rewards/margins": 1.3214609622955322,
"rewards/rejected": -3.0294814109802246,
"step": 850
},
{
"epoch": 1.8,
"learning_rate": 2.9324414157151367e-08,
"logits/chosen": 0.2230512797832489,
"logits/rejected": 0.22629483044147491,
"logps/chosen": -498.7433166503906,
"logps/rejected": -645.7354736328125,
"loss": 0.0118,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.8642832040786743,
"rewards/margins": 1.4582536220550537,
"rewards/rejected": -3.3225369453430176,
"step": 860
},
{
"epoch": 1.82,
"learning_rate": 2.3463749726290284e-08,
"logits/chosen": 0.13460347056388855,
"logits/rejected": 0.25947511196136475,
"logps/chosen": -519.2232666015625,
"logps/rejected": -617.5545654296875,
"loss": 0.0107,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.8353309631347656,
"rewards/margins": 1.3836863040924072,
"rewards/rejected": -3.219017505645752,
"step": 870
},
{
"epoch": 1.84,
"learning_rate": 1.824189659787284e-08,
"logits/chosen": 0.2357769012451172,
"logits/rejected": 0.23074205219745636,
"logps/chosen": -469.6444396972656,
"logps/rejected": -621.1527709960938,
"loss": 0.0103,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.836211919784546,
"rewards/margins": 1.5233814716339111,
"rewards/rejected": -3.359593152999878,
"step": 880
},
{
"epoch": 1.86,
"learning_rate": 1.3665854824458035e-08,
"logits/chosen": 0.210123211145401,
"logits/rejected": 0.2904731333255768,
"logps/chosen": -509.2762756347656,
"logps/rejected": -632.9801025390625,
"loss": 0.0112,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.9698455333709717,
"rewards/margins": 1.3619072437286377,
"rewards/rejected": -3.3317527770996094,
"step": 890
},
{
"epoch": 1.88,
"learning_rate": 9.741758728888217e-09,
"logits/chosen": 0.12742657959461212,
"logits/rejected": 0.16876272857189178,
"logps/chosen": -529.4017333984375,
"logps/rejected": -672.1700439453125,
"loss": 0.0106,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.8726167678833008,
"rewards/margins": 1.5384435653686523,
"rewards/rejected": -3.411060333251953,
"step": 900
},
{
"epoch": 1.88,
"eval_logits/chosen": 0.28414925932884216,
"eval_logits/rejected": 0.32150793075561523,
"eval_logps/chosen": -503.032470703125,
"eval_logps/rejected": -640.8138427734375,
"eval_loss": 0.022440288215875626,
"eval_rewards/accuracies": 0.71484375,
"eval_rewards/chosen": -1.9944782257080078,
"eval_rewards/margins": 1.2974092960357666,
"eval_rewards/rejected": -3.2918872833251953,
"eval_runtime": 75.3961,
"eval_samples_per_second": 26.527,
"eval_steps_per_second": 0.424,
"step": 900
},
{
"epoch": 1.9,
"learning_rate": 6.474868681043577e-09,
"logits/chosen": 0.28579333424568176,
"logits/rejected": 0.3222460150718689,
"logps/chosen": -456.7623596191406,
"logps/rejected": -624.6663208007812,
"loss": 0.0108,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.772883415222168,
"rewards/margins": 1.6853721141815186,
"rewards/rejected": -3.4582557678222656,
"step": 910
},
{
"epoch": 1.93,
"learning_rate": 3.869564046156459e-09,
"logits/chosen": 0.12005837261676788,
"logits/rejected": 0.25674593448638916,
"logps/chosen": -503.75482177734375,
"logps/rejected": -651.0211181640625,
"loss": 0.0101,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.934744119644165,
"rewards/margins": 1.5219500064849854,
"rewards/rejected": -3.4566943645477295,
"step": 920
},
{
"epoch": 1.95,
"learning_rate": 1.929337314139412e-09,
"logits/chosen": 0.17854854464530945,
"logits/rejected": 0.25535306334495544,
"logps/chosen": -509.5010681152344,
"logps/rejected": -665.1165771484375,
"loss": 0.0115,
"rewards/accuracies": 0.84375,
"rewards/chosen": -1.8284565210342407,
"rewards/margins": 1.7975488901138306,
"rewards/rejected": -3.626005172729492,
"step": 930
},
{
"epoch": 1.97,
"learning_rate": 6.567894177967325e-10,
"logits/chosen": 0.17056016623973846,
"logits/rejected": 0.19406890869140625,
"logps/chosen": -532.8721923828125,
"logps/rejected": -619.7807006835938,
"loss": 0.0099,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.9198890924453735,
"rewards/margins": 1.3293774127960205,
"rewards/rejected": -3.2492668628692627,
"step": 940
},
{
"epoch": 1.99,
"learning_rate": 5.3626246194704575e-11,
"logits/chosen": 0.18597963452339172,
"logits/rejected": 0.2593163549900055,
"logps/chosen": -524.7967529296875,
"logps/rejected": -610.752685546875,
"loss": 0.0124,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.9720103740692139,
"rewards/margins": 1.214902639389038,
"rewards/rejected": -3.186913013458252,
"step": 950
},
{
"epoch": 2.0,
"step": 954,
"total_flos": 0.0,
"train_loss": 0.049936374161290924,
"train_runtime": 8881.7089,
"train_samples_per_second": 13.766,
"train_steps_per_second": 0.107
}
],
"logging_steps": 10,
"max_steps": 954,
"num_train_epochs": 2,
"save_steps": 1000,
"total_flos": 0.0,
"trial_name": null,
"trial_params": null
}