{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9984168865435357,
"eval_steps": 400,
"global_step": 473,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0021108179419525065,
"grad_norm": 3.7888171889145084,
"learning_rate": 1.0416666666666666e-08,
"logits/chosen": -1.7614977359771729,
"logits/rejected": -2.1336593627929688,
"logps/chosen": -258.78717041015625,
"logps/rejected": -241.137451171875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.010554089709762533,
"grad_norm": 5.486005258108119,
"learning_rate": 5.208333333333333e-08,
"logits/chosen": -1.652553915977478,
"logits/rejected": -1.944653868675232,
"logps/chosen": -254.9417724609375,
"logps/rejected": -233.73040771484375,
"loss": 0.6933,
"rewards/accuracies": 0.3359375,
"rewards/chosen": 0.0009400760754942894,
"rewards/margins": -0.00012203870574012399,
"rewards/rejected": 0.0010621148394420743,
"step": 5
},
{
"epoch": 0.021108179419525065,
"grad_norm": 4.961389255891659,
"learning_rate": 1.0416666666666667e-07,
"logits/chosen": -1.7172822952270508,
"logits/rejected": -1.9224716424942017,
"logps/chosen": -285.58203125,
"logps/rejected": -271.65899658203125,
"loss": 0.6933,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.0001836848387029022,
"rewards/margins": 5.627591235679574e-05,
"rewards/rejected": 0.00012740897363983095,
"step": 10
},
{
"epoch": 0.0316622691292876,
"grad_norm": 4.465341637207026,
"learning_rate": 1.5624999999999999e-07,
"logits/chosen": -1.7000751495361328,
"logits/rejected": -2.006362199783325,
"logps/chosen": -294.66119384765625,
"logps/rejected": -266.40240478515625,
"loss": 0.693,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.003800996346399188,
"rewards/margins": 0.0003609915147535503,
"rewards/rejected": 0.003440004540607333,
"step": 15
},
{
"epoch": 0.04221635883905013,
"grad_norm": 4.29839711906534,
"learning_rate": 2.0833333333333333e-07,
"logits/chosen": -1.5813852548599243,
"logits/rejected": -1.917645812034607,
"logps/chosen": -269.6716003417969,
"logps/rejected": -243.76126098632812,
"loss": 0.6929,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.004329930525273085,
"rewards/margins": 0.0014942068373784423,
"rewards/rejected": 0.002835723338648677,
"step": 20
},
{
"epoch": 0.052770448548812667,
"grad_norm": 4.1974406559327,
"learning_rate": 2.604166666666667e-07,
"logits/chosen": -1.4200265407562256,
"logits/rejected": -1.6618592739105225,
"logps/chosen": -277.4543762207031,
"logps/rejected": -256.47283935546875,
"loss": 0.6921,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.011705084703862667,
"rewards/margins": 0.001989929471164942,
"rewards/rejected": 0.009715155698359013,
"step": 25
},
{
"epoch": 0.0633245382585752,
"grad_norm": 4.063491497272294,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -1.5429295301437378,
"logits/rejected": -1.7798576354980469,
"logps/chosen": -282.87689208984375,
"logps/rejected": -262.7992858886719,
"loss": 0.6906,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.019914668053388596,
"rewards/margins": 0.004695773124694824,
"rewards/rejected": 0.015218895860016346,
"step": 30
},
{
"epoch": 0.07387862796833773,
"grad_norm": 4.21419727711893,
"learning_rate": 3.645833333333333e-07,
"logits/chosen": -1.5838125944137573,
"logits/rejected": -1.8180118799209595,
"logps/chosen": -261.9321594238281,
"logps/rejected": -255.01626586914062,
"loss": 0.689,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.029436618089675903,
"rewards/margins": 0.007458895444869995,
"rewards/rejected": 0.021977724507451057,
"step": 35
},
{
"epoch": 0.08443271767810026,
"grad_norm": 3.992903303419019,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": -1.5156856775283813,
"logits/rejected": -1.7749900817871094,
"logps/chosen": -263.44287109375,
"logps/rejected": -244.74044799804688,
"loss": 0.6854,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.03905363008379936,
"rewards/margins": 0.01673820987343788,
"rewards/rejected": 0.02231542207300663,
"step": 40
},
{
"epoch": 0.09498680738786279,
"grad_norm": 4.26522333339902,
"learning_rate": 4.6874999999999996e-07,
"logits/chosen": -1.6547425985336304,
"logits/rejected": -1.8507578372955322,
"logps/chosen": -268.19354248046875,
"logps/rejected": -257.1205139160156,
"loss": 0.6822,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.04456416517496109,
"rewards/margins": 0.022208593785762787,
"rewards/rejected": 0.0223555751144886,
"step": 45
},
{
"epoch": 0.10554089709762533,
"grad_norm": 4.338593813384902,
"learning_rate": 4.999726797933858e-07,
"logits/chosen": -1.6332728862762451,
"logits/rejected": -1.850756049156189,
"logps/chosen": -263.47998046875,
"logps/rejected": -249.18734741210938,
"loss": 0.6782,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.037815388292074203,
"rewards/margins": 0.02914128080010414,
"rewards/rejected": 0.008674108423292637,
"step": 50
},
{
"epoch": 0.11609498680738786,
"grad_norm": 4.4734059999380635,
"learning_rate": 4.99665396039775e-07,
"logits/chosen": -1.6244800090789795,
"logits/rejected": -1.8389520645141602,
"logps/chosen": -280.68115234375,
"logps/rejected": -266.77935791015625,
"loss": 0.6705,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.01766197383403778,
"rewards/margins": 0.03439956158399582,
"rewards/rejected": -0.016737591475248337,
"step": 55
},
{
"epoch": 0.1266490765171504,
"grad_norm": 5.180495838186906,
"learning_rate": 4.99017099386437e-07,
"logits/chosen": -1.7377235889434814,
"logits/rejected": -1.9698741436004639,
"logps/chosen": -276.61029052734375,
"logps/rejected": -265.09356689453125,
"loss": 0.6677,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.04501429200172424,
"rewards/margins": 0.047541338950395584,
"rewards/rejected": -0.09255563467741013,
"step": 60
},
{
"epoch": 0.13720316622691292,
"grad_norm": 5.0555182208034335,
"learning_rate": 4.980286753286194e-07,
"logits/chosen": -1.7377593517303467,
"logits/rejected": -1.9555679559707642,
"logps/chosen": -297.94842529296875,
"logps/rejected": -286.69110107421875,
"loss": 0.6676,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.013160338625311852,
"rewards/margins": 0.06764128059148788,
"rewards/rejected": -0.08080162107944489,
"step": 65
},
{
"epoch": 0.14775725593667546,
"grad_norm": 5.593861280995437,
"learning_rate": 4.967014739346915e-07,
"logits/chosen": -1.902021050453186,
"logits/rejected": -2.1676580905914307,
"logps/chosen": -274.930908203125,
"logps/rejected": -265.46917724609375,
"loss": 0.6612,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.07597370445728302,
"rewards/margins": 0.06604303419589996,
"rewards/rejected": -0.14201673865318298,
"step": 70
},
{
"epoch": 0.158311345646438,
"grad_norm": 5.729170497012147,
"learning_rate": 4.950373080021136e-07,
"logits/chosen": -1.8614518642425537,
"logits/rejected": -2.113079786300659,
"logps/chosen": -286.76824951171875,
"logps/rejected": -274.01043701171875,
"loss": 0.6598,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.07419217377901077,
"rewards/margins": 0.07467035204172134,
"rewards/rejected": -0.14886252582073212,
"step": 75
},
{
"epoch": 0.16886543535620052,
"grad_norm": 5.494684099688743,
"learning_rate": 4.930384505813737e-07,
"logits/chosen": -1.923152208328247,
"logits/rejected": -2.1438252925872803,
"logps/chosen": -284.2359619140625,
"logps/rejected": -276.147705078125,
"loss": 0.6638,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.11922915279865265,
"rewards/margins": 0.0638352707028389,
"rewards/rejected": -0.18306441605091095,
"step": 80
},
{
"epoch": 0.17941952506596306,
"grad_norm": 5.815028665022688,
"learning_rate": 4.907076318712738e-07,
"logits/chosen": -1.9811105728149414,
"logits/rejected": -2.159453868865967,
"logps/chosen": -286.17047119140625,
"logps/rejected": -275.25762939453125,
"loss": 0.6572,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.10111021995544434,
"rewards/margins": 0.09180058538913727,
"rewards/rejected": -0.1929108202457428,
"step": 85
},
{
"epoch": 0.18997361477572558,
"grad_norm": 6.05860390265305,
"learning_rate": 4.88048035489807e-07,
"logits/chosen": -1.8606590032577515,
"logits/rejected": -2.207517147064209,
"logps/chosen": -288.4847106933594,
"logps/rejected": -272.2112731933594,
"loss": 0.6493,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.16333934664726257,
"rewards/margins": 0.09367315471172333,
"rewards/rejected": -0.2570124864578247,
"step": 90
},
{
"epoch": 0.20052770448548812,
"grad_norm": 5.9220580962205105,
"learning_rate": 4.85063294125718e-07,
"logits/chosen": -1.9957729578018188,
"logits/rejected": -2.102470636367798,
"logps/chosen": -306.8893127441406,
"logps/rejected": -315.8654479980469,
"loss": 0.6528,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.189494326710701,
"rewards/margins": 0.06697932630777359,
"rewards/rejected": -0.2564736604690552,
"step": 95
},
{
"epoch": 0.21108179419525067,
"grad_norm": 7.106879633726346,
"learning_rate": 4.817574845766874e-07,
"logits/chosen": -1.914390206336975,
"logits/rejected": -2.158510446548462,
"logps/chosen": -312.228271484375,
"logps/rejected": -307.2701416015625,
"loss": 0.6473,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.23975618183612823,
"rewards/margins": 0.13584721088409424,
"rewards/rejected": -0.37560343742370605,
"step": 100
},
{
"epoch": 0.22163588390501318,
"grad_norm": 6.261854070868125,
"learning_rate": 4.781351221809166e-07,
"logits/chosen": -2.121222496032715,
"logits/rejected": -2.3385891914367676,
"logps/chosen": -288.9300231933594,
"logps/rejected": -287.0550537109375,
"loss": 0.6455,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.34203463792800903,
"rewards/margins": 0.1223745122551918,
"rewards/rejected": -0.4644091725349426,
"step": 105
},
{
"epoch": 0.23218997361477572,
"grad_norm": 6.2411593338388816,
"learning_rate": 4.742011546497182e-07,
"logits/chosen": -1.9769681692123413,
"logits/rejected": -2.1361823081970215,
"logps/chosen": -309.54766845703125,
"logps/rejected": -307.20306396484375,
"loss": 0.6489,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.28532418608665466,
"rewards/margins": 0.1274307519197464,
"rewards/rejected": -0.41275492310523987,
"step": 110
},
{
"epoch": 0.24274406332453827,
"grad_norm": 6.782103703812521,
"learning_rate": 4.6996095530953875e-07,
"logits/chosen": -1.9189682006835938,
"logits/rejected": -2.1745872497558594,
"logps/chosen": -314.22308349609375,
"logps/rejected": -309.86859130859375,
"loss": 0.6341,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.3373129367828369,
"rewards/margins": 0.11438401788473129,
"rewards/rejected": -0.4516969621181488,
"step": 115
},
{
"epoch": 0.2532981530343008,
"grad_norm": 7.845105191338386,
"learning_rate": 4.654203157626399e-07,
"logits/chosen": -2.0927116870880127,
"logits/rejected": -2.4226441383361816,
"logps/chosen": -330.85467529296875,
"logps/rejected": -319.5343933105469,
"loss": 0.6363,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.41750985383987427,
"rewards/margins": 0.1433776617050171,
"rewards/rejected": -0.5608875155448914,
"step": 120
},
{
"epoch": 0.2638522427440633,
"grad_norm": 7.838145177076444,
"learning_rate": 4.605854379764673e-07,
"logits/chosen": -2.088397264480591,
"logits/rejected": -2.309814453125,
"logps/chosen": -321.032958984375,
"logps/rejected": -316.51812744140625,
"loss": 0.6335,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.44016337394714355,
"rewards/margins": 0.12760691344738007,
"rewards/rejected": -0.5677703619003296,
"step": 125
},
{
"epoch": 0.27440633245382584,
"grad_norm": 7.175313209394211,
"learning_rate": 4.5546292581250857e-07,
"logits/chosen": -2.1430201530456543,
"logits/rejected": -2.3672008514404297,
"logps/chosen": -320.6697692871094,
"logps/rejected": -315.40594482421875,
"loss": 0.6314,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.46516576409339905,
"rewards/margins": 0.20094823837280273,
"rewards/rejected": -0.6661140322685242,
"step": 130
},
{
"epoch": 0.2849604221635884,
"grad_norm": 8.139621502884008,
"learning_rate": 4.5005977600621275e-07,
"logits/chosen": -2.157411813735962,
"logits/rejected": -2.422761917114258,
"logps/chosen": -334.2851867675781,
"logps/rejected": -331.96240234375,
"loss": 0.6361,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.5050655007362366,
"rewards/margins": 0.16327856481075287,
"rewards/rejected": -0.6683440208435059,
"step": 135
},
{
"epoch": 0.2955145118733509,
"grad_norm": 8.66968270113885,
"learning_rate": 4.443833686102919e-07,
"logits/chosen": -2.18753981590271,
"logits/rejected": -2.4200239181518555,
"logps/chosen": -351.04388427734375,
"logps/rejected": -355.5639953613281,
"loss": 0.6345,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.6600111126899719,
"rewards/margins": 0.2067330777645111,
"rewards/rejected": -0.8667442202568054,
"step": 140
},
{
"epoch": 0.30606860158311344,
"grad_norm": 8.486691938463958,
"learning_rate": 4.384414569144561e-07,
"logits/chosen": -2.2690327167510986,
"logits/rejected": -2.467618227005005,
"logps/chosen": -345.2842102050781,
"logps/rejected": -351.8019104003906,
"loss": 0.6236,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.6318648457527161,
"rewards/margins": 0.21810145676136017,
"rewards/rejected": -0.8499662280082703,
"step": 145
},
{
"epoch": 0.316622691292876,
"grad_norm": 10.799601481281647,
"learning_rate": 4.3224215685535287e-07,
"logits/chosen": -2.1107537746429443,
"logits/rejected": -2.275696039199829,
"logps/chosen": -330.2477111816406,
"logps/rejected": -332.95306396484375,
"loss": 0.6218,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.6694210767745972,
"rewards/margins": 0.16177809238433838,
"rewards/rejected": -0.8311992883682251,
"step": 150
},
{
"epoch": 0.32717678100263853,
"grad_norm": 9.91414676698451,
"learning_rate": 4.2579393593117364e-07,
"logits/chosen": -2.109783887863159,
"logits/rejected": -2.3675389289855957,
"logps/chosen": -360.96612548828125,
"logps/rejected": -354.8112487792969,
"loss": 0.6228,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.7774096727371216,
"rewards/margins": 0.20474901795387268,
"rewards/rejected": -0.9821586608886719,
"step": 155
},
{
"epoch": 0.33773087071240104,
"grad_norm": 9.84160004233017,
"learning_rate": 4.191056016360699e-07,
"logits/chosen": -2.1540074348449707,
"logits/rejected": -2.363142728805542,
"logps/chosen": -353.576416015625,
"logps/rejected": -356.2342834472656,
"loss": 0.6191,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8168758153915405,
"rewards/margins": 0.1971598118543625,
"rewards/rejected": -1.014035701751709,
"step": 160
},
{
"epoch": 0.3482849604221636,
"grad_norm": 10.137761591125681,
"learning_rate": 4.121862894301754e-07,
"logits/chosen": -2.1386771202087402,
"logits/rejected": -2.463273286819458,
"logps/chosen": -378.07904052734375,
"logps/rejected": -362.71893310546875,
"loss": 0.6188,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.91053307056427,
"rewards/margins": 0.17932763695716858,
"rewards/rejected": -1.0898606777191162,
"step": 165
},
{
"epoch": 0.35883905013192613,
"grad_norm": 12.186665200345084,
"learning_rate": 4.050454502616667e-07,
"logits/chosen": -2.1371123790740967,
"logits/rejected": -2.453059673309326,
"logps/chosen": -393.2257080078125,
"logps/rejected": -389.81195068359375,
"loss": 0.6228,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.0101797580718994,
"rewards/margins": 0.24000540375709534,
"rewards/rejected": -1.2501851320266724,
"step": 170
},
{
"epoch": 0.36939313984168864,
"grad_norm": 7.8151280249971276,
"learning_rate": 3.976928376579047e-07,
"logits/chosen": -2.1572844982147217,
"logits/rejected": -2.5259194374084473,
"logps/chosen": -371.3215026855469,
"logps/rejected": -361.8147277832031,
"loss": 0.6206,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.8133503794670105,
"rewards/margins": 0.20597751438617706,
"rewards/rejected": -1.0193278789520264,
"step": 175
},
{
"epoch": 0.37994722955145116,
"grad_norm": 10.09035062532825,
"learning_rate": 3.9013849440328945e-07,
"logits/chosen": -2.11098051071167,
"logits/rejected": -2.379697799682617,
"logps/chosen": -331.082763671875,
"logps/rejected": -332.23443603515625,
"loss": 0.6247,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.7421666979789734,
"rewards/margins": 0.18613779544830322,
"rewards/rejected": -0.9283044934272766,
"step": 180
},
{
"epoch": 0.39050131926121373,
"grad_norm": 11.160942548142629,
"learning_rate": 3.8239273882202473e-07,
"logits/chosen": -2.120657444000244,
"logits/rejected": -2.317275285720825,
"logps/chosen": -406.42041015625,
"logps/rejected": -405.0521545410156,
"loss": 0.609,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.1358160972595215,
"rewards/margins": 0.21516843140125275,
"rewards/rejected": -1.3509845733642578,
"step": 185
},
{
"epoch": 0.40105540897097625,
"grad_norm": 10.026177303858306,
"learning_rate": 3.7446615068452804e-07,
"logits/chosen": -2.2416388988494873,
"logits/rejected": -2.50757098197937,
"logps/chosen": -402.29205322265625,
"logps/rejected": -400.1927795410156,
"loss": 0.5983,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.1884238719940186,
"rewards/margins": 0.24852600693702698,
"rewards/rejected": -1.4369499683380127,
"step": 190
},
{
"epoch": 0.41160949868073876,
"grad_norm": 9.915852139948614,
"learning_rate": 3.6636955675673743e-07,
"logits/chosen": -2.292942762374878,
"logits/rejected": -2.5384058952331543,
"logps/chosen": -383.97418212890625,
"logps/rejected": -393.2140197753906,
"loss": 0.6009,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.096928358078003,
"rewards/margins": 0.28661248087882996,
"rewards/rejected": -1.3835408687591553,
"step": 195
},
{
"epoch": 0.42216358839050133,
"grad_norm": 14.20330931182508,
"learning_rate": 3.5811401601205093e-07,
"logits/chosen": -2.2763895988464355,
"logits/rejected": -2.5465030670166016,
"logps/chosen": -393.99267578125,
"logps/rejected": -405.69268798828125,
"loss": 0.6428,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.2335624694824219,
"rewards/margins": 0.20860306918621063,
"rewards/rejected": -1.442165732383728,
"step": 200
},
{
"epoch": 0.43271767810026385,
"grad_norm": 9.857364268590045,
"learning_rate": 3.497108045260995e-07,
"logits/chosen": -2.2732205390930176,
"logits/rejected": -2.512218713760376,
"logps/chosen": -384.2240905761719,
"logps/rejected": -387.35052490234375,
"loss": 0.6098,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.2368552684783936,
"rewards/margins": 0.2555080056190491,
"rewards/rejected": -1.492363452911377,
"step": 205
},
{
"epoch": 0.44327176781002636,
"grad_norm": 10.31880014400564,
"learning_rate": 3.411714000749838e-07,
"logits/chosen": -2.3171160221099854,
"logits/rejected": -2.6205945014953613,
"logps/chosen": -408.45916748046875,
"logps/rejected": -408.751220703125,
"loss": 0.6023,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.2992274761199951,
"rewards/margins": 0.2618922293186188,
"rewards/rejected": -1.5611199140548706,
"step": 210
},
{
"epoch": 0.45382585751978893,
"grad_norm": 12.303776927971242,
"learning_rate": 3.3250746645801287e-07,
"logits/chosen": -2.199439525604248,
"logits/rejected": -2.4404823780059814,
"logps/chosen": -443.233642578125,
"logps/rejected": -461.68597412109375,
"loss": 0.594,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.6428359746932983,
"rewards/margins": 0.345574289560318,
"rewards/rejected": -1.9884103536605835,
"step": 215
},
{
"epoch": 0.46437994722955145,
"grad_norm": 12.23954935903092,
"learning_rate": 3.237308375663571e-07,
"logits/chosen": -2.234389305114746,
"logits/rejected": -2.4988560676574707,
"logps/chosen": -442.395751953125,
"logps/rejected": -463.5758361816406,
"loss": 0.5764,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.693788766860962,
"rewards/margins": 0.34694477915763855,
"rewards/rejected": -2.040733575820923,
"step": 220
},
{
"epoch": 0.47493403693931396,
"grad_norm": 14.593463763552798,
"learning_rate": 3.148535012193767e-07,
"logits/chosen": -2.2539751529693604,
"logits/rejected": -2.492187023162842,
"logps/chosen": -510.832275390625,
"logps/rejected": -525.7044677734375,
"loss": 0.5971,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.1941466331481934,
"rewards/margins": 0.3329901695251465,
"rewards/rejected": -2.5271365642547607,
"step": 225
},
{
"epoch": 0.48548812664907653,
"grad_norm": 10.847822783700963,
"learning_rate": 3.0588758279070183e-07,
"logits/chosen": -2.233119249343872,
"logits/rejected": -2.4563241004943848,
"logps/chosen": -432.57635498046875,
"logps/rejected": -434.28399658203125,
"loss": 0.6159,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.6671392917633057,
"rewards/margins": 0.20822450518608093,
"rewards/rejected": -1.8753639459609985,
"step": 230
},
{
"epoch": 0.49604221635883905,
"grad_norm": 9.907519766746905,
"learning_rate": 2.968453286464312e-07,
"logits/chosen": -2.2562708854675293,
"logits/rejected": -2.479283094406128,
"logps/chosen": -388.1554260253906,
"logps/rejected": -398.91656494140625,
"loss": 0.5934,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.3175015449523926,
"rewards/margins": 0.24448028206825256,
"rewards/rejected": -1.5619816780090332,
"step": 235
},
{
"epoch": 0.5065963060686016,
"grad_norm": 11.385701610802691,
"learning_rate": 2.8773908941806877e-07,
"logits/chosen": -2.2707817554473877,
"logits/rejected": -2.499936103820801,
"logps/chosen": -438.36834716796875,
"logps/rejected": -435.4722595214844,
"loss": 0.6058,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.5956835746765137,
"rewards/margins": 0.22251293063163757,
"rewards/rejected": -1.8181965351104736,
"step": 240
},
{
"epoch": 0.5171503957783641,
"grad_norm": 13.386635563356508,
"learning_rate": 2.785813031330473e-07,
"logits/chosen": -2.2956652641296387,
"logits/rejected": -2.5434913635253906,
"logps/chosen": -469.11676025390625,
"logps/rejected": -465.9659118652344,
"loss": 0.6099,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.9268583059310913,
"rewards/margins": 0.21980834007263184,
"rewards/rejected": -2.1466667652130127,
"step": 245
},
{
"epoch": 0.5277044854881267,
"grad_norm": 10.39915818076319,
"learning_rate": 2.693844782258779e-07,
"logits/chosen": -2.3407020568847656,
"logits/rejected": -2.5239195823669434,
"logps/chosen": -459.4602966308594,
"logps/rejected": -466.40966796875,
"loss": 0.6065,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.813126564025879,
"rewards/margins": 0.22814805805683136,
"rewards/rejected": -2.0412745475769043,
"step": 250
},
{
"epoch": 0.5382585751978892,
"grad_norm": 13.963581040356289,
"learning_rate": 2.601611764531342e-07,
"logits/chosen": -2.2778186798095703,
"logits/rejected": -2.4619011878967285,
"logps/chosen": -394.53631591796875,
"logps/rejected": -416.8975524902344,
"loss": 0.6027,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.4522285461425781,
"rewards/margins": 0.3218362331390381,
"rewards/rejected": -1.7740647792816162,
"step": 255
},
{
"epoch": 0.5488126649076517,
"grad_norm": 9.618957459005115,
"learning_rate": 2.5092399573560323e-07,
"logits/chosen": -2.219548463821411,
"logits/rejected": -2.35976243019104,
"logps/chosen": -442.07470703125,
"logps/rejected": -447.6449279785156,
"loss": 0.6068,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.6692779064178467,
"rewards/margins": 0.20155127346515656,
"rewards/rejected": -1.8708292245864868,
"step": 260
},
{
"epoch": 0.5593667546174143,
"grad_norm": 13.385422305434652,
"learning_rate": 2.4168555295104124e-07,
"logits/chosen": -2.215520143508911,
"logits/rejected": -2.2993171215057373,
"logps/chosen": -438.2977600097656,
"logps/rejected": -455.9774475097656,
"loss": 0.585,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.5749682188034058,
"rewards/margins": 0.3128505051136017,
"rewards/rejected": -1.8878189325332642,
"step": 265
},
{
"epoch": 0.5699208443271768,
"grad_norm": 20.732510246650026,
"learning_rate": 2.3245846670103626e-07,
"logits/chosen": -2.383749008178711,
"logits/rejected": -2.67887806892395,
"logps/chosen": -489.03680419921875,
"logps/rejected": -513.4251708984375,
"loss": 0.5809,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.8371191024780273,
"rewards/margins": 0.4673503041267395,
"rewards/rejected": -2.304469347000122,
"step": 270
},
{
"epoch": 0.5804749340369393,
"grad_norm": 21.85869665384531,
"learning_rate": 2.232553400755159e-07,
"logits/chosen": -2.5215096473693848,
"logits/rejected": -2.706601619720459,
"logps/chosen": -494.96881103515625,
"logps/rejected": -513.2868041992188,
"loss": 0.6131,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.0746359825134277,
"rewards/margins": 0.40606698393821716,
"rewards/rejected": -2.4807028770446777,
"step": 275
},
{
"epoch": 0.5910290237467019,
"grad_norm": 11.697047808438843,
"learning_rate": 2.1408874343844294e-07,
"logits/chosen": -2.4797873497009277,
"logits/rejected": -2.675075054168701,
"logps/chosen": -458.46148681640625,
"logps/rejected": -470.7688903808594,
"loss": 0.5758,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.7771122455596924,
"rewards/margins": 0.36666423082351685,
"rewards/rejected": -2.1437766551971436,
"step": 280
},
{
"epoch": 0.6015831134564644,
"grad_norm": 12.782798316845536,
"learning_rate": 2.049711972582101e-07,
"logits/chosen": -2.287956714630127,
"logits/rejected": -2.528700590133667,
"logps/chosen": -435.39569091796875,
"logps/rejected": -443.40240478515625,
"loss": 0.5757,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.6082191467285156,
"rewards/margins": 0.2757379412651062,
"rewards/rejected": -1.8839571475982666,
"step": 285
},
{
"epoch": 0.6121372031662269,
"grad_norm": 15.368915084454407,
"learning_rate": 1.9591515500618588e-07,
"logits/chosen": -2.276632070541382,
"logits/rejected": -2.4731783866882324,
"logps/chosen": -473.08233642578125,
"logps/rejected": -492.99774169921875,
"loss": 0.5871,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.886792540550232,
"rewards/margins": 0.30836355686187744,
"rewards/rejected": -2.1951560974121094,
"step": 290
},
{
"epoch": 0.6226912928759895,
"grad_norm": 12.7131825042862,
"learning_rate": 1.8693298614677112e-07,
"logits/chosen": -2.141019821166992,
"logits/rejected": -2.3793327808380127,
"logps/chosen": -507.8687438964844,
"logps/rejected": -522.310546875,
"loss": 0.5797,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.982000708580017,
"rewards/margins": 0.3759006857872009,
"rewards/rejected": -2.3579015731811523,
"step": 295
},
{
"epoch": 0.633245382585752,
"grad_norm": 16.694511883816396,
"learning_rate": 1.7803695924219814e-07,
"logits/chosen": -2.2722671031951904,
"logits/rejected": -2.486273765563965,
"logps/chosen": -485.3500061035156,
"logps/rejected": -501.1082458496094,
"loss": 0.5968,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.0275466442108154,
"rewards/margins": 0.3500698506832123,
"rewards/rejected": -2.3776164054870605,
"step": 300
},
{
"epoch": 0.6437994722955145,
"grad_norm": 12.047975502833824,
"learning_rate": 1.6923922519515067e-07,
"logits/chosen": -2.2678744792938232,
"logits/rejected": -2.4072413444519043,
"logps/chosen": -485.83709716796875,
"logps/rejected": -510.50628662109375,
"loss": 0.5798,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.0933148860931396,
"rewards/margins": 0.37190961837768555,
"rewards/rejected": -2.465224504470825,
"step": 305
},
{
"epoch": 0.6543535620052771,
"grad_norm": 15.563285458971103,
"learning_rate": 1.605518006520924e-07,
"logits/chosen": -2.2849347591400146,
"logits/rejected": -2.5423800945281982,
"logps/chosen": -502.2515563964844,
"logps/rejected": -523.35400390625,
"loss": 0.5888,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.3487892150878906,
"rewards/margins": 0.39526933431625366,
"rewards/rejected": -2.744058609008789,
"step": 310
},
{
"epoch": 0.6649076517150396,
"grad_norm": 12.024205485012896,
"learning_rate": 1.519865515899731e-07,
"logits/chosen": -2.3724029064178467,
"logits/rejected": -2.584688901901245,
"logps/chosen": -492.44183349609375,
"logps/rejected": -507.952880859375,
"loss": 0.5818,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.114487409591675,
"rewards/margins": 0.37069326639175415,
"rewards/rejected": -2.485180616378784,
"step": 315
},
{
"epoch": 0.6754617414248021,
"grad_norm": 12.928310439067417,
"learning_rate": 1.4355517710873182e-07,
"logits/chosen": -2.332822322845459,
"logits/rejected": -2.580765724182129,
"logps/chosen": -478.24560546875,
"logps/rejected": -496.3634338378906,
"loss": 0.5952,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.112123966217041,
"rewards/margins": 0.3569082021713257,
"rewards/rejected": -2.4690322875976562,
"step": 320
},
{
"epoch": 0.6860158311345647,
"grad_norm": 14.108193064852378,
"learning_rate": 1.3526919345173318e-07,
"logits/chosen": -2.4187912940979004,
"logits/rejected": -2.564967632293701,
"logps/chosen": -518.6376953125,
"logps/rejected": -543.2145385742188,
"loss": 0.578,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.3633389472961426,
"rewards/margins": 0.39262861013412476,
"rewards/rejected": -2.755967378616333,
"step": 325
},
{
"epoch": 0.6965699208443272,
"grad_norm": 19.566691765082115,
"learning_rate": 1.2713991827596443e-07,
"logits/chosen": -2.466085195541382,
"logits/rejected": -2.6994807720184326,
"logps/chosen": -525.8836669921875,
"logps/rejected": -555.58154296875,
"loss": 0.576,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.558525800704956,
"rewards/margins": 0.42205095291137695,
"rewards/rejected": -2.980576753616333,
"step": 330
},
{
"epoch": 0.7071240105540897,
"grad_norm": 16.838554493879652,
"learning_rate": 1.191784551934773e-07,
"logits/chosen": -2.502603054046631,
"logits/rejected": -2.686891794204712,
"logps/chosen": -503.7294006347656,
"logps/rejected": -527.2174072265625,
"loss": 0.5818,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.4917781352996826,
"rewards/margins": 0.3848406672477722,
"rewards/rejected": -2.8766188621520996,
"step": 335
},
{
"epoch": 0.7176781002638523,
"grad_norm": 14.504581595303138,
"learning_rate": 1.1139567860518953e-07,
"logits/chosen": -2.369147777557373,
"logits/rejected": -2.5709891319274902,
"logps/chosen": -490.6739807128906,
"logps/rejected": -508.79486083984375,
"loss": 0.5972,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -2.1547963619232178,
"rewards/margins": 0.4393877387046814,
"rewards/rejected": -2.594184160232544,
"step": 340
},
{
"epoch": 0.7282321899736148,
"grad_norm": 17.356368091641894,
"learning_rate": 1.0380221884776128e-07,
"logits/chosen": -2.3545467853546143,
"logits/rejected": -2.6243462562561035,
"logps/chosen": -524.93896484375,
"logps/rejected": -539.4567260742188,
"loss": 0.5834,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -2.282606840133667,
"rewards/margins": 0.3932141065597534,
"rewards/rejected": -2.675821304321289,
"step": 345
},
{
"epoch": 0.7387862796833773,
"grad_norm": 12.791178420216584,
"learning_rate": 9.640844767383405e-08,
"logits/chosen": -2.3955166339874268,
"logits/rejected": -2.7560970783233643,
"logps/chosen": -512.841552734375,
"logps/rejected": -530.08203125,
"loss": 0.574,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.2674124240875244,
"rewards/margins": 0.4363299012184143,
"rewards/rejected": -2.703742504119873,
"step": 350
},
{
"epoch": 0.7493403693931399,
"grad_norm": 15.246389662934607,
"learning_rate": 8.922446408546378e-08,
"logits/chosen": -2.1915884017944336,
"logits/rejected": -2.4269826412200928,
"logps/chosen": -500.866943359375,
"logps/rejected": -525.7913208007812,
"loss": 0.5859,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.0900752544403076,
"rewards/margins": 0.4145973324775696,
"rewards/rejected": -2.5046725273132324,
"step": 355
},
{
"epoch": 0.7598944591029023,
"grad_norm": 13.547069603435855,
"learning_rate": 8.22600805400994e-08,
"logits/chosen": -2.177799940109253,
"logits/rejected": -2.3907604217529297,
"logps/chosen": -484.6221618652344,
"logps/rejected": -505.7215270996094,
"loss": 0.5938,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.995901346206665,
"rewards/margins": 0.3997262418270111,
"rewards/rejected": -2.395627498626709,
"step": 360
},
{
"epoch": 0.7704485488126649,
"grad_norm": 13.801750243778415,
"learning_rate": 7.552480954794558e-08,
"logits/chosen": -2.4111971855163574,
"logits/rejected": -2.541329860687256,
"logps/chosen": -483.94940185546875,
"logps/rejected": -505.018798828125,
"loss": 0.5786,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.1984305381774902,
"rewards/margins": 0.30352845788002014,
"rewards/rejected": -2.5019590854644775,
"step": 365
},
{
"epoch": 0.7810026385224275,
"grad_norm": 14.514442087318445,
"learning_rate": 6.902785067901854e-08,
"logits/chosen": -2.3650124073028564,
"logits/rejected": -2.6508941650390625,
"logps/chosen": -493.879150390625,
"logps/rejected": -509.673828125,
"loss": 0.5716,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.1922976970672607,
"rewards/margins": 0.36951905488967896,
"rewards/rejected": -2.561816930770874,
"step": 370
},
{
"epoch": 0.7915567282321899,
"grad_norm": 17.145765706359093,
"learning_rate": 6.277807799763973e-08,
"logits/chosen": -2.3265914916992188,
"logits/rejected": -2.5552210807800293,
"logps/chosen": -559.2115478515625,
"logps/rejected": -583.4457397460938,
"loss": 0.5863,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.589047908782959,
"rewards/margins": 0.4866320490837097,
"rewards/rejected": -3.0756797790527344,
"step": 375
},
{
"epoch": 0.8021108179419525,
"grad_norm": 12.09091453196307,
"learning_rate": 5.678402794153145e-08,
"logits/chosen": -2.2694175243377686,
"logits/rejected": -2.5305798053741455,
"logps/chosen": -522.9309692382812,
"logps/rejected": -551.8192749023438,
"loss": 0.5797,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.3409042358398438,
"rewards/margins": 0.4437629282474518,
"rewards/rejected": -2.7846672534942627,
"step": 380
},
{
"epoch": 0.8126649076517151,
"grad_norm": 13.834538465076683,
"learning_rate": 5.105388766206969e-08,
"logits/chosen": -2.3926773071289062,
"logits/rejected": -2.5119540691375732,
"logps/chosen": -503.90850830078125,
"logps/rejected": -526.0748901367188,
"loss": 0.5905,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.2285289764404297,
"rewards/margins": 0.36956310272216797,
"rewards/rejected": -2.5980920791625977,
"step": 385
},
{
"epoch": 0.8232189973614775,
"grad_norm": 12.082575884438302,
"learning_rate": 4.5595483841620484e-08,
"logits/chosen": -2.219971179962158,
"logits/rejected": -2.4546897411346436,
"logps/chosen": -487.01544189453125,
"logps/rejected": -499.76495361328125,
"loss": 0.5761,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.0403149127960205,
"rewards/margins": 0.41723886132240295,
"rewards/rejected": -2.4575533866882324,
"step": 390
},
{
"epoch": 0.8337730870712401,
"grad_norm": 14.209125492445594,
"learning_rate": 4.0416272003232526e-08,
"logits/chosen": -2.3332419395446777,
"logits/rejected": -2.5107414722442627,
"logps/chosen": -479.9403381347656,
"logps/rejected": -502.7928771972656,
"loss": 0.5866,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.998462438583374,
"rewards/margins": 0.44837865233421326,
"rewards/rejected": -2.446840763092041,
"step": 395
},
{
"epoch": 0.8443271767810027,
"grad_norm": 13.431649672455546,
"learning_rate": 3.552332632729041e-08,
"logits/chosen": -2.3189663887023926,
"logits/rejected": -2.414161205291748,
"logps/chosen": -480.2158203125,
"logps/rejected": -508.34466552734375,
"loss": 0.5675,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.9780933856964111,
"rewards/margins": 0.3599149286746979,
"rewards/rejected": -2.338008403778076,
"step": 400
},
{
"epoch": 0.8443271767810027,
"eval_logits/chosen": -2.860567092895508,
"eval_logits/rejected": -2.755436420440674,
"eval_logps/chosen": -475.5936279296875,
"eval_logps/rejected": -511.86138916015625,
"eval_loss": 0.6271286606788635,
"eval_rewards/accuracies": 0.6350806355476379,
"eval_rewards/chosen": -2.127014636993408,
"eval_rewards/margins": 0.2526260018348694,
"eval_rewards/rejected": -2.379640579223633,
"eval_runtime": 325.3184,
"eval_samples_per_second": 6.074,
"eval_steps_per_second": 0.381,
"step": 400
},
{
"epoch": 0.8548812664907651,
"grad_norm": 14.805379177265603,
"learning_rate": 3.092332998903416e-08,
"logits/chosen": -2.3201870918273926,
"logits/rejected": -2.5070722103118896,
"logps/chosen": -494.46075439453125,
"logps/rejected": -522.0347900390625,
"loss": 0.5659,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.1851017475128174,
"rewards/margins": 0.4036984443664551,
"rewards/rejected": -2.5887999534606934,
"step": 405
},
{
"epoch": 0.8654353562005277,
"grad_norm": 14.004584602296926,
"learning_rate": 2.6622566030146455e-08,
"logits/chosen": -2.2431082725524902,
"logits/rejected": -2.432163715362549,
"logps/chosen": -490.9959411621094,
"logps/rejected": -510.72088623046875,
"loss": 0.5799,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.1516032218933105,
"rewards/margins": 0.3738686442375183,
"rewards/rejected": -2.5254716873168945,
"step": 410
},
{
"epoch": 0.8759894459102903,
"grad_norm": 12.549317692341273,
"learning_rate": 2.26269087768734e-08,
"logits/chosen": -2.362277030944824,
"logits/rejected": -2.535696029663086,
"logps/chosen": -504.3929748535156,
"logps/rejected": -532.2606201171875,
"loss": 0.5687,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.364351511001587,
"rewards/margins": 0.4068065285682678,
"rewards/rejected": -2.771157741546631,
"step": 415
},
{
"epoch": 0.8865435356200527,
"grad_norm": 17.291644458564797,
"learning_rate": 1.894181581640106e-08,
"logits/chosen": -2.474963903427124,
"logits/rejected": -2.697309970855713,
"logps/chosen": -524.9340209960938,
"logps/rejected": -546.8604125976562,
"loss": 0.5769,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.467923641204834,
"rewards/margins": 0.43783673644065857,
"rewards/rejected": -2.9057605266571045,
"step": 420
},
{
"epoch": 0.8970976253298153,
"grad_norm": 14.61494685094214,
"learning_rate": 1.5572320542448143e-08,
"logits/chosen": -2.338838577270508,
"logits/rejected": -2.5825653076171875,
"logps/chosen": -540.0514526367188,
"logps/rejected": -563.7896118164062,
"loss": 0.5923,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -2.444960832595825,
"rewards/margins": 0.40859413146972656,
"rewards/rejected": -2.8535547256469727,
"step": 425
},
{
"epoch": 0.9076517150395779,
"grad_norm": 11.41595324078826,
"learning_rate": 1.2523025280255729e-08,
"logits/chosen": -2.3540937900543213,
"logits/rejected": -2.604950428009033,
"logps/chosen": -529.1434326171875,
"logps/rejected": -546.1678466796875,
"loss": 0.576,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.464961051940918,
"rewards/margins": 0.4290400445461273,
"rewards/rejected": -2.894001007080078,
"step": 430
},
{
"epoch": 0.9182058047493403,
"grad_norm": 15.41017745385185,
"learning_rate": 9.798095000364214e-09,
"logits/chosen": -2.409531831741333,
"logits/rejected": -2.6002402305603027,
"logps/chosen": -515.3450317382812,
"logps/rejected": -548.9906005859375,
"loss": 0.5571,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.4495291709899902,
"rewards/margins": 0.48690468072891235,
"rewards/rejected": -2.936434030532837,
"step": 435
},
{
"epoch": 0.9287598944591029,
"grad_norm": 15.067700929396372,
"learning_rate": 7.401251629764876e-09,
"logits/chosen": -2.5300445556640625,
"logits/rejected": -2.7091293334960938,
"logps/chosen": -556.22607421875,
"logps/rejected": -582.80126953125,
"loss": 0.587,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.7479639053344727,
"rewards/margins": 0.405862033367157,
"rewards/rejected": -3.1538259983062744,
"step": 440
},
{
"epoch": 0.9393139841688655,
"grad_norm": 15.58719326370491,
"learning_rate": 5.335768968195098e-09,
"logits/chosen": -2.4501638412475586,
"logits/rejected": -2.711761713027954,
"logps/chosen": -547.3215942382812,
"logps/rejected": -560.46142578125,
"loss": 0.5694,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -2.6382341384887695,
"rewards/margins": 0.393528550863266,
"rewards/rejected": -3.0317625999450684,
"step": 445
},
{
"epoch": 0.9498680738786279,
"grad_norm": 18.222145597670412,
"learning_rate": 3.604468216521883e-09,
"logits/chosen": -2.5248587131500244,
"logits/rejected": -2.665889024734497,
"logps/chosen": -544.0384521484375,
"logps/rejected": -558.4286499023438,
"loss": 0.5696,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -2.6324782371520996,
"rewards/margins": 0.36458876729011536,
"rewards/rejected": -2.9970669746398926,
"step": 450
},
{
"epoch": 0.9604221635883905,
"grad_norm": 18.763031291028923,
"learning_rate": 2.2097141233206884e-09,
"logits/chosen": -2.4280190467834473,
"logits/rejected": -2.6637661457061768,
"logps/chosen": -531.4348754882812,
"logps/rejected": -560.4275512695312,
"loss": 0.5771,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.5917065143585205,
"rewards/margins": 0.4408086836338043,
"rewards/rejected": -3.032515287399292,
"step": 455
},
{
"epoch": 0.9709762532981531,
"grad_norm": 13.86977592293856,
"learning_rate": 1.1534117549133472e-09,
"logits/chosen": -2.514380931854248,
"logits/rejected": -2.7799925804138184,
"logps/chosen": -552.8911743164062,
"logps/rejected": -577.2693481445312,
"loss": 0.5713,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.666104793548584,
"rewards/margins": 0.5134469270706177,
"rewards/rejected": -3.179551601409912,
"step": 460
},
{
"epoch": 0.9815303430079155,
"grad_norm": 15.545025613547882,
"learning_rate": 4.3700389327672173e-10,
"logits/chosen": -2.3933727741241455,
"logits/rejected": -2.630448818206787,
"logps/chosen": -566.9051513671875,
"logps/rejected": -592.8246459960938,
"loss": 0.5817,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.771134614944458,
"rewards/margins": 0.4589918553829193,
"rewards/rejected": -3.230126142501831,
"step": 465
},
{
"epoch": 0.9920844327176781,
"grad_norm": 13.156905753144738,
"learning_rate": 6.146906537587982e-11,
"logits/chosen": -2.4273524284362793,
"logits/rejected": -2.5787882804870605,
"logps/chosen": -544.8056640625,
"logps/rejected": -564.3406982421875,
"loss": 0.5834,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -2.588836908340454,
"rewards/margins": 0.3985593914985657,
"rewards/rejected": -2.987395763397217,
"step": 470
},
{
"epoch": 0.9984168865435357,
"step": 473,
"total_flos": 0.0,
"train_loss": 0.6128583030015167,
"train_runtime": 20192.8387,
"train_samples_per_second": 3.003,
"train_steps_per_second": 0.023
}
],
"logging_steps": 5,
"max_steps": 473,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}