IL_BERAll-zephyr-7b-sft-full / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1274,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007849293563579278,
"grad_norm": 6.091196076983652,
"learning_rate": 3.90625e-09,
"logits/chosen": 5914.52099609375,
"logits/rejected": 2785.021484375,
"logps/chosen": -212.45889282226562,
"logps/rejected": -98.59669494628906,
"loss": 1.3863,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.007849293563579277,
"grad_norm": 6.048636541099143,
"learning_rate": 3.9062499999999997e-08,
"logits/chosen": 4973.81396484375,
"logits/rejected": 4328.32861328125,
"logps/chosen": -204.19737243652344,
"logps/rejected": -179.740234375,
"loss": 1.3862,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.08651990443468094,
"rewards/margins": 0.12112583220005035,
"rewards/rejected": -0.034605927765369415,
"step": 10
},
{
"epoch": 0.015698587127158554,
"grad_norm": 6.189956928555152,
"learning_rate": 7.812499999999999e-08,
"logits/chosen": 6084.02587890625,
"logits/rejected": 4834.0732421875,
"logps/chosen": -217.18612670898438,
"logps/rejected": -196.73153686523438,
"loss": 1.3864,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.02506137453019619,
"rewards/margins": 0.04303772374987602,
"rewards/rejected": -0.01797635480761528,
"step": 20
},
{
"epoch": 0.023547880690737835,
"grad_norm": 5.4726473359462195,
"learning_rate": 1.1718749999999999e-07,
"logits/chosen": 6084.0302734375,
"logits/rejected": 5104.97900390625,
"logps/chosen": -250.5454559326172,
"logps/rejected": -209.36410522460938,
"loss": 1.3861,
"rewards/accuracies": 0.5583332777023315,
"rewards/chosen": 0.026890581473708153,
"rewards/margins": 0.09340113401412964,
"rewards/rejected": -0.06651054322719574,
"step": 30
},
{
"epoch": 0.03139717425431711,
"grad_norm": 5.708267831588723,
"learning_rate": 1.5624999999999999e-07,
"logits/chosen": 5311.87744140625,
"logits/rejected": 4346.86328125,
"logps/chosen": -212.0022430419922,
"logps/rejected": -181.71847534179688,
"loss": 1.386,
"rewards/accuracies": 0.6166666746139526,
"rewards/chosen": 0.07700984179973602,
"rewards/margins": 0.10846559703350067,
"rewards/rejected": -0.031455766409635544,
"step": 40
},
{
"epoch": 0.03924646781789639,
"grad_norm": 5.759396354993872,
"learning_rate": 1.9531249999999998e-07,
"logits/chosen": 6424.58251953125,
"logits/rejected": 5042.18115234375,
"logps/chosen": -265.2978820800781,
"logps/rejected": -206.7998809814453,
"loss": 1.3856,
"rewards/accuracies": 0.6750000715255737,
"rewards/chosen": 0.3287124037742615,
"rewards/margins": 0.4289844036102295,
"rewards/rejected": -0.10027195513248444,
"step": 50
},
{
"epoch": 0.04709576138147567,
"grad_norm": 5.54406858970845,
"learning_rate": 2.3437499999999998e-07,
"logits/chosen": 5484.29541015625,
"logits/rejected": 4559.962890625,
"logps/chosen": -213.7506103515625,
"logps/rejected": -209.12460327148438,
"loss": 1.385,
"rewards/accuracies": 0.6583333611488342,
"rewards/chosen": 0.253384530544281,
"rewards/margins": 0.5778969526290894,
"rewards/rejected": -0.32451242208480835,
"step": 60
},
{
"epoch": 0.054945054945054944,
"grad_norm": 5.35185403577633,
"learning_rate": 2.734375e-07,
"logits/chosen": 5194.3994140625,
"logits/rejected": 4918.51220703125,
"logps/chosen": -178.344970703125,
"logps/rejected": -177.43560791015625,
"loss": 1.3842,
"rewards/accuracies": 0.6916667222976685,
"rewards/chosen": 0.13593974709510803,
"rewards/margins": 0.8398297429084778,
"rewards/rejected": -0.7038900256156921,
"step": 70
},
{
"epoch": 0.06279434850863422,
"grad_norm": 5.638870230561589,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": 5774.1318359375,
"logits/rejected": 5269.8134765625,
"logps/chosen": -196.78341674804688,
"logps/rejected": -182.97677612304688,
"loss": 1.3822,
"rewards/accuracies": 0.6416666507720947,
"rewards/chosen": -0.21483942866325378,
"rewards/margins": 1.1714082956314087,
"rewards/rejected": -1.3862475156784058,
"step": 80
},
{
"epoch": 0.0706436420722135,
"grad_norm": 6.478511073625711,
"learning_rate": 3.5156249999999997e-07,
"logits/chosen": 6040.28759765625,
"logits/rejected": 5181.716796875,
"logps/chosen": -220.1483917236328,
"logps/rejected": -190.4631805419922,
"loss": 1.3787,
"rewards/accuracies": 0.7666666507720947,
"rewards/chosen": -1.2661734819412231,
"rewards/margins": 4.753196716308594,
"rewards/rejected": -6.019370079040527,
"step": 90
},
{
"epoch": 0.07849293563579278,
"grad_norm": 7.188974837064224,
"learning_rate": 3.9062499999999997e-07,
"logits/chosen": 5967.84326171875,
"logits/rejected": 5745.97119140625,
"logps/chosen": -213.9687042236328,
"logps/rejected": -208.8219757080078,
"loss": 1.3796,
"rewards/accuracies": 0.6833333969116211,
"rewards/chosen": -3.738008975982666,
"rewards/margins": 5.6422576904296875,
"rewards/rejected": -9.380266189575195,
"step": 100
},
{
"epoch": 0.08634222919937205,
"grad_norm": 6.118081410153287,
"learning_rate": 4.2968749999999996e-07,
"logits/chosen": 6471.71923828125,
"logits/rejected": 5290.84716796875,
"logps/chosen": -188.41543579101562,
"logps/rejected": -190.62838745117188,
"loss": 1.3749,
"rewards/accuracies": 0.6666666865348816,
"rewards/chosen": -5.300592422485352,
"rewards/margins": 6.335596561431885,
"rewards/rejected": -11.636189460754395,
"step": 110
},
{
"epoch": 0.09419152276295134,
"grad_norm": 8.791461375827627,
"learning_rate": 4.6874999999999996e-07,
"logits/chosen": 6398.0341796875,
"logits/rejected": 5325.00927734375,
"logps/chosen": -210.2766571044922,
"logps/rejected": -212.75204467773438,
"loss": 1.3728,
"rewards/accuracies": 0.6666666865348816,
"rewards/chosen": -7.439939022064209,
"rewards/margins": 7.147006988525391,
"rewards/rejected": -14.586946487426758,
"step": 120
},
{
"epoch": 0.10204081632653061,
"grad_norm": 6.480916055994096,
"learning_rate": 4.999962424962166e-07,
"logits/chosen": 6332.94677734375,
"logits/rejected": 5863.13134765625,
"logps/chosen": -215.77871704101562,
"logps/rejected": -212.88671875,
"loss": 1.3705,
"rewards/accuracies": 0.7083333730697632,
"rewards/chosen": -5.970229625701904,
"rewards/margins": 9.271949768066406,
"rewards/rejected": -15.242179870605469,
"step": 130
},
{
"epoch": 0.10989010989010989,
"grad_norm": 7.916401372438219,
"learning_rate": 4.998647417232375e-07,
"logits/chosen": 6197.4365234375,
"logits/rejected": 5458.46240234375,
"logps/chosen": -195.366943359375,
"logps/rejected": -196.8258056640625,
"loss": 1.3696,
"rewards/accuracies": 0.6250000596046448,
"rewards/chosen": -9.699501037597656,
"rewards/margins": 9.53441047668457,
"rewards/rejected": -19.23391342163086,
"step": 140
},
{
"epoch": 0.11773940345368916,
"grad_norm": 9.11816822426609,
"learning_rate": 4.995454786965036e-07,
"logits/chosen": 6377.1611328125,
"logits/rejected": 5330.43115234375,
"logps/chosen": -209.7741241455078,
"logps/rejected": -192.396728515625,
"loss": 1.3666,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -8.089722633361816,
"rewards/margins": 12.00928783416748,
"rewards/rejected": -20.099010467529297,
"step": 150
},
{
"epoch": 0.12558869701726844,
"grad_norm": 7.198206798530057,
"learning_rate": 4.990386933279972e-07,
"logits/chosen": 6321.40087890625,
"logits/rejected": 5649.20849609375,
"logps/chosen": -207.3892822265625,
"logps/rejected": -219.2005157470703,
"loss": 1.3659,
"rewards/accuracies": 0.6916667222976685,
"rewards/chosen": -9.86109733581543,
"rewards/margins": 12.281832695007324,
"rewards/rejected": -22.14293098449707,
"step": 160
},
{
"epoch": 0.13343799058084774,
"grad_norm": 6.321494665117691,
"learning_rate": 4.983447664444096e-07,
"logits/chosen": 6516.60546875,
"logits/rejected": 5811.42822265625,
"logps/chosen": -219.67501831054688,
"logps/rejected": -216.0376434326172,
"loss": 1.3671,
"rewards/accuracies": 0.6750000715255737,
"rewards/chosen": -10.054361343383789,
"rewards/margins": 9.661711692810059,
"rewards/rejected": -19.71607208251953,
"step": 170
},
{
"epoch": 0.141287284144427,
"grad_norm": 7.558390140870204,
"learning_rate": 4.97464219500968e-07,
"logits/chosen": 5710.5439453125,
"logits/rejected": 4990.15771484375,
"logps/chosen": -198.03170776367188,
"logps/rejected": -199.192626953125,
"loss": 1.3638,
"rewards/accuracies": 0.6500000357627869,
"rewards/chosen": -9.503952026367188,
"rewards/margins": 11.527425765991211,
"rewards/rejected": -21.031375885009766,
"step": 180
},
{
"epoch": 0.14913657770800628,
"grad_norm": 8.14145163308194,
"learning_rate": 4.963977141895843e-07,
"logits/chosen": 5859.50146484375,
"logits/rejected": 5036.01953125,
"logps/chosen": -214.22640991210938,
"logps/rejected": -225.4895782470703,
"loss": 1.3601,
"rewards/accuracies": 0.7250000834465027,
"rewards/chosen": -10.467451095581055,
"rewards/margins": 23.782638549804688,
"rewards/rejected": -34.25008773803711,
"step": 190
},
{
"epoch": 0.15698587127158556,
"grad_norm": 8.494418405300177,
"learning_rate": 4.951460519416227e-07,
"logits/chosen": 5772.40625,
"logits/rejected": 5338.69140625,
"logps/chosen": -191.8777313232422,
"logps/rejected": -223.7870635986328,
"loss": 1.359,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": -9.333466529846191,
"rewards/margins": 17.782575607299805,
"rewards/rejected": -27.116046905517578,
"step": 200
},
{
"epoch": 0.16483516483516483,
"grad_norm": 9.658476061049418,
"learning_rate": 4.937101733256606e-07,
"logits/chosen": 5223.62548828125,
"logits/rejected": 4660.197265625,
"logps/chosen": -166.54293823242188,
"logps/rejected": -186.89669799804688,
"loss": 1.3593,
"rewards/accuracies": 0.6666666865348816,
"rewards/chosen": -12.629673957824707,
"rewards/margins": 17.08604621887207,
"rewards/rejected": -29.715723037719727,
"step": 210
},
{
"epoch": 0.1726844583987441,
"grad_norm": 12.248366456833509,
"learning_rate": 4.920911573406924e-07,
"logits/chosen": 6362.5478515625,
"logits/rejected": 5419.66650390625,
"logps/chosen": -207.56906127929688,
"logps/rejected": -192.8691864013672,
"loss": 1.3577,
"rewards/accuracies": 0.7166666388511658,
"rewards/chosen": -11.232467651367188,
"rewards/margins": 17.640005111694336,
"rewards/rejected": -28.872472763061523,
"step": 220
},
{
"epoch": 0.18053375196232338,
"grad_norm": 7.234262107057838,
"learning_rate": 4.902902206053098e-07,
"logits/chosen": 5827.66650390625,
"logits/rejected": 5263.23046875,
"logps/chosen": -198.8260498046875,
"logps/rejected": -209.947265625,
"loss": 1.3604,
"rewards/accuracies": 0.7000000476837158,
"rewards/chosen": -11.487305641174316,
"rewards/margins": 18.69247817993164,
"rewards/rejected": -30.179784774780273,
"step": 230
},
{
"epoch": 0.18838304552590268,
"grad_norm": 8.459392596172329,
"learning_rate": 4.883087164434672e-07,
"logits/chosen": 5309.54736328125,
"logits/rejected": 4243.5830078125,
"logps/chosen": -175.29354858398438,
"logps/rejected": -179.5849151611328,
"loss": 1.3558,
"rewards/accuracies": 0.7750000357627869,
"rewards/chosen": -7.664523124694824,
"rewards/margins": 17.230939865112305,
"rewards/rejected": -24.895463943481445,
"step": 240
},
{
"epoch": 0.19623233908948196,
"grad_norm": 9.145048905164794,
"learning_rate": 4.861481338675183e-07,
"logits/chosen": 6279.61474609375,
"logits/rejected": 5581.43603515625,
"logps/chosen": -178.78981018066406,
"logps/rejected": -217.976806640625,
"loss": 1.3579,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": -11.645959854125977,
"rewards/margins": 21.824161529541016,
"rewards/rejected": -33.470123291015625,
"step": 250
},
{
"epoch": 0.20408163265306123,
"grad_norm": 10.037813125733608,
"learning_rate": 4.838100964592904e-07,
"logits/chosen": 6413.66650390625,
"logits/rejected": 5192.2119140625,
"logps/chosen": -214.44338989257812,
"logps/rejected": -199.10244750976562,
"loss": 1.3693,
"rewards/accuracies": 0.7083333730697632,
"rewards/chosen": -14.724939346313477,
"rewards/margins": 18.1535587310791,
"rewards/rejected": -32.87849426269531,
"step": 260
},
{
"epoch": 0.2119309262166405,
"grad_norm": 10.210289382921355,
"learning_rate": 4.812963611500339e-07,
"logits/chosen": 6258.6923828125,
"logits/rejected": 6061.39453125,
"logps/chosen": -207.8274383544922,
"logps/rejected": -219.6881561279297,
"loss": 1.3476,
"rewards/accuracies": 0.6416667103767395,
"rewards/chosen": -11.071606636047363,
"rewards/margins": 19.98748779296875,
"rewards/rejected": -31.059091567993164,
"step": 270
},
{
"epoch": 0.21978021978021978,
"grad_norm": 11.667424937518986,
"learning_rate": 4.786088169001671e-07,
"logits/chosen": 5358.77783203125,
"logits/rejected": 4660.5009765625,
"logps/chosen": -173.97543334960938,
"logps/rejected": -208.5042266845703,
"loss": 1.3537,
"rewards/accuracies": 0.7583333253860474,
"rewards/chosen": -10.824542045593262,
"rewards/margins": 28.37823486328125,
"rewards/rejected": -39.202781677246094,
"step": 280
},
{
"epoch": 0.22762951334379905,
"grad_norm": 10.483113107420898,
"learning_rate": 4.7574948327980567e-07,
"logits/chosen": 7435.53759765625,
"logits/rejected": 5505.32666015625,
"logps/chosen": -247.2607879638672,
"logps/rejected": -226.1746368408203,
"loss": 1.3473,
"rewards/accuracies": 0.7666667699813843,
"rewards/chosen": -10.232341766357422,
"rewards/margins": 33.42657470703125,
"rewards/rejected": -43.65891647338867,
"step": 290
},
{
"epoch": 0.23547880690737832,
"grad_norm": 8.228448413177858,
"learning_rate": 4.727205089511466e-07,
"logits/chosen": 5422.88818359375,
"logits/rejected": 5400.13525390625,
"logps/chosen": -178.8369903564453,
"logps/rejected": -201.50466918945312,
"loss": 1.357,
"rewards/accuracies": 0.6750000715255737,
"rewards/chosen": -11.364561080932617,
"rewards/margins": 19.953664779663086,
"rewards/rejected": -31.318225860595703,
"step": 300
},
{
"epoch": 0.24332810047095763,
"grad_norm": 7.699139270208414,
"learning_rate": 4.6952417005384247e-07,
"logits/chosen": 6096.75732421875,
"logits/rejected": 5434.83837890625,
"logps/chosen": -185.6956024169922,
"logps/rejected": -198.10134887695312,
"loss": 1.3619,
"rewards/accuracies": 0.7583333849906921,
"rewards/chosen": -8.114912033081055,
"rewards/margins": 14.518139839172363,
"rewards/rejected": -22.6330509185791,
"step": 310
},
{
"epoch": 0.25117739403453687,
"grad_norm": 8.328180326269704,
"learning_rate": 4.661628684945851e-07,
"logits/chosen": 6136.8212890625,
"logits/rejected": 5324.23583984375,
"logps/chosen": -210.75827026367188,
"logps/rejected": -234.6461944580078,
"loss": 1.3578,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -9.701288223266602,
"rewards/margins": 22.927001953125,
"rewards/rejected": -32.62828826904297,
"step": 320
},
{
"epoch": 0.25902668759811615,
"grad_norm": 9.875007026467317,
"learning_rate": 4.626391301421782e-07,
"logits/chosen": 5934.5712890625,
"logits/rejected": 5409.8681640625,
"logps/chosen": -204.72036743164062,
"logps/rejected": -202.51492309570312,
"loss": 1.3638,
"rewards/accuracies": 0.6916666626930237,
"rewards/chosen": -10.267139434814453,
"rewards/margins": 14.924982070922852,
"rewards/rejected": -25.192119598388672,
"step": 330
},
{
"epoch": 0.2668759811616955,
"grad_norm": 8.606670577696239,
"learning_rate": 4.5895560292945996e-07,
"logits/chosen": 6179.17822265625,
"logits/rejected": 6319.3310546875,
"logps/chosen": -199.89364624023438,
"logps/rejected": -245.26809692382812,
"loss": 1.356,
"rewards/accuracies": 0.7083333730697632,
"rewards/chosen": -6.035394191741943,
"rewards/margins": 18.865169525146484,
"rewards/rejected": -24.900564193725586,
"step": 340
},
{
"epoch": 0.27472527472527475,
"grad_norm": 22.986995482748114,
"learning_rate": 4.5511505486349865e-07,
"logits/chosen": 6497.4287109375,
"logits/rejected": 5893.86474609375,
"logps/chosen": -206.90151977539062,
"logps/rejected": -249.62130737304688,
"loss": 1.3533,
"rewards/accuracies": 0.7666666507720947,
"rewards/chosen": -12.075809478759766,
"rewards/margins": 30.723468780517578,
"rewards/rejected": -42.79928207397461,
"step": 350
},
{
"epoch": 0.282574568288854,
"grad_norm": 9.742030346206404,
"learning_rate": 4.5112037194555876e-07,
"logits/chosen": 5949.8857421875,
"logits/rejected": 5860.00634765625,
"logps/chosen": -198.9341278076172,
"logps/rejected": -252.93209838867188,
"loss": 1.3655,
"rewards/accuracies": 0.75,
"rewards/chosen": -21.01068878173828,
"rewards/margins": 32.63959503173828,
"rewards/rejected": -53.6502799987793,
"step": 360
},
{
"epoch": 0.2904238618524333,
"grad_norm": 8.763637069131867,
"learning_rate": 4.4697455600239863e-07,
"logits/chosen": 5399.63525390625,
"logits/rejected": 5097.599609375,
"logps/chosen": -195.9980010986328,
"logps/rejected": -197.7607879638672,
"loss": 1.3627,
"rewards/accuracies": 0.6833333373069763,
"rewards/chosen": -11.57593822479248,
"rewards/margins": 18.113765716552734,
"rewards/rejected": -29.6897029876709,
"step": 370
},
{
"epoch": 0.29827315541601257,
"grad_norm": 9.722274579855199,
"learning_rate": 4.426807224305315e-07,
"logits/chosen": 6468.1220703125,
"logits/rejected": 5369.0634765625,
"logps/chosen": -234.26748657226562,
"logps/rejected": -212.1043243408203,
"loss": 1.354,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": -5.926461219787598,
"rewards/margins": 23.168312072753906,
"rewards/rejected": -29.094772338867188,
"step": 380
},
{
"epoch": 0.30612244897959184,
"grad_norm": 15.31595541298082,
"learning_rate": 4.3824209785514326e-07,
"logits/chosen": 6639.2294921875,
"logits/rejected": 5100.4287109375,
"logps/chosen": -221.4827117919922,
"logps/rejected": -218.9009552001953,
"loss": 1.3476,
"rewards/accuracies": 0.7583333849906921,
"rewards/chosen": -5.989265441894531,
"rewards/margins": 33.05856704711914,
"rewards/rejected": -39.047828674316406,
"step": 390
},
{
"epoch": 0.3139717425431711,
"grad_norm": 10.537639563559068,
"learning_rate": 4.3366201770542687e-07,
"logits/chosen": 5737.9208984375,
"logits/rejected": 5631.57080078125,
"logps/chosen": -203.96151733398438,
"logps/rejected": -229.1461639404297,
"loss": 1.3599,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": -13.172935485839844,
"rewards/margins": 27.23373794555664,
"rewards/rejected": -40.406673431396484,
"step": 400
},
{
"epoch": 0.3218210361067504,
"grad_norm": 14.959421459394797,
"learning_rate": 4.2894392370815567e-07,
"logits/chosen": 6207.42041015625,
"logits/rejected": 5546.6611328125,
"logps/chosen": -224.15078735351562,
"logps/rejected": -258.1195068359375,
"loss": 1.3344,
"rewards/accuracies": 0.7416666746139526,
"rewards/chosen": -11.663908004760742,
"rewards/margins": 38.00326156616211,
"rewards/rejected": -49.66717529296875,
"step": 410
},
{
"epoch": 0.32967032967032966,
"grad_norm": 17.843898253212178,
"learning_rate": 4.2409136130137845e-07,
"logits/chosen": 5856.669921875,
"logits/rejected": 5317.4970703125,
"logps/chosen": -218.15768432617188,
"logps/rejected": -230.917236328125,
"loss": 1.3484,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -14.127777099609375,
"rewards/margins": 35.54801940917969,
"rewards/rejected": -49.6757926940918,
"step": 420
},
{
"epoch": 0.33751962323390894,
"grad_norm": 12.654228568647438,
"learning_rate": 4.1910797697018017e-07,
"logits/chosen": 5639.2978515625,
"logits/rejected": 4720.31982421875,
"logps/chosen": -193.45645141601562,
"logps/rejected": -209.80795288085938,
"loss": 1.3462,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -13.711461067199707,
"rewards/margins": 34.86336898803711,
"rewards/rejected": -48.57483673095703,
"step": 430
},
{
"epoch": 0.3453689167974882,
"grad_norm": 15.182887086880035,
"learning_rate": 4.1399751550651084e-07,
"logits/chosen": 5991.6171875,
"logits/rejected": 5934.1552734375,
"logps/chosen": -193.38800048828125,
"logps/rejected": -230.582275390625,
"loss": 1.3459,
"rewards/accuracies": 0.75,
"rewards/chosen": -10.994651794433594,
"rewards/margins": 27.90401268005371,
"rewards/rejected": -38.89866638183594,
"step": 440
},
{
"epoch": 0.3532182103610675,
"grad_norm": 12.029246709671026,
"learning_rate": 4.087638171951401e-07,
"logits/chosen": 6900.34765625,
"logits/rejected": 4994.3525390625,
"logps/chosen": -218.0048370361328,
"logps/rejected": -219.4988555908203,
"loss": 1.3499,
"rewards/accuracies": 0.7583333253860474,
"rewards/chosen": -9.236083984375,
"rewards/margins": 47.2701416015625,
"rewards/rejected": -56.5062255859375,
"step": 450
},
{
"epoch": 0.36106750392464676,
"grad_norm": 15.803880587400545,
"learning_rate": 4.034108149278543e-07,
"logits/chosen": 7089.22021484375,
"logits/rejected": 5539.4384765625,
"logps/chosen": -264.29150390625,
"logps/rejected": -238.7609405517578,
"loss": 1.3517,
"rewards/accuracies": 0.73333340883255,
"rewards/chosen": -13.85925006866455,
"rewards/margins": 34.69366455078125,
"rewards/rejected": -48.552913665771484,
"step": 460
},
{
"epoch": 0.36891679748822603,
"grad_norm": 16.115599045605588,
"learning_rate": 3.979425312480629e-07,
"logits/chosen": 6082.546875,
"logits/rejected": 5345.21728515625,
"logps/chosen": -225.55813598632812,
"logps/rejected": -248.83438110351562,
"loss": 1.3451,
"rewards/accuracies": 0.75,
"rewards/chosen": -13.293352127075195,
"rewards/margins": 32.974754333496094,
"rewards/rejected": -46.26811218261719,
"step": 470
},
{
"epoch": 0.37676609105180536,
"grad_norm": 12.53417188182312,
"learning_rate": 3.923630753280357e-07,
"logits/chosen": 6546.7509765625,
"logits/rejected": 5691.3193359375,
"logps/chosen": -218.65902709960938,
"logps/rejected": -214.631103515625,
"loss": 1.3509,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -11.263853073120117,
"rewards/margins": 28.971487045288086,
"rewards/rejected": -40.23533630371094,
"step": 480
},
{
"epoch": 0.38461538461538464,
"grad_norm": 19.95513568811511,
"learning_rate": 3.866766398810424e-07,
"logits/chosen": 6155.7880859375,
"logits/rejected": 5917.6748046875,
"logps/chosen": -180.28146362304688,
"logps/rejected": -236.327880859375,
"loss": 1.3366,
"rewards/accuracies": 0.7750000357627869,
"rewards/chosen": -3.6729559898376465,
"rewards/margins": 30.630626678466797,
"rewards/rejected": -34.3035888671875,
"step": 490
},
{
"epoch": 0.3924646781789639,
"grad_norm": 14.227400790753371,
"learning_rate": 3.8088749801071496e-07,
"logits/chosen": 6715.08447265625,
"logits/rejected": 5196.7041015625,
"logps/chosen": -247.65261840820312,
"logps/rejected": -270.3143005371094,
"loss": 1.3572,
"rewards/accuracies": 0.73333340883255,
"rewards/chosen": -25.698944091796875,
"rewards/margins": 42.09914779663086,
"rewards/rejected": -67.798095703125,
"step": 500
},
{
"epoch": 0.4003139717425432,
"grad_norm": 10.674798547850948,
"learning_rate": 3.75e-07,
"logits/chosen": 5342.1806640625,
"logits/rejected": 4739.6083984375,
"logps/chosen": -199.51919555664062,
"logps/rejected": -209.77294921875,
"loss": 1.3525,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -10.036214828491211,
"rewards/margins": 33.37144088745117,
"rewards/rejected": -43.407649993896484,
"step": 510
},
{
"epoch": 0.40816326530612246,
"grad_norm": 12.39074250082983,
"learning_rate": 3.6901857004211443e-07,
"logits/chosen": 5672.80517578125,
"logits/rejected": 5283.02490234375,
"logps/chosen": -211.51986694335938,
"logps/rejected": -235.0128173828125,
"loss": 1.3601,
"rewards/accuracies": 0.6833333373069763,
"rewards/chosen": -13.053865432739258,
"rewards/margins": 26.170928955078125,
"rewards/rejected": -39.224796295166016,
"step": 520
},
{
"epoch": 0.41601255886970173,
"grad_norm": 11.108226426071516,
"learning_rate": 3.6294770291596076e-07,
"logits/chosen": 6426.45166015625,
"logits/rejected": 5303.09375,
"logps/chosen": -220.7977294921875,
"logps/rejected": -231.1540985107422,
"loss": 1.3453,
"rewards/accuracies": 0.6999999284744263,
"rewards/chosen": -7.8749799728393555,
"rewards/margins": 24.518779754638672,
"rewards/rejected": -32.39376449584961,
"step": 530
},
{
"epoch": 0.423861852433281,
"grad_norm": 31.66087255257573,
"learning_rate": 3.5679196060850034e-07,
"logits/chosen": 6119.76708984375,
"logits/rejected": 5501.98193359375,
"logps/chosen": -221.72915649414062,
"logps/rejected": -231.87255859375,
"loss": 1.3487,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": -10.911323547363281,
"rewards/margins": 32.72243881225586,
"rewards/rejected": -43.63376235961914,
"step": 540
},
{
"epoch": 0.4317111459968603,
"grad_norm": 17.116916865875684,
"learning_rate": 3.505559688866229e-07,
"logits/chosen": 5922.16259765625,
"logits/rejected": 5534.40625,
"logps/chosen": -227.80270385742188,
"logps/rejected": -273.3616638183594,
"loss": 1.3437,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": -13.062261581420898,
"rewards/margins": 35.235328674316406,
"rewards/rejected": -48.29759216308594,
"step": 550
},
{
"epoch": 0.43956043956043955,
"grad_norm": 10.303239033366689,
"learning_rate": 3.4424441382108826e-07,
"logits/chosen": 5970.333984375,
"logits/rejected": 5599.16015625,
"logps/chosen": -220.08242797851562,
"logps/rejected": -242.54141235351562,
"loss": 1.354,
"rewards/accuracies": 0.6583333611488342,
"rewards/chosen": -16.8071346282959,
"rewards/margins": 32.52507781982422,
"rewards/rejected": -49.33221435546875,
"step": 560
},
{
"epoch": 0.4474097331240188,
"grad_norm": 20.36092824335855,
"learning_rate": 3.378620382651523e-07,
"logits/chosen": 6295.93798828125,
"logits/rejected": 5818.79541015625,
"logps/chosen": -256.4508361816406,
"logps/rejected": -272.3232727050781,
"loss": 1.3442,
"rewards/accuracies": 0.7583333253860474,
"rewards/chosen": -13.490982055664062,
"rewards/margins": 35.06177520751953,
"rewards/rejected": -48.55276107788086,
"step": 570
},
{
"epoch": 0.4552590266875981,
"grad_norm": 13.847806384981444,
"learning_rate": 3.314136382905234e-07,
"logits/chosen": 6245.16455078125,
"logits/rejected": 5669.74609375,
"logps/chosen": -220.435546875,
"logps/rejected": -257.63934326171875,
"loss": 1.3525,
"rewards/accuracies": 0.7750000357627869,
"rewards/chosen": -8.020076751708984,
"rewards/margins": 36.30790328979492,
"rewards/rejected": -44.327980041503906,
"step": 580
},
{
"epoch": 0.4631083202511774,
"grad_norm": 13.55788048465109,
"learning_rate": 3.249040595833274e-07,
"logits/chosen": 6800.77880859375,
"logits/rejected": 5768.46728515625,
"logps/chosen": -242.50244140625,
"logps/rejected": -225.4458770751953,
"loss": 1.3389,
"rewards/accuracies": 0.7416666746139526,
"rewards/chosen": -10.776572227478027,
"rewards/margins": 35.810447692871094,
"rewards/rejected": -46.5870246887207,
"step": 590
},
{
"epoch": 0.47095761381475665,
"grad_norm": 19.567474002862465,
"learning_rate": 3.1833819380279023e-07,
"logits/chosen": 6432.34130859375,
"logits/rejected": 5503.3408203125,
"logps/chosen": -190.61471557617188,
"logps/rejected": -236.8105926513672,
"loss": 1.3495,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -13.066381454467773,
"rewards/margins": 34.057960510253906,
"rewards/rejected": -47.12434005737305,
"step": 600
},
{
"epoch": 0.478806907378336,
"grad_norm": 10.833448357785096,
"learning_rate": 3.11720974905373e-07,
"logits/chosen": 6166.84716796875,
"logits/rejected": 5408.181640625,
"logps/chosen": -217.9842987060547,
"logps/rejected": -233.4291534423828,
"loss": 1.3351,
"rewards/accuracies": 0.75,
"rewards/chosen": -11.866181373596191,
"rewards/margins": 36.140419006347656,
"rewards/rejected": -48.0066032409668,
"step": 610
},
{
"epoch": 0.48665620094191525,
"grad_norm": 17.584761280856203,
"learning_rate": 3.0505737543712275e-07,
"logits/chosen": 5255.32763671875,
"logits/rejected": 4338.2158203125,
"logps/chosen": -199.14022827148438,
"logps/rejected": -215.39840698242188,
"loss": 1.3499,
"rewards/accuracies": 0.8083333969116211,
"rewards/chosen": -15.381566047668457,
"rewards/margins": 38.9179801940918,
"rewards/rejected": -54.29954147338867,
"step": 620
},
{
"epoch": 0.4945054945054945,
"grad_norm": 13.336131403415491,
"learning_rate": 2.9835240279702513e-07,
"logits/chosen": 6839.3251953125,
"logits/rejected": 5872.88525390625,
"logps/chosen": -251.8268280029297,
"logps/rejected": -247.50167846679688,
"loss": 1.3415,
"rewards/accuracies": 0.7916666269302368,
"rewards/chosen": -9.010820388793945,
"rewards/margins": 44.24280548095703,
"rewards/rejected": -53.25362014770508,
"step": 630
},
{
"epoch": 0.5023547880690737,
"grad_norm": 10.500273772282682,
"learning_rate": 2.9161109547416667e-07,
"logits/chosen": 6504.427734375,
"logits/rejected": 5596.26953125,
"logps/chosen": -223.74313354492188,
"logps/rejected": -247.1144256591797,
"loss": 1.3389,
"rewards/accuracies": 0.6833333373069763,
"rewards/chosen": -12.981588363647461,
"rewards/margins": 19.623910903930664,
"rewards/rejected": -32.605499267578125,
"step": 640
},
{
"epoch": 0.5102040816326531,
"grad_norm": 13.241712923369416,
"learning_rate": 2.848385192615339e-07,
"logits/chosen": 5621.92431640625,
"logits/rejected": 4618.6728515625,
"logps/chosen": -207.3036651611328,
"logps/rejected": -212.81039428710938,
"loss": 1.3446,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -10.964346885681152,
"rewards/margins": 33.11830520629883,
"rewards/rejected": -44.08264923095703,
"step": 650
},
{
"epoch": 0.5180533751962323,
"grad_norm": 13.137564726428407,
"learning_rate": 2.780397634492949e-07,
"logits/chosen": 6302.98388671875,
"logits/rejected": 5078.0986328125,
"logps/chosen": -229.484375,
"logps/rejected": -250.4006805419922,
"loss": 1.3497,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": -12.097832679748535,
"rewards/margins": 50.35541915893555,
"rewards/rejected": -62.45325469970703,
"step": 660
},
{
"epoch": 0.5259026687598116,
"grad_norm": 12.607712109286384,
"learning_rate": 2.71219937000424e-07,
"logits/chosen": 6293.5849609375,
"logits/rejected": 5201.06005859375,
"logps/chosen": -219.1787109375,
"logps/rejected": -234.1125030517578,
"loss": 1.3522,
"rewards/accuracies": 0.7916667461395264,
"rewards/chosen": -12.419242858886719,
"rewards/margins": 32.838829040527344,
"rewards/rejected": -45.25807571411133,
"step": 670
},
{
"epoch": 0.533751962323391,
"grad_norm": 10.42216150578162,
"learning_rate": 2.6438416471154273e-07,
"logits/chosen": 6108.7177734375,
"logits/rejected": 5131.86474609375,
"logps/chosen": -223.22036743164062,
"logps/rejected": -227.4945831298828,
"loss": 1.3444,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": -9.559103012084961,
"rewards/margins": 38.708797454833984,
"rewards/rejected": -48.267906188964844,
"step": 680
},
{
"epoch": 0.5416012558869702,
"grad_norm": 17.01807369235278,
"learning_rate": 2.5753758336186326e-07,
"logits/chosen": 6047.66015625,
"logits/rejected": 5569.13134765625,
"logps/chosen": -221.77609252929688,
"logps/rejected": -264.51800537109375,
"loss": 1.3412,
"rewards/accuracies": 0.7416666746139526,
"rewards/chosen": -12.669670104980469,
"rewards/margins": 35.65166473388672,
"rewards/rejected": -48.32134246826172,
"step": 690
},
{
"epoch": 0.5494505494505495,
"grad_norm": 16.12776261618448,
"learning_rate": 2.5068533785312666e-07,
"logits/chosen": 5761.84619140625,
"logits/rejected": 5558.48583984375,
"logps/chosen": -202.7579345703125,
"logps/rejected": -238.9604034423828,
"loss": 1.3651,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -13.49761962890625,
"rewards/margins": 38.488807678222656,
"rewards/rejected": -51.986427307128906,
"step": 700
},
{
"epoch": 0.5572998430141287,
"grad_norm": 11.103396938840731,
"learning_rate": 2.4383257734343794e-07,
"logits/chosen": 5719.7939453125,
"logits/rejected": 5761.4130859375,
"logps/chosen": -207.0905303955078,
"logps/rejected": -249.865966796875,
"loss": 1.3403,
"rewards/accuracies": 0.7583333849906921,
"rewards/chosen": -13.179117202758789,
"rewards/margins": 34.066200256347656,
"rewards/rejected": -47.24531936645508,
"step": 710
},
{
"epoch": 0.565149136577708,
"grad_norm": 10.875868983762,
"learning_rate": 2.3698445137790258e-07,
"logits/chosen": 6126.095703125,
"logits/rejected": 5306.52001953125,
"logps/chosen": -227.9593505859375,
"logps/rejected": -244.30264282226562,
"loss": 1.3517,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -8.9561128616333,
"rewards/margins": 35.58136749267578,
"rewards/rejected": -44.537479400634766,
"step": 720
},
{
"epoch": 0.5729984301412873,
"grad_norm": 12.46733582160012,
"learning_rate": 2.3014610601897157e-07,
"logits/chosen": 6644.74365234375,
"logits/rejected": 5127.03857421875,
"logps/chosen": -237.8786163330078,
"logps/rejected": -223.18807983398438,
"loss": 1.3406,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -12.732693672180176,
"rewards/margins": 35.38166427612305,
"rewards/rejected": -48.11436080932617,
"step": 730
},
{
"epoch": 0.5808477237048666,
"grad_norm": 14.378248213557361,
"learning_rate": 2.2332267997940513e-07,
"logits/chosen": 5524.26220703125,
"logits/rejected": 4709.974609375,
"logps/chosen": -201.53176879882812,
"logps/rejected": -213.3249053955078,
"loss": 1.3391,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -9.107335090637207,
"rewards/margins": 41.571495056152344,
"rewards/rejected": -50.6788330078125,
"step": 740
},
{
"epoch": 0.5886970172684458,
"grad_norm": 16.649702927791314,
"learning_rate": 2.1651930076075723e-07,
"logits/chosen": 6013.10302734375,
"logits/rejected": 5475.51953125,
"logps/chosen": -194.5826416015625,
"logps/rejected": -208.33847045898438,
"loss": 1.3492,
"rewards/accuracies": 0.7000000476837158,
"rewards/chosen": -14.156700134277344,
"rewards/margins": 27.087514877319336,
"rewards/rejected": -41.24421691894531,
"step": 750
},
{
"epoch": 0.5965463108320251,
"grad_norm": 12.057829105152498,
"learning_rate": 2.0974108080028692e-07,
"logits/chosen": 6306.58837890625,
"logits/rejected": 5016.3056640625,
"logps/chosen": -212.6140594482422,
"logps/rejected": -217.46597290039062,
"loss": 1.3462,
"rewards/accuracies": 0.7416666746139526,
"rewards/chosen": -9.363100051879883,
"rewards/margins": 33.57235336303711,
"rewards/rejected": -42.935447692871094,
"step": 760
},
{
"epoch": 0.6043956043956044,
"grad_norm": 13.607431735853279,
"learning_rate": 2.0299311362918773e-07,
"logits/chosen": 6517.55224609375,
"logits/rejected": 5634.74755859375,
"logps/chosen": -242.9558563232422,
"logps/rejected": -272.95355224609375,
"loss": 1.3507,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -13.222195625305176,
"rewards/margins": 35.8712272644043,
"rewards/rejected": -49.093421936035156,
"step": 770
},
{
"epoch": 0.6122448979591837,
"grad_norm": 14.042243888509429,
"learning_rate": 1.962804700450265e-07,
"logits/chosen": 6358.8125,
"logits/rejected": 6069.78759765625,
"logps/chosen": -226.16159057617188,
"logps/rejected": -279.2201232910156,
"loss": 1.3483,
"rewards/accuracies": 0.7750000357627869,
"rewards/chosen": -10.056262016296387,
"rewards/margins": 31.77614974975586,
"rewards/rejected": -41.83241653442383,
"step": 780
},
{
"epoch": 0.6200941915227629,
"grad_norm": 13.85111247684391,
"learning_rate": 1.8960819430126334e-07,
"logits/chosen": 5926.2744140625,
"logits/rejected": 5265.1884765625,
"logps/chosen": -216.1208953857422,
"logps/rejected": -251.05642700195312,
"loss": 1.3464,
"rewards/accuracies": 0.7583333849906921,
"rewards/chosen": -17.989896774291992,
"rewards/margins": 44.46880340576172,
"rewards/rejected": -62.45869827270508,
"step": 790
},
{
"epoch": 0.6279434850863422,
"grad_norm": 12.764962415212846,
"learning_rate": 1.8298130031671972e-07,
"logits/chosen": 5927.6357421875,
"logits/rejected": 5216.50146484375,
"logps/chosen": -230.69552612304688,
"logps/rejected": -257.57598876953125,
"loss": 1.3564,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": -13.972677230834961,
"rewards/margins": 30.90505027770996,
"rewards/rejected": -44.87772750854492,
"step": 800
},
{
"epoch": 0.6357927786499215,
"grad_norm": 12.348701701738166,
"learning_rate": 1.7640476790784075e-07,
"logits/chosen": 5474.27490234375,
"logits/rejected": 4945.47509765625,
"logps/chosen": -213.3369598388672,
"logps/rejected": -264.7867736816406,
"loss": 1.3448,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -10.881568908691406,
"rewards/margins": 32.60791778564453,
"rewards/rejected": -43.48948287963867,
"step": 810
},
{
"epoch": 0.6436420722135008,
"grad_norm": 12.55787593683916,
"learning_rate": 1.6988353904658492e-07,
"logits/chosen": 5950.470703125,
"logits/rejected": 4638.33349609375,
"logps/chosen": -230.09524536132812,
"logps/rejected": -206.407470703125,
"loss": 1.3416,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -8.880694389343262,
"rewards/margins": 28.838424682617188,
"rewards/rejected": -37.71912384033203,
"step": 820
},
{
"epoch": 0.6514913657770801,
"grad_norm": 17.32057277815633,
"learning_rate": 1.634225141467513e-07,
"logits/chosen": 5889.0400390625,
"logits/rejected": 5296.57861328125,
"logps/chosen": -219.9248046875,
"logps/rejected": -244.50936889648438,
"loss": 1.3485,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": -15.738775253295898,
"rewards/margins": 36.31574249267578,
"rewards/rejected": -52.05452346801758,
"step": 830
},
{
"epoch": 0.6593406593406593,
"grad_norm": 9.825712429431242,
"learning_rate": 1.570265483815364e-07,
"logits/chosen": 6438.00390625,
"logits/rejected": 5311.1455078125,
"logps/chosen": -243.78604125976562,
"logps/rejected": -258.28704833984375,
"loss": 1.3441,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -16.456207275390625,
"rewards/margins": 29.037649154663086,
"rewards/rejected": -45.493858337402344,
"step": 840
},
{
"epoch": 0.6671899529042387,
"grad_norm": 14.735433365070342,
"learning_rate": 1.5070044803508691e-07,
"logits/chosen": 5953.31298828125,
"logits/rejected": 5381.14306640625,
"logps/chosen": -227.7479705810547,
"logps/rejected": -255.1121368408203,
"loss": 1.3349,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -12.270512580871582,
"rewards/margins": 42.38630294799805,
"rewards/rejected": -54.65681838989258,
"step": 850
},
{
"epoch": 0.6750392464678179,
"grad_norm": 14.85492459591332,
"learning_rate": 1.444489668907914e-07,
"logits/chosen": 6416.33544921875,
"logits/rejected": 5480.611328125,
"logps/chosen": -260.19989013671875,
"logps/rejected": -254.9077606201172,
"loss": 1.3516,
"rewards/accuracies": 0.7750000953674316,
"rewards/chosen": -12.429244995117188,
"rewards/margins": 39.79665756225586,
"rewards/rejected": -52.22589874267578,
"step": 860
},
{
"epoch": 0.6828885400313972,
"grad_norm": 13.017271488143887,
"learning_rate": 1.3827680265902232e-07,
"logits/chosen": 6371.8037109375,
"logits/rejected": 5308.52490234375,
"logps/chosen": -242.83413696289062,
"logps/rejected": -247.3595733642578,
"loss": 1.351,
"rewards/accuracies": 0.7583333253860474,
"rewards/chosen": -12.737371444702148,
"rewards/margins": 33.933265686035156,
"rewards/rejected": -46.67063522338867,
"step": 870
},
{
"epoch": 0.6907378335949764,
"grad_norm": 15.15778095800919,
"learning_rate": 1.3218859344701632e-07,
"logits/chosen": 5609.341796875,
"logits/rejected": 5382.73095703125,
"logps/chosen": -221.3697967529297,
"logps/rejected": -276.8291931152344,
"loss": 1.3483,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -10.858831405639648,
"rewards/margins": 34.38120651245117,
"rewards/rejected": -45.24003982543945,
"step": 880
},
{
"epoch": 0.6985871271585558,
"grad_norm": 13.765055358350205,
"learning_rate": 1.2618891427354172e-07,
"logits/chosen": 6611.1533203125,
"logits/rejected": 5410.708984375,
"logps/chosen": -267.79962158203125,
"logps/rejected": -259.8660888671875,
"loss": 1.3481,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -10.203554153442383,
"rewards/margins": 37.881988525390625,
"rewards/rejected": -48.085540771484375,
"step": 890
},
{
"epoch": 0.706436420722135,
"grad_norm": 12.391358583369788,
"learning_rate": 1.202822736309758e-07,
"logits/chosen": 5603.50537109375,
"logits/rejected": 5218.40185546875,
"logps/chosen": -215.1715087890625,
"logps/rejected": -255.24758911132812,
"loss": 1.3495,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": -10.822305679321289,
"rewards/margins": 33.995201110839844,
"rewards/rejected": -44.8175048828125,
"step": 900
},
{
"epoch": 0.7142857142857143,
"grad_norm": 18.87336390048285,
"learning_rate": 1.1447311009737299e-07,
"logits/chosen": 5508.84375,
"logits/rejected": 5254.75244140625,
"logps/chosen": -222.1977081298828,
"logps/rejected": -262.20513916015625,
"loss": 1.3453,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": -13.971402168273926,
"rewards/margins": 40.305274963378906,
"rewards/rejected": -54.27667999267578,
"step": 910
},
{
"epoch": 0.7221350078492935,
"grad_norm": 15.471482371326609,
"learning_rate": 1.0876578900107053e-07,
"logits/chosen": 6093.49951171875,
"logits/rejected": 5076.36376953125,
"logps/chosen": -245.2948455810547,
"logps/rejected": -248.81405639648438,
"loss": 1.3461,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": -12.873262405395508,
"rewards/margins": 37.79849624633789,
"rewards/rejected": -50.6717529296875,
"step": 920
},
{
"epoch": 0.7299843014128728,
"grad_norm": 11.479378316337622,
"learning_rate": 1.0316459914033793e-07,
"logits/chosen": 6001.8134765625,
"logits/rejected": 4559.4609375,
"logps/chosen": -252.53317260742188,
"logps/rejected": -239.29428100585938,
"loss": 1.3471,
"rewards/accuracies": 0.7416666746139526,
"rewards/chosen": -16.544239044189453,
"rewards/margins": 36.82581329345703,
"rewards/rejected": -53.37005615234375,
"step": 930
},
{
"epoch": 0.7378335949764521,
"grad_norm": 12.94277337339525,
"learning_rate": 9.767374956053584e-08,
"logits/chosen": 5815.173828125,
"logits/rejected": 5115.169921875,
"logps/chosen": -231.0220184326172,
"logps/rejected": -261.7562561035156,
"loss": 1.3429,
"rewards/accuracies": 0.7083333730697632,
"rewards/chosen": -12.728368759155273,
"rewards/margins": 44.04799270629883,
"rewards/rejected": -56.7763671875,
"step": 940
},
{
"epoch": 0.7456828885400314,
"grad_norm": 16.27087945734002,
"learning_rate": 9.229736639120561e-08,
"logits/chosen": 5988.3154296875,
"logits/rejected": 5553.0830078125,
"logps/chosen": -231.2310028076172,
"logps/rejected": -251.68289184570312,
"loss": 1.348,
"rewards/accuracies": 0.7416666746139526,
"rewards/chosen": -13.42981243133545,
"rewards/margins": 24.098569869995117,
"rewards/rejected": -37.528377532958984,
"step": 950
},
{
"epoch": 0.7535321821036107,
"grad_norm": 16.766717992055163,
"learning_rate": 8.70394897454659e-08,
"logits/chosen": 5841.966796875,
"logits/rejected": 5221.5361328125,
"logps/chosen": -227.2954864501953,
"logps/rejected": -253.348876953125,
"loss": 1.3363,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -8.86706829071045,
"rewards/margins": 42.00550079345703,
"rewards/rejected": -50.87256622314453,
"step": 960
},
{
"epoch": 0.7613814756671899,
"grad_norm": 17.264677009971713,
"learning_rate": 8.19040706840472e-08,
"logits/chosen": 5942.7607421875,
"logits/rejected": 4996.2412109375,
"logps/chosen": -252.40908813476562,
"logps/rejected": -269.8039855957031,
"loss": 1.3361,
"rewards/accuracies": 0.7416666746139526,
"rewards/chosen": -12.189082145690918,
"rewards/margins": 50.92434310913086,
"rewards/rejected": -63.113426208496094,
"step": 970
},
{
"epoch": 0.7692307692307693,
"grad_norm": 12.969674705460362,
"learning_rate": 7.689496824624525e-08,
"logits/chosen": 5647.4619140625,
"logits/rejected": 4565.35107421875,
"logps/chosen": -239.58450317382812,
"logps/rejected": -268.37799072265625,
"loss": 1.3324,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -10.767900466918945,
"rewards/margins": 67.11649322509766,
"rewards/rejected": -77.88438415527344,
"step": 980
},
{
"epoch": 0.7770800627943485,
"grad_norm": 23.179398971044233,
"learning_rate": 7.201594655002458e-08,
"logits/chosen": 5969.14111328125,
"logits/rejected": 5011.64013671875,
"logps/chosen": -241.0636444091797,
"logps/rejected": -262.5384216308594,
"loss": 1.3365,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -15.168705940246582,
"rewards/margins": 53.300010681152344,
"rewards/rejected": -68.46871185302734,
"step": 990
},
{
"epoch": 0.7849293563579278,
"grad_norm": 18.79279527226742,
"learning_rate": 6.727067196345099e-08,
"logits/chosen": 5659.3037109375,
"logits/rejected": 4810.89599609375,
"logps/chosen": -227.1795654296875,
"logps/rejected": -228.3984375,
"loss": 1.3449,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": -13.423723220825195,
"rewards/margins": 34.62942123413086,
"rewards/rejected": -48.053138732910156,
"step": 1000
},
{
"epoch": 0.792778649921507,
"grad_norm": 15.30089044819146,
"learning_rate": 6.26627103495786e-08,
"logits/chosen": 5842.5341796875,
"logits/rejected": 4896.11181640625,
"logps/chosen": -224.3483428955078,
"logps/rejected": -247.2809295654297,
"loss": 1.34,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -9.639090538024902,
"rewards/margins": 46.20824432373047,
"rewards/rejected": -55.84733200073242,
"step": 1010
},
{
"epoch": 0.8006279434850864,
"grad_norm": 12.810319531592627,
"learning_rate": 5.8195524386862374e-08,
"logits/chosen": 5930.25390625,
"logits/rejected": 5296.1630859375,
"logps/chosen": -257.00250244140625,
"logps/rejected": -280.92657470703125,
"loss": 1.3463,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -8.82390022277832,
"rewards/margins": 46.02201461791992,
"rewards/rejected": -54.845909118652344,
"step": 1020
},
{
"epoch": 0.8084772370486656,
"grad_norm": 11.44579430939054,
"learning_rate": 5.38724709671092e-08,
"logits/chosen": 6328.5556640625,
"logits/rejected": 5993.76171875,
"logps/chosen": -243.43869018554688,
"logps/rejected": -289.0228271484375,
"loss": 1.3372,
"rewards/accuracies": 0.783333420753479,
"rewards/chosen": -11.970319747924805,
"rewards/margins": 43.93321990966797,
"rewards/rejected": -55.903541564941406,
"step": 1030
},
{
"epoch": 0.8163265306122449,
"grad_norm": 14.741952244341237,
"learning_rate": 4.969679867292276e-08,
"logits/chosen": 5626.61572265625,
"logits/rejected": 5149.10791015625,
"logps/chosen": -236.9131317138672,
"logps/rejected": -273.8883972167969,
"loss": 1.3424,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": -16.811473846435547,
"rewards/margins": 47.892974853515625,
"rewards/rejected": -64.70445251464844,
"step": 1040
},
{
"epoch": 0.8241758241758241,
"grad_norm": 12.693662955042376,
"learning_rate": 4.5671645336537416e-08,
"logits/chosen": 5679.7373046875,
"logits/rejected": 5195.1259765625,
"logps/chosen": -251.4984130859375,
"logps/rejected": -279.0545959472656,
"loss": 1.3414,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": -14.017779350280762,
"rewards/margins": 49.24242401123047,
"rewards/rejected": -63.26020431518555,
"step": 1050
},
{
"epoch": 0.8320251177394035,
"grad_norm": 31.47444666328788,
"learning_rate": 4.180003568187776e-08,
"logits/chosen": 7014.08056640625,
"logits/rejected": 5543.162109375,
"logps/chosen": -276.7340393066406,
"logps/rejected": -269.3011169433594,
"loss": 1.3503,
"rewards/accuracies": 0.6666667461395264,
"rewards/chosen": -15.746711730957031,
"rewards/margins": 33.51522445678711,
"rewards/rejected": -49.26193618774414,
"step": 1060
},
{
"epoch": 0.8398744113029827,
"grad_norm": 16.176876775515055,
"learning_rate": 3.8084879051612144e-08,
"logits/chosen": 5845.7783203125,
"logits/rejected": 5383.59521484375,
"logps/chosen": -234.75259399414062,
"logps/rejected": -243.68917846679688,
"loss": 1.3441,
"rewards/accuracies": 0.7000000476837158,
"rewards/chosen": -9.509564399719238,
"rewards/margins": 41.015254974365234,
"rewards/rejected": -50.524818420410156,
"step": 1070
},
{
"epoch": 0.847723704866562,
"grad_norm": 14.969831250800548,
"learning_rate": 3.452896722091128e-08,
"logits/chosen": 6403.892578125,
"logits/rejected": 4980.4814453125,
"logps/chosen": -274.7662658691406,
"logps/rejected": -261.01898193359375,
"loss": 1.3305,
"rewards/accuracies": 0.8083333969116211,
"rewards/chosen": -8.196954727172852,
"rewards/margins": 51.842140197753906,
"rewards/rejected": -60.039100646972656,
"step": 1080
},
{
"epoch": 0.8555729984301413,
"grad_norm": 12.198123465136609,
"learning_rate": 3.11349722995527e-08,
"logits/chosen": 6488.9091796875,
"logits/rejected": 4886.4169921875,
"logps/chosen": -241.4394073486328,
"logps/rejected": -268.80352783203125,
"loss": 1.3471,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": -13.964780807495117,
"rewards/margins": 41.95417785644531,
"rewards/rejected": -55.9189567565918,
"step": 1090
},
{
"epoch": 0.8634222919937206,
"grad_norm": 18.075378598084896,
"learning_rate": 2.7905444723949762e-08,
"logits/chosen": 6258.9072265625,
"logits/rejected": 5193.19384765625,
"logps/chosen": -251.8688507080078,
"logps/rejected": -251.71829223632812,
"loss": 1.3449,
"rewards/accuracies": 0.7916666269302368,
"rewards/chosen": -13.332514762878418,
"rewards/margins": 48.888423919677734,
"rewards/rejected": -62.2209358215332,
"step": 1100
},
{
"epoch": 0.8712715855572999,
"grad_norm": 18.246911185615897,
"learning_rate": 2.484281134061142e-08,
"logits/chosen": 6621.4384765625,
"logits/rejected": 5365.8623046875,
"logps/chosen": -279.5318603515625,
"logps/rejected": -282.0029296875,
"loss": 1.3424,
"rewards/accuracies": 0.8083332777023315,
"rewards/chosen": -14.918279647827148,
"rewards/margins": 44.81663131713867,
"rewards/rejected": -59.73491287231445,
"step": 1110
},
{
"epoch": 0.8791208791208791,
"grad_norm": 22.551350441375604,
"learning_rate": 2.194937358247506e-08,
"logits/chosen": 6477.88916015625,
"logits/rejected": 5286.2412109375,
"logps/chosen": -260.225341796875,
"logps/rejected": -279.5767822265625,
"loss": 1.3418,
"rewards/accuracies": 0.7666667699813843,
"rewards/chosen": -15.084878921508789,
"rewards/margins": 47.721107482910156,
"rewards/rejected": -62.805992126464844,
"step": 1120
},
{
"epoch": 0.8869701726844584,
"grad_norm": 20.252362802872884,
"learning_rate": 1.9227305739481612e-08,
"logits/chosen": 5893.1474609375,
"logits/rejected": 4668.8095703125,
"logps/chosen": -245.6111297607422,
"logps/rejected": -238.3753662109375,
"loss": 1.3376,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": -10.829057693481445,
"rewards/margins": 47.32074737548828,
"rewards/rejected": -58.149803161621094,
"step": 1130
},
{
"epoch": 0.8948194662480377,
"grad_norm": 13.189894058710424,
"learning_rate": 1.6678653324693787e-08,
"logits/chosen": 6479.234375,
"logits/rejected": 5293.7001953125,
"logps/chosen": -269.5186462402344,
"logps/rejected": -273.58905029296875,
"loss": 1.3437,
"rewards/accuracies": 0.7416666746139526,
"rewards/chosen": -11.897893905639648,
"rewards/margins": 40.87809753417969,
"rewards/rejected": -52.77599334716797,
"step": 1140
},
{
"epoch": 0.902668759811617,
"grad_norm": 12.521869991300122,
"learning_rate": 1.4305331537183384e-08,
"logits/chosen": 5731.880859375,
"logits/rejected": 5293.7578125,
"logps/chosen": -239.46334838867188,
"logps/rejected": -267.51025390625,
"loss": 1.3369,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": -12.479973793029785,
"rewards/margins": 35.54231262207031,
"rewards/rejected": -48.02228927612305,
"step": 1150
},
{
"epoch": 0.9105180533751962,
"grad_norm": 13.463956997262862,
"learning_rate": 1.2109123822844653e-08,
"logits/chosen": 5900.7177734375,
"logits/rejected": 4710.4609375,
"logps/chosen": -244.7340545654297,
"logps/rejected": -246.96536254882812,
"loss": 1.3439,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -16.17725372314453,
"rewards/margins": 34.85002899169922,
"rewards/rejected": -51.027286529541016,
"step": 1160
},
{
"epoch": 0.9183673469387755,
"grad_norm": 15.662977380913924,
"learning_rate": 1.0091680534213387e-08,
"logits/chosen": 6465.8505859375,
"logits/rejected": 6233.8583984375,
"logps/chosen": -257.33880615234375,
"logps/rejected": -297.4341735839844,
"loss": 1.3457,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -14.390420913696289,
"rewards/margins": 36.296791076660156,
"rewards/rejected": -50.68721389770508,
"step": 1170
},
{
"epoch": 0.9262166405023547,
"grad_norm": 18.75305375047271,
"learning_rate": 8.254517690300944e-09,
"logits/chosen": 5696.08154296875,
"logits/rejected": 5191.6025390625,
"logps/chosen": -252.3257293701172,
"logps/rejected": -268.64801025390625,
"loss": 1.3451,
"rewards/accuracies": 0.75,
"rewards/chosen": -12.379720687866211,
"rewards/margins": 39.44649887084961,
"rewards/rejected": -51.82622146606445,
"step": 1180
},
{
"epoch": 0.9340659340659341,
"grad_norm": 14.426773906657814,
"learning_rate": 6.599015837372907e-09,
"logits/chosen": 6177.75537109375,
"logits/rejected": 5415.826171875,
"logps/chosen": -269.7903747558594,
"logps/rejected": -276.715576171875,
"loss": 1.3386,
"rewards/accuracies": 0.7166666388511658,
"rewards/chosen": -20.904890060424805,
"rewards/margins": 38.65822219848633,
"rewards/rejected": -59.5631103515625,
"step": 1190
},
{
"epoch": 0.9419152276295133,
"grad_norm": 15.760226868571879,
"learning_rate": 5.126419011529992e-09,
"logits/chosen": 6390.10302734375,
"logits/rejected": 5463.6162109375,
"logps/chosen": -267.0502014160156,
"logps/rejected": -277.47808837890625,
"loss": 1.3385,
"rewards/accuracies": 0.8083333969116211,
"rewards/chosen": -11.388493537902832,
"rewards/margins": 47.42402267456055,
"rewards/rejected": -58.81251907348633,
"step": 1200
},
{
"epoch": 0.9497645211930926,
"grad_norm": 29.730772203455786,
"learning_rate": 3.837833803870177e-09,
"logits/chosen": 5976.55224609375,
"logits/rejected": 5252.8037109375,
"logps/chosen": -253.4025115966797,
"logps/rejected": -275.3264465332031,
"loss": 1.3459,
"rewards/accuracies": 0.7750000953674316,
"rewards/chosen": -13.217000007629395,
"rewards/margins": 43.908164978027344,
"rewards/rejected": -57.125160217285156,
"step": 1210
},
{
"epoch": 0.957613814756672,
"grad_norm": 17.84856218528166,
"learning_rate": 2.734228528934679e-09,
"logits/chosen": 7450.5419921875,
"logits/rejected": 5507.4033203125,
"logps/chosen": -313.83624267578125,
"logps/rejected": -304.4243469238281,
"loss": 1.3486,
"rewards/accuracies": 0.6833333373069763,
"rewards/chosen": -17.067832946777344,
"rewards/margins": 42.159278869628906,
"rewards/rejected": -59.22711181640625,
"step": 1220
},
{
"epoch": 0.9654631083202512,
"grad_norm": 19.74856745242947,
"learning_rate": 1.8164324970625645e-09,
"logits/chosen": 6633.40478515625,
"logits/rejected": 5254.0,
"logps/chosen": -270.46966552734375,
"logps/rejected": -267.3912048339844,
"loss": 1.3434,
"rewards/accuracies": 0.75,
"rewards/chosen": -9.887968063354492,
"rewards/margins": 44.506534576416016,
"rewards/rejected": -54.394500732421875,
"step": 1230
},
{
"epoch": 0.9733124018838305,
"grad_norm": 12.427120458275336,
"learning_rate": 1.0851353912008642e-09,
"logits/chosen": 5715.10546875,
"logits/rejected": 5259.88232421875,
"logps/chosen": -249.3816680908203,
"logps/rejected": -292.0200500488281,
"loss": 1.3377,
"rewards/accuracies": 0.7416666746139526,
"rewards/chosen": -17.769298553466797,
"rewards/margins": 39.09291076660156,
"rewards/rejected": -56.862205505371094,
"step": 1240
},
{
"epoch": 0.9811616954474097,
"grad_norm": 12.98993462559583,
"learning_rate": 5.408867486384471e-10,
"logits/chosen": 5827.32421875,
"logits/rejected": 4937.1123046875,
"logps/chosen": -239.4810333251953,
"logps/rejected": -234.88510131835938,
"loss": 1.3445,
"rewards/accuracies": 0.7416666746139526,
"rewards/chosen": -9.172881126403809,
"rewards/margins": 36.28196716308594,
"rewards/rejected": -45.4548454284668,
"step": 1250
},
{
"epoch": 0.989010989010989,
"grad_norm": 16.16222617431415,
"learning_rate": 1.840955480532924e-10,
"logits/chosen": 5506.1591796875,
"logits/rejected": 5235.78662109375,
"logps/chosen": -246.6016082763672,
"logps/rejected": -265.4342956542969,
"loss": 1.3381,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -14.441003799438477,
"rewards/margins": 34.66820526123047,
"rewards/rejected": -49.10921096801758,
"step": 1260
},
{
"epoch": 0.9968602825745683,
"grad_norm": 19.003423412523194,
"learning_rate": 1.502990218302247e-11,
"logits/chosen": 5780.91015625,
"logits/rejected": 4716.0341796875,
"logps/chosen": -237.00357055664062,
"logps/rejected": -240.70358276367188,
"loss": 1.3392,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": -12.52961540222168,
"rewards/margins": 41.86973571777344,
"rewards/rejected": -54.39934539794922,
"step": 1270
},
{
"epoch": 1.0,
"step": 1274,
"total_flos": 0.0,
"train_loss": 1.3517364699574805,
"train_runtime": 14845.1399,
"train_samples_per_second": 4.118,
"train_steps_per_second": 0.086
}
],
"logging_steps": 10,
"max_steps": 1274,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}