IL_DPO48-zephyr-7b-sft-full / trainer_state.json
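The JSON below is the raw Trainer state saved during the DPO run; each entry in "log_history" records one logging step with the DPO metrics (loss, rewards/accuracies, rewards/chosen, rewards/margins, rewards/rejected, learning_rate, grad_norm). A minimal sketch for inspecting it offline, assuming the file has been downloaded locally as trainer_state.json and that matplotlib is installed (not part of the original file):

import json

import matplotlib.pyplot as plt

# Load the trainer state written by the Hugging Face Trainer.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only training-log entries (they carry a "loss" field).
history = [h for h in state["log_history"] if "loss" in h]

steps = [h["step"] for h in history]
loss = [h["loss"] for h in history]
margins = [h["rewards/margins"] for h in history]

# Plot DPO loss and reward margin against the global step.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.plot(steps, loss)
ax1.set_xlabel("step")
ax1.set_ylabel("loss")
ax2.plot(steps, margins)
ax2.set_xlabel("step")
ax2.set_ylabel("rewards/margins")
fig.tight_layout()
plt.show()
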
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997382884061764,
"eval_steps": 500,
"global_step": 1910,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005234231876472127,
"grad_norm": 7.5491774607562485,
"learning_rate": 2.617801047120419e-09,
"logits/chosen": 5773.244140625,
"logits/rejected": 4887.3955078125,
"logps/chosen": -261.77630615234375,
"logps/rejected": -134.50271606445312,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.005234231876472127,
"grad_norm": 7.564045160748545,
"learning_rate": 2.6178010471204188e-08,
"logits/chosen": 4445.29443359375,
"logits/rejected": 4136.89404296875,
"logps/chosen": -199.90216064453125,
"logps/rejected": -178.72950744628906,
"loss": 0.693,
"rewards/accuracies": 0.5138888955116272,
"rewards/chosen": 0.0001119289590860717,
"rewards/margins": 0.000557027175091207,
"rewards/rejected": -0.0004450982087291777,
"step": 10
},
{
"epoch": 0.010468463752944255,
"grad_norm": 7.04613658824832,
"learning_rate": 5.2356020942408376e-08,
"logits/chosen": 6441.7216796875,
"logits/rejected": 5833.8310546875,
"logps/chosen": -267.2023010253906,
"logps/rejected": -242.09786987304688,
"loss": 0.6932,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.0004725625622086227,
"rewards/margins": -0.0009369999170303345,
"rewards/rejected": 0.00046443723840638995,
"step": 20
},
{
"epoch": 0.015702695629416383,
"grad_norm": 7.050014404404103,
"learning_rate": 7.853403141361257e-08,
"logits/chosen": 6073.69384765625,
"logits/rejected": 4584.10400390625,
"logps/chosen": -242.3122100830078,
"logps/rejected": -186.73757934570312,
"loss": 0.6932,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.0008681340259499848,
"rewards/margins": -0.0006206175312399864,
"rewards/rejected": -0.0002475165529176593,
"step": 30
},
{
"epoch": 0.02093692750588851,
"grad_norm": 7.0094537847752,
"learning_rate": 1.0471204188481675e-07,
"logits/chosen": 6178.7880859375,
"logits/rejected": 5119.3330078125,
"logps/chosen": -267.6510925292969,
"logps/rejected": -238.3938446044922,
"loss": 0.6929,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 5.8413388615008444e-05,
"rewards/margins": 0.0008872878970578313,
"rewards/rejected": -0.0008288744720630348,
"step": 40
},
{
"epoch": 0.02617115938236064,
"grad_norm": 6.498624484675514,
"learning_rate": 1.3089005235602092e-07,
"logits/chosen": 5807.2255859375,
"logits/rejected": 4976.87890625,
"logps/chosen": -232.0266571044922,
"logps/rejected": -215.0687255859375,
"loss": 0.6929,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -6.710218440275639e-05,
"rewards/margins": 0.0002581426524557173,
"rewards/rejected": -0.00032524490961804986,
"step": 50
},
{
"epoch": 0.031405391258832765,
"grad_norm": 6.354896668199181,
"learning_rate": 1.5706806282722514e-07,
"logits/chosen": 5920.17041015625,
"logits/rejected": 4380.2998046875,
"logps/chosen": -276.4042053222656,
"logps/rejected": -198.1670684814453,
"loss": 0.6924,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.0011509377509355545,
"rewards/margins": 0.0029835705645382404,
"rewards/rejected": -0.0018326330464333296,
"step": 60
},
{
"epoch": 0.036639623135304895,
"grad_norm": 7.188225691003244,
"learning_rate": 1.8324607329842932e-07,
"logits/chosen": 5793.0302734375,
"logits/rejected": 5064.73046875,
"logps/chosen": -241.7870330810547,
"logps/rejected": -217.55068969726562,
"loss": 0.692,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.0016902139177545905,
"rewards/margins": 0.005393642000854015,
"rewards/rejected": -0.0037034284323453903,
"step": 70
},
{
"epoch": 0.04187385501177702,
"grad_norm": 6.885409466782051,
"learning_rate": 2.094240837696335e-07,
"logits/chosen": 5731.5439453125,
"logits/rejected": 4790.80517578125,
"logps/chosen": -230.2675018310547,
"logps/rejected": -203.81747436523438,
"loss": 0.6916,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.0019947488326579332,
"rewards/margins": 0.0073792897164821625,
"rewards/rejected": -0.005384541116654873,
"step": 80
},
{
"epoch": 0.04710808688824915,
"grad_norm": 7.01483850364403,
"learning_rate": 2.356020942408377e-07,
"logits/chosen": 6064.4345703125,
"logits/rejected": 5340.29443359375,
"logps/chosen": -245.2501983642578,
"logps/rejected": -234.0878143310547,
"loss": 0.6913,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.0002847136929631233,
"rewards/margins": 0.00501064071431756,
"rewards/rejected": -0.0052953544072806835,
"step": 90
},
{
"epoch": 0.05234231876472128,
"grad_norm": 6.584750614575209,
"learning_rate": 2.6178010471204185e-07,
"logits/chosen": 5483.78662109375,
"logits/rejected": 4830.17626953125,
"logps/chosen": -195.8482208251953,
"logps/rejected": -172.69119262695312,
"loss": 0.6908,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.006601253990083933,
"rewards/margins": 0.006475942675024271,
"rewards/rejected": -0.013077196665108204,
"step": 100
},
{
"epoch": 0.05757655064119341,
"grad_norm": 7.00116071266525,
"learning_rate": 2.879581151832461e-07,
"logits/chosen": 4919.4482421875,
"logits/rejected": 3946.84765625,
"logps/chosen": -207.5120086669922,
"logps/rejected": -149.10848999023438,
"loss": 0.6895,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0063446699641644955,
"rewards/margins": 0.012786591425538063,
"rewards/rejected": -0.019131261855363846,
"step": 110
},
{
"epoch": 0.06281078251766553,
"grad_norm": 6.875094615901205,
"learning_rate": 3.1413612565445027e-07,
"logits/chosen": 6150.2900390625,
"logits/rejected": 5531.5439453125,
"logps/chosen": -241.3804473876953,
"logps/rejected": -234.3568572998047,
"loss": 0.686,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.007997828535735607,
"rewards/margins": 0.03657924011349678,
"rewards/rejected": -0.044577065855264664,
"step": 120
},
{
"epoch": 0.06804501439413765,
"grad_norm": 7.22615793159286,
"learning_rate": 3.4031413612565446e-07,
"logits/chosen": 6236.9755859375,
"logits/rejected": 4412.3017578125,
"logps/chosen": -223.0286865234375,
"logps/rejected": -177.5249786376953,
"loss": 0.6845,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0182146318256855,
"rewards/margins": 0.040880750864744186,
"rewards/rejected": -0.059095390141010284,
"step": 130
},
{
"epoch": 0.07327924627060979,
"grad_norm": 7.647819285658808,
"learning_rate": 3.6649214659685864e-07,
"logits/chosen": 5931.47900390625,
"logits/rejected": 5780.89208984375,
"logps/chosen": -238.3067169189453,
"logps/rejected": -247.47079467773438,
"loss": 0.6811,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.05609896779060364,
"rewards/margins": 0.04913746565580368,
"rewards/rejected": -0.10523643344640732,
"step": 140
},
{
"epoch": 0.07851347814708191,
"grad_norm": 8.236442048395077,
"learning_rate": 3.926701570680628e-07,
"logits/chosen": 5606.55029296875,
"logits/rejected": 5088.86279296875,
"logps/chosen": -234.2759246826172,
"logps/rejected": -225.5093994140625,
"loss": 0.6813,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.061849020421504974,
"rewards/margins": 0.0713229849934578,
"rewards/rejected": -0.13317202031612396,
"step": 150
},
{
"epoch": 0.08374771002355404,
"grad_norm": 7.993800474590215,
"learning_rate": 4.18848167539267e-07,
"logits/chosen": 5549.6689453125,
"logits/rejected": 4999.32763671875,
"logps/chosen": -210.8323211669922,
"logps/rejected": -230.56655883789062,
"loss": 0.6741,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.14567852020263672,
"rewards/margins": 0.10253773629665375,
"rewards/rejected": -0.24821624159812927,
"step": 160
},
{
"epoch": 0.08898194190002617,
"grad_norm": 8.807660704706082,
"learning_rate": 4.450261780104712e-07,
"logits/chosen": 6826.31787109375,
"logits/rejected": 5490.9287109375,
"logps/chosen": -267.2113952636719,
"logps/rejected": -253.62295532226562,
"loss": 0.6684,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.28720229864120483,
"rewards/margins": 0.1500168889760971,
"rewards/rejected": -0.4372192323207855,
"step": 170
},
{
"epoch": 0.0942161737764983,
"grad_norm": 13.018768437683475,
"learning_rate": 4.712041884816754e-07,
"logits/chosen": 6161.29736328125,
"logits/rejected": 4387.1025390625,
"logps/chosen": -280.9503479003906,
"logps/rejected": -251.7024383544922,
"loss": 0.6672,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.4883364737033844,
"rewards/margins": 0.13436347246170044,
"rewards/rejected": -0.6226999163627625,
"step": 180
},
{
"epoch": 0.09945040565297043,
"grad_norm": 12.166316451485214,
"learning_rate": 4.973821989528796e-07,
"logits/chosen": 5830.9501953125,
"logits/rejected": 5651.06298828125,
"logps/chosen": -257.42633056640625,
"logps/rejected": -298.8231506347656,
"loss": 0.6572,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.46903976798057556,
"rewards/margins": 0.2048400640487671,
"rewards/rejected": -0.6738797426223755,
"step": 190
},
{
"epoch": 0.10468463752944256,
"grad_norm": 10.296880781028285,
"learning_rate": 4.999661831436498e-07,
"logits/chosen": 5897.57373046875,
"logits/rejected": 5823.5986328125,
"logps/chosen": -264.2397155761719,
"logps/rejected": -303.2627868652344,
"loss": 0.6599,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.4873962998390198,
"rewards/margins": 0.25847315788269043,
"rewards/rejected": -0.7458693981170654,
"step": 200
},
{
"epoch": 0.10991886940591468,
"grad_norm": 12.312533931256393,
"learning_rate": 4.998492971140339e-07,
"logits/chosen": 5829.45654296875,
"logits/rejected": 5781.94775390625,
"logps/chosen": -262.94244384765625,
"logps/rejected": -321.5575866699219,
"loss": 0.655,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.5211669206619263,
"rewards/margins": 0.3335101306438446,
"rewards/rejected": -0.8546770215034485,
"step": 210
},
{
"epoch": 0.11515310128238682,
"grad_norm": 11.413061792372044,
"learning_rate": 4.996489634487865e-07,
"logits/chosen": 5954.07958984375,
"logits/rejected": 5074.4462890625,
"logps/chosen": -295.57037353515625,
"logps/rejected": -291.2997131347656,
"loss": 0.6611,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.687902569770813,
"rewards/margins": 0.26726865768432617,
"rewards/rejected": -0.9551712870597839,
"step": 220
},
{
"epoch": 0.12038733315885894,
"grad_norm": 16.092022253534562,
"learning_rate": 4.993652490577246e-07,
"logits/chosen": 6523.6455078125,
"logits/rejected": 5203.65869140625,
"logps/chosen": -303.7278137207031,
"logps/rejected": -307.8695983886719,
"loss": 0.649,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.7638736367225647,
"rewards/margins": 0.3057602047920227,
"rewards/rejected": -1.0696338415145874,
"step": 230
},
{
"epoch": 0.12562156503533106,
"grad_norm": 10.894941993110562,
"learning_rate": 4.9899824869915e-07,
"logits/chosen": 5843.22705078125,
"logits/rejected": 4340.3564453125,
"logps/chosen": -299.8017578125,
"logps/rejected": -266.58160400390625,
"loss": 0.6545,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.778353214263916,
"rewards/margins": 0.2908143997192383,
"rewards/rejected": -1.0691677331924438,
"step": 240
},
{
"epoch": 0.13085579691180318,
"grad_norm": 15.436510071051824,
"learning_rate": 4.985480849482012e-07,
"logits/chosen": 5789.1865234375,
"logits/rejected": 5862.6337890625,
"logps/chosen": -273.215087890625,
"logps/rejected": -316.2986755371094,
"loss": 0.6496,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.694969654083252,
"rewards/margins": 0.2356947660446167,
"rewards/rejected": -0.9306643605232239,
"step": 250
},
{
"epoch": 0.1360900287882753,
"grad_norm": 16.967835475128144,
"learning_rate": 4.980149081559142e-07,
"logits/chosen": 6428.578125,
"logits/rejected": 6090.5703125,
"logps/chosen": -351.8347473144531,
"logps/rejected": -366.26715087890625,
"loss": 0.6454,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.9397789239883423,
"rewards/margins": 0.3180678188800812,
"rewards/rejected": -1.2578465938568115,
"step": 260
},
{
"epoch": 0.14132426066474746,
"grad_norm": 20.655525821311087,
"learning_rate": 4.973988963990065e-07,
"logits/chosen": 5191.80419921875,
"logits/rejected": 4412.33642578125,
"logps/chosen": -310.77447509765625,
"logps/rejected": -351.3142395019531,
"loss": 0.6489,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.0443050861358643,
"rewards/margins": 0.456368625164032,
"rewards/rejected": -1.500673532485962,
"step": 270
},
{
"epoch": 0.14655849254121958,
"grad_norm": 16.53683127766641,
"learning_rate": 4.967002554204008e-07,
"logits/chosen": 5606.6220703125,
"logits/rejected": 4663.47998046875,
"logps/chosen": -362.4611511230469,
"logps/rejected": -385.1017761230469,
"loss": 0.6329,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.3439080715179443,
"rewards/margins": 0.5687575936317444,
"rewards/rejected": -1.9126653671264648,
"step": 280
},
{
"epoch": 0.1517927244176917,
"grad_norm": 13.731548773970651,
"learning_rate": 4.959192185605087e-07,
"logits/chosen": 5860.9970703125,
"logits/rejected": 5171.845703125,
"logps/chosen": -345.3323974609375,
"logps/rejected": -396.91387939453125,
"loss": 0.6405,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.2838389873504639,
"rewards/margins": 0.4448428153991699,
"rewards/rejected": -1.7286819219589233,
"step": 290
},
{
"epoch": 0.15702695629416383,
"grad_norm": 15.516769429678961,
"learning_rate": 4.950560466792969e-07,
"logits/chosen": 6540.11181640625,
"logits/rejected": 5237.14306640625,
"logps/chosen": -370.7175598144531,
"logps/rejected": -381.68731689453125,
"loss": 0.647,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.0437076091766357,
"rewards/margins": 0.41619840264320374,
"rewards/rejected": -1.4599062204360962,
"step": 300
},
{
"epoch": 0.16226118817063595,
"grad_norm": 15.23495566455289,
"learning_rate": 4.941110280691619e-07,
"logits/chosen": 5895.0712890625,
"logits/rejected": 4663.57666015625,
"logps/chosen": -328.5111999511719,
"logps/rejected": -317.84136962890625,
"loss": 0.6316,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.9885784983634949,
"rewards/margins": 0.467812716960907,
"rewards/rejected": -1.4563910961151123,
"step": 310
},
{
"epoch": 0.16749542004710807,
"grad_norm": 12.994410953517146,
"learning_rate": 4.930844783586424e-07,
"logits/chosen": 5147.50830078125,
"logits/rejected": 4891.75927734375,
"logps/chosen": -270.1437072753906,
"logps/rejected": -316.5980529785156,
"loss": 0.6442,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.0208574533462524,
"rewards/margins": 0.3713577687740326,
"rewards/rejected": -1.392215371131897,
"step": 320
},
{
"epoch": 0.17272965192358022,
"grad_norm": 25.668033482423173,
"learning_rate": 4.919767404070033e-07,
"logits/chosen": 6307.4296875,
"logits/rejected": 5151.60400390625,
"logps/chosen": -341.2019958496094,
"logps/rejected": -356.7355651855469,
"loss": 0.6357,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1538581848144531,
"rewards/margins": 0.4713706970214844,
"rewards/rejected": -1.6252288818359375,
"step": 330
},
{
"epoch": 0.17796388380005235,
"grad_norm": 18.566603418251706,
"learning_rate": 4.907881841897216e-07,
"logits/chosen": 5456.0732421875,
"logits/rejected": 5621.28564453125,
"logps/chosen": -366.95880126953125,
"logps/rejected": -429.9764709472656,
"loss": 0.6446,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.5142645835876465,
"rewards/margins": 0.40540844202041626,
"rewards/rejected": -1.919672966003418,
"step": 340
},
{
"epoch": 0.18319811567652447,
"grad_norm": 15.467065391000633,
"learning_rate": 4.895192066749189e-07,
"logits/chosen": 5902.5888671875,
"logits/rejected": 4471.02490234375,
"logps/chosen": -372.2309265136719,
"logps/rejected": -398.52490234375,
"loss": 0.6217,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.5484896898269653,
"rewards/margins": 0.45622071623802185,
"rewards/rejected": -2.0047104358673096,
"step": 350
},
{
"epoch": 0.1884323475529966,
"grad_norm": 15.119783236904505,
"learning_rate": 4.881702316907768e-07,
"logits/chosen": 6141.3212890625,
"logits/rejected": 4610.8212890625,
"logps/chosen": -334.36376953125,
"logps/rejected": -341.06304931640625,
"loss": 0.6372,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.1254819631576538,
"rewards/margins": 0.5175460577011108,
"rewards/rejected": -1.6430280208587646,
"step": 360
},
{
"epoch": 0.19366657942946872,
"grad_norm": 16.916135709316627,
"learning_rate": 4.86741709783982e-07,
"logits/chosen": 5536.07177734375,
"logits/rejected": 4676.4970703125,
"logps/chosen": -308.6365661621094,
"logps/rejected": -361.42022705078125,
"loss": 0.6438,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.0314075946807861,
"rewards/margins": 0.6450502276420593,
"rewards/rejected": -1.6764577627182007,
"step": 370
},
{
"epoch": 0.19890081130594087,
"grad_norm": 20.375718209590385,
"learning_rate": 4.85234118069247e-07,
"logits/chosen": 6313.5400390625,
"logits/rejected": 5581.75537109375,
"logps/chosen": -365.587646484375,
"logps/rejected": -383.8091735839844,
"loss": 0.6376,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.2571805715560913,
"rewards/margins": 0.49333277344703674,
"rewards/rejected": -1.7505133152008057,
"step": 380
},
{
"epoch": 0.204135043182413,
"grad_norm": 22.004393446801256,
"learning_rate": 4.836479600699578e-07,
"logits/chosen": 5796.1845703125,
"logits/rejected": 5391.08056640625,
"logps/chosen": -358.70281982421875,
"logps/rejected": -422.412841796875,
"loss": 0.652,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.4709709882736206,
"rewards/margins": 0.5307806730270386,
"rewards/rejected": -2.0017518997192383,
"step": 390
},
{
"epoch": 0.2093692750588851,
"grad_norm": 16.72031008823946,
"learning_rate": 4.819837655500013e-07,
"logits/chosen": 6321.2421875,
"logits/rejected": 6179.9267578125,
"logps/chosen": -391.6398620605469,
"logps/rejected": -447.68701171875,
"loss": 0.6263,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.5102037191390991,
"rewards/margins": 0.5057711601257324,
"rewards/rejected": -2.015974998474121,
"step": 400
},
{
"epoch": 0.21460350693535724,
"grad_norm": 13.254253162407238,
"learning_rate": 4.802420903368285e-07,
"logits/chosen": 5838.13427734375,
"logits/rejected": 4767.97265625,
"logps/chosen": -323.6955871582031,
"logps/rejected": -403.03204345703125,
"loss": 0.6262,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3097789287567139,
"rewards/margins": 0.8338877558708191,
"rewards/rejected": -2.1436662673950195,
"step": 410
},
{
"epoch": 0.21983773881182936,
"grad_norm": 14.878076929512742,
"learning_rate": 4.784235161358123e-07,
"logits/chosen": 6580.14453125,
"logits/rejected": 5022.2802734375,
"logps/chosen": -370.36663818359375,
"logps/rejected": -406.0109558105469,
"loss": 0.6325,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.3005058765411377,
"rewards/margins": 0.645524263381958,
"rewards/rejected": -1.9460302591323853,
"step": 420
},
{
"epoch": 0.22507197068830148,
"grad_norm": 20.06439838050598,
"learning_rate": 4.7652865033596314e-07,
"logits/chosen": 6275.22607421875,
"logits/rejected": 5113.31591796875,
"logps/chosen": -382.3496398925781,
"logps/rejected": -440.8421936035156,
"loss": 0.6318,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.6066843271255493,
"rewards/margins": 0.5545600652694702,
"rewards/rejected": -2.1612443923950195,
"step": 430
},
{
"epoch": 0.23030620256477363,
"grad_norm": 22.120777825162968,
"learning_rate": 4.7455812580706534e-07,
"logits/chosen": 5785.953125,
"logits/rejected": 4642.66162109375,
"logps/chosen": -327.7315673828125,
"logps/rejected": -375.60174560546875,
"loss": 0.621,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1555116176605225,
"rewards/margins": 0.5638757944107056,
"rewards/rejected": -1.719387412071228,
"step": 440
},
{
"epoch": 0.23554043444124576,
"grad_norm": 13.51190093535208,
"learning_rate": 4.725126006883046e-07,
"logits/chosen": 5409.0078125,
"logits/rejected": 5192.5322265625,
"logps/chosen": -322.37652587890625,
"logps/rejected": -383.2165832519531,
"loss": 0.6344,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1335276365280151,
"rewards/margins": 0.5543726682662964,
"rewards/rejected": -1.687900185585022,
"step": 450
},
{
"epoch": 0.24077466631771788,
"grad_norm": 15.29005551288156,
"learning_rate": 4.703927581684539e-07,
"logits/chosen": 5768.34326171875,
"logits/rejected": 5688.51318359375,
"logps/chosen": -342.89410400390625,
"logps/rejected": -355.6271667480469,
"loss": 0.6524,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.247072458267212,
"rewards/margins": 0.38124534487724304,
"rewards/rejected": -1.6283178329467773,
"step": 460
},
{
"epoch": 0.24600889819419,
"grad_norm": 14.004434288132737,
"learning_rate": 4.68199306257695e-07,
"logits/chosen": 5412.37744140625,
"logits/rejected": 4303.890625,
"logps/chosen": -360.8803405761719,
"logps/rejected": -420.22076416015625,
"loss": 0.6139,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.478992223739624,
"rewards/margins": 0.6786683797836304,
"rewards/rejected": -2.157660722732544,
"step": 470
},
{
"epoch": 0.2512431300706621,
"grad_norm": 20.211543807599117,
"learning_rate": 4.6593297755114776e-07,
"logits/chosen": 6246.66943359375,
"logits/rejected": 5820.33935546875,
"logps/chosen": -369.6717834472656,
"logps/rejected": -455.38494873046875,
"loss": 0.6433,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.606078863143921,
"rewards/margins": 0.5704205632209778,
"rewards/rejected": -2.176499605178833,
"step": 480
},
{
"epoch": 0.2564773619471343,
"grad_norm": 12.654030981602599,
"learning_rate": 4.635945289841902e-07,
"logits/chosen": 4824.7998046875,
"logits/rejected": 4868.42724609375,
"logps/chosen": -301.3868713378906,
"logps/rejected": -385.3939208984375,
"loss": 0.6484,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.299076795578003,
"rewards/margins": 0.41370564699172974,
"rewards/rejected": -1.7127822637557983,
"step": 490
},
{
"epoch": 0.26171159382360637,
"grad_norm": 21.014153020532053,
"learning_rate": 4.611847415796476e-07,
"logits/chosen": 6195.263671875,
"logits/rejected": 5270.9248046875,
"logps/chosen": -342.86016845703125,
"logps/rejected": -348.72308349609375,
"loss": 0.6511,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.059452772140503,
"rewards/margins": 0.3982711434364319,
"rewards/rejected": -1.4577242136001587,
"step": 500
},
{
"epoch": 0.2669458257000785,
"grad_norm": 15.629527805404802,
"learning_rate": 4.5870442018693773e-07,
"logits/chosen": 5918.3779296875,
"logits/rejected": 5355.09912109375,
"logps/chosen": -324.29803466796875,
"logps/rejected": -372.2521667480469,
"loss": 0.632,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0098707675933838,
"rewards/margins": 0.4723685681819916,
"rewards/rejected": -1.4822394847869873,
"step": 510
},
{
"epoch": 0.2721800575765506,
"grad_norm": 21.676809757975366,
"learning_rate": 4.5615439321325735e-07,
"logits/chosen": 6207.53173828125,
"logits/rejected": 4946.9072265625,
"logps/chosen": -332.4702453613281,
"logps/rejected": -391.6280212402344,
"loss": 0.6148,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.026963472366333,
"rewards/margins": 0.6531401872634888,
"rewards/rejected": -1.6801038980484009,
"step": 520
},
{
"epoch": 0.27741428945302277,
"grad_norm": 23.79952337893574,
"learning_rate": 4.535355123469008e-07,
"logits/chosen": 5684.533203125,
"logits/rejected": 5139.0107421875,
"logps/chosen": -371.2861022949219,
"logps/rejected": -437.2891540527344,
"loss": 0.6285,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.5409961938858032,
"rewards/margins": 0.7230764627456665,
"rewards/rejected": -2.2640726566314697,
"step": 530
},
{
"epoch": 0.2826485213294949,
"grad_norm": 18.16354981413204,
"learning_rate": 4.5084865227280366e-07,
"logits/chosen": 5638.453125,
"logits/rejected": 5075.7314453125,
"logps/chosen": -398.3193054199219,
"logps/rejected": -441.16033935546875,
"loss": 0.63,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.6414705514907837,
"rewards/margins": 0.6848443746566772,
"rewards/rejected": -2.326314687728882,
"step": 540
},
{
"epoch": 0.287882753205967,
"grad_norm": 26.021483127779707,
"learning_rate": 4.4809471038040437e-07,
"logits/chosen": 5500.9501953125,
"logits/rejected": 4291.2802734375,
"logps/chosen": -389.2489013671875,
"logps/rejected": -409.811279296875,
"loss": 0.641,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.5569204092025757,
"rewards/margins": 0.7008808851242065,
"rewards/rejected": -2.2578012943267822,
"step": 550
},
{
"epoch": 0.29311698508243916,
"grad_norm": 15.956576081472086,
"learning_rate": 4.4527460646392386e-07,
"logits/chosen": 5543.23193359375,
"logits/rejected": 5107.40625,
"logps/chosen": -328.09698486328125,
"logps/rejected": -381.325439453125,
"loss": 0.6394,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3435633182525635,
"rewards/margins": 0.45007848739624023,
"rewards/rejected": -1.7936416864395142,
"step": 560
},
{
"epoch": 0.29835121695891126,
"grad_norm": 13.093007587120157,
"learning_rate": 4.4238928241516163e-07,
"logits/chosen": 6740.7314453125,
"logits/rejected": 5075.4892578125,
"logps/chosen": -383.84674072265625,
"logps/rejected": -408.04046630859375,
"loss": 0.62,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.2751758098602295,
"rewards/margins": 0.8238226175308228,
"rewards/rejected": -2.0989983081817627,
"step": 570
},
{
"epoch": 0.3035854488353834,
"grad_norm": 24.06019117727656,
"learning_rate": 4.394397019089116e-07,
"logits/chosen": 5973.04150390625,
"logits/rejected": 4739.271484375,
"logps/chosen": -371.7142028808594,
"logps/rejected": -389.0022888183594,
"loss": 0.626,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.3149608373641968,
"rewards/margins": 0.5819457173347473,
"rewards/rejected": -1.8969066143035889,
"step": 580
},
{
"epoch": 0.30881968071185556,
"grad_norm": 17.81896374953663,
"learning_rate": 4.3642685008110246e-07,
"logits/chosen": 5682.49365234375,
"logits/rejected": 4360.3330078125,
"logps/chosen": -321.8192138671875,
"logps/rejected": -370.5431823730469,
"loss": 0.6423,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.199681043624878,
"rewards/margins": 0.7428802251815796,
"rewards/rejected": -1.942561149597168,
"step": 590
},
{
"epoch": 0.31405391258832765,
"grad_norm": 16.935052692220793,
"learning_rate": 4.333517331997704e-07,
"logits/chosen": 6167.5615234375,
"logits/rejected": 5758.603515625,
"logps/chosen": -402.3914794921875,
"logps/rejected": -434.56158447265625,
"loss": 0.6304,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.5500683784484863,
"rewards/margins": 0.46028876304626465,
"rewards/rejected": -2.01035737991333,
"step": 600
},
{
"epoch": 0.3192881444647998,
"grad_norm": 15.773609977818438,
"learning_rate": 4.302153783289736e-07,
"logits/chosen": 5890.45947265625,
"logits/rejected": 4988.90380859375,
"logps/chosen": -399.48944091796875,
"logps/rejected": -501.8160705566406,
"loss": 0.5844,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.8650957345962524,
"rewards/margins": 0.8637407422065735,
"rewards/rejected": -2.7288365364074707,
"step": 610
},
{
"epoch": 0.3245223763412719,
"grad_norm": 20.438404398459674,
"learning_rate": 4.2701883298576124e-07,
"logits/chosen": 5650.4580078125,
"logits/rejected": 5150.5224609375,
"logps/chosen": -462.61883544921875,
"logps/rejected": -513.2371826171875,
"loss": 0.6356,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.343827724456787,
"rewards/margins": 0.8286565542221069,
"rewards/rejected": -3.1724846363067627,
"step": 620
},
{
"epoch": 0.32975660821774405,
"grad_norm": 28.24293371703605,
"learning_rate": 4.237631647903115e-07,
"logits/chosen": 5648.98046875,
"logits/rejected": 4617.064453125,
"logps/chosen": -411.988525390625,
"logps/rejected": -463.56158447265625,
"loss": 0.6294,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.997859239578247,
"rewards/margins": 0.6983556747436523,
"rewards/rejected": -2.6962146759033203,
"step": 630
},
{
"epoch": 0.33499084009421615,
"grad_norm": 23.577036886324247,
"learning_rate": 4.204494611093548e-07,
"logits/chosen": 5993.8974609375,
"logits/rejected": 4195.65283203125,
"logps/chosen": -419.8607482910156,
"logps/rejected": -440.91717529296875,
"loss": 0.6299,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.721379280090332,
"rewards/margins": 0.695422887802124,
"rewards/rejected": -2.416802406311035,
"step": 640
},
{
"epoch": 0.3402250719706883,
"grad_norm": 18.70040237006655,
"learning_rate": 4.1707882869300235e-07,
"logits/chosen": 6020.3857421875,
"logits/rejected": 4892.1318359375,
"logps/chosen": -388.27813720703125,
"logps/rejected": -392.47674560546875,
"loss": 0.6304,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.5754492282867432,
"rewards/margins": 0.5581509470939636,
"rewards/rejected": -2.1335999965667725,
"step": 650
},
{
"epoch": 0.34545930384716045,
"grad_norm": 18.77689044696186,
"learning_rate": 4.136523933051005e-07,
"logits/chosen": 6190.458984375,
"logits/rejected": 5476.84912109375,
"logps/chosen": -394.31134033203125,
"logps/rejected": -425.36248779296875,
"loss": 0.6175,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.6519289016723633,
"rewards/margins": 0.5381680130958557,
"rewards/rejected": -2.190096616744995,
"step": 660
},
{
"epoch": 0.35069353572363254,
"grad_norm": 18.186712218474053,
"learning_rate": 4.101712993472348e-07,
"logits/chosen": 6320.23828125,
"logits/rejected": 5412.2626953125,
"logps/chosen": -394.0950622558594,
"logps/rejected": -413.16644287109375,
"loss": 0.6309,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.6630204916000366,
"rewards/margins": 0.59214186668396,
"rewards/rejected": -2.255162477493286,
"step": 670
},
{
"epoch": 0.3559277676001047,
"grad_norm": 21.426538798598312,
"learning_rate": 4.066367094765091e-07,
"logits/chosen": 5823.1728515625,
"logits/rejected": 4670.80224609375,
"logps/chosen": -417.28515625,
"logps/rejected": -464.26654052734375,
"loss": 0.6031,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.7154357433319092,
"rewards/margins": 0.9158226251602173,
"rewards/rejected": -2.631258487701416,
"step": 680
},
{
"epoch": 0.3611619994765768,
"grad_norm": 19.144193841746027,
"learning_rate": 4.0304980421722766e-07,
"logits/chosen": 5696.5908203125,
"logits/rejected": 5137.9638671875,
"logps/chosen": -425.8158264160156,
"logps/rejected": -490.96624755859375,
"loss": 0.6246,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.8618491888046265,
"rewards/margins": 0.8498145937919617,
"rewards/rejected": -2.7116637229919434,
"step": 690
},
{
"epoch": 0.36639623135304894,
"grad_norm": 28.56372190962352,
"learning_rate": 3.994117815666095e-07,
"logits/chosen": 5727.22607421875,
"logits/rejected": 4252.705078125,
"logps/chosen": -492.46014404296875,
"logps/rejected": -520.4065551757812,
"loss": 0.6296,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.1523029804229736,
"rewards/margins": 0.9564183354377747,
"rewards/rejected": -3.1087214946746826,
"step": 700
},
{
"epoch": 0.3716304632295211,
"grad_norm": 13.063007551794367,
"learning_rate": 3.957238565946671e-07,
"logits/chosen": 5457.42041015625,
"logits/rejected": 4502.88720703125,
"logps/chosen": -379.50506591796875,
"logps/rejected": -405.9420471191406,
"loss": 0.655,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.719842553138733,
"rewards/margins": 0.5198991894721985,
"rewards/rejected": -2.239741563796997,
"step": 710
},
{
"epoch": 0.3768646951059932,
"grad_norm": 11.137969578259929,
"learning_rate": 3.9198726103838306e-07,
"logits/chosen": 5491.45947265625,
"logits/rejected": 4884.5771484375,
"logps/chosen": -358.10699462890625,
"logps/rejected": -377.1960754394531,
"loss": 0.6109,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.378875970840454,
"rewards/margins": 0.5345520377159119,
"rewards/rejected": -1.9134283065795898,
"step": 720
},
{
"epoch": 0.38209892698246534,
"grad_norm": 18.42567249890633,
"learning_rate": 3.8820324289031946e-07,
"logits/chosen": 5650.734375,
"logits/rejected": 4883.583984375,
"logps/chosen": -329.21630859375,
"logps/rejected": -421.2305603027344,
"loss": 0.6106,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.3386439085006714,
"rewards/margins": 0.9097055196762085,
"rewards/rejected": -2.248349666595459,
"step": 730
},
{
"epoch": 0.38733315885893743,
"grad_norm": 21.014679051728024,
"learning_rate": 3.84373065981799e-07,
"logits/chosen": 6379.822265625,
"logits/rejected": 4723.3544921875,
"logps/chosen": -400.08380126953125,
"logps/rejected": -476.69720458984375,
"loss": 0.6107,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.6094900369644165,
"rewards/margins": 1.0389902591705322,
"rewards/rejected": -2.648480176925659,
"step": 740
},
{
"epoch": 0.3925673907354096,
"grad_norm": 25.5783449608529,
"learning_rate": 3.8049800956079545e-07,
"logits/chosen": 5933.28173828125,
"logits/rejected": 5049.6416015625,
"logps/chosen": -450.82745361328125,
"logps/rejected": -519.0262451171875,
"loss": 0.6471,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.1224923133850098,
"rewards/margins": 1.0625412464141846,
"rewards/rejected": -3.1850337982177734,
"step": 750
},
{
"epoch": 0.39780162261188173,
"grad_norm": 16.150618590693583,
"learning_rate": 3.7657936786467525e-07,
"logits/chosen": 5189.0732421875,
"logits/rejected": 4285.34912109375,
"logps/chosen": -424.62255859375,
"logps/rejected": -479.2969665527344,
"loss": 0.6186,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.2088141441345215,
"rewards/margins": 0.7376548051834106,
"rewards/rejected": -2.9464688301086426,
"step": 760
},
{
"epoch": 0.40303585448835383,
"grad_norm": 15.760084999630747,
"learning_rate": 3.7261844968793226e-07,
"logits/chosen": 4326.27197265625,
"logits/rejected": 4380.33544921875,
"logps/chosen": -372.68756103515625,
"logps/rejected": -481.65313720703125,
"loss": 0.6109,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.9684680700302124,
"rewards/margins": 0.8767637014389038,
"rewards/rejected": -2.8452320098876953,
"step": 770
},
{
"epoch": 0.408270086364826,
"grad_norm": 18.09652778784993,
"learning_rate": 3.6861657794506187e-07,
"logits/chosen": 4880.94482421875,
"logits/rejected": 4508.5419921875,
"logps/chosen": -407.27587890625,
"logps/rejected": -466.6880798339844,
"loss": 0.6446,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.0973594188690186,
"rewards/margins": 0.6051468253135681,
"rewards/rejected": -2.7025063037872314,
"step": 780
},
{
"epoch": 0.4135043182412981,
"grad_norm": 15.553054502461759,
"learning_rate": 3.6457508922871777e-07,
"logits/chosen": 6180.486328125,
"logits/rejected": 4504.57763671875,
"logps/chosen": -405.5555725097656,
"logps/rejected": -487.57196044921875,
"loss": 0.6097,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.869215726852417,
"rewards/margins": 1.1324493885040283,
"rewards/rejected": -3.0016651153564453,
"step": 790
},
{
"epoch": 0.4187385501177702,
"grad_norm": 52.02343099220796,
"learning_rate": 3.6049533336330084e-07,
"logits/chosen": 6146.11865234375,
"logits/rejected": 4862.7744140625,
"logps/chosen": -443.3235778808594,
"logps/rejected": -514.3902587890625,
"loss": 0.6423,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.093003034591675,
"rewards/margins": 1.0282524824142456,
"rewards/rejected": -3.12125563621521,
"step": 800
},
{
"epoch": 0.4239727819942423,
"grad_norm": 25.391701434361387,
"learning_rate": 3.56378672954129e-07,
"logits/chosen": 6351.4970703125,
"logits/rejected": 4460.3125,
"logps/chosen": -440.08294677734375,
"logps/rejected": -489.60321044921875,
"loss": 0.6175,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.8706138134002686,
"rewards/margins": 1.1428322792053223,
"rewards/rejected": -3.01344633102417,
"step": 810
},
{
"epoch": 0.42920701387071447,
"grad_norm": 17.33884318164809,
"learning_rate": 3.5222648293233803e-07,
"logits/chosen": 6334.86279296875,
"logits/rejected": 5818.06591796875,
"logps/chosen": -396.09466552734375,
"logps/rejected": -470.11273193359375,
"loss": 0.6092,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.6134361028671265,
"rewards/margins": 0.7463122606277466,
"rewards/rejected": -2.359748363494873,
"step": 820
},
{
"epoch": 0.4344412457471866,
"grad_norm": 21.34021081433511,
"learning_rate": 3.480401500956657e-07,
"logits/chosen": 5477.52587890625,
"logits/rejected": 4610.40283203125,
"logps/chosen": -352.7813415527344,
"logps/rejected": -410.7137756347656,
"loss": 0.6365,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.540126085281372,
"rewards/margins": 0.4730333387851715,
"rewards/rejected": -2.0131595134735107,
"step": 830
},
{
"epoch": 0.4396754776236587,
"grad_norm": 17.95258525844177,
"learning_rate": 3.438210726452724e-07,
"logits/chosen": 6387.1103515625,
"logits/rejected": 5639.19580078125,
"logps/chosen": -402.55999755859375,
"logps/rejected": -427.85400390625,
"loss": 0.6315,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4374101161956787,
"rewards/margins": 0.6155884265899658,
"rewards/rejected": -2.0529983043670654,
"step": 840
},
{
"epoch": 0.44490970950013087,
"grad_norm": 18.9222054407907,
"learning_rate": 3.395706597187538e-07,
"logits/chosen": 4786.2646484375,
"logits/rejected": 4725.2626953125,
"logps/chosen": -342.1614990234375,
"logps/rejected": -403.74755859375,
"loss": 0.614,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.560929536819458,
"rewards/margins": 0.6686034202575684,
"rewards/rejected": -2.2295329570770264,
"step": 850
},
{
"epoch": 0.45014394137660296,
"grad_norm": 24.87010650260379,
"learning_rate": 3.3529033091949986e-07,
"logits/chosen": 5798.42724609375,
"logits/rejected": 5365.8623046875,
"logps/chosen": -429.4087829589844,
"logps/rejected": -528.0635375976562,
"loss": 0.6112,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.7890077829360962,
"rewards/margins": 0.9684630632400513,
"rewards/rejected": -2.7574710845947266,
"step": 860
},
{
"epoch": 0.4553781732530751,
"grad_norm": 56.53886775450491,
"learning_rate": 3.309815158425591e-07,
"logits/chosen": 5630.0419921875,
"logits/rejected": 5342.580078125,
"logps/chosen": -417.60888671875,
"logps/rejected": -509.32647705078125,
"loss": 0.6257,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.7594547271728516,
"rewards/margins": 1.0495405197143555,
"rewards/rejected": -2.808995008468628,
"step": 870
},
{
"epoch": 0.46061240512954726,
"grad_norm": 24.277071765568724,
"learning_rate": 3.2664565359716536e-07,
"logits/chosen": 5669.77392578125,
"logits/rejected": 4588.5927734375,
"logps/chosen": -415.36163330078125,
"logps/rejected": -488.67120361328125,
"loss": 0.6156,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.9911209344863892,
"rewards/margins": 1.0688735246658325,
"rewards/rejected": -3.0599944591522217,
"step": 880
},
{
"epoch": 0.46584663700601936,
"grad_norm": 17.534117100677573,
"learning_rate": 3.222841923260869e-07,
"logits/chosen": 5307.109375,
"logits/rejected": 4587.55029296875,
"logps/chosen": -423.51629638671875,
"logps/rejected": -494.17193603515625,
"loss": 0.6121,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.087824821472168,
"rewards/margins": 0.893652081489563,
"rewards/rejected": -2.9814765453338623,
"step": 890
},
{
"epoch": 0.4710808688824915,
"grad_norm": 20.56698549553084,
"learning_rate": 3.1789858872195887e-07,
"logits/chosen": 6439.45751953125,
"logits/rejected": 5222.29833984375,
"logps/chosen": -458.2245178222656,
"logps/rejected": -531.4591674804688,
"loss": 0.6043,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.106672525405884,
"rewards/margins": 0.9118589162826538,
"rewards/rejected": -3.018531322479248,
"step": 900
},
{
"epoch": 0.4763151007589636,
"grad_norm": 15.634569986443797,
"learning_rate": 3.1349030754075937e-07,
"logits/chosen": 5356.185546875,
"logits/rejected": 4248.3271484375,
"logps/chosen": -420.09600830078125,
"logps/rejected": -509.48101806640625,
"loss": 0.6183,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -2.1424427032470703,
"rewards/margins": 1.1177256107330322,
"rewards/rejected": -3.2601680755615234,
"step": 910
},
{
"epoch": 0.48154933263543576,
"grad_norm": 17.43008538687268,
"learning_rate": 3.090608211125931e-07,
"logits/chosen": 5311.978515625,
"logits/rejected": 4518.35693359375,
"logps/chosen": -421.0234375,
"logps/rejected": -501.09527587890625,
"loss": 0.5957,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.1865296363830566,
"rewards/margins": 0.9108685255050659,
"rewards/rejected": -3.097398281097412,
"step": 920
},
{
"epoch": 0.48678356451190785,
"grad_norm": 23.081663273096012,
"learning_rate": 3.0461160884994487e-07,
"logits/chosen": 5700.06689453125,
"logits/rejected": 5031.7353515625,
"logps/chosen": -447.28936767578125,
"logps/rejected": -512.2467651367188,
"loss": 0.6257,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.2586405277252197,
"rewards/margins": 0.7844768762588501,
"rewards/rejected": -3.0431172847747803,
"step": 930
},
{
"epoch": 0.49201779638838,
"grad_norm": 18.627739282913765,
"learning_rate": 3.001441567535681e-07,
"logits/chosen": 6320.2421875,
"logits/rejected": 5199.8828125,
"logps/chosen": -429.02667236328125,
"logps/rejected": -511.12457275390625,
"loss": 0.6071,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.9009828567504883,
"rewards/margins": 1.0119611024856567,
"rewards/rejected": -2.9129440784454346,
"step": 940
},
{
"epoch": 0.49725202826485215,
"grad_norm": 20.9694437636251,
"learning_rate": 2.956599569161724e-07,
"logits/chosen": 5312.28173828125,
"logits/rejected": 4129.46435546875,
"logps/chosen": -352.3714294433594,
"logps/rejected": -402.3336486816406,
"loss": 0.6166,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.611181616783142,
"rewards/margins": 0.5900977849960327,
"rewards/rejected": -2.2012791633605957,
"step": 950
},
{
"epoch": 0.5024862601413242,
"grad_norm": 16.66673110491197,
"learning_rate": 2.91160507024077e-07,
"logits/chosen": 5664.244140625,
"logits/rejected": 4732.4833984375,
"logps/chosen": -374.69970703125,
"logps/rejected": -430.1102600097656,
"loss": 0.6171,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.5680463314056396,
"rewards/margins": 0.7437410950660706,
"rewards/rejected": -2.3117871284484863,
"step": 960
},
{
"epoch": 0.5077204920177963,
"grad_norm": 14.965729396145859,
"learning_rate": 2.866473098569953e-07,
"logits/chosen": 5775.98291015625,
"logits/rejected": 4830.63916015625,
"logps/chosen": -399.218017578125,
"logps/rejected": -450.00469970703125,
"loss": 0.6236,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.5897982120513916,
"rewards/margins": 0.7888145446777344,
"rewards/rejected": -2.378612518310547,
"step": 970
},
{
"epoch": 0.5129547238942685,
"grad_norm": 12.518165998557452,
"learning_rate": 2.8212187278611905e-07,
"logits/chosen": 5487.87646484375,
"logits/rejected": 4786.9697265625,
"logps/chosen": -406.44769287109375,
"logps/rejected": -478.30450439453125,
"loss": 0.6078,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.7121471166610718,
"rewards/margins": 0.8904681205749512,
"rewards/rejected": -2.6026151180267334,
"step": 980
},
{
"epoch": 0.5181889557707406,
"grad_norm": 22.061851534247943,
"learning_rate": 2.775857072706684e-07,
"logits/chosen": 5991.2373046875,
"logits/rejected": 4359.41357421875,
"logps/chosen": -416.60516357421875,
"logps/rejected": -461.73016357421875,
"loss": 0.6386,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.810485601425171,
"rewards/margins": 0.9933170080184937,
"rewards/rejected": -2.803802967071533,
"step": 990
},
{
"epoch": 0.5234231876472127,
"grad_norm": 22.46913725233362,
"learning_rate": 2.7304032835307667e-07,
"logits/chosen": 6123.0048828125,
"logits/rejected": 5400.46240234375,
"logps/chosen": -433.31829833984375,
"logps/rejected": -514.8015747070312,
"loss": 0.6364,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.062668561935425,
"rewards/margins": 0.59827721118927,
"rewards/rejected": -2.6609461307525635,
"step": 1000
},
{
"epoch": 0.528657419523685,
"grad_norm": 16.396544720613925,
"learning_rate": 2.6848725415297884e-07,
"logits/chosen": 5970.46044921875,
"logits/rejected": 5188.1962890625,
"logps/chosen": -450.0951232910156,
"logps/rejected": -460.515625,
"loss": 0.6228,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.9533536434173584,
"rewards/margins": 0.6516803503036499,
"rewards/rejected": -2.6050338745117188,
"step": 1010
},
{
"epoch": 0.533891651400157,
"grad_norm": 34.998855163224775,
"learning_rate": 2.6392800536017183e-07,
"logits/chosen": 5251.8818359375,
"logits/rejected": 4933.35546875,
"logps/chosen": -433.3590393066406,
"logps/rejected": -494.32366943359375,
"loss": 0.6187,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.9257965087890625,
"rewards/margins": 0.7166542410850525,
"rewards/rejected": -2.6424505710601807,
"step": 1020
},
{
"epoch": 0.5391258832766291,
"grad_norm": 20.45554516626394,
"learning_rate": 2.59364104726716e-07,
"logits/chosen": 5809.958984375,
"logits/rejected": 5054.63037109375,
"logps/chosen": -413.60357666015625,
"logps/rejected": -492.5873107910156,
"loss": 0.6035,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.7435877323150635,
"rewards/margins": 0.8188160061836243,
"rewards/rejected": -2.562403678894043,
"step": 1030
},
{
"epoch": 0.5443601151531012,
"grad_norm": 25.933977698433374,
"learning_rate": 2.547970765583491e-07,
"logits/chosen": 5483.72412109375,
"logits/rejected": 4852.462890625,
"logps/chosen": -373.3037414550781,
"logps/rejected": -430.94378662109375,
"loss": 0.6243,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.6192424297332764,
"rewards/margins": 0.8005384206771851,
"rewards/rejected": -2.419780969619751,
"step": 1040
},
{
"epoch": 0.5495943470295734,
"grad_norm": 18.23336853816008,
"learning_rate": 2.502284462053799e-07,
"logits/chosen": 6024.7958984375,
"logits/rejected": 5882.58740234375,
"logps/chosen": -410.0364685058594,
"logps/rejected": -473.29779052734375,
"loss": 0.6254,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.7705657482147217,
"rewards/margins": 0.7812509536743164,
"rewards/rejected": -2.551816940307617,
"step": 1050
},
{
"epoch": 0.5548285789060455,
"grad_norm": 45.486266011389816,
"learning_rate": 2.4565973955323374e-07,
"logits/chosen": 5641.85302734375,
"logits/rejected": 4873.16845703125,
"logps/chosen": -415.40582275390625,
"logps/rejected": -460.23077392578125,
"loss": 0.6214,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.7098748683929443,
"rewards/margins": 0.8872604370117188,
"rewards/rejected": -2.597135305404663,
"step": 1060
},
{
"epoch": 0.5600628107825176,
"grad_norm": 23.73611035678335,
"learning_rate": 2.410924825128195e-07,
"logits/chosen": 5291.748046875,
"logits/rejected": 5004.06884765625,
"logps/chosen": -400.042236328125,
"logps/rejected": -488.37744140625,
"loss": 0.599,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.792931318283081,
"rewards/margins": 0.8118869662284851,
"rewards/rejected": -2.604818344116211,
"step": 1070
},
{
"epoch": 0.5652970426589898,
"grad_norm": 23.703780906245843,
"learning_rate": 2.365282005108875e-07,
"logits/chosen": 5615.40283203125,
"logits/rejected": 4617.5302734375,
"logps/chosen": -391.23028564453125,
"logps/rejected": -494.76531982421875,
"loss": 0.6073,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.8765054941177368,
"rewards/margins": 1.003303050994873,
"rewards/rejected": -2.8798086643218994,
"step": 1080
},
{
"epoch": 0.5705312745354619,
"grad_norm": 32.00654280597893,
"learning_rate": 2.319684179805491e-07,
"logits/chosen": 5474.94189453125,
"logits/rejected": 4257.7763671875,
"logps/chosen": -418.8746032714844,
"logps/rejected": -479.42205810546875,
"loss": 0.6239,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.8052211999893188,
"rewards/margins": 1.1022889614105225,
"rewards/rejected": -2.907510280609131,
"step": 1090
},
{
"epoch": 0.575765506411934,
"grad_norm": 15.09375460303486,
"learning_rate": 2.2741465785212902e-07,
"logits/chosen": 5132.87255859375,
"logits/rejected": 3877.443359375,
"logps/chosen": -369.39129638671875,
"logps/rejected": -445.2359313964844,
"loss": 0.5876,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.5681183338165283,
"rewards/margins": 1.1039445400238037,
"rewards/rejected": -2.672062635421753,
"step": 1100
},
{
"epoch": 0.5809997382884062,
"grad_norm": 15.752950958144131,
"learning_rate": 2.2286844104451843e-07,
"logits/chosen": 5614.02734375,
"logits/rejected": 4852.61962890625,
"logps/chosen": -421.18035888671875,
"logps/rejected": -493.23944091796875,
"loss": 0.617,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.8397204875946045,
"rewards/margins": 0.82035893201828,
"rewards/rejected": -2.6600797176361084,
"step": 1110
},
{
"epoch": 0.5862339701648783,
"grad_norm": 20.061686761620173,
"learning_rate": 2.183312859572008e-07,
"logits/chosen": 6473.8583984375,
"logits/rejected": 5419.43115234375,
"logps/chosen": -412.7747497558594,
"logps/rejected": -464.63446044921875,
"loss": 0.6271,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.6665458679199219,
"rewards/margins": 0.8658057451248169,
"rewards/rejected": -2.53235125541687,
"step": 1120
},
{
"epoch": 0.5914682020413504,
"grad_norm": 17.630546844566275,
"learning_rate": 2.138047079631184e-07,
"logits/chosen": 5279.314453125,
"logits/rejected": 5356.86962890625,
"logps/chosen": -409.72161865234375,
"logps/rejected": -491.9193420410156,
"loss": 0.6111,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.9969879388809204,
"rewards/margins": 0.7077668905258179,
"rewards/rejected": -2.70475435256958,
"step": 1130
},
{
"epoch": 0.5967024339178225,
"grad_norm": 20.142582983294798,
"learning_rate": 2.0929021890255068e-07,
"logits/chosen": 6199.505859375,
"logits/rejected": 5334.6689453125,
"logps/chosen": -431.4466247558594,
"logps/rejected": -511.4515075683594,
"loss": 0.6176,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.7895443439483643,
"rewards/margins": 0.8201072812080383,
"rewards/rejected": -2.609651803970337,
"step": 1140
},
{
"epoch": 0.6019366657942947,
"grad_norm": 19.471822868052573,
"learning_rate": 2.0478932657817102e-07,
"logits/chosen": 5034.8251953125,
"logits/rejected": 4781.177734375,
"logps/chosen": -387.94140625,
"logps/rejected": -474.83636474609375,
"loss": 0.6173,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.82735276222229,
"rewards/margins": 0.8202959299087524,
"rewards/rejected": -2.647648572921753,
"step": 1150
},
{
"epoch": 0.6071708976707668,
"grad_norm": 45.513438143142956,
"learning_rate": 2.0030353425145374e-07,
"logits/chosen": 7131.70166015625,
"logits/rejected": 6376.83056640625,
"logps/chosen": -501.9178161621094,
"logps/rejected": -538.24658203125,
"loss": 0.6376,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.0238595008850098,
"rewards/margins": 0.638025164604187,
"rewards/rejected": -2.6618847846984863,
"step": 1160
},
{
"epoch": 0.6124051295472389,
"grad_norm": 18.61685092469,
"learning_rate": 1.9583434014059635e-07,
"logits/chosen": 5769.359375,
"logits/rejected": 4956.7412109375,
"logps/chosen": -418.234375,
"logps/rejected": -483.03814697265625,
"loss": 0.6085,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.870987892150879,
"rewards/margins": 0.8069852590560913,
"rewards/rejected": -2.677973508834839,
"step": 1170
},
{
"epoch": 0.6176393614237111,
"grad_norm": 19.392180606978926,
"learning_rate": 1.9138323692012733e-07,
"logits/chosen": 5019.05419921875,
"logits/rejected": 4895.45458984375,
"logps/chosen": -433.4505310058594,
"logps/rejected": -480.860107421875,
"loss": 0.6085,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.0441999435424805,
"rewards/margins": 0.6482217311859131,
"rewards/rejected": -2.6924219131469727,
"step": 1180
},
{
"epoch": 0.6228735933001832,
"grad_norm": 50.383157244491294,
"learning_rate": 1.8695171122236442e-07,
"logits/chosen": 5166.943359375,
"logits/rejected": 5133.3642578125,
"logps/chosen": -406.5730285644531,
"logps/rejected": -516.8052978515625,
"loss": 0.6235,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.9044840335845947,
"rewards/margins": 0.8772269487380981,
"rewards/rejected": -2.781710386276245,
"step": 1190
},
{
"epoch": 0.6281078251766553,
"grad_norm": 23.25471727050923,
"learning_rate": 1.8254124314089223e-07,
"logits/chosen": 5613.8095703125,
"logits/rejected": 5036.1220703125,
"logps/chosen": -431.58013916015625,
"logps/rejected": -522.5189208984375,
"loss": 0.6149,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.9861242771148682,
"rewards/margins": 1.0060144662857056,
"rewards/rejected": -2.992138385772705,
"step": 1200
},
{
"epoch": 0.6333420570531274,
"grad_norm": 27.427712896477214,
"learning_rate": 1.7815330573622205e-07,
"logits/chosen": 5823.63671875,
"logits/rejected": 5659.783203125,
"logps/chosen": -410.86138916015625,
"logps/rejected": -526.7249755859375,
"loss": 0.6205,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.8027637004852295,
"rewards/margins": 0.8670876622200012,
"rewards/rejected": -2.669851303100586,
"step": 1210
},
{
"epoch": 0.6385762889295996,
"grad_norm": 17.16161963024681,
"learning_rate": 1.7378936454380274e-07,
"logits/chosen": 5706.4755859375,
"logits/rejected": 4772.328125,
"logps/chosen": -412.3294982910156,
"logps/rejected": -477.41192626953125,
"loss": 0.601,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.9385788440704346,
"rewards/margins": 0.7884070873260498,
"rewards/rejected": -2.7269861698150635,
"step": 1220
},
{
"epoch": 0.6438105208060717,
"grad_norm": 26.23316113841427,
"learning_rate": 1.694508770845427e-07,
"logits/chosen": 6720.44677734375,
"logits/rejected": 5618.7529296875,
"logps/chosen": -475.612060546875,
"logps/rejected": -506.27984619140625,
"loss": 0.6229,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.977423071861267,
"rewards/margins": 0.6886818408966064,
"rewards/rejected": -2.666104793548584,
"step": 1230
},
{
"epoch": 0.6490447526825438,
"grad_norm": 21.8651357246224,
"learning_rate": 1.651392923780105e-07,
"logits/chosen": 6241.5029296875,
"logits/rejected": 4998.0126953125,
"logps/chosen": -414.9952697753906,
"logps/rejected": -458.4529724121094,
"loss": 0.6061,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.8033950328826904,
"rewards/margins": 0.8357815742492676,
"rewards/rejected": -2.639176845550537,
"step": 1240
},
{
"epoch": 0.654278984559016,
"grad_norm": 19.845703065114936,
"learning_rate": 1.6085605045847367e-07,
"logits/chosen": 5718.64404296875,
"logits/rejected": 4613.75634765625,
"logps/chosen": -417.8412170410156,
"logps/rejected": -497.18701171875,
"loss": 0.6224,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.8895454406738281,
"rewards/margins": 0.7920354604721069,
"rewards/rejected": -2.6815807819366455,
"step": 1250
},
{
"epoch": 0.6595132164354881,
"grad_norm": 18.36104314119822,
"learning_rate": 1.5660258189393944e-07,
"logits/chosen": 5908.99951171875,
"logits/rejected": 4583.3828125,
"logps/chosen": -426.84161376953125,
"logps/rejected": -481.43865966796875,
"loss": 0.6198,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.8158848285675049,
"rewards/margins": 0.921142578125,
"rewards/rejected": -2.737027406692505,
"step": 1260
},
{
"epoch": 0.6647474483119602,
"grad_norm": 27.47339811147932,
"learning_rate": 1.5238030730835577e-07,
"logits/chosen": 5228.90576171875,
"logits/rejected": 5379.51708984375,
"logps/chosen": -355.2702941894531,
"logps/rejected": -476.2916564941406,
"loss": 0.6088,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.5809125900268555,
"rewards/margins": 1.1285021305084229,
"rewards/rejected": -2.7094149589538574,
"step": 1270
},
{
"epoch": 0.6699816801884323,
"grad_norm": 21.733099164416224,
"learning_rate": 1.4819063690713564e-07,
"logits/chosen": 5919.9453125,
"logits/rejected": 4732.36865234375,
"logps/chosen": -406.5284118652344,
"logps/rejected": -480.59552001953125,
"loss": 0.6132,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.690146803855896,
"rewards/margins": 0.9789739847183228,
"rewards/rejected": -2.669121026992798,
"step": 1280
},
{
"epoch": 0.6752159120649045,
"grad_norm": 27.01797105501278,
"learning_rate": 1.4403497000615883e-07,
"logits/chosen": 5621.28515625,
"logits/rejected": 4914.8369140625,
"logps/chosen": -453.36248779296875,
"logps/rejected": -479.4039611816406,
"loss": 0.6216,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.8445937633514404,
"rewards/margins": 0.883182168006897,
"rewards/rejected": -2.727776050567627,
"step": 1290
},
{
"epoch": 0.6804501439413766,
"grad_norm": 29.582455929961025,
"learning_rate": 1.3991469456441272e-07,
"logits/chosen": 5492.75341796875,
"logits/rejected": 5214.58740234375,
"logps/chosen": -382.15350341796875,
"logps/rejected": -472.4346618652344,
"loss": 0.6141,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.4912300109863281,
"rewards/margins": 0.8933757543563843,
"rewards/rejected": -2.384605646133423,
"step": 1300
},
{
"epoch": 0.6856843758178487,
"grad_norm": 16.98125254775057,
"learning_rate": 1.358311867204244e-07,
"logits/chosen": 4601.31982421875,
"logits/rejected": 4569.09765625,
"logps/chosen": -333.4889831542969,
"logps/rejected": -421.4237365722656,
"loss": 0.6107,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.4790998697280884,
"rewards/margins": 0.8222945928573608,
"rewards/rejected": -2.30139422416687,
"step": 1310
},
{
"epoch": 0.6909186076943209,
"grad_norm": 18.693048899733224,
"learning_rate": 1.3178581033264216e-07,
"logits/chosen": 6154.45166015625,
"logits/rejected": 5227.0224609375,
"logps/chosen": -430.81890869140625,
"logps/rejected": -505.7598571777344,
"loss": 0.6233,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.8447399139404297,
"rewards/margins": 0.7838276624679565,
"rewards/rejected": -2.628567695617676,
"step": 1320
},
{
"epoch": 0.696152839570793,
"grad_norm": 23.20538962752919,
"learning_rate": 1.2777991652391757e-07,
"logits/chosen": 5333.5048828125,
"logits/rejected": 3960.68212890625,
"logps/chosen": -402.9344177246094,
"logps/rejected": -442.1331481933594,
"loss": 0.6293,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.700577735900879,
"rewards/margins": 0.9019187688827515,
"rewards/rejected": -2.60249662399292,
"step": 1330
},
{
"epoch": 0.7013870714472651,
"grad_norm": 19.43268415725634,
"learning_rate": 1.2381484323024178e-07,
"logits/chosen": 6016.5185546875,
"logits/rejected": 5181.9228515625,
"logps/chosen": -408.1551818847656,
"logps/rejected": -457.7464904785156,
"loss": 0.6094,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.6999537944793701,
"rewards/margins": 0.7662817239761353,
"rewards/rejected": -2.466235637664795,
"step": 1340
},
{
"epoch": 0.7066213033237373,
"grad_norm": 22.3621515216726,
"learning_rate": 1.1989191475388516e-07,
"logits/chosen": 4984.4111328125,
"logits/rejected": 4563.0322265625,
"logps/chosen": -346.7846374511719,
"logps/rejected": -447.44586181640625,
"loss": 0.621,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.6289829015731812,
"rewards/margins": 0.8670762181282043,
"rewards/rejected": -2.496058940887451,
"step": 1350
},
{
"epoch": 0.7118555352002094,
"grad_norm": 16.827916345332202,
"learning_rate": 1.1601244132109179e-07,
"logits/chosen": 4982.31103515625,
"logits/rejected": 4440.9169921875,
"logps/chosen": -379.25128173828125,
"logps/rejected": -465.8182067871094,
"loss": 0.6101,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.8500652313232422,
"rewards/margins": 0.7695325016975403,
"rewards/rejected": -2.619597911834717,
"step": 1360
},
{
"epoch": 0.7170897670766815,
"grad_norm": 19.10478789750096,
"learning_rate": 1.1217771864447395e-07,
"logits/chosen": 5696.0634765625,
"logits/rejected": 4793.515625,
"logps/chosen": -422.21905517578125,
"logps/rejected": -524.974609375,
"loss": 0.6266,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.7729225158691406,
"rewards/margins": 0.9866235852241516,
"rewards/rejected": -2.7595460414886475,
"step": 1370
},
{
"epoch": 0.7223239989531536,
"grad_norm": 19.571481210859417,
"learning_rate": 1.0838902749025499e-07,
"logits/chosen": 6979.7353515625,
"logits/rejected": 5534.80615234375,
"logps/chosen": -437.5282287597656,
"logps/rejected": -475.3587341308594,
"loss": 0.6206,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.6889175176620483,
"rewards/margins": 0.7310249209403992,
"rewards/rejected": -2.4199423789978027,
"step": 1380
},
{
"epoch": 0.7275582308296258,
"grad_norm": 23.479770735886802,
"learning_rate": 1.0464763325050358e-07,
"logits/chosen": 5203.9345703125,
"logits/rejected": 4617.71630859375,
"logps/chosen": -415.99737548828125,
"logps/rejected": -473.8778381347656,
"loss": 0.608,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.8423852920532227,
"rewards/margins": 0.7913864850997925,
"rewards/rejected": -2.6337718963623047,
"step": 1390
},
{
"epoch": 0.7327924627060979,
"grad_norm": 28.458417507814094,
"learning_rate": 1.0095478552050346e-07,
"logits/chosen": 6179.98046875,
"logits/rejected": 4097.23828125,
"logps/chosen": -432.69146728515625,
"logps/rejected": -464.85992431640625,
"loss": 0.6005,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.6340433359146118,
"rewards/margins": 0.9390060305595398,
"rewards/rejected": -2.573049306869507,
"step": 1400
},
{
"epoch": 0.73802669458257,
"grad_norm": 22.061291739222355,
"learning_rate": 9.731171768139806e-08,
"logits/chosen": 5738.4248046875,
"logits/rejected": 4614.5322265625,
"logps/chosen": -385.05133056640625,
"logps/rejected": -455.3321838378906,
"loss": 0.626,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.6064504384994507,
"rewards/margins": 0.9582611322402954,
"rewards/rejected": -2.564711570739746,
"step": 1410
},
{
"epoch": 0.7432609264590422,
"grad_norm": 25.413288039384696,
"learning_rate": 9.37196464882522e-08,
"logits/chosen": 5494.5439453125,
"logits/rejected": 4928.0751953125,
"logps/chosen": -385.5731201171875,
"logps/rejected": -464.8663024902344,
"loss": 0.6289,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.7576345205307007,
"rewards/margins": 0.8199461698532104,
"rewards/rejected": -2.577580690383911,
"step": 1420
},
{
"epoch": 0.7484951583355143,
"grad_norm": 22.45781701506148,
"learning_rate": 9.017977166366444e-08,
"logits/chosen": 5656.9072265625,
"logits/rejected": 4975.0439453125,
"logps/chosen": -404.0146789550781,
"logps/rejected": -485.17022705078125,
"loss": 0.623,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.6433677673339844,
"rewards/margins": 0.8800700306892395,
"rewards/rejected": -2.523437976837158,
"step": 1430
},
{
"epoch": 0.7537293902119864,
"grad_norm": 13.945507178550827,
"learning_rate": 8.669327549707095e-08,
"logits/chosen": 5781.94189453125,
"logits/rejected": 4841.93994140625,
"logps/chosen": -427.2398376464844,
"logps/rejected": -485.5018615722656,
"loss": 0.6082,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.718334436416626,
"rewards/margins": 0.9542592763900757,
"rewards/rejected": -2.672593593597412,
"step": 1440
},
{
"epoch": 0.7589636220884585,
"grad_norm": 17.966049413367486,
"learning_rate": 8.326132244986931e-08,
"logits/chosen": 5145.71875,
"logits/rejected": 4337.2958984375,
"logps/chosen": -398.82135009765625,
"logps/rejected": -474.75933837890625,
"loss": 0.6032,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.7171170711517334,
"rewards/margins": 1.0231225490570068,
"rewards/rejected": -2.7402396202087402,
"step": 1450
},
{
"epoch": 0.7641978539649307,
"grad_norm": 20.586662671394684,
"learning_rate": 7.988505876649862e-08,
"logits/chosen": 5346.1103515625,
"logits/rejected": 4014.310546875,
"logps/chosen": -407.9379577636719,
"logps/rejected": -500.1922302246094,
"loss": 0.6257,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.785790205001831,
"rewards/margins": 1.0074379444122314,
"rewards/rejected": -2.7932276725769043,
"step": 1460
},
{
"epoch": 0.7694320858414028,
"grad_norm": 17.218488686000693,
"learning_rate": 7.656561209160248e-08,
"logits/chosen": 5829.01416015625,
"logits/rejected": 4944.89208984375,
"logps/chosen": -427.6463928222656,
"logps/rejected": -475.11236572265625,
"loss": 0.596,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.6583614349365234,
"rewards/margins": 0.945914626121521,
"rewards/rejected": -2.604275941848755,
"step": 1470
},
{
"epoch": 0.7746663177178749,
"grad_norm": 27.7313611604028,
"learning_rate": 7.330409109340562e-08,
"logits/chosen": 5904.09912109375,
"logits/rejected": 5181.5791015625,
"logps/chosen": -440.94451904296875,
"logps/rejected": -501.65545654296875,
"loss": 0.5985,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.647769570350647,
"rewards/margins": 0.962969183921814,
"rewards/rejected": -2.610738754272461,
"step": 1480
},
{
"epoch": 0.7799005495943471,
"grad_norm": 28.768549723017788,
"learning_rate": 7.010158509342681e-08,
"logits/chosen": 6550.0625,
"logits/rejected": 4658.27978515625,
"logps/chosen": -417.83758544921875,
"logps/rejected": -465.58209228515625,
"loss": 0.5979,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.625791311264038,
"rewards/margins": 1.0529232025146484,
"rewards/rejected": -2.6787142753601074,
"step": 1490
},
{
"epoch": 0.7851347814708192,
"grad_norm": 26.074328942084968,
"learning_rate": 6.695916370265527e-08,
"logits/chosen": 5247.5302734375,
"logits/rejected": 4586.5869140625,
"logps/chosen": -395.1465148925781,
"logps/rejected": -413.99884033203125,
"loss": 0.6356,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.7414640188217163,
"rewards/margins": 0.6474174857139587,
"rewards/rejected": -2.3888819217681885,
"step": 1500
},
{
"epoch": 0.7903690133472913,
"grad_norm": 21.80364567121782,
"learning_rate": 6.387787646430853e-08,
"logits/chosen": 6516.0478515625,
"logits/rejected": 5851.53369140625,
"logps/chosen": -426.70318603515625,
"logps/rejected": -492.4895935058594,
"loss": 0.6294,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.723693609237671,
"rewards/margins": 0.7622456550598145,
"rewards/rejected": -2.4859395027160645,
"step": 1510
},
{
"epoch": 0.7956032452237635,
"grad_norm": 23.451371826789497,
"learning_rate": 6.0858752503294e-08,
"logits/chosen": 5100.3837890625,
"logits/rejected": 4843.9755859375,
"logps/chosen": -410.7384338378906,
"logps/rejected": -452.9171447753906,
"loss": 0.6065,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.6687591075897217,
"rewards/margins": 0.6757498383522034,
"rewards/rejected": -2.344508647918701,
"step": 1520
},
{
"epoch": 0.8008374771002356,
"grad_norm": 18.4137285906291,
"learning_rate": 5.7902800182489385e-08,
"logits/chosen": 5347.9619140625,
"logits/rejected": 5055.91455078125,
"logps/chosen": -371.74029541015625,
"logps/rejected": -444.6211853027344,
"loss": 0.6062,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.6094753742218018,
"rewards/margins": 0.9651139974594116,
"rewards/rejected": -2.574589252471924,
"step": 1530
},
{
"epoch": 0.8060717089767077,
"grad_norm": 18.372297005488328,
"learning_rate": 5.5011006765957604e-08,
"logits/chosen": 6517.6826171875,
"logits/rejected": 5801.03955078125,
"logps/chosen": -430.2518615722656,
"logps/rejected": -544.8726806640625,
"loss": 0.6076,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.744037389755249,
"rewards/margins": 0.9321613311767578,
"rewards/rejected": -2.676198720932007,
"step": 1540
},
{
"epoch": 0.8113059408531798,
"grad_norm": 24.974440327502748,
"learning_rate": 5.218433808920883e-08,
"logits/chosen": 5668.3994140625,
"logits/rejected": 5112.5869140625,
"logps/chosen": -416.13336181640625,
"logps/rejected": -498.39453125,
"loss": 0.6025,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.7290430068969727,
"rewards/margins": 0.8825391530990601,
"rewards/rejected": -2.611582040786743,
"step": 1550
},
{
"epoch": 0.816540172729652,
"grad_norm": 24.66708515929543,
"learning_rate": 4.942373823661927e-08,
"logits/chosen": 6769.8955078125,
"logits/rejected": 5016.2587890625,
"logps/chosen": -447.3492736816406,
"logps/rejected": -503.0823669433594,
"loss": 0.6096,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.7596700191497803,
"rewards/margins": 1.0560283660888672,
"rewards/rejected": -2.8156983852386475,
"step": 1560
},
{
"epoch": 0.821774404606124,
"grad_norm": 15.540461473239736,
"learning_rate": 4.6730129226114354e-08,
"logits/chosen": 5088.92236328125,
"logits/rejected": 4692.33349609375,
"logps/chosen": -409.94024658203125,
"logps/rejected": -442.9159240722656,
"loss": 0.61,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.9526259899139404,
"rewards/margins": 0.727096676826477,
"rewards/rejected": -2.679722547531128,
"step": 1570
},
{
"epoch": 0.8270086364825961,
"grad_norm": 22.30927140417861,
"learning_rate": 4.41044107012227e-08,
"logits/chosen": 6509.494140625,
"logits/rejected": 5121.66162109375,
"logps/chosen": -454.4883728027344,
"logps/rejected": -491.09814453125,
"loss": 0.6164,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.6590086221694946,
"rewards/margins": 0.8761310577392578,
"rewards/rejected": -2.535139560699463,
"step": 1580
},
{
"epoch": 0.8322428683590684,
"grad_norm": 47.249244932789814,
"learning_rate": 4.1547459630601966e-08,
"logits/chosen": 5681.8876953125,
"logits/rejected": 5076.9794921875,
"logps/chosen": -435.9734802246094,
"logps/rejected": -483.70458984375,
"loss": 0.6239,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.8887542486190796,
"rewards/margins": 0.6841882467269897,
"rewards/rejected": -2.5729424953460693,
"step": 1590
},
{
"epoch": 0.8374771002355405,
"grad_norm": 19.509237361503633,
"learning_rate": 3.9060130015138857e-08,
"logits/chosen": 5260.7138671875,
"logits/rejected": 4629.92578125,
"logps/chosen": -414.8975524902344,
"logps/rejected": -494.1025390625,
"loss": 0.6117,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.839999794960022,
"rewards/margins": 1.0193700790405273,
"rewards/rejected": -2.8593695163726807,
"step": 1600
},
{
"epoch": 0.8427113321120125,
"grad_norm": 15.758769361501436,
"learning_rate": 3.664325260271953e-08,
"logits/chosen": 6010.47119140625,
"logits/rejected": 5069.5751953125,
"logps/chosen": -467.64404296875,
"logps/rejected": -507.5274963378906,
"loss": 0.6071,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.0030617713928223,
"rewards/margins": 0.7443469166755676,
"rewards/rejected": -2.747408390045166,
"step": 1610
},
{
"epoch": 0.8479455639884846,
"grad_norm": 21.123986793744674,
"learning_rate": 3.429763461076676e-08,
"logits/chosen": 5870.20068359375,
"logits/rejected": 5074.16357421875,
"logps/chosen": -405.6874084472656,
"logps/rejected": -476.35211181640625,
"loss": 0.6096,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.7369863986968994,
"rewards/margins": 0.9186028242111206,
"rewards/rejected": -2.6555895805358887,
"step": 1620
},
{
"epoch": 0.8531797958649568,
"grad_norm": 19.05302083047077,
"learning_rate": 3.202405945663555e-08,
"logits/chosen": 5784.2412109375,
"logits/rejected": 3889.80126953125,
"logps/chosen": -427.1604919433594,
"logps/rejected": -439.701904296875,
"loss": 0.6078,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.9810470342636108,
"rewards/margins": 0.740452766418457,
"rewards/rejected": -2.7214999198913574,
"step": 1630
},
{
"epoch": 0.8584140277414289,
"grad_norm": 29.86452301634578,
"learning_rate": 2.9823286495958556e-08,
"logits/chosen": 4778.2958984375,
"logits/rejected": 5450.62451171875,
"logps/chosen": -398.36407470703125,
"logps/rejected": -521.3021240234375,
"loss": 0.6096,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.9749752283096313,
"rewards/margins": 0.7352627515792847,
"rewards/rejected": -2.710237979888916,
"step": 1640
},
{
"epoch": 0.863648259617901,
"grad_norm": 18.974661489747966,
"learning_rate": 2.769605076902695e-08,
"logits/chosen": 6121.0751953125,
"logits/rejected": 5588.75439453125,
"logps/chosen": -424.2884826660156,
"logps/rejected": -515.7366943359375,
"loss": 0.609,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.8259862661361694,
"rewards/margins": 0.7989758253097534,
"rewards/rejected": -2.624962329864502,
"step": 1650
},
{
"epoch": 0.8688824914943732,
"grad_norm": 20.830223854892928,
"learning_rate": 2.5643062755293403e-08,
"logits/chosen": 5408.017578125,
"logits/rejected": 4577.1982421875,
"logps/chosen": -427.53997802734375,
"logps/rejected": -462.0577087402344,
"loss": 0.6127,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.8826709985733032,
"rewards/margins": 0.7450687885284424,
"rewards/rejected": -2.627739906311035,
"step": 1660
},
{
"epoch": 0.8741167233708453,
"grad_norm": 30.839808557441238,
"learning_rate": 2.366500813607733e-08,
"logits/chosen": 6019.47412109375,
"logits/rejected": 4637.82763671875,
"logps/chosen": -409.47406005859375,
"logps/rejected": -507.8202209472656,
"loss": 0.6124,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.7381088733673096,
"rewards/margins": 1.1539865732192993,
"rewards/rejected": -2.8920950889587402,
"step": 1670
},
{
"epoch": 0.8793509552473174,
"grad_norm": 22.32621549985474,
"learning_rate": 2.176254756555329e-08,
"logits/chosen": 6369.30859375,
"logits/rejected": 5620.3662109375,
"logps/chosen": -467.0570373535156,
"logps/rejected": -547.2705078125,
"loss": 0.5994,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.8894094228744507,
"rewards/margins": 1.0848562717437744,
"rewards/rejected": -2.9742655754089355,
"step": 1680
},
{
"epoch": 0.8845851871237895,
"grad_norm": 20.301098233070547,
"learning_rate": 1.9936316450097468e-08,
"logits/chosen": 5071.96142578125,
"logits/rejected": 4552.37353515625,
"logps/chosen": -400.34100341796875,
"logps/rejected": -446.0146484375,
"loss": 0.61,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.8436905145645142,
"rewards/margins": 0.716572105884552,
"rewards/rejected": -2.560262680053711,
"step": 1690
},
{
"epoch": 0.8898194190002617,
"grad_norm": 23.493546384450056,
"learning_rate": 1.8186924736067477e-08,
"logits/chosen": 5736.19921875,
"logits/rejected": 4311.3408203125,
"logps/chosen": -420.8236389160156,
"logps/rejected": -512.0423583984375,
"loss": 0.6042,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.7245066165924072,
"rewards/margins": 1.156449317932129,
"rewards/rejected": -2.880955219268799,
"step": 1700
},
{
"epoch": 0.8950536508767338,
"grad_norm": 18.623486803085754,
"learning_rate": 1.651495670608488e-08,
"logits/chosen": 6630.7412109375,
"logits/rejected": 5112.56396484375,
"logps/chosen": -430.5503845214844,
"logps/rejected": -508.31304931640625,
"loss": 0.5846,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.7643120288848877,
"rewards/margins": 1.1240522861480713,
"rewards/rejected": -2.888363838195801,
"step": 1710
},
{
"epoch": 0.9002878827532059,
"grad_norm": 21.977526068073495,
"learning_rate": 1.4920970783889737e-08,
"logits/chosen": 6202.2060546875,
"logits/rejected": 4598.1708984375,
"logps/chosen": -452.6166076660156,
"logps/rejected": -524.5369262695312,
"loss": 0.5982,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.9582983255386353,
"rewards/margins": 0.9155516624450684,
"rewards/rejected": -2.873849868774414,
"step": 1720
},
{
"epoch": 0.9055221146296781,
"grad_norm": 12.755570308165497,
"learning_rate": 1.340549934783164e-08,
"logits/chosen": 5910.86328125,
"logits/rejected": 5579.3876953125,
"logps/chosen": -443.11163330078125,
"logps/rejected": -530.6002197265625,
"loss": 0.5984,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.8624699115753174,
"rewards/margins": 0.8643971681594849,
"rewards/rejected": -2.726867198944092,
"step": 1730
},
{
"epoch": 0.9107563465061502,
"grad_norm": 23.464328832306045,
"learning_rate": 1.1969048553059608e-08,
"logits/chosen": 5595.259765625,
"logits/rejected": 4795.32080078125,
"logps/chosen": -382.4716796875,
"logps/rejected": -451.7056579589844,
"loss": 0.621,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.7278823852539062,
"rewards/margins": 0.8011847734451294,
"rewards/rejected": -2.529067277908325,
"step": 1740
},
{
"epoch": 0.9159905783826223,
"grad_norm": 22.662637254674035,
"learning_rate": 1.06120981624703e-08,
"logits/chosen": 5303.560546875,
"logits/rejected": 5642.16650390625,
"logps/chosen": -418.61309814453125,
"logps/rejected": -528.3426513671875,
"loss": 0.6137,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.819700837135315,
"rewards/margins": 0.8951080441474915,
"rewards/rejected": -2.714808702468872,
"step": 1750
},
{
"epoch": 0.9212248102590945,
"grad_norm": 23.37220649579407,
"learning_rate": 9.335101386471284e-09,
"logits/chosen": 6105.37158203125,
"logits/rejected": 5412.89892578125,
"logps/chosen": -447.61993408203125,
"logps/rejected": -507.3324279785156,
"loss": 0.6005,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.9051244258880615,
"rewards/margins": 0.883420467376709,
"rewards/rejected": -2.7885448932647705,
"step": 1760
},
{
"epoch": 0.9264590421355666,
"grad_norm": 31.57553761420153,
"learning_rate": 8.138484731612273e-09,
"logits/chosen": 5806.66064453125,
"logits/rejected": 4830.857421875,
"logps/chosen": -429.99420166015625,
"logps/rejected": -527.69140625,
"loss": 0.6107,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.8986709117889404,
"rewards/margins": 0.999901294708252,
"rewards/rejected": -2.8985724449157715,
"step": 1770
},
{
"epoch": 0.9316932740120387,
"grad_norm": 24.672880887648823,
"learning_rate": 7.0226478581355e-09,
"logits/chosen": 5885.85205078125,
"logits/rejected": 5139.58203125,
"logps/chosen": -445.98675537109375,
"logps/rejected": -503.46337890625,
"loss": 0.6272,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.0940308570861816,
"rewards/margins": 0.7923761606216431,
"rewards/rejected": -2.886406660079956,
"step": 1780
},
{
"epoch": 0.9369275058885108,
"grad_norm": 18.080254178645642,
"learning_rate": 5.987963446492383e-09,
"logits/chosen": 5920.791015625,
"logits/rejected": 5237.79833984375,
"logps/chosen": -406.27386474609375,
"logps/rejected": -479.7198181152344,
"loss": 0.5786,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.7285455465316772,
"rewards/margins": 0.9587591886520386,
"rewards/rejected": -2.687304735183716,
"step": 1790
},
{
"epoch": 0.942161737764983,
"grad_norm": 35.07844691929086,
"learning_rate": 5.0347770728713935e-09,
"logits/chosen": 5880.59228515625,
"logits/rejected": 4549.359375,
"logps/chosen": -462.1459045410156,
"logps/rejected": -468.9349670410156,
"loss": 0.6162,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.7892353534698486,
"rewards/margins": 0.8386019468307495,
"rewards/rejected": -2.6278374195098877,
"step": 1800
},
{
"epoch": 0.9473959696414551,
"grad_norm": 19.945059521235283,
"learning_rate": 4.1634070937782424e-09,
"logits/chosen": 5899.3720703125,
"logits/rejected": 5313.3671875,
"logps/chosen": -451.93212890625,
"logps/rejected": -543.2415771484375,
"loss": 0.6142,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.9903990030288696,
"rewards/margins": 0.9100092649459839,
"rewards/rejected": -2.9004082679748535,
"step": 1810
},
{
"epoch": 0.9526302015179272,
"grad_norm": 24.094584349575342,
"learning_rate": 3.3741445397075797e-09,
"logits/chosen": 6125.74267578125,
"logits/rejected": 5158.01171875,
"logps/chosen": -463.64044189453125,
"logps/rejected": -555.1447143554688,
"loss": 0.6252,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.9464343786239624,
"rewards/margins": 1.044654130935669,
"rewards/rejected": -2.9910888671875,
"step": 1820
},
{
"epoch": 0.9578644333943994,
"grad_norm": 25.276279664246026,
"learning_rate": 2.667253017941018e-09,
"logits/chosen": 6131.8310546875,
"logits/rejected": 4804.04150390625,
"logps/chosen": -452.3642578125,
"logps/rejected": -507.6914978027344,
"loss": 0.5973,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.9133832454681396,
"rewards/margins": 0.8848444223403931,
"rewards/rejected": -2.798227548599243,
"step": 1830
},
{
"epoch": 0.9630986652708715,
"grad_norm": 22.802704931718225,
"learning_rate": 2.0429686245045097e-09,
"logits/chosen": 5988.15625,
"logits/rejected": 4626.0927734375,
"logps/chosen": -486.51708984375,
"logps/rejected": -504.944091796875,
"loss": 0.6291,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.9659137725830078,
"rewards/margins": 0.8604008555412292,
"rewards/rejected": -2.826314687728882,
"step": 1840
},
{
"epoch": 0.9683328971473436,
"grad_norm": 26.969071687122177,
"learning_rate": 1.5014998653141708e-09,
"logits/chosen": 5640.72021484375,
"logits/rejected": 4785.45068359375,
"logps/chosen": -440.749267578125,
"logps/rejected": -500.2676696777344,
"loss": 0.6259,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.8496116399765015,
"rewards/margins": 1.0721490383148193,
"rewards/rejected": -2.9217605590820312,
"step": 1850
},
{
"epoch": 0.9735671290238157,
"grad_norm": 22.885075554568353,
"learning_rate": 1.0430275865371263e-09,
"logits/chosen": 5859.7861328125,
"logits/rejected": 4826.97119140625,
"logps/chosen": -409.632568359375,
"logps/rejected": -510.0669860839844,
"loss": 0.6016,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.9688892364501953,
"rewards/margins": 1.0203845500946045,
"rewards/rejected": -2.9892735481262207,
"step": 1860
},
{
"epoch": 0.9788013609002879,
"grad_norm": 25.424962808525937,
"learning_rate": 6.677049141901314e-10,
"logits/chosen": 4790.49072265625,
"logits/rejected": 4639.8623046875,
"logps/chosen": -394.59674072265625,
"logps/rejected": -495.4620666503906,
"loss": 0.6084,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.8832927942276,
"rewards/margins": 0.9284135103225708,
"rewards/rejected": -2.811706066131592,
"step": 1870
},
{
"epoch": 0.98403559277676,
"grad_norm": 16.314513060865362,
"learning_rate": 3.7565720299687077e-10,
"logits/chosen": 6143.9091796875,
"logits/rejected": 5207.35400390625,
"logps/chosen": -465.2191467285156,
"logps/rejected": -504.1424865722656,
"loss": 0.5934,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.917109727859497,
"rewards/margins": 0.8995591998100281,
"rewards/rejected": -2.81666898727417,
"step": 1880
},
{
"epoch": 0.9892698246532321,
"grad_norm": 26.393655113815115,
"learning_rate": 1.6698199452053197e-10,
"logits/chosen": 4443.6845703125,
"logits/rejected": 4451.62548828125,
"logps/chosen": -400.55633544921875,
"logps/rejected": -473.33331298828125,
"loss": 0.6138,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.8365901708602905,
"rewards/margins": 0.7946940064430237,
"rewards/rejected": -2.631284236907959,
"step": 1890
},
{
"epoch": 0.9945040565297043,
"grad_norm": 28.937103875297968,
"learning_rate": 4.174898458556009e-11,
"logits/chosen": 6005.9638671875,
"logits/rejected": 4214.5224609375,
"logps/chosen": -429.625,
"logps/rejected": -486.3451232910156,
"loss": 0.6063,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.9642302989959717,
"rewards/margins": 0.9053429365158081,
"rewards/rejected": -2.8695731163024902,
"step": 1900
},
{
"epoch": 0.9997382884061764,
"grad_norm": 57.71415226213478,
"learning_rate": 0.0,
"logits/chosen": 6091.05859375,
"logits/rejected": 4940.8408203125,
"logps/chosen": -462.4815979003906,
"logps/rejected": -539.9644165039062,
"loss": 0.6206,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.0189812183380127,
"rewards/margins": 0.8955272436141968,
"rewards/rejected": -2.91450834274292,
"step": 1910
},
{
"epoch": 0.9997382884061764,
"step": 1910,
"total_flos": 0.0,
"train_loss": 0.6271847719921492,
"train_runtime": 17433.9091,
"train_samples_per_second": 3.507,
"train_steps_per_second": 0.11
}
],
"logging_steps": 10,
"max_steps": 1910,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}