{ "best_metric": 0.6194455623626709, "best_model_checkpoint": "./checkpoints_dpo_final_2/Phi-3-mini-4k-instruct/checkpoint-1500", "epoch": 2.0, "eval_steps": 50, "global_step": 1608, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004975124378109453, "grad_norm": 7.420848846435547, "learning_rate": 8.000000000000001e-07, "logits/chosen": 0.31535276770591736, "logits/rejected": 0.2069419026374817, "logps/chosen": -443.7961120605469, "logps/rejected": -403.4725341796875, "loss": 1.2553, "rewards/accuracies": 0.515625, "rewards/chosen": 7.078475475311279, "rewards/margins": 0.11215054243803024, "rewards/rejected": 6.966324329376221, "step": 4 }, { "epoch": 0.009950248756218905, "grad_norm": 7.148873805999756, "learning_rate": 1.6000000000000001e-06, "logits/chosen": 0.28295886516571045, "logits/rejected": 0.3022560179233551, "logps/chosen": -366.45233154296875, "logps/rejected": -426.3655090332031, "loss": 0.9806, "rewards/accuracies": 0.515625, "rewards/chosen": 6.658623218536377, "rewards/margins": -0.2977091073989868, "rewards/rejected": 6.956332683563232, "step": 8 }, { "epoch": 0.014925373134328358, "grad_norm": 8.879262924194336, "learning_rate": 2.4000000000000003e-06, "logits/chosen": 0.5261741876602173, "logits/rejected": 0.47682714462280273, "logps/chosen": -399.4539489746094, "logps/rejected": -382.5442810058594, "loss": 0.9681, "rewards/accuracies": 0.59375, "rewards/chosen": 6.928550720214844, "rewards/margins": 0.06680499017238617, "rewards/rejected": 6.861745357513428, "step": 12 }, { "epoch": 0.01990049751243781, "grad_norm": 12.120668411254883, "learning_rate": 3.2000000000000003e-06, "logits/chosen": 0.7348346710205078, "logits/rejected": 0.45035141706466675, "logps/chosen": -433.3005065917969, "logps/rejected": -387.620361328125, "loss": 1.1552, "rewards/accuracies": 0.609375, "rewards/chosen": 6.386252403259277, "rewards/margins": 0.23828373849391937, "rewards/rejected": 6.147968292236328, "step": 16 }, { "epoch": 0.024875621890547265, "grad_norm": 7.843857765197754, "learning_rate": 4.000000000000001e-06, "logits/chosen": 0.5343019962310791, "logits/rejected": 0.24399122595787048, "logps/chosen": -425.4538879394531, "logps/rejected": -353.02947998046875, "loss": 0.9466, "rewards/accuracies": 0.5, "rewards/chosen": 5.57993745803833, "rewards/margins": 0.3308122158050537, "rewards/rejected": 5.249125957489014, "step": 20 }, { "epoch": 0.029850746268656716, "grad_norm": 6.334160804748535, "learning_rate": 4.800000000000001e-06, "logits/chosen": 0.27708423137664795, "logits/rejected": 0.22941020131111145, "logps/chosen": -384.0626525878906, "logps/rejected": -335.5965881347656, "loss": 0.8395, "rewards/accuracies": 0.421875, "rewards/chosen": 4.043614864349365, "rewards/margins": -0.3541470766067505, "rewards/rejected": 4.397762298583984, "step": 24 }, { "epoch": 0.03482587064676617, "grad_norm": 6.785109519958496, "learning_rate": 5.600000000000001e-06, "logits/chosen": 0.3940538465976715, "logits/rejected": 0.13648821413516998, "logps/chosen": -459.2915954589844, "logps/rejected": -384.01031494140625, "loss": 0.7924, "rewards/accuracies": 0.609375, "rewards/chosen": 3.638526439666748, "rewards/margins": 0.16708242893218994, "rewards/rejected": 3.4714441299438477, "step": 28 }, { "epoch": 0.03980099502487562, "grad_norm": 6.778156280517578, "learning_rate": 6.4000000000000006e-06, "logits/chosen": 0.6169087886810303, "logits/rejected": 0.4341488480567932, "logps/chosen": -476.3402404785156, "logps/rejected": -463.748779296875, "loss": 0.7885, "rewards/accuracies": 0.59375, "rewards/chosen": 3.291264533996582, "rewards/margins": 0.29383713006973267, "rewards/rejected": 2.9974277019500732, "step": 32 }, { "epoch": 0.04477611940298507, "grad_norm": 7.815089702606201, "learning_rate": 7.2000000000000005e-06, "logits/chosen": 0.8724404573440552, "logits/rejected": 0.5648743510246277, "logps/chosen": -418.50372314453125, "logps/rejected": -364.5290222167969, "loss": 0.8768, "rewards/accuracies": 0.5, "rewards/chosen": 2.4819023609161377, "rewards/margins": -0.11123146116733551, "rewards/rejected": 2.5931336879730225, "step": 36 }, { "epoch": 0.04975124378109453, "grad_norm": 5.426750659942627, "learning_rate": 8.000000000000001e-06, "logits/chosen": 0.4279947876930237, "logits/rejected": 0.23432603478431702, "logps/chosen": -429.2681579589844, "logps/rejected": -391.44281005859375, "loss": 0.7756, "rewards/accuracies": 0.546875, "rewards/chosen": 2.358858585357666, "rewards/margins": 0.2736659348011017, "rewards/rejected": 2.0851926803588867, "step": 40 }, { "epoch": 0.05472636815920398, "grad_norm": 6.188413619995117, "learning_rate": 8.8e-06, "logits/chosen": 0.13682100176811218, "logits/rejected": 0.03174281492829323, "logps/chosen": -440.99224853515625, "logps/rejected": -466.3514404296875, "loss": 0.7254, "rewards/accuracies": 0.5, "rewards/chosen": 2.198638439178467, "rewards/margins": 0.15260140597820282, "rewards/rejected": 2.046036958694458, "step": 44 }, { "epoch": 0.05970149253731343, "grad_norm": 6.239718914031982, "learning_rate": 9.600000000000001e-06, "logits/chosen": 0.23234650492668152, "logits/rejected": 0.25346270203590393, "logps/chosen": -512.7527465820312, "logps/rejected": -494.216552734375, "loss": 0.7121, "rewards/accuracies": 0.53125, "rewards/chosen": 2.1018052101135254, "rewards/margins": 0.08936208486557007, "rewards/rejected": 2.0124430656433105, "step": 48 }, { "epoch": 0.06218905472636816, "eval_logits/chosen": 0.33167123794555664, "eval_logits/rejected": 0.1882716715335846, "eval_logps/chosen": -432.4385070800781, "eval_logps/rejected": -392.6837158203125, "eval_loss": 0.7078412175178528, "eval_rewards/accuracies": 0.5694444179534912, "eval_rewards/chosen": 1.9858527183532715, "eval_rewards/margins": 0.0740758553147316, "eval_rewards/rejected": 1.9117769002914429, "eval_runtime": 149.9332, "eval_samples_per_second": 7.623, "eval_steps_per_second": 0.24, "step": 50 }, { "epoch": 0.06467661691542288, "grad_norm": 6.6676530838012695, "learning_rate": 1.04e-05, "logits/chosen": 0.046434201300144196, "logits/rejected": 0.02955937385559082, "logps/chosen": -452.5304260253906, "logps/rejected": -471.0543518066406, "loss": 0.6929, "rewards/accuracies": 0.625, "rewards/chosen": 2.0271408557891846, "rewards/margins": 0.25260087847709656, "rewards/rejected": 1.7745399475097656, "step": 52 }, { "epoch": 0.06965174129353234, "grad_norm": 6.596746444702148, "learning_rate": 1.1200000000000001e-05, "logits/chosen": 0.5402776598930359, "logits/rejected": 0.4161326587200165, "logps/chosen": -451.6429748535156, "logps/rejected": -434.0633239746094, "loss": 0.7366, "rewards/accuracies": 0.515625, "rewards/chosen": 1.8598072528839111, "rewards/margins": -0.006673937663435936, "rewards/rejected": 1.8664811849594116, "step": 56 }, { "epoch": 0.07462686567164178, "grad_norm": 5.593472957611084, "learning_rate": 1.2e-05, "logits/chosen": 0.40079164505004883, "logits/rejected": 0.25033000111579895, "logps/chosen": -504.302490234375, "logps/rejected": -494.911376953125, "loss": 0.7328, "rewards/accuracies": 0.625, "rewards/chosen": 1.5626219511032104, "rewards/margins": 0.10345478355884552, "rewards/rejected": 1.4591671228408813, "step": 60 }, { "epoch": 0.07960199004975124, "grad_norm": 5.58954381942749, "learning_rate": 1.2800000000000001e-05, "logits/chosen": 0.25867709517478943, "logits/rejected": 0.14657628536224365, "logps/chosen": -431.3748474121094, "logps/rejected": -424.9607849121094, "loss": 0.6738, "rewards/accuracies": 0.59375, "rewards/chosen": 0.8101712465286255, "rewards/margins": 0.0693005919456482, "rewards/rejected": 0.7408705949783325, "step": 64 }, { "epoch": 0.0845771144278607, "grad_norm": 6.670442581176758, "learning_rate": 1.3600000000000002e-05, "logits/chosen": 0.3967319130897522, "logits/rejected": 0.3033946752548218, "logps/chosen": -457.9029541015625, "logps/rejected": -449.4042053222656, "loss": 0.6967, "rewards/accuracies": 0.578125, "rewards/chosen": 0.5467469692230225, "rewards/margins": 0.12546321749687195, "rewards/rejected": 0.4212837517261505, "step": 68 }, { "epoch": 0.08955223880597014, "grad_norm": 5.157428741455078, "learning_rate": 1.4400000000000001e-05, "logits/chosen": 0.5071850419044495, "logits/rejected": 0.3140091598033905, "logps/chosen": -429.99468994140625, "logps/rejected": -405.0130615234375, "loss": 0.6743, "rewards/accuracies": 0.640625, "rewards/chosen": 0.8408704996109009, "rewards/margins": 0.23394504189491272, "rewards/rejected": 0.6069254279136658, "step": 72 }, { "epoch": 0.0945273631840796, "grad_norm": 5.6009521484375, "learning_rate": 1.5200000000000002e-05, "logits/chosen": 0.2075415998697281, "logits/rejected": 0.07978951930999756, "logps/chosen": -417.7720947265625, "logps/rejected": -385.97576904296875, "loss": 0.7021, "rewards/accuracies": 0.609375, "rewards/chosen": 1.121203064918518, "rewards/margins": 0.08919668942689896, "rewards/rejected": 1.0320063829421997, "step": 76 }, { "epoch": 0.09950248756218906, "grad_norm": 5.1235575675964355, "learning_rate": 1.6000000000000003e-05, "logits/chosen": 0.4204176962375641, "logits/rejected": 0.03434094786643982, "logps/chosen": -658.8785400390625, "logps/rejected": -461.9447937011719, "loss": 0.6689, "rewards/accuracies": 0.59375, "rewards/chosen": 0.9897336363792419, "rewards/margins": 0.21804025769233704, "rewards/rejected": 0.7716932892799377, "step": 80 }, { "epoch": 0.1044776119402985, "grad_norm": 6.649357795715332, "learning_rate": 1.6800000000000002e-05, "logits/chosen": 0.2298906296491623, "logits/rejected": 0.1789359152317047, "logps/chosen": -478.6454772949219, "logps/rejected": -463.6629638671875, "loss": 0.7429, "rewards/accuracies": 0.59375, "rewards/chosen": 0.5649065971374512, "rewards/margins": 0.21340136229991913, "rewards/rejected": 0.35150521993637085, "step": 84 }, { "epoch": 0.10945273631840796, "grad_norm": 6.433568954467773, "learning_rate": 1.76e-05, "logits/chosen": 0.1520080417394638, "logits/rejected": 0.10301964730024338, "logps/chosen": -534.1607666015625, "logps/rejected": -517.918701171875, "loss": 0.6625, "rewards/accuracies": 0.65625, "rewards/chosen": 0.4300222396850586, "rewards/margins": 0.29336607456207275, "rewards/rejected": 0.13665619492530823, "step": 88 }, { "epoch": 0.11442786069651742, "grad_norm": 5.594571590423584, "learning_rate": 1.8400000000000003e-05, "logits/chosen": 0.18866638839244843, "logits/rejected": 0.03936055302619934, "logps/chosen": -477.4437255859375, "logps/rejected": -434.723388671875, "loss": 0.6865, "rewards/accuracies": 0.59375, "rewards/chosen": 0.16455253958702087, "rewards/margins": 0.15519294142723083, "rewards/rejected": 0.00935959443449974, "step": 92 }, { "epoch": 0.11940298507462686, "grad_norm": 4.924789905548096, "learning_rate": 1.9200000000000003e-05, "logits/chosen": 0.19729886949062347, "logits/rejected": 0.06473005563020706, "logps/chosen": -444.3799133300781, "logps/rejected": -394.0942687988281, "loss": 0.6609, "rewards/accuracies": 0.5625, "rewards/chosen": 0.34238868951797485, "rewards/margins": 0.23226764798164368, "rewards/rejected": 0.11012104153633118, "step": 96 }, { "epoch": 0.12437810945273632, "grad_norm": 9.16905403137207, "learning_rate": 2e-05, "logits/chosen": 0.1745055466890335, "logits/rejected": 0.18110498785972595, "logps/chosen": -590.113525390625, "logps/rejected": -565.5681762695312, "loss": 0.672, "rewards/accuracies": 0.578125, "rewards/chosen": 0.5021845102310181, "rewards/margins": 0.1258457899093628, "rewards/rejected": 0.3763387203216553, "step": 100 }, { "epoch": 0.12437810945273632, "eval_logits/chosen": 0.27222201228141785, "eval_logits/rejected": 0.13300225138664246, "eval_logps/chosen": -448.08441162109375, "eval_logps/rejected": -409.7933349609375, "eval_loss": 0.6717547178268433, "eval_rewards/accuracies": 0.5972222089767456, "eval_rewards/chosen": 0.42125940322875977, "eval_rewards/margins": 0.22044435143470764, "eval_rewards/rejected": 0.20081506669521332, "eval_runtime": 150.2898, "eval_samples_per_second": 7.605, "eval_steps_per_second": 0.24, "step": 100 }, { "epoch": 0.12935323383084577, "grad_norm": 5.174230098724365, "learning_rate": 1.9999652796146877e-05, "logits/chosen": 0.4161723256111145, "logits/rejected": 0.32582810521125793, "logps/chosen": -493.2930908203125, "logps/rejected": -458.1988830566406, "loss": 0.6712, "rewards/accuracies": 0.5625, "rewards/chosen": 0.37257951498031616, "rewards/margins": 0.16849718987941742, "rewards/rejected": 0.20408231019973755, "step": 104 }, { "epoch": 0.13432835820895522, "grad_norm": 5.811988353729248, "learning_rate": 1.9998611208697607e-05, "logits/chosen": 0.5949371457099915, "logits/rejected": 0.41842520236968994, "logps/chosen": -447.29522705078125, "logps/rejected": -407.77264404296875, "loss": 0.6549, "rewards/accuracies": 0.703125, "rewards/chosen": 0.040240589529275894, "rewards/margins": 0.27960366010665894, "rewards/rejected": -0.23936308920383453, "step": 108 }, { "epoch": 0.13930348258706468, "grad_norm": 6.341477870941162, "learning_rate": 1.9996875309980824e-05, "logits/chosen": 0.5326985120773315, "logits/rejected": 0.3219985067844391, "logps/chosen": -592.6687622070312, "logps/rejected": -506.40093994140625, "loss": 0.6684, "rewards/accuracies": 0.59375, "rewards/chosen": -0.08561475574970245, "rewards/margins": 0.18506459891796112, "rewards/rejected": -0.27067938446998596, "step": 112 }, { "epoch": 0.14427860696517414, "grad_norm": 6.128859519958496, "learning_rate": 1.9994445220538678e-05, "logits/chosen": 0.2585601210594177, "logits/rejected": 0.06527578085660934, "logps/chosen": -442.81512451171875, "logps/rejected": -460.4501953125, "loss": 0.6902, "rewards/accuracies": 0.59375, "rewards/chosen": 0.07791093736886978, "rewards/margins": 0.12571600079536438, "rewards/rejected": -0.04780507832765579, "step": 116 }, { "epoch": 0.14925373134328357, "grad_norm": 5.584373950958252, "learning_rate": 1.999132110911845e-05, "logits/chosen": 0.27150627970695496, "logits/rejected": 0.1847885251045227, "logps/chosen": -469.2530517578125, "logps/rejected": -458.80413818359375, "loss": 0.6793, "rewards/accuracies": 0.640625, "rewards/chosen": 0.3338521122932434, "rewards/margins": 0.2423708438873291, "rewards/rejected": 0.09148130565881729, "step": 120 }, { "epoch": 0.15422885572139303, "grad_norm": 5.340240955352783, "learning_rate": 1.9987503192660842e-05, "logits/chosen": 0.2772689759731293, "logits/rejected": 0.20361235737800598, "logps/chosen": -403.8421630859375, "logps/rejected": -364.796630859375, "loss": 0.6732, "rewards/accuracies": 0.53125, "rewards/chosen": 0.463238924741745, "rewards/margins": 0.17842896282672882, "rewards/rejected": 0.2848099172115326, "step": 124 }, { "epoch": 0.15920398009950248, "grad_norm": 5.592566013336182, "learning_rate": 1.9982991736284914e-05, "logits/chosen": 0.482767254114151, "logits/rejected": 0.42924097180366516, "logps/chosen": -474.4277648925781, "logps/rejected": -526.3604736328125, "loss": 0.6581, "rewards/accuracies": 0.625, "rewards/chosen": 0.6025639176368713, "rewards/margins": 0.2601732611656189, "rewards/rejected": 0.3423907160758972, "step": 128 }, { "epoch": 0.16417910447761194, "grad_norm": 6.180532932281494, "learning_rate": 1.997778705326968e-05, "logits/chosen": 0.20447391271591187, "logits/rejected": 0.13856717944145203, "logps/chosen": -433.2802734375, "logps/rejected": -459.3676452636719, "loss": 0.6757, "rewards/accuracies": 0.671875, "rewards/chosen": 0.37419936060905457, "rewards/margins": 0.33122357726097107, "rewards/rejected": 0.04297574609518051, "step": 132 }, { "epoch": 0.1691542288557214, "grad_norm": 5.7720417976379395, "learning_rate": 1.9971889505032337e-05, "logits/chosen": 0.37103909254074097, "logits/rejected": 0.18156485259532928, "logps/chosen": -431.2093200683594, "logps/rejected": -415.803955078125, "loss": 0.6676, "rewards/accuracies": 0.671875, "rewards/chosen": 0.00045023113489151, "rewards/margins": 0.28690749406814575, "rewards/rejected": -0.28645727038383484, "step": 136 }, { "epoch": 0.17412935323383086, "grad_norm": 5.279744625091553, "learning_rate": 1.9965299501103178e-05, "logits/chosen": 0.6684572696685791, "logits/rejected": 0.4265105724334717, "logps/chosen": -405.96636962890625, "logps/rejected": -363.99810791015625, "loss": 0.6718, "rewards/accuracies": 0.578125, "rewards/chosen": -0.31693071126937866, "rewards/margins": 0.12704896926879883, "rewards/rejected": -0.4439797103404999, "step": 140 }, { "epoch": 0.1791044776119403, "grad_norm": 5.412991046905518, "learning_rate": 1.995801749909715e-05, "logits/chosen": 0.3472476601600647, "logits/rejected": 0.1070006936788559, "logps/chosen": -525.67529296875, "logps/rejected": -470.4413146972656, "loss": 0.6922, "rewards/accuracies": 0.59375, "rewards/chosen": -0.21376881003379822, "rewards/margins": 0.07273076474666595, "rewards/rejected": -0.286499559879303, "step": 144 }, { "epoch": 0.18407960199004975, "grad_norm": 5.56497049331665, "learning_rate": 1.995004400468209e-05, "logits/chosen": 0.23391787707805634, "logits/rejected": 0.42092186212539673, "logps/chosen": -431.3445739746094, "logps/rejected": -513.2816772460938, "loss": 0.6803, "rewards/accuracies": 0.65625, "rewards/chosen": 0.5987527966499329, "rewards/margins": 0.20438729226589203, "rewards/rejected": 0.39436548948287964, "step": 148 }, { "epoch": 0.1865671641791045, "eval_logits/chosen": 0.3917093575000763, "eval_logits/rejected": 0.2565095126628876, "eval_logps/chosen": -440.29315185546875, "eval_logps/rejected": -402.7275390625, "eval_loss": 0.6632580161094666, "eval_rewards/accuracies": 0.6215277910232544, "eval_rewards/chosen": 1.2003861665725708, "eval_rewards/margins": 0.29299187660217285, "eval_rewards/rejected": 0.907394289970398, "eval_runtime": 150.4796, "eval_samples_per_second": 7.596, "eval_steps_per_second": 0.239, "step": 150 }, { "epoch": 0.1890547263681592, "grad_norm": 6.083770751953125, "learning_rate": 1.9941379571543597e-05, "logits/chosen": 0.33355918526649475, "logits/rejected": 0.4423186779022217, "logps/chosen": -489.43389892578125, "logps/rejected": -527.8333129882812, "loss": 0.7118, "rewards/accuracies": 0.609375, "rewards/chosen": 1.1078438758850098, "rewards/margins": 0.08143356442451477, "rewards/rejected": 1.0264102220535278, "step": 152 }, { "epoch": 0.19402985074626866, "grad_norm": 5.055530071258545, "learning_rate": 1.9932024801346583e-05, "logits/chosen": 0.37234047055244446, "logits/rejected": 0.23300248384475708, "logps/chosen": -445.1590270996094, "logps/rejected": -421.0417175292969, "loss": 0.6896, "rewards/accuracies": 0.546875, "rewards/chosen": 1.0396504402160645, "rewards/margins": 0.15197786688804626, "rewards/rejected": 0.8876725435256958, "step": 156 }, { "epoch": 0.19900497512437812, "grad_norm": 4.7338433265686035, "learning_rate": 1.992198034369349e-05, "logits/chosen": 0.016373004764318466, "logits/rejected": 0.12857607007026672, "logps/chosen": -392.64678955078125, "logps/rejected": -409.271240234375, "loss": 0.6344, "rewards/accuracies": 0.640625, "rewards/chosen": 0.56157386302948, "rewards/margins": 0.11771346628665924, "rewards/rejected": 0.44386038184165955, "step": 160 }, { "epoch": 0.20398009950248755, "grad_norm": 5.313661098480225, "learning_rate": 1.991124689607921e-05, "logits/chosen": 0.6525070667266846, "logits/rejected": 0.5595052242279053, "logps/chosen": -499.96746826171875, "logps/rejected": -459.646728515625, "loss": 0.6648, "rewards/accuracies": 0.5625, "rewards/chosen": 0.22787390649318695, "rewards/margins": 0.18890802562236786, "rewards/rejected": 0.038965899497270584, "step": 164 }, { "epoch": 0.208955223880597, "grad_norm": 5.3913984298706055, "learning_rate": 1.9899825203842613e-05, "logits/chosen": 0.4010236859321594, "logits/rejected": 0.2576262950897217, "logps/chosen": -378.7827453613281, "logps/rejected": -365.35235595703125, "loss": 0.6702, "rewards/accuracies": 0.609375, "rewards/chosen": 0.21431918442249298, "rewards/margins": 0.11841318756341934, "rewards/rejected": 0.09590599685907364, "step": 168 }, { "epoch": 0.21393034825870647, "grad_norm": 4.707008361816406, "learning_rate": 1.988771606011481e-05, "logits/chosen": 0.5776969790458679, "logits/rejected": 0.5886460542678833, "logps/chosen": -452.3276672363281, "logps/rejected": -492.7530822753906, "loss": 0.635, "rewards/accuracies": 0.71875, "rewards/chosen": 0.4167579114437103, "rewards/margins": 0.38403820991516113, "rewards/rejected": 0.03271971270442009, "step": 172 }, { "epoch": 0.21890547263681592, "grad_norm": 4.890253067016602, "learning_rate": 1.987492030576407e-05, "logits/chosen": 0.4215804934501648, "logits/rejected": 0.3395119309425354, "logps/chosen": -443.6938781738281, "logps/rejected": -448.883056640625, "loss": 0.6518, "rewards/accuracies": 0.671875, "rewards/chosen": 0.3662562072277069, "rewards/margins": 0.25069642066955566, "rewards/rejected": 0.11555974185466766, "step": 176 }, { "epoch": 0.22388059701492538, "grad_norm": 4.925017833709717, "learning_rate": 1.986143882933744e-05, "logits/chosen": 0.7153533697128296, "logits/rejected": 0.5962733626365662, "logps/chosen": -378.98199462890625, "logps/rejected": -362.8702697753906, "loss": 0.6265, "rewards/accuracies": 0.65625, "rewards/chosen": 0.10049009323120117, "rewards/margins": 0.24912574887275696, "rewards/rejected": -0.1486356258392334, "step": 180 }, { "epoch": 0.22885572139303484, "grad_norm": 4.816032409667969, "learning_rate": 1.9847272566999026e-05, "logits/chosen": 0.3551070988178253, "logits/rejected": 0.1886759102344513, "logps/chosen": -481.8218688964844, "logps/rejected": -461.4677429199219, "loss": 0.6018, "rewards/accuracies": 0.71875, "rewards/chosen": -0.11597292125225067, "rewards/margins": 0.4469318389892578, "rewards/rejected": -0.5629047155380249, "step": 184 }, { "epoch": 0.23383084577114427, "grad_norm": 5.482291221618652, "learning_rate": 1.9832422502465013e-05, "logits/chosen": 0.07703270018100739, "logits/rejected": 0.08134737610816956, "logps/chosen": -430.76470947265625, "logps/rejected": -479.6883239746094, "loss": 0.6444, "rewards/accuracies": 0.625, "rewards/chosen": -0.38663041591644287, "rewards/margins": 0.16125822067260742, "rewards/rejected": -0.5478886365890503, "step": 188 }, { "epoch": 0.23880597014925373, "grad_norm": 5.5123677253723145, "learning_rate": 1.9816889666935318e-05, "logits/chosen": 0.46063917875289917, "logits/rejected": 0.40867650508880615, "logps/chosen": -496.49615478515625, "logps/rejected": -474.017578125, "loss": 0.6574, "rewards/accuracies": 0.578125, "rewards/chosen": -0.224630206823349, "rewards/margins": 0.17337118089199066, "rewards/rejected": -0.39800137281417847, "step": 192 }, { "epoch": 0.24378109452736318, "grad_norm": 8.160882949829102, "learning_rate": 1.9800675139022006e-05, "logits/chosen": 0.5780532956123352, "logits/rejected": 0.3103576898574829, "logps/chosen": -491.5118103027344, "logps/rejected": -422.33807373046875, "loss": 0.6543, "rewards/accuracies": 0.609375, "rewards/chosen": -0.08265501260757446, "rewards/margins": 0.24835649132728577, "rewards/rejected": -0.33101150393486023, "step": 196 }, { "epoch": 0.24875621890547264, "grad_norm": 6.051442623138428, "learning_rate": 1.9783780044674402e-05, "logits/chosen": 0.5951110124588013, "logits/rejected": 0.5504649877548218, "logps/chosen": -438.7686767578125, "logps/rejected": -458.02325439453125, "loss": 0.6816, "rewards/accuracies": 0.5625, "rewards/chosen": -0.20991414785385132, "rewards/margins": 0.2382897138595581, "rewards/rejected": -0.4482038617134094, "step": 200 }, { "epoch": 0.24875621890547264, "eval_logits/chosen": 0.27061331272125244, "eval_logits/rejected": 0.13349506258964539, "eval_logps/chosen": -454.5816955566406, "eval_logps/rejected": -416.6123046875, "eval_loss": 0.6534828543663025, "eval_rewards/accuracies": 0.59375, "eval_rewards/chosen": -0.22846804559230804, "eval_rewards/margins": 0.25261345505714417, "eval_rewards/rejected": -0.4810815453529358, "eval_runtime": 150.5659, "eval_samples_per_second": 7.591, "eval_steps_per_second": 0.239, "step": 200 }, { "epoch": 0.2537313432835821, "grad_norm": 5.35200834274292, "learning_rate": 1.976620555710087e-05, "logits/chosen": 0.2719428837299347, "logits/rejected": 0.18390944600105286, "logps/chosen": -401.3759765625, "logps/rejected": -378.04510498046875, "loss": 0.6804, "rewards/accuracies": 0.640625, "rewards/chosen": -0.2212618738412857, "rewards/margins": 0.1665419489145279, "rewards/rejected": -0.3878038227558136, "step": 204 }, { "epoch": 0.25870646766169153, "grad_norm": 5.549587726593018, "learning_rate": 1.974795289668737e-05, "logits/chosen": 0.222773939371109, "logits/rejected": 0.27480173110961914, "logps/chosen": -450.5555419921875, "logps/rejected": -478.2663879394531, "loss": 0.6274, "rewards/accuracies": 0.625, "rewards/chosen": 0.5931901335716248, "rewards/margins": 0.2987501919269562, "rewards/rejected": 0.29443997144699097, "step": 208 }, { "epoch": 0.263681592039801, "grad_norm": 5.261623859405518, "learning_rate": 1.972902333091271e-05, "logits/chosen": 0.41583824157714844, "logits/rejected": 0.16713739931583405, "logps/chosen": -533.6800537109375, "logps/rejected": -458.5304260253906, "loss": 0.677, "rewards/accuracies": 0.546875, "rewards/chosen": 0.5807373523712158, "rewards/margins": 0.1561833620071411, "rewards/rejected": 0.4245539605617523, "step": 212 }, { "epoch": 0.26865671641791045, "grad_norm": 5.055637359619141, "learning_rate": 1.9709418174260523e-05, "logits/chosen": 0.3311361074447632, "logits/rejected": 0.3872915506362915, "logps/chosen": -467.373046875, "logps/rejected": -458.4536437988281, "loss": 0.647, "rewards/accuracies": 0.578125, "rewards/chosen": 0.7115026712417603, "rewards/margins": 0.30211564898490906, "rewards/rejected": 0.40938708186149597, "step": 216 }, { "epoch": 0.2736318407960199, "grad_norm": 4.662365913391113, "learning_rate": 1.9689138788127994e-05, "logits/chosen": 0.43617844581604004, "logits/rejected": 0.209380641579628, "logps/chosen": -391.93701171875, "logps/rejected": -352.4445495605469, "loss": 0.6663, "rewards/accuracies": 0.640625, "rewards/chosen": 0.24579453468322754, "rewards/margins": 0.27014172077178955, "rewards/rejected": -0.0243472121655941, "step": 220 }, { "epoch": 0.27860696517412936, "grad_norm": 5.244974136352539, "learning_rate": 1.966818658073133e-05, "logits/chosen": 0.179366797208786, "logits/rejected": 0.17232109606266022, "logps/chosen": -475.9603271484375, "logps/rejected": -503.4451904296875, "loss": 0.6791, "rewards/accuracies": 0.640625, "rewards/chosen": 0.10947009921073914, "rewards/margins": 0.0003622081130743027, "rewards/rejected": 0.10910789668560028, "step": 224 }, { "epoch": 0.2835820895522388, "grad_norm": 4.5364179611206055, "learning_rate": 1.9646563007007952e-05, "logits/chosen": 0.11134719103574753, "logits/rejected": -0.09881246089935303, "logps/chosen": -491.548828125, "logps/rejected": -504.40496826171875, "loss": 0.6516, "rewards/accuracies": 0.640625, "rewards/chosen": 0.12891899049282074, "rewards/margins": 0.29636111855506897, "rewards/rejected": -0.16744214296340942, "step": 228 }, { "epoch": 0.2885572139303483, "grad_norm": 4.562107086181641, "learning_rate": 1.9624269568515486e-05, "logits/chosen": 0.33666372299194336, "logits/rejected": 0.3560597896575928, "logps/chosen": -485.7892150878906, "logps/rejected": -458.96600341796875, "loss": 0.633, "rewards/accuracies": 0.578125, "rewards/chosen": 0.3998726010322571, "rewards/margins": 0.21638146042823792, "rewards/rejected": 0.18349118530750275, "step": 232 }, { "epoch": 0.2935323383084577, "grad_norm": 6.737706661224365, "learning_rate": 1.960130781332748e-05, "logits/chosen": 0.6583088040351868, "logits/rejected": 0.5398542284965515, "logps/chosen": -500.09442138671875, "logps/rejected": -470.6582946777344, "loss": 0.6685, "rewards/accuracies": 0.671875, "rewards/chosen": 0.579067587852478, "rewards/margins": 0.4634256958961487, "rewards/rejected": 0.11564186215400696, "step": 236 }, { "epoch": 0.29850746268656714, "grad_norm": 5.656876087188721, "learning_rate": 1.957767933592591e-05, "logits/chosen": 0.40276038646698, "logits/rejected": 0.3526462912559509, "logps/chosen": -452.8072509765625, "logps/rejected": -455.0268249511719, "loss": 0.6849, "rewards/accuracies": 0.640625, "rewards/chosen": 0.6671885848045349, "rewards/margins": 0.457474946975708, "rewards/rejected": 0.20971357822418213, "step": 240 }, { "epoch": 0.3034825870646766, "grad_norm": 4.534661769866943, "learning_rate": 1.955338577709046e-05, "logits/chosen": 0.11831162869930267, "logits/rejected": -0.027393575757741928, "logps/chosen": -446.3404846191406, "logps/rejected": -409.2080078125, "loss": 0.6423, "rewards/accuracies": 0.65625, "rewards/chosen": 0.14041246473789215, "rewards/margins": 0.2885099947452545, "rewards/rejected": -0.14809754490852356, "step": 244 }, { "epoch": 0.30845771144278605, "grad_norm": 4.870954513549805, "learning_rate": 1.9528428823784567e-05, "logits/chosen": -0.06817762553691864, "logits/rejected": 0.10090361535549164, "logps/chosen": -417.2574157714844, "logps/rejected": -500.5931701660156, "loss": 0.6719, "rewards/accuracies": 0.53125, "rewards/chosen": -0.2122226357460022, "rewards/margins": 0.06963346153497696, "rewards/rejected": -0.28185608983039856, "step": 248 }, { "epoch": 0.31094527363184077, "eval_logits/chosen": 0.24545568227767944, "eval_logits/rejected": 0.10711152106523514, "eval_logps/chosen": -453.1002502441406, "eval_logps/rejected": -414.6319580078125, "eval_loss": 0.6768244504928589, "eval_rewards/accuracies": 0.6006944179534912, "eval_rewards/chosen": -0.08032441139221191, "eval_rewards/margins": 0.20272159576416016, "eval_rewards/rejected": -0.2830459773540497, "eval_runtime": 150.4022, "eval_samples_per_second": 7.6, "eval_steps_per_second": 0.239, "step": 250 }, { "epoch": 0.31343283582089554, "grad_norm": 4.674103260040283, "learning_rate": 1.9502810209038302e-05, "logits/chosen": 0.2548333406448364, "logits/rejected": 0.23590323328971863, "logps/chosen": -441.0978698730469, "logps/rejected": -444.6314697265625, "loss": 0.6797, "rewards/accuracies": 0.484375, "rewards/chosen": -0.21375508606433868, "rewards/margins": 0.04151350259780884, "rewards/rejected": -0.25526857376098633, "step": 252 }, { "epoch": 0.31840796019900497, "grad_norm": 15.962188720703125, "learning_rate": 1.9476531711828027e-05, "logits/chosen": 0.22388213872909546, "logits/rejected": 0.02985329180955887, "logps/chosen": -527.328369140625, "logps/rejected": -451.01165771484375, "loss": 0.6971, "rewards/accuracies": 0.53125, "rewards/chosen": 0.01938755437731743, "rewards/margins": 0.19766713678836823, "rewards/rejected": -0.17827960848808289, "step": 256 }, { "epoch": 0.32338308457711445, "grad_norm": 4.411048412322998, "learning_rate": 1.9449595156952827e-05, "logits/chosen": 0.09123142063617706, "logits/rejected": 0.008157305419445038, "logps/chosen": -464.3664245605469, "logps/rejected": -449.2779846191406, "loss": 0.6432, "rewards/accuracies": 0.609375, "rewards/chosen": 0.09136360138654709, "rewards/margins": 0.1354491412639618, "rewards/rejected": -0.044085558503866196, "step": 260 }, { "epoch": 0.3283582089552239, "grad_norm": 4.79756498336792, "learning_rate": 1.9422002414907837e-05, "logits/chosen": 0.4070839583873749, "logits/rejected": 0.3463619649410248, "logps/chosen": -413.456298828125, "logps/rejected": -413.7463073730469, "loss": 0.6395, "rewards/accuracies": 0.625, "rewards/chosen": 0.4945845901966095, "rewards/margins": 0.20277410745620728, "rewards/rejected": 0.29181045293807983, "step": 264 }, { "epoch": 0.3333333333333333, "grad_norm": 4.720849990844727, "learning_rate": 1.9393755401754324e-05, "logits/chosen": 0.29830023646354675, "logits/rejected": 0.3905254602432251, "logps/chosen": -390.8925476074219, "logps/rejected": -460.8228759765625, "loss": 0.6237, "rewards/accuracies": 0.625, "rewards/chosen": 0.34830933809280396, "rewards/margins": 0.27932432293891907, "rewards/rejected": 0.06898501515388489, "step": 268 }, { "epoch": 0.3383084577114428, "grad_norm": 5.378329277038574, "learning_rate": 1.936485607898665e-05, "logits/chosen": 0.07186523079872131, "logits/rejected": 0.15830281376838684, "logps/chosen": -393.9452819824219, "logps/rejected": -455.62957763671875, "loss": 0.6858, "rewards/accuracies": 0.515625, "rewards/chosen": 0.17164385318756104, "rewards/margins": 0.09308388829231262, "rewards/rejected": 0.0785599797964096, "step": 272 }, { "epoch": 0.34328358208955223, "grad_norm": 4.18524169921875, "learning_rate": 1.9335306453396066e-05, "logits/chosen": 0.056332044303417206, "logits/rejected": 0.07097341120243073, "logps/chosen": -490.37994384765625, "logps/rejected": -514.7352294921875, "loss": 0.6139, "rewards/accuracies": 0.671875, "rewards/chosen": 0.11651282012462616, "rewards/margins": 0.41814491152763367, "rewards/rejected": -0.3016320765018463, "step": 276 }, { "epoch": 0.3482587064676617, "grad_norm": 5.413318634033203, "learning_rate": 1.9305108576931336e-05, "logits/chosen": 0.01699664443731308, "logits/rejected": -0.03439049795269966, "logps/chosen": -382.8931579589844, "logps/rejected": -419.8720703125, "loss": 0.6516, "rewards/accuracies": 0.609375, "rewards/chosen": -0.3111031949520111, "rewards/margins": 0.21040624380111694, "rewards/rejected": -0.5215094089508057, "step": 280 }, { "epoch": 0.35323383084577115, "grad_norm": 5.107039928436279, "learning_rate": 1.927426454655627e-05, "logits/chosen": 0.30719754099845886, "logits/rejected": 0.2690942883491516, "logps/chosen": -494.9206237792969, "logps/rejected": -498.79901123046875, "loss": 0.6475, "rewards/accuracies": 0.515625, "rewards/chosen": -0.8447175025939941, "rewards/margins": 0.18089117109775543, "rewards/rejected": -1.0256086587905884, "step": 284 }, { "epoch": 0.3582089552238806, "grad_norm": 4.356219291687012, "learning_rate": 1.924277650410412e-05, "logits/chosen": 0.13748708367347717, "logits/rejected": 0.2504044473171234, "logps/chosen": -548.0153198242188, "logps/rejected": -559.4176635742188, "loss": 0.6994, "rewards/accuracies": 0.546875, "rewards/chosen": -0.6704975366592407, "rewards/margins": -0.03709391877055168, "rewards/rejected": -0.6334035992622375, "step": 288 }, { "epoch": 0.36318407960199006, "grad_norm": 4.30932092666626, "learning_rate": 1.9210646636128805e-05, "logits/chosen": 0.16785617172718048, "logits/rejected": 0.32375362515449524, "logps/chosen": -417.7137145996094, "logps/rejected": -482.4889221191406, "loss": 0.6539, "rewards/accuracies": 0.546875, "rewards/chosen": -0.3137147128582001, "rewards/margins": 0.08970025926828384, "rewards/rejected": -0.4034149646759033, "step": 292 }, { "epoch": 0.3681592039800995, "grad_norm": 4.273219108581543, "learning_rate": 1.9177877173753127e-05, "logits/chosen": 0.1516554057598114, "logits/rejected": 0.0621149055659771, "logps/chosen": -439.8550109863281, "logps/rejected": -445.9311218261719, "loss": 0.6221, "rewards/accuracies": 0.65625, "rewards/chosen": -0.12972819805145264, "rewards/margins": 0.31710904836654663, "rewards/rejected": -0.44683724641799927, "step": 296 }, { "epoch": 0.373134328358209, "grad_norm": 4.085379123687744, "learning_rate": 1.91444703925138e-05, "logits/chosen": 0.2226869910955429, "logits/rejected": 0.2288302779197693, "logps/chosen": -402.9095458984375, "logps/rejected": -436.23846435546875, "loss": 0.642, "rewards/accuracies": 0.5625, "rewards/chosen": 0.1526205837726593, "rewards/margins": 0.27878373861312866, "rewards/rejected": -0.12616315484046936, "step": 300 }, { "epoch": 0.373134328358209, "eval_logits/chosen": 0.2271442711353302, "eval_logits/rejected": 0.08639353513717651, "eval_logps/chosen": -448.8922119140625, "eval_logps/rejected": -411.57562255859375, "eval_loss": 0.6402102112770081, "eval_rewards/accuracies": 0.6145833134651184, "eval_rewards/chosen": 0.3404841423034668, "eval_rewards/margins": 0.31789708137512207, "eval_rewards/rejected": 0.022587047889828682, "eval_runtime": 149.8006, "eval_samples_per_second": 7.63, "eval_steps_per_second": 0.24, "step": 300 }, { "epoch": 0.3781094527363184, "grad_norm": 4.7133870124816895, "learning_rate": 1.9110428612203463e-05, "logits/chosen": 0.28455495834350586, "logits/rejected": 0.3236948847770691, "logps/chosen": -557.7841186523438, "logps/rejected": -595.4920654296875, "loss": 0.6539, "rewards/accuracies": 0.59375, "rewards/chosen": 0.502812922000885, "rewards/margins": 0.31084102392196655, "rewards/rejected": 0.19197186827659607, "step": 304 }, { "epoch": 0.38308457711442784, "grad_norm": 4.747364521026611, "learning_rate": 1.9075754196709574e-05, "logits/chosen": 0.3259233832359314, "logits/rejected": 0.2481708824634552, "logps/chosen": -431.3799133300781, "logps/rejected": -437.3810729980469, "loss": 0.6545, "rewards/accuracies": 0.625, "rewards/chosen": 0.463757187128067, "rewards/margins": 0.22637638449668884, "rewards/rejected": 0.23738083243370056, "step": 308 }, { "epoch": 0.3880597014925373, "grad_norm": 4.707996368408203, "learning_rate": 1.904044955385026e-05, "logits/chosen": 0.2886297404766083, "logits/rejected": 0.035777147859334946, "logps/chosen": -497.3841857910156, "logps/rejected": -406.03729248046875, "loss": 0.6223, "rewards/accuracies": 0.75, "rewards/chosen": 0.6689484119415283, "rewards/margins": 0.5415085554122925, "rewards/rejected": 0.12743981182575226, "step": 312 }, { "epoch": 0.39303482587064675, "grad_norm": 4.470433235168457, "learning_rate": 1.9004517135207127e-05, "logits/chosen": 0.22225256264209747, "logits/rejected": 0.2989833652973175, "logps/chosen": -394.5459289550781, "logps/rejected": -429.8094177246094, "loss": 0.6654, "rewards/accuracies": 0.59375, "rewards/chosen": 0.2795145511627197, "rewards/margins": 0.16655004024505615, "rewards/rejected": 0.11296449601650238, "step": 316 }, { "epoch": 0.39800995024875624, "grad_norm": 4.836558818817139, "learning_rate": 1.8967959435955027e-05, "logits/chosen": 0.37761908769607544, "logits/rejected": 0.26500552892684937, "logps/chosen": -482.1424560546875, "logps/rejected": -428.25, "loss": 0.6482, "rewards/accuracies": 0.453125, "rewards/chosen": -0.08129014819860458, "rewards/margins": 0.13191911578178406, "rewards/rejected": -0.21320928633213043, "step": 320 }, { "epoch": 0.40298507462686567, "grad_norm": 5.268253326416016, "learning_rate": 1.893077899468876e-05, "logits/chosen": 0.2713007926940918, "logits/rejected": 0.04821309447288513, "logps/chosen": -563.9439697265625, "logps/rejected": -503.7855529785156, "loss": 0.6486, "rewards/accuracies": 0.703125, "rewards/chosen": -0.37922781705856323, "rewards/margins": 0.29474934935569763, "rewards/rejected": -0.6739771366119385, "step": 324 }, { "epoch": 0.4079601990049751, "grad_norm": 4.993300437927246, "learning_rate": 1.889297839324682e-05, "logits/chosen": 0.34269845485687256, "logits/rejected": 0.27501022815704346, "logps/chosen": -438.5770568847656, "logps/rejected": -437.1994934082031, "loss": 0.6593, "rewards/accuracies": 0.515625, "rewards/chosen": -0.48632100224494934, "rewards/margins": 0.22347672283649445, "rewards/rejected": -0.709797739982605, "step": 328 }, { "epoch": 0.4129353233830846, "grad_norm": 4.8070149421691895, "learning_rate": 1.8854560256532098e-05, "logits/chosen": 0.04936538636684418, "logits/rejected": -0.0027198120951652527, "logps/chosen": -438.79168701171875, "logps/rejected": -428.35308837890625, "loss": 0.6556, "rewards/accuracies": 0.609375, "rewards/chosen": -0.2654213309288025, "rewards/margins": 0.41919708251953125, "rewards/rejected": -0.6846184134483337, "step": 332 }, { "epoch": 0.417910447761194, "grad_norm": 4.997420787811279, "learning_rate": 1.8815527252329624e-05, "logits/chosen": 0.2193477749824524, "logits/rejected": 0.03042268194258213, "logps/chosen": -468.4323425292969, "logps/rejected": -426.1119384765625, "loss": 0.6168, "rewards/accuracies": 0.65625, "rewards/chosen": -0.03436838462948799, "rewards/margins": 0.521725058555603, "rewards/rejected": -0.5560933351516724, "step": 336 }, { "epoch": 0.4228855721393035, "grad_norm": 4.165882110595703, "learning_rate": 1.8775882091121282e-05, "logits/chosen": 0.5012113451957703, "logits/rejected": 0.35550257563591003, "logps/chosen": -505.60626220703125, "logps/rejected": -438.73095703125, "loss": 0.6309, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1919064074754715, "rewards/margins": 0.44131097197532654, "rewards/rejected": -0.24940457940101624, "step": 340 }, { "epoch": 0.42786069651741293, "grad_norm": 4.5491251945495605, "learning_rate": 1.8735627525897618e-05, "logits/chosen": 0.3401688039302826, "logits/rejected": 0.10173173248767853, "logps/chosen": -449.0252990722656, "logps/rejected": -379.44598388671875, "loss": 0.6475, "rewards/accuracies": 0.671875, "rewards/chosen": 0.304412841796875, "rewards/margins": 0.298681378364563, "rewards/rejected": 0.005731441080570221, "step": 344 }, { "epoch": 0.43283582089552236, "grad_norm": 4.290804862976074, "learning_rate": 1.8694766351966665e-05, "logits/chosen": 0.20657242834568024, "logits/rejected": 0.16187314689159393, "logps/chosen": -430.30169677734375, "logps/rejected": -508.4122314453125, "loss": 0.6675, "rewards/accuracies": 0.765625, "rewards/chosen": 0.5791266560554504, "rewards/margins": 0.42658331990242004, "rewards/rejected": 0.152543306350708, "step": 348 }, { "epoch": 0.43532338308457713, "eval_logits/chosen": 0.27793338894844055, "eval_logits/rejected": 0.1382322907447815, "eval_logps/chosen": -444.71087646484375, "eval_logps/rejected": -407.1244201660156, "eval_loss": 0.6471754908561707, "eval_rewards/accuracies": 0.6006944179534912, "eval_rewards/chosen": 0.7586135864257812, "eval_rewards/margins": 0.29090631008148193, "eval_rewards/rejected": 0.4677073061466217, "eval_runtime": 150.2506, "eval_samples_per_second": 7.607, "eval_steps_per_second": 0.24, "step": 350 }, { "epoch": 0.43781094527363185, "grad_norm": 4.04841947555542, "learning_rate": 1.8653301406759827e-05, "logits/chosen": 0.26602596044540405, "logits/rejected": 0.26264214515686035, "logps/chosen": -414.1706848144531, "logps/rejected": -394.3015441894531, "loss": 0.7136, "rewards/accuracies": 0.4375, "rewards/chosen": 0.5916293263435364, "rewards/margins": -0.07215756177902222, "rewards/rejected": 0.6637868881225586, "step": 352 }, { "epoch": 0.4427860696517413, "grad_norm": 4.154921054840088, "learning_rate": 1.8611235569634852e-05, "logits/chosen": 0.47313758730888367, "logits/rejected": 0.21173089742660522, "logps/chosen": -429.60491943359375, "logps/rejected": -392.5804138183594, "loss": 0.6954, "rewards/accuracies": 0.53125, "rewards/chosen": 0.679366946220398, "rewards/margins": 0.12296590954065323, "rewards/rejected": 0.5564010739326477, "step": 356 }, { "epoch": 0.44776119402985076, "grad_norm": 3.9639251232147217, "learning_rate": 1.8568571761675893e-05, "logits/chosen": 0.4981469213962555, "logits/rejected": 0.49814143776893616, "logps/chosen": -423.76898193359375, "logps/rejected": -453.06573486328125, "loss": 0.6729, "rewards/accuracies": 0.515625, "rewards/chosen": 0.6893520355224609, "rewards/margins": 0.2094535529613495, "rewards/rejected": 0.47989848256111145, "step": 360 }, { "epoch": 0.4527363184079602, "grad_norm": 4.174687385559082, "learning_rate": 1.8525312945490647e-05, "logits/chosen": 0.1745152622461319, "logits/rejected": 0.22328950464725494, "logps/chosen": -420.2294616699219, "logps/rejected": -452.5687255859375, "loss": 0.6294, "rewards/accuracies": 0.734375, "rewards/chosen": 0.4526810348033905, "rewards/margins": 0.409667432308197, "rewards/rejected": 0.043013621121644974, "step": 364 }, { "epoch": 0.4577114427860697, "grad_norm": 5.690698146820068, "learning_rate": 1.8481462125004647e-05, "logits/chosen": 0.3042501211166382, "logits/rejected": 0.19751590490341187, "logps/chosen": -480.2320556640625, "logps/rejected": -409.99993896484375, "loss": 0.6514, "rewards/accuracies": 0.453125, "rewards/chosen": -0.034170668572187424, "rewards/margins": 0.10836675763130188, "rewards/rejected": -0.1425374299287796, "step": 368 }, { "epoch": 0.4626865671641791, "grad_norm": 3.7740769386291504, "learning_rate": 1.8437022345252666e-05, "logits/chosen": 0.410859614610672, "logits/rejected": 0.2786995470523834, "logps/chosen": -536.8661499023438, "logps/rejected": -485.7401123046875, "loss": 0.6416, "rewards/accuracies": 0.625, "rewards/chosen": 0.021265551447868347, "rewards/margins": 0.20183995366096497, "rewards/rejected": -0.1805744171142578, "step": 372 }, { "epoch": 0.46766169154228854, "grad_norm": 4.466541290283203, "learning_rate": 1.8391996692167242e-05, "logits/chosen": 0.36077880859375, "logits/rejected": 0.02420664392411709, "logps/chosen": -574.6773071289062, "logps/rejected": -416.6241455078125, "loss": 0.7154, "rewards/accuracies": 0.59375, "rewards/chosen": -0.05402504652738571, "rewards/margins": 0.175716370344162, "rewards/rejected": -0.2297414094209671, "step": 376 }, { "epoch": 0.472636815920398, "grad_norm": 4.715292930603027, "learning_rate": 1.8346388292364438e-05, "logits/chosen": 0.5576101541519165, "logits/rejected": 0.2390051931142807, "logps/chosen": -482.841796875, "logps/rejected": -415.119384765625, "loss": 0.6533, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0752110630273819, "rewards/margins": 0.1834164559841156, "rewards/rejected": -0.2586275339126587, "step": 380 }, { "epoch": 0.47761194029850745, "grad_norm": 4.4041523933410645, "learning_rate": 1.8300200312926674e-05, "logits/chosen": 0.4594465494155884, "logits/rejected": 0.21978969871997833, "logps/chosen": -478.629638671875, "logps/rejected": -375.6353454589844, "loss": 0.6265, "rewards/accuracies": 0.578125, "rewards/chosen": -0.09999266266822815, "rewards/margins": 0.25770846009254456, "rewards/rejected": -0.3577011227607727, "step": 384 }, { "epoch": 0.48258706467661694, "grad_norm": 4.524245738983154, "learning_rate": 1.8253435961182844e-05, "logits/chosen": 0.011010982096195221, "logits/rejected": -0.07573414593935013, "logps/chosen": -508.1129455566406, "logps/rejected": -466.13006591796875, "loss": 0.6485, "rewards/accuracies": 0.640625, "rewards/chosen": 0.21897917985916138, "rewards/margins": 0.22618308663368225, "rewards/rejected": -0.007203895598649979, "step": 388 }, { "epoch": 0.48756218905472637, "grad_norm": 3.8508663177490234, "learning_rate": 1.8206098484485563e-05, "logits/chosen": 0.17437395453453064, "logits/rejected": 0.12683795392513275, "logps/chosen": -448.64056396484375, "logps/rejected": -439.05767822265625, "loss": 0.6487, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2330358326435089, "rewards/margins": 0.20018717646598816, "rewards/rejected": 0.03284864127635956, "step": 392 }, { "epoch": 0.4925373134328358, "grad_norm": 4.620991230010986, "learning_rate": 1.8158191169985696e-05, "logits/chosen": 0.18229001760482788, "logits/rejected": 0.053403086960315704, "logps/chosen": -529.84814453125, "logps/rejected": -488.3792724609375, "loss": 0.6234, "rewards/accuracies": 0.65625, "rewards/chosen": 0.21858179569244385, "rewards/margins": 0.3658568859100342, "rewards/rejected": -0.14727509021759033, "step": 396 }, { "epoch": 0.4975124378109453, "grad_norm": 4.5922722816467285, "learning_rate": 1.810971734440408e-05, "logits/chosen": 0.30341237783432007, "logits/rejected": 0.07493434846401215, "logps/chosen": -452.9410705566406, "logps/rejected": -400.3564453125, "loss": 0.6581, "rewards/accuracies": 0.53125, "rewards/chosen": 0.029513243585824966, "rewards/margins": 0.16814345121383667, "rewards/rejected": -0.1386302411556244, "step": 400 }, { "epoch": 0.4975124378109453, "eval_logits/chosen": 0.17696020007133484, "eval_logits/rejected": 0.03260684758424759, "eval_logps/chosen": -452.606689453125, "eval_logps/rejected": -414.8606872558594, "eval_loss": 0.6501542925834656, "eval_rewards/accuracies": 0.6180555820465088, "eval_rewards/chosen": -0.030969224870204926, "eval_rewards/margins": 0.27494877576828003, "eval_rewards/rejected": -0.3059180676937103, "eval_runtime": 150.3142, "eval_samples_per_second": 7.604, "eval_steps_per_second": 0.239, "step": 400 }, { "epoch": 0.5024875621890548, "grad_norm": 3.963479518890381, "learning_rate": 1.806068037380052e-05, "logits/chosen": 0.27582094073295593, "logits/rejected": 0.19119888544082642, "logps/chosen": -423.74456787109375, "logps/rejected": -438.787841796875, "loss": 0.6637, "rewards/accuracies": 0.546875, "rewards/chosen": -0.0536828339099884, "rewards/margins": 0.13508188724517822, "rewards/rejected": -0.18876472115516663, "step": 404 }, { "epoch": 0.5074626865671642, "grad_norm": 4.1663713455200195, "learning_rate": 1.801108366334004e-05, "logits/chosen": 0.17915582656860352, "logits/rejected": 0.18883880972862244, "logps/chosen": -480.3377380371094, "logps/rejected": -529.461669921875, "loss": 0.6489, "rewards/accuracies": 0.625, "rewards/chosen": -0.4978042244911194, "rewards/margins": 0.31464439630508423, "rewards/rejected": -0.8124486207962036, "step": 408 }, { "epoch": 0.5124378109452736, "grad_norm": 3.5810389518737793, "learning_rate": 1.796093065705644e-05, "logits/chosen": 0.3043825030326843, "logits/rejected": 0.20817437767982483, "logps/chosen": -431.47955322265625, "logps/rejected": -417.6255798339844, "loss": 0.6157, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6938939690589905, "rewards/margins": 0.24841205775737762, "rewards/rejected": -0.9423060417175293, "step": 412 }, { "epoch": 0.5174129353233831, "grad_norm": 4.341555118560791, "learning_rate": 1.791022483761312e-05, "logits/chosen": 0.2805790603160858, "logits/rejected": 0.07360462844371796, "logps/chosen": -518.8629760742188, "logps/rejected": -453.9353332519531, "loss": 0.6335, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8408029079437256, "rewards/margins": 0.42247286438941956, "rewards/rejected": -1.2632758617401123, "step": 416 }, { "epoch": 0.5223880597014925, "grad_norm": 4.9308390617370605, "learning_rate": 1.7858969726061262e-05, "logits/chosen": 0.061581894755363464, "logits/rejected": 0.14411726593971252, "logps/chosen": -428.17498779296875, "logps/rejected": -457.570068359375, "loss": 0.6959, "rewards/accuracies": 0.546875, "rewards/chosen": -1.0671128034591675, "rewards/margins": 0.08442307263612747, "rewards/rejected": -1.151535987854004, "step": 420 }, { "epoch": 0.527363184079602, "grad_norm": 4.160035133361816, "learning_rate": 1.7807168881595304e-05, "logits/chosen": -0.10161225497722626, "logits/rejected": -0.09652488678693771, "logps/chosen": -465.89825439453125, "logps/rejected": -476.0804138183594, "loss": 0.6391, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8282710313796997, "rewards/margins": 0.3242005407810211, "rewards/rejected": -1.152471661567688, "step": 424 }, { "epoch": 0.5323383084577115, "grad_norm": 4.315358638763428, "learning_rate": 1.7754825901305814e-05, "logits/chosen": 0.30026042461395264, "logits/rejected": 0.15877141058444977, "logps/chosen": -469.1257019042969, "logps/rejected": -489.8163757324219, "loss": 0.6313, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5448905825614929, "rewards/margins": 0.34440505504608154, "rewards/rejected": -0.8892955780029297, "step": 428 }, { "epoch": 0.5373134328358209, "grad_norm": 4.043447017669678, "learning_rate": 1.7701944419929673e-05, "logits/chosen": 0.3924216628074646, "logits/rejected": 0.34802040457725525, "logps/chosen": -483.4385070800781, "logps/rejected": -494.6759033203125, "loss": 0.6521, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4959757328033447, "rewards/margins": 0.3944730758666992, "rewards/rejected": -0.890448808670044, "step": 432 }, { "epoch": 0.5422885572139303, "grad_norm": 4.426882266998291, "learning_rate": 1.7648528109597704e-05, "logits/chosen": 0.42673125863075256, "logits/rejected": 0.25516799092292786, "logps/chosen": -504.686279296875, "logps/rejected": -443.46954345703125, "loss": 0.614, "rewards/accuracies": 0.65625, "rewards/chosen": -0.49745240807533264, "rewards/margins": 0.42378664016723633, "rewards/rejected": -0.9212391376495361, "step": 436 }, { "epoch": 0.5472636815920398, "grad_norm": 4.089346885681152, "learning_rate": 1.7594580679579654e-05, "logits/chosen": 0.09302594512701035, "logits/rejected": 0.11728382110595703, "logps/chosen": -459.3074951171875, "logps/rejected": -414.81268310546875, "loss": 0.6545, "rewards/accuracies": 0.640625, "rewards/chosen": -0.38374608755111694, "rewards/margins": 0.3443138897418976, "rewards/rejected": -0.7280599474906921, "step": 440 }, { "epoch": 0.5522388059701493, "grad_norm": 3.8262646198272705, "learning_rate": 1.7540105876026647e-05, "logits/chosen": 0.20306290686130524, "logits/rejected": 0.07559295743703842, "logps/chosen": -558.5977172851562, "logps/rejected": -493.43841552734375, "loss": 0.6138, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0867682546377182, "rewards/margins": 0.407266229391098, "rewards/rejected": -0.32049790024757385, "step": 444 }, { "epoch": 0.5572139303482587, "grad_norm": 4.2944440841674805, "learning_rate": 1.7485107481711014e-05, "logits/chosen": 0.20840412378311157, "logits/rejected": 0.08403539657592773, "logps/chosen": -517.5396728515625, "logps/rejected": -472.1680908203125, "loss": 0.6155, "rewards/accuracies": 0.6875, "rewards/chosen": 0.10591678321361542, "rewards/margins": 0.29278436303138733, "rewards/rejected": -0.18686755001544952, "step": 448 }, { "epoch": 0.5597014925373134, "eval_logits/chosen": 0.24904420971870422, "eval_logits/rejected": 0.11017153412103653, "eval_logps/chosen": -452.0428161621094, "eval_logps/rejected": -414.6964111328125, "eval_loss": 0.6415970921516418, "eval_rewards/accuracies": 0.625, "eval_rewards/chosen": 0.025422947481274605, "eval_rewards/margins": 0.3149137794971466, "eval_rewards/rejected": -0.28949081897735596, "eval_runtime": 150.2184, "eval_samples_per_second": 7.609, "eval_steps_per_second": 0.24, "step": 450 }, { "epoch": 0.5621890547263682, "grad_norm": 4.054657936096191, "learning_rate": 1.7429589315763637e-05, "logits/chosen": 0.2601884603500366, "logits/rejected": 0.022673480212688446, "logps/chosen": -499.178466796875, "logps/rejected": -424.4082946777344, "loss": 0.6285, "rewards/accuracies": 0.59375, "rewards/chosen": 0.014242544770240784, "rewards/margins": 0.29500868916511536, "rewards/rejected": -0.28076615929603577, "step": 452 }, { "epoch": 0.5671641791044776, "grad_norm": 4.114813804626465, "learning_rate": 1.737355523340875e-05, "logits/chosen": 0.2519476115703583, "logits/rejected": 0.17674781382083893, "logps/chosen": -425.04718017578125, "logps/rejected": -395.718505859375, "loss": 0.604, "rewards/accuracies": 0.578125, "rewards/chosen": 0.1875460147857666, "rewards/margins": 0.26022982597351074, "rewards/rejected": -0.07268380373716354, "step": 456 }, { "epoch": 0.572139303482587, "grad_norm": 4.393073558807373, "learning_rate": 1.7317009125696208e-05, "logits/chosen": 0.3865906298160553, "logits/rejected": 0.1851556897163391, "logps/chosen": -487.2419738769531, "logps/rejected": -482.6796875, "loss": 0.6472, "rewards/accuracies": 0.734375, "rewards/chosen": 0.4721629023551941, "rewards/margins": 0.6545584797859192, "rewards/rejected": -0.18239565193653107, "step": 460 }, { "epoch": 0.5771144278606966, "grad_norm": 4.722254276275635, "learning_rate": 1.725995491923131e-05, "logits/chosen": 0.019634254276752472, "logits/rejected": -0.1314508616924286, "logps/chosen": -511.8298645019531, "logps/rejected": -418.1177062988281, "loss": 0.649, "rewards/accuracies": 0.59375, "rewards/chosen": -0.29104384779930115, "rewards/margins": 0.20025068521499634, "rewards/rejected": -0.4912944734096527, "step": 464 }, { "epoch": 0.582089552238806, "grad_norm": 3.6944985389709473, "learning_rate": 1.7202396575902118e-05, "logits/chosen": 0.3104863464832306, "logits/rejected": 0.17023295164108276, "logps/chosen": -443.22528076171875, "logps/rejected": -439.04559326171875, "loss": 0.6272, "rewards/accuracies": 0.71875, "rewards/chosen": -0.12756507098674774, "rewards/margins": 0.5996299386024475, "rewards/rejected": -0.7271949648857117, "step": 468 }, { "epoch": 0.5870646766169154, "grad_norm": 4.278947353363037, "learning_rate": 1.714433809260435e-05, "logits/chosen": 0.2733452320098877, "logits/rejected": 0.1945551484823227, "logps/chosen": -472.4483642578125, "logps/rejected": -459.3942565917969, "loss": 0.6713, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2876668870449066, "rewards/margins": 0.2343754768371582, "rewards/rejected": -0.5220423340797424, "step": 472 }, { "epoch": 0.5920398009950248, "grad_norm": 5.243388652801514, "learning_rate": 1.7085783500963825e-05, "logits/chosen": 0.26794660091400146, "logits/rejected": 0.19214050471782684, "logps/chosen": -464.2667541503906, "logps/rejected": -466.7138977050781, "loss": 0.6101, "rewards/accuracies": 0.671875, "rewards/chosen": -0.38191846013069153, "rewards/margins": 0.3241249918937683, "rewards/rejected": -0.7060434818267822, "step": 476 }, { "epoch": 0.5970149253731343, "grad_norm": 3.732940435409546, "learning_rate": 1.702673686705651e-05, "logits/chosen": 0.4054350256919861, "logits/rejected": 0.4670087993144989, "logps/chosen": -428.61163330078125, "logps/rejected": -499.5010681152344, "loss": 0.6277, "rewards/accuracies": 0.53125, "rewards/chosen": -0.007774517871439457, "rewards/margins": 0.18405042588710785, "rewards/rejected": -0.19182495772838593, "step": 480 }, { "epoch": 0.6019900497512438, "grad_norm": 3.6961166858673096, "learning_rate": 1.6967202291126174e-05, "logits/chosen": 0.25117918848991394, "logits/rejected": 0.1439165323972702, "logps/chosen": -419.8067321777344, "logps/rejected": -385.7373352050781, "loss": 0.6272, "rewards/accuracies": 0.671875, "rewards/chosen": 0.3202853500843048, "rewards/margins": 0.39529967308044434, "rewards/rejected": -0.07501433044672012, "step": 484 }, { "epoch": 0.6069651741293532, "grad_norm": 3.861052989959717, "learning_rate": 1.690718390729964e-05, "logits/chosen": 0.6219749450683594, "logits/rejected": 0.3956920802593231, "logps/chosen": -487.5699768066406, "logps/rejected": -440.1090087890625, "loss": 0.596, "rewards/accuracies": 0.6875, "rewards/chosen": 0.43934863805770874, "rewards/margins": 0.5191280245780945, "rewards/rejected": -0.07977931201457977, "step": 488 }, { "epoch": 0.6119402985074627, "grad_norm": 3.7701494693756104, "learning_rate": 1.684668588329973e-05, "logits/chosen": 0.23229114711284637, "logits/rejected": 0.18851926922798157, "logps/chosen": -467.3754577636719, "logps/rejected": -452.4528503417969, "loss": 0.6017, "rewards/accuracies": 0.734375, "rewards/chosen": 0.5834671854972839, "rewards/margins": 0.5684060454368591, "rewards/rejected": 0.015061168000102043, "step": 492 }, { "epoch": 0.6169154228855721, "grad_norm": 4.047066688537598, "learning_rate": 1.6785712420155864e-05, "logits/chosen": 0.35120919346809387, "logits/rejected": 0.15895111858844757, "logps/chosen": -609.0511474609375, "logps/rejected": -520.7322998046875, "loss": 0.6535, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3787741959095001, "rewards/margins": 0.24649052321910858, "rewards/rejected": 0.13228368759155273, "step": 496 }, { "epoch": 0.6218905472636815, "grad_norm": 4.4764604568481445, "learning_rate": 1.67242677519123e-05, "logits/chosen": 0.6815188527107239, "logits/rejected": 0.49643221497535706, "logps/chosen": -530.8171997070312, "logps/rejected": -430.7647399902344, "loss": 0.6438, "rewards/accuracies": 0.609375, "rewards/chosen": -0.17197714745998383, "rewards/margins": 0.11586709320545197, "rewards/rejected": -0.2878442406654358, "step": 500 }, { "epoch": 0.6218905472636815, "eval_logits/chosen": 0.21956767141819, "eval_logits/rejected": 0.07988239079713821, "eval_logps/chosen": -455.1015319824219, "eval_logps/rejected": -417.8031005859375, "eval_loss": 0.638308048248291, "eval_rewards/accuracies": 0.625, "eval_rewards/chosen": -0.2804534435272217, "eval_rewards/margins": 0.31970784068107605, "eval_rewards/rejected": -0.6001612544059753, "eval_runtime": 150.1876, "eval_samples_per_second": 7.61, "eval_steps_per_second": 0.24, "step": 500 }, { "epoch": 0.6268656716417911, "grad_norm": 4.097576141357422, "learning_rate": 1.6662356145334158e-05, "logits/chosen": 0.17615841329097748, "logits/rejected": 0.03691507875919342, "logps/chosen": -502.0008544921875, "logps/rejected": -458.7081604003906, "loss": 0.5963, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2368411123752594, "rewards/margins": 0.48721814155578613, "rewards/rejected": -0.7240592241287231, "step": 504 }, { "epoch": 0.6318407960199005, "grad_norm": 4.976437091827393, "learning_rate": 1.6599981899611103e-05, "logits/chosen": 0.12691722810268402, "logits/rejected": 0.1578553318977356, "logps/chosen": -495.80755615234375, "logps/rejected": -519.9158935546875, "loss": 0.6323, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2605741322040558, "rewards/margins": 0.3820592761039734, "rewards/rejected": -0.6426333785057068, "step": 508 }, { "epoch": 0.6368159203980099, "grad_norm": 4.70882511138916, "learning_rate": 1.653714934605883e-05, "logits/chosen": 0.09863700717687607, "logits/rejected": -0.042115092277526855, "logps/chosen": -526.08251953125, "logps/rejected": -482.67034912109375, "loss": 0.6142, "rewards/accuracies": 0.6875, "rewards/chosen": -0.35250020027160645, "rewards/margins": 0.47799065709114075, "rewards/rejected": -0.8304908275604248, "step": 512 }, { "epoch": 0.6417910447761194, "grad_norm": 4.004364967346191, "learning_rate": 1.647386284781828e-05, "logits/chosen": 0.4434223175048828, "logits/rejected": 0.3743742108345032, "logps/chosen": -461.95660400390625, "logps/rejected": -461.05523681640625, "loss": 0.6287, "rewards/accuracies": 0.609375, "rewards/chosen": -0.1911483108997345, "rewards/margins": 0.49052226543426514, "rewards/rejected": -0.681670606136322, "step": 516 }, { "epoch": 0.6467661691542289, "grad_norm": 4.580120086669922, "learning_rate": 1.6410126799552653e-05, "logits/chosen": 0.04173935577273369, "logits/rejected": 0.12166699767112732, "logps/chosen": -442.1742858886719, "logps/rejected": -484.1488342285156, "loss": 0.6941, "rewards/accuracies": 0.484375, "rewards/chosen": -0.40055373311042786, "rewards/margins": 0.04041279852390289, "rewards/rejected": -0.44096654653549194, "step": 520 }, { "epoch": 0.6517412935323383, "grad_norm": 4.086886882781982, "learning_rate": 1.6345945627142264e-05, "logits/chosen": 0.27961117029190063, "logits/rejected": 0.2143298089504242, "logps/chosen": -431.29827880859375, "logps/rejected": -459.7369384765625, "loss": 0.662, "rewards/accuracies": 0.5625, "rewards/chosen": -0.11182767897844315, "rewards/margins": 0.17104047536849976, "rewards/rejected": -0.2828681766986847, "step": 524 }, { "epoch": 0.6567164179104478, "grad_norm": 12.731650352478027, "learning_rate": 1.628132378737718e-05, "logits/chosen": 0.4169122278690338, "logits/rejected": 0.07197268307209015, "logps/chosen": -529.1094970703125, "logps/rejected": -411.6646423339844, "loss": 0.614, "rewards/accuracies": 0.59375, "rewards/chosen": -0.21321269869804382, "rewards/margins": 0.4294634461402893, "rewards/rejected": -0.6426761150360107, "step": 528 }, { "epoch": 0.6616915422885572, "grad_norm": 3.989084005355835, "learning_rate": 1.6216265767647756e-05, "logits/chosen": 0.30040451884269714, "logits/rejected": 0.2668513357639313, "logps/chosen": -517.35009765625, "logps/rejected": -487.19268798828125, "loss": 0.64, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3091263175010681, "rewards/margins": 0.4227018654346466, "rewards/rejected": -0.7318282127380371, "step": 532 }, { "epoch": 0.6666666666666666, "grad_norm": 4.201297760009766, "learning_rate": 1.615077608563302e-05, "logits/chosen": 0.41413354873657227, "logits/rejected": 0.10457613319158554, "logps/chosen": -612.5179443359375, "logps/rejected": -496.9129333496094, "loss": 0.6, "rewards/accuracies": 0.671875, "rewards/chosen": -0.36306214332580566, "rewards/margins": 0.50215744972229, "rewards/rejected": -0.8652196526527405, "step": 536 }, { "epoch": 0.6716417910447762, "grad_norm": 4.379523277282715, "learning_rate": 1.6084859288986957e-05, "logits/chosen": 0.19828909635543823, "logits/rejected": 0.02061871998012066, "logps/chosen": -455.3958435058594, "logps/rejected": -407.6889953613281, "loss": 0.6598, "rewards/accuracies": 0.609375, "rewards/chosen": -0.3266015648841858, "rewards/margins": 0.33076098561286926, "rewards/rejected": -0.6573625206947327, "step": 540 }, { "epoch": 0.6766169154228856, "grad_norm": 3.583308696746826, "learning_rate": 1.601851995502272e-05, "logits/chosen": 0.5686550736427307, "logits/rejected": 0.585844874382019, "logps/chosen": -415.34234619140625, "logps/rejected": -441.9537048339844, "loss": 0.5988, "rewards/accuracies": 0.609375, "rewards/chosen": -0.4486159682273865, "rewards/margins": 0.2918972969055176, "rewards/rejected": -0.740513265132904, "step": 544 }, { "epoch": 0.681592039800995, "grad_norm": 4.024960517883301, "learning_rate": 1.5951762690394788e-05, "logits/chosen": 0.30994874238967896, "logits/rejected": 0.0973886027932167, "logps/chosen": -457.6329345703125, "logps/rejected": -444.02935791015625, "loss": 0.6069, "rewards/accuracies": 0.734375, "rewards/chosen": -0.29979974031448364, "rewards/margins": 0.32890307903289795, "rewards/rejected": -0.6287028789520264, "step": 548 }, { "epoch": 0.6840796019900498, "eval_logits/chosen": 0.2462157905101776, "eval_logits/rejected": 0.10789595544338226, "eval_logps/chosen": -458.82330322265625, "eval_logps/rejected": -421.25732421875, "eval_loss": 0.6359681487083435, "eval_rewards/accuracies": 0.6006944179534912, "eval_rewards/chosen": -0.6526302695274353, "eval_rewards/margins": 0.2929559648036957, "eval_rewards/rejected": -0.9455862045288086, "eval_runtime": 150.077, "eval_samples_per_second": 7.616, "eval_steps_per_second": 0.24, "step": 550 }, { "epoch": 0.6865671641791045, "grad_norm": 4.124903202056885, "learning_rate": 1.5884592130779056e-05, "logits/chosen": 0.14517062902450562, "logits/rejected": 0.04018905386328697, "logps/chosen": -487.6488342285156, "logps/rejected": -464.5549011230469, "loss": 0.6444, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6954266428947449, "rewards/margins": 0.17861610651016235, "rewards/rejected": -0.8740427494049072, "step": 552 }, { "epoch": 0.6915422885572139, "grad_norm": 3.633653402328491, "learning_rate": 1.581701294055095e-05, "logits/chosen": 0.2584773302078247, "logits/rejected": -0.018043681979179382, "logps/chosen": -511.7429504394531, "logps/rejected": -504.0497741699219, "loss": 0.6013, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7117761373519897, "rewards/margins": 0.30023542046546936, "rewards/rejected": -1.0120115280151367, "step": 556 }, { "epoch": 0.6965174129353234, "grad_norm": 4.155452251434326, "learning_rate": 1.5749029812461515e-05, "logits/chosen": 0.314390629529953, "logits/rejected": 0.3334752023220062, "logps/chosen": -530.2760009765625, "logps/rejected": -522.5399780273438, "loss": 0.6607, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6176879405975342, "rewards/margins": -0.009039867669343948, "rewards/rejected": -0.6086481213569641, "step": 560 }, { "epoch": 0.7014925373134329, "grad_norm": 4.27207088470459, "learning_rate": 1.568064746731156e-05, "logits/chosen": 0.3614248037338257, "logits/rejected": 0.06402953714132309, "logps/chosen": -542.3954467773438, "logps/rejected": -467.5965881347656, "loss": 0.6567, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07103107869625092, "rewards/margins": 0.46965718269348145, "rewards/rejected": -0.5406882166862488, "step": 564 }, { "epoch": 0.7064676616915423, "grad_norm": 4.479232311248779, "learning_rate": 1.5611870653623826e-05, "logits/chosen": 0.045674506574869156, "logits/rejected": -0.21201254427433014, "logps/chosen": -595.710205078125, "logps/rejected": -465.8755187988281, "loss": 0.6375, "rewards/accuracies": 0.625, "rewards/chosen": -0.17292442917823792, "rewards/margins": 0.23611289262771606, "rewards/rejected": -0.40903735160827637, "step": 568 }, { "epoch": 0.7114427860696517, "grad_norm": 3.8208746910095215, "learning_rate": 1.5542704147313257e-05, "logits/chosen": 0.4481641948223114, "logits/rejected": 0.399469256401062, "logps/chosen": -401.0939636230469, "logps/rejected": -375.7672424316406, "loss": 0.6036, "rewards/accuracies": 0.609375, "rewards/chosen": -0.2306845486164093, "rewards/margins": 0.27575400471687317, "rewards/rejected": -0.5064386129379272, "step": 572 }, { "epoch": 0.7164179104477612, "grad_norm": 4.367500305175781, "learning_rate": 1.5473152751355353e-05, "logits/chosen": 0.11335344612598419, "logits/rejected": -0.00670961756259203, "logps/chosen": -442.5565185546875, "logps/rejected": -403.98382568359375, "loss": 0.6571, "rewards/accuracies": 0.59375, "rewards/chosen": -0.47417518496513367, "rewards/margins": 0.30662575364112854, "rewards/rejected": -0.7808009386062622, "step": 576 }, { "epoch": 0.7213930348258707, "grad_norm": 3.969484567642212, "learning_rate": 1.5403221295452647e-05, "logits/chosen": 0.31861090660095215, "logits/rejected": 0.11263471841812134, "logps/chosen": -439.46630859375, "logps/rejected": -427.44677734375, "loss": 0.6126, "rewards/accuracies": 0.640625, "rewards/chosen": -0.6381343603134155, "rewards/margins": 0.34186607599258423, "rewards/rejected": -0.9800004363059998, "step": 580 }, { "epoch": 0.7263681592039801, "grad_norm": 3.820988655090332, "learning_rate": 1.5332914635699327e-05, "logits/chosen": 0.40720105171203613, "logits/rejected": 0.1580687016248703, "logps/chosen": -461.20068359375, "logps/rejected": -403.16094970703125, "loss": 0.6514, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4938652217388153, "rewards/margins": 0.117790088057518, "rewards/rejected": -0.6116552948951721, "step": 584 }, { "epoch": 0.7313432835820896, "grad_norm": 3.886521339416504, "learning_rate": 1.5262237654244026e-05, "logits/chosen": 0.5522980093955994, "logits/rejected": 0.4452764093875885, "logps/chosen": -421.0696105957031, "logps/rejected": -399.4547424316406, "loss": 0.5962, "rewards/accuracies": 0.734375, "rewards/chosen": -0.3875323235988617, "rewards/margins": 0.46860218048095703, "rewards/rejected": -0.8561345338821411, "step": 588 }, { "epoch": 0.736318407960199, "grad_norm": 4.079859733581543, "learning_rate": 1.5191195258950804e-05, "logits/chosen": 0.40618038177490234, "logits/rejected": 0.134785994887352, "logps/chosen": -651.926025390625, "logps/rejected": -503.9865417480469, "loss": 0.6448, "rewards/accuracies": 0.75, "rewards/chosen": -0.186576247215271, "rewards/margins": 0.5179406404495239, "rewards/rejected": -0.7045168876647949, "step": 592 }, { "epoch": 0.7412935323383084, "grad_norm": 3.5976879596710205, "learning_rate": 1.5119792383058338e-05, "logits/chosen": 0.09992431104183197, "logits/rejected": -0.01921015977859497, "logps/chosen": -470.5938720703125, "logps/rejected": -444.0641174316406, "loss": 0.6545, "rewards/accuracies": 0.53125, "rewards/chosen": -0.31064364314079285, "rewards/margins": 0.2923417091369629, "rewards/rejected": -0.6029854416847229, "step": 596 }, { "epoch": 0.746268656716418, "grad_norm": 4.070811748504639, "learning_rate": 1.5048033984837352e-05, "logits/chosen": 0.15589873492717743, "logits/rejected": -0.034729793667793274, "logps/chosen": -567.2528076171875, "logps/rejected": -548.9393310546875, "loss": 0.6227, "rewards/accuracies": 0.71875, "rewards/chosen": -0.1316080242395401, "rewards/margins": 0.4812317490577698, "rewards/rejected": -0.6128398180007935, "step": 600 }, { "epoch": 0.746268656716418, "eval_logits/chosen": 0.18069180846214294, "eval_logits/rejected": 0.03814281150698662, "eval_logps/chosen": -453.0019836425781, "eval_logps/rejected": -415.46087646484375, "eval_loss": 0.6348658800125122, "eval_rewards/accuracies": 0.6215277910232544, "eval_rewards/chosen": -0.07049696147441864, "eval_rewards/margins": 0.29544174671173096, "eval_rewards/rejected": -0.3659386932849884, "eval_runtime": 150.1609, "eval_samples_per_second": 7.612, "eval_steps_per_second": 0.24, "step": 600 }, { "epoch": 0.7512437810945274, "grad_norm": 4.077042579650879, "learning_rate": 1.4975925047246319e-05, "logits/chosen": 0.09503468126058578, "logits/rejected": 0.14383243024349213, "logps/chosen": -511.3801574707031, "logps/rejected": -459.1648254394531, "loss": 0.6159, "rewards/accuracies": 0.640625, "rewards/chosen": -0.030200934037566185, "rewards/margins": 0.34608855843544006, "rewards/rejected": -0.376289427280426, "step": 604 }, { "epoch": 0.7562189054726368, "grad_norm": 4.1146039962768555, "learning_rate": 1.4903470577585433e-05, "logits/chosen": 0.5371518731117249, "logits/rejected": 0.44205495715141296, "logps/chosen": -480.82513427734375, "logps/rejected": -464.04632568359375, "loss": 0.6054, "rewards/accuracies": 0.640625, "rewards/chosen": 0.07143578678369522, "rewards/margins": 0.3510420322418213, "rewards/rejected": -0.27960628271102905, "step": 608 }, { "epoch": 0.7611940298507462, "grad_norm": 4.153296947479248, "learning_rate": 1.4830675607148899e-05, "logits/chosen": 0.2690809369087219, "logits/rejected": 0.2488354742527008, "logps/chosen": -472.01849365234375, "logps/rejected": -491.8638916015625, "loss": 0.634, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1490858495235443, "rewards/margins": 0.38102632761001587, "rewards/rejected": -0.5301121473312378, "step": 612 }, { "epoch": 0.7661691542288557, "grad_norm": 4.3426337242126465, "learning_rate": 1.475754519087557e-05, "logits/chosen": 0.4082140624523163, "logits/rejected": 0.4368078410625458, "logps/chosen": -414.7156677246094, "logps/rejected": -415.9171142578125, "loss": 0.663, "rewards/accuracies": 0.609375, "rewards/chosen": -0.10527034103870392, "rewards/margins": 0.1847696453332901, "rewards/rejected": -0.2900400161743164, "step": 616 }, { "epoch": 0.7711442786069652, "grad_norm": 3.6295218467712402, "learning_rate": 1.4684084406997903e-05, "logits/chosen": 0.23331183195114136, "logits/rejected": 0.01600750908255577, "logps/chosen": -577.4814453125, "logps/rejected": -505.6333923339844, "loss": 0.6446, "rewards/accuracies": 0.71875, "rewards/chosen": -0.024487711489200592, "rewards/margins": 0.35684019327163696, "rewards/rejected": -0.38132789731025696, "step": 620 }, { "epoch": 0.7761194029850746, "grad_norm": 4.441697120666504, "learning_rate": 1.4610298356689341e-05, "logits/chosen": 0.19809234142303467, "logits/rejected": 0.22685889899730682, "logps/chosen": -413.6700134277344, "logps/rejected": -467.2070007324219, "loss": 0.6361, "rewards/accuracies": 0.5625, "rewards/chosen": 0.029005911201238632, "rewards/margins": 0.28220340609550476, "rewards/rejected": -0.2531975209712982, "step": 624 }, { "epoch": 0.7810945273631841, "grad_norm": 4.630463600158691, "learning_rate": 1.453619216371008e-05, "logits/chosen": 0.42978817224502563, "logits/rejected": 0.39091044664382935, "logps/chosen": -480.048095703125, "logps/rejected": -498.24530029296875, "loss": 0.6538, "rewards/accuracies": 0.515625, "rewards/chosen": -0.5016615986824036, "rewards/margins": 0.07318463921546936, "rewards/rejected": -0.5748462080955505, "step": 628 }, { "epoch": 0.7860696517412935, "grad_norm": 4.055500030517578, "learning_rate": 1.446177097405127e-05, "logits/chosen": 0.19197359681129456, "logits/rejected": 0.267251193523407, "logps/chosen": -554.1470336914062, "logps/rejected": -491.2269287109375, "loss": 0.6184, "rewards/accuracies": 0.640625, "rewards/chosen": -0.36392736434936523, "rewards/margins": 0.29624325037002563, "rewards/rejected": -0.6601705551147461, "step": 632 }, { "epoch": 0.7910447761194029, "grad_norm": 4.158740520477295, "learning_rate": 1.4387039955577668e-05, "logits/chosen": 0.28597795963287354, "logits/rejected": 0.2785332202911377, "logps/chosen": -504.1370849609375, "logps/rejected": -474.9548645019531, "loss": 0.643, "rewards/accuracies": 0.578125, "rewards/chosen": -0.12694326043128967, "rewards/margins": 0.2074156403541565, "rewards/rejected": -0.33435890078544617, "step": 636 }, { "epoch": 0.7960199004975125, "grad_norm": 4.120989799499512, "learning_rate": 1.4312004297668791e-05, "logits/chosen": 0.17556458711624146, "logits/rejected": 0.0959894210100174, "logps/chosen": -404.9556579589844, "logps/rejected": -391.56549072265625, "loss": 0.5971, "rewards/accuracies": 0.734375, "rewards/chosen": -0.09616656601428986, "rewards/margins": 0.39131832122802734, "rewards/rejected": -0.487484872341156, "step": 640 }, { "epoch": 0.8009950248756219, "grad_norm": 4.523448944091797, "learning_rate": 1.4236669210858544e-05, "logits/chosen": 0.25030747056007385, "logits/rejected": 0.20863890647888184, "logps/chosen": -498.8720703125, "logps/rejected": -507.3069152832031, "loss": 0.5908, "rewards/accuracies": 0.578125, "rewards/chosen": -0.24966737627983093, "rewards/margins": 0.2907105088233948, "rewards/rejected": -0.5403779149055481, "step": 644 }, { "epoch": 0.8059701492537313, "grad_norm": 4.00128173828125, "learning_rate": 1.4161039926473412e-05, "logits/chosen": 0.4552380442619324, "logits/rejected": 0.18772940337657928, "logps/chosen": -536.3428344726562, "logps/rejected": -490.09429931640625, "loss": 0.6473, "rewards/accuracies": 0.59375, "rewards/chosen": -0.040442317724227905, "rewards/margins": 0.29100677371025085, "rewards/rejected": -0.33144912123680115, "step": 648 }, { "epoch": 0.8084577114427861, "eval_logits/chosen": 0.2775518596172333, "eval_logits/rejected": 0.14060264825820923, "eval_logps/chosen": -455.484375, "eval_logps/rejected": -418.57281494140625, "eval_loss": 0.6331359148025513, "eval_rewards/accuracies": 0.6527777910232544, "eval_rewards/chosen": -0.31874096393585205, "eval_rewards/margins": 0.3583892583847046, "eval_rewards/rejected": -0.6771301627159119, "eval_runtime": 150.1695, "eval_samples_per_second": 7.611, "eval_steps_per_second": 0.24, "step": 650 }, { "epoch": 0.8109452736318408, "grad_norm": 6.469996452331543, "learning_rate": 1.4085121696269185e-05, "logits/chosen": 0.5448468327522278, "logits/rejected": 0.19260184466838837, "logps/chosen": -587.0845947265625, "logps/rejected": -455.35992431640625, "loss": 0.6466, "rewards/accuracies": 0.640625, "rewards/chosen": -0.3791936933994293, "rewards/margins": 0.3534315824508667, "rewards/rejected": -0.7326253056526184, "step": 652 }, { "epoch": 0.8159203980099502, "grad_norm": 4.154365539550781, "learning_rate": 1.4008919792066273e-05, "logits/chosen": 0.24580639600753784, "logits/rejected": 0.30128005146980286, "logps/chosen": -402.3567199707031, "logps/rejected": -456.4067687988281, "loss": 0.651, "rewards/accuracies": 0.640625, "rewards/chosen": -0.35055863857269287, "rewards/margins": 0.36460378766059875, "rewards/rejected": -0.7151623964309692, "step": 656 }, { "epoch": 0.8208955223880597, "grad_norm": 4.000673294067383, "learning_rate": 1.3932439505383628e-05, "logits/chosen": 0.4568510055541992, "logits/rejected": 0.26491212844848633, "logps/chosen": -578.89453125, "logps/rejected": -484.39654541015625, "loss": 0.6105, "rewards/accuracies": 0.609375, "rewards/chosen": -0.5648167133331299, "rewards/margins": 0.4626457989215851, "rewards/rejected": -1.0274624824523926, "step": 660 }, { "epoch": 0.8258706467661692, "grad_norm": 4.538127899169922, "learning_rate": 1.385568614707129e-05, "logits/chosen": 0.4450688660144806, "logits/rejected": 0.1880086064338684, "logps/chosen": -522.4884033203125, "logps/rejected": -430.0087585449219, "loss": 0.5808, "rewards/accuracies": 0.671875, "rewards/chosen": -0.8698031902313232, "rewards/margins": 0.45266827940940857, "rewards/rejected": -1.3224713802337646, "step": 664 }, { "epoch": 0.8308457711442786, "grad_norm": 3.7640268802642822, "learning_rate": 1.3778665046941616e-05, "logits/chosen": 0.3476739525794983, "logits/rejected": 0.0015247669070959091, "logps/chosen": -535.035888671875, "logps/rejected": -452.0794677734375, "loss": 0.5904, "rewards/accuracies": 0.71875, "rewards/chosen": -0.602352499961853, "rewards/margins": 0.5098788142204285, "rewards/rejected": -1.1122313737869263, "step": 668 }, { "epoch": 0.835820895522388, "grad_norm": 3.9549944400787354, "learning_rate": 1.3701381553399147e-05, "logits/chosen": 0.6829994916915894, "logits/rejected": 0.7129935622215271, "logps/chosen": -473.60589599609375, "logps/rejected": -505.9931945800781, "loss": 0.6523, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7586303949356079, "rewards/margins": 0.21102982759475708, "rewards/rejected": -0.9696601629257202, "step": 672 }, { "epoch": 0.8407960199004975, "grad_norm": 4.284757614135742, "learning_rate": 1.3623841033069232e-05, "logits/chosen": 0.40947800874710083, "logits/rejected": 0.37943294644355774, "logps/chosen": -399.6325378417969, "logps/rejected": -418.98846435546875, "loss": 0.6707, "rewards/accuracies": 0.625, "rewards/chosen": -0.6792310476303101, "rewards/margins": 0.2475418746471405, "rewards/rejected": -0.9267728924751282, "step": 676 }, { "epoch": 0.845771144278607, "grad_norm": 4.278886795043945, "learning_rate": 1.3546048870425356e-05, "logits/chosen": 0.38366350531578064, "logits/rejected": 0.2522772252559662, "logps/chosen": -413.2381286621094, "logps/rejected": -408.53399658203125, "loss": 0.6442, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6863256692886353, "rewards/margins": 0.4241764545440674, "rewards/rejected": -1.110502004623413, "step": 680 }, { "epoch": 0.8507462686567164, "grad_norm": 3.915144920349121, "learning_rate": 1.3468010467415248e-05, "logits/chosen": 0.3846738636493683, "logits/rejected": 0.17315393686294556, "logps/chosen": -491.52935791015625, "logps/rejected": -414.61328125, "loss": 0.631, "rewards/accuracies": 0.625, "rewards/chosen": -0.7921670079231262, "rewards/margins": 0.18218526244163513, "rewards/rejected": -0.9743523597717285, "step": 684 }, { "epoch": 0.8557213930348259, "grad_norm": 4.119470596313477, "learning_rate": 1.3389731243085747e-05, "logits/chosen": 0.23576557636260986, "logits/rejected": 0.2507054805755615, "logps/chosen": -452.724365234375, "logps/rejected": -467.43682861328125, "loss": 0.6641, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6324411630630493, "rewards/margins": 0.29806220531463623, "rewards/rejected": -0.9305033683776855, "step": 688 }, { "epoch": 0.8606965174129353, "grad_norm": 3.8940017223358154, "learning_rate": 1.3311216633206514e-05, "logits/chosen": 0.19210243225097656, "logits/rejected": 0.04936864227056503, "logps/chosen": -511.88641357421875, "logps/rejected": -425.4428405761719, "loss": 0.6167, "rewards/accuracies": 0.578125, "rewards/chosen": -0.5534711480140686, "rewards/margins": 0.3882919251918793, "rewards/rejected": -0.9417631030082703, "step": 692 }, { "epoch": 0.8656716417910447, "grad_norm": 4.534005165100098, "learning_rate": 1.3232472089892567e-05, "logits/chosen": 0.4114670157432556, "logits/rejected": 0.31665346026420593, "logps/chosen": -458.63006591796875, "logps/rejected": -402.3096923828125, "loss": 0.6983, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4137086868286133, "rewards/margins": 0.13000260293483734, "rewards/rejected": -0.543711245059967, "step": 696 }, { "epoch": 0.8706467661691543, "grad_norm": 5.47732400894165, "learning_rate": 1.315350308122567e-05, "logits/chosen": 0.38530704379081726, "logits/rejected": 0.30780690908432007, "logps/chosen": -444.169677734375, "logps/rejected": -430.9597473144531, "loss": 0.6259, "rewards/accuracies": 0.46875, "rewards/chosen": -0.41313281655311584, "rewards/margins": 0.10978913307189941, "rewards/rejected": -0.5229219794273376, "step": 700 }, { "epoch": 0.8706467661691543, "eval_logits/chosen": 0.23911841213703156, "eval_logits/rejected": 0.09861024469137192, "eval_logps/chosen": -456.55279541015625, "eval_logps/rejected": -419.2005615234375, "eval_loss": 0.6294909715652466, "eval_rewards/accuracies": 0.6111111044883728, "eval_rewards/chosen": -0.4255761504173279, "eval_rewards/margins": 0.31433236598968506, "eval_rewards/rejected": -0.7399084568023682, "eval_runtime": 150.0352, "eval_samples_per_second": 7.618, "eval_steps_per_second": 0.24, "step": 700 }, { "epoch": 0.8756218905472637, "grad_norm": 3.469086170196533, "learning_rate": 1.3074315090874652e-05, "logits/chosen": 0.09198964387178421, "logits/rejected": 0.1355361044406891, "logps/chosen": -370.5699462890625, "logps/rejected": -411.66070556640625, "loss": 0.6532, "rewards/accuracies": 0.59375, "rewards/chosen": -0.27120280265808105, "rewards/margins": 0.3581971824169159, "rewards/rejected": -0.6293999552726746, "step": 704 }, { "epoch": 0.8805970149253731, "grad_norm": 3.4145619869232178, "learning_rate": 1.2994913617714573e-05, "logits/chosen": 0.39067643880844116, "logits/rejected": 0.1868411898612976, "logps/chosen": -448.4958801269531, "logps/rejected": -393.5880126953125, "loss": 0.5979, "rewards/accuracies": 0.6875, "rewards/chosen": -0.31874287128448486, "rewards/margins": 0.48711925745010376, "rewards/rejected": -0.8058621883392334, "step": 708 }, { "epoch": 0.8855721393034826, "grad_norm": 3.769350528717041, "learning_rate": 1.2915304175444929e-05, "logits/chosen": 0.43691831827163696, "logits/rejected": 0.3168666958808899, "logps/chosen": -450.9046630859375, "logps/rejected": -430.5725402832031, "loss": 0.636, "rewards/accuracies": 0.53125, "rewards/chosen": -0.4702681005001068, "rewards/margins": 0.20657652616500854, "rewards/rejected": -0.6768447160720825, "step": 712 }, { "epoch": 0.8905472636815921, "grad_norm": 4.2490763664245605, "learning_rate": 1.2835492292206735e-05, "logits/chosen": 0.5658756494522095, "logits/rejected": 0.4351132810115814, "logps/chosen": -418.0053405761719, "logps/rejected": -432.3887634277344, "loss": 0.6119, "rewards/accuracies": 0.625, "rewards/chosen": -0.36454248428344727, "rewards/margins": 0.2455846071243286, "rewards/rejected": -0.6101270914077759, "step": 716 }, { "epoch": 0.8955223880597015, "grad_norm": 3.402549982070923, "learning_rate": 1.2755483510198668e-05, "logits/chosen": 0.18330873548984528, "logits/rejected": 0.06855818629264832, "logps/chosen": -470.72052001953125, "logps/rejected": -431.72314453125, "loss": 0.6069, "rewards/accuracies": 0.609375, "rewards/chosen": -0.3080929219722748, "rewards/margins": 0.3066636025905609, "rewards/rejected": -0.6147565245628357, "step": 720 }, { "epoch": 0.900497512437811, "grad_norm": 3.684074640274048, "learning_rate": 1.2675283385292212e-05, "logits/chosen": 0.2085587978363037, "logits/rejected": 0.11812448501586914, "logps/chosen": -449.0517883300781, "logps/rejected": -440.92767333984375, "loss": 0.6349, "rewards/accuracies": 0.625, "rewards/chosen": -0.25056391954421997, "rewards/margins": 0.2954629063606262, "rewards/rejected": -0.546026885509491, "step": 724 }, { "epoch": 0.9054726368159204, "grad_norm": 3.109182596206665, "learning_rate": 1.2594897486645836e-05, "logits/chosen": 0.21170970797538757, "logits/rejected": 0.11683456599712372, "logps/chosen": -457.32684326171875, "logps/rejected": -442.80426025390625, "loss": 0.5875, "rewards/accuracies": 0.75, "rewards/chosen": -0.32196876406669617, "rewards/margins": 0.5613601803779602, "rewards/rejected": -0.8833289742469788, "step": 728 }, { "epoch": 0.9104477611940298, "grad_norm": 6.466948986053467, "learning_rate": 1.2514331396318298e-05, "logits/chosen": 0.16703735291957855, "logits/rejected": 0.1217992752790451, "logps/chosen": -456.64312744140625, "logps/rejected": -477.60943603515625, "loss": 0.6526, "rewards/accuracies": 0.546875, "rewards/chosen": -0.4616580009460449, "rewards/margins": 0.1874929666519165, "rewards/rejected": -0.6491509079933167, "step": 732 }, { "epoch": 0.9154228855721394, "grad_norm": 3.79681396484375, "learning_rate": 1.2433590708880991e-05, "logits/chosen": 0.08391296863555908, "logits/rejected": -0.12564268708229065, "logps/chosen": -556.4468994140625, "logps/rejected": -446.3089294433594, "loss": 0.625, "rewards/accuracies": 0.625, "rewards/chosen": -0.7545434236526489, "rewards/margins": 0.35477572679519653, "rewards/rejected": -1.1093190908432007, "step": 736 }, { "epoch": 0.9203980099502488, "grad_norm": 4.004853248596191, "learning_rate": 1.2352681031029476e-05, "logits/chosen": 0.21419230103492737, "logits/rejected": 0.11660319566726685, "logps/chosen": -400.4061584472656, "logps/rejected": -386.1788330078125, "loss": 0.6353, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8190222978591919, "rewards/margins": 0.254682332277298, "rewards/rejected": -1.073704481124878, "step": 740 }, { "epoch": 0.9253731343283582, "grad_norm": 3.5617058277130127, "learning_rate": 1.2271607981194132e-05, "logits/chosen": 0.23552103340625763, "logits/rejected": 0.17505709826946259, "logps/chosen": -487.88153076171875, "logps/rejected": -487.0649719238281, "loss": 0.6393, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8327006697654724, "rewards/margins": 0.439730167388916, "rewards/rejected": -1.2724308967590332, "step": 744 }, { "epoch": 0.9303482587064676, "grad_norm": 3.8972809314727783, "learning_rate": 1.2190377189150016e-05, "logits/chosen": 0.1701466292142868, "logits/rejected": -0.17507055401802063, "logps/chosen": -546.63134765625, "logps/rejected": -442.184814453125, "loss": 0.6572, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6856258511543274, "rewards/margins": 0.2289436310529709, "rewards/rejected": -0.9145694971084595, "step": 748 }, { "epoch": 0.9328358208955224, "eval_logits/chosen": 0.21203385293483734, "eval_logits/rejected": 0.0725901871919632, "eval_logps/chosen": -458.26568603515625, "eval_logps/rejected": -420.7373962402344, "eval_loss": 0.6389397382736206, "eval_rewards/accuracies": 0.6006944179534912, "eval_rewards/chosen": -0.5968630313873291, "eval_rewards/margins": 0.29672402143478394, "eval_rewards/rejected": -0.8935869932174683, "eval_runtime": 149.6811, "eval_samples_per_second": 7.636, "eval_steps_per_second": 0.241, "step": 750 }, { "epoch": 0.9353233830845771, "grad_norm": 3.4292774200439453, "learning_rate": 1.2108994295625924e-05, "logits/chosen": 0.3646988868713379, "logits/rejected": 0.3169184625148773, "logps/chosen": -452.8676452636719, "logps/rejected": -477.1390075683594, "loss": 0.6384, "rewards/accuracies": 0.625, "rewards/chosen": -0.6115289926528931, "rewards/margins": 0.3036550283432007, "rewards/rejected": -0.9151840209960938, "step": 752 }, { "epoch": 0.9402985074626866, "grad_norm": 4.111717700958252, "learning_rate": 1.2027464951912703e-05, "logits/chosen": -0.010581929236650467, "logits/rejected": -0.32105395197868347, "logps/chosen": -577.8338012695312, "logps/rejected": -460.8430480957031, "loss": 0.6558, "rewards/accuracies": 0.625, "rewards/chosen": -0.42835140228271484, "rewards/margins": 0.29241591691970825, "rewards/rejected": -0.7207673192024231, "step": 756 }, { "epoch": 0.945273631840796, "grad_norm": 3.9025838375091553, "learning_rate": 1.1945794819470805e-05, "logits/chosen": 0.17384302616119385, "logits/rejected": 0.1552993208169937, "logps/chosen": -457.97296142578125, "logps/rejected": -547.5758056640625, "loss": 0.6789, "rewards/accuracies": 0.53125, "rewards/chosen": -0.3758777976036072, "rewards/margins": 0.24084581434726715, "rewards/rejected": -0.6167235970497131, "step": 760 }, { "epoch": 0.9502487562189055, "grad_norm": 3.4757816791534424, "learning_rate": 1.1863989569537165e-05, "logits/chosen": 0.0369485542178154, "logits/rejected": -0.15935146808624268, "logps/chosen": -442.7840270996094, "logps/rejected": -408.35162353515625, "loss": 0.602, "rewards/accuracies": 0.71875, "rewards/chosen": 0.12775643169879913, "rewards/margins": 0.40465879440307617, "rewards/rejected": -0.27690234780311584, "step": 764 }, { "epoch": 0.9552238805970149, "grad_norm": 3.849163770675659, "learning_rate": 1.1782054882731377e-05, "logits/chosen": 0.30783870816230774, "logits/rejected": 0.18860141932964325, "logps/chosen": -455.1307373046875, "logps/rejected": -429.60430908203125, "loss": 0.6285, "rewards/accuracies": 0.59375, "rewards/chosen": -0.10190219432115555, "rewards/margins": 0.3229271173477173, "rewards/rejected": -0.4248293340206146, "step": 768 }, { "epoch": 0.9601990049751243, "grad_norm": 3.603282928466797, "learning_rate": 1.1699996448661242e-05, "logits/chosen": 0.2716388702392578, "logits/rejected": 0.048077456653118134, "logps/chosen": -479.42657470703125, "logps/rejected": -440.88507080078125, "loss": 0.5974, "rewards/accuracies": 0.703125, "rewards/chosen": 0.03539299592375755, "rewards/margins": 0.5358645915985107, "rewards/rejected": -0.5004715919494629, "step": 772 }, { "epoch": 0.9651741293532339, "grad_norm": 3.572518825531006, "learning_rate": 1.161781996552765e-05, "logits/chosen": 0.17352545261383057, "logits/rejected": 0.060841046273708344, "logps/chosen": -446.12451171875, "logps/rejected": -426.59222412109375, "loss": 0.6235, "rewards/accuracies": 0.59375, "rewards/chosen": 0.16514363884925842, "rewards/margins": 0.26698362827301025, "rewards/rejected": -0.10184000432491302, "step": 776 }, { "epoch": 0.9701492537313433, "grad_norm": 4.555858135223389, "learning_rate": 1.1535531139728918e-05, "logits/chosen": 0.24533721804618835, "logits/rejected": -0.07171311974525452, "logps/chosen": -523.5499877929688, "logps/rejected": -437.7908935546875, "loss": 0.6537, "rewards/accuracies": 0.546875, "rewards/chosen": 0.12301430106163025, "rewards/margins": 0.2650811970233917, "rewards/rejected": -0.14206688106060028, "step": 780 }, { "epoch": 0.9751243781094527, "grad_norm": 3.932504892349243, "learning_rate": 1.1453135685464524e-05, "logits/chosen": 0.08851994574069977, "logits/rejected": 0.14470553398132324, "logps/chosen": -439.4774169921875, "logps/rejected": -538.0982666015625, "loss": 0.6267, "rewards/accuracies": 0.703125, "rewards/chosen": 0.26700615882873535, "rewards/margins": 0.5629878640174866, "rewards/rejected": -0.295981764793396, "step": 784 }, { "epoch": 0.9800995024875622, "grad_norm": 3.7867138385772705, "learning_rate": 1.1370639324338313e-05, "logits/chosen": 0.26342546939849854, "logits/rejected": -0.009939752519130707, "logps/chosen": -462.4211120605469, "logps/rejected": -397.9438171386719, "loss": 0.6298, "rewards/accuracies": 0.609375, "rewards/chosen": 0.277785062789917, "rewards/margins": 0.45373010635375977, "rewards/rejected": -0.1759449988603592, "step": 788 }, { "epoch": 0.9850746268656716, "grad_norm": 3.5211777687072754, "learning_rate": 1.1288047784961166e-05, "logits/chosen": 0.3734492063522339, "logits/rejected": 0.2930186092853546, "logps/chosen": -512.7589721679688, "logps/rejected": -474.911865234375, "loss": 0.6174, "rewards/accuracies": 0.71875, "rewards/chosen": 0.332348495721817, "rewards/margins": 0.38974326848983765, "rewards/rejected": -0.05739474669098854, "step": 792 }, { "epoch": 0.9900497512437811, "grad_norm": 3.415151357650757, "learning_rate": 1.1205366802553231e-05, "logits/chosen": 0.2811368703842163, "logits/rejected": 0.15755276381969452, "logps/chosen": -549.73583984375, "logps/rejected": -500.3531494140625, "loss": 0.6064, "rewards/accuracies": 0.53125, "rewards/chosen": 0.04190271347761154, "rewards/margins": 0.3072620630264282, "rewards/rejected": -0.26535937190055847, "step": 796 }, { "epoch": 0.9950248756218906, "grad_norm": 3.637012481689453, "learning_rate": 1.1122602118545642e-05, "logits/chosen": 0.12841928005218506, "logits/rejected": -0.023960597813129425, "logps/chosen": -483.5614013671875, "logps/rejected": -490.4222412109375, "loss": 0.63, "rewards/accuracies": 0.671875, "rewards/chosen": 0.10303351283073425, "rewards/margins": 0.44947099685668945, "rewards/rejected": -0.3464375436306, "step": 800 }, { "epoch": 0.9950248756218906, "eval_logits/chosen": 0.2406376451253891, "eval_logits/rejected": 0.10255695879459381, "eval_logps/chosen": -454.539794921875, "eval_logps/rejected": -417.31793212890625, "eval_loss": 0.6309738159179688, "eval_rewards/accuracies": 0.6284722089767456, "eval_rewards/chosen": -0.22427807748317719, "eval_rewards/margins": 0.3273647427558899, "eval_rewards/rejected": -0.5516427755355835, "eval_runtime": 150.3056, "eval_samples_per_second": 7.605, "eval_steps_per_second": 0.24, "step": 800 }, { "epoch": 1.0, "grad_norm": 3.7533183097839355, "learning_rate": 1.1039759480181836e-05, "logits/chosen": 0.12052932381629944, "logits/rejected": 0.07770034670829773, "logps/chosen": -418.409912109375, "logps/rejected": -415.7170715332031, "loss": 0.6279, "rewards/accuracies": 0.59375, "rewards/chosen": -0.11391990631818771, "rewards/margins": 0.34513598680496216, "rewards/rejected": -0.45905593037605286, "step": 804 }, { "epoch": 1.0049751243781095, "grad_norm": 3.1612284183502197, "learning_rate": 1.0956844640118462e-05, "logits/chosen": 0.3718172311782837, "logits/rejected": 0.10918774455785751, "logps/chosen": -493.1455078125, "logps/rejected": -435.0345153808594, "loss": 0.4822, "rewards/accuracies": 0.671875, "rewards/chosen": -0.20489560067653656, "rewards/margins": 0.660038411617279, "rewards/rejected": -0.864933967590332, "step": 808 }, { "epoch": 1.0099502487562189, "grad_norm": 3.007526397705078, "learning_rate": 1.0873863356025911e-05, "logits/chosen": 0.23822058737277985, "logits/rejected": 0.07988135516643524, "logps/chosen": -398.7310791015625, "logps/rejected": -416.26171875, "loss": 0.4403, "rewards/accuracies": 0.84375, "rewards/chosen": -0.2089827060699463, "rewards/margins": 0.7835352420806885, "rewards/rejected": -0.9925180077552795, "step": 812 }, { "epoch": 1.0149253731343284, "grad_norm": 2.805800437927246, "learning_rate": 1.0790821390188493e-05, "logits/chosen": 0.32303646206855774, "logits/rejected": 0.2196110635995865, "logps/chosen": -503.0115966796875, "logps/rejected": -474.46246337890625, "loss": 0.4475, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5059514045715332, "rewards/margins": 0.6203033328056335, "rewards/rejected": -1.1262547969818115, "step": 816 }, { "epoch": 1.0199004975124377, "grad_norm": 2.849635362625122, "learning_rate": 1.0707724509104318e-05, "logits/chosen": 0.12564139068126678, "logits/rejected": -0.1032138541340828, "logps/chosen": -492.5018615722656, "logps/rejected": -433.72662353515625, "loss": 0.4503, "rewards/accuracies": 0.84375, "rewards/chosen": -0.2990395128726959, "rewards/margins": 0.7483544945716858, "rewards/rejected": -1.047394037246704, "step": 820 }, { "epoch": 1.0248756218905473, "grad_norm": 3.1247310638427734, "learning_rate": 1.062457848308484e-05, "logits/chosen": 0.1007804125547409, "logits/rejected": -0.011237893253564835, "logps/chosen": -511.64373779296875, "logps/rejected": -443.93951416015625, "loss": 0.4899, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3589535057544708, "rewards/margins": 0.5756500959396362, "rewards/rejected": -0.9346035718917847, "step": 824 }, { "epoch": 1.0298507462686568, "grad_norm": 3.1016077995300293, "learning_rate": 1.0541389085854177e-05, "logits/chosen": 0.5417459011077881, "logits/rejected": 0.2698720395565033, "logps/chosen": -491.50115966796875, "logps/rejected": -424.73309326171875, "loss": 0.4774, "rewards/accuracies": 0.796875, "rewards/chosen": -0.044443853199481964, "rewards/margins": 0.7540363073348999, "rewards/rejected": -0.7984801530838013, "step": 828 }, { "epoch": 1.0348258706467661, "grad_norm": 3.130073070526123, "learning_rate": 1.0458162094148185e-05, "logits/chosen": 0.5757798552513123, "logits/rejected": 0.4289059042930603, "logps/chosen": -455.25860595703125, "logps/rejected": -414.67767333984375, "loss": 0.4726, "rewards/accuracies": 0.828125, "rewards/chosen": 0.16106560826301575, "rewards/margins": 0.6384649872779846, "rewards/rejected": -0.47739943861961365, "step": 832 }, { "epoch": 1.0398009950248757, "grad_norm": 2.843583822250366, "learning_rate": 1.0374903287313307e-05, "logits/chosen": 0.6092027425765991, "logits/rejected": 0.4161675274372101, "logps/chosen": -471.7634582519531, "logps/rejected": -432.1963195800781, "loss": 0.4692, "rewards/accuracies": 0.78125, "rewards/chosen": 0.15890881419181824, "rewards/margins": 0.7123965620994568, "rewards/rejected": -0.5534877777099609, "step": 836 }, { "epoch": 1.044776119402985, "grad_norm": 3.054884433746338, "learning_rate": 1.029161844690525e-05, "logits/chosen": 0.04671328887343407, "logits/rejected": -0.0623294860124588, "logps/chosen": -432.43463134765625, "logps/rejected": -444.45330810546875, "loss": 0.4637, "rewards/accuracies": 0.859375, "rewards/chosen": 0.2890382409095764, "rewards/margins": 0.9058393836021423, "rewards/rejected": -0.6168012022972107, "step": 840 }, { "epoch": 1.0497512437810945, "grad_norm": 2.780414581298828, "learning_rate": 1.0208313356287505e-05, "logits/chosen": 0.16017179191112518, "logits/rejected": 0.3132883310317993, "logps/chosen": -367.1025390625, "logps/rejected": -421.85235595703125, "loss": 0.4423, "rewards/accuracies": 0.8125, "rewards/chosen": 0.15863201022148132, "rewards/margins": 0.6779903173446655, "rewards/rejected": -0.5193582773208618, "step": 844 }, { "epoch": 1.054726368159204, "grad_norm": 2.5658442974090576, "learning_rate": 1.0124993800229774e-05, "logits/chosen": 0.5552780628204346, "logits/rejected": 0.4013071060180664, "logps/chosen": -482.0791015625, "logps/rejected": -464.7220764160156, "loss": 0.4431, "rewards/accuracies": 0.859375, "rewards/chosen": 0.09260371327400208, "rewards/margins": 0.9124071002006531, "rewards/rejected": -0.8198033571243286, "step": 848 }, { "epoch": 1.0572139303482586, "eval_logits/chosen": 0.19921229779720306, "eval_logits/rejected": 0.06041179224848747, "eval_logps/chosen": -455.6217346191406, "eval_logps/rejected": -418.9701843261719, "eval_loss": 0.6237961649894714, "eval_rewards/accuracies": 0.6631944179534912, "eval_rewards/chosen": -0.332474023103714, "eval_rewards/margins": 0.3843950629234314, "eval_rewards/rejected": -0.7168691158294678, "eval_runtime": 150.1103, "eval_samples_per_second": 7.614, "eval_steps_per_second": 0.24, "step": 850 }, { "epoch": 1.0597014925373134, "grad_norm": 2.8231992721557617, "learning_rate": 1.004166556450623e-05, "logits/chosen": 0.33288416266441345, "logits/rejected": 0.09707096964120865, "logps/chosen": -464.0365905761719, "logps/rejected": -426.97601318359375, "loss": 0.446, "rewards/accuracies": 0.8125, "rewards/chosen": -0.08150234073400497, "rewards/margins": 0.8820555210113525, "rewards/rejected": -0.9635578393936157, "step": 852 }, { "epoch": 1.064676616915423, "grad_norm": 3.675276756286621, "learning_rate": 9.958334435493776e-06, "logits/chosen": 0.19826172292232513, "logits/rejected": 0.07607944309711456, "logps/chosen": -414.4964599609375, "logps/rejected": -442.0020751953125, "loss": 0.4813, "rewards/accuracies": 0.859375, "rewards/chosen": -0.1501787155866623, "rewards/margins": 0.8547053337097168, "rewards/rejected": -1.004884123802185, "step": 856 }, { "epoch": 1.0696517412935322, "grad_norm": 2.7473108768463135, "learning_rate": 9.87500619977023e-06, "logits/chosen": 0.11883494257926941, "logits/rejected": -0.021124478429555893, "logps/chosen": -453.2391052246094, "logps/rejected": -427.75830078125, "loss": 0.4716, "rewards/accuracies": 0.734375, "rewards/chosen": -0.10561797767877579, "rewards/margins": 0.6551963090896606, "rewards/rejected": -0.7608143091201782, "step": 860 }, { "epoch": 1.0746268656716418, "grad_norm": 3.5719878673553467, "learning_rate": 9.791686643712498e-06, "logits/chosen": 0.3640301525592804, "logits/rejected": 0.11707509309053421, "logps/chosen": -486.08709716796875, "logps/rejected": -430.9344787597656, "loss": 0.4856, "rewards/accuracies": 0.78125, "rewards/chosen": -0.09809544682502747, "rewards/margins": 0.7188686728477478, "rewards/rejected": -0.8169642090797424, "step": 864 }, { "epoch": 1.0796019900497513, "grad_norm": 3.0903241634368896, "learning_rate": 9.708381553094754e-06, "logits/chosen": 0.26261359453201294, "logits/rejected": 0.11373281478881836, "logps/chosen": -434.330078125, "logps/rejected": -389.1520080566406, "loss": 0.4558, "rewards/accuracies": 0.875, "rewards/chosen": 0.051621563732624054, "rewards/margins": 0.8995364904403687, "rewards/rejected": -0.84791499376297, "step": 868 }, { "epoch": 1.0845771144278606, "grad_norm": 3.3322811126708984, "learning_rate": 9.625096712686694e-06, "logits/chosen": 0.011679138988256454, "logits/rejected": 0.09790559113025665, "logps/chosen": -424.90777587890625, "logps/rejected": -438.1689758300781, "loss": 0.4516, "rewards/accuracies": 0.875, "rewards/chosen": -0.2651669383049011, "rewards/margins": 0.9951549768447876, "rewards/rejected": -1.260321855545044, "step": 872 }, { "epoch": 1.0895522388059702, "grad_norm": 2.9339921474456787, "learning_rate": 9.541837905851817e-06, "logits/chosen": 0.047505155205726624, "logits/rejected": 0.039440758526325226, "logps/chosen": -442.19647216796875, "logps/rejected": -480.726318359375, "loss": 0.4718, "rewards/accuracies": 0.84375, "rewards/chosen": -0.05062495172023773, "rewards/margins": 0.8886253237724304, "rewards/rejected": -0.9392504096031189, "step": 876 }, { "epoch": 1.0945273631840795, "grad_norm": 3.0647056102752686, "learning_rate": 9.458610914145826e-06, "logits/chosen": 0.14126014709472656, "logits/rejected": 0.15012700855731964, "logps/chosen": -428.18438720703125, "logps/rejected": -441.0984191894531, "loss": 0.4385, "rewards/accuracies": 0.84375, "rewards/chosen": -0.17007280886173248, "rewards/margins": 0.8784099817276001, "rewards/rejected": -1.048482894897461, "step": 880 }, { "epoch": 1.099502487562189, "grad_norm": 2.8182191848754883, "learning_rate": 9.375421516915165e-06, "logits/chosen": 0.26013338565826416, "logits/rejected": 0.15596720576286316, "logps/chosen": -466.556396484375, "logps/rejected": -454.1954040527344, "loss": 0.4286, "rewards/accuracies": 0.875, "rewards/chosen": -0.03595845773816109, "rewards/margins": 1.0611960887908936, "rewards/rejected": -1.0971544981002808, "step": 884 }, { "epoch": 1.1044776119402986, "grad_norm": 2.964768171310425, "learning_rate": 9.292275490895685e-06, "logits/chosen": 0.1450473666191101, "logits/rejected": 0.006217047572135925, "logps/chosen": -560.520751953125, "logps/rejected": -509.8658142089844, "loss": 0.4187, "rewards/accuracies": 0.828125, "rewards/chosen": -0.46229514479637146, "rewards/margins": 0.9188253283500671, "rewards/rejected": -1.3811204433441162, "step": 888 }, { "epoch": 1.109452736318408, "grad_norm": 3.2267932891845703, "learning_rate": 9.209178609811509e-06, "logits/chosen": 0.30923786759376526, "logits/rejected": 0.10800696909427643, "logps/chosen": -449.5932922363281, "logps/rejected": -435.5396423339844, "loss": 0.4548, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5209077596664429, "rewards/margins": 0.8729082942008972, "rewards/rejected": -1.3938158750534058, "step": 892 }, { "epoch": 1.1144278606965174, "grad_norm": 3.159635066986084, "learning_rate": 9.126136643974094e-06, "logits/chosen": 0.03288649767637253, "logits/rejected": 0.00020163506269454956, "logps/chosen": -508.6898498535156, "logps/rejected": -505.1740417480469, "loss": 0.4825, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6636922359466553, "rewards/margins": 0.842619776725769, "rewards/rejected": -1.5063120126724243, "step": 896 }, { "epoch": 1.1194029850746268, "grad_norm": 3.421736478805542, "learning_rate": 9.043155359881538e-06, "logits/chosen": 0.192546546459198, "logits/rejected": -0.08039741218090057, "logps/chosen": -563.0585327148438, "logps/rejected": -510.62469482421875, "loss": 0.47, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5328125953674316, "rewards/margins": 0.9578996300697327, "rewards/rejected": -1.4907121658325195, "step": 900 }, { "epoch": 1.1194029850746268, "eval_logits/chosen": 0.11539439111948013, "eval_logits/rejected": -0.026901576668024063, "eval_logps/chosen": -458.8861083984375, "eval_logps/rejected": -422.9441223144531, "eval_loss": 0.6285870671272278, "eval_rewards/accuracies": 0.6597222089767456, "eval_rewards/chosen": -0.6589114665985107, "eval_rewards/margins": 0.4553508758544922, "eval_rewards/rejected": -1.114262342453003, "eval_runtime": 151.3253, "eval_samples_per_second": 7.553, "eval_steps_per_second": 0.238, "step": 900 }, { "epoch": 1.1243781094527363, "grad_norm": 3.3492910861968994, "learning_rate": 8.960240519818167e-06, "logits/chosen": 0.002382766455411911, "logits/rejected": 0.02159365639090538, "logps/chosen": -570.236328125, "logps/rejected": -555.9732666015625, "loss": 0.4746, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7100229859352112, "rewards/margins": 0.716564953327179, "rewards/rejected": -1.4265879392623901, "step": 904 }, { "epoch": 1.1293532338308458, "grad_norm": 3.300537347793579, "learning_rate": 8.877397881454358e-06, "logits/chosen": 0.3528830409049988, "logits/rejected": 0.20602422952651978, "logps/chosen": -531.8645629882812, "logps/rejected": -465.4809265136719, "loss": 0.4532, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6002427935600281, "rewards/margins": 0.8341575264930725, "rewards/rejected": -1.4344004392623901, "step": 908 }, { "epoch": 1.1343283582089552, "grad_norm": 3.9822070598602295, "learning_rate": 8.79463319744677e-06, "logits/chosen": -0.01692591980099678, "logits/rejected": -0.15085071325302124, "logps/chosen": -470.937744140625, "logps/rejected": -452.7400207519531, "loss": 0.4703, "rewards/accuracies": 0.8125, "rewards/chosen": -0.19091981649398804, "rewards/margins": 0.8660258054733276, "rewards/rejected": -1.0569454431533813, "step": 912 }, { "epoch": 1.1393034825870647, "grad_norm": 2.87620210647583, "learning_rate": 8.711952215038837e-06, "logits/chosen": 0.38492149114608765, "logits/rejected": 0.373988538980484, "logps/chosen": -454.4605407714844, "logps/rejected": -512.40966796875, "loss": 0.4653, "rewards/accuracies": 0.828125, "rewards/chosen": -0.19313852488994598, "rewards/margins": 0.8566547632217407, "rewards/rejected": -1.0497933626174927, "step": 916 }, { "epoch": 1.144278606965174, "grad_norm": 2.891087770462036, "learning_rate": 8.629360675661693e-06, "logits/chosen": 0.4491625130176544, "logits/rejected": 0.3486018776893616, "logps/chosen": -431.761474609375, "logps/rejected": -429.3786926269531, "loss": 0.4501, "rewards/accuracies": 0.828125, "rewards/chosen": -0.27446961402893066, "rewards/margins": 0.7661145329475403, "rewards/rejected": -1.0405840873718262, "step": 920 }, { "epoch": 1.1492537313432836, "grad_norm": 3.027414560317993, "learning_rate": 8.546864314535478e-06, "logits/chosen": 0.15871021151542664, "logits/rejected": 0.12511278688907623, "logps/chosen": -464.06573486328125, "logps/rejected": -463.8356018066406, "loss": 0.4435, "rewards/accuracies": 0.84375, "rewards/chosen": -0.32652032375335693, "rewards/margins": 0.6782144904136658, "rewards/rejected": -1.0047348737716675, "step": 924 }, { "epoch": 1.154228855721393, "grad_norm": 3.0336227416992188, "learning_rate": 8.464468860271084e-06, "logits/chosen": 0.05021004378795624, "logits/rejected": -0.06525184959173203, "logps/chosen": -471.3458557128906, "logps/rejected": -464.1771240234375, "loss": 0.4468, "rewards/accuracies": 0.890625, "rewards/chosen": 0.007861072197556496, "rewards/margins": 1.1992193460464478, "rewards/rejected": -1.191358208656311, "step": 928 }, { "epoch": 1.1592039800995024, "grad_norm": 3.0443830490112305, "learning_rate": 8.382180034472353e-06, "logits/chosen": 0.40720558166503906, "logits/rejected": 0.19039109349250793, "logps/chosen": -528.5640258789062, "logps/rejected": -479.5672302246094, "loss": 0.4447, "rewards/accuracies": 0.796875, "rewards/chosen": -0.19556114077568054, "rewards/margins": 0.9518004655838013, "rewards/rejected": -1.1473615169525146, "step": 932 }, { "epoch": 1.164179104477612, "grad_norm": 3.200226068496704, "learning_rate": 8.30000355133876e-06, "logits/chosen": 0.3259899318218231, "logits/rejected": 0.12083222717046738, "logps/chosen": -434.56243896484375, "logps/rejected": -426.33038330078125, "loss": 0.4644, "rewards/accuracies": 0.734375, "rewards/chosen": -0.26636868715286255, "rewards/margins": 0.7016679048538208, "rewards/rejected": -0.9680365920066833, "step": 936 }, { "epoch": 1.1691542288557213, "grad_norm": 3.3915629386901855, "learning_rate": 8.217945117268624e-06, "logits/chosen": -0.04203636944293976, "logits/rejected": 0.0748155415058136, "logps/chosen": -479.9148864746094, "logps/rejected": -463.1988220214844, "loss": 0.4677, "rewards/accuracies": 0.75, "rewards/chosen": -0.46628129482269287, "rewards/margins": 0.8466652631759644, "rewards/rejected": -1.3129465579986572, "step": 940 }, { "epoch": 1.1741293532338308, "grad_norm": 4.10882043838501, "learning_rate": 8.136010430462837e-06, "logits/chosen": 0.14211219549179077, "logits/rejected": 0.05810711905360222, "logps/chosen": -468.7088317871094, "logps/rejected": -482.8578796386719, "loss": 0.435, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2802658677101135, "rewards/margins": 1.0771757364273071, "rewards/rejected": -1.3574416637420654, "step": 944 }, { "epoch": 1.1791044776119404, "grad_norm": 2.790282726287842, "learning_rate": 8.0542051805292e-06, "logits/chosen": -0.11554953455924988, "logits/rejected": -0.208129420876503, "logps/chosen": -456.598876953125, "logps/rejected": -420.1100158691406, "loss": 0.4436, "rewards/accuracies": 0.71875, "rewards/chosen": -0.33638888597488403, "rewards/margins": 0.7983494400978088, "rewards/rejected": -1.1347384452819824, "step": 948 }, { "epoch": 1.1815920398009951, "eval_logits/chosen": 0.14645284414291382, "eval_logits/rejected": 0.006150411441922188, "eval_logps/chosen": -458.5403747558594, "eval_logps/rejected": -422.0716552734375, "eval_loss": 0.6252362132072449, "eval_rewards/accuracies": 0.6354166865348816, "eval_rewards/chosen": -0.6243360042572021, "eval_rewards/margins": 0.4026750922203064, "eval_rewards/rejected": -1.0270111560821533, "eval_runtime": 150.2224, "eval_samples_per_second": 7.609, "eval_steps_per_second": 0.24, "step": 950 }, { "epoch": 1.1840796019900497, "grad_norm": 2.771230697631836, "learning_rate": 7.9725350480873e-06, "logits/chosen": 0.3046668767929077, "logits/rejected": 0.17750529944896698, "logps/chosen": -487.18145751953125, "logps/rejected": -489.9942932128906, "loss": 0.4533, "rewards/accuracies": 0.875, "rewards/chosen": -0.21156734228134155, "rewards/margins": 0.8424564599990845, "rewards/rejected": -1.0540237426757812, "step": 952 }, { "epoch": 1.1890547263681592, "grad_norm": 3.108633041381836, "learning_rate": 7.89100570437408e-06, "logits/chosen": 0.5195536017417908, "logits/rejected": 0.25093546509742737, "logps/chosen": -470.94891357421875, "logps/rejected": -412.3416748046875, "loss": 0.4549, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2779660224914551, "rewards/margins": 0.7905957102775574, "rewards/rejected": -1.0685617923736572, "step": 956 }, { "epoch": 1.1940298507462686, "grad_norm": 3.2091431617736816, "learning_rate": 7.809622810849986e-06, "logits/chosen": 0.3036578595638275, "logits/rejected": 0.09663239121437073, "logps/chosen": -518.3139038085938, "logps/rejected": -491.35693359375, "loss": 0.4682, "rewards/accuracies": 0.84375, "rewards/chosen": -0.2721414566040039, "rewards/margins": 0.8471077084541321, "rewards/rejected": -1.1192492246627808, "step": 960 }, { "epoch": 1.199004975124378, "grad_norm": 3.8627357482910156, "learning_rate": 7.72839201880587e-06, "logits/chosen": 0.18529945611953735, "logits/rejected": 0.0777958407998085, "logps/chosen": -406.6213684082031, "logps/rejected": -376.0231628417969, "loss": 0.4521, "rewards/accuracies": 0.78125, "rewards/chosen": -0.16672272980213165, "rewards/margins": 0.8920981884002686, "rewards/rejected": -1.0588209629058838, "step": 964 }, { "epoch": 1.2039800995024876, "grad_norm": 3.3319764137268066, "learning_rate": 7.647318968970528e-06, "logits/chosen": 0.2665182650089264, "logits/rejected": 0.11450497806072235, "logps/chosen": -494.59991455078125, "logps/rejected": -406.59857177734375, "loss": 0.4512, "rewards/accuracies": 0.71875, "rewards/chosen": -0.35199272632598877, "rewards/margins": 0.6968194246292114, "rewards/rejected": -1.0488121509552002, "step": 968 }, { "epoch": 1.208955223880597, "grad_norm": 3.6332457065582275, "learning_rate": 7.566409291119008e-06, "logits/chosen": 0.16549383103847504, "logits/rejected": -0.042187366634607315, "logps/chosen": -411.47100830078125, "logps/rejected": -402.9810791015625, "loss": 0.4496, "rewards/accuracies": 0.828125, "rewards/chosen": -0.3397058844566345, "rewards/margins": 0.9274094700813293, "rewards/rejected": -1.2671154737472534, "step": 972 }, { "epoch": 1.2139303482587065, "grad_norm": 6.592203617095947, "learning_rate": 7.485668603681706e-06, "logits/chosen": 0.42461320757865906, "logits/rejected": 0.21914049983024597, "logps/chosen": -555.8026123046875, "logps/rejected": -517.2098388671875, "loss": 0.4522, "rewards/accuracies": 0.859375, "rewards/chosen": -0.5179316401481628, "rewards/margins": 0.991509735584259, "rewards/rejected": -1.5094413757324219, "step": 976 }, { "epoch": 1.2189054726368158, "grad_norm": 2.970505952835083, "learning_rate": 7.405102513354166e-06, "logits/chosen": 0.33260223269462585, "logits/rejected": 0.3816107511520386, "logps/chosen": -447.697509765625, "logps/rejected": -462.2044677734375, "loss": 0.4361, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3011395037174225, "rewards/margins": 0.949279248714447, "rewards/rejected": -1.250418782234192, "step": 980 }, { "epoch": 1.2238805970149254, "grad_norm": 3.419555187225342, "learning_rate": 7.324716614707794e-06, "logits/chosen": 0.2947157025337219, "logits/rejected": 0.11895683407783508, "logps/chosen": -479.42059326171875, "logps/rejected": -443.11993408203125, "loss": 0.4681, "rewards/accuracies": 0.796875, "rewards/chosen": -0.37127968668937683, "rewards/margins": 0.907949686050415, "rewards/rejected": -1.2792294025421143, "step": 984 }, { "epoch": 1.228855721393035, "grad_norm": 3.920330286026001, "learning_rate": 7.2445164898013345e-06, "logits/chosen": 0.2471471130847931, "logits/rejected": 0.16002610325813293, "logps/chosen": -482.1630859375, "logps/rejected": -452.62213134765625, "loss": 0.4621, "rewards/accuracies": 0.796875, "rewards/chosen": -0.4815042316913605, "rewards/margins": 0.6901217699050903, "rewards/rejected": -1.171626091003418, "step": 988 }, { "epoch": 1.2338308457711442, "grad_norm": 3.4007022380828857, "learning_rate": 7.1645077077932666e-06, "logits/chosen": 0.13965100049972534, "logits/rejected": -0.005473073571920395, "logps/chosen": -492.3451843261719, "logps/rejected": -430.3792419433594, "loss": 0.4685, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6226473450660706, "rewards/margins": 0.7819827795028687, "rewards/rejected": -1.404630184173584, "step": 992 }, { "epoch": 1.2388059701492538, "grad_norm": 3.3789188861846924, "learning_rate": 7.084695824555074e-06, "logits/chosen": 0.12412463128566742, "logits/rejected": -0.04749886691570282, "logps/chosen": -456.4656982421875, "logps/rejected": -449.3288269042969, "loss": 0.4596, "rewards/accuracies": 0.859375, "rewards/chosen": -0.37792813777923584, "rewards/margins": 0.9587810635566711, "rewards/rejected": -1.3367091417312622, "step": 996 }, { "epoch": 1.243781094527363, "grad_norm": 3.2594857215881348, "learning_rate": 7.005086382285426e-06, "logits/chosen": 0.19527438282966614, "logits/rejected": -0.04300277307629585, "logps/chosen": -524.0037841796875, "logps/rejected": -452.43988037109375, "loss": 0.4483, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3103996217250824, "rewards/margins": 0.7246525883674622, "rewards/rejected": -1.0350522994995117, "step": 1000 }, { "epoch": 1.243781094527363, "eval_logits/chosen": 0.1812911480665207, "eval_logits/rejected": 0.043385788798332214, "eval_logps/chosen": -458.6221923828125, "eval_logps/rejected": -422.3155822753906, "eval_loss": 0.6238306164741516, "eval_rewards/accuracies": 0.6319444179534912, "eval_rewards/chosen": -0.6325181126594543, "eval_rewards/margins": 0.41889193654060364, "eval_rewards/rejected": -1.05141019821167, "eval_runtime": 150.5346, "eval_samples_per_second": 7.593, "eval_steps_per_second": 0.239, "step": 1000 }, { "epoch": 1.2487562189054726, "grad_norm": 3.286379098892212, "learning_rate": 6.925684909125354e-06, "logits/chosen": 0.3578662574291229, "logits/rejected": 0.16462884843349457, "logps/chosen": -489.2138366699219, "logps/rejected": -452.0278015136719, "loss": 0.4915, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4701817035675049, "rewards/margins": 0.689208984375, "rewards/rejected": -1.1593906879425049, "step": 1004 }, { "epoch": 1.2537313432835822, "grad_norm": 3.4609503746032715, "learning_rate": 6.84649691877433e-06, "logits/chosen": 0.292858362197876, "logits/rejected": 0.3071328401565552, "logps/chosen": -498.725341796875, "logps/rejected": -514.9793701171875, "loss": 0.4504, "rewards/accuracies": 0.796875, "rewards/chosen": -0.3274274468421936, "rewards/margins": 0.735092282295227, "rewards/rejected": -1.0625197887420654, "step": 1008 }, { "epoch": 1.2587064676616915, "grad_norm": 4.149216175079346, "learning_rate": 6.767527910107437e-06, "logits/chosen": 0.41340234875679016, "logits/rejected": 0.23155152797698975, "logps/chosen": -589.994384765625, "logps/rejected": -512.817138671875, "loss": 0.4954, "rewards/accuracies": 0.8125, "rewards/chosen": -0.050317004323005676, "rewards/margins": 0.8621648550033569, "rewards/rejected": -0.9124818444252014, "step": 1012 }, { "epoch": 1.263681592039801, "grad_norm": 3.068629026412964, "learning_rate": 6.688783366793488e-06, "logits/chosen": 0.07183945924043655, "logits/rejected": 0.1859855055809021, "logps/chosen": -416.20281982421875, "logps/rejected": -504.38360595703125, "loss": 0.4373, "rewards/accuracies": 0.828125, "rewards/chosen": -0.47665709257125854, "rewards/margins": 1.1034715175628662, "rewards/rejected": -1.5801286697387695, "step": 1016 }, { "epoch": 1.2686567164179103, "grad_norm": 3.3486008644104004, "learning_rate": 6.610268756914254e-06, "logits/chosen": 0.2134770154953003, "logits/rejected": 0.20936183631420135, "logps/chosen": -456.9624328613281, "logps/rejected": -467.43695068359375, "loss": 0.4572, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4169631004333496, "rewards/margins": 0.7615378499031067, "rewards/rejected": -1.1785008907318115, "step": 1020 }, { "epoch": 1.2736318407960199, "grad_norm": 2.8429603576660156, "learning_rate": 6.5319895325847535e-06, "logits/chosen": 0.37751051783561707, "logits/rejected": 0.24831168353557587, "logps/chosen": -482.39044189453125, "logps/rejected": -445.41131591796875, "loss": 0.4163, "rewards/accuracies": 0.828125, "rewards/chosen": -0.3370966613292694, "rewards/margins": 1.0583980083465576, "rewards/rejected": -1.3954945802688599, "step": 1024 }, { "epoch": 1.2786069651741294, "grad_norm": 3.422572374343872, "learning_rate": 6.453951129574644e-06, "logits/chosen": 0.09760095179080963, "logits/rejected": -0.02564432844519615, "logps/chosen": -478.0529479980469, "logps/rejected": -460.9502258300781, "loss": 0.49, "rewards/accuracies": 0.734375, "rewards/chosen": -0.4741438329219818, "rewards/margins": 0.7156703472137451, "rewards/rejected": -1.1898140907287598, "step": 1028 }, { "epoch": 1.2835820895522387, "grad_norm": 7.709662914276123, "learning_rate": 6.3761589669307745e-06, "logits/chosen": 0.2523835301399231, "logits/rejected": -0.018656061962246895, "logps/chosen": -464.8309326171875, "logps/rejected": -421.08831787109375, "loss": 0.4486, "rewards/accuracies": 0.84375, "rewards/chosen": -0.46211400628089905, "rewards/margins": 0.7744930982589722, "rewards/rejected": -1.2366070747375488, "step": 1032 }, { "epoch": 1.2885572139303483, "grad_norm": 3.1016786098480225, "learning_rate": 6.298618446600856e-06, "logits/chosen": 0.28200894594192505, "logits/rejected": 0.07383685559034348, "logps/chosen": -498.76904296875, "logps/rejected": -477.27203369140625, "loss": 0.4219, "rewards/accuracies": 0.828125, "rewards/chosen": -0.26558923721313477, "rewards/margins": 1.0237658023834229, "rewards/rejected": -1.2893550395965576, "step": 1036 }, { "epoch": 1.2935323383084576, "grad_norm": 3.2327277660369873, "learning_rate": 6.221334953058389e-06, "logits/chosen": 0.15867388248443604, "logits/rejected": 0.08303281664848328, "logps/chosen": -409.2590026855469, "logps/rejected": -455.77191162109375, "loss": 0.4991, "rewards/accuracies": 0.828125, "rewards/chosen": -0.4104706346988678, "rewards/margins": 0.7841976881027222, "rewards/rejected": -1.1946682929992676, "step": 1040 }, { "epoch": 1.2985074626865671, "grad_norm": 3.339369058609009, "learning_rate": 6.144313852928712e-06, "logits/chosen": -0.03164299577474594, "logits/rejected": -0.07064341753721237, "logps/chosen": -476.3907470703125, "logps/rejected": -485.3206787109375, "loss": 0.4807, "rewards/accuracies": 0.75, "rewards/chosen": -0.3814237713813782, "rewards/margins": 0.6672918796539307, "rewards/rejected": -1.048715591430664, "step": 1044 }, { "epoch": 1.3034825870646767, "grad_norm": 3.5485079288482666, "learning_rate": 6.067560494616374e-06, "logits/chosen": 0.07933502644300461, "logits/rejected": -0.09825630486011505, "logps/chosen": -476.7882080078125, "logps/rejected": -423.75634765625, "loss": 0.4568, "rewards/accuracies": 0.859375, "rewards/chosen": -0.4646296203136444, "rewards/margins": 0.9645313620567322, "rewards/rejected": -1.4291609525680542, "step": 1048 }, { "epoch": 1.3059701492537314, "eval_logits/chosen": 0.2900795340538025, "eval_logits/rejected": 0.1563159078359604, "eval_logps/chosen": -461.8538818359375, "eval_logps/rejected": -425.2582702636719, "eval_loss": 0.6297169923782349, "eval_rewards/accuracies": 0.6284722089767456, "eval_rewards/chosen": -0.9556920528411865, "eval_rewards/margins": 0.38998663425445557, "eval_rewards/rejected": -1.3456788063049316, "eval_runtime": 150.3263, "eval_samples_per_second": 7.603, "eval_steps_per_second": 0.239, "step": 1050 }, { "epoch": 1.308457711442786, "grad_norm": 3.0293073654174805, "learning_rate": 5.9910802079337285e-06, "logits/chosen": 0.6111765503883362, "logits/rejected": 0.5151335000991821, "logps/chosen": -502.5360412597656, "logps/rejected": -504.69293212890625, "loss": 0.4666, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5117080807685852, "rewards/margins": 0.8272175788879395, "rewards/rejected": -1.338925838470459, "step": 1052 }, { "epoch": 1.3134328358208955, "grad_norm": 3.2000210285186768, "learning_rate": 5.9148783037308154e-06, "logits/chosen": 0.513329029083252, "logits/rejected": 0.3263680338859558, "logps/chosen": -450.2699890136719, "logps/rejected": -474.0347900390625, "loss": 0.4315, "rewards/accuracies": 0.828125, "rewards/chosen": -0.664053201675415, "rewards/margins": 0.9260993003845215, "rewards/rejected": -1.5901525020599365, "step": 1056 }, { "epoch": 1.3184079601990049, "grad_norm": 3.6587066650390625, "learning_rate": 5.838960073526589e-06, "logits/chosen": 0.306156724691391, "logits/rejected": 0.25492236018180847, "logps/chosen": -410.9796142578125, "logps/rejected": -433.6587829589844, "loss": 0.45, "rewards/accuracies": 0.796875, "rewards/chosen": -0.671193540096283, "rewards/margins": 0.6943222880363464, "rewards/rejected": -1.365515947341919, "step": 1060 }, { "epoch": 1.3233830845771144, "grad_norm": 3.327662706375122, "learning_rate": 5.763330789141457e-06, "logits/chosen": 0.48469293117523193, "logits/rejected": 0.19550618529319763, "logps/chosen": -453.54766845703125, "logps/rejected": -384.0401611328125, "loss": 0.4744, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7684299945831299, "rewards/margins": 0.6500393152236938, "rewards/rejected": -1.4184694290161133, "step": 1064 }, { "epoch": 1.328358208955224, "grad_norm": 3.2267775535583496, "learning_rate": 5.687995702331211e-06, "logits/chosen": 0.2688726484775543, "logits/rejected": -0.08813167363405228, "logps/chosen": -501.2934265136719, "logps/rejected": -409.6513671875, "loss": 0.4722, "rewards/accuracies": 0.828125, "rewards/chosen": -0.40029340982437134, "rewards/margins": 1.1426159143447876, "rewards/rejected": -1.5429092645645142, "step": 1068 }, { "epoch": 1.3333333333333333, "grad_norm": 2.840391159057617, "learning_rate": 5.612960044422335e-06, "logits/chosen": 0.149551659822464, "logits/rejected": 0.09185415506362915, "logps/chosen": -478.0235290527344, "logps/rejected": -465.431640625, "loss": 0.4435, "rewards/accuracies": 0.90625, "rewards/chosen": -0.33660268783569336, "rewards/margins": 0.9488120079040527, "rewards/rejected": -1.2854145765304565, "step": 1072 }, { "epoch": 1.3383084577114428, "grad_norm": 5.434293270111084, "learning_rate": 5.538229025948729e-06, "logits/chosen": 0.27715224027633667, "logits/rejected": 0.15390388667583466, "logps/chosen": -462.994873046875, "logps/rejected": -443.705322265625, "loss": 0.4547, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3295069932937622, "rewards/margins": 0.7583127617835999, "rewards/rejected": -1.0878196954727173, "step": 1076 }, { "epoch": 1.3432835820895521, "grad_norm": 2.998685121536255, "learning_rate": 5.463807836289921e-06, "logits/chosen": 0.21240472793579102, "logits/rejected": 0.08427554368972778, "logps/chosen": -515.3600463867188, "logps/rejected": -489.23199462890625, "loss": 0.4714, "rewards/accuracies": 0.84375, "rewards/chosen": -0.37038007378578186, "rewards/margins": 0.8079636693000793, "rewards/rejected": -1.1783437728881836, "step": 1080 }, { "epoch": 1.3482587064676617, "grad_norm": 3.10492205619812, "learning_rate": 5.389701643310661e-06, "logits/chosen": 0.025207914412021637, "logits/rejected": -0.02044026553630829, "logps/chosen": -456.87506103515625, "logps/rejected": -439.67376708984375, "loss": 0.4703, "rewards/accuracies": 0.828125, "rewards/chosen": -0.6905679106712341, "rewards/margins": 0.7245473861694336, "rewards/rejected": -1.4151153564453125, "step": 1084 }, { "epoch": 1.3532338308457712, "grad_norm": 3.226285934448242, "learning_rate": 5.3159155930021e-06, "logits/chosen": 0.3893488645553589, "logits/rejected": 0.08346641063690186, "logps/chosen": -531.7063598632812, "logps/rejected": -439.24896240234375, "loss": 0.4655, "rewards/accuracies": 0.890625, "rewards/chosen": -0.45566755533218384, "rewards/margins": 0.9592388868331909, "rewards/rejected": -1.4149065017700195, "step": 1088 }, { "epoch": 1.3582089552238805, "grad_norm": 3.0475516319274902, "learning_rate": 5.2424548091244334e-06, "logits/chosen": 0.2637353539466858, "logits/rejected": 0.1712309867143631, "logps/chosen": -587.0075073242188, "logps/rejected": -532.0603637695312, "loss": 0.4242, "rewards/accuracies": 0.828125, "rewards/chosen": -0.4887734055519104, "rewards/margins": 0.7975496053695679, "rewards/rejected": -1.286323070526123, "step": 1092 }, { "epoch": 1.36318407960199, "grad_norm": 3.3424243927001953, "learning_rate": 5.169324392851105e-06, "logits/chosen": 0.08936847001314163, "logits/rejected": 0.15995635092258453, "logps/chosen": -441.9477233886719, "logps/rejected": -479.75811767578125, "loss": 0.4372, "rewards/accuracies": 0.828125, "rewards/chosen": -0.15634143352508545, "rewards/margins": 0.9987101554870605, "rewards/rejected": -1.155051589012146, "step": 1096 }, { "epoch": 1.3681592039800994, "grad_norm": 3.5509424209594727, "learning_rate": 5.096529422414571e-06, "logits/chosen": 0.23796138167381287, "logits/rejected": 0.1327465921640396, "logps/chosen": -397.3380126953125, "logps/rejected": -398.999755859375, "loss": 0.4555, "rewards/accuracies": 0.765625, "rewards/chosen": -0.27862298488616943, "rewards/margins": 0.8094170093536377, "rewards/rejected": -1.0880398750305176, "step": 1100 }, { "epoch": 1.3681592039800994, "eval_logits/chosen": 0.2271488755941391, "eval_logits/rejected": 0.09045371413230896, "eval_logps/chosen": -458.12158203125, "eval_logps/rejected": -421.81396484375, "eval_loss": 0.6310929656028748, "eval_rewards/accuracies": 0.6319444179534912, "eval_rewards/chosen": -0.5824543833732605, "eval_rewards/margins": 0.4187923073768616, "eval_rewards/rejected": -1.001246690750122, "eval_runtime": 150.3083, "eval_samples_per_second": 7.604, "eval_steps_per_second": 0.24, "step": 1100 }, { "epoch": 1.373134328358209, "grad_norm": 3.0843594074249268, "learning_rate": 5.0240749527536845e-06, "logits/chosen": 0.34869927167892456, "logits/rejected": 0.4646757245063782, "logps/chosen": -461.98406982421875, "logps/rejected": -484.62005615234375, "loss": 0.4463, "rewards/accuracies": 0.734375, "rewards/chosen": -0.35243409872055054, "rewards/margins": 0.8469762206077576, "rewards/rejected": -1.199410319328308, "step": 1104 }, { "epoch": 1.3781094527363185, "grad_norm": 3.259739637374878, "learning_rate": 4.951966015162652e-06, "logits/chosen": 0.2850785255432129, "logits/rejected": 0.18751974403858185, "logps/chosen": -446.40777587890625, "logps/rejected": -452.7511291503906, "loss": 0.4905, "rewards/accuracies": 0.75, "rewards/chosen": -0.5133770108222961, "rewards/margins": 0.7468600273132324, "rewards/rejected": -1.2602368593215942, "step": 1108 }, { "epoch": 1.3830845771144278, "grad_norm": 3.8932905197143555, "learning_rate": 4.880207616941663e-06, "logits/chosen": 0.12189581990242004, "logits/rejected": -0.021296532824635506, "logps/chosen": -558.7091674804688, "logps/rejected": -541.033203125, "loss": 0.4628, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5688372254371643, "rewards/margins": 0.9365439414978027, "rewards/rejected": -1.5053812265396118, "step": 1112 }, { "epoch": 1.3880597014925373, "grad_norm": 3.545090913772583, "learning_rate": 4.8088047410492e-06, "logits/chosen": 0.388312965631485, "logits/rejected": 0.4156542122364044, "logps/chosen": -481.10894775390625, "logps/rejected": -492.54351806640625, "loss": 0.4705, "rewards/accuracies": 0.84375, "rewards/chosen": -0.45832952857017517, "rewards/margins": 0.7087246179580688, "rewards/rejected": -1.1670540571212769, "step": 1116 }, { "epoch": 1.3930348258706466, "grad_norm": 6.003669738769531, "learning_rate": 4.737762345755975e-06, "logits/chosen": 0.4008483588695526, "logits/rejected": 0.18298931419849396, "logps/chosen": -427.1964111328125, "logps/rejected": -415.52569580078125, "loss": 0.4845, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3898906111717224, "rewards/margins": 0.851486086845398, "rewards/rejected": -1.2413768768310547, "step": 1120 }, { "epoch": 1.3980099502487562, "grad_norm": 3.3979361057281494, "learning_rate": 4.667085364300678e-06, "logits/chosen": 0.2219407558441162, "logits/rejected": 0.34066152572631836, "logps/chosen": -412.3261413574219, "logps/rejected": -455.014404296875, "loss": 0.4715, "rewards/accuracies": 0.640625, "rewards/chosen": -0.659538745880127, "rewards/margins": 0.5176219344139099, "rewards/rejected": -1.1771607398986816, "step": 1124 }, { "epoch": 1.4029850746268657, "grad_norm": 3.4212944507598877, "learning_rate": 4.596778704547359e-06, "logits/chosen": 0.26894667744636536, "logits/rejected": 0.4513319134712219, "logps/chosen": -422.888916015625, "logps/rejected": -494.77490234375, "loss": 0.4455, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5094490647315979, "rewards/margins": 0.8285016417503357, "rewards/rejected": -1.3379508256912231, "step": 1128 }, { "epoch": 1.407960199004975, "grad_norm": 3.561112403869629, "learning_rate": 4.526847248644652e-06, "logits/chosen": 0.4178231358528137, "logits/rejected": 0.2783817648887634, "logps/chosen": -483.21710205078125, "logps/rejected": -457.58880615234375, "loss": 0.4657, "rewards/accuracies": 0.875, "rewards/chosen": -0.2329222410917282, "rewards/margins": 0.928954005241394, "rewards/rejected": -1.1618762016296387, "step": 1132 }, { "epoch": 1.4129353233830846, "grad_norm": 2.9117441177368164, "learning_rate": 4.457295852686746e-06, "logits/chosen": 0.0378599688410759, "logits/rejected": -0.03257442265748978, "logps/chosen": -457.05810546875, "logps/rejected": -469.375244140625, "loss": 0.4197, "rewards/accuracies": 0.875, "rewards/chosen": -0.36898645758628845, "rewards/margins": 0.7519980072975159, "rewards/rejected": -1.120984435081482, "step": 1136 }, { "epoch": 1.417910447761194, "grad_norm": 2.848207950592041, "learning_rate": 4.388129346376177e-06, "logits/chosen": 0.286159873008728, "logits/rejected": 0.172444686293602, "logps/chosen": -449.7042541503906, "logps/rejected": -421.1620178222656, "loss": 0.4734, "rewards/accuracies": 0.84375, "rewards/chosen": -0.26226454973220825, "rewards/margins": 0.8386867046356201, "rewards/rejected": -1.1009511947631836, "step": 1140 }, { "epoch": 1.4228855721393034, "grad_norm": 3.3167192935943604, "learning_rate": 4.319352532688444e-06, "logits/chosen": 0.5666424632072449, "logits/rejected": 0.4337669014930725, "logps/chosen": -525.3414306640625, "logps/rejected": -470.1195373535156, "loss": 0.4637, "rewards/accuracies": 0.75, "rewards/chosen": -0.21915507316589355, "rewards/margins": 0.6568694114685059, "rewards/rejected": -0.8760244250297546, "step": 1144 }, { "epoch": 1.427860696517413, "grad_norm": 3.3861823081970215, "learning_rate": 4.250970187538484e-06, "logits/chosen": 0.5643165707588196, "logits/rejected": 0.32966622710227966, "logps/chosen": -515.8458251953125, "logps/rejected": -457.2875061035156, "loss": 0.4744, "rewards/accuracies": 0.8125, "rewards/chosen": -0.32475122809410095, "rewards/margins": 0.6942940950393677, "rewards/rejected": -1.0190452337265015, "step": 1148 }, { "epoch": 1.4303482587064678, "eval_logits/chosen": 0.1860918253660202, "eval_logits/rejected": 0.04717594385147095, "eval_logps/chosen": -457.6623229980469, "eval_logps/rejected": -421.17510986328125, "eval_loss": 0.6247898936271667, "eval_rewards/accuracies": 0.6423611044883728, "eval_rewards/chosen": -0.5365298390388489, "eval_rewards/margins": 0.40083229541778564, "eval_rewards/rejected": -0.9373621344566345, "eval_runtime": 150.2353, "eval_samples_per_second": 7.608, "eval_steps_per_second": 0.24, "step": 1150 }, { "epoch": 1.4328358208955223, "grad_norm": 3.2325477600097656, "learning_rate": 4.182987059449056e-06, "logits/chosen": 0.7428713440895081, "logits/rejected": 0.5120058059692383, "logps/chosen": -538.3825073242188, "logps/rejected": -495.9022521972656, "loss": 0.4357, "rewards/accuracies": 0.875, "rewards/chosen": -0.06033702194690704, "rewards/margins": 0.8632436990737915, "rewards/rejected": -0.9235806465148926, "step": 1152 }, { "epoch": 1.4378109452736318, "grad_norm": 3.037522554397583, "learning_rate": 4.115407869220948e-06, "logits/chosen": 0.14752769470214844, "logits/rejected": 0.11974264681339264, "logps/chosen": -451.0633544921875, "logps/rejected": -483.72320556640625, "loss": 0.4238, "rewards/accuracies": 0.828125, "rewards/chosen": -0.340376079082489, "rewards/margins": 0.8940660953521729, "rewards/rejected": -1.2344422340393066, "step": 1156 }, { "epoch": 1.4427860696517412, "grad_norm": 3.1680057048797607, "learning_rate": 4.048237309605216e-06, "logits/chosen": 0.09094828367233276, "logits/rejected": 0.04780995845794678, "logps/chosen": -484.3055725097656, "logps/rejected": -510.98077392578125, "loss": 0.4296, "rewards/accuracies": 0.828125, "rewards/chosen": -0.12246174365282059, "rewards/margins": 0.9445063471794128, "rewards/rejected": -1.0669679641723633, "step": 1160 }, { "epoch": 1.4477611940298507, "grad_norm": 3.2691431045532227, "learning_rate": 3.981480044977284e-06, "logits/chosen": 0.40636903047561646, "logits/rejected": 0.1688281148672104, "logps/chosen": -479.45855712890625, "logps/rejected": -447.90863037109375, "loss": 0.4313, "rewards/accuracies": 0.828125, "rewards/chosen": -0.18339478969573975, "rewards/margins": 0.978028416633606, "rewards/rejected": -1.1614230871200562, "step": 1164 }, { "epoch": 1.4527363184079602, "grad_norm": 4.486429691314697, "learning_rate": 3.915140711013044e-06, "logits/chosen": 0.27190345525741577, "logits/rejected": -0.040328770875930786, "logps/chosen": -491.3775634765625, "logps/rejected": -391.1009521484375, "loss": 0.4203, "rewards/accuracies": 0.828125, "rewards/chosen": -0.08996576815843582, "rewards/margins": 0.9748902916908264, "rewards/rejected": -1.064855933189392, "step": 1168 }, { "epoch": 1.4577114427860698, "grad_norm": 3.12208890914917, "learning_rate": 3.849223914366981e-06, "logits/chosen": 0.47814592719078064, "logits/rejected": 0.34353479743003845, "logps/chosen": -412.7701110839844, "logps/rejected": -392.3418884277344, "loss": 0.4408, "rewards/accuracies": 0.8125, "rewards/chosen": -0.47262948751449585, "rewards/margins": 0.8124217987060547, "rewards/rejected": -1.2850513458251953, "step": 1172 }, { "epoch": 1.462686567164179, "grad_norm": 3.506924629211426, "learning_rate": 3.7837342323522454e-06, "logits/chosen": 0.21991969645023346, "logits/rejected": 0.07681813836097717, "logps/chosen": -441.36126708984375, "logps/rejected": -480.6995849609375, "loss": 0.4859, "rewards/accuracies": 0.828125, "rewards/chosen": -0.3392900824546814, "rewards/margins": 0.7776792645454407, "rewards/rejected": -1.116969347000122, "step": 1176 }, { "epoch": 1.4676616915422884, "grad_norm": 3.4964208602905273, "learning_rate": 3.7186762126228227e-06, "logits/chosen": 0.22460336983203888, "logits/rejected": 0.20655813813209534, "logps/chosen": -462.06292724609375, "logps/rejected": -461.53619384765625, "loss": 0.4446, "rewards/accuracies": 0.796875, "rewards/chosen": -0.4264296293258667, "rewards/margins": 0.894850492477417, "rewards/rejected": -1.3212801218032837, "step": 1180 }, { "epoch": 1.472636815920398, "grad_norm": 3.5613043308258057, "learning_rate": 3.654054372857738e-06, "logits/chosen": 0.5799933075904846, "logits/rejected": 0.6048757433891296, "logps/chosen": -396.9797058105469, "logps/rejected": -425.9620361328125, "loss": 0.4914, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5064554810523987, "rewards/margins": 0.8282725214958191, "rewards/rejected": -1.3347280025482178, "step": 1184 }, { "epoch": 1.4776119402985075, "grad_norm": 3.3799402713775635, "learning_rate": 3.5898732004473523e-06, "logits/chosen": 0.12272289395332336, "logits/rejected": 0.06375124305486679, "logps/chosen": -490.4371337890625, "logps/rejected": -490.0047607421875, "loss": 0.4499, "rewards/accuracies": 0.796875, "rewards/chosen": -0.523765504360199, "rewards/margins": 0.788061261177063, "rewards/rejected": -1.3118268251419067, "step": 1188 }, { "epoch": 1.482587064676617, "grad_norm": 3.9501986503601074, "learning_rate": 3.5261371521817247e-06, "logits/chosen": 0.410488486289978, "logits/rejected": 0.24566176533699036, "logps/chosen": -488.9652099609375, "logps/rejected": -474.0276794433594, "loss": 0.481, "rewards/accuracies": 0.859375, "rewards/chosen": -0.2862902283668518, "rewards/margins": 0.930047869682312, "rewards/rejected": -1.216338038444519, "step": 1192 }, { "epoch": 1.4875621890547264, "grad_norm": 3.150517702102661, "learning_rate": 3.462850653941171e-06, "logits/chosen": 0.478097140789032, "logits/rejected": 0.35722148418426514, "logps/chosen": -488.7939453125, "logps/rejected": -501.26995849609375, "loss": 0.441, "rewards/accuracies": 0.828125, "rewards/chosen": -0.44928714632987976, "rewards/margins": 1.0384962558746338, "rewards/rejected": -1.487783432006836, "step": 1196 }, { "epoch": 1.4925373134328357, "grad_norm": 3.1727919578552246, "learning_rate": 3.4000181003889e-06, "logits/chosen": 0.5604240298271179, "logits/rejected": 0.5023808479309082, "logps/chosen": -495.4609069824219, "logps/rejected": -486.8149108886719, "loss": 0.4245, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5019670128822327, "rewards/margins": 0.7741233110427856, "rewards/rejected": -1.276090383529663, "step": 1200 }, { "epoch": 1.4925373134328357, "eval_logits/chosen": 0.0996675118803978, "eval_logits/rejected": -0.04232680797576904, "eval_logps/chosen": -458.75396728515625, "eval_logps/rejected": -422.38055419921875, "eval_loss": 0.625542402267456, "eval_rewards/accuracies": 0.6423611044883728, "eval_rewards/chosen": -0.6456986665725708, "eval_rewards/margins": 0.4122096002101898, "eval_rewards/rejected": -1.057908296585083, "eval_runtime": 150.4735, "eval_samples_per_second": 7.596, "eval_steps_per_second": 0.239, "step": 1200 }, { "epoch": 1.4975124378109452, "grad_norm": 3.5961523056030273, "learning_rate": 3.337643854665843e-06, "logits/chosen": 0.24791333079338074, "logits/rejected": 0.09919527173042297, "logps/chosen": -472.2637939453125, "logps/rejected": -468.1837463378906, "loss": 0.4725, "rewards/accuracies": 0.75, "rewards/chosen": -0.3583456873893738, "rewards/margins": 0.6298627853393555, "rewards/rejected": -0.988208532333374, "step": 1204 }, { "epoch": 1.5024875621890548, "grad_norm": 3.5279345512390137, "learning_rate": 3.2757322480876996e-06, "logits/chosen": 0.5340238213539124, "logits/rejected": 0.27070313692092896, "logps/chosen": -559.6838989257812, "logps/rejected": -504.8668212890625, "loss": 0.4544, "rewards/accuracies": 0.8125, "rewards/chosen": -0.36284855008125305, "rewards/margins": 0.9084014892578125, "rewards/rejected": -1.2712500095367432, "step": 1208 }, { "epoch": 1.5074626865671643, "grad_norm": 3.2980010509490967, "learning_rate": 3.2142875798441376e-06, "logits/chosen": 0.4306156635284424, "logits/rejected": 0.39325863122940063, "logps/chosen": -497.91082763671875, "logps/rejected": -504.37799072265625, "loss": 0.4623, "rewards/accuracies": 0.78125, "rewards/chosen": -0.36765068769454956, "rewards/margins": 0.9293129444122314, "rewards/rejected": -1.2969635725021362, "step": 1212 }, { "epoch": 1.5124378109452736, "grad_norm": 3.0862483978271484, "learning_rate": 3.15331411670027e-06, "logits/chosen": 0.3046882450580597, "logits/rejected": 0.16192057728767395, "logps/chosen": -494.2255859375, "logps/rejected": -420.78521728515625, "loss": 0.4428, "rewards/accuracies": 0.78125, "rewards/chosen": -0.43021082878112793, "rewards/margins": 0.6426270008087158, "rewards/rejected": -1.0728377103805542, "step": 1216 }, { "epoch": 1.517412935323383, "grad_norm": 3.462331533432007, "learning_rate": 3.092816092700366e-06, "logits/chosen": 0.5411734580993652, "logits/rejected": 0.49231088161468506, "logps/chosen": -453.7169189453125, "logps/rejected": -449.0989990234375, "loss": 0.4454, "rewards/accuracies": 0.796875, "rewards/chosen": -0.22313016653060913, "rewards/margins": 0.8612147569656372, "rewards/rejected": -1.0843448638916016, "step": 1220 }, { "epoch": 1.5223880597014925, "grad_norm": 3.7653286457061768, "learning_rate": 3.032797708873828e-06, "logits/chosen": 0.11857330799102783, "logits/rejected": 0.07505325227975845, "logps/chosen": -410.9967956542969, "logps/rejected": -394.2019348144531, "loss": 0.4583, "rewards/accuracies": 0.875, "rewards/chosen": -0.18447177112102509, "rewards/margins": 0.8445629477500916, "rewards/rejected": -1.0290347337722778, "step": 1224 }, { "epoch": 1.527363184079602, "grad_norm": 3.707679033279419, "learning_rate": 2.97326313294349e-06, "logits/chosen": 0.40369507670402527, "logits/rejected": 0.31983357667922974, "logps/chosen": -493.98748779296875, "logps/rejected": -492.9511413574219, "loss": 0.4529, "rewards/accuracies": 0.8125, "rewards/chosen": -0.09158361703157425, "rewards/margins": 0.8120929598808289, "rewards/rejected": -0.9036765694618225, "step": 1228 }, { "epoch": 1.5323383084577116, "grad_norm": 3.2079174518585205, "learning_rate": 2.914216499036178e-06, "logits/chosen": 0.25405532121658325, "logits/rejected": 0.0883391723036766, "logps/chosen": -499.791259765625, "logps/rejected": -481.15985107421875, "loss": 0.4596, "rewards/accuracies": 0.859375, "rewards/chosen": -0.11543658375740051, "rewards/margins": 0.9941234588623047, "rewards/rejected": -1.1095600128173828, "step": 1232 }, { "epoch": 1.537313432835821, "grad_norm": 2.983884811401367, "learning_rate": 2.855661907395655e-06, "logits/chosen": 0.03389931470155716, "logits/rejected": -0.021514683961868286, "logps/chosen": -491.31011962890625, "logps/rejected": -476.60400390625, "loss": 0.426, "rewards/accuracies": 0.765625, "rewards/chosen": -0.34667280316352844, "rewards/margins": 0.7871711850166321, "rewards/rejected": -1.133844017982483, "step": 1236 }, { "epoch": 1.5422885572139302, "grad_norm": 3.2897229194641113, "learning_rate": 2.7976034240978834e-06, "logits/chosen": 0.2967681884765625, "logits/rejected": 0.21248680353164673, "logps/chosen": -450.992431640625, "logps/rejected": -448.6529541015625, "loss": 0.4541, "rewards/accuracies": 0.828125, "rewards/chosen": -0.4360540509223938, "rewards/margins": 0.8572646379470825, "rewards/rejected": -1.2933186292648315, "step": 1240 }, { "epoch": 1.5472636815920398, "grad_norm": 3.180680990219116, "learning_rate": 2.740045080768694e-06, "logits/chosen": 0.18130186200141907, "logits/rejected": 0.08365779370069504, "logps/chosen": -490.65093994140625, "logps/rejected": -455.1732482910156, "loss": 0.4406, "rewards/accuracies": 0.828125, "rewards/chosen": -0.30239832401275635, "rewards/margins": 1.023730993270874, "rewards/rejected": -1.3261293172836304, "step": 1244 }, { "epoch": 1.5522388059701493, "grad_norm": 3.139068841934204, "learning_rate": 2.6829908743037936e-06, "logits/chosen": 0.09111860394477844, "logits/rejected": 0.002367449924349785, "logps/chosen": -452.03741455078125, "logps/rejected": -420.84130859375, "loss": 0.4767, "rewards/accuracies": 0.765625, "rewards/chosen": -0.44431072473526, "rewards/margins": 0.6989957690238953, "rewards/rejected": -1.1433064937591553, "step": 1248 }, { "epoch": 1.554726368159204, "eval_logits/chosen": 0.2652171850204468, "eval_logits/rejected": 0.12997165322303772, "eval_logps/chosen": -459.63043212890625, "eval_logps/rejected": -423.3202209472656, "eval_loss": 0.629449188709259, "eval_rewards/accuracies": 0.6319444179534912, "eval_rewards/chosen": -0.733344554901123, "eval_rewards/margins": 0.41853055357933044, "eval_rewards/rejected": -1.1518750190734863, "eval_runtime": 149.9641, "eval_samples_per_second": 7.622, "eval_steps_per_second": 0.24, "step": 1250 }, { "epoch": 1.5572139303482588, "grad_norm": 2.9897727966308594, "learning_rate": 2.626444766591253e-06, "logits/chosen": 0.2800312340259552, "logits/rejected": 0.3279171586036682, "logps/chosen": -443.030517578125, "logps/rejected": -504.1892395019531, "loss": 0.4307, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6528493762016296, "rewards/margins": 0.8220282793045044, "rewards/rejected": -1.4748777151107788, "step": 1252 }, { "epoch": 1.5621890547263682, "grad_norm": 3.8915977478027344, "learning_rate": 2.570410684236365e-06, "logits/chosen": 0.422254741191864, "logits/rejected": 0.20921355485916138, "logps/chosen": -451.8226318359375, "logps/rejected": -409.6569519042969, "loss": 0.4575, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5334627628326416, "rewards/margins": 0.7321829199790955, "rewards/rejected": -1.2656457424163818, "step": 1256 }, { "epoch": 1.5671641791044775, "grad_norm": 3.2011289596557617, "learning_rate": 2.514892518288988e-06, "logits/chosen": 0.2908586859703064, "logits/rejected": 0.2285086065530777, "logps/chosen": -540.8156127929688, "logps/rejected": -594.375244140625, "loss": 0.4278, "rewards/accuracies": 0.890625, "rewards/chosen": -0.4963512718677521, "rewards/margins": 1.1527037620544434, "rewards/rejected": -1.6490551233291626, "step": 1260 }, { "epoch": 1.572139303482587, "grad_norm": 3.1737494468688965, "learning_rate": 2.4598941239733555e-06, "logits/chosen": 0.3123033046722412, "logits/rejected": 0.09384813904762268, "logps/chosen": -506.7591552734375, "logps/rejected": -445.96966552734375, "loss": 0.4576, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5109487771987915, "rewards/margins": 0.6419615745544434, "rewards/rejected": -1.1529103517532349, "step": 1264 }, { "epoch": 1.5771144278606966, "grad_norm": 3.3242666721343994, "learning_rate": 2.4054193204203457e-06, "logits/chosen": 0.4143025279045105, "logits/rejected": 0.3821703791618347, "logps/chosen": -444.6474609375, "logps/rejected": -441.0220031738281, "loss": 0.4599, "rewards/accuracies": 0.828125, "rewards/chosen": -0.30891337990760803, "rewards/margins": 0.7618473768234253, "rewards/rejected": -1.0707608461380005, "step": 1268 }, { "epoch": 1.582089552238806, "grad_norm": 3.3792619705200195, "learning_rate": 2.3514718904022993e-06, "logits/chosen": 0.4942702651023865, "logits/rejected": 0.4701668620109558, "logps/chosen": -440.89813232421875, "logps/rejected": -436.6291809082031, "loss": 0.4681, "rewards/accuracies": 0.8125, "rewards/chosen": -0.30445003509521484, "rewards/margins": 0.7473883032798767, "rewards/rejected": -1.0518382787704468, "step": 1272 }, { "epoch": 1.5870646766169154, "grad_norm": 3.4865262508392334, "learning_rate": 2.2980555800703273e-06, "logits/chosen": 0.03151869773864746, "logits/rejected": -0.11891334503889084, "logps/chosen": -446.2289123535156, "logps/rejected": -399.5888977050781, "loss": 0.4466, "rewards/accuracies": 0.828125, "rewards/chosen": -0.37324345111846924, "rewards/margins": 0.8816293478012085, "rewards/rejected": -1.2548727989196777, "step": 1276 }, { "epoch": 1.5920398009950247, "grad_norm": 3.55745530128479, "learning_rate": 2.2451740986941905e-06, "logits/chosen": 0.06370651721954346, "logits/rejected": 0.08302780240774155, "logps/chosen": -445.5990905761719, "logps/rejected": -471.1882629394531, "loss": 0.472, "rewards/accuracies": 0.75, "rewards/chosen": -0.45704060792922974, "rewards/margins": 0.803365170955658, "rewards/rejected": -1.2604056596755981, "step": 1280 }, { "epoch": 1.5970149253731343, "grad_norm": 4.591226100921631, "learning_rate": 2.1928311184046967e-06, "logits/chosen": 0.3119097650051117, "logits/rejected": 0.196340873837471, "logps/chosen": -478.5575256347656, "logps/rejected": -448.47967529296875, "loss": 0.4374, "rewards/accuracies": 0.8125, "rewards/chosen": -0.293584406375885, "rewards/margins": 0.9845431447029114, "rewards/rejected": -1.2781274318695068, "step": 1284 }, { "epoch": 1.6019900497512438, "grad_norm": 3.375105142593384, "learning_rate": 2.1410302739387424e-06, "logits/chosen": 0.35102376341819763, "logits/rejected": 0.2489197850227356, "logps/chosen": -476.27801513671875, "logps/rejected": -478.7754821777344, "loss": 0.4485, "rewards/accuracies": 0.765625, "rewards/chosen": -0.42445307970046997, "rewards/margins": 0.7885385751724243, "rewards/rejected": -1.2129915952682495, "step": 1288 }, { "epoch": 1.6069651741293534, "grad_norm": 3.0185811519622803, "learning_rate": 2.0897751623868833e-06, "logits/chosen": 0.150477796792984, "logits/rejected": 0.00734228640794754, "logps/chosen": -409.4201354980469, "logps/rejected": -382.40716552734375, "loss": 0.4268, "rewards/accuracies": 0.828125, "rewards/chosen": -0.45322197675704956, "rewards/margins": 0.8744790554046631, "rewards/rejected": -1.3277010917663574, "step": 1292 }, { "epoch": 1.6119402985074627, "grad_norm": 3.5438005924224854, "learning_rate": 2.0390693429435626e-06, "logits/chosen": 0.27790558338165283, "logits/rejected": 0.18525969982147217, "logps/chosen": -441.6915283203125, "logps/rejected": -437.93292236328125, "loss": 0.4422, "rewards/accuracies": 0.84375, "rewards/chosen": -0.37093889713287354, "rewards/margins": 0.9228270053863525, "rewards/rejected": -1.2937657833099365, "step": 1296 }, { "epoch": 1.616915422885572, "grad_norm": 3.5567455291748047, "learning_rate": 1.9889163366599607e-06, "logits/chosen": 0.11875329911708832, "logits/rejected": -0.03504091128706932, "logps/chosen": -443.64599609375, "logps/rejected": -427.08154296875, "loss": 0.4714, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6551141738891602, "rewards/margins": 0.9976121187210083, "rewards/rejected": -1.652726411819458, "step": 1300 }, { "epoch": 1.616915422885572, "eval_logits/chosen": 0.1787891387939453, "eval_logits/rejected": 0.03969912976026535, "eval_logps/chosen": -460.4245300292969, "eval_logps/rejected": -424.1896057128906, "eval_loss": 0.6253213286399841, "eval_rewards/accuracies": 0.6493055820465088, "eval_rewards/chosen": -0.8127551674842834, "eval_rewards/margins": 0.42605745792388916, "eval_rewards/rejected": -1.2388125658035278, "eval_runtime": 149.9009, "eval_samples_per_second": 7.625, "eval_steps_per_second": 0.24, "step": 1300 }, { "epoch": 1.6218905472636815, "grad_norm": 3.5520968437194824, "learning_rate": 1.939319626199483e-06, "logits/chosen": 0.2985292375087738, "logits/rejected": 0.2237393856048584, "logps/chosen": -436.99053955078125, "logps/rejected": -440.2374267578125, "loss": 0.4406, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5354505777359009, "rewards/margins": 0.9315750598907471, "rewards/rejected": -1.4670255184173584, "step": 1304 }, { "epoch": 1.626865671641791, "grad_norm": 3.226693868637085, "learning_rate": 1.890282655595922e-06, "logits/chosen": 0.07176372408866882, "logits/rejected": 0.027363533154129982, "logps/chosen": -474.4735107421875, "logps/rejected": -492.5865173339844, "loss": 0.4211, "rewards/accuracies": 0.875, "rewards/chosen": -0.5504629611968994, "rewards/margins": 0.9078149795532227, "rewards/rejected": -1.458277940750122, "step": 1308 }, { "epoch": 1.6318407960199006, "grad_norm": 3.6644186973571777, "learning_rate": 1.8418088300143044e-06, "logits/chosen": 0.07038739323616028, "logits/rejected": -0.047772906720638275, "logps/chosen": -434.7318115234375, "logps/rejected": -420.2578430175781, "loss": 0.4714, "rewards/accuracies": 0.75, "rewards/chosen": -0.6160821914672852, "rewards/margins": 0.760805606842041, "rewards/rejected": -1.3768879175186157, "step": 1312 }, { "epoch": 1.63681592039801, "grad_norm": 4.1779093742370605, "learning_rate": 1.7939015155144378e-06, "logits/chosen": 0.40807458758354187, "logits/rejected": 0.42695319652557373, "logps/chosen": -497.0574645996094, "logps/rejected": -563.171142578125, "loss": 0.4406, "rewards/accuracies": 0.828125, "rewards/chosen": -0.484649658203125, "rewards/margins": 0.8875025510787964, "rewards/rejected": -1.3721522092819214, "step": 1316 }, { "epoch": 1.6417910447761193, "grad_norm": 4.379684925079346, "learning_rate": 1.7465640388171589e-06, "logits/chosen": 0.34882089495658875, "logits/rejected": 0.1509179174900055, "logps/chosen": -489.57470703125, "logps/rejected": -458.5868225097656, "loss": 0.428, "rewards/accuracies": 0.859375, "rewards/chosen": -0.40813666582107544, "rewards/margins": 0.9864634871482849, "rewards/rejected": -1.3946000337600708, "step": 1320 }, { "epoch": 1.6467661691542288, "grad_norm": 2.909369945526123, "learning_rate": 1.6997996870733268e-06, "logits/chosen": 0.5466185808181763, "logits/rejected": 0.4700179696083069, "logps/chosen": -430.5594787597656, "logps/rejected": -413.28466796875, "loss": 0.4083, "rewards/accuracies": 0.828125, "rewards/chosen": -0.2856665849685669, "rewards/margins": 0.9960864782333374, "rewards/rejected": -1.2817531824111938, "step": 1324 }, { "epoch": 1.6517412935323383, "grad_norm": 2.9255692958831787, "learning_rate": 1.6536117076355652e-06, "logits/chosen": 0.3447165787220001, "logits/rejected": 0.2050694227218628, "logps/chosen": -502.8382568359375, "logps/rejected": -489.395263671875, "loss": 0.465, "rewards/accuracies": 0.828125, "rewards/chosen": -0.17628344893455505, "rewards/margins": 0.8542786240577698, "rewards/rejected": -1.0305620431900024, "step": 1328 }, { "epoch": 1.6567164179104479, "grad_norm": 3.957753896713257, "learning_rate": 1.6080033078327585e-06, "logits/chosen": 0.037751637399196625, "logits/rejected": -0.0011347047984600067, "logps/chosen": -483.31829833984375, "logps/rejected": -538.3037719726562, "loss": 0.4812, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5082539319992065, "rewards/margins": 0.6922823190689087, "rewards/rejected": -1.2005363702774048, "step": 1332 }, { "epoch": 1.6616915422885572, "grad_norm": 3.4213998317718506, "learning_rate": 1.5629776547473397e-06, "logits/chosen": 0.4570158123970032, "logits/rejected": 0.31136855483055115, "logps/chosen": -430.8033447265625, "logps/rejected": -415.0570068359375, "loss": 0.4232, "rewards/accuracies": 0.765625, "rewards/chosen": -0.4616736173629761, "rewards/margins": 0.7074974179267883, "rewards/rejected": -1.1691709756851196, "step": 1336 }, { "epoch": 1.6666666666666665, "grad_norm": 4.027344226837158, "learning_rate": 1.5185378749953538e-06, "logits/chosen": 0.4271657466888428, "logits/rejected": 0.4088464379310608, "logps/chosen": -447.46575927734375, "logps/rejected": -485.32666015625, "loss": 0.4902, "rewards/accuracies": 0.796875, "rewards/chosen": -0.26651304960250854, "rewards/margins": 0.8962290287017822, "rewards/rejected": -1.162742018699646, "step": 1340 }, { "epoch": 1.671641791044776, "grad_norm": 2.962707281112671, "learning_rate": 1.4746870545093528e-06, "logits/chosen": 0.45913419127464294, "logits/rejected": 0.1819644272327423, "logps/chosen": -413.649658203125, "logps/rejected": -396.4461669921875, "loss": 0.4185, "rewards/accuracies": 0.890625, "rewards/chosen": -0.3579864203929901, "rewards/margins": 0.9220394492149353, "rewards/rejected": -1.2800259590148926, "step": 1344 }, { "epoch": 1.6766169154228856, "grad_norm": 3.308551788330078, "learning_rate": 1.4314282383241097e-06, "logits/chosen": 0.3890434205532074, "logits/rejected": 0.17695972323417664, "logps/chosen": -442.94415283203125, "logps/rejected": -399.24871826171875, "loss": 0.4336, "rewards/accuracies": 0.859375, "rewards/chosen": -0.38554510474205017, "rewards/margins": 1.0001806020736694, "rewards/rejected": -1.385725736618042, "step": 1348 }, { "epoch": 1.6791044776119404, "eval_logits/chosen": 0.2587340176105499, "eval_logits/rejected": 0.12335896492004395, "eval_logps/chosen": -459.9505615234375, "eval_logps/rejected": -423.8654479980469, "eval_loss": 0.6228974461555481, "eval_rewards/accuracies": 0.6423611044883728, "eval_rewards/chosen": -0.7653533220291138, "eval_rewards/margins": 0.44104525446891785, "eval_rewards/rejected": -1.2063984870910645, "eval_runtime": 149.9993, "eval_samples_per_second": 7.62, "eval_steps_per_second": 0.24, "step": 1350 }, { "epoch": 1.6815920398009951, "grad_norm": 3.7813925743103027, "learning_rate": 1.388764430365147e-06, "logits/chosen": 0.04878038913011551, "logits/rejected": 0.14894048869609833, "logps/chosen": -482.2070617675781, "logps/rejected": -522.3347778320312, "loss": 0.4985, "rewards/accuracies": 0.796875, "rewards/chosen": -0.40297234058380127, "rewards/margins": 0.8282972574234009, "rewards/rejected": -1.2312694787979126, "step": 1352 }, { "epoch": 1.6865671641791045, "grad_norm": 4.5991363525390625, "learning_rate": 1.3466985932401743e-06, "logits/chosen": 0.328086256980896, "logits/rejected": 0.15323612093925476, "logps/chosen": -475.05078125, "logps/rejected": -424.8678894042969, "loss": 0.5033, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5331354737281799, "rewards/margins": 0.6546344757080078, "rewards/rejected": -1.187769889831543, "step": 1356 }, { "epoch": 1.6915422885572138, "grad_norm": 3.5663769245147705, "learning_rate": 1.3052336480333372e-06, "logits/chosen": 0.2575068771839142, "logits/rejected": 0.05139423906803131, "logps/chosen": -488.58538818359375, "logps/rejected": -455.89971923828125, "loss": 0.4237, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3154524862766266, "rewards/margins": 1.0843840837478638, "rewards/rejected": -1.399836540222168, "step": 1360 }, { "epoch": 1.6965174129353233, "grad_norm": 2.9855713844299316, "learning_rate": 1.2643724741023845e-06, "logits/chosen": 0.19129760563373566, "logits/rejected": 0.0936068594455719, "logps/chosen": -416.364501953125, "logps/rejected": -433.6866455078125, "loss": 0.4275, "rewards/accuracies": 0.875, "rewards/chosen": -0.408183217048645, "rewards/margins": 1.019007921218872, "rewards/rejected": -1.427191138267517, "step": 1364 }, { "epoch": 1.7014925373134329, "grad_norm": 3.654536485671997, "learning_rate": 1.2241179088787192e-06, "logits/chosen": 0.39111489057540894, "logits/rejected": 0.035055145621299744, "logps/chosen": -540.2343139648438, "logps/rejected": -456.66424560546875, "loss": 0.4579, "rewards/accuracies": 0.890625, "rewards/chosen": -0.567006528377533, "rewards/margins": 0.8900810480117798, "rewards/rejected": -1.4570876359939575, "step": 1368 }, { "epoch": 1.7064676616915424, "grad_norm": 3.8228235244750977, "learning_rate": 1.1844727476703776e-06, "logits/chosen": 0.42539361119270325, "logits/rejected": 0.1684579700231552, "logps/chosen": -524.5021362304688, "logps/rejected": -491.33587646484375, "loss": 0.4795, "rewards/accuracies": 0.84375, "rewards/chosen": -0.535822868347168, "rewards/margins": 1.0067797899246216, "rewards/rejected": -1.5426026582717896, "step": 1372 }, { "epoch": 1.7114427860696517, "grad_norm": 3.2348790168762207, "learning_rate": 1.1454397434679022e-06, "logits/chosen": 0.40201398730278015, "logits/rejected": 0.2579033672809601, "logps/chosen": -561.11328125, "logps/rejected": -525.084716796875, "loss": 0.4248, "rewards/accuracies": 0.890625, "rewards/chosen": -0.30076974630355835, "rewards/margins": 1.0572198629379272, "rewards/rejected": -1.3579895496368408, "step": 1376 }, { "epoch": 1.716417910447761, "grad_norm": 3.4407033920288086, "learning_rate": 1.1070216067531825e-06, "logits/chosen": 0.25583919882774353, "logits/rejected": 0.3665779232978821, "logps/chosen": -391.76739501953125, "logps/rejected": -447.6612854003906, "loss": 0.45, "rewards/accuracies": 0.75, "rewards/chosen": -0.5686730742454529, "rewards/margins": 0.7188047170639038, "rewards/rejected": -1.287477731704712, "step": 1380 }, { "epoch": 1.7213930348258706, "grad_norm": 3.3550174236297607, "learning_rate": 1.0692210053112451e-06, "logits/chosen": 0.09708093851804733, "logits/rejected": -0.10916668176651001, "logps/chosen": -493.13922119140625, "logps/rejected": -448.70391845703125, "loss": 0.4779, "rewards/accuracies": 0.734375, "rewards/chosen": -0.7006162405014038, "rewards/margins": 0.6611677408218384, "rewards/rejected": -1.3617841005325317, "step": 1384 }, { "epoch": 1.7263681592039801, "grad_norm": 3.5417962074279785, "learning_rate": 1.032040564044975e-06, "logits/chosen": 0.10944227129220963, "logits/rejected": 0.03197764605283737, "logps/chosen": -475.2284851074219, "logps/rejected": -441.09356689453125, "loss": 0.4613, "rewards/accuracies": 0.859375, "rewards/chosen": -0.5546907186508179, "rewards/margins": 0.8569729328155518, "rewards/rejected": -1.4116637706756592, "step": 1388 }, { "epoch": 1.7313432835820897, "grad_norm": 2.918147087097168, "learning_rate": 9.954828647928727e-07, "logits/chosen": 0.22849802672863007, "logits/rejected": 0.1010328084230423, "logps/chosen": -401.5469055175781, "logps/rejected": -382.9391174316406, "loss": 0.4112, "rewards/accuracies": 0.90625, "rewards/chosen": -0.3927709460258484, "rewards/margins": 0.9075154066085815, "rewards/rejected": -1.3002864122390747, "step": 1392 }, { "epoch": 1.736318407960199, "grad_norm": 3.2414426803588867, "learning_rate": 9.595504461497441e-07, "logits/chosen": 0.6792712807655334, "logits/rejected": 0.5262346863746643, "logps/chosen": -517.4805297851562, "logps/rejected": -470.47845458984375, "loss": 0.4617, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5418896675109863, "rewards/margins": 0.7864224910736084, "rewards/rejected": -1.3283122777938843, "step": 1396 }, { "epoch": 1.7412935323383083, "grad_norm": 2.9658279418945312, "learning_rate": 9.242458032904311e-07, "logits/chosen": 0.32243314385414124, "logits/rejected": 0.08803755044937134, "logps/chosen": -545.0567626953125, "logps/rejected": -452.5140686035156, "loss": 0.4791, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7034752368927002, "rewards/margins": 0.48704254627227783, "rewards/rejected": -1.1905179023742676, "step": 1400 }, { "epoch": 1.7412935323383083, "eval_logits/chosen": 0.1930580586194992, "eval_logits/rejected": 0.05467061698436737, "eval_logps/chosen": -459.8746643066406, "eval_logps/rejected": -423.8709716796875, "eval_loss": 0.6216272115707397, "eval_rewards/accuracies": 0.6388888955116272, "eval_rewards/chosen": -0.7577680945396423, "eval_rewards/margins": 0.4491753578186035, "eval_rewards/rejected": -1.2069435119628906, "eval_runtime": 150.434, "eval_samples_per_second": 7.598, "eval_steps_per_second": 0.239, "step": 1400 }, { "epoch": 1.7462686567164178, "grad_norm": 3.2509801387786865, "learning_rate": 8.895713877965373e-07, "logits/chosen": 0.4595485031604767, "logits/rejected": 0.1782127022743225, "logps/chosen": -443.97564697265625, "logps/rejected": -366.19793701171875, "loss": 0.4664, "rewards/accuracies": 0.765625, "rewards/chosen": -0.40020307898521423, "rewards/margins": 0.7735263109207153, "rewards/rejected": -1.173729419708252, "step": 1404 }, { "epoch": 1.7512437810945274, "grad_norm": 3.9130449295043945, "learning_rate": 8.555296074861996e-07, "logits/chosen": 0.10901626199483871, "logits/rejected": -0.14974814653396606, "logps/chosen": -456.7862548828125, "logps/rejected": -415.9285888671875, "loss": 0.4287, "rewards/accuracies": 0.859375, "rewards/chosen": -0.35942861437797546, "rewards/margins": 1.0868324041366577, "rewards/rejected": -1.4462610483169556, "step": 1408 }, { "epoch": 1.756218905472637, "grad_norm": 4.010313510894775, "learning_rate": 8.22122826246875e-07, "logits/chosen": 0.6070827841758728, "logits/rejected": 0.44983193278312683, "logps/chosen": -474.1615295410156, "logps/rejected": -467.8525390625, "loss": 0.4663, "rewards/accuracies": 0.875, "rewards/chosen": -0.38996899127960205, "rewards/margins": 0.9174278974533081, "rewards/rejected": -1.3073970079421997, "step": 1412 }, { "epoch": 1.7611940298507462, "grad_norm": 3.6952216625213623, "learning_rate": 7.89353363871197e-07, "logits/chosen": 0.5432174205780029, "logits/rejected": 0.16360357403755188, "logps/chosen": -484.909423828125, "logps/rejected": -380.3939514160156, "loss": 0.5218, "rewards/accuracies": 0.75, "rewards/chosen": -0.5896638631820679, "rewards/margins": 0.679851233959198, "rewards/rejected": -1.2695151567459106, "step": 1416 }, { "epoch": 1.7661691542288556, "grad_norm": 3.4239590167999268, "learning_rate": 7.572234958958846e-07, "logits/chosen": 0.5283284187316895, "logits/rejected": 0.45787736773490906, "logps/chosen": -476.11383056640625, "logps/rejected": -496.3194580078125, "loss": 0.4459, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6096381545066833, "rewards/margins": 0.8067396283149719, "rewards/rejected": -1.4163777828216553, "step": 1420 }, { "epoch": 1.771144278606965, "grad_norm": 3.3646957874298096, "learning_rate": 7.2573545344373e-07, "logits/chosen": 0.32012930512428284, "logits/rejected": 0.18036966025829315, "logps/chosen": -473.15997314453125, "logps/rejected": -461.828369140625, "loss": 0.4338, "rewards/accuracies": 0.828125, "rewards/chosen": -0.468191534280777, "rewards/margins": 0.9105731248855591, "rewards/rejected": -1.3787648677825928, "step": 1424 }, { "epoch": 1.7761194029850746, "grad_norm": 3.111020088195801, "learning_rate": 6.948914230686688e-07, "logits/chosen": 0.072984479367733, "logits/rejected": -0.05354681983590126, "logps/chosen": -499.5512390136719, "logps/rejected": -478.2958679199219, "loss": 0.43, "rewards/accuracies": 0.875, "rewards/chosen": -0.33797788619995117, "rewards/margins": 1.0079048871994019, "rewards/rejected": -1.345882773399353, "step": 1428 }, { "epoch": 1.7810945273631842, "grad_norm": 3.493523597717285, "learning_rate": 6.646935466039373e-07, "logits/chosen": 0.1697852909564972, "logits/rejected": 0.06225850433111191, "logps/chosen": -436.7352600097656, "logps/rejected": -451.919677734375, "loss": 0.4171, "rewards/accuracies": 0.890625, "rewards/chosen": -0.30590546131134033, "rewards/margins": 1.0445671081542969, "rewards/rejected": -1.3504725694656372, "step": 1432 }, { "epoch": 1.7860696517412935, "grad_norm": 3.5960781574249268, "learning_rate": 6.351439210133492e-07, "logits/chosen": 0.16463078558444977, "logits/rejected": -0.0510396808385849, "logps/chosen": -489.61944580078125, "logps/rejected": -507.6096496582031, "loss": 0.4364, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5339682698249817, "rewards/margins": 0.8638878464698792, "rewards/rejected": -1.3978562355041504, "step": 1436 }, { "epoch": 1.7910447761194028, "grad_norm": 2.9790070056915283, "learning_rate": 6.062445982456777e-07, "logits/chosen": 0.34399691224098206, "logits/rejected": 0.11060275137424469, "logps/chosen": -484.4259338378906, "logps/rejected": -441.9401550292969, "loss": 0.4195, "rewards/accuracies": 0.875, "rewards/chosen": -0.41451165080070496, "rewards/margins": 0.9917902946472168, "rewards/rejected": -1.4063019752502441, "step": 1440 }, { "epoch": 1.7960199004975124, "grad_norm": 3.2462868690490723, "learning_rate": 5.77997585092166e-07, "logits/chosen": 0.24684180319309235, "logits/rejected": 0.09757015109062195, "logps/chosen": -501.04132080078125, "logps/rejected": -473.86407470703125, "loss": 0.4557, "rewards/accuracies": 0.75, "rewards/chosen": -0.5149147510528564, "rewards/margins": 0.7671060562133789, "rewards/rejected": -1.2820206880569458, "step": 1444 }, { "epoch": 1.800995024875622, "grad_norm": 4.118817329406738, "learning_rate": 5.504048430471753e-07, "logits/chosen": 0.05877215415239334, "logits/rejected": 0.0584217831492424, "logps/chosen": -416.53216552734375, "logps/rejected": -467.2144775390625, "loss": 0.439, "rewards/accuracies": 0.84375, "rewards/chosen": -0.606140673160553, "rewards/margins": 0.9090366363525391, "rewards/rejected": -1.5151773691177368, "step": 1448 }, { "epoch": 1.8034825870646767, "eval_logits/chosen": 0.20400173962116241, "eval_logits/rejected": 0.06605671346187592, "eval_logps/chosen": -459.7663879394531, "eval_logps/rejected": -423.7731018066406, "eval_loss": 0.6204391121864319, "eval_rewards/accuracies": 0.6493055820465088, "eval_rewards/chosen": -0.7469313144683838, "eval_rewards/margins": 0.45023012161254883, "eval_rewards/rejected": -1.1971614360809326, "eval_runtime": 150.447, "eval_samples_per_second": 7.597, "eval_steps_per_second": 0.239, "step": 1450 }, { "epoch": 1.8059701492537314, "grad_norm": 3.226379156112671, "learning_rate": 5.234682881719766e-07, "logits/chosen": 0.1486922800540924, "logits/rejected": 0.2820119261741638, "logps/chosen": -438.1523132324219, "logps/rejected": -485.3592529296875, "loss": 0.458, "rewards/accuracies": 0.796875, "rewards/chosen": -0.4801374673843384, "rewards/margins": 0.7025588750839233, "rewards/rejected": -1.1826963424682617, "step": 1452 }, { "epoch": 1.8109452736318408, "grad_norm": 4.159534454345703, "learning_rate": 4.971897909616985e-07, "logits/chosen": 0.42471548914909363, "logits/rejected": 0.16627195477485657, "logps/chosen": -582.96630859375, "logps/rejected": -513.5985107421875, "loss": 0.4856, "rewards/accuracies": 0.828125, "rewards/chosen": -0.4713844358921051, "rewards/margins": 0.9388619661331177, "rewards/rejected": -1.4102462530136108, "step": 1456 }, { "epoch": 1.81592039800995, "grad_norm": 3.511375665664673, "learning_rate": 4.715711762154362e-07, "logits/chosen": 0.11827405542135239, "logits/rejected": 0.02628401480615139, "logps/chosen": -472.3614501953125, "logps/rejected": -421.99163818359375, "loss": 0.438, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5915001034736633, "rewards/margins": 0.8902807235717773, "rewards/rejected": -1.481780767440796, "step": 1460 }, { "epoch": 1.8208955223880596, "grad_norm": 3.314199686050415, "learning_rate": 4.4661422290954495e-07, "logits/chosen": 0.1869126260280609, "logits/rejected": 0.040468111634254456, "logps/chosen": -461.4591064453125, "logps/rejected": -420.57196044921875, "loss": 0.4234, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6002779006958008, "rewards/margins": 0.8377397060394287, "rewards/rejected": -1.438017725944519, "step": 1464 }, { "epoch": 1.8258706467661692, "grad_norm": 3.301818609237671, "learning_rate": 4.2232066407409067e-07, "logits/chosen": 0.21843373775482178, "logits/rejected": 0.028878776356577873, "logps/chosen": -522.05029296875, "logps/rejected": -468.34906005859375, "loss": 0.4338, "rewards/accuracies": 0.828125, "rewards/chosen": -0.30435776710510254, "rewards/margins": 0.8217609524726868, "rewards/rejected": -1.126118779182434, "step": 1468 }, { "epoch": 1.8308457711442787, "grad_norm": 3.323982000350952, "learning_rate": 3.986921866725202e-07, "logits/chosen": 0.37691932916641235, "logits/rejected": 0.059887684881687164, "logps/chosen": -472.496337890625, "logps/rejected": -386.56976318359375, "loss": 0.4778, "rewards/accuracies": 0.875, "rewards/chosen": -0.31983205676078796, "rewards/margins": 1.0095765590667725, "rewards/rejected": -1.3294085264205933, "step": 1472 }, { "epoch": 1.835820895522388, "grad_norm": 3.2554380893707275, "learning_rate": 3.7573043148451673e-07, "logits/chosen": 0.21050050854682922, "logits/rejected": 0.1986107975244522, "logps/chosen": -456.1379699707031, "logps/rejected": -496.9686584472656, "loss": 0.4472, "rewards/accuracies": 0.828125, "rewards/chosen": -0.4782811999320984, "rewards/margins": 0.9004664421081543, "rewards/rejected": -1.3787477016448975, "step": 1476 }, { "epoch": 1.8407960199004973, "grad_norm": 3.1780014038085938, "learning_rate": 3.5343699299205003e-07, "logits/chosen": 0.4725267291069031, "logits/rejected": 0.20679879188537598, "logps/chosen": -454.1123352050781, "logps/rejected": -402.17718505859375, "loss": 0.4866, "rewards/accuracies": 0.75, "rewards/chosen": -0.6523603200912476, "rewards/margins": 0.8148846626281738, "rewards/rejected": -1.467245101928711, "step": 1480 }, { "epoch": 1.845771144278607, "grad_norm": 3.745753765106201, "learning_rate": 3.3181341926867283e-07, "logits/chosen": 0.48626863956451416, "logits/rejected": 0.5913352966308594, "logps/chosen": -453.683349609375, "logps/rejected": -447.6007385253906, "loss": 0.4875, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3742063045501709, "rewards/margins": 0.7547400593757629, "rewards/rejected": -1.128946304321289, "step": 1484 }, { "epoch": 1.8507462686567164, "grad_norm": 3.743507146835327, "learning_rate": 3.1086121187200667e-07, "logits/chosen": 0.11791680753231049, "logits/rejected": 0.10027449578046799, "logps/chosen": -442.95611572265625, "logps/rejected": -485.8495788574219, "loss": 0.4396, "rewards/accuracies": 0.875, "rewards/chosen": -0.3810359537601471, "rewards/margins": 0.9448322653770447, "rewards/rejected": -1.3258682489395142, "step": 1488 }, { "epoch": 1.855721393034826, "grad_norm": 3.817042350769043, "learning_rate": 2.905818257394799e-07, "logits/chosen": 0.15434856712818146, "logits/rejected": 0.07496091723442078, "logps/chosen": -409.6748046875, "logps/rejected": -421.5605773925781, "loss": 0.4731, "rewards/accuracies": 0.890625, "rewards/chosen": -0.3658009171485901, "rewards/margins": 0.9456602334976196, "rewards/rejected": -1.3114612102508545, "step": 1492 }, { "epoch": 1.8606965174129353, "grad_norm": 3.0990395545959473, "learning_rate": 2.7097666908729283e-07, "logits/chosen": 0.46521398425102234, "logits/rejected": 0.3093582093715668, "logps/chosen": -508.86285400390625, "logps/rejected": -487.97943115234375, "loss": 0.4883, "rewards/accuracies": 0.71875, "rewards/chosen": -0.479577898979187, "rewards/margins": 0.6940962672233582, "rewards/rejected": -1.17367422580719, "step": 1496 }, { "epoch": 1.8656716417910446, "grad_norm": 3.0450475215911865, "learning_rate": 2.520471033126326e-07, "logits/chosen": 0.261200487613678, "logits/rejected": 0.14318135380744934, "logps/chosen": -501.09991455078125, "logps/rejected": -465.6934509277344, "loss": 0.4419, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5811474323272705, "rewards/margins": 0.7728549838066101, "rewards/rejected": -1.3540023565292358, "step": 1500 }, { "epoch": 1.8656716417910446, "eval_logits/chosen": 0.1892445683479309, "eval_logits/rejected": 0.05091705545783043, "eval_logps/chosen": -458.9963073730469, "eval_logps/rejected": -422.9080810546875, "eval_loss": 0.6194455623626709, "eval_rewards/accuracies": 0.6458333134651184, "eval_rewards/chosen": -0.6699296832084656, "eval_rewards/margins": 0.4407287836074829, "eval_rewards/rejected": -1.1106584072113037, "eval_runtime": 150.1801, "eval_samples_per_second": 7.611, "eval_steps_per_second": 0.24, "step": 1500 }, { "epoch": 1.8706467661691542, "grad_norm": 3.545064687728882, "learning_rate": 2.3379444289913344e-07, "logits/chosen": 0.47270292043685913, "logits/rejected": 0.26645568013191223, "logps/chosen": -409.9461975097656, "logps/rejected": -400.95550537109375, "loss": 0.438, "rewards/accuracies": 0.796875, "rewards/chosen": -0.3254129886627197, "rewards/margins": 0.9240537881851196, "rewards/rejected": -1.2494667768478394, "step": 1504 }, { "epoch": 1.8756218905472637, "grad_norm": 3.722698926925659, "learning_rate": 2.1621995532559947e-07, "logits/chosen": 0.2734871506690979, "logits/rejected": 0.05334743112325668, "logps/chosen": -501.898193359375, "logps/rejected": -427.0452880859375, "loss": 0.4706, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5731074213981628, "rewards/margins": 0.7717230916023254, "rewards/rejected": -1.3448305130004883, "step": 1508 }, { "epoch": 1.8805970149253732, "grad_norm": 3.6065824031829834, "learning_rate": 1.9932486097799408e-07, "logits/chosen": 0.33175939321517944, "logits/rejected": 0.2642689049243927, "logps/chosen": -384.1080627441406, "logps/rejected": -377.58319091796875, "loss": 0.4599, "rewards/accuracies": 0.828125, "rewards/chosen": -0.4119167923927307, "rewards/margins": 0.9615055322647095, "rewards/rejected": -1.373422384262085, "step": 1512 }, { "epoch": 1.8855721393034826, "grad_norm": 2.6977298259735107, "learning_rate": 1.8311033306468552e-07, "logits/chosen": 0.4200694262981415, "logits/rejected": 0.023060984909534454, "logps/chosen": -495.07916259765625, "logps/rejected": -384.0296630859375, "loss": 0.4238, "rewards/accuracies": 0.859375, "rewards/chosen": -0.41816192865371704, "rewards/margins": 0.9803006649017334, "rewards/rejected": -1.3984625339508057, "step": 1516 }, { "epoch": 1.890547263681592, "grad_norm": 3.5309643745422363, "learning_rate": 1.6757749753498865e-07, "logits/chosen": 0.17390736937522888, "logits/rejected": 0.16996516287326813, "logps/chosen": -438.53564453125, "logps/rejected": -477.38470458984375, "loss": 0.4468, "rewards/accuracies": 0.78125, "rewards/chosen": -0.47648751735687256, "rewards/margins": 0.8215656876564026, "rewards/rejected": -1.2980531454086304, "step": 1520 }, { "epoch": 1.8955223880597014, "grad_norm": 3.5672707557678223, "learning_rate": 1.5272743300097316e-07, "logits/chosen": 0.35543692111968994, "logits/rejected": 0.38684284687042236, "logps/chosen": -425.6787414550781, "logps/rejected": -453.45623779296875, "loss": 0.4827, "rewards/accuracies": 0.734375, "rewards/chosen": -0.47601401805877686, "rewards/margins": 0.7526903748512268, "rewards/rejected": -1.2287043333053589, "step": 1524 }, { "epoch": 1.900497512437811, "grad_norm": 3.358347177505493, "learning_rate": 1.3856117066256225e-07, "logits/chosen": 0.2838931679725647, "logits/rejected": 0.07619883120059967, "logps/chosen": -547.037353515625, "logps/rejected": -493.8963623046875, "loss": 0.4515, "rewards/accuracies": 0.8125, "rewards/chosen": -0.11599953472614288, "rewards/margins": 0.964540958404541, "rewards/rejected": -1.080540657043457, "step": 1528 }, { "epoch": 1.9054726368159205, "grad_norm": 3.064255714416504, "learning_rate": 1.2507969423593225e-07, "logits/chosen": 0.29778000712394714, "logits/rejected": 0.24798990786075592, "logps/chosen": -478.83734130859375, "logps/rejected": -486.351806640625, "loss": 0.411, "rewards/accuracies": 0.875, "rewards/chosen": -0.28864914178848267, "rewards/margins": 1.0248101949691772, "rewards/rejected": -1.3134592771530151, "step": 1532 }, { "epoch": 1.9104477611940298, "grad_norm": 3.4975333213806152, "learning_rate": 1.1228393988519381e-07, "logits/chosen": -0.012770354747772217, "logits/rejected": 0.11666233092546463, "logps/chosen": -442.9209899902344, "logps/rejected": -564.3067626953125, "loss": 0.4512, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6616036891937256, "rewards/margins": 0.6503514647483826, "rewards/rejected": -1.311955213546753, "step": 1536 }, { "epoch": 1.9154228855721394, "grad_norm": 3.4289166927337646, "learning_rate": 1.0017479615738957e-07, "logits/chosen": 0.4374559223651886, "logits/rejected": 0.37482768297195435, "logps/chosen": -523.0839233398438, "logps/rejected": -634.0493774414062, "loss": 0.462, "rewards/accuracies": 0.703125, "rewards/chosen": -0.4454476237297058, "rewards/margins": 0.5911861658096313, "rewards/rejected": -1.036633849143982, "step": 1540 }, { "epoch": 1.9203980099502487, "grad_norm": 3.3477044105529785, "learning_rate": 8.875310392079118e-08, "logits/chosen": 0.22588732838630676, "logits/rejected": -0.04192977398633957, "logps/chosen": -507.6371765136719, "logps/rejected": -440.7511901855469, "loss": 0.4336, "rewards/accuracies": 0.796875, "rewards/chosen": -0.3868103325366974, "rewards/margins": 1.0209940671920776, "rewards/rejected": -1.4078043699264526, "step": 1544 }, { "epoch": 1.9253731343283582, "grad_norm": 3.4166507720947266, "learning_rate": 7.801965630651165e-08, "logits/chosen": 0.0826062485575676, "logits/rejected": 0.03505164384841919, "logps/chosen": -470.573974609375, "logps/rejected": -484.4287109375, "loss": 0.4593, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5422804951667786, "rewards/margins": 0.6872562170028687, "rewards/rejected": -1.2295366525650024, "step": 1548 }, { "epoch": 1.927860696517413, "eval_logits/chosen": 0.2005145251750946, "eval_logits/rejected": 0.06279084831476212, "eval_logps/chosen": -459.1916809082031, "eval_logps/rejected": -423.0291442871094, "eval_loss": 0.6213585138320923, "eval_rewards/accuracies": 0.6527777910232544, "eval_rewards/chosen": -0.6894701719284058, "eval_rewards/margins": 0.433290034532547, "eval_rewards/rejected": -1.1227601766586304, "eval_runtime": 150.4688, "eval_samples_per_second": 7.596, "eval_steps_per_second": 0.239, "step": 1550 }, { "epoch": 1.9303482587064678, "grad_norm": 2.936657190322876, "learning_rate": 6.797519865342161e-08, "logits/chosen": 0.6600261926651001, "logits/rejected": 0.5058936476707458, "logps/chosen": -422.639404296875, "logps/rejected": -443.1512145996094, "loss": 0.4434, "rewards/accuracies": 0.859375, "rewards/chosen": -0.5597524642944336, "rewards/margins": 0.8031996488571167, "rewards/rejected": -1.3629521131515503, "step": 1552 }, { "epoch": 1.935323383084577, "grad_norm": 2.7770910263061523, "learning_rate": 5.862042845640403e-08, "logits/chosen": 0.510213315486908, "logits/rejected": 0.3406936824321747, "logps/chosen": -503.9508361816406, "logps/rejected": -470.0325622558594, "loss": 0.4498, "rewards/accuracies": 0.875, "rewards/chosen": -0.21441341936588287, "rewards/margins": 1.0556612014770508, "rewards/rejected": -1.270074486732483, "step": 1556 }, { "epoch": 1.9402985074626866, "grad_norm": 3.034893751144409, "learning_rate": 4.9955995317908514e-08, "logits/chosen": 0.41465142369270325, "logits/rejected": 0.19684141874313354, "logps/chosen": -474.51544189453125, "logps/rejected": -428.1244201660156, "loss": 0.4425, "rewards/accuracies": 0.84375, "rewards/chosen": -0.16932249069213867, "rewards/margins": 1.1101325750350952, "rewards/rejected": -1.2794551849365234, "step": 1560 }, { "epoch": 1.945273631840796, "grad_norm": 2.902290105819702, "learning_rate": 4.198250090284961e-08, "logits/chosen": 0.22269777953624725, "logits/rejected": 0.017038095742464066, "logps/chosen": -470.39324951171875, "logps/rejected": -418.3542785644531, "loss": 0.4143, "rewards/accuracies": 0.859375, "rewards/chosen": -0.31770819425582886, "rewards/margins": 0.9097498655319214, "rewards/rejected": -1.227458119392395, "step": 1564 }, { "epoch": 1.9502487562189055, "grad_norm": 3.539262533187866, "learning_rate": 3.47004988968247e-08, "logits/chosen": 0.5226894021034241, "logits/rejected": 0.30332833528518677, "logps/chosen": -520.8106689453125, "logps/rejected": -476.7620544433594, "loss": 0.4462, "rewards/accuracies": 0.84375, "rewards/chosen": -0.49939191341400146, "rewards/margins": 0.961308479309082, "rewards/rejected": -1.460700273513794, "step": 1568 }, { "epoch": 1.955223880597015, "grad_norm": 3.8761661052703857, "learning_rate": 2.8110494967664713e-08, "logits/chosen": 0.30280035734176636, "logits/rejected": 0.13733740150928497, "logps/chosen": -457.7912292480469, "logps/rejected": -432.34967041015625, "loss": 0.437, "rewards/accuracies": 0.84375, "rewards/chosen": -0.2639601230621338, "rewards/margins": 0.9372557401657104, "rewards/rejected": -1.2012157440185547, "step": 1572 }, { "epoch": 1.9601990049751243, "grad_norm": 3.9091761112213135, "learning_rate": 2.221294673032004e-08, "logits/chosen": -0.02172435261309147, "logits/rejected": -0.22200414538383484, "logps/chosen": -475.1850891113281, "logps/rejected": -424.7291259765625, "loss": 0.4323, "rewards/accuracies": 0.765625, "rewards/chosen": -0.49128663539886475, "rewards/margins": 0.7886272668838501, "rewards/rejected": -1.2799140214920044, "step": 1576 }, { "epoch": 1.9651741293532339, "grad_norm": 3.5163590908050537, "learning_rate": 1.7008263715085904e-08, "logits/chosen": 0.2808230519294739, "logits/rejected": 0.15157818794250488, "logps/chosen": -547.7991943359375, "logps/rejected": -508.9268798828125, "loss": 0.4872, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3645941913127899, "rewards/margins": 1.0243759155273438, "rewards/rejected": -1.388970136642456, "step": 1580 }, { "epoch": 1.9701492537313432, "grad_norm": 3.4658432006835938, "learning_rate": 1.24968073391607e-08, "logits/chosen": 0.16077536344528198, "logits/rejected": 0.02570854127407074, "logps/chosen": -450.45086669921875, "logps/rejected": -430.0142517089844, "loss": 0.4754, "rewards/accuracies": 0.796875, "rewards/chosen": -0.4632072448730469, "rewards/margins": 0.7755333185195923, "rewards/rejected": -1.2387404441833496, "step": 1584 }, { "epoch": 1.9751243781094527, "grad_norm": 3.7998785972595215, "learning_rate": 8.678890881552715e-09, "logits/chosen": 0.16036288440227509, "logits/rejected": 0.17200781404972076, "logps/chosen": -450.34710693359375, "logps/rejected": -467.6290283203125, "loss": 0.4537, "rewards/accuracies": 0.828125, "rewards/chosen": -0.44261491298675537, "rewards/margins": 0.9358773231506348, "rewards/rejected": -1.3784922361373901, "step": 1588 }, { "epoch": 1.9800995024875623, "grad_norm": 3.656935214996338, "learning_rate": 5.554779461323101e-09, "logits/chosen": 0.13178521394729614, "logits/rejected": -0.02235669642686844, "logps/chosen": -462.8760070800781, "logps/rejected": -402.28424072265625, "loss": 0.4578, "rewards/accuracies": 0.796875, "rewards/chosen": -0.23067457973957062, "rewards/margins": 0.9117249846458435, "rewards/rejected": -1.142399549484253, "step": 1592 }, { "epoch": 1.9850746268656716, "grad_norm": 3.9424142837524414, "learning_rate": 3.1246900191761463e-09, "logits/chosen": 0.27911561727523804, "logits/rejected": 0.14084932208061218, "logps/chosen": -568.9130249023438, "logps/rejected": -536.7129516601562, "loss": 0.4657, "rewards/accuracies": 0.796875, "rewards/chosen": -0.47243887186050415, "rewards/margins": 0.8830912113189697, "rewards/rejected": -1.355530023574829, "step": 1596 }, { "epoch": 1.9900497512437811, "grad_norm": 3.261087656021118, "learning_rate": 1.3887913023946652e-09, "logits/chosen": 0.40209636092185974, "logits/rejected": 0.20681683719158173, "logps/chosen": -534.0907592773438, "logps/rejected": -446.65838623046875, "loss": 0.4444, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3766060769557953, "rewards/margins": 0.7923564910888672, "rewards/rejected": -1.1689625978469849, "step": 1600 }, { "epoch": 1.9900497512437811, "eval_logits/chosen": 0.22255222499370575, "eval_logits/rejected": 0.08632177859544754, "eval_logps/chosen": -459.1236572265625, "eval_logps/rejected": -423.0472106933594, "eval_loss": 0.6228893399238586, "eval_rewards/accuracies": 0.6666666865348816, "eval_rewards/chosen": -0.6826636791229248, "eval_rewards/margins": 0.441908061504364, "eval_rewards/rejected": -1.1245719194412231, "eval_runtime": 150.7224, "eval_samples_per_second": 7.583, "eval_steps_per_second": 0.239, "step": 1600 }, { "epoch": 1.9950248756218905, "grad_norm": 3.3245160579681396, "learning_rate": 3.4720385312492223e-10, "logits/chosen": 0.24731820821762085, "logits/rejected": 0.46343696117401123, "logps/chosen": -394.77288818359375, "logps/rejected": -496.1209716796875, "loss": 0.4491, "rewards/accuracies": 0.796875, "rewards/chosen": -0.31272804737091064, "rewards/margins": 0.8752219080924988, "rewards/rejected": -1.1879500150680542, "step": 1604 }, { "epoch": 2.0, "grad_norm": 3.5245065689086914, "learning_rate": 0.0, "logits/chosen": 0.11977434158325195, "logits/rejected": 0.1967284381389618, "logps/chosen": -452.91552734375, "logps/rejected": -495.7862243652344, "loss": 0.4799, "rewards/accuracies": 0.75, "rewards/chosen": -0.47581416368484497, "rewards/margins": 0.5203736424446106, "rewards/rejected": -0.9961878657341003, "step": 1608 }, { "epoch": 2.0, "step": 1608, "total_flos": 0.0, "train_loss": 0.5581450056080794, "train_runtime": 39294.0243, "train_samples_per_second": 2.619, "train_steps_per_second": 0.041 } ], "logging_steps": 4, "max_steps": 1608, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }