diff --git "a/checkpoint-1166/trainer_state.json" "b/checkpoint-1166/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1166/trainer_state.json" @@ -0,0 +1,17527 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1166, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 103.41960351338825, + "learning_rate": 1.7094017094017096e-08, + "logits/chosen": -0.7548736333847046, + "logits/rejected": -0.7155890464782715, + "logps/chosen": -586.5838623046875, + "logps/rejected": -658.200927734375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 94.92896844649115, + "learning_rate": 3.418803418803419e-08, + "logits/chosen": -0.7686550617218018, + "logits/rejected": -0.712468147277832, + "logps/chosen": -457.4859619140625, + "logps/rejected": -713.6287841796875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 90.79844942313059, + "learning_rate": 5.128205128205128e-08, + "logits/chosen": -0.9042060375213623, + "logits/rejected": -0.8283087611198425, + "logps/chosen": -373.404296875, + "logps/rejected": -573.5121459960938, + "loss": 0.7268, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.06697890162467957, + "rewards/margins": -0.059918761253356934, + "rewards/rejected": -0.007060145493596792, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 113.84425518159345, + "learning_rate": 6.837606837606839e-08, + "logits/chosen": -0.7961872816085815, + "logits/rejected": -0.7325714826583862, + "logps/chosen": -538.67919921875, + "logps/rejected": -809.2041015625, + "loss": 0.7242, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.049712374806404114, + "rewards/margins": -0.05105018615722656, + "rewards/rejected": 0.0013378113508224487, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 97.52089397123956, + "learning_rate": 8.547008547008547e-08, + "logits/chosen": -0.7953340411186218, + "logits/rejected": -0.7663401961326599, + "logps/chosen": -565.4126586914062, + "logps/rejected": -706.5430297851562, + "loss": 0.6737, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.00451049767434597, + "rewards/margins": 0.04731311649084091, + "rewards/rejected": -0.04280261695384979, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 104.40420699285828, + "learning_rate": 1.0256410256410256e-07, + "logits/chosen": -0.7810869216918945, + "logits/rejected": -0.7261877059936523, + "logps/chosen": -576.093017578125, + "logps/rejected": -735.2755737304688, + "loss": 0.724, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.016032269224524498, + "rewards/margins": -0.04482083395123482, + "rewards/rejected": 0.028788568452000618, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 98.11913062972486, + "learning_rate": 1.1965811965811965e-07, + "logits/chosen": -0.7417203783988953, + "logits/rejected": -0.7288725972175598, + "logps/chosen": -536.7333984375, + "logps/rejected": -579.6881103515625, + "loss": 0.7402, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0824371799826622, + "rewards/margins": -0.07586034387350082, + "rewards/rejected": -0.00657684775069356, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 98.41781433766984, + "learning_rate": 1.3675213675213677e-07, + "logits/chosen": -1.0201611518859863, + "logits/rejected": -1.0109407901763916, + "logps/chosen": -481.22076416015625, + "logps/rejected": -501.9241943359375, + "loss": 0.7265, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.009386436082422733, + "rewards/margins": -0.053712133318185806, + "rewards/rejected": 0.04432569816708565, + "step": 8 + }, + { + "epoch": 0.01, + "grad_norm": 85.90502357414873, + "learning_rate": 1.5384615384615385e-07, + "logits/chosen": -0.7735470533370972, + "logits/rejected": -0.7421846389770508, + "logps/chosen": -355.7308349609375, + "logps/rejected": -520.117431640625, + "loss": 0.6849, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.033313192427158356, + "rewards/margins": 0.026676924899220467, + "rewards/rejected": -0.05999011546373367, + "step": 9 + }, + { + "epoch": 0.01, + "grad_norm": 95.53596114513927, + "learning_rate": 1.7094017094017095e-07, + "logits/chosen": -0.7909107208251953, + "logits/rejected": -0.729066789150238, + "logps/chosen": -462.21490478515625, + "logps/rejected": -622.1067504882812, + "loss": 0.7079, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.008044671267271042, + "rewards/margins": -0.017987489700317383, + "rewards/rejected": 0.009942816570401192, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 94.62643944442853, + "learning_rate": 1.8803418803418802e-07, + "logits/chosen": -0.9328438639640808, + "logits/rejected": -0.9020552039146423, + "logps/chosen": -372.81146240234375, + "logps/rejected": -567.58056640625, + "loss": 0.7157, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.03133794292807579, + "rewards/margins": -0.03527111932635307, + "rewards/rejected": 0.06660906225442886, + "step": 11 + }, + { + "epoch": 0.01, + "grad_norm": 93.39398928307216, + "learning_rate": 2.0512820512820512e-07, + "logits/chosen": -0.9031232595443726, + "logits/rejected": -0.8751566410064697, + "logps/chosen": -436.08343505859375, + "logps/rejected": -568.6261596679688, + "loss": 0.7387, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.04213881492614746, + "rewards/margins": -0.07084135711193085, + "rewards/rejected": 0.028702545911073685, + "step": 12 + }, + { + "epoch": 0.01, + "grad_norm": 101.92780870439769, + "learning_rate": 2.222222222222222e-07, + "logits/chosen": -0.7718335390090942, + "logits/rejected": -0.7280720472335815, + "logps/chosen": -475.76568603515625, + "logps/rejected": -623.90283203125, + "loss": 0.6899, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.028224896639585495, + "rewards/margins": 0.012632707133889198, + "rewards/rejected": 0.015592193230986595, + "step": 13 + }, + { + "epoch": 0.01, + "grad_norm": 100.09509124265254, + "learning_rate": 2.393162393162393e-07, + "logits/chosen": -0.7317771911621094, + "logits/rejected": -0.6727234125137329, + "logps/chosen": -573.5426025390625, + "logps/rejected": -711.6471557617188, + "loss": 0.6586, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.04689348116517067, + "rewards/margins": 0.08301004022359848, + "rewards/rejected": -0.03611656650900841, + "step": 14 + }, + { + "epoch": 0.01, + "grad_norm": 99.7593876423607, + "learning_rate": 2.5641025641025636e-07, + "logits/chosen": -0.7521563172340393, + "logits/rejected": -0.6863745450973511, + "logps/chosen": -498.97991943359375, + "logps/rejected": -682.7203979492188, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.028330065310001373, + "rewards/margins": 0.019269824028015137, + "rewards/rejected": -0.04759988933801651, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 88.29988329350945, + "learning_rate": 2.7350427350427354e-07, + "logits/chosen": -0.8903733491897583, + "logits/rejected": -0.8610409498214722, + "logps/chosen": -441.4660949707031, + "logps/rejected": -485.55010986328125, + "loss": 0.7074, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.010682763531804085, + "rewards/margins": -0.018920645117759705, + "rewards/rejected": 0.008237885311245918, + "step": 16 + }, + { + "epoch": 0.01, + "grad_norm": 104.0592089591977, + "learning_rate": 2.905982905982906e-07, + "logits/chosen": -1.0094716548919678, + "logits/rejected": -0.963087797164917, + "logps/chosen": -477.1594543457031, + "logps/rejected": -731.2901611328125, + "loss": 0.7002, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.02713947370648384, + "rewards/margins": -0.008646871894598007, + "rewards/rejected": -0.018492601811885834, + "step": 17 + }, + { + "epoch": 0.02, + "grad_norm": 105.17653654674692, + "learning_rate": 3.076923076923077e-07, + "logits/chosen": -1.0000486373901367, + "logits/rejected": -0.9455796480178833, + "logps/chosen": -692.1254272460938, + "logps/rejected": -884.697021484375, + "loss": 0.6617, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.003712274134159088, + "rewards/margins": 0.0707027018070221, + "rewards/rejected": -0.07441498339176178, + "step": 18 + }, + { + "epoch": 0.02, + "grad_norm": 91.15778116804893, + "learning_rate": 3.2478632478632476e-07, + "logits/chosen": -0.9160683155059814, + "logits/rejected": -0.8868091106414795, + "logps/chosen": -607.456787109375, + "logps/rejected": -590.1045532226562, + "loss": 0.6537, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.017058946192264557, + "rewards/margins": 0.08485370874404907, + "rewards/rejected": -0.10191265493631363, + "step": 19 + }, + { + "epoch": 0.02, + "grad_norm": 88.18915926623042, + "learning_rate": 3.418803418803419e-07, + "logits/chosen": -0.9559439420700073, + "logits/rejected": -0.9025272130966187, + "logps/chosen": -379.51666259765625, + "logps/rejected": -568.963134765625, + "loss": 0.7113, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.08084973692893982, + "rewards/margins": -0.025880228728055954, + "rewards/rejected": -0.05496950075030327, + "step": 20 + }, + { + "epoch": 0.02, + "grad_norm": 105.50380476742227, + "learning_rate": 3.5897435897435896e-07, + "logits/chosen": -0.8581007719039917, + "logits/rejected": -0.8482990264892578, + "logps/chosen": -721.2944946289062, + "logps/rejected": -692.0745239257812, + "loss": 0.6512, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0144729632884264, + "rewards/margins": 0.09765224158763885, + "rewards/rejected": -0.0831792801618576, + "step": 21 + }, + { + "epoch": 0.02, + "grad_norm": 100.17926934231421, + "learning_rate": 3.7606837606837604e-07, + "logits/chosen": -0.9451916813850403, + "logits/rejected": -0.8803855180740356, + "logps/chosen": -479.50274658203125, + "logps/rejected": -680.472412109375, + "loss": 0.6848, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02336878702044487, + "rewards/margins": 0.02742600627243519, + "rewards/rejected": -0.004057217389345169, + "step": 22 + }, + { + "epoch": 0.02, + "grad_norm": 99.39544083096526, + "learning_rate": 3.931623931623931e-07, + "logits/chosen": -0.7238095998764038, + "logits/rejected": -0.6513512134552002, + "logps/chosen": -518.0869140625, + "logps/rejected": -722.469482421875, + "loss": 0.6238, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.049712516367435455, + "rewards/margins": 0.1534470021724701, + "rewards/rejected": -0.10373449325561523, + "step": 23 + }, + { + "epoch": 0.02, + "grad_norm": 103.09338152254325, + "learning_rate": 4.1025641025641024e-07, + "logits/chosen": -0.9146740436553955, + "logits/rejected": -0.8484085202217102, + "logps/chosen": -481.82745361328125, + "logps/rejected": -674.730224609375, + "loss": 0.7114, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.010667498223483562, + "rewards/margins": -0.010577872395515442, + "rewards/rejected": -8.963048458099365e-05, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 96.04021273997232, + "learning_rate": 4.273504273504273e-07, + "logits/chosen": -0.9091683626174927, + "logits/rejected": -0.8563873767852783, + "logps/chosen": -492.4463806152344, + "logps/rejected": -638.01025390625, + "loss": 0.7015, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.018622679635882378, + "rewards/margins": -0.0039053894579410553, + "rewards/rejected": -0.01471729390323162, + "step": 25 + }, + { + "epoch": 0.02, + "grad_norm": 109.06282899339998, + "learning_rate": 4.444444444444444e-07, + "logits/chosen": -0.9659221768379211, + "logits/rejected": -0.9196881651878357, + "logps/chosen": -492.91900634765625, + "logps/rejected": -853.394287109375, + "loss": 0.6713, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.028880760073661804, + "rewards/margins": 0.059592749923467636, + "rewards/rejected": -0.08847351372241974, + "step": 26 + }, + { + "epoch": 0.02, + "grad_norm": 99.3240842921101, + "learning_rate": 4.6153846153846156e-07, + "logits/chosen": -0.9494356513023376, + "logits/rejected": -0.8917842507362366, + "logps/chosen": -528.2301025390625, + "logps/rejected": -776.6889038085938, + "loss": 0.6467, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.011080075986683369, + "rewards/margins": 0.11547484248876572, + "rewards/rejected": -0.12655490636825562, + "step": 27 + }, + { + "epoch": 0.02, + "grad_norm": 108.40026128693336, + "learning_rate": 4.786324786324786e-07, + "logits/chosen": -0.9912917613983154, + "logits/rejected": -0.9570505023002625, + "logps/chosen": -555.765869140625, + "logps/rejected": -669.6681518554688, + "loss": 0.7363, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08198236674070358, + "rewards/margins": -0.06766227632761002, + "rewards/rejected": -0.014320090413093567, + "step": 28 + }, + { + "epoch": 0.02, + "grad_norm": 99.90460421005808, + "learning_rate": 4.957264957264958e-07, + "logits/chosen": -0.95127934217453, + "logits/rejected": -0.9175419807434082, + "logps/chosen": -461.0755615234375, + "logps/rejected": -580.02734375, + "loss": 0.7048, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0809219628572464, + "rewards/margins": -0.01347380131483078, + "rewards/rejected": -0.06744816154241562, + "step": 29 + }, + { + "epoch": 0.03, + "grad_norm": 112.99941238862309, + "learning_rate": 5.128205128205127e-07, + "logits/chosen": -0.9818075895309448, + "logits/rejected": -0.9260789155960083, + "logps/chosen": -535.239013671875, + "logps/rejected": -700.30517578125, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09685669839382172, + "rewards/margins": 0.015113834291696548, + "rewards/rejected": -0.11197052150964737, + "step": 30 + }, + { + "epoch": 0.03, + "grad_norm": 83.0378730434607, + "learning_rate": 5.299145299145299e-07, + "logits/chosen": -0.6910840272903442, + "logits/rejected": -0.6364599466323853, + "logps/chosen": -465.90606689453125, + "logps/rejected": -567.572021484375, + "loss": 0.6282, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.016259711235761642, + "rewards/margins": 0.1518758237361908, + "rewards/rejected": -0.13561612367630005, + "step": 31 + }, + { + "epoch": 0.03, + "grad_norm": 98.11090569271404, + "learning_rate": 5.470085470085471e-07, + "logits/chosen": -0.7717163562774658, + "logits/rejected": -0.7245222330093384, + "logps/chosen": -592.1800537109375, + "logps/rejected": -645.472900390625, + "loss": 0.6386, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.04043560102581978, + "rewards/margins": 0.12422771751880646, + "rewards/rejected": -0.16466331481933594, + "step": 32 + }, + { + "epoch": 0.03, + "grad_norm": 100.66982974140217, + "learning_rate": 5.641025641025641e-07, + "logits/chosen": -0.7310692071914673, + "logits/rejected": -0.7151703834533691, + "logps/chosen": -553.5906982421875, + "logps/rejected": -547.3599853515625, + "loss": 0.705, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.04172036796808243, + "rewards/margins": -0.008993186056613922, + "rewards/rejected": -0.032727185636758804, + "step": 33 + }, + { + "epoch": 0.03, + "grad_norm": 94.70197301435658, + "learning_rate": 5.811965811965812e-07, + "logits/chosen": -0.6653925776481628, + "logits/rejected": -0.622881293296814, + "logps/chosen": -426.7507629394531, + "logps/rejected": -596.0507202148438, + "loss": 0.6539, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04939991235733032, + "rewards/margins": 0.08512675017118454, + "rewards/rejected": -0.13452666997909546, + "step": 34 + }, + { + "epoch": 0.03, + "grad_norm": 104.42226965215687, + "learning_rate": 5.982905982905982e-07, + "logits/chosen": -0.787976086139679, + "logits/rejected": -0.7573559284210205, + "logps/chosen": -565.1084594726562, + "logps/rejected": -869.0287475585938, + "loss": 0.6703, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07111568748950958, + "rewards/margins": 0.08143329620361328, + "rewards/rejected": -0.15254898369312286, + "step": 35 + }, + { + "epoch": 0.03, + "grad_norm": 109.94862708513548, + "learning_rate": 6.153846153846154e-07, + "logits/chosen": -0.7678736448287964, + "logits/rejected": -0.7088093757629395, + "logps/chosen": -588.3760986328125, + "logps/rejected": -797.10107421875, + "loss": 0.7167, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.13470612466335297, + "rewards/margins": -0.018723297864198685, + "rewards/rejected": -0.11598282307386398, + "step": 36 + }, + { + "epoch": 0.03, + "grad_norm": 104.69858779468399, + "learning_rate": 6.324786324786325e-07, + "logits/chosen": -0.7733074426651001, + "logits/rejected": -0.6972559094429016, + "logps/chosen": -525.0222778320312, + "logps/rejected": -643.43017578125, + "loss": 0.7148, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07904539257287979, + "rewards/margins": -0.021822579205036163, + "rewards/rejected": -0.057222817093133926, + "step": 37 + }, + { + "epoch": 0.03, + "grad_norm": 84.47695627881467, + "learning_rate": 6.495726495726495e-07, + "logits/chosen": -0.769538164138794, + "logits/rejected": -0.7374118566513062, + "logps/chosen": -469.9814758300781, + "logps/rejected": -566.3709716796875, + "loss": 0.6477, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04114339128136635, + "rewards/margins": 0.13897393643856049, + "rewards/rejected": -0.18011733889579773, + "step": 38 + }, + { + "epoch": 0.03, + "grad_norm": 102.47977429806984, + "learning_rate": 6.666666666666666e-07, + "logits/chosen": -0.7609751224517822, + "logits/rejected": -0.6807464361190796, + "logps/chosen": -604.7998046875, + "logps/rejected": -682.2073974609375, + "loss": 0.7093, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.07954426109790802, + "rewards/margins": 0.0017297714948654175, + "rewards/rejected": -0.08127403259277344, + "step": 39 + }, + { + "epoch": 0.03, + "grad_norm": 92.09223558927283, + "learning_rate": 6.837606837606838e-07, + "logits/chosen": -0.7742663621902466, + "logits/rejected": -0.7225289940834045, + "logps/chosen": -542.4127197265625, + "logps/rejected": -764.8325805664062, + "loss": 0.5848, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0005925875157117844, + "rewards/margins": 0.2573086619377136, + "rewards/rejected": -0.2567160725593567, + "step": 40 + }, + { + "epoch": 0.04, + "grad_norm": 91.93998818640803, + "learning_rate": 7.008547008547007e-07, + "logits/chosen": -0.9238195419311523, + "logits/rejected": -0.8682441711425781, + "logps/chosen": -558.4532470703125, + "logps/rejected": -765.9144287109375, + "loss": 0.5732, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03887009620666504, + "rewards/margins": 0.2795259356498718, + "rewards/rejected": -0.2406558394432068, + "step": 41 + }, + { + "epoch": 0.04, + "grad_norm": 95.61208794097061, + "learning_rate": 7.179487179487179e-07, + "logits/chosen": -0.7909547090530396, + "logits/rejected": -0.7456868886947632, + "logps/chosen": -476.070556640625, + "logps/rejected": -590.078369140625, + "loss": 0.6532, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.03314027935266495, + "rewards/margins": 0.09669484198093414, + "rewards/rejected": -0.1298351287841797, + "step": 42 + }, + { + "epoch": 0.04, + "grad_norm": 81.14395901291414, + "learning_rate": 7.350427350427351e-07, + "logits/chosen": -0.9072396755218506, + "logits/rejected": -0.8494898080825806, + "logps/chosen": -438.7601318359375, + "logps/rejected": -602.9498291015625, + "loss": 0.6088, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03542018309235573, + "rewards/margins": 0.19208072125911713, + "rewards/rejected": -0.22750090062618256, + "step": 43 + }, + { + "epoch": 0.04, + "grad_norm": 108.59101240066408, + "learning_rate": 7.521367521367521e-07, + "logits/chosen": -0.9132080078125, + "logits/rejected": -0.8617574572563171, + "logps/chosen": -545.069580078125, + "logps/rejected": -611.75830078125, + "loss": 0.7416, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.09693308174610138, + "rewards/margins": -0.07984675467014313, + "rewards/rejected": -0.017086319625377655, + "step": 44 + }, + { + "epoch": 0.04, + "grad_norm": 96.66496044258588, + "learning_rate": 7.692307692307693e-07, + "logits/chosen": -0.9789643883705139, + "logits/rejected": -0.8941919803619385, + "logps/chosen": -495.7006530761719, + "logps/rejected": -747.5504760742188, + "loss": 0.6242, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03485063090920448, + "rewards/margins": 0.1573757827281952, + "rewards/rejected": -0.19222640991210938, + "step": 45 + }, + { + "epoch": 0.04, + "grad_norm": 93.27997514769116, + "learning_rate": 7.863247863247862e-07, + "logits/chosen": -0.9590734243392944, + "logits/rejected": -0.9081867933273315, + "logps/chosen": -468.8348693847656, + "logps/rejected": -667.7100830078125, + "loss": 0.6551, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0537002757191658, + "rewards/margins": 0.08610428124666214, + "rewards/rejected": -0.13980455696582794, + "step": 46 + }, + { + "epoch": 0.04, + "grad_norm": 84.49214156499791, + "learning_rate": 8.034188034188034e-07, + "logits/chosen": -1.0012445449829102, + "logits/rejected": -0.9466636180877686, + "logps/chosen": -543.0889282226562, + "logps/rejected": -655.0601196289062, + "loss": 0.5992, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.031609345227479935, + "rewards/margins": 0.22827112674713135, + "rewards/rejected": -0.2598804533481598, + "step": 47 + }, + { + "epoch": 0.04, + "grad_norm": 91.01884588886823, + "learning_rate": 8.205128205128205e-07, + "logits/chosen": -0.9361293315887451, + "logits/rejected": -0.9137954711914062, + "logps/chosen": -493.56622314453125, + "logps/rejected": -541.2529907226562, + "loss": 0.6555, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0510423518717289, + "rewards/margins": 0.10465296357870102, + "rewards/rejected": -0.053610607981681824, + "step": 48 + }, + { + "epoch": 0.04, + "grad_norm": 78.77859928486089, + "learning_rate": 8.376068376068375e-07, + "logits/chosen": -0.9242331385612488, + "logits/rejected": -0.8696056008338928, + "logps/chosen": -497.6260986328125, + "logps/rejected": -671.1222534179688, + "loss": 0.5487, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.07649482786655426, + "rewards/margins": 0.35616305470466614, + "rewards/rejected": -0.2796682119369507, + "step": 49 + }, + { + "epoch": 0.04, + "grad_norm": 87.4924099852313, + "learning_rate": 8.547008547008546e-07, + "logits/chosen": -0.9026439189910889, + "logits/rejected": -0.855921745300293, + "logps/chosen": -401.735595703125, + "logps/rejected": -690.1610717773438, + "loss": 0.5861, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.04143127053976059, + "rewards/margins": 0.2539970278739929, + "rewards/rejected": -0.2954282760620117, + "step": 50 + }, + { + "epoch": 0.04, + "grad_norm": 81.48941014314516, + "learning_rate": 8.717948717948718e-07, + "logits/chosen": -0.942750871181488, + "logits/rejected": -0.9072023630142212, + "logps/chosen": -370.6748046875, + "logps/rejected": -623.7341918945312, + "loss": 0.6305, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03426019474864006, + "rewards/margins": 0.16470417380332947, + "rewards/rejected": -0.19896435737609863, + "step": 51 + }, + { + "epoch": 0.04, + "grad_norm": 97.18373689098986, + "learning_rate": 8.888888888888888e-07, + "logits/chosen": -0.8328244686126709, + "logits/rejected": -0.8133566975593567, + "logps/chosen": -606.1514892578125, + "logps/rejected": -718.189453125, + "loss": 0.6274, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09226015210151672, + "rewards/margins": 0.14438080787658691, + "rewards/rejected": -0.23664094507694244, + "step": 52 + }, + { + "epoch": 0.05, + "grad_norm": 84.23311513947658, + "learning_rate": 9.059829059829059e-07, + "logits/chosen": -0.8741067051887512, + "logits/rejected": -0.8305940628051758, + "logps/chosen": -443.4811706542969, + "logps/rejected": -599.29248046875, + "loss": 0.599, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07226061820983887, + "rewards/margins": 0.24112094938755035, + "rewards/rejected": -0.3133815824985504, + "step": 53 + }, + { + "epoch": 0.05, + "grad_norm": 90.77895336471985, + "learning_rate": 9.230769230769231e-07, + "logits/chosen": -0.8400816917419434, + "logits/rejected": -0.7951209545135498, + "logps/chosen": -502.77703857421875, + "logps/rejected": -656.4312133789062, + "loss": 0.6166, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13248419761657715, + "rewards/margins": 0.1832733154296875, + "rewards/rejected": -0.31575751304626465, + "step": 54 + }, + { + "epoch": 0.05, + "grad_norm": 92.83966456996117, + "learning_rate": 9.401709401709401e-07, + "logits/chosen": -0.9149412512779236, + "logits/rejected": -0.8611310124397278, + "logps/chosen": -636.2821044921875, + "logps/rejected": -555.0322265625, + "loss": 0.6557, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18232059478759766, + "rewards/margins": 0.0952020213007927, + "rewards/rejected": -0.27752262353897095, + "step": 55 + }, + { + "epoch": 0.05, + "grad_norm": 91.35675517335002, + "learning_rate": 9.572649572649572e-07, + "logits/chosen": -0.8824781775474548, + "logits/rejected": -0.8100583553314209, + "logps/chosen": -575.037109375, + "logps/rejected": -663.770263671875, + "loss": 0.6307, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1571289300918579, + "rewards/margins": 0.1695071905851364, + "rewards/rejected": -0.3266361355781555, + "step": 56 + }, + { + "epoch": 0.05, + "grad_norm": 89.40472477922503, + "learning_rate": 9.743589743589742e-07, + "logits/chosen": -0.8715717792510986, + "logits/rejected": -0.8538134098052979, + "logps/chosen": -436.94000244140625, + "logps/rejected": -606.1776123046875, + "loss": 0.6018, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.05886922404170036, + "rewards/margins": 0.2175457775592804, + "rewards/rejected": -0.27641499042510986, + "step": 57 + }, + { + "epoch": 0.05, + "grad_norm": 93.86457785629594, + "learning_rate": 9.914529914529915e-07, + "logits/chosen": -0.8911943435668945, + "logits/rejected": -0.8368760347366333, + "logps/chosen": -542.065185546875, + "logps/rejected": -709.164306640625, + "loss": 0.5788, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04741700738668442, + "rewards/margins": 0.2624610662460327, + "rewards/rejected": -0.30987805128097534, + "step": 58 + }, + { + "epoch": 0.05, + "grad_norm": 86.26470924808025, + "learning_rate": 1.0085470085470084e-06, + "logits/chosen": -0.7618128657341003, + "logits/rejected": -0.715040922164917, + "logps/chosen": -546.5999755859375, + "logps/rejected": -586.6727294921875, + "loss": 0.6301, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.027117159217596054, + "rewards/margins": 0.1485387682914734, + "rewards/rejected": -0.17565591633319855, + "step": 59 + }, + { + "epoch": 0.05, + "grad_norm": 80.45324400821403, + "learning_rate": 1.0256410256410255e-06, + "logits/chosen": -0.8306759595870972, + "logits/rejected": -0.7963862419128418, + "logps/chosen": -382.11224365234375, + "logps/rejected": -530.7271728515625, + "loss": 0.6279, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.06177574023604393, + "rewards/margins": 0.1595396101474762, + "rewards/rejected": -0.22131533920764923, + "step": 60 + }, + { + "epoch": 0.05, + "grad_norm": 83.54456700968679, + "learning_rate": 1.0427350427350427e-06, + "logits/chosen": -0.6904474496841431, + "logits/rejected": -0.6812146902084351, + "logps/chosen": -491.5672607421875, + "logps/rejected": -609.932861328125, + "loss": 0.5828, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12990780174732208, + "rewards/margins": 0.25919151306152344, + "rewards/rejected": -0.3890992999076843, + "step": 61 + }, + { + "epoch": 0.05, + "grad_norm": 77.16981900105961, + "learning_rate": 1.0598290598290598e-06, + "logits/chosen": -0.6535602807998657, + "logits/rejected": -0.6413712501525879, + "logps/chosen": -464.19293212890625, + "logps/rejected": -532.1095581054688, + "loss": 0.6012, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.03915134817361832, + "rewards/margins": 0.22922548651695251, + "rewards/rejected": -0.26837682723999023, + "step": 62 + }, + { + "epoch": 0.05, + "grad_norm": 104.62885637579853, + "learning_rate": 1.0769230769230769e-06, + "logits/chosen": -0.8028097152709961, + "logits/rejected": -0.7520387172698975, + "logps/chosen": -637.960205078125, + "logps/rejected": -638.2055053710938, + "loss": 0.6533, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1958635449409485, + "rewards/margins": 0.13062095642089844, + "rewards/rejected": -0.3264845013618469, + "step": 63 + }, + { + "epoch": 0.05, + "grad_norm": 100.23843850521743, + "learning_rate": 1.0940170940170942e-06, + "logits/chosen": -0.6847934126853943, + "logits/rejected": -0.5922191143035889, + "logps/chosen": -723.4080810546875, + "logps/rejected": -724.36767578125, + "loss": 0.6123, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18822208046913147, + "rewards/margins": 0.1920095980167389, + "rewards/rejected": -0.38023167848587036, + "step": 64 + }, + { + "epoch": 0.06, + "grad_norm": 95.0522315726931, + "learning_rate": 1.111111111111111e-06, + "logits/chosen": -0.7592595815658569, + "logits/rejected": -0.7065274715423584, + "logps/chosen": -575.9987182617188, + "logps/rejected": -705.1849365234375, + "loss": 0.6486, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08855009078979492, + "rewards/margins": 0.26747530698776245, + "rewards/rejected": -0.17892523109912872, + "step": 65 + }, + { + "epoch": 0.06, + "grad_norm": 82.5319959067867, + "learning_rate": 1.1282051282051281e-06, + "logits/chosen": -0.6380032300949097, + "logits/rejected": -0.6153943538665771, + "logps/chosen": -457.5918884277344, + "logps/rejected": -647.5048828125, + "loss": 0.5669, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.060186341404914856, + "rewards/margins": 0.32465216517448425, + "rewards/rejected": -0.3848384916782379, + "step": 66 + }, + { + "epoch": 0.06, + "grad_norm": 133.2973708733622, + "learning_rate": 1.1452991452991452e-06, + "logits/chosen": -0.6638052463531494, + "logits/rejected": -0.6150358319282532, + "logps/chosen": -694.1866455078125, + "logps/rejected": -747.4185791015625, + "loss": 0.6363, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.191949263215065, + "rewards/margins": 0.1725366711616516, + "rewards/rejected": -0.3644859492778778, + "step": 67 + }, + { + "epoch": 0.06, + "grad_norm": 84.88844038959499, + "learning_rate": 1.1623931623931625e-06, + "logits/chosen": -0.800848662853241, + "logits/rejected": -0.7365254163742065, + "logps/chosen": -598.2763671875, + "logps/rejected": -724.450927734375, + "loss": 0.5192, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.08877643197774887, + "rewards/margins": 0.43157708644866943, + "rewards/rejected": -0.5203535556793213, + "step": 68 + }, + { + "epoch": 0.06, + "grad_norm": 74.27440749130595, + "learning_rate": 1.1794871794871795e-06, + "logits/chosen": -0.7443728446960449, + "logits/rejected": -0.7063024044036865, + "logps/chosen": -463.95013427734375, + "logps/rejected": -710.5635986328125, + "loss": 0.5168, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02280556596815586, + "rewards/margins": 0.4473559260368347, + "rewards/rejected": -0.470161497592926, + "step": 69 + }, + { + "epoch": 0.06, + "grad_norm": 78.31022777921227, + "learning_rate": 1.1965811965811964e-06, + "logits/chosen": -0.6374633312225342, + "logits/rejected": -0.6176419258117676, + "logps/chosen": -513.8482666015625, + "logps/rejected": -636.00927734375, + "loss": 0.5565, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.16604569554328918, + "rewards/margins": 0.32729434967041016, + "rewards/rejected": -0.49334001541137695, + "step": 70 + }, + { + "epoch": 0.06, + "grad_norm": 82.71600203096435, + "learning_rate": 1.2136752136752135e-06, + "logits/chosen": -0.7886828184127808, + "logits/rejected": -0.7378125190734863, + "logps/chosen": -431.9499206542969, + "logps/rejected": -634.23095703125, + "loss": 0.5438, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04967509210109711, + "rewards/margins": 0.3546733260154724, + "rewards/rejected": -0.40434837341308594, + "step": 71 + }, + { + "epoch": 0.06, + "grad_norm": 86.16215472228565, + "learning_rate": 1.2307692307692308e-06, + "logits/chosen": -0.7888700366020203, + "logits/rejected": -0.739388644695282, + "logps/chosen": -556.30078125, + "logps/rejected": -581.4580078125, + "loss": 0.5939, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11910320818424225, + "rewards/margins": 0.2826518416404724, + "rewards/rejected": -0.40175506472587585, + "step": 72 + }, + { + "epoch": 0.06, + "grad_norm": 84.04344881624239, + "learning_rate": 1.2478632478632478e-06, + "logits/chosen": -0.766875684261322, + "logits/rejected": -0.7309463024139404, + "logps/chosen": -425.70941162109375, + "logps/rejected": -436.1416931152344, + "loss": 0.6546, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15280786156654358, + "rewards/margins": 0.09877495467662811, + "rewards/rejected": -0.2515828013420105, + "step": 73 + }, + { + "epoch": 0.06, + "grad_norm": 83.32547240766198, + "learning_rate": 1.264957264957265e-06, + "logits/chosen": -0.7253504991531372, + "logits/rejected": -0.7070431709289551, + "logps/chosen": -471.2389831542969, + "logps/rejected": -688.390869140625, + "loss": 0.5852, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2157745659351349, + "rewards/margins": 0.25578293204307556, + "rewards/rejected": -0.47155749797821045, + "step": 74 + }, + { + "epoch": 0.06, + "grad_norm": 81.5941281481595, + "learning_rate": 1.2820512820512822e-06, + "logits/chosen": -0.6651368141174316, + "logits/rejected": -0.6502801179885864, + "logps/chosen": -471.9756774902344, + "logps/rejected": -554.71630859375, + "loss": 0.5826, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.13057751953601837, + "rewards/margins": 0.2676604092121124, + "rewards/rejected": -0.3982379138469696, + "step": 75 + }, + { + "epoch": 0.07, + "grad_norm": 85.78261235986672, + "learning_rate": 1.299145299145299e-06, + "logits/chosen": -0.7806533575057983, + "logits/rejected": -0.7606949806213379, + "logps/chosen": -437.90997314453125, + "logps/rejected": -730.5367431640625, + "loss": 0.5606, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09575305134057999, + "rewards/margins": 0.3341591954231262, + "rewards/rejected": -0.429912269115448, + "step": 76 + }, + { + "epoch": 0.07, + "grad_norm": 80.12358188477748, + "learning_rate": 1.3162393162393161e-06, + "logits/chosen": -0.7996336221694946, + "logits/rejected": -0.761587381362915, + "logps/chosen": -516.3214721679688, + "logps/rejected": -669.3408203125, + "loss": 0.5536, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.17250573635101318, + "rewards/margins": 0.3307540714740753, + "rewards/rejected": -0.5032598376274109, + "step": 77 + }, + { + "epoch": 0.07, + "grad_norm": 68.90500632179386, + "learning_rate": 1.3333333333333332e-06, + "logits/chosen": -0.7328706979751587, + "logits/rejected": -0.7092921733856201, + "logps/chosen": -478.4838562011719, + "logps/rejected": -623.0524291992188, + "loss": 0.5091, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05939531326293945, + "rewards/margins": 0.44020992517471313, + "rewards/rejected": -0.4996052384376526, + "step": 78 + }, + { + "epoch": 0.07, + "grad_norm": 65.3752440554934, + "learning_rate": 1.3504273504273505e-06, + "logits/chosen": -0.8254345059394836, + "logits/rejected": -0.7779566049575806, + "logps/chosen": -351.9629821777344, + "logps/rejected": -470.9985046386719, + "loss": 0.5464, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.057938240468502045, + "rewards/margins": 0.3562437891960144, + "rewards/rejected": -0.41418206691741943, + "step": 79 + }, + { + "epoch": 0.07, + "grad_norm": 83.33809923207538, + "learning_rate": 1.3675213675213676e-06, + "logits/chosen": -0.7703996896743774, + "logits/rejected": -0.7198264002799988, + "logps/chosen": -644.7678833007812, + "logps/rejected": -719.0814208984375, + "loss": 0.5473, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24494361877441406, + "rewards/margins": 0.37615740299224854, + "rewards/rejected": -0.6211010217666626, + "step": 80 + }, + { + "epoch": 0.07, + "grad_norm": 87.93972477475651, + "learning_rate": 1.3846153846153844e-06, + "logits/chosen": -0.7104290127754211, + "logits/rejected": -0.6862534880638123, + "logps/chosen": -476.5368957519531, + "logps/rejected": -573.0159912109375, + "loss": 0.6023, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1351388543844223, + "rewards/margins": 0.23241892457008362, + "rewards/rejected": -0.3675577938556671, + "step": 81 + }, + { + "epoch": 0.07, + "grad_norm": 83.56310082112059, + "learning_rate": 1.4017094017094015e-06, + "logits/chosen": -0.7223315834999084, + "logits/rejected": -0.6844210028648376, + "logps/chosen": -574.5439453125, + "logps/rejected": -774.9532470703125, + "loss": 0.5268, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.14116373658180237, + "rewards/margins": 0.39218950271606445, + "rewards/rejected": -0.5333532094955444, + "step": 82 + }, + { + "epoch": 0.07, + "grad_norm": 83.086595568021, + "learning_rate": 1.4188034188034188e-06, + "logits/chosen": -0.6719001531600952, + "logits/rejected": -0.6052131652832031, + "logps/chosen": -555.9496459960938, + "logps/rejected": -671.5604248046875, + "loss": 0.5781, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03966934233903885, + "rewards/margins": 0.32521799206733704, + "rewards/rejected": -0.2855486571788788, + "step": 83 + }, + { + "epoch": 0.07, + "grad_norm": 98.97658586758621, + "learning_rate": 1.4358974358974359e-06, + "logits/chosen": -0.7302659749984741, + "logits/rejected": -0.6979095935821533, + "logps/chosen": -745.6942138671875, + "logps/rejected": -744.9570922851562, + "loss": 0.5822, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2759849429130554, + "rewards/margins": 0.28207817673683167, + "rewards/rejected": -0.5580631494522095, + "step": 84 + }, + { + "epoch": 0.07, + "grad_norm": 68.76305542901659, + "learning_rate": 1.452991452991453e-06, + "logits/chosen": -0.7735276222229004, + "logits/rejected": -0.7231909036636353, + "logps/chosen": -473.5228271484375, + "logps/rejected": -680.1998291015625, + "loss": 0.4859, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.08385667204856873, + "rewards/margins": 0.5274546146392822, + "rewards/rejected": -0.6113112568855286, + "step": 85 + }, + { + "epoch": 0.07, + "grad_norm": 74.88598476634989, + "learning_rate": 1.4700854700854702e-06, + "logits/chosen": -0.8076218962669373, + "logits/rejected": -0.7630515694618225, + "logps/chosen": -510.4481201171875, + "logps/rejected": -677.6754150390625, + "loss": 0.4774, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.14206896722316742, + "rewards/margins": 0.5710533857345581, + "rewards/rejected": -0.7131223678588867, + "step": 86 + }, + { + "epoch": 0.07, + "grad_norm": 74.58496629885917, + "learning_rate": 1.487179487179487e-06, + "logits/chosen": -0.7392367124557495, + "logits/rejected": -0.7253636717796326, + "logps/chosen": -430.012451171875, + "logps/rejected": -599.80517578125, + "loss": 0.4863, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.10148727893829346, + "rewards/margins": 0.5541262030601501, + "rewards/rejected": -0.6556135416030884, + "step": 87 + }, + { + "epoch": 0.08, + "grad_norm": 60.2346447729904, + "learning_rate": 1.5042735042735041e-06, + "logits/chosen": -0.7053150534629822, + "logits/rejected": -0.6814218759536743, + "logps/chosen": -347.4354248046875, + "logps/rejected": -578.521728515625, + "loss": 0.5126, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.09811823070049286, + "rewards/margins": 0.4941341280937195, + "rewards/rejected": -0.5922523140907288, + "step": 88 + }, + { + "epoch": 0.08, + "grad_norm": 102.6805009384135, + "learning_rate": 1.5213675213675212e-06, + "logits/chosen": -0.7140446901321411, + "logits/rejected": -0.6726734638214111, + "logps/chosen": -535.3824462890625, + "logps/rejected": -766.9484252929688, + "loss": 0.5048, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15884247422218323, + "rewards/margins": 0.5328004956245422, + "rewards/rejected": -0.6916429400444031, + "step": 89 + }, + { + "epoch": 0.08, + "grad_norm": 69.69374210672956, + "learning_rate": 1.5384615384615385e-06, + "logits/chosen": -0.6098439693450928, + "logits/rejected": -0.6184341907501221, + "logps/chosen": -491.7330322265625, + "logps/rejected": -803.6419067382812, + "loss": 0.4604, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.13347195088863373, + "rewards/margins": 0.6201971769332886, + "rewards/rejected": -0.7536691427230835, + "step": 90 + }, + { + "epoch": 0.08, + "grad_norm": 79.95513092727752, + "learning_rate": 1.5555555555555556e-06, + "logits/chosen": -0.7003642320632935, + "logits/rejected": -0.6302669048309326, + "logps/chosen": -645.9889526367188, + "logps/rejected": -725.8878173828125, + "loss": 0.5059, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.13106298446655273, + "rewards/margins": 0.4724835157394409, + "rewards/rejected": -0.6035465002059937, + "step": 91 + }, + { + "epoch": 0.08, + "grad_norm": 88.05720697404861, + "learning_rate": 1.5726495726495724e-06, + "logits/chosen": -0.7634373903274536, + "logits/rejected": -0.6494268178939819, + "logps/chosen": -679.6563720703125, + "logps/rejected": -634.7783203125, + "loss": 0.5569, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19827817380428314, + "rewards/margins": 0.3626251220703125, + "rewards/rejected": -0.5609033107757568, + "step": 92 + }, + { + "epoch": 0.08, + "grad_norm": 68.36179701549388, + "learning_rate": 1.5897435897435895e-06, + "logits/chosen": -0.6748229265213013, + "logits/rejected": -0.6570998430252075, + "logps/chosen": -484.94976806640625, + "logps/rejected": -636.7830200195312, + "loss": 0.4894, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.09381289780139923, + "rewards/margins": 0.5250856876373291, + "rewards/rejected": -0.6188986301422119, + "step": 93 + }, + { + "epoch": 0.08, + "grad_norm": 71.06292002236022, + "learning_rate": 1.6068376068376068e-06, + "logits/chosen": -0.68152916431427, + "logits/rejected": -0.6525790691375732, + "logps/chosen": -361.0284729003906, + "logps/rejected": -344.1199645996094, + "loss": 0.6176, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.17955589294433594, + "rewards/margins": 0.1847459077835083, + "rewards/rejected": -0.36430180072784424, + "step": 94 + }, + { + "epoch": 0.08, + "grad_norm": 76.75834996079558, + "learning_rate": 1.6239316239316239e-06, + "logits/chosen": -0.7209312319755554, + "logits/rejected": -0.691003143787384, + "logps/chosen": -538.3939208984375, + "logps/rejected": -878.8992919921875, + "loss": 0.4889, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.36771565675735474, + "rewards/margins": 0.5804280042648315, + "rewards/rejected": -0.9481436014175415, + "step": 95 + }, + { + "epoch": 0.08, + "grad_norm": 72.2280506478773, + "learning_rate": 1.641025641025641e-06, + "logits/chosen": -0.8478317260742188, + "logits/rejected": -0.7494789361953735, + "logps/chosen": -455.7156066894531, + "logps/rejected": -556.229736328125, + "loss": 0.5173, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.13087773323059082, + "rewards/margins": 0.4210716485977173, + "rewards/rejected": -0.5519493818283081, + "step": 96 + }, + { + "epoch": 0.08, + "grad_norm": 79.87734471613219, + "learning_rate": 1.6581196581196582e-06, + "logits/chosen": -0.7853848934173584, + "logits/rejected": -0.7578170299530029, + "logps/chosen": -374.66754150390625, + "logps/rejected": -629.7857666015625, + "loss": 0.4637, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.131764218211174, + "rewards/margins": 0.5719059705734253, + "rewards/rejected": -0.7036702036857605, + "step": 97 + }, + { + "epoch": 0.08, + "grad_norm": 63.36940463548466, + "learning_rate": 1.675213675213675e-06, + "logits/chosen": -0.7420146465301514, + "logits/rejected": -0.7114475965499878, + "logps/chosen": -419.96209716796875, + "logps/rejected": -752.1236572265625, + "loss": 0.4367, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15291455388069153, + "rewards/margins": 0.6730989813804626, + "rewards/rejected": -0.8260136246681213, + "step": 98 + }, + { + "epoch": 0.08, + "grad_norm": 65.96679602813242, + "learning_rate": 1.6923076923076922e-06, + "logits/chosen": -0.7752172946929932, + "logits/rejected": -0.6906420588493347, + "logps/chosen": -543.2679443359375, + "logps/rejected": -700.7498779296875, + "loss": 0.4442, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19367974996566772, + "rewards/margins": 0.6900912523269653, + "rewards/rejected": -0.8837709426879883, + "step": 99 + }, + { + "epoch": 0.09, + "grad_norm": 72.20674567697648, + "learning_rate": 1.7094017094017092e-06, + "logits/chosen": -0.766581654548645, + "logits/rejected": -0.6910284161567688, + "logps/chosen": -489.31884765625, + "logps/rejected": -679.50390625, + "loss": 0.4481, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.10363808274269104, + "rewards/margins": 0.6479713916778564, + "rewards/rejected": -0.7516094446182251, + "step": 100 + }, + { + "epoch": 0.09, + "grad_norm": 73.31143088988361, + "learning_rate": 1.7264957264957265e-06, + "logits/chosen": -0.823377251625061, + "logits/rejected": -0.7363071441650391, + "logps/chosen": -728.3934326171875, + "logps/rejected": -869.7471923828125, + "loss": 0.4868, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.30075347423553467, + "rewards/margins": 0.6340336799621582, + "rewards/rejected": -0.9347871541976929, + "step": 101 + }, + { + "epoch": 0.09, + "grad_norm": 71.59168235641044, + "learning_rate": 1.7435897435897436e-06, + "logits/chosen": -0.7990365028381348, + "logits/rejected": -0.7497241497039795, + "logps/chosen": -532.1788330078125, + "logps/rejected": -694.0585327148438, + "loss": 0.4825, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.26346921920776367, + "rewards/margins": 0.5893768072128296, + "rewards/rejected": -0.8528460264205933, + "step": 102 + }, + { + "epoch": 0.09, + "grad_norm": 66.90381226463809, + "learning_rate": 1.7606837606837607e-06, + "logits/chosen": -0.7751025557518005, + "logits/rejected": -0.7068533897399902, + "logps/chosen": -400.0640869140625, + "logps/rejected": -499.7941589355469, + "loss": 0.5111, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1898057758808136, + "rewards/margins": 0.4734571576118469, + "rewards/rejected": -0.6632629632949829, + "step": 103 + }, + { + "epoch": 0.09, + "grad_norm": 91.57313094191657, + "learning_rate": 1.7777777777777775e-06, + "logits/chosen": -0.7730945944786072, + "logits/rejected": -0.7324565052986145, + "logps/chosen": -584.8296508789062, + "logps/rejected": -557.51416015625, + "loss": 0.6382, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4563564658164978, + "rewards/margins": 0.16472357511520386, + "rewards/rejected": -0.6210800409317017, + "step": 104 + }, + { + "epoch": 0.09, + "grad_norm": 80.8676493591247, + "learning_rate": 1.7948717948717948e-06, + "logits/chosen": -0.800835371017456, + "logits/rejected": -0.7630007266998291, + "logps/chosen": -688.7553100585938, + "logps/rejected": -779.5538940429688, + "loss": 0.4738, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33632346987724304, + "rewards/margins": 0.6307164430618286, + "rewards/rejected": -0.9670398235321045, + "step": 105 + }, + { + "epoch": 0.09, + "grad_norm": 94.83615645547866, + "learning_rate": 1.8119658119658119e-06, + "logits/chosen": -0.7865450978279114, + "logits/rejected": -0.7478854060173035, + "logps/chosen": -679.5304565429688, + "logps/rejected": -742.7913208007812, + "loss": 0.5669, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.301138699054718, + "rewards/margins": 0.36614471673965454, + "rewards/rejected": -0.6672834157943726, + "step": 106 + }, + { + "epoch": 0.09, + "grad_norm": 75.67023157367913, + "learning_rate": 1.829059829059829e-06, + "logits/chosen": -0.8037106990814209, + "logits/rejected": -0.7343106269836426, + "logps/chosen": -478.39031982421875, + "logps/rejected": -601.8953857421875, + "loss": 0.5616, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2523706257343292, + "rewards/margins": 0.36477893590927124, + "rewards/rejected": -0.6171494722366333, + "step": 107 + }, + { + "epoch": 0.09, + "grad_norm": 77.13542686202837, + "learning_rate": 1.8461538461538462e-06, + "logits/chosen": -0.8077168464660645, + "logits/rejected": -0.7684412598609924, + "logps/chosen": -580.583740234375, + "logps/rejected": -786.945068359375, + "loss": 0.4501, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27610066533088684, + "rewards/margins": 0.6333410739898682, + "rewards/rejected": -0.9094418287277222, + "step": 108 + }, + { + "epoch": 0.09, + "grad_norm": 68.08513125203355, + "learning_rate": 1.8632478632478631e-06, + "logits/chosen": -0.7108200192451477, + "logits/rejected": -0.6949425935745239, + "logps/chosen": -625.0948486328125, + "logps/rejected": -853.0823364257812, + "loss": 0.4433, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26558175683021545, + "rewards/margins": 0.6938874125480652, + "rewards/rejected": -0.9594690799713135, + "step": 109 + }, + { + "epoch": 0.09, + "grad_norm": 80.04012798449374, + "learning_rate": 1.8803418803418802e-06, + "logits/chosen": -0.8558471202850342, + "logits/rejected": -0.8219131827354431, + "logps/chosen": -504.2074279785156, + "logps/rejected": -616.380859375, + "loss": 0.5202, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24755051732063293, + "rewards/margins": 0.41993147134780884, + "rewards/rejected": -0.6674820184707642, + "step": 110 + }, + { + "epoch": 0.1, + "grad_norm": 66.76691728685117, + "learning_rate": 1.8974358974358973e-06, + "logits/chosen": -0.8877596855163574, + "logits/rejected": -0.8417974710464478, + "logps/chosen": -395.5901794433594, + "logps/rejected": -509.20428466796875, + "loss": 0.5051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11536961048841476, + "rewards/margins": 0.46388569474220276, + "rewards/rejected": -0.5792553424835205, + "step": 111 + }, + { + "epoch": 0.1, + "grad_norm": 68.84182709142127, + "learning_rate": 1.9145299145299143e-06, + "logits/chosen": -0.8668128252029419, + "logits/rejected": -0.8128141164779663, + "logps/chosen": -615.441162109375, + "logps/rejected": -841.2947998046875, + "loss": 0.4201, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.30144259333610535, + "rewards/margins": 0.7650531530380249, + "rewards/rejected": -1.0664957761764526, + "step": 112 + }, + { + "epoch": 0.1, + "grad_norm": 72.82730282619598, + "learning_rate": 1.9316239316239316e-06, + "logits/chosen": -0.8395406007766724, + "logits/rejected": -0.7451783418655396, + "logps/chosen": -712.5833129882812, + "logps/rejected": -855.0979614257812, + "loss": 0.3898, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.15199768543243408, + "rewards/margins": 0.9080096483230591, + "rewards/rejected": -1.0600073337554932, + "step": 113 + }, + { + "epoch": 0.1, + "grad_norm": 93.92928292851688, + "learning_rate": 1.9487179487179485e-06, + "logits/chosen": -0.8096364736557007, + "logits/rejected": -0.7551666498184204, + "logps/chosen": -771.3917236328125, + "logps/rejected": -970.24462890625, + "loss": 0.4669, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.45426732301712036, + "rewards/margins": 0.7061700820922852, + "rewards/rejected": -1.1604373455047607, + "step": 114 + }, + { + "epoch": 0.1, + "grad_norm": 79.1858750636432, + "learning_rate": 1.9658119658119658e-06, + "logits/chosen": -0.8325074911117554, + "logits/rejected": -0.7650719285011292, + "logps/chosen": -597.4686279296875, + "logps/rejected": -716.0536499023438, + "loss": 0.4879, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2944122552871704, + "rewards/margins": 0.6035362482070923, + "rewards/rejected": -0.8979485034942627, + "step": 115 + }, + { + "epoch": 0.1, + "grad_norm": 72.37840288546316, + "learning_rate": 1.982905982905983e-06, + "logits/chosen": -0.807362973690033, + "logits/rejected": -0.7953593730926514, + "logps/chosen": -589.3651733398438, + "logps/rejected": -734.3743896484375, + "loss": 0.4439, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3563947081565857, + "rewards/margins": 0.7376902103424072, + "rewards/rejected": -1.0940849781036377, + "step": 116 + }, + { + "epoch": 0.1, + "grad_norm": 75.5794034970188, + "learning_rate": 2e-06, + "logits/chosen": -0.8776167631149292, + "logits/rejected": -0.8628365993499756, + "logps/chosen": -434.11993408203125, + "logps/rejected": -607.8153076171875, + "loss": 0.5061, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.34286579489707947, + "rewards/margins": 0.4919288456439972, + "rewards/rejected": -0.8347946405410767, + "step": 117 + }, + { + "epoch": 0.1, + "grad_norm": 68.7024215365996, + "learning_rate": 1.999995515454355e-06, + "logits/chosen": -0.8011682033538818, + "logits/rejected": -0.7599575519561768, + "logps/chosen": -565.426513671875, + "logps/rejected": -722.574951171875, + "loss": 0.4489, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.390835702419281, + "rewards/margins": 0.7291101813316345, + "rewards/rejected": -1.1199458837509155, + "step": 118 + }, + { + "epoch": 0.1, + "grad_norm": 69.50092820504294, + "learning_rate": 1.999982061857643e-06, + "logits/chosen": -0.9195200204849243, + "logits/rejected": -0.8597877025604248, + "logps/chosen": -695.9260864257812, + "logps/rejected": -779.2306518554688, + "loss": 0.4423, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2885461449623108, + "rewards/margins": 0.7777824401855469, + "rewards/rejected": -1.0663286447525024, + "step": 119 + }, + { + "epoch": 0.1, + "grad_norm": 71.50992646655071, + "learning_rate": 1.9999596393305298e-06, + "logits/chosen": -0.8019847869873047, + "logits/rejected": -0.7569107413291931, + "logps/chosen": -461.3133544921875, + "logps/rejected": -575.7021484375, + "loss": 0.5173, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.2865118980407715, + "rewards/margins": 0.47288602590560913, + "rewards/rejected": -0.7593979239463806, + "step": 120 + }, + { + "epoch": 0.1, + "grad_norm": 77.06119329349305, + "learning_rate": 1.9999282480741252e-06, + "logits/chosen": -0.8069489002227783, + "logits/rejected": -0.7503291368484497, + "logps/chosen": -412.9802551269531, + "logps/rejected": -491.18585205078125, + "loss": 0.5334, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.1733241081237793, + "rewards/margins": 0.42330116033554077, + "rewards/rejected": -0.5966252684593201, + "step": 121 + }, + { + "epoch": 0.1, + "grad_norm": 87.32968201538856, + "learning_rate": 1.999887888369981e-06, + "logits/chosen": -0.8172651529312134, + "logits/rejected": -0.7478220462799072, + "logps/chosen": -713.5738525390625, + "logps/rejected": -749.0739135742188, + "loss": 0.4664, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31455856561660767, + "rewards/margins": 0.675393283367157, + "rewards/rejected": -0.9899518489837646, + "step": 122 + }, + { + "epoch": 0.11, + "grad_norm": 60.70810298598834, + "learning_rate": 1.999838560580086e-06, + "logits/chosen": -0.9219634532928467, + "logits/rejected": -0.8740679025650024, + "logps/chosen": -444.17626953125, + "logps/rejected": -695.4278564453125, + "loss": 0.4462, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.2280620038509369, + "rewards/margins": 0.6651239395141602, + "rewards/rejected": -0.8931859731674194, + "step": 123 + }, + { + "epoch": 0.11, + "grad_norm": 71.52073306791745, + "learning_rate": 1.9997802651468664e-06, + "logits/chosen": -0.797703742980957, + "logits/rejected": -0.7664157152175903, + "logps/chosen": -479.1082763671875, + "logps/rejected": -589.7113037109375, + "loss": 0.4608, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.24791169166564941, + "rewards/margins": 0.5920776128768921, + "rewards/rejected": -0.8399893045425415, + "step": 124 + }, + { + "epoch": 0.11, + "grad_norm": 79.2006988044831, + "learning_rate": 1.999713002593179e-06, + "logits/chosen": -0.8782462477684021, + "logits/rejected": -0.8354315757751465, + "logps/chosen": -541.5379638671875, + "logps/rejected": -650.455810546875, + "loss": 0.5032, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30667245388031006, + "rewards/margins": 0.5787320137023926, + "rewards/rejected": -0.8854044675827026, + "step": 125 + }, + { + "epoch": 0.11, + "grad_norm": 76.74145047917592, + "learning_rate": 1.999636773522308e-06, + "logits/chosen": -0.8741164207458496, + "logits/rejected": -0.8457087278366089, + "logps/chosen": -482.1501770019531, + "logps/rejected": -639.4264526367188, + "loss": 0.4945, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.34282004833221436, + "rewards/margins": 0.5826435089111328, + "rewards/rejected": -0.9254635572433472, + "step": 126 + }, + { + "epoch": 0.11, + "grad_norm": 72.2024867540662, + "learning_rate": 1.999551578617958e-06, + "logits/chosen": -0.8951029181480408, + "logits/rejected": -0.8367560505867004, + "logps/chosen": -508.193115234375, + "logps/rejected": -651.3898315429688, + "loss": 0.4561, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.19248032569885254, + "rewards/margins": 0.6740570068359375, + "rewards/rejected": -0.86653733253479, + "step": 127 + }, + { + "epoch": 0.11, + "grad_norm": 68.33385779419781, + "learning_rate": 1.999457418644251e-06, + "logits/chosen": -0.8008483052253723, + "logits/rejected": -0.7975972890853882, + "logps/chosen": -455.5728454589844, + "logps/rejected": -730.8447875976562, + "loss": 0.4601, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35138142108917236, + "rewards/margins": 0.8937031030654907, + "rewards/rejected": -1.245084524154663, + "step": 128 + }, + { + "epoch": 0.11, + "grad_norm": 60.09864215410157, + "learning_rate": 1.9993542944457167e-06, + "logits/chosen": -0.7812170386314392, + "logits/rejected": -0.7546092867851257, + "logps/chosen": -365.2407531738281, + "logps/rejected": -650.365966796875, + "loss": 0.3604, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20425821840763092, + "rewards/margins": 0.9624660611152649, + "rewards/rejected": -1.1667242050170898, + "step": 129 + }, + { + "epoch": 0.11, + "grad_norm": 81.74557893605306, + "learning_rate": 1.999242206947284e-06, + "logits/chosen": -0.8479536175727844, + "logits/rejected": -0.7998827695846558, + "logps/chosen": -603.7364501953125, + "logps/rejected": -804.69482421875, + "loss": 0.439, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47630196809768677, + "rewards/margins": 0.7274442911148071, + "rewards/rejected": -1.2037461996078491, + "step": 130 + }, + { + "epoch": 0.11, + "grad_norm": 70.49366758942749, + "learning_rate": 1.999121157154277e-06, + "logits/chosen": -0.8379110097885132, + "logits/rejected": -0.7947043180465698, + "logps/chosen": -558.55615234375, + "logps/rejected": -611.01611328125, + "loss": 0.4813, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37627291679382324, + "rewards/margins": 0.6493163704872131, + "rewards/rejected": -1.0255892276763916, + "step": 131 + }, + { + "epoch": 0.11, + "grad_norm": 72.10084104635021, + "learning_rate": 1.9989911461524012e-06, + "logits/chosen": -0.7971664667129517, + "logits/rejected": -0.7829629182815552, + "logps/chosen": -392.22967529296875, + "logps/rejected": -476.143798828125, + "loss": 0.5238, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3677898347377777, + "rewards/margins": 0.45336997509002686, + "rewards/rejected": -0.8211597800254822, + "step": 132 + }, + { + "epoch": 0.11, + "grad_norm": 58.32316467653529, + "learning_rate": 1.9988521751077387e-06, + "logits/chosen": -0.8165258765220642, + "logits/rejected": -0.7373073101043701, + "logps/chosen": -504.18206787109375, + "logps/rejected": -669.3347778320312, + "loss": 0.3772, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23694676160812378, + "rewards/margins": 0.9710106253623962, + "rewards/rejected": -1.2079572677612305, + "step": 133 + }, + { + "epoch": 0.11, + "grad_norm": 62.43807799568858, + "learning_rate": 1.9987042452667324e-06, + "logits/chosen": -0.8428569436073303, + "logits/rejected": -0.7920178174972534, + "logps/chosen": -502.78192138671875, + "logps/rejected": -705.2931518554688, + "loss": 0.4184, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3294200003147125, + "rewards/margins": 0.806629478931427, + "rewards/rejected": -1.136049509048462, + "step": 134 + }, + { + "epoch": 0.12, + "grad_norm": 78.44898603046353, + "learning_rate": 1.9985473579561792e-06, + "logits/chosen": -0.8491930961608887, + "logits/rejected": -0.8058072328567505, + "logps/chosen": -486.43756103515625, + "logps/rejected": -606.3853759765625, + "loss": 0.4936, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.43683668971061707, + "rewards/margins": 0.5353400111198425, + "rewards/rejected": -0.9721767902374268, + "step": 135 + }, + { + "epoch": 0.12, + "grad_norm": 64.12841449779542, + "learning_rate": 1.998381514583215e-06, + "logits/chosen": -0.772560715675354, + "logits/rejected": -0.7717125415802002, + "logps/chosen": -540.2191162109375, + "logps/rejected": -791.0310668945312, + "loss": 0.3893, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.38439521193504333, + "rewards/margins": 0.9186478853225708, + "rewards/rejected": -1.3030431270599365, + "step": 136 + }, + { + "epoch": 0.12, + "grad_norm": 64.02262479770808, + "learning_rate": 1.9982067166353048e-06, + "logits/chosen": -0.8253116607666016, + "logits/rejected": -0.7849256992340088, + "logps/chosen": -575.468017578125, + "logps/rejected": -662.4056396484375, + "loss": 0.4202, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28426802158355713, + "rewards/margins": 0.9540620446205139, + "rewards/rejected": -1.2383301258087158, + "step": 137 + }, + { + "epoch": 0.12, + "grad_norm": 82.62839604239348, + "learning_rate": 1.998022965680227e-06, + "logits/chosen": -0.8180713653564453, + "logits/rejected": -0.7528756260871887, + "logps/chosen": -600.6514892578125, + "logps/rejected": -630.1636962890625, + "loss": 0.5035, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5511524677276611, + "rewards/margins": 0.5072042346000671, + "rewards/rejected": -1.058356761932373, + "step": 138 + }, + { + "epoch": 0.12, + "grad_norm": 71.44852839544707, + "learning_rate": 1.997830263366061e-06, + "logits/chosen": -0.7998161315917969, + "logits/rejected": -0.7412513494491577, + "logps/chosen": -513.164306640625, + "logps/rejected": -934.647705078125, + "loss": 0.4242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18442249298095703, + "rewards/margins": 0.7800825834274292, + "rewards/rejected": -0.9645050168037415, + "step": 139 + }, + { + "epoch": 0.12, + "grad_norm": 52.61456092779066, + "learning_rate": 1.9976286114211705e-06, + "logits/chosen": -0.8468146324157715, + "logits/rejected": -0.78184574842453, + "logps/chosen": -439.827880859375, + "logps/rejected": -507.6515808105469, + "loss": 0.3766, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.20624279975891113, + "rewards/margins": 0.8863797187805176, + "rewards/rejected": -1.0926225185394287, + "step": 140 + }, + { + "epoch": 0.12, + "grad_norm": 69.57744451282258, + "learning_rate": 1.997418011654192e-06, + "logits/chosen": -0.9197933673858643, + "logits/rejected": -0.8328337073326111, + "logps/chosen": -429.9018249511719, + "logps/rejected": -433.46246337890625, + "loss": 0.5531, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3884229063987732, + "rewards/margins": 0.45711952447891235, + "rewards/rejected": -0.8455424308776855, + "step": 141 + }, + { + "epoch": 0.12, + "grad_norm": 59.410206466636474, + "learning_rate": 1.997198465954012e-06, + "logits/chosen": -0.8448234796524048, + "logits/rejected": -0.806464672088623, + "logps/chosen": -513.1571655273438, + "logps/rejected": -833.9468383789062, + "loss": 0.3146, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.37702399492263794, + "rewards/margins": 1.382176160812378, + "rewards/rejected": -1.759200096130371, + "step": 142 + }, + { + "epoch": 0.12, + "grad_norm": 72.20471516742066, + "learning_rate": 1.9969699762897573e-06, + "logits/chosen": -0.915653645992279, + "logits/rejected": -0.8454491496086121, + "logps/chosen": -619.8283081054688, + "logps/rejected": -733.0457763671875, + "loss": 0.4527, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5407058000564575, + "rewards/margins": 0.7225282192230225, + "rewards/rejected": -1.2632339000701904, + "step": 143 + }, + { + "epoch": 0.12, + "grad_norm": 59.75240456422268, + "learning_rate": 1.9967325447107722e-06, + "logits/chosen": -0.8744698762893677, + "logits/rejected": -0.8536149859428406, + "logps/chosen": -528.2760009765625, + "logps/rejected": -776.7694091796875, + "loss": 0.3287, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4334297776222229, + "rewards/margins": 1.1689581871032715, + "rewards/rejected": -1.6023879051208496, + "step": 144 + }, + { + "epoch": 0.12, + "grad_norm": 60.74667932979385, + "learning_rate": 1.996486173346602e-06, + "logits/chosen": -0.8356216549873352, + "logits/rejected": -0.8344606757164001, + "logps/chosen": -483.53900146484375, + "logps/rejected": -659.2405395507812, + "loss": 0.3723, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35159799456596375, + "rewards/margins": 0.9288654327392578, + "rewards/rejected": -1.280463457107544, + "step": 145 + }, + { + "epoch": 0.13, + "grad_norm": 67.71810220038617, + "learning_rate": 1.996230864406974e-06, + "logits/chosen": -0.9083362817764282, + "logits/rejected": -0.8268660306930542, + "logps/chosen": -633.2772827148438, + "logps/rejected": -801.44140625, + "loss": 0.3127, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2331371307373047, + "rewards/margins": 1.2440853118896484, + "rewards/rejected": -1.4772224426269531, + "step": 146 + }, + { + "epoch": 0.13, + "grad_norm": 65.4351547090826, + "learning_rate": 1.9959666201817776e-06, + "logits/chosen": -0.8755106925964355, + "logits/rejected": -0.8636406064033508, + "logps/chosen": -507.8783264160156, + "logps/rejected": -684.91552734375, + "loss": 0.3942, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40344667434692383, + "rewards/margins": 0.9445846080780029, + "rewards/rejected": -1.3480312824249268, + "step": 147 + }, + { + "epoch": 0.13, + "grad_norm": 62.6842472143847, + "learning_rate": 1.9956934430410437e-06, + "logits/chosen": -0.903535008430481, + "logits/rejected": -0.8534011840820312, + "logps/chosen": -395.8306884765625, + "logps/rejected": -533.202392578125, + "loss": 0.4462, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3898427486419678, + "rewards/margins": 0.7070665955543518, + "rewards/rejected": -1.0969094038009644, + "step": 148 + }, + { + "epoch": 0.13, + "grad_norm": 56.18745353407791, + "learning_rate": 1.995411335434922e-06, + "logits/chosen": -0.8835330605506897, + "logits/rejected": -0.844456136226654, + "logps/chosen": -450.849853515625, + "logps/rejected": -655.7808837890625, + "loss": 0.3416, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2940594255924225, + "rewards/margins": 1.1423484086990356, + "rewards/rejected": -1.4364076852798462, + "step": 149 + }, + { + "epoch": 0.13, + "grad_norm": 63.93201700244688, + "learning_rate": 1.995120299893662e-06, + "logits/chosen": -0.8894577622413635, + "logits/rejected": -0.8500162363052368, + "logps/chosen": -529.0599365234375, + "logps/rejected": -826.4214477539062, + "loss": 0.3375, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.28005659580230713, + "rewards/margins": 1.3898392915725708, + "rewards/rejected": -1.6698957681655884, + "step": 150 + }, + { + "epoch": 0.13, + "grad_norm": 66.03579836821562, + "learning_rate": 1.994820339027588e-06, + "logits/chosen": -0.9007207155227661, + "logits/rejected": -0.8309506177902222, + "logps/chosen": -554.010009765625, + "logps/rejected": -609.4149169921875, + "loss": 0.4365, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41904115676879883, + "rewards/margins": 0.7743521928787231, + "rewards/rejected": -1.193393349647522, + "step": 151 + }, + { + "epoch": 0.13, + "grad_norm": 63.85789808273116, + "learning_rate": 1.9945114555270767e-06, + "logits/chosen": -0.8752983808517456, + "logits/rejected": -0.7941920757293701, + "logps/chosen": -763.06640625, + "logps/rejected": -773.3269653320312, + "loss": 0.3656, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3239164352416992, + "rewards/margins": 0.992270290851593, + "rewards/rejected": -1.3161866664886475, + "step": 152 + }, + { + "epoch": 0.13, + "grad_norm": 65.57581932182828, + "learning_rate": 1.994193652162532e-06, + "logits/chosen": -0.9188427925109863, + "logits/rejected": -0.855987548828125, + "logps/chosen": -587.0689086914062, + "logps/rejected": -663.8291015625, + "loss": 0.3913, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.26296913623809814, + "rewards/margins": 0.8903168439865112, + "rewards/rejected": -1.1532859802246094, + "step": 153 + }, + { + "epoch": 0.13, + "grad_norm": 67.85437233358816, + "learning_rate": 1.993866931784361e-06, + "logits/chosen": -0.8676232099533081, + "logits/rejected": -0.8185759782791138, + "logps/chosen": -503.1413269042969, + "logps/rejected": -732.5165405273438, + "loss": 0.396, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3670608699321747, + "rewards/margins": 0.9600380659103394, + "rewards/rejected": -1.3270988464355469, + "step": 154 + }, + { + "epoch": 0.13, + "grad_norm": 54.16281611590089, + "learning_rate": 1.9935312973229495e-06, + "logits/chosen": -0.8576045632362366, + "logits/rejected": -0.8100249767303467, + "logps/chosen": -475.8677062988281, + "logps/rejected": -678.9979248046875, + "loss": 0.3568, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.11242443323135376, + "rewards/margins": 1.0827581882476807, + "rewards/rejected": -1.1951825618743896, + "step": 155 + }, + { + "epoch": 0.13, + "grad_norm": 65.15718998860318, + "learning_rate": 1.993186751788633e-06, + "logits/chosen": -0.877051055431366, + "logits/rejected": -0.8274168372154236, + "logps/chosen": -448.50860595703125, + "logps/rejected": -672.7408447265625, + "loss": 0.4592, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.34052014350891113, + "rewards/margins": 0.9053160548210144, + "rewards/rejected": -1.2458362579345703, + "step": 156 + }, + { + "epoch": 0.13, + "grad_norm": 67.06517025156816, + "learning_rate": 1.992833298271672e-06, + "logits/chosen": -0.8686350584030151, + "logits/rejected": -0.8159289360046387, + "logps/chosen": -624.25830078125, + "logps/rejected": -718.9054565429688, + "loss": 0.3583, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3402397334575653, + "rewards/margins": 1.1835572719573975, + "rewards/rejected": -1.5237970352172852, + "step": 157 + }, + { + "epoch": 0.14, + "grad_norm": 74.89861873824249, + "learning_rate": 1.992470939942223e-06, + "logits/chosen": -0.8808647394180298, + "logits/rejected": -0.8215168118476868, + "logps/chosen": -559.8514404296875, + "logps/rejected": -716.53125, + "loss": 0.4391, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5647865533828735, + "rewards/margins": 0.8510969877243042, + "rewards/rejected": -1.4158835411071777, + "step": 158 + }, + { + "epoch": 0.14, + "grad_norm": 58.337214417043405, + "learning_rate": 1.9920996800503117e-06, + "logits/chosen": -0.8979121446609497, + "logits/rejected": -0.8257559537887573, + "logps/chosen": -567.0684814453125, + "logps/rejected": -770.5816650390625, + "loss": 0.4238, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.35424235463142395, + "rewards/margins": 0.9558961391448975, + "rewards/rejected": -1.310138463973999, + "step": 159 + }, + { + "epoch": 0.14, + "grad_norm": 80.57856746871143, + "learning_rate": 1.991719521925801e-06, + "logits/chosen": -0.8393473625183105, + "logits/rejected": -0.7686038017272949, + "logps/chosen": -573.0647583007812, + "logps/rejected": -683.67529296875, + "loss": 0.5307, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3153061866760254, + "rewards/margins": 0.6114298105239868, + "rewards/rejected": -0.9267358779907227, + "step": 160 + }, + { + "epoch": 0.14, + "grad_norm": 80.55202576409322, + "learning_rate": 1.9913304689783644e-06, + "logits/chosen": -0.8318350315093994, + "logits/rejected": -0.7842640280723572, + "logps/chosen": -634.771240234375, + "logps/rejected": -816.2799072265625, + "loss": 0.485, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5386346578598022, + "rewards/margins": 1.1234714984893799, + "rewards/rejected": -1.6621060371398926, + "step": 161 + }, + { + "epoch": 0.14, + "grad_norm": 56.32400126506285, + "learning_rate": 1.990932524697454e-06, + "logits/chosen": -0.8687608242034912, + "logits/rejected": -0.8195289373397827, + "logps/chosen": -427.2818298339844, + "logps/rejected": -599.0328369140625, + "loss": 0.381, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1253286600112915, + "rewards/margins": 0.9828024506568909, + "rewards/rejected": -1.1081310510635376, + "step": 162 + }, + { + "epoch": 0.14, + "grad_norm": 63.0643021542132, + "learning_rate": 1.990525692652267e-06, + "logits/chosen": -0.867060661315918, + "logits/rejected": -0.8365573883056641, + "logps/chosen": -675.904296875, + "logps/rejected": -826.5540771484375, + "loss": 0.3607, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33339425921440125, + "rewards/margins": 1.0543599128723145, + "rewards/rejected": -1.387754201889038, + "step": 163 + }, + { + "epoch": 0.14, + "grad_norm": 67.5672182666397, + "learning_rate": 1.990109976491718e-06, + "logits/chosen": -0.842487096786499, + "logits/rejected": -0.8210547566413879, + "logps/chosen": -516.2122192382812, + "logps/rejected": -652.6751708984375, + "loss": 0.4432, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3034408688545227, + "rewards/margins": 0.9074206948280334, + "rewards/rejected": -1.2108616828918457, + "step": 164 + }, + { + "epoch": 0.14, + "grad_norm": 61.464764505682616, + "learning_rate": 1.9896853799444026e-06, + "logits/chosen": -0.8813076019287109, + "logits/rejected": -0.8397949934005737, + "logps/chosen": -557.419677734375, + "logps/rejected": -787.8724365234375, + "loss": 0.4034, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3238334655761719, + "rewards/margins": 0.8356885313987732, + "rewards/rejected": -1.1595220565795898, + "step": 165 + }, + { + "epoch": 0.14, + "grad_norm": 64.47594791896671, + "learning_rate": 1.9892519068185667e-06, + "logits/chosen": -0.8496279716491699, + "logits/rejected": -0.7839698791503906, + "logps/chosen": -526.0557250976562, + "logps/rejected": -588.1710205078125, + "loss": 0.4638, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.06029529124498367, + "rewards/margins": 0.7168031930923462, + "rewards/rejected": -0.6565079689025879, + "step": 166 + }, + { + "epoch": 0.14, + "grad_norm": 54.170446969718355, + "learning_rate": 1.98880956100207e-06, + "logits/chosen": -0.8706177473068237, + "logits/rejected": -0.8323687314987183, + "logps/chosen": -409.9179992675781, + "logps/rejected": -655.827880859375, + "loss": 0.3739, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1820179671049118, + "rewards/margins": 0.9817320108413696, + "rewards/rejected": -1.1637499332427979, + "step": 167 + }, + { + "epoch": 0.14, + "grad_norm": 114.6857522864625, + "learning_rate": 1.9883583464623523e-06, + "logits/chosen": -0.9125653505325317, + "logits/rejected": -0.874718427658081, + "logps/chosen": -599.5379638671875, + "logps/rejected": -680.3572998046875, + "loss": 0.6713, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.444853812456131, + "rewards/margins": 0.5304194688796997, + "rewards/rejected": -0.9752731919288635, + "step": 168 + }, + { + "epoch": 0.14, + "grad_norm": 76.6076339710897, + "learning_rate": 1.9878982672463987e-06, + "logits/chosen": -0.9233865141868591, + "logits/rejected": -0.8617924451828003, + "logps/chosen": -559.2950439453125, + "logps/rejected": -761.9292602539062, + "loss": 0.4374, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4529637098312378, + "rewards/margins": 0.9664942622184753, + "rewards/rejected": -1.419458031654358, + "step": 169 + }, + { + "epoch": 0.15, + "grad_norm": 80.24554751811056, + "learning_rate": 1.987429327480701e-06, + "logits/chosen": -0.8420670032501221, + "logits/rejected": -0.8396788239479065, + "logps/chosen": -451.66748046875, + "logps/rejected": -740.0496826171875, + "loss": 0.5313, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.43624886870384216, + "rewards/margins": 0.7384995222091675, + "rewards/rejected": -1.1747483015060425, + "step": 170 + }, + { + "epoch": 0.15, + "grad_norm": 50.03612260766987, + "learning_rate": 1.9869515313712226e-06, + "logits/chosen": -0.9093887805938721, + "logits/rejected": -0.8626548051834106, + "logps/chosen": -497.70465087890625, + "logps/rejected": -659.1704711914062, + "loss": 0.3677, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3543227016925812, + "rewards/margins": 1.0449423789978027, + "rewards/rejected": -1.3992650508880615, + "step": 171 + }, + { + "epoch": 0.15, + "grad_norm": 77.8668671112791, + "learning_rate": 1.986464883203361e-06, + "logits/chosen": -0.8573567867279053, + "logits/rejected": -0.8139798641204834, + "logps/chosen": -475.87750244140625, + "logps/rejected": -573.9266357421875, + "loss": 0.4809, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.28878140449523926, + "rewards/margins": 0.7627514600753784, + "rewards/rejected": -1.0515328645706177, + "step": 172 + }, + { + "epoch": 0.15, + "grad_norm": 51.49285833661925, + "learning_rate": 1.985969387341908e-06, + "logits/chosen": -0.7776519656181335, + "logits/rejected": -0.7655439376831055, + "logps/chosen": -494.25653076171875, + "logps/rejected": -746.9952392578125, + "loss": 0.2972, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3533231019973755, + "rewards/margins": 1.2390961647033691, + "rewards/rejected": -1.5924192667007446, + "step": 173 + }, + { + "epoch": 0.15, + "grad_norm": 69.87605864741144, + "learning_rate": 1.985465048231011e-06, + "logits/chosen": -0.8837787508964539, + "logits/rejected": -0.8801698088645935, + "logps/chosen": -396.24224853515625, + "logps/rejected": -653.4398193359375, + "loss": 0.4339, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20290517807006836, + "rewards/margins": 0.9276418089866638, + "rewards/rejected": -1.130547046661377, + "step": 174 + }, + { + "epoch": 0.15, + "grad_norm": 61.25270124043899, + "learning_rate": 1.9849518703941335e-06, + "logits/chosen": -0.8431894779205322, + "logits/rejected": -0.7994276285171509, + "logps/chosen": -387.6629638671875, + "logps/rejected": -628.9801025390625, + "loss": 0.3654, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2903192639350891, + "rewards/margins": 0.9974017143249512, + "rewards/rejected": -1.2877209186553955, + "step": 175 + }, + { + "epoch": 0.15, + "grad_norm": 46.026434612264836, + "learning_rate": 1.9844298584340143e-06, + "logits/chosen": -0.8580862283706665, + "logits/rejected": -0.8199708461761475, + "logps/chosen": -487.328125, + "logps/rejected": -728.115234375, + "loss": 0.2489, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17964893579483032, + "rewards/margins": 1.5337719917297363, + "rewards/rejected": -1.7134208679199219, + "step": 176 + }, + { + "epoch": 0.15, + "grad_norm": 53.4694134691448, + "learning_rate": 1.9838990170326268e-06, + "logits/chosen": -0.923569917678833, + "logits/rejected": -0.8764654397964478, + "logps/chosen": -553.5648193359375, + "logps/rejected": -613.848876953125, + "loss": 0.3562, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2990966737270355, + "rewards/margins": 1.1244242191314697, + "rewards/rejected": -1.4235209226608276, + "step": 177 + }, + { + "epoch": 0.15, + "grad_norm": 59.50137781436025, + "learning_rate": 1.983359350951136e-06, + "logits/chosen": -0.8597773313522339, + "logits/rejected": -0.8245177865028381, + "logps/chosen": -515.1898803710938, + "logps/rejected": -726.3103637695312, + "loss": 0.3574, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3775560259819031, + "rewards/margins": 1.0746281147003174, + "rewards/rejected": -1.4521839618682861, + "step": 178 + }, + { + "epoch": 0.15, + "grad_norm": 46.84015049044058, + "learning_rate": 1.982810865029855e-06, + "logits/chosen": -0.8746932148933411, + "logits/rejected": -0.8448111414909363, + "logps/chosen": -353.440185546875, + "logps/rejected": -537.9382934570312, + "loss": 0.4036, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05355449020862579, + "rewards/margins": 0.8959289789199829, + "rewards/rejected": -0.9494835138320923, + "step": 179 + }, + { + "epoch": 0.15, + "grad_norm": 57.555815006437385, + "learning_rate": 1.9822535641882054e-06, + "logits/chosen": -0.9322961568832397, + "logits/rejected": -0.8470557928085327, + "logps/chosen": -650.664306640625, + "logps/rejected": -664.6771850585938, + "loss": 0.3578, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.353482186794281, + "rewards/margins": 1.0259697437286377, + "rewards/rejected": -0.6724876165390015, + "step": 180 + }, + { + "epoch": 0.16, + "grad_norm": 52.64380215131483, + "learning_rate": 1.9816874534246694e-06, + "logits/chosen": -0.94826340675354, + "logits/rejected": -0.8962709903717041, + "logps/chosen": -436.15155029296875, + "logps/rejected": -667.5868530273438, + "loss": 0.3527, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3777049779891968, + "rewards/margins": 1.048713207244873, + "rewards/rejected": -1.4264180660247803, + "step": 181 + }, + { + "epoch": 0.16, + "grad_norm": 61.337145217195584, + "learning_rate": 1.981112537816745e-06, + "logits/chosen": -0.9093761444091797, + "logits/rejected": -0.8457399606704712, + "logps/chosen": -687.2678833007812, + "logps/rejected": -717.3057861328125, + "loss": 0.3736, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5045949220657349, + "rewards/margins": 1.1356408596038818, + "rewards/rejected": -1.6402359008789062, + "step": 182 + }, + { + "epoch": 0.16, + "grad_norm": 55.882554466879625, + "learning_rate": 1.9805288225209037e-06, + "logits/chosen": -0.9058928489685059, + "logits/rejected": -0.8362715244293213, + "logps/chosen": -718.281005859375, + "logps/rejected": -1016.6908569335938, + "loss": 0.2412, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5235175490379333, + "rewards/margins": 1.552617073059082, + "rewards/rejected": -2.07613468170166, + "step": 183 + }, + { + "epoch": 0.16, + "grad_norm": 85.18071092892326, + "learning_rate": 1.979936312772541e-06, + "logits/chosen": -0.8438854217529297, + "logits/rejected": -0.8518179059028625, + "logps/chosen": -588.9202880859375, + "logps/rejected": -629.9645385742188, + "loss": 0.487, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6537317037582397, + "rewards/margins": 0.815560519695282, + "rewards/rejected": -1.469292163848877, + "step": 184 + }, + { + "epoch": 0.16, + "grad_norm": 40.526405519388895, + "learning_rate": 1.979335013885931e-06, + "logits/chosen": -0.9247338771820068, + "logits/rejected": -0.8627129793167114, + "logps/chosen": -581.0001220703125, + "logps/rejected": -817.584716796875, + "loss": 0.2342, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21191178262233734, + "rewards/margins": 1.7073981761932373, + "rewards/rejected": -1.9193100929260254, + "step": 185 + }, + { + "epoch": 0.16, + "grad_norm": 66.44985379036187, + "learning_rate": 1.978724931254178e-06, + "logits/chosen": -0.914710283279419, + "logits/rejected": -0.8438661694526672, + "logps/chosen": -668.4779052734375, + "logps/rejected": -792.9580078125, + "loss": 0.3533, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5750971436500549, + "rewards/margins": 1.072385549545288, + "rewards/rejected": -1.6474826335906982, + "step": 186 + }, + { + "epoch": 0.16, + "grad_norm": 86.06670016108346, + "learning_rate": 1.9781060703491694e-06, + "logits/chosen": -0.8819316029548645, + "logits/rejected": -0.8386082053184509, + "logps/chosen": -863.5635375976562, + "logps/rejected": -774.0555419921875, + "loss": 0.4269, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5523691177368164, + "rewards/margins": 0.9685525894165039, + "rewards/rejected": -1.5209217071533203, + "step": 187 + }, + { + "epoch": 0.16, + "grad_norm": 55.92321246323812, + "learning_rate": 1.9774784367215245e-06, + "logits/chosen": -0.9131249189376831, + "logits/rejected": -0.8162829875946045, + "logps/chosen": -660.4703369140625, + "logps/rejected": -848.2841796875, + "loss": 0.2694, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3814217150211334, + "rewards/margins": 1.388995885848999, + "rewards/rejected": -1.7704176902770996, + "step": 188 + }, + { + "epoch": 0.16, + "grad_norm": 47.296797461987886, + "learning_rate": 1.976842036000547e-06, + "logits/chosen": -0.8920987844467163, + "logits/rejected": -0.8426529169082642, + "logps/chosen": -641.9872436523438, + "logps/rejected": -836.463623046875, + "loss": 0.254, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2708077132701874, + "rewards/margins": 1.5072784423828125, + "rewards/rejected": -1.7780860662460327, + "step": 189 + }, + { + "epoch": 0.16, + "grad_norm": 53.86538055736526, + "learning_rate": 1.976196873894173e-06, + "logits/chosen": -0.8458589315414429, + "logits/rejected": -0.8193198442459106, + "logps/chosen": -632.2750854492188, + "logps/rejected": -753.638916015625, + "loss": 0.3357, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.37582093477249146, + "rewards/margins": 1.048279881477356, + "rewards/rejected": -1.4241008758544922, + "step": 190 + }, + { + "epoch": 0.16, + "grad_norm": 54.769017667295614, + "learning_rate": 1.9755429561889205e-06, + "logits/chosen": -0.9080367088317871, + "logits/rejected": -0.8948763608932495, + "logps/chosen": -532.4638671875, + "logps/rejected": -732.6849365234375, + "loss": 0.3107, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3257588744163513, + "rewards/margins": 1.5003809928894043, + "rewards/rejected": -1.8261399269104004, + "step": 191 + }, + { + "epoch": 0.16, + "grad_norm": 49.30835267251258, + "learning_rate": 1.9748802887498368e-06, + "logits/chosen": -0.8217437267303467, + "logits/rejected": -0.8103574514389038, + "logps/chosen": -381.0695495605469, + "logps/rejected": -666.2606201171875, + "loss": 0.3521, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.370362251996994, + "rewards/margins": 1.2242530584335327, + "rewards/rejected": -1.5946152210235596, + "step": 192 + }, + { + "epoch": 0.17, + "grad_norm": 39.70677456901692, + "learning_rate": 1.9742088775204463e-06, + "logits/chosen": -0.9067370891571045, + "logits/rejected": -0.8767822980880737, + "logps/chosen": -338.2500915527344, + "logps/rejected": -571.2421875, + "loss": 0.3016, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.18578293919563293, + "rewards/margins": 1.3771023750305176, + "rewards/rejected": -1.5628852844238281, + "step": 193 + }, + { + "epoch": 0.17, + "grad_norm": 37.226494875826404, + "learning_rate": 1.9735287285226984e-06, + "logits/chosen": -0.82257080078125, + "logits/rejected": -0.7932236194610596, + "logps/chosen": -422.1122131347656, + "logps/rejected": -737.2191162109375, + "loss": 0.2516, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.22484631836414337, + "rewards/margins": 1.4313842058181763, + "rewards/rejected": -1.6562305688858032, + "step": 194 + }, + { + "epoch": 0.17, + "grad_norm": 70.16182828765132, + "learning_rate": 1.9728398478569113e-06, + "logits/chosen": -0.9176048040390015, + "logits/rejected": -0.8764656782150269, + "logps/chosen": -725.5680541992188, + "logps/rejected": -755.45849609375, + "loss": 0.3571, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3184995651245117, + "rewards/margins": 1.065685510635376, + "rewards/rejected": -1.3841850757598877, + "step": 195 + }, + { + "epoch": 0.17, + "grad_norm": 34.46849454206544, + "learning_rate": 1.9721422417017185e-06, + "logits/chosen": -0.9285256862640381, + "logits/rejected": -0.882872462272644, + "logps/chosen": -428.5306091308594, + "logps/rejected": -768.4300537109375, + "loss": 0.1898, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3067496120929718, + "rewards/margins": 1.816524863243103, + "rewards/rejected": -2.123274564743042, + "step": 196 + }, + { + "epoch": 0.17, + "grad_norm": 68.90183135392742, + "learning_rate": 1.971435916314013e-06, + "logits/chosen": -0.851874589920044, + "logits/rejected": -0.8348385095596313, + "logps/chosen": -493.03045654296875, + "logps/rejected": -677.19189453125, + "loss": 0.4673, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6967003345489502, + "rewards/margins": 1.1165249347686768, + "rewards/rejected": -1.813225269317627, + "step": 197 + }, + { + "epoch": 0.17, + "grad_norm": 47.658400615569406, + "learning_rate": 1.970720878028892e-06, + "logits/chosen": -0.9000753164291382, + "logits/rejected": -0.8680911064147949, + "logps/chosen": -508.72967529296875, + "logps/rejected": -634.1900634765625, + "loss": 0.3581, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3899848759174347, + "rewards/margins": 1.2031042575836182, + "rewards/rejected": -1.5930891036987305, + "step": 198 + }, + { + "epoch": 0.17, + "grad_norm": 55.44563313958047, + "learning_rate": 1.9699971332595994e-06, + "logits/chosen": -0.9187253713607788, + "logits/rejected": -0.9054923057556152, + "logps/chosen": -457.06787109375, + "logps/rejected": -592.365966796875, + "loss": 0.3716, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3505123257637024, + "rewards/margins": 1.2057658433914185, + "rewards/rejected": -1.556278109550476, + "step": 199 + }, + { + "epoch": 0.17, + "grad_norm": 50.47942610292587, + "learning_rate": 1.9692646884974677e-06, + "logits/chosen": -0.8936487436294556, + "logits/rejected": -0.891660213470459, + "logps/chosen": -641.30322265625, + "logps/rejected": -801.561767578125, + "loss": 0.3059, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.541850209236145, + "rewards/margins": 1.4763543605804443, + "rewards/rejected": -2.018204689025879, + "step": 200 + }, + { + "epoch": 0.17, + "grad_norm": 55.01255322615232, + "learning_rate": 1.968523550311861e-06, + "logits/chosen": -0.789238452911377, + "logits/rejected": -0.7798421382904053, + "logps/chosen": -410.76922607421875, + "logps/rejected": -635.4869995117188, + "loss": 0.346, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5689191818237305, + "rewards/margins": 1.2248907089233398, + "rewards/rejected": -1.7938098907470703, + "step": 201 + }, + { + "epoch": 0.17, + "grad_norm": 50.19339087306107, + "learning_rate": 1.967773725350115e-06, + "logits/chosen": -0.8648644685745239, + "logits/rejected": -0.8245384693145752, + "logps/chosen": -513.4756469726562, + "logps/rejected": -949.0679321289062, + "loss": 0.2709, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.48574915528297424, + "rewards/margins": 1.6821191310882568, + "rewards/rejected": -2.167868137359619, + "step": 202 + }, + { + "epoch": 0.17, + "grad_norm": 46.244053912991895, + "learning_rate": 1.9670152203374792e-06, + "logits/chosen": -0.9518221020698547, + "logits/rejected": -0.912643551826477, + "logps/chosen": -523.580078125, + "logps/rejected": -801.155029296875, + "loss": 0.2803, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4601432681083679, + "rewards/margins": 1.4460601806640625, + "rewards/rejected": -1.9062033891677856, + "step": 203 + }, + { + "epoch": 0.17, + "grad_norm": 86.33604808746128, + "learning_rate": 1.9662480420770532e-06, + "logits/chosen": -0.994076132774353, + "logits/rejected": -0.9478283524513245, + "logps/chosen": -551.2991943359375, + "logps/rejected": -729.0390014648438, + "loss": 0.4418, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9313852190971375, + "rewards/margins": 0.9562951326370239, + "rewards/rejected": -1.8876804113388062, + "step": 204 + }, + { + "epoch": 0.18, + "grad_norm": 63.281110377931036, + "learning_rate": 1.965472197449729e-06, + "logits/chosen": -0.9953914880752563, + "logits/rejected": -0.9329037666320801, + "logps/chosen": -452.6296691894531, + "logps/rejected": -581.63671875, + "loss": 0.3916, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6000749468803406, + "rewards/margins": 0.8758547306060791, + "rewards/rejected": -1.4759297370910645, + "step": 205 + }, + { + "epoch": 0.18, + "grad_norm": 61.35086829140238, + "learning_rate": 1.964687693414129e-06, + "logits/chosen": -0.979865550994873, + "logits/rejected": -0.9563643932342529, + "logps/chosen": -581.7698364257812, + "logps/rejected": -679.9580078125, + "loss": 0.4202, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6248217225074768, + "rewards/margins": 1.460896372795105, + "rewards/rejected": -2.0857181549072266, + "step": 206 + }, + { + "epoch": 0.18, + "grad_norm": 66.71900425954713, + "learning_rate": 1.96389453700654e-06, + "logits/chosen": -1.0769457817077637, + "logits/rejected": -1.0209407806396484, + "logps/chosen": -532.283203125, + "logps/rejected": -622.6073608398438, + "loss": 0.387, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5653808116912842, + "rewards/margins": 1.0661849975585938, + "rewards/rejected": -1.631565809249878, + "step": 207 + }, + { + "epoch": 0.18, + "grad_norm": 62.19269408403389, + "learning_rate": 1.9630927353408553e-06, + "logits/chosen": -1.0014283657073975, + "logits/rejected": -0.978428840637207, + "logps/chosen": -429.43206787109375, + "logps/rejected": -454.16217041015625, + "loss": 0.4662, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.2503443956375122, + "rewards/margins": 0.6810840964317322, + "rewards/rejected": -0.9314284324645996, + "step": 208 + }, + { + "epoch": 0.18, + "grad_norm": 50.77513215236795, + "learning_rate": 1.9622822956085064e-06, + "logits/chosen": -0.9936904907226562, + "logits/rejected": -0.9656355381011963, + "logps/chosen": -349.3846435546875, + "logps/rejected": -486.734375, + "loss": 0.3967, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2877790927886963, + "rewards/margins": 0.8903200626373291, + "rewards/rejected": -1.1780990362167358, + "step": 209 + }, + { + "epoch": 0.18, + "grad_norm": 47.75737657842299, + "learning_rate": 1.961463225078402e-06, + "logits/chosen": -1.0057542324066162, + "logits/rejected": -0.970438539981842, + "logps/chosen": -588.5496215820312, + "logps/rejected": -781.412841796875, + "loss": 0.3138, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.48664844036102295, + "rewards/margins": 1.434399127960205, + "rewards/rejected": -1.9210474491119385, + "step": 210 + }, + { + "epoch": 0.18, + "grad_norm": 79.01899526621891, + "learning_rate": 1.96063553109686e-06, + "logits/chosen": -1.0867319107055664, + "logits/rejected": -1.0168215036392212, + "logps/chosen": -487.307861328125, + "logps/rejected": -644.064208984375, + "loss": 0.4499, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6227390766143799, + "rewards/margins": 0.8902217149734497, + "rewards/rejected": -1.5129609107971191, + "step": 211 + }, + { + "epoch": 0.18, + "grad_norm": 82.77598850049955, + "learning_rate": 1.9597992210875437e-06, + "logits/chosen": -1.08547043800354, + "logits/rejected": -1.0560569763183594, + "logps/chosen": -607.5922241210938, + "logps/rejected": -690.8724365234375, + "loss": 0.5202, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5151211619377136, + "rewards/margins": 1.051287055015564, + "rewards/rejected": -1.5664081573486328, + "step": 212 + }, + { + "epoch": 0.18, + "grad_norm": 48.76088540859079, + "learning_rate": 1.9589543025513933e-06, + "logits/chosen": -1.059824824333191, + "logits/rejected": -0.9934217929840088, + "logps/chosen": -470.1881408691406, + "logps/rejected": -723.36474609375, + "loss": 0.3258, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30257856845855713, + "rewards/margins": 1.6323356628417969, + "rewards/rejected": -1.9349143505096436, + "step": 213 + }, + { + "epoch": 0.18, + "grad_norm": 47.021049477346295, + "learning_rate": 1.958100783066561e-06, + "logits/chosen": -0.962358295917511, + "logits/rejected": -0.9208459854125977, + "logps/chosen": -513.943359375, + "logps/rejected": -723.7570190429688, + "loss": 0.2964, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4522506594657898, + "rewards/margins": 1.5093984603881836, + "rewards/rejected": -1.961648941040039, + "step": 214 + }, + { + "epoch": 0.18, + "grad_norm": 67.28984682350877, + "learning_rate": 1.9572386702883406e-06, + "logits/chosen": -1.0783056020736694, + "logits/rejected": -0.9888733625411987, + "logps/chosen": -603.6023559570312, + "logps/rejected": -753.92431640625, + "loss": 0.365, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4725470542907715, + "rewards/margins": 1.2321484088897705, + "rewards/rejected": -1.704695463180542, + "step": 215 + }, + { + "epoch": 0.19, + "grad_norm": 63.185655249618556, + "learning_rate": 1.9563679719491004e-06, + "logits/chosen": -1.0363142490386963, + "logits/rejected": -0.9776827692985535, + "logps/chosen": -572.5197143554688, + "logps/rejected": -808.026611328125, + "loss": 0.2894, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6176286339759827, + "rewards/margins": 1.396406888961792, + "rewards/rejected": -2.01403546333313, + "step": 216 + }, + { + "epoch": 0.19, + "grad_norm": 67.44667093371345, + "learning_rate": 1.955488695858213e-06, + "logits/chosen": -0.9920237064361572, + "logits/rejected": -0.9496505260467529, + "logps/chosen": -657.737548828125, + "logps/rejected": -717.6353759765625, + "loss": 0.3514, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7767983078956604, + "rewards/margins": 1.014103889465332, + "rewards/rejected": -1.7909021377563477, + "step": 217 + }, + { + "epoch": 0.19, + "grad_norm": 53.50894829446239, + "learning_rate": 1.9546008499019862e-06, + "logits/chosen": -0.9635003805160522, + "logits/rejected": -0.9544948935508728, + "logps/chosen": -453.1898193359375, + "logps/rejected": -485.15679931640625, + "loss": 0.3781, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.20648762583732605, + "rewards/margins": 1.0868135690689087, + "rewards/rejected": -1.2933012247085571, + "step": 218 + }, + { + "epoch": 0.19, + "grad_norm": 66.84092485372332, + "learning_rate": 1.953704442043591e-06, + "logits/chosen": -1.0142822265625, + "logits/rejected": -0.9846318960189819, + "logps/chosen": -579.3690185546875, + "logps/rejected": -646.5301513671875, + "loss": 0.4208, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5368911027908325, + "rewards/margins": 0.9143826961517334, + "rewards/rejected": -1.4512736797332764, + "step": 219 + }, + { + "epoch": 0.19, + "grad_norm": 54.394417151247545, + "learning_rate": 1.9527994803229923e-06, + "logits/chosen": -1.0889610052108765, + "logits/rejected": -1.04813814163208, + "logps/chosen": -565.11767578125, + "logps/rejected": -727.020263671875, + "loss": 0.3149, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.34967750310897827, + "rewards/margins": 1.3908071517944336, + "rewards/rejected": -1.7404847145080566, + "step": 220 + }, + { + "epoch": 0.19, + "grad_norm": 51.361945382744985, + "learning_rate": 1.9518859728568736e-06, + "logits/chosen": -1.0240240097045898, + "logits/rejected": -0.9509984850883484, + "logps/chosen": -501.5472717285156, + "logps/rejected": -812.7438354492188, + "loss": 0.2624, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5783480405807495, + "rewards/margins": 1.6443132162094116, + "rewards/rejected": -2.222661256790161, + "step": 221 + }, + { + "epoch": 0.19, + "grad_norm": 48.47503303116554, + "learning_rate": 1.950963927838567e-06, + "logits/chosen": -0.9973111152648926, + "logits/rejected": -0.9358306527137756, + "logps/chosen": -522.234130859375, + "logps/rejected": -693.0299682617188, + "loss": 0.2573, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5451964139938354, + "rewards/margins": 1.5520622730255127, + "rewards/rejected": -2.0972585678100586, + "step": 222 + }, + { + "epoch": 0.19, + "grad_norm": 51.089371754814614, + "learning_rate": 1.9500333535379783e-06, + "logits/chosen": -1.0092889070510864, + "logits/rejected": -0.9825106859207153, + "logps/chosen": -426.1991882324219, + "logps/rejected": -652.1566772460938, + "loss": 0.3307, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3618304133415222, + "rewards/margins": 1.24405837059021, + "rewards/rejected": -1.6058887243270874, + "step": 223 + }, + { + "epoch": 0.19, + "grad_norm": 60.37540059428791, + "learning_rate": 1.949094258301513e-06, + "logits/chosen": -1.0221076011657715, + "logits/rejected": -0.9619882106781006, + "logps/chosen": -504.2724914550781, + "logps/rejected": -607.927001953125, + "loss": 0.4181, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.000596955418586731, + "rewards/margins": 0.7537394762039185, + "rewards/rejected": -0.7543364763259888, + "step": 224 + }, + { + "epoch": 0.19, + "grad_norm": 80.2573736713337, + "learning_rate": 1.9481466505520034e-06, + "logits/chosen": -1.03997802734375, + "logits/rejected": -0.9718859195709229, + "logps/chosen": -615.118896484375, + "logps/rejected": -781.4096069335938, + "loss": 0.479, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7925766706466675, + "rewards/margins": 0.7858482599258423, + "rewards/rejected": -1.5784249305725098, + "step": 225 + }, + { + "epoch": 0.19, + "grad_norm": 84.60494731211561, + "learning_rate": 1.947190538788628e-06, + "logits/chosen": -1.0425546169281006, + "logits/rejected": -0.9868509769439697, + "logps/chosen": -558.9896240234375, + "logps/rejected": -764.4346923828125, + "loss": 0.4825, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5591992139816284, + "rewards/margins": 1.024910807609558, + "rewards/rejected": -1.5841100215911865, + "step": 226 + }, + { + "epoch": 0.19, + "grad_norm": 60.8006500130628, + "learning_rate": 1.946225931586842e-06, + "logits/chosen": -1.0703706741333008, + "logits/rejected": -1.0248998403549194, + "logps/chosen": -491.8314208984375, + "logps/rejected": -698.8748779296875, + "loss": 0.3613, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6801337003707886, + "rewards/margins": 1.0860910415649414, + "rewards/rejected": -1.7662248611450195, + "step": 227 + }, + { + "epoch": 0.2, + "grad_norm": 57.94489583576497, + "learning_rate": 1.9452528375982947e-06, + "logits/chosen": -1.0509393215179443, + "logits/rejected": -0.9977148771286011, + "logps/chosen": -656.40625, + "logps/rejected": -723.1608276367188, + "loss": 0.3023, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2907889485359192, + "rewards/margins": 1.4880188703536987, + "rewards/rejected": -1.7788077592849731, + "step": 228 + }, + { + "epoch": 0.2, + "grad_norm": 50.233865363274404, + "learning_rate": 1.9442712655507552e-06, + "logits/chosen": -1.024355173110962, + "logits/rejected": -0.9878383874893188, + "logps/chosen": -269.308837890625, + "logps/rejected": -605.1350708007812, + "loss": 0.3081, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.39276325702667236, + "rewards/margins": 1.510910987854004, + "rewards/rejected": -1.9036743640899658, + "step": 229 + }, + { + "epoch": 0.2, + "grad_norm": 49.79390182856688, + "learning_rate": 1.9432812242480326e-06, + "logits/chosen": -1.0242599248886108, + "logits/rejected": -0.9582304954528809, + "logps/chosen": -533.2835693359375, + "logps/rejected": -1009.9631958007812, + "loss": 0.2663, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7415204048156738, + "rewards/margins": 2.0787625312805176, + "rewards/rejected": -2.8202829360961914, + "step": 230 + }, + { + "epoch": 0.2, + "grad_norm": 65.90741394993712, + "learning_rate": 1.9422827225698976e-06, + "logits/chosen": -1.0297460556030273, + "logits/rejected": -0.9815988540649414, + "logps/chosen": -566.6522216796875, + "logps/rejected": -730.4981079101562, + "loss": 0.3494, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5851923823356628, + "rewards/margins": 1.1192116737365723, + "rewards/rejected": -1.7044041156768799, + "step": 231 + }, + { + "epoch": 0.2, + "grad_norm": 61.83529221894195, + "learning_rate": 1.9412757694720036e-06, + "logits/chosen": -1.1333967447280884, + "logits/rejected": -1.0697953701019287, + "logps/chosen": -572.987548828125, + "logps/rejected": -693.1698608398438, + "loss": 0.3909, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7623158693313599, + "rewards/margins": 1.2518970966339111, + "rewards/rejected": -2.0142128467559814, + "step": 232 + }, + { + "epoch": 0.2, + "grad_norm": 56.26007991200333, + "learning_rate": 1.9402603739858045e-06, + "logits/chosen": -1.0455517768859863, + "logits/rejected": -1.022415280342102, + "logps/chosen": -481.7862243652344, + "logps/rejected": -479.995849609375, + "loss": 0.4368, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4785584807395935, + "rewards/margins": 0.7565988302230835, + "rewards/rejected": -1.2351572513580322, + "step": 233 + }, + { + "epoch": 0.2, + "grad_norm": 53.78226139691404, + "learning_rate": 1.9392365452184743e-06, + "logits/chosen": -1.2446177005767822, + "logits/rejected": -1.1865254640579224, + "logps/chosen": -427.7890625, + "logps/rejected": -636.18017578125, + "loss": 0.3274, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48274075984954834, + "rewards/margins": 1.0873993635177612, + "rewards/rejected": -1.5701401233673096, + "step": 234 + }, + { + "epoch": 0.2, + "grad_norm": 51.257742769219234, + "learning_rate": 1.938204292352828e-06, + "logits/chosen": -1.0768308639526367, + "logits/rejected": -1.0284702777862549, + "logps/chosen": -443.66351318359375, + "logps/rejected": -646.884033203125, + "loss": 0.3885, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6171982288360596, + "rewards/margins": 1.1787415742874146, + "rewards/rejected": -1.7959399223327637, + "step": 235 + }, + { + "epoch": 0.2, + "grad_norm": 57.867770622832765, + "learning_rate": 1.9371636246472353e-06, + "logits/chosen": -1.2808043956756592, + "logits/rejected": -1.2098761796951294, + "logps/chosen": -390.9673767089844, + "logps/rejected": -572.98388671875, + "loss": 0.4232, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.521543562412262, + "rewards/margins": 1.1939754486083984, + "rewards/rejected": -1.7155190706253052, + "step": 236 + }, + { + "epoch": 0.2, + "grad_norm": 55.39431785341968, + "learning_rate": 1.936114551435539e-06, + "logits/chosen": -1.2188622951507568, + "logits/rejected": -1.1490856409072876, + "logps/chosen": -390.39910888671875, + "logps/rejected": -651.4248046875, + "loss": 0.3413, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.42151105403900146, + "rewards/margins": 1.3620014190673828, + "rewards/rejected": -1.7835125923156738, + "step": 237 + }, + { + "epoch": 0.2, + "grad_norm": 73.23920168715607, + "learning_rate": 1.935057082126974e-06, + "logits/chosen": -1.290961742401123, + "logits/rejected": -1.2376530170440674, + "logps/chosen": -544.6636962890625, + "logps/rejected": -715.2994995117188, + "loss": 0.405, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37994012236595154, + "rewards/margins": 1.172218918800354, + "rewards/rejected": -1.552159070968628, + "step": 238 + }, + { + "epoch": 0.2, + "grad_norm": 63.741831466021814, + "learning_rate": 1.9339912262060782e-06, + "logits/chosen": -1.1444615125656128, + "logits/rejected": -1.0806961059570312, + "logps/chosen": -594.5799560546875, + "logps/rejected": -838.465087890625, + "loss": 0.318, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4210588335990906, + "rewards/margins": 1.4533946514129639, + "rewards/rejected": -1.8744535446166992, + "step": 239 + }, + { + "epoch": 0.21, + "grad_norm": 64.76955820674983, + "learning_rate": 1.9329169932326104e-06, + "logits/chosen": -1.163501501083374, + "logits/rejected": -1.1278737783432007, + "logps/chosen": -480.21234130859375, + "logps/rejected": -577.05078125, + "loss": 0.4336, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6075069308280945, + "rewards/margins": 0.9887516498565674, + "rewards/rejected": -1.5962586402893066, + "step": 240 + }, + { + "epoch": 0.21, + "grad_norm": 44.25371144145174, + "learning_rate": 1.9318343928414642e-06, + "logits/chosen": -1.106231927871704, + "logits/rejected": -1.084270715713501, + "logps/chosen": -518.6808471679688, + "logps/rejected": -728.62548828125, + "loss": 0.2596, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43097513914108276, + "rewards/margins": 1.509975552558899, + "rewards/rejected": -1.940950632095337, + "step": 241 + }, + { + "epoch": 0.21, + "grad_norm": 72.96662080811308, + "learning_rate": 1.9307434347425826e-06, + "logits/chosen": -1.1415469646453857, + "logits/rejected": -1.0857826471328735, + "logps/chosen": -627.71728515625, + "logps/rejected": -860.5419921875, + "loss": 0.2612, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6891728639602661, + "rewards/margins": 1.6753425598144531, + "rewards/rejected": -2.3645153045654297, + "step": 242 + }, + { + "epoch": 0.21, + "grad_norm": 37.9024385841564, + "learning_rate": 1.929644128720867e-06, + "logits/chosen": -1.196105718612671, + "logits/rejected": -1.1384600400924683, + "logps/chosen": -535.7288208007812, + "logps/rejected": -742.9649047851562, + "loss": 0.2388, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5874618291854858, + "rewards/margins": 1.8706109523773193, + "rewards/rejected": -2.4580729007720947, + "step": 243 + }, + { + "epoch": 0.21, + "grad_norm": 55.95864101578566, + "learning_rate": 1.9285364846360943e-06, + "logits/chosen": -1.1476514339447021, + "logits/rejected": -1.1225823163986206, + "logps/chosen": -625.8814697265625, + "logps/rejected": -686.959716796875, + "loss": 0.3768, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6543445587158203, + "rewards/margins": 1.4013875722885132, + "rewards/rejected": -2.055732250213623, + "step": 244 + }, + { + "epoch": 0.21, + "grad_norm": 73.63823079566366, + "learning_rate": 1.9274205124228243e-06, + "logits/chosen": -1.0947630405426025, + "logits/rejected": -1.04892897605896, + "logps/chosen": -478.984375, + "logps/rejected": -659.949951171875, + "loss": 0.4166, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6773884296417236, + "rewards/margins": 1.2850854396820068, + "rewards/rejected": -1.9624738693237305, + "step": 245 + }, + { + "epoch": 0.21, + "grad_norm": 83.81728984916164, + "learning_rate": 1.926296222090315e-06, + "logits/chosen": -1.0103175640106201, + "logits/rejected": -1.0097943544387817, + "logps/chosen": -879.529541015625, + "logps/rejected": -963.75537109375, + "loss": 0.2921, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1037704944610596, + "rewards/margins": 1.5854477882385254, + "rewards/rejected": -2.689218044281006, + "step": 246 + }, + { + "epoch": 0.21, + "grad_norm": 64.47092513570345, + "learning_rate": 1.925163623722428e-06, + "logits/chosen": -1.226211667060852, + "logits/rejected": -1.181449294090271, + "logps/chosen": -425.39190673828125, + "logps/rejected": -598.806884765625, + "loss": 0.4008, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5162314176559448, + "rewards/margins": 1.3047699928283691, + "rewards/rejected": -1.8210015296936035, + "step": 247 + }, + { + "epoch": 0.21, + "grad_norm": 41.54494614357395, + "learning_rate": 1.9240227274775424e-06, + "logits/chosen": -1.1234793663024902, + "logits/rejected": -1.0364389419555664, + "logps/chosen": -313.11126708984375, + "logps/rejected": -506.63958740234375, + "loss": 0.3275, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.37168002128601074, + "rewards/margins": 1.317708969116211, + "rewards/rejected": -1.6893889904022217, + "step": 248 + }, + { + "epoch": 0.21, + "grad_norm": 32.141234054176465, + "learning_rate": 1.9228735435884606e-06, + "logits/chosen": -1.1416168212890625, + "logits/rejected": -1.0254522562026978, + "logps/chosen": -370.5738525390625, + "logps/rejected": -811.5826416015625, + "loss": 0.1658, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25858139991760254, + "rewards/margins": 2.1482415199279785, + "rewards/rejected": -2.406822681427002, + "step": 249 + }, + { + "epoch": 0.21, + "grad_norm": 64.94292695390094, + "learning_rate": 1.9217160823623165e-06, + "logits/chosen": -1.029049277305603, + "logits/rejected": -0.9953701496124268, + "logps/chosen": -461.535400390625, + "logps/rejected": -589.9942626953125, + "loss": 0.3597, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6874184012413025, + "rewards/margins": 1.177901029586792, + "rewards/rejected": -1.8653193712234497, + "step": 250 + }, + { + "epoch": 0.22, + "grad_norm": 41.45077725222385, + "learning_rate": 1.920550354180487e-06, + "logits/chosen": -1.163294792175293, + "logits/rejected": -1.0786793231964111, + "logps/chosen": -449.732177734375, + "logps/rejected": -768.60498046875, + "loss": 0.2243, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.49330294132232666, + "rewards/margins": 1.8286638259887695, + "rewards/rejected": -2.3219666481018066, + "step": 251 + }, + { + "epoch": 0.22, + "grad_norm": 41.54008016919367, + "learning_rate": 1.919376369498494e-06, + "logits/chosen": -1.3669646978378296, + "logits/rejected": -1.305161476135254, + "logps/chosen": -576.000244140625, + "logps/rejected": -688.1344604492188, + "loss": 0.2346, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6032384634017944, + "rewards/margins": 1.8792409896850586, + "rewards/rejected": -2.4824795722961426, + "step": 252 + }, + { + "epoch": 0.22, + "grad_norm": 39.54645804416662, + "learning_rate": 1.9181941388459134e-06, + "logits/chosen": -1.297188639640808, + "logits/rejected": -1.179163932800293, + "logps/chosen": -419.82672119140625, + "logps/rejected": -684.82666015625, + "loss": 0.1996, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4974091351032257, + "rewards/margins": 1.855064034461975, + "rewards/rejected": -2.352473258972168, + "step": 253 + }, + { + "epoch": 0.22, + "grad_norm": 66.18713318470269, + "learning_rate": 1.91700367282628e-06, + "logits/chosen": -1.2055079936981201, + "logits/rejected": -1.1353366374969482, + "logps/chosen": -578.854736328125, + "logps/rejected": -769.0565185546875, + "loss": 0.3484, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7658222913742065, + "rewards/margins": 1.182613492012024, + "rewards/rejected": -1.9484357833862305, + "step": 254 + }, + { + "epoch": 0.22, + "grad_norm": 65.30291958637173, + "learning_rate": 1.9158049821169918e-06, + "logits/chosen": -1.1541709899902344, + "logits/rejected": -1.1019580364227295, + "logps/chosen": -475.2603454589844, + "logps/rejected": -729.6705322265625, + "loss": 0.4214, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.21428640186786652, + "rewards/margins": 1.0960335731506348, + "rewards/rejected": -1.3103199005126953, + "step": 255 + }, + { + "epoch": 0.22, + "grad_norm": 54.104609363402034, + "learning_rate": 1.9145980774692156e-06, + "logits/chosen": -1.2529370784759521, + "logits/rejected": -1.2374582290649414, + "logps/chosen": -668.109375, + "logps/rejected": -857.1074829101562, + "loss": 0.3398, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7821727991104126, + "rewards/margins": 1.554344654083252, + "rewards/rejected": -2.336517333984375, + "step": 256 + }, + { + "epoch": 0.22, + "grad_norm": 39.49616108710431, + "learning_rate": 1.913382969707789e-06, + "logits/chosen": -1.3481879234313965, + "logits/rejected": -1.238132357597351, + "logps/chosen": -383.8392333984375, + "logps/rejected": -611.764404296875, + "loss": 0.2138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3953891396522522, + "rewards/margins": 1.674397587776184, + "rewards/rejected": -2.069786548614502, + "step": 257 + }, + { + "epoch": 0.22, + "grad_norm": 57.582765320833765, + "learning_rate": 1.9121596697311243e-06, + "logits/chosen": -1.295384168624878, + "logits/rejected": -1.2667222023010254, + "logps/chosen": -457.3524169921875, + "logps/rejected": -507.17449951171875, + "loss": 0.4294, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4994971752166748, + "rewards/margins": 1.1396489143371582, + "rewards/rejected": -1.639146089553833, + "step": 258 + }, + { + "epoch": 0.22, + "grad_norm": 44.576501979807574, + "learning_rate": 1.910928188511111e-06, + "logits/chosen": -1.305537462234497, + "logits/rejected": -1.2473948001861572, + "logps/chosen": -395.11907958984375, + "logps/rejected": -581.2030029296875, + "loss": 0.2701, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6577807664871216, + "rewards/margins": 1.5606365203857422, + "rewards/rejected": -2.2184174060821533, + "step": 259 + }, + { + "epoch": 0.22, + "grad_norm": 54.85712220293985, + "learning_rate": 1.9096885370930173e-06, + "logits/chosen": -1.2297561168670654, + "logits/rejected": -1.1931822299957275, + "logps/chosen": -437.64154052734375, + "logps/rejected": -528.6766357421875, + "loss": 0.415, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5516808032989502, + "rewards/margins": 1.180525302886963, + "rewards/rejected": -1.732206106185913, + "step": 260 + }, + { + "epoch": 0.22, + "grad_norm": 56.27136975442982, + "learning_rate": 1.9084407265953887e-06, + "logits/chosen": -1.3499705791473389, + "logits/rejected": -1.2703001499176025, + "logps/chosen": -499.3698425292969, + "logps/rejected": -805.4957275390625, + "loss": 0.2961, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46790242195129395, + "rewards/margins": 1.8862146139144897, + "rewards/rejected": -2.354116916656494, + "step": 261 + }, + { + "epoch": 0.22, + "grad_norm": 32.252199427793144, + "learning_rate": 1.907184768209952e-06, + "logits/chosen": -1.3755966424942017, + "logits/rejected": -1.3271452188491821, + "logps/chosen": -430.7845458984375, + "logps/rejected": -596.4493408203125, + "loss": 0.2377, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.42741551995277405, + "rewards/margins": 1.5471506118774414, + "rewards/rejected": -1.974565863609314, + "step": 262 + }, + { + "epoch": 0.23, + "grad_norm": 77.79391791238616, + "learning_rate": 1.9059206732015125e-06, + "logits/chosen": -1.3845618963241577, + "logits/rejected": -1.3517491817474365, + "logps/chosen": -527.8436889648438, + "logps/rejected": -609.9903564453125, + "loss": 0.5141, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7712549567222595, + "rewards/margins": 1.2328592538833618, + "rewards/rejected": -2.0041141510009766, + "step": 263 + }, + { + "epoch": 0.23, + "grad_norm": 62.17984948886498, + "learning_rate": 1.9046484529078539e-06, + "logits/chosen": -1.3209779262542725, + "logits/rejected": -1.2516530752182007, + "logps/chosen": -445.5620422363281, + "logps/rejected": -651.108642578125, + "loss": 0.3466, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5886340141296387, + "rewards/margins": 1.6127263307571411, + "rewards/rejected": -2.2013602256774902, + "step": 264 + }, + { + "epoch": 0.23, + "grad_norm": 44.63454231126517, + "learning_rate": 1.9033681187396362e-06, + "logits/chosen": -1.380696415901184, + "logits/rejected": -1.3153538703918457, + "logps/chosen": -425.5417785644531, + "logps/rejected": -688.957275390625, + "loss": 0.279, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.534125566482544, + "rewards/margins": 1.6324182748794556, + "rewards/rejected": -2.166543960571289, + "step": 265 + }, + { + "epoch": 0.23, + "grad_norm": 80.91662784064529, + "learning_rate": 1.902079682180293e-06, + "logits/chosen": -1.2613506317138672, + "logits/rejected": -1.2341322898864746, + "logps/chosen": -729.0734252929688, + "logps/rejected": -847.1480102539062, + "loss": 0.3454, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9773740768432617, + "rewards/margins": 1.5395441055297852, + "rewards/rejected": -2.516918182373047, + "step": 266 + }, + { + "epoch": 0.23, + "grad_norm": 51.409969035837435, + "learning_rate": 1.9007831547859299e-06, + "logits/chosen": -1.3617085218429565, + "logits/rejected": -1.2971055507659912, + "logps/chosen": -602.2572631835938, + "logps/rejected": -707.104736328125, + "loss": 0.3547, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.602672815322876, + "rewards/margins": 1.435534954071045, + "rewards/rejected": -2.038207769393921, + "step": 267 + }, + { + "epoch": 0.23, + "grad_norm": 59.718107084802284, + "learning_rate": 1.899478548185219e-06, + "logits/chosen": -1.3794385194778442, + "logits/rejected": -1.2717067003250122, + "logps/chosen": -433.0210266113281, + "logps/rejected": -644.1810913085938, + "loss": 0.356, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6217007040977478, + "rewards/margins": 1.6019165515899658, + "rewards/rejected": -2.2236175537109375, + "step": 268 + }, + { + "epoch": 0.23, + "grad_norm": 69.55971626212478, + "learning_rate": 1.8981658740792967e-06, + "logits/chosen": -1.30446195602417, + "logits/rejected": -1.25761079788208, + "logps/chosen": -594.3277587890625, + "logps/rejected": -729.9678955078125, + "loss": 0.3306, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7294997572898865, + "rewards/margins": 1.456763505935669, + "rewards/rejected": -2.1862633228302, + "step": 269 + }, + { + "epoch": 0.23, + "grad_norm": 49.94697620233205, + "learning_rate": 1.8968451442416562e-06, + "logits/chosen": -1.3442294597625732, + "logits/rejected": -1.256706953048706, + "logps/chosen": -462.27301025390625, + "logps/rejected": -640.5385131835938, + "loss": 0.3501, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6719030737876892, + "rewards/margins": 1.362053632736206, + "rewards/rejected": -2.03395676612854, + "step": 270 + }, + { + "epoch": 0.23, + "grad_norm": 113.15812961073618, + "learning_rate": 1.8955163705180443e-06, + "logits/chosen": -1.2014787197113037, + "logits/rejected": -1.2413536310195923, + "logps/chosen": -776.939453125, + "logps/rejected": -686.668212890625, + "loss": 0.8989, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.169008731842041, + "rewards/margins": 0.6716674566268921, + "rewards/rejected": -1.8406760692596436, + "step": 271 + }, + { + "epoch": 0.23, + "grad_norm": 63.927868317477646, + "learning_rate": 1.894179564826354e-06, + "logits/chosen": -1.3427788019180298, + "logits/rejected": -1.270050287246704, + "logps/chosen": -506.8124694824219, + "logps/rejected": -714.5921630859375, + "loss": 0.36, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5294536352157593, + "rewards/margins": 1.8180158138275146, + "rewards/rejected": -2.3474695682525635, + "step": 272 + }, + { + "epoch": 0.23, + "grad_norm": 52.1760169157746, + "learning_rate": 1.892834739156517e-06, + "logits/chosen": -1.3242626190185547, + "logits/rejected": -1.3110473155975342, + "logps/chosen": -461.1047668457031, + "logps/rejected": -539.557373046875, + "loss": 0.3191, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4187151789665222, + "rewards/margins": 1.3253779411315918, + "rewards/rejected": -1.7440930604934692, + "step": 273 + }, + { + "epoch": 0.23, + "grad_norm": 48.99631544015967, + "learning_rate": 1.8914819055703983e-06, + "logits/chosen": -1.3378002643585205, + "logits/rejected": -1.2494276762008667, + "logps/chosen": -472.304931640625, + "logps/rejected": -652.1216430664062, + "loss": 0.3131, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5154717564582825, + "rewards/margins": 1.3171603679656982, + "rewards/rejected": -1.832632064819336, + "step": 274 + }, + { + "epoch": 0.24, + "grad_norm": 55.637858401703724, + "learning_rate": 1.890121076201685e-06, + "logits/chosen": -1.2824797630310059, + "logits/rejected": -1.206345796585083, + "logps/chosen": -503.8155212402344, + "logps/rejected": -654.08056640625, + "loss": 0.3554, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6268565058708191, + "rewards/margins": 1.6319068670272827, + "rewards/rejected": -2.258763313293457, + "step": 275 + }, + { + "epoch": 0.24, + "grad_norm": 82.8687629552848, + "learning_rate": 1.8887522632557804e-06, + "logits/chosen": -1.2825202941894531, + "logits/rejected": -1.2575864791870117, + "logps/chosen": -591.2352294921875, + "logps/rejected": -663.9232177734375, + "loss": 0.4883, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8104920387268066, + "rewards/margins": 0.9792019724845886, + "rewards/rejected": -1.78969407081604, + "step": 276 + }, + { + "epoch": 0.24, + "grad_norm": 83.84616386000505, + "learning_rate": 1.887375479009693e-06, + "logits/chosen": -1.312516450881958, + "logits/rejected": -1.2433936595916748, + "logps/chosen": -633.65283203125, + "logps/rejected": -777.0487670898438, + "loss": 0.5805, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9956063032150269, + "rewards/margins": 1.1437734365463257, + "rewards/rejected": -2.1393797397613525, + "step": 277 + }, + { + "epoch": 0.24, + "grad_norm": 62.922250186992066, + "learning_rate": 1.8859907358119257e-06, + "logits/chosen": -1.346423625946045, + "logits/rejected": -1.224915862083435, + "logps/chosen": -625.4689331054688, + "logps/rejected": -958.303955078125, + "loss": 0.2429, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7824087142944336, + "rewards/margins": 2.1031761169433594, + "rewards/rejected": -2.885584831237793, + "step": 278 + }, + { + "epoch": 0.24, + "grad_norm": 50.281759620820885, + "learning_rate": 1.8845980460823674e-06, + "logits/chosen": -1.232679843902588, + "logits/rejected": -1.203260898590088, + "logps/chosen": -696.5806274414062, + "logps/rejected": -820.2708129882812, + "loss": 0.2181, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6646475791931152, + "rewards/margins": 1.7615255117416382, + "rewards/rejected": -2.426173210144043, + "step": 279 + }, + { + "epoch": 0.24, + "grad_norm": 91.55282932822111, + "learning_rate": 1.8831974223121789e-06, + "logits/chosen": -1.2700848579406738, + "logits/rejected": -1.2630860805511475, + "logps/chosen": -687.1458740234375, + "logps/rejected": -726.3959350585938, + "loss": 0.5542, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.814836859703064, + "rewards/margins": 1.0548039674758911, + "rewards/rejected": -1.869640827178955, + "step": 280 + }, + { + "epoch": 0.24, + "grad_norm": 47.23631638320723, + "learning_rate": 1.8817888770636828e-06, + "logits/chosen": -1.2989399433135986, + "logits/rejected": -1.2461373805999756, + "logps/chosen": -542.2744140625, + "logps/rejected": -695.8860473632812, + "loss": 0.2762, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5193393230438232, + "rewards/margins": 1.421765923500061, + "rewards/rejected": -1.9411051273345947, + "step": 281 + }, + { + "epoch": 0.24, + "grad_norm": 42.45351766075743, + "learning_rate": 1.8803724229702501e-06, + "logits/chosen": -1.3047549724578857, + "logits/rejected": -1.2471988201141357, + "logps/chosen": -623.1423950195312, + "logps/rejected": -827.06201171875, + "loss": 0.2677, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7911394834518433, + "rewards/margins": 1.5670664310455322, + "rewards/rejected": -2.358205795288086, + "step": 282 + }, + { + "epoch": 0.24, + "grad_norm": 132.5548506788457, + "learning_rate": 1.878948072736187e-06, + "logits/chosen": -1.3195421695709229, + "logits/rejected": -1.3033177852630615, + "logps/chosen": -665.1011352539062, + "logps/rejected": -786.5206298828125, + "loss": 0.3534, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7998645305633545, + "rewards/margins": 1.4624037742614746, + "rewards/rejected": -2.262268304824829, + "step": 283 + }, + { + "epoch": 0.24, + "grad_norm": 39.06583611828614, + "learning_rate": 1.8775158391366205e-06, + "logits/chosen": -1.4100770950317383, + "logits/rejected": -1.359595775604248, + "logps/chosen": -424.16839599609375, + "logps/rejected": -588.47998046875, + "loss": 0.3373, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7181679606437683, + "rewards/margins": 1.374335765838623, + "rewards/rejected": -2.092503786087036, + "step": 284 + }, + { + "epoch": 0.24, + "grad_norm": 65.62725799965577, + "learning_rate": 1.8760757350173844e-06, + "logits/chosen": -1.3192460536956787, + "logits/rejected": -1.311014175415039, + "logps/chosen": -462.3755798339844, + "logps/rejected": -593.5933837890625, + "loss": 0.421, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7063227891921997, + "rewards/margins": 1.034854769706726, + "rewards/rejected": -1.7411775588989258, + "step": 285 + }, + { + "epoch": 0.25, + "grad_norm": 88.17881420608741, + "learning_rate": 1.8746277732949043e-06, + "logits/chosen": -1.287336826324463, + "logits/rejected": -1.2378259897232056, + "logps/chosen": -743.6614379882812, + "logps/rejected": -926.6859130859375, + "loss": 0.3923, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.45865917205810547, + "rewards/margins": 1.0680279731750488, + "rewards/rejected": -1.5266873836517334, + "step": 286 + }, + { + "epoch": 0.25, + "grad_norm": 92.58863179872984, + "learning_rate": 1.873171966956081e-06, + "logits/chosen": -1.3020586967468262, + "logits/rejected": -1.2492753267288208, + "logps/chosen": -561.0408325195312, + "logps/rejected": -696.8612060546875, + "loss": 0.4906, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4373791813850403, + "rewards/margins": 1.1843516826629639, + "rewards/rejected": -1.6217308044433594, + "step": 287 + }, + { + "epoch": 0.25, + "grad_norm": 41.92668197491958, + "learning_rate": 1.8717083290581745e-06, + "logits/chosen": -1.3229628801345825, + "logits/rejected": -1.3219010829925537, + "logps/chosen": -593.58349609375, + "logps/rejected": -677.1654663085938, + "loss": 0.33, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.339139461517334, + "rewards/margins": 1.3238906860351562, + "rewards/rejected": -1.6630302667617798, + "step": 288 + }, + { + "epoch": 0.25, + "grad_norm": 69.95582054379341, + "learning_rate": 1.8702368727286868e-06, + "logits/chosen": -1.2912516593933105, + "logits/rejected": -1.2390421628952026, + "logps/chosen": -511.74383544921875, + "logps/rejected": -553.6732177734375, + "loss": 0.4282, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4722836911678314, + "rewards/margins": 1.0789742469787598, + "rewards/rejected": -1.551257848739624, + "step": 289 + }, + { + "epoch": 0.25, + "grad_norm": 44.21457140750985, + "learning_rate": 1.8687576111652437e-06, + "logits/chosen": -1.4131152629852295, + "logits/rejected": -1.2697126865386963, + "logps/chosen": -542.24951171875, + "logps/rejected": -849.6591186523438, + "loss": 0.2178, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7038062810897827, + "rewards/margins": 2.04502534866333, + "rewards/rejected": -2.7488315105438232, + "step": 290 + }, + { + "epoch": 0.25, + "grad_norm": 64.80569375842471, + "learning_rate": 1.8672705576354775e-06, + "logits/chosen": -1.2966111898422241, + "logits/rejected": -1.2468252182006836, + "logps/chosen": -440.4309997558594, + "logps/rejected": -538.8372802734375, + "loss": 0.4205, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7583557367324829, + "rewards/margins": 1.1637259721755981, + "rewards/rejected": -1.922081708908081, + "step": 291 + }, + { + "epoch": 0.25, + "grad_norm": 51.64831816245686, + "learning_rate": 1.865775725476907e-06, + "logits/chosen": -1.291834831237793, + "logits/rejected": -1.2686530351638794, + "logps/chosen": -423.56182861328125, + "logps/rejected": -501.0770263671875, + "loss": 0.4562, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3772410750389099, + "rewards/margins": 1.0666435956954956, + "rewards/rejected": -1.4438846111297607, + "step": 292 + }, + { + "epoch": 0.25, + "grad_norm": 59.11445357687318, + "learning_rate": 1.8642731280968182e-06, + "logits/chosen": -1.3467607498168945, + "logits/rejected": -1.3099944591522217, + "logps/chosen": -549.3778076171875, + "logps/rejected": -635.3603515625, + "loss": 0.3034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8041099309921265, + "rewards/margins": 1.2853634357452393, + "rewards/rejected": -2.0894734859466553, + "step": 293 + }, + { + "epoch": 0.25, + "grad_norm": 62.39719980624235, + "learning_rate": 1.8627627789721442e-06, + "logits/chosen": -1.3086678981781006, + "logits/rejected": -1.2643378973007202, + "logps/chosen": -642.4073486328125, + "logps/rejected": -816.996826171875, + "loss": 0.3278, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6026842594146729, + "rewards/margins": 1.8675678968429565, + "rewards/rejected": -2.470252275466919, + "step": 294 + }, + { + "epoch": 0.25, + "grad_norm": 61.69447609824399, + "learning_rate": 1.8612446916493442e-06, + "logits/chosen": -1.3641777038574219, + "logits/rejected": -1.3854668140411377, + "logps/chosen": -471.4262390136719, + "logps/rejected": -504.0348815917969, + "loss": 0.355, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.44899052381515503, + "rewards/margins": 1.3087570667266846, + "rewards/rejected": -1.7577476501464844, + "step": 295 + }, + { + "epoch": 0.25, + "grad_norm": 51.73986225545052, + "learning_rate": 1.8597188797442823e-06, + "logits/chosen": -1.3421553373336792, + "logits/rejected": -1.2874841690063477, + "logps/chosen": -505.52557373046875, + "logps/rejected": -656.2530517578125, + "loss": 0.2572, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3865881562232971, + "rewards/margins": 1.5258070230484009, + "rewards/rejected": -1.9123951196670532, + "step": 296 + }, + { + "epoch": 0.25, + "grad_norm": 71.58151264329189, + "learning_rate": 1.8581853569421042e-06, + "logits/chosen": -1.3670592308044434, + "logits/rejected": -1.3198003768920898, + "logps/chosen": -537.0482788085938, + "logps/rejected": -658.83447265625, + "loss": 0.3565, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5762161016464233, + "rewards/margins": 1.3206223249435425, + "rewards/rejected": -1.8968384265899658, + "step": 297 + }, + { + "epoch": 0.26, + "grad_norm": 100.50925042279562, + "learning_rate": 1.8566441369971163e-06, + "logits/chosen": -1.2974328994750977, + "logits/rejected": -1.2744688987731934, + "logps/chosen": -639.2808227539062, + "logps/rejected": -766.7714233398438, + "loss": 0.6602, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.816308856010437, + "rewards/margins": 1.0288530588150024, + "rewards/rejected": -1.8451619148254395, + "step": 298 + }, + { + "epoch": 0.26, + "grad_norm": 47.43785962452232, + "learning_rate": 1.8550952337326606e-06, + "logits/chosen": -1.365097165107727, + "logits/rejected": -1.2711853981018066, + "logps/chosen": -403.49005126953125, + "logps/rejected": -644.4906616210938, + "loss": 0.3325, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.44394218921661377, + "rewards/margins": 1.5209300518035889, + "rewards/rejected": -1.9648722410202026, + "step": 299 + }, + { + "epoch": 0.26, + "grad_norm": 49.38250053324777, + "learning_rate": 1.8535386610409925e-06, + "logits/chosen": -1.3787580728530884, + "logits/rejected": -1.2780003547668457, + "logps/chosen": -498.15521240234375, + "logps/rejected": -632.5543212890625, + "loss": 0.3357, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.489728182554245, + "rewards/margins": 1.3524258136749268, + "rewards/rejected": -1.8421540260314941, + "step": 300 + }, + { + "epoch": 0.26, + "grad_norm": 98.44072601146492, + "learning_rate": 1.851974432883154e-06, + "logits/chosen": -1.2081255912780762, + "logits/rejected": -1.1339161396026611, + "logps/chosen": -562.579833984375, + "logps/rejected": -777.3492431640625, + "loss": 0.6088, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0019558966159820557, + "rewards/margins": 0.27304917573928833, + "rewards/rejected": -0.2750050723552704, + "step": 301 + }, + { + "epoch": 0.26, + "grad_norm": 91.11788015516005, + "learning_rate": 1.8504025632888507e-06, + "logits/chosen": -1.2894055843353271, + "logits/rejected": -1.2341303825378418, + "logps/chosen": -750.760986328125, + "logps/rejected": -849.8092651367188, + "loss": 0.4514, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8769577741622925, + "rewards/margins": 1.1654422283172607, + "rewards/rejected": -2.0423998832702637, + "step": 302 + }, + { + "epoch": 0.26, + "grad_norm": 76.68001793142624, + "learning_rate": 1.8488230663563241e-06, + "logits/chosen": -1.2934482097625732, + "logits/rejected": -1.351201057434082, + "logps/chosen": -580.1993408203125, + "logps/rejected": -513.2964477539062, + "loss": 0.5818, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8330243229866028, + "rewards/margins": 0.5873950719833374, + "rewards/rejected": -1.420419454574585, + "step": 303 + }, + { + "epoch": 0.26, + "grad_norm": 47.858777042823164, + "learning_rate": 1.8472359562522266e-06, + "logits/chosen": -1.3255586624145508, + "logits/rejected": -1.2752617597579956, + "logps/chosen": -406.1722412109375, + "logps/rejected": -593.2540283203125, + "loss": 0.3364, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43785959482192993, + "rewards/margins": 1.5706452131271362, + "rewards/rejected": -2.008504867553711, + "step": 304 + }, + { + "epoch": 0.26, + "grad_norm": 85.19988100914014, + "learning_rate": 1.8456412472114935e-06, + "logits/chosen": -1.3651058673858643, + "logits/rejected": -1.273531436920166, + "logps/chosen": -481.7969970703125, + "logps/rejected": -662.74658203125, + "loss": 0.4825, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6961475610733032, + "rewards/margins": 1.2806881666183472, + "rewards/rejected": -1.9768357276916504, + "step": 305 + }, + { + "epoch": 0.26, + "grad_norm": 37.84875798282611, + "learning_rate": 1.8440389535372156e-06, + "logits/chosen": -1.372624397277832, + "logits/rejected": -1.293984293937683, + "logps/chosen": -568.9570922851562, + "logps/rejected": -868.6464233398438, + "loss": 0.1884, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6600321531295776, + "rewards/margins": 2.31733775138855, + "rewards/rejected": -2.977370262145996, + "step": 306 + }, + { + "epoch": 0.26, + "grad_norm": 47.54836575905022, + "learning_rate": 1.8424290896005115e-06, + "logits/chosen": -1.4119298458099365, + "logits/rejected": -1.3349215984344482, + "logps/chosen": -431.46490478515625, + "logps/rejected": -647.0010375976562, + "loss": 0.3487, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5202719569206238, + "rewards/margins": 1.5132447481155396, + "rewards/rejected": -2.0335168838500977, + "step": 307 + }, + { + "epoch": 0.26, + "grad_norm": 49.36518160151795, + "learning_rate": 1.8408116698403976e-06, + "logits/chosen": -1.3197250366210938, + "logits/rejected": -1.33034348487854, + "logps/chosen": -673.7100219726562, + "logps/rejected": -822.8350219726562, + "loss": 0.2688, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7116418480873108, + "rewards/margins": 1.844422698020935, + "rewards/rejected": -2.5560646057128906, + "step": 308 + }, + { + "epoch": 0.27, + "grad_norm": 54.41588382019316, + "learning_rate": 1.8391867087636595e-06, + "logits/chosen": -1.3856697082519531, + "logits/rejected": -1.365121841430664, + "logps/chosen": -472.591552734375, + "logps/rejected": -553.991455078125, + "loss": 0.4403, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7445403337478638, + "rewards/margins": 0.8742175102233887, + "rewards/rejected": -1.618757963180542, + "step": 309 + }, + { + "epoch": 0.27, + "grad_norm": 34.64658589787295, + "learning_rate": 1.8375542209447214e-06, + "logits/chosen": -1.3953791856765747, + "logits/rejected": -1.2682315111160278, + "logps/chosen": -386.12139892578125, + "logps/rejected": -713.0578002929688, + "loss": 0.1955, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5013018250465393, + "rewards/margins": 2.0638022422790527, + "rewards/rejected": -2.5651040077209473, + "step": 310 + }, + { + "epoch": 0.27, + "grad_norm": 73.96567656460611, + "learning_rate": 1.8359142210255155e-06, + "logits/chosen": -1.3954124450683594, + "logits/rejected": -1.3836252689361572, + "logps/chosen": -518.5062255859375, + "logps/rejected": -664.3820190429688, + "loss": 0.4891, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9950438737869263, + "rewards/margins": 1.2294788360595703, + "rewards/rejected": -2.224522829055786, + "step": 311 + }, + { + "epoch": 0.27, + "grad_norm": 59.0139814547133, + "learning_rate": 1.834266723715351e-06, + "logits/chosen": -1.3194384574890137, + "logits/rejected": -1.3285057544708252, + "logps/chosen": -629.0936279296875, + "logps/rejected": -770.8262329101562, + "loss": 0.4121, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7154449820518494, + "rewards/margins": 1.523475170135498, + "rewards/rejected": -2.238920211791992, + "step": 312 + }, + { + "epoch": 0.27, + "grad_norm": 51.14484872549365, + "learning_rate": 1.8326117437907812e-06, + "logits/chosen": -1.4025180339813232, + "logits/rejected": -1.3029136657714844, + "logps/chosen": -507.39154052734375, + "logps/rejected": -731.9945678710938, + "loss": 0.2758, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5277796983718872, + "rewards/margins": 1.7257564067840576, + "rewards/rejected": -2.2535362243652344, + "step": 313 + }, + { + "epoch": 0.27, + "grad_norm": 66.36525875544719, + "learning_rate": 1.8309492960954727e-06, + "logits/chosen": -1.3588955402374268, + "logits/rejected": -1.2931160926818848, + "logps/chosen": -462.9432067871094, + "logps/rejected": -654.9721069335938, + "loss": 0.3511, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6080495715141296, + "rewards/margins": 1.597927451133728, + "rewards/rejected": -2.205976963043213, + "step": 314 + }, + { + "epoch": 0.27, + "grad_norm": 54.03835330583822, + "learning_rate": 1.82927939554007e-06, + "logits/chosen": -1.3320766687393188, + "logits/rejected": -1.2851673364639282, + "logps/chosen": -712.2108154296875, + "logps/rejected": -880.1512451171875, + "loss": 0.2395, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.747326135635376, + "rewards/margins": 1.9266762733459473, + "rewards/rejected": -2.6740026473999023, + "step": 315 + }, + { + "epoch": 0.27, + "grad_norm": 39.25357661887786, + "learning_rate": 1.8276020571020645e-06, + "logits/chosen": -1.4441983699798584, + "logits/rejected": -1.351264238357544, + "logps/chosen": -342.5238037109375, + "logps/rejected": -649.1917724609375, + "loss": 0.2823, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5742235779762268, + "rewards/margins": 1.6854310035705566, + "rewards/rejected": -2.2596545219421387, + "step": 316 + }, + { + "epoch": 0.27, + "grad_norm": 35.63885420819062, + "learning_rate": 1.8259172958256571e-06, + "logits/chosen": -1.4204754829406738, + "logits/rejected": -1.2960968017578125, + "logps/chosen": -570.7115478515625, + "logps/rejected": -947.851318359375, + "loss": 0.1699, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6737782955169678, + "rewards/margins": 2.6754255294799805, + "rewards/rejected": -3.3492040634155273, + "step": 317 + }, + { + "epoch": 0.27, + "grad_norm": 86.03391195673755, + "learning_rate": 1.8242251268216257e-06, + "logits/chosen": -1.3299427032470703, + "logits/rejected": -1.2639224529266357, + "logps/chosen": -499.9549255371094, + "logps/rejected": -681.12841796875, + "loss": 0.4574, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.649490237236023, + "rewards/margins": 1.0054829120635986, + "rewards/rejected": -1.6549732685089111, + "step": 318 + }, + { + "epoch": 0.27, + "grad_norm": 47.572685320587574, + "learning_rate": 1.8225255652671887e-06, + "logits/chosen": -1.3856923580169678, + "logits/rejected": -1.3267571926116943, + "logps/chosen": -375.29290771484375, + "logps/rejected": -526.8903198242188, + "loss": 0.3841, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3808596730232239, + "rewards/margins": 0.9564526677131653, + "rewards/rejected": -1.3373123407363892, + "step": 319 + }, + { + "epoch": 0.27, + "grad_norm": 40.98992768419042, + "learning_rate": 1.8208186264058686e-06, + "logits/chosen": -1.3933082818984985, + "logits/rejected": -1.2459851503372192, + "logps/chosen": -535.0771484375, + "logps/rejected": -1006.7776489257812, + "loss": 0.258, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6266862154006958, + "rewards/margins": 2.235572338104248, + "rewards/rejected": -2.8622586727142334, + "step": 320 + }, + { + "epoch": 0.28, + "grad_norm": 81.03986839354482, + "learning_rate": 1.8191043255473557e-06, + "logits/chosen": -1.2229186296463013, + "logits/rejected": -1.1526546478271484, + "logps/chosen": -610.3226928710938, + "logps/rejected": -869.8119506835938, + "loss": 0.4548, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8089736700057983, + "rewards/margins": 1.7825186252593994, + "rewards/rejected": -2.591492176055908, + "step": 321 + }, + { + "epoch": 0.28, + "grad_norm": 54.00681041288272, + "learning_rate": 1.8173826780673713e-06, + "logits/chosen": -1.286928415298462, + "logits/rejected": -1.2091097831726074, + "logps/chosen": -352.4930419921875, + "logps/rejected": -519.7963256835938, + "loss": 0.3332, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.30570027232170105, + "rewards/margins": 1.4030377864837646, + "rewards/rejected": -1.708738088607788, + "step": 322 + }, + { + "epoch": 0.28, + "grad_norm": 59.45445472685165, + "learning_rate": 1.8156536994075286e-06, + "logits/chosen": -1.380958080291748, + "logits/rejected": -1.3711278438568115, + "logps/chosen": -557.9470825195312, + "logps/rejected": -585.9173583984375, + "loss": 0.3746, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8871201276779175, + "rewards/margins": 1.1648715734481812, + "rewards/rejected": -2.0519914627075195, + "step": 323 + }, + { + "epoch": 0.28, + "grad_norm": 39.40230304474445, + "learning_rate": 1.8139174050751956e-06, + "logits/chosen": -1.3996045589447021, + "logits/rejected": -1.306899070739746, + "logps/chosen": -497.3608703613281, + "logps/rejected": -719.8843994140625, + "loss": 0.2138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47391462326049805, + "rewards/margins": 2.2606656551361084, + "rewards/rejected": -2.7345802783966064, + "step": 324 + }, + { + "epoch": 0.28, + "grad_norm": 50.28586406633642, + "learning_rate": 1.8121738106433536e-06, + "logits/chosen": -1.295037031173706, + "logits/rejected": -1.2569923400878906, + "logps/chosen": -646.929931640625, + "logps/rejected": -846.8131103515625, + "loss": 0.206, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1925201416015625, + "rewards/margins": 1.9704111814498901, + "rewards/rejected": -3.162931442260742, + "step": 325 + }, + { + "epoch": 0.28, + "grad_norm": 56.423053246664416, + "learning_rate": 1.8104229317504612e-06, + "logits/chosen": -1.4025418758392334, + "logits/rejected": -1.3613529205322266, + "logps/chosen": -528.4503173828125, + "logps/rejected": -717.5523681640625, + "loss": 0.3203, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8349852561950684, + "rewards/margins": 1.745392084121704, + "rewards/rejected": -2.5803773403167725, + "step": 326 + }, + { + "epoch": 0.28, + "grad_norm": 39.3618431458811, + "learning_rate": 1.8086647841003102e-06, + "logits/chosen": -1.3957802057266235, + "logits/rejected": -1.4136476516723633, + "logps/chosen": -421.7500915527344, + "logps/rejected": -432.0810852050781, + "loss": 0.2859, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2593984007835388, + "rewards/margins": 1.647970199584961, + "rewards/rejected": -1.907368779182434, + "step": 327 + }, + { + "epoch": 0.28, + "grad_norm": 53.49012674039251, + "learning_rate": 1.8068993834618881e-06, + "logits/chosen": -1.3327207565307617, + "logits/rejected": -1.258753776550293, + "logps/chosen": -625.6865234375, + "logps/rejected": -831.6580810546875, + "loss": 0.2618, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.953024685382843, + "rewards/margins": 1.8639934062957764, + "rewards/rejected": -2.8170180320739746, + "step": 328 + }, + { + "epoch": 0.28, + "grad_norm": 60.790641539855926, + "learning_rate": 1.8051267456692342e-06, + "logits/chosen": -1.4130940437316895, + "logits/rejected": -1.3980942964553833, + "logps/chosen": -448.37872314453125, + "logps/rejected": -547.0968627929688, + "loss": 0.4777, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6373985409736633, + "rewards/margins": 0.8018583059310913, + "rewards/rejected": -1.4392569065093994, + "step": 329 + }, + { + "epoch": 0.28, + "grad_norm": 79.04052147407155, + "learning_rate": 1.8033468866212984e-06, + "logits/chosen": -1.3974734544754028, + "logits/rejected": -1.339294672012329, + "logps/chosen": -443.251708984375, + "logps/rejected": -591.7564697265625, + "loss": 0.5173, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6866668462753296, + "rewards/margins": 1.2705740928649902, + "rewards/rejected": -1.9572408199310303, + "step": 330 + }, + { + "epoch": 0.28, + "grad_norm": 69.99995765079761, + "learning_rate": 1.8015598222817994e-06, + "logits/chosen": -1.3072905540466309, + "logits/rejected": -1.19639253616333, + "logps/chosen": -537.7159423828125, + "logps/rejected": -831.37060546875, + "loss": 0.2668, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8045264482498169, + "rewards/margins": 1.8684673309326172, + "rewards/rejected": -2.6729936599731445, + "step": 331 + }, + { + "epoch": 0.28, + "grad_norm": 38.237575149701975, + "learning_rate": 1.79976556867908e-06, + "logits/chosen": -1.3282616138458252, + "logits/rejected": -1.266440749168396, + "logps/chosen": -435.3412170410156, + "logps/rejected": -683.4049072265625, + "loss": 0.2422, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5417599678039551, + "rewards/margins": 1.7700687646865845, + "rewards/rejected": -2.311828851699829, + "step": 332 + }, + { + "epoch": 0.29, + "grad_norm": 86.16521686543277, + "learning_rate": 1.7979641419059647e-06, + "logits/chosen": -1.2827945947647095, + "logits/rejected": -1.2071640491485596, + "logps/chosen": -448.04608154296875, + "logps/rejected": -671.893310546875, + "loss": 0.5764, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0399840772151947, + "rewards/margins": 0.3430766463279724, + "rewards/rejected": -0.3030925691127777, + "step": 333 + }, + { + "epoch": 0.29, + "grad_norm": 46.42168599676963, + "learning_rate": 1.7961555581196148e-06, + "logits/chosen": -1.31438410282135, + "logits/rejected": -1.2875746488571167, + "logps/chosen": -459.66156005859375, + "logps/rejected": -604.5240478515625, + "loss": 0.3108, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5575841665267944, + "rewards/margins": 1.5096338987350464, + "rewards/rejected": -2.067218065261841, + "step": 334 + }, + { + "epoch": 0.29, + "grad_norm": 52.554734666068185, + "learning_rate": 1.7943398335413833e-06, + "logits/chosen": -1.3907947540283203, + "logits/rejected": -1.3661128282546997, + "logps/chosen": -453.46282958984375, + "logps/rejected": -573.4193115234375, + "loss": 0.4039, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5714187622070312, + "rewards/margins": 1.3516925573349, + "rewards/rejected": -1.9231112003326416, + "step": 335 + }, + { + "epoch": 0.29, + "grad_norm": 73.66641233425995, + "learning_rate": 1.79251698445667e-06, + "logits/chosen": -1.282754898071289, + "logits/rejected": -1.2947840690612793, + "logps/chosen": -708.6914672851562, + "logps/rejected": -734.3735961914062, + "loss": 0.4046, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.258388876914978, + "rewards/margins": 1.2340383529663086, + "rewards/rejected": -2.492427349090576, + "step": 336 + }, + { + "epoch": 0.29, + "grad_norm": 55.273678419099184, + "learning_rate": 1.790687027214774e-06, + "logits/chosen": -1.2410988807678223, + "logits/rejected": -1.2197015285491943, + "logps/chosen": -626.3568115234375, + "logps/rejected": -815.574951171875, + "loss": 0.2696, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8296209573745728, + "rewards/margins": 1.8387782573699951, + "rewards/rejected": -2.6683993339538574, + "step": 337 + }, + { + "epoch": 0.29, + "grad_norm": 95.16657798565699, + "learning_rate": 1.7888499782287495e-06, + "logits/chosen": -1.2814643383026123, + "logits/rejected": -1.3364362716674805, + "logps/chosen": -649.78125, + "logps/rejected": -612.60400390625, + "loss": 0.6059, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8462148904800415, + "rewards/margins": 1.2750688791275024, + "rewards/rejected": -2.121283531188965, + "step": 338 + }, + { + "epoch": 0.29, + "grad_norm": 37.834308839348786, + "learning_rate": 1.7870058539752563e-06, + "logits/chosen": -1.3570611476898193, + "logits/rejected": -1.3527884483337402, + "logps/chosen": -523.838134765625, + "logps/rejected": -617.7093505859375, + "loss": 0.2005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5999828577041626, + "rewards/margins": 2.075089693069458, + "rewards/rejected": -2.67507266998291, + "step": 339 + }, + { + "epoch": 0.29, + "grad_norm": 62.59492431101693, + "learning_rate": 1.7851546709944133e-06, + "logits/chosen": -1.2640312910079956, + "logits/rejected": -1.2444536685943604, + "logps/chosen": -718.8636474609375, + "logps/rejected": -756.96923828125, + "loss": 0.4128, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7843811511993408, + "rewards/margins": 1.6093783378601074, + "rewards/rejected": -2.3937594890594482, + "step": 340 + }, + { + "epoch": 0.29, + "grad_norm": 56.365393302117916, + "learning_rate": 1.7832964458896496e-06, + "logits/chosen": -1.356136441230774, + "logits/rejected": -1.2352664470672607, + "logps/chosen": -456.6836242675781, + "logps/rejected": -874.0556030273438, + "loss": 0.2437, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5173958539962769, + "rewards/margins": 2.5600132942199707, + "rewards/rejected": -3.077409267425537, + "step": 341 + }, + { + "epoch": 0.29, + "grad_norm": 32.8496543014448, + "learning_rate": 1.7814311953275559e-06, + "logits/chosen": -1.3836917877197266, + "logits/rejected": -1.3068275451660156, + "logps/chosen": -476.92242431640625, + "logps/rejected": -723.5286865234375, + "loss": 0.1923, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5671064853668213, + "rewards/margins": 2.021279811859131, + "rewards/rejected": -2.588386058807373, + "step": 342 + }, + { + "epoch": 0.29, + "grad_norm": 51.257559058374795, + "learning_rate": 1.7795589360377342e-06, + "logits/chosen": -1.423358678817749, + "logits/rejected": -1.3222960233688354, + "logps/chosen": -353.30499267578125, + "logps/rejected": -577.5755615234375, + "loss": 0.4257, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8450976610183716, + "rewards/margins": 1.2097313404083252, + "rewards/rejected": -2.0548288822174072, + "step": 343 + }, + { + "epoch": 0.3, + "grad_norm": 53.5472303290772, + "learning_rate": 1.7776796848126501e-06, + "logits/chosen": -1.3959057331085205, + "logits/rejected": -1.356281042098999, + "logps/chosen": -535.9912109375, + "logps/rejected": -663.2745971679688, + "loss": 0.3195, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8873413801193237, + "rewards/margins": 1.5602761507034302, + "rewards/rejected": -2.447617530822754, + "step": 344 + }, + { + "epoch": 0.3, + "grad_norm": 53.12673247773521, + "learning_rate": 1.7757934585074784e-06, + "logits/chosen": -1.3323516845703125, + "logits/rejected": -1.3047423362731934, + "logps/chosen": -465.5447998046875, + "logps/rejected": -702.0733642578125, + "loss": 0.3076, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.813624382019043, + "rewards/margins": 1.4025866985321045, + "rewards/rejected": -2.2162110805511475, + "step": 345 + }, + { + "epoch": 0.3, + "grad_norm": 61.43784959072281, + "learning_rate": 1.7739002740399554e-06, + "logits/chosen": -1.3370304107666016, + "logits/rejected": -1.2953394651412964, + "logps/chosen": -605.59033203125, + "logps/rejected": -796.789794921875, + "loss": 0.4125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.918696403503418, + "rewards/margins": 1.4033992290496826, + "rewards/rejected": -2.3220956325531006, + "step": 346 + }, + { + "epoch": 0.3, + "grad_norm": 40.496158372532264, + "learning_rate": 1.7720001483902254e-06, + "logits/chosen": -1.3652217388153076, + "logits/rejected": -1.2486958503723145, + "logps/chosen": -460.3392028808594, + "logps/rejected": -763.511962890625, + "loss": 0.2107, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.41109907627105713, + "rewards/margins": 2.041755199432373, + "rewards/rejected": -2.4528541564941406, + "step": 347 + }, + { + "epoch": 0.3, + "grad_norm": 53.499096353198155, + "learning_rate": 1.7700930986006888e-06, + "logits/chosen": -1.371845006942749, + "logits/rejected": -1.3209772109985352, + "logps/chosen": -427.68170166015625, + "logps/rejected": -644.3697509765625, + "loss": 0.3152, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.705706000328064, + "rewards/margins": 1.6445813179016113, + "rewards/rejected": -2.3502871990203857, + "step": 348 + }, + { + "epoch": 0.3, + "grad_norm": 58.9588633137131, + "learning_rate": 1.7681791417758495e-06, + "logits/chosen": -1.4229094982147217, + "logits/rejected": -1.3599036931991577, + "logps/chosen": -471.8563232421875, + "logps/rejected": -617.5604858398438, + "loss": 0.3771, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.713165283203125, + "rewards/margins": 1.2468420267105103, + "rewards/rejected": -1.9600073099136353, + "step": 349 + }, + { + "epoch": 0.3, + "grad_norm": 69.70902398852822, + "learning_rate": 1.7662582950821604e-06, + "logits/chosen": -1.2817418575286865, + "logits/rejected": -1.2800257205963135, + "logps/chosen": -619.9619750976562, + "logps/rejected": -757.70751953125, + "loss": 0.2988, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8852773904800415, + "rewards/margins": 1.772665023803711, + "rewards/rejected": -2.657942295074463, + "step": 350 + }, + { + "epoch": 0.3, + "grad_norm": 34.282024233072164, + "learning_rate": 1.7643305757478713e-06, + "logits/chosen": -1.4060933589935303, + "logits/rejected": -1.3654086589813232, + "logps/chosen": -543.7503051757812, + "logps/rejected": -668.7984619140625, + "loss": 0.2402, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4676051437854767, + "rewards/margins": 1.831949234008789, + "rewards/rejected": -2.2995543479919434, + "step": 351 + }, + { + "epoch": 0.3, + "grad_norm": 30.3024998135122, + "learning_rate": 1.762396001062873e-06, + "logits/chosen": -1.3362281322479248, + "logits/rejected": -1.3081523180007935, + "logps/chosen": -548.0572509765625, + "logps/rejected": -773.1283569335938, + "loss": 0.2547, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3063896596431732, + "rewards/margins": 1.772287130355835, + "rewards/rejected": -2.078676700592041, + "step": 352 + }, + { + "epoch": 0.3, + "grad_norm": 58.827125540810336, + "learning_rate": 1.760454588378542e-06, + "logits/chosen": -1.3522402048110962, + "logits/rejected": -1.330960750579834, + "logps/chosen": -448.0185546875, + "logps/rejected": -519.0568237304688, + "loss": 0.3176, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4417378902435303, + "rewards/margins": 1.2674145698547363, + "rewards/rejected": -1.7091525793075562, + "step": 353 + }, + { + "epoch": 0.3, + "grad_norm": 36.49894563740577, + "learning_rate": 1.758506355107586e-06, + "logits/chosen": -1.3145179748535156, + "logits/rejected": -1.3016877174377441, + "logps/chosen": -489.9166259765625, + "logps/rejected": -590.7893676757812, + "loss": 0.2841, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2938302457332611, + "rewards/margins": 1.5872488021850586, + "rewards/rejected": -1.8810791969299316, + "step": 354 + }, + { + "epoch": 0.3, + "grad_norm": 50.86650433813805, + "learning_rate": 1.7565513187238875e-06, + "logits/chosen": -1.411332368850708, + "logits/rejected": -1.342905044555664, + "logps/chosen": -359.9126281738281, + "logps/rejected": -543.0909423828125, + "loss": 0.351, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4272446632385254, + "rewards/margins": 1.5099502801895142, + "rewards/rejected": -1.9371949434280396, + "step": 355 + }, + { + "epoch": 0.31, + "grad_norm": 82.16799604256238, + "learning_rate": 1.754589496762346e-06, + "logits/chosen": -1.2620693445205688, + "logits/rejected": -1.2261128425598145, + "logps/chosen": -573.6851196289062, + "logps/rejected": -668.463623046875, + "loss": 0.4958, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9251641631126404, + "rewards/margins": 1.105979084968567, + "rewards/rejected": -2.0311431884765625, + "step": 356 + }, + { + "epoch": 0.31, + "grad_norm": 36.40453847231406, + "learning_rate": 1.7526209068187217e-06, + "logits/chosen": -1.3427318334579468, + "logits/rejected": -1.337662696838379, + "logps/chosen": -574.2933349609375, + "logps/rejected": -721.6268310546875, + "loss": 0.2151, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4808851182460785, + "rewards/margins": 1.956594467163086, + "rewards/rejected": -2.4374794960021973, + "step": 357 + }, + { + "epoch": 0.31, + "grad_norm": 40.2136820610071, + "learning_rate": 1.7506455665494774e-06, + "logits/chosen": -1.3520996570587158, + "logits/rejected": -1.2746731042861938, + "logps/chosen": -391.5846862792969, + "logps/rejected": -690.509765625, + "loss": 0.18, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.39890822768211365, + "rewards/margins": 2.4898672103881836, + "rewards/rejected": -2.88877534866333, + "step": 358 + }, + { + "epoch": 0.31, + "grad_norm": 66.20511722931484, + "learning_rate": 1.748663493671621e-06, + "logits/chosen": -1.3608579635620117, + "logits/rejected": -1.3019959926605225, + "logps/chosen": -534.7568969726562, + "logps/rejected": -757.3606567382812, + "loss": 0.3957, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9439442753791809, + "rewards/margins": 1.482708215713501, + "rewards/rejected": -2.426652669906616, + "step": 359 + }, + { + "epoch": 0.31, + "grad_norm": 64.49934725997622, + "learning_rate": 1.746674705962544e-06, + "logits/chosen": -1.3710341453552246, + "logits/rejected": -1.2802362442016602, + "logps/chosen": -408.6067810058594, + "logps/rejected": -552.7216796875, + "loss": 0.3656, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6530160903930664, + "rewards/margins": 1.6699604988098145, + "rewards/rejected": -2.322976589202881, + "step": 360 + }, + { + "epoch": 0.31, + "grad_norm": 70.11251547062426, + "learning_rate": 1.744679221259866e-06, + "logits/chosen": -1.3386015892028809, + "logits/rejected": -1.3565545082092285, + "logps/chosen": -743.8438720703125, + "logps/rejected": -634.8836669921875, + "loss": 0.4044, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0614737272262573, + "rewards/margins": 1.0349459648132324, + "rewards/rejected": -2.0964198112487793, + "step": 361 + }, + { + "epoch": 0.31, + "grad_norm": 82.91664116977694, + "learning_rate": 1.7426770574612708e-06, + "logits/chosen": -1.268636703491211, + "logits/rejected": -1.2521432638168335, + "logps/chosen": -695.606689453125, + "logps/rejected": -826.245361328125, + "loss": 0.4141, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3942124843597412, + "rewards/margins": 1.6098053455352783, + "rewards/rejected": -3.0040180683135986, + "step": 362 + }, + { + "epoch": 0.31, + "grad_norm": 86.52667633185308, + "learning_rate": 1.7406682325243482e-06, + "logits/chosen": -1.4180306196212769, + "logits/rejected": -1.4059098958969116, + "logps/chosen": -618.788330078125, + "logps/rejected": -620.6964721679688, + "loss": 0.4947, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9639477729797363, + "rewards/margins": 1.3326777219772339, + "rewards/rejected": -2.2966253757476807, + "step": 363 + }, + { + "epoch": 0.31, + "grad_norm": 45.85536985118218, + "learning_rate": 1.7386527644664328e-06, + "logits/chosen": -1.3537003993988037, + "logits/rejected": -1.235815167427063, + "logps/chosen": -436.8193359375, + "logps/rejected": -825.0286865234375, + "loss": 0.2462, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6824496984481812, + "rewards/margins": 2.1164891719818115, + "rewards/rejected": -2.798938751220703, + "step": 364 + }, + { + "epoch": 0.31, + "grad_norm": 58.52944726682894, + "learning_rate": 1.7366306713644416e-06, + "logits/chosen": -1.3131760358810425, + "logits/rejected": -1.2818546295166016, + "logps/chosen": -392.1138000488281, + "logps/rejected": -597.6663818359375, + "loss": 0.4684, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38064292073249817, + "rewards/margins": 1.8004159927368164, + "rewards/rejected": -2.181058883666992, + "step": 365 + }, + { + "epoch": 0.31, + "grad_norm": 32.81893405022377, + "learning_rate": 1.7346019713547121e-06, + "logits/chosen": -1.3495137691497803, + "logits/rejected": -1.2485442161560059, + "logps/chosen": -590.7689208984375, + "logps/rejected": -916.5122680664062, + "loss": 0.1726, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5310375690460205, + "rewards/margins": 2.2462780475616455, + "rewards/rejected": -2.777315616607666, + "step": 366 + }, + { + "epoch": 0.31, + "grad_norm": 45.22131227486043, + "learning_rate": 1.7325666826328397e-06, + "logits/chosen": -1.372185230255127, + "logits/rejected": -1.3290915489196777, + "logps/chosen": -532.4417114257812, + "logps/rejected": -732.6104736328125, + "loss": 0.2474, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4011019170284271, + "rewards/margins": 2.100841999053955, + "rewards/rejected": -2.501944065093994, + "step": 367 + }, + { + "epoch": 0.32, + "grad_norm": 766.8967712959084, + "learning_rate": 1.7305248234535156e-06, + "logits/chosen": -1.3028106689453125, + "logits/rejected": -1.2383298873901367, + "logps/chosen": -594.026611328125, + "logps/rejected": -714.35302734375, + "loss": 0.5844, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.14716491103172302, + "rewards/margins": 0.5474169850349426, + "rewards/rejected": -0.6945818662643433, + "step": 368 + }, + { + "epoch": 0.32, + "grad_norm": 74.31008210706332, + "learning_rate": 1.7284764121303599e-06, + "logits/chosen": -1.4357049465179443, + "logits/rejected": -1.3083711862564087, + "logps/chosen": -327.5078125, + "logps/rejected": -561.4296875, + "loss": 0.4596, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2435588240623474, + "rewards/margins": 0.7658228874206543, + "rewards/rejected": -1.009381651878357, + "step": 369 + }, + { + "epoch": 0.32, + "grad_norm": 43.852407517822435, + "learning_rate": 1.7264214670357613e-06, + "logits/chosen": -1.3738019466400146, + "logits/rejected": -1.344679832458496, + "logps/chosen": -456.21734619140625, + "logps/rejected": -683.3712158203125, + "loss": 0.2788, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6068999767303467, + "rewards/margins": 1.8802766799926758, + "rewards/rejected": -2.4871766567230225, + "step": 370 + }, + { + "epoch": 0.32, + "grad_norm": 48.945596081457964, + "learning_rate": 1.7243600066007104e-06, + "logits/chosen": -1.3600564002990723, + "logits/rejected": -1.2774728536605835, + "logps/chosen": -429.667236328125, + "logps/rejected": -666.998046875, + "loss": 0.3351, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7039352655410767, + "rewards/margins": 2.060344696044922, + "rewards/rejected": -2.764279842376709, + "step": 371 + }, + { + "epoch": 0.32, + "grad_norm": 87.69018998649285, + "learning_rate": 1.7222920493146336e-06, + "logits/chosen": -1.3533427715301514, + "logits/rejected": -1.2983510494232178, + "logps/chosen": -506.4881286621094, + "logps/rejected": -719.895751953125, + "loss": 0.6736, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1213555335998535, + "rewards/margins": 1.4744019508361816, + "rewards/rejected": -2.595757484436035, + "step": 372 + }, + { + "epoch": 0.32, + "grad_norm": 35.82422267357685, + "learning_rate": 1.7202176137252287e-06, + "logits/chosen": -1.364864706993103, + "logits/rejected": -1.2952933311462402, + "logps/chosen": -399.06549072265625, + "logps/rejected": -614.533447265625, + "loss": 0.2149, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.655327558517456, + "rewards/margins": 1.7265857458114624, + "rewards/rejected": -2.381913185119629, + "step": 373 + }, + { + "epoch": 0.32, + "grad_norm": 57.76953763457483, + "learning_rate": 1.7181367184382975e-06, + "logits/chosen": -1.2076451778411865, + "logits/rejected": -1.2127151489257812, + "logps/chosen": -641.7158203125, + "logps/rejected": -791.0142822265625, + "loss": 0.276, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.530408501625061, + "rewards/margins": 2.003429412841797, + "rewards/rejected": -2.5338380336761475, + "step": 374 + }, + { + "epoch": 0.32, + "grad_norm": 55.405902933053476, + "learning_rate": 1.7160493821175806e-06, + "logits/chosen": -1.3833491802215576, + "logits/rejected": -1.3392293453216553, + "logps/chosen": -614.4901733398438, + "logps/rejected": -714.2847290039062, + "loss": 0.2459, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.013306736946106, + "rewards/margins": 2.0114693641662598, + "rewards/rejected": -3.024775981903076, + "step": 375 + }, + { + "epoch": 0.32, + "grad_norm": 50.68328672154846, + "learning_rate": 1.7139556234845874e-06, + "logits/chosen": -1.4341458082199097, + "logits/rejected": -1.3658862113952637, + "logps/chosen": -387.8314208984375, + "logps/rejected": -540.5579223632812, + "loss": 0.3614, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8665677309036255, + "rewards/margins": 1.5501694679260254, + "rewards/rejected": -2.4167373180389404, + "step": 376 + }, + { + "epoch": 0.32, + "grad_norm": 59.60217381880438, + "learning_rate": 1.7118554613184302e-06, + "logits/chosen": -1.3032417297363281, + "logits/rejected": -1.2954423427581787, + "logps/chosen": -630.8447265625, + "logps/rejected": -679.8330078125, + "loss": 0.3405, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1230149269104004, + "rewards/margins": 1.624404788017273, + "rewards/rejected": -2.747419834136963, + "step": 377 + }, + { + "epoch": 0.32, + "grad_norm": 27.89135462360495, + "learning_rate": 1.7097489144556553e-06, + "logits/chosen": -1.3719645738601685, + "logits/rejected": -1.3334708213806152, + "logps/chosen": -499.67791748046875, + "logps/rejected": -713.4361572265625, + "loss": 0.1902, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8453254699707031, + "rewards/margins": 1.9828104972839355, + "rewards/rejected": -2.8281362056732178, + "step": 378 + }, + { + "epoch": 0.33, + "grad_norm": 60.874062147399314, + "learning_rate": 1.7076360017900742e-06, + "logits/chosen": -1.2090768814086914, + "logits/rejected": -1.1481618881225586, + "logps/chosen": -740.1016845703125, + "logps/rejected": -920.9785766601562, + "loss": 0.2601, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9678854942321777, + "rewards/margins": 2.6631102561950684, + "rewards/rejected": -3.630995988845825, + "step": 379 + }, + { + "epoch": 0.33, + "grad_norm": 47.97271908744414, + "learning_rate": 1.705516742272593e-06, + "logits/chosen": -1.2268987894058228, + "logits/rejected": -1.2583870887756348, + "logps/chosen": -560.3924560546875, + "logps/rejected": -637.6690673828125, + "loss": 0.4351, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.14536714553833, + "rewards/margins": 1.5063209533691406, + "rewards/rejected": -2.6516880989074707, + "step": 380 + }, + { + "epoch": 0.33, + "grad_norm": 57.222584056932256, + "learning_rate": 1.7033911549110438e-06, + "logits/chosen": -1.3473737239837646, + "logits/rejected": -1.3357570171356201, + "logps/chosen": -555.26025390625, + "logps/rejected": -700.132080078125, + "loss": 0.2627, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.967682957649231, + "rewards/margins": 1.708535075187683, + "rewards/rejected": -2.676218032836914, + "step": 381 + }, + { + "epoch": 0.33, + "grad_norm": 82.14782264995479, + "learning_rate": 1.7012592587700137e-06, + "logits/chosen": -1.3172829151153564, + "logits/rejected": -1.3071085214614868, + "logps/chosen": -502.27252197265625, + "logps/rejected": -572.5811157226562, + "loss": 0.4242, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.954552412033081, + "rewards/margins": 1.1110751628875732, + "rewards/rejected": -2.0656275749206543, + "step": 382 + }, + { + "epoch": 0.33, + "grad_norm": 39.535486116430114, + "learning_rate": 1.6991210729706743e-06, + "logits/chosen": -1.3450148105621338, + "logits/rejected": -1.3007009029388428, + "logps/chosen": -380.4422607421875, + "logps/rejected": -574.0826416015625, + "loss": 0.2487, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5027610063552856, + "rewards/margins": 1.7666382789611816, + "rewards/rejected": -2.2693991661071777, + "step": 383 + }, + { + "epoch": 0.33, + "grad_norm": 115.97105805630862, + "learning_rate": 1.6969766166906085e-06, + "logits/chosen": -1.2329654693603516, + "logits/rejected": -1.2378525733947754, + "logps/chosen": -557.4694213867188, + "logps/rejected": -650.35498046875, + "loss": 0.4743, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0643105506896973, + "rewards/margins": 1.2340928316116333, + "rewards/rejected": -2.298403263092041, + "step": 384 + }, + { + "epoch": 0.33, + "grad_norm": 46.15615789349278, + "learning_rate": 1.694825909163641e-06, + "logits/chosen": -1.415977120399475, + "logits/rejected": -1.3116364479064941, + "logps/chosen": -425.91217041015625, + "logps/rejected": -695.7625732421875, + "loss": 0.3217, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9717909097671509, + "rewards/margins": 1.7135262489318848, + "rewards/rejected": -2.685317039489746, + "step": 385 + }, + { + "epoch": 0.33, + "grad_norm": 71.6040609635783, + "learning_rate": 1.6926689696796636e-06, + "logits/chosen": -1.3962388038635254, + "logits/rejected": -1.2283735275268555, + "logps/chosen": -575.3253173828125, + "logps/rejected": -874.0101928710938, + "loss": 0.2812, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8452811241149902, + "rewards/margins": 2.1846938133239746, + "rewards/rejected": -3.029974937438965, + "step": 386 + }, + { + "epoch": 0.33, + "grad_norm": 36.61658137478186, + "learning_rate": 1.6905058175844637e-06, + "logits/chosen": -1.3449249267578125, + "logits/rejected": -1.2902624607086182, + "logps/chosen": -444.6363525390625, + "logps/rejected": -710.4901123046875, + "loss": 0.2367, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.596918523311615, + "rewards/margins": 2.046452522277832, + "rewards/rejected": -2.643371105194092, + "step": 387 + }, + { + "epoch": 0.33, + "grad_norm": 57.3810553940541, + "learning_rate": 1.6883364722795498e-06, + "logits/chosen": -1.337453007698059, + "logits/rejected": -1.2524361610412598, + "logps/chosen": -415.93402099609375, + "logps/rejected": -661.5554809570312, + "loss": 0.3094, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37221938371658325, + "rewards/margins": 1.6274501085281372, + "rewards/rejected": -1.9996695518493652, + "step": 388 + }, + { + "epoch": 0.33, + "grad_norm": 51.57334994385235, + "learning_rate": 1.686160953221978e-06, + "logits/chosen": -1.2846753597259521, + "logits/rejected": -1.1942753791809082, + "logps/chosen": -485.0477294921875, + "logps/rejected": -710.1423950195312, + "loss": 0.3319, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6163160800933838, + "rewards/margins": 1.3468387126922607, + "rewards/rejected": -1.9631547927856445, + "step": 389 + }, + { + "epoch": 0.33, + "grad_norm": 59.367200479176645, + "learning_rate": 1.6839792799241771e-06, + "logits/chosen": -1.2812542915344238, + "logits/rejected": -1.2009220123291016, + "logps/chosen": -701.4754028320312, + "logps/rejected": -948.4730224609375, + "loss": 0.298, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8413772583007812, + "rewards/margins": 2.0989725589752197, + "rewards/rejected": -2.940349578857422, + "step": 390 + }, + { + "epoch": 0.34, + "grad_norm": 31.270768426311996, + "learning_rate": 1.6817914719537748e-06, + "logits/chosen": -1.2630783319473267, + "logits/rejected": -1.1632015705108643, + "logps/chosen": -367.569091796875, + "logps/rejected": -747.9874267578125, + "loss": 0.1791, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2967870831489563, + "rewards/margins": 2.931680679321289, + "rewards/rejected": -3.2284677028656006, + "step": 391 + }, + { + "epoch": 0.34, + "grad_norm": 87.98851035196604, + "learning_rate": 1.6795975489334193e-06, + "logits/chosen": -1.2246571779251099, + "logits/rejected": -1.2379138469696045, + "logps/chosen": -650.24951171875, + "logps/rejected": -619.5977172851562, + "loss": 0.6714, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3291599750518799, + "rewards/margins": 0.8845857381820679, + "rewards/rejected": -2.213745594024658, + "step": 392 + }, + { + "epoch": 0.34, + "grad_norm": 50.650897632871555, + "learning_rate": 1.677397530540608e-06, + "logits/chosen": -1.3371104001998901, + "logits/rejected": -1.279056429862976, + "logps/chosen": -473.9917297363281, + "logps/rejected": -733.524169921875, + "loss": 0.2614, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7630621194839478, + "rewards/margins": 1.9246776103973389, + "rewards/rejected": -2.687739849090576, + "step": 393 + }, + { + "epoch": 0.34, + "grad_norm": 64.88482584948143, + "learning_rate": 1.675191436507505e-06, + "logits/chosen": -1.2914042472839355, + "logits/rejected": -1.2568284273147583, + "logps/chosen": -417.3004150390625, + "logps/rejected": -581.8896484375, + "loss": 0.4118, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6889206171035767, + "rewards/margins": 1.3778702020645142, + "rewards/rejected": -2.06679105758667, + "step": 394 + }, + { + "epoch": 0.34, + "grad_norm": 37.73605637814536, + "learning_rate": 1.6729792866207703e-06, + "logits/chosen": -1.283979892730713, + "logits/rejected": -1.2231981754302979, + "logps/chosen": -483.034912109375, + "logps/rejected": -708.1431884765625, + "loss": 0.2694, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.929643452167511, + "rewards/margins": 2.2847232818603516, + "rewards/rejected": -3.2143666744232178, + "step": 395 + }, + { + "epoch": 0.34, + "grad_norm": 51.231246461675696, + "learning_rate": 1.6707611007213778e-06, + "logits/chosen": -1.3425097465515137, + "logits/rejected": -1.2905793190002441, + "logps/chosen": -516.3165283203125, + "logps/rejected": -699.6722412109375, + "loss": 0.2348, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7765346765518188, + "rewards/margins": 2.0664963722229004, + "rewards/rejected": -2.8430309295654297, + "step": 396 + }, + { + "epoch": 0.34, + "grad_norm": 47.72450656288295, + "learning_rate": 1.6685368987044392e-06, + "logits/chosen": -1.3056159019470215, + "logits/rejected": -1.2380082607269287, + "logps/chosen": -491.7803955078125, + "logps/rejected": -702.978759765625, + "loss": 0.2248, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6762930154800415, + "rewards/margins": 2.062279224395752, + "rewards/rejected": -2.738572597503662, + "step": 397 + }, + { + "epoch": 0.34, + "grad_norm": 70.21202586898191, + "learning_rate": 1.6663067005190254e-06, + "logits/chosen": -1.143059253692627, + "logits/rejected": -1.0712796449661255, + "logps/chosen": -617.78076171875, + "logps/rejected": -838.5026245117188, + "loss": 0.456, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4345775842666626, + "rewards/margins": 1.6229698657989502, + "rewards/rejected": -3.0575473308563232, + "step": 398 + }, + { + "epoch": 0.34, + "grad_norm": 46.56299997352908, + "learning_rate": 1.6640705261679883e-06, + "logits/chosen": -1.2313382625579834, + "logits/rejected": -1.1530615091323853, + "logps/chosen": -631.331298828125, + "logps/rejected": -757.3225708007812, + "loss": 0.2068, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8394178152084351, + "rewards/margins": 2.230578899383545, + "rewards/rejected": -3.0699968338012695, + "step": 399 + }, + { + "epoch": 0.34, + "grad_norm": 63.84886780378537, + "learning_rate": 1.6618283957077787e-06, + "logits/chosen": -1.2598519325256348, + "logits/rejected": -1.2227567434310913, + "logps/chosen": -512.57958984375, + "logps/rejected": -578.773681640625, + "loss": 0.3582, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6850414276123047, + "rewards/margins": 1.4070050716400146, + "rewards/rejected": -2.0920464992523193, + "step": 400 + }, + { + "epoch": 0.34, + "grad_norm": 35.51582894200608, + "learning_rate": 1.6595803292482699e-06, + "logits/chosen": -1.271228551864624, + "logits/rejected": -1.226959228515625, + "logps/chosen": -730.7811279296875, + "logps/rejected": -896.078125, + "loss": 0.181, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9396979808807373, + "rewards/margins": 2.190903902053833, + "rewards/rejected": -3.1306021213531494, + "step": 401 + }, + { + "epoch": 0.34, + "grad_norm": 28.390943749629226, + "learning_rate": 1.6573263469525754e-06, + "logits/chosen": -1.1954461336135864, + "logits/rejected": -1.1502574682235718, + "logps/chosen": -529.775634765625, + "logps/rejected": -699.5891723632812, + "loss": 0.1812, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.41936323046684265, + "rewards/margins": 2.2371792793273926, + "rewards/rejected": -2.6565425395965576, + "step": 402 + }, + { + "epoch": 0.35, + "grad_norm": 44.35213726968621, + "learning_rate": 1.6550664690368678e-06, + "logits/chosen": -1.2156412601470947, + "logits/rejected": -1.1602269411087036, + "logps/chosen": -542.976318359375, + "logps/rejected": -711.1033325195312, + "loss": 0.2548, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6662370562553406, + "rewards/margins": 1.8856029510498047, + "rewards/rejected": -2.551839828491211, + "step": 403 + }, + { + "epoch": 0.35, + "grad_norm": 75.17976256340974, + "learning_rate": 1.6528007157701986e-06, + "logits/chosen": -1.204666018486023, + "logits/rejected": -1.183717966079712, + "logps/chosen": -747.551025390625, + "logps/rejected": -933.54736328125, + "loss": 0.4962, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.092570185661316, + "rewards/margins": 1.9422069787979126, + "rewards/rejected": -3.0347769260406494, + "step": 404 + }, + { + "epoch": 0.35, + "grad_norm": 103.04235564300753, + "learning_rate": 1.6505291074743157e-06, + "logits/chosen": -1.3840012550354004, + "logits/rejected": -1.312503457069397, + "logps/chosen": -491.85076904296875, + "logps/rejected": -664.5662841796875, + "loss": 0.5429, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6823098063468933, + "rewards/margins": 1.6049988269805908, + "rewards/rejected": -2.28730845451355, + "step": 405 + }, + { + "epoch": 0.35, + "grad_norm": 85.63345072930973, + "learning_rate": 1.6482516645234811e-06, + "logits/chosen": -1.2395625114440918, + "logits/rejected": -1.152807593345642, + "logps/chosen": -504.61492919921875, + "logps/rejected": -706.6768798828125, + "loss": 0.4778, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9438247680664062, + "rewards/margins": 2.0758321285247803, + "rewards/rejected": -3.0196568965911865, + "step": 406 + }, + { + "epoch": 0.35, + "grad_norm": 59.471028399123476, + "learning_rate": 1.6459684073442887e-06, + "logits/chosen": -1.1889766454696655, + "logits/rejected": -1.1216541528701782, + "logps/chosen": -654.0748901367188, + "logps/rejected": -957.9883422851562, + "loss": 0.2074, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9139692783355713, + "rewards/margins": 2.6043825149536133, + "rewards/rejected": -3.5183520317077637, + "step": 407 + }, + { + "epoch": 0.35, + "grad_norm": 50.30155647050052, + "learning_rate": 1.6436793564154808e-06, + "logits/chosen": -1.2363783121109009, + "logits/rejected": -1.2209937572479248, + "logps/chosen": -587.3294067382812, + "logps/rejected": -718.9139404296875, + "loss": 0.2917, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8330459594726562, + "rewards/margins": 2.0317726135253906, + "rewards/rejected": -2.864818572998047, + "step": 408 + }, + { + "epoch": 0.35, + "grad_norm": 65.97832606363103, + "learning_rate": 1.6413845322677635e-06, + "logits/chosen": -1.1675924062728882, + "logits/rejected": -1.1158082485198975, + "logps/chosen": -668.6107177734375, + "logps/rejected": -825.7411499023438, + "loss": 0.3324, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9506618976593018, + "rewards/margins": 2.0555379390716553, + "rewards/rejected": -3.006199836730957, + "step": 409 + }, + { + "epoch": 0.35, + "grad_norm": 122.0389351510737, + "learning_rate": 1.639083955483625e-06, + "logits/chosen": -1.356440782546997, + "logits/rejected": -1.3123233318328857, + "logps/chosen": -572.5408935546875, + "logps/rejected": -647.79736328125, + "loss": 0.7814, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5167076587677002, + "rewards/margins": 0.6254584789276123, + "rewards/rejected": -2.1421661376953125, + "step": 410 + }, + { + "epoch": 0.35, + "grad_norm": 66.59653601012016, + "learning_rate": 1.6367776466971475e-06, + "logits/chosen": -1.3011500835418701, + "logits/rejected": -1.251386046409607, + "logps/chosen": -421.4635314941406, + "logps/rejected": -582.5531005859375, + "loss": 0.4516, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6226481199264526, + "rewards/margins": 1.8402199745178223, + "rewards/rejected": -2.4628682136535645, + "step": 411 + }, + { + "epoch": 0.35, + "grad_norm": 71.05323766925942, + "learning_rate": 1.6344656265938258e-06, + "logits/chosen": -1.2155036926269531, + "logits/rejected": -1.1783559322357178, + "logps/chosen": -667.48388671875, + "logps/rejected": -756.5985107421875, + "loss": 0.2926, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7361356616020203, + "rewards/margins": 2.011352062225342, + "rewards/rejected": -2.747488021850586, + "step": 412 + }, + { + "epoch": 0.35, + "grad_norm": 68.98412970305316, + "learning_rate": 1.6321479159103786e-06, + "logits/chosen": -1.304020643234253, + "logits/rejected": -1.280059576034546, + "logps/chosen": -566.3141479492188, + "logps/rejected": -568.9078979492188, + "loss": 0.5021, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8730708360671997, + "rewards/margins": 1.178234338760376, + "rewards/rejected": -2.0513052940368652, + "step": 413 + }, + { + "epoch": 0.36, + "grad_norm": 38.10469841786047, + "learning_rate": 1.6298245354345654e-06, + "logits/chosen": -1.4465073347091675, + "logits/rejected": -1.380890965461731, + "logps/chosen": -570.2463989257812, + "logps/rejected": -813.3609619140625, + "loss": 0.2257, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6123977899551392, + "rewards/margins": 2.4449164867401123, + "rewards/rejected": -3.057314157485962, + "step": 414 + }, + { + "epoch": 0.36, + "grad_norm": 63.94973697259471, + "learning_rate": 1.6274955060049972e-06, + "logits/chosen": -1.3530452251434326, + "logits/rejected": -1.2638301849365234, + "logps/chosen": -340.5704345703125, + "logps/rejected": -459.89715576171875, + "loss": 0.4984, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5781039595603943, + "rewards/margins": 1.089951753616333, + "rewards/rejected": -1.668055534362793, + "step": 415 + }, + { + "epoch": 0.36, + "grad_norm": 42.9621695989501, + "learning_rate": 1.6251608485109519e-06, + "logits/chosen": -1.4585225582122803, + "logits/rejected": -1.4104504585266113, + "logps/chosen": -495.1009216308594, + "logps/rejected": -635.91748046875, + "loss": 0.2641, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.71750408411026, + "rewards/margins": 1.8936041593551636, + "rewards/rejected": -2.6111083030700684, + "step": 416 + }, + { + "epoch": 0.36, + "grad_norm": 56.19773710256689, + "learning_rate": 1.622820583892185e-06, + "logits/chosen": -1.4692611694335938, + "logits/rejected": -1.3992235660552979, + "logps/chosen": -479.1830749511719, + "logps/rejected": -642.672119140625, + "loss": 0.383, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5529279112815857, + "rewards/margins": 1.683672308921814, + "rewards/rejected": -2.236600399017334, + "step": 417 + }, + { + "epoch": 0.36, + "grad_norm": 66.9721595221014, + "learning_rate": 1.6204747331387448e-06, + "logits/chosen": -1.3393265008926392, + "logits/rejected": -1.2750935554504395, + "logps/chosen": -490.73211669921875, + "logps/rejected": -633.0335083007812, + "loss": 0.558, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6580825448036194, + "rewards/margins": 1.143843650817871, + "rewards/rejected": -1.8019261360168457, + "step": 418 + }, + { + "epoch": 0.36, + "grad_norm": 41.10087728789618, + "learning_rate": 1.6181233172907796e-06, + "logits/chosen": -1.4568686485290527, + "logits/rejected": -1.3738291263580322, + "logps/chosen": -479.49639892578125, + "logps/rejected": -744.3993530273438, + "loss": 0.2447, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5038151741027832, + "rewards/margins": 1.8586931228637695, + "rewards/rejected": -2.3625082969665527, + "step": 419 + }, + { + "epoch": 0.36, + "grad_norm": 54.06812565547437, + "learning_rate": 1.6157663574383538e-06, + "logits/chosen": -1.2719099521636963, + "logits/rejected": -1.268294334411621, + "logps/chosen": -628.4613647460938, + "logps/rejected": -751.2962036132812, + "loss": 0.2633, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6819362640380859, + "rewards/margins": 1.8705052137374878, + "rewards/rejected": -2.5524415969848633, + "step": 420 + }, + { + "epoch": 0.36, + "grad_norm": 97.92587825067058, + "learning_rate": 1.6134038747212544e-06, + "logits/chosen": -1.448411464691162, + "logits/rejected": -1.3822431564331055, + "logps/chosen": -352.2517395019531, + "logps/rejected": -514.0858154296875, + "loss": 0.3892, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6510804891586304, + "rewards/margins": 1.0912226438522339, + "rewards/rejected": -1.7423030138015747, + "step": 421 + }, + { + "epoch": 0.36, + "grad_norm": 50.430038449778294, + "learning_rate": 1.6110358903288056e-06, + "logits/chosen": -1.3349554538726807, + "logits/rejected": -1.2732388973236084, + "logps/chosen": -494.1665954589844, + "logps/rejected": -697.268798828125, + "loss": 0.3301, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8012133240699768, + "rewards/margins": 1.9825458526611328, + "rewards/rejected": -2.783759117126465, + "step": 422 + }, + { + "epoch": 0.36, + "grad_norm": 59.394078146172234, + "learning_rate": 1.6086624254996748e-06, + "logits/chosen": -1.2826063632965088, + "logits/rejected": -1.2882716655731201, + "logps/chosen": -613.343505859375, + "logps/rejected": -738.014404296875, + "loss": 0.272, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8677641153335571, + "rewards/margins": 1.6630736589431763, + "rewards/rejected": -2.5308375358581543, + "step": 423 + }, + { + "epoch": 0.36, + "grad_norm": 44.53266060036663, + "learning_rate": 1.6062835015216854e-06, + "logits/chosen": -1.4333112239837646, + "logits/rejected": -1.3385894298553467, + "logps/chosen": -624.1317138671875, + "logps/rejected": -977.5977783203125, + "loss": 0.2785, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29590773582458496, + "rewards/margins": 2.2630515098571777, + "rewards/rejected": -2.558959484100342, + "step": 424 + }, + { + "epoch": 0.36, + "grad_norm": 86.4811714615724, + "learning_rate": 1.6038991397316232e-06, + "logits/chosen": -1.2939656972885132, + "logits/rejected": -1.2937078475952148, + "logps/chosen": -461.0428466796875, + "logps/rejected": -537.1651611328125, + "loss": 0.5664, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.44620293378829956, + "rewards/margins": 1.0640058517456055, + "rewards/rejected": -1.5102088451385498, + "step": 425 + }, + { + "epoch": 0.37, + "grad_norm": 85.17690049322667, + "learning_rate": 1.601509361515047e-06, + "logits/chosen": -1.366929054260254, + "logits/rejected": -1.3334461450576782, + "logps/chosen": -486.15484619140625, + "logps/rejected": -612.0838623046875, + "loss": 0.5528, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.74158775806427, + "rewards/margins": 0.6057339906692505, + "rewards/rejected": -1.347321629524231, + "step": 426 + }, + { + "epoch": 0.37, + "grad_norm": 58.22263704872626, + "learning_rate": 1.5991141883060958e-06, + "logits/chosen": -1.3588616847991943, + "logits/rejected": -1.2897999286651611, + "logps/chosen": -541.0726318359375, + "logps/rejected": -708.3511962890625, + "loss": 0.3274, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3630627393722534, + "rewards/margins": 1.3744697570800781, + "rewards/rejected": -1.7375324964523315, + "step": 427 + }, + { + "epoch": 0.37, + "grad_norm": 39.379425670836454, + "learning_rate": 1.5967136415872966e-06, + "logits/chosen": -1.4818212985992432, + "logits/rejected": -1.4103446006774902, + "logps/chosen": -411.818115234375, + "logps/rejected": -620.8701171875, + "loss": 0.3029, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.19901621341705322, + "rewards/margins": 1.6631044149398804, + "rewards/rejected": -1.8621206283569336, + "step": 428 + }, + { + "epoch": 0.37, + "grad_norm": 71.56604182577863, + "learning_rate": 1.5943077428893724e-06, + "logits/chosen": -1.1709115505218506, + "logits/rejected": -1.1403000354766846, + "logps/chosen": -498.3337707519531, + "logps/rejected": -666.076904296875, + "loss": 0.3815, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.43662720918655396, + "rewards/margins": 1.2479010820388794, + "rewards/rejected": -1.684528112411499, + "step": 429 + }, + { + "epoch": 0.37, + "grad_norm": 46.784228562458715, + "learning_rate": 1.5918965137910478e-06, + "logits/chosen": -1.3150413036346436, + "logits/rejected": -1.2522664070129395, + "logps/chosen": -569.6610107421875, + "logps/rejected": -761.95556640625, + "loss": 0.2254, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8883675932884216, + "rewards/margins": 1.8457796573638916, + "rewards/rejected": -2.734147310256958, + "step": 430 + }, + { + "epoch": 0.37, + "grad_norm": 45.06090655791921, + "learning_rate": 1.589479975918857e-06, + "logits/chosen": -1.3054924011230469, + "logits/rejected": -1.310208797454834, + "logps/chosen": -558.9491577148438, + "logps/rejected": -629.2244873046875, + "loss": 0.3722, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9529801607131958, + "rewards/margins": 1.5737230777740479, + "rewards/rejected": -2.526703357696533, + "step": 431 + }, + { + "epoch": 0.37, + "grad_norm": 65.54274021493728, + "learning_rate": 1.5870581509469486e-06, + "logits/chosen": -1.0618208646774292, + "logits/rejected": -1.030731201171875, + "logps/chosen": -448.14190673828125, + "logps/rejected": -519.9080810546875, + "loss": 0.4505, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.646375298500061, + "rewards/margins": 1.157083511352539, + "rewards/rejected": -1.8034586906433105, + "step": 432 + }, + { + "epoch": 0.37, + "grad_norm": 39.39951403805403, + "learning_rate": 1.5846310605968923e-06, + "logits/chosen": -1.3308734893798828, + "logits/rejected": -1.255303978919983, + "logps/chosen": -635.2354736328125, + "logps/rejected": -828.3485107421875, + "loss": 0.2347, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5919246673583984, + "rewards/margins": 2.2630014419555664, + "rewards/rejected": -2.854926109313965, + "step": 433 + }, + { + "epoch": 0.37, + "grad_norm": 69.90226205895301, + "learning_rate": 1.5821987266374826e-06, + "logits/chosen": -1.2554914951324463, + "logits/rejected": -1.2407859563827515, + "logps/chosen": -719.94580078125, + "logps/rejected": -779.0153198242188, + "loss": 0.2793, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9973692893981934, + "rewards/margins": 1.8503321409225464, + "rewards/rejected": -2.8477015495300293, + "step": 434 + }, + { + "epoch": 0.37, + "grad_norm": 64.8893318380094, + "learning_rate": 1.5797611708845447e-06, + "logits/chosen": -1.3571207523345947, + "logits/rejected": -1.3038105964660645, + "logps/chosen": -515.7115478515625, + "logps/rejected": -759.386474609375, + "loss": 0.3357, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5526295900344849, + "rewards/margins": 2.4585442543029785, + "rewards/rejected": -3.011174201965332, + "step": 435 + }, + { + "epoch": 0.37, + "grad_norm": 33.07107510329828, + "learning_rate": 1.577318415200739e-06, + "logits/chosen": -1.208054780960083, + "logits/rejected": -1.1525557041168213, + "logps/chosen": -488.433837890625, + "logps/rejected": -620.9463500976562, + "loss": 0.1896, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.821509599685669, + "rewards/margins": 2.117979049682617, + "rewards/rejected": -2.939488649368286, + "step": 436 + }, + { + "epoch": 0.37, + "grad_norm": 73.00783877428485, + "learning_rate": 1.5748704814953643e-06, + "logits/chosen": -1.3184621334075928, + "logits/rejected": -1.2896759510040283, + "logps/chosen": -582.6005859375, + "logps/rejected": -633.3236694335938, + "loss": 0.3145, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8894643783569336, + "rewards/margins": 1.5803616046905518, + "rewards/rejected": -2.4698259830474854, + "step": 437 + }, + { + "epoch": 0.38, + "grad_norm": 77.38950721995847, + "learning_rate": 1.5724173917241611e-06, + "logits/chosen": -1.3838648796081543, + "logits/rejected": -1.3053375482559204, + "logps/chosen": -542.278564453125, + "logps/rejected": -678.4103393554688, + "loss": 0.5398, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7684734463691711, + "rewards/margins": 1.3273355960845947, + "rewards/rejected": -2.095808982849121, + "step": 438 + }, + { + "epoch": 0.38, + "grad_norm": 45.569714523985276, + "learning_rate": 1.5699591678891157e-06, + "logits/chosen": -1.4103734493255615, + "logits/rejected": -1.2873899936676025, + "logps/chosen": -566.2435302734375, + "logps/rejected": -853.83740234375, + "loss": 0.2647, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6442338228225708, + "rewards/margins": 2.577425241470337, + "rewards/rejected": -3.2216591835021973, + "step": 439 + }, + { + "epoch": 0.38, + "grad_norm": 41.351074889929265, + "learning_rate": 1.5674958320382623e-06, + "logits/chosen": -1.4523862600326538, + "logits/rejected": -1.3567168712615967, + "logps/chosen": -538.0159912109375, + "logps/rejected": -816.7225341796875, + "loss": 0.2577, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5122110843658447, + "rewards/margins": 1.928330421447754, + "rewards/rejected": -2.4405415058135986, + "step": 440 + }, + { + "epoch": 0.38, + "grad_norm": 54.720135217149405, + "learning_rate": 1.5650274062654844e-06, + "logits/chosen": -1.1927452087402344, + "logits/rejected": -1.1439197063446045, + "logps/chosen": -622.1829833984375, + "logps/rejected": -811.093017578125, + "loss": 0.2502, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5815736055374146, + "rewards/margins": 1.7208086252212524, + "rewards/rejected": -2.302382469177246, + "step": 441 + }, + { + "epoch": 0.38, + "grad_norm": 72.94654723585565, + "learning_rate": 1.5625539127103188e-06, + "logits/chosen": -1.243112564086914, + "logits/rejected": -1.1448404788970947, + "logps/chosen": -459.63360595703125, + "logps/rejected": -657.576171875, + "loss": 0.5021, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09865646809339523, + "rewards/margins": 0.6924504637718201, + "rewards/rejected": -0.7911069393157959, + "step": 442 + }, + { + "epoch": 0.38, + "grad_norm": 50.440504920610245, + "learning_rate": 1.5600753735577547e-06, + "logits/chosen": -1.264277458190918, + "logits/rejected": -1.223200798034668, + "logps/chosen": -386.2403259277344, + "logps/rejected": -529.3462524414062, + "loss": 0.4116, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.03323012590408325, + "rewards/margins": 0.9878279566764832, + "rewards/rejected": -0.9545978903770447, + "step": 443 + }, + { + "epoch": 0.38, + "grad_norm": 74.39359903694583, + "learning_rate": 1.5575918110380362e-06, + "logits/chosen": -1.182845115661621, + "logits/rejected": -1.1100221872329712, + "logps/chosen": -555.8534545898438, + "logps/rejected": -643.8975830078125, + "loss": 0.3459, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7339385747909546, + "rewards/margins": 1.4555790424346924, + "rewards/rejected": -2.1895174980163574, + "step": 444 + }, + { + "epoch": 0.38, + "grad_norm": 72.22911123903084, + "learning_rate": 1.5551032474264618e-06, + "logits/chosen": -1.075033187866211, + "logits/rejected": -1.032829999923706, + "logps/chosen": -525.1008911132812, + "logps/rejected": -743.3673706054688, + "loss": 0.4397, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8419838547706604, + "rewards/margins": 1.3715593814849854, + "rewards/rejected": -2.21354341506958, + "step": 445 + }, + { + "epoch": 0.38, + "grad_norm": 45.646517241158904, + "learning_rate": 1.5526097050431863e-06, + "logits/chosen": -1.1065202951431274, + "logits/rejected": -1.0796585083007812, + "logps/chosen": -541.9082641601562, + "logps/rejected": -585.971435546875, + "loss": 0.3449, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7269726991653442, + "rewards/margins": 1.2809374332427979, + "rewards/rejected": -2.0079102516174316, + "step": 446 + }, + { + "epoch": 0.38, + "grad_norm": 62.86632108103949, + "learning_rate": 1.5501112062530185e-06, + "logits/chosen": -1.0184428691864014, + "logits/rejected": -0.985652506351471, + "logps/chosen": -603.8740844726562, + "logps/rejected": -795.9415283203125, + "loss": 0.3501, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2108818292617798, + "rewards/margins": 1.2921862602233887, + "rewards/rejected": -2.503067970275879, + "step": 447 + }, + { + "epoch": 0.38, + "grad_norm": 33.58903116651355, + "learning_rate": 1.5476077734652222e-06, + "logits/chosen": -0.8725420236587524, + "logits/rejected": -0.8370548486709595, + "logps/chosen": -456.96435546875, + "logps/rejected": -775.3349609375, + "loss": 0.2112, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5516107082366943, + "rewards/margins": 2.407532215118408, + "rewards/rejected": -2.9591431617736816, + "step": 448 + }, + { + "epoch": 0.39, + "grad_norm": 47.09393568036316, + "learning_rate": 1.5450994291333151e-06, + "logits/chosen": -0.9912916421890259, + "logits/rejected": -0.9687042236328125, + "logps/chosen": -385.6941833496094, + "logps/rejected": -556.1038818359375, + "loss": 0.4064, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5287074446678162, + "rewards/margins": 1.4976418018341064, + "rewards/rejected": -2.0263493061065674, + "step": 449 + }, + { + "epoch": 0.39, + "grad_norm": 73.63416167778202, + "learning_rate": 1.5425861957548657e-06, + "logits/chosen": -1.1284853219985962, + "logits/rejected": -1.0713797807693481, + "logps/chosen": -629.8704833984375, + "logps/rejected": -706.3995361328125, + "loss": 0.3777, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2089190483093262, + "rewards/margins": 1.2346839904785156, + "rewards/rejected": -2.443603038787842, + "step": 450 + }, + { + "epoch": 0.39, + "grad_norm": 72.98530696780311, + "learning_rate": 1.5400680958712942e-06, + "logits/chosen": -1.0030968189239502, + "logits/rejected": -0.974974513053894, + "logps/chosen": -400.2067565917969, + "logps/rejected": -490.24041748046875, + "loss": 0.3404, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5827921032905579, + "rewards/margins": 1.6719770431518555, + "rewards/rejected": -2.2547693252563477, + "step": 451 + }, + { + "epoch": 0.39, + "grad_norm": 38.68236963528052, + "learning_rate": 1.5375451520676684e-06, + "logits/chosen": -1.0761189460754395, + "logits/rejected": -1.0248160362243652, + "logps/chosen": -408.9266357421875, + "logps/rejected": -681.0333251953125, + "loss": 0.2131, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7555465698242188, + "rewards/margins": 1.823000192642212, + "rewards/rejected": -2.5785470008850098, + "step": 452 + }, + { + "epoch": 0.39, + "grad_norm": 107.75050665764365, + "learning_rate": 1.5350173869725009e-06, + "logits/chosen": -0.9483497142791748, + "logits/rejected": -0.9131959676742554, + "logps/chosen": -538.731689453125, + "logps/rejected": -595.2650756835938, + "loss": 0.6406, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9458996057510376, + "rewards/margins": 1.2236244678497314, + "rewards/rejected": -2.1695239543914795, + "step": 453 + }, + { + "epoch": 0.39, + "grad_norm": 53.19153906331593, + "learning_rate": 1.5324848232575482e-06, + "logits/chosen": -0.9808636903762817, + "logits/rejected": -0.9459196329116821, + "logps/chosen": -538.7525634765625, + "logps/rejected": -663.7606201171875, + "loss": 0.2758, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0494015216827393, + "rewards/margins": 1.7582777738571167, + "rewards/rejected": -2.8076791763305664, + "step": 454 + }, + { + "epoch": 0.39, + "grad_norm": 54.0164683776154, + "learning_rate": 1.5299474836376055e-06, + "logits/chosen": -1.0516619682312012, + "logits/rejected": -1.0118380784988403, + "logps/chosen": -467.1536560058594, + "logps/rejected": -640.5703125, + "loss": 0.297, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.914126455783844, + "rewards/margins": 1.8050568103790283, + "rewards/rejected": -2.7191834449768066, + "step": 455 + }, + { + "epoch": 0.39, + "grad_norm": 69.76944754945215, + "learning_rate": 1.5274053908703033e-06, + "logits/chosen": -0.9428573846817017, + "logits/rejected": -0.8974170684814453, + "logps/chosen": -631.4688720703125, + "logps/rejected": -762.390380859375, + "loss": 0.4408, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.977918267250061, + "rewards/margins": 1.8812000751495361, + "rewards/rejected": -2.8591182231903076, + "step": 456 + }, + { + "epoch": 0.39, + "grad_norm": 35.547985558837226, + "learning_rate": 1.5248585677559032e-06, + "logits/chosen": -0.9528816342353821, + "logits/rejected": -0.9294205904006958, + "logps/chosen": -422.3802490234375, + "logps/rejected": -676.3129272460938, + "loss": 0.2149, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5416313409805298, + "rewards/margins": 2.1708362102508545, + "rewards/rejected": -2.712467670440674, + "step": 457 + }, + { + "epoch": 0.39, + "grad_norm": 45.82051469410822, + "learning_rate": 1.5223070371370953e-06, + "logits/chosen": -0.8964556455612183, + "logits/rejected": -0.8561989068984985, + "logps/chosen": -565.7823486328125, + "logps/rejected": -795.1091918945312, + "loss": 0.1674, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7855107188224792, + "rewards/margins": 2.320809841156006, + "rewards/rejected": -3.106320381164551, + "step": 458 + }, + { + "epoch": 0.39, + "grad_norm": 62.130735305048916, + "learning_rate": 1.51975082189879e-06, + "logits/chosen": -0.9833825826644897, + "logits/rejected": -0.9350192546844482, + "logps/chosen": -642.7136840820312, + "logps/rejected": -894.32568359375, + "loss": 0.3018, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6459782123565674, + "rewards/margins": 2.0245954990386963, + "rewards/rejected": -2.6705734729766846, + "step": 459 + }, + { + "epoch": 0.39, + "grad_norm": 43.43854358154253, + "learning_rate": 1.517189944967915e-06, + "logits/chosen": -0.9655320644378662, + "logits/rejected": -0.9203553199768066, + "logps/chosen": -484.23211669921875, + "logps/rejected": -691.60693359375, + "loss": 0.1823, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.607116162776947, + "rewards/margins": 2.231950044631958, + "rewards/rejected": -2.83906626701355, + "step": 460 + }, + { + "epoch": 0.4, + "grad_norm": 55.04848780763436, + "learning_rate": 1.5146244293132094e-06, + "logits/chosen": -0.9897803068161011, + "logits/rejected": -0.9608893394470215, + "logps/chosen": -388.45849609375, + "logps/rejected": -519.8792724609375, + "loss": 0.4448, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5719874501228333, + "rewards/margins": 1.5956003665924072, + "rewards/rejected": -2.1675877571105957, + "step": 461 + }, + { + "epoch": 0.4, + "grad_norm": 56.574543353720905, + "learning_rate": 1.5120542979450173e-06, + "logits/chosen": -0.928221583366394, + "logits/rejected": -0.8038352727890015, + "logps/chosen": -673.7662353515625, + "logps/rejected": -914.059326171875, + "loss": 0.2237, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8671103715896606, + "rewards/margins": 2.07771635055542, + "rewards/rejected": -2.94482684135437, + "step": 462 + }, + { + "epoch": 0.4, + "grad_norm": 62.25539236311385, + "learning_rate": 1.509479573915082e-06, + "logits/chosen": -0.8410499095916748, + "logits/rejected": -0.8236804008483887, + "logps/chosen": -453.6719970703125, + "logps/rejected": -598.1178588867188, + "loss": 0.3562, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49575868248939514, + "rewards/margins": 1.7100486755371094, + "rewards/rejected": -2.2058072090148926, + "step": 463 + }, + { + "epoch": 0.4, + "grad_norm": 236.08526660286373, + "learning_rate": 1.5069002803163375e-06, + "logits/chosen": -0.9178378582000732, + "logits/rejected": -0.8615768551826477, + "logps/chosen": -548.9498291015625, + "logps/rejected": -557.3529052734375, + "loss": 0.4353, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7421918511390686, + "rewards/margins": 0.7083767652511597, + "rewards/rejected": -1.450568675994873, + "step": 464 + }, + { + "epoch": 0.4, + "grad_norm": 58.82894013303105, + "learning_rate": 1.5043164402827043e-06, + "logits/chosen": -0.8117421865463257, + "logits/rejected": -0.8035683631896973, + "logps/chosen": -371.0589294433594, + "logps/rejected": -584.568603515625, + "loss": 0.3537, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3649640381336212, + "rewards/margins": 1.6534215211868286, + "rewards/rejected": -2.018385410308838, + "step": 465 + }, + { + "epoch": 0.4, + "grad_norm": 52.38409147957441, + "learning_rate": 1.5017280769888791e-06, + "logits/chosen": -0.9991079568862915, + "logits/rejected": -0.9378537535667419, + "logps/chosen": -431.78387451171875, + "logps/rejected": -594.4720458984375, + "loss": 0.3443, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2835843563079834, + "rewards/margins": 1.412635087966919, + "rewards/rejected": -1.6962194442749023, + "step": 466 + }, + { + "epoch": 0.4, + "grad_norm": 57.99597845314727, + "learning_rate": 1.4991352136501295e-06, + "logits/chosen": -1.061201810836792, + "logits/rejected": -1.0437837839126587, + "logps/chosen": -419.2176208496094, + "logps/rejected": -547.6530151367188, + "loss": 0.4095, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7993943691253662, + "rewards/margins": 1.1544561386108398, + "rewards/rejected": -1.9538503885269165, + "step": 467 + }, + { + "epoch": 0.4, + "grad_norm": 55.909921186407395, + "learning_rate": 1.4965378735220821e-06, + "logits/chosen": -0.8839898705482483, + "logits/rejected": -0.8770423531532288, + "logps/chosen": -568.14794921875, + "logps/rejected": -774.8721313476562, + "loss": 0.276, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2065205574035645, + "rewards/margins": 1.614347219467163, + "rewards/rejected": -2.8208677768707275, + "step": 468 + }, + { + "epoch": 0.4, + "grad_norm": 28.50181966886888, + "learning_rate": 1.4939360799005183e-06, + "logits/chosen": -0.881384015083313, + "logits/rejected": -0.852988064289093, + "logps/chosen": -335.64569091796875, + "logps/rejected": -606.104736328125, + "loss": 0.239, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6380329132080078, + "rewards/margins": 1.972926139831543, + "rewards/rejected": -2.6109588146209717, + "step": 469 + }, + { + "epoch": 0.4, + "grad_norm": 74.33240566346468, + "learning_rate": 1.4913298561211627e-06, + "logits/chosen": -0.868405818939209, + "logits/rejected": -0.869547426700592, + "logps/chosen": -380.373779296875, + "logps/rejected": -620.7781982421875, + "loss": 0.4729, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.89825439453125, + "rewards/margins": 1.3418068885803223, + "rewards/rejected": -2.2400612831115723, + "step": 470 + }, + { + "epoch": 0.4, + "grad_norm": 62.01430464542387, + "learning_rate": 1.4887192255594744e-06, + "logits/chosen": -0.9125020503997803, + "logits/rejected": -0.8442560434341431, + "logps/chosen": -540.2130126953125, + "logps/rejected": -651.144287109375, + "loss": 0.4313, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0115090608596802, + "rewards/margins": 1.6366745233535767, + "rewards/rejected": -2.648183584213257, + "step": 471 + }, + { + "epoch": 0.4, + "grad_norm": 88.73643027706079, + "learning_rate": 1.4861042116304369e-06, + "logits/chosen": -0.8929340839385986, + "logits/rejected": -0.7871063351631165, + "logps/chosen": -654.5023193359375, + "logps/rejected": -625.2041625976562, + "loss": 0.4566, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.387766718864441, + "rewards/margins": 1.0511832237243652, + "rewards/rejected": -2.4389500617980957, + "step": 472 + }, + { + "epoch": 0.41, + "grad_norm": 68.26138348422057, + "learning_rate": 1.4834848377883486e-06, + "logits/chosen": -0.8650990724563599, + "logits/rejected": -0.8362406492233276, + "logps/chosen": -513.7879638671875, + "logps/rejected": -628.8133544921875, + "loss": 0.3425, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9172006845474243, + "rewards/margins": 1.5339590311050415, + "rewards/rejected": -2.451159715652466, + "step": 473 + }, + { + "epoch": 0.41, + "grad_norm": 46.12241480060752, + "learning_rate": 1.480861127526613e-06, + "logits/chosen": -0.9154388904571533, + "logits/rejected": -0.8683236837387085, + "logps/chosen": -431.5248107910156, + "logps/rejected": -608.5435791015625, + "loss": 0.203, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6946539878845215, + "rewards/margins": 2.112185478210449, + "rewards/rejected": -2.80683970451355, + "step": 474 + }, + { + "epoch": 0.41, + "grad_norm": 45.097472853318116, + "learning_rate": 1.4782331043775276e-06, + "logits/chosen": -0.8772906064987183, + "logits/rejected": -0.8202885389328003, + "logps/chosen": -567.1807250976562, + "logps/rejected": -685.1033935546875, + "loss": 0.2714, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.72135990858078, + "rewards/margins": 2.1727640628814697, + "rewards/rejected": -2.8941240310668945, + "step": 475 + }, + { + "epoch": 0.41, + "grad_norm": 73.02755926747012, + "learning_rate": 1.4756007919120708e-06, + "logits/chosen": -0.8577795028686523, + "logits/rejected": -0.837565541267395, + "logps/chosen": -476.64483642578125, + "logps/rejected": -730.842529296875, + "loss": 0.399, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8776594996452332, + "rewards/margins": 1.9460971355438232, + "rewards/rejected": -2.823756694793701, + "step": 476 + }, + { + "epoch": 0.41, + "grad_norm": 54.704647833843524, + "learning_rate": 1.472964213739694e-06, + "logits/chosen": -0.9222075939178467, + "logits/rejected": -0.8904163837432861, + "logps/chosen": -489.0938720703125, + "logps/rejected": -712.3372802734375, + "loss": 0.2647, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1723819971084595, + "rewards/margins": 1.9933929443359375, + "rewards/rejected": -3.1657748222351074, + "step": 477 + }, + { + "epoch": 0.41, + "grad_norm": 50.10043792173988, + "learning_rate": 1.470323393508107e-06, + "logits/chosen": -0.9438526630401611, + "logits/rejected": -0.8643888235092163, + "logps/chosen": -700.4232177734375, + "logps/rejected": -811.2674560546875, + "loss": 0.2606, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8976376056671143, + "rewards/margins": 2.241746425628662, + "rewards/rejected": -3.1393840312957764, + "step": 478 + }, + { + "epoch": 0.41, + "grad_norm": 28.369621605558525, + "learning_rate": 1.4676783549030684e-06, + "logits/chosen": -0.8669946789741516, + "logits/rejected": -0.8010271787643433, + "logps/chosen": -577.3978271484375, + "logps/rejected": -856.3585205078125, + "loss": 0.1487, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5408543348312378, + "rewards/margins": 2.634451389312744, + "rewards/rejected": -3.1753058433532715, + "step": 479 + }, + { + "epoch": 0.41, + "grad_norm": 61.711699275612446, + "learning_rate": 1.4650291216481706e-06, + "logits/chosen": -0.8652939796447754, + "logits/rejected": -0.771173357963562, + "logps/chosen": -670.241943359375, + "logps/rejected": -756.6165771484375, + "loss": 0.3041, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.716055691242218, + "rewards/margins": 1.8450524806976318, + "rewards/rejected": -2.561108112335205, + "step": 480 + }, + { + "epoch": 0.41, + "grad_norm": 93.40482934075946, + "learning_rate": 1.4623757175046278e-06, + "logits/chosen": -0.751588761806488, + "logits/rejected": -0.7028893232345581, + "logps/chosen": -497.6466064453125, + "logps/rejected": -766.5819091796875, + "loss": 0.6932, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.17273837327957153, + "rewards/margins": 0.17415812611579895, + "rewards/rejected": -0.34689652919769287, + "step": 481 + }, + { + "epoch": 0.41, + "grad_norm": 69.34539505945266, + "learning_rate": 1.459718166271065e-06, + "logits/chosen": -0.8390201330184937, + "logits/rejected": -0.7440624237060547, + "logps/chosen": -695.57666015625, + "logps/rejected": -682.0604858398438, + "loss": 0.365, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8433142900466919, + "rewards/margins": 1.2970421314239502, + "rewards/rejected": -2.1403563022613525, + "step": 482 + }, + { + "epoch": 0.41, + "grad_norm": 66.15245519454665, + "learning_rate": 1.457056491783301e-06, + "logits/chosen": -0.8030404448509216, + "logits/rejected": -0.77801114320755, + "logps/chosen": -459.1448974609375, + "logps/rejected": -622.8702392578125, + "loss": 0.4067, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6972125768661499, + "rewards/margins": 1.2079944610595703, + "rewards/rejected": -1.9052069187164307, + "step": 483 + }, + { + "epoch": 0.42, + "grad_norm": 67.25380978057618, + "learning_rate": 1.454390717914138e-06, + "logits/chosen": -0.8256564140319824, + "logits/rejected": -0.8271548748016357, + "logps/chosen": -392.1229248046875, + "logps/rejected": -539.0008544921875, + "loss": 0.4194, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6776982545852661, + "rewards/margins": 1.1847028732299805, + "rewards/rejected": -1.862401008605957, + "step": 484 + }, + { + "epoch": 0.42, + "grad_norm": 50.637462599424396, + "learning_rate": 1.4517208685731445e-06, + "logits/chosen": -0.729097843170166, + "logits/rejected": -0.721123456954956, + "logps/chosen": -490.776611328125, + "logps/rejected": -603.486083984375, + "loss": 0.3576, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4171733260154724, + "rewards/margins": 1.5965278148651123, + "rewards/rejected": -2.0137009620666504, + "step": 485 + }, + { + "epoch": 0.42, + "grad_norm": 45.79116637926291, + "learning_rate": 1.4490469677064435e-06, + "logits/chosen": -0.8838560581207275, + "logits/rejected": -0.7998265027999878, + "logps/chosen": -529.9131469726562, + "logps/rejected": -649.6559448242188, + "loss": 0.2911, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9633562564849854, + "rewards/margins": 1.5850216150283813, + "rewards/rejected": -2.548377752304077, + "step": 486 + }, + { + "epoch": 0.42, + "grad_norm": 69.41826659241754, + "learning_rate": 1.4463690392964955e-06, + "logits/chosen": -0.8535840511322021, + "logits/rejected": -0.7974878549575806, + "logps/chosen": -609.5081787109375, + "logps/rejected": -682.191650390625, + "loss": 0.3662, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.127061367034912, + "rewards/margins": 1.7252570390701294, + "rewards/rejected": -2.85231876373291, + "step": 487 + }, + { + "epoch": 0.42, + "grad_norm": 47.61794633825491, + "learning_rate": 1.4436871073618857e-06, + "logits/chosen": -0.8231527805328369, + "logits/rejected": -0.7265893220901489, + "logps/chosen": -580.491943359375, + "logps/rejected": -609.302734375, + "loss": 0.2244, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7856195569038391, + "rewards/margins": 2.2862648963928223, + "rewards/rejected": -3.0718846321105957, + "step": 488 + }, + { + "epoch": 0.42, + "grad_norm": 88.33257603126563, + "learning_rate": 1.4410011959571051e-06, + "logits/chosen": -0.7357794046401978, + "logits/rejected": -0.705422043800354, + "logps/chosen": -509.08978271484375, + "logps/rejected": -619.1658325195312, + "loss": 0.4252, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9400831460952759, + "rewards/margins": 1.3381483554840088, + "rewards/rejected": -2.278231620788574, + "step": 489 + }, + { + "epoch": 0.42, + "grad_norm": 85.65292490962175, + "learning_rate": 1.4383113291723398e-06, + "logits/chosen": -0.808850884437561, + "logits/rejected": -0.8109133243560791, + "logps/chosen": -512.6813354492188, + "logps/rejected": -797.9263916015625, + "loss": 0.2337, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8441105484962463, + "rewards/margins": 2.7022745609283447, + "rewards/rejected": -3.5463852882385254, + "step": 490 + }, + { + "epoch": 0.42, + "grad_norm": 55.69288279315483, + "learning_rate": 1.4356175311332495e-06, + "logits/chosen": -0.876907467842102, + "logits/rejected": -0.8422799110412598, + "logps/chosen": -454.2169189453125, + "logps/rejected": -666.6516723632812, + "loss": 0.3201, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6969516277313232, + "rewards/margins": 1.889945149421692, + "rewards/rejected": -2.5868966579437256, + "step": 491 + }, + { + "epoch": 0.42, + "grad_norm": 46.51583166431031, + "learning_rate": 1.4329198260007551e-06, + "logits/chosen": -0.849693775177002, + "logits/rejected": -0.8062174320220947, + "logps/chosen": -663.0662841796875, + "logps/rejected": -922.7932739257812, + "loss": 0.2157, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8148276209831238, + "rewards/margins": 2.7568774223327637, + "rewards/rejected": -3.5717051029205322, + "step": 492 + }, + { + "epoch": 0.42, + "grad_norm": 75.15763989738159, + "learning_rate": 1.4302182379708203e-06, + "logits/chosen": -0.758513331413269, + "logits/rejected": -0.754541277885437, + "logps/chosen": -380.8668212890625, + "logps/rejected": -550.5960693359375, + "loss": 0.4863, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7542673945426941, + "rewards/margins": 1.8577656745910645, + "rewards/rejected": -2.612032890319824, + "step": 493 + }, + { + "epoch": 0.42, + "grad_norm": 64.8880511331801, + "learning_rate": 1.4275127912742343e-06, + "logits/chosen": -0.8404750823974609, + "logits/rejected": -0.8266079425811768, + "logps/chosen": -414.9617004394531, + "logps/rejected": -634.954345703125, + "loss": 0.2856, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7905468940734863, + "rewards/margins": 2.1434035301208496, + "rewards/rejected": -2.933950424194336, + "step": 494 + }, + { + "epoch": 0.42, + "grad_norm": 50.93952365848894, + "learning_rate": 1.4248035101763962e-06, + "logits/chosen": -0.8301562666893005, + "logits/rejected": -0.8052735328674316, + "logps/chosen": -585.7339477539062, + "logps/rejected": -790.9482421875, + "loss": 0.2048, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.060710072517395, + "rewards/margins": 2.148515462875366, + "rewards/rejected": -3.209225654602051, + "step": 495 + }, + { + "epoch": 0.43, + "grad_norm": 31.16762541396582, + "learning_rate": 1.422090418977095e-06, + "logits/chosen": -0.8170212507247925, + "logits/rejected": -0.7670871019363403, + "logps/chosen": -566.548583984375, + "logps/rejected": -701.8504638671875, + "loss": 0.2523, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7676441669464111, + "rewards/margins": 2.0442118644714355, + "rewards/rejected": -2.8118557929992676, + "step": 496 + }, + { + "epoch": 0.43, + "grad_norm": 61.98372066168573, + "learning_rate": 1.4193735420102932e-06, + "logits/chosen": -0.8982579708099365, + "logits/rejected": -0.8410770297050476, + "logps/chosen": -454.56805419921875, + "logps/rejected": -605.292724609375, + "loss": 0.4107, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1641063690185547, + "rewards/margins": 1.3544279336929321, + "rewards/rejected": -2.5185341835021973, + "step": 497 + }, + { + "epoch": 0.43, + "grad_norm": 49.7457946102483, + "learning_rate": 1.4166529036439092e-06, + "logits/chosen": -0.8498281240463257, + "logits/rejected": -0.8173060417175293, + "logps/chosen": -638.1928100585938, + "logps/rejected": -833.4638671875, + "loss": 0.2477, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9349879622459412, + "rewards/margins": 2.4696130752563477, + "rewards/rejected": -3.4046010971069336, + "step": 498 + }, + { + "epoch": 0.43, + "grad_norm": 27.961072326529063, + "learning_rate": 1.413928528279596e-06, + "logits/chosen": -0.7953431606292725, + "logits/rejected": -0.8111715316772461, + "logps/chosen": -405.8520202636719, + "logps/rejected": -663.355224609375, + "loss": 0.1762, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34801992774009705, + "rewards/margins": 2.1400349140167236, + "rewards/rejected": -2.4880547523498535, + "step": 499 + }, + { + "epoch": 0.43, + "grad_norm": 64.49815264864202, + "learning_rate": 1.411200440352525e-06, + "logits/chosen": -0.8560844659805298, + "logits/rejected": -0.8431380987167358, + "logps/chosen": -539.9221801757812, + "logps/rejected": -746.8150024414062, + "loss": 0.3124, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9053245186805725, + "rewards/margins": 1.952344298362732, + "rewards/rejected": -2.85766863822937, + "step": 500 + }, + { + "epoch": 0.43, + "grad_norm": 38.310902654677925, + "learning_rate": 1.4084686643311666e-06, + "logits/chosen": -0.8619130849838257, + "logits/rejected": -0.7855139374732971, + "logps/chosen": -505.196044921875, + "logps/rejected": -717.7348022460938, + "loss": 0.1972, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6216053366661072, + "rewards/margins": 2.5277938842773438, + "rewards/rejected": -3.1493990421295166, + "step": 501 + }, + { + "epoch": 0.43, + "grad_norm": 71.1766081581125, + "learning_rate": 1.4057332247170684e-06, + "logits/chosen": -0.8671563863754272, + "logits/rejected": -0.8257827758789062, + "logps/chosen": -600.34814453125, + "logps/rejected": -878.1358642578125, + "loss": 0.4022, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3607724905014038, + "rewards/margins": 2.288400888442993, + "rewards/rejected": -3.6491734981536865, + "step": 502 + }, + { + "epoch": 0.43, + "grad_norm": 78.98485490319837, + "learning_rate": 1.4029941460446385e-06, + "logits/chosen": -0.7788889408111572, + "logits/rejected": -0.8246132135391235, + "logps/chosen": -631.4876708984375, + "logps/rejected": -972.2409057617188, + "loss": 0.3269, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.226896047592163, + "rewards/margins": 2.414889097213745, + "rewards/rejected": -3.641785144805908, + "step": 503 + }, + { + "epoch": 0.43, + "grad_norm": 74.91603010165284, + "learning_rate": 1.4002514528809234e-06, + "logits/chosen": -0.837902307510376, + "logits/rejected": -0.8045358061790466, + "logps/chosen": -474.34075927734375, + "logps/rejected": -522.15771484375, + "loss": 0.4926, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8030575513839722, + "rewards/margins": 0.9458550214767456, + "rewards/rejected": -1.7489125728607178, + "step": 504 + }, + { + "epoch": 0.43, + "grad_norm": 39.39261676529048, + "learning_rate": 1.397505169825389e-06, + "logits/chosen": -0.8392449021339417, + "logits/rejected": -0.8159143328666687, + "logps/chosen": -501.5545959472656, + "logps/rejected": -852.7933349609375, + "loss": 0.1755, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8916856646537781, + "rewards/margins": 2.3363566398620605, + "rewards/rejected": -3.2280421257019043, + "step": 505 + }, + { + "epoch": 0.43, + "grad_norm": 44.17201764139978, + "learning_rate": 1.394755321509698e-06, + "logits/chosen": -0.8164315223693848, + "logits/rejected": -0.7713231444358826, + "logps/chosen": -511.5898132324219, + "logps/rejected": -640.146484375, + "loss": 0.2406, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3921697735786438, + "rewards/margins": 1.7518031597137451, + "rewards/rejected": -2.143972873687744, + "step": 506 + }, + { + "epoch": 0.43, + "grad_norm": 39.94324558492835, + "learning_rate": 1.3920019325974915e-06, + "logits/chosen": -0.8083140850067139, + "logits/rejected": -0.8277095556259155, + "logps/chosen": -283.79443359375, + "logps/rejected": -565.7360229492188, + "loss": 0.2681, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32972055673599243, + "rewards/margins": 1.5236279964447021, + "rewards/rejected": -1.853348731994629, + "step": 507 + }, + { + "epoch": 0.44, + "grad_norm": 49.77860810968748, + "learning_rate": 1.3892450277841655e-06, + "logits/chosen": -0.8014517426490784, + "logits/rejected": -0.7942814826965332, + "logps/chosen": -403.44921875, + "logps/rejected": -607.3531494140625, + "loss": 0.2493, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6396724581718445, + "rewards/margins": 1.6928253173828125, + "rewards/rejected": -2.3324978351593018, + "step": 508 + }, + { + "epoch": 0.44, + "grad_norm": 34.07100201871765, + "learning_rate": 1.3864846317966512e-06, + "logits/chosen": -0.8613470792770386, + "logits/rejected": -0.828007698059082, + "logps/chosen": -487.0976867675781, + "logps/rejected": -767.64697265625, + "loss": 0.1583, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4158779978752136, + "rewards/margins": 2.6424601078033447, + "rewards/rejected": -3.058338165283203, + "step": 509 + }, + { + "epoch": 0.44, + "grad_norm": 32.23216272215837, + "learning_rate": 1.3837207693931925e-06, + "logits/chosen": -0.8819748163223267, + "logits/rejected": -0.858333945274353, + "logps/chosen": -306.4765319824219, + "logps/rejected": -589.0430908203125, + "loss": 0.1842, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.46015307307243347, + "rewards/margins": 2.3081138134002686, + "rewards/rejected": -2.7682669162750244, + "step": 510 + }, + { + "epoch": 0.44, + "grad_norm": 47.186284111106794, + "learning_rate": 1.3809534653631233e-06, + "logits/chosen": -0.9686492681503296, + "logits/rejected": -0.8675292730331421, + "logps/chosen": -590.91259765625, + "logps/rejected": -778.565673828125, + "loss": 0.1576, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4737451672554016, + "rewards/margins": 3.0512213706970215, + "rewards/rejected": -3.524966239929199, + "step": 511 + }, + { + "epoch": 0.44, + "grad_norm": 35.326151954502585, + "learning_rate": 1.3781827445266458e-06, + "logits/chosen": -0.8942842483520508, + "logits/rejected": -0.8705543279647827, + "logps/chosen": -452.56634521484375, + "logps/rejected": -680.036376953125, + "loss": 0.2228, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7477368116378784, + "rewards/margins": 2.2438342571258545, + "rewards/rejected": -2.9915709495544434, + "step": 512 + }, + { + "epoch": 0.44, + "grad_norm": 39.76095534582754, + "learning_rate": 1.3754086317346087e-06, + "logits/chosen": -0.902711033821106, + "logits/rejected": -0.8663870096206665, + "logps/chosen": -497.970703125, + "logps/rejected": -667.9498291015625, + "loss": 0.2127, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7539928555488586, + "rewards/margins": 2.4936609268188477, + "rewards/rejected": -3.2476539611816406, + "step": 513 + }, + { + "epoch": 0.44, + "grad_norm": 68.85063612286014, + "learning_rate": 1.3726311518682827e-06, + "logits/chosen": -0.9456419944763184, + "logits/rejected": -0.8960084915161133, + "logps/chosen": -484.5071105957031, + "logps/rejected": -608.3556518554688, + "loss": 0.3153, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8282623291015625, + "rewards/margins": 1.6905174255371094, + "rewards/rejected": -2.518779993057251, + "step": 514 + }, + { + "epoch": 0.44, + "grad_norm": 70.97166935156534, + "learning_rate": 1.369850329839138e-06, + "logits/chosen": -0.838205873966217, + "logits/rejected": -0.8520830273628235, + "logps/chosen": -349.7906188964844, + "logps/rejected": -575.4569702148438, + "loss": 0.3721, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9320447444915771, + "rewards/margins": 1.8971960544586182, + "rewards/rejected": -2.829240560531616, + "step": 515 + }, + { + "epoch": 0.44, + "grad_norm": 63.54087433833503, + "learning_rate": 1.3670661905886216e-06, + "logits/chosen": -0.966612696647644, + "logits/rejected": -0.8654012680053711, + "logps/chosen": -421.6285095214844, + "logps/rejected": -474.3664245605469, + "loss": 0.505, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8842155933380127, + "rewards/margins": 0.9205365180969238, + "rewards/rejected": -1.804752230644226, + "step": 516 + }, + { + "epoch": 0.44, + "grad_norm": 81.66997925349206, + "learning_rate": 1.3642787590879323e-06, + "logits/chosen": -0.9743420481681824, + "logits/rejected": -0.8784420490264893, + "logps/chosen": -709.7653198242188, + "logps/rejected": -782.1478271484375, + "loss": 0.375, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5563608407974243, + "rewards/margins": 1.992991328239441, + "rewards/rejected": -3.5493521690368652, + "step": 517 + }, + { + "epoch": 0.44, + "grad_norm": 35.212048034707685, + "learning_rate": 1.361488060337798e-06, + "logits/chosen": -0.8872784972190857, + "logits/rejected": -0.8517801761627197, + "logps/chosen": -471.6920471191406, + "logps/rejected": -743.310546875, + "loss": 0.2124, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7769272327423096, + "rewards/margins": 2.494457483291626, + "rewards/rejected": -3.2713847160339355, + "step": 518 + }, + { + "epoch": 0.45, + "grad_norm": 71.4255282290973, + "learning_rate": 1.3586941193682505e-06, + "logits/chosen": -0.9100337028503418, + "logits/rejected": -0.8517925143241882, + "logps/chosen": -512.8143920898438, + "logps/rejected": -625.2344970703125, + "loss": 0.4623, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.429246425628662, + "rewards/margins": 1.5222432613372803, + "rewards/rejected": -2.9514899253845215, + "step": 519 + }, + { + "epoch": 0.45, + "grad_norm": 67.98416347949309, + "learning_rate": 1.3558969612384007e-06, + "logits/chosen": -0.9615705013275146, + "logits/rejected": -0.9132786989212036, + "logps/chosen": -456.4330749511719, + "logps/rejected": -483.18768310546875, + "loss": 0.4794, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8184623718261719, + "rewards/margins": 1.3500648736953735, + "rewards/rejected": -2.168527126312256, + "step": 520 + }, + { + "epoch": 0.45, + "grad_norm": 61.255057172371146, + "learning_rate": 1.3530966110362163e-06, + "logits/chosen": -0.8489855527877808, + "logits/rejected": -0.82160484790802, + "logps/chosen": -631.4532470703125, + "logps/rejected": -705.7227783203125, + "loss": 0.3343, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1591343879699707, + "rewards/margins": 2.0831398963928223, + "rewards/rejected": -3.242274284362793, + "step": 521 + }, + { + "epoch": 0.45, + "grad_norm": 33.594536176516364, + "learning_rate": 1.3502930938782934e-06, + "logits/chosen": -0.9394704103469849, + "logits/rejected": -0.8963195085525513, + "logps/chosen": -618.7100830078125, + "logps/rejected": -790.1339721679688, + "loss": 0.2045, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8805956244468689, + "rewards/margins": 2.06959867477417, + "rewards/rejected": -2.9501941204071045, + "step": 522 + }, + { + "epoch": 0.45, + "grad_norm": 74.7822911987875, + "learning_rate": 1.3474864349096333e-06, + "logits/chosen": -0.8570155501365662, + "logits/rejected": -0.826379656791687, + "logps/chosen": -676.6806640625, + "logps/rejected": -821.4932861328125, + "loss": 0.403, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.221854329109192, + "rewards/margins": 1.7235009670257568, + "rewards/rejected": -2.9453554153442383, + "step": 523 + }, + { + "epoch": 0.45, + "grad_norm": 60.51652877902085, + "learning_rate": 1.3446766593034167e-06, + "logits/chosen": -0.9486373662948608, + "logits/rejected": -0.8681553602218628, + "logps/chosen": -627.4612426757812, + "logps/rejected": -750.2420654296875, + "loss": 0.2923, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9254991412162781, + "rewards/margins": 1.9712306261062622, + "rewards/rejected": -2.8967297077178955, + "step": 524 + }, + { + "epoch": 0.45, + "grad_norm": 66.00914028262723, + "learning_rate": 1.3418637922607768e-06, + "logits/chosen": -0.8991914987564087, + "logits/rejected": -0.8221247792243958, + "logps/chosen": -432.67388916015625, + "logps/rejected": -493.1002197265625, + "loss": 0.4838, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.11370259523391724, + "rewards/margins": 1.1485422849655151, + "rewards/rejected": -1.2622448205947876, + "step": 525 + }, + { + "epoch": 0.45, + "grad_norm": 95.95420441142826, + "learning_rate": 1.3390478590105761e-06, + "logits/chosen": -0.779647946357727, + "logits/rejected": -0.7370286583900452, + "logps/chosen": -520.8436279296875, + "logps/rejected": -618.977294921875, + "loss": 0.5084, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4177210330963135, + "rewards/margins": 0.7659667730331421, + "rewards/rejected": -1.183687686920166, + "step": 526 + }, + { + "epoch": 0.45, + "grad_norm": 117.29601610029175, + "learning_rate": 1.3362288848091763e-06, + "logits/chosen": -0.7529869079589844, + "logits/rejected": -0.7606877088546753, + "logps/chosen": -380.1894836425781, + "logps/rejected": -729.3990478515625, + "loss": 0.4763, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15248233079910278, + "rewards/margins": 1.381932258605957, + "rewards/rejected": -1.534414529800415, + "step": 527 + }, + { + "epoch": 0.45, + "grad_norm": 82.17873398853364, + "learning_rate": 1.333406894940214e-06, + "logits/chosen": -0.8722752332687378, + "logits/rejected": -0.8514535427093506, + "logps/chosen": -476.3056335449219, + "logps/rejected": -726.4517822265625, + "loss": 0.4771, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8900383710861206, + "rewards/margins": 1.644824743270874, + "rewards/rejected": -2.534862995147705, + "step": 528 + }, + { + "epoch": 0.45, + "grad_norm": 34.13497579443595, + "learning_rate": 1.3305819147143747e-06, + "logits/chosen": -0.9461549520492554, + "logits/rejected": -0.910599946975708, + "logps/chosen": -434.37384033203125, + "logps/rejected": -686.114990234375, + "loss": 0.2242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6752679347991943, + "rewards/margins": 2.158499240875244, + "rewards/rejected": -2.8337674140930176, + "step": 529 + }, + { + "epoch": 0.45, + "grad_norm": 79.10388212463529, + "learning_rate": 1.3277539694691635e-06, + "logits/chosen": -0.9313993453979492, + "logits/rejected": -0.8736569881439209, + "logps/chosen": -559.302978515625, + "logps/rejected": -660.0396728515625, + "loss": 0.4475, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9553658962249756, + "rewards/margins": 1.5288103818893433, + "rewards/rejected": -2.4841761589050293, + "step": 530 + }, + { + "epoch": 0.46, + "grad_norm": 35.47979017711729, + "learning_rate": 1.3249230845686796e-06, + "logits/chosen": -0.9311937093734741, + "logits/rejected": -0.9165546298027039, + "logps/chosen": -280.443359375, + "logps/rejected": -392.58966064453125, + "loss": 0.2719, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5209568738937378, + "rewards/margins": 1.7553080320358276, + "rewards/rejected": -2.2762649059295654, + "step": 531 + }, + { + "epoch": 0.46, + "grad_norm": 32.579500006278366, + "learning_rate": 1.322089285403388e-06, + "logits/chosen": -0.9264934062957764, + "logits/rejected": -0.8829073905944824, + "logps/chosen": -457.67913818359375, + "logps/rejected": -540.3162841796875, + "loss": 0.2518, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8536669015884399, + "rewards/margins": 1.865778923034668, + "rewards/rejected": -2.7194457054138184, + "step": 532 + }, + { + "epoch": 0.46, + "grad_norm": 47.65394221635163, + "learning_rate": 1.3192525973898921e-06, + "logits/chosen": -0.9628214240074158, + "logits/rejected": -0.9117701649665833, + "logps/chosen": -472.184326171875, + "logps/rejected": -635.360107421875, + "loss": 0.2958, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7624552249908447, + "rewards/margins": 2.0999326705932617, + "rewards/rejected": -2.8623881340026855, + "step": 533 + }, + { + "epoch": 0.46, + "grad_norm": 55.1554666550412, + "learning_rate": 1.3164130459707057e-06, + "logits/chosen": -0.9542844295501709, + "logits/rejected": -0.9096299409866333, + "logps/chosen": -614.8569946289062, + "logps/rejected": -683.0138549804688, + "loss": 0.3101, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8923014998435974, + "rewards/margins": 2.132594108581543, + "rewards/rejected": -3.024895668029785, + "step": 534 + }, + { + "epoch": 0.46, + "grad_norm": 78.2854978701924, + "learning_rate": 1.313570656614025e-06, + "logits/chosen": -0.9504913091659546, + "logits/rejected": -0.937301516532898, + "logps/chosen": -439.90118408203125, + "logps/rejected": -611.697998046875, + "loss": 0.4856, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3660770654678345, + "rewards/margins": 1.6826179027557373, + "rewards/rejected": -3.0486950874328613, + "step": 535 + }, + { + "epoch": 0.46, + "grad_norm": 60.26173610703018, + "learning_rate": 1.310725454813499e-06, + "logits/chosen": -0.9467581510543823, + "logits/rejected": -0.9146875143051147, + "logps/chosen": -322.9682312011719, + "logps/rejected": -368.23681640625, + "loss": 0.5115, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7642669677734375, + "rewards/margins": 1.2350019216537476, + "rewards/rejected": -1.9992687702178955, + "step": 536 + }, + { + "epoch": 0.46, + "grad_norm": 42.04846751083289, + "learning_rate": 1.3078774660880031e-06, + "logits/chosen": -0.9183229207992554, + "logits/rejected": -0.915323793888092, + "logps/chosen": -342.38861083984375, + "logps/rejected": -661.6390991210938, + "loss": 0.2917, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5758622884750366, + "rewards/margins": 2.632599115371704, + "rewards/rejected": -3.208461284637451, + "step": 537 + }, + { + "epoch": 0.46, + "grad_norm": 95.46446791152454, + "learning_rate": 1.3050267159814078e-06, + "logits/chosen": -0.904394268989563, + "logits/rejected": -0.8909515142440796, + "logps/chosen": -494.231689453125, + "logps/rejected": -684.6685791015625, + "loss": 0.5835, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.316034197807312, + "rewards/margins": 1.488415002822876, + "rewards/rejected": -2.8044490814208984, + "step": 538 + }, + { + "epoch": 0.46, + "grad_norm": 37.14479971677457, + "learning_rate": 1.3021732300623506e-06, + "logits/chosen": -0.9366617202758789, + "logits/rejected": -0.8918839693069458, + "logps/chosen": -384.676513671875, + "logps/rejected": -581.505126953125, + "loss": 0.189, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5559818148612976, + "rewards/margins": 2.3850436210632324, + "rewards/rejected": -2.941025495529175, + "step": 539 + }, + { + "epoch": 0.46, + "grad_norm": 55.17905263819993, + "learning_rate": 1.299317033924008e-06, + "logits/chosen": -0.8729772567749023, + "logits/rejected": -0.8713866472244263, + "logps/chosen": -436.38037109375, + "logps/rejected": -499.4176330566406, + "loss": 0.3997, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.184898853302002, + "rewards/margins": 1.184369444847107, + "rewards/rejected": -2.3692681789398193, + "step": 540 + }, + { + "epoch": 0.46, + "grad_norm": 76.69953904332482, + "learning_rate": 1.2964581531838635e-06, + "logits/chosen": -1.016025424003601, + "logits/rejected": -0.9276155233383179, + "logps/chosen": -535.6536865234375, + "logps/rejected": -646.4261474609375, + "loss": 0.4855, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5077444314956665, + "rewards/margins": 1.6514971256256104, + "rewards/rejected": -3.1592416763305664, + "step": 541 + }, + { + "epoch": 0.46, + "grad_norm": 51.71547000713668, + "learning_rate": 1.2935966134834795e-06, + "logits/chosen": -0.978553295135498, + "logits/rejected": -0.9328474998474121, + "logps/chosen": -449.3985290527344, + "logps/rejected": -534.5638427734375, + "loss": 0.3566, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.960557222366333, + "rewards/margins": 1.6971855163574219, + "rewards/rejected": -2.657742738723755, + "step": 542 + }, + { + "epoch": 0.47, + "grad_norm": 48.59844761404303, + "learning_rate": 1.290732440488267e-06, + "logits/chosen": -0.9164152145385742, + "logits/rejected": -0.8484071493148804, + "logps/chosen": -594.1298828125, + "logps/rejected": -669.592041015625, + "loss": 0.2708, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.024033784866333, + "rewards/margins": 1.7735604047775269, + "rewards/rejected": -2.7975940704345703, + "step": 543 + }, + { + "epoch": 0.47, + "grad_norm": 43.09196422252919, + "learning_rate": 1.2878656598872546e-06, + "logits/chosen": -0.9126098155975342, + "logits/rejected": -0.8721531629562378, + "logps/chosen": -389.77130126953125, + "logps/rejected": -440.10699462890625, + "loss": 0.3755, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8628970384597778, + "rewards/margins": 1.5729416608810425, + "rewards/rejected": -2.4358386993408203, + "step": 544 + }, + { + "epoch": 0.47, + "grad_norm": 51.58322967373032, + "learning_rate": 1.2849962973928596e-06, + "logits/chosen": -0.9882091879844666, + "logits/rejected": -0.9401744604110718, + "logps/chosen": -444.46014404296875, + "logps/rejected": -576.692138671875, + "loss": 0.2728, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.189112901687622, + "rewards/margins": 1.7125802040100098, + "rewards/rejected": -2.901693344116211, + "step": 545 + }, + { + "epoch": 0.47, + "grad_norm": 40.06366735276771, + "learning_rate": 1.282124378740656e-06, + "logits/chosen": -0.9059635400772095, + "logits/rejected": -0.8775876760482788, + "logps/chosen": -593.1556396484375, + "logps/rejected": -672.8941650390625, + "loss": 0.178, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5856313705444336, + "rewards/margins": 2.3719394207000732, + "rewards/rejected": -2.9575705528259277, + "step": 546 + }, + { + "epoch": 0.47, + "grad_norm": 64.91149921888086, + "learning_rate": 1.2792499296891447e-06, + "logits/chosen": -0.8570492267608643, + "logits/rejected": -0.8381605744361877, + "logps/chosen": -486.0616455078125, + "logps/rejected": -580.86328125, + "loss": 0.2649, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.176407814025879, + "rewards/margins": 1.77084481716156, + "rewards/rejected": -2.9472527503967285, + "step": 547 + }, + { + "epoch": 0.47, + "grad_norm": 81.59847334724569, + "learning_rate": 1.276372976019521e-06, + "logits/chosen": -0.9134464263916016, + "logits/rejected": -0.9017162322998047, + "logps/chosen": -449.33380126953125, + "logps/rejected": -634.1409912109375, + "loss": 0.5316, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0740644931793213, + "rewards/margins": 1.6356202363967896, + "rewards/rejected": -2.7096848487854004, + "step": 548 + }, + { + "epoch": 0.47, + "grad_norm": 74.91599389267594, + "learning_rate": 1.2734935435354455e-06, + "logits/chosen": -0.8812904953956604, + "logits/rejected": -0.8673186302185059, + "logps/chosen": -427.50567626953125, + "logps/rejected": -558.2020263671875, + "loss": 0.487, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8359134197235107, + "rewards/margins": 1.2576205730438232, + "rewards/rejected": -2.093533992767334, + "step": 549 + }, + { + "epoch": 0.47, + "grad_norm": 58.499944184001166, + "learning_rate": 1.270611658062811e-06, + "logits/chosen": -0.9335320591926575, + "logits/rejected": -0.9049162864685059, + "logps/chosen": -519.2843627929688, + "logps/rejected": -713.3898315429688, + "loss": 0.4237, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.122235655784607, + "rewards/margins": 2.171722888946533, + "rewards/rejected": -3.2939586639404297, + "step": 550 + }, + { + "epoch": 0.47, + "grad_norm": 47.10898356331998, + "learning_rate": 1.2677273454495112e-06, + "logits/chosen": -0.8580210208892822, + "logits/rejected": -0.8268457651138306, + "logps/chosen": -403.48651123046875, + "logps/rejected": -560.0176391601562, + "loss": 0.3254, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9335359334945679, + "rewards/margins": 1.762251853942871, + "rewards/rejected": -2.6957879066467285, + "step": 551 + }, + { + "epoch": 0.47, + "grad_norm": 103.34165034219963, + "learning_rate": 1.2648406315652088e-06, + "logits/chosen": -0.8921483755111694, + "logits/rejected": -0.8262415528297424, + "logps/chosen": -655.8408203125, + "logps/rejected": -681.6361694335938, + "loss": 0.6242, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9099090099334717, + "rewards/margins": 1.288282871246338, + "rewards/rejected": -2.1981921195983887, + "step": 552 + }, + { + "epoch": 0.47, + "grad_norm": 30.0351296907791, + "learning_rate": 1.2619515423011055e-06, + "logits/chosen": -0.9337635040283203, + "logits/rejected": -0.8803566694259644, + "logps/chosen": -728.0569458007812, + "logps/rejected": -904.9400634765625, + "loss": 0.1531, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9571795463562012, + "rewards/margins": 2.3777036666870117, + "rewards/rejected": -3.334883213043213, + "step": 553 + }, + { + "epoch": 0.48, + "grad_norm": 44.81285970122842, + "learning_rate": 1.2590601035697054e-06, + "logits/chosen": -0.9095577001571655, + "logits/rejected": -0.8822007179260254, + "logps/chosen": -641.730224609375, + "logps/rejected": -791.3433837890625, + "loss": 0.1613, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8262911438941956, + "rewards/margins": 2.1680355072021484, + "rewards/rejected": -2.994326591491699, + "step": 554 + }, + { + "epoch": 0.48, + "grad_norm": 45.72398867570701, + "learning_rate": 1.2561663413045868e-06, + "logits/chosen": -0.9518017768859863, + "logits/rejected": -0.8841419219970703, + "logps/chosen": -622.923095703125, + "logps/rejected": -859.262451171875, + "loss": 0.1866, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8729736804962158, + "rewards/margins": 2.8778254985809326, + "rewards/rejected": -3.7507991790771484, + "step": 555 + }, + { + "epoch": 0.48, + "grad_norm": 46.514213166710974, + "learning_rate": 1.2532702814601678e-06, + "logits/chosen": -0.9096502065658569, + "logits/rejected": -0.8895667791366577, + "logps/chosen": -670.5291748046875, + "logps/rejected": -916.8817138671875, + "loss": 0.1813, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9845317602157593, + "rewards/margins": 2.594719648361206, + "rewards/rejected": -3.579251289367676, + "step": 556 + }, + { + "epoch": 0.48, + "grad_norm": 29.579282897800084, + "learning_rate": 1.2503719500114733e-06, + "logits/chosen": -0.9815787076950073, + "logits/rejected": -0.9444399476051331, + "logps/chosen": -427.94683837890625, + "logps/rejected": -708.6376953125, + "loss": 0.2072, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5375573039054871, + "rewards/margins": 2.491065740585327, + "rewards/rejected": -3.02862286567688, + "step": 557 + }, + { + "epoch": 0.48, + "grad_norm": 61.526489949118854, + "learning_rate": 1.2474713729539033e-06, + "logits/chosen": -1.0085158348083496, + "logits/rejected": -0.9338014125823975, + "logps/chosen": -676.2670288085938, + "logps/rejected": -752.7526245117188, + "loss": 0.2631, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2552642822265625, + "rewards/margins": 2.4462804794311523, + "rewards/rejected": -3.701545238494873, + "step": 558 + }, + { + "epoch": 0.48, + "grad_norm": 52.45376747069775, + "learning_rate": 1.2445685763029969e-06, + "logits/chosen": -0.9655488133430481, + "logits/rejected": -0.948523759841919, + "logps/chosen": -379.6853942871094, + "logps/rejected": -659.249755859375, + "loss": 0.3075, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8615530729293823, + "rewards/margins": 2.0184621810913086, + "rewards/rejected": -2.8800153732299805, + "step": 559 + }, + { + "epoch": 0.48, + "grad_norm": 66.74324899476902, + "learning_rate": 1.2416635860942034e-06, + "logits/chosen": -0.97737717628479, + "logits/rejected": -0.9480463266372681, + "logps/chosen": -421.3519287109375, + "logps/rejected": -556.2459106445312, + "loss": 0.4132, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0725280046463013, + "rewards/margins": 1.3633832931518555, + "rewards/rejected": -2.435911178588867, + "step": 560 + }, + { + "epoch": 0.48, + "grad_norm": 73.63127069033355, + "learning_rate": 1.238756428382645e-06, + "logits/chosen": -0.9148063659667969, + "logits/rejected": -0.9012689590454102, + "logps/chosen": -466.36785888671875, + "logps/rejected": -678.955810546875, + "loss": 0.3981, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2128989696502686, + "rewards/margins": 1.9558484554290771, + "rewards/rejected": -3.1687474250793457, + "step": 561 + }, + { + "epoch": 0.48, + "grad_norm": 92.09052046516862, + "learning_rate": 1.2358471292428842e-06, + "logits/chosen": -0.9906730055809021, + "logits/rejected": -0.9136836528778076, + "logps/chosen": -727.5438232421875, + "logps/rejected": -815.8961181640625, + "loss": 0.5894, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4386944770812988, + "rewards/margins": 1.7987642288208008, + "rewards/rejected": -3.2374587059020996, + "step": 562 + }, + { + "epoch": 0.48, + "grad_norm": 58.5981985039348, + "learning_rate": 1.2329357147686907e-06, + "logits/chosen": -0.9983447790145874, + "logits/rejected": -0.9759050607681274, + "logps/chosen": -643.207275390625, + "logps/rejected": -822.6072998046875, + "loss": 0.3253, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.18256413936615, + "rewards/margins": 2.6189026832580566, + "rewards/rejected": -3.801466941833496, + "step": 563 + }, + { + "epoch": 0.48, + "grad_norm": 79.50938619489895, + "learning_rate": 1.230022211072807e-06, + "logits/chosen": -0.9487758874893188, + "logits/rejected": -0.919625997543335, + "logps/chosen": -617.9252319335938, + "logps/rejected": -784.6173706054688, + "loss": 0.5057, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6651418209075928, + "rewards/margins": 1.4620158672332764, + "rewards/rejected": -3.127157688140869, + "step": 564 + }, + { + "epoch": 0.48, + "grad_norm": 66.18762390042676, + "learning_rate": 1.2271066442867135e-06, + "logits/chosen": -0.9428020715713501, + "logits/rejected": -0.9230005145072937, + "logps/chosen": -454.8497009277344, + "logps/rejected": -749.8408813476562, + "loss": 0.3256, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8824508190155029, + "rewards/margins": 2.7411446571350098, + "rewards/rejected": -3.6235954761505127, + "step": 565 + }, + { + "epoch": 0.49, + "grad_norm": 40.83906170657592, + "learning_rate": 1.224189040560395e-06, + "logits/chosen": -0.9711836576461792, + "logits/rejected": -0.9430567026138306, + "logps/chosen": -459.6309814453125, + "logps/rejected": -692.2510986328125, + "loss": 0.2556, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0203781127929688, + "rewards/margins": 1.8715031147003174, + "rewards/rejected": -2.891881227493286, + "step": 566 + }, + { + "epoch": 0.49, + "grad_norm": 61.9853897269269, + "learning_rate": 1.221269426062105e-06, + "logits/chosen": -0.979697048664093, + "logits/rejected": -0.9400407671928406, + "logps/chosen": -524.4586181640625, + "logps/rejected": -583.64453125, + "loss": 0.5451, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1567959785461426, + "rewards/margins": 1.524742603302002, + "rewards/rejected": -2.6815383434295654, + "step": 567 + }, + { + "epoch": 0.49, + "grad_norm": 78.27335936591861, + "learning_rate": 1.2183478269781336e-06, + "logits/chosen": -0.9543784856796265, + "logits/rejected": -0.9156996011734009, + "logps/chosen": -548.0863037109375, + "logps/rejected": -758.37939453125, + "loss": 0.3461, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1840085983276367, + "rewards/margins": 2.1264264583587646, + "rewards/rejected": -3.3104352951049805, + "step": 568 + }, + { + "epoch": 0.49, + "grad_norm": 60.75530097259052, + "learning_rate": 1.2154242695125692e-06, + "logits/chosen": -1.001960039138794, + "logits/rejected": -0.9309448003768921, + "logps/chosen": -622.9661254882812, + "logps/rejected": -733.677734375, + "loss": 0.3013, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9647570848464966, + "rewards/margins": 1.95701265335083, + "rewards/rejected": -2.921769618988037, + "step": 569 + }, + { + "epoch": 0.49, + "grad_norm": 39.07594264943836, + "learning_rate": 1.2124987798870652e-06, + "logits/chosen": -0.9486143589019775, + "logits/rejected": -0.8990459442138672, + "logps/chosen": -594.9383544921875, + "logps/rejected": -679.9696044921875, + "loss": 0.1746, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0007832050323486, + "rewards/margins": 2.252715826034546, + "rewards/rejected": -3.2534990310668945, + "step": 570 + }, + { + "epoch": 0.49, + "grad_norm": 83.12897907525809, + "learning_rate": 1.2095713843406055e-06, + "logits/chosen": -1.01059889793396, + "logits/rejected": -0.9461950063705444, + "logps/chosen": -512.59619140625, + "logps/rejected": -616.6376953125, + "loss": 0.4345, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1057041883468628, + "rewards/margins": 1.77592134475708, + "rewards/rejected": -2.8816254138946533, + "step": 571 + }, + { + "epoch": 0.49, + "grad_norm": 54.641379490857744, + "learning_rate": 1.2066421091292678e-06, + "logits/chosen": -0.9506186246871948, + "logits/rejected": -0.9077310562133789, + "logps/chosen": -608.8321533203125, + "logps/rejected": -890.667236328125, + "loss": 0.2483, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6453957557678223, + "rewards/margins": 2.200510025024414, + "rewards/rejected": -3.8459057807922363, + "step": 572 + }, + { + "epoch": 0.49, + "grad_norm": 50.41011389855287, + "learning_rate": 1.203710980525989e-06, + "logits/chosen": -0.9624886512756348, + "logits/rejected": -0.9110809564590454, + "logps/chosen": -629.2330932617188, + "logps/rejected": -815.7590942382812, + "loss": 0.23, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8843998908996582, + "rewards/margins": 2.8275225162506104, + "rewards/rejected": -3.7119226455688477, + "step": 573 + }, + { + "epoch": 0.49, + "grad_norm": 67.53346213341143, + "learning_rate": 1.2007780248203297e-06, + "logits/chosen": -0.952025294303894, + "logits/rejected": -0.9283043146133423, + "logps/chosen": -651.7374267578125, + "logps/rejected": -827.676025390625, + "loss": 0.3369, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4210906028747559, + "rewards/margins": 1.837165117263794, + "rewards/rejected": -3.25825572013855, + "step": 574 + }, + { + "epoch": 0.49, + "grad_norm": 71.65141079463194, + "learning_rate": 1.1978432683182362e-06, + "logits/chosen": -1.015653371810913, + "logits/rejected": -0.9584256410598755, + "logps/chosen": -567.3726806640625, + "logps/rejected": -817.5277709960938, + "loss": 0.3354, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2098946571350098, + "rewards/margins": 2.3213119506835938, + "rewards/rejected": -3.5312068462371826, + "step": 575 + }, + { + "epoch": 0.49, + "grad_norm": 38.6052770456671, + "learning_rate": 1.1949067373418083e-06, + "logits/chosen": -0.9470927715301514, + "logits/rejected": -0.9256807565689087, + "logps/chosen": -437.8433837890625, + "logps/rejected": -804.2654418945312, + "loss": 0.21, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.90472811460495, + "rewards/margins": 3.095479965209961, + "rewards/rejected": -4.000207901000977, + "step": 576 + }, + { + "epoch": 0.49, + "grad_norm": 46.437618961672996, + "learning_rate": 1.1919684582290603e-06, + "logits/chosen": -0.9666635990142822, + "logits/rejected": -0.9787067770957947, + "logps/chosen": -413.5608825683594, + "logps/rejected": -606.1953735351562, + "loss": 0.2762, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9771944880485535, + "rewards/margins": 2.034590244293213, + "rewards/rejected": -3.011784553527832, + "step": 577 + }, + { + "epoch": 0.5, + "grad_norm": 72.80610670978952, + "learning_rate": 1.1890284573336854e-06, + "logits/chosen": -0.9922659397125244, + "logits/rejected": -0.92891925573349, + "logps/chosen": -587.3009033203125, + "logps/rejected": -689.83447265625, + "loss": 0.2993, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1690540313720703, + "rewards/margins": 2.1405553817749023, + "rewards/rejected": -3.3096094131469727, + "step": 578 + }, + { + "epoch": 0.5, + "grad_norm": 88.9899177492974, + "learning_rate": 1.1860867610248207e-06, + "logits/chosen": -1.0009547472000122, + "logits/rejected": -0.997244119644165, + "logps/chosen": -589.050537109375, + "logps/rejected": -714.9618530273438, + "loss": 0.7316, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4328802824020386, + "rewards/margins": 1.5104745626449585, + "rewards/rejected": -2.943354845046997, + "step": 579 + }, + { + "epoch": 0.5, + "grad_norm": 83.58436085989156, + "learning_rate": 1.1831433956868085e-06, + "logits/chosen": -0.9802489280700684, + "logits/rejected": -0.9745005965232849, + "logps/chosen": -440.23333740234375, + "logps/rejected": -646.380859375, + "loss": 0.5048, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0479545593261719, + "rewards/margins": 1.576943039894104, + "rewards/rejected": -2.6248974800109863, + "step": 580 + }, + { + "epoch": 0.5, + "grad_norm": 103.96994001693628, + "learning_rate": 1.180198387718961e-06, + "logits/chosen": -1.0007036924362183, + "logits/rejected": -0.9575830698013306, + "logps/chosen": -675.14990234375, + "logps/rejected": -838.3320922851562, + "loss": 0.753, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6041653156280518, + "rewards/margins": 2.025559663772583, + "rewards/rejected": -3.6297249794006348, + "step": 581 + }, + { + "epoch": 0.5, + "grad_norm": 43.837917171955105, + "learning_rate": 1.1772517635353242e-06, + "logits/chosen": -1.0432533025741577, + "logits/rejected": -0.980958104133606, + "logps/chosen": -575.733154296875, + "logps/rejected": -836.3759155273438, + "loss": 0.2372, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.785294771194458, + "rewards/margins": 2.858860492706299, + "rewards/rejected": -3.6441550254821777, + "step": 582 + }, + { + "epoch": 0.5, + "grad_norm": 26.165545034747257, + "learning_rate": 1.1743035495644384e-06, + "logits/chosen": -0.9434425830841064, + "logits/rejected": -0.9176989793777466, + "logps/chosen": -377.6453552246094, + "logps/rejected": -487.42156982421875, + "loss": 0.2029, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3606542646884918, + "rewards/margins": 2.1150460243225098, + "rewards/rejected": -2.4756999015808105, + "step": 583 + }, + { + "epoch": 0.5, + "grad_norm": 27.433102681077315, + "learning_rate": 1.171353772249105e-06, + "logits/chosen": -0.9990615844726562, + "logits/rejected": -0.9651137590408325, + "logps/chosen": -463.429443359375, + "logps/rejected": -799.1802978515625, + "loss": 0.1036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7478154897689819, + "rewards/margins": 3.271451473236084, + "rewards/rejected": -4.0192670822143555, + "step": 584 + }, + { + "epoch": 0.5, + "grad_norm": 54.56104954596308, + "learning_rate": 1.1684024580461454e-06, + "logits/chosen": -0.9565891027450562, + "logits/rejected": -0.9538889527320862, + "logps/chosen": -416.2050476074219, + "logps/rejected": -626.102294921875, + "loss": 0.298, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.821914553642273, + "rewards/margins": 2.1759161949157715, + "rewards/rejected": -2.997830629348755, + "step": 585 + }, + { + "epoch": 0.5, + "grad_norm": 46.90624235741009, + "learning_rate": 1.1654496334261658e-06, + "logits/chosen": -0.96038818359375, + "logits/rejected": -0.965350329875946, + "logps/chosen": -407.58795166015625, + "logps/rejected": -632.771728515625, + "loss": 0.2783, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.224656343460083, + "rewards/margins": 2.23311185836792, + "rewards/rejected": -3.457768201828003, + "step": 586 + }, + { + "epoch": 0.5, + "grad_norm": 44.85395647703205, + "learning_rate": 1.1624953248733203e-06, + "logits/chosen": -1.0106650590896606, + "logits/rejected": -0.963015079498291, + "logps/chosen": -562.22412109375, + "logps/rejected": -734.1881103515625, + "loss": 0.2537, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0141054391860962, + "rewards/margins": 2.6984009742736816, + "rewards/rejected": -3.7125065326690674, + "step": 587 + }, + { + "epoch": 0.5, + "grad_norm": 26.980171055047652, + "learning_rate": 1.1595395588850717e-06, + "logits/chosen": -0.9881083965301514, + "logits/rejected": -0.9700764417648315, + "logps/chosen": -417.80352783203125, + "logps/rejected": -763.685546875, + "loss": 0.1568, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0090665817260742, + "rewards/margins": 2.765437602996826, + "rewards/rejected": -3.7745041847229004, + "step": 588 + }, + { + "epoch": 0.51, + "grad_norm": 47.795237191457275, + "learning_rate": 1.1565823619719554e-06, + "logits/chosen": -0.9474834203720093, + "logits/rejected": -0.9007887840270996, + "logps/chosen": -632.9498291015625, + "logps/rejected": -781.4476318359375, + "loss": 0.2604, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4948418140411377, + "rewards/margins": 2.670294761657715, + "rewards/rejected": -4.165136337280273, + "step": 589 + }, + { + "epoch": 0.51, + "grad_norm": 65.7744725890404, + "learning_rate": 1.1536237606573404e-06, + "logits/chosen": -0.9904187917709351, + "logits/rejected": -0.9322486519813538, + "logps/chosen": -599.295166015625, + "logps/rejected": -696.2531127929688, + "loss": 0.4085, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5187959671020508, + "rewards/margins": 1.557621955871582, + "rewards/rejected": -3.076417922973633, + "step": 590 + }, + { + "epoch": 0.51, + "grad_norm": 61.573912975092064, + "learning_rate": 1.1506637814771913e-06, + "logits/chosen": -0.9480462074279785, + "logits/rejected": -0.9528161287307739, + "logps/chosen": -451.1434326171875, + "logps/rejected": -786.1947021484375, + "loss": 0.2855, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9499127864837646, + "rewards/margins": 2.757876396179199, + "rewards/rejected": -3.7077889442443848, + "step": 591 + }, + { + "epoch": 0.51, + "grad_norm": 48.66734390755508, + "learning_rate": 1.1477024509798325e-06, + "logits/chosen": -0.9910733699798584, + "logits/rejected": -0.9133542776107788, + "logps/chosen": -637.9290771484375, + "logps/rejected": -939.7032470703125, + "loss": 0.1984, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0765866041183472, + "rewards/margins": 2.9725711345672607, + "rewards/rejected": -4.049158096313477, + "step": 592 + }, + { + "epoch": 0.51, + "grad_norm": 50.76606314718852, + "learning_rate": 1.144739795725707e-06, + "logits/chosen": -1.021897315979004, + "logits/rejected": -0.9521700143814087, + "logps/chosen": -539.6085205078125, + "logps/rejected": -684.051025390625, + "loss": 0.2097, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9156203269958496, + "rewards/margins": 2.4041714668273926, + "rewards/rejected": -3.319791793823242, + "step": 593 + }, + { + "epoch": 0.51, + "grad_norm": 75.33018115295012, + "learning_rate": 1.1417758422871404e-06, + "logits/chosen": -0.9927308559417725, + "logits/rejected": -0.9772903919219971, + "logps/chosen": -481.47491455078125, + "logps/rejected": -655.3239135742188, + "loss": 0.505, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4134584665298462, + "rewards/margins": 1.5445600748062134, + "rewards/rejected": -2.9580185413360596, + "step": 594 + }, + { + "epoch": 0.51, + "grad_norm": 79.49013356730272, + "learning_rate": 1.1388106172481015e-06, + "logits/chosen": -0.976418137550354, + "logits/rejected": -0.9609758853912354, + "logps/chosen": -582.8574829101562, + "logps/rejected": -749.47119140625, + "loss": 0.4163, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.068327784538269, + "rewards/margins": 1.7612555027008057, + "rewards/rejected": -2.829583168029785, + "step": 595 + }, + { + "epoch": 0.51, + "grad_norm": 62.84741892276459, + "learning_rate": 1.1358441472039646e-06, + "logits/chosen": -0.9960223436355591, + "logits/rejected": -0.976997971534729, + "logps/chosen": -447.45001220703125, + "logps/rejected": -551.7108764648438, + "loss": 0.539, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1301624774932861, + "rewards/margins": 1.4154155254364014, + "rewards/rejected": -2.5455780029296875, + "step": 596 + }, + { + "epoch": 0.51, + "grad_norm": 42.049801312127514, + "learning_rate": 1.1328764587612702e-06, + "logits/chosen": -1.0407339334487915, + "logits/rejected": -0.9951431751251221, + "logps/chosen": -420.0325927734375, + "logps/rejected": -700.8817749023438, + "loss": 0.2579, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9708338379859924, + "rewards/margins": 2.6650800704956055, + "rewards/rejected": -3.635913610458374, + "step": 597 + }, + { + "epoch": 0.51, + "grad_norm": 66.43047811530603, + "learning_rate": 1.1299075785374874e-06, + "logits/chosen": -0.979002833366394, + "logits/rejected": -0.9451375603675842, + "logps/chosen": -594.0153198242188, + "logps/rejected": -647.1163330078125, + "loss": 0.4018, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.270151138305664, + "rewards/margins": 1.5419825315475464, + "rewards/rejected": -2.812133550643921, + "step": 598 + }, + { + "epoch": 0.51, + "grad_norm": 39.929678506324095, + "learning_rate": 1.1269375331607726e-06, + "logits/chosen": -1.0492507219314575, + "logits/rejected": -0.9841207265853882, + "logps/chosen": -607.1595458984375, + "logps/rejected": -821.344970703125, + "loss": 0.146, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8867940902709961, + "rewards/margins": 2.6768949031829834, + "rewards/rejected": -3.5636887550354004, + "step": 599 + }, + { + "epoch": 0.51, + "grad_norm": 63.07068369742342, + "learning_rate": 1.1239663492697355e-06, + "logits/chosen": -1.0737665891647339, + "logits/rejected": -0.991007924079895, + "logps/chosen": -682.14501953125, + "logps/rejected": -820.7092895507812, + "loss": 0.2867, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1276676654815674, + "rewards/margins": 2.4414119720458984, + "rewards/rejected": -3.5690793991088867, + "step": 600 + }, + { + "epoch": 0.52, + "grad_norm": 40.28793200844207, + "learning_rate": 1.1209940535131947e-06, + "logits/chosen": -1.0836031436920166, + "logits/rejected": -1.001068353652954, + "logps/chosen": -421.8529968261719, + "logps/rejected": -553.0750122070312, + "loss": 0.2355, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8329640030860901, + "rewards/margins": 1.9275306463241577, + "rewards/rejected": -2.7604947090148926, + "step": 601 + }, + { + "epoch": 0.52, + "grad_norm": 78.6354473877312, + "learning_rate": 1.1180206725499424e-06, + "logits/chosen": -0.9799869060516357, + "logits/rejected": -0.9672070741653442, + "logps/chosen": -391.574951171875, + "logps/rejected": -675.866455078125, + "loss": 0.4875, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9515032768249512, + "rewards/margins": 2.0327067375183105, + "rewards/rejected": -2.9842100143432617, + "step": 602 + }, + { + "epoch": 0.52, + "grad_norm": 51.203160420320835, + "learning_rate": 1.115046233048504e-06, + "logits/chosen": -0.9612528085708618, + "logits/rejected": -0.923812210559845, + "logps/chosen": -569.8873291015625, + "logps/rejected": -666.525390625, + "loss": 0.2289, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5378499031066895, + "rewards/margins": 1.9845186471939087, + "rewards/rejected": -2.5223684310913086, + "step": 603 + }, + { + "epoch": 0.52, + "grad_norm": 50.967899550632275, + "learning_rate": 1.1120707616868987e-06, + "logits/chosen": -0.948002278804779, + "logits/rejected": -0.9246044158935547, + "logps/chosen": -543.124755859375, + "logps/rejected": -738.28515625, + "loss": 0.3327, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8030984401702881, + "rewards/margins": 1.7016018629074097, + "rewards/rejected": -2.504700183868408, + "step": 604 + }, + { + "epoch": 0.52, + "grad_norm": 43.06619559392241, + "learning_rate": 1.1090942851524012e-06, + "logits/chosen": -1.0343921184539795, + "logits/rejected": -0.9407830834388733, + "logps/chosen": -632.280029296875, + "logps/rejected": -797.356689453125, + "loss": 0.1989, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6318868398666382, + "rewards/margins": 2.2190303802490234, + "rewards/rejected": -2.850917339324951, + "step": 605 + }, + { + "epoch": 0.52, + "grad_norm": 43.55443697785336, + "learning_rate": 1.106116830141301e-06, + "logits/chosen": -0.9716185331344604, + "logits/rejected": -0.925656795501709, + "logps/chosen": -493.6955871582031, + "logps/rejected": -678.1077880859375, + "loss": 0.1976, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7711277008056641, + "rewards/margins": 2.2040441036224365, + "rewards/rejected": -2.9751715660095215, + "step": 606 + }, + { + "epoch": 0.52, + "grad_norm": 53.70680892666797, + "learning_rate": 1.1031384233586632e-06, + "logits/chosen": -1.0033338069915771, + "logits/rejected": -0.9584630727767944, + "logps/chosen": -653.1076049804688, + "logps/rejected": -841.616943359375, + "loss": 0.2092, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6441432237625122, + "rewards/margins": 2.346440553665161, + "rewards/rejected": -2.990583658218384, + "step": 607 + }, + { + "epoch": 0.52, + "grad_norm": 63.030835220246225, + "learning_rate": 1.1001590915180915e-06, + "logits/chosen": -0.9717227220535278, + "logits/rejected": -0.9596288204193115, + "logps/chosen": -476.002197265625, + "logps/rejected": -581.9302978515625, + "loss": 0.3562, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0460882186889648, + "rewards/margins": 1.1150057315826416, + "rewards/rejected": -2.1610941886901855, + "step": 608 + }, + { + "epoch": 0.52, + "grad_norm": 32.55524769432412, + "learning_rate": 1.0971788613414842e-06, + "logits/chosen": -1.0176827907562256, + "logits/rejected": -0.9593334197998047, + "logps/chosen": -490.81640625, + "logps/rejected": -721.6340942382812, + "loss": 0.1971, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3526301980018616, + "rewards/margins": 2.5123186111450195, + "rewards/rejected": -2.8649487495422363, + "step": 609 + }, + { + "epoch": 0.52, + "grad_norm": 52.07686220232358, + "learning_rate": 1.0941977595587983e-06, + "logits/chosen": -0.9811424016952515, + "logits/rejected": -0.9464911222457886, + "logps/chosen": -440.84222412109375, + "logps/rejected": -677.8856201171875, + "loss": 0.2756, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5718835592269897, + "rewards/margins": 2.2763261795043945, + "rewards/rejected": -2.848209857940674, + "step": 610 + }, + { + "epoch": 0.52, + "grad_norm": 84.38983739427725, + "learning_rate": 1.0912158129078074e-06, + "logits/chosen": -1.0566349029541016, + "logits/rejected": -0.9579128623008728, + "logps/chosen": -611.2342529296875, + "logps/rejected": -662.5355224609375, + "loss": 0.4657, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4705239534378052, + "rewards/margins": 1.3576524257659912, + "rewards/rejected": -2.828176498413086, + "step": 611 + }, + { + "epoch": 0.52, + "grad_norm": 95.745219124751, + "learning_rate": 1.0882330481338634e-06, + "logits/chosen": -1.0057260990142822, + "logits/rejected": -0.9815188646316528, + "logps/chosen": -624.9097900390625, + "logps/rejected": -802.2578125, + "loss": 0.5204, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1399705410003662, + "rewards/margins": 2.3458216190338135, + "rewards/rejected": -3.4857921600341797, + "step": 612 + }, + { + "epoch": 0.53, + "grad_norm": 35.61612150060107, + "learning_rate": 1.0852494919896564e-06, + "logits/chosen": -1.04823637008667, + "logits/rejected": -0.9574418067932129, + "logps/chosen": -450.38531494140625, + "logps/rejected": -605.09619140625, + "loss": 0.2273, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4428199529647827, + "rewards/margins": 2.150094985961914, + "rewards/rejected": -2.5929148197174072, + "step": 613 + }, + { + "epoch": 0.53, + "grad_norm": 88.8221121726083, + "learning_rate": 1.0822651712349728e-06, + "logits/chosen": -0.9696275591850281, + "logits/rejected": -0.9406725168228149, + "logps/chosen": -438.52655029296875, + "logps/rejected": -477.2586669921875, + "loss": 0.6035, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3069283962249756, + "rewards/margins": 0.9594730138778687, + "rewards/rejected": -2.266401529312134, + "step": 614 + }, + { + "epoch": 0.53, + "grad_norm": 108.04629258782866, + "learning_rate": 1.0792801126364585e-06, + "logits/chosen": -0.9994640350341797, + "logits/rejected": -0.956566333770752, + "logps/chosen": -533.0015258789062, + "logps/rejected": -713.0087890625, + "loss": 0.5597, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2643516063690186, + "rewards/margins": 1.590078592300415, + "rewards/rejected": -2.8544301986694336, + "step": 615 + }, + { + "epoch": 0.53, + "grad_norm": 59.464095147384604, + "learning_rate": 1.076294342967377e-06, + "logits/chosen": -0.9867278933525085, + "logits/rejected": -0.9743552207946777, + "logps/chosen": -419.0953063964844, + "logps/rejected": -548.52392578125, + "loss": 0.3782, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8928730487823486, + "rewards/margins": 1.9194822311401367, + "rewards/rejected": -2.8123552799224854, + "step": 616 + }, + { + "epoch": 0.53, + "grad_norm": 56.24052261515072, + "learning_rate": 1.0733078890073682e-06, + "logits/chosen": -1.029302716255188, + "logits/rejected": -0.9855225086212158, + "logps/chosen": -538.539794921875, + "logps/rejected": -725.5313720703125, + "loss": 0.2391, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9613665342330933, + "rewards/margins": 2.064399480819702, + "rewards/rejected": -3.025765895843506, + "step": 617 + }, + { + "epoch": 0.53, + "grad_norm": 179.2209749040515, + "learning_rate": 1.0703207775422106e-06, + "logits/chosen": -0.9867770671844482, + "logits/rejected": -0.9663810133934021, + "logps/chosen": -509.22747802734375, + "logps/rejected": -587.5941162109375, + "loss": 0.3361, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7451854944229126, + "rewards/margins": 1.7211875915527344, + "rewards/rejected": -2.4663732051849365, + "step": 618 + }, + { + "epoch": 0.53, + "grad_norm": 27.876934753230678, + "learning_rate": 1.0673330353635796e-06, + "logits/chosen": -0.9755138158798218, + "logits/rejected": -0.9498250484466553, + "logps/chosen": -447.90704345703125, + "logps/rejected": -960.9149169921875, + "loss": 0.158, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9585582613945007, + "rewards/margins": 3.6578381061553955, + "rewards/rejected": -4.616396903991699, + "step": 619 + }, + { + "epoch": 0.53, + "grad_norm": 48.905850898670266, + "learning_rate": 1.0643446892688077e-06, + "logits/chosen": -1.0404212474822998, + "logits/rejected": -0.9776492714881897, + "logps/chosen": -468.32281494140625, + "logps/rejected": -645.6575927734375, + "loss": 0.3063, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.310685396194458, + "rewards/margins": 1.8421094417572021, + "rewards/rejected": -3.15279483795166, + "step": 620 + }, + { + "epoch": 0.53, + "grad_norm": 79.60473032815341, + "learning_rate": 1.0613557660606441e-06, + "logits/chosen": -1.0487558841705322, + "logits/rejected": -1.0085748434066772, + "logps/chosen": -588.3466796875, + "logps/rejected": -665.53369140625, + "loss": 0.4134, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.34440279006958, + "rewards/margins": 1.925700306892395, + "rewards/rejected": -3.2701029777526855, + "step": 621 + }, + { + "epoch": 0.53, + "grad_norm": 55.38539507823246, + "learning_rate": 1.0583662925470126e-06, + "logits/chosen": -1.027477502822876, + "logits/rejected": -0.9840755462646484, + "logps/chosen": -571.076904296875, + "logps/rejected": -684.5631103515625, + "loss": 0.2579, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2199088335037231, + "rewards/margins": 2.1186861991882324, + "rewards/rejected": -3.338594913482666, + "step": 622 + }, + { + "epoch": 0.53, + "grad_norm": 65.12413866371425, + "learning_rate": 1.0553762955407757e-06, + "logits/chosen": -1.0024276971817017, + "logits/rejected": -0.9541710615158081, + "logps/chosen": -645.3480834960938, + "logps/rejected": -868.0056762695312, + "loss": 0.2964, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9870597720146179, + "rewards/margins": 1.8057602643966675, + "rewards/rejected": -2.7928199768066406, + "step": 623 + }, + { + "epoch": 0.54, + "grad_norm": 54.89562559971715, + "learning_rate": 1.052385801859489e-06, + "logits/chosen": -0.9582788944244385, + "logits/rejected": -0.9098652005195618, + "logps/chosen": -510.34844970703125, + "logps/rejected": -772.8673095703125, + "loss": 0.2481, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3798649311065674, + "rewards/margins": 2.6270527839660645, + "rewards/rejected": -4.006917953491211, + "step": 624 + }, + { + "epoch": 0.54, + "grad_norm": 77.07307778949189, + "learning_rate": 1.0493948383251628e-06, + "logits/chosen": -0.9984012842178345, + "logits/rejected": -1.00254487991333, + "logps/chosen": -596.7376708984375, + "logps/rejected": -779.016357421875, + "loss": 0.349, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0308185815811157, + "rewards/margins": 2.1803455352783203, + "rewards/rejected": -3.2111639976501465, + "step": 625 + }, + { + "epoch": 0.54, + "grad_norm": 27.688582306282832, + "learning_rate": 1.0464034317640226e-06, + "logits/chosen": -1.0623116493225098, + "logits/rejected": -1.0131748914718628, + "logps/chosen": -588.09716796875, + "logps/rejected": -838.4576416015625, + "loss": 0.185, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.712467610836029, + "rewards/margins": 2.243401050567627, + "rewards/rejected": -2.955868721008301, + "step": 626 + }, + { + "epoch": 0.54, + "grad_norm": 55.18732834418408, + "learning_rate": 1.0434116090062663e-06, + "logits/chosen": -0.9450533390045166, + "logits/rejected": -0.8979382514953613, + "logps/chosen": -357.7943115234375, + "logps/rejected": -512.4151611328125, + "loss": 0.391, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3883112370967865, + "rewards/margins": 1.048119306564331, + "rewards/rejected": -1.4364304542541504, + "step": 627 + }, + { + "epoch": 0.54, + "grad_norm": 84.22638818887654, + "learning_rate": 1.040419396885826e-06, + "logits/chosen": -1.0142240524291992, + "logits/rejected": -0.9256457090377808, + "logps/chosen": -577.904052734375, + "logps/rejected": -668.9786987304688, + "loss": 0.4729, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.595212996006012, + "rewards/margins": 1.0791289806365967, + "rewards/rejected": -1.6743419170379639, + "step": 628 + }, + { + "epoch": 0.54, + "grad_norm": 58.71812972569202, + "learning_rate": 1.0374268222401257e-06, + "logits/chosen": -0.8847454786300659, + "logits/rejected": -0.876816987991333, + "logps/chosen": -488.02508544921875, + "logps/rejected": -507.3223571777344, + "loss": 0.3981, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2222071886062622, + "rewards/margins": 0.9527764320373535, + "rewards/rejected": -1.1749836206436157, + "step": 629 + }, + { + "epoch": 0.54, + "grad_norm": 69.55694695794018, + "learning_rate": 1.0344339119098393e-06, + "logits/chosen": -0.9340708255767822, + "logits/rejected": -0.931158185005188, + "logps/chosen": -617.0542602539062, + "logps/rejected": -717.384521484375, + "loss": 0.3453, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5446648597717285, + "rewards/margins": 1.2806169986724854, + "rewards/rejected": -1.8252819776535034, + "step": 630 + }, + { + "epoch": 0.54, + "grad_norm": 54.83156555797582, + "learning_rate": 1.0314406927386538e-06, + "logits/chosen": -0.9774016737937927, + "logits/rejected": -0.9222882986068726, + "logps/chosen": -410.22833251953125, + "logps/rejected": -523.1634521484375, + "loss": 0.3641, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6278001070022583, + "rewards/margins": 1.2517343759536743, + "rewards/rejected": -1.879534363746643, + "step": 631 + }, + { + "epoch": 0.54, + "grad_norm": 82.00952622693666, + "learning_rate": 1.0284471915730251e-06, + "logits/chosen": -1.0299391746520996, + "logits/rejected": -0.9885897636413574, + "logps/chosen": -451.1562805175781, + "logps/rejected": -645.5934448242188, + "loss": 0.5372, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9116708636283875, + "rewards/margins": 1.7287958860397339, + "rewards/rejected": -2.6404666900634766, + "step": 632 + }, + { + "epoch": 0.54, + "grad_norm": 33.30811545108685, + "learning_rate": 1.0254534352619379e-06, + "logits/chosen": -0.9370101094245911, + "logits/rejected": -0.9013323783874512, + "logps/chosen": -387.3674011230469, + "logps/rejected": -653.4251708984375, + "loss": 0.1564, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5667197108268738, + "rewards/margins": 2.4151532649993896, + "rewards/rejected": -2.981873035430908, + "step": 633 + }, + { + "epoch": 0.54, + "grad_norm": 90.51805812937883, + "learning_rate": 1.0224594506566666e-06, + "logits/chosen": -0.9417135715484619, + "logits/rejected": -0.9171383380889893, + "logps/chosen": -725.6295776367188, + "logps/rejected": -796.2141723632812, + "loss": 0.5424, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3215444087982178, + "rewards/margins": 1.8195197582244873, + "rewards/rejected": -3.141064167022705, + "step": 634 + }, + { + "epoch": 0.54, + "grad_norm": 62.04604028789953, + "learning_rate": 1.0194652646105317e-06, + "logits/chosen": -0.981023371219635, + "logits/rejected": -0.9579742550849915, + "logps/chosen": -600.0975341796875, + "logps/rejected": -712.9047241210938, + "loss": 0.299, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0958032608032227, + "rewards/margins": 1.629847526550293, + "rewards/rejected": -2.7256507873535156, + "step": 635 + }, + { + "epoch": 0.55, + "grad_norm": 44.86862238379095, + "learning_rate": 1.0164709039786616e-06, + "logits/chosen": -1.0345852375030518, + "logits/rejected": -0.9957494735717773, + "logps/chosen": -662.246337890625, + "logps/rejected": -1078.583740234375, + "loss": 0.2296, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1617629528045654, + "rewards/margins": 4.0115203857421875, + "rewards/rejected": -5.173283576965332, + "step": 636 + }, + { + "epoch": 0.55, + "grad_norm": 66.67094105532799, + "learning_rate": 1.0134763956177504e-06, + "logits/chosen": -1.0106799602508545, + "logits/rejected": -0.9603334665298462, + "logps/chosen": -593.8453369140625, + "logps/rejected": -729.3505859375, + "loss": 0.3855, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4872370958328247, + "rewards/margins": 1.5294179916381836, + "rewards/rejected": -3.016655206680298, + "step": 637 + }, + { + "epoch": 0.55, + "grad_norm": 29.81447058483848, + "learning_rate": 1.0104817663858161e-06, + "logits/chosen": -0.9926195740699768, + "logits/rejected": -0.9689513444900513, + "logps/chosen": -318.2772216796875, + "logps/rejected": -550.7271118164062, + "loss": 0.245, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8237345218658447, + "rewards/margins": 1.9877159595489502, + "rewards/rejected": -2.811450481414795, + "step": 638 + }, + { + "epoch": 0.55, + "grad_norm": 75.35557968141725, + "learning_rate": 1.0074870431419627e-06, + "logits/chosen": -1.0136584043502808, + "logits/rejected": -0.934355616569519, + "logps/chosen": -665.5938110351562, + "logps/rejected": -847.51953125, + "loss": 0.395, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2840629816055298, + "rewards/margins": 2.6745872497558594, + "rewards/rejected": -3.9586501121520996, + "step": 639 + }, + { + "epoch": 0.55, + "grad_norm": 57.02748396120161, + "learning_rate": 1.0044922527461358e-06, + "logits/chosen": -1.0077900886535645, + "logits/rejected": -0.9565660953521729, + "logps/chosen": -544.416259765625, + "logps/rejected": -856.279541015625, + "loss": 0.2009, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.349835753440857, + "rewards/margins": 2.358369827270508, + "rewards/rejected": -3.7082056999206543, + "step": 640 + }, + { + "epoch": 0.55, + "grad_norm": 21.76600647344353, + "learning_rate": 1.0014974220588836e-06, + "logits/chosen": -0.9938585162162781, + "logits/rejected": -0.9594500064849854, + "logps/chosen": -365.6549987792969, + "logps/rejected": -681.994384765625, + "loss": 0.1668, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7148664593696594, + "rewards/margins": 2.855158805847168, + "rewards/rejected": -3.5700254440307617, + "step": 641 + }, + { + "epoch": 0.55, + "grad_norm": 88.30985096224623, + "learning_rate": 9.985025779411165e-07, + "logits/chosen": -0.9774501323699951, + "logits/rejected": -0.9425498247146606, + "logps/chosen": -552.4366455078125, + "logps/rejected": -776.1644287109375, + "loss": 0.6567, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5388054847717285, + "rewards/margins": 2.0177862644195557, + "rewards/rejected": -3.556591749191284, + "step": 642 + }, + { + "epoch": 0.55, + "grad_norm": 33.67261866832234, + "learning_rate": 9.955077472538648e-07, + "logits/chosen": -1.0780702829360962, + "logits/rejected": -1.0096969604492188, + "logps/chosen": -564.3319091796875, + "logps/rejected": -761.5578002929688, + "loss": 0.226, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1705207824707031, + "rewards/margins": 2.825336217880249, + "rewards/rejected": -3.995857000350952, + "step": 643 + }, + { + "epoch": 0.55, + "grad_norm": 19.057745564111876, + "learning_rate": 9.925129568580374e-07, + "logits/chosen": -0.9582267999649048, + "logits/rejected": -0.941879391670227, + "logps/chosen": -360.990478515625, + "logps/rejected": -773.388671875, + "loss": 0.1027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.433311402797699, + "rewards/margins": 3.6941919326782227, + "rewards/rejected": -4.127503395080566, + "step": 644 + }, + { + "epoch": 0.55, + "grad_norm": 39.503260225728425, + "learning_rate": 9.89518233614184e-07, + "logits/chosen": -1.0283384323120117, + "logits/rejected": -0.9627246856689453, + "logps/chosen": -544.3568725585938, + "logps/rejected": -672.4243774414062, + "loss": 0.2435, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3087122440338135, + "rewards/margins": 1.910134196281433, + "rewards/rejected": -3.218846559524536, + "step": 645 + }, + { + "epoch": 0.55, + "grad_norm": 33.99114210586287, + "learning_rate": 9.8652360438225e-07, + "logits/chosen": -1.0589323043823242, + "logits/rejected": -0.9973713159561157, + "logps/chosen": -550.265625, + "logps/rejected": -813.226318359375, + "loss": 0.1675, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1273949146270752, + "rewards/margins": 2.866863250732422, + "rewards/rejected": -3.9942586421966553, + "step": 646 + }, + { + "epoch": 0.55, + "grad_norm": 50.68095458780209, + "learning_rate": 9.835290960213381e-07, + "logits/chosen": -1.0335006713867188, + "logits/rejected": -1.0002275705337524, + "logps/chosen": -356.583740234375, + "logps/rejected": -690.9955444335938, + "loss": 0.2406, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8581660389900208, + "rewards/margins": 2.352982521057129, + "rewards/rejected": -3.211148262023926, + "step": 647 + }, + { + "epoch": 0.56, + "grad_norm": 55.01033262884792, + "learning_rate": 9.805347353894684e-07, + "logits/chosen": -0.9837990999221802, + "logits/rejected": -0.9763227701187134, + "logps/chosen": -447.79180908203125, + "logps/rejected": -744.4036865234375, + "loss": 0.3265, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1442887783050537, + "rewards/margins": 2.827482223510742, + "rewards/rejected": -3.971770763397217, + "step": 648 + }, + { + "epoch": 0.56, + "grad_norm": 69.9962549998083, + "learning_rate": 9.775405493433336e-07, + "logits/chosen": -1.059086799621582, + "logits/rejected": -0.9900112152099609, + "logps/chosen": -807.8218994140625, + "logps/rejected": -719.1181640625, + "loss": 0.3185, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7500087022781372, + "rewards/margins": 1.4685823917388916, + "rewards/rejected": -3.2185912132263184, + "step": 649 + }, + { + "epoch": 0.56, + "grad_norm": 127.33599373241286, + "learning_rate": 9.745465647380618e-07, + "logits/chosen": -1.0178935527801514, + "logits/rejected": -0.9754936695098877, + "logps/chosen": -677.5139770507812, + "logps/rejected": -957.6383056640625, + "loss": 0.2906, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8714618682861328, + "rewards/margins": 2.3842825889587402, + "rewards/rejected": -4.255744934082031, + "step": 650 + }, + { + "epoch": 0.56, + "grad_norm": 82.60462428449118, + "learning_rate": 9.71552808426975e-07, + "logits/chosen": -1.0387744903564453, + "logits/rejected": -0.9887288808822632, + "logps/chosen": -718.5755004882812, + "logps/rejected": -1037.51953125, + "loss": 0.2611, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3768072128295898, + "rewards/margins": 3.0493197441101074, + "rewards/rejected": -4.4261274337768555, + "step": 651 + }, + { + "epoch": 0.56, + "grad_norm": 79.66272459241623, + "learning_rate": 9.685593072613463e-07, + "logits/chosen": -1.0097852945327759, + "logits/rejected": -0.9862075448036194, + "logps/chosen": -632.6588134765625, + "logps/rejected": -866.3151245117188, + "loss": 0.3264, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7535400390625, + "rewards/margins": 2.2010996341705322, + "rewards/rejected": -3.9546396732330322, + "step": 652 + }, + { + "epoch": 0.56, + "grad_norm": 55.47087463249963, + "learning_rate": 9.655660880901604e-07, + "logits/chosen": -1.0378929376602173, + "logits/rejected": -0.9814661741256714, + "logps/chosen": -619.1103515625, + "logps/rejected": -685.860107421875, + "loss": 0.2389, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1459777355194092, + "rewards/margins": 2.4581964015960693, + "rewards/rejected": -3.6041741371154785, + "step": 653 + }, + { + "epoch": 0.56, + "grad_norm": 27.190095964458937, + "learning_rate": 9.625731777598744e-07, + "logits/chosen": -1.0018420219421387, + "logits/rejected": -0.9675527811050415, + "logps/chosen": -427.070068359375, + "logps/rejected": -564.972900390625, + "loss": 0.1653, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7308350801467896, + "rewards/margins": 2.6819570064544678, + "rewards/rejected": -3.4127919673919678, + "step": 654 + }, + { + "epoch": 0.56, + "grad_norm": 47.02737017794142, + "learning_rate": 9.595806031141738e-07, + "logits/chosen": -1.0094876289367676, + "logits/rejected": -0.9599051475524902, + "logps/chosen": -543.79541015625, + "logps/rejected": -703.00927734375, + "loss": 0.1868, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.366621971130371, + "rewards/margins": 2.607872486114502, + "rewards/rejected": -3.974494457244873, + "step": 655 + }, + { + "epoch": 0.56, + "grad_norm": 42.21459880445656, + "learning_rate": 9.565883909937339e-07, + "logits/chosen": -1.0014833211898804, + "logits/rejected": -0.940342366695404, + "logps/chosen": -557.1934204101562, + "logps/rejected": -724.4772338867188, + "loss": 0.2306, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4227467775344849, + "rewards/margins": 2.1267175674438477, + "rewards/rejected": -3.549463987350464, + "step": 656 + }, + { + "epoch": 0.56, + "grad_norm": 52.19377804700672, + "learning_rate": 9.535965682359777e-07, + "logits/chosen": -1.0168036222457886, + "logits/rejected": -0.9771864414215088, + "logps/chosen": -378.285888671875, + "logps/rejected": -516.0848388671875, + "loss": 0.3705, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6493128538131714, + "rewards/margins": 2.358621120452881, + "rewards/rejected": -3.007934093475342, + "step": 657 + }, + { + "epoch": 0.56, + "grad_norm": 47.28051655396848, + "learning_rate": 9.506051616748373e-07, + "logits/chosen": -1.052863359451294, + "logits/rejected": -0.9798964262008667, + "logps/chosen": -632.6463012695312, + "logps/rejected": -971.0015869140625, + "loss": 0.1349, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1951112747192383, + "rewards/margins": 3.460440158843994, + "rewards/rejected": -4.655550956726074, + "step": 658 + }, + { + "epoch": 0.57, + "grad_norm": 44.463965490286, + "learning_rate": 9.476141981405112e-07, + "logits/chosen": -1.0496432781219482, + "logits/rejected": -1.0346564054489136, + "logps/chosen": -476.4952392578125, + "logps/rejected": -637.9500122070312, + "loss": 0.2351, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.085493564605713, + "rewards/margins": 2.589820384979248, + "rewards/rejected": -3.675313949584961, + "step": 659 + }, + { + "epoch": 0.57, + "grad_norm": 68.27512907635017, + "learning_rate": 9.44623704459224e-07, + "logits/chosen": -1.050779104232788, + "logits/rejected": -1.0321484804153442, + "logps/chosen": -491.0112609863281, + "logps/rejected": -692.8443603515625, + "loss": 0.2597, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3593103885650635, + "rewards/margins": 2.38303542137146, + "rewards/rejected": -3.7423458099365234, + "step": 660 + }, + { + "epoch": 0.57, + "grad_norm": 82.05647902225522, + "learning_rate": 9.416337074529872e-07, + "logits/chosen": -1.0249155759811401, + "logits/rejected": -0.9780560731887817, + "logps/chosen": -583.6378173828125, + "logps/rejected": -788.2223510742188, + "loss": 0.5512, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0925850868225098, + "rewards/margins": 2.51676082611084, + "rewards/rejected": -3.6093459129333496, + "step": 661 + }, + { + "epoch": 0.57, + "grad_norm": 45.265104089038, + "learning_rate": 9.386442339393563e-07, + "logits/chosen": -1.024889588356018, + "logits/rejected": -0.9728207588195801, + "logps/chosen": -473.43365478515625, + "logps/rejected": -656.6044921875, + "loss": 0.1839, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.80340975522995, + "rewards/margins": 2.5102243423461914, + "rewards/rejected": -3.313633918762207, + "step": 662 + }, + { + "epoch": 0.57, + "grad_norm": 65.92164850179016, + "learning_rate": 9.356553107311921e-07, + "logits/chosen": -1.016336441040039, + "logits/rejected": -1.0131916999816895, + "logps/chosen": -474.9246520996094, + "logps/rejected": -530.8618774414062, + "loss": 0.4065, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9806596040725708, + "rewards/margins": 1.3442504405975342, + "rewards/rejected": -2.3249101638793945, + "step": 663 + }, + { + "epoch": 0.57, + "grad_norm": 81.83170951612074, + "learning_rate": 9.326669646364203e-07, + "logits/chosen": -0.9990056753158569, + "logits/rejected": -0.9700546264648438, + "logps/chosen": -473.5721740722656, + "logps/rejected": -528.7078247070312, + "loss": 0.6083, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9107904434204102, + "rewards/margins": 1.244102954864502, + "rewards/rejected": -2.154893398284912, + "step": 664 + }, + { + "epoch": 0.57, + "grad_norm": 55.94802612690403, + "learning_rate": 9.296792224577894e-07, + "logits/chosen": -1.024969458580017, + "logits/rejected": -0.9708298444747925, + "logps/chosen": -538.3052978515625, + "logps/rejected": -770.4991455078125, + "loss": 0.2874, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.726364254951477, + "rewards/margins": 2.3672268390655518, + "rewards/rejected": -3.0935909748077393, + "step": 665 + }, + { + "epoch": 0.57, + "grad_norm": 65.92778234189511, + "learning_rate": 9.266921109926317e-07, + "logits/chosen": -1.020105242729187, + "logits/rejected": -0.9524201154708862, + "logps/chosen": -530.647705078125, + "logps/rejected": -631.1388549804688, + "loss": 0.4601, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9452372789382935, + "rewards/margins": 1.8844785690307617, + "rewards/rejected": -2.8297157287597656, + "step": 666 + }, + { + "epoch": 0.57, + "grad_norm": 47.23741485222515, + "learning_rate": 9.237056570326231e-07, + "logits/chosen": -0.995692789554596, + "logits/rejected": -0.963083028793335, + "logps/chosen": -438.02374267578125, + "logps/rejected": -751.4575805664062, + "loss": 0.2614, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4857766032218933, + "rewards/margins": 2.190925359725952, + "rewards/rejected": -2.6767020225524902, + "step": 667 + }, + { + "epoch": 0.57, + "grad_norm": 37.82814644493825, + "learning_rate": 9.207198873635413e-07, + "logits/chosen": -1.0047978162765503, + "logits/rejected": -0.9785387516021729, + "logps/chosen": -464.0424499511719, + "logps/rejected": -700.0529174804688, + "loss": 0.196, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7992026805877686, + "rewards/margins": 2.433967351913452, + "rewards/rejected": -3.2331700325012207, + "step": 668 + }, + { + "epoch": 0.57, + "grad_norm": 67.30485233904321, + "learning_rate": 9.177348287650273e-07, + "logits/chosen": -1.024238109588623, + "logits/rejected": -0.9735568761825562, + "logps/chosen": -440.551513671875, + "logps/rejected": -649.260009765625, + "loss": 0.3645, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7555053234100342, + "rewards/margins": 1.5785208940505981, + "rewards/rejected": -2.334026336669922, + "step": 669 + }, + { + "epoch": 0.57, + "grad_norm": 51.33789079996192, + "learning_rate": 9.147505080103436e-07, + "logits/chosen": -1.0277349948883057, + "logits/rejected": -0.9844967126846313, + "logps/chosen": -485.0599365234375, + "logps/rejected": -690.2149658203125, + "loss": 0.2801, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0582692623138428, + "rewards/margins": 1.9671037197113037, + "rewards/rejected": -3.0253729820251465, + "step": 670 + }, + { + "epoch": 0.58, + "grad_norm": 84.470816979724, + "learning_rate": 9.117669518661365e-07, + "logits/chosen": -0.9990596771240234, + "logits/rejected": -0.9557846784591675, + "logps/chosen": -474.22113037109375, + "logps/rejected": -616.2432861328125, + "loss": 0.4854, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1680184602737427, + "rewards/margins": 1.3425781726837158, + "rewards/rejected": -2.510596513748169, + "step": 671 + }, + { + "epoch": 0.58, + "grad_norm": 34.1166550327403, + "learning_rate": 9.087841870921929e-07, + "logits/chosen": -1.0641005039215088, + "logits/rejected": -1.0280168056488037, + "logps/chosen": -365.0167236328125, + "logps/rejected": -560.3458251953125, + "loss": 0.238, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7898271083831787, + "rewards/margins": 2.0286502838134766, + "rewards/rejected": -2.8184773921966553, + "step": 672 + }, + { + "epoch": 0.58, + "grad_norm": 60.0255646685014, + "learning_rate": 9.058022404412018e-07, + "logits/chosen": -1.0256595611572266, + "logits/rejected": -1.0097715854644775, + "logps/chosen": -336.9100646972656, + "logps/rejected": -485.3856201171875, + "loss": 0.3979, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9657609462738037, + "rewards/margins": 1.0824462175369263, + "rewards/rejected": -2.0482070446014404, + "step": 673 + }, + { + "epoch": 0.58, + "grad_norm": 52.558938660404124, + "learning_rate": 9.028211386585157e-07, + "logits/chosen": -1.0598628520965576, + "logits/rejected": -1.0078938007354736, + "logps/chosen": -487.5439147949219, + "logps/rejected": -664.8508911132812, + "loss": 0.2814, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8298671245574951, + "rewards/margins": 2.1915345191955566, + "rewards/rejected": -3.0214016437530518, + "step": 674 + }, + { + "epoch": 0.58, + "grad_norm": 79.31656307930656, + "learning_rate": 8.998409084819087e-07, + "logits/chosen": -1.0546629428863525, + "logits/rejected": -1.0089797973632812, + "logps/chosen": -675.748046875, + "logps/rejected": -908.391845703125, + "loss": 0.4046, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3004239797592163, + "rewards/margins": 2.9876890182495117, + "rewards/rejected": -4.288113117218018, + "step": 675 + }, + { + "epoch": 0.58, + "grad_norm": 35.20555496949064, + "learning_rate": 8.968615766413365e-07, + "logits/chosen": -1.0567575693130493, + "logits/rejected": -1.0212836265563965, + "logps/chosen": -423.4122314453125, + "logps/rejected": -738.394287109375, + "loss": 0.1584, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.609798014163971, + "rewards/margins": 3.163112163543701, + "rewards/rejected": -3.7729101181030273, + "step": 676 + }, + { + "epoch": 0.58, + "grad_norm": 49.798090039342085, + "learning_rate": 8.938831698586993e-07, + "logits/chosen": -1.0136868953704834, + "logits/rejected": -0.9751554727554321, + "logps/chosen": -400.88995361328125, + "logps/rejected": -635.0789794921875, + "loss": 0.2124, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1595284938812256, + "rewards/margins": 2.3611745834350586, + "rewards/rejected": -3.520702838897705, + "step": 677 + }, + { + "epoch": 0.58, + "grad_norm": 38.677926146867755, + "learning_rate": 8.90905714847599e-07, + "logits/chosen": -1.075408697128296, + "logits/rejected": -1.0172092914581299, + "logps/chosen": -539.0830078125, + "logps/rejected": -641.5501708984375, + "loss": 0.169, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0429928302764893, + "rewards/margins": 2.260859251022339, + "rewards/rejected": -3.303852081298828, + "step": 678 + }, + { + "epoch": 0.58, + "grad_norm": 55.458709677682904, + "learning_rate": 8.87929238313101e-07, + "logits/chosen": -1.0920605659484863, + "logits/rejected": -1.0440154075622559, + "logps/chosen": -455.96051025390625, + "logps/rejected": -724.99560546875, + "loss": 0.2578, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0661029815673828, + "rewards/margins": 2.8346362113952637, + "rewards/rejected": -3.9007389545440674, + "step": 679 + }, + { + "epoch": 0.58, + "grad_norm": 52.56159035268648, + "learning_rate": 8.849537669514961e-07, + "logits/chosen": -1.018843650817871, + "logits/rejected": -0.997867226600647, + "logps/chosen": -395.1448974609375, + "logps/rejected": -477.63116455078125, + "loss": 0.428, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9292417764663696, + "rewards/margins": 1.5945565700531006, + "rewards/rejected": -2.5237984657287598, + "step": 680 + }, + { + "epoch": 0.58, + "grad_norm": 31.972474581807276, + "learning_rate": 8.819793274500577e-07, + "logits/chosen": -1.0466132164001465, + "logits/rejected": -1.0130168199539185, + "logps/chosen": -539.3006591796875, + "logps/rejected": -813.1967163085938, + "loss": 0.1193, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9003525972366333, + "rewards/margins": 3.3533737659454346, + "rewards/rejected": -4.253726005554199, + "step": 681 + }, + { + "epoch": 0.58, + "grad_norm": 47.21458017557427, + "learning_rate": 8.790059464868051e-07, + "logits/chosen": -1.108288288116455, + "logits/rejected": -1.025892972946167, + "logps/chosen": -577.5280151367188, + "logps/rejected": -683.5076904296875, + "loss": 0.1913, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9032981395721436, + "rewards/margins": 2.549906015396118, + "rewards/rejected": -3.4532041549682617, + "step": 682 + }, + { + "epoch": 0.59, + "grad_norm": 67.17080337761237, + "learning_rate": 8.760336507302644e-07, + "logits/chosen": -1.068331003189087, + "logits/rejected": -1.0443072319030762, + "logps/chosen": -470.96563720703125, + "logps/rejected": -577.9490356445312, + "loss": 0.3619, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0024915933609009, + "rewards/margins": 2.034213066101074, + "rewards/rejected": -3.0367047786712646, + "step": 683 + }, + { + "epoch": 0.59, + "grad_norm": 79.39125605456371, + "learning_rate": 8.730624668392274e-07, + "logits/chosen": -1.011876106262207, + "logits/rejected": -0.9848489761352539, + "logps/chosen": -395.51123046875, + "logps/rejected": -556.3402099609375, + "loss": 0.4688, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3166017532348633, + "rewards/margins": 1.682083249092102, + "rewards/rejected": -2.998684883117676, + "step": 684 + }, + { + "epoch": 0.59, + "grad_norm": 32.74442625831233, + "learning_rate": 8.700924214625129e-07, + "logits/chosen": -1.104191780090332, + "logits/rejected": -1.0686068534851074, + "logps/chosen": -329.3619689941406, + "logps/rejected": -559.1907958984375, + "loss": 0.248, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5531414151191711, + "rewards/margins": 2.6192808151245117, + "rewards/rejected": -3.172422409057617, + "step": 685 + }, + { + "epoch": 0.59, + "grad_norm": 61.697891131967666, + "learning_rate": 8.671235412387294e-07, + "logits/chosen": -1.0755338668823242, + "logits/rejected": -1.0327041149139404, + "logps/chosen": -499.4652099609375, + "logps/rejected": -629.8323364257812, + "loss": 0.3479, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.190791130065918, + "rewards/margins": 2.2594170570373535, + "rewards/rejected": -3.4502081871032715, + "step": 686 + }, + { + "epoch": 0.59, + "grad_norm": 52.226803144753, + "learning_rate": 8.641558527960353e-07, + "logits/chosen": -1.091662883758545, + "logits/rejected": -1.0301198959350586, + "logps/chosen": -495.9996337890625, + "logps/rejected": -590.9207153320312, + "loss": 0.3159, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9306167960166931, + "rewards/margins": 2.0519721508026123, + "rewards/rejected": -2.98258900642395, + "step": 687 + }, + { + "epoch": 0.59, + "grad_norm": 37.689109383788484, + "learning_rate": 8.611893827518987e-07, + "logits/chosen": -1.003065586090088, + "logits/rejected": -0.9978610873222351, + "logps/chosen": -495.4908752441406, + "logps/rejected": -803.0103759765625, + "loss": 0.1529, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.143528699874878, + "rewards/margins": 2.74113130569458, + "rewards/rejected": -3.884659767150879, + "step": 688 + }, + { + "epoch": 0.59, + "grad_norm": 57.00456224390602, + "learning_rate": 8.582241577128596e-07, + "logits/chosen": -1.0166442394256592, + "logits/rejected": -1.0022616386413574, + "logps/chosen": -444.8175964355469, + "logps/rejected": -605.216064453125, + "loss": 0.2829, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.45799413323402405, + "rewards/margins": 1.3977954387664795, + "rewards/rejected": -1.8557895421981812, + "step": 689 + }, + { + "epoch": 0.59, + "grad_norm": 90.3576776617285, + "learning_rate": 8.552602042742929e-07, + "logits/chosen": -0.9633793830871582, + "logits/rejected": -0.9141581058502197, + "logps/chosen": -486.4992980957031, + "logps/rejected": -623.2057495117188, + "loss": 0.5209, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09763508290052414, + "rewards/margins": 0.8215664029121399, + "rewards/rejected": -0.9192014932632446, + "step": 690 + }, + { + "epoch": 0.59, + "grad_norm": 75.80511298063934, + "learning_rate": 8.522975490201676e-07, + "logits/chosen": -0.9981414079666138, + "logits/rejected": -0.9680377840995789, + "logps/chosen": -582.814208984375, + "logps/rejected": -789.6953125, + "loss": 0.4338, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12799492478370667, + "rewards/margins": 1.251299262046814, + "rewards/rejected": -1.3792941570281982, + "step": 691 + }, + { + "epoch": 0.59, + "grad_norm": 58.03890101081773, + "learning_rate": 8.493362185228085e-07, + "logits/chosen": -1.0232558250427246, + "logits/rejected": -1.0005247592926025, + "logps/chosen": -470.6429138183594, + "logps/rejected": -584.0569458007812, + "loss": 0.2724, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6013145446777344, + "rewards/margins": 2.231989860534668, + "rewards/rejected": -2.8333044052124023, + "step": 692 + }, + { + "epoch": 0.59, + "grad_norm": 39.57112612968621, + "learning_rate": 8.463762393426596e-07, + "logits/chosen": -1.0991417169570923, + "logits/rejected": -1.0630440711975098, + "logps/chosen": -457.98760986328125, + "logps/rejected": -599.1575317382812, + "loss": 0.2307, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7417198419570923, + "rewards/margins": 2.1613929271698, + "rewards/rejected": -2.9031128883361816, + "step": 693 + }, + { + "epoch": 0.6, + "grad_norm": 85.58269634576412, + "learning_rate": 8.434176380280445e-07, + "logits/chosen": -1.139190912246704, + "logits/rejected": -1.1178913116455078, + "logps/chosen": -432.321533203125, + "logps/rejected": -602.7254638671875, + "loss": 0.6591, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0561460256576538, + "rewards/margins": 1.7356929779052734, + "rewards/rejected": -2.791839122772217, + "step": 694 + }, + { + "epoch": 0.6, + "grad_norm": 60.306927620731216, + "learning_rate": 8.404604411149279e-07, + "logits/chosen": -1.0914738178253174, + "logits/rejected": -1.0556695461273193, + "logps/chosen": -388.7018737792969, + "logps/rejected": -444.811279296875, + "loss": 0.4614, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7933942079544067, + "rewards/margins": 1.2613402605056763, + "rewards/rejected": -2.054734230041504, + "step": 695 + }, + { + "epoch": 0.6, + "grad_norm": 112.37605995114386, + "learning_rate": 8.375046751266796e-07, + "logits/chosen": -1.071258783340454, + "logits/rejected": -1.0431017875671387, + "logps/chosen": -693.3665771484375, + "logps/rejected": -638.94482421875, + "loss": 0.8419, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8593192100524902, + "rewards/margins": 0.6869137287139893, + "rewards/rejected": -2.5462329387664795, + "step": 696 + }, + { + "epoch": 0.6, + "grad_norm": 81.30328331871632, + "learning_rate": 8.345503665738343e-07, + "logits/chosen": -1.1141998767852783, + "logits/rejected": -1.0982260704040527, + "logps/chosen": -762.9190673828125, + "logps/rejected": -865.8209228515625, + "loss": 0.4519, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9578491449356079, + "rewards/margins": 2.394010305404663, + "rewards/rejected": -3.3518595695495605, + "step": 697 + }, + { + "epoch": 0.6, + "grad_norm": 51.28776378440233, + "learning_rate": 8.31597541953855e-07, + "logits/chosen": -1.0857526063919067, + "logits/rejected": -1.0541229248046875, + "logps/chosen": -576.4926147460938, + "logps/rejected": -903.2041625976562, + "loss": 0.1986, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.033154010772705, + "rewards/margins": 3.0796685218811035, + "rewards/rejected": -4.112822532653809, + "step": 698 + }, + { + "epoch": 0.6, + "grad_norm": 57.160000515124885, + "learning_rate": 8.28646227750895e-07, + "logits/chosen": -1.0621399879455566, + "logits/rejected": -1.0101492404937744, + "logps/chosen": -623.4498291015625, + "logps/rejected": -849.6536254882812, + "loss": 0.2408, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0524137020111084, + "rewards/margins": 2.7270641326904297, + "rewards/rejected": -3.779478073120117, + "step": 699 + }, + { + "epoch": 0.6, + "grad_norm": 35.66216623997116, + "learning_rate": 8.256964504355616e-07, + "logits/chosen": -1.0806918144226074, + "logits/rejected": -1.068446159362793, + "logps/chosen": -572.2581176757812, + "logps/rejected": -722.5521240234375, + "loss": 0.1679, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0125174522399902, + "rewards/margins": 2.556424140930176, + "rewards/rejected": -3.568941593170166, + "step": 700 + }, + { + "epoch": 0.6, + "grad_norm": 46.657284636799524, + "learning_rate": 8.227482364646761e-07, + "logits/chosen": -1.1175563335418701, + "logits/rejected": -1.0721676349639893, + "logps/chosen": -326.6000061035156, + "logps/rejected": -496.0318298339844, + "loss": 0.3426, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.769618034362793, + "rewards/margins": 1.6918368339538574, + "rewards/rejected": -2.4614548683166504, + "step": 701 + }, + { + "epoch": 0.6, + "grad_norm": 37.87281245613246, + "learning_rate": 8.198016122810387e-07, + "logits/chosen": -1.1282148361206055, + "logits/rejected": -1.045536994934082, + "logps/chosen": -478.67999267578125, + "logps/rejected": -792.648193359375, + "loss": 0.1563, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9513812065124512, + "rewards/margins": 3.4069557189941406, + "rewards/rejected": -4.358336925506592, + "step": 702 + }, + { + "epoch": 0.6, + "grad_norm": 82.09000373881834, + "learning_rate": 8.168566043131917e-07, + "logits/chosen": -1.1367781162261963, + "logits/rejected": -1.0976234674453735, + "logps/chosen": -491.50787353515625, + "logps/rejected": -632.8331298828125, + "loss": 0.5357, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3084259033203125, + "rewards/margins": 1.7329047918319702, + "rewards/rejected": -3.0413308143615723, + "step": 703 + }, + { + "epoch": 0.6, + "grad_norm": 47.92059762815023, + "learning_rate": 8.139132389751793e-07, + "logits/chosen": -1.1266899108886719, + "logits/rejected": -1.0857560634613037, + "logps/chosen": -483.18505859375, + "logps/rejected": -612.6836547851562, + "loss": 0.2122, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8871411085128784, + "rewards/margins": 2.389219045639038, + "rewards/rejected": -3.276360034942627, + "step": 704 + }, + { + "epoch": 0.6, + "grad_norm": 64.29930782092856, + "learning_rate": 8.109715426663144e-07, + "logits/chosen": -1.1286720037460327, + "logits/rejected": -1.106456995010376, + "logps/chosen": -493.1081237792969, + "logps/rejected": -593.7147827148438, + "loss": 0.3804, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0602256059646606, + "rewards/margins": 1.802858829498291, + "rewards/rejected": -2.863084316253662, + "step": 705 + }, + { + "epoch": 0.61, + "grad_norm": 67.54118085026357, + "learning_rate": 8.080315417709396e-07, + "logits/chosen": -1.1312260627746582, + "logits/rejected": -1.0784671306610107, + "logps/chosen": -658.8128051757812, + "logps/rejected": -790.6376953125, + "loss": 0.3104, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2036482095718384, + "rewards/margins": 2.3108599185943604, + "rewards/rejected": -3.5145082473754883, + "step": 706 + }, + { + "epoch": 0.61, + "grad_norm": 66.40738539477269, + "learning_rate": 8.050932626581918e-07, + "logits/chosen": -1.0875067710876465, + "logits/rejected": -1.0200226306915283, + "logps/chosen": -600.2384033203125, + "logps/rejected": -778.6403198242188, + "loss": 0.3488, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3704009056091309, + "rewards/margins": 2.3894481658935547, + "rewards/rejected": -3.7598493099212646, + "step": 707 + }, + { + "epoch": 0.61, + "grad_norm": 43.056248164179266, + "learning_rate": 8.021567316817637e-07, + "logits/chosen": -1.1687889099121094, + "logits/rejected": -1.0820575952529907, + "logps/chosen": -612.1283569335938, + "logps/rejected": -858.5739135742188, + "loss": 0.113, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9502978324890137, + "rewards/margins": 3.2523093223571777, + "rewards/rejected": -4.202607154846191, + "step": 708 + }, + { + "epoch": 0.61, + "grad_norm": 34.882399762039014, + "learning_rate": 7.992219751796704e-07, + "logits/chosen": -1.0977517366409302, + "logits/rejected": -1.0606008768081665, + "logps/chosen": -386.17108154296875, + "logps/rejected": -614.8777465820312, + "loss": 0.2711, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7608810663223267, + "rewards/margins": 2.5815372467041016, + "rewards/rejected": -3.3424181938171387, + "step": 709 + }, + { + "epoch": 0.61, + "grad_norm": 41.4699572158317, + "learning_rate": 7.962890194740108e-07, + "logits/chosen": -1.159165859222412, + "logits/rejected": -1.1114017963409424, + "logps/chosen": -440.3230285644531, + "logps/rejected": -511.60504150390625, + "loss": 0.2059, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5045056343078613, + "rewards/margins": 2.3634910583496094, + "rewards/rejected": -2.8679966926574707, + "step": 710 + }, + { + "epoch": 0.61, + "grad_norm": 32.88599014002069, + "learning_rate": 7.933578908707324e-07, + "logits/chosen": -1.0776267051696777, + "logits/rejected": -1.0276210308074951, + "logps/chosen": -521.4286499023438, + "logps/rejected": -695.8472900390625, + "loss": 0.1634, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.188821792602539, + "rewards/margins": 2.572869062423706, + "rewards/rejected": -3.761690855026245, + "step": 711 + }, + { + "epoch": 0.61, + "grad_norm": 65.68899031321791, + "learning_rate": 7.904286156593946e-07, + "logits/chosen": -1.1048779487609863, + "logits/rejected": -1.0757970809936523, + "logps/chosen": -292.3482360839844, + "logps/rejected": -528.6915283203125, + "loss": 0.5532, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6723714470863342, + "rewards/margins": 1.703259825706482, + "rewards/rejected": -2.375631093978882, + "step": 712 + }, + { + "epoch": 0.61, + "grad_norm": 78.97284178105878, + "learning_rate": 7.87501220112935e-07, + "logits/chosen": -1.143258810043335, + "logits/rejected": -1.079023838043213, + "logps/chosen": -615.0197143554688, + "logps/rejected": -760.0322265625, + "loss": 0.3618, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.426388144493103, + "rewards/margins": 2.533200740814209, + "rewards/rejected": -3.9595890045166016, + "step": 713 + }, + { + "epoch": 0.61, + "grad_norm": 40.012440631122075, + "learning_rate": 7.845757304874312e-07, + "logits/chosen": -1.1630828380584717, + "logits/rejected": -1.1020534038543701, + "logps/chosen": -494.54901123046875, + "logps/rejected": -600.6971435546875, + "loss": 0.229, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0917311906814575, + "rewards/margins": 2.534214973449707, + "rewards/rejected": -3.625946521759033, + "step": 714 + }, + { + "epoch": 0.61, + "grad_norm": 39.65384568557147, + "learning_rate": 7.816521730218663e-07, + "logits/chosen": -1.1046650409698486, + "logits/rejected": -1.0768277645111084, + "logps/chosen": -405.21148681640625, + "logps/rejected": -637.754638671875, + "loss": 0.2215, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9128862619400024, + "rewards/margins": 2.3731446266174316, + "rewards/rejected": -3.2860312461853027, + "step": 715 + }, + { + "epoch": 0.61, + "grad_norm": 85.73045814483734, + "learning_rate": 7.787305739378949e-07, + "logits/chosen": -1.1642417907714844, + "logits/rejected": -1.1198418140411377, + "logps/chosen": -488.8841857910156, + "logps/rejected": -612.7138671875, + "loss": 0.4937, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2218539714813232, + "rewards/margins": 1.5381131172180176, + "rewards/rejected": -2.759967088699341, + "step": 716 + }, + { + "epoch": 0.61, + "grad_norm": 42.53386185849525, + "learning_rate": 7.758109594396053e-07, + "logits/chosen": -1.089572548866272, + "logits/rejected": -1.0360852479934692, + "logps/chosen": -579.4656982421875, + "logps/rejected": -960.8588256835938, + "loss": 0.1665, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8310198783874512, + "rewards/margins": 4.329297065734863, + "rewards/rejected": -5.160316467285156, + "step": 717 + }, + { + "epoch": 0.62, + "grad_norm": 61.91236169763918, + "learning_rate": 7.728933557132864e-07, + "logits/chosen": -1.1526691913604736, + "logits/rejected": -1.1194026470184326, + "logps/chosen": -639.4688720703125, + "logps/rejected": -732.7659912109375, + "loss": 0.2888, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6607766151428223, + "rewards/margins": 1.501218557357788, + "rewards/rejected": -3.1619949340820312, + "step": 718 + }, + { + "epoch": 0.62, + "grad_norm": 62.81120284872164, + "learning_rate": 7.69977788927193e-07, + "logits/chosen": -1.0805715322494507, + "logits/rejected": -1.0669435262680054, + "logps/chosen": -659.6182861328125, + "logps/rejected": -815.5236206054688, + "loss": 0.2639, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2269022464752197, + "rewards/margins": 2.239307165145874, + "rewards/rejected": -3.4662094116210938, + "step": 719 + }, + { + "epoch": 0.62, + "grad_norm": 54.1255651596908, + "learning_rate": 7.670642852313093e-07, + "logits/chosen": -1.1983035802841187, + "logits/rejected": -1.1525425910949707, + "logps/chosen": -632.2000732421875, + "logps/rejected": -597.90283203125, + "loss": 0.2917, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4242873191833496, + "rewards/margins": 1.8659169673919678, + "rewards/rejected": -3.2902045249938965, + "step": 720 + }, + { + "epoch": 0.62, + "grad_norm": 30.82111376515087, + "learning_rate": 7.641528707571157e-07, + "logits/chosen": -1.1706678867340088, + "logits/rejected": -1.1194672584533691, + "logps/chosen": -472.59967041015625, + "logps/rejected": -579.7078247070312, + "loss": 0.1936, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8377057313919067, + "rewards/margins": 2.2968480587005615, + "rewards/rejected": -3.134553909301758, + "step": 721 + }, + { + "epoch": 0.62, + "grad_norm": 50.88348011114949, + "learning_rate": 7.612435716173551e-07, + "logits/chosen": -1.167925238609314, + "logits/rejected": -1.1308338642120361, + "logps/chosen": -470.2132263183594, + "logps/rejected": -618.6790771484375, + "loss": 0.2607, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9302735328674316, + "rewards/margins": 2.231713056564331, + "rewards/rejected": -3.1619863510131836, + "step": 722 + }, + { + "epoch": 0.62, + "grad_norm": 63.98869670043746, + "learning_rate": 7.583364139057967e-07, + "logits/chosen": -1.13923180103302, + "logits/rejected": -1.11210298538208, + "logps/chosen": -669.2499389648438, + "logps/rejected": -699.835205078125, + "loss": 0.2721, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8878486156463623, + "rewards/margins": 1.806983232498169, + "rewards/rejected": -3.6948318481445312, + "step": 723 + }, + { + "epoch": 0.62, + "grad_norm": 64.99137054774114, + "learning_rate": 7.55431423697003e-07, + "logits/chosen": -1.1343735456466675, + "logits/rejected": -1.081453800201416, + "logps/chosen": -517.8303833007812, + "logps/rejected": -810.4320068359375, + "loss": 0.301, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0299971103668213, + "rewards/margins": 3.119328022003174, + "rewards/rejected": -4.149324893951416, + "step": 724 + }, + { + "epoch": 0.62, + "grad_norm": 45.66387754211882, + "learning_rate": 7.525286270460969e-07, + "logits/chosen": -1.1713275909423828, + "logits/rejected": -1.1240694522857666, + "logps/chosen": -584.0823974609375, + "logps/rejected": -670.561767578125, + "loss": 0.1905, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.169159173965454, + "rewards/margins": 2.4920945167541504, + "rewards/rejected": -3.6612532138824463, + "step": 725 + }, + { + "epoch": 0.62, + "grad_norm": 59.89071668488572, + "learning_rate": 7.496280499885266e-07, + "logits/chosen": -1.136541724205017, + "logits/rejected": -1.0874381065368652, + "logps/chosen": -488.20233154296875, + "logps/rejected": -660.3988647460938, + "loss": 0.3706, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1385947465896606, + "rewards/margins": 1.85256826877594, + "rewards/rejected": -2.9911630153656006, + "step": 726 + }, + { + "epoch": 0.62, + "grad_norm": 50.81307150074276, + "learning_rate": 7.467297185398323e-07, + "logits/chosen": -1.124204158782959, + "logits/rejected": -1.1052558422088623, + "logps/chosen": -549.4209594726562, + "logps/rejected": -717.3243408203125, + "loss": 0.2483, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2831385135650635, + "rewards/margins": 2.285860776901245, + "rewards/rejected": -3.5689992904663086, + "step": 727 + }, + { + "epoch": 0.62, + "grad_norm": 62.236229962423614, + "learning_rate": 7.43833658695413e-07, + "logits/chosen": -1.1308350563049316, + "logits/rejected": -1.0671460628509521, + "logps/chosen": -371.38238525390625, + "logps/rejected": -560.85205078125, + "loss": 0.3585, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.947990357875824, + "rewards/margins": 1.855220079421997, + "rewards/rejected": -2.803210496902466, + "step": 728 + }, + { + "epoch": 0.63, + "grad_norm": 52.450467055842964, + "learning_rate": 7.409398964302946e-07, + "logits/chosen": -1.165910243988037, + "logits/rejected": -1.0663230419158936, + "logps/chosen": -549.573974609375, + "logps/rejected": -1081.66064453125, + "loss": 0.2114, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.556742548942566, + "rewards/margins": 3.4699325561523438, + "rewards/rejected": -5.026675224304199, + "step": 729 + }, + { + "epoch": 0.63, + "grad_norm": 83.48049695338477, + "learning_rate": 7.380484576988948e-07, + "logits/chosen": -1.1759792566299438, + "logits/rejected": -1.124603271484375, + "logps/chosen": -603.390625, + "logps/rejected": -697.0489501953125, + "loss": 0.6446, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.043789267539978, + "rewards/margins": 1.9236034154891968, + "rewards/rejected": -2.9673924446105957, + "step": 730 + }, + { + "epoch": 0.63, + "grad_norm": 25.114247133832123, + "learning_rate": 7.351593684347908e-07, + "logits/chosen": -1.200255274772644, + "logits/rejected": -1.1304539442062378, + "logps/chosen": -431.82489013671875, + "logps/rejected": -722.521728515625, + "loss": 0.1204, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.025187611579895, + "rewards/margins": 2.7052230834960938, + "rewards/rejected": -3.730410575866699, + "step": 731 + }, + { + "epoch": 0.63, + "grad_norm": 52.31421975307311, + "learning_rate": 7.322726545504889e-07, + "logits/chosen": -1.172715425491333, + "logits/rejected": -1.12477445602417, + "logps/chosen": -370.84576416015625, + "logps/rejected": -681.51318359375, + "loss": 0.2574, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6131293773651123, + "rewards/margins": 2.7430875301361084, + "rewards/rejected": -3.3562169075012207, + "step": 732 + }, + { + "epoch": 0.63, + "grad_norm": 78.43657110280873, + "learning_rate": 7.293883419371892e-07, + "logits/chosen": -1.1670312881469727, + "logits/rejected": -1.1011550426483154, + "logps/chosen": -411.6805419921875, + "logps/rejected": -600.664306640625, + "loss": 0.5422, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9437257051467896, + "rewards/margins": 2.7335152626037598, + "rewards/rejected": -3.677241086959839, + "step": 733 + }, + { + "epoch": 0.63, + "grad_norm": 87.72015513512574, + "learning_rate": 7.265064564645544e-07, + "logits/chosen": -1.233947992324829, + "logits/rejected": -1.1877312660217285, + "logps/chosen": -538.8570556640625, + "logps/rejected": -600.8184814453125, + "loss": 0.4649, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5215339660644531, + "rewards/margins": 1.1645421981811523, + "rewards/rejected": -2.6860761642456055, + "step": 734 + }, + { + "epoch": 0.63, + "grad_norm": 38.87597940211733, + "learning_rate": 7.236270239804791e-07, + "logits/chosen": -1.1780080795288086, + "logits/rejected": -1.12031888961792, + "logps/chosen": -474.943603515625, + "logps/rejected": -638.14013671875, + "loss": 0.1884, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0575810670852661, + "rewards/margins": 2.20149564743042, + "rewards/rejected": -3.2590765953063965, + "step": 735 + }, + { + "epoch": 0.63, + "grad_norm": 64.52242884909481, + "learning_rate": 7.207500703108556e-07, + "logits/chosen": -1.1174970865249634, + "logits/rejected": -1.0595775842666626, + "logps/chosen": -523.7102661132812, + "logps/rejected": -703.5924072265625, + "loss": 0.3152, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0679781436920166, + "rewards/margins": 2.1514697074890137, + "rewards/rejected": -3.2194480895996094, + "step": 736 + }, + { + "epoch": 0.63, + "grad_norm": 55.6938214020685, + "learning_rate": 7.178756212593442e-07, + "logits/chosen": -1.1437046527862549, + "logits/rejected": -1.09684419631958, + "logps/chosen": -481.9346923828125, + "logps/rejected": -824.975341796875, + "loss": 0.2046, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9986522197723389, + "rewards/margins": 2.441328525543213, + "rewards/rejected": -3.4399807453155518, + "step": 737 + }, + { + "epoch": 0.63, + "grad_norm": 105.47202970842274, + "learning_rate": 7.150037026071404e-07, + "logits/chosen": -1.2130227088928223, + "logits/rejected": -1.1825370788574219, + "logps/chosen": -605.1729736328125, + "logps/rejected": -633.9487915039062, + "loss": 0.8505, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5780174732208252, + "rewards/margins": 0.7051810026168823, + "rewards/rejected": -2.283198356628418, + "step": 738 + }, + { + "epoch": 0.63, + "grad_norm": 52.61984772985579, + "learning_rate": 7.121343401127456e-07, + "logits/chosen": -1.1280416250228882, + "logits/rejected": -1.0790488719940186, + "logps/chosen": -385.1922607421875, + "logps/rejected": -701.3255615234375, + "loss": 0.3356, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5919640064239502, + "rewards/margins": 2.0131258964538574, + "rewards/rejected": -2.6050901412963867, + "step": 739 + }, + { + "epoch": 0.63, + "grad_norm": 51.35346020631582, + "learning_rate": 7.092675595117332e-07, + "logits/chosen": -1.1492226123809814, + "logits/rejected": -1.0986218452453613, + "logps/chosen": -395.18731689453125, + "logps/rejected": -550.7103271484375, + "loss": 0.3553, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6217691898345947, + "rewards/margins": 1.554800033569336, + "rewards/rejected": -2.1765689849853516, + "step": 740 + }, + { + "epoch": 0.64, + "grad_norm": 70.21040596452477, + "learning_rate": 7.064033865165203e-07, + "logits/chosen": -1.1506271362304688, + "logits/rejected": -1.121349811553955, + "logps/chosen": -496.9982604980469, + "logps/rejected": -625.1170654296875, + "loss": 0.4375, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0170750617980957, + "rewards/margins": 1.751772403717041, + "rewards/rejected": -2.7688472270965576, + "step": 741 + }, + { + "epoch": 0.64, + "grad_norm": 68.89351576725261, + "learning_rate": 7.035418468161365e-07, + "logits/chosen": -1.2487106323242188, + "logits/rejected": -1.219386100769043, + "logps/chosen": -414.99365234375, + "logps/rejected": -643.078369140625, + "loss": 0.3391, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8841046690940857, + "rewards/margins": 1.9309461116790771, + "rewards/rejected": -2.8150508403778076, + "step": 742 + }, + { + "epoch": 0.64, + "grad_norm": 28.199879890615307, + "learning_rate": 7.006829660759923e-07, + "logits/chosen": -1.115299940109253, + "logits/rejected": -1.07859468460083, + "logps/chosen": -562.867431640625, + "logps/rejected": -755.9138793945312, + "loss": 0.1586, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8423274159431458, + "rewards/margins": 2.3967549800872803, + "rewards/rejected": -3.2390823364257812, + "step": 743 + }, + { + "epoch": 0.64, + "grad_norm": 73.92393577003973, + "learning_rate": 6.978267699376493e-07, + "logits/chosen": -1.1400046348571777, + "logits/rejected": -1.1184709072113037, + "logps/chosen": -516.0626831054688, + "logps/rejected": -608.6300048828125, + "loss": 0.527, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7962015271186829, + "rewards/margins": 1.346013069152832, + "rewards/rejected": -2.142214775085449, + "step": 744 + }, + { + "epoch": 0.64, + "grad_norm": 48.84711403391857, + "learning_rate": 6.949732840185925e-07, + "logits/chosen": -1.183213710784912, + "logits/rejected": -1.1628715991973877, + "logps/chosen": -407.721923828125, + "logps/rejected": -440.9341735839844, + "loss": 0.2901, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6022155284881592, + "rewards/margins": 1.7358124256134033, + "rewards/rejected": -2.3380279541015625, + "step": 745 + }, + { + "epoch": 0.64, + "grad_norm": 70.5674991574398, + "learning_rate": 6.921225339119971e-07, + "logits/chosen": -1.167476773262024, + "logits/rejected": -1.1096925735473633, + "logps/chosen": -490.8918151855469, + "logps/rejected": -707.9902954101562, + "loss": 0.2934, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7327123880386353, + "rewards/margins": 2.283106803894043, + "rewards/rejected": -3.015819549560547, + "step": 746 + }, + { + "epoch": 0.64, + "grad_norm": 47.13349865460932, + "learning_rate": 6.892745451865008e-07, + "logits/chosen": -1.259293556213379, + "logits/rejected": -1.201749563217163, + "logps/chosen": -394.35711669921875, + "logps/rejected": -533.4117431640625, + "loss": 0.372, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6959115266799927, + "rewards/margins": 1.4656853675842285, + "rewards/rejected": -2.1615970134735107, + "step": 747 + }, + { + "epoch": 0.64, + "grad_norm": 27.819387134703216, + "learning_rate": 6.86429343385975e-07, + "logits/chosen": -1.2313560247421265, + "logits/rejected": -1.2034997940063477, + "logps/chosen": -436.7022705078125, + "logps/rejected": -587.44140625, + "loss": 0.285, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0417014360427856, + "rewards/margins": 2.1582226753234863, + "rewards/rejected": -3.1999239921569824, + "step": 748 + }, + { + "epoch": 0.64, + "grad_norm": 42.357745921717495, + "learning_rate": 6.835869540292942e-07, + "logits/chosen": -1.187593698501587, + "logits/rejected": -1.2094995975494385, + "logps/chosen": -465.0749816894531, + "logps/rejected": -561.1448364257812, + "loss": 0.2746, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.540374755859375, + "rewards/margins": 2.182948112487793, + "rewards/rejected": -2.723323106765747, + "step": 749 + }, + { + "epoch": 0.64, + "grad_norm": 32.360209468440345, + "learning_rate": 6.807474026101078e-07, + "logits/chosen": -1.2537109851837158, + "logits/rejected": -1.1764297485351562, + "logps/chosen": -573.1024780273438, + "logps/rejected": -926.65625, + "loss": 0.2464, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1264824867248535, + "rewards/margins": 2.9104440212249756, + "rewards/rejected": -4.03692626953125, + "step": 750 + }, + { + "epoch": 0.64, + "grad_norm": 21.1459484931328, + "learning_rate": 6.779107145966121e-07, + "logits/chosen": -1.2650989294052124, + "logits/rejected": -1.208359718322754, + "logps/chosen": -451.5343017578125, + "logps/rejected": -670.3783569335938, + "loss": 0.193, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0734288692474365, + "rewards/margins": 2.3972225189208984, + "rewards/rejected": -3.470651626586914, + "step": 751 + }, + { + "epoch": 0.64, + "grad_norm": 62.5659394844564, + "learning_rate": 6.750769154313205e-07, + "logits/chosen": -1.2595267295837402, + "logits/rejected": -1.2201892137527466, + "logps/chosen": -526.067138671875, + "logps/rejected": -715.4703369140625, + "loss": 0.3608, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2677977085113525, + "rewards/margins": 1.8319288492202759, + "rewards/rejected": -3.099726676940918, + "step": 752 + }, + { + "epoch": 0.65, + "grad_norm": 38.93281498713387, + "learning_rate": 6.722460305308367e-07, + "logits/chosen": -1.213770866394043, + "logits/rejected": -1.1902894973754883, + "logps/chosen": -570.1568603515625, + "logps/rejected": -643.709228515625, + "loss": 0.254, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2748959064483643, + "rewards/margins": 1.9360785484313965, + "rewards/rejected": -3.21097469329834, + "step": 753 + }, + { + "epoch": 0.65, + "grad_norm": 54.254456805570406, + "learning_rate": 6.694180852856253e-07, + "logits/chosen": -1.1973718404769897, + "logits/rejected": -1.1430859565734863, + "logps/chosen": -592.2709350585938, + "logps/rejected": -756.05615234375, + "loss": 0.243, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6226004362106323, + "rewards/margins": 2.1562705039978027, + "rewards/rejected": -3.7788708209991455, + "step": 754 + }, + { + "epoch": 0.65, + "grad_norm": 106.8955122637892, + "learning_rate": 6.665931050597859e-07, + "logits/chosen": -1.2209007740020752, + "logits/rejected": -1.1845306158065796, + "logps/chosen": -610.568359375, + "logps/rejected": -641.6486206054688, + "loss": 0.7744, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4062316417694092, + "rewards/margins": 1.0251706838607788, + "rewards/rejected": -2.4314022064208984, + "step": 755 + }, + { + "epoch": 0.65, + "grad_norm": 71.75036324422791, + "learning_rate": 6.637711151908239e-07, + "logits/chosen": -1.2332406044006348, + "logits/rejected": -1.163503646850586, + "logps/chosen": -399.26348876953125, + "logps/rejected": -641.740478515625, + "loss": 0.4403, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0247321128845215, + "rewards/margins": 2.3649094104766846, + "rewards/rejected": -3.389641523361206, + "step": 756 + }, + { + "epoch": 0.65, + "grad_norm": 55.89352128487458, + "learning_rate": 6.609521409894237e-07, + "logits/chosen": -1.2482631206512451, + "logits/rejected": -1.1869556903839111, + "logps/chosen": -506.63739013671875, + "logps/rejected": -672.4251708984375, + "loss": 0.1912, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8384074568748474, + "rewards/margins": 2.9175162315368652, + "rewards/rejected": -3.7559235095977783, + "step": 757 + }, + { + "epoch": 0.65, + "grad_norm": 36.236798796376284, + "learning_rate": 6.58136207739223e-07, + "logits/chosen": -1.2353761196136475, + "logits/rejected": -1.1979963779449463, + "logps/chosen": -294.13189697265625, + "logps/rejected": -533.3206787109375, + "loss": 0.2674, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5782788395881653, + "rewards/margins": 2.0374372005462646, + "rewards/rejected": -2.615715980529785, + "step": 758 + }, + { + "epoch": 0.65, + "grad_norm": 49.56045920531742, + "learning_rate": 6.553233406965834e-07, + "logits/chosen": -1.2530863285064697, + "logits/rejected": -1.1859345436096191, + "logps/chosen": -343.58172607421875, + "logps/rejected": -638.3863525390625, + "loss": 0.3254, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8800230026245117, + "rewards/margins": 2.761399745941162, + "rewards/rejected": -3.6414225101470947, + "step": 759 + }, + { + "epoch": 0.65, + "grad_norm": 78.49067555425536, + "learning_rate": 6.525135650903666e-07, + "logits/chosen": -1.1959669589996338, + "logits/rejected": -1.142458200454712, + "logps/chosen": -699.4346313476562, + "logps/rejected": -895.7913818359375, + "loss": 0.3717, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4892971515655518, + "rewards/margins": 2.3924736976623535, + "rewards/rejected": -3.8817708492279053, + "step": 760 + }, + { + "epoch": 0.65, + "grad_norm": 80.9773466833384, + "learning_rate": 6.497069061217064e-07, + "logits/chosen": -1.2011051177978516, + "logits/rejected": -1.1678533554077148, + "logps/chosen": -499.91290283203125, + "logps/rejected": -533.1343383789062, + "loss": 0.6389, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1470344066619873, + "rewards/margins": 1.2944519519805908, + "rewards/rejected": -2.441486358642578, + "step": 761 + }, + { + "epoch": 0.65, + "grad_norm": 71.52760626195689, + "learning_rate": 6.469033889637837e-07, + "logits/chosen": -1.20949387550354, + "logits/rejected": -1.177109956741333, + "logps/chosen": -533.7897338867188, + "logps/rejected": -590.6302490234375, + "loss": 0.3683, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.932750940322876, + "rewards/margins": 2.078059196472168, + "rewards/rejected": -3.010810136795044, + "step": 762 + }, + { + "epoch": 0.65, + "grad_norm": 40.547912487993926, + "learning_rate": 6.44103038761599e-07, + "logits/chosen": -1.2746690511703491, + "logits/rejected": -1.2152957916259766, + "logps/chosen": -429.7515869140625, + "logps/rejected": -710.8656616210938, + "loss": 0.2478, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9441927075386047, + "rewards/margins": 2.390007495880127, + "rewards/rejected": -3.334199905395508, + "step": 763 + }, + { + "epoch": 0.66, + "grad_norm": 58.98554065072182, + "learning_rate": 6.413058806317495e-07, + "logits/chosen": -1.2609431743621826, + "logits/rejected": -1.2304378747940063, + "logps/chosen": -449.83343505859375, + "logps/rejected": -550.3903198242188, + "loss": 0.268, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0245052576065063, + "rewards/margins": 2.3938255310058594, + "rewards/rejected": -3.4183309078216553, + "step": 764 + }, + { + "epoch": 0.66, + "grad_norm": 20.32757958785188, + "learning_rate": 6.385119396622021e-07, + "logits/chosen": -1.2361825704574585, + "logits/rejected": -1.1875343322753906, + "logps/chosen": -436.0609130859375, + "logps/rejected": -726.2998046875, + "loss": 0.13, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5165493488311768, + "rewards/margins": 3.3205623626708984, + "rewards/rejected": -3.8371119499206543, + "step": 765 + }, + { + "epoch": 0.66, + "grad_norm": 40.364014159059444, + "learning_rate": 6.357212409120678e-07, + "logits/chosen": -1.2350900173187256, + "logits/rejected": -1.186276912689209, + "logps/chosen": -407.8211669921875, + "logps/rejected": -741.8107299804688, + "loss": 0.1741, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8670476078987122, + "rewards/margins": 3.295076370239258, + "rewards/rejected": -4.162123680114746, + "step": 766 + }, + { + "epoch": 0.66, + "grad_norm": 16.2455480762992, + "learning_rate": 6.329338094113784e-07, + "logits/chosen": -1.2227864265441895, + "logits/rejected": -1.1488749980926514, + "logps/chosen": -572.5362548828125, + "logps/rejected": -886.8587036132812, + "loss": 0.0625, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9070994853973389, + "rewards/margins": 3.629757881164551, + "rewards/rejected": -4.536857604980469, + "step": 767 + }, + { + "epoch": 0.66, + "grad_norm": 23.28049560328349, + "learning_rate": 6.301496701608619e-07, + "logits/chosen": -1.2397854328155518, + "logits/rejected": -1.1634564399719238, + "logps/chosen": -572.800048828125, + "logps/rejected": -759.6251220703125, + "loss": 0.1429, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8949357271194458, + "rewards/margins": 2.677290439605713, + "rewards/rejected": -3.5722262859344482, + "step": 768 + }, + { + "epoch": 0.66, + "grad_norm": 78.84035838825896, + "learning_rate": 6.273688481317174e-07, + "logits/chosen": -1.2053560018539429, + "logits/rejected": -1.1527836322784424, + "logps/chosen": -603.7992553710938, + "logps/rejected": -744.5968017578125, + "loss": 0.447, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0674078464508057, + "rewards/margins": 2.0926146507263184, + "rewards/rejected": -3.160022497177124, + "step": 769 + }, + { + "epoch": 0.66, + "grad_norm": 51.9242687714073, + "learning_rate": 6.245913682653911e-07, + "logits/chosen": -1.2491703033447266, + "logits/rejected": -1.2077652215957642, + "logps/chosen": -421.03265380859375, + "logps/rejected": -591.1710205078125, + "loss": 0.256, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1425645351409912, + "rewards/margins": 2.417445182800293, + "rewards/rejected": -3.560009717941284, + "step": 770 + }, + { + "epoch": 0.66, + "grad_norm": 40.60942224554684, + "learning_rate": 6.218172554733542e-07, + "logits/chosen": -1.2798171043395996, + "logits/rejected": -1.2115535736083984, + "logps/chosen": -453.9419250488281, + "logps/rejected": -695.1771240234375, + "loss": 0.2188, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0825917720794678, + "rewards/margins": 2.792531967163086, + "rewards/rejected": -3.8751237392425537, + "step": 771 + }, + { + "epoch": 0.66, + "grad_norm": 79.29510477081502, + "learning_rate": 6.190465346368769e-07, + "logits/chosen": -1.2755049467086792, + "logits/rejected": -1.2568185329437256, + "logps/chosen": -473.3320007324219, + "logps/rejected": -498.23431396484375, + "loss": 0.4651, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2007759809494019, + "rewards/margins": 1.0075633525848389, + "rewards/rejected": -2.208339214324951, + "step": 772 + }, + { + "epoch": 0.66, + "grad_norm": 69.26698239236194, + "learning_rate": 6.162792306068074e-07, + "logits/chosen": -1.2635630369186401, + "logits/rejected": -1.1861530542373657, + "logps/chosen": -557.6167602539062, + "logps/rejected": -946.214599609375, + "loss": 0.253, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.246105670928955, + "rewards/margins": 3.4181196689605713, + "rewards/rejected": -4.6642255783081055, + "step": 773 + }, + { + "epoch": 0.66, + "grad_norm": 44.00865651354992, + "learning_rate": 6.135153682033488e-07, + "logits/chosen": -1.2853941917419434, + "logits/rejected": -1.1902930736541748, + "logps/chosen": -329.769287109375, + "logps/rejected": -618.7449951171875, + "loss": 0.3086, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.47864830493927, + "rewards/margins": 2.291872024536133, + "rewards/rejected": -2.7705204486846924, + "step": 774 + }, + { + "epoch": 0.66, + "grad_norm": 50.05200409581262, + "learning_rate": 6.107549722158347e-07, + "logits/chosen": -1.21261727809906, + "logits/rejected": -1.1689696311950684, + "logps/chosen": -507.6671142578125, + "logps/rejected": -783.21337890625, + "loss": 0.2574, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6551064848899841, + "rewards/margins": 1.705780029296875, + "rewards/rejected": -2.360886573791504, + "step": 775 + }, + { + "epoch": 0.67, + "grad_norm": 63.05478198929676, + "learning_rate": 6.079980674025085e-07, + "logits/chosen": -1.1971309185028076, + "logits/rejected": -1.1465520858764648, + "logps/chosen": -626.6431274414062, + "logps/rejected": -827.80908203125, + "loss": 0.3393, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7310771942138672, + "rewards/margins": 1.9147140979766846, + "rewards/rejected": -2.645791530609131, + "step": 776 + }, + { + "epoch": 0.67, + "grad_norm": 66.27914639678409, + "learning_rate": 6.052446784903021e-07, + "logits/chosen": -1.1706092357635498, + "logits/rejected": -1.1214057207107544, + "logps/chosen": -586.0625, + "logps/rejected": -695.3425903320312, + "loss": 0.326, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.38804376125335693, + "rewards/margins": 1.602970004081726, + "rewards/rejected": -1.991013765335083, + "step": 777 + }, + { + "epoch": 0.67, + "grad_norm": 62.401850509194375, + "learning_rate": 6.024948301746111e-07, + "logits/chosen": -1.234520673751831, + "logits/rejected": -1.1878061294555664, + "logps/chosen": -542.1024169921875, + "logps/rejected": -711.7498779296875, + "loss": 0.3367, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5028198957443237, + "rewards/margins": 1.2911505699157715, + "rewards/rejected": -1.7939705848693848, + "step": 778 + }, + { + "epoch": 0.67, + "grad_norm": 91.22663071211362, + "learning_rate": 5.997485471190764e-07, + "logits/chosen": -1.1923151016235352, + "logits/rejected": -1.1633234024047852, + "logps/chosen": -523.5685424804688, + "logps/rejected": -662.0013427734375, + "loss": 0.5362, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9992001056671143, + "rewards/margins": 0.7236025333404541, + "rewards/rejected": -1.7228026390075684, + "step": 779 + }, + { + "epoch": 0.67, + "grad_norm": 61.3655202551869, + "learning_rate": 5.970058539553613e-07, + "logits/chosen": -1.2179090976715088, + "logits/rejected": -1.1775586605072021, + "logps/chosen": -623.544921875, + "logps/rejected": -763.564697265625, + "loss": 0.2909, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5548088550567627, + "rewards/margins": 1.5996613502502441, + "rewards/rejected": -2.154470205307007, + "step": 780 + }, + { + "epoch": 0.67, + "grad_norm": 70.28615878748617, + "learning_rate": 5.942667752829317e-07, + "logits/chosen": -1.218387246131897, + "logits/rejected": -1.1726099252700806, + "logps/chosen": -542.6781005859375, + "logps/rejected": -712.7110595703125, + "loss": 0.4253, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7333188056945801, + "rewards/margins": 1.2428584098815918, + "rewards/rejected": -1.9761772155761719, + "step": 781 + }, + { + "epoch": 0.67, + "grad_norm": 68.19048994228498, + "learning_rate": 5.915313356688339e-07, + "logits/chosen": -1.2339286804199219, + "logits/rejected": -1.1851835250854492, + "logps/chosen": -525.4983520507812, + "logps/rejected": -790.62353515625, + "loss": 0.3123, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9128273725509644, + "rewards/margins": 1.8884153366088867, + "rewards/rejected": -2.8012425899505615, + "step": 782 + }, + { + "epoch": 0.67, + "grad_norm": 57.37838326558948, + "learning_rate": 5.887995596474748e-07, + "logits/chosen": -1.2295994758605957, + "logits/rejected": -1.1778643131256104, + "logps/chosen": -520.6300048828125, + "logps/rejected": -704.8819580078125, + "loss": 0.3803, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1053333282470703, + "rewards/margins": 1.898297905921936, + "rewards/rejected": -3.003631114959717, + "step": 783 + }, + { + "epoch": 0.67, + "grad_norm": 37.27334657073891, + "learning_rate": 5.86071471720404e-07, + "logits/chosen": -1.2449480295181274, + "logits/rejected": -1.2495803833007812, + "logps/chosen": -420.33740234375, + "logps/rejected": -505.73748779296875, + "loss": 0.2564, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5788464546203613, + "rewards/margins": 1.8256258964538574, + "rewards/rejected": -2.4044723510742188, + "step": 784 + }, + { + "epoch": 0.67, + "grad_norm": 55.97231080048403, + "learning_rate": 5.83347096356091e-07, + "logits/chosen": -1.3385541439056396, + "logits/rejected": -1.30124032497406, + "logps/chosen": -448.4632263183594, + "logps/rejected": -502.9323425292969, + "loss": 0.3686, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.049405813217163, + "rewards/margins": 1.4481611251831055, + "rewards/rejected": -2.4975671768188477, + "step": 785 + }, + { + "epoch": 0.67, + "grad_norm": 55.72959258812196, + "learning_rate": 5.806264579897063e-07, + "logits/chosen": -1.302438735961914, + "logits/rejected": -1.2615262269973755, + "logps/chosen": -472.30706787109375, + "logps/rejected": -655.6341552734375, + "loss": 0.3095, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8192142248153687, + "rewards/margins": 2.3826041221618652, + "rewards/rejected": -3.2018184661865234, + "step": 786 + }, + { + "epoch": 0.67, + "grad_norm": 26.834919776565844, + "learning_rate": 5.779095810229051e-07, + "logits/chosen": -1.1845002174377441, + "logits/rejected": -1.153458595275879, + "logps/chosen": -582.74365234375, + "logps/rejected": -814.0584106445312, + "loss": 0.0881, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0058015584945679, + "rewards/margins": 3.220125675201416, + "rewards/rejected": -4.225927352905273, + "step": 787 + }, + { + "epoch": 0.68, + "grad_norm": 64.87831875624222, + "learning_rate": 5.751964898236039e-07, + "logits/chosen": -1.2975207567214966, + "logits/rejected": -1.2309163808822632, + "logps/chosen": -587.638671875, + "logps/rejected": -691.3016357421875, + "loss": 0.4853, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2252812385559082, + "rewards/margins": 1.7551825046539307, + "rewards/rejected": -2.980463743209839, + "step": 788 + }, + { + "epoch": 0.68, + "grad_norm": 54.54179421648986, + "learning_rate": 5.724872087257656e-07, + "logits/chosen": -1.2868032455444336, + "logits/rejected": -1.2407780885696411, + "logps/chosen": -579.31005859375, + "logps/rejected": -694.39599609375, + "loss": 0.2304, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.182490587234497, + "rewards/margins": 2.7682316303253174, + "rewards/rejected": -3.9507226943969727, + "step": 789 + }, + { + "epoch": 0.68, + "grad_norm": 51.9322318937397, + "learning_rate": 5.697817620291798e-07, + "logits/chosen": -1.3190757036209106, + "logits/rejected": -1.2874265909194946, + "logps/chosen": -441.6192626953125, + "logps/rejected": -552.3643798828125, + "loss": 0.2837, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0770200490951538, + "rewards/margins": 2.20762300491333, + "rewards/rejected": -3.2846431732177734, + "step": 790 + }, + { + "epoch": 0.68, + "grad_norm": 33.304444234616746, + "learning_rate": 5.670801739992448e-07, + "logits/chosen": -1.2871630191802979, + "logits/rejected": -1.2591633796691895, + "logps/chosen": -426.33038330078125, + "logps/rejected": -657.7274169921875, + "loss": 0.219, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49022212624549866, + "rewards/margins": 2.8954200744628906, + "rewards/rejected": -3.3856422901153564, + "step": 791 + }, + { + "epoch": 0.68, + "grad_norm": 77.92285907029817, + "learning_rate": 5.643824688667505e-07, + "logits/chosen": -1.277097463607788, + "logits/rejected": -1.2177878618240356, + "logps/chosen": -528.1997680664062, + "logps/rejected": -846.9409790039062, + "loss": 0.3319, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0698024034500122, + "rewards/margins": 3.044430732727051, + "rewards/rejected": -4.114233016967773, + "step": 792 + }, + { + "epoch": 0.68, + "grad_norm": 63.01592387354561, + "learning_rate": 5.616886708276603e-07, + "logits/chosen": -1.3257449865341187, + "logits/rejected": -1.2598521709442139, + "logps/chosen": -580.3109130859375, + "logps/rejected": -764.5173950195312, + "loss": 0.2923, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9693795442581177, + "rewards/margins": 2.5392367839813232, + "rewards/rejected": -3.5086162090301514, + "step": 793 + }, + { + "epoch": 0.68, + "grad_norm": 88.32330940362355, + "learning_rate": 5.589988040428946e-07, + "logits/chosen": -1.2941094636917114, + "logits/rejected": -1.233504295349121, + "logps/chosen": -569.9989624023438, + "logps/rejected": -704.6729736328125, + "loss": 0.4141, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2531591653823853, + "rewards/margins": 2.2078423500061035, + "rewards/rejected": -3.4610016345977783, + "step": 794 + }, + { + "epoch": 0.68, + "grad_norm": 24.135022838491736, + "learning_rate": 5.563128926381149e-07, + "logits/chosen": -1.2901718616485596, + "logits/rejected": -1.1893861293792725, + "logps/chosen": -485.58612060546875, + "logps/rejected": -867.0072021484375, + "loss": 0.1402, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3576972484588623, + "rewards/margins": 3.1181297302246094, + "rewards/rejected": -4.475826740264893, + "step": 795 + }, + { + "epoch": 0.68, + "grad_norm": 49.316205154733275, + "learning_rate": 5.536309607035042e-07, + "logits/chosen": -1.315154790878296, + "logits/rejected": -1.2445651292800903, + "logps/chosen": -363.44677734375, + "logps/rejected": -587.053955078125, + "loss": 0.2735, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1718934774398804, + "rewards/margins": 2.271083116531372, + "rewards/rejected": -3.442976713180542, + "step": 796 + }, + { + "epoch": 0.68, + "grad_norm": 114.13517381128635, + "learning_rate": 5.509530322935564e-07, + "logits/chosen": -1.2886552810668945, + "logits/rejected": -1.2342393398284912, + "logps/chosen": -513.4869384765625, + "logps/rejected": -628.4658203125, + "loss": 0.6908, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5586981773376465, + "rewards/margins": 1.7595255374908447, + "rewards/rejected": -3.318223476409912, + "step": 797 + }, + { + "epoch": 0.68, + "grad_norm": 51.628883615524664, + "learning_rate": 5.482791314268558e-07, + "logits/chosen": -1.3084138631820679, + "logits/rejected": -1.249441385269165, + "logps/chosen": -418.89837646484375, + "logps/rejected": -603.8793334960938, + "loss": 0.2867, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.818135142326355, + "rewards/margins": 2.3880748748779297, + "rewards/rejected": -3.206209659576416, + "step": 798 + }, + { + "epoch": 0.69, + "grad_norm": 99.95898090957525, + "learning_rate": 5.456092820858619e-07, + "logits/chosen": -1.317805528640747, + "logits/rejected": -1.304664969444275, + "logps/chosen": -563.1351318359375, + "logps/rejected": -533.6165161132812, + "loss": 0.7847, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.563063383102417, + "rewards/margins": 1.5004191398620605, + "rewards/rejected": -3.0634822845458984, + "step": 799 + }, + { + "epoch": 0.69, + "grad_norm": 123.1394399884689, + "learning_rate": 5.429435082166991e-07, + "logits/chosen": -1.294856309890747, + "logits/rejected": -1.2487471103668213, + "logps/chosen": -667.078125, + "logps/rejected": -792.073486328125, + "loss": 0.3631, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8730590343475342, + "rewards/margins": 1.7214754819869995, + "rewards/rejected": -3.5945346355438232, + "step": 800 + }, + { + "epoch": 0.69, + "grad_norm": 32.95676078668431, + "learning_rate": 5.402818337289352e-07, + "logits/chosen": -1.2985724210739136, + "logits/rejected": -1.234189748764038, + "logps/chosen": -457.0745849609375, + "logps/rejected": -772.1397705078125, + "loss": 0.2344, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9183647632598877, + "rewards/margins": 3.1553783416748047, + "rewards/rejected": -4.073742866516113, + "step": 801 + }, + { + "epoch": 0.69, + "grad_norm": 30.11951749501376, + "learning_rate": 5.376242824953718e-07, + "logits/chosen": -1.3676421642303467, + "logits/rejected": -1.3023195266723633, + "logps/chosen": -296.14837646484375, + "logps/rejected": -550.3222045898438, + "loss": 0.2515, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8110196590423584, + "rewards/margins": 2.414071798324585, + "rewards/rejected": -3.2250914573669434, + "step": 802 + }, + { + "epoch": 0.69, + "grad_norm": 37.1130777897999, + "learning_rate": 5.349708783518296e-07, + "logits/chosen": -1.3127175569534302, + "logits/rejected": -1.2090966701507568, + "logps/chosen": -555.64892578125, + "logps/rejected": -886.0833740234375, + "loss": 0.1507, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.431624412536621, + "rewards/margins": 2.953738212585449, + "rewards/rejected": -4.38536262512207, + "step": 803 + }, + { + "epoch": 0.69, + "grad_norm": 135.12316330777904, + "learning_rate": 5.323216450969315e-07, + "logits/chosen": -1.256744384765625, + "logits/rejected": -1.267289400100708, + "logps/chosen": -616.8837280273438, + "logps/rejected": -545.8270874023438, + "loss": 1.0152, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1458804607391357, + "rewards/margins": 0.6963490843772888, + "rewards/rejected": -2.8422296047210693, + "step": 804 + }, + { + "epoch": 0.69, + "grad_norm": 45.54737697343092, + "learning_rate": 5.296766064918928e-07, + "logits/chosen": -1.308955192565918, + "logits/rejected": -1.2631933689117432, + "logps/chosen": -527.814208984375, + "logps/rejected": -647.0279541015625, + "loss": 0.2665, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3599857091903687, + "rewards/margins": 1.8339883089065552, + "rewards/rejected": -3.193974018096924, + "step": 805 + }, + { + "epoch": 0.69, + "grad_norm": 54.18978295900378, + "learning_rate": 5.270357862603061e-07, + "logits/chosen": -1.233064889907837, + "logits/rejected": -1.1992673873901367, + "logps/chosen": -444.14178466796875, + "logps/rejected": -639.1141357421875, + "loss": 0.2421, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0077587366104126, + "rewards/margins": 2.1859450340270996, + "rewards/rejected": -3.1937036514282227, + "step": 806 + }, + { + "epoch": 0.69, + "grad_norm": 39.59007654916537, + "learning_rate": 5.243992080879291e-07, + "logits/chosen": -1.2696118354797363, + "logits/rejected": -1.2163591384887695, + "logps/chosen": -522.70751953125, + "logps/rejected": -730.43896484375, + "loss": 0.2281, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2305470705032349, + "rewards/margins": 2.6388511657714844, + "rewards/rejected": -3.869398593902588, + "step": 807 + }, + { + "epoch": 0.69, + "grad_norm": 26.841044346191513, + "learning_rate": 5.217668956224724e-07, + "logits/chosen": -1.2543085813522339, + "logits/rejected": -1.1556754112243652, + "logps/chosen": -520.13916015625, + "logps/rejected": -780.648681640625, + "loss": 0.1559, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1710164546966553, + "rewards/margins": 3.7417702674865723, + "rewards/rejected": -4.912786483764648, + "step": 808 + }, + { + "epoch": 0.69, + "grad_norm": 62.724830798722856, + "learning_rate": 5.191388724733866e-07, + "logits/chosen": -1.2515757083892822, + "logits/rejected": -1.2258267402648926, + "logps/chosen": -633.2823486328125, + "logps/rejected": -840.8907470703125, + "loss": 0.2973, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.084636926651001, + "rewards/margins": 3.320814847946167, + "rewards/rejected": -4.40545129776001, + "step": 809 + }, + { + "epoch": 0.69, + "grad_norm": 30.28960633435741, + "learning_rate": 5.165151622116513e-07, + "logits/chosen": -1.3251980543136597, + "logits/rejected": -1.2799733877182007, + "logps/chosen": -366.42340087890625, + "logps/rejected": -616.395751953125, + "loss": 0.3236, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1137187480926514, + "rewards/margins": 2.0565237998962402, + "rewards/rejected": -3.1702423095703125, + "step": 810 + }, + { + "epoch": 0.7, + "grad_norm": 58.48126129769298, + "learning_rate": 5.138957883695636e-07, + "logits/chosen": -1.3073029518127441, + "logits/rejected": -1.258333683013916, + "logps/chosen": -563.4068603515625, + "logps/rejected": -777.6580810546875, + "loss": 0.2518, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3961129188537598, + "rewards/margins": 2.564927577972412, + "rewards/rejected": -3.961040496826172, + "step": 811 + }, + { + "epoch": 0.7, + "grad_norm": 57.6106870953538, + "learning_rate": 5.112807744405256e-07, + "logits/chosen": -1.218122959136963, + "logits/rejected": -1.1502938270568848, + "logps/chosen": -655.7843627929688, + "logps/rejected": -883.1055908203125, + "loss": 0.1827, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.585040807723999, + "rewards/margins": 3.1629416942596436, + "rewards/rejected": -4.747982025146484, + "step": 812 + }, + { + "epoch": 0.7, + "grad_norm": 100.19382275968717, + "learning_rate": 5.08670143878837e-07, + "logits/chosen": -1.3215523958206177, + "logits/rejected": -1.2981047630310059, + "logps/chosen": -634.1466064453125, + "logps/rejected": -740.7958984375, + "loss": 0.7425, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.9333231449127197, + "rewards/margins": 1.4622352123260498, + "rewards/rejected": -3.3955583572387695, + "step": 813 + }, + { + "epoch": 0.7, + "grad_norm": 72.74885728218051, + "learning_rate": 5.060639200994818e-07, + "logits/chosen": -1.283916711807251, + "logits/rejected": -1.3019851446151733, + "logps/chosen": -675.86181640625, + "logps/rejected": -665.9303588867188, + "loss": 0.3831, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5731701850891113, + "rewards/margins": 1.503868579864502, + "rewards/rejected": -3.0770387649536133, + "step": 814 + }, + { + "epoch": 0.7, + "grad_norm": 48.932796076182605, + "learning_rate": 5.034621264779178e-07, + "logits/chosen": -1.2343616485595703, + "logits/rejected": -1.1905256509780884, + "logps/chosen": -759.7833251953125, + "logps/rejected": -1085.231201171875, + "loss": 0.1768, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4716095924377441, + "rewards/margins": 3.619154691696167, + "rewards/rejected": -5.090764045715332, + "step": 815 + }, + { + "epoch": 0.7, + "grad_norm": 85.90267594791926, + "learning_rate": 5.008647863498709e-07, + "logits/chosen": -1.3507819175720215, + "logits/rejected": -1.2976577281951904, + "logps/chosen": -445.6289978027344, + "logps/rejected": -615.7457885742188, + "loss": 0.4761, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3056609630584717, + "rewards/margins": 1.7964327335357666, + "rewards/rejected": -3.1020936965942383, + "step": 816 + }, + { + "epoch": 0.7, + "grad_norm": 41.18656467647798, + "learning_rate": 4.982719230111208e-07, + "logits/chosen": -1.2840217351913452, + "logits/rejected": -1.2977279424667358, + "logps/chosen": -477.98187255859375, + "logps/rejected": -559.2445068359375, + "loss": 0.3443, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.997779130935669, + "rewards/margins": 1.459186315536499, + "rewards/rejected": -2.456965446472168, + "step": 817 + }, + { + "epoch": 0.7, + "grad_norm": 75.45175286746739, + "learning_rate": 4.956835597172954e-07, + "logits/chosen": -1.2445216178894043, + "logits/rejected": -1.1829955577850342, + "logps/chosen": -594.428955078125, + "logps/rejected": -779.2150268554688, + "loss": 0.1929, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.097069025039673, + "rewards/margins": 2.3932814598083496, + "rewards/rejected": -4.490350723266602, + "step": 818 + }, + { + "epoch": 0.7, + "grad_norm": 45.938823252272236, + "learning_rate": 4.930997196836624e-07, + "logits/chosen": -1.336355447769165, + "logits/rejected": -1.2834736108779907, + "logps/chosen": -324.4188232421875, + "logps/rejected": -535.7325439453125, + "loss": 0.2532, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.83463454246521, + "rewards/margins": 2.159453868865967, + "rewards/rejected": -2.994088649749756, + "step": 819 + }, + { + "epoch": 0.7, + "grad_norm": 58.34911238362092, + "learning_rate": 4.905204260849183e-07, + "logits/chosen": -1.3637986183166504, + "logits/rejected": -1.3230304718017578, + "logps/chosen": -612.1046752929688, + "logps/rejected": -678.5731201171875, + "loss": 0.2828, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.068753957748413, + "rewards/margins": 2.6097583770751953, + "rewards/rejected": -3.6785125732421875, + "step": 820 + }, + { + "epoch": 0.7, + "grad_norm": 58.100590563616066, + "learning_rate": 4.879457020549827e-07, + "logits/chosen": -1.3381030559539795, + "logits/rejected": -1.3141615390777588, + "logps/chosen": -430.4180908203125, + "logps/rejected": -661.449462890625, + "loss": 0.234, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5537689924240112, + "rewards/margins": 2.3485374450683594, + "rewards/rejected": -3.90230655670166, + "step": 821 + }, + { + "epoch": 0.7, + "grad_norm": 38.786982025083816, + "learning_rate": 4.853755706867907e-07, + "logits/chosen": -1.3229811191558838, + "logits/rejected": -1.273648977279663, + "logps/chosen": -517.710205078125, + "logps/rejected": -821.1460571289062, + "loss": 0.18, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2663819789886475, + "rewards/margins": 3.2408554553985596, + "rewards/rejected": -4.507237434387207, + "step": 822 + }, + { + "epoch": 0.71, + "grad_norm": 52.03349499015386, + "learning_rate": 4.828100550320852e-07, + "logits/chosen": -1.2744662761688232, + "logits/rejected": -1.265845775604248, + "logps/chosen": -756.6103515625, + "logps/rejected": -750.5546264648438, + "loss": 0.2136, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.724693775177002, + "rewards/margins": 2.322697401046753, + "rewards/rejected": -4.047390937805176, + "step": 823 + }, + { + "epoch": 0.71, + "grad_norm": 29.282105070034646, + "learning_rate": 4.802491781012101e-07, + "logits/chosen": -1.3247438669204712, + "logits/rejected": -1.255344033241272, + "logps/chosen": -471.8821105957031, + "logps/rejected": -746.1473999023438, + "loss": 0.1789, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.132800817489624, + "rewards/margins": 3.1457228660583496, + "rewards/rejected": -4.278523921966553, + "step": 824 + }, + { + "epoch": 0.71, + "grad_norm": 37.228174031486915, + "learning_rate": 4.776929628629046e-07, + "logits/chosen": -1.3179203271865845, + "logits/rejected": -1.2262849807739258, + "logps/chosen": -446.9613952636719, + "logps/rejected": -784.3031616210938, + "loss": 0.1404, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2367494106292725, + "rewards/margins": 3.3095202445983887, + "rewards/rejected": -4.546269416809082, + "step": 825 + }, + { + "epoch": 0.71, + "grad_norm": 37.98624611657895, + "learning_rate": 4.7514143224409654e-07, + "logits/chosen": -1.303385615348816, + "logits/rejected": -1.2276699542999268, + "logps/chosen": -395.5613708496094, + "logps/rejected": -729.7018432617188, + "loss": 0.171, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.738844633102417, + "rewards/margins": 3.0277833938598633, + "rewards/rejected": -4.766628265380859, + "step": 826 + }, + { + "epoch": 0.71, + "grad_norm": 45.503230516625734, + "learning_rate": 4.7259460912969717e-07, + "logits/chosen": -1.3405150175094604, + "logits/rejected": -1.2592682838439941, + "logps/chosen": -396.72015380859375, + "logps/rejected": -673.91015625, + "loss": 0.2404, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2170133590698242, + "rewards/margins": 2.498927116394043, + "rewards/rejected": -3.715940475463867, + "step": 827 + }, + { + "epoch": 0.71, + "grad_norm": 31.217943894028497, + "learning_rate": 4.700525163623944e-07, + "logits/chosen": -1.376317024230957, + "logits/rejected": -1.3095309734344482, + "logps/chosen": -411.83831787109375, + "logps/rejected": -690.10498046875, + "loss": 0.1662, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.012393832206726, + "rewards/margins": 2.9905548095703125, + "rewards/rejected": -4.002948760986328, + "step": 828 + }, + { + "epoch": 0.71, + "grad_norm": 74.58665440702462, + "learning_rate": 4.6751517674245155e-07, + "logits/chosen": -1.3186997175216675, + "logits/rejected": -1.2811741828918457, + "logps/chosen": -424.9061279296875, + "logps/rejected": -555.0997314453125, + "loss": 0.4596, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0757957696914673, + "rewards/margins": 1.781997561454773, + "rewards/rejected": -2.8577933311462402, + "step": 829 + }, + { + "epoch": 0.71, + "grad_norm": 36.606778713254194, + "learning_rate": 4.649826130274993e-07, + "logits/chosen": -1.2815415859222412, + "logits/rejected": -1.2470782995224, + "logps/chosen": -649.908935546875, + "logps/rejected": -758.615234375, + "loss": 0.1889, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5537364482879639, + "rewards/margins": 2.073343276977539, + "rewards/rejected": -3.627079486846924, + "step": 830 + }, + { + "epoch": 0.71, + "grad_norm": 80.61121666736041, + "learning_rate": 4.624548479323317e-07, + "logits/chosen": -1.265627145767212, + "logits/rejected": -1.2178000211715698, + "logps/chosen": -451.3035583496094, + "logps/rejected": -690.942626953125, + "loss": 0.5877, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1930776834487915, + "rewards/margins": 2.1340413093566895, + "rewards/rejected": -3.3271191120147705, + "step": 831 + }, + { + "epoch": 0.71, + "grad_norm": 57.27185180266218, + "learning_rate": 4.5993190412870596e-07, + "logits/chosen": -1.3708715438842773, + "logits/rejected": -1.2856638431549072, + "logps/chosen": -385.2818298339844, + "logps/rejected": -777.6578369140625, + "loss": 0.3799, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0449188947677612, + "rewards/margins": 2.955021858215332, + "rewards/rejected": -3.999940872192383, + "step": 832 + }, + { + "epoch": 0.71, + "grad_norm": 54.264431774671195, + "learning_rate": 4.574138042451344e-07, + "logits/chosen": -1.3288730382919312, + "logits/rejected": -1.240283489227295, + "logps/chosen": -722.72998046875, + "logps/rejected": -1019.0667114257812, + "loss": 0.2717, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7918155193328857, + "rewards/margins": 2.536999464035034, + "rewards/rejected": -4.32881498336792, + "step": 833 + }, + { + "epoch": 0.72, + "grad_norm": 48.72019081686653, + "learning_rate": 4.549005708666852e-07, + "logits/chosen": -1.295773983001709, + "logits/rejected": -1.2186580896377563, + "logps/chosen": -560.7127685546875, + "logps/rejected": -935.4384155273438, + "loss": 0.2302, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2861629724502563, + "rewards/margins": 3.5684313774108887, + "rewards/rejected": -4.854594707489014, + "step": 834 + }, + { + "epoch": 0.72, + "grad_norm": 53.292990955117695, + "learning_rate": 4.523922265347778e-07, + "logits/chosen": -1.2727816104888916, + "logits/rejected": -1.2295081615447998, + "logps/chosen": -543.6903076171875, + "logps/rejected": -791.6380004882812, + "loss": 0.3634, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2239705324172974, + "rewards/margins": 2.2526931762695312, + "rewards/rejected": -3.476663589477539, + "step": 835 + }, + { + "epoch": 0.72, + "grad_norm": 43.02721104114026, + "learning_rate": 4.4988879374698166e-07, + "logits/chosen": -1.3318668603897095, + "logits/rejected": -1.2620775699615479, + "logps/chosen": -580.2116088867188, + "logps/rejected": -836.694091796875, + "loss": 0.1654, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3591465950012207, + "rewards/margins": 3.187530040740967, + "rewards/rejected": -4.5466766357421875, + "step": 836 + }, + { + "epoch": 0.72, + "grad_norm": 40.26525761919001, + "learning_rate": 4.473902949568138e-07, + "logits/chosen": -1.2813338041305542, + "logits/rejected": -1.2284204959869385, + "logps/chosen": -586.521240234375, + "logps/rejected": -773.0760498046875, + "loss": 0.2007, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4743058681488037, + "rewards/margins": 2.6700499057769775, + "rewards/rejected": -4.144355773925781, + "step": 837 + }, + { + "epoch": 0.72, + "grad_norm": 72.74773020712323, + "learning_rate": 4.4489675257353807e-07, + "logits/chosen": -1.314551591873169, + "logits/rejected": -1.2525804042816162, + "logps/chosen": -429.6893615722656, + "logps/rejected": -571.6607666015625, + "loss": 0.4142, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7104672193527222, + "rewards/margins": 1.4704867601394653, + "rewards/rejected": -3.1809539794921875, + "step": 838 + }, + { + "epoch": 0.72, + "grad_norm": 55.78038788914927, + "learning_rate": 4.424081889619639e-07, + "logits/chosen": -1.3101136684417725, + "logits/rejected": -1.2543182373046875, + "logps/chosen": -417.7502746582031, + "logps/rejected": -665.982177734375, + "loss": 0.2769, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3031742572784424, + "rewards/margins": 2.185793399810791, + "rewards/rejected": -3.4889674186706543, + "step": 839 + }, + { + "epoch": 0.72, + "grad_norm": 45.93818042291123, + "learning_rate": 4.3992462644224515e-07, + "logits/chosen": -1.3367724418640137, + "logits/rejected": -1.2911133766174316, + "logps/chosen": -446.1727600097656, + "logps/rejected": -677.7947998046875, + "loss": 0.2157, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2187086343765259, + "rewards/margins": 2.477630138397217, + "rewards/rejected": -3.6963391304016113, + "step": 840 + }, + { + "epoch": 0.72, + "grad_norm": 33.162499393664156, + "learning_rate": 4.37446087289681e-07, + "logits/chosen": -1.30533766746521, + "logits/rejected": -1.2600688934326172, + "logps/chosen": -485.5511474609375, + "logps/rejected": -687.138671875, + "loss": 0.2553, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8878797888755798, + "rewards/margins": 2.69405460357666, + "rewards/rejected": -3.5819342136383057, + "step": 841 + }, + { + "epoch": 0.72, + "grad_norm": 32.77410242264212, + "learning_rate": 4.3497259373451533e-07, + "logits/chosen": -1.2432332038879395, + "logits/rejected": -1.2274351119995117, + "logps/chosen": -525.11083984375, + "logps/rejected": -652.7245483398438, + "loss": 0.2081, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.685070276260376, + "rewards/margins": 2.884859085083008, + "rewards/rejected": -3.5699291229248047, + "step": 842 + }, + { + "epoch": 0.72, + "grad_norm": 89.81416550928549, + "learning_rate": 4.3250416796173806e-07, + "logits/chosen": -1.2505658864974976, + "logits/rejected": -1.2060911655426025, + "logps/chosen": -691.4589233398438, + "logps/rejected": -829.2577514648438, + "loss": 0.5968, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.898880958557129, + "rewards/margins": 1.7532389163970947, + "rewards/rejected": -3.6521201133728027, + "step": 843 + }, + { + "epoch": 0.72, + "grad_norm": 81.97624676646213, + "learning_rate": 4.3004083211088414e-07, + "logits/chosen": -1.2881934642791748, + "logits/rejected": -1.2460763454437256, + "logps/chosen": -575.97802734375, + "logps/rejected": -724.1553955078125, + "loss": 0.4642, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2311291694641113, + "rewards/margins": 1.8496699333190918, + "rewards/rejected": -3.080799102783203, + "step": 844 + }, + { + "epoch": 0.72, + "grad_norm": 29.22772748448285, + "learning_rate": 4.275826082758388e-07, + "logits/chosen": -1.2812985181808472, + "logits/rejected": -1.2197277545928955, + "logps/chosen": -618.3961791992188, + "logps/rejected": -909.3668823242188, + "loss": 0.1318, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0194859504699707, + "rewards/margins": 2.808882236480713, + "rewards/rejected": -3.8283684253692627, + "step": 845 + }, + { + "epoch": 0.73, + "grad_norm": 61.829876899345294, + "learning_rate": 4.2512951850463597e-07, + "logits/chosen": -1.3472239971160889, + "logits/rejected": -1.2804555892944336, + "logps/chosen": -587.024169921875, + "logps/rejected": -706.7734375, + "loss": 0.293, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0534980297088623, + "rewards/margins": 1.6623592376708984, + "rewards/rejected": -2.7158570289611816, + "step": 846 + }, + { + "epoch": 0.73, + "grad_norm": 53.24683135146778, + "learning_rate": 4.226815847992611e-07, + "logits/chosen": -1.2948989868164062, + "logits/rejected": -1.263986349105835, + "logps/chosen": -561.6561889648438, + "logps/rejected": -683.1256103515625, + "loss": 0.3061, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1184791326522827, + "rewards/margins": 1.7602111101150513, + "rewards/rejected": -2.878690481185913, + "step": 847 + }, + { + "epoch": 0.73, + "grad_norm": 32.03500041273063, + "learning_rate": 4.202388291154555e-07, + "logits/chosen": -1.2769651412963867, + "logits/rejected": -1.295407772064209, + "logps/chosen": -470.678955078125, + "logps/rejected": -634.0687255859375, + "loss": 0.1691, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8327485918998718, + "rewards/margins": 2.287118911743164, + "rewards/rejected": -3.1198675632476807, + "step": 848 + }, + { + "epoch": 0.73, + "grad_norm": 45.721064817373104, + "learning_rate": 4.1780127336251767e-07, + "logits/chosen": -1.311166524887085, + "logits/rejected": -1.19183349609375, + "logps/chosen": -445.4447937011719, + "logps/rejected": -761.5078125, + "loss": 0.2777, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7454906702041626, + "rewards/margins": 2.716456174850464, + "rewards/rejected": -3.461946964263916, + "step": 849 + }, + { + "epoch": 0.73, + "grad_norm": 93.05220178000309, + "learning_rate": 4.153689394031079e-07, + "logits/chosen": -1.3191728591918945, + "logits/rejected": -1.242370367050171, + "logps/chosen": -508.0107421875, + "logps/rejected": -761.6094970703125, + "loss": 0.3951, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.194153070449829, + "rewards/margins": 1.9952038526535034, + "rewards/rejected": -3.189356803894043, + "step": 850 + }, + { + "epoch": 0.73, + "grad_norm": 28.96983105061877, + "learning_rate": 4.129418490530514e-07, + "logits/chosen": -1.3243119716644287, + "logits/rejected": -1.217747449874878, + "logps/chosen": -496.8054504394531, + "logps/rejected": -842.6761474609375, + "loss": 0.1176, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.863527238368988, + "rewards/margins": 3.1362767219543457, + "rewards/rejected": -3.9998040199279785, + "step": 851 + }, + { + "epoch": 0.73, + "grad_norm": 37.45097658640633, + "learning_rate": 4.1052002408114304e-07, + "logits/chosen": -1.3475794792175293, + "logits/rejected": -1.279633641242981, + "logps/chosen": -495.43853759765625, + "logps/rejected": -691.76220703125, + "loss": 0.2384, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2313153743743896, + "rewards/margins": 1.863783359527588, + "rewards/rejected": -3.0950987339019775, + "step": 852 + }, + { + "epoch": 0.73, + "grad_norm": 52.686708466661, + "learning_rate": 4.081034862089523e-07, + "logits/chosen": -1.2625312805175781, + "logits/rejected": -1.2287614345550537, + "logps/chosen": -537.3236694335938, + "logps/rejected": -665.4468994140625, + "loss": 0.2602, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6482099890708923, + "rewards/margins": 1.9057555198669434, + "rewards/rejected": -2.5539653301239014, + "step": 853 + }, + { + "epoch": 0.73, + "grad_norm": 60.405374160574816, + "learning_rate": 4.056922571106277e-07, + "logits/chosen": -1.3058245182037354, + "logits/rejected": -1.2516499757766724, + "logps/chosen": -636.6150512695312, + "logps/rejected": -859.4437255859375, + "loss": 0.3198, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4464586973190308, + "rewards/margins": 2.4538464546203613, + "rewards/rejected": -3.9003052711486816, + "step": 854 + }, + { + "epoch": 0.73, + "grad_norm": 64.8038400855151, + "learning_rate": 4.0328635841270344e-07, + "logits/chosen": -1.268088936805725, + "logits/rejected": -1.2096847295761108, + "logps/chosen": -548.0049438476562, + "logps/rejected": -827.2142944335938, + "loss": 0.2479, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.17325758934021, + "rewards/margins": 2.010125160217285, + "rewards/rejected": -3.183382987976074, + "step": 855 + }, + { + "epoch": 0.73, + "grad_norm": 69.11316301966247, + "learning_rate": 4.0088581169390425e-07, + "logits/chosen": -1.4180727005004883, + "logits/rejected": -1.3446136713027954, + "logps/chosen": -458.9594421386719, + "logps/rejected": -664.6500244140625, + "loss": 0.468, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.034346580505371, + "rewards/margins": 1.9922888278961182, + "rewards/rejected": -3.02663516998291, + "step": 856 + }, + { + "epoch": 0.73, + "grad_norm": 43.37475914020421, + "learning_rate": 3.984906384849529e-07, + "logits/chosen": -1.2790015935897827, + "logits/rejected": -1.2007935047149658, + "logps/chosen": -709.7748413085938, + "logps/rejected": -934.968994140625, + "loss": 0.2759, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8545200228691101, + "rewards/margins": 3.244429588317871, + "rewards/rejected": -4.098949432373047, + "step": 857 + }, + { + "epoch": 0.74, + "grad_norm": 20.579376552917985, + "learning_rate": 3.9610086026837673e-07, + "logits/chosen": -1.387350082397461, + "logits/rejected": -1.2440000772476196, + "logps/chosen": -405.1728515625, + "logps/rejected": -705.4710693359375, + "loss": 0.1538, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6755664348602295, + "rewards/margins": 3.2075552940368652, + "rewards/rejected": -3.883122205734253, + "step": 858 + }, + { + "epoch": 0.74, + "grad_norm": 61.989156105540516, + "learning_rate": 3.937164984783149e-07, + "logits/chosen": -1.3607792854309082, + "logits/rejected": -1.338577389717102, + "logps/chosen": -448.86956787109375, + "logps/rejected": -516.97705078125, + "loss": 0.3516, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2934528589248657, + "rewards/margins": 1.3286328315734863, + "rewards/rejected": -2.6220855712890625, + "step": 859 + }, + { + "epoch": 0.74, + "grad_norm": 113.66640912811468, + "learning_rate": 3.9133757450032536e-07, + "logits/chosen": -1.4172799587249756, + "logits/rejected": -1.325761079788208, + "logps/chosen": -598.533935546875, + "logps/rejected": -704.5578002929688, + "loss": 0.5744, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6968603134155273, + "rewards/margins": 1.3225895166397095, + "rewards/rejected": -3.0194497108459473, + "step": 860 + }, + { + "epoch": 0.74, + "grad_norm": 109.10917904229667, + "learning_rate": 3.889641096711943e-07, + "logits/chosen": -1.2880518436431885, + "logits/rejected": -1.231501817703247, + "logps/chosen": -564.6885986328125, + "logps/rejected": -741.1936645507812, + "loss": 0.5165, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8741509914398193, + "rewards/margins": 1.7373528480529785, + "rewards/rejected": -3.611504077911377, + "step": 861 + }, + { + "epoch": 0.74, + "grad_norm": 64.27319512225013, + "learning_rate": 3.8659612527874574e-07, + "logits/chosen": -1.3026931285858154, + "logits/rejected": -1.3118447065353394, + "logps/chosen": -641.9603881835938, + "logps/rejected": -660.7064208984375, + "loss": 0.3268, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2755794525146484, + "rewards/margins": 1.8998128175735474, + "rewards/rejected": -3.1753921508789062, + "step": 862 + }, + { + "epoch": 0.74, + "grad_norm": 51.47387197981583, + "learning_rate": 3.842336425616466e-07, + "logits/chosen": -1.3199818134307861, + "logits/rejected": -1.2690174579620361, + "logps/chosen": -632.9410400390625, + "logps/rejected": -703.441650390625, + "loss": 0.3068, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.29012930393219, + "rewards/margins": 2.181661605834961, + "rewards/rejected": -3.4717910289764404, + "step": 863 + }, + { + "epoch": 0.74, + "grad_norm": 33.07989943791176, + "learning_rate": 3.818766827092201e-07, + "logits/chosen": -1.3294596672058105, + "logits/rejected": -1.2252604961395264, + "logps/chosen": -441.6009521484375, + "logps/rejected": -716.4881591796875, + "loss": 0.1971, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9453874230384827, + "rewards/margins": 2.805691719055176, + "rewards/rejected": -3.7510790824890137, + "step": 864 + }, + { + "epoch": 0.74, + "grad_norm": 43.328260504476305, + "learning_rate": 3.795252668612554e-07, + "logits/chosen": -1.3333276510238647, + "logits/rejected": -1.2647820711135864, + "logps/chosen": -502.2019348144531, + "logps/rejected": -724.9705810546875, + "loss": 0.1994, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1261513233184814, + "rewards/margins": 2.8922533988952637, + "rewards/rejected": -4.018404960632324, + "step": 865 + }, + { + "epoch": 0.74, + "grad_norm": 58.03496627953685, + "learning_rate": 3.771794161078148e-07, + "logits/chosen": -1.3372564315795898, + "logits/rejected": -1.2777786254882812, + "logps/chosen": -499.2994384765625, + "logps/rejected": -706.81787109375, + "loss": 0.2921, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3794007301330566, + "rewards/margins": 2.1858272552490234, + "rewards/rejected": -3.56522798538208, + "step": 866 + }, + { + "epoch": 0.74, + "grad_norm": 59.009855907870005, + "learning_rate": 3.748391514890483e-07, + "logits/chosen": -1.4050931930541992, + "logits/rejected": -1.3332555294036865, + "logps/chosen": -293.22882080078125, + "logps/rejected": -503.4451904296875, + "loss": 0.7504, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0884900093078613, + "rewards/margins": 1.6704758405685425, + "rewards/rejected": -2.7589659690856934, + "step": 867 + }, + { + "epoch": 0.74, + "grad_norm": 46.32140136229227, + "learning_rate": 3.7250449399500285e-07, + "logits/chosen": -1.3360533714294434, + "logits/rejected": -1.3012781143188477, + "logps/chosen": -473.07305908203125, + "logps/rejected": -656.3292236328125, + "loss": 0.2358, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2797776460647583, + "rewards/margins": 2.2085609436035156, + "rewards/rejected": -3.4883384704589844, + "step": 868 + }, + { + "epoch": 0.75, + "grad_norm": 57.81381369600409, + "learning_rate": 3.701754645654347e-07, + "logits/chosen": -1.3500280380249023, + "logits/rejected": -1.3101742267608643, + "logps/chosen": -582.2659912109375, + "logps/rejected": -626.7835693359375, + "loss": 0.2263, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7205829620361328, + "rewards/margins": 2.4664716720581055, + "rewards/rejected": -3.1870546340942383, + "step": 869 + }, + { + "epoch": 0.75, + "grad_norm": 36.414745480971156, + "learning_rate": 3.678520840896213e-07, + "logits/chosen": -1.4011168479919434, + "logits/rejected": -1.3437912464141846, + "logps/chosen": -468.67303466796875, + "logps/rejected": -627.2376708984375, + "loss": 0.1575, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0265426635742188, + "rewards/margins": 2.329216718673706, + "rewards/rejected": -3.355759382247925, + "step": 870 + }, + { + "epoch": 0.75, + "grad_norm": 72.01976252804879, + "learning_rate": 3.655343734061743e-07, + "logits/chosen": -1.30082106590271, + "logits/rejected": -1.233238935470581, + "logps/chosen": -593.5780029296875, + "logps/rejected": -778.603759765625, + "loss": 0.3019, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1820402145385742, + "rewards/margins": 2.6853573322296143, + "rewards/rejected": -3.8673977851867676, + "step": 871 + }, + { + "epoch": 0.75, + "grad_norm": 94.95198859210063, + "learning_rate": 3.6322235330285245e-07, + "logits/chosen": -1.3168257474899292, + "logits/rejected": -1.3221426010131836, + "logps/chosen": -592.3690795898438, + "logps/rejected": -682.3460693359375, + "loss": 0.4481, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4910621643066406, + "rewards/margins": 2.1111958026885986, + "rewards/rejected": -3.6022579669952393, + "step": 872 + }, + { + "epoch": 0.75, + "grad_norm": 77.93664456604847, + "learning_rate": 3.6091604451637514e-07, + "logits/chosen": -1.3545784950256348, + "logits/rejected": -1.3380658626556396, + "logps/chosen": -574.0543212890625, + "logps/rejected": -678.6168212890625, + "loss": 0.3947, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.328019618988037, + "rewards/margins": 1.94981050491333, + "rewards/rejected": -3.277829885482788, + "step": 873 + }, + { + "epoch": 0.75, + "grad_norm": 52.91490995676715, + "learning_rate": 3.5861546773223625e-07, + "logits/chosen": -1.3453152179718018, + "logits/rejected": -1.2786279916763306, + "logps/chosen": -525.9072265625, + "logps/rejected": -661.3233642578125, + "loss": 0.2586, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.391870379447937, + "rewards/margins": 1.7153456211090088, + "rewards/rejected": -3.1072158813476562, + "step": 874 + }, + { + "epoch": 0.75, + "grad_norm": 36.63533426304873, + "learning_rate": 3.563206435845195e-07, + "logits/chosen": -1.4298312664031982, + "logits/rejected": -1.3732223510742188, + "logps/chosen": -417.8304443359375, + "logps/rejected": -637.6014404296875, + "loss": 0.1746, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1234023571014404, + "rewards/margins": 2.6905455589294434, + "rewards/rejected": -3.8139476776123047, + "step": 875 + }, + { + "epoch": 0.75, + "grad_norm": 55.72006781264042, + "learning_rate": 3.5403159265571134e-07, + "logits/chosen": -1.3255597352981567, + "logits/rejected": -1.2867255210876465, + "logps/chosen": -570.984375, + "logps/rejected": -834.64794921875, + "loss": 0.3485, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0393853187561035, + "rewards/margins": 2.5629029273986816, + "rewards/rejected": -4.602288246154785, + "step": 876 + }, + { + "epoch": 0.75, + "grad_norm": 43.175911684475345, + "learning_rate": 3.517483354765186e-07, + "logits/chosen": -1.357595682144165, + "logits/rejected": -1.2869113683700562, + "logps/chosen": -397.64715576171875, + "logps/rejected": -673.1788330078125, + "loss": 0.2144, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9668766260147095, + "rewards/margins": 2.812863349914551, + "rewards/rejected": -3.779740333557129, + "step": 877 + }, + { + "epoch": 0.75, + "grad_norm": 31.308656856204863, + "learning_rate": 3.494708925256844e-07, + "logits/chosen": -1.3941121101379395, + "logits/rejected": -1.3153181076049805, + "logps/chosen": -505.4889221191406, + "logps/rejected": -811.90673828125, + "loss": 0.1277, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4519171714782715, + "rewards/margins": 3.0042178630828857, + "rewards/rejected": -4.456134796142578, + "step": 878 + }, + { + "epoch": 0.75, + "grad_norm": 52.39570886453552, + "learning_rate": 3.471992842298015e-07, + "logits/chosen": -1.407675862312317, + "logits/rejected": -1.3652654886245728, + "logps/chosen": -427.5823974609375, + "logps/rejected": -508.2393798828125, + "loss": 0.3277, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6519958972930908, + "rewards/margins": 1.814331293106079, + "rewards/rejected": -2.46632719039917, + "step": 879 + }, + { + "epoch": 0.75, + "grad_norm": 67.58038638473134, + "learning_rate": 3.4493353096313194e-07, + "logits/chosen": -1.422912359237671, + "logits/rejected": -1.3669354915618896, + "logps/chosen": -604.5769653320312, + "logps/rejected": -827.68603515625, + "loss": 0.4145, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.78847336769104, + "rewards/margins": 2.4188690185546875, + "rewards/rejected": -4.207342624664307, + "step": 880 + }, + { + "epoch": 0.76, + "grad_norm": 63.33642822741839, + "learning_rate": 3.426736530474247e-07, + "logits/chosen": -1.3313500881195068, + "logits/rejected": -1.2653937339782715, + "logps/chosen": -582.1412353515625, + "logps/rejected": -808.296630859375, + "loss": 0.2413, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1795361042022705, + "rewards/margins": 3.1476869583129883, + "rewards/rejected": -4.32722282409668, + "step": 881 + }, + { + "epoch": 0.76, + "grad_norm": 54.286660414687084, + "learning_rate": 3.4041967075172993e-07, + "logits/chosen": -1.410056471824646, + "logits/rejected": -1.3243889808654785, + "logps/chosen": -427.4581298828125, + "logps/rejected": -628.5403442382812, + "loss": 0.3641, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.216133952140808, + "rewards/margins": 2.6110310554504395, + "rewards/rejected": -3.827165126800537, + "step": 882 + }, + { + "epoch": 0.76, + "grad_norm": 29.622758715360273, + "learning_rate": 3.3817160429222124e-07, + "logits/chosen": -1.3391587734222412, + "logits/rejected": -1.3287652730941772, + "logps/chosen": -481.3605041503906, + "logps/rejected": -685.9511108398438, + "loss": 0.1739, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9065276384353638, + "rewards/margins": 2.8289523124694824, + "rewards/rejected": -3.735480308532715, + "step": 883 + }, + { + "epoch": 0.76, + "grad_norm": 112.93637118227241, + "learning_rate": 3.3592947383201173e-07, + "logits/chosen": -1.3215630054473877, + "logits/rejected": -1.2854342460632324, + "logps/chosen": -598.59765625, + "logps/rejected": -838.0128173828125, + "loss": 0.42, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8170629739761353, + "rewards/margins": 2.4677867889404297, + "rewards/rejected": -4.284849166870117, + "step": 884 + }, + { + "epoch": 0.76, + "grad_norm": 98.09938605966101, + "learning_rate": 3.3369329948097434e-07, + "logits/chosen": -1.3352463245391846, + "logits/rejected": -1.3223787546157837, + "logps/chosen": -593.1196899414062, + "logps/rejected": -615.0050048828125, + "loss": 0.6885, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.898991584777832, + "rewards/margins": 0.8527295589447021, + "rewards/rejected": -2.751720905303955, + "step": 885 + }, + { + "epoch": 0.76, + "grad_norm": 90.878606470055, + "learning_rate": 3.3146310129556077e-07, + "logits/chosen": -1.321690559387207, + "logits/rejected": -1.3665200471878052, + "logps/chosen": -696.987060546875, + "logps/rejected": -712.210693359375, + "loss": 0.329, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7030959129333496, + "rewards/margins": 2.140103340148926, + "rewards/rejected": -3.8431992530822754, + "step": 886 + }, + { + "epoch": 0.76, + "grad_norm": 112.21630938679121, + "learning_rate": 3.2923889927862226e-07, + "logits/chosen": -1.3148661851882935, + "logits/rejected": -1.3028908967971802, + "logps/chosen": -635.5806274414062, + "logps/rejected": -855.8046875, + "loss": 0.452, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8170392513275146, + "rewards/margins": 2.2406771183013916, + "rewards/rejected": -4.057716369628906, + "step": 887 + }, + { + "epoch": 0.76, + "grad_norm": 44.05711014236523, + "learning_rate": 3.2702071337922964e-07, + "logits/chosen": -1.2637290954589844, + "logits/rejected": -1.206153392791748, + "logps/chosen": -570.122802734375, + "logps/rejected": -821.76171875, + "loss": 0.2241, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8035266399383545, + "rewards/margins": 2.428105592727661, + "rewards/rejected": -4.231632232666016, + "step": 888 + }, + { + "epoch": 0.76, + "grad_norm": 109.6736924788751, + "learning_rate": 3.2480856349249517e-07, + "logits/chosen": -1.2842376232147217, + "logits/rejected": -1.2788653373718262, + "logps/chosen": -556.2728271484375, + "logps/rejected": -554.0238037109375, + "loss": 0.7193, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.9949722290039062, + "rewards/margins": 1.1830435991287231, + "rewards/rejected": -3.178015947341919, + "step": 889 + }, + { + "epoch": 0.76, + "grad_norm": 99.72169912710434, + "learning_rate": 3.226024694593922e-07, + "logits/chosen": -1.39703369140625, + "logits/rejected": -1.400864601135254, + "logps/chosen": -649.0518798828125, + "logps/rejected": -563.8770141601562, + "loss": 0.5959, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.150852680206299, + "rewards/margins": 1.6393182277679443, + "rewards/rejected": -3.790170669555664, + "step": 890 + }, + { + "epoch": 0.76, + "grad_norm": 70.1925666635167, + "learning_rate": 3.2040245106658037e-07, + "logits/chosen": -1.362339973449707, + "logits/rejected": -1.339266300201416, + "logps/chosen": -674.0003051757812, + "logps/rejected": -634.685302734375, + "loss": 0.3611, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.792270302772522, + "rewards/margins": 1.7869552373886108, + "rewards/rejected": -3.579225540161133, + "step": 891 + }, + { + "epoch": 0.77, + "grad_norm": 70.55623056745203, + "learning_rate": 3.1820852804622555e-07, + "logits/chosen": -1.3071826696395874, + "logits/rejected": -1.267538070678711, + "logps/chosen": -392.2010803222656, + "logps/rejected": -595.3833618164062, + "loss": 0.5074, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1206765174865723, + "rewards/margins": 1.930222511291504, + "rewards/rejected": -3.050899028778076, + "step": 892 + }, + { + "epoch": 0.77, + "grad_norm": 110.22494900777089, + "learning_rate": 3.160207200758226e-07, + "logits/chosen": -1.378072738647461, + "logits/rejected": -1.3356727361679077, + "logps/chosen": -659.4681396484375, + "logps/rejected": -700.875, + "loss": 0.618, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.127047300338745, + "rewards/margins": 1.8155895471572876, + "rewards/rejected": -3.9426369667053223, + "step": 893 + }, + { + "epoch": 0.77, + "grad_norm": 46.144916062738126, + "learning_rate": 3.138390467780221e-07, + "logits/chosen": -1.366478443145752, + "logits/rejected": -1.2956424951553345, + "logps/chosen": -498.22845458984375, + "logps/rejected": -830.8386840820312, + "loss": 0.2081, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1988614797592163, + "rewards/margins": 3.48574161529541, + "rewards/rejected": -4.684603691101074, + "step": 894 + }, + { + "epoch": 0.77, + "grad_norm": 70.18858740432836, + "learning_rate": 3.1166352772045023e-07, + "logits/chosen": -1.3166738748550415, + "logits/rejected": -1.3166499137878418, + "logps/chosen": -695.97509765625, + "logps/rejected": -737.0611572265625, + "loss": 0.344, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6051607131958008, + "rewards/margins": 2.00355863571167, + "rewards/rejected": -3.6087193489074707, + "step": 895 + }, + { + "epoch": 0.77, + "grad_norm": 68.23376706593672, + "learning_rate": 3.09494182415536e-07, + "logits/chosen": -1.2961673736572266, + "logits/rejected": -1.2795644998550415, + "logps/chosen": -593.0159912109375, + "logps/rejected": -706.4309692382812, + "loss": 0.4114, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7094247341156006, + "rewards/margins": 1.647386074066162, + "rewards/rejected": -3.3568105697631836, + "step": 896 + }, + { + "epoch": 0.77, + "grad_norm": 33.157370260694265, + "learning_rate": 3.0733103032033634e-07, + "logits/chosen": -1.3536429405212402, + "logits/rejected": -1.2895846366882324, + "logps/chosen": -411.0083923339844, + "logps/rejected": -608.671630859375, + "loss": 0.1843, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9498167037963867, + "rewards/margins": 2.741847515106201, + "rewards/rejected": -3.691663980484009, + "step": 897 + }, + { + "epoch": 0.77, + "grad_norm": 25.068181022827815, + "learning_rate": 3.0517409083635905e-07, + "logits/chosen": -1.3577206134796143, + "logits/rejected": -1.3004130125045776, + "logps/chosen": -526.4813842773438, + "logps/rejected": -718.7724609375, + "loss": 0.1954, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.991965651512146, + "rewards/margins": 2.8482069969177246, + "rewards/rejected": -3.840172529220581, + "step": 898 + }, + { + "epoch": 0.77, + "grad_norm": 54.60576445454713, + "learning_rate": 3.030233833093915e-07, + "logits/chosen": -1.3127899169921875, + "logits/rejected": -1.2542513608932495, + "logps/chosen": -375.11737060546875, + "logps/rejected": -528.6685180664062, + "loss": 0.3104, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0060513019561768, + "rewards/margins": 2.2479963302612305, + "rewards/rejected": -3.2540478706359863, + "step": 899 + }, + { + "epoch": 0.77, + "grad_norm": 69.03825593931651, + "learning_rate": 3.008789270293258e-07, + "logits/chosen": -1.369001865386963, + "logits/rejected": -1.3474452495574951, + "logps/chosen": -391.5672302246094, + "logps/rejected": -490.455078125, + "loss": 0.4387, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0773385763168335, + "rewards/margins": 1.8839397430419922, + "rewards/rejected": -2.9612784385681152, + "step": 900 + }, + { + "epoch": 0.77, + "grad_norm": 82.4675140405883, + "learning_rate": 2.9874074122998626e-07, + "logits/chosen": -1.327359676361084, + "logits/rejected": -1.3002171516418457, + "logps/chosen": -580.1005859375, + "logps/rejected": -651.4447021484375, + "loss": 0.5103, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4842534065246582, + "rewards/margins": 1.662112832069397, + "rewards/rejected": -3.1463663578033447, + "step": 901 + }, + { + "epoch": 0.77, + "grad_norm": 25.93819082249965, + "learning_rate": 2.9660884508895635e-07, + "logits/chosen": -1.3338119983673096, + "logits/rejected": -1.2930299043655396, + "logps/chosen": -528.5415649414062, + "logps/rejected": -738.2474975585938, + "loss": 0.1384, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1045358180999756, + "rewards/margins": 2.8727259635925293, + "rewards/rejected": -3.977262020111084, + "step": 902 + }, + { + "epoch": 0.77, + "grad_norm": 57.906261612530855, + "learning_rate": 2.944832577274071e-07, + "logits/chosen": -1.3311939239501953, + "logits/rejected": -1.3131906986236572, + "logps/chosen": -518.005126953125, + "logps/rejected": -610.8331298828125, + "loss": 0.2076, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.115837812423706, + "rewards/margins": 2.317455291748047, + "rewards/rejected": -3.433293342590332, + "step": 903 + }, + { + "epoch": 0.78, + "grad_norm": 60.51137036355153, + "learning_rate": 2.9236399820992584e-07, + "logits/chosen": -1.3146090507507324, + "logits/rejected": -1.3134419918060303, + "logps/chosen": -497.95281982421875, + "logps/rejected": -555.810302734375, + "loss": 0.3785, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9638426303863525, + "rewards/margins": 1.6513032913208008, + "rewards/rejected": -2.6151459217071533, + "step": 904 + }, + { + "epoch": 0.78, + "grad_norm": 69.07987355925474, + "learning_rate": 2.9025108554434484e-07, + "logits/chosen": -1.2922639846801758, + "logits/rejected": -1.2921481132507324, + "logps/chosen": -638.66650390625, + "logps/rejected": -691.6033935546875, + "loss": 0.4971, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0466654300689697, + "rewards/margins": 1.6078869104385376, + "rewards/rejected": -2.654552459716797, + "step": 905 + }, + { + "epoch": 0.78, + "grad_norm": 34.38374236627064, + "learning_rate": 2.8814453868156975e-07, + "logits/chosen": -1.1898102760314941, + "logits/rejected": -1.150869607925415, + "logps/chosen": -632.0054931640625, + "logps/rejected": -763.151611328125, + "loss": 0.2693, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.688407301902771, + "rewards/margins": 2.7878832817077637, + "rewards/rejected": -3.476290702819824, + "step": 906 + }, + { + "epoch": 0.78, + "grad_norm": 40.99030600675662, + "learning_rate": 2.860443765154126e-07, + "logits/chosen": -1.3723232746124268, + "logits/rejected": -1.2995691299438477, + "logps/chosen": -407.0501708984375, + "logps/rejected": -686.1329956054688, + "loss": 0.2894, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8385758399963379, + "rewards/margins": 2.8460376262664795, + "rewards/rejected": -3.6846137046813965, + "step": 907 + }, + { + "epoch": 0.78, + "grad_norm": 79.22389737309626, + "learning_rate": 2.8395061788241956e-07, + "logits/chosen": -1.2611339092254639, + "logits/rejected": -1.1847124099731445, + "logps/chosen": -566.4060668945312, + "logps/rejected": -827.1646728515625, + "loss": 0.2721, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2613106966018677, + "rewards/margins": 2.8124876022338867, + "rewards/rejected": -4.073798179626465, + "step": 908 + }, + { + "epoch": 0.78, + "grad_norm": 37.901490746080235, + "learning_rate": 2.818632815617021e-07, + "logits/chosen": -1.3264902830123901, + "logits/rejected": -1.2834479808807373, + "logps/chosen": -508.1125793457031, + "logps/rejected": -708.22314453125, + "loss": 0.2283, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0484519004821777, + "rewards/margins": 2.244354009628296, + "rewards/rejected": -3.2928059101104736, + "step": 909 + }, + { + "epoch": 0.78, + "grad_norm": 66.8687708436843, + "learning_rate": 2.7978238627477146e-07, + "logits/chosen": -1.3407678604125977, + "logits/rejected": -1.2671020030975342, + "logps/chosen": -480.5046081542969, + "logps/rejected": -702.8544311523438, + "loss": 0.3486, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.188808560371399, + "rewards/margins": 2.1973466873168945, + "rewards/rejected": -3.386155128479004, + "step": 910 + }, + { + "epoch": 0.78, + "grad_norm": 27.632333790497384, + "learning_rate": 2.7770795068536643e-07, + "logits/chosen": -1.3072032928466797, + "logits/rejected": -1.2601563930511475, + "logps/chosen": -443.11138916015625, + "logps/rejected": -548.894287109375, + "loss": 0.2195, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.793603241443634, + "rewards/margins": 2.062899112701416, + "rewards/rejected": -2.8565022945404053, + "step": 911 + }, + { + "epoch": 0.78, + "grad_norm": 48.781777727628075, + "learning_rate": 2.7563999339928935e-07, + "logits/chosen": -1.3822541236877441, + "logits/rejected": -1.3028593063354492, + "logps/chosen": -374.9779052734375, + "logps/rejected": -534.6602783203125, + "loss": 0.2828, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7297624349594116, + "rewards/margins": 2.3048460483551025, + "rewards/rejected": -3.034608840942383, + "step": 912 + }, + { + "epoch": 0.78, + "grad_norm": 70.82186567684192, + "learning_rate": 2.735785329642386e-07, + "logits/chosen": -1.3031113147735596, + "logits/rejected": -1.2537412643432617, + "logps/chosen": -854.8223876953125, + "logps/rejected": -878.5704345703125, + "loss": 0.2976, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5797104835510254, + "rewards/margins": 2.0103750228881836, + "rewards/rejected": -3.590085506439209, + "step": 913 + }, + { + "epoch": 0.78, + "grad_norm": 71.14393446698111, + "learning_rate": 2.7152358786964023e-07, + "logits/chosen": -1.3568994998931885, + "logits/rejected": -1.317795991897583, + "logps/chosen": -554.6603393554688, + "logps/rejected": -741.9503173828125, + "loss": 0.3941, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5482054948806763, + "rewards/margins": 1.7386507987976074, + "rewards/rejected": -3.286856174468994, + "step": 914 + }, + { + "epoch": 0.78, + "grad_norm": 64.13171433552547, + "learning_rate": 2.6947517654648467e-07, + "logits/chosen": -1.3185844421386719, + "logits/rejected": -1.2491166591644287, + "logps/chosen": -437.0296630859375, + "logps/rejected": -627.7676391601562, + "loss": 0.3991, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3248052597045898, + "rewards/margins": 1.7850700616836548, + "rewards/rejected": -3.109875202178955, + "step": 915 + }, + { + "epoch": 0.79, + "grad_norm": 28.89268785562417, + "learning_rate": 2.674333173671601e-07, + "logits/chosen": -1.3552255630493164, + "logits/rejected": -1.3029472827911377, + "logps/chosen": -479.81939697265625, + "logps/rejected": -634.21923828125, + "loss": 0.1487, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6625692844390869, + "rewards/margins": 2.2750813961029053, + "rewards/rejected": -2.937650680541992, + "step": 916 + }, + { + "epoch": 0.79, + "grad_norm": 25.6391724086389, + "learning_rate": 2.6539802864528783e-07, + "logits/chosen": -1.3402647972106934, + "logits/rejected": -1.2553725242614746, + "logps/chosen": -405.7496643066406, + "logps/rejected": -715.4351806640625, + "loss": 0.1174, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39334067702293396, + "rewards/margins": 3.430051565170288, + "rewards/rejected": -3.823391914367676, + "step": 917 + }, + { + "epoch": 0.79, + "grad_norm": 46.84769643268985, + "learning_rate": 2.6336932863555826e-07, + "logits/chosen": -1.3779666423797607, + "logits/rejected": -1.2912983894348145, + "logps/chosen": -522.4635009765625, + "logps/rejected": -788.0476684570312, + "loss": 0.2176, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0136640071868896, + "rewards/margins": 3.4003186225891113, + "rewards/rejected": -4.41398286819458, + "step": 918 + }, + { + "epoch": 0.79, + "grad_norm": 80.28931203730443, + "learning_rate": 2.61347235533567e-07, + "logits/chosen": -1.3870668411254883, + "logits/rejected": -1.3371049165725708, + "logps/chosen": -713.8438110351562, + "logps/rejected": -824.6231079101562, + "loss": 0.2795, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5177955627441406, + "rewards/margins": 2.441744327545166, + "rewards/rejected": -3.9595396518707275, + "step": 919 + }, + { + "epoch": 0.79, + "grad_norm": 68.33602752330498, + "learning_rate": 2.5933176747565165e-07, + "logits/chosen": -1.349252462387085, + "logits/rejected": -1.3031195402145386, + "logps/chosen": -467.68505859375, + "logps/rejected": -747.3790283203125, + "loss": 0.3611, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1066734790802002, + "rewards/margins": 3.041616201400757, + "rewards/rejected": -4.148289680480957, + "step": 920 + }, + { + "epoch": 0.79, + "grad_norm": 66.7312634364071, + "learning_rate": 2.5732294253872943e-07, + "logits/chosen": -1.3333121538162231, + "logits/rejected": -1.2936124801635742, + "logps/chosen": -596.22265625, + "logps/rejected": -773.28125, + "loss": 0.3431, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2985539436340332, + "rewards/margins": 2.0604376792907715, + "rewards/rejected": -3.3589913845062256, + "step": 921 + }, + { + "epoch": 0.79, + "grad_norm": 46.716215063346944, + "learning_rate": 2.553207787401339e-07, + "logits/chosen": -1.3393959999084473, + "logits/rejected": -1.277437686920166, + "logps/chosen": -480.6090393066406, + "logps/rejected": -705.7352294921875, + "loss": 0.2781, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8253424167633057, + "rewards/margins": 2.352390766143799, + "rewards/rejected": -3.1777331829071045, + "step": 922 + }, + { + "epoch": 0.79, + "grad_norm": 48.9397323986539, + "learning_rate": 2.533252940374556e-07, + "logits/chosen": -1.34089994430542, + "logits/rejected": -1.2757072448730469, + "logps/chosen": -557.601806640625, + "logps/rejected": -759.7794799804688, + "loss": 0.2221, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6078077554702759, + "rewards/margins": 2.3486251831054688, + "rewards/rejected": -3.956432819366455, + "step": 923 + }, + { + "epoch": 0.79, + "grad_norm": 60.29091845061661, + "learning_rate": 2.513365063283791e-07, + "logits/chosen": -1.2821693420410156, + "logits/rejected": -1.282141923904419, + "logps/chosen": -505.06671142578125, + "logps/rejected": -537.974365234375, + "loss": 0.3372, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9660041332244873, + "rewards/margins": 1.7687538862228394, + "rewards/rejected": -2.734757900238037, + "step": 924 + }, + { + "epoch": 0.79, + "grad_norm": 32.52231932265654, + "learning_rate": 2.493544334505221e-07, + "logits/chosen": -1.307525396347046, + "logits/rejected": -1.2522432804107666, + "logps/chosen": -570.719970703125, + "logps/rejected": -773.0458984375, + "loss": 0.1414, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1016355752944946, + "rewards/margins": 2.7439346313476562, + "rewards/rejected": -3.8455703258514404, + "step": 925 + }, + { + "epoch": 0.79, + "grad_norm": 43.05479637811732, + "learning_rate": 2.4737909318127826e-07, + "logits/chosen": -1.3062845468521118, + "logits/rejected": -1.2266449928283691, + "logps/chosen": -376.541015625, + "logps/rejected": -599.5220947265625, + "loss": 0.217, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7425134181976318, + "rewards/margins": 2.541034698486328, + "rewards/rejected": -3.28354811668396, + "step": 926 + }, + { + "epoch": 0.8, + "grad_norm": 46.41979495785788, + "learning_rate": 2.45410503237654e-07, + "logits/chosen": -1.3085191249847412, + "logits/rejected": -1.2489107847213745, + "logps/chosen": -297.17486572265625, + "logps/rejected": -566.790771484375, + "loss": 0.283, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9123257398605347, + "rewards/margins": 2.1935391426086426, + "rewards/rejected": -3.1058647632598877, + "step": 927 + }, + { + "epoch": 0.8, + "grad_norm": 47.81115773318066, + "learning_rate": 2.434486812761124e-07, + "logits/chosen": -1.3504283428192139, + "logits/rejected": -1.31508469581604, + "logps/chosen": -531.3181762695312, + "logps/rejected": -676.3302612304688, + "loss": 0.2547, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1350109577178955, + "rewards/margins": 2.2765567302703857, + "rewards/rejected": -3.4115676879882812, + "step": 928 + }, + { + "epoch": 0.8, + "grad_norm": 71.94119778498928, + "learning_rate": 2.4149364489241386e-07, + "logits/chosen": -1.3219082355499268, + "logits/rejected": -1.3023300170898438, + "logps/chosen": -632.0505981445312, + "logps/rejected": -608.5750732421875, + "loss": 0.5524, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9060620665550232, + "rewards/margins": 2.401428461074829, + "rewards/rejected": -3.307490348815918, + "step": 929 + }, + { + "epoch": 0.8, + "grad_norm": 36.080090426945006, + "learning_rate": 2.3954541162145804e-07, + "logits/chosen": -1.283137321472168, + "logits/rejected": -1.2344775199890137, + "logps/chosen": -568.8763427734375, + "logps/rejected": -669.472412109375, + "loss": 0.1851, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0817811489105225, + "rewards/margins": 2.3809902667999268, + "rewards/rejected": -3.462771415710449, + "step": 930 + }, + { + "epoch": 0.8, + "grad_norm": 86.54210763526781, + "learning_rate": 2.3760399893712714e-07, + "logits/chosen": -1.277703046798706, + "logits/rejected": -1.2782983779907227, + "logps/chosen": -728.403564453125, + "logps/rejected": -810.8538818359375, + "loss": 0.3426, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6086221933364868, + "rewards/margins": 2.27468204498291, + "rewards/rejected": -3.8833041191101074, + "step": 931 + }, + { + "epoch": 0.8, + "grad_norm": 76.30793036259315, + "learning_rate": 2.3566942425212867e-07, + "logits/chosen": -1.3559530973434448, + "logits/rejected": -1.2951138019561768, + "logps/chosen": -556.3028564453125, + "logps/rejected": -691.5188598632812, + "loss": 0.4465, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.14959716796875, + "rewards/margins": 2.3433687686920166, + "rewards/rejected": -3.4929661750793457, + "step": 932 + }, + { + "epoch": 0.8, + "grad_norm": 67.37617332972937, + "learning_rate": 2.3374170491783952e-07, + "logits/chosen": -1.3153711557388306, + "logits/rejected": -1.248844861984253, + "logps/chosen": -566.6974487304688, + "logps/rejected": -788.5882568359375, + "loss": 0.565, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3778927326202393, + "rewards/margins": 2.809847354888916, + "rewards/rejected": -4.187740325927734, + "step": 933 + }, + { + "epoch": 0.8, + "grad_norm": 59.85884327598616, + "learning_rate": 2.3182085822415055e-07, + "logits/chosen": -1.3591265678405762, + "logits/rejected": -1.3253945112228394, + "logps/chosen": -493.8184509277344, + "logps/rejected": -725.8446044921875, + "loss": 0.4137, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6604773998260498, + "rewards/margins": 1.2979148626327515, + "rewards/rejected": -2.9583921432495117, + "step": 934 + }, + { + "epoch": 0.8, + "grad_norm": 86.97489953971993, + "learning_rate": 2.2990690139931114e-07, + "logits/chosen": -1.2852991819381714, + "logits/rejected": -1.2641592025756836, + "logps/chosen": -530.9556884765625, + "logps/rejected": -662.2107543945312, + "loss": 0.5082, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.521914005279541, + "rewards/margins": 1.842146635055542, + "rewards/rejected": -3.364060401916504, + "step": 935 + }, + { + "epoch": 0.8, + "grad_norm": 82.77432246534761, + "learning_rate": 2.2799985160977454e-07, + "logits/chosen": -1.3647409677505493, + "logits/rejected": -1.3406472206115723, + "logps/chosen": -666.6566162109375, + "logps/rejected": -643.3453979492188, + "loss": 0.3221, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.455517053604126, + "rewards/margins": 2.4178004264831543, + "rewards/rejected": -3.8733177185058594, + "step": 936 + }, + { + "epoch": 0.8, + "grad_norm": 22.240157953731014, + "learning_rate": 2.2609972596004477e-07, + "logits/chosen": -1.3525817394256592, + "logits/rejected": -1.2699151039123535, + "logps/chosen": -370.5299377441406, + "logps/rejected": -715.29736328125, + "loss": 0.156, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0559579133987427, + "rewards/margins": 2.735990047454834, + "rewards/rejected": -3.791947841644287, + "step": 937 + }, + { + "epoch": 0.8, + "grad_norm": 55.495969319969, + "learning_rate": 2.242065414925215e-07, + "logits/chosen": -1.3445532321929932, + "logits/rejected": -1.2792248725891113, + "logps/chosen": -530.41796875, + "logps/rejected": -776.2282104492188, + "loss": 0.3099, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4455312490463257, + "rewards/margins": 1.9357490539550781, + "rewards/rejected": -3.3812801837921143, + "step": 938 + }, + { + "epoch": 0.81, + "grad_norm": 34.049314536628835, + "learning_rate": 2.2232031518734984e-07, + "logits/chosen": -1.3321669101715088, + "logits/rejected": -1.2851825952529907, + "logps/chosen": -369.9688720703125, + "logps/rejected": -547.10302734375, + "loss": 0.2334, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6528770327568054, + "rewards/margins": 2.41916561126709, + "rewards/rejected": -3.072042465209961, + "step": 939 + }, + { + "epoch": 0.81, + "grad_norm": 61.74554258352496, + "learning_rate": 2.204410639622657e-07, + "logits/chosen": -1.3021714687347412, + "logits/rejected": -1.2831745147705078, + "logps/chosen": -489.5042724609375, + "logps/rejected": -573.2079467773438, + "loss": 0.3026, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3532168865203857, + "rewards/margins": 1.6035637855529785, + "rewards/rejected": -2.9567806720733643, + "step": 940 + }, + { + "epoch": 0.81, + "grad_norm": 43.76747281023233, + "learning_rate": 2.1856880467244408e-07, + "logits/chosen": -1.3493702411651611, + "logits/rejected": -1.3328311443328857, + "logps/chosen": -335.95745849609375, + "logps/rejected": -380.2527770996094, + "loss": 0.3337, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7542818784713745, + "rewards/margins": 1.4853137731552124, + "rewards/rejected": -2.239595651626587, + "step": 941 + }, + { + "epoch": 0.81, + "grad_norm": 77.37259259760772, + "learning_rate": 2.1670355411035058e-07, + "logits/chosen": -1.3695814609527588, + "logits/rejected": -1.285352349281311, + "logps/chosen": -460.94085693359375, + "logps/rejected": -627.2164306640625, + "loss": 0.3334, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6768767833709717, + "rewards/margins": 2.2322726249694824, + "rewards/rejected": -2.909149169921875, + "step": 942 + }, + { + "epoch": 0.81, + "grad_norm": 37.64021272407518, + "learning_rate": 2.1484532900558684e-07, + "logits/chosen": -1.3233516216278076, + "logits/rejected": -1.245588779449463, + "logps/chosen": -558.4962158203125, + "logps/rejected": -842.7387084960938, + "loss": 0.1812, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2138087749481201, + "rewards/margins": 2.9561476707458496, + "rewards/rejected": -4.169956207275391, + "step": 943 + }, + { + "epoch": 0.81, + "grad_norm": 93.38322909034852, + "learning_rate": 2.1299414602474375e-07, + "logits/chosen": -1.3307756185531616, + "logits/rejected": -1.3243978023529053, + "logps/chosen": -569.825439453125, + "logps/rejected": -607.7615966796875, + "loss": 0.4985, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2010345458984375, + "rewards/margins": 1.0289090871810913, + "rewards/rejected": -2.2299435138702393, + "step": 944 + }, + { + "epoch": 0.81, + "grad_norm": 43.12913446481472, + "learning_rate": 2.1115002177125063e-07, + "logits/chosen": -1.3236948251724243, + "logits/rejected": -1.2851643562316895, + "logps/chosen": -598.6200561523438, + "logps/rejected": -682.32421875, + "loss": 0.1885, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.110639214515686, + "rewards/margins": 2.551461696624756, + "rewards/rejected": -3.6621007919311523, + "step": 945 + }, + { + "epoch": 0.81, + "grad_norm": 79.79479941191578, + "learning_rate": 2.0931297278522609e-07, + "logits/chosen": -1.301816701889038, + "logits/rejected": -1.2469000816345215, + "logps/chosen": -523.5300903320312, + "logps/rejected": -710.1577758789062, + "loss": 0.5301, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.259035587310791, + "rewards/margins": 2.14595365524292, + "rewards/rejected": -3.40498948097229, + "step": 946 + }, + { + "epoch": 0.81, + "grad_norm": 52.533395564987856, + "learning_rate": 2.0748301554333024e-07, + "logits/chosen": -1.2569694519042969, + "logits/rejected": -1.1883673667907715, + "logps/chosen": -546.7478637695312, + "logps/rejected": -734.0489501953125, + "loss": 0.338, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6118360757827759, + "rewards/margins": 2.1850857734680176, + "rewards/rejected": -3.796921730041504, + "step": 947 + }, + { + "epoch": 0.81, + "grad_norm": 39.58756476499145, + "learning_rate": 2.0566016645861662e-07, + "logits/chosen": -1.3819479942321777, + "logits/rejected": -1.3744871616363525, + "logps/chosen": -480.4954833984375, + "logps/rejected": -505.9744873046875, + "loss": 0.264, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1077557802200317, + "rewards/margins": 1.5549559593200684, + "rewards/rejected": -2.6627118587493896, + "step": 948 + }, + { + "epoch": 0.81, + "grad_norm": 33.37668426570639, + "learning_rate": 2.0384444188038508e-07, + "logits/chosen": -1.3193440437316895, + "logits/rejected": -1.2121649980545044, + "logps/chosen": -444.0843811035156, + "logps/rejected": -845.9827270507812, + "loss": 0.2048, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.88461834192276, + "rewards/margins": 3.3373303413391113, + "rewards/rejected": -4.221948623657227, + "step": 949 + }, + { + "epoch": 0.81, + "grad_norm": 58.852877656840754, + "learning_rate": 2.0203585809403523e-07, + "logits/chosen": -1.2987149953842163, + "logits/rejected": -1.2459378242492676, + "logps/chosen": -350.14276123046875, + "logps/rejected": -584.1080322265625, + "loss": 0.3171, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.116916537284851, + "rewards/margins": 1.9350357055664062, + "rewards/rejected": -3.051952362060547, + "step": 950 + }, + { + "epoch": 0.82, + "grad_norm": 88.45307444424925, + "learning_rate": 2.0023443132092e-07, + "logits/chosen": -1.3424527645111084, + "logits/rejected": -1.3207765817642212, + "logps/chosen": -555.0639038085938, + "logps/rejected": -669.8426513671875, + "loss": 0.3594, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3616647720336914, + "rewards/margins": 2.0655899047851562, + "rewards/rejected": -3.4272546768188477, + "step": 951 + }, + { + "epoch": 0.82, + "grad_norm": 65.1823986201499, + "learning_rate": 1.9844017771820054e-07, + "logits/chosen": -1.2823162078857422, + "logits/rejected": -1.2424159049987793, + "logps/chosen": -427.4981689453125, + "logps/rejected": -607.1182861328125, + "loss": 0.367, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1584736108779907, + "rewards/margins": 1.8016468286514282, + "rewards/rejected": -2.960120439529419, + "step": 952 + }, + { + "epoch": 0.82, + "grad_norm": 39.696090131290745, + "learning_rate": 1.9665311337870173e-07, + "logits/chosen": -1.3776021003723145, + "logits/rejected": -1.3465569019317627, + "logps/chosen": -385.9962463378906, + "logps/rejected": -483.7662353515625, + "loss": 0.3253, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5088029503822327, + "rewards/margins": 2.1894595623016357, + "rewards/rejected": -2.6982624530792236, + "step": 953 + }, + { + "epoch": 0.82, + "grad_norm": 76.36659757388189, + "learning_rate": 1.9487325433076573e-07, + "logits/chosen": -1.3619831800460815, + "logits/rejected": -1.2768319845199585, + "logps/chosen": -510.9540710449219, + "logps/rejected": -762.8428344726562, + "loss": 0.4332, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6498937606811523, + "rewards/margins": 1.9130890369415283, + "rewards/rejected": -3.5629830360412598, + "step": 954 + }, + { + "epoch": 0.82, + "grad_norm": 100.57414208414959, + "learning_rate": 1.931006165381117e-07, + "logits/chosen": -1.3057761192321777, + "logits/rejected": -1.2368066310882568, + "logps/chosen": -714.7298583984375, + "logps/rejected": -942.5250244140625, + "loss": 0.1497, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.454352617263794, + "rewards/margins": 3.2785496711730957, + "rewards/rejected": -4.732902526855469, + "step": 955 + }, + { + "epoch": 0.82, + "grad_norm": 77.062422421182, + "learning_rate": 1.913352158996898e-07, + "logits/chosen": -1.3390214443206787, + "logits/rejected": -1.2953336238861084, + "logps/chosen": -427.72222900390625, + "logps/rejected": -579.9778442382812, + "loss": 0.4281, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1499311923980713, + "rewards/margins": 1.604925274848938, + "rewards/rejected": -2.754856586456299, + "step": 956 + }, + { + "epoch": 0.82, + "grad_norm": 81.358607096424, + "learning_rate": 1.8957706824953912e-07, + "logits/chosen": -1.3217957019805908, + "logits/rejected": -1.3499778509140015, + "logps/chosen": -604.0429077148438, + "logps/rejected": -668.5711669921875, + "loss": 0.4214, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2244956493377686, + "rewards/margins": 1.9957714080810547, + "rewards/rejected": -3.2202672958374023, + "step": 957 + }, + { + "epoch": 0.82, + "grad_norm": 51.96635681812691, + "learning_rate": 1.878261893566465e-07, + "logits/chosen": -1.290670394897461, + "logits/rejected": -1.2346240282058716, + "logps/chosen": -511.96966552734375, + "logps/rejected": -710.0076293945312, + "loss": 0.2174, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.35284197330474854, + "rewards/margins": 3.1048197746276855, + "rewards/rejected": -3.4576616287231445, + "step": 958 + }, + { + "epoch": 0.82, + "grad_norm": 65.51188245747359, + "learning_rate": 1.860825949248047e-07, + "logits/chosen": -1.3652698993682861, + "logits/rejected": -1.305970311164856, + "logps/chosen": -436.9747619628906, + "logps/rejected": -593.4093627929688, + "loss": 0.318, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0405833721160889, + "rewards/margins": 2.0152382850646973, + "rewards/rejected": -3.0558218955993652, + "step": 959 + }, + { + "epoch": 0.82, + "grad_norm": 70.47338845675975, + "learning_rate": 1.8434630059247126e-07, + "logits/chosen": -1.31570565700531, + "logits/rejected": -1.288224220275879, + "logps/chosen": -541.5432739257812, + "logps/rejected": -632.031982421875, + "loss": 0.3616, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4176642894744873, + "rewards/margins": 1.8958582878112793, + "rewards/rejected": -3.3135228157043457, + "step": 960 + }, + { + "epoch": 0.82, + "grad_norm": 103.46121882906307, + "learning_rate": 1.826173219326287e-07, + "logits/chosen": -1.4040485620498657, + "logits/rejected": -1.3483211994171143, + "logps/chosen": -695.0693359375, + "logps/rejected": -754.0690307617188, + "loss": 0.6171, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.865354061126709, + "rewards/margins": 1.688320517539978, + "rewards/rejected": -3.5536746978759766, + "step": 961 + }, + { + "epoch": 0.83, + "grad_norm": 32.83999159031318, + "learning_rate": 1.808956744526443e-07, + "logits/chosen": -1.3187965154647827, + "logits/rejected": -1.2331805229187012, + "logps/chosen": -490.5249328613281, + "logps/rejected": -802.759521484375, + "loss": 0.2065, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0255398750305176, + "rewards/margins": 3.493373394012451, + "rewards/rejected": -4.518913269042969, + "step": 962 + }, + { + "epoch": 0.83, + "grad_norm": 19.993012647341114, + "learning_rate": 1.7918137359413154e-07, + "logits/chosen": -1.3293086290359497, + "logits/rejected": -1.2440801858901978, + "logps/chosen": -390.60986328125, + "logps/rejected": -690.3939208984375, + "loss": 0.1433, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9291788935661316, + "rewards/margins": 3.344635248184204, + "rewards/rejected": -4.2738142013549805, + "step": 963 + }, + { + "epoch": 0.83, + "grad_norm": 71.96482022456891, + "learning_rate": 1.7747443473281133e-07, + "logits/chosen": -1.3135042190551758, + "logits/rejected": -1.274287223815918, + "logps/chosen": -664.2023315429688, + "logps/rejected": -819.9852905273438, + "loss": 0.3968, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9484283924102783, + "rewards/margins": 2.0423455238342285, + "rewards/rejected": -3.990774154663086, + "step": 964 + }, + { + "epoch": 0.83, + "grad_norm": 49.96214887666054, + "learning_rate": 1.7577487317837414e-07, + "logits/chosen": -1.384070634841919, + "logits/rejected": -1.3049094676971436, + "logps/chosen": -397.2987060546875, + "logps/rejected": -640.8340454101562, + "loss": 0.3909, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8136837482452393, + "rewards/margins": 2.6849682331085205, + "rewards/rejected": -3.4986519813537598, + "step": 965 + }, + { + "epoch": 0.83, + "grad_norm": 37.07711626565963, + "learning_rate": 1.7408270417434278e-07, + "logits/chosen": -1.3725950717926025, + "logits/rejected": -1.3246898651123047, + "logps/chosen": -635.620361328125, + "logps/rejected": -773.0831909179688, + "loss": 0.2158, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4327852725982666, + "rewards/margins": 2.48598575592041, + "rewards/rejected": -3.918771266937256, + "step": 966 + }, + { + "epoch": 0.83, + "grad_norm": 59.25461140547252, + "learning_rate": 1.723979428979353e-07, + "logits/chosen": -1.3322556018829346, + "logits/rejected": -1.30332350730896, + "logps/chosen": -493.76800537109375, + "logps/rejected": -601.7940673828125, + "loss": 0.355, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.601588487625122, + "rewards/margins": 1.7294272184371948, + "rewards/rejected": -3.3310158252716064, + "step": 967 + }, + { + "epoch": 0.83, + "grad_norm": 81.86136207581325, + "learning_rate": 1.7072060445992963e-07, + "logits/chosen": -1.3395432233810425, + "logits/rejected": -1.268092393875122, + "logps/chosen": -471.16900634765625, + "logps/rejected": -727.853759765625, + "loss": 0.4345, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0265016555786133, + "rewards/margins": 2.8057689666748047, + "rewards/rejected": -3.832270622253418, + "step": 968 + }, + { + "epoch": 0.83, + "grad_norm": 54.317732056880494, + "learning_rate": 1.6905070390452746e-07, + "logits/chosen": -1.2924282550811768, + "logits/rejected": -1.228909969329834, + "logps/chosen": -512.61572265625, + "logps/rejected": -765.1852416992188, + "loss": 0.227, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9856740236282349, + "rewards/margins": 2.5250301361083984, + "rewards/rejected": -3.510704517364502, + "step": 969 + }, + { + "epoch": 0.83, + "grad_norm": 41.962128256470194, + "learning_rate": 1.6738825620921893e-07, + "logits/chosen": -1.3754926919937134, + "logits/rejected": -1.2921273708343506, + "logps/chosen": -585.1094970703125, + "logps/rejected": -874.86083984375, + "loss": 0.1683, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1646623611450195, + "rewards/margins": 3.2304623126983643, + "rewards/rejected": -4.395124912261963, + "step": 970 + }, + { + "epoch": 0.83, + "grad_norm": 102.50633564708357, + "learning_rate": 1.6573327628464896e-07, + "logits/chosen": -1.3682212829589844, + "logits/rejected": -1.3239730596542358, + "logps/chosen": -514.51123046875, + "logps/rejected": -719.0240478515625, + "loss": 0.738, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6949272155761719, + "rewards/margins": 1.5487298965454102, + "rewards/rejected": -3.243657112121582, + "step": 971 + }, + { + "epoch": 0.83, + "grad_norm": 59.90244100603329, + "learning_rate": 1.640857789744846e-07, + "logits/chosen": -1.3000390529632568, + "logits/rejected": -1.270625352859497, + "logps/chosen": -742.279541015625, + "logps/rejected": -793.2999267578125, + "loss": 0.2283, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4226157665252686, + "rewards/margins": 2.159538745880127, + "rewards/rejected": -3.5821545124053955, + "step": 972 + }, + { + "epoch": 0.83, + "grad_norm": 58.278500296634284, + "learning_rate": 1.6244577905527868e-07, + "logits/chosen": -1.3476643562316895, + "logits/rejected": -1.3438431024551392, + "logps/chosen": -443.50811767578125, + "logps/rejected": -473.27508544921875, + "loss": 0.4892, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1845086812973022, + "rewards/margins": 1.4653077125549316, + "rewards/rejected": -2.6498162746429443, + "step": 973 + }, + { + "epoch": 0.84, + "grad_norm": 35.075985813774416, + "learning_rate": 1.6081329123634024e-07, + "logits/chosen": -1.271264672279358, + "logits/rejected": -1.2945821285247803, + "logps/chosen": -486.099609375, + "logps/rejected": -553.9111328125, + "loss": 0.251, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.140496015548706, + "rewards/margins": 2.0564515590667725, + "rewards/rejected": -3.1969475746154785, + "step": 974 + }, + { + "epoch": 0.84, + "grad_norm": 58.20649671335517, + "learning_rate": 1.5918833015960244e-07, + "logits/chosen": -1.3587465286254883, + "logits/rejected": -1.2988827228546143, + "logps/chosen": -520.69775390625, + "logps/rejected": -818.2510986328125, + "loss": 0.291, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5749714374542236, + "rewards/margins": 2.2684600353240967, + "rewards/rejected": -3.8434317111968994, + "step": 975 + }, + { + "epoch": 0.84, + "grad_norm": 115.5741282409676, + "learning_rate": 1.5757091039948855e-07, + "logits/chosen": -1.336240291595459, + "logits/rejected": -1.2873685359954834, + "logps/chosen": -555.2882690429688, + "logps/rejected": -592.8360595703125, + "loss": 0.5613, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3236135244369507, + "rewards/margins": 1.2951241731643677, + "rewards/rejected": -2.6187376976013184, + "step": 976 + }, + { + "epoch": 0.84, + "grad_norm": 31.639541138118187, + "learning_rate": 1.559610464627844e-07, + "logits/chosen": -1.259886622428894, + "logits/rejected": -1.219529390335083, + "logps/chosen": -432.88116455078125, + "logps/rejected": -646.241943359375, + "loss": 0.2021, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3749642372131348, + "rewards/margins": 2.893190860748291, + "rewards/rejected": -4.268155097961426, + "step": 977 + }, + { + "epoch": 0.84, + "grad_norm": 57.79686067323558, + "learning_rate": 1.5435875278850664e-07, + "logits/chosen": -1.3151590824127197, + "logits/rejected": -1.2965672016143799, + "logps/chosen": -660.9667358398438, + "logps/rejected": -831.7444458007812, + "loss": 0.2738, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1809264421463013, + "rewards/margins": 3.1272950172424316, + "rewards/rejected": -4.308221340179443, + "step": 978 + }, + { + "epoch": 0.84, + "grad_norm": 23.415580259184537, + "learning_rate": 1.5276404374777352e-07, + "logits/chosen": -1.2803633213043213, + "logits/rejected": -1.2494806051254272, + "logps/chosen": -546.5381469726562, + "logps/rejected": -770.8450927734375, + "loss": 0.1087, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8054540753364563, + "rewards/margins": 3.301450252532959, + "rewards/rejected": -4.10690450668335, + "step": 979 + }, + { + "epoch": 0.84, + "grad_norm": 69.39581972642632, + "learning_rate": 1.511769336436759e-07, + "logits/chosen": -1.3319766521453857, + "logits/rejected": -1.3051915168762207, + "logps/chosen": -614.9429321289062, + "logps/rejected": -808.3277587890625, + "loss": 0.349, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.653443455696106, + "rewards/margins": 2.0603551864624023, + "rewards/rejected": -3.7137985229492188, + "step": 980 + }, + { + "epoch": 0.84, + "grad_norm": 71.17196160686636, + "learning_rate": 1.4959743671114923e-07, + "logits/chosen": -1.2849793434143066, + "logits/rejected": -1.2056975364685059, + "logps/chosen": -529.6190185546875, + "logps/rejected": -878.0888671875, + "loss": 0.4077, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2433801889419556, + "rewards/margins": 2.849297523498535, + "rewards/rejected": -4.092677593231201, + "step": 981 + }, + { + "epoch": 0.84, + "grad_norm": 37.17978114274386, + "learning_rate": 1.4802556711684578e-07, + "logits/chosen": -1.342916488647461, + "logits/rejected": -1.2826406955718994, + "logps/chosen": -664.3662109375, + "logps/rejected": -953.063720703125, + "loss": 0.1432, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.191670536994934, + "rewards/margins": 3.7289047241210938, + "rewards/rejected": -4.920575141906738, + "step": 982 + }, + { + "epoch": 0.84, + "grad_norm": 67.10573660182742, + "learning_rate": 1.464613389590076e-07, + "logits/chosen": -1.359865427017212, + "logits/rejected": -1.3003504276275635, + "logps/chosen": -522.3917236328125, + "logps/rejected": -681.456298828125, + "loss": 0.3646, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3133918046951294, + "rewards/margins": 2.1504573822021484, + "rewards/rejected": -3.4638490676879883, + "step": 983 + }, + { + "epoch": 0.84, + "grad_norm": 29.117522756221938, + "learning_rate": 1.4490476626733904e-07, + "logits/chosen": -1.3106701374053955, + "logits/rejected": -1.2240524291992188, + "logps/chosen": -465.88330078125, + "logps/rejected": -737.56787109375, + "loss": 0.1507, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.892052412033081, + "rewards/margins": 2.813678026199341, + "rewards/rejected": -3.705730438232422, + "step": 984 + }, + { + "epoch": 0.84, + "grad_norm": 21.957818753419676, + "learning_rate": 1.4335586300288384e-07, + "logits/chosen": -1.4063811302185059, + "logits/rejected": -1.3505864143371582, + "logps/chosen": -529.839599609375, + "logps/rejected": -693.262451171875, + "loss": 0.0855, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9908750057220459, + "rewards/margins": 3.0074610710144043, + "rewards/rejected": -3.998335838317871, + "step": 985 + }, + { + "epoch": 0.85, + "grad_norm": 55.984646307333605, + "learning_rate": 1.4181464305789582e-07, + "logits/chosen": -1.3785548210144043, + "logits/rejected": -1.374957799911499, + "logps/chosen": -412.48687744140625, + "logps/rejected": -459.7071228027344, + "loss": 0.402, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3801637887954712, + "rewards/margins": 1.472696304321289, + "rewards/rejected": -2.852860450744629, + "step": 986 + }, + { + "epoch": 0.85, + "grad_norm": 45.42030857890214, + "learning_rate": 1.402811202557176e-07, + "logits/chosen": -1.3691201210021973, + "logits/rejected": -1.2851285934448242, + "logps/chosen": -588.96484375, + "logps/rejected": -945.3374633789062, + "loss": 0.1955, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0526621341705322, + "rewards/margins": 4.162746429443359, + "rewards/rejected": -5.2154083251953125, + "step": 987 + }, + { + "epoch": 0.85, + "grad_norm": 93.84393859471162, + "learning_rate": 1.3875530835065574e-07, + "logits/chosen": -1.3202193975448608, + "logits/rejected": -1.3074629306793213, + "logps/chosen": -512.6787109375, + "logps/rejected": -594.0485229492188, + "loss": 0.8315, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.14207124710083, + "rewards/margins": 1.4724304676055908, + "rewards/rejected": -2.614501714706421, + "step": 988 + }, + { + "epoch": 0.85, + "grad_norm": 30.332149826016966, + "learning_rate": 1.3723722102785574e-07, + "logits/chosen": -1.347909688949585, + "logits/rejected": -1.2588642835617065, + "logps/chosen": -501.5035400390625, + "logps/rejected": -778.224853515625, + "loss": 0.184, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.144532322883606, + "rewards/margins": 3.007246971130371, + "rewards/rejected": -4.1517791748046875, + "step": 989 + }, + { + "epoch": 0.85, + "grad_norm": 66.25531036689799, + "learning_rate": 1.3572687190318167e-07, + "logits/chosen": -1.3748841285705566, + "logits/rejected": -1.3424025774002075, + "logps/chosen": -370.31097412109375, + "logps/rejected": -493.2507019042969, + "loss": 0.3701, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7494587898254395, + "rewards/margins": 1.8322503566741943, + "rewards/rejected": -2.581709384918213, + "step": 990 + }, + { + "epoch": 0.85, + "grad_norm": 79.38873125449446, + "learning_rate": 1.3422427452309304e-07, + "logits/chosen": -1.3242628574371338, + "logits/rejected": -1.270920753479004, + "logps/chosen": -503.6895751953125, + "logps/rejected": -741.283935546875, + "loss": 0.432, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2765638828277588, + "rewards/margins": 2.556812286376953, + "rewards/rejected": -3.833375930786133, + "step": 991 + }, + { + "epoch": 0.85, + "grad_norm": 83.40848008718488, + "learning_rate": 1.3272944236452255e-07, + "logits/chosen": -1.3762929439544678, + "logits/rejected": -1.2758681774139404, + "logps/chosen": -531.3277587890625, + "logps/rejected": -764.064208984375, + "loss": 0.5236, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1648799180984497, + "rewards/margins": 2.10933256149292, + "rewards/rejected": -3.27421236038208, + "step": 992 + }, + { + "epoch": 0.85, + "grad_norm": 27.36302955280646, + "learning_rate": 1.3124238883475625e-07, + "logits/chosen": -1.340722680091858, + "logits/rejected": -1.3234275579452515, + "logps/chosen": -476.11065673828125, + "logps/rejected": -614.7606201171875, + "loss": 0.2048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8835915327072144, + "rewards/margins": 2.2028541564941406, + "rewards/rejected": -3.0864458084106445, + "step": 993 + }, + { + "epoch": 0.85, + "grad_norm": 50.74166752479985, + "learning_rate": 1.297631272713132e-07, + "logits/chosen": -1.3843188285827637, + "logits/rejected": -1.3354160785675049, + "logps/chosen": -362.65545654296875, + "logps/rejected": -511.3385314941406, + "loss": 0.38, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8462494611740112, + "rewards/margins": 1.3559162616729736, + "rewards/rejected": -2.2021656036376953, + "step": 994 + }, + { + "epoch": 0.85, + "grad_norm": 79.43461015601946, + "learning_rate": 1.2829167094182535e-07, + "logits/chosen": -1.3651742935180664, + "logits/rejected": -1.3554399013519287, + "logps/chosen": -345.17913818359375, + "logps/rejected": -470.7667541503906, + "loss": 0.5855, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4406148195266724, + "rewards/margins": 0.8224074840545654, + "rewards/rejected": -2.2630221843719482, + "step": 995 + }, + { + "epoch": 0.85, + "grad_norm": 57.67269416870282, + "learning_rate": 1.268280330439191e-07, + "logits/chosen": -1.3220021724700928, + "logits/rejected": -1.2686153650283813, + "logps/chosen": -482.16314697265625, + "logps/rejected": -677.6004638671875, + "loss": 0.2283, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.219618558883667, + "rewards/margins": 2.923917770385742, + "rewards/rejected": -4.14353609085083, + "step": 996 + }, + { + "epoch": 0.86, + "grad_norm": 49.46726905531337, + "learning_rate": 1.2537222670509563e-07, + "logits/chosen": -1.366963267326355, + "logits/rejected": -1.3310010433197021, + "logps/chosen": -572.5836791992188, + "logps/rejected": -686.4254760742188, + "loss": 0.2289, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3257455825805664, + "rewards/margins": 1.9714258909225464, + "rewards/rejected": -3.2971715927124023, + "step": 997 + }, + { + "epoch": 0.86, + "grad_norm": 32.18065986952302, + "learning_rate": 1.2392426498261555e-07, + "logits/chosen": -1.394819974899292, + "logits/rejected": -1.2971246242523193, + "logps/chosen": -410.18109130859375, + "logps/rejected": -691.6347045898438, + "loss": 0.1746, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0975755453109741, + "rewards/margins": 2.950037956237793, + "rewards/rejected": -4.047614097595215, + "step": 998 + }, + { + "epoch": 0.86, + "grad_norm": 107.19719355463234, + "learning_rate": 1.2248416086337975e-07, + "logits/chosen": -1.3006575107574463, + "logits/rejected": -1.2912487983703613, + "logps/chosen": -520.3414306640625, + "logps/rejected": -561.614013671875, + "loss": 0.7654, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.300315022468567, + "rewards/margins": 0.9441814422607422, + "rewards/rejected": -2.2444963455200195, + "step": 999 + }, + { + "epoch": 0.86, + "grad_norm": 77.32374464766163, + "learning_rate": 1.2105192726381298e-07, + "logits/chosen": -1.334614634513855, + "logits/rejected": -1.3272688388824463, + "logps/chosen": -638.7611083984375, + "logps/rejected": -625.6121215820312, + "loss": 0.4573, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7237482070922852, + "rewards/margins": 1.8083107471466064, + "rewards/rejected": -3.5320589542388916, + "step": 1000 + }, + { + "epoch": 0.86, + "grad_norm": 50.55706077901139, + "learning_rate": 1.196275770297497e-07, + "logits/chosen": -1.352452278137207, + "logits/rejected": -1.3267664909362793, + "logps/chosen": -478.7269592285156, + "logps/rejected": -654.9346923828125, + "loss": 0.2493, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6469810009002686, + "rewards/margins": 2.880655288696289, + "rewards/rejected": -3.5276360511779785, + "step": 1001 + }, + { + "epoch": 0.86, + "grad_norm": 48.758499448833476, + "learning_rate": 1.1821112293631719e-07, + "logits/chosen": -1.3520736694335938, + "logits/rejected": -1.3165087699890137, + "logps/chosen": -473.81866455078125, + "logps/rejected": -618.9940795898438, + "loss": 0.2119, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1709930896759033, + "rewards/margins": 2.36578369140625, + "rewards/rejected": -3.5367767810821533, + "step": 1002 + }, + { + "epoch": 0.86, + "grad_norm": 37.22498704233837, + "learning_rate": 1.1680257768782099e-07, + "logits/chosen": -1.3299715518951416, + "logits/rejected": -1.2586889266967773, + "logps/chosen": -405.7853088378906, + "logps/rejected": -546.0199584960938, + "loss": 0.2608, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.06967031955719, + "rewards/margins": 1.9266602993011475, + "rewards/rejected": -2.996330738067627, + "step": 1003 + }, + { + "epoch": 0.86, + "grad_norm": 54.30371464873953, + "learning_rate": 1.1540195391763263e-07, + "logits/chosen": -1.3893147706985474, + "logits/rejected": -1.37056565284729, + "logps/chosen": -449.3239440917969, + "logps/rejected": -508.5211181640625, + "loss": 0.2624, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9809780120849609, + "rewards/margins": 1.7566754817962646, + "rewards/rejected": -2.7376537322998047, + "step": 1004 + }, + { + "epoch": 0.86, + "grad_norm": 34.826076763047475, + "learning_rate": 1.1400926418807422e-07, + "logits/chosen": -1.3537343740463257, + "logits/rejected": -1.2689040899276733, + "logps/chosen": -424.01190185546875, + "logps/rejected": -680.014892578125, + "loss": 0.2066, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1469078063964844, + "rewards/margins": 2.787965774536133, + "rewards/rejected": -3.934873580932617, + "step": 1005 + }, + { + "epoch": 0.86, + "grad_norm": 32.71959772315284, + "learning_rate": 1.1262452099030683e-07, + "logits/chosen": -1.343701958656311, + "logits/rejected": -1.2804139852523804, + "logps/chosen": -515.3363037109375, + "logps/rejected": -678.830322265625, + "loss": 0.176, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8507044315338135, + "rewards/margins": 3.0262112617492676, + "rewards/rejected": -3.876915693283081, + "step": 1006 + }, + { + "epoch": 0.86, + "grad_norm": 47.6305946661059, + "learning_rate": 1.112477367442195e-07, + "logits/chosen": -1.3749220371246338, + "logits/rejected": -1.2896788120269775, + "logps/chosen": -507.39715576171875, + "logps/rejected": -836.339599609375, + "loss": 0.2065, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.589701771736145, + "rewards/margins": 2.4947781562805176, + "rewards/rejected": -4.084479808807373, + "step": 1007 + }, + { + "epoch": 0.86, + "grad_norm": 91.08197293979373, + "learning_rate": 1.0987892379831499e-07, + "logits/chosen": -1.409374713897705, + "logits/rejected": -1.3736358880996704, + "logps/chosen": -566.7190551757812, + "logps/rejected": -724.8707275390625, + "loss": 0.7248, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5256876945495605, + "rewards/margins": 1.7136530876159668, + "rewards/rejected": -3.2393407821655273, + "step": 1008 + }, + { + "epoch": 0.87, + "grad_norm": 73.79584711311568, + "learning_rate": 1.085180944296018e-07, + "logits/chosen": -1.388725996017456, + "logits/rejected": -1.3790984153747559, + "logps/chosen": -533.7811279296875, + "logps/rejected": -701.3124389648438, + "loss": 0.4499, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2950994968414307, + "rewards/margins": 2.425567865371704, + "rewards/rejected": -3.7206673622131348, + "step": 1009 + }, + { + "epoch": 0.87, + "grad_norm": 104.60604217147537, + "learning_rate": 1.0716526084348276e-07, + "logits/chosen": -1.2697685956954956, + "logits/rejected": -1.2896143198013306, + "logps/chosen": -526.0863647460938, + "logps/rejected": -502.4462890625, + "loss": 0.686, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7153476476669312, + "rewards/margins": 0.7424807548522949, + "rewards/rejected": -2.4578285217285156, + "step": 1010 + }, + { + "epoch": 0.87, + "grad_norm": 62.41838282787974, + "learning_rate": 1.0582043517364602e-07, + "logits/chosen": -1.3432402610778809, + "logits/rejected": -1.3046520948410034, + "logps/chosen": -598.40966796875, + "logps/rejected": -688.8533325195312, + "loss": 0.2734, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2707072496414185, + "rewards/margins": 2.0681800842285156, + "rewards/rejected": -3.3388874530792236, + "step": 1011 + }, + { + "epoch": 0.87, + "grad_norm": 23.917482883938213, + "learning_rate": 1.0448362948195566e-07, + "logits/chosen": -1.3280737400054932, + "logits/rejected": -1.2590006589889526, + "logps/chosen": -574.62255859375, + "logps/rejected": -878.9960327148438, + "loss": 0.0858, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6344642639160156, + "rewards/margins": 3.077831268310547, + "rewards/rejected": -4.7122955322265625, + "step": 1012 + }, + { + "epoch": 0.87, + "grad_norm": 90.17755555406973, + "learning_rate": 1.031548557583436e-07, + "logits/chosen": -1.2596286535263062, + "logits/rejected": -1.235691785812378, + "logps/chosen": -618.44482421875, + "logps/rejected": -745.6839599609375, + "loss": 0.6558, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.025634765625, + "rewards/margins": 1.847394347190857, + "rewards/rejected": -3.8730289936065674, + "step": 1013 + }, + { + "epoch": 0.87, + "grad_norm": 57.67179771729252, + "learning_rate": 1.0183412592070318e-07, + "logits/chosen": -1.4220824241638184, + "logits/rejected": -1.2803467512130737, + "logps/chosen": -543.591552734375, + "logps/rejected": -951.6238403320312, + "loss": 0.2018, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.496922254562378, + "rewards/margins": 3.2244110107421875, + "rewards/rejected": -4.7213335037231445, + "step": 1014 + }, + { + "epoch": 0.87, + "grad_norm": 28.27046503087058, + "learning_rate": 1.0052145181478088e-07, + "logits/chosen": -1.3783986568450928, + "logits/rejected": -1.3363351821899414, + "logps/chosen": -430.4894714355469, + "logps/rejected": -737.7826538085938, + "loss": 0.1427, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.000460147857666, + "rewards/margins": 3.2528653144836426, + "rewards/rejected": -4.253325462341309, + "step": 1015 + }, + { + "epoch": 0.87, + "grad_norm": 68.50746326940782, + "learning_rate": 9.921684521407003e-08, + "logits/chosen": -1.325369119644165, + "logits/rejected": -1.2970994710922241, + "logps/chosen": -496.46337890625, + "logps/rejected": -493.948974609375, + "loss": 0.4351, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9476513862609863, + "rewards/margins": 1.5521341562271118, + "rewards/rejected": -2.4997854232788086, + "step": 1016 + }, + { + "epoch": 0.87, + "grad_norm": 45.159571790116686, + "learning_rate": 9.792031781970689e-08, + "logits/chosen": -1.4115604162216187, + "logits/rejected": -1.377731442451477, + "logps/chosen": -377.2203369140625, + "logps/rejected": -521.806884765625, + "loss": 0.2619, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0020372867584229, + "rewards/margins": 1.844942331314087, + "rewards/rejected": -2.8469796180725098, + "step": 1017 + }, + { + "epoch": 0.87, + "grad_norm": 63.42631756119659, + "learning_rate": 9.663188126036392e-08, + "logits/chosen": -1.3807454109191895, + "logits/rejected": -1.337803840637207, + "logps/chosen": -519.5097045898438, + "logps/rejected": -685.4462280273438, + "loss": 0.3507, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5622811317443848, + "rewards/margins": 2.141068935394287, + "rewards/rejected": -3.703350305557251, + "step": 1018 + }, + { + "epoch": 0.87, + "grad_norm": 34.09574052170219, + "learning_rate": 9.535154709214587e-08, + "logits/chosen": -1.3408069610595703, + "logits/rejected": -1.272974967956543, + "logps/chosen": -395.58734130859375, + "logps/rejected": -605.916748046875, + "loss": 0.1966, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.33781898021698, + "rewards/margins": 2.1266462802886963, + "rewards/rejected": -3.4644651412963867, + "step": 1019 + }, + { + "epoch": 0.87, + "grad_norm": 103.9306170119398, + "learning_rate": 9.407932679848751e-08, + "logits/chosen": -1.3915414810180664, + "logits/rejected": -1.3104114532470703, + "logps/chosen": -506.6783142089844, + "logps/rejected": -715.592529296875, + "loss": 0.5415, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.56777024269104, + "rewards/margins": 1.9328727722167969, + "rewards/rejected": -3.500643014907837, + "step": 1020 + }, + { + "epoch": 0.88, + "grad_norm": 34.40064754463839, + "learning_rate": 9.281523179004802e-08, + "logits/chosen": -1.3873071670532227, + "logits/rejected": -1.3065744638442993, + "logps/chosen": -335.1987609863281, + "logps/rejected": -605.9613037109375, + "loss": 0.262, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5773022174835205, + "rewards/margins": 2.349818706512451, + "rewards/rejected": -2.9271209239959717, + "step": 1021 + }, + { + "epoch": 0.88, + "grad_norm": 67.12493341782304, + "learning_rate": 9.155927340461111e-08, + "logits/chosen": -1.347530722618103, + "logits/rejected": -1.2916395664215088, + "logps/chosen": -685.7630004882812, + "logps/rejected": -955.0648803710938, + "loss": 0.4494, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5220239162445068, + "rewards/margins": 2.94976806640625, + "rewards/rejected": -4.471792221069336, + "step": 1022 + }, + { + "epoch": 0.88, + "grad_norm": 62.10208865187062, + "learning_rate": 9.031146290698277e-08, + "logits/chosen": -1.350818157196045, + "logits/rejected": -1.2238643169403076, + "logps/chosen": -477.2236022949219, + "logps/rejected": -798.7186279296875, + "loss": 0.2364, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9117076992988586, + "rewards/margins": 3.2474284172058105, + "rewards/rejected": -4.159135818481445, + "step": 1023 + }, + { + "epoch": 0.88, + "grad_norm": 76.81999610296886, + "learning_rate": 8.907181148888854e-08, + "logits/chosen": -1.298819899559021, + "logits/rejected": -1.3052400350570679, + "logps/chosen": -656.9742431640625, + "logps/rejected": -658.60009765625, + "loss": 0.3144, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.474662184715271, + "rewards/margins": 2.117044687271118, + "rewards/rejected": -3.5917067527770996, + "step": 1024 + }, + { + "epoch": 0.88, + "grad_norm": 38.17404499212991, + "learning_rate": 8.78403302688755e-08, + "logits/chosen": -1.358424186706543, + "logits/rejected": -1.2844429016113281, + "logps/chosen": -521.257080078125, + "logps/rejected": -793.573974609375, + "loss": 0.1823, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2657744884490967, + "rewards/margins": 2.5545406341552734, + "rewards/rejected": -3.820315361022949, + "step": 1025 + }, + { + "epoch": 0.88, + "grad_norm": 57.92634731089754, + "learning_rate": 8.661703029221112e-08, + "logits/chosen": -1.429605484008789, + "logits/rejected": -1.3548343181610107, + "logps/chosen": -479.7242431640625, + "logps/rejected": -599.1637573242188, + "loss": 0.3724, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6530858278274536, + "rewards/margins": 1.7444809675216675, + "rewards/rejected": -3.397566795349121, + "step": 1026 + }, + { + "epoch": 0.88, + "grad_norm": 57.762652805798844, + "learning_rate": 8.540192253078448e-08, + "logits/chosen": -1.3694772720336914, + "logits/rejected": -1.3181052207946777, + "logps/chosen": -578.649658203125, + "logps/rejected": -698.65185546875, + "loss": 0.3659, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3504722118377686, + "rewards/margins": 2.076638698577881, + "rewards/rejected": -3.4271106719970703, + "step": 1027 + }, + { + "epoch": 0.88, + "grad_norm": 57.605495838687986, + "learning_rate": 8.41950178830081e-08, + "logits/chosen": -1.3705828189849854, + "logits/rejected": -1.3483182191848755, + "logps/chosen": -485.2783508300781, + "logps/rejected": -574.943603515625, + "loss": 0.3538, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2768254280090332, + "rewards/margins": 2.3636741638183594, + "rewards/rejected": -3.6404995918273926, + "step": 1028 + }, + { + "epoch": 0.88, + "grad_norm": 42.00448683749464, + "learning_rate": 8.299632717371996e-08, + "logits/chosen": -1.4272587299346924, + "logits/rejected": -1.3087083101272583, + "logps/chosen": -408.137451171875, + "logps/rejected": -737.99267578125, + "loss": 0.1625, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9820849895477295, + "rewards/margins": 3.0054163932800293, + "rewards/rejected": -3.987501621246338, + "step": 1029 + }, + { + "epoch": 0.88, + "grad_norm": 36.82882855714117, + "learning_rate": 8.180586115408627e-08, + "logits/chosen": -1.380150318145752, + "logits/rejected": -1.309890866279602, + "logps/chosen": -352.4360046386719, + "logps/rejected": -592.15625, + "loss": 0.2671, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7464353442192078, + "rewards/margins": 2.4291770458221436, + "rewards/rejected": -3.175612688064575, + "step": 1030 + }, + { + "epoch": 0.88, + "grad_norm": 34.92691331449185, + "learning_rate": 8.06236305015059e-08, + "logits/chosen": -1.3108597993850708, + "logits/rejected": -1.2580173015594482, + "logps/chosen": -530.5318603515625, + "logps/rejected": -834.72314453125, + "loss": 0.1618, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1678415536880493, + "rewards/margins": 3.0013504028320312, + "rewards/rejected": -4.169192314147949, + "step": 1031 + }, + { + "epoch": 0.89, + "grad_norm": 40.819265846962935, + "learning_rate": 7.944964581951275e-08, + "logits/chosen": -1.344224452972412, + "logits/rejected": -1.2919423580169678, + "logps/chosen": -646.0286254882812, + "logps/rejected": -807.8919067382812, + "loss": 0.2051, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2766971588134766, + "rewards/margins": 3.460568428039551, + "rewards/rejected": -4.737265586853027, + "step": 1032 + }, + { + "epoch": 0.89, + "grad_norm": 44.888672218587935, + "learning_rate": 7.828391763768316e-08, + "logits/chosen": -1.3413419723510742, + "logits/rejected": -1.2852157354354858, + "logps/chosen": -467.3982238769531, + "logps/rejected": -574.54833984375, + "loss": 0.2917, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5983957648277283, + "rewards/margins": 2.3117122650146484, + "rewards/rejected": -2.9101080894470215, + "step": 1033 + }, + { + "epoch": 0.89, + "grad_norm": 57.511637878172635, + "learning_rate": 7.71264564115397e-08, + "logits/chosen": -1.3855578899383545, + "logits/rejected": -1.3349218368530273, + "logps/chosen": -506.8746337890625, + "logps/rejected": -727.783447265625, + "loss": 0.2616, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.276779294013977, + "rewards/margins": 2.763277769088745, + "rewards/rejected": -4.040057182312012, + "step": 1034 + }, + { + "epoch": 0.89, + "grad_norm": 62.57801150274811, + "learning_rate": 7.597727252245723e-08, + "logits/chosen": -1.348046064376831, + "logits/rejected": -1.2998453378677368, + "logps/chosen": -576.858154296875, + "logps/rejected": -694.7091674804688, + "loss": 0.3253, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.056487560272217, + "rewards/margins": 1.7695956230163574, + "rewards/rejected": -3.826083183288574, + "step": 1035 + }, + { + "epoch": 0.89, + "grad_norm": 45.60053488392325, + "learning_rate": 7.483637627757166e-08, + "logits/chosen": -1.3614377975463867, + "logits/rejected": -1.312305212020874, + "logps/chosen": -544.572021484375, + "logps/rejected": -674.3843994140625, + "loss": 0.1863, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1244726181030273, + "rewards/margins": 2.8420958518981934, + "rewards/rejected": -3.9665684700012207, + "step": 1036 + }, + { + "epoch": 0.89, + "grad_norm": 40.22451424779994, + "learning_rate": 7.370377790968496e-08, + "logits/chosen": -1.3451738357543945, + "logits/rejected": -1.3830194473266602, + "logps/chosen": -387.7374572753906, + "logps/rejected": -351.87322998046875, + "loss": 0.3227, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7732783555984497, + "rewards/margins": 1.255499243736267, + "rewards/rejected": -2.028777599334717, + "step": 1037 + }, + { + "epoch": 0.89, + "grad_norm": 36.91936601511155, + "learning_rate": 7.257948757717558e-08, + "logits/chosen": -1.3813602924346924, + "logits/rejected": -1.3303508758544922, + "logps/chosen": -609.59375, + "logps/rejected": -660.0333251953125, + "loss": 0.1944, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2263309955596924, + "rewards/margins": 2.387805461883545, + "rewards/rejected": -3.6141366958618164, + "step": 1038 + }, + { + "epoch": 0.89, + "grad_norm": 39.65751850865929, + "learning_rate": 7.146351536390605e-08, + "logits/chosen": -1.3641141653060913, + "logits/rejected": -1.3134207725524902, + "logps/chosen": -550.0794677734375, + "logps/rejected": -780.9698486328125, + "loss": 0.2307, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2107312679290771, + "rewards/margins": 2.552530288696289, + "rewards/rejected": -3.763261318206787, + "step": 1039 + }, + { + "epoch": 0.89, + "grad_norm": 34.24417524354166, + "learning_rate": 7.0355871279133e-08, + "logits/chosen": -1.3492000102996826, + "logits/rejected": -1.309326171875, + "logps/chosen": -619.298583984375, + "logps/rejected": -761.5640869140625, + "loss": 0.1531, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9659633636474609, + "rewards/margins": 2.9570584297180176, + "rewards/rejected": -3.9230217933654785, + "step": 1040 + }, + { + "epoch": 0.89, + "grad_norm": 111.866433260211, + "learning_rate": 6.925656525741751e-08, + "logits/chosen": -1.362472653388977, + "logits/rejected": -1.3299155235290527, + "logps/chosen": -527.7230224609375, + "logps/rejected": -719.2764282226562, + "loss": 0.6964, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.815812349319458, + "rewards/margins": 1.7427184581756592, + "rewards/rejected": -3.558530807495117, + "step": 1041 + }, + { + "epoch": 0.89, + "grad_norm": 32.95656660320831, + "learning_rate": 6.816560715853547e-08, + "logits/chosen": -1.3642749786376953, + "logits/rejected": -1.2888407707214355, + "logps/chosen": -646.0355224609375, + "logps/rejected": -815.24609375, + "loss": 0.1344, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1993162631988525, + "rewards/margins": 3.1481070518493652, + "rewards/rejected": -4.347423553466797, + "step": 1042 + }, + { + "epoch": 0.89, + "grad_norm": 78.96166752915579, + "learning_rate": 6.708300676738976e-08, + "logits/chosen": -1.3494223356246948, + "logits/rejected": -1.2774553298950195, + "logps/chosen": -482.1902160644531, + "logps/rejected": -707.10986328125, + "loss": 0.4936, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3335380554199219, + "rewards/margins": 2.5525479316711426, + "rewards/rejected": -3.8860857486724854, + "step": 1043 + }, + { + "epoch": 0.9, + "grad_norm": 19.09480900466112, + "learning_rate": 6.600877379392212e-08, + "logits/chosen": -1.4266753196716309, + "logits/rejected": -1.3531907796859741, + "logps/chosen": -369.3329772949219, + "logps/rejected": -586.53564453125, + "loss": 0.1982, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4353833794593811, + "rewards/margins": 2.7501494884490967, + "rewards/rejected": -3.185532808303833, + "step": 1044 + }, + { + "epoch": 0.9, + "grad_norm": 25.09530960091258, + "learning_rate": 6.494291787302608e-08, + "logits/chosen": -1.388257622718811, + "logits/rejected": -1.3475077152252197, + "logps/chosen": -452.254150390625, + "logps/rejected": -888.9625854492188, + "loss": 0.1198, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8969854116439819, + "rewards/margins": 3.282654285430908, + "rewards/rejected": -4.17963981628418, + "step": 1045 + }, + { + "epoch": 0.9, + "grad_norm": 22.14700396565832, + "learning_rate": 6.388544856446065e-08, + "logits/chosen": -1.3584980964660645, + "logits/rejected": -1.278203010559082, + "logps/chosen": -422.3341369628906, + "logps/rejected": -690.5770263671875, + "loss": 0.1809, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0029441118240356, + "rewards/margins": 3.2371349334716797, + "rewards/rejected": -4.240078926086426, + "step": 1046 + }, + { + "epoch": 0.9, + "grad_norm": 67.19354932374553, + "learning_rate": 6.283637535276498e-08, + "logits/chosen": -1.3828531503677368, + "logits/rejected": -1.2909801006317139, + "logps/chosen": -482.1180114746094, + "logps/rejected": -852.4668579101562, + "loss": 0.3232, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.214806079864502, + "rewards/margins": 3.309603214263916, + "rewards/rejected": -4.524409294128418, + "step": 1047 + }, + { + "epoch": 0.9, + "grad_norm": 47.26993737738886, + "learning_rate": 6.179570764717179e-08, + "logits/chosen": -1.4263463020324707, + "logits/rejected": -1.2946155071258545, + "logps/chosen": -424.43975830078125, + "logps/rejected": -778.6882934570312, + "loss": 0.276, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0966931581497192, + "rewards/margins": 3.0400359630584717, + "rewards/rejected": -4.1367292404174805, + "step": 1048 + }, + { + "epoch": 0.9, + "grad_norm": 82.17354273413754, + "learning_rate": 6.076345478152533e-08, + "logits/chosen": -1.319034218788147, + "logits/rejected": -1.2957336902618408, + "logps/chosen": -635.9437866210938, + "logps/rejected": -813.4094848632812, + "loss": 0.4827, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7995774745941162, + "rewards/margins": 2.410430669784546, + "rewards/rejected": -4.210008144378662, + "step": 1049 + }, + { + "epoch": 0.9, + "grad_norm": 60.38245546238117, + "learning_rate": 5.973962601419569e-08, + "logits/chosen": -1.383164644241333, + "logits/rejected": -1.2681220769882202, + "logps/chosen": -520.2368774414062, + "logps/rejected": -813.9752807617188, + "loss": 0.383, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1653625965118408, + "rewards/margins": 3.8374204635620117, + "rewards/rejected": -5.002782821655273, + "step": 1050 + }, + { + "epoch": 0.9, + "grad_norm": 54.869719235412234, + "learning_rate": 5.872423052799636e-08, + "logits/chosen": -1.3539948463439941, + "logits/rejected": -1.324246883392334, + "logps/chosen": -618.18896484375, + "logps/rejected": -643.138427734375, + "loss": 0.324, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3248660564422607, + "rewards/margins": 2.106539726257324, + "rewards/rejected": -3.431406021118164, + "step": 1051 + }, + { + "epoch": 0.9, + "grad_norm": 79.63429487086702, + "learning_rate": 5.771727743010213e-08, + "logits/chosen": -1.3471636772155762, + "logits/rejected": -1.3674094676971436, + "logps/chosen": -564.186279296875, + "logps/rejected": -493.5242919921875, + "loss": 0.6197, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8813424110412598, + "rewards/margins": 0.9798173308372498, + "rewards/rejected": -2.8611598014831543, + "step": 1052 + }, + { + "epoch": 0.9, + "grad_norm": 56.72483919511824, + "learning_rate": 5.6718775751967486e-08, + "logits/chosen": -1.3621277809143066, + "logits/rejected": -1.3243482112884521, + "logps/chosen": -392.24176025390625, + "logps/rejected": -494.3200378417969, + "loss": 0.2933, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9714163541793823, + "rewards/margins": 2.1407787799835205, + "rewards/rejected": -3.1121950149536133, + "step": 1053 + }, + { + "epoch": 0.9, + "grad_norm": 71.94034161177133, + "learning_rate": 5.5728734449244865e-08, + "logits/chosen": -1.309327483177185, + "logits/rejected": -1.2972667217254639, + "logps/chosen": -524.4185791015625, + "logps/rejected": -532.0087890625, + "loss": 0.374, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9590820074081421, + "rewards/margins": 1.8255786895751953, + "rewards/rejected": -2.784660816192627, + "step": 1054 + }, + { + "epoch": 0.9, + "grad_norm": 41.31298907116539, + "learning_rate": 5.4747162401705295e-08, + "logits/chosen": -1.336936354637146, + "logits/rejected": -1.2083768844604492, + "logps/chosen": -544.3446655273438, + "logps/rejected": -971.6831665039062, + "loss": 0.1877, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7546546459197998, + "rewards/margins": 3.293325901031494, + "rewards/rejected": -5.047980785369873, + "step": 1055 + }, + { + "epoch": 0.91, + "grad_norm": 40.80957672653119, + "learning_rate": 5.377406841315801e-08, + "logits/chosen": -1.3370170593261719, + "logits/rejected": -1.299474835395813, + "logps/chosen": -566.1239013671875, + "logps/rejected": -749.313232421875, + "loss": 0.1484, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3078569173812866, + "rewards/margins": 2.86323881149292, + "rewards/rejected": -4.171095848083496, + "step": 1056 + }, + { + "epoch": 0.91, + "grad_norm": 48.62880251246643, + "learning_rate": 5.280946121137186e-08, + "logits/chosen": -1.4300320148468018, + "logits/rejected": -1.3434855937957764, + "logps/chosen": -404.80902099609375, + "logps/rejected": -617.2060546875, + "loss": 0.2349, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1375755071640015, + "rewards/margins": 2.7895593643188477, + "rewards/rejected": -3.9271347522735596, + "step": 1057 + }, + { + "epoch": 0.91, + "grad_norm": 32.04470778039381, + "learning_rate": 5.185334944799691e-08, + "logits/chosen": -1.2941210269927979, + "logits/rejected": -1.246250867843628, + "logps/chosen": -477.75714111328125, + "logps/rejected": -722.9541625976562, + "loss": 0.1269, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9855974912643433, + "rewards/margins": 2.813016891479492, + "rewards/rejected": -3.798614263534546, + "step": 1058 + }, + { + "epoch": 0.91, + "grad_norm": 60.862123746447736, + "learning_rate": 5.0905741698486714e-08, + "logits/chosen": -1.3242413997650146, + "logits/rejected": -1.3037867546081543, + "logps/chosen": -564.5498046875, + "logps/rejected": -639.8648681640625, + "loss": 0.3195, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3947079181671143, + "rewards/margins": 1.843632459640503, + "rewards/rejected": -3.238340377807617, + "step": 1059 + }, + { + "epoch": 0.91, + "grad_norm": 66.02784035664585, + "learning_rate": 4.996664646202176e-08, + "logits/chosen": -1.3540661334991455, + "logits/rejected": -1.2995530366897583, + "logps/chosen": -457.6804504394531, + "logps/rejected": -669.781005859375, + "loss": 0.2487, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1613222360610962, + "rewards/margins": 2.4262075424194336, + "rewards/rejected": -3.5875296592712402, + "step": 1060 + }, + { + "epoch": 0.91, + "grad_norm": 37.626471741124945, + "learning_rate": 4.903607216143302e-08, + "logits/chosen": -1.3499476909637451, + "logits/rejected": -1.3001892566680908, + "logps/chosen": -513.4680786132812, + "logps/rejected": -740.8880615234375, + "loss": 0.159, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5764918327331543, + "rewards/margins": 2.7328219413757324, + "rewards/rejected": -4.309313774108887, + "step": 1061 + }, + { + "epoch": 0.91, + "grad_norm": 60.42659277512601, + "learning_rate": 4.811402714312629e-08, + "logits/chosen": -1.3405532836914062, + "logits/rejected": -1.2848247289657593, + "logps/chosen": -594.1248779296875, + "logps/rejected": -725.2188720703125, + "loss": 0.3342, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9339052438735962, + "rewards/margins": 1.9501739740371704, + "rewards/rejected": -3.8840792179107666, + "step": 1062 + }, + { + "epoch": 0.91, + "grad_norm": 217.25995878770206, + "learning_rate": 4.720051967700767e-08, + "logits/chosen": -1.366823673248291, + "logits/rejected": -1.2912871837615967, + "logps/chosen": -408.3677062988281, + "logps/rejected": -631.322998046875, + "loss": 0.3042, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2799038887023926, + "rewards/margins": 2.1277143955230713, + "rewards/rejected": -3.407618284225464, + "step": 1063 + }, + { + "epoch": 0.91, + "grad_norm": 84.30541015310905, + "learning_rate": 4.629555795640872e-08, + "logits/chosen": -1.3612558841705322, + "logits/rejected": -1.2869057655334473, + "logps/chosen": -506.45849609375, + "logps/rejected": -751.8193359375, + "loss": 0.52, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.497196912765503, + "rewards/margins": 2.4841039180755615, + "rewards/rejected": -3.9813010692596436, + "step": 1064 + }, + { + "epoch": 0.91, + "grad_norm": 81.58503198700524, + "learning_rate": 4.539915009801376e-08, + "logits/chosen": -1.411940574645996, + "logits/rejected": -1.403637409210205, + "logps/chosen": -568.0249633789062, + "logps/rejected": -625.3397216796875, + "loss": 0.2493, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9405019283294678, + "rewards/margins": 2.0997798442840576, + "rewards/rejected": -3.0402820110321045, + "step": 1065 + }, + { + "epoch": 0.91, + "grad_norm": 52.81874353691749, + "learning_rate": 4.4511304141787054e-08, + "logits/chosen": -1.391019582748413, + "logits/rejected": -1.3759312629699707, + "logps/chosen": -340.2884521484375, + "logps/rejected": -414.40118408203125, + "loss": 0.3076, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9816931486129761, + "rewards/margins": 1.8901349306106567, + "rewards/rejected": -2.871828079223633, + "step": 1066 + }, + { + "epoch": 0.92, + "grad_norm": 57.871120470857946, + "learning_rate": 4.363202805089972e-08, + "logits/chosen": -1.3674225807189941, + "logits/rejected": -1.3060979843139648, + "logps/chosen": -414.9110107421875, + "logps/rejected": -597.43505859375, + "loss": 0.3054, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1439032554626465, + "rewards/margins": 1.9327505826950073, + "rewards/rejected": -3.0766537189483643, + "step": 1067 + }, + { + "epoch": 0.92, + "grad_norm": 39.46963037255718, + "learning_rate": 4.276132971165936e-08, + "logits/chosen": -1.3694632053375244, + "logits/rejected": -1.3286242485046387, + "logps/chosen": -538.0938720703125, + "logps/rejected": -737.7407836914062, + "loss": 0.1362, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7101688385009766, + "rewards/margins": 2.513315200805664, + "rewards/rejected": -4.223484039306641, + "step": 1068 + }, + { + "epoch": 0.92, + "grad_norm": 31.99320375940036, + "learning_rate": 4.18992169334389e-08, + "logits/chosen": -1.329564094543457, + "logits/rejected": -1.255440592765808, + "logps/chosen": -491.8975524902344, + "logps/rejected": -717.998046875, + "loss": 0.1948, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9839257001876831, + "rewards/margins": 2.8212757110595703, + "rewards/rejected": -3.805201530456543, + "step": 1069 + }, + { + "epoch": 0.92, + "grad_norm": 46.43656004948136, + "learning_rate": 4.104569744860642e-08, + "logits/chosen": -1.3599036931991577, + "logits/rejected": -1.3458278179168701, + "logps/chosen": -497.247314453125, + "logps/rejected": -589.5523681640625, + "loss": 0.3527, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3754032850265503, + "rewards/margins": 1.8670722246170044, + "rewards/rejected": -3.2424755096435547, + "step": 1070 + }, + { + "epoch": 0.92, + "grad_norm": 59.26817635134149, + "learning_rate": 4.020077891245621e-08, + "logits/chosen": -1.314011812210083, + "logits/rejected": -1.30635404586792, + "logps/chosen": -509.9771728515625, + "logps/rejected": -611.1748657226562, + "loss": 0.3302, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4132881164550781, + "rewards/margins": 1.814769983291626, + "rewards/rejected": -3.228058099746704, + "step": 1071 + }, + { + "epoch": 0.92, + "grad_norm": 55.05893563291395, + "learning_rate": 3.9364468903139825e-08, + "logits/chosen": -1.3378361463546753, + "logits/rejected": -1.3174649477005005, + "logps/chosen": -666.12109375, + "logps/rejected": -693.6858520507812, + "loss": 0.2653, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.633272409439087, + "rewards/margins": 2.0344698429107666, + "rewards/rejected": -3.6677422523498535, + "step": 1072 + }, + { + "epoch": 0.92, + "grad_norm": 66.25089057476616, + "learning_rate": 3.85367749215979e-08, + "logits/chosen": -1.3286153078079224, + "logits/rejected": -1.3204158544540405, + "logps/chosen": -591.8280029296875, + "logps/rejected": -673.842529296875, + "loss": 0.3108, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0493413209915161, + "rewards/margins": 2.0450916290283203, + "rewards/rejected": -3.094432830810547, + "step": 1073 + }, + { + "epoch": 0.92, + "grad_norm": 46.524895866045775, + "learning_rate": 3.7717704391493466e-08, + "logits/chosen": -1.3015832901000977, + "logits/rejected": -1.2925136089324951, + "logps/chosen": -576.459716796875, + "logps/rejected": -705.7184448242188, + "loss": 0.2449, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1967111825942993, + "rewards/margins": 2.4815011024475098, + "rewards/rejected": -3.6782124042510986, + "step": 1074 + }, + { + "epoch": 0.92, + "grad_norm": 67.7835331038535, + "learning_rate": 3.6907264659144846e-08, + "logits/chosen": -1.3371295928955078, + "logits/rejected": -1.2966009378433228, + "logps/chosen": -502.981201171875, + "logps/rejected": -667.0936889648438, + "loss": 0.306, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9223822355270386, + "rewards/margins": 2.2916440963745117, + "rewards/rejected": -3.2140259742736816, + "step": 1075 + }, + { + "epoch": 0.92, + "grad_norm": 98.2426503459349, + "learning_rate": 3.6105462993459956e-08, + "logits/chosen": -1.359288215637207, + "logits/rejected": -1.3329675197601318, + "logps/chosen": -543.23291015625, + "logps/rejected": -600.3795166015625, + "loss": 0.6025, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5733633041381836, + "rewards/margins": 1.2107982635498047, + "rewards/rejected": -2.7841615676879883, + "step": 1076 + }, + { + "epoch": 0.92, + "grad_norm": 62.82469312316676, + "learning_rate": 3.531230658587114e-08, + "logits/chosen": -1.3349018096923828, + "logits/rejected": -1.284590482711792, + "logps/chosen": -522.4531860351562, + "logps/rejected": -633.3223876953125, + "loss": 0.2956, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2021509408950806, + "rewards/margins": 2.3413257598876953, + "rewards/rejected": -3.5434770584106445, + "step": 1077 + }, + { + "epoch": 0.92, + "grad_norm": 63.67005318641801, + "learning_rate": 3.452780255027066e-08, + "logits/chosen": -1.4124021530151367, + "logits/rejected": -1.3744122982025146, + "logps/chosen": -489.9981689453125, + "logps/rejected": -560.9869995117188, + "loss": 0.3767, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2464032173156738, + "rewards/margins": 1.5017939805984497, + "rewards/rejected": -2.748197317123413, + "step": 1078 + }, + { + "epoch": 0.93, + "grad_norm": 82.83531057954863, + "learning_rate": 3.375195792294694e-08, + "logits/chosen": -1.2610230445861816, + "logits/rejected": -1.2093844413757324, + "logps/chosen": -511.044677734375, + "logps/rejected": -661.511474609375, + "loss": 0.5594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3683613538742065, + "rewards/margins": 1.7111997604370117, + "rewards/rejected": -3.079561233520508, + "step": 1079 + }, + { + "epoch": 0.93, + "grad_norm": 63.97628020971092, + "learning_rate": 3.298477966252089e-08, + "logits/chosen": -1.346972942352295, + "logits/rejected": -1.232084035873413, + "logps/chosen": -533.18408203125, + "logps/rejected": -757.748291015625, + "loss": 0.2487, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.215575933456421, + "rewards/margins": 2.8929176330566406, + "rewards/rejected": -4.108493804931641, + "step": 1080 + }, + { + "epoch": 0.93, + "grad_norm": 40.72641388442564, + "learning_rate": 3.222627464988459e-08, + "logits/chosen": -1.4029819965362549, + "logits/rejected": -1.268467903137207, + "logps/chosen": -555.34033203125, + "logps/rejected": -960.1851806640625, + "loss": 0.2085, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8269891142845154, + "rewards/margins": 4.098667621612549, + "rewards/rejected": -4.925657272338867, + "step": 1081 + }, + { + "epoch": 0.93, + "grad_norm": 56.2349025954042, + "learning_rate": 3.1476449688138895e-08, + "logits/chosen": -1.3948179483413696, + "logits/rejected": -1.303943157196045, + "logps/chosen": -385.4075012207031, + "logps/rejected": -645.4246826171875, + "loss": 0.2414, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.961907148361206, + "rewards/margins": 2.621835947036743, + "rewards/rejected": -3.583743095397949, + "step": 1082 + }, + { + "epoch": 0.93, + "grad_norm": 71.31375486781268, + "learning_rate": 3.073531150253217e-08, + "logits/chosen": -1.3443121910095215, + "logits/rejected": -1.2920128107070923, + "logps/chosen": -523.985107421875, + "logps/rejected": -646.9969482421875, + "loss": 0.4286, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0373486280441284, + "rewards/margins": 2.374429225921631, + "rewards/rejected": -3.411777973175049, + "step": 1083 + }, + { + "epoch": 0.93, + "grad_norm": 77.27680706949431, + "learning_rate": 3.0002866740400424e-08, + "logits/chosen": -1.4241924285888672, + "logits/rejected": -1.3850486278533936, + "logps/chosen": -329.6954040527344, + "logps/rejected": -459.6412353515625, + "loss": 0.6585, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0908865928649902, + "rewards/margins": 0.8222720623016357, + "rewards/rejected": -1.913158655166626, + "step": 1084 + }, + { + "epoch": 0.93, + "grad_norm": 66.59633314582754, + "learning_rate": 2.9279121971107712e-08, + "logits/chosen": -1.328117847442627, + "logits/rejected": -1.2686291933059692, + "logps/chosen": -499.4151611328125, + "logps/rejected": -712.1885986328125, + "loss": 0.3499, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1894209384918213, + "rewards/margins": 2.26012921333313, + "rewards/rejected": -3.449550151824951, + "step": 1085 + }, + { + "epoch": 0.93, + "grad_norm": 34.118069670094165, + "learning_rate": 2.8564083685986838e-08, + "logits/chosen": -1.4095797538757324, + "logits/rejected": -1.3639633655548096, + "logps/chosen": -383.1153564453125, + "logps/rejected": -568.6919555664062, + "loss": 0.2055, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0487778186798096, + "rewards/margins": 1.981377124786377, + "rewards/rejected": -3.0301549434661865, + "step": 1086 + }, + { + "epoch": 0.93, + "grad_norm": 86.39325868481784, + "learning_rate": 2.785775829828152e-08, + "logits/chosen": -1.3580666780471802, + "logits/rejected": -1.2918117046356201, + "logps/chosen": -563.025634765625, + "logps/rejected": -764.1182250976562, + "loss": 0.5369, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4875305891036987, + "rewards/margins": 2.3983347415924072, + "rewards/rejected": -3.8858656883239746, + "step": 1087 + }, + { + "epoch": 0.93, + "grad_norm": 30.38437626026005, + "learning_rate": 2.7160152143088533e-08, + "logits/chosen": -1.3298563957214355, + "logits/rejected": -1.2902374267578125, + "logps/chosen": -627.9073486328125, + "logps/rejected": -791.8634643554688, + "loss": 0.126, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1206411123275757, + "rewards/margins": 3.556591033935547, + "rewards/rejected": -4.677231788635254, + "step": 1088 + }, + { + "epoch": 0.93, + "grad_norm": 108.26552979078068, + "learning_rate": 2.6471271477301328e-08, + "logits/chosen": -1.32789945602417, + "logits/rejected": -1.2792022228240967, + "logps/chosen": -668.3905029296875, + "logps/rejected": -888.897705078125, + "loss": 0.6575, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9736433029174805, + "rewards/margins": 2.0397772789001465, + "rewards/rejected": -4.013420581817627, + "step": 1089 + }, + { + "epoch": 0.93, + "grad_norm": 113.96211501772365, + "learning_rate": 2.5791122479553505e-08, + "logits/chosen": -1.3528571128845215, + "logits/rejected": -1.3356266021728516, + "logps/chosen": -504.17742919921875, + "logps/rejected": -588.0662841796875, + "loss": 0.6183, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3590521812438965, + "rewards/margins": 1.5391099452972412, + "rewards/rejected": -2.8981621265411377, + "step": 1090 + }, + { + "epoch": 0.94, + "grad_norm": 62.84055237165941, + "learning_rate": 2.5119711250163323e-08, + "logits/chosen": -1.3433167934417725, + "logits/rejected": -1.3004448413848877, + "logps/chosen": -630.4215087890625, + "logps/rejected": -844.3784790039062, + "loss": 0.2892, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4850654602050781, + "rewards/margins": 2.3362314701080322, + "rewards/rejected": -3.8212971687316895, + "step": 1091 + }, + { + "epoch": 0.94, + "grad_norm": 49.72033236294848, + "learning_rate": 2.445704381107949e-08, + "logits/chosen": -1.3576364517211914, + "logits/rejected": -1.2695292234420776, + "logps/chosen": -506.31060791015625, + "logps/rejected": -802.626953125, + "loss": 0.267, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1525599956512451, + "rewards/margins": 3.1654715538024902, + "rewards/rejected": -4.318031311035156, + "step": 1092 + }, + { + "epoch": 0.94, + "grad_norm": 35.0255603595314, + "learning_rate": 2.380312610582691e-08, + "logits/chosen": -1.3337469100952148, + "logits/rejected": -1.279638648033142, + "logps/chosen": -454.1136474609375, + "logps/rejected": -615.020263671875, + "loss": 0.2116, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.780484676361084, + "rewards/margins": 2.418971061706543, + "rewards/rejected": -3.199455738067627, + "step": 1093 + }, + { + "epoch": 0.94, + "grad_norm": 46.86951995562813, + "learning_rate": 2.31579639994528e-08, + "logits/chosen": -1.363749384880066, + "logits/rejected": -1.3308840990066528, + "logps/chosen": -592.7799072265625, + "logps/rejected": -634.8521728515625, + "loss": 0.1967, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1438612937927246, + "rewards/margins": 2.781632900238037, + "rewards/rejected": -3.9254941940307617, + "step": 1094 + }, + { + "epoch": 0.94, + "grad_norm": 90.86907239722963, + "learning_rate": 2.252156327847543e-08, + "logits/chosen": -1.3248136043548584, + "logits/rejected": -1.3026912212371826, + "logps/chosen": -611.6460571289062, + "logps/rejected": -674.7425537109375, + "loss": 0.7975, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9554386138916016, + "rewards/margins": 1.4960346221923828, + "rewards/rejected": -3.4514732360839844, + "step": 1095 + }, + { + "epoch": 0.94, + "grad_norm": 55.03351828353279, + "learning_rate": 2.189392965083059e-08, + "logits/chosen": -1.3430871963500977, + "logits/rejected": -1.294329047203064, + "logps/chosen": -702.509521484375, + "logps/rejected": -968.4942626953125, + "loss": 0.1352, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1937084197998047, + "rewards/margins": 4.060674667358398, + "rewards/rejected": -5.254383087158203, + "step": 1096 + }, + { + "epoch": 0.94, + "grad_norm": 92.13970354226072, + "learning_rate": 2.1275068745821743e-08, + "logits/chosen": -1.4347259998321533, + "logits/rejected": -1.3344197273254395, + "logps/chosen": -487.1264953613281, + "logps/rejected": -683.416748046875, + "loss": 0.7083, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7204651832580566, + "rewards/margins": 1.733154535293579, + "rewards/rejected": -3.4536197185516357, + "step": 1097 + }, + { + "epoch": 0.94, + "grad_norm": 34.39366187826271, + "learning_rate": 2.0664986114068973e-08, + "logits/chosen": -1.3739490509033203, + "logits/rejected": -1.3129985332489014, + "logps/chosen": -410.9372863769531, + "logps/rejected": -514.8299560546875, + "loss": 0.2945, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.940898060798645, + "rewards/margins": 1.9497122764587402, + "rewards/rejected": -2.8906102180480957, + "step": 1098 + }, + { + "epoch": 0.94, + "grad_norm": 35.32745819666828, + "learning_rate": 2.0063687227458882e-08, + "logits/chosen": -1.345841407775879, + "logits/rejected": -1.2993870973587036, + "logps/chosen": -627.6751098632812, + "logps/rejected": -769.1766357421875, + "loss": 0.16, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2006680965423584, + "rewards/margins": 2.500814437866211, + "rewards/rejected": -3.7014827728271484, + "step": 1099 + }, + { + "epoch": 0.94, + "grad_norm": 42.89372340874667, + "learning_rate": 1.9471177479096102e-08, + "logits/chosen": -1.2868573665618896, + "logits/rejected": -1.2914104461669922, + "logps/chosen": -471.10211181640625, + "logps/rejected": -511.0739440917969, + "loss": 0.2658, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3655877113342285, + "rewards/margins": 1.5534907579421997, + "rewards/rejected": -2.9190783500671387, + "step": 1100 + }, + { + "epoch": 0.94, + "grad_norm": 66.93499212181665, + "learning_rate": 1.8887462183254877e-08, + "logits/chosen": -1.2483763694763184, + "logits/rejected": -1.217915415763855, + "logps/chosen": -709.8128662109375, + "logps/rejected": -865.47607421875, + "loss": 0.3359, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7271713018417358, + "rewards/margins": 2.937894821166992, + "rewards/rejected": -4.665066242218018, + "step": 1101 + }, + { + "epoch": 0.95, + "grad_norm": 57.77362074884795, + "learning_rate": 1.831254657533077e-08, + "logits/chosen": -1.3025121688842773, + "logits/rejected": -1.2883214950561523, + "logps/chosen": -581.310546875, + "logps/rejected": -696.29638671875, + "loss": 0.2503, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9122763276100159, + "rewards/margins": 2.7274765968322754, + "rewards/rejected": -3.6397528648376465, + "step": 1102 + }, + { + "epoch": 0.95, + "grad_norm": 38.67736601897149, + "learning_rate": 1.7746435811794357e-08, + "logits/chosen": -1.3677195310592651, + "logits/rejected": -1.3084774017333984, + "logps/chosen": -369.76910400390625, + "logps/rejected": -629.9393310546875, + "loss": 0.162, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6241545677185059, + "rewards/margins": 2.79376482963562, + "rewards/rejected": -3.417919635772705, + "step": 1103 + }, + { + "epoch": 0.95, + "grad_norm": 37.541347259643885, + "learning_rate": 1.7189134970144847e-08, + "logits/chosen": -1.3679184913635254, + "logits/rejected": -1.3461575508117676, + "logps/chosen": -533.3414306640625, + "logps/rejected": -686.5682983398438, + "loss": 0.1703, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.156436562538147, + "rewards/margins": 2.7757630348205566, + "rewards/rejected": -3.932199478149414, + "step": 1104 + }, + { + "epoch": 0.95, + "grad_norm": 53.39127411417644, + "learning_rate": 1.664064904886431e-08, + "logits/chosen": -1.3563721179962158, + "logits/rejected": -1.312720775604248, + "logps/chosen": -565.919921875, + "logps/rejected": -785.633056640625, + "loss": 0.2198, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.468775987625122, + "rewards/margins": 2.6510097980499268, + "rewards/rejected": -4.119785785675049, + "step": 1105 + }, + { + "epoch": 0.95, + "grad_norm": 40.875245401109694, + "learning_rate": 1.6100982967373056e-08, + "logits/chosen": -1.3901944160461426, + "logits/rejected": -1.314322590827942, + "logps/chosen": -478.9412536621094, + "logps/rejected": -664.1520385742188, + "loss": 0.2461, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2784733772277832, + "rewards/margins": 2.814133644104004, + "rewards/rejected": -4.092606544494629, + "step": 1106 + }, + { + "epoch": 0.95, + "grad_norm": 46.94616916655286, + "learning_rate": 1.557014156598535e-08, + "logits/chosen": -1.3899496793746948, + "logits/rejected": -1.3239576816558838, + "logps/chosen": -492.5782775878906, + "logps/rejected": -698.148681640625, + "loss": 0.2926, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.575579047203064, + "rewards/margins": 3.199647903442383, + "rewards/rejected": -3.7752270698547363, + "step": 1107 + }, + { + "epoch": 0.95, + "grad_norm": 88.45774590135527, + "learning_rate": 1.5048129605866433e-08, + "logits/chosen": -1.2571226358413696, + "logits/rejected": -1.2514090538024902, + "logps/chosen": -723.9155883789062, + "logps/rejected": -808.6048583984375, + "loss": 0.4931, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6201335191726685, + "rewards/margins": 2.539370059967041, + "rewards/rejected": -4.159502983093262, + "step": 1108 + }, + { + "epoch": 0.95, + "grad_norm": 29.18465847959277, + "learning_rate": 1.4534951768989e-08, + "logits/chosen": -1.346365213394165, + "logits/rejected": -1.2711735963821411, + "logps/chosen": -389.49078369140625, + "logps/rejected": -583.47802734375, + "loss": 0.1784, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0953283309936523, + "rewards/margins": 2.546407699584961, + "rewards/rejected": -3.6417360305786133, + "step": 1109 + }, + { + "epoch": 0.95, + "grad_norm": 28.249185451400095, + "learning_rate": 1.403061265809191e-08, + "logits/chosen": -1.3683123588562012, + "logits/rejected": -1.2581026554107666, + "logps/chosen": -507.8565979003906, + "logps/rejected": -826.8822021484375, + "loss": 0.1604, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.20506751537323, + "rewards/margins": 3.6192493438720703, + "rewards/rejected": -4.824316501617432, + "step": 1110 + }, + { + "epoch": 0.95, + "grad_norm": 85.02557735435447, + "learning_rate": 1.3535116796638767e-08, + "logits/chosen": -1.340364933013916, + "logits/rejected": -1.3544952869415283, + "logps/chosen": -616.94091796875, + "logps/rejected": -598.183837890625, + "loss": 0.4654, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.400769591331482, + "rewards/margins": 1.4828534126281738, + "rewards/rejected": -2.8836231231689453, + "step": 1111 + }, + { + "epoch": 0.95, + "grad_norm": 72.5093628753269, + "learning_rate": 1.3048468628777398e-08, + "logits/chosen": -1.3603501319885254, + "logits/rejected": -1.28261137008667, + "logps/chosen": -603.1817626953125, + "logps/rejected": -794.6780395507812, + "loss": 0.6567, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2409727573394775, + "rewards/margins": 2.7074697017669678, + "rewards/rejected": -3.9484424591064453, + "step": 1112 + }, + { + "epoch": 0.95, + "grad_norm": 62.751686560930395, + "learning_rate": 1.2570672519299108e-08, + "logits/chosen": -1.385198712348938, + "logits/rejected": -1.3630871772766113, + "logps/chosen": -443.33050537109375, + "logps/rejected": -560.1493530273438, + "loss": 0.3856, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.240035891532898, + "rewards/margins": 2.105348587036133, + "rewards/rejected": -3.3453845977783203, + "step": 1113 + }, + { + "epoch": 0.96, + "grad_norm": 45.999675175415945, + "learning_rate": 1.2101732753601379e-08, + "logits/chosen": -1.3332526683807373, + "logits/rejected": -1.305140495300293, + "logps/chosen": -409.5257568359375, + "logps/rejected": -439.9317626953125, + "loss": 0.3374, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9548590183258057, + "rewards/margins": 1.4134576320648193, + "rewards/rejected": -2.368316650390625, + "step": 1114 + }, + { + "epoch": 0.96, + "grad_norm": 59.87516963513788, + "learning_rate": 1.1641653537647456e-08, + "logits/chosen": -1.2905126810073853, + "logits/rejected": -1.2474026679992676, + "logps/chosen": -825.4317626953125, + "logps/rejected": -992.7080688476562, + "loss": 0.2083, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1057610511779785, + "rewards/margins": 3.1582183837890625, + "rewards/rejected": -5.263979911804199, + "step": 1115 + }, + { + "epoch": 0.96, + "grad_norm": 63.899841777084106, + "learning_rate": 1.119043899792993e-08, + "logits/chosen": -1.3253779411315918, + "logits/rejected": -1.2959191799163818, + "logps/chosen": -440.66455078125, + "logps/rejected": -645.551513671875, + "loss": 0.3568, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3789682388305664, + "rewards/margins": 2.5655908584594727, + "rewards/rejected": -3.944559097290039, + "step": 1116 + }, + { + "epoch": 0.96, + "grad_norm": 31.935937833275386, + "learning_rate": 1.0748093181433216e-08, + "logits/chosen": -1.360658049583435, + "logits/rejected": -1.3096072673797607, + "logps/chosen": -497.0452575683594, + "logps/rejected": -653.573974609375, + "loss": 0.2061, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.161726474761963, + "rewards/margins": 2.291126251220703, + "rewards/rejected": -3.452852487564087, + "step": 1117 + }, + { + "epoch": 0.96, + "grad_norm": 74.0455033197263, + "learning_rate": 1.0314620055597246e-08, + "logits/chosen": -1.32560396194458, + "logits/rejected": -1.2965167760849, + "logps/chosen": -603.4779052734375, + "logps/rejected": -789.289794921875, + "loss": 0.3315, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2657091617584229, + "rewards/margins": 2.866349697113037, + "rewards/rejected": -4.132059097290039, + "step": 1118 + }, + { + "epoch": 0.96, + "grad_norm": 48.797521802242606, + "learning_rate": 9.890023508282165e-09, + "logits/chosen": -1.3630735874176025, + "logits/rejected": -1.3144967555999756, + "logps/chosen": -476.443115234375, + "logps/rejected": -639.4280395507812, + "loss": 0.2791, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7719219326972961, + "rewards/margins": 2.7206759452819824, + "rewards/rejected": -3.492598056793213, + "step": 1119 + }, + { + "epoch": 0.96, + "grad_norm": 75.32071662583112, + "learning_rate": 9.474307347733024e-09, + "logits/chosen": -1.3259435892105103, + "logits/rejected": -1.2706059217453003, + "logps/chosen": -482.0497741699219, + "logps/rejected": -624.172607421875, + "loss": 0.466, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3574128150939941, + "rewards/margins": 1.9411921501159668, + "rewards/rejected": -3.298604965209961, + "step": 1120 + }, + { + "epoch": 0.96, + "grad_norm": 21.492431931667138, + "learning_rate": 9.067475302546147e-09, + "logits/chosen": -1.2945029735565186, + "logits/rejected": -1.2453341484069824, + "logps/chosen": -823.6737060546875, + "logps/rejected": -1095.56640625, + "loss": 0.0689, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8724188804626465, + "rewards/margins": 3.2580301761627197, + "rewards/rejected": -5.130449295043945, + "step": 1121 + }, + { + "epoch": 0.96, + "grad_norm": 18.955868304864357, + "learning_rate": 8.669531021635257e-09, + "logits/chosen": -1.3908123970031738, + "logits/rejected": -1.3216614723205566, + "logps/chosen": -368.7554016113281, + "logps/rejected": -699.5101928710938, + "loss": 0.2035, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7341961860656738, + "rewards/margins": 3.5546658039093018, + "rewards/rejected": -4.288862228393555, + "step": 1122 + }, + { + "epoch": 0.96, + "grad_norm": 54.82451185740064, + "learning_rate": 8.28047807419885e-09, + "logits/chosen": -1.3758102655410767, + "logits/rejected": -1.298590898513794, + "logps/chosen": -530.2117919921875, + "logps/rejected": -753.3561401367188, + "loss": 0.2113, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0154526233673096, + "rewards/margins": 3.282680034637451, + "rewards/rejected": -4.29813289642334, + "step": 1123 + }, + { + "epoch": 0.96, + "grad_norm": 23.3833093803656, + "learning_rate": 7.900319949688427e-09, + "logits/chosen": -1.3352243900299072, + "logits/rejected": -1.2280926704406738, + "logps/chosen": -428.10418701171875, + "logps/rejected": -751.018310546875, + "loss": 0.1816, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.796173632144928, + "rewards/margins": 3.192791223526001, + "rewards/rejected": -3.9889650344848633, + "step": 1124 + }, + { + "epoch": 0.96, + "grad_norm": 42.988311388156085, + "learning_rate": 7.529060057776981e-09, + "logits/chosen": -1.351731300354004, + "logits/rejected": -1.2667642831802368, + "logps/chosen": -568.9866333007812, + "logps/rejected": -843.7405395507812, + "loss": 0.1532, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3037387132644653, + "rewards/margins": 2.6970062255859375, + "rewards/rejected": -4.0007452964782715, + "step": 1125 + }, + { + "epoch": 0.97, + "grad_norm": 58.589929396690955, + "learning_rate": 7.1667017283281176e-09, + "logits/chosen": -1.2922766208648682, + "logits/rejected": -1.253503680229187, + "logps/chosen": -500.7716979980469, + "logps/rejected": -637.717529296875, + "loss": 0.3383, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3333024978637695, + "rewards/margins": 2.46457839012146, + "rewards/rejected": -3.7978806495666504, + "step": 1126 + }, + { + "epoch": 0.97, + "grad_norm": 62.986156527543464, + "learning_rate": 6.813248211366973e-09, + "logits/chosen": -1.3930134773254395, + "logits/rejected": -1.2740166187286377, + "logps/chosen": -618.4913940429688, + "logps/rejected": -840.85888671875, + "loss": 0.2634, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4361460208892822, + "rewards/margins": 2.6445565223693848, + "rewards/rejected": -4.080702781677246, + "step": 1127 + }, + { + "epoch": 0.97, + "grad_norm": 32.701636879677764, + "learning_rate": 6.468702677050464e-09, + "logits/chosen": -1.2866826057434082, + "logits/rejected": -1.1640338897705078, + "logps/chosen": -562.0260009765625, + "logps/rejected": -916.1024169921875, + "loss": 0.0994, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3056342601776123, + "rewards/margins": 3.7452681064605713, + "rewards/rejected": -5.050902366638184, + "step": 1128 + }, + { + "epoch": 0.97, + "grad_norm": 79.45710287733499, + "learning_rate": 6.133068215638748e-09, + "logits/chosen": -1.354970932006836, + "logits/rejected": -1.2986483573913574, + "logps/chosen": -446.92877197265625, + "logps/rejected": -665.33203125, + "loss": 0.4294, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3372687101364136, + "rewards/margins": 2.512016773223877, + "rewards/rejected": -3.84928560256958, + "step": 1129 + }, + { + "epoch": 0.97, + "grad_norm": 49.14039347193333, + "learning_rate": 5.8063478374680285e-09, + "logits/chosen": -1.319446325302124, + "logits/rejected": -1.3057183027267456, + "logps/chosen": -609.715087890625, + "logps/rejected": -698.795654296875, + "loss": 0.1921, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2187325954437256, + "rewards/margins": 2.7987396717071533, + "rewards/rejected": -4.017472267150879, + "step": 1130 + }, + { + "epoch": 0.97, + "grad_norm": 66.63403806754211, + "learning_rate": 5.48854447292324e-09, + "logits/chosen": -1.3315446376800537, + "logits/rejected": -1.2545630931854248, + "logps/chosen": -419.51055908203125, + "logps/rejected": -618.6432495117188, + "loss": 0.4235, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0476508140563965, + "rewards/margins": 2.285956382751465, + "rewards/rejected": -3.3336071968078613, + "step": 1131 + }, + { + "epoch": 0.97, + "grad_norm": 57.07860454385766, + "learning_rate": 5.179660972411848e-09, + "logits/chosen": -1.383811116218567, + "logits/rejected": -1.3193073272705078, + "logps/chosen": -589.6669311523438, + "logps/rejected": -812.602783203125, + "loss": 0.2477, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4525666236877441, + "rewards/margins": 2.547089099884033, + "rewards/rejected": -3.9996557235717773, + "step": 1132 + }, + { + "epoch": 0.97, + "grad_norm": 87.49969249077617, + "learning_rate": 4.87970010633798e-09, + "logits/chosen": -1.3547697067260742, + "logits/rejected": -1.292210578918457, + "logps/chosen": -366.54345703125, + "logps/rejected": -503.3235168457031, + "loss": 0.499, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.019222617149353, + "rewards/margins": 1.5081883668899536, + "rewards/rejected": -2.5274109840393066, + "step": 1133 + }, + { + "epoch": 0.97, + "grad_norm": 77.39361614522927, + "learning_rate": 4.588664565078115e-09, + "logits/chosen": -1.2798162698745728, + "logits/rejected": -1.211428165435791, + "logps/chosen": -636.0787353515625, + "logps/rejected": -900.0787353515625, + "loss": 0.6096, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2634141445159912, + "rewards/margins": 2.9379217624664307, + "rewards/rejected": -4.201335906982422, + "step": 1134 + }, + { + "epoch": 0.97, + "grad_norm": 67.03503487794497, + "learning_rate": 4.3065569589565424e-09, + "logits/chosen": -1.3400976657867432, + "logits/rejected": -1.3143208026885986, + "logps/chosen": -568.5703735351562, + "logps/rejected": -604.925048828125, + "loss": 0.4941, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1162084341049194, + "rewards/margins": 1.7637848854064941, + "rewards/rejected": -2.879993438720703, + "step": 1135 + }, + { + "epoch": 0.97, + "grad_norm": 57.41363643734463, + "learning_rate": 4.033379818222271e-09, + "logits/chosen": -1.337462067604065, + "logits/rejected": -1.3244147300720215, + "logps/chosen": -488.3711853027344, + "logps/rejected": -537.2532958984375, + "loss": 0.3174, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7379860877990723, + "rewards/margins": 1.4863120317459106, + "rewards/rejected": -3.2242980003356934, + "step": 1136 + }, + { + "epoch": 0.98, + "grad_norm": 61.66213574933086, + "learning_rate": 3.769135593025941e-09, + "logits/chosen": -1.3855712413787842, + "logits/rejected": -1.3328797817230225, + "logps/chosen": -437.3864440917969, + "logps/rejected": -679.2550048828125, + "loss": 0.3173, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0746511220932007, + "rewards/margins": 2.411076545715332, + "rewards/rejected": -3.485727548599243, + "step": 1137 + }, + { + "epoch": 0.98, + "grad_norm": 47.317221747492454, + "learning_rate": 3.5138266533980553e-09, + "logits/chosen": -1.3714405298233032, + "logits/rejected": -1.3022525310516357, + "logps/chosen": -397.1058349609375, + "logps/rejected": -654.9545288085938, + "loss": 0.2783, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5204839706420898, + "rewards/margins": 2.413515090942383, + "rewards/rejected": -3.9339990615844727, + "step": 1138 + }, + { + "epoch": 0.98, + "grad_norm": 53.52800108897222, + "learning_rate": 3.267455289227894e-09, + "logits/chosen": -1.321838617324829, + "logits/rejected": -1.2986551523208618, + "logps/chosen": -569.0445556640625, + "logps/rejected": -653.1036987304688, + "loss": 0.355, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3003381490707397, + "rewards/margins": 1.9961235523223877, + "rewards/rejected": -3.296461820602417, + "step": 1139 + }, + { + "epoch": 0.98, + "grad_norm": 71.56680662426476, + "learning_rate": 3.0300237102426353e-09, + "logits/chosen": -1.3216408491134644, + "logits/rejected": -1.2848577499389648, + "logps/chosen": -523.3908081054688, + "logps/rejected": -589.765625, + "loss": 0.5425, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.445241928100586, + "rewards/margins": 1.263198733329773, + "rewards/rejected": -2.7084405422210693, + "step": 1140 + }, + { + "epoch": 0.98, + "grad_norm": 47.095683960933016, + "learning_rate": 2.80153404598793e-09, + "logits/chosen": -1.435076355934143, + "logits/rejected": -1.3584879636764526, + "logps/chosen": -435.42779541015625, + "logps/rejected": -651.4534912109375, + "loss": 0.3493, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2850425243377686, + "rewards/margins": 2.3500070571899414, + "rewards/rejected": -3.63504958152771, + "step": 1141 + }, + { + "epoch": 0.98, + "grad_norm": 59.8759315859725, + "learning_rate": 2.5819883458082502e-09, + "logits/chosen": -1.349071979522705, + "logits/rejected": -1.2874417304992676, + "logps/chosen": -474.7420349121094, + "logps/rejected": -683.0074462890625, + "loss": 0.2861, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2572880983352661, + "rewards/margins": 2.384906768798828, + "rewards/rejected": -3.6421947479248047, + "step": 1142 + }, + { + "epoch": 0.98, + "grad_norm": 56.16289363032499, + "learning_rate": 2.3713885788291253e-09, + "logits/chosen": -1.3628742694854736, + "logits/rejected": -1.3211891651153564, + "logps/chosen": -480.4175720214844, + "logps/rejected": -555.6670532226562, + "loss": 0.3192, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1059783697128296, + "rewards/margins": 1.4368137121200562, + "rewards/rejected": -2.5427920818328857, + "step": 1143 + }, + { + "epoch": 0.98, + "grad_norm": 80.38296285383446, + "learning_rate": 2.1697366339391566e-09, + "logits/chosen": -1.4191553592681885, + "logits/rejected": -1.3954321146011353, + "logps/chosen": -424.06219482421875, + "logps/rejected": -492.4453125, + "loss": 0.4467, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1218348741531372, + "rewards/margins": 1.4760911464691162, + "rewards/rejected": -2.597926139831543, + "step": 1144 + }, + { + "epoch": 0.98, + "grad_norm": 64.79748784286423, + "learning_rate": 1.977034319772919e-09, + "logits/chosen": -1.317758560180664, + "logits/rejected": -1.2300162315368652, + "logps/chosen": -525.77880859375, + "logps/rejected": -677.3091430664062, + "loss": 0.1877, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2687187194824219, + "rewards/margins": 2.5530333518981934, + "rewards/rejected": -3.8217523097991943, + "step": 1145 + }, + { + "epoch": 0.98, + "grad_norm": 117.9999058266044, + "learning_rate": 1.7932833646950862e-09, + "logits/chosen": -1.391584873199463, + "logits/rejected": -1.3528341054916382, + "logps/chosen": -698.065185546875, + "logps/rejected": -626.8812255859375, + "loss": 0.6825, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.786995530128479, + "rewards/margins": 1.257378101348877, + "rewards/rejected": -3.0443735122680664, + "step": 1146 + }, + { + "epoch": 0.98, + "grad_norm": 120.74778768884202, + "learning_rate": 1.6184854167847762e-09, + "logits/chosen": -1.3571929931640625, + "logits/rejected": -1.308891773223877, + "logps/chosen": -628.2859497070312, + "logps/rejected": -804.627197265625, + "loss": 0.9244, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9744269847869873, + "rewards/margins": 0.7434270977973938, + "rewards/rejected": -2.7178540229797363, + "step": 1147 + }, + { + "epoch": 0.98, + "grad_norm": 43.30334270730007, + "learning_rate": 1.4526420438207843e-09, + "logits/chosen": -1.3464512825012207, + "logits/rejected": -1.3035533428192139, + "logps/chosen": -432.1238708496094, + "logps/rejected": -540.8009033203125, + "loss": 0.3103, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1443607807159424, + "rewards/margins": 2.0538125038146973, + "rewards/rejected": -3.1981732845306396, + "step": 1148 + }, + { + "epoch": 0.99, + "grad_norm": 73.63501253133734, + "learning_rate": 1.2957547332673735e-09, + "logits/chosen": -1.4333579540252686, + "logits/rejected": -1.3868712186813354, + "logps/chosen": -352.1337890625, + "logps/rejected": -462.68682861328125, + "loss": 0.6665, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.07978093624115, + "rewards/margins": 1.6578272581100464, + "rewards/rejected": -2.737607955932617, + "step": 1149 + }, + { + "epoch": 0.99, + "grad_norm": 92.22993063444079, + "learning_rate": 1.1478248922611732e-09, + "logits/chosen": -1.25857675075531, + "logits/rejected": -1.2520473003387451, + "logps/chosen": -579.1795654296875, + "logps/rejected": -641.9288330078125, + "loss": 0.614, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0475940704345703, + "rewards/margins": 1.7527912855148315, + "rewards/rejected": -3.8003852367401123, + "step": 1150 + }, + { + "epoch": 0.99, + "grad_norm": 77.95791372072948, + "learning_rate": 1.0088538475985231e-09, + "logits/chosen": -1.468308687210083, + "logits/rejected": -1.372737169265747, + "logps/chosen": -440.12872314453125, + "logps/rejected": -676.8677978515625, + "loss": 0.3524, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7725510001182556, + "rewards/margins": 2.5467355251312256, + "rewards/rejected": -3.319286346435547, + "step": 1151 + }, + { + "epoch": 0.99, + "grad_norm": 37.89540413647167, + "learning_rate": 8.788428457232599e-10, + "logits/chosen": -1.327483892440796, + "logits/rejected": -1.259476900100708, + "logps/chosen": -606.5562744140625, + "logps/rejected": -799.5638427734375, + "loss": 0.2021, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3170266151428223, + "rewards/margins": 2.9507436752319336, + "rewards/rejected": -4.267770767211914, + "step": 1152 + }, + { + "epoch": 0.99, + "grad_norm": 37.3700905910909, + "learning_rate": 7.577930527160603e-10, + "logits/chosen": -1.3355379104614258, + "logits/rejected": -1.3139853477478027, + "logps/chosen": -478.2299499511719, + "logps/rejected": -658.5348510742188, + "loss": 0.1488, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8407142162322998, + "rewards/margins": 2.9635887145996094, + "rewards/rejected": -3.80430269241333, + "step": 1153 + }, + { + "epoch": 0.99, + "grad_norm": 87.25455518532338, + "learning_rate": 6.457055542834489e-10, + "logits/chosen": -1.3901220560073853, + "logits/rejected": -1.3573310375213623, + "logps/chosen": -376.05596923828125, + "logps/rejected": -494.57879638671875, + "loss": 0.2486, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0375429391860962, + "rewards/margins": 1.868727684020996, + "rewards/rejected": -2.9062705039978027, + "step": 1154 + }, + { + "epoch": 0.99, + "grad_norm": 78.72533027928456, + "learning_rate": 5.425813557485837e-10, + "logits/chosen": -1.2947056293487549, + "logits/rejected": -1.2775053977966309, + "logps/chosen": -650.7203979492188, + "logps/rejected": -684.703857421875, + "loss": 0.3896, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9509949684143066, + "rewards/margins": 1.6460188627243042, + "rewards/rejected": -3.5970139503479004, + "step": 1155 + }, + { + "epoch": 0.99, + "grad_norm": 44.70986796433435, + "learning_rate": 4.4842138204170823e-10, + "logits/chosen": -1.3698441982269287, + "logits/rejected": -1.311873197555542, + "logps/chosen": -540.0364990234375, + "logps/rejected": -709.73193359375, + "loss": 0.2309, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1100777387619019, + "rewards/margins": 2.580242872238159, + "rewards/rejected": -3.6903204917907715, + "step": 1156 + }, + { + "epoch": 0.99, + "grad_norm": 58.44263577705729, + "learning_rate": 3.632264776922689e-10, + "logits/chosen": -1.4120718240737915, + "logits/rejected": -1.3665246963500977, + "logps/chosen": -391.5028991699219, + "logps/rejected": -562.1903686523438, + "loss": 0.4485, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3291339874267578, + "rewards/margins": 1.5925970077514648, + "rewards/rejected": -2.9217309951782227, + "step": 1157 + }, + { + "epoch": 0.99, + "grad_norm": 82.61556324764253, + "learning_rate": 2.8699740682103234e-10, + "logits/chosen": -1.3903688192367554, + "logits/rejected": -1.323358178138733, + "logps/chosen": -694.0431518554688, + "logps/rejected": -721.135986328125, + "loss": 0.61, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7189762592315674, + "rewards/margins": 1.5658252239227295, + "rewards/rejected": -3.284801483154297, + "step": 1158 + }, + { + "epoch": 0.99, + "grad_norm": 44.90677636669033, + "learning_rate": 2.1973485313364626e-10, + "logits/chosen": -1.3954801559448242, + "logits/rejected": -1.3531403541564941, + "logps/chosen": -416.509033203125, + "logps/rejected": -538.574462890625, + "loss": 0.3214, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2683722972869873, + "rewards/margins": 1.4201171398162842, + "rewards/rejected": -2.6884894371032715, + "step": 1159 + }, + { + "epoch": 0.99, + "grad_norm": 49.18377325889221, + "learning_rate": 1.614394199139779e-10, + "logits/chosen": -1.4112136363983154, + "logits/rejected": -1.3324227333068848, + "logps/chosen": -446.1578063964844, + "logps/rejected": -651.1507568359375, + "loss": 0.2812, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3055462837219238, + "rewards/margins": 2.007277011871338, + "rewards/rejected": -3.3128228187561035, + "step": 1160 + }, + { + "epoch": 1.0, + "grad_norm": 60.32037934519408, + "learning_rate": 1.121116300192293e-10, + "logits/chosen": -1.3937182426452637, + "logits/rejected": -1.340111255645752, + "logps/chosen": -367.4189758300781, + "logps/rejected": -490.84564208984375, + "loss": 0.4305, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8444278240203857, + "rewards/margins": 1.7927734851837158, + "rewards/rejected": -2.6372013092041016, + "step": 1161 + }, + { + "epoch": 1.0, + "grad_norm": 43.045794842741344, + "learning_rate": 7.175192587471901e-11, + "logits/chosen": -1.3729491233825684, + "logits/rejected": -1.3426620960235596, + "logps/chosen": -493.204833984375, + "logps/rejected": -539.8018798828125, + "loss": 0.2829, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0588021278381348, + "rewards/margins": 1.6722935438156128, + "rewards/rejected": -2.731095790863037, + "step": 1162 + }, + { + "epoch": 1.0, + "grad_norm": 27.30597124731177, + "learning_rate": 4.0360669470329567e-11, + "logits/chosen": -1.4091095924377441, + "logits/rejected": -1.297658920288086, + "logps/chosen": -397.6080322265625, + "logps/rejected": -712.9400634765625, + "loss": 0.1328, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6887233257293701, + "rewards/margins": 2.8686845302581787, + "rewards/rejected": -3.557407855987549, + "step": 1163 + }, + { + "epoch": 1.0, + "grad_norm": 38.20646142690474, + "learning_rate": 1.7938142357176723e-11, + "logits/chosen": -1.3071112632751465, + "logits/rejected": -1.283097743988037, + "logps/chosen": -783.91162109375, + "logps/rejected": -929.2320556640625, + "loss": 0.1792, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7789937257766724, + "rewards/margins": 2.8627429008483887, + "rewards/rejected": -4.64173698425293, + "step": 1164 + }, + { + "epoch": 1.0, + "grad_norm": 93.66277413850928, + "learning_rate": 4.484545644833914e-12, + "logits/chosen": -1.387592077255249, + "logits/rejected": -1.3474268913269043, + "logps/chosen": -512.49560546875, + "logps/rejected": -682.5592041015625, + "loss": 0.3537, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0724601745605469, + "rewards/margins": 2.411895513534546, + "rewards/rejected": -3.4843556880950928, + "step": 1165 + }, + { + "epoch": 1.0, + "grad_norm": 42.24107456541779, + "learning_rate": 0.0, + "logits/chosen": -1.3576037883758545, + "logits/rejected": -1.2758808135986328, + "logps/chosen": -443.78759765625, + "logps/rejected": -610.3323974609375, + "loss": 0.2515, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3626399040222168, + "rewards/margins": 2.2888269424438477, + "rewards/rejected": -3.6514668464660645, + "step": 1166 + }, + { + "epoch": 1.0, + "eval_logits/chosen": -1.352773666381836, + "eval_logits/rejected": -1.2867770195007324, + "eval_logps/chosen": -451.33642578125, + "eval_logps/rejected": -657.6813354492188, + "eval_loss": 0.2841675579547882, + "eval_rewards/accuracies": 0.88671875, + "eval_rewards/chosen": -1.1548646688461304, + "eval_rewards/margins": 2.4814703464508057, + "eval_rewards/rejected": -3.6363346576690674, + "eval_runtime": 513.7019, + "eval_samples_per_second": 2.99, + "eval_steps_per_second": 0.748, + "step": 1166 + } + ], + "logging_steps": 1, + "max_steps": 1166, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}