{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1166, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 103.41960351338825, "learning_rate": 1.7094017094017096e-08, "logits/chosen": -0.7548736333847046, "logits/rejected": -0.7155890464782715, "logps/chosen": -586.5838623046875, "logps/rejected": -658.200927734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 94.92896844649115, "learning_rate": 3.418803418803419e-08, "logits/chosen": -0.7686550617218018, "logits/rejected": -0.712468147277832, "logps/chosen": -457.4859619140625, "logps/rejected": -713.6287841796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.0, "grad_norm": 90.79844942313059, "learning_rate": 5.128205128205128e-08, "logits/chosen": -0.9042060375213623, "logits/rejected": -0.8283087611198425, "logps/chosen": -373.404296875, "logps/rejected": -573.5121459960938, "loss": 0.7268, "rewards/accuracies": 0.3125, "rewards/chosen": -0.06697890162467957, "rewards/margins": -0.059918761253356934, "rewards/rejected": -0.007060145493596792, "step": 3 }, { "epoch": 0.0, "grad_norm": 113.84425518159345, "learning_rate": 6.837606837606839e-08, "logits/chosen": -0.7961872816085815, "logits/rejected": -0.7325714826583862, "logps/chosen": -538.67919921875, "logps/rejected": -809.2041015625, "loss": 0.7242, "rewards/accuracies": 0.5, "rewards/chosen": -0.049712374806404114, "rewards/margins": -0.05105018615722656, "rewards/rejected": 0.0013378113508224487, "step": 4 }, { "epoch": 0.0, "grad_norm": 97.52089397123956, "learning_rate": 8.547008547008547e-08, "logits/chosen": -0.7953340411186218, "logits/rejected": -0.7663401961326599, "logps/chosen": -565.4126586914062, "logps/rejected": -706.5430297851562, "loss": 0.6737, "rewards/accuracies": 0.625, "rewards/chosen": 0.00451049767434597, "rewards/margins": 0.04731311649084091, "rewards/rejected": -0.04280261695384979, "step": 5 }, { "epoch": 0.01, "grad_norm": 104.40420699285828, "learning_rate": 1.0256410256410256e-07, "logits/chosen": -0.7810869216918945, "logits/rejected": -0.7261877059936523, "logps/chosen": -576.093017578125, "logps/rejected": -735.2755737304688, "loss": 0.724, "rewards/accuracies": 0.375, "rewards/chosen": -0.016032269224524498, "rewards/margins": -0.04482083395123482, "rewards/rejected": 0.028788568452000618, "step": 6 }, { "epoch": 0.01, "grad_norm": 98.11913062972486, "learning_rate": 1.1965811965811965e-07, "logits/chosen": -0.7417203783988953, "logits/rejected": -0.7288725972175598, "logps/chosen": -536.7333984375, "logps/rejected": -579.6881103515625, "loss": 0.7402, "rewards/accuracies": 0.375, "rewards/chosen": -0.0824371799826622, "rewards/margins": -0.07586034387350082, "rewards/rejected": -0.00657684775069356, "step": 7 }, { "epoch": 0.01, "grad_norm": 98.41781433766984, "learning_rate": 1.3675213675213677e-07, "logits/chosen": -1.0201611518859863, "logits/rejected": -1.0109407901763916, "logps/chosen": -481.22076416015625, "logps/rejected": -501.9241943359375, "loss": 0.7265, "rewards/accuracies": 0.4375, "rewards/chosen": -0.009386436082422733, "rewards/margins": -0.053712133318185806, "rewards/rejected": 0.04432569816708565, "step": 8 }, { "epoch": 0.01, "grad_norm": 85.90502357414873, "learning_rate": 1.5384615384615385e-07, "logits/chosen": -0.7735470533370972, "logits/rejected": -0.7421846389770508, "logps/chosen": -355.7308349609375, "logps/rejected": -520.117431640625, "loss": 0.6849, "rewards/accuracies": 0.5625, "rewards/chosen": -0.033313192427158356, "rewards/margins": 0.026676924899220467, "rewards/rejected": -0.05999011546373367, "step": 9 }, { "epoch": 0.01, "grad_norm": 95.53596114513927, "learning_rate": 1.7094017094017095e-07, "logits/chosen": -0.7909107208251953, "logits/rejected": -0.729066789150238, "logps/chosen": -462.21490478515625, "logps/rejected": -622.1067504882812, "loss": 0.7079, "rewards/accuracies": 0.375, "rewards/chosen": -0.008044671267271042, "rewards/margins": -0.017987489700317383, "rewards/rejected": 0.009942816570401192, "step": 10 }, { "epoch": 0.01, "grad_norm": 94.62643944442853, "learning_rate": 1.8803418803418802e-07, "logits/chosen": -0.9328438639640808, "logits/rejected": -0.9020552039146423, "logps/chosen": -372.81146240234375, "logps/rejected": -567.58056640625, "loss": 0.7157, "rewards/accuracies": 0.4375, "rewards/chosen": 0.03133794292807579, "rewards/margins": -0.03527111932635307, "rewards/rejected": 0.06660906225442886, "step": 11 }, { "epoch": 0.01, "grad_norm": 93.39398928307216, "learning_rate": 2.0512820512820512e-07, "logits/chosen": -0.9031232595443726, "logits/rejected": -0.8751566410064697, "logps/chosen": -436.08343505859375, "logps/rejected": -568.6261596679688, "loss": 0.7387, "rewards/accuracies": 0.4375, "rewards/chosen": -0.04213881492614746, "rewards/margins": -0.07084135711193085, "rewards/rejected": 0.028702545911073685, "step": 12 }, { "epoch": 0.01, "grad_norm": 101.92780870439769, "learning_rate": 2.222222222222222e-07, "logits/chosen": -0.7718335390090942, "logits/rejected": -0.7280720472335815, "logps/chosen": -475.76568603515625, "logps/rejected": -623.90283203125, "loss": 0.6899, "rewards/accuracies": 0.5, "rewards/chosen": 0.028224896639585495, "rewards/margins": 0.012632707133889198, "rewards/rejected": 0.015592193230986595, "step": 13 }, { "epoch": 0.01, "grad_norm": 100.09509124265254, "learning_rate": 2.393162393162393e-07, "logits/chosen": -0.7317771911621094, "logits/rejected": -0.6727234125137329, "logps/chosen": -573.5426025390625, "logps/rejected": -711.6471557617188, "loss": 0.6586, "rewards/accuracies": 0.6875, "rewards/chosen": 0.04689348116517067, "rewards/margins": 0.08301004022359848, "rewards/rejected": -0.03611656650900841, "step": 14 }, { "epoch": 0.01, "grad_norm": 99.7593876423607, "learning_rate": 2.5641025641025636e-07, "logits/chosen": -0.7521563172340393, "logits/rejected": -0.6863745450973511, "logps/chosen": -498.97991943359375, "logps/rejected": -682.7203979492188, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": -0.028330065310001373, "rewards/margins": 0.019269824028015137, "rewards/rejected": -0.04759988933801651, "step": 15 }, { "epoch": 0.01, "grad_norm": 88.29988329350945, "learning_rate": 2.7350427350427354e-07, "logits/chosen": -0.8903733491897583, "logits/rejected": -0.8610409498214722, "logps/chosen": -441.4660949707031, "logps/rejected": -485.55010986328125, "loss": 0.7074, "rewards/accuracies": 0.375, "rewards/chosen": -0.010682763531804085, "rewards/margins": -0.018920645117759705, "rewards/rejected": 0.008237885311245918, "step": 16 }, { "epoch": 0.01, "grad_norm": 104.0592089591977, "learning_rate": 2.905982905982906e-07, "logits/chosen": -1.0094716548919678, "logits/rejected": -0.963087797164917, "logps/chosen": -477.1594543457031, "logps/rejected": -731.2901611328125, "loss": 0.7002, "rewards/accuracies": 0.375, "rewards/chosen": -0.02713947370648384, "rewards/margins": -0.008646871894598007, "rewards/rejected": -0.018492601811885834, "step": 17 }, { "epoch": 0.02, "grad_norm": 105.17653654674692, "learning_rate": 3.076923076923077e-07, "logits/chosen": -1.0000486373901367, "logits/rejected": -0.9455796480178833, "logps/chosen": -692.1254272460938, "logps/rejected": -884.697021484375, "loss": 0.6617, "rewards/accuracies": 0.5625, "rewards/chosen": -0.003712274134159088, "rewards/margins": 0.0707027018070221, "rewards/rejected": -0.07441498339176178, "step": 18 }, { "epoch": 0.02, "grad_norm": 91.15778116804893, "learning_rate": 3.2478632478632476e-07, "logits/chosen": -0.9160683155059814, "logits/rejected": -0.8868091106414795, "logps/chosen": -607.456787109375, "logps/rejected": -590.1045532226562, "loss": 0.6537, "rewards/accuracies": 0.8125, "rewards/chosen": -0.017058946192264557, "rewards/margins": 0.08485370874404907, "rewards/rejected": -0.10191265493631363, "step": 19 }, { "epoch": 0.02, "grad_norm": 88.18915926623042, "learning_rate": 3.418803418803419e-07, "logits/chosen": -0.9559439420700073, "logits/rejected": -0.9025272130966187, "logps/chosen": -379.51666259765625, "logps/rejected": -568.963134765625, "loss": 0.7113, "rewards/accuracies": 0.4375, "rewards/chosen": -0.08084973692893982, "rewards/margins": -0.025880228728055954, "rewards/rejected": -0.05496950075030327, "step": 20 }, { "epoch": 0.02, "grad_norm": 105.50380476742227, "learning_rate": 3.5897435897435896e-07, "logits/chosen": -0.8581007719039917, "logits/rejected": -0.8482990264892578, "logps/chosen": -721.2944946289062, "logps/rejected": -692.0745239257812, "loss": 0.6512, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0144729632884264, "rewards/margins": 0.09765224158763885, "rewards/rejected": -0.0831792801618576, "step": 21 }, { "epoch": 0.02, "grad_norm": 100.17926934231421, "learning_rate": 3.7606837606837604e-07, "logits/chosen": -0.9451916813850403, "logits/rejected": -0.8803855180740356, "logps/chosen": -479.50274658203125, "logps/rejected": -680.472412109375, "loss": 0.6848, "rewards/accuracies": 0.5, "rewards/chosen": 0.02336878702044487, "rewards/margins": 0.02742600627243519, "rewards/rejected": -0.004057217389345169, "step": 22 }, { "epoch": 0.02, "grad_norm": 99.39544083096526, "learning_rate": 3.931623931623931e-07, "logits/chosen": -0.7238095998764038, "logits/rejected": -0.6513512134552002, "logps/chosen": -518.0869140625, "logps/rejected": -722.469482421875, "loss": 0.6238, "rewards/accuracies": 0.75, "rewards/chosen": 0.049712516367435455, "rewards/margins": 0.1534470021724701, "rewards/rejected": -0.10373449325561523, "step": 23 }, { "epoch": 0.02, "grad_norm": 103.09338152254325, "learning_rate": 4.1025641025641024e-07, "logits/chosen": -0.9146740436553955, "logits/rejected": -0.8484085202217102, "logps/chosen": -481.82745361328125, "logps/rejected": -674.730224609375, "loss": 0.7114, "rewards/accuracies": 0.625, "rewards/chosen": -0.010667498223483562, "rewards/margins": -0.010577872395515442, "rewards/rejected": -8.963048458099365e-05, "step": 24 }, { "epoch": 0.02, "grad_norm": 96.04021273997232, "learning_rate": 4.273504273504273e-07, "logits/chosen": -0.9091683626174927, "logits/rejected": -0.8563873767852783, "logps/chosen": -492.4463806152344, "logps/rejected": -638.01025390625, "loss": 0.7015, "rewards/accuracies": 0.625, "rewards/chosen": -0.018622679635882378, "rewards/margins": -0.0039053894579410553, "rewards/rejected": -0.01471729390323162, "step": 25 }, { "epoch": 0.02, "grad_norm": 109.06282899339998, "learning_rate": 4.444444444444444e-07, "logits/chosen": -0.9659221768379211, "logits/rejected": -0.9196881651878357, "logps/chosen": -492.91900634765625, "logps/rejected": -853.394287109375, "loss": 0.6713, "rewards/accuracies": 0.5625, "rewards/chosen": -0.028880760073661804, "rewards/margins": 0.059592749923467636, "rewards/rejected": -0.08847351372241974, "step": 26 }, { "epoch": 0.02, "grad_norm": 99.3240842921101, "learning_rate": 4.6153846153846156e-07, "logits/chosen": -0.9494356513023376, "logits/rejected": -0.8917842507362366, "logps/chosen": -528.2301025390625, "logps/rejected": -776.6889038085938, "loss": 0.6467, "rewards/accuracies": 0.5625, "rewards/chosen": -0.011080075986683369, "rewards/margins": 0.11547484248876572, "rewards/rejected": -0.12655490636825562, "step": 27 }, { "epoch": 0.02, "grad_norm": 108.40026128693336, "learning_rate": 4.786324786324786e-07, "logits/chosen": -0.9912917613983154, "logits/rejected": -0.9570505023002625, "logps/chosen": -555.765869140625, "logps/rejected": -669.6681518554688, "loss": 0.7363, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08198236674070358, "rewards/margins": -0.06766227632761002, "rewards/rejected": -0.014320090413093567, "step": 28 }, { "epoch": 0.02, "grad_norm": 99.90460421005808, "learning_rate": 4.957264957264958e-07, "logits/chosen": -0.95127934217453, "logits/rejected": -0.9175419807434082, "logps/chosen": -461.0755615234375, "logps/rejected": -580.02734375, "loss": 0.7048, "rewards/accuracies": 0.5, "rewards/chosen": -0.0809219628572464, "rewards/margins": -0.01347380131483078, "rewards/rejected": -0.06744816154241562, "step": 29 }, { "epoch": 0.03, "grad_norm": 112.99941238862309, "learning_rate": 5.128205128205127e-07, "logits/chosen": -0.9818075895309448, "logits/rejected": -0.9260789155960083, "logps/chosen": -535.239013671875, "logps/rejected": -700.30517578125, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": -0.09685669839382172, "rewards/margins": 0.015113834291696548, "rewards/rejected": -0.11197052150964737, "step": 30 }, { "epoch": 0.03, "grad_norm": 83.0378730434607, "learning_rate": 5.299145299145299e-07, "logits/chosen": -0.6910840272903442, "logits/rejected": -0.6364599466323853, "logps/chosen": -465.90606689453125, "logps/rejected": -567.572021484375, "loss": 0.6282, "rewards/accuracies": 0.875, "rewards/chosen": 0.016259711235761642, "rewards/margins": 0.1518758237361908, "rewards/rejected": -0.13561612367630005, "step": 31 }, { "epoch": 0.03, "grad_norm": 98.11090569271404, "learning_rate": 5.470085470085471e-07, "logits/chosen": -0.7717163562774658, "logits/rejected": -0.7245222330093384, "logps/chosen": -592.1800537109375, "logps/rejected": -645.472900390625, "loss": 0.6386, "rewards/accuracies": 0.6875, "rewards/chosen": -0.04043560102581978, "rewards/margins": 0.12422771751880646, "rewards/rejected": -0.16466331481933594, "step": 32 }, { "epoch": 0.03, "grad_norm": 100.66982974140217, "learning_rate": 5.641025641025641e-07, "logits/chosen": -0.7310692071914673, "logits/rejected": -0.7151703834533691, "logps/chosen": -553.5906982421875, "logps/rejected": -547.3599853515625, "loss": 0.705, "rewards/accuracies": 0.4375, "rewards/chosen": -0.04172036796808243, "rewards/margins": -0.008993186056613922, "rewards/rejected": -0.032727185636758804, "step": 33 }, { "epoch": 0.03, "grad_norm": 94.70197301435658, "learning_rate": 5.811965811965812e-07, "logits/chosen": -0.6653925776481628, "logits/rejected": -0.622881293296814, "logps/chosen": -426.7507629394531, "logps/rejected": -596.0507202148438, "loss": 0.6539, "rewards/accuracies": 0.75, "rewards/chosen": -0.04939991235733032, "rewards/margins": 0.08512675017118454, "rewards/rejected": -0.13452666997909546, "step": 34 }, { "epoch": 0.03, "grad_norm": 104.42226965215687, "learning_rate": 5.982905982905982e-07, "logits/chosen": -0.787976086139679, "logits/rejected": -0.7573559284210205, "logps/chosen": -565.1084594726562, "logps/rejected": -869.0287475585938, "loss": 0.6703, "rewards/accuracies": 0.625, "rewards/chosen": -0.07111568748950958, "rewards/margins": 0.08143329620361328, "rewards/rejected": -0.15254898369312286, "step": 35 }, { "epoch": 0.03, "grad_norm": 109.94862708513548, "learning_rate": 6.153846153846154e-07, "logits/chosen": -0.7678736448287964, "logits/rejected": -0.7088093757629395, "logps/chosen": -588.3760986328125, "logps/rejected": -797.10107421875, "loss": 0.7167, "rewards/accuracies": 0.5625, "rewards/chosen": -0.13470612466335297, "rewards/margins": -0.018723297864198685, "rewards/rejected": -0.11598282307386398, "step": 36 }, { "epoch": 0.03, "grad_norm": 104.69858779468399, "learning_rate": 6.324786324786325e-07, "logits/chosen": -0.7733074426651001, "logits/rejected": -0.6972559094429016, "logps/chosen": -525.0222778320312, "logps/rejected": -643.43017578125, "loss": 0.7148, "rewards/accuracies": 0.5, "rewards/chosen": -0.07904539257287979, "rewards/margins": -0.021822579205036163, "rewards/rejected": -0.057222817093133926, "step": 37 }, { "epoch": 0.03, "grad_norm": 84.47695627881467, "learning_rate": 6.495726495726495e-07, "logits/chosen": -0.769538164138794, "logits/rejected": -0.7374118566513062, "logps/chosen": -469.9814758300781, "logps/rejected": -566.3709716796875, "loss": 0.6477, "rewards/accuracies": 0.5, "rewards/chosen": -0.04114339128136635, "rewards/margins": 0.13897393643856049, "rewards/rejected": -0.18011733889579773, "step": 38 }, { "epoch": 0.03, "grad_norm": 102.47977429806984, "learning_rate": 6.666666666666666e-07, "logits/chosen": -0.7609751224517822, "logits/rejected": -0.6807464361190796, "logps/chosen": -604.7998046875, "logps/rejected": -682.2073974609375, "loss": 0.7093, "rewards/accuracies": 0.4375, "rewards/chosen": -0.07954426109790802, "rewards/margins": 0.0017297714948654175, "rewards/rejected": -0.08127403259277344, "step": 39 }, { "epoch": 0.03, "grad_norm": 92.09223558927283, "learning_rate": 6.837606837606838e-07, "logits/chosen": -0.7742663621902466, "logits/rejected": -0.7225289940834045, "logps/chosen": -542.4127197265625, "logps/rejected": -764.8325805664062, "loss": 0.5848, "rewards/accuracies": 0.75, "rewards/chosen": 0.0005925875157117844, "rewards/margins": 0.2573086619377136, "rewards/rejected": -0.2567160725593567, "step": 40 }, { "epoch": 0.04, "grad_norm": 91.93998818640803, "learning_rate": 7.008547008547007e-07, "logits/chosen": -0.9238195419311523, "logits/rejected": -0.8682441711425781, "logps/chosen": -558.4532470703125, "logps/rejected": -765.9144287109375, "loss": 0.5732, "rewards/accuracies": 0.875, "rewards/chosen": 0.03887009620666504, "rewards/margins": 0.2795259356498718, "rewards/rejected": -0.2406558394432068, "step": 41 }, { "epoch": 0.04, "grad_norm": 95.61208794097061, "learning_rate": 7.179487179487179e-07, "logits/chosen": -0.7909547090530396, "logits/rejected": -0.7456868886947632, "logps/chosen": -476.070556640625, "logps/rejected": -590.078369140625, "loss": 0.6532, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03314027935266495, "rewards/margins": 0.09669484198093414, "rewards/rejected": -0.1298351287841797, "step": 42 }, { "epoch": 0.04, "grad_norm": 81.14395901291414, "learning_rate": 7.350427350427351e-07, "logits/chosen": -0.9072396755218506, "logits/rejected": -0.8494898080825806, "logps/chosen": -438.7601318359375, "logps/rejected": -602.9498291015625, "loss": 0.6088, "rewards/accuracies": 0.75, "rewards/chosen": -0.03542018309235573, "rewards/margins": 0.19208072125911713, "rewards/rejected": -0.22750090062618256, "step": 43 }, { "epoch": 0.04, "grad_norm": 108.59101240066408, "learning_rate": 7.521367521367521e-07, "logits/chosen": -0.9132080078125, "logits/rejected": -0.8617574572563171, "logps/chosen": -545.069580078125, "logps/rejected": -611.75830078125, "loss": 0.7416, "rewards/accuracies": 0.375, "rewards/chosen": -0.09693308174610138, "rewards/margins": -0.07984675467014313, "rewards/rejected": -0.017086319625377655, "step": 44 }, { "epoch": 0.04, "grad_norm": 96.66496044258588, "learning_rate": 7.692307692307693e-07, "logits/chosen": -0.9789643883705139, "logits/rejected": -0.8941919803619385, "logps/chosen": -495.7006530761719, "logps/rejected": -747.5504760742188, "loss": 0.6242, "rewards/accuracies": 0.75, "rewards/chosen": -0.03485063090920448, "rewards/margins": 0.1573757827281952, "rewards/rejected": -0.19222640991210938, "step": 45 }, { "epoch": 0.04, "grad_norm": 93.27997514769116, "learning_rate": 7.863247863247862e-07, "logits/chosen": -0.9590734243392944, "logits/rejected": -0.9081867933273315, "logps/chosen": -468.8348693847656, "logps/rejected": -667.7100830078125, "loss": 0.6551, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0537002757191658, "rewards/margins": 0.08610428124666214, "rewards/rejected": -0.13980455696582794, "step": 46 }, { "epoch": 0.04, "grad_norm": 84.49214156499791, "learning_rate": 8.034188034188034e-07, "logits/chosen": -1.0012445449829102, "logits/rejected": -0.9466636180877686, "logps/chosen": -543.0889282226562, "logps/rejected": -655.0601196289062, "loss": 0.5992, "rewards/accuracies": 0.6875, "rewards/chosen": -0.031609345227479935, "rewards/margins": 0.22827112674713135, "rewards/rejected": -0.2598804533481598, "step": 47 }, { "epoch": 0.04, "grad_norm": 91.01884588886823, "learning_rate": 8.205128205128205e-07, "logits/chosen": -0.9361293315887451, "logits/rejected": -0.9137954711914062, "logps/chosen": -493.56622314453125, "logps/rejected": -541.2529907226562, "loss": 0.6555, "rewards/accuracies": 0.625, "rewards/chosen": 0.0510423518717289, "rewards/margins": 0.10465296357870102, "rewards/rejected": -0.053610607981681824, "step": 48 }, { "epoch": 0.04, "grad_norm": 78.77859928486089, "learning_rate": 8.376068376068375e-07, "logits/chosen": -0.9242331385612488, "logits/rejected": -0.8696056008338928, "logps/chosen": -497.6260986328125, "logps/rejected": -671.1222534179688, "loss": 0.5487, "rewards/accuracies": 0.8125, "rewards/chosen": 0.07649482786655426, "rewards/margins": 0.35616305470466614, "rewards/rejected": -0.2796682119369507, "step": 49 }, { "epoch": 0.04, "grad_norm": 87.4924099852313, "learning_rate": 8.547008547008546e-07, "logits/chosen": -0.9026439189910889, "logits/rejected": -0.855921745300293, "logps/chosen": -401.735595703125, "logps/rejected": -690.1610717773438, "loss": 0.5861, "rewards/accuracies": 0.8125, "rewards/chosen": -0.04143127053976059, "rewards/margins": 0.2539970278739929, "rewards/rejected": -0.2954282760620117, "step": 50 }, { "epoch": 0.04, "grad_norm": 81.48941014314516, "learning_rate": 8.717948717948718e-07, "logits/chosen": -0.942750871181488, "logits/rejected": -0.9072023630142212, "logps/chosen": -370.6748046875, "logps/rejected": -623.7341918945312, "loss": 0.6305, "rewards/accuracies": 0.75, "rewards/chosen": -0.03426019474864006, "rewards/margins": 0.16470417380332947, "rewards/rejected": -0.19896435737609863, "step": 51 }, { "epoch": 0.04, "grad_norm": 97.18373689098986, "learning_rate": 8.888888888888888e-07, "logits/chosen": -0.8328244686126709, "logits/rejected": -0.8133566975593567, "logps/chosen": -606.1514892578125, "logps/rejected": -718.189453125, "loss": 0.6274, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09226015210151672, "rewards/margins": 0.14438080787658691, "rewards/rejected": -0.23664094507694244, "step": 52 }, { "epoch": 0.05, "grad_norm": 84.23311513947658, "learning_rate": 9.059829059829059e-07, "logits/chosen": -0.8741067051887512, "logits/rejected": -0.8305940628051758, "logps/chosen": -443.4811706542969, "logps/rejected": -599.29248046875, "loss": 0.599, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07226061820983887, "rewards/margins": 0.24112094938755035, "rewards/rejected": -0.3133815824985504, "step": 53 }, { "epoch": 0.05, "grad_norm": 90.77895336471985, "learning_rate": 9.230769230769231e-07, "logits/chosen": -0.8400816917419434, "logits/rejected": -0.7951209545135498, "logps/chosen": -502.77703857421875, "logps/rejected": -656.4312133789062, "loss": 0.6166, "rewards/accuracies": 0.875, "rewards/chosen": -0.13248419761657715, "rewards/margins": 0.1832733154296875, "rewards/rejected": -0.31575751304626465, "step": 54 }, { "epoch": 0.05, "grad_norm": 92.83966456996117, "learning_rate": 9.401709401709401e-07, "logits/chosen": -0.9149412512779236, "logits/rejected": -0.8611310124397278, "logps/chosen": -636.2821044921875, "logps/rejected": -555.0322265625, "loss": 0.6557, "rewards/accuracies": 0.625, "rewards/chosen": -0.18232059478759766, "rewards/margins": 0.0952020213007927, "rewards/rejected": -0.27752262353897095, "step": 55 }, { "epoch": 0.05, "grad_norm": 91.35675517335002, "learning_rate": 9.572649572649572e-07, "logits/chosen": -0.8824781775474548, "logits/rejected": -0.8100583553314209, "logps/chosen": -575.037109375, "logps/rejected": -663.770263671875, "loss": 0.6307, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1571289300918579, "rewards/margins": 0.1695071905851364, "rewards/rejected": -0.3266361355781555, "step": 56 }, { "epoch": 0.05, "grad_norm": 89.40472477922503, "learning_rate": 9.743589743589742e-07, "logits/chosen": -0.8715717792510986, "logits/rejected": -0.8538134098052979, "logps/chosen": -436.94000244140625, "logps/rejected": -606.1776123046875, "loss": 0.6018, "rewards/accuracies": 0.6875, "rewards/chosen": -0.05886922404170036, "rewards/margins": 0.2175457775592804, "rewards/rejected": -0.27641499042510986, "step": 57 }, { "epoch": 0.05, "grad_norm": 93.86457785629594, "learning_rate": 9.914529914529915e-07, "logits/chosen": -0.8911943435668945, "logits/rejected": -0.8368760347366333, "logps/chosen": -542.065185546875, "logps/rejected": -709.164306640625, "loss": 0.5788, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04741700738668442, "rewards/margins": 0.2624610662460327, "rewards/rejected": -0.30987805128097534, "step": 58 }, { "epoch": 0.05, "grad_norm": 86.26470924808025, "learning_rate": 1.0085470085470084e-06, "logits/chosen": -0.7618128657341003, "logits/rejected": -0.715040922164917, "logps/chosen": -546.5999755859375, "logps/rejected": -586.6727294921875, "loss": 0.6301, "rewards/accuracies": 0.8125, "rewards/chosen": -0.027117159217596054, "rewards/margins": 0.1485387682914734, "rewards/rejected": -0.17565591633319855, "step": 59 }, { "epoch": 0.05, "grad_norm": 80.45324400821403, "learning_rate": 1.0256410256410255e-06, "logits/chosen": -0.8306759595870972, "logits/rejected": -0.7963862419128418, "logps/chosen": -382.11224365234375, "logps/rejected": -530.7271728515625, "loss": 0.6279, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06177574023604393, "rewards/margins": 0.1595396101474762, "rewards/rejected": -0.22131533920764923, "step": 60 }, { "epoch": 0.05, "grad_norm": 83.54456700968679, "learning_rate": 1.0427350427350427e-06, "logits/chosen": -0.6904474496841431, "logits/rejected": -0.6812146902084351, "logps/chosen": -491.5672607421875, "logps/rejected": -609.932861328125, "loss": 0.5828, "rewards/accuracies": 0.75, "rewards/chosen": -0.12990780174732208, "rewards/margins": 0.25919151306152344, "rewards/rejected": -0.3890992999076843, "step": 61 }, { "epoch": 0.05, "grad_norm": 77.16981900105961, "learning_rate": 1.0598290598290598e-06, "logits/chosen": -0.6535602807998657, "logits/rejected": -0.6413712501525879, "logps/chosen": -464.19293212890625, "logps/rejected": -532.1095581054688, "loss": 0.6012, "rewards/accuracies": 0.8125, "rewards/chosen": -0.03915134817361832, "rewards/margins": 0.22922548651695251, "rewards/rejected": -0.26837682723999023, "step": 62 }, { "epoch": 0.05, "grad_norm": 104.62885637579853, "learning_rate": 1.0769230769230769e-06, "logits/chosen": -0.8028097152709961, "logits/rejected": -0.7520387172698975, "logps/chosen": -637.960205078125, "logps/rejected": -638.2055053710938, "loss": 0.6533, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1958635449409485, "rewards/margins": 0.13062095642089844, "rewards/rejected": -0.3264845013618469, "step": 63 }, { "epoch": 0.05, "grad_norm": 100.23843850521743, "learning_rate": 1.0940170940170942e-06, "logits/chosen": -0.6847934126853943, "logits/rejected": -0.5922191143035889, "logps/chosen": -723.4080810546875, "logps/rejected": -724.36767578125, "loss": 0.6123, "rewards/accuracies": 0.75, "rewards/chosen": -0.18822208046913147, "rewards/margins": 0.1920095980167389, "rewards/rejected": -0.38023167848587036, "step": 64 }, { "epoch": 0.06, "grad_norm": 95.0522315726931, "learning_rate": 1.111111111111111e-06, "logits/chosen": -0.7592595815658569, "logits/rejected": -0.7065274715423584, "logps/chosen": -575.9987182617188, "logps/rejected": -705.1849365234375, "loss": 0.6486, "rewards/accuracies": 0.75, "rewards/chosen": 0.08855009078979492, "rewards/margins": 0.26747530698776245, "rewards/rejected": -0.17892523109912872, "step": 65 }, { "epoch": 0.06, "grad_norm": 82.5319959067867, "learning_rate": 1.1282051282051281e-06, "logits/chosen": -0.6380032300949097, "logits/rejected": -0.6153943538665771, "logps/chosen": -457.5918884277344, "logps/rejected": -647.5048828125, "loss": 0.5669, "rewards/accuracies": 0.6875, "rewards/chosen": -0.060186341404914856, "rewards/margins": 0.32465216517448425, "rewards/rejected": -0.3848384916782379, "step": 66 }, { "epoch": 0.06, "grad_norm": 133.2973708733622, "learning_rate": 1.1452991452991452e-06, "logits/chosen": -0.6638052463531494, "logits/rejected": -0.6150358319282532, "logps/chosen": -694.1866455078125, "logps/rejected": -747.4185791015625, "loss": 0.6363, "rewards/accuracies": 0.875, "rewards/chosen": -0.191949263215065, "rewards/margins": 0.1725366711616516, "rewards/rejected": -0.3644859492778778, "step": 67 }, { "epoch": 0.06, "grad_norm": 84.88844038959499, "learning_rate": 1.1623931623931625e-06, "logits/chosen": -0.800848662853241, "logits/rejected": -0.7365254163742065, "logps/chosen": -598.2763671875, "logps/rejected": -724.450927734375, "loss": 0.5192, "rewards/accuracies": 0.8125, "rewards/chosen": -0.08877643197774887, "rewards/margins": 0.43157708644866943, "rewards/rejected": -0.5203535556793213, "step": 68 }, { "epoch": 0.06, "grad_norm": 74.27440749130595, "learning_rate": 1.1794871794871795e-06, "logits/chosen": -0.7443728446960449, "logits/rejected": -0.7063024044036865, "logps/chosen": -463.95013427734375, "logps/rejected": -710.5635986328125, "loss": 0.5168, "rewards/accuracies": 0.75, "rewards/chosen": -0.02280556596815586, "rewards/margins": 0.4473559260368347, "rewards/rejected": -0.470161497592926, "step": 69 }, { "epoch": 0.06, "grad_norm": 78.31022777921227, "learning_rate": 1.1965811965811964e-06, "logits/chosen": -0.6374633312225342, "logits/rejected": -0.6176419258117676, "logps/chosen": -513.8482666015625, "logps/rejected": -636.00927734375, "loss": 0.5565, "rewards/accuracies": 0.8125, "rewards/chosen": -0.16604569554328918, "rewards/margins": 0.32729434967041016, "rewards/rejected": -0.49334001541137695, "step": 70 }, { "epoch": 0.06, "grad_norm": 82.71600203096435, "learning_rate": 1.2136752136752135e-06, "logits/chosen": -0.7886828184127808, "logits/rejected": -0.7378125190734863, "logps/chosen": -431.9499206542969, "logps/rejected": -634.23095703125, "loss": 0.5438, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04967509210109711, "rewards/margins": 0.3546733260154724, "rewards/rejected": -0.40434837341308594, "step": 71 }, { "epoch": 0.06, "grad_norm": 86.16215472228565, "learning_rate": 1.2307692307692308e-06, "logits/chosen": -0.7888700366020203, "logits/rejected": -0.739388644695282, "logps/chosen": -556.30078125, "logps/rejected": -581.4580078125, "loss": 0.5939, "rewards/accuracies": 0.75, "rewards/chosen": -0.11910320818424225, "rewards/margins": 0.2826518416404724, "rewards/rejected": -0.40175506472587585, "step": 72 }, { "epoch": 0.06, "grad_norm": 84.04344881624239, "learning_rate": 1.2478632478632478e-06, "logits/chosen": -0.766875684261322, "logits/rejected": -0.7309463024139404, "logps/chosen": -425.70941162109375, "logps/rejected": -436.1416931152344, "loss": 0.6546, "rewards/accuracies": 0.75, "rewards/chosen": -0.15280786156654358, "rewards/margins": 0.09877495467662811, "rewards/rejected": -0.2515828013420105, "step": 73 }, { "epoch": 0.06, "grad_norm": 83.32547240766198, "learning_rate": 1.264957264957265e-06, "logits/chosen": -0.7253504991531372, "logits/rejected": -0.7070431709289551, "logps/chosen": -471.2389831542969, "logps/rejected": -688.390869140625, "loss": 0.5852, "rewards/accuracies": 0.75, "rewards/chosen": -0.2157745659351349, "rewards/margins": 0.25578293204307556, "rewards/rejected": -0.47155749797821045, "step": 74 }, { "epoch": 0.06, "grad_norm": 81.5941281481595, "learning_rate": 1.2820512820512822e-06, "logits/chosen": -0.6651368141174316, "logits/rejected": -0.6502801179885864, "logps/chosen": -471.9756774902344, "logps/rejected": -554.71630859375, "loss": 0.5826, "rewards/accuracies": 0.6875, "rewards/chosen": -0.13057751953601837, "rewards/margins": 0.2676604092121124, "rewards/rejected": -0.3982379138469696, "step": 75 }, { "epoch": 0.07, "grad_norm": 85.78261235986672, "learning_rate": 1.299145299145299e-06, "logits/chosen": -0.7806533575057983, "logits/rejected": -0.7606949806213379, "logps/chosen": -437.90997314453125, "logps/rejected": -730.5367431640625, "loss": 0.5606, "rewards/accuracies": 0.75, "rewards/chosen": -0.09575305134057999, "rewards/margins": 0.3341591954231262, "rewards/rejected": -0.429912269115448, "step": 76 }, { "epoch": 0.07, "grad_norm": 80.12358188477748, "learning_rate": 1.3162393162393161e-06, "logits/chosen": -0.7996336221694946, "logits/rejected": -0.761587381362915, "logps/chosen": -516.3214721679688, "logps/rejected": -669.3408203125, "loss": 0.5536, "rewards/accuracies": 0.6875, "rewards/chosen": -0.17250573635101318, "rewards/margins": 0.3307540714740753, "rewards/rejected": -0.5032598376274109, "step": 77 }, { "epoch": 0.07, "grad_norm": 68.90500632179386, "learning_rate": 1.3333333333333332e-06, "logits/chosen": -0.7328706979751587, "logits/rejected": -0.7092921733856201, "logps/chosen": -478.4838562011719, "logps/rejected": -623.0524291992188, "loss": 0.5091, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05939531326293945, "rewards/margins": 0.44020992517471313, "rewards/rejected": -0.4996052384376526, "step": 78 }, { "epoch": 0.07, "grad_norm": 65.3752440554934, "learning_rate": 1.3504273504273505e-06, "logits/chosen": -0.8254345059394836, "logits/rejected": -0.7779566049575806, "logps/chosen": -351.9629821777344, "logps/rejected": -470.9985046386719, "loss": 0.5464, "rewards/accuracies": 0.875, "rewards/chosen": -0.057938240468502045, "rewards/margins": 0.3562437891960144, "rewards/rejected": -0.41418206691741943, "step": 79 }, { "epoch": 0.07, "grad_norm": 83.33809923207538, "learning_rate": 1.3675213675213676e-06, "logits/chosen": -0.7703996896743774, "logits/rejected": -0.7198264002799988, "logps/chosen": -644.7678833007812, "logps/rejected": -719.0814208984375, "loss": 0.5473, "rewards/accuracies": 0.75, "rewards/chosen": -0.24494361877441406, "rewards/margins": 0.37615740299224854, "rewards/rejected": -0.6211010217666626, "step": 80 }, { "epoch": 0.07, "grad_norm": 87.93972477475651, "learning_rate": 1.3846153846153844e-06, "logits/chosen": -0.7104290127754211, "logits/rejected": -0.6862534880638123, "logps/chosen": -476.5368957519531, "logps/rejected": -573.0159912109375, "loss": 0.6023, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1351388543844223, "rewards/margins": 0.23241892457008362, "rewards/rejected": -0.3675577938556671, "step": 81 }, { "epoch": 0.07, "grad_norm": 83.56310082112059, "learning_rate": 1.4017094017094015e-06, "logits/chosen": -0.7223315834999084, "logits/rejected": -0.6844210028648376, "logps/chosen": -574.5439453125, "logps/rejected": -774.9532470703125, "loss": 0.5268, "rewards/accuracies": 0.9375, "rewards/chosen": -0.14116373658180237, "rewards/margins": 0.39218950271606445, "rewards/rejected": -0.5333532094955444, "step": 82 }, { "epoch": 0.07, "grad_norm": 83.086595568021, "learning_rate": 1.4188034188034188e-06, "logits/chosen": -0.6719001531600952, "logits/rejected": -0.6052131652832031, "logps/chosen": -555.9496459960938, "logps/rejected": -671.5604248046875, "loss": 0.5781, "rewards/accuracies": 0.75, "rewards/chosen": 0.03966934233903885, "rewards/margins": 0.32521799206733704, "rewards/rejected": -0.2855486571788788, "step": 83 }, { "epoch": 0.07, "grad_norm": 98.97658586758621, "learning_rate": 1.4358974358974359e-06, "logits/chosen": -0.7302659749984741, "logits/rejected": -0.6979095935821533, "logps/chosen": -745.6942138671875, "logps/rejected": -744.9570922851562, "loss": 0.5822, "rewards/accuracies": 0.75, "rewards/chosen": -0.2759849429130554, "rewards/margins": 0.28207817673683167, "rewards/rejected": -0.5580631494522095, "step": 84 }, { "epoch": 0.07, "grad_norm": 68.76305542901659, "learning_rate": 1.452991452991453e-06, "logits/chosen": -0.7735276222229004, "logits/rejected": -0.7231909036636353, "logps/chosen": -473.5228271484375, "logps/rejected": -680.1998291015625, "loss": 0.4859, "rewards/accuracies": 0.9375, "rewards/chosen": -0.08385667204856873, "rewards/margins": 0.5274546146392822, "rewards/rejected": -0.6113112568855286, "step": 85 }, { "epoch": 0.07, "grad_norm": 74.88598476634989, "learning_rate": 1.4700854700854702e-06, "logits/chosen": -0.8076218962669373, "logits/rejected": -0.7630515694618225, "logps/chosen": -510.4481201171875, "logps/rejected": -677.6754150390625, "loss": 0.4774, "rewards/accuracies": 0.9375, "rewards/chosen": -0.14206896722316742, "rewards/margins": 0.5710533857345581, "rewards/rejected": -0.7131223678588867, "step": 86 }, { "epoch": 0.07, "grad_norm": 74.58496629885917, "learning_rate": 1.487179487179487e-06, "logits/chosen": -0.7392367124557495, "logits/rejected": -0.7253636717796326, "logps/chosen": -430.012451171875, "logps/rejected": -599.80517578125, "loss": 0.4863, "rewards/accuracies": 0.8125, "rewards/chosen": -0.10148727893829346, "rewards/margins": 0.5541262030601501, "rewards/rejected": -0.6556135416030884, "step": 87 }, { "epoch": 0.08, "grad_norm": 60.2346447729904, "learning_rate": 1.5042735042735041e-06, "logits/chosen": -0.7053150534629822, "logits/rejected": -0.6814218759536743, "logps/chosen": -347.4354248046875, "logps/rejected": -578.521728515625, "loss": 0.5126, "rewards/accuracies": 0.9375, "rewards/chosen": -0.09811823070049286, "rewards/margins": 0.4941341280937195, "rewards/rejected": -0.5922523140907288, "step": 88 }, { "epoch": 0.08, "grad_norm": 102.6805009384135, "learning_rate": 1.5213675213675212e-06, "logits/chosen": -0.7140446901321411, "logits/rejected": -0.6726734638214111, "logps/chosen": -535.3824462890625, "logps/rejected": -766.9484252929688, "loss": 0.5048, "rewards/accuracies": 0.875, "rewards/chosen": -0.15884247422218323, "rewards/margins": 0.5328004956245422, "rewards/rejected": -0.6916429400444031, "step": 89 }, { "epoch": 0.08, "grad_norm": 69.69374210672956, "learning_rate": 1.5384615384615385e-06, "logits/chosen": -0.6098439693450928, "logits/rejected": -0.6184341907501221, "logps/chosen": -491.7330322265625, "logps/rejected": -803.6419067382812, "loss": 0.4604, "rewards/accuracies": 0.9375, "rewards/chosen": -0.13347195088863373, "rewards/margins": 0.6201971769332886, "rewards/rejected": -0.7536691427230835, "step": 90 }, { "epoch": 0.08, "grad_norm": 79.95513092727752, "learning_rate": 1.5555555555555556e-06, "logits/chosen": -0.7003642320632935, "logits/rejected": -0.6302669048309326, "logps/chosen": -645.9889526367188, "logps/rejected": -725.8878173828125, "loss": 0.5059, "rewards/accuracies": 0.9375, "rewards/chosen": -0.13106298446655273, "rewards/margins": 0.4724835157394409, "rewards/rejected": -0.6035465002059937, "step": 91 }, { "epoch": 0.08, "grad_norm": 88.05720697404861, "learning_rate": 1.5726495726495724e-06, "logits/chosen": -0.7634373903274536, "logits/rejected": -0.6494268178939819, "logps/chosen": -679.6563720703125, "logps/rejected": -634.7783203125, "loss": 0.5569, "rewards/accuracies": 0.75, "rewards/chosen": -0.19827817380428314, "rewards/margins": 0.3626251220703125, "rewards/rejected": -0.5609033107757568, "step": 92 }, { "epoch": 0.08, "grad_norm": 68.36179701549388, "learning_rate": 1.5897435897435895e-06, "logits/chosen": -0.6748229265213013, "logits/rejected": -0.6570998430252075, "logps/chosen": -484.94976806640625, "logps/rejected": -636.7830200195312, "loss": 0.4894, "rewards/accuracies": 0.9375, "rewards/chosen": -0.09381289780139923, "rewards/margins": 0.5250856876373291, "rewards/rejected": -0.6188986301422119, "step": 93 }, { "epoch": 0.08, "grad_norm": 71.06292002236022, "learning_rate": 1.6068376068376068e-06, "logits/chosen": -0.68152916431427, "logits/rejected": -0.6525790691375732, "logps/chosen": -361.0284729003906, "logps/rejected": -344.1199645996094, "loss": 0.6176, "rewards/accuracies": 0.6875, "rewards/chosen": -0.17955589294433594, "rewards/margins": 0.1847459077835083, "rewards/rejected": -0.36430180072784424, "step": 94 }, { "epoch": 0.08, "grad_norm": 76.75834996079558, "learning_rate": 1.6239316239316239e-06, "logits/chosen": -0.7209312319755554, "logits/rejected": -0.691003143787384, "logps/chosen": -538.3939208984375, "logps/rejected": -878.8992919921875, "loss": 0.4889, "rewards/accuracies": 0.8125, "rewards/chosen": -0.36771565675735474, "rewards/margins": 0.5804280042648315, "rewards/rejected": -0.9481436014175415, "step": 95 }, { "epoch": 0.08, "grad_norm": 72.2280506478773, "learning_rate": 1.641025641025641e-06, "logits/chosen": -0.8478317260742188, "logits/rejected": -0.7494789361953735, "logps/chosen": -455.7156066894531, "logps/rejected": -556.229736328125, "loss": 0.5173, "rewards/accuracies": 0.9375, "rewards/chosen": -0.13087773323059082, "rewards/margins": 0.4210716485977173, "rewards/rejected": -0.5519493818283081, "step": 96 }, { "epoch": 0.08, "grad_norm": 79.87734471613219, "learning_rate": 1.6581196581196582e-06, "logits/chosen": -0.7853848934173584, "logits/rejected": -0.7578170299530029, "logps/chosen": -374.66754150390625, "logps/rejected": -629.7857666015625, "loss": 0.4637, "rewards/accuracies": 0.9375, "rewards/chosen": -0.131764218211174, "rewards/margins": 0.5719059705734253, "rewards/rejected": -0.7036702036857605, "step": 97 }, { "epoch": 0.08, "grad_norm": 63.36940463548466, "learning_rate": 1.675213675213675e-06, "logits/chosen": -0.7420146465301514, "logits/rejected": -0.7114475965499878, "logps/chosen": -419.96209716796875, "logps/rejected": -752.1236572265625, "loss": 0.4367, "rewards/accuracies": 1.0, "rewards/chosen": -0.15291455388069153, "rewards/margins": 0.6730989813804626, "rewards/rejected": -0.8260136246681213, "step": 98 }, { "epoch": 0.08, "grad_norm": 65.96679602813242, "learning_rate": 1.6923076923076922e-06, "logits/chosen": -0.7752172946929932, "logits/rejected": -0.6906420588493347, "logps/chosen": -543.2679443359375, "logps/rejected": -700.7498779296875, "loss": 0.4442, "rewards/accuracies": 0.75, "rewards/chosen": -0.19367974996566772, "rewards/margins": 0.6900912523269653, "rewards/rejected": -0.8837709426879883, "step": 99 }, { "epoch": 0.09, "grad_norm": 72.20674567697648, "learning_rate": 1.7094017094017092e-06, "logits/chosen": -0.766581654548645, "logits/rejected": -0.6910284161567688, "logps/chosen": -489.31884765625, "logps/rejected": -679.50390625, "loss": 0.4481, "rewards/accuracies": 0.9375, "rewards/chosen": -0.10363808274269104, "rewards/margins": 0.6479713916778564, "rewards/rejected": -0.7516094446182251, "step": 100 }, { "epoch": 0.09, "grad_norm": 73.31143088988361, "learning_rate": 1.7264957264957265e-06, "logits/chosen": -0.823377251625061, "logits/rejected": -0.7363071441650391, "logps/chosen": -728.3934326171875, "logps/rejected": -869.7471923828125, "loss": 0.4868, "rewards/accuracies": 0.75, "rewards/chosen": -0.30075347423553467, "rewards/margins": 0.6340336799621582, "rewards/rejected": -0.9347871541976929, "step": 101 }, { "epoch": 0.09, "grad_norm": 71.59168235641044, "learning_rate": 1.7435897435897436e-06, "logits/chosen": -0.7990365028381348, "logits/rejected": -0.7497241497039795, "logps/chosen": -532.1788330078125, "logps/rejected": -694.0585327148438, "loss": 0.4825, "rewards/accuracies": 0.9375, "rewards/chosen": -0.26346921920776367, "rewards/margins": 0.5893768072128296, "rewards/rejected": -0.8528460264205933, "step": 102 }, { "epoch": 0.09, "grad_norm": 66.90381226463809, "learning_rate": 1.7606837606837607e-06, "logits/chosen": -0.7751025557518005, "logits/rejected": -0.7068533897399902, "logps/chosen": -400.0640869140625, "logps/rejected": -499.7941589355469, "loss": 0.5111, "rewards/accuracies": 0.875, "rewards/chosen": -0.1898057758808136, "rewards/margins": 0.4734571576118469, "rewards/rejected": -0.6632629632949829, "step": 103 }, { "epoch": 0.09, "grad_norm": 91.57313094191657, "learning_rate": 1.7777777777777775e-06, "logits/chosen": -0.7730945944786072, "logits/rejected": -0.7324565052986145, "logps/chosen": -584.8296508789062, "logps/rejected": -557.51416015625, "loss": 0.6382, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4563564658164978, "rewards/margins": 0.16472357511520386, "rewards/rejected": -0.6210800409317017, "step": 104 }, { "epoch": 0.09, "grad_norm": 80.8676493591247, "learning_rate": 1.7948717948717948e-06, "logits/chosen": -0.800835371017456, "logits/rejected": -0.7630007266998291, "logps/chosen": -688.7553100585938, "logps/rejected": -779.5538940429688, "loss": 0.4738, "rewards/accuracies": 0.875, "rewards/chosen": -0.33632346987724304, "rewards/margins": 0.6307164430618286, "rewards/rejected": -0.9670398235321045, "step": 105 }, { "epoch": 0.09, "grad_norm": 94.83615645547866, "learning_rate": 1.8119658119658119e-06, "logits/chosen": -0.7865450978279114, "logits/rejected": -0.7478854060173035, "logps/chosen": -679.5304565429688, "logps/rejected": -742.7913208007812, "loss": 0.5669, "rewards/accuracies": 0.75, "rewards/chosen": -0.301138699054718, "rewards/margins": 0.36614471673965454, "rewards/rejected": -0.6672834157943726, "step": 106 }, { "epoch": 0.09, "grad_norm": 75.67023157367913, "learning_rate": 1.829059829059829e-06, "logits/chosen": -0.8037106990814209, "logits/rejected": -0.7343106269836426, "logps/chosen": -478.39031982421875, "logps/rejected": -601.8953857421875, "loss": 0.5616, "rewards/accuracies": 0.625, "rewards/chosen": -0.2523706257343292, "rewards/margins": 0.36477893590927124, "rewards/rejected": -0.6171494722366333, "step": 107 }, { "epoch": 0.09, "grad_norm": 77.13542686202837, "learning_rate": 1.8461538461538462e-06, "logits/chosen": -0.8077168464660645, "logits/rejected": -0.7684412598609924, "logps/chosen": -580.583740234375, "logps/rejected": -786.945068359375, "loss": 0.4501, "rewards/accuracies": 0.875, "rewards/chosen": -0.27610066533088684, "rewards/margins": 0.6333410739898682, "rewards/rejected": -0.9094418287277222, "step": 108 }, { "epoch": 0.09, "grad_norm": 68.08513125203355, "learning_rate": 1.8632478632478631e-06, "logits/chosen": -0.7108200192451477, "logits/rejected": -0.6949425935745239, "logps/chosen": -625.0948486328125, "logps/rejected": -853.0823364257812, "loss": 0.4433, "rewards/accuracies": 0.875, "rewards/chosen": -0.26558175683021545, "rewards/margins": 0.6938874125480652, "rewards/rejected": -0.9594690799713135, "step": 109 }, { "epoch": 0.09, "grad_norm": 80.04012798449374, "learning_rate": 1.8803418803418802e-06, "logits/chosen": -0.8558471202850342, "logits/rejected": -0.8219131827354431, "logps/chosen": -504.2074279785156, "logps/rejected": -616.380859375, "loss": 0.5202, "rewards/accuracies": 0.875, "rewards/chosen": -0.24755051732063293, "rewards/margins": 0.41993147134780884, "rewards/rejected": -0.6674820184707642, "step": 110 }, { "epoch": 0.1, "grad_norm": 66.76691728685117, "learning_rate": 1.8974358974358973e-06, "logits/chosen": -0.8877596855163574, "logits/rejected": -0.8417974710464478, "logps/chosen": -395.5901794433594, "logps/rejected": -509.20428466796875, "loss": 0.5051, "rewards/accuracies": 1.0, "rewards/chosen": -0.11536961048841476, "rewards/margins": 0.46388569474220276, "rewards/rejected": -0.5792553424835205, "step": 111 }, { "epoch": 0.1, "grad_norm": 68.84182709142127, "learning_rate": 1.9145299145299143e-06, "logits/chosen": -0.8668128252029419, "logits/rejected": -0.8128141164779663, "logps/chosen": -615.441162109375, "logps/rejected": -841.2947998046875, "loss": 0.4201, "rewards/accuracies": 0.9375, "rewards/chosen": -0.30144259333610535, "rewards/margins": 0.7650531530380249, "rewards/rejected": -1.0664957761764526, "step": 112 }, { "epoch": 0.1, "grad_norm": 72.82730282619598, "learning_rate": 1.9316239316239316e-06, "logits/chosen": -0.8395406007766724, "logits/rejected": -0.7451783418655396, "logps/chosen": -712.5833129882812, "logps/rejected": -855.0979614257812, "loss": 0.3898, "rewards/accuracies": 0.9375, "rewards/chosen": -0.15199768543243408, "rewards/margins": 0.9080096483230591, "rewards/rejected": -1.0600073337554932, "step": 113 }, { "epoch": 0.1, "grad_norm": 93.92928292851688, "learning_rate": 1.9487179487179485e-06, "logits/chosen": -0.8096364736557007, "logits/rejected": -0.7551666498184204, "logps/chosen": -771.3917236328125, "logps/rejected": -970.24462890625, "loss": 0.4669, "rewards/accuracies": 0.8125, "rewards/chosen": -0.45426732301712036, "rewards/margins": 0.7061700820922852, "rewards/rejected": -1.1604373455047607, "step": 114 }, { "epoch": 0.1, "grad_norm": 79.1858750636432, "learning_rate": 1.9658119658119658e-06, "logits/chosen": -0.8325074911117554, "logits/rejected": -0.7650719285011292, "logps/chosen": -597.4686279296875, "logps/rejected": -716.0536499023438, "loss": 0.4879, "rewards/accuracies": 0.875, "rewards/chosen": -0.2944122552871704, "rewards/margins": 0.6035362482070923, "rewards/rejected": -0.8979485034942627, "step": 115 }, { "epoch": 0.1, "grad_norm": 72.37840288546316, "learning_rate": 1.982905982905983e-06, "logits/chosen": -0.807362973690033, "logits/rejected": -0.7953593730926514, "logps/chosen": -589.3651733398438, "logps/rejected": -734.3743896484375, "loss": 0.4439, "rewards/accuracies": 0.875, "rewards/chosen": -0.3563947081565857, "rewards/margins": 0.7376902103424072, "rewards/rejected": -1.0940849781036377, "step": 116 }, { "epoch": 0.1, "grad_norm": 75.5794034970188, "learning_rate": 2e-06, "logits/chosen": -0.8776167631149292, "logits/rejected": -0.8628365993499756, "logps/chosen": -434.11993408203125, "logps/rejected": -607.8153076171875, "loss": 0.5061, "rewards/accuracies": 0.875, "rewards/chosen": -0.34286579489707947, "rewards/margins": 0.4919288456439972, "rewards/rejected": -0.8347946405410767, "step": 117 }, { "epoch": 0.1, "grad_norm": 68.7024215365996, "learning_rate": 1.999995515454355e-06, "logits/chosen": -0.8011682033538818, "logits/rejected": -0.7599575519561768, "logps/chosen": -565.426513671875, "logps/rejected": -722.574951171875, "loss": 0.4489, "rewards/accuracies": 0.9375, "rewards/chosen": -0.390835702419281, "rewards/margins": 0.7291101813316345, "rewards/rejected": -1.1199458837509155, "step": 118 }, { "epoch": 0.1, "grad_norm": 69.50092820504294, "learning_rate": 1.999982061857643e-06, "logits/chosen": -0.9195200204849243, "logits/rejected": -0.8597877025604248, "logps/chosen": -695.9260864257812, "logps/rejected": -779.2306518554688, "loss": 0.4423, "rewards/accuracies": 0.875, "rewards/chosen": -0.2885461449623108, "rewards/margins": 0.7777824401855469, "rewards/rejected": -1.0663286447525024, "step": 119 }, { "epoch": 0.1, "grad_norm": 71.50992646655071, "learning_rate": 1.9999596393305298e-06, "logits/chosen": -0.8019847869873047, "logits/rejected": -0.7569107413291931, "logps/chosen": -461.3133544921875, "logps/rejected": -575.7021484375, "loss": 0.5173, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2865118980407715, "rewards/margins": 0.47288602590560913, "rewards/rejected": -0.7593979239463806, "step": 120 }, { "epoch": 0.1, "grad_norm": 77.06119329349305, "learning_rate": 1.9999282480741252e-06, "logits/chosen": -0.8069489002227783, "logits/rejected": -0.7503291368484497, "logps/chosen": -412.9802551269531, "logps/rejected": -491.18585205078125, "loss": 0.5334, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1733241081237793, "rewards/margins": 0.42330116033554077, "rewards/rejected": -0.5966252684593201, "step": 121 }, { "epoch": 0.1, "grad_norm": 87.32968201538856, "learning_rate": 1.999887888369981e-06, "logits/chosen": -0.8172651529312134, "logits/rejected": -0.7478220462799072, "logps/chosen": -713.5738525390625, "logps/rejected": -749.0739135742188, "loss": 0.4664, "rewards/accuracies": 0.875, "rewards/chosen": -0.31455856561660767, "rewards/margins": 0.675393283367157, "rewards/rejected": -0.9899518489837646, "step": 122 }, { "epoch": 0.11, "grad_norm": 60.70810298598834, "learning_rate": 1.999838560580086e-06, "logits/chosen": -0.9219634532928467, "logits/rejected": -0.8740679025650024, "logps/chosen": -444.17626953125, "logps/rejected": -695.4278564453125, "loss": 0.4462, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2280620038509369, "rewards/margins": 0.6651239395141602, "rewards/rejected": -0.8931859731674194, "step": 123 }, { "epoch": 0.11, "grad_norm": 71.52073306791745, "learning_rate": 1.9997802651468664e-06, "logits/chosen": -0.797703742980957, "logits/rejected": -0.7664157152175903, "logps/chosen": -479.1082763671875, "logps/rejected": -589.7113037109375, "loss": 0.4608, "rewards/accuracies": 0.9375, "rewards/chosen": -0.24791169166564941, "rewards/margins": 0.5920776128768921, "rewards/rejected": -0.8399893045425415, "step": 124 }, { "epoch": 0.11, "grad_norm": 79.2006988044831, "learning_rate": 1.999713002593179e-06, "logits/chosen": -0.8782462477684021, "logits/rejected": -0.8354315757751465, "logps/chosen": -541.5379638671875, "logps/rejected": -650.455810546875, "loss": 0.5032, "rewards/accuracies": 0.875, "rewards/chosen": -0.30667245388031006, "rewards/margins": 0.5787320137023926, "rewards/rejected": -0.8854044675827026, "step": 125 }, { "epoch": 0.11, "grad_norm": 76.74145047917592, "learning_rate": 1.999636773522308e-06, "logits/chosen": -0.8741164207458496, "logits/rejected": -0.8457087278366089, "logps/chosen": -482.1501770019531, "logps/rejected": -639.4264526367188, "loss": 0.4945, "rewards/accuracies": 0.875, "rewards/chosen": -0.34282004833221436, "rewards/margins": 0.5826435089111328, "rewards/rejected": -0.9254635572433472, "step": 126 }, { "epoch": 0.11, "grad_norm": 72.2024867540662, "learning_rate": 1.999551578617958e-06, "logits/chosen": -0.8951029181480408, "logits/rejected": -0.8367560505867004, "logps/chosen": -508.193115234375, "logps/rejected": -651.3898315429688, "loss": 0.4561, "rewards/accuracies": 0.8125, "rewards/chosen": -0.19248032569885254, "rewards/margins": 0.6740570068359375, "rewards/rejected": -0.86653733253479, "step": 127 }, { "epoch": 0.11, "grad_norm": 68.33385779419781, "learning_rate": 1.999457418644251e-06, "logits/chosen": -0.8008483052253723, "logits/rejected": -0.7975972890853882, "logps/chosen": -455.5728454589844, "logps/rejected": -730.8447875976562, "loss": 0.4601, "rewards/accuracies": 0.75, "rewards/chosen": -0.35138142108917236, "rewards/margins": 0.8937031030654907, "rewards/rejected": -1.245084524154663, "step": 128 }, { "epoch": 0.11, "grad_norm": 60.09864215410157, "learning_rate": 1.9993542944457167e-06, "logits/chosen": -0.7812170386314392, "logits/rejected": -0.7546092867851257, "logps/chosen": -365.2407531738281, "logps/rejected": -650.365966796875, "loss": 0.3604, "rewards/accuracies": 1.0, "rewards/chosen": -0.20425821840763092, "rewards/margins": 0.9624660611152649, "rewards/rejected": -1.1667242050170898, "step": 129 }, { "epoch": 0.11, "grad_norm": 81.74557893605306, "learning_rate": 1.999242206947284e-06, "logits/chosen": -0.8479536175727844, "logits/rejected": -0.7998827695846558, "logps/chosen": -603.7364501953125, "logps/rejected": -804.69482421875, "loss": 0.439, "rewards/accuracies": 0.875, "rewards/chosen": -0.47630196809768677, "rewards/margins": 0.7274442911148071, "rewards/rejected": -1.2037461996078491, "step": 130 }, { "epoch": 0.11, "grad_norm": 70.49366758942749, "learning_rate": 1.999121157154277e-06, "logits/chosen": -0.8379110097885132, "logits/rejected": -0.7947043180465698, "logps/chosen": -558.55615234375, "logps/rejected": -611.01611328125, "loss": 0.4813, "rewards/accuracies": 0.875, "rewards/chosen": -0.37627291679382324, "rewards/margins": 0.6493163704872131, "rewards/rejected": -1.0255892276763916, "step": 131 }, { "epoch": 0.11, "grad_norm": 72.10084104635021, "learning_rate": 1.9989911461524012e-06, "logits/chosen": -0.7971664667129517, "logits/rejected": -0.7829629182815552, "logps/chosen": -392.22967529296875, "logps/rejected": -476.143798828125, "loss": 0.5238, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3677898347377777, "rewards/margins": 0.45336997509002686, "rewards/rejected": -0.8211597800254822, "step": 132 }, { "epoch": 0.11, "grad_norm": 58.32316467653529, "learning_rate": 1.9988521751077387e-06, "logits/chosen": -0.8165258765220642, "logits/rejected": -0.7373073101043701, "logps/chosen": -504.18206787109375, "logps/rejected": -669.3347778320312, "loss": 0.3772, "rewards/accuracies": 1.0, "rewards/chosen": -0.23694676160812378, "rewards/margins": 0.9710106253623962, "rewards/rejected": -1.2079572677612305, "step": 133 }, { "epoch": 0.11, "grad_norm": 62.43807799568858, "learning_rate": 1.9987042452667324e-06, "logits/chosen": -0.8428569436073303, "logits/rejected": -0.7920178174972534, "logps/chosen": -502.78192138671875, "logps/rejected": -705.2931518554688, "loss": 0.4184, "rewards/accuracies": 0.875, "rewards/chosen": -0.3294200003147125, "rewards/margins": 0.806629478931427, "rewards/rejected": -1.136049509048462, "step": 134 }, { "epoch": 0.12, "grad_norm": 78.44898603046353, "learning_rate": 1.9985473579561792e-06, "logits/chosen": -0.8491930961608887, "logits/rejected": -0.8058072328567505, "logps/chosen": -486.43756103515625, "logps/rejected": -606.3853759765625, "loss": 0.4936, "rewards/accuracies": 0.8125, "rewards/chosen": -0.43683668971061707, "rewards/margins": 0.5353400111198425, "rewards/rejected": -0.9721767902374268, "step": 135 }, { "epoch": 0.12, "grad_norm": 64.12841449779542, "learning_rate": 1.998381514583215e-06, "logits/chosen": -0.772560715675354, "logits/rejected": -0.7717125415802002, "logps/chosen": -540.2191162109375, "logps/rejected": -791.0310668945312, "loss": 0.3893, "rewards/accuracies": 0.9375, "rewards/chosen": -0.38439521193504333, "rewards/margins": 0.9186478853225708, "rewards/rejected": -1.3030431270599365, "step": 136 }, { "epoch": 0.12, "grad_norm": 64.02262479770808, "learning_rate": 1.9982067166353048e-06, "logits/chosen": -0.8253116607666016, "logits/rejected": -0.7849256992340088, "logps/chosen": -575.468017578125, "logps/rejected": -662.4056396484375, "loss": 0.4202, "rewards/accuracies": 0.875, "rewards/chosen": -0.28426802158355713, "rewards/margins": 0.9540620446205139, "rewards/rejected": -1.2383301258087158, "step": 137 }, { "epoch": 0.12, "grad_norm": 82.62839604239348, "learning_rate": 1.998022965680227e-06, "logits/chosen": -0.8180713653564453, "logits/rejected": -0.7528756260871887, "logps/chosen": -600.6514892578125, "logps/rejected": -630.1636962890625, "loss": 0.5035, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5511524677276611, "rewards/margins": 0.5072042346000671, "rewards/rejected": -1.058356761932373, "step": 138 }, { "epoch": 0.12, "grad_norm": 71.44852839544707, "learning_rate": 1.997830263366061e-06, "logits/chosen": -0.7998161315917969, "logits/rejected": -0.7412513494491577, "logps/chosen": -513.164306640625, "logps/rejected": -934.647705078125, "loss": 0.4242, "rewards/accuracies": 1.0, "rewards/chosen": -0.18442249298095703, "rewards/margins": 0.7800825834274292, "rewards/rejected": -0.9645050168037415, "step": 139 }, { "epoch": 0.12, "grad_norm": 52.61456092779066, "learning_rate": 1.9976286114211705e-06, "logits/chosen": -0.8468146324157715, "logits/rejected": -0.78184574842453, "logps/chosen": -439.827880859375, "logps/rejected": -507.6515808105469, "loss": 0.3766, "rewards/accuracies": 0.9375, "rewards/chosen": -0.20624279975891113, "rewards/margins": 0.8863797187805176, "rewards/rejected": -1.0926225185394287, "step": 140 }, { "epoch": 0.12, "grad_norm": 69.57744451282258, "learning_rate": 1.997418011654192e-06, "logits/chosen": -0.9197933673858643, "logits/rejected": -0.8328337073326111, "logps/chosen": -429.9018249511719, "logps/rejected": -433.46246337890625, "loss": 0.5531, "rewards/accuracies": 0.625, "rewards/chosen": -0.3884229063987732, "rewards/margins": 0.45711952447891235, "rewards/rejected": -0.8455424308776855, "step": 141 }, { "epoch": 0.12, "grad_norm": 59.410206466636474, "learning_rate": 1.997198465954012e-06, "logits/chosen": -0.8448234796524048, "logits/rejected": -0.806464672088623, "logps/chosen": -513.1571655273438, "logps/rejected": -833.9468383789062, "loss": 0.3146, "rewards/accuracies": 0.9375, "rewards/chosen": -0.37702399492263794, "rewards/margins": 1.382176160812378, "rewards/rejected": -1.759200096130371, "step": 142 }, { "epoch": 0.12, "grad_norm": 72.20471516742066, "learning_rate": 1.9969699762897573e-06, "logits/chosen": -0.915653645992279, "logits/rejected": -0.8454491496086121, "logps/chosen": -619.8283081054688, "logps/rejected": -733.0457763671875, "loss": 0.4527, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5407058000564575, "rewards/margins": 0.7225282192230225, "rewards/rejected": -1.2632339000701904, "step": 143 }, { "epoch": 0.12, "grad_norm": 59.75240456422268, "learning_rate": 1.9967325447107722e-06, "logits/chosen": -0.8744698762893677, "logits/rejected": -0.8536149859428406, "logps/chosen": -528.2760009765625, "logps/rejected": -776.7694091796875, "loss": 0.3287, "rewards/accuracies": 1.0, "rewards/chosen": -0.4334297776222229, "rewards/margins": 1.1689581871032715, "rewards/rejected": -1.6023879051208496, "step": 144 }, { "epoch": 0.12, "grad_norm": 60.74667932979385, "learning_rate": 1.996486173346602e-06, "logits/chosen": -0.8356216549873352, "logits/rejected": -0.8344606757164001, "logps/chosen": -483.53900146484375, "logps/rejected": -659.2405395507812, "loss": 0.3723, "rewards/accuracies": 1.0, "rewards/chosen": -0.35159799456596375, "rewards/margins": 0.9288654327392578, "rewards/rejected": -1.280463457107544, "step": 145 }, { "epoch": 0.13, "grad_norm": 67.71810220038617, "learning_rate": 1.996230864406974e-06, "logits/chosen": -0.9083362817764282, "logits/rejected": -0.8268660306930542, "logps/chosen": -633.2772827148438, "logps/rejected": -801.44140625, "loss": 0.3127, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2331371307373047, "rewards/margins": 1.2440853118896484, "rewards/rejected": -1.4772224426269531, "step": 146 }, { "epoch": 0.13, "grad_norm": 65.4351547090826, "learning_rate": 1.9959666201817776e-06, "logits/chosen": -0.8755106925964355, "logits/rejected": -0.8636406064033508, "logps/chosen": -507.8783264160156, "logps/rejected": -684.91552734375, "loss": 0.3942, "rewards/accuracies": 0.875, "rewards/chosen": -0.40344667434692383, "rewards/margins": 0.9445846080780029, "rewards/rejected": -1.3480312824249268, "step": 147 }, { "epoch": 0.13, "grad_norm": 62.6842472143847, "learning_rate": 1.9956934430410437e-06, "logits/chosen": -0.903535008430481, "logits/rejected": -0.8534011840820312, "logps/chosen": -395.8306884765625, "logps/rejected": -533.202392578125, "loss": 0.4462, "rewards/accuracies": 0.875, "rewards/chosen": -0.3898427486419678, "rewards/margins": 0.7070665955543518, "rewards/rejected": -1.0969094038009644, "step": 148 }, { "epoch": 0.13, "grad_norm": 56.18745353407791, "learning_rate": 1.995411335434922e-06, "logits/chosen": -0.8835330605506897, "logits/rejected": -0.844456136226654, "logps/chosen": -450.849853515625, "logps/rejected": -655.7808837890625, "loss": 0.3416, "rewards/accuracies": 0.875, "rewards/chosen": -0.2940594255924225, "rewards/margins": 1.1423484086990356, "rewards/rejected": -1.4364076852798462, "step": 149 }, { "epoch": 0.13, "grad_norm": 63.93201700244688, "learning_rate": 1.995120299893662e-06, "logits/chosen": -0.8894577622413635, "logits/rejected": -0.8500162363052368, "logps/chosen": -529.0599365234375, "logps/rejected": -826.4214477539062, "loss": 0.3375, "rewards/accuracies": 0.75, "rewards/chosen": -0.28005659580230713, "rewards/margins": 1.3898392915725708, "rewards/rejected": -1.6698957681655884, "step": 150 }, { "epoch": 0.13, "grad_norm": 66.03579836821562, "learning_rate": 1.994820339027588e-06, "logits/chosen": -0.9007207155227661, "logits/rejected": -0.8309506177902222, "logps/chosen": -554.010009765625, "logps/rejected": -609.4149169921875, "loss": 0.4365, "rewards/accuracies": 0.875, "rewards/chosen": -0.41904115676879883, "rewards/margins": 0.7743521928787231, "rewards/rejected": -1.193393349647522, "step": 151 }, { "epoch": 0.13, "grad_norm": 63.85789808273116, "learning_rate": 1.9945114555270767e-06, "logits/chosen": -0.8752983808517456, "logits/rejected": -0.7941920757293701, "logps/chosen": -763.06640625, "logps/rejected": -773.3269653320312, "loss": 0.3656, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3239164352416992, "rewards/margins": 0.992270290851593, "rewards/rejected": -1.3161866664886475, "step": 152 }, { "epoch": 0.13, "grad_norm": 65.57581932182828, "learning_rate": 1.994193652162532e-06, "logits/chosen": -0.9188427925109863, "logits/rejected": -0.855987548828125, "logps/chosen": -587.0689086914062, "logps/rejected": -663.8291015625, "loss": 0.3913, "rewards/accuracies": 1.0, "rewards/chosen": -0.26296913623809814, "rewards/margins": 0.8903168439865112, "rewards/rejected": -1.1532859802246094, "step": 153 }, { "epoch": 0.13, "grad_norm": 67.85437233358816, "learning_rate": 1.993866931784361e-06, "logits/chosen": -0.8676232099533081, "logits/rejected": -0.8185759782791138, "logps/chosen": -503.1413269042969, "logps/rejected": -732.5165405273438, "loss": 0.396, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3670608699321747, "rewards/margins": 0.9600380659103394, "rewards/rejected": -1.3270988464355469, "step": 154 }, { "epoch": 0.13, "grad_norm": 54.16281611590089, "learning_rate": 1.9935312973229495e-06, "logits/chosen": -0.8576045632362366, "logits/rejected": -0.8100249767303467, "logps/chosen": -475.8677062988281, "logps/rejected": -678.9979248046875, "loss": 0.3568, "rewards/accuracies": 0.9375, "rewards/chosen": -0.11242443323135376, "rewards/margins": 1.0827581882476807, "rewards/rejected": -1.1951825618743896, "step": 155 }, { "epoch": 0.13, "grad_norm": 65.15718998860318, "learning_rate": 1.993186751788633e-06, "logits/chosen": -0.877051055431366, "logits/rejected": -0.8274168372154236, "logps/chosen": -448.50860595703125, "logps/rejected": -672.7408447265625, "loss": 0.4592, "rewards/accuracies": 0.8125, "rewards/chosen": -0.34052014350891113, "rewards/margins": 0.9053160548210144, "rewards/rejected": -1.2458362579345703, "step": 156 }, { "epoch": 0.13, "grad_norm": 67.06517025156816, "learning_rate": 1.992833298271672e-06, "logits/chosen": -0.8686350584030151, "logits/rejected": -0.8159289360046387, "logps/chosen": -624.25830078125, "logps/rejected": -718.9054565429688, "loss": 0.3583, "rewards/accuracies": 0.875, "rewards/chosen": -0.3402397334575653, "rewards/margins": 1.1835572719573975, "rewards/rejected": -1.5237970352172852, "step": 157 }, { "epoch": 0.14, "grad_norm": 74.89861873824249, "learning_rate": 1.992470939942223e-06, "logits/chosen": -0.8808647394180298, "logits/rejected": -0.8215168118476868, "logps/chosen": -559.8514404296875, "logps/rejected": -716.53125, "loss": 0.4391, "rewards/accuracies": 0.875, "rewards/chosen": -0.5647865533828735, "rewards/margins": 0.8510969877243042, "rewards/rejected": -1.4158835411071777, "step": 158 }, { "epoch": 0.14, "grad_norm": 58.337214417043405, "learning_rate": 1.9920996800503117e-06, "logits/chosen": -0.8979121446609497, "logits/rejected": -0.8257559537887573, "logps/chosen": -567.0684814453125, "logps/rejected": -770.5816650390625, "loss": 0.4238, "rewards/accuracies": 0.8125, "rewards/chosen": -0.35424235463142395, "rewards/margins": 0.9558961391448975, "rewards/rejected": -1.310138463973999, "step": 159 }, { "epoch": 0.14, "grad_norm": 80.57856746871143, "learning_rate": 1.991719521925801e-06, "logits/chosen": -0.8393473625183105, "logits/rejected": -0.7686038017272949, "logps/chosen": -573.0647583007812, "logps/rejected": -683.67529296875, "loss": 0.5307, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3153061866760254, "rewards/margins": 0.6114298105239868, "rewards/rejected": -0.9267358779907227, "step": 160 }, { "epoch": 0.14, "grad_norm": 80.55202576409322, "learning_rate": 1.9913304689783644e-06, "logits/chosen": -0.8318350315093994, "logits/rejected": -0.7842640280723572, "logps/chosen": -634.771240234375, "logps/rejected": -816.2799072265625, "loss": 0.485, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5386346578598022, "rewards/margins": 1.1234714984893799, "rewards/rejected": -1.6621060371398926, "step": 161 }, { "epoch": 0.14, "grad_norm": 56.32400126506285, "learning_rate": 1.990932524697454e-06, "logits/chosen": -0.8687608242034912, "logits/rejected": -0.8195289373397827, "logps/chosen": -427.2818298339844, "logps/rejected": -599.0328369140625, "loss": 0.381, "rewards/accuracies": 0.875, "rewards/chosen": -0.1253286600112915, "rewards/margins": 0.9828024506568909, "rewards/rejected": -1.1081310510635376, "step": 162 }, { "epoch": 0.14, "grad_norm": 63.0643021542132, "learning_rate": 1.990525692652267e-06, "logits/chosen": -0.867060661315918, "logits/rejected": -0.8365573883056641, "logps/chosen": -675.904296875, "logps/rejected": -826.5540771484375, "loss": 0.3607, "rewards/accuracies": 0.875, "rewards/chosen": -0.33339425921440125, "rewards/margins": 1.0543599128723145, "rewards/rejected": -1.387754201889038, "step": 163 }, { "epoch": 0.14, "grad_norm": 67.5672182666397, "learning_rate": 1.990109976491718e-06, "logits/chosen": -0.842487096786499, "logits/rejected": -0.8210547566413879, "logps/chosen": -516.2122192382812, "logps/rejected": -652.6751708984375, "loss": 0.4432, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3034408688545227, "rewards/margins": 0.9074206948280334, "rewards/rejected": -1.2108616828918457, "step": 164 }, { "epoch": 0.14, "grad_norm": 61.464764505682616, "learning_rate": 1.9896853799444026e-06, "logits/chosen": -0.8813076019287109, "logits/rejected": -0.8397949934005737, "logps/chosen": -557.419677734375, "logps/rejected": -787.8724365234375, "loss": 0.4034, "rewards/accuracies": 0.875, "rewards/chosen": -0.3238334655761719, "rewards/margins": 0.8356885313987732, "rewards/rejected": -1.1595220565795898, "step": 165 }, { "epoch": 0.14, "grad_norm": 64.47594791896671, "learning_rate": 1.9892519068185667e-06, "logits/chosen": -0.8496279716491699, "logits/rejected": -0.7839698791503906, "logps/chosen": -526.0557250976562, "logps/rejected": -588.1710205078125, "loss": 0.4638, "rewards/accuracies": 0.8125, "rewards/chosen": 0.06029529124498367, "rewards/margins": 0.7168031930923462, "rewards/rejected": -0.6565079689025879, "step": 166 }, { "epoch": 0.14, "grad_norm": 54.170446969718355, "learning_rate": 1.98880956100207e-06, "logits/chosen": -0.8706177473068237, "logits/rejected": -0.8323687314987183, "logps/chosen": -409.9179992675781, "logps/rejected": -655.827880859375, "loss": 0.3739, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1820179671049118, "rewards/margins": 0.9817320108413696, "rewards/rejected": -1.1637499332427979, "step": 167 }, { "epoch": 0.14, "grad_norm": 114.6857522864625, "learning_rate": 1.9883583464623523e-06, "logits/chosen": -0.9125653505325317, "logits/rejected": -0.874718427658081, "logps/chosen": -599.5379638671875, "logps/rejected": -680.3572998046875, "loss": 0.6713, "rewards/accuracies": 0.8125, "rewards/chosen": -0.444853812456131, "rewards/margins": 0.5304194688796997, "rewards/rejected": -0.9752731919288635, "step": 168 }, { "epoch": 0.14, "grad_norm": 76.6076339710897, "learning_rate": 1.9878982672463987e-06, "logits/chosen": -0.9233865141868591, "logits/rejected": -0.8617924451828003, "logps/chosen": -559.2950439453125, "logps/rejected": -761.9292602539062, "loss": 0.4374, "rewards/accuracies": 0.75, "rewards/chosen": -0.4529637098312378, "rewards/margins": 0.9664942622184753, "rewards/rejected": -1.419458031654358, "step": 169 }, { "epoch": 0.15, "grad_norm": 80.24554751811056, "learning_rate": 1.987429327480701e-06, "logits/chosen": -0.8420670032501221, "logits/rejected": -0.8396788239479065, "logps/chosen": -451.66748046875, "logps/rejected": -740.0496826171875, "loss": 0.5313, "rewards/accuracies": 0.8125, "rewards/chosen": -0.43624886870384216, "rewards/margins": 0.7384995222091675, "rewards/rejected": -1.1747483015060425, "step": 170 }, { "epoch": 0.15, "grad_norm": 50.03612260766987, "learning_rate": 1.9869515313712226e-06, "logits/chosen": -0.9093887805938721, "logits/rejected": -0.8626548051834106, "logps/chosen": -497.70465087890625, "logps/rejected": -659.1704711914062, "loss": 0.3677, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3543227016925812, "rewards/margins": 1.0449423789978027, "rewards/rejected": -1.3992650508880615, "step": 171 }, { "epoch": 0.15, "grad_norm": 77.8668671112791, "learning_rate": 1.986464883203361e-06, "logits/chosen": -0.8573567867279053, "logits/rejected": -0.8139798641204834, "logps/chosen": -475.87750244140625, "logps/rejected": -573.9266357421875, "loss": 0.4809, "rewards/accuracies": 0.75, "rewards/chosen": -0.28878140449523926, "rewards/margins": 0.7627514600753784, "rewards/rejected": -1.0515328645706177, "step": 172 }, { "epoch": 0.15, "grad_norm": 51.49285833661925, "learning_rate": 1.985969387341908e-06, "logits/chosen": -0.7776519656181335, "logits/rejected": -0.7655439376831055, "logps/chosen": -494.25653076171875, "logps/rejected": -746.9952392578125, "loss": 0.2972, "rewards/accuracies": 1.0, "rewards/chosen": -0.3533231019973755, "rewards/margins": 1.2390961647033691, "rewards/rejected": -1.5924192667007446, "step": 173 }, { "epoch": 0.15, "grad_norm": 69.87605864741144, "learning_rate": 1.985465048231011e-06, "logits/chosen": -0.8837787508964539, "logits/rejected": -0.8801698088645935, "logps/chosen": -396.24224853515625, "logps/rejected": -653.4398193359375, "loss": 0.4339, "rewards/accuracies": 0.875, "rewards/chosen": -0.20290517807006836, "rewards/margins": 0.9276418089866638, "rewards/rejected": -1.130547046661377, "step": 174 }, { "epoch": 0.15, "grad_norm": 61.25270124043899, "learning_rate": 1.9849518703941335e-06, "logits/chosen": -0.8431894779205322, "logits/rejected": -0.7994276285171509, "logps/chosen": -387.6629638671875, "logps/rejected": -628.9801025390625, "loss": 0.3654, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2903192639350891, "rewards/margins": 0.9974017143249512, "rewards/rejected": -1.2877209186553955, "step": 175 }, { "epoch": 0.15, "grad_norm": 46.026434612264836, "learning_rate": 1.9844298584340143e-06, "logits/chosen": -0.8580862283706665, "logits/rejected": -0.8199708461761475, "logps/chosen": -487.328125, "logps/rejected": -728.115234375, "loss": 0.2489, "rewards/accuracies": 1.0, "rewards/chosen": -0.17964893579483032, "rewards/margins": 1.5337719917297363, "rewards/rejected": -1.7134208679199219, "step": 176 }, { "epoch": 0.15, "grad_norm": 53.4694134691448, "learning_rate": 1.9838990170326268e-06, "logits/chosen": -0.923569917678833, "logits/rejected": -0.8764654397964478, "logps/chosen": -553.5648193359375, "logps/rejected": -613.848876953125, "loss": 0.3562, "rewards/accuracies": 0.875, "rewards/chosen": -0.2990966737270355, "rewards/margins": 1.1244242191314697, "rewards/rejected": -1.4235209226608276, "step": 177 }, { "epoch": 0.15, "grad_norm": 59.50137781436025, "learning_rate": 1.983359350951136e-06, "logits/chosen": -0.8597773313522339, "logits/rejected": -0.8245177865028381, "logps/chosen": -515.1898803710938, "logps/rejected": -726.3103637695312, "loss": 0.3574, "rewards/accuracies": 0.875, "rewards/chosen": -0.3775560259819031, "rewards/margins": 1.0746281147003174, "rewards/rejected": -1.4521839618682861, "step": 178 }, { "epoch": 0.15, "grad_norm": 46.84015049044058, "learning_rate": 1.982810865029855e-06, "logits/chosen": -0.8746932148933411, "logits/rejected": -0.8448111414909363, "logps/chosen": -353.440185546875, "logps/rejected": -537.9382934570312, "loss": 0.4036, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05355449020862579, "rewards/margins": 0.8959289789199829, "rewards/rejected": -0.9494835138320923, "step": 179 }, { "epoch": 0.15, "grad_norm": 57.555815006437385, "learning_rate": 1.9822535641882054e-06, "logits/chosen": -0.9322961568832397, "logits/rejected": -0.8470557928085327, "logps/chosen": -650.664306640625, "logps/rejected": -664.6771850585938, "loss": 0.3578, "rewards/accuracies": 0.8125, "rewards/chosen": 0.353482186794281, "rewards/margins": 1.0259697437286377, "rewards/rejected": -0.6724876165390015, "step": 180 }, { "epoch": 0.16, "grad_norm": 52.64380215131483, "learning_rate": 1.9816874534246694e-06, "logits/chosen": -0.94826340675354, "logits/rejected": -0.8962709903717041, "logps/chosen": -436.15155029296875, "logps/rejected": -667.5868530273438, "loss": 0.3527, "rewards/accuracies": 1.0, "rewards/chosen": -0.3777049779891968, "rewards/margins": 1.048713207244873, "rewards/rejected": -1.4264180660247803, "step": 181 }, { "epoch": 0.16, "grad_norm": 61.337145217195584, "learning_rate": 1.981112537816745e-06, "logits/chosen": -0.9093761444091797, "logits/rejected": -0.8457399606704712, "logps/chosen": -687.2678833007812, "logps/rejected": -717.3057861328125, "loss": 0.3736, "rewards/accuracies": 0.875, "rewards/chosen": -0.5045949220657349, "rewards/margins": 1.1356408596038818, "rewards/rejected": -1.6402359008789062, "step": 182 }, { "epoch": 0.16, "grad_norm": 55.882554466879625, "learning_rate": 1.9805288225209037e-06, "logits/chosen": -0.9058928489685059, "logits/rejected": -0.8362715244293213, "logps/chosen": -718.281005859375, "logps/rejected": -1016.6908569335938, "loss": 0.2412, "rewards/accuracies": 1.0, "rewards/chosen": -0.5235175490379333, "rewards/margins": 1.552617073059082, "rewards/rejected": -2.07613468170166, "step": 183 }, { "epoch": 0.16, "grad_norm": 85.18071092892326, "learning_rate": 1.979936312772541e-06, "logits/chosen": -0.8438854217529297, "logits/rejected": -0.8518179059028625, "logps/chosen": -588.9202880859375, "logps/rejected": -629.9645385742188, "loss": 0.487, "rewards/accuracies": 0.75, "rewards/chosen": -0.6537317037582397, "rewards/margins": 0.815560519695282, "rewards/rejected": -1.469292163848877, "step": 184 }, { "epoch": 0.16, "grad_norm": 40.526405519388895, "learning_rate": 1.979335013885931e-06, "logits/chosen": -0.9247338771820068, "logits/rejected": -0.8627129793167114, "logps/chosen": -581.0001220703125, "logps/rejected": -817.584716796875, "loss": 0.2342, "rewards/accuracies": 1.0, "rewards/chosen": -0.21191178262233734, "rewards/margins": 1.7073981761932373, "rewards/rejected": -1.9193100929260254, "step": 185 }, { "epoch": 0.16, "grad_norm": 66.44985379036187, "learning_rate": 1.978724931254178e-06, "logits/chosen": -0.914710283279419, "logits/rejected": -0.8438661694526672, "logps/chosen": -668.4779052734375, "logps/rejected": -792.9580078125, "loss": 0.3533, "rewards/accuracies": 0.875, "rewards/chosen": -0.5750971436500549, "rewards/margins": 1.072385549545288, "rewards/rejected": -1.6474826335906982, "step": 186 }, { "epoch": 0.16, "grad_norm": 86.06670016108346, "learning_rate": 1.9781060703491694e-06, "logits/chosen": -0.8819316029548645, "logits/rejected": -0.8386082053184509, "logps/chosen": -863.5635375976562, "logps/rejected": -774.0555419921875, "loss": 0.4269, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5523691177368164, "rewards/margins": 0.9685525894165039, "rewards/rejected": -1.5209217071533203, "step": 187 }, { "epoch": 0.16, "grad_norm": 55.92321246323812, "learning_rate": 1.9774784367215245e-06, "logits/chosen": -0.9131249189376831, "logits/rejected": -0.8162829875946045, "logps/chosen": -660.4703369140625, "logps/rejected": -848.2841796875, "loss": 0.2694, "rewards/accuracies": 1.0, "rewards/chosen": -0.3814217150211334, "rewards/margins": 1.388995885848999, "rewards/rejected": -1.7704176902770996, "step": 188 }, { "epoch": 0.16, "grad_norm": 47.296797461987886, "learning_rate": 1.976842036000547e-06, "logits/chosen": -0.8920987844467163, "logits/rejected": -0.8426529169082642, "logps/chosen": -641.9872436523438, "logps/rejected": -836.463623046875, "loss": 0.254, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2708077132701874, "rewards/margins": 1.5072784423828125, "rewards/rejected": -1.7780860662460327, "step": 189 }, { "epoch": 0.16, "grad_norm": 53.86538055736526, "learning_rate": 1.976196873894173e-06, "logits/chosen": -0.8458589315414429, "logits/rejected": -0.8193198442459106, "logps/chosen": -632.2750854492188, "logps/rejected": -753.638916015625, "loss": 0.3357, "rewards/accuracies": 0.9375, "rewards/chosen": -0.37582093477249146, "rewards/margins": 1.048279881477356, "rewards/rejected": -1.4241008758544922, "step": 190 }, { "epoch": 0.16, "grad_norm": 54.769017667295614, "learning_rate": 1.9755429561889205e-06, "logits/chosen": -0.9080367088317871, "logits/rejected": -0.8948763608932495, "logps/chosen": -532.4638671875, "logps/rejected": -732.6849365234375, "loss": 0.3107, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3257588744163513, "rewards/margins": 1.5003809928894043, "rewards/rejected": -1.8261399269104004, "step": 191 }, { "epoch": 0.16, "grad_norm": 49.30835267251258, "learning_rate": 1.9748802887498368e-06, "logits/chosen": -0.8217437267303467, "logits/rejected": -0.8103574514389038, "logps/chosen": -381.0695495605469, "logps/rejected": -666.2606201171875, "loss": 0.3521, "rewards/accuracies": 0.875, "rewards/chosen": -0.370362251996994, "rewards/margins": 1.2242530584335327, "rewards/rejected": -1.5946152210235596, "step": 192 }, { "epoch": 0.17, "grad_norm": 39.70677456901692, "learning_rate": 1.9742088775204463e-06, "logits/chosen": -0.9067370891571045, "logits/rejected": -0.8767822980880737, "logps/chosen": -338.2500915527344, "logps/rejected": -571.2421875, "loss": 0.3016, "rewards/accuracies": 0.9375, "rewards/chosen": -0.18578293919563293, "rewards/margins": 1.3771023750305176, "rewards/rejected": -1.5628852844238281, "step": 193 }, { "epoch": 0.17, "grad_norm": 37.226494875826404, "learning_rate": 1.9735287285226984e-06, "logits/chosen": -0.82257080078125, "logits/rejected": -0.7932236194610596, "logps/chosen": -422.1122131347656, "logps/rejected": -737.2191162109375, "loss": 0.2516, "rewards/accuracies": 0.9375, "rewards/chosen": -0.22484631836414337, "rewards/margins": 1.4313842058181763, "rewards/rejected": -1.6562305688858032, "step": 194 }, { "epoch": 0.17, "grad_norm": 70.16182828765132, "learning_rate": 1.9728398478569113e-06, "logits/chosen": -0.9176048040390015, "logits/rejected": -0.8764656782150269, "logps/chosen": -725.5680541992188, "logps/rejected": -755.45849609375, "loss": 0.3571, "rewards/accuracies": 0.875, "rewards/chosen": -0.3184995651245117, "rewards/margins": 1.065685510635376, "rewards/rejected": -1.3841850757598877, "step": 195 }, { "epoch": 0.17, "grad_norm": 34.46849454206544, "learning_rate": 1.9721422417017185e-06, "logits/chosen": -0.9285256862640381, "logits/rejected": -0.882872462272644, "logps/chosen": -428.5306091308594, "logps/rejected": -768.4300537109375, "loss": 0.1898, "rewards/accuracies": 1.0, "rewards/chosen": -0.3067496120929718, "rewards/margins": 1.816524863243103, "rewards/rejected": -2.123274564743042, "step": 196 }, { "epoch": 0.17, "grad_norm": 68.90183135392742, "learning_rate": 1.971435916314013e-06, "logits/chosen": -0.851874589920044, "logits/rejected": -0.8348385095596313, "logps/chosen": -493.03045654296875, "logps/rejected": -677.19189453125, "loss": 0.4673, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6967003345489502, "rewards/margins": 1.1165249347686768, "rewards/rejected": -1.813225269317627, "step": 197 }, { "epoch": 0.17, "grad_norm": 47.658400615569406, "learning_rate": 1.970720878028892e-06, "logits/chosen": -0.9000753164291382, "logits/rejected": -0.8680911064147949, "logps/chosen": -508.72967529296875, "logps/rejected": -634.1900634765625, "loss": 0.3581, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3899848759174347, "rewards/margins": 1.2031042575836182, "rewards/rejected": -1.5930891036987305, "step": 198 }, { "epoch": 0.17, "grad_norm": 55.44563313958047, "learning_rate": 1.9699971332595994e-06, "logits/chosen": -0.9187253713607788, "logits/rejected": -0.9054923057556152, "logps/chosen": -457.06787109375, "logps/rejected": -592.365966796875, "loss": 0.3716, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3505123257637024, "rewards/margins": 1.2057658433914185, "rewards/rejected": -1.556278109550476, "step": 199 }, { "epoch": 0.17, "grad_norm": 50.47942610292587, "learning_rate": 1.9692646884974677e-06, "logits/chosen": -0.8936487436294556, "logits/rejected": -0.891660213470459, "logps/chosen": -641.30322265625, "logps/rejected": -801.561767578125, "loss": 0.3059, "rewards/accuracies": 0.9375, "rewards/chosen": -0.541850209236145, "rewards/margins": 1.4763543605804443, "rewards/rejected": -2.018204689025879, "step": 200 }, { "epoch": 0.17, "grad_norm": 55.01255322615232, "learning_rate": 1.968523550311861e-06, "logits/chosen": -0.789238452911377, "logits/rejected": -0.7798421382904053, "logps/chosen": -410.76922607421875, "logps/rejected": -635.4869995117188, "loss": 0.346, "rewards/accuracies": 1.0, "rewards/chosen": -0.5689191818237305, "rewards/margins": 1.2248907089233398, "rewards/rejected": -1.7938098907470703, "step": 201 }, { "epoch": 0.17, "grad_norm": 50.19339087306107, "learning_rate": 1.967773725350115e-06, "logits/chosen": -0.8648644685745239, "logits/rejected": -0.8245384693145752, "logps/chosen": -513.4756469726562, "logps/rejected": -949.0679321289062, "loss": 0.2709, "rewards/accuracies": 0.9375, "rewards/chosen": -0.48574915528297424, "rewards/margins": 1.6821191310882568, "rewards/rejected": -2.167868137359619, "step": 202 }, { "epoch": 0.17, "grad_norm": 46.244053912991895, "learning_rate": 1.9670152203374792e-06, "logits/chosen": -0.9518221020698547, "logits/rejected": -0.912643551826477, "logps/chosen": -523.580078125, "logps/rejected": -801.155029296875, "loss": 0.2803, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4601432681083679, "rewards/margins": 1.4460601806640625, "rewards/rejected": -1.9062033891677856, "step": 203 }, { "epoch": 0.17, "grad_norm": 86.33604808746128, "learning_rate": 1.9662480420770532e-06, "logits/chosen": -0.994076132774353, "logits/rejected": -0.9478283524513245, "logps/chosen": -551.2991943359375, "logps/rejected": -729.0390014648438, "loss": 0.4418, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9313852190971375, "rewards/margins": 0.9562951326370239, "rewards/rejected": -1.8876804113388062, "step": 204 }, { "epoch": 0.18, "grad_norm": 63.281110377931036, "learning_rate": 1.965472197449729e-06, "logits/chosen": -0.9953914880752563, "logits/rejected": -0.9329037666320801, "logps/chosen": -452.6296691894531, "logps/rejected": -581.63671875, "loss": 0.3916, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6000749468803406, "rewards/margins": 0.8758547306060791, "rewards/rejected": -1.4759297370910645, "step": 205 }, { "epoch": 0.18, "grad_norm": 61.35086829140238, "learning_rate": 1.964687693414129e-06, "logits/chosen": -0.979865550994873, "logits/rejected": -0.9563643932342529, "logps/chosen": -581.7698364257812, "logps/rejected": -679.9580078125, "loss": 0.4202, "rewards/accuracies": 0.75, "rewards/chosen": -0.6248217225074768, "rewards/margins": 1.460896372795105, "rewards/rejected": -2.0857181549072266, "step": 206 }, { "epoch": 0.18, "grad_norm": 66.71900425954713, "learning_rate": 1.96389453700654e-06, "logits/chosen": -1.0769457817077637, "logits/rejected": -1.0209407806396484, "logps/chosen": -532.283203125, "logps/rejected": -622.6073608398438, "loss": 0.387, "rewards/accuracies": 1.0, "rewards/chosen": -0.5653808116912842, "rewards/margins": 1.0661849975585938, "rewards/rejected": -1.631565809249878, "step": 207 }, { "epoch": 0.18, "grad_norm": 62.19269408403389, "learning_rate": 1.9630927353408553e-06, "logits/chosen": -1.0014283657073975, "logits/rejected": -0.978428840637207, "logps/chosen": -429.43206787109375, "logps/rejected": -454.16217041015625, "loss": 0.4662, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2503443956375122, "rewards/margins": 0.6810840964317322, "rewards/rejected": -0.9314284324645996, "step": 208 }, { "epoch": 0.18, "grad_norm": 50.77513215236795, "learning_rate": 1.9622822956085064e-06, "logits/chosen": -0.9936904907226562, "logits/rejected": -0.9656355381011963, "logps/chosen": -349.3846435546875, "logps/rejected": -486.734375, "loss": 0.3967, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2877790927886963, "rewards/margins": 0.8903200626373291, "rewards/rejected": -1.1780990362167358, "step": 209 }, { "epoch": 0.18, "grad_norm": 47.75737657842299, "learning_rate": 1.961463225078402e-06, "logits/chosen": -1.0057542324066162, "logits/rejected": -0.970438539981842, "logps/chosen": -588.5496215820312, "logps/rejected": -781.412841796875, "loss": 0.3138, "rewards/accuracies": 0.875, "rewards/chosen": -0.48664844036102295, "rewards/margins": 1.434399127960205, "rewards/rejected": -1.9210474491119385, "step": 210 }, { "epoch": 0.18, "grad_norm": 79.01899526621891, "learning_rate": 1.96063553109686e-06, "logits/chosen": -1.0867319107055664, "logits/rejected": -1.0168215036392212, "logps/chosen": -487.307861328125, "logps/rejected": -644.064208984375, "loss": 0.4499, "rewards/accuracies": 0.875, "rewards/chosen": -0.6227390766143799, "rewards/margins": 0.8902217149734497, "rewards/rejected": -1.5129609107971191, "step": 211 }, { "epoch": 0.18, "grad_norm": 82.77598850049955, "learning_rate": 1.9597992210875437e-06, "logits/chosen": -1.08547043800354, "logits/rejected": -1.0560569763183594, "logps/chosen": -607.5922241210938, "logps/rejected": -690.8724365234375, "loss": 0.5202, "rewards/accuracies": 0.875, "rewards/chosen": -0.5151211619377136, "rewards/margins": 1.051287055015564, "rewards/rejected": -1.5664081573486328, "step": 212 }, { "epoch": 0.18, "grad_norm": 48.76088540859079, "learning_rate": 1.9589543025513933e-06, "logits/chosen": -1.059824824333191, "logits/rejected": -0.9934217929840088, "logps/chosen": -470.1881408691406, "logps/rejected": -723.36474609375, "loss": 0.3258, "rewards/accuracies": 0.875, "rewards/chosen": -0.30257856845855713, "rewards/margins": 1.6323356628417969, "rewards/rejected": -1.9349143505096436, "step": 213 }, { "epoch": 0.18, "grad_norm": 47.021049477346295, "learning_rate": 1.958100783066561e-06, "logits/chosen": -0.962358295917511, "logits/rejected": -0.9208459854125977, "logps/chosen": -513.943359375, "logps/rejected": -723.7570190429688, "loss": 0.2964, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4522506594657898, "rewards/margins": 1.5093984603881836, "rewards/rejected": -1.961648941040039, "step": 214 }, { "epoch": 0.18, "grad_norm": 67.28984682350877, "learning_rate": 1.9572386702883406e-06, "logits/chosen": -1.0783056020736694, "logits/rejected": -0.9888733625411987, "logps/chosen": -603.6023559570312, "logps/rejected": -753.92431640625, "loss": 0.365, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4725470542907715, "rewards/margins": 1.2321484088897705, "rewards/rejected": -1.704695463180542, "step": 215 }, { "epoch": 0.19, "grad_norm": 63.185655249618556, "learning_rate": 1.9563679719491004e-06, "logits/chosen": -1.0363142490386963, "logits/rejected": -0.9776827692985535, "logps/chosen": -572.5197143554688, "logps/rejected": -808.026611328125, "loss": 0.2894, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6176286339759827, "rewards/margins": 1.396406888961792, "rewards/rejected": -2.01403546333313, "step": 216 }, { "epoch": 0.19, "grad_norm": 67.44667093371345, "learning_rate": 1.955488695858213e-06, "logits/chosen": -0.9920237064361572, "logits/rejected": -0.9496505260467529, "logps/chosen": -657.737548828125, "logps/rejected": -717.6353759765625, "loss": 0.3514, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7767983078956604, "rewards/margins": 1.014103889465332, "rewards/rejected": -1.7909021377563477, "step": 217 }, { "epoch": 0.19, "grad_norm": 53.50894829446239, "learning_rate": 1.9546008499019862e-06, "logits/chosen": -0.9635003805160522, "logits/rejected": -0.9544948935508728, "logps/chosen": -453.1898193359375, "logps/rejected": -485.15679931640625, "loss": 0.3781, "rewards/accuracies": 0.8125, "rewards/chosen": -0.20648762583732605, "rewards/margins": 1.0868135690689087, "rewards/rejected": -1.2933012247085571, "step": 218 }, { "epoch": 0.19, "grad_norm": 66.84092485372332, "learning_rate": 1.953704442043591e-06, "logits/chosen": -1.0142822265625, "logits/rejected": -0.9846318960189819, "logps/chosen": -579.3690185546875, "logps/rejected": -646.5301513671875, "loss": 0.4208, "rewards/accuracies": 0.75, "rewards/chosen": -0.5368911027908325, "rewards/margins": 0.9143826961517334, "rewards/rejected": -1.4512736797332764, "step": 219 }, { "epoch": 0.19, "grad_norm": 54.394417151247545, "learning_rate": 1.9527994803229923e-06, "logits/chosen": -1.0889610052108765, "logits/rejected": -1.04813814163208, "logps/chosen": -565.11767578125, "logps/rejected": -727.020263671875, "loss": 0.3149, "rewards/accuracies": 0.9375, "rewards/chosen": -0.34967750310897827, "rewards/margins": 1.3908071517944336, "rewards/rejected": -1.7404847145080566, "step": 220 }, { "epoch": 0.19, "grad_norm": 51.361945382744985, "learning_rate": 1.9518859728568736e-06, "logits/chosen": -1.0240240097045898, "logits/rejected": -0.9509984850883484, "logps/chosen": -501.5472717285156, "logps/rejected": -812.7438354492188, "loss": 0.2624, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5783480405807495, "rewards/margins": 1.6443132162094116, "rewards/rejected": -2.222661256790161, "step": 221 }, { "epoch": 0.19, "grad_norm": 48.47503303116554, "learning_rate": 1.950963927838567e-06, "logits/chosen": -0.9973111152648926, "logits/rejected": -0.9358306527137756, "logps/chosen": -522.234130859375, "logps/rejected": -693.0299682617188, "loss": 0.2573, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5451964139938354, "rewards/margins": 1.5520622730255127, "rewards/rejected": -2.0972585678100586, "step": 222 }, { "epoch": 0.19, "grad_norm": 51.089371754814614, "learning_rate": 1.9500333535379783e-06, "logits/chosen": -1.0092889070510864, "logits/rejected": -0.9825106859207153, "logps/chosen": -426.1991882324219, "logps/rejected": -652.1566772460938, "loss": 0.3307, "rewards/accuracies": 0.875, "rewards/chosen": -0.3618304133415222, "rewards/margins": 1.24405837059021, "rewards/rejected": -1.6058887243270874, "step": 223 }, { "epoch": 0.19, "grad_norm": 60.37540059428791, "learning_rate": 1.949094258301513e-06, "logits/chosen": -1.0221076011657715, "logits/rejected": -0.9619882106781006, "logps/chosen": -504.2724914550781, "logps/rejected": -607.927001953125, "loss": 0.4181, "rewards/accuracies": 0.9375, "rewards/chosen": -0.000596955418586731, "rewards/margins": 0.7537394762039185, "rewards/rejected": -0.7543364763259888, "step": 224 }, { "epoch": 0.19, "grad_norm": 80.2573736713337, "learning_rate": 1.9481466505520034e-06, "logits/chosen": -1.03997802734375, "logits/rejected": -0.9718859195709229, "logps/chosen": -615.118896484375, "logps/rejected": -781.4096069335938, "loss": 0.479, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7925766706466675, "rewards/margins": 0.7858482599258423, "rewards/rejected": -1.5784249305725098, "step": 225 }, { "epoch": 0.19, "grad_norm": 84.60494731211561, "learning_rate": 1.947190538788628e-06, "logits/chosen": -1.0425546169281006, "logits/rejected": -0.9868509769439697, "logps/chosen": -558.9896240234375, "logps/rejected": -764.4346923828125, "loss": 0.4825, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5591992139816284, "rewards/margins": 1.024910807609558, "rewards/rejected": -1.5841100215911865, "step": 226 }, { "epoch": 0.19, "grad_norm": 60.8006500130628, "learning_rate": 1.946225931586842e-06, "logits/chosen": -1.0703706741333008, "logits/rejected": -1.0248998403549194, "logps/chosen": -491.8314208984375, "logps/rejected": -698.8748779296875, "loss": 0.3613, "rewards/accuracies": 0.875, "rewards/chosen": -0.6801337003707886, "rewards/margins": 1.0860910415649414, "rewards/rejected": -1.7662248611450195, "step": 227 }, { "epoch": 0.2, "grad_norm": 57.94489583576497, "learning_rate": 1.9452528375982947e-06, "logits/chosen": -1.0509393215179443, "logits/rejected": -0.9977148771286011, "logps/chosen": -656.40625, "logps/rejected": -723.1608276367188, "loss": 0.3023, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2907889485359192, "rewards/margins": 1.4880188703536987, "rewards/rejected": -1.7788077592849731, "step": 228 }, { "epoch": 0.2, "grad_norm": 50.233865363274404, "learning_rate": 1.9442712655507552e-06, "logits/chosen": -1.024355173110962, "logits/rejected": -0.9878383874893188, "logps/chosen": -269.308837890625, "logps/rejected": -605.1350708007812, "loss": 0.3081, "rewards/accuracies": 0.9375, "rewards/chosen": -0.39276325702667236, "rewards/margins": 1.510910987854004, "rewards/rejected": -1.9036743640899658, "step": 229 }, { "epoch": 0.2, "grad_norm": 49.79390182856688, "learning_rate": 1.9432812242480326e-06, "logits/chosen": -1.0242599248886108, "logits/rejected": -0.9582304954528809, "logps/chosen": -533.2835693359375, "logps/rejected": -1009.9631958007812, "loss": 0.2663, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7415204048156738, "rewards/margins": 2.0787625312805176, "rewards/rejected": -2.8202829360961914, "step": 230 }, { "epoch": 0.2, "grad_norm": 65.90741394993712, "learning_rate": 1.9422827225698976e-06, "logits/chosen": -1.0297460556030273, "logits/rejected": -0.9815988540649414, "logps/chosen": -566.6522216796875, "logps/rejected": -730.4981079101562, "loss": 0.3494, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5851923823356628, "rewards/margins": 1.1192116737365723, "rewards/rejected": -1.7044041156768799, "step": 231 }, { "epoch": 0.2, "grad_norm": 61.83529221894195, "learning_rate": 1.9412757694720036e-06, "logits/chosen": -1.1333967447280884, "logits/rejected": -1.0697953701019287, "logps/chosen": -572.987548828125, "logps/rejected": -693.1698608398438, "loss": 0.3909, "rewards/accuracies": 0.875, "rewards/chosen": -0.7623158693313599, "rewards/margins": 1.2518970966339111, "rewards/rejected": -2.0142128467559814, "step": 232 }, { "epoch": 0.2, "grad_norm": 56.26007991200333, "learning_rate": 1.9402603739858045e-06, "logits/chosen": -1.0455517768859863, "logits/rejected": -1.022415280342102, "logps/chosen": -481.7862243652344, "logps/rejected": -479.995849609375, "loss": 0.4368, "rewards/accuracies": 0.75, "rewards/chosen": -0.4785584807395935, "rewards/margins": 0.7565988302230835, "rewards/rejected": -1.2351572513580322, "step": 233 }, { "epoch": 0.2, "grad_norm": 53.78226139691404, "learning_rate": 1.9392365452184743e-06, "logits/chosen": -1.2446177005767822, "logits/rejected": -1.1865254640579224, "logps/chosen": -427.7890625, "logps/rejected": -636.18017578125, "loss": 0.3274, "rewards/accuracies": 1.0, "rewards/chosen": -0.48274075984954834, "rewards/margins": 1.0873993635177612, "rewards/rejected": -1.5701401233673096, "step": 234 }, { "epoch": 0.2, "grad_norm": 51.257742769219234, "learning_rate": 1.938204292352828e-06, "logits/chosen": -1.0768308639526367, "logits/rejected": -1.0284702777862549, "logps/chosen": -443.66351318359375, "logps/rejected": -646.884033203125, "loss": 0.3885, "rewards/accuracies": 0.75, "rewards/chosen": -0.6171982288360596, "rewards/margins": 1.1787415742874146, "rewards/rejected": -1.7959399223327637, "step": 235 }, { "epoch": 0.2, "grad_norm": 57.867770622832765, "learning_rate": 1.9371636246472353e-06, "logits/chosen": -1.2808043956756592, "logits/rejected": -1.2098761796951294, "logps/chosen": -390.9673767089844, "logps/rejected": -572.98388671875, "loss": 0.4232, "rewards/accuracies": 0.6875, "rewards/chosen": -0.521543562412262, "rewards/margins": 1.1939754486083984, "rewards/rejected": -1.7155190706253052, "step": 236 }, { "epoch": 0.2, "grad_norm": 55.39431785341968, "learning_rate": 1.936114551435539e-06, "logits/chosen": -1.2188622951507568, "logits/rejected": -1.1490856409072876, "logps/chosen": -390.39910888671875, "logps/rejected": -651.4248046875, "loss": 0.3413, "rewards/accuracies": 0.8125, "rewards/chosen": -0.42151105403900146, "rewards/margins": 1.3620014190673828, "rewards/rejected": -1.7835125923156738, "step": 237 }, { "epoch": 0.2, "grad_norm": 73.23920168715607, "learning_rate": 1.935057082126974e-06, "logits/chosen": -1.290961742401123, "logits/rejected": -1.2376530170440674, "logps/chosen": -544.6636962890625, "logps/rejected": -715.2994995117188, "loss": 0.405, "rewards/accuracies": 0.875, "rewards/chosen": -0.37994012236595154, "rewards/margins": 1.172218918800354, "rewards/rejected": -1.552159070968628, "step": 238 }, { "epoch": 0.2, "grad_norm": 63.741831466021814, "learning_rate": 1.9339912262060782e-06, "logits/chosen": -1.1444615125656128, "logits/rejected": -1.0806961059570312, "logps/chosen": -594.5799560546875, "logps/rejected": -838.465087890625, "loss": 0.318, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4210588335990906, "rewards/margins": 1.4533946514129639, "rewards/rejected": -1.8744535446166992, "step": 239 }, { "epoch": 0.21, "grad_norm": 64.76955820674983, "learning_rate": 1.9329169932326104e-06, "logits/chosen": -1.163501501083374, "logits/rejected": -1.1278737783432007, "logps/chosen": -480.21234130859375, "logps/rejected": -577.05078125, "loss": 0.4336, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6075069308280945, "rewards/margins": 0.9887516498565674, "rewards/rejected": -1.5962586402893066, "step": 240 }, { "epoch": 0.21, "grad_norm": 44.25371144145174, "learning_rate": 1.9318343928414642e-06, "logits/chosen": -1.106231927871704, "logits/rejected": -1.084270715713501, "logps/chosen": -518.6808471679688, "logps/rejected": -728.62548828125, "loss": 0.2596, "rewards/accuracies": 1.0, "rewards/chosen": -0.43097513914108276, "rewards/margins": 1.509975552558899, "rewards/rejected": -1.940950632095337, "step": 241 }, { "epoch": 0.21, "grad_norm": 72.96662080811308, "learning_rate": 1.9307434347425826e-06, "logits/chosen": -1.1415469646453857, "logits/rejected": -1.0857826471328735, "logps/chosen": -627.71728515625, "logps/rejected": -860.5419921875, "loss": 0.2612, "rewards/accuracies": 1.0, "rewards/chosen": -0.6891728639602661, "rewards/margins": 1.6753425598144531, "rewards/rejected": -2.3645153045654297, "step": 242 }, { "epoch": 0.21, "grad_norm": 37.9024385841564, "learning_rate": 1.929644128720867e-06, "logits/chosen": -1.196105718612671, "logits/rejected": -1.1384600400924683, "logps/chosen": -535.7288208007812, "logps/rejected": -742.9649047851562, "loss": 0.2388, "rewards/accuracies": 1.0, "rewards/chosen": -0.5874618291854858, "rewards/margins": 1.8706109523773193, "rewards/rejected": -2.4580729007720947, "step": 243 }, { "epoch": 0.21, "grad_norm": 55.95864101578566, "learning_rate": 1.9285364846360943e-06, "logits/chosen": -1.1476514339447021, "logits/rejected": -1.1225823163986206, "logps/chosen": -625.8814697265625, "logps/rejected": -686.959716796875, "loss": 0.3768, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6543445587158203, "rewards/margins": 1.4013875722885132, "rewards/rejected": -2.055732250213623, "step": 244 }, { "epoch": 0.21, "grad_norm": 73.63823079566366, "learning_rate": 1.9274205124228243e-06, "logits/chosen": -1.0947630405426025, "logits/rejected": -1.04892897605896, "logps/chosen": -478.984375, "logps/rejected": -659.949951171875, "loss": 0.4166, "rewards/accuracies": 0.875, "rewards/chosen": -0.6773884296417236, "rewards/margins": 1.2850854396820068, "rewards/rejected": -1.9624738693237305, "step": 245 }, { "epoch": 0.21, "grad_norm": 83.81728984916164, "learning_rate": 1.926296222090315e-06, "logits/chosen": -1.0103175640106201, "logits/rejected": -1.0097943544387817, "logps/chosen": -879.529541015625, "logps/rejected": -963.75537109375, "loss": 0.2921, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1037704944610596, "rewards/margins": 1.5854477882385254, "rewards/rejected": -2.689218044281006, "step": 246 }, { "epoch": 0.21, "grad_norm": 64.47092513570345, "learning_rate": 1.925163623722428e-06, "logits/chosen": -1.226211667060852, "logits/rejected": -1.181449294090271, "logps/chosen": -425.39190673828125, "logps/rejected": -598.806884765625, "loss": 0.4008, "rewards/accuracies": 0.875, "rewards/chosen": -0.5162314176559448, "rewards/margins": 1.3047699928283691, "rewards/rejected": -1.8210015296936035, "step": 247 }, { "epoch": 0.21, "grad_norm": 41.54494614357395, "learning_rate": 1.9240227274775424e-06, "logits/chosen": -1.1234793663024902, "logits/rejected": -1.0364389419555664, "logps/chosen": -313.11126708984375, "logps/rejected": -506.63958740234375, "loss": 0.3275, "rewards/accuracies": 0.9375, "rewards/chosen": -0.37168002128601074, "rewards/margins": 1.317708969116211, "rewards/rejected": -1.6893889904022217, "step": 248 }, { "epoch": 0.21, "grad_norm": 32.141234054176465, "learning_rate": 1.9228735435884606e-06, "logits/chosen": -1.1416168212890625, "logits/rejected": -1.0254522562026978, "logps/chosen": -370.5738525390625, "logps/rejected": -811.5826416015625, "loss": 0.1658, "rewards/accuracies": 1.0, "rewards/chosen": -0.25858139991760254, "rewards/margins": 2.1482415199279785, "rewards/rejected": -2.406822681427002, "step": 249 }, { "epoch": 0.21, "grad_norm": 64.94292695390094, "learning_rate": 1.9217160823623165e-06, "logits/chosen": -1.029049277305603, "logits/rejected": -0.9953701496124268, "logps/chosen": -461.535400390625, "logps/rejected": -589.9942626953125, "loss": 0.3597, "rewards/accuracies": 0.875, "rewards/chosen": -0.6874184012413025, "rewards/margins": 1.177901029586792, "rewards/rejected": -1.8653193712234497, "step": 250 }, { "epoch": 0.22, "grad_norm": 41.45077725222385, "learning_rate": 1.920550354180487e-06, "logits/chosen": -1.163294792175293, "logits/rejected": -1.0786793231964111, "logps/chosen": -449.732177734375, "logps/rejected": -768.60498046875, "loss": 0.2243, "rewards/accuracies": 0.9375, "rewards/chosen": -0.49330294132232666, "rewards/margins": 1.8286638259887695, "rewards/rejected": -2.3219666481018066, "step": 251 }, { "epoch": 0.22, "grad_norm": 41.54008016919367, "learning_rate": 1.919376369498494e-06, "logits/chosen": -1.3669646978378296, "logits/rejected": -1.305161476135254, "logps/chosen": -576.000244140625, "logps/rejected": -688.1344604492188, "loss": 0.2346, "rewards/accuracies": 1.0, "rewards/chosen": -0.6032384634017944, "rewards/margins": 1.8792409896850586, "rewards/rejected": -2.4824795722961426, "step": 252 }, { "epoch": 0.22, "grad_norm": 39.54645804416662, "learning_rate": 1.9181941388459134e-06, "logits/chosen": -1.297188639640808, "logits/rejected": -1.179163932800293, "logps/chosen": -419.82672119140625, "logps/rejected": -684.82666015625, "loss": 0.1996, "rewards/accuracies": 1.0, "rewards/chosen": -0.4974091351032257, "rewards/margins": 1.855064034461975, "rewards/rejected": -2.352473258972168, "step": 253 }, { "epoch": 0.22, "grad_norm": 66.18713318470269, "learning_rate": 1.91700367282628e-06, "logits/chosen": -1.2055079936981201, "logits/rejected": -1.1353366374969482, "logps/chosen": -578.854736328125, "logps/rejected": -769.0565185546875, "loss": 0.3484, "rewards/accuracies": 0.875, "rewards/chosen": -0.7658222913742065, "rewards/margins": 1.182613492012024, "rewards/rejected": -1.9484357833862305, "step": 254 }, { "epoch": 0.22, "grad_norm": 65.30291958637173, "learning_rate": 1.9158049821169918e-06, "logits/chosen": -1.1541709899902344, "logits/rejected": -1.1019580364227295, "logps/chosen": -475.2603454589844, "logps/rejected": -729.6705322265625, "loss": 0.4214, "rewards/accuracies": 0.8125, "rewards/chosen": -0.21428640186786652, "rewards/margins": 1.0960335731506348, "rewards/rejected": -1.3103199005126953, "step": 255 }, { "epoch": 0.22, "grad_norm": 54.104609363402034, "learning_rate": 1.9145980774692156e-06, "logits/chosen": -1.2529370784759521, "logits/rejected": -1.2374582290649414, "logps/chosen": -668.109375, "logps/rejected": -857.1074829101562, "loss": 0.3398, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7821727991104126, "rewards/margins": 1.554344654083252, "rewards/rejected": -2.336517333984375, "step": 256 }, { "epoch": 0.22, "grad_norm": 39.49616108710431, "learning_rate": 1.913382969707789e-06, "logits/chosen": -1.3481879234313965, "logits/rejected": -1.238132357597351, "logps/chosen": -383.8392333984375, "logps/rejected": -611.764404296875, "loss": 0.2138, "rewards/accuracies": 1.0, "rewards/chosen": -0.3953891396522522, "rewards/margins": 1.674397587776184, "rewards/rejected": -2.069786548614502, "step": 257 }, { "epoch": 0.22, "grad_norm": 57.582765320833765, "learning_rate": 1.9121596697311243e-06, "logits/chosen": -1.295384168624878, "logits/rejected": -1.2667222023010254, "logps/chosen": -457.3524169921875, "logps/rejected": -507.17449951171875, "loss": 0.4294, "rewards/accuracies": 0.75, "rewards/chosen": -0.4994971752166748, "rewards/margins": 1.1396489143371582, "rewards/rejected": -1.639146089553833, "step": 258 }, { "epoch": 0.22, "grad_norm": 44.576501979807574, "learning_rate": 1.910928188511111e-06, "logits/chosen": -1.305537462234497, "logits/rejected": -1.2473948001861572, "logps/chosen": -395.11907958984375, "logps/rejected": -581.2030029296875, "loss": 0.2701, "rewards/accuracies": 1.0, "rewards/chosen": -0.6577807664871216, "rewards/margins": 1.5606365203857422, "rewards/rejected": -2.2184174060821533, "step": 259 }, { "epoch": 0.22, "grad_norm": 54.85712220293985, "learning_rate": 1.9096885370930173e-06, "logits/chosen": -1.2297561168670654, "logits/rejected": -1.1931822299957275, "logps/chosen": -437.64154052734375, "logps/rejected": -528.6766357421875, "loss": 0.415, "rewards/accuracies": 0.75, "rewards/chosen": -0.5516808032989502, "rewards/margins": 1.180525302886963, "rewards/rejected": -1.732206106185913, "step": 260 }, { "epoch": 0.22, "grad_norm": 56.27136975442982, "learning_rate": 1.9084407265953887e-06, "logits/chosen": -1.3499705791473389, "logits/rejected": -1.2703001499176025, "logps/chosen": -499.3698425292969, "logps/rejected": -805.4957275390625, "loss": 0.2961, "rewards/accuracies": 0.875, "rewards/chosen": -0.46790242195129395, "rewards/margins": 1.8862146139144897, "rewards/rejected": -2.354116916656494, "step": 261 }, { "epoch": 0.22, "grad_norm": 32.252199427793144, "learning_rate": 1.907184768209952e-06, "logits/chosen": -1.3755966424942017, "logits/rejected": -1.3271452188491821, "logps/chosen": -430.7845458984375, "logps/rejected": -596.4493408203125, "loss": 0.2377, "rewards/accuracies": 0.9375, "rewards/chosen": -0.42741551995277405, "rewards/margins": 1.5471506118774414, "rewards/rejected": -1.974565863609314, "step": 262 }, { "epoch": 0.23, "grad_norm": 77.79391791238616, "learning_rate": 1.9059206732015125e-06, "logits/chosen": -1.3845618963241577, "logits/rejected": -1.3517491817474365, "logps/chosen": -527.8436889648438, "logps/rejected": -609.9903564453125, "loss": 0.5141, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7712549567222595, "rewards/margins": 1.2328592538833618, "rewards/rejected": -2.0041141510009766, "step": 263 }, { "epoch": 0.23, "grad_norm": 62.17984948886498, "learning_rate": 1.9046484529078539e-06, "logits/chosen": -1.3209779262542725, "logits/rejected": -1.2516530752182007, "logps/chosen": -445.5620422363281, "logps/rejected": -651.108642578125, "loss": 0.3466, "rewards/accuracies": 0.875, "rewards/chosen": -0.5886340141296387, "rewards/margins": 1.6127263307571411, "rewards/rejected": -2.2013602256774902, "step": 264 }, { "epoch": 0.23, "grad_norm": 44.63454231126517, "learning_rate": 1.9033681187396362e-06, "logits/chosen": -1.380696415901184, "logits/rejected": -1.3153538703918457, "logps/chosen": -425.5417785644531, "logps/rejected": -688.957275390625, "loss": 0.279, "rewards/accuracies": 0.9375, "rewards/chosen": -0.534125566482544, "rewards/margins": 1.6324182748794556, "rewards/rejected": -2.166543960571289, "step": 265 }, { "epoch": 0.23, "grad_norm": 80.91662784064529, "learning_rate": 1.902079682180293e-06, "logits/chosen": -1.2613506317138672, "logits/rejected": -1.2341322898864746, "logps/chosen": -729.0734252929688, "logps/rejected": -847.1480102539062, "loss": 0.3454, "rewards/accuracies": 0.875, "rewards/chosen": -0.9773740768432617, "rewards/margins": 1.5395441055297852, "rewards/rejected": -2.516918182373047, "step": 266 }, { "epoch": 0.23, "grad_norm": 51.409969035837435, "learning_rate": 1.9007831547859299e-06, "logits/chosen": -1.3617085218429565, "logits/rejected": -1.2971055507659912, "logps/chosen": -602.2572631835938, "logps/rejected": -707.104736328125, "loss": 0.3547, "rewards/accuracies": 0.8125, "rewards/chosen": -0.602672815322876, "rewards/margins": 1.435534954071045, "rewards/rejected": -2.038207769393921, "step": 267 }, { "epoch": 0.23, "grad_norm": 59.718107084802284, "learning_rate": 1.899478548185219e-06, "logits/chosen": -1.3794385194778442, "logits/rejected": -1.2717067003250122, "logps/chosen": -433.0210266113281, "logps/rejected": -644.1810913085938, "loss": 0.356, "rewards/accuracies": 0.875, "rewards/chosen": -0.6217007040977478, "rewards/margins": 1.6019165515899658, "rewards/rejected": -2.2236175537109375, "step": 268 }, { "epoch": 0.23, "grad_norm": 69.55971626212478, "learning_rate": 1.8981658740792967e-06, "logits/chosen": -1.30446195602417, "logits/rejected": -1.25761079788208, "logps/chosen": -594.3277587890625, "logps/rejected": -729.9678955078125, "loss": 0.3306, "rewards/accuracies": 0.875, "rewards/chosen": -0.7294997572898865, "rewards/margins": 1.456763505935669, "rewards/rejected": -2.1862633228302, "step": 269 }, { "epoch": 0.23, "grad_norm": 49.94697620233205, "learning_rate": 1.8968451442416562e-06, "logits/chosen": -1.3442294597625732, "logits/rejected": -1.256706953048706, "logps/chosen": -462.27301025390625, "logps/rejected": -640.5385131835938, "loss": 0.3501, "rewards/accuracies": 0.875, "rewards/chosen": -0.6719030737876892, "rewards/margins": 1.362053632736206, "rewards/rejected": -2.03395676612854, "step": 270 }, { "epoch": 0.23, "grad_norm": 113.15812961073618, "learning_rate": 1.8955163705180443e-06, "logits/chosen": -1.2014787197113037, "logits/rejected": -1.2413536310195923, "logps/chosen": -776.939453125, "logps/rejected": -686.668212890625, "loss": 0.8989, "rewards/accuracies": 0.6875, "rewards/chosen": -1.169008731842041, "rewards/margins": 0.6716674566268921, "rewards/rejected": -1.8406760692596436, "step": 271 }, { "epoch": 0.23, "grad_norm": 63.927868317477646, "learning_rate": 1.894179564826354e-06, "logits/chosen": -1.3427788019180298, "logits/rejected": -1.270050287246704, "logps/chosen": -506.8124694824219, "logps/rejected": -714.5921630859375, "loss": 0.36, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5294536352157593, "rewards/margins": 1.8180158138275146, "rewards/rejected": -2.3474695682525635, "step": 272 }, { "epoch": 0.23, "grad_norm": 52.1760169157746, "learning_rate": 1.892834739156517e-06, "logits/chosen": -1.3242626190185547, "logits/rejected": -1.3110473155975342, "logps/chosen": -461.1047668457031, "logps/rejected": -539.557373046875, "loss": 0.3191, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4187151789665222, "rewards/margins": 1.3253779411315918, "rewards/rejected": -1.7440930604934692, "step": 273 }, { "epoch": 0.23, "grad_norm": 48.99631544015967, "learning_rate": 1.8914819055703983e-06, "logits/chosen": -1.3378002643585205, "logits/rejected": -1.2494276762008667, "logps/chosen": -472.304931640625, "logps/rejected": -652.1216430664062, "loss": 0.3131, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5154717564582825, "rewards/margins": 1.3171603679656982, "rewards/rejected": -1.832632064819336, "step": 274 }, { "epoch": 0.24, "grad_norm": 55.637858401703724, "learning_rate": 1.890121076201685e-06, "logits/chosen": -1.2824797630310059, "logits/rejected": -1.206345796585083, "logps/chosen": -503.8155212402344, "logps/rejected": -654.08056640625, "loss": 0.3554, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6268565058708191, "rewards/margins": 1.6319068670272827, "rewards/rejected": -2.258763313293457, "step": 275 }, { "epoch": 0.24, "grad_norm": 82.8687629552848, "learning_rate": 1.8887522632557804e-06, "logits/chosen": -1.2825202941894531, "logits/rejected": -1.2575864791870117, "logps/chosen": -591.2352294921875, "logps/rejected": -663.9232177734375, "loss": 0.4883, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8104920387268066, "rewards/margins": 0.9792019724845886, "rewards/rejected": -1.78969407081604, "step": 276 }, { "epoch": 0.24, "grad_norm": 83.84616386000505, "learning_rate": 1.887375479009693e-06, "logits/chosen": -1.312516450881958, "logits/rejected": -1.2433936595916748, "logps/chosen": -633.65283203125, "logps/rejected": -777.0487670898438, "loss": 0.5805, "rewards/accuracies": 0.875, "rewards/chosen": -0.9956063032150269, "rewards/margins": 1.1437734365463257, "rewards/rejected": -2.1393797397613525, "step": 277 }, { "epoch": 0.24, "grad_norm": 62.922250186992066, "learning_rate": 1.8859907358119257e-06, "logits/chosen": -1.346423625946045, "logits/rejected": -1.224915862083435, "logps/chosen": -625.4689331054688, "logps/rejected": -958.303955078125, "loss": 0.2429, "rewards/accuracies": 0.875, "rewards/chosen": -0.7824087142944336, "rewards/margins": 2.1031761169433594, "rewards/rejected": -2.885584831237793, "step": 278 }, { "epoch": 0.24, "grad_norm": 50.281759620820885, "learning_rate": 1.8845980460823674e-06, "logits/chosen": -1.232679843902588, "logits/rejected": -1.203260898590088, "logps/chosen": -696.5806274414062, "logps/rejected": -820.2708129882812, "loss": 0.2181, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6646475791931152, "rewards/margins": 1.7615255117416382, "rewards/rejected": -2.426173210144043, "step": 279 }, { "epoch": 0.24, "grad_norm": 91.55282932822111, "learning_rate": 1.8831974223121789e-06, "logits/chosen": -1.2700848579406738, "logits/rejected": -1.2630860805511475, "logps/chosen": -687.1458740234375, "logps/rejected": -726.3959350585938, "loss": 0.5542, "rewards/accuracies": 0.8125, "rewards/chosen": -0.814836859703064, "rewards/margins": 1.0548039674758911, "rewards/rejected": -1.869640827178955, "step": 280 }, { "epoch": 0.24, "grad_norm": 47.23631638320723, "learning_rate": 1.8817888770636828e-06, "logits/chosen": -1.2989399433135986, "logits/rejected": -1.2461373805999756, "logps/chosen": -542.2744140625, "logps/rejected": -695.8860473632812, "loss": 0.2762, "rewards/accuracies": 1.0, "rewards/chosen": -0.5193393230438232, "rewards/margins": 1.421765923500061, "rewards/rejected": -1.9411051273345947, "step": 281 }, { "epoch": 0.24, "grad_norm": 42.45351766075743, "learning_rate": 1.8803724229702501e-06, "logits/chosen": -1.3047549724578857, "logits/rejected": -1.2471988201141357, "logps/chosen": -623.1423950195312, "logps/rejected": -827.06201171875, "loss": 0.2677, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7911394834518433, "rewards/margins": 1.5670664310455322, "rewards/rejected": -2.358205795288086, "step": 282 }, { "epoch": 0.24, "grad_norm": 132.5548506788457, "learning_rate": 1.878948072736187e-06, "logits/chosen": -1.3195421695709229, "logits/rejected": -1.3033177852630615, "logps/chosen": -665.1011352539062, "logps/rejected": -786.5206298828125, "loss": 0.3534, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7998645305633545, "rewards/margins": 1.4624037742614746, "rewards/rejected": -2.262268304824829, "step": 283 }, { "epoch": 0.24, "grad_norm": 39.06583611828614, "learning_rate": 1.8775158391366205e-06, "logits/chosen": -1.4100770950317383, "logits/rejected": -1.359595775604248, "logps/chosen": -424.16839599609375, "logps/rejected": -588.47998046875, "loss": 0.3373, "rewards/accuracies": 0.875, "rewards/chosen": -0.7181679606437683, "rewards/margins": 1.374335765838623, "rewards/rejected": -2.092503786087036, "step": 284 }, { "epoch": 0.24, "grad_norm": 65.62725799965577, "learning_rate": 1.8760757350173844e-06, "logits/chosen": -1.3192460536956787, "logits/rejected": -1.311014175415039, "logps/chosen": -462.3755798339844, "logps/rejected": -593.5933837890625, "loss": 0.421, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7063227891921997, "rewards/margins": 1.034854769706726, "rewards/rejected": -1.7411775588989258, "step": 285 }, { "epoch": 0.25, "grad_norm": 88.17881420608741, "learning_rate": 1.8746277732949043e-06, "logits/chosen": -1.287336826324463, "logits/rejected": -1.2378259897232056, "logps/chosen": -743.6614379882812, "logps/rejected": -926.6859130859375, "loss": 0.3923, "rewards/accuracies": 0.9375, "rewards/chosen": -0.45865917205810547, "rewards/margins": 1.0680279731750488, "rewards/rejected": -1.5266873836517334, "step": 286 }, { "epoch": 0.25, "grad_norm": 92.58863179872984, "learning_rate": 1.873171966956081e-06, "logits/chosen": -1.3020586967468262, "logits/rejected": -1.2492753267288208, "logps/chosen": -561.0408325195312, "logps/rejected": -696.8612060546875, "loss": 0.4906, "rewards/accuracies": 0.75, "rewards/chosen": -0.4373791813850403, "rewards/margins": 1.1843516826629639, "rewards/rejected": -1.6217308044433594, "step": 287 }, { "epoch": 0.25, "grad_norm": 41.92668197491958, "learning_rate": 1.8717083290581745e-06, "logits/chosen": -1.3229628801345825, "logits/rejected": -1.3219010829925537, "logps/chosen": -593.58349609375, "logps/rejected": -677.1654663085938, "loss": 0.33, "rewards/accuracies": 0.9375, "rewards/chosen": -0.339139461517334, "rewards/margins": 1.3238906860351562, "rewards/rejected": -1.6630302667617798, "step": 288 }, { "epoch": 0.25, "grad_norm": 69.95582054379341, "learning_rate": 1.8702368727286868e-06, "logits/chosen": -1.2912516593933105, "logits/rejected": -1.2390421628952026, "logps/chosen": -511.74383544921875, "logps/rejected": -553.6732177734375, "loss": 0.4282, "rewards/accuracies": 0.75, "rewards/chosen": -0.4722836911678314, "rewards/margins": 1.0789742469787598, "rewards/rejected": -1.551257848739624, "step": 289 }, { "epoch": 0.25, "grad_norm": 44.21457140750985, "learning_rate": 1.8687576111652437e-06, "logits/chosen": -1.4131152629852295, "logits/rejected": -1.2697126865386963, "logps/chosen": -542.24951171875, "logps/rejected": -849.6591186523438, "loss": 0.2178, "rewards/accuracies": 1.0, "rewards/chosen": -0.7038062810897827, "rewards/margins": 2.04502534866333, "rewards/rejected": -2.7488315105438232, "step": 290 }, { "epoch": 0.25, "grad_norm": 64.80569375842471, "learning_rate": 1.8672705576354775e-06, "logits/chosen": -1.2966111898422241, "logits/rejected": -1.2468252182006836, "logps/chosen": -440.4309997558594, "logps/rejected": -538.8372802734375, "loss": 0.4205, "rewards/accuracies": 0.75, "rewards/chosen": -0.7583557367324829, "rewards/margins": 1.1637259721755981, "rewards/rejected": -1.922081708908081, "step": 291 }, { "epoch": 0.25, "grad_norm": 51.64831816245686, "learning_rate": 1.865775725476907e-06, "logits/chosen": -1.291834831237793, "logits/rejected": -1.2686530351638794, "logps/chosen": -423.56182861328125, "logps/rejected": -501.0770263671875, "loss": 0.4562, "rewards/accuracies": 0.75, "rewards/chosen": -0.3772410750389099, "rewards/margins": 1.0666435956954956, "rewards/rejected": -1.4438846111297607, "step": 292 }, { "epoch": 0.25, "grad_norm": 59.11445357687318, "learning_rate": 1.8642731280968182e-06, "logits/chosen": -1.3467607498168945, "logits/rejected": -1.3099944591522217, "logps/chosen": -549.3778076171875, "logps/rejected": -635.3603515625, "loss": 0.3034, "rewards/accuracies": 1.0, "rewards/chosen": -0.8041099309921265, "rewards/margins": 1.2853634357452393, "rewards/rejected": -2.0894734859466553, "step": 293 }, { "epoch": 0.25, "grad_norm": 62.39719980624235, "learning_rate": 1.8627627789721442e-06, "logits/chosen": -1.3086678981781006, "logits/rejected": -1.2643378973007202, "logps/chosen": -642.4073486328125, "logps/rejected": -816.996826171875, "loss": 0.3278, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6026842594146729, "rewards/margins": 1.8675678968429565, "rewards/rejected": -2.470252275466919, "step": 294 }, { "epoch": 0.25, "grad_norm": 61.69447609824399, "learning_rate": 1.8612446916493442e-06, "logits/chosen": -1.3641777038574219, "logits/rejected": -1.3854668140411377, "logps/chosen": -471.4262390136719, "logps/rejected": -504.0348815917969, "loss": 0.355, "rewards/accuracies": 0.8125, "rewards/chosen": -0.44899052381515503, "rewards/margins": 1.3087570667266846, "rewards/rejected": -1.7577476501464844, "step": 295 }, { "epoch": 0.25, "grad_norm": 51.73986225545052, "learning_rate": 1.8597188797442823e-06, "logits/chosen": -1.3421553373336792, "logits/rejected": -1.2874841690063477, "logps/chosen": -505.52557373046875, "logps/rejected": -656.2530517578125, "loss": 0.2572, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3865881562232971, "rewards/margins": 1.5258070230484009, "rewards/rejected": -1.9123951196670532, "step": 296 }, { "epoch": 0.25, "grad_norm": 71.58151264329189, "learning_rate": 1.8581853569421042e-06, "logits/chosen": -1.3670592308044434, "logits/rejected": -1.3198003768920898, "logps/chosen": -537.0482788085938, "logps/rejected": -658.83447265625, "loss": 0.3565, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5762161016464233, "rewards/margins": 1.3206223249435425, "rewards/rejected": -1.8968384265899658, "step": 297 }, { "epoch": 0.26, "grad_norm": 100.50925042279562, "learning_rate": 1.8566441369971163e-06, "logits/chosen": -1.2974328994750977, "logits/rejected": -1.2744688987731934, "logps/chosen": -639.2808227539062, "logps/rejected": -766.7714233398438, "loss": 0.6602, "rewards/accuracies": 0.8125, "rewards/chosen": -0.816308856010437, "rewards/margins": 1.0288530588150024, "rewards/rejected": -1.8451619148254395, "step": 298 }, { "epoch": 0.26, "grad_norm": 47.43785962452232, "learning_rate": 1.8550952337326606e-06, "logits/chosen": -1.365097165107727, "logits/rejected": -1.2711853981018066, "logps/chosen": -403.49005126953125, "logps/rejected": -644.4906616210938, "loss": 0.3325, "rewards/accuracies": 0.875, "rewards/chosen": -0.44394218921661377, "rewards/margins": 1.5209300518035889, "rewards/rejected": -1.9648722410202026, "step": 299 }, { "epoch": 0.26, "grad_norm": 49.38250053324777, "learning_rate": 1.8535386610409925e-06, "logits/chosen": -1.3787580728530884, "logits/rejected": -1.2780003547668457, "logps/chosen": -498.15521240234375, "logps/rejected": -632.5543212890625, "loss": 0.3357, "rewards/accuracies": 0.875, "rewards/chosen": -0.489728182554245, "rewards/margins": 1.3524258136749268, "rewards/rejected": -1.8421540260314941, "step": 300 }, { "epoch": 0.26, "grad_norm": 98.44072601146492, "learning_rate": 1.851974432883154e-06, "logits/chosen": -1.2081255912780762, "logits/rejected": -1.1339161396026611, "logps/chosen": -562.579833984375, "logps/rejected": -777.3492431640625, "loss": 0.6088, "rewards/accuracies": 0.625, "rewards/chosen": -0.0019558966159820557, "rewards/margins": 0.27304917573928833, "rewards/rejected": -0.2750050723552704, "step": 301 }, { "epoch": 0.26, "grad_norm": 91.11788015516005, "learning_rate": 1.8504025632888507e-06, "logits/chosen": -1.2894055843353271, "logits/rejected": -1.2341303825378418, "logps/chosen": -750.760986328125, "logps/rejected": -849.8092651367188, "loss": 0.4514, "rewards/accuracies": 0.875, "rewards/chosen": -0.8769577741622925, "rewards/margins": 1.1654422283172607, "rewards/rejected": -2.0423998832702637, "step": 302 }, { "epoch": 0.26, "grad_norm": 76.68001793142624, "learning_rate": 1.8488230663563241e-06, "logits/chosen": -1.2934482097625732, "logits/rejected": -1.351201057434082, "logps/chosen": -580.1993408203125, "logps/rejected": -513.2964477539062, "loss": 0.5818, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8330243229866028, "rewards/margins": 0.5873950719833374, "rewards/rejected": -1.420419454574585, "step": 303 }, { "epoch": 0.26, "grad_norm": 47.858777042823164, "learning_rate": 1.8472359562522266e-06, "logits/chosen": -1.3255586624145508, "logits/rejected": -1.2752617597579956, "logps/chosen": -406.1722412109375, "logps/rejected": -593.2540283203125, "loss": 0.3364, "rewards/accuracies": 0.875, "rewards/chosen": -0.43785959482192993, "rewards/margins": 1.5706452131271362, "rewards/rejected": -2.008504867553711, "step": 304 }, { "epoch": 0.26, "grad_norm": 85.19988100914014, "learning_rate": 1.8456412472114935e-06, "logits/chosen": -1.3651058673858643, "logits/rejected": -1.273531436920166, "logps/chosen": -481.7969970703125, "logps/rejected": -662.74658203125, "loss": 0.4825, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6961475610733032, "rewards/margins": 1.2806881666183472, "rewards/rejected": -1.9768357276916504, "step": 305 }, { "epoch": 0.26, "grad_norm": 37.84875798282611, "learning_rate": 1.8440389535372156e-06, "logits/chosen": -1.372624397277832, "logits/rejected": -1.293984293937683, "logps/chosen": -568.9570922851562, "logps/rejected": -868.6464233398438, "loss": 0.1884, "rewards/accuracies": 1.0, "rewards/chosen": -0.6600321531295776, "rewards/margins": 2.31733775138855, "rewards/rejected": -2.977370262145996, "step": 306 }, { "epoch": 0.26, "grad_norm": 47.54836575905022, "learning_rate": 1.8424290896005115e-06, "logits/chosen": -1.4119298458099365, "logits/rejected": -1.3349215984344482, "logps/chosen": -431.46490478515625, "logps/rejected": -647.0010375976562, "loss": 0.3487, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5202719569206238, "rewards/margins": 1.5132447481155396, "rewards/rejected": -2.0335168838500977, "step": 307 }, { "epoch": 0.26, "grad_norm": 49.36518160151795, "learning_rate": 1.8408116698403976e-06, "logits/chosen": -1.3197250366210938, "logits/rejected": -1.33034348487854, "logps/chosen": -673.7100219726562, "logps/rejected": -822.8350219726562, "loss": 0.2688, "rewards/accuracies": 1.0, "rewards/chosen": -0.7116418480873108, "rewards/margins": 1.844422698020935, "rewards/rejected": -2.5560646057128906, "step": 308 }, { "epoch": 0.27, "grad_norm": 54.41588382019316, "learning_rate": 1.8391867087636595e-06, "logits/chosen": -1.3856697082519531, "logits/rejected": -1.365121841430664, "logps/chosen": -472.591552734375, "logps/rejected": -553.991455078125, "loss": 0.4403, "rewards/accuracies": 0.75, "rewards/chosen": -0.7445403337478638, "rewards/margins": 0.8742175102233887, "rewards/rejected": -1.618757963180542, "step": 309 }, { "epoch": 0.27, "grad_norm": 34.64658589787295, "learning_rate": 1.8375542209447214e-06, "logits/chosen": -1.3953791856765747, "logits/rejected": -1.2682315111160278, "logps/chosen": -386.12139892578125, "logps/rejected": -713.0578002929688, "loss": 0.1955, "rewards/accuracies": 1.0, "rewards/chosen": -0.5013018250465393, "rewards/margins": 2.0638022422790527, "rewards/rejected": -2.5651040077209473, "step": 310 }, { "epoch": 0.27, "grad_norm": 73.96567656460611, "learning_rate": 1.8359142210255155e-06, "logits/chosen": -1.3954124450683594, "logits/rejected": -1.3836252689361572, "logps/chosen": -518.5062255859375, "logps/rejected": -664.3820190429688, "loss": 0.4891, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9950438737869263, "rewards/margins": 1.2294788360595703, "rewards/rejected": -2.224522829055786, "step": 311 }, { "epoch": 0.27, "grad_norm": 59.0139814547133, "learning_rate": 1.834266723715351e-06, "logits/chosen": -1.3194384574890137, "logits/rejected": -1.3285057544708252, "logps/chosen": -629.0936279296875, "logps/rejected": -770.8262329101562, "loss": 0.4121, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7154449820518494, "rewards/margins": 1.523475170135498, "rewards/rejected": -2.238920211791992, "step": 312 }, { "epoch": 0.27, "grad_norm": 51.14484872549365, "learning_rate": 1.8326117437907812e-06, "logits/chosen": -1.4025180339813232, "logits/rejected": -1.3029136657714844, "logps/chosen": -507.39154052734375, "logps/rejected": -731.9945678710938, "loss": 0.2758, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5277796983718872, "rewards/margins": 1.7257564067840576, "rewards/rejected": -2.2535362243652344, "step": 313 }, { "epoch": 0.27, "grad_norm": 66.36525875544719, "learning_rate": 1.8309492960954727e-06, "logits/chosen": -1.3588955402374268, "logits/rejected": -1.2931160926818848, "logps/chosen": -462.9432067871094, "logps/rejected": -654.9721069335938, "loss": 0.3511, "rewards/accuracies": 0.875, "rewards/chosen": -0.6080495715141296, "rewards/margins": 1.597927451133728, "rewards/rejected": -2.205976963043213, "step": 314 }, { "epoch": 0.27, "grad_norm": 54.03835330583822, "learning_rate": 1.82927939554007e-06, "logits/chosen": -1.3320766687393188, "logits/rejected": -1.2851673364639282, "logps/chosen": -712.2108154296875, "logps/rejected": -880.1512451171875, "loss": 0.2395, "rewards/accuracies": 0.9375, "rewards/chosen": -0.747326135635376, "rewards/margins": 1.9266762733459473, "rewards/rejected": -2.6740026473999023, "step": 315 }, { "epoch": 0.27, "grad_norm": 39.25357661887786, "learning_rate": 1.8276020571020645e-06, "logits/chosen": -1.4441983699798584, "logits/rejected": -1.351264238357544, "logps/chosen": -342.5238037109375, "logps/rejected": -649.1917724609375, "loss": 0.2823, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5742235779762268, "rewards/margins": 1.6854310035705566, "rewards/rejected": -2.2596545219421387, "step": 316 }, { "epoch": 0.27, "grad_norm": 35.63885420819062, "learning_rate": 1.8259172958256571e-06, "logits/chosen": -1.4204754829406738, "logits/rejected": -1.2960968017578125, "logps/chosen": -570.7115478515625, "logps/rejected": -947.851318359375, "loss": 0.1699, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6737782955169678, "rewards/margins": 2.6754255294799805, "rewards/rejected": -3.3492040634155273, "step": 317 }, { "epoch": 0.27, "grad_norm": 86.03391195673755, "learning_rate": 1.8242251268216257e-06, "logits/chosen": -1.3299427032470703, "logits/rejected": -1.2639224529266357, "logps/chosen": -499.9549255371094, "logps/rejected": -681.12841796875, "loss": 0.4574, "rewards/accuracies": 0.75, "rewards/chosen": -0.649490237236023, "rewards/margins": 1.0054829120635986, "rewards/rejected": -1.6549732685089111, "step": 318 }, { "epoch": 0.27, "grad_norm": 47.572685320587574, "learning_rate": 1.8225255652671887e-06, "logits/chosen": -1.3856923580169678, "logits/rejected": -1.3267571926116943, "logps/chosen": -375.29290771484375, "logps/rejected": -526.8903198242188, "loss": 0.3841, "rewards/accuracies": 0.875, "rewards/chosen": -0.3808596730232239, "rewards/margins": 0.9564526677131653, "rewards/rejected": -1.3373123407363892, "step": 319 }, { "epoch": 0.27, "grad_norm": 40.98992768419042, "learning_rate": 1.8208186264058686e-06, "logits/chosen": -1.3933082818984985, "logits/rejected": -1.2459851503372192, "logps/chosen": -535.0771484375, "logps/rejected": -1006.7776489257812, "loss": 0.258, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6266862154006958, "rewards/margins": 2.235572338104248, "rewards/rejected": -2.8622586727142334, "step": 320 }, { "epoch": 0.28, "grad_norm": 81.03986839354482, "learning_rate": 1.8191043255473557e-06, "logits/chosen": -1.2229186296463013, "logits/rejected": -1.1526546478271484, "logps/chosen": -610.3226928710938, "logps/rejected": -869.8119506835938, "loss": 0.4548, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8089736700057983, "rewards/margins": 1.7825186252593994, "rewards/rejected": -2.591492176055908, "step": 321 }, { "epoch": 0.28, "grad_norm": 54.00681041288272, "learning_rate": 1.8173826780673713e-06, "logits/chosen": -1.286928415298462, "logits/rejected": -1.2091097831726074, "logps/chosen": -352.4930419921875, "logps/rejected": -519.7963256835938, "loss": 0.3332, "rewards/accuracies": 0.8125, "rewards/chosen": -0.30570027232170105, "rewards/margins": 1.4030377864837646, "rewards/rejected": -1.708738088607788, "step": 322 }, { "epoch": 0.28, "grad_norm": 59.45445472685165, "learning_rate": 1.8156536994075286e-06, "logits/chosen": -1.380958080291748, "logits/rejected": -1.3711278438568115, "logps/chosen": -557.9470825195312, "logps/rejected": -585.9173583984375, "loss": 0.3746, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8871201276779175, "rewards/margins": 1.1648715734481812, "rewards/rejected": -2.0519914627075195, "step": 323 }, { "epoch": 0.28, "grad_norm": 39.40230304474445, "learning_rate": 1.8139174050751956e-06, "logits/chosen": -1.3996045589447021, "logits/rejected": -1.306899070739746, "logps/chosen": -497.3608703613281, "logps/rejected": -719.8843994140625, "loss": 0.2138, "rewards/accuracies": 1.0, "rewards/chosen": -0.47391462326049805, "rewards/margins": 2.2606656551361084, "rewards/rejected": -2.7345802783966064, "step": 324 }, { "epoch": 0.28, "grad_norm": 50.28586406633642, "learning_rate": 1.8121738106433536e-06, "logits/chosen": -1.295037031173706, "logits/rejected": -1.2569923400878906, "logps/chosen": -646.929931640625, "logps/rejected": -846.8131103515625, "loss": 0.206, "rewards/accuracies": 1.0, "rewards/chosen": -1.1925201416015625, "rewards/margins": 1.9704111814498901, "rewards/rejected": -3.162931442260742, "step": 325 }, { "epoch": 0.28, "grad_norm": 56.423053246664416, "learning_rate": 1.8104229317504612e-06, "logits/chosen": -1.4025418758392334, "logits/rejected": -1.3613529205322266, "logps/chosen": -528.4503173828125, "logps/rejected": -717.5523681640625, "loss": 0.3203, "rewards/accuracies": 0.875, "rewards/chosen": -0.8349852561950684, "rewards/margins": 1.745392084121704, "rewards/rejected": -2.5803773403167725, "step": 326 }, { "epoch": 0.28, "grad_norm": 39.3618431458811, "learning_rate": 1.8086647841003102e-06, "logits/chosen": -1.3957802057266235, "logits/rejected": -1.4136476516723633, "logps/chosen": -421.7500915527344, "logps/rejected": -432.0810852050781, "loss": 0.2859, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2593984007835388, "rewards/margins": 1.647970199584961, "rewards/rejected": -1.907368779182434, "step": 327 }, { "epoch": 0.28, "grad_norm": 53.49012674039251, "learning_rate": 1.8068993834618881e-06, "logits/chosen": -1.3327207565307617, "logits/rejected": -1.258753776550293, "logps/chosen": -625.6865234375, "logps/rejected": -831.6580810546875, "loss": 0.2618, "rewards/accuracies": 0.9375, "rewards/chosen": -0.953024685382843, "rewards/margins": 1.8639934062957764, "rewards/rejected": -2.8170180320739746, "step": 328 }, { "epoch": 0.28, "grad_norm": 60.790641539855926, "learning_rate": 1.8051267456692342e-06, "logits/chosen": -1.4130940437316895, "logits/rejected": -1.3980942964553833, "logps/chosen": -448.37872314453125, "logps/rejected": -547.0968627929688, "loss": 0.4777, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6373985409736633, "rewards/margins": 0.8018583059310913, "rewards/rejected": -1.4392569065093994, "step": 329 }, { "epoch": 0.28, "grad_norm": 79.04052147407155, "learning_rate": 1.8033468866212984e-06, "logits/chosen": -1.3974734544754028, "logits/rejected": -1.339294672012329, "logps/chosen": -443.251708984375, "logps/rejected": -591.7564697265625, "loss": 0.5173, "rewards/accuracies": 0.875, "rewards/chosen": -0.6866668462753296, "rewards/margins": 1.2705740928649902, "rewards/rejected": -1.9572408199310303, "step": 330 }, { "epoch": 0.28, "grad_norm": 69.99995765079761, "learning_rate": 1.8015598222817994e-06, "logits/chosen": -1.3072905540466309, "logits/rejected": -1.19639253616333, "logps/chosen": -537.7159423828125, "logps/rejected": -831.37060546875, "loss": 0.2668, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8045264482498169, "rewards/margins": 1.8684673309326172, "rewards/rejected": -2.6729936599731445, "step": 331 }, { "epoch": 0.28, "grad_norm": 38.237575149701975, "learning_rate": 1.79976556867908e-06, "logits/chosen": -1.3282616138458252, "logits/rejected": -1.266440749168396, "logps/chosen": -435.3412170410156, "logps/rejected": -683.4049072265625, "loss": 0.2422, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5417599678039551, "rewards/margins": 1.7700687646865845, "rewards/rejected": -2.311828851699829, "step": 332 }, { "epoch": 0.29, "grad_norm": 86.16521686543277, "learning_rate": 1.7979641419059647e-06, "logits/chosen": -1.2827945947647095, "logits/rejected": -1.2071640491485596, "logps/chosen": -448.04608154296875, "logps/rejected": -671.893310546875, "loss": 0.5764, "rewards/accuracies": 0.75, "rewards/chosen": 0.0399840772151947, "rewards/margins": 0.3430766463279724, "rewards/rejected": -0.3030925691127777, "step": 333 }, { "epoch": 0.29, "grad_norm": 46.42168599676963, "learning_rate": 1.7961555581196148e-06, "logits/chosen": -1.31438410282135, "logits/rejected": -1.2875746488571167, "logps/chosen": -459.66156005859375, "logps/rejected": -604.5240478515625, "loss": 0.3108, "rewards/accuracies": 0.875, "rewards/chosen": -0.5575841665267944, "rewards/margins": 1.5096338987350464, "rewards/rejected": -2.067218065261841, "step": 334 }, { "epoch": 0.29, "grad_norm": 52.554734666068185, "learning_rate": 1.7943398335413833e-06, "logits/chosen": -1.3907947540283203, "logits/rejected": -1.3661128282546997, "logps/chosen": -453.46282958984375, "logps/rejected": -573.4193115234375, "loss": 0.4039, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5714187622070312, "rewards/margins": 1.3516925573349, "rewards/rejected": -1.9231112003326416, "step": 335 }, { "epoch": 0.29, "grad_norm": 73.66641233425995, "learning_rate": 1.79251698445667e-06, "logits/chosen": -1.282754898071289, "logits/rejected": -1.2947840690612793, "logps/chosen": -708.6914672851562, "logps/rejected": -734.3735961914062, "loss": 0.4046, "rewards/accuracies": 0.875, "rewards/chosen": -1.258388876914978, "rewards/margins": 1.2340383529663086, "rewards/rejected": -2.492427349090576, "step": 336 }, { "epoch": 0.29, "grad_norm": 55.273678419099184, "learning_rate": 1.790687027214774e-06, "logits/chosen": -1.2410988807678223, "logits/rejected": -1.2197015285491943, "logps/chosen": -626.3568115234375, "logps/rejected": -815.574951171875, "loss": 0.2696, "rewards/accuracies": 0.875, "rewards/chosen": -0.8296209573745728, "rewards/margins": 1.8387782573699951, "rewards/rejected": -2.6683993339538574, "step": 337 }, { "epoch": 0.29, "grad_norm": 95.16657798565699, "learning_rate": 1.7888499782287495e-06, "logits/chosen": -1.2814643383026123, "logits/rejected": -1.3364362716674805, "logps/chosen": -649.78125, "logps/rejected": -612.60400390625, "loss": 0.6059, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8462148904800415, "rewards/margins": 1.2750688791275024, "rewards/rejected": -2.121283531188965, "step": 338 }, { "epoch": 0.29, "grad_norm": 37.834308839348786, "learning_rate": 1.7870058539752563e-06, "logits/chosen": -1.3570611476898193, "logits/rejected": -1.3527884483337402, "logps/chosen": -523.838134765625, "logps/rejected": -617.7093505859375, "loss": 0.2005, "rewards/accuracies": 1.0, "rewards/chosen": -0.5999828577041626, "rewards/margins": 2.075089693069458, "rewards/rejected": -2.67507266998291, "step": 339 }, { "epoch": 0.29, "grad_norm": 62.59492431101693, "learning_rate": 1.7851546709944133e-06, "logits/chosen": -1.2640312910079956, "logits/rejected": -1.2444536685943604, "logps/chosen": -718.8636474609375, "logps/rejected": -756.96923828125, "loss": 0.4128, "rewards/accuracies": 0.875, "rewards/chosen": -0.7843811511993408, "rewards/margins": 1.6093783378601074, "rewards/rejected": -2.3937594890594482, "step": 340 }, { "epoch": 0.29, "grad_norm": 56.365393302117916, "learning_rate": 1.7832964458896496e-06, "logits/chosen": -1.356136441230774, "logits/rejected": -1.2352664470672607, "logps/chosen": -456.6836242675781, "logps/rejected": -874.0556030273438, "loss": 0.2437, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5173958539962769, "rewards/margins": 2.5600132942199707, "rewards/rejected": -3.077409267425537, "step": 341 }, { "epoch": 0.29, "grad_norm": 32.8496543014448, "learning_rate": 1.7814311953275559e-06, "logits/chosen": -1.3836917877197266, "logits/rejected": -1.3068275451660156, "logps/chosen": -476.92242431640625, "logps/rejected": -723.5286865234375, "loss": 0.1923, "rewards/accuracies": 1.0, "rewards/chosen": -0.5671064853668213, "rewards/margins": 2.021279811859131, "rewards/rejected": -2.588386058807373, "step": 342 }, { "epoch": 0.29, "grad_norm": 51.257559058374795, "learning_rate": 1.7795589360377342e-06, "logits/chosen": -1.423358678817749, "logits/rejected": -1.3222960233688354, "logps/chosen": -353.30499267578125, "logps/rejected": -577.5755615234375, "loss": 0.4257, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8450976610183716, "rewards/margins": 1.2097313404083252, "rewards/rejected": -2.0548288822174072, "step": 343 }, { "epoch": 0.3, "grad_norm": 53.5472303290772, "learning_rate": 1.7776796848126501e-06, "logits/chosen": -1.3959057331085205, "logits/rejected": -1.356281042098999, "logps/chosen": -535.9912109375, "logps/rejected": -663.2745971679688, "loss": 0.3195, "rewards/accuracies": 0.75, "rewards/chosen": -0.8873413801193237, "rewards/margins": 1.5602761507034302, "rewards/rejected": -2.447617530822754, "step": 344 }, { "epoch": 0.3, "grad_norm": 53.12673247773521, "learning_rate": 1.7757934585074784e-06, "logits/chosen": -1.3323516845703125, "logits/rejected": -1.3047423362731934, "logps/chosen": -465.5447998046875, "logps/rejected": -702.0733642578125, "loss": 0.3076, "rewards/accuracies": 0.875, "rewards/chosen": -0.813624382019043, "rewards/margins": 1.4025866985321045, "rewards/rejected": -2.2162110805511475, "step": 345 }, { "epoch": 0.3, "grad_norm": 61.43784959072281, "learning_rate": 1.7739002740399554e-06, "logits/chosen": -1.3370304107666016, "logits/rejected": -1.2953394651412964, "logps/chosen": -605.59033203125, "logps/rejected": -796.789794921875, "loss": 0.4125, "rewards/accuracies": 0.8125, "rewards/chosen": -0.918696403503418, "rewards/margins": 1.4033992290496826, "rewards/rejected": -2.3220956325531006, "step": 346 }, { "epoch": 0.3, "grad_norm": 40.496158372532264, "learning_rate": 1.7720001483902254e-06, "logits/chosen": -1.3652217388153076, "logits/rejected": -1.2486958503723145, "logps/chosen": -460.3392028808594, "logps/rejected": -763.511962890625, "loss": 0.2107, "rewards/accuracies": 0.9375, "rewards/chosen": -0.41109907627105713, "rewards/margins": 2.041755199432373, "rewards/rejected": -2.4528541564941406, "step": 347 }, { "epoch": 0.3, "grad_norm": 53.499096353198155, "learning_rate": 1.7700930986006888e-06, "logits/chosen": -1.371845006942749, "logits/rejected": -1.3209772109985352, "logps/chosen": -427.68170166015625, "logps/rejected": -644.3697509765625, "loss": 0.3152, "rewards/accuracies": 0.875, "rewards/chosen": -0.705706000328064, "rewards/margins": 1.6445813179016113, "rewards/rejected": -2.3502871990203857, "step": 348 }, { "epoch": 0.3, "grad_norm": 58.9588633137131, "learning_rate": 1.7681791417758495e-06, "logits/chosen": -1.4229094982147217, "logits/rejected": -1.3599036931991577, "logps/chosen": -471.8563232421875, "logps/rejected": -617.5604858398438, "loss": 0.3771, "rewards/accuracies": 0.8125, "rewards/chosen": -0.713165283203125, "rewards/margins": 1.2468420267105103, "rewards/rejected": -1.9600073099136353, "step": 349 }, { "epoch": 0.3, "grad_norm": 69.70902398852822, "learning_rate": 1.7662582950821604e-06, "logits/chosen": -1.2817418575286865, "logits/rejected": -1.2800257205963135, "logps/chosen": -619.9619750976562, "logps/rejected": -757.70751953125, "loss": 0.2988, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8852773904800415, "rewards/margins": 1.772665023803711, "rewards/rejected": -2.657942295074463, "step": 350 }, { "epoch": 0.3, "grad_norm": 34.282024233072164, "learning_rate": 1.7643305757478713e-06, "logits/chosen": -1.4060933589935303, "logits/rejected": -1.3654086589813232, "logps/chosen": -543.7503051757812, "logps/rejected": -668.7984619140625, "loss": 0.2402, "rewards/accuracies": 1.0, "rewards/chosen": -0.4676051437854767, "rewards/margins": 1.831949234008789, "rewards/rejected": -2.2995543479919434, "step": 351 }, { "epoch": 0.3, "grad_norm": 30.3024998135122, "learning_rate": 1.762396001062873e-06, "logits/chosen": -1.3362281322479248, "logits/rejected": -1.3081523180007935, "logps/chosen": -548.0572509765625, "logps/rejected": -773.1283569335938, "loss": 0.2547, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3063896596431732, "rewards/margins": 1.772287130355835, "rewards/rejected": -2.078676700592041, "step": 352 }, { "epoch": 0.3, "grad_norm": 58.827125540810336, "learning_rate": 1.760454588378542e-06, "logits/chosen": -1.3522402048110962, "logits/rejected": -1.330960750579834, "logps/chosen": -448.0185546875, "logps/rejected": -519.0568237304688, "loss": 0.3176, "rewards/accuracies": 0.875, "rewards/chosen": -0.4417378902435303, "rewards/margins": 1.2674145698547363, "rewards/rejected": -1.7091525793075562, "step": 353 }, { "epoch": 0.3, "grad_norm": 36.49894563740577, "learning_rate": 1.758506355107586e-06, "logits/chosen": -1.3145179748535156, "logits/rejected": -1.3016877174377441, "logps/chosen": -489.9166259765625, "logps/rejected": -590.7893676757812, "loss": 0.2841, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2938302457332611, "rewards/margins": 1.5872488021850586, "rewards/rejected": -1.8810791969299316, "step": 354 }, { "epoch": 0.3, "grad_norm": 50.86650433813805, "learning_rate": 1.7565513187238875e-06, "logits/chosen": -1.411332368850708, "logits/rejected": -1.342905044555664, "logps/chosen": -359.9126281738281, "logps/rejected": -543.0909423828125, "loss": 0.351, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4272446632385254, "rewards/margins": 1.5099502801895142, "rewards/rejected": -1.9371949434280396, "step": 355 }, { "epoch": 0.31, "grad_norm": 82.16799604256238, "learning_rate": 1.754589496762346e-06, "logits/chosen": -1.2620693445205688, "logits/rejected": -1.2261128425598145, "logps/chosen": -573.6851196289062, "logps/rejected": -668.463623046875, "loss": 0.4958, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9251641631126404, "rewards/margins": 1.105979084968567, "rewards/rejected": -2.0311431884765625, "step": 356 }, { "epoch": 0.31, "grad_norm": 36.40453847231406, "learning_rate": 1.7526209068187217e-06, "logits/chosen": -1.3427318334579468, "logits/rejected": -1.337662696838379, "logps/chosen": -574.2933349609375, "logps/rejected": -721.6268310546875, "loss": 0.2151, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4808851182460785, "rewards/margins": 1.956594467163086, "rewards/rejected": -2.4374794960021973, "step": 357 }, { "epoch": 0.31, "grad_norm": 40.2136820610071, "learning_rate": 1.7506455665494774e-06, "logits/chosen": -1.3520996570587158, "logits/rejected": -1.2746731042861938, "logps/chosen": -391.5846862792969, "logps/rejected": -690.509765625, "loss": 0.18, "rewards/accuracies": 0.9375, "rewards/chosen": -0.39890822768211365, "rewards/margins": 2.4898672103881836, "rewards/rejected": -2.88877534866333, "step": 358 }, { "epoch": 0.31, "grad_norm": 66.20511722931484, "learning_rate": 1.748663493671621e-06, "logits/chosen": -1.3608579635620117, "logits/rejected": -1.3019959926605225, "logps/chosen": -534.7568969726562, "logps/rejected": -757.3606567382812, "loss": 0.3957, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9439442753791809, "rewards/margins": 1.482708215713501, "rewards/rejected": -2.426652669906616, "step": 359 }, { "epoch": 0.31, "grad_norm": 64.49934725997622, "learning_rate": 1.746674705962544e-06, "logits/chosen": -1.3710341453552246, "logits/rejected": -1.2802362442016602, "logps/chosen": -408.6067810058594, "logps/rejected": -552.7216796875, "loss": 0.3656, "rewards/accuracies": 0.875, "rewards/chosen": -0.6530160903930664, "rewards/margins": 1.6699604988098145, "rewards/rejected": -2.322976589202881, "step": 360 }, { "epoch": 0.31, "grad_norm": 70.11251547062426, "learning_rate": 1.744679221259866e-06, "logits/chosen": -1.3386015892028809, "logits/rejected": -1.3565545082092285, "logps/chosen": -743.8438720703125, "logps/rejected": -634.8836669921875, "loss": 0.4044, "rewards/accuracies": 0.875, "rewards/chosen": -1.0614737272262573, "rewards/margins": 1.0349459648132324, "rewards/rejected": -2.0964198112487793, "step": 361 }, { "epoch": 0.31, "grad_norm": 82.91664116977694, "learning_rate": 1.7426770574612708e-06, "logits/chosen": -1.268636703491211, "logits/rejected": -1.2521432638168335, "logps/chosen": -695.606689453125, "logps/rejected": -826.245361328125, "loss": 0.4141, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3942124843597412, "rewards/margins": 1.6098053455352783, "rewards/rejected": -3.0040180683135986, "step": 362 }, { "epoch": 0.31, "grad_norm": 86.52667633185308, "learning_rate": 1.7406682325243482e-06, "logits/chosen": -1.4180306196212769, "logits/rejected": -1.4059098958969116, "logps/chosen": -618.788330078125, "logps/rejected": -620.6964721679688, "loss": 0.4947, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9639477729797363, "rewards/margins": 1.3326777219772339, "rewards/rejected": -2.2966253757476807, "step": 363 }, { "epoch": 0.31, "grad_norm": 45.85536985118218, "learning_rate": 1.7386527644664328e-06, "logits/chosen": -1.3537003993988037, "logits/rejected": -1.235815167427063, "logps/chosen": -436.8193359375, "logps/rejected": -825.0286865234375, "loss": 0.2462, "rewards/accuracies": 1.0, "rewards/chosen": -0.6824496984481812, "rewards/margins": 2.1164891719818115, "rewards/rejected": -2.798938751220703, "step": 364 }, { "epoch": 0.31, "grad_norm": 58.52944726682894, "learning_rate": 1.7366306713644416e-06, "logits/chosen": -1.3131760358810425, "logits/rejected": -1.2818546295166016, "logps/chosen": -392.1138000488281, "logps/rejected": -597.6663818359375, "loss": 0.4684, "rewards/accuracies": 0.75, "rewards/chosen": -0.38064292073249817, "rewards/margins": 1.8004159927368164, "rewards/rejected": -2.181058883666992, "step": 365 }, { "epoch": 0.31, "grad_norm": 32.81893405022377, "learning_rate": 1.7346019713547121e-06, "logits/chosen": -1.3495137691497803, "logits/rejected": -1.2485442161560059, "logps/chosen": -590.7689208984375, "logps/rejected": -916.5122680664062, "loss": 0.1726, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5310375690460205, "rewards/margins": 2.2462780475616455, "rewards/rejected": -2.777315616607666, "step": 366 }, { "epoch": 0.31, "grad_norm": 45.22131227486043, "learning_rate": 1.7325666826328397e-06, "logits/chosen": -1.372185230255127, "logits/rejected": -1.3290915489196777, "logps/chosen": -532.4417114257812, "logps/rejected": -732.6104736328125, "loss": 0.2474, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4011019170284271, "rewards/margins": 2.100841999053955, "rewards/rejected": -2.501944065093994, "step": 367 }, { "epoch": 0.32, "grad_norm": 766.8967712959084, "learning_rate": 1.7305248234535156e-06, "logits/chosen": -1.3028106689453125, "logits/rejected": -1.2383298873901367, "logps/chosen": -594.026611328125, "logps/rejected": -714.35302734375, "loss": 0.5844, "rewards/accuracies": 0.6875, "rewards/chosen": -0.14716491103172302, "rewards/margins": 0.5474169850349426, "rewards/rejected": -0.6945818662643433, "step": 368 }, { "epoch": 0.32, "grad_norm": 74.31008210706332, "learning_rate": 1.7284764121303599e-06, "logits/chosen": -1.4357049465179443, "logits/rejected": -1.3083711862564087, "logps/chosen": -327.5078125, "logps/rejected": -561.4296875, "loss": 0.4596, "rewards/accuracies": 0.75, "rewards/chosen": -0.2435588240623474, "rewards/margins": 0.7658228874206543, "rewards/rejected": -1.009381651878357, "step": 369 }, { "epoch": 0.32, "grad_norm": 43.852407517822435, "learning_rate": 1.7264214670357613e-06, "logits/chosen": -1.3738019466400146, "logits/rejected": -1.344679832458496, "logps/chosen": -456.21734619140625, "logps/rejected": -683.3712158203125, "loss": 0.2788, "rewards/accuracies": 1.0, "rewards/chosen": -0.6068999767303467, "rewards/margins": 1.8802766799926758, "rewards/rejected": -2.4871766567230225, "step": 370 }, { "epoch": 0.32, "grad_norm": 48.945596081457964, "learning_rate": 1.7243600066007104e-06, "logits/chosen": -1.3600564002990723, "logits/rejected": -1.2774728536605835, "logps/chosen": -429.667236328125, "logps/rejected": -666.998046875, "loss": 0.3351, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7039352655410767, "rewards/margins": 2.060344696044922, "rewards/rejected": -2.764279842376709, "step": 371 }, { "epoch": 0.32, "grad_norm": 87.69018998649285, "learning_rate": 1.7222920493146336e-06, "logits/chosen": -1.3533427715301514, "logits/rejected": -1.2983510494232178, "logps/chosen": -506.4881286621094, "logps/rejected": -719.895751953125, "loss": 0.6736, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1213555335998535, "rewards/margins": 1.4744019508361816, "rewards/rejected": -2.595757484436035, "step": 372 }, { "epoch": 0.32, "grad_norm": 35.82422267357685, "learning_rate": 1.7202176137252287e-06, "logits/chosen": -1.364864706993103, "logits/rejected": -1.2952933311462402, "logps/chosen": -399.06549072265625, "logps/rejected": -614.533447265625, "loss": 0.2149, "rewards/accuracies": 1.0, "rewards/chosen": -0.655327558517456, "rewards/margins": 1.7265857458114624, "rewards/rejected": -2.381913185119629, "step": 373 }, { "epoch": 0.32, "grad_norm": 57.76953763457483, "learning_rate": 1.7181367184382975e-06, "logits/chosen": -1.2076451778411865, "logits/rejected": -1.2127151489257812, "logps/chosen": -641.7158203125, "logps/rejected": -791.0142822265625, "loss": 0.276, "rewards/accuracies": 0.8125, "rewards/chosen": -0.530408501625061, "rewards/margins": 2.003429412841797, "rewards/rejected": -2.5338380336761475, "step": 374 }, { "epoch": 0.32, "grad_norm": 55.405902933053476, "learning_rate": 1.7160493821175806e-06, "logits/chosen": -1.3833491802215576, "logits/rejected": -1.3392293453216553, "logps/chosen": -614.4901733398438, "logps/rejected": -714.2847290039062, "loss": 0.2459, "rewards/accuracies": 1.0, "rewards/chosen": -1.013306736946106, "rewards/margins": 2.0114693641662598, "rewards/rejected": -3.024775981903076, "step": 375 }, { "epoch": 0.32, "grad_norm": 50.68328672154846, "learning_rate": 1.7139556234845874e-06, "logits/chosen": -1.4341458082199097, "logits/rejected": -1.3658862113952637, "logps/chosen": -387.8314208984375, "logps/rejected": -540.5579223632812, "loss": 0.3614, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8665677309036255, "rewards/margins": 1.5501694679260254, "rewards/rejected": -2.4167373180389404, "step": 376 }, { "epoch": 0.32, "grad_norm": 59.60217381880438, "learning_rate": 1.7118554613184302e-06, "logits/chosen": -1.3032417297363281, "logits/rejected": -1.2954423427581787, "logps/chosen": -630.8447265625, "logps/rejected": -679.8330078125, "loss": 0.3405, "rewards/accuracies": 0.875, "rewards/chosen": -1.1230149269104004, "rewards/margins": 1.624404788017273, "rewards/rejected": -2.747419834136963, "step": 377 }, { "epoch": 0.32, "grad_norm": 27.89135462360495, "learning_rate": 1.7097489144556553e-06, "logits/chosen": -1.3719645738601685, "logits/rejected": -1.3334708213806152, "logps/chosen": -499.67791748046875, "logps/rejected": -713.4361572265625, "loss": 0.1902, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8453254699707031, "rewards/margins": 1.9828104972839355, "rewards/rejected": -2.8281362056732178, "step": 378 }, { "epoch": 0.33, "grad_norm": 60.874062147399314, "learning_rate": 1.7076360017900742e-06, "logits/chosen": -1.2090768814086914, "logits/rejected": -1.1481618881225586, "logps/chosen": -740.1016845703125, "logps/rejected": -920.9785766601562, "loss": 0.2601, "rewards/accuracies": 0.875, "rewards/chosen": -0.9678854942321777, "rewards/margins": 2.6631102561950684, "rewards/rejected": -3.630995988845825, "step": 379 }, { "epoch": 0.33, "grad_norm": 47.97271908744414, "learning_rate": 1.705516742272593e-06, "logits/chosen": -1.2268987894058228, "logits/rejected": -1.2583870887756348, "logps/chosen": -560.3924560546875, "logps/rejected": -637.6690673828125, "loss": 0.4351, "rewards/accuracies": 0.75, "rewards/chosen": -1.14536714553833, "rewards/margins": 1.5063209533691406, "rewards/rejected": -2.6516880989074707, "step": 380 }, { "epoch": 0.33, "grad_norm": 57.222584056932256, "learning_rate": 1.7033911549110438e-06, "logits/chosen": -1.3473737239837646, "logits/rejected": -1.3357570171356201, "logps/chosen": -555.26025390625, "logps/rejected": -700.132080078125, "loss": 0.2627, "rewards/accuracies": 0.9375, "rewards/chosen": -0.967682957649231, "rewards/margins": 1.708535075187683, "rewards/rejected": -2.676218032836914, "step": 381 }, { "epoch": 0.33, "grad_norm": 82.14782264995479, "learning_rate": 1.7012592587700137e-06, "logits/chosen": -1.3172829151153564, "logits/rejected": -1.3071085214614868, "logps/chosen": -502.27252197265625, "logps/rejected": -572.5811157226562, "loss": 0.4242, "rewards/accuracies": 0.8125, "rewards/chosen": -0.954552412033081, "rewards/margins": 1.1110751628875732, "rewards/rejected": -2.0656275749206543, "step": 382 }, { "epoch": 0.33, "grad_norm": 39.535486116430114, "learning_rate": 1.6991210729706743e-06, "logits/chosen": -1.3450148105621338, "logits/rejected": -1.3007009029388428, "logps/chosen": -380.4422607421875, "logps/rejected": -574.0826416015625, "loss": 0.2487, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5027610063552856, "rewards/margins": 1.7666382789611816, "rewards/rejected": -2.2693991661071777, "step": 383 }, { "epoch": 0.33, "grad_norm": 115.97105805630862, "learning_rate": 1.6969766166906085e-06, "logits/chosen": -1.2329654693603516, "logits/rejected": -1.2378525733947754, "logps/chosen": -557.4694213867188, "logps/rejected": -650.35498046875, "loss": 0.4743, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0643105506896973, "rewards/margins": 1.2340928316116333, "rewards/rejected": -2.298403263092041, "step": 384 }, { "epoch": 0.33, "grad_norm": 46.15615789349278, "learning_rate": 1.694825909163641e-06, "logits/chosen": -1.415977120399475, "logits/rejected": -1.3116364479064941, "logps/chosen": -425.91217041015625, "logps/rejected": -695.7625732421875, "loss": 0.3217, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9717909097671509, "rewards/margins": 1.7135262489318848, "rewards/rejected": -2.685317039489746, "step": 385 }, { "epoch": 0.33, "grad_norm": 71.6040609635783, "learning_rate": 1.6926689696796636e-06, "logits/chosen": -1.3962388038635254, "logits/rejected": -1.2283735275268555, "logps/chosen": -575.3253173828125, "logps/rejected": -874.0101928710938, "loss": 0.2812, "rewards/accuracies": 0.875, "rewards/chosen": -0.8452811241149902, "rewards/margins": 2.1846938133239746, "rewards/rejected": -3.029974937438965, "step": 386 }, { "epoch": 0.33, "grad_norm": 36.61658137478186, "learning_rate": 1.6905058175844637e-06, "logits/chosen": -1.3449249267578125, "logits/rejected": -1.2902624607086182, "logps/chosen": -444.6363525390625, "logps/rejected": -710.4901123046875, "loss": 0.2367, "rewards/accuracies": 1.0, "rewards/chosen": -0.596918523311615, "rewards/margins": 2.046452522277832, "rewards/rejected": -2.643371105194092, "step": 387 }, { "epoch": 0.33, "grad_norm": 57.3810553940541, "learning_rate": 1.6883364722795498e-06, "logits/chosen": -1.337453007698059, "logits/rejected": -1.2524361610412598, "logps/chosen": -415.93402099609375, "logps/rejected": -661.5554809570312, "loss": 0.3094, "rewards/accuracies": 0.875, "rewards/chosen": -0.37221938371658325, "rewards/margins": 1.6274501085281372, "rewards/rejected": -1.9996695518493652, "step": 388 }, { "epoch": 0.33, "grad_norm": 51.57334994385235, "learning_rate": 1.686160953221978e-06, "logits/chosen": -1.2846753597259521, "logits/rejected": -1.1942753791809082, "logps/chosen": -485.0477294921875, "logps/rejected": -710.1423950195312, "loss": 0.3319, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6163160800933838, "rewards/margins": 1.3468387126922607, "rewards/rejected": -1.9631547927856445, "step": 389 }, { "epoch": 0.33, "grad_norm": 59.367200479176645, "learning_rate": 1.6839792799241771e-06, "logits/chosen": -1.2812542915344238, "logits/rejected": -1.2009220123291016, "logps/chosen": -701.4754028320312, "logps/rejected": -948.4730224609375, "loss": 0.298, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8413772583007812, "rewards/margins": 2.0989725589752197, "rewards/rejected": -2.940349578857422, "step": 390 }, { "epoch": 0.34, "grad_norm": 31.270768426311996, "learning_rate": 1.6817914719537748e-06, "logits/chosen": -1.2630783319473267, "logits/rejected": -1.1632015705108643, "logps/chosen": -367.569091796875, "logps/rejected": -747.9874267578125, "loss": 0.1791, "rewards/accuracies": 1.0, "rewards/chosen": -0.2967870831489563, "rewards/margins": 2.931680679321289, "rewards/rejected": -3.2284677028656006, "step": 391 }, { "epoch": 0.34, "grad_norm": 87.98851035196604, "learning_rate": 1.6795975489334193e-06, "logits/chosen": -1.2246571779251099, "logits/rejected": -1.2379138469696045, "logps/chosen": -650.24951171875, "logps/rejected": -619.5977172851562, "loss": 0.6714, "rewards/accuracies": 0.75, "rewards/chosen": -1.3291599750518799, "rewards/margins": 0.8845857381820679, "rewards/rejected": -2.213745594024658, "step": 392 }, { "epoch": 0.34, "grad_norm": 50.650897632871555, "learning_rate": 1.677397530540608e-06, "logits/chosen": -1.3371104001998901, "logits/rejected": -1.279056429862976, "logps/chosen": -473.9917297363281, "logps/rejected": -733.524169921875, "loss": 0.2614, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7630621194839478, "rewards/margins": 1.9246776103973389, "rewards/rejected": -2.687739849090576, "step": 393 }, { "epoch": 0.34, "grad_norm": 64.88482584948143, "learning_rate": 1.675191436507505e-06, "logits/chosen": -1.2914042472839355, "logits/rejected": -1.2568284273147583, "logps/chosen": -417.3004150390625, "logps/rejected": -581.8896484375, "loss": 0.4118, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6889206171035767, "rewards/margins": 1.3778702020645142, "rewards/rejected": -2.06679105758667, "step": 394 }, { "epoch": 0.34, "grad_norm": 37.73605637814536, "learning_rate": 1.6729792866207703e-06, "logits/chosen": -1.283979892730713, "logits/rejected": -1.2231981754302979, "logps/chosen": -483.034912109375, "logps/rejected": -708.1431884765625, "loss": 0.2694, "rewards/accuracies": 0.875, "rewards/chosen": -0.929643452167511, "rewards/margins": 2.2847232818603516, "rewards/rejected": -3.2143666744232178, "step": 395 }, { "epoch": 0.34, "grad_norm": 51.231246461675696, "learning_rate": 1.6707611007213778e-06, "logits/chosen": -1.3425097465515137, "logits/rejected": -1.2905793190002441, "logps/chosen": -516.3165283203125, "logps/rejected": -699.6722412109375, "loss": 0.2348, "rewards/accuracies": 0.875, "rewards/chosen": -0.7765346765518188, "rewards/margins": 2.0664963722229004, "rewards/rejected": -2.8430309295654297, "step": 396 }, { "epoch": 0.34, "grad_norm": 47.72450656288295, "learning_rate": 1.6685368987044392e-06, "logits/chosen": -1.3056159019470215, "logits/rejected": -1.2380082607269287, "logps/chosen": -491.7803955078125, "logps/rejected": -702.978759765625, "loss": 0.2248, "rewards/accuracies": 1.0, "rewards/chosen": -0.6762930154800415, "rewards/margins": 2.062279224395752, "rewards/rejected": -2.738572597503662, "step": 397 }, { "epoch": 0.34, "grad_norm": 70.21202586898191, "learning_rate": 1.6663067005190254e-06, "logits/chosen": -1.143059253692627, "logits/rejected": -1.0712796449661255, "logps/chosen": -617.78076171875, "logps/rejected": -838.5026245117188, "loss": 0.456, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4345775842666626, "rewards/margins": 1.6229698657989502, "rewards/rejected": -3.0575473308563232, "step": 398 }, { "epoch": 0.34, "grad_norm": 46.56299997352908, "learning_rate": 1.6640705261679883e-06, "logits/chosen": -1.2313382625579834, "logits/rejected": -1.1530615091323853, "logps/chosen": -631.331298828125, "logps/rejected": -757.3225708007812, "loss": 0.2068, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8394178152084351, "rewards/margins": 2.230578899383545, "rewards/rejected": -3.0699968338012695, "step": 399 }, { "epoch": 0.34, "grad_norm": 63.84886780378537, "learning_rate": 1.6618283957077787e-06, "logits/chosen": -1.2598519325256348, "logits/rejected": -1.2227567434310913, "logps/chosen": -512.57958984375, "logps/rejected": -578.773681640625, "loss": 0.3582, "rewards/accuracies": 0.875, "rewards/chosen": -0.6850414276123047, "rewards/margins": 1.4070050716400146, "rewards/rejected": -2.0920464992523193, "step": 400 }, { "epoch": 0.34, "grad_norm": 35.51582894200608, "learning_rate": 1.6595803292482699e-06, "logits/chosen": -1.271228551864624, "logits/rejected": -1.226959228515625, "logps/chosen": -730.7811279296875, "logps/rejected": -896.078125, "loss": 0.181, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9396979808807373, "rewards/margins": 2.190903902053833, "rewards/rejected": -3.1306021213531494, "step": 401 }, { "epoch": 0.34, "grad_norm": 28.390943749629226, "learning_rate": 1.6573263469525754e-06, "logits/chosen": -1.1954461336135864, "logits/rejected": -1.1502574682235718, "logps/chosen": -529.775634765625, "logps/rejected": -699.5891723632812, "loss": 0.1812, "rewards/accuracies": 0.9375, "rewards/chosen": -0.41936323046684265, "rewards/margins": 2.2371792793273926, "rewards/rejected": -2.6565425395965576, "step": 402 }, { "epoch": 0.35, "grad_norm": 44.35213726968621, "learning_rate": 1.6550664690368678e-06, "logits/chosen": -1.2156412601470947, "logits/rejected": -1.1602269411087036, "logps/chosen": -542.976318359375, "logps/rejected": -711.1033325195312, "loss": 0.2548, "rewards/accuracies": 0.875, "rewards/chosen": -0.6662370562553406, "rewards/margins": 1.8856029510498047, "rewards/rejected": -2.551839828491211, "step": 403 }, { "epoch": 0.35, "grad_norm": 75.17976256340974, "learning_rate": 1.6528007157701986e-06, "logits/chosen": -1.204666018486023, "logits/rejected": -1.183717966079712, "logps/chosen": -747.551025390625, "logps/rejected": -933.54736328125, "loss": 0.4962, "rewards/accuracies": 0.875, "rewards/chosen": -1.092570185661316, "rewards/margins": 1.9422069787979126, "rewards/rejected": -3.0347769260406494, "step": 404 }, { "epoch": 0.35, "grad_norm": 103.04235564300753, "learning_rate": 1.6505291074743157e-06, "logits/chosen": -1.3840012550354004, "logits/rejected": -1.312503457069397, "logps/chosen": -491.85076904296875, "logps/rejected": -664.5662841796875, "loss": 0.5429, "rewards/accuracies": 0.75, "rewards/chosen": -0.6823098063468933, "rewards/margins": 1.6049988269805908, "rewards/rejected": -2.28730845451355, "step": 405 }, { "epoch": 0.35, "grad_norm": 85.63345072930973, "learning_rate": 1.6482516645234811e-06, "logits/chosen": -1.2395625114440918, "logits/rejected": -1.152807593345642, "logps/chosen": -504.61492919921875, "logps/rejected": -706.6768798828125, "loss": 0.4778, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9438247680664062, "rewards/margins": 2.0758321285247803, "rewards/rejected": -3.0196568965911865, "step": 406 }, { "epoch": 0.35, "grad_norm": 59.471028399123476, "learning_rate": 1.6459684073442887e-06, "logits/chosen": -1.1889766454696655, "logits/rejected": -1.1216541528701782, "logps/chosen": -654.0748901367188, "logps/rejected": -957.9883422851562, "loss": 0.2074, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9139692783355713, "rewards/margins": 2.6043825149536133, "rewards/rejected": -3.5183520317077637, "step": 407 }, { "epoch": 0.35, "grad_norm": 50.30155647050052, "learning_rate": 1.6436793564154808e-06, "logits/chosen": -1.2363783121109009, "logits/rejected": -1.2209937572479248, "logps/chosen": -587.3294067382812, "logps/rejected": -718.9139404296875, "loss": 0.2917, "rewards/accuracies": 0.875, "rewards/chosen": -0.8330459594726562, "rewards/margins": 2.0317726135253906, "rewards/rejected": -2.864818572998047, "step": 408 }, { "epoch": 0.35, "grad_norm": 65.97832606363103, "learning_rate": 1.6413845322677635e-06, "logits/chosen": -1.1675924062728882, "logits/rejected": -1.1158082485198975, "logps/chosen": -668.6107177734375, "logps/rejected": -825.7411499023438, "loss": 0.3324, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9506618976593018, "rewards/margins": 2.0555379390716553, "rewards/rejected": -3.006199836730957, "step": 409 }, { "epoch": 0.35, "grad_norm": 122.0389351510737, "learning_rate": 1.639083955483625e-06, "logits/chosen": -1.356440782546997, "logits/rejected": -1.3123233318328857, "logps/chosen": -572.5408935546875, "logps/rejected": -647.79736328125, "loss": 0.7814, "rewards/accuracies": 0.625, "rewards/chosen": -1.5167076587677002, "rewards/margins": 0.6254584789276123, "rewards/rejected": -2.1421661376953125, "step": 410 }, { "epoch": 0.35, "grad_norm": 66.59653601012016, "learning_rate": 1.6367776466971475e-06, "logits/chosen": -1.3011500835418701, "logits/rejected": -1.251386046409607, "logps/chosen": -421.4635314941406, "logps/rejected": -582.5531005859375, "loss": 0.4516, "rewards/accuracies": 0.875, "rewards/chosen": -0.6226481199264526, "rewards/margins": 1.8402199745178223, "rewards/rejected": -2.4628682136535645, "step": 411 }, { "epoch": 0.35, "grad_norm": 71.05323766925942, "learning_rate": 1.6344656265938258e-06, "logits/chosen": -1.2155036926269531, "logits/rejected": -1.1783559322357178, "logps/chosen": -667.48388671875, "logps/rejected": -756.5985107421875, "loss": 0.2926, "rewards/accuracies": 0.875, "rewards/chosen": -0.7361356616020203, "rewards/margins": 2.011352062225342, "rewards/rejected": -2.747488021850586, "step": 412 }, { "epoch": 0.35, "grad_norm": 68.98412970305316, "learning_rate": 1.6321479159103786e-06, "logits/chosen": -1.304020643234253, "logits/rejected": -1.280059576034546, "logps/chosen": -566.3141479492188, "logps/rejected": -568.9078979492188, "loss": 0.5021, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8730708360671997, "rewards/margins": 1.178234338760376, "rewards/rejected": -2.0513052940368652, "step": 413 }, { "epoch": 0.36, "grad_norm": 38.10469841786047, "learning_rate": 1.6298245354345654e-06, "logits/chosen": -1.4465073347091675, "logits/rejected": -1.380890965461731, "logps/chosen": -570.2463989257812, "logps/rejected": -813.3609619140625, "loss": 0.2257, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6123977899551392, "rewards/margins": 2.4449164867401123, "rewards/rejected": -3.057314157485962, "step": 414 }, { "epoch": 0.36, "grad_norm": 63.94973697259471, "learning_rate": 1.6274955060049972e-06, "logits/chosen": -1.3530452251434326, "logits/rejected": -1.2638301849365234, "logps/chosen": -340.5704345703125, "logps/rejected": -459.89715576171875, "loss": 0.4984, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5781039595603943, "rewards/margins": 1.089951753616333, "rewards/rejected": -1.668055534362793, "step": 415 }, { "epoch": 0.36, "grad_norm": 42.9621695989501, "learning_rate": 1.6251608485109519e-06, "logits/chosen": -1.4585225582122803, "logits/rejected": -1.4104504585266113, "logps/chosen": -495.1009216308594, "logps/rejected": -635.91748046875, "loss": 0.2641, "rewards/accuracies": 0.875, "rewards/chosen": -0.71750408411026, "rewards/margins": 1.8936041593551636, "rewards/rejected": -2.6111083030700684, "step": 416 }, { "epoch": 0.36, "grad_norm": 56.19773710256689, "learning_rate": 1.622820583892185e-06, "logits/chosen": -1.4692611694335938, "logits/rejected": -1.3992235660552979, "logps/chosen": -479.1830749511719, "logps/rejected": -642.672119140625, "loss": 0.383, "rewards/accuracies": 0.625, "rewards/chosen": -0.5529279112815857, "rewards/margins": 1.683672308921814, "rewards/rejected": -2.236600399017334, "step": 417 }, { "epoch": 0.36, "grad_norm": 66.9721595221014, "learning_rate": 1.6204747331387448e-06, "logits/chosen": -1.3393265008926392, "logits/rejected": -1.2750935554504395, "logps/chosen": -490.73211669921875, "logps/rejected": -633.0335083007812, "loss": 0.558, "rewards/accuracies": 0.75, "rewards/chosen": -0.6580825448036194, "rewards/margins": 1.143843650817871, "rewards/rejected": -1.8019261360168457, "step": 418 }, { "epoch": 0.36, "grad_norm": 41.10087728789618, "learning_rate": 1.6181233172907796e-06, "logits/chosen": -1.4568686485290527, "logits/rejected": -1.3738291263580322, "logps/chosen": -479.49639892578125, "logps/rejected": -744.3993530273438, "loss": 0.2447, "rewards/accuracies": 1.0, "rewards/chosen": -0.5038151741027832, "rewards/margins": 1.8586931228637695, "rewards/rejected": -2.3625082969665527, "step": 419 }, { "epoch": 0.36, "grad_norm": 54.06812565547437, "learning_rate": 1.6157663574383538e-06, "logits/chosen": -1.2719099521636963, "logits/rejected": -1.268294334411621, "logps/chosen": -628.4613647460938, "logps/rejected": -751.2962036132812, "loss": 0.2633, "rewards/accuracies": 0.875, "rewards/chosen": -0.6819362640380859, "rewards/margins": 1.8705052137374878, "rewards/rejected": -2.5524415969848633, "step": 420 }, { "epoch": 0.36, "grad_norm": 97.92587825067058, "learning_rate": 1.6134038747212544e-06, "logits/chosen": -1.448411464691162, "logits/rejected": -1.3822431564331055, "logps/chosen": -352.2517395019531, "logps/rejected": -514.0858154296875, "loss": 0.3892, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6510804891586304, "rewards/margins": 1.0912226438522339, "rewards/rejected": -1.7423030138015747, "step": 421 }, { "epoch": 0.36, "grad_norm": 50.430038449778294, "learning_rate": 1.6110358903288056e-06, "logits/chosen": -1.3349554538726807, "logits/rejected": -1.2732388973236084, "logps/chosen": -494.1665954589844, "logps/rejected": -697.268798828125, "loss": 0.3301, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8012133240699768, "rewards/margins": 1.9825458526611328, "rewards/rejected": -2.783759117126465, "step": 422 }, { "epoch": 0.36, "grad_norm": 59.394078146172234, "learning_rate": 1.6086624254996748e-06, "logits/chosen": -1.2826063632965088, "logits/rejected": -1.2882716655731201, "logps/chosen": -613.343505859375, "logps/rejected": -738.014404296875, "loss": 0.272, "rewards/accuracies": 0.875, "rewards/chosen": -0.8677641153335571, "rewards/margins": 1.6630736589431763, "rewards/rejected": -2.5308375358581543, "step": 423 }, { "epoch": 0.36, "grad_norm": 44.53266060036663, "learning_rate": 1.6062835015216854e-06, "logits/chosen": -1.4333112239837646, "logits/rejected": -1.3385894298553467, "logps/chosen": -624.1317138671875, "logps/rejected": -977.5977783203125, "loss": 0.2785, "rewards/accuracies": 0.875, "rewards/chosen": -0.29590773582458496, "rewards/margins": 2.2630515098571777, "rewards/rejected": -2.558959484100342, "step": 424 }, { "epoch": 0.36, "grad_norm": 86.4811714615724, "learning_rate": 1.6038991397316232e-06, "logits/chosen": -1.2939656972885132, "logits/rejected": -1.2937078475952148, "logps/chosen": -461.0428466796875, "logps/rejected": -537.1651611328125, "loss": 0.5664, "rewards/accuracies": 0.9375, "rewards/chosen": -0.44620293378829956, "rewards/margins": 1.0640058517456055, "rewards/rejected": -1.5102088451385498, "step": 425 }, { "epoch": 0.37, "grad_norm": 85.17690049322667, "learning_rate": 1.601509361515047e-06, "logits/chosen": -1.366929054260254, "logits/rejected": -1.3334461450576782, "logps/chosen": -486.15484619140625, "logps/rejected": -612.0838623046875, "loss": 0.5528, "rewards/accuracies": 0.625, "rewards/chosen": -0.74158775806427, "rewards/margins": 0.6057339906692505, "rewards/rejected": -1.347321629524231, "step": 426 }, { "epoch": 0.37, "grad_norm": 58.22263704872626, "learning_rate": 1.5991141883060958e-06, "logits/chosen": -1.3588616847991943, "logits/rejected": -1.2897999286651611, "logps/chosen": -541.0726318359375, "logps/rejected": -708.3511962890625, "loss": 0.3274, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3630627393722534, "rewards/margins": 1.3744697570800781, "rewards/rejected": -1.7375324964523315, "step": 427 }, { "epoch": 0.37, "grad_norm": 39.379425670836454, "learning_rate": 1.5967136415872966e-06, "logits/chosen": -1.4818212985992432, "logits/rejected": -1.4103446006774902, "logps/chosen": -411.818115234375, "logps/rejected": -620.8701171875, "loss": 0.3029, "rewards/accuracies": 0.8125, "rewards/chosen": -0.19901621341705322, "rewards/margins": 1.6631044149398804, "rewards/rejected": -1.8621206283569336, "step": 428 }, { "epoch": 0.37, "grad_norm": 71.56604182577863, "learning_rate": 1.5943077428893724e-06, "logits/chosen": -1.1709115505218506, "logits/rejected": -1.1403000354766846, "logps/chosen": -498.3337707519531, "logps/rejected": -666.076904296875, "loss": 0.3815, "rewards/accuracies": 0.8125, "rewards/chosen": -0.43662720918655396, "rewards/margins": 1.2479010820388794, "rewards/rejected": -1.684528112411499, "step": 429 }, { "epoch": 0.37, "grad_norm": 46.784228562458715, "learning_rate": 1.5918965137910478e-06, "logits/chosen": -1.3150413036346436, "logits/rejected": -1.2522664070129395, "logps/chosen": -569.6610107421875, "logps/rejected": -761.95556640625, "loss": 0.2254, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8883675932884216, "rewards/margins": 1.8457796573638916, "rewards/rejected": -2.734147310256958, "step": 430 }, { "epoch": 0.37, "grad_norm": 45.06090655791921, "learning_rate": 1.589479975918857e-06, "logits/chosen": -1.3054924011230469, "logits/rejected": -1.310208797454834, "logps/chosen": -558.9491577148438, "logps/rejected": -629.2244873046875, "loss": 0.3722, "rewards/accuracies": 0.75, "rewards/chosen": -0.9529801607131958, "rewards/margins": 1.5737230777740479, "rewards/rejected": -2.526703357696533, "step": 431 }, { "epoch": 0.37, "grad_norm": 65.54274021493728, "learning_rate": 1.5870581509469486e-06, "logits/chosen": -1.0618208646774292, "logits/rejected": -1.030731201171875, "logps/chosen": -448.14190673828125, "logps/rejected": -519.9080810546875, "loss": 0.4505, "rewards/accuracies": 0.625, "rewards/chosen": -0.646375298500061, "rewards/margins": 1.157083511352539, "rewards/rejected": -1.8034586906433105, "step": 432 }, { "epoch": 0.37, "grad_norm": 39.39951403805403, "learning_rate": 1.5846310605968923e-06, "logits/chosen": -1.3308734893798828, "logits/rejected": -1.255303978919983, "logps/chosen": -635.2354736328125, "logps/rejected": -828.3485107421875, "loss": 0.2347, "rewards/accuracies": 0.875, "rewards/chosen": -0.5919246673583984, "rewards/margins": 2.2630014419555664, "rewards/rejected": -2.854926109313965, "step": 433 }, { "epoch": 0.37, "grad_norm": 69.90226205895301, "learning_rate": 1.5821987266374826e-06, "logits/chosen": -1.2554914951324463, "logits/rejected": -1.2407859563827515, "logps/chosen": -719.94580078125, "logps/rejected": -779.0153198242188, "loss": 0.2793, "rewards/accuracies": 0.875, "rewards/chosen": -0.9973692893981934, "rewards/margins": 1.8503321409225464, "rewards/rejected": -2.8477015495300293, "step": 434 }, { "epoch": 0.37, "grad_norm": 64.8893318380094, "learning_rate": 1.5797611708845447e-06, "logits/chosen": -1.3571207523345947, "logits/rejected": -1.3038105964660645, "logps/chosen": -515.7115478515625, "logps/rejected": -759.386474609375, "loss": 0.3357, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5526295900344849, "rewards/margins": 2.4585442543029785, "rewards/rejected": -3.011174201965332, "step": 435 }, { "epoch": 0.37, "grad_norm": 33.07107510329828, "learning_rate": 1.577318415200739e-06, "logits/chosen": -1.208054780960083, "logits/rejected": -1.1525557041168213, "logps/chosen": -488.433837890625, "logps/rejected": -620.9463500976562, "loss": 0.1896, "rewards/accuracies": 0.9375, "rewards/chosen": -0.821509599685669, "rewards/margins": 2.117979049682617, "rewards/rejected": -2.939488649368286, "step": 436 }, { "epoch": 0.37, "grad_norm": 73.00783877428485, "learning_rate": 1.5748704814953643e-06, "logits/chosen": -1.3184621334075928, "logits/rejected": -1.2896759510040283, "logps/chosen": -582.6005859375, "logps/rejected": -633.3236694335938, "loss": 0.3145, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8894643783569336, "rewards/margins": 1.5803616046905518, "rewards/rejected": -2.4698259830474854, "step": 437 }, { "epoch": 0.38, "grad_norm": 77.38950721995847, "learning_rate": 1.5724173917241611e-06, "logits/chosen": -1.3838648796081543, "logits/rejected": -1.3053375482559204, "logps/chosen": -542.278564453125, "logps/rejected": -678.4103393554688, "loss": 0.5398, "rewards/accuracies": 0.875, "rewards/chosen": -0.7684734463691711, "rewards/margins": 1.3273355960845947, "rewards/rejected": -2.095808982849121, "step": 438 }, { "epoch": 0.38, "grad_norm": 45.569714523985276, "learning_rate": 1.5699591678891157e-06, "logits/chosen": -1.4103734493255615, "logits/rejected": -1.2873899936676025, "logps/chosen": -566.2435302734375, "logps/rejected": -853.83740234375, "loss": 0.2647, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6442338228225708, "rewards/margins": 2.577425241470337, "rewards/rejected": -3.2216591835021973, "step": 439 }, { "epoch": 0.38, "grad_norm": 41.351074889929265, "learning_rate": 1.5674958320382623e-06, "logits/chosen": -1.4523862600326538, "logits/rejected": -1.3567168712615967, "logps/chosen": -538.0159912109375, "logps/rejected": -816.7225341796875, "loss": 0.2577, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5122110843658447, "rewards/margins": 1.928330421447754, "rewards/rejected": -2.4405415058135986, "step": 440 }, { "epoch": 0.38, "grad_norm": 54.720135217149405, "learning_rate": 1.5650274062654844e-06, "logits/chosen": -1.1927452087402344, "logits/rejected": -1.1439197063446045, "logps/chosen": -622.1829833984375, "logps/rejected": -811.093017578125, "loss": 0.2502, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5815736055374146, "rewards/margins": 1.7208086252212524, "rewards/rejected": -2.302382469177246, "step": 441 }, { "epoch": 0.38, "grad_norm": 72.94654723585565, "learning_rate": 1.5625539127103188e-06, "logits/chosen": -1.243112564086914, "logits/rejected": -1.1448404788970947, "logps/chosen": -459.63360595703125, "logps/rejected": -657.576171875, "loss": 0.5021, "rewards/accuracies": 0.75, "rewards/chosen": -0.09865646809339523, "rewards/margins": 0.6924504637718201, "rewards/rejected": -0.7911069393157959, "step": 442 }, { "epoch": 0.38, "grad_norm": 50.440504920610245, "learning_rate": 1.5600753735577547e-06, "logits/chosen": -1.264277458190918, "logits/rejected": -1.223200798034668, "logps/chosen": -386.2403259277344, "logps/rejected": -529.3462524414062, "loss": 0.4116, "rewards/accuracies": 0.6875, "rewards/chosen": 0.03323012590408325, "rewards/margins": 0.9878279566764832, "rewards/rejected": -0.9545978903770447, "step": 443 }, { "epoch": 0.38, "grad_norm": 74.39359903694583, "learning_rate": 1.5575918110380362e-06, "logits/chosen": -1.182845115661621, "logits/rejected": -1.1100221872329712, "logps/chosen": -555.8534545898438, "logps/rejected": -643.8975830078125, "loss": 0.3459, "rewards/accuracies": 0.875, "rewards/chosen": -0.7339385747909546, "rewards/margins": 1.4555790424346924, "rewards/rejected": -2.1895174980163574, "step": 444 }, { "epoch": 0.38, "grad_norm": 72.22911123903084, "learning_rate": 1.5551032474264618e-06, "logits/chosen": -1.075033187866211, "logits/rejected": -1.032829999923706, "logps/chosen": -525.1008911132812, "logps/rejected": -743.3673706054688, "loss": 0.4397, "rewards/accuracies": 0.875, "rewards/chosen": -0.8419838547706604, "rewards/margins": 1.3715593814849854, "rewards/rejected": -2.21354341506958, "step": 445 }, { "epoch": 0.38, "grad_norm": 45.646517241158904, "learning_rate": 1.5526097050431863e-06, "logits/chosen": -1.1065202951431274, "logits/rejected": -1.0796585083007812, "logps/chosen": -541.9082641601562, "logps/rejected": -585.971435546875, "loss": 0.3449, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7269726991653442, "rewards/margins": 1.2809374332427979, "rewards/rejected": -2.0079102516174316, "step": 446 }, { "epoch": 0.38, "grad_norm": 62.86632108103949, "learning_rate": 1.5501112062530185e-06, "logits/chosen": -1.0184428691864014, "logits/rejected": -0.985652506351471, "logps/chosen": -603.8740844726562, "logps/rejected": -795.9415283203125, "loss": 0.3501, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2108818292617798, "rewards/margins": 1.2921862602233887, "rewards/rejected": -2.503067970275879, "step": 447 }, { "epoch": 0.38, "grad_norm": 33.58903116651355, "learning_rate": 1.5476077734652222e-06, "logits/chosen": -0.8725420236587524, "logits/rejected": -0.8370548486709595, "logps/chosen": -456.96435546875, "logps/rejected": -775.3349609375, "loss": 0.2112, "rewards/accuracies": 1.0, "rewards/chosen": -0.5516107082366943, "rewards/margins": 2.407532215118408, "rewards/rejected": -2.9591431617736816, "step": 448 }, { "epoch": 0.39, "grad_norm": 47.09393568036316, "learning_rate": 1.5450994291333151e-06, "logits/chosen": -0.9912916421890259, "logits/rejected": -0.9687042236328125, "logps/chosen": -385.6941833496094, "logps/rejected": -556.1038818359375, "loss": 0.4064, "rewards/accuracies": 0.875, "rewards/chosen": -0.5287074446678162, "rewards/margins": 1.4976418018341064, "rewards/rejected": -2.0263493061065674, "step": 449 }, { "epoch": 0.39, "grad_norm": 73.63416167778202, "learning_rate": 1.5425861957548657e-06, "logits/chosen": -1.1284853219985962, "logits/rejected": -1.0713797807693481, "logps/chosen": -629.8704833984375, "logps/rejected": -706.3995361328125, "loss": 0.3777, "rewards/accuracies": 0.875, "rewards/chosen": -1.2089190483093262, "rewards/margins": 1.2346839904785156, "rewards/rejected": -2.443603038787842, "step": 450 }, { "epoch": 0.39, "grad_norm": 72.98530696780311, "learning_rate": 1.5400680958712942e-06, "logits/chosen": -1.0030968189239502, "logits/rejected": -0.974974513053894, "logps/chosen": -400.2067565917969, "logps/rejected": -490.24041748046875, "loss": 0.3404, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5827921032905579, "rewards/margins": 1.6719770431518555, "rewards/rejected": -2.2547693252563477, "step": 451 }, { "epoch": 0.39, "grad_norm": 38.68236963528052, "learning_rate": 1.5375451520676684e-06, "logits/chosen": -1.0761189460754395, "logits/rejected": -1.0248160362243652, "logps/chosen": -408.9266357421875, "logps/rejected": -681.0333251953125, "loss": 0.2131, "rewards/accuracies": 1.0, "rewards/chosen": -0.7555465698242188, "rewards/margins": 1.823000192642212, "rewards/rejected": -2.5785470008850098, "step": 452 }, { "epoch": 0.39, "grad_norm": 107.75050665764365, "learning_rate": 1.5350173869725009e-06, "logits/chosen": -0.9483497142791748, "logits/rejected": -0.9131959676742554, "logps/chosen": -538.731689453125, "logps/rejected": -595.2650756835938, "loss": 0.6406, "rewards/accuracies": 0.75, "rewards/chosen": -0.9458996057510376, "rewards/margins": 1.2236244678497314, "rewards/rejected": -2.1695239543914795, "step": 453 }, { "epoch": 0.39, "grad_norm": 53.19153906331593, "learning_rate": 1.5324848232575482e-06, "logits/chosen": -0.9808636903762817, "logits/rejected": -0.9459196329116821, "logps/chosen": -538.7525634765625, "logps/rejected": -663.7606201171875, "loss": 0.2758, "rewards/accuracies": 0.875, "rewards/chosen": -1.0494015216827393, "rewards/margins": 1.7582777738571167, "rewards/rejected": -2.8076791763305664, "step": 454 }, { "epoch": 0.39, "grad_norm": 54.0164683776154, "learning_rate": 1.5299474836376055e-06, "logits/chosen": -1.0516619682312012, "logits/rejected": -1.0118380784988403, "logps/chosen": -467.1536560058594, "logps/rejected": -640.5703125, "loss": 0.297, "rewards/accuracies": 0.875, "rewards/chosen": -0.914126455783844, "rewards/margins": 1.8050568103790283, "rewards/rejected": -2.7191834449768066, "step": 455 }, { "epoch": 0.39, "grad_norm": 69.76944754945215, "learning_rate": 1.5274053908703033e-06, "logits/chosen": -0.9428573846817017, "logits/rejected": -0.8974170684814453, "logps/chosen": -631.4688720703125, "logps/rejected": -762.390380859375, "loss": 0.4408, "rewards/accuracies": 0.875, "rewards/chosen": -0.977918267250061, "rewards/margins": 1.8812000751495361, "rewards/rejected": -2.8591182231903076, "step": 456 }, { "epoch": 0.39, "grad_norm": 35.547985558837226, "learning_rate": 1.5248585677559032e-06, "logits/chosen": -0.9528816342353821, "logits/rejected": -0.9294205904006958, "logps/chosen": -422.3802490234375, "logps/rejected": -676.3129272460938, "loss": 0.2149, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5416313409805298, "rewards/margins": 2.1708362102508545, "rewards/rejected": -2.712467670440674, "step": 457 }, { "epoch": 0.39, "grad_norm": 45.82051469410822, "learning_rate": 1.5223070371370953e-06, "logits/chosen": -0.8964556455612183, "logits/rejected": -0.8561989068984985, "logps/chosen": -565.7823486328125, "logps/rejected": -795.1091918945312, "loss": 0.1674, "rewards/accuracies": 1.0, "rewards/chosen": -0.7855107188224792, "rewards/margins": 2.320809841156006, "rewards/rejected": -3.106320381164551, "step": 458 }, { "epoch": 0.39, "grad_norm": 62.130735305048916, "learning_rate": 1.51975082189879e-06, "logits/chosen": -0.9833825826644897, "logits/rejected": -0.9350192546844482, "logps/chosen": -642.7136840820312, "logps/rejected": -894.32568359375, "loss": 0.3018, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6459782123565674, "rewards/margins": 2.0245954990386963, "rewards/rejected": -2.6705734729766846, "step": 459 }, { "epoch": 0.39, "grad_norm": 43.43854358154253, "learning_rate": 1.517189944967915e-06, "logits/chosen": -0.9655320644378662, "logits/rejected": -0.9203553199768066, "logps/chosen": -484.23211669921875, "logps/rejected": -691.60693359375, "loss": 0.1823, "rewards/accuracies": 0.9375, "rewards/chosen": -0.607116162776947, "rewards/margins": 2.231950044631958, "rewards/rejected": -2.83906626701355, "step": 460 }, { "epoch": 0.4, "grad_norm": 55.04848780763436, "learning_rate": 1.5146244293132094e-06, "logits/chosen": -0.9897803068161011, "logits/rejected": -0.9608893394470215, "logps/chosen": -388.45849609375, "logps/rejected": -519.8792724609375, "loss": 0.4448, "rewards/accuracies": 0.75, "rewards/chosen": -0.5719874501228333, "rewards/margins": 1.5956003665924072, "rewards/rejected": -2.1675877571105957, "step": 461 }, { "epoch": 0.4, "grad_norm": 56.574543353720905, "learning_rate": 1.5120542979450173e-06, "logits/chosen": -0.928221583366394, "logits/rejected": -0.8038352727890015, "logps/chosen": -673.7662353515625, "logps/rejected": -914.059326171875, "loss": 0.2237, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8671103715896606, "rewards/margins": 2.07771635055542, "rewards/rejected": -2.94482684135437, "step": 462 }, { "epoch": 0.4, "grad_norm": 62.25539236311385, "learning_rate": 1.509479573915082e-06, "logits/chosen": -0.8410499095916748, "logits/rejected": -0.8236804008483887, "logps/chosen": -453.6719970703125, "logps/rejected": -598.1178588867188, "loss": 0.3562, "rewards/accuracies": 0.875, "rewards/chosen": -0.49575868248939514, "rewards/margins": 1.7100486755371094, "rewards/rejected": -2.2058072090148926, "step": 463 }, { "epoch": 0.4, "grad_norm": 236.08526660286373, "learning_rate": 1.5069002803163375e-06, "logits/chosen": -0.9178378582000732, "logits/rejected": -0.8615768551826477, "logps/chosen": -548.9498291015625, "logps/rejected": -557.3529052734375, "loss": 0.4353, "rewards/accuracies": 0.875, "rewards/chosen": -0.7421918511390686, "rewards/margins": 0.7083767652511597, "rewards/rejected": -1.450568675994873, "step": 464 }, { "epoch": 0.4, "grad_norm": 58.82894013303105, "learning_rate": 1.5043164402827043e-06, "logits/chosen": -0.8117421865463257, "logits/rejected": -0.8035683631896973, "logps/chosen": -371.0589294433594, "logps/rejected": -584.568603515625, "loss": 0.3537, "rewards/accuracies": 0.875, "rewards/chosen": -0.3649640381336212, "rewards/margins": 1.6534215211868286, "rewards/rejected": -2.018385410308838, "step": 465 }, { "epoch": 0.4, "grad_norm": 52.38409147957441, "learning_rate": 1.5017280769888791e-06, "logits/chosen": -0.9991079568862915, "logits/rejected": -0.9378537535667419, "logps/chosen": -431.78387451171875, "logps/rejected": -594.4720458984375, "loss": 0.3443, "rewards/accuracies": 0.875, "rewards/chosen": -0.2835843563079834, "rewards/margins": 1.412635087966919, "rewards/rejected": -1.6962194442749023, "step": 466 }, { "epoch": 0.4, "grad_norm": 57.99597845314727, "learning_rate": 1.4991352136501295e-06, "logits/chosen": -1.061201810836792, "logits/rejected": -1.0437837839126587, "logps/chosen": -419.2176208496094, "logps/rejected": -547.6530151367188, "loss": 0.4095, "rewards/accuracies": 0.875, "rewards/chosen": -0.7993943691253662, "rewards/margins": 1.1544561386108398, "rewards/rejected": -1.9538503885269165, "step": 467 }, { "epoch": 0.4, "grad_norm": 55.909921186407395, "learning_rate": 1.4965378735220821e-06, "logits/chosen": -0.8839898705482483, "logits/rejected": -0.8770423531532288, "logps/chosen": -568.14794921875, "logps/rejected": -774.8721313476562, "loss": 0.276, "rewards/accuracies": 1.0, "rewards/chosen": -1.2065205574035645, "rewards/margins": 1.614347219467163, "rewards/rejected": -2.8208677768707275, "step": 468 }, { "epoch": 0.4, "grad_norm": 28.50181966886888, "learning_rate": 1.4939360799005183e-06, "logits/chosen": -0.881384015083313, "logits/rejected": -0.852988064289093, "logps/chosen": -335.64569091796875, "logps/rejected": -606.104736328125, "loss": 0.239, "rewards/accuracies": 0.875, "rewards/chosen": -0.6380329132080078, "rewards/margins": 1.972926139831543, "rewards/rejected": -2.6109588146209717, "step": 469 }, { "epoch": 0.4, "grad_norm": 74.33240566346468, "learning_rate": 1.4913298561211627e-06, "logits/chosen": -0.868405818939209, "logits/rejected": -0.869547426700592, "logps/chosen": -380.373779296875, "logps/rejected": -620.7781982421875, "loss": 0.4729, "rewards/accuracies": 0.6875, "rewards/chosen": -0.89825439453125, "rewards/margins": 1.3418068885803223, "rewards/rejected": -2.2400612831115723, "step": 470 }, { "epoch": 0.4, "grad_norm": 62.01430464542387, "learning_rate": 1.4887192255594744e-06, "logits/chosen": -0.9125020503997803, "logits/rejected": -0.8442560434341431, "logps/chosen": -540.2130126953125, "logps/rejected": -651.144287109375, "loss": 0.4313, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0115090608596802, "rewards/margins": 1.6366745233535767, "rewards/rejected": -2.648183584213257, "step": 471 }, { "epoch": 0.4, "grad_norm": 88.73643027706079, "learning_rate": 1.4861042116304369e-06, "logits/chosen": -0.8929340839385986, "logits/rejected": -0.7871063351631165, "logps/chosen": -654.5023193359375, "logps/rejected": -625.2041625976562, "loss": 0.4566, "rewards/accuracies": 0.75, "rewards/chosen": -1.387766718864441, "rewards/margins": 1.0511832237243652, "rewards/rejected": -2.4389500617980957, "step": 472 }, { "epoch": 0.41, "grad_norm": 68.26138348422057, "learning_rate": 1.4834848377883486e-06, "logits/chosen": -0.8650990724563599, "logits/rejected": -0.8362406492233276, "logps/chosen": -513.7879638671875, "logps/rejected": -628.8133544921875, "loss": 0.3425, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9172006845474243, "rewards/margins": 1.5339590311050415, "rewards/rejected": -2.451159715652466, "step": 473 }, { "epoch": 0.41, "grad_norm": 46.12241480060752, "learning_rate": 1.480861127526613e-06, "logits/chosen": -0.9154388904571533, "logits/rejected": -0.8683236837387085, "logps/chosen": -431.5248107910156, "logps/rejected": -608.5435791015625, "loss": 0.203, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6946539878845215, "rewards/margins": 2.112185478210449, "rewards/rejected": -2.80683970451355, "step": 474 }, { "epoch": 0.41, "grad_norm": 45.097472853318116, "learning_rate": 1.4782331043775276e-06, "logits/chosen": -0.8772906064987183, "logits/rejected": -0.8202885389328003, "logps/chosen": -567.1807250976562, "logps/rejected": -685.1033935546875, "loss": 0.2714, "rewards/accuracies": 0.875, "rewards/chosen": -0.72135990858078, "rewards/margins": 2.1727640628814697, "rewards/rejected": -2.8941240310668945, "step": 475 }, { "epoch": 0.41, "grad_norm": 73.02755926747012, "learning_rate": 1.4756007919120708e-06, "logits/chosen": -0.8577795028686523, "logits/rejected": -0.837565541267395, "logps/chosen": -476.64483642578125, "logps/rejected": -730.842529296875, "loss": 0.399, "rewards/accuracies": 0.875, "rewards/chosen": -0.8776594996452332, "rewards/margins": 1.9460971355438232, "rewards/rejected": -2.823756694793701, "step": 476 }, { "epoch": 0.41, "grad_norm": 54.704647833843524, "learning_rate": 1.472964213739694e-06, "logits/chosen": -0.9222075939178467, "logits/rejected": -0.8904163837432861, "logps/chosen": -489.0938720703125, "logps/rejected": -712.3372802734375, "loss": 0.2647, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1723819971084595, "rewards/margins": 1.9933929443359375, "rewards/rejected": -3.1657748222351074, "step": 477 }, { "epoch": 0.41, "grad_norm": 50.10043792173988, "learning_rate": 1.470323393508107e-06, "logits/chosen": -0.9438526630401611, "logits/rejected": -0.8643888235092163, "logps/chosen": -700.4232177734375, "logps/rejected": -811.2674560546875, "loss": 0.2606, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8976376056671143, "rewards/margins": 2.241746425628662, "rewards/rejected": -3.1393840312957764, "step": 478 }, { "epoch": 0.41, "grad_norm": 28.369621605558525, "learning_rate": 1.4676783549030684e-06, "logits/chosen": -0.8669946789741516, "logits/rejected": -0.8010271787643433, "logps/chosen": -577.3978271484375, "logps/rejected": -856.3585205078125, "loss": 0.1487, "rewards/accuracies": 1.0, "rewards/chosen": -0.5408543348312378, "rewards/margins": 2.634451389312744, "rewards/rejected": -3.1753058433532715, "step": 479 }, { "epoch": 0.41, "grad_norm": 61.711699275612446, "learning_rate": 1.4650291216481706e-06, "logits/chosen": -0.8652939796447754, "logits/rejected": -0.771173357963562, "logps/chosen": -670.241943359375, "logps/rejected": -756.6165771484375, "loss": 0.3041, "rewards/accuracies": 0.9375, "rewards/chosen": -0.716055691242218, "rewards/margins": 1.8450524806976318, "rewards/rejected": -2.561108112335205, "step": 480 }, { "epoch": 0.41, "grad_norm": 93.40482934075946, "learning_rate": 1.4623757175046278e-06, "logits/chosen": -0.751588761806488, "logits/rejected": -0.7028893232345581, "logps/chosen": -497.6466064453125, "logps/rejected": -766.5819091796875, "loss": 0.6932, "rewards/accuracies": 0.5625, "rewards/chosen": -0.17273837327957153, "rewards/margins": 0.17415812611579895, "rewards/rejected": -0.34689652919769287, "step": 481 }, { "epoch": 0.41, "grad_norm": 69.34539505945266, "learning_rate": 1.459718166271065e-06, "logits/chosen": -0.8390201330184937, "logits/rejected": -0.7440624237060547, "logps/chosen": -695.57666015625, "logps/rejected": -682.0604858398438, "loss": 0.365, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8433142900466919, "rewards/margins": 1.2970421314239502, "rewards/rejected": -2.1403563022613525, "step": 482 }, { "epoch": 0.41, "grad_norm": 66.15245519454665, "learning_rate": 1.457056491783301e-06, "logits/chosen": -0.8030404448509216, "logits/rejected": -0.77801114320755, "logps/chosen": -459.1448974609375, "logps/rejected": -622.8702392578125, "loss": 0.4067, "rewards/accuracies": 0.875, "rewards/chosen": -0.6972125768661499, "rewards/margins": 1.2079944610595703, "rewards/rejected": -1.9052069187164307, "step": 483 }, { "epoch": 0.42, "grad_norm": 67.25380978057618, "learning_rate": 1.454390717914138e-06, "logits/chosen": -0.8256564140319824, "logits/rejected": -0.8271548748016357, "logps/chosen": -392.1229248046875, "logps/rejected": -539.0008544921875, "loss": 0.4194, "rewards/accuracies": 0.75, "rewards/chosen": -0.6776982545852661, "rewards/margins": 1.1847028732299805, "rewards/rejected": -1.862401008605957, "step": 484 }, { "epoch": 0.42, "grad_norm": 50.637462599424396, "learning_rate": 1.4517208685731445e-06, "logits/chosen": -0.729097843170166, "logits/rejected": -0.721123456954956, "logps/chosen": -490.776611328125, "logps/rejected": -603.486083984375, "loss": 0.3576, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4171733260154724, "rewards/margins": 1.5965278148651123, "rewards/rejected": -2.0137009620666504, "step": 485 }, { "epoch": 0.42, "grad_norm": 45.79116637926291, "learning_rate": 1.4490469677064435e-06, "logits/chosen": -0.8838560581207275, "logits/rejected": -0.7998265027999878, "logps/chosen": -529.9131469726562, "logps/rejected": -649.6559448242188, "loss": 0.2911, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9633562564849854, "rewards/margins": 1.5850216150283813, "rewards/rejected": -2.548377752304077, "step": 486 }, { "epoch": 0.42, "grad_norm": 69.41826659241754, "learning_rate": 1.4463690392964955e-06, "logits/chosen": -0.8535840511322021, "logits/rejected": -0.7974878549575806, "logps/chosen": -609.5081787109375, "logps/rejected": -682.191650390625, "loss": 0.3662, "rewards/accuracies": 0.75, "rewards/chosen": -1.127061367034912, "rewards/margins": 1.7252570390701294, "rewards/rejected": -2.85231876373291, "step": 487 }, { "epoch": 0.42, "grad_norm": 47.61794633825491, "learning_rate": 1.4436871073618857e-06, "logits/chosen": -0.8231527805328369, "logits/rejected": -0.7265893220901489, "logps/chosen": -580.491943359375, "logps/rejected": -609.302734375, "loss": 0.2244, "rewards/accuracies": 0.875, "rewards/chosen": -0.7856195569038391, "rewards/margins": 2.2862648963928223, "rewards/rejected": -3.0718846321105957, "step": 488 }, { "epoch": 0.42, "grad_norm": 88.33257603126563, "learning_rate": 1.4410011959571051e-06, "logits/chosen": -0.7357794046401978, "logits/rejected": -0.705422043800354, "logps/chosen": -509.08978271484375, "logps/rejected": -619.1658325195312, "loss": 0.4252, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9400831460952759, "rewards/margins": 1.3381483554840088, "rewards/rejected": -2.278231620788574, "step": 489 }, { "epoch": 0.42, "grad_norm": 85.65292490962175, "learning_rate": 1.4383113291723398e-06, "logits/chosen": -0.808850884437561, "logits/rejected": -0.8109133243560791, "logps/chosen": -512.6813354492188, "logps/rejected": -797.9263916015625, "loss": 0.2337, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8441105484962463, "rewards/margins": 2.7022745609283447, "rewards/rejected": -3.5463852882385254, "step": 490 }, { "epoch": 0.42, "grad_norm": 55.69288279315483, "learning_rate": 1.4356175311332495e-06, "logits/chosen": -0.876907467842102, "logits/rejected": -0.8422799110412598, "logps/chosen": -454.2169189453125, "logps/rejected": -666.6516723632812, "loss": 0.3201, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6969516277313232, "rewards/margins": 1.889945149421692, "rewards/rejected": -2.5868966579437256, "step": 491 }, { "epoch": 0.42, "grad_norm": 46.51583166431031, "learning_rate": 1.4329198260007551e-06, "logits/chosen": -0.849693775177002, "logits/rejected": -0.8062174320220947, "logps/chosen": -663.0662841796875, "logps/rejected": -922.7932739257812, "loss": 0.2157, "rewards/accuracies": 0.875, "rewards/chosen": -0.8148276209831238, "rewards/margins": 2.7568774223327637, "rewards/rejected": -3.5717051029205322, "step": 492 }, { "epoch": 0.42, "grad_norm": 75.15763989738159, "learning_rate": 1.4302182379708203e-06, "logits/chosen": -0.758513331413269, "logits/rejected": -0.754541277885437, "logps/chosen": -380.8668212890625, "logps/rejected": -550.5960693359375, "loss": 0.4863, "rewards/accuracies": 0.875, "rewards/chosen": -0.7542673945426941, "rewards/margins": 1.8577656745910645, "rewards/rejected": -2.612032890319824, "step": 493 }, { "epoch": 0.42, "grad_norm": 64.8880511331801, "learning_rate": 1.4275127912742343e-06, "logits/chosen": -0.8404750823974609, "logits/rejected": -0.8266079425811768, "logps/chosen": -414.9617004394531, "logps/rejected": -634.954345703125, "loss": 0.2856, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7905468940734863, "rewards/margins": 2.1434035301208496, "rewards/rejected": -2.933950424194336, "step": 494 }, { "epoch": 0.42, "grad_norm": 50.93952365848894, "learning_rate": 1.4248035101763962e-06, "logits/chosen": -0.8301562666893005, "logits/rejected": -0.8052735328674316, "logps/chosen": -585.7339477539062, "logps/rejected": -790.9482421875, "loss": 0.2048, "rewards/accuracies": 1.0, "rewards/chosen": -1.060710072517395, "rewards/margins": 2.148515462875366, "rewards/rejected": -3.209225654602051, "step": 495 }, { "epoch": 0.43, "grad_norm": 31.16762541396582, "learning_rate": 1.422090418977095e-06, "logits/chosen": -0.8170212507247925, "logits/rejected": -0.7670871019363403, "logps/chosen": -566.548583984375, "logps/rejected": -701.8504638671875, "loss": 0.2523, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7676441669464111, "rewards/margins": 2.0442118644714355, "rewards/rejected": -2.8118557929992676, "step": 496 }, { "epoch": 0.43, "grad_norm": 61.98372066168573, "learning_rate": 1.4193735420102932e-06, "logits/chosen": -0.8982579708099365, "logits/rejected": -0.8410770297050476, "logps/chosen": -454.56805419921875, "logps/rejected": -605.292724609375, "loss": 0.4107, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1641063690185547, "rewards/margins": 1.3544279336929321, "rewards/rejected": -2.5185341835021973, "step": 497 }, { "epoch": 0.43, "grad_norm": 49.7457946102483, "learning_rate": 1.4166529036439092e-06, "logits/chosen": -0.8498281240463257, "logits/rejected": -0.8173060417175293, "logps/chosen": -638.1928100585938, "logps/rejected": -833.4638671875, "loss": 0.2477, "rewards/accuracies": 0.875, "rewards/chosen": -0.9349879622459412, "rewards/margins": 2.4696130752563477, "rewards/rejected": -3.4046010971069336, "step": 498 }, { "epoch": 0.43, "grad_norm": 27.961072326529063, "learning_rate": 1.413928528279596e-06, "logits/chosen": -0.7953431606292725, "logits/rejected": -0.8111715316772461, "logps/chosen": -405.8520202636719, "logps/rejected": -663.355224609375, "loss": 0.1762, "rewards/accuracies": 1.0, "rewards/chosen": -0.34801992774009705, "rewards/margins": 2.1400349140167236, "rewards/rejected": -2.4880547523498535, "step": 499 }, { "epoch": 0.43, "grad_norm": 64.49815264864202, "learning_rate": 1.411200440352525e-06, "logits/chosen": -0.8560844659805298, "logits/rejected": -0.8431380987167358, "logps/chosen": -539.9221801757812, "logps/rejected": -746.8150024414062, "loss": 0.3124, "rewards/accuracies": 0.875, "rewards/chosen": -0.9053245186805725, "rewards/margins": 1.952344298362732, "rewards/rejected": -2.85766863822937, "step": 500 }, { "epoch": 0.43, "grad_norm": 38.310902654677925, "learning_rate": 1.4084686643311666e-06, "logits/chosen": -0.8619130849838257, "logits/rejected": -0.7855139374732971, "logps/chosen": -505.196044921875, "logps/rejected": -717.7348022460938, "loss": 0.1972, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6216053366661072, "rewards/margins": 2.5277938842773438, "rewards/rejected": -3.1493990421295166, "step": 501 }, { "epoch": 0.43, "grad_norm": 71.1766081581125, "learning_rate": 1.4057332247170684e-06, "logits/chosen": -0.8671563863754272, "logits/rejected": -0.8257827758789062, "logps/chosen": -600.34814453125, "logps/rejected": -878.1358642578125, "loss": 0.4022, "rewards/accuracies": 0.875, "rewards/chosen": -1.3607724905014038, "rewards/margins": 2.288400888442993, "rewards/rejected": -3.6491734981536865, "step": 502 }, { "epoch": 0.43, "grad_norm": 78.98485490319837, "learning_rate": 1.4029941460446385e-06, "logits/chosen": -0.7788889408111572, "logits/rejected": -0.8246132135391235, "logps/chosen": -631.4876708984375, "logps/rejected": -972.2409057617188, "loss": 0.3269, "rewards/accuracies": 0.8125, "rewards/chosen": -1.226896047592163, "rewards/margins": 2.414889097213745, "rewards/rejected": -3.641785144805908, "step": 503 }, { "epoch": 0.43, "grad_norm": 74.91603010165284, "learning_rate": 1.4002514528809234e-06, "logits/chosen": -0.837902307510376, "logits/rejected": -0.8045358061790466, "logps/chosen": -474.34075927734375, "logps/rejected": -522.15771484375, "loss": 0.4926, "rewards/accuracies": 0.75, "rewards/chosen": -0.8030575513839722, "rewards/margins": 0.9458550214767456, "rewards/rejected": -1.7489125728607178, "step": 504 }, { "epoch": 0.43, "grad_norm": 39.39261676529048, "learning_rate": 1.397505169825389e-06, "logits/chosen": -0.8392449021339417, "logits/rejected": -0.8159143328666687, "logps/chosen": -501.5545959472656, "logps/rejected": -852.7933349609375, "loss": 0.1755, "rewards/accuracies": 1.0, "rewards/chosen": -0.8916856646537781, "rewards/margins": 2.3363566398620605, "rewards/rejected": -3.2280421257019043, "step": 505 }, { "epoch": 0.43, "grad_norm": 44.17201764139978, "learning_rate": 1.394755321509698e-06, "logits/chosen": -0.8164315223693848, "logits/rejected": -0.7713231444358826, "logps/chosen": -511.5898132324219, "logps/rejected": -640.146484375, "loss": 0.2406, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3921697735786438, "rewards/margins": 1.7518031597137451, "rewards/rejected": -2.143972873687744, "step": 506 }, { "epoch": 0.43, "grad_norm": 39.94324558492835, "learning_rate": 1.3920019325974915e-06, "logits/chosen": -0.8083140850067139, "logits/rejected": -0.8277095556259155, "logps/chosen": -283.79443359375, "logps/rejected": -565.7360229492188, "loss": 0.2681, "rewards/accuracies": 1.0, "rewards/chosen": -0.32972055673599243, "rewards/margins": 1.5236279964447021, "rewards/rejected": -1.853348731994629, "step": 507 }, { "epoch": 0.44, "grad_norm": 49.77860810968748, "learning_rate": 1.3892450277841655e-06, "logits/chosen": -0.8014517426490784, "logits/rejected": -0.7942814826965332, "logps/chosen": -403.44921875, "logps/rejected": -607.3531494140625, "loss": 0.2493, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6396724581718445, "rewards/margins": 1.6928253173828125, "rewards/rejected": -2.3324978351593018, "step": 508 }, { "epoch": 0.44, "grad_norm": 34.07100201871765, "learning_rate": 1.3864846317966512e-06, "logits/chosen": -0.8613470792770386, "logits/rejected": -0.828007698059082, "logps/chosen": -487.0976867675781, "logps/rejected": -767.64697265625, "loss": 0.1583, "rewards/accuracies": 1.0, "rewards/chosen": -0.4158779978752136, "rewards/margins": 2.6424601078033447, "rewards/rejected": -3.058338165283203, "step": 509 }, { "epoch": 0.44, "grad_norm": 32.23216272215837, "learning_rate": 1.3837207693931925e-06, "logits/chosen": -0.8819748163223267, "logits/rejected": -0.858333945274353, "logps/chosen": -306.4765319824219, "logps/rejected": -589.0430908203125, "loss": 0.1842, "rewards/accuracies": 0.9375, "rewards/chosen": -0.46015307307243347, "rewards/margins": 2.3081138134002686, "rewards/rejected": -2.7682669162750244, "step": 510 }, { "epoch": 0.44, "grad_norm": 47.186284111106794, "learning_rate": 1.3809534653631233e-06, "logits/chosen": -0.9686492681503296, "logits/rejected": -0.8675292730331421, "logps/chosen": -590.91259765625, "logps/rejected": -778.565673828125, "loss": 0.1576, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4737451672554016, "rewards/margins": 3.0512213706970215, "rewards/rejected": -3.524966239929199, "step": 511 }, { "epoch": 0.44, "grad_norm": 35.326151954502585, "learning_rate": 1.3781827445266458e-06, "logits/chosen": -0.8942842483520508, "logits/rejected": -0.8705543279647827, "logps/chosen": -452.56634521484375, "logps/rejected": -680.036376953125, "loss": 0.2228, "rewards/accuracies": 0.875, "rewards/chosen": -0.7477368116378784, "rewards/margins": 2.2438342571258545, "rewards/rejected": -2.9915709495544434, "step": 512 }, { "epoch": 0.44, "grad_norm": 39.76095534582754, "learning_rate": 1.3754086317346087e-06, "logits/chosen": -0.902711033821106, "logits/rejected": -0.8663870096206665, "logps/chosen": -497.970703125, "logps/rejected": -667.9498291015625, "loss": 0.2127, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7539928555488586, "rewards/margins": 2.4936609268188477, "rewards/rejected": -3.2476539611816406, "step": 513 }, { "epoch": 0.44, "grad_norm": 68.85063612286014, "learning_rate": 1.3726311518682827e-06, "logits/chosen": -0.9456419944763184, "logits/rejected": -0.8960084915161133, "logps/chosen": -484.5071105957031, "logps/rejected": -608.3556518554688, "loss": 0.3153, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8282623291015625, "rewards/margins": 1.6905174255371094, "rewards/rejected": -2.518779993057251, "step": 514 }, { "epoch": 0.44, "grad_norm": 70.97166935156534, "learning_rate": 1.369850329839138e-06, "logits/chosen": -0.838205873966217, "logits/rejected": -0.8520830273628235, "logps/chosen": -349.7906188964844, "logps/rejected": -575.4569702148438, "loss": 0.3721, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9320447444915771, "rewards/margins": 1.8971960544586182, "rewards/rejected": -2.829240560531616, "step": 515 }, { "epoch": 0.44, "grad_norm": 63.54087433833503, "learning_rate": 1.3670661905886216e-06, "logits/chosen": -0.966612696647644, "logits/rejected": -0.8654012680053711, "logps/chosen": -421.6285095214844, "logps/rejected": -474.3664245605469, "loss": 0.505, "rewards/accuracies": 0.625, "rewards/chosen": -0.8842155933380127, "rewards/margins": 0.9205365180969238, "rewards/rejected": -1.804752230644226, "step": 516 }, { "epoch": 0.44, "grad_norm": 81.66997925349206, "learning_rate": 1.3642787590879323e-06, "logits/chosen": -0.9743420481681824, "logits/rejected": -0.8784420490264893, "logps/chosen": -709.7653198242188, "logps/rejected": -782.1478271484375, "loss": 0.375, "rewards/accuracies": 0.875, "rewards/chosen": -1.5563608407974243, "rewards/margins": 1.992991328239441, "rewards/rejected": -3.5493521690368652, "step": 517 }, { "epoch": 0.44, "grad_norm": 35.212048034707685, "learning_rate": 1.361488060337798e-06, "logits/chosen": -0.8872784972190857, "logits/rejected": -0.8517801761627197, "logps/chosen": -471.6920471191406, "logps/rejected": -743.310546875, "loss": 0.2124, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7769272327423096, "rewards/margins": 2.494457483291626, "rewards/rejected": -3.2713847160339355, "step": 518 }, { "epoch": 0.45, "grad_norm": 71.4255282290973, "learning_rate": 1.3586941193682505e-06, "logits/chosen": -0.9100337028503418, "logits/rejected": -0.8517925143241882, "logps/chosen": -512.8143920898438, "logps/rejected": -625.2344970703125, "loss": 0.4623, "rewards/accuracies": 0.9375, "rewards/chosen": -1.429246425628662, "rewards/margins": 1.5222432613372803, "rewards/rejected": -2.9514899253845215, "step": 519 }, { "epoch": 0.45, "grad_norm": 67.98416347949309, "learning_rate": 1.3558969612384007e-06, "logits/chosen": -0.9615705013275146, "logits/rejected": -0.9132786989212036, "logps/chosen": -456.4330749511719, "logps/rejected": -483.18768310546875, "loss": 0.4794, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8184623718261719, "rewards/margins": 1.3500648736953735, "rewards/rejected": -2.168527126312256, "step": 520 }, { "epoch": 0.45, "grad_norm": 61.255057172371146, "learning_rate": 1.3530966110362163e-06, "logits/chosen": -0.8489855527877808, "logits/rejected": -0.82160484790802, "logps/chosen": -631.4532470703125, "logps/rejected": -705.7227783203125, "loss": 0.3343, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1591343879699707, "rewards/margins": 2.0831398963928223, "rewards/rejected": -3.242274284362793, "step": 521 }, { "epoch": 0.45, "grad_norm": 33.594536176516364, "learning_rate": 1.3502930938782934e-06, "logits/chosen": -0.9394704103469849, "logits/rejected": -0.8963195085525513, "logps/chosen": -618.7100830078125, "logps/rejected": -790.1339721679688, "loss": 0.2045, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8805956244468689, "rewards/margins": 2.06959867477417, "rewards/rejected": -2.9501941204071045, "step": 522 }, { "epoch": 0.45, "grad_norm": 74.7822911987875, "learning_rate": 1.3474864349096333e-06, "logits/chosen": -0.8570155501365662, "logits/rejected": -0.826379656791687, "logps/chosen": -676.6806640625, "logps/rejected": -821.4932861328125, "loss": 0.403, "rewards/accuracies": 0.8125, "rewards/chosen": -1.221854329109192, "rewards/margins": 1.7235009670257568, "rewards/rejected": -2.9453554153442383, "step": 523 }, { "epoch": 0.45, "grad_norm": 60.51652877902085, "learning_rate": 1.3446766593034167e-06, "logits/chosen": -0.9486373662948608, "logits/rejected": -0.8681553602218628, "logps/chosen": -627.4612426757812, "logps/rejected": -750.2420654296875, "loss": 0.2923, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9254991412162781, "rewards/margins": 1.9712306261062622, "rewards/rejected": -2.8967297077178955, "step": 524 }, { "epoch": 0.45, "grad_norm": 66.00914028262723, "learning_rate": 1.3418637922607768e-06, "logits/chosen": -0.8991914987564087, "logits/rejected": -0.8221247792243958, "logps/chosen": -432.67388916015625, "logps/rejected": -493.1002197265625, "loss": 0.4838, "rewards/accuracies": 0.8125, "rewards/chosen": -0.11370259523391724, "rewards/margins": 1.1485422849655151, "rewards/rejected": -1.2622448205947876, "step": 525 }, { "epoch": 0.45, "grad_norm": 95.95420441142826, "learning_rate": 1.3390478590105761e-06, "logits/chosen": -0.779647946357727, "logits/rejected": -0.7370286583900452, "logps/chosen": -520.8436279296875, "logps/rejected": -618.977294921875, "loss": 0.5084, "rewards/accuracies": 0.75, "rewards/chosen": -0.4177210330963135, "rewards/margins": 0.7659667730331421, "rewards/rejected": -1.183687686920166, "step": 526 }, { "epoch": 0.45, "grad_norm": 117.29601610029175, "learning_rate": 1.3362288848091763e-06, "logits/chosen": -0.7529869079589844, "logits/rejected": -0.7606877088546753, "logps/chosen": -380.1894836425781, "logps/rejected": -729.3990478515625, "loss": 0.4763, "rewards/accuracies": 0.75, "rewards/chosen": -0.15248233079910278, "rewards/margins": 1.381932258605957, "rewards/rejected": -1.534414529800415, "step": 527 }, { "epoch": 0.45, "grad_norm": 82.17873398853364, "learning_rate": 1.333406894940214e-06, "logits/chosen": -0.8722752332687378, "logits/rejected": -0.8514535427093506, "logps/chosen": -476.3056335449219, "logps/rejected": -726.4517822265625, "loss": 0.4771, "rewards/accuracies": 0.75, "rewards/chosen": -0.8900383710861206, "rewards/margins": 1.644824743270874, "rewards/rejected": -2.534862995147705, "step": 528 }, { "epoch": 0.45, "grad_norm": 34.13497579443595, "learning_rate": 1.3305819147143747e-06, "logits/chosen": -0.9461549520492554, "logits/rejected": -0.910599946975708, "logps/chosen": -434.37384033203125, "logps/rejected": -686.114990234375, "loss": 0.2242, "rewards/accuracies": 1.0, "rewards/chosen": -0.6752679347991943, "rewards/margins": 2.158499240875244, "rewards/rejected": -2.8337674140930176, "step": 529 }, { "epoch": 0.45, "grad_norm": 79.10388212463529, "learning_rate": 1.3277539694691635e-06, "logits/chosen": -0.9313993453979492, "logits/rejected": -0.8736569881439209, "logps/chosen": -559.302978515625, "logps/rejected": -660.0396728515625, "loss": 0.4475, "rewards/accuracies": 0.875, "rewards/chosen": -0.9553658962249756, "rewards/margins": 1.5288103818893433, "rewards/rejected": -2.4841761589050293, "step": 530 }, { "epoch": 0.46, "grad_norm": 35.47979017711729, "learning_rate": 1.3249230845686796e-06, "logits/chosen": -0.9311937093734741, "logits/rejected": -0.9165546298027039, "logps/chosen": -280.443359375, "logps/rejected": -392.58966064453125, "loss": 0.2719, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5209568738937378, "rewards/margins": 1.7553080320358276, "rewards/rejected": -2.2762649059295654, "step": 531 }, { "epoch": 0.46, "grad_norm": 32.579500006278366, "learning_rate": 1.322089285403388e-06, "logits/chosen": -0.9264934062957764, "logits/rejected": -0.8829073905944824, "logps/chosen": -457.67913818359375, "logps/rejected": -540.3162841796875, "loss": 0.2518, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8536669015884399, "rewards/margins": 1.865778923034668, "rewards/rejected": -2.7194457054138184, "step": 532 }, { "epoch": 0.46, "grad_norm": 47.65394221635163, "learning_rate": 1.3192525973898921e-06, "logits/chosen": -0.9628214240074158, "logits/rejected": -0.9117701649665833, "logps/chosen": -472.184326171875, "logps/rejected": -635.360107421875, "loss": 0.2958, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7624552249908447, "rewards/margins": 2.0999326705932617, "rewards/rejected": -2.8623881340026855, "step": 533 }, { "epoch": 0.46, "grad_norm": 55.1554666550412, "learning_rate": 1.3164130459707057e-06, "logits/chosen": -0.9542844295501709, "logits/rejected": -0.9096299409866333, "logps/chosen": -614.8569946289062, "logps/rejected": -683.0138549804688, "loss": 0.3101, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8923014998435974, "rewards/margins": 2.132594108581543, "rewards/rejected": -3.024895668029785, "step": 534 }, { "epoch": 0.46, "grad_norm": 78.2854978701924, "learning_rate": 1.313570656614025e-06, "logits/chosen": -0.9504913091659546, "logits/rejected": -0.937301516532898, "logps/chosen": -439.90118408203125, "logps/rejected": -611.697998046875, "loss": 0.4856, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3660770654678345, "rewards/margins": 1.6826179027557373, "rewards/rejected": -3.0486950874328613, "step": 535 }, { "epoch": 0.46, "grad_norm": 60.26173610703018, "learning_rate": 1.310725454813499e-06, "logits/chosen": -0.9467581510543823, "logits/rejected": -0.9146875143051147, "logps/chosen": -322.9682312011719, "logps/rejected": -368.23681640625, "loss": 0.5115, "rewards/accuracies": 0.75, "rewards/chosen": -0.7642669677734375, "rewards/margins": 1.2350019216537476, "rewards/rejected": -1.9992687702178955, "step": 536 }, { "epoch": 0.46, "grad_norm": 42.04846751083289, "learning_rate": 1.3078774660880031e-06, "logits/chosen": -0.9183229207992554, "logits/rejected": -0.915323793888092, "logps/chosen": -342.38861083984375, "logps/rejected": -661.6390991210938, "loss": 0.2917, "rewards/accuracies": 0.75, "rewards/chosen": -0.5758622884750366, "rewards/margins": 2.632599115371704, "rewards/rejected": -3.208461284637451, "step": 537 }, { "epoch": 0.46, "grad_norm": 95.46446791152454, "learning_rate": 1.3050267159814078e-06, "logits/chosen": -0.904394268989563, "logits/rejected": -0.8909515142440796, "logps/chosen": -494.231689453125, "logps/rejected": -684.6685791015625, "loss": 0.5835, "rewards/accuracies": 0.75, "rewards/chosen": -1.316034197807312, "rewards/margins": 1.488415002822876, "rewards/rejected": -2.8044490814208984, "step": 538 }, { "epoch": 0.46, "grad_norm": 37.14479971677457, "learning_rate": 1.3021732300623506e-06, "logits/chosen": -0.9366617202758789, "logits/rejected": -0.8918839693069458, "logps/chosen": -384.676513671875, "logps/rejected": -581.505126953125, "loss": 0.189, "rewards/accuracies": 1.0, "rewards/chosen": -0.5559818148612976, "rewards/margins": 2.3850436210632324, "rewards/rejected": -2.941025495529175, "step": 539 }, { "epoch": 0.46, "grad_norm": 55.17905263819993, "learning_rate": 1.299317033924008e-06, "logits/chosen": -0.8729772567749023, "logits/rejected": -0.8713866472244263, "logps/chosen": -436.38037109375, "logps/rejected": -499.4176330566406, "loss": 0.3997, "rewards/accuracies": 0.75, "rewards/chosen": -1.184898853302002, "rewards/margins": 1.184369444847107, "rewards/rejected": -2.3692681789398193, "step": 540 }, { "epoch": 0.46, "grad_norm": 76.69953904332482, "learning_rate": 1.2964581531838635e-06, "logits/chosen": -1.016025424003601, "logits/rejected": -0.9276155233383179, "logps/chosen": -535.6536865234375, "logps/rejected": -646.4261474609375, "loss": 0.4855, "rewards/accuracies": 0.75, "rewards/chosen": -1.5077444314956665, "rewards/margins": 1.6514971256256104, "rewards/rejected": -3.1592416763305664, "step": 541 }, { "epoch": 0.46, "grad_norm": 51.71547000713668, "learning_rate": 1.2935966134834795e-06, "logits/chosen": -0.978553295135498, "logits/rejected": -0.9328474998474121, "logps/chosen": -449.3985290527344, "logps/rejected": -534.5638427734375, "loss": 0.3566, "rewards/accuracies": 0.8125, "rewards/chosen": -0.960557222366333, "rewards/margins": 1.6971855163574219, "rewards/rejected": -2.657742738723755, "step": 542 }, { "epoch": 0.47, "grad_norm": 48.59844761404303, "learning_rate": 1.290732440488267e-06, "logits/chosen": -0.9164152145385742, "logits/rejected": -0.8484071493148804, "logps/chosen": -594.1298828125, "logps/rejected": -669.592041015625, "loss": 0.2708, "rewards/accuracies": 0.875, "rewards/chosen": -1.024033784866333, "rewards/margins": 1.7735604047775269, "rewards/rejected": -2.7975940704345703, "step": 543 }, { "epoch": 0.47, "grad_norm": 43.09196422252919, "learning_rate": 1.2878656598872546e-06, "logits/chosen": -0.9126098155975342, "logits/rejected": -0.8721531629562378, "logps/chosen": -389.77130126953125, "logps/rejected": -440.10699462890625, "loss": 0.3755, "rewards/accuracies": 0.875, "rewards/chosen": -0.8628970384597778, "rewards/margins": 1.5729416608810425, "rewards/rejected": -2.4358386993408203, "step": 544 }, { "epoch": 0.47, "grad_norm": 51.58322967373032, "learning_rate": 1.2849962973928596e-06, "logits/chosen": -0.9882091879844666, "logits/rejected": -0.9401744604110718, "logps/chosen": -444.46014404296875, "logps/rejected": -576.692138671875, "loss": 0.2728, "rewards/accuracies": 0.875, "rewards/chosen": -1.189112901687622, "rewards/margins": 1.7125802040100098, "rewards/rejected": -2.901693344116211, "step": 545 }, { "epoch": 0.47, "grad_norm": 40.06366735276771, "learning_rate": 1.282124378740656e-06, "logits/chosen": -0.9059635400772095, "logits/rejected": -0.8775876760482788, "logps/chosen": -593.1556396484375, "logps/rejected": -672.8941650390625, "loss": 0.178, "rewards/accuracies": 1.0, "rewards/chosen": -0.5856313705444336, "rewards/margins": 2.3719394207000732, "rewards/rejected": -2.9575705528259277, "step": 546 }, { "epoch": 0.47, "grad_norm": 64.91149921888086, "learning_rate": 1.2792499296891447e-06, "logits/chosen": -0.8570492267608643, "logits/rejected": -0.8381605744361877, "logps/chosen": -486.0616455078125, "logps/rejected": -580.86328125, "loss": 0.2649, "rewards/accuracies": 0.875, "rewards/chosen": -1.176407814025879, "rewards/margins": 1.77084481716156, "rewards/rejected": -2.9472527503967285, "step": 547 }, { "epoch": 0.47, "grad_norm": 81.59847334724569, "learning_rate": 1.276372976019521e-06, "logits/chosen": -0.9134464263916016, "logits/rejected": -0.9017162322998047, "logps/chosen": -449.33380126953125, "logps/rejected": -634.1409912109375, "loss": 0.5316, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0740644931793213, "rewards/margins": 1.6356202363967896, "rewards/rejected": -2.7096848487854004, "step": 548 }, { "epoch": 0.47, "grad_norm": 74.91599389267594, "learning_rate": 1.2734935435354455e-06, "logits/chosen": -0.8812904953956604, "logits/rejected": -0.8673186302185059, "logps/chosen": -427.50567626953125, "logps/rejected": -558.2020263671875, "loss": 0.487, "rewards/accuracies": 0.75, "rewards/chosen": -0.8359134197235107, "rewards/margins": 1.2576205730438232, "rewards/rejected": -2.093533992767334, "step": 549 }, { "epoch": 0.47, "grad_norm": 58.499944184001166, "learning_rate": 1.270611658062811e-06, "logits/chosen": -0.9335320591926575, "logits/rejected": -0.9049162864685059, "logps/chosen": -519.2843627929688, "logps/rejected": -713.3898315429688, "loss": 0.4237, "rewards/accuracies": 0.75, "rewards/chosen": -1.122235655784607, "rewards/margins": 2.171722888946533, "rewards/rejected": -3.2939586639404297, "step": 550 }, { "epoch": 0.47, "grad_norm": 47.10898356331998, "learning_rate": 1.2677273454495112e-06, "logits/chosen": -0.8580210208892822, "logits/rejected": -0.8268457651138306, "logps/chosen": -403.48651123046875, "logps/rejected": -560.0176391601562, "loss": 0.3254, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9335359334945679, "rewards/margins": 1.762251853942871, "rewards/rejected": -2.6957879066467285, "step": 551 }, { "epoch": 0.47, "grad_norm": 103.34165034219963, "learning_rate": 1.2648406315652088e-06, "logits/chosen": -0.8921483755111694, "logits/rejected": -0.8262415528297424, "logps/chosen": -655.8408203125, "logps/rejected": -681.6361694335938, "loss": 0.6242, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9099090099334717, "rewards/margins": 1.288282871246338, "rewards/rejected": -2.1981921195983887, "step": 552 }, { "epoch": 0.47, "grad_norm": 30.0351296907791, "learning_rate": 1.2619515423011055e-06, "logits/chosen": -0.9337635040283203, "logits/rejected": -0.8803566694259644, "logps/chosen": -728.0569458007812, "logps/rejected": -904.9400634765625, "loss": 0.1531, "rewards/accuracies": 1.0, "rewards/chosen": -0.9571795463562012, "rewards/margins": 2.3777036666870117, "rewards/rejected": -3.334883213043213, "step": 553 }, { "epoch": 0.48, "grad_norm": 44.81285970122842, "learning_rate": 1.2590601035697054e-06, "logits/chosen": -0.9095577001571655, "logits/rejected": -0.8822007179260254, "logps/chosen": -641.730224609375, "logps/rejected": -791.3433837890625, "loss": 0.1613, "rewards/accuracies": 1.0, "rewards/chosen": -0.8262911438941956, "rewards/margins": 2.1680355072021484, "rewards/rejected": -2.994326591491699, "step": 554 }, { "epoch": 0.48, "grad_norm": 45.72398867570701, "learning_rate": 1.2561663413045868e-06, "logits/chosen": -0.9518017768859863, "logits/rejected": -0.8841419219970703, "logps/chosen": -622.923095703125, "logps/rejected": -859.262451171875, "loss": 0.1866, "rewards/accuracies": 0.875, "rewards/chosen": -0.8729736804962158, "rewards/margins": 2.8778254985809326, "rewards/rejected": -3.7507991790771484, "step": 555 }, { "epoch": 0.48, "grad_norm": 46.514213166710974, "learning_rate": 1.2532702814601678e-06, "logits/chosen": -0.9096502065658569, "logits/rejected": -0.8895667791366577, "logps/chosen": -670.5291748046875, "logps/rejected": -916.8817138671875, "loss": 0.1813, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9845317602157593, "rewards/margins": 2.594719648361206, "rewards/rejected": -3.579251289367676, "step": 556 }, { "epoch": 0.48, "grad_norm": 29.579282897800084, "learning_rate": 1.2503719500114733e-06, "logits/chosen": -0.9815787076950073, "logits/rejected": -0.9444399476051331, "logps/chosen": -427.94683837890625, "logps/rejected": -708.6376953125, "loss": 0.2072, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5375573039054871, "rewards/margins": 2.491065740585327, "rewards/rejected": -3.02862286567688, "step": 557 }, { "epoch": 0.48, "grad_norm": 61.526489949118854, "learning_rate": 1.2474713729539033e-06, "logits/chosen": -1.0085158348083496, "logits/rejected": -0.9338014125823975, "logps/chosen": -676.2670288085938, "logps/rejected": -752.7526245117188, "loss": 0.2631, "rewards/accuracies": 0.875, "rewards/chosen": -1.2552642822265625, "rewards/margins": 2.4462804794311523, "rewards/rejected": -3.701545238494873, "step": 558 }, { "epoch": 0.48, "grad_norm": 52.45376747069775, "learning_rate": 1.2445685763029969e-06, "logits/chosen": -0.9655488133430481, "logits/rejected": -0.948523759841919, "logps/chosen": -379.6853942871094, "logps/rejected": -659.249755859375, "loss": 0.3075, "rewards/accuracies": 0.875, "rewards/chosen": -0.8615530729293823, "rewards/margins": 2.0184621810913086, "rewards/rejected": -2.8800153732299805, "step": 559 }, { "epoch": 0.48, "grad_norm": 66.74324899476902, "learning_rate": 1.2416635860942034e-06, "logits/chosen": -0.97737717628479, "logits/rejected": -0.9480463266372681, "logps/chosen": -421.3519287109375, "logps/rejected": -556.2459106445312, "loss": 0.4132, "rewards/accuracies": 0.75, "rewards/chosen": -1.0725280046463013, "rewards/margins": 1.3633832931518555, "rewards/rejected": -2.435911178588867, "step": 560 }, { "epoch": 0.48, "grad_norm": 73.63127069033355, "learning_rate": 1.238756428382645e-06, "logits/chosen": -0.9148063659667969, "logits/rejected": -0.9012689590454102, "logps/chosen": -466.36785888671875, "logps/rejected": -678.955810546875, "loss": 0.3981, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2128989696502686, "rewards/margins": 1.9558484554290771, "rewards/rejected": -3.1687474250793457, "step": 561 }, { "epoch": 0.48, "grad_norm": 92.09052046516862, "learning_rate": 1.2358471292428842e-06, "logits/chosen": -0.9906730055809021, "logits/rejected": -0.9136836528778076, "logps/chosen": -727.5438232421875, "logps/rejected": -815.8961181640625, "loss": 0.5894, "rewards/accuracies": 0.75, "rewards/chosen": -1.4386944770812988, "rewards/margins": 1.7987642288208008, "rewards/rejected": -3.2374587059020996, "step": 562 }, { "epoch": 0.48, "grad_norm": 58.5981985039348, "learning_rate": 1.2329357147686907e-06, "logits/chosen": -0.9983447790145874, "logits/rejected": -0.9759050607681274, "logps/chosen": -643.207275390625, "logps/rejected": -822.6072998046875, "loss": 0.3253, "rewards/accuracies": 0.875, "rewards/chosen": -1.18256413936615, "rewards/margins": 2.6189026832580566, "rewards/rejected": -3.801466941833496, "step": 563 }, { "epoch": 0.48, "grad_norm": 79.50938619489895, "learning_rate": 1.230022211072807e-06, "logits/chosen": -0.9487758874893188, "logits/rejected": -0.919625997543335, "logps/chosen": -617.9252319335938, "logps/rejected": -784.6173706054688, "loss": 0.5057, "rewards/accuracies": 0.75, "rewards/chosen": -1.6651418209075928, "rewards/margins": 1.4620158672332764, "rewards/rejected": -3.127157688140869, "step": 564 }, { "epoch": 0.48, "grad_norm": 66.18762390042676, "learning_rate": 1.2271066442867135e-06, "logits/chosen": -0.9428020715713501, "logits/rejected": -0.9230005145072937, "logps/chosen": -454.8497009277344, "logps/rejected": -749.8408813476562, "loss": 0.3256, "rewards/accuracies": 0.875, "rewards/chosen": -0.8824508190155029, "rewards/margins": 2.7411446571350098, "rewards/rejected": -3.6235954761505127, "step": 565 }, { "epoch": 0.49, "grad_norm": 40.83906170657592, "learning_rate": 1.224189040560395e-06, "logits/chosen": -0.9711836576461792, "logits/rejected": -0.9430567026138306, "logps/chosen": -459.6309814453125, "logps/rejected": -692.2510986328125, "loss": 0.2556, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0203781127929688, "rewards/margins": 1.8715031147003174, "rewards/rejected": -2.891881227493286, "step": 566 }, { "epoch": 0.49, "grad_norm": 61.9853897269269, "learning_rate": 1.221269426062105e-06, "logits/chosen": -0.979697048664093, "logits/rejected": -0.9400407671928406, "logps/chosen": -524.4586181640625, "logps/rejected": -583.64453125, "loss": 0.5451, "rewards/accuracies": 0.75, "rewards/chosen": -1.1567959785461426, "rewards/margins": 1.524742603302002, "rewards/rejected": -2.6815383434295654, "step": 567 }, { "epoch": 0.49, "grad_norm": 78.27335936591861, "learning_rate": 1.2183478269781336e-06, "logits/chosen": -0.9543784856796265, "logits/rejected": -0.9156996011734009, "logps/chosen": -548.0863037109375, "logps/rejected": -758.37939453125, "loss": 0.3461, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1840085983276367, "rewards/margins": 2.1264264583587646, "rewards/rejected": -3.3104352951049805, "step": 568 }, { "epoch": 0.49, "grad_norm": 60.75530097259052, "learning_rate": 1.2154242695125692e-06, "logits/chosen": -1.001960039138794, "logits/rejected": -0.9309448003768921, "logps/chosen": -622.9661254882812, "logps/rejected": -733.677734375, "loss": 0.3013, "rewards/accuracies": 0.875, "rewards/chosen": -0.9647570848464966, "rewards/margins": 1.95701265335083, "rewards/rejected": -2.921769618988037, "step": 569 }, { "epoch": 0.49, "grad_norm": 39.07594264943836, "learning_rate": 1.2124987798870652e-06, "logits/chosen": -0.9486143589019775, "logits/rejected": -0.8990459442138672, "logps/chosen": -594.9383544921875, "logps/rejected": -679.9696044921875, "loss": 0.1746, "rewards/accuracies": 1.0, "rewards/chosen": -1.0007832050323486, "rewards/margins": 2.252715826034546, "rewards/rejected": -3.2534990310668945, "step": 570 }, { "epoch": 0.49, "grad_norm": 83.12897907525809, "learning_rate": 1.2095713843406055e-06, "logits/chosen": -1.01059889793396, "logits/rejected": -0.9461950063705444, "logps/chosen": -512.59619140625, "logps/rejected": -616.6376953125, "loss": 0.4345, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1057041883468628, "rewards/margins": 1.77592134475708, "rewards/rejected": -2.8816254138946533, "step": 571 }, { "epoch": 0.49, "grad_norm": 54.641379490857744, "learning_rate": 1.2066421091292678e-06, "logits/chosen": -0.9506186246871948, "logits/rejected": -0.9077310562133789, "logps/chosen": -608.8321533203125, "logps/rejected": -890.667236328125, "loss": 0.2483, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6453957557678223, "rewards/margins": 2.200510025024414, "rewards/rejected": -3.8459057807922363, "step": 572 }, { "epoch": 0.49, "grad_norm": 50.41011389855287, "learning_rate": 1.203710980525989e-06, "logits/chosen": -0.9624886512756348, "logits/rejected": -0.9110809564590454, "logps/chosen": -629.2330932617188, "logps/rejected": -815.7590942382812, "loss": 0.23, "rewards/accuracies": 0.875, "rewards/chosen": -0.8843998908996582, "rewards/margins": 2.8275225162506104, "rewards/rejected": -3.7119226455688477, "step": 573 }, { "epoch": 0.49, "grad_norm": 67.53346213341143, "learning_rate": 1.2007780248203297e-06, "logits/chosen": -0.952025294303894, "logits/rejected": -0.9283043146133423, "logps/chosen": -651.7374267578125, "logps/rejected": -827.676025390625, "loss": 0.3369, "rewards/accuracies": 0.875, "rewards/chosen": -1.4210906028747559, "rewards/margins": 1.837165117263794, "rewards/rejected": -3.25825572013855, "step": 574 }, { "epoch": 0.49, "grad_norm": 71.65141079463194, "learning_rate": 1.1978432683182362e-06, "logits/chosen": -1.015653371810913, "logits/rejected": -0.9584256410598755, "logps/chosen": -567.3726806640625, "logps/rejected": -817.5277709960938, "loss": 0.3354, "rewards/accuracies": 0.875, "rewards/chosen": -1.2098946571350098, "rewards/margins": 2.3213119506835938, "rewards/rejected": -3.5312068462371826, "step": 575 }, { "epoch": 0.49, "grad_norm": 38.6052770456671, "learning_rate": 1.1949067373418083e-06, "logits/chosen": -0.9470927715301514, "logits/rejected": -0.9256807565689087, "logps/chosen": -437.8433837890625, "logps/rejected": -804.2654418945312, "loss": 0.21, "rewards/accuracies": 0.9375, "rewards/chosen": -0.90472811460495, "rewards/margins": 3.095479965209961, "rewards/rejected": -4.000207901000977, "step": 576 }, { "epoch": 0.49, "grad_norm": 46.437618961672996, "learning_rate": 1.1919684582290603e-06, "logits/chosen": -0.9666635990142822, "logits/rejected": -0.9787067770957947, "logps/chosen": -413.5608825683594, "logps/rejected": -606.1953735351562, "loss": 0.2762, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9771944880485535, "rewards/margins": 2.034590244293213, "rewards/rejected": -3.011784553527832, "step": 577 }, { "epoch": 0.5, "grad_norm": 72.80610670978952, "learning_rate": 1.1890284573336854e-06, "logits/chosen": -0.9922659397125244, "logits/rejected": -0.92891925573349, "logps/chosen": -587.3009033203125, "logps/rejected": -689.83447265625, "loss": 0.2993, "rewards/accuracies": 0.875, "rewards/chosen": -1.1690540313720703, "rewards/margins": 2.1405553817749023, "rewards/rejected": -3.3096094131469727, "step": 578 }, { "epoch": 0.5, "grad_norm": 88.9899177492974, "learning_rate": 1.1860867610248207e-06, "logits/chosen": -1.0009547472000122, "logits/rejected": -0.997244119644165, "logps/chosen": -589.050537109375, "logps/rejected": -714.9618530273438, "loss": 0.7316, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4328802824020386, "rewards/margins": 1.5104745626449585, "rewards/rejected": -2.943354845046997, "step": 579 }, { "epoch": 0.5, "grad_norm": 83.58436085989156, "learning_rate": 1.1831433956868085e-06, "logits/chosen": -0.9802489280700684, "logits/rejected": -0.9745005965232849, "logps/chosen": -440.23333740234375, "logps/rejected": -646.380859375, "loss": 0.5048, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0479545593261719, "rewards/margins": 1.576943039894104, "rewards/rejected": -2.6248974800109863, "step": 580 }, { "epoch": 0.5, "grad_norm": 103.96994001693628, "learning_rate": 1.180198387718961e-06, "logits/chosen": -1.0007036924362183, "logits/rejected": -0.9575830698013306, "logps/chosen": -675.14990234375, "logps/rejected": -838.3320922851562, "loss": 0.753, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6041653156280518, "rewards/margins": 2.025559663772583, "rewards/rejected": -3.6297249794006348, "step": 581 }, { "epoch": 0.5, "grad_norm": 43.837917171955105, "learning_rate": 1.1772517635353242e-06, "logits/chosen": -1.0432533025741577, "logits/rejected": -0.980958104133606, "logps/chosen": -575.733154296875, "logps/rejected": -836.3759155273438, "loss": 0.2372, "rewards/accuracies": 0.875, "rewards/chosen": -0.785294771194458, "rewards/margins": 2.858860492706299, "rewards/rejected": -3.6441550254821777, "step": 582 }, { "epoch": 0.5, "grad_norm": 26.165545034747257, "learning_rate": 1.1743035495644384e-06, "logits/chosen": -0.9434425830841064, "logits/rejected": -0.9176989793777466, "logps/chosen": -377.6453552246094, "logps/rejected": -487.42156982421875, "loss": 0.2029, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3606542646884918, "rewards/margins": 2.1150460243225098, "rewards/rejected": -2.4756999015808105, "step": 583 }, { "epoch": 0.5, "grad_norm": 27.433102681077315, "learning_rate": 1.171353772249105e-06, "logits/chosen": -0.9990615844726562, "logits/rejected": -0.9651137590408325, "logps/chosen": -463.429443359375, "logps/rejected": -799.1802978515625, "loss": 0.1036, "rewards/accuracies": 1.0, "rewards/chosen": -0.7478154897689819, "rewards/margins": 3.271451473236084, "rewards/rejected": -4.0192670822143555, "step": 584 }, { "epoch": 0.5, "grad_norm": 54.56104954596308, "learning_rate": 1.1684024580461454e-06, "logits/chosen": -0.9565891027450562, "logits/rejected": -0.9538889527320862, "logps/chosen": -416.2050476074219, "logps/rejected": -626.102294921875, "loss": 0.298, "rewards/accuracies": 0.875, "rewards/chosen": -0.821914553642273, "rewards/margins": 2.1759161949157715, "rewards/rejected": -2.997830629348755, "step": 585 }, { "epoch": 0.5, "grad_norm": 46.90624235741009, "learning_rate": 1.1654496334261658e-06, "logits/chosen": -0.96038818359375, "logits/rejected": -0.965350329875946, "logps/chosen": -407.58795166015625, "logps/rejected": -632.771728515625, "loss": 0.2783, "rewards/accuracies": 0.875, "rewards/chosen": -1.224656343460083, "rewards/margins": 2.23311185836792, "rewards/rejected": -3.457768201828003, "step": 586 }, { "epoch": 0.5, "grad_norm": 44.85395647703205, "learning_rate": 1.1624953248733203e-06, "logits/chosen": -1.0106650590896606, "logits/rejected": -0.963015079498291, "logps/chosen": -562.22412109375, "logps/rejected": -734.1881103515625, "loss": 0.2537, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0141054391860962, "rewards/margins": 2.6984009742736816, "rewards/rejected": -3.7125065326690674, "step": 587 }, { "epoch": 0.5, "grad_norm": 26.980171055047652, "learning_rate": 1.1595395588850717e-06, "logits/chosen": -0.9881083965301514, "logits/rejected": -0.9700764417648315, "logps/chosen": -417.80352783203125, "logps/rejected": -763.685546875, "loss": 0.1568, "rewards/accuracies": 0.875, "rewards/chosen": -1.0090665817260742, "rewards/margins": 2.765437602996826, "rewards/rejected": -3.7745041847229004, "step": 588 }, { "epoch": 0.51, "grad_norm": 47.795237191457275, "learning_rate": 1.1565823619719554e-06, "logits/chosen": -0.9474834203720093, "logits/rejected": -0.9007887840270996, "logps/chosen": -632.9498291015625, "logps/rejected": -781.4476318359375, "loss": 0.2604, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4948418140411377, "rewards/margins": 2.670294761657715, "rewards/rejected": -4.165136337280273, "step": 589 }, { "epoch": 0.51, "grad_norm": 65.7744725890404, "learning_rate": 1.1536237606573404e-06, "logits/chosen": -0.9904187917709351, "logits/rejected": -0.9322486519813538, "logps/chosen": -599.295166015625, "logps/rejected": -696.2531127929688, "loss": 0.4085, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5187959671020508, "rewards/margins": 1.557621955871582, "rewards/rejected": -3.076417922973633, "step": 590 }, { "epoch": 0.51, "grad_norm": 61.573912975092064, "learning_rate": 1.1506637814771913e-06, "logits/chosen": -0.9480462074279785, "logits/rejected": -0.9528161287307739, "logps/chosen": -451.1434326171875, "logps/rejected": -786.1947021484375, "loss": 0.2855, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9499127864837646, "rewards/margins": 2.757876396179199, "rewards/rejected": -3.7077889442443848, "step": 591 }, { "epoch": 0.51, "grad_norm": 48.66734390755508, "learning_rate": 1.1477024509798325e-06, "logits/chosen": -0.9910733699798584, "logits/rejected": -0.9133542776107788, "logps/chosen": -637.9290771484375, "logps/rejected": -939.7032470703125, "loss": 0.1984, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0765866041183472, "rewards/margins": 2.9725711345672607, "rewards/rejected": -4.049158096313477, "step": 592 }, { "epoch": 0.51, "grad_norm": 50.76606314718852, "learning_rate": 1.144739795725707e-06, "logits/chosen": -1.021897315979004, "logits/rejected": -0.9521700143814087, "logps/chosen": -539.6085205078125, "logps/rejected": -684.051025390625, "loss": 0.2097, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9156203269958496, "rewards/margins": 2.4041714668273926, "rewards/rejected": -3.319791793823242, "step": 593 }, { "epoch": 0.51, "grad_norm": 75.33018115295012, "learning_rate": 1.1417758422871404e-06, "logits/chosen": -0.9927308559417725, "logits/rejected": -0.9772903919219971, "logps/chosen": -481.47491455078125, "logps/rejected": -655.3239135742188, "loss": 0.505, "rewards/accuracies": 0.875, "rewards/chosen": -1.4134584665298462, "rewards/margins": 1.5445600748062134, "rewards/rejected": -2.9580185413360596, "step": 594 }, { "epoch": 0.51, "grad_norm": 79.49013356730272, "learning_rate": 1.1388106172481015e-06, "logits/chosen": -0.976418137550354, "logits/rejected": -0.9609758853912354, "logps/chosen": -582.8574829101562, "logps/rejected": -749.47119140625, "loss": 0.4163, "rewards/accuracies": 0.8125, "rewards/chosen": -1.068327784538269, "rewards/margins": 1.7612555027008057, "rewards/rejected": -2.829583168029785, "step": 595 }, { "epoch": 0.51, "grad_norm": 62.84741892276459, "learning_rate": 1.1358441472039646e-06, "logits/chosen": -0.9960223436355591, "logits/rejected": -0.976997971534729, "logps/chosen": -447.45001220703125, "logps/rejected": -551.7108764648438, "loss": 0.539, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1301624774932861, "rewards/margins": 1.4154155254364014, "rewards/rejected": -2.5455780029296875, "step": 596 }, { "epoch": 0.51, "grad_norm": 42.049801312127514, "learning_rate": 1.1328764587612702e-06, "logits/chosen": -1.0407339334487915, "logits/rejected": -0.9951431751251221, "logps/chosen": -420.0325927734375, "logps/rejected": -700.8817749023438, "loss": 0.2579, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9708338379859924, "rewards/margins": 2.6650800704956055, "rewards/rejected": -3.635913610458374, "step": 597 }, { "epoch": 0.51, "grad_norm": 66.43047811530603, "learning_rate": 1.1299075785374874e-06, "logits/chosen": -0.979002833366394, "logits/rejected": -0.9451375603675842, "logps/chosen": -594.0153198242188, "logps/rejected": -647.1163330078125, "loss": 0.4018, "rewards/accuracies": 0.75, "rewards/chosen": -1.270151138305664, "rewards/margins": 1.5419825315475464, "rewards/rejected": -2.812133550643921, "step": 598 }, { "epoch": 0.51, "grad_norm": 39.929678506324095, "learning_rate": 1.1269375331607726e-06, "logits/chosen": -1.0492507219314575, "logits/rejected": -0.9841207265853882, "logps/chosen": -607.1595458984375, "logps/rejected": -821.344970703125, "loss": 0.146, "rewards/accuracies": 1.0, "rewards/chosen": -0.8867940902709961, "rewards/margins": 2.6768949031829834, "rewards/rejected": -3.5636887550354004, "step": 599 }, { "epoch": 0.51, "grad_norm": 63.07068369742342, "learning_rate": 1.1239663492697355e-06, "logits/chosen": -1.0737665891647339, "logits/rejected": -0.991007924079895, "logps/chosen": -682.14501953125, "logps/rejected": -820.7092895507812, "loss": 0.2867, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1276676654815674, "rewards/margins": 2.4414119720458984, "rewards/rejected": -3.5690793991088867, "step": 600 }, { "epoch": 0.52, "grad_norm": 40.28793200844207, "learning_rate": 1.1209940535131947e-06, "logits/chosen": -1.0836031436920166, "logits/rejected": -1.001068353652954, "logps/chosen": -421.8529968261719, "logps/rejected": -553.0750122070312, "loss": 0.2355, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8329640030860901, "rewards/margins": 1.9275306463241577, "rewards/rejected": -2.7604947090148926, "step": 601 }, { "epoch": 0.52, "grad_norm": 78.6354473877312, "learning_rate": 1.1180206725499424e-06, "logits/chosen": -0.9799869060516357, "logits/rejected": -0.9672070741653442, "logps/chosen": -391.574951171875, "logps/rejected": -675.866455078125, "loss": 0.4875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9515032768249512, "rewards/margins": 2.0327067375183105, "rewards/rejected": -2.9842100143432617, "step": 602 }, { "epoch": 0.52, "grad_norm": 51.203160420320835, "learning_rate": 1.115046233048504e-06, "logits/chosen": -0.9612528085708618, "logits/rejected": -0.923812210559845, "logps/chosen": -569.8873291015625, "logps/rejected": -666.525390625, "loss": 0.2289, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5378499031066895, "rewards/margins": 1.9845186471939087, "rewards/rejected": -2.5223684310913086, "step": 603 }, { "epoch": 0.52, "grad_norm": 50.967899550632275, "learning_rate": 1.1120707616868987e-06, "logits/chosen": -0.948002278804779, "logits/rejected": -0.9246044158935547, "logps/chosen": -543.124755859375, "logps/rejected": -738.28515625, "loss": 0.3327, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8030984401702881, "rewards/margins": 1.7016018629074097, "rewards/rejected": -2.504700183868408, "step": 604 }, { "epoch": 0.52, "grad_norm": 43.06619559392241, "learning_rate": 1.1090942851524012e-06, "logits/chosen": -1.0343921184539795, "logits/rejected": -0.9407830834388733, "logps/chosen": -632.280029296875, "logps/rejected": -797.356689453125, "loss": 0.1989, "rewards/accuracies": 1.0, "rewards/chosen": -0.6318868398666382, "rewards/margins": 2.2190303802490234, "rewards/rejected": -2.850917339324951, "step": 605 }, { "epoch": 0.52, "grad_norm": 43.55443697785336, "learning_rate": 1.106116830141301e-06, "logits/chosen": -0.9716185331344604, "logits/rejected": -0.925656795501709, "logps/chosen": -493.6955871582031, "logps/rejected": -678.1077880859375, "loss": 0.1976, "rewards/accuracies": 1.0, "rewards/chosen": -0.7711277008056641, "rewards/margins": 2.2040441036224365, "rewards/rejected": -2.9751715660095215, "step": 606 }, { "epoch": 0.52, "grad_norm": 53.70680892666797, "learning_rate": 1.1031384233586632e-06, "logits/chosen": -1.0033338069915771, "logits/rejected": -0.9584630727767944, "logps/chosen": -653.1076049804688, "logps/rejected": -841.616943359375, "loss": 0.2092, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6441432237625122, "rewards/margins": 2.346440553665161, "rewards/rejected": -2.990583658218384, "step": 607 }, { "epoch": 0.52, "grad_norm": 63.030835220246225, "learning_rate": 1.1001590915180915e-06, "logits/chosen": -0.9717227220535278, "logits/rejected": -0.9596288204193115, "logps/chosen": -476.002197265625, "logps/rejected": -581.9302978515625, "loss": 0.3562, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0460882186889648, "rewards/margins": 1.1150057315826416, "rewards/rejected": -2.1610941886901855, "step": 608 }, { "epoch": 0.52, "grad_norm": 32.55524769432412, "learning_rate": 1.0971788613414842e-06, "logits/chosen": -1.0176827907562256, "logits/rejected": -0.9593334197998047, "logps/chosen": -490.81640625, "logps/rejected": -721.6340942382812, "loss": 0.1971, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3526301980018616, "rewards/margins": 2.5123186111450195, "rewards/rejected": -2.8649487495422363, "step": 609 }, { "epoch": 0.52, "grad_norm": 52.07686220232358, "learning_rate": 1.0941977595587983e-06, "logits/chosen": -0.9811424016952515, "logits/rejected": -0.9464911222457886, "logps/chosen": -440.84222412109375, "logps/rejected": -677.8856201171875, "loss": 0.2756, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5718835592269897, "rewards/margins": 2.2763261795043945, "rewards/rejected": -2.848209857940674, "step": 610 }, { "epoch": 0.52, "grad_norm": 84.38983739427725, "learning_rate": 1.0912158129078074e-06, "logits/chosen": -1.0566349029541016, "logits/rejected": -0.9579128623008728, "logps/chosen": -611.2342529296875, "logps/rejected": -662.5355224609375, "loss": 0.4657, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4705239534378052, "rewards/margins": 1.3576524257659912, "rewards/rejected": -2.828176498413086, "step": 611 }, { "epoch": 0.52, "grad_norm": 95.745219124751, "learning_rate": 1.0882330481338634e-06, "logits/chosen": -1.0057260990142822, "logits/rejected": -0.9815188646316528, "logps/chosen": -624.9097900390625, "logps/rejected": -802.2578125, "loss": 0.5204, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1399705410003662, "rewards/margins": 2.3458216190338135, "rewards/rejected": -3.4857921600341797, "step": 612 }, { "epoch": 0.53, "grad_norm": 35.61612150060107, "learning_rate": 1.0852494919896564e-06, "logits/chosen": -1.04823637008667, "logits/rejected": -0.9574418067932129, "logps/chosen": -450.38531494140625, "logps/rejected": -605.09619140625, "loss": 0.2273, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4428199529647827, "rewards/margins": 2.150094985961914, "rewards/rejected": -2.5929148197174072, "step": 613 }, { "epoch": 0.53, "grad_norm": 88.8221121726083, "learning_rate": 1.0822651712349728e-06, "logits/chosen": -0.9696275591850281, "logits/rejected": -0.9406725168228149, "logps/chosen": -438.52655029296875, "logps/rejected": -477.2586669921875, "loss": 0.6035, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3069283962249756, "rewards/margins": 0.9594730138778687, "rewards/rejected": -2.266401529312134, "step": 614 }, { "epoch": 0.53, "grad_norm": 108.04629258782866, "learning_rate": 1.0792801126364585e-06, "logits/chosen": -0.9994640350341797, "logits/rejected": -0.956566333770752, "logps/chosen": -533.0015258789062, "logps/rejected": -713.0087890625, "loss": 0.5597, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2643516063690186, "rewards/margins": 1.590078592300415, "rewards/rejected": -2.8544301986694336, "step": 615 }, { "epoch": 0.53, "grad_norm": 59.464095147384604, "learning_rate": 1.076294342967377e-06, "logits/chosen": -0.9867278933525085, "logits/rejected": -0.9743552207946777, "logps/chosen": -419.0953063964844, "logps/rejected": -548.52392578125, "loss": 0.3782, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8928730487823486, "rewards/margins": 1.9194822311401367, "rewards/rejected": -2.8123552799224854, "step": 616 }, { "epoch": 0.53, "grad_norm": 56.24052261515072, "learning_rate": 1.0733078890073682e-06, "logits/chosen": -1.029302716255188, "logits/rejected": -0.9855225086212158, "logps/chosen": -538.539794921875, "logps/rejected": -725.5313720703125, "loss": 0.2391, "rewards/accuracies": 1.0, "rewards/chosen": -0.9613665342330933, "rewards/margins": 2.064399480819702, "rewards/rejected": -3.025765895843506, "step": 617 }, { "epoch": 0.53, "grad_norm": 179.2209749040515, "learning_rate": 1.0703207775422106e-06, "logits/chosen": -0.9867770671844482, "logits/rejected": -0.9663810133934021, "logps/chosen": -509.22747802734375, "logps/rejected": -587.5941162109375, "loss": 0.3361, "rewards/accuracies": 0.875, "rewards/chosen": -0.7451854944229126, "rewards/margins": 1.7211875915527344, "rewards/rejected": -2.4663732051849365, "step": 618 }, { "epoch": 0.53, "grad_norm": 27.876934753230678, "learning_rate": 1.0673330353635796e-06, "logits/chosen": -0.9755138158798218, "logits/rejected": -0.9498250484466553, "logps/chosen": -447.90704345703125, "logps/rejected": -960.9149169921875, "loss": 0.158, "rewards/accuracies": 0.875, "rewards/chosen": -0.9585582613945007, "rewards/margins": 3.6578381061553955, "rewards/rejected": -4.616396903991699, "step": 619 }, { "epoch": 0.53, "grad_norm": 48.905850898670266, "learning_rate": 1.0643446892688077e-06, "logits/chosen": -1.0404212474822998, "logits/rejected": -0.9776492714881897, "logps/chosen": -468.32281494140625, "logps/rejected": -645.6575927734375, "loss": 0.3063, "rewards/accuracies": 0.875, "rewards/chosen": -1.310685396194458, "rewards/margins": 1.8421094417572021, "rewards/rejected": -3.15279483795166, "step": 620 }, { "epoch": 0.53, "grad_norm": 79.60473032815341, "learning_rate": 1.0613557660606441e-06, "logits/chosen": -1.0487558841705322, "logits/rejected": -1.0085748434066772, "logps/chosen": -588.3466796875, "logps/rejected": -665.53369140625, "loss": 0.4134, "rewards/accuracies": 0.875, "rewards/chosen": -1.34440279006958, "rewards/margins": 1.925700306892395, "rewards/rejected": -3.2701029777526855, "step": 621 }, { "epoch": 0.53, "grad_norm": 55.38539507823246, "learning_rate": 1.0583662925470126e-06, "logits/chosen": -1.027477502822876, "logits/rejected": -0.9840755462646484, "logps/chosen": -571.076904296875, "logps/rejected": -684.5631103515625, "loss": 0.2579, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2199088335037231, "rewards/margins": 2.1186861991882324, "rewards/rejected": -3.338594913482666, "step": 622 }, { "epoch": 0.53, "grad_norm": 65.12413866371425, "learning_rate": 1.0553762955407757e-06, "logits/chosen": -1.0024276971817017, "logits/rejected": -0.9541710615158081, "logps/chosen": -645.3480834960938, "logps/rejected": -868.0056762695312, "loss": 0.2964, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9870597720146179, "rewards/margins": 1.8057602643966675, "rewards/rejected": -2.7928199768066406, "step": 623 }, { "epoch": 0.54, "grad_norm": 54.89562559971715, "learning_rate": 1.052385801859489e-06, "logits/chosen": -0.9582788944244385, "logits/rejected": -0.9098652005195618, "logps/chosen": -510.34844970703125, "logps/rejected": -772.8673095703125, "loss": 0.2481, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3798649311065674, "rewards/margins": 2.6270527839660645, "rewards/rejected": -4.006917953491211, "step": 624 }, { "epoch": 0.54, "grad_norm": 77.07307778949189, "learning_rate": 1.0493948383251628e-06, "logits/chosen": -0.9984012842178345, "logits/rejected": -1.00254487991333, "logps/chosen": -596.7376708984375, "logps/rejected": -779.016357421875, "loss": 0.349, "rewards/accuracies": 0.875, "rewards/chosen": -1.0308185815811157, "rewards/margins": 2.1803455352783203, "rewards/rejected": -3.2111639976501465, "step": 625 }, { "epoch": 0.54, "grad_norm": 27.688582306282832, "learning_rate": 1.0464034317640226e-06, "logits/chosen": -1.0623116493225098, "logits/rejected": -1.0131748914718628, "logps/chosen": -588.09716796875, "logps/rejected": -838.4576416015625, "loss": 0.185, "rewards/accuracies": 1.0, "rewards/chosen": -0.712467610836029, "rewards/margins": 2.243401050567627, "rewards/rejected": -2.955868721008301, "step": 626 }, { "epoch": 0.54, "grad_norm": 55.18732834418408, "learning_rate": 1.0434116090062663e-06, "logits/chosen": -0.9450533390045166, "logits/rejected": -0.8979382514953613, "logps/chosen": -357.7943115234375, "logps/rejected": -512.4151611328125, "loss": 0.391, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3883112370967865, "rewards/margins": 1.048119306564331, "rewards/rejected": -1.4364304542541504, "step": 627 }, { "epoch": 0.54, "grad_norm": 84.22638818887654, "learning_rate": 1.040419396885826e-06, "logits/chosen": -1.0142240524291992, "logits/rejected": -0.9256457090377808, "logps/chosen": -577.904052734375, "logps/rejected": -668.9786987304688, "loss": 0.4729, "rewards/accuracies": 0.875, "rewards/chosen": -0.595212996006012, "rewards/margins": 1.0791289806365967, "rewards/rejected": -1.6743419170379639, "step": 628 }, { "epoch": 0.54, "grad_norm": 58.71812972569202, "learning_rate": 1.0374268222401257e-06, "logits/chosen": -0.8847454786300659, "logits/rejected": -0.876816987991333, "logps/chosen": -488.02508544921875, "logps/rejected": -507.3223571777344, "loss": 0.3981, "rewards/accuracies": 0.875, "rewards/chosen": -0.2222071886062622, "rewards/margins": 0.9527764320373535, "rewards/rejected": -1.1749836206436157, "step": 629 }, { "epoch": 0.54, "grad_norm": 69.55694695794018, "learning_rate": 1.0344339119098393e-06, "logits/chosen": -0.9340708255767822, "logits/rejected": -0.931158185005188, "logps/chosen": -617.0542602539062, "logps/rejected": -717.384521484375, "loss": 0.3453, "rewards/accuracies": 0.875, "rewards/chosen": -0.5446648597717285, "rewards/margins": 1.2806169986724854, "rewards/rejected": -1.8252819776535034, "step": 630 }, { "epoch": 0.54, "grad_norm": 54.83156555797582, "learning_rate": 1.0314406927386538e-06, "logits/chosen": -0.9774016737937927, "logits/rejected": -0.9222882986068726, "logps/chosen": -410.22833251953125, "logps/rejected": -523.1634521484375, "loss": 0.3641, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6278001070022583, "rewards/margins": 1.2517343759536743, "rewards/rejected": -1.879534363746643, "step": 631 }, { "epoch": 0.54, "grad_norm": 82.00952622693666, "learning_rate": 1.0284471915730251e-06, "logits/chosen": -1.0299391746520996, "logits/rejected": -0.9885897636413574, "logps/chosen": -451.1562805175781, "logps/rejected": -645.5934448242188, "loss": 0.5372, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9116708636283875, "rewards/margins": 1.7287958860397339, "rewards/rejected": -2.6404666900634766, "step": 632 }, { "epoch": 0.54, "grad_norm": 33.30811545108685, "learning_rate": 1.0254534352619379e-06, "logits/chosen": -0.9370101094245911, "logits/rejected": -0.9013323783874512, "logps/chosen": -387.3674011230469, "logps/rejected": -653.4251708984375, "loss": 0.1564, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5667197108268738, "rewards/margins": 2.4151532649993896, "rewards/rejected": -2.981873035430908, "step": 633 }, { "epoch": 0.54, "grad_norm": 90.51805812937883, "learning_rate": 1.0224594506566666e-06, "logits/chosen": -0.9417135715484619, "logits/rejected": -0.9171383380889893, "logps/chosen": -725.6295776367188, "logps/rejected": -796.2141723632812, "loss": 0.5424, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3215444087982178, "rewards/margins": 1.8195197582244873, "rewards/rejected": -3.141064167022705, "step": 634 }, { "epoch": 0.54, "grad_norm": 62.04604028789953, "learning_rate": 1.0194652646105317e-06, "logits/chosen": -0.981023371219635, "logits/rejected": -0.9579742550849915, "logps/chosen": -600.0975341796875, "logps/rejected": -712.9047241210938, "loss": 0.299, "rewards/accuracies": 0.875, "rewards/chosen": -1.0958032608032227, "rewards/margins": 1.629847526550293, "rewards/rejected": -2.7256507873535156, "step": 635 }, { "epoch": 0.55, "grad_norm": 44.86862238379095, "learning_rate": 1.0164709039786616e-06, "logits/chosen": -1.0345852375030518, "logits/rejected": -0.9957494735717773, "logps/chosen": -662.246337890625, "logps/rejected": -1078.583740234375, "loss": 0.2296, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1617629528045654, "rewards/margins": 4.0115203857421875, "rewards/rejected": -5.173283576965332, "step": 636 }, { "epoch": 0.55, "grad_norm": 66.67094105532799, "learning_rate": 1.0134763956177504e-06, "logits/chosen": -1.0106799602508545, "logits/rejected": -0.9603334665298462, "logps/chosen": -593.8453369140625, "logps/rejected": -729.3505859375, "loss": 0.3855, "rewards/accuracies": 0.75, "rewards/chosen": -1.4872370958328247, "rewards/margins": 1.5294179916381836, "rewards/rejected": -3.016655206680298, "step": 637 }, { "epoch": 0.55, "grad_norm": 29.81447058483848, "learning_rate": 1.0104817663858161e-06, "logits/chosen": -0.9926195740699768, "logits/rejected": -0.9689513444900513, "logps/chosen": -318.2772216796875, "logps/rejected": -550.7271118164062, "loss": 0.245, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8237345218658447, "rewards/margins": 1.9877159595489502, "rewards/rejected": -2.811450481414795, "step": 638 }, { "epoch": 0.55, "grad_norm": 75.35557968141725, "learning_rate": 1.0074870431419627e-06, "logits/chosen": -1.0136584043502808, "logits/rejected": -0.934355616569519, "logps/chosen": -665.5938110351562, "logps/rejected": -847.51953125, "loss": 0.395, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2840629816055298, "rewards/margins": 2.6745872497558594, "rewards/rejected": -3.9586501121520996, "step": 639 }, { "epoch": 0.55, "grad_norm": 57.02748396120161, "learning_rate": 1.0044922527461358e-06, "logits/chosen": -1.0077900886535645, "logits/rejected": -0.9565660953521729, "logps/chosen": -544.416259765625, "logps/rejected": -856.279541015625, "loss": 0.2009, "rewards/accuracies": 1.0, "rewards/chosen": -1.349835753440857, "rewards/margins": 2.358369827270508, "rewards/rejected": -3.7082056999206543, "step": 640 }, { "epoch": 0.55, "grad_norm": 21.76600647344353, "learning_rate": 1.0014974220588836e-06, "logits/chosen": -0.9938585162162781, "logits/rejected": -0.9594500064849854, "logps/chosen": -365.6549987792969, "logps/rejected": -681.994384765625, "loss": 0.1668, "rewards/accuracies": 1.0, "rewards/chosen": -0.7148664593696594, "rewards/margins": 2.855158805847168, "rewards/rejected": -3.5700254440307617, "step": 641 }, { "epoch": 0.55, "grad_norm": 88.30985096224623, "learning_rate": 9.985025779411165e-07, "logits/chosen": -0.9774501323699951, "logits/rejected": -0.9425498247146606, "logps/chosen": -552.4366455078125, "logps/rejected": -776.1644287109375, "loss": 0.6567, "rewards/accuracies": 0.875, "rewards/chosen": -1.5388054847717285, "rewards/margins": 2.0177862644195557, "rewards/rejected": -3.556591749191284, "step": 642 }, { "epoch": 0.55, "grad_norm": 33.67261866832234, "learning_rate": 9.955077472538648e-07, "logits/chosen": -1.0780702829360962, "logits/rejected": -1.0096969604492188, "logps/chosen": -564.3319091796875, "logps/rejected": -761.5578002929688, "loss": 0.226, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1705207824707031, "rewards/margins": 2.825336217880249, "rewards/rejected": -3.995857000350952, "step": 643 }, { "epoch": 0.55, "grad_norm": 19.057745564111876, "learning_rate": 9.925129568580374e-07, "logits/chosen": -0.9582267999649048, "logits/rejected": -0.941879391670227, "logps/chosen": -360.990478515625, "logps/rejected": -773.388671875, "loss": 0.1027, "rewards/accuracies": 1.0, "rewards/chosen": -0.433311402797699, "rewards/margins": 3.6941919326782227, "rewards/rejected": -4.127503395080566, "step": 644 }, { "epoch": 0.55, "grad_norm": 39.503260225728425, "learning_rate": 9.89518233614184e-07, "logits/chosen": -1.0283384323120117, "logits/rejected": -0.9627246856689453, "logps/chosen": -544.3568725585938, "logps/rejected": -672.4243774414062, "loss": 0.2435, "rewards/accuracies": 0.875, "rewards/chosen": -1.3087122440338135, "rewards/margins": 1.910134196281433, "rewards/rejected": -3.218846559524536, "step": 645 }, { "epoch": 0.55, "grad_norm": 33.99114210586287, "learning_rate": 9.8652360438225e-07, "logits/chosen": -1.0589323043823242, "logits/rejected": -0.9973713159561157, "logps/chosen": -550.265625, "logps/rejected": -813.226318359375, "loss": 0.1675, "rewards/accuracies": 1.0, "rewards/chosen": -1.1273949146270752, "rewards/margins": 2.866863250732422, "rewards/rejected": -3.9942586421966553, "step": 646 }, { "epoch": 0.55, "grad_norm": 50.68095458780209, "learning_rate": 9.835290960213381e-07, "logits/chosen": -1.0335006713867188, "logits/rejected": -1.0002275705337524, "logps/chosen": -356.583740234375, "logps/rejected": -690.9955444335938, "loss": 0.2406, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8581660389900208, "rewards/margins": 2.352982521057129, "rewards/rejected": -3.211148262023926, "step": 647 }, { "epoch": 0.56, "grad_norm": 55.01033262884792, "learning_rate": 9.805347353894684e-07, "logits/chosen": -0.9837990999221802, "logits/rejected": -0.9763227701187134, "logps/chosen": -447.79180908203125, "logps/rejected": -744.4036865234375, "loss": 0.3265, "rewards/accuracies": 0.875, "rewards/chosen": -1.1442887783050537, "rewards/margins": 2.827482223510742, "rewards/rejected": -3.971770763397217, "step": 648 }, { "epoch": 0.56, "grad_norm": 69.9962549998083, "learning_rate": 9.775405493433336e-07, "logits/chosen": -1.059086799621582, "logits/rejected": -0.9900112152099609, "logps/chosen": -807.8218994140625, "logps/rejected": -719.1181640625, "loss": 0.3185, "rewards/accuracies": 0.875, "rewards/chosen": -1.7500087022781372, "rewards/margins": 1.4685823917388916, "rewards/rejected": -3.2185912132263184, "step": 649 }, { "epoch": 0.56, "grad_norm": 127.33599373241286, "learning_rate": 9.745465647380618e-07, "logits/chosen": -1.0178935527801514, "logits/rejected": -0.9754936695098877, "logps/chosen": -677.5139770507812, "logps/rejected": -957.6383056640625, "loss": 0.2906, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8714618682861328, "rewards/margins": 2.3842825889587402, "rewards/rejected": -4.255744934082031, "step": 650 }, { "epoch": 0.56, "grad_norm": 82.60462428449118, "learning_rate": 9.71552808426975e-07, "logits/chosen": -1.0387744903564453, "logits/rejected": -0.9887288808822632, "logps/chosen": -718.5755004882812, "logps/rejected": -1037.51953125, "loss": 0.2611, "rewards/accuracies": 0.875, "rewards/chosen": -1.3768072128295898, "rewards/margins": 3.0493197441101074, "rewards/rejected": -4.4261274337768555, "step": 651 }, { "epoch": 0.56, "grad_norm": 79.66272459241623, "learning_rate": 9.685593072613463e-07, "logits/chosen": -1.0097852945327759, "logits/rejected": -0.9862075448036194, "logps/chosen": -632.6588134765625, "logps/rejected": -866.3151245117188, "loss": 0.3264, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7535400390625, "rewards/margins": 2.2010996341705322, "rewards/rejected": -3.9546396732330322, "step": 652 }, { "epoch": 0.56, "grad_norm": 55.47087463249963, "learning_rate": 9.655660880901604e-07, "logits/chosen": -1.0378929376602173, "logits/rejected": -0.9814661741256714, "logps/chosen": -619.1103515625, "logps/rejected": -685.860107421875, "loss": 0.2389, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1459777355194092, "rewards/margins": 2.4581964015960693, "rewards/rejected": -3.6041741371154785, "step": 653 }, { "epoch": 0.56, "grad_norm": 27.190095964458937, "learning_rate": 9.625731777598744e-07, "logits/chosen": -1.0018420219421387, "logits/rejected": -0.9675527811050415, "logps/chosen": -427.070068359375, "logps/rejected": -564.972900390625, "loss": 0.1653, "rewards/accuracies": 1.0, "rewards/chosen": -0.7308350801467896, "rewards/margins": 2.6819570064544678, "rewards/rejected": -3.4127919673919678, "step": 654 }, { "epoch": 0.56, "grad_norm": 47.02737017794142, "learning_rate": 9.595806031141738e-07, "logits/chosen": -1.0094876289367676, "logits/rejected": -0.9599051475524902, "logps/chosen": -543.79541015625, "logps/rejected": -703.00927734375, "loss": 0.1868, "rewards/accuracies": 0.9375, "rewards/chosen": -1.366621971130371, "rewards/margins": 2.607872486114502, "rewards/rejected": -3.974494457244873, "step": 655 }, { "epoch": 0.56, "grad_norm": 42.21459880445656, "learning_rate": 9.565883909937339e-07, "logits/chosen": -1.0014833211898804, "logits/rejected": -0.940342366695404, "logps/chosen": -557.1934204101562, "logps/rejected": -724.4772338867188, "loss": 0.2306, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4227467775344849, "rewards/margins": 2.1267175674438477, "rewards/rejected": -3.549463987350464, "step": 656 }, { "epoch": 0.56, "grad_norm": 52.19377804700672, "learning_rate": 9.535965682359777e-07, "logits/chosen": -1.0168036222457886, "logits/rejected": -0.9771864414215088, "logps/chosen": -378.285888671875, "logps/rejected": -516.0848388671875, "loss": 0.3705, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6493128538131714, "rewards/margins": 2.358621120452881, "rewards/rejected": -3.007934093475342, "step": 657 }, { "epoch": 0.56, "grad_norm": 47.28051655396848, "learning_rate": 9.506051616748373e-07, "logits/chosen": -1.052863359451294, "logits/rejected": -0.9798964262008667, "logps/chosen": -632.6463012695312, "logps/rejected": -971.0015869140625, "loss": 0.1349, "rewards/accuracies": 1.0, "rewards/chosen": -1.1951112747192383, "rewards/margins": 3.460440158843994, "rewards/rejected": -4.655550956726074, "step": 658 }, { "epoch": 0.57, "grad_norm": 44.463965490286, "learning_rate": 9.476141981405112e-07, "logits/chosen": -1.0496432781219482, "logits/rejected": -1.0346564054489136, "logps/chosen": -476.4952392578125, "logps/rejected": -637.9500122070312, "loss": 0.2351, "rewards/accuracies": 1.0, "rewards/chosen": -1.085493564605713, "rewards/margins": 2.589820384979248, "rewards/rejected": -3.675313949584961, "step": 659 }, { "epoch": 0.57, "grad_norm": 68.27512907635017, "learning_rate": 9.44623704459224e-07, "logits/chosen": -1.050779104232788, "logits/rejected": -1.0321484804153442, "logps/chosen": -491.0112609863281, "logps/rejected": -692.8443603515625, "loss": 0.2597, "rewards/accuracies": 0.875, "rewards/chosen": -1.3593103885650635, "rewards/margins": 2.38303542137146, "rewards/rejected": -3.7423458099365234, "step": 660 }, { "epoch": 0.57, "grad_norm": 82.05647902225522, "learning_rate": 9.416337074529872e-07, "logits/chosen": -1.0249155759811401, "logits/rejected": -0.9780560731887817, "logps/chosen": -583.6378173828125, "logps/rejected": -788.2223510742188, "loss": 0.5512, "rewards/accuracies": 0.875, "rewards/chosen": -1.0925850868225098, "rewards/margins": 2.51676082611084, "rewards/rejected": -3.6093459129333496, "step": 661 }, { "epoch": 0.57, "grad_norm": 45.265104089038, "learning_rate": 9.386442339393563e-07, "logits/chosen": -1.024889588356018, "logits/rejected": -0.9728207588195801, "logps/chosen": -473.43365478515625, "logps/rejected": -656.6044921875, "loss": 0.1839, "rewards/accuracies": 1.0, "rewards/chosen": -0.80340975522995, "rewards/margins": 2.5102243423461914, "rewards/rejected": -3.313633918762207, "step": 662 }, { "epoch": 0.57, "grad_norm": 65.92164850179016, "learning_rate": 9.356553107311921e-07, "logits/chosen": -1.016336441040039, "logits/rejected": -1.0131916999816895, "logps/chosen": -474.9246520996094, "logps/rejected": -530.8618774414062, "loss": 0.4065, "rewards/accuracies": 0.875, "rewards/chosen": -0.9806596040725708, "rewards/margins": 1.3442504405975342, "rewards/rejected": -2.3249101638793945, "step": 663 }, { "epoch": 0.57, "grad_norm": 81.83170951612074, "learning_rate": 9.326669646364203e-07, "logits/chosen": -0.9990056753158569, "logits/rejected": -0.9700546264648438, "logps/chosen": -473.5721740722656, "logps/rejected": -528.7078247070312, "loss": 0.6083, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9107904434204102, "rewards/margins": 1.244102954864502, "rewards/rejected": -2.154893398284912, "step": 664 }, { "epoch": 0.57, "grad_norm": 55.94802612690403, "learning_rate": 9.296792224577894e-07, "logits/chosen": -1.024969458580017, "logits/rejected": -0.9708298444747925, "logps/chosen": -538.3052978515625, "logps/rejected": -770.4991455078125, "loss": 0.2874, "rewards/accuracies": 0.875, "rewards/chosen": -0.726364254951477, "rewards/margins": 2.3672268390655518, "rewards/rejected": -3.0935909748077393, "step": 665 }, { "epoch": 0.57, "grad_norm": 65.92778234189511, "learning_rate": 9.266921109926317e-07, "logits/chosen": -1.020105242729187, "logits/rejected": -0.9524201154708862, "logps/chosen": -530.647705078125, "logps/rejected": -631.1388549804688, "loss": 0.4601, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9452372789382935, "rewards/margins": 1.8844785690307617, "rewards/rejected": -2.8297157287597656, "step": 666 }, { "epoch": 0.57, "grad_norm": 47.23741485222515, "learning_rate": 9.237056570326231e-07, "logits/chosen": -0.995692789554596, "logits/rejected": -0.963083028793335, "logps/chosen": -438.02374267578125, "logps/rejected": -751.4575805664062, "loss": 0.2614, "rewards/accuracies": 0.875, "rewards/chosen": -0.4857766032218933, "rewards/margins": 2.190925359725952, "rewards/rejected": -2.6767020225524902, "step": 667 }, { "epoch": 0.57, "grad_norm": 37.82814644493825, "learning_rate": 9.207198873635413e-07, "logits/chosen": -1.0047978162765503, "logits/rejected": -0.9785387516021729, "logps/chosen": -464.0424499511719, "logps/rejected": -700.0529174804688, "loss": 0.196, "rewards/accuracies": 0.875, "rewards/chosen": -0.7992026805877686, "rewards/margins": 2.433967351913452, "rewards/rejected": -3.2331700325012207, "step": 668 }, { "epoch": 0.57, "grad_norm": 67.30485233904321, "learning_rate": 9.177348287650273e-07, "logits/chosen": -1.024238109588623, "logits/rejected": -0.9735568761825562, "logps/chosen": -440.551513671875, "logps/rejected": -649.260009765625, "loss": 0.3645, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7555053234100342, "rewards/margins": 1.5785208940505981, "rewards/rejected": -2.334026336669922, "step": 669 }, { "epoch": 0.57, "grad_norm": 51.33789079996192, "learning_rate": 9.147505080103436e-07, "logits/chosen": -1.0277349948883057, "logits/rejected": -0.9844967126846313, "logps/chosen": -485.0599365234375, "logps/rejected": -690.2149658203125, "loss": 0.2801, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0582692623138428, "rewards/margins": 1.9671037197113037, "rewards/rejected": -3.0253729820251465, "step": 670 }, { "epoch": 0.58, "grad_norm": 84.470816979724, "learning_rate": 9.117669518661365e-07, "logits/chosen": -0.9990596771240234, "logits/rejected": -0.9557846784591675, "logps/chosen": -474.22113037109375, "logps/rejected": -616.2432861328125, "loss": 0.4854, "rewards/accuracies": 0.625, "rewards/chosen": -1.1680184602737427, "rewards/margins": 1.3425781726837158, "rewards/rejected": -2.510596513748169, "step": 671 }, { "epoch": 0.58, "grad_norm": 34.1166550327403, "learning_rate": 9.087841870921929e-07, "logits/chosen": -1.0641005039215088, "logits/rejected": -1.0280168056488037, "logps/chosen": -365.0167236328125, "logps/rejected": -560.3458251953125, "loss": 0.238, "rewards/accuracies": 1.0, "rewards/chosen": -0.7898271083831787, "rewards/margins": 2.0286502838134766, "rewards/rejected": -2.8184773921966553, "step": 672 }, { "epoch": 0.58, "grad_norm": 60.0255646685014, "learning_rate": 9.058022404412018e-07, "logits/chosen": -1.0256595611572266, "logits/rejected": -1.0097715854644775, "logps/chosen": -336.9100646972656, "logps/rejected": -485.3856201171875, "loss": 0.3979, "rewards/accuracies": 0.875, "rewards/chosen": -0.9657609462738037, "rewards/margins": 1.0824462175369263, "rewards/rejected": -2.0482070446014404, "step": 673 }, { "epoch": 0.58, "grad_norm": 52.558938660404124, "learning_rate": 9.028211386585157e-07, "logits/chosen": -1.0598628520965576, "logits/rejected": -1.0078938007354736, "logps/chosen": -487.5439147949219, "logps/rejected": -664.8508911132812, "loss": 0.2814, "rewards/accuracies": 0.875, "rewards/chosen": -0.8298671245574951, "rewards/margins": 2.1915345191955566, "rewards/rejected": -3.0214016437530518, "step": 674 }, { "epoch": 0.58, "grad_norm": 79.31656307930656, "learning_rate": 8.998409084819087e-07, "logits/chosen": -1.0546629428863525, "logits/rejected": -1.0089797973632812, "logps/chosen": -675.748046875, "logps/rejected": -908.391845703125, "loss": 0.4046, "rewards/accuracies": 0.75, "rewards/chosen": -1.3004239797592163, "rewards/margins": 2.9876890182495117, "rewards/rejected": -4.288113117218018, "step": 675 }, { "epoch": 0.58, "grad_norm": 35.20555496949064, "learning_rate": 8.968615766413365e-07, "logits/chosen": -1.0567575693130493, "logits/rejected": -1.0212836265563965, "logps/chosen": -423.4122314453125, "logps/rejected": -738.394287109375, "loss": 0.1584, "rewards/accuracies": 0.9375, "rewards/chosen": -0.609798014163971, "rewards/margins": 3.163112163543701, "rewards/rejected": -3.7729101181030273, "step": 676 }, { "epoch": 0.58, "grad_norm": 49.798090039342085, "learning_rate": 8.938831698586993e-07, "logits/chosen": -1.0136868953704834, "logits/rejected": -0.9751554727554321, "logps/chosen": -400.88995361328125, "logps/rejected": -635.0789794921875, "loss": 0.2124, "rewards/accuracies": 0.875, "rewards/chosen": -1.1595284938812256, "rewards/margins": 2.3611745834350586, "rewards/rejected": -3.520702838897705, "step": 677 }, { "epoch": 0.58, "grad_norm": 38.677926146867755, "learning_rate": 8.90905714847599e-07, "logits/chosen": -1.075408697128296, "logits/rejected": -1.0172092914581299, "logps/chosen": -539.0830078125, "logps/rejected": -641.5501708984375, "loss": 0.169, "rewards/accuracies": 1.0, "rewards/chosen": -1.0429928302764893, "rewards/margins": 2.260859251022339, "rewards/rejected": -3.303852081298828, "step": 678 }, { "epoch": 0.58, "grad_norm": 55.458709677682904, "learning_rate": 8.87929238313101e-07, "logits/chosen": -1.0920605659484863, "logits/rejected": -1.0440154075622559, "logps/chosen": -455.96051025390625, "logps/rejected": -724.99560546875, "loss": 0.2578, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0661029815673828, "rewards/margins": 2.8346362113952637, "rewards/rejected": -3.9007389545440674, "step": 679 }, { "epoch": 0.58, "grad_norm": 52.56159035268648, "learning_rate": 8.849537669514961e-07, "logits/chosen": -1.018843650817871, "logits/rejected": -0.997867226600647, "logps/chosen": -395.1448974609375, "logps/rejected": -477.63116455078125, "loss": 0.428, "rewards/accuracies": 0.75, "rewards/chosen": -0.9292417764663696, "rewards/margins": 1.5945565700531006, "rewards/rejected": -2.5237984657287598, "step": 680 }, { "epoch": 0.58, "grad_norm": 31.972474581807276, "learning_rate": 8.819793274500577e-07, "logits/chosen": -1.0466132164001465, "logits/rejected": -1.0130168199539185, "logps/chosen": -539.3006591796875, "logps/rejected": -813.1967163085938, "loss": 0.1193, "rewards/accuracies": 1.0, "rewards/chosen": -0.9003525972366333, "rewards/margins": 3.3533737659454346, "rewards/rejected": -4.253726005554199, "step": 681 }, { "epoch": 0.58, "grad_norm": 47.21458017557427, "learning_rate": 8.790059464868051e-07, "logits/chosen": -1.108288288116455, "logits/rejected": -1.025892972946167, "logps/chosen": -577.5280151367188, "logps/rejected": -683.5076904296875, "loss": 0.1913, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9032981395721436, "rewards/margins": 2.549906015396118, "rewards/rejected": -3.4532041549682617, "step": 682 }, { "epoch": 0.59, "grad_norm": 67.17080337761237, "learning_rate": 8.760336507302644e-07, "logits/chosen": -1.068331003189087, "logits/rejected": -1.0443072319030762, "logps/chosen": -470.96563720703125, "logps/rejected": -577.9490356445312, "loss": 0.3619, "rewards/accuracies": 0.875, "rewards/chosen": -1.0024915933609009, "rewards/margins": 2.034213066101074, "rewards/rejected": -3.0367047786712646, "step": 683 }, { "epoch": 0.59, "grad_norm": 79.39125605456371, "learning_rate": 8.730624668392274e-07, "logits/chosen": -1.011876106262207, "logits/rejected": -0.9848489761352539, "logps/chosen": -395.51123046875, "logps/rejected": -556.3402099609375, "loss": 0.4688, "rewards/accuracies": 0.75, "rewards/chosen": -1.3166017532348633, "rewards/margins": 1.682083249092102, "rewards/rejected": -2.998684883117676, "step": 684 }, { "epoch": 0.59, "grad_norm": 32.74442625831233, "learning_rate": 8.700924214625129e-07, "logits/chosen": -1.104191780090332, "logits/rejected": -1.0686068534851074, "logps/chosen": -329.3619689941406, "logps/rejected": -559.1907958984375, "loss": 0.248, "rewards/accuracies": 0.875, "rewards/chosen": -0.5531414151191711, "rewards/margins": 2.6192808151245117, "rewards/rejected": -3.172422409057617, "step": 685 }, { "epoch": 0.59, "grad_norm": 61.697891131967666, "learning_rate": 8.671235412387294e-07, "logits/chosen": -1.0755338668823242, "logits/rejected": -1.0327041149139404, "logps/chosen": -499.4652099609375, "logps/rejected": -629.8323364257812, "loss": 0.3479, "rewards/accuracies": 0.8125, "rewards/chosen": -1.190791130065918, "rewards/margins": 2.2594170570373535, "rewards/rejected": -3.4502081871032715, "step": 686 }, { "epoch": 0.59, "grad_norm": 52.226803144753, "learning_rate": 8.641558527960353e-07, "logits/chosen": -1.091662883758545, "logits/rejected": -1.0301198959350586, "logps/chosen": -495.9996337890625, "logps/rejected": -590.9207153320312, "loss": 0.3159, "rewards/accuracies": 0.875, "rewards/chosen": -0.9306167960166931, "rewards/margins": 2.0519721508026123, "rewards/rejected": -2.98258900642395, "step": 687 }, { "epoch": 0.59, "grad_norm": 37.689109383788484, "learning_rate": 8.611893827518987e-07, "logits/chosen": -1.003065586090088, "logits/rejected": -0.9978610873222351, "logps/chosen": -495.4908752441406, "logps/rejected": -803.0103759765625, "loss": 0.1529, "rewards/accuracies": 1.0, "rewards/chosen": -1.143528699874878, "rewards/margins": 2.74113130569458, "rewards/rejected": -3.884659767150879, "step": 688 }, { "epoch": 0.59, "grad_norm": 57.00456224390602, "learning_rate": 8.582241577128596e-07, "logits/chosen": -1.0166442394256592, "logits/rejected": -1.0022616386413574, "logps/chosen": -444.8175964355469, "logps/rejected": -605.216064453125, "loss": 0.2829, "rewards/accuracies": 0.9375, "rewards/chosen": -0.45799413323402405, "rewards/margins": 1.3977954387664795, "rewards/rejected": -1.8557895421981812, "step": 689 }, { "epoch": 0.59, "grad_norm": 90.3576776617285, "learning_rate": 8.552602042742929e-07, "logits/chosen": -0.9633793830871582, "logits/rejected": -0.9141581058502197, "logps/chosen": -486.4992980957031, "logps/rejected": -623.2057495117188, "loss": 0.5209, "rewards/accuracies": 0.75, "rewards/chosen": -0.09763508290052414, "rewards/margins": 0.8215664029121399, "rewards/rejected": -0.9192014932632446, "step": 690 }, { "epoch": 0.59, "grad_norm": 75.80511298063934, "learning_rate": 8.522975490201676e-07, "logits/chosen": -0.9981414079666138, "logits/rejected": -0.9680377840995789, "logps/chosen": -582.814208984375, "logps/rejected": -789.6953125, "loss": 0.4338, "rewards/accuracies": 0.75, "rewards/chosen": -0.12799492478370667, "rewards/margins": 1.251299262046814, "rewards/rejected": -1.3792941570281982, "step": 691 }, { "epoch": 0.59, "grad_norm": 58.03890101081773, "learning_rate": 8.493362185228085e-07, "logits/chosen": -1.0232558250427246, "logits/rejected": -1.0005247592926025, "logps/chosen": -470.6429138183594, "logps/rejected": -584.0569458007812, "loss": 0.2724, "rewards/accuracies": 0.875, "rewards/chosen": -0.6013145446777344, "rewards/margins": 2.231989860534668, "rewards/rejected": -2.8333044052124023, "step": 692 }, { "epoch": 0.59, "grad_norm": 39.57112612968621, "learning_rate": 8.463762393426596e-07, "logits/chosen": -1.0991417169570923, "logits/rejected": -1.0630440711975098, "logps/chosen": -457.98760986328125, "logps/rejected": -599.1575317382812, "loss": 0.2307, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7417198419570923, "rewards/margins": 2.1613929271698, "rewards/rejected": -2.9031128883361816, "step": 693 }, { "epoch": 0.6, "grad_norm": 85.58269634576412, "learning_rate": 8.434176380280445e-07, "logits/chosen": -1.139190912246704, "logits/rejected": -1.1178913116455078, "logps/chosen": -432.321533203125, "logps/rejected": -602.7254638671875, "loss": 0.6591, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0561460256576538, "rewards/margins": 1.7356929779052734, "rewards/rejected": -2.791839122772217, "step": 694 }, { "epoch": 0.6, "grad_norm": 60.306927620731216, "learning_rate": 8.404604411149279e-07, "logits/chosen": -1.0914738178253174, "logits/rejected": -1.0556695461273193, "logps/chosen": -388.7018737792969, "logps/rejected": -444.811279296875, "loss": 0.4614, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7933942079544067, "rewards/margins": 1.2613402605056763, "rewards/rejected": -2.054734230041504, "step": 695 }, { "epoch": 0.6, "grad_norm": 112.37605995114386, "learning_rate": 8.375046751266796e-07, "logits/chosen": -1.071258783340454, "logits/rejected": -1.0431017875671387, "logps/chosen": -693.3665771484375, "logps/rejected": -638.94482421875, "loss": 0.8419, "rewards/accuracies": 0.75, "rewards/chosen": -1.8593192100524902, "rewards/margins": 0.6869137287139893, "rewards/rejected": -2.5462329387664795, "step": 696 }, { "epoch": 0.6, "grad_norm": 81.30328331871632, "learning_rate": 8.345503665738343e-07, "logits/chosen": -1.1141998767852783, "logits/rejected": -1.0982260704040527, "logps/chosen": -762.9190673828125, "logps/rejected": -865.8209228515625, "loss": 0.4519, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9578491449356079, "rewards/margins": 2.394010305404663, "rewards/rejected": -3.3518595695495605, "step": 697 }, { "epoch": 0.6, "grad_norm": 51.28776378440233, "learning_rate": 8.31597541953855e-07, "logits/chosen": -1.0857526063919067, "logits/rejected": -1.0541229248046875, "logps/chosen": -576.4926147460938, "logps/rejected": -903.2041625976562, "loss": 0.1986, "rewards/accuracies": 0.9375, "rewards/chosen": -1.033154010772705, "rewards/margins": 3.0796685218811035, "rewards/rejected": -4.112822532653809, "step": 698 }, { "epoch": 0.6, "grad_norm": 57.160000515124885, "learning_rate": 8.28646227750895e-07, "logits/chosen": -1.0621399879455566, "logits/rejected": -1.0101492404937744, "logps/chosen": -623.4498291015625, "logps/rejected": -849.6536254882812, "loss": 0.2408, "rewards/accuracies": 0.875, "rewards/chosen": -1.0524137020111084, "rewards/margins": 2.7270641326904297, "rewards/rejected": -3.779478073120117, "step": 699 }, { "epoch": 0.6, "grad_norm": 35.66216623997116, "learning_rate": 8.256964504355616e-07, "logits/chosen": -1.0806918144226074, "logits/rejected": -1.068446159362793, "logps/chosen": -572.2581176757812, "logps/rejected": -722.5521240234375, "loss": 0.1679, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0125174522399902, "rewards/margins": 2.556424140930176, "rewards/rejected": -3.568941593170166, "step": 700 }, { "epoch": 0.6, "grad_norm": 46.657284636799524, "learning_rate": 8.227482364646761e-07, "logits/chosen": -1.1175563335418701, "logits/rejected": -1.0721676349639893, "logps/chosen": -326.6000061035156, "logps/rejected": -496.0318298339844, "loss": 0.3426, "rewards/accuracies": 0.8125, "rewards/chosen": -0.769618034362793, "rewards/margins": 1.6918368339538574, "rewards/rejected": -2.4614548683166504, "step": 701 }, { "epoch": 0.6, "grad_norm": 37.87281245613246, "learning_rate": 8.198016122810387e-07, "logits/chosen": -1.1282148361206055, "logits/rejected": -1.045536994934082, "logps/chosen": -478.67999267578125, "logps/rejected": -792.648193359375, "loss": 0.1563, "rewards/accuracies": 0.875, "rewards/chosen": -0.9513812065124512, "rewards/margins": 3.4069557189941406, "rewards/rejected": -4.358336925506592, "step": 702 }, { "epoch": 0.6, "grad_norm": 82.09000373881834, "learning_rate": 8.168566043131917e-07, "logits/chosen": -1.1367781162261963, "logits/rejected": -1.0976234674453735, "logps/chosen": -491.50787353515625, "logps/rejected": -632.8331298828125, "loss": 0.5357, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3084259033203125, "rewards/margins": 1.7329047918319702, "rewards/rejected": -3.0413308143615723, "step": 703 }, { "epoch": 0.6, "grad_norm": 47.92059762815023, "learning_rate": 8.139132389751793e-07, "logits/chosen": -1.1266899108886719, "logits/rejected": -1.0857560634613037, "logps/chosen": -483.18505859375, "logps/rejected": -612.6836547851562, "loss": 0.2122, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8871411085128784, "rewards/margins": 2.389219045639038, "rewards/rejected": -3.276360034942627, "step": 704 }, { "epoch": 0.6, "grad_norm": 64.29930782092856, "learning_rate": 8.109715426663144e-07, "logits/chosen": -1.1286720037460327, "logits/rejected": -1.106456995010376, "logps/chosen": -493.1081237792969, "logps/rejected": -593.7147827148438, "loss": 0.3804, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0602256059646606, "rewards/margins": 1.802858829498291, "rewards/rejected": -2.863084316253662, "step": 705 }, { "epoch": 0.61, "grad_norm": 67.54118085026357, "learning_rate": 8.080315417709396e-07, "logits/chosen": -1.1312260627746582, "logits/rejected": -1.0784671306610107, "logps/chosen": -658.8128051757812, "logps/rejected": -790.6376953125, "loss": 0.3104, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2036482095718384, "rewards/margins": 2.3108599185943604, "rewards/rejected": -3.5145082473754883, "step": 706 }, { "epoch": 0.61, "grad_norm": 66.40738539477269, "learning_rate": 8.050932626581918e-07, "logits/chosen": -1.0875067710876465, "logits/rejected": -1.0200226306915283, "logps/chosen": -600.2384033203125, "logps/rejected": -778.6403198242188, "loss": 0.3488, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3704009056091309, "rewards/margins": 2.3894481658935547, "rewards/rejected": -3.7598493099212646, "step": 707 }, { "epoch": 0.61, "grad_norm": 43.056248164179266, "learning_rate": 8.021567316817637e-07, "logits/chosen": -1.1687889099121094, "logits/rejected": -1.0820575952529907, "logps/chosen": -612.1283569335938, "logps/rejected": -858.5739135742188, "loss": 0.113, "rewards/accuracies": 1.0, "rewards/chosen": -0.9502978324890137, "rewards/margins": 3.2523093223571777, "rewards/rejected": -4.202607154846191, "step": 708 }, { "epoch": 0.61, "grad_norm": 34.882399762039014, "learning_rate": 7.992219751796704e-07, "logits/chosen": -1.0977517366409302, "logits/rejected": -1.0606008768081665, "logps/chosen": -386.17108154296875, "logps/rejected": -614.8777465820312, "loss": 0.2711, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7608810663223267, "rewards/margins": 2.5815372467041016, "rewards/rejected": -3.3424181938171387, "step": 709 }, { "epoch": 0.61, "grad_norm": 41.4699572158317, "learning_rate": 7.962890194740108e-07, "logits/chosen": -1.159165859222412, "logits/rejected": -1.1114017963409424, "logps/chosen": -440.3230285644531, "logps/rejected": -511.60504150390625, "loss": 0.2059, "rewards/accuracies": 0.875, "rewards/chosen": -0.5045056343078613, "rewards/margins": 2.3634910583496094, "rewards/rejected": -2.8679966926574707, "step": 710 }, { "epoch": 0.61, "grad_norm": 32.88599014002069, "learning_rate": 7.933578908707324e-07, "logits/chosen": -1.0776267051696777, "logits/rejected": -1.0276210308074951, "logps/chosen": -521.4286499023438, "logps/rejected": -695.8472900390625, "loss": 0.1634, "rewards/accuracies": 1.0, "rewards/chosen": -1.188821792602539, "rewards/margins": 2.572869062423706, "rewards/rejected": -3.761690855026245, "step": 711 }, { "epoch": 0.61, "grad_norm": 65.68899031321791, "learning_rate": 7.904286156593946e-07, "logits/chosen": -1.1048779487609863, "logits/rejected": -1.0757970809936523, "logps/chosen": -292.3482360839844, "logps/rejected": -528.6915283203125, "loss": 0.5532, "rewards/accuracies": 0.75, "rewards/chosen": -0.6723714470863342, "rewards/margins": 1.703259825706482, "rewards/rejected": -2.375631093978882, "step": 712 }, { "epoch": 0.61, "grad_norm": 78.97284178105878, "learning_rate": 7.87501220112935e-07, "logits/chosen": -1.143258810043335, "logits/rejected": -1.079023838043213, "logps/chosen": -615.0197143554688, "logps/rejected": -760.0322265625, "loss": 0.3618, "rewards/accuracies": 0.8125, "rewards/chosen": -1.426388144493103, "rewards/margins": 2.533200740814209, "rewards/rejected": -3.9595890045166016, "step": 713 }, { "epoch": 0.61, "grad_norm": 40.012440631122075, "learning_rate": 7.845757304874312e-07, "logits/chosen": -1.1630828380584717, "logits/rejected": -1.1020534038543701, "logps/chosen": -494.54901123046875, "logps/rejected": -600.6971435546875, "loss": 0.229, "rewards/accuracies": 0.875, "rewards/chosen": -1.0917311906814575, "rewards/margins": 2.534214973449707, "rewards/rejected": -3.625946521759033, "step": 714 }, { "epoch": 0.61, "grad_norm": 39.65384568557147, "learning_rate": 7.816521730218663e-07, "logits/chosen": -1.1046650409698486, "logits/rejected": -1.0768277645111084, "logps/chosen": -405.21148681640625, "logps/rejected": -637.754638671875, "loss": 0.2215, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9128862619400024, "rewards/margins": 2.3731446266174316, "rewards/rejected": -3.2860312461853027, "step": 715 }, { "epoch": 0.61, "grad_norm": 85.73045814483734, "learning_rate": 7.787305739378949e-07, "logits/chosen": -1.1642417907714844, "logits/rejected": -1.1198418140411377, "logps/chosen": -488.8841857910156, "logps/rejected": -612.7138671875, "loss": 0.4937, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2218539714813232, "rewards/margins": 1.5381131172180176, "rewards/rejected": -2.759967088699341, "step": 716 }, { "epoch": 0.61, "grad_norm": 42.53386185849525, "learning_rate": 7.758109594396053e-07, "logits/chosen": -1.089572548866272, "logits/rejected": -1.0360852479934692, "logps/chosen": -579.4656982421875, "logps/rejected": -960.8588256835938, "loss": 0.1665, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8310198783874512, "rewards/margins": 4.329297065734863, "rewards/rejected": -5.160316467285156, "step": 717 }, { "epoch": 0.62, "grad_norm": 61.91236169763918, "learning_rate": 7.728933557132864e-07, "logits/chosen": -1.1526691913604736, "logits/rejected": -1.1194026470184326, "logps/chosen": -639.4688720703125, "logps/rejected": -732.7659912109375, "loss": 0.2888, "rewards/accuracies": 0.875, "rewards/chosen": -1.6607766151428223, "rewards/margins": 1.501218557357788, "rewards/rejected": -3.1619949340820312, "step": 718 }, { "epoch": 0.62, "grad_norm": 62.81120284872164, "learning_rate": 7.69977788927193e-07, "logits/chosen": -1.0805715322494507, "logits/rejected": -1.0669435262680054, "logps/chosen": -659.6182861328125, "logps/rejected": -815.5236206054688, "loss": 0.2639, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2269022464752197, "rewards/margins": 2.239307165145874, "rewards/rejected": -3.4662094116210938, "step": 719 }, { "epoch": 0.62, "grad_norm": 54.1255651596908, "learning_rate": 7.670642852313093e-07, "logits/chosen": -1.1983035802841187, "logits/rejected": -1.1525425910949707, "logps/chosen": -632.2000732421875, "logps/rejected": -597.90283203125, "loss": 0.2917, "rewards/accuracies": 0.875, "rewards/chosen": -1.4242873191833496, "rewards/margins": 1.8659169673919678, "rewards/rejected": -3.2902045249938965, "step": 720 }, { "epoch": 0.62, "grad_norm": 30.82111376515087, "learning_rate": 7.641528707571157e-07, "logits/chosen": -1.1706678867340088, "logits/rejected": -1.1194672584533691, "logps/chosen": -472.59967041015625, "logps/rejected": -579.7078247070312, "loss": 0.1936, "rewards/accuracies": 1.0, "rewards/chosen": -0.8377057313919067, "rewards/margins": 2.2968480587005615, "rewards/rejected": -3.134553909301758, "step": 721 }, { "epoch": 0.62, "grad_norm": 50.88348011114949, "learning_rate": 7.612435716173551e-07, "logits/chosen": -1.167925238609314, "logits/rejected": -1.1308338642120361, "logps/chosen": -470.2132263183594, "logps/rejected": -618.6790771484375, "loss": 0.2607, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9302735328674316, "rewards/margins": 2.231713056564331, "rewards/rejected": -3.1619863510131836, "step": 722 }, { "epoch": 0.62, "grad_norm": 63.98869670043746, "learning_rate": 7.583364139057967e-07, "logits/chosen": -1.13923180103302, "logits/rejected": -1.11210298538208, "logps/chosen": -669.2499389648438, "logps/rejected": -699.835205078125, "loss": 0.2721, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8878486156463623, "rewards/margins": 1.806983232498169, "rewards/rejected": -3.6948318481445312, "step": 723 }, { "epoch": 0.62, "grad_norm": 64.99137054774114, "learning_rate": 7.55431423697003e-07, "logits/chosen": -1.1343735456466675, "logits/rejected": -1.081453800201416, "logps/chosen": -517.8303833007812, "logps/rejected": -810.4320068359375, "loss": 0.301, "rewards/accuracies": 0.875, "rewards/chosen": -1.0299971103668213, "rewards/margins": 3.119328022003174, "rewards/rejected": -4.149324893951416, "step": 724 }, { "epoch": 0.62, "grad_norm": 45.66387754211882, "learning_rate": 7.525286270460969e-07, "logits/chosen": -1.1713275909423828, "logits/rejected": -1.1240694522857666, "logps/chosen": -584.0823974609375, "logps/rejected": -670.561767578125, "loss": 0.1905, "rewards/accuracies": 1.0, "rewards/chosen": -1.169159173965454, "rewards/margins": 2.4920945167541504, "rewards/rejected": -3.6612532138824463, "step": 725 }, { "epoch": 0.62, "grad_norm": 59.89071668488572, "learning_rate": 7.496280499885266e-07, "logits/chosen": -1.136541724205017, "logits/rejected": -1.0874381065368652, "logps/chosen": -488.20233154296875, "logps/rejected": -660.3988647460938, "loss": 0.3706, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1385947465896606, "rewards/margins": 1.85256826877594, "rewards/rejected": -2.9911630153656006, "step": 726 }, { "epoch": 0.62, "grad_norm": 50.81307150074276, "learning_rate": 7.467297185398323e-07, "logits/chosen": -1.124204158782959, "logits/rejected": -1.1052558422088623, "logps/chosen": -549.4209594726562, "logps/rejected": -717.3243408203125, "loss": 0.2483, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2831385135650635, "rewards/margins": 2.285860776901245, "rewards/rejected": -3.5689992904663086, "step": 727 }, { "epoch": 0.62, "grad_norm": 62.236229962423614, "learning_rate": 7.43833658695413e-07, "logits/chosen": -1.1308350563049316, "logits/rejected": -1.0671460628509521, "logps/chosen": -371.38238525390625, "logps/rejected": -560.85205078125, "loss": 0.3585, "rewards/accuracies": 0.75, "rewards/chosen": -0.947990357875824, "rewards/margins": 1.855220079421997, "rewards/rejected": -2.803210496902466, "step": 728 }, { "epoch": 0.63, "grad_norm": 52.450467055842964, "learning_rate": 7.409398964302946e-07, "logits/chosen": -1.165910243988037, "logits/rejected": -1.0663230419158936, "logps/chosen": -549.573974609375, "logps/rejected": -1081.66064453125, "loss": 0.2114, "rewards/accuracies": 0.9375, "rewards/chosen": -1.556742548942566, "rewards/margins": 3.4699325561523438, "rewards/rejected": -5.026675224304199, "step": 729 }, { "epoch": 0.63, "grad_norm": 83.48049695338477, "learning_rate": 7.380484576988948e-07, "logits/chosen": -1.1759792566299438, "logits/rejected": -1.124603271484375, "logps/chosen": -603.390625, "logps/rejected": -697.0489501953125, "loss": 0.6446, "rewards/accuracies": 0.8125, "rewards/chosen": -1.043789267539978, "rewards/margins": 1.9236034154891968, "rewards/rejected": -2.9673924446105957, "step": 730 }, { "epoch": 0.63, "grad_norm": 25.114247133832123, "learning_rate": 7.351593684347908e-07, "logits/chosen": -1.200255274772644, "logits/rejected": -1.1304539442062378, "logps/chosen": -431.82489013671875, "logps/rejected": -722.521728515625, "loss": 0.1204, "rewards/accuracies": 1.0, "rewards/chosen": -1.025187611579895, "rewards/margins": 2.7052230834960938, "rewards/rejected": -3.730410575866699, "step": 731 }, { "epoch": 0.63, "grad_norm": 52.31421975307311, "learning_rate": 7.322726545504889e-07, "logits/chosen": -1.172715425491333, "logits/rejected": -1.12477445602417, "logps/chosen": -370.84576416015625, "logps/rejected": -681.51318359375, "loss": 0.2574, "rewards/accuracies": 0.875, "rewards/chosen": -0.6131293773651123, "rewards/margins": 2.7430875301361084, "rewards/rejected": -3.3562169075012207, "step": 732 }, { "epoch": 0.63, "grad_norm": 78.43657110280873, "learning_rate": 7.293883419371892e-07, "logits/chosen": -1.1670312881469727, "logits/rejected": -1.1011550426483154, "logps/chosen": -411.6805419921875, "logps/rejected": -600.664306640625, "loss": 0.5422, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9437257051467896, "rewards/margins": 2.7335152626037598, "rewards/rejected": -3.677241086959839, "step": 733 }, { "epoch": 0.63, "grad_norm": 87.72015513512574, "learning_rate": 7.265064564645544e-07, "logits/chosen": -1.233947992324829, "logits/rejected": -1.1877312660217285, "logps/chosen": -538.8570556640625, "logps/rejected": -600.8184814453125, "loss": 0.4649, "rewards/accuracies": 0.75, "rewards/chosen": -1.5215339660644531, "rewards/margins": 1.1645421981811523, "rewards/rejected": -2.6860761642456055, "step": 734 }, { "epoch": 0.63, "grad_norm": 38.87597940211733, "learning_rate": 7.236270239804791e-07, "logits/chosen": -1.1780080795288086, "logits/rejected": -1.12031888961792, "logps/chosen": -474.943603515625, "logps/rejected": -638.14013671875, "loss": 0.1884, "rewards/accuracies": 1.0, "rewards/chosen": -1.0575810670852661, "rewards/margins": 2.20149564743042, "rewards/rejected": -3.2590765953063965, "step": 735 }, { "epoch": 0.63, "grad_norm": 64.52242884909481, "learning_rate": 7.207500703108556e-07, "logits/chosen": -1.1174970865249634, "logits/rejected": -1.0595775842666626, "logps/chosen": -523.7102661132812, "logps/rejected": -703.5924072265625, "loss": 0.3152, "rewards/accuracies": 0.75, "rewards/chosen": -1.0679781436920166, "rewards/margins": 2.1514697074890137, "rewards/rejected": -3.2194480895996094, "step": 736 }, { "epoch": 0.63, "grad_norm": 55.6938214020685, "learning_rate": 7.178756212593442e-07, "logits/chosen": -1.1437046527862549, "logits/rejected": -1.09684419631958, "logps/chosen": -481.9346923828125, "logps/rejected": -824.975341796875, "loss": 0.2046, "rewards/accuracies": 0.875, "rewards/chosen": -0.9986522197723389, "rewards/margins": 2.441328525543213, "rewards/rejected": -3.4399807453155518, "step": 737 }, { "epoch": 0.63, "grad_norm": 105.47202970842274, "learning_rate": 7.150037026071404e-07, "logits/chosen": -1.2130227088928223, "logits/rejected": -1.1825370788574219, "logps/chosen": -605.1729736328125, "logps/rejected": -633.9487915039062, "loss": 0.8505, "rewards/accuracies": 0.625, "rewards/chosen": -1.5780174732208252, "rewards/margins": 0.7051810026168823, "rewards/rejected": -2.283198356628418, "step": 738 }, { "epoch": 0.63, "grad_norm": 52.61984772985579, "learning_rate": 7.121343401127456e-07, "logits/chosen": -1.1280416250228882, "logits/rejected": -1.0790488719940186, "logps/chosen": -385.1922607421875, "logps/rejected": -701.3255615234375, "loss": 0.3356, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5919640064239502, "rewards/margins": 2.0131258964538574, "rewards/rejected": -2.6050901412963867, "step": 739 }, { "epoch": 0.63, "grad_norm": 51.35346020631582, "learning_rate": 7.092675595117332e-07, "logits/chosen": -1.1492226123809814, "logits/rejected": -1.0986218452453613, "logps/chosen": -395.18731689453125, "logps/rejected": -550.7103271484375, "loss": 0.3553, "rewards/accuracies": 0.875, "rewards/chosen": -0.6217691898345947, "rewards/margins": 1.554800033569336, "rewards/rejected": -2.1765689849853516, "step": 740 }, { "epoch": 0.64, "grad_norm": 70.21040596452477, "learning_rate": 7.064033865165203e-07, "logits/chosen": -1.1506271362304688, "logits/rejected": -1.121349811553955, "logps/chosen": -496.9982604980469, "logps/rejected": -625.1170654296875, "loss": 0.4375, "rewards/accuracies": 0.75, "rewards/chosen": -1.0170750617980957, "rewards/margins": 1.751772403717041, "rewards/rejected": -2.7688472270965576, "step": 741 }, { "epoch": 0.64, "grad_norm": 68.89351576725261, "learning_rate": 7.035418468161365e-07, "logits/chosen": -1.2487106323242188, "logits/rejected": -1.219386100769043, "logps/chosen": -414.99365234375, "logps/rejected": -643.078369140625, "loss": 0.3391, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8841046690940857, "rewards/margins": 1.9309461116790771, "rewards/rejected": -2.8150508403778076, "step": 742 }, { "epoch": 0.64, "grad_norm": 28.199879890615307, "learning_rate": 7.006829660759923e-07, "logits/chosen": -1.115299940109253, "logits/rejected": -1.07859468460083, "logps/chosen": -562.867431640625, "logps/rejected": -755.9138793945312, "loss": 0.1586, "rewards/accuracies": 1.0, "rewards/chosen": -0.8423274159431458, "rewards/margins": 2.3967549800872803, "rewards/rejected": -3.2390823364257812, "step": 743 }, { "epoch": 0.64, "grad_norm": 73.92393577003973, "learning_rate": 6.978267699376493e-07, "logits/chosen": -1.1400046348571777, "logits/rejected": -1.1184709072113037, "logps/chosen": -516.0626831054688, "logps/rejected": -608.6300048828125, "loss": 0.527, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7962015271186829, "rewards/margins": 1.346013069152832, "rewards/rejected": -2.142214775085449, "step": 744 }, { "epoch": 0.64, "grad_norm": 48.84711403391857, "learning_rate": 6.949732840185925e-07, "logits/chosen": -1.183213710784912, "logits/rejected": -1.1628715991973877, "logps/chosen": -407.721923828125, "logps/rejected": -440.9341735839844, "loss": 0.2901, "rewards/accuracies": 0.875, "rewards/chosen": -0.6022155284881592, "rewards/margins": 1.7358124256134033, "rewards/rejected": -2.3380279541015625, "step": 745 }, { "epoch": 0.64, "grad_norm": 70.5674991574398, "learning_rate": 6.921225339119971e-07, "logits/chosen": -1.167476773262024, "logits/rejected": -1.1096925735473633, "logps/chosen": -490.8918151855469, "logps/rejected": -707.9902954101562, "loss": 0.2934, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7327123880386353, "rewards/margins": 2.283106803894043, "rewards/rejected": -3.015819549560547, "step": 746 }, { "epoch": 0.64, "grad_norm": 47.13349865460932, "learning_rate": 6.892745451865008e-07, "logits/chosen": -1.259293556213379, "logits/rejected": -1.201749563217163, "logps/chosen": -394.35711669921875, "logps/rejected": -533.4117431640625, "loss": 0.372, "rewards/accuracies": 0.75, "rewards/chosen": -0.6959115266799927, "rewards/margins": 1.4656853675842285, "rewards/rejected": -2.1615970134735107, "step": 747 }, { "epoch": 0.64, "grad_norm": 27.819387134703216, "learning_rate": 6.86429343385975e-07, "logits/chosen": -1.2313560247421265, "logits/rejected": -1.2034997940063477, "logps/chosen": -436.7022705078125, "logps/rejected": -587.44140625, "loss": 0.285, "rewards/accuracies": 0.75, "rewards/chosen": -1.0417014360427856, "rewards/margins": 2.1582226753234863, "rewards/rejected": -3.1999239921569824, "step": 748 }, { "epoch": 0.64, "grad_norm": 42.357745921717495, "learning_rate": 6.835869540292942e-07, "logits/chosen": -1.187593698501587, "logits/rejected": -1.2094995975494385, "logps/chosen": -465.0749816894531, "logps/rejected": -561.1448364257812, "loss": 0.2746, "rewards/accuracies": 0.9375, "rewards/chosen": -0.540374755859375, "rewards/margins": 2.182948112487793, "rewards/rejected": -2.723323106765747, "step": 749 }, { "epoch": 0.64, "grad_norm": 32.360209468440345, "learning_rate": 6.807474026101078e-07, "logits/chosen": -1.2537109851837158, "logits/rejected": -1.1764297485351562, "logps/chosen": -573.1024780273438, "logps/rejected": -926.65625, "loss": 0.2464, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1264824867248535, "rewards/margins": 2.9104440212249756, "rewards/rejected": -4.03692626953125, "step": 750 }, { "epoch": 0.64, "grad_norm": 21.1459484931328, "learning_rate": 6.779107145966121e-07, "logits/chosen": -1.2650989294052124, "logits/rejected": -1.208359718322754, "logps/chosen": -451.5343017578125, "logps/rejected": -670.3783569335938, "loss": 0.193, "rewards/accuracies": 0.875, "rewards/chosen": -1.0734288692474365, "rewards/margins": 2.3972225189208984, "rewards/rejected": -3.470651626586914, "step": 751 }, { "epoch": 0.64, "grad_norm": 62.5659394844564, "learning_rate": 6.750769154313205e-07, "logits/chosen": -1.2595267295837402, "logits/rejected": -1.2201892137527466, "logps/chosen": -526.067138671875, "logps/rejected": -715.4703369140625, "loss": 0.3608, "rewards/accuracies": 0.75, "rewards/chosen": -1.2677977085113525, "rewards/margins": 1.8319288492202759, "rewards/rejected": -3.099726676940918, "step": 752 }, { "epoch": 0.65, "grad_norm": 38.93281498713387, "learning_rate": 6.722460305308367e-07, "logits/chosen": -1.213770866394043, "logits/rejected": -1.1902894973754883, "logps/chosen": -570.1568603515625, "logps/rejected": -643.709228515625, "loss": 0.254, "rewards/accuracies": 0.875, "rewards/chosen": -1.2748959064483643, "rewards/margins": 1.9360785484313965, "rewards/rejected": -3.21097469329834, "step": 753 }, { "epoch": 0.65, "grad_norm": 54.254456805570406, "learning_rate": 6.694180852856253e-07, "logits/chosen": -1.1973718404769897, "logits/rejected": -1.1430859565734863, "logps/chosen": -592.2709350585938, "logps/rejected": -756.05615234375, "loss": 0.243, "rewards/accuracies": 0.875, "rewards/chosen": -1.6226004362106323, "rewards/margins": 2.1562705039978027, "rewards/rejected": -3.7788708209991455, "step": 754 }, { "epoch": 0.65, "grad_norm": 106.8955122637892, "learning_rate": 6.665931050597859e-07, "logits/chosen": -1.2209007740020752, "logits/rejected": -1.1845306158065796, "logps/chosen": -610.568359375, "logps/rejected": -641.6486206054688, "loss": 0.7744, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4062316417694092, "rewards/margins": 1.0251706838607788, "rewards/rejected": -2.4314022064208984, "step": 755 }, { "epoch": 0.65, "grad_norm": 71.75036324422791, "learning_rate": 6.637711151908239e-07, "logits/chosen": -1.2332406044006348, "logits/rejected": -1.163503646850586, "logps/chosen": -399.26348876953125, "logps/rejected": -641.740478515625, "loss": 0.4403, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0247321128845215, "rewards/margins": 2.3649094104766846, "rewards/rejected": -3.389641523361206, "step": 756 }, { "epoch": 0.65, "grad_norm": 55.89352128487458, "learning_rate": 6.609521409894237e-07, "logits/chosen": -1.2482631206512451, "logits/rejected": -1.1869556903839111, "logps/chosen": -506.63739013671875, "logps/rejected": -672.4251708984375, "loss": 0.1912, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8384074568748474, "rewards/margins": 2.9175162315368652, "rewards/rejected": -3.7559235095977783, "step": 757 }, { "epoch": 0.65, "grad_norm": 36.236798796376284, "learning_rate": 6.58136207739223e-07, "logits/chosen": -1.2353761196136475, "logits/rejected": -1.1979963779449463, "logps/chosen": -294.13189697265625, "logps/rejected": -533.3206787109375, "loss": 0.2674, "rewards/accuracies": 0.875, "rewards/chosen": -0.5782788395881653, "rewards/margins": 2.0374372005462646, "rewards/rejected": -2.615715980529785, "step": 758 }, { "epoch": 0.65, "grad_norm": 49.56045920531742, "learning_rate": 6.553233406965834e-07, "logits/chosen": -1.2530863285064697, "logits/rejected": -1.1859345436096191, "logps/chosen": -343.58172607421875, "logps/rejected": -638.3863525390625, "loss": 0.3254, "rewards/accuracies": 0.75, "rewards/chosen": -0.8800230026245117, "rewards/margins": 2.761399745941162, "rewards/rejected": -3.6414225101470947, "step": 759 }, { "epoch": 0.65, "grad_norm": 78.49067555425536, "learning_rate": 6.525135650903666e-07, "logits/chosen": -1.1959669589996338, "logits/rejected": -1.142458200454712, "logps/chosen": -699.4346313476562, "logps/rejected": -895.7913818359375, "loss": 0.3717, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4892971515655518, "rewards/margins": 2.3924736976623535, "rewards/rejected": -3.8817708492279053, "step": 760 }, { "epoch": 0.65, "grad_norm": 80.9773466833384, "learning_rate": 6.497069061217064e-07, "logits/chosen": -1.2011051177978516, "logits/rejected": -1.1678533554077148, "logps/chosen": -499.91290283203125, "logps/rejected": -533.1343383789062, "loss": 0.6389, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1470344066619873, "rewards/margins": 1.2944519519805908, "rewards/rejected": -2.441486358642578, "step": 761 }, { "epoch": 0.65, "grad_norm": 71.52760626195689, "learning_rate": 6.469033889637837e-07, "logits/chosen": -1.20949387550354, "logits/rejected": -1.177109956741333, "logps/chosen": -533.7897338867188, "logps/rejected": -590.6302490234375, "loss": 0.3683, "rewards/accuracies": 0.75, "rewards/chosen": -0.932750940322876, "rewards/margins": 2.078059196472168, "rewards/rejected": -3.010810136795044, "step": 762 }, { "epoch": 0.65, "grad_norm": 40.547912487993926, "learning_rate": 6.44103038761599e-07, "logits/chosen": -1.2746690511703491, "logits/rejected": -1.2152957916259766, "logps/chosen": -429.7515869140625, "logps/rejected": -710.8656616210938, "loss": 0.2478, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9441927075386047, "rewards/margins": 2.390007495880127, "rewards/rejected": -3.334199905395508, "step": 763 }, { "epoch": 0.66, "grad_norm": 58.98554065072182, "learning_rate": 6.413058806317495e-07, "logits/chosen": -1.2609431743621826, "logits/rejected": -1.2304378747940063, "logps/chosen": -449.83343505859375, "logps/rejected": -550.3903198242188, "loss": 0.268, "rewards/accuracies": 0.875, "rewards/chosen": -1.0245052576065063, "rewards/margins": 2.3938255310058594, "rewards/rejected": -3.4183309078216553, "step": 764 }, { "epoch": 0.66, "grad_norm": 20.32757958785188, "learning_rate": 6.385119396622021e-07, "logits/chosen": -1.2361825704574585, "logits/rejected": -1.1875343322753906, "logps/chosen": -436.0609130859375, "logps/rejected": -726.2998046875, "loss": 0.13, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5165493488311768, "rewards/margins": 3.3205623626708984, "rewards/rejected": -3.8371119499206543, "step": 765 }, { "epoch": 0.66, "grad_norm": 40.364014159059444, "learning_rate": 6.357212409120678e-07, "logits/chosen": -1.2350900173187256, "logits/rejected": -1.186276912689209, "logps/chosen": -407.8211669921875, "logps/rejected": -741.8107299804688, "loss": 0.1741, "rewards/accuracies": 0.875, "rewards/chosen": -0.8670476078987122, "rewards/margins": 3.295076370239258, "rewards/rejected": -4.162123680114746, "step": 766 }, { "epoch": 0.66, "grad_norm": 16.2455480762992, "learning_rate": 6.329338094113784e-07, "logits/chosen": -1.2227864265441895, "logits/rejected": -1.1488749980926514, "logps/chosen": -572.5362548828125, "logps/rejected": -886.8587036132812, "loss": 0.0625, "rewards/accuracies": 1.0, "rewards/chosen": -0.9070994853973389, "rewards/margins": 3.629757881164551, "rewards/rejected": -4.536857604980469, "step": 767 }, { "epoch": 0.66, "grad_norm": 23.28049560328349, "learning_rate": 6.301496701608619e-07, "logits/chosen": -1.2397854328155518, "logits/rejected": -1.1634564399719238, "logps/chosen": -572.800048828125, "logps/rejected": -759.6251220703125, "loss": 0.1429, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8949357271194458, "rewards/margins": 2.677290439605713, "rewards/rejected": -3.5722262859344482, "step": 768 }, { "epoch": 0.66, "grad_norm": 78.84035838825896, "learning_rate": 6.273688481317174e-07, "logits/chosen": -1.2053560018539429, "logits/rejected": -1.1527836322784424, "logps/chosen": -603.7992553710938, "logps/rejected": -744.5968017578125, "loss": 0.447, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0674078464508057, "rewards/margins": 2.0926146507263184, "rewards/rejected": -3.160022497177124, "step": 769 }, { "epoch": 0.66, "grad_norm": 51.9242687714073, "learning_rate": 6.245913682653911e-07, "logits/chosen": -1.2491703033447266, "logits/rejected": -1.2077652215957642, "logps/chosen": -421.03265380859375, "logps/rejected": -591.1710205078125, "loss": 0.256, "rewards/accuracies": 0.875, "rewards/chosen": -1.1425645351409912, "rewards/margins": 2.417445182800293, "rewards/rejected": -3.560009717941284, "step": 770 }, { "epoch": 0.66, "grad_norm": 40.60942224554684, "learning_rate": 6.218172554733542e-07, "logits/chosen": -1.2798171043395996, "logits/rejected": -1.2115535736083984, "logps/chosen": -453.9419250488281, "logps/rejected": -695.1771240234375, "loss": 0.2188, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0825917720794678, "rewards/margins": 2.792531967163086, "rewards/rejected": -3.8751237392425537, "step": 771 }, { "epoch": 0.66, "grad_norm": 79.29510477081502, "learning_rate": 6.190465346368769e-07, "logits/chosen": -1.2755049467086792, "logits/rejected": -1.2568185329437256, "logps/chosen": -473.3320007324219, "logps/rejected": -498.23431396484375, "loss": 0.4651, "rewards/accuracies": 0.875, "rewards/chosen": -1.2007759809494019, "rewards/margins": 1.0075633525848389, "rewards/rejected": -2.208339214324951, "step": 772 }, { "epoch": 0.66, "grad_norm": 69.26698239236194, "learning_rate": 6.162792306068074e-07, "logits/chosen": -1.2635630369186401, "logits/rejected": -1.1861530542373657, "logps/chosen": -557.6167602539062, "logps/rejected": -946.214599609375, "loss": 0.253, "rewards/accuracies": 0.9375, "rewards/chosen": -1.246105670928955, "rewards/margins": 3.4181196689605713, "rewards/rejected": -4.6642255783081055, "step": 773 }, { "epoch": 0.66, "grad_norm": 44.00865651354992, "learning_rate": 6.135153682033488e-07, "logits/chosen": -1.2853941917419434, "logits/rejected": -1.1902930736541748, "logps/chosen": -329.769287109375, "logps/rejected": -618.7449951171875, "loss": 0.3086, "rewards/accuracies": 0.8125, "rewards/chosen": -0.47864830493927, "rewards/margins": 2.291872024536133, "rewards/rejected": -2.7705204486846924, "step": 774 }, { "epoch": 0.66, "grad_norm": 50.05200409581262, "learning_rate": 6.107549722158347e-07, "logits/chosen": -1.21261727809906, "logits/rejected": -1.1689696311950684, "logps/chosen": -507.6671142578125, "logps/rejected": -783.21337890625, "loss": 0.2574, "rewards/accuracies": 0.875, "rewards/chosen": -0.6551064848899841, "rewards/margins": 1.705780029296875, "rewards/rejected": -2.360886573791504, "step": 775 }, { "epoch": 0.67, "grad_norm": 63.05478198929676, "learning_rate": 6.079980674025085e-07, "logits/chosen": -1.1971309185028076, "logits/rejected": -1.1465520858764648, "logps/chosen": -626.6431274414062, "logps/rejected": -827.80908203125, "loss": 0.3393, "rewards/accuracies": 0.875, "rewards/chosen": -0.7310771942138672, "rewards/margins": 1.9147140979766846, "rewards/rejected": -2.645791530609131, "step": 776 }, { "epoch": 0.67, "grad_norm": 66.27914639678409, "learning_rate": 6.052446784903021e-07, "logits/chosen": -1.1706092357635498, "logits/rejected": -1.1214057207107544, "logps/chosen": -586.0625, "logps/rejected": -695.3425903320312, "loss": 0.326, "rewards/accuracies": 0.875, "rewards/chosen": -0.38804376125335693, "rewards/margins": 1.602970004081726, "rewards/rejected": -1.991013765335083, "step": 777 }, { "epoch": 0.67, "grad_norm": 62.401850509194375, "learning_rate": 6.024948301746111e-07, "logits/chosen": -1.234520673751831, "logits/rejected": -1.1878061294555664, "logps/chosen": -542.1024169921875, "logps/rejected": -711.7498779296875, "loss": 0.3367, "rewards/accuracies": 0.875, "rewards/chosen": -0.5028198957443237, "rewards/margins": 1.2911505699157715, "rewards/rejected": -1.7939705848693848, "step": 778 }, { "epoch": 0.67, "grad_norm": 91.22663071211362, "learning_rate": 5.997485471190764e-07, "logits/chosen": -1.1923151016235352, "logits/rejected": -1.1633234024047852, "logps/chosen": -523.5685424804688, "logps/rejected": -662.0013427734375, "loss": 0.5362, "rewards/accuracies": 0.625, "rewards/chosen": -0.9992001056671143, "rewards/margins": 0.7236025333404541, "rewards/rejected": -1.7228026390075684, "step": 779 }, { "epoch": 0.67, "grad_norm": 61.3655202551869, "learning_rate": 5.970058539553613e-07, "logits/chosen": -1.2179090976715088, "logits/rejected": -1.1775586605072021, "logps/chosen": -623.544921875, "logps/rejected": -763.564697265625, "loss": 0.2909, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5548088550567627, "rewards/margins": 1.5996613502502441, "rewards/rejected": -2.154470205307007, "step": 780 }, { "epoch": 0.67, "grad_norm": 70.28615878748617, "learning_rate": 5.942667752829317e-07, "logits/chosen": -1.218387246131897, "logits/rejected": -1.1726099252700806, "logps/chosen": -542.6781005859375, "logps/rejected": -712.7110595703125, "loss": 0.4253, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7333188056945801, "rewards/margins": 1.2428584098815918, "rewards/rejected": -1.9761772155761719, "step": 781 }, { "epoch": 0.67, "grad_norm": 68.19048994228498, "learning_rate": 5.915313356688339e-07, "logits/chosen": -1.2339286804199219, "logits/rejected": -1.1851835250854492, "logps/chosen": -525.4983520507812, "logps/rejected": -790.62353515625, "loss": 0.3123, "rewards/accuracies": 0.75, "rewards/chosen": -0.9128273725509644, "rewards/margins": 1.8884153366088867, "rewards/rejected": -2.8012425899505615, "step": 782 }, { "epoch": 0.67, "grad_norm": 57.37838326558948, "learning_rate": 5.887995596474748e-07, "logits/chosen": -1.2295994758605957, "logits/rejected": -1.1778643131256104, "logps/chosen": -520.6300048828125, "logps/rejected": -704.8819580078125, "loss": 0.3803, "rewards/accuracies": 0.75, "rewards/chosen": -1.1053333282470703, "rewards/margins": 1.898297905921936, "rewards/rejected": -3.003631114959717, "step": 783 }, { "epoch": 0.67, "grad_norm": 37.27334657073891, "learning_rate": 5.86071471720404e-07, "logits/chosen": -1.2449480295181274, "logits/rejected": -1.2495803833007812, "logps/chosen": -420.33740234375, "logps/rejected": -505.73748779296875, "loss": 0.2564, "rewards/accuracies": 1.0, "rewards/chosen": -0.5788464546203613, "rewards/margins": 1.8256258964538574, "rewards/rejected": -2.4044723510742188, "step": 784 }, { "epoch": 0.67, "grad_norm": 55.97231080048403, "learning_rate": 5.83347096356091e-07, "logits/chosen": -1.3385541439056396, "logits/rejected": -1.30124032497406, "logps/chosen": -448.4632263183594, "logps/rejected": -502.9323425292969, "loss": 0.3686, "rewards/accuracies": 0.8125, "rewards/chosen": -1.049405813217163, "rewards/margins": 1.4481611251831055, "rewards/rejected": -2.4975671768188477, "step": 785 }, { "epoch": 0.67, "grad_norm": 55.72959258812196, "learning_rate": 5.806264579897063e-07, "logits/chosen": -1.302438735961914, "logits/rejected": -1.2615262269973755, "logps/chosen": -472.30706787109375, "logps/rejected": -655.6341552734375, "loss": 0.3095, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8192142248153687, "rewards/margins": 2.3826041221618652, "rewards/rejected": -3.2018184661865234, "step": 786 }, { "epoch": 0.67, "grad_norm": 26.834919776565844, "learning_rate": 5.779095810229051e-07, "logits/chosen": -1.1845002174377441, "logits/rejected": -1.153458595275879, "logps/chosen": -582.74365234375, "logps/rejected": -814.0584106445312, "loss": 0.0881, "rewards/accuracies": 1.0, "rewards/chosen": -1.0058015584945679, "rewards/margins": 3.220125675201416, "rewards/rejected": -4.225927352905273, "step": 787 }, { "epoch": 0.68, "grad_norm": 64.87831875624222, "learning_rate": 5.751964898236039e-07, "logits/chosen": -1.2975207567214966, "logits/rejected": -1.2309163808822632, "logps/chosen": -587.638671875, "logps/rejected": -691.3016357421875, "loss": 0.4853, "rewards/accuracies": 0.75, "rewards/chosen": -1.2252812385559082, "rewards/margins": 1.7551825046539307, "rewards/rejected": -2.980463743209839, "step": 788 }, { "epoch": 0.68, "grad_norm": 54.54179421648986, "learning_rate": 5.724872087257656e-07, "logits/chosen": -1.2868032455444336, "logits/rejected": -1.2407780885696411, "logps/chosen": -579.31005859375, "logps/rejected": -694.39599609375, "loss": 0.2304, "rewards/accuracies": 0.875, "rewards/chosen": -1.182490587234497, "rewards/margins": 2.7682316303253174, "rewards/rejected": -3.9507226943969727, "step": 789 }, { "epoch": 0.68, "grad_norm": 51.9322318937397, "learning_rate": 5.697817620291798e-07, "logits/chosen": -1.3190757036209106, "logits/rejected": -1.2874265909194946, "logps/chosen": -441.6192626953125, "logps/rejected": -552.3643798828125, "loss": 0.2837, "rewards/accuracies": 0.875, "rewards/chosen": -1.0770200490951538, "rewards/margins": 2.20762300491333, "rewards/rejected": -3.2846431732177734, "step": 790 }, { "epoch": 0.68, "grad_norm": 33.304444234616746, "learning_rate": 5.670801739992448e-07, "logits/chosen": -1.2871630191802979, "logits/rejected": -1.2591633796691895, "logps/chosen": -426.33038330078125, "logps/rejected": -657.7274169921875, "loss": 0.219, "rewards/accuracies": 0.875, "rewards/chosen": -0.49022212624549866, "rewards/margins": 2.8954200744628906, "rewards/rejected": -3.3856422901153564, "step": 791 }, { "epoch": 0.68, "grad_norm": 77.92285907029817, "learning_rate": 5.643824688667505e-07, "logits/chosen": -1.277097463607788, "logits/rejected": -1.2177878618240356, "logps/chosen": -528.1997680664062, "logps/rejected": -846.9409790039062, "loss": 0.3319, "rewards/accuracies": 0.875, "rewards/chosen": -1.0698024034500122, "rewards/margins": 3.044430732727051, "rewards/rejected": -4.114233016967773, "step": 792 }, { "epoch": 0.68, "grad_norm": 63.01592387354561, "learning_rate": 5.616886708276603e-07, "logits/chosen": -1.3257449865341187, "logits/rejected": -1.2598521709442139, "logps/chosen": -580.3109130859375, "logps/rejected": -764.5173950195312, "loss": 0.2923, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9693795442581177, "rewards/margins": 2.5392367839813232, "rewards/rejected": -3.5086162090301514, "step": 793 }, { "epoch": 0.68, "grad_norm": 88.32330940362355, "learning_rate": 5.589988040428946e-07, "logits/chosen": -1.2941094636917114, "logits/rejected": -1.233504295349121, "logps/chosen": -569.9989624023438, "logps/rejected": -704.6729736328125, "loss": 0.4141, "rewards/accuracies": 0.875, "rewards/chosen": -1.2531591653823853, "rewards/margins": 2.2078423500061035, "rewards/rejected": -3.4610016345977783, "step": 794 }, { "epoch": 0.68, "grad_norm": 24.135022838491736, "learning_rate": 5.563128926381149e-07, "logits/chosen": -1.2901718616485596, "logits/rejected": -1.1893861293792725, "logps/chosen": -485.58612060546875, "logps/rejected": -867.0072021484375, "loss": 0.1402, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3576972484588623, "rewards/margins": 3.1181297302246094, "rewards/rejected": -4.475826740264893, "step": 795 }, { "epoch": 0.68, "grad_norm": 49.316205154733275, "learning_rate": 5.536309607035042e-07, "logits/chosen": -1.315154790878296, "logits/rejected": -1.2445651292800903, "logps/chosen": -363.44677734375, "logps/rejected": -587.053955078125, "loss": 0.2735, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1718934774398804, "rewards/margins": 2.271083116531372, "rewards/rejected": -3.442976713180542, "step": 796 }, { "epoch": 0.68, "grad_norm": 114.13517381128635, "learning_rate": 5.509530322935564e-07, "logits/chosen": -1.2886552810668945, "logits/rejected": -1.2342393398284912, "logps/chosen": -513.4869384765625, "logps/rejected": -628.4658203125, "loss": 0.6908, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5586981773376465, "rewards/margins": 1.7595255374908447, "rewards/rejected": -3.318223476409912, "step": 797 }, { "epoch": 0.68, "grad_norm": 51.628883615524664, "learning_rate": 5.482791314268558e-07, "logits/chosen": -1.3084138631820679, "logits/rejected": -1.249441385269165, "logps/chosen": -418.89837646484375, "logps/rejected": -603.8793334960938, "loss": 0.2867, "rewards/accuracies": 0.875, "rewards/chosen": -0.818135142326355, "rewards/margins": 2.3880748748779297, "rewards/rejected": -3.206209659576416, "step": 798 }, { "epoch": 0.69, "grad_norm": 99.95898090957525, "learning_rate": 5.456092820858619e-07, "logits/chosen": -1.317805528640747, "logits/rejected": -1.304664969444275, "logps/chosen": -563.1351318359375, "logps/rejected": -533.6165161132812, "loss": 0.7847, "rewards/accuracies": 0.875, "rewards/chosen": -1.563063383102417, "rewards/margins": 1.5004191398620605, "rewards/rejected": -3.0634822845458984, "step": 799 }, { "epoch": 0.69, "grad_norm": 123.1394399884689, "learning_rate": 5.429435082166991e-07, "logits/chosen": -1.294856309890747, "logits/rejected": -1.2487471103668213, "logps/chosen": -667.078125, "logps/rejected": -792.073486328125, "loss": 0.3631, "rewards/accuracies": 0.875, "rewards/chosen": -1.8730590343475342, "rewards/margins": 1.7214754819869995, "rewards/rejected": -3.5945346355438232, "step": 800 }, { "epoch": 0.69, "grad_norm": 32.95676078668431, "learning_rate": 5.402818337289352e-07, "logits/chosen": -1.2985724210739136, "logits/rejected": -1.234189748764038, "logps/chosen": -457.0745849609375, "logps/rejected": -772.1397705078125, "loss": 0.2344, "rewards/accuracies": 0.875, "rewards/chosen": -0.9183647632598877, "rewards/margins": 3.1553783416748047, "rewards/rejected": -4.073742866516113, "step": 801 }, { "epoch": 0.69, "grad_norm": 30.11951749501376, "learning_rate": 5.376242824953718e-07, "logits/chosen": -1.3676421642303467, "logits/rejected": -1.3023195266723633, "logps/chosen": -296.14837646484375, "logps/rejected": -550.3222045898438, "loss": 0.2515, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8110196590423584, "rewards/margins": 2.414071798324585, "rewards/rejected": -3.2250914573669434, "step": 802 }, { "epoch": 0.69, "grad_norm": 37.1130777897999, "learning_rate": 5.349708783518296e-07, "logits/chosen": -1.3127175569534302, "logits/rejected": -1.2090966701507568, "logps/chosen": -555.64892578125, "logps/rejected": -886.0833740234375, "loss": 0.1507, "rewards/accuracies": 0.9375, "rewards/chosen": -1.431624412536621, "rewards/margins": 2.953738212585449, "rewards/rejected": -4.38536262512207, "step": 803 }, { "epoch": 0.69, "grad_norm": 135.12316330777904, "learning_rate": 5.323216450969315e-07, "logits/chosen": -1.256744384765625, "logits/rejected": -1.267289400100708, "logps/chosen": -616.8837280273438, "logps/rejected": -545.8270874023438, "loss": 1.0152, "rewards/accuracies": 0.75, "rewards/chosen": -2.1458804607391357, "rewards/margins": 0.6963490843772888, "rewards/rejected": -2.8422296047210693, "step": 804 }, { "epoch": 0.69, "grad_norm": 45.54737697343092, "learning_rate": 5.296766064918928e-07, "logits/chosen": -1.308955192565918, "logits/rejected": -1.2631933689117432, "logps/chosen": -527.814208984375, "logps/rejected": -647.0279541015625, "loss": 0.2665, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3599857091903687, "rewards/margins": 1.8339883089065552, "rewards/rejected": -3.193974018096924, "step": 805 }, { "epoch": 0.69, "grad_norm": 54.18978295900378, "learning_rate": 5.270357862603061e-07, "logits/chosen": -1.233064889907837, "logits/rejected": -1.1992673873901367, "logps/chosen": -444.14178466796875, "logps/rejected": -639.1141357421875, "loss": 0.2421, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0077587366104126, "rewards/margins": 2.1859450340270996, "rewards/rejected": -3.1937036514282227, "step": 806 }, { "epoch": 0.69, "grad_norm": 39.59007654916537, "learning_rate": 5.243992080879291e-07, "logits/chosen": -1.2696118354797363, "logits/rejected": -1.2163591384887695, "logps/chosen": -522.70751953125, "logps/rejected": -730.43896484375, "loss": 0.2281, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2305470705032349, "rewards/margins": 2.6388511657714844, "rewards/rejected": -3.869398593902588, "step": 807 }, { "epoch": 0.69, "grad_norm": 26.841044346191513, "learning_rate": 5.217668956224724e-07, "logits/chosen": -1.2543085813522339, "logits/rejected": -1.1556754112243652, "logps/chosen": -520.13916015625, "logps/rejected": -780.648681640625, "loss": 0.1559, "rewards/accuracies": 0.875, "rewards/chosen": -1.1710164546966553, "rewards/margins": 3.7417702674865723, "rewards/rejected": -4.912786483764648, "step": 808 }, { "epoch": 0.69, "grad_norm": 62.724830798722856, "learning_rate": 5.191388724733866e-07, "logits/chosen": -1.2515757083892822, "logits/rejected": -1.2258267402648926, "logps/chosen": -633.2823486328125, "logps/rejected": -840.8907470703125, "loss": 0.2973, "rewards/accuracies": 0.875, "rewards/chosen": -1.084636926651001, "rewards/margins": 3.320814847946167, "rewards/rejected": -4.40545129776001, "step": 809 }, { "epoch": 0.69, "grad_norm": 30.28960633435741, "learning_rate": 5.165151622116513e-07, "logits/chosen": -1.3251980543136597, "logits/rejected": -1.2799733877182007, "logps/chosen": -366.42340087890625, "logps/rejected": -616.395751953125, "loss": 0.3236, "rewards/accuracies": 0.875, "rewards/chosen": -1.1137187480926514, "rewards/margins": 2.0565237998962402, "rewards/rejected": -3.1702423095703125, "step": 810 }, { "epoch": 0.7, "grad_norm": 58.48126129769298, "learning_rate": 5.138957883695636e-07, "logits/chosen": -1.3073029518127441, "logits/rejected": -1.258333683013916, "logps/chosen": -563.4068603515625, "logps/rejected": -777.6580810546875, "loss": 0.2518, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3961129188537598, "rewards/margins": 2.564927577972412, "rewards/rejected": -3.961040496826172, "step": 811 }, { "epoch": 0.7, "grad_norm": 57.6106870953538, "learning_rate": 5.112807744405256e-07, "logits/chosen": -1.218122959136963, "logits/rejected": -1.1502938270568848, "logps/chosen": -655.7843627929688, "logps/rejected": -883.1055908203125, "loss": 0.1827, "rewards/accuracies": 0.9375, "rewards/chosen": -1.585040807723999, "rewards/margins": 3.1629416942596436, "rewards/rejected": -4.747982025146484, "step": 812 }, { "epoch": 0.7, "grad_norm": 100.19382275968717, "learning_rate": 5.08670143878837e-07, "logits/chosen": -1.3215523958206177, "logits/rejected": -1.2981047630310059, "logps/chosen": -634.1466064453125, "logps/rejected": -740.7958984375, "loss": 0.7425, "rewards/accuracies": 0.6875, "rewards/chosen": -1.9333231449127197, "rewards/margins": 1.4622352123260498, "rewards/rejected": -3.3955583572387695, "step": 813 }, { "epoch": 0.7, "grad_norm": 72.74885728218051, "learning_rate": 5.060639200994818e-07, "logits/chosen": -1.283916711807251, "logits/rejected": -1.3019851446151733, "logps/chosen": -675.86181640625, "logps/rejected": -665.9303588867188, "loss": 0.3831, "rewards/accuracies": 0.875, "rewards/chosen": -1.5731701850891113, "rewards/margins": 1.503868579864502, "rewards/rejected": -3.0770387649536133, "step": 814 }, { "epoch": 0.7, "grad_norm": 48.932796076182605, "learning_rate": 5.034621264779178e-07, "logits/chosen": -1.2343616485595703, "logits/rejected": -1.1905256509780884, "logps/chosen": -759.7833251953125, "logps/rejected": -1085.231201171875, "loss": 0.1768, "rewards/accuracies": 0.875, "rewards/chosen": -1.4716095924377441, "rewards/margins": 3.619154691696167, "rewards/rejected": -5.090764045715332, "step": 815 }, { "epoch": 0.7, "grad_norm": 85.90267594791926, "learning_rate": 5.008647863498709e-07, "logits/chosen": -1.3507819175720215, "logits/rejected": -1.2976577281951904, "logps/chosen": -445.6289978027344, "logps/rejected": -615.7457885742188, "loss": 0.4761, "rewards/accuracies": 0.875, "rewards/chosen": -1.3056609630584717, "rewards/margins": 1.7964327335357666, "rewards/rejected": -3.1020936965942383, "step": 816 }, { "epoch": 0.7, "grad_norm": 41.18656467647798, "learning_rate": 4.982719230111208e-07, "logits/chosen": -1.2840217351913452, "logits/rejected": -1.2977279424667358, "logps/chosen": -477.98187255859375, "logps/rejected": -559.2445068359375, "loss": 0.3443, "rewards/accuracies": 0.8125, "rewards/chosen": -0.997779130935669, "rewards/margins": 1.459186315536499, "rewards/rejected": -2.456965446472168, "step": 817 }, { "epoch": 0.7, "grad_norm": 75.45175286746739, "learning_rate": 4.956835597172954e-07, "logits/chosen": -1.2445216178894043, "logits/rejected": -1.1829955577850342, "logps/chosen": -594.428955078125, "logps/rejected": -779.2150268554688, "loss": 0.1929, "rewards/accuracies": 0.9375, "rewards/chosen": -2.097069025039673, "rewards/margins": 2.3932814598083496, "rewards/rejected": -4.490350723266602, "step": 818 }, { "epoch": 0.7, "grad_norm": 45.938823252272236, "learning_rate": 4.930997196836624e-07, "logits/chosen": -1.336355447769165, "logits/rejected": -1.2834736108779907, "logps/chosen": -324.4188232421875, "logps/rejected": -535.7325439453125, "loss": 0.2532, "rewards/accuracies": 0.9375, "rewards/chosen": -0.83463454246521, "rewards/margins": 2.159453868865967, "rewards/rejected": -2.994088649749756, "step": 819 }, { "epoch": 0.7, "grad_norm": 58.34911238362092, "learning_rate": 4.905204260849183e-07, "logits/chosen": -1.3637986183166504, "logits/rejected": -1.3230304718017578, "logps/chosen": -612.1046752929688, "logps/rejected": -678.5731201171875, "loss": 0.2828, "rewards/accuracies": 0.9375, "rewards/chosen": -1.068753957748413, "rewards/margins": 2.6097583770751953, "rewards/rejected": -3.6785125732421875, "step": 820 }, { "epoch": 0.7, "grad_norm": 58.100590563616066, "learning_rate": 4.879457020549827e-07, "logits/chosen": -1.3381030559539795, "logits/rejected": -1.3141615390777588, "logps/chosen": -430.4180908203125, "logps/rejected": -661.449462890625, "loss": 0.234, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5537689924240112, "rewards/margins": 2.3485374450683594, "rewards/rejected": -3.90230655670166, "step": 821 }, { "epoch": 0.7, "grad_norm": 38.786982025083816, "learning_rate": 4.853755706867907e-07, "logits/chosen": -1.3229811191558838, "logits/rejected": -1.273648977279663, "logps/chosen": -517.710205078125, "logps/rejected": -821.1460571289062, "loss": 0.18, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2663819789886475, "rewards/margins": 3.2408554553985596, "rewards/rejected": -4.507237434387207, "step": 822 }, { "epoch": 0.71, "grad_norm": 52.03349499015386, "learning_rate": 4.828100550320852e-07, "logits/chosen": -1.2744662761688232, "logits/rejected": -1.265845775604248, "logps/chosen": -756.6103515625, "logps/rejected": -750.5546264648438, "loss": 0.2136, "rewards/accuracies": 1.0, "rewards/chosen": -1.724693775177002, "rewards/margins": 2.322697401046753, "rewards/rejected": -4.047390937805176, "step": 823 }, { "epoch": 0.71, "grad_norm": 29.282105070034646, "learning_rate": 4.802491781012101e-07, "logits/chosen": -1.3247438669204712, "logits/rejected": -1.255344033241272, "logps/chosen": -471.8821105957031, "logps/rejected": -746.1473999023438, "loss": 0.1789, "rewards/accuracies": 0.875, "rewards/chosen": -1.132800817489624, "rewards/margins": 3.1457228660583496, "rewards/rejected": -4.278523921966553, "step": 824 }, { "epoch": 0.71, "grad_norm": 37.228174031486915, "learning_rate": 4.776929628629046e-07, "logits/chosen": -1.3179203271865845, "logits/rejected": -1.2262849807739258, "logps/chosen": -446.9613952636719, "logps/rejected": -784.3031616210938, "loss": 0.1404, "rewards/accuracies": 1.0, "rewards/chosen": -1.2367494106292725, "rewards/margins": 3.3095202445983887, "rewards/rejected": -4.546269416809082, "step": 825 }, { "epoch": 0.71, "grad_norm": 37.98624611657895, "learning_rate": 4.7514143224409654e-07, "logits/chosen": -1.303385615348816, "logits/rejected": -1.2276699542999268, "logps/chosen": -395.5613708496094, "logps/rejected": -729.7018432617188, "loss": 0.171, "rewards/accuracies": 1.0, "rewards/chosen": -1.738844633102417, "rewards/margins": 3.0277833938598633, "rewards/rejected": -4.766628265380859, "step": 826 }, { "epoch": 0.71, "grad_norm": 45.503230516625734, "learning_rate": 4.7259460912969717e-07, "logits/chosen": -1.3405150175094604, "logits/rejected": -1.2592682838439941, "logps/chosen": -396.72015380859375, "logps/rejected": -673.91015625, "loss": 0.2404, "rewards/accuracies": 0.875, "rewards/chosen": -1.2170133590698242, "rewards/margins": 2.498927116394043, "rewards/rejected": -3.715940475463867, "step": 827 }, { "epoch": 0.71, "grad_norm": 31.217943894028497, "learning_rate": 4.700525163623944e-07, "logits/chosen": -1.376317024230957, "logits/rejected": -1.3095309734344482, "logps/chosen": -411.83831787109375, "logps/rejected": -690.10498046875, "loss": 0.1662, "rewards/accuracies": 1.0, "rewards/chosen": -1.012393832206726, "rewards/margins": 2.9905548095703125, "rewards/rejected": -4.002948760986328, "step": 828 }, { "epoch": 0.71, "grad_norm": 74.58665440702462, "learning_rate": 4.6751517674245155e-07, "logits/chosen": -1.3186997175216675, "logits/rejected": -1.2811741828918457, "logps/chosen": -424.9061279296875, "logps/rejected": -555.0997314453125, "loss": 0.4596, "rewards/accuracies": 0.75, "rewards/chosen": -1.0757957696914673, "rewards/margins": 1.781997561454773, "rewards/rejected": -2.8577933311462402, "step": 829 }, { "epoch": 0.71, "grad_norm": 36.606778713254194, "learning_rate": 4.649826130274993e-07, "logits/chosen": -1.2815415859222412, "logits/rejected": -1.2470782995224, "logps/chosen": -649.908935546875, "logps/rejected": -758.615234375, "loss": 0.1889, "rewards/accuracies": 1.0, "rewards/chosen": -1.5537364482879639, "rewards/margins": 2.073343276977539, "rewards/rejected": -3.627079486846924, "step": 830 }, { "epoch": 0.71, "grad_norm": 80.61121666736041, "learning_rate": 4.624548479323317e-07, "logits/chosen": -1.265627145767212, "logits/rejected": -1.2178000211715698, "logps/chosen": -451.3035583496094, "logps/rejected": -690.942626953125, "loss": 0.5877, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1930776834487915, "rewards/margins": 2.1340413093566895, "rewards/rejected": -3.3271191120147705, "step": 831 }, { "epoch": 0.71, "grad_norm": 57.27185180266218, "learning_rate": 4.5993190412870596e-07, "logits/chosen": -1.3708715438842773, "logits/rejected": -1.2856638431549072, "logps/chosen": -385.2818298339844, "logps/rejected": -777.6578369140625, "loss": 0.3799, "rewards/accuracies": 0.875, "rewards/chosen": -1.0449188947677612, "rewards/margins": 2.955021858215332, "rewards/rejected": -3.999940872192383, "step": 832 }, { "epoch": 0.71, "grad_norm": 54.264431774671195, "learning_rate": 4.574138042451344e-07, "logits/chosen": -1.3288730382919312, "logits/rejected": -1.240283489227295, "logps/chosen": -722.72998046875, "logps/rejected": -1019.0667114257812, "loss": 0.2717, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7918155193328857, "rewards/margins": 2.536999464035034, "rewards/rejected": -4.32881498336792, "step": 833 }, { "epoch": 0.72, "grad_norm": 48.72019081686653, "learning_rate": 4.549005708666852e-07, "logits/chosen": -1.295773983001709, "logits/rejected": -1.2186580896377563, "logps/chosen": -560.7127685546875, "logps/rejected": -935.4384155273438, "loss": 0.2302, "rewards/accuracies": 0.875, "rewards/chosen": -1.2861629724502563, "rewards/margins": 3.5684313774108887, "rewards/rejected": -4.854594707489014, "step": 834 }, { "epoch": 0.72, "grad_norm": 53.292990955117695, "learning_rate": 4.523922265347778e-07, "logits/chosen": -1.2727816104888916, "logits/rejected": -1.2295081615447998, "logps/chosen": -543.6903076171875, "logps/rejected": -791.6380004882812, "loss": 0.3634, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2239705324172974, "rewards/margins": 2.2526931762695312, "rewards/rejected": -3.476663589477539, "step": 835 }, { "epoch": 0.72, "grad_norm": 43.02721104114026, "learning_rate": 4.4988879374698166e-07, "logits/chosen": -1.3318668603897095, "logits/rejected": -1.2620775699615479, "logps/chosen": -580.2116088867188, "logps/rejected": -836.694091796875, "loss": 0.1654, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3591465950012207, "rewards/margins": 3.187530040740967, "rewards/rejected": -4.5466766357421875, "step": 836 }, { "epoch": 0.72, "grad_norm": 40.26525761919001, "learning_rate": 4.473902949568138e-07, "logits/chosen": -1.2813338041305542, "logits/rejected": -1.2284204959869385, "logps/chosen": -586.521240234375, "logps/rejected": -773.0760498046875, "loss": 0.2007, "rewards/accuracies": 1.0, "rewards/chosen": -1.4743058681488037, "rewards/margins": 2.6700499057769775, "rewards/rejected": -4.144355773925781, "step": 837 }, { "epoch": 0.72, "grad_norm": 72.74773020712323, "learning_rate": 4.4489675257353807e-07, "logits/chosen": -1.314551591873169, "logits/rejected": -1.2525804042816162, "logps/chosen": -429.6893615722656, "logps/rejected": -571.6607666015625, "loss": 0.4142, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7104672193527222, "rewards/margins": 1.4704867601394653, "rewards/rejected": -3.1809539794921875, "step": 838 }, { "epoch": 0.72, "grad_norm": 55.78038788914927, "learning_rate": 4.424081889619639e-07, "logits/chosen": -1.3101136684417725, "logits/rejected": -1.2543182373046875, "logps/chosen": -417.7502746582031, "logps/rejected": -665.982177734375, "loss": 0.2769, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3031742572784424, "rewards/margins": 2.185793399810791, "rewards/rejected": -3.4889674186706543, "step": 839 }, { "epoch": 0.72, "grad_norm": 45.93818042291123, "learning_rate": 4.3992462644224515e-07, "logits/chosen": -1.3367724418640137, "logits/rejected": -1.2911133766174316, "logps/chosen": -446.1727600097656, "logps/rejected": -677.7947998046875, "loss": 0.2157, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2187086343765259, "rewards/margins": 2.477630138397217, "rewards/rejected": -3.6963391304016113, "step": 840 }, { "epoch": 0.72, "grad_norm": 33.162499393664156, "learning_rate": 4.37446087289681e-07, "logits/chosen": -1.30533766746521, "logits/rejected": -1.2600688934326172, "logps/chosen": -485.5511474609375, "logps/rejected": -687.138671875, "loss": 0.2553, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8878797888755798, "rewards/margins": 2.69405460357666, "rewards/rejected": -3.5819342136383057, "step": 841 }, { "epoch": 0.72, "grad_norm": 32.77410242264212, "learning_rate": 4.3497259373451533e-07, "logits/chosen": -1.2432332038879395, "logits/rejected": -1.2274351119995117, "logps/chosen": -525.11083984375, "logps/rejected": -652.7245483398438, "loss": 0.2081, "rewards/accuracies": 0.9375, "rewards/chosen": -0.685070276260376, "rewards/margins": 2.884859085083008, "rewards/rejected": -3.5699291229248047, "step": 842 }, { "epoch": 0.72, "grad_norm": 89.81416550928549, "learning_rate": 4.3250416796173806e-07, "logits/chosen": -1.2505658864974976, "logits/rejected": -1.2060911655426025, "logps/chosen": -691.4589233398438, "logps/rejected": -829.2577514648438, "loss": 0.5968, "rewards/accuracies": 0.875, "rewards/chosen": -1.898880958557129, "rewards/margins": 1.7532389163970947, "rewards/rejected": -3.6521201133728027, "step": 843 }, { "epoch": 0.72, "grad_norm": 81.97624676646213, "learning_rate": 4.3004083211088414e-07, "logits/chosen": -1.2881934642791748, "logits/rejected": -1.2460763454437256, "logps/chosen": -575.97802734375, "logps/rejected": -724.1553955078125, "loss": 0.4642, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2311291694641113, "rewards/margins": 1.8496699333190918, "rewards/rejected": -3.080799102783203, "step": 844 }, { "epoch": 0.72, "grad_norm": 29.22772748448285, "learning_rate": 4.275826082758388e-07, "logits/chosen": -1.2812985181808472, "logits/rejected": -1.2197277545928955, "logps/chosen": -618.3961791992188, "logps/rejected": -909.3668823242188, "loss": 0.1318, "rewards/accuracies": 1.0, "rewards/chosen": -1.0194859504699707, "rewards/margins": 2.808882236480713, "rewards/rejected": -3.8283684253692627, "step": 845 }, { "epoch": 0.73, "grad_norm": 61.829876899345294, "learning_rate": 4.2512951850463597e-07, "logits/chosen": -1.3472239971160889, "logits/rejected": -1.2804555892944336, "logps/chosen": -587.024169921875, "logps/rejected": -706.7734375, "loss": 0.293, "rewards/accuracies": 0.875, "rewards/chosen": -1.0534980297088623, "rewards/margins": 1.6623592376708984, "rewards/rejected": -2.7158570289611816, "step": 846 }, { "epoch": 0.73, "grad_norm": 53.24683135146778, "learning_rate": 4.226815847992611e-07, "logits/chosen": -1.2948989868164062, "logits/rejected": -1.263986349105835, "logps/chosen": -561.6561889648438, "logps/rejected": -683.1256103515625, "loss": 0.3061, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1184791326522827, "rewards/margins": 1.7602111101150513, "rewards/rejected": -2.878690481185913, "step": 847 }, { "epoch": 0.73, "grad_norm": 32.03500041273063, "learning_rate": 4.202388291154555e-07, "logits/chosen": -1.2769651412963867, "logits/rejected": -1.295407772064209, "logps/chosen": -470.678955078125, "logps/rejected": -634.0687255859375, "loss": 0.1691, "rewards/accuracies": 1.0, "rewards/chosen": -0.8327485918998718, "rewards/margins": 2.287118911743164, "rewards/rejected": -3.1198675632476807, "step": 848 }, { "epoch": 0.73, "grad_norm": 45.721064817373104, "learning_rate": 4.1780127336251767e-07, "logits/chosen": -1.311166524887085, "logits/rejected": -1.19183349609375, "logps/chosen": -445.4447937011719, "logps/rejected": -761.5078125, "loss": 0.2777, "rewards/accuracies": 0.875, "rewards/chosen": -0.7454906702041626, "rewards/margins": 2.716456174850464, "rewards/rejected": -3.461946964263916, "step": 849 }, { "epoch": 0.73, "grad_norm": 93.05220178000309, "learning_rate": 4.153689394031079e-07, "logits/chosen": -1.3191728591918945, "logits/rejected": -1.242370367050171, "logps/chosen": -508.0107421875, "logps/rejected": -761.6094970703125, "loss": 0.3951, "rewards/accuracies": 0.8125, "rewards/chosen": -1.194153070449829, "rewards/margins": 1.9952038526535034, "rewards/rejected": -3.189356803894043, "step": 850 }, { "epoch": 0.73, "grad_norm": 28.96983105061877, "learning_rate": 4.129418490530514e-07, "logits/chosen": -1.3243119716644287, "logits/rejected": -1.217747449874878, "logps/chosen": -496.8054504394531, "logps/rejected": -842.6761474609375, "loss": 0.1176, "rewards/accuracies": 1.0, "rewards/chosen": -0.863527238368988, "rewards/margins": 3.1362767219543457, "rewards/rejected": -3.9998040199279785, "step": 851 }, { "epoch": 0.73, "grad_norm": 37.45097658640633, "learning_rate": 4.1052002408114304e-07, "logits/chosen": -1.3475794792175293, "logits/rejected": -1.279633641242981, "logps/chosen": -495.43853759765625, "logps/rejected": -691.76220703125, "loss": 0.2384, "rewards/accuracies": 1.0, "rewards/chosen": -1.2313153743743896, "rewards/margins": 1.863783359527588, "rewards/rejected": -3.0950987339019775, "step": 852 }, { "epoch": 0.73, "grad_norm": 52.686708466661, "learning_rate": 4.081034862089523e-07, "logits/chosen": -1.2625312805175781, "logits/rejected": -1.2287614345550537, "logps/chosen": -537.3236694335938, "logps/rejected": -665.4468994140625, "loss": 0.2602, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6482099890708923, "rewards/margins": 1.9057555198669434, "rewards/rejected": -2.5539653301239014, "step": 853 }, { "epoch": 0.73, "grad_norm": 60.405374160574816, "learning_rate": 4.056922571106277e-07, "logits/chosen": -1.3058245182037354, "logits/rejected": -1.2516499757766724, "logps/chosen": -636.6150512695312, "logps/rejected": -859.4437255859375, "loss": 0.3198, "rewards/accuracies": 0.875, "rewards/chosen": -1.4464586973190308, "rewards/margins": 2.4538464546203613, "rewards/rejected": -3.9003052711486816, "step": 854 }, { "epoch": 0.73, "grad_norm": 64.8038400855151, "learning_rate": 4.0328635841270344e-07, "logits/chosen": -1.268088936805725, "logits/rejected": -1.2096847295761108, "logps/chosen": -548.0049438476562, "logps/rejected": -827.2142944335938, "loss": 0.2479, "rewards/accuracies": 0.875, "rewards/chosen": -1.17325758934021, "rewards/margins": 2.010125160217285, "rewards/rejected": -3.183382987976074, "step": 855 }, { "epoch": 0.73, "grad_norm": 69.11316301966247, "learning_rate": 4.0088581169390425e-07, "logits/chosen": -1.4180727005004883, "logits/rejected": -1.3446136713027954, "logps/chosen": -458.9594421386719, "logps/rejected": -664.6500244140625, "loss": 0.468, "rewards/accuracies": 0.6875, "rewards/chosen": -1.034346580505371, "rewards/margins": 1.9922888278961182, "rewards/rejected": -3.02663516998291, "step": 856 }, { "epoch": 0.73, "grad_norm": 43.37475914020421, "learning_rate": 3.984906384849529e-07, "logits/chosen": -1.2790015935897827, "logits/rejected": -1.2007935047149658, "logps/chosen": -709.7748413085938, "logps/rejected": -934.968994140625, "loss": 0.2759, "rewards/accuracies": 0.875, "rewards/chosen": -0.8545200228691101, "rewards/margins": 3.244429588317871, "rewards/rejected": -4.098949432373047, "step": 857 }, { "epoch": 0.74, "grad_norm": 20.579376552917985, "learning_rate": 3.9610086026837673e-07, "logits/chosen": -1.387350082397461, "logits/rejected": -1.2440000772476196, "logps/chosen": -405.1728515625, "logps/rejected": -705.4710693359375, "loss": 0.1538, "rewards/accuracies": 1.0, "rewards/chosen": -0.6755664348602295, "rewards/margins": 3.2075552940368652, "rewards/rejected": -3.883122205734253, "step": 858 }, { "epoch": 0.74, "grad_norm": 61.989156105540516, "learning_rate": 3.937164984783149e-07, "logits/chosen": -1.3607792854309082, "logits/rejected": -1.338577389717102, "logps/chosen": -448.86956787109375, "logps/rejected": -516.97705078125, "loss": 0.3516, "rewards/accuracies": 0.875, "rewards/chosen": -1.2934528589248657, "rewards/margins": 1.3286328315734863, "rewards/rejected": -2.6220855712890625, "step": 859 }, { "epoch": 0.74, "grad_norm": 113.66640912811468, "learning_rate": 3.9133757450032536e-07, "logits/chosen": -1.4172799587249756, "logits/rejected": -1.325761079788208, "logps/chosen": -598.533935546875, "logps/rejected": -704.5578002929688, "loss": 0.5744, "rewards/accuracies": 0.75, "rewards/chosen": -1.6968603134155273, "rewards/margins": 1.3225895166397095, "rewards/rejected": -3.0194497108459473, "step": 860 }, { "epoch": 0.74, "grad_norm": 109.10917904229667, "learning_rate": 3.889641096711943e-07, "logits/chosen": -1.2880518436431885, "logits/rejected": -1.231501817703247, "logps/chosen": -564.6885986328125, "logps/rejected": -741.1936645507812, "loss": 0.5165, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8741509914398193, "rewards/margins": 1.7373528480529785, "rewards/rejected": -3.611504077911377, "step": 861 }, { "epoch": 0.74, "grad_norm": 64.27319512225013, "learning_rate": 3.8659612527874574e-07, "logits/chosen": -1.3026931285858154, "logits/rejected": -1.3118447065353394, "logps/chosen": -641.9603881835938, "logps/rejected": -660.7064208984375, "loss": 0.3268, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2755794525146484, "rewards/margins": 1.8998128175735474, "rewards/rejected": -3.1753921508789062, "step": 862 }, { "epoch": 0.74, "grad_norm": 51.47387197981583, "learning_rate": 3.842336425616466e-07, "logits/chosen": -1.3199818134307861, "logits/rejected": -1.2690174579620361, "logps/chosen": -632.9410400390625, "logps/rejected": -703.441650390625, "loss": 0.3068, "rewards/accuracies": 0.8125, "rewards/chosen": -1.29012930393219, "rewards/margins": 2.181661605834961, "rewards/rejected": -3.4717910289764404, "step": 863 }, { "epoch": 0.74, "grad_norm": 33.07989943791176, "learning_rate": 3.818766827092201e-07, "logits/chosen": -1.3294596672058105, "logits/rejected": -1.2252604961395264, "logps/chosen": -441.6009521484375, "logps/rejected": -716.4881591796875, "loss": 0.1971, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9453874230384827, "rewards/margins": 2.805691719055176, "rewards/rejected": -3.7510790824890137, "step": 864 }, { "epoch": 0.74, "grad_norm": 43.328260504476305, "learning_rate": 3.795252668612554e-07, "logits/chosen": -1.3333276510238647, "logits/rejected": -1.2647820711135864, "logps/chosen": -502.2019348144531, "logps/rejected": -724.9705810546875, "loss": 0.1994, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1261513233184814, "rewards/margins": 2.8922533988952637, "rewards/rejected": -4.018404960632324, "step": 865 }, { "epoch": 0.74, "grad_norm": 58.03496627953685, "learning_rate": 3.771794161078148e-07, "logits/chosen": -1.3372564315795898, "logits/rejected": -1.2777786254882812, "logps/chosen": -499.2994384765625, "logps/rejected": -706.81787109375, "loss": 0.2921, "rewards/accuracies": 0.875, "rewards/chosen": -1.3794007301330566, "rewards/margins": 2.1858272552490234, "rewards/rejected": -3.56522798538208, "step": 866 }, { "epoch": 0.74, "grad_norm": 59.009855907870005, "learning_rate": 3.748391514890483e-07, "logits/chosen": -1.4050931930541992, "logits/rejected": -1.3332555294036865, "logps/chosen": -293.22882080078125, "logps/rejected": -503.4451904296875, "loss": 0.7504, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0884900093078613, "rewards/margins": 1.6704758405685425, "rewards/rejected": -2.7589659690856934, "step": 867 }, { "epoch": 0.74, "grad_norm": 46.32140136229227, "learning_rate": 3.7250449399500285e-07, "logits/chosen": -1.3360533714294434, "logits/rejected": -1.3012781143188477, "logps/chosen": -473.07305908203125, "logps/rejected": -656.3292236328125, "loss": 0.2358, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2797776460647583, "rewards/margins": 2.2085609436035156, "rewards/rejected": -3.4883384704589844, "step": 868 }, { "epoch": 0.75, "grad_norm": 57.81381369600409, "learning_rate": 3.701754645654347e-07, "logits/chosen": -1.3500280380249023, "logits/rejected": -1.3101742267608643, "logps/chosen": -582.2659912109375, "logps/rejected": -626.7835693359375, "loss": 0.2263, "rewards/accuracies": 0.875, "rewards/chosen": -0.7205829620361328, "rewards/margins": 2.4664716720581055, "rewards/rejected": -3.1870546340942383, "step": 869 }, { "epoch": 0.75, "grad_norm": 36.414745480971156, "learning_rate": 3.678520840896213e-07, "logits/chosen": -1.4011168479919434, "logits/rejected": -1.3437912464141846, "logps/chosen": -468.67303466796875, "logps/rejected": -627.2376708984375, "loss": 0.1575, "rewards/accuracies": 1.0, "rewards/chosen": -1.0265426635742188, "rewards/margins": 2.329216718673706, "rewards/rejected": -3.355759382247925, "step": 870 }, { "epoch": 0.75, "grad_norm": 72.01976252804879, "learning_rate": 3.655343734061743e-07, "logits/chosen": -1.30082106590271, "logits/rejected": -1.233238935470581, "logps/chosen": -593.5780029296875, "logps/rejected": -778.603759765625, "loss": 0.3019, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1820402145385742, "rewards/margins": 2.6853573322296143, "rewards/rejected": -3.8673977851867676, "step": 871 }, { "epoch": 0.75, "grad_norm": 94.95198859210063, "learning_rate": 3.6322235330285245e-07, "logits/chosen": -1.3168257474899292, "logits/rejected": -1.3221426010131836, "logps/chosen": -592.3690795898438, "logps/rejected": -682.3460693359375, "loss": 0.4481, "rewards/accuracies": 0.75, "rewards/chosen": -1.4910621643066406, "rewards/margins": 2.1111958026885986, "rewards/rejected": -3.6022579669952393, "step": 872 }, { "epoch": 0.75, "grad_norm": 77.93664456604847, "learning_rate": 3.6091604451637514e-07, "logits/chosen": -1.3545784950256348, "logits/rejected": -1.3380658626556396, "logps/chosen": -574.0543212890625, "logps/rejected": -678.6168212890625, "loss": 0.3947, "rewards/accuracies": 0.8125, "rewards/chosen": -1.328019618988037, "rewards/margins": 1.94981050491333, "rewards/rejected": -3.277829885482788, "step": 873 }, { "epoch": 0.75, "grad_norm": 52.91490995676715, "learning_rate": 3.5861546773223625e-07, "logits/chosen": -1.3453152179718018, "logits/rejected": -1.2786279916763306, "logps/chosen": -525.9072265625, "logps/rejected": -661.3233642578125, "loss": 0.2586, "rewards/accuracies": 1.0, "rewards/chosen": -1.391870379447937, "rewards/margins": 1.7153456211090088, "rewards/rejected": -3.1072158813476562, "step": 874 }, { "epoch": 0.75, "grad_norm": 36.63533426304873, "learning_rate": 3.563206435845195e-07, "logits/chosen": -1.4298312664031982, "logits/rejected": -1.3732223510742188, "logps/chosen": -417.8304443359375, "logps/rejected": -637.6014404296875, "loss": 0.1746, "rewards/accuracies": 1.0, "rewards/chosen": -1.1234023571014404, "rewards/margins": 2.6905455589294434, "rewards/rejected": -3.8139476776123047, "step": 875 }, { "epoch": 0.75, "grad_norm": 55.72006781264042, "learning_rate": 3.5403159265571134e-07, "logits/chosen": -1.3255597352981567, "logits/rejected": -1.2867255210876465, "logps/chosen": -570.984375, "logps/rejected": -834.64794921875, "loss": 0.3485, "rewards/accuracies": 0.875, "rewards/chosen": -2.0393853187561035, "rewards/margins": 2.5629029273986816, "rewards/rejected": -4.602288246154785, "step": 876 }, { "epoch": 0.75, "grad_norm": 43.175911684475345, "learning_rate": 3.517483354765186e-07, "logits/chosen": -1.357595682144165, "logits/rejected": -1.2869113683700562, "logps/chosen": -397.64715576171875, "logps/rejected": -673.1788330078125, "loss": 0.2144, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9668766260147095, "rewards/margins": 2.812863349914551, "rewards/rejected": -3.779740333557129, "step": 877 }, { "epoch": 0.75, "grad_norm": 31.308656856204863, "learning_rate": 3.494708925256844e-07, "logits/chosen": -1.3941121101379395, "logits/rejected": -1.3153181076049805, "logps/chosen": -505.4889221191406, "logps/rejected": -811.90673828125, "loss": 0.1277, "rewards/accuracies": 1.0, "rewards/chosen": -1.4519171714782715, "rewards/margins": 3.0042178630828857, "rewards/rejected": -4.456134796142578, "step": 878 }, { "epoch": 0.75, "grad_norm": 52.39570886453552, "learning_rate": 3.471992842298015e-07, "logits/chosen": -1.407675862312317, "logits/rejected": -1.3652654886245728, "logps/chosen": -427.5823974609375, "logps/rejected": -508.2393798828125, "loss": 0.3277, "rewards/accuracies": 0.75, "rewards/chosen": -0.6519958972930908, "rewards/margins": 1.814331293106079, "rewards/rejected": -2.46632719039917, "step": 879 }, { "epoch": 0.75, "grad_norm": 67.58038638473134, "learning_rate": 3.4493353096313194e-07, "logits/chosen": -1.422912359237671, "logits/rejected": -1.3669354915618896, "logps/chosen": -604.5769653320312, "logps/rejected": -827.68603515625, "loss": 0.4145, "rewards/accuracies": 0.875, "rewards/chosen": -1.78847336769104, "rewards/margins": 2.4188690185546875, "rewards/rejected": -4.207342624664307, "step": 880 }, { "epoch": 0.76, "grad_norm": 63.33642822741839, "learning_rate": 3.426736530474247e-07, "logits/chosen": -1.3313500881195068, "logits/rejected": -1.2653937339782715, "logps/chosen": -582.1412353515625, "logps/rejected": -808.296630859375, "loss": 0.2413, "rewards/accuracies": 0.875, "rewards/chosen": -1.1795361042022705, "rewards/margins": 3.1476869583129883, "rewards/rejected": -4.32722282409668, "step": 881 }, { "epoch": 0.76, "grad_norm": 54.286660414687084, "learning_rate": 3.4041967075172993e-07, "logits/chosen": -1.410056471824646, "logits/rejected": -1.3243889808654785, "logps/chosen": -427.4581298828125, "logps/rejected": -628.5403442382812, "loss": 0.3641, "rewards/accuracies": 0.8125, "rewards/chosen": -1.216133952140808, "rewards/margins": 2.6110310554504395, "rewards/rejected": -3.827165126800537, "step": 882 }, { "epoch": 0.76, "grad_norm": 29.622758715360273, "learning_rate": 3.3817160429222124e-07, "logits/chosen": -1.3391587734222412, "logits/rejected": -1.3287652730941772, "logps/chosen": -481.3605041503906, "logps/rejected": -685.9511108398438, "loss": 0.1739, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9065276384353638, "rewards/margins": 2.8289523124694824, "rewards/rejected": -3.735480308532715, "step": 883 }, { "epoch": 0.76, "grad_norm": 112.93637118227241, "learning_rate": 3.3592947383201173e-07, "logits/chosen": -1.3215630054473877, "logits/rejected": -1.2854342460632324, "logps/chosen": -598.59765625, "logps/rejected": -838.0128173828125, "loss": 0.42, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8170629739761353, "rewards/margins": 2.4677867889404297, "rewards/rejected": -4.284849166870117, "step": 884 }, { "epoch": 0.76, "grad_norm": 98.09938605966101, "learning_rate": 3.3369329948097434e-07, "logits/chosen": -1.3352463245391846, "logits/rejected": -1.3223787546157837, "logps/chosen": -593.1196899414062, "logps/rejected": -615.0050048828125, "loss": 0.6885, "rewards/accuracies": 0.75, "rewards/chosen": -1.898991584777832, "rewards/margins": 0.8527295589447021, "rewards/rejected": -2.751720905303955, "step": 885 }, { "epoch": 0.76, "grad_norm": 90.878606470055, "learning_rate": 3.3146310129556077e-07, "logits/chosen": -1.321690559387207, "logits/rejected": -1.3665200471878052, "logps/chosen": -696.987060546875, "logps/rejected": -712.210693359375, "loss": 0.329, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7030959129333496, "rewards/margins": 2.140103340148926, "rewards/rejected": -3.8431992530822754, "step": 886 }, { "epoch": 0.76, "grad_norm": 112.21630938679121, "learning_rate": 3.2923889927862226e-07, "logits/chosen": -1.3148661851882935, "logits/rejected": -1.3028908967971802, "logps/chosen": -635.5806274414062, "logps/rejected": -855.8046875, "loss": 0.452, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8170392513275146, "rewards/margins": 2.2406771183013916, "rewards/rejected": -4.057716369628906, "step": 887 }, { "epoch": 0.76, "grad_norm": 44.05711014236523, "learning_rate": 3.2702071337922964e-07, "logits/chosen": -1.2637290954589844, "logits/rejected": -1.206153392791748, "logps/chosen": -570.122802734375, "logps/rejected": -821.76171875, "loss": 0.2241, "rewards/accuracies": 0.875, "rewards/chosen": -1.8035266399383545, "rewards/margins": 2.428105592727661, "rewards/rejected": -4.231632232666016, "step": 888 }, { "epoch": 0.76, "grad_norm": 109.6736924788751, "learning_rate": 3.2480856349249517e-07, "logits/chosen": -1.2842376232147217, "logits/rejected": -1.2788653373718262, "logps/chosen": -556.2728271484375, "logps/rejected": -554.0238037109375, "loss": 0.7193, "rewards/accuracies": 0.6875, "rewards/chosen": -1.9949722290039062, "rewards/margins": 1.1830435991287231, "rewards/rejected": -3.178015947341919, "step": 889 }, { "epoch": 0.76, "grad_norm": 99.72169912710434, "learning_rate": 3.226024694593922e-07, "logits/chosen": -1.39703369140625, "logits/rejected": -1.400864601135254, "logps/chosen": -649.0518798828125, "logps/rejected": -563.8770141601562, "loss": 0.5959, "rewards/accuracies": 0.8125, "rewards/chosen": -2.150852680206299, "rewards/margins": 1.6393182277679443, "rewards/rejected": -3.790170669555664, "step": 890 }, { "epoch": 0.76, "grad_norm": 70.1925666635167, "learning_rate": 3.2040245106658037e-07, "logits/chosen": -1.362339973449707, "logits/rejected": -1.339266300201416, "logps/chosen": -674.0003051757812, "logps/rejected": -634.685302734375, "loss": 0.3611, "rewards/accuracies": 0.8125, "rewards/chosen": -1.792270302772522, "rewards/margins": 1.7869552373886108, "rewards/rejected": -3.579225540161133, "step": 891 }, { "epoch": 0.77, "grad_norm": 70.55623056745203, "learning_rate": 3.1820852804622555e-07, "logits/chosen": -1.3071826696395874, "logits/rejected": -1.267538070678711, "logps/chosen": -392.2010803222656, "logps/rejected": -595.3833618164062, "loss": 0.5074, "rewards/accuracies": 0.875, "rewards/chosen": -1.1206765174865723, "rewards/margins": 1.930222511291504, "rewards/rejected": -3.050899028778076, "step": 892 }, { "epoch": 0.77, "grad_norm": 110.22494900777089, "learning_rate": 3.160207200758226e-07, "logits/chosen": -1.378072738647461, "logits/rejected": -1.3356727361679077, "logps/chosen": -659.4681396484375, "logps/rejected": -700.875, "loss": 0.618, "rewards/accuracies": 0.8125, "rewards/chosen": -2.127047300338745, "rewards/margins": 1.8155895471572876, "rewards/rejected": -3.9426369667053223, "step": 893 }, { "epoch": 0.77, "grad_norm": 46.144916062738126, "learning_rate": 3.138390467780221e-07, "logits/chosen": -1.366478443145752, "logits/rejected": -1.2956424951553345, "logps/chosen": -498.22845458984375, "logps/rejected": -830.8386840820312, "loss": 0.2081, "rewards/accuracies": 0.875, "rewards/chosen": -1.1988614797592163, "rewards/margins": 3.48574161529541, "rewards/rejected": -4.684603691101074, "step": 894 }, { "epoch": 0.77, "grad_norm": 70.18858740432836, "learning_rate": 3.1166352772045023e-07, "logits/chosen": -1.3166738748550415, "logits/rejected": -1.3166499137878418, "logps/chosen": -695.97509765625, "logps/rejected": -737.0611572265625, "loss": 0.344, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6051607131958008, "rewards/margins": 2.00355863571167, "rewards/rejected": -3.6087193489074707, "step": 895 }, { "epoch": 0.77, "grad_norm": 68.23376706593672, "learning_rate": 3.09494182415536e-07, "logits/chosen": -1.2961673736572266, "logits/rejected": -1.2795644998550415, "logps/chosen": -593.0159912109375, "logps/rejected": -706.4309692382812, "loss": 0.4114, "rewards/accuracies": 0.75, "rewards/chosen": -1.7094247341156006, "rewards/margins": 1.647386074066162, "rewards/rejected": -3.3568105697631836, "step": 896 }, { "epoch": 0.77, "grad_norm": 33.157370260694265, "learning_rate": 3.0733103032033634e-07, "logits/chosen": -1.3536429405212402, "logits/rejected": -1.2895846366882324, "logps/chosen": -411.0083923339844, "logps/rejected": -608.671630859375, "loss": 0.1843, "rewards/accuracies": 1.0, "rewards/chosen": -0.9498167037963867, "rewards/margins": 2.741847515106201, "rewards/rejected": -3.691663980484009, "step": 897 }, { "epoch": 0.77, "grad_norm": 25.068181022827815, "learning_rate": 3.0517409083635905e-07, "logits/chosen": -1.3577206134796143, "logits/rejected": -1.3004130125045776, "logps/chosen": -526.4813842773438, "logps/rejected": -718.7724609375, "loss": 0.1954, "rewards/accuracies": 1.0, "rewards/chosen": -0.991965651512146, "rewards/margins": 2.8482069969177246, "rewards/rejected": -3.840172529220581, "step": 898 }, { "epoch": 0.77, "grad_norm": 54.60576445454713, "learning_rate": 3.030233833093915e-07, "logits/chosen": -1.3127899169921875, "logits/rejected": -1.2542513608932495, "logps/chosen": -375.11737060546875, "logps/rejected": -528.6685180664062, "loss": 0.3104, "rewards/accuracies": 0.875, "rewards/chosen": -1.0060513019561768, "rewards/margins": 2.2479963302612305, "rewards/rejected": -3.2540478706359863, "step": 899 }, { "epoch": 0.77, "grad_norm": 69.03825593931651, "learning_rate": 3.008789270293258e-07, "logits/chosen": -1.369001865386963, "logits/rejected": -1.3474452495574951, "logps/chosen": -391.5672302246094, "logps/rejected": -490.455078125, "loss": 0.4387, "rewards/accuracies": 0.625, "rewards/chosen": -1.0773385763168335, "rewards/margins": 1.8839397430419922, "rewards/rejected": -2.9612784385681152, "step": 900 }, { "epoch": 0.77, "grad_norm": 82.4675140405883, "learning_rate": 2.9874074122998626e-07, "logits/chosen": -1.327359676361084, "logits/rejected": -1.3002171516418457, "logps/chosen": -580.1005859375, "logps/rejected": -651.4447021484375, "loss": 0.5103, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4842534065246582, "rewards/margins": 1.662112832069397, "rewards/rejected": -3.1463663578033447, "step": 901 }, { "epoch": 0.77, "grad_norm": 25.93819082249965, "learning_rate": 2.9660884508895635e-07, "logits/chosen": -1.3338119983673096, "logits/rejected": -1.2930299043655396, "logps/chosen": -528.5415649414062, "logps/rejected": -738.2474975585938, "loss": 0.1384, "rewards/accuracies": 1.0, "rewards/chosen": -1.1045358180999756, "rewards/margins": 2.8727259635925293, "rewards/rejected": -3.977262020111084, "step": 902 }, { "epoch": 0.77, "grad_norm": 57.906261612530855, "learning_rate": 2.944832577274071e-07, "logits/chosen": -1.3311939239501953, "logits/rejected": -1.3131906986236572, "logps/chosen": -518.005126953125, "logps/rejected": -610.8331298828125, "loss": 0.2076, "rewards/accuracies": 0.9375, "rewards/chosen": -1.115837812423706, "rewards/margins": 2.317455291748047, "rewards/rejected": -3.433293342590332, "step": 903 }, { "epoch": 0.78, "grad_norm": 60.51137036355153, "learning_rate": 2.9236399820992584e-07, "logits/chosen": -1.3146090507507324, "logits/rejected": -1.3134419918060303, "logps/chosen": -497.95281982421875, "logps/rejected": -555.810302734375, "loss": 0.3785, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9638426303863525, "rewards/margins": 1.6513032913208008, "rewards/rejected": -2.6151459217071533, "step": 904 }, { "epoch": 0.78, "grad_norm": 69.07987355925474, "learning_rate": 2.9025108554434484e-07, "logits/chosen": -1.2922639846801758, "logits/rejected": -1.2921481132507324, "logps/chosen": -638.66650390625, "logps/rejected": -691.6033935546875, "loss": 0.4971, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0466654300689697, "rewards/margins": 1.6078869104385376, "rewards/rejected": -2.654552459716797, "step": 905 }, { "epoch": 0.78, "grad_norm": 34.38374236627064, "learning_rate": 2.8814453868156975e-07, "logits/chosen": -1.1898102760314941, "logits/rejected": -1.150869607925415, "logps/chosen": -632.0054931640625, "logps/rejected": -763.151611328125, "loss": 0.2693, "rewards/accuracies": 0.875, "rewards/chosen": -0.688407301902771, "rewards/margins": 2.7878832817077637, "rewards/rejected": -3.476290702819824, "step": 906 }, { "epoch": 0.78, "grad_norm": 40.99030600675662, "learning_rate": 2.860443765154126e-07, "logits/chosen": -1.3723232746124268, "logits/rejected": -1.2995691299438477, "logps/chosen": -407.0501708984375, "logps/rejected": -686.1329956054688, "loss": 0.2894, "rewards/accuracies": 0.875, "rewards/chosen": -0.8385758399963379, "rewards/margins": 2.8460376262664795, "rewards/rejected": -3.6846137046813965, "step": 907 }, { "epoch": 0.78, "grad_norm": 79.22389737309626, "learning_rate": 2.8395061788241956e-07, "logits/chosen": -1.2611339092254639, "logits/rejected": -1.1847124099731445, "logps/chosen": -566.4060668945312, "logps/rejected": -827.1646728515625, "loss": 0.2721, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2613106966018677, "rewards/margins": 2.8124876022338867, "rewards/rejected": -4.073798179626465, "step": 908 }, { "epoch": 0.78, "grad_norm": 37.901490746080235, "learning_rate": 2.818632815617021e-07, "logits/chosen": -1.3264902830123901, "logits/rejected": -1.2834479808807373, "logps/chosen": -508.1125793457031, "logps/rejected": -708.22314453125, "loss": 0.2283, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0484519004821777, "rewards/margins": 2.244354009628296, "rewards/rejected": -3.2928059101104736, "step": 909 }, { "epoch": 0.78, "grad_norm": 66.8687708436843, "learning_rate": 2.7978238627477146e-07, "logits/chosen": -1.3407678604125977, "logits/rejected": -1.2671020030975342, "logps/chosen": -480.5046081542969, "logps/rejected": -702.8544311523438, "loss": 0.3486, "rewards/accuracies": 0.9375, "rewards/chosen": -1.188808560371399, "rewards/margins": 2.1973466873168945, "rewards/rejected": -3.386155128479004, "step": 910 }, { "epoch": 0.78, "grad_norm": 27.632333790497384, "learning_rate": 2.7770795068536643e-07, "logits/chosen": -1.3072032928466797, "logits/rejected": -1.2601563930511475, "logps/chosen": -443.11138916015625, "logps/rejected": -548.894287109375, "loss": 0.2195, "rewards/accuracies": 0.9375, "rewards/chosen": -0.793603241443634, "rewards/margins": 2.062899112701416, "rewards/rejected": -2.8565022945404053, "step": 911 }, { "epoch": 0.78, "grad_norm": 48.781777727628075, "learning_rate": 2.7563999339928935e-07, "logits/chosen": -1.3822541236877441, "logits/rejected": -1.3028593063354492, "logps/chosen": -374.9779052734375, "logps/rejected": -534.6602783203125, "loss": 0.2828, "rewards/accuracies": 0.875, "rewards/chosen": -0.7297624349594116, "rewards/margins": 2.3048460483551025, "rewards/rejected": -3.034608840942383, "step": 912 }, { "epoch": 0.78, "grad_norm": 70.82186567684192, "learning_rate": 2.735785329642386e-07, "logits/chosen": -1.3031113147735596, "logits/rejected": -1.2537412643432617, "logps/chosen": -854.8223876953125, "logps/rejected": -878.5704345703125, "loss": 0.2976, "rewards/accuracies": 0.875, "rewards/chosen": -1.5797104835510254, "rewards/margins": 2.0103750228881836, "rewards/rejected": -3.590085506439209, "step": 913 }, { "epoch": 0.78, "grad_norm": 71.14393446698111, "learning_rate": 2.7152358786964023e-07, "logits/chosen": -1.3568994998931885, "logits/rejected": -1.317795991897583, "logps/chosen": -554.6603393554688, "logps/rejected": -741.9503173828125, "loss": 0.3941, "rewards/accuracies": 0.875, "rewards/chosen": -1.5482054948806763, "rewards/margins": 1.7386507987976074, "rewards/rejected": -3.286856174468994, "step": 914 }, { "epoch": 0.78, "grad_norm": 64.13171433552547, "learning_rate": 2.6947517654648467e-07, "logits/chosen": -1.3185844421386719, "logits/rejected": -1.2491166591644287, "logps/chosen": -437.0296630859375, "logps/rejected": -627.7676391601562, "loss": 0.3991, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3248052597045898, "rewards/margins": 1.7850700616836548, "rewards/rejected": -3.109875202178955, "step": 915 }, { "epoch": 0.79, "grad_norm": 28.89268785562417, "learning_rate": 2.674333173671601e-07, "logits/chosen": -1.3552255630493164, "logits/rejected": -1.3029472827911377, "logps/chosen": -479.81939697265625, "logps/rejected": -634.21923828125, "loss": 0.1487, "rewards/accuracies": 1.0, "rewards/chosen": -0.6625692844390869, "rewards/margins": 2.2750813961029053, "rewards/rejected": -2.937650680541992, "step": 916 }, { "epoch": 0.79, "grad_norm": 25.6391724086389, "learning_rate": 2.6539802864528783e-07, "logits/chosen": -1.3402647972106934, "logits/rejected": -1.2553725242614746, "logps/chosen": -405.7496643066406, "logps/rejected": -715.4351806640625, "loss": 0.1174, "rewards/accuracies": 1.0, "rewards/chosen": -0.39334067702293396, "rewards/margins": 3.430051565170288, "rewards/rejected": -3.823391914367676, "step": 917 }, { "epoch": 0.79, "grad_norm": 46.84769643268985, "learning_rate": 2.6336932863555826e-07, "logits/chosen": -1.3779666423797607, "logits/rejected": -1.2912983894348145, "logps/chosen": -522.4635009765625, "logps/rejected": -788.0476684570312, "loss": 0.2176, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0136640071868896, "rewards/margins": 3.4003186225891113, "rewards/rejected": -4.41398286819458, "step": 918 }, { "epoch": 0.79, "grad_norm": 80.28931203730443, "learning_rate": 2.61347235533567e-07, "logits/chosen": -1.3870668411254883, "logits/rejected": -1.3371049165725708, "logps/chosen": -713.8438110351562, "logps/rejected": -824.6231079101562, "loss": 0.2795, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5177955627441406, "rewards/margins": 2.441744327545166, "rewards/rejected": -3.9595396518707275, "step": 919 }, { "epoch": 0.79, "grad_norm": 68.33602752330498, "learning_rate": 2.5933176747565165e-07, "logits/chosen": -1.349252462387085, "logits/rejected": -1.3031195402145386, "logps/chosen": -467.68505859375, "logps/rejected": -747.3790283203125, "loss": 0.3611, "rewards/accuracies": 0.875, "rewards/chosen": -1.1066734790802002, "rewards/margins": 3.041616201400757, "rewards/rejected": -4.148289680480957, "step": 920 }, { "epoch": 0.79, "grad_norm": 66.7312634364071, "learning_rate": 2.5732294253872943e-07, "logits/chosen": -1.3333121538162231, "logits/rejected": -1.2936124801635742, "logps/chosen": -596.22265625, "logps/rejected": -773.28125, "loss": 0.3431, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2985539436340332, "rewards/margins": 2.0604376792907715, "rewards/rejected": -3.3589913845062256, "step": 921 }, { "epoch": 0.79, "grad_norm": 46.716215063346944, "learning_rate": 2.553207787401339e-07, "logits/chosen": -1.3393959999084473, "logits/rejected": -1.277437686920166, "logps/chosen": -480.6090393066406, "logps/rejected": -705.7352294921875, "loss": 0.2781, "rewards/accuracies": 0.875, "rewards/chosen": -0.8253424167633057, "rewards/margins": 2.352390766143799, "rewards/rejected": -3.1777331829071045, "step": 922 }, { "epoch": 0.79, "grad_norm": 48.9397323986539, "learning_rate": 2.533252940374556e-07, "logits/chosen": -1.34089994430542, "logits/rejected": -1.2757072448730469, "logps/chosen": -557.601806640625, "logps/rejected": -759.7794799804688, "loss": 0.2221, "rewards/accuracies": 0.875, "rewards/chosen": -1.6078077554702759, "rewards/margins": 2.3486251831054688, "rewards/rejected": -3.956432819366455, "step": 923 }, { "epoch": 0.79, "grad_norm": 60.29091845061661, "learning_rate": 2.513365063283791e-07, "logits/chosen": -1.2821693420410156, "logits/rejected": -1.282141923904419, "logps/chosen": -505.06671142578125, "logps/rejected": -537.974365234375, "loss": 0.3372, "rewards/accuracies": 0.875, "rewards/chosen": -0.9660041332244873, "rewards/margins": 1.7687538862228394, "rewards/rejected": -2.734757900238037, "step": 924 }, { "epoch": 0.79, "grad_norm": 32.52231932265654, "learning_rate": 2.493544334505221e-07, "logits/chosen": -1.307525396347046, "logits/rejected": -1.2522432804107666, "logps/chosen": -570.719970703125, "logps/rejected": -773.0458984375, "loss": 0.1414, "rewards/accuracies": 1.0, "rewards/chosen": -1.1016355752944946, "rewards/margins": 2.7439346313476562, "rewards/rejected": -3.8455703258514404, "step": 925 }, { "epoch": 0.79, "grad_norm": 43.05479637811732, "learning_rate": 2.4737909318127826e-07, "logits/chosen": -1.3062845468521118, "logits/rejected": -1.2266449928283691, "logps/chosen": -376.541015625, "logps/rejected": -599.5220947265625, "loss": 0.217, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7425134181976318, "rewards/margins": 2.541034698486328, "rewards/rejected": -3.28354811668396, "step": 926 }, { "epoch": 0.8, "grad_norm": 46.41979495785788, "learning_rate": 2.45410503237654e-07, "logits/chosen": -1.3085191249847412, "logits/rejected": -1.2489107847213745, "logps/chosen": -297.17486572265625, "logps/rejected": -566.790771484375, "loss": 0.283, "rewards/accuracies": 0.875, "rewards/chosen": -0.9123257398605347, "rewards/margins": 2.1935391426086426, "rewards/rejected": -3.1058647632598877, "step": 927 }, { "epoch": 0.8, "grad_norm": 47.81115773318066, "learning_rate": 2.434486812761124e-07, "logits/chosen": -1.3504283428192139, "logits/rejected": -1.31508469581604, "logps/chosen": -531.3181762695312, "logps/rejected": -676.3302612304688, "loss": 0.2547, "rewards/accuracies": 0.875, "rewards/chosen": -1.1350109577178955, "rewards/margins": 2.2765567302703857, "rewards/rejected": -3.4115676879882812, "step": 928 }, { "epoch": 0.8, "grad_norm": 71.94119778498928, "learning_rate": 2.4149364489241386e-07, "logits/chosen": -1.3219082355499268, "logits/rejected": -1.3023300170898438, "logps/chosen": -632.0505981445312, "logps/rejected": -608.5750732421875, "loss": 0.5524, "rewards/accuracies": 0.875, "rewards/chosen": -0.9060620665550232, "rewards/margins": 2.401428461074829, "rewards/rejected": -3.307490348815918, "step": 929 }, { "epoch": 0.8, "grad_norm": 36.080090426945006, "learning_rate": 2.3954541162145804e-07, "logits/chosen": -1.283137321472168, "logits/rejected": -1.2344775199890137, "logps/chosen": -568.8763427734375, "logps/rejected": -669.472412109375, "loss": 0.1851, "rewards/accuracies": 1.0, "rewards/chosen": -1.0817811489105225, "rewards/margins": 2.3809902667999268, "rewards/rejected": -3.462771415710449, "step": 930 }, { "epoch": 0.8, "grad_norm": 86.54210763526781, "learning_rate": 2.3760399893712714e-07, "logits/chosen": -1.277703046798706, "logits/rejected": -1.2782983779907227, "logps/chosen": -728.403564453125, "logps/rejected": -810.8538818359375, "loss": 0.3426, "rewards/accuracies": 0.875, "rewards/chosen": -1.6086221933364868, "rewards/margins": 2.27468204498291, "rewards/rejected": -3.8833041191101074, "step": 931 }, { "epoch": 0.8, "grad_norm": 76.30793036259315, "learning_rate": 2.3566942425212867e-07, "logits/chosen": -1.3559530973434448, "logits/rejected": -1.2951138019561768, "logps/chosen": -556.3028564453125, "logps/rejected": -691.5188598632812, "loss": 0.4465, "rewards/accuracies": 0.875, "rewards/chosen": -1.14959716796875, "rewards/margins": 2.3433687686920166, "rewards/rejected": -3.4929661750793457, "step": 932 }, { "epoch": 0.8, "grad_norm": 67.37617332972937, "learning_rate": 2.3374170491783952e-07, "logits/chosen": -1.3153711557388306, "logits/rejected": -1.248844861984253, "logps/chosen": -566.6974487304688, "logps/rejected": -788.5882568359375, "loss": 0.565, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3778927326202393, "rewards/margins": 2.809847354888916, "rewards/rejected": -4.187740325927734, "step": 933 }, { "epoch": 0.8, "grad_norm": 59.85884327598616, "learning_rate": 2.3182085822415055e-07, "logits/chosen": -1.3591265678405762, "logits/rejected": -1.3253945112228394, "logps/chosen": -493.8184509277344, "logps/rejected": -725.8446044921875, "loss": 0.4137, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6604773998260498, "rewards/margins": 1.2979148626327515, "rewards/rejected": -2.9583921432495117, "step": 934 }, { "epoch": 0.8, "grad_norm": 86.97489953971993, "learning_rate": 2.2990690139931114e-07, "logits/chosen": -1.2852991819381714, "logits/rejected": -1.2641592025756836, "logps/chosen": -530.9556884765625, "logps/rejected": -662.2107543945312, "loss": 0.5082, "rewards/accuracies": 0.8125, "rewards/chosen": -1.521914005279541, "rewards/margins": 1.842146635055542, "rewards/rejected": -3.364060401916504, "step": 935 }, { "epoch": 0.8, "grad_norm": 82.77432246534761, "learning_rate": 2.2799985160977454e-07, "logits/chosen": -1.3647409677505493, "logits/rejected": -1.3406472206115723, "logps/chosen": -666.6566162109375, "logps/rejected": -643.3453979492188, "loss": 0.3221, "rewards/accuracies": 0.875, "rewards/chosen": -1.455517053604126, "rewards/margins": 2.4178004264831543, "rewards/rejected": -3.8733177185058594, "step": 936 }, { "epoch": 0.8, "grad_norm": 22.240157953731014, "learning_rate": 2.2609972596004477e-07, "logits/chosen": -1.3525817394256592, "logits/rejected": -1.2699151039123535, "logps/chosen": -370.5299377441406, "logps/rejected": -715.29736328125, "loss": 0.156, "rewards/accuracies": 1.0, "rewards/chosen": -1.0559579133987427, "rewards/margins": 2.735990047454834, "rewards/rejected": -3.791947841644287, "step": 937 }, { "epoch": 0.8, "grad_norm": 55.495969319969, "learning_rate": 2.242065414925215e-07, "logits/chosen": -1.3445532321929932, "logits/rejected": -1.2792248725891113, "logps/chosen": -530.41796875, "logps/rejected": -776.2282104492188, "loss": 0.3099, "rewards/accuracies": 0.875, "rewards/chosen": -1.4455312490463257, "rewards/margins": 1.9357490539550781, "rewards/rejected": -3.3812801837921143, "step": 938 }, { "epoch": 0.81, "grad_norm": 34.049314536628835, "learning_rate": 2.2232031518734984e-07, "logits/chosen": -1.3321669101715088, "logits/rejected": -1.2851825952529907, "logps/chosen": -369.9688720703125, "logps/rejected": -547.10302734375, "loss": 0.2334, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6528770327568054, "rewards/margins": 2.41916561126709, "rewards/rejected": -3.072042465209961, "step": 939 }, { "epoch": 0.81, "grad_norm": 61.74554258352496, "learning_rate": 2.204410639622657e-07, "logits/chosen": -1.3021714687347412, "logits/rejected": -1.2831745147705078, "logps/chosen": -489.5042724609375, "logps/rejected": -573.2079467773438, "loss": 0.3026, "rewards/accuracies": 0.875, "rewards/chosen": -1.3532168865203857, "rewards/margins": 1.6035637855529785, "rewards/rejected": -2.9567806720733643, "step": 940 }, { "epoch": 0.81, "grad_norm": 43.76747281023233, "learning_rate": 2.1856880467244408e-07, "logits/chosen": -1.3493702411651611, "logits/rejected": -1.3328311443328857, "logps/chosen": -335.95745849609375, "logps/rejected": -380.2527770996094, "loss": 0.3337, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7542818784713745, "rewards/margins": 1.4853137731552124, "rewards/rejected": -2.239595651626587, "step": 941 }, { "epoch": 0.81, "grad_norm": 77.37259259760772, "learning_rate": 2.1670355411035058e-07, "logits/chosen": -1.3695814609527588, "logits/rejected": -1.285352349281311, "logps/chosen": -460.94085693359375, "logps/rejected": -627.2164306640625, "loss": 0.3334, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6768767833709717, "rewards/margins": 2.2322726249694824, "rewards/rejected": -2.909149169921875, "step": 942 }, { "epoch": 0.81, "grad_norm": 37.64021272407518, "learning_rate": 2.1484532900558684e-07, "logits/chosen": -1.3233516216278076, "logits/rejected": -1.245588779449463, "logps/chosen": -558.4962158203125, "logps/rejected": -842.7387084960938, "loss": 0.1812, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2138087749481201, "rewards/margins": 2.9561476707458496, "rewards/rejected": -4.169956207275391, "step": 943 }, { "epoch": 0.81, "grad_norm": 93.38322909034852, "learning_rate": 2.1299414602474375e-07, "logits/chosen": -1.3307756185531616, "logits/rejected": -1.3243978023529053, "logps/chosen": -569.825439453125, "logps/rejected": -607.7615966796875, "loss": 0.4985, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2010345458984375, "rewards/margins": 1.0289090871810913, "rewards/rejected": -2.2299435138702393, "step": 944 }, { "epoch": 0.81, "grad_norm": 43.12913446481472, "learning_rate": 2.1115002177125063e-07, "logits/chosen": -1.3236948251724243, "logits/rejected": -1.2851643562316895, "logps/chosen": -598.6200561523438, "logps/rejected": -682.32421875, "loss": 0.1885, "rewards/accuracies": 0.9375, "rewards/chosen": -1.110639214515686, "rewards/margins": 2.551461696624756, "rewards/rejected": -3.6621007919311523, "step": 945 }, { "epoch": 0.81, "grad_norm": 79.79479941191578, "learning_rate": 2.0931297278522609e-07, "logits/chosen": -1.301816701889038, "logits/rejected": -1.2469000816345215, "logps/chosen": -523.5300903320312, "logps/rejected": -710.1577758789062, "loss": 0.5301, "rewards/accuracies": 0.875, "rewards/chosen": -1.259035587310791, "rewards/margins": 2.14595365524292, "rewards/rejected": -3.40498948097229, "step": 946 }, { "epoch": 0.81, "grad_norm": 52.533395564987856, "learning_rate": 2.0748301554333024e-07, "logits/chosen": -1.2569694519042969, "logits/rejected": -1.1883673667907715, "logps/chosen": -546.7478637695312, "logps/rejected": -734.0489501953125, "loss": 0.338, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6118360757827759, "rewards/margins": 2.1850857734680176, "rewards/rejected": -3.796921730041504, "step": 947 }, { "epoch": 0.81, "grad_norm": 39.58756476499145, "learning_rate": 2.0566016645861662e-07, "logits/chosen": -1.3819479942321777, "logits/rejected": -1.3744871616363525, "logps/chosen": -480.4954833984375, "logps/rejected": -505.9744873046875, "loss": 0.264, "rewards/accuracies": 1.0, "rewards/chosen": -1.1077557802200317, "rewards/margins": 1.5549559593200684, "rewards/rejected": -2.6627118587493896, "step": 948 }, { "epoch": 0.81, "grad_norm": 33.37668426570639, "learning_rate": 2.0384444188038508e-07, "logits/chosen": -1.3193440437316895, "logits/rejected": -1.2121649980545044, "logps/chosen": -444.0843811035156, "logps/rejected": -845.9827270507812, "loss": 0.2048, "rewards/accuracies": 0.875, "rewards/chosen": -0.88461834192276, "rewards/margins": 3.3373303413391113, "rewards/rejected": -4.221948623657227, "step": 949 }, { "epoch": 0.81, "grad_norm": 58.852877656840754, "learning_rate": 2.0203585809403523e-07, "logits/chosen": -1.2987149953842163, "logits/rejected": -1.2459378242492676, "logps/chosen": -350.14276123046875, "logps/rejected": -584.1080322265625, "loss": 0.3171, "rewards/accuracies": 0.8125, "rewards/chosen": -1.116916537284851, "rewards/margins": 1.9350357055664062, "rewards/rejected": -3.051952362060547, "step": 950 }, { "epoch": 0.82, "grad_norm": 88.45307444424925, "learning_rate": 2.0023443132092e-07, "logits/chosen": -1.3424527645111084, "logits/rejected": -1.3207765817642212, "logps/chosen": -555.0639038085938, "logps/rejected": -669.8426513671875, "loss": 0.3594, "rewards/accuracies": 0.875, "rewards/chosen": -1.3616647720336914, "rewards/margins": 2.0655899047851562, "rewards/rejected": -3.4272546768188477, "step": 951 }, { "epoch": 0.82, "grad_norm": 65.1823986201499, "learning_rate": 1.9844017771820054e-07, "logits/chosen": -1.2823162078857422, "logits/rejected": -1.2424159049987793, "logps/chosen": -427.4981689453125, "logps/rejected": -607.1182861328125, "loss": 0.367, "rewards/accuracies": 0.875, "rewards/chosen": -1.1584736108779907, "rewards/margins": 1.8016468286514282, "rewards/rejected": -2.960120439529419, "step": 952 }, { "epoch": 0.82, "grad_norm": 39.696090131290745, "learning_rate": 1.9665311337870173e-07, "logits/chosen": -1.3776021003723145, "logits/rejected": -1.3465569019317627, "logps/chosen": -385.9962463378906, "logps/rejected": -483.7662353515625, "loss": 0.3253, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5088029503822327, "rewards/margins": 2.1894595623016357, "rewards/rejected": -2.6982624530792236, "step": 953 }, { "epoch": 0.82, "grad_norm": 76.36659757388189, "learning_rate": 1.9487325433076573e-07, "logits/chosen": -1.3619831800460815, "logits/rejected": -1.2768319845199585, "logps/chosen": -510.9540710449219, "logps/rejected": -762.8428344726562, "loss": 0.4332, "rewards/accuracies": 0.875, "rewards/chosen": -1.6498937606811523, "rewards/margins": 1.9130890369415283, "rewards/rejected": -3.5629830360412598, "step": 954 }, { "epoch": 0.82, "grad_norm": 100.57414208414959, "learning_rate": 1.931006165381117e-07, "logits/chosen": -1.3057761192321777, "logits/rejected": -1.2368066310882568, "logps/chosen": -714.7298583984375, "logps/rejected": -942.5250244140625, "loss": 0.1497, "rewards/accuracies": 1.0, "rewards/chosen": -1.454352617263794, "rewards/margins": 3.2785496711730957, "rewards/rejected": -4.732902526855469, "step": 955 }, { "epoch": 0.82, "grad_norm": 77.062422421182, "learning_rate": 1.913352158996898e-07, "logits/chosen": -1.3390214443206787, "logits/rejected": -1.2953336238861084, "logps/chosen": -427.72222900390625, "logps/rejected": -579.9778442382812, "loss": 0.4281, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1499311923980713, "rewards/margins": 1.604925274848938, "rewards/rejected": -2.754856586456299, "step": 956 }, { "epoch": 0.82, "grad_norm": 81.358607096424, "learning_rate": 1.8957706824953912e-07, "logits/chosen": -1.3217957019805908, "logits/rejected": -1.3499778509140015, "logps/chosen": -604.0429077148438, "logps/rejected": -668.5711669921875, "loss": 0.4214, "rewards/accuracies": 0.75, "rewards/chosen": -1.2244956493377686, "rewards/margins": 1.9957714080810547, "rewards/rejected": -3.2202672958374023, "step": 957 }, { "epoch": 0.82, "grad_norm": 51.96635681812691, "learning_rate": 1.878261893566465e-07, "logits/chosen": -1.290670394897461, "logits/rejected": -1.2346240282058716, "logps/chosen": -511.96966552734375, "logps/rejected": -710.0076293945312, "loss": 0.2174, "rewards/accuracies": 0.9375, "rewards/chosen": -0.35284197330474854, "rewards/margins": 3.1048197746276855, "rewards/rejected": -3.4576616287231445, "step": 958 }, { "epoch": 0.82, "grad_norm": 65.51188245747359, "learning_rate": 1.860825949248047e-07, "logits/chosen": -1.3652698993682861, "logits/rejected": -1.305970311164856, "logps/chosen": -436.9747619628906, "logps/rejected": -593.4093627929688, "loss": 0.318, "rewards/accuracies": 0.875, "rewards/chosen": -1.0405833721160889, "rewards/margins": 2.0152382850646973, "rewards/rejected": -3.0558218955993652, "step": 959 }, { "epoch": 0.82, "grad_norm": 70.47338845675975, "learning_rate": 1.8434630059247126e-07, "logits/chosen": -1.31570565700531, "logits/rejected": -1.288224220275879, "logps/chosen": -541.5432739257812, "logps/rejected": -632.031982421875, "loss": 0.3616, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4176642894744873, "rewards/margins": 1.8958582878112793, "rewards/rejected": -3.3135228157043457, "step": 960 }, { "epoch": 0.82, "grad_norm": 103.46121882906307, "learning_rate": 1.826173219326287e-07, "logits/chosen": -1.4040485620498657, "logits/rejected": -1.3483211994171143, "logps/chosen": -695.0693359375, "logps/rejected": -754.0690307617188, "loss": 0.6171, "rewards/accuracies": 0.875, "rewards/chosen": -1.865354061126709, "rewards/margins": 1.688320517539978, "rewards/rejected": -3.5536746978759766, "step": 961 }, { "epoch": 0.83, "grad_norm": 32.83999159031318, "learning_rate": 1.808956744526443e-07, "logits/chosen": -1.3187965154647827, "logits/rejected": -1.2331805229187012, "logps/chosen": -490.5249328613281, "logps/rejected": -802.759521484375, "loss": 0.2065, "rewards/accuracies": 0.875, "rewards/chosen": -1.0255398750305176, "rewards/margins": 3.493373394012451, "rewards/rejected": -4.518913269042969, "step": 962 }, { "epoch": 0.83, "grad_norm": 19.993012647341114, "learning_rate": 1.7918137359413154e-07, "logits/chosen": -1.3293086290359497, "logits/rejected": -1.2440801858901978, "logps/chosen": -390.60986328125, "logps/rejected": -690.3939208984375, "loss": 0.1433, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9291788935661316, "rewards/margins": 3.344635248184204, "rewards/rejected": -4.2738142013549805, "step": 963 }, { "epoch": 0.83, "grad_norm": 71.96482022456891, "learning_rate": 1.7747443473281133e-07, "logits/chosen": -1.3135042190551758, "logits/rejected": -1.274287223815918, "logps/chosen": -664.2023315429688, "logps/rejected": -819.9852905273438, "loss": 0.3968, "rewards/accuracies": 0.75, "rewards/chosen": -1.9484283924102783, "rewards/margins": 2.0423455238342285, "rewards/rejected": -3.990774154663086, "step": 964 }, { "epoch": 0.83, "grad_norm": 49.96214887666054, "learning_rate": 1.7577487317837414e-07, "logits/chosen": -1.384070634841919, "logits/rejected": -1.3049094676971436, "logps/chosen": -397.2987060546875, "logps/rejected": -640.8340454101562, "loss": 0.3909, "rewards/accuracies": 0.75, "rewards/chosen": -0.8136837482452393, "rewards/margins": 2.6849682331085205, "rewards/rejected": -3.4986519813537598, "step": 965 }, { "epoch": 0.83, "grad_norm": 37.07711626565963, "learning_rate": 1.7408270417434278e-07, "logits/chosen": -1.3725950717926025, "logits/rejected": -1.3246898651123047, "logps/chosen": -635.620361328125, "logps/rejected": -773.0831909179688, "loss": 0.2158, "rewards/accuracies": 0.875, "rewards/chosen": -1.4327852725982666, "rewards/margins": 2.48598575592041, "rewards/rejected": -3.918771266937256, "step": 966 }, { "epoch": 0.83, "grad_norm": 59.25461140547252, "learning_rate": 1.723979428979353e-07, "logits/chosen": -1.3322556018829346, "logits/rejected": -1.30332350730896, "logps/chosen": -493.76800537109375, "logps/rejected": -601.7940673828125, "loss": 0.355, "rewards/accuracies": 0.75, "rewards/chosen": -1.601588487625122, "rewards/margins": 1.7294272184371948, "rewards/rejected": -3.3310158252716064, "step": 967 }, { "epoch": 0.83, "grad_norm": 81.86136207581325, "learning_rate": 1.7072060445992963e-07, "logits/chosen": -1.3395432233810425, "logits/rejected": -1.268092393875122, "logps/chosen": -471.16900634765625, "logps/rejected": -727.853759765625, "loss": 0.4345, "rewards/accuracies": 0.75, "rewards/chosen": -1.0265016555786133, "rewards/margins": 2.8057689666748047, "rewards/rejected": -3.832270622253418, "step": 968 }, { "epoch": 0.83, "grad_norm": 54.317732056880494, "learning_rate": 1.6905070390452746e-07, "logits/chosen": -1.2924282550811768, "logits/rejected": -1.228909969329834, "logps/chosen": -512.61572265625, "logps/rejected": -765.1852416992188, "loss": 0.227, "rewards/accuracies": 0.875, "rewards/chosen": -0.9856740236282349, "rewards/margins": 2.5250301361083984, "rewards/rejected": -3.510704517364502, "step": 969 }, { "epoch": 0.83, "grad_norm": 41.962128256470194, "learning_rate": 1.6738825620921893e-07, "logits/chosen": -1.3754926919937134, "logits/rejected": -1.2921273708343506, "logps/chosen": -585.1094970703125, "logps/rejected": -874.86083984375, "loss": 0.1683, "rewards/accuracies": 1.0, "rewards/chosen": -1.1646623611450195, "rewards/margins": 3.2304623126983643, "rewards/rejected": -4.395124912261963, "step": 970 }, { "epoch": 0.83, "grad_norm": 102.50633564708357, "learning_rate": 1.6573327628464896e-07, "logits/chosen": -1.3682212829589844, "logits/rejected": -1.3239730596542358, "logps/chosen": -514.51123046875, "logps/rejected": -719.0240478515625, "loss": 0.738, "rewards/accuracies": 0.875, "rewards/chosen": -1.6949272155761719, "rewards/margins": 1.5487298965454102, "rewards/rejected": -3.243657112121582, "step": 971 }, { "epoch": 0.83, "grad_norm": 59.90244100603329, "learning_rate": 1.640857789744846e-07, "logits/chosen": -1.3000390529632568, "logits/rejected": -1.270625352859497, "logps/chosen": -742.279541015625, "logps/rejected": -793.2999267578125, "loss": 0.2283, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4226157665252686, "rewards/margins": 2.159538745880127, "rewards/rejected": -3.5821545124053955, "step": 972 }, { "epoch": 0.83, "grad_norm": 58.278500296634284, "learning_rate": 1.6244577905527868e-07, "logits/chosen": -1.3476643562316895, "logits/rejected": -1.3438431024551392, "logps/chosen": -443.50811767578125, "logps/rejected": -473.27508544921875, "loss": 0.4892, "rewards/accuracies": 0.875, "rewards/chosen": -1.1845086812973022, "rewards/margins": 1.4653077125549316, "rewards/rejected": -2.6498162746429443, "step": 973 }, { "epoch": 0.84, "grad_norm": 35.075985813774416, "learning_rate": 1.6081329123634024e-07, "logits/chosen": -1.271264672279358, "logits/rejected": -1.2945821285247803, "logps/chosen": -486.099609375, "logps/rejected": -553.9111328125, "loss": 0.251, "rewards/accuracies": 0.9375, "rewards/chosen": -1.140496015548706, "rewards/margins": 2.0564515590667725, "rewards/rejected": -3.1969475746154785, "step": 974 }, { "epoch": 0.84, "grad_norm": 58.20649671335517, "learning_rate": 1.5918833015960244e-07, "logits/chosen": -1.3587465286254883, "logits/rejected": -1.2988827228546143, "logps/chosen": -520.69775390625, "logps/rejected": -818.2510986328125, "loss": 0.291, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5749714374542236, "rewards/margins": 2.2684600353240967, "rewards/rejected": -3.8434317111968994, "step": 975 }, { "epoch": 0.84, "grad_norm": 115.5741282409676, "learning_rate": 1.5757091039948855e-07, "logits/chosen": -1.336240291595459, "logits/rejected": -1.2873685359954834, "logps/chosen": -555.2882690429688, "logps/rejected": -592.8360595703125, "loss": 0.5613, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3236135244369507, "rewards/margins": 1.2951241731643677, "rewards/rejected": -2.6187376976013184, "step": 976 }, { "epoch": 0.84, "grad_norm": 31.639541138118187, "learning_rate": 1.559610464627844e-07, "logits/chosen": -1.259886622428894, "logits/rejected": -1.219529390335083, "logps/chosen": -432.88116455078125, "logps/rejected": -646.241943359375, "loss": 0.2021, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3749642372131348, "rewards/margins": 2.893190860748291, "rewards/rejected": -4.268155097961426, "step": 977 }, { "epoch": 0.84, "grad_norm": 57.79686067323558, "learning_rate": 1.5435875278850664e-07, "logits/chosen": -1.3151590824127197, "logits/rejected": -1.2965672016143799, "logps/chosen": -660.9667358398438, "logps/rejected": -831.7444458007812, "loss": 0.2738, "rewards/accuracies": 0.875, "rewards/chosen": -1.1809264421463013, "rewards/margins": 3.1272950172424316, "rewards/rejected": -4.308221340179443, "step": 978 }, { "epoch": 0.84, "grad_norm": 23.415580259184537, "learning_rate": 1.5276404374777352e-07, "logits/chosen": -1.2803633213043213, "logits/rejected": -1.2494806051254272, "logps/chosen": -546.5381469726562, "logps/rejected": -770.8450927734375, "loss": 0.1087, "rewards/accuracies": 1.0, "rewards/chosen": -0.8054540753364563, "rewards/margins": 3.301450252532959, "rewards/rejected": -4.10690450668335, "step": 979 }, { "epoch": 0.84, "grad_norm": 69.39581972642632, "learning_rate": 1.511769336436759e-07, "logits/chosen": -1.3319766521453857, "logits/rejected": -1.3051915168762207, "logps/chosen": -614.9429321289062, "logps/rejected": -808.3277587890625, "loss": 0.349, "rewards/accuracies": 0.875, "rewards/chosen": -1.653443455696106, "rewards/margins": 2.0603551864624023, "rewards/rejected": -3.7137985229492188, "step": 980 }, { "epoch": 0.84, "grad_norm": 71.17196160686636, "learning_rate": 1.4959743671114923e-07, "logits/chosen": -1.2849793434143066, "logits/rejected": -1.2056975364685059, "logps/chosen": -529.6190185546875, "logps/rejected": -878.0888671875, "loss": 0.4077, "rewards/accuracies": 0.875, "rewards/chosen": -1.2433801889419556, "rewards/margins": 2.849297523498535, "rewards/rejected": -4.092677593231201, "step": 981 }, { "epoch": 0.84, "grad_norm": 37.17978114274386, "learning_rate": 1.4802556711684578e-07, "logits/chosen": -1.342916488647461, "logits/rejected": -1.2826406955718994, "logps/chosen": -664.3662109375, "logps/rejected": -953.063720703125, "loss": 0.1432, "rewards/accuracies": 0.9375, "rewards/chosen": -1.191670536994934, "rewards/margins": 3.7289047241210938, "rewards/rejected": -4.920575141906738, "step": 982 }, { "epoch": 0.84, "grad_norm": 67.10573660182742, "learning_rate": 1.464613389590076e-07, "logits/chosen": -1.359865427017212, "logits/rejected": -1.3003504276275635, "logps/chosen": -522.3917236328125, "logps/rejected": -681.456298828125, "loss": 0.3646, "rewards/accuracies": 0.75, "rewards/chosen": -1.3133918046951294, "rewards/margins": 2.1504573822021484, "rewards/rejected": -3.4638490676879883, "step": 983 }, { "epoch": 0.84, "grad_norm": 29.117522756221938, "learning_rate": 1.4490476626733904e-07, "logits/chosen": -1.3106701374053955, "logits/rejected": -1.2240524291992188, "logps/chosen": -465.88330078125, "logps/rejected": -737.56787109375, "loss": 0.1507, "rewards/accuracies": 1.0, "rewards/chosen": -0.892052412033081, "rewards/margins": 2.813678026199341, "rewards/rejected": -3.705730438232422, "step": 984 }, { "epoch": 0.84, "grad_norm": 21.957818753419676, "learning_rate": 1.4335586300288384e-07, "logits/chosen": -1.4063811302185059, "logits/rejected": -1.3505864143371582, "logps/chosen": -529.839599609375, "logps/rejected": -693.262451171875, "loss": 0.0855, "rewards/accuracies": 1.0, "rewards/chosen": -0.9908750057220459, "rewards/margins": 3.0074610710144043, "rewards/rejected": -3.998335838317871, "step": 985 }, { "epoch": 0.85, "grad_norm": 55.984646307333605, "learning_rate": 1.4181464305789582e-07, "logits/chosen": -1.3785548210144043, "logits/rejected": -1.374957799911499, "logps/chosen": -412.48687744140625, "logps/rejected": -459.7071228027344, "loss": 0.402, "rewards/accuracies": 0.875, "rewards/chosen": -1.3801637887954712, "rewards/margins": 1.472696304321289, "rewards/rejected": -2.852860450744629, "step": 986 }, { "epoch": 0.85, "grad_norm": 45.42030857890214, "learning_rate": 1.402811202557176e-07, "logits/chosen": -1.3691201210021973, "logits/rejected": -1.2851285934448242, "logps/chosen": -588.96484375, "logps/rejected": -945.3374633789062, "loss": 0.1955, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0526621341705322, "rewards/margins": 4.162746429443359, "rewards/rejected": -5.2154083251953125, "step": 987 }, { "epoch": 0.85, "grad_norm": 93.84393859471162, "learning_rate": 1.3875530835065574e-07, "logits/chosen": -1.3202193975448608, "logits/rejected": -1.3074629306793213, "logps/chosen": -512.6787109375, "logps/rejected": -594.0485229492188, "loss": 0.8315, "rewards/accuracies": 0.75, "rewards/chosen": -1.14207124710083, "rewards/margins": 1.4724304676055908, "rewards/rejected": -2.614501714706421, "step": 988 }, { "epoch": 0.85, "grad_norm": 30.332149826016966, "learning_rate": 1.3723722102785574e-07, "logits/chosen": -1.347909688949585, "logits/rejected": -1.2588642835617065, "logps/chosen": -501.5035400390625, "logps/rejected": -778.224853515625, "loss": 0.184, "rewards/accuracies": 0.9375, "rewards/chosen": -1.144532322883606, "rewards/margins": 3.007246971130371, "rewards/rejected": -4.1517791748046875, "step": 989 }, { "epoch": 0.85, "grad_norm": 66.25531036689799, "learning_rate": 1.3572687190318167e-07, "logits/chosen": -1.3748841285705566, "logits/rejected": -1.3424025774002075, "logps/chosen": -370.31097412109375, "logps/rejected": -493.2507019042969, "loss": 0.3701, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7494587898254395, "rewards/margins": 1.8322503566741943, "rewards/rejected": -2.581709384918213, "step": 990 }, { "epoch": 0.85, "grad_norm": 79.38873125449446, "learning_rate": 1.3422427452309304e-07, "logits/chosen": -1.3242628574371338, "logits/rejected": -1.270920753479004, "logps/chosen": -503.6895751953125, "logps/rejected": -741.283935546875, "loss": 0.432, "rewards/accuracies": 0.75, "rewards/chosen": -1.2765638828277588, "rewards/margins": 2.556812286376953, "rewards/rejected": -3.833375930786133, "step": 991 }, { "epoch": 0.85, "grad_norm": 83.40848008718488, "learning_rate": 1.3272944236452255e-07, "logits/chosen": -1.3762929439544678, "logits/rejected": -1.2758681774139404, "logps/chosen": -531.3277587890625, "logps/rejected": -764.064208984375, "loss": 0.5236, "rewards/accuracies": 0.875, "rewards/chosen": -1.1648799180984497, "rewards/margins": 2.10933256149292, "rewards/rejected": -3.27421236038208, "step": 992 }, { "epoch": 0.85, "grad_norm": 27.36302955280646, "learning_rate": 1.3124238883475625e-07, "logits/chosen": -1.340722680091858, "logits/rejected": -1.3234275579452515, "logps/chosen": -476.11065673828125, "logps/rejected": -614.7606201171875, "loss": 0.2048, "rewards/accuracies": 1.0, "rewards/chosen": -0.8835915327072144, "rewards/margins": 2.2028541564941406, "rewards/rejected": -3.0864458084106445, "step": 993 }, { "epoch": 0.85, "grad_norm": 50.74166752479985, "learning_rate": 1.297631272713132e-07, "logits/chosen": -1.3843188285827637, "logits/rejected": -1.3354160785675049, "logps/chosen": -362.65545654296875, "logps/rejected": -511.3385314941406, "loss": 0.38, "rewards/accuracies": 0.875, "rewards/chosen": -0.8462494611740112, "rewards/margins": 1.3559162616729736, "rewards/rejected": -2.2021656036376953, "step": 994 }, { "epoch": 0.85, "grad_norm": 79.43461015601946, "learning_rate": 1.2829167094182535e-07, "logits/chosen": -1.3651742935180664, "logits/rejected": -1.3554399013519287, "logps/chosen": -345.17913818359375, "logps/rejected": -470.7667541503906, "loss": 0.5855, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4406148195266724, "rewards/margins": 0.8224074840545654, "rewards/rejected": -2.2630221843719482, "step": 995 }, { "epoch": 0.85, "grad_norm": 57.67269416870282, "learning_rate": 1.268280330439191e-07, "logits/chosen": -1.3220021724700928, "logits/rejected": -1.2686153650283813, "logps/chosen": -482.16314697265625, "logps/rejected": -677.6004638671875, "loss": 0.2283, "rewards/accuracies": 0.875, "rewards/chosen": -1.219618558883667, "rewards/margins": 2.923917770385742, "rewards/rejected": -4.14353609085083, "step": 996 }, { "epoch": 0.86, "grad_norm": 49.46726905531337, "learning_rate": 1.2537222670509563e-07, "logits/chosen": -1.366963267326355, "logits/rejected": -1.3310010433197021, "logps/chosen": -572.5836791992188, "logps/rejected": -686.4254760742188, "loss": 0.2289, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3257455825805664, "rewards/margins": 1.9714258909225464, "rewards/rejected": -3.2971715927124023, "step": 997 }, { "epoch": 0.86, "grad_norm": 32.18065986952302, "learning_rate": 1.2392426498261555e-07, "logits/chosen": -1.394819974899292, "logits/rejected": -1.2971246242523193, "logps/chosen": -410.18109130859375, "logps/rejected": -691.6347045898438, "loss": 0.1746, "rewards/accuracies": 1.0, "rewards/chosen": -1.0975755453109741, "rewards/margins": 2.950037956237793, "rewards/rejected": -4.047614097595215, "step": 998 }, { "epoch": 0.86, "grad_norm": 107.19719355463234, "learning_rate": 1.2248416086337975e-07, "logits/chosen": -1.3006575107574463, "logits/rejected": -1.2912487983703613, "logps/chosen": -520.3414306640625, "logps/rejected": -561.614013671875, "loss": 0.7654, "rewards/accuracies": 0.625, "rewards/chosen": -1.300315022468567, "rewards/margins": 0.9441814422607422, "rewards/rejected": -2.2444963455200195, "step": 999 }, { "epoch": 0.86, "grad_norm": 77.32374464766163, "learning_rate": 1.2105192726381298e-07, "logits/chosen": -1.334614634513855, "logits/rejected": -1.3272688388824463, "logps/chosen": -638.7611083984375, "logps/rejected": -625.6121215820312, "loss": 0.4573, "rewards/accuracies": 0.875, "rewards/chosen": -1.7237482070922852, "rewards/margins": 1.8083107471466064, "rewards/rejected": -3.5320589542388916, "step": 1000 }, { "epoch": 0.86, "grad_norm": 50.55706077901139, "learning_rate": 1.196275770297497e-07, "logits/chosen": -1.352452278137207, "logits/rejected": -1.3267664909362793, "logps/chosen": -478.7269592285156, "logps/rejected": -654.9346923828125, "loss": 0.2493, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6469810009002686, "rewards/margins": 2.880655288696289, "rewards/rejected": -3.5276360511779785, "step": 1001 }, { "epoch": 0.86, "grad_norm": 48.758499448833476, "learning_rate": 1.1821112293631719e-07, "logits/chosen": -1.3520736694335938, "logits/rejected": -1.3165087699890137, "logps/chosen": -473.81866455078125, "logps/rejected": -618.9940795898438, "loss": 0.2119, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1709930896759033, "rewards/margins": 2.36578369140625, "rewards/rejected": -3.5367767810821533, "step": 1002 }, { "epoch": 0.86, "grad_norm": 37.22498704233837, "learning_rate": 1.1680257768782099e-07, "logits/chosen": -1.3299715518951416, "logits/rejected": -1.2586889266967773, "logps/chosen": -405.7853088378906, "logps/rejected": -546.0199584960938, "loss": 0.2608, "rewards/accuracies": 0.9375, "rewards/chosen": -1.06967031955719, "rewards/margins": 1.9266602993011475, "rewards/rejected": -2.996330738067627, "step": 1003 }, { "epoch": 0.86, "grad_norm": 54.30371464873953, "learning_rate": 1.1540195391763263e-07, "logits/chosen": -1.3893147706985474, "logits/rejected": -1.37056565284729, "logps/chosen": -449.3239440917969, "logps/rejected": -508.5211181640625, "loss": 0.2624, "rewards/accuracies": 0.875, "rewards/chosen": -0.9809780120849609, "rewards/margins": 1.7566754817962646, "rewards/rejected": -2.7376537322998047, "step": 1004 }, { "epoch": 0.86, "grad_norm": 34.826076763047475, "learning_rate": 1.1400926418807422e-07, "logits/chosen": -1.3537343740463257, "logits/rejected": -1.2689040899276733, "logps/chosen": -424.01190185546875, "logps/rejected": -680.014892578125, "loss": 0.2066, "rewards/accuracies": 0.875, "rewards/chosen": -1.1469078063964844, "rewards/margins": 2.787965774536133, "rewards/rejected": -3.934873580932617, "step": 1005 }, { "epoch": 0.86, "grad_norm": 32.71959772315284, "learning_rate": 1.1262452099030683e-07, "logits/chosen": -1.343701958656311, "logits/rejected": -1.2804139852523804, "logps/chosen": -515.3363037109375, "logps/rejected": -678.830322265625, "loss": 0.176, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8507044315338135, "rewards/margins": 3.0262112617492676, "rewards/rejected": -3.876915693283081, "step": 1006 }, { "epoch": 0.86, "grad_norm": 47.6305946661059, "learning_rate": 1.112477367442195e-07, "logits/chosen": -1.3749220371246338, "logits/rejected": -1.2896788120269775, "logps/chosen": -507.39715576171875, "logps/rejected": -836.339599609375, "loss": 0.2065, "rewards/accuracies": 0.9375, "rewards/chosen": -1.589701771736145, "rewards/margins": 2.4947781562805176, "rewards/rejected": -4.084479808807373, "step": 1007 }, { "epoch": 0.86, "grad_norm": 91.08197293979373, "learning_rate": 1.0987892379831499e-07, "logits/chosen": -1.409374713897705, "logits/rejected": -1.3736358880996704, "logps/chosen": -566.7190551757812, "logps/rejected": -724.8707275390625, "loss": 0.7248, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5256876945495605, "rewards/margins": 1.7136530876159668, "rewards/rejected": -3.2393407821655273, "step": 1008 }, { "epoch": 0.87, "grad_norm": 73.79584711311568, "learning_rate": 1.085180944296018e-07, "logits/chosen": -1.388725996017456, "logits/rejected": -1.3790984153747559, "logps/chosen": -533.7811279296875, "logps/rejected": -701.3124389648438, "loss": 0.4499, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2950994968414307, "rewards/margins": 2.425567865371704, "rewards/rejected": -3.7206673622131348, "step": 1009 }, { "epoch": 0.87, "grad_norm": 104.60604217147537, "learning_rate": 1.0716526084348276e-07, "logits/chosen": -1.2697685956954956, "logits/rejected": -1.2896143198013306, "logps/chosen": -526.0863647460938, "logps/rejected": -502.4462890625, "loss": 0.686, "rewards/accuracies": 0.625, "rewards/chosen": -1.7153476476669312, "rewards/margins": 0.7424807548522949, "rewards/rejected": -2.4578285217285156, "step": 1010 }, { "epoch": 0.87, "grad_norm": 62.41838282787974, "learning_rate": 1.0582043517364602e-07, "logits/chosen": -1.3432402610778809, "logits/rejected": -1.3046520948410034, "logps/chosen": -598.40966796875, "logps/rejected": -688.8533325195312, "loss": 0.2734, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2707072496414185, "rewards/margins": 2.0681800842285156, "rewards/rejected": -3.3388874530792236, "step": 1011 }, { "epoch": 0.87, "grad_norm": 23.917482883938213, "learning_rate": 1.0448362948195566e-07, "logits/chosen": -1.3280737400054932, "logits/rejected": -1.2590006589889526, "logps/chosen": -574.62255859375, "logps/rejected": -878.9960327148438, "loss": 0.0858, "rewards/accuracies": 1.0, "rewards/chosen": -1.6344642639160156, "rewards/margins": 3.077831268310547, "rewards/rejected": -4.7122955322265625, "step": 1012 }, { "epoch": 0.87, "grad_norm": 90.17755555406973, "learning_rate": 1.031548557583436e-07, "logits/chosen": -1.2596286535263062, "logits/rejected": -1.235691785812378, "logps/chosen": -618.44482421875, "logps/rejected": -745.6839599609375, "loss": 0.6558, "rewards/accuracies": 0.875, "rewards/chosen": -2.025634765625, "rewards/margins": 1.847394347190857, "rewards/rejected": -3.8730289936065674, "step": 1013 }, { "epoch": 0.87, "grad_norm": 57.67179771729252, "learning_rate": 1.0183412592070318e-07, "logits/chosen": -1.4220824241638184, "logits/rejected": -1.2803467512130737, "logps/chosen": -543.591552734375, "logps/rejected": -951.6238403320312, "loss": 0.2018, "rewards/accuracies": 0.8125, "rewards/chosen": -1.496922254562378, "rewards/margins": 3.2244110107421875, "rewards/rejected": -4.7213335037231445, "step": 1014 }, { "epoch": 0.87, "grad_norm": 28.27046503087058, "learning_rate": 1.0052145181478088e-07, "logits/chosen": -1.3783986568450928, "logits/rejected": -1.3363351821899414, "logps/chosen": -430.4894714355469, "logps/rejected": -737.7826538085938, "loss": 0.1427, "rewards/accuracies": 1.0, "rewards/chosen": -1.000460147857666, "rewards/margins": 3.2528653144836426, "rewards/rejected": -4.253325462341309, "step": 1015 }, { "epoch": 0.87, "grad_norm": 68.50746326940782, "learning_rate": 9.921684521407003e-08, "logits/chosen": -1.325369119644165, "logits/rejected": -1.2970994710922241, "logps/chosen": -496.46337890625, "logps/rejected": -493.948974609375, "loss": 0.4351, "rewards/accuracies": 0.875, "rewards/chosen": -0.9476513862609863, "rewards/margins": 1.5521341562271118, "rewards/rejected": -2.4997854232788086, "step": 1016 }, { "epoch": 0.87, "grad_norm": 45.159571790116686, "learning_rate": 9.792031781970689e-08, "logits/chosen": -1.4115604162216187, "logits/rejected": -1.377731442451477, "logps/chosen": -377.2203369140625, "logps/rejected": -521.806884765625, "loss": 0.2619, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0020372867584229, "rewards/margins": 1.844942331314087, "rewards/rejected": -2.8469796180725098, "step": 1017 }, { "epoch": 0.87, "grad_norm": 63.42631756119659, "learning_rate": 9.663188126036392e-08, "logits/chosen": -1.3807454109191895, "logits/rejected": -1.337803840637207, "logps/chosen": -519.5097045898438, "logps/rejected": -685.4462280273438, "loss": 0.3507, "rewards/accuracies": 0.875, "rewards/chosen": -1.5622811317443848, "rewards/margins": 2.141068935394287, "rewards/rejected": -3.703350305557251, "step": 1018 }, { "epoch": 0.87, "grad_norm": 34.09574052170219, "learning_rate": 9.535154709214587e-08, "logits/chosen": -1.3408069610595703, "logits/rejected": -1.272974967956543, "logps/chosen": -395.58734130859375, "logps/rejected": -605.916748046875, "loss": 0.1966, "rewards/accuracies": 0.9375, "rewards/chosen": -1.33781898021698, "rewards/margins": 2.1266462802886963, "rewards/rejected": -3.4644651412963867, "step": 1019 }, { "epoch": 0.87, "grad_norm": 103.9306170119398, "learning_rate": 9.407932679848751e-08, "logits/chosen": -1.3915414810180664, "logits/rejected": -1.3104114532470703, "logps/chosen": -506.6783142089844, "logps/rejected": -715.592529296875, "loss": 0.5415, "rewards/accuracies": 0.75, "rewards/chosen": -1.56777024269104, "rewards/margins": 1.9328727722167969, "rewards/rejected": -3.500643014907837, "step": 1020 }, { "epoch": 0.88, "grad_norm": 34.40064754463839, "learning_rate": 9.281523179004802e-08, "logits/chosen": -1.3873071670532227, "logits/rejected": -1.3065744638442993, "logps/chosen": -335.1987609863281, "logps/rejected": -605.9613037109375, "loss": 0.262, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5773022174835205, "rewards/margins": 2.349818706512451, "rewards/rejected": -2.9271209239959717, "step": 1021 }, { "epoch": 0.88, "grad_norm": 67.12493341782304, "learning_rate": 9.155927340461111e-08, "logits/chosen": -1.347530722618103, "logits/rejected": -1.2916395664215088, "logps/chosen": -685.7630004882812, "logps/rejected": -955.0648803710938, "loss": 0.4494, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5220239162445068, "rewards/margins": 2.94976806640625, "rewards/rejected": -4.471792221069336, "step": 1022 }, { "epoch": 0.88, "grad_norm": 62.10208865187062, "learning_rate": 9.031146290698277e-08, "logits/chosen": -1.350818157196045, "logits/rejected": -1.2238643169403076, "logps/chosen": -477.2236022949219, "logps/rejected": -798.7186279296875, "loss": 0.2364, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9117076992988586, "rewards/margins": 3.2474284172058105, "rewards/rejected": -4.159135818481445, "step": 1023 }, { "epoch": 0.88, "grad_norm": 76.81999610296886, "learning_rate": 8.907181148888854e-08, "logits/chosen": -1.298819899559021, "logits/rejected": -1.3052400350570679, "logps/chosen": -656.9742431640625, "logps/rejected": -658.60009765625, "loss": 0.3144, "rewards/accuracies": 0.8125, "rewards/chosen": -1.474662184715271, "rewards/margins": 2.117044687271118, "rewards/rejected": -3.5917067527770996, "step": 1024 }, { "epoch": 0.88, "grad_norm": 38.17404499212991, "learning_rate": 8.78403302688755e-08, "logits/chosen": -1.358424186706543, "logits/rejected": -1.2844429016113281, "logps/chosen": -521.257080078125, "logps/rejected": -793.573974609375, "loss": 0.1823, "rewards/accuracies": 1.0, "rewards/chosen": -1.2657744884490967, "rewards/margins": 2.5545406341552734, "rewards/rejected": -3.820315361022949, "step": 1025 }, { "epoch": 0.88, "grad_norm": 57.92634731089754, "learning_rate": 8.661703029221112e-08, "logits/chosen": -1.429605484008789, "logits/rejected": -1.3548343181610107, "logps/chosen": -479.7242431640625, "logps/rejected": -599.1637573242188, "loss": 0.3724, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6530858278274536, "rewards/margins": 1.7444809675216675, "rewards/rejected": -3.397566795349121, "step": 1026 }, { "epoch": 0.88, "grad_norm": 57.762652805798844, "learning_rate": 8.540192253078448e-08, "logits/chosen": -1.3694772720336914, "logits/rejected": -1.3181052207946777, "logps/chosen": -578.649658203125, "logps/rejected": -698.65185546875, "loss": 0.3659, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3504722118377686, "rewards/margins": 2.076638698577881, "rewards/rejected": -3.4271106719970703, "step": 1027 }, { "epoch": 0.88, "grad_norm": 57.605495838687986, "learning_rate": 8.41950178830081e-08, "logits/chosen": -1.3705828189849854, "logits/rejected": -1.3483182191848755, "logps/chosen": -485.2783508300781, "logps/rejected": -574.943603515625, "loss": 0.3538, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2768254280090332, "rewards/margins": 2.3636741638183594, "rewards/rejected": -3.6404995918273926, "step": 1028 }, { "epoch": 0.88, "grad_norm": 42.00448683749464, "learning_rate": 8.299632717371996e-08, "logits/chosen": -1.4272587299346924, "logits/rejected": -1.3087083101272583, "logps/chosen": -408.137451171875, "logps/rejected": -737.99267578125, "loss": 0.1625, "rewards/accuracies": 0.875, "rewards/chosen": -0.9820849895477295, "rewards/margins": 3.0054163932800293, "rewards/rejected": -3.987501621246338, "step": 1029 }, { "epoch": 0.88, "grad_norm": 36.82882855714117, "learning_rate": 8.180586115408627e-08, "logits/chosen": -1.380150318145752, "logits/rejected": -1.309890866279602, "logps/chosen": -352.4360046386719, "logps/rejected": -592.15625, "loss": 0.2671, "rewards/accuracies": 0.875, "rewards/chosen": -0.7464353442192078, "rewards/margins": 2.4291770458221436, "rewards/rejected": -3.175612688064575, "step": 1030 }, { "epoch": 0.88, "grad_norm": 34.92691331449185, "learning_rate": 8.06236305015059e-08, "logits/chosen": -1.3108597993850708, "logits/rejected": -1.2580173015594482, "logps/chosen": -530.5318603515625, "logps/rejected": -834.72314453125, "loss": 0.1618, "rewards/accuracies": 0.875, "rewards/chosen": -1.1678415536880493, "rewards/margins": 3.0013504028320312, "rewards/rejected": -4.169192314147949, "step": 1031 }, { "epoch": 0.89, "grad_norm": 40.819265846962935, "learning_rate": 7.944964581951275e-08, "logits/chosen": -1.344224452972412, "logits/rejected": -1.2919423580169678, "logps/chosen": -646.0286254882812, "logps/rejected": -807.8919067382812, "loss": 0.2051, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2766971588134766, "rewards/margins": 3.460568428039551, "rewards/rejected": -4.737265586853027, "step": 1032 }, { "epoch": 0.89, "grad_norm": 44.888672218587935, "learning_rate": 7.828391763768316e-08, "logits/chosen": -1.3413419723510742, "logits/rejected": -1.2852157354354858, "logps/chosen": -467.3982238769531, "logps/rejected": -574.54833984375, "loss": 0.2917, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5983957648277283, "rewards/margins": 2.3117122650146484, "rewards/rejected": -2.9101080894470215, "step": 1033 }, { "epoch": 0.89, "grad_norm": 57.511637878172635, "learning_rate": 7.71264564115397e-08, "logits/chosen": -1.3855578899383545, "logits/rejected": -1.3349218368530273, "logps/chosen": -506.8746337890625, "logps/rejected": -727.783447265625, "loss": 0.2616, "rewards/accuracies": 0.875, "rewards/chosen": -1.276779294013977, "rewards/margins": 2.763277769088745, "rewards/rejected": -4.040057182312012, "step": 1034 }, { "epoch": 0.89, "grad_norm": 62.57801150274811, "learning_rate": 7.597727252245723e-08, "logits/chosen": -1.348046064376831, "logits/rejected": -1.2998453378677368, "logps/chosen": -576.858154296875, "logps/rejected": -694.7091674804688, "loss": 0.3253, "rewards/accuracies": 0.875, "rewards/chosen": -2.056487560272217, "rewards/margins": 1.7695956230163574, "rewards/rejected": -3.826083183288574, "step": 1035 }, { "epoch": 0.89, "grad_norm": 45.60053488392325, "learning_rate": 7.483637627757166e-08, "logits/chosen": -1.3614377975463867, "logits/rejected": -1.312305212020874, "logps/chosen": -544.572021484375, "logps/rejected": -674.3843994140625, "loss": 0.1863, "rewards/accuracies": 1.0, "rewards/chosen": -1.1244726181030273, "rewards/margins": 2.8420958518981934, "rewards/rejected": -3.9665684700012207, "step": 1036 }, { "epoch": 0.89, "grad_norm": 40.22451424779994, "learning_rate": 7.370377790968496e-08, "logits/chosen": -1.3451738357543945, "logits/rejected": -1.3830194473266602, "logps/chosen": -387.7374572753906, "logps/rejected": -351.87322998046875, "loss": 0.3227, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7732783555984497, "rewards/margins": 1.255499243736267, "rewards/rejected": -2.028777599334717, "step": 1037 }, { "epoch": 0.89, "grad_norm": 36.91936601511155, "learning_rate": 7.257948757717558e-08, "logits/chosen": -1.3813602924346924, "logits/rejected": -1.3303508758544922, "logps/chosen": -609.59375, "logps/rejected": -660.0333251953125, "loss": 0.1944, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2263309955596924, "rewards/margins": 2.387805461883545, "rewards/rejected": -3.6141366958618164, "step": 1038 }, { "epoch": 0.89, "grad_norm": 39.65751850865929, "learning_rate": 7.146351536390605e-08, "logits/chosen": -1.3641141653060913, "logits/rejected": -1.3134207725524902, "logps/chosen": -550.0794677734375, "logps/rejected": -780.9698486328125, "loss": 0.2307, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2107312679290771, "rewards/margins": 2.552530288696289, "rewards/rejected": -3.763261318206787, "step": 1039 }, { "epoch": 0.89, "grad_norm": 34.24417524354166, "learning_rate": 7.0355871279133e-08, "logits/chosen": -1.3492000102996826, "logits/rejected": -1.309326171875, "logps/chosen": -619.298583984375, "logps/rejected": -761.5640869140625, "loss": 0.1531, "rewards/accuracies": 1.0, "rewards/chosen": -0.9659633636474609, "rewards/margins": 2.9570584297180176, "rewards/rejected": -3.9230217933654785, "step": 1040 }, { "epoch": 0.89, "grad_norm": 111.866433260211, "learning_rate": 6.925656525741751e-08, "logits/chosen": -1.362472653388977, "logits/rejected": -1.3299155235290527, "logps/chosen": -527.7230224609375, "logps/rejected": -719.2764282226562, "loss": 0.6964, "rewards/accuracies": 0.6875, "rewards/chosen": -1.815812349319458, "rewards/margins": 1.7427184581756592, "rewards/rejected": -3.558530807495117, "step": 1041 }, { "epoch": 0.89, "grad_norm": 32.95656660320831, "learning_rate": 6.816560715853547e-08, "logits/chosen": -1.3642749786376953, "logits/rejected": -1.2888407707214355, "logps/chosen": -646.0355224609375, "logps/rejected": -815.24609375, "loss": 0.1344, "rewards/accuracies": 1.0, "rewards/chosen": -1.1993162631988525, "rewards/margins": 3.1481070518493652, "rewards/rejected": -4.347423553466797, "step": 1042 }, { "epoch": 0.89, "grad_norm": 78.96166752915579, "learning_rate": 6.708300676738976e-08, "logits/chosen": -1.3494223356246948, "logits/rejected": -1.2774553298950195, "logps/chosen": -482.1902160644531, "logps/rejected": -707.10986328125, "loss": 0.4936, "rewards/accuracies": 0.875, "rewards/chosen": -1.3335380554199219, "rewards/margins": 2.5525479316711426, "rewards/rejected": -3.8860857486724854, "step": 1043 }, { "epoch": 0.9, "grad_norm": 19.09480900466112, "learning_rate": 6.600877379392212e-08, "logits/chosen": -1.4266753196716309, "logits/rejected": -1.3531907796859741, "logps/chosen": -369.3329772949219, "logps/rejected": -586.53564453125, "loss": 0.1982, "rewards/accuracies": 0.875, "rewards/chosen": -0.4353833794593811, "rewards/margins": 2.7501494884490967, "rewards/rejected": -3.185532808303833, "step": 1044 }, { "epoch": 0.9, "grad_norm": 25.09530960091258, "learning_rate": 6.494291787302608e-08, "logits/chosen": -1.388257622718811, "logits/rejected": -1.3475077152252197, "logps/chosen": -452.254150390625, "logps/rejected": -888.9625854492188, "loss": 0.1198, "rewards/accuracies": 1.0, "rewards/chosen": -0.8969854116439819, "rewards/margins": 3.282654285430908, "rewards/rejected": -4.17963981628418, "step": 1045 }, { "epoch": 0.9, "grad_norm": 22.14700396565832, "learning_rate": 6.388544856446065e-08, "logits/chosen": -1.3584980964660645, "logits/rejected": -1.278203010559082, "logps/chosen": -422.3341369628906, "logps/rejected": -690.5770263671875, "loss": 0.1809, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0029441118240356, "rewards/margins": 3.2371349334716797, "rewards/rejected": -4.240078926086426, "step": 1046 }, { "epoch": 0.9, "grad_norm": 67.19354932374553, "learning_rate": 6.283637535276498e-08, "logits/chosen": -1.3828531503677368, "logits/rejected": -1.2909801006317139, "logps/chosen": -482.1180114746094, "logps/rejected": -852.4668579101562, "loss": 0.3232, "rewards/accuracies": 0.9375, "rewards/chosen": -1.214806079864502, "rewards/margins": 3.309603214263916, "rewards/rejected": -4.524409294128418, "step": 1047 }, { "epoch": 0.9, "grad_norm": 47.26993737738886, "learning_rate": 6.179570764717179e-08, "logits/chosen": -1.4263463020324707, "logits/rejected": -1.2946155071258545, "logps/chosen": -424.43975830078125, "logps/rejected": -778.6882934570312, "loss": 0.276, "rewards/accuracies": 0.875, "rewards/chosen": -1.0966931581497192, "rewards/margins": 3.0400359630584717, "rewards/rejected": -4.1367292404174805, "step": 1048 }, { "epoch": 0.9, "grad_norm": 82.17354273413754, "learning_rate": 6.076345478152533e-08, "logits/chosen": -1.319034218788147, "logits/rejected": -1.2957336902618408, "logps/chosen": -635.9437866210938, "logps/rejected": -813.4094848632812, "loss": 0.4827, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7995774745941162, "rewards/margins": 2.410430669784546, "rewards/rejected": -4.210008144378662, "step": 1049 }, { "epoch": 0.9, "grad_norm": 60.38245546238117, "learning_rate": 5.973962601419569e-08, "logits/chosen": -1.383164644241333, "logits/rejected": -1.2681220769882202, "logps/chosen": -520.2368774414062, "logps/rejected": -813.9752807617188, "loss": 0.383, "rewards/accuracies": 0.875, "rewards/chosen": -1.1653625965118408, "rewards/margins": 3.8374204635620117, "rewards/rejected": -5.002782821655273, "step": 1050 }, { "epoch": 0.9, "grad_norm": 54.869719235412234, "learning_rate": 5.872423052799636e-08, "logits/chosen": -1.3539948463439941, "logits/rejected": -1.324246883392334, "logps/chosen": -618.18896484375, "logps/rejected": -643.138427734375, "loss": 0.324, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3248660564422607, "rewards/margins": 2.106539726257324, "rewards/rejected": -3.431406021118164, "step": 1051 }, { "epoch": 0.9, "grad_norm": 79.63429487086702, "learning_rate": 5.771727743010213e-08, "logits/chosen": -1.3471636772155762, "logits/rejected": -1.3674094676971436, "logps/chosen": -564.186279296875, "logps/rejected": -493.5242919921875, "loss": 0.6197, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8813424110412598, "rewards/margins": 0.9798173308372498, "rewards/rejected": -2.8611598014831543, "step": 1052 }, { "epoch": 0.9, "grad_norm": 56.72483919511824, "learning_rate": 5.6718775751967486e-08, "logits/chosen": -1.3621277809143066, "logits/rejected": -1.3243482112884521, "logps/chosen": -392.24176025390625, "logps/rejected": -494.3200378417969, "loss": 0.2933, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9714163541793823, "rewards/margins": 2.1407787799835205, "rewards/rejected": -3.1121950149536133, "step": 1053 }, { "epoch": 0.9, "grad_norm": 71.94034161177133, "learning_rate": 5.5728734449244865e-08, "logits/chosen": -1.309327483177185, "logits/rejected": -1.2972667217254639, "logps/chosen": -524.4185791015625, "logps/rejected": -532.0087890625, "loss": 0.374, "rewards/accuracies": 0.75, "rewards/chosen": -0.9590820074081421, "rewards/margins": 1.8255786895751953, "rewards/rejected": -2.784660816192627, "step": 1054 }, { "epoch": 0.9, "grad_norm": 41.31298907116539, "learning_rate": 5.4747162401705295e-08, "logits/chosen": -1.336936354637146, "logits/rejected": -1.2083768844604492, "logps/chosen": -544.3446655273438, "logps/rejected": -971.6831665039062, "loss": 0.1877, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7546546459197998, "rewards/margins": 3.293325901031494, "rewards/rejected": -5.047980785369873, "step": 1055 }, { "epoch": 0.91, "grad_norm": 40.80957672653119, "learning_rate": 5.377406841315801e-08, "logits/chosen": -1.3370170593261719, "logits/rejected": -1.299474835395813, "logps/chosen": -566.1239013671875, "logps/rejected": -749.313232421875, "loss": 0.1484, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3078569173812866, "rewards/margins": 2.86323881149292, "rewards/rejected": -4.171095848083496, "step": 1056 }, { "epoch": 0.91, "grad_norm": 48.62880251246643, "learning_rate": 5.280946121137186e-08, "logits/chosen": -1.4300320148468018, "logits/rejected": -1.3434855937957764, "logps/chosen": -404.80902099609375, "logps/rejected": -617.2060546875, "loss": 0.2349, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1375755071640015, "rewards/margins": 2.7895593643188477, "rewards/rejected": -3.9271347522735596, "step": 1057 }, { "epoch": 0.91, "grad_norm": 32.04470778039381, "learning_rate": 5.185334944799691e-08, "logits/chosen": -1.2941210269927979, "logits/rejected": -1.246250867843628, "logps/chosen": -477.75714111328125, "logps/rejected": -722.9541625976562, "loss": 0.1269, "rewards/accuracies": 1.0, "rewards/chosen": -0.9855974912643433, "rewards/margins": 2.813016891479492, "rewards/rejected": -3.798614263534546, "step": 1058 }, { "epoch": 0.91, "grad_norm": 60.862123746447736, "learning_rate": 5.0905741698486714e-08, "logits/chosen": -1.3242413997650146, "logits/rejected": -1.3037867546081543, "logps/chosen": -564.5498046875, "logps/rejected": -639.8648681640625, "loss": 0.3195, "rewards/accuracies": 0.875, "rewards/chosen": -1.3947079181671143, "rewards/margins": 1.843632459640503, "rewards/rejected": -3.238340377807617, "step": 1059 }, { "epoch": 0.91, "grad_norm": 66.02784035664585, "learning_rate": 4.996664646202176e-08, "logits/chosen": -1.3540661334991455, "logits/rejected": -1.2995530366897583, "logps/chosen": -457.6804504394531, "logps/rejected": -669.781005859375, "loss": 0.2487, "rewards/accuracies": 0.875, "rewards/chosen": -1.1613222360610962, "rewards/margins": 2.4262075424194336, "rewards/rejected": -3.5875296592712402, "step": 1060 }, { "epoch": 0.91, "grad_norm": 37.626471741124945, "learning_rate": 4.903607216143302e-08, "logits/chosen": -1.3499476909637451, "logits/rejected": -1.3001892566680908, "logps/chosen": -513.4680786132812, "logps/rejected": -740.8880615234375, "loss": 0.159, "rewards/accuracies": 1.0, "rewards/chosen": -1.5764918327331543, "rewards/margins": 2.7328219413757324, "rewards/rejected": -4.309313774108887, "step": 1061 }, { "epoch": 0.91, "grad_norm": 60.42659277512601, "learning_rate": 4.811402714312629e-08, "logits/chosen": -1.3405532836914062, "logits/rejected": -1.2848247289657593, "logps/chosen": -594.1248779296875, "logps/rejected": -725.2188720703125, "loss": 0.3342, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9339052438735962, "rewards/margins": 1.9501739740371704, "rewards/rejected": -3.8840792179107666, "step": 1062 }, { "epoch": 0.91, "grad_norm": 217.25995878770206, "learning_rate": 4.720051967700767e-08, "logits/chosen": -1.366823673248291, "logits/rejected": -1.2912871837615967, "logps/chosen": -408.3677062988281, "logps/rejected": -631.322998046875, "loss": 0.3042, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2799038887023926, "rewards/margins": 2.1277143955230713, "rewards/rejected": -3.407618284225464, "step": 1063 }, { "epoch": 0.91, "grad_norm": 84.30541015310905, "learning_rate": 4.629555795640872e-08, "logits/chosen": -1.3612558841705322, "logits/rejected": -1.2869057655334473, "logps/chosen": -506.45849609375, "logps/rejected": -751.8193359375, "loss": 0.52, "rewards/accuracies": 0.875, "rewards/chosen": -1.497196912765503, "rewards/margins": 2.4841039180755615, "rewards/rejected": -3.9813010692596436, "step": 1064 }, { "epoch": 0.91, "grad_norm": 81.58503198700524, "learning_rate": 4.539915009801376e-08, "logits/chosen": -1.411940574645996, "logits/rejected": -1.403637409210205, "logps/chosen": -568.0249633789062, "logps/rejected": -625.3397216796875, "loss": 0.2493, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9405019283294678, "rewards/margins": 2.0997798442840576, "rewards/rejected": -3.0402820110321045, "step": 1065 }, { "epoch": 0.91, "grad_norm": 52.81874353691749, "learning_rate": 4.4511304141787054e-08, "logits/chosen": -1.391019582748413, "logits/rejected": -1.3759312629699707, "logps/chosen": -340.2884521484375, "logps/rejected": -414.40118408203125, "loss": 0.3076, "rewards/accuracies": 0.875, "rewards/chosen": -0.9816931486129761, "rewards/margins": 1.8901349306106567, "rewards/rejected": -2.871828079223633, "step": 1066 }, { "epoch": 0.92, "grad_norm": 57.871120470857946, "learning_rate": 4.363202805089972e-08, "logits/chosen": -1.3674225807189941, "logits/rejected": -1.3060979843139648, "logps/chosen": -414.9110107421875, "logps/rejected": -597.43505859375, "loss": 0.3054, "rewards/accuracies": 0.875, "rewards/chosen": -1.1439032554626465, "rewards/margins": 1.9327505826950073, "rewards/rejected": -3.0766537189483643, "step": 1067 }, { "epoch": 0.92, "grad_norm": 39.46963037255718, "learning_rate": 4.276132971165936e-08, "logits/chosen": -1.3694632053375244, "logits/rejected": -1.3286242485046387, "logps/chosen": -538.0938720703125, "logps/rejected": -737.7407836914062, "loss": 0.1362, "rewards/accuracies": 1.0, "rewards/chosen": -1.7101688385009766, "rewards/margins": 2.513315200805664, "rewards/rejected": -4.223484039306641, "step": 1068 }, { "epoch": 0.92, "grad_norm": 31.99320375940036, "learning_rate": 4.18992169334389e-08, "logits/chosen": -1.329564094543457, "logits/rejected": -1.255440592765808, "logps/chosen": -491.8975524902344, "logps/rejected": -717.998046875, "loss": 0.1948, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9839257001876831, "rewards/margins": 2.8212757110595703, "rewards/rejected": -3.805201530456543, "step": 1069 }, { "epoch": 0.92, "grad_norm": 46.43656004948136, "learning_rate": 4.104569744860642e-08, "logits/chosen": -1.3599036931991577, "logits/rejected": -1.3458278179168701, "logps/chosen": -497.247314453125, "logps/rejected": -589.5523681640625, "loss": 0.3527, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3754032850265503, "rewards/margins": 1.8670722246170044, "rewards/rejected": -3.2424755096435547, "step": 1070 }, { "epoch": 0.92, "grad_norm": 59.26817635134149, "learning_rate": 4.020077891245621e-08, "logits/chosen": -1.314011812210083, "logits/rejected": -1.30635404586792, "logps/chosen": -509.9771728515625, "logps/rejected": -611.1748657226562, "loss": 0.3302, "rewards/accuracies": 0.875, "rewards/chosen": -1.4132881164550781, "rewards/margins": 1.814769983291626, "rewards/rejected": -3.228058099746704, "step": 1071 }, { "epoch": 0.92, "grad_norm": 55.05893563291395, "learning_rate": 3.9364468903139825e-08, "logits/chosen": -1.3378361463546753, "logits/rejected": -1.3174649477005005, "logps/chosen": -666.12109375, "logps/rejected": -693.6858520507812, "loss": 0.2653, "rewards/accuracies": 0.9375, "rewards/chosen": -1.633272409439087, "rewards/margins": 2.0344698429107666, "rewards/rejected": -3.6677422523498535, "step": 1072 }, { "epoch": 0.92, "grad_norm": 66.25089057476616, "learning_rate": 3.85367749215979e-08, "logits/chosen": -1.3286153078079224, "logits/rejected": -1.3204158544540405, "logps/chosen": -591.8280029296875, "logps/rejected": -673.842529296875, "loss": 0.3108, "rewards/accuracies": 0.875, "rewards/chosen": -1.0493413209915161, "rewards/margins": 2.0450916290283203, "rewards/rejected": -3.094432830810547, "step": 1073 }, { "epoch": 0.92, "grad_norm": 46.524895866045775, "learning_rate": 3.7717704391493466e-08, "logits/chosen": -1.3015832901000977, "logits/rejected": -1.2925136089324951, "logps/chosen": -576.459716796875, "logps/rejected": -705.7184448242188, "loss": 0.2449, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1967111825942993, "rewards/margins": 2.4815011024475098, "rewards/rejected": -3.6782124042510986, "step": 1074 }, { "epoch": 0.92, "grad_norm": 67.7835331038535, "learning_rate": 3.6907264659144846e-08, "logits/chosen": -1.3371295928955078, "logits/rejected": -1.2966009378433228, "logps/chosen": -502.981201171875, "logps/rejected": -667.0936889648438, "loss": 0.306, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9223822355270386, "rewards/margins": 2.2916440963745117, "rewards/rejected": -3.2140259742736816, "step": 1075 }, { "epoch": 0.92, "grad_norm": 98.2426503459349, "learning_rate": 3.6105462993459956e-08, "logits/chosen": -1.359288215637207, "logits/rejected": -1.3329675197601318, "logps/chosen": -543.23291015625, "logps/rejected": -600.3795166015625, "loss": 0.6025, "rewards/accuracies": 0.75, "rewards/chosen": -1.5733633041381836, "rewards/margins": 1.2107982635498047, "rewards/rejected": -2.7841615676879883, "step": 1076 }, { "epoch": 0.92, "grad_norm": 62.82469312316676, "learning_rate": 3.531230658587114e-08, "logits/chosen": -1.3349018096923828, "logits/rejected": -1.284590482711792, "logps/chosen": -522.4531860351562, "logps/rejected": -633.3223876953125, "loss": 0.2956, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2021509408950806, "rewards/margins": 2.3413257598876953, "rewards/rejected": -3.5434770584106445, "step": 1077 }, { "epoch": 0.92, "grad_norm": 63.67005318641801, "learning_rate": 3.452780255027066e-08, "logits/chosen": -1.4124021530151367, "logits/rejected": -1.3744122982025146, "logps/chosen": -489.9981689453125, "logps/rejected": -560.9869995117188, "loss": 0.3767, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2464032173156738, "rewards/margins": 1.5017939805984497, "rewards/rejected": -2.748197317123413, "step": 1078 }, { "epoch": 0.93, "grad_norm": 82.83531057954863, "learning_rate": 3.375195792294694e-08, "logits/chosen": -1.2610230445861816, "logits/rejected": -1.2093844413757324, "logps/chosen": -511.044677734375, "logps/rejected": -661.511474609375, "loss": 0.5594, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3683613538742065, "rewards/margins": 1.7111997604370117, "rewards/rejected": -3.079561233520508, "step": 1079 }, { "epoch": 0.93, "grad_norm": 63.97628020971092, "learning_rate": 3.298477966252089e-08, "logits/chosen": -1.346972942352295, "logits/rejected": -1.232084035873413, "logps/chosen": -533.18408203125, "logps/rejected": -757.748291015625, "loss": 0.2487, "rewards/accuracies": 0.875, "rewards/chosen": -1.215575933456421, "rewards/margins": 2.8929176330566406, "rewards/rejected": -4.108493804931641, "step": 1080 }, { "epoch": 0.93, "grad_norm": 40.72641388442564, "learning_rate": 3.222627464988459e-08, "logits/chosen": -1.4029819965362549, "logits/rejected": -1.268467903137207, "logps/chosen": -555.34033203125, "logps/rejected": -960.1851806640625, "loss": 0.2085, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8269891142845154, "rewards/margins": 4.098667621612549, "rewards/rejected": -4.925657272338867, "step": 1081 }, { "epoch": 0.93, "grad_norm": 56.2349025954042, "learning_rate": 3.1476449688138895e-08, "logits/chosen": -1.3948179483413696, "logits/rejected": -1.303943157196045, "logps/chosen": -385.4075012207031, "logps/rejected": -645.4246826171875, "loss": 0.2414, "rewards/accuracies": 0.875, "rewards/chosen": -0.961907148361206, "rewards/margins": 2.621835947036743, "rewards/rejected": -3.583743095397949, "step": 1082 }, { "epoch": 0.93, "grad_norm": 71.31375486781268, "learning_rate": 3.073531150253217e-08, "logits/chosen": -1.3443121910095215, "logits/rejected": -1.2920128107070923, "logps/chosen": -523.985107421875, "logps/rejected": -646.9969482421875, "loss": 0.4286, "rewards/accuracies": 0.875, "rewards/chosen": -1.0373486280441284, "rewards/margins": 2.374429225921631, "rewards/rejected": -3.411777973175049, "step": 1083 }, { "epoch": 0.93, "grad_norm": 77.27680706949431, "learning_rate": 3.0002866740400424e-08, "logits/chosen": -1.4241924285888672, "logits/rejected": -1.3850486278533936, "logps/chosen": -329.6954040527344, "logps/rejected": -459.6412353515625, "loss": 0.6585, "rewards/accuracies": 0.75, "rewards/chosen": -1.0908865928649902, "rewards/margins": 0.8222720623016357, "rewards/rejected": -1.913158655166626, "step": 1084 }, { "epoch": 0.93, "grad_norm": 66.59633314582754, "learning_rate": 2.9279121971107712e-08, "logits/chosen": -1.328117847442627, "logits/rejected": -1.2686291933059692, "logps/chosen": -499.4151611328125, "logps/rejected": -712.1885986328125, "loss": 0.3499, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1894209384918213, "rewards/margins": 2.26012921333313, "rewards/rejected": -3.449550151824951, "step": 1085 }, { "epoch": 0.93, "grad_norm": 34.118069670094165, "learning_rate": 2.8564083685986838e-08, "logits/chosen": -1.4095797538757324, "logits/rejected": -1.3639633655548096, "logps/chosen": -383.1153564453125, "logps/rejected": -568.6919555664062, "loss": 0.2055, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0487778186798096, "rewards/margins": 1.981377124786377, "rewards/rejected": -3.0301549434661865, "step": 1086 }, { "epoch": 0.93, "grad_norm": 86.39325868481784, "learning_rate": 2.785775829828152e-08, "logits/chosen": -1.3580666780471802, "logits/rejected": -1.2918117046356201, "logps/chosen": -563.025634765625, "logps/rejected": -764.1182250976562, "loss": 0.5369, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4875305891036987, "rewards/margins": 2.3983347415924072, "rewards/rejected": -3.8858656883239746, "step": 1087 }, { "epoch": 0.93, "grad_norm": 30.38437626026005, "learning_rate": 2.7160152143088533e-08, "logits/chosen": -1.3298563957214355, "logits/rejected": -1.2902374267578125, "logps/chosen": -627.9073486328125, "logps/rejected": -791.8634643554688, "loss": 0.126, "rewards/accuracies": 1.0, "rewards/chosen": -1.1206411123275757, "rewards/margins": 3.556591033935547, "rewards/rejected": -4.677231788635254, "step": 1088 }, { "epoch": 0.93, "grad_norm": 108.26552979078068, "learning_rate": 2.6471271477301328e-08, "logits/chosen": -1.32789945602417, "logits/rejected": -1.2792022228240967, "logps/chosen": -668.3905029296875, "logps/rejected": -888.897705078125, "loss": 0.6575, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9736433029174805, "rewards/margins": 2.0397772789001465, "rewards/rejected": -4.013420581817627, "step": 1089 }, { "epoch": 0.93, "grad_norm": 113.96211501772365, "learning_rate": 2.5791122479553505e-08, "logits/chosen": -1.3528571128845215, "logits/rejected": -1.3356266021728516, "logps/chosen": -504.17742919921875, "logps/rejected": -588.0662841796875, "loss": 0.6183, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3590521812438965, "rewards/margins": 1.5391099452972412, "rewards/rejected": -2.8981621265411377, "step": 1090 }, { "epoch": 0.94, "grad_norm": 62.84055237165941, "learning_rate": 2.5119711250163323e-08, "logits/chosen": -1.3433167934417725, "logits/rejected": -1.3004448413848877, "logps/chosen": -630.4215087890625, "logps/rejected": -844.3784790039062, "loss": 0.2892, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4850654602050781, "rewards/margins": 2.3362314701080322, "rewards/rejected": -3.8212971687316895, "step": 1091 }, { "epoch": 0.94, "grad_norm": 49.72033236294848, "learning_rate": 2.445704381107949e-08, "logits/chosen": -1.3576364517211914, "logits/rejected": -1.2695292234420776, "logps/chosen": -506.31060791015625, "logps/rejected": -802.626953125, "loss": 0.267, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1525599956512451, "rewards/margins": 3.1654715538024902, "rewards/rejected": -4.318031311035156, "step": 1092 }, { "epoch": 0.94, "grad_norm": 35.0255603595314, "learning_rate": 2.380312610582691e-08, "logits/chosen": -1.3337469100952148, "logits/rejected": -1.279638648033142, "logps/chosen": -454.1136474609375, "logps/rejected": -615.020263671875, "loss": 0.2116, "rewards/accuracies": 0.9375, "rewards/chosen": -0.780484676361084, "rewards/margins": 2.418971061706543, "rewards/rejected": -3.199455738067627, "step": 1093 }, { "epoch": 0.94, "grad_norm": 46.86951995562813, "learning_rate": 2.31579639994528e-08, "logits/chosen": -1.363749384880066, "logits/rejected": -1.3308840990066528, "logps/chosen": -592.7799072265625, "logps/rejected": -634.8521728515625, "loss": 0.1967, "rewards/accuracies": 0.875, "rewards/chosen": -1.1438612937927246, "rewards/margins": 2.781632900238037, "rewards/rejected": -3.9254941940307617, "step": 1094 }, { "epoch": 0.94, "grad_norm": 90.86907239722963, "learning_rate": 2.252156327847543e-08, "logits/chosen": -1.3248136043548584, "logits/rejected": -1.3026912212371826, "logps/chosen": -611.6460571289062, "logps/rejected": -674.7425537109375, "loss": 0.7975, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9554386138916016, "rewards/margins": 1.4960346221923828, "rewards/rejected": -3.4514732360839844, "step": 1095 }, { "epoch": 0.94, "grad_norm": 55.03351828353279, "learning_rate": 2.189392965083059e-08, "logits/chosen": -1.3430871963500977, "logits/rejected": -1.294329047203064, "logps/chosen": -702.509521484375, "logps/rejected": -968.4942626953125, "loss": 0.1352, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1937084197998047, "rewards/margins": 4.060674667358398, "rewards/rejected": -5.254383087158203, "step": 1096 }, { "epoch": 0.94, "grad_norm": 92.13970354226072, "learning_rate": 2.1275068745821743e-08, "logits/chosen": -1.4347259998321533, "logits/rejected": -1.3344197273254395, "logps/chosen": -487.1264953613281, "logps/rejected": -683.416748046875, "loss": 0.7083, "rewards/accuracies": 0.875, "rewards/chosen": -1.7204651832580566, "rewards/margins": 1.733154535293579, "rewards/rejected": -3.4536197185516357, "step": 1097 }, { "epoch": 0.94, "grad_norm": 34.39366187826271, "learning_rate": 2.0664986114068973e-08, "logits/chosen": -1.3739490509033203, "logits/rejected": -1.3129985332489014, "logps/chosen": -410.9372863769531, "logps/rejected": -514.8299560546875, "loss": 0.2945, "rewards/accuracies": 0.8125, "rewards/chosen": -0.940898060798645, "rewards/margins": 1.9497122764587402, "rewards/rejected": -2.8906102180480957, "step": 1098 }, { "epoch": 0.94, "grad_norm": 35.32745819666828, "learning_rate": 2.0063687227458882e-08, "logits/chosen": -1.345841407775879, "logits/rejected": -1.2993870973587036, "logps/chosen": -627.6751098632812, "logps/rejected": -769.1766357421875, "loss": 0.16, "rewards/accuracies": 1.0, "rewards/chosen": -1.2006680965423584, "rewards/margins": 2.500814437866211, "rewards/rejected": -3.7014827728271484, "step": 1099 }, { "epoch": 0.94, "grad_norm": 42.89372340874667, "learning_rate": 1.9471177479096102e-08, "logits/chosen": -1.2868573665618896, "logits/rejected": -1.2914104461669922, "logps/chosen": -471.10211181640625, "logps/rejected": -511.0739440917969, "loss": 0.2658, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3655877113342285, "rewards/margins": 1.5534907579421997, "rewards/rejected": -2.9190783500671387, "step": 1100 }, { "epoch": 0.94, "grad_norm": 66.93499212181665, "learning_rate": 1.8887462183254877e-08, "logits/chosen": -1.2483763694763184, "logits/rejected": -1.217915415763855, "logps/chosen": -709.8128662109375, "logps/rejected": -865.47607421875, "loss": 0.3359, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7271713018417358, "rewards/margins": 2.937894821166992, "rewards/rejected": -4.665066242218018, "step": 1101 }, { "epoch": 0.95, "grad_norm": 57.77362074884795, "learning_rate": 1.831254657533077e-08, "logits/chosen": -1.3025121688842773, "logits/rejected": -1.2883214950561523, "logps/chosen": -581.310546875, "logps/rejected": -696.29638671875, "loss": 0.2503, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9122763276100159, "rewards/margins": 2.7274765968322754, "rewards/rejected": -3.6397528648376465, "step": 1102 }, { "epoch": 0.95, "grad_norm": 38.67736601897149, "learning_rate": 1.7746435811794357e-08, "logits/chosen": -1.3677195310592651, "logits/rejected": -1.3084774017333984, "logps/chosen": -369.76910400390625, "logps/rejected": -629.9393310546875, "loss": 0.162, "rewards/accuracies": 1.0, "rewards/chosen": -0.6241545677185059, "rewards/margins": 2.79376482963562, "rewards/rejected": -3.417919635772705, "step": 1103 }, { "epoch": 0.95, "grad_norm": 37.541347259643885, "learning_rate": 1.7189134970144847e-08, "logits/chosen": -1.3679184913635254, "logits/rejected": -1.3461575508117676, "logps/chosen": -533.3414306640625, "logps/rejected": -686.5682983398438, "loss": 0.1703, "rewards/accuracies": 1.0, "rewards/chosen": -1.156436562538147, "rewards/margins": 2.7757630348205566, "rewards/rejected": -3.932199478149414, "step": 1104 }, { "epoch": 0.95, "grad_norm": 53.39127411417644, "learning_rate": 1.664064904886431e-08, "logits/chosen": -1.3563721179962158, "logits/rejected": -1.312720775604248, "logps/chosen": -565.919921875, "logps/rejected": -785.633056640625, "loss": 0.2198, "rewards/accuracies": 0.9375, "rewards/chosen": -1.468775987625122, "rewards/margins": 2.6510097980499268, "rewards/rejected": -4.119785785675049, "step": 1105 }, { "epoch": 0.95, "grad_norm": 40.875245401109694, "learning_rate": 1.6100982967373056e-08, "logits/chosen": -1.3901944160461426, "logits/rejected": -1.314322590827942, "logps/chosen": -478.9412536621094, "logps/rejected": -664.1520385742188, "loss": 0.2461, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2784733772277832, "rewards/margins": 2.814133644104004, "rewards/rejected": -4.092606544494629, "step": 1106 }, { "epoch": 0.95, "grad_norm": 46.94616916655286, "learning_rate": 1.557014156598535e-08, "logits/chosen": -1.3899496793746948, "logits/rejected": -1.3239576816558838, "logps/chosen": -492.5782775878906, "logps/rejected": -698.148681640625, "loss": 0.2926, "rewards/accuracies": 0.875, "rewards/chosen": -0.575579047203064, "rewards/margins": 3.199647903442383, "rewards/rejected": -3.7752270698547363, "step": 1107 }, { "epoch": 0.95, "grad_norm": 88.45774590135527, "learning_rate": 1.5048129605866433e-08, "logits/chosen": -1.2571226358413696, "logits/rejected": -1.2514090538024902, "logps/chosen": -723.9155883789062, "logps/rejected": -808.6048583984375, "loss": 0.4931, "rewards/accuracies": 0.875, "rewards/chosen": -1.6201335191726685, "rewards/margins": 2.539370059967041, "rewards/rejected": -4.159502983093262, "step": 1108 }, { "epoch": 0.95, "grad_norm": 29.18465847959277, "learning_rate": 1.4534951768989e-08, "logits/chosen": -1.346365213394165, "logits/rejected": -1.2711735963821411, "logps/chosen": -389.49078369140625, "logps/rejected": -583.47802734375, "loss": 0.1784, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0953283309936523, "rewards/margins": 2.546407699584961, "rewards/rejected": -3.6417360305786133, "step": 1109 }, { "epoch": 0.95, "grad_norm": 28.249185451400095, "learning_rate": 1.403061265809191e-08, "logits/chosen": -1.3683123588562012, "logits/rejected": -1.2581026554107666, "logps/chosen": -507.8565979003906, "logps/rejected": -826.8822021484375, "loss": 0.1604, "rewards/accuracies": 0.9375, "rewards/chosen": -1.20506751537323, "rewards/margins": 3.6192493438720703, "rewards/rejected": -4.824316501617432, "step": 1110 }, { "epoch": 0.95, "grad_norm": 85.02557735435447, "learning_rate": 1.3535116796638767e-08, "logits/chosen": -1.340364933013916, "logits/rejected": -1.3544952869415283, "logps/chosen": -616.94091796875, "logps/rejected": -598.183837890625, "loss": 0.4654, "rewards/accuracies": 0.75, "rewards/chosen": -1.400769591331482, "rewards/margins": 1.4828534126281738, "rewards/rejected": -2.8836231231689453, "step": 1111 }, { "epoch": 0.95, "grad_norm": 72.5093628753269, "learning_rate": 1.3048468628777398e-08, "logits/chosen": -1.3603501319885254, "logits/rejected": -1.28261137008667, "logps/chosen": -603.1817626953125, "logps/rejected": -794.6780395507812, "loss": 0.6567, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2409727573394775, "rewards/margins": 2.7074697017669678, "rewards/rejected": -3.9484424591064453, "step": 1112 }, { "epoch": 0.95, "grad_norm": 62.751686560930395, "learning_rate": 1.2570672519299108e-08, "logits/chosen": -1.385198712348938, "logits/rejected": -1.3630871772766113, "logps/chosen": -443.33050537109375, "logps/rejected": -560.1493530273438, "loss": 0.3856, "rewards/accuracies": 0.875, "rewards/chosen": -1.240035891532898, "rewards/margins": 2.105348587036133, "rewards/rejected": -3.3453845977783203, "step": 1113 }, { "epoch": 0.96, "grad_norm": 45.999675175415945, "learning_rate": 1.2101732753601379e-08, "logits/chosen": -1.3332526683807373, "logits/rejected": -1.305140495300293, "logps/chosen": -409.5257568359375, "logps/rejected": -439.9317626953125, "loss": 0.3374, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9548590183258057, "rewards/margins": 1.4134576320648193, "rewards/rejected": -2.368316650390625, "step": 1114 }, { "epoch": 0.96, "grad_norm": 59.87516963513788, "learning_rate": 1.1641653537647456e-08, "logits/chosen": -1.2905126810073853, "logits/rejected": -1.2474026679992676, "logps/chosen": -825.4317626953125, "logps/rejected": -992.7080688476562, "loss": 0.2083, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1057610511779785, "rewards/margins": 3.1582183837890625, "rewards/rejected": -5.263979911804199, "step": 1115 }, { "epoch": 0.96, "grad_norm": 63.899841777084106, "learning_rate": 1.119043899792993e-08, "logits/chosen": -1.3253779411315918, "logits/rejected": -1.2959191799163818, "logps/chosen": -440.66455078125, "logps/rejected": -645.551513671875, "loss": 0.3568, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3789682388305664, "rewards/margins": 2.5655908584594727, "rewards/rejected": -3.944559097290039, "step": 1116 }, { "epoch": 0.96, "grad_norm": 31.935937833275386, "learning_rate": 1.0748093181433216e-08, "logits/chosen": -1.360658049583435, "logits/rejected": -1.3096072673797607, "logps/chosen": -497.0452575683594, "logps/rejected": -653.573974609375, "loss": 0.2061, "rewards/accuracies": 1.0, "rewards/chosen": -1.161726474761963, "rewards/margins": 2.291126251220703, "rewards/rejected": -3.452852487564087, "step": 1117 }, { "epoch": 0.96, "grad_norm": 74.0455033197263, "learning_rate": 1.0314620055597246e-08, "logits/chosen": -1.32560396194458, "logits/rejected": -1.2965167760849, "logps/chosen": -603.4779052734375, "logps/rejected": -789.289794921875, "loss": 0.3315, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2657091617584229, "rewards/margins": 2.866349697113037, "rewards/rejected": -4.132059097290039, "step": 1118 }, { "epoch": 0.96, "grad_norm": 48.797521802242606, "learning_rate": 9.890023508282165e-09, "logits/chosen": -1.3630735874176025, "logits/rejected": -1.3144967555999756, "logps/chosen": -476.443115234375, "logps/rejected": -639.4280395507812, "loss": 0.2791, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7719219326972961, "rewards/margins": 2.7206759452819824, "rewards/rejected": -3.492598056793213, "step": 1119 }, { "epoch": 0.96, "grad_norm": 75.32071662583112, "learning_rate": 9.474307347733024e-09, "logits/chosen": -1.3259435892105103, "logits/rejected": -1.2706059217453003, "logps/chosen": -482.0497741699219, "logps/rejected": -624.172607421875, "loss": 0.466, "rewards/accuracies": 0.75, "rewards/chosen": -1.3574128150939941, "rewards/margins": 1.9411921501159668, "rewards/rejected": -3.298604965209961, "step": 1120 }, { "epoch": 0.96, "grad_norm": 21.492431931667138, "learning_rate": 9.067475302546147e-09, "logits/chosen": -1.2945029735565186, "logits/rejected": -1.2453341484069824, "logps/chosen": -823.6737060546875, "logps/rejected": -1095.56640625, "loss": 0.0689, "rewards/accuracies": 1.0, "rewards/chosen": -1.8724188804626465, "rewards/margins": 3.2580301761627197, "rewards/rejected": -5.130449295043945, "step": 1121 }, { "epoch": 0.96, "grad_norm": 18.955868304864357, "learning_rate": 8.669531021635257e-09, "logits/chosen": -1.3908123970031738, "logits/rejected": -1.3216614723205566, "logps/chosen": -368.7554016113281, "logps/rejected": -699.5101928710938, "loss": 0.2035, "rewards/accuracies": 0.875, "rewards/chosen": -0.7341961860656738, "rewards/margins": 3.5546658039093018, "rewards/rejected": -4.288862228393555, "step": 1122 }, { "epoch": 0.96, "grad_norm": 54.82451185740064, "learning_rate": 8.28047807419885e-09, "logits/chosen": -1.3758102655410767, "logits/rejected": -1.298590898513794, "logps/chosen": -530.2117919921875, "logps/rejected": -753.3561401367188, "loss": 0.2113, "rewards/accuracies": 0.875, "rewards/chosen": -1.0154526233673096, "rewards/margins": 3.282680034637451, "rewards/rejected": -4.29813289642334, "step": 1123 }, { "epoch": 0.96, "grad_norm": 23.3833093803656, "learning_rate": 7.900319949688427e-09, "logits/chosen": -1.3352243900299072, "logits/rejected": -1.2280926704406738, "logps/chosen": -428.10418701171875, "logps/rejected": -751.018310546875, "loss": 0.1816, "rewards/accuracies": 0.875, "rewards/chosen": -0.796173632144928, "rewards/margins": 3.192791223526001, "rewards/rejected": -3.9889650344848633, "step": 1124 }, { "epoch": 0.96, "grad_norm": 42.988311388156085, "learning_rate": 7.529060057776981e-09, "logits/chosen": -1.351731300354004, "logits/rejected": -1.2667642831802368, "logps/chosen": -568.9866333007812, "logps/rejected": -843.7405395507812, "loss": 0.1532, "rewards/accuracies": 1.0, "rewards/chosen": -1.3037387132644653, "rewards/margins": 2.6970062255859375, "rewards/rejected": -4.0007452964782715, "step": 1125 }, { "epoch": 0.97, "grad_norm": 58.589929396690955, "learning_rate": 7.1667017283281176e-09, "logits/chosen": -1.2922766208648682, "logits/rejected": -1.253503680229187, "logps/chosen": -500.7716979980469, "logps/rejected": -637.717529296875, "loss": 0.3383, "rewards/accuracies": 0.875, "rewards/chosen": -1.3333024978637695, "rewards/margins": 2.46457839012146, "rewards/rejected": -3.7978806495666504, "step": 1126 }, { "epoch": 0.97, "grad_norm": 62.986156527543464, "learning_rate": 6.813248211366973e-09, "logits/chosen": -1.3930134773254395, "logits/rejected": -1.2740166187286377, "logps/chosen": -618.4913940429688, "logps/rejected": -840.85888671875, "loss": 0.2634, "rewards/accuracies": 0.875, "rewards/chosen": -1.4361460208892822, "rewards/margins": 2.6445565223693848, "rewards/rejected": -4.080702781677246, "step": 1127 }, { "epoch": 0.97, "grad_norm": 32.701636879677764, "learning_rate": 6.468702677050464e-09, "logits/chosen": -1.2866826057434082, "logits/rejected": -1.1640338897705078, "logps/chosen": -562.0260009765625, "logps/rejected": -916.1024169921875, "loss": 0.0994, "rewards/accuracies": 1.0, "rewards/chosen": -1.3056342601776123, "rewards/margins": 3.7452681064605713, "rewards/rejected": -5.050902366638184, "step": 1128 }, { "epoch": 0.97, "grad_norm": 79.45710287733499, "learning_rate": 6.133068215638748e-09, "logits/chosen": -1.354970932006836, "logits/rejected": -1.2986483573913574, "logps/chosen": -446.92877197265625, "logps/rejected": -665.33203125, "loss": 0.4294, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3372687101364136, "rewards/margins": 2.512016773223877, "rewards/rejected": -3.84928560256958, "step": 1129 }, { "epoch": 0.97, "grad_norm": 49.14039347193333, "learning_rate": 5.8063478374680285e-09, "logits/chosen": -1.319446325302124, "logits/rejected": -1.3057183027267456, "logps/chosen": -609.715087890625, "logps/rejected": -698.795654296875, "loss": 0.1921, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2187325954437256, "rewards/margins": 2.7987396717071533, "rewards/rejected": -4.017472267150879, "step": 1130 }, { "epoch": 0.97, "grad_norm": 66.63403806754211, "learning_rate": 5.48854447292324e-09, "logits/chosen": -1.3315446376800537, "logits/rejected": -1.2545630931854248, "logps/chosen": -419.51055908203125, "logps/rejected": -618.6432495117188, "loss": 0.4235, "rewards/accuracies": 0.875, "rewards/chosen": -1.0476508140563965, "rewards/margins": 2.285956382751465, "rewards/rejected": -3.3336071968078613, "step": 1131 }, { "epoch": 0.97, "grad_norm": 57.07860454385766, "learning_rate": 5.179660972411848e-09, "logits/chosen": -1.383811116218567, "logits/rejected": -1.3193073272705078, "logps/chosen": -589.6669311523438, "logps/rejected": -812.602783203125, "loss": 0.2477, "rewards/accuracies": 0.875, "rewards/chosen": -1.4525666236877441, "rewards/margins": 2.547089099884033, "rewards/rejected": -3.9996557235717773, "step": 1132 }, { "epoch": 0.97, "grad_norm": 87.49969249077617, "learning_rate": 4.87970010633798e-09, "logits/chosen": -1.3547697067260742, "logits/rejected": -1.292210578918457, "logps/chosen": -366.54345703125, "logps/rejected": -503.3235168457031, "loss": 0.499, "rewards/accuracies": 0.8125, "rewards/chosen": -1.019222617149353, "rewards/margins": 1.5081883668899536, "rewards/rejected": -2.5274109840393066, "step": 1133 }, { "epoch": 0.97, "grad_norm": 77.39361614522927, "learning_rate": 4.588664565078115e-09, "logits/chosen": -1.2798162698745728, "logits/rejected": -1.211428165435791, "logps/chosen": -636.0787353515625, "logps/rejected": -900.0787353515625, "loss": 0.6096, "rewards/accuracies": 0.875, "rewards/chosen": -1.2634141445159912, "rewards/margins": 2.9379217624664307, "rewards/rejected": -4.201335906982422, "step": 1134 }, { "epoch": 0.97, "grad_norm": 67.03503487794497, "learning_rate": 4.3065569589565424e-09, "logits/chosen": -1.3400976657867432, "logits/rejected": -1.3143208026885986, "logps/chosen": -568.5703735351562, "logps/rejected": -604.925048828125, "loss": 0.4941, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1162084341049194, "rewards/margins": 1.7637848854064941, "rewards/rejected": -2.879993438720703, "step": 1135 }, { "epoch": 0.97, "grad_norm": 57.41363643734463, "learning_rate": 4.033379818222271e-09, "logits/chosen": -1.337462067604065, "logits/rejected": -1.3244147300720215, "logps/chosen": -488.3711853027344, "logps/rejected": -537.2532958984375, "loss": 0.3174, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7379860877990723, "rewards/margins": 1.4863120317459106, "rewards/rejected": -3.2242980003356934, "step": 1136 }, { "epoch": 0.98, "grad_norm": 61.66213574933086, "learning_rate": 3.769135593025941e-09, "logits/chosen": -1.3855712413787842, "logits/rejected": -1.3328797817230225, "logps/chosen": -437.3864440917969, "logps/rejected": -679.2550048828125, "loss": 0.3173, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0746511220932007, "rewards/margins": 2.411076545715332, "rewards/rejected": -3.485727548599243, "step": 1137 }, { "epoch": 0.98, "grad_norm": 47.317221747492454, "learning_rate": 3.5138266533980553e-09, "logits/chosen": -1.3714405298233032, "logits/rejected": -1.3022525310516357, "logps/chosen": -397.1058349609375, "logps/rejected": -654.9545288085938, "loss": 0.2783, "rewards/accuracies": 0.875, "rewards/chosen": -1.5204839706420898, "rewards/margins": 2.413515090942383, "rewards/rejected": -3.9339990615844727, "step": 1138 }, { "epoch": 0.98, "grad_norm": 53.52800108897222, "learning_rate": 3.267455289227894e-09, "logits/chosen": -1.321838617324829, "logits/rejected": -1.2986551523208618, "logps/chosen": -569.0445556640625, "logps/rejected": -653.1036987304688, "loss": 0.355, "rewards/accuracies": 0.875, "rewards/chosen": -1.3003381490707397, "rewards/margins": 1.9961235523223877, "rewards/rejected": -3.296461820602417, "step": 1139 }, { "epoch": 0.98, "grad_norm": 71.56680662426476, "learning_rate": 3.0300237102426353e-09, "logits/chosen": -1.3216408491134644, "logits/rejected": -1.2848577499389648, "logps/chosen": -523.3908081054688, "logps/rejected": -589.765625, "loss": 0.5425, "rewards/accuracies": 0.75, "rewards/chosen": -1.445241928100586, "rewards/margins": 1.263198733329773, "rewards/rejected": -2.7084405422210693, "step": 1140 }, { "epoch": 0.98, "grad_norm": 47.095683960933016, "learning_rate": 2.80153404598793e-09, "logits/chosen": -1.435076355934143, "logits/rejected": -1.3584879636764526, "logps/chosen": -435.42779541015625, "logps/rejected": -651.4534912109375, "loss": 0.3493, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2850425243377686, "rewards/margins": 2.3500070571899414, "rewards/rejected": -3.63504958152771, "step": 1141 }, { "epoch": 0.98, "grad_norm": 59.8759315859725, "learning_rate": 2.5819883458082502e-09, "logits/chosen": -1.349071979522705, "logits/rejected": -1.2874417304992676, "logps/chosen": -474.7420349121094, "logps/rejected": -683.0074462890625, "loss": 0.2861, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2572880983352661, "rewards/margins": 2.384906768798828, "rewards/rejected": -3.6421947479248047, "step": 1142 }, { "epoch": 0.98, "grad_norm": 56.16289363032499, "learning_rate": 2.3713885788291253e-09, "logits/chosen": -1.3628742694854736, "logits/rejected": -1.3211891651153564, "logps/chosen": -480.4175720214844, "logps/rejected": -555.6670532226562, "loss": 0.3192, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1059783697128296, "rewards/margins": 1.4368137121200562, "rewards/rejected": -2.5427920818328857, "step": 1143 }, { "epoch": 0.98, "grad_norm": 80.38296285383446, "learning_rate": 2.1697366339391566e-09, "logits/chosen": -1.4191553592681885, "logits/rejected": -1.3954321146011353, "logps/chosen": -424.06219482421875, "logps/rejected": -492.4453125, "loss": 0.4467, "rewards/accuracies": 0.875, "rewards/chosen": -1.1218348741531372, "rewards/margins": 1.4760911464691162, "rewards/rejected": -2.597926139831543, "step": 1144 }, { "epoch": 0.98, "grad_norm": 64.79748784286423, "learning_rate": 1.977034319772919e-09, "logits/chosen": -1.317758560180664, "logits/rejected": -1.2300162315368652, "logps/chosen": -525.77880859375, "logps/rejected": -677.3091430664062, "loss": 0.1877, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2687187194824219, "rewards/margins": 2.5530333518981934, "rewards/rejected": -3.8217523097991943, "step": 1145 }, { "epoch": 0.98, "grad_norm": 117.9999058266044, "learning_rate": 1.7932833646950862e-09, "logits/chosen": -1.391584873199463, "logits/rejected": -1.3528341054916382, "logps/chosen": -698.065185546875, "logps/rejected": -626.8812255859375, "loss": 0.6825, "rewards/accuracies": 0.625, "rewards/chosen": -1.786995530128479, "rewards/margins": 1.257378101348877, "rewards/rejected": -3.0443735122680664, "step": 1146 }, { "epoch": 0.98, "grad_norm": 120.74778768884202, "learning_rate": 1.6184854167847762e-09, "logits/chosen": -1.3571929931640625, "logits/rejected": -1.308891773223877, "logps/chosen": -628.2859497070312, "logps/rejected": -804.627197265625, "loss": 0.9244, "rewards/accuracies": 0.75, "rewards/chosen": -1.9744269847869873, "rewards/margins": 0.7434270977973938, "rewards/rejected": -2.7178540229797363, "step": 1147 }, { "epoch": 0.98, "grad_norm": 43.30334270730007, "learning_rate": 1.4526420438207843e-09, "logits/chosen": -1.3464512825012207, "logits/rejected": -1.3035533428192139, "logps/chosen": -432.1238708496094, "logps/rejected": -540.8009033203125, "loss": 0.3103, "rewards/accuracies": 0.875, "rewards/chosen": -1.1443607807159424, "rewards/margins": 2.0538125038146973, "rewards/rejected": -3.1981732845306396, "step": 1148 }, { "epoch": 0.99, "grad_norm": 73.63501253133734, "learning_rate": 1.2957547332673735e-09, "logits/chosen": -1.4333579540252686, "logits/rejected": -1.3868712186813354, "logps/chosen": -352.1337890625, "logps/rejected": -462.68682861328125, "loss": 0.6665, "rewards/accuracies": 0.75, "rewards/chosen": -1.07978093624115, "rewards/margins": 1.6578272581100464, "rewards/rejected": -2.737607955932617, "step": 1149 }, { "epoch": 0.99, "grad_norm": 92.22993063444079, "learning_rate": 1.1478248922611732e-09, "logits/chosen": -1.25857675075531, "logits/rejected": -1.2520473003387451, "logps/chosen": -579.1795654296875, "logps/rejected": -641.9288330078125, "loss": 0.614, "rewards/accuracies": 0.75, "rewards/chosen": -2.0475940704345703, "rewards/margins": 1.7527912855148315, "rewards/rejected": -3.8003852367401123, "step": 1150 }, { "epoch": 0.99, "grad_norm": 77.95791372072948, "learning_rate": 1.0088538475985231e-09, "logits/chosen": -1.468308687210083, "logits/rejected": -1.372737169265747, "logps/chosen": -440.12872314453125, "logps/rejected": -676.8677978515625, "loss": 0.3524, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7725510001182556, "rewards/margins": 2.5467355251312256, "rewards/rejected": -3.319286346435547, "step": 1151 }, { "epoch": 0.99, "grad_norm": 37.89540413647167, "learning_rate": 8.788428457232599e-10, "logits/chosen": -1.327483892440796, "logits/rejected": -1.259476900100708, "logps/chosen": -606.5562744140625, "logps/rejected": -799.5638427734375, "loss": 0.2021, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3170266151428223, "rewards/margins": 2.9507436752319336, "rewards/rejected": -4.267770767211914, "step": 1152 }, { "epoch": 0.99, "grad_norm": 37.3700905910909, "learning_rate": 7.577930527160603e-10, "logits/chosen": -1.3355379104614258, "logits/rejected": -1.3139853477478027, "logps/chosen": -478.2299499511719, "logps/rejected": -658.5348510742188, "loss": 0.1488, "rewards/accuracies": 1.0, "rewards/chosen": -0.8407142162322998, "rewards/margins": 2.9635887145996094, "rewards/rejected": -3.80430269241333, "step": 1153 }, { "epoch": 0.99, "grad_norm": 87.25455518532338, "learning_rate": 6.457055542834489e-10, "logits/chosen": -1.3901220560073853, "logits/rejected": -1.3573310375213623, "logps/chosen": -376.05596923828125, "logps/rejected": -494.57879638671875, "loss": 0.2486, "rewards/accuracies": 0.875, "rewards/chosen": -1.0375429391860962, "rewards/margins": 1.868727684020996, "rewards/rejected": -2.9062705039978027, "step": 1154 }, { "epoch": 0.99, "grad_norm": 78.72533027928456, "learning_rate": 5.425813557485837e-10, "logits/chosen": -1.2947056293487549, "logits/rejected": -1.2775053977966309, "logps/chosen": -650.7203979492188, "logps/rejected": -684.703857421875, "loss": 0.3896, "rewards/accuracies": 0.75, "rewards/chosen": -1.9509949684143066, "rewards/margins": 1.6460188627243042, "rewards/rejected": -3.5970139503479004, "step": 1155 }, { "epoch": 0.99, "grad_norm": 44.70986796433435, "learning_rate": 4.4842138204170823e-10, "logits/chosen": -1.3698441982269287, "logits/rejected": -1.311873197555542, "logps/chosen": -540.0364990234375, "logps/rejected": -709.73193359375, "loss": 0.2309, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1100777387619019, "rewards/margins": 2.580242872238159, "rewards/rejected": -3.6903204917907715, "step": 1156 }, { "epoch": 0.99, "grad_norm": 58.44263577705729, "learning_rate": 3.632264776922689e-10, "logits/chosen": -1.4120718240737915, "logits/rejected": -1.3665246963500977, "logps/chosen": -391.5028991699219, "logps/rejected": -562.1903686523438, "loss": 0.4485, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3291339874267578, "rewards/margins": 1.5925970077514648, "rewards/rejected": -2.9217309951782227, "step": 1157 }, { "epoch": 0.99, "grad_norm": 82.61556324764253, "learning_rate": 2.8699740682103234e-10, "logits/chosen": -1.3903688192367554, "logits/rejected": -1.323358178138733, "logps/chosen": -694.0431518554688, "logps/rejected": -721.135986328125, "loss": 0.61, "rewards/accuracies": 0.75, "rewards/chosen": -1.7189762592315674, "rewards/margins": 1.5658252239227295, "rewards/rejected": -3.284801483154297, "step": 1158 }, { "epoch": 0.99, "grad_norm": 44.90677636669033, "learning_rate": 2.1973485313364626e-10, "logits/chosen": -1.3954801559448242, "logits/rejected": -1.3531403541564941, "logps/chosen": -416.509033203125, "logps/rejected": -538.574462890625, "loss": 0.3214, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2683722972869873, "rewards/margins": 1.4201171398162842, "rewards/rejected": -2.6884894371032715, "step": 1159 }, { "epoch": 0.99, "grad_norm": 49.18377325889221, "learning_rate": 1.614394199139779e-10, "logits/chosen": -1.4112136363983154, "logits/rejected": -1.3324227333068848, "logps/chosen": -446.1578063964844, "logps/rejected": -651.1507568359375, "loss": 0.2812, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3055462837219238, "rewards/margins": 2.007277011871338, "rewards/rejected": -3.3128228187561035, "step": 1160 }, { "epoch": 1.0, "grad_norm": 60.32037934519408, "learning_rate": 1.121116300192293e-10, "logits/chosen": -1.3937182426452637, "logits/rejected": -1.340111255645752, "logps/chosen": -367.4189758300781, "logps/rejected": -490.84564208984375, "loss": 0.4305, "rewards/accuracies": 0.75, "rewards/chosen": -0.8444278240203857, "rewards/margins": 1.7927734851837158, "rewards/rejected": -2.6372013092041016, "step": 1161 }, { "epoch": 1.0, "grad_norm": 43.045794842741344, "learning_rate": 7.175192587471901e-11, "logits/chosen": -1.3729491233825684, "logits/rejected": -1.3426620960235596, "logps/chosen": -493.204833984375, "logps/rejected": -539.8018798828125, "loss": 0.2829, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0588021278381348, "rewards/margins": 1.6722935438156128, "rewards/rejected": -2.731095790863037, "step": 1162 }, { "epoch": 1.0, "grad_norm": 27.30597124731177, "learning_rate": 4.0360669470329567e-11, "logits/chosen": -1.4091095924377441, "logits/rejected": -1.297658920288086, "logps/chosen": -397.6080322265625, "logps/rejected": -712.9400634765625, "loss": 0.1328, "rewards/accuracies": 1.0, "rewards/chosen": -0.6887233257293701, "rewards/margins": 2.8686845302581787, "rewards/rejected": -3.557407855987549, "step": 1163 }, { "epoch": 1.0, "grad_norm": 38.20646142690474, "learning_rate": 1.7938142357176723e-11, "logits/chosen": -1.3071112632751465, "logits/rejected": -1.283097743988037, "logps/chosen": -783.91162109375, "logps/rejected": -929.2320556640625, "loss": 0.1792, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7789937257766724, "rewards/margins": 2.8627429008483887, "rewards/rejected": -4.64173698425293, "step": 1164 }, { "epoch": 1.0, "grad_norm": 93.66277413850928, "learning_rate": 4.484545644833914e-12, "logits/chosen": -1.387592077255249, "logits/rejected": -1.3474268913269043, "logps/chosen": -512.49560546875, "logps/rejected": -682.5592041015625, "loss": 0.3537, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0724601745605469, "rewards/margins": 2.411895513534546, "rewards/rejected": -3.4843556880950928, "step": 1165 }, { "epoch": 1.0, "grad_norm": 42.24107456541779, "learning_rate": 0.0, "logits/chosen": -1.3576037883758545, "logits/rejected": -1.2758808135986328, "logps/chosen": -443.78759765625, "logps/rejected": -610.3323974609375, "loss": 0.2515, "rewards/accuracies": 0.875, "rewards/chosen": -1.3626399040222168, "rewards/margins": 2.2888269424438477, "rewards/rejected": -3.6514668464660645, "step": 1166 }, { "epoch": 1.0, "eval_logits/chosen": -1.352773666381836, "eval_logits/rejected": -1.2867770195007324, "eval_logps/chosen": -451.33642578125, "eval_logps/rejected": -657.6813354492188, "eval_loss": 0.2841675579547882, "eval_rewards/accuracies": 0.88671875, "eval_rewards/chosen": -1.1548646688461304, "eval_rewards/margins": 2.4814703464508057, "eval_rewards/rejected": -3.6363346576690674, "eval_runtime": 513.7019, "eval_samples_per_second": 2.99, "eval_steps_per_second": 0.748, "step": 1166 } ], "logging_steps": 1, "max_steps": 1166, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }