{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.997808219178082, "eval_steps": 500, "global_step": 5472, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007305936073059361, "grad_norm": 98.58670240263136, "learning_rate": 9.124087591240875e-10, "logits/chosen": -2.7902603149414062, "logits/rejected": -2.1948933601379395, "logps/chosen": -605.0096435546875, "logps/rejected": -422.4518127441406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0014611872146118722, "grad_norm": 100.11716768679086, "learning_rate": 1.824817518248175e-09, "logits/chosen": -2.7485883235931396, "logits/rejected": -2.0183637142181396, "logps/chosen": -493.4544677734375, "logps/rejected": -346.1533203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.002191780821917808, "grad_norm": 85.7699076062301, "learning_rate": 2.7372262773722627e-09, "logits/chosen": -3.3311805725097656, "logits/rejected": -2.9239985942840576, "logps/chosen": -827.3453369140625, "logps/rejected": -650.947998046875, "loss": 0.693, "rewards/accuracies": 0.25, "rewards/chosen": 0.004219055641442537, "rewards/margins": 0.03633499518036842, "rewards/rejected": -0.032115936279296875, "step": 3 }, { "epoch": 0.0029223744292237444, "grad_norm": 102.00822864326926, "learning_rate": 3.64963503649635e-09, "logits/chosen": -3.087874412536621, "logits/rejected": -2.098222255706787, "logps/chosen": -496.6165771484375, "logps/rejected": -235.69955444335938, "loss": 0.7077, "rewards/accuracies": 0.625, "rewards/chosen": -0.0013123499229550362, "rewards/margins": 0.009461834095418453, "rewards/rejected": -0.01077418215572834, "step": 4 }, { "epoch": 0.0036529680365296802, "grad_norm": 106.93799769755604, "learning_rate": 4.562043795620437e-09, "logits/chosen": -2.2754151821136475, "logits/rejected": -2.6072309017181396, "logps/chosen": -575.1276245117188, "logps/rejected": -787.0958862304688, "loss": 0.7091, "rewards/accuracies": 0.25, "rewards/chosen": -0.012850642204284668, "rewards/margins": -0.0442790761590004, "rewards/rejected": 0.03142843395471573, "step": 5 }, { "epoch": 0.004383561643835616, "grad_norm": 101.65092304671906, "learning_rate": 5.4744525547445254e-09, "logits/chosen": -2.8758678436279297, "logits/rejected": -2.583397626876831, "logps/chosen": -897.9839477539062, "logps/rejected": -650.86328125, "loss": 0.6753, "rewards/accuracies": 0.375, "rewards/chosen": 0.017635632306337357, "rewards/margins": -0.02673044055700302, "rewards/rejected": 0.044366076588630676, "step": 6 }, { "epoch": 0.0051141552511415524, "grad_norm": 97.32190766182863, "learning_rate": 6.386861313868613e-09, "logits/chosen": -2.494385004043579, "logits/rejected": -2.411435842514038, "logps/chosen": -646.9886474609375, "logps/rejected": -757.0853271484375, "loss": 0.6872, "rewards/accuracies": 0.375, "rewards/chosen": -0.03410835191607475, "rewards/margins": 0.0007591219618916512, "rewards/rejected": -0.03486747667193413, "step": 7 }, { "epoch": 0.005844748858447489, "grad_norm": 106.73668407142176, "learning_rate": 7.2992700729927e-09, "logits/chosen": -2.4562795162200928, "logits/rejected": -2.218390464782715, "logps/chosen": -683.8443603515625, "logps/rejected": -732.48681640625, "loss": 0.702, "rewards/accuracies": 0.5, "rewards/chosen": 0.009046075865626335, "rewards/margins": 0.03427471965551376, "rewards/rejected": -0.025228645652532578, "step": 8 }, { "epoch": 0.006575342465753425, "grad_norm": 97.21687658914989, "learning_rate": 8.211678832116789e-09, "logits/chosen": -2.5394222736358643, "logits/rejected": -2.165768623352051, "logps/chosen": -564.150390625, "logps/rejected": -527.756103515625, "loss": 0.6917, "rewards/accuracies": 0.75, "rewards/chosen": -0.0316988006234169, "rewards/margins": -0.02414989471435547, "rewards/rejected": -0.007548904046416283, "step": 9 }, { "epoch": 0.0073059360730593605, "grad_norm": 93.9766738844007, "learning_rate": 9.124087591240875e-09, "logits/chosen": -2.8343398571014404, "logits/rejected": -1.981256365776062, "logps/chosen": -746.1691284179688, "logps/rejected": -601.1179809570312, "loss": 0.7141, "rewards/accuracies": 0.25, "rewards/chosen": -0.08429823070764542, "rewards/margins": -0.08352003246545792, "rewards/rejected": -0.0007782001048326492, "step": 10 }, { "epoch": 0.008036529680365296, "grad_norm": 95.15026664275565, "learning_rate": 1.0036496350364964e-08, "logits/chosen": -3.3176941871643066, "logits/rejected": -2.1652281284332275, "logps/chosen": -503.1419982910156, "logps/rejected": -377.00836181640625, "loss": 0.6844, "rewards/accuracies": 0.375, "rewards/chosen": -0.007990837097167969, "rewards/margins": -0.030371474102139473, "rewards/rejected": 0.022380638867616653, "step": 11 }, { "epoch": 0.008767123287671232, "grad_norm": 93.80943949038391, "learning_rate": 1.0948905109489051e-08, "logits/chosen": -2.689056396484375, "logits/rejected": -1.911849856376648, "logps/chosen": -808.53759765625, "logps/rejected": -490.2763671875, "loss": 0.6987, "rewards/accuracies": 0.625, "rewards/chosen": 0.09343590587377548, "rewards/margins": 0.04428234323859215, "rewards/rejected": 0.04915357008576393, "step": 12 }, { "epoch": 0.009497716894977169, "grad_norm": 104.99358239316344, "learning_rate": 1.1861313868613138e-08, "logits/chosen": -2.34944486618042, "logits/rejected": -2.2206618785858154, "logps/chosen": -868.6431274414062, "logps/rejected": -791.850341796875, "loss": 0.6795, "rewards/accuracies": 0.875, "rewards/chosen": 0.01970539055764675, "rewards/margins": 0.16036531329154968, "rewards/rejected": -0.14065991342067719, "step": 13 }, { "epoch": 0.010228310502283105, "grad_norm": 111.36536328960186, "learning_rate": 1.2773722627737225e-08, "logits/chosen": -2.491974353790283, "logits/rejected": -2.400334119796753, "logps/chosen": -725.4588623046875, "logps/rejected": -683.2256469726562, "loss": 0.7215, "rewards/accuracies": 0.125, "rewards/chosen": -0.0016925819218158722, "rewards/margins": -0.07760396599769592, "rewards/rejected": 0.07591138780117035, "step": 14 }, { "epoch": 0.010958904109589041, "grad_norm": 97.74906089702345, "learning_rate": 1.3686131386861314e-08, "logits/chosen": -3.6008219718933105, "logits/rejected": -3.087625026702881, "logps/chosen": -774.1787719726562, "logps/rejected": -557.531005859375, "loss": 0.6788, "rewards/accuracies": 0.375, "rewards/chosen": -0.03199310228228569, "rewards/margins": -0.003922364674508572, "rewards/rejected": -0.028070734813809395, "step": 15 }, { "epoch": 0.011689497716894977, "grad_norm": 91.45042735855584, "learning_rate": 1.45985401459854e-08, "logits/chosen": -3.0833520889282227, "logits/rejected": -2.2203338146209717, "logps/chosen": -587.2229614257812, "logps/rejected": -474.2802429199219, "loss": 0.6909, "rewards/accuracies": 0.75, "rewards/chosen": 0.013139727525413036, "rewards/margins": 0.0335293784737587, "rewards/rejected": -0.020389650017023087, "step": 16 }, { "epoch": 0.012420091324200914, "grad_norm": 100.26249240275499, "learning_rate": 1.551094890510949e-08, "logits/chosen": -3.051006317138672, "logits/rejected": -2.2937211990356445, "logps/chosen": -887.1992797851562, "logps/rejected": -501.18658447265625, "loss": 0.7021, "rewards/accuracies": 0.375, "rewards/chosen": -0.036496352404356, "rewards/margins": -0.0425809845328331, "rewards/rejected": 0.006084633991122246, "step": 17 }, { "epoch": 0.01315068493150685, "grad_norm": 91.89695209683235, "learning_rate": 1.6423357664233578e-08, "logits/chosen": -2.3804993629455566, "logits/rejected": -1.8711321353912354, "logps/chosen": -503.77203369140625, "logps/rejected": -437.38739013671875, "loss": 0.716, "rewards/accuracies": 0.375, "rewards/chosen": 0.00018515437841415405, "rewards/margins": -0.008728505112230778, "rewards/rejected": 0.008913659490644932, "step": 18 }, { "epoch": 0.013881278538812785, "grad_norm": 92.26409424005364, "learning_rate": 1.7335766423357664e-08, "logits/chosen": -2.6616406440734863, "logits/rejected": -2.5910286903381348, "logps/chosen": -734.0892333984375, "logps/rejected": -632.194580078125, "loss": 0.6997, "rewards/accuracies": 0.375, "rewards/chosen": 0.004915047436952591, "rewards/margins": -0.04168495908379555, "rewards/rejected": 0.04660001024603844, "step": 19 }, { "epoch": 0.014611872146118721, "grad_norm": 111.53211245813915, "learning_rate": 1.824817518248175e-08, "logits/chosen": -2.838174343109131, "logits/rejected": -1.5423059463500977, "logps/chosen": -857.7124633789062, "logps/rejected": -506.28411865234375, "loss": 0.6874, "rewards/accuracies": 0.5, "rewards/chosen": 0.04167427867650986, "rewards/margins": 0.045662932097911835, "rewards/rejected": -0.003988645970821381, "step": 20 }, { "epoch": 0.015342465753424657, "grad_norm": 104.35850356164082, "learning_rate": 1.9160583941605838e-08, "logits/chosen": -2.980271577835083, "logits/rejected": -2.311079502105713, "logps/chosen": -1110.95263671875, "logps/rejected": -628.5374755859375, "loss": 0.679, "rewards/accuracies": 0.5, "rewards/chosen": 0.05158214643597603, "rewards/margins": 0.01637556590139866, "rewards/rejected": 0.03520657867193222, "step": 21 }, { "epoch": 0.016073059360730592, "grad_norm": 97.92517471901697, "learning_rate": 2.0072992700729927e-08, "logits/chosen": -2.0623950958251953, "logits/rejected": -2.399272918701172, "logps/chosen": -462.4492492675781, "logps/rejected": -524.2799682617188, "loss": 0.6858, "rewards/accuracies": 0.25, "rewards/chosen": 0.00030846521258354187, "rewards/margins": -0.006520463153719902, "rewards/rejected": 0.006828928366303444, "step": 22 }, { "epoch": 0.016803652968036528, "grad_norm": 90.99901270068813, "learning_rate": 2.0985401459854013e-08, "logits/chosen": -2.7853119373321533, "logits/rejected": -1.5476434230804443, "logps/chosen": -394.34576416015625, "logps/rejected": -259.28570556640625, "loss": 0.7005, "rewards/accuracies": 0.25, "rewards/chosen": 0.012472772970795631, "rewards/margins": -0.02002115175127983, "rewards/rejected": 0.03249392658472061, "step": 23 }, { "epoch": 0.017534246575342465, "grad_norm": 77.53386042058496, "learning_rate": 2.1897810218978102e-08, "logits/chosen": -2.076634168624878, "logits/rejected": -1.9006274938583374, "logps/chosen": -434.4104309082031, "logps/rejected": -441.36865234375, "loss": 0.666, "rewards/accuracies": 0.875, "rewards/chosen": 0.01811903715133667, "rewards/margins": 0.09447887539863586, "rewards/rejected": -0.0763598382472992, "step": 24 }, { "epoch": 0.0182648401826484, "grad_norm": 117.95708137655407, "learning_rate": 2.2810218978102187e-08, "logits/chosen": -2.6185998916625977, "logits/rejected": -2.1783931255340576, "logps/chosen": -1147.652099609375, "logps/rejected": -535.6358032226562, "loss": 0.6905, "rewards/accuracies": 0.625, "rewards/chosen": 0.03881416469812393, "rewards/margins": 0.08126036822795868, "rewards/rejected": -0.042446207255125046, "step": 25 }, { "epoch": 0.018995433789954337, "grad_norm": 96.40841772279275, "learning_rate": 2.3722627737226276e-08, "logits/chosen": -3.0687460899353027, "logits/rejected": -1.965311884880066, "logps/chosen": -623.932373046875, "logps/rejected": -384.6195068359375, "loss": 0.6962, "rewards/accuracies": 0.625, "rewards/chosen": 0.02098703384399414, "rewards/margins": 0.010926531627774239, "rewards/rejected": 0.010060503147542477, "step": 26 }, { "epoch": 0.019726027397260273, "grad_norm": 94.19171461207456, "learning_rate": 2.4635036496350365e-08, "logits/chosen": -3.03074312210083, "logits/rejected": -2.2732021808624268, "logps/chosen": -442.5380554199219, "logps/rejected": -368.1647644042969, "loss": 0.6809, "rewards/accuracies": 0.625, "rewards/chosen": -0.0015556220896542072, "rewards/margins": 0.028120437636971474, "rewards/rejected": -0.029676057398319244, "step": 27 }, { "epoch": 0.02045662100456621, "grad_norm": 97.79510628093821, "learning_rate": 2.554744525547445e-08, "logits/chosen": -2.598207473754883, "logits/rejected": -2.188399076461792, "logps/chosen": -996.2998046875, "logps/rejected": -648.86279296875, "loss": 0.6954, "rewards/accuracies": 0.375, "rewards/chosen": -0.037107277661561966, "rewards/margins": -0.07516627758741379, "rewards/rejected": 0.03805899992585182, "step": 28 }, { "epoch": 0.021187214611872146, "grad_norm": 107.19682904008869, "learning_rate": 2.6459854014598537e-08, "logits/chosen": -2.9980411529541016, "logits/rejected": -2.7248787879943848, "logps/chosen": -856.696044921875, "logps/rejected": -709.8429565429688, "loss": 0.7036, "rewards/accuracies": 0.125, "rewards/chosen": -0.01951294019818306, "rewards/margins": -0.0663788840174675, "rewards/rejected": 0.04686594009399414, "step": 29 }, { "epoch": 0.021917808219178082, "grad_norm": 96.59851169110992, "learning_rate": 2.737226277372263e-08, "logits/chosen": -2.669811248779297, "logits/rejected": -2.2585630416870117, "logps/chosen": -978.0850830078125, "logps/rejected": -702.4935913085938, "loss": 0.6891, "rewards/accuracies": 0.625, "rewards/chosen": 0.12169761955738068, "rewards/margins": 0.15225999057292938, "rewards/rejected": -0.030562376603484154, "step": 30 }, { "epoch": 0.02264840182648402, "grad_norm": 96.89416645075956, "learning_rate": 2.8284671532846714e-08, "logits/chosen": -2.89635968208313, "logits/rejected": -2.199286460876465, "logps/chosen": -877.2157592773438, "logps/rejected": -595.7017211914062, "loss": 0.7033, "rewards/accuracies": 0.625, "rewards/chosen": 0.03430977091193199, "rewards/margins": 0.036777690052986145, "rewards/rejected": -0.0024679191410541534, "step": 31 }, { "epoch": 0.023378995433789955, "grad_norm": 97.34670942531905, "learning_rate": 2.91970802919708e-08, "logits/chosen": -2.9905147552490234, "logits/rejected": -1.9665772914886475, "logps/chosen": -843.7747192382812, "logps/rejected": -580.8801879882812, "loss": 0.6989, "rewards/accuracies": 0.375, "rewards/chosen": -0.003688812255859375, "rewards/margins": -0.010088205337524414, "rewards/rejected": 0.006399394012987614, "step": 32 }, { "epoch": 0.02410958904109589, "grad_norm": 98.04419941568446, "learning_rate": 3.010948905109489e-08, "logits/chosen": -3.0424644947052, "logits/rejected": -2.5189735889434814, "logps/chosen": -471.99285888671875, "logps/rejected": -361.3125305175781, "loss": 0.691, "rewards/accuracies": 0.5, "rewards/chosen": -0.04649047926068306, "rewards/margins": -0.011441615410149097, "rewards/rejected": -0.03504886478185654, "step": 33 }, { "epoch": 0.024840182648401828, "grad_norm": 97.08490364431022, "learning_rate": 3.102189781021898e-08, "logits/chosen": -3.1158459186553955, "logits/rejected": -2.9466185569763184, "logps/chosen": -693.440673828125, "logps/rejected": -671.16015625, "loss": 0.71, "rewards/accuracies": 0.375, "rewards/chosen": -0.03415832668542862, "rewards/margins": -0.008744146674871445, "rewards/rejected": -0.025414180010557175, "step": 34 }, { "epoch": 0.025570776255707764, "grad_norm": 97.88895773719833, "learning_rate": 3.193430656934307e-08, "logits/chosen": -2.4189696311950684, "logits/rejected": -3.1104137897491455, "logps/chosen": -529.6287841796875, "logps/rejected": -823.67236328125, "loss": 0.687, "rewards/accuracies": 0.375, "rewards/chosen": 0.043032266199588776, "rewards/margins": -0.003175924066454172, "rewards/rejected": 0.04620819166302681, "step": 35 }, { "epoch": 0.0263013698630137, "grad_norm": 98.8624804415836, "learning_rate": 3.2846715328467156e-08, "logits/chosen": -2.718959331512451, "logits/rejected": -2.2190959453582764, "logps/chosen": -616.1572875976562, "logps/rejected": -476.4368896484375, "loss": 0.7275, "rewards/accuracies": 0.375, "rewards/chosen": 0.027389049530029297, "rewards/margins": -0.06918086856603622, "rewards/rejected": 0.09656991809606552, "step": 36 }, { "epoch": 0.027031963470319633, "grad_norm": 119.56246732879846, "learning_rate": 3.375912408759124e-08, "logits/chosen": -2.7154572010040283, "logits/rejected": -2.4407176971435547, "logps/chosen": -673.6200561523438, "logps/rejected": -498.83819580078125, "loss": 0.7309, "rewards/accuracies": 0.5, "rewards/chosen": -0.05716096609830856, "rewards/margins": -0.04819909483194351, "rewards/rejected": -0.008961867541074753, "step": 37 }, { "epoch": 0.02776255707762557, "grad_norm": 91.90989452141429, "learning_rate": 3.467153284671533e-08, "logits/chosen": -2.7209692001342773, "logits/rejected": -2.2157468795776367, "logps/chosen": -774.0751953125, "logps/rejected": -531.5654296875, "loss": 0.684, "rewards/accuracies": 0.5, "rewards/chosen": 0.011727523058652878, "rewards/margins": -0.019197802990674973, "rewards/rejected": 0.0309253241866827, "step": 38 }, { "epoch": 0.028493150684931506, "grad_norm": 98.49633162585295, "learning_rate": 3.558394160583941e-08, "logits/chosen": -2.512758731842041, "logits/rejected": -2.004457473754883, "logps/chosen": -448.1313171386719, "logps/rejected": -461.5560302734375, "loss": 0.6943, "rewards/accuracies": 0.125, "rewards/chosen": -0.021454716101288795, "rewards/margins": -0.014398672617971897, "rewards/rejected": -0.007056046277284622, "step": 39 }, { "epoch": 0.029223744292237442, "grad_norm": 85.2514605355093, "learning_rate": 3.64963503649635e-08, "logits/chosen": -2.78273344039917, "logits/rejected": -2.0569260120391846, "logps/chosen": -467.8907775878906, "logps/rejected": -322.9972839355469, "loss": 0.6999, "rewards/accuracies": 0.5, "rewards/chosen": -0.0034599313512444496, "rewards/margins": -0.017236996442079544, "rewards/rejected": 0.013777065090835094, "step": 40 }, { "epoch": 0.02995433789954338, "grad_norm": 105.49620475993437, "learning_rate": 3.7408759124087594e-08, "logits/chosen": -2.772449254989624, "logits/rejected": -2.1216025352478027, "logps/chosen": -830.4874267578125, "logps/rejected": -686.4775390625, "loss": 0.6974, "rewards/accuracies": 0.625, "rewards/chosen": 0.06182899326086044, "rewards/margins": 0.049954745918512344, "rewards/rejected": 0.011874246411025524, "step": 41 }, { "epoch": 0.030684931506849315, "grad_norm": 97.80051344286235, "learning_rate": 3.8321167883211676e-08, "logits/chosen": -2.7424285411834717, "logits/rejected": -2.242662191390991, "logps/chosen": -695.50390625, "logps/rejected": -537.3097534179688, "loss": 0.6881, "rewards/accuracies": 0.375, "rewards/chosen": -0.0035566319711506367, "rewards/margins": -0.02844497188925743, "rewards/rejected": 0.024888338521122932, "step": 42 }, { "epoch": 0.031415525114155254, "grad_norm": 111.50010510084732, "learning_rate": 3.9233576642335765e-08, "logits/chosen": -2.743603467941284, "logits/rejected": -1.8835499286651611, "logps/chosen": -778.1000366210938, "logps/rejected": -471.3153076171875, "loss": 0.7106, "rewards/accuracies": 0.25, "rewards/chosen": -0.014399196952581406, "rewards/margins": -0.06896664947271347, "rewards/rejected": 0.05456745624542236, "step": 43 }, { "epoch": 0.032146118721461184, "grad_norm": 105.47986978417967, "learning_rate": 4.0145985401459854e-08, "logits/chosen": -3.5355308055877686, "logits/rejected": -2.4990787506103516, "logps/chosen": -847.0742797851562, "logps/rejected": -542.60498046875, "loss": 0.7215, "rewards/accuracies": 0.375, "rewards/chosen": -0.05590075999498367, "rewards/margins": -0.013750838115811348, "rewards/rejected": -0.042149923741817474, "step": 44 }, { "epoch": 0.03287671232876712, "grad_norm": 94.49817603233144, "learning_rate": 4.1058394160583937e-08, "logits/chosen": -2.615377902984619, "logits/rejected": -2.3104498386383057, "logps/chosen": -789.4285888671875, "logps/rejected": -875.8193359375, "loss": 0.6966, "rewards/accuracies": 0.625, "rewards/chosen": -0.059810638427734375, "rewards/margins": -0.04386596754193306, "rewards/rejected": -0.015944670885801315, "step": 45 }, { "epoch": 0.033607305936073056, "grad_norm": 107.80511297156782, "learning_rate": 4.1970802919708026e-08, "logits/chosen": -2.5895445346832275, "logits/rejected": -2.7017955780029297, "logps/chosen": -843.39599609375, "logps/rejected": -752.0833740234375, "loss": 0.7024, "rewards/accuracies": 0.25, "rewards/chosen": 0.04405994340777397, "rewards/margins": 0.010834122076630592, "rewards/rejected": 0.03322582319378853, "step": 46 }, { "epoch": 0.03433789954337899, "grad_norm": 84.94924668129053, "learning_rate": 4.288321167883212e-08, "logits/chosen": -2.798778772354126, "logits/rejected": -2.2887637615203857, "logps/chosen": -521.227783203125, "logps/rejected": -412.2502746582031, "loss": 0.7042, "rewards/accuracies": 0.625, "rewards/chosen": -0.025887204334139824, "rewards/margins": -0.010368229821324348, "rewards/rejected": -0.015518976375460625, "step": 47 }, { "epoch": 0.03506849315068493, "grad_norm": 148.38618191858365, "learning_rate": 4.3795620437956203e-08, "logits/chosen": -2.5128142833709717, "logits/rejected": -2.3856515884399414, "logps/chosen": -828.2888793945312, "logps/rejected": -669.2066650390625, "loss": 0.7124, "rewards/accuracies": 0.375, "rewards/chosen": -0.0032766354270279408, "rewards/margins": -0.04716635122895241, "rewards/rejected": 0.04388970881700516, "step": 48 }, { "epoch": 0.035799086757990865, "grad_norm": 102.86405907039396, "learning_rate": 4.470802919708029e-08, "logits/chosen": -3.11265230178833, "logits/rejected": -2.3612236976623535, "logps/chosen": -553.3431396484375, "logps/rejected": -414.97052001953125, "loss": 0.7049, "rewards/accuracies": 0.625, "rewards/chosen": 0.03166542202234268, "rewards/margins": 0.013499070890247822, "rewards/rejected": 0.018166350200772285, "step": 49 }, { "epoch": 0.0365296803652968, "grad_norm": 94.0569817733511, "learning_rate": 4.5620437956204375e-08, "logits/chosen": -2.5401835441589355, "logits/rejected": -2.56681489944458, "logps/chosen": -725.497802734375, "logps/rejected": -697.5982055664062, "loss": 0.7167, "rewards/accuracies": 0.25, "rewards/chosen": -0.08054885268211365, "rewards/margins": -0.06019020080566406, "rewards/rejected": -0.020358657464385033, "step": 50 }, { "epoch": 0.03726027397260274, "grad_norm": 101.8002553640584, "learning_rate": 4.6532846715328464e-08, "logits/chosen": -3.3470330238342285, "logits/rejected": -2.9077532291412354, "logps/chosen": -601.55859375, "logps/rejected": -521.1825561523438, "loss": 0.6899, "rewards/accuracies": 0.375, "rewards/chosen": 0.002341080456972122, "rewards/margins": 0.005430126562714577, "rewards/rejected": -0.0030890461057424545, "step": 51 }, { "epoch": 0.037990867579908674, "grad_norm": 89.71904116549759, "learning_rate": 4.744525547445255e-08, "logits/chosen": -2.8604636192321777, "logits/rejected": -1.7552306652069092, "logps/chosen": -620.9646606445312, "logps/rejected": -324.3946533203125, "loss": 0.7012, "rewards/accuracies": 0.5, "rewards/chosen": -0.03075733222067356, "rewards/margins": -0.02991323731839657, "rewards/rejected": -0.0008440986275672913, "step": 52 }, { "epoch": 0.03872146118721461, "grad_norm": 102.84751018650135, "learning_rate": 4.835766423357664e-08, "logits/chosen": -2.9075164794921875, "logits/rejected": -2.6643402576446533, "logps/chosen": -802.4718017578125, "logps/rejected": -830.608642578125, "loss": 0.6851, "rewards/accuracies": 0.875, "rewards/chosen": 0.05985412746667862, "rewards/margins": 0.11007346957921982, "rewards/rejected": -0.050219349563121796, "step": 53 }, { "epoch": 0.03945205479452055, "grad_norm": 102.26304069500134, "learning_rate": 4.927007299270073e-08, "logits/chosen": -2.9336254596710205, "logits/rejected": -1.8862507343292236, "logps/chosen": -769.834716796875, "logps/rejected": -447.42913818359375, "loss": 0.6854, "rewards/accuracies": 0.5, "rewards/chosen": 0.007782174739986658, "rewards/margins": 0.0004807477816939354, "rewards/rejected": 0.007301425561308861, "step": 54 }, { "epoch": 0.04018264840182648, "grad_norm": 94.49769032395713, "learning_rate": 5.018248175182482e-08, "logits/chosen": -2.6671600341796875, "logits/rejected": -2.1238608360290527, "logps/chosen": -1139.3448486328125, "logps/rejected": -796.3018188476562, "loss": 0.6824, "rewards/accuracies": 0.75, "rewards/chosen": 0.06623377650976181, "rewards/margins": 0.10779910534620285, "rewards/rejected": -0.04156532138586044, "step": 55 }, { "epoch": 0.04091324200913242, "grad_norm": 107.3607792169119, "learning_rate": 5.10948905109489e-08, "logits/chosen": -3.309436082839966, "logits/rejected": -2.245055913925171, "logps/chosen": -863.382080078125, "logps/rejected": -608.4671630859375, "loss": 0.6861, "rewards/accuracies": 0.625, "rewards/chosen": -0.019271085038781166, "rewards/margins": -0.018642909824848175, "rewards/rejected": -0.0006281845271587372, "step": 56 }, { "epoch": 0.041643835616438356, "grad_norm": 97.93572010986338, "learning_rate": 5.200729927007299e-08, "logits/chosen": -2.5681979656219482, "logits/rejected": -1.564772129058838, "logps/chosen": -680.1888427734375, "logps/rejected": -346.97552490234375, "loss": 0.6953, "rewards/accuracies": 0.625, "rewards/chosen": 0.025568198412656784, "rewards/margins": 0.05837426334619522, "rewards/rejected": -0.03280606493353844, "step": 57 }, { "epoch": 0.04237442922374429, "grad_norm": 96.70413863784476, "learning_rate": 5.291970802919707e-08, "logits/chosen": -2.8932039737701416, "logits/rejected": -2.425630569458008, "logps/chosen": -366.90802001953125, "logps/rejected": -312.1206359863281, "loss": 0.6825, "rewards/accuracies": 0.625, "rewards/chosen": -0.00016441429033875465, "rewards/margins": 0.01482295896857977, "rewards/rejected": -0.014987372793257236, "step": 58 }, { "epoch": 0.04310502283105023, "grad_norm": 102.73368939885299, "learning_rate": 5.383211678832116e-08, "logits/chosen": -2.6368370056152344, "logits/rejected": -2.171207904815674, "logps/chosen": -743.3282470703125, "logps/rejected": -680.5465087890625, "loss": 0.6919, "rewards/accuracies": 0.625, "rewards/chosen": 0.025083353742957115, "rewards/margins": 0.019362477585673332, "rewards/rejected": 0.005720877088606358, "step": 59 }, { "epoch": 0.043835616438356165, "grad_norm": 91.76023337855838, "learning_rate": 5.474452554744526e-08, "logits/chosen": -2.753376007080078, "logits/rejected": -1.7965974807739258, "logps/chosen": -407.035888671875, "logps/rejected": -302.5956726074219, "loss": 0.6809, "rewards/accuracies": 0.5, "rewards/chosen": 0.0019084461964666843, "rewards/margins": 0.0074978359043598175, "rewards/rejected": -0.005589389242231846, "step": 60 }, { "epoch": 0.0445662100456621, "grad_norm": 95.37157493030877, "learning_rate": 5.565693430656934e-08, "logits/chosen": -2.793471574783325, "logits/rejected": -2.4545717239379883, "logps/chosen": -1034.04296875, "logps/rejected": -840.353271484375, "loss": 0.6947, "rewards/accuracies": 0.5, "rewards/chosen": 0.000353408046066761, "rewards/margins": -0.021979354321956635, "rewards/rejected": 0.02233276516199112, "step": 61 }, { "epoch": 0.04529680365296804, "grad_norm": 89.71981941630149, "learning_rate": 5.656934306569343e-08, "logits/chosen": -2.5865414142608643, "logits/rejected": -2.285508871078491, "logps/chosen": -558.9706420898438, "logps/rejected": -602.0510864257812, "loss": 0.6938, "rewards/accuracies": 0.375, "rewards/chosen": -0.02326660230755806, "rewards/margins": -0.037882234901189804, "rewards/rejected": 0.014615632593631744, "step": 62 }, { "epoch": 0.046027397260273974, "grad_norm": 87.60467232563141, "learning_rate": 5.748175182481752e-08, "logits/chosen": -2.9889299869537354, "logits/rejected": -2.708453416824341, "logps/chosen": -580.6427001953125, "logps/rejected": -506.6365966796875, "loss": 0.6779, "rewards/accuracies": 0.75, "rewards/chosen": 0.031000331044197083, "rewards/margins": 0.08864890038967133, "rewards/rejected": -0.057648561894893646, "step": 63 }, { "epoch": 0.04675799086757991, "grad_norm": 86.8907219361984, "learning_rate": 5.83941605839416e-08, "logits/chosen": -2.7107863426208496, "logits/rejected": -2.3751001358032227, "logps/chosen": -571.24169921875, "logps/rejected": -375.14801025390625, "loss": 0.6706, "rewards/accuracies": 0.625, "rewards/chosen": 0.016488265246152878, "rewards/margins": -0.006374834105372429, "rewards/rejected": 0.022863103076815605, "step": 64 }, { "epoch": 0.047488584474885846, "grad_norm": 88.62645725617858, "learning_rate": 5.930656934306569e-08, "logits/chosen": -2.7006783485412598, "logits/rejected": -2.6473798751831055, "logps/chosen": -793.8974609375, "logps/rejected": -662.7022094726562, "loss": 0.7031, "rewards/accuracies": 0.75, "rewards/chosen": 0.07292003184556961, "rewards/margins": 0.05472927540540695, "rewards/rejected": 0.018190767616033554, "step": 65 }, { "epoch": 0.04821917808219178, "grad_norm": 88.66589049188391, "learning_rate": 6.021897810218978e-08, "logits/chosen": -2.9283595085144043, "logits/rejected": -2.164547920227051, "logps/chosen": -585.265380859375, "logps/rejected": -434.82830810546875, "loss": 0.6846, "rewards/accuracies": 0.375, "rewards/chosen": 0.030930710956454277, "rewards/margins": 0.01856270059943199, "rewards/rejected": 0.012368010357022285, "step": 66 }, { "epoch": 0.04894977168949772, "grad_norm": 106.5284082196505, "learning_rate": 6.113138686131387e-08, "logits/chosen": -3.054546356201172, "logits/rejected": -2.354632616043091, "logps/chosen": -489.2772216796875, "logps/rejected": -464.72906494140625, "loss": 0.7034, "rewards/accuracies": 0.375, "rewards/chosen": 0.003332519205287099, "rewards/margins": -0.011888599023222923, "rewards/rejected": 0.015221117064356804, "step": 67 }, { "epoch": 0.049680365296803655, "grad_norm": 115.8905101876875, "learning_rate": 6.204379562043796e-08, "logits/chosen": -3.0258729457855225, "logits/rejected": -2.140477418899536, "logps/chosen": -815.11962890625, "logps/rejected": -516.3933715820312, "loss": 0.6889, "rewards/accuracies": 0.5, "rewards/chosen": -0.025342371314764023, "rewards/margins": -0.007102202624082565, "rewards/rejected": -0.01824016682803631, "step": 68 }, { "epoch": 0.05041095890410959, "grad_norm": 95.02794099796385, "learning_rate": 6.295620437956205e-08, "logits/chosen": -3.037353038787842, "logits/rejected": -2.180643081665039, "logps/chosen": -501.1915283203125, "logps/rejected": -420.9936828613281, "loss": 0.6922, "rewards/accuracies": 0.625, "rewards/chosen": 0.04378271475434303, "rewards/margins": -0.012476922944188118, "rewards/rejected": 0.0562596321105957, "step": 69 }, { "epoch": 0.05114155251141553, "grad_norm": 91.71016250258201, "learning_rate": 6.386861313868613e-08, "logits/chosen": -2.349607467651367, "logits/rejected": -1.8952126502990723, "logps/chosen": -506.85302734375, "logps/rejected": -441.86431884765625, "loss": 0.7078, "rewards/accuracies": 0.375, "rewards/chosen": -0.020307065919041634, "rewards/margins": -0.014028404839336872, "rewards/rejected": -0.006278658285737038, "step": 70 }, { "epoch": 0.051872146118721464, "grad_norm": 97.60158469992943, "learning_rate": 6.478102189781022e-08, "logits/chosen": -3.4000086784362793, "logits/rejected": -2.490023374557495, "logps/chosen": -646.0096435546875, "logps/rejected": -475.1502380371094, "loss": 0.6888, "rewards/accuracies": 0.5, "rewards/chosen": 0.02245159074664116, "rewards/margins": 0.013377953320741653, "rewards/rejected": 0.009073635563254356, "step": 71 }, { "epoch": 0.0526027397260274, "grad_norm": 90.16474721655355, "learning_rate": 6.569343065693431e-08, "logits/chosen": -2.657656669616699, "logits/rejected": -2.934380292892456, "logps/chosen": -549.7326049804688, "logps/rejected": -488.98370361328125, "loss": 0.6714, "rewards/accuracies": 0.75, "rewards/chosen": 0.05873119831085205, "rewards/margins": 0.0731273889541626, "rewards/rejected": -0.014396190643310547, "step": 72 }, { "epoch": 0.05333333333333334, "grad_norm": 78.70938933030799, "learning_rate": 6.66058394160584e-08, "logits/chosen": -2.6319196224212646, "logits/rejected": -2.4557363986968994, "logps/chosen": -420.98138427734375, "logps/rejected": -387.3236999511719, "loss": 0.6976, "rewards/accuracies": 0.375, "rewards/chosen": 0.009365880861878395, "rewards/margins": -0.0391240268945694, "rewards/rejected": 0.04848990589380264, "step": 73 }, { "epoch": 0.054063926940639266, "grad_norm": 94.01121421637687, "learning_rate": 6.751824817518248e-08, "logits/chosen": -2.8976526260375977, "logits/rejected": -2.7527427673339844, "logps/chosen": -597.1941528320312, "logps/rejected": -665.8828125, "loss": 0.681, "rewards/accuracies": 0.625, "rewards/chosen": 0.09152975678443909, "rewards/margins": 0.0420779287815094, "rewards/rejected": 0.049451831728219986, "step": 74 }, { "epoch": 0.0547945205479452, "grad_norm": 103.04378904590756, "learning_rate": 6.843065693430657e-08, "logits/chosen": -3.2935376167297363, "logits/rejected": -2.5826010704040527, "logps/chosen": -576.9124755859375, "logps/rejected": -496.21417236328125, "loss": 0.6977, "rewards/accuracies": 0.875, "rewards/chosen": 0.09970588982105255, "rewards/margins": 0.06071997061371803, "rewards/rejected": 0.03898591920733452, "step": 75 }, { "epoch": 0.05552511415525114, "grad_norm": 93.53510266037826, "learning_rate": 6.934306569343065e-08, "logits/chosen": -3.5062496662139893, "logits/rejected": -2.0470685958862305, "logps/chosen": -642.4967041015625, "logps/rejected": -320.52362060546875, "loss": 0.6762, "rewards/accuracies": 0.75, "rewards/chosen": 0.0455259308218956, "rewards/margins": 0.06943252682685852, "rewards/rejected": -0.023906590417027473, "step": 76 }, { "epoch": 0.056255707762557075, "grad_norm": 91.20665708639365, "learning_rate": 7.025547445255474e-08, "logits/chosen": -2.5581247806549072, "logits/rejected": -2.0720088481903076, "logps/chosen": -897.0526123046875, "logps/rejected": -596.6524658203125, "loss": 0.6957, "rewards/accuracies": 0.625, "rewards/chosen": 0.10064373165369034, "rewards/margins": 0.03284721449017525, "rewards/rejected": 0.06779651343822479, "step": 77 }, { "epoch": 0.05698630136986301, "grad_norm": 109.07859390566739, "learning_rate": 7.116788321167882e-08, "logits/chosen": -2.9864344596862793, "logits/rejected": -2.1131138801574707, "logps/chosen": -1057.9757080078125, "logps/rejected": -757.9194946289062, "loss": 0.7023, "rewards/accuracies": 0.375, "rewards/chosen": -0.018572043627500534, "rewards/margins": -0.09218816459178925, "rewards/rejected": 0.07361611723899841, "step": 78 }, { "epoch": 0.05771689497716895, "grad_norm": 99.3303112961371, "learning_rate": 7.208029197080291e-08, "logits/chosen": -2.594069480895996, "logits/rejected": -2.013278007507324, "logps/chosen": -701.856689453125, "logps/rejected": -535.0043334960938, "loss": 0.6823, "rewards/accuracies": 0.75, "rewards/chosen": 0.1349295675754547, "rewards/margins": 0.026475809514522552, "rewards/rejected": 0.10845375806093216, "step": 79 }, { "epoch": 0.058447488584474884, "grad_norm": 119.76868982542975, "learning_rate": 7.2992700729927e-08, "logits/chosen": -2.7168021202087402, "logits/rejected": -1.6579269170761108, "logps/chosen": -524.7047729492188, "logps/rejected": -266.1503601074219, "loss": 0.6631, "rewards/accuracies": 0.875, "rewards/chosen": 0.09443521499633789, "rewards/margins": 0.0808509811758995, "rewards/rejected": 0.01358423288911581, "step": 80 }, { "epoch": 0.05917808219178082, "grad_norm": 119.09017476921638, "learning_rate": 7.390510948905109e-08, "logits/chosen": -3.258880615234375, "logits/rejected": -2.2535793781280518, "logps/chosen": -885.543212890625, "logps/rejected": -592.299560546875, "loss": 0.6892, "rewards/accuracies": 0.625, "rewards/chosen": 0.06835231930017471, "rewards/margins": 0.03294315189123154, "rewards/rejected": 0.035409167408943176, "step": 81 }, { "epoch": 0.05990867579908676, "grad_norm": 91.17018127175683, "learning_rate": 7.481751824817519e-08, "logits/chosen": -2.3672075271606445, "logits/rejected": -1.941821813583374, "logps/chosen": -430.1943359375, "logps/rejected": -324.95538330078125, "loss": 0.685, "rewards/accuracies": 0.625, "rewards/chosen": 0.07300984859466553, "rewards/margins": 0.01380391139537096, "rewards/rejected": 0.05920593440532684, "step": 82 }, { "epoch": 0.06063926940639269, "grad_norm": 96.63325179027403, "learning_rate": 7.572992700729927e-08, "logits/chosen": -2.6038055419921875, "logits/rejected": -2.067300796508789, "logps/chosen": -904.725341796875, "logps/rejected": -694.62939453125, "loss": 0.6779, "rewards/accuracies": 0.625, "rewards/chosen": 0.0947931557893753, "rewards/margins": 0.003550935536623001, "rewards/rejected": 0.091242216527462, "step": 83 }, { "epoch": 0.06136986301369863, "grad_norm": 115.13969945516186, "learning_rate": 7.664233576642335e-08, "logits/chosen": -2.70742130279541, "logits/rejected": -1.7208338975906372, "logps/chosen": -679.933349609375, "logps/rejected": -528.845703125, "loss": 0.6837, "rewards/accuracies": 0.5, "rewards/chosen": 0.06867924332618713, "rewards/margins": -0.03136448562145233, "rewards/rejected": 0.10004372894763947, "step": 84 }, { "epoch": 0.062100456621004566, "grad_norm": 91.16324000453801, "learning_rate": 7.755474452554745e-08, "logits/chosen": -2.8200137615203857, "logits/rejected": -2.233279228210449, "logps/chosen": -628.6182250976562, "logps/rejected": -459.26654052734375, "loss": 0.684, "rewards/accuracies": 0.625, "rewards/chosen": 0.09259701520204544, "rewards/margins": 0.05584793537855148, "rewards/rejected": 0.03674907609820366, "step": 85 }, { "epoch": 0.06283105022831051, "grad_norm": 109.79843926771, "learning_rate": 7.846715328467153e-08, "logits/chosen": -2.5386157035827637, "logits/rejected": -2.161531925201416, "logps/chosen": -387.0860290527344, "logps/rejected": -408.859130859375, "loss": 0.686, "rewards/accuracies": 0.5, "rewards/chosen": 0.026392292231321335, "rewards/margins": -0.008986640721559525, "rewards/rejected": 0.03537892922759056, "step": 86 }, { "epoch": 0.06356164383561644, "grad_norm": 91.3536440501965, "learning_rate": 7.937956204379561e-08, "logits/chosen": -2.7689547538757324, "logits/rejected": -2.0676956176757812, "logps/chosen": -612.5611572265625, "logps/rejected": -475.7663879394531, "loss": 0.687, "rewards/accuracies": 0.5, "rewards/chosen": 0.07066116482019424, "rewards/margins": -0.018799731507897377, "rewards/rejected": 0.08946090191602707, "step": 87 }, { "epoch": 0.06429223744292237, "grad_norm": 95.49399216213814, "learning_rate": 8.029197080291971e-08, "logits/chosen": -2.0104658603668213, "logits/rejected": -2.323537826538086, "logps/chosen": -633.0625, "logps/rejected": -992.6498413085938, "loss": 0.6953, "rewards/accuracies": 0.5, "rewards/chosen": 0.107374407351017, "rewards/margins": 0.008601024746894836, "rewards/rejected": 0.09877338260412216, "step": 88 }, { "epoch": 0.06502283105022831, "grad_norm": 94.51174385526119, "learning_rate": 8.120437956204379e-08, "logits/chosen": -3.038116931915283, "logits/rejected": -2.2195708751678467, "logps/chosen": -641.5491943359375, "logps/rejected": -409.89508056640625, "loss": 0.6851, "rewards/accuracies": 0.5, "rewards/chosen": 0.03573808819055557, "rewards/margins": 0.006451508030295372, "rewards/rejected": 0.029286576434969902, "step": 89 }, { "epoch": 0.06575342465753424, "grad_norm": 109.01828885415281, "learning_rate": 8.211678832116787e-08, "logits/chosen": -3.3411002159118652, "logits/rejected": -2.012000322341919, "logps/chosen": -769.4367065429688, "logps/rejected": -466.07867431640625, "loss": 0.6883, "rewards/accuracies": 0.75, "rewards/chosen": 0.09736175835132599, "rewards/margins": 0.039212800562381744, "rewards/rejected": 0.058148957788944244, "step": 90 }, { "epoch": 0.06648401826484018, "grad_norm": 103.44619015610242, "learning_rate": 8.302919708029197e-08, "logits/chosen": -2.3937389850616455, "logits/rejected": -2.225966691970825, "logps/chosen": -869.635009765625, "logps/rejected": -720.650390625, "loss": 0.686, "rewards/accuracies": 0.75, "rewards/chosen": 0.09076805412769318, "rewards/margins": 0.06722994148731232, "rewards/rejected": 0.02353811077773571, "step": 91 }, { "epoch": 0.06721461187214611, "grad_norm": 100.94371110921175, "learning_rate": 8.394160583941605e-08, "logits/chosen": -2.6429264545440674, "logits/rejected": -2.364262104034424, "logps/chosen": -492.2616271972656, "logps/rejected": -506.5616149902344, "loss": 0.705, "rewards/accuracies": 0.375, "rewards/chosen": 0.0027873991057276726, "rewards/margins": -0.07622776925563812, "rewards/rejected": 0.07901516556739807, "step": 92 }, { "epoch": 0.06794520547945206, "grad_norm": 91.66521009732918, "learning_rate": 8.485401459854013e-08, "logits/chosen": -2.9174728393554688, "logits/rejected": -2.1603407859802246, "logps/chosen": -394.70068359375, "logps/rejected": -434.60406494140625, "loss": 0.6788, "rewards/accuracies": 0.875, "rewards/chosen": 0.05814705044031143, "rewards/margins": 0.09387025982141495, "rewards/rejected": -0.035723209381103516, "step": 93 }, { "epoch": 0.06867579908675799, "grad_norm": 88.53581383841937, "learning_rate": 8.576642335766424e-08, "logits/chosen": -2.4385428428649902, "logits/rejected": -2.6451587677001953, "logps/chosen": -389.4354248046875, "logps/rejected": -613.855224609375, "loss": 0.6808, "rewards/accuracies": 0.625, "rewards/chosen": 0.07498006522655487, "rewards/margins": 0.015926837921142578, "rewards/rejected": 0.05905322730541229, "step": 94 }, { "epoch": 0.06940639269406393, "grad_norm": 94.47773278496632, "learning_rate": 8.667883211678832e-08, "logits/chosen": -3.135110855102539, "logits/rejected": -2.46091365814209, "logps/chosen": -795.2398071289062, "logps/rejected": -623.299560546875, "loss": 0.6803, "rewards/accuracies": 0.5, "rewards/chosen": 0.06917190551757812, "rewards/margins": 0.014662650413811207, "rewards/rejected": 0.05450925976037979, "step": 95 }, { "epoch": 0.07013698630136986, "grad_norm": 95.71374558416362, "learning_rate": 8.759124087591241e-08, "logits/chosen": -2.9972681999206543, "logits/rejected": -2.068998098373413, "logps/chosen": -543.6600341796875, "logps/rejected": -437.19293212890625, "loss": 0.6754, "rewards/accuracies": 0.625, "rewards/chosen": 0.07190132141113281, "rewards/margins": -0.0013554096221923828, "rewards/rejected": 0.0732567310333252, "step": 96 }, { "epoch": 0.0708675799086758, "grad_norm": 94.60300479173482, "learning_rate": 8.850364963503649e-08, "logits/chosen": -2.5504584312438965, "logits/rejected": -2.615576982498169, "logps/chosen": -533.2852172851562, "logps/rejected": -577.6010131835938, "loss": 0.6982, "rewards/accuracies": 0.5, "rewards/chosen": 0.10484696179628372, "rewards/margins": 0.0307464599609375, "rewards/rejected": 0.07410049438476562, "step": 97 }, { "epoch": 0.07159817351598173, "grad_norm": 89.9676645471166, "learning_rate": 8.941605839416058e-08, "logits/chosen": -2.382157325744629, "logits/rejected": -2.020465612411499, "logps/chosen": -504.56890869140625, "logps/rejected": -507.96417236328125, "loss": 0.6638, "rewards/accuracies": 0.625, "rewards/chosen": 0.15448208153247833, "rewards/margins": 0.05458097532391548, "rewards/rejected": 0.09990110993385315, "step": 98 }, { "epoch": 0.07232876712328767, "grad_norm": 94.00043114096606, "learning_rate": 9.032846715328467e-08, "logits/chosen": -2.7359728813171387, "logits/rejected": -1.7435410022735596, "logps/chosen": -485.0408935546875, "logps/rejected": -344.109130859375, "loss": 0.6709, "rewards/accuracies": 0.5, "rewards/chosen": 0.03749723359942436, "rewards/margins": -0.01854703575372696, "rewards/rejected": 0.05604426935315132, "step": 99 }, { "epoch": 0.0730593607305936, "grad_norm": 95.67975862708633, "learning_rate": 9.124087591240875e-08, "logits/chosen": -2.9525041580200195, "logits/rejected": -3.2354702949523926, "logps/chosen": -644.9825439453125, "logps/rejected": -667.1747436523438, "loss": 0.6935, "rewards/accuracies": 0.625, "rewards/chosen": 0.06648736447095871, "rewards/margins": -0.010479879565536976, "rewards/rejected": 0.07696723937988281, "step": 100 }, { "epoch": 0.07378995433789955, "grad_norm": 97.18068706023847, "learning_rate": 9.215328467153285e-08, "logits/chosen": -2.8472275733947754, "logits/rejected": -2.259763240814209, "logps/chosen": -821.0194702148438, "logps/rejected": -552.3619995117188, "loss": 0.6619, "rewards/accuracies": 0.75, "rewards/chosen": 0.15932312607765198, "rewards/margins": 0.02240428514778614, "rewards/rejected": 0.1369188278913498, "step": 101 }, { "epoch": 0.07452054794520548, "grad_norm": 86.31302611172023, "learning_rate": 9.306569343065693e-08, "logits/chosen": -2.447491407394409, "logits/rejected": -2.52144718170166, "logps/chosen": -825.5758666992188, "logps/rejected": -1030.3154296875, "loss": 0.664, "rewards/accuracies": 0.625, "rewards/chosen": 0.16854286193847656, "rewards/margins": 0.015586464665830135, "rewards/rejected": 0.15295639634132385, "step": 102 }, { "epoch": 0.07525114155251142, "grad_norm": 88.94743389268602, "learning_rate": 9.397810218978101e-08, "logits/chosen": -2.986175060272217, "logits/rejected": -2.5595738887786865, "logps/chosen": -512.5946044921875, "logps/rejected": -437.3487854003906, "loss": 0.6744, "rewards/accuracies": 0.5, "rewards/chosen": 0.04892568662762642, "rewards/margins": -0.01271972805261612, "rewards/rejected": 0.06164541468024254, "step": 103 }, { "epoch": 0.07598173515981735, "grad_norm": 101.14886924726346, "learning_rate": 9.48905109489051e-08, "logits/chosen": -2.3799984455108643, "logits/rejected": -2.178776264190674, "logps/chosen": -662.5736083984375, "logps/rejected": -763.724365234375, "loss": 0.6822, "rewards/accuracies": 0.75, "rewards/chosen": 0.10745935142040253, "rewards/margins": 0.053589917719364166, "rewards/rejected": 0.05386943742632866, "step": 104 }, { "epoch": 0.07671232876712329, "grad_norm": 78.07240417054383, "learning_rate": 9.580291970802919e-08, "logits/chosen": -3.114513635635376, "logits/rejected": -2.5615978240966797, "logps/chosen": -525.9475708007812, "logps/rejected": -380.9859924316406, "loss": 0.6658, "rewards/accuracies": 0.5, "rewards/chosen": 0.06742119789123535, "rewards/margins": 0.0013691415078938007, "rewards/rejected": 0.06605205684900284, "step": 105 }, { "epoch": 0.07744292237442922, "grad_norm": 96.65304715029681, "learning_rate": 9.671532846715328e-08, "logits/chosen": -2.3343944549560547, "logits/rejected": -2.2930197715759277, "logps/chosen": -756.5433349609375, "logps/rejected": -500.9776611328125, "loss": 0.6817, "rewards/accuracies": 0.875, "rewards/chosen": 0.21431618928909302, "rewards/margins": 0.16217868030071259, "rewards/rejected": 0.05213749408721924, "step": 106 }, { "epoch": 0.07817351598173516, "grad_norm": 92.20119072068701, "learning_rate": 9.762773722627738e-08, "logits/chosen": -2.7596700191497803, "logits/rejected": -2.219231128692627, "logps/chosen": -742.866943359375, "logps/rejected": -638.9796752929688, "loss": 0.6724, "rewards/accuracies": 0.875, "rewards/chosen": 0.26747560501098633, "rewards/margins": 0.12694287300109863, "rewards/rejected": 0.1405327320098877, "step": 107 }, { "epoch": 0.0789041095890411, "grad_norm": 91.97848126396195, "learning_rate": 9.854014598540146e-08, "logits/chosen": -2.4471874237060547, "logits/rejected": -2.0335493087768555, "logps/chosen": -697.1626586914062, "logps/rejected": -690.1871337890625, "loss": 0.6699, "rewards/accuracies": 0.25, "rewards/chosen": 0.13184089958667755, "rewards/margins": -0.010776803828775883, "rewards/rejected": 0.14261770248413086, "step": 108 }, { "epoch": 0.07963470319634704, "grad_norm": 95.93025725918022, "learning_rate": 9.945255474452554e-08, "logits/chosen": -2.683851718902588, "logits/rejected": -1.4952396154403687, "logps/chosen": -584.3533325195312, "logps/rejected": -325.3695068359375, "loss": 0.64, "rewards/accuracies": 0.75, "rewards/chosen": 0.18841871619224548, "rewards/margins": 0.12948095798492432, "rewards/rejected": 0.058937765657901764, "step": 109 }, { "epoch": 0.08036529680365297, "grad_norm": 102.81126044085589, "learning_rate": 1.0036496350364964e-07, "logits/chosen": -3.168623208999634, "logits/rejected": -2.2697112560272217, "logps/chosen": -588.4923095703125, "logps/rejected": -378.5550537109375, "loss": 0.6566, "rewards/accuracies": 0.75, "rewards/chosen": 0.20265603065490723, "rewards/margins": 0.11244407296180725, "rewards/rejected": 0.09021196514368057, "step": 110 }, { "epoch": 0.08109589041095891, "grad_norm": 81.55954556430687, "learning_rate": 1.0127737226277372e-07, "logits/chosen": -2.5386548042297363, "logits/rejected": -2.102814197540283, "logps/chosen": -639.0172119140625, "logps/rejected": -388.9745788574219, "loss": 0.6694, "rewards/accuracies": 0.5, "rewards/chosen": 0.1332019716501236, "rewards/margins": 0.07878781110048294, "rewards/rejected": 0.05441415682435036, "step": 111 }, { "epoch": 0.08182648401826484, "grad_norm": 87.2667594487586, "learning_rate": 1.021897810218978e-07, "logits/chosen": -2.598144769668579, "logits/rejected": -1.9884908199310303, "logps/chosen": -537.0949096679688, "logps/rejected": -521.279052734375, "loss": 0.6562, "rewards/accuracies": 0.625, "rewards/chosen": 0.11527696996927261, "rewards/margins": 0.05879499018192291, "rewards/rejected": 0.0564819797873497, "step": 112 }, { "epoch": 0.08255707762557078, "grad_norm": 88.94275941933104, "learning_rate": 1.0310218978102189e-07, "logits/chosen": -3.0013363361358643, "logits/rejected": -2.721928119659424, "logps/chosen": -776.2058715820312, "logps/rejected": -720.6074829101562, "loss": 0.644, "rewards/accuracies": 0.75, "rewards/chosen": 0.2475254088640213, "rewards/margins": 0.07749596238136292, "rewards/rejected": 0.17002946138381958, "step": 113 }, { "epoch": 0.08328767123287671, "grad_norm": 85.8268805093646, "learning_rate": 1.0401459854014598e-07, "logits/chosen": -3.0008060932159424, "logits/rejected": -2.2013654708862305, "logps/chosen": -662.2620849609375, "logps/rejected": -454.04351806640625, "loss": 0.6668, "rewards/accuracies": 0.625, "rewards/chosen": 0.19957561790943146, "rewards/margins": 0.09588389098644257, "rewards/rejected": 0.10369172692298889, "step": 114 }, { "epoch": 0.08401826484018265, "grad_norm": 94.63336942841568, "learning_rate": 1.0492700729927006e-07, "logits/chosen": -2.7132625579833984, "logits/rejected": -2.4411017894744873, "logps/chosen": -608.232421875, "logps/rejected": -553.7322387695312, "loss": 0.6794, "rewards/accuracies": 0.75, "rewards/chosen": 0.17762228846549988, "rewards/margins": 0.01989884115755558, "rewards/rejected": 0.15772344172000885, "step": 115 }, { "epoch": 0.08474885844748858, "grad_norm": 713.1353307753025, "learning_rate": 1.0583941605839415e-07, "logits/chosen": -2.3703041076660156, "logits/rejected": -1.954603672027588, "logps/chosen": -529.9540405273438, "logps/rejected": -531.83154296875, "loss": 0.6823, "rewards/accuracies": 0.625, "rewards/chosen": 0.16169339418411255, "rewards/margins": -0.08573475480079651, "rewards/rejected": 0.24742813408374786, "step": 116 }, { "epoch": 0.08547945205479453, "grad_norm": 147.0295397084375, "learning_rate": 1.0675182481751824e-07, "logits/chosen": -2.4207305908203125, "logits/rejected": -2.446768283843994, "logps/chosen": -1037.80029296875, "logps/rejected": -802.66357421875, "loss": 0.6729, "rewards/accuracies": 0.75, "rewards/chosen": 0.2835674285888672, "rewards/margins": 0.10608578473329544, "rewards/rejected": 0.17748165130615234, "step": 117 }, { "epoch": 0.08621004566210046, "grad_norm": 80.057300645967, "learning_rate": 1.0766423357664232e-07, "logits/chosen": -3.064739942550659, "logits/rejected": -2.0598158836364746, "logps/chosen": -637.473388671875, "logps/rejected": -484.28973388671875, "loss": 0.6576, "rewards/accuracies": 0.5, "rewards/chosen": 0.1610521376132965, "rewards/margins": 0.08253040164709091, "rewards/rejected": 0.078521728515625, "step": 118 }, { "epoch": 0.08694063926940639, "grad_norm": 98.20056803552615, "learning_rate": 1.0857664233576642e-07, "logits/chosen": -3.2472429275512695, "logits/rejected": -1.9320456981658936, "logps/chosen": -767.0244750976562, "logps/rejected": -335.1850891113281, "loss": 0.6612, "rewards/accuracies": 0.75, "rewards/chosen": 0.2938051223754883, "rewards/margins": 0.2448134571313858, "rewards/rejected": 0.04899168014526367, "step": 119 }, { "epoch": 0.08767123287671233, "grad_norm": 88.74018981872024, "learning_rate": 1.0948905109489052e-07, "logits/chosen": -2.7615883350372314, "logits/rejected": -2.226839303970337, "logps/chosen": -724.0440673828125, "logps/rejected": -617.214111328125, "loss": 0.6752, "rewards/accuracies": 0.625, "rewards/chosen": 0.2626591622829437, "rewards/margins": 0.04723710939288139, "rewards/rejected": 0.21542204916477203, "step": 120 }, { "epoch": 0.08840182648401826, "grad_norm": 98.5432963219721, "learning_rate": 1.104014598540146e-07, "logits/chosen": -2.904914379119873, "logits/rejected": -2.102940559387207, "logps/chosen": -1108.8612060546875, "logps/rejected": -681.6278686523438, "loss": 0.6571, "rewards/accuracies": 0.625, "rewards/chosen": 0.4344635009765625, "rewards/margins": 0.17547070980072021, "rewards/rejected": 0.2589927613735199, "step": 121 }, { "epoch": 0.0891324200913242, "grad_norm": 93.68438849903168, "learning_rate": 1.1131386861313868e-07, "logits/chosen": -2.287344455718994, "logits/rejected": -1.8324130773544312, "logps/chosen": -800.9357299804688, "logps/rejected": -566.4583740234375, "loss": 0.6545, "rewards/accuracies": 0.5, "rewards/chosen": 0.23448605835437775, "rewards/margins": 0.08121117949485779, "rewards/rejected": 0.15327487885951996, "step": 122 }, { "epoch": 0.08986301369863013, "grad_norm": 95.37411942581699, "learning_rate": 1.1222627737226278e-07, "logits/chosen": -2.467270612716675, "logits/rejected": -2.0179309844970703, "logps/chosen": -784.3834838867188, "logps/rejected": -621.7142944335938, "loss": 0.6585, "rewards/accuracies": 0.75, "rewards/chosen": 0.29315072298049927, "rewards/margins": 0.08346005529165268, "rewards/rejected": 0.20969067513942719, "step": 123 }, { "epoch": 0.09059360730593607, "grad_norm": 97.2771943420579, "learning_rate": 1.1313868613138686e-07, "logits/chosen": -2.651488780975342, "logits/rejected": -1.7223527431488037, "logps/chosen": -790.5413208007812, "logps/rejected": -441.6155090332031, "loss": 0.6388, "rewards/accuracies": 0.5, "rewards/chosen": 0.3090164363384247, "rewards/margins": 0.15691128373146057, "rewards/rejected": 0.15210513770580292, "step": 124 }, { "epoch": 0.091324200913242, "grad_norm": 107.59551988088305, "learning_rate": 1.1405109489051094e-07, "logits/chosen": -2.27437686920166, "logits/rejected": -2.0352530479431152, "logps/chosen": -552.7191772460938, "logps/rejected": -493.6623840332031, "loss": 0.6275, "rewards/accuracies": 0.625, "rewards/chosen": 0.2126983255147934, "rewards/margins": 0.04594102129340172, "rewards/rejected": 0.16675730049610138, "step": 125 }, { "epoch": 0.09205479452054795, "grad_norm": 92.19318308753907, "learning_rate": 1.1496350364963504e-07, "logits/chosen": -2.528703451156616, "logits/rejected": -2.2498910427093506, "logps/chosen": -619.3512573242188, "logps/rejected": -547.45263671875, "loss": 0.6445, "rewards/accuracies": 0.625, "rewards/chosen": 0.3070297837257385, "rewards/margins": 0.11459086835384369, "rewards/rejected": 0.19243890047073364, "step": 126 }, { "epoch": 0.09278538812785388, "grad_norm": 83.89268104902686, "learning_rate": 1.1587591240875912e-07, "logits/chosen": -2.8571367263793945, "logits/rejected": -2.164795160293579, "logps/chosen": -924.6870727539062, "logps/rejected": -587.15869140625, "loss": 0.6507, "rewards/accuracies": 1.0, "rewards/chosen": 0.4063735902309418, "rewards/margins": 0.2009914517402649, "rewards/rejected": 0.20538215339183807, "step": 127 }, { "epoch": 0.09351598173515982, "grad_norm": 95.93420698745834, "learning_rate": 1.167883211678832e-07, "logits/chosen": -2.585425853729248, "logits/rejected": -1.7010259628295898, "logps/chosen": -554.958251953125, "logps/rejected": -349.78369140625, "loss": 0.6269, "rewards/accuracies": 0.75, "rewards/chosen": 0.23900070786476135, "rewards/margins": 0.1494581252336502, "rewards/rejected": 0.08954258263111115, "step": 128 }, { "epoch": 0.09424657534246575, "grad_norm": 88.90495660306995, "learning_rate": 1.1770072992700728e-07, "logits/chosen": -2.958275318145752, "logits/rejected": -2.6844687461853027, "logps/chosen": -689.2169799804688, "logps/rejected": -587.2020263671875, "loss": 0.648, "rewards/accuracies": 0.75, "rewards/chosen": 0.2795402407646179, "rewards/margins": 0.06465473026037216, "rewards/rejected": 0.21488553285598755, "step": 129 }, { "epoch": 0.09497716894977169, "grad_norm": 101.09378863986896, "learning_rate": 1.1861313868613138e-07, "logits/chosen": -2.462498664855957, "logits/rejected": -1.691188931465149, "logps/chosen": -429.81866455078125, "logps/rejected": -350.25469970703125, "loss": 0.6529, "rewards/accuracies": 0.75, "rewards/chosen": 0.24372558295726776, "rewards/margins": 0.0839889645576477, "rewards/rejected": 0.15973663330078125, "step": 130 }, { "epoch": 0.09570776255707762, "grad_norm": 79.73793134583126, "learning_rate": 1.1952554744525547e-07, "logits/chosen": -2.8578786849975586, "logits/rejected": -2.8556134700775146, "logps/chosen": -831.7183227539062, "logps/rejected": -703.7633666992188, "loss": 0.6484, "rewards/accuracies": 0.625, "rewards/chosen": 0.267534464597702, "rewards/margins": 0.028171924874186516, "rewards/rejected": 0.23936253786087036, "step": 131 }, { "epoch": 0.09643835616438357, "grad_norm": 75.97855436013671, "learning_rate": 1.2043795620437956e-07, "logits/chosen": -2.0293586254119873, "logits/rejected": -1.9831421375274658, "logps/chosen": -490.15509033203125, "logps/rejected": -495.22967529296875, "loss": 0.66, "rewards/accuracies": 0.5, "rewards/chosen": 0.1951315999031067, "rewards/margins": 0.006664782762527466, "rewards/rejected": 0.18846681714057922, "step": 132 }, { "epoch": 0.0971689497716895, "grad_norm": 99.54158600178118, "learning_rate": 1.2135036496350364e-07, "logits/chosen": -3.317657470703125, "logits/rejected": -2.214343786239624, "logps/chosen": -910.9747314453125, "logps/rejected": -602.9983520507812, "loss": 0.6669, "rewards/accuracies": 0.75, "rewards/chosen": 0.4120626449584961, "rewards/margins": 0.11788903176784515, "rewards/rejected": 0.29417362809181213, "step": 133 }, { "epoch": 0.09789954337899544, "grad_norm": 91.93251813490697, "learning_rate": 1.2226277372262775e-07, "logits/chosen": -2.666168212890625, "logits/rejected": -2.0385971069335938, "logps/chosen": -691.299560546875, "logps/rejected": -470.41925048828125, "loss": 0.6751, "rewards/accuracies": 0.75, "rewards/chosen": 0.32471388578414917, "rewards/margins": 0.07633041590452194, "rewards/rejected": 0.24838349223136902, "step": 134 }, { "epoch": 0.09863013698630137, "grad_norm": 103.51324171784738, "learning_rate": 1.2317518248175183e-07, "logits/chosen": -2.5803799629211426, "logits/rejected": -1.7886734008789062, "logps/chosen": -656.9305419921875, "logps/rejected": -400.4693603515625, "loss": 0.6097, "rewards/accuracies": 0.75, "rewards/chosen": 0.2901232838630676, "rewards/margins": 0.11832865327596664, "rewards/rejected": 0.17179463803768158, "step": 135 }, { "epoch": 0.09936073059360731, "grad_norm": 81.82934079915957, "learning_rate": 1.240875912408759e-07, "logits/chosen": -2.4458611011505127, "logits/rejected": -2.1771135330200195, "logps/chosen": -310.5158386230469, "logps/rejected": -383.01776123046875, "loss": 0.6393, "rewards/accuracies": 0.75, "rewards/chosen": 0.21351584792137146, "rewards/margins": 0.08799911290407181, "rewards/rejected": 0.12551675736904144, "step": 136 }, { "epoch": 0.10009132420091324, "grad_norm": 87.89730391289741, "learning_rate": 1.25e-07, "logits/chosen": -2.8203911781311035, "logits/rejected": -2.08638858795166, "logps/chosen": -707.828857421875, "logps/rejected": -567.5728759765625, "loss": 0.6802, "rewards/accuracies": 0.875, "rewards/chosen": 0.39395657181739807, "rewards/margins": 0.17331524193286896, "rewards/rejected": 0.2206413298845291, "step": 137 }, { "epoch": 0.10082191780821918, "grad_norm": 82.51112910257167, "learning_rate": 1.259124087591241e-07, "logits/chosen": -2.924152374267578, "logits/rejected": -2.2872040271759033, "logps/chosen": -821.0283203125, "logps/rejected": -641.4526977539062, "loss": 0.6367, "rewards/accuracies": 0.75, "rewards/chosen": 0.46058711409568787, "rewards/margins": 0.14857502281665802, "rewards/rejected": 0.31201207637786865, "step": 138 }, { "epoch": 0.10155251141552511, "grad_norm": 95.79399894761154, "learning_rate": 1.2682481751824816e-07, "logits/chosen": -2.6391501426696777, "logits/rejected": -2.287895441055298, "logps/chosen": -767.7299194335938, "logps/rejected": -647.1353759765625, "loss": 0.6618, "rewards/accuracies": 0.75, "rewards/chosen": 0.40312808752059937, "rewards/margins": 0.11969813704490662, "rewards/rejected": 0.28342992067337036, "step": 139 }, { "epoch": 0.10228310502283106, "grad_norm": 86.54738699297917, "learning_rate": 1.2773722627737227e-07, "logits/chosen": -2.6085774898529053, "logits/rejected": -2.4963178634643555, "logps/chosen": -527.385986328125, "logps/rejected": -517.9359130859375, "loss": 0.6241, "rewards/accuracies": 0.75, "rewards/chosen": 0.36273860931396484, "rewards/margins": 0.10319538414478302, "rewards/rejected": 0.259543240070343, "step": 140 }, { "epoch": 0.10301369863013699, "grad_norm": 76.53795205954201, "learning_rate": 1.2864963503649635e-07, "logits/chosen": -2.577382802963257, "logits/rejected": -1.8262172937393188, "logps/chosen": -385.5703430175781, "logps/rejected": -281.7908020019531, "loss": 0.6389, "rewards/accuracies": 1.0, "rewards/chosen": 0.2991056442260742, "rewards/margins": 0.17908035218715668, "rewards/rejected": 0.12002529948949814, "step": 141 }, { "epoch": 0.10374429223744293, "grad_norm": 86.5718998732051, "learning_rate": 1.2956204379562043e-07, "logits/chosen": -2.9042534828186035, "logits/rejected": -2.1000313758850098, "logps/chosen": -751.4824829101562, "logps/rejected": -617.0573120117188, "loss": 0.6311, "rewards/accuracies": 0.875, "rewards/chosen": 0.4685096740722656, "rewards/margins": 0.1321544647216797, "rewards/rejected": 0.33635520935058594, "step": 142 }, { "epoch": 0.10447488584474886, "grad_norm": 97.71021009213929, "learning_rate": 1.3047445255474451e-07, "logits/chosen": -2.9800827503204346, "logits/rejected": -2.2563891410827637, "logps/chosen": -771.948486328125, "logps/rejected": -527.0831298828125, "loss": 0.6171, "rewards/accuracies": 0.625, "rewards/chosen": 0.3491176664829254, "rewards/margins": 0.09938299655914307, "rewards/rejected": 0.24973466992378235, "step": 143 }, { "epoch": 0.1052054794520548, "grad_norm": 85.53534311543388, "learning_rate": 1.3138686131386862e-07, "logits/chosen": -2.7839269638061523, "logits/rejected": -2.227013349533081, "logps/chosen": -1042.092041015625, "logps/rejected": -574.7708129882812, "loss": 0.6432, "rewards/accuracies": 0.875, "rewards/chosen": 0.4606683850288391, "rewards/margins": 0.19678232073783875, "rewards/rejected": 0.26388606429100037, "step": 144 }, { "epoch": 0.10593607305936073, "grad_norm": 79.93870026952683, "learning_rate": 1.3229927007299268e-07, "logits/chosen": -2.535118579864502, "logits/rejected": -1.9325989484786987, "logps/chosen": -631.7592163085938, "logps/rejected": -434.06341552734375, "loss": 0.6706, "rewards/accuracies": 0.625, "rewards/chosen": 0.38357508182525635, "rewards/margins": 0.09610119462013245, "rewards/rejected": 0.2874738574028015, "step": 145 }, { "epoch": 0.10666666666666667, "grad_norm": 84.55393641967534, "learning_rate": 1.332116788321168e-07, "logits/chosen": -2.865896463394165, "logits/rejected": -2.495795249938965, "logps/chosen": -659.4659423828125, "logps/rejected": -484.20220947265625, "loss": 0.6306, "rewards/accuracies": 0.625, "rewards/chosen": 0.4468676745891571, "rewards/margins": 0.08818086236715317, "rewards/rejected": 0.3586868345737457, "step": 146 }, { "epoch": 0.1073972602739726, "grad_norm": 92.2547900602592, "learning_rate": 1.3412408759124087e-07, "logits/chosen": -2.2436728477478027, "logits/rejected": -2.276709794998169, "logps/chosen": -506.27685546875, "logps/rejected": -475.4925231933594, "loss": 0.6645, "rewards/accuracies": 0.625, "rewards/chosen": 0.4134364724159241, "rewards/margins": 0.0756511315703392, "rewards/rejected": 0.33778536319732666, "step": 147 }, { "epoch": 0.10812785388127853, "grad_norm": 85.37176570099889, "learning_rate": 1.3503649635036495e-07, "logits/chosen": -2.8056797981262207, "logits/rejected": -2.4610068798065186, "logps/chosen": -637.3016357421875, "logps/rejected": -531.9166870117188, "loss": 0.6128, "rewards/accuracies": 0.625, "rewards/chosen": 0.3849315643310547, "rewards/margins": 0.1097232848405838, "rewards/rejected": 0.2752082943916321, "step": 148 }, { "epoch": 0.10885844748858448, "grad_norm": 80.03820803899691, "learning_rate": 1.3594890510948904e-07, "logits/chosen": -3.0430164337158203, "logits/rejected": -2.5563809871673584, "logps/chosen": -588.9742431640625, "logps/rejected": -471.31268310546875, "loss": 0.6211, "rewards/accuracies": 0.625, "rewards/chosen": 0.348283588886261, "rewards/margins": 0.05021143704652786, "rewards/rejected": 0.2980721592903137, "step": 149 }, { "epoch": 0.1095890410958904, "grad_norm": 81.16220918148267, "learning_rate": 1.3686131386861314e-07, "logits/chosen": -2.5658278465270996, "logits/rejected": -2.2037649154663086, "logps/chosen": -476.85595703125, "logps/rejected": -453.3637390136719, "loss": 0.6526, "rewards/accuracies": 0.5, "rewards/chosen": 0.3615339398384094, "rewards/margins": 0.04672684893012047, "rewards/rejected": 0.31480708718299866, "step": 150 }, { "epoch": 0.11031963470319635, "grad_norm": 90.72056922047778, "learning_rate": 1.3777372262773723e-07, "logits/chosen": -2.6281375885009766, "logits/rejected": -2.0779359340667725, "logps/chosen": -456.51800537109375, "logps/rejected": -445.09710693359375, "loss": 0.6239, "rewards/accuracies": 0.625, "rewards/chosen": 0.34226012229919434, "rewards/margins": 0.0985097661614418, "rewards/rejected": 0.24375037848949432, "step": 151 }, { "epoch": 0.11105022831050228, "grad_norm": 86.33846516128399, "learning_rate": 1.386861313868613e-07, "logits/chosen": -2.8927791118621826, "logits/rejected": -2.4390792846679688, "logps/chosen": -964.9547119140625, "logps/rejected": -647.8114013671875, "loss": 0.6522, "rewards/accuracies": 0.625, "rewards/chosen": 0.5775248408317566, "rewards/margins": 0.17105084657669067, "rewards/rejected": 0.40647396445274353, "step": 152 }, { "epoch": 0.11178082191780822, "grad_norm": 82.78684449291295, "learning_rate": 1.3959854014598542e-07, "logits/chosen": -2.450356960296631, "logits/rejected": -2.409151315689087, "logps/chosen": -625.5921020507812, "logps/rejected": -611.0306396484375, "loss": 0.6, "rewards/accuracies": 0.5, "rewards/chosen": 0.48889225721359253, "rewards/margins": 0.1419597715139389, "rewards/rejected": 0.3469325006008148, "step": 153 }, { "epoch": 0.11251141552511415, "grad_norm": 90.24304747285244, "learning_rate": 1.4051094890510947e-07, "logits/chosen": -2.8198962211608887, "logits/rejected": -1.786680817604065, "logps/chosen": -508.3049621582031, "logps/rejected": -371.30572509765625, "loss": 0.6297, "rewards/accuracies": 0.875, "rewards/chosen": 0.4266386032104492, "rewards/margins": 0.24120083451271057, "rewards/rejected": 0.18543776869773865, "step": 154 }, { "epoch": 0.1132420091324201, "grad_norm": 76.26508239665614, "learning_rate": 1.4142335766423358e-07, "logits/chosen": -2.7051219940185547, "logits/rejected": -2.870657444000244, "logps/chosen": -533.772705078125, "logps/rejected": -518.00927734375, "loss": 0.6553, "rewards/accuracies": 0.625, "rewards/chosen": 0.3571118712425232, "rewards/margins": 0.04973965883255005, "rewards/rejected": 0.30737218260765076, "step": 155 }, { "epoch": 0.11397260273972602, "grad_norm": 78.37399695331035, "learning_rate": 1.4233576642335764e-07, "logits/chosen": -2.355966567993164, "logits/rejected": -2.2231760025024414, "logps/chosen": -367.9817199707031, "logps/rejected": -394.854736328125, "loss": 0.6291, "rewards/accuracies": 0.625, "rewards/chosen": 0.3295977711677551, "rewards/margins": 0.10383579134941101, "rewards/rejected": 0.2257619947195053, "step": 156 }, { "epoch": 0.11470319634703197, "grad_norm": 77.58428821187799, "learning_rate": 1.4324817518248175e-07, "logits/chosen": -2.398081064224243, "logits/rejected": -1.5529608726501465, "logps/chosen": -773.405029296875, "logps/rejected": -492.0581970214844, "loss": 0.6098, "rewards/accuracies": 0.75, "rewards/chosen": 0.44515928626060486, "rewards/margins": 0.1381773203611374, "rewards/rejected": 0.30698198080062866, "step": 157 }, { "epoch": 0.1154337899543379, "grad_norm": 80.6197400173726, "learning_rate": 1.4416058394160583e-07, "logits/chosen": -2.8868865966796875, "logits/rejected": -2.806532859802246, "logps/chosen": -850.6734619140625, "logps/rejected": -863.1929931640625, "loss": 0.6026, "rewards/accuracies": 0.75, "rewards/chosen": 0.5462120175361633, "rewards/margins": 0.073549285531044, "rewards/rejected": 0.4726627469062805, "step": 158 }, { "epoch": 0.11616438356164384, "grad_norm": 79.52899063588384, "learning_rate": 1.450729927007299e-07, "logits/chosen": -3.39042067527771, "logits/rejected": -1.9892792701721191, "logps/chosen": -935.4354248046875, "logps/rejected": -509.3211669921875, "loss": 0.5769, "rewards/accuracies": 1.0, "rewards/chosen": 0.9575724601745605, "rewards/margins": 0.5498519539833069, "rewards/rejected": 0.407720685005188, "step": 159 }, { "epoch": 0.11689497716894977, "grad_norm": 85.75678157403938, "learning_rate": 1.45985401459854e-07, "logits/chosen": -3.030714750289917, "logits/rejected": -2.255446672439575, "logps/chosen": -858.9308471679688, "logps/rejected": -537.9014892578125, "loss": 0.6774, "rewards/accuracies": 0.875, "rewards/chosen": 0.7878488302230835, "rewards/margins": 0.3557095229625702, "rewards/rejected": 0.43213921785354614, "step": 160 }, { "epoch": 0.11762557077625571, "grad_norm": 78.35677806015731, "learning_rate": 1.468978102189781e-07, "logits/chosen": -2.2562429904937744, "logits/rejected": -2.3007333278656006, "logps/chosen": -414.35577392578125, "logps/rejected": -482.61932373046875, "loss": 0.625, "rewards/accuracies": 0.375, "rewards/chosen": 0.4636576473712921, "rewards/margins": 0.08946628868579865, "rewards/rejected": 0.37419137358665466, "step": 161 }, { "epoch": 0.11835616438356164, "grad_norm": 87.38181192151774, "learning_rate": 1.4781021897810219e-07, "logits/chosen": -2.5616559982299805, "logits/rejected": -1.8634941577911377, "logps/chosen": -552.1171264648438, "logps/rejected": -479.1460266113281, "loss": 0.6718, "rewards/accuracies": 0.75, "rewards/chosen": 0.5521547794342041, "rewards/margins": 0.2216389775276184, "rewards/rejected": 0.3305158317089081, "step": 162 }, { "epoch": 0.11908675799086758, "grad_norm": 78.82790045805483, "learning_rate": 1.4872262773722627e-07, "logits/chosen": -3.3257782459259033, "logits/rejected": -1.854256272315979, "logps/chosen": -952.1135864257812, "logps/rejected": -552.9578857421875, "loss": 0.5996, "rewards/accuracies": 0.75, "rewards/chosen": 0.7658790349960327, "rewards/margins": 0.38342398405075073, "rewards/rejected": 0.382455050945282, "step": 163 }, { "epoch": 0.11981735159817351, "grad_norm": 89.02583400953765, "learning_rate": 1.4963503649635038e-07, "logits/chosen": -2.946385145187378, "logits/rejected": -2.6132524013519287, "logps/chosen": -739.89111328125, "logps/rejected": -684.9638671875, "loss": 0.6241, "rewards/accuracies": 0.75, "rewards/chosen": 0.7202823758125305, "rewards/margins": 0.2340320497751236, "rewards/rejected": 0.4862503111362457, "step": 164 }, { "epoch": 0.12054794520547946, "grad_norm": 77.43140550573617, "learning_rate": 1.5054744525547443e-07, "logits/chosen": -2.826040744781494, "logits/rejected": -1.980116605758667, "logps/chosen": -689.2582397460938, "logps/rejected": -388.88140869140625, "loss": 0.6116, "rewards/accuracies": 0.75, "rewards/chosen": 0.7030048370361328, "rewards/margins": 0.3683123290538788, "rewards/rejected": 0.33469247817993164, "step": 165 }, { "epoch": 0.12127853881278539, "grad_norm": 82.78807127898168, "learning_rate": 1.5145985401459854e-07, "logits/chosen": -3.3495891094207764, "logits/rejected": -2.266263961791992, "logps/chosen": -693.3480224609375, "logps/rejected": -339.72747802734375, "loss": 0.5983, "rewards/accuracies": 0.875, "rewards/chosen": 0.6726528406143188, "rewards/margins": 0.44282734394073486, "rewards/rejected": 0.2298254668712616, "step": 166 }, { "epoch": 0.12200913242009133, "grad_norm": 87.56915793745536, "learning_rate": 1.5237226277372262e-07, "logits/chosen": -3.205108642578125, "logits/rejected": -2.7109625339508057, "logps/chosen": -722.3635864257812, "logps/rejected": -660.7918701171875, "loss": 0.549, "rewards/accuracies": 0.625, "rewards/chosen": 0.7717700004577637, "rewards/margins": 0.2970830202102661, "rewards/rejected": 0.47468701004981995, "step": 167 }, { "epoch": 0.12273972602739726, "grad_norm": 90.09072575569307, "learning_rate": 1.532846715328467e-07, "logits/chosen": -2.8535451889038086, "logits/rejected": -1.9590256214141846, "logps/chosen": -588.5404052734375, "logps/rejected": -479.2421569824219, "loss": 0.6284, "rewards/accuracies": 0.625, "rewards/chosen": 0.6827892661094666, "rewards/margins": 0.22375735640525818, "rewards/rejected": 0.4590318500995636, "step": 168 }, { "epoch": 0.1234703196347032, "grad_norm": 82.89434045631523, "learning_rate": 1.541970802919708e-07, "logits/chosen": -2.747089147567749, "logits/rejected": -2.4174201488494873, "logps/chosen": -846.9073486328125, "logps/rejected": -648.0985107421875, "loss": 0.6338, "rewards/accuracies": 0.625, "rewards/chosen": 0.6959822177886963, "rewards/margins": 0.15559719502925873, "rewards/rejected": 0.5403850078582764, "step": 169 }, { "epoch": 0.12420091324200913, "grad_norm": 80.82145094957, "learning_rate": 1.551094890510949e-07, "logits/chosen": -2.9211201667785645, "logits/rejected": -2.743304491043091, "logps/chosen": -845.20947265625, "logps/rejected": -864.624267578125, "loss": 0.6539, "rewards/accuracies": 0.5, "rewards/chosen": 0.7908254861831665, "rewards/margins": -0.036072760820388794, "rewards/rejected": 0.8268982172012329, "step": 170 }, { "epoch": 0.12493150684931507, "grad_norm": 88.23043061619245, "learning_rate": 1.5602189781021895e-07, "logits/chosen": -2.8357512950897217, "logits/rejected": -2.515125274658203, "logps/chosen": -759.7244262695312, "logps/rejected": -661.550048828125, "loss": 0.5903, "rewards/accuracies": 0.625, "rewards/chosen": 0.7130581140518188, "rewards/margins": 0.14479103684425354, "rewards/rejected": 0.5682671070098877, "step": 171 }, { "epoch": 0.12566210045662102, "grad_norm": 78.63955548895225, "learning_rate": 1.5693430656934306e-07, "logits/chosen": -2.6732373237609863, "logits/rejected": -2.288652181625366, "logps/chosen": -651.6874389648438, "logps/rejected": -500.07586669921875, "loss": 0.5695, "rewards/accuracies": 0.625, "rewards/chosen": 0.7071515321731567, "rewards/margins": 0.1848773956298828, "rewards/rejected": 0.5222741365432739, "step": 172 }, { "epoch": 0.12639269406392695, "grad_norm": 76.80476339191144, "learning_rate": 1.5784671532846714e-07, "logits/chosen": -2.691638231277466, "logits/rejected": -1.7786341905593872, "logps/chosen": -570.5703735351562, "logps/rejected": -295.77166748046875, "loss": 0.5963, "rewards/accuracies": 0.875, "rewards/chosen": 0.5306312441825867, "rewards/margins": 0.26684871315956116, "rewards/rejected": 0.2637825012207031, "step": 173 }, { "epoch": 0.12712328767123288, "grad_norm": 78.4937764999159, "learning_rate": 1.5875912408759123e-07, "logits/chosen": -2.4227328300476074, "logits/rejected": -2.0154895782470703, "logps/chosen": -652.8206176757812, "logps/rejected": -645.4627075195312, "loss": 0.614, "rewards/accuracies": 0.625, "rewards/chosen": 0.758234441280365, "rewards/margins": 0.1650792956352234, "rewards/rejected": 0.5931550860404968, "step": 174 }, { "epoch": 0.1278538812785388, "grad_norm": 80.55996650399626, "learning_rate": 1.5967153284671533e-07, "logits/chosen": -2.8908326625823975, "logits/rejected": -2.718569278717041, "logps/chosen": -882.20458984375, "logps/rejected": -714.5914306640625, "loss": 0.5778, "rewards/accuracies": 0.5, "rewards/chosen": 0.9946810007095337, "rewards/margins": 0.1381153166294098, "rewards/rejected": 0.8565656542778015, "step": 175 }, { "epoch": 0.12858447488584474, "grad_norm": 86.69139771338331, "learning_rate": 1.6058394160583942e-07, "logits/chosen": -3.134392023086548, "logits/rejected": -2.1642322540283203, "logps/chosen": -746.23486328125, "logps/rejected": -475.73980712890625, "loss": 0.5677, "rewards/accuracies": 0.75, "rewards/chosen": 0.879706621170044, "rewards/margins": 0.5476240515708923, "rewards/rejected": 0.3320825695991516, "step": 176 }, { "epoch": 0.1293150684931507, "grad_norm": 87.34215394337805, "learning_rate": 1.614963503649635e-07, "logits/chosen": -2.8779499530792236, "logits/rejected": -2.3495259284973145, "logps/chosen": -629.9741821289062, "logps/rejected": -508.7207946777344, "loss": 0.5922, "rewards/accuracies": 0.75, "rewards/chosen": 0.8931781649589539, "rewards/margins": 0.32973647117614746, "rewards/rejected": 0.5634416341781616, "step": 177 }, { "epoch": 0.13004566210045662, "grad_norm": 84.86092137469696, "learning_rate": 1.6240875912408758e-07, "logits/chosen": -3.2996253967285156, "logits/rejected": -2.104947805404663, "logps/chosen": -726.5145263671875, "logps/rejected": -393.6444091796875, "loss": 0.6139, "rewards/accuracies": 1.0, "rewards/chosen": 0.9281253218650818, "rewards/margins": 0.5385391712188721, "rewards/rejected": 0.3895861506462097, "step": 178 }, { "epoch": 0.13077625570776255, "grad_norm": 79.37333905742533, "learning_rate": 1.633211678832117e-07, "logits/chosen": -2.9171640872955322, "logits/rejected": -2.153207778930664, "logps/chosen": -776.2034301757812, "logps/rejected": -556.625, "loss": 0.5301, "rewards/accuracies": 0.875, "rewards/chosen": 0.9071258306503296, "rewards/margins": 0.4115884006023407, "rewards/rejected": 0.49553731083869934, "step": 179 }, { "epoch": 0.13150684931506848, "grad_norm": 74.32121545296638, "learning_rate": 1.6423357664233575e-07, "logits/chosen": -3.2384092807769775, "logits/rejected": -1.7445080280303955, "logps/chosen": -700.7176513671875, "logps/rejected": -314.8205871582031, "loss": 0.5186, "rewards/accuracies": 1.0, "rewards/chosen": 0.9475655555725098, "rewards/margins": 0.636527419090271, "rewards/rejected": 0.31103813648223877, "step": 180 }, { "epoch": 0.13223744292237444, "grad_norm": 78.73699359358503, "learning_rate": 1.6514598540145986e-07, "logits/chosen": -2.5820059776306152, "logits/rejected": -2.340888500213623, "logps/chosen": -617.4547119140625, "logps/rejected": -535.4717407226562, "loss": 0.5858, "rewards/accuracies": 0.75, "rewards/chosen": 0.6983575820922852, "rewards/margins": 0.18958429992198944, "rewards/rejected": 0.5087732672691345, "step": 181 }, { "epoch": 0.13296803652968037, "grad_norm": 71.55482013922088, "learning_rate": 1.6605839416058394e-07, "logits/chosen": -2.2918224334716797, "logits/rejected": -1.5707656145095825, "logps/chosen": -689.3817138671875, "logps/rejected": -527.7999877929688, "loss": 0.6017, "rewards/accuracies": 0.625, "rewards/chosen": 0.9046360850334167, "rewards/margins": 0.36593565344810486, "rewards/rejected": 0.5387004613876343, "step": 182 }, { "epoch": 0.1336986301369863, "grad_norm": 77.54427603694388, "learning_rate": 1.6697080291970802e-07, "logits/chosen": -3.0792856216430664, "logits/rejected": -1.7852325439453125, "logps/chosen": -561.5117797851562, "logps/rejected": -257.5528564453125, "loss": 0.5771, "rewards/accuracies": 0.75, "rewards/chosen": 0.6147441267967224, "rewards/margins": 0.35926634073257446, "rewards/rejected": 0.2554778456687927, "step": 183 }, { "epoch": 0.13442922374429223, "grad_norm": 103.1540726065263, "learning_rate": 1.678832116788321e-07, "logits/chosen": -2.83481502532959, "logits/rejected": -2.661140203475952, "logps/chosen": -456.9952087402344, "logps/rejected": -507.12322998046875, "loss": 0.7233, "rewards/accuracies": 0.25, "rewards/chosen": 0.6426022052764893, "rewards/margins": 0.00763702392578125, "rewards/rejected": 0.634965181350708, "step": 184 }, { "epoch": 0.13515981735159818, "grad_norm": 178.64500303499955, "learning_rate": 1.687956204379562e-07, "logits/chosen": -3.009525775909424, "logits/rejected": -1.3381484746932983, "logps/chosen": -813.422607421875, "logps/rejected": -328.04547119140625, "loss": 0.5356, "rewards/accuracies": 0.75, "rewards/chosen": 1.0504698753356934, "rewards/margins": 0.7035579681396484, "rewards/rejected": 0.34691184759140015, "step": 185 }, { "epoch": 0.1358904109589041, "grad_norm": 84.825731857475, "learning_rate": 1.6970802919708027e-07, "logits/chosen": -2.6678996086120605, "logits/rejected": -1.5287379026412964, "logps/chosen": -569.0751342773438, "logps/rejected": -481.09722900390625, "loss": 0.6003, "rewards/accuracies": 0.75, "rewards/chosen": 0.82135009765625, "rewards/margins": 0.41061291098594666, "rewards/rejected": 0.41073718667030334, "step": 186 }, { "epoch": 0.13662100456621004, "grad_norm": 88.88851701446734, "learning_rate": 1.7062043795620438e-07, "logits/chosen": -3.245687484741211, "logits/rejected": -3.1057562828063965, "logps/chosen": -742.8546142578125, "logps/rejected": -745.5220947265625, "loss": 0.6584, "rewards/accuracies": 0.625, "rewards/chosen": 0.9171077013015747, "rewards/margins": 0.00873296707868576, "rewards/rejected": 0.9083747267723083, "step": 187 }, { "epoch": 0.13735159817351597, "grad_norm": 68.85661123055831, "learning_rate": 1.7153284671532848e-07, "logits/chosen": -2.3408288955688477, "logits/rejected": -2.455859661102295, "logps/chosen": -307.6382751464844, "logps/rejected": -377.42987060546875, "loss": 0.5917, "rewards/accuracies": 0.375, "rewards/chosen": 0.39942875504493713, "rewards/margins": 0.054407790303230286, "rewards/rejected": 0.34502097964286804, "step": 188 }, { "epoch": 0.13808219178082193, "grad_norm": 86.58844658516082, "learning_rate": 1.7244525547445254e-07, "logits/chosen": -1.770728349685669, "logits/rejected": -2.412621259689331, "logps/chosen": -482.75262451171875, "logps/rejected": -575.696044921875, "loss": 0.663, "rewards/accuracies": 0.625, "rewards/chosen": 0.6506732702255249, "rewards/margins": -0.07875415682792664, "rewards/rejected": 0.7294274568557739, "step": 189 }, { "epoch": 0.13881278538812786, "grad_norm": 88.12839089220908, "learning_rate": 1.7335766423357665e-07, "logits/chosen": -2.7301552295684814, "logits/rejected": -1.8274471759796143, "logps/chosen": -636.6163330078125, "logps/rejected": -423.581298828125, "loss": 0.572, "rewards/accuracies": 0.75, "rewards/chosen": 1.0559425354003906, "rewards/margins": 0.649682879447937, "rewards/rejected": 0.4062596559524536, "step": 190 }, { "epoch": 0.1395433789954338, "grad_norm": 84.5232060636678, "learning_rate": 1.742700729927007e-07, "logits/chosen": -2.3254761695861816, "logits/rejected": -2.031216621398926, "logps/chosen": -745.0237426757812, "logps/rejected": -711.3261108398438, "loss": 0.6067, "rewards/accuracies": 0.75, "rewards/chosen": 1.130950927734375, "rewards/margins": 0.2350543737411499, "rewards/rejected": 0.8958964943885803, "step": 191 }, { "epoch": 0.14027397260273972, "grad_norm": 78.83980804156775, "learning_rate": 1.7518248175182481e-07, "logits/chosen": -2.97668719291687, "logits/rejected": -1.903767704963684, "logps/chosen": -484.87420654296875, "logps/rejected": -300.475830078125, "loss": 0.52, "rewards/accuracies": 0.875, "rewards/chosen": 0.6172728538513184, "rewards/margins": 0.35382556915283203, "rewards/rejected": 0.26344728469848633, "step": 192 }, { "epoch": 0.14100456621004567, "grad_norm": 67.1773865934903, "learning_rate": 1.760948905109489e-07, "logits/chosen": -3.110548496246338, "logits/rejected": -2.0560271739959717, "logps/chosen": -1065.4144287109375, "logps/rejected": -552.9881591796875, "loss": 0.5664, "rewards/accuracies": 1.0, "rewards/chosen": 1.4490073919296265, "rewards/margins": 0.8108180165290833, "rewards/rejected": 0.638189435005188, "step": 193 }, { "epoch": 0.1417351598173516, "grad_norm": 85.05710685052279, "learning_rate": 1.7700729927007298e-07, "logits/chosen": -2.8937292098999023, "logits/rejected": -2.270338535308838, "logps/chosen": -1067.8717041015625, "logps/rejected": -776.0604858398438, "loss": 0.5672, "rewards/accuracies": 0.875, "rewards/chosen": 1.228278636932373, "rewards/margins": 0.32005056738853455, "rewards/rejected": 0.9082280397415161, "step": 194 }, { "epoch": 0.14246575342465753, "grad_norm": 83.7966113295311, "learning_rate": 1.7791970802919706e-07, "logits/chosen": -2.6625092029571533, "logits/rejected": -1.5471183061599731, "logps/chosen": -706.6519775390625, "logps/rejected": -342.4577331542969, "loss": 0.5772, "rewards/accuracies": 0.75, "rewards/chosen": 1.0580389499664307, "rewards/margins": 0.5144640803337097, "rewards/rejected": 0.5435748100280762, "step": 195 }, { "epoch": 0.14319634703196346, "grad_norm": 75.7382900138282, "learning_rate": 1.7883211678832117e-07, "logits/chosen": -2.6765236854553223, "logits/rejected": -2.5347814559936523, "logps/chosen": -472.42279052734375, "logps/rejected": -515.365478515625, "loss": 0.5903, "rewards/accuracies": 0.75, "rewards/chosen": 0.8594018220901489, "rewards/margins": 0.21497842669487, "rewards/rejected": 0.6444233655929565, "step": 196 }, { "epoch": 0.14392694063926942, "grad_norm": 78.71968352463537, "learning_rate": 1.7974452554744523e-07, "logits/chosen": -2.5754096508026123, "logits/rejected": -1.9804811477661133, "logps/chosen": -526.6971435546875, "logps/rejected": -476.05902099609375, "loss": 0.6653, "rewards/accuracies": 0.625, "rewards/chosen": 0.7043365240097046, "rewards/margins": 0.24690280854701996, "rewards/rejected": 0.4574337601661682, "step": 197 }, { "epoch": 0.14465753424657535, "grad_norm": 72.6857063944659, "learning_rate": 1.8065693430656933e-07, "logits/chosen": -3.012587070465088, "logits/rejected": -2.4343910217285156, "logps/chosen": -706.918212890625, "logps/rejected": -613.4925537109375, "loss": 0.5215, "rewards/accuracies": 0.75, "rewards/chosen": 1.087092638015747, "rewards/margins": 0.2196529507637024, "rewards/rejected": 0.8674396276473999, "step": 198 }, { "epoch": 0.14538812785388128, "grad_norm": 75.25715030397444, "learning_rate": 1.8156934306569342e-07, "logits/chosen": -2.9841954708099365, "logits/rejected": -2.443324565887451, "logps/chosen": -983.556640625, "logps/rejected": -783.7759399414062, "loss": 0.5185, "rewards/accuracies": 0.875, "rewards/chosen": 1.7861069440841675, "rewards/margins": 0.7371990084648132, "rewards/rejected": 1.0489078760147095, "step": 199 }, { "epoch": 0.1461187214611872, "grad_norm": 68.88123837446595, "learning_rate": 1.824817518248175e-07, "logits/chosen": -2.390197992324829, "logits/rejected": -2.1642444133758545, "logps/chosen": -541.718505859375, "logps/rejected": -464.4938659667969, "loss": 0.5573, "rewards/accuracies": 0.625, "rewards/chosen": 1.0004321336746216, "rewards/margins": 0.41516679525375366, "rewards/rejected": 0.5852653384208679, "step": 200 }, { "epoch": 0.14684931506849316, "grad_norm": 81.2957353249926, "learning_rate": 1.833941605839416e-07, "logits/chosen": -3.1481645107269287, "logits/rejected": -3.1305737495422363, "logps/chosen": -775.6622314453125, "logps/rejected": -723.1766357421875, "loss": 0.6035, "rewards/accuracies": 0.5, "rewards/chosen": 0.9031819105148315, "rewards/margins": 0.05318256467580795, "rewards/rejected": 0.8499992489814758, "step": 201 }, { "epoch": 0.1475799086757991, "grad_norm": 87.85625855890262, "learning_rate": 1.843065693430657e-07, "logits/chosen": -2.4703025817871094, "logits/rejected": -1.7432963848114014, "logps/chosen": -436.816162109375, "logps/rejected": -324.11749267578125, "loss": 0.5538, "rewards/accuracies": 0.875, "rewards/chosen": 0.8269995450973511, "rewards/margins": 0.38916996121406555, "rewards/rejected": 0.43782952427864075, "step": 202 }, { "epoch": 0.14831050228310502, "grad_norm": 74.33429492733318, "learning_rate": 1.8521897810218977e-07, "logits/chosen": -2.5425682067871094, "logits/rejected": -2.4242217540740967, "logps/chosen": -586.1406860351562, "logps/rejected": -468.6368103027344, "loss": 0.6336, "rewards/accuracies": 0.625, "rewards/chosen": 0.9226390719413757, "rewards/margins": 0.31960731744766235, "rewards/rejected": 0.6030316948890686, "step": 203 }, { "epoch": 0.14904109589041095, "grad_norm": 76.14382015407587, "learning_rate": 1.8613138686131385e-07, "logits/chosen": -2.8815910816192627, "logits/rejected": -2.2552220821380615, "logps/chosen": -561.5079345703125, "logps/rejected": -418.2566833496094, "loss": 0.5285, "rewards/accuracies": 0.625, "rewards/chosen": 0.9051660299301147, "rewards/margins": 0.5028525590896606, "rewards/rejected": 0.4023135304450989, "step": 204 }, { "epoch": 0.14977168949771688, "grad_norm": 85.52744499665737, "learning_rate": 1.8704379562043796e-07, "logits/chosen": -2.6947875022888184, "logits/rejected": -1.9797539710998535, "logps/chosen": -542.1869506835938, "logps/rejected": -407.9595947265625, "loss": 0.6821, "rewards/accuracies": 0.75, "rewards/chosen": 0.8420469760894775, "rewards/margins": 0.3320204019546509, "rewards/rejected": 0.5100265741348267, "step": 205 }, { "epoch": 0.15050228310502284, "grad_norm": 71.52934548578646, "learning_rate": 1.8795620437956202e-07, "logits/chosen": -2.823256731033325, "logits/rejected": -1.5095958709716797, "logps/chosen": -469.8023986816406, "logps/rejected": -253.16510009765625, "loss": 0.5741, "rewards/accuracies": 0.75, "rewards/chosen": 0.9283183813095093, "rewards/margins": 0.6453900933265686, "rewards/rejected": 0.2829282283782959, "step": 206 }, { "epoch": 0.15123287671232877, "grad_norm": 76.18781083304107, "learning_rate": 1.8886861313868613e-07, "logits/chosen": -2.568154811859131, "logits/rejected": -1.6165225505828857, "logps/chosen": -463.3511962890625, "logps/rejected": -273.97076416015625, "loss": 0.5803, "rewards/accuracies": 0.875, "rewards/chosen": 0.9411702156066895, "rewards/margins": 0.6181767582893372, "rewards/rejected": 0.3229934573173523, "step": 207 }, { "epoch": 0.1519634703196347, "grad_norm": 106.6313285783076, "learning_rate": 1.897810218978102e-07, "logits/chosen": -2.63698673248291, "logits/rejected": -2.602663040161133, "logps/chosen": -551.52734375, "logps/rejected": -543.1782836914062, "loss": 0.7517, "rewards/accuracies": 0.5, "rewards/chosen": 1.1540253162384033, "rewards/margins": 0.3960002660751343, "rewards/rejected": 0.7580249905586243, "step": 208 }, { "epoch": 0.15269406392694063, "grad_norm": 80.88105838494418, "learning_rate": 1.906934306569343e-07, "logits/chosen": -2.748206615447998, "logits/rejected": -2.5028367042541504, "logps/chosen": -424.3001403808594, "logps/rejected": -534.69091796875, "loss": 0.6547, "rewards/accuracies": 0.375, "rewards/chosen": 0.6310283541679382, "rewards/margins": -0.06428262591362, "rewards/rejected": 0.6953110098838806, "step": 209 }, { "epoch": 0.15342465753424658, "grad_norm": 73.71954482403225, "learning_rate": 1.9160583941605838e-07, "logits/chosen": -2.769094467163086, "logits/rejected": -1.9821791648864746, "logps/chosen": -591.4356079101562, "logps/rejected": -357.78216552734375, "loss": 0.5204, "rewards/accuracies": 0.875, "rewards/chosen": 0.9313857555389404, "rewards/margins": 0.3839508593082428, "rewards/rejected": 0.54743492603302, "step": 210 }, { "epoch": 0.1541552511415525, "grad_norm": 70.04354463140233, "learning_rate": 1.9251824817518248e-07, "logits/chosen": -2.799206256866455, "logits/rejected": -2.3669843673706055, "logps/chosen": -700.6721801757812, "logps/rejected": -544.6993408203125, "loss": 0.5211, "rewards/accuracies": 0.75, "rewards/chosen": 1.3830292224884033, "rewards/margins": 0.5995250344276428, "rewards/rejected": 0.7835041284561157, "step": 211 }, { "epoch": 0.15488584474885844, "grad_norm": 74.05880133169521, "learning_rate": 1.9343065693430657e-07, "logits/chosen": -2.244021415710449, "logits/rejected": -1.7688051462173462, "logps/chosen": -590.4176635742188, "logps/rejected": -498.4833068847656, "loss": 0.5922, "rewards/accuracies": 0.625, "rewards/chosen": 0.7582947015762329, "rewards/margins": 0.13028010725975037, "rewards/rejected": 0.6280146241188049, "step": 212 }, { "epoch": 0.15561643835616437, "grad_norm": 78.13439006069126, "learning_rate": 1.9434306569343065e-07, "logits/chosen": -2.4926886558532715, "logits/rejected": -1.9801173210144043, "logps/chosen": -796.3925170898438, "logps/rejected": -654.757080078125, "loss": 0.5708, "rewards/accuracies": 0.75, "rewards/chosen": 1.0442055463790894, "rewards/margins": 0.3082372546195984, "rewards/rejected": 0.7359682321548462, "step": 213 }, { "epoch": 0.15634703196347033, "grad_norm": 81.70436357590548, "learning_rate": 1.9525547445255476e-07, "logits/chosen": -2.6516776084899902, "logits/rejected": -1.7029396295547485, "logps/chosen": -419.9661865234375, "logps/rejected": -245.0514373779297, "loss": 0.6234, "rewards/accuracies": 0.75, "rewards/chosen": 0.7692387104034424, "rewards/margins": 0.44322672486305237, "rewards/rejected": 0.32601198554039, "step": 214 }, { "epoch": 0.15707762557077626, "grad_norm": 94.20419145398999, "learning_rate": 1.9616788321167881e-07, "logits/chosen": -3.009077548980713, "logits/rejected": -3.262133836746216, "logps/chosen": -900.0164794921875, "logps/rejected": -895.096435546875, "loss": 0.6602, "rewards/accuracies": 0.25, "rewards/chosen": 1.3191932439804077, "rewards/margins": -0.09091843664646149, "rewards/rejected": 1.410111665725708, "step": 215 }, { "epoch": 0.1578082191780822, "grad_norm": 76.95248889331, "learning_rate": 1.9708029197080292e-07, "logits/chosen": -3.142545700073242, "logits/rejected": -2.1240718364715576, "logps/chosen": -808.8876342773438, "logps/rejected": -639.8170166015625, "loss": 0.5428, "rewards/accuracies": 0.75, "rewards/chosen": 1.236344575881958, "rewards/margins": 0.41494402289390564, "rewards/rejected": 0.82140052318573, "step": 216 }, { "epoch": 0.15853881278538812, "grad_norm": 71.30340309528536, "learning_rate": 1.97992700729927e-07, "logits/chosen": -3.5179290771484375, "logits/rejected": -2.3180737495422363, "logps/chosen": -745.0448608398438, "logps/rejected": -441.0702819824219, "loss": 0.5425, "rewards/accuracies": 0.875, "rewards/chosen": 1.1852515935897827, "rewards/margins": 0.6750349402427673, "rewards/rejected": 0.5102167129516602, "step": 217 }, { "epoch": 0.15926940639269407, "grad_norm": 100.78639521749699, "learning_rate": 1.989051094890511e-07, "logits/chosen": -2.7862751483917236, "logits/rejected": -2.2040841579437256, "logps/chosen": -695.5875854492188, "logps/rejected": -706.9476318359375, "loss": 0.6745, "rewards/accuracies": 0.625, "rewards/chosen": 1.1939952373504639, "rewards/margins": 0.21458742022514343, "rewards/rejected": 0.9794078469276428, "step": 218 }, { "epoch": 0.16, "grad_norm": 81.39231826564624, "learning_rate": 1.9981751824817517e-07, "logits/chosen": -2.413259267807007, "logits/rejected": -2.2652482986450195, "logps/chosen": -528.039794921875, "logps/rejected": -516.8497924804688, "loss": 0.595, "rewards/accuracies": 0.5, "rewards/chosen": 0.9047300815582275, "rewards/margins": 0.09437532722949982, "rewards/rejected": 0.8103547096252441, "step": 219 }, { "epoch": 0.16073059360730593, "grad_norm": 70.1631260550191, "learning_rate": 2.0072992700729928e-07, "logits/chosen": -3.564574718475342, "logits/rejected": -2.2443180084228516, "logps/chosen": -791.29638671875, "logps/rejected": -403.9780578613281, "loss": 0.5532, "rewards/accuracies": 0.875, "rewards/chosen": 1.504280924797058, "rewards/margins": 0.8763354420661926, "rewards/rejected": 0.6279454231262207, "step": 220 }, { "epoch": 0.16146118721461186, "grad_norm": 67.5561543042157, "learning_rate": 2.0164233576642333e-07, "logits/chosen": -3.0908186435699463, "logits/rejected": -2.038489818572998, "logps/chosen": -588.0908813476562, "logps/rejected": -405.4306640625, "loss": 0.5409, "rewards/accuracies": 0.875, "rewards/chosen": 1.1421116590499878, "rewards/margins": 0.7327708005905151, "rewards/rejected": 0.40934085845947266, "step": 221 }, { "epoch": 0.16219178082191782, "grad_norm": 69.10737018062002, "learning_rate": 2.0255474452554744e-07, "logits/chosen": -2.703368663787842, "logits/rejected": -2.1498918533325195, "logps/chosen": -495.86492919921875, "logps/rejected": -473.0352783203125, "loss": 0.5107, "rewards/accuracies": 0.75, "rewards/chosen": 1.1054356098175049, "rewards/margins": 0.4494631886482239, "rewards/rejected": 0.6559724807739258, "step": 222 }, { "epoch": 0.16292237442922375, "grad_norm": 82.88927749847232, "learning_rate": 2.034671532846715e-07, "logits/chosen": -2.755833625793457, "logits/rejected": -1.9813343286514282, "logps/chosen": -747.56396484375, "logps/rejected": -519.1740112304688, "loss": 0.5538, "rewards/accuracies": 0.75, "rewards/chosen": 1.3883613348007202, "rewards/margins": 0.682245671749115, "rewards/rejected": 0.70611572265625, "step": 223 }, { "epoch": 0.16365296803652968, "grad_norm": 81.0521328663688, "learning_rate": 2.043795620437956e-07, "logits/chosen": -2.934709310531616, "logits/rejected": -2.1471290588378906, "logps/chosen": -713.1524658203125, "logps/rejected": -596.05810546875, "loss": 0.6198, "rewards/accuracies": 0.625, "rewards/chosen": 1.2987415790557861, "rewards/margins": 0.41292476654052734, "rewards/rejected": 0.8858168125152588, "step": 224 }, { "epoch": 0.1643835616438356, "grad_norm": 62.154119103116365, "learning_rate": 2.0529197080291972e-07, "logits/chosen": -3.1293444633483887, "logits/rejected": -2.381269931793213, "logps/chosen": -514.0813598632812, "logps/rejected": -333.535400390625, "loss": 0.5061, "rewards/accuracies": 0.75, "rewards/chosen": 1.0527342557907104, "rewards/margins": 0.6827306151390076, "rewards/rejected": 0.37000367045402527, "step": 225 }, { "epoch": 0.16511415525114156, "grad_norm": 64.53184019100571, "learning_rate": 2.0620437956204377e-07, "logits/chosen": -2.6537740230560303, "logits/rejected": -2.5749406814575195, "logps/chosen": -439.68743896484375, "logps/rejected": -393.2142028808594, "loss": 0.51, "rewards/accuracies": 0.5, "rewards/chosen": 0.7434030771255493, "rewards/margins": 0.13971538841724396, "rewards/rejected": 0.6036876440048218, "step": 226 }, { "epoch": 0.1658447488584475, "grad_norm": 69.12081107334824, "learning_rate": 2.0711678832116788e-07, "logits/chosen": -2.8800055980682373, "logits/rejected": -1.8125406503677368, "logps/chosen": -684.0807495117188, "logps/rejected": -339.61663818359375, "loss": 0.5151, "rewards/accuracies": 0.875, "rewards/chosen": 1.1461615562438965, "rewards/margins": 0.7433428764343262, "rewards/rejected": 0.4028186798095703, "step": 227 }, { "epoch": 0.16657534246575342, "grad_norm": 74.85012694649389, "learning_rate": 2.0802919708029196e-07, "logits/chosen": -3.5455808639526367, "logits/rejected": -2.4084603786468506, "logps/chosen": -996.4473876953125, "logps/rejected": -668.0835571289062, "loss": 0.5181, "rewards/accuracies": 1.0, "rewards/chosen": 1.7439647912979126, "rewards/margins": 0.6807949542999268, "rewards/rejected": 1.0631699562072754, "step": 228 }, { "epoch": 0.16730593607305935, "grad_norm": 89.81779244583865, "learning_rate": 2.0894160583941605e-07, "logits/chosen": -2.258965253829956, "logits/rejected": -2.3829548358917236, "logps/chosen": -708.6490478515625, "logps/rejected": -754.6040649414062, "loss": 0.6213, "rewards/accuracies": 0.625, "rewards/chosen": 1.4068360328674316, "rewards/margins": 0.1320466846227646, "rewards/rejected": 1.2747893333435059, "step": 229 }, { "epoch": 0.1680365296803653, "grad_norm": 64.428826859324, "learning_rate": 2.0985401459854013e-07, "logits/chosen": -2.725044012069702, "logits/rejected": -1.8444525003433228, "logps/chosen": -767.388671875, "logps/rejected": -509.34527587890625, "loss": 0.523, "rewards/accuracies": 0.625, "rewards/chosen": 1.3547019958496094, "rewards/margins": 0.509853720664978, "rewards/rejected": 0.8448483943939209, "step": 230 }, { "epoch": 0.16876712328767124, "grad_norm": 70.7712189360018, "learning_rate": 2.1076642335766424e-07, "logits/chosen": -3.2799556255340576, "logits/rejected": -2.75344181060791, "logps/chosen": -1155.4024658203125, "logps/rejected": -882.24658203125, "loss": 0.5164, "rewards/accuracies": 0.75, "rewards/chosen": 2.133986473083496, "rewards/margins": 0.5153324007987976, "rewards/rejected": 1.6186540126800537, "step": 231 }, { "epoch": 0.16949771689497717, "grad_norm": 90.60851681995739, "learning_rate": 2.116788321167883e-07, "logits/chosen": -2.5423197746276855, "logits/rejected": -2.5232248306274414, "logps/chosen": -624.2681884765625, "logps/rejected": -666.90869140625, "loss": 0.5974, "rewards/accuracies": 0.5, "rewards/chosen": 1.1147034168243408, "rewards/margins": 0.13325423002243042, "rewards/rejected": 0.9814491868019104, "step": 232 }, { "epoch": 0.1702283105022831, "grad_norm": 94.55974777058947, "learning_rate": 2.125912408759124e-07, "logits/chosen": -2.4678163528442383, "logits/rejected": -2.3685598373413086, "logps/chosen": -684.329345703125, "logps/rejected": -604.2761840820312, "loss": 0.6834, "rewards/accuracies": 0.5, "rewards/chosen": 1.3901867866516113, "rewards/margins": 0.29098498821258545, "rewards/rejected": 1.0992019176483154, "step": 233 }, { "epoch": 0.17095890410958905, "grad_norm": 67.14617915414101, "learning_rate": 2.1350364963503648e-07, "logits/chosen": -2.6443300247192383, "logits/rejected": -2.3815507888793945, "logps/chosen": -815.861083984375, "logps/rejected": -730.37109375, "loss": 0.4778, "rewards/accuracies": 0.625, "rewards/chosen": 1.1177399158477783, "rewards/margins": 0.05571576952934265, "rewards/rejected": 1.0620241165161133, "step": 234 }, { "epoch": 0.17168949771689498, "grad_norm": 60.88356788075601, "learning_rate": 2.1441605839416057e-07, "logits/chosen": -3.1002659797668457, "logits/rejected": -2.596212148666382, "logps/chosen": -581.698486328125, "logps/rejected": -511.74859619140625, "loss": 0.5382, "rewards/accuracies": 0.875, "rewards/chosen": 1.2246569395065308, "rewards/margins": 0.42247888445854187, "rewards/rejected": 0.802178144454956, "step": 235 }, { "epoch": 0.1724200913242009, "grad_norm": 69.9613418644093, "learning_rate": 2.1532846715328465e-07, "logits/chosen": -2.5789170265197754, "logits/rejected": -2.5800859928131104, "logps/chosen": -650.28955078125, "logps/rejected": -668.7222900390625, "loss": 0.5077, "rewards/accuracies": 0.5, "rewards/chosen": 1.0968495607376099, "rewards/margins": 0.32358869910240173, "rewards/rejected": 0.7732609510421753, "step": 236 }, { "epoch": 0.17315068493150684, "grad_norm": 69.85015350958436, "learning_rate": 2.1624087591240876e-07, "logits/chosen": -3.0414507389068604, "logits/rejected": -2.265570640563965, "logps/chosen": -892.5299072265625, "logps/rejected": -574.36376953125, "loss": 0.574, "rewards/accuracies": 0.75, "rewards/chosen": 1.8259624242782593, "rewards/margins": 0.7247467637062073, "rewards/rejected": 1.1012157201766968, "step": 237 }, { "epoch": 0.17388127853881277, "grad_norm": 88.86111002570247, "learning_rate": 2.1715328467153284e-07, "logits/chosen": -2.7190353870391846, "logits/rejected": -2.19610595703125, "logps/chosen": -817.2662353515625, "logps/rejected": -700.686767578125, "loss": 0.6038, "rewards/accuracies": 0.75, "rewards/chosen": 1.4547163248062134, "rewards/margins": 0.42043083906173706, "rewards/rejected": 1.034285545349121, "step": 238 }, { "epoch": 0.17461187214611873, "grad_norm": 70.88730327516346, "learning_rate": 2.1806569343065692e-07, "logits/chosen": -2.622231960296631, "logits/rejected": -2.128929853439331, "logps/chosen": -887.3323974609375, "logps/rejected": -560.5359497070312, "loss": 0.5156, "rewards/accuracies": 0.875, "rewards/chosen": 1.6341009140014648, "rewards/margins": 0.7075695395469666, "rewards/rejected": 0.9265313744544983, "step": 239 }, { "epoch": 0.17534246575342466, "grad_norm": 107.76203075962098, "learning_rate": 2.1897810218978103e-07, "logits/chosen": -3.1331827640533447, "logits/rejected": -2.304816961288452, "logps/chosen": -642.5380249023438, "logps/rejected": -579.2041015625, "loss": 0.6712, "rewards/accuracies": 0.5, "rewards/chosen": 1.5731760263442993, "rewards/margins": 0.4546414017677307, "rewards/rejected": 1.1185345649719238, "step": 240 }, { "epoch": 0.1760730593607306, "grad_norm": 72.58038377809117, "learning_rate": 2.1989051094890509e-07, "logits/chosen": -3.0504534244537354, "logits/rejected": -2.261298656463623, "logps/chosen": -883.864990234375, "logps/rejected": -689.2882080078125, "loss": 0.5362, "rewards/accuracies": 0.75, "rewards/chosen": 1.7411744594573975, "rewards/margins": 0.7635552287101746, "rewards/rejected": 0.9776191711425781, "step": 241 }, { "epoch": 0.17680365296803652, "grad_norm": 65.47747555218893, "learning_rate": 2.208029197080292e-07, "logits/chosen": -2.655304431915283, "logits/rejected": -1.7437992095947266, "logps/chosen": -457.5860290527344, "logps/rejected": -313.00689697265625, "loss": 0.5623, "rewards/accuracies": 0.625, "rewards/chosen": 0.9686903953552246, "rewards/margins": 0.4919581115245819, "rewards/rejected": 0.4767322242259979, "step": 242 }, { "epoch": 0.17753424657534247, "grad_norm": 73.19833729007271, "learning_rate": 2.2171532846715328e-07, "logits/chosen": -2.6554746627807617, "logits/rejected": -2.0331335067749023, "logps/chosen": -730.6705322265625, "logps/rejected": -629.2322998046875, "loss": 0.4896, "rewards/accuracies": 0.625, "rewards/chosen": 1.4308528900146484, "rewards/margins": 0.466938853263855, "rewards/rejected": 0.9639139175415039, "step": 243 }, { "epoch": 0.1782648401826484, "grad_norm": 66.82559506843168, "learning_rate": 2.2262773722627736e-07, "logits/chosen": -2.3991191387176514, "logits/rejected": -1.2453111410140991, "logps/chosen": -512.8739013671875, "logps/rejected": -253.51632690429688, "loss": 0.5279, "rewards/accuracies": 0.75, "rewards/chosen": 1.045324683189392, "rewards/margins": 0.7577945590019226, "rewards/rejected": 0.2875301241874695, "step": 244 }, { "epoch": 0.17899543378995433, "grad_norm": 77.31514938366703, "learning_rate": 2.2354014598540144e-07, "logits/chosen": -2.8442437648773193, "logits/rejected": -2.171132802963257, "logps/chosen": -783.6129760742188, "logps/rejected": -726.4027709960938, "loss": 0.5492, "rewards/accuracies": 0.875, "rewards/chosen": 1.70475435256958, "rewards/margins": 0.5218314528465271, "rewards/rejected": 1.1829228401184082, "step": 245 }, { "epoch": 0.17972602739726026, "grad_norm": 74.25422507444442, "learning_rate": 2.2445255474452555e-07, "logits/chosen": -2.8666396141052246, "logits/rejected": -2.246265411376953, "logps/chosen": -495.06805419921875, "logps/rejected": -438.7255554199219, "loss": 0.503, "rewards/accuracies": 0.75, "rewards/chosen": 1.1834778785705566, "rewards/margins": 0.45554840564727783, "rewards/rejected": 0.7279293537139893, "step": 246 }, { "epoch": 0.18045662100456622, "grad_norm": 71.11929829408999, "learning_rate": 2.253649635036496e-07, "logits/chosen": -2.746885299682617, "logits/rejected": -1.8349051475524902, "logps/chosen": -1033.4638671875, "logps/rejected": -591.6947631835938, "loss": 0.5104, "rewards/accuracies": 1.0, "rewards/chosen": 2.0927340984344482, "rewards/margins": 0.9600479602813721, "rewards/rejected": 1.1326862573623657, "step": 247 }, { "epoch": 0.18118721461187215, "grad_norm": 90.49878333522639, "learning_rate": 2.2627737226277372e-07, "logits/chosen": -2.745241165161133, "logits/rejected": -2.4862418174743652, "logps/chosen": -945.704833984375, "logps/rejected": -653.7545166015625, "loss": 0.6383, "rewards/accuracies": 0.75, "rewards/chosen": 1.810364842414856, "rewards/margins": 0.6286590695381165, "rewards/rejected": 1.1817059516906738, "step": 248 }, { "epoch": 0.18191780821917808, "grad_norm": 71.11274814840691, "learning_rate": 2.271897810218978e-07, "logits/chosen": -2.731490135192871, "logits/rejected": -2.0587401390075684, "logps/chosen": -698.4864501953125, "logps/rejected": -520.62060546875, "loss": 0.5014, "rewards/accuracies": 0.875, "rewards/chosen": 1.6827136278152466, "rewards/margins": 0.8127367496490479, "rewards/rejected": 0.8699768781661987, "step": 249 }, { "epoch": 0.182648401826484, "grad_norm": 67.42017487717419, "learning_rate": 2.2810218978102188e-07, "logits/chosen": -2.6607940196990967, "logits/rejected": -1.679173231124878, "logps/chosen": -757.7349853515625, "logps/rejected": -445.6333312988281, "loss": 0.4913, "rewards/accuracies": 0.875, "rewards/chosen": 1.6813418865203857, "rewards/margins": 0.9653757214546204, "rewards/rejected": 0.7159662246704102, "step": 250 }, { "epoch": 0.18337899543378997, "grad_norm": 90.66895485776816, "learning_rate": 2.29014598540146e-07, "logits/chosen": -2.6416468620300293, "logits/rejected": -2.2556021213531494, "logps/chosen": -362.8624267578125, "logps/rejected": -355.3620300292969, "loss": 0.6398, "rewards/accuracies": 0.75, "rewards/chosen": 0.8460849523544312, "rewards/margins": 0.42813241481781006, "rewards/rejected": 0.41795259714126587, "step": 251 }, { "epoch": 0.1841095890410959, "grad_norm": 65.48944705661555, "learning_rate": 2.2992700729927007e-07, "logits/chosen": -3.545546054840088, "logits/rejected": -2.374039649963379, "logps/chosen": -677.2420654296875, "logps/rejected": -409.25927734375, "loss": 0.489, "rewards/accuracies": 0.75, "rewards/chosen": 1.425431728363037, "rewards/margins": 0.759490430355072, "rewards/rejected": 0.6659412980079651, "step": 252 }, { "epoch": 0.18484018264840182, "grad_norm": 71.58714095422445, "learning_rate": 2.3083941605839415e-07, "logits/chosen": -2.9167368412017822, "logits/rejected": -2.1838998794555664, "logps/chosen": -670.89892578125, "logps/rejected": -504.4697265625, "loss": 0.5413, "rewards/accuracies": 0.625, "rewards/chosen": 1.194069266319275, "rewards/margins": 0.3850575387477875, "rewards/rejected": 0.8090118169784546, "step": 253 }, { "epoch": 0.18557077625570775, "grad_norm": 69.4878656785665, "learning_rate": 2.3175182481751824e-07, "logits/chosen": -2.6278374195098877, "logits/rejected": -2.0797338485717773, "logps/chosen": -552.5968627929688, "logps/rejected": -436.6729736328125, "loss": 0.5148, "rewards/accuracies": 0.5, "rewards/chosen": 0.9751812219619751, "rewards/margins": 0.253040611743927, "rewards/rejected": 0.7221406102180481, "step": 254 }, { "epoch": 0.1863013698630137, "grad_norm": 75.44650940401418, "learning_rate": 2.3266423357664234e-07, "logits/chosen": -3.032348871231079, "logits/rejected": -2.331078290939331, "logps/chosen": -788.3209228515625, "logps/rejected": -532.6770629882812, "loss": 0.5023, "rewards/accuracies": 0.75, "rewards/chosen": 1.5835951566696167, "rewards/margins": 0.6411755084991455, "rewards/rejected": 0.9424195289611816, "step": 255 }, { "epoch": 0.18703196347031964, "grad_norm": 75.25121988213878, "learning_rate": 2.335766423357664e-07, "logits/chosen": -2.849893093109131, "logits/rejected": -1.8536896705627441, "logps/chosen": -607.2590942382812, "logps/rejected": -361.17095947265625, "loss": 0.5511, "rewards/accuracies": 0.75, "rewards/chosen": 1.0613261461257935, "rewards/margins": 0.482583612203598, "rewards/rejected": 0.5787426829338074, "step": 256 }, { "epoch": 0.18776255707762557, "grad_norm": 66.20457611402327, "learning_rate": 2.344890510948905e-07, "logits/chosen": -2.7254745960235596, "logits/rejected": -1.8978488445281982, "logps/chosen": -676.7239379882812, "logps/rejected": -414.331298828125, "loss": 0.5079, "rewards/accuracies": 0.875, "rewards/chosen": 1.431764841079712, "rewards/margins": 0.841387152671814, "rewards/rejected": 0.5903776288032532, "step": 257 }, { "epoch": 0.1884931506849315, "grad_norm": 84.31448640172992, "learning_rate": 2.3540145985401457e-07, "logits/chosen": -3.160921096801758, "logits/rejected": -2.5481700897216797, "logps/chosen": -825.5301513671875, "logps/rejected": -773.747314453125, "loss": 0.5468, "rewards/accuracies": 0.5, "rewards/chosen": 1.6706790924072266, "rewards/margins": 0.1400892287492752, "rewards/rejected": 1.5305898189544678, "step": 258 }, { "epoch": 0.18922374429223746, "grad_norm": 81.31870292443492, "learning_rate": 2.3631386861313867e-07, "logits/chosen": -2.8434391021728516, "logits/rejected": -1.9728132486343384, "logps/chosen": -769.5335083007812, "logps/rejected": -596.4971313476562, "loss": 0.6011, "rewards/accuracies": 0.5, "rewards/chosen": 1.32240891456604, "rewards/margins": 0.6401655673980713, "rewards/rejected": 0.6822434663772583, "step": 259 }, { "epoch": 0.18995433789954339, "grad_norm": 82.71346967940741, "learning_rate": 2.3722627737226276e-07, "logits/chosen": -2.6084976196289062, "logits/rejected": -2.131620407104492, "logps/chosen": -652.7649536132812, "logps/rejected": -617.6133422851562, "loss": 0.57, "rewards/accuracies": 0.5, "rewards/chosen": 1.7151387929916382, "rewards/margins": 0.6609997153282166, "rewards/rejected": 1.0541391372680664, "step": 260 }, { "epoch": 0.19068493150684931, "grad_norm": 72.74345237395082, "learning_rate": 2.3813868613138684e-07, "logits/chosen": -2.73063325881958, "logits/rejected": -1.8138930797576904, "logps/chosen": -671.9598999023438, "logps/rejected": -415.46490478515625, "loss": 0.5265, "rewards/accuracies": 0.875, "rewards/chosen": 1.760467290878296, "rewards/margins": 1.2247323989868164, "rewards/rejected": 0.5357349514961243, "step": 261 }, { "epoch": 0.19141552511415524, "grad_norm": 70.8509624967703, "learning_rate": 2.3905109489051095e-07, "logits/chosen": -2.6628236770629883, "logits/rejected": -2.1475298404693604, "logps/chosen": -812.60009765625, "logps/rejected": -626.3504638671875, "loss": 0.473, "rewards/accuracies": 1.0, "rewards/chosen": 1.682906985282898, "rewards/margins": 0.7541605234146118, "rewards/rejected": 0.9287464618682861, "step": 262 }, { "epoch": 0.1921461187214612, "grad_norm": 64.95826108813066, "learning_rate": 2.3996350364963503e-07, "logits/chosen": -3.372051239013672, "logits/rejected": -2.3225340843200684, "logps/chosen": -689.1383056640625, "logps/rejected": -512.4481201171875, "loss": 0.4913, "rewards/accuracies": 1.0, "rewards/chosen": 1.8212381601333618, "rewards/margins": 0.9415586590766907, "rewards/rejected": 0.8796795606613159, "step": 263 }, { "epoch": 0.19287671232876713, "grad_norm": 74.4314452302419, "learning_rate": 2.408759124087591e-07, "logits/chosen": -2.576101541519165, "logits/rejected": -2.496417284011841, "logps/chosen": -627.0614013671875, "logps/rejected": -444.6681823730469, "loss": 0.5481, "rewards/accuracies": 0.5, "rewards/chosen": 1.0842218399047852, "rewards/margins": 0.3844057321548462, "rewards/rejected": 0.699816107749939, "step": 264 }, { "epoch": 0.19360730593607306, "grad_norm": 71.12419334450304, "learning_rate": 2.417883211678832e-07, "logits/chosen": -2.797236442565918, "logits/rejected": -2.3201403617858887, "logps/chosen": -802.8722534179688, "logps/rejected": -624.4637451171875, "loss": 0.4669, "rewards/accuracies": 0.75, "rewards/chosen": 1.678586483001709, "rewards/margins": 0.4873347878456116, "rewards/rejected": 1.1912517547607422, "step": 265 }, { "epoch": 0.194337899543379, "grad_norm": 74.79682114318095, "learning_rate": 2.427007299270073e-07, "logits/chosen": -2.508422613143921, "logits/rejected": -1.5769749879837036, "logps/chosen": -627.1580810546875, "logps/rejected": -457.1664733886719, "loss": 0.4924, "rewards/accuracies": 0.75, "rewards/chosen": 1.1734263896942139, "rewards/margins": 0.5834678411483765, "rewards/rejected": 0.5899585485458374, "step": 266 }, { "epoch": 0.19506849315068492, "grad_norm": 69.71139234484023, "learning_rate": 2.4361313868613136e-07, "logits/chosen": -2.421278715133667, "logits/rejected": -2.243502140045166, "logps/chosen": -690.1507568359375, "logps/rejected": -526.5842895507812, "loss": 0.5561, "rewards/accuracies": 0.625, "rewards/chosen": 1.2264071702957153, "rewards/margins": 0.3267163336277008, "rewards/rejected": 0.8996908664703369, "step": 267 }, { "epoch": 0.19579908675799088, "grad_norm": 65.17310183752957, "learning_rate": 2.445255474452555e-07, "logits/chosen": -2.6044814586639404, "logits/rejected": -2.402209758758545, "logps/chosen": -361.38494873046875, "logps/rejected": -299.82293701171875, "loss": 0.5295, "rewards/accuracies": 0.625, "rewards/chosen": 0.8860179781913757, "rewards/margins": 0.4233168959617615, "rewards/rejected": 0.46270111203193665, "step": 268 }, { "epoch": 0.1965296803652968, "grad_norm": 61.47164139107154, "learning_rate": 2.454379562043795e-07, "logits/chosen": -2.8216962814331055, "logits/rejected": -1.6983342170715332, "logps/chosen": -567.5643310546875, "logps/rejected": -369.4913330078125, "loss": 0.4243, "rewards/accuracies": 0.875, "rewards/chosen": 1.7489521503448486, "rewards/margins": 1.2412927150726318, "rewards/rejected": 0.5076595544815063, "step": 269 }, { "epoch": 0.19726027397260273, "grad_norm": 70.918224253361, "learning_rate": 2.4635036496350366e-07, "logits/chosen": -2.3781256675720215, "logits/rejected": -2.1991961002349854, "logps/chosen": -449.66314697265625, "logps/rejected": -514.5548095703125, "loss": 0.5419, "rewards/accuracies": 0.625, "rewards/chosen": 1.0114105939865112, "rewards/margins": 0.10136910527944565, "rewards/rejected": 0.9100414514541626, "step": 270 }, { "epoch": 0.19799086757990866, "grad_norm": 57.197956267439096, "learning_rate": 2.4726277372262774e-07, "logits/chosen": -3.1233935356140137, "logits/rejected": -2.1951496601104736, "logps/chosen": -710.635498046875, "logps/rejected": -439.10302734375, "loss": 0.3854, "rewards/accuracies": 0.875, "rewards/chosen": 1.5878593921661377, "rewards/margins": 0.8460583686828613, "rewards/rejected": 0.7418009638786316, "step": 271 }, { "epoch": 0.19872146118721462, "grad_norm": 75.03315673337354, "learning_rate": 2.481751824817518e-07, "logits/chosen": -2.8225622177124023, "logits/rejected": -2.5081703662872314, "logps/chosen": -317.598876953125, "logps/rejected": -298.5233154296875, "loss": 0.6039, "rewards/accuracies": 0.625, "rewards/chosen": 0.7518444657325745, "rewards/margins": 0.26443809270858765, "rewards/rejected": 0.48740631341934204, "step": 272 }, { "epoch": 0.19945205479452055, "grad_norm": 70.33086565699658, "learning_rate": 2.490875912408759e-07, "logits/chosen": -3.1558618545532227, "logits/rejected": -2.77293062210083, "logps/chosen": -795.8041381835938, "logps/rejected": -569.3260498046875, "loss": 0.519, "rewards/accuracies": 0.75, "rewards/chosen": 1.583844542503357, "rewards/margins": 0.2978409230709076, "rewards/rejected": 1.286003589630127, "step": 273 }, { "epoch": 0.20018264840182648, "grad_norm": 58.50364530242432, "learning_rate": 2.5e-07, "logits/chosen": -2.4873745441436768, "logits/rejected": -1.551337480545044, "logps/chosen": -488.2410888671875, "logps/rejected": -315.169677734375, "loss": 0.4469, "rewards/accuracies": 0.875, "rewards/chosen": 1.5176268815994263, "rewards/margins": 1.040231704711914, "rewards/rejected": 0.4773952066898346, "step": 274 }, { "epoch": 0.2009132420091324, "grad_norm": 69.69044626619383, "learning_rate": 2.5091240875912407e-07, "logits/chosen": -3.580242156982422, "logits/rejected": -2.3127641677856445, "logps/chosen": -411.23516845703125, "logps/rejected": -311.59051513671875, "loss": 0.5264, "rewards/accuracies": 1.0, "rewards/chosen": 1.195625901222229, "rewards/margins": 0.8409671783447266, "rewards/rejected": 0.35465872287750244, "step": 275 }, { "epoch": 0.20164383561643837, "grad_norm": 78.03311784408432, "learning_rate": 2.518248175182482e-07, "logits/chosen": -2.5946433544158936, "logits/rejected": -1.9361159801483154, "logps/chosen": -632.93359375, "logps/rejected": -521.028564453125, "loss": 0.5812, "rewards/accuracies": 0.625, "rewards/chosen": 1.01936936378479, "rewards/margins": 0.09748248755931854, "rewards/rejected": 0.9218868017196655, "step": 276 }, { "epoch": 0.2023744292237443, "grad_norm": 80.96712175128556, "learning_rate": 2.5273722627737224e-07, "logits/chosen": -3.3311731815338135, "logits/rejected": -2.6428074836730957, "logps/chosen": -877.5778198242188, "logps/rejected": -623.4839477539062, "loss": 0.5581, "rewards/accuracies": 0.75, "rewards/chosen": 1.7416778802871704, "rewards/margins": 0.6135801076889038, "rewards/rejected": 1.1280977725982666, "step": 277 }, { "epoch": 0.20310502283105022, "grad_norm": 75.7725703792276, "learning_rate": 2.536496350364963e-07, "logits/chosen": -2.768047571182251, "logits/rejected": -2.195481777191162, "logps/chosen": -435.61767578125, "logps/rejected": -323.7370910644531, "loss": 0.5971, "rewards/accuracies": 0.875, "rewards/chosen": 1.1338032484054565, "rewards/margins": 0.46554189920425415, "rewards/rejected": 0.6682612895965576, "step": 278 }, { "epoch": 0.20383561643835615, "grad_norm": 73.11318681289649, "learning_rate": 2.545620437956204e-07, "logits/chosen": -2.969489336013794, "logits/rejected": -2.4015932083129883, "logps/chosen": -573.2581176757812, "logps/rejected": -430.1927185058594, "loss": 0.4536, "rewards/accuracies": 0.75, "rewards/chosen": 1.4678131341934204, "rewards/margins": 0.9855450391769409, "rewards/rejected": 0.48226815462112427, "step": 279 }, { "epoch": 0.2045662100456621, "grad_norm": 90.31439498444448, "learning_rate": 2.5547445255474454e-07, "logits/chosen": -2.5380241870880127, "logits/rejected": -1.9781020879745483, "logps/chosen": -593.6395874023438, "logps/rejected": -599.6492309570312, "loss": 0.5786, "rewards/accuracies": 0.625, "rewards/chosen": 1.518062949180603, "rewards/margins": 0.4034231901168823, "rewards/rejected": 1.1146397590637207, "step": 280 }, { "epoch": 0.20529680365296804, "grad_norm": 71.39440889238982, "learning_rate": 2.563868613138686e-07, "logits/chosen": -2.938767194747925, "logits/rejected": -2.407992124557495, "logps/chosen": -492.75311279296875, "logps/rejected": -335.11785888671875, "loss": 0.5628, "rewards/accuracies": 0.875, "rewards/chosen": 1.2950342893600464, "rewards/margins": 0.5722995400428772, "rewards/rejected": 0.7227347493171692, "step": 281 }, { "epoch": 0.20602739726027397, "grad_norm": 64.556071194931, "learning_rate": 2.572992700729927e-07, "logits/chosen": -3.274653911590576, "logits/rejected": -2.65183424949646, "logps/chosen": -607.8264770507812, "logps/rejected": -413.3287658691406, "loss": 0.5042, "rewards/accuracies": 0.75, "rewards/chosen": 1.569000005722046, "rewards/margins": 0.6596295833587646, "rewards/rejected": 0.9093705415725708, "step": 282 }, { "epoch": 0.2067579908675799, "grad_norm": 74.30648104304227, "learning_rate": 2.5821167883211673e-07, "logits/chosen": -3.2087037563323975, "logits/rejected": -2.6493022441864014, "logps/chosen": -731.0548706054688, "logps/rejected": -610.5682373046875, "loss": 0.5581, "rewards/accuracies": 0.625, "rewards/chosen": 1.539851427078247, "rewards/margins": 0.21267220377922058, "rewards/rejected": 1.327179193496704, "step": 283 }, { "epoch": 0.20748858447488586, "grad_norm": 63.98868842298418, "learning_rate": 2.5912408759124086e-07, "logits/chosen": -2.581681966781616, "logits/rejected": -2.0083556175231934, "logps/chosen": -664.96533203125, "logps/rejected": -510.489013671875, "loss": 0.4842, "rewards/accuracies": 0.75, "rewards/chosen": 1.4497509002685547, "rewards/margins": 0.6775259971618652, "rewards/rejected": 0.7722249031066895, "step": 284 }, { "epoch": 0.20821917808219179, "grad_norm": 68.0516053895406, "learning_rate": 2.6003649635036495e-07, "logits/chosen": -3.181183099746704, "logits/rejected": -2.2763900756835938, "logps/chosen": -563.7217407226562, "logps/rejected": -378.4830322265625, "loss": 0.5208, "rewards/accuracies": 0.625, "rewards/chosen": 1.1445131301879883, "rewards/margins": 0.39902645349502563, "rewards/rejected": 0.7454866766929626, "step": 285 }, { "epoch": 0.20894977168949772, "grad_norm": 75.88425023411479, "learning_rate": 2.6094890510948903e-07, "logits/chosen": -2.6437151432037354, "logits/rejected": -2.5975663661956787, "logps/chosen": -754.3407592773438, "logps/rejected": -646.8643798828125, "loss": 0.5421, "rewards/accuracies": 0.625, "rewards/chosen": 1.80771803855896, "rewards/margins": 0.27560359239578247, "rewards/rejected": 1.5321143865585327, "step": 286 }, { "epoch": 0.20968036529680364, "grad_norm": 68.70750817550801, "learning_rate": 2.6186131386861316e-07, "logits/chosen": -2.857146739959717, "logits/rejected": -2.6952714920043945, "logps/chosen": -902.001708984375, "logps/rejected": -856.3089599609375, "loss": 0.5115, "rewards/accuracies": 0.5, "rewards/chosen": 1.8099883794784546, "rewards/margins": 0.3188181221485138, "rewards/rejected": 1.4911702871322632, "step": 287 }, { "epoch": 0.2104109589041096, "grad_norm": 93.07170769593465, "learning_rate": 2.6277372262773725e-07, "logits/chosen": -3.1088149547576904, "logits/rejected": -3.0393388271331787, "logps/chosen": -673.1768798828125, "logps/rejected": -592.7464599609375, "loss": 0.671, "rewards/accuracies": 0.625, "rewards/chosen": 1.4981074333190918, "rewards/margins": 0.15454745292663574, "rewards/rejected": 1.3435600996017456, "step": 288 }, { "epoch": 0.21114155251141553, "grad_norm": 75.60710015435059, "learning_rate": 2.636861313868613e-07, "logits/chosen": -2.5323379039764404, "logits/rejected": -2.4044008255004883, "logps/chosen": -603.7645874023438, "logps/rejected": -605.3165893554688, "loss": 0.537, "rewards/accuracies": 0.75, "rewards/chosen": 1.4053325653076172, "rewards/margins": 0.4870976209640503, "rewards/rejected": 0.9182349443435669, "step": 289 }, { "epoch": 0.21187214611872146, "grad_norm": 65.9216768632955, "learning_rate": 2.6459854014598536e-07, "logits/chosen": -2.5624661445617676, "logits/rejected": -2.373194932937622, "logps/chosen": -561.5380859375, "logps/rejected": -479.85675048828125, "loss": 0.4947, "rewards/accuracies": 0.625, "rewards/chosen": 1.3597897291183472, "rewards/margins": 0.4487088620662689, "rewards/rejected": 0.9110808372497559, "step": 290 }, { "epoch": 0.2126027397260274, "grad_norm": 72.15017297926249, "learning_rate": 2.655109489051095e-07, "logits/chosen": -2.787193536758423, "logits/rejected": -1.8465055227279663, "logps/chosen": -833.089599609375, "logps/rejected": -489.57977294921875, "loss": 0.571, "rewards/accuracies": 0.75, "rewards/chosen": 1.8434723615646362, "rewards/margins": 0.9896187782287598, "rewards/rejected": 0.8538534641265869, "step": 291 }, { "epoch": 0.21333333333333335, "grad_norm": 62.07848484982539, "learning_rate": 2.664233576642336e-07, "logits/chosen": -3.17891788482666, "logits/rejected": -2.618992567062378, "logps/chosen": -630.57958984375, "logps/rejected": -404.7199401855469, "loss": 0.5175, "rewards/accuracies": 0.625, "rewards/chosen": 1.615642786026001, "rewards/margins": 0.8529044389724731, "rewards/rejected": 0.7627382874488831, "step": 292 }, { "epoch": 0.21406392694063928, "grad_norm": 61.363309970591345, "learning_rate": 2.6733576642335766e-07, "logits/chosen": -3.0196666717529297, "logits/rejected": -1.8327243328094482, "logps/chosen": -822.4632568359375, "logps/rejected": -424.9331359863281, "loss": 0.4949, "rewards/accuracies": 1.0, "rewards/chosen": 1.6666474342346191, "rewards/margins": 0.8891336917877197, "rewards/rejected": 0.7775137424468994, "step": 293 }, { "epoch": 0.2147945205479452, "grad_norm": 703.6931593386424, "learning_rate": 2.6824817518248174e-07, "logits/chosen": -2.585696220397949, "logits/rejected": -3.058145523071289, "logps/chosen": -382.94647216796875, "logps/rejected": -604.6951904296875, "loss": 0.6626, "rewards/accuracies": 0.5, "rewards/chosen": 0.07187340408563614, "rewards/margins": -0.5408883690834045, "rewards/rejected": 0.6127617955207825, "step": 294 }, { "epoch": 0.21552511415525114, "grad_norm": 83.06950685269535, "learning_rate": 2.691605839416058e-07, "logits/chosen": -2.9846179485321045, "logits/rejected": -1.9302775859832764, "logps/chosen": -1179.01806640625, "logps/rejected": -685.2986450195312, "loss": 0.5482, "rewards/accuracies": 0.75, "rewards/chosen": 2.4703643321990967, "rewards/margins": 1.1361846923828125, "rewards/rejected": 1.3341797590255737, "step": 295 }, { "epoch": 0.21625570776255706, "grad_norm": 68.68842768368083, "learning_rate": 2.700729927007299e-07, "logits/chosen": -2.320950984954834, "logits/rejected": -2.083979368209839, "logps/chosen": -608.9562377929688, "logps/rejected": -513.3939208984375, "loss": 0.5415, "rewards/accuracies": 0.75, "rewards/chosen": 1.1996068954467773, "rewards/margins": 0.5723210573196411, "rewards/rejected": 0.6272858381271362, "step": 296 }, { "epoch": 0.21698630136986302, "grad_norm": 64.29397586112754, "learning_rate": 2.70985401459854e-07, "logits/chosen": -2.5138614177703857, "logits/rejected": -1.7940773963928223, "logps/chosen": -757.4259643554688, "logps/rejected": -524.786865234375, "loss": 0.4443, "rewards/accuracies": 0.75, "rewards/chosen": 1.888533353805542, "rewards/margins": 0.8999877572059631, "rewards/rejected": 0.9885456562042236, "step": 297 }, { "epoch": 0.21771689497716895, "grad_norm": 71.3332983474408, "learning_rate": 2.7189781021897807e-07, "logits/chosen": -2.654924154281616, "logits/rejected": -1.9925119876861572, "logps/chosen": -457.6864929199219, "logps/rejected": -301.63555908203125, "loss": 0.542, "rewards/accuracies": 0.875, "rewards/chosen": 1.0791821479797363, "rewards/margins": 0.7918640375137329, "rewards/rejected": 0.2873181402683258, "step": 298 }, { "epoch": 0.21844748858447488, "grad_norm": 74.56006714911977, "learning_rate": 2.728102189781022e-07, "logits/chosen": -2.292485475540161, "logits/rejected": -2.5300114154815674, "logps/chosen": -663.7569580078125, "logps/rejected": -795.47119140625, "loss": 0.5531, "rewards/accuracies": 0.25, "rewards/chosen": 1.1360666751861572, "rewards/margins": -0.3662509322166443, "rewards/rejected": 1.5023175477981567, "step": 299 }, { "epoch": 0.2191780821917808, "grad_norm": 83.18953967728605, "learning_rate": 2.737226277372263e-07, "logits/chosen": -2.7812142372131348, "logits/rejected": -2.501270055770874, "logps/chosen": -996.5247802734375, "logps/rejected": -802.920654296875, "loss": 0.5509, "rewards/accuracies": 0.5, "rewards/chosen": 1.7781188488006592, "rewards/margins": 0.10958874970674515, "rewards/rejected": 1.6685301065444946, "step": 300 }, { "epoch": 0.21990867579908677, "grad_norm": 83.42953789257585, "learning_rate": 2.746350364963503e-07, "logits/chosen": -2.105761766433716, "logits/rejected": -2.285074234008789, "logps/chosen": -671.2239379882812, "logps/rejected": -720.477783203125, "loss": 0.6039, "rewards/accuracies": 0.75, "rewards/chosen": 1.720473289489746, "rewards/margins": 0.04338139295578003, "rewards/rejected": 1.6770918369293213, "step": 301 }, { "epoch": 0.2206392694063927, "grad_norm": 81.40273804804572, "learning_rate": 2.7554744525547445e-07, "logits/chosen": -2.779853582382202, "logits/rejected": -2.811357259750366, "logps/chosen": -458.46759033203125, "logps/rejected": -467.6694030761719, "loss": 0.632, "rewards/accuracies": 0.625, "rewards/chosen": 1.0104929208755493, "rewards/margins": 0.16204509139060974, "rewards/rejected": 0.8484477996826172, "step": 302 }, { "epoch": 0.22136986301369863, "grad_norm": 76.9567527698187, "learning_rate": 2.7645985401459854e-07, "logits/chosen": -2.5465197563171387, "logits/rejected": -2.22859525680542, "logps/chosen": -502.5874328613281, "logps/rejected": -459.20849609375, "loss": 0.5863, "rewards/accuracies": 0.625, "rewards/chosen": 1.342724084854126, "rewards/margins": 0.48133721947669983, "rewards/rejected": 0.861386775970459, "step": 303 }, { "epoch": 0.22210045662100455, "grad_norm": 66.36089627700564, "learning_rate": 2.773722627737226e-07, "logits/chosen": -3.0416622161865234, "logits/rejected": -2.606421947479248, "logps/chosen": -681.1142578125, "logps/rejected": -589.0733032226562, "loss": 0.4428, "rewards/accuracies": 0.875, "rewards/chosen": 1.7655627727508545, "rewards/margins": 0.5711708068847656, "rewards/rejected": 1.1943920850753784, "step": 304 }, { "epoch": 0.2228310502283105, "grad_norm": 63.20954849929222, "learning_rate": 2.782846715328467e-07, "logits/chosen": -3.2883505821228027, "logits/rejected": -3.090327501296997, "logps/chosen": -538.9828491210938, "logps/rejected": -580.092529296875, "loss": 0.3992, "rewards/accuracies": 0.75, "rewards/chosen": 1.2429234981536865, "rewards/margins": 0.4306631088256836, "rewards/rejected": 0.8122604489326477, "step": 305 }, { "epoch": 0.22356164383561644, "grad_norm": 68.03964334420229, "learning_rate": 2.7919708029197084e-07, "logits/chosen": -2.575604200363159, "logits/rejected": -1.42901611328125, "logps/chosen": -738.0060424804688, "logps/rejected": -332.5115661621094, "loss": 0.4879, "rewards/accuracies": 0.875, "rewards/chosen": 1.4182416200637817, "rewards/margins": 0.8347818851470947, "rewards/rejected": 0.583459734916687, "step": 306 }, { "epoch": 0.22429223744292237, "grad_norm": 56.6647879494786, "learning_rate": 2.8010948905109486e-07, "logits/chosen": -3.076162815093994, "logits/rejected": -1.6696021556854248, "logps/chosen": -378.5327453613281, "logps/rejected": -199.64450073242188, "loss": 0.4469, "rewards/accuracies": 0.875, "rewards/chosen": 1.1390808820724487, "rewards/margins": 0.9515993595123291, "rewards/rejected": 0.1874815970659256, "step": 307 }, { "epoch": 0.2250228310502283, "grad_norm": 81.39408498356929, "learning_rate": 2.8102189781021895e-07, "logits/chosen": -3.0644350051879883, "logits/rejected": -2.173508882522583, "logps/chosen": -572.83154296875, "logps/rejected": -560.043212890625, "loss": 0.5365, "rewards/accuracies": 0.5, "rewards/chosen": 1.6795510053634644, "rewards/margins": 0.7518103718757629, "rewards/rejected": 0.9277405738830566, "step": 308 }, { "epoch": 0.22575342465753426, "grad_norm": 61.07190306311078, "learning_rate": 2.8193430656934303e-07, "logits/chosen": -2.884842872619629, "logits/rejected": -2.6308047771453857, "logps/chosen": -623.7673950195312, "logps/rejected": -626.447509765625, "loss": 0.4448, "rewards/accuracies": 0.625, "rewards/chosen": 1.3568463325500488, "rewards/margins": 0.48561403155326843, "rewards/rejected": 0.871232271194458, "step": 309 }, { "epoch": 0.2264840182648402, "grad_norm": 88.18502776035028, "learning_rate": 2.8284671532846716e-07, "logits/chosen": -2.7186532020568848, "logits/rejected": -2.905952215194702, "logps/chosen": -639.3110961914062, "logps/rejected": -821.43212890625, "loss": 0.5819, "rewards/accuracies": 0.5, "rewards/chosen": 1.4340736865997314, "rewards/margins": -0.010940566658973694, "rewards/rejected": 1.445014238357544, "step": 310 }, { "epoch": 0.22721461187214612, "grad_norm": 81.45678014290769, "learning_rate": 2.8375912408759125e-07, "logits/chosen": -2.943577527999878, "logits/rejected": -2.091942548751831, "logps/chosen": -987.73095703125, "logps/rejected": -640.8531494140625, "loss": 0.5572, "rewards/accuracies": 0.75, "rewards/chosen": 1.6930636167526245, "rewards/margins": 0.9007493257522583, "rewards/rejected": 0.7923142910003662, "step": 311 }, { "epoch": 0.22794520547945205, "grad_norm": 73.66121509570013, "learning_rate": 2.846715328467153e-07, "logits/chosen": -2.9156458377838135, "logits/rejected": -2.316946029663086, "logps/chosen": -638.7382202148438, "logps/rejected": -568.3553466796875, "loss": 0.4552, "rewards/accuracies": 0.5, "rewards/chosen": 1.3789916038513184, "rewards/margins": 0.3415542244911194, "rewards/rejected": 1.0374374389648438, "step": 312 }, { "epoch": 0.228675799086758, "grad_norm": 52.75468127554535, "learning_rate": 2.855839416058394e-07, "logits/chosen": -2.56632137298584, "logits/rejected": -1.5568091869354248, "logps/chosen": -552.75927734375, "logps/rejected": -339.1873474121094, "loss": 0.4383, "rewards/accuracies": 0.875, "rewards/chosen": 1.3367711305618286, "rewards/margins": 0.9224081039428711, "rewards/rejected": 0.4143630862236023, "step": 313 }, { "epoch": 0.22940639269406393, "grad_norm": 65.58462671784257, "learning_rate": 2.864963503649635e-07, "logits/chosen": -2.7817111015319824, "logits/rejected": -2.163630485534668, "logps/chosen": -862.2318115234375, "logps/rejected": -627.0550537109375, "loss": 0.4215, "rewards/accuracies": 0.75, "rewards/chosen": 2.15885591506958, "rewards/margins": 0.9382625818252563, "rewards/rejected": 1.2205933332443237, "step": 314 }, { "epoch": 0.23013698630136986, "grad_norm": 69.44001387776179, "learning_rate": 2.874087591240876e-07, "logits/chosen": -3.018465280532837, "logits/rejected": -2.2083637714385986, "logps/chosen": -706.13427734375, "logps/rejected": -536.0084838867188, "loss": 0.4516, "rewards/accuracies": 0.875, "rewards/chosen": 1.80712890625, "rewards/margins": 0.6629295945167542, "rewards/rejected": 1.1441993713378906, "step": 315 }, { "epoch": 0.2308675799086758, "grad_norm": 73.19604588235966, "learning_rate": 2.8832116788321166e-07, "logits/chosen": -2.458184242248535, "logits/rejected": -1.5168063640594482, "logps/chosen": -527.1397705078125, "logps/rejected": -329.56390380859375, "loss": 0.5474, "rewards/accuracies": 0.75, "rewards/chosen": 1.5586271286010742, "rewards/margins": 1.0915842056274414, "rewards/rejected": 0.46704280376434326, "step": 316 }, { "epoch": 0.23159817351598175, "grad_norm": 76.26443895051939, "learning_rate": 2.892335766423358e-07, "logits/chosen": -3.0624771118164062, "logits/rejected": -1.6362411975860596, "logps/chosen": -858.3270874023438, "logps/rejected": -428.4784851074219, "loss": 0.5048, "rewards/accuracies": 0.75, "rewards/chosen": 2.061056613922119, "rewards/margins": 1.3137731552124023, "rewards/rejected": 0.747283399105072, "step": 317 }, { "epoch": 0.23232876712328768, "grad_norm": 71.3742953664868, "learning_rate": 2.901459854014598e-07, "logits/chosen": -2.893404722213745, "logits/rejected": -2.7442734241485596, "logps/chosen": -599.0897216796875, "logps/rejected": -651.11328125, "loss": 0.5384, "rewards/accuracies": 0.625, "rewards/chosen": 1.102839469909668, "rewards/margins": 0.022508589550852776, "rewards/rejected": 1.0803308486938477, "step": 318 }, { "epoch": 0.2330593607305936, "grad_norm": 68.42649026738475, "learning_rate": 2.910583941605839e-07, "logits/chosen": -2.777451992034912, "logits/rejected": -2.497089385986328, "logps/chosen": -986.4182739257812, "logps/rejected": -839.2782592773438, "loss": 0.4563, "rewards/accuracies": 0.75, "rewards/chosen": 2.0583131313323975, "rewards/margins": 0.927151083946228, "rewards/rejected": 1.1311620473861694, "step": 319 }, { "epoch": 0.23378995433789954, "grad_norm": 67.62113335179505, "learning_rate": 2.91970802919708e-07, "logits/chosen": -2.914950370788574, "logits/rejected": -2.0341737270355225, "logps/chosen": -794.3770751953125, "logps/rejected": -513.2255859375, "loss": 0.4949, "rewards/accuracies": 0.625, "rewards/chosen": 1.6529265642166138, "rewards/margins": 0.9931697845458984, "rewards/rejected": 0.6597567200660706, "step": 320 }, { "epoch": 0.2345205479452055, "grad_norm": 57.14501922876787, "learning_rate": 2.928832116788321e-07, "logits/chosen": -2.4274556636810303, "logits/rejected": -1.716905117034912, "logps/chosen": -328.4242248535156, "logps/rejected": -230.2021026611328, "loss": 0.4722, "rewards/accuracies": 0.75, "rewards/chosen": 0.9612551331520081, "rewards/margins": 0.6358004808425903, "rewards/rejected": 0.32545462250709534, "step": 321 }, { "epoch": 0.23525114155251142, "grad_norm": 54.31955911044497, "learning_rate": 2.937956204379562e-07, "logits/chosen": -2.24282169342041, "logits/rejected": -1.911682367324829, "logps/chosen": -671.62158203125, "logps/rejected": -588.2261352539062, "loss": 0.393, "rewards/accuracies": 0.75, "rewards/chosen": 1.3457837104797363, "rewards/margins": 0.48278775811195374, "rewards/rejected": 0.8629959225654602, "step": 322 }, { "epoch": 0.23598173515981735, "grad_norm": 67.71434067901532, "learning_rate": 2.947080291970803e-07, "logits/chosen": -2.672564744949341, "logits/rejected": -2.3989877700805664, "logps/chosen": -583.5216064453125, "logps/rejected": -479.5408935546875, "loss": 0.498, "rewards/accuracies": 0.625, "rewards/chosen": 1.2355103492736816, "rewards/margins": 0.3166622519493103, "rewards/rejected": 0.9188480973243713, "step": 323 }, { "epoch": 0.23671232876712328, "grad_norm": 82.47412887441583, "learning_rate": 2.9562043795620437e-07, "logits/chosen": -3.197758436203003, "logits/rejected": -2.6337485313415527, "logps/chosen": -810.5760498046875, "logps/rejected": -575.293701171875, "loss": 0.5788, "rewards/accuracies": 0.75, "rewards/chosen": 1.5133363008499146, "rewards/margins": 0.6640822291374207, "rewards/rejected": 0.8492540121078491, "step": 324 }, { "epoch": 0.2374429223744292, "grad_norm": 64.95626157630032, "learning_rate": 2.9653284671532845e-07, "logits/chosen": -1.991675615310669, "logits/rejected": -2.4783923625946045, "logps/chosen": -383.2439270019531, "logps/rejected": -552.3189086914062, "loss": 0.5042, "rewards/accuracies": 0.75, "rewards/chosen": 0.9465519785881042, "rewards/margins": 0.356853187084198, "rewards/rejected": 0.5896987915039062, "step": 325 }, { "epoch": 0.23817351598173517, "grad_norm": 66.8768581809601, "learning_rate": 2.9744525547445253e-07, "logits/chosen": -2.89406418800354, "logits/rejected": -2.0779056549072266, "logps/chosen": -774.2554321289062, "logps/rejected": -491.4771728515625, "loss": 0.545, "rewards/accuracies": 0.75, "rewards/chosen": 1.7986063957214355, "rewards/margins": 0.6239631175994873, "rewards/rejected": 1.1746432781219482, "step": 326 }, { "epoch": 0.2389041095890411, "grad_norm": 68.78576484500627, "learning_rate": 2.983576642335766e-07, "logits/chosen": -2.3379552364349365, "logits/rejected": -2.421232223510742, "logps/chosen": -638.7080688476562, "logps/rejected": -746.5928955078125, "loss": 0.5181, "rewards/accuracies": 0.625, "rewards/chosen": 0.9805386066436768, "rewards/margins": 0.0440153107047081, "rewards/rejected": 0.9365232586860657, "step": 327 }, { "epoch": 0.23963470319634703, "grad_norm": 79.47703825240973, "learning_rate": 2.9927007299270075e-07, "logits/chosen": -2.6897315979003906, "logits/rejected": -2.719919204711914, "logps/chosen": -691.8466186523438, "logps/rejected": -738.3701171875, "loss": 0.5644, "rewards/accuracies": 0.375, "rewards/chosen": 1.1148539781570435, "rewards/margins": -0.19309282302856445, "rewards/rejected": 1.307946801185608, "step": 328 }, { "epoch": 0.24036529680365296, "grad_norm": 61.59197026221235, "learning_rate": 3.0018248175182483e-07, "logits/chosen": -2.8793036937713623, "logits/rejected": -1.9300910234451294, "logps/chosen": -532.0831298828125, "logps/rejected": -225.79766845703125, "loss": 0.4919, "rewards/accuracies": 0.875, "rewards/chosen": 1.2198001146316528, "rewards/margins": 0.8684647083282471, "rewards/rejected": 0.35133546590805054, "step": 329 }, { "epoch": 0.2410958904109589, "grad_norm": 73.93149014036577, "learning_rate": 3.0109489051094886e-07, "logits/chosen": -2.072329521179199, "logits/rejected": -1.7403192520141602, "logps/chosen": -729.366943359375, "logps/rejected": -558.7517700195312, "loss": 0.5438, "rewards/accuracies": 0.75, "rewards/chosen": 1.8929362297058105, "rewards/margins": 0.9909258484840393, "rewards/rejected": 0.9020103812217712, "step": 330 }, { "epoch": 0.24182648401826484, "grad_norm": 63.754901772334335, "learning_rate": 3.0200729927007295e-07, "logits/chosen": -2.5517663955688477, "logits/rejected": -2.108123302459717, "logps/chosen": -616.21728515625, "logps/rejected": -604.2529907226562, "loss": 0.5158, "rewards/accuracies": 0.625, "rewards/chosen": 1.309702754020691, "rewards/margins": 0.5201064944267273, "rewards/rejected": 0.7895961999893188, "step": 331 }, { "epoch": 0.24255707762557077, "grad_norm": 92.0438658585541, "learning_rate": 3.029197080291971e-07, "logits/chosen": -2.810366630554199, "logits/rejected": -2.554002046585083, "logps/chosen": -879.067138671875, "logps/rejected": -724.47412109375, "loss": 0.6226, "rewards/accuracies": 0.625, "rewards/chosen": 1.8611865043640137, "rewards/margins": 0.43602174520492554, "rewards/rejected": 1.4251649379730225, "step": 332 }, { "epoch": 0.2432876712328767, "grad_norm": 78.40385857253774, "learning_rate": 3.0383211678832116e-07, "logits/chosen": -2.85609769821167, "logits/rejected": -2.3262336254119873, "logps/chosen": -766.6273193359375, "logps/rejected": -765.9754638671875, "loss": 0.5478, "rewards/accuracies": 0.75, "rewards/chosen": 1.8558933734893799, "rewards/margins": 0.3908237814903259, "rewards/rejected": 1.4650695323944092, "step": 333 }, { "epoch": 0.24401826484018266, "grad_norm": 79.89075487716805, "learning_rate": 3.0474452554744525e-07, "logits/chosen": -2.4776248931884766, "logits/rejected": -2.4462451934814453, "logps/chosen": -475.6197204589844, "logps/rejected": -611.71435546875, "loss": 0.5887, "rewards/accuracies": 0.625, "rewards/chosen": 1.263580560684204, "rewards/margins": 0.09436751902103424, "rewards/rejected": 1.169213056564331, "step": 334 }, { "epoch": 0.2447488584474886, "grad_norm": 54.08097588917012, "learning_rate": 3.0565693430656933e-07, "logits/chosen": -2.864834785461426, "logits/rejected": -2.1287894248962402, "logps/chosen": -679.98388671875, "logps/rejected": -454.6954345703125, "loss": 0.3564, "rewards/accuracies": 0.75, "rewards/chosen": 1.7911301851272583, "rewards/margins": 1.1081721782684326, "rewards/rejected": 0.6829579472541809, "step": 335 }, { "epoch": 0.24547945205479452, "grad_norm": 67.23373772356591, "learning_rate": 3.065693430656934e-07, "logits/chosen": -2.7003049850463867, "logits/rejected": -2.3056640625, "logps/chosen": -493.521484375, "logps/rejected": -506.75091552734375, "loss": 0.4341, "rewards/accuracies": 1.0, "rewards/chosen": 1.5630629062652588, "rewards/margins": 1.0774035453796387, "rewards/rejected": 0.48565933108329773, "step": 336 }, { "epoch": 0.24621004566210045, "grad_norm": 75.20001324592636, "learning_rate": 3.074817518248175e-07, "logits/chosen": -2.77557373046875, "logits/rejected": -2.0373833179473877, "logps/chosen": -692.891357421875, "logps/rejected": -551.462646484375, "loss": 0.4516, "rewards/accuracies": 0.75, "rewards/chosen": 2.1470892429351807, "rewards/margins": 1.2375848293304443, "rewards/rejected": 0.9095042943954468, "step": 337 }, { "epoch": 0.2469406392694064, "grad_norm": 75.34194509970027, "learning_rate": 3.083941605839416e-07, "logits/chosen": -2.3275065422058105, "logits/rejected": -2.3406496047973633, "logps/chosen": -614.1890258789062, "logps/rejected": -645.3617553710938, "loss": 0.5517, "rewards/accuracies": 0.375, "rewards/chosen": 1.2377128601074219, "rewards/margins": 0.2948717772960663, "rewards/rejected": 0.9428409934043884, "step": 338 }, { "epoch": 0.24767123287671233, "grad_norm": 81.79183216308185, "learning_rate": 3.093065693430657e-07, "logits/chosen": -1.9840370416641235, "logits/rejected": -2.1103222370147705, "logps/chosen": -681.4443359375, "logps/rejected": -677.425537109375, "loss": 0.5302, "rewards/accuracies": 0.625, "rewards/chosen": 1.4534173011779785, "rewards/margins": 0.5499621033668518, "rewards/rejected": 0.9034552574157715, "step": 339 }, { "epoch": 0.24840182648401826, "grad_norm": 73.79083879876282, "learning_rate": 3.102189781021898e-07, "logits/chosen": -2.35575795173645, "logits/rejected": -2.146472930908203, "logps/chosen": -673.0758666992188, "logps/rejected": -547.0565185546875, "loss": 0.4985, "rewards/accuracies": 0.625, "rewards/chosen": 1.2743754386901855, "rewards/margins": 0.47497838735580444, "rewards/rejected": 0.7993971109390259, "step": 340 }, { "epoch": 0.2491324200913242, "grad_norm": 71.64014619523749, "learning_rate": 3.111313868613139e-07, "logits/chosen": -3.2311668395996094, "logits/rejected": -2.833406925201416, "logps/chosen": -737.0320434570312, "logps/rejected": -527.4598999023438, "loss": 0.5246, "rewards/accuracies": 0.875, "rewards/chosen": 1.4626190662384033, "rewards/margins": 0.5879504680633545, "rewards/rejected": 0.874668538570404, "step": 341 }, { "epoch": 0.24986301369863015, "grad_norm": 61.630587744261575, "learning_rate": 3.120437956204379e-07, "logits/chosen": -3.1106655597686768, "logits/rejected": -1.5708032846450806, "logps/chosen": -589.257568359375, "logps/rejected": -288.2099609375, "loss": 0.3634, "rewards/accuracies": 0.875, "rewards/chosen": 1.8525669574737549, "rewards/margins": 1.5388537645339966, "rewards/rejected": 0.3137131333351135, "step": 342 }, { "epoch": 0.25059360730593605, "grad_norm": 70.60238506837959, "learning_rate": 3.1295620437956204e-07, "logits/chosen": -3.0517640113830566, "logits/rejected": -2.560793399810791, "logps/chosen": -441.9164123535156, "logps/rejected": -509.4173889160156, "loss": 0.4509, "rewards/accuracies": 0.75, "rewards/chosen": 1.32505202293396, "rewards/margins": 0.7069000601768494, "rewards/rejected": 0.6181519031524658, "step": 343 }, { "epoch": 0.25132420091324204, "grad_norm": 57.54991464616824, "learning_rate": 3.138686131386861e-07, "logits/chosen": -2.97263765335083, "logits/rejected": -1.884211540222168, "logps/chosen": -598.3499755859375, "logps/rejected": -219.9856719970703, "loss": 0.4069, "rewards/accuracies": 0.75, "rewards/chosen": 1.6223890781402588, "rewards/margins": 1.485105276107788, "rewards/rejected": 0.13728386163711548, "step": 344 }, { "epoch": 0.25205479452054796, "grad_norm": 63.98918251169978, "learning_rate": 3.147810218978102e-07, "logits/chosen": -2.7842164039611816, "logits/rejected": -2.5508511066436768, "logps/chosen": -709.7152099609375, "logps/rejected": -581.5972900390625, "loss": 0.4332, "rewards/accuracies": 0.875, "rewards/chosen": 1.134279489517212, "rewards/margins": 0.4700947403907776, "rewards/rejected": 0.6641847491264343, "step": 345 }, { "epoch": 0.2527853881278539, "grad_norm": 62.5484421643537, "learning_rate": 3.156934306569343e-07, "logits/chosen": -3.199657440185547, "logits/rejected": -2.6474757194519043, "logps/chosen": -676.4030151367188, "logps/rejected": -556.9136962890625, "loss": 0.4389, "rewards/accuracies": 0.75, "rewards/chosen": 1.653864860534668, "rewards/margins": 0.8211549520492554, "rewards/rejected": 0.8327099680900574, "step": 346 }, { "epoch": 0.2535159817351598, "grad_norm": 74.51051860517748, "learning_rate": 3.1660583941605837e-07, "logits/chosen": -2.725823402404785, "logits/rejected": -2.2539877891540527, "logps/chosen": -792.464599609375, "logps/rejected": -618.626220703125, "loss": 0.4796, "rewards/accuracies": 0.75, "rewards/chosen": 2.1740562915802, "rewards/margins": 0.8688697814941406, "rewards/rejected": 1.3051865100860596, "step": 347 }, { "epoch": 0.25424657534246575, "grad_norm": 72.17751554327705, "learning_rate": 3.1751824817518245e-07, "logits/chosen": -3.168917655944824, "logits/rejected": -1.9186875820159912, "logps/chosen": -798.685302734375, "logps/rejected": -477.2408447265625, "loss": 0.4923, "rewards/accuracies": 0.875, "rewards/chosen": 2.1557743549346924, "rewards/margins": 1.2908755540847778, "rewards/rejected": 0.8648988008499146, "step": 348 }, { "epoch": 0.2549771689497717, "grad_norm": 74.5678729466065, "learning_rate": 3.1843065693430653e-07, "logits/chosen": -2.8967783451080322, "logits/rejected": -2.3026318550109863, "logps/chosen": -928.54833984375, "logps/rejected": -666.3497314453125, "loss": 0.5068, "rewards/accuracies": 0.75, "rewards/chosen": 1.7114887237548828, "rewards/margins": 0.5908459424972534, "rewards/rejected": 1.120642900466919, "step": 349 }, { "epoch": 0.2557077625570776, "grad_norm": 67.33171658730936, "learning_rate": 3.1934306569343067e-07, "logits/chosen": -2.917487621307373, "logits/rejected": -2.487536907196045, "logps/chosen": -799.7305908203125, "logps/rejected": -752.6463012695312, "loss": 0.43, "rewards/accuracies": 0.5, "rewards/chosen": 1.614867091178894, "rewards/margins": 0.42322883009910583, "rewards/rejected": 1.1916382312774658, "step": 350 }, { "epoch": 0.25643835616438354, "grad_norm": 74.11982222641255, "learning_rate": 3.2025547445255475e-07, "logits/chosen": -2.857961654663086, "logits/rejected": -2.0536394119262695, "logps/chosen": -342.31463623046875, "logps/rejected": -246.44203186035156, "loss": 0.485, "rewards/accuracies": 0.625, "rewards/chosen": 0.7442098259925842, "rewards/margins": 0.41122135519981384, "rewards/rejected": 0.332988440990448, "step": 351 }, { "epoch": 0.25716894977168947, "grad_norm": 54.432335522101425, "learning_rate": 3.2116788321167883e-07, "logits/chosen": -3.4277918338775635, "logits/rejected": -2.23167085647583, "logps/chosen": -637.550048828125, "logps/rejected": -353.1361389160156, "loss": 0.3594, "rewards/accuracies": 0.875, "rewards/chosen": 1.7223354578018188, "rewards/margins": 1.1746888160705566, "rewards/rejected": 0.547646701335907, "step": 352 }, { "epoch": 0.25789954337899546, "grad_norm": 65.14172125186937, "learning_rate": 3.2208029197080286e-07, "logits/chosen": -2.7349019050598145, "logits/rejected": -2.1213040351867676, "logps/chosen": -354.80413818359375, "logps/rejected": -475.7827453613281, "loss": 0.4782, "rewards/accuracies": 0.625, "rewards/chosen": 1.273702621459961, "rewards/margins": 1.0503478050231934, "rewards/rejected": 0.2233547419309616, "step": 353 }, { "epoch": 0.2586301369863014, "grad_norm": 100.77797475298112, "learning_rate": 3.22992700729927e-07, "logits/chosen": -3.0285019874572754, "logits/rejected": -0.916675865650177, "logps/chosen": -619.8272705078125, "logps/rejected": -176.80502319335938, "loss": 0.6685, "rewards/accuracies": 0.875, "rewards/chosen": 1.4157419204711914, "rewards/margins": 1.332853078842163, "rewards/rejected": 0.08288885653018951, "step": 354 }, { "epoch": 0.2593607305936073, "grad_norm": 90.97128345836668, "learning_rate": 3.239051094890511e-07, "logits/chosen": -2.399510622024536, "logits/rejected": -1.8207087516784668, "logps/chosen": -499.43316650390625, "logps/rejected": -360.3272399902344, "loss": 0.5627, "rewards/accuracies": 0.75, "rewards/chosen": 1.4026367664337158, "rewards/margins": 1.0195523500442505, "rewards/rejected": 0.38308441638946533, "step": 355 }, { "epoch": 0.26009132420091324, "grad_norm": 62.49320080204226, "learning_rate": 3.2481751824817516e-07, "logits/chosen": -2.6886167526245117, "logits/rejected": -2.1871209144592285, "logps/chosen": -488.5709533691406, "logps/rejected": -377.8763427734375, "loss": 0.4686, "rewards/accuracies": 0.75, "rewards/chosen": 1.3007678985595703, "rewards/margins": 0.6745598316192627, "rewards/rejected": 0.6262080669403076, "step": 356 }, { "epoch": 0.2608219178082192, "grad_norm": 63.36215892465905, "learning_rate": 3.2572992700729925e-07, "logits/chosen": -2.5223498344421387, "logits/rejected": -2.483816146850586, "logps/chosen": -964.916259765625, "logps/rejected": -929.6116943359375, "loss": 0.4385, "rewards/accuracies": 1.0, "rewards/chosen": 1.912022590637207, "rewards/margins": 1.000117301940918, "rewards/rejected": 0.9119054079055786, "step": 357 }, { "epoch": 0.2615525114155251, "grad_norm": 59.55560115477175, "learning_rate": 3.266423357664234e-07, "logits/chosen": -2.386752128601074, "logits/rejected": -2.3189616203308105, "logps/chosen": -647.603271484375, "logps/rejected": -495.68841552734375, "loss": 0.4254, "rewards/accuracies": 0.5, "rewards/chosen": 1.1196098327636719, "rewards/margins": 0.4073161780834198, "rewards/rejected": 0.7122936844825745, "step": 358 }, { "epoch": 0.26228310502283103, "grad_norm": 70.28772132834618, "learning_rate": 3.275547445255474e-07, "logits/chosen": -2.905087947845459, "logits/rejected": -2.4893460273742676, "logps/chosen": -472.9693603515625, "logps/rejected": -348.1871032714844, "loss": 0.4848, "rewards/accuracies": 0.625, "rewards/chosen": 1.4381232261657715, "rewards/margins": 1.1200246810913086, "rewards/rejected": 0.3180985450744629, "step": 359 }, { "epoch": 0.26301369863013696, "grad_norm": 73.11611858458434, "learning_rate": 3.284671532846715e-07, "logits/chosen": -2.271131992340088, "logits/rejected": -1.6500517129898071, "logps/chosen": -575.5693969726562, "logps/rejected": -502.08013916015625, "loss": 0.477, "rewards/accuracies": 0.75, "rewards/chosen": 1.4339566230773926, "rewards/margins": 0.6196431517601013, "rewards/rejected": 0.8143135905265808, "step": 360 }, { "epoch": 0.26374429223744295, "grad_norm": 75.05851233163031, "learning_rate": 3.293795620437956e-07, "logits/chosen": -2.9240427017211914, "logits/rejected": -2.6160712242126465, "logps/chosen": -725.9891967773438, "logps/rejected": -589.2877807617188, "loss": 0.5117, "rewards/accuracies": 0.75, "rewards/chosen": 2.161649703979492, "rewards/margins": 1.2733564376831055, "rewards/rejected": 0.8882932066917419, "step": 361 }, { "epoch": 0.2644748858447489, "grad_norm": 68.84788840829651, "learning_rate": 3.302919708029197e-07, "logits/chosen": -3.0963103771209717, "logits/rejected": -2.083580255508423, "logps/chosen": -567.5488891601562, "logps/rejected": -374.01080322265625, "loss": 0.4955, "rewards/accuracies": 0.5, "rewards/chosen": 1.2662897109985352, "rewards/margins": 0.6540833711624146, "rewards/rejected": 0.6122063398361206, "step": 362 }, { "epoch": 0.2652054794520548, "grad_norm": 95.80558928110614, "learning_rate": 3.312043795620438e-07, "logits/chosen": -2.3870019912719727, "logits/rejected": -2.7108991146087646, "logps/chosen": -539.34423828125, "logps/rejected": -648.2745361328125, "loss": 0.5966, "rewards/accuracies": 0.625, "rewards/chosen": 1.1476855278015137, "rewards/margins": 0.13236147165298462, "rewards/rejected": 1.0153241157531738, "step": 363 }, { "epoch": 0.26593607305936073, "grad_norm": 70.94436571951115, "learning_rate": 3.321167883211679e-07, "logits/chosen": -2.6010470390319824, "logits/rejected": -2.4594132900238037, "logps/chosen": -739.75390625, "logps/rejected": -765.699951171875, "loss": 0.4747, "rewards/accuracies": 0.75, "rewards/chosen": 1.5196869373321533, "rewards/margins": 0.5209322571754456, "rewards/rejected": 0.998754620552063, "step": 364 }, { "epoch": 0.26666666666666666, "grad_norm": 77.11794690196959, "learning_rate": 3.3302919708029196e-07, "logits/chosen": -2.7163212299346924, "logits/rejected": -2.667623996734619, "logps/chosen": -634.9674682617188, "logps/rejected": -699.3147583007812, "loss": 0.489, "rewards/accuracies": 0.75, "rewards/chosen": 1.1369354724884033, "rewards/margins": 0.27647948265075684, "rewards/rejected": 0.8604559898376465, "step": 365 }, { "epoch": 0.2673972602739726, "grad_norm": 60.0863880045847, "learning_rate": 3.3394160583941604e-07, "logits/chosen": -2.886478900909424, "logits/rejected": -1.787085771560669, "logps/chosen": -409.0268249511719, "logps/rejected": -345.37786865234375, "loss": 0.372, "rewards/accuracies": 0.75, "rewards/chosen": 0.9466387629508972, "rewards/margins": 0.8209274411201477, "rewards/rejected": 0.12571127712726593, "step": 366 }, { "epoch": 0.2681278538812785, "grad_norm": 71.34534464928318, "learning_rate": 3.348540145985401e-07, "logits/chosen": -2.955108880996704, "logits/rejected": -2.253178358078003, "logps/chosen": -484.91162109375, "logps/rejected": -475.01275634765625, "loss": 0.432, "rewards/accuracies": 0.875, "rewards/chosen": 1.0630278587341309, "rewards/margins": 0.7278741598129272, "rewards/rejected": 0.33515360951423645, "step": 367 }, { "epoch": 0.26885844748858445, "grad_norm": 63.583566061104776, "learning_rate": 3.357664233576642e-07, "logits/chosen": -3.3694608211517334, "logits/rejected": -2.4357199668884277, "logps/chosen": -954.42138671875, "logps/rejected": -540.984375, "loss": 0.3865, "rewards/accuracies": 0.875, "rewards/chosen": 1.6273095607757568, "rewards/margins": 1.1576517820358276, "rewards/rejected": 0.46965789794921875, "step": 368 }, { "epoch": 0.26958904109589044, "grad_norm": 68.99682609887755, "learning_rate": 3.3667883211678834e-07, "logits/chosen": -2.5228986740112305, "logits/rejected": -2.495777130126953, "logps/chosen": -348.005859375, "logps/rejected": -420.9943542480469, "loss": 0.5107, "rewards/accuracies": 0.625, "rewards/chosen": 0.1602119505405426, "rewards/margins": -0.06407183408737183, "rewards/rejected": 0.22428378462791443, "step": 369 }, { "epoch": 0.27031963470319637, "grad_norm": 60.54029399718525, "learning_rate": 3.375912408759124e-07, "logits/chosen": -2.9932103157043457, "logits/rejected": -2.5944314002990723, "logps/chosen": -861.4326782226562, "logps/rejected": -565.2564086914062, "loss": 0.4533, "rewards/accuracies": 0.75, "rewards/chosen": 1.7288862466812134, "rewards/margins": 0.6853228211402893, "rewards/rejected": 1.0435633659362793, "step": 370 }, { "epoch": 0.2710502283105023, "grad_norm": 61.108741766726794, "learning_rate": 3.3850364963503645e-07, "logits/chosen": -2.6754932403564453, "logits/rejected": -2.3621597290039062, "logps/chosen": -801.98291015625, "logps/rejected": -756.056396484375, "loss": 0.4145, "rewards/accuracies": 0.75, "rewards/chosen": 1.6201605796813965, "rewards/margins": 0.9621235132217407, "rewards/rejected": 0.658037006855011, "step": 371 }, { "epoch": 0.2717808219178082, "grad_norm": 72.9258673510727, "learning_rate": 3.3941605839416053e-07, "logits/chosen": -2.4366323947906494, "logits/rejected": -1.9886013269424438, "logps/chosen": -738.192626953125, "logps/rejected": -545.857421875, "loss": 0.4222, "rewards/accuracies": 0.875, "rewards/chosen": 1.2282512187957764, "rewards/margins": 0.8965501189231873, "rewards/rejected": 0.3317010998725891, "step": 372 }, { "epoch": 0.27251141552511415, "grad_norm": 73.40425444286251, "learning_rate": 3.4032846715328467e-07, "logits/chosen": -2.230095863342285, "logits/rejected": -2.2623443603515625, "logps/chosen": -246.75405883789062, "logps/rejected": -242.29278564453125, "loss": 0.5504, "rewards/accuracies": 0.75, "rewards/chosen": 1.4387999773025513, "rewards/margins": 1.3302743434906006, "rewards/rejected": 0.1085255891084671, "step": 373 }, { "epoch": 0.2732420091324201, "grad_norm": 50.594242199582354, "learning_rate": 3.4124087591240875e-07, "logits/chosen": -3.072220802307129, "logits/rejected": -2.5158779621124268, "logps/chosen": -713.6994018554688, "logps/rejected": -501.1537780761719, "loss": 0.4022, "rewards/accuracies": 0.625, "rewards/chosen": 1.5378506183624268, "rewards/margins": 0.9936221837997437, "rewards/rejected": 0.5442284345626831, "step": 374 }, { "epoch": 0.273972602739726, "grad_norm": 76.96084510767471, "learning_rate": 3.4215328467153283e-07, "logits/chosen": -3.02197527885437, "logits/rejected": -1.9107486009597778, "logps/chosen": -546.15283203125, "logps/rejected": -298.21337890625, "loss": 0.4691, "rewards/accuracies": 0.875, "rewards/chosen": 1.640804648399353, "rewards/margins": 1.7185790538787842, "rewards/rejected": -0.07777443528175354, "step": 375 }, { "epoch": 0.27470319634703194, "grad_norm": 56.159406213474156, "learning_rate": 3.4306569343065697e-07, "logits/chosen": -2.760957717895508, "logits/rejected": -1.7923715114593506, "logps/chosen": -739.6419677734375, "logps/rejected": -634.124267578125, "loss": 0.3669, "rewards/accuracies": 0.875, "rewards/chosen": 1.7475645542144775, "rewards/margins": 1.1679000854492188, "rewards/rejected": 0.5796643495559692, "step": 376 }, { "epoch": 0.2754337899543379, "grad_norm": 74.55174967032129, "learning_rate": 3.43978102189781e-07, "logits/chosen": -3.0702731609344482, "logits/rejected": -1.9396933317184448, "logps/chosen": -827.3140869140625, "logps/rejected": -478.09619140625, "loss": 0.4993, "rewards/accuracies": 0.75, "rewards/chosen": 2.5454115867614746, "rewards/margins": 1.7131388187408447, "rewards/rejected": 0.8322728872299194, "step": 377 }, { "epoch": 0.27616438356164386, "grad_norm": 64.87715630448247, "learning_rate": 3.448905109489051e-07, "logits/chosen": -2.400067090988159, "logits/rejected": -2.8624908924102783, "logps/chosen": -252.84046936035156, "logps/rejected": -437.43304443359375, "loss": 0.4472, "rewards/accuracies": 0.375, "rewards/chosen": 0.555711567401886, "rewards/margins": 0.06545346230268478, "rewards/rejected": 0.4902580976486206, "step": 378 }, { "epoch": 0.2768949771689498, "grad_norm": 73.65961755135918, "learning_rate": 3.4580291970802916e-07, "logits/chosen": -2.4445083141326904, "logits/rejected": -2.591578722000122, "logps/chosen": -369.845947265625, "logps/rejected": -430.9102478027344, "loss": 0.494, "rewards/accuracies": 0.75, "rewards/chosen": 0.8981108665466309, "rewards/margins": 0.5167144536972046, "rewards/rejected": 0.3813963830471039, "step": 379 }, { "epoch": 0.2776255707762557, "grad_norm": 70.54497332708227, "learning_rate": 3.467153284671533e-07, "logits/chosen": -2.470676898956299, "logits/rejected": -2.2289621829986572, "logps/chosen": -527.9271850585938, "logps/rejected": -481.402099609375, "loss": 0.5215, "rewards/accuracies": 0.75, "rewards/chosen": 0.7302263379096985, "rewards/margins": 0.3210323452949524, "rewards/rejected": 0.4091939926147461, "step": 380 }, { "epoch": 0.27835616438356164, "grad_norm": 66.33563156875044, "learning_rate": 3.476277372262774e-07, "logits/chosen": -2.941246509552002, "logits/rejected": -2.615082263946533, "logps/chosen": -519.0614624023438, "logps/rejected": -467.2332458496094, "loss": 0.4519, "rewards/accuracies": 0.625, "rewards/chosen": 1.3365833759307861, "rewards/margins": 0.8681721687316895, "rewards/rejected": 0.46841126680374146, "step": 381 }, { "epoch": 0.2790867579908676, "grad_norm": 74.90954180293055, "learning_rate": 3.485401459854014e-07, "logits/chosen": -2.9824562072753906, "logits/rejected": -2.1903834342956543, "logps/chosen": -566.4522705078125, "logps/rejected": -320.92095947265625, "loss": 0.4811, "rewards/accuracies": 0.875, "rewards/chosen": 1.0372496843338013, "rewards/margins": 0.9067646861076355, "rewards/rejected": 0.13048501312732697, "step": 382 }, { "epoch": 0.2798173515981735, "grad_norm": 57.52864011597832, "learning_rate": 3.494525547445255e-07, "logits/chosen": -2.6866602897644043, "logits/rejected": -2.182722806930542, "logps/chosen": -660.5279541015625, "logps/rejected": -548.9330444335938, "loss": 0.3826, "rewards/accuracies": 0.75, "rewards/chosen": 1.9498240947723389, "rewards/margins": 1.106075406074524, "rewards/rejected": 0.8437488079071045, "step": 383 }, { "epoch": 0.28054794520547943, "grad_norm": 57.70880269251065, "learning_rate": 3.5036496350364963e-07, "logits/chosen": -2.7158241271972656, "logits/rejected": -2.1544179916381836, "logps/chosen": -799.2645874023438, "logps/rejected": -508.7663879394531, "loss": 0.4444, "rewards/accuracies": 1.0, "rewards/chosen": 1.592095136642456, "rewards/margins": 0.9800583124160767, "rewards/rejected": 0.6120368838310242, "step": 384 }, { "epoch": 0.28127853881278536, "grad_norm": 61.01791368947012, "learning_rate": 3.512773722627737e-07, "logits/chosen": -3.125530958175659, "logits/rejected": -1.835098147392273, "logps/chosen": -668.211181640625, "logps/rejected": -502.96954345703125, "loss": 0.3296, "rewards/accuracies": 0.875, "rewards/chosen": 2.243760108947754, "rewards/margins": 1.8235468864440918, "rewards/rejected": 0.4202132821083069, "step": 385 }, { "epoch": 0.28200913242009135, "grad_norm": 79.49546100446553, "learning_rate": 3.521897810218978e-07, "logits/chosen": -2.6909775733947754, "logits/rejected": -1.6820651292800903, "logps/chosen": -801.8557739257812, "logps/rejected": -489.9557800292969, "loss": 0.4485, "rewards/accuracies": 0.875, "rewards/chosen": 2.097705125808716, "rewards/margins": 1.478317379951477, "rewards/rejected": 0.6193877458572388, "step": 386 }, { "epoch": 0.2827397260273973, "grad_norm": 64.64371631174797, "learning_rate": 3.5310218978102193e-07, "logits/chosen": -2.4300169944763184, "logits/rejected": -2.074683666229248, "logps/chosen": -707.4773559570312, "logps/rejected": -503.5528564453125, "loss": 0.4533, "rewards/accuracies": 0.875, "rewards/chosen": 2.0083255767822266, "rewards/margins": 1.3360748291015625, "rewards/rejected": 0.6722507476806641, "step": 387 }, { "epoch": 0.2834703196347032, "grad_norm": 62.75065557737724, "learning_rate": 3.5401459854014596e-07, "logits/chosen": -3.039194345474243, "logits/rejected": -1.882209062576294, "logps/chosen": -438.83197021484375, "logps/rejected": -309.5884094238281, "loss": 0.4419, "rewards/accuracies": 0.875, "rewards/chosen": 1.2890483140945435, "rewards/margins": 1.0347347259521484, "rewards/rejected": 0.25431373715400696, "step": 388 }, { "epoch": 0.28420091324200913, "grad_norm": 66.38574315718135, "learning_rate": 3.5492700729927004e-07, "logits/chosen": -2.8681998252868652, "logits/rejected": -2.032710552215576, "logps/chosen": -682.5647583007812, "logps/rejected": -434.11737060546875, "loss": 0.4234, "rewards/accuracies": 0.875, "rewards/chosen": 1.554849624633789, "rewards/margins": 1.3126732110977173, "rewards/rejected": 0.24217624962329865, "step": 389 }, { "epoch": 0.28493150684931506, "grad_norm": 67.78540285226421, "learning_rate": 3.558394160583941e-07, "logits/chosen": -3.242905616760254, "logits/rejected": -2.122178792953491, "logps/chosen": -670.676025390625, "logps/rejected": -455.0798645019531, "loss": 0.5132, "rewards/accuracies": 1.0, "rewards/chosen": 1.5395225286483765, "rewards/margins": 1.5167927742004395, "rewards/rejected": 0.022729873657226562, "step": 390 }, { "epoch": 0.285662100456621, "grad_norm": 66.40731984432325, "learning_rate": 3.5675182481751826e-07, "logits/chosen": -3.1677541732788086, "logits/rejected": -2.1890695095062256, "logps/chosen": -866.6947021484375, "logps/rejected": -454.70361328125, "loss": 0.4193, "rewards/accuracies": 0.5, "rewards/chosen": 1.9513309001922607, "rewards/margins": 1.38962721824646, "rewards/rejected": 0.5617036819458008, "step": 391 }, { "epoch": 0.2863926940639269, "grad_norm": 73.46874003083727, "learning_rate": 3.5766423357664234e-07, "logits/chosen": -3.2884628772735596, "logits/rejected": -2.959207534790039, "logps/chosen": -669.9254760742188, "logps/rejected": -509.001708984375, "loss": 0.5573, "rewards/accuracies": 0.75, "rewards/chosen": 0.818830132484436, "rewards/margins": 0.46574079990386963, "rewards/rejected": 0.3530893325805664, "step": 392 }, { "epoch": 0.28712328767123285, "grad_norm": 69.15514148456353, "learning_rate": 3.585766423357664e-07, "logits/chosen": -2.728135824203491, "logits/rejected": -1.9368171691894531, "logps/chosen": -952.5960693359375, "logps/rejected": -645.7799072265625, "loss": 0.4373, "rewards/accuracies": 0.75, "rewards/chosen": 2.614717483520508, "rewards/margins": 1.8374053239822388, "rewards/rejected": 0.7773122787475586, "step": 393 }, { "epoch": 0.28785388127853884, "grad_norm": 62.05622332623558, "learning_rate": 3.5948905109489045e-07, "logits/chosen": -2.5525035858154297, "logits/rejected": -2.297220230102539, "logps/chosen": -587.1889038085938, "logps/rejected": -566.006103515625, "loss": 0.4025, "rewards/accuracies": 0.875, "rewards/chosen": 1.603887677192688, "rewards/margins": 0.8365820646286011, "rewards/rejected": 0.7673057317733765, "step": 394 }, { "epoch": 0.28858447488584477, "grad_norm": 63.39724515591647, "learning_rate": 3.604014598540146e-07, "logits/chosen": -2.590949773788452, "logits/rejected": -1.8515651226043701, "logps/chosen": -444.439208984375, "logps/rejected": -275.4432678222656, "loss": 0.5132, "rewards/accuracies": 0.625, "rewards/chosen": 0.5060741305351257, "rewards/margins": 0.4836372137069702, "rewards/rejected": 0.022436948493123055, "step": 395 }, { "epoch": 0.2893150684931507, "grad_norm": 65.0191227880306, "learning_rate": 3.6131386861313867e-07, "logits/chosen": -1.9494882822036743, "logits/rejected": -2.1524899005889893, "logps/chosen": -367.6574401855469, "logps/rejected": -545.2989501953125, "loss": 0.4849, "rewards/accuracies": 0.75, "rewards/chosen": 0.8524444103240967, "rewards/margins": 1.1184545755386353, "rewards/rejected": -0.2660100758075714, "step": 396 }, { "epoch": 0.2900456621004566, "grad_norm": 53.85080028566196, "learning_rate": 3.6222627737226275e-07, "logits/chosen": -2.6617238521575928, "logits/rejected": -2.202892541885376, "logps/chosen": -749.8721313476562, "logps/rejected": -610.5281982421875, "loss": 0.3696, "rewards/accuracies": 1.0, "rewards/chosen": 1.3859024047851562, "rewards/margins": 0.7530721426010132, "rewards/rejected": 0.6328302621841431, "step": 397 }, { "epoch": 0.29077625570776255, "grad_norm": 80.5329173769956, "learning_rate": 3.6313868613138683e-07, "logits/chosen": -2.3250205516815186, "logits/rejected": -1.6271016597747803, "logps/chosen": -629.6231079101562, "logps/rejected": -501.0259704589844, "loss": 0.6548, "rewards/accuracies": 0.875, "rewards/chosen": 1.3880834579467773, "rewards/margins": 0.9981052875518799, "rewards/rejected": 0.38997822999954224, "step": 398 }, { "epoch": 0.2915068493150685, "grad_norm": 63.434948378486524, "learning_rate": 3.6405109489051097e-07, "logits/chosen": -2.9334325790405273, "logits/rejected": -2.244391441345215, "logps/chosen": -615.547119140625, "logps/rejected": -503.2747802734375, "loss": 0.5631, "rewards/accuracies": 0.875, "rewards/chosen": 1.2154011726379395, "rewards/margins": 1.0267549753189087, "rewards/rejected": 0.188646137714386, "step": 399 }, { "epoch": 0.2922374429223744, "grad_norm": 63.05464829713191, "learning_rate": 3.64963503649635e-07, "logits/chosen": -2.727447986602783, "logits/rejected": -1.6556477546691895, "logps/chosen": -499.69287109375, "logps/rejected": -322.16143798828125, "loss": 0.474, "rewards/accuracies": 0.875, "rewards/chosen": 0.9981526732444763, "rewards/margins": 0.9117544889450073, "rewards/rejected": 0.08639810979366302, "step": 400 }, { "epoch": 0.29296803652968034, "grad_norm": 55.37115974974021, "learning_rate": 3.658759124087591e-07, "logits/chosen": -3.242347002029419, "logits/rejected": -2.4830305576324463, "logps/chosen": -626.187255859375, "logps/rejected": -435.19842529296875, "loss": 0.385, "rewards/accuracies": 1.0, "rewards/chosen": 1.5346391201019287, "rewards/margins": 1.1428285837173462, "rewards/rejected": 0.3918105959892273, "step": 401 }, { "epoch": 0.2936986301369863, "grad_norm": 76.4411570312829, "learning_rate": 3.667883211678832e-07, "logits/chosen": -2.546297788619995, "logits/rejected": -2.1490650177001953, "logps/chosen": -686.1453857421875, "logps/rejected": -610.1552124023438, "loss": 0.5387, "rewards/accuracies": 0.875, "rewards/chosen": 1.8180707693099976, "rewards/margins": 1.100882649421692, "rewards/rejected": 0.7171881198883057, "step": 402 }, { "epoch": 0.29442922374429226, "grad_norm": 64.40634476582062, "learning_rate": 3.677007299270073e-07, "logits/chosen": -2.7202529907226562, "logits/rejected": -2.0475964546203613, "logps/chosen": -641.5408325195312, "logps/rejected": -388.8033752441406, "loss": 0.4172, "rewards/accuracies": 0.875, "rewards/chosen": 1.084060549736023, "rewards/margins": 0.821880042552948, "rewards/rejected": 0.2621805965900421, "step": 403 }, { "epoch": 0.2951598173515982, "grad_norm": 78.7579795064443, "learning_rate": 3.686131386861314e-07, "logits/chosen": -2.392764091491699, "logits/rejected": -1.8379217386245728, "logps/chosen": -594.4408569335938, "logps/rejected": -473.67657470703125, "loss": 0.5492, "rewards/accuracies": 0.5, "rewards/chosen": 1.3847202062606812, "rewards/margins": 0.9830659627914429, "rewards/rejected": 0.40165427327156067, "step": 404 }, { "epoch": 0.2958904109589041, "grad_norm": 71.36808334481528, "learning_rate": 3.6952554744525546e-07, "logits/chosen": -2.8430001735687256, "logits/rejected": -2.106400966644287, "logps/chosen": -737.970458984375, "logps/rejected": -522.6317138671875, "loss": 0.5021, "rewards/accuracies": 0.625, "rewards/chosen": 1.5014264583587646, "rewards/margins": 0.544174075126648, "rewards/rejected": 0.9572522044181824, "step": 405 }, { "epoch": 0.29662100456621004, "grad_norm": 63.54877863204768, "learning_rate": 3.7043795620437954e-07, "logits/chosen": -2.802656412124634, "logits/rejected": -2.3756067752838135, "logps/chosen": -994.13623046875, "logps/rejected": -813.6070556640625, "loss": 0.379, "rewards/accuracies": 0.75, "rewards/chosen": 2.0577597618103027, "rewards/margins": 1.1061921119689941, "rewards/rejected": 0.9515675902366638, "step": 406 }, { "epoch": 0.297351598173516, "grad_norm": 83.1100080465644, "learning_rate": 3.7135036496350363e-07, "logits/chosen": -3.185398817062378, "logits/rejected": -2.277226209640503, "logps/chosen": -1049.6358642578125, "logps/rejected": -646.7078247070312, "loss": 0.5563, "rewards/accuracies": 0.875, "rewards/chosen": 2.2733683586120605, "rewards/margins": 1.3938696384429932, "rewards/rejected": 0.8794988989830017, "step": 407 }, { "epoch": 0.2980821917808219, "grad_norm": 66.77045712972541, "learning_rate": 3.722627737226277e-07, "logits/chosen": -2.556863784790039, "logits/rejected": -2.221888303756714, "logps/chosen": -479.09381103515625, "logps/rejected": -469.9384460449219, "loss": 0.5268, "rewards/accuracies": 0.625, "rewards/chosen": 0.8834705352783203, "rewards/margins": 0.5647731423377991, "rewards/rejected": 0.31869742274284363, "step": 408 }, { "epoch": 0.29881278538812783, "grad_norm": 71.79353399462586, "learning_rate": 3.731751824817518e-07, "logits/chosen": -3.0326790809631348, "logits/rejected": -2.666672945022583, "logps/chosen": -842.289306640625, "logps/rejected": -790.7916870117188, "loss": 0.477, "rewards/accuracies": 0.5, "rewards/chosen": 1.8810371160507202, "rewards/margins": 0.8053518533706665, "rewards/rejected": 1.0756852626800537, "step": 409 }, { "epoch": 0.29954337899543376, "grad_norm": 58.33352632036529, "learning_rate": 3.7408759124087593e-07, "logits/chosen": -3.0157649517059326, "logits/rejected": -2.233018398284912, "logps/chosen": -424.5609436035156, "logps/rejected": -337.77313232421875, "loss": 0.415, "rewards/accuracies": 1.0, "rewards/chosen": 1.801344871520996, "rewards/margins": 1.668144702911377, "rewards/rejected": 0.13320012390613556, "step": 410 }, { "epoch": 0.30027397260273975, "grad_norm": 62.82418851134788, "learning_rate": 3.75e-07, "logits/chosen": -2.6918485164642334, "logits/rejected": -2.108341932296753, "logps/chosen": -637.0249633789062, "logps/rejected": -524.2815551757812, "loss": 0.4828, "rewards/accuracies": 0.5, "rewards/chosen": 1.4878185987472534, "rewards/margins": 0.4413679242134094, "rewards/rejected": 1.0464506149291992, "step": 411 }, { "epoch": 0.3010045662100457, "grad_norm": 83.5613809994261, "learning_rate": 3.7591240875912404e-07, "logits/chosen": -2.6447253227233887, "logits/rejected": -2.37985897064209, "logps/chosen": -934.8235473632812, "logps/rejected": -923.3673095703125, "loss": 0.5585, "rewards/accuracies": 0.75, "rewards/chosen": 1.6539980173110962, "rewards/margins": 0.6714655160903931, "rewards/rejected": 0.9825325012207031, "step": 412 }, { "epoch": 0.3017351598173516, "grad_norm": 72.96948607765783, "learning_rate": 3.768248175182482e-07, "logits/chosen": -2.8941171169281006, "logits/rejected": -2.221369504928589, "logps/chosen": -603.005615234375, "logps/rejected": -430.2441711425781, "loss": 0.5711, "rewards/accuracies": 0.875, "rewards/chosen": 1.3409483432769775, "rewards/margins": 0.7378309369087219, "rewards/rejected": 0.6031173467636108, "step": 413 }, { "epoch": 0.30246575342465754, "grad_norm": 73.00562505083084, "learning_rate": 3.7773722627737226e-07, "logits/chosen": -3.252376079559326, "logits/rejected": -2.1639280319213867, "logps/chosen": -743.2987670898438, "logps/rejected": -547.860595703125, "loss": 0.4399, "rewards/accuracies": 0.875, "rewards/chosen": 2.668722629547119, "rewards/margins": 1.8440520763397217, "rewards/rejected": 0.8246706128120422, "step": 414 }, { "epoch": 0.30319634703196346, "grad_norm": 67.68670648195996, "learning_rate": 3.7864963503649634e-07, "logits/chosen": -2.406378746032715, "logits/rejected": -2.597806930541992, "logps/chosen": -477.2957458496094, "logps/rejected": -532.0967407226562, "loss": 0.4571, "rewards/accuracies": 0.375, "rewards/chosen": 1.2229046821594238, "rewards/margins": 0.18518687784671783, "rewards/rejected": 1.0377178192138672, "step": 415 }, { "epoch": 0.3039269406392694, "grad_norm": 70.29147255024075, "learning_rate": 3.795620437956204e-07, "logits/chosen": -3.0422589778900146, "logits/rejected": -2.291886806488037, "logps/chosen": -747.72802734375, "logps/rejected": -561.6082763671875, "loss": 0.4756, "rewards/accuracies": 0.625, "rewards/chosen": 1.5787537097930908, "rewards/margins": 0.6957694292068481, "rewards/rejected": 0.8829842805862427, "step": 416 }, { "epoch": 0.3046575342465753, "grad_norm": 68.51140772368679, "learning_rate": 3.804744525547445e-07, "logits/chosen": -2.390530586242676, "logits/rejected": -2.048525810241699, "logps/chosen": -428.1729736328125, "logps/rejected": -350.5956726074219, "loss": 0.586, "rewards/accuracies": 0.5, "rewards/chosen": 1.2034071683883667, "rewards/margins": 1.0471317768096924, "rewards/rejected": 0.15627548098564148, "step": 417 }, { "epoch": 0.30538812785388125, "grad_norm": 75.9631001087041, "learning_rate": 3.813868613138686e-07, "logits/chosen": -2.806358814239502, "logits/rejected": -2.500453233718872, "logps/chosen": -861.806396484375, "logps/rejected": -799.6177978515625, "loss": 0.5614, "rewards/accuracies": 0.625, "rewards/chosen": 1.9531975984573364, "rewards/margins": 0.6257984638214111, "rewards/rejected": 1.3273990154266357, "step": 418 }, { "epoch": 0.30611872146118724, "grad_norm": 74.51909623126814, "learning_rate": 3.8229927007299267e-07, "logits/chosen": -2.670886516571045, "logits/rejected": -2.638563871383667, "logps/chosen": -638.573974609375, "logps/rejected": -662.8447265625, "loss": 0.4788, "rewards/accuracies": 0.375, "rewards/chosen": 0.9784455299377441, "rewards/margins": -0.009990274906158447, "rewards/rejected": 0.9884356260299683, "step": 419 }, { "epoch": 0.30684931506849317, "grad_norm": 51.425965064474, "learning_rate": 3.8321167883211675e-07, "logits/chosen": -2.75875186920166, "logits/rejected": -2.2063443660736084, "logps/chosen": -692.3477172851562, "logps/rejected": -534.5147705078125, "loss": 0.3808, "rewards/accuracies": 0.75, "rewards/chosen": 1.5941345691680908, "rewards/margins": 1.0291024446487427, "rewards/rejected": 0.5650320053100586, "step": 420 }, { "epoch": 0.3075799086757991, "grad_norm": 56.67445541409617, "learning_rate": 3.841240875912409e-07, "logits/chosen": -2.212170362472534, "logits/rejected": -1.9447052478790283, "logps/chosen": -624.6950073242188, "logps/rejected": -550.9940795898438, "loss": 0.3742, "rewards/accuracies": 0.625, "rewards/chosen": 1.2681156396865845, "rewards/margins": 0.6752625703811646, "rewards/rejected": 0.5928530693054199, "step": 421 }, { "epoch": 0.308310502283105, "grad_norm": 74.78923836877368, "learning_rate": 3.8503649635036497e-07, "logits/chosen": -2.6003079414367676, "logits/rejected": -2.7091920375823975, "logps/chosen": -632.369873046875, "logps/rejected": -561.746337890625, "loss": 0.5083, "rewards/accuracies": 0.75, "rewards/chosen": 1.2401961088180542, "rewards/margins": 0.4923917055130005, "rewards/rejected": 0.7478044033050537, "step": 422 }, { "epoch": 0.30904109589041096, "grad_norm": 68.17591990349305, "learning_rate": 3.85948905109489e-07, "logits/chosen": -2.682901620864868, "logits/rejected": -2.517759084701538, "logps/chosen": -714.9722900390625, "logps/rejected": -646.5712890625, "loss": 0.5425, "rewards/accuracies": 0.625, "rewards/chosen": 1.3442649841308594, "rewards/margins": 0.48620715737342834, "rewards/rejected": 0.8580577969551086, "step": 423 }, { "epoch": 0.3097716894977169, "grad_norm": 68.91358379340026, "learning_rate": 3.8686131386861313e-07, "logits/chosen": -2.542020082473755, "logits/rejected": -2.1021361351013184, "logps/chosen": -650.6712646484375, "logps/rejected": -818.2941284179688, "loss": 0.4213, "rewards/accuracies": 0.75, "rewards/chosen": 1.464009165763855, "rewards/margins": 1.134708285331726, "rewards/rejected": 0.32930079102516174, "step": 424 }, { "epoch": 0.3105022831050228, "grad_norm": 86.08863414927373, "learning_rate": 3.877737226277372e-07, "logits/chosen": -3.292895555496216, "logits/rejected": -2.4847872257232666, "logps/chosen": -655.4714965820312, "logps/rejected": -499.7412109375, "loss": 0.5936, "rewards/accuracies": 0.625, "rewards/chosen": 1.3980798721313477, "rewards/margins": 0.7421797513961792, "rewards/rejected": 0.6559001207351685, "step": 425 }, { "epoch": 0.31123287671232874, "grad_norm": 66.72798530929508, "learning_rate": 3.886861313868613e-07, "logits/chosen": -2.5342953205108643, "logits/rejected": -1.6308666467666626, "logps/chosen": -546.7681884765625, "logps/rejected": -412.86590576171875, "loss": 0.4619, "rewards/accuracies": 0.75, "rewards/chosen": 0.9704580307006836, "rewards/margins": 0.8511885404586792, "rewards/rejected": 0.119269460439682, "step": 426 }, { "epoch": 0.31196347031963473, "grad_norm": 53.20979216668559, "learning_rate": 3.895985401459854e-07, "logits/chosen": -2.511795997619629, "logits/rejected": -1.6758430004119873, "logps/chosen": -591.0272827148438, "logps/rejected": -484.1896667480469, "loss": 0.4359, "rewards/accuracies": 0.625, "rewards/chosen": 0.7769079208374023, "rewards/margins": 0.7935389876365662, "rewards/rejected": -0.016631033271551132, "step": 427 }, { "epoch": 0.31269406392694066, "grad_norm": 72.68171260816747, "learning_rate": 3.905109489051095e-07, "logits/chosen": -2.633873462677002, "logits/rejected": -2.2148144245147705, "logps/chosen": -750.1069946289062, "logps/rejected": -576.8854370117188, "loss": 0.5413, "rewards/accuracies": 0.625, "rewards/chosen": 0.9069653153419495, "rewards/margins": 0.16593578457832336, "rewards/rejected": 0.7410295605659485, "step": 428 }, { "epoch": 0.3134246575342466, "grad_norm": 79.7932050140001, "learning_rate": 3.9142335766423354e-07, "logits/chosen": -2.5299201011657715, "logits/rejected": -2.5208230018615723, "logps/chosen": -737.568359375, "logps/rejected": -988.9088745117188, "loss": 0.5172, "rewards/accuracies": 0.875, "rewards/chosen": 1.8865628242492676, "rewards/margins": 1.0825170278549194, "rewards/rejected": 0.8040457367897034, "step": 429 }, { "epoch": 0.3141552511415525, "grad_norm": 50.028412337547, "learning_rate": 3.9233576642335763e-07, "logits/chosen": -3.142920970916748, "logits/rejected": -1.491673469543457, "logps/chosen": -716.6190185546875, "logps/rejected": -360.9404602050781, "loss": 0.34, "rewards/accuracies": 0.875, "rewards/chosen": 2.555847644805908, "rewards/margins": 2.318981170654297, "rewards/rejected": 0.23686668276786804, "step": 430 }, { "epoch": 0.31488584474885845, "grad_norm": 70.04630862195738, "learning_rate": 3.932481751824817e-07, "logits/chosen": -2.6813926696777344, "logits/rejected": -1.6706784963607788, "logps/chosen": -427.72955322265625, "logps/rejected": -171.67938232421875, "loss": 0.5009, "rewards/accuracies": 0.875, "rewards/chosen": 0.6657592058181763, "rewards/margins": 0.7643535137176514, "rewards/rejected": -0.09859432280063629, "step": 431 }, { "epoch": 0.3156164383561644, "grad_norm": 67.25211282694023, "learning_rate": 3.9416058394160584e-07, "logits/chosen": -2.383779764175415, "logits/rejected": -2.125863790512085, "logps/chosen": -858.517333984375, "logps/rejected": -807.4769287109375, "loss": 0.4401, "rewards/accuracies": 0.625, "rewards/chosen": 2.4168999195098877, "rewards/margins": 1.5250247716903687, "rewards/rejected": 0.8918753862380981, "step": 432 }, { "epoch": 0.3163470319634703, "grad_norm": 72.4356530178682, "learning_rate": 3.9507299270072993e-07, "logits/chosen": -2.707230567932129, "logits/rejected": -2.291544198989868, "logps/chosen": -437.80010986328125, "logps/rejected": -555.25927734375, "loss": 0.5066, "rewards/accuracies": 0.625, "rewards/chosen": 1.1402331590652466, "rewards/margins": 0.6953232288360596, "rewards/rejected": 0.444909930229187, "step": 433 }, { "epoch": 0.31707762557077623, "grad_norm": 62.62770546800753, "learning_rate": 3.95985401459854e-07, "logits/chosen": -2.5424277782440186, "logits/rejected": -1.5201289653778076, "logps/chosen": -680.2384033203125, "logps/rejected": -445.1466979980469, "loss": 0.3925, "rewards/accuracies": 1.0, "rewards/chosen": 1.9177610874176025, "rewards/margins": 1.4073165655136108, "rewards/rejected": 0.5104445219039917, "step": 434 }, { "epoch": 0.3178082191780822, "grad_norm": 70.589506429559, "learning_rate": 3.9689781021897804e-07, "logits/chosen": -2.656939744949341, "logits/rejected": -2.839566469192505, "logps/chosen": -490.5395812988281, "logps/rejected": -579.1959838867188, "loss": 0.5241, "rewards/accuracies": 0.625, "rewards/chosen": 1.3023680448532104, "rewards/margins": 0.7615251541137695, "rewards/rejected": 0.5408428311347961, "step": 435 }, { "epoch": 0.31853881278538815, "grad_norm": 63.18879627441383, "learning_rate": 3.978102189781022e-07, "logits/chosen": -3.0235512256622314, "logits/rejected": -1.6664369106292725, "logps/chosen": -653.3387451171875, "logps/rejected": -293.6246643066406, "loss": 0.3317, "rewards/accuracies": 1.0, "rewards/chosen": 1.734316110610962, "rewards/margins": 1.750242829322815, "rewards/rejected": -0.015926741063594818, "step": 436 }, { "epoch": 0.3192694063926941, "grad_norm": 80.39085125089639, "learning_rate": 3.9872262773722626e-07, "logits/chosen": -2.953507661819458, "logits/rejected": -2.2685389518737793, "logps/chosen": -604.7986450195312, "logps/rejected": -573.8317260742188, "loss": 0.5233, "rewards/accuracies": 0.75, "rewards/chosen": 1.8264753818511963, "rewards/margins": 1.0823619365692139, "rewards/rejected": 0.7441134452819824, "step": 437 }, { "epoch": 0.32, "grad_norm": 67.39281130524977, "learning_rate": 3.9963503649635034e-07, "logits/chosen": -2.6830549240112305, "logits/rejected": -1.6902358531951904, "logps/chosen": -790.457763671875, "logps/rejected": -530.90234375, "loss": 0.5253, "rewards/accuracies": 1.0, "rewards/chosen": 1.7585811614990234, "rewards/margins": 1.236290693283081, "rewards/rejected": 0.5222905874252319, "step": 438 }, { "epoch": 0.32073059360730594, "grad_norm": 81.7758464493031, "learning_rate": 4.005474452554745e-07, "logits/chosen": -2.9593100547790527, "logits/rejected": -2.002012252807617, "logps/chosen": -1005.2999877929688, "logps/rejected": -727.821044921875, "loss": 0.5337, "rewards/accuracies": 0.75, "rewards/chosen": 1.7989436388015747, "rewards/margins": 0.9689491987228394, "rewards/rejected": 0.8299944400787354, "step": 439 }, { "epoch": 0.32146118721461187, "grad_norm": 64.4182689840875, "learning_rate": 4.0145985401459856e-07, "logits/chosen": -3.039970636367798, "logits/rejected": -2.300909996032715, "logps/chosen": -628.4385986328125, "logps/rejected": -412.5433349609375, "loss": 0.4537, "rewards/accuracies": 0.625, "rewards/chosen": 1.213897705078125, "rewards/margins": 0.7188777327537537, "rewards/rejected": 0.49501994252204895, "step": 440 }, { "epoch": 0.3221917808219178, "grad_norm": 65.73156154842017, "learning_rate": 4.023722627737226e-07, "logits/chosen": -2.624136447906494, "logits/rejected": -2.659198760986328, "logps/chosen": -647.8939208984375, "logps/rejected": -518.5242309570312, "loss": 0.435, "rewards/accuracies": 0.75, "rewards/chosen": 1.457006812095642, "rewards/margins": 0.6893731355667114, "rewards/rejected": 0.7676336765289307, "step": 441 }, { "epoch": 0.3229223744292237, "grad_norm": 55.42713667932658, "learning_rate": 4.0328467153284667e-07, "logits/chosen": -3.2642672061920166, "logits/rejected": -1.9783011674880981, "logps/chosen": -1062.185302734375, "logps/rejected": -666.7640380859375, "loss": 0.3275, "rewards/accuracies": 1.0, "rewards/chosen": 2.232957363128662, "rewards/margins": 1.728939414024353, "rewards/rejected": 0.5040180683135986, "step": 442 }, { "epoch": 0.32365296803652965, "grad_norm": 61.09813962810672, "learning_rate": 4.041970802919708e-07, "logits/chosen": -2.615471124649048, "logits/rejected": -1.497684121131897, "logps/chosen": -533.1825561523438, "logps/rejected": -324.2681579589844, "loss": 0.3821, "rewards/accuracies": 0.875, "rewards/chosen": 1.3101308345794678, "rewards/margins": 1.182222604751587, "rewards/rejected": 0.12790821492671967, "step": 443 }, { "epoch": 0.32438356164383564, "grad_norm": 63.93232009503781, "learning_rate": 4.051094890510949e-07, "logits/chosen": -2.952669382095337, "logits/rejected": -2.073992967605591, "logps/chosen": -580.8018798828125, "logps/rejected": -378.9000244140625, "loss": 0.4623, "rewards/accuracies": 0.875, "rewards/chosen": 1.0705572366714478, "rewards/margins": 0.8738815188407898, "rewards/rejected": 0.19667567312717438, "step": 444 }, { "epoch": 0.32511415525114157, "grad_norm": 59.7861627293996, "learning_rate": 4.0602189781021897e-07, "logits/chosen": -2.670620918273926, "logits/rejected": -1.7812598943710327, "logps/chosen": -780.9053955078125, "logps/rejected": -678.7969360351562, "loss": 0.4018, "rewards/accuracies": 0.75, "rewards/chosen": 2.14333176612854, "rewards/margins": 1.4925963878631592, "rewards/rejected": 0.650735080242157, "step": 445 }, { "epoch": 0.3258447488584475, "grad_norm": 61.48463309983947, "learning_rate": 4.06934306569343e-07, "logits/chosen": -2.271239757537842, "logits/rejected": -2.1667609214782715, "logps/chosen": -365.85455322265625, "logps/rejected": -388.2726745605469, "loss": 0.4881, "rewards/accuracies": 0.75, "rewards/chosen": 1.044189453125, "rewards/margins": 0.9612910747528076, "rewards/rejected": 0.08289843052625656, "step": 446 }, { "epoch": 0.3265753424657534, "grad_norm": 60.530259817017345, "learning_rate": 4.0784671532846713e-07, "logits/chosen": -2.501105546951294, "logits/rejected": -2.393195629119873, "logps/chosen": -525.487060546875, "logps/rejected": -705.09912109375, "loss": 0.4865, "rewards/accuracies": 0.75, "rewards/chosen": 1.1649763584136963, "rewards/margins": 0.6730178594589233, "rewards/rejected": 0.49195849895477295, "step": 447 }, { "epoch": 0.32730593607305936, "grad_norm": 59.66412336729251, "learning_rate": 4.087591240875912e-07, "logits/chosen": -2.9038596153259277, "logits/rejected": -1.9915305376052856, "logps/chosen": -635.3738403320312, "logps/rejected": -505.61767578125, "loss": 0.331, "rewards/accuracies": 0.875, "rewards/chosen": 2.2829699516296387, "rewards/margins": 1.85551917552948, "rewards/rejected": 0.4274507462978363, "step": 448 }, { "epoch": 0.3280365296803653, "grad_norm": 66.66127500983985, "learning_rate": 4.096715328467153e-07, "logits/chosen": -2.599583387374878, "logits/rejected": -2.380450963973999, "logps/chosen": -678.6322021484375, "logps/rejected": -709.5350341796875, "loss": 0.4008, "rewards/accuracies": 0.75, "rewards/chosen": 1.1776022911071777, "rewards/margins": 0.44003039598464966, "rewards/rejected": 0.7375719547271729, "step": 449 }, { "epoch": 0.3287671232876712, "grad_norm": 63.74998851102838, "learning_rate": 4.1058394160583943e-07, "logits/chosen": -2.8429598808288574, "logits/rejected": -1.461902141571045, "logps/chosen": -726.18701171875, "logps/rejected": -296.8395690917969, "loss": 0.4375, "rewards/accuracies": 1.0, "rewards/chosen": 1.6260573863983154, "rewards/margins": 1.5172761678695679, "rewards/rejected": 0.10878127068281174, "step": 450 }, { "epoch": 0.32949771689497714, "grad_norm": 42.18988997859854, "learning_rate": 4.114963503649635e-07, "logits/chosen": -2.9908549785614014, "logits/rejected": -2.3415210247039795, "logps/chosen": -512.61767578125, "logps/rejected": -426.0896301269531, "loss": 0.3319, "rewards/accuracies": 0.875, "rewards/chosen": 1.6657049655914307, "rewards/margins": 1.3071110248565674, "rewards/rejected": 0.35859397053718567, "step": 451 }, { "epoch": 0.33022831050228313, "grad_norm": 60.907715076658185, "learning_rate": 4.1240875912408754e-07, "logits/chosen": -2.7742722034454346, "logits/rejected": -2.584115982055664, "logps/chosen": -1061.055419921875, "logps/rejected": -875.2849731445312, "loss": 0.3792, "rewards/accuracies": 1.0, "rewards/chosen": 2.3823728561401367, "rewards/margins": 0.9845855236053467, "rewards/rejected": 1.39778733253479, "step": 452 }, { "epoch": 0.33095890410958906, "grad_norm": 61.000290166446135, "learning_rate": 4.133211678832116e-07, "logits/chosen": -3.2646868228912354, "logits/rejected": -2.261916160583496, "logps/chosen": -1288.9542236328125, "logps/rejected": -719.4686889648438, "loss": 0.3968, "rewards/accuracies": 1.0, "rewards/chosen": 1.9012309312820435, "rewards/margins": 1.241621971130371, "rewards/rejected": 0.6596089601516724, "step": 453 }, { "epoch": 0.331689497716895, "grad_norm": 64.19992881129231, "learning_rate": 4.1423357664233576e-07, "logits/chosen": -2.5105385780334473, "logits/rejected": -2.2159953117370605, "logps/chosen": -449.1396789550781, "logps/rejected": -511.2153015136719, "loss": 0.4328, "rewards/accuracies": 0.75, "rewards/chosen": 1.0153608322143555, "rewards/margins": 0.8970168828964233, "rewards/rejected": 0.11834394186735153, "step": 454 }, { "epoch": 0.3324200913242009, "grad_norm": 50.72942307216173, "learning_rate": 4.1514598540145984e-07, "logits/chosen": -2.960087299346924, "logits/rejected": -2.524059295654297, "logps/chosen": -497.42108154296875, "logps/rejected": -373.9635314941406, "loss": 0.4565, "rewards/accuracies": 0.875, "rewards/chosen": 1.211471676826477, "rewards/margins": 1.115602731704712, "rewards/rejected": 0.09586897492408752, "step": 455 }, { "epoch": 0.33315068493150685, "grad_norm": 64.40593158582135, "learning_rate": 4.160583941605839e-07, "logits/chosen": -2.806727409362793, "logits/rejected": -2.2392923831939697, "logps/chosen": -617.2734985351562, "logps/rejected": -335.9648132324219, "loss": 0.4021, "rewards/accuracies": 1.0, "rewards/chosen": 1.4272794723510742, "rewards/margins": 1.0357335805892944, "rewards/rejected": 0.39154574275016785, "step": 456 }, { "epoch": 0.3338812785388128, "grad_norm": 53.703136365288294, "learning_rate": 4.16970802919708e-07, "logits/chosen": -2.6469757556915283, "logits/rejected": -2.5058865547180176, "logps/chosen": -450.0065612792969, "logps/rejected": -385.09857177734375, "loss": 0.4134, "rewards/accuracies": 0.625, "rewards/chosen": 1.3327714204788208, "rewards/margins": 0.8332525491714478, "rewards/rejected": 0.49951887130737305, "step": 457 }, { "epoch": 0.3346118721461187, "grad_norm": 59.30073480520709, "learning_rate": 4.178832116788321e-07, "logits/chosen": -3.008760690689087, "logits/rejected": -2.715238094329834, "logps/chosen": -631.0863037109375, "logps/rejected": -564.241455078125, "loss": 0.4665, "rewards/accuracies": 0.875, "rewards/chosen": 1.8138407468795776, "rewards/margins": 1.5706055164337158, "rewards/rejected": 0.24323530495166779, "step": 458 }, { "epoch": 0.33534246575342463, "grad_norm": 54.90468480979105, "learning_rate": 4.1879562043795617e-07, "logits/chosen": -2.532738208770752, "logits/rejected": -1.8002936840057373, "logps/chosen": -406.89739990234375, "logps/rejected": -410.81170654296875, "loss": 0.3728, "rewards/accuracies": 0.875, "rewards/chosen": 0.8868148326873779, "rewards/margins": 1.105324149131775, "rewards/rejected": -0.21850939095020294, "step": 459 }, { "epoch": 0.3360730593607306, "grad_norm": 69.06600669830274, "learning_rate": 4.1970802919708026e-07, "logits/chosen": -2.6097769737243652, "logits/rejected": -2.109340190887451, "logps/chosen": -514.699462890625, "logps/rejected": -529.9578247070312, "loss": 0.5091, "rewards/accuracies": 0.875, "rewards/chosen": 1.1837414503097534, "rewards/margins": 0.8878669738769531, "rewards/rejected": 0.2958744168281555, "step": 460 }, { "epoch": 0.33680365296803655, "grad_norm": 78.29202458284783, "learning_rate": 4.206204379562044e-07, "logits/chosen": -2.843853235244751, "logits/rejected": -2.0870094299316406, "logps/chosen": -554.185302734375, "logps/rejected": -597.2554321289062, "loss": 0.427, "rewards/accuracies": 0.875, "rewards/chosen": 2.1215691566467285, "rewards/margins": 1.9146724939346313, "rewards/rejected": 0.2068966031074524, "step": 461 }, { "epoch": 0.3375342465753425, "grad_norm": 77.17697612974513, "learning_rate": 4.2153284671532847e-07, "logits/chosen": -2.802995204925537, "logits/rejected": -2.3710684776306152, "logps/chosen": -698.1070556640625, "logps/rejected": -736.10546875, "loss": 0.4741, "rewards/accuracies": 0.75, "rewards/chosen": 2.677177906036377, "rewards/margins": 1.521966576576233, "rewards/rejected": 1.1552114486694336, "step": 462 }, { "epoch": 0.3382648401826484, "grad_norm": 64.75295601360605, "learning_rate": 4.2244525547445256e-07, "logits/chosen": -2.4766945838928223, "logits/rejected": -2.448213815689087, "logps/chosen": -587.4860229492188, "logps/rejected": -570.84716796875, "loss": 0.4422, "rewards/accuracies": 0.75, "rewards/chosen": 1.7875216007232666, "rewards/margins": 1.1412079334259033, "rewards/rejected": 0.6463136672973633, "step": 463 }, { "epoch": 0.33899543378995434, "grad_norm": 55.74761766210283, "learning_rate": 4.233576642335766e-07, "logits/chosen": -3.1569442749023438, "logits/rejected": -1.9278342723846436, "logps/chosen": -848.021728515625, "logps/rejected": -480.4306640625, "loss": 0.3894, "rewards/accuracies": 0.875, "rewards/chosen": 2.9725840091705322, "rewards/margins": 2.4504005908966064, "rewards/rejected": 0.5221832990646362, "step": 464 }, { "epoch": 0.33972602739726027, "grad_norm": 51.708902008019855, "learning_rate": 4.242700729927007e-07, "logits/chosen": -2.8420612812042236, "logits/rejected": -2.223698854446411, "logps/chosen": -782.1868286132812, "logps/rejected": -593.903076171875, "loss": 0.314, "rewards/accuracies": 0.875, "rewards/chosen": 1.474047303199768, "rewards/margins": 1.1061804294586182, "rewards/rejected": 0.3678668141365051, "step": 465 }, { "epoch": 0.3404566210045662, "grad_norm": 70.67244672828654, "learning_rate": 4.251824817518248e-07, "logits/chosen": -2.425661563873291, "logits/rejected": -2.2976784706115723, "logps/chosen": -804.0443115234375, "logps/rejected": -790.4698486328125, "loss": 0.52, "rewards/accuracies": 0.875, "rewards/chosen": 1.817954182624817, "rewards/margins": 0.8594198822975159, "rewards/rejected": 0.9585342407226562, "step": 466 }, { "epoch": 0.3411872146118721, "grad_norm": 68.13929752490482, "learning_rate": 4.260948905109489e-07, "logits/chosen": -3.4037444591522217, "logits/rejected": -2.4079861640930176, "logps/chosen": -923.4427490234375, "logps/rejected": -646.1881103515625, "loss": 0.4171, "rewards/accuracies": 0.875, "rewards/chosen": 2.2777562141418457, "rewards/margins": 1.6237995624542236, "rewards/rejected": 0.6539567708969116, "step": 467 }, { "epoch": 0.3419178082191781, "grad_norm": 78.87537762389795, "learning_rate": 4.2700729927007297e-07, "logits/chosen": -2.638483762741089, "logits/rejected": -2.6184940338134766, "logps/chosen": -389.16680908203125, "logps/rejected": -444.55877685546875, "loss": 0.5618, "rewards/accuracies": 0.5, "rewards/chosen": 1.222074270248413, "rewards/margins": 0.3711809515953064, "rewards/rejected": 0.8508932590484619, "step": 468 }, { "epoch": 0.34264840182648404, "grad_norm": 62.87803251260531, "learning_rate": 4.279197080291971e-07, "logits/chosen": -2.5316052436828613, "logits/rejected": -2.0709147453308105, "logps/chosen": -585.155517578125, "logps/rejected": -480.6671142578125, "loss": 0.4569, "rewards/accuracies": 0.875, "rewards/chosen": 1.2227823734283447, "rewards/margins": 1.1726853847503662, "rewards/rejected": 0.05009707808494568, "step": 469 }, { "epoch": 0.34337899543378997, "grad_norm": 50.39677530296457, "learning_rate": 4.2883211678832113e-07, "logits/chosen": -2.961728096008301, "logits/rejected": -1.9741758108139038, "logps/chosen": -647.5169067382812, "logps/rejected": -383.69305419921875, "loss": 0.3365, "rewards/accuracies": 1.0, "rewards/chosen": 1.321942687034607, "rewards/margins": 0.962601900100708, "rewards/rejected": 0.3593407869338989, "step": 470 }, { "epoch": 0.3441095890410959, "grad_norm": 76.14421452907634, "learning_rate": 4.297445255474452e-07, "logits/chosen": -2.858926773071289, "logits/rejected": -2.8106939792633057, "logps/chosen": -515.6644287109375, "logps/rejected": -458.9950256347656, "loss": 0.5824, "rewards/accuracies": 0.625, "rewards/chosen": 1.3200807571411133, "rewards/margins": 0.5785439610481262, "rewards/rejected": 0.7415368556976318, "step": 471 }, { "epoch": 0.3448401826484018, "grad_norm": 54.2445543781816, "learning_rate": 4.306569343065693e-07, "logits/chosen": -2.723400592803955, "logits/rejected": -2.3749923706054688, "logps/chosen": -548.376220703125, "logps/rejected": -438.18646240234375, "loss": 0.4306, "rewards/accuracies": 0.5, "rewards/chosen": 1.4096403121948242, "rewards/margins": 1.092281699180603, "rewards/rejected": 0.3173587918281555, "step": 472 }, { "epoch": 0.34557077625570776, "grad_norm": 62.62609155735331, "learning_rate": 4.3156934306569343e-07, "logits/chosen": -2.75776743888855, "logits/rejected": -1.997426152229309, "logps/chosen": -364.0853271484375, "logps/rejected": -290.5821838378906, "loss": 0.4349, "rewards/accuracies": 0.75, "rewards/chosen": 1.3918296098709106, "rewards/margins": 1.724780559539795, "rewards/rejected": -0.33295106887817383, "step": 473 }, { "epoch": 0.3463013698630137, "grad_norm": 66.0491131777142, "learning_rate": 4.324817518248175e-07, "logits/chosen": -2.9366354942321777, "logits/rejected": -1.4342886209487915, "logps/chosen": -377.1177978515625, "logps/rejected": -223.9526824951172, "loss": 0.3578, "rewards/accuracies": 1.0, "rewards/chosen": 2.230846881866455, "rewards/margins": 2.713860034942627, "rewards/rejected": -0.48301297426223755, "step": 474 }, { "epoch": 0.3470319634703196, "grad_norm": 66.30021313479128, "learning_rate": 4.333941605839416e-07, "logits/chosen": -1.9881703853607178, "logits/rejected": -2.3498291969299316, "logps/chosen": -524.0934448242188, "logps/rejected": -645.16650390625, "loss": 0.3711, "rewards/accuracies": 0.875, "rewards/chosen": 1.511301875114441, "rewards/margins": 1.0202102661132812, "rewards/rejected": 0.4910915493965149, "step": 475 }, { "epoch": 0.34776255707762554, "grad_norm": 46.183913405230996, "learning_rate": 4.343065693430657e-07, "logits/chosen": -2.603783130645752, "logits/rejected": -1.510715126991272, "logps/chosen": -868.3519287109375, "logps/rejected": -476.92498779296875, "loss": 0.3226, "rewards/accuracies": 1.0, "rewards/chosen": 2.3948564529418945, "rewards/margins": 2.0134544372558594, "rewards/rejected": 0.3814019560813904, "step": 476 }, { "epoch": 0.34849315068493153, "grad_norm": 60.022814672839196, "learning_rate": 4.3521897810218976e-07, "logits/chosen": -2.464519500732422, "logits/rejected": -1.3993675708770752, "logps/chosen": -632.828125, "logps/rejected": -481.4486999511719, "loss": 0.4154, "rewards/accuracies": 0.75, "rewards/chosen": 2.449397087097168, "rewards/margins": 2.4115958213806152, "rewards/rejected": 0.03780154883861542, "step": 477 }, { "epoch": 0.34922374429223746, "grad_norm": 57.30522590493493, "learning_rate": 4.3613138686131384e-07, "logits/chosen": -2.1817474365234375, "logits/rejected": -2.127521514892578, "logps/chosen": -521.425537109375, "logps/rejected": -523.473876953125, "loss": 0.432, "rewards/accuracies": 0.625, "rewards/chosen": 1.643643856048584, "rewards/margins": 1.158881664276123, "rewards/rejected": 0.48476219177246094, "step": 478 }, { "epoch": 0.3499543378995434, "grad_norm": 79.06732013468807, "learning_rate": 4.370437956204379e-07, "logits/chosen": -3.0991392135620117, "logits/rejected": -2.5649149417877197, "logps/chosen": -599.0025634765625, "logps/rejected": -550.489990234375, "loss": 0.5983, "rewards/accuracies": 0.5, "rewards/chosen": 0.6818256974220276, "rewards/margins": 0.3210504353046417, "rewards/rejected": 0.3607751727104187, "step": 479 }, { "epoch": 0.3506849315068493, "grad_norm": 84.95836526278622, "learning_rate": 4.3795620437956206e-07, "logits/chosen": -3.188084602355957, "logits/rejected": -2.811552047729492, "logps/chosen": -582.2879638671875, "logps/rejected": -531.4417724609375, "loss": 0.5565, "rewards/accuracies": 0.5, "rewards/chosen": 1.7438433170318604, "rewards/margins": 0.5478262305259705, "rewards/rejected": 1.1960170269012451, "step": 480 }, { "epoch": 0.35141552511415525, "grad_norm": 60.29656727700695, "learning_rate": 4.388686131386861e-07, "logits/chosen": -2.7621748447418213, "logits/rejected": -2.0302655696868896, "logps/chosen": -665.84423828125, "logps/rejected": -489.160400390625, "loss": 0.4144, "rewards/accuracies": 0.875, "rewards/chosen": 2.3876450061798096, "rewards/margins": 1.6428532600402832, "rewards/rejected": 0.7447916269302368, "step": 481 }, { "epoch": 0.3521461187214612, "grad_norm": 65.79892131999581, "learning_rate": 4.3978102189781017e-07, "logits/chosen": -2.7387123107910156, "logits/rejected": -2.221705436706543, "logps/chosen": -792.83740234375, "logps/rejected": -723.4072875976562, "loss": 0.4096, "rewards/accuracies": 0.75, "rewards/chosen": 2.5408849716186523, "rewards/margins": 1.9542235136032104, "rewards/rejected": 0.5866614580154419, "step": 482 }, { "epoch": 0.3528767123287671, "grad_norm": 54.772220091484336, "learning_rate": 4.4069343065693426e-07, "logits/chosen": -3.276000499725342, "logits/rejected": -2.7449522018432617, "logps/chosen": -698.1395263671875, "logps/rejected": -461.02813720703125, "loss": 0.4234, "rewards/accuracies": 0.75, "rewards/chosen": 2.046205759048462, "rewards/margins": 1.5468347072601318, "rewards/rejected": 0.49937117099761963, "step": 483 }, { "epoch": 0.35360730593607304, "grad_norm": 80.28781235343257, "learning_rate": 4.416058394160584e-07, "logits/chosen": -2.7670040130615234, "logits/rejected": -2.0220742225646973, "logps/chosen": -537.859130859375, "logps/rejected": -431.42608642578125, "loss": 0.5335, "rewards/accuracies": 0.75, "rewards/chosen": 1.7679325342178345, "rewards/margins": 1.15652334690094, "rewards/rejected": 0.611409068107605, "step": 484 }, { "epoch": 0.354337899543379, "grad_norm": 51.27791493954171, "learning_rate": 4.4251824817518247e-07, "logits/chosen": -2.4482390880584717, "logits/rejected": -2.163296937942505, "logps/chosen": -587.911376953125, "logps/rejected": -518.1017456054688, "loss": 0.3317, "rewards/accuracies": 0.625, "rewards/chosen": 1.9359805583953857, "rewards/margins": 1.0439302921295166, "rewards/rejected": 0.8920501470565796, "step": 485 }, { "epoch": 0.35506849315068495, "grad_norm": 62.8782480056945, "learning_rate": 4.4343065693430656e-07, "logits/chosen": -2.7363409996032715, "logits/rejected": -1.5855985879898071, "logps/chosen": -344.87030029296875, "logps/rejected": -246.51422119140625, "loss": 0.4594, "rewards/accuracies": 0.875, "rewards/chosen": 1.542357087135315, "rewards/margins": 1.8142194747924805, "rewards/rejected": -0.2718624174594879, "step": 486 }, { "epoch": 0.3557990867579909, "grad_norm": 62.72683446276203, "learning_rate": 4.4434306569343064e-07, "logits/chosen": -2.439805030822754, "logits/rejected": -1.9321544170379639, "logps/chosen": -719.8522338867188, "logps/rejected": -659.0249633789062, "loss": 0.3491, "rewards/accuracies": 0.875, "rewards/chosen": 2.0906620025634766, "rewards/margins": 1.227176308631897, "rewards/rejected": 0.8634856343269348, "step": 487 }, { "epoch": 0.3565296803652968, "grad_norm": 53.79686138541797, "learning_rate": 4.452554744525547e-07, "logits/chosen": -2.2773277759552, "logits/rejected": -1.8291007280349731, "logps/chosen": -473.55657958984375, "logps/rejected": -530.5, "loss": 0.37, "rewards/accuracies": 0.75, "rewards/chosen": 1.3206727504730225, "rewards/margins": 1.241703987121582, "rewards/rejected": 0.07896875590085983, "step": 488 }, { "epoch": 0.35726027397260274, "grad_norm": 69.2598142864966, "learning_rate": 4.461678832116788e-07, "logits/chosen": -3.1761038303375244, "logits/rejected": -2.62374210357666, "logps/chosen": -687.275390625, "logps/rejected": -544.0106811523438, "loss": 0.3756, "rewards/accuracies": 0.875, "rewards/chosen": 1.667771339416504, "rewards/margins": 0.6941277980804443, "rewards/rejected": 0.9736435413360596, "step": 489 }, { "epoch": 0.35799086757990867, "grad_norm": 63.50932279619465, "learning_rate": 4.470802919708029e-07, "logits/chosen": -2.9813907146453857, "logits/rejected": -2.0002570152282715, "logps/chosen": -944.7573852539062, "logps/rejected": -587.4722290039062, "loss": 0.3963, "rewards/accuracies": 0.75, "rewards/chosen": 2.1965575218200684, "rewards/margins": 1.0841304063796997, "rewards/rejected": 1.112426996231079, "step": 490 }, { "epoch": 0.3587214611872146, "grad_norm": 47.480238487975186, "learning_rate": 4.47992700729927e-07, "logits/chosen": -2.7100729942321777, "logits/rejected": -2.1465792655944824, "logps/chosen": -646.3826904296875, "logps/rejected": -395.20263671875, "loss": 0.2873, "rewards/accuracies": 0.875, "rewards/chosen": 2.593662977218628, "rewards/margins": 2.4480323791503906, "rewards/rejected": 0.1456303894519806, "step": 491 }, { "epoch": 0.3594520547945205, "grad_norm": 70.9239145856759, "learning_rate": 4.489051094890511e-07, "logits/chosen": -2.335470676422119, "logits/rejected": -2.078129291534424, "logps/chosen": -529.7578125, "logps/rejected": -457.38824462890625, "loss": 0.5471, "rewards/accuracies": 0.75, "rewards/chosen": 0.880675196647644, "rewards/margins": 0.6197315454483032, "rewards/rejected": 0.26094359159469604, "step": 492 }, { "epoch": 0.3601826484018265, "grad_norm": 51.90011393216661, "learning_rate": 4.4981751824817513e-07, "logits/chosen": -2.8168346881866455, "logits/rejected": -1.6534855365753174, "logps/chosen": -603.7760009765625, "logps/rejected": -382.0234375, "loss": 0.3101, "rewards/accuracies": 1.0, "rewards/chosen": 2.184333324432373, "rewards/margins": 2.114604949951172, "rewards/rejected": 0.06972840428352356, "step": 493 }, { "epoch": 0.36091324200913244, "grad_norm": 86.19658955423498, "learning_rate": 4.507299270072992e-07, "logits/chosen": -3.178004503250122, "logits/rejected": -2.261016607284546, "logps/chosen": -919.9517211914062, "logps/rejected": -722.8470458984375, "loss": 0.5516, "rewards/accuracies": 1.0, "rewards/chosen": 2.5614163875579834, "rewards/margins": 1.7225550413131714, "rewards/rejected": 0.8388614654541016, "step": 494 }, { "epoch": 0.36164383561643837, "grad_norm": 55.975761617613344, "learning_rate": 4.5164233576642335e-07, "logits/chosen": -3.515103340148926, "logits/rejected": -1.9409109354019165, "logps/chosen": -643.4150390625, "logps/rejected": -276.8206787109375, "loss": 0.401, "rewards/accuracies": 0.875, "rewards/chosen": 1.4676244258880615, "rewards/margins": 1.435754656791687, "rewards/rejected": 0.03186975419521332, "step": 495 }, { "epoch": 0.3623744292237443, "grad_norm": 79.34655309640898, "learning_rate": 4.5255474452554743e-07, "logits/chosen": -3.047654151916504, "logits/rejected": -2.300143241882324, "logps/chosen": -862.2069091796875, "logps/rejected": -656.7689208984375, "loss": 0.6059, "rewards/accuracies": 0.625, "rewards/chosen": 1.9358177185058594, "rewards/margins": 1.0283113718032837, "rewards/rejected": 0.9075063467025757, "step": 496 }, { "epoch": 0.36310502283105023, "grad_norm": 63.82679963434247, "learning_rate": 4.534671532846715e-07, "logits/chosen": -3.637629508972168, "logits/rejected": -2.411752700805664, "logps/chosen": -833.9844360351562, "logps/rejected": -673.55126953125, "loss": 0.4027, "rewards/accuracies": 1.0, "rewards/chosen": 2.63371205329895, "rewards/margins": 2.039763927459717, "rewards/rejected": 0.5939480066299438, "step": 497 }, { "epoch": 0.36383561643835616, "grad_norm": 72.71788044710014, "learning_rate": 4.543795620437956e-07, "logits/chosen": -3.1350011825561523, "logits/rejected": -2.517909049987793, "logps/chosen": -676.7994995117188, "logps/rejected": -517.3711547851562, "loss": 0.4709, "rewards/accuracies": 0.75, "rewards/chosen": 1.6492921113967896, "rewards/margins": 1.5627813339233398, "rewards/rejected": 0.08651086688041687, "step": 498 }, { "epoch": 0.3645662100456621, "grad_norm": 48.4256563141185, "learning_rate": 4.552919708029197e-07, "logits/chosen": -3.3894386291503906, "logits/rejected": -2.5502047538757324, "logps/chosen": -500.265869140625, "logps/rejected": -357.89862060546875, "loss": 0.3473, "rewards/accuracies": 0.75, "rewards/chosen": 1.933529257774353, "rewards/margins": 1.457592487335205, "rewards/rejected": 0.47593677043914795, "step": 499 }, { "epoch": 0.365296803652968, "grad_norm": 89.03962151322719, "learning_rate": 4.5620437956204376e-07, "logits/chosen": -2.85481595993042, "logits/rejected": -2.7935023307800293, "logps/chosen": -569.89404296875, "logps/rejected": -586.3229370117188, "loss": 0.6819, "rewards/accuracies": 0.625, "rewards/chosen": 0.8543252944946289, "rewards/margins": 0.49327659606933594, "rewards/rejected": 0.36104869842529297, "step": 500 }, { "epoch": 0.36602739726027395, "grad_norm": 74.23372217403961, "learning_rate": 4.5711678832116784e-07, "logits/chosen": -2.4280965328216553, "logits/rejected": -2.269599676132202, "logps/chosen": -587.1246337890625, "logps/rejected": -575.2628173828125, "loss": 0.5148, "rewards/accuracies": 0.625, "rewards/chosen": 1.276087999343872, "rewards/margins": 0.5552183389663696, "rewards/rejected": 0.7208697199821472, "step": 501 }, { "epoch": 0.36675799086757993, "grad_norm": 69.45238504948912, "learning_rate": 4.58029197080292e-07, "logits/chosen": -2.7813289165496826, "logits/rejected": -1.9194908142089844, "logps/chosen": -476.7442626953125, "logps/rejected": -220.19558715820312, "loss": 0.4399, "rewards/accuracies": 0.75, "rewards/chosen": 0.9025549292564392, "rewards/margins": 0.4284529983997345, "rewards/rejected": 0.4741019606590271, "step": 502 }, { "epoch": 0.36748858447488586, "grad_norm": 56.781980939158856, "learning_rate": 4.5894160583941606e-07, "logits/chosen": -2.787536144256592, "logits/rejected": -2.698723554611206, "logps/chosen": -478.7005920410156, "logps/rejected": -433.94561767578125, "loss": 0.5387, "rewards/accuracies": 0.75, "rewards/chosen": 1.1532747745513916, "rewards/margins": 0.533678412437439, "rewards/rejected": 0.6195963621139526, "step": 503 }, { "epoch": 0.3682191780821918, "grad_norm": 61.30379692627631, "learning_rate": 4.5985401459854014e-07, "logits/chosen": -2.7511420249938965, "logits/rejected": -2.0617220401763916, "logps/chosen": -816.4799194335938, "logps/rejected": -684.9895629882812, "loss": 0.3923, "rewards/accuracies": 0.875, "rewards/chosen": 3.3105058670043945, "rewards/margins": 2.0367472171783447, "rewards/rejected": 1.2737585306167603, "step": 504 }, { "epoch": 0.3689497716894977, "grad_norm": 67.57244376408183, "learning_rate": 4.6076642335766417e-07, "logits/chosen": -3.0178751945495605, "logits/rejected": -2.242319345474243, "logps/chosen": -581.9234008789062, "logps/rejected": -342.6377868652344, "loss": 0.4694, "rewards/accuracies": 0.875, "rewards/chosen": 1.434507131576538, "rewards/margins": 0.814419686794281, "rewards/rejected": 0.6200873255729675, "step": 505 }, { "epoch": 0.36968036529680365, "grad_norm": 75.51915371695448, "learning_rate": 4.616788321167883e-07, "logits/chosen": -3.0656938552856445, "logits/rejected": -2.2197065353393555, "logps/chosen": -530.8633422851562, "logps/rejected": -309.808837890625, "loss": 0.5401, "rewards/accuracies": 0.75, "rewards/chosen": 1.870051383972168, "rewards/margins": 1.7016440629959106, "rewards/rejected": 0.16840726137161255, "step": 506 }, { "epoch": 0.3704109589041096, "grad_norm": 44.03255298817285, "learning_rate": 4.625912408759124e-07, "logits/chosen": -3.0813205242156982, "logits/rejected": -2.3189280033111572, "logps/chosen": -680.5758056640625, "logps/rejected": -500.8650817871094, "loss": 0.3664, "rewards/accuracies": 0.625, "rewards/chosen": 1.5526847839355469, "rewards/margins": 0.6460072994232178, "rewards/rejected": 0.9066774845123291, "step": 507 }, { "epoch": 0.3711415525114155, "grad_norm": 75.51452739477517, "learning_rate": 4.6350364963503647e-07, "logits/chosen": -2.843088150024414, "logits/rejected": -2.174595832824707, "logps/chosen": -483.7886962890625, "logps/rejected": -358.3643493652344, "loss": 0.4764, "rewards/accuracies": 0.875, "rewards/chosen": 1.9911956787109375, "rewards/margins": 1.761576771736145, "rewards/rejected": 0.22961902618408203, "step": 508 }, { "epoch": 0.37187214611872144, "grad_norm": 62.54804988904031, "learning_rate": 4.6441605839416055e-07, "logits/chosen": -2.956127166748047, "logits/rejected": -2.43821120262146, "logps/chosen": -760.0145263671875, "logps/rejected": -677.0914916992188, "loss": 0.4102, "rewards/accuracies": 0.75, "rewards/chosen": 2.3443243503570557, "rewards/margins": 1.1770540475845337, "rewards/rejected": 1.167270302772522, "step": 509 }, { "epoch": 0.3726027397260274, "grad_norm": 69.09266521649566, "learning_rate": 4.653284671532847e-07, "logits/chosen": -2.5106935501098633, "logits/rejected": -1.9278130531311035, "logps/chosen": -436.4704895019531, "logps/rejected": -325.67303466796875, "loss": 0.4876, "rewards/accuracies": 0.625, "rewards/chosen": 1.4316396713256836, "rewards/margins": 1.52362060546875, "rewards/rejected": -0.09198087453842163, "step": 510 }, { "epoch": 0.37333333333333335, "grad_norm": 77.75822341459295, "learning_rate": 4.662408759124087e-07, "logits/chosen": -2.521353244781494, "logits/rejected": -2.429396867752075, "logps/chosen": -637.2588500976562, "logps/rejected": -641.7410888671875, "loss": 0.5094, "rewards/accuracies": 0.75, "rewards/chosen": 1.703089952468872, "rewards/margins": 1.0730019807815552, "rewards/rejected": 0.6300878524780273, "step": 511 }, { "epoch": 0.3740639269406393, "grad_norm": 47.88740813013244, "learning_rate": 4.671532846715328e-07, "logits/chosen": -2.9799201488494873, "logits/rejected": -2.3554160594940186, "logps/chosen": -525.6898193359375, "logps/rejected": -385.346435546875, "loss": 0.3468, "rewards/accuracies": 0.75, "rewards/chosen": 1.7812577486038208, "rewards/margins": 1.504034161567688, "rewards/rejected": 0.2772236764431, "step": 512 }, { "epoch": 0.3747945205479452, "grad_norm": 64.78205272657505, "learning_rate": 4.6806569343065694e-07, "logits/chosen": -2.7819478511810303, "logits/rejected": -2.227480173110962, "logps/chosen": -828.316650390625, "logps/rejected": -583.86865234375, "loss": 0.4631, "rewards/accuracies": 0.875, "rewards/chosen": 2.2291476726531982, "rewards/margins": 1.8433446884155273, "rewards/rejected": 0.3858029842376709, "step": 513 }, { "epoch": 0.37552511415525114, "grad_norm": 64.79053162524019, "learning_rate": 4.68978102189781e-07, "logits/chosen": -2.7863426208496094, "logits/rejected": -2.30100679397583, "logps/chosen": -691.4600830078125, "logps/rejected": -529.8033447265625, "loss": 0.4439, "rewards/accuracies": 0.625, "rewards/chosen": 1.5400118827819824, "rewards/margins": 1.0513503551483154, "rewards/rejected": 0.4886617064476013, "step": 514 }, { "epoch": 0.37625570776255707, "grad_norm": 72.2835805396011, "learning_rate": 4.698905109489051e-07, "logits/chosen": -2.781254529953003, "logits/rejected": -1.9940603971481323, "logps/chosen": -429.96246337890625, "logps/rejected": -333.8828125, "loss": 0.5375, "rewards/accuracies": 0.75, "rewards/chosen": 1.6249737739562988, "rewards/margins": 1.3107314109802246, "rewards/rejected": 0.31424224376678467, "step": 515 }, { "epoch": 0.376986301369863, "grad_norm": 59.84351746038371, "learning_rate": 4.7080291970802913e-07, "logits/chosen": -3.1960225105285645, "logits/rejected": -1.3619349002838135, "logps/chosen": -723.3856811523438, "logps/rejected": -303.71527099609375, "loss": 0.3843, "rewards/accuracies": 1.0, "rewards/chosen": 1.9630160331726074, "rewards/margins": 2.162031412124634, "rewards/rejected": -0.19901546835899353, "step": 516 }, { "epoch": 0.3777168949771689, "grad_norm": 71.91057543487605, "learning_rate": 4.7171532846715327e-07, "logits/chosen": -2.4896035194396973, "logits/rejected": -2.0762686729431152, "logps/chosen": -450.95550537109375, "logps/rejected": -249.24749755859375, "loss": 0.585, "rewards/accuracies": 1.0, "rewards/chosen": 0.824489176273346, "rewards/margins": 0.732171893119812, "rewards/rejected": 0.09231729805469513, "step": 517 }, { "epoch": 0.3784474885844749, "grad_norm": 44.78750419733793, "learning_rate": 4.7262773722627735e-07, "logits/chosen": -2.8693153858184814, "logits/rejected": -1.644311785697937, "logps/chosen": -823.3690185546875, "logps/rejected": -391.984619140625, "loss": 0.3267, "rewards/accuracies": 0.875, "rewards/chosen": 3.0998787879943848, "rewards/margins": 2.616203784942627, "rewards/rejected": 0.48367467522621155, "step": 518 }, { "epoch": 0.37917808219178084, "grad_norm": 69.88041193822666, "learning_rate": 4.7354014598540143e-07, "logits/chosen": -2.884032726287842, "logits/rejected": -2.2567901611328125, "logps/chosen": -816.2914428710938, "logps/rejected": -863.23095703125, "loss": 0.4377, "rewards/accuracies": 0.625, "rewards/chosen": 1.7766268253326416, "rewards/margins": 0.5266602039337158, "rewards/rejected": 1.2499666213989258, "step": 519 }, { "epoch": 0.37990867579908677, "grad_norm": 65.56508797918866, "learning_rate": 4.744525547445255e-07, "logits/chosen": -2.650399684906006, "logits/rejected": -2.1389453411102295, "logps/chosen": -731.236328125, "logps/rejected": -779.4322509765625, "loss": 0.4109, "rewards/accuracies": 0.875, "rewards/chosen": 3.3160314559936523, "rewards/margins": 2.7350659370422363, "rewards/rejected": 0.5809656977653503, "step": 520 }, { "epoch": 0.3806392694063927, "grad_norm": 72.59208524242673, "learning_rate": 4.7536496350364965e-07, "logits/chosen": -3.064652919769287, "logits/rejected": -1.962098479270935, "logps/chosen": -679.291015625, "logps/rejected": -544.0205078125, "loss": 0.5223, "rewards/accuracies": 0.75, "rewards/chosen": 1.9479374885559082, "rewards/margins": 1.4941027164459229, "rewards/rejected": 0.4538347125053406, "step": 521 }, { "epoch": 0.38136986301369863, "grad_norm": 68.06542693431078, "learning_rate": 4.762773722627737e-07, "logits/chosen": -2.9728572368621826, "logits/rejected": -2.191321611404419, "logps/chosen": -946.5797729492188, "logps/rejected": -578.88525390625, "loss": 0.4229, "rewards/accuracies": 1.0, "rewards/chosen": 3.0159554481506348, "rewards/margins": 2.3652968406677246, "rewards/rejected": 0.6506587266921997, "step": 522 }, { "epoch": 0.38210045662100456, "grad_norm": 67.21606224287093, "learning_rate": 4.771897810218978e-07, "logits/chosen": -2.8929264545440674, "logits/rejected": -1.7778148651123047, "logps/chosen": -897.705810546875, "logps/rejected": -658.7991943359375, "loss": 0.406, "rewards/accuracies": 0.75, "rewards/chosen": 2.486989736557007, "rewards/margins": 2.155250310897827, "rewards/rejected": 0.331739604473114, "step": 523 }, { "epoch": 0.3828310502283105, "grad_norm": 51.442896322342214, "learning_rate": 4.781021897810219e-07, "logits/chosen": -2.746321678161621, "logits/rejected": -1.9860708713531494, "logps/chosen": -819.6581420898438, "logps/rejected": -639.1848754882812, "loss": 0.3279, "rewards/accuracies": 0.875, "rewards/chosen": 2.3846042156219482, "rewards/margins": 1.7962331771850586, "rewards/rejected": 0.5883710980415344, "step": 524 }, { "epoch": 0.3835616438356164, "grad_norm": 87.58838449050336, "learning_rate": 4.790145985401459e-07, "logits/chosen": -2.5863211154937744, "logits/rejected": -2.170370578765869, "logps/chosen": -774.0906372070312, "logps/rejected": -586.9810180664062, "loss": 0.6303, "rewards/accuracies": 0.75, "rewards/chosen": 2.1122446060180664, "rewards/margins": 0.8589416146278381, "rewards/rejected": 1.253303050994873, "step": 525 }, { "epoch": 0.3842922374429224, "grad_norm": 78.00700464341327, "learning_rate": 4.799270072992701e-07, "logits/chosen": -2.2712764739990234, "logits/rejected": -2.4478464126586914, "logps/chosen": -831.0255737304688, "logps/rejected": -756.2081298828125, "loss": 0.4746, "rewards/accuracies": 0.75, "rewards/chosen": 2.28574800491333, "rewards/margins": 1.1258965730667114, "rewards/rejected": 1.159851312637329, "step": 526 }, { "epoch": 0.38502283105022833, "grad_norm": 69.52721898427622, "learning_rate": 4.808394160583941e-07, "logits/chosen": -2.8852102756500244, "logits/rejected": -2.3716657161712646, "logps/chosen": -801.5132446289062, "logps/rejected": -805.2318115234375, "loss": 0.451, "rewards/accuracies": 0.75, "rewards/chosen": 1.2896099090576172, "rewards/margins": 0.44425299763679504, "rewards/rejected": 0.8453569412231445, "step": 527 }, { "epoch": 0.38575342465753426, "grad_norm": 56.18828863114515, "learning_rate": 4.817518248175182e-07, "logits/chosen": -2.6400134563446045, "logits/rejected": -1.976215124130249, "logps/chosen": -585.368896484375, "logps/rejected": -569.5707397460938, "loss": 0.3675, "rewards/accuracies": 1.0, "rewards/chosen": 2.21443510055542, "rewards/margins": 2.5333971977233887, "rewards/rejected": -0.3189620077610016, "step": 528 }, { "epoch": 0.3864840182648402, "grad_norm": 54.39891715374489, "learning_rate": 4.826642335766424e-07, "logits/chosen": -2.949910879135132, "logits/rejected": -1.906163215637207, "logps/chosen": -787.4466552734375, "logps/rejected": -684.3917846679688, "loss": 0.2958, "rewards/accuracies": 0.875, "rewards/chosen": 1.892282247543335, "rewards/margins": 1.6711459159851074, "rewards/rejected": 0.22113628685474396, "step": 529 }, { "epoch": 0.3872146118721461, "grad_norm": 48.51073596541543, "learning_rate": 4.835766423357664e-07, "logits/chosen": -2.716263771057129, "logits/rejected": -2.494713306427002, "logps/chosen": -457.1477966308594, "logps/rejected": -324.52374267578125, "loss": 0.3601, "rewards/accuracies": 0.625, "rewards/chosen": 1.6857961416244507, "rewards/margins": 1.4473105669021606, "rewards/rejected": 0.23848551511764526, "step": 530 }, { "epoch": 0.38794520547945205, "grad_norm": 63.49631087032242, "learning_rate": 4.844890510948904e-07, "logits/chosen": -3.176542043685913, "logits/rejected": -2.0936145782470703, "logps/chosen": -676.0942993164062, "logps/rejected": -393.605712890625, "loss": 0.4787, "rewards/accuracies": 0.875, "rewards/chosen": 1.5064218044281006, "rewards/margins": 1.0493324995040894, "rewards/rejected": 0.45708924531936646, "step": 531 }, { "epoch": 0.388675799086758, "grad_norm": 73.31595660009832, "learning_rate": 4.854014598540146e-07, "logits/chosen": -2.5391383171081543, "logits/rejected": -2.063931703567505, "logps/chosen": -792.7978515625, "logps/rejected": -716.0550537109375, "loss": 0.4017, "rewards/accuracies": 0.625, "rewards/chosen": 1.4342952966690063, "rewards/margins": 0.7074408531188965, "rewards/rejected": 0.7268545031547546, "step": 532 }, { "epoch": 0.3894063926940639, "grad_norm": 61.4152625383646, "learning_rate": 4.863138686131387e-07, "logits/chosen": -2.722008466720581, "logits/rejected": -2.10689377784729, "logps/chosen": -539.1397705078125, "logps/rejected": -460.3428955078125, "loss": 0.4542, "rewards/accuracies": 0.625, "rewards/chosen": 1.4458706378936768, "rewards/margins": 1.1020680665969849, "rewards/rejected": 0.34380248188972473, "step": 533 }, { "epoch": 0.39013698630136984, "grad_norm": 60.65876355237654, "learning_rate": 4.872262773722627e-07, "logits/chosen": -2.83184552192688, "logits/rejected": -2.265838146209717, "logps/chosen": -877.2551879882812, "logps/rejected": -579.637939453125, "loss": 0.3615, "rewards/accuracies": 0.875, "rewards/chosen": 1.9306552410125732, "rewards/margins": 1.0537360906600952, "rewards/rejected": 0.8769190311431885, "step": 534 }, { "epoch": 0.3908675799086758, "grad_norm": 51.33861907564081, "learning_rate": 4.881386861313869e-07, "logits/chosen": -2.7120285034179688, "logits/rejected": -2.3944711685180664, "logps/chosen": -718.88330078125, "logps/rejected": -598.9680786132812, "loss": 0.3488, "rewards/accuracies": 0.875, "rewards/chosen": 1.621598720550537, "rewards/margins": 1.3062092065811157, "rewards/rejected": 0.3153894543647766, "step": 535 }, { "epoch": 0.39159817351598175, "grad_norm": 55.76028799061994, "learning_rate": 4.89051094890511e-07, "logits/chosen": -3.2516531944274902, "logits/rejected": -3.030851364135742, "logps/chosen": -667.2099609375, "logps/rejected": -618.7260131835938, "loss": 0.4431, "rewards/accuracies": 0.5, "rewards/chosen": 1.1512188911437988, "rewards/margins": 0.2790994644165039, "rewards/rejected": 0.8721193671226501, "step": 536 }, { "epoch": 0.3923287671232877, "grad_norm": 48.507655847105724, "learning_rate": 4.89963503649635e-07, "logits/chosen": -2.8860816955566406, "logits/rejected": -2.225321054458618, "logps/chosen": -857.5056762695312, "logps/rejected": -639.4248657226562, "loss": 0.3372, "rewards/accuracies": 0.875, "rewards/chosen": 1.9480208158493042, "rewards/margins": 1.1861112117767334, "rewards/rejected": 0.761909544467926, "step": 537 }, { "epoch": 0.3930593607305936, "grad_norm": 73.10399003730917, "learning_rate": 4.90875912408759e-07, "logits/chosen": -2.026909112930298, "logits/rejected": -2.1760036945343018, "logps/chosen": -414.9658203125, "logps/rejected": -525.665771484375, "loss": 0.4756, "rewards/accuracies": 0.625, "rewards/chosen": 0.8669200539588928, "rewards/margins": 0.6697767376899719, "rewards/rejected": 0.19714322686195374, "step": 538 }, { "epoch": 0.39378995433789954, "grad_norm": 73.5496816040865, "learning_rate": 4.917883211678832e-07, "logits/chosen": -2.341597318649292, "logits/rejected": -1.9582306146621704, "logps/chosen": -427.2432556152344, "logps/rejected": -442.2970275878906, "loss": 0.484, "rewards/accuracies": 0.5, "rewards/chosen": 1.1716227531433105, "rewards/margins": 0.5874232053756714, "rewards/rejected": 0.5841995477676392, "step": 539 }, { "epoch": 0.39452054794520547, "grad_norm": 52.77170206713537, "learning_rate": 4.927007299270073e-07, "logits/chosen": -3.172924518585205, "logits/rejected": -2.324219226837158, "logps/chosen": -680.4721069335938, "logps/rejected": -534.3718872070312, "loss": 0.3366, "rewards/accuracies": 0.875, "rewards/chosen": 1.6586549282073975, "rewards/margins": 1.4216026067733765, "rewards/rejected": 0.23705236613750458, "step": 540 }, { "epoch": 0.3952511415525114, "grad_norm": 54.425941475262974, "learning_rate": 4.936131386861313e-07, "logits/chosen": -3.0972700119018555, "logits/rejected": -1.95811128616333, "logps/chosen": -752.6707763671875, "logps/rejected": -479.4417724609375, "loss": 0.3037, "rewards/accuracies": 0.75, "rewards/chosen": 1.8118083477020264, "rewards/margins": 1.4969019889831543, "rewards/rejected": 0.3149062991142273, "step": 541 }, { "epoch": 0.3959817351598173, "grad_norm": 53.794006785288126, "learning_rate": 4.945255474452555e-07, "logits/chosen": -2.751338481903076, "logits/rejected": -2.0460236072540283, "logps/chosen": -637.3653564453125, "logps/rejected": -427.2353820800781, "loss": 0.3996, "rewards/accuracies": 1.0, "rewards/chosen": 1.9205002784729004, "rewards/margins": 2.006122589111328, "rewards/rejected": -0.08562223613262177, "step": 542 }, { "epoch": 0.3967123287671233, "grad_norm": 68.8535122731132, "learning_rate": 4.954379562043795e-07, "logits/chosen": -2.779965400695801, "logits/rejected": -2.5629186630249023, "logps/chosen": -894.9219970703125, "logps/rejected": -1128.79736328125, "loss": 0.3979, "rewards/accuracies": 0.75, "rewards/chosen": 1.976457953453064, "rewards/margins": 1.1715644598007202, "rewards/rejected": 0.804893434047699, "step": 543 }, { "epoch": 0.39744292237442924, "grad_norm": 50.61973593387884, "learning_rate": 4.963503649635036e-07, "logits/chosen": -2.749941110610962, "logits/rejected": -1.1100329160690308, "logps/chosen": -640.6730346679688, "logps/rejected": -277.8980407714844, "loss": 0.3068, "rewards/accuracies": 0.75, "rewards/chosen": 3.1976001262664795, "rewards/margins": 3.486421823501587, "rewards/rejected": -0.288821816444397, "step": 544 }, { "epoch": 0.39817351598173517, "grad_norm": 46.854412145495964, "learning_rate": 4.972627737226277e-07, "logits/chosen": -2.711481809616089, "logits/rejected": -2.166970729827881, "logps/chosen": -618.0488891601562, "logps/rejected": -450.3864440917969, "loss": 0.3653, "rewards/accuracies": 0.625, "rewards/chosen": 1.636525273323059, "rewards/margins": 1.484768271446228, "rewards/rejected": 0.15175706148147583, "step": 545 }, { "epoch": 0.3989041095890411, "grad_norm": 73.83049434028767, "learning_rate": 4.981751824817518e-07, "logits/chosen": -3.0917108058929443, "logits/rejected": -2.819300651550293, "logps/chosen": -617.1689453125, "logps/rejected": -614.313232421875, "loss": 0.4903, "rewards/accuracies": 0.625, "rewards/chosen": 0.6272110939025879, "rewards/margins": -0.036435604095458984, "rewards/rejected": 0.6636466979980469, "step": 546 }, { "epoch": 0.39963470319634703, "grad_norm": 51.65427202429026, "learning_rate": 4.99087591240876e-07, "logits/chosen": -2.723130941390991, "logits/rejected": -2.1894121170043945, "logps/chosen": -501.32763671875, "logps/rejected": -449.1488952636719, "loss": 0.3193, "rewards/accuracies": 0.75, "rewards/chosen": 2.101776123046875, "rewards/margins": 2.067674398422241, "rewards/rejected": 0.034101784229278564, "step": 547 }, { "epoch": 0.40036529680365296, "grad_norm": 73.18764344029496, "learning_rate": 5e-07, "logits/chosen": -2.6446592807769775, "logits/rejected": -1.873044490814209, "logps/chosen": -547.710205078125, "logps/rejected": -384.9400939941406, "loss": 0.4518, "rewards/accuracies": 0.75, "rewards/chosen": 1.3343523740768433, "rewards/margins": 1.751215934753418, "rewards/rejected": -0.4168636202812195, "step": 548 }, { "epoch": 0.4010958904109589, "grad_norm": 71.13654961696027, "learning_rate": 4.999999491168892e-07, "logits/chosen": -2.6456356048583984, "logits/rejected": -2.26519775390625, "logps/chosen": -750.01904296875, "logps/rejected": -610.3711547851562, "loss": 0.4641, "rewards/accuracies": 0.5, "rewards/chosen": 2.1573843955993652, "rewards/margins": 0.7288120985031128, "rewards/rejected": 1.4285722970962524, "step": 549 }, { "epoch": 0.4018264840182648, "grad_norm": 60.00328767034353, "learning_rate": 4.999997964675772e-07, "logits/chosen": -2.4860148429870605, "logits/rejected": -2.1443848609924316, "logps/chosen": -488.4798278808594, "logps/rejected": -648.637451171875, "loss": 0.3942, "rewards/accuracies": 0.75, "rewards/chosen": 0.7417255640029907, "rewards/margins": 0.6993524432182312, "rewards/rejected": 0.04237304627895355, "step": 550 }, { "epoch": 0.4025570776255708, "grad_norm": 61.28173205445282, "learning_rate": 4.999995420521264e-07, "logits/chosen": -2.7685437202453613, "logits/rejected": -2.4486818313598633, "logps/chosen": -822.7913208007812, "logps/rejected": -781.766357421875, "loss": 0.3647, "rewards/accuracies": 0.75, "rewards/chosen": 1.5166478157043457, "rewards/margins": 1.2589833736419678, "rewards/rejected": 0.2576645016670227, "step": 551 }, { "epoch": 0.40328767123287673, "grad_norm": 52.94639092988303, "learning_rate": 4.999991858706404e-07, "logits/chosen": -2.7483983039855957, "logits/rejected": -1.854215145111084, "logps/chosen": -712.826171875, "logps/rejected": -620.0610961914062, "loss": 0.3568, "rewards/accuracies": 1.0, "rewards/chosen": 2.3233766555786133, "rewards/margins": 2.5805845260620117, "rewards/rejected": -0.2572076916694641, "step": 552 }, { "epoch": 0.40401826484018266, "grad_norm": 53.93907515081251, "learning_rate": 4.999987279232639e-07, "logits/chosen": -2.7303340435028076, "logits/rejected": -1.9902458190917969, "logps/chosen": -687.9735717773438, "logps/rejected": -511.83880615234375, "loss": 0.3877, "rewards/accuracies": 0.875, "rewards/chosen": 1.8967864513397217, "rewards/margins": 1.6424503326416016, "rewards/rejected": 0.2543361783027649, "step": 553 }, { "epoch": 0.4047488584474886, "grad_norm": 69.8607676415651, "learning_rate": 4.999981682101836e-07, "logits/chosen": -2.5249598026275635, "logits/rejected": -1.7185486555099487, "logps/chosen": -525.4502563476562, "logps/rejected": -442.1912536621094, "loss": 0.4172, "rewards/accuracies": 0.625, "rewards/chosen": 1.2686212062835693, "rewards/margins": 1.2990355491638184, "rewards/rejected": -0.030414391309022903, "step": 554 }, { "epoch": 0.4054794520547945, "grad_norm": 71.71498492295736, "learning_rate": 4.999975067316271e-07, "logits/chosen": -3.0615174770355225, "logits/rejected": -2.218263626098633, "logps/chosen": -796.2058715820312, "logps/rejected": -441.8481140136719, "loss": 0.4307, "rewards/accuracies": 0.75, "rewards/chosen": 2.77457857131958, "rewards/margins": 2.514038324356079, "rewards/rejected": 0.2605401873588562, "step": 555 }, { "epoch": 0.40621004566210045, "grad_norm": 64.04091480866938, "learning_rate": 4.999967434878639e-07, "logits/chosen": -2.5201101303100586, "logits/rejected": -2.0333313941955566, "logps/chosen": -513.2720336914062, "logps/rejected": -514.2946166992188, "loss": 0.3793, "rewards/accuracies": 0.75, "rewards/chosen": 0.812606692314148, "rewards/margins": 1.1721861362457275, "rewards/rejected": -0.35957926511764526, "step": 556 }, { "epoch": 0.4069406392694064, "grad_norm": 82.97462020194037, "learning_rate": 4.999958784792045e-07, "logits/chosen": -2.582491636276245, "logits/rejected": -2.769965648651123, "logps/chosen": -703.7982788085938, "logps/rejected": -523.849853515625, "loss": 0.4642, "rewards/accuracies": 1.0, "rewards/chosen": 1.7121505737304688, "rewards/margins": 1.6967389583587646, "rewards/rejected": 0.015411585569381714, "step": 557 }, { "epoch": 0.4076712328767123, "grad_norm": 58.92843580882663, "learning_rate": 4.999949117060012e-07, "logits/chosen": -3.2224066257476807, "logits/rejected": -2.3279130458831787, "logps/chosen": -758.3642578125, "logps/rejected": -510.4829406738281, "loss": 0.3652, "rewards/accuracies": 0.625, "rewards/chosen": 1.2609144449234009, "rewards/margins": 0.9944099187850952, "rewards/rejected": 0.2665044665336609, "step": 558 }, { "epoch": 0.40840182648401824, "grad_norm": 43.05534881432779, "learning_rate": 4.999938431686473e-07, "logits/chosen": -2.656266212463379, "logits/rejected": -1.830549955368042, "logps/chosen": -445.8688659667969, "logps/rejected": -325.04608154296875, "loss": 0.326, "rewards/accuracies": 1.0, "rewards/chosen": 1.6345733404159546, "rewards/margins": 2.1982884407043457, "rewards/rejected": -0.5637149810791016, "step": 559 }, { "epoch": 0.4091324200913242, "grad_norm": 52.79582068346532, "learning_rate": 4.999926728675779e-07, "logits/chosen": -2.2927749156951904, "logits/rejected": -2.126497507095337, "logps/chosen": -417.62432861328125, "logps/rejected": -426.85906982421875, "loss": 0.409, "rewards/accuracies": 0.5, "rewards/chosen": 0.830128014087677, "rewards/margins": 0.6024042367935181, "rewards/rejected": 0.22772376239299774, "step": 560 }, { "epoch": 0.40986301369863015, "grad_norm": 61.39317608914284, "learning_rate": 4.999914008032695e-07, "logits/chosen": -2.2162704467773438, "logits/rejected": -1.8355003595352173, "logps/chosen": -423.8902587890625, "logps/rejected": -396.1357421875, "loss": 0.4262, "rewards/accuracies": 0.5, "rewards/chosen": 0.8646644353866577, "rewards/margins": 0.7672218680381775, "rewards/rejected": 0.0974426418542862, "step": 561 }, { "epoch": 0.4105936073059361, "grad_norm": 71.68665199476644, "learning_rate": 4.999900269762397e-07, "logits/chosen": -2.7999274730682373, "logits/rejected": -1.884188175201416, "logps/chosen": -846.8661499023438, "logps/rejected": -469.4071044921875, "loss": 0.4025, "rewards/accuracies": 0.875, "rewards/chosen": 2.118298053741455, "rewards/margins": 1.5143734216690063, "rewards/rejected": 0.6039247512817383, "step": 562 }, { "epoch": 0.411324200913242, "grad_norm": 62.7227086755168, "learning_rate": 4.999885513870478e-07, "logits/chosen": -2.8670060634613037, "logits/rejected": -2.000669240951538, "logps/chosen": -695.091796875, "logps/rejected": -573.5550537109375, "loss": 0.3686, "rewards/accuracies": 0.75, "rewards/chosen": 1.906675934791565, "rewards/margins": 1.4725620746612549, "rewards/rejected": 0.43411388993263245, "step": 563 }, { "epoch": 0.41205479452054794, "grad_norm": 64.68153656086875, "learning_rate": 4.999869740362947e-07, "logits/chosen": -2.602158784866333, "logits/rejected": -2.0385677814483643, "logps/chosen": -417.19512939453125, "logps/rejected": -290.0202941894531, "loss": 0.3744, "rewards/accuracies": 0.625, "rewards/chosen": 1.027217149734497, "rewards/margins": 1.1151461601257324, "rewards/rejected": -0.08792901784181595, "step": 564 }, { "epoch": 0.41278538812785387, "grad_norm": 52.482751537856466, "learning_rate": 4.99985294924622e-07, "logits/chosen": -2.8043699264526367, "logits/rejected": -2.558070182800293, "logps/chosen": -544.9634399414062, "logps/rejected": -411.1170654296875, "loss": 0.3806, "rewards/accuracies": 0.5, "rewards/chosen": 1.281945824623108, "rewards/margins": 0.9795063138008118, "rewards/rejected": 0.30243945121765137, "step": 565 }, { "epoch": 0.4135159817351598, "grad_norm": 43.27366387713851, "learning_rate": 4.999835140527136e-07, "logits/chosen": -3.194371223449707, "logits/rejected": -1.7820603847503662, "logps/chosen": -641.2781982421875, "logps/rejected": -325.024658203125, "loss": 0.2683, "rewards/accuracies": 0.875, "rewards/chosen": 2.1101531982421875, "rewards/margins": 1.9988924264907837, "rewards/rejected": 0.11126069724559784, "step": 566 }, { "epoch": 0.41424657534246573, "grad_norm": 62.564404316671954, "learning_rate": 4.999816314212944e-07, "logits/chosen": -3.2888684272766113, "logits/rejected": -2.061440944671631, "logps/chosen": -692.2666015625, "logps/rejected": -397.7498779296875, "loss": 0.4552, "rewards/accuracies": 0.875, "rewards/chosen": 2.1418704986572266, "rewards/margins": 1.9293919801712036, "rewards/rejected": 0.21247854828834534, "step": 567 }, { "epoch": 0.4149771689497717, "grad_norm": 59.72854854856888, "learning_rate": 4.999796470311306e-07, "logits/chosen": -2.779773235321045, "logits/rejected": -2.277533531188965, "logps/chosen": -671.4705200195312, "logps/rejected": -553.7538452148438, "loss": 0.4057, "rewards/accuracies": 0.875, "rewards/chosen": 2.122647285461426, "rewards/margins": 1.028246521949768, "rewards/rejected": 1.0944008827209473, "step": 568 }, { "epoch": 0.41570776255707764, "grad_norm": 71.56136393748899, "learning_rate": 4.9997756088303e-07, "logits/chosen": -2.7253129482269287, "logits/rejected": -2.141425609588623, "logps/chosen": -752.5274658203125, "logps/rejected": -550.7702026367188, "loss": 0.4168, "rewards/accuracies": 0.875, "rewards/chosen": 1.6647809743881226, "rewards/margins": 1.6975319385528564, "rewards/rejected": -0.03275086730718613, "step": 569 }, { "epoch": 0.41643835616438357, "grad_norm": 73.40894391608559, "learning_rate": 4.999753729778419e-07, "logits/chosen": -2.2324650287628174, "logits/rejected": -2.399486541748047, "logps/chosen": -658.4730834960938, "logps/rejected": -742.920166015625, "loss": 0.508, "rewards/accuracies": 0.5, "rewards/chosen": 1.399985671043396, "rewards/margins": 0.2870970368385315, "rewards/rejected": 1.1128885746002197, "step": 570 }, { "epoch": 0.4171689497716895, "grad_norm": 70.05665954693387, "learning_rate": 4.999730833164569e-07, "logits/chosen": -2.747647523880005, "logits/rejected": -2.0192131996154785, "logps/chosen": -351.33502197265625, "logps/rejected": -282.9653625488281, "loss": 0.4706, "rewards/accuracies": 0.875, "rewards/chosen": 1.3274807929992676, "rewards/margins": 1.6848337650299072, "rewards/rejected": -0.3573529124259949, "step": 571 }, { "epoch": 0.41789954337899543, "grad_norm": 78.66317448165469, "learning_rate": 4.999706918998069e-07, "logits/chosen": -3.1499993801116943, "logits/rejected": -2.3588624000549316, "logps/chosen": -415.8963928222656, "logps/rejected": -294.0250244140625, "loss": 0.4427, "rewards/accuracies": 1.0, "rewards/chosen": 1.852418303489685, "rewards/margins": 2.079878330230713, "rewards/rejected": -0.22745996713638306, "step": 572 }, { "epoch": 0.41863013698630136, "grad_norm": 72.04096328218412, "learning_rate": 4.999681987288655e-07, "logits/chosen": -2.9748423099517822, "logits/rejected": -2.0616869926452637, "logps/chosen": -732.283203125, "logps/rejected": -448.00872802734375, "loss": 0.4798, "rewards/accuracies": 0.75, "rewards/chosen": 2.226365804672241, "rewards/margins": 1.3071919679641724, "rewards/rejected": 0.9191738367080688, "step": 573 }, { "epoch": 0.4193607305936073, "grad_norm": 54.17798359343028, "learning_rate": 4.999656038046476e-07, "logits/chosen": -2.6718459129333496, "logits/rejected": -2.2384986877441406, "logps/chosen": -441.822998046875, "logps/rejected": -369.46923828125, "loss": 0.3952, "rewards/accuracies": 0.75, "rewards/chosen": 0.913195013999939, "rewards/margins": 0.25585421919822693, "rewards/rejected": 0.6573408842086792, "step": 574 }, { "epoch": 0.4200913242009132, "grad_norm": 68.34193884774378, "learning_rate": 4.999629071282093e-07, "logits/chosen": -3.2263216972351074, "logits/rejected": -2.8487255573272705, "logps/chosen": -683.246826171875, "logps/rejected": -487.5662841796875, "loss": 0.5364, "rewards/accuracies": 0.625, "rewards/chosen": 1.4684596061706543, "rewards/margins": 0.5202001929283142, "rewards/rejected": 0.9482593536376953, "step": 575 }, { "epoch": 0.4208219178082192, "grad_norm": 65.63609281247247, "learning_rate": 4.999601087006486e-07, "logits/chosen": -3.322545289993286, "logits/rejected": -2.1981236934661865, "logps/chosen": -486.5215759277344, "logps/rejected": -322.03192138671875, "loss": 0.3925, "rewards/accuracies": 1.0, "rewards/chosen": 2.494873523712158, "rewards/margins": 2.851627826690674, "rewards/rejected": -0.3567543029785156, "step": 576 }, { "epoch": 0.42155251141552513, "grad_norm": 94.36044486270453, "learning_rate": 4.999572085231045e-07, "logits/chosen": -2.333671808242798, "logits/rejected": -2.09857177734375, "logps/chosen": -518.247314453125, "logps/rejected": -639.6720581054688, "loss": 0.6327, "rewards/accuracies": 0.875, "rewards/chosen": 1.531021237373352, "rewards/margins": 0.9132016897201538, "rewards/rejected": 0.617819607257843, "step": 577 }, { "epoch": 0.42228310502283106, "grad_norm": 56.16711775752339, "learning_rate": 4.999542065967576e-07, "logits/chosen": -2.6574182510375977, "logits/rejected": -2.2944557666778564, "logps/chosen": -540.5036010742188, "logps/rejected": -494.63214111328125, "loss": 0.3493, "rewards/accuracies": 1.0, "rewards/chosen": 2.0201282501220703, "rewards/margins": 2.3271641731262207, "rewards/rejected": -0.30703625082969666, "step": 578 }, { "epoch": 0.423013698630137, "grad_norm": 115.306097383226, "learning_rate": 4.999511029228297e-07, "logits/chosen": -3.007132053375244, "logits/rejected": -3.0884971618652344, "logps/chosen": -590.6837768554688, "logps/rejected": -792.3193359375, "loss": 0.5768, "rewards/accuracies": 0.375, "rewards/chosen": 1.844988465309143, "rewards/margins": 0.30925047397613525, "rewards/rejected": 1.5357381105422974, "step": 579 }, { "epoch": 0.4237442922374429, "grad_norm": 56.95126883120669, "learning_rate": 4.999478975025845e-07, "logits/chosen": -2.746264934539795, "logits/rejected": -2.565934181213379, "logps/chosen": -528.0218505859375, "logps/rejected": -463.21820068359375, "loss": 0.399, "rewards/accuracies": 0.875, "rewards/chosen": 1.7386454343795776, "rewards/margins": 1.1374151706695557, "rewards/rejected": 0.6012301445007324, "step": 580 }, { "epoch": 0.42447488584474885, "grad_norm": 68.78274788237236, "learning_rate": 4.999445903373266e-07, "logits/chosen": -2.9117183685302734, "logits/rejected": -2.604464054107666, "logps/chosen": -653.1416625976562, "logps/rejected": -682.1966552734375, "loss": 0.4562, "rewards/accuracies": 0.625, "rewards/chosen": 1.6718106269836426, "rewards/margins": 0.5383838415145874, "rewards/rejected": 1.1334266662597656, "step": 581 }, { "epoch": 0.4252054794520548, "grad_norm": 56.22614077322042, "learning_rate": 4.999411814284023e-07, "logits/chosen": -2.9573118686676025, "logits/rejected": -2.349526882171631, "logps/chosen": -826.7688598632812, "logps/rejected": -608.0908203125, "loss": 0.3535, "rewards/accuracies": 0.875, "rewards/chosen": 2.808314085006714, "rewards/margins": 2.5241458415985107, "rewards/rejected": 0.28416842222213745, "step": 582 }, { "epoch": 0.4259360730593607, "grad_norm": 60.662085950575424, "learning_rate": 4.999376707771992e-07, "logits/chosen": -2.926497459411621, "logits/rejected": -2.068350076675415, "logps/chosen": -736.8185424804688, "logps/rejected": -642.7027587890625, "loss": 0.3716, "rewards/accuracies": 1.0, "rewards/chosen": 2.6727421283721924, "rewards/margins": 2.1428496837615967, "rewards/rejected": 0.5298923254013062, "step": 583 }, { "epoch": 0.4266666666666667, "grad_norm": 78.32580755348935, "learning_rate": 4.999340583851465e-07, "logits/chosen": -2.455547332763672, "logits/rejected": -2.520794630050659, "logps/chosen": -846.5213623046875, "logps/rejected": -806.5369873046875, "loss": 0.5084, "rewards/accuracies": 0.875, "rewards/chosen": 1.7068487405776978, "rewards/margins": 0.9161807298660278, "rewards/rejected": 0.7906678915023804, "step": 584 }, { "epoch": 0.4273972602739726, "grad_norm": 54.59999919514079, "learning_rate": 4.999303442537145e-07, "logits/chosen": -2.567073106765747, "logits/rejected": -1.8993257284164429, "logps/chosen": -833.731689453125, "logps/rejected": -585.6077880859375, "loss": 0.3474, "rewards/accuracies": 0.75, "rewards/chosen": 2.396432638168335, "rewards/margins": 1.6578922271728516, "rewards/rejected": 0.7385404706001282, "step": 585 }, { "epoch": 0.42812785388127855, "grad_norm": 80.43201224481456, "learning_rate": 4.999265283844152e-07, "logits/chosen": -3.0447397232055664, "logits/rejected": -2.23683500289917, "logps/chosen": -1037.435302734375, "logps/rejected": -802.3807373046875, "loss": 0.4606, "rewards/accuracies": 0.75, "rewards/chosen": 2.466402769088745, "rewards/margins": 1.1144194602966309, "rewards/rejected": 1.3519833087921143, "step": 586 }, { "epoch": 0.4288584474885845, "grad_norm": 68.41594266643924, "learning_rate": 4.999226107788018e-07, "logits/chosen": -2.4844722747802734, "logits/rejected": -2.5822696685791016, "logps/chosen": -461.1014404296875, "logps/rejected": -481.8622131347656, "loss": 0.3499, "rewards/accuracies": 0.75, "rewards/chosen": 1.2926679849624634, "rewards/margins": 1.2731024026870728, "rewards/rejected": 0.019565589725971222, "step": 587 }, { "epoch": 0.4295890410958904, "grad_norm": 57.78425830459798, "learning_rate": 4.999185914384692e-07, "logits/chosen": -3.2345423698425293, "logits/rejected": -2.4686245918273926, "logps/chosen": -639.8526611328125, "logps/rejected": -565.3299560546875, "loss": 0.3548, "rewards/accuracies": 0.75, "rewards/chosen": 2.7040927410125732, "rewards/margins": 1.8733125925064087, "rewards/rejected": 0.8307802677154541, "step": 588 }, { "epoch": 0.43031963470319634, "grad_norm": 56.5484076885404, "learning_rate": 4.999144703650535e-07, "logits/chosen": -2.5693178176879883, "logits/rejected": -2.2021114826202393, "logps/chosen": -393.87249755859375, "logps/rejected": -363.7457580566406, "loss": 0.3676, "rewards/accuracies": 0.875, "rewards/chosen": 0.7498546242713928, "rewards/margins": 1.208855390548706, "rewards/rejected": -0.45900073647499084, "step": 589 }, { "epoch": 0.43105022831050227, "grad_norm": 77.75395011066962, "learning_rate": 4.99910247560232e-07, "logits/chosen": -3.0165834426879883, "logits/rejected": -2.194620132446289, "logps/chosen": -888.0491333007812, "logps/rejected": -584.5869140625, "loss": 0.4545, "rewards/accuracies": 1.0, "rewards/chosen": 3.0475759506225586, "rewards/margins": 2.623901844024658, "rewards/rejected": 0.4236740469932556, "step": 590 }, { "epoch": 0.4317808219178082, "grad_norm": 81.65785852797634, "learning_rate": 4.99905923025724e-07, "logits/chosen": -2.515069007873535, "logits/rejected": -1.9452149868011475, "logps/chosen": -607.7557373046875, "logps/rejected": -405.8538818359375, "loss": 0.5486, "rewards/accuracies": 0.75, "rewards/chosen": 2.0092883110046387, "rewards/margins": 2.1204378604888916, "rewards/rejected": -0.11114940792322159, "step": 591 }, { "epoch": 0.43251141552511413, "grad_norm": 71.19028473482179, "learning_rate": 4.999014967632895e-07, "logits/chosen": -2.9590015411376953, "logits/rejected": -1.563875675201416, "logps/chosen": -658.7161254882812, "logps/rejected": -213.33424377441406, "loss": 0.4964, "rewards/accuracies": 0.75, "rewards/chosen": 1.3372899293899536, "rewards/margins": 1.4981310367584229, "rewards/rejected": -0.1608411967754364, "step": 592 }, { "epoch": 0.4332420091324201, "grad_norm": 61.49811036222983, "learning_rate": 4.998969687747306e-07, "logits/chosen": -2.8927016258239746, "logits/rejected": -2.594801425933838, "logps/chosen": -711.1346435546875, "logps/rejected": -544.3939819335938, "loss": 0.3356, "rewards/accuracies": 1.0, "rewards/chosen": 2.506847381591797, "rewards/margins": 2.579855442047119, "rewards/rejected": -0.0730079710483551, "step": 593 }, { "epoch": 0.43397260273972604, "grad_norm": 57.98577285434441, "learning_rate": 4.998923390618904e-07, "logits/chosen": -2.9305315017700195, "logits/rejected": -2.3726449012756348, "logps/chosen": -744.4442138671875, "logps/rejected": -643.2703247070312, "loss": 0.3232, "rewards/accuracies": 0.75, "rewards/chosen": 2.483175754547119, "rewards/margins": 1.9441118240356445, "rewards/rejected": 0.5390639305114746, "step": 594 }, { "epoch": 0.434703196347032, "grad_norm": 52.41813278371569, "learning_rate": 4.998876076266534e-07, "logits/chosen": -2.4021425247192383, "logits/rejected": -2.019292116165161, "logps/chosen": -339.15728759765625, "logps/rejected": -383.6541442871094, "loss": 0.373, "rewards/accuracies": 0.875, "rewards/chosen": 1.9187431335449219, "rewards/margins": 1.9743220806121826, "rewards/rejected": -0.05557906627655029, "step": 595 }, { "epoch": 0.4354337899543379, "grad_norm": 66.28131630123983, "learning_rate": 4.998827744709456e-07, "logits/chosen": -2.51328706741333, "logits/rejected": -1.5896291732788086, "logps/chosen": -788.4285888671875, "logps/rejected": -450.38232421875, "loss": 0.4491, "rewards/accuracies": 0.625, "rewards/chosen": 2.3499717712402344, "rewards/margins": 1.6437385082244873, "rewards/rejected": 0.7062332034111023, "step": 596 }, { "epoch": 0.43616438356164383, "grad_norm": 84.9364336071762, "learning_rate": 4.998778395967345e-07, "logits/chosen": -2.427157163619995, "logits/rejected": -1.5873830318450928, "logps/chosen": -502.70025634765625, "logps/rejected": -329.36395263671875, "loss": 0.5054, "rewards/accuracies": 1.0, "rewards/chosen": 1.7192707061767578, "rewards/margins": 2.0951294898986816, "rewards/rejected": -0.3758588135242462, "step": 597 }, { "epoch": 0.43689497716894976, "grad_norm": 60.69768516488087, "learning_rate": 4.998728030060289e-07, "logits/chosen": -2.979990243911743, "logits/rejected": -2.4025027751922607, "logps/chosen": -566.0048828125, "logps/rejected": -510.475341796875, "loss": 0.45, "rewards/accuracies": 0.875, "rewards/chosen": 1.2929004430770874, "rewards/margins": 1.3723288774490356, "rewards/rejected": -0.07942846417427063, "step": 598 }, { "epoch": 0.4376255707762557, "grad_norm": 68.30182398850158, "learning_rate": 4.998676647008789e-07, "logits/chosen": -2.8553707599639893, "logits/rejected": -2.610499620437622, "logps/chosen": -781.2103271484375, "logps/rejected": -702.0028076171875, "loss": 0.4058, "rewards/accuracies": 0.625, "rewards/chosen": 1.9987415075302124, "rewards/margins": 1.4156016111373901, "rewards/rejected": 0.5831398367881775, "step": 599 }, { "epoch": 0.4383561643835616, "grad_norm": 58.871564307264464, "learning_rate": 4.998624246833764e-07, "logits/chosen": -2.9021685123443604, "logits/rejected": -2.0744152069091797, "logps/chosen": -472.06005859375, "logps/rejected": -530.0932006835938, "loss": 0.31, "rewards/accuracies": 0.875, "rewards/chosen": 2.589475154876709, "rewards/margins": 2.8884940147399902, "rewards/rejected": -0.29901885986328125, "step": 600 }, { "epoch": 0.4390867579908676, "grad_norm": 67.27442777766676, "learning_rate": 4.99857082955654e-07, "logits/chosen": -2.690507411956787, "logits/rejected": -2.1472949981689453, "logps/chosen": -819.7312622070312, "logps/rejected": -773.3038940429688, "loss": 0.4221, "rewards/accuracies": 1.0, "rewards/chosen": 2.5155110359191895, "rewards/margins": 2.017008066177368, "rewards/rejected": 0.4985029697418213, "step": 601 }, { "epoch": 0.43981735159817353, "grad_norm": 64.66142665961098, "learning_rate": 4.998516395198867e-07, "logits/chosen": -2.6823463439941406, "logits/rejected": -2.3214612007141113, "logps/chosen": -675.6005859375, "logps/rejected": -704.613525390625, "loss": 0.4763, "rewards/accuracies": 0.875, "rewards/chosen": 1.7869035005569458, "rewards/margins": 1.0410141944885254, "rewards/rejected": 0.7458891868591309, "step": 602 }, { "epoch": 0.44054794520547946, "grad_norm": 45.590175600979265, "learning_rate": 4.998460943782898e-07, "logits/chosen": -3.473728895187378, "logits/rejected": -1.9006236791610718, "logps/chosen": -806.7015380859375, "logps/rejected": -461.0769348144531, "loss": 0.2512, "rewards/accuracies": 0.875, "rewards/chosen": 2.1985931396484375, "rewards/margins": 2.142850875854492, "rewards/rejected": 0.055742159485816956, "step": 603 }, { "epoch": 0.4412785388127854, "grad_norm": 66.45659488648398, "learning_rate": 4.998404475331207e-07, "logits/chosen": -2.848149299621582, "logits/rejected": -2.4080991744995117, "logps/chosen": -659.3477783203125, "logps/rejected": -525.9769897460938, "loss": 0.423, "rewards/accuracies": 0.25, "rewards/chosen": 0.9727944135665894, "rewards/margins": 0.6098709106445312, "rewards/rejected": 0.36292344331741333, "step": 604 }, { "epoch": 0.4420091324200913, "grad_norm": 53.07073745321545, "learning_rate": 4.99834698986678e-07, "logits/chosen": -2.893125057220459, "logits/rejected": -2.166882038116455, "logps/chosen": -620.5816650390625, "logps/rejected": -522.917236328125, "loss": 0.3893, "rewards/accuracies": 0.625, "rewards/chosen": 1.5147998332977295, "rewards/margins": 1.342354655265808, "rewards/rejected": 0.17244501411914825, "step": 605 }, { "epoch": 0.44273972602739725, "grad_norm": 49.2198114964899, "learning_rate": 4.998288487413021e-07, "logits/chosen": -2.314500093460083, "logits/rejected": -1.9865071773529053, "logps/chosen": -613.3119506835938, "logps/rejected": -376.2269287109375, "loss": 0.3614, "rewards/accuracies": 0.625, "rewards/chosen": 1.8458247184753418, "rewards/margins": 1.566479206085205, "rewards/rejected": 0.2793456017971039, "step": 606 }, { "epoch": 0.4434703196347032, "grad_norm": 49.97172540390966, "learning_rate": 4.998228967993739e-07, "logits/chosen": -2.5566494464874268, "logits/rejected": -1.6840523481369019, "logps/chosen": -540.0150146484375, "logps/rejected": -364.7887878417969, "loss": 0.3392, "rewards/accuracies": 0.875, "rewards/chosen": 1.9625314474105835, "rewards/margins": 1.9119247198104858, "rewards/rejected": 0.05060677230358124, "step": 607 }, { "epoch": 0.4442009132420091, "grad_norm": 59.162141066283894, "learning_rate": 4.998168431633165e-07, "logits/chosen": -2.806785821914673, "logits/rejected": -1.4430814981460571, "logps/chosen": -644.572265625, "logps/rejected": -245.3519287109375, "loss": 0.3741, "rewards/accuracies": 0.875, "rewards/chosen": 1.6450562477111816, "rewards/margins": 1.8897745609283447, "rewards/rejected": -0.24471817910671234, "step": 608 }, { "epoch": 0.4449315068493151, "grad_norm": 67.78756021802599, "learning_rate": 4.998106878355941e-07, "logits/chosen": -2.758831262588501, "logits/rejected": -2.301067590713501, "logps/chosen": -778.1099243164062, "logps/rejected": -726.3009033203125, "loss": 0.4427, "rewards/accuracies": 0.625, "rewards/chosen": 2.0374579429626465, "rewards/margins": 1.6715973615646362, "rewards/rejected": 0.36586061120033264, "step": 609 }, { "epoch": 0.445662100456621, "grad_norm": 72.80353351443527, "learning_rate": 4.998044308187123e-07, "logits/chosen": -2.6307950019836426, "logits/rejected": -1.9903857707977295, "logps/chosen": -676.7929077148438, "logps/rejected": -482.21722412109375, "loss": 0.4569, "rewards/accuracies": 0.75, "rewards/chosen": 1.2279595136642456, "rewards/margins": 1.222709059715271, "rewards/rejected": 0.0052505433559417725, "step": 610 }, { "epoch": 0.44639269406392695, "grad_norm": 68.74249248781645, "learning_rate": 4.997980721152181e-07, "logits/chosen": -2.2740378379821777, "logits/rejected": -1.9676792621612549, "logps/chosen": -492.7957458496094, "logps/rejected": -496.3948059082031, "loss": 0.4723, "rewards/accuracies": 0.875, "rewards/chosen": 1.3874833583831787, "rewards/margins": 1.2882423400878906, "rewards/rejected": 0.09924107789993286, "step": 611 }, { "epoch": 0.4471232876712329, "grad_norm": 58.62282403677662, "learning_rate": 4.997916117277e-07, "logits/chosen": -3.5315561294555664, "logits/rejected": -2.3300623893737793, "logps/chosen": -847.7717895507812, "logps/rejected": -422.0013427734375, "loss": 0.4264, "rewards/accuracies": 0.5, "rewards/chosen": 1.054171085357666, "rewards/margins": 0.45075517892837524, "rewards/rejected": 0.603415846824646, "step": 612 }, { "epoch": 0.4478538812785388, "grad_norm": 67.2801142205574, "learning_rate": 4.997850496587875e-07, "logits/chosen": -2.959784746170044, "logits/rejected": -2.766505002975464, "logps/chosen": -666.8211669921875, "logps/rejected": -569.2572631835938, "loss": 0.4642, "rewards/accuracies": 0.875, "rewards/chosen": 1.2516593933105469, "rewards/margins": 1.0612796545028687, "rewards/rejected": 0.19037970900535583, "step": 613 }, { "epoch": 0.44858447488584474, "grad_norm": 64.20819560470844, "learning_rate": 4.997783859111522e-07, "logits/chosen": -2.8304011821746826, "logits/rejected": -2.545989990234375, "logps/chosen": -668.2972412109375, "logps/rejected": -521.2689819335938, "loss": 0.4423, "rewards/accuracies": 0.75, "rewards/chosen": 1.841910719871521, "rewards/margins": 1.0990238189697266, "rewards/rejected": 0.7428869605064392, "step": 614 }, { "epoch": 0.44931506849315067, "grad_norm": 57.26637959939387, "learning_rate": 4.997716204875065e-07, "logits/chosen": -2.71586275100708, "logits/rejected": -2.1969621181488037, "logps/chosen": -514.0198364257812, "logps/rejected": -343.94171142578125, "loss": 0.3801, "rewards/accuracies": 1.0, "rewards/chosen": 1.6697702407836914, "rewards/margins": 1.87321138381958, "rewards/rejected": -0.2034410536289215, "step": 615 }, { "epoch": 0.4500456621004566, "grad_norm": 66.32167040148906, "learning_rate": 4.997647533906042e-07, "logits/chosen": -2.6282455921173096, "logits/rejected": -2.320450782775879, "logps/chosen": -396.114990234375, "logps/rejected": -327.94512939453125, "loss": 0.4612, "rewards/accuracies": 0.875, "rewards/chosen": 1.38502836227417, "rewards/margins": 1.5492119789123535, "rewards/rejected": -0.16418352723121643, "step": 616 }, { "epoch": 0.45077625570776253, "grad_norm": 56.75100021489159, "learning_rate": 4.997577846232408e-07, "logits/chosen": -2.463797092437744, "logits/rejected": -2.3258962631225586, "logps/chosen": -559.19921875, "logps/rejected": -580.7842407226562, "loss": 0.362, "rewards/accuracies": 0.625, "rewards/chosen": 2.227778196334839, "rewards/margins": 1.8256042003631592, "rewards/rejected": 0.40217411518096924, "step": 617 }, { "epoch": 0.4515068493150685, "grad_norm": 59.77582633525446, "learning_rate": 4.997507141882532e-07, "logits/chosen": -3.1937923431396484, "logits/rejected": -2.7970943450927734, "logps/chosen": -794.0245361328125, "logps/rejected": -671.5292358398438, "loss": 0.3659, "rewards/accuracies": 0.75, "rewards/chosen": 2.160447359085083, "rewards/margins": 1.1795347929000854, "rewards/rejected": 0.9809126853942871, "step": 618 }, { "epoch": 0.45223744292237444, "grad_norm": 58.13204342405529, "learning_rate": 4.997435420885192e-07, "logits/chosen": -2.782904624938965, "logits/rejected": -1.7695777416229248, "logps/chosen": -628.6068725585938, "logps/rejected": -451.8021240234375, "loss": 0.3543, "rewards/accuracies": 0.875, "rewards/chosen": 1.9632577896118164, "rewards/margins": 1.7203489542007446, "rewards/rejected": 0.24290898442268372, "step": 619 }, { "epoch": 0.4529680365296804, "grad_norm": 60.87700522878076, "learning_rate": 4.997362683269585e-07, "logits/chosen": -3.292448043823242, "logits/rejected": -1.965110421180725, "logps/chosen": -1093.24560546875, "logps/rejected": -607.7452392578125, "loss": 0.4165, "rewards/accuracies": 1.0, "rewards/chosen": 2.940886974334717, "rewards/margins": 2.1824734210968018, "rewards/rejected": 0.7584133148193359, "step": 620 }, { "epoch": 0.4536986301369863, "grad_norm": 48.75585151179933, "learning_rate": 4.997288929065321e-07, "logits/chosen": -3.1389245986938477, "logits/rejected": -2.862987518310547, "logps/chosen": -631.6646728515625, "logps/rejected": -570.926025390625, "loss": 0.325, "rewards/accuracies": 0.875, "rewards/chosen": 2.222205877304077, "rewards/margins": 1.8463895320892334, "rewards/rejected": 0.3758164644241333, "step": 621 }, { "epoch": 0.45442922374429223, "grad_norm": 68.23280604161388, "learning_rate": 4.99721415830242e-07, "logits/chosen": -2.672861337661743, "logits/rejected": -2.2200493812561035, "logps/chosen": -616.8643188476562, "logps/rejected": -519.4044799804688, "loss": 0.4256, "rewards/accuracies": 0.75, "rewards/chosen": 1.7463771104812622, "rewards/margins": 1.7008178234100342, "rewards/rejected": 0.04555937647819519, "step": 622 }, { "epoch": 0.45515981735159816, "grad_norm": 99.24746631684106, "learning_rate": 4.997138371011321e-07, "logits/chosen": -2.622143507003784, "logits/rejected": -2.302929639816284, "logps/chosen": -276.7217712402344, "logps/rejected": -405.7618103027344, "loss": 0.7255, "rewards/accuracies": 0.75, "rewards/chosen": 0.941838800907135, "rewards/margins": 0.9734082818031311, "rewards/rejected": -0.0315694659948349, "step": 623 }, { "epoch": 0.4558904109589041, "grad_norm": 53.30429303770945, "learning_rate": 4.997061567222873e-07, "logits/chosen": -2.432793617248535, "logits/rejected": -2.1329591274261475, "logps/chosen": -628.5570068359375, "logps/rejected": -588.7120361328125, "loss": 0.43, "rewards/accuracies": 0.75, "rewards/chosen": 1.305452585220337, "rewards/margins": 1.6057233810424805, "rewards/rejected": -0.30027079582214355, "step": 624 }, { "epoch": 0.45662100456621, "grad_norm": 53.65298748817488, "learning_rate": 4.99698374696834e-07, "logits/chosen": -2.819014310836792, "logits/rejected": -2.4410595893859863, "logps/chosen": -423.5130615234375, "logps/rejected": -300.1983642578125, "loss": 0.3735, "rewards/accuracies": 0.875, "rewards/chosen": 1.2228161096572876, "rewards/margins": 1.4967955350875854, "rewards/rejected": -0.2739794850349426, "step": 625 }, { "epoch": 0.457351598173516, "grad_norm": 82.56673437286956, "learning_rate": 4.9969049102794e-07, "logits/chosen": -2.521965980529785, "logits/rejected": -2.863795757293701, "logps/chosen": -595.963134765625, "logps/rejected": -746.1057739257812, "loss": 0.6069, "rewards/accuracies": 0.375, "rewards/chosen": 0.6367157697677612, "rewards/margins": 0.08175152540206909, "rewards/rejected": 0.5549641847610474, "step": 626 }, { "epoch": 0.45808219178082193, "grad_norm": 69.35238933338923, "learning_rate": 4.996825057188146e-07, "logits/chosen": -2.7616894245147705, "logits/rejected": -2.5486485958099365, "logps/chosen": -588.8822021484375, "logps/rejected": -709.5740966796875, "loss": 0.4253, "rewards/accuracies": 0.875, "rewards/chosen": 2.160865306854248, "rewards/margins": 1.6426647901535034, "rewards/rejected": 0.518200695514679, "step": 627 }, { "epoch": 0.45881278538812786, "grad_norm": 71.7637386128085, "learning_rate": 4.996744187727083e-07, "logits/chosen": -2.9531521797180176, "logits/rejected": -1.4490426778793335, "logps/chosen": -605.931640625, "logps/rejected": -335.59478759765625, "loss": 0.4224, "rewards/accuracies": 1.0, "rewards/chosen": 1.9986708164215088, "rewards/margins": 2.0648953914642334, "rewards/rejected": -0.06622452288866043, "step": 628 }, { "epoch": 0.4595433789954338, "grad_norm": 66.47173242016113, "learning_rate": 4.996662301929128e-07, "logits/chosen": -3.0927958488464355, "logits/rejected": -2.6851038932800293, "logps/chosen": -541.2169799804688, "logps/rejected": -472.4219970703125, "loss": 0.5152, "rewards/accuracies": 0.75, "rewards/chosen": 1.4514634609222412, "rewards/margins": 0.9764944314956665, "rewards/rejected": 0.47496894001960754, "step": 629 }, { "epoch": 0.4602739726027397, "grad_norm": 61.68007292865642, "learning_rate": 4.996579399827616e-07, "logits/chosen": -2.6177103519439697, "logits/rejected": -1.4362213611602783, "logps/chosen": -810.7174682617188, "logps/rejected": -374.4915771484375, "loss": 0.3807, "rewards/accuracies": 0.625, "rewards/chosen": 1.5195624828338623, "rewards/margins": 1.3015148639678955, "rewards/rejected": 0.21804772317409515, "step": 630 }, { "epoch": 0.46100456621004565, "grad_norm": 70.95874544189391, "learning_rate": 4.996495481456292e-07, "logits/chosen": -3.1225361824035645, "logits/rejected": -2.384453058242798, "logps/chosen": -672.2545166015625, "logps/rejected": -489.724853515625, "loss": 0.5433, "rewards/accuracies": 0.875, "rewards/chosen": 2.138415575027466, "rewards/margins": 1.8370107412338257, "rewards/rejected": 0.30140477418899536, "step": 631 }, { "epoch": 0.4617351598173516, "grad_norm": 46.24578475301154, "learning_rate": 4.996410546849318e-07, "logits/chosen": -2.8969225883483887, "logits/rejected": -2.178694486618042, "logps/chosen": -577.7547607421875, "logps/rejected": -394.2907409667969, "loss": 0.3638, "rewards/accuracies": 0.75, "rewards/chosen": 2.7223501205444336, "rewards/margins": 2.9998812675476074, "rewards/rejected": -0.277531236410141, "step": 632 }, { "epoch": 0.4624657534246575, "grad_norm": 58.08509634982755, "learning_rate": 4.996324596041266e-07, "logits/chosen": -2.9982030391693115, "logits/rejected": -2.100576639175415, "logps/chosen": -573.8936157226562, "logps/rejected": -286.9344177246094, "loss": 0.4296, "rewards/accuracies": 0.875, "rewards/chosen": 1.3049745559692383, "rewards/margins": 1.1581134796142578, "rewards/rejected": 0.14686095714569092, "step": 633 }, { "epoch": 0.4631963470319635, "grad_norm": 57.01250548125385, "learning_rate": 4.996237629067126e-07, "logits/chosen": -2.803372383117676, "logits/rejected": -2.8699631690979004, "logps/chosen": -830.52685546875, "logps/rejected": -819.8829956054688, "loss": 0.3587, "rewards/accuracies": 0.875, "rewards/chosen": 2.0369133949279785, "rewards/margins": 0.9232686758041382, "rewards/rejected": 1.1136448383331299, "step": 634 }, { "epoch": 0.4639269406392694, "grad_norm": 45.66155427777837, "learning_rate": 4.996149645962297e-07, "logits/chosen": -2.890219211578369, "logits/rejected": -1.9297735691070557, "logps/chosen": -381.60498046875, "logps/rejected": -305.4438171386719, "loss": 0.2681, "rewards/accuracies": 1.0, "rewards/chosen": 2.0612363815307617, "rewards/margins": 2.9639780521392822, "rewards/rejected": -0.902741551399231, "step": 635 }, { "epoch": 0.46465753424657535, "grad_norm": 44.16034266463308, "learning_rate": 4.996060646762594e-07, "logits/chosen": -2.833423614501953, "logits/rejected": -1.6272871494293213, "logps/chosen": -588.891845703125, "logps/rejected": -378.3670654296875, "loss": 0.2757, "rewards/accuracies": 0.75, "rewards/chosen": 1.9501428604125977, "rewards/margins": 2.7001163959503174, "rewards/rejected": -0.7499735355377197, "step": 636 }, { "epoch": 0.4653881278538813, "grad_norm": 55.394441864262205, "learning_rate": 4.995970631504247e-07, "logits/chosen": -2.6532087326049805, "logits/rejected": -1.6042935848236084, "logps/chosen": -501.3475646972656, "logps/rejected": -424.7204284667969, "loss": 0.3175, "rewards/accuracies": 0.875, "rewards/chosen": 1.4262502193450928, "rewards/margins": 1.4586856365203857, "rewards/rejected": -0.03243538737297058, "step": 637 }, { "epoch": 0.4661187214611872, "grad_norm": 64.82008604350584, "learning_rate": 4.995879600223897e-07, "logits/chosen": -2.911064624786377, "logits/rejected": -2.3194408416748047, "logps/chosen": -917.1346435546875, "logps/rejected": -648.2109375, "loss": 0.3064, "rewards/accuracies": 0.875, "rewards/chosen": 2.514869451522827, "rewards/margins": 2.4139928817749023, "rewards/rejected": 0.10087653994560242, "step": 638 }, { "epoch": 0.46684931506849314, "grad_norm": 55.99367814483614, "learning_rate": 4.995787552958599e-07, "logits/chosen": -3.0154833793640137, "logits/rejected": -2.148952007293701, "logps/chosen": -720.070068359375, "logps/rejected": -476.3360595703125, "loss": 0.2999, "rewards/accuracies": 0.75, "rewards/chosen": 1.7798445224761963, "rewards/margins": 1.5001471042633057, "rewards/rejected": 0.2796972692012787, "step": 639 }, { "epoch": 0.46757990867579907, "grad_norm": 55.14166104864021, "learning_rate": 4.995694489745823e-07, "logits/chosen": -2.7907779216766357, "logits/rejected": -2.207159996032715, "logps/chosen": -586.9248046875, "logps/rejected": -459.08612060546875, "loss": 0.4422, "rewards/accuracies": 0.875, "rewards/chosen": 2.4364914894104004, "rewards/margins": 2.256317615509033, "rewards/rejected": 0.18017399311065674, "step": 640 }, { "epoch": 0.468310502283105, "grad_norm": 60.28865704233882, "learning_rate": 4.995600410623453e-07, "logits/chosen": -2.5222535133361816, "logits/rejected": -1.7989380359649658, "logps/chosen": -558.4971313476562, "logps/rejected": -405.904052734375, "loss": 0.4382, "rewards/accuracies": 0.625, "rewards/chosen": 1.495400309562683, "rewards/margins": 1.0406125783920288, "rewards/rejected": 0.4547877311706543, "step": 641 }, { "epoch": 0.469041095890411, "grad_norm": 44.383698806279796, "learning_rate": 4.995505315629782e-07, "logits/chosen": -2.7910425662994385, "logits/rejected": -2.2329978942871094, "logps/chosen": -536.4456787109375, "logps/rejected": -373.5457763671875, "loss": 0.3206, "rewards/accuracies": 0.75, "rewards/chosen": 1.1609331369400024, "rewards/margins": 1.3867146968841553, "rewards/rejected": -0.22578164935112, "step": 642 }, { "epoch": 0.4697716894977169, "grad_norm": 70.0321842864943, "learning_rate": 4.995409204803523e-07, "logits/chosen": -2.4539594650268555, "logits/rejected": -1.9305589199066162, "logps/chosen": -673.0725708007812, "logps/rejected": -541.2491455078125, "loss": 0.4901, "rewards/accuracies": 0.75, "rewards/chosen": 1.9397457838058472, "rewards/margins": 1.4458091259002686, "rewards/rejected": 0.49393659830093384, "step": 643 }, { "epoch": 0.47050228310502284, "grad_norm": 55.52389424411125, "learning_rate": 4.995312078183798e-07, "logits/chosen": -2.9020333290100098, "logits/rejected": -2.1647989749908447, "logps/chosen": -572.6356201171875, "logps/rejected": -425.948974609375, "loss": 0.3551, "rewards/accuracies": 0.75, "rewards/chosen": 2.7077295780181885, "rewards/margins": 2.0628809928894043, "rewards/rejected": 0.6448487043380737, "step": 644 }, { "epoch": 0.4712328767123288, "grad_norm": 95.57963078141964, "learning_rate": 4.995213935810145e-07, "logits/chosen": -2.9189741611480713, "logits/rejected": -2.281289577484131, "logps/chosen": -829.5680541992188, "logps/rejected": -675.895263671875, "loss": 0.7301, "rewards/accuracies": 0.75, "rewards/chosen": 1.4173368215560913, "rewards/margins": 0.6000983715057373, "rewards/rejected": 0.817238450050354, "step": 645 }, { "epoch": 0.4719634703196347, "grad_norm": 72.97329014248125, "learning_rate": 4.995114777722511e-07, "logits/chosen": -3.168041706085205, "logits/rejected": -3.14081072807312, "logps/chosen": -901.93603515625, "logps/rejected": -929.881103515625, "loss": 0.4681, "rewards/accuracies": 0.625, "rewards/chosen": 1.263631820678711, "rewards/margins": -0.045030951499938965, "rewards/rejected": 1.3086626529693604, "step": 646 }, { "epoch": 0.47269406392694063, "grad_norm": 66.00076570933666, "learning_rate": 4.995014603961264e-07, "logits/chosen": -2.601436138153076, "logits/rejected": -2.225334644317627, "logps/chosen": -582.8671875, "logps/rejected": -438.8760986328125, "loss": 0.4929, "rewards/accuracies": 0.875, "rewards/chosen": 1.6122809648513794, "rewards/margins": 1.192855954170227, "rewards/rejected": 0.4194251000881195, "step": 647 }, { "epoch": 0.47342465753424656, "grad_norm": 34.80388519698178, "learning_rate": 4.994913414567178e-07, "logits/chosen": -3.0976758003234863, "logits/rejected": -2.2663626670837402, "logps/chosen": -398.40618896484375, "logps/rejected": -346.8280944824219, "loss": 0.2248, "rewards/accuracies": 1.0, "rewards/chosen": 1.5069769620895386, "rewards/margins": 2.051455020904541, "rewards/rejected": -0.5444781184196472, "step": 648 }, { "epoch": 0.4741552511415525, "grad_norm": 71.34330855645293, "learning_rate": 4.994811209581446e-07, "logits/chosen": -2.4308037757873535, "logits/rejected": -2.4406626224517822, "logps/chosen": -714.8936157226562, "logps/rejected": -697.5227661132812, "loss": 0.5098, "rewards/accuracies": 0.75, "rewards/chosen": 1.267491340637207, "rewards/margins": 0.7724829316139221, "rewards/rejected": 0.4950084388256073, "step": 649 }, { "epoch": 0.4748858447488584, "grad_norm": 70.81772997566534, "learning_rate": 4.994707989045671e-07, "logits/chosen": -2.74182391166687, "logits/rejected": -2.965427875518799, "logps/chosen": -569.8129272460938, "logps/rejected": -602.8406372070312, "loss": 0.51, "rewards/accuracies": 0.625, "rewards/chosen": 1.7478468418121338, "rewards/margins": 0.9073483943939209, "rewards/rejected": 0.8404985070228577, "step": 650 }, { "epoch": 0.4756164383561644, "grad_norm": 64.45361174058634, "learning_rate": 4.99460375300187e-07, "logits/chosen": -2.374440908432007, "logits/rejected": -1.9849270582199097, "logps/chosen": -594.5028076171875, "logps/rejected": -477.9927978515625, "loss": 0.4227, "rewards/accuracies": 0.75, "rewards/chosen": 1.555449366569519, "rewards/margins": 1.1069910526275635, "rewards/rejected": 0.4484585225582123, "step": 651 }, { "epoch": 0.47634703196347034, "grad_norm": 58.460780372801004, "learning_rate": 4.994498501492474e-07, "logits/chosen": -2.5734479427337646, "logits/rejected": -1.6069767475128174, "logps/chosen": -570.4550170898438, "logps/rejected": -455.18743896484375, "loss": 0.3261, "rewards/accuracies": 0.625, "rewards/chosen": 2.702605724334717, "rewards/margins": 2.8485920429229736, "rewards/rejected": -0.1459864377975464, "step": 652 }, { "epoch": 0.47707762557077626, "grad_norm": 59.28529407660891, "learning_rate": 4.994392234560328e-07, "logits/chosen": -2.6892426013946533, "logits/rejected": -2.3139734268188477, "logps/chosen": -546.4810791015625, "logps/rejected": -508.32781982421875, "loss": 0.4174, "rewards/accuracies": 0.75, "rewards/chosen": 2.0443496704101562, "rewards/margins": 1.7990086078643799, "rewards/rejected": 0.24534112215042114, "step": 653 }, { "epoch": 0.4778082191780822, "grad_norm": 51.890709688277774, "learning_rate": 4.99428495224869e-07, "logits/chosen": -2.395095109939575, "logits/rejected": -2.6108903884887695, "logps/chosen": -439.1127014160156, "logps/rejected": -569.15673828125, "loss": 0.3568, "rewards/accuracies": 0.625, "rewards/chosen": 0.9786694049835205, "rewards/margins": 0.4140203595161438, "rewards/rejected": 0.5646490454673767, "step": 654 }, { "epoch": 0.4785388127853881, "grad_norm": 61.461731456197604, "learning_rate": 4.994176654601229e-07, "logits/chosen": -3.101269483566284, "logits/rejected": -2.476799964904785, "logps/chosen": -771.1871948242188, "logps/rejected": -629.5882568359375, "loss": 0.3998, "rewards/accuracies": 1.0, "rewards/chosen": 2.1343512535095215, "rewards/margins": 1.5705294609069824, "rewards/rejected": 0.5638217926025391, "step": 655 }, { "epoch": 0.47926940639269405, "grad_norm": 64.89785393456782, "learning_rate": 4.994067341662029e-07, "logits/chosen": -3.2588000297546387, "logits/rejected": -2.134106397628784, "logps/chosen": -604.602783203125, "logps/rejected": -480.8910217285156, "loss": 0.4316, "rewards/accuracies": 1.0, "rewards/chosen": 2.3147506713867188, "rewards/margins": 2.2555110454559326, "rewards/rejected": 0.05923956632614136, "step": 656 }, { "epoch": 0.48, "grad_norm": 74.46975319299105, "learning_rate": 4.99395701347559e-07, "logits/chosen": -3.022524833679199, "logits/rejected": -2.323240041732788, "logps/chosen": -782.1650390625, "logps/rejected": -591.9097900390625, "loss": 0.4287, "rewards/accuracies": 0.75, "rewards/chosen": 2.3131160736083984, "rewards/margins": 1.3065019845962524, "rewards/rejected": 1.006614089012146, "step": 657 }, { "epoch": 0.4807305936073059, "grad_norm": 106.74347261220473, "learning_rate": 4.993845670086822e-07, "logits/chosen": -2.424537420272827, "logits/rejected": -2.393238067626953, "logps/chosen": -598.92138671875, "logps/rejected": -617.671142578125, "loss": 0.7126, "rewards/accuracies": 0.5, "rewards/chosen": 1.840031623840332, "rewards/margins": 0.5145000219345093, "rewards/rejected": 1.3255316019058228, "step": 658 }, { "epoch": 0.4814611872146119, "grad_norm": 69.8477907108557, "learning_rate": 4.993733311541046e-07, "logits/chosen": -2.817996025085449, "logits/rejected": -2.230415105819702, "logps/chosen": -310.7510986328125, "logps/rejected": -243.648193359375, "loss": 0.4577, "rewards/accuracies": 0.625, "rewards/chosen": 2.202529191970825, "rewards/margins": 2.624565601348877, "rewards/rejected": -0.42203646898269653, "step": 659 }, { "epoch": 0.4821917808219178, "grad_norm": 64.03547448241092, "learning_rate": 4.993619937884003e-07, "logits/chosen": -3.2275004386901855, "logits/rejected": -2.3036844730377197, "logps/chosen": -692.482666015625, "logps/rejected": -400.3083190917969, "loss": 0.3852, "rewards/accuracies": 0.875, "rewards/chosen": 1.4823535680770874, "rewards/margins": 1.4083677530288696, "rewards/rejected": 0.07398581504821777, "step": 660 }, { "epoch": 0.48292237442922376, "grad_norm": 71.83031899248742, "learning_rate": 4.993505549161841e-07, "logits/chosen": -3.073948383331299, "logits/rejected": -1.7053130865097046, "logps/chosen": -517.885986328125, "logps/rejected": -326.15576171875, "loss": 0.482, "rewards/accuracies": 0.75, "rewards/chosen": 1.3933522701263428, "rewards/margins": 1.5660439729690552, "rewards/rejected": -0.17269183695316315, "step": 661 }, { "epoch": 0.4836529680365297, "grad_norm": 49.2295697432892, "learning_rate": 4.993390145421125e-07, "logits/chosen": -2.7345662117004395, "logits/rejected": -2.100966215133667, "logps/chosen": -697.0243530273438, "logps/rejected": -525.996826171875, "loss": 0.2981, "rewards/accuracies": 0.875, "rewards/chosen": 2.3863487243652344, "rewards/margins": 2.601280689239502, "rewards/rejected": -0.2149316966533661, "step": 662 }, { "epoch": 0.4843835616438356, "grad_norm": 48.71938141027655, "learning_rate": 4.993273726708831e-07, "logits/chosen": -2.5187175273895264, "logits/rejected": -1.7097837924957275, "logps/chosen": -556.0059814453125, "logps/rejected": -391.9511413574219, "loss": 0.3401, "rewards/accuracies": 0.75, "rewards/chosen": 2.2369446754455566, "rewards/margins": 2.5246315002441406, "rewards/rejected": -0.2876867651939392, "step": 663 }, { "epoch": 0.48511415525114154, "grad_norm": 82.7610954844198, "learning_rate": 4.99315629307235e-07, "logits/chosen": -2.3466503620147705, "logits/rejected": -2.351799964904785, "logps/chosen": -615.45751953125, "logps/rejected": -582.3365478515625, "loss": 0.4623, "rewards/accuracies": 0.875, "rewards/chosen": 1.0728135108947754, "rewards/margins": 1.2943874597549438, "rewards/rejected": -0.22157400846481323, "step": 664 }, { "epoch": 0.4858447488584475, "grad_norm": 44.3930833532587, "learning_rate": 4.993037844559484e-07, "logits/chosen": -2.927938461303711, "logits/rejected": -2.170163631439209, "logps/chosen": -570.9603271484375, "logps/rejected": -354.3087158203125, "loss": 0.3032, "rewards/accuracies": 0.875, "rewards/chosen": 2.151205539703369, "rewards/margins": 2.290184497833252, "rewards/rejected": -0.13897880911827087, "step": 665 }, { "epoch": 0.4865753424657534, "grad_norm": 45.42057424178084, "learning_rate": 4.99291838121845e-07, "logits/chosen": -2.3883233070373535, "logits/rejected": -2.2880072593688965, "logps/chosen": -241.11923217773438, "logps/rejected": -326.94598388671875, "loss": 0.3447, "rewards/accuracies": 0.625, "rewards/chosen": 1.0788002014160156, "rewards/margins": 1.6609965562820435, "rewards/rejected": -0.5821963548660278, "step": 666 }, { "epoch": 0.4873059360730594, "grad_norm": 51.94222160021291, "learning_rate": 4.992797903097878e-07, "logits/chosen": -3.0444092750549316, "logits/rejected": -2.401963233947754, "logps/chosen": -732.2088012695312, "logps/rejected": -656.8184814453125, "loss": 0.321, "rewards/accuracies": 0.875, "rewards/chosen": 3.214362621307373, "rewards/margins": 2.1655125617980957, "rewards/rejected": 1.0488500595092773, "step": 667 }, { "epoch": 0.4880365296803653, "grad_norm": 55.497897271540836, "learning_rate": 4.992676410246807e-07, "logits/chosen": -2.1965994834899902, "logits/rejected": -2.1733622550964355, "logps/chosen": -747.3160400390625, "logps/rejected": -794.5643920898438, "loss": 0.4141, "rewards/accuracies": 0.625, "rewards/chosen": 1.7742165327072144, "rewards/margins": 1.0177866220474243, "rewards/rejected": 0.7564297914505005, "step": 668 }, { "epoch": 0.48876712328767125, "grad_norm": 55.302223621659195, "learning_rate": 4.992553902714696e-07, "logits/chosen": -2.893794536590576, "logits/rejected": -2.2159926891326904, "logps/chosen": -474.8892822265625, "logps/rejected": -473.0904541015625, "loss": 0.3671, "rewards/accuracies": 0.875, "rewards/chosen": 1.4313790798187256, "rewards/margins": 0.706089437007904, "rewards/rejected": 0.7252896428108215, "step": 669 }, { "epoch": 0.4894977168949772, "grad_norm": 83.7091639015332, "learning_rate": 4.992430380551412e-07, "logits/chosen": -2.7175443172454834, "logits/rejected": -2.17598819732666, "logps/chosen": -580.0443725585938, "logps/rejected": -560.5264892578125, "loss": 0.473, "rewards/accuracies": 0.75, "rewards/chosen": 1.9875893592834473, "rewards/margins": 2.1015734672546387, "rewards/rejected": -0.11398419737815857, "step": 670 }, { "epoch": 0.4902283105022831, "grad_norm": 54.186806151843825, "learning_rate": 4.992305843807238e-07, "logits/chosen": -2.9946682453155518, "logits/rejected": -1.530630350112915, "logps/chosen": -656.2801513671875, "logps/rejected": -307.3781433105469, "loss": 0.2853, "rewards/accuracies": 0.875, "rewards/chosen": 2.4924519062042236, "rewards/margins": 2.912248134613037, "rewards/rejected": -0.4197964072227478, "step": 671 }, { "epoch": 0.49095890410958903, "grad_norm": 66.75333193346044, "learning_rate": 4.992180292532867e-07, "logits/chosen": -2.531186580657959, "logits/rejected": -2.0155584812164307, "logps/chosen": -733.8284912109375, "logps/rejected": -473.1738586425781, "loss": 0.4264, "rewards/accuracies": 0.75, "rewards/chosen": 2.405531883239746, "rewards/margins": 1.8002891540527344, "rewards/rejected": 0.6052429676055908, "step": 672 }, { "epoch": 0.49168949771689496, "grad_norm": 45.57274672587362, "learning_rate": 4.992053726779406e-07, "logits/chosen": -2.7025516033172607, "logits/rejected": -1.9426929950714111, "logps/chosen": -820.6594848632812, "logps/rejected": -484.008056640625, "loss": 0.3105, "rewards/accuracies": 1.0, "rewards/chosen": 2.5598998069763184, "rewards/margins": 2.560783624649048, "rewards/rejected": -0.0008834749460220337, "step": 673 }, { "epoch": 0.4924200913242009, "grad_norm": 50.57973719240863, "learning_rate": 4.991926146598377e-07, "logits/chosen": -3.0407042503356934, "logits/rejected": -2.2719271183013916, "logps/chosen": -967.517822265625, "logps/rejected": -759.638916015625, "loss": 0.3849, "rewards/accuracies": 0.625, "rewards/chosen": 2.298307180404663, "rewards/margins": 1.2936599254608154, "rewards/rejected": 1.0046473741531372, "step": 674 }, { "epoch": 0.4931506849315068, "grad_norm": 57.667340989479264, "learning_rate": 4.991797552041714e-07, "logits/chosen": -2.9054083824157715, "logits/rejected": -2.281726837158203, "logps/chosen": -514.969482421875, "logps/rejected": -448.0055236816406, "loss": 0.3934, "rewards/accuracies": 1.0, "rewards/chosen": 1.859304428100586, "rewards/margins": 1.8647969961166382, "rewards/rejected": -0.005492553114891052, "step": 675 }, { "epoch": 0.4938812785388128, "grad_norm": 75.95232301314918, "learning_rate": 4.991667943161762e-07, "logits/chosen": -2.1876349449157715, "logits/rejected": -1.9100770950317383, "logps/chosen": -446.6458435058594, "logps/rejected": -460.109375, "loss": 0.5084, "rewards/accuracies": 0.5, "rewards/chosen": 1.8983443975448608, "rewards/margins": 1.2131390571594238, "rewards/rejected": 0.6852054595947266, "step": 676 }, { "epoch": 0.49461187214611874, "grad_norm": 65.24192134495466, "learning_rate": 4.991537320011278e-07, "logits/chosen": -3.0526607036590576, "logits/rejected": -2.800122022628784, "logps/chosen": -527.0548706054688, "logps/rejected": -517.7200317382812, "loss": 0.4295, "rewards/accuracies": 0.75, "rewards/chosen": 0.9693746566772461, "rewards/margins": 0.05513861030340195, "rewards/rejected": 0.9142360687255859, "step": 677 }, { "epoch": 0.49534246575342467, "grad_norm": 88.08281071224255, "learning_rate": 4.99140568264344e-07, "logits/chosen": -3.232499837875366, "logits/rejected": -2.8083653450012207, "logps/chosen": -714.696044921875, "logps/rejected": -614.3226318359375, "loss": 0.5685, "rewards/accuracies": 0.625, "rewards/chosen": 1.5221689939498901, "rewards/margins": 0.7635618448257446, "rewards/rejected": 0.7586071491241455, "step": 678 }, { "epoch": 0.4960730593607306, "grad_norm": 59.64037734098435, "learning_rate": 4.991273031111827e-07, "logits/chosen": -2.7157416343688965, "logits/rejected": -2.2941391468048096, "logps/chosen": -598.181396484375, "logps/rejected": -517.5470581054688, "loss": 0.3098, "rewards/accuracies": 0.625, "rewards/chosen": 2.6575205326080322, "rewards/margins": 2.5541279315948486, "rewards/rejected": 0.10339269042015076, "step": 679 }, { "epoch": 0.4968036529680365, "grad_norm": 79.26146222806673, "learning_rate": 4.99113936547044e-07, "logits/chosen": -3.2583186626434326, "logits/rejected": -2.3259871006011963, "logps/chosen": -889.5570068359375, "logps/rejected": -615.746337890625, "loss": 0.5631, "rewards/accuracies": 0.75, "rewards/chosen": 2.218979597091675, "rewards/margins": 1.2991564273834229, "rewards/rejected": 0.9198230504989624, "step": 680 }, { "epoch": 0.49753424657534245, "grad_norm": 52.100385750131444, "learning_rate": 4.991004685773689e-07, "logits/chosen": -2.7510104179382324, "logits/rejected": -2.297636032104492, "logps/chosen": -917.426025390625, "logps/rejected": -720.970703125, "loss": 0.2741, "rewards/accuracies": 0.875, "rewards/chosen": 2.5083069801330566, "rewards/margins": 1.8775041103363037, "rewards/rejected": 0.6308026909828186, "step": 681 }, { "epoch": 0.4982648401826484, "grad_norm": 63.267975073664836, "learning_rate": 4.990868992076397e-07, "logits/chosen": -3.2310922145843506, "logits/rejected": -2.403191566467285, "logps/chosen": -499.22930908203125, "logps/rejected": -366.06134033203125, "loss": 0.3997, "rewards/accuracies": 0.75, "rewards/chosen": 1.9051976203918457, "rewards/margins": 2.217838764190674, "rewards/rejected": -0.3126411437988281, "step": 682 }, { "epoch": 0.4989954337899543, "grad_norm": 48.471950954102056, "learning_rate": 4.9907322844338e-07, "logits/chosen": -3.0176196098327637, "logits/rejected": -2.8720152378082275, "logps/chosen": -496.8078308105469, "logps/rejected": -563.54345703125, "loss": 0.372, "rewards/accuracies": 1.0, "rewards/chosen": 1.4001811742782593, "rewards/margins": 1.737656831741333, "rewards/rejected": -0.3374756872653961, "step": 683 }, { "epoch": 0.4997260273972603, "grad_norm": 61.18406048810834, "learning_rate": 4.990594562901547e-07, "logits/chosen": -2.8007116317749023, "logits/rejected": -2.3924050331115723, "logps/chosen": -665.411865234375, "logps/rejected": -690.019775390625, "loss": 0.4072, "rewards/accuracies": 0.625, "rewards/chosen": 1.6463418006896973, "rewards/margins": 0.8677330017089844, "rewards/rejected": 0.7786086797714233, "step": 684 }, { "epoch": 0.5004566210045662, "grad_norm": 62.864487969411236, "learning_rate": 4.990455827535701e-07, "logits/chosen": -2.5245678424835205, "logits/rejected": -1.705889344215393, "logps/chosen": -683.3610229492188, "logps/rejected": -506.05645751953125, "loss": 0.3723, "rewards/accuracies": 0.75, "rewards/chosen": 2.268611431121826, "rewards/margins": 1.6788921356201172, "rewards/rejected": 0.5897192358970642, "step": 685 }, { "epoch": 0.5011872146118721, "grad_norm": 47.02457645758891, "learning_rate": 4.990316078392735e-07, "logits/chosen": -3.0314388275146484, "logits/rejected": -1.753129482269287, "logps/chosen": -392.0046691894531, "logps/rejected": -257.55670166015625, "loss": 0.2817, "rewards/accuracies": 0.625, "rewards/chosen": 1.984032154083252, "rewards/margins": 2.7014102935791016, "rewards/rejected": -0.7173779010772705, "step": 686 }, { "epoch": 0.5019178082191781, "grad_norm": 66.00221572393423, "learning_rate": 4.990175315529536e-07, "logits/chosen": -3.048623561859131, "logits/rejected": -3.134315252304077, "logps/chosen": -853.0582275390625, "logps/rejected": -907.3509521484375, "loss": 0.3824, "rewards/accuracies": 0.625, "rewards/chosen": 1.5395417213439941, "rewards/margins": 0.8481966257095337, "rewards/rejected": 0.6913449764251709, "step": 687 }, { "epoch": 0.5026484018264841, "grad_norm": 62.264393808404854, "learning_rate": 4.990033539003402e-07, "logits/chosen": -3.0368189811706543, "logits/rejected": -2.068464994430542, "logps/chosen": -704.9039306640625, "logps/rejected": -523.1248779296875, "loss": 0.4471, "rewards/accuracies": 0.875, "rewards/chosen": 1.6399203538894653, "rewards/margins": 1.6637486219406128, "rewards/rejected": -0.023828215897083282, "step": 688 }, { "epoch": 0.5033789954337899, "grad_norm": 67.82261949264235, "learning_rate": 4.989890748872048e-07, "logits/chosen": -3.243846893310547, "logits/rejected": -2.15824031829834, "logps/chosen": -607.2335205078125, "logps/rejected": -522.060546875, "loss": 0.3537, "rewards/accuracies": 0.875, "rewards/chosen": 2.5871522426605225, "rewards/margins": 2.4480643272399902, "rewards/rejected": 0.13908809423446655, "step": 689 }, { "epoch": 0.5041095890410959, "grad_norm": 54.06369669918299, "learning_rate": 4.9897469451936e-07, "logits/chosen": -2.3253073692321777, "logits/rejected": -1.598310947418213, "logps/chosen": -870.7001953125, "logps/rejected": -430.53521728515625, "loss": 0.2877, "rewards/accuracies": 0.875, "rewards/chosen": 2.6968178749084473, "rewards/margins": 2.784642219543457, "rewards/rejected": -0.0878242552280426, "step": 690 }, { "epoch": 0.5048401826484018, "grad_norm": 70.04859501847628, "learning_rate": 4.98960212802659e-07, "logits/chosen": -2.0621371269226074, "logits/rejected": -2.3057374954223633, "logps/chosen": -352.912109375, "logps/rejected": -551.1044921875, "loss": 0.4937, "rewards/accuracies": 0.75, "rewards/chosen": 0.7721143364906311, "rewards/margins": 0.6733249425888062, "rewards/rejected": 0.09878939390182495, "step": 691 }, { "epoch": 0.5055707762557078, "grad_norm": 62.62557501919144, "learning_rate": 4.989456297429973e-07, "logits/chosen": -2.3478310108184814, "logits/rejected": -2.1895151138305664, "logps/chosen": -546.9484252929688, "logps/rejected": -497.86956787109375, "loss": 0.3903, "rewards/accuracies": 0.875, "rewards/chosen": 0.9560618996620178, "rewards/margins": 0.9986319541931152, "rewards/rejected": -0.04257005453109741, "step": 692 }, { "epoch": 0.5063013698630137, "grad_norm": 43.882792332423705, "learning_rate": 4.989309453463109e-07, "logits/chosen": -2.9811275005340576, "logits/rejected": -2.3522427082061768, "logps/chosen": -769.59716796875, "logps/rejected": -702.57373046875, "loss": 0.2795, "rewards/accuracies": 1.0, "rewards/chosen": 3.320807695388794, "rewards/margins": 3.065560817718506, "rewards/rejected": 0.2552468776702881, "step": 693 }, { "epoch": 0.5070319634703196, "grad_norm": 68.52307007725574, "learning_rate": 4.989161596185774e-07, "logits/chosen": -2.743436336517334, "logits/rejected": -2.00535249710083, "logps/chosen": -653.6000366210938, "logps/rejected": -514.283447265625, "loss": 0.4049, "rewards/accuracies": 1.0, "rewards/chosen": 2.170651435852051, "rewards/margins": 1.9787465333938599, "rewards/rejected": 0.19190475344657898, "step": 694 }, { "epoch": 0.5077625570776255, "grad_norm": 55.89216039615932, "learning_rate": 4.989012725658156e-07, "logits/chosen": -2.779451370239258, "logits/rejected": -2.015652656555176, "logps/chosen": -560.25927734375, "logps/rejected": -464.2225646972656, "loss": 0.338, "rewards/accuracies": 0.75, "rewards/chosen": 2.766308307647705, "rewards/margins": 2.251718521118164, "rewards/rejected": 0.5145897269248962, "step": 695 }, { "epoch": 0.5084931506849315, "grad_norm": 57.497959797364764, "learning_rate": 4.988862841940853e-07, "logits/chosen": -3.165987253189087, "logits/rejected": -2.343754768371582, "logps/chosen": -831.4325561523438, "logps/rejected": -612.0936279296875, "loss": 0.3711, "rewards/accuracies": 0.875, "rewards/chosen": 1.6861398220062256, "rewards/margins": 1.3520946502685547, "rewards/rejected": 0.3340449929237366, "step": 696 }, { "epoch": 0.5092237442922375, "grad_norm": 58.79872896027388, "learning_rate": 4.98871194509488e-07, "logits/chosen": -2.349344491958618, "logits/rejected": -1.646880865097046, "logps/chosen": -529.1026611328125, "logps/rejected": -454.6851806640625, "loss": 0.412, "rewards/accuracies": 1.0, "rewards/chosen": 1.7446489334106445, "rewards/margins": 1.329231858253479, "rewards/rejected": 0.4154170751571655, "step": 697 }, { "epoch": 0.5099543378995434, "grad_norm": 62.51873156802687, "learning_rate": 4.988560035181659e-07, "logits/chosen": -3.4095678329467773, "logits/rejected": -2.208767890930176, "logps/chosen": -848.5492553710938, "logps/rejected": -480.26849365234375, "loss": 0.4069, "rewards/accuracies": 0.875, "rewards/chosen": 2.0139410495758057, "rewards/margins": 1.758241057395935, "rewards/rejected": 0.2557000517845154, "step": 698 }, { "epoch": 0.5106849315068493, "grad_norm": 55.00482489490127, "learning_rate": 4.988407112263029e-07, "logits/chosen": -2.697870969772339, "logits/rejected": -2.531118869781494, "logps/chosen": -477.95556640625, "logps/rejected": -545.5894775390625, "loss": 0.348, "rewards/accuracies": 0.625, "rewards/chosen": 1.3472800254821777, "rewards/margins": 1.3426506519317627, "rewards/rejected": 0.0046293288469314575, "step": 699 }, { "epoch": 0.5114155251141552, "grad_norm": 91.79908640953352, "learning_rate": 4.98825317640124e-07, "logits/chosen": -3.0684542655944824, "logits/rejected": -2.4955148696899414, "logps/chosen": -552.1353149414062, "logps/rejected": -525.0772705078125, "loss": 0.5571, "rewards/accuracies": 0.875, "rewards/chosen": 2.4786059856414795, "rewards/margins": 2.2422902584075928, "rewards/rejected": 0.23631566762924194, "step": 700 }, { "epoch": 0.5121461187214612, "grad_norm": 53.175313949913594, "learning_rate": 4.988098227658952e-07, "logits/chosen": -2.922109603881836, "logits/rejected": -1.8557407855987549, "logps/chosen": -804.3439331054688, "logps/rejected": -436.73492431640625, "loss": 0.3116, "rewards/accuracies": 0.875, "rewards/chosen": 1.7327855825424194, "rewards/margins": 1.4393359422683716, "rewards/rejected": 0.2934498190879822, "step": 701 }, { "epoch": 0.5128767123287671, "grad_norm": 63.57584591331641, "learning_rate": 4.987942266099241e-07, "logits/chosen": -2.782912254333496, "logits/rejected": -1.801133155822754, "logps/chosen": -513.3269653320312, "logps/rejected": -496.19635009765625, "loss": 0.4409, "rewards/accuracies": 0.75, "rewards/chosen": 1.6996909379959106, "rewards/margins": 1.4173715114593506, "rewards/rejected": 0.28231945633888245, "step": 702 }, { "epoch": 0.5136073059360731, "grad_norm": 64.84530871620552, "learning_rate": 4.987785291785592e-07, "logits/chosen": -2.9239163398742676, "logits/rejected": -2.516294240951538, "logps/chosen": -538.3204345703125, "logps/rejected": -436.462890625, "loss": 0.3933, "rewards/accuracies": 0.75, "rewards/chosen": 2.2442262172698975, "rewards/margins": 2.2744669914245605, "rewards/rejected": -0.030240634456276894, "step": 703 }, { "epoch": 0.5143378995433789, "grad_norm": 53.25620745802784, "learning_rate": 4.987627304781905e-07, "logits/chosen": -2.885317802429199, "logits/rejected": -2.1893012523651123, "logps/chosen": -958.2396850585938, "logps/rejected": -850.3572998046875, "loss": 0.281, "rewards/accuracies": 0.875, "rewards/chosen": 2.5810787677764893, "rewards/margins": 1.9327813386917114, "rewards/rejected": 0.6482973694801331, "step": 704 }, { "epoch": 0.5150684931506849, "grad_norm": 67.38253668421811, "learning_rate": 4.987468305152491e-07, "logits/chosen": -3.183941602706909, "logits/rejected": -2.4263200759887695, "logps/chosen": -729.0610961914062, "logps/rejected": -543.1806640625, "loss": 0.435, "rewards/accuracies": 0.875, "rewards/chosen": 2.358060359954834, "rewards/margins": 2.4904026985168457, "rewards/rejected": -0.132342129945755, "step": 705 }, { "epoch": 0.5157990867579909, "grad_norm": 43.049407253105706, "learning_rate": 4.987308292962072e-07, "logits/chosen": -3.1218321323394775, "logits/rejected": -2.1764938831329346, "logps/chosen": -546.802978515625, "logps/rejected": -489.6099853515625, "loss": 0.2714, "rewards/accuracies": 0.875, "rewards/chosen": 1.0376182794570923, "rewards/margins": 1.172997236251831, "rewards/rejected": -0.13537894189357758, "step": 706 }, { "epoch": 0.5165296803652968, "grad_norm": 42.47518519585141, "learning_rate": 4.987147268275784e-07, "logits/chosen": -3.082568645477295, "logits/rejected": -1.8957860469818115, "logps/chosen": -592.4110717773438, "logps/rejected": -305.6651611328125, "loss": 0.2781, "rewards/accuracies": 0.75, "rewards/chosen": 2.090646505355835, "rewards/margins": 2.180830955505371, "rewards/rejected": -0.09018459916114807, "step": 707 }, { "epoch": 0.5172602739726028, "grad_norm": 44.965138134569735, "learning_rate": 4.986985231159174e-07, "logits/chosen": -2.9856784343719482, "logits/rejected": -2.4954752922058105, "logps/chosen": -693.0319213867188, "logps/rejected": -515.7711181640625, "loss": 0.2812, "rewards/accuracies": 1.0, "rewards/chosen": 2.28427791595459, "rewards/margins": 2.7750017642974854, "rewards/rejected": -0.49072378873825073, "step": 708 }, { "epoch": 0.5179908675799086, "grad_norm": 75.67829991566275, "learning_rate": 4.986822181678203e-07, "logits/chosen": -2.54217529296875, "logits/rejected": -2.495037078857422, "logps/chosen": -543.7750854492188, "logps/rejected": -516.1842041015625, "loss": 0.5188, "rewards/accuracies": 0.625, "rewards/chosen": 2.057208299636841, "rewards/margins": 1.7260394096374512, "rewards/rejected": 0.3311689794063568, "step": 709 }, { "epoch": 0.5187214611872146, "grad_norm": 68.53884833048153, "learning_rate": 4.986658119899241e-07, "logits/chosen": -2.7448678016662598, "logits/rejected": -2.3062853813171387, "logps/chosen": -661.581787109375, "logps/rejected": -517.2496948242188, "loss": 0.5092, "rewards/accuracies": 0.75, "rewards/chosen": 1.8311020135879517, "rewards/margins": 1.193498969078064, "rewards/rejected": 0.6376030445098877, "step": 710 }, { "epoch": 0.5194520547945205, "grad_norm": 76.01838542007454, "learning_rate": 4.986493045889073e-07, "logits/chosen": -2.8837504386901855, "logits/rejected": -1.9910000562667847, "logps/chosen": -990.3812255859375, "logps/rejected": -627.465576171875, "loss": 0.4315, "rewards/accuracies": 0.75, "rewards/chosen": 2.8839433193206787, "rewards/margins": 2.8799307346343994, "rewards/rejected": 0.004012584686279297, "step": 711 }, { "epoch": 0.5201826484018265, "grad_norm": 55.76344910296865, "learning_rate": 4.986326959714894e-07, "logits/chosen": -3.1064445972442627, "logits/rejected": -1.9377673864364624, "logps/chosen": -891.3203735351562, "logps/rejected": -640.5811767578125, "loss": 0.3445, "rewards/accuracies": 1.0, "rewards/chosen": 3.0580356121063232, "rewards/margins": 2.7225048542022705, "rewards/rejected": 0.33553051948547363, "step": 712 }, { "epoch": 0.5209132420091325, "grad_norm": 43.821295145204516, "learning_rate": 4.986159861444311e-07, "logits/chosen": -2.3385138511657715, "logits/rejected": -1.2331169843673706, "logps/chosen": -854.360595703125, "logps/rejected": -392.07080078125, "loss": 0.2737, "rewards/accuracies": 1.0, "rewards/chosen": 2.376574754714966, "rewards/margins": 2.6669058799743652, "rewards/rejected": -0.29033106565475464, "step": 713 }, { "epoch": 0.5216438356164383, "grad_norm": 67.00682830176713, "learning_rate": 4.985991751145348e-07, "logits/chosen": -2.6402242183685303, "logits/rejected": -2.2536566257476807, "logps/chosen": -586.9761962890625, "logps/rejected": -532.9592895507812, "loss": 0.4349, "rewards/accuracies": 0.75, "rewards/chosen": 1.9558756351470947, "rewards/margins": 2.242621660232544, "rewards/rejected": -0.2867460548877716, "step": 714 }, { "epoch": 0.5223744292237443, "grad_norm": 43.49580731411075, "learning_rate": 4.985822628886431e-07, "logits/chosen": -2.3336057662963867, "logits/rejected": -1.7654459476470947, "logps/chosen": -924.81787109375, "logps/rejected": -616.7224731445312, "loss": 0.2714, "rewards/accuracies": 1.0, "rewards/chosen": 3.1040866374969482, "rewards/margins": 3.3629910945892334, "rewards/rejected": -0.2589043974876404, "step": 715 }, { "epoch": 0.5231050228310502, "grad_norm": 58.808166019907475, "learning_rate": 4.985652494736408e-07, "logits/chosen": -2.616128921508789, "logits/rejected": -2.458024263381958, "logps/chosen": -578.622802734375, "logps/rejected": -561.5111083984375, "loss": 0.3497, "rewards/accuracies": 0.875, "rewards/chosen": 2.3501901626586914, "rewards/margins": 2.534317970275879, "rewards/rejected": -0.1841280460357666, "step": 716 }, { "epoch": 0.5238356164383562, "grad_norm": 64.59578341018089, "learning_rate": 4.985481348764533e-07, "logits/chosen": -3.1345152854919434, "logits/rejected": -2.505197048187256, "logps/chosen": -438.7666015625, "logps/rejected": -408.85430908203125, "loss": 0.3901, "rewards/accuracies": 0.875, "rewards/chosen": 0.9809331893920898, "rewards/margins": 1.459611177444458, "rewards/rejected": -0.4786779284477234, "step": 717 }, { "epoch": 0.5245662100456621, "grad_norm": 40.65041422615288, "learning_rate": 4.985309191040474e-07, "logits/chosen": -2.6642282009124756, "logits/rejected": -1.9513763189315796, "logps/chosen": -492.4386901855469, "logps/rejected": -406.8043212890625, "loss": 0.232, "rewards/accuracies": 0.875, "rewards/chosen": 2.0241687297821045, "rewards/margins": 2.0775811672210693, "rewards/rejected": -0.053412213921546936, "step": 718 }, { "epoch": 0.525296803652968, "grad_norm": 47.500292967846526, "learning_rate": 4.985136021634311e-07, "logits/chosen": -3.7518062591552734, "logits/rejected": -2.425427198410034, "logps/chosen": -1035.6978759765625, "logps/rejected": -672.13037109375, "loss": 0.2906, "rewards/accuracies": 0.875, "rewards/chosen": 3.990772247314453, "rewards/margins": 2.866631031036377, "rewards/rejected": 1.1241414546966553, "step": 719 }, { "epoch": 0.5260273972602739, "grad_norm": 89.80674237807816, "learning_rate": 4.984961840616533e-07, "logits/chosen": -2.6972382068634033, "logits/rejected": -2.663991689682007, "logps/chosen": -322.4483947753906, "logps/rejected": -390.0559997558594, "loss": 0.5272, "rewards/accuracies": 0.75, "rewards/chosen": 0.7615358829498291, "rewards/margins": 1.1532340049743652, "rewards/rejected": -0.3916982412338257, "step": 720 }, { "epoch": 0.5267579908675799, "grad_norm": 58.9803082207316, "learning_rate": 4.984786648058044e-07, "logits/chosen": -2.8463165760040283, "logits/rejected": -1.8484041690826416, "logps/chosen": -778.6373901367188, "logps/rejected": -430.32000732421875, "loss": 0.3179, "rewards/accuracies": 1.0, "rewards/chosen": 2.2294113636016846, "rewards/margins": 2.2477734088897705, "rewards/rejected": -0.018361859023571014, "step": 721 }, { "epoch": 0.5274885844748859, "grad_norm": 75.70392640184689, "learning_rate": 4.984610444030161e-07, "logits/chosen": -3.4968178272247314, "logits/rejected": -2.5908620357513428, "logps/chosen": -915.0643310546875, "logps/rejected": -694.2031860351562, "loss": 0.4227, "rewards/accuracies": 1.0, "rewards/chosen": 3.6770124435424805, "rewards/margins": 2.622805118560791, "rewards/rejected": 1.0542072057724, "step": 722 }, { "epoch": 0.5282191780821918, "grad_norm": 60.306029040817094, "learning_rate": 4.984433228604606e-07, "logits/chosen": -2.572523355484009, "logits/rejected": -2.1991493701934814, "logps/chosen": -749.986083984375, "logps/rejected": -710.8793334960938, "loss": 0.3899, "rewards/accuracies": 0.75, "rewards/chosen": 2.9987847805023193, "rewards/margins": 2.004275321960449, "rewards/rejected": 0.9945094585418701, "step": 723 }, { "epoch": 0.5289497716894977, "grad_norm": 66.47634903083014, "learning_rate": 4.984255001853521e-07, "logits/chosen": -2.4876108169555664, "logits/rejected": -2.298398494720459, "logps/chosen": -429.3870544433594, "logps/rejected": -484.5883483886719, "loss": 0.4478, "rewards/accuracies": 0.875, "rewards/chosen": 1.7373844385147095, "rewards/margins": 2.0269534587860107, "rewards/rejected": -0.28956910967826843, "step": 724 }, { "epoch": 0.5296803652968036, "grad_norm": 58.58778719659713, "learning_rate": 4.984075763849455e-07, "logits/chosen": -2.7096052169799805, "logits/rejected": -2.05830454826355, "logps/chosen": -645.323974609375, "logps/rejected": -475.5256652832031, "loss": 0.3833, "rewards/accuracies": 1.0, "rewards/chosen": 1.8020570278167725, "rewards/margins": 1.2093662023544312, "rewards/rejected": 0.5926908254623413, "step": 725 }, { "epoch": 0.5304109589041096, "grad_norm": 63.04262880682404, "learning_rate": 4.983895514665368e-07, "logits/chosen": -2.6329314708709717, "logits/rejected": -2.513674736022949, "logps/chosen": -800.3803100585938, "logps/rejected": -885.1229248046875, "loss": 0.449, "rewards/accuracies": 0.875, "rewards/chosen": 1.9676408767700195, "rewards/margins": 0.76532381772995, "rewards/rejected": 1.2023171186447144, "step": 726 }, { "epoch": 0.5311415525114155, "grad_norm": 57.12010201923225, "learning_rate": 4.983714254374635e-07, "logits/chosen": -2.494145393371582, "logits/rejected": -2.9830682277679443, "logps/chosen": -354.399658203125, "logps/rejected": -450.04705810546875, "loss": 0.3864, "rewards/accuracies": 1.0, "rewards/chosen": 1.2677191495895386, "rewards/margins": 2.3082501888275146, "rewards/rejected": -1.0405309200286865, "step": 727 }, { "epoch": 0.5318721461187215, "grad_norm": 79.99357884386721, "learning_rate": 4.983531983051039e-07, "logits/chosen": -2.91999888420105, "logits/rejected": -2.4263317584991455, "logps/chosen": -718.536376953125, "logps/rejected": -530.0726318359375, "loss": 0.5762, "rewards/accuracies": 0.625, "rewards/chosen": 1.223973035812378, "rewards/margins": 0.005087375640869141, "rewards/rejected": 1.2188856601715088, "step": 728 }, { "epoch": 0.5326027397260275, "grad_norm": 59.736485952106776, "learning_rate": 4.983348700768778e-07, "logits/chosen": -2.9979665279388428, "logits/rejected": -1.480710506439209, "logps/chosen": -445.0993957519531, "logps/rejected": -220.96548461914062, "loss": 0.3736, "rewards/accuracies": 0.625, "rewards/chosen": 2.508063316345215, "rewards/margins": 2.9726269245147705, "rewards/rejected": -0.46456360816955566, "step": 729 }, { "epoch": 0.5333333333333333, "grad_norm": 70.95194649153397, "learning_rate": 4.983164407602457e-07, "logits/chosen": -3.3719263076782227, "logits/rejected": -2.2814676761627197, "logps/chosen": -551.8487548828125, "logps/rejected": -383.01959228515625, "loss": 0.4544, "rewards/accuracies": 0.75, "rewards/chosen": 2.2033958435058594, "rewards/margins": 2.444638252258301, "rewards/rejected": -0.24124258756637573, "step": 730 }, { "epoch": 0.5340639269406393, "grad_norm": 92.54369744882685, "learning_rate": 4.9829791036271e-07, "logits/chosen": -2.482142925262451, "logits/rejected": -1.704419493675232, "logps/chosen": -541.0580444335938, "logps/rejected": -395.3839111328125, "loss": 0.653, "rewards/accuracies": 0.75, "rewards/chosen": 2.4058494567871094, "rewards/margins": 2.499894618988037, "rewards/rejected": -0.09404518455266953, "step": 731 }, { "epoch": 0.5347945205479452, "grad_norm": 65.36609269505674, "learning_rate": 4.982792788918132e-07, "logits/chosen": -2.1870532035827637, "logits/rejected": -1.8728561401367188, "logps/chosen": -670.11279296875, "logps/rejected": -562.2532348632812, "loss": 0.3764, "rewards/accuracies": 0.75, "rewards/chosen": 2.6851470470428467, "rewards/margins": 2.2377068996429443, "rewards/rejected": 0.4474402368068695, "step": 732 }, { "epoch": 0.5355251141552512, "grad_norm": 55.74239280358128, "learning_rate": 4.9826054635514e-07, "logits/chosen": -2.560948610305786, "logits/rejected": -2.19946551322937, "logps/chosen": -524.7435302734375, "logps/rejected": -426.0478820800781, "loss": 0.3436, "rewards/accuracies": 0.875, "rewards/chosen": 2.370410919189453, "rewards/margins": 1.9386042356491089, "rewards/rejected": 0.4318068027496338, "step": 733 }, { "epoch": 0.536255707762557, "grad_norm": 58.76357680557213, "learning_rate": 4.982417127603156e-07, "logits/chosen": -3.0960397720336914, "logits/rejected": -1.904831886291504, "logps/chosen": -695.7511596679688, "logps/rejected": -476.37017822265625, "loss": 0.3857, "rewards/accuracies": 0.875, "rewards/chosen": 2.439004421234131, "rewards/margins": 1.694627285003662, "rewards/rejected": 0.7443771958351135, "step": 734 }, { "epoch": 0.536986301369863, "grad_norm": 63.3886915837764, "learning_rate": 4.982227781150063e-07, "logits/chosen": -2.7353904247283936, "logits/rejected": -2.4885146617889404, "logps/chosen": -523.6363525390625, "logps/rejected": -383.3025817871094, "loss": 0.4012, "rewards/accuracies": 0.875, "rewards/chosen": 2.0146117210388184, "rewards/margins": 2.0402517318725586, "rewards/rejected": -0.02563995122909546, "step": 735 }, { "epoch": 0.5377168949771689, "grad_norm": 43.96325952970915, "learning_rate": 4.9820374242692e-07, "logits/chosen": -3.310778856277466, "logits/rejected": -1.9385361671447754, "logps/chosen": -741.7080688476562, "logps/rejected": -402.80938720703125, "loss": 0.2754, "rewards/accuracies": 0.875, "rewards/chosen": 1.5701738595962524, "rewards/margins": 1.6116327047348022, "rewards/rejected": -0.041458889842033386, "step": 736 }, { "epoch": 0.5384474885844749, "grad_norm": 90.070379925055, "learning_rate": 4.981846057038053e-07, "logits/chosen": -2.3762550354003906, "logits/rejected": -2.3832051753997803, "logps/chosen": -669.9444580078125, "logps/rejected": -688.2310791015625, "loss": 0.6175, "rewards/accuracies": 0.5, "rewards/chosen": 1.9843083620071411, "rewards/margins": 0.46337777376174927, "rewards/rejected": 1.5209306478500366, "step": 737 }, { "epoch": 0.5391780821917809, "grad_norm": 48.79188905443199, "learning_rate": 4.981653679534522e-07, "logits/chosen": -2.569489002227783, "logits/rejected": -1.5044106245040894, "logps/chosen": -436.1018981933594, "logps/rejected": -219.13919067382812, "loss": 0.2576, "rewards/accuracies": 1.0, "rewards/chosen": 2.397580623626709, "rewards/margins": 3.3083176612854004, "rewards/rejected": -0.9107370376586914, "step": 738 }, { "epoch": 0.5399086757990867, "grad_norm": 63.07352093365924, "learning_rate": 4.981460291836915e-07, "logits/chosen": -2.391631603240967, "logits/rejected": -2.144523859024048, "logps/chosen": -712.6326293945312, "logps/rejected": -726.1722412109375, "loss": 0.4341, "rewards/accuracies": 0.75, "rewards/chosen": 1.6651456356048584, "rewards/margins": 0.5980274677276611, "rewards/rejected": 1.0671181678771973, "step": 739 }, { "epoch": 0.5406392694063927, "grad_norm": 80.08336720646179, "learning_rate": 4.981265894023956e-07, "logits/chosen": -2.799913167953491, "logits/rejected": -2.0264251232147217, "logps/chosen": -876.603515625, "logps/rejected": -581.7454833984375, "loss": 0.4073, "rewards/accuracies": 0.75, "rewards/chosen": 3.233200788497925, "rewards/margins": 2.9132962226867676, "rewards/rejected": 0.3199045956134796, "step": 740 }, { "epoch": 0.5413698630136986, "grad_norm": 61.365711014223976, "learning_rate": 4.981070486174777e-07, "logits/chosen": -2.3722851276397705, "logits/rejected": -2.125239133834839, "logps/chosen": -420.75244140625, "logps/rejected": -442.16046142578125, "loss": 0.3626, "rewards/accuracies": 0.875, "rewards/chosen": 1.2177778482437134, "rewards/margins": 1.5780425071716309, "rewards/rejected": -0.36026477813720703, "step": 741 }, { "epoch": 0.5421004566210046, "grad_norm": 44.55065914741049, "learning_rate": 4.980874068368919e-07, "logits/chosen": -2.7163326740264893, "logits/rejected": -2.4566142559051514, "logps/chosen": -320.8462829589844, "logps/rejected": -332.20751953125, "loss": 0.2462, "rewards/accuracies": 1.0, "rewards/chosen": 1.5587822198867798, "rewards/margins": 3.18609881401062, "rewards/rejected": -1.6273164749145508, "step": 742 }, { "epoch": 0.5428310502283105, "grad_norm": 47.34499792084693, "learning_rate": 4.980676640686341e-07, "logits/chosen": -2.645965576171875, "logits/rejected": -1.777640700340271, "logps/chosen": -486.20916748046875, "logps/rejected": -370.06182861328125, "loss": 0.2647, "rewards/accuracies": 0.875, "rewards/chosen": 1.7794809341430664, "rewards/margins": 2.22446346282959, "rewards/rejected": -0.4449824392795563, "step": 743 }, { "epoch": 0.5435616438356164, "grad_norm": 88.95348316940012, "learning_rate": 4.980478203207406e-07, "logits/chosen": -3.1050491333007812, "logits/rejected": -2.156731367111206, "logps/chosen": -893.65185546875, "logps/rejected": -745.621337890625, "loss": 0.5279, "rewards/accuracies": 0.75, "rewards/chosen": 2.5581603050231934, "rewards/margins": 1.6195330619812012, "rewards/rejected": 0.9386273622512817, "step": 744 }, { "epoch": 0.5442922374429223, "grad_norm": 65.96669599478778, "learning_rate": 4.980278756012891e-07, "logits/chosen": -2.2545008659362793, "logits/rejected": -2.117027759552002, "logps/chosen": -399.6393737792969, "logps/rejected": -522.6390380859375, "loss": 0.3895, "rewards/accuracies": 1.0, "rewards/chosen": 1.9830918312072754, "rewards/margins": 3.176417827606201, "rewards/rejected": -1.1933261156082153, "step": 745 }, { "epoch": 0.5450228310502283, "grad_norm": 72.0716298151967, "learning_rate": 4.980078299183986e-07, "logits/chosen": -3.3488264083862305, "logits/rejected": -2.901836633682251, "logps/chosen": -648.9508056640625, "logps/rejected": -558.5449829101562, "loss": 0.4367, "rewards/accuracies": 1.0, "rewards/chosen": 1.5063773393630981, "rewards/margins": 1.4513276815414429, "rewards/rejected": 0.055049605667591095, "step": 746 }, { "epoch": 0.5457534246575343, "grad_norm": 61.76400635053356, "learning_rate": 4.979876832802288e-07, "logits/chosen": -2.8424649238586426, "logits/rejected": -2.7409698963165283, "logps/chosen": -979.5422973632812, "logps/rejected": -767.3148193359375, "loss": 0.383, "rewards/accuracies": 0.625, "rewards/chosen": 1.7871086597442627, "rewards/margins": 1.057106375694275, "rewards/rejected": 0.730002224445343, "step": 747 }, { "epoch": 0.5464840182648402, "grad_norm": 79.68593135710971, "learning_rate": 4.979674356949807e-07, "logits/chosen": -3.0795493125915527, "logits/rejected": -2.5811076164245605, "logps/chosen": -844.5624389648438, "logps/rejected": -590.18701171875, "loss": 0.4685, "rewards/accuracies": 0.75, "rewards/chosen": 2.6843113899230957, "rewards/margins": 2.78477144241333, "rewards/rejected": -0.10045984387397766, "step": 748 }, { "epoch": 0.5472146118721462, "grad_norm": 50.02716791201432, "learning_rate": 4.979470871708964e-07, "logits/chosen": -2.6591219902038574, "logits/rejected": -1.9364850521087646, "logps/chosen": -652.744140625, "logps/rejected": -556.7877197265625, "loss": 0.3326, "rewards/accuracies": 0.75, "rewards/chosen": 2.2267847061157227, "rewards/margins": 1.864954948425293, "rewards/rejected": 0.3618296980857849, "step": 749 }, { "epoch": 0.547945205479452, "grad_norm": 98.62016781571607, "learning_rate": 4.979266377162591e-07, "logits/chosen": -2.7657439708709717, "logits/rejected": -2.741521120071411, "logps/chosen": -712.2747802734375, "logps/rejected": -772.8214111328125, "loss": 0.6274, "rewards/accuracies": 0.875, "rewards/chosen": 1.6713769435882568, "rewards/margins": 0.5057920813560486, "rewards/rejected": 1.1655848026275635, "step": 750 }, { "epoch": 0.548675799086758, "grad_norm": 65.84681263641392, "learning_rate": 4.979060873393931e-07, "logits/chosen": -2.3323028087615967, "logits/rejected": -1.933318853378296, "logps/chosen": -418.6187438964844, "logps/rejected": -485.86395263671875, "loss": 0.3656, "rewards/accuracies": 1.0, "rewards/chosen": 1.9909813404083252, "rewards/margins": 2.466738224029541, "rewards/rejected": -0.47575700283050537, "step": 751 }, { "epoch": 0.5494063926940639, "grad_norm": 59.110806138503555, "learning_rate": 4.978854360486637e-07, "logits/chosen": -3.4872584342956543, "logits/rejected": -3.0722525119781494, "logps/chosen": -781.501220703125, "logps/rejected": -610.18408203125, "loss": 0.4499, "rewards/accuracies": 0.625, "rewards/chosen": 2.2609941959381104, "rewards/margins": 1.4027574062347412, "rewards/rejected": 0.8582368493080139, "step": 752 }, { "epoch": 0.5501369863013699, "grad_norm": 56.76425516169925, "learning_rate": 4.978646838524772e-07, "logits/chosen": -2.509979724884033, "logits/rejected": -2.4922378063201904, "logps/chosen": -576.8201293945312, "logps/rejected": -739.774658203125, "loss": 0.3236, "rewards/accuracies": 0.625, "rewards/chosen": 2.3929383754730225, "rewards/margins": 1.8728842735290527, "rewards/rejected": 0.5200542211532593, "step": 753 }, { "epoch": 0.5508675799086759, "grad_norm": 54.25126906637761, "learning_rate": 4.978438307592813e-07, "logits/chosen": -2.4555368423461914, "logits/rejected": -2.2759315967559814, "logps/chosen": -679.9061279296875, "logps/rejected": -650.1737670898438, "loss": 0.3229, "rewards/accuracies": 0.875, "rewards/chosen": 2.3135669231414795, "rewards/margins": 2.7465403079986572, "rewards/rejected": -0.4329734742641449, "step": 754 }, { "epoch": 0.5515981735159817, "grad_norm": 59.652486867297704, "learning_rate": 4.978228767775644e-07, "logits/chosen": -3.0227246284484863, "logits/rejected": -2.416206121444702, "logps/chosen": -765.9542846679688, "logps/rejected": -602.5970458984375, "loss": 0.4145, "rewards/accuracies": 1.0, "rewards/chosen": 2.6490907669067383, "rewards/margins": 1.8418231010437012, "rewards/rejected": 0.8072677850723267, "step": 755 }, { "epoch": 0.5523287671232877, "grad_norm": 61.32346917040612, "learning_rate": 4.978018219158561e-07, "logits/chosen": -2.488670825958252, "logits/rejected": -1.773406744003296, "logps/chosen": -586.6593017578125, "logps/rejected": -484.2906494140625, "loss": 0.4459, "rewards/accuracies": 0.75, "rewards/chosen": 1.1581497192382812, "rewards/margins": 0.989741325378418, "rewards/rejected": 0.16840840876102448, "step": 756 }, { "epoch": 0.5530593607305936, "grad_norm": 71.45145245965088, "learning_rate": 4.977806661827273e-07, "logits/chosen": -2.835677146911621, "logits/rejected": -2.6469290256500244, "logps/chosen": -429.6455383300781, "logps/rejected": -527.378173828125, "loss": 0.4262, "rewards/accuracies": 0.75, "rewards/chosen": 1.6902410984039307, "rewards/margins": 1.7211143970489502, "rewards/rejected": -0.03087332844734192, "step": 757 }, { "epoch": 0.5537899543378996, "grad_norm": 60.613288172850346, "learning_rate": 4.977594095867895e-07, "logits/chosen": -3.0664703845977783, "logits/rejected": -2.241203784942627, "logps/chosen": -811.828125, "logps/rejected": -731.1734008789062, "loss": 0.3435, "rewards/accuracies": 0.75, "rewards/chosen": 2.4761478900909424, "rewards/margins": 2.0416438579559326, "rewards/rejected": 0.434503972530365, "step": 758 }, { "epoch": 0.5545205479452054, "grad_norm": 65.39051112594625, "learning_rate": 4.977380521366959e-07, "logits/chosen": -2.470344066619873, "logits/rejected": -2.287436008453369, "logps/chosen": -585.5550537109375, "logps/rejected": -506.89080810546875, "loss": 0.4761, "rewards/accuracies": 0.625, "rewards/chosen": 1.504395604133606, "rewards/margins": 1.4259488582611084, "rewards/rejected": 0.07844698429107666, "step": 759 }, { "epoch": 0.5552511415525114, "grad_norm": 68.42266534425929, "learning_rate": 4.977165938411399e-07, "logits/chosen": -3.181112289428711, "logits/rejected": -2.7962284088134766, "logps/chosen": -500.6651611328125, "logps/rejected": -405.235107421875, "loss": 0.4801, "rewards/accuracies": 0.5, "rewards/chosen": 1.3587225675582886, "rewards/margins": 0.3970670700073242, "rewards/rejected": 0.9616554975509644, "step": 760 }, { "epoch": 0.5559817351598173, "grad_norm": 44.06787838407288, "learning_rate": 4.976950347088567e-07, "logits/chosen": -3.201005220413208, "logits/rejected": -2.678694248199463, "logps/chosen": -674.3121337890625, "logps/rejected": -720.3922119140625, "loss": 0.2492, "rewards/accuracies": 0.875, "rewards/chosen": 2.406352996826172, "rewards/margins": 2.3192174434661865, "rewards/rejected": 0.08713570237159729, "step": 761 }, { "epoch": 0.5567123287671233, "grad_norm": 64.87984688059922, "learning_rate": 4.976733747486221e-07, "logits/chosen": -2.9750590324401855, "logits/rejected": -2.4628524780273438, "logps/chosen": -751.0556030273438, "logps/rejected": -620.3228149414062, "loss": 0.4797, "rewards/accuracies": 0.875, "rewards/chosen": 1.9981248378753662, "rewards/margins": 2.0635054111480713, "rewards/rejected": -0.06538069248199463, "step": 762 }, { "epoch": 0.5574429223744293, "grad_norm": 57.51036761152288, "learning_rate": 4.976516139692534e-07, "logits/chosen": -2.844345808029175, "logits/rejected": -1.649272084236145, "logps/chosen": -615.671630859375, "logps/rejected": -391.29644775390625, "loss": 0.3316, "rewards/accuracies": 0.625, "rewards/chosen": 2.201388120651245, "rewards/margins": 2.775660514831543, "rewards/rejected": -0.5742721557617188, "step": 763 }, { "epoch": 0.5581735159817351, "grad_norm": 45.43783313151608, "learning_rate": 4.976297523796083e-07, "logits/chosen": -2.90948224067688, "logits/rejected": -1.5580767393112183, "logps/chosen": -683.991455078125, "logps/rejected": -286.7789001464844, "loss": 0.261, "rewards/accuracies": 0.875, "rewards/chosen": 2.587629556655884, "rewards/margins": 2.9721689224243164, "rewards/rejected": -0.38453930616378784, "step": 764 }, { "epoch": 0.5589041095890411, "grad_norm": 65.56615665446238, "learning_rate": 4.976077899885861e-07, "logits/chosen": -3.0475311279296875, "logits/rejected": -3.175508975982666, "logps/chosen": -612.1949462890625, "logps/rejected": -625.200439453125, "loss": 0.4109, "rewards/accuracies": 0.75, "rewards/chosen": 1.3856847286224365, "rewards/margins": 0.8186951875686646, "rewards/rejected": 0.566989541053772, "step": 765 }, { "epoch": 0.559634703196347, "grad_norm": 74.44593518537462, "learning_rate": 4.975857268051268e-07, "logits/chosen": -3.037381172180176, "logits/rejected": -2.6892406940460205, "logps/chosen": -948.7891845703125, "logps/rejected": -805.6862182617188, "loss": 0.4754, "rewards/accuracies": 0.75, "rewards/chosen": 2.7064735889434814, "rewards/margins": 0.5805097222328186, "rewards/rejected": 2.1259641647338867, "step": 766 }, { "epoch": 0.560365296803653, "grad_norm": 51.086163905485456, "learning_rate": 4.975635628382118e-07, "logits/chosen": -2.871654510498047, "logits/rejected": -2.4803547859191895, "logps/chosen": -662.204345703125, "logps/rejected": -640.7418212890625, "loss": 0.345, "rewards/accuracies": 0.875, "rewards/chosen": 3.1072425842285156, "rewards/margins": 1.9029158353805542, "rewards/rejected": 1.2043266296386719, "step": 767 }, { "epoch": 0.5610958904109589, "grad_norm": 52.08178617493716, "learning_rate": 4.975412980968629e-07, "logits/chosen": -2.5345656871795654, "logits/rejected": -2.1449477672576904, "logps/chosen": -667.8071899414062, "logps/rejected": -544.45849609375, "loss": 0.3045, "rewards/accuracies": 0.875, "rewards/chosen": 2.0689544677734375, "rewards/margins": 1.207513689994812, "rewards/rejected": 0.8614407777786255, "step": 768 }, { "epoch": 0.5618264840182648, "grad_norm": 64.95024387704406, "learning_rate": 4.975189325901436e-07, "logits/chosen": -2.824967384338379, "logits/rejected": -1.8837230205535889, "logps/chosen": -464.2457580566406, "logps/rejected": -360.0504455566406, "loss": 0.3349, "rewards/accuracies": 0.875, "rewards/chosen": 1.6439825296401978, "rewards/margins": 2.2966060638427734, "rewards/rejected": -0.6526235938072205, "step": 769 }, { "epoch": 0.5625570776255707, "grad_norm": 72.56390300646217, "learning_rate": 4.974964663271579e-07, "logits/chosen": -2.6768550872802734, "logits/rejected": -2.2917468547821045, "logps/chosen": -810.4447631835938, "logps/rejected": -602.5372314453125, "loss": 0.4029, "rewards/accuracies": 1.0, "rewards/chosen": 2.779240131378174, "rewards/margins": 2.4854674339294434, "rewards/rejected": 0.29377248883247375, "step": 770 }, { "epoch": 0.5632876712328767, "grad_norm": 64.52447755019021, "learning_rate": 4.974738993170511e-07, "logits/chosen": -2.5488855838775635, "logits/rejected": -2.0983667373657227, "logps/chosen": -389.9431457519531, "logps/rejected": -354.2098693847656, "loss": 0.3482, "rewards/accuracies": 0.75, "rewards/chosen": 2.2122340202331543, "rewards/margins": 2.452359676361084, "rewards/rejected": -0.24012558162212372, "step": 771 }, { "epoch": 0.5640182648401827, "grad_norm": 52.34254429581519, "learning_rate": 4.974512315690094e-07, "logits/chosen": -2.5058820247650146, "logits/rejected": -2.5366134643554688, "logps/chosen": -528.0532836914062, "logps/rejected": -526.7225952148438, "loss": 0.3332, "rewards/accuracies": 0.75, "rewards/chosen": 0.5667917728424072, "rewards/margins": 0.7591676115989685, "rewards/rejected": -0.19237588346004486, "step": 772 }, { "epoch": 0.5647488584474886, "grad_norm": 44.76764965971173, "learning_rate": 4.974284630922603e-07, "logits/chosen": -2.59977388381958, "logits/rejected": -2.3538286685943604, "logps/chosen": -1074.84814453125, "logps/rejected": -714.6483154296875, "loss": 0.2952, "rewards/accuracies": 1.0, "rewards/chosen": 2.3807811737060547, "rewards/margins": 2.378993034362793, "rewards/rejected": 0.0017881467938423157, "step": 773 }, { "epoch": 0.5654794520547946, "grad_norm": 73.41685880129985, "learning_rate": 4.974055938960718e-07, "logits/chosen": -2.7051525115966797, "logits/rejected": -1.995375156402588, "logps/chosen": -454.32879638671875, "logps/rejected": -324.7449951171875, "loss": 0.4981, "rewards/accuracies": 0.875, "rewards/chosen": 2.517953395843506, "rewards/margins": 2.822798252105713, "rewards/rejected": -0.30484485626220703, "step": 774 }, { "epoch": 0.5662100456621004, "grad_norm": 44.194461085355485, "learning_rate": 4.973826239897531e-07, "logits/chosen": -2.2838072776794434, "logits/rejected": -2.159792184829712, "logps/chosen": -546.461181640625, "logps/rejected": -561.4926147460938, "loss": 0.2615, "rewards/accuracies": 0.875, "rewards/chosen": 1.1170378923416138, "rewards/margins": 1.3250467777252197, "rewards/rejected": -0.20800885558128357, "step": 775 }, { "epoch": 0.5669406392694064, "grad_norm": 58.282986317969545, "learning_rate": 4.973595533826545e-07, "logits/chosen": -2.6860270500183105, "logits/rejected": -2.134918212890625, "logps/chosen": -587.867431640625, "logps/rejected": -471.4370422363281, "loss": 0.4026, "rewards/accuracies": 0.625, "rewards/chosen": 1.440547227859497, "rewards/margins": 0.7397794723510742, "rewards/rejected": 0.7007676959037781, "step": 776 }, { "epoch": 0.5676712328767123, "grad_norm": 68.20437606240893, "learning_rate": 4.973363820841673e-07, "logits/chosen": -2.5210795402526855, "logits/rejected": -2.1204607486724854, "logps/chosen": -453.8863830566406, "logps/rejected": -397.3208312988281, "loss": 0.4446, "rewards/accuracies": 0.75, "rewards/chosen": 2.049621105194092, "rewards/margins": 1.421682596206665, "rewards/rejected": 0.6279383897781372, "step": 777 }, { "epoch": 0.5684018264840183, "grad_norm": 53.1657587211038, "learning_rate": 4.973131101037237e-07, "logits/chosen": -2.5347418785095215, "logits/rejected": -1.8783122301101685, "logps/chosen": -554.0335693359375, "logps/rejected": -414.99603271484375, "loss": 0.3491, "rewards/accuracies": 0.875, "rewards/chosen": 1.9278461933135986, "rewards/margins": 1.3937937021255493, "rewards/rejected": 0.5340524911880493, "step": 778 }, { "epoch": 0.5691324200913243, "grad_norm": 62.44762649362664, "learning_rate": 4.972897374507969e-07, "logits/chosen": -2.776712417602539, "logits/rejected": -2.834423303604126, "logps/chosen": -624.9732055664062, "logps/rejected": -574.85986328125, "loss": 0.4368, "rewards/accuracies": 0.875, "rewards/chosen": 1.9981746673583984, "rewards/margins": 0.8458778858184814, "rewards/rejected": 1.1522969007492065, "step": 779 }, { "epoch": 0.5698630136986301, "grad_norm": 66.81118450529073, "learning_rate": 4.972662641349011e-07, "logits/chosen": -2.9599013328552246, "logits/rejected": -1.8721895217895508, "logps/chosen": -672.78662109375, "logps/rejected": -398.0088806152344, "loss": 0.3681, "rewards/accuracies": 0.875, "rewards/chosen": 2.736917018890381, "rewards/margins": 2.309385299682617, "rewards/rejected": 0.4275316894054413, "step": 780 }, { "epoch": 0.5705936073059361, "grad_norm": 59.0031489162337, "learning_rate": 4.972426901655915e-07, "logits/chosen": -2.367570400238037, "logits/rejected": -2.213709831237793, "logps/chosen": -660.5228271484375, "logps/rejected": -593.939208984375, "loss": 0.4726, "rewards/accuracies": 0.625, "rewards/chosen": 2.1294617652893066, "rewards/margins": 1.0722635984420776, "rewards/rejected": 1.0571980476379395, "step": 781 }, { "epoch": 0.571324200913242, "grad_norm": 65.80571843027778, "learning_rate": 4.97219015552464e-07, "logits/chosen": -2.5728113651275635, "logits/rejected": -2.907494068145752, "logps/chosen": -617.379638671875, "logps/rejected": -707.548095703125, "loss": 0.4667, "rewards/accuracies": 0.75, "rewards/chosen": 1.4996368885040283, "rewards/margins": 1.973103642463684, "rewards/rejected": -0.4734668731689453, "step": 782 }, { "epoch": 0.572054794520548, "grad_norm": 65.44330875006436, "learning_rate": 4.971952403051561e-07, "logits/chosen": -2.1756670475006104, "logits/rejected": -2.2461166381835938, "logps/chosen": -606.9027709960938, "logps/rejected": -710.2481079101562, "loss": 0.3782, "rewards/accuracies": 0.875, "rewards/chosen": 2.1216280460357666, "rewards/margins": 1.8134280443191528, "rewards/rejected": 0.30820029973983765, "step": 783 }, { "epoch": 0.5727853881278538, "grad_norm": 62.942763713601344, "learning_rate": 4.971713644333455e-07, "logits/chosen": -2.815851926803589, "logits/rejected": -2.4535484313964844, "logps/chosen": -662.6080322265625, "logps/rejected": -685.0006103515625, "loss": 0.4095, "rewards/accuracies": 0.625, "rewards/chosen": 1.829370141029358, "rewards/margins": 0.835375189781189, "rewards/rejected": 0.9939947724342346, "step": 784 }, { "epoch": 0.5735159817351598, "grad_norm": 68.18340028349093, "learning_rate": 4.971473879467515e-07, "logits/chosen": -2.80458402633667, "logits/rejected": -2.769594192504883, "logps/chosen": -354.473876953125, "logps/rejected": -358.01019287109375, "loss": 0.4333, "rewards/accuracies": 0.75, "rewards/chosen": 1.810901165008545, "rewards/margins": 1.9012600183486938, "rewards/rejected": -0.09035874158143997, "step": 785 }, { "epoch": 0.5742465753424657, "grad_norm": 75.73914100693743, "learning_rate": 4.971233108551339e-07, "logits/chosen": -3.1659600734710693, "logits/rejected": -2.471590518951416, "logps/chosen": -888.7725219726562, "logps/rejected": -622.1175537109375, "loss": 0.5071, "rewards/accuracies": 0.875, "rewards/chosen": 2.9636783599853516, "rewards/margins": 2.3487181663513184, "rewards/rejected": 0.6149600148200989, "step": 786 }, { "epoch": 0.5749771689497717, "grad_norm": 67.71615059039645, "learning_rate": 4.970991331682937e-07, "logits/chosen": -2.6646647453308105, "logits/rejected": -2.1964406967163086, "logps/chosen": -942.613037109375, "logps/rejected": -723.5032958984375, "loss": 0.4148, "rewards/accuracies": 0.875, "rewards/chosen": 3.0487897396087646, "rewards/margins": 2.675515651702881, "rewards/rejected": 0.3732742369174957, "step": 787 }, { "epoch": 0.5757077625570777, "grad_norm": 50.485765320533666, "learning_rate": 4.970748548960728e-07, "logits/chosen": -2.635164260864258, "logits/rejected": -2.312955617904663, "logps/chosen": -771.739013671875, "logps/rejected": -652.3582763671875, "loss": 0.291, "rewards/accuracies": 0.875, "rewards/chosen": 3.2200071811676025, "rewards/margins": 3.406283378601074, "rewards/rejected": -0.18627607822418213, "step": 788 }, { "epoch": 0.5764383561643835, "grad_norm": 75.4915952741181, "learning_rate": 4.97050476048354e-07, "logits/chosen": -2.689361095428467, "logits/rejected": -2.0076851844787598, "logps/chosen": -576.400146484375, "logps/rejected": -461.8363037109375, "loss": 0.4129, "rewards/accuracies": 0.75, "rewards/chosen": 2.2725229263305664, "rewards/margins": 2.4463682174682617, "rewards/rejected": -0.17384546995162964, "step": 789 }, { "epoch": 0.5771689497716895, "grad_norm": 79.8648730620587, "learning_rate": 4.970259966350611e-07, "logits/chosen": -3.2516400814056396, "logits/rejected": -1.829780101776123, "logps/chosen": -645.9255981445312, "logps/rejected": -443.0382385253906, "loss": 0.5737, "rewards/accuracies": 0.75, "rewards/chosen": 0.802075207233429, "rewards/margins": 0.5698407292366028, "rewards/rejected": 0.23223447799682617, "step": 790 }, { "epoch": 0.5778995433789954, "grad_norm": 57.17219527103516, "learning_rate": 4.970014166661588e-07, "logits/chosen": -2.5743002891540527, "logits/rejected": -2.459606409072876, "logps/chosen": -597.6632690429688, "logps/rejected": -662.2290649414062, "loss": 0.3679, "rewards/accuracies": 0.75, "rewards/chosen": 0.9014501571655273, "rewards/margins": 0.8669065833091736, "rewards/rejected": 0.03454360365867615, "step": 791 }, { "epoch": 0.5786301369863014, "grad_norm": 42.84207321436122, "learning_rate": 4.969767361516528e-07, "logits/chosen": -2.5763654708862305, "logits/rejected": -1.9405808448791504, "logps/chosen": -708.4669189453125, "logps/rejected": -608.0777587890625, "loss": 0.2636, "rewards/accuracies": 1.0, "rewards/chosen": 2.882577419281006, "rewards/margins": 3.375709056854248, "rewards/rejected": -0.49313169717788696, "step": 792 }, { "epoch": 0.5793607305936073, "grad_norm": 76.68776423650483, "learning_rate": 4.969519551015897e-07, "logits/chosen": -2.7520761489868164, "logits/rejected": -2.0229413509368896, "logps/chosen": -418.0118103027344, "logps/rejected": -331.576904296875, "loss": 0.4861, "rewards/accuracies": 0.875, "rewards/chosen": 1.7382876873016357, "rewards/margins": 1.882325291633606, "rewards/rejected": -0.14403760433197021, "step": 793 }, { "epoch": 0.5800913242009132, "grad_norm": 66.30280177667834, "learning_rate": 4.969270735260568e-07, "logits/chosen": -2.9018259048461914, "logits/rejected": -2.085988759994507, "logps/chosen": -710.498291015625, "logps/rejected": -510.7222595214844, "loss": 0.3662, "rewards/accuracies": 0.5, "rewards/chosen": 2.940544605255127, "rewards/margins": 2.1546497344970703, "rewards/rejected": 0.7858947515487671, "step": 794 }, { "epoch": 0.5808219178082191, "grad_norm": 72.60781315567905, "learning_rate": 4.969020914351826e-07, "logits/chosen": -2.8439693450927734, "logits/rejected": -2.4676504135131836, "logps/chosen": -812.3465576171875, "logps/rejected": -800.186767578125, "loss": 0.4564, "rewards/accuracies": 0.875, "rewards/chosen": 3.4882922172546387, "rewards/margins": 2.1348743438720703, "rewards/rejected": 1.3534178733825684, "step": 795 }, { "epoch": 0.5815525114155251, "grad_norm": 48.45592643465425, "learning_rate": 4.968770088391366e-07, "logits/chosen": -2.9790544509887695, "logits/rejected": -2.477825164794922, "logps/chosen": -872.0184326171875, "logps/rejected": -725.7027587890625, "loss": 0.2861, "rewards/accuracies": 1.0, "rewards/chosen": 3.9440460205078125, "rewards/margins": 3.505314588546753, "rewards/rejected": 0.43873119354248047, "step": 796 }, { "epoch": 0.5822831050228311, "grad_norm": 63.28654298851771, "learning_rate": 4.968518257481288e-07, "logits/chosen": -2.4947848320007324, "logits/rejected": -1.725396990776062, "logps/chosen": -685.4866943359375, "logps/rejected": -551.4608764648438, "loss": 0.4376, "rewards/accuracies": 0.875, "rewards/chosen": 2.093596935272217, "rewards/margins": 2.358363628387451, "rewards/rejected": -0.2647669017314911, "step": 797 }, { "epoch": 0.583013698630137, "grad_norm": 57.80481790176533, "learning_rate": 4.968265421724105e-07, "logits/chosen": -3.210265636444092, "logits/rejected": -2.3775198459625244, "logps/chosen": -706.864990234375, "logps/rejected": -467.8658142089844, "loss": 0.3891, "rewards/accuracies": 0.875, "rewards/chosen": 2.323230504989624, "rewards/margins": 2.0940446853637695, "rewards/rejected": 0.22918567061424255, "step": 798 }, { "epoch": 0.583744292237443, "grad_norm": 39.41086226477575, "learning_rate": 4.968011581222737e-07, "logits/chosen": -2.9908246994018555, "logits/rejected": -1.5684863328933716, "logps/chosen": -855.9266967773438, "logps/rejected": -490.58880615234375, "loss": 0.21, "rewards/accuracies": 0.875, "rewards/chosen": 3.2847836017608643, "rewards/margins": 3.3433682918548584, "rewards/rejected": -0.058584533631801605, "step": 799 }, { "epoch": 0.5844748858447488, "grad_norm": 74.75874609159453, "learning_rate": 4.967756736080513e-07, "logits/chosen": -2.9638214111328125, "logits/rejected": -2.0912303924560547, "logps/chosen": -675.9367065429688, "logps/rejected": -384.9849853515625, "loss": 0.498, "rewards/accuracies": 0.875, "rewards/chosen": 2.4964890480041504, "rewards/margins": 2.467445135116577, "rewards/rejected": 0.02904394268989563, "step": 800 }, { "epoch": 0.5852054794520548, "grad_norm": 71.44014512819456, "learning_rate": 4.967500886401174e-07, "logits/chosen": -2.46490478515625, "logits/rejected": -2.6862576007843018, "logps/chosen": -631.5436401367188, "logps/rejected": -794.9080810546875, "loss": 0.4209, "rewards/accuracies": 0.5, "rewards/chosen": 1.503713607788086, "rewards/margins": 0.5583709478378296, "rewards/rejected": 0.9453426599502563, "step": 801 }, { "epoch": 0.5859360730593607, "grad_norm": 66.96949005074065, "learning_rate": 4.967244032288864e-07, "logits/chosen": -3.1323444843292236, "logits/rejected": -1.9276800155639648, "logps/chosen": -616.0172119140625, "logps/rejected": -338.925048828125, "loss": 0.4755, "rewards/accuracies": 0.875, "rewards/chosen": 2.9283339977264404, "rewards/margins": 2.9769294261932373, "rewards/rejected": -0.04859556257724762, "step": 802 }, { "epoch": 0.5866666666666667, "grad_norm": 74.8891954928246, "learning_rate": 4.966986173848141e-07, "logits/chosen": -3.1470813751220703, "logits/rejected": -3.0440878868103027, "logps/chosen": -801.5424194335938, "logps/rejected": -706.3206787109375, "loss": 0.4657, "rewards/accuracies": 0.75, "rewards/chosen": 2.198918104171753, "rewards/margins": 0.8034316897392273, "rewards/rejected": 1.3954863548278809, "step": 803 }, { "epoch": 0.5873972602739727, "grad_norm": 56.934416937659506, "learning_rate": 4.96672731118397e-07, "logits/chosen": -2.6502017974853516, "logits/rejected": -1.7050272226333618, "logps/chosen": -1079.42138671875, "logps/rejected": -488.7759094238281, "loss": 0.3679, "rewards/accuracies": 1.0, "rewards/chosen": 1.2669358253479004, "rewards/margins": 1.1487703323364258, "rewards/rejected": 0.11816540360450745, "step": 804 }, { "epoch": 0.5881278538812785, "grad_norm": 64.46455391094784, "learning_rate": 4.966467444401726e-07, "logits/chosen": -3.030076026916504, "logits/rejected": -2.619704246520996, "logps/chosen": -810.9844970703125, "logps/rejected": -609.1709594726562, "loss": 0.2868, "rewards/accuracies": 0.875, "rewards/chosen": 2.9175353050231934, "rewards/margins": 1.8331129550933838, "rewards/rejected": 1.0844224691390991, "step": 805 }, { "epoch": 0.5888584474885845, "grad_norm": 61.11376550323482, "learning_rate": 4.96620657360719e-07, "logits/chosen": -2.718407154083252, "logits/rejected": -1.70456063747406, "logps/chosen": -961.9553833007812, "logps/rejected": -508.4305419921875, "loss": 0.336, "rewards/accuracies": 1.0, "rewards/chosen": 2.5290751457214355, "rewards/margins": 2.5127081871032715, "rewards/rejected": 0.01636696606874466, "step": 806 }, { "epoch": 0.5895890410958904, "grad_norm": 59.11142979841124, "learning_rate": 4.965944698906554e-07, "logits/chosen": -2.670731544494629, "logits/rejected": -2.0944511890411377, "logps/chosen": -545.7745971679688, "logps/rejected": -429.07073974609375, "loss": 0.4015, "rewards/accuracies": 0.75, "rewards/chosen": 1.4328131675720215, "rewards/margins": 0.8848216533660889, "rewards/rejected": 0.5479915142059326, "step": 807 }, { "epoch": 0.5903196347031964, "grad_norm": 70.68219130706652, "learning_rate": 4.965681820406418e-07, "logits/chosen": -2.2907962799072266, "logits/rejected": -1.6257425546646118, "logps/chosen": -511.6972351074219, "logps/rejected": -394.97076416015625, "loss": 0.402, "rewards/accuracies": 0.75, "rewards/chosen": 1.2499573230743408, "rewards/margins": 2.205991744995117, "rewards/rejected": -0.9560344815254211, "step": 808 }, { "epoch": 0.5910502283105022, "grad_norm": 73.28557423275743, "learning_rate": 4.965417938213791e-07, "logits/chosen": -2.7372984886169434, "logits/rejected": -2.9112324714660645, "logps/chosen": -633.797607421875, "logps/rejected": -710.5889892578125, "loss": 0.463, "rewards/accuracies": 0.75, "rewards/chosen": 1.4083784818649292, "rewards/margins": 0.8862468004226685, "rewards/rejected": 0.5221318006515503, "step": 809 }, { "epoch": 0.5917808219178082, "grad_norm": 46.47033401385636, "learning_rate": 4.965153052436089e-07, "logits/chosen": -2.6395583152770996, "logits/rejected": -2.399311065673828, "logps/chosen": -547.9324951171875, "logps/rejected": -535.2076416015625, "loss": 0.2516, "rewards/accuracies": 0.875, "rewards/chosen": 1.7803919315338135, "rewards/margins": 1.840141773223877, "rewards/rejected": -0.05974970757961273, "step": 810 }, { "epoch": 0.5925114155251141, "grad_norm": 58.26075795909928, "learning_rate": 4.964887163181139e-07, "logits/chosen": -3.1416406631469727, "logits/rejected": -2.0487983226776123, "logps/chosen": -753.9905395507812, "logps/rejected": -478.6015930175781, "loss": 0.3159, "rewards/accuracies": 0.75, "rewards/chosen": 1.4056458473205566, "rewards/margins": 1.1294833421707153, "rewards/rejected": 0.27616238594055176, "step": 811 }, { "epoch": 0.5932420091324201, "grad_norm": 54.5222234649551, "learning_rate": 4.964620270557173e-07, "logits/chosen": -2.539337396621704, "logits/rejected": -2.1992835998535156, "logps/chosen": -653.57275390625, "logps/rejected": -571.65283203125, "loss": 0.3116, "rewards/accuracies": 0.875, "rewards/chosen": 2.7860260009765625, "rewards/margins": 2.139545440673828, "rewards/rejected": 0.6464804410934448, "step": 812 }, { "epoch": 0.5939726027397261, "grad_norm": 72.28107773974286, "learning_rate": 4.964352374672838e-07, "logits/chosen": -2.562469482421875, "logits/rejected": -2.316166877746582, "logps/chosen": -704.7152709960938, "logps/rejected": -688.9566650390625, "loss": 0.4335, "rewards/accuracies": 0.875, "rewards/chosen": 3.1987552642822266, "rewards/margins": 2.6623077392578125, "rewards/rejected": 0.5364475250244141, "step": 813 }, { "epoch": 0.594703196347032, "grad_norm": 61.329874576954744, "learning_rate": 4.964083475637179e-07, "logits/chosen": -2.6666488647460938, "logits/rejected": -2.3191590309143066, "logps/chosen": -523.8924560546875, "logps/rejected": -541.1710205078125, "loss": 0.3135, "rewards/accuracies": 0.875, "rewards/chosen": 1.7176456451416016, "rewards/margins": 1.5426781177520752, "rewards/rejected": 0.17496749758720398, "step": 814 }, { "epoch": 0.5954337899543379, "grad_norm": 66.61662632063972, "learning_rate": 4.963813573559661e-07, "logits/chosen": -2.5826735496520996, "logits/rejected": -1.811246633529663, "logps/chosen": -745.7080078125, "logps/rejected": -532.8875732421875, "loss": 0.3788, "rewards/accuracies": 0.75, "rewards/chosen": 2.568051338195801, "rewards/margins": 2.492736339569092, "rewards/rejected": 0.075314961373806, "step": 815 }, { "epoch": 0.5961643835616438, "grad_norm": 65.23776750160715, "learning_rate": 4.963542668550149e-07, "logits/chosen": -2.746999740600586, "logits/rejected": -2.080385446548462, "logps/chosen": -610.3221435546875, "logps/rejected": -423.9848937988281, "loss": 0.4126, "rewards/accuracies": 0.875, "rewards/chosen": 2.2142586708068848, "rewards/margins": 1.9322919845581055, "rewards/rejected": 0.2819668650627136, "step": 816 }, { "epoch": 0.5968949771689498, "grad_norm": 79.8453950417618, "learning_rate": 4.963270760718918e-07, "logits/chosen": -2.5132133960723877, "logits/rejected": -2.6525180339813232, "logps/chosen": -850.074462890625, "logps/rejected": -686.7063598632812, "loss": 0.3933, "rewards/accuracies": 0.75, "rewards/chosen": 1.9939820766448975, "rewards/margins": 0.8050594329833984, "rewards/rejected": 1.1889225244522095, "step": 817 }, { "epoch": 0.5976255707762557, "grad_norm": 64.6965045061362, "learning_rate": 4.962997850176655e-07, "logits/chosen": -2.9198787212371826, "logits/rejected": -2.0744411945343018, "logps/chosen": -755.9617309570312, "logps/rejected": -636.4381103515625, "loss": 0.35, "rewards/accuracies": 0.875, "rewards/chosen": 3.146728754043579, "rewards/margins": 3.4701192378997803, "rewards/rejected": -0.32339054346084595, "step": 818 }, { "epoch": 0.5983561643835617, "grad_norm": 47.036827430914435, "learning_rate": 4.962723937034449e-07, "logits/chosen": -2.8508827686309814, "logits/rejected": -2.160254955291748, "logps/chosen": -903.883056640625, "logps/rejected": -507.2398681640625, "loss": 0.2416, "rewards/accuracies": 1.0, "rewards/chosen": 2.087782621383667, "rewards/margins": 1.4250446557998657, "rewards/rejected": 0.6627380847930908, "step": 819 }, { "epoch": 0.5990867579908675, "grad_norm": 45.29873443924096, "learning_rate": 4.962449021403802e-07, "logits/chosen": -2.5662357807159424, "logits/rejected": -2.224764347076416, "logps/chosen": -774.3370971679688, "logps/rejected": -633.3668212890625, "loss": 0.2476, "rewards/accuracies": 0.875, "rewards/chosen": 2.4003803730010986, "rewards/margins": 1.5427409410476685, "rewards/rejected": 0.8576390743255615, "step": 820 }, { "epoch": 0.5998173515981735, "grad_norm": 50.62501929011978, "learning_rate": 4.962173103396623e-07, "logits/chosen": -2.539463758468628, "logits/rejected": -2.3520498275756836, "logps/chosen": -361.9227600097656, "logps/rejected": -329.81427001953125, "loss": 0.294, "rewards/accuracies": 0.75, "rewards/chosen": 2.452610731124878, "rewards/margins": 3.2353451251983643, "rewards/rejected": -0.7827342748641968, "step": 821 }, { "epoch": 0.6005479452054795, "grad_norm": 63.771243396808565, "learning_rate": 4.961896183125228e-07, "logits/chosen": -3.0538642406463623, "logits/rejected": -1.9467923641204834, "logps/chosen": -790.5636596679688, "logps/rejected": -599.5275268554688, "loss": 0.2948, "rewards/accuracies": 1.0, "rewards/chosen": 3.2617177963256836, "rewards/margins": 3.7632155418395996, "rewards/rejected": -0.5014975666999817, "step": 822 }, { "epoch": 0.6012785388127854, "grad_norm": 53.726003000758396, "learning_rate": 4.961618260702342e-07, "logits/chosen": -2.710627555847168, "logits/rejected": -2.2141175270080566, "logps/chosen": -516.1710815429688, "logps/rejected": -377.5382385253906, "loss": 0.3077, "rewards/accuracies": 0.875, "rewards/chosen": 2.5968222618103027, "rewards/margins": 3.33528995513916, "rewards/rejected": -0.7384674549102783, "step": 823 }, { "epoch": 0.6020091324200914, "grad_norm": 56.97405522472488, "learning_rate": 4.961339336241096e-07, "logits/chosen": -2.0525014400482178, "logits/rejected": -2.425903558731079, "logps/chosen": -625.162841796875, "logps/rejected": -790.080322265625, "loss": 0.4382, "rewards/accuracies": 0.75, "rewards/chosen": 1.8796966075897217, "rewards/margins": 1.9984608888626099, "rewards/rejected": -0.11876419186592102, "step": 824 }, { "epoch": 0.6027397260273972, "grad_norm": 41.383642118123646, "learning_rate": 4.961059409855032e-07, "logits/chosen": -2.6292552947998047, "logits/rejected": -1.7352075576782227, "logps/chosen": -625.9396362304688, "logps/rejected": -303.4127197265625, "loss": 0.2323, "rewards/accuracies": 0.875, "rewards/chosen": 1.470733404159546, "rewards/margins": 1.4496731758117676, "rewards/rejected": 0.021060079336166382, "step": 825 }, { "epoch": 0.6034703196347032, "grad_norm": 63.11438454987401, "learning_rate": 4.960778481658098e-07, "logits/chosen": -2.584949016571045, "logits/rejected": -2.1712229251861572, "logps/chosen": -548.2200927734375, "logps/rejected": -471.8887634277344, "loss": 0.3866, "rewards/accuracies": 0.875, "rewards/chosen": 2.2765603065490723, "rewards/margins": 2.669959545135498, "rewards/rejected": -0.3933993875980377, "step": 826 }, { "epoch": 0.6042009132420091, "grad_norm": 71.88067572699042, "learning_rate": 4.960496551764648e-07, "logits/chosen": -2.4826207160949707, "logits/rejected": -1.7785437107086182, "logps/chosen": -946.7059326171875, "logps/rejected": -480.1312255859375, "loss": 0.461, "rewards/accuracies": 1.0, "rewards/chosen": 2.553166389465332, "rewards/margins": 2.6584830284118652, "rewards/rejected": -0.10531646013259888, "step": 827 }, { "epoch": 0.6049315068493151, "grad_norm": 35.90377493645341, "learning_rate": 4.960213620289449e-07, "logits/chosen": -2.8943464756011963, "logits/rejected": -2.00943922996521, "logps/chosen": -606.0720825195312, "logps/rejected": -489.9300537109375, "loss": 0.2361, "rewards/accuracies": 0.875, "rewards/chosen": 0.6631461381912231, "rewards/margins": 1.1911346912384033, "rewards/rejected": -0.5279886722564697, "step": 828 }, { "epoch": 0.605662100456621, "grad_norm": 64.46983306167311, "learning_rate": 4.95992968734767e-07, "logits/chosen": -2.8157033920288086, "logits/rejected": -2.2663304805755615, "logps/chosen": -619.9676513671875, "logps/rejected": -493.0052490234375, "loss": 0.3917, "rewards/accuracies": 1.0, "rewards/chosen": 2.2715959548950195, "rewards/margins": 2.222722053527832, "rewards/rejected": 0.04887378215789795, "step": 829 }, { "epoch": 0.6063926940639269, "grad_norm": 73.42550766444349, "learning_rate": 4.959644753054891e-07, "logits/chosen": -2.5078771114349365, "logits/rejected": -1.8244681358337402, "logps/chosen": -611.2894897460938, "logps/rejected": -405.33544921875, "loss": 0.4987, "rewards/accuracies": 1.0, "rewards/chosen": 2.1210670471191406, "rewards/margins": 2.736260414123535, "rewards/rejected": -0.6151933670043945, "step": 830 }, { "epoch": 0.6071232876712329, "grad_norm": 68.45379372248115, "learning_rate": 4.959358817527099e-07, "logits/chosen": -3.2491002082824707, "logits/rejected": -2.17872953414917, "logps/chosen": -824.1737060546875, "logps/rejected": -606.3387451171875, "loss": 0.3731, "rewards/accuracies": 0.625, "rewards/chosen": 1.3487730026245117, "rewards/margins": 1.2720046043395996, "rewards/rejected": 0.0767684280872345, "step": 831 }, { "epoch": 0.6078538812785388, "grad_norm": 54.97589137806226, "learning_rate": 4.959071880880688e-07, "logits/chosen": -2.580740213394165, "logits/rejected": -2.148949146270752, "logps/chosen": -679.5516967773438, "logps/rejected": -585.5499877929688, "loss": 0.3189, "rewards/accuracies": 0.875, "rewards/chosen": 1.7507457733154297, "rewards/margins": 2.27655291557312, "rewards/rejected": -0.52580726146698, "step": 832 }, { "epoch": 0.6085844748858448, "grad_norm": 41.34017280679064, "learning_rate": 4.958783943232459e-07, "logits/chosen": -2.518047332763672, "logits/rejected": -1.887714147567749, "logps/chosen": -503.0477294921875, "logps/rejected": -401.1246337890625, "loss": 0.2994, "rewards/accuracies": 0.75, "rewards/chosen": 2.4207305908203125, "rewards/margins": 2.825028419494629, "rewards/rejected": -0.4042978286743164, "step": 833 }, { "epoch": 0.6093150684931506, "grad_norm": 72.72857287545247, "learning_rate": 4.958495004699623e-07, "logits/chosen": -2.7087836265563965, "logits/rejected": -2.0513248443603516, "logps/chosen": -494.9747009277344, "logps/rejected": -318.2748718261719, "loss": 0.5904, "rewards/accuracies": 0.75, "rewards/chosen": 0.8676496148109436, "rewards/margins": 0.5115337371826172, "rewards/rejected": 0.35611584782600403, "step": 834 }, { "epoch": 0.6100456621004566, "grad_norm": 43.158491580543135, "learning_rate": 4.958205065399795e-07, "logits/chosen": -2.897477865219116, "logits/rejected": -2.0631508827209473, "logps/chosen": -544.4259643554688, "logps/rejected": -405.9892578125, "loss": 0.2535, "rewards/accuracies": 1.0, "rewards/chosen": 2.0143826007843018, "rewards/margins": 2.6973941326141357, "rewards/rejected": -0.6830114126205444, "step": 835 }, { "epoch": 0.6107762557077625, "grad_norm": 57.67895894247312, "learning_rate": 4.957914125451002e-07, "logits/chosen": -3.0897746086120605, "logits/rejected": -2.538227081298828, "logps/chosen": -437.0836181640625, "logps/rejected": -379.0921325683594, "loss": 0.3833, "rewards/accuracies": 0.5, "rewards/chosen": 2.2485690116882324, "rewards/margins": 1.8680542707443237, "rewards/rejected": 0.38051480054855347, "step": 836 }, { "epoch": 0.6115068493150685, "grad_norm": 73.86567993239485, "learning_rate": 4.957622184971672e-07, "logits/chosen": -2.679929494857788, "logits/rejected": -2.6765661239624023, "logps/chosen": -626.64013671875, "logps/rejected": -613.993896484375, "loss": 0.5387, "rewards/accuracies": 0.75, "rewards/chosen": 2.33115291595459, "rewards/margins": 2.0738472938537598, "rewards/rejected": 0.2573051452636719, "step": 837 }, { "epoch": 0.6122374429223745, "grad_norm": 56.812046632541986, "learning_rate": 4.957329244080644e-07, "logits/chosen": -3.0243563652038574, "logits/rejected": -2.4023711681365967, "logps/chosen": -1036.712890625, "logps/rejected": -704.4869995117188, "loss": 0.4076, "rewards/accuracies": 0.875, "rewards/chosen": 3.0841946601867676, "rewards/margins": 2.020374298095703, "rewards/rejected": 1.0638203620910645, "step": 838 }, { "epoch": 0.6129680365296803, "grad_norm": 49.93133713144857, "learning_rate": 4.957035302897167e-07, "logits/chosen": -2.843977212905884, "logits/rejected": -2.469942569732666, "logps/chosen": -673.636962890625, "logps/rejected": -562.9098510742188, "loss": 0.2571, "rewards/accuracies": 0.625, "rewards/chosen": 2.0904228687286377, "rewards/margins": 1.6631799936294556, "rewards/rejected": 0.4272429347038269, "step": 839 }, { "epoch": 0.6136986301369863, "grad_norm": 66.89660865843649, "learning_rate": 4.956740361540891e-07, "logits/chosen": -2.754915237426758, "logits/rejected": -2.208796501159668, "logps/chosen": -1003.380615234375, "logps/rejected": -803.04638671875, "loss": 0.4696, "rewards/accuracies": 0.5, "rewards/chosen": 1.4290001392364502, "rewards/margins": 0.27893567085266113, "rewards/rejected": 1.150064468383789, "step": 840 }, { "epoch": 0.6144292237442922, "grad_norm": 66.67413979924113, "learning_rate": 4.956444420131878e-07, "logits/chosen": -2.965230941772461, "logits/rejected": -2.199784994125366, "logps/chosen": -837.8756103515625, "logps/rejected": -668.7236328125, "loss": 0.3609, "rewards/accuracies": 0.875, "rewards/chosen": 2.4870920181274414, "rewards/margins": 2.2392725944519043, "rewards/rejected": 0.24781924486160278, "step": 841 }, { "epoch": 0.6151598173515982, "grad_norm": 61.852866543187595, "learning_rate": 4.956147478790595e-07, "logits/chosen": -2.914623737335205, "logits/rejected": -2.3008787631988525, "logps/chosen": -855.0739135742188, "logps/rejected": -705.0942993164062, "loss": 0.3805, "rewards/accuracies": 0.875, "rewards/chosen": 2.203486680984497, "rewards/margins": 1.962299108505249, "rewards/rejected": 0.2411874234676361, "step": 842 }, { "epoch": 0.6158904109589041, "grad_norm": 70.14500884328834, "learning_rate": 4.955849537637915e-07, "logits/chosen": -2.9827966690063477, "logits/rejected": -2.2007696628570557, "logps/chosen": -622.372314453125, "logps/rejected": -502.47344970703125, "loss": 0.5019, "rewards/accuracies": 0.875, "rewards/chosen": 2.991851806640625, "rewards/margins": 2.6843347549438477, "rewards/rejected": 0.30751723051071167, "step": 843 }, { "epoch": 0.61662100456621, "grad_norm": 46.97516641551816, "learning_rate": 4.955550596795122e-07, "logits/chosen": -2.845576763153076, "logits/rejected": -2.472198009490967, "logps/chosen": -575.005615234375, "logps/rejected": -567.5, "loss": 0.3108, "rewards/accuracies": 0.875, "rewards/chosen": 1.4222095012664795, "rewards/margins": 1.4545416831970215, "rewards/rejected": -0.03233218193054199, "step": 844 }, { "epoch": 0.617351598173516, "grad_norm": 53.43436348616754, "learning_rate": 4.955250656383902e-07, "logits/chosen": -3.3210816383361816, "logits/rejected": -2.6993660926818848, "logps/chosen": -561.9176025390625, "logps/rejected": -455.82977294921875, "loss": 0.3517, "rewards/accuracies": 0.875, "rewards/chosen": 1.7460027933120728, "rewards/margins": 1.3967537879943848, "rewards/rejected": 0.34924888610839844, "step": 845 }, { "epoch": 0.6180821917808219, "grad_norm": 66.59713374204816, "learning_rate": 4.954949716526352e-07, "logits/chosen": -2.8567917346954346, "logits/rejected": -2.029116153717041, "logps/chosen": -570.3134155273438, "logps/rejected": -347.9111022949219, "loss": 0.4353, "rewards/accuracies": 0.75, "rewards/chosen": 1.3874741792678833, "rewards/margins": 1.842543601989746, "rewards/rejected": -0.45506948232650757, "step": 846 }, { "epoch": 0.6188127853881279, "grad_norm": 53.94471001202296, "learning_rate": 4.954647777344972e-07, "logits/chosen": -2.3461782932281494, "logits/rejected": -2.226505756378174, "logps/chosen": -533.1724853515625, "logps/rejected": -405.3256530761719, "loss": 0.3819, "rewards/accuracies": 0.875, "rewards/chosen": 1.0319262742996216, "rewards/margins": 1.2098429203033447, "rewards/rejected": -0.17791670560836792, "step": 847 }, { "epoch": 0.6195433789954338, "grad_norm": 55.17670698891539, "learning_rate": 4.954344838962674e-07, "logits/chosen": -3.288188934326172, "logits/rejected": -2.0906333923339844, "logps/chosen": -584.801513671875, "logps/rejected": -395.1275634765625, "loss": 0.3557, "rewards/accuracies": 0.5, "rewards/chosen": 2.1741816997528076, "rewards/margins": 1.5643913745880127, "rewards/rejected": 0.6097903847694397, "step": 848 }, { "epoch": 0.6202739726027398, "grad_norm": 54.45120826970922, "learning_rate": 4.954040901502771e-07, "logits/chosen": -3.193337917327881, "logits/rejected": -2.1542677879333496, "logps/chosen": -756.4261474609375, "logps/rejected": -548.1611938476562, "loss": 0.3794, "rewards/accuracies": 0.875, "rewards/chosen": 1.8661950826644897, "rewards/margins": 2.6043145656585693, "rewards/rejected": -0.7381194829940796, "step": 849 }, { "epoch": 0.6210045662100456, "grad_norm": 68.00541037185019, "learning_rate": 4.953735965088984e-07, "logits/chosen": -2.7951817512512207, "logits/rejected": -2.1291418075561523, "logps/chosen": -713.2532348632812, "logps/rejected": -563.4091796875, "loss": 0.3992, "rewards/accuracies": 1.0, "rewards/chosen": 3.148707389831543, "rewards/margins": 3.7812893390655518, "rewards/rejected": -0.6325820088386536, "step": 850 }, { "epoch": 0.6217351598173516, "grad_norm": 34.71742796762888, "learning_rate": 4.953430029845446e-07, "logits/chosen": -2.3616695404052734, "logits/rejected": -1.4123436212539673, "logps/chosen": -586.5059814453125, "logps/rejected": -313.6849060058594, "loss": 0.3163, "rewards/accuracies": 0.75, "rewards/chosen": 2.373115301132202, "rewards/margins": 2.8829638957977295, "rewards/rejected": -0.5098484754562378, "step": 851 }, { "epoch": 0.6224657534246575, "grad_norm": 59.2411546502672, "learning_rate": 4.953123095896689e-07, "logits/chosen": -3.1418399810791016, "logits/rejected": -3.105478286743164, "logps/chosen": -875.57861328125, "logps/rejected": -927.7098388671875, "loss": 0.3345, "rewards/accuracies": 0.875, "rewards/chosen": 3.378369092941284, "rewards/margins": 1.5665693283081055, "rewards/rejected": 1.8117996454238892, "step": 852 }, { "epoch": 0.6231963470319635, "grad_norm": 68.53280678236337, "learning_rate": 4.952815163367656e-07, "logits/chosen": -2.8428664207458496, "logits/rejected": -2.415212392807007, "logps/chosen": -645.73779296875, "logps/rejected": -594.7931518554688, "loss": 0.4774, "rewards/accuracies": 0.625, "rewards/chosen": 1.770869255065918, "rewards/margins": 1.120352029800415, "rewards/rejected": 0.6505171656608582, "step": 853 }, { "epoch": 0.6239269406392695, "grad_norm": 36.40700106521409, "learning_rate": 4.952506232383697e-07, "logits/chosen": -3.1820154190063477, "logits/rejected": -2.419214963912964, "logps/chosen": -797.57080078125, "logps/rejected": -557.6189575195312, "loss": 0.2598, "rewards/accuracies": 0.875, "rewards/chosen": 2.2683658599853516, "rewards/margins": 1.319332480430603, "rewards/rejected": 0.9490334391593933, "step": 854 }, { "epoch": 0.6246575342465753, "grad_norm": 67.30452381364309, "learning_rate": 4.952196303070565e-07, "logits/chosen": -3.163621425628662, "logits/rejected": -2.2856051921844482, "logps/chosen": -1081.8751220703125, "logps/rejected": -754.1175537109375, "loss": 0.3876, "rewards/accuracies": 0.625, "rewards/chosen": 2.773641586303711, "rewards/margins": 1.4349009990692139, "rewards/rejected": 1.338740587234497, "step": 855 }, { "epoch": 0.6253881278538813, "grad_norm": 52.6875648724393, "learning_rate": 4.951885375554422e-07, "logits/chosen": -2.3430581092834473, "logits/rejected": -1.3881646394729614, "logps/chosen": -662.1301879882812, "logps/rejected": -319.70489501953125, "loss": 0.3419, "rewards/accuracies": 0.875, "rewards/chosen": 2.8948941230773926, "rewards/margins": 2.6409215927124023, "rewards/rejected": 0.2539726495742798, "step": 856 }, { "epoch": 0.6261187214611872, "grad_norm": 47.03575300286873, "learning_rate": 4.951573449961837e-07, "logits/chosen": -2.741292953491211, "logits/rejected": -2.7260847091674805, "logps/chosen": -698.1105346679688, "logps/rejected": -606.0423583984375, "loss": 0.3106, "rewards/accuracies": 0.75, "rewards/chosen": 2.416994094848633, "rewards/margins": 1.6930896043777466, "rewards/rejected": 0.723904550075531, "step": 857 }, { "epoch": 0.6268493150684932, "grad_norm": 73.51371045594148, "learning_rate": 4.951260526419781e-07, "logits/chosen": -2.7608609199523926, "logits/rejected": -2.1127567291259766, "logps/chosen": -379.95697021484375, "logps/rejected": -330.67498779296875, "loss": 0.4626, "rewards/accuracies": 1.0, "rewards/chosen": 1.133829951286316, "rewards/margins": 2.105902910232544, "rewards/rejected": -0.9720730781555176, "step": 858 }, { "epoch": 0.627579908675799, "grad_norm": 71.54102775571336, "learning_rate": 4.950946605055638e-07, "logits/chosen": -2.861025333404541, "logits/rejected": -2.6325831413269043, "logps/chosen": -959.6657104492188, "logps/rejected": -774.18994140625, "loss": 0.5071, "rewards/accuracies": 0.625, "rewards/chosen": 2.470247983932495, "rewards/margins": 1.5829377174377441, "rewards/rejected": 0.8873103857040405, "step": 859 }, { "epoch": 0.628310502283105, "grad_norm": 59.417836110438046, "learning_rate": 4.950631685997191e-07, "logits/chosen": -3.0673584938049316, "logits/rejected": -2.0051867961883545, "logps/chosen": -811.5911254882812, "logps/rejected": -444.8504943847656, "loss": 0.3669, "rewards/accuracies": 1.0, "rewards/chosen": 2.9583864212036133, "rewards/margins": 2.9535605907440186, "rewards/rejected": 0.0048256516456604, "step": 860 }, { "epoch": 0.6290410958904109, "grad_norm": 60.84584854496032, "learning_rate": 4.950315769372633e-07, "logits/chosen": -2.6547508239746094, "logits/rejected": -2.214069366455078, "logps/chosen": -495.398681640625, "logps/rejected": -447.89556884765625, "loss": 0.3075, "rewards/accuracies": 1.0, "rewards/chosen": 2.0687456130981445, "rewards/margins": 2.592529773712158, "rewards/rejected": -0.5237839221954346, "step": 861 }, { "epoch": 0.6297716894977169, "grad_norm": 50.801899647675576, "learning_rate": 4.949998855310565e-07, "logits/chosen": -3.0217418670654297, "logits/rejected": -2.0190563201904297, "logps/chosen": -1257.72314453125, "logps/rejected": -789.5471801757812, "loss": 0.2738, "rewards/accuracies": 0.875, "rewards/chosen": 3.9746668338775635, "rewards/margins": 2.4716920852661133, "rewards/rejected": 1.502974510192871, "step": 862 }, { "epoch": 0.6305022831050229, "grad_norm": 55.83822151915187, "learning_rate": 4.94968094393999e-07, "logits/chosen": -2.4273486137390137, "logits/rejected": -2.172861099243164, "logps/chosen": -527.9479370117188, "logps/rejected": -524.452392578125, "loss": 0.3846, "rewards/accuracies": 0.875, "rewards/chosen": 2.789274215698242, "rewards/margins": 2.456233024597168, "rewards/rejected": 0.33304086327552795, "step": 863 }, { "epoch": 0.6312328767123287, "grad_norm": 57.816547577904764, "learning_rate": 4.949362035390318e-07, "logits/chosen": -2.676887273788452, "logits/rejected": -2.019822835922241, "logps/chosen": -665.783203125, "logps/rejected": -509.3079528808594, "loss": 0.4049, "rewards/accuracies": 0.875, "rewards/chosen": 2.084261417388916, "rewards/margins": 1.3440684080123901, "rewards/rejected": 0.7401930093765259, "step": 864 }, { "epoch": 0.6319634703196347, "grad_norm": 62.53615750982586, "learning_rate": 4.949042129791366e-07, "logits/chosen": -2.8418569564819336, "logits/rejected": -2.7791695594787598, "logps/chosen": -569.0588989257812, "logps/rejected": -506.7899475097656, "loss": 0.4936, "rewards/accuracies": 0.5, "rewards/chosen": 1.6109131574630737, "rewards/margins": 0.6928425431251526, "rewards/rejected": 0.9180704951286316, "step": 865 }, { "epoch": 0.6326940639269406, "grad_norm": 51.158927274169365, "learning_rate": 4.948721227273356e-07, "logits/chosen": -2.7980566024780273, "logits/rejected": -2.12638258934021, "logps/chosen": -790.5848388671875, "logps/rejected": -474.442626953125, "loss": 0.3322, "rewards/accuracies": 0.875, "rewards/chosen": 2.5192294120788574, "rewards/margins": 2.476807117462158, "rewards/rejected": 0.04242238402366638, "step": 866 }, { "epoch": 0.6334246575342466, "grad_norm": 51.19682597812461, "learning_rate": 4.948399327966917e-07, "logits/chosen": -3.1660497188568115, "logits/rejected": -2.7774417400360107, "logps/chosen": -780.082275390625, "logps/rejected": -652.1351928710938, "loss": 0.2968, "rewards/accuracies": 0.875, "rewards/chosen": 2.566196918487549, "rewards/margins": 1.882054090499878, "rewards/rejected": 0.6841428279876709, "step": 867 }, { "epoch": 0.6341552511415525, "grad_norm": 72.87390114951991, "learning_rate": 4.948076432003081e-07, "logits/chosen": -2.355421543121338, "logits/rejected": -1.8756470680236816, "logps/chosen": -326.3033142089844, "logps/rejected": -287.9959716796875, "loss": 0.5096, "rewards/accuracies": 0.875, "rewards/chosen": 1.584566354751587, "rewards/margins": 2.0083494186401367, "rewards/rejected": -0.42378294467926025, "step": 868 }, { "epoch": 0.6348858447488585, "grad_norm": 84.83749113654734, "learning_rate": 4.94775253951329e-07, "logits/chosen": -2.5492777824401855, "logits/rejected": -1.7027223110198975, "logps/chosen": -595.547607421875, "logps/rejected": -432.4244384765625, "loss": 0.5411, "rewards/accuracies": 0.875, "rewards/chosen": 2.8255562782287598, "rewards/margins": 2.5982131958007812, "rewards/rejected": 0.22734302282333374, "step": 869 }, { "epoch": 0.6356164383561644, "grad_norm": 55.92610742924542, "learning_rate": 4.947427650629389e-07, "logits/chosen": -2.1058032512664795, "logits/rejected": -2.345219373703003, "logps/chosen": -560.8619384765625, "logps/rejected": -539.6914672851562, "loss": 0.3513, "rewards/accuracies": 1.0, "rewards/chosen": 1.6520531177520752, "rewards/margins": 1.772588849067688, "rewards/rejected": -0.12053576111793518, "step": 870 }, { "epoch": 0.6363470319634703, "grad_norm": 64.54187261237854, "learning_rate": 4.947101765483626e-07, "logits/chosen": -2.2669873237609863, "logits/rejected": -2.6001577377319336, "logps/chosen": -619.1602783203125, "logps/rejected": -916.55126953125, "loss": 0.3951, "rewards/accuracies": 0.875, "rewards/chosen": 2.5278661251068115, "rewards/margins": 2.45766282081604, "rewards/rejected": 0.07020336389541626, "step": 871 }, { "epoch": 0.6370776255707763, "grad_norm": 38.19560093340675, "learning_rate": 4.946774884208662e-07, "logits/chosen": -2.4609532356262207, "logits/rejected": -2.103001832962036, "logps/chosen": -649.0516357421875, "logps/rejected": -737.8178100585938, "loss": 0.2174, "rewards/accuracies": 1.0, "rewards/chosen": 2.4835896492004395, "rewards/margins": 2.5745766162872314, "rewards/rejected": -0.09098678827285767, "step": 872 }, { "epoch": 0.6378082191780822, "grad_norm": 90.42044262948782, "learning_rate": 4.946447006937555e-07, "logits/chosen": -3.059199810028076, "logits/rejected": -2.9465396404266357, "logps/chosen": -807.4127197265625, "logps/rejected": -535.21728515625, "loss": 0.6502, "rewards/accuracies": 0.25, "rewards/chosen": 1.0172383785247803, "rewards/margins": -0.20906662940979004, "rewards/rejected": 1.2263048887252808, "step": 873 }, { "epoch": 0.6385388127853882, "grad_norm": 40.17914139600686, "learning_rate": 4.946118133803774e-07, "logits/chosen": -2.571671962738037, "logits/rejected": -1.8863211870193481, "logps/chosen": -749.1961669921875, "logps/rejected": -487.21826171875, "loss": 0.2814, "rewards/accuracies": 1.0, "rewards/chosen": 2.3566808700561523, "rewards/margins": 1.495071530342102, "rewards/rejected": 0.8616091012954712, "step": 874 }, { "epoch": 0.639269406392694, "grad_norm": 41.984883812510425, "learning_rate": 4.945788264941191e-07, "logits/chosen": -2.3224027156829834, "logits/rejected": -2.134232521057129, "logps/chosen": -441.1449279785156, "logps/rejected": -362.6922302246094, "loss": 0.2939, "rewards/accuracies": 0.75, "rewards/chosen": 1.4246070384979248, "rewards/margins": 1.5823309421539307, "rewards/rejected": -0.1577240228652954, "step": 875 }, { "epoch": 0.64, "grad_norm": 58.18623286751343, "learning_rate": 4.945457400484085e-07, "logits/chosen": -2.251019239425659, "logits/rejected": -2.1046814918518066, "logps/chosen": -288.5109558105469, "logps/rejected": -360.20709228515625, "loss": 0.362, "rewards/accuracies": 0.75, "rewards/chosen": 1.4165167808532715, "rewards/margins": 1.8365285396575928, "rewards/rejected": -0.4200117886066437, "step": 876 }, { "epoch": 0.6407305936073059, "grad_norm": 47.35932344932859, "learning_rate": 4.945125540567138e-07, "logits/chosen": -3.079472780227661, "logits/rejected": -1.849731206893921, "logps/chosen": -662.7115478515625, "logps/rejected": -453.1530456542969, "loss": 0.3244, "rewards/accuracies": 0.75, "rewards/chosen": 2.281808376312256, "rewards/margins": 2.72877836227417, "rewards/rejected": -0.446970134973526, "step": 877 }, { "epoch": 0.6414611872146119, "grad_norm": 55.64706812947336, "learning_rate": 4.94479268532544e-07, "logits/chosen": -2.488327741622925, "logits/rejected": -2.0342321395874023, "logps/chosen": -660.4818115234375, "logps/rejected": -541.6734619140625, "loss": 0.3254, "rewards/accuracies": 0.875, "rewards/chosen": 1.9725608825683594, "rewards/margins": 1.647395133972168, "rewards/rejected": 0.32516586780548096, "step": 878 }, { "epoch": 0.6421917808219179, "grad_norm": 52.16486223083983, "learning_rate": 4.944458834894483e-07, "logits/chosen": -2.8095691204071045, "logits/rejected": -1.9103584289550781, "logps/chosen": -801.964111328125, "logps/rejected": -463.3938903808594, "loss": 0.231, "rewards/accuracies": 1.0, "rewards/chosen": 2.6098685264587402, "rewards/margins": 2.584402322769165, "rewards/rejected": 0.025466330349445343, "step": 879 }, { "epoch": 0.6429223744292237, "grad_norm": 44.884280464462165, "learning_rate": 4.944123989410168e-07, "logits/chosen": -3.0036590099334717, "logits/rejected": -2.1367745399475098, "logps/chosen": -625.281494140625, "logps/rejected": -389.98406982421875, "loss": 0.3086, "rewards/accuracies": 0.875, "rewards/chosen": 1.5197726488113403, "rewards/margins": 1.6711499691009521, "rewards/rejected": -0.15137749910354614, "step": 880 }, { "epoch": 0.6436529680365297, "grad_norm": 58.025938350744276, "learning_rate": 4.943788149008798e-07, "logits/chosen": -2.5285136699676514, "logits/rejected": -2.1431009769439697, "logps/chosen": -945.5923461914062, "logps/rejected": -702.7506103515625, "loss": 0.4087, "rewards/accuracies": 1.0, "rewards/chosen": 3.2770473957061768, "rewards/margins": 2.0040063858032227, "rewards/rejected": 1.2730411291122437, "step": 881 }, { "epoch": 0.6443835616438356, "grad_norm": 58.811069904902894, "learning_rate": 4.94345131382708e-07, "logits/chosen": -2.920135259628296, "logits/rejected": -2.6130900382995605, "logps/chosen": -326.99713134765625, "logps/rejected": -280.4598083496094, "loss": 0.4441, "rewards/accuracies": 0.5, "rewards/chosen": 1.2923710346221924, "rewards/margins": 1.310500979423523, "rewards/rejected": -0.018129967153072357, "step": 882 }, { "epoch": 0.6451141552511416, "grad_norm": 52.01820876355461, "learning_rate": 4.94311348400213e-07, "logits/chosen": -2.790419101715088, "logits/rejected": -2.4249863624572754, "logps/chosen": -758.2681274414062, "logps/rejected": -643.06982421875, "loss": 0.3427, "rewards/accuracies": 0.625, "rewards/chosen": 2.2394282817840576, "rewards/margins": 1.1449041366577148, "rewards/rejected": 1.0945241451263428, "step": 883 }, { "epoch": 0.6458447488584474, "grad_norm": 55.601605860071615, "learning_rate": 4.942774659671465e-07, "logits/chosen": -3.3382771015167236, "logits/rejected": -1.941870927810669, "logps/chosen": -856.6427001953125, "logps/rejected": -546.7882690429688, "loss": 0.2782, "rewards/accuracies": 0.875, "rewards/chosen": 4.2144083976745605, "rewards/margins": 4.444570541381836, "rewards/rejected": -0.23016266524791718, "step": 884 }, { "epoch": 0.6465753424657534, "grad_norm": 64.65609896067424, "learning_rate": 4.94243484097301e-07, "logits/chosen": -2.383833646774292, "logits/rejected": -1.9700257778167725, "logps/chosen": -717.3999633789062, "logps/rejected": -612.34912109375, "loss": 0.4167, "rewards/accuracies": 0.625, "rewards/chosen": 3.1709399223327637, "rewards/margins": 2.8242015838623047, "rewards/rejected": 0.34673798084259033, "step": 885 }, { "epoch": 0.6473059360730593, "grad_norm": 67.32324223707961, "learning_rate": 4.942094028045092e-07, "logits/chosen": -2.995544672012329, "logits/rejected": -2.4418833255767822, "logps/chosen": -681.30517578125, "logps/rejected": -456.344970703125, "loss": 0.4138, "rewards/accuracies": 0.625, "rewards/chosen": 1.0332204103469849, "rewards/margins": 1.0200177431106567, "rewards/rejected": 0.013202555477619171, "step": 886 }, { "epoch": 0.6480365296803653, "grad_norm": 1266.7594483563169, "learning_rate": 4.941752221026445e-07, "logits/chosen": -2.5863797664642334, "logits/rejected": -1.9733312129974365, "logps/chosen": -736.0822143554688, "logps/rejected": -651.9111328125, "loss": 1.0507, "rewards/accuracies": 1.0, "rewards/chosen": 3.3136515617370605, "rewards/margins": 2.807572841644287, "rewards/rejected": 0.5060788989067078, "step": 887 }, { "epoch": 0.6487671232876713, "grad_norm": 53.1862629299439, "learning_rate": 4.941409420056206e-07, "logits/chosen": -3.0887789726257324, "logits/rejected": -1.9597086906433105, "logps/chosen": -549.4454345703125, "logps/rejected": -345.0615539550781, "loss": 0.3472, "rewards/accuracies": 0.75, "rewards/chosen": 2.6216630935668945, "rewards/margins": 3.2388782501220703, "rewards/rejected": -0.6172152161598206, "step": 888 }, { "epoch": 0.6494977168949772, "grad_norm": 76.00880461437511, "learning_rate": 4.941065625273918e-07, "logits/chosen": -2.799771785736084, "logits/rejected": -2.114041328430176, "logps/chosen": -586.114013671875, "logps/rejected": -516.5181884765625, "loss": 0.4434, "rewards/accuracies": 0.875, "rewards/chosen": 2.347651243209839, "rewards/margins": 2.296440362930298, "rewards/rejected": 0.05121105909347534, "step": 889 }, { "epoch": 0.6502283105022831, "grad_norm": 61.45346070020182, "learning_rate": 4.940720836819527e-07, "logits/chosen": -3.505096673965454, "logits/rejected": -2.528386354446411, "logps/chosen": -969.1098022460938, "logps/rejected": -729.86572265625, "loss": 0.3226, "rewards/accuracies": 0.875, "rewards/chosen": 3.4306976795196533, "rewards/margins": 2.1338443756103516, "rewards/rejected": 1.2968528270721436, "step": 890 }, { "epoch": 0.650958904109589, "grad_norm": 64.81191236474203, "learning_rate": 4.940375054833384e-07, "logits/chosen": -2.694981813430786, "logits/rejected": -2.4000468254089355, "logps/chosen": -550.6650390625, "logps/rejected": -496.05133056640625, "loss": 0.3686, "rewards/accuracies": 0.75, "rewards/chosen": 0.8439410924911499, "rewards/margins": 1.1586757898330688, "rewards/rejected": -0.31473466753959656, "step": 891 }, { "epoch": 0.651689497716895, "grad_norm": 94.30738989635171, "learning_rate": 4.940028279456246e-07, "logits/chosen": -2.656019926071167, "logits/rejected": -2.5909316539764404, "logps/chosen": -947.049072265625, "logps/rejected": -876.1315307617188, "loss": 0.6962, "rewards/accuracies": 0.625, "rewards/chosen": 2.2593753337860107, "rewards/margins": 0.5160991549491882, "rewards/rejected": 1.7432762384414673, "step": 892 }, { "epoch": 0.6524200913242009, "grad_norm": 51.649909339374695, "learning_rate": 4.939680510829272e-07, "logits/chosen": -2.232304334640503, "logits/rejected": -2.227292776107788, "logps/chosen": -442.5836181640625, "logps/rejected": -454.00738525390625, "loss": 0.3474, "rewards/accuracies": 0.625, "rewards/chosen": 0.9070858359336853, "rewards/margins": 0.9852018356323242, "rewards/rejected": -0.0781160369515419, "step": 893 }, { "epoch": 0.6531506849315069, "grad_norm": 60.75679336052163, "learning_rate": 4.939331749094026e-07, "logits/chosen": -2.8827648162841797, "logits/rejected": -1.798945426940918, "logps/chosen": -424.68682861328125, "logps/rejected": -267.74285888671875, "loss": 0.3259, "rewards/accuracies": 0.875, "rewards/chosen": 3.1906940937042236, "rewards/margins": 4.237615585327148, "rewards/rejected": -1.046921730041504, "step": 894 }, { "epoch": 0.6538812785388128, "grad_norm": 74.17081675830292, "learning_rate": 4.938981994392479e-07, "logits/chosen": -2.511941909790039, "logits/rejected": -2.314192295074463, "logps/chosen": -527.7523193359375, "logps/rejected": -763.5256958007812, "loss": 0.4971, "rewards/accuracies": 0.625, "rewards/chosen": 1.845599889755249, "rewards/margins": 1.145053505897522, "rewards/rejected": 0.7005465030670166, "step": 895 }, { "epoch": 0.6546118721461187, "grad_norm": 62.82774807148809, "learning_rate": 4.938631246867e-07, "logits/chosen": -2.746319055557251, "logits/rejected": -2.2491583824157715, "logps/chosen": -767.2687377929688, "logps/rejected": -633.8179931640625, "loss": 0.3582, "rewards/accuracies": 1.0, "rewards/chosen": 2.231450319290161, "rewards/margins": 2.05902099609375, "rewards/rejected": 0.17242926359176636, "step": 896 }, { "epoch": 0.6553424657534247, "grad_norm": 43.218865803351726, "learning_rate": 4.938279506660369e-07, "logits/chosen": -3.087899923324585, "logits/rejected": -2.852038860321045, "logps/chosen": -703.9903564453125, "logps/rejected": -675.7357177734375, "loss": 0.2435, "rewards/accuracies": 0.875, "rewards/chosen": 3.2270665168762207, "rewards/margins": 3.574326515197754, "rewards/rejected": -0.34726014733314514, "step": 897 }, { "epoch": 0.6560730593607306, "grad_norm": 80.06531452410432, "learning_rate": 4.937926773915767e-07, "logits/chosen": -2.0958948135375977, "logits/rejected": -2.475428581237793, "logps/chosen": -352.11431884765625, "logps/rejected": -564.7581787109375, "loss": 0.5111, "rewards/accuracies": 0.625, "rewards/chosen": 1.3117461204528809, "rewards/margins": 0.6923261284828186, "rewards/rejected": 0.619420051574707, "step": 898 }, { "epoch": 0.6568036529680366, "grad_norm": 62.35490775780504, "learning_rate": 4.937573048776777e-07, "logits/chosen": -2.9837491512298584, "logits/rejected": -2.5525827407836914, "logps/chosen": -700.430419921875, "logps/rejected": -635.0186767578125, "loss": 0.3676, "rewards/accuracies": 0.75, "rewards/chosen": 1.998189926147461, "rewards/margins": 1.6150095462799072, "rewards/rejected": 0.3831803798675537, "step": 899 }, { "epoch": 0.6575342465753424, "grad_norm": 69.21772360977172, "learning_rate": 4.937218331387391e-07, "logits/chosen": -2.8126540184020996, "logits/rejected": -2.089111804962158, "logps/chosen": -683.80126953125, "logps/rejected": -576.7792358398438, "loss": 0.4062, "rewards/accuracies": 0.75, "rewards/chosen": 2.1431326866149902, "rewards/margins": 1.4462800025939941, "rewards/rejected": 0.6968527436256409, "step": 900 }, { "epoch": 0.6582648401826484, "grad_norm": 64.19329635464283, "learning_rate": 4.936862621891999e-07, "logits/chosen": -2.8880441188812256, "logits/rejected": -2.1220076084136963, "logps/chosen": -454.35394287109375, "logps/rejected": -441.85711669921875, "loss": 0.4007, "rewards/accuracies": 0.75, "rewards/chosen": 2.059741973876953, "rewards/margins": 2.2224221229553223, "rewards/rejected": -0.1626802235841751, "step": 901 }, { "epoch": 0.6589954337899543, "grad_norm": 48.96705235969258, "learning_rate": 4.936505920435401e-07, "logits/chosen": -3.3665857315063477, "logits/rejected": -2.621155023574829, "logps/chosen": -923.7412109375, "logps/rejected": -577.2442626953125, "loss": 0.3194, "rewards/accuracies": 0.875, "rewards/chosen": 2.9513306617736816, "rewards/margins": 1.860923409461975, "rewards/rejected": 1.090407371520996, "step": 902 }, { "epoch": 0.6597260273972603, "grad_norm": 55.419412477009125, "learning_rate": 4.936148227162795e-07, "logits/chosen": -2.871873140335083, "logits/rejected": -1.8878321647644043, "logps/chosen": -542.079345703125, "logps/rejected": -423.2261047363281, "loss": 0.399, "rewards/accuracies": 0.75, "rewards/chosen": 3.069192886352539, "rewards/margins": 3.2538554668426514, "rewards/rejected": -0.18466243147850037, "step": 903 }, { "epoch": 0.6604566210045663, "grad_norm": 69.49836300904863, "learning_rate": 4.935789542219787e-07, "logits/chosen": -3.4346683025360107, "logits/rejected": -3.0754740238189697, "logps/chosen": -911.1216430664062, "logps/rejected": -782.01025390625, "loss": 0.3241, "rewards/accuracies": 0.75, "rewards/chosen": 2.266672134399414, "rewards/margins": 1.7020765542984009, "rewards/rejected": 0.564595639705658, "step": 904 }, { "epoch": 0.6611872146118721, "grad_norm": 50.9226133346289, "learning_rate": 4.935429865752384e-07, "logits/chosen": -2.6683781147003174, "logits/rejected": -1.9789679050445557, "logps/chosen": -649.667724609375, "logps/rejected": -527.42529296875, "loss": 0.2765, "rewards/accuracies": 0.875, "rewards/chosen": 2.0859744548797607, "rewards/margins": 2.4150326251983643, "rewards/rejected": -0.3290581703186035, "step": 905 }, { "epoch": 0.6619178082191781, "grad_norm": 73.5663961903633, "learning_rate": 4.935069197906998e-07, "logits/chosen": -2.9427053928375244, "logits/rejected": -2.292163848876953, "logps/chosen": -736.4996337890625, "logps/rejected": -520.7926025390625, "loss": 0.4169, "rewards/accuracies": 1.0, "rewards/chosen": 1.9073214530944824, "rewards/margins": 1.3313028812408447, "rewards/rejected": 0.5760185718536377, "step": 906 }, { "epoch": 0.662648401826484, "grad_norm": 52.55448019600113, "learning_rate": 4.934707538830444e-07, "logits/chosen": -3.160762310028076, "logits/rejected": -2.1640701293945312, "logps/chosen": -598.3094482421875, "logps/rejected": -488.5225524902344, "loss": 0.267, "rewards/accuracies": 1.0, "rewards/chosen": 2.577397584915161, "rewards/margins": 4.3487420082092285, "rewards/rejected": -1.7713441848754883, "step": 907 }, { "epoch": 0.66337899543379, "grad_norm": 53.40990698421502, "learning_rate": 4.934344888669941e-07, "logits/chosen": -3.0788774490356445, "logits/rejected": -2.071681261062622, "logps/chosen": -531.5466918945312, "logps/rejected": -330.03839111328125, "loss": 0.3565, "rewards/accuracies": 0.75, "rewards/chosen": 0.7731748819351196, "rewards/margins": 1.3105928897857666, "rewards/rejected": -0.537418007850647, "step": 908 }, { "epoch": 0.6641095890410958, "grad_norm": 66.29716776096095, "learning_rate": 4.933981247573112e-07, "logits/chosen": -2.861691951751709, "logits/rejected": -1.7485616207122803, "logps/chosen": -963.1710205078125, "logps/rejected": -555.4337158203125, "loss": 0.408, "rewards/accuracies": 0.875, "rewards/chosen": 2.467695951461792, "rewards/margins": 2.351679801940918, "rewards/rejected": 0.11601614952087402, "step": 909 }, { "epoch": 0.6648401826484018, "grad_norm": 71.94121949550203, "learning_rate": 4.93361661568798e-07, "logits/chosen": -2.755171775817871, "logits/rejected": -2.368405342102051, "logps/chosen": -611.7208251953125, "logps/rejected": -549.2432861328125, "loss": 0.4668, "rewards/accuracies": 0.375, "rewards/chosen": 1.1875005960464478, "rewards/margins": 0.7030258774757385, "rewards/rejected": 0.4844748079776764, "step": 910 }, { "epoch": 0.6655707762557077, "grad_norm": 82.59479343362842, "learning_rate": 4.933250993162977e-07, "logits/chosen": -2.6900949478149414, "logits/rejected": -2.2861359119415283, "logps/chosen": -846.5252685546875, "logps/rejected": -710.3515625, "loss": 0.551, "rewards/accuracies": 0.75, "rewards/chosen": 2.1504595279693604, "rewards/margins": 2.5050840377807617, "rewards/rejected": -0.35462450981140137, "step": 911 }, { "epoch": 0.6663013698630137, "grad_norm": 65.4019759413372, "learning_rate": 4.932884380146933e-07, "logits/chosen": -2.79636812210083, "logits/rejected": -2.3312571048736572, "logps/chosen": -531.7986450195312, "logps/rejected": -491.260009765625, "loss": 0.3769, "rewards/accuracies": 0.625, "rewards/chosen": 1.9778181314468384, "rewards/margins": 1.569877028465271, "rewards/rejected": 0.4079412519931793, "step": 912 }, { "epoch": 0.6670319634703197, "grad_norm": 72.26811651110398, "learning_rate": 4.932516776789083e-07, "logits/chosen": -2.042647123336792, "logits/rejected": -2.5705840587615967, "logps/chosen": -299.7626037597656, "logps/rejected": -455.84686279296875, "loss": 0.4331, "rewards/accuracies": 0.75, "rewards/chosen": 0.5433505177497864, "rewards/margins": 0.6544384956359863, "rewards/rejected": -0.11108788847923279, "step": 913 }, { "epoch": 0.6677625570776256, "grad_norm": 73.40169339838148, "learning_rate": 4.932148183239067e-07, "logits/chosen": -2.7540862560272217, "logits/rejected": -2.555201530456543, "logps/chosen": -727.80712890625, "logps/rejected": -566.6559448242188, "loss": 0.4872, "rewards/accuracies": 0.875, "rewards/chosen": 2.1384458541870117, "rewards/margins": 1.3771039247512817, "rewards/rejected": 0.7613418102264404, "step": 914 }, { "epoch": 0.6684931506849315, "grad_norm": 55.96625701106092, "learning_rate": 4.931778599646926e-07, "logits/chosen": -2.6392505168914795, "logits/rejected": -1.9247242212295532, "logps/chosen": -521.2755126953125, "logps/rejected": -435.4347839355469, "loss": 0.3452, "rewards/accuracies": 0.875, "rewards/chosen": 0.8342106342315674, "rewards/margins": 2.203885793685913, "rewards/rejected": -1.3696751594543457, "step": 915 }, { "epoch": 0.6692237442922374, "grad_norm": 44.45610161387439, "learning_rate": 4.931408026163104e-07, "logits/chosen": -2.9477880001068115, "logits/rejected": -2.49404239654541, "logps/chosen": -607.2876586914062, "logps/rejected": -581.0928955078125, "loss": 0.2396, "rewards/accuracies": 0.875, "rewards/chosen": 2.397975444793701, "rewards/margins": 2.836374044418335, "rewards/rejected": -0.438398540019989, "step": 916 }, { "epoch": 0.6699543378995434, "grad_norm": 57.06689382727739, "learning_rate": 4.931036462938449e-07, "logits/chosen": -2.8308911323547363, "logits/rejected": -2.2164931297302246, "logps/chosen": -458.6614074707031, "logps/rejected": -420.4760437011719, "loss": 0.2789, "rewards/accuracies": 0.625, "rewards/chosen": 1.7158535718917847, "rewards/margins": 2.4847002029418945, "rewards/rejected": -0.7688465118408203, "step": 917 }, { "epoch": 0.6706849315068493, "grad_norm": 74.5174974209078, "learning_rate": 4.93066391012421e-07, "logits/chosen": -2.3233792781829834, "logits/rejected": -2.015622854232788, "logps/chosen": -486.2101745605469, "logps/rejected": -358.131591796875, "loss": 0.4337, "rewards/accuracies": 0.625, "rewards/chosen": 2.094123363494873, "rewards/margins": 1.7962307929992676, "rewards/rejected": 0.2978927493095398, "step": 918 }, { "epoch": 0.6714155251141553, "grad_norm": 60.057835488099514, "learning_rate": 4.930290367872043e-07, "logits/chosen": -2.7270255088806152, "logits/rejected": -2.0756335258483887, "logps/chosen": -444.1468200683594, "logps/rejected": -355.49749755859375, "loss": 0.4114, "rewards/accuracies": 0.75, "rewards/chosen": 1.0450445413589478, "rewards/margins": 1.479905605316162, "rewards/rejected": -0.4348610043525696, "step": 919 }, { "epoch": 0.6721461187214612, "grad_norm": 65.1357131903267, "learning_rate": 4.929915836334001e-07, "logits/chosen": -2.611739158630371, "logits/rejected": -2.219984531402588, "logps/chosen": -606.3237915039062, "logps/rejected": -502.8520202636719, "loss": 0.3479, "rewards/accuracies": 0.75, "rewards/chosen": 1.9275360107421875, "rewards/margins": 1.862315058708191, "rewards/rejected": 0.06522104144096375, "step": 920 }, { "epoch": 0.6728767123287671, "grad_norm": 57.88982160373704, "learning_rate": 4.929540315662544e-07, "logits/chosen": -2.467137336730957, "logits/rejected": -1.7548129558563232, "logps/chosen": -601.98193359375, "logps/rejected": -392.14178466796875, "loss": 0.359, "rewards/accuracies": 0.75, "rewards/chosen": 1.44003427028656, "rewards/margins": 1.8650550842285156, "rewards/rejected": -0.4250207245349884, "step": 921 }, { "epoch": 0.6736073059360731, "grad_norm": 48.500445491671776, "learning_rate": 4.929163806010533e-07, "logits/chosen": -2.8287951946258545, "logits/rejected": -1.983891487121582, "logps/chosen": -664.7245483398438, "logps/rejected": -490.0953369140625, "loss": 0.2818, "rewards/accuracies": 0.875, "rewards/chosen": 2.7109594345092773, "rewards/margins": 2.6683647632598877, "rewards/rejected": 0.042594701051712036, "step": 922 }, { "epoch": 0.674337899543379, "grad_norm": 36.92575624966096, "learning_rate": 4.928786307531232e-07, "logits/chosen": -2.965285539627075, "logits/rejected": -2.048971652984619, "logps/chosen": -798.52099609375, "logps/rejected": -435.0608825683594, "loss": 0.1908, "rewards/accuracies": 0.75, "rewards/chosen": 2.2848944664001465, "rewards/margins": 2.55867338180542, "rewards/rejected": -0.27377915382385254, "step": 923 }, { "epoch": 0.675068493150685, "grad_norm": 46.6647916871288, "learning_rate": 4.928407820378307e-07, "logits/chosen": -3.133445978164673, "logits/rejected": -2.2335031032562256, "logps/chosen": -426.4561462402344, "logps/rejected": -300.8240051269531, "loss": 0.3224, "rewards/accuracies": 0.625, "rewards/chosen": 1.3480267524719238, "rewards/margins": 1.1989283561706543, "rewards/rejected": 0.1490984857082367, "step": 924 }, { "epoch": 0.6757990867579908, "grad_norm": 47.65265355281907, "learning_rate": 4.928028344705828e-07, "logits/chosen": -3.030505657196045, "logits/rejected": -2.064818859100342, "logps/chosen": -553.4521484375, "logps/rejected": -438.64141845703125, "loss": 0.2636, "rewards/accuracies": 0.875, "rewards/chosen": 3.1887881755828857, "rewards/margins": 3.5315194129943848, "rewards/rejected": -0.34273138642311096, "step": 925 }, { "epoch": 0.6765296803652968, "grad_norm": 51.74828037224401, "learning_rate": 4.927647880668265e-07, "logits/chosen": -2.702692985534668, "logits/rejected": -2.633432388305664, "logps/chosen": -475.14105224609375, "logps/rejected": -513.3734130859375, "loss": 0.3641, "rewards/accuracies": 1.0, "rewards/chosen": 1.5113164186477661, "rewards/margins": 1.4558370113372803, "rewards/rejected": 0.05547948181629181, "step": 926 }, { "epoch": 0.6772602739726027, "grad_norm": 49.58169010976356, "learning_rate": 4.927266428420493e-07, "logits/chosen": -2.937946081161499, "logits/rejected": -2.475146770477295, "logps/chosen": -580.5358276367188, "logps/rejected": -404.68353271484375, "loss": 0.324, "rewards/accuracies": 0.875, "rewards/chosen": 2.5154836177825928, "rewards/margins": 2.3161447048187256, "rewards/rejected": 0.199338898062706, "step": 927 }, { "epoch": 0.6779908675799087, "grad_norm": 126.20894675967905, "learning_rate": 4.926883988117785e-07, "logits/chosen": -2.5185322761535645, "logits/rejected": -1.9349713325500488, "logps/chosen": -764.0927734375, "logps/rejected": -606.7362670898438, "loss": 0.5474, "rewards/accuracies": 0.75, "rewards/chosen": 2.4166224002838135, "rewards/margins": 2.2272510528564453, "rewards/rejected": 0.18937119841575623, "step": 928 }, { "epoch": 0.6787214611872147, "grad_norm": 73.08063892137403, "learning_rate": 4.92650055991582e-07, "logits/chosen": -2.263636589050293, "logits/rejected": -2.4027464389801025, "logps/chosen": -538.745849609375, "logps/rejected": -533.3817138671875, "loss": 0.4108, "rewards/accuracies": 0.625, "rewards/chosen": 1.38490629196167, "rewards/margins": 1.2636432647705078, "rewards/rejected": 0.12126301229000092, "step": 929 }, { "epoch": 0.6794520547945205, "grad_norm": 59.86518871859087, "learning_rate": 4.926116143970681e-07, "logits/chosen": -3.1219465732574463, "logits/rejected": -1.7010993957519531, "logps/chosen": -673.5379638671875, "logps/rejected": -470.86773681640625, "loss": 0.3989, "rewards/accuracies": 0.875, "rewards/chosen": 3.8847897052764893, "rewards/margins": 3.6426262855529785, "rewards/rejected": 0.24216300249099731, "step": 930 }, { "epoch": 0.6801826484018265, "grad_norm": 65.15733287469455, "learning_rate": 4.925730740438847e-07, "logits/chosen": -2.8559322357177734, "logits/rejected": -2.2970428466796875, "logps/chosen": -961.2964477539062, "logps/rejected": -738.7664184570312, "loss": 0.4159, "rewards/accuracies": 0.75, "rewards/chosen": 1.978811502456665, "rewards/margins": 1.8807859420776367, "rewards/rejected": 0.09802570939064026, "step": 931 }, { "epoch": 0.6809132420091324, "grad_norm": 52.82238489223224, "learning_rate": 4.925344349477204e-07, "logits/chosen": -2.525325059890747, "logits/rejected": -2.0192532539367676, "logps/chosen": -496.0273742675781, "logps/rejected": -473.9057922363281, "loss": 0.3694, "rewards/accuracies": 0.875, "rewards/chosen": 1.1806787252426147, "rewards/margins": 2.0074284076690674, "rewards/rejected": -0.8267496824264526, "step": 932 }, { "epoch": 0.6816438356164384, "grad_norm": 70.18297096919238, "learning_rate": 4.924956971243037e-07, "logits/chosen": -2.6544811725616455, "logits/rejected": -2.9057555198669434, "logps/chosen": -966.1417236328125, "logps/rejected": -1088.1771240234375, "loss": 0.3114, "rewards/accuracies": 0.625, "rewards/chosen": 2.264045238494873, "rewards/margins": 1.1827392578125, "rewards/rejected": 1.0813060998916626, "step": 933 }, { "epoch": 0.6823744292237442, "grad_norm": 47.7195634753845, "learning_rate": 4.924568605894035e-07, "logits/chosen": -2.5267062187194824, "logits/rejected": -2.336824655532837, "logps/chosen": -623.086181640625, "logps/rejected": -489.60479736328125, "loss": 0.306, "rewards/accuracies": 0.75, "rewards/chosen": 2.3005101680755615, "rewards/margins": 2.5970702171325684, "rewards/rejected": -0.29655981063842773, "step": 934 }, { "epoch": 0.6831050228310502, "grad_norm": 57.057975330606716, "learning_rate": 4.924179253588287e-07, "logits/chosen": -3.5476460456848145, "logits/rejected": -2.263206958770752, "logps/chosen": -950.0072021484375, "logps/rejected": -626.3217163085938, "loss": 0.3666, "rewards/accuracies": 0.875, "rewards/chosen": 3.21407413482666, "rewards/margins": 1.8430633544921875, "rewards/rejected": 1.3710107803344727, "step": 935 }, { "epoch": 0.6838356164383562, "grad_norm": 60.97691233483887, "learning_rate": 4.923788914484287e-07, "logits/chosen": -3.129023313522339, "logits/rejected": -2.220106601715088, "logps/chosen": -878.6098022460938, "logps/rejected": -589.9588012695312, "loss": 0.3314, "rewards/accuracies": 0.875, "rewards/chosen": 2.7871479988098145, "rewards/margins": 2.2629575729370117, "rewards/rejected": 0.5241904258728027, "step": 936 }, { "epoch": 0.6845662100456621, "grad_norm": 56.26626717906231, "learning_rate": 4.923397588740925e-07, "logits/chosen": -3.454819917678833, "logits/rejected": -2.6880667209625244, "logps/chosen": -935.4348754882812, "logps/rejected": -763.6896362304688, "loss": 0.2878, "rewards/accuracies": 0.75, "rewards/chosen": 2.5008034706115723, "rewards/margins": 1.7465474605560303, "rewards/rejected": 0.7542560696601868, "step": 937 }, { "epoch": 0.6852968036529681, "grad_norm": 47.26576187996709, "learning_rate": 4.923005276517498e-07, "logits/chosen": -2.7871603965759277, "logits/rejected": -2.0008838176727295, "logps/chosen": -601.3297119140625, "logps/rejected": -445.8765869140625, "loss": 0.2666, "rewards/accuracies": 0.875, "rewards/chosen": 2.398538112640381, "rewards/margins": 2.709686756134033, "rewards/rejected": -0.31114864349365234, "step": 938 }, { "epoch": 0.686027397260274, "grad_norm": 61.91824364825261, "learning_rate": 4.922611977973702e-07, "logits/chosen": -3.2147719860076904, "logits/rejected": -2.477771759033203, "logps/chosen": -926.0745849609375, "logps/rejected": -602.239501953125, "loss": 0.2921, "rewards/accuracies": 0.75, "rewards/chosen": 1.4744713306427002, "rewards/margins": 1.3029650449752808, "rewards/rejected": 0.17150628566741943, "step": 939 }, { "epoch": 0.6867579908675799, "grad_norm": 75.32408494922126, "learning_rate": 4.922217693269635e-07, "logits/chosen": -2.4113731384277344, "logits/rejected": -2.1279923915863037, "logps/chosen": -793.038818359375, "logps/rejected": -577.8466796875, "loss": 0.5064, "rewards/accuracies": 1.0, "rewards/chosen": 3.2682156562805176, "rewards/margins": 2.9409117698669434, "rewards/rejected": 0.3273039758205414, "step": 940 }, { "epoch": 0.6874885844748858, "grad_norm": 58.05046464338162, "learning_rate": 4.921822422565796e-07, "logits/chosen": -2.85858154296875, "logits/rejected": -2.097067356109619, "logps/chosen": -674.2628173828125, "logps/rejected": -527.0262451171875, "loss": 0.3237, "rewards/accuracies": 0.875, "rewards/chosen": 2.3410592079162598, "rewards/margins": 1.653433084487915, "rewards/rejected": 0.6876261234283447, "step": 941 }, { "epoch": 0.6882191780821918, "grad_norm": 57.850538583950325, "learning_rate": 4.921426166023087e-07, "logits/chosen": -2.6792168617248535, "logits/rejected": -1.9822622537612915, "logps/chosen": -471.7249450683594, "logps/rejected": -447.81243896484375, "loss": 0.3006, "rewards/accuracies": 0.875, "rewards/chosen": 1.787753701210022, "rewards/margins": 2.6097493171691895, "rewards/rejected": -0.8219956755638123, "step": 942 }, { "epoch": 0.6889497716894977, "grad_norm": 57.82100233087347, "learning_rate": 4.921028923802809e-07, "logits/chosen": -2.9440219402313232, "logits/rejected": -2.549098253250122, "logps/chosen": -586.4796142578125, "logps/rejected": -467.1344909667969, "loss": 0.368, "rewards/accuracies": 0.875, "rewards/chosen": 1.3736364841461182, "rewards/margins": 1.827956199645996, "rewards/rejected": -0.45431989431381226, "step": 943 }, { "epoch": 0.6896803652968037, "grad_norm": 52.69298143936012, "learning_rate": 4.920630696066667e-07, "logits/chosen": -2.9566659927368164, "logits/rejected": -2.0857250690460205, "logps/chosen": -621.8773193359375, "logps/rejected": -540.8276977539062, "loss": 0.4159, "rewards/accuracies": 0.875, "rewards/chosen": 1.0470550060272217, "rewards/margins": 2.323245048522949, "rewards/rejected": -1.2761902809143066, "step": 944 }, { "epoch": 0.6904109589041096, "grad_norm": 50.179584719846474, "learning_rate": 4.920231482976763e-07, "logits/chosen": -2.3813881874084473, "logits/rejected": -2.0693674087524414, "logps/chosen": -777.0482177734375, "logps/rejected": -679.3743286132812, "loss": 0.255, "rewards/accuracies": 0.875, "rewards/chosen": 2.434983253479004, "rewards/margins": 2.3705735206604004, "rewards/rejected": 0.06440985202789307, "step": 945 }, { "epoch": 0.6911415525114155, "grad_norm": 59.66693035614452, "learning_rate": 4.919831284695605e-07, "logits/chosen": -2.676412343978882, "logits/rejected": -2.396084785461426, "logps/chosen": -682.8955078125, "logps/rejected": -522.9075927734375, "loss": 0.399, "rewards/accuracies": 0.875, "rewards/chosen": 2.4268484115600586, "rewards/margins": 2.633355140686035, "rewards/rejected": -0.20650669932365417, "step": 946 }, { "epoch": 0.6918721461187215, "grad_norm": 50.304264659855136, "learning_rate": 4.919430101386097e-07, "logits/chosen": -3.3293933868408203, "logits/rejected": -2.741257667541504, "logps/chosen": -1017.3448486328125, "logps/rejected": -731.84912109375, "loss": 0.2486, "rewards/accuracies": 0.875, "rewards/chosen": 2.795750856399536, "rewards/margins": 2.0757789611816406, "rewards/rejected": 0.7199719548225403, "step": 947 }, { "epoch": 0.6926027397260274, "grad_norm": 62.586863466031154, "learning_rate": 4.919027933211551e-07, "logits/chosen": -2.6620819568634033, "logits/rejected": -2.523405075073242, "logps/chosen": -673.203369140625, "logps/rejected": -526.5364990234375, "loss": 0.4438, "rewards/accuracies": 0.875, "rewards/chosen": 0.9613515734672546, "rewards/margins": 1.1488358974456787, "rewards/rejected": -0.18748432397842407, "step": 948 }, { "epoch": 0.6933333333333334, "grad_norm": 48.429893555982034, "learning_rate": 4.91862478033567e-07, "logits/chosen": -2.3370511531829834, "logits/rejected": -1.222306489944458, "logps/chosen": -576.379150390625, "logps/rejected": -492.84637451171875, "loss": 0.2939, "rewards/accuracies": 0.875, "rewards/chosen": 2.9999873638153076, "rewards/margins": 4.753854751586914, "rewards/rejected": -1.7538673877716064, "step": 949 }, { "epoch": 0.6940639269406392, "grad_norm": 47.936952510726265, "learning_rate": 4.918220642922568e-07, "logits/chosen": -3.2568535804748535, "logits/rejected": -1.9663759469985962, "logps/chosen": -576.90771484375, "logps/rejected": -410.5830078125, "loss": 0.2796, "rewards/accuracies": 0.875, "rewards/chosen": 3.036396026611328, "rewards/margins": 3.7288875579833984, "rewards/rejected": -0.6924912333488464, "step": 950 }, { "epoch": 0.6947945205479452, "grad_norm": 53.52086653163302, "learning_rate": 4.917815521136753e-07, "logits/chosen": -2.907644510269165, "logits/rejected": -1.993178367614746, "logps/chosen": -759.1592407226562, "logps/rejected": -473.64599609375, "loss": 0.3424, "rewards/accuracies": 0.875, "rewards/chosen": 2.5535802841186523, "rewards/margins": 2.3903398513793945, "rewards/rejected": 0.16324035823345184, "step": 951 }, { "epoch": 0.6955251141552511, "grad_norm": 64.302614036095, "learning_rate": 4.917409415143137e-07, "logits/chosen": -3.09364652633667, "logits/rejected": -2.480926752090454, "logps/chosen": -743.2952880859375, "logps/rejected": -540.82470703125, "loss": 0.4133, "rewards/accuracies": 0.75, "rewards/chosen": 3.9319088459014893, "rewards/margins": 3.6436691284179688, "rewards/rejected": 0.28823983669281006, "step": 952 }, { "epoch": 0.6962557077625571, "grad_norm": 64.40855466521897, "learning_rate": 4.917002325107029e-07, "logits/chosen": -2.8216640949249268, "logits/rejected": -2.2322354316711426, "logps/chosen": -691.8482666015625, "logps/rejected": -439.7445068359375, "loss": 0.4109, "rewards/accuracies": 0.75, "rewards/chosen": 3.0367860794067383, "rewards/margins": 2.569281578063965, "rewards/rejected": 0.46750450134277344, "step": 953 }, { "epoch": 0.6969863013698631, "grad_norm": 69.382921141256, "learning_rate": 4.916594251194144e-07, "logits/chosen": -2.78701114654541, "logits/rejected": -2.019909620285034, "logps/chosen": -1060.714111328125, "logps/rejected": -703.0965576171875, "loss": 0.4203, "rewards/accuracies": 0.75, "rewards/chosen": 2.067343235015869, "rewards/margins": 1.4570271968841553, "rewards/rejected": 0.6103160977363586, "step": 954 }, { "epoch": 0.6977168949771689, "grad_norm": 59.440724314490396, "learning_rate": 4.916185193570594e-07, "logits/chosen": -2.940387487411499, "logits/rejected": -2.003636598587036, "logps/chosen": -853.9863891601562, "logps/rejected": -594.6240844726562, "loss": 0.3546, "rewards/accuracies": 0.875, "rewards/chosen": 2.1374266147613525, "rewards/margins": 1.3393099308013916, "rewards/rejected": 0.7981165647506714, "step": 955 }, { "epoch": 0.6984474885844749, "grad_norm": 73.76970737750354, "learning_rate": 4.91577515240289e-07, "logits/chosen": -2.8705856800079346, "logits/rejected": -2.3487017154693604, "logps/chosen": -989.6878051757812, "logps/rejected": -798.801513671875, "loss": 0.4035, "rewards/accuracies": 1.0, "rewards/chosen": 3.699967622756958, "rewards/margins": 2.0982825756073, "rewards/rejected": 1.6016850471496582, "step": 956 }, { "epoch": 0.6991780821917808, "grad_norm": 58.65285768693415, "learning_rate": 4.915364127857947e-07, "logits/chosen": -2.788540840148926, "logits/rejected": -2.3722763061523438, "logps/chosen": -783.4063720703125, "logps/rejected": -616.735595703125, "loss": 0.2949, "rewards/accuracies": 0.875, "rewards/chosen": 2.562775135040283, "rewards/margins": 2.2664387226104736, "rewards/rejected": 0.29633644223213196, "step": 957 }, { "epoch": 0.6999086757990868, "grad_norm": 79.40522615927902, "learning_rate": 4.914952120103078e-07, "logits/chosen": -2.290286064147949, "logits/rejected": -2.022536516189575, "logps/chosen": -810.7512817382812, "logps/rejected": -582.5762939453125, "loss": 0.5584, "rewards/accuracies": 0.5, "rewards/chosen": 1.6610578298568726, "rewards/margins": 0.7936946153640747, "rewards/rejected": 0.8673630952835083, "step": 958 }, { "epoch": 0.7006392694063927, "grad_norm": 61.03538212458205, "learning_rate": 4.914539129305998e-07, "logits/chosen": -2.8866682052612305, "logits/rejected": -2.4931819438934326, "logps/chosen": -678.226806640625, "logps/rejected": -523.2513427734375, "loss": 0.3367, "rewards/accuracies": 0.75, "rewards/chosen": 1.4161068201065063, "rewards/margins": 1.7369226217269897, "rewards/rejected": -0.32081595063209534, "step": 959 }, { "epoch": 0.7013698630136986, "grad_norm": 55.410374211987595, "learning_rate": 4.91412515563482e-07, "logits/chosen": -2.7195091247558594, "logits/rejected": -2.3546814918518066, "logps/chosen": -925.8353881835938, "logps/rejected": -864.90087890625, "loss": 0.3434, "rewards/accuracies": 0.875, "rewards/chosen": 2.7494187355041504, "rewards/margins": 2.339773654937744, "rewards/rejected": 0.40964508056640625, "step": 960 }, { "epoch": 0.7021004566210046, "grad_norm": 53.50052934901672, "learning_rate": 4.913710199258058e-07, "logits/chosen": -2.6955859661102295, "logits/rejected": -2.3024818897247314, "logps/chosen": -342.1742248535156, "logps/rejected": -349.60064697265625, "loss": 0.4068, "rewards/accuracies": 0.375, "rewards/chosen": 0.415531724691391, "rewards/margins": -0.11677440255880356, "rewards/rejected": 0.5323061347007751, "step": 961 }, { "epoch": 0.7028310502283105, "grad_norm": 71.72128263502054, "learning_rate": 4.913294260344628e-07, "logits/chosen": -2.369981050491333, "logits/rejected": -2.160801887512207, "logps/chosen": -530.1112060546875, "logps/rejected": -648.7601928710938, "loss": 0.556, "rewards/accuracies": 0.75, "rewards/chosen": 2.076688528060913, "rewards/margins": 1.9140896797180176, "rewards/rejected": 0.16259869933128357, "step": 962 }, { "epoch": 0.7035616438356165, "grad_norm": 65.7847571808527, "learning_rate": 4.912877339063843e-07, "logits/chosen": -2.71107816696167, "logits/rejected": -1.993410348892212, "logps/chosen": -689.9249267578125, "logps/rejected": -484.6548156738281, "loss": 0.36, "rewards/accuracies": 1.0, "rewards/chosen": 2.8500189781188965, "rewards/margins": 3.749232053756714, "rewards/rejected": -0.8992128372192383, "step": 963 }, { "epoch": 0.7042922374429224, "grad_norm": 71.56769904695135, "learning_rate": 4.912459435585415e-07, "logits/chosen": -2.5269789695739746, "logits/rejected": -2.453157424926758, "logps/chosen": -550.2933959960938, "logps/rejected": -541.9686279296875, "loss": 0.44, "rewards/accuracies": 0.875, "rewards/chosen": 2.0209531784057617, "rewards/margins": 2.0786285400390625, "rewards/rejected": -0.05767540633678436, "step": 964 }, { "epoch": 0.7050228310502283, "grad_norm": 50.7619698976569, "learning_rate": 4.912040550079461e-07, "logits/chosen": -2.636837959289551, "logits/rejected": -2.7614474296569824, "logps/chosen": -396.5837097167969, "logps/rejected": -390.646728515625, "loss": 0.3748, "rewards/accuracies": 0.75, "rewards/chosen": 1.0257657766342163, "rewards/margins": 1.1952226161956787, "rewards/rejected": -0.1694568693637848, "step": 965 }, { "epoch": 0.7057534246575342, "grad_norm": 69.54346451702885, "learning_rate": 4.911620682716492e-07, "logits/chosen": -2.6936566829681396, "logits/rejected": -2.4781312942504883, "logps/chosen": -549.0975952148438, "logps/rejected": -487.6446533203125, "loss": 0.4155, "rewards/accuracies": 0.875, "rewards/chosen": 3.1484804153442383, "rewards/margins": 2.7742183208465576, "rewards/rejected": 0.37426233291625977, "step": 966 }, { "epoch": 0.7064840182648402, "grad_norm": 55.358839169718415, "learning_rate": 4.911199833667423e-07, "logits/chosen": -2.8650381565093994, "logits/rejected": -2.5814013481140137, "logps/chosen": -809.7106323242188, "logps/rejected": -694.2072143554688, "loss": 0.3721, "rewards/accuracies": 0.625, "rewards/chosen": 2.5483558177948, "rewards/margins": 1.4170050621032715, "rewards/rejected": 1.1313509941101074, "step": 967 }, { "epoch": 0.7072146118721461, "grad_norm": 43.33676352985058, "learning_rate": 4.910778003103566e-07, "logits/chosen": -2.8814167976379395, "logits/rejected": -1.8740543127059937, "logps/chosen": -562.4468994140625, "logps/rejected": -400.8707275390625, "loss": 0.1792, "rewards/accuracies": 0.875, "rewards/chosen": 3.6361758708953857, "rewards/margins": 4.373167037963867, "rewards/rejected": -0.736990749835968, "step": 968 }, { "epoch": 0.707945205479452, "grad_norm": 46.62860502567737, "learning_rate": 4.910355191196633e-07, "logits/chosen": -2.536729097366333, "logits/rejected": -2.555638313293457, "logps/chosen": -821.5042114257812, "logps/rejected": -801.0037841796875, "loss": 0.2517, "rewards/accuracies": 0.75, "rewards/chosen": 2.328831195831299, "rewards/margins": 2.525716781616211, "rewards/rejected": -0.19688549637794495, "step": 969 }, { "epoch": 0.708675799086758, "grad_norm": 68.59498148097279, "learning_rate": 4.909931398118738e-07, "logits/chosen": -2.8107213973999023, "logits/rejected": -2.318437337875366, "logps/chosen": -507.6168212890625, "logps/rejected": -386.79583740234375, "loss": 0.4089, "rewards/accuracies": 0.75, "rewards/chosen": 1.4947788715362549, "rewards/margins": 1.4901082515716553, "rewards/rejected": 0.004670426249504089, "step": 970 }, { "epoch": 0.7094063926940639, "grad_norm": 65.43864537560164, "learning_rate": 4.909506624042391e-07, "logits/chosen": -3.436028480529785, "logits/rejected": -2.0964818000793457, "logps/chosen": -832.6343383789062, "logps/rejected": -567.1517333984375, "loss": 0.4153, "rewards/accuracies": 0.875, "rewards/chosen": 2.994227886199951, "rewards/margins": 3.0223708152770996, "rewards/rejected": -0.028142541646957397, "step": 971 }, { "epoch": 0.7101369863013699, "grad_norm": 72.13024989017264, "learning_rate": 4.909080869140501e-07, "logits/chosen": -2.537276029586792, "logits/rejected": -1.852884292602539, "logps/chosen": -562.2939453125, "logps/rejected": -648.9173583984375, "loss": 0.4067, "rewards/accuracies": 0.75, "rewards/chosen": 2.7297842502593994, "rewards/margins": 2.8388893604278564, "rewards/rejected": -0.10910496115684509, "step": 972 }, { "epoch": 0.7108675799086758, "grad_norm": 88.8711568481656, "learning_rate": 4.90865413358638e-07, "logits/chosen": -3.0831708908081055, "logits/rejected": -2.184720277786255, "logps/chosen": -708.7874755859375, "logps/rejected": -633.1735229492188, "loss": 0.5908, "rewards/accuracies": 0.625, "rewards/chosen": 1.361393690109253, "rewards/margins": 0.31222233176231384, "rewards/rejected": 1.0491712093353271, "step": 973 }, { "epoch": 0.7115981735159818, "grad_norm": 49.25147225749957, "learning_rate": 4.908226417553737e-07, "logits/chosen": -2.8577003479003906, "logits/rejected": -2.5116357803344727, "logps/chosen": -529.8011474609375, "logps/rejected": -546.9913330078125, "loss": 0.2929, "rewards/accuracies": 0.75, "rewards/chosen": 2.868826150894165, "rewards/margins": 3.5409412384033203, "rewards/rejected": -0.6721153855323792, "step": 974 }, { "epoch": 0.7123287671232876, "grad_norm": 72.83024864153286, "learning_rate": 4.907797721216679e-07, "logits/chosen": -2.2171030044555664, "logits/rejected": -2.260246753692627, "logps/chosen": -525.20947265625, "logps/rejected": -523.0082397460938, "loss": 0.4642, "rewards/accuracies": 0.5, "rewards/chosen": 1.9395893812179565, "rewards/margins": 1.967769980430603, "rewards/rejected": -0.02818077802658081, "step": 975 }, { "epoch": 0.7130593607305936, "grad_norm": 57.24622795638482, "learning_rate": 4.907368044749715e-07, "logits/chosen": -2.4422757625579834, "logits/rejected": -1.4635529518127441, "logps/chosen": -554.0838623046875, "logps/rejected": -352.640625, "loss": 0.3242, "rewards/accuracies": 0.875, "rewards/chosen": 2.340353012084961, "rewards/margins": 2.9004416465759277, "rewards/rejected": -0.560088574886322, "step": 976 }, { "epoch": 0.7137899543378995, "grad_norm": 59.92506917822978, "learning_rate": 4.906937388327749e-07, "logits/chosen": -2.9813828468322754, "logits/rejected": -2.4126765727996826, "logps/chosen": -820.366455078125, "logps/rejected": -612.36279296875, "loss": 0.3766, "rewards/accuracies": 0.875, "rewards/chosen": 2.1214377880096436, "rewards/margins": 1.782411813735962, "rewards/rejected": 0.3390260636806488, "step": 977 }, { "epoch": 0.7145205479452055, "grad_norm": 56.44506109373078, "learning_rate": 4.906505752126087e-07, "logits/chosen": -2.693439483642578, "logits/rejected": -1.757328987121582, "logps/chosen": -505.17547607421875, "logps/rejected": -362.5099792480469, "loss": 0.4237, "rewards/accuracies": 0.75, "rewards/chosen": 1.2320690155029297, "rewards/margins": 0.9645670652389526, "rewards/rejected": 0.2675018906593323, "step": 978 }, { "epoch": 0.7152511415525115, "grad_norm": 62.47463743204617, "learning_rate": 4.906073136320435e-07, "logits/chosen": -3.022954225540161, "logits/rejected": -1.9646844863891602, "logps/chosen": -654.2000122070312, "logps/rejected": -463.4246826171875, "loss": 0.398, "rewards/accuracies": 0.875, "rewards/chosen": 2.90533709526062, "rewards/margins": 2.3364315032958984, "rewards/rejected": 0.5689057111740112, "step": 979 }, { "epoch": 0.7159817351598173, "grad_norm": 53.87528775662711, "learning_rate": 4.905639541086892e-07, "logits/chosen": -2.8297247886657715, "logits/rejected": -2.74104905128479, "logps/chosen": -513.4091796875, "logps/rejected": -485.7617492675781, "loss": 0.3511, "rewards/accuracies": 0.875, "rewards/chosen": 0.9531850218772888, "rewards/margins": 0.8933401107788086, "rewards/rejected": 0.05984486639499664, "step": 980 }, { "epoch": 0.7167123287671233, "grad_norm": 55.997395046136965, "learning_rate": 4.90520496660196e-07, "logits/chosen": -3.0926783084869385, "logits/rejected": -1.9606338739395142, "logps/chosen": -405.3331298828125, "logps/rejected": -315.6785888671875, "loss": 0.3356, "rewards/accuracies": 0.875, "rewards/chosen": 1.3890397548675537, "rewards/margins": 2.406501293182373, "rewards/rejected": -1.0174615383148193, "step": 981 }, { "epoch": 0.7174429223744292, "grad_norm": 59.14552076829762, "learning_rate": 4.904769413042542e-07, "logits/chosen": -2.8790388107299805, "logits/rejected": -2.5304698944091797, "logps/chosen": -840.96826171875, "logps/rejected": -716.6978149414062, "loss": 0.3811, "rewards/accuracies": 0.875, "rewards/chosen": 3.346827983856201, "rewards/margins": 2.015582799911499, "rewards/rejected": 1.3312451839447021, "step": 982 }, { "epoch": 0.7181735159817352, "grad_norm": 51.73672034852881, "learning_rate": 4.904332880585934e-07, "logits/chosen": -2.672877073287964, "logits/rejected": -1.9715766906738281, "logps/chosen": -442.8163146972656, "logps/rejected": -309.3948059082031, "loss": 0.3075, "rewards/accuracies": 0.75, "rewards/chosen": 1.6270629167556763, "rewards/margins": 2.0373544692993164, "rewards/rejected": -0.41029125452041626, "step": 983 }, { "epoch": 0.718904109589041, "grad_norm": 57.13303844564238, "learning_rate": 4.903895369409835e-07, "logits/chosen": -2.6724138259887695, "logits/rejected": -2.566476345062256, "logps/chosen": -760.90234375, "logps/rejected": -658.75048828125, "loss": 0.3908, "rewards/accuracies": 0.875, "rewards/chosen": 2.3533425331115723, "rewards/margins": 1.7904986143112183, "rewards/rejected": 0.5628438591957092, "step": 984 }, { "epoch": 0.719634703196347, "grad_norm": 51.08682101334942, "learning_rate": 4.903456879692338e-07, "logits/chosen": -2.5126070976257324, "logits/rejected": -1.8093535900115967, "logps/chosen": -763.877685546875, "logps/rejected": -484.3240661621094, "loss": 0.287, "rewards/accuracies": 0.875, "rewards/chosen": 2.8761532306671143, "rewards/margins": 3.002333641052246, "rewards/rejected": -0.12618020176887512, "step": 985 }, { "epoch": 0.720365296803653, "grad_norm": 51.084323958499, "learning_rate": 4.903017411611938e-07, "logits/chosen": -2.6072306632995605, "logits/rejected": -2.186994791030884, "logps/chosen": -550.399169921875, "logps/rejected": -740.7538452148438, "loss": 0.2958, "rewards/accuracies": 0.875, "rewards/chosen": 1.716325044631958, "rewards/margins": 1.9991869926452637, "rewards/rejected": -0.2828619182109833, "step": 986 }, { "epoch": 0.7210958904109589, "grad_norm": 48.716702871519075, "learning_rate": 4.902576965347528e-07, "logits/chosen": -2.7016866207122803, "logits/rejected": -1.760689616203308, "logps/chosen": -730.851806640625, "logps/rejected": -340.2505798339844, "loss": 0.3208, "rewards/accuracies": 0.75, "rewards/chosen": 3.215294361114502, "rewards/margins": 3.4692797660827637, "rewards/rejected": -0.2539854347705841, "step": 987 }, { "epoch": 0.7218264840182649, "grad_norm": 60.23114858414726, "learning_rate": 4.902135541078396e-07, "logits/chosen": -2.7363808155059814, "logits/rejected": -2.211740732192993, "logps/chosen": -342.2799072265625, "logps/rejected": -249.15625, "loss": 0.3543, "rewards/accuracies": 1.0, "rewards/chosen": 2.0083518028259277, "rewards/margins": 2.9733338356018066, "rewards/rejected": -0.9649820327758789, "step": 988 }, { "epoch": 0.7225570776255708, "grad_norm": 35.63321074514353, "learning_rate": 4.901693138984232e-07, "logits/chosen": -2.558467388153076, "logits/rejected": -1.842807412147522, "logps/chosen": -625.4201049804688, "logps/rejected": -353.2705993652344, "loss": 0.2547, "rewards/accuracies": 1.0, "rewards/chosen": 2.1326656341552734, "rewards/margins": 3.3032822608947754, "rewards/rejected": -1.1706167459487915, "step": 989 }, { "epoch": 0.7232876712328767, "grad_norm": 62.586953185023916, "learning_rate": 4.901249759245123e-07, "logits/chosen": -2.9319212436676025, "logits/rejected": -2.560331344604492, "logps/chosen": -919.5262451171875, "logps/rejected": -663.6865234375, "loss": 0.3838, "rewards/accuracies": 0.75, "rewards/chosen": 2.0251381397247314, "rewards/margins": 1.3799896240234375, "rewards/rejected": 0.645148515701294, "step": 990 }, { "epoch": 0.7240182648401826, "grad_norm": 49.628190329596194, "learning_rate": 4.900805402041551e-07, "logits/chosen": -2.2335057258605957, "logits/rejected": -2.3638055324554443, "logps/chosen": -468.7121276855469, "logps/rejected": -527.9559326171875, "loss": 0.2991, "rewards/accuracies": 1.0, "rewards/chosen": 1.292140007019043, "rewards/margins": 2.5314338207244873, "rewards/rejected": -1.2392935752868652, "step": 991 }, { "epoch": 0.7247488584474886, "grad_norm": 53.44924715544995, "learning_rate": 4.900360067554399e-07, "logits/chosen": -3.256319046020508, "logits/rejected": -1.784322738647461, "logps/chosen": -917.2132568359375, "logps/rejected": -438.30767822265625, "loss": 0.311, "rewards/accuracies": 1.0, "rewards/chosen": 3.8337435722351074, "rewards/margins": 3.9496638774871826, "rewards/rejected": -0.11592027544975281, "step": 992 }, { "epoch": 0.7254794520547945, "grad_norm": 52.85506790389167, "learning_rate": 4.899913755964948e-07, "logits/chosen": -2.9783647060394287, "logits/rejected": -2.4502756595611572, "logps/chosen": -703.1780395507812, "logps/rejected": -449.110107421875, "loss": 0.3792, "rewards/accuracies": 0.625, "rewards/chosen": 1.639078140258789, "rewards/margins": 1.5739631652832031, "rewards/rejected": 0.06511493027210236, "step": 993 }, { "epoch": 0.7262100456621005, "grad_norm": 59.804090971061505, "learning_rate": 4.899466467454875e-07, "logits/chosen": -3.0023844242095947, "logits/rejected": -2.2693958282470703, "logps/chosen": -796.8930053710938, "logps/rejected": -482.8681640625, "loss": 0.2866, "rewards/accuracies": 0.75, "rewards/chosen": 2.7509498596191406, "rewards/margins": 3.750664472579956, "rewards/rejected": -0.9997144937515259, "step": 994 }, { "epoch": 0.7269406392694064, "grad_norm": 79.05450181876108, "learning_rate": 4.899018202206257e-07, "logits/chosen": -2.680391311645508, "logits/rejected": -2.500962257385254, "logps/chosen": -868.9354248046875, "logps/rejected": -884.8834228515625, "loss": 0.5185, "rewards/accuracies": 0.875, "rewards/chosen": 3.4008710384368896, "rewards/margins": 1.6147372722625732, "rewards/rejected": 1.786134123802185, "step": 995 }, { "epoch": 0.7276712328767123, "grad_norm": 50.416018311085914, "learning_rate": 4.898568960401565e-07, "logits/chosen": -2.521122694015503, "logits/rejected": -2.3982088565826416, "logps/chosen": -514.406982421875, "logps/rejected": -455.01165771484375, "loss": 0.2768, "rewards/accuracies": 0.625, "rewards/chosen": 1.6617512702941895, "rewards/margins": 1.9872212409973145, "rewards/rejected": -0.325469970703125, "step": 996 }, { "epoch": 0.7284018264840183, "grad_norm": 66.11320266335802, "learning_rate": 4.89811874222367e-07, "logits/chosen": -3.2077136039733887, "logits/rejected": -2.3507115840911865, "logps/chosen": -815.8447875976562, "logps/rejected": -619.8359375, "loss": 0.4522, "rewards/accuracies": 0.75, "rewards/chosen": 2.2170684337615967, "rewards/margins": 1.252598524093628, "rewards/rejected": 0.9644699096679688, "step": 997 }, { "epoch": 0.7291324200913242, "grad_norm": 70.99397531657176, "learning_rate": 4.897667547855841e-07, "logits/chosen": -2.634389638900757, "logits/rejected": -1.7254374027252197, "logps/chosen": -865.1062622070312, "logps/rejected": -653.3444213867188, "loss": 0.4102, "rewards/accuracies": 0.75, "rewards/chosen": 2.031158924102783, "rewards/margins": 1.3502894639968872, "rewards/rejected": 0.6808693408966064, "step": 998 }, { "epoch": 0.7298630136986302, "grad_norm": 48.89073236245802, "learning_rate": 4.897215377481742e-07, "logits/chosen": -3.0096373558044434, "logits/rejected": -2.429983139038086, "logps/chosen": -1133.83544921875, "logps/rejected": -717.6209106445312, "loss": 0.2297, "rewards/accuracies": 0.875, "rewards/chosen": 3.634352684020996, "rewards/margins": 2.1690773963928223, "rewards/rejected": 1.4652756452560425, "step": 999 }, { "epoch": 0.730593607305936, "grad_norm": 70.07075455945404, "learning_rate": 4.896762231285437e-07, "logits/chosen": -2.741339683532715, "logits/rejected": -2.044464588165283, "logps/chosen": -549.0057373046875, "logps/rejected": -388.28167724609375, "loss": 0.4507, "rewards/accuracies": 0.875, "rewards/chosen": 2.3182520866394043, "rewards/margins": 3.057532548904419, "rewards/rejected": -0.7392804622650146, "step": 1000 }, { "epoch": 0.731324200913242, "grad_norm": 65.31398772229039, "learning_rate": 4.896308109451385e-07, "logits/chosen": -3.221139669418335, "logits/rejected": -1.7288824319839478, "logps/chosen": -453.74505615234375, "logps/rejected": -255.81362915039062, "loss": 0.4142, "rewards/accuracies": 1.0, "rewards/chosen": 1.8651039600372314, "rewards/margins": 2.7162961959838867, "rewards/rejected": -0.8511921167373657, "step": 1001 }, { "epoch": 0.7320547945205479, "grad_norm": 75.73279056265027, "learning_rate": 4.895853012164442e-07, "logits/chosen": -3.1803791522979736, "logits/rejected": -2.535827159881592, "logps/chosen": -944.6064453125, "logps/rejected": -774.57373046875, "loss": 0.4448, "rewards/accuracies": 0.875, "rewards/chosen": 2.970571994781494, "rewards/margins": 1.97745680809021, "rewards/rejected": 0.9931153059005737, "step": 1002 }, { "epoch": 0.7327853881278539, "grad_norm": 52.64296011813925, "learning_rate": 4.895396939609866e-07, "logits/chosen": -2.6552581787109375, "logits/rejected": -1.964118480682373, "logps/chosen": -722.587646484375, "logps/rejected": -608.833984375, "loss": 0.2703, "rewards/accuracies": 0.625, "rewards/chosen": 1.6682881116867065, "rewards/margins": 1.2902235984802246, "rewards/rejected": 0.37806448340415955, "step": 1003 }, { "epoch": 0.7335159817351599, "grad_norm": 50.82819224557495, "learning_rate": 4.894939891973304e-07, "logits/chosen": -2.293443202972412, "logits/rejected": -2.5493507385253906, "logps/chosen": -422.9368896484375, "logps/rejected": -452.75311279296875, "loss": 0.3589, "rewards/accuracies": 0.625, "rewards/chosen": 1.6505554914474487, "rewards/margins": 1.1761136054992676, "rewards/rejected": 0.47444185614585876, "step": 1004 }, { "epoch": 0.7342465753424657, "grad_norm": 69.92442907346474, "learning_rate": 4.894481869440806e-07, "logits/chosen": -2.4912736415863037, "logits/rejected": -2.4655466079711914, "logps/chosen": -630.7225341796875, "logps/rejected": -584.507080078125, "loss": 0.4635, "rewards/accuracies": 0.625, "rewards/chosen": 1.643568515777588, "rewards/margins": 1.2195897102355957, "rewards/rejected": 0.4239788055419922, "step": 1005 }, { "epoch": 0.7349771689497717, "grad_norm": 76.98022206933253, "learning_rate": 4.894022872198817e-07, "logits/chosen": -3.301537036895752, "logits/rejected": -2.04217529296875, "logps/chosen": -777.9219360351562, "logps/rejected": -430.8382263183594, "loss": 0.6144, "rewards/accuracies": 0.75, "rewards/chosen": 2.4953112602233887, "rewards/margins": 1.926556944847107, "rewards/rejected": 0.5687544345855713, "step": 1006 }, { "epoch": 0.7357077625570776, "grad_norm": 56.01692597861605, "learning_rate": 4.893562900434177e-07, "logits/chosen": -2.530107259750366, "logits/rejected": -2.0363919734954834, "logps/chosen": -468.0442199707031, "logps/rejected": -490.59210205078125, "loss": 0.4272, "rewards/accuracies": 0.75, "rewards/chosen": 3.218968152999878, "rewards/margins": 3.903165340423584, "rewards/rejected": -0.6841970682144165, "step": 1007 }, { "epoch": 0.7364383561643836, "grad_norm": 70.96725697460766, "learning_rate": 4.893101954334127e-07, "logits/chosen": -2.592374801635742, "logits/rejected": -2.240432024002075, "logps/chosen": -595.7696533203125, "logps/rejected": -469.7864685058594, "loss": 0.4334, "rewards/accuracies": 1.0, "rewards/chosen": 3.385957956314087, "rewards/margins": 3.3730087280273438, "rewards/rejected": 0.012949302792549133, "step": 1008 }, { "epoch": 0.7371689497716895, "grad_norm": 62.673994136448854, "learning_rate": 4.8926400340863e-07, "logits/chosen": -2.9319372177124023, "logits/rejected": -2.3692140579223633, "logps/chosen": -621.877197265625, "logps/rejected": -511.6533203125, "loss": 0.4143, "rewards/accuracies": 0.875, "rewards/chosen": 2.721696615219116, "rewards/margins": 2.5436553955078125, "rewards/rejected": 0.17804116010665894, "step": 1009 }, { "epoch": 0.7378995433789954, "grad_norm": 47.23275199824932, "learning_rate": 4.892177139878727e-07, "logits/chosen": -2.9674696922302246, "logits/rejected": -2.67047381401062, "logps/chosen": -673.0137939453125, "logps/rejected": -546.8467407226562, "loss": 0.289, "rewards/accuracies": 0.875, "rewards/chosen": 2.720538377761841, "rewards/margins": 2.057762622833252, "rewards/rejected": 0.6627755761146545, "step": 1010 }, { "epoch": 0.7386301369863014, "grad_norm": 52.91025300494106, "learning_rate": 4.891713271899837e-07, "logits/chosen": -3.0725960731506348, "logits/rejected": -2.3590891361236572, "logps/chosen": -728.3058471679688, "logps/rejected": -613.6593627929688, "loss": 0.4014, "rewards/accuracies": 1.0, "rewards/chosen": 1.838215947151184, "rewards/margins": 1.808241367340088, "rewards/rejected": 0.029974546283483505, "step": 1011 }, { "epoch": 0.7393607305936073, "grad_norm": 77.01105211875436, "learning_rate": 4.891248430338456e-07, "logits/chosen": -2.900505781173706, "logits/rejected": -1.9826782941818237, "logps/chosen": -817.14501953125, "logps/rejected": -653.2138061523438, "loss": 0.4699, "rewards/accuracies": 0.75, "rewards/chosen": 2.3838882446289062, "rewards/margins": 2.1129984855651855, "rewards/rejected": 0.2708897590637207, "step": 1012 }, { "epoch": 0.7400913242009133, "grad_norm": 63.798711858199965, "learning_rate": 4.890782615383802e-07, "logits/chosen": -2.6262047290802, "logits/rejected": -2.683370351791382, "logps/chosen": -901.358154296875, "logps/rejected": -892.6171875, "loss": 0.4184, "rewards/accuracies": 0.75, "rewards/chosen": 3.4823527336120605, "rewards/margins": 1.852966070175171, "rewards/rejected": 1.6293861865997314, "step": 1013 }, { "epoch": 0.7408219178082192, "grad_norm": 60.22337367109323, "learning_rate": 4.890315827225493e-07, "logits/chosen": -2.4467406272888184, "logits/rejected": -2.2618651390075684, "logps/chosen": -478.8119201660156, "logps/rejected": -634.8660278320312, "loss": 0.4225, "rewards/accuracies": 0.75, "rewards/chosen": 1.9741028547286987, "rewards/margins": 1.8224296569824219, "rewards/rejected": 0.1516733467578888, "step": 1014 }, { "epoch": 0.7415525114155251, "grad_norm": 75.47038688701019, "learning_rate": 4.889848066053543e-07, "logits/chosen": -3.16664981842041, "logits/rejected": -2.760441780090332, "logps/chosen": -367.8298645019531, "logps/rejected": -320.919189453125, "loss": 0.5331, "rewards/accuracies": 0.625, "rewards/chosen": 1.5042994022369385, "rewards/margins": 1.1327756643295288, "rewards/rejected": 0.3715237081050873, "step": 1015 }, { "epoch": 0.742283105022831, "grad_norm": 63.179760437233774, "learning_rate": 4.889379332058358e-07, "logits/chosen": -2.7773594856262207, "logits/rejected": -1.983656644821167, "logps/chosen": -557.342529296875, "logps/rejected": -424.2332458496094, "loss": 0.3031, "rewards/accuracies": 0.875, "rewards/chosen": 2.803175687789917, "rewards/margins": 3.1828064918518066, "rewards/rejected": -0.3796304166316986, "step": 1016 }, { "epoch": 0.743013698630137, "grad_norm": 67.14574647385528, "learning_rate": 4.888909625430747e-07, "logits/chosen": -2.7017931938171387, "logits/rejected": -2.4688892364501953, "logps/chosen": -490.0467834472656, "logps/rejected": -423.3097839355469, "loss": 0.4886, "rewards/accuracies": 0.75, "rewards/chosen": 0.9439789056777954, "rewards/margins": 0.8238439559936523, "rewards/rejected": 0.12013493478298187, "step": 1017 }, { "epoch": 0.7437442922374429, "grad_norm": 58.1824627753877, "learning_rate": 4.88843894636191e-07, "logits/chosen": -2.270202875137329, "logits/rejected": -1.3608958721160889, "logps/chosen": -485.2283630371094, "logps/rejected": -261.491455078125, "loss": 0.2618, "rewards/accuracies": 0.75, "rewards/chosen": 1.4850820302963257, "rewards/margins": 1.6771847009658813, "rewards/rejected": -0.19210273027420044, "step": 1018 }, { "epoch": 0.7444748858447489, "grad_norm": 41.232218290983475, "learning_rate": 4.887967295043441e-07, "logits/chosen": -3.3965389728546143, "logits/rejected": -2.425065517425537, "logps/chosen": -623.7088623046875, "logps/rejected": -399.7816162109375, "loss": 0.2415, "rewards/accuracies": 0.625, "rewards/chosen": 2.7258450984954834, "rewards/margins": 2.919898509979248, "rewards/rejected": -0.19405320286750793, "step": 1019 }, { "epoch": 0.7452054794520548, "grad_norm": 69.70676233224256, "learning_rate": 4.887494671667337e-07, "logits/chosen": -2.7526423931121826, "logits/rejected": -2.9418702125549316, "logps/chosen": -614.670166015625, "logps/rejected": -723.5755615234375, "loss": 0.5847, "rewards/accuracies": 0.625, "rewards/chosen": 1.220487356185913, "rewards/margins": 0.3521454930305481, "rewards/rejected": 0.8683418035507202, "step": 1020 }, { "epoch": 0.7459360730593607, "grad_norm": 59.426640783117, "learning_rate": 4.887021076425985e-07, "logits/chosen": -2.6351470947265625, "logits/rejected": -1.627040147781372, "logps/chosen": -764.9781494140625, "logps/rejected": -539.077880859375, "loss": 0.3929, "rewards/accuracies": 1.0, "rewards/chosen": 4.395371913909912, "rewards/margins": 4.329984664916992, "rewards/rejected": 0.06538712978363037, "step": 1021 }, { "epoch": 0.7466666666666667, "grad_norm": 46.69631306962976, "learning_rate": 4.886546509512166e-07, "logits/chosen": -2.9236104488372803, "logits/rejected": -2.1768112182617188, "logps/chosen": -714.81298828125, "logps/rejected": -412.4330139160156, "loss": 0.2809, "rewards/accuracies": 0.75, "rewards/chosen": 2.3996634483337402, "rewards/margins": 1.631277322769165, "rewards/rejected": 0.7683857679367065, "step": 1022 }, { "epoch": 0.7473972602739726, "grad_norm": 51.35156903715917, "learning_rate": 4.886070971119064e-07, "logits/chosen": -2.853837490081787, "logits/rejected": -2.105550765991211, "logps/chosen": -531.9666748046875, "logps/rejected": -416.45660400390625, "loss": 0.306, "rewards/accuracies": 1.0, "rewards/chosen": 2.750455856323242, "rewards/margins": 3.2101521492004395, "rewards/rejected": -0.45969659090042114, "step": 1023 }, { "epoch": 0.7481278538812786, "grad_norm": 48.9711256004814, "learning_rate": 4.885594461440251e-07, "logits/chosen": -2.8312020301818848, "logits/rejected": -2.083604335784912, "logps/chosen": -602.4619750976562, "logps/rejected": -414.68170166015625, "loss": 0.2649, "rewards/accuracies": 1.0, "rewards/chosen": 2.2970590591430664, "rewards/margins": 2.1907825469970703, "rewards/rejected": 0.10627643764019012, "step": 1024 }, { "epoch": 0.7488584474885844, "grad_norm": 63.79861236306888, "learning_rate": 4.885116980669698e-07, "logits/chosen": -3.307528018951416, "logits/rejected": -2.276362419128418, "logps/chosen": -690.109130859375, "logps/rejected": -522.8570556640625, "loss": 0.3637, "rewards/accuracies": 0.75, "rewards/chosen": 2.6587908267974854, "rewards/margins": 1.5957592725753784, "rewards/rejected": 1.063031554222107, "step": 1025 }, { "epoch": 0.7495890410958904, "grad_norm": 60.07551400373117, "learning_rate": 4.884638529001771e-07, "logits/chosen": -2.9391939640045166, "logits/rejected": -1.7927929162979126, "logps/chosen": -630.3684692382812, "logps/rejected": -393.4323425292969, "loss": 0.3673, "rewards/accuracies": 1.0, "rewards/chosen": 2.737462043762207, "rewards/margins": 2.8505024909973145, "rewards/rejected": -0.11304055154323578, "step": 1026 }, { "epoch": 0.7503196347031963, "grad_norm": 43.724162292307696, "learning_rate": 4.884159106631231e-07, "logits/chosen": -2.6350553035736084, "logits/rejected": -2.192521572113037, "logps/chosen": -571.0623779296875, "logps/rejected": -506.84063720703125, "loss": 0.2435, "rewards/accuracies": 0.75, "rewards/chosen": 2.942931890487671, "rewards/margins": 2.4542036056518555, "rewards/rejected": 0.4887281656265259, "step": 1027 }, { "epoch": 0.7510502283105023, "grad_norm": 59.33061869265418, "learning_rate": 4.883678713753235e-07, "logits/chosen": -2.7921533584594727, "logits/rejected": -2.76945161819458, "logps/chosen": -534.8853149414062, "logps/rejected": -456.96697998046875, "loss": 0.3042, "rewards/accuracies": 0.875, "rewards/chosen": 2.116152048110962, "rewards/margins": 2.1798593997955322, "rewards/rejected": -0.06370730698108673, "step": 1028 }, { "epoch": 0.7517808219178083, "grad_norm": 70.47276873241536, "learning_rate": 4.88319735056333e-07, "logits/chosen": -2.9132440090179443, "logits/rejected": -2.3954782485961914, "logps/chosen": -807.7698974609375, "logps/rejected": -690.9949340820312, "loss": 0.3125, "rewards/accuracies": 0.625, "rewards/chosen": 3.302701473236084, "rewards/margins": 2.6129581928253174, "rewards/rejected": 0.6897432804107666, "step": 1029 }, { "epoch": 0.7525114155251141, "grad_norm": 47.75753580915268, "learning_rate": 4.882715017257467e-07, "logits/chosen": -1.913335919380188, "logits/rejected": -2.3057594299316406, "logps/chosen": -508.4936218261719, "logps/rejected": -441.93511962890625, "loss": 0.3336, "rewards/accuracies": 0.625, "rewards/chosen": 1.7455544471740723, "rewards/margins": 1.293984055519104, "rewards/rejected": 0.45157039165496826, "step": 1030 }, { "epoch": 0.7532420091324201, "grad_norm": 64.97971062546178, "learning_rate": 4.882231714031985e-07, "logits/chosen": -2.825758457183838, "logits/rejected": -2.3353970050811768, "logps/chosen": -541.6205444335938, "logps/rejected": -441.44781494140625, "loss": 0.4116, "rewards/accuracies": 0.75, "rewards/chosen": 2.4057700634002686, "rewards/margins": 2.2939517498016357, "rewards/rejected": 0.11181829869747162, "step": 1031 }, { "epoch": 0.753972602739726, "grad_norm": 48.606140401863065, "learning_rate": 4.881747441083619e-07, "logits/chosen": -2.537773847579956, "logits/rejected": -2.2331371307373047, "logps/chosen": -569.13525390625, "logps/rejected": -455.8241271972656, "loss": 0.343, "rewards/accuracies": 0.75, "rewards/chosen": 1.7774215936660767, "rewards/margins": 1.8177876472473145, "rewards/rejected": -0.04036596417427063, "step": 1032 }, { "epoch": 0.754703196347032, "grad_norm": 40.722194429335765, "learning_rate": 4.881262198609501e-07, "logits/chosen": -2.7554354667663574, "logits/rejected": -1.9768683910369873, "logps/chosen": -722.9266967773438, "logps/rejected": -466.43438720703125, "loss": 0.215, "rewards/accuracies": 1.0, "rewards/chosen": 2.8079090118408203, "rewards/margins": 2.223174571990967, "rewards/rejected": 0.5847344994544983, "step": 1033 }, { "epoch": 0.7554337899543379, "grad_norm": 55.28405430281185, "learning_rate": 4.880775986807154e-07, "logits/chosen": -2.812486171722412, "logits/rejected": -1.9954159259796143, "logps/chosen": -1010.517822265625, "logps/rejected": -689.31298828125, "loss": 0.3954, "rewards/accuracies": 1.0, "rewards/chosen": 3.537041664123535, "rewards/margins": 2.409641981124878, "rewards/rejected": 1.1273994445800781, "step": 1034 }, { "epoch": 0.7561643835616438, "grad_norm": 48.312952273917965, "learning_rate": 4.8802888058745e-07, "logits/chosen": -2.86727237701416, "logits/rejected": -2.46706485748291, "logps/chosen": -571.802490234375, "logps/rejected": -445.8328857421875, "loss": 0.3247, "rewards/accuracies": 0.875, "rewards/chosen": 2.119497299194336, "rewards/margins": 1.860414743423462, "rewards/rejected": 0.259082555770874, "step": 1035 }, { "epoch": 0.7568949771689498, "grad_norm": 67.15090573197803, "learning_rate": 4.879800656009853e-07, "logits/chosen": -2.591078281402588, "logits/rejected": -1.6172527074813843, "logps/chosen": -729.3237915039062, "logps/rejected": -468.581298828125, "loss": 0.4434, "rewards/accuracies": 0.75, "rewards/chosen": 2.8913116455078125, "rewards/margins": 2.9125287532806396, "rewards/rejected": -0.021216988563537598, "step": 1036 }, { "epoch": 0.7576255707762557, "grad_norm": 75.51034373990592, "learning_rate": 4.87931153741192e-07, "logits/chosen": -2.4488048553466797, "logits/rejected": -1.491403579711914, "logps/chosen": -415.41644287109375, "logps/rejected": -508.7161560058594, "loss": 0.4581, "rewards/accuracies": 1.0, "rewards/chosen": 1.6455516815185547, "rewards/margins": 2.64803409576416, "rewards/rejected": -1.0024826526641846, "step": 1037 }, { "epoch": 0.7583561643835617, "grad_norm": 59.12224367584188, "learning_rate": 4.878821450279805e-07, "logits/chosen": -2.4795613288879395, "logits/rejected": -2.4366238117218018, "logps/chosen": -673.8134765625, "logps/rejected": -706.4244384765625, "loss": 0.2968, "rewards/accuracies": 0.75, "rewards/chosen": 3.385793924331665, "rewards/margins": 3.037043571472168, "rewards/rejected": 0.34875062108039856, "step": 1038 }, { "epoch": 0.7590867579908676, "grad_norm": 36.70035609766358, "learning_rate": 4.878330394813005e-07, "logits/chosen": -3.1633615493774414, "logits/rejected": -1.9597371816635132, "logps/chosen": -499.9194641113281, "logps/rejected": -276.1891784667969, "loss": 0.2711, "rewards/accuracies": 0.75, "rewards/chosen": 1.3654330968856812, "rewards/margins": 0.9206608533859253, "rewards/rejected": 0.44477224349975586, "step": 1039 }, { "epoch": 0.7598173515981735, "grad_norm": 59.34451314346066, "learning_rate": 4.877838371211412e-07, "logits/chosen": -2.4803056716918945, "logits/rejected": -2.6754231452941895, "logps/chosen": -576.42529296875, "logps/rejected": -681.832763671875, "loss": 0.3808, "rewards/accuracies": 0.875, "rewards/chosen": 2.9005014896392822, "rewards/margins": 2.1080565452575684, "rewards/rejected": 0.792445182800293, "step": 1040 }, { "epoch": 0.7605479452054794, "grad_norm": 57.818920673864405, "learning_rate": 4.877345379675311e-07, "logits/chosen": -2.833916187286377, "logits/rejected": -2.3704516887664795, "logps/chosen": -782.0903930664062, "logps/rejected": -670.7415161132812, "loss": 0.326, "rewards/accuracies": 0.75, "rewards/chosen": 2.5954794883728027, "rewards/margins": 1.3322374820709229, "rewards/rejected": 1.2632417678833008, "step": 1041 }, { "epoch": 0.7612785388127854, "grad_norm": 54.48829343165144, "learning_rate": 4.876851420405383e-07, "logits/chosen": -2.6913700103759766, "logits/rejected": -2.363081693649292, "logps/chosen": -634.58251953125, "logps/rejected": -599.8182373046875, "loss": 0.3288, "rewards/accuracies": 0.75, "rewards/chosen": 2.3891682624816895, "rewards/margins": 1.5053404569625854, "rewards/rejected": 0.8838277459144592, "step": 1042 }, { "epoch": 0.7620091324200913, "grad_norm": 47.25929451408209, "learning_rate": 4.876356493602699e-07, "logits/chosen": -3.282895803451538, "logits/rejected": -1.855670690536499, "logps/chosen": -608.3270263671875, "logps/rejected": -411.2239990234375, "loss": 0.2973, "rewards/accuracies": 0.875, "rewards/chosen": 2.2036752700805664, "rewards/margins": 2.418733835220337, "rewards/rejected": -0.215058833360672, "step": 1043 }, { "epoch": 0.7627397260273973, "grad_norm": 66.96951556958584, "learning_rate": 4.875860599468729e-07, "logits/chosen": -2.9564051628112793, "logits/rejected": -2.9042775630950928, "logps/chosen": -847.4281616210938, "logps/rejected": -805.11572265625, "loss": 0.4221, "rewards/accuracies": 0.5, "rewards/chosen": 2.868461847305298, "rewards/margins": 0.18111415207386017, "rewards/rejected": 2.687347412109375, "step": 1044 }, { "epoch": 0.7634703196347032, "grad_norm": 49.91711147069835, "learning_rate": 4.875363738205331e-07, "logits/chosen": -3.033604145050049, "logits/rejected": -2.4378151893615723, "logps/chosen": -630.9593505859375, "logps/rejected": -401.2276306152344, "loss": 0.3055, "rewards/accuracies": 0.875, "rewards/chosen": 2.365349769592285, "rewards/margins": 2.876703977584839, "rewards/rejected": -0.5113542079925537, "step": 1045 }, { "epoch": 0.7642009132420091, "grad_norm": 44.53963325695512, "learning_rate": 4.874865910014762e-07, "logits/chosen": -2.376224994659424, "logits/rejected": -1.9223164319992065, "logps/chosen": -439.276611328125, "logps/rejected": -322.2965087890625, "loss": 0.2983, "rewards/accuracies": 0.875, "rewards/chosen": 2.409461498260498, "rewards/margins": 2.742753744125366, "rewards/rejected": -0.3332920968532562, "step": 1046 }, { "epoch": 0.7649315068493151, "grad_norm": 63.19807419558961, "learning_rate": 4.87436711509967e-07, "logits/chosen": -3.1076159477233887, "logits/rejected": -3.035646915435791, "logps/chosen": -360.258056640625, "logps/rejected": -477.265380859375, "loss": 0.4341, "rewards/accuracies": 1.0, "rewards/chosen": 1.1244640350341797, "rewards/margins": 1.8609373569488525, "rewards/rejected": -0.7364733219146729, "step": 1047 }, { "epoch": 0.765662100456621, "grad_norm": 60.962302644749236, "learning_rate": 4.873867353663097e-07, "logits/chosen": -2.668821096420288, "logits/rejected": -2.542400598526001, "logps/chosen": -384.0576171875, "logps/rejected": -350.258056640625, "loss": 0.4007, "rewards/accuracies": 0.875, "rewards/chosen": 2.1834218502044678, "rewards/margins": 2.520526170730591, "rewards/rejected": -0.3371043801307678, "step": 1048 }, { "epoch": 0.766392694063927, "grad_norm": 67.18186022841284, "learning_rate": 4.873366625908478e-07, "logits/chosen": -3.037663698196411, "logits/rejected": -2.248476028442383, "logps/chosen": -803.9094848632812, "logps/rejected": -705.6635131835938, "loss": 0.3636, "rewards/accuracies": 0.75, "rewards/chosen": 3.736454486846924, "rewards/margins": 2.811457395553589, "rewards/rejected": 0.9249969720840454, "step": 1049 }, { "epoch": 0.7671232876712328, "grad_norm": 61.5718267046707, "learning_rate": 4.872864932039642e-07, "logits/chosen": -2.565976142883301, "logits/rejected": -2.233569383621216, "logps/chosen": -596.72119140625, "logps/rejected": -534.005859375, "loss": 0.2995, "rewards/accuracies": 1.0, "rewards/chosen": 2.706439971923828, "rewards/margins": 2.679990768432617, "rewards/rejected": 0.026449307799339294, "step": 1050 }, { "epoch": 0.7678538812785388, "grad_norm": 82.06945241398441, "learning_rate": 4.87236227226081e-07, "logits/chosen": -2.85215425491333, "logits/rejected": -2.577056407928467, "logps/chosen": -779.066650390625, "logps/rejected": -678.468994140625, "loss": 0.5559, "rewards/accuracies": 0.75, "rewards/chosen": 3.4089503288269043, "rewards/margins": 1.8769432306289673, "rewards/rejected": 1.5320069789886475, "step": 1051 }, { "epoch": 0.7685844748858448, "grad_norm": 36.68029069658442, "learning_rate": 4.871858646776599e-07, "logits/chosen": -2.390913963317871, "logits/rejected": -1.6135276556015015, "logps/chosen": -476.04486083984375, "logps/rejected": -436.16973876953125, "loss": 0.2236, "rewards/accuracies": 0.875, "rewards/chosen": 3.357208251953125, "rewards/margins": 4.260622024536133, "rewards/rejected": -0.9034138321876526, "step": 1052 }, { "epoch": 0.7693150684931507, "grad_norm": 53.25937509167987, "learning_rate": 4.871354055792015e-07, "logits/chosen": -3.1084837913513184, "logits/rejected": -2.295982837677002, "logps/chosen": -532.9639282226562, "logps/rejected": -354.248046875, "loss": 0.3163, "rewards/accuracies": 1.0, "rewards/chosen": 2.712618350982666, "rewards/margins": 3.110736608505249, "rewards/rejected": -0.3981180191040039, "step": 1053 }, { "epoch": 0.7700456621004567, "grad_norm": 46.21896921432125, "learning_rate": 4.87084849951246e-07, "logits/chosen": -3.4211418628692627, "logits/rejected": -2.4859094619750977, "logps/chosen": -593.4012451171875, "logps/rejected": -380.73651123046875, "loss": 0.331, "rewards/accuracies": 0.875, "rewards/chosen": 2.6211328506469727, "rewards/margins": 2.005680561065674, "rewards/rejected": 0.615452229976654, "step": 1054 }, { "epoch": 0.7707762557077625, "grad_norm": 56.48711957370791, "learning_rate": 4.87034197814373e-07, "logits/chosen": -2.5161168575286865, "logits/rejected": -1.646458625793457, "logps/chosen": -500.55389404296875, "logps/rejected": -318.52520751953125, "loss": 0.3241, "rewards/accuracies": 1.0, "rewards/chosen": 2.6690893173217773, "rewards/margins": 3.123281240463257, "rewards/rejected": -0.4541921019554138, "step": 1055 }, { "epoch": 0.7715068493150685, "grad_norm": 76.99743806614607, "learning_rate": 4.86983449189201e-07, "logits/chosen": -3.058999538421631, "logits/rejected": -2.013982057571411, "logps/chosen": -566.6721801757812, "logps/rejected": -332.2837829589844, "loss": 0.5854, "rewards/accuracies": 0.75, "rewards/chosen": 2.5305092334747314, "rewards/margins": 2.814807415008545, "rewards/rejected": -0.2842983603477478, "step": 1056 }, { "epoch": 0.7722374429223744, "grad_norm": 36.3456846002727, "learning_rate": 4.86932604096388e-07, "logits/chosen": -3.237657070159912, "logits/rejected": -2.5058679580688477, "logps/chosen": -991.2127685546875, "logps/rejected": -514.0682373046875, "loss": 0.2115, "rewards/accuracies": 0.875, "rewards/chosen": 3.787436008453369, "rewards/margins": 2.722179651260376, "rewards/rejected": 1.0652563571929932, "step": 1057 }, { "epoch": 0.7729680365296804, "grad_norm": 62.27611070742737, "learning_rate": 4.868816625566313e-07, "logits/chosen": -2.7621874809265137, "logits/rejected": -2.314371109008789, "logps/chosen": -523.2212524414062, "logps/rejected": -499.9158020019531, "loss": 0.3407, "rewards/accuracies": 1.0, "rewards/chosen": 2.4381027221679688, "rewards/margins": 3.5459883213043213, "rewards/rejected": -1.107885479927063, "step": 1058 }, { "epoch": 0.7736986301369863, "grad_norm": 62.84113347673341, "learning_rate": 4.868306245906675e-07, "logits/chosen": -2.5969550609588623, "logits/rejected": -2.3894762992858887, "logps/chosen": -469.9226989746094, "logps/rejected": -422.59515380859375, "loss": 0.3866, "rewards/accuracies": 0.875, "rewards/chosen": 2.269223690032959, "rewards/margins": 2.7576122283935547, "rewards/rejected": -0.4883885979652405, "step": 1059 }, { "epoch": 0.7744292237442922, "grad_norm": 65.29014644382605, "learning_rate": 4.867794902192722e-07, "logits/chosen": -2.851562261581421, "logits/rejected": -2.127506732940674, "logps/chosen": -707.409912109375, "logps/rejected": -531.825927734375, "loss": 0.343, "rewards/accuracies": 1.0, "rewards/chosen": 2.7220404148101807, "rewards/margins": 2.6437220573425293, "rewards/rejected": 0.07831857353448868, "step": 1060 }, { "epoch": 0.7751598173515982, "grad_norm": 61.87672622276388, "learning_rate": 4.867282594632605e-07, "logits/chosen": -2.6288628578186035, "logits/rejected": -1.9928970336914062, "logps/chosen": -720.9005126953125, "logps/rejected": -606.5848388671875, "loss": 0.4006, "rewards/accuracies": 0.875, "rewards/chosen": 2.7680301666259766, "rewards/margins": 2.6082842350006104, "rewards/rejected": 0.159745991230011, "step": 1061 }, { "epoch": 0.7758904109589041, "grad_norm": 50.09720044261456, "learning_rate": 4.866769323434866e-07, "logits/chosen": -2.222006320953369, "logits/rejected": -2.046173095703125, "logps/chosen": -409.970703125, "logps/rejected": -401.4251403808594, "loss": 0.3455, "rewards/accuracies": 0.625, "rewards/chosen": 1.6101559400558472, "rewards/margins": 1.8687787055969238, "rewards/rejected": -0.25862282514572144, "step": 1062 }, { "epoch": 0.7766210045662101, "grad_norm": 64.76930968107118, "learning_rate": 4.866255088808441e-07, "logits/chosen": -3.1440982818603516, "logits/rejected": -2.1781716346740723, "logps/chosen": -839.636474609375, "logps/rejected": -651.5377197265625, "loss": 0.4286, "rewards/accuracies": 0.875, "rewards/chosen": 3.113806962966919, "rewards/margins": 2.05729603767395, "rewards/rejected": 1.0565109252929688, "step": 1063 }, { "epoch": 0.777351598173516, "grad_norm": 57.34041841480035, "learning_rate": 4.865739890962654e-07, "logits/chosen": -3.252424478530884, "logits/rejected": -2.1151442527770996, "logps/chosen": -720.873046875, "logps/rejected": -501.32049560546875, "loss": 0.3258, "rewards/accuracies": 0.875, "rewards/chosen": 3.25907564163208, "rewards/margins": 3.1118500232696533, "rewards/rejected": 0.1472260057926178, "step": 1064 }, { "epoch": 0.7780821917808219, "grad_norm": 75.93031087368864, "learning_rate": 4.865223730107228e-07, "logits/chosen": -2.6401824951171875, "logits/rejected": -2.127136468887329, "logps/chosen": -553.557373046875, "logps/rejected": -390.60662841796875, "loss": 0.479, "rewards/accuracies": 0.875, "rewards/chosen": 2.888807773590088, "rewards/margins": 2.5178985595703125, "rewards/rejected": 0.37090909481048584, "step": 1065 }, { "epoch": 0.7788127853881278, "grad_norm": 56.766521720828116, "learning_rate": 4.86470660645227e-07, "logits/chosen": -2.934826135635376, "logits/rejected": -1.8837387561798096, "logps/chosen": -815.8653564453125, "logps/rejected": -602.7396240234375, "loss": 0.3288, "rewards/accuracies": 1.0, "rewards/chosen": 4.37171745300293, "rewards/margins": 3.969186305999756, "rewards/rejected": 0.40253138542175293, "step": 1066 }, { "epoch": 0.7795433789954338, "grad_norm": 53.72862947256332, "learning_rate": 4.864188520208285e-07, "logits/chosen": -2.956465721130371, "logits/rejected": -1.7532131671905518, "logps/chosen": -340.1739501953125, "logps/rejected": -304.45697021484375, "loss": 0.2885, "rewards/accuracies": 1.0, "rewards/chosen": 3.00992751121521, "rewards/margins": 4.435563087463379, "rewards/rejected": -1.4256354570388794, "step": 1067 }, { "epoch": 0.7802739726027397, "grad_norm": 63.51739696175273, "learning_rate": 4.863669471586167e-07, "logits/chosen": -2.847097873687744, "logits/rejected": -2.1594886779785156, "logps/chosen": -643.1832885742188, "logps/rejected": -575.9200439453125, "loss": 0.3599, "rewards/accuracies": 1.0, "rewards/chosen": 2.2553226947784424, "rewards/margins": 1.9304161071777344, "rewards/rejected": 0.3249066472053528, "step": 1068 }, { "epoch": 0.7810045662100457, "grad_norm": 44.19735292819441, "learning_rate": 4.863149460797204e-07, "logits/chosen": -2.539232015609741, "logits/rejected": -2.1630496978759766, "logps/chosen": -281.4227294921875, "logps/rejected": -261.49029541015625, "loss": 0.2733, "rewards/accuracies": 0.875, "rewards/chosen": 1.5822194814682007, "rewards/margins": 2.8394863605499268, "rewards/rejected": -1.2572667598724365, "step": 1069 }, { "epoch": 0.7817351598173516, "grad_norm": 54.63357385873749, "learning_rate": 4.862628488053072e-07, "logits/chosen": -3.2693405151367188, "logits/rejected": -2.308675527572632, "logps/chosen": -820.8652954101562, "logps/rejected": -528.8369140625, "loss": 0.3496, "rewards/accuracies": 1.0, "rewards/chosen": 2.937318801879883, "rewards/margins": 2.7499589920043945, "rewards/rejected": 0.18735966086387634, "step": 1070 }, { "epoch": 0.7824657534246575, "grad_norm": 54.808644836632524, "learning_rate": 4.862106553565841e-07, "logits/chosen": -2.8806395530700684, "logits/rejected": -2.520186424255371, "logps/chosen": -697.5361938476562, "logps/rejected": -558.4091796875, "loss": 0.3096, "rewards/accuracies": 0.75, "rewards/chosen": 2.0345005989074707, "rewards/margins": 1.2702364921569824, "rewards/rejected": 0.7642643451690674, "step": 1071 }, { "epoch": 0.7831963470319635, "grad_norm": 68.86091904611062, "learning_rate": 4.861583657547974e-07, "logits/chosen": -3.2227630615234375, "logits/rejected": -2.482192039489746, "logps/chosen": -960.3257446289062, "logps/rejected": -772.764892578125, "loss": 0.4091, "rewards/accuracies": 0.625, "rewards/chosen": 3.3992762565612793, "rewards/margins": 1.9214375019073486, "rewards/rejected": 1.4778389930725098, "step": 1072 }, { "epoch": 0.7839269406392694, "grad_norm": 58.94849902337951, "learning_rate": 4.861059800212322e-07, "logits/chosen": -2.5579071044921875, "logits/rejected": -2.128920793533325, "logps/chosen": -649.0702514648438, "logps/rejected": -579.459228515625, "loss": 0.3152, "rewards/accuracies": 0.875, "rewards/chosen": 1.9488046169281006, "rewards/margins": 2.9510304927825928, "rewards/rejected": -1.0022261142730713, "step": 1073 }, { "epoch": 0.7846575342465754, "grad_norm": 54.3814808458208, "learning_rate": 4.860534981772129e-07, "logits/chosen": -2.8688111305236816, "logits/rejected": -2.4442055225372314, "logps/chosen": -523.581787109375, "logps/rejected": -469.51068115234375, "loss": 0.3416, "rewards/accuracies": 0.75, "rewards/chosen": 2.0322015285491943, "rewards/margins": 1.9030977487564087, "rewards/rejected": 0.129103884100914, "step": 1074 }, { "epoch": 0.7853881278538812, "grad_norm": 67.94710862363829, "learning_rate": 4.860009202441032e-07, "logits/chosen": -2.797483205795288, "logits/rejected": -2.421052932739258, "logps/chosen": -852.7379760742188, "logps/rejected": -665.0563354492188, "loss": 0.3977, "rewards/accuracies": 0.625, "rewards/chosen": 2.9664828777313232, "rewards/margins": 2.561469554901123, "rewards/rejected": 0.40501344203948975, "step": 1075 }, { "epoch": 0.7861187214611872, "grad_norm": 86.65891516817528, "learning_rate": 4.859482462433054e-07, "logits/chosen": -2.6948130130767822, "logits/rejected": -3.2345073223114014, "logps/chosen": -702.4840087890625, "logps/rejected": -830.19384765625, "loss": 0.5064, "rewards/accuracies": 0.625, "rewards/chosen": 1.9669643640518188, "rewards/margins": 0.7031534910202026, "rewards/rejected": 1.2638107538223267, "step": 1076 }, { "epoch": 0.7868493150684932, "grad_norm": 63.30865533119267, "learning_rate": 4.858954761962615e-07, "logits/chosen": -2.1268503665924072, "logits/rejected": -1.8964686393737793, "logps/chosen": -516.7821044921875, "logps/rejected": -448.431884765625, "loss": 0.3041, "rewards/accuracies": 0.75, "rewards/chosen": 2.490156650543213, "rewards/margins": 2.9937901496887207, "rewards/rejected": -0.5036337375640869, "step": 1077 }, { "epoch": 0.7875799086757991, "grad_norm": 65.08500676399096, "learning_rate": 4.858426101244523e-07, "logits/chosen": -2.291292667388916, "logits/rejected": -2.1926209926605225, "logps/chosen": -632.0328369140625, "logps/rejected": -750.1234741210938, "loss": 0.3172, "rewards/accuracies": 0.875, "rewards/chosen": 3.0175018310546875, "rewards/margins": 4.121882915496826, "rewards/rejected": -1.1043810844421387, "step": 1078 }, { "epoch": 0.7883105022831051, "grad_norm": 54.220481330426466, "learning_rate": 4.857896480493976e-07, "logits/chosen": -2.7737855911254883, "logits/rejected": -2.5120275020599365, "logps/chosen": -518.3296508789062, "logps/rejected": -533.5214233398438, "loss": 0.3391, "rewards/accuracies": 0.75, "rewards/chosen": 2.552539110183716, "rewards/margins": 2.928575038909912, "rewards/rejected": -0.3760358691215515, "step": 1079 }, { "epoch": 0.7890410958904109, "grad_norm": 65.83832144209404, "learning_rate": 4.857365899926565e-07, "logits/chosen": -2.4775924682617188, "logits/rejected": -1.9862093925476074, "logps/chosen": -443.035400390625, "logps/rejected": -409.052490234375, "loss": 0.4856, "rewards/accuracies": 0.5, "rewards/chosen": 1.2240692377090454, "rewards/margins": 1.223825216293335, "rewards/rejected": 0.00024405121803283691, "step": 1080 }, { "epoch": 0.7897716894977169, "grad_norm": 51.1734780283266, "learning_rate": 4.85683435975827e-07, "logits/chosen": -2.7550442218780518, "logits/rejected": -1.5599613189697266, "logps/chosen": -359.978759765625, "logps/rejected": -205.39544677734375, "loss": 0.3138, "rewards/accuracies": 1.0, "rewards/chosen": 2.822826623916626, "rewards/margins": 3.684154510498047, "rewards/rejected": -0.8613277673721313, "step": 1081 }, { "epoch": 0.7905022831050228, "grad_norm": 52.416707550825336, "learning_rate": 4.856301860205462e-07, "logits/chosen": -2.798804759979248, "logits/rejected": -1.9356389045715332, "logps/chosen": -700.9213256835938, "logps/rejected": -498.46868896484375, "loss": 0.2862, "rewards/accuracies": 0.875, "rewards/chosen": 2.658909797668457, "rewards/margins": 2.0361764430999756, "rewards/rejected": 0.6227332353591919, "step": 1082 }, { "epoch": 0.7912328767123288, "grad_norm": 57.76649563179443, "learning_rate": 4.855768401484906e-07, "logits/chosen": -2.944427967071533, "logits/rejected": -2.7395591735839844, "logps/chosen": -500.4962158203125, "logps/rejected": -431.6598205566406, "loss": 0.4496, "rewards/accuracies": 0.375, "rewards/chosen": 0.5069372057914734, "rewards/margins": -0.04435238242149353, "rewards/rejected": 0.5512895584106445, "step": 1083 }, { "epoch": 0.7919634703196347, "grad_norm": 50.038520318100936, "learning_rate": 4.855233983813751e-07, "logits/chosen": -2.6313202381134033, "logits/rejected": -1.9496264457702637, "logps/chosen": -688.0363159179688, "logps/rejected": -540.9590454101562, "loss": 0.3529, "rewards/accuracies": 0.75, "rewards/chosen": 2.743600368499756, "rewards/margins": 2.8107125759124756, "rewards/rejected": -0.06711225211620331, "step": 1084 }, { "epoch": 0.7926940639269406, "grad_norm": 61.307753384054124, "learning_rate": 4.85469860740954e-07, "logits/chosen": -3.041499614715576, "logits/rejected": -2.0369081497192383, "logps/chosen": -904.0930786132812, "logps/rejected": -466.67083740234375, "loss": 0.4343, "rewards/accuracies": 0.875, "rewards/chosen": 3.2477095127105713, "rewards/margins": 2.674689531326294, "rewards/rejected": 0.5730198621749878, "step": 1085 }, { "epoch": 0.7934246575342466, "grad_norm": 62.08868955851581, "learning_rate": 4.854162272490207e-07, "logits/chosen": -2.5138142108917236, "logits/rejected": -2.039987564086914, "logps/chosen": -389.4400634765625, "logps/rejected": -254.5110321044922, "loss": 0.3892, "rewards/accuracies": 0.75, "rewards/chosen": 1.9048137664794922, "rewards/margins": 1.8545749187469482, "rewards/rejected": 0.050238899886608124, "step": 1086 }, { "epoch": 0.7941552511415525, "grad_norm": 70.86570223956191, "learning_rate": 4.853624979274075e-07, "logits/chosen": -2.756704330444336, "logits/rejected": -2.2379140853881836, "logps/chosen": -696.3284912109375, "logps/rejected": -509.83758544921875, "loss": 0.4184, "rewards/accuracies": 0.875, "rewards/chosen": 2.8924288749694824, "rewards/margins": 3.4544482231140137, "rewards/rejected": -0.5620192885398865, "step": 1087 }, { "epoch": 0.7948858447488585, "grad_norm": 58.18147038442523, "learning_rate": 4.853086727979857e-07, "logits/chosen": -3.00420880317688, "logits/rejected": -2.6696128845214844, "logps/chosen": -494.7095031738281, "logps/rejected": -422.70977783203125, "loss": 0.4313, "rewards/accuracies": 0.625, "rewards/chosen": 1.4217157363891602, "rewards/margins": 1.4724514484405518, "rewards/rejected": -0.050735682249069214, "step": 1088 }, { "epoch": 0.7956164383561644, "grad_norm": 48.83312474402621, "learning_rate": 4.852547518826655e-07, "logits/chosen": -2.7771806716918945, "logits/rejected": -2.191673755645752, "logps/chosen": -714.7711791992188, "logps/rejected": -487.38946533203125, "loss": 0.2978, "rewards/accuracies": 0.875, "rewards/chosen": 2.0569493770599365, "rewards/margins": 2.2430195808410645, "rewards/rejected": -0.18607017397880554, "step": 1089 }, { "epoch": 0.7963470319634703, "grad_norm": 48.16592719930501, "learning_rate": 4.852007352033965e-07, "logits/chosen": -3.262190580368042, "logits/rejected": -1.9331541061401367, "logps/chosen": -672.7557373046875, "logps/rejected": -372.47314453125, "loss": 0.2418, "rewards/accuracies": 1.0, "rewards/chosen": 3.6216931343078613, "rewards/margins": 4.578502655029297, "rewards/rejected": -0.9568096399307251, "step": 1090 }, { "epoch": 0.7970776255707762, "grad_norm": 53.82159566713207, "learning_rate": 4.851466227821667e-07, "logits/chosen": -2.7848312854766846, "logits/rejected": -2.215909957885742, "logps/chosen": -576.1546630859375, "logps/rejected": -459.94317626953125, "loss": 0.3027, "rewards/accuracies": 0.625, "rewards/chosen": 2.0342330932617188, "rewards/margins": 1.9672062397003174, "rewards/rejected": 0.06702691316604614, "step": 1091 }, { "epoch": 0.7978082191780822, "grad_norm": 49.02020220898089, "learning_rate": 4.850924146410034e-07, "logits/chosen": -2.9151053428649902, "logits/rejected": -2.0798532962799072, "logps/chosen": -474.2899169921875, "logps/rejected": -361.3594665527344, "loss": 0.2652, "rewards/accuracies": 1.0, "rewards/chosen": 2.1443445682525635, "rewards/margins": 3.2534127235412598, "rewards/rejected": -1.109068512916565, "step": 1092 }, { "epoch": 0.7985388127853881, "grad_norm": 38.87974435981254, "learning_rate": 4.850381108019731e-07, "logits/chosen": -2.4212851524353027, "logits/rejected": -2.412498950958252, "logps/chosen": -465.0708312988281, "logps/rejected": -500.5428466796875, "loss": 0.2083, "rewards/accuracies": 0.875, "rewards/chosen": 1.7182607650756836, "rewards/margins": 1.8343693017959595, "rewards/rejected": -0.11610852181911469, "step": 1093 }, { "epoch": 0.7992694063926941, "grad_norm": 73.14779959737339, "learning_rate": 4.849837112871807e-07, "logits/chosen": -3.048255443572998, "logits/rejected": -2.5706305503845215, "logps/chosen": -558.1559448242188, "logps/rejected": -564.0203247070312, "loss": 0.5082, "rewards/accuracies": 0.5, "rewards/chosen": 1.8660883903503418, "rewards/margins": 2.309999465942383, "rewards/rejected": -0.44391098618507385, "step": 1094 }, { "epoch": 0.8, "grad_norm": 69.09099383919369, "learning_rate": 4.849292161187704e-07, "logits/chosen": -3.4026906490325928, "logits/rejected": -2.416365623474121, "logps/chosen": -707.4216918945312, "logps/rejected": -470.91888427734375, "loss": 0.375, "rewards/accuracies": 0.625, "rewards/chosen": 1.9786007404327393, "rewards/margins": 1.4992740154266357, "rewards/rejected": 0.47932666540145874, "step": 1095 }, { "epoch": 0.8007305936073059, "grad_norm": 63.202228825537674, "learning_rate": 4.848746253189253e-07, "logits/chosen": -2.9397757053375244, "logits/rejected": -2.6780505180358887, "logps/chosen": -615.0545654296875, "logps/rejected": -652.904541015625, "loss": 0.3811, "rewards/accuracies": 0.75, "rewards/chosen": 2.2571535110473633, "rewards/margins": 2.6499547958374023, "rewards/rejected": -0.3928012251853943, "step": 1096 }, { "epoch": 0.8014611872146119, "grad_norm": 51.75385892919, "learning_rate": 4.848199389098674e-07, "logits/chosen": -3.1920292377471924, "logits/rejected": -2.9940683841705322, "logps/chosen": -860.4344482421875, "logps/rejected": -877.45751953125, "loss": 0.3085, "rewards/accuracies": 0.75, "rewards/chosen": 2.6823742389678955, "rewards/margins": 2.4630751609802246, "rewards/rejected": 0.2192992866039276, "step": 1097 }, { "epoch": 0.8021917808219178, "grad_norm": 52.381604781938954, "learning_rate": 4.847651569138577e-07, "logits/chosen": -2.4486429691314697, "logits/rejected": -2.3192920684814453, "logps/chosen": -436.804443359375, "logps/rejected": -669.7884521484375, "loss": 0.3244, "rewards/accuracies": 0.875, "rewards/chosen": 2.338153600692749, "rewards/margins": 3.0840907096862793, "rewards/rejected": -0.7459369897842407, "step": 1098 }, { "epoch": 0.8029223744292238, "grad_norm": 43.99769724975672, "learning_rate": 4.84710279353196e-07, "logits/chosen": -3.0049915313720703, "logits/rejected": -1.8171225786209106, "logps/chosen": -648.31787109375, "logps/rejected": -403.95123291015625, "loss": 0.1772, "rewards/accuracies": 1.0, "rewards/chosen": 2.8346340656280518, "rewards/margins": 3.5103085041046143, "rewards/rejected": -0.6756742596626282, "step": 1099 }, { "epoch": 0.8036529680365296, "grad_norm": 71.93716740800613, "learning_rate": 4.846553062502208e-07, "logits/chosen": -3.0388193130493164, "logits/rejected": -2.2903735637664795, "logps/chosen": -665.1681518554688, "logps/rejected": -580.7235717773438, "loss": 0.4531, "rewards/accuracies": 0.75, "rewards/chosen": 2.076457977294922, "rewards/margins": 2.306112766265869, "rewards/rejected": -0.22965455055236816, "step": 1100 }, { "epoch": 0.8043835616438356, "grad_norm": 48.8480784230557, "learning_rate": 4.8460023762731e-07, "logits/chosen": -3.0839216709136963, "logits/rejected": -1.8264150619506836, "logps/chosen": -371.7157287597656, "logps/rejected": -262.270263671875, "loss": 0.2901, "rewards/accuracies": 0.875, "rewards/chosen": 2.3590281009674072, "rewards/margins": 4.465622425079346, "rewards/rejected": -2.1065945625305176, "step": 1101 }, { "epoch": 0.8051141552511416, "grad_norm": 67.07813045946868, "learning_rate": 4.845450735068799e-07, "logits/chosen": -2.663029670715332, "logits/rejected": -2.0464353561401367, "logps/chosen": -656.3435668945312, "logps/rejected": -423.5509033203125, "loss": 0.3591, "rewards/accuracies": 0.875, "rewards/chosen": 2.5787529945373535, "rewards/margins": 2.2986631393432617, "rewards/rejected": 0.2800900936126709, "step": 1102 }, { "epoch": 0.8058447488584475, "grad_norm": 58.780726672311566, "learning_rate": 4.84489813911386e-07, "logits/chosen": -2.6634128093719482, "logits/rejected": -2.315396785736084, "logps/chosen": -869.444580078125, "logps/rejected": -750.3341674804688, "loss": 0.3737, "rewards/accuracies": 1.0, "rewards/chosen": 2.3551578521728516, "rewards/margins": 1.5322799682617188, "rewards/rejected": 0.8228777647018433, "step": 1103 }, { "epoch": 0.8065753424657535, "grad_norm": 54.97107726739126, "learning_rate": 4.844344588633226e-07, "logits/chosen": -2.7699482440948486, "logits/rejected": -1.8837182521820068, "logps/chosen": -704.20947265625, "logps/rejected": -392.69720458984375, "loss": 0.321, "rewards/accuracies": 1.0, "rewards/chosen": 2.079275608062744, "rewards/margins": 1.9093198776245117, "rewards/rejected": 0.1699555516242981, "step": 1104 }, { "epoch": 0.8073059360730593, "grad_norm": 61.95127085813091, "learning_rate": 4.843790083852226e-07, "logits/chosen": -2.6932640075683594, "logits/rejected": -2.374760150909424, "logps/chosen": -440.24261474609375, "logps/rejected": -458.80548095703125, "loss": 0.3441, "rewards/accuracies": 0.75, "rewards/chosen": 1.8923795223236084, "rewards/margins": 1.9462977647781372, "rewards/rejected": -0.05391812324523926, "step": 1105 }, { "epoch": 0.8080365296803653, "grad_norm": 69.00808801102592, "learning_rate": 4.843234624996581e-07, "logits/chosen": -2.4512948989868164, "logits/rejected": -2.700995922088623, "logps/chosen": -509.7396240234375, "logps/rejected": -722.2730102539062, "loss": 0.3823, "rewards/accuracies": 0.875, "rewards/chosen": 1.118847370147705, "rewards/margins": 0.4528953433036804, "rewards/rejected": 0.6659519076347351, "step": 1106 }, { "epoch": 0.8087671232876712, "grad_norm": 70.81977773170959, "learning_rate": 4.842678212292399e-07, "logits/chosen": -2.582045555114746, "logits/rejected": -2.2195136547088623, "logps/chosen": -542.8109741210938, "logps/rejected": -477.235107421875, "loss": 0.4886, "rewards/accuracies": 0.75, "rewards/chosen": 1.6087403297424316, "rewards/margins": 1.776855230331421, "rewards/rejected": -0.16811498999595642, "step": 1107 }, { "epoch": 0.8094977168949772, "grad_norm": 52.19225277759143, "learning_rate": 4.842120845966174e-07, "logits/chosen": -2.778491497039795, "logits/rejected": -1.705564260482788, "logps/chosen": -740.7492065429688, "logps/rejected": -507.13739013671875, "loss": 0.3487, "rewards/accuracies": 0.75, "rewards/chosen": 2.259678363800049, "rewards/margins": 2.0589122772216797, "rewards/rejected": 0.20076611638069153, "step": 1108 }, { "epoch": 0.8102283105022831, "grad_norm": 68.26076637837068, "learning_rate": 4.841562526244792e-07, "logits/chosen": -3.0544371604919434, "logits/rejected": -2.794626474380493, "logps/chosen": -877.5407104492188, "logps/rejected": -769.259521484375, "loss": 0.3807, "rewards/accuracies": 0.625, "rewards/chosen": 1.9178142547607422, "rewards/margins": 1.1715031862258911, "rewards/rejected": 0.7463111877441406, "step": 1109 }, { "epoch": 0.810958904109589, "grad_norm": 42.454933596918394, "learning_rate": 4.841003253355526e-07, "logits/chosen": -3.2719407081604004, "logits/rejected": -2.3304836750030518, "logps/chosen": -936.6983642578125, "logps/rejected": -662.6555786132812, "loss": 0.3135, "rewards/accuracies": 0.875, "rewards/chosen": 2.863374710083008, "rewards/margins": 2.624192476272583, "rewards/rejected": 0.23918208479881287, "step": 1110 }, { "epoch": 0.811689497716895, "grad_norm": 54.082424977124035, "learning_rate": 4.840443027526034e-07, "logits/chosen": -3.123398780822754, "logits/rejected": -2.627927780151367, "logps/chosen": -668.6817626953125, "logps/rejected": -537.977783203125, "loss": 0.3943, "rewards/accuracies": 1.0, "rewards/chosen": 3.741361379623413, "rewards/margins": 3.9466440677642822, "rewards/rejected": -0.20528274774551392, "step": 1111 }, { "epoch": 0.8124200913242009, "grad_norm": 61.05710217953764, "learning_rate": 4.839881848984366e-07, "logits/chosen": -2.7066586017608643, "logits/rejected": -2.4985010623931885, "logps/chosen": -312.61944580078125, "logps/rejected": -351.787841796875, "loss": 0.4074, "rewards/accuracies": 0.875, "rewards/chosen": 1.4572879076004028, "rewards/margins": 3.0513293743133545, "rewards/rejected": -1.5940415859222412, "step": 1112 }, { "epoch": 0.8131506849315069, "grad_norm": 59.96091256106973, "learning_rate": 4.839319717958957e-07, "logits/chosen": -2.6203455924987793, "logits/rejected": -2.141474723815918, "logps/chosen": -429.083251953125, "logps/rejected": -372.3306884765625, "loss": 0.4688, "rewards/accuracies": 0.625, "rewards/chosen": 1.4698882102966309, "rewards/margins": 1.697967529296875, "rewards/rejected": -0.22807921469211578, "step": 1113 }, { "epoch": 0.8138812785388128, "grad_norm": 64.03900865024575, "learning_rate": 4.838756634678633e-07, "logits/chosen": -2.6258983612060547, "logits/rejected": -2.4594125747680664, "logps/chosen": -714.3848266601562, "logps/rejected": -624.0897216796875, "loss": 0.3382, "rewards/accuracies": 0.875, "rewards/chosen": 2.3800106048583984, "rewards/margins": 1.6256784200668335, "rewards/rejected": 0.7543323040008545, "step": 1114 }, { "epoch": 0.8146118721461187, "grad_norm": 59.560555041041006, "learning_rate": 4.838192599372603e-07, "logits/chosen": -2.3783717155456543, "logits/rejected": -2.379674196243286, "logps/chosen": -323.4302978515625, "logps/rejected": -363.00946044921875, "loss": 0.3399, "rewards/accuracies": 0.875, "rewards/chosen": 0.6947383880615234, "rewards/margins": 1.7020764350891113, "rewards/rejected": -1.007338047027588, "step": 1115 }, { "epoch": 0.8153424657534246, "grad_norm": 52.666440037750796, "learning_rate": 4.837627612270467e-07, "logits/chosen": -2.934185266494751, "logits/rejected": -1.907247543334961, "logps/chosen": -931.8165283203125, "logps/rejected": -540.528564453125, "loss": 0.3559, "rewards/accuracies": 0.875, "rewards/chosen": 2.51709246635437, "rewards/margins": 3.3466663360595703, "rewards/rejected": -0.8295739889144897, "step": 1116 }, { "epoch": 0.8160730593607306, "grad_norm": 70.85923201473997, "learning_rate": 4.837061673602211e-07, "logits/chosen": -2.6807703971862793, "logits/rejected": -2.0461585521698, "logps/chosen": -740.5049438476562, "logps/rejected": -624.5802001953125, "loss": 0.4999, "rewards/accuracies": 0.75, "rewards/chosen": 3.7978522777557373, "rewards/margins": 3.888925552368164, "rewards/rejected": -0.09107343852519989, "step": 1117 }, { "epoch": 0.8168036529680365, "grad_norm": 60.00074258980571, "learning_rate": 4.83649478359821e-07, "logits/chosen": -2.879823684692383, "logits/rejected": -2.44266939163208, "logps/chosen": -983.2512817382812, "logps/rejected": -713.3104248046875, "loss": 0.413, "rewards/accuracies": 0.75, "rewards/chosen": 2.9000954627990723, "rewards/margins": 1.7665379047393799, "rewards/rejected": 1.1335573196411133, "step": 1118 }, { "epoch": 0.8175342465753425, "grad_norm": 44.80942992011419, "learning_rate": 4.835926942489223e-07, "logits/chosen": -2.6226110458374023, "logits/rejected": -2.731147050857544, "logps/chosen": -427.896484375, "logps/rejected": -413.44189453125, "loss": 0.2627, "rewards/accuracies": 0.875, "rewards/chosen": 0.719323456287384, "rewards/margins": 1.534332036972046, "rewards/rejected": -0.8150084018707275, "step": 1119 }, { "epoch": 0.8182648401826484, "grad_norm": 55.955995981915336, "learning_rate": 4.8353581505064e-07, "logits/chosen": -3.141996383666992, "logits/rejected": -2.4080300331115723, "logps/chosen": -799.282958984375, "logps/rejected": -611.0940551757812, "loss": 0.2739, "rewards/accuracies": 1.0, "rewards/chosen": 2.4221808910369873, "rewards/margins": 2.253870964050293, "rewards/rejected": 0.1683099865913391, "step": 1120 }, { "epoch": 0.8189954337899543, "grad_norm": 49.31703232894671, "learning_rate": 4.834788407881275e-07, "logits/chosen": -2.961824655532837, "logits/rejected": -2.3590810298919678, "logps/chosen": -536.4735107421875, "logps/rejected": -459.6643371582031, "loss": 0.2964, "rewards/accuracies": 0.875, "rewards/chosen": 3.0171380043029785, "rewards/margins": 3.222334384918213, "rewards/rejected": -0.20519648492336273, "step": 1121 }, { "epoch": 0.8197260273972603, "grad_norm": 58.3505808300408, "learning_rate": 4.834217714845772e-07, "logits/chosen": -2.859654426574707, "logits/rejected": -2.083033323287964, "logps/chosen": -894.9918212890625, "logps/rejected": -616.1182250976562, "loss": 0.3167, "rewards/accuracies": 1.0, "rewards/chosen": 3.8574185371398926, "rewards/margins": 3.2841620445251465, "rewards/rejected": 0.5732566714286804, "step": 1122 }, { "epoch": 0.8204566210045662, "grad_norm": 44.45810192921463, "learning_rate": 4.833646071632197e-07, "logits/chosen": -2.8163297176361084, "logits/rejected": -2.345902919769287, "logps/chosen": -1026.85400390625, "logps/rejected": -725.0202026367188, "loss": 0.2015, "rewards/accuracies": 1.0, "rewards/chosen": 3.8625712394714355, "rewards/margins": 3.427438259124756, "rewards/rejected": 0.43513262271881104, "step": 1123 }, { "epoch": 0.8211872146118722, "grad_norm": 51.598562199869, "learning_rate": 4.833073478473248e-07, "logits/chosen": -2.4784319400787354, "logits/rejected": -2.4539949893951416, "logps/chosen": -783.4489135742188, "logps/rejected": -857.7720336914062, "loss": 0.2383, "rewards/accuracies": 0.5, "rewards/chosen": 3.105480432510376, "rewards/margins": 2.2374205589294434, "rewards/rejected": 0.8680601716041565, "step": 1124 }, { "epoch": 0.821917808219178, "grad_norm": 75.39293552158915, "learning_rate": 4.832499935602008e-07, "logits/chosen": -2.6930766105651855, "logits/rejected": -2.2868728637695312, "logps/chosen": -558.4915771484375, "logps/rejected": -392.07061767578125, "loss": 0.4043, "rewards/accuracies": 0.625, "rewards/chosen": 1.8080964088439941, "rewards/margins": 1.384722113609314, "rewards/rejected": 0.42337432503700256, "step": 1125 }, { "epoch": 0.822648401826484, "grad_norm": 54.732619779878526, "learning_rate": 4.831925443251945e-07, "logits/chosen": -2.691713333129883, "logits/rejected": -2.407555341720581, "logps/chosen": -439.8815612792969, "logps/rejected": -446.66717529296875, "loss": 0.2842, "rewards/accuracies": 1.0, "rewards/chosen": 2.12209415435791, "rewards/margins": 3.113865613937378, "rewards/rejected": -0.9917715787887573, "step": 1126 }, { "epoch": 0.82337899543379, "grad_norm": 56.54874012536111, "learning_rate": 4.831350001656916e-07, "logits/chosen": -3.2724361419677734, "logits/rejected": -2.0340824127197266, "logps/chosen": -1147.86767578125, "logps/rejected": -577.0864868164062, "loss": 0.3165, "rewards/accuracies": 0.75, "rewards/chosen": 2.663166046142578, "rewards/margins": 1.4735703468322754, "rewards/rejected": 1.1895956993103027, "step": 1127 }, { "epoch": 0.8241095890410959, "grad_norm": 67.95389205418995, "learning_rate": 4.830773611051161e-07, "logits/chosen": -2.5439205169677734, "logits/rejected": -1.5965750217437744, "logps/chosen": -503.59765625, "logps/rejected": -338.953125, "loss": 0.3467, "rewards/accuracies": 0.875, "rewards/chosen": 1.5871217250823975, "rewards/margins": 2.6860318183898926, "rewards/rejected": -1.0989099740982056, "step": 1128 }, { "epoch": 0.8248401826484019, "grad_norm": 47.04562695538237, "learning_rate": 4.83019627166931e-07, "logits/chosen": -2.8708207607269287, "logits/rejected": -2.586388111114502, "logps/chosen": -331.9892578125, "logps/rejected": -332.6925964355469, "loss": 0.3262, "rewards/accuracies": 1.0, "rewards/chosen": 1.3810409307479858, "rewards/margins": 1.8354793787002563, "rewards/rejected": -0.4544384181499481, "step": 1129 }, { "epoch": 0.8255707762557077, "grad_norm": 57.09661280110559, "learning_rate": 4.829617983746377e-07, "logits/chosen": -2.485360622406006, "logits/rejected": -2.4371440410614014, "logps/chosen": -524.654541015625, "logps/rejected": -633.1805419921875, "loss": 0.4122, "rewards/accuracies": 0.75, "rewards/chosen": 2.202637195587158, "rewards/margins": 1.9499726295471191, "rewards/rejected": 0.2526644170284271, "step": 1130 }, { "epoch": 0.8263013698630137, "grad_norm": 48.796322143041685, "learning_rate": 4.829038747517763e-07, "logits/chosen": -3.0168073177337646, "logits/rejected": -2.09334659576416, "logps/chosen": -589.9304809570312, "logps/rejected": -385.2255859375, "loss": 0.3931, "rewards/accuracies": 0.75, "rewards/chosen": 1.7131437063217163, "rewards/margins": 1.4938387870788574, "rewards/rejected": 0.21930500864982605, "step": 1131 }, { "epoch": 0.8270319634703196, "grad_norm": 63.00960988214417, "learning_rate": 4.828458563219254e-07, "logits/chosen": -3.101069688796997, "logits/rejected": -2.507232189178467, "logps/chosen": -404.653076171875, "logps/rejected": -379.999755859375, "loss": 0.3662, "rewards/accuracies": 1.0, "rewards/chosen": 1.1072067022323608, "rewards/margins": 1.7481107711791992, "rewards/rejected": -0.6409041285514832, "step": 1132 }, { "epoch": 0.8277625570776256, "grad_norm": 61.161970382695735, "learning_rate": 4.827877431087025e-07, "logits/chosen": -2.9498348236083984, "logits/rejected": -1.9816205501556396, "logps/chosen": -800.7689208984375, "logps/rejected": -417.73046875, "loss": 0.3551, "rewards/accuracies": 1.0, "rewards/chosen": 2.3626842498779297, "rewards/margins": 2.426828145980835, "rewards/rejected": -0.06414391845464706, "step": 1133 }, { "epoch": 0.8284931506849315, "grad_norm": 50.81224775075764, "learning_rate": 4.82729535135763e-07, "logits/chosen": -2.9448585510253906, "logits/rejected": -2.1347906589508057, "logps/chosen": -701.2877807617188, "logps/rejected": -454.13995361328125, "loss": 0.28, "rewards/accuracies": 1.0, "rewards/chosen": 2.200223445892334, "rewards/margins": 2.4681882858276367, "rewards/rejected": -0.2679649591445923, "step": 1134 }, { "epoch": 0.8292237442922374, "grad_norm": 49.55508348119419, "learning_rate": 4.826712324268018e-07, "logits/chosen": -2.7830519676208496, "logits/rejected": -2.1660890579223633, "logps/chosen": -595.8357543945312, "logps/rejected": -514.9376220703125, "loss": 0.2965, "rewards/accuracies": 1.0, "rewards/chosen": 2.601856231689453, "rewards/margins": 2.9711060523986816, "rewards/rejected": -0.36924996972084045, "step": 1135 }, { "epoch": 0.8299543378995434, "grad_norm": 69.51184420208544, "learning_rate": 4.826128350055515e-07, "logits/chosen": -2.6328439712524414, "logits/rejected": -2.344489097595215, "logps/chosen": -684.3597412109375, "logps/rejected": -495.01678466796875, "loss": 0.515, "rewards/accuracies": 0.5, "rewards/chosen": 1.49613356590271, "rewards/margins": 1.0352513790130615, "rewards/rejected": 0.46088215708732605, "step": 1136 }, { "epoch": 0.8306849315068493, "grad_norm": 57.96184047433169, "learning_rate": 4.825543428957839e-07, "logits/chosen": -2.4266514778137207, "logits/rejected": -2.1149048805236816, "logps/chosen": -839.5366821289062, "logps/rejected": -669.9209594726562, "loss": 0.3678, "rewards/accuracies": 0.75, "rewards/chosen": 2.855832815170288, "rewards/margins": 2.31404447555542, "rewards/rejected": 0.5417879819869995, "step": 1137 }, { "epoch": 0.8314155251141553, "grad_norm": 73.17244054073569, "learning_rate": 4.824957561213091e-07, "logits/chosen": -3.2637574672698975, "logits/rejected": -2.325845718383789, "logps/chosen": -537.7445678710938, "logps/rejected": -392.93817138671875, "loss": 0.4893, "rewards/accuracies": 1.0, "rewards/chosen": 2.4219589233398438, "rewards/margins": 2.859807252883911, "rewards/rejected": -0.437848299741745, "step": 1138 }, { "epoch": 0.8321461187214612, "grad_norm": 69.38354747457245, "learning_rate": 4.824370747059755e-07, "logits/chosen": -2.932424545288086, "logits/rejected": -2.869689464569092, "logps/chosen": -614.6990356445312, "logps/rejected": -708.1049194335938, "loss": 0.4957, "rewards/accuracies": 0.875, "rewards/chosen": 2.4901015758514404, "rewards/margins": 2.480419397354126, "rewards/rejected": 0.00968211516737938, "step": 1139 }, { "epoch": 0.8328767123287671, "grad_norm": 70.30829848558064, "learning_rate": 4.823782986736704e-07, "logits/chosen": -2.493659019470215, "logits/rejected": -2.10209321975708, "logps/chosen": -678.12255859375, "logps/rejected": -525.52490234375, "loss": 0.3644, "rewards/accuracies": 0.75, "rewards/chosen": 1.942722201347351, "rewards/margins": 2.152604579925537, "rewards/rejected": -0.209882453083992, "step": 1140 }, { "epoch": 0.833607305936073, "grad_norm": 56.94448236893216, "learning_rate": 4.823194280483194e-07, "logits/chosen": -2.7867825031280518, "logits/rejected": -3.0952916145324707, "logps/chosen": -580.654296875, "logps/rejected": -802.083984375, "loss": 0.294, "rewards/accuracies": 0.75, "rewards/chosen": 0.9654104113578796, "rewards/margins": 1.278568148612976, "rewards/rejected": -0.31315773725509644, "step": 1141 }, { "epoch": 0.834337899543379, "grad_norm": 55.05687311629448, "learning_rate": 4.822604628538867e-07, "logits/chosen": -2.587003231048584, "logits/rejected": -2.366569995880127, "logps/chosen": -408.89361572265625, "logps/rejected": -381.71356201171875, "loss": 0.3645, "rewards/accuracies": 0.75, "rewards/chosen": 1.5224170684814453, "rewards/margins": 2.3464231491088867, "rewards/rejected": -0.8240060806274414, "step": 1142 }, { "epoch": 0.8350684931506849, "grad_norm": 56.993472076852804, "learning_rate": 4.82201403114375e-07, "logits/chosen": -3.0955913066864014, "logits/rejected": -2.9121646881103516, "logps/chosen": -645.4386596679688, "logps/rejected": -613.47314453125, "loss": 0.3304, "rewards/accuracies": 0.875, "rewards/chosen": 1.1485710144042969, "rewards/margins": 1.003077745437622, "rewards/rejected": 0.14549332857131958, "step": 1143 }, { "epoch": 0.8357990867579909, "grad_norm": 52.002610361163605, "learning_rate": 4.821422488538254e-07, "logits/chosen": -2.341780662536621, "logits/rejected": -2.388941526412964, "logps/chosen": -407.5285949707031, "logps/rejected": -503.4302978515625, "loss": 0.3277, "rewards/accuracies": 0.875, "rewards/chosen": 1.720731258392334, "rewards/margins": 2.172834634780884, "rewards/rejected": -0.45210322737693787, "step": 1144 }, { "epoch": 0.8365296803652968, "grad_norm": 60.97534081495568, "learning_rate": 4.820830000963175e-07, "logits/chosen": -2.1793923377990723, "logits/rejected": -2.4472482204437256, "logps/chosen": -549.38623046875, "logps/rejected": -476.02911376953125, "loss": 0.3981, "rewards/accuracies": 0.75, "rewards/chosen": 2.6094021797180176, "rewards/margins": 2.8190698623657227, "rewards/rejected": -0.2096676528453827, "step": 1145 }, { "epoch": 0.8372602739726027, "grad_norm": 51.04374225159118, "learning_rate": 4.820236568659693e-07, "logits/chosen": -2.2202975749969482, "logits/rejected": -2.1933634281158447, "logps/chosen": -426.59521484375, "logps/rejected": -410.69586181640625, "loss": 0.3927, "rewards/accuracies": 0.75, "rewards/chosen": 2.16597318649292, "rewards/margins": 3.1341657638549805, "rewards/rejected": -0.9681925177574158, "step": 1146 }, { "epoch": 0.8379908675799087, "grad_norm": 58.17748596361719, "learning_rate": 4.819642191869374e-07, "logits/chosen": -2.4649453163146973, "logits/rejected": -2.03387713432312, "logps/chosen": -494.31158447265625, "logps/rejected": -460.0510559082031, "loss": 0.3764, "rewards/accuracies": 0.625, "rewards/chosen": 1.4768218994140625, "rewards/margins": 1.7726540565490723, "rewards/rejected": -0.29583215713500977, "step": 1147 }, { "epoch": 0.8387214611872146, "grad_norm": 39.462665925569766, "learning_rate": 4.81904687083417e-07, "logits/chosen": -3.2544918060302734, "logits/rejected": -2.9159250259399414, "logps/chosen": -520.8896484375, "logps/rejected": -403.1198425292969, "loss": 0.2333, "rewards/accuracies": 0.75, "rewards/chosen": 2.5991101264953613, "rewards/margins": 2.3857874870300293, "rewards/rejected": 0.2133227288722992, "step": 1148 }, { "epoch": 0.8394520547945206, "grad_norm": 53.47527219498864, "learning_rate": 4.818450605796413e-07, "logits/chosen": -2.6740732192993164, "logits/rejected": -2.6590051651000977, "logps/chosen": -443.2109680175781, "logps/rejected": -449.6946716308594, "loss": 0.3252, "rewards/accuracies": 0.875, "rewards/chosen": 1.8956598043441772, "rewards/margins": 1.4769967794418335, "rewards/rejected": 0.4186629056930542, "step": 1149 }, { "epoch": 0.8401826484018264, "grad_norm": 54.0109106273619, "learning_rate": 4.817853396998823e-07, "logits/chosen": -2.2141435146331787, "logits/rejected": -1.8249280452728271, "logps/chosen": -649.9528198242188, "logps/rejected": -479.8720703125, "loss": 0.375, "rewards/accuracies": 1.0, "rewards/chosen": 2.6071887016296387, "rewards/margins": 2.2950220108032227, "rewards/rejected": 0.3121665418148041, "step": 1150 }, { "epoch": 0.8409132420091324, "grad_norm": 55.827486527275475, "learning_rate": 4.817255244684501e-07, "logits/chosen": -2.5999085903167725, "logits/rejected": -2.280979871749878, "logps/chosen": -525.9398193359375, "logps/rejected": -424.6073913574219, "loss": 0.3544, "rewards/accuracies": 0.875, "rewards/chosen": 2.1649258136749268, "rewards/margins": 2.220848798751831, "rewards/rejected": -0.05592294782400131, "step": 1151 }, { "epoch": 0.8416438356164384, "grad_norm": 53.31796755374777, "learning_rate": 4.816656149096936e-07, "logits/chosen": -2.668992042541504, "logits/rejected": -2.2411584854125977, "logps/chosen": -544.4627075195312, "logps/rejected": -694.7655639648438, "loss": 0.3362, "rewards/accuracies": 0.875, "rewards/chosen": 2.4294095039367676, "rewards/margins": 2.429424285888672, "rewards/rejected": -1.4662742614746094e-05, "step": 1152 }, { "epoch": 0.8423744292237443, "grad_norm": 61.31230289532538, "learning_rate": 4.816056110479997e-07, "logits/chosen": -3.0760293006896973, "logits/rejected": -2.135615825653076, "logps/chosen": -1007.6103515625, "logps/rejected": -605.7556762695312, "loss": 0.3688, "rewards/accuracies": 1.0, "rewards/chosen": 3.8653252124786377, "rewards/margins": 3.299347400665283, "rewards/rejected": 0.5659779906272888, "step": 1153 }, { "epoch": 0.8431050228310503, "grad_norm": 92.83173727401906, "learning_rate": 4.815455129077939e-07, "logits/chosen": -3.215934991836548, "logits/rejected": -2.9957942962646484, "logps/chosen": -974.2650756835938, "logps/rejected": -897.4907836914062, "loss": 0.5907, "rewards/accuracies": 0.875, "rewards/chosen": 2.4568753242492676, "rewards/margins": 1.4064804315567017, "rewards/rejected": 1.050394892692566, "step": 1154 }, { "epoch": 0.8438356164383561, "grad_norm": 37.819115833939556, "learning_rate": 4.814853205135401e-07, "logits/chosen": -2.183413505554199, "logits/rejected": -1.5839524269104004, "logps/chosen": -363.00836181640625, "logps/rejected": -240.8004150390625, "loss": 0.2956, "rewards/accuracies": 0.875, "rewards/chosen": 1.1941512823104858, "rewards/margins": 0.595718502998352, "rewards/rejected": 0.598432719707489, "step": 1155 }, { "epoch": 0.8445662100456621, "grad_norm": 54.082540089922965, "learning_rate": 4.814250338897405e-07, "logits/chosen": -2.919023036956787, "logits/rejected": -2.58232045173645, "logps/chosen": -714.10791015625, "logps/rejected": -771.8385620117188, "loss": 0.3714, "rewards/accuracies": 0.625, "rewards/chosen": 0.7559317946434021, "rewards/margins": 0.9742828607559204, "rewards/rejected": -0.21835088729858398, "step": 1156 }, { "epoch": 0.845296803652968, "grad_norm": 44.72681154406647, "learning_rate": 4.813646530609355e-07, "logits/chosen": -2.975986957550049, "logits/rejected": -2.322751522064209, "logps/chosen": -569.8505859375, "logps/rejected": -405.8725280761719, "loss": 0.262, "rewards/accuracies": 1.0, "rewards/chosen": 1.9908015727996826, "rewards/margins": 2.0896291732788086, "rewards/rejected": -0.0988275408744812, "step": 1157 }, { "epoch": 0.846027397260274, "grad_norm": 77.88535745729104, "learning_rate": 4.813041780517043e-07, "logits/chosen": -3.0298447608947754, "logits/rejected": -2.038334608078003, "logps/chosen": -352.001708984375, "logps/rejected": -303.13592529296875, "loss": 0.5702, "rewards/accuracies": 0.875, "rewards/chosen": 1.7586981058120728, "rewards/margins": 2.6314210891723633, "rewards/rejected": -0.8727229833602905, "step": 1158 }, { "epoch": 0.8467579908675799, "grad_norm": 47.421249481047525, "learning_rate": 4.812436088866641e-07, "logits/chosen": -2.8930702209472656, "logits/rejected": -1.6912868022918701, "logps/chosen": -723.2269287109375, "logps/rejected": -463.92987060546875, "loss": 0.276, "rewards/accuracies": 1.0, "rewards/chosen": 2.964046001434326, "rewards/margins": 2.593785524368286, "rewards/rejected": 0.3702603280544281, "step": 1159 }, { "epoch": 0.8474885844748858, "grad_norm": 60.854737467724114, "learning_rate": 4.811829455904702e-07, "logits/chosen": -2.545156240463257, "logits/rejected": -1.8808379173278809, "logps/chosen": -606.6534423828125, "logps/rejected": -539.604736328125, "loss": 0.3904, "rewards/accuracies": 0.875, "rewards/chosen": 1.5287423133850098, "rewards/margins": 1.8069499731063843, "rewards/rejected": -0.27820760011672974, "step": 1160 }, { "epoch": 0.8482191780821918, "grad_norm": 68.39547796638313, "learning_rate": 4.811221881878167e-07, "logits/chosen": -3.5631754398345947, "logits/rejected": -2.617351531982422, "logps/chosen": -869.63916015625, "logps/rejected": -520.623779296875, "loss": 0.4661, "rewards/accuracies": 0.875, "rewards/chosen": 3.3304378986358643, "rewards/margins": 2.9384374618530273, "rewards/rejected": 0.3920007348060608, "step": 1161 }, { "epoch": 0.8489497716894977, "grad_norm": 51.83318391800879, "learning_rate": 4.810613367034358e-07, "logits/chosen": -2.9755752086639404, "logits/rejected": -2.185014247894287, "logps/chosen": -855.1490478515625, "logps/rejected": -518.4053955078125, "loss": 0.3484, "rewards/accuracies": 0.875, "rewards/chosen": 2.312859058380127, "rewards/margins": 1.8601951599121094, "rewards/rejected": 0.452663779258728, "step": 1162 }, { "epoch": 0.8496803652968037, "grad_norm": 55.75031530179896, "learning_rate": 4.810003911620981e-07, "logits/chosen": -2.7724103927612305, "logits/rejected": -2.0404672622680664, "logps/chosen": -586.2408447265625, "logps/rejected": -446.0126037597656, "loss": 0.4167, "rewards/accuracies": 0.875, "rewards/chosen": 2.644257068634033, "rewards/margins": 3.4915847778320312, "rewards/rejected": -0.8473278284072876, "step": 1163 }, { "epoch": 0.8504109589041096, "grad_norm": 62.20050900364482, "learning_rate": 4.809393515886122e-07, "logits/chosen": -2.7312614917755127, "logits/rejected": -2.1481773853302, "logps/chosen": -784.1828002929688, "logps/rejected": -578.4332885742188, "loss": 0.3984, "rewards/accuracies": 0.875, "rewards/chosen": 3.277348041534424, "rewards/margins": 2.980774164199829, "rewards/rejected": 0.2965737283229828, "step": 1164 }, { "epoch": 0.8511415525114155, "grad_norm": 68.47232989272017, "learning_rate": 4.808782180078253e-07, "logits/chosen": -2.790408134460449, "logits/rejected": -2.355661153793335, "logps/chosen": -737.09375, "logps/rejected": -658.03515625, "loss": 0.4189, "rewards/accuracies": 0.625, "rewards/chosen": 1.5985920429229736, "rewards/margins": 0.8126907348632812, "rewards/rejected": 0.7859013080596924, "step": 1165 }, { "epoch": 0.8518721461187214, "grad_norm": 68.72504163444538, "learning_rate": 4.808169904446228e-07, "logits/chosen": -2.9857327938079834, "logits/rejected": -2.2385201454162598, "logps/chosen": -517.3416137695312, "logps/rejected": -393.63848876953125, "loss": 0.3663, "rewards/accuracies": 0.875, "rewards/chosen": 2.0545074939727783, "rewards/margins": 1.9576892852783203, "rewards/rejected": 0.09681802988052368, "step": 1166 }, { "epoch": 0.8526027397260274, "grad_norm": 49.691111307443606, "learning_rate": 4.80755668923928e-07, "logits/chosen": -2.7338314056396484, "logits/rejected": -2.14643931388855, "logps/chosen": -970.3779296875, "logps/rejected": -833.6790771484375, "loss": 0.2957, "rewards/accuracies": 1.0, "rewards/chosen": 1.8034703731536865, "rewards/margins": 2.200063943862915, "rewards/rejected": -0.39659351110458374, "step": 1167 }, { "epoch": 0.8533333333333334, "grad_norm": 57.6226231546148, "learning_rate": 4.80694253470703e-07, "logits/chosen": -3.173300266265869, "logits/rejected": -2.5882911682128906, "logps/chosen": -464.8956298828125, "logps/rejected": -566.102783203125, "loss": 0.2896, "rewards/accuracies": 0.625, "rewards/chosen": 1.4632527828216553, "rewards/margins": 1.417544960975647, "rewards/rejected": 0.04570779204368591, "step": 1168 }, { "epoch": 0.8540639269406393, "grad_norm": 63.165826448004104, "learning_rate": 4.806327441099477e-07, "logits/chosen": -2.7150309085845947, "logits/rejected": -2.104326009750366, "logps/chosen": -607.9597778320312, "logps/rejected": -544.4454956054688, "loss": 0.346, "rewards/accuracies": 0.75, "rewards/chosen": 2.920083999633789, "rewards/margins": 2.9725496768951416, "rewards/rejected": -0.05246564745903015, "step": 1169 }, { "epoch": 0.8547945205479452, "grad_norm": 81.61176972123108, "learning_rate": 4.805711408667006e-07, "logits/chosen": -2.798332691192627, "logits/rejected": -1.894897699356079, "logps/chosen": -670.2100219726562, "logps/rejected": -356.3788146972656, "loss": 0.5529, "rewards/accuracies": 0.625, "rewards/chosen": 0.6252975463867188, "rewards/margins": 1.0415147542953491, "rewards/rejected": -0.41621720790863037, "step": 1170 }, { "epoch": 0.8555251141552511, "grad_norm": 63.133699508835015, "learning_rate": 4.805094437660381e-07, "logits/chosen": -2.8812255859375, "logits/rejected": -2.1111326217651367, "logps/chosen": -540.583984375, "logps/rejected": -509.3651123046875, "loss": 0.4066, "rewards/accuracies": 0.875, "rewards/chosen": 1.9263980388641357, "rewards/margins": 2.6339738368988037, "rewards/rejected": -0.7075756788253784, "step": 1171 }, { "epoch": 0.8562557077625571, "grad_norm": 59.90370487225206, "learning_rate": 4.80447652833075e-07, "logits/chosen": -3.3908321857452393, "logits/rejected": -2.5281217098236084, "logps/chosen": -646.0800170898438, "logps/rejected": -458.3226318359375, "loss": 0.4006, "rewards/accuracies": 0.75, "rewards/chosen": 2.2776026725769043, "rewards/margins": 1.7497296333312988, "rewards/rejected": 0.5278730392456055, "step": 1172 }, { "epoch": 0.856986301369863, "grad_norm": 75.15384611540183, "learning_rate": 4.803857680929639e-07, "logits/chosen": -2.942495346069336, "logits/rejected": -2.017162799835205, "logps/chosen": -487.8052062988281, "logps/rejected": -384.3017272949219, "loss": 0.5627, "rewards/accuracies": 0.75, "rewards/chosen": 1.6685643196105957, "rewards/margins": 1.8061342239379883, "rewards/rejected": -0.13757003843784332, "step": 1173 }, { "epoch": 0.857716894977169, "grad_norm": 46.38489091793523, "learning_rate": 4.803237895708964e-07, "logits/chosen": -2.6891720294952393, "logits/rejected": -2.4614877700805664, "logps/chosen": -490.8915100097656, "logps/rejected": -477.5807800292969, "loss": 0.2796, "rewards/accuracies": 0.75, "rewards/chosen": 2.108806610107422, "rewards/margins": 2.1254067420959473, "rewards/rejected": -0.01660016179084778, "step": 1174 }, { "epoch": 0.8584474885844748, "grad_norm": 54.03859433529313, "learning_rate": 4.802617172921015e-07, "logits/chosen": -2.4655117988586426, "logits/rejected": -2.3010146617889404, "logps/chosen": -829.5279541015625, "logps/rejected": -791.5563354492188, "loss": 0.2717, "rewards/accuracies": 0.75, "rewards/chosen": 2.4541139602661133, "rewards/margins": 0.9646883010864258, "rewards/rejected": 1.489425539970398, "step": 1175 }, { "epoch": 0.8591780821917808, "grad_norm": 42.66785413361196, "learning_rate": 4.801995512818467e-07, "logits/chosen": -2.767176628112793, "logits/rejected": -2.0278594493865967, "logps/chosen": -349.1517028808594, "logps/rejected": -260.6458740234375, "loss": 0.2616, "rewards/accuracies": 0.75, "rewards/chosen": 1.4300154447555542, "rewards/margins": 2.1785898208618164, "rewards/rejected": -0.748574435710907, "step": 1176 }, { "epoch": 0.8599086757990868, "grad_norm": 74.83453100841884, "learning_rate": 4.801372915654374e-07, "logits/chosen": -3.0324459075927734, "logits/rejected": -2.1796183586120605, "logps/chosen": -1048.12451171875, "logps/rejected": -604.1588134765625, "loss": 0.4115, "rewards/accuracies": 0.75, "rewards/chosen": 4.061223983764648, "rewards/margins": 2.9598264694213867, "rewards/rejected": 1.1013970375061035, "step": 1177 }, { "epoch": 0.8606392694063927, "grad_norm": 51.798383273666225, "learning_rate": 4.800749381682177e-07, "logits/chosen": -2.758538246154785, "logits/rejected": -2.0719361305236816, "logps/chosen": -928.2728271484375, "logps/rejected": -507.38214111328125, "loss": 0.2654, "rewards/accuracies": 0.875, "rewards/chosen": 3.291802167892456, "rewards/margins": 3.4852359294891357, "rewards/rejected": -0.19343379139900208, "step": 1178 }, { "epoch": 0.8613698630136987, "grad_norm": 57.45893149540591, "learning_rate": 4.800124911155692e-07, "logits/chosen": -3.0505497455596924, "logits/rejected": -2.416853427886963, "logps/chosen": -438.0838928222656, "logps/rejected": -446.9143371582031, "loss": 0.282, "rewards/accuracies": 1.0, "rewards/chosen": 2.6690468788146973, "rewards/margins": 4.276911735534668, "rewards/rejected": -1.6078649759292603, "step": 1179 }, { "epoch": 0.8621004566210045, "grad_norm": 68.97115353197488, "learning_rate": 4.799499504329121e-07, "logits/chosen": -2.5784912109375, "logits/rejected": -1.6952991485595703, "logps/chosen": -685.1495971679688, "logps/rejected": -531.748779296875, "loss": 0.3958, "rewards/accuracies": 0.75, "rewards/chosen": 2.778162956237793, "rewards/margins": 2.5632290840148926, "rewards/rejected": 0.21493393182754517, "step": 1180 }, { "epoch": 0.8628310502283105, "grad_norm": 48.72933005307866, "learning_rate": 4.798873161457045e-07, "logits/chosen": -2.648418426513672, "logits/rejected": -1.9545572996139526, "logps/chosen": -729.762939453125, "logps/rejected": -659.7337646484375, "loss": 0.2873, "rewards/accuracies": 0.875, "rewards/chosen": 3.4959025382995605, "rewards/margins": 3.618856906890869, "rewards/rejected": -0.12295415997505188, "step": 1181 }, { "epoch": 0.8635616438356164, "grad_norm": 86.95133505570084, "learning_rate": 4.798245882794423e-07, "logits/chosen": -2.4014527797698975, "logits/rejected": -2.3381996154785156, "logps/chosen": -898.221435546875, "logps/rejected": -740.67724609375, "loss": 0.5664, "rewards/accuracies": 0.75, "rewards/chosen": 2.7676329612731934, "rewards/margins": 2.4773900508880615, "rewards/rejected": 0.29024314880371094, "step": 1182 }, { "epoch": 0.8642922374429224, "grad_norm": 53.77103285793732, "learning_rate": 4.797617668596603e-07, "logits/chosen": -2.2661819458007812, "logits/rejected": -2.2743396759033203, "logps/chosen": -344.1710510253906, "logps/rejected": -544.9985961914062, "loss": 0.2793, "rewards/accuracies": 1.0, "rewards/chosen": 1.7824797630310059, "rewards/margins": 4.890385627746582, "rewards/rejected": -3.107905387878418, "step": 1183 }, { "epoch": 0.8650228310502283, "grad_norm": 96.93094150927763, "learning_rate": 4.796988519119305e-07, "logits/chosen": -2.818901777267456, "logits/rejected": -3.248593807220459, "logps/chosen": -530.7105102539062, "logps/rejected": -916.0430908203125, "loss": 0.6376, "rewards/accuracies": 0.75, "rewards/chosen": 1.9788875579833984, "rewards/margins": 0.863681972026825, "rewards/rejected": 1.1152056455612183, "step": 1184 }, { "epoch": 0.8657534246575342, "grad_norm": 84.11893145366416, "learning_rate": 4.796358434618635e-07, "logits/chosen": -2.8269853591918945, "logits/rejected": -2.396540403366089, "logps/chosen": -610.131591796875, "logps/rejected": -443.5721130371094, "loss": 0.6434, "rewards/accuracies": 0.75, "rewards/chosen": 1.8193979263305664, "rewards/margins": 1.7173216342926025, "rewards/rejected": 0.10207618027925491, "step": 1185 }, { "epoch": 0.8664840182648402, "grad_norm": 66.75874094219797, "learning_rate": 4.795727415351079e-07, "logits/chosen": -3.0357062816619873, "logits/rejected": -2.297276496887207, "logps/chosen": -471.2340087890625, "logps/rejected": -408.87762451171875, "loss": 0.5072, "rewards/accuracies": 0.75, "rewards/chosen": 2.455597400665283, "rewards/margins": 2.598060369491577, "rewards/rejected": -0.14246252179145813, "step": 1186 }, { "epoch": 0.8672146118721461, "grad_norm": 47.578248152549975, "learning_rate": 4.795095461573503e-07, "logits/chosen": -2.4380686283111572, "logits/rejected": -2.378868818283081, "logps/chosen": -601.505615234375, "logps/rejected": -693.404541015625, "loss": 0.2549, "rewards/accuracies": 1.0, "rewards/chosen": 1.5152844190597534, "rewards/margins": 2.3553614616394043, "rewards/rejected": -0.8400770425796509, "step": 1187 }, { "epoch": 0.8679452054794521, "grad_norm": 70.13563171411163, "learning_rate": 4.794462573543151e-07, "logits/chosen": -3.009265661239624, "logits/rejected": -2.4314675331115723, "logps/chosen": -778.1229248046875, "logps/rejected": -532.7830200195312, "loss": 0.5321, "rewards/accuracies": 0.75, "rewards/chosen": 1.8921750783920288, "rewards/margins": 1.77798593044281, "rewards/rejected": 0.1141892671585083, "step": 1188 }, { "epoch": 0.868675799086758, "grad_norm": 70.88437575772605, "learning_rate": 4.793828751517652e-07, "logits/chosen": -2.6541800498962402, "logits/rejected": -2.4706528186798096, "logps/chosen": -670.8496704101562, "logps/rejected": -643.418212890625, "loss": 0.5174, "rewards/accuracies": 0.625, "rewards/chosen": 2.713850736618042, "rewards/margins": 1.7953530550003052, "rewards/rejected": 0.9184978008270264, "step": 1189 }, { "epoch": 0.869406392694064, "grad_norm": 53.63300994592206, "learning_rate": 4.79319399575501e-07, "logits/chosen": -3.2246227264404297, "logits/rejected": -2.5656955242156982, "logps/chosen": -617.3999633789062, "logps/rejected": -616.9451904296875, "loss": 0.2769, "rewards/accuracies": 0.875, "rewards/chosen": 3.4625277519226074, "rewards/margins": 4.046076774597168, "rewards/rejected": -0.5835492014884949, "step": 1190 }, { "epoch": 0.8701369863013698, "grad_norm": 66.43781602700001, "learning_rate": 4.792558306513615e-07, "logits/chosen": -2.899418830871582, "logits/rejected": -2.1853723526000977, "logps/chosen": -793.201416015625, "logps/rejected": -497.3349914550781, "loss": 0.4155, "rewards/accuracies": 0.75, "rewards/chosen": 2.4418561458587646, "rewards/margins": 2.0101475715637207, "rewards/rejected": 0.43170857429504395, "step": 1191 }, { "epoch": 0.8708675799086758, "grad_norm": 58.662792655886236, "learning_rate": 4.791921684052232e-07, "logits/chosen": -3.1033692359924316, "logits/rejected": -1.6978132724761963, "logps/chosen": -547.82421875, "logps/rejected": -365.7710876464844, "loss": 0.4031, "rewards/accuracies": 0.875, "rewards/chosen": 2.055694103240967, "rewards/margins": 2.2789230346679688, "rewards/rejected": -0.22322909533977509, "step": 1192 }, { "epoch": 0.8715981735159818, "grad_norm": 67.66740309341161, "learning_rate": 4.791284128630007e-07, "logits/chosen": -2.3403053283691406, "logits/rejected": -2.282888889312744, "logps/chosen": -312.1187438964844, "logps/rejected": -408.8284912109375, "loss": 0.3807, "rewards/accuracies": 0.875, "rewards/chosen": 1.5205974578857422, "rewards/margins": 2.917832374572754, "rewards/rejected": -1.3972349166870117, "step": 1193 }, { "epoch": 0.8723287671232877, "grad_norm": 59.13528294895633, "learning_rate": 4.790645640506467e-07, "logits/chosen": -3.026278257369995, "logits/rejected": -1.8182159662246704, "logps/chosen": -896.363037109375, "logps/rejected": -494.61895751953125, "loss": 0.3768, "rewards/accuracies": 0.75, "rewards/chosen": 2.830047369003296, "rewards/margins": 3.2644996643066406, "rewards/rejected": -0.434452623128891, "step": 1194 }, { "epoch": 0.8730593607305936, "grad_norm": 58.0898415172567, "learning_rate": 4.79000621994152e-07, "logits/chosen": -2.555785655975342, "logits/rejected": -2.1339309215545654, "logps/chosen": -636.41552734375, "logps/rejected": -541.2765502929688, "loss": 0.5199, "rewards/accuracies": 1.0, "rewards/chosen": 2.63875150680542, "rewards/margins": 2.3741798400878906, "rewards/rejected": 0.26457175612449646, "step": 1195 }, { "epoch": 0.8737899543378995, "grad_norm": 58.77849209372847, "learning_rate": 4.789365867195449e-07, "logits/chosen": -3.071791172027588, "logits/rejected": -2.478966236114502, "logps/chosen": -701.010498046875, "logps/rejected": -751.8682861328125, "loss": 0.3463, "rewards/accuracies": 0.875, "rewards/chosen": 1.2137303352355957, "rewards/margins": 1.2205525636672974, "rewards/rejected": -0.00682232528924942, "step": 1196 }, { "epoch": 0.8745205479452055, "grad_norm": 194.92226014491007, "learning_rate": 4.78872458252892e-07, "logits/chosen": -2.600612163543701, "logits/rejected": -1.680503487586975, "logps/chosen": -464.55120849609375, "logps/rejected": -344.23028564453125, "loss": 0.4101, "rewards/accuracies": 1.0, "rewards/chosen": 3.0927202701568604, "rewards/margins": 3.1743228435516357, "rewards/rejected": -0.08160239458084106, "step": 1197 }, { "epoch": 0.8752511415525114, "grad_norm": 70.80257680912821, "learning_rate": 4.788082366202978e-07, "logits/chosen": -2.377553939819336, "logits/rejected": -2.182060956954956, "logps/chosen": -655.5340576171875, "logps/rejected": -579.2108154296875, "loss": 0.5757, "rewards/accuracies": 0.625, "rewards/chosen": 1.8377069234848022, "rewards/margins": 1.1148796081542969, "rewards/rejected": 0.7228274345397949, "step": 1198 }, { "epoch": 0.8759817351598174, "grad_norm": 78.60139127863768, "learning_rate": 4.787439218479046e-07, "logits/chosen": -3.228482723236084, "logits/rejected": -2.2190186977386475, "logps/chosen": -986.6474609375, "logps/rejected": -723.1559448242188, "loss": 0.4406, "rewards/accuracies": 0.75, "rewards/chosen": 2.6480460166931152, "rewards/margins": 1.550264835357666, "rewards/rejected": 1.0977813005447388, "step": 1199 }, { "epoch": 0.8767123287671232, "grad_norm": 45.81987758212177, "learning_rate": 4.786795139618927e-07, "logits/chosen": -2.5289487838745117, "logits/rejected": -1.730646014213562, "logps/chosen": -553.0811767578125, "logps/rejected": -378.5467224121094, "loss": 0.2799, "rewards/accuracies": 0.875, "rewards/chosen": 2.0570249557495117, "rewards/margins": 2.3570566177368164, "rewards/rejected": -0.3000316619873047, "step": 1200 }, { "epoch": 0.8774429223744292, "grad_norm": 80.39596829910067, "learning_rate": 4.786150129884802e-07, "logits/chosen": -2.7492668628692627, "logits/rejected": -2.120206832885742, "logps/chosen": -666.0162963867188, "logps/rejected": -465.51324462890625, "loss": 0.5696, "rewards/accuracies": 0.875, "rewards/chosen": 3.3644399642944336, "rewards/margins": 2.66229248046875, "rewards/rejected": 0.7021472454071045, "step": 1201 }, { "epoch": 0.8781735159817352, "grad_norm": 67.32950472884269, "learning_rate": 4.785504189539234e-07, "logits/chosen": -2.9234771728515625, "logits/rejected": -2.447617530822754, "logps/chosen": -858.8395385742188, "logps/rejected": -612.5498046875, "loss": 0.4585, "rewards/accuracies": 0.625, "rewards/chosen": 2.3821592330932617, "rewards/margins": 1.0235552787780762, "rewards/rejected": 1.358604073524475, "step": 1202 }, { "epoch": 0.8789041095890411, "grad_norm": 45.77979413214142, "learning_rate": 4.784857318845162e-07, "logits/chosen": -3.518521308898926, "logits/rejected": -2.120211601257324, "logps/chosen": -659.80712890625, "logps/rejected": -469.27313232421875, "loss": 0.2838, "rewards/accuracies": 0.875, "rewards/chosen": 2.9658474922180176, "rewards/margins": 2.946728229522705, "rewards/rejected": 0.0191192626953125, "step": 1203 }, { "epoch": 0.8796347031963471, "grad_norm": 46.262163202131234, "learning_rate": 4.784209518065903e-07, "logits/chosen": -2.8169004917144775, "logits/rejected": -2.0555343627929688, "logps/chosen": -516.8042602539062, "logps/rejected": -329.64208984375, "loss": 0.2824, "rewards/accuracies": 0.875, "rewards/chosen": 1.0366711616516113, "rewards/margins": 1.2382686138153076, "rewards/rejected": -0.20159748196601868, "step": 1204 }, { "epoch": 0.8803652968036529, "grad_norm": 51.95549806610341, "learning_rate": 4.783560787465155e-07, "logits/chosen": -3.0148208141326904, "logits/rejected": -2.299903154373169, "logps/chosen": -970.9921875, "logps/rejected": -714.4722290039062, "loss": 0.3944, "rewards/accuracies": 1.0, "rewards/chosen": 2.926079511642456, "rewards/margins": 2.354231834411621, "rewards/rejected": 0.5718475580215454, "step": 1205 }, { "epoch": 0.8810958904109589, "grad_norm": 61.2952061332848, "learning_rate": 4.782911127306993e-07, "logits/chosen": -2.696573257446289, "logits/rejected": -2.2997403144836426, "logps/chosen": -702.6087646484375, "logps/rejected": -687.318359375, "loss": 0.3288, "rewards/accuracies": 0.875, "rewards/chosen": 3.1563916206359863, "rewards/margins": 2.2439680099487305, "rewards/rejected": 0.9124234914779663, "step": 1206 }, { "epoch": 0.8818264840182648, "grad_norm": 58.466386577032445, "learning_rate": 4.782260537855873e-07, "logits/chosen": -1.9477078914642334, "logits/rejected": -2.283916711807251, "logps/chosen": -276.1191711425781, "logps/rejected": -405.86309814453125, "loss": 0.3664, "rewards/accuracies": 0.625, "rewards/chosen": 1.2464537620544434, "rewards/margins": 1.1053152084350586, "rewards/rejected": 0.14113855361938477, "step": 1207 }, { "epoch": 0.8825570776255708, "grad_norm": 61.922898127429406, "learning_rate": 4.781609019376623e-07, "logits/chosen": -2.5377883911132812, "logits/rejected": -1.9083365201950073, "logps/chosen": -646.25341796875, "logps/rejected": -421.78369140625, "loss": 0.4206, "rewards/accuracies": 0.875, "rewards/chosen": 2.5980234146118164, "rewards/margins": 2.8773226737976074, "rewards/rejected": -0.27929919958114624, "step": 1208 }, { "epoch": 0.8832876712328767, "grad_norm": 63.31352571972675, "learning_rate": 4.780956572134456e-07, "logits/chosen": -3.05536150932312, "logits/rejected": -2.5705671310424805, "logps/chosen": -719.3406982421875, "logps/rejected": -514.08837890625, "loss": 0.3881, "rewards/accuracies": 0.5, "rewards/chosen": 1.4725315570831299, "rewards/margins": 1.4257622957229614, "rewards/rejected": 0.04676932096481323, "step": 1209 }, { "epoch": 0.8840182648401826, "grad_norm": 67.51974176004933, "learning_rate": 4.780303196394959e-07, "logits/chosen": -2.9138810634613037, "logits/rejected": -1.7573745250701904, "logps/chosen": -750.5528564453125, "logps/rejected": -531.6885375976562, "loss": 0.3716, "rewards/accuracies": 1.0, "rewards/chosen": 3.0881097316741943, "rewards/margins": 4.068377494812012, "rewards/rejected": -0.9802677631378174, "step": 1210 }, { "epoch": 0.8847488584474886, "grad_norm": 45.67904012817312, "learning_rate": 4.7796488924241e-07, "logits/chosen": -3.4458675384521484, "logits/rejected": -1.6953095197677612, "logps/chosen": -636.7830810546875, "logps/rejected": -332.2205810546875, "loss": 0.3421, "rewards/accuracies": 1.0, "rewards/chosen": 1.783639669418335, "rewards/margins": 2.0474178791046143, "rewards/rejected": -0.26377812027931213, "step": 1211 }, { "epoch": 0.8854794520547945, "grad_norm": 59.38044794085213, "learning_rate": 4.778993660488223e-07, "logits/chosen": -2.8803930282592773, "logits/rejected": -2.0587968826293945, "logps/chosen": -811.095458984375, "logps/rejected": -601.9857788085938, "loss": 0.33, "rewards/accuracies": 0.875, "rewards/chosen": 2.1144070625305176, "rewards/margins": 2.3344554901123047, "rewards/rejected": -0.22004815936088562, "step": 1212 }, { "epoch": 0.8862100456621005, "grad_norm": 59.12651334281538, "learning_rate": 4.778337500854048e-07, "logits/chosen": -2.970757246017456, "logits/rejected": -2.4831955432891846, "logps/chosen": -926.642822265625, "logps/rejected": -701.00244140625, "loss": 0.315, "rewards/accuracies": 0.75, "rewards/chosen": 3.191118001937866, "rewards/margins": 3.606895685195923, "rewards/rejected": -0.4157772362232208, "step": 1213 }, { "epoch": 0.8869406392694064, "grad_norm": 65.3284594735518, "learning_rate": 4.777680413788676e-07, "logits/chosen": -2.632920265197754, "logits/rejected": -2.2828545570373535, "logps/chosen": -666.1655883789062, "logps/rejected": -542.5079956054688, "loss": 0.4414, "rewards/accuracies": 0.75, "rewards/chosen": 2.9071269035339355, "rewards/margins": 1.579845666885376, "rewards/rejected": 1.3272812366485596, "step": 1214 }, { "epoch": 0.8876712328767123, "grad_norm": 66.05612108512352, "learning_rate": 4.777022399559584e-07, "logits/chosen": -3.1725566387176514, "logits/rejected": -2.688878297805786, "logps/chosen": -863.8591918945312, "logps/rejected": -767.615966796875, "loss": 0.3792, "rewards/accuracies": 0.875, "rewards/chosen": 3.2813069820404053, "rewards/margins": 2.0678539276123047, "rewards/rejected": 1.213452935218811, "step": 1215 }, { "epoch": 0.8884018264840182, "grad_norm": 59.65999857264078, "learning_rate": 4.776363458434627e-07, "logits/chosen": -2.860933542251587, "logits/rejected": -2.112539768218994, "logps/chosen": -902.57958984375, "logps/rejected": -593.3650512695312, "loss": 0.3225, "rewards/accuracies": 1.0, "rewards/chosen": 4.0332159996032715, "rewards/margins": 4.412878036499023, "rewards/rejected": -0.3796616196632385, "step": 1216 }, { "epoch": 0.8891324200913242, "grad_norm": 53.12560661418371, "learning_rate": 4.775703590682036e-07, "logits/chosen": -3.2087411880493164, "logits/rejected": -2.9111971855163574, "logps/chosen": -591.6298828125, "logps/rejected": -499.0987548828125, "loss": 0.2943, "rewards/accuracies": 0.875, "rewards/chosen": 2.2759804725646973, "rewards/margins": 1.9235241413116455, "rewards/rejected": 0.3524564802646637, "step": 1217 }, { "epoch": 0.8898630136986302, "grad_norm": 62.1510394843431, "learning_rate": 4.77504279657042e-07, "logits/chosen": -2.7136192321777344, "logits/rejected": -2.0890657901763916, "logps/chosen": -318.6436767578125, "logps/rejected": -214.86752319335938, "loss": 0.4333, "rewards/accuracies": 0.75, "rewards/chosen": 0.9261701703071594, "rewards/margins": 1.1345574855804443, "rewards/rejected": -0.20838727056980133, "step": 1218 }, { "epoch": 0.8905936073059361, "grad_norm": 40.96281719533124, "learning_rate": 4.774381076368765e-07, "logits/chosen": -3.125690221786499, "logits/rejected": -2.1455748081207275, "logps/chosen": -565.1483154296875, "logps/rejected": -383.1591491699219, "loss": 0.2632, "rewards/accuracies": 0.875, "rewards/chosen": 1.848198652267456, "rewards/margins": 2.941026449203491, "rewards/rejected": -1.0928277969360352, "step": 1219 }, { "epoch": 0.891324200913242, "grad_norm": 59.72954589098891, "learning_rate": 4.773718430346434e-07, "logits/chosen": -3.033721446990967, "logits/rejected": -3.021104574203491, "logps/chosen": -612.2870483398438, "logps/rejected": -667.0337524414062, "loss": 0.3764, "rewards/accuracies": 0.75, "rewards/chosen": 2.5661110877990723, "rewards/margins": 1.7753804922103882, "rewards/rejected": 0.7907308340072632, "step": 1220 }, { "epoch": 0.8920547945205479, "grad_norm": 56.80924684717929, "learning_rate": 4.773054858773168e-07, "logits/chosen": -3.2351577281951904, "logits/rejected": -2.468411445617676, "logps/chosen": -999.1766357421875, "logps/rejected": -772.11083984375, "loss": 0.2986, "rewards/accuracies": 1.0, "rewards/chosen": 3.712184190750122, "rewards/margins": 3.016145944595337, "rewards/rejected": 0.6960383057594299, "step": 1221 }, { "epoch": 0.8927853881278539, "grad_norm": 40.51326280232962, "learning_rate": 4.772390361919082e-07, "logits/chosen": -2.7681570053100586, "logits/rejected": -2.4296913146972656, "logps/chosen": -543.913818359375, "logps/rejected": -439.712158203125, "loss": 0.1751, "rewards/accuracies": 0.875, "rewards/chosen": 2.4078938961029053, "rewards/margins": 2.845614433288574, "rewards/rejected": -0.43772047758102417, "step": 1222 }, { "epoch": 0.8935159817351598, "grad_norm": 76.93623704284282, "learning_rate": 4.771724940054671e-07, "logits/chosen": -3.349977493286133, "logits/rejected": -2.5534942150115967, "logps/chosen": -669.622314453125, "logps/rejected": -505.37176513671875, "loss": 0.5956, "rewards/accuracies": 0.625, "rewards/chosen": 2.283154249191284, "rewards/margins": 2.5295071601867676, "rewards/rejected": -0.2463531196117401, "step": 1223 }, { "epoch": 0.8942465753424658, "grad_norm": 49.255608718544686, "learning_rate": 4.771058593450804e-07, "logits/chosen": -3.055372953414917, "logits/rejected": -1.861836552619934, "logps/chosen": -616.490234375, "logps/rejected": -460.76873779296875, "loss": 0.3334, "rewards/accuracies": 0.875, "rewards/chosen": 2.7507989406585693, "rewards/margins": 2.9583919048309326, "rewards/rejected": -0.2075928896665573, "step": 1224 }, { "epoch": 0.8949771689497716, "grad_norm": 68.2955863096218, "learning_rate": 4.770391322378727e-07, "logits/chosen": -2.6038050651550293, "logits/rejected": -2.082127094268799, "logps/chosen": -441.75347900390625, "logps/rejected": -442.94671630859375, "loss": 0.4465, "rewards/accuracies": 0.75, "rewards/chosen": 3.0467827320098877, "rewards/margins": 4.14687967300415, "rewards/rejected": -1.1000968217849731, "step": 1225 }, { "epoch": 0.8957077625570776, "grad_norm": 72.65499326870109, "learning_rate": 4.769723127110063e-07, "logits/chosen": -3.0517218112945557, "logits/rejected": -1.9090113639831543, "logps/chosen": -525.0613403320312, "logps/rejected": -354.4062805175781, "loss": 0.3345, "rewards/accuracies": 0.875, "rewards/chosen": 2.8020339012145996, "rewards/margins": 4.569541931152344, "rewards/rejected": -1.767507791519165, "step": 1226 }, { "epoch": 0.8964383561643836, "grad_norm": 68.89053463080941, "learning_rate": 4.769054007916811e-07, "logits/chosen": -2.386873483657837, "logits/rejected": -2.194206953048706, "logps/chosen": -903.6959838867188, "logps/rejected": -702.8628540039062, "loss": 0.5138, "rewards/accuracies": 0.75, "rewards/chosen": 3.339151382446289, "rewards/margins": 2.1074087619781494, "rewards/rejected": 1.231742262840271, "step": 1227 }, { "epoch": 0.8971689497716895, "grad_norm": 66.77851904219463, "learning_rate": 4.768383965071346e-07, "logits/chosen": -2.8163089752197266, "logits/rejected": -2.028419256210327, "logps/chosen": -657.0010375976562, "logps/rejected": -421.4444274902344, "loss": 0.4624, "rewards/accuracies": 0.875, "rewards/chosen": 1.6031498908996582, "rewards/margins": 2.7467470169067383, "rewards/rejected": -1.14359712600708, "step": 1228 }, { "epoch": 0.8978995433789955, "grad_norm": 46.665330558905026, "learning_rate": 4.767712998846419e-07, "logits/chosen": -2.1141529083251953, "logits/rejected": -2.096130609512329, "logps/chosen": -425.8871765136719, "logps/rejected": -438.31842041015625, "loss": 0.2426, "rewards/accuracies": 0.75, "rewards/chosen": 1.1156892776489258, "rewards/margins": 1.7393040657043457, "rewards/rejected": -0.6236147880554199, "step": 1229 }, { "epoch": 0.8986301369863013, "grad_norm": 62.151381235280084, "learning_rate": 4.767041109515156e-07, "logits/chosen": -2.958451986312866, "logits/rejected": -2.257251262664795, "logps/chosen": -916.2105102539062, "logps/rejected": -616.1754760742188, "loss": 0.3635, "rewards/accuracies": 0.625, "rewards/chosen": 2.8420114517211914, "rewards/margins": 1.578662633895874, "rewards/rejected": 1.2633486986160278, "step": 1230 }, { "epoch": 0.8993607305936073, "grad_norm": 75.15267663022722, "learning_rate": 4.76636829735106e-07, "logits/chosen": -2.877685546875, "logits/rejected": -2.0931780338287354, "logps/chosen": -511.87664794921875, "logps/rejected": -285.2847900390625, "loss": 0.5164, "rewards/accuracies": 1.0, "rewards/chosen": 2.063027858734131, "rewards/margins": 2.275540590286255, "rewards/rejected": -0.2125125527381897, "step": 1231 }, { "epoch": 0.9000913242009132, "grad_norm": 69.40367158087712, "learning_rate": 4.76569456262801e-07, "logits/chosen": -2.723695993423462, "logits/rejected": -2.0636041164398193, "logps/chosen": -736.9896850585938, "logps/rejected": -567.900634765625, "loss": 0.429, "rewards/accuracies": 0.75, "rewards/chosen": 1.6102312803268433, "rewards/margins": 1.1121175289154053, "rewards/rejected": 0.498113751411438, "step": 1232 }, { "epoch": 0.9008219178082192, "grad_norm": 51.100190725549155, "learning_rate": 4.7650199056202577e-07, "logits/chosen": -2.638429641723633, "logits/rejected": -1.7131280899047852, "logps/chosen": -510.17041015625, "logps/rejected": -367.1796875, "loss": 0.2609, "rewards/accuracies": 1.0, "rewards/chosen": 2.830970048904419, "rewards/margins": 3.5483882427215576, "rewards/rejected": -0.7174184322357178, "step": 1233 }, { "epoch": 0.9015525114155251, "grad_norm": 57.200559898783986, "learning_rate": 4.764344326602435e-07, "logits/chosen": -2.44842267036438, "logits/rejected": -1.5128636360168457, "logps/chosen": -625.4161376953125, "logps/rejected": -432.75408935546875, "loss": 0.3482, "rewards/accuracies": 0.875, "rewards/chosen": 2.2755913734436035, "rewards/margins": 2.3972909450531006, "rewards/rejected": -0.12169978022575378, "step": 1234 }, { "epoch": 0.902283105022831, "grad_norm": 49.254104365778495, "learning_rate": 4.7636678258495444e-07, "logits/chosen": -2.8338212966918945, "logits/rejected": -1.9641540050506592, "logps/chosen": -637.9423217773438, "logps/rejected": -443.1888427734375, "loss": 0.3179, "rewards/accuracies": 0.875, "rewards/chosen": 1.5753555297851562, "rewards/margins": 1.252667784690857, "rewards/rejected": 0.32268768548965454, "step": 1235 }, { "epoch": 0.903013698630137, "grad_norm": 68.82565794386748, "learning_rate": 4.7629904036369663e-07, "logits/chosen": -3.056306838989258, "logits/rejected": -2.386263370513916, "logps/chosen": -1113.517822265625, "logps/rejected": -804.287841796875, "loss": 0.3961, "rewards/accuracies": 0.75, "rewards/chosen": 3.1978254318237305, "rewards/margins": 1.3571339845657349, "rewards/rejected": 1.8406914472579956, "step": 1236 }, { "epoch": 0.9037442922374429, "grad_norm": 75.59736388116487, "learning_rate": 4.7623120602404547e-07, "logits/chosen": -3.181628704071045, "logits/rejected": -1.7766757011413574, "logps/chosen": -598.3563232421875, "logps/rejected": -380.39971923828125, "loss": 0.3817, "rewards/accuracies": 1.0, "rewards/chosen": 3.934945583343506, "rewards/margins": 5.150116920471191, "rewards/rejected": -1.215170979499817, "step": 1237 }, { "epoch": 0.9044748858447489, "grad_norm": 67.56304925771187, "learning_rate": 4.761632795936141e-07, "logits/chosen": -2.144857168197632, "logits/rejected": -2.2961018085479736, "logps/chosen": -410.9783935546875, "logps/rejected": -480.8815002441406, "loss": 0.3571, "rewards/accuracies": 0.75, "rewards/chosen": 2.282414197921753, "rewards/margins": 3.221797466278076, "rewards/rejected": -0.9393835067749023, "step": 1238 }, { "epoch": 0.9052054794520548, "grad_norm": 67.56231570858307, "learning_rate": 4.7609526110005285e-07, "logits/chosen": -2.965836524963379, "logits/rejected": -2.3374812602996826, "logps/chosen": -418.4992370605469, "logps/rejected": -333.24224853515625, "loss": 0.4627, "rewards/accuracies": 0.75, "rewards/chosen": 2.657029867172241, "rewards/margins": 3.026815414428711, "rewards/rejected": -0.3697856068611145, "step": 1239 }, { "epoch": 0.9059360730593607, "grad_norm": 82.03893118799604, "learning_rate": 4.760271505710497e-07, "logits/chosen": -2.9225869178771973, "logits/rejected": -2.3007359504699707, "logps/chosen": -778.6843872070312, "logps/rejected": -646.7734375, "loss": 0.5385, "rewards/accuracies": 0.875, "rewards/chosen": 2.408797264099121, "rewards/margins": 1.6748614311218262, "rewards/rejected": 0.7339357137680054, "step": 1240 }, { "epoch": 0.9066666666666666, "grad_norm": 71.9537442291487, "learning_rate": 4.7595894803433006e-07, "logits/chosen": -2.7833352088928223, "logits/rejected": -2.3709752559661865, "logps/chosen": -1119.340576171875, "logps/rejected": -838.8836059570312, "loss": 0.3981, "rewards/accuracies": 0.625, "rewards/chosen": 2.9869019985198975, "rewards/margins": 1.4327690601348877, "rewards/rejected": 1.5541330575942993, "step": 1241 }, { "epoch": 0.9073972602739726, "grad_norm": 60.734015225973245, "learning_rate": 4.7589065351765677e-07, "logits/chosen": -2.963721752166748, "logits/rejected": -2.0037648677825928, "logps/chosen": -754.3323974609375, "logps/rejected": -428.34637451171875, "loss": 0.3381, "rewards/accuracies": 0.875, "rewards/chosen": 1.939276933670044, "rewards/margins": 2.1642403602600098, "rewards/rejected": -0.2249635010957718, "step": 1242 }, { "epoch": 0.9081278538812786, "grad_norm": 59.069774684266406, "learning_rate": 4.7582226704883023e-07, "logits/chosen": -3.030691385269165, "logits/rejected": -2.2291386127471924, "logps/chosen": -511.5264892578125, "logps/rejected": -465.883056640625, "loss": 0.4251, "rewards/accuracies": 0.875, "rewards/chosen": 2.6169955730438232, "rewards/margins": 3.9088826179504395, "rewards/rejected": -1.291886806488037, "step": 1243 }, { "epoch": 0.9088584474885845, "grad_norm": 73.43040856718522, "learning_rate": 4.757537886556879e-07, "logits/chosen": -2.827599287033081, "logits/rejected": -2.2016186714172363, "logps/chosen": -822.708251953125, "logps/rejected": -551.6522216796875, "loss": 0.4931, "rewards/accuracies": 0.625, "rewards/chosen": 3.519706964492798, "rewards/margins": 2.274744987487793, "rewards/rejected": 1.2449617385864258, "step": 1244 }, { "epoch": 0.9095890410958904, "grad_norm": 55.369376213441534, "learning_rate": 4.7568521836610533e-07, "logits/chosen": -2.6265931129455566, "logits/rejected": -1.6507599353790283, "logps/chosen": -669.18798828125, "logps/rejected": -381.2083740234375, "loss": 0.4054, "rewards/accuracies": 0.875, "rewards/chosen": 2.020350217819214, "rewards/margins": 2.1454219818115234, "rewards/rejected": -0.1250716894865036, "step": 1245 }, { "epoch": 0.9103196347031963, "grad_norm": 68.18216494940228, "learning_rate": 4.7561655620799473e-07, "logits/chosen": -3.1350111961364746, "logits/rejected": -2.4138617515563965, "logps/chosen": -682.8194580078125, "logps/rejected": -696.8140869140625, "loss": 0.3691, "rewards/accuracies": 0.75, "rewards/chosen": 3.081618309020996, "rewards/margins": 2.0208523273468018, "rewards/rejected": 1.0607659816741943, "step": 1246 }, { "epoch": 0.9110502283105023, "grad_norm": 63.3502555860077, "learning_rate": 4.7554780220930625e-07, "logits/chosen": -1.8479886054992676, "logits/rejected": -2.317026138305664, "logps/chosen": -290.1664123535156, "logps/rejected": -380.9139099121094, "loss": 0.489, "rewards/accuracies": 0.875, "rewards/chosen": 0.779083251953125, "rewards/margins": 1.8761186599731445, "rewards/rejected": -1.0970354080200195, "step": 1247 }, { "epoch": 0.9117808219178082, "grad_norm": 55.59123580247947, "learning_rate": 4.7547895639802707e-07, "logits/chosen": -2.841759204864502, "logits/rejected": -1.904705286026001, "logps/chosen": -697.755615234375, "logps/rejected": -364.1857604980469, "loss": 0.3252, "rewards/accuracies": 1.0, "rewards/chosen": 2.5421409606933594, "rewards/margins": 2.2342443466186523, "rewards/rejected": 0.3078964948654175, "step": 1248 }, { "epoch": 0.9125114155251142, "grad_norm": 70.29325778368613, "learning_rate": 4.7541001880218213e-07, "logits/chosen": -2.2533116340637207, "logits/rejected": -2.6153900623321533, "logps/chosen": -580.6163940429688, "logps/rejected": -792.1041259765625, "loss": 0.4256, "rewards/accuracies": 0.75, "rewards/chosen": 0.25805699825286865, "rewards/margins": 1.003227949142456, "rewards/rejected": -0.7451708912849426, "step": 1249 }, { "epoch": 0.91324200913242, "grad_norm": 56.17460876518745, "learning_rate": 4.7534098944983327e-07, "logits/chosen": -2.9144818782806396, "logits/rejected": -2.5971932411193848, "logps/chosen": -454.8157653808594, "logps/rejected": -345.64593505859375, "loss": 0.3909, "rewards/accuracies": 0.5, "rewards/chosen": 1.5415160655975342, "rewards/margins": 1.2131448984146118, "rewards/rejected": 0.3283711075782776, "step": 1250 }, { "epoch": 0.913972602739726, "grad_norm": 46.201555604052714, "learning_rate": 4.7527186836908e-07, "logits/chosen": -2.6631569862365723, "logits/rejected": -1.6237496137619019, "logps/chosen": -552.0026245117188, "logps/rejected": -306.9260559082031, "loss": 0.2643, "rewards/accuracies": 1.0, "rewards/chosen": 1.347454309463501, "rewards/margins": 1.4101835489273071, "rewards/rejected": -0.06272915005683899, "step": 1251 }, { "epoch": 0.914703196347032, "grad_norm": 68.59519858948767, "learning_rate": 4.7520265558805915e-07, "logits/chosen": -2.569385051727295, "logits/rejected": -1.8808252811431885, "logps/chosen": -486.8861083984375, "logps/rejected": -414.9339599609375, "loss": 0.5859, "rewards/accuracies": 0.875, "rewards/chosen": 2.0028727054595947, "rewards/margins": 2.5691094398498535, "rewards/rejected": -0.5662367939949036, "step": 1252 }, { "epoch": 0.9154337899543379, "grad_norm": 74.97934937055169, "learning_rate": 4.751333511349448e-07, "logits/chosen": -2.369375228881836, "logits/rejected": -2.622237205505371, "logps/chosen": -609.042724609375, "logps/rejected": -614.743408203125, "loss": 0.495, "rewards/accuracies": 0.75, "rewards/chosen": 1.7413614988327026, "rewards/margins": 1.671147108078003, "rewards/rejected": 0.07021437585353851, "step": 1253 }, { "epoch": 0.9161643835616439, "grad_norm": 60.19554925732155, "learning_rate": 4.7506395503794826e-07, "logits/chosen": -2.4733638763427734, "logits/rejected": -2.4493234157562256, "logps/chosen": -666.5484008789062, "logps/rejected": -569.2542114257812, "loss": 0.375, "rewards/accuracies": 0.875, "rewards/chosen": 1.5332555770874023, "rewards/margins": 1.6665904521942139, "rewards/rejected": -0.1333349347114563, "step": 1254 }, { "epoch": 0.9168949771689497, "grad_norm": 53.353776701197546, "learning_rate": 4.7499446732531835e-07, "logits/chosen": -2.849194049835205, "logits/rejected": -2.3133389949798584, "logps/chosen": -447.2489013671875, "logps/rejected": -465.9971008300781, "loss": 0.3279, "rewards/accuracies": 0.75, "rewards/chosen": 2.321946620941162, "rewards/margins": 3.109854221343994, "rewards/rejected": -0.7879076600074768, "step": 1255 }, { "epoch": 0.9176255707762557, "grad_norm": 80.97547070235993, "learning_rate": 4.7492488802534097e-07, "logits/chosen": -3.1023786067962646, "logits/rejected": -2.154487371444702, "logps/chosen": -744.657958984375, "logps/rejected": -497.3401184082031, "loss": 0.4653, "rewards/accuracies": 0.875, "rewards/chosen": 2.7113401889801025, "rewards/margins": 2.4599597454071045, "rewards/rejected": 0.2513805627822876, "step": 1256 }, { "epoch": 0.9183561643835616, "grad_norm": 72.73459150075189, "learning_rate": 4.748552171663395e-07, "logits/chosen": -3.132460355758667, "logits/rejected": -2.4617695808410645, "logps/chosen": -664.6790161132812, "logps/rejected": -441.0460510253906, "loss": 0.5679, "rewards/accuracies": 0.875, "rewards/chosen": 1.920000433921814, "rewards/margins": 1.9895610809326172, "rewards/rejected": -0.06956067681312561, "step": 1257 }, { "epoch": 0.9190867579908676, "grad_norm": 47.69256044988175, "learning_rate": 4.7478545477667453e-07, "logits/chosen": -2.9044251441955566, "logits/rejected": -2.0908737182617188, "logps/chosen": -646.6578369140625, "logps/rejected": -580.8163452148438, "loss": 0.2993, "rewards/accuracies": 1.0, "rewards/chosen": 1.6880111694335938, "rewards/margins": 1.6050724983215332, "rewards/rejected": 0.08293861150741577, "step": 1258 }, { "epoch": 0.9198173515981735, "grad_norm": 45.89739427005906, "learning_rate": 4.747156008847438e-07, "logits/chosen": -3.015489101409912, "logits/rejected": -2.331159830093384, "logps/chosen": -757.9979858398438, "logps/rejected": -517.8119506835938, "loss": 0.2409, "rewards/accuracies": 0.875, "rewards/chosen": 2.658370018005371, "rewards/margins": 2.1512298583984375, "rewards/rejected": 0.507140576839447, "step": 1259 }, { "epoch": 0.9205479452054794, "grad_norm": 52.96171136862696, "learning_rate": 4.746456555189824e-07, "logits/chosen": -2.695352792739868, "logits/rejected": -1.8137428760528564, "logps/chosen": -585.5433959960938, "logps/rejected": -327.760009765625, "loss": 0.2847, "rewards/accuracies": 0.625, "rewards/chosen": 1.6656484603881836, "rewards/margins": 1.5232337713241577, "rewards/rejected": 0.14241473376750946, "step": 1260 }, { "epoch": 0.9212785388127854, "grad_norm": 56.69244612173323, "learning_rate": 4.7457561870786257e-07, "logits/chosen": -2.5670130252838135, "logits/rejected": -2.5212321281433105, "logps/chosen": -489.8707580566406, "logps/rejected": -461.2422790527344, "loss": 0.3332, "rewards/accuracies": 1.0, "rewards/chosen": 1.9540956020355225, "rewards/margins": 1.7419583797454834, "rewards/rejected": 0.2121371328830719, "step": 1261 }, { "epoch": 0.9220091324200913, "grad_norm": 61.51421805922189, "learning_rate": 4.7450549047989396e-07, "logits/chosen": -2.939694404602051, "logits/rejected": -1.7230510711669922, "logps/chosen": -814.558837890625, "logps/rejected": -393.64556884765625, "loss": 0.3766, "rewards/accuracies": 1.0, "rewards/chosen": 3.2582364082336426, "rewards/margins": 2.6139283180236816, "rewards/rejected": 0.64430832862854, "step": 1262 }, { "epoch": 0.9227397260273973, "grad_norm": 68.55252177846924, "learning_rate": 4.7443527086362325e-07, "logits/chosen": -3.327254056930542, "logits/rejected": -2.613402843475342, "logps/chosen": -871.656005859375, "logps/rejected": -627.01416015625, "loss": 0.5492, "rewards/accuracies": 0.625, "rewards/chosen": 2.6398119926452637, "rewards/margins": 1.3196159601211548, "rewards/rejected": 1.3201961517333984, "step": 1263 }, { "epoch": 0.9234703196347032, "grad_norm": 105.06151471875893, "learning_rate": 4.7436495988763437e-07, "logits/chosen": -2.464966297149658, "logits/rejected": -2.2550034523010254, "logps/chosen": -467.2567138671875, "logps/rejected": -426.45135498046875, "loss": 0.7275, "rewards/accuracies": 0.625, "rewards/chosen": 1.3267972469329834, "rewards/margins": 1.2506390810012817, "rewards/rejected": 0.07615825533866882, "step": 1264 }, { "epoch": 0.9242009132420091, "grad_norm": 64.8024190568917, "learning_rate": 4.742945575805485e-07, "logits/chosen": -2.98636794090271, "logits/rejected": -2.3262972831726074, "logps/chosen": -512.5783081054688, "logps/rejected": -468.6345520019531, "loss": 0.3536, "rewards/accuracies": 0.75, "rewards/chosen": 3.0697052478790283, "rewards/margins": 2.7823286056518555, "rewards/rejected": 0.28737664222717285, "step": 1265 }, { "epoch": 0.924931506849315, "grad_norm": 49.3625874429461, "learning_rate": 4.742240639710239e-07, "logits/chosen": -2.8741555213928223, "logits/rejected": -1.5903488397598267, "logps/chosen": -413.34283447265625, "logps/rejected": -247.02540588378906, "loss": 0.3218, "rewards/accuracies": 0.875, "rewards/chosen": 3.478703022003174, "rewards/margins": 4.569772243499756, "rewards/rejected": -1.0910694599151611, "step": 1266 }, { "epoch": 0.925662100456621, "grad_norm": 53.687322649590655, "learning_rate": 4.741534790877559e-07, "logits/chosen": -2.6241021156311035, "logits/rejected": -2.380837917327881, "logps/chosen": -651.411376953125, "logps/rejected": -614.63134765625, "loss": 0.3208, "rewards/accuracies": 1.0, "rewards/chosen": 2.1610653400421143, "rewards/margins": 2.252394199371338, "rewards/rejected": -0.09132891893386841, "step": 1267 }, { "epoch": 0.926392694063927, "grad_norm": 54.40956157354147, "learning_rate": 4.7408280295947744e-07, "logits/chosen": -2.4927892684936523, "logits/rejected": -2.183445453643799, "logps/chosen": -674.6234130859375, "logps/rejected": -683.2291259765625, "loss": 0.2981, "rewards/accuracies": 1.0, "rewards/chosen": 1.7986613512039185, "rewards/margins": 1.5739578008651733, "rewards/rejected": 0.2247035950422287, "step": 1268 }, { "epoch": 0.9271232876712329, "grad_norm": 70.9501915959358, "learning_rate": 4.740120356149581e-07, "logits/chosen": -2.7983028888702393, "logits/rejected": -2.1649365425109863, "logps/chosen": -320.20281982421875, "logps/rejected": -204.68667602539062, "loss": 0.4439, "rewards/accuracies": 0.875, "rewards/chosen": 2.0505118370056152, "rewards/margins": 3.3381569385528564, "rewards/rejected": -1.2876453399658203, "step": 1269 }, { "epoch": 0.9278538812785389, "grad_norm": 66.1905356657487, "learning_rate": 4.739411770830048e-07, "logits/chosen": -2.649786949157715, "logits/rejected": -2.369515895843506, "logps/chosen": -456.9914855957031, "logps/rejected": -433.86175537109375, "loss": 0.4871, "rewards/accuracies": 0.75, "rewards/chosen": 0.5906431674957275, "rewards/margins": 0.6303116083145142, "rewards/rejected": -0.039668455719947815, "step": 1270 }, { "epoch": 0.9285844748858447, "grad_norm": 54.22785631380449, "learning_rate": 4.738702273924615e-07, "logits/chosen": -2.688528537750244, "logits/rejected": -2.273695468902588, "logps/chosen": -619.2506103515625, "logps/rejected": -499.6518249511719, "loss": 0.3326, "rewards/accuracies": 0.875, "rewards/chosen": 2.769840955734253, "rewards/margins": 2.006373643875122, "rewards/rejected": 0.7634676098823547, "step": 1271 }, { "epoch": 0.9293150684931507, "grad_norm": 76.58799320471192, "learning_rate": 4.7379918657220954e-07, "logits/chosen": -3.0489370822906494, "logits/rejected": -2.283557176589966, "logps/chosen": -468.66583251953125, "logps/rejected": -303.9598083496094, "loss": 0.4282, "rewards/accuracies": 1.0, "rewards/chosen": 2.368955373764038, "rewards/margins": 4.136275291442871, "rewards/rejected": -1.767319917678833, "step": 1272 }, { "epoch": 0.9300456621004566, "grad_norm": 66.77548816978053, "learning_rate": 4.737280546511669e-07, "logits/chosen": -2.7408294677734375, "logits/rejected": -2.128660202026367, "logps/chosen": -924.7740478515625, "logps/rejected": -597.5045166015625, "loss": 0.4795, "rewards/accuracies": 0.75, "rewards/chosen": 1.6678171157836914, "rewards/margins": 1.5136114358901978, "rewards/rejected": 0.15420566499233246, "step": 1273 }, { "epoch": 0.9307762557077626, "grad_norm": 65.95557735016615, "learning_rate": 4.73656831658289e-07, "logits/chosen": -3.031008720397949, "logits/rejected": -2.38031005859375, "logps/chosen": -658.2246704101562, "logps/rejected": -654.299072265625, "loss": 0.353, "rewards/accuracies": 1.0, "rewards/chosen": 3.349822759628296, "rewards/margins": 4.474318981170654, "rewards/rejected": -1.1244964599609375, "step": 1274 }, { "epoch": 0.9315068493150684, "grad_norm": 66.28765144712098, "learning_rate": 4.735855176225682e-07, "logits/chosen": -3.003955125808716, "logits/rejected": -2.685896873474121, "logps/chosen": -695.8663330078125, "logps/rejected": -921.1937255859375, "loss": 0.4084, "rewards/accuracies": 1.0, "rewards/chosen": 2.4133188724517822, "rewards/margins": 1.669716477394104, "rewards/rejected": 0.7436025142669678, "step": 1275 }, { "epoch": 0.9322374429223744, "grad_norm": 56.49171589572891, "learning_rate": 4.73514112573034e-07, "logits/chosen": -2.334848165512085, "logits/rejected": -2.2033629417419434, "logps/chosen": -773.4605712890625, "logps/rejected": -873.3026123046875, "loss": 0.3887, "rewards/accuracies": 0.875, "rewards/chosen": 2.7314836978912354, "rewards/margins": 2.7093286514282227, "rewards/rejected": 0.022155165672302246, "step": 1276 }, { "epoch": 0.9329680365296804, "grad_norm": 53.970966587369915, "learning_rate": 4.734426165387527e-07, "logits/chosen": -2.4289088249206543, "logits/rejected": -2.3258745670318604, "logps/chosen": -401.2014465332031, "logps/rejected": -495.5870361328125, "loss": 0.294, "rewards/accuracies": 0.75, "rewards/chosen": 0.6880079507827759, "rewards/margins": 1.043480396270752, "rewards/rejected": -0.35547250509262085, "step": 1277 }, { "epoch": 0.9336986301369863, "grad_norm": 42.770582809990444, "learning_rate": 4.7337102954882815e-07, "logits/chosen": -2.37939190864563, "logits/rejected": -2.2779064178466797, "logps/chosen": -731.103515625, "logps/rejected": -885.6277465820312, "loss": 0.2588, "rewards/accuracies": 0.875, "rewards/chosen": 2.056858539581299, "rewards/margins": 3.526559829711914, "rewards/rejected": -1.4697014093399048, "step": 1278 }, { "epoch": 0.9344292237442923, "grad_norm": 49.23280305329339, "learning_rate": 4.7329935163240053e-07, "logits/chosen": -3.0811257362365723, "logits/rejected": -2.5709900856018066, "logps/chosen": -910.91796875, "logps/rejected": -762.9908447265625, "loss": 0.2603, "rewards/accuracies": 0.875, "rewards/chosen": 2.8246278762817383, "rewards/margins": 1.3836390972137451, "rewards/rejected": 1.4409886598587036, "step": 1279 }, { "epoch": 0.9351598173515981, "grad_norm": 68.92750168315438, "learning_rate": 4.7322758281864773e-07, "logits/chosen": -2.6870107650756836, "logits/rejected": -2.157623052597046, "logps/chosen": -437.9079895019531, "logps/rejected": -429.9189147949219, "loss": 0.4611, "rewards/accuracies": 0.875, "rewards/chosen": 1.9415823221206665, "rewards/margins": 3.5254931449890137, "rewards/rejected": -1.5839107036590576, "step": 1280 }, { "epoch": 0.9358904109589041, "grad_norm": 61.72307935764891, "learning_rate": 4.731557231367841e-07, "logits/chosen": -2.728450298309326, "logits/rejected": -2.3236513137817383, "logps/chosen": -560.1114501953125, "logps/rejected": -457.6759338378906, "loss": 0.3371, "rewards/accuracies": 0.875, "rewards/chosen": 2.066861867904663, "rewards/margins": 2.16666316986084, "rewards/rejected": -0.09980162978172302, "step": 1281 }, { "epoch": 0.93662100456621, "grad_norm": 53.40954358261382, "learning_rate": 4.7308377261606127e-07, "logits/chosen": -3.314373016357422, "logits/rejected": -2.1848878860473633, "logps/chosen": -808.3495483398438, "logps/rejected": -572.12890625, "loss": 0.3007, "rewards/accuracies": 1.0, "rewards/chosen": 2.544532060623169, "rewards/margins": 2.7101142406463623, "rewards/rejected": -0.1655820906162262, "step": 1282 }, { "epoch": 0.937351598173516, "grad_norm": 62.99766899519477, "learning_rate": 4.7301173128576774e-07, "logits/chosen": -2.647855043411255, "logits/rejected": -1.8410847187042236, "logps/chosen": -943.3524169921875, "logps/rejected": -795.0311889648438, "loss": 0.4258, "rewards/accuracies": 0.75, "rewards/chosen": 3.009382963180542, "rewards/margins": 3.155881881713867, "rewards/rejected": -0.1464986801147461, "step": 1283 }, { "epoch": 0.938082191780822, "grad_norm": 72.84562014973858, "learning_rate": 4.7293959917522907e-07, "logits/chosen": -2.505863666534424, "logits/rejected": -2.3452439308166504, "logps/chosen": -819.3187866210938, "logps/rejected": -767.15087890625, "loss": 0.4405, "rewards/accuracies": 0.75, "rewards/chosen": 1.7436811923980713, "rewards/margins": 1.8069305419921875, "rewards/rejected": -0.06324940919876099, "step": 1284 }, { "epoch": 0.9388127853881278, "grad_norm": 68.32156990338044, "learning_rate": 4.7286737631380764e-07, "logits/chosen": -2.8294079303741455, "logits/rejected": -1.3851673603057861, "logps/chosen": -513.5989990234375, "logps/rejected": -287.3175964355469, "loss": 0.454, "rewards/accuracies": 0.875, "rewards/chosen": 1.851466417312622, "rewards/margins": 3.2821834087371826, "rewards/rejected": -1.43071711063385, "step": 1285 }, { "epoch": 0.9395433789954338, "grad_norm": 58.9970950688464, "learning_rate": 4.7279506273090286e-07, "logits/chosen": -2.762728452682495, "logits/rejected": -2.425452709197998, "logps/chosen": -796.92333984375, "logps/rejected": -714.4528198242188, "loss": 0.4262, "rewards/accuracies": 0.875, "rewards/chosen": 2.5572378635406494, "rewards/margins": 3.117528200149536, "rewards/rejected": -0.5602903366088867, "step": 1286 }, { "epoch": 0.9402739726027397, "grad_norm": 58.58864352748427, "learning_rate": 4.7272265845595103e-07, "logits/chosen": -2.8873672485351562, "logits/rejected": -2.737326145172119, "logps/chosen": -697.8809814453125, "logps/rejected": -666.955810546875, "loss": 0.3049, "rewards/accuracies": 0.875, "rewards/chosen": 2.52463698387146, "rewards/margins": 1.7483701705932617, "rewards/rejected": 0.7762668132781982, "step": 1287 }, { "epoch": 0.9410045662100457, "grad_norm": 70.7213840373352, "learning_rate": 4.7265016351842545e-07, "logits/chosen": -2.6096837520599365, "logits/rejected": -1.9128735065460205, "logps/chosen": -587.3185424804688, "logps/rejected": -537.407470703125, "loss": 0.3673, "rewards/accuracies": 0.625, "rewards/chosen": 1.6212078332901, "rewards/margins": 1.95488440990448, "rewards/rejected": -0.33367666602134705, "step": 1288 }, { "epoch": 0.9417351598173516, "grad_norm": 70.01974121743274, "learning_rate": 4.7257757794783623e-07, "logits/chosen": -3.1089885234832764, "logits/rejected": -2.3665802478790283, "logps/chosen": -923.3779296875, "logps/rejected": -647.6971435546875, "loss": 0.4509, "rewards/accuracies": 0.625, "rewards/chosen": 3.045635461807251, "rewards/margins": 1.9444860219955444, "rewards/rejected": 1.101149559020996, "step": 1289 }, { "epoch": 0.9424657534246575, "grad_norm": 71.92553250802179, "learning_rate": 4.725049017737304e-07, "logits/chosen": -2.640000343322754, "logits/rejected": -1.9059514999389648, "logps/chosen": -654.7914428710938, "logps/rejected": -456.6678771972656, "loss": 0.4482, "rewards/accuracies": 0.875, "rewards/chosen": 2.258214235305786, "rewards/margins": 2.3841609954833984, "rewards/rejected": -0.12594696879386902, "step": 1290 }, { "epoch": 0.9431963470319634, "grad_norm": 61.96192402836499, "learning_rate": 4.724321350256919e-07, "logits/chosen": -2.9809329509735107, "logits/rejected": -2.288804054260254, "logps/chosen": -775.4129638671875, "logps/rejected": -600.6239624023438, "loss": 0.3817, "rewards/accuracies": 0.875, "rewards/chosen": 3.1209819316864014, "rewards/margins": 2.249305009841919, "rewards/rejected": 0.8716768622398376, "step": 1291 }, { "epoch": 0.9439269406392694, "grad_norm": 65.80006040437007, "learning_rate": 4.723592777333415e-07, "logits/chosen": -2.155925750732422, "logits/rejected": -2.7400054931640625, "logps/chosen": -526.732666015625, "logps/rejected": -720.3740844726562, "loss": 0.4454, "rewards/accuracies": 0.25, "rewards/chosen": 1.3016457557678223, "rewards/margins": 0.3212577700614929, "rewards/rejected": 0.9803879857063293, "step": 1292 }, { "epoch": 0.9446575342465754, "grad_norm": 56.07475958137828, "learning_rate": 4.7228632992633686e-07, "logits/chosen": -2.9049341678619385, "logits/rejected": -3.1672563552856445, "logps/chosen": -344.5491943359375, "logps/rejected": -654.9337158203125, "loss": 0.3018, "rewards/accuracies": 0.625, "rewards/chosen": 0.8653080463409424, "rewards/margins": 2.651146650314331, "rewards/rejected": -1.7858386039733887, "step": 1293 }, { "epoch": 0.9453881278538813, "grad_norm": 61.07666401105049, "learning_rate": 4.7221329163437245e-07, "logits/chosen": -3.102416515350342, "logits/rejected": -1.6679790019989014, "logps/chosen": -603.1795654296875, "logps/rejected": -388.7013244628906, "loss": 0.3622, "rewards/accuracies": 0.875, "rewards/chosen": 2.682260036468506, "rewards/margins": 3.4213967323303223, "rewards/rejected": -0.7391364574432373, "step": 1294 }, { "epoch": 0.9461187214611873, "grad_norm": 56.91053148363667, "learning_rate": 4.721401628871796e-07, "logits/chosen": -3.0681591033935547, "logits/rejected": -2.425455093383789, "logps/chosen": -1016.30419921875, "logps/rejected": -841.006103515625, "loss": 0.3107, "rewards/accuracies": 0.75, "rewards/chosen": 2.7032828330993652, "rewards/margins": 1.1077622175216675, "rewards/rejected": 1.5955207347869873, "step": 1295 }, { "epoch": 0.9468493150684931, "grad_norm": 59.62536237114727, "learning_rate": 4.720669437145265e-07, "logits/chosen": -3.013355255126953, "logits/rejected": -2.237351655960083, "logps/chosen": -660.5653076171875, "logps/rejected": -462.1124572753906, "loss": 0.4038, "rewards/accuracies": 1.0, "rewards/chosen": 1.8097397089004517, "rewards/margins": 2.6307590007781982, "rewards/rejected": -0.8210194110870361, "step": 1296 }, { "epoch": 0.9475799086757991, "grad_norm": 64.11181029069449, "learning_rate": 4.7199363414621793e-07, "logits/chosen": -3.1613359451293945, "logits/rejected": -2.5505361557006836, "logps/chosen": -700.1616821289062, "logps/rejected": -590.98095703125, "loss": 0.4037, "rewards/accuracies": 0.625, "rewards/chosen": 2.0557732582092285, "rewards/margins": 1.8755356073379517, "rewards/rejected": 0.1802375763654709, "step": 1297 }, { "epoch": 0.948310502283105, "grad_norm": 68.96308597041246, "learning_rate": 4.7192023421209595e-07, "logits/chosen": -2.605001449584961, "logits/rejected": -1.8501965999603271, "logps/chosen": -863.86279296875, "logps/rejected": -565.4191284179688, "loss": 0.3517, "rewards/accuracies": 0.625, "rewards/chosen": 1.3734569549560547, "rewards/margins": 2.1867995262145996, "rewards/rejected": -0.8133424520492554, "step": 1298 }, { "epoch": 0.949041095890411, "grad_norm": 66.92456545198856, "learning_rate": 4.7184674394203883e-07, "logits/chosen": -2.3990981578826904, "logits/rejected": -1.8983235359191895, "logps/chosen": -367.2080078125, "logps/rejected": -302.9057922363281, "loss": 0.4357, "rewards/accuracies": 0.875, "rewards/chosen": 1.020820140838623, "rewards/margins": 2.906944751739502, "rewards/rejected": -1.886124610900879, "step": 1299 }, { "epoch": 0.9497716894977168, "grad_norm": 75.29825137515645, "learning_rate": 4.7177316336596194e-07, "logits/chosen": -2.626232147216797, "logits/rejected": -2.0019798278808594, "logps/chosen": -984.71826171875, "logps/rejected": -641.401611328125, "loss": 0.4645, "rewards/accuracies": 0.75, "rewards/chosen": 3.1829833984375, "rewards/margins": 2.7792227268218994, "rewards/rejected": 0.40376076102256775, "step": 1300 }, { "epoch": 0.9505022831050228, "grad_norm": 62.47445962526407, "learning_rate": 4.7169949251381736e-07, "logits/chosen": -2.737271547317505, "logits/rejected": -1.8875203132629395, "logps/chosen": -762.3189086914062, "logps/rejected": -445.1319580078125, "loss": 0.348, "rewards/accuracies": 0.75, "rewards/chosen": 2.1225733757019043, "rewards/margins": 2.4894113540649414, "rewards/rejected": -0.36683791875839233, "step": 1301 }, { "epoch": 0.9512328767123288, "grad_norm": 51.107019970199545, "learning_rate": 4.7162573141559404e-07, "logits/chosen": -2.713111400604248, "logits/rejected": -2.582024097442627, "logps/chosen": -584.0859375, "logps/rejected": -506.30926513671875, "loss": 0.357, "rewards/accuracies": 0.75, "rewards/chosen": 0.6249747276306152, "rewards/margins": 0.7004414796829224, "rewards/rejected": -0.07546672224998474, "step": 1302 }, { "epoch": 0.9519634703196347, "grad_norm": 35.61513706671996, "learning_rate": 4.715518801013173e-07, "logits/chosen": -2.7745895385742188, "logits/rejected": -2.5682477951049805, "logps/chosen": -637.734619140625, "logps/rejected": -661.0301513671875, "loss": 0.1858, "rewards/accuracies": 0.625, "rewards/chosen": 2.3670382499694824, "rewards/margins": 1.6659451723098755, "rewards/rejected": 0.7010930776596069, "step": 1303 }, { "epoch": 0.9526940639269407, "grad_norm": 58.75619980585027, "learning_rate": 4.714779386010496e-07, "logits/chosen": -2.3698911666870117, "logits/rejected": -1.746573567390442, "logps/chosen": -493.8692321777344, "logps/rejected": -316.1737976074219, "loss": 0.3897, "rewards/accuracies": 0.75, "rewards/chosen": 2.143845558166504, "rewards/margins": 2.9838643074035645, "rewards/rejected": -0.8400187492370605, "step": 1304 }, { "epoch": 0.9534246575342465, "grad_norm": 64.51477543942937, "learning_rate": 4.7140390694488985e-07, "logits/chosen": -2.7023496627807617, "logits/rejected": -2.4362196922302246, "logps/chosen": -617.4375, "logps/rejected": -499.90460205078125, "loss": 0.4794, "rewards/accuracies": 0.75, "rewards/chosen": 1.9784886837005615, "rewards/margins": 1.0858051776885986, "rewards/rejected": 0.8926836848258972, "step": 1305 }, { "epoch": 0.9541552511415525, "grad_norm": 54.80171332360992, "learning_rate": 4.7132978516297373e-07, "logits/chosen": -2.9216248989105225, "logits/rejected": -2.0914089679718018, "logps/chosen": -1103.2025146484375, "logps/rejected": -709.9649658203125, "loss": 0.3127, "rewards/accuracies": 0.75, "rewards/chosen": 2.8659141063690186, "rewards/margins": 2.1640987396240234, "rewards/rejected": 0.7018153667449951, "step": 1306 }, { "epoch": 0.9548858447488584, "grad_norm": 69.34726007903087, "learning_rate": 4.712555732854737e-07, "logits/chosen": -3.00797176361084, "logits/rejected": -2.632814407348633, "logps/chosen": -745.0020751953125, "logps/rejected": -648.4008178710938, "loss": 0.3802, "rewards/accuracies": 0.75, "rewards/chosen": 2.056489944458008, "rewards/margins": 0.7681105136871338, "rewards/rejected": 1.288379192352295, "step": 1307 }, { "epoch": 0.9556164383561644, "grad_norm": 59.98914900252274, "learning_rate": 4.7118127134259864e-07, "logits/chosen": -2.8453807830810547, "logits/rejected": -2.309326648712158, "logps/chosen": -716.9261474609375, "logps/rejected": -594.8341064453125, "loss": 0.4081, "rewards/accuracies": 0.5, "rewards/chosen": 1.9306343793869019, "rewards/margins": 1.760480284690857, "rewards/rejected": 0.17015419900417328, "step": 1308 }, { "epoch": 0.9563470319634704, "grad_norm": 51.467094501304906, "learning_rate": 4.711068793645945e-07, "logits/chosen": -2.9462287425994873, "logits/rejected": -1.9615103006362915, "logps/chosen": -821.4553833007812, "logps/rejected": -513.3300170898438, "loss": 0.2796, "rewards/accuracies": 1.0, "rewards/chosen": 2.667351484298706, "rewards/margins": 2.7838072776794434, "rewards/rejected": -0.11645570397377014, "step": 1309 }, { "epoch": 0.9570776255707762, "grad_norm": 73.10466798001855, "learning_rate": 4.7103239738174337e-07, "logits/chosen": -3.5814902782440186, "logits/rejected": -2.540900707244873, "logps/chosen": -753.3143310546875, "logps/rejected": -599.3797607421875, "loss": 0.4824, "rewards/accuracies": 0.625, "rewards/chosen": 2.5914478302001953, "rewards/margins": 2.0608925819396973, "rewards/rejected": 0.5305553078651428, "step": 1310 }, { "epoch": 0.9578082191780822, "grad_norm": 55.939468366205894, "learning_rate": 4.709578254243645e-07, "logits/chosen": -3.5048882961273193, "logits/rejected": -2.7667839527130127, "logps/chosen": -1100.5902099609375, "logps/rejected": -733.3408813476562, "loss": 0.2854, "rewards/accuracies": 0.875, "rewards/chosen": 2.261782169342041, "rewards/margins": 1.3152625560760498, "rewards/rejected": 0.9465194940567017, "step": 1311 }, { "epoch": 0.9585388127853881, "grad_norm": 77.34069754077571, "learning_rate": 4.7088316352281333e-07, "logits/chosen": -3.019927740097046, "logits/rejected": -2.9457345008850098, "logps/chosen": -838.8257446289062, "logps/rejected": -719.218994140625, "loss": 0.4987, "rewards/accuracies": 0.75, "rewards/chosen": 0.5734046697616577, "rewards/margins": 0.5397071838378906, "rewards/rejected": 0.03369751572608948, "step": 1312 }, { "epoch": 0.9592694063926941, "grad_norm": 41.80934685420235, "learning_rate": 4.708084117074822e-07, "logits/chosen": -3.1688554286956787, "logits/rejected": -1.9832000732421875, "logps/chosen": -982.8165283203125, "logps/rejected": -498.08367919921875, "loss": 0.1975, "rewards/accuracies": 1.0, "rewards/chosen": 3.5548949241638184, "rewards/margins": 3.709235668182373, "rewards/rejected": -0.15434063971042633, "step": 1313 }, { "epoch": 0.96, "grad_norm": 51.15263235528549, "learning_rate": 4.7073357000879994e-07, "logits/chosen": -2.8663554191589355, "logits/rejected": -2.13179612159729, "logps/chosen": -511.2474670410156, "logps/rejected": -467.395263671875, "loss": 0.3748, "rewards/accuracies": 0.75, "rewards/chosen": 1.1096441745758057, "rewards/margins": 1.197331428527832, "rewards/rejected": -0.0876871645450592, "step": 1314 }, { "epoch": 0.960730593607306, "grad_norm": 53.28654894838026, "learning_rate": 4.70658638457232e-07, "logits/chosen": -2.8563907146453857, "logits/rejected": -1.687988042831421, "logps/chosen": -579.45263671875, "logps/rejected": -351.559326171875, "loss": 0.3492, "rewards/accuracies": 0.875, "rewards/chosen": 3.5008792877197266, "rewards/margins": 3.469447374343872, "rewards/rejected": 0.031432121992111206, "step": 1315 }, { "epoch": 0.9614611872146118, "grad_norm": 78.16244727529006, "learning_rate": 4.705836170832802e-07, "logits/chosen": -2.473477363586426, "logits/rejected": -2.785733938217163, "logps/chosen": -589.4110107421875, "logps/rejected": -689.1480102539062, "loss": 0.5712, "rewards/accuracies": 0.875, "rewards/chosen": 1.084214687347412, "rewards/margins": 1.6431547403335571, "rewards/rejected": -0.5589399337768555, "step": 1316 }, { "epoch": 0.9621917808219178, "grad_norm": 53.93938519663183, "learning_rate": 4.7050850591748335e-07, "logits/chosen": -2.6872520446777344, "logits/rejected": -2.424112558364868, "logps/chosen": -873.0035400390625, "logps/rejected": -845.6981811523438, "loss": 0.2325, "rewards/accuracies": 0.875, "rewards/chosen": 4.306736946105957, "rewards/margins": 3.524409055709839, "rewards/rejected": 0.7823281288146973, "step": 1317 }, { "epoch": 0.9629223744292238, "grad_norm": 70.36014011076335, "learning_rate": 4.7043330499041644e-07, "logits/chosen": -2.916494607925415, "logits/rejected": -2.346912145614624, "logps/chosen": -626.7926025390625, "logps/rejected": -474.4075012207031, "loss": 0.4686, "rewards/accuracies": 0.75, "rewards/chosen": 2.9500985145568848, "rewards/margins": 3.2884883880615234, "rewards/rejected": -0.3383898138999939, "step": 1318 }, { "epoch": 0.9636529680365297, "grad_norm": 65.337076715675, "learning_rate": 4.703580143326912e-07, "logits/chosen": -3.2132551670074463, "logits/rejected": -2.1053662300109863, "logps/chosen": -974.090576171875, "logps/rejected": -494.36468505859375, "loss": 0.3278, "rewards/accuracies": 1.0, "rewards/chosen": 4.364799499511719, "rewards/margins": 3.9191856384277344, "rewards/rejected": 0.4456140995025635, "step": 1319 }, { "epoch": 0.9643835616438357, "grad_norm": 37.43817331667054, "learning_rate": 4.7028263397495575e-07, "logits/chosen": -2.9392902851104736, "logits/rejected": -1.5175082683563232, "logps/chosen": -632.1217041015625, "logps/rejected": -362.87347412109375, "loss": 0.2367, "rewards/accuracies": 0.875, "rewards/chosen": 3.5492537021636963, "rewards/margins": 3.8957934379577637, "rewards/rejected": -0.34654009342193604, "step": 1320 }, { "epoch": 0.9651141552511415, "grad_norm": 52.19074185290194, "learning_rate": 4.7020716394789483e-07, "logits/chosen": -3.0361099243164062, "logits/rejected": -1.8733484745025635, "logps/chosen": -755.6498413085938, "logps/rejected": -421.76556396484375, "loss": 0.3359, "rewards/accuracies": 0.875, "rewards/chosen": 3.1785354614257812, "rewards/margins": 3.3171565532684326, "rewards/rejected": -0.13862085342407227, "step": 1321 }, { "epoch": 0.9658447488584475, "grad_norm": 55.0142448705049, "learning_rate": 4.701316042822295e-07, "logits/chosen": -3.1393685340881348, "logits/rejected": -2.0560555458068848, "logps/chosen": -622.1358642578125, "logps/rejected": -704.55224609375, "loss": 0.2633, "rewards/accuracies": 0.875, "rewards/chosen": 2.3788516521453857, "rewards/margins": 4.076337814331055, "rewards/rejected": -1.6974862813949585, "step": 1322 }, { "epoch": 0.9665753424657534, "grad_norm": 47.81903805379846, "learning_rate": 4.700559550087177e-07, "logits/chosen": -2.631253242492676, "logits/rejected": -2.0911097526550293, "logps/chosen": -530.907470703125, "logps/rejected": -516.8456420898438, "loss": 0.301, "rewards/accuracies": 0.75, "rewards/chosen": 1.613534688949585, "rewards/margins": 2.9689762592315674, "rewards/rejected": -1.355441689491272, "step": 1323 }, { "epoch": 0.9673059360730594, "grad_norm": 68.56023526341636, "learning_rate": 4.699802161581534e-07, "logits/chosen": -2.5662457942962646, "logits/rejected": -1.9409946203231812, "logps/chosen": -660.3804931640625, "logps/rejected": -470.4085388183594, "loss": 0.4055, "rewards/accuracies": 1.0, "rewards/chosen": 3.4130544662475586, "rewards/margins": 4.409206390380859, "rewards/rejected": -0.996152400970459, "step": 1324 }, { "epoch": 0.9680365296803652, "grad_norm": 64.99307354922523, "learning_rate": 4.6990438776136724e-07, "logits/chosen": -2.6335630416870117, "logits/rejected": -2.459705352783203, "logps/chosen": -614.067626953125, "logps/rejected": -637.9940795898438, "loss": 0.4264, "rewards/accuracies": 0.75, "rewards/chosen": 2.5351438522338867, "rewards/margins": 2.247331380844116, "rewards/rejected": 0.2878124713897705, "step": 1325 }, { "epoch": 0.9687671232876712, "grad_norm": 57.62713526412709, "learning_rate": 4.698284698492264e-07, "logits/chosen": -2.63429856300354, "logits/rejected": -2.2973155975341797, "logps/chosen": -625.34814453125, "logps/rejected": -543.597412109375, "loss": 0.3002, "rewards/accuracies": 0.625, "rewards/chosen": 1.9230945110321045, "rewards/margins": 2.094205141067505, "rewards/rejected": -0.171110600233078, "step": 1326 }, { "epoch": 0.9694977168949772, "grad_norm": 43.97662703739269, "learning_rate": 4.6975246245263426e-07, "logits/chosen": -2.6574487686157227, "logits/rejected": -2.5170400142669678, "logps/chosen": -783.0347290039062, "logps/rejected": -758.9445190429688, "loss": 0.3052, "rewards/accuracies": 0.875, "rewards/chosen": 3.2564163208007812, "rewards/margins": 2.2083845138549805, "rewards/rejected": 1.0480318069458008, "step": 1327 }, { "epoch": 0.9702283105022831, "grad_norm": 57.083133505961065, "learning_rate": 4.6967636560253086e-07, "logits/chosen": -2.88804292678833, "logits/rejected": -1.9891777038574219, "logps/chosen": -539.3933715820312, "logps/rejected": -394.70843505859375, "loss": 0.3505, "rewards/accuracies": 1.0, "rewards/chosen": 2.0988097190856934, "rewards/margins": 3.138657808303833, "rewards/rejected": -1.0398482084274292, "step": 1328 }, { "epoch": 0.9709589041095891, "grad_norm": 53.661061501654764, "learning_rate": 4.696001793298926e-07, "logits/chosen": -2.7638609409332275, "logits/rejected": -1.9454835653305054, "logps/chosen": -746.451416015625, "logps/rejected": -610.3584594726562, "loss": 0.397, "rewards/accuracies": 0.75, "rewards/chosen": 2.358975648880005, "rewards/margins": 3.432135820388794, "rewards/rejected": -1.0731604099273682, "step": 1329 }, { "epoch": 0.971689497716895, "grad_norm": 48.586487097912574, "learning_rate": 4.695239036657321e-07, "logits/chosen": -2.6894614696502686, "logits/rejected": -2.0759100914001465, "logps/chosen": -711.36279296875, "logps/rejected": -520.01123046875, "loss": 0.3194, "rewards/accuracies": 1.0, "rewards/chosen": 2.016636371612549, "rewards/margins": 2.4350473880767822, "rewards/rejected": -0.4184112548828125, "step": 1330 }, { "epoch": 0.9724200913242009, "grad_norm": 75.87110330294564, "learning_rate": 4.6944753864109867e-07, "logits/chosen": -2.725161075592041, "logits/rejected": -2.003086566925049, "logps/chosen": -790.884521484375, "logps/rejected": -536.1444091796875, "loss": 0.395, "rewards/accuracies": 0.875, "rewards/chosen": 2.28871750831604, "rewards/margins": 1.9873065948486328, "rewards/rejected": 0.3014107346534729, "step": 1331 }, { "epoch": 0.9731506849315068, "grad_norm": 58.73927761196081, "learning_rate": 4.693710842870776e-07, "logits/chosen": -2.9617035388946533, "logits/rejected": -2.235001564025879, "logps/chosen": -670.2879028320312, "logps/rejected": -421.2470397949219, "loss": 0.4361, "rewards/accuracies": 0.625, "rewards/chosen": 2.5601515769958496, "rewards/margins": 1.838619351387024, "rewards/rejected": 0.7215323448181152, "step": 1332 }, { "epoch": 0.9738812785388128, "grad_norm": 62.3707609093095, "learning_rate": 4.69294540634791e-07, "logits/chosen": -2.7365965843200684, "logits/rejected": -2.9062747955322266, "logps/chosen": -475.45684814453125, "logps/rejected": -426.05242919921875, "loss": 0.4022, "rewards/accuracies": 0.625, "rewards/chosen": 1.3902816772460938, "rewards/margins": 0.8366630673408508, "rewards/rejected": 0.5536186695098877, "step": 1333 }, { "epoch": 0.9746118721461188, "grad_norm": 43.534585467601936, "learning_rate": 4.6921790771539693e-07, "logits/chosen": -3.0337390899658203, "logits/rejected": -2.21181058883667, "logps/chosen": -442.7410888671875, "logps/rejected": -383.3520202636719, "loss": 0.2265, "rewards/accuracies": 0.875, "rewards/chosen": 2.907599449157715, "rewards/margins": 3.8042025566101074, "rewards/rejected": -0.8966034650802612, "step": 1334 }, { "epoch": 0.9753424657534246, "grad_norm": 69.4288888393639, "learning_rate": 4.691411855600901e-07, "logits/chosen": -2.6315388679504395, "logits/rejected": -2.6174182891845703, "logps/chosen": -543.9755249023438, "logps/rejected": -664.1803588867188, "loss": 0.4632, "rewards/accuracies": 0.75, "rewards/chosen": 2.0841481685638428, "rewards/margins": 2.129741668701172, "rewards/rejected": -0.04559361934661865, "step": 1335 }, { "epoch": 0.9760730593607306, "grad_norm": 55.606615641711194, "learning_rate": 4.6906437420010135e-07, "logits/chosen": -2.4739882946014404, "logits/rejected": -1.263448715209961, "logps/chosen": -663.2120971679688, "logps/rejected": -336.2406921386719, "loss": 0.3716, "rewards/accuracies": 0.875, "rewards/chosen": 2.1256744861602783, "rewards/margins": 2.701725482940674, "rewards/rejected": -0.5760512351989746, "step": 1336 }, { "epoch": 0.9768036529680365, "grad_norm": 60.49149281613903, "learning_rate": 4.6898747366669784e-07, "logits/chosen": -2.8704986572265625, "logits/rejected": -2.711761951446533, "logps/chosen": -536.4774169921875, "logps/rejected": -595.4181518554688, "loss": 0.3991, "rewards/accuracies": 0.875, "rewards/chosen": 2.015742778778076, "rewards/margins": 2.2390010356903076, "rewards/rejected": -0.22325840592384338, "step": 1337 }, { "epoch": 0.9775342465753425, "grad_norm": 67.41236353824497, "learning_rate": 4.6891048399118317e-07, "logits/chosen": -3.014298677444458, "logits/rejected": -2.758552312850952, "logps/chosen": -729.75537109375, "logps/rejected": -637.3540649414062, "loss": 0.4646, "rewards/accuracies": 0.625, "rewards/chosen": 1.7984353303909302, "rewards/margins": 0.9639660120010376, "rewards/rejected": 0.834469199180603, "step": 1338 }, { "epoch": 0.9782648401826484, "grad_norm": 57.108013407161174, "learning_rate": 4.68833405204897e-07, "logits/chosen": -3.391695976257324, "logits/rejected": -2.7463459968566895, "logps/chosen": -782.951904296875, "logps/rejected": -638.9097900390625, "loss": 0.3705, "rewards/accuracies": 0.875, "rewards/chosen": 1.9996906518936157, "rewards/margins": 0.8935680985450745, "rewards/rejected": 1.106122612953186, "step": 1339 }, { "epoch": 0.9789954337899544, "grad_norm": 41.120712103114016, "learning_rate": 4.6875623733921555e-07, "logits/chosen": -2.4467504024505615, "logits/rejected": -1.7684321403503418, "logps/chosen": -471.5013427734375, "logps/rejected": -341.22430419921875, "loss": 0.2306, "rewards/accuracies": 0.875, "rewards/chosen": 1.696475863456726, "rewards/margins": 2.5400583744049072, "rewards/rejected": -0.8435825109481812, "step": 1340 }, { "epoch": 0.9797260273972602, "grad_norm": 62.69819940783288, "learning_rate": 4.686789804255511e-07, "logits/chosen": -2.7523512840270996, "logits/rejected": -2.0258660316467285, "logps/chosen": -496.2164611816406, "logps/rejected": -348.37811279296875, "loss": 0.3908, "rewards/accuracies": 0.75, "rewards/chosen": 1.962122917175293, "rewards/margins": 2.72487735748291, "rewards/rejected": -0.7627546191215515, "step": 1341 }, { "epoch": 0.9804566210045662, "grad_norm": 54.21727078313584, "learning_rate": 4.6860163449535217e-07, "logits/chosen": -2.787435531616211, "logits/rejected": -1.9036123752593994, "logps/chosen": -633.7088012695312, "logps/rejected": -344.0830993652344, "loss": 0.3036, "rewards/accuracies": 1.0, "rewards/chosen": 3.048511505126953, "rewards/margins": 3.8162972927093506, "rewards/rejected": -0.767785906791687, "step": 1342 }, { "epoch": 0.9811872146118722, "grad_norm": 57.64430135334823, "learning_rate": 4.685241995801036e-07, "logits/chosen": -3.4033203125, "logits/rejected": -2.6499452590942383, "logps/chosen": -1107.509521484375, "logps/rejected": -751.1587524414062, "loss": 0.3932, "rewards/accuracies": 0.875, "rewards/chosen": 4.611387729644775, "rewards/margins": 2.637793779373169, "rewards/rejected": 1.9735939502716064, "step": 1343 }, { "epoch": 0.9819178082191781, "grad_norm": 66.02927188825991, "learning_rate": 4.6844667571132646e-07, "logits/chosen": -2.6191296577453613, "logits/rejected": -1.9321719408035278, "logps/chosen": -850.2501831054688, "logps/rejected": -584.630615234375, "loss": 0.3583, "rewards/accuracies": 0.625, "rewards/chosen": 2.189232587814331, "rewards/margins": 2.2191824913024902, "rewards/rejected": -0.02994990348815918, "step": 1344 }, { "epoch": 0.982648401826484, "grad_norm": 52.15621723336299, "learning_rate": 4.683690629205779e-07, "logits/chosen": -3.05505108833313, "logits/rejected": -2.418478488922119, "logps/chosen": -651.868408203125, "logps/rejected": -484.21148681640625, "loss": 0.3198, "rewards/accuracies": 0.875, "rewards/chosen": 2.7236738204956055, "rewards/margins": 2.9946036338806152, "rewards/rejected": -0.2709296941757202, "step": 1345 }, { "epoch": 0.9833789954337899, "grad_norm": 62.509011069155065, "learning_rate": 4.6829136123945143e-07, "logits/chosen": -2.7078466415405273, "logits/rejected": -2.230926275253296, "logps/chosen": -481.2637939453125, "logps/rejected": -487.96533203125, "loss": 0.3433, "rewards/accuracies": 0.75, "rewards/chosen": 1.9793810844421387, "rewards/margins": 2.674315929412842, "rewards/rejected": -0.6949349641799927, "step": 1346 }, { "epoch": 0.9841095890410959, "grad_norm": 60.096571452875956, "learning_rate": 4.682135706995768e-07, "logits/chosen": -2.2932589054107666, "logits/rejected": -1.9463576078414917, "logps/chosen": -745.2836303710938, "logps/rejected": -562.7269287109375, "loss": 0.3606, "rewards/accuracies": 0.875, "rewards/chosen": 2.9648470878601074, "rewards/margins": 3.1605238914489746, "rewards/rejected": -0.19567662477493286, "step": 1347 }, { "epoch": 0.9848401826484018, "grad_norm": 45.132311453208054, "learning_rate": 4.681356913326196e-07, "logits/chosen": -2.6210763454437256, "logits/rejected": -2.221076011657715, "logps/chosen": -484.119140625, "logps/rejected": -502.03497314453125, "loss": 0.3496, "rewards/accuracies": 0.875, "rewards/chosen": 2.0360302925109863, "rewards/margins": 2.8543074131011963, "rewards/rejected": -0.8182772397994995, "step": 1348 }, { "epoch": 0.9855707762557078, "grad_norm": 41.756759370489945, "learning_rate": 4.680577231702819e-07, "logits/chosen": -2.383938789367676, "logits/rejected": -2.019753932952881, "logps/chosen": -456.86053466796875, "logps/rejected": -394.24761962890625, "loss": 0.2614, "rewards/accuracies": 0.875, "rewards/chosen": 1.0589962005615234, "rewards/margins": 1.695461392402649, "rewards/rejected": -0.6364650726318359, "step": 1349 }, { "epoch": 0.9863013698630136, "grad_norm": 73.04199348662384, "learning_rate": 4.6797966624430176e-07, "logits/chosen": -2.695266008377075, "logits/rejected": -2.0806491374969482, "logps/chosen": -782.2921142578125, "logps/rejected": -598.6415405273438, "loss": 0.4096, "rewards/accuracies": 0.75, "rewards/chosen": 2.15148663520813, "rewards/margins": 2.1269097328186035, "rewards/rejected": 0.02457670494914055, "step": 1350 }, { "epoch": 0.9870319634703196, "grad_norm": 43.6187134506044, "learning_rate": 4.679015205864534e-07, "logits/chosen": -3.2125136852264404, "logits/rejected": -2.5086355209350586, "logps/chosen": -632.315185546875, "logps/rejected": -495.03277587890625, "loss": 0.2969, "rewards/accuracies": 0.875, "rewards/chosen": 1.533609390258789, "rewards/margins": 0.8947415947914124, "rewards/rejected": 0.6388677954673767, "step": 1351 }, { "epoch": 0.9877625570776256, "grad_norm": 62.82887885023992, "learning_rate": 4.6782328622854714e-07, "logits/chosen": -2.895057201385498, "logits/rejected": -2.047565460205078, "logps/chosen": -990.9097900390625, "logps/rejected": -559.0283203125, "loss": 0.3634, "rewards/accuracies": 1.0, "rewards/chosen": 2.6997556686401367, "rewards/margins": 2.2091064453125, "rewards/rejected": 0.4906490743160248, "step": 1352 }, { "epoch": 0.9884931506849315, "grad_norm": 59.74536625599093, "learning_rate": 4.6774496320242963e-07, "logits/chosen": -2.752556324005127, "logits/rejected": -2.5764453411102295, "logps/chosen": -790.130859375, "logps/rejected": -677.728271484375, "loss": 0.3451, "rewards/accuracies": 0.875, "rewards/chosen": 2.5141642093658447, "rewards/margins": 1.8074450492858887, "rewards/rejected": 0.706719160079956, "step": 1353 }, { "epoch": 0.9892237442922375, "grad_norm": 48.3000393253269, "learning_rate": 4.6766655153998323e-07, "logits/chosen": -2.537804126739502, "logits/rejected": -2.0200893878936768, "logps/chosen": -728.3038330078125, "logps/rejected": -936.6130981445312, "loss": 0.3045, "rewards/accuracies": 0.875, "rewards/chosen": 2.35672664642334, "rewards/margins": 3.0066423416137695, "rewards/rejected": -0.6499158143997192, "step": 1354 }, { "epoch": 0.9899543378995433, "grad_norm": 63.11892374839934, "learning_rate": 4.675880512731266e-07, "logits/chosen": -2.7843856811523438, "logits/rejected": -2.5239338874816895, "logps/chosen": -551.9025268554688, "logps/rejected": -462.185302734375, "loss": 0.3915, "rewards/accuracies": 0.625, "rewards/chosen": 1.9142730236053467, "rewards/margins": 0.5660557746887207, "rewards/rejected": 1.348217487335205, "step": 1355 }, { "epoch": 0.9906849315068493, "grad_norm": 54.89976884414565, "learning_rate": 4.675094624338146e-07, "logits/chosen": -2.52630877494812, "logits/rejected": -2.5446665287017822, "logps/chosen": -551.7257080078125, "logps/rejected": -572.370361328125, "loss": 0.354, "rewards/accuracies": 0.875, "rewards/chosen": 2.190321922302246, "rewards/margins": 2.8714051246643066, "rewards/rejected": -0.6810829639434814, "step": 1356 }, { "epoch": 0.9914155251141552, "grad_norm": 64.69077588323547, "learning_rate": 4.674307850540378e-07, "logits/chosen": -2.4809770584106445, "logits/rejected": -2.374610662460327, "logps/chosen": -532.7099609375, "logps/rejected": -492.7078857421875, "loss": 0.4496, "rewards/accuracies": 0.625, "rewards/chosen": 0.8588900566101074, "rewards/margins": 0.38033944368362427, "rewards/rejected": 0.47855061292648315, "step": 1357 }, { "epoch": 0.9921461187214612, "grad_norm": 49.334773961932775, "learning_rate": 4.67352019165823e-07, "logits/chosen": -3.1193907260894775, "logits/rejected": -2.4864466190338135, "logps/chosen": -597.0310668945312, "logps/rejected": -527.821533203125, "loss": 0.3448, "rewards/accuracies": 0.875, "rewards/chosen": 2.30067777633667, "rewards/margins": 1.9957598447799683, "rewards/rejected": 0.304918110370636, "step": 1358 }, { "epoch": 0.9928767123287672, "grad_norm": 64.21482272054678, "learning_rate": 4.6727316480123315e-07, "logits/chosen": -2.9221954345703125, "logits/rejected": -1.8987290859222412, "logps/chosen": -866.3734130859375, "logps/rejected": -434.7637939453125, "loss": 0.4076, "rewards/accuracies": 0.875, "rewards/chosen": 2.6991825103759766, "rewards/margins": 1.4428415298461914, "rewards/rejected": 1.2563410997390747, "step": 1359 }, { "epoch": 0.993607305936073, "grad_norm": 55.1657082896046, "learning_rate": 4.67194221992367e-07, "logits/chosen": -3.07802414894104, "logits/rejected": -2.4910435676574707, "logps/chosen": -564.2584228515625, "logps/rejected": -506.74627685546875, "loss": 0.3532, "rewards/accuracies": 0.875, "rewards/chosen": 2.5578978061676025, "rewards/margins": 2.66998553276062, "rewards/rejected": -0.11208763718605042, "step": 1360 }, { "epoch": 0.994337899543379, "grad_norm": 49.8326496260233, "learning_rate": 4.6711519077135943e-07, "logits/chosen": -2.6776461601257324, "logits/rejected": -2.4138050079345703, "logps/chosen": -532.2826538085938, "logps/rejected": -546.5687866210938, "loss": 0.2665, "rewards/accuracies": 0.75, "rewards/chosen": 2.5960090160369873, "rewards/margins": 2.634322166442871, "rewards/rejected": -0.038313090801239014, "step": 1361 }, { "epoch": 0.9950684931506849, "grad_norm": 53.82300411055086, "learning_rate": 4.6703607117038133e-07, "logits/chosen": -3.2782270908355713, "logits/rejected": -1.972000241279602, "logps/chosen": -829.704833984375, "logps/rejected": -487.5403747558594, "loss": 0.2979, "rewards/accuracies": 1.0, "rewards/chosen": 3.3066816329956055, "rewards/margins": 3.047839403152466, "rewards/rejected": 0.2588423490524292, "step": 1362 }, { "epoch": 0.9957990867579909, "grad_norm": 51.47770964061775, "learning_rate": 4.669568632216394e-07, "logits/chosen": -2.875169038772583, "logits/rejected": -2.481353282928467, "logps/chosen": -666.2113037109375, "logps/rejected": -518.9606323242188, "loss": 0.3446, "rewards/accuracies": 0.75, "rewards/chosen": 1.774916410446167, "rewards/margins": 0.6826164722442627, "rewards/rejected": 1.0923001766204834, "step": 1363 }, { "epoch": 0.9965296803652968, "grad_norm": 62.7886304148689, "learning_rate": 4.668775669573765e-07, "logits/chosen": -2.4665563106536865, "logits/rejected": -1.9581612348556519, "logps/chosen": -497.76806640625, "logps/rejected": -389.1536865234375, "loss": 0.3951, "rewards/accuracies": 0.875, "rewards/chosen": 1.3423668146133423, "rewards/margins": 1.6602232456207275, "rewards/rejected": -0.31785646080970764, "step": 1364 }, { "epoch": 0.9972602739726028, "grad_norm": 67.93966691001472, "learning_rate": 4.6679818240987135e-07, "logits/chosen": -2.445521354675293, "logits/rejected": -2.765557289123535, "logps/chosen": -672.8639526367188, "logps/rejected": -729.69140625, "loss": 0.3877, "rewards/accuracies": 0.625, "rewards/chosen": 2.4713358879089355, "rewards/margins": 0.8945029973983765, "rewards/rejected": 1.5768327713012695, "step": 1365 }, { "epoch": 0.9979908675799086, "grad_norm": 66.35081509719757, "learning_rate": 4.6671870961143853e-07, "logits/chosen": -2.7396533489227295, "logits/rejected": -1.7770106792449951, "logps/chosen": -533.6700439453125, "logps/rejected": -277.694091796875, "loss": 0.3762, "rewards/accuracies": 0.875, "rewards/chosen": 1.5637072324752808, "rewards/margins": 1.6156648397445679, "rewards/rejected": -0.0519576221704483, "step": 1366 }, { "epoch": 0.9987214611872146, "grad_norm": 44.08239245116751, "learning_rate": 4.666391485944287e-07, "logits/chosen": -2.5569727420806885, "logits/rejected": -2.256558656692505, "logps/chosen": -428.1105651855469, "logps/rejected": -430.917236328125, "loss": 0.2315, "rewards/accuracies": 0.75, "rewards/chosen": 2.8457717895507812, "rewards/margins": 3.143986225128174, "rewards/rejected": -0.29821428656578064, "step": 1367 }, { "epoch": 0.9994520547945206, "grad_norm": 50.11703344189104, "learning_rate": 4.665594993912284e-07, "logits/chosen": -2.6466474533081055, "logits/rejected": -1.7461642026901245, "logps/chosen": -428.4178466796875, "logps/rejected": -382.0372009277344, "loss": 0.3646, "rewards/accuracies": 0.75, "rewards/chosen": 1.2887420654296875, "rewards/margins": 1.9301612377166748, "rewards/rejected": -0.6414192318916321, "step": 1368 }, { "epoch": 0.9994520547945206, "eval_logits/chosen": -2.8821990489959717, "eval_logits/rejected": -2.3914120197296143, "eval_logps/chosen": -705.845703125, "eval_logps/rejected": -578.4398803710938, "eval_loss": 0.34227442741394043, "eval_rewards/accuracies": 0.7142857313156128, "eval_rewards/chosen": 3.006746292114258, "eval_rewards/margins": 2.8299334049224854, "eval_rewards/rejected": 0.17681224644184113, "eval_runtime": 15.3394, "eval_samples_per_second": 7.171, "eval_steps_per_second": 0.913, "step": 1368 }, { "epoch": 1.0001826484018266, "grad_norm": 47.511776397903446, "learning_rate": 4.6647976203425985e-07, "logits/chosen": -3.0389833450317383, "logits/rejected": -2.444136142730713, "logps/chosen": -753.4556884765625, "logps/rejected": -527.2291259765625, "loss": 0.2227, "rewards/accuracies": 1.0, "rewards/chosen": 4.4470062255859375, "rewards/margins": 4.67352819442749, "rewards/rejected": -0.2265217900276184, "step": 1369 }, { "epoch": 1.0009132420091325, "grad_norm": 38.701409176742196, "learning_rate": 4.6639993655598155e-07, "logits/chosen": -2.9723002910614014, "logits/rejected": -2.1371681690216064, "logps/chosen": -620.0714111328125, "logps/rejected": -481.5477294921875, "loss": 0.2006, "rewards/accuracies": 0.75, "rewards/chosen": 2.8768277168273926, "rewards/margins": 3.1865878105163574, "rewards/rejected": -0.30976009368896484, "step": 1370 }, { "epoch": 1.0016438356164383, "grad_norm": 24.1530345834606, "learning_rate": 4.663200229888875e-07, "logits/chosen": -3.4888055324554443, "logits/rejected": -2.8242905139923096, "logps/chosen": -742.9774169921875, "logps/rejected": -594.6091918945312, "loss": 0.1559, "rewards/accuracies": 1.0, "rewards/chosen": 3.652395725250244, "rewards/margins": 3.213308095932007, "rewards/rejected": 0.43908756971359253, "step": 1371 }, { "epoch": 1.0023744292237442, "grad_norm": 46.20349488353803, "learning_rate": 4.6624002136550773e-07, "logits/chosen": -3.1572704315185547, "logits/rejected": -2.3242950439453125, "logps/chosen": -679.9694213867188, "logps/rejected": -512.854736328125, "loss": 0.2307, "rewards/accuracies": 0.875, "rewards/chosen": 2.451167583465576, "rewards/margins": 2.1903891563415527, "rewards/rejected": 0.26077863574028015, "step": 1372 }, { "epoch": 1.0031050228310503, "grad_norm": 24.954970433090203, "learning_rate": 4.661599317184082e-07, "logits/chosen": -3.392064094543457, "logits/rejected": -2.2469449043273926, "logps/chosen": -847.2333374023438, "logps/rejected": -690.5064697265625, "loss": 0.1801, "rewards/accuracies": 1.0, "rewards/chosen": 3.086653470993042, "rewards/margins": 2.395052909851074, "rewards/rejected": 0.6916004419326782, "step": 1373 }, { "epoch": 1.0038356164383562, "grad_norm": 56.56013893552817, "learning_rate": 4.660797540801904e-07, "logits/chosen": -2.1797966957092285, "logits/rejected": -2.5232067108154297, "logps/chosen": -686.4591064453125, "logps/rejected": -823.9132080078125, "loss": 0.3806, "rewards/accuracies": 0.75, "rewards/chosen": 2.481384754180908, "rewards/margins": 1.1746011972427368, "rewards/rejected": 1.3067834377288818, "step": 1374 }, { "epoch": 1.004566210045662, "grad_norm": 22.746361745480307, "learning_rate": 4.659994884834919e-07, "logits/chosen": -3.2425551414489746, "logits/rejected": -2.6015806198120117, "logps/chosen": -990.3614501953125, "logps/rejected": -721.16796875, "loss": 0.1084, "rewards/accuracies": 1.0, "rewards/chosen": 3.5347375869750977, "rewards/margins": 4.099781036376953, "rewards/rejected": -0.5650427937507629, "step": 1375 }, { "epoch": 1.0052968036529681, "grad_norm": 26.48638796738041, "learning_rate": 4.659191349609861e-07, "logits/chosen": -2.7071452140808105, "logits/rejected": -1.9383918046951294, "logps/chosen": -698.3276977539062, "logps/rejected": -504.5634460449219, "loss": 0.1489, "rewards/accuracies": 1.0, "rewards/chosen": 3.7653722763061523, "rewards/margins": 3.8302128314971924, "rewards/rejected": -0.0648408830165863, "step": 1376 }, { "epoch": 1.006027397260274, "grad_norm": 42.83548368011473, "learning_rate": 4.658386935453821e-07, "logits/chosen": -2.422060012817383, "logits/rejected": -2.146099328994751, "logps/chosen": -593.4236450195312, "logps/rejected": -655.7147827148438, "loss": 0.1757, "rewards/accuracies": 0.875, "rewards/chosen": 3.3609423637390137, "rewards/margins": 4.085707664489746, "rewards/rejected": -0.7247653007507324, "step": 1377 }, { "epoch": 1.0067579908675799, "grad_norm": 31.435513807162632, "learning_rate": 4.657581642694246e-07, "logits/chosen": -2.453364372253418, "logits/rejected": -2.4708425998687744, "logps/chosen": -502.01080322265625, "logps/rejected": -499.5513916015625, "loss": 0.1387, "rewards/accuracies": 0.875, "rewards/chosen": 2.275380849838257, "rewards/margins": 3.232177734375, "rewards/rejected": -0.9567971229553223, "step": 1378 }, { "epoch": 1.0074885844748858, "grad_norm": 37.5366292460611, "learning_rate": 4.656775471658945e-07, "logits/chosen": -2.741894483566284, "logits/rejected": -3.0239996910095215, "logps/chosen": -349.4464111328125, "logps/rejected": -405.84478759765625, "loss": 0.232, "rewards/accuracies": 0.75, "rewards/chosen": 2.4781901836395264, "rewards/margins": 2.110081195831299, "rewards/rejected": 0.36810868978500366, "step": 1379 }, { "epoch": 1.0082191780821919, "grad_norm": 52.472653778241956, "learning_rate": 4.6559684226760785e-07, "logits/chosen": -2.8531510829925537, "logits/rejected": -2.3385229110717773, "logps/chosen": -601.7177734375, "logps/rejected": -493.3514099121094, "loss": 0.2855, "rewards/accuracies": 0.625, "rewards/chosen": 3.3817126750946045, "rewards/margins": 2.1518356800079346, "rewards/rejected": 1.2298768758773804, "step": 1380 }, { "epoch": 1.0089497716894977, "grad_norm": 31.700223576572572, "learning_rate": 4.6551604960741714e-07, "logits/chosen": -2.8612537384033203, "logits/rejected": -2.2811954021453857, "logps/chosen": -887.9891967773438, "logps/rejected": -857.5844116210938, "loss": 0.1364, "rewards/accuracies": 1.0, "rewards/chosen": 3.6871025562286377, "rewards/margins": 3.229846477508545, "rewards/rejected": 0.4572557508945465, "step": 1381 }, { "epoch": 1.0096803652968036, "grad_norm": 36.929896570965205, "learning_rate": 4.6543516921821e-07, "logits/chosen": -2.837414026260376, "logits/rejected": -1.9760785102844238, "logps/chosen": -654.24560546875, "logps/rejected": -526.6519775390625, "loss": 0.1986, "rewards/accuracies": 1.0, "rewards/chosen": 3.812744617462158, "rewards/margins": 3.6238012313842773, "rewards/rejected": 0.18894344568252563, "step": 1382 }, { "epoch": 1.0104109589041095, "grad_norm": 23.621119266254617, "learning_rate": 4.653542011329101e-07, "logits/chosen": -2.644340991973877, "logits/rejected": -2.035688877105713, "logps/chosen": -556.09033203125, "logps/rejected": -716.74462890625, "loss": 0.1581, "rewards/accuracies": 1.0, "rewards/chosen": 1.6172109842300415, "rewards/margins": 3.1446475982666016, "rewards/rejected": -1.5274364948272705, "step": 1383 }, { "epoch": 1.0111415525114156, "grad_norm": 50.253568456161645, "learning_rate": 4.652731453844766e-07, "logits/chosen": -3.049839496612549, "logits/rejected": -2.3147454261779785, "logps/chosen": -445.6567077636719, "logps/rejected": -437.4063415527344, "loss": 0.2672, "rewards/accuracies": 1.0, "rewards/chosen": 2.7695651054382324, "rewards/margins": 3.259082078933716, "rewards/rejected": -0.48951682448387146, "step": 1384 }, { "epoch": 1.0118721461187214, "grad_norm": 50.82623007918746, "learning_rate": 4.6519200200590457e-07, "logits/chosen": -3.030758857727051, "logits/rejected": -2.161001443862915, "logps/chosen": -892.548095703125, "logps/rejected": -553.25244140625, "loss": 0.2885, "rewards/accuracies": 1.0, "rewards/chosen": 4.041322708129883, "rewards/margins": 4.606326103210449, "rewards/rejected": -0.5650037527084351, "step": 1385 }, { "epoch": 1.0126027397260273, "grad_norm": 16.79411677934977, "learning_rate": 4.651107710302246e-07, "logits/chosen": -2.6936705112457275, "logits/rejected": -1.933851957321167, "logps/chosen": -606.8153076171875, "logps/rejected": -347.4513244628906, "loss": 0.083, "rewards/accuracies": 1.0, "rewards/chosen": 3.7028870582580566, "rewards/margins": 4.086030006408691, "rewards/rejected": -0.38314303755760193, "step": 1386 }, { "epoch": 1.0133333333333334, "grad_norm": 36.75978093161907, "learning_rate": 4.6502945249050286e-07, "logits/chosen": -2.1862053871154785, "logits/rejected": -2.0867292881011963, "logps/chosen": -601.9827880859375, "logps/rejected": -547.1435546875, "loss": 0.213, "rewards/accuracies": 0.875, "rewards/chosen": 1.6115251779556274, "rewards/margins": 2.379206418991089, "rewards/rejected": -0.767681360244751, "step": 1387 }, { "epoch": 1.0140639269406393, "grad_norm": 34.827576473361056, "learning_rate": 4.649480464198413e-07, "logits/chosen": -3.111603021621704, "logits/rejected": -2.2642624378204346, "logps/chosen": -552.9743041992188, "logps/rejected": -440.0929870605469, "loss": 0.2252, "rewards/accuracies": 1.0, "rewards/chosen": 1.690804123878479, "rewards/margins": 2.3879034519195557, "rewards/rejected": -0.6970993280410767, "step": 1388 }, { "epoch": 1.0147945205479452, "grad_norm": 22.29544809182436, "learning_rate": 4.6486655285137764e-07, "logits/chosen": -2.920086145401001, "logits/rejected": -1.8756957054138184, "logps/chosen": -689.8223266601562, "logps/rejected": -448.94024658203125, "loss": 0.114, "rewards/accuracies": 1.0, "rewards/chosen": 3.2709591388702393, "rewards/margins": 4.6089253425598145, "rewards/rejected": -1.3379662036895752, "step": 1389 }, { "epoch": 1.015525114155251, "grad_norm": 36.2099792551204, "learning_rate": 4.647849718182849e-07, "logits/chosen": -2.481362819671631, "logits/rejected": -2.489304542541504, "logps/chosen": -677.4268798828125, "logps/rejected": -688.71826171875, "loss": 0.2136, "rewards/accuracies": 0.875, "rewards/chosen": 1.9191917181015015, "rewards/margins": 1.9375295639038086, "rewards/rejected": -0.01833784580230713, "step": 1390 }, { "epoch": 1.0162557077625571, "grad_norm": 52.203835860170514, "learning_rate": 4.647033033537719e-07, "logits/chosen": -2.613443374633789, "logits/rejected": -1.8067235946655273, "logps/chosen": -852.33251953125, "logps/rejected": -850.726806640625, "loss": 0.2874, "rewards/accuracies": 0.75, "rewards/chosen": 2.888237476348877, "rewards/margins": 2.4323699474334717, "rewards/rejected": 0.4558674097061157, "step": 1391 }, { "epoch": 1.016986301369863, "grad_norm": 36.483050029474306, "learning_rate": 4.6462154749108296e-07, "logits/chosen": -2.867255687713623, "logits/rejected": -2.114274024963379, "logps/chosen": -969.1644897460938, "logps/rejected": -691.1229248046875, "loss": 0.1958, "rewards/accuracies": 1.0, "rewards/chosen": 3.6348788738250732, "rewards/margins": 2.3774704933166504, "rewards/rejected": 1.2574084997177124, "step": 1392 }, { "epoch": 1.0177168949771689, "grad_norm": 44.45735782791261, "learning_rate": 4.6453970426349807e-07, "logits/chosen": -3.4255456924438477, "logits/rejected": -2.5427422523498535, "logps/chosen": -933.1229858398438, "logps/rejected": -688.0400390625, "loss": 0.275, "rewards/accuracies": 0.875, "rewards/chosen": 3.984422445297241, "rewards/margins": 3.1591227054595947, "rewards/rejected": 0.8252995610237122, "step": 1393 }, { "epoch": 1.018447488584475, "grad_norm": 33.55650733569143, "learning_rate": 4.644577737043327e-07, "logits/chosen": -2.808580160140991, "logits/rejected": -2.860053062438965, "logps/chosen": -650.2825927734375, "logps/rejected": -695.1973876953125, "loss": 0.1926, "rewards/accuracies": 1.0, "rewards/chosen": 2.313687562942505, "rewards/margins": 2.4975666999816895, "rewards/rejected": -0.1838790774345398, "step": 1394 }, { "epoch": 1.0191780821917809, "grad_norm": 29.51911436328957, "learning_rate": 4.64375755846938e-07, "logits/chosen": -3.2467663288116455, "logits/rejected": -2.089179515838623, "logps/chosen": -743.0549926757812, "logps/rejected": -519.9404296875, "loss": 0.1927, "rewards/accuracies": 1.0, "rewards/chosen": 3.9883546829223633, "rewards/margins": 4.41792106628418, "rewards/rejected": -0.4295663833618164, "step": 1395 }, { "epoch": 1.0199086757990867, "grad_norm": 44.6025525649878, "learning_rate": 4.642936507247004e-07, "logits/chosen": -3.3901963233947754, "logits/rejected": -2.766467571258545, "logps/chosen": -580.6340942382812, "logps/rejected": -569.028076171875, "loss": 0.27, "rewards/accuracies": 0.875, "rewards/chosen": 3.467838764190674, "rewards/margins": 2.7112860679626465, "rewards/rejected": 0.7565525770187378, "step": 1396 }, { "epoch": 1.0206392694063926, "grad_norm": 24.497506150977642, "learning_rate": 4.6421145837104214e-07, "logits/chosen": -2.894192695617676, "logits/rejected": -2.423339605331421, "logps/chosen": -361.4339294433594, "logps/rejected": -315.77813720703125, "loss": 0.2033, "rewards/accuracies": 0.875, "rewards/chosen": 2.2866249084472656, "rewards/margins": 2.470498561859131, "rewards/rejected": -0.18387362360954285, "step": 1397 }, { "epoch": 1.0213698630136987, "grad_norm": 21.422121454180402, "learning_rate": 4.6412917881942083e-07, "logits/chosen": -3.3793962001800537, "logits/rejected": -2.513293743133545, "logps/chosen": -826.3289184570312, "logps/rejected": -552.1497192382812, "loss": 0.1004, "rewards/accuracies": 1.0, "rewards/chosen": 4.218634605407715, "rewards/margins": 4.012576103210449, "rewards/rejected": 0.20605850219726562, "step": 1398 }, { "epoch": 1.0221004566210046, "grad_norm": 43.485260669654636, "learning_rate": 4.6404681210332945e-07, "logits/chosen": -3.1157126426696777, "logits/rejected": -2.397251844406128, "logps/chosen": -597.29052734375, "logps/rejected": -464.22784423828125, "loss": 0.2927, "rewards/accuracies": 0.875, "rewards/chosen": 2.6404104232788086, "rewards/margins": 2.967751979827881, "rewards/rejected": -0.3273416757583618, "step": 1399 }, { "epoch": 1.0228310502283104, "grad_norm": 44.36312148453445, "learning_rate": 4.639643582562968e-07, "logits/chosen": -2.8465628623962402, "logits/rejected": -2.106147527694702, "logps/chosen": -672.1686401367188, "logps/rejected": -486.8603820800781, "loss": 0.3423, "rewards/accuracies": 0.75, "rewards/chosen": 2.653684139251709, "rewards/margins": 2.13218092918396, "rewards/rejected": 0.5215034484863281, "step": 1400 }, { "epoch": 1.0235616438356165, "grad_norm": 23.558787733926113, "learning_rate": 4.638818173118868e-07, "logits/chosen": -2.2596447467803955, "logits/rejected": -2.209015369415283, "logps/chosen": -209.79287719726562, "logps/rejected": -290.59869384765625, "loss": 0.1896, "rewards/accuracies": 1.0, "rewards/chosen": 1.4284483194351196, "rewards/margins": 2.518444538116455, "rewards/rejected": -1.089996337890625, "step": 1401 }, { "epoch": 1.0242922374429224, "grad_norm": 21.030702943758055, "learning_rate": 4.63799189303699e-07, "logits/chosen": -3.2557268142700195, "logits/rejected": -2.5273327827453613, "logps/chosen": -800.337158203125, "logps/rejected": -578.4664916992188, "loss": 0.0954, "rewards/accuracies": 1.0, "rewards/chosen": 3.4546802043914795, "rewards/margins": 2.8982608318328857, "rewards/rejected": 0.5564192533493042, "step": 1402 }, { "epoch": 1.0250228310502283, "grad_norm": 24.642178038407287, "learning_rate": 4.6371647426536843e-07, "logits/chosen": -2.5017499923706055, "logits/rejected": -1.9322675466537476, "logps/chosen": -453.76641845703125, "logps/rejected": -498.35516357421875, "loss": 0.1504, "rewards/accuracies": 0.75, "rewards/chosen": 2.2303385734558105, "rewards/margins": 2.5788753032684326, "rewards/rejected": -0.3485368490219116, "step": 1403 }, { "epoch": 1.0257534246575342, "grad_norm": 37.411216514486, "learning_rate": 4.636336722305654e-07, "logits/chosen": -2.6917901039123535, "logits/rejected": -2.4142205715179443, "logps/chosen": -410.92779541015625, "logps/rejected": -318.8879699707031, "loss": 0.2099, "rewards/accuracies": 0.875, "rewards/chosen": 1.870383381843567, "rewards/margins": 2.40012264251709, "rewards/rejected": -0.529739260673523, "step": 1404 }, { "epoch": 1.0264840182648403, "grad_norm": 31.324324699362535, "learning_rate": 4.6355078323299566e-07, "logits/chosen": -3.0418028831481934, "logits/rejected": -2.3905773162841797, "logps/chosen": -951.9977416992188, "logps/rejected": -689.2997436523438, "loss": 0.1912, "rewards/accuracies": 1.0, "rewards/chosen": 4.468233585357666, "rewards/margins": 3.8875441551208496, "rewards/rejected": 0.5806891322135925, "step": 1405 }, { "epoch": 1.0272146118721461, "grad_norm": 31.412329363406915, "learning_rate": 4.6346780730640056e-07, "logits/chosen": -2.84268856048584, "logits/rejected": -1.7669126987457275, "logps/chosen": -577.3455810546875, "logps/rejected": -281.7803039550781, "loss": 0.195, "rewards/accuracies": 0.875, "rewards/chosen": 2.7780189514160156, "rewards/margins": 3.389209270477295, "rewards/rejected": -0.6111903190612793, "step": 1406 }, { "epoch": 1.027945205479452, "grad_norm": 40.878443208378485, "learning_rate": 4.6338474448455665e-07, "logits/chosen": -2.479966402053833, "logits/rejected": -2.5333759784698486, "logps/chosen": -536.2557373046875, "logps/rejected": -584.4386596679688, "loss": 0.2512, "rewards/accuracies": 1.0, "rewards/chosen": 0.7230391502380371, "rewards/margins": 1.5880451202392578, "rewards/rejected": -0.8650059700012207, "step": 1407 }, { "epoch": 1.0286757990867579, "grad_norm": 37.51420629345802, "learning_rate": 4.633015948012758e-07, "logits/chosen": -2.8039538860321045, "logits/rejected": -2.068568229675293, "logps/chosen": -450.93865966796875, "logps/rejected": -354.2396240234375, "loss": 0.2481, "rewards/accuracies": 0.75, "rewards/chosen": 1.4932446479797363, "rewards/margins": 1.8775349855422974, "rewards/rejected": -0.3842903971672058, "step": 1408 }, { "epoch": 1.029406392694064, "grad_norm": 29.415347954979218, "learning_rate": 4.6321835829040537e-07, "logits/chosen": -2.87716007232666, "logits/rejected": -2.558065176010132, "logps/chosen": -521.0798950195312, "logps/rejected": -619.699462890625, "loss": 0.1556, "rewards/accuracies": 1.0, "rewards/chosen": 3.3476083278656006, "rewards/margins": 3.47560715675354, "rewards/rejected": -0.12799882888793945, "step": 1409 }, { "epoch": 1.0301369863013699, "grad_norm": 37.521546544327364, "learning_rate": 4.6313503498582807e-07, "logits/chosen": -2.686697244644165, "logits/rejected": -2.7030344009399414, "logps/chosen": -617.2521362304688, "logps/rejected": -639.62744140625, "loss": 0.2059, "rewards/accuracies": 1.0, "rewards/chosen": 2.5159897804260254, "rewards/margins": 2.8763015270233154, "rewards/rejected": -0.36031168699264526, "step": 1410 }, { "epoch": 1.0308675799086757, "grad_norm": 40.817209062649816, "learning_rate": 4.6305162492146175e-07, "logits/chosen": -2.9861466884613037, "logits/rejected": -2.021682024002075, "logps/chosen": -828.4380493164062, "logps/rejected": -632.2809448242188, "loss": 0.2861, "rewards/accuracies": 0.75, "rewards/chosen": 2.5175085067749023, "rewards/margins": 2.311420202255249, "rewards/rejected": 0.20608830451965332, "step": 1411 }, { "epoch": 1.0315981735159818, "grad_norm": 34.9196657578595, "learning_rate": 4.6296812813125994e-07, "logits/chosen": -3.603086471557617, "logits/rejected": -2.125164747238159, "logps/chosen": -972.2730102539062, "logps/rejected": -628.57470703125, "loss": 0.1983, "rewards/accuracies": 0.875, "rewards/chosen": 3.97578501701355, "rewards/margins": 2.5031306743621826, "rewards/rejected": 1.4726544618606567, "step": 1412 }, { "epoch": 1.0323287671232877, "grad_norm": 27.304295195679607, "learning_rate": 4.628845446492111e-07, "logits/chosen": -3.024506092071533, "logits/rejected": -2.7670912742614746, "logps/chosen": -632.4856567382812, "logps/rejected": -538.0574951171875, "loss": 0.1462, "rewards/accuracies": 1.0, "rewards/chosen": 2.0733981132507324, "rewards/margins": 2.963036298751831, "rewards/rejected": -0.8896380662918091, "step": 1413 }, { "epoch": 1.0330593607305936, "grad_norm": 53.116626178273975, "learning_rate": 4.6280087450933916e-07, "logits/chosen": -2.9271936416625977, "logits/rejected": -2.135502338409424, "logps/chosen": -688.526611328125, "logps/rejected": -546.684326171875, "loss": 0.3007, "rewards/accuracies": 0.875, "rewards/chosen": 1.8089734315872192, "rewards/margins": 1.2682898044586182, "rewards/rejected": 0.5406835675239563, "step": 1414 }, { "epoch": 1.0337899543378994, "grad_norm": 34.805712491921675, "learning_rate": 4.6271711774570327e-07, "logits/chosen": -2.9096150398254395, "logits/rejected": -2.002152442932129, "logps/chosen": -645.8206787109375, "logps/rejected": -363.784912109375, "loss": 0.2574, "rewards/accuracies": 1.0, "rewards/chosen": 3.099616527557373, "rewards/margins": 3.5209126472473145, "rewards/rejected": -0.4212961196899414, "step": 1415 }, { "epoch": 1.0345205479452055, "grad_norm": 51.92081231985036, "learning_rate": 4.6263327439239783e-07, "logits/chosen": -2.9599719047546387, "logits/rejected": -2.1286568641662598, "logps/chosen": -630.3310546875, "logps/rejected": -442.978759765625, "loss": 0.2141, "rewards/accuracies": 0.875, "rewards/chosen": 1.1299922466278076, "rewards/margins": 2.664246082305908, "rewards/rejected": -1.534253716468811, "step": 1416 }, { "epoch": 1.0352511415525114, "grad_norm": 32.13589613364625, "learning_rate": 4.625493444835527e-07, "logits/chosen": -2.5871999263763428, "logits/rejected": -2.1958484649658203, "logps/chosen": -413.38616943359375, "logps/rejected": -557.7692260742188, "loss": 0.1841, "rewards/accuracies": 0.875, "rewards/chosen": 2.8806073665618896, "rewards/margins": 4.233229160308838, "rewards/rejected": -1.3526217937469482, "step": 1417 }, { "epoch": 1.0359817351598173, "grad_norm": 49.43456158302863, "learning_rate": 4.624653280533327e-07, "logits/chosen": -2.9586033821105957, "logits/rejected": -2.3853635787963867, "logps/chosen": -752.1710815429688, "logps/rejected": -578.9136352539062, "loss": 0.245, "rewards/accuracies": 0.75, "rewards/chosen": 3.4473650455474854, "rewards/margins": 3.482701063156128, "rewards/rejected": -0.035336121916770935, "step": 1418 }, { "epoch": 1.0367123287671234, "grad_norm": 46.10616358267623, "learning_rate": 4.623812251359379e-07, "logits/chosen": -2.834606170654297, "logits/rejected": -1.9992507696151733, "logps/chosen": -765.306640625, "logps/rejected": -527.2223510742188, "loss": 0.235, "rewards/accuracies": 0.75, "rewards/chosen": 2.8961868286132812, "rewards/margins": 2.9463791847229004, "rewards/rejected": -0.05019247531890869, "step": 1419 }, { "epoch": 1.0374429223744293, "grad_norm": 32.67292486954987, "learning_rate": 4.622970357656037e-07, "logits/chosen": -2.2606542110443115, "logits/rejected": -2.555755615234375, "logps/chosen": -436.4761962890625, "logps/rejected": -565.03173828125, "loss": 0.2348, "rewards/accuracies": 0.875, "rewards/chosen": 1.674644947052002, "rewards/margins": 1.9388858079910278, "rewards/rejected": -0.2642408013343811, "step": 1420 }, { "epoch": 1.0381735159817351, "grad_norm": 38.37019800218084, "learning_rate": 4.622127599766006e-07, "logits/chosen": -2.5554609298706055, "logits/rejected": -2.273404598236084, "logps/chosen": -421.2896728515625, "logps/rejected": -540.4016723632812, "loss": 0.2508, "rewards/accuracies": 1.0, "rewards/chosen": 1.984737515449524, "rewards/margins": 2.8084254264831543, "rewards/rejected": -0.8236879706382751, "step": 1421 }, { "epoch": 1.038904109589041, "grad_norm": 39.805962473447366, "learning_rate": 4.6212839780323444e-07, "logits/chosen": -2.9881839752197266, "logits/rejected": -2.6101958751678467, "logps/chosen": -370.53973388671875, "logps/rejected": -347.0181884765625, "loss": 0.2243, "rewards/accuracies": 1.0, "rewards/chosen": 1.8847897052764893, "rewards/margins": 2.5863475799560547, "rewards/rejected": -0.7015580534934998, "step": 1422 }, { "epoch": 1.039634703196347, "grad_norm": 19.942457198677744, "learning_rate": 4.62043949279846e-07, "logits/chosen": -2.9136645793914795, "logits/rejected": -1.957966923713684, "logps/chosen": -630.4873046875, "logps/rejected": -513.4038696289062, "loss": 0.1067, "rewards/accuracies": 1.0, "rewards/chosen": 3.9524176120758057, "rewards/margins": 4.82147216796875, "rewards/rejected": -0.8690543174743652, "step": 1423 }, { "epoch": 1.040365296803653, "grad_norm": 23.2439508259676, "learning_rate": 4.619594144408113e-07, "logits/chosen": -3.0769271850585938, "logits/rejected": -2.6917200088500977, "logps/chosen": -631.332275390625, "logps/rejected": -653.7772827148438, "loss": 0.1217, "rewards/accuracies": 1.0, "rewards/chosen": 3.3929708003997803, "rewards/margins": 4.113675117492676, "rewards/rejected": -0.7207044363021851, "step": 1424 }, { "epoch": 1.0410958904109588, "grad_norm": 33.18722965108812, "learning_rate": 4.618747933205415e-07, "logits/chosen": -2.7349061965942383, "logits/rejected": -2.92641544342041, "logps/chosen": -298.9700927734375, "logps/rejected": -427.8751220703125, "loss": 0.1763, "rewards/accuracies": 1.0, "rewards/chosen": 2.0782346725463867, "rewards/margins": 3.1917564868927, "rewards/rejected": -1.113521933555603, "step": 1425 }, { "epoch": 1.041826484018265, "grad_norm": 50.80461721988834, "learning_rate": 4.617900859534829e-07, "logits/chosen": -2.687934637069702, "logits/rejected": -1.734925389289856, "logps/chosen": -786.6649169921875, "logps/rejected": -499.6104736328125, "loss": 0.26, "rewards/accuracies": 1.0, "rewards/chosen": 4.353910446166992, "rewards/margins": 4.12760066986084, "rewards/rejected": 0.22630935907363892, "step": 1426 }, { "epoch": 1.0425570776255708, "grad_norm": 20.946876544317785, "learning_rate": 4.617052923741169e-07, "logits/chosen": -2.5845422744750977, "logits/rejected": -1.6798248291015625, "logps/chosen": -667.609619140625, "logps/rejected": -428.40606689453125, "loss": 0.1203, "rewards/accuracies": 1.0, "rewards/chosen": 3.247236967086792, "rewards/margins": 3.3531010150909424, "rewards/rejected": -0.10586392879486084, "step": 1427 }, { "epoch": 1.0432876712328767, "grad_norm": 40.739593487651085, "learning_rate": 4.6162041261696004e-07, "logits/chosen": -2.826704502105713, "logits/rejected": -2.229135513305664, "logps/chosen": -689.169921875, "logps/rejected": -593.1953125, "loss": 0.2243, "rewards/accuracies": 0.875, "rewards/chosen": 2.2965102195739746, "rewards/margins": 2.0813398361206055, "rewards/rejected": 0.2151707112789154, "step": 1428 }, { "epoch": 1.0440182648401826, "grad_norm": 33.56548956657343, "learning_rate": 4.6153544671656387e-07, "logits/chosen": -2.706489324569702, "logits/rejected": -1.7700309753417969, "logps/chosen": -550.683349609375, "logps/rejected": -442.35430908203125, "loss": 0.2021, "rewards/accuracies": 0.875, "rewards/chosen": 4.0127129554748535, "rewards/margins": 4.9898176193237305, "rewards/rejected": -0.9771050214767456, "step": 1429 }, { "epoch": 1.0447488584474887, "grad_norm": 39.42431807233639, "learning_rate": 4.614503947075149e-07, "logits/chosen": -2.8182921409606934, "logits/rejected": -1.546779751777649, "logps/chosen": -1016.8969116210938, "logps/rejected": -533.2545776367188, "loss": 0.2315, "rewards/accuracies": 0.875, "rewards/chosen": 3.835216999053955, "rewards/margins": 3.588618278503418, "rewards/rejected": 0.2465984970331192, "step": 1430 }, { "epoch": 1.0454794520547945, "grad_norm": 35.56321911344553, "learning_rate": 4.6136525662443497e-07, "logits/chosen": -2.8152456283569336, "logits/rejected": -2.531270980834961, "logps/chosen": -564.4029541015625, "logps/rejected": -537.362060546875, "loss": 0.2473, "rewards/accuracies": 1.0, "rewards/chosen": 2.306016445159912, "rewards/margins": 1.758901834487915, "rewards/rejected": 0.5471144318580627, "step": 1431 }, { "epoch": 1.0462100456621004, "grad_norm": 35.85514000046751, "learning_rate": 4.6128003250198076e-07, "logits/chosen": -2.5336802005767822, "logits/rejected": -2.1649651527404785, "logps/chosen": -640.3236083984375, "logps/rejected": -599.2962646484375, "loss": 0.2013, "rewards/accuracies": 0.875, "rewards/chosen": 2.771111249923706, "rewards/margins": 3.2486720085144043, "rewards/rejected": -0.4775606393814087, "step": 1432 }, { "epoch": 1.0469406392694065, "grad_norm": 36.31332023600293, "learning_rate": 4.6119472237484405e-07, "logits/chosen": -3.235915184020996, "logits/rejected": -1.8688842058181763, "logps/chosen": -703.8973999023438, "logps/rejected": -392.6524658203125, "loss": 0.2781, "rewards/accuracies": 1.0, "rewards/chosen": 4.543827533721924, "rewards/margins": 5.20977783203125, "rewards/rejected": -0.6659499406814575, "step": 1433 }, { "epoch": 1.0476712328767124, "grad_norm": 24.08518867429862, "learning_rate": 4.6110932627775144e-07, "logits/chosen": -2.59376859664917, "logits/rejected": -1.8644206523895264, "logps/chosen": -794.4132080078125, "logps/rejected": -450.134033203125, "loss": 0.1616, "rewards/accuracies": 1.0, "rewards/chosen": 3.300259590148926, "rewards/margins": 4.100785732269287, "rewards/rejected": -0.800525963306427, "step": 1434 }, { "epoch": 1.0484018264840183, "grad_norm": 45.1987351502195, "learning_rate": 4.6102384424546486e-07, "logits/chosen": -2.926713466644287, "logits/rejected": -2.3434677124023438, "logps/chosen": -876.216552734375, "logps/rejected": -698.6642456054688, "loss": 0.2524, "rewards/accuracies": 1.0, "rewards/chosen": 2.9703550338745117, "rewards/margins": 1.9934906959533691, "rewards/rejected": 0.9768642783164978, "step": 1435 }, { "epoch": 1.0491324200913241, "grad_norm": 34.920375387264954, "learning_rate": 4.6093827631278093e-07, "logits/chosen": -2.652672290802002, "logits/rejected": -2.1679441928863525, "logps/chosen": -1017.1607666015625, "logps/rejected": -699.3276977539062, "loss": 0.1929, "rewards/accuracies": 1.0, "rewards/chosen": 3.176413059234619, "rewards/margins": 2.335569143295288, "rewards/rejected": 0.8408439755439758, "step": 1436 }, { "epoch": 1.0498630136986302, "grad_norm": 33.41598862668045, "learning_rate": 4.608526225145315e-07, "logits/chosen": -2.9202065467834473, "logits/rejected": -2.036098003387451, "logps/chosen": -720.5414428710938, "logps/rejected": -522.113525390625, "loss": 0.1644, "rewards/accuracies": 1.0, "rewards/chosen": 3.416872978210449, "rewards/margins": 4.4370012283325195, "rewards/rejected": -1.0201284885406494, "step": 1437 }, { "epoch": 1.050593607305936, "grad_norm": 31.369771453345916, "learning_rate": 4.607668828855831e-07, "logits/chosen": -2.850606679916382, "logits/rejected": -2.2623653411865234, "logps/chosen": -607.431884765625, "logps/rejected": -559.9776611328125, "loss": 0.1968, "rewards/accuracies": 1.0, "rewards/chosen": 2.4464893341064453, "rewards/margins": 2.786686420440674, "rewards/rejected": -0.34019729495048523, "step": 1438 }, { "epoch": 1.051324200913242, "grad_norm": 31.34345109061112, "learning_rate": 4.606810574608373e-07, "logits/chosen": -2.5581278800964355, "logits/rejected": -2.500223159790039, "logps/chosen": -713.2123413085938, "logps/rejected": -625.328125, "loss": 0.1938, "rewards/accuracies": 0.875, "rewards/chosen": 3.491422653198242, "rewards/margins": 3.1483399868011475, "rewards/rejected": 0.3430825471878052, "step": 1439 }, { "epoch": 1.0520547945205478, "grad_norm": 36.847979043229934, "learning_rate": 4.6059514627523065e-07, "logits/chosen": -2.5971076488494873, "logits/rejected": -2.041438579559326, "logps/chosen": -613.0286865234375, "logps/rejected": -478.13818359375, "loss": 0.2666, "rewards/accuracies": 1.0, "rewards/chosen": 2.1806790828704834, "rewards/margins": 3.2548179626464844, "rewards/rejected": -1.074138879776001, "step": 1440 }, { "epoch": 1.052785388127854, "grad_norm": 44.50199422761159, "learning_rate": 4.6050914936373466e-07, "logits/chosen": -3.223639965057373, "logits/rejected": -2.8258302211761475, "logps/chosen": -882.41845703125, "logps/rejected": -661.978271484375, "loss": 0.2442, "rewards/accuracies": 0.875, "rewards/chosen": 3.4550740718841553, "rewards/margins": 2.699805736541748, "rewards/rejected": 0.7552685737609863, "step": 1441 }, { "epoch": 1.0535159817351598, "grad_norm": 45.35967102900301, "learning_rate": 4.604230667613555e-07, "logits/chosen": -3.1709487438201904, "logits/rejected": -2.411238193511963, "logps/chosen": -825.8873291015625, "logps/rejected": -561.54150390625, "loss": 0.2647, "rewards/accuracies": 1.0, "rewards/chosen": 3.202996253967285, "rewards/margins": 3.5148203372955322, "rewards/rejected": -0.31182408332824707, "step": 1442 }, { "epoch": 1.0542465753424657, "grad_norm": 42.941617854306706, "learning_rate": 4.6033689850313453e-07, "logits/chosen": -3.2076570987701416, "logits/rejected": -2.320181369781494, "logps/chosen": -526.8613891601562, "logps/rejected": -457.41217041015625, "loss": 0.2337, "rewards/accuracies": 1.0, "rewards/chosen": 4.1049089431762695, "rewards/margins": 4.572461128234863, "rewards/rejected": -0.46755218505859375, "step": 1443 }, { "epoch": 1.0549771689497718, "grad_norm": 32.89833824343849, "learning_rate": 4.602506446241476e-07, "logits/chosen": -3.0771701335906982, "logits/rejected": -2.4858617782592773, "logps/chosen": -611.6174926757812, "logps/rejected": -595.60888671875, "loss": 0.1977, "rewards/accuracies": 1.0, "rewards/chosen": 2.6377668380737305, "rewards/margins": 2.270705223083496, "rewards/rejected": 0.36706170439720154, "step": 1444 }, { "epoch": 1.0557077625570777, "grad_norm": 20.7211743255465, "learning_rate": 4.60164305159506e-07, "logits/chosen": -2.820619583129883, "logits/rejected": -2.0012145042419434, "logps/chosen": -824.5753173828125, "logps/rejected": -520.096923828125, "loss": 0.1433, "rewards/accuracies": 1.0, "rewards/chosen": 3.148932933807373, "rewards/margins": 3.566713333129883, "rewards/rejected": -0.4177806079387665, "step": 1445 }, { "epoch": 1.0564383561643835, "grad_norm": 46.144119724578665, "learning_rate": 4.600778801443552e-07, "logits/chosen": -2.61678409576416, "logits/rejected": -1.9076387882232666, "logps/chosen": -567.6495971679688, "logps/rejected": -249.34317016601562, "loss": 0.288, "rewards/accuracies": 1.0, "rewards/chosen": 1.6841893196105957, "rewards/margins": 2.2116384506225586, "rewards/rejected": -0.5274492502212524, "step": 1446 }, { "epoch": 1.0571689497716894, "grad_norm": 23.002528839408342, "learning_rate": 4.5999136961387587e-07, "logits/chosen": -2.166635274887085, "logits/rejected": -2.1075940132141113, "logps/chosen": -515.6983642578125, "logps/rejected": -626.2881469726562, "loss": 0.1412, "rewards/accuracies": 1.0, "rewards/chosen": 2.2892274856567383, "rewards/margins": 2.306985378265381, "rewards/rejected": -0.017757952213287354, "step": 1447 }, { "epoch": 1.0578995433789955, "grad_norm": 33.941835804408655, "learning_rate": 4.5990477360328337e-07, "logits/chosen": -2.44189715385437, "logits/rejected": -1.9676796197891235, "logps/chosen": -594.6326904296875, "logps/rejected": -535.4591064453125, "loss": 0.2215, "rewards/accuracies": 0.875, "rewards/chosen": 2.917513132095337, "rewards/margins": 3.4557602405548096, "rewards/rejected": -0.5382471084594727, "step": 1448 }, { "epoch": 1.0586301369863014, "grad_norm": 39.22841172075174, "learning_rate": 4.5981809214782796e-07, "logits/chosen": -2.6107752323150635, "logits/rejected": -2.2565207481384277, "logps/chosen": -562.8198852539062, "logps/rejected": -592.9638671875, "loss": 0.181, "rewards/accuracies": 1.0, "rewards/chosen": 2.7330551147460938, "rewards/margins": 3.135207176208496, "rewards/rejected": -0.40215203166007996, "step": 1449 }, { "epoch": 1.0593607305936072, "grad_norm": 31.144351688385548, "learning_rate": 4.597313252827946e-07, "logits/chosen": -1.9404594898223877, "logits/rejected": -1.8036457300186157, "logps/chosen": -547.4194946289062, "logps/rejected": -457.3371887207031, "loss": 0.1792, "rewards/accuracies": 0.875, "rewards/chosen": 1.314603328704834, "rewards/margins": 2.41005277633667, "rewards/rejected": -1.0954492092132568, "step": 1450 }, { "epoch": 1.0600913242009133, "grad_norm": 50.98881856976709, "learning_rate": 4.59644473043503e-07, "logits/chosen": -2.6036226749420166, "logits/rejected": -2.388216495513916, "logps/chosen": -617.7408447265625, "logps/rejected": -649.134033203125, "loss": 0.2772, "rewards/accuracies": 1.0, "rewards/chosen": 2.455141067504883, "rewards/margins": 2.2149763107299805, "rewards/rejected": 0.24016478657722473, "step": 1451 }, { "epoch": 1.0608219178082192, "grad_norm": 22.578983953168084, "learning_rate": 4.5955753546530773e-07, "logits/chosen": -2.444519519805908, "logits/rejected": -1.6987899541854858, "logps/chosen": -756.8573608398438, "logps/rejected": -499.98687744140625, "loss": 0.1191, "rewards/accuracies": 1.0, "rewards/chosen": 2.8634376525878906, "rewards/margins": 2.6893367767333984, "rewards/rejected": 0.1741006076335907, "step": 1452 }, { "epoch": 1.061552511415525, "grad_norm": 29.683201785887565, "learning_rate": 4.5947051258359795e-07, "logits/chosen": -2.9006354808807373, "logits/rejected": -2.592783212661743, "logps/chosen": -508.7384033203125, "logps/rejected": -447.9510498046875, "loss": 0.2086, "rewards/accuracies": 1.0, "rewards/chosen": 2.473557472229004, "rewards/margins": 2.912179946899414, "rewards/rejected": -0.4386223554611206, "step": 1453 }, { "epoch": 1.062283105022831, "grad_norm": 33.03506363982821, "learning_rate": 4.5938340443379764e-07, "logits/chosen": -2.910409450531006, "logits/rejected": -2.6073756217956543, "logps/chosen": -540.6859130859375, "logps/rejected": -451.5372619628906, "loss": 0.2194, "rewards/accuracies": 0.875, "rewards/chosen": 2.114633798599243, "rewards/margins": 2.2767953872680664, "rewards/rejected": -0.16216151416301727, "step": 1454 }, { "epoch": 1.063013698630137, "grad_norm": 23.912115038312038, "learning_rate": 4.5929621105136546e-07, "logits/chosen": -3.448049545288086, "logits/rejected": -2.38261079788208, "logps/chosen": -1067.5703125, "logps/rejected": -568.6386108398438, "loss": 0.1552, "rewards/accuracies": 1.0, "rewards/chosen": 2.8829641342163086, "rewards/margins": 2.582099437713623, "rewards/rejected": 0.30086439847946167, "step": 1455 }, { "epoch": 1.063744292237443, "grad_norm": 28.082460233282735, "learning_rate": 4.592089324717948e-07, "logits/chosen": -3.1811070442199707, "logits/rejected": -2.7604682445526123, "logps/chosen": -588.43798828125, "logps/rejected": -470.07073974609375, "loss": 0.1804, "rewards/accuracies": 1.0, "rewards/chosen": 1.8648351430892944, "rewards/margins": 2.0482606887817383, "rewards/rejected": -0.18342572450637817, "step": 1456 }, { "epoch": 1.0644748858447488, "grad_norm": 33.26837155465289, "learning_rate": 4.591215687306137e-07, "logits/chosen": -2.8844799995422363, "logits/rejected": -2.401341199874878, "logps/chosen": -711.73681640625, "logps/rejected": -593.584228515625, "loss": 0.1606, "rewards/accuracies": 1.0, "rewards/chosen": 3.3303418159484863, "rewards/margins": 2.9260177612304688, "rewards/rejected": 0.40432390570640564, "step": 1457 }, { "epoch": 1.0652054794520547, "grad_norm": 28.56397442173664, "learning_rate": 4.5903411986338493e-07, "logits/chosen": -2.988771677017212, "logits/rejected": -1.9257924556732178, "logps/chosen": -986.680908203125, "logps/rejected": -696.0423583984375, "loss": 0.1607, "rewards/accuracies": 1.0, "rewards/chosen": 4.827862739562988, "rewards/margins": 4.467729568481445, "rewards/rejected": 0.3601332902908325, "step": 1458 }, { "epoch": 1.0659360730593608, "grad_norm": 37.092280602214764, "learning_rate": 4.589465859057057e-07, "logits/chosen": -3.293182134628296, "logits/rejected": -2.35971999168396, "logps/chosen": -685.83740234375, "logps/rejected": -557.7835693359375, "loss": 0.2308, "rewards/accuracies": 0.875, "rewards/chosen": 2.3950512409210205, "rewards/margins": 2.475494861602783, "rewards/rejected": -0.0804433822631836, "step": 1459 }, { "epoch": 1.0666666666666667, "grad_norm": 21.15715009163569, "learning_rate": 4.5885896689320813e-07, "logits/chosen": -2.1827666759490967, "logits/rejected": -2.194277048110962, "logps/chosen": -504.7818298339844, "logps/rejected": -536.511474609375, "loss": 0.1666, "rewards/accuracies": 1.0, "rewards/chosen": 2.042276382446289, "rewards/margins": 4.422617435455322, "rewards/rejected": -2.380341053009033, "step": 1460 }, { "epoch": 1.0673972602739725, "grad_norm": 28.73791674576845, "learning_rate": 4.5877126286155887e-07, "logits/chosen": -3.117356300354004, "logits/rejected": -2.041433572769165, "logps/chosen": -715.1044311523438, "logps/rejected": -454.15948486328125, "loss": 0.1584, "rewards/accuracies": 1.0, "rewards/chosen": 3.0055503845214844, "rewards/margins": 3.5102996826171875, "rewards/rejected": -0.5047492980957031, "step": 1461 }, { "epoch": 1.0681278538812786, "grad_norm": 32.09016114610228, "learning_rate": 4.58683473846459e-07, "logits/chosen": -2.5085644721984863, "logits/rejected": -2.1980693340301514, "logps/chosen": -600.9113159179688, "logps/rejected": -627.8260498046875, "loss": 0.201, "rewards/accuracies": 1.0, "rewards/chosen": 3.5969958305358887, "rewards/margins": 3.4133658409118652, "rewards/rejected": 0.18363022804260254, "step": 1462 }, { "epoch": 1.0688584474885845, "grad_norm": 25.86234759160929, "learning_rate": 4.585955998836445e-07, "logits/chosen": -2.5028936862945557, "logits/rejected": -2.1842215061187744, "logps/chosen": -422.743408203125, "logps/rejected": -412.2728271484375, "loss": 0.1723, "rewards/accuracies": 0.875, "rewards/chosen": 1.7429375648498535, "rewards/margins": 2.2382137775421143, "rewards/rejected": -0.4952763020992279, "step": 1463 }, { "epoch": 1.0695890410958904, "grad_norm": 29.755330609943343, "learning_rate": 4.585076410088857e-07, "logits/chosen": -2.94844388961792, "logits/rejected": -2.7647922039031982, "logps/chosen": -1031.92431640625, "logps/rejected": -970.3587646484375, "loss": 0.1856, "rewards/accuracies": 0.875, "rewards/chosen": 5.150813579559326, "rewards/margins": 2.8646130561828613, "rewards/rejected": 2.286200523376465, "step": 1464 }, { "epoch": 1.0703196347031962, "grad_norm": 20.9145177621349, "learning_rate": 4.584195972579876e-07, "logits/chosen": -2.913576126098633, "logits/rejected": -2.103358030319214, "logps/chosen": -432.2129821777344, "logps/rejected": -398.9280090332031, "loss": 0.106, "rewards/accuracies": 1.0, "rewards/chosen": 1.9471166133880615, "rewards/margins": 3.2995965480804443, "rewards/rejected": -1.3524796962738037, "step": 1465 }, { "epoch": 1.0710502283105023, "grad_norm": 24.130925691051125, "learning_rate": 4.583314686667897e-07, "logits/chosen": -3.1079556941986084, "logits/rejected": -2.1374032497406006, "logps/chosen": -821.7852783203125, "logps/rejected": -530.6197509765625, "loss": 0.1418, "rewards/accuracies": 1.0, "rewards/chosen": 3.55428147315979, "rewards/margins": 3.3179171085357666, "rewards/rejected": 0.23636415600776672, "step": 1466 }, { "epoch": 1.0717808219178082, "grad_norm": 37.373004362482355, "learning_rate": 4.58243255271166e-07, "logits/chosen": -3.1115152835845947, "logits/rejected": -2.659215211868286, "logps/chosen": -729.8424072265625, "logps/rejected": -562.7523193359375, "loss": 0.2287, "rewards/accuracies": 1.0, "rewards/chosen": 2.976872205734253, "rewards/margins": 1.4767817258834839, "rewards/rejected": 1.5000905990600586, "step": 1467 }, { "epoch": 1.072511415525114, "grad_norm": 34.67846694039311, "learning_rate": 4.5815495710702525e-07, "logits/chosen": -2.9086384773254395, "logits/rejected": -2.1107101440429688, "logps/chosen": -691.9732666015625, "logps/rejected": -470.02642822265625, "loss": 0.1502, "rewards/accuracies": 1.0, "rewards/chosen": 2.207524299621582, "rewards/margins": 2.190422773361206, "rewards/rejected": 0.017101529985666275, "step": 1468 }, { "epoch": 1.0732420091324202, "grad_norm": 36.015980882189645, "learning_rate": 4.580665742103104e-07, "logits/chosen": -2.653148651123047, "logits/rejected": -2.453533411026001, "logps/chosen": -857.03857421875, "logps/rejected": -710.7476196289062, "loss": 0.2072, "rewards/accuracies": 0.875, "rewards/chosen": 2.6194727420806885, "rewards/margins": 2.4405479431152344, "rewards/rejected": 0.17892485857009888, "step": 1469 }, { "epoch": 1.073972602739726, "grad_norm": 32.55673570920559, "learning_rate": 4.57978106616999e-07, "logits/chosen": -2.6734604835510254, "logits/rejected": -1.783971905708313, "logps/chosen": -663.5574951171875, "logps/rejected": -451.1529846191406, "loss": 0.1829, "rewards/accuracies": 1.0, "rewards/chosen": 3.2801337242126465, "rewards/margins": 4.3869853019714355, "rewards/rejected": -1.106851577758789, "step": 1470 }, { "epoch": 1.074703196347032, "grad_norm": 39.50614012304764, "learning_rate": 4.578895543631032e-07, "logits/chosen": -3.0223615169525146, "logits/rejected": -2.381610870361328, "logps/chosen": -511.1317138671875, "logps/rejected": -388.7618408203125, "loss": 0.2642, "rewards/accuracies": 0.875, "rewards/chosen": 2.2834372520446777, "rewards/margins": 1.5929927825927734, "rewards/rejected": 0.6904443502426147, "step": 1471 }, { "epoch": 1.0754337899543378, "grad_norm": 30.3216138844008, "learning_rate": 4.578009174846693e-07, "logits/chosen": -2.6315579414367676, "logits/rejected": -2.5169787406921387, "logps/chosen": -506.76629638671875, "logps/rejected": -548.3790893554688, "loss": 0.1491, "rewards/accuracies": 1.0, "rewards/chosen": 1.9979571104049683, "rewards/margins": 1.828883171081543, "rewards/rejected": 0.1690739095211029, "step": 1472 }, { "epoch": 1.076164383561644, "grad_norm": 50.1915009496441, "learning_rate": 4.5771219601777855e-07, "logits/chosen": -2.8049769401550293, "logits/rejected": -2.654362201690674, "logps/chosen": -610.209228515625, "logps/rejected": -656.5407104492188, "loss": 0.2466, "rewards/accuracies": 1.0, "rewards/chosen": 3.8624794483184814, "rewards/margins": 4.429845333099365, "rewards/rejected": -0.5673661231994629, "step": 1473 }, { "epoch": 1.0768949771689498, "grad_norm": 41.121561109543705, "learning_rate": 4.5762338999854623e-07, "logits/chosen": -2.896000623703003, "logits/rejected": -2.172206401824951, "logps/chosen": -868.0030517578125, "logps/rejected": -641.514404296875, "loss": 0.1669, "rewards/accuracies": 1.0, "rewards/chosen": 4.955514907836914, "rewards/margins": 3.521149158477783, "rewards/rejected": 1.43436598777771, "step": 1474 }, { "epoch": 1.0776255707762556, "grad_norm": 46.15483769843363, "learning_rate": 4.5753449946312205e-07, "logits/chosen": -3.0783846378326416, "logits/rejected": -2.30534029006958, "logps/chosen": -655.787841796875, "logps/rejected": -472.53533935546875, "loss": 0.238, "rewards/accuracies": 0.875, "rewards/chosen": 3.939643383026123, "rewards/margins": 3.654099225997925, "rewards/rejected": 0.28554466366767883, "step": 1475 }, { "epoch": 1.0783561643835617, "grad_norm": 27.090614005361708, "learning_rate": 4.5744552444769034e-07, "logits/chosen": -2.8783016204833984, "logits/rejected": -2.144338369369507, "logps/chosen": -835.0466918945312, "logps/rejected": -649.6325073242188, "loss": 0.1872, "rewards/accuracies": 1.0, "rewards/chosen": 4.497734546661377, "rewards/margins": 3.9682369232177734, "rewards/rejected": 0.5294976234436035, "step": 1476 }, { "epoch": 1.0790867579908676, "grad_norm": 46.730493402829325, "learning_rate": 4.573564649884697e-07, "logits/chosen": -2.2317700386047363, "logits/rejected": -2.596224546432495, "logps/chosen": -647.533935546875, "logps/rejected": -679.8370361328125, "loss": 0.167, "rewards/accuracies": 0.875, "rewards/chosen": 3.755335807800293, "rewards/margins": 3.6764070987701416, "rewards/rejected": 0.07892875373363495, "step": 1477 }, { "epoch": 1.0798173515981735, "grad_norm": 35.32025115428205, "learning_rate": 4.5726732112171306e-07, "logits/chosen": -2.5737996101379395, "logits/rejected": -2.903653621673584, "logps/chosen": -669.225341796875, "logps/rejected": -868.9080810546875, "loss": 0.2173, "rewards/accuracies": 1.0, "rewards/chosen": 3.2083241939544678, "rewards/margins": 3.393481731414795, "rewards/rejected": -0.18515777587890625, "step": 1478 }, { "epoch": 1.0805479452054794, "grad_norm": 30.97063014600486, "learning_rate": 4.571780928837078e-07, "logits/chosen": -2.9706428050994873, "logits/rejected": -1.9356141090393066, "logps/chosen": -614.7152099609375, "logps/rejected": -458.89801025390625, "loss": 0.1385, "rewards/accuracies": 0.875, "rewards/chosen": 2.902116060256958, "rewards/margins": 3.002382516860962, "rewards/rejected": -0.10026675462722778, "step": 1479 }, { "epoch": 1.0812785388127855, "grad_norm": 29.688873323498655, "learning_rate": 4.570887803107756e-07, "logits/chosen": -2.251973867416382, "logits/rejected": -2.269456148147583, "logps/chosen": -689.0787353515625, "logps/rejected": -615.2216186523438, "loss": 0.1868, "rewards/accuracies": 0.875, "rewards/chosen": 3.0064640045166016, "rewards/margins": 3.141812801361084, "rewards/rejected": -0.1353491246700287, "step": 1480 }, { "epoch": 1.0820091324200913, "grad_norm": 43.90971683377012, "learning_rate": 4.5699938343927236e-07, "logits/chosen": -2.691474199295044, "logits/rejected": -2.1759209632873535, "logps/chosen": -586.8297729492188, "logps/rejected": -437.76446533203125, "loss": 0.2674, "rewards/accuracies": 0.875, "rewards/chosen": 2.6667418479919434, "rewards/margins": 3.6167287826538086, "rewards/rejected": -0.9499868750572205, "step": 1481 }, { "epoch": 1.0827397260273972, "grad_norm": 41.04550697187361, "learning_rate": 4.5690990230558857e-07, "logits/chosen": -2.450207233428955, "logits/rejected": -2.4690418243408203, "logps/chosen": -613.2505493164062, "logps/rejected": -655.1104736328125, "loss": 0.2224, "rewards/accuracies": 0.875, "rewards/chosen": 2.9258365631103516, "rewards/margins": 3.10201358795166, "rewards/rejected": -0.17617683112621307, "step": 1482 }, { "epoch": 1.0834703196347033, "grad_norm": 38.402712281125076, "learning_rate": 4.568203369461488e-07, "logits/chosen": -2.489856004714966, "logits/rejected": -1.890661597251892, "logps/chosen": -430.02996826171875, "logps/rejected": -449.71539306640625, "loss": 0.1906, "rewards/accuracies": 0.75, "rewards/chosen": 1.0353591442108154, "rewards/margins": 1.894580602645874, "rewards/rejected": -0.8592214584350586, "step": 1483 }, { "epoch": 1.0842009132420092, "grad_norm": 41.15569979937849, "learning_rate": 4.567306873974119e-07, "logits/chosen": -2.719943046569824, "logits/rejected": -2.3531670570373535, "logps/chosen": -904.10546875, "logps/rejected": -678.26025390625, "loss": 0.2062, "rewards/accuracies": 0.875, "rewards/chosen": 3.410343647003174, "rewards/margins": 1.9655518531799316, "rewards/rejected": 1.444791555404663, "step": 1484 }, { "epoch": 1.084931506849315, "grad_norm": 30.66011817784934, "learning_rate": 4.566409536958712e-07, "logits/chosen": -3.027714490890503, "logits/rejected": -2.4158146381378174, "logps/chosen": -579.322021484375, "logps/rejected": -458.0976867675781, "loss": 0.1717, "rewards/accuracies": 0.875, "rewards/chosen": 2.588625431060791, "rewards/margins": 2.5980966091156006, "rewards/rejected": -0.009471118450164795, "step": 1485 }, { "epoch": 1.085662100456621, "grad_norm": 34.0156214113457, "learning_rate": 4.565511358780539e-07, "logits/chosen": -2.596438407897949, "logits/rejected": -2.121675968170166, "logps/chosen": -711.5881958007812, "logps/rejected": -671.420166015625, "loss": 0.1446, "rewards/accuracies": 1.0, "rewards/chosen": 2.90271258354187, "rewards/margins": 3.5774905681610107, "rewards/rejected": -0.6747778654098511, "step": 1486 }, { "epoch": 1.086392694063927, "grad_norm": 23.824104425143798, "learning_rate": 4.564612339805219e-07, "logits/chosen": -2.3539271354675293, "logits/rejected": -2.3195509910583496, "logps/chosen": -639.1788940429688, "logps/rejected": -623.8629760742188, "loss": 0.1544, "rewards/accuracies": 1.0, "rewards/chosen": 2.6978883743286133, "rewards/margins": 3.9143102169036865, "rewards/rejected": -1.2164218425750732, "step": 1487 }, { "epoch": 1.087123287671233, "grad_norm": 46.388575127327954, "learning_rate": 4.5637124803987114e-07, "logits/chosen": -2.4155468940734863, "logits/rejected": -2.0347328186035156, "logps/chosen": -880.484130859375, "logps/rejected": -740.9441528320312, "loss": 0.2322, "rewards/accuracies": 0.75, "rewards/chosen": 2.0835490226745605, "rewards/margins": 1.0373140573501587, "rewards/rejected": 1.0462348461151123, "step": 1488 }, { "epoch": 1.0878538812785388, "grad_norm": 27.454713466068196, "learning_rate": 4.562811780927315e-07, "logits/chosen": -2.417841911315918, "logits/rejected": -2.4718751907348633, "logps/chosen": -274.21600341796875, "logps/rejected": -391.2403564453125, "loss": 0.1805, "rewards/accuracies": 0.875, "rewards/chosen": 1.2872254848480225, "rewards/margins": 1.6358366012573242, "rewards/rejected": -0.3486112952232361, "step": 1489 }, { "epoch": 1.0885844748858449, "grad_norm": 24.52984296323116, "learning_rate": 4.561910241757675e-07, "logits/chosen": -2.41267991065979, "logits/rejected": -2.446415662765503, "logps/chosen": -376.77349853515625, "logps/rejected": -391.68389892578125, "loss": 0.1344, "rewards/accuracies": 1.0, "rewards/chosen": 2.482941150665283, "rewards/margins": 3.0181915760040283, "rewards/rejected": -0.5352505445480347, "step": 1490 }, { "epoch": 1.0893150684931507, "grad_norm": 33.14413693968456, "learning_rate": 4.561007863256775e-07, "logits/chosen": -2.5416250228881836, "logits/rejected": -2.2107245922088623, "logps/chosen": -669.5719604492188, "logps/rejected": -643.0001220703125, "loss": 0.167, "rewards/accuracies": 1.0, "rewards/chosen": 3.511162281036377, "rewards/margins": 4.33363676071167, "rewards/rejected": -0.8224747180938721, "step": 1491 }, { "epoch": 1.0900456621004566, "grad_norm": 27.13494043987901, "learning_rate": 4.5601046457919425e-07, "logits/chosen": -2.927473545074463, "logits/rejected": -1.8707928657531738, "logps/chosen": -642.883056640625, "logps/rejected": -475.7174072265625, "loss": 0.146, "rewards/accuracies": 1.0, "rewards/chosen": 3.073038339614868, "rewards/margins": 3.542058229446411, "rewards/rejected": -0.46902021765708923, "step": 1492 }, { "epoch": 1.0907762557077625, "grad_norm": 24.975543457472426, "learning_rate": 4.559200589730845e-07, "logits/chosen": -3.0531527996063232, "logits/rejected": -2.467115640640259, "logps/chosen": -969.1026611328125, "logps/rejected": -647.805419921875, "loss": 0.1254, "rewards/accuracies": 1.0, "rewards/chosen": 3.7411422729492188, "rewards/margins": 2.748704433441162, "rewards/rejected": 0.9924381971359253, "step": 1493 }, { "epoch": 1.0915068493150686, "grad_norm": 35.666324031526926, "learning_rate": 4.558295695441492e-07, "logits/chosen": -2.427062511444092, "logits/rejected": -1.5727925300598145, "logps/chosen": -471.24774169921875, "logps/rejected": -264.85687255859375, "loss": 0.183, "rewards/accuracies": 0.875, "rewards/chosen": 2.540520668029785, "rewards/margins": 3.786698341369629, "rewards/rejected": -1.2461776733398438, "step": 1494 }, { "epoch": 1.0922374429223745, "grad_norm": 33.510516437591086, "learning_rate": 4.5573899632922354e-07, "logits/chosen": -3.074387788772583, "logits/rejected": -2.2003183364868164, "logps/chosen": -819.1826171875, "logps/rejected": -493.7804870605469, "loss": 0.1778, "rewards/accuracies": 1.0, "rewards/chosen": 3.8250913619995117, "rewards/margins": 3.9494266510009766, "rewards/rejected": -0.124335378408432, "step": 1495 }, { "epoch": 1.0929680365296803, "grad_norm": 51.027958405784766, "learning_rate": 4.556483393651765e-07, "logits/chosen": -2.5792291164398193, "logits/rejected": -2.105335235595703, "logps/chosen": -397.06805419921875, "logps/rejected": -279.1260986328125, "loss": 0.2398, "rewards/accuracies": 1.0, "rewards/chosen": 2.0330491065979004, "rewards/margins": 3.43365216255188, "rewards/rejected": -1.4006032943725586, "step": 1496 }, { "epoch": 1.0936986301369862, "grad_norm": 35.912355333854116, "learning_rate": 4.5555759868891154e-07, "logits/chosen": -2.5151431560516357, "logits/rejected": -2.174590587615967, "logps/chosen": -444.9607849121094, "logps/rejected": -508.60797119140625, "loss": 0.2807, "rewards/accuracies": 0.75, "rewards/chosen": 1.5369935035705566, "rewards/margins": 1.6318026781082153, "rewards/rejected": -0.0948091447353363, "step": 1497 }, { "epoch": 1.0944292237442923, "grad_norm": 47.03585681341284, "learning_rate": 4.554667743373658e-07, "logits/chosen": -3.0077760219573975, "logits/rejected": -2.026265859603882, "logps/chosen": -723.1605834960938, "logps/rejected": -387.92083740234375, "loss": 0.1817, "rewards/accuracies": 0.875, "rewards/chosen": 3.014281749725342, "rewards/margins": 4.192314147949219, "rewards/rejected": -1.178032398223877, "step": 1498 }, { "epoch": 1.0951598173515982, "grad_norm": 32.793485934352304, "learning_rate": 4.5537586634751086e-07, "logits/chosen": -2.6287405490875244, "logits/rejected": -1.7764177322387695, "logps/chosen": -319.56085205078125, "logps/rejected": -279.2174072265625, "loss": 0.1589, "rewards/accuracies": 1.0, "rewards/chosen": 2.42166805267334, "rewards/margins": 4.019059181213379, "rewards/rejected": -1.59739089012146, "step": 1499 }, { "epoch": 1.095890410958904, "grad_norm": 33.87380179788191, "learning_rate": 4.55284874756352e-07, "logits/chosen": -3.267770767211914, "logits/rejected": -2.6546857357025146, "logps/chosen": -535.7847290039062, "logps/rejected": -492.256103515625, "loss": 0.1988, "rewards/accuracies": 1.0, "rewards/chosen": 2.3743746280670166, "rewards/margins": 2.525397777557373, "rewards/rejected": -0.1510230451822281, "step": 1500 }, { "epoch": 1.0966210045662101, "grad_norm": 46.251227285236716, "learning_rate": 4.5519379960092896e-07, "logits/chosen": -2.982971429824829, "logits/rejected": -1.8854467868804932, "logps/chosen": -700.8868408203125, "logps/rejected": -485.90447998046875, "loss": 0.2592, "rewards/accuracies": 0.875, "rewards/chosen": 3.197126865386963, "rewards/margins": 3.913501262664795, "rewards/rejected": -0.7163746953010559, "step": 1501 }, { "epoch": 1.097351598173516, "grad_norm": 67.67685526294237, "learning_rate": 4.55102640918315e-07, "logits/chosen": -2.688753843307495, "logits/rejected": -1.821669340133667, "logps/chosen": -757.13330078125, "logps/rejected": -594.1512451171875, "loss": 0.351, "rewards/accuracies": 0.875, "rewards/chosen": 2.1261768341064453, "rewards/margins": 1.9740560054779053, "rewards/rejected": 0.15212059020996094, "step": 1502 }, { "epoch": 1.098082191780822, "grad_norm": 37.29213875981438, "learning_rate": 4.550113987456178e-07, "logits/chosen": -2.877126455307007, "logits/rejected": -1.7358512878417969, "logps/chosen": -582.955078125, "logps/rejected": -458.78509521484375, "loss": 0.2003, "rewards/accuracies": 0.875, "rewards/chosen": 3.2459912300109863, "rewards/margins": 3.519138813018799, "rewards/rejected": -0.2731473445892334, "step": 1503 }, { "epoch": 1.0988127853881278, "grad_norm": 44.1286723308065, "learning_rate": 4.549200731199786e-07, "logits/chosen": -3.064656972885132, "logits/rejected": -2.131466865539551, "logps/chosen": -424.8863525390625, "logps/rejected": -403.0836181640625, "loss": 0.2578, "rewards/accuracies": 0.875, "rewards/chosen": 2.164715051651001, "rewards/margins": 2.9682912826538086, "rewards/rejected": -0.8035762310028076, "step": 1504 }, { "epoch": 1.0995433789954339, "grad_norm": 36.05798755888735, "learning_rate": 4.548286640785731e-07, "logits/chosen": -2.676469326019287, "logits/rejected": -2.1386208534240723, "logps/chosen": -524.284912109375, "logps/rejected": -415.005859375, "loss": 0.2003, "rewards/accuracies": 0.875, "rewards/chosen": 1.8474491834640503, "rewards/margins": 3.458169460296631, "rewards/rejected": -1.6107200384140015, "step": 1505 }, { "epoch": 1.1002739726027397, "grad_norm": 30.349840081436177, "learning_rate": 4.547371716586106e-07, "logits/chosen": -3.1995811462402344, "logits/rejected": -2.190910816192627, "logps/chosen": -881.65283203125, "logps/rejected": -534.8353881835938, "loss": 0.2472, "rewards/accuracies": 1.0, "rewards/chosen": 3.18458890914917, "rewards/margins": 2.792917251586914, "rewards/rejected": 0.3916715979576111, "step": 1506 }, { "epoch": 1.1010045662100456, "grad_norm": 40.31552193130767, "learning_rate": 4.5464559589733444e-07, "logits/chosen": -2.6669862270355225, "logits/rejected": -2.58402156829834, "logps/chosen": -540.4664306640625, "logps/rejected": -684.25634765625, "loss": 0.2332, "rewards/accuracies": 0.875, "rewards/chosen": 2.4259910583496094, "rewards/margins": 2.279780864715576, "rewards/rejected": 0.1462101936340332, "step": 1507 }, { "epoch": 1.1017351598173515, "grad_norm": 33.56775520426001, "learning_rate": 4.545539368320219e-07, "logits/chosen": -3.459587335586548, "logits/rejected": -2.3711748123168945, "logps/chosen": -652.1793212890625, "logps/rejected": -498.52587890625, "loss": 0.1842, "rewards/accuracies": 1.0, "rewards/chosen": 3.2590811252593994, "rewards/margins": 3.193800210952759, "rewards/rejected": 0.06528082489967346, "step": 1508 }, { "epoch": 1.1024657534246576, "grad_norm": 27.31604399918892, "learning_rate": 4.5446219449998425e-07, "logits/chosen": -3.1370139122009277, "logits/rejected": -2.2453317642211914, "logps/chosen": -1041.2265625, "logps/rejected": -681.4954223632812, "loss": 0.1745, "rewards/accuracies": 0.875, "rewards/chosen": 3.233307123184204, "rewards/margins": 3.0403404235839844, "rewards/rejected": 0.19296684861183167, "step": 1509 }, { "epoch": 1.1031963470319635, "grad_norm": 46.489211933220325, "learning_rate": 4.5437036893856653e-07, "logits/chosen": -3.1230335235595703, "logits/rejected": -2.346853256225586, "logps/chosen": -915.2930908203125, "logps/rejected": -619.1134643554688, "loss": 0.2126, "rewards/accuracies": 0.875, "rewards/chosen": 2.4772231578826904, "rewards/margins": 2.752875804901123, "rewards/rejected": -0.2756526470184326, "step": 1510 }, { "epoch": 1.1039269406392693, "grad_norm": 49.08159212522425, "learning_rate": 4.5427846018514757e-07, "logits/chosen": -2.5925493240356445, "logits/rejected": -1.6930510997772217, "logps/chosen": -415.87384033203125, "logps/rejected": -390.8414001464844, "loss": 0.2584, "rewards/accuracies": 1.0, "rewards/chosen": 2.7043442726135254, "rewards/margins": 4.077075958251953, "rewards/rejected": -1.3727315664291382, "step": 1511 }, { "epoch": 1.1046575342465754, "grad_norm": 25.133805738844423, "learning_rate": 4.5418646827714036e-07, "logits/chosen": -2.712573766708374, "logits/rejected": -2.5249032974243164, "logps/chosen": -568.5538940429688, "logps/rejected": -654.746826171875, "loss": 0.1726, "rewards/accuracies": 1.0, "rewards/chosen": 2.754531145095825, "rewards/margins": 3.103276252746582, "rewards/rejected": -0.3487451672554016, "step": 1512 }, { "epoch": 1.1053881278538813, "grad_norm": 34.012212018718365, "learning_rate": 4.5409439325199157e-07, "logits/chosen": -3.0035483837127686, "logits/rejected": -2.0045151710510254, "logps/chosen": -631.0419921875, "logps/rejected": -519.1254272460938, "loss": 0.1916, "rewards/accuracies": 1.0, "rewards/chosen": 3.603919267654419, "rewards/margins": 4.4088239669799805, "rewards/rejected": -0.804904580116272, "step": 1513 }, { "epoch": 1.1061187214611872, "grad_norm": 34.76912728145563, "learning_rate": 4.5400223514718163e-07, "logits/chosen": -2.5645601749420166, "logits/rejected": -2.1631202697753906, "logps/chosen": -813.9965209960938, "logps/rejected": -765.412109375, "loss": 0.1605, "rewards/accuracies": 0.875, "rewards/chosen": 3.921001434326172, "rewards/margins": 4.38421106338501, "rewards/rejected": -0.46320924162864685, "step": 1514 }, { "epoch": 1.106849315068493, "grad_norm": 30.072529917917823, "learning_rate": 4.539099940002249e-07, "logits/chosen": -2.426211357116699, "logits/rejected": -1.8285030126571655, "logps/chosen": -518.1026000976562, "logps/rejected": -345.3380126953125, "loss": 0.1842, "rewards/accuracies": 0.875, "rewards/chosen": 2.309854030609131, "rewards/margins": 2.7836620807647705, "rewards/rejected": -0.4738081991672516, "step": 1515 }, { "epoch": 1.1075799086757991, "grad_norm": 38.63161730330158, "learning_rate": 4.5381766984866956e-07, "logits/chosen": -2.8690195083618164, "logits/rejected": -2.211949110031128, "logps/chosen": -599.046875, "logps/rejected": -409.2356262207031, "loss": 0.2352, "rewards/accuracies": 1.0, "rewards/chosen": 2.6193857192993164, "rewards/margins": 3.849498748779297, "rewards/rejected": -1.2301127910614014, "step": 1516 }, { "epoch": 1.108310502283105, "grad_norm": 29.781095266190366, "learning_rate": 4.537252627300975e-07, "logits/chosen": -3.413529634475708, "logits/rejected": -2.4276068210601807, "logps/chosen": -738.6177368164062, "logps/rejected": -592.300537109375, "loss": 0.1766, "rewards/accuracies": 1.0, "rewards/chosen": 2.8259220123291016, "rewards/margins": 3.0398035049438477, "rewards/rejected": -0.2138815075159073, "step": 1517 }, { "epoch": 1.1090410958904109, "grad_norm": 42.77745628951729, "learning_rate": 4.536327726821243e-07, "logits/chosen": -3.3168585300445557, "logits/rejected": -2.047808885574341, "logps/chosen": -676.510498046875, "logps/rejected": -354.59429931640625, "loss": 0.1933, "rewards/accuracies": 0.75, "rewards/chosen": 1.8488422632217407, "rewards/margins": 1.9804600477218628, "rewards/rejected": -0.13161781430244446, "step": 1518 }, { "epoch": 1.109771689497717, "grad_norm": 45.654348972229634, "learning_rate": 4.5354019974239955e-07, "logits/chosen": -2.825467586517334, "logits/rejected": -2.2485697269439697, "logps/chosen": -682.394775390625, "logps/rejected": -580.2222290039062, "loss": 0.2376, "rewards/accuracies": 1.0, "rewards/chosen": 3.2896370887756348, "rewards/margins": 2.6425962448120117, "rewards/rejected": 0.6470409035682678, "step": 1519 }, { "epoch": 1.1105022831050229, "grad_norm": 27.093303953940815, "learning_rate": 4.534475439486064e-07, "logits/chosen": -2.539418935775757, "logits/rejected": -2.067140817642212, "logps/chosen": -433.9444580078125, "logps/rejected": -334.93170166015625, "loss": 0.1562, "rewards/accuracies": 1.0, "rewards/chosen": 2.241079568862915, "rewards/margins": 3.365596294403076, "rewards/rejected": -1.1245168447494507, "step": 1520 }, { "epoch": 1.1112328767123287, "grad_norm": 40.25986601472287, "learning_rate": 4.533548053384618e-07, "logits/chosen": -2.6434762477874756, "logits/rejected": -2.132211446762085, "logps/chosen": -549.424560546875, "logps/rejected": -599.853271484375, "loss": 0.1836, "rewards/accuracies": 0.875, "rewards/chosen": 3.5192689895629883, "rewards/margins": 4.135219097137451, "rewards/rejected": -0.6159502267837524, "step": 1521 }, { "epoch": 1.1119634703196346, "grad_norm": 34.063068014186015, "learning_rate": 4.532619839497164e-07, "logits/chosen": -3.4655022621154785, "logits/rejected": -1.6915571689605713, "logps/chosen": -823.5519409179688, "logps/rejected": -316.1917419433594, "loss": 0.1809, "rewards/accuracies": 0.875, "rewards/chosen": 3.5602145195007324, "rewards/margins": 4.256866931915283, "rewards/rejected": -0.696652889251709, "step": 1522 }, { "epoch": 1.1126940639269407, "grad_norm": 31.719351325888685, "learning_rate": 4.531690798201544e-07, "logits/chosen": -2.891364097595215, "logits/rejected": -2.589442729949951, "logps/chosen": -580.919921875, "logps/rejected": -526.6781005859375, "loss": 0.2453, "rewards/accuracies": 1.0, "rewards/chosen": 1.6562703847885132, "rewards/margins": 1.739914059638977, "rewards/rejected": -0.08364379405975342, "step": 1523 }, { "epoch": 1.1134246575342466, "grad_norm": 26.83356294079592, "learning_rate": 4.53076092987594e-07, "logits/chosen": -2.4622387886047363, "logits/rejected": -2.0894901752471924, "logps/chosen": -305.5484313964844, "logps/rejected": -244.3030548095703, "loss": 0.1651, "rewards/accuracies": 1.0, "rewards/chosen": 2.4569244384765625, "rewards/margins": 4.5044264793396, "rewards/rejected": -2.047501802444458, "step": 1524 }, { "epoch": 1.1141552511415524, "grad_norm": 26.063065557038158, "learning_rate": 4.529830234898866e-07, "logits/chosen": -3.147134304046631, "logits/rejected": -2.822533130645752, "logps/chosen": -690.8333129882812, "logps/rejected": -568.9908447265625, "loss": 0.1259, "rewards/accuracies": 1.0, "rewards/chosen": 2.5647730827331543, "rewards/margins": 2.760125160217285, "rewards/rejected": -0.1953524500131607, "step": 1525 }, { "epoch": 1.1148858447488585, "grad_norm": 56.11636469621629, "learning_rate": 4.528898713649178e-07, "logits/chosen": -2.9463350772857666, "logits/rejected": -2.7213664054870605, "logps/chosen": -398.8304748535156, "logps/rejected": -476.6778259277344, "loss": 0.3266, "rewards/accuracies": 1.0, "rewards/chosen": 2.5987000465393066, "rewards/margins": 4.060859203338623, "rewards/rejected": -1.4621591567993164, "step": 1526 }, { "epoch": 1.1156164383561644, "grad_norm": 46.16689641094309, "learning_rate": 4.5279663665060643e-07, "logits/chosen": -2.6670355796813965, "logits/rejected": -2.084152936935425, "logps/chosen": -752.406494140625, "logps/rejected": -527.533203125, "loss": 0.2788, "rewards/accuracies": 0.875, "rewards/chosen": 3.4476122856140137, "rewards/margins": 4.4303998947143555, "rewards/rejected": -0.9827873706817627, "step": 1527 }, { "epoch": 1.1163470319634703, "grad_norm": 43.22140351353352, "learning_rate": 4.5270331938490516e-07, "logits/chosen": -2.8967854976654053, "logits/rejected": -2.676302909851074, "logps/chosen": -576.2613525390625, "logps/rejected": -581.865966796875, "loss": 0.2309, "rewards/accuracies": 0.875, "rewards/chosen": 2.1470391750335693, "rewards/margins": 2.280683994293213, "rewards/rejected": -0.13364499807357788, "step": 1528 }, { "epoch": 1.1170776255707762, "grad_norm": 30.415369906718215, "learning_rate": 4.526099196058e-07, "logits/chosen": -2.538769245147705, "logits/rejected": -2.0319180488586426, "logps/chosen": -699.3096923828125, "logps/rejected": -525.5175170898438, "loss": 0.1635, "rewards/accuracies": 1.0, "rewards/chosen": 2.363126039505005, "rewards/margins": 3.589895725250244, "rewards/rejected": -1.2267696857452393, "step": 1529 }, { "epoch": 1.1178082191780823, "grad_norm": 30.59911114766806, "learning_rate": 4.5251643735131086e-07, "logits/chosen": -2.597165107727051, "logits/rejected": -1.8817025423049927, "logps/chosen": -378.96343994140625, "logps/rejected": -324.8585205078125, "loss": 0.1989, "rewards/accuracies": 1.0, "rewards/chosen": 3.3941733837127686, "rewards/margins": 4.740485191345215, "rewards/rejected": -1.346311330795288, "step": 1530 }, { "epoch": 1.1185388127853881, "grad_norm": 20.705515781768455, "learning_rate": 4.5242287265949097e-07, "logits/chosen": -3.153535842895508, "logits/rejected": -2.39559006690979, "logps/chosen": -1037.546630859375, "logps/rejected": -815.8275146484375, "loss": 0.0819, "rewards/accuracies": 1.0, "rewards/chosen": 3.926396369934082, "rewards/margins": 3.2116661071777344, "rewards/rejected": 0.7147301435470581, "step": 1531 }, { "epoch": 1.119269406392694, "grad_norm": 36.195726477505254, "learning_rate": 4.523292255684275e-07, "logits/chosen": -2.3681414127349854, "logits/rejected": -2.347130298614502, "logps/chosen": -662.4341430664062, "logps/rejected": -698.6453857421875, "loss": 0.1795, "rewards/accuracies": 0.75, "rewards/chosen": 1.8090639114379883, "rewards/margins": 1.8917243480682373, "rewards/rejected": -0.08266028761863708, "step": 1532 }, { "epoch": 1.12, "grad_norm": 24.74693695534611, "learning_rate": 4.5223549611624045e-07, "logits/chosen": -2.964855909347534, "logits/rejected": -1.8451015949249268, "logps/chosen": -772.8001708984375, "logps/rejected": -491.4472961425781, "loss": 0.1072, "rewards/accuracies": 1.0, "rewards/chosen": 4.316333293914795, "rewards/margins": 5.3868255615234375, "rewards/rejected": -1.0704925060272217, "step": 1533 }, { "epoch": 1.120730593607306, "grad_norm": 33.35114616014437, "learning_rate": 4.521416843410842e-07, "logits/chosen": -2.5904629230499268, "logits/rejected": -1.885589838027954, "logps/chosen": -600.2517700195312, "logps/rejected": -617.7111206054688, "loss": 0.1574, "rewards/accuracies": 1.0, "rewards/chosen": 3.4814915657043457, "rewards/margins": 4.360762119293213, "rewards/rejected": -0.8792704939842224, "step": 1534 }, { "epoch": 1.1214611872146119, "grad_norm": 31.441312827929913, "learning_rate": 4.52047790281146e-07, "logits/chosen": -3.3136610984802246, "logits/rejected": -2.2644200325012207, "logps/chosen": -838.44384765625, "logps/rejected": -659.0891723632812, "loss": 0.1315, "rewards/accuracies": 1.0, "rewards/chosen": 2.758995532989502, "rewards/margins": 3.357496976852417, "rewards/rejected": -0.598501443862915, "step": 1535 }, { "epoch": 1.1221917808219177, "grad_norm": 32.69368368852002, "learning_rate": 4.519538139746469e-07, "logits/chosen": -2.5903360843658447, "logits/rejected": -2.4445321559906006, "logps/chosen": -474.39013671875, "logps/rejected": -599.3382568359375, "loss": 0.1572, "rewards/accuracies": 1.0, "rewards/chosen": 2.900214195251465, "rewards/margins": 3.497699737548828, "rewards/rejected": -0.5974854826927185, "step": 1536 }, { "epoch": 1.1229223744292238, "grad_norm": 27.334730655340287, "learning_rate": 4.5185975545984146e-07, "logits/chosen": -2.722086191177368, "logits/rejected": -1.5710002183914185, "logps/chosen": -397.2540588378906, "logps/rejected": -188.0051727294922, "loss": 0.1785, "rewards/accuracies": 1.0, "rewards/chosen": 2.7837042808532715, "rewards/margins": 4.320627689361572, "rewards/rejected": -1.5369231700897217, "step": 1537 }, { "epoch": 1.1236529680365297, "grad_norm": 28.00321941940895, "learning_rate": 4.517656147750174e-07, "logits/chosen": -2.831132411956787, "logits/rejected": -2.0642614364624023, "logps/chosen": -554.705810546875, "logps/rejected": -249.990478515625, "loss": 0.1927, "rewards/accuracies": 0.625, "rewards/chosen": 1.41843581199646, "rewards/margins": 2.027872323989868, "rewards/rejected": -0.6094364523887634, "step": 1538 }, { "epoch": 1.1243835616438356, "grad_norm": 28.18536165842841, "learning_rate": 4.516713919584961e-07, "logits/chosen": -2.923827648162842, "logits/rejected": -2.0595595836639404, "logps/chosen": -537.4036865234375, "logps/rejected": -400.31854248046875, "loss": 0.1698, "rewards/accuracies": 1.0, "rewards/chosen": 2.7908172607421875, "rewards/margins": 3.402364492416382, "rewards/rejected": -0.6115474104881287, "step": 1539 }, { "epoch": 1.1251141552511417, "grad_norm": 37.040887458995904, "learning_rate": 4.515770870486325e-07, "logits/chosen": -2.435410976409912, "logits/rejected": -2.0013108253479004, "logps/chosen": -689.174072265625, "logps/rejected": -669.1357421875, "loss": 0.2277, "rewards/accuracies": 0.875, "rewards/chosen": 2.4460949897766113, "rewards/margins": 2.790299415588379, "rewards/rejected": -0.34420448541641235, "step": 1540 }, { "epoch": 1.1258447488584475, "grad_norm": 30.54329322830904, "learning_rate": 4.514827000838148e-07, "logits/chosen": -2.521829843521118, "logits/rejected": -1.7525650262832642, "logps/chosen": -765.771484375, "logps/rejected": -452.6562805175781, "loss": 0.1573, "rewards/accuracies": 1.0, "rewards/chosen": 3.2347543239593506, "rewards/margins": 3.335948944091797, "rewards/rejected": -0.101194366812706, "step": 1541 }, { "epoch": 1.1265753424657534, "grad_norm": 26.30474086718673, "learning_rate": 4.5138823110246447e-07, "logits/chosen": -2.764537811279297, "logits/rejected": -2.3625802993774414, "logps/chosen": -623.2381591796875, "logps/rejected": -624.3055419921875, "loss": 0.1627, "rewards/accuracies": 0.875, "rewards/chosen": 2.676767587661743, "rewards/margins": 2.5976662635803223, "rewards/rejected": 0.07910144329071045, "step": 1542 }, { "epoch": 1.1273059360730593, "grad_norm": 28.56980512093909, "learning_rate": 4.5129368014303673e-07, "logits/chosen": -3.128222703933716, "logits/rejected": -2.057302713394165, "logps/chosen": -747.6917724609375, "logps/rejected": -469.66131591796875, "loss": 0.1648, "rewards/accuracies": 1.0, "rewards/chosen": 4.498360633850098, "rewards/margins": 5.1941819190979, "rewards/rejected": -0.6958221197128296, "step": 1543 }, { "epoch": 1.1280365296803654, "grad_norm": 29.25343759013364, "learning_rate": 4.5119904724401976e-07, "logits/chosen": -2.9998016357421875, "logits/rejected": -3.1687731742858887, "logps/chosen": -695.5599365234375, "logps/rejected": -777.0140991210938, "loss": 0.1196, "rewards/accuracies": 1.0, "rewards/chosen": 2.8158111572265625, "rewards/margins": 2.912081718444824, "rewards/rejected": -0.0962703675031662, "step": 1544 }, { "epoch": 1.1287671232876713, "grad_norm": 42.220954617216584, "learning_rate": 4.5110433244393537e-07, "logits/chosen": -2.5773279666900635, "logits/rejected": -2.2965638637542725, "logps/chosen": -705.155517578125, "logps/rejected": -647.5655517578125, "loss": 0.2337, "rewards/accuracies": 1.0, "rewards/chosen": 2.269775390625, "rewards/margins": 2.1898815631866455, "rewards/rejected": 0.07989373803138733, "step": 1545 }, { "epoch": 1.1294977168949771, "grad_norm": 28.927388807878728, "learning_rate": 4.510095357813387e-07, "logits/chosen": -2.6348717212677, "logits/rejected": -2.3857669830322266, "logps/chosen": -697.6597290039062, "logps/rejected": -582.3829345703125, "loss": 0.1484, "rewards/accuracies": 1.0, "rewards/chosen": 1.6395851373672485, "rewards/margins": 1.9481816291809082, "rewards/rejected": -0.30859655141830444, "step": 1546 }, { "epoch": 1.1302283105022832, "grad_norm": 32.01013023247484, "learning_rate": 4.5091465729481793e-07, "logits/chosen": -2.34466290473938, "logits/rejected": -2.3275527954101562, "logps/chosen": -661.19384765625, "logps/rejected": -703.699951171875, "loss": 0.1919, "rewards/accuracies": 0.875, "rewards/chosen": 3.0890839099884033, "rewards/margins": 3.2770392894744873, "rewards/rejected": -0.18795543909072876, "step": 1547 }, { "epoch": 1.130958904109589, "grad_norm": 32.24974450298502, "learning_rate": 4.5081969702299506e-07, "logits/chosen": -3.1700053215026855, "logits/rejected": -2.917938232421875, "logps/chosen": -742.2672729492188, "logps/rejected": -624.0336303710938, "loss": 0.153, "rewards/accuracies": 1.0, "rewards/chosen": 3.6349408626556396, "rewards/margins": 3.069920778274536, "rewards/rejected": 0.5650200843811035, "step": 1548 }, { "epoch": 1.131689497716895, "grad_norm": 33.15409247221601, "learning_rate": 4.5072465500452485e-07, "logits/chosen": -2.757538318634033, "logits/rejected": -2.0238258838653564, "logps/chosen": -798.1156005859375, "logps/rejected": -468.91851806640625, "loss": 0.1345, "rewards/accuracies": 1.0, "rewards/chosen": 2.9371304512023926, "rewards/margins": 3.8417465686798096, "rewards/rejected": -0.9046162962913513, "step": 1549 }, { "epoch": 1.1324200913242009, "grad_norm": 48.860114521106325, "learning_rate": 4.506295312780957e-07, "logits/chosen": -2.3735570907592773, "logits/rejected": -2.69956111907959, "logps/chosen": -289.3321228027344, "logps/rejected": -364.0074462890625, "loss": 0.3298, "rewards/accuracies": 0.75, "rewards/chosen": 0.9455525279045105, "rewards/margins": 1.9429799318313599, "rewards/rejected": -0.9974273443222046, "step": 1550 }, { "epoch": 1.133150684931507, "grad_norm": 43.345622819128174, "learning_rate": 4.5053432588242894e-07, "logits/chosen": -2.7898142337799072, "logits/rejected": -2.5083861351013184, "logps/chosen": -407.9337463378906, "logps/rejected": -376.2844543457031, "loss": 0.2273, "rewards/accuracies": 0.875, "rewards/chosen": 1.6746776103973389, "rewards/margins": 2.4121832847595215, "rewards/rejected": -0.7375057339668274, "step": 1551 }, { "epoch": 1.1338812785388128, "grad_norm": 42.189845131578316, "learning_rate": 4.504390388562796e-07, "logits/chosen": -3.0519261360168457, "logits/rejected": -2.1929738521575928, "logps/chosen": -534.1294555664062, "logps/rejected": -401.532470703125, "loss": 0.2844, "rewards/accuracies": 0.875, "rewards/chosen": 3.736355781555176, "rewards/margins": 4.163155555725098, "rewards/rejected": -0.4268002510070801, "step": 1552 }, { "epoch": 1.1346118721461187, "grad_norm": 33.94249394944061, "learning_rate": 4.5034367023843556e-07, "logits/chosen": -2.622580051422119, "logits/rejected": -1.899718999862671, "logps/chosen": -512.8128662109375, "logps/rejected": -345.9070739746094, "loss": 0.2352, "rewards/accuracies": 1.0, "rewards/chosen": 1.5395865440368652, "rewards/margins": 1.2758933305740356, "rewards/rejected": 0.26369333267211914, "step": 1553 }, { "epoch": 1.1353424657534246, "grad_norm": 37.87672586569076, "learning_rate": 4.50248220067718e-07, "logits/chosen": -3.0446789264678955, "logits/rejected": -1.7366173267364502, "logps/chosen": -720.804931640625, "logps/rejected": -464.6275939941406, "loss": 0.1525, "rewards/accuracies": 0.875, "rewards/chosen": 3.9910941123962402, "rewards/margins": 4.353589057922363, "rewards/rejected": -0.3624948263168335, "step": 1554 }, { "epoch": 1.1360730593607307, "grad_norm": 31.058336067114293, "learning_rate": 4.5015268838298145e-07, "logits/chosen": -2.7793385982513428, "logits/rejected": -2.0001444816589355, "logps/chosen": -363.6285400390625, "logps/rejected": -267.0023498535156, "loss": 0.2154, "rewards/accuracies": 0.875, "rewards/chosen": 1.8591610193252563, "rewards/margins": 2.585681200027466, "rewards/rejected": -0.7265201210975647, "step": 1555 }, { "epoch": 1.1368036529680365, "grad_norm": 25.886039406073934, "learning_rate": 4.500570752231134e-07, "logits/chosen": -2.1542234420776367, "logits/rejected": -2.4633052349090576, "logps/chosen": -309.1629333496094, "logps/rejected": -657.2114868164062, "loss": 0.1293, "rewards/accuracies": 1.0, "rewards/chosen": 1.7353346347808838, "rewards/margins": 3.4475531578063965, "rewards/rejected": -1.7122185230255127, "step": 1556 }, { "epoch": 1.1375342465753424, "grad_norm": 29.677093159513408, "learning_rate": 4.499613806270346e-07, "logits/chosen": -2.751171350479126, "logits/rejected": -2.6181132793426514, "logps/chosen": -728.3507080078125, "logps/rejected": -645.4091796875, "loss": 0.2058, "rewards/accuracies": 0.875, "rewards/chosen": 3.1301097869873047, "rewards/margins": 2.3036386966705322, "rewards/rejected": 0.826471209526062, "step": 1557 }, { "epoch": 1.1382648401826483, "grad_norm": 29.666356574448837, "learning_rate": 4.4986560463369905e-07, "logits/chosen": -2.591872453689575, "logits/rejected": -1.515784740447998, "logps/chosen": -519.2852783203125, "logps/rejected": -329.3253173828125, "loss": 0.1531, "rewards/accuracies": 1.0, "rewards/chosen": 3.997070789337158, "rewards/margins": 4.742652893066406, "rewards/rejected": -0.745581865310669, "step": 1558 }, { "epoch": 1.1389954337899544, "grad_norm": 41.67322978909467, "learning_rate": 4.497697472820937e-07, "logits/chosen": -2.9574191570281982, "logits/rejected": -1.568372368812561, "logps/chosen": -690.2735595703125, "logps/rejected": -385.3739929199219, "loss": 0.226, "rewards/accuracies": 1.0, "rewards/chosen": 3.5790019035339355, "rewards/margins": 3.4685416221618652, "rewards/rejected": 0.11046019196510315, "step": 1559 }, { "epoch": 1.1397260273972603, "grad_norm": 24.336635975128885, "learning_rate": 4.496738086112388e-07, "logits/chosen": -2.851116418838501, "logits/rejected": -2.8435707092285156, "logps/chosen": -815.7440185546875, "logps/rejected": -726.7825927734375, "loss": 0.1483, "rewards/accuracies": 1.0, "rewards/chosen": 2.468587875366211, "rewards/margins": 3.523038864135742, "rewards/rejected": -1.0544506311416626, "step": 1560 }, { "epoch": 1.1404566210045661, "grad_norm": 28.756690797534407, "learning_rate": 4.495777886601876e-07, "logits/chosen": -2.2625572681427, "logits/rejected": -2.173178195953369, "logps/chosen": -487.5314025878906, "logps/rejected": -491.0187683105469, "loss": 0.1581, "rewards/accuracies": 1.0, "rewards/chosen": 2.1367154121398926, "rewards/margins": 4.130983352661133, "rewards/rejected": -1.9942679405212402, "step": 1561 }, { "epoch": 1.1411872146118722, "grad_norm": 35.200323068696754, "learning_rate": 4.494816874680263e-07, "logits/chosen": -3.2216544151306152, "logits/rejected": -2.050212860107422, "logps/chosen": -1334.486083984375, "logps/rejected": -755.5488891601562, "loss": 0.1685, "rewards/accuracies": 1.0, "rewards/chosen": 4.616581439971924, "rewards/margins": 4.3548150062561035, "rewards/rejected": 0.2617664337158203, "step": 1562 }, { "epoch": 1.141917808219178, "grad_norm": 26.0147193911447, "learning_rate": 4.493855050738746e-07, "logits/chosen": -2.491605758666992, "logits/rejected": -2.0382282733917236, "logps/chosen": -557.238037109375, "logps/rejected": -454.7733154296875, "loss": 0.1051, "rewards/accuracies": 1.0, "rewards/chosen": 3.5804696083068848, "rewards/margins": 4.912139892578125, "rewards/rejected": -1.3316704034805298, "step": 1563 }, { "epoch": 1.142648401826484, "grad_norm": 32.004359901962964, "learning_rate": 4.492892415168847e-07, "logits/chosen": -2.824751853942871, "logits/rejected": -2.121487617492676, "logps/chosen": -749.9754638671875, "logps/rejected": -886.6260375976562, "loss": 0.16, "rewards/accuracies": 1.0, "rewards/chosen": 2.674811840057373, "rewards/margins": 2.7982025146484375, "rewards/rejected": -0.12339061498641968, "step": 1564 }, { "epoch": 1.1433789954337898, "grad_norm": 36.30400072650396, "learning_rate": 4.4919289683624226e-07, "logits/chosen": -2.3973922729492188, "logits/rejected": -2.5025811195373535, "logps/chosen": -498.9544372558594, "logps/rejected": -493.95733642578125, "loss": 0.1658, "rewards/accuracies": 1.0, "rewards/chosen": 2.91457462310791, "rewards/margins": 4.2700910568237305, "rewards/rejected": -1.3555166721343994, "step": 1565 }, { "epoch": 1.144109589041096, "grad_norm": 43.24045205980087, "learning_rate": 4.490964710711659e-07, "logits/chosen": -2.4658799171447754, "logits/rejected": -2.109389305114746, "logps/chosen": -608.1250610351562, "logps/rejected": -739.66796875, "loss": 0.242, "rewards/accuracies": 0.875, "rewards/chosen": 2.5466458797454834, "rewards/margins": 3.682856559753418, "rewards/rejected": -1.1362106800079346, "step": 1566 }, { "epoch": 1.1448401826484018, "grad_norm": 32.52721028892099, "learning_rate": 4.4899996426090694e-07, "logits/chosen": -2.6397435665130615, "logits/rejected": -2.0693647861480713, "logps/chosen": -707.0589599609375, "logps/rejected": -543.3277587890625, "loss": 0.1364, "rewards/accuracies": 1.0, "rewards/chosen": 3.0482606887817383, "rewards/margins": 3.1465811729431152, "rewards/rejected": -0.09832039475440979, "step": 1567 }, { "epoch": 1.1455707762557077, "grad_norm": 44.1669293857654, "learning_rate": 4.489033764447501e-07, "logits/chosen": -2.2868075370788574, "logits/rejected": -2.4783966541290283, "logps/chosen": -572.7174682617188, "logps/rejected": -667.1043701171875, "loss": 0.2524, "rewards/accuracies": 0.875, "rewards/chosen": 1.7709827423095703, "rewards/margins": 2.436098575592041, "rewards/rejected": -0.6651158332824707, "step": 1568 }, { "epoch": 1.1463013698630138, "grad_norm": 50.37776240875035, "learning_rate": 4.4880670766201265e-07, "logits/chosen": -2.5785202980041504, "logits/rejected": -2.2804360389709473, "logps/chosen": -425.4693298339844, "logps/rejected": -427.302490234375, "loss": 0.2511, "rewards/accuracies": 0.875, "rewards/chosen": 2.277216672897339, "rewards/margins": 3.431817054748535, "rewards/rejected": -1.1546003818511963, "step": 1569 }, { "epoch": 1.1470319634703197, "grad_norm": 38.80515542928059, "learning_rate": 4.4870995795204534e-07, "logits/chosen": -2.949273109436035, "logits/rejected": -2.591585874557495, "logps/chosen": -449.1492614746094, "logps/rejected": -475.2996826171875, "loss": 0.2139, "rewards/accuracies": 1.0, "rewards/chosen": 1.6661163568496704, "rewards/margins": 3.3668220043182373, "rewards/rejected": -1.7007057666778564, "step": 1570 }, { "epoch": 1.1477625570776255, "grad_norm": 50.39502161052221, "learning_rate": 4.486131273542315e-07, "logits/chosen": -2.665883779525757, "logits/rejected": -2.478659152984619, "logps/chosen": -634.7384643554688, "logps/rejected": -542.5628662109375, "loss": 0.284, "rewards/accuracies": 0.625, "rewards/chosen": 2.760734796524048, "rewards/margins": 2.1867997646331787, "rewards/rejected": 0.5739351511001587, "step": 1571 }, { "epoch": 1.1484931506849314, "grad_norm": 30.177849089747827, "learning_rate": 4.485162159079874e-07, "logits/chosen": -2.164781093597412, "logits/rejected": -1.8476314544677734, "logps/chosen": -571.0982666015625, "logps/rejected": -528.265625, "loss": 0.1749, "rewards/accuracies": 1.0, "rewards/chosen": 3.150815486907959, "rewards/margins": 3.8485593795776367, "rewards/rejected": -0.6977440118789673, "step": 1572 }, { "epoch": 1.1492237442922375, "grad_norm": 50.83329064205858, "learning_rate": 4.484192236527623e-07, "logits/chosen": -3.148683547973633, "logits/rejected": -3.0618836879730225, "logps/chosen": -689.6345825195312, "logps/rejected": -776.7465209960938, "loss": 0.355, "rewards/accuracies": 0.875, "rewards/chosen": 2.067687511444092, "rewards/margins": 1.562608003616333, "rewards/rejected": 0.5050795078277588, "step": 1573 }, { "epoch": 1.1499543378995434, "grad_norm": 31.94179391083393, "learning_rate": 4.4832215062803835e-07, "logits/chosen": -2.9227960109710693, "logits/rejected": -2.006190538406372, "logps/chosen": -403.39862060546875, "logps/rejected": -325.8412170410156, "loss": 0.1477, "rewards/accuracies": 0.75, "rewards/chosen": 2.296203851699829, "rewards/margins": 2.9395012855529785, "rewards/rejected": -0.6432974934577942, "step": 1574 }, { "epoch": 1.1506849315068493, "grad_norm": 41.880571865779096, "learning_rate": 4.482249968733306e-07, "logits/chosen": -2.647340774536133, "logits/rejected": -2.3380186557769775, "logps/chosen": -618.2295532226562, "logps/rejected": -515.754150390625, "loss": 0.2353, "rewards/accuracies": 1.0, "rewards/chosen": 3.1614818572998047, "rewards/margins": 3.8644747734069824, "rewards/rejected": -0.7029926180839539, "step": 1575 }, { "epoch": 1.1514155251141553, "grad_norm": 32.000249861695416, "learning_rate": 4.4812776242818687e-07, "logits/chosen": -2.6793861389160156, "logits/rejected": -2.1024656295776367, "logps/chosen": -421.56170654296875, "logps/rejected": -396.1346130371094, "loss": 0.129, "rewards/accuracies": 0.875, "rewards/chosen": 2.7873947620391846, "rewards/margins": 4.61324405670166, "rewards/rejected": -1.8258492946624756, "step": 1576 }, { "epoch": 1.1521461187214612, "grad_norm": 27.385528351411033, "learning_rate": 4.4803044733218795e-07, "logits/chosen": -3.2290165424346924, "logits/rejected": -2.4795584678649902, "logps/chosen": -784.624267578125, "logps/rejected": -541.0645141601562, "loss": 0.1271, "rewards/accuracies": 1.0, "rewards/chosen": 3.519132614135742, "rewards/margins": 2.9480385780334473, "rewards/rejected": 0.5710939764976501, "step": 1577 }, { "epoch": 1.152876712328767, "grad_norm": 29.63650945828631, "learning_rate": 4.479330516249474e-07, "logits/chosen": -2.8763680458068848, "logits/rejected": -2.233490467071533, "logps/chosen": -583.7898559570312, "logps/rejected": -587.880126953125, "loss": 0.1484, "rewards/accuracies": 1.0, "rewards/chosen": 2.661540985107422, "rewards/margins": 3.1985297203063965, "rewards/rejected": -0.5369886159896851, "step": 1578 }, { "epoch": 1.153607305936073, "grad_norm": 39.42157154064732, "learning_rate": 4.478355753461115e-07, "logits/chosen": -2.779442071914673, "logits/rejected": -1.677833914756775, "logps/chosen": -816.5076904296875, "logps/rejected": -531.5726928710938, "loss": 0.2196, "rewards/accuracies": 0.875, "rewards/chosen": 4.23546028137207, "rewards/margins": 3.334810256958008, "rewards/rejected": 0.9006496071815491, "step": 1579 }, { "epoch": 1.154337899543379, "grad_norm": 43.43988162310859, "learning_rate": 4.477380185353595e-07, "logits/chosen": -2.7960243225097656, "logits/rejected": -2.473970413208008, "logps/chosen": -808.3197631835938, "logps/rejected": -626.2919311523438, "loss": 0.2903, "rewards/accuracies": 1.0, "rewards/chosen": 3.8921167850494385, "rewards/margins": 5.2380805015563965, "rewards/rejected": -1.345963954925537, "step": 1580 }, { "epoch": 1.155068493150685, "grad_norm": 31.95477723417795, "learning_rate": 4.4764038123240346e-07, "logits/chosen": -2.764620780944824, "logits/rejected": -2.3709545135498047, "logps/chosen": -882.9043579101562, "logps/rejected": -681.8515625, "loss": 0.1573, "rewards/accuracies": 1.0, "rewards/chosen": 5.47011661529541, "rewards/margins": 4.968568801879883, "rewards/rejected": 0.5015475749969482, "step": 1581 }, { "epoch": 1.1557990867579908, "grad_norm": 37.17095220060955, "learning_rate": 4.475426634769879e-07, "logits/chosen": -3.060209274291992, "logits/rejected": -2.4366648197174072, "logps/chosen": -793.7726440429688, "logps/rejected": -671.7398681640625, "loss": 0.2011, "rewards/accuracies": 0.75, "rewards/chosen": 3.208993434906006, "rewards/margins": 3.068726062774658, "rewards/rejected": 0.14026758074760437, "step": 1582 }, { "epoch": 1.156529680365297, "grad_norm": 37.03771535068595, "learning_rate": 4.474448653088903e-07, "logits/chosen": -2.4116430282592773, "logits/rejected": -2.630098342895508, "logps/chosen": -488.8990783691406, "logps/rejected": -490.6795654296875, "loss": 0.2218, "rewards/accuracies": 0.875, "rewards/chosen": 2.9447615146636963, "rewards/margins": 2.889268398284912, "rewards/rejected": 0.05549311637878418, "step": 1583 }, { "epoch": 1.1572602739726028, "grad_norm": 37.03705123335955, "learning_rate": 4.47346986767921e-07, "logits/chosen": -3.6567277908325195, "logits/rejected": -2.4791440963745117, "logps/chosen": -776.1526489257812, "logps/rejected": -553.7648315429688, "loss": 0.1953, "rewards/accuracies": 1.0, "rewards/chosen": 4.591652870178223, "rewards/margins": 4.553475856781006, "rewards/rejected": 0.03817674517631531, "step": 1584 }, { "epoch": 1.1579908675799087, "grad_norm": 29.564780278401464, "learning_rate": 4.4724902789392284e-07, "logits/chosen": -2.784862518310547, "logits/rejected": -2.260770559310913, "logps/chosen": -566.3265380859375, "logps/rejected": -460.77398681640625, "loss": 0.1636, "rewards/accuracies": 1.0, "rewards/chosen": 2.7156972885131836, "rewards/margins": 4.846817970275879, "rewards/rejected": -2.131120443344116, "step": 1585 }, { "epoch": 1.1587214611872145, "grad_norm": 43.34223197143434, "learning_rate": 4.471509887267714e-07, "logits/chosen": -2.0992164611816406, "logits/rejected": -2.4316940307617188, "logps/chosen": -595.04931640625, "logps/rejected": -570.6057739257812, "loss": 0.2302, "rewards/accuracies": 0.875, "rewards/chosen": 1.495273470878601, "rewards/margins": 1.8166489601135254, "rewards/rejected": -0.32137566804885864, "step": 1586 }, { "epoch": 1.1594520547945206, "grad_norm": 35.35305159156614, "learning_rate": 4.4705286930637505e-07, "logits/chosen": -2.8682615756988525, "logits/rejected": -2.6332712173461914, "logps/chosen": -488.5908203125, "logps/rejected": -462.24310302734375, "loss": 0.2236, "rewards/accuracies": 1.0, "rewards/chosen": 1.6575220823287964, "rewards/margins": 2.9316327571868896, "rewards/rejected": -1.2741105556488037, "step": 1587 }, { "epoch": 1.1601826484018265, "grad_norm": 49.72106767068161, "learning_rate": 4.469546696726747e-07, "logits/chosen": -2.9208576679229736, "logits/rejected": -2.5332536697387695, "logps/chosen": -487.93060302734375, "logps/rejected": -501.07757568359375, "loss": 0.2523, "rewards/accuracies": 0.875, "rewards/chosen": 1.5007438659667969, "rewards/margins": 2.349538564682007, "rewards/rejected": -0.8487948179244995, "step": 1588 }, { "epoch": 1.1609132420091324, "grad_norm": 27.388031109352767, "learning_rate": 4.4685638986564406e-07, "logits/chosen": -2.757343292236328, "logits/rejected": -2.4058046340942383, "logps/chosen": -826.663818359375, "logps/rejected": -1026.99609375, "loss": 0.1551, "rewards/accuracies": 1.0, "rewards/chosen": 3.4668002128601074, "rewards/margins": 3.346557378768921, "rewards/rejected": 0.12024277448654175, "step": 1589 }, { "epoch": 1.1616438356164385, "grad_norm": 34.699751496318385, "learning_rate": 4.467580299252893e-07, "logits/chosen": -2.826146364212036, "logits/rejected": -2.4273550510406494, "logps/chosen": -679.1317138671875, "logps/rejected": -624.427490234375, "loss": 0.1985, "rewards/accuracies": 1.0, "rewards/chosen": 3.3256120681762695, "rewards/margins": 4.445844650268555, "rewards/rejected": -1.120232343673706, "step": 1590 }, { "epoch": 1.1623744292237443, "grad_norm": 48.04319345651369, "learning_rate": 4.466595898916493e-07, "logits/chosen": -2.959649085998535, "logits/rejected": -2.3176004886627197, "logps/chosen": -459.15216064453125, "logps/rejected": -389.2847900390625, "loss": 0.2292, "rewards/accuracies": 1.0, "rewards/chosen": 2.428492784500122, "rewards/margins": 2.4917850494384766, "rewards/rejected": -0.06329229474067688, "step": 1591 }, { "epoch": 1.1631050228310502, "grad_norm": 31.2526610195161, "learning_rate": 4.465610698047957e-07, "logits/chosen": -2.889566421508789, "logits/rejected": -2.506098747253418, "logps/chosen": -603.24169921875, "logps/rejected": -587.6011352539062, "loss": 0.1526, "rewards/accuracies": 0.875, "rewards/chosen": 2.001561164855957, "rewards/margins": 2.414102554321289, "rewards/rejected": -0.4125411808490753, "step": 1592 }, { "epoch": 1.163835616438356, "grad_norm": 32.848193012397005, "learning_rate": 4.4646246970483237e-07, "logits/chosen": -2.8692786693573, "logits/rejected": -2.048788070678711, "logps/chosen": -635.28369140625, "logps/rejected": -489.91619873046875, "loss": 0.159, "rewards/accuracies": 0.875, "rewards/chosen": 3.0605106353759766, "rewards/margins": 3.8026671409606934, "rewards/rejected": -0.7421566247940063, "step": 1593 }, { "epoch": 1.1645662100456622, "grad_norm": 33.34292810315536, "learning_rate": 4.4636378963189596e-07, "logits/chosen": -2.087456226348877, "logits/rejected": -1.9651458263397217, "logps/chosen": -391.3800048828125, "logps/rejected": -356.1418151855469, "loss": 0.1536, "rewards/accuracies": 1.0, "rewards/chosen": 2.6621017456054688, "rewards/margins": 4.016177654266357, "rewards/rejected": -1.3540757894515991, "step": 1594 }, { "epoch": 1.165296803652968, "grad_norm": 22.431711964541545, "learning_rate": 4.462650296261558e-07, "logits/chosen": -3.024677276611328, "logits/rejected": -2.223243236541748, "logps/chosen": -747.9882202148438, "logps/rejected": -571.982177734375, "loss": 0.1147, "rewards/accuracies": 1.0, "rewards/chosen": 2.5328993797302246, "rewards/margins": 3.1401896476745605, "rewards/rejected": -0.6072906851768494, "step": 1595 }, { "epoch": 1.166027397260274, "grad_norm": 37.447798953802014, "learning_rate": 4.461661897278135e-07, "logits/chosen": -3.0846731662750244, "logits/rejected": -1.0402185916900635, "logps/chosen": -1137.714599609375, "logps/rejected": -353.4227294921875, "loss": 0.2203, "rewards/accuracies": 1.0, "rewards/chosen": 3.3973751068115234, "rewards/margins": 4.38238525390625, "rewards/rejected": -0.9850105047225952, "step": 1596 }, { "epoch": 1.16675799086758, "grad_norm": 28.767717027403304, "learning_rate": 4.460672699771034e-07, "logits/chosen": -3.0621540546417236, "logits/rejected": -2.273484706878662, "logps/chosen": -697.1666259765625, "logps/rejected": -530.4890747070312, "loss": 0.1566, "rewards/accuracies": 1.0, "rewards/chosen": 2.814406156539917, "rewards/margins": 2.891927719116211, "rewards/rejected": -0.0775216817855835, "step": 1597 }, { "epoch": 1.167488584474886, "grad_norm": 25.243403715670826, "learning_rate": 4.4596827041429217e-07, "logits/chosen": -3.083296775817871, "logits/rejected": -2.0859427452087402, "logps/chosen": -490.0132141113281, "logps/rejected": -367.9539489746094, "loss": 0.1809, "rewards/accuracies": 0.75, "rewards/chosen": 1.7307441234588623, "rewards/margins": 2.630513906478882, "rewards/rejected": -0.8997699022293091, "step": 1598 }, { "epoch": 1.1682191780821918, "grad_norm": 35.674345621845326, "learning_rate": 4.458691910796791e-07, "logits/chosen": -2.403266429901123, "logits/rejected": -2.1426665782928467, "logps/chosen": -336.1684265136719, "logps/rejected": -441.2503356933594, "loss": 0.2227, "rewards/accuracies": 0.875, "rewards/chosen": 1.2857849597930908, "rewards/margins": 2.2508082389831543, "rewards/rejected": -0.965023398399353, "step": 1599 }, { "epoch": 1.1689497716894977, "grad_norm": 22.0080468460189, "learning_rate": 4.45770032013596e-07, "logits/chosen": -2.8406527042388916, "logits/rejected": -1.6244957447052002, "logps/chosen": -371.0121765136719, "logps/rejected": -171.26612854003906, "loss": 0.1546, "rewards/accuracies": 0.875, "rewards/chosen": 2.274207353591919, "rewards/margins": 3.954258918762207, "rewards/rejected": -1.6800518035888672, "step": 1600 }, { "epoch": 1.1696803652968037, "grad_norm": 31.566741477074707, "learning_rate": 4.456707932564069e-07, "logits/chosen": -2.697167158126831, "logits/rejected": -1.665116548538208, "logps/chosen": -955.0145263671875, "logps/rejected": -525.7027587890625, "loss": 0.1633, "rewards/accuracies": 1.0, "rewards/chosen": 2.466142177581787, "rewards/margins": 2.2025532722473145, "rewards/rejected": 0.2635890543460846, "step": 1601 }, { "epoch": 1.1704109589041096, "grad_norm": 35.10974702183854, "learning_rate": 4.455714748485084e-07, "logits/chosen": -2.6118314266204834, "logits/rejected": -2.545362949371338, "logps/chosen": -459.71246337890625, "logps/rejected": -461.33294677734375, "loss": 0.1903, "rewards/accuracies": 1.0, "rewards/chosen": 1.8425734043121338, "rewards/margins": 3.4403185844421387, "rewards/rejected": -1.5977448225021362, "step": 1602 }, { "epoch": 1.1711415525114155, "grad_norm": 32.57468507018717, "learning_rate": 4.454720768303296e-07, "logits/chosen": -2.7253646850585938, "logits/rejected": -2.1338257789611816, "logps/chosen": -611.163818359375, "logps/rejected": -549.7906494140625, "loss": 0.1428, "rewards/accuracies": 1.0, "rewards/chosen": 3.0732364654541016, "rewards/margins": 4.250417709350586, "rewards/rejected": -1.1771814823150635, "step": 1603 }, { "epoch": 1.1718721461187214, "grad_norm": 27.920478158261144, "learning_rate": 4.453725992423321e-07, "logits/chosen": -2.3865230083465576, "logits/rejected": -2.054337501525879, "logps/chosen": -455.0406799316406, "logps/rejected": -452.9449768066406, "loss": 0.1874, "rewards/accuracies": 0.875, "rewards/chosen": 1.84779691696167, "rewards/margins": 3.4742140769958496, "rewards/rejected": -1.6264175176620483, "step": 1604 }, { "epoch": 1.1726027397260275, "grad_norm": 52.791561585251245, "learning_rate": 4.452730421250094e-07, "logits/chosen": -3.0449728965759277, "logits/rejected": -2.7442474365234375, "logps/chosen": -397.48309326171875, "logps/rejected": -481.9439392089844, "loss": 0.3042, "rewards/accuracies": 0.875, "rewards/chosen": 3.3393054008483887, "rewards/margins": 4.338099002838135, "rewards/rejected": -0.9987938404083252, "step": 1605 }, { "epoch": 1.1733333333333333, "grad_norm": 31.476892139624127, "learning_rate": 4.451734055188879e-07, "logits/chosen": -2.679868459701538, "logits/rejected": -1.999939203262329, "logps/chosen": -562.0106811523438, "logps/rejected": -478.0439453125, "loss": 0.1722, "rewards/accuracies": 0.875, "rewards/chosen": 3.3516135215759277, "rewards/margins": 4.574109077453613, "rewards/rejected": -1.222495436668396, "step": 1606 }, { "epoch": 1.1740639269406392, "grad_norm": 36.559364849558385, "learning_rate": 4.450736894645263e-07, "logits/chosen": -2.694486379623413, "logits/rejected": -2.1005704402923584, "logps/chosen": -537.50048828125, "logps/rejected": -409.7037048339844, "loss": 0.2213, "rewards/accuracies": 0.875, "rewards/chosen": 2.3579752445220947, "rewards/margins": 3.053798198699951, "rewards/rejected": -0.6958228349685669, "step": 1607 }, { "epoch": 1.174794520547945, "grad_norm": 25.759253163381043, "learning_rate": 4.4497389400251525e-07, "logits/chosen": -2.952218532562256, "logits/rejected": -2.798161506652832, "logps/chosen": -587.122802734375, "logps/rejected": -753.2666625976562, "loss": 0.1069, "rewards/accuracies": 1.0, "rewards/chosen": 3.133348226547241, "rewards/margins": 3.7648391723632812, "rewards/rejected": -0.6314910650253296, "step": 1608 }, { "epoch": 1.1755251141552512, "grad_norm": 52.98558913751927, "learning_rate": 4.4487401917347807e-07, "logits/chosen": -2.7556469440460205, "logits/rejected": -2.3548648357391357, "logps/chosen": -485.360107421875, "logps/rejected": -340.89306640625, "loss": 0.2953, "rewards/accuracies": 0.75, "rewards/chosen": 2.078394889831543, "rewards/margins": 2.423212766647339, "rewards/rejected": -0.34481775760650635, "step": 1609 }, { "epoch": 1.176255707762557, "grad_norm": 39.406215924846485, "learning_rate": 4.447740650180703e-07, "logits/chosen": -2.6714954376220703, "logits/rejected": -1.8644906282424927, "logps/chosen": -538.915283203125, "logps/rejected": -423.5685729980469, "loss": 0.2398, "rewards/accuracies": 1.0, "rewards/chosen": 2.4135167598724365, "rewards/margins": 2.988471508026123, "rewards/rejected": -0.5749548077583313, "step": 1610 }, { "epoch": 1.176986301369863, "grad_norm": 55.33319925192402, "learning_rate": 4.446740315769798e-07, "logits/chosen": -3.321676254272461, "logits/rejected": -1.84796941280365, "logps/chosen": -753.6603393554688, "logps/rejected": -356.2894287109375, "loss": 0.3598, "rewards/accuracies": 0.75, "rewards/chosen": 2.5680994987487793, "rewards/margins": 2.649989128112793, "rewards/rejected": -0.0818895474076271, "step": 1611 }, { "epoch": 1.177716894977169, "grad_norm": 48.10984187009582, "learning_rate": 4.4457391889092666e-07, "logits/chosen": -3.468284845352173, "logits/rejected": -1.8241194486618042, "logps/chosen": -646.8451538085938, "logps/rejected": -355.64068603515625, "loss": 0.3096, "rewards/accuracies": 0.75, "rewards/chosen": 3.645683765411377, "rewards/margins": 4.1592888832092285, "rewards/rejected": -0.5136053562164307, "step": 1612 }, { "epoch": 1.178447488584475, "grad_norm": 28.804358004285785, "learning_rate": 4.444737270006632e-07, "logits/chosen": -2.479614734649658, "logits/rejected": -1.9322680234909058, "logps/chosen": -424.58740234375, "logps/rejected": -537.4285278320312, "loss": 0.1664, "rewards/accuracies": 1.0, "rewards/chosen": 2.774437427520752, "rewards/margins": 4.863254547119141, "rewards/rejected": -2.0888168811798096, "step": 1613 }, { "epoch": 1.1791780821917808, "grad_norm": 27.476401451404623, "learning_rate": 4.443734559469741e-07, "logits/chosen": -2.87382435798645, "logits/rejected": -2.844139575958252, "logps/chosen": -508.6954650878906, "logps/rejected": -523.9099731445312, "loss": 0.2033, "rewards/accuracies": 1.0, "rewards/chosen": 2.420644998550415, "rewards/margins": 2.7418954372406006, "rewards/rejected": -0.32125037908554077, "step": 1614 }, { "epoch": 1.1799086757990866, "grad_norm": 18.15974663108156, "learning_rate": 4.44273105770676e-07, "logits/chosen": -2.9458296298980713, "logits/rejected": -2.14450740814209, "logps/chosen": -378.9736022949219, "logps/rejected": -300.9443054199219, "loss": 0.0956, "rewards/accuracies": 1.0, "rewards/chosen": 2.007491111755371, "rewards/margins": 3.4857802391052246, "rewards/rejected": -1.4782888889312744, "step": 1615 }, { "epoch": 1.1806392694063927, "grad_norm": 46.851590508787694, "learning_rate": 4.4417267651261815e-07, "logits/chosen": -3.1894843578338623, "logits/rejected": -2.543846845626831, "logps/chosen": -625.2295532226562, "logps/rejected": -544.48779296875, "loss": 0.1844, "rewards/accuracies": 1.0, "rewards/chosen": 3.011681079864502, "rewards/margins": 2.7951347827911377, "rewards/rejected": 0.21654623746871948, "step": 1616 }, { "epoch": 1.1813698630136986, "grad_norm": 31.14377159685936, "learning_rate": 4.4407216821368165e-07, "logits/chosen": -2.512875556945801, "logits/rejected": -1.4741146564483643, "logps/chosen": -937.938720703125, "logps/rejected": -493.4965515136719, "loss": 0.1419, "rewards/accuracies": 1.0, "rewards/chosen": 3.5999832153320312, "rewards/margins": 4.234097003936768, "rewards/rejected": -0.6341140270233154, "step": 1617 }, { "epoch": 1.1821004566210045, "grad_norm": 28.875721843994114, "learning_rate": 4.4397158091478006e-07, "logits/chosen": -2.334789752960205, "logits/rejected": -2.3214898109436035, "logps/chosen": -737.5750732421875, "logps/rejected": -731.77294921875, "loss": 0.1687, "rewards/accuracies": 1.0, "rewards/chosen": 3.286970376968384, "rewards/margins": 3.1983425617218018, "rewards/rejected": 0.08862799406051636, "step": 1618 }, { "epoch": 1.1828310502283106, "grad_norm": 42.87374867206403, "learning_rate": 4.4387091465685864e-07, "logits/chosen": -2.38559627532959, "logits/rejected": -2.039012908935547, "logps/chosen": -372.048095703125, "logps/rejected": -383.5404052734375, "loss": 0.264, "rewards/accuracies": 1.0, "rewards/chosen": 2.6837470531463623, "rewards/margins": 4.105041980743408, "rewards/rejected": -1.4212946891784668, "step": 1619 }, { "epoch": 1.1835616438356165, "grad_norm": 54.573729872416635, "learning_rate": 4.437701694808954e-07, "logits/chosen": -2.728299617767334, "logits/rejected": -2.075590133666992, "logps/chosen": -614.9747314453125, "logps/rejected": -582.0678100585938, "loss": 0.2764, "rewards/accuracies": 0.625, "rewards/chosen": 2.6668853759765625, "rewards/margins": 2.4745030403137207, "rewards/rejected": 0.19238227605819702, "step": 1620 }, { "epoch": 1.1842922374429223, "grad_norm": 27.63752124497431, "learning_rate": 4.4366934542789993e-07, "logits/chosen": -2.312563896179199, "logits/rejected": -2.0437827110290527, "logps/chosen": -727.9797973632812, "logps/rejected": -606.5862426757812, "loss": 0.1451, "rewards/accuracies": 1.0, "rewards/chosen": 1.9214948415756226, "rewards/margins": 2.8078973293304443, "rewards/rejected": -0.8864024877548218, "step": 1621 }, { "epoch": 1.1850228310502282, "grad_norm": 27.736776964649636, "learning_rate": 4.4356844253891434e-07, "logits/chosen": -2.793750047683716, "logits/rejected": -2.248960494995117, "logps/chosen": -643.3953857421875, "logps/rejected": -491.4825134277344, "loss": 0.2062, "rewards/accuracies": 0.875, "rewards/chosen": 2.4622955322265625, "rewards/margins": 2.105550765991211, "rewards/rejected": 0.3567444086074829, "step": 1622 }, { "epoch": 1.1857534246575343, "grad_norm": 23.350237290934942, "learning_rate": 4.434674608550125e-07, "logits/chosen": -3.122436761856079, "logits/rejected": -1.7275837659835815, "logps/chosen": -576.9696655273438, "logps/rejected": -352.1780090332031, "loss": 0.1131, "rewards/accuracies": 1.0, "rewards/chosen": 4.698081970214844, "rewards/margins": 6.159863471984863, "rewards/rejected": -1.4617807865142822, "step": 1623 }, { "epoch": 1.1864840182648402, "grad_norm": 37.62557263112832, "learning_rate": 4.433664004173006e-07, "logits/chosen": -3.0393872261047363, "logits/rejected": -1.8643834590911865, "logps/chosen": -448.6755065917969, "logps/rejected": -307.89202880859375, "loss": 0.2258, "rewards/accuracies": 1.0, "rewards/chosen": 4.091918468475342, "rewards/margins": 5.515718460083008, "rewards/rejected": -1.4238005876541138, "step": 1624 }, { "epoch": 1.187214611872146, "grad_norm": 28.078068482135663, "learning_rate": 4.4326526126691685e-07, "logits/chosen": -2.630359411239624, "logits/rejected": -1.4949474334716797, "logps/chosen": -530.2586059570312, "logps/rejected": -397.84808349609375, "loss": 0.1277, "rewards/accuracies": 1.0, "rewards/chosen": 2.413013458251953, "rewards/margins": 4.152459621429443, "rewards/rejected": -1.7394461631774902, "step": 1625 }, { "epoch": 1.1879452054794521, "grad_norm": 45.618506066103535, "learning_rate": 4.4316404344503133e-07, "logits/chosen": -2.7478647232055664, "logits/rejected": -2.6999499797821045, "logps/chosen": -859.5843505859375, "logps/rejected": -681.5052490234375, "loss": 0.1923, "rewards/accuracies": 0.875, "rewards/chosen": 2.909299612045288, "rewards/margins": 2.766439199447632, "rewards/rejected": 0.1428602784872055, "step": 1626 }, { "epoch": 1.188675799086758, "grad_norm": 29.295423922397312, "learning_rate": 4.4306274699284623e-07, "logits/chosen": -2.4661664962768555, "logits/rejected": -2.6464521884918213, "logps/chosen": -464.614013671875, "logps/rejected": -535.4617919921875, "loss": 0.1662, "rewards/accuracies": 1.0, "rewards/chosen": 1.2197251319885254, "rewards/margins": 2.314818859100342, "rewards/rejected": -1.0950937271118164, "step": 1627 }, { "epoch": 1.189406392694064, "grad_norm": 59.867994436196874, "learning_rate": 4.4296137195159587e-07, "logits/chosen": -2.7425127029418945, "logits/rejected": -2.2001428604125977, "logps/chosen": -581.2739868164062, "logps/rejected": -471.2933044433594, "loss": 0.3237, "rewards/accuracies": 0.875, "rewards/chosen": 2.820960521697998, "rewards/margins": 2.789816379547119, "rewards/rejected": 0.03114408254623413, "step": 1628 }, { "epoch": 1.1901369863013698, "grad_norm": 50.38387551696847, "learning_rate": 4.4285991836254657e-07, "logits/chosen": -2.5865728855133057, "logits/rejected": -2.0035643577575684, "logps/chosen": -468.2991638183594, "logps/rejected": -341.8756103515625, "loss": 0.2888, "rewards/accuracies": 0.875, "rewards/chosen": 1.5912641286849976, "rewards/margins": 2.6616129875183105, "rewards/rejected": -1.070348858833313, "step": 1629 }, { "epoch": 1.1908675799086759, "grad_norm": 31.25321565486552, "learning_rate": 4.427583862669963e-07, "logits/chosen": -2.770219564437866, "logits/rejected": -2.37839937210083, "logps/chosen": -707.0466918945312, "logps/rejected": -511.677001953125, "loss": 0.1738, "rewards/accuracies": 1.0, "rewards/chosen": 2.4375295639038086, "rewards/margins": 4.087146282196045, "rewards/rejected": -1.6496169567108154, "step": 1630 }, { "epoch": 1.1915981735159817, "grad_norm": 43.668484327229635, "learning_rate": 4.4265677570627536e-07, "logits/chosen": -3.4396586418151855, "logits/rejected": -2.031287670135498, "logps/chosen": -1038.831298828125, "logps/rejected": -583.3409423828125, "loss": 0.2032, "rewards/accuracies": 0.875, "rewards/chosen": 3.9799001216888428, "rewards/margins": 4.197539329528809, "rewards/rejected": -0.21763941645622253, "step": 1631 }, { "epoch": 1.1923287671232876, "grad_norm": 36.87252706220615, "learning_rate": 4.425550867217458e-07, "logits/chosen": -2.560102701187134, "logits/rejected": -2.6153783798217773, "logps/chosen": -384.972412109375, "logps/rejected": -414.8377685546875, "loss": 0.1678, "rewards/accuracies": 0.875, "rewards/chosen": 1.7112737894058228, "rewards/margins": 1.9219584465026855, "rewards/rejected": -0.2106846272945404, "step": 1632 }, { "epoch": 1.1930593607305937, "grad_norm": 28.62873009778322, "learning_rate": 4.424533193548016e-07, "logits/chosen": -2.8180348873138428, "logits/rejected": -2.6059138774871826, "logps/chosen": -414.44140625, "logps/rejected": -435.03594970703125, "loss": 0.1533, "rewards/accuracies": 0.875, "rewards/chosen": 1.927492380142212, "rewards/margins": 3.154981851577759, "rewards/rejected": -1.2274895906448364, "step": 1633 }, { "epoch": 1.1937899543378996, "grad_norm": 39.67251808754077, "learning_rate": 4.423514736468688e-07, "logits/chosen": -1.7957792282104492, "logits/rejected": -2.3566102981567383, "logps/chosen": -697.50927734375, "logps/rejected": -758.5707397460938, "loss": 0.1652, "rewards/accuracies": 0.875, "rewards/chosen": 2.101283073425293, "rewards/margins": 2.758207321166992, "rewards/rejected": -0.6569240689277649, "step": 1634 }, { "epoch": 1.1945205479452055, "grad_norm": 40.20839725680368, "learning_rate": 4.42249549639405e-07, "logits/chosen": -2.959970235824585, "logits/rejected": -2.6048407554626465, "logps/chosen": -959.5178833007812, "logps/rejected": -710.6155395507812, "loss": 0.2208, "rewards/accuracies": 0.875, "rewards/chosen": 3.6061737537384033, "rewards/margins": 3.1450917720794678, "rewards/rejected": 0.4610818922519684, "step": 1635 }, { "epoch": 1.1952511415525113, "grad_norm": 29.153160005070635, "learning_rate": 4.4214754737390006e-07, "logits/chosen": -3.3280653953552246, "logits/rejected": -2.1431632041931152, "logps/chosen": -528.4686279296875, "logps/rejected": -420.39263916015625, "loss": 0.1794, "rewards/accuracies": 1.0, "rewards/chosen": 3.532463312149048, "rewards/margins": 4.343564987182617, "rewards/rejected": -0.8111015558242798, "step": 1636 }, { "epoch": 1.1959817351598174, "grad_norm": 17.042233234615903, "learning_rate": 4.420454668918755e-07, "logits/chosen": -2.8324005603790283, "logits/rejected": -2.8976056575775146, "logps/chosen": -765.4366455078125, "logps/rejected": -751.9344482421875, "loss": 0.1252, "rewards/accuracies": 1.0, "rewards/chosen": 4.005509376525879, "rewards/margins": 3.7067551612854004, "rewards/rejected": 0.2987545430660248, "step": 1637 }, { "epoch": 1.1967123287671233, "grad_norm": 24.34521542644789, "learning_rate": 4.4194330823488455e-07, "logits/chosen": -2.860862970352173, "logits/rejected": -2.2340986728668213, "logps/chosen": -827.361572265625, "logps/rejected": -503.110595703125, "loss": 0.1523, "rewards/accuracies": 1.0, "rewards/chosen": 4.1449384689331055, "rewards/margins": 3.759326219558716, "rewards/rejected": 0.3856118321418762, "step": 1638 }, { "epoch": 1.1974429223744292, "grad_norm": 22.290755417876923, "learning_rate": 4.4184107144451263e-07, "logits/chosen": -2.717214584350586, "logits/rejected": -2.2878458499908447, "logps/chosen": -565.611328125, "logps/rejected": -574.0399169921875, "loss": 0.1444, "rewards/accuracies": 1.0, "rewards/chosen": 3.636045455932617, "rewards/margins": 3.937361478805542, "rewards/rejected": -0.30131611227989197, "step": 1639 }, { "epoch": 1.1981735159817353, "grad_norm": 26.89666187321161, "learning_rate": 4.417387565623767e-07, "logits/chosen": -2.7610929012298584, "logits/rejected": -2.0074052810668945, "logps/chosen": -831.335205078125, "logps/rejected": -540.4796142578125, "loss": 0.1582, "rewards/accuracies": 1.0, "rewards/chosen": 2.527838706970215, "rewards/margins": 3.2749364376068115, "rewards/rejected": -0.7470974326133728, "step": 1640 }, { "epoch": 1.1989041095890411, "grad_norm": 34.51890156754249, "learning_rate": 4.4163636363012546e-07, "logits/chosen": -2.9815120697021484, "logits/rejected": -2.1376307010650635, "logps/chosen": -509.4407653808594, "logps/rejected": -422.3813781738281, "loss": 0.1903, "rewards/accuracies": 0.875, "rewards/chosen": 1.526809573173523, "rewards/margins": 3.0009498596191406, "rewards/rejected": -1.4741401672363281, "step": 1641 }, { "epoch": 1.199634703196347, "grad_norm": 30.397618290811966, "learning_rate": 4.4153389268943955e-07, "logits/chosen": -2.7727532386779785, "logits/rejected": -1.8555967807769775, "logps/chosen": -672.6107177734375, "logps/rejected": -405.27978515625, "loss": 0.1488, "rewards/accuracies": 1.0, "rewards/chosen": 2.0918478965759277, "rewards/margins": 3.0187532901763916, "rewards/rejected": -0.9269052743911743, "step": 1642 }, { "epoch": 1.200365296803653, "grad_norm": 46.29887883783607, "learning_rate": 4.4143134378203127e-07, "logits/chosen": -2.5748143196105957, "logits/rejected": -1.6727941036224365, "logps/chosen": -908.7683715820312, "logps/rejected": -516.200439453125, "loss": 0.2294, "rewards/accuracies": 1.0, "rewards/chosen": 3.6836366653442383, "rewards/margins": 3.920048713684082, "rewards/rejected": -0.23641221225261688, "step": 1643 }, { "epoch": 1.201095890410959, "grad_norm": 26.17647580829374, "learning_rate": 4.4132871694964463e-07, "logits/chosen": -2.6442477703094482, "logits/rejected": -1.8152084350585938, "logps/chosen": -614.1212158203125, "logps/rejected": -363.55865478515625, "loss": 0.1379, "rewards/accuracies": 1.0, "rewards/chosen": 1.577379822731018, "rewards/margins": 2.0890188217163086, "rewards/rejected": -0.5116391181945801, "step": 1644 }, { "epoch": 1.2018264840182649, "grad_norm": 35.39189256835967, "learning_rate": 4.4122601223405545e-07, "logits/chosen": -2.8636271953582764, "logits/rejected": -1.9163724184036255, "logps/chosen": -692.6549072265625, "logps/rejected": -497.57098388671875, "loss": 0.1776, "rewards/accuracies": 0.875, "rewards/chosen": 3.38743257522583, "rewards/margins": 3.357138156890869, "rewards/rejected": 0.030294179916381836, "step": 1645 }, { "epoch": 1.2025570776255707, "grad_norm": 24.405919892133753, "learning_rate": 4.4112322967707127e-07, "logits/chosen": -2.690864324569702, "logits/rejected": -2.1588592529296875, "logps/chosen": -529.2901611328125, "logps/rejected": -340.17120361328125, "loss": 0.1769, "rewards/accuracies": 0.875, "rewards/chosen": 2.522834062576294, "rewards/margins": 3.0858962535858154, "rewards/rejected": -0.5630618929862976, "step": 1646 }, { "epoch": 1.2032876712328768, "grad_norm": 29.06426510936852, "learning_rate": 4.410203693205312e-07, "logits/chosen": -2.6079611778259277, "logits/rejected": -2.6850013732910156, "logps/chosen": -543.8905639648438, "logps/rejected": -632.592529296875, "loss": 0.1389, "rewards/accuracies": 1.0, "rewards/chosen": 2.3565797805786133, "rewards/margins": 3.748006582260132, "rewards/rejected": -1.391426682472229, "step": 1647 }, { "epoch": 1.2040182648401827, "grad_norm": 45.7100048240604, "learning_rate": 4.409174312063061e-07, "logits/chosen": -2.6825108528137207, "logits/rejected": -2.2688090801239014, "logps/chosen": -901.5604248046875, "logps/rejected": -751.4554443359375, "loss": 0.2274, "rewards/accuracies": 1.0, "rewards/chosen": 3.7156715393066406, "rewards/margins": 3.5391011238098145, "rewards/rejected": 0.17657065391540527, "step": 1648 }, { "epoch": 1.2047488584474886, "grad_norm": 27.150794636880565, "learning_rate": 4.4081441537629837e-07, "logits/chosen": -3.1061360836029053, "logits/rejected": -2.6502764225006104, "logps/chosen": -773.6685180664062, "logps/rejected": -627.151611328125, "loss": 0.1432, "rewards/accuracies": 1.0, "rewards/chosen": 3.415613889694214, "rewards/margins": 4.018841743469238, "rewards/rejected": -0.6032278537750244, "step": 1649 }, { "epoch": 1.2054794520547945, "grad_norm": 29.770868755628275, "learning_rate": 4.407113218724423e-07, "logits/chosen": -2.7288198471069336, "logits/rejected": -2.522426128387451, "logps/chosen": -616.9837646484375, "logps/rejected": -493.1402587890625, "loss": 0.1504, "rewards/accuracies": 1.0, "rewards/chosen": 3.2477190494537354, "rewards/margins": 4.403003215789795, "rewards/rejected": -1.1552844047546387, "step": 1650 }, { "epoch": 1.2062100456621005, "grad_norm": 26.846391931982684, "learning_rate": 4.4060815073670356e-07, "logits/chosen": -2.3416967391967773, "logits/rejected": -2.671893358230591, "logps/chosen": -279.7763671875, "logps/rejected": -322.67071533203125, "loss": 0.149, "rewards/accuracies": 0.875, "rewards/chosen": 1.0112231969833374, "rewards/margins": 1.943637490272522, "rewards/rejected": -0.9324143528938293, "step": 1651 }, { "epoch": 1.2069406392694064, "grad_norm": 26.22892220170858, "learning_rate": 4.405049020110794e-07, "logits/chosen": -2.8624398708343506, "logits/rejected": -2.1400580406188965, "logps/chosen": -418.6378479003906, "logps/rejected": -333.1395263671875, "loss": 0.1783, "rewards/accuracies": 0.875, "rewards/chosen": 1.478482723236084, "rewards/margins": 1.6618256568908691, "rewards/rejected": -0.18334299325942993, "step": 1652 }, { "epoch": 1.2076712328767123, "grad_norm": 43.572721182524276, "learning_rate": 4.4040157573759893e-07, "logits/chosen": -2.7742793560028076, "logits/rejected": -2.142521619796753, "logps/chosen": -1130.5406494140625, "logps/rejected": -564.9488525390625, "loss": 0.2566, "rewards/accuracies": 0.875, "rewards/chosen": 3.978501319885254, "rewards/margins": 3.500849962234497, "rewards/rejected": 0.47765105962753296, "step": 1653 }, { "epoch": 1.2084018264840182, "grad_norm": 23.853818115175503, "learning_rate": 4.402981719583225e-07, "logits/chosen": -3.2239956855773926, "logits/rejected": -2.1536154747009277, "logps/chosen": -618.0567626953125, "logps/rejected": -481.75830078125, "loss": 0.153, "rewards/accuracies": 0.875, "rewards/chosen": 1.934335708618164, "rewards/margins": 3.191540241241455, "rewards/rejected": -1.257204532623291, "step": 1654 }, { "epoch": 1.2091324200913243, "grad_norm": 41.868714546932296, "learning_rate": 4.4019469071534224e-07, "logits/chosen": -2.519042491912842, "logits/rejected": -1.9205938577651978, "logps/chosen": -420.8230895996094, "logps/rejected": -357.8184814453125, "loss": 0.179, "rewards/accuracies": 1.0, "rewards/chosen": 2.852301597595215, "rewards/margins": 3.7442169189453125, "rewards/rejected": -0.8919152021408081, "step": 1655 }, { "epoch": 1.2098630136986301, "grad_norm": 27.72681565114075, "learning_rate": 4.4009113205078174e-07, "logits/chosen": -2.9746437072753906, "logits/rejected": -2.245781421661377, "logps/chosen": -615.0738525390625, "logps/rejected": -486.8208923339844, "loss": 0.1571, "rewards/accuracies": 0.875, "rewards/chosen": 2.276515007019043, "rewards/margins": 1.833331823348999, "rewards/rejected": 0.44318318367004395, "step": 1656 }, { "epoch": 1.210593607305936, "grad_norm": 32.315340784215785, "learning_rate": 4.3998749600679604e-07, "logits/chosen": -2.7745633125305176, "logits/rejected": -2.327705144882202, "logps/chosen": -858.8388671875, "logps/rejected": -681.3423461914062, "loss": 0.1437, "rewards/accuracies": 0.75, "rewards/chosen": 3.7750606536865234, "rewards/margins": 3.075364112854004, "rewards/rejected": 0.6996965408325195, "step": 1657 }, { "epoch": 1.2113242009132419, "grad_norm": 32.40748927837707, "learning_rate": 4.398837826255717e-07, "logits/chosen": -2.7006144523620605, "logits/rejected": -2.5837395191192627, "logps/chosen": -563.9258422851562, "logps/rejected": -604.0054931640625, "loss": 0.1658, "rewards/accuracies": 1.0, "rewards/chosen": 1.1913871765136719, "rewards/margins": 1.361360788345337, "rewards/rejected": -0.16997355222702026, "step": 1658 }, { "epoch": 1.212054794520548, "grad_norm": 42.51602696428871, "learning_rate": 4.397799919493269e-07, "logits/chosen": -2.8446664810180664, "logits/rejected": -2.4691638946533203, "logps/chosen": -996.9905395507812, "logps/rejected": -752.3250122070312, "loss": 0.1937, "rewards/accuracies": 1.0, "rewards/chosen": 3.4417500495910645, "rewards/margins": 4.285124778747559, "rewards/rejected": -0.8433745503425598, "step": 1659 }, { "epoch": 1.2127853881278539, "grad_norm": 30.551013746158553, "learning_rate": 4.3967612402031116e-07, "logits/chosen": -2.4409871101379395, "logits/rejected": -1.839798927307129, "logps/chosen": -669.537109375, "logps/rejected": -695.40234375, "loss": 0.1713, "rewards/accuracies": 1.0, "rewards/chosen": 2.143261432647705, "rewards/margins": 4.511665344238281, "rewards/rejected": -2.3684041500091553, "step": 1660 }, { "epoch": 1.2135159817351597, "grad_norm": 41.29597195616889, "learning_rate": 4.3957217888080545e-07, "logits/chosen": -3.1252517700195312, "logits/rejected": -2.249145269393921, "logps/chosen": -584.815185546875, "logps/rejected": -555.487548828125, "loss": 0.2545, "rewards/accuracies": 1.0, "rewards/chosen": 3.0541701316833496, "rewards/margins": 3.6502888202667236, "rewards/rejected": -0.5961184501647949, "step": 1661 }, { "epoch": 1.2142465753424658, "grad_norm": 29.368225615131227, "learning_rate": 4.3946815657312206e-07, "logits/chosen": -2.61040997505188, "logits/rejected": -1.626205563545227, "logps/chosen": -368.7767639160156, "logps/rejected": -246.30825805664062, "loss": 0.1096, "rewards/accuracies": 1.0, "rewards/chosen": 2.44639253616333, "rewards/margins": 4.484617233276367, "rewards/rejected": -2.0382254123687744, "step": 1662 }, { "epoch": 1.2149771689497717, "grad_norm": 30.240395274328943, "learning_rate": 4.3936405713960504e-07, "logits/chosen": -3.1392323970794678, "logits/rejected": -2.676067352294922, "logps/chosen": -1037.935546875, "logps/rejected": -874.9907836914062, "loss": 0.1238, "rewards/accuracies": 1.0, "rewards/chosen": 3.316575527191162, "rewards/margins": 2.79902982711792, "rewards/rejected": 0.5175455808639526, "step": 1663 }, { "epoch": 1.2157077625570776, "grad_norm": 52.49051253560018, "learning_rate": 4.3925988062262953e-07, "logits/chosen": -2.577460289001465, "logits/rejected": -1.9769420623779297, "logps/chosen": -653.2214965820312, "logps/rejected": -441.6319885253906, "loss": 0.2717, "rewards/accuracies": 0.875, "rewards/chosen": 2.317976951599121, "rewards/margins": 3.8413968086242676, "rewards/rejected": -1.523419976234436, "step": 1664 }, { "epoch": 1.2164383561643834, "grad_norm": 39.45287283335255, "learning_rate": 4.391556270646021e-07, "logits/chosen": -2.7163138389587402, "logits/rejected": -2.1651034355163574, "logps/chosen": -755.9140014648438, "logps/rejected": -521.1251831054688, "loss": 0.1255, "rewards/accuracies": 1.0, "rewards/chosen": 2.9660592079162598, "rewards/margins": 3.643967866897583, "rewards/rejected": -0.6779085993766785, "step": 1665 }, { "epoch": 1.2171689497716895, "grad_norm": 48.410786484489954, "learning_rate": 4.390512965079606e-07, "logits/chosen": -2.729348659515381, "logits/rejected": -1.8947522640228271, "logps/chosen": -792.1695556640625, "logps/rejected": -600.5901489257812, "loss": 0.2256, "rewards/accuracies": 0.875, "rewards/chosen": 1.3572479486465454, "rewards/margins": 2.4526238441467285, "rewards/rejected": -1.095375895500183, "step": 1666 }, { "epoch": 1.2178995433789954, "grad_norm": 29.7389326266013, "learning_rate": 4.389468889951746e-07, "logits/chosen": -2.6555185317993164, "logits/rejected": -2.018489122390747, "logps/chosen": -758.3502197265625, "logps/rejected": -449.8136291503906, "loss": 0.2213, "rewards/accuracies": 1.0, "rewards/chosen": 2.3323538303375244, "rewards/margins": 2.4331655502319336, "rewards/rejected": -0.10081194341182709, "step": 1667 }, { "epoch": 1.2186301369863013, "grad_norm": 40.19132101780495, "learning_rate": 4.388424045687446e-07, "logits/chosen": -3.0124988555908203, "logits/rejected": -2.3388257026672363, "logps/chosen": -790.8170166015625, "logps/rejected": -781.8611450195312, "loss": 0.1681, "rewards/accuracies": 1.0, "rewards/chosen": 3.2656660079956055, "rewards/margins": 4.89644193649292, "rewards/rejected": -1.6307756900787354, "step": 1668 }, { "epoch": 1.2193607305936074, "grad_norm": 32.27081336917018, "learning_rate": 4.3873784327120246e-07, "logits/chosen": -2.8861443996429443, "logits/rejected": -1.751476526260376, "logps/chosen": -499.2530212402344, "logps/rejected": -335.3498229980469, "loss": 0.1606, "rewards/accuracies": 0.875, "rewards/chosen": 0.21898937225341797, "rewards/margins": 1.337700605392456, "rewards/rejected": -1.118711233139038, "step": 1669 }, { "epoch": 1.2200913242009133, "grad_norm": 23.282125612728642, "learning_rate": 4.386332051451115e-07, "logits/chosen": -2.4744760990142822, "logits/rejected": -2.213435411453247, "logps/chosen": -530.743408203125, "logps/rejected": -577.4295043945312, "loss": 0.1275, "rewards/accuracies": 1.0, "rewards/chosen": 2.4977478981018066, "rewards/margins": 4.135989189147949, "rewards/rejected": -1.6382412910461426, "step": 1670 }, { "epoch": 1.2208219178082191, "grad_norm": 39.79429375462878, "learning_rate": 4.3852849023306617e-07, "logits/chosen": -2.247746229171753, "logits/rejected": -2.1669728755950928, "logps/chosen": -618.5919189453125, "logps/rejected": -755.766357421875, "loss": 0.1975, "rewards/accuracies": 1.0, "rewards/chosen": 2.003591775894165, "rewards/margins": 2.8616974353790283, "rewards/rejected": -0.8581055998802185, "step": 1671 }, { "epoch": 1.221552511415525, "grad_norm": 39.046477964167906, "learning_rate": 4.3842369857769235e-07, "logits/chosen": -3.0778591632843018, "logits/rejected": -2.1797704696655273, "logps/chosen": -499.0964050292969, "logps/rejected": -325.34490966796875, "loss": 0.2416, "rewards/accuracies": 0.75, "rewards/chosen": 1.803846836090088, "rewards/margins": 2.4464941024780273, "rewards/rejected": -0.6426472663879395, "step": 1672 }, { "epoch": 1.222283105022831, "grad_norm": 27.16592849321439, "learning_rate": 4.3831883022164694e-07, "logits/chosen": -2.9161858558654785, "logits/rejected": -2.426939010620117, "logps/chosen": -578.09326171875, "logps/rejected": -591.5484008789062, "loss": 0.1774, "rewards/accuracies": 1.0, "rewards/chosen": 2.7544662952423096, "rewards/margins": 2.7313902378082275, "rewards/rejected": 0.023076295852661133, "step": 1673 }, { "epoch": 1.223013698630137, "grad_norm": 34.61099918592986, "learning_rate": 4.3821388520761817e-07, "logits/chosen": -2.9806294441223145, "logits/rejected": -1.2418100833892822, "logps/chosen": -603.5596313476562, "logps/rejected": -256.44183349609375, "loss": 0.1666, "rewards/accuracies": 1.0, "rewards/chosen": 3.996356964111328, "rewards/margins": 5.262259483337402, "rewards/rejected": -1.2659025192260742, "step": 1674 }, { "epoch": 1.2237442922374429, "grad_norm": 37.53189236284173, "learning_rate": 4.3810886357832556e-07, "logits/chosen": -2.6288540363311768, "logits/rejected": -2.5007894039154053, "logps/chosen": -816.63525390625, "logps/rejected": -635.8987426757812, "loss": 0.1966, "rewards/accuracies": 1.0, "rewards/chosen": 2.726652145385742, "rewards/margins": 1.7032313346862793, "rewards/rejected": 1.023420810699463, "step": 1675 }, { "epoch": 1.224474885844749, "grad_norm": 34.99534532385611, "learning_rate": 4.380037653765196e-07, "logits/chosen": -2.4414596557617188, "logits/rejected": -2.4820151329040527, "logps/chosen": -447.69281005859375, "logps/rejected": -474.4453430175781, "loss": 0.1927, "rewards/accuracies": 0.875, "rewards/chosen": 3.883584976196289, "rewards/margins": 4.626710891723633, "rewards/rejected": -0.7431257367134094, "step": 1676 }, { "epoch": 1.2252054794520548, "grad_norm": 27.76034105778478, "learning_rate": 4.3789859064498223e-07, "logits/chosen": -2.472368001937866, "logits/rejected": -1.97011399269104, "logps/chosen": -568.7356567382812, "logps/rejected": -568.2259521484375, "loss": 0.1407, "rewards/accuracies": 1.0, "rewards/chosen": 3.1168060302734375, "rewards/margins": 3.8331170082092285, "rewards/rejected": -0.7163110375404358, "step": 1677 }, { "epoch": 1.2259360730593607, "grad_norm": 28.59623132164064, "learning_rate": 4.3779333942652624e-07, "logits/chosen": -3.0572452545166016, "logits/rejected": -2.4875195026397705, "logps/chosen": -442.76751708984375, "logps/rejected": -358.1763000488281, "loss": 0.1475, "rewards/accuracies": 0.875, "rewards/chosen": 2.8232812881469727, "rewards/margins": 3.628584861755371, "rewards/rejected": -0.8053032159805298, "step": 1678 }, { "epoch": 1.2266666666666666, "grad_norm": 40.43302597931238, "learning_rate": 4.376880117639958e-07, "logits/chosen": -2.3924543857574463, "logits/rejected": -2.56149959564209, "logps/chosen": -431.9461669921875, "logps/rejected": -416.3132629394531, "loss": 0.2335, "rewards/accuracies": 1.0, "rewards/chosen": 1.490190029144287, "rewards/margins": 2.066767692565918, "rewards/rejected": -0.5765777826309204, "step": 1679 }, { "epoch": 1.2273972602739727, "grad_norm": 36.31622789061558, "learning_rate": 4.37582607700266e-07, "logits/chosen": -2.8352808952331543, "logits/rejected": -3.2243642807006836, "logps/chosen": -828.8597412109375, "logps/rejected": -896.9075317382812, "loss": 0.1625, "rewards/accuracies": 0.875, "rewards/chosen": 3.137188196182251, "rewards/margins": 2.054945230484009, "rewards/rejected": 1.0822429656982422, "step": 1680 }, { "epoch": 1.2281278538812785, "grad_norm": 32.10028574213261, "learning_rate": 4.3747712727824326e-07, "logits/chosen": -3.163813352584839, "logits/rejected": -2.2175328731536865, "logps/chosen": -306.07781982421875, "logps/rejected": -243.84341430664062, "loss": 0.1999, "rewards/accuracies": 1.0, "rewards/chosen": 1.876715898513794, "rewards/margins": 2.8141398429870605, "rewards/rejected": -0.9374238848686218, "step": 1681 }, { "epoch": 1.2288584474885844, "grad_norm": 47.26013656150921, "learning_rate": 4.3737157054086493e-07, "logits/chosen": -2.710432529449463, "logits/rejected": -2.3555359840393066, "logps/chosen": -607.0792236328125, "logps/rejected": -658.5292358398438, "loss": 0.2359, "rewards/accuracies": 0.875, "rewards/chosen": 3.108276844024658, "rewards/margins": 3.8773717880249023, "rewards/rejected": -0.7690948247909546, "step": 1682 }, { "epoch": 1.2295890410958905, "grad_norm": 33.95719279302711, "learning_rate": 4.372659375310994e-07, "logits/chosen": -2.64654541015625, "logits/rejected": -1.9843645095825195, "logps/chosen": -538.0384521484375, "logps/rejected": -394.1349182128906, "loss": 0.1723, "rewards/accuracies": 1.0, "rewards/chosen": 3.9747629165649414, "rewards/margins": 5.283553123474121, "rewards/rejected": -1.308789849281311, "step": 1683 }, { "epoch": 1.2303196347031964, "grad_norm": 20.664404386349442, "learning_rate": 4.371602282919461e-07, "logits/chosen": -2.8182168006896973, "logits/rejected": -2.1186251640319824, "logps/chosen": -631.5303344726562, "logps/rejected": -488.9724426269531, "loss": 0.1057, "rewards/accuracies": 1.0, "rewards/chosen": 2.7562062740325928, "rewards/margins": 3.5511550903320312, "rewards/rejected": -0.7949490547180176, "step": 1684 }, { "epoch": 1.2310502283105023, "grad_norm": 32.682036021576515, "learning_rate": 4.370544428664357e-07, "logits/chosen": -2.9455225467681885, "logits/rejected": -2.0906553268432617, "logps/chosen": -960.4297485351562, "logps/rejected": -770.6259765625, "loss": 0.1922, "rewards/accuracies": 0.875, "rewards/chosen": 4.021844387054443, "rewards/margins": 4.114593505859375, "rewards/rejected": -0.09274941682815552, "step": 1685 }, { "epoch": 1.2317808219178081, "grad_norm": 42.94312161843777, "learning_rate": 4.369485812976297e-07, "logits/chosen": -2.597372055053711, "logits/rejected": -2.2430615425109863, "logps/chosen": -684.36962890625, "logps/rejected": -706.3568725585938, "loss": 0.217, "rewards/accuracies": 0.875, "rewards/chosen": 1.9423842430114746, "rewards/margins": 2.1165847778320312, "rewards/rejected": -0.17420044541358948, "step": 1686 }, { "epoch": 1.2325114155251142, "grad_norm": 23.992988833898828, "learning_rate": 4.3684264362862057e-07, "logits/chosen": -2.195753812789917, "logits/rejected": -2.3547298908233643, "logps/chosen": -623.075439453125, "logps/rejected": -666.718017578125, "loss": 0.1117, "rewards/accuracies": 1.0, "rewards/chosen": 2.6732242107391357, "rewards/margins": 2.828690528869629, "rewards/rejected": -0.15546637773513794, "step": 1687 }, { "epoch": 1.23324200913242, "grad_norm": 21.09900252890747, "learning_rate": 4.367366299025318e-07, "logits/chosen": -2.8927814960479736, "logits/rejected": -2.192621946334839, "logps/chosen": -652.354736328125, "logps/rejected": -408.02093505859375, "loss": 0.1385, "rewards/accuracies": 0.75, "rewards/chosen": 3.1722686290740967, "rewards/margins": 2.7013983726501465, "rewards/rejected": 0.47087031602859497, "step": 1688 }, { "epoch": 1.233972602739726, "grad_norm": 37.96420851182977, "learning_rate": 4.366305401625179e-07, "logits/chosen": -2.4111757278442383, "logits/rejected": -2.3691799640655518, "logps/chosen": -635.890869140625, "logps/rejected": -600.4066162109375, "loss": 0.2037, "rewards/accuracies": 1.0, "rewards/chosen": 1.9654746055603027, "rewards/margins": 1.8340198993682861, "rewards/rejected": 0.13145466148853302, "step": 1689 }, { "epoch": 1.234703196347032, "grad_norm": 48.6667408006356, "learning_rate": 4.3652437445176426e-07, "logits/chosen": -3.040971040725708, "logits/rejected": -2.373164176940918, "logps/chosen": -807.3410034179688, "logps/rejected": -688.4713745117188, "loss": 0.2406, "rewards/accuracies": 0.875, "rewards/chosen": 2.964477062225342, "rewards/margins": 2.851682186126709, "rewards/rejected": 0.11279506236314774, "step": 1690 }, { "epoch": 1.235433789954338, "grad_norm": 24.423883024853332, "learning_rate": 4.364181328134872e-07, "logits/chosen": -2.830230951309204, "logits/rejected": -1.8450934886932373, "logps/chosen": -647.1052856445312, "logps/rejected": -394.01751708984375, "loss": 0.1246, "rewards/accuracies": 0.875, "rewards/chosen": 2.526353359222412, "rewards/margins": 3.0505685806274414, "rewards/rejected": -0.5242151618003845, "step": 1691 }, { "epoch": 1.2361643835616438, "grad_norm": 21.757105576487103, "learning_rate": 4.3631181529093405e-07, "logits/chosen": -2.741914987564087, "logits/rejected": -1.8219242095947266, "logps/chosen": -671.1485595703125, "logps/rejected": -497.8492736816406, "loss": 0.1194, "rewards/accuracies": 1.0, "rewards/chosen": 3.3098413944244385, "rewards/margins": 4.256879806518555, "rewards/rejected": -0.9470382928848267, "step": 1692 }, { "epoch": 1.2368949771689497, "grad_norm": 23.933662286196142, "learning_rate": 4.362054219273828e-07, "logits/chosen": -2.6627540588378906, "logits/rejected": -2.100465774536133, "logps/chosen": -1436.3343505859375, "logps/rejected": -680.8159790039062, "loss": 0.1082, "rewards/accuracies": 1.0, "rewards/chosen": 2.145146131515503, "rewards/margins": 2.7452571392059326, "rewards/rejected": -0.6001110076904297, "step": 1693 }, { "epoch": 1.2376255707762558, "grad_norm": 34.1950706454207, "learning_rate": 4.3609895276614263e-07, "logits/chosen": -2.9523980617523193, "logits/rejected": -2.5756115913391113, "logps/chosen": -730.8366088867188, "logps/rejected": -579.1777954101562, "loss": 0.1878, "rewards/accuracies": 1.0, "rewards/chosen": 3.2686729431152344, "rewards/margins": 2.987964391708374, "rewards/rejected": 0.2807087302207947, "step": 1694 }, { "epoch": 1.2383561643835617, "grad_norm": 28.85060522545411, "learning_rate": 4.359924078505532e-07, "logits/chosen": -2.5410349369049072, "logits/rejected": -2.168020248413086, "logps/chosen": -290.33404541015625, "logps/rejected": -377.523681640625, "loss": 0.2839, "rewards/accuracies": 0.875, "rewards/chosen": 2.3693525791168213, "rewards/margins": 4.579161643981934, "rewards/rejected": -2.2098100185394287, "step": 1695 }, { "epoch": 1.2390867579908675, "grad_norm": 39.498936892339266, "learning_rate": 4.358857872239853e-07, "logits/chosen": -2.9670066833496094, "logits/rejected": -2.859121322631836, "logps/chosen": -1019.6220092773438, "logps/rejected": -784.62841796875, "loss": 0.2089, "rewards/accuracies": 0.75, "rewards/chosen": 3.858259439468384, "rewards/margins": 2.008716106414795, "rewards/rejected": 1.8495433330535889, "step": 1696 }, { "epoch": 1.2398173515981736, "grad_norm": 19.85088915623098, "learning_rate": 4.3577909092984046e-07, "logits/chosen": -2.5068345069885254, "logits/rejected": -2.0540146827697754, "logps/chosen": -830.8980712890625, "logps/rejected": -485.12847900390625, "loss": 0.1154, "rewards/accuracies": 1.0, "rewards/chosen": 3.4907960891723633, "rewards/margins": 4.36788272857666, "rewards/rejected": -0.8770869374275208, "step": 1697 }, { "epoch": 1.2405479452054795, "grad_norm": 28.369615489248574, "learning_rate": 4.35672319011551e-07, "logits/chosen": -2.412060022354126, "logits/rejected": -1.4922722578048706, "logps/chosen": -691.5145263671875, "logps/rejected": -402.2707824707031, "loss": 0.1527, "rewards/accuracies": 0.75, "rewards/chosen": 2.5061404705047607, "rewards/margins": 2.741183042526245, "rewards/rejected": -0.23504255712032318, "step": 1698 }, { "epoch": 1.2412785388127854, "grad_norm": 32.7690169471055, "learning_rate": 4.3556547151257993e-07, "logits/chosen": -3.013352870941162, "logits/rejected": -1.719597339630127, "logps/chosen": -659.456298828125, "logps/rejected": -344.3247985839844, "loss": 0.1686, "rewards/accuracies": 1.0, "rewards/chosen": 3.7577357292175293, "rewards/margins": 4.629177093505859, "rewards/rejected": -0.8714412450790405, "step": 1699 }, { "epoch": 1.2420091324200913, "grad_norm": 34.95415000452825, "learning_rate": 4.3545854847642124e-07, "logits/chosen": -2.9735031127929688, "logits/rejected": -2.21463680267334, "logps/chosen": -865.90771484375, "logps/rejected": -560.4239501953125, "loss": 0.1613, "rewards/accuracies": 1.0, "rewards/chosen": 2.94163179397583, "rewards/margins": 2.555206775665283, "rewards/rejected": 0.3864251971244812, "step": 1700 }, { "epoch": 1.2427397260273974, "grad_norm": 20.760809966587285, "learning_rate": 4.353515499465994e-07, "logits/chosen": -2.6919634342193604, "logits/rejected": -1.6707907915115356, "logps/chosen": -585.3094482421875, "logps/rejected": -337.5210266113281, "loss": 0.1391, "rewards/accuracies": 1.0, "rewards/chosen": 2.874541759490967, "rewards/margins": 4.790722846984863, "rewards/rejected": -1.9161810874938965, "step": 1701 }, { "epoch": 1.2434703196347032, "grad_norm": 41.602766267795374, "learning_rate": 4.352444759666699e-07, "logits/chosen": -2.6070809364318848, "logits/rejected": -2.152384042739868, "logps/chosen": -607.4674682617188, "logps/rejected": -578.7806396484375, "loss": 0.1991, "rewards/accuracies": 1.0, "rewards/chosen": 3.4700071811676025, "rewards/margins": 4.703068256378174, "rewards/rejected": -1.2330609560012817, "step": 1702 }, { "epoch": 1.244200913242009, "grad_norm": 39.7134219602021, "learning_rate": 4.3513732658021874e-07, "logits/chosen": -2.946660041809082, "logits/rejected": -2.867069959640503, "logps/chosen": -737.4630126953125, "logps/rejected": -930.9573974609375, "loss": 0.2102, "rewards/accuracies": 1.0, "rewards/chosen": 1.9738534688949585, "rewards/margins": 2.8015084266662598, "rewards/rejected": -0.8276549577713013, "step": 1703 }, { "epoch": 1.244931506849315, "grad_norm": 46.35174619148668, "learning_rate": 4.3503010183086266e-07, "logits/chosen": -2.433591842651367, "logits/rejected": -1.9000458717346191, "logps/chosen": -839.891845703125, "logps/rejected": -590.1243896484375, "loss": 0.2744, "rewards/accuracies": 1.0, "rewards/chosen": 3.325788736343384, "rewards/margins": 4.08831262588501, "rewards/rejected": -0.7625237703323364, "step": 1704 }, { "epoch": 1.245662100456621, "grad_norm": 22.534514678746483, "learning_rate": 4.349228017622491e-07, "logits/chosen": -2.545686960220337, "logits/rejected": -1.2632677555084229, "logps/chosen": -727.5939331054688, "logps/rejected": -329.12164306640625, "loss": 0.1039, "rewards/accuracies": 0.875, "rewards/chosen": 3.5384535789489746, "rewards/margins": 4.401745319366455, "rewards/rejected": -0.8632915019989014, "step": 1705 }, { "epoch": 1.246392694063927, "grad_norm": 35.422693459335676, "learning_rate": 4.3481542641805613e-07, "logits/chosen": -2.406778573989868, "logits/rejected": -2.2773597240448, "logps/chosen": -708.2163696289062, "logps/rejected": -670.4735107421875, "loss": 0.2172, "rewards/accuracies": 1.0, "rewards/chosen": 2.8453328609466553, "rewards/margins": 1.8906190395355225, "rewards/rejected": 0.9547138214111328, "step": 1706 }, { "epoch": 1.2471232876712328, "grad_norm": 47.68515516843266, "learning_rate": 4.3470797584199254e-07, "logits/chosen": -2.8878226280212402, "logits/rejected": -2.2334952354431152, "logps/chosen": -781.9513549804688, "logps/rejected": -492.72637939453125, "loss": 0.2452, "rewards/accuracies": 0.875, "rewards/chosen": 1.9642205238342285, "rewards/margins": 2.0473575592041016, "rewards/rejected": -0.08313716948032379, "step": 1707 }, { "epoch": 1.2478538812785387, "grad_norm": 32.22556153748527, "learning_rate": 4.3460045007779757e-07, "logits/chosen": -2.5972306728363037, "logits/rejected": -2.3056693077087402, "logps/chosen": -602.0888671875, "logps/rejected": -692.2030639648438, "loss": 0.1672, "rewards/accuracies": 1.0, "rewards/chosen": 2.1375021934509277, "rewards/margins": 2.8860244750976562, "rewards/rejected": -0.748522162437439, "step": 1708 }, { "epoch": 1.2485844748858448, "grad_norm": 30.740760965949185, "learning_rate": 4.3449284916924135e-07, "logits/chosen": -2.6763017177581787, "logits/rejected": -2.1583569049835205, "logps/chosen": -624.2596435546875, "logps/rejected": -435.25262451171875, "loss": 0.1667, "rewards/accuracies": 1.0, "rewards/chosen": 2.7246618270874023, "rewards/margins": 3.6784303188323975, "rewards/rejected": -0.953768253326416, "step": 1709 }, { "epoch": 1.2493150684931507, "grad_norm": 25.849472943656778, "learning_rate": 4.343851731601243e-07, "logits/chosen": -2.061363935470581, "logits/rejected": -2.328859329223633, "logps/chosen": -361.26348876953125, "logps/rejected": -562.69677734375, "loss": 0.1569, "rewards/accuracies": 1.0, "rewards/chosen": 0.7721886038780212, "rewards/margins": 2.5966672897338867, "rewards/rejected": -1.8244786262512207, "step": 1710 }, { "epoch": 1.2500456621004568, "grad_norm": 23.68126231978764, "learning_rate": 4.3427742209427753e-07, "logits/chosen": -1.9839849472045898, "logits/rejected": -2.1675949096679688, "logps/chosen": -523.8641967773438, "logps/rejected": -846.2608642578125, "loss": 0.1165, "rewards/accuracies": 0.875, "rewards/chosen": 1.6283138990402222, "rewards/margins": 2.7406959533691406, "rewards/rejected": -1.112381935119629, "step": 1711 }, { "epoch": 1.2507762557077626, "grad_norm": 37.13679250314807, "learning_rate": 4.341695960155628e-07, "logits/chosen": -2.651430606842041, "logits/rejected": -2.571441411972046, "logps/chosen": -1000.8046264648438, "logps/rejected": -889.8888549804688, "loss": 0.2183, "rewards/accuracies": 0.75, "rewards/chosen": 2.560328245162964, "rewards/margins": 1.0412940979003906, "rewards/rejected": 1.5190342664718628, "step": 1712 }, { "epoch": 1.2515068493150685, "grad_norm": 25.801259990354033, "learning_rate": 4.340616949678724e-07, "logits/chosen": -3.0204615592956543, "logits/rejected": -2.5941879749298096, "logps/chosen": -466.82586669921875, "logps/rejected": -574.5451049804688, "loss": 0.1293, "rewards/accuracies": 1.0, "rewards/chosen": 1.4571388959884644, "rewards/margins": 2.527658224105835, "rewards/rejected": -1.0705193281173706, "step": 1713 }, { "epoch": 1.2522374429223744, "grad_norm": 33.206953823356976, "learning_rate": 4.339537189951288e-07, "logits/chosen": -2.5638270378112793, "logits/rejected": -2.0284390449523926, "logps/chosen": -693.5203247070312, "logps/rejected": -487.3026123046875, "loss": 0.1899, "rewards/accuracies": 1.0, "rewards/chosen": 2.9378113746643066, "rewards/margins": 4.1266374588012695, "rewards/rejected": -1.1888264417648315, "step": 1714 }, { "epoch": 1.2529680365296803, "grad_norm": 41.64130657275222, "learning_rate": 4.338456681412854e-07, "logits/chosen": -3.247354507446289, "logits/rejected": -1.7453465461730957, "logps/chosen": -949.109130859375, "logps/rejected": -507.9368591308594, "loss": 0.257, "rewards/accuracies": 1.0, "rewards/chosen": 2.9078097343444824, "rewards/margins": 2.9059505462646484, "rewards/rejected": 0.0018590688705444336, "step": 1715 }, { "epoch": 1.2536986301369863, "grad_norm": 39.57561276817533, "learning_rate": 4.337375424503259e-07, "logits/chosen": -2.7968673706054688, "logits/rejected": -2.566818952560425, "logps/chosen": -539.6434326171875, "logps/rejected": -637.4949951171875, "loss": 0.19, "rewards/accuracies": 1.0, "rewards/chosen": 2.548135995864868, "rewards/margins": 3.474329948425293, "rewards/rejected": -0.9261940121650696, "step": 1716 }, { "epoch": 1.2544292237442922, "grad_norm": 48.366929438214804, "learning_rate": 4.3362934196626447e-07, "logits/chosen": -2.5663061141967773, "logits/rejected": -1.6948837041854858, "logps/chosen": -898.0093994140625, "logps/rejected": -416.3398742675781, "loss": 0.2673, "rewards/accuracies": 1.0, "rewards/chosen": 2.552184581756592, "rewards/margins": 2.092255115509033, "rewards/rejected": 0.45992955565452576, "step": 1717 }, { "epoch": 1.255159817351598, "grad_norm": 46.14829664465553, "learning_rate": 4.3352106673314575e-07, "logits/chosen": -2.078047752380371, "logits/rejected": -2.1811835765838623, "logps/chosen": -314.57452392578125, "logps/rejected": -319.9244689941406, "loss": 0.2404, "rewards/accuracies": 1.0, "rewards/chosen": 0.7707734704017639, "rewards/margins": 2.5889945030212402, "rewards/rejected": -1.8182213306427002, "step": 1718 }, { "epoch": 1.2558904109589042, "grad_norm": 31.840528810029895, "learning_rate": 4.3341271679504473e-07, "logits/chosen": -3.045994281768799, "logits/rejected": -2.1504077911376953, "logps/chosen": -648.1719360351562, "logps/rejected": -470.5555419921875, "loss": 0.1191, "rewards/accuracies": 1.0, "rewards/chosen": 3.289124011993408, "rewards/margins": 4.491415023803711, "rewards/rejected": -1.2022905349731445, "step": 1719 }, { "epoch": 1.25662100456621, "grad_norm": 39.80615567091194, "learning_rate": 4.3330429219606685e-07, "logits/chosen": -2.3675568103790283, "logits/rejected": -2.228848695755005, "logps/chosen": -496.6966552734375, "logps/rejected": -513.83642578125, "loss": 0.2007, "rewards/accuracies": 1.0, "rewards/chosen": 1.4858245849609375, "rewards/margins": 2.5802245140075684, "rewards/rejected": -1.0943999290466309, "step": 1720 }, { "epoch": 1.257351598173516, "grad_norm": 22.56507245982577, "learning_rate": 4.33195792980348e-07, "logits/chosen": -2.8586933612823486, "logits/rejected": -1.9257185459136963, "logps/chosen": -534.9708251953125, "logps/rejected": -431.6627197265625, "loss": 0.0983, "rewards/accuracies": 1.0, "rewards/chosen": 3.770275592803955, "rewards/margins": 5.545691013336182, "rewards/rejected": -1.7754156589508057, "step": 1721 }, { "epoch": 1.2580821917808218, "grad_norm": 30.42719722640729, "learning_rate": 4.330872191920544e-07, "logits/chosen": -3.0741400718688965, "logits/rejected": -2.162768840789795, "logps/chosen": -747.340087890625, "logps/rejected": -561.5211181640625, "loss": 0.149, "rewards/accuracies": 1.0, "rewards/chosen": 3.274014472961426, "rewards/margins": 4.14091682434082, "rewards/rejected": -0.866902232170105, "step": 1722 }, { "epoch": 1.258812785388128, "grad_norm": 31.240968421451925, "learning_rate": 4.3297857087538256e-07, "logits/chosen": -2.687685489654541, "logits/rejected": -2.5508763790130615, "logps/chosen": -384.2413635253906, "logps/rejected": -450.30560302734375, "loss": 0.2037, "rewards/accuracies": 0.875, "rewards/chosen": 2.825021505355835, "rewards/margins": 4.480469703674316, "rewards/rejected": -1.6554478406906128, "step": 1723 }, { "epoch": 1.2595433789954338, "grad_norm": 28.858607275251305, "learning_rate": 4.328698480745595e-07, "logits/chosen": -2.3085474967956543, "logits/rejected": -1.720075249671936, "logps/chosen": -541.4586181640625, "logps/rejected": -437.8776550292969, "loss": 0.1529, "rewards/accuracies": 1.0, "rewards/chosen": 2.0981273651123047, "rewards/margins": 3.7304506301879883, "rewards/rejected": -1.632323145866394, "step": 1724 }, { "epoch": 1.2602739726027397, "grad_norm": 29.086253313824, "learning_rate": 4.3276105083384244e-07, "logits/chosen": -2.8854448795318604, "logits/rejected": -1.6114472150802612, "logps/chosen": -477.9489440917969, "logps/rejected": -250.492919921875, "loss": 0.162, "rewards/accuracies": 0.875, "rewards/chosen": 3.0569722652435303, "rewards/margins": 4.372749328613281, "rewards/rejected": -1.3157771825790405, "step": 1725 }, { "epoch": 1.2610045662100458, "grad_norm": 41.37637188529977, "learning_rate": 4.3265217919751883e-07, "logits/chosen": -2.5407490730285645, "logits/rejected": -2.3428149223327637, "logps/chosen": -574.6029052734375, "logps/rejected": -599.01318359375, "loss": 0.2053, "rewards/accuracies": 0.875, "rewards/chosen": 3.2163946628570557, "rewards/margins": 3.311824321746826, "rewards/rejected": -0.0954294204711914, "step": 1726 }, { "epoch": 1.2617351598173516, "grad_norm": 28.74156211095919, "learning_rate": 4.325432332099066e-07, "logits/chosen": -2.7313942909240723, "logits/rejected": -2.674349069595337, "logps/chosen": -723.767578125, "logps/rejected": -745.754150390625, "loss": 0.1405, "rewards/accuracies": 1.0, "rewards/chosen": 3.3162670135498047, "rewards/margins": 3.372251033782959, "rewards/rejected": -0.05598419904708862, "step": 1727 }, { "epoch": 1.2624657534246575, "grad_norm": 40.5363305171654, "learning_rate": 4.3243421291535377e-07, "logits/chosen": -2.8416049480438232, "logits/rejected": -2.7974395751953125, "logps/chosen": -540.9149780273438, "logps/rejected": -802.913330078125, "loss": 0.2153, "rewards/accuracies": 0.875, "rewards/chosen": 2.5050556659698486, "rewards/margins": 3.049126625061035, "rewards/rejected": -0.5440709590911865, "step": 1728 }, { "epoch": 1.2631963470319634, "grad_norm": 34.63516956365488, "learning_rate": 4.323251183582387e-07, "logits/chosen": -3.062612533569336, "logits/rejected": -2.3816633224487305, "logps/chosen": -930.44775390625, "logps/rejected": -615.2149658203125, "loss": 0.2242, "rewards/accuracies": 0.875, "rewards/chosen": 4.0168352127075195, "rewards/margins": 3.814096450805664, "rewards/rejected": 0.20273855328559875, "step": 1729 }, { "epoch": 1.2639269406392695, "grad_norm": 47.46391773050477, "learning_rate": 4.322159495829699e-07, "logits/chosen": -2.759147882461548, "logits/rejected": -2.0359065532684326, "logps/chosen": -543.9293212890625, "logps/rejected": -330.6839599609375, "loss": 0.2731, "rewards/accuracies": 1.0, "rewards/chosen": 3.2189173698425293, "rewards/margins": 3.79323148727417, "rewards/rejected": -0.5743148326873779, "step": 1730 }, { "epoch": 1.2646575342465753, "grad_norm": 33.7295872876285, "learning_rate": 4.321067066339862e-07, "logits/chosen": -2.571068525314331, "logits/rejected": -1.8349361419677734, "logps/chosen": -710.9285888671875, "logps/rejected": -437.0213928222656, "loss": 0.2157, "rewards/accuracies": 1.0, "rewards/chosen": 2.870465040206909, "rewards/margins": 3.297807455062866, "rewards/rejected": -0.42734238505363464, "step": 1731 }, { "epoch": 1.2653881278538812, "grad_norm": 56.59615064276125, "learning_rate": 4.3199738955575654e-07, "logits/chosen": -2.713653087615967, "logits/rejected": -2.1463184356689453, "logps/chosen": -785.75830078125, "logps/rejected": -727.0520629882812, "loss": 0.3148, "rewards/accuracies": 1.0, "rewards/chosen": 3.620032548904419, "rewards/margins": 4.121972560882568, "rewards/rejected": -0.5019400119781494, "step": 1732 }, { "epoch": 1.2661187214611873, "grad_norm": 41.30591321828964, "learning_rate": 4.318879983927801e-07, "logits/chosen": -2.828122138977051, "logits/rejected": -1.7292617559432983, "logps/chosen": -780.5579833984375, "logps/rejected": -490.62200927734375, "loss": 0.2565, "rewards/accuracies": 0.75, "rewards/chosen": 3.7099428176879883, "rewards/margins": 3.095146894454956, "rewards/rejected": 0.614795982837677, "step": 1733 }, { "epoch": 1.2668493150684932, "grad_norm": 41.200165209392665, "learning_rate": 4.3177853318958613e-07, "logits/chosen": -2.883626699447632, "logits/rejected": -2.3784444332122803, "logps/chosen": -748.05712890625, "logps/rejected": -606.0558471679688, "loss": 0.233, "rewards/accuracies": 0.875, "rewards/chosen": 3.016906499862671, "rewards/margins": 3.102375030517578, "rewards/rejected": -0.08546826243400574, "step": 1734 }, { "epoch": 1.267579908675799, "grad_norm": 37.70753606863206, "learning_rate": 4.316689939907341e-07, "logits/chosen": -2.6586387157440186, "logits/rejected": -1.736995816230774, "logps/chosen": -513.8482666015625, "logps/rejected": -318.072021484375, "loss": 0.1952, "rewards/accuracies": 1.0, "rewards/chosen": 2.850393772125244, "rewards/margins": 3.8332362174987793, "rewards/rejected": -0.9828424453735352, "step": 1735 }, { "epoch": 1.268310502283105, "grad_norm": 37.66701739212636, "learning_rate": 4.3155938084081356e-07, "logits/chosen": -2.8362157344818115, "logits/rejected": -2.660252571105957, "logps/chosen": -930.8828125, "logps/rejected": -762.2006225585938, "loss": 0.1869, "rewards/accuracies": 1.0, "rewards/chosen": 4.017646312713623, "rewards/margins": 2.935849189758301, "rewards/rejected": 1.0817973613739014, "step": 1736 }, { "epoch": 1.269041095890411, "grad_norm": 50.18184218575729, "learning_rate": 4.3144969378444416e-07, "logits/chosen": -3.0094926357269287, "logits/rejected": -2.7123382091522217, "logps/chosen": -647.3436889648438, "logps/rejected": -828.67822265625, "loss": 0.2891, "rewards/accuracies": 1.0, "rewards/chosen": 2.7536864280700684, "rewards/margins": 2.7476816177368164, "rewards/rejected": 0.006004810333251953, "step": 1737 }, { "epoch": 1.269771689497717, "grad_norm": 24.705357218485403, "learning_rate": 4.313399328662758e-07, "logits/chosen": -2.5919947624206543, "logits/rejected": -1.583404541015625, "logps/chosen": -538.3719482421875, "logps/rejected": -332.7831726074219, "loss": 0.1191, "rewards/accuracies": 0.875, "rewards/chosen": 2.480027198791504, "rewards/margins": 3.0203299522399902, "rewards/rejected": -0.5403028130531311, "step": 1738 }, { "epoch": 1.2705022831050228, "grad_norm": 28.110436767745846, "learning_rate": 4.312300981309881e-07, "logits/chosen": -3.397475242614746, "logits/rejected": -2.5016424655914307, "logps/chosen": -633.3917846679688, "logps/rejected": -572.3349609375, "loss": 0.1055, "rewards/accuracies": 1.0, "rewards/chosen": 3.7252955436706543, "rewards/margins": 5.136668682098389, "rewards/rejected": -1.4113730192184448, "step": 1739 }, { "epoch": 1.2712328767123289, "grad_norm": 37.04077211436237, "learning_rate": 4.3112018962329095e-07, "logits/chosen": -2.6190972328186035, "logits/rejected": -2.3976755142211914, "logps/chosen": -760.07275390625, "logps/rejected": -572.5897827148438, "loss": 0.227, "rewards/accuracies": 0.75, "rewards/chosen": 2.2005233764648438, "rewards/margins": 1.7046641111373901, "rewards/rejected": 0.4958593249320984, "step": 1740 }, { "epoch": 1.2719634703196347, "grad_norm": 23.957192709487714, "learning_rate": 4.310102073879243e-07, "logits/chosen": -2.9482529163360596, "logits/rejected": -1.979504108428955, "logps/chosen": -530.2392578125, "logps/rejected": -395.97613525390625, "loss": 0.1502, "rewards/accuracies": 0.875, "rewards/chosen": 2.6831912994384766, "rewards/margins": 3.453887701034546, "rewards/rejected": -0.7706961035728455, "step": 1741 }, { "epoch": 1.2726940639269406, "grad_norm": 29.603521230392193, "learning_rate": 4.3090015146965806e-07, "logits/chosen": -2.5895743370056152, "logits/rejected": -2.069704294204712, "logps/chosen": -648.3154907226562, "logps/rejected": -550.4671020507812, "loss": 0.1589, "rewards/accuracies": 1.0, "rewards/chosen": 2.4853334426879883, "rewards/margins": 3.031980514526367, "rewards/rejected": -0.546646773815155, "step": 1742 }, { "epoch": 1.2734246575342465, "grad_norm": 32.94167097345606, "learning_rate": 4.307900219132922e-07, "logits/chosen": -2.8728280067443848, "logits/rejected": -2.2010674476623535, "logps/chosen": -705.99951171875, "logps/rejected": -469.0928039550781, "loss": 0.2114, "rewards/accuracies": 0.875, "rewards/chosen": 3.509632110595703, "rewards/margins": 3.7684826850891113, "rewards/rejected": -0.2588503956794739, "step": 1743 }, { "epoch": 1.2741552511415526, "grad_norm": 37.88856308822789, "learning_rate": 4.3067981876365634e-07, "logits/chosen": -2.610399007797241, "logits/rejected": -2.2715353965759277, "logps/chosen": -615.2715454101562, "logps/rejected": -471.9484558105469, "loss": 0.2322, "rewards/accuracies": 0.875, "rewards/chosen": 1.7349283695220947, "rewards/margins": 1.8850197792053223, "rewards/rejected": -0.15009132027626038, "step": 1744 }, { "epoch": 1.2748858447488585, "grad_norm": 37.086896152292624, "learning_rate": 4.305695420656106e-07, "logits/chosen": -2.835193634033203, "logits/rejected": -2.322007179260254, "logps/chosen": -856.3038330078125, "logps/rejected": -661.4217529296875, "loss": 0.2225, "rewards/accuracies": 0.75, "rewards/chosen": 3.8202474117279053, "rewards/margins": 3.011934280395508, "rewards/rejected": 0.8083130717277527, "step": 1745 }, { "epoch": 1.2756164383561643, "grad_norm": 43.92281160065063, "learning_rate": 4.304591918640446e-07, "logits/chosen": -3.059710741043091, "logits/rejected": -1.7174382209777832, "logps/chosen": -817.0767822265625, "logps/rejected": -574.1246337890625, "loss": 0.2102, "rewards/accuracies": 0.875, "rewards/chosen": 4.856677055358887, "rewards/margins": 5.725743770599365, "rewards/rejected": -0.8690662384033203, "step": 1746 }, { "epoch": 1.2763470319634704, "grad_norm": 25.36480417853891, "learning_rate": 4.3034876820387804e-07, "logits/chosen": -2.651865243911743, "logits/rejected": -2.535111427307129, "logps/chosen": -835.6445922851562, "logps/rejected": -815.3114013671875, "loss": 0.1165, "rewards/accuracies": 1.0, "rewards/chosen": 3.4406015872955322, "rewards/margins": 4.446022987365723, "rewards/rejected": -1.0054210424423218, "step": 1747 }, { "epoch": 1.2770776255707763, "grad_norm": 24.385011465646013, "learning_rate": 4.3023827113006063e-07, "logits/chosen": -2.7335567474365234, "logits/rejected": -1.79030442237854, "logps/chosen": -806.5111694335938, "logps/rejected": -444.0537109375, "loss": 0.1231, "rewards/accuracies": 1.0, "rewards/chosen": 1.921671748161316, "rewards/margins": 2.722201347351074, "rewards/rejected": -0.8005297183990479, "step": 1748 }, { "epoch": 1.2778082191780822, "grad_norm": 30.914111858691957, "learning_rate": 4.3012770068757166e-07, "logits/chosen": -2.79899001121521, "logits/rejected": -2.106562614440918, "logps/chosen": -705.5074462890625, "logps/rejected": -517.9801025390625, "loss": 0.1982, "rewards/accuracies": 0.875, "rewards/chosen": 3.2240843772888184, "rewards/margins": 3.48724365234375, "rewards/rejected": -0.2631591558456421, "step": 1749 }, { "epoch": 1.278538812785388, "grad_norm": 48.112498376201586, "learning_rate": 4.300170569214206e-07, "logits/chosen": -2.9396157264709473, "logits/rejected": -2.185185194015503, "logps/chosen": -674.8590087890625, "logps/rejected": -519.3291015625, "loss": 0.2998, "rewards/accuracies": 0.875, "rewards/chosen": 4.098676681518555, "rewards/margins": 3.7870430946350098, "rewards/rejected": 0.31163302063941956, "step": 1750 }, { "epoch": 1.279269406392694, "grad_norm": 32.700256309321375, "learning_rate": 4.299063398766466e-07, "logits/chosen": -3.271923780441284, "logits/rejected": -2.3547682762145996, "logps/chosen": -837.6564331054688, "logps/rejected": -678.4032592773438, "loss": 0.1761, "rewards/accuracies": 1.0, "rewards/chosen": 4.248681545257568, "rewards/margins": 4.194899559020996, "rewards/rejected": 0.05378144979476929, "step": 1751 }, { "epoch": 1.28, "grad_norm": 22.2209584872874, "learning_rate": 4.2979554959831877e-07, "logits/chosen": -3.104194402694702, "logits/rejected": -2.530296802520752, "logps/chosen": -844.9915161132812, "logps/rejected": -678.523193359375, "loss": 0.1374, "rewards/accuracies": 1.0, "rewards/chosen": 3.898082971572876, "rewards/margins": 4.121639251708984, "rewards/rejected": -0.22355642914772034, "step": 1752 }, { "epoch": 1.280730593607306, "grad_norm": 44.29651813127853, "learning_rate": 4.296846861315359e-07, "logits/chosen": -3.153759479522705, "logits/rejected": -2.3236641883850098, "logps/chosen": -862.2311401367188, "logps/rejected": -554.0375366210938, "loss": 0.2146, "rewards/accuracies": 0.875, "rewards/chosen": 3.886523723602295, "rewards/margins": 3.1143226623535156, "rewards/rejected": 0.7722010612487793, "step": 1753 }, { "epoch": 1.281461187214612, "grad_norm": 40.191642962965574, "learning_rate": 4.2957374952142644e-07, "logits/chosen": -2.648380994796753, "logits/rejected": -1.955582857131958, "logps/chosen": -696.305908203125, "logps/rejected": -498.5621032714844, "loss": 0.1927, "rewards/accuracies": 1.0, "rewards/chosen": 3.333158016204834, "rewards/margins": 4.049984931945801, "rewards/rejected": -0.7168267965316772, "step": 1754 }, { "epoch": 1.2821917808219179, "grad_norm": 38.37383256062163, "learning_rate": 4.2946273981314895e-07, "logits/chosen": -2.132643461227417, "logits/rejected": -2.0802714824676514, "logps/chosen": -598.6429443359375, "logps/rejected": -671.7728271484375, "loss": 0.2196, "rewards/accuracies": 0.875, "rewards/chosen": 2.0459816455841064, "rewards/margins": 1.23252534866333, "rewards/rejected": 0.8134563565254211, "step": 1755 }, { "epoch": 1.2829223744292237, "grad_norm": 30.579384475211604, "learning_rate": 4.2935165705189167e-07, "logits/chosen": -3.020160675048828, "logits/rejected": -2.595021963119507, "logps/chosen": -573.6500854492188, "logps/rejected": -495.6191101074219, "loss": 0.1651, "rewards/accuracies": 1.0, "rewards/chosen": 3.002883195877075, "rewards/margins": 2.6814560890197754, "rewards/rejected": 0.32142698764801025, "step": 1756 }, { "epoch": 1.2836529680365296, "grad_norm": 33.65475021321749, "learning_rate": 4.2924050128287233e-07, "logits/chosen": -3.026028633117676, "logits/rejected": -2.221982479095459, "logps/chosen": -353.3842468261719, "logps/rejected": -315.60955810546875, "loss": 0.1202, "rewards/accuracies": 1.0, "rewards/chosen": 3.48701810836792, "rewards/margins": 4.94013786315918, "rewards/rejected": -1.4531193971633911, "step": 1757 }, { "epoch": 1.2843835616438355, "grad_norm": 35.60256455003014, "learning_rate": 4.2912927255133855e-07, "logits/chosen": -2.1373696327209473, "logits/rejected": -1.955627679824829, "logps/chosen": -484.0577087402344, "logps/rejected": -423.53973388671875, "loss": 0.2273, "rewards/accuracies": 1.0, "rewards/chosen": 3.1519854068756104, "rewards/margins": 3.3926796913146973, "rewards/rejected": -0.24069422483444214, "step": 1758 }, { "epoch": 1.2851141552511416, "grad_norm": 32.467550657589264, "learning_rate": 4.290179709025679e-07, "logits/chosen": -2.7554540634155273, "logits/rejected": -2.2005679607391357, "logps/chosen": -649.111328125, "logps/rejected": -465.6227111816406, "loss": 0.1586, "rewards/accuracies": 1.0, "rewards/chosen": 3.383131742477417, "rewards/margins": 3.815321445465088, "rewards/rejected": -0.4321897625923157, "step": 1759 }, { "epoch": 1.2858447488584475, "grad_norm": 26.5065575889221, "learning_rate": 4.2890659638186694e-07, "logits/chosen": -3.0815582275390625, "logits/rejected": -1.8454835414886475, "logps/chosen": -739.3189086914062, "logps/rejected": -417.89349365234375, "loss": 0.1343, "rewards/accuracies": 1.0, "rewards/chosen": 4.472011566162109, "rewards/margins": 4.44812536239624, "rewards/rejected": 0.023885734379291534, "step": 1760 }, { "epoch": 1.2865753424657536, "grad_norm": 27.27265310566383, "learning_rate": 4.287951490345726e-07, "logits/chosen": -2.8101930618286133, "logits/rejected": -2.0799338817596436, "logps/chosen": -838.8223266601562, "logps/rejected": -530.766357421875, "loss": 0.1244, "rewards/accuracies": 1.0, "rewards/chosen": 4.800323486328125, "rewards/margins": 5.172605514526367, "rewards/rejected": -0.3722820281982422, "step": 1761 }, { "epoch": 1.2873059360730594, "grad_norm": 21.80799307620302, "learning_rate": 4.2868362890605116e-07, "logits/chosen": -2.809880018234253, "logits/rejected": -2.7072596549987793, "logps/chosen": -630.7985229492188, "logps/rejected": -533.3060913085938, "loss": 0.1062, "rewards/accuracies": 0.875, "rewards/chosen": 2.983872413635254, "rewards/margins": 3.6934380531311035, "rewards/rejected": -0.7095655202865601, "step": 1762 }, { "epoch": 1.2880365296803653, "grad_norm": 37.44674767096353, "learning_rate": 4.2857203604169854e-07, "logits/chosen": -3.1727471351623535, "logits/rejected": -1.8089509010314941, "logps/chosen": -482.2314453125, "logps/rejected": -228.87313842773438, "loss": 0.2365, "rewards/accuracies": 1.0, "rewards/chosen": 2.1802072525024414, "rewards/margins": 3.500091791152954, "rewards/rejected": -1.3198845386505127, "step": 1763 }, { "epoch": 1.2887671232876712, "grad_norm": 36.95314482029067, "learning_rate": 4.284603704869402e-07, "logits/chosen": -2.5546112060546875, "logits/rejected": -1.9922593832015991, "logps/chosen": -825.09521484375, "logps/rejected": -467.2869873046875, "loss": 0.1614, "rewards/accuracies": 1.0, "rewards/chosen": 4.267240047454834, "rewards/margins": 4.319145202636719, "rewards/rejected": -0.051904767751693726, "step": 1764 }, { "epoch": 1.289497716894977, "grad_norm": 42.07897644654706, "learning_rate": 4.2834863228723137e-07, "logits/chosen": -2.609858989715576, "logits/rejected": -2.1758406162261963, "logps/chosen": -508.7486572265625, "logps/rejected": -392.0083312988281, "loss": 0.2386, "rewards/accuracies": 1.0, "rewards/chosen": 2.531203031539917, "rewards/margins": 4.104308605194092, "rewards/rejected": -1.5731053352355957, "step": 1765 }, { "epoch": 1.2902283105022831, "grad_norm": 40.041593669992864, "learning_rate": 4.282368214880567e-07, "logits/chosen": -2.624375581741333, "logits/rejected": -2.4019298553466797, "logps/chosen": -650.7061767578125, "logps/rejected": -532.115478515625, "loss": 0.2501, "rewards/accuracies": 0.875, "rewards/chosen": 3.078756332397461, "rewards/margins": 2.5669124126434326, "rewards/rejected": 0.5118441581726074, "step": 1766 }, { "epoch": 1.290958904109589, "grad_norm": 30.92495109289029, "learning_rate": 4.2812493813493046e-07, "logits/chosen": -3.0356853008270264, "logits/rejected": -1.8865795135498047, "logps/chosen": -825.162109375, "logps/rejected": -535.9661865234375, "loss": 0.1561, "rewards/accuracies": 1.0, "rewards/chosen": 4.127279281616211, "rewards/margins": 3.9961025714874268, "rewards/rejected": 0.13117694854736328, "step": 1767 }, { "epoch": 1.291689497716895, "grad_norm": 42.744654271031365, "learning_rate": 4.2801298227339634e-07, "logits/chosen": -2.9383487701416016, "logits/rejected": -2.003037452697754, "logps/chosen": -741.0953369140625, "logps/rejected": -538.2008056640625, "loss": 0.2431, "rewards/accuracies": 1.0, "rewards/chosen": 2.8335976600646973, "rewards/margins": 2.2510361671447754, "rewards/rejected": 0.5825613141059875, "step": 1768 }, { "epoch": 1.292420091324201, "grad_norm": 29.10533322886514, "learning_rate": 4.279009539490278e-07, "logits/chosen": -3.2991960048675537, "logits/rejected": -2.8244290351867676, "logps/chosen": -772.5689086914062, "logps/rejected": -742.4863891601562, "loss": 0.178, "rewards/accuracies": 1.0, "rewards/chosen": 3.381518840789795, "rewards/margins": 3.0519087314605713, "rewards/rejected": 0.32961010932922363, "step": 1769 }, { "epoch": 1.2931506849315069, "grad_norm": 29.732561401929892, "learning_rate": 4.2778885320742753e-07, "logits/chosen": -2.212305784225464, "logits/rejected": -1.8208593130111694, "logps/chosen": -495.475341796875, "logps/rejected": -429.605224609375, "loss": 0.2, "rewards/accuracies": 1.0, "rewards/chosen": 2.0041604042053223, "rewards/margins": 2.6363425254821777, "rewards/rejected": -0.6321820020675659, "step": 1770 }, { "epoch": 1.2938812785388127, "grad_norm": 28.543616398581577, "learning_rate": 4.276766800942278e-07, "logits/chosen": -3.059638023376465, "logits/rejected": -2.282707929611206, "logps/chosen": -535.3880004882812, "logps/rejected": -529.2850952148438, "loss": 0.1331, "rewards/accuracies": 1.0, "rewards/chosen": 2.380345106124878, "rewards/margins": 3.3536624908447266, "rewards/rejected": -0.9733173847198486, "step": 1771 }, { "epoch": 1.2946118721461186, "grad_norm": 34.776613431972855, "learning_rate": 4.275644346550904e-07, "logits/chosen": -2.7272579669952393, "logits/rejected": -2.270949363708496, "logps/chosen": -520.6340942382812, "logps/rejected": -471.6927795410156, "loss": 0.1953, "rewards/accuracies": 0.875, "rewards/chosen": 2.627933979034424, "rewards/margins": 3.9671802520751953, "rewards/rejected": -1.339246153831482, "step": 1772 }, { "epoch": 1.2953424657534247, "grad_norm": 32.42644546098716, "learning_rate": 4.274521169357065e-07, "logits/chosen": -3.00901460647583, "logits/rejected": -2.326101303100586, "logps/chosen": -900.9769897460938, "logps/rejected": -697.339111328125, "loss": 0.1406, "rewards/accuracies": 1.0, "rewards/chosen": 4.207093715667725, "rewards/margins": 3.9383411407470703, "rewards/rejected": 0.2687520980834961, "step": 1773 }, { "epoch": 1.2960730593607306, "grad_norm": 27.338382285645466, "learning_rate": 4.2733972698179666e-07, "logits/chosen": -2.2718710899353027, "logits/rejected": -2.413506269454956, "logps/chosen": -501.0461730957031, "logps/rejected": -544.551025390625, "loss": 0.1329, "rewards/accuracies": 1.0, "rewards/chosen": 1.1824029684066772, "rewards/margins": 2.05165433883667, "rewards/rejected": -0.8692512512207031, "step": 1774 }, { "epoch": 1.2968036529680365, "grad_norm": 30.183509578713362, "learning_rate": 4.272272648391109e-07, "logits/chosen": -2.7206339836120605, "logits/rejected": -1.7626113891601562, "logps/chosen": -604.035400390625, "logps/rejected": -482.66253662109375, "loss": 0.1276, "rewards/accuracies": 1.0, "rewards/chosen": 3.8520617485046387, "rewards/margins": 5.380516052246094, "rewards/rejected": -1.528454303741455, "step": 1775 }, { "epoch": 1.2975342465753426, "grad_norm": 46.73790890600745, "learning_rate": 4.2711473055342864e-07, "logits/chosen": -2.0755057334899902, "logits/rejected": -2.3309009075164795, "logps/chosen": -622.3472900390625, "logps/rejected": -640.29345703125, "loss": 0.2134, "rewards/accuracies": 0.875, "rewards/chosen": 2.4645535945892334, "rewards/margins": 2.9744677543640137, "rewards/rejected": -0.5099141001701355, "step": 1776 }, { "epoch": 1.2982648401826484, "grad_norm": 41.48290957863702, "learning_rate": 4.2700212417055853e-07, "logits/chosen": -2.8904972076416016, "logits/rejected": -2.6830291748046875, "logps/chosen": -823.0213623046875, "logps/rejected": -898.3579711914062, "loss": 0.2564, "rewards/accuracies": 0.75, "rewards/chosen": 2.6209068298339844, "rewards/margins": 2.072539806365967, "rewards/rejected": 0.5483670234680176, "step": 1777 }, { "epoch": 1.2989954337899543, "grad_norm": 44.74127146237926, "learning_rate": 4.268894457363388e-07, "logits/chosen": -3.155766248703003, "logits/rejected": -1.6971089839935303, "logps/chosen": -701.3991088867188, "logps/rejected": -335.5740966796875, "loss": 0.1851, "rewards/accuracies": 1.0, "rewards/chosen": 2.9393601417541504, "rewards/margins": 4.018399238586426, "rewards/rejected": -1.079039216041565, "step": 1778 }, { "epoch": 1.2997260273972602, "grad_norm": 39.82026881619146, "learning_rate": 4.2677669529663686e-07, "logits/chosen": -2.584224224090576, "logits/rejected": -2.1915011405944824, "logps/chosen": -640.6417236328125, "logps/rejected": -704.228515625, "loss": 0.2019, "rewards/accuracies": 1.0, "rewards/chosen": 2.318667411804199, "rewards/margins": 3.5977349281311035, "rewards/rejected": -1.2790675163269043, "step": 1779 }, { "epoch": 1.3004566210045663, "grad_norm": 40.89081904329198, "learning_rate": 4.266638728973494e-07, "logits/chosen": -2.3808276653289795, "logits/rejected": -2.077981472015381, "logps/chosen": -549.7946166992188, "logps/rejected": -428.50018310546875, "loss": 0.2146, "rewards/accuracies": 0.75, "rewards/chosen": 2.833568572998047, "rewards/margins": 3.4246950149536133, "rewards/rejected": -0.5911263227462769, "step": 1780 }, { "epoch": 1.3011872146118721, "grad_norm": 42.20713374148824, "learning_rate": 4.2655097858440257e-07, "logits/chosen": -2.9692282676696777, "logits/rejected": -1.7044118642807007, "logps/chosen": -542.217529296875, "logps/rejected": -332.8336486816406, "loss": 0.1967, "rewards/accuracies": 1.0, "rewards/chosen": 2.9162869453430176, "rewards/margins": 3.782752513885498, "rewards/rejected": -0.8664656281471252, "step": 1781 }, { "epoch": 1.301917808219178, "grad_norm": 37.29728738774946, "learning_rate": 4.2643801240375153e-07, "logits/chosen": -2.7632861137390137, "logits/rejected": -1.9674139022827148, "logps/chosen": -608.5172729492188, "logps/rejected": -592.7537231445312, "loss": 0.146, "rewards/accuracies": 0.875, "rewards/chosen": 3.8473403453826904, "rewards/margins": 5.009336471557617, "rewards/rejected": -1.1619960069656372, "step": 1782 }, { "epoch": 1.3026484018264841, "grad_norm": 40.07927477639238, "learning_rate": 4.263249744013809e-07, "logits/chosen": -2.692742347717285, "logits/rejected": -2.2128427028656006, "logps/chosen": -718.6739501953125, "logps/rejected": -568.9835205078125, "loss": 0.1688, "rewards/accuracies": 1.0, "rewards/chosen": 2.97324800491333, "rewards/margins": 2.4706907272338867, "rewards/rejected": 0.5025572180747986, "step": 1783 }, { "epoch": 1.30337899543379, "grad_norm": 46.11759182124133, "learning_rate": 4.262118646233046e-07, "logits/chosen": -3.069267511367798, "logits/rejected": -2.695793628692627, "logps/chosen": -614.4266967773438, "logps/rejected": -563.75146484375, "loss": 0.2536, "rewards/accuracies": 0.875, "rewards/chosen": 1.525341272354126, "rewards/margins": 1.8848915100097656, "rewards/rejected": -0.3595500588417053, "step": 1784 }, { "epoch": 1.3041095890410959, "grad_norm": 22.384202481928032, "learning_rate": 4.260986831155655e-07, "logits/chosen": -2.780534505844116, "logits/rejected": -2.938415050506592, "logps/chosen": -349.2906494140625, "logps/rejected": -545.0291748046875, "loss": 0.102, "rewards/accuracies": 1.0, "rewards/chosen": 1.9833147525787354, "rewards/margins": 5.388606548309326, "rewards/rejected": -3.4052915573120117, "step": 1785 }, { "epoch": 1.3048401826484017, "grad_norm": 45.991228677155114, "learning_rate": 4.259854299242358e-07, "logits/chosen": -3.3314807415008545, "logits/rejected": -2.0018606185913086, "logps/chosen": -701.3358154296875, "logps/rejected": -392.51922607421875, "loss": 0.213, "rewards/accuracies": 1.0, "rewards/chosen": 3.3495230674743652, "rewards/margins": 4.603033065795898, "rewards/rejected": -1.2535094022750854, "step": 1786 }, { "epoch": 1.3055707762557078, "grad_norm": 27.923917673253808, "learning_rate": 4.2587210509541704e-07, "logits/chosen": -2.712359666824341, "logits/rejected": -1.888846516609192, "logps/chosen": -412.43017578125, "logps/rejected": -339.70782470703125, "loss": 0.1245, "rewards/accuracies": 1.0, "rewards/chosen": 2.7737791538238525, "rewards/margins": 3.9493179321289062, "rewards/rejected": -1.1755385398864746, "step": 1787 }, { "epoch": 1.3063013698630137, "grad_norm": 24.96509526209168, "learning_rate": 4.2575870867523973e-07, "logits/chosen": -2.6684141159057617, "logits/rejected": -1.466355562210083, "logps/chosen": -554.62060546875, "logps/rejected": -385.7414855957031, "loss": 0.1657, "rewards/accuracies": 1.0, "rewards/chosen": 4.527894020080566, "rewards/margins": 5.540718078613281, "rewards/rejected": -1.0128246545791626, "step": 1788 }, { "epoch": 1.3070319634703196, "grad_norm": 24.375937732362857, "learning_rate": 4.256452407098635e-07, "logits/chosen": -2.888148784637451, "logits/rejected": -1.8504598140716553, "logps/chosen": -528.5081176757812, "logps/rejected": -289.189208984375, "loss": 0.1489, "rewards/accuracies": 0.875, "rewards/chosen": 2.665004253387451, "rewards/margins": 3.143850088119507, "rewards/rejected": -0.4788457751274109, "step": 1789 }, { "epoch": 1.3077625570776257, "grad_norm": 41.2001696536821, "learning_rate": 4.255317012454772e-07, "logits/chosen": -2.429511785507202, "logits/rejected": -2.2415788173675537, "logps/chosen": -594.2510375976562, "logps/rejected": -499.8289794921875, "loss": 0.1753, "rewards/accuracies": 0.875, "rewards/chosen": 3.1480255126953125, "rewards/margins": 2.5493385791778564, "rewards/rejected": 0.598686695098877, "step": 1790 }, { "epoch": 1.3084931506849315, "grad_norm": 56.91384104709833, "learning_rate": 4.2541809032829876e-07, "logits/chosen": -2.3729071617126465, "logits/rejected": -2.353097915649414, "logps/chosen": -493.84423828125, "logps/rejected": -627.4158325195312, "loss": 0.2631, "rewards/accuracies": 0.75, "rewards/chosen": 1.4210543632507324, "rewards/margins": 2.1436538696289062, "rewards/rejected": -0.7225996851921082, "step": 1791 }, { "epoch": 1.3092237442922374, "grad_norm": 34.10931952869974, "learning_rate": 4.253044080045753e-07, "logits/chosen": -3.115719795227051, "logits/rejected": -2.4130468368530273, "logps/chosen": -646.1329345703125, "logps/rejected": -580.251708984375, "loss": 0.1782, "rewards/accuracies": 1.0, "rewards/chosen": 2.056281089782715, "rewards/margins": 2.4914705753326416, "rewards/rejected": -0.43518972396850586, "step": 1792 }, { "epoch": 1.3099543378995433, "grad_norm": 32.80135785525406, "learning_rate": 4.2519065432058275e-07, "logits/chosen": -3.150590419769287, "logits/rejected": -2.066650629043579, "logps/chosen": -711.8294677734375, "logps/rejected": -414.5650634765625, "loss": 0.1812, "rewards/accuracies": 1.0, "rewards/chosen": 3.0423383712768555, "rewards/margins": 3.7994956970214844, "rewards/rejected": -0.757157564163208, "step": 1793 }, { "epoch": 1.3106849315068494, "grad_norm": 37.1228084238442, "learning_rate": 4.2507682932262636e-07, "logits/chosen": -2.9356930255889893, "logits/rejected": -2.1777641773223877, "logps/chosen": -723.2352294921875, "logps/rejected": -459.7544250488281, "loss": 0.2661, "rewards/accuracies": 1.0, "rewards/chosen": 2.470116138458252, "rewards/margins": 2.400413990020752, "rewards/rejected": 0.06970192492008209, "step": 1794 }, { "epoch": 1.3114155251141553, "grad_norm": 34.938312336821774, "learning_rate": 4.249629330570401e-07, "logits/chosen": -3.040198802947998, "logits/rejected": -2.6760830879211426, "logps/chosen": -879.2646484375, "logps/rejected": -691.4085693359375, "loss": 0.1892, "rewards/accuracies": 1.0, "rewards/chosen": 3.3456523418426514, "rewards/margins": 3.5042190551757812, "rewards/rejected": -0.1585668921470642, "step": 1795 }, { "epoch": 1.3121461187214611, "grad_norm": 25.575002434246585, "learning_rate": 4.248489655701875e-07, "logits/chosen": -2.7597413063049316, "logits/rejected": -2.8648788928985596, "logps/chosen": -601.5045166015625, "logps/rejected": -602.6712036132812, "loss": 0.1526, "rewards/accuracies": 1.0, "rewards/chosen": 2.08535099029541, "rewards/margins": 2.134769916534424, "rewards/rejected": -0.04941880702972412, "step": 1796 }, { "epoch": 1.3128767123287672, "grad_norm": 26.59875398169821, "learning_rate": 4.2473492690846025e-07, "logits/chosen": -2.7036209106445312, "logits/rejected": -2.317469358444214, "logps/chosen": -559.8565063476562, "logps/rejected": -471.0831604003906, "loss": 0.1962, "rewards/accuracies": 0.75, "rewards/chosen": 2.068444013595581, "rewards/margins": 2.194218158721924, "rewards/rejected": -0.12577387690544128, "step": 1797 }, { "epoch": 1.313607305936073, "grad_norm": 24.67955147952387, "learning_rate": 4.246208171182799e-07, "logits/chosen": -2.817070960998535, "logits/rejected": -2.2316296100616455, "logps/chosen": -852.5242919921875, "logps/rejected": -688.1063842773438, "loss": 0.1454, "rewards/accuracies": 1.0, "rewards/chosen": 2.4610707759857178, "rewards/margins": 2.5233120918273926, "rewards/rejected": -0.06224152445793152, "step": 1798 }, { "epoch": 1.314337899543379, "grad_norm": 27.461071968020704, "learning_rate": 4.2450663624609627e-07, "logits/chosen": -2.9375953674316406, "logits/rejected": -2.281344175338745, "logps/chosen": -843.3637084960938, "logps/rejected": -697.293212890625, "loss": 0.1139, "rewards/accuracies": 1.0, "rewards/chosen": 3.0289828777313232, "rewards/margins": 4.477448463439941, "rewards/rejected": -1.448465347290039, "step": 1799 }, { "epoch": 1.3150684931506849, "grad_norm": 38.76908552168209, "learning_rate": 4.2439238433838857e-07, "logits/chosen": -2.779010772705078, "logits/rejected": -2.602424383163452, "logps/chosen": -472.7967529296875, "logps/rejected": -536.7210083007812, "loss": 0.229, "rewards/accuracies": 1.0, "rewards/chosen": 1.0145113468170166, "rewards/margins": 2.2140305042266846, "rewards/rejected": -1.1995189189910889, "step": 1800 }, { "epoch": 1.3157990867579907, "grad_norm": 24.32312313103842, "learning_rate": 4.242780614416647e-07, "logits/chosen": -2.43298077583313, "logits/rejected": -2.8658523559570312, "logps/chosen": -657.5291748046875, "logps/rejected": -911.6541748046875, "loss": 0.1527, "rewards/accuracies": 0.875, "rewards/chosen": 2.3071811199188232, "rewards/margins": 2.4169068336486816, "rewards/rejected": -0.10972577333450317, "step": 1801 }, { "epoch": 1.3165296803652968, "grad_norm": 38.787856382305726, "learning_rate": 4.2416366760246136e-07, "logits/chosen": -2.360239028930664, "logits/rejected": -2.630028009414673, "logps/chosen": -612.4796752929688, "logps/rejected": -484.3408203125, "loss": 0.2531, "rewards/accuracies": 0.875, "rewards/chosen": 1.578870177268982, "rewards/margins": 1.8974943161010742, "rewards/rejected": -0.3186242878437042, "step": 1802 }, { "epoch": 1.3172602739726027, "grad_norm": 48.465903970278426, "learning_rate": 4.240492028673444e-07, "logits/chosen": -2.7488112449645996, "logits/rejected": -2.48305606842041, "logps/chosen": -563.1893920898438, "logps/rejected": -716.695556640625, "loss": 0.2437, "rewards/accuracies": 0.875, "rewards/chosen": 1.2915029525756836, "rewards/margins": 2.262333869934082, "rewards/rejected": -0.9708309173583984, "step": 1803 }, { "epoch": 1.3179908675799088, "grad_norm": 40.90192424845758, "learning_rate": 4.2393466728290826e-07, "logits/chosen": -2.8463797569274902, "logits/rejected": -1.2066398859024048, "logps/chosen": -708.8365478515625, "logps/rejected": -332.405517578125, "loss": 0.1809, "rewards/accuracies": 1.0, "rewards/chosen": 3.455542802810669, "rewards/margins": 3.5053696632385254, "rewards/rejected": -0.04982671141624451, "step": 1804 }, { "epoch": 1.3187214611872147, "grad_norm": 36.391183510634654, "learning_rate": 4.2382006089577646e-07, "logits/chosen": -2.916996479034424, "logits/rejected": -2.3510937690734863, "logps/chosen": -1124.7958984375, "logps/rejected": -924.8524780273438, "loss": 0.2403, "rewards/accuracies": 1.0, "rewards/chosen": 4.456892013549805, "rewards/margins": 3.0697174072265625, "rewards/rejected": 1.3871750831604004, "step": 1805 }, { "epoch": 1.3194520547945205, "grad_norm": 44.08933967578643, "learning_rate": 4.2370538375260133e-07, "logits/chosen": -2.680375814437866, "logits/rejected": -2.620560646057129, "logps/chosen": -932.8338012695312, "logps/rejected": -902.2759399414062, "loss": 0.1917, "rewards/accuracies": 0.875, "rewards/chosen": 3.7578213214874268, "rewards/margins": 3.111400842666626, "rewards/rejected": 0.6464205980300903, "step": 1806 }, { "epoch": 1.3201826484018264, "grad_norm": 23.52256827520581, "learning_rate": 4.2359063590006374e-07, "logits/chosen": -2.68919038772583, "logits/rejected": -2.260401725769043, "logps/chosen": -520.0604248046875, "logps/rejected": -482.5045166015625, "loss": 0.1215, "rewards/accuracies": 1.0, "rewards/chosen": 1.8721412420272827, "rewards/margins": 4.183918476104736, "rewards/rejected": -2.311777353286743, "step": 1807 }, { "epoch": 1.3209132420091323, "grad_norm": 33.98255921873289, "learning_rate": 4.234758173848736e-07, "logits/chosen": -3.1824121475219727, "logits/rejected": -2.79121732711792, "logps/chosen": -555.8595581054688, "logps/rejected": -506.0635986328125, "loss": 0.1861, "rewards/accuracies": 0.875, "rewards/chosen": 1.5764174461364746, "rewards/margins": 2.6224026679992676, "rewards/rejected": -1.045985221862793, "step": 1808 }, { "epoch": 1.3216438356164384, "grad_norm": 32.011886296465136, "learning_rate": 4.2336092825376946e-07, "logits/chosen": -2.3405463695526123, "logits/rejected": -2.698183536529541, "logps/chosen": -618.0093994140625, "logps/rejected": -764.3460083007812, "loss": 0.1444, "rewards/accuracies": 1.0, "rewards/chosen": 1.3977187871932983, "rewards/margins": 2.752865791320801, "rewards/rejected": -1.355146884918213, "step": 1809 }, { "epoch": 1.3223744292237443, "grad_norm": 21.687915105447406, "learning_rate": 4.232459685535186e-07, "logits/chosen": -2.270587682723999, "logits/rejected": -2.357099771499634, "logps/chosen": -247.89833068847656, "logps/rejected": -425.3964538574219, "loss": 0.1209, "rewards/accuracies": 0.875, "rewards/chosen": 1.7629971504211426, "rewards/margins": 3.3944242000579834, "rewards/rejected": -1.6314269304275513, "step": 1810 }, { "epoch": 1.3231050228310504, "grad_norm": 44.60355423933445, "learning_rate": 4.2313093833091716e-07, "logits/chosen": -2.48878812789917, "logits/rejected": -1.3767293691635132, "logps/chosen": -1003.6798095703125, "logps/rejected": -369.4459533691406, "loss": 0.2855, "rewards/accuracies": 0.875, "rewards/chosen": 2.3941285610198975, "rewards/margins": 3.447523593902588, "rewards/rejected": -1.0533949136734009, "step": 1811 }, { "epoch": 1.3238356164383562, "grad_norm": 32.38121886157063, "learning_rate": 4.230158376327899e-07, "logits/chosen": -2.7606351375579834, "logits/rejected": -2.417895555496216, "logps/chosen": -385.0032653808594, "logps/rejected": -395.513427734375, "loss": 0.1731, "rewards/accuracies": 1.0, "rewards/chosen": 2.9129278659820557, "rewards/margins": 4.1940813064575195, "rewards/rejected": -1.2811535596847534, "step": 1812 }, { "epoch": 1.324566210045662, "grad_norm": 36.45832849702349, "learning_rate": 4.229006665059903e-07, "logits/chosen": -3.017338275909424, "logits/rejected": -2.341287136077881, "logps/chosen": -516.4862060546875, "logps/rejected": -429.6248779296875, "loss": 0.1874, "rewards/accuracies": 1.0, "rewards/chosen": 1.7637282609939575, "rewards/margins": 2.7928595542907715, "rewards/rejected": -1.029131293296814, "step": 1813 }, { "epoch": 1.325296803652968, "grad_norm": 26.208779950321514, "learning_rate": 4.227854249974004e-07, "logits/chosen": -2.8397269248962402, "logits/rejected": -2.2770028114318848, "logps/chosen": -847.1107177734375, "logps/rejected": -706.4539794921875, "loss": 0.172, "rewards/accuracies": 1.0, "rewards/chosen": 3.3476004600524902, "rewards/margins": 3.3911962509155273, "rewards/rejected": -0.04359562695026398, "step": 1814 }, { "epoch": 1.3260273972602739, "grad_norm": 46.223448353994264, "learning_rate": 4.2267011315393103e-07, "logits/chosen": -2.5438122749328613, "logits/rejected": -2.037818670272827, "logps/chosen": -669.5199584960938, "logps/rejected": -537.0862426757812, "loss": 0.2033, "rewards/accuracies": 0.875, "rewards/chosen": 1.7620192766189575, "rewards/margins": 3.1962525844573975, "rewards/rejected": -1.4342331886291504, "step": 1815 }, { "epoch": 1.32675799086758, "grad_norm": 31.65075165879013, "learning_rate": 4.2255473102252154e-07, "logits/chosen": -2.218575954437256, "logits/rejected": -2.465163230895996, "logps/chosen": -333.2450256347656, "logps/rejected": -597.734375, "loss": 0.1486, "rewards/accuracies": 1.0, "rewards/chosen": 1.0984468460083008, "rewards/margins": 3.388796806335449, "rewards/rejected": -2.2903499603271484, "step": 1816 }, { "epoch": 1.3274885844748858, "grad_norm": 46.07722161288048, "learning_rate": 4.2243927865013997e-07, "logits/chosen": -2.845888137817383, "logits/rejected": -2.665153980255127, "logps/chosen": -765.4312744140625, "logps/rejected": -816.0450439453125, "loss": 0.2607, "rewards/accuracies": 1.0, "rewards/chosen": 2.0491466522216797, "rewards/margins": 2.890242576599121, "rewards/rejected": -0.841096043586731, "step": 1817 }, { "epoch": 1.3282191780821917, "grad_norm": 45.521012704642786, "learning_rate": 4.2232375608378295e-07, "logits/chosen": -2.3865723609924316, "logits/rejected": -2.1564207077026367, "logps/chosen": -735.276611328125, "logps/rejected": -572.7471923828125, "loss": 0.2851, "rewards/accuracies": 1.0, "rewards/chosen": 2.831087589263916, "rewards/margins": 3.157547950744629, "rewards/rejected": -0.3264607787132263, "step": 1818 }, { "epoch": 1.3289497716894978, "grad_norm": 40.230495868825216, "learning_rate": 4.222081633704756e-07, "logits/chosen": -2.5408499240875244, "logits/rejected": -2.4171981811523438, "logps/chosen": -604.9302368164062, "logps/rejected": -590.6170654296875, "loss": 0.2309, "rewards/accuracies": 0.875, "rewards/chosen": 1.8899672031402588, "rewards/margins": 2.0815482139587402, "rewards/rejected": -0.19158107042312622, "step": 1819 }, { "epoch": 1.3296803652968037, "grad_norm": 39.660296102198515, "learning_rate": 4.2209250055727177e-07, "logits/chosen": -3.1582441329956055, "logits/rejected": -3.180060386657715, "logps/chosen": -640.8667602539062, "logps/rejected": -696.669189453125, "loss": 0.1829, "rewards/accuracies": 1.0, "rewards/chosen": 3.07680082321167, "rewards/margins": 3.4818789958953857, "rewards/rejected": -0.4050781726837158, "step": 1820 }, { "epoch": 1.3304109589041095, "grad_norm": 22.668567502970248, "learning_rate": 4.2197676769125363e-07, "logits/chosen": -2.911588191986084, "logits/rejected": -1.703389286994934, "logps/chosen": -514.4267578125, "logps/rejected": -394.9584045410156, "loss": 0.0975, "rewards/accuracies": 1.0, "rewards/chosen": 3.262265682220459, "rewards/margins": 5.924951553344727, "rewards/rejected": -2.6626858711242676, "step": 1821 }, { "epoch": 1.3311415525114154, "grad_norm": 19.20794443438785, "learning_rate": 4.21860964819532e-07, "logits/chosen": -2.881317138671875, "logits/rejected": -2.8550009727478027, "logps/chosen": -603.31494140625, "logps/rejected": -572.8736572265625, "loss": 0.1232, "rewards/accuracies": 1.0, "rewards/chosen": 3.728208065032959, "rewards/margins": 3.6367335319519043, "rewards/rejected": 0.0914747565984726, "step": 1822 }, { "epoch": 1.3318721461187215, "grad_norm": 28.93070650379271, "learning_rate": 4.2174509198924613e-07, "logits/chosen": -2.2849416732788086, "logits/rejected": -2.694453001022339, "logps/chosen": -645.8103637695312, "logps/rejected": -652.8519897460938, "loss": 0.1524, "rewards/accuracies": 1.0, "rewards/chosen": 2.5871715545654297, "rewards/margins": 4.035329341888428, "rewards/rejected": -1.4481574296951294, "step": 1823 }, { "epoch": 1.3326027397260274, "grad_norm": 20.08175353277626, "learning_rate": 4.216291492475638e-07, "logits/chosen": -3.186539649963379, "logits/rejected": -2.268181800842285, "logps/chosen": -1369.31005859375, "logps/rejected": -809.2474365234375, "loss": 0.0842, "rewards/accuracies": 1.0, "rewards/chosen": 4.832579135894775, "rewards/margins": 3.5159120559692383, "rewards/rejected": 1.316667079925537, "step": 1824 }, { "epoch": 1.3333333333333333, "grad_norm": 48.41825726643567, "learning_rate": 4.215131366416812e-07, "logits/chosen": -2.845266580581665, "logits/rejected": -2.244335174560547, "logps/chosen": -823.604248046875, "logps/rejected": -685.695556640625, "loss": 0.2977, "rewards/accuracies": 1.0, "rewards/chosen": 4.03903865814209, "rewards/margins": 3.680259943008423, "rewards/rejected": 0.3587791621685028, "step": 1825 }, { "epoch": 1.3340639269406394, "grad_norm": 21.887806658554567, "learning_rate": 4.213970542188231e-07, "logits/chosen": -2.8558754920959473, "logits/rejected": -2.3279471397399902, "logps/chosen": -812.4522094726562, "logps/rejected": -537.5689697265625, "loss": 0.1389, "rewards/accuracies": 1.0, "rewards/chosen": 2.981571912765503, "rewards/margins": 3.3260879516601562, "rewards/rejected": -0.34451594948768616, "step": 1826 }, { "epoch": 1.3347945205479452, "grad_norm": 36.23587413349856, "learning_rate": 4.212809020262425e-07, "logits/chosen": -2.9530510902404785, "logits/rejected": -2.610499858856201, "logps/chosen": -658.759521484375, "logps/rejected": -637.5595703125, "loss": 0.2016, "rewards/accuracies": 0.875, "rewards/chosen": 1.9649136066436768, "rewards/margins": 3.2973241806030273, "rewards/rejected": -1.3324106931686401, "step": 1827 }, { "epoch": 1.335525114155251, "grad_norm": 28.792439591613608, "learning_rate": 4.2116468011122086e-07, "logits/chosen": -2.7475109100341797, "logits/rejected": -1.5339686870574951, "logps/chosen": -797.194580078125, "logps/rejected": -495.4429626464844, "loss": 0.14, "rewards/accuracies": 1.0, "rewards/chosen": 3.6627554893493652, "rewards/margins": 4.346040725708008, "rewards/rejected": -0.6832854747772217, "step": 1828 }, { "epoch": 1.336255707762557, "grad_norm": 41.13143323304803, "learning_rate": 4.2104838852106804e-07, "logits/chosen": -2.1700916290283203, "logits/rejected": -2.482015609741211, "logps/chosen": -349.3969421386719, "logps/rejected": -577.0925903320312, "loss": 0.2391, "rewards/accuracies": 1.0, "rewards/chosen": 1.2608819007873535, "rewards/margins": 4.384854316711426, "rewards/rejected": -3.1239724159240723, "step": 1829 }, { "epoch": 1.336986301369863, "grad_norm": 41.826900722184014, "learning_rate": 4.2093202730312227e-07, "logits/chosen": -3.0365347862243652, "logits/rejected": -1.9503895044326782, "logps/chosen": -608.6923828125, "logps/rejected": -447.06744384765625, "loss": 0.181, "rewards/accuracies": 1.0, "rewards/chosen": 3.618849992752075, "rewards/margins": 5.905117511749268, "rewards/rejected": -2.2862672805786133, "step": 1830 }, { "epoch": 1.337716894977169, "grad_norm": 37.53715444552487, "learning_rate": 4.208155965047502e-07, "logits/chosen": -3.07535982131958, "logits/rejected": -2.7504067420959473, "logps/chosen": -691.9225463867188, "logps/rejected": -716.01171875, "loss": 0.2292, "rewards/accuracies": 1.0, "rewards/chosen": 1.464526653289795, "rewards/margins": 1.96506667137146, "rewards/rejected": -0.5005399584770203, "step": 1831 }, { "epoch": 1.3384474885844748, "grad_norm": 34.8596428861289, "learning_rate": 4.206990961733466e-07, "logits/chosen": -2.611666679382324, "logits/rejected": -2.370710611343384, "logps/chosen": -500.2347412109375, "logps/rejected": -642.47265625, "loss": 0.1697, "rewards/accuracies": 1.0, "rewards/chosen": 2.2387828826904297, "rewards/margins": 4.811971664428711, "rewards/rejected": -2.573188304901123, "step": 1832 }, { "epoch": 1.339178082191781, "grad_norm": 22.998626667983487, "learning_rate": 4.205825263563347e-07, "logits/chosen": -2.7765533924102783, "logits/rejected": -3.167976140975952, "logps/chosen": -524.048583984375, "logps/rejected": -676.301513671875, "loss": 0.1241, "rewards/accuracies": 1.0, "rewards/chosen": 2.740530490875244, "rewards/margins": 3.073981761932373, "rewards/rejected": -0.3334510922431946, "step": 1833 }, { "epoch": 1.3399086757990868, "grad_norm": 30.34507954412152, "learning_rate": 4.2046588710116603e-07, "logits/chosen": -2.6248855590820312, "logits/rejected": -1.686500072479248, "logps/chosen": -874.1297607421875, "logps/rejected": -462.0053405761719, "loss": 0.1854, "rewards/accuracies": 1.0, "rewards/chosen": 3.519031524658203, "rewards/margins": 2.905937671661377, "rewards/rejected": 0.6130934953689575, "step": 1834 }, { "epoch": 1.3406392694063927, "grad_norm": 42.4618107373408, "learning_rate": 4.2034917845532035e-07, "logits/chosen": -2.5909042358398438, "logits/rejected": -2.129404306411743, "logps/chosen": -406.05780029296875, "logps/rejected": -262.27734375, "loss": 0.2597, "rewards/accuracies": 0.875, "rewards/chosen": 1.2979252338409424, "rewards/margins": 2.1833930015563965, "rewards/rejected": -0.8854677677154541, "step": 1835 }, { "epoch": 1.3413698630136985, "grad_norm": 37.06169963828056, "learning_rate": 4.2023240046630553e-07, "logits/chosen": -2.863795757293701, "logits/rejected": -2.5442936420440674, "logps/chosen": -657.3939208984375, "logps/rejected": -756.0400390625, "loss": 0.1713, "rewards/accuracies": 1.0, "rewards/chosen": 3.0490384101867676, "rewards/margins": 4.37984561920166, "rewards/rejected": -1.3308072090148926, "step": 1836 }, { "epoch": 1.3421004566210046, "grad_norm": 45.93105822180674, "learning_rate": 4.201155531816579e-07, "logits/chosen": -2.450366497039795, "logits/rejected": -1.873131513595581, "logps/chosen": -692.091552734375, "logps/rejected": -618.4495849609375, "loss": 0.254, "rewards/accuracies": 1.0, "rewards/chosen": 2.3593192100524902, "rewards/margins": 3.7488982677459717, "rewards/rejected": -1.3895792961120605, "step": 1837 }, { "epoch": 1.3428310502283105, "grad_norm": 23.494354219168937, "learning_rate": 4.1999863664894183e-07, "logits/chosen": -2.901287078857422, "logits/rejected": -2.288663864135742, "logps/chosen": -939.48583984375, "logps/rejected": -664.48876953125, "loss": 0.1871, "rewards/accuracies": 1.0, "rewards/chosen": 4.181014537811279, "rewards/margins": 3.850510358810425, "rewards/rejected": 0.33050400018692017, "step": 1838 }, { "epoch": 1.3435616438356164, "grad_norm": 43.251943202823995, "learning_rate": 4.198816509157499e-07, "logits/chosen": -2.5891528129577637, "logits/rejected": -2.442122220993042, "logps/chosen": -304.95330810546875, "logps/rejected": -349.4421081542969, "loss": 0.1749, "rewards/accuracies": 1.0, "rewards/chosen": 2.0063023567199707, "rewards/margins": 5.025832653045654, "rewards/rejected": -3.0195302963256836, "step": 1839 }, { "epoch": 1.3442922374429225, "grad_norm": 27.90712484630941, "learning_rate": 4.1976459602970305e-07, "logits/chosen": -2.803079843521118, "logits/rejected": -2.1637892723083496, "logps/chosen": -542.7521362304688, "logps/rejected": -513.6553955078125, "loss": 0.218, "rewards/accuracies": 1.0, "rewards/chosen": 2.725339651107788, "rewards/margins": 4.596558094024658, "rewards/rejected": -1.8712189197540283, "step": 1840 }, { "epoch": 1.3450228310502284, "grad_norm": 36.95765531162404, "learning_rate": 4.1964747203845007e-07, "logits/chosen": -2.9062788486480713, "logits/rejected": -2.675276279449463, "logps/chosen": -745.9749755859375, "logps/rejected": -570.2723999023438, "loss": 0.1547, "rewards/accuracies": 1.0, "rewards/chosen": 3.1751670837402344, "rewards/margins": 3.752293586730957, "rewards/rejected": -0.5771267414093018, "step": 1841 }, { "epoch": 1.3457534246575342, "grad_norm": 31.217421494818502, "learning_rate": 4.1953027898966795e-07, "logits/chosen": -2.834841728210449, "logits/rejected": -2.6903576850891113, "logps/chosen": -651.229248046875, "logps/rejected": -632.9823608398438, "loss": 0.1304, "rewards/accuracies": 1.0, "rewards/chosen": 4.162495136260986, "rewards/margins": 4.913296699523926, "rewards/rejected": -0.7508017420768738, "step": 1842 }, { "epoch": 1.34648401826484, "grad_norm": 26.12833832214675, "learning_rate": 4.194130169310621e-07, "logits/chosen": -3.200617551803589, "logits/rejected": -2.314145088195801, "logps/chosen": -722.8455810546875, "logps/rejected": -498.81036376953125, "loss": 0.1175, "rewards/accuracies": 1.0, "rewards/chosen": 3.434990644454956, "rewards/margins": 4.2181396484375, "rewards/rejected": -0.7831487655639648, "step": 1843 }, { "epoch": 1.3472146118721462, "grad_norm": 39.71316837256252, "learning_rate": 4.192956859103657e-07, "logits/chosen": -2.70192289352417, "logits/rejected": -2.6205875873565674, "logps/chosen": -503.9673156738281, "logps/rejected": -510.66900634765625, "loss": 0.2164, "rewards/accuracies": 0.75, "rewards/chosen": 2.0533523559570312, "rewards/margins": 2.9901070594787598, "rewards/rejected": -0.9367549419403076, "step": 1844 }, { "epoch": 1.347945205479452, "grad_norm": 40.2468821579697, "learning_rate": 4.1917828597534e-07, "logits/chosen": -2.9104254245758057, "logits/rejected": -2.455204725265503, "logps/chosen": -580.1201171875, "logps/rejected": -444.16552734375, "loss": 0.1881, "rewards/accuracies": 0.875, "rewards/chosen": 3.190408229827881, "rewards/margins": 3.272961139678955, "rewards/rejected": -0.08255285024642944, "step": 1845 }, { "epoch": 1.348675799086758, "grad_norm": 35.188469610863834, "learning_rate": 4.190608171737744e-07, "logits/chosen": -2.1920056343078613, "logits/rejected": -2.7917208671569824, "logps/chosen": -625.9637451171875, "logps/rejected": -652.586181640625, "loss": 0.2144, "rewards/accuracies": 0.875, "rewards/chosen": 2.586552858352661, "rewards/margins": 3.336608409881592, "rewards/rejected": -0.7500556707382202, "step": 1846 }, { "epoch": 1.349406392694064, "grad_norm": 43.91385054320846, "learning_rate": 4.1894327955348643e-07, "logits/chosen": -2.7971739768981934, "logits/rejected": -2.825392723083496, "logps/chosen": -287.7797546386719, "logps/rejected": -322.56884765625, "loss": 0.337, "rewards/accuracies": 0.875, "rewards/chosen": 1.7540793418884277, "rewards/margins": 2.615330696105957, "rewards/rejected": -0.8612511157989502, "step": 1847 }, { "epoch": 1.35013698630137, "grad_norm": 48.26835133122298, "learning_rate": 4.1882567316232145e-07, "logits/chosen": -2.9874701499938965, "logits/rejected": -2.2405128479003906, "logps/chosen": -542.3388061523438, "logps/rejected": -434.8624572753906, "loss": 0.3349, "rewards/accuracies": 1.0, "rewards/chosen": 1.3467016220092773, "rewards/margins": 1.8701986074447632, "rewards/rejected": -0.5234968662261963, "step": 1848 }, { "epoch": 1.3508675799086758, "grad_norm": 37.71657188719403, "learning_rate": 4.187079980481529e-07, "logits/chosen": -2.7206735610961914, "logits/rejected": -2.4977800846099854, "logps/chosen": -595.748291015625, "logps/rejected": -680.0039672851562, "loss": 0.2237, "rewards/accuracies": 0.875, "rewards/chosen": 2.3322644233703613, "rewards/margins": 3.5233778953552246, "rewards/rejected": -1.1911134719848633, "step": 1849 }, { "epoch": 1.3515981735159817, "grad_norm": 31.49303444036028, "learning_rate": 4.1859025425888224e-07, "logits/chosen": -3.1732077598571777, "logits/rejected": -2.211994171142578, "logps/chosen": -730.1064453125, "logps/rejected": -524.0045166015625, "loss": 0.158, "rewards/accuracies": 1.0, "rewards/chosen": 3.1629159450531006, "rewards/margins": 3.6849865913391113, "rewards/rejected": -0.5220708847045898, "step": 1850 }, { "epoch": 1.3523287671232875, "grad_norm": 37.01000511369358, "learning_rate": 4.1847244184243867e-07, "logits/chosen": -2.5282018184661865, "logits/rejected": -2.1053361892700195, "logps/chosen": -567.116943359375, "logps/rejected": -642.321044921875, "loss": 0.1438, "rewards/accuracies": 1.0, "rewards/chosen": 4.456905364990234, "rewards/margins": 5.879448890686035, "rewards/rejected": -1.422544002532959, "step": 1851 }, { "epoch": 1.3530593607305936, "grad_norm": 49.50830255103052, "learning_rate": 4.183545608467798e-07, "logits/chosen": -2.58707332611084, "logits/rejected": -2.432145118713379, "logps/chosen": -772.4356689453125, "logps/rejected": -842.5879516601562, "loss": 0.2025, "rewards/accuracies": 0.75, "rewards/chosen": 1.5737626552581787, "rewards/margins": 1.3269599676132202, "rewards/rejected": 0.24680262804031372, "step": 1852 }, { "epoch": 1.3537899543378995, "grad_norm": 44.71541550504772, "learning_rate": 4.182366113198905e-07, "logits/chosen": -2.3812215328216553, "logits/rejected": -2.162850856781006, "logps/chosen": -642.3908081054688, "logps/rejected": -623.4268798828125, "loss": 0.2077, "rewards/accuracies": 1.0, "rewards/chosen": 3.4485864639282227, "rewards/margins": 3.102203369140625, "rewards/rejected": 0.34638291597366333, "step": 1853 }, { "epoch": 1.3545205479452056, "grad_norm": 31.11583798542402, "learning_rate": 4.1811859330978406e-07, "logits/chosen": -2.963357448577881, "logits/rejected": -2.474370002746582, "logps/chosen": -854.5436401367188, "logps/rejected": -683.5357055664062, "loss": 0.1351, "rewards/accuracies": 0.75, "rewards/chosen": 2.0249359607696533, "rewards/margins": 2.7820937633514404, "rewards/rejected": -0.7571580410003662, "step": 1854 }, { "epoch": 1.3552511415525115, "grad_norm": 25.986385595694625, "learning_rate": 4.180005068645015e-07, "logits/chosen": -2.6064600944519043, "logits/rejected": -1.9611331224441528, "logps/chosen": -589.20166015625, "logps/rejected": -470.6888427734375, "loss": 0.1695, "rewards/accuracies": 1.0, "rewards/chosen": 2.669809103012085, "rewards/margins": 3.482882499694824, "rewards/rejected": -0.8130737543106079, "step": 1855 }, { "epoch": 1.3559817351598173, "grad_norm": 30.635545059517874, "learning_rate": 4.1788235203211154e-07, "logits/chosen": -2.8604300022125244, "logits/rejected": -2.088768243789673, "logps/chosen": -735.28125, "logps/rejected": -563.6869506835938, "loss": 0.1426, "rewards/accuracies": 1.0, "rewards/chosen": 3.28403377532959, "rewards/margins": 3.4999232292175293, "rewards/rejected": -0.21588951349258423, "step": 1856 }, { "epoch": 1.3567123287671232, "grad_norm": 38.36365333829584, "learning_rate": 4.17764128860711e-07, "logits/chosen": -3.4677648544311523, "logits/rejected": -2.4681949615478516, "logps/chosen": -696.4749755859375, "logps/rejected": -577.6263427734375, "loss": 0.2285, "rewards/accuracies": 1.0, "rewards/chosen": 3.555147647857666, "rewards/margins": 3.101733446121216, "rewards/rejected": 0.45341432094573975, "step": 1857 }, { "epoch": 1.357442922374429, "grad_norm": 37.02628362003048, "learning_rate": 4.176458373984243e-07, "logits/chosen": -2.822723150253296, "logits/rejected": -2.7000865936279297, "logps/chosen": -763.2709350585938, "logps/rejected": -743.8597412109375, "loss": 0.164, "rewards/accuracies": 1.0, "rewards/chosen": 3.3108599185943604, "rewards/margins": 2.40316104888916, "rewards/rejected": 0.9076987504959106, "step": 1858 }, { "epoch": 1.3581735159817352, "grad_norm": 37.355550240748194, "learning_rate": 4.175274776934037e-07, "logits/chosen": -2.5625624656677246, "logits/rejected": -2.391587495803833, "logps/chosen": -546.106201171875, "logps/rejected": -565.446533203125, "loss": 0.2167, "rewards/accuracies": 0.875, "rewards/chosen": 2.7434916496276855, "rewards/margins": 3.5558676719665527, "rewards/rejected": -0.8123759031295776, "step": 1859 }, { "epoch": 1.358904109589041, "grad_norm": 20.025113169984913, "learning_rate": 4.1740904979382935e-07, "logits/chosen": -2.906920909881592, "logits/rejected": -1.554775595664978, "logps/chosen": -589.6283569335938, "logps/rejected": -352.0015869140625, "loss": 0.1329, "rewards/accuracies": 1.0, "rewards/chosen": 3.796196699142456, "rewards/margins": 5.055367469787598, "rewards/rejected": -1.2591708898544312, "step": 1860 }, { "epoch": 1.3596347031963472, "grad_norm": 25.257400485736355, "learning_rate": 4.172905537479091e-07, "logits/chosen": -2.9769434928894043, "logits/rejected": -2.1943342685699463, "logps/chosen": -796.1422119140625, "logps/rejected": -553.186279296875, "loss": 0.1385, "rewards/accuracies": 1.0, "rewards/chosen": 3.3225903511047363, "rewards/margins": 2.755176305770874, "rewards/rejected": 0.5674139261245728, "step": 1861 }, { "epoch": 1.360365296803653, "grad_norm": 30.452107834795793, "learning_rate": 4.1717198960387847e-07, "logits/chosen": -2.6620049476623535, "logits/rejected": -2.8102474212646484, "logps/chosen": -745.3826904296875, "logps/rejected": -897.13330078125, "loss": 0.1579, "rewards/accuracies": 1.0, "rewards/chosen": 3.283245801925659, "rewards/margins": 2.919224977493286, "rewards/rejected": 0.3640207052230835, "step": 1862 }, { "epoch": 1.361095890410959, "grad_norm": 48.25757661934419, "learning_rate": 4.170533574100008e-07, "logits/chosen": -3.063253402709961, "logits/rejected": -2.504883289337158, "logps/chosen": -721.3966064453125, "logps/rejected": -651.803466796875, "loss": 0.2541, "rewards/accuracies": 1.0, "rewards/chosen": 2.5428807735443115, "rewards/margins": 3.165766716003418, "rewards/rejected": -0.6228859424591064, "step": 1863 }, { "epoch": 1.3618264840182648, "grad_norm": 34.477491029587874, "learning_rate": 4.169346572145671e-07, "logits/chosen": -2.9096121788024902, "logits/rejected": -2.3193254470825195, "logps/chosen": -756.739013671875, "logps/rejected": -572.2615966796875, "loss": 0.1463, "rewards/accuracies": 1.0, "rewards/chosen": 2.754680633544922, "rewards/margins": 3.074597120285034, "rewards/rejected": -0.3199165463447571, "step": 1864 }, { "epoch": 1.3625570776255707, "grad_norm": 15.069531331004718, "learning_rate": 4.1681588906589605e-07, "logits/chosen": -2.773348569869995, "logits/rejected": -2.5750246047973633, "logps/chosen": -583.1776733398438, "logps/rejected": -510.71826171875, "loss": 0.0782, "rewards/accuracies": 1.0, "rewards/chosen": 2.403927803039551, "rewards/margins": 2.913588047027588, "rewards/rejected": -0.509660005569458, "step": 1865 }, { "epoch": 1.3632876712328768, "grad_norm": 38.007707316144284, "learning_rate": 4.1669705301233393e-07, "logits/chosen": -2.7319369316101074, "logits/rejected": -2.3235080242156982, "logps/chosen": -502.06048583984375, "logps/rejected": -483.92547607421875, "loss": 0.1738, "rewards/accuracies": 1.0, "rewards/chosen": 1.1588449478149414, "rewards/margins": 2.08146595954895, "rewards/rejected": -0.9226208925247192, "step": 1866 }, { "epoch": 1.3640182648401826, "grad_norm": 19.610037183789565, "learning_rate": 4.1657814910225477e-07, "logits/chosen": -3.4308221340179443, "logits/rejected": -2.496757984161377, "logps/chosen": -567.2506713867188, "logps/rejected": -422.61419677734375, "loss": 0.1405, "rewards/accuracies": 1.0, "rewards/chosen": 3.4438276290893555, "rewards/margins": 5.565902233123779, "rewards/rejected": -2.122074604034424, "step": 1867 }, { "epoch": 1.3647488584474887, "grad_norm": 27.61654658025748, "learning_rate": 4.1645917738406024e-07, "logits/chosen": -3.4638607501983643, "logits/rejected": -1.9735031127929688, "logps/chosen": -765.5072631835938, "logps/rejected": -424.46502685546875, "loss": 0.149, "rewards/accuracies": 1.0, "rewards/chosen": 2.726743221282959, "rewards/margins": 3.9302749633789062, "rewards/rejected": -1.2035318613052368, "step": 1868 }, { "epoch": 1.3654794520547946, "grad_norm": 43.62195128360333, "learning_rate": 4.163401379061794e-07, "logits/chosen": -2.6846325397491455, "logits/rejected": -2.154712438583374, "logps/chosen": -672.0625, "logps/rejected": -495.9486083984375, "loss": 0.1892, "rewards/accuracies": 0.875, "rewards/chosen": 2.934331178665161, "rewards/margins": 3.7126829624176025, "rewards/rejected": -0.7783517241477966, "step": 1869 }, { "epoch": 1.3662100456621005, "grad_norm": 41.39176191105619, "learning_rate": 4.162210307170693e-07, "logits/chosen": -2.6314592361450195, "logits/rejected": -2.4646658897399902, "logps/chosen": -581.0072631835938, "logps/rejected": -534.5010986328125, "loss": 0.2385, "rewards/accuracies": 0.875, "rewards/chosen": 2.0976130962371826, "rewards/margins": 3.4596645832061768, "rewards/rejected": -1.3620517253875732, "step": 1870 }, { "epoch": 1.3669406392694063, "grad_norm": 39.71922636909454, "learning_rate": 4.1610185586521396e-07, "logits/chosen": -2.5038795471191406, "logits/rejected": -2.6832785606384277, "logps/chosen": -603.5342407226562, "logps/rejected": -697.9798583984375, "loss": 0.1956, "rewards/accuracies": 1.0, "rewards/chosen": 2.8722615242004395, "rewards/margins": 3.86256742477417, "rewards/rejected": -0.99030601978302, "step": 1871 }, { "epoch": 1.3676712328767122, "grad_norm": 23.167703655644882, "learning_rate": 4.159826133991254e-07, "logits/chosen": -2.4041683673858643, "logits/rejected": -1.9047625064849854, "logps/chosen": -498.0954284667969, "logps/rejected": -513.8358154296875, "loss": 0.1661, "rewards/accuracies": 1.0, "rewards/chosen": 2.352691650390625, "rewards/margins": 4.178401470184326, "rewards/rejected": -1.8257100582122803, "step": 1872 }, { "epoch": 1.3684018264840183, "grad_norm": 45.64570645283218, "learning_rate": 4.158633033673432e-07, "logits/chosen": -3.061854362487793, "logits/rejected": -2.316948652267456, "logps/chosen": -704.9261474609375, "logps/rejected": -585.727783203125, "loss": 0.2118, "rewards/accuracies": 1.0, "rewards/chosen": 1.7910715341567993, "rewards/margins": 2.86065411567688, "rewards/rejected": -1.069582462310791, "step": 1873 }, { "epoch": 1.3691324200913242, "grad_norm": 41.81159349493795, "learning_rate": 4.1574392581843414e-07, "logits/chosen": -3.0236358642578125, "logits/rejected": -2.423255443572998, "logps/chosen": -806.61962890625, "logps/rejected": -746.53271484375, "loss": 0.2127, "rewards/accuracies": 1.0, "rewards/chosen": 3.241997241973877, "rewards/margins": 3.8686177730560303, "rewards/rejected": -0.6266206502914429, "step": 1874 }, { "epoch": 1.36986301369863, "grad_norm": 35.55620584952875, "learning_rate": 4.1562448080099264e-07, "logits/chosen": -3.2341606616973877, "logits/rejected": -2.8779122829437256, "logps/chosen": -329.830810546875, "logps/rejected": -355.5600280761719, "loss": 0.257, "rewards/accuracies": 1.0, "rewards/chosen": 2.3607730865478516, "rewards/margins": 3.4382240772247314, "rewards/rejected": -1.0774509906768799, "step": 1875 }, { "epoch": 1.3705936073059362, "grad_norm": 30.534578327411246, "learning_rate": 4.155049683636406e-07, "logits/chosen": -2.903369903564453, "logits/rejected": -1.9581888914108276, "logps/chosen": -815.2701416015625, "logps/rejected": -552.1519775390625, "loss": 0.1557, "rewards/accuracies": 1.0, "rewards/chosen": 3.172677755355835, "rewards/margins": 3.2200558185577393, "rewards/rejected": -0.0473780632019043, "step": 1876 }, { "epoch": 1.371324200913242, "grad_norm": 27.50229371648788, "learning_rate": 4.153853885550273e-07, "logits/chosen": -2.816457748413086, "logits/rejected": -2.339219570159912, "logps/chosen": -768.9918823242188, "logps/rejected": -573.728271484375, "loss": 0.1551, "rewards/accuracies": 1.0, "rewards/chosen": 3.234081983566284, "rewards/margins": 2.9153666496276855, "rewards/rejected": 0.31871548295021057, "step": 1877 }, { "epoch": 1.372054794520548, "grad_norm": 39.59005939291295, "learning_rate": 4.1526574142382955e-07, "logits/chosen": -2.9139842987060547, "logits/rejected": -2.262209892272949, "logps/chosen": -674.0296630859375, "logps/rejected": -449.99725341796875, "loss": 0.2171, "rewards/accuracies": 1.0, "rewards/chosen": 2.893592119216919, "rewards/margins": 2.9435577392578125, "rewards/rejected": -0.049965664744377136, "step": 1878 }, { "epoch": 1.3727853881278538, "grad_norm": 37.10734040475095, "learning_rate": 4.1514602701875145e-07, "logits/chosen": -3.329948902130127, "logits/rejected": -2.713737964630127, "logps/chosen": -761.6334228515625, "logps/rejected": -618.2531127929688, "loss": 0.192, "rewards/accuracies": 1.0, "rewards/chosen": 4.008735656738281, "rewards/margins": 3.461822509765625, "rewards/rejected": 0.5469134449958801, "step": 1879 }, { "epoch": 1.3735159817351599, "grad_norm": 39.983504899677065, "learning_rate": 4.150262453885245e-07, "logits/chosen": -3.1353962421417236, "logits/rejected": -1.8857784271240234, "logps/chosen": -374.9773254394531, "logps/rejected": -217.585205078125, "loss": 0.2323, "rewards/accuracies": 0.875, "rewards/chosen": 1.9646257162094116, "rewards/margins": 3.849841356277466, "rewards/rejected": -1.8852155208587646, "step": 1880 }, { "epoch": 1.3742465753424657, "grad_norm": 29.661529553433667, "learning_rate": 4.149063965819076e-07, "logits/chosen": -2.759286880493164, "logits/rejected": -2.0751070976257324, "logps/chosen": -454.005615234375, "logps/rejected": -360.5793151855469, "loss": 0.1623, "rewards/accuracies": 1.0, "rewards/chosen": 2.739434242248535, "rewards/margins": 3.969315528869629, "rewards/rejected": -1.2298812866210938, "step": 1881 }, { "epoch": 1.3749771689497716, "grad_norm": 32.07729731335992, "learning_rate": 4.1478648064768704e-07, "logits/chosen": -2.734048843383789, "logits/rejected": -1.8977453708648682, "logps/chosen": -647.3148193359375, "logps/rejected": -417.6510009765625, "loss": 0.1449, "rewards/accuracies": 1.0, "rewards/chosen": 2.4386329650878906, "rewards/margins": 4.770754814147949, "rewards/rejected": -2.3321220874786377, "step": 1882 }, { "epoch": 1.3757077625570777, "grad_norm": 42.96052620165909, "learning_rate": 4.1466649763467643e-07, "logits/chosen": -2.926830530166626, "logits/rejected": -2.17586612701416, "logps/chosen": -795.15576171875, "logps/rejected": -666.8501586914062, "loss": 0.2127, "rewards/accuracies": 1.0, "rewards/chosen": 3.097891092300415, "rewards/margins": 4.087231636047363, "rewards/rejected": -0.9893407821655273, "step": 1883 }, { "epoch": 1.3764383561643836, "grad_norm": 26.273732528822276, "learning_rate": 4.145464475917165e-07, "logits/chosen": -2.579611301422119, "logits/rejected": -2.4780001640319824, "logps/chosen": -438.75689697265625, "logps/rejected": -508.61529541015625, "loss": 0.1257, "rewards/accuracies": 0.875, "rewards/chosen": 0.9430896043777466, "rewards/margins": 2.4122743606567383, "rewards/rejected": -1.4691848754882812, "step": 1884 }, { "epoch": 1.3771689497716895, "grad_norm": 34.07319392051929, "learning_rate": 4.144263305676755e-07, "logits/chosen": -3.202786445617676, "logits/rejected": -2.3397035598754883, "logps/chosen": -887.6802978515625, "logps/rejected": -543.8069458007812, "loss": 0.1672, "rewards/accuracies": 1.0, "rewards/chosen": 3.066622018814087, "rewards/margins": 2.8485500812530518, "rewards/rejected": 0.21807220578193665, "step": 1885 }, { "epoch": 1.3778995433789953, "grad_norm": 46.78206933985231, "learning_rate": 4.1430614661144884e-07, "logits/chosen": -2.6911375522613525, "logits/rejected": -1.6977300643920898, "logps/chosen": -738.8693237304688, "logps/rejected": -441.21429443359375, "loss": 0.1938, "rewards/accuracies": 1.0, "rewards/chosen": 3.713606595993042, "rewards/margins": 4.631039619445801, "rewards/rejected": -0.9174330234527588, "step": 1886 }, { "epoch": 1.3786301369863014, "grad_norm": 39.29331494808468, "learning_rate": 4.1418589577195927e-07, "logits/chosen": -2.389280319213867, "logits/rejected": -2.013190984725952, "logps/chosen": -623.6915283203125, "logps/rejected": -691.3297119140625, "loss": 0.2055, "rewards/accuracies": 1.0, "rewards/chosen": 2.9365506172180176, "rewards/margins": 3.996958017349243, "rewards/rejected": -1.060407280921936, "step": 1887 }, { "epoch": 1.3793607305936073, "grad_norm": 48.32966070143287, "learning_rate": 4.1406557809815646e-07, "logits/chosen": -2.667120933532715, "logits/rejected": -1.5598853826522827, "logps/chosen": -545.9498291015625, "logps/rejected": -402.41754150390625, "loss": 0.3682, "rewards/accuracies": 0.875, "rewards/chosen": 3.5819475650787354, "rewards/margins": 5.092731475830078, "rewards/rejected": -1.5107841491699219, "step": 1888 }, { "epoch": 1.3800913242009132, "grad_norm": 32.29575242969267, "learning_rate": 4.1394519363901775e-07, "logits/chosen": -2.6221156120300293, "logits/rejected": -2.1616969108581543, "logps/chosen": -436.1201171875, "logps/rejected": -316.87200927734375, "loss": 0.2258, "rewards/accuracies": 0.75, "rewards/chosen": 1.1608762741088867, "rewards/margins": 2.017578601837158, "rewards/rejected": -0.8567025065422058, "step": 1889 }, { "epoch": 1.3808219178082193, "grad_norm": 27.778740922972464, "learning_rate": 4.1382474244354725e-07, "logits/chosen": -2.852370262145996, "logits/rejected": -1.5451759099960327, "logps/chosen": -752.3734741210938, "logps/rejected": -423.47808837890625, "loss": 0.1338, "rewards/accuracies": 0.875, "rewards/chosen": 2.8218770027160645, "rewards/margins": 3.95493221282959, "rewards/rejected": -1.1330549716949463, "step": 1890 }, { "epoch": 1.3815525114155252, "grad_norm": 24.233011174148505, "learning_rate": 4.137042245607766e-07, "logits/chosen": -2.718384027481079, "logits/rejected": -2.032550096511841, "logps/chosen": -622.3713989257812, "logps/rejected": -488.59906005859375, "loss": 0.0913, "rewards/accuracies": 1.0, "rewards/chosen": 3.4675889015197754, "rewards/margins": 4.396920204162598, "rewards/rejected": -0.9293312430381775, "step": 1891 }, { "epoch": 1.382283105022831, "grad_norm": 17.784270393735582, "learning_rate": 4.135836400397642e-07, "logits/chosen": -3.1032354831695557, "logits/rejected": -2.483335018157959, "logps/chosen": -695.4012451171875, "logps/rejected": -536.4566040039062, "loss": 0.1141, "rewards/accuracies": 1.0, "rewards/chosen": 3.2261650562286377, "rewards/margins": 2.4980273246765137, "rewards/rejected": 0.7281379699707031, "step": 1892 }, { "epoch": 1.383013698630137, "grad_norm": 39.49278637651755, "learning_rate": 4.134629889295959e-07, "logits/chosen": -3.167147159576416, "logits/rejected": -2.5264077186584473, "logps/chosen": -777.0408325195312, "logps/rejected": -569.4949340820312, "loss": 0.1937, "rewards/accuracies": 1.0, "rewards/chosen": 2.8707540035247803, "rewards/margins": 3.603508472442627, "rewards/rejected": -0.7327545285224915, "step": 1893 }, { "epoch": 1.383744292237443, "grad_norm": 41.5220329901224, "learning_rate": 4.133422712793845e-07, "logits/chosen": -3.017634391784668, "logits/rejected": -2.3188891410827637, "logps/chosen": -792.6373291015625, "logps/rejected": -527.5626831054688, "loss": 0.2073, "rewards/accuracies": 0.875, "rewards/chosen": 1.902198314666748, "rewards/margins": 2.024437427520752, "rewards/rejected": -0.12223923206329346, "step": 1894 }, { "epoch": 1.3844748858447489, "grad_norm": 28.074095643340076, "learning_rate": 4.1322148713826975e-07, "logits/chosen": -2.7779552936553955, "logits/rejected": -2.336538553237915, "logps/chosen": -741.4406127929688, "logps/rejected": -632.4717407226562, "loss": 0.1166, "rewards/accuracies": 1.0, "rewards/chosen": 3.927786350250244, "rewards/margins": 2.9258036613464355, "rewards/rejected": 1.0019828081130981, "step": 1895 }, { "epoch": 1.3852054794520547, "grad_norm": 31.542060987324923, "learning_rate": 4.13100636555419e-07, "logits/chosen": -1.9455997943878174, "logits/rejected": -2.181642532348633, "logps/chosen": -384.48590087890625, "logps/rejected": -502.165283203125, "loss": 0.2177, "rewards/accuracies": 1.0, "rewards/chosen": 2.501471996307373, "rewards/margins": 4.126490592956543, "rewards/rejected": -1.62501859664917, "step": 1896 }, { "epoch": 1.3859360730593608, "grad_norm": 19.89830560419577, "learning_rate": 4.1297971958002595e-07, "logits/chosen": -2.9169392585754395, "logits/rejected": -2.210965156555176, "logps/chosen": -460.56219482421875, "logps/rejected": -364.2352294921875, "loss": 0.0883, "rewards/accuracies": 1.0, "rewards/chosen": 2.2853379249572754, "rewards/margins": 3.9458417892456055, "rewards/rejected": -1.6605039834976196, "step": 1897 }, { "epoch": 1.3866666666666667, "grad_norm": 35.43033092717841, "learning_rate": 4.1285873626131186e-07, "logits/chosen": -2.8335628509521484, "logits/rejected": -2.3790507316589355, "logps/chosen": -447.44573974609375, "logps/rejected": -393.3999938964844, "loss": 0.2237, "rewards/accuracies": 0.875, "rewards/chosen": 2.0152344703674316, "rewards/margins": 2.2836694717407227, "rewards/rejected": -0.26843512058258057, "step": 1898 }, { "epoch": 1.3873972602739726, "grad_norm": 26.980590625629613, "learning_rate": 4.1273768664852463e-07, "logits/chosen": -2.929131507873535, "logits/rejected": -1.9331471920013428, "logps/chosen": -679.7300415039062, "logps/rejected": -520.8641967773438, "loss": 0.1248, "rewards/accuracies": 1.0, "rewards/chosen": 5.004229545593262, "rewards/margins": 5.988880634307861, "rewards/rejected": -0.9846507906913757, "step": 1899 }, { "epoch": 1.3881278538812785, "grad_norm": 34.70651224464221, "learning_rate": 4.126165707909394e-07, "logits/chosen": -2.867380142211914, "logits/rejected": -2.1968250274658203, "logps/chosen": -480.5525817871094, "logps/rejected": -321.8202209472656, "loss": 0.1534, "rewards/accuracies": 0.875, "rewards/chosen": 1.2117822170257568, "rewards/margins": 1.7653728723526, "rewards/rejected": -0.5535905957221985, "step": 1900 }, { "epoch": 1.3888584474885846, "grad_norm": 21.368037288860318, "learning_rate": 4.1249538873785815e-07, "logits/chosen": -2.7624988555908203, "logits/rejected": -1.932904839515686, "logps/chosen": -729.9010620117188, "logps/rejected": -455.14593505859375, "loss": 0.0803, "rewards/accuracies": 1.0, "rewards/chosen": 3.514191150665283, "rewards/margins": 3.5546178817749023, "rewards/rejected": -0.04042696952819824, "step": 1901 }, { "epoch": 1.3895890410958904, "grad_norm": 40.84261875585278, "learning_rate": 4.1237414053860995e-07, "logits/chosen": -3.049985647201538, "logits/rejected": -2.5225722789764404, "logps/chosen": -958.5255126953125, "logps/rejected": -766.103271484375, "loss": 0.1598, "rewards/accuracies": 1.0, "rewards/chosen": 4.018584728240967, "rewards/margins": 3.759739398956299, "rewards/rejected": 0.2588452100753784, "step": 1902 }, { "epoch": 1.3903196347031963, "grad_norm": 25.708417483232754, "learning_rate": 4.122528262425505e-07, "logits/chosen": -2.9715161323547363, "logits/rejected": -2.1233701705932617, "logps/chosen": -406.5405578613281, "logps/rejected": -295.9496154785156, "loss": 0.1347, "rewards/accuracies": 1.0, "rewards/chosen": 2.409052848815918, "rewards/margins": 4.536306381225586, "rewards/rejected": -2.127253532409668, "step": 1903 }, { "epoch": 1.3910502283105024, "grad_norm": 18.243674658331926, "learning_rate": 4.1213144589906266e-07, "logits/chosen": -2.5588760375976562, "logits/rejected": -2.484368085861206, "logps/chosen": -514.8984375, "logps/rejected": -572.9358520507812, "loss": 0.1102, "rewards/accuracies": 1.0, "rewards/chosen": 1.5912485122680664, "rewards/margins": 3.2866568565368652, "rewards/rejected": -1.6954084634780884, "step": 1904 }, { "epoch": 1.3917808219178083, "grad_norm": 34.319317472202336, "learning_rate": 4.120099995575562e-07, "logits/chosen": -3.0258679389953613, "logits/rejected": -2.2477834224700928, "logps/chosen": -762.3416748046875, "logps/rejected": -569.37939453125, "loss": 0.1529, "rewards/accuracies": 1.0, "rewards/chosen": 1.7243812084197998, "rewards/margins": 1.7964047193527222, "rewards/rejected": -0.07202353328466415, "step": 1905 }, { "epoch": 1.3925114155251141, "grad_norm": 20.39865667370763, "learning_rate": 4.1188848726746763e-07, "logits/chosen": -2.755080461502075, "logits/rejected": -2.686068058013916, "logps/chosen": -461.36602783203125, "logps/rejected": -548.5745239257812, "loss": 0.1209, "rewards/accuracies": 1.0, "rewards/chosen": 2.1721534729003906, "rewards/margins": 3.4354653358459473, "rewards/rejected": -1.263311743736267, "step": 1906 }, { "epoch": 1.39324200913242, "grad_norm": 29.36539799017023, "learning_rate": 4.117669090782602e-07, "logits/chosen": -3.6706652641296387, "logits/rejected": -2.713444709777832, "logps/chosen": -555.0335693359375, "logps/rejected": -416.551025390625, "loss": 0.2049, "rewards/accuracies": 0.875, "rewards/chosen": 2.009638786315918, "rewards/margins": 3.4235618114471436, "rewards/rejected": -1.4139232635498047, "step": 1907 }, { "epoch": 1.393972602739726, "grad_norm": 48.840500931603884, "learning_rate": 4.116452650394242e-07, "logits/chosen": -3.295572519302368, "logits/rejected": -2.3568739891052246, "logps/chosen": -626.8760986328125, "logps/rejected": -407.82232666015625, "loss": 0.2035, "rewards/accuracies": 1.0, "rewards/chosen": 4.498948097229004, "rewards/margins": 4.528298377990723, "rewards/rejected": -0.029350489377975464, "step": 1908 }, { "epoch": 1.394703196347032, "grad_norm": 26.68988755420768, "learning_rate": 4.115235552004767e-07, "logits/chosen": -2.402442455291748, "logits/rejected": -1.743666648864746, "logps/chosen": -355.293212890625, "logps/rejected": -320.30963134765625, "loss": 0.151, "rewards/accuracies": 1.0, "rewards/chosen": 3.077298164367676, "rewards/margins": 4.635501861572266, "rewards/rejected": -1.5582034587860107, "step": 1909 }, { "epoch": 1.3954337899543379, "grad_norm": 55.12236255382769, "learning_rate": 4.1140177961096146e-07, "logits/chosen": -2.523935317993164, "logits/rejected": -2.5460052490234375, "logps/chosen": -382.52142333984375, "logps/rejected": -463.35821533203125, "loss": 0.35, "rewards/accuracies": 1.0, "rewards/chosen": 1.4178309440612793, "rewards/margins": 2.4760732650756836, "rewards/rejected": -1.0582420825958252, "step": 1910 }, { "epoch": 1.396164383561644, "grad_norm": 31.42407473808149, "learning_rate": 4.1127993832044903e-07, "logits/chosen": -2.896317958831787, "logits/rejected": -2.3081393241882324, "logps/chosen": -711.0062255859375, "logps/rejected": -602.2554931640625, "loss": 0.1412, "rewards/accuracies": 1.0, "rewards/chosen": 3.558133602142334, "rewards/margins": 3.8935651779174805, "rewards/rejected": -0.3354315757751465, "step": 1911 }, { "epoch": 1.3968949771689498, "grad_norm": 32.849662734903, "learning_rate": 4.111580313785368e-07, "logits/chosen": -2.9239964485168457, "logits/rejected": -2.0105435848236084, "logps/chosen": -909.6900634765625, "logps/rejected": -559.3890991210938, "loss": 0.1811, "rewards/accuracies": 1.0, "rewards/chosen": 4.937607765197754, "rewards/margins": 5.946786880493164, "rewards/rejected": -1.0091789960861206, "step": 1912 }, { "epoch": 1.3976255707762557, "grad_norm": 37.881016823032546, "learning_rate": 4.110360588348487e-07, "logits/chosen": -2.656129837036133, "logits/rejected": -2.789425849914551, "logps/chosen": -821.8626708984375, "logps/rejected": -1087.5076904296875, "loss": 0.1857, "rewards/accuracies": 0.875, "rewards/chosen": 4.628342628479004, "rewards/margins": 4.001533508300781, "rewards/rejected": 0.6268091201782227, "step": 1913 }, { "epoch": 1.3983561643835616, "grad_norm": 37.291798329609286, "learning_rate": 4.1091402073903555e-07, "logits/chosen": -2.7158565521240234, "logits/rejected": -2.152665615081787, "logps/chosen": -738.9750366210938, "logps/rejected": -650.3494873046875, "loss": 0.1581, "rewards/accuracies": 1.0, "rewards/chosen": 3.0789830684661865, "rewards/margins": 3.1445095539093018, "rewards/rejected": -0.06552660465240479, "step": 1914 }, { "epoch": 1.3990867579908675, "grad_norm": 18.008751331275548, "learning_rate": 4.107919171407747e-07, "logits/chosen": -2.678757905960083, "logits/rejected": -1.9193933010101318, "logps/chosen": -688.5234985351562, "logps/rejected": -598.3662719726562, "loss": 0.0919, "rewards/accuracies": 1.0, "rewards/chosen": 4.208199501037598, "rewards/margins": 5.131146430969238, "rewards/rejected": -0.9229468703269958, "step": 1915 }, { "epoch": 1.3998173515981736, "grad_norm": 3643.262823349422, "learning_rate": 4.106697480897703e-07, "logits/chosen": -2.0102593898773193, "logits/rejected": -2.284146308898926, "logps/chosen": -574.2493896484375, "logps/rejected": -1059.39013671875, "loss": 3.6065, "rewards/accuracies": 0.75, "rewards/chosen": 2.7353620529174805, "rewards/margins": -10.699642181396484, "rewards/rejected": 13.435003280639648, "step": 1916 }, { "epoch": 1.4005479452054794, "grad_norm": 36.674894022512184, "learning_rate": 4.1054751363575303e-07, "logits/chosen": -2.9721150398254395, "logits/rejected": -2.0923099517822266, "logps/chosen": -622.3681640625, "logps/rejected": -481.7441101074219, "loss": 0.1414, "rewards/accuracies": 1.0, "rewards/chosen": 4.189370155334473, "rewards/margins": 4.729488372802734, "rewards/rejected": -0.5401178598403931, "step": 1917 }, { "epoch": 1.4012785388127855, "grad_norm": 34.071886249768845, "learning_rate": 4.104252138284803e-07, "logits/chosen": -2.5299417972564697, "logits/rejected": -2.030663251876831, "logps/chosen": -412.9339599609375, "logps/rejected": -305.109619140625, "loss": 0.2246, "rewards/accuracies": 0.875, "rewards/chosen": 2.019132614135742, "rewards/margins": 3.664311170578003, "rewards/rejected": -1.645179033279419, "step": 1918 }, { "epoch": 1.4020091324200914, "grad_norm": 38.87043406666885, "learning_rate": 4.1030284871773604e-07, "logits/chosen": -2.680457353591919, "logits/rejected": -2.286497116088867, "logps/chosen": -586.8753662109375, "logps/rejected": -520.5922241210938, "loss": 0.217, "rewards/accuracies": 1.0, "rewards/chosen": 2.2355728149414062, "rewards/margins": 2.6266236305236816, "rewards/rejected": -0.39105066657066345, "step": 1919 }, { "epoch": 1.4027397260273973, "grad_norm": 40.46283288063952, "learning_rate": 4.1018041835333076e-07, "logits/chosen": -2.49139142036438, "logits/rejected": -2.0390424728393555, "logps/chosen": -388.881591796875, "logps/rejected": -340.17181396484375, "loss": 0.2965, "rewards/accuracies": 0.875, "rewards/chosen": 1.4422324895858765, "rewards/margins": 3.477146625518799, "rewards/rejected": -2.0349137783050537, "step": 1920 }, { "epoch": 1.4034703196347031, "grad_norm": 32.778252216361004, "learning_rate": 4.1005792278510164e-07, "logits/chosen": -2.6817078590393066, "logits/rejected": -2.3293466567993164, "logps/chosen": -447.4471740722656, "logps/rejected": -426.3336181640625, "loss": 0.1855, "rewards/accuracies": 1.0, "rewards/chosen": 3.5003130435943604, "rewards/margins": 5.48728609085083, "rewards/rejected": -1.9869732856750488, "step": 1921 }, { "epoch": 1.404200913242009, "grad_norm": 36.14663333648792, "learning_rate": 4.0993536206291225e-07, "logits/chosen": -2.4967942237854004, "logits/rejected": -1.9148062467575073, "logps/chosen": -911.0606689453125, "logps/rejected": -611.9662475585938, "loss": 0.2232, "rewards/accuracies": 1.0, "rewards/chosen": 4.408260345458984, "rewards/margins": 3.6086478233337402, "rewards/rejected": 0.799612820148468, "step": 1922 }, { "epoch": 1.4049315068493151, "grad_norm": 42.970842278605765, "learning_rate": 4.098127362366528e-07, "logits/chosen": -2.944599151611328, "logits/rejected": -2.398496150970459, "logps/chosen": -900.5916748046875, "logps/rejected": -854.8463134765625, "loss": 0.2466, "rewards/accuracies": 1.0, "rewards/chosen": 3.8532183170318604, "rewards/margins": 3.865509510040283, "rewards/rejected": -0.012291178107261658, "step": 1923 }, { "epoch": 1.405662100456621, "grad_norm": 24.48678917780527, "learning_rate": 4.096900453562399e-07, "logits/chosen": -2.485703706741333, "logits/rejected": -2.2733333110809326, "logps/chosen": -516.3048706054688, "logps/rejected": -477.19293212890625, "loss": 0.14, "rewards/accuracies": 1.0, "rewards/chosen": 4.046845436096191, "rewards/margins": 5.805071830749512, "rewards/rejected": -1.7582261562347412, "step": 1924 }, { "epoch": 1.4063926940639269, "grad_norm": 21.997952452657383, "learning_rate": 4.0956728947161677e-07, "logits/chosen": -2.7867226600646973, "logits/rejected": -1.96846342086792, "logps/chosen": -409.1112976074219, "logps/rejected": -318.0888671875, "loss": 0.0997, "rewards/accuracies": 1.0, "rewards/chosen": 2.018392562866211, "rewards/margins": 3.7043685913085938, "rewards/rejected": -1.6859757900238037, "step": 1925 }, { "epoch": 1.407123287671233, "grad_norm": 45.07052849770568, "learning_rate": 4.09444468632753e-07, "logits/chosen": -2.460293769836426, "logits/rejected": -2.390324354171753, "logps/chosen": -495.96905517578125, "logps/rejected": -485.5946350097656, "loss": 0.285, "rewards/accuracies": 0.75, "rewards/chosen": 1.8346834182739258, "rewards/margins": 2.6068849563598633, "rewards/rejected": -0.7722017765045166, "step": 1926 }, { "epoch": 1.4078538812785388, "grad_norm": 48.80358633402365, "learning_rate": 4.0932158288964456e-07, "logits/chosen": -2.870208263397217, "logits/rejected": -3.0697946548461914, "logps/chosen": -552.9407958984375, "logps/rejected": -629.1555786132812, "loss": 0.2464, "rewards/accuracies": 0.75, "rewards/chosen": 2.0854530334472656, "rewards/margins": 2.7555270195007324, "rewards/rejected": -0.670073926448822, "step": 1927 }, { "epoch": 1.4085844748858447, "grad_norm": 33.961438021098836, "learning_rate": 4.091986322923141e-07, "logits/chosen": -2.549323558807373, "logits/rejected": -2.615154504776001, "logps/chosen": -666.4890747070312, "logps/rejected": -787.5010986328125, "loss": 0.206, "rewards/accuracies": 1.0, "rewards/chosen": 2.217503547668457, "rewards/margins": 1.5159285068511963, "rewards/rejected": 0.7015750408172607, "step": 1928 }, { "epoch": 1.4093150684931506, "grad_norm": 28.69298758618722, "learning_rate": 4.090756168908104e-07, "logits/chosen": -2.029327392578125, "logits/rejected": -2.3804454803466797, "logps/chosen": -384.27642822265625, "logps/rejected": -553.0016479492188, "loss": 0.1731, "rewards/accuracies": 0.875, "rewards/chosen": 2.0434892177581787, "rewards/margins": 2.422710418701172, "rewards/rejected": -0.379221111536026, "step": 1929 }, { "epoch": 1.4100456621004567, "grad_norm": 31.92844401328344, "learning_rate": 4.0895253673520856e-07, "logits/chosen": -2.7204771041870117, "logits/rejected": -1.76722252368927, "logps/chosen": -464.974853515625, "logps/rejected": -351.67071533203125, "loss": 0.2048, "rewards/accuracies": 1.0, "rewards/chosen": 2.7210612297058105, "rewards/margins": 4.31925106048584, "rewards/rejected": -1.5981900691986084, "step": 1930 }, { "epoch": 1.4107762557077625, "grad_norm": 31.554628504834675, "learning_rate": 4.0882939187561047e-07, "logits/chosen": -2.89357852935791, "logits/rejected": -2.3370182514190674, "logps/chosen": -593.080322265625, "logps/rejected": -443.5544128417969, "loss": 0.1849, "rewards/accuracies": 1.0, "rewards/chosen": 2.0670299530029297, "rewards/margins": 2.0370190143585205, "rewards/rejected": 0.030010923743247986, "step": 1931 }, { "epoch": 1.4115068493150684, "grad_norm": 26.452328971336577, "learning_rate": 4.0870618236214383e-07, "logits/chosen": -2.925231456756592, "logits/rejected": -2.7121570110321045, "logps/chosen": -857.8592529296875, "logps/rejected": -688.610595703125, "loss": 0.1351, "rewards/accuracies": 0.875, "rewards/chosen": 2.491924285888672, "rewards/margins": 2.183610439300537, "rewards/rejected": 0.3083137571811676, "step": 1932 }, { "epoch": 1.4122374429223745, "grad_norm": 26.817233292508824, "learning_rate": 4.085829082449631e-07, "logits/chosen": -2.7102699279785156, "logits/rejected": -2.089898109436035, "logps/chosen": -801.5009765625, "logps/rejected": -538.7581176757812, "loss": 0.1775, "rewards/accuracies": 0.875, "rewards/chosen": 3.6155545711517334, "rewards/margins": 2.8376779556274414, "rewards/rejected": 0.7778767943382263, "step": 1933 }, { "epoch": 1.4129680365296804, "grad_norm": 46.10525809730112, "learning_rate": 4.0845956957424865e-07, "logits/chosen": -3.048717975616455, "logits/rejected": -1.734205722808838, "logps/chosen": -443.5471496582031, "logps/rejected": -287.6368408203125, "loss": 0.194, "rewards/accuracies": 1.0, "rewards/chosen": 3.2643396854400635, "rewards/margins": 5.180629730224609, "rewards/rejected": -1.9162904024124146, "step": 1934 }, { "epoch": 1.4136986301369863, "grad_norm": 23.1240445326261, "learning_rate": 4.083361664002075e-07, "logits/chosen": -2.788217544555664, "logits/rejected": -2.4925947189331055, "logps/chosen": -359.669189453125, "logps/rejected": -350.76318359375, "loss": 0.1204, "rewards/accuracies": 1.0, "rewards/chosen": 2.1576719284057617, "rewards/margins": 5.058759689331055, "rewards/rejected": -2.901087760925293, "step": 1935 }, { "epoch": 1.4144292237442921, "grad_norm": 45.60067577195951, "learning_rate": 4.0821269877307264e-07, "logits/chosen": -3.2945871353149414, "logits/rejected": -2.9046630859375, "logps/chosen": -1145.7349853515625, "logps/rejected": -884.952392578125, "loss": 0.2197, "rewards/accuracies": 1.0, "rewards/chosen": 4.254508972167969, "rewards/margins": 3.1385695934295654, "rewards/rejected": 1.1159394979476929, "step": 1936 }, { "epoch": 1.4151598173515982, "grad_norm": 41.284249808208486, "learning_rate": 4.080891667431035e-07, "logits/chosen": -2.9417238235473633, "logits/rejected": -2.2690253257751465, "logps/chosen": -885.2344970703125, "logps/rejected": -596.7174072265625, "loss": 0.1655, "rewards/accuracies": 0.75, "rewards/chosen": 2.9993228912353516, "rewards/margins": 3.205465793609619, "rewards/rejected": -0.20614314079284668, "step": 1937 }, { "epoch": 1.415890410958904, "grad_norm": 22.40313213793651, "learning_rate": 4.079655703605854e-07, "logits/chosen": -3.34120512008667, "logits/rejected": -2.1912384033203125, "logps/chosen": -901.9937744140625, "logps/rejected": -594.6114501953125, "loss": 0.111, "rewards/accuracies": 1.0, "rewards/chosen": 4.042596817016602, "rewards/margins": 3.397599458694458, "rewards/rejected": 0.6449970602989197, "step": 1938 }, { "epoch": 1.41662100456621, "grad_norm": 29.49077604238368, "learning_rate": 4.0784190967583046e-07, "logits/chosen": -2.877326250076294, "logits/rejected": -2.1921534538269043, "logps/chosen": -760.7653198242188, "logps/rejected": -720.607421875, "loss": 0.1772, "rewards/accuracies": 1.0, "rewards/chosen": 3.1313672065734863, "rewards/margins": 4.583432197570801, "rewards/rejected": -1.4520654678344727, "step": 1939 }, { "epoch": 1.417351598173516, "grad_norm": 29.232509502545387, "learning_rate": 4.077181847391763e-07, "logits/chosen": -2.430189609527588, "logits/rejected": -2.7976460456848145, "logps/chosen": -500.29962158203125, "logps/rejected": -557.5121459960938, "loss": 0.1406, "rewards/accuracies": 1.0, "rewards/chosen": 1.277940034866333, "rewards/margins": 2.183539390563965, "rewards/rejected": -0.9055991768836975, "step": 1940 }, { "epoch": 1.418082191780822, "grad_norm": 25.836961585766524, "learning_rate": 4.0759439560098715e-07, "logits/chosen": -2.9643654823303223, "logits/rejected": -1.7342454195022583, "logps/chosen": -757.9157104492188, "logps/rejected": -394.2328186035156, "loss": 0.1398, "rewards/accuracies": 1.0, "rewards/chosen": 4.307642936706543, "rewards/margins": 5.258055686950684, "rewards/rejected": -0.9504125118255615, "step": 1941 }, { "epoch": 1.4188127853881278, "grad_norm": 40.757411914602265, "learning_rate": 4.074705423116531e-07, "logits/chosen": -2.3659911155700684, "logits/rejected": -1.8803867101669312, "logps/chosen": -466.0182189941406, "logps/rejected": -453.4617919921875, "loss": 0.189, "rewards/accuracies": 1.0, "rewards/chosen": 3.1755757331848145, "rewards/margins": 5.033355712890625, "rewards/rejected": -1.8577799797058105, "step": 1942 }, { "epoch": 1.4195433789954337, "grad_norm": 32.86226323195897, "learning_rate": 4.0734662492159063e-07, "logits/chosen": -3.005950450897217, "logits/rejected": -1.8051724433898926, "logps/chosen": -677.07958984375, "logps/rejected": -524.016357421875, "loss": 0.1609, "rewards/accuracies": 1.0, "rewards/chosen": 3.6547446250915527, "rewards/margins": 4.106103897094727, "rewards/rejected": -0.45135971903800964, "step": 1943 }, { "epoch": 1.4202739726027398, "grad_norm": 23.31006763031132, "learning_rate": 4.07222643481242e-07, "logits/chosen": -2.9786791801452637, "logits/rejected": -1.603939414024353, "logps/chosen": -803.3727416992188, "logps/rejected": -354.69219970703125, "loss": 0.083, "rewards/accuracies": 1.0, "rewards/chosen": 3.691981792449951, "rewards/margins": 5.066527843475342, "rewards/rejected": -1.3745458126068115, "step": 1944 }, { "epoch": 1.4210045662100457, "grad_norm": 28.153617168455792, "learning_rate": 4.0709859804107584e-07, "logits/chosen": -2.1018612384796143, "logits/rejected": -2.854060649871826, "logps/chosen": -394.79632568359375, "logps/rejected": -493.9763488769531, "loss": 0.1594, "rewards/accuracies": 1.0, "rewards/chosen": 1.9244112968444824, "rewards/margins": 2.794602870941162, "rewards/rejected": -0.8701915740966797, "step": 1945 }, { "epoch": 1.4217351598173515, "grad_norm": 26.668904528495354, "learning_rate": 4.0697448865158663e-07, "logits/chosen": -2.9985241889953613, "logits/rejected": -2.1193811893463135, "logps/chosen": -756.02001953125, "logps/rejected": -668.5516357421875, "loss": 0.1561, "rewards/accuracies": 1.0, "rewards/chosen": 4.6885175704956055, "rewards/margins": 4.1663689613342285, "rewards/rejected": 0.5221484303474426, "step": 1946 }, { "epoch": 1.4224657534246576, "grad_norm": 42.04431430329641, "learning_rate": 4.068503153632949e-07, "logits/chosen": -3.001981735229492, "logits/rejected": -2.282785177230835, "logps/chosen": -722.3651123046875, "logps/rejected": -456.349609375, "loss": 0.2559, "rewards/accuracies": 0.75, "rewards/chosen": 2.3974947929382324, "rewards/margins": 2.9836597442626953, "rewards/rejected": -0.5861648321151733, "step": 1947 }, { "epoch": 1.4231963470319635, "grad_norm": 29.03135091478693, "learning_rate": 4.0672607822674734e-07, "logits/chosen": -2.3169431686401367, "logits/rejected": -1.807892084121704, "logps/chosen": -473.68768310546875, "logps/rejected": -333.4276123046875, "loss": 0.1621, "rewards/accuracies": 1.0, "rewards/chosen": 2.7430927753448486, "rewards/margins": 3.6349873542785645, "rewards/rejected": -0.8918945789337158, "step": 1948 }, { "epoch": 1.4239269406392694, "grad_norm": 37.27744178475268, "learning_rate": 4.0660177729251636e-07, "logits/chosen": -2.3790979385375977, "logits/rejected": -1.7778346538543701, "logps/chosen": -483.9113464355469, "logps/rejected": -367.56512451171875, "loss": 0.2111, "rewards/accuracies": 0.875, "rewards/chosen": 2.0176949501037598, "rewards/margins": 3.627298355102539, "rewards/rejected": -1.6096036434173584, "step": 1949 }, { "epoch": 1.4246575342465753, "grad_norm": 35.75471006723712, "learning_rate": 4.064774126112007e-07, "logits/chosen": -2.962789297103882, "logits/rejected": -2.292280673980713, "logps/chosen": -628.74267578125, "logps/rejected": -445.3310546875, "loss": 0.2245, "rewards/accuracies": 0.875, "rewards/chosen": 3.9273271560668945, "rewards/margins": 3.8902292251586914, "rewards/rejected": 0.037098243832588196, "step": 1950 }, { "epoch": 1.4253881278538814, "grad_norm": 38.33428723903707, "learning_rate": 4.063529842334247e-07, "logits/chosen": -3.0230860710144043, "logits/rejected": -2.231135129928589, "logps/chosen": -838.7352294921875, "logps/rejected": -625.0888061523438, "loss": 0.1678, "rewards/accuracies": 1.0, "rewards/chosen": 3.9467549324035645, "rewards/margins": 3.544198513031006, "rewards/rejected": 0.4025562107563019, "step": 1951 }, { "epoch": 1.4261187214611872, "grad_norm": 48.708270401121304, "learning_rate": 4.0622849220983895e-07, "logits/chosen": -3.0813870429992676, "logits/rejected": -2.5463755130767822, "logps/chosen": -922.45654296875, "logps/rejected": -591.4156494140625, "loss": 0.176, "rewards/accuracies": 0.875, "rewards/chosen": 2.550107479095459, "rewards/margins": 2.298755168914795, "rewards/rejected": 0.25135231018066406, "step": 1952 }, { "epoch": 1.426849315068493, "grad_norm": 22.737604474230846, "learning_rate": 4.061039365911196e-07, "logits/chosen": -2.713766574859619, "logits/rejected": -1.7493176460266113, "logps/chosen": -390.4486999511719, "logps/rejected": -227.69668579101562, "loss": 0.1497, "rewards/accuracies": 1.0, "rewards/chosen": 2.805166244506836, "rewards/margins": 3.5015385150909424, "rewards/rejected": -0.6963719725608826, "step": 1953 }, { "epoch": 1.4275799086757992, "grad_norm": 41.16741703806467, "learning_rate": 4.05979317427969e-07, "logits/chosen": -2.5843052864074707, "logits/rejected": -1.660097599029541, "logps/chosen": -688.880859375, "logps/rejected": -569.3901977539062, "loss": 0.1904, "rewards/accuracies": 1.0, "rewards/chosen": 2.340071678161621, "rewards/margins": 3.614081382751465, "rewards/rejected": -1.2740097045898438, "step": 1954 }, { "epoch": 1.428310502283105, "grad_norm": 32.33686143431169, "learning_rate": 4.0585463477111516e-07, "logits/chosen": -2.9849724769592285, "logits/rejected": -2.5668535232543945, "logps/chosen": -438.80755615234375, "logps/rejected": -351.7822265625, "loss": 0.2297, "rewards/accuracies": 0.875, "rewards/chosen": 3.2383697032928467, "rewards/margins": 4.6601080894470215, "rewards/rejected": -1.4217383861541748, "step": 1955 }, { "epoch": 1.429041095890411, "grad_norm": 28.302758568346743, "learning_rate": 4.05729888671312e-07, "logits/chosen": -2.9909934997558594, "logits/rejected": -2.2370667457580566, "logps/chosen": -801.3383178710938, "logps/rejected": -567.1781616210938, "loss": 0.2049, "rewards/accuracies": 1.0, "rewards/chosen": 4.418990135192871, "rewards/margins": 4.976740837097168, "rewards/rejected": -0.5577507019042969, "step": 1956 }, { "epoch": 1.4297716894977168, "grad_norm": 52.329692608648344, "learning_rate": 4.056050791793394e-07, "logits/chosen": -2.9676692485809326, "logits/rejected": -2.2353665828704834, "logps/chosen": -756.1669311523438, "logps/rejected": -603.62841796875, "loss": 0.3231, "rewards/accuracies": 0.875, "rewards/chosen": 3.209105968475342, "rewards/margins": 3.4085824489593506, "rewards/rejected": -0.1994766741991043, "step": 1957 }, { "epoch": 1.4305022831050227, "grad_norm": 39.11532365866189, "learning_rate": 4.0548020634600275e-07, "logits/chosen": -2.6828503608703613, "logits/rejected": -2.077178716659546, "logps/chosen": -800.5660400390625, "logps/rejected": -655.651123046875, "loss": 0.1899, "rewards/accuracies": 1.0, "rewards/chosen": 4.289173603057861, "rewards/margins": 4.9719696044921875, "rewards/rejected": -0.6827962398529053, "step": 1958 }, { "epoch": 1.4312328767123288, "grad_norm": 39.17550996547062, "learning_rate": 4.0535527022213356e-07, "logits/chosen": -3.047339677810669, "logits/rejected": -2.188680648803711, "logps/chosen": -818.74609375, "logps/rejected": -651.3204345703125, "loss": 0.1753, "rewards/accuracies": 1.0, "rewards/chosen": 3.396742343902588, "rewards/margins": 3.286470651626587, "rewards/rejected": 0.11027181148529053, "step": 1959 }, { "epoch": 1.4319634703196347, "grad_norm": 25.617792669735056, "learning_rate": 4.052302708585888e-07, "logits/chosen": -3.3102169036865234, "logits/rejected": -2.7181994915008545, "logps/chosen": -937.3836669921875, "logps/rejected": -669.3475341796875, "loss": 0.1101, "rewards/accuracies": 1.0, "rewards/chosen": 3.764704704284668, "rewards/margins": 3.988955497741699, "rewards/rejected": -0.2242508828639984, "step": 1960 }, { "epoch": 1.4326940639269408, "grad_norm": 24.04099713940816, "learning_rate": 4.0510520830625137e-07, "logits/chosen": -2.96118426322937, "logits/rejected": -1.9127583503723145, "logps/chosen": -614.297119140625, "logps/rejected": -390.221923828125, "loss": 0.1295, "rewards/accuracies": 1.0, "rewards/chosen": 2.1893208026885986, "rewards/margins": 3.9171488285064697, "rewards/rejected": -1.7278279066085815, "step": 1961 }, { "epoch": 1.4334246575342466, "grad_norm": 22.939284086355688, "learning_rate": 4.049800826160299e-07, "logits/chosen": -2.690645933151245, "logits/rejected": -2.5202627182006836, "logps/chosen": -783.6119995117188, "logps/rejected": -655.1409912109375, "loss": 0.1305, "rewards/accuracies": 1.0, "rewards/chosen": 3.4060513973236084, "rewards/margins": 4.167673110961914, "rewards/rejected": -0.7616220712661743, "step": 1962 }, { "epoch": 1.4341552511415525, "grad_norm": 51.3012605686985, "learning_rate": 4.0485489383885865e-07, "logits/chosen": -2.6242144107818604, "logits/rejected": -2.402261257171631, "logps/chosen": -611.5343627929688, "logps/rejected": -460.53448486328125, "loss": 0.2584, "rewards/accuracies": 0.875, "rewards/chosen": 3.553070545196533, "rewards/margins": 4.635909557342529, "rewards/rejected": -1.0828388929367065, "step": 1963 }, { "epoch": 1.4348858447488584, "grad_norm": 50.12182424478383, "learning_rate": 4.0472964202569747e-07, "logits/chosen": -2.3626949787139893, "logits/rejected": -2.067142963409424, "logps/chosen": -376.29364013671875, "logps/rejected": -319.93170166015625, "loss": 0.2884, "rewards/accuracies": 0.875, "rewards/chosen": 1.468532681465149, "rewards/margins": 2.2804129123687744, "rewards/rejected": -0.8118804693222046, "step": 1964 }, { "epoch": 1.4356164383561643, "grad_norm": 62.2508216405364, "learning_rate": 4.0460432722753214e-07, "logits/chosen": -2.659209728240967, "logits/rejected": -2.5467987060546875, "logps/chosen": -586.902587890625, "logps/rejected": -619.7962646484375, "loss": 0.3089, "rewards/accuracies": 0.875, "rewards/chosen": 1.8641383647918701, "rewards/margins": 2.2386674880981445, "rewards/rejected": -0.374529093503952, "step": 1965 }, { "epoch": 1.4363470319634704, "grad_norm": 25.06008770147201, "learning_rate": 4.0447894949537375e-07, "logits/chosen": -2.808556318283081, "logits/rejected": -2.4829883575439453, "logps/chosen": -666.0682373046875, "logps/rejected": -709.7405395507812, "loss": 0.1242, "rewards/accuracies": 1.0, "rewards/chosen": 3.443330764770508, "rewards/margins": 4.822451591491699, "rewards/rejected": -1.379120945930481, "step": 1966 }, { "epoch": 1.4370776255707762, "grad_norm": 18.75787106925467, "learning_rate": 4.0435350888025925e-07, "logits/chosen": -2.97453236579895, "logits/rejected": -1.2827929258346558, "logps/chosen": -952.9946899414062, "logps/rejected": -370.09210205078125, "loss": 0.0899, "rewards/accuracies": 1.0, "rewards/chosen": 4.078826904296875, "rewards/margins": 4.766506195068359, "rewards/rejected": -0.6876789331436157, "step": 1967 }, { "epoch": 1.4378082191780823, "grad_norm": 52.49963262520272, "learning_rate": 4.0422800543325114e-07, "logits/chosen": -3.1466774940490723, "logits/rejected": -2.2873101234436035, "logps/chosen": -656.0782470703125, "logps/rejected": -487.3040771484375, "loss": 0.3091, "rewards/accuracies": 0.875, "rewards/chosen": 2.3065226078033447, "rewards/margins": 2.3884153366088867, "rewards/rejected": -0.0818926990032196, "step": 1968 }, { "epoch": 1.4385388127853882, "grad_norm": 37.2484024012482, "learning_rate": 4.041024392054374e-07, "logits/chosen": -2.9801058769226074, "logits/rejected": -2.4626266956329346, "logps/chosen": -941.920654296875, "logps/rejected": -735.5092163085938, "loss": 0.1211, "rewards/accuracies": 1.0, "rewards/chosen": 4.1328043937683105, "rewards/margins": 4.385514259338379, "rewards/rejected": -0.25270992517471313, "step": 1969 }, { "epoch": 1.439269406392694, "grad_norm": 34.8549204347912, "learning_rate": 4.0397681024793175e-07, "logits/chosen": -2.8029556274414062, "logits/rejected": -2.651606798171997, "logps/chosen": -784.1436767578125, "logps/rejected": -753.5350952148438, "loss": 0.1577, "rewards/accuracies": 1.0, "rewards/chosen": 3.695981025695801, "rewards/margins": 3.9502620697021484, "rewards/rejected": -0.25428086519241333, "step": 1970 }, { "epoch": 1.44, "grad_norm": 26.24160487035568, "learning_rate": 4.0385111861187313e-07, "logits/chosen": -2.370553970336914, "logits/rejected": -2.0400948524475098, "logps/chosen": -670.901611328125, "logps/rejected": -554.4205932617188, "loss": 0.1056, "rewards/accuracies": 0.875, "rewards/chosen": 3.3429653644561768, "rewards/margins": 3.683757781982422, "rewards/rejected": -0.34079256653785706, "step": 1971 }, { "epoch": 1.4407305936073058, "grad_norm": 28.79421913620966, "learning_rate": 4.037253643484264e-07, "logits/chosen": -2.840019941329956, "logits/rejected": -1.7240769863128662, "logps/chosen": -792.1919555664062, "logps/rejected": -447.0564880371094, "loss": 0.1447, "rewards/accuracies": 1.0, "rewards/chosen": 3.383056640625, "rewards/margins": 4.127557277679443, "rewards/rejected": -0.7445007562637329, "step": 1972 }, { "epoch": 1.441461187214612, "grad_norm": 26.709606439853715, "learning_rate": 4.0359954750878155e-07, "logits/chosen": -2.5152437686920166, "logits/rejected": -1.812014102935791, "logps/chosen": -681.1270141601562, "logps/rejected": -508.67156982421875, "loss": 0.0865, "rewards/accuracies": 1.0, "rewards/chosen": 3.4884138107299805, "rewards/margins": 6.313063621520996, "rewards/rejected": -2.824650287628174, "step": 1973 }, { "epoch": 1.4421917808219178, "grad_norm": 17.938434855593155, "learning_rate": 4.0347366814415433e-07, "logits/chosen": -2.672985076904297, "logits/rejected": -2.0373806953430176, "logps/chosen": -488.7737731933594, "logps/rejected": -490.52606201171875, "loss": 0.1145, "rewards/accuracies": 0.875, "rewards/chosen": 2.7783827781677246, "rewards/margins": 3.651566743850708, "rewards/rejected": -0.8731839060783386, "step": 1974 }, { "epoch": 1.4429223744292237, "grad_norm": 38.2997866472721, "learning_rate": 4.0334772630578565e-07, "logits/chosen": -2.5316081047058105, "logits/rejected": -2.192113161087036, "logps/chosen": -778.1560668945312, "logps/rejected": -841.227294921875, "loss": 0.2248, "rewards/accuracies": 1.0, "rewards/chosen": 3.5367281436920166, "rewards/margins": 3.818617343902588, "rewards/rejected": -0.2818893790245056, "step": 1975 }, { "epoch": 1.4436529680365298, "grad_norm": 32.78898524978987, "learning_rate": 4.032217220449422e-07, "logits/chosen": -2.801476001739502, "logits/rejected": -2.7152822017669678, "logps/chosen": -559.8713989257812, "logps/rejected": -611.1491088867188, "loss": 0.1657, "rewards/accuracies": 1.0, "rewards/chosen": 2.125714063644409, "rewards/margins": 2.3531877994537354, "rewards/rejected": -0.2274736762046814, "step": 1976 }, { "epoch": 1.4443835616438356, "grad_norm": 48.7980207198189, "learning_rate": 4.0309565541291566e-07, "logits/chosen": -2.9699015617370605, "logits/rejected": -2.817857027053833, "logps/chosen": -887.3269653320312, "logps/rejected": -927.4454345703125, "loss": 0.2581, "rewards/accuracies": 0.875, "rewards/chosen": 2.1320297718048096, "rewards/margins": 2.149717092514038, "rewards/rejected": -0.01768721640110016, "step": 1977 }, { "epoch": 1.4451141552511415, "grad_norm": 38.9452255202138, "learning_rate": 4.0296952646102356e-07, "logits/chosen": -2.792243242263794, "logits/rejected": -2.014737129211426, "logps/chosen": -731.6087646484375, "logps/rejected": -612.8687744140625, "loss": 0.1604, "rewards/accuracies": 1.0, "rewards/chosen": 3.395749092102051, "rewards/margins": 5.008784770965576, "rewards/rejected": -1.6130354404449463, "step": 1978 }, { "epoch": 1.4458447488584474, "grad_norm": 19.19463839373918, "learning_rate": 4.0284333524060844e-07, "logits/chosen": -2.507267951965332, "logits/rejected": -2.217437982559204, "logps/chosen": -553.4274291992188, "logps/rejected": -400.9840393066406, "loss": 0.1404, "rewards/accuracies": 1.0, "rewards/chosen": 2.7486469745635986, "rewards/margins": 3.1670727729797363, "rewards/rejected": -0.4184260368347168, "step": 1979 }, { "epoch": 1.4465753424657535, "grad_norm": 39.428719245838906, "learning_rate": 4.027170818030383e-07, "logits/chosen": -3.2440240383148193, "logits/rejected": -2.5333163738250732, "logps/chosen": -465.1361389160156, "logps/rejected": -346.72991943359375, "loss": 0.191, "rewards/accuracies": 0.875, "rewards/chosen": 3.157027006149292, "rewards/margins": 5.085226535797119, "rewards/rejected": -1.9282000064849854, "step": 1980 }, { "epoch": 1.4473059360730594, "grad_norm": 24.958021396562067, "learning_rate": 4.0259076619970663e-07, "logits/chosen": -2.805746078491211, "logits/rejected": -1.6299842596054077, "logps/chosen": -562.2324829101562, "logps/rejected": -452.82720947265625, "loss": 0.0936, "rewards/accuracies": 1.0, "rewards/chosen": 4.438881874084473, "rewards/margins": 6.04442024230957, "rewards/rejected": -1.6055378913879395, "step": 1981 }, { "epoch": 1.4480365296803652, "grad_norm": 27.093327608082884, "learning_rate": 4.024643884820319e-07, "logits/chosen": -2.49710750579834, "logits/rejected": -2.1931304931640625, "logps/chosen": -415.7704162597656, "logps/rejected": -404.73480224609375, "loss": 0.1834, "rewards/accuracies": 1.0, "rewards/chosen": 2.4486825466156006, "rewards/margins": 4.7624311447143555, "rewards/rejected": -2.3137481212615967, "step": 1982 }, { "epoch": 1.4487671232876713, "grad_norm": 39.130461192836236, "learning_rate": 4.023379487014581e-07, "logits/chosen": -3.284273624420166, "logits/rejected": -2.86750864982605, "logps/chosen": -715.2356567382812, "logps/rejected": -634.9166870117188, "loss": 0.2077, "rewards/accuracies": 1.0, "rewards/chosen": 2.9772629737854004, "rewards/margins": 3.8581206798553467, "rewards/rejected": -0.880857527256012, "step": 1983 }, { "epoch": 1.4494977168949772, "grad_norm": 38.37598450233476, "learning_rate": 4.022114469094544e-07, "logits/chosen": -2.931859254837036, "logits/rejected": -2.445181369781494, "logps/chosen": -381.50360107421875, "logps/rejected": -346.62884521484375, "loss": 0.1856, "rewards/accuracies": 1.0, "rewards/chosen": 3.466449737548828, "rewards/margins": 5.045340538024902, "rewards/rejected": -1.5788904428482056, "step": 1984 }, { "epoch": 1.450228310502283, "grad_norm": 48.93825829552989, "learning_rate": 4.020848831575153e-07, "logits/chosen": -2.732842206954956, "logits/rejected": -1.6334216594696045, "logps/chosen": -916.3790893554688, "logps/rejected": -507.02490234375, "loss": 0.2166, "rewards/accuracies": 0.75, "rewards/chosen": 3.71928071975708, "rewards/margins": 4.229614734649658, "rewards/rejected": -0.5103340148925781, "step": 1985 }, { "epoch": 1.450958904109589, "grad_norm": 35.525955382781405, "learning_rate": 4.0195825749716044e-07, "logits/chosen": -3.1510913372039795, "logits/rejected": -1.5970652103424072, "logps/chosen": -757.3736572265625, "logps/rejected": -374.364013671875, "loss": 0.1767, "rewards/accuracies": 1.0, "rewards/chosen": 4.269710540771484, "rewards/margins": 4.927927494049072, "rewards/rejected": -0.6582164764404297, "step": 1986 }, { "epoch": 1.451689497716895, "grad_norm": 36.050095353549345, "learning_rate": 4.018315699799347e-07, "logits/chosen": -2.4336745738983154, "logits/rejected": -2.468148708343506, "logps/chosen": -655.75048828125, "logps/rejected": -470.6011047363281, "loss": 0.1336, "rewards/accuracies": 1.0, "rewards/chosen": 2.29026460647583, "rewards/margins": 3.716808319091797, "rewards/rejected": -1.426543951034546, "step": 1987 }, { "epoch": 1.452420091324201, "grad_norm": 39.39749636368487, "learning_rate": 4.0170482065740807e-07, "logits/chosen": -3.214698314666748, "logits/rejected": -2.6593401432037354, "logps/chosen": -736.7396850585938, "logps/rejected": -587.6674194335938, "loss": 0.1773, "rewards/accuracies": 0.875, "rewards/chosen": 3.052882671356201, "rewards/margins": 2.6601624488830566, "rewards/rejected": 0.39272019267082214, "step": 1988 }, { "epoch": 1.4531506849315068, "grad_norm": 35.61243782078198, "learning_rate": 4.015780095811758e-07, "logits/chosen": -2.851515769958496, "logits/rejected": -2.844977378845215, "logps/chosen": -624.578857421875, "logps/rejected": -702.1490478515625, "loss": 0.2007, "rewards/accuracies": 0.875, "rewards/chosen": 2.7173495292663574, "rewards/margins": 2.772205114364624, "rewards/rejected": -0.05485524237155914, "step": 1989 }, { "epoch": 1.4538812785388129, "grad_norm": 26.077194645252895, "learning_rate": 4.0145113680285814e-07, "logits/chosen": -2.556192398071289, "logits/rejected": -1.2414703369140625, "logps/chosen": -644.75927734375, "logps/rejected": -394.42413330078125, "loss": 0.1303, "rewards/accuracies": 1.0, "rewards/chosen": 3.413057804107666, "rewards/margins": 6.6244354248046875, "rewards/rejected": -3.2113771438598633, "step": 1990 }, { "epoch": 1.4546118721461188, "grad_norm": 11.369973214284839, "learning_rate": 4.0132420237410056e-07, "logits/chosen": -2.699310302734375, "logits/rejected": -2.1630430221557617, "logps/chosen": -946.1588745117188, "logps/rejected": -763.0400390625, "loss": 0.0427, "rewards/accuracies": 1.0, "rewards/chosen": 3.7836556434631348, "rewards/margins": 4.598755836486816, "rewards/rejected": -0.815099835395813, "step": 1991 }, { "epoch": 1.4553424657534246, "grad_norm": 28.29079927698171, "learning_rate": 4.0119720634657374e-07, "logits/chosen": -2.4520580768585205, "logits/rejected": -2.362502336502075, "logps/chosen": -369.9779968261719, "logps/rejected": -383.9998474121094, "loss": 0.2095, "rewards/accuracies": 0.875, "rewards/chosen": 2.154360771179199, "rewards/margins": 3.0627474784851074, "rewards/rejected": -0.9083869457244873, "step": 1992 }, { "epoch": 1.4560730593607305, "grad_norm": 26.24401438453395, "learning_rate": 4.010701487719732e-07, "logits/chosen": -2.09952974319458, "logits/rejected": -2.389951229095459, "logps/chosen": -608.6270141601562, "logps/rejected": -762.7904052734375, "loss": 0.1332, "rewards/accuracies": 1.0, "rewards/chosen": 2.9113383293151855, "rewards/margins": 3.4696767330169678, "rewards/rejected": -0.5583384037017822, "step": 1993 }, { "epoch": 1.4568036529680366, "grad_norm": 53.22447488291923, "learning_rate": 4.0094302970201965e-07, "logits/chosen": -2.141359806060791, "logits/rejected": -1.974938154220581, "logps/chosen": -653.8773193359375, "logps/rejected": -666.183837890625, "loss": 0.3359, "rewards/accuracies": 1.0, "rewards/chosen": 1.9296557903289795, "rewards/margins": 3.1242640018463135, "rewards/rejected": -1.194608211517334, "step": 1994 }, { "epoch": 1.4575342465753425, "grad_norm": 47.78166387165426, "learning_rate": 4.008158491884587e-07, "logits/chosen": -2.9614510536193848, "logits/rejected": -2.13468074798584, "logps/chosen": -961.0645751953125, "logps/rejected": -559.7939453125, "loss": 0.2251, "rewards/accuracies": 1.0, "rewards/chosen": 4.033557891845703, "rewards/margins": 3.991555690765381, "rewards/rejected": 0.042002275586128235, "step": 1995 }, { "epoch": 1.4582648401826483, "grad_norm": 27.271557601352846, "learning_rate": 4.006886072830612e-07, "logits/chosen": -2.3991265296936035, "logits/rejected": -2.3884730339050293, "logps/chosen": -564.4703369140625, "logps/rejected": -449.2028503417969, "loss": 0.1375, "rewards/accuracies": 1.0, "rewards/chosen": 2.665393590927124, "rewards/margins": 3.4826502799987793, "rewards/rejected": -0.8172565698623657, "step": 1996 }, { "epoch": 1.4589954337899544, "grad_norm": 26.776430566576995, "learning_rate": 4.0056130403762277e-07, "logits/chosen": -2.704005241394043, "logits/rejected": -1.7339940071105957, "logps/chosen": -689.5609130859375, "logps/rejected": -484.8673095703125, "loss": 0.1344, "rewards/accuracies": 1.0, "rewards/chosen": 3.496830463409424, "rewards/margins": 4.68547248840332, "rewards/rejected": -1.1886422634124756, "step": 1997 }, { "epoch": 1.4597260273972603, "grad_norm": 34.72516254911559, "learning_rate": 4.004339395039642e-07, "logits/chosen": -2.7961699962615967, "logits/rejected": -2.0700466632843018, "logps/chosen": -467.5685729980469, "logps/rejected": -328.9584045410156, "loss": 0.1554, "rewards/accuracies": 0.875, "rewards/chosen": 2.106064558029175, "rewards/margins": 3.166604518890381, "rewards/rejected": -1.060539960861206, "step": 1998 }, { "epoch": 1.4604566210045662, "grad_norm": 21.277220043357108, "learning_rate": 4.0030651373393104e-07, "logits/chosen": -2.9862375259399414, "logits/rejected": -1.959592580795288, "logps/chosen": -646.1953125, "logps/rejected": -481.187744140625, "loss": 0.1046, "rewards/accuracies": 0.875, "rewards/chosen": 2.1203880310058594, "rewards/margins": 3.1326353549957275, "rewards/rejected": -1.0122475624084473, "step": 1999 }, { "epoch": 1.461187214611872, "grad_norm": 35.39545703767999, "learning_rate": 4.0017902677939386e-07, "logits/chosen": -2.757385730743408, "logits/rejected": -2.349562168121338, "logps/chosen": -837.4406127929688, "logps/rejected": -647.9788818359375, "loss": 0.1849, "rewards/accuracies": 0.875, "rewards/chosen": 2.414149284362793, "rewards/margins": 2.690181016921997, "rewards/rejected": -0.27603164315223694, "step": 2000 }, { "epoch": 1.4619178082191782, "grad_norm": 35.82511810049009, "learning_rate": 4.000514786922481e-07, "logits/chosen": -3.0748801231384277, "logits/rejected": -2.7982091903686523, "logps/chosen": -702.6117553710938, "logps/rejected": -732.5177001953125, "loss": 0.1845, "rewards/accuracies": 1.0, "rewards/chosen": 3.9450130462646484, "rewards/margins": 2.8848447799682617, "rewards/rejected": 1.0601682662963867, "step": 2001 }, { "epoch": 1.462648401826484, "grad_norm": 47.89836550414586, "learning_rate": 3.9992386952441414e-07, "logits/chosen": -2.581446886062622, "logits/rejected": -2.5385689735412598, "logps/chosen": -794.279052734375, "logps/rejected": -846.5274658203125, "loss": 0.2366, "rewards/accuracies": 1.0, "rewards/chosen": 4.281888008117676, "rewards/margins": 2.671304702758789, "rewards/rejected": 1.610582947731018, "step": 2002 }, { "epoch": 1.46337899543379, "grad_norm": 30.021532596348138, "learning_rate": 3.9979619932783716e-07, "logits/chosen": -2.8681349754333496, "logits/rejected": -2.1831462383270264, "logps/chosen": -563.96484375, "logps/rejected": -446.7774963378906, "loss": 0.2018, "rewards/accuracies": 1.0, "rewards/chosen": 2.4777345657348633, "rewards/margins": 3.09551739692688, "rewards/rejected": -0.6177826523780823, "step": 2003 }, { "epoch": 1.464109589041096, "grad_norm": 35.43021723005308, "learning_rate": 3.9966846815448725e-07, "logits/chosen": -2.7489736080169678, "logits/rejected": -2.1619348526000977, "logps/chosen": -679.8635864257812, "logps/rejected": -584.4925537109375, "loss": 0.1808, "rewards/accuracies": 1.0, "rewards/chosen": 3.8712611198425293, "rewards/margins": 4.405383110046387, "rewards/rejected": -0.5341222286224365, "step": 2004 }, { "epoch": 1.4648401826484019, "grad_norm": 43.30636258148521, "learning_rate": 3.9954067605635925e-07, "logits/chosen": -2.6506567001342773, "logits/rejected": -2.06860089302063, "logps/chosen": -581.0849609375, "logps/rejected": -522.1395874023438, "loss": 0.1767, "rewards/accuracies": 0.875, "rewards/chosen": 1.545326828956604, "rewards/margins": 2.529538631439209, "rewards/rejected": -0.9842116832733154, "step": 2005 }, { "epoch": 1.4655707762557078, "grad_norm": 43.137094941140134, "learning_rate": 3.9941282308547285e-07, "logits/chosen": -2.7582321166992188, "logits/rejected": -2.5182855129241943, "logps/chosen": -925.979248046875, "logps/rejected": -893.2974243164062, "loss": 0.1916, "rewards/accuracies": 0.75, "rewards/chosen": 2.7752349376678467, "rewards/margins": 2.340967893600464, "rewards/rejected": 0.4342670142650604, "step": 2006 }, { "epoch": 1.4663013698630136, "grad_norm": 38.3196013249502, "learning_rate": 3.992849092938726e-07, "logits/chosen": -2.604238986968994, "logits/rejected": -2.5759315490722656, "logps/chosen": -354.16729736328125, "logps/rejected": -364.61663818359375, "loss": 0.2342, "rewards/accuracies": 0.875, "rewards/chosen": 1.2245028018951416, "rewards/margins": 2.2045013904571533, "rewards/rejected": -0.9799988865852356, "step": 2007 }, { "epoch": 1.4670319634703195, "grad_norm": 49.565704203643925, "learning_rate": 3.9915693473362755e-07, "logits/chosen": -2.853687286376953, "logits/rejected": -1.8852450847625732, "logps/chosen": -578.6363525390625, "logps/rejected": -348.15325927734375, "loss": 0.2604, "rewards/accuracies": 0.875, "rewards/chosen": 1.413327932357788, "rewards/margins": 1.7772952318191528, "rewards/rejected": -0.3639672100543976, "step": 2008 }, { "epoch": 1.4677625570776256, "grad_norm": 30.6567825753306, "learning_rate": 3.9902889945683184e-07, "logits/chosen": -2.898118019104004, "logits/rejected": -2.449323892593384, "logps/chosen": -499.10699462890625, "logps/rejected": -384.37652587890625, "loss": 0.2095, "rewards/accuracies": 0.875, "rewards/chosen": 2.176264762878418, "rewards/margins": 3.714240312576294, "rewards/rejected": -1.537975788116455, "step": 2009 }, { "epoch": 1.4684931506849315, "grad_norm": 38.08292696434815, "learning_rate": 3.9890080351560384e-07, "logits/chosen": -3.1738576889038086, "logits/rejected": -2.564260721206665, "logps/chosen": -578.75927734375, "logps/rejected": -511.2660217285156, "loss": 0.2215, "rewards/accuracies": 1.0, "rewards/chosen": 3.8820416927337646, "rewards/margins": 4.202953338623047, "rewards/rejected": -0.32091227173805237, "step": 2010 }, { "epoch": 1.4692237442922376, "grad_norm": 27.003773743436653, "learning_rate": 3.987726469620872e-07, "logits/chosen": -2.85871958732605, "logits/rejected": -2.4426004886627197, "logps/chosen": -772.75390625, "logps/rejected": -686.205810546875, "loss": 0.1291, "rewards/accuracies": 0.875, "rewards/chosen": 3.4883499145507812, "rewards/margins": 4.491114616394043, "rewards/rejected": -1.002764344215393, "step": 2011 }, { "epoch": 1.4699543378995434, "grad_norm": 30.482687116906007, "learning_rate": 3.986444298484498e-07, "logits/chosen": -2.351066827774048, "logits/rejected": -2.566793441772461, "logps/chosen": -672.4873046875, "logps/rejected": -735.714599609375, "loss": 0.1327, "rewards/accuracies": 1.0, "rewards/chosen": 2.475375175476074, "rewards/margins": 2.984272003173828, "rewards/rejected": -0.5088970065116882, "step": 2012 }, { "epoch": 1.4706849315068493, "grad_norm": 21.180204772823846, "learning_rate": 3.985161522268845e-07, "logits/chosen": -3.3864002227783203, "logits/rejected": -2.689664125442505, "logps/chosen": -696.6240234375, "logps/rejected": -475.38677978515625, "loss": 0.1097, "rewards/accuracies": 1.0, "rewards/chosen": 2.68034029006958, "rewards/margins": 4.122424125671387, "rewards/rejected": -1.4420838356018066, "step": 2013 }, { "epoch": 1.4714155251141552, "grad_norm": 26.6210372913011, "learning_rate": 3.983878141496083e-07, "logits/chosen": -2.5725347995758057, "logits/rejected": -2.346982002258301, "logps/chosen": -542.096923828125, "logps/rejected": -675.6262817382812, "loss": 0.141, "rewards/accuracies": 0.75, "rewards/chosen": 1.9766417741775513, "rewards/margins": 3.5872511863708496, "rewards/rejected": -1.610609531402588, "step": 2014 }, { "epoch": 1.472146118721461, "grad_norm": 43.12348700923822, "learning_rate": 3.9825941566886345e-07, "logits/chosen": -2.171691417694092, "logits/rejected": -2.3528566360473633, "logps/chosen": -665.8108520507812, "logps/rejected": -712.58544921875, "loss": 0.2386, "rewards/accuracies": 0.875, "rewards/chosen": 4.104864120483398, "rewards/margins": 5.0782880783081055, "rewards/rejected": -0.9734240770339966, "step": 2015 }, { "epoch": 1.4728767123287672, "grad_norm": 21.070405919564873, "learning_rate": 3.981309568369162e-07, "logits/chosen": -2.8006882667541504, "logits/rejected": -1.8255001306533813, "logps/chosen": -627.0532836914062, "logps/rejected": -506.93017578125, "loss": 0.0826, "rewards/accuracies": 1.0, "rewards/chosen": 14.36644458770752, "rewards/margins": 15.438410758972168, "rewards/rejected": -1.071967363357544, "step": 2016 }, { "epoch": 1.473607305936073, "grad_norm": 33.92649345287268, "learning_rate": 3.980024377060578e-07, "logits/chosen": -2.757080078125, "logits/rejected": -2.3429605960845947, "logps/chosen": -442.4732971191406, "logps/rejected": -490.429443359375, "loss": 0.175, "rewards/accuracies": 0.75, "rewards/chosen": 1.624559760093689, "rewards/margins": 3.109562873840332, "rewards/rejected": -1.4850029945373535, "step": 2017 }, { "epoch": 1.4743378995433791, "grad_norm": 51.715228354421875, "learning_rate": 3.9787385832860386e-07, "logits/chosen": -2.369680643081665, "logits/rejected": -1.9829905033111572, "logps/chosen": -845.9168701171875, "logps/rejected": -554.708251953125, "loss": 0.2496, "rewards/accuracies": 1.0, "rewards/chosen": 4.270478248596191, "rewards/margins": 4.92077112197876, "rewards/rejected": -0.6502929925918579, "step": 2018 }, { "epoch": 1.475068493150685, "grad_norm": 58.03723212300876, "learning_rate": 3.977452187568945e-07, "logits/chosen": -1.987714171409607, "logits/rejected": -1.9334416389465332, "logps/chosen": -611.2620849609375, "logps/rejected": -632.068603515625, "loss": 0.2263, "rewards/accuracies": 1.0, "rewards/chosen": 1.6183358430862427, "rewards/margins": 3.73760986328125, "rewards/rejected": -2.119274377822876, "step": 2019 }, { "epoch": 1.4757990867579909, "grad_norm": 26.12580132966619, "learning_rate": 3.9761651904329435e-07, "logits/chosen": -2.6101527214050293, "logits/rejected": -2.0804922580718994, "logps/chosen": -538.580078125, "logps/rejected": -428.66119384765625, "loss": 0.1227, "rewards/accuracies": 0.875, "rewards/chosen": 2.387378454208374, "rewards/margins": 3.45053768157959, "rewards/rejected": -1.0631592273712158, "step": 2020 }, { "epoch": 1.4765296803652967, "grad_norm": 40.32377658414217, "learning_rate": 3.974877592401925e-07, "logits/chosen": -3.0953989028930664, "logits/rejected": -2.3090667724609375, "logps/chosen": -726.9664916992188, "logps/rejected": -573.9375610351562, "loss": 0.2085, "rewards/accuracies": 0.875, "rewards/chosen": 3.1251063346862793, "rewards/margins": 3.0145199298858643, "rewards/rejected": 0.11058646440505981, "step": 2021 }, { "epoch": 1.4772602739726026, "grad_norm": 39.881852186038984, "learning_rate": 3.9735893940000275e-07, "logits/chosen": -2.19653058052063, "logits/rejected": -2.557119846343994, "logps/chosen": -356.7682189941406, "logps/rejected": -567.810791015625, "loss": 0.1886, "rewards/accuracies": 0.875, "rewards/chosen": 1.3779798746109009, "rewards/margins": 2.4598817825317383, "rewards/rejected": -1.0819017887115479, "step": 2022 }, { "epoch": 1.4779908675799087, "grad_norm": 32.317769607785046, "learning_rate": 3.9723005957516287e-07, "logits/chosen": -2.753589630126953, "logits/rejected": -2.3788645267486572, "logps/chosen": -597.4796752929688, "logps/rejected": -677.7657470703125, "loss": 0.1344, "rewards/accuracies": 0.875, "rewards/chosen": 2.9136767387390137, "rewards/margins": 4.787827968597412, "rewards/rejected": -1.8741511106491089, "step": 2023 }, { "epoch": 1.4787214611872146, "grad_norm": 22.16522043811684, "learning_rate": 3.9710111981813553e-07, "logits/chosen": -2.9995791912078857, "logits/rejected": -1.7017053365707397, "logps/chosen": -492.32611083984375, "logps/rejected": -334.07476806640625, "loss": 0.1377, "rewards/accuracies": 1.0, "rewards/chosen": 2.7901833057403564, "rewards/margins": 4.014430522918701, "rewards/rejected": -1.2242472171783447, "step": 2024 }, { "epoch": 1.4794520547945205, "grad_norm": 38.63296495799038, "learning_rate": 3.969721201814074e-07, "logits/chosen": -2.642688274383545, "logits/rejected": -2.3809633255004883, "logps/chosen": -713.39453125, "logps/rejected": -691.1512451171875, "loss": 0.2214, "rewards/accuracies": 0.875, "rewards/chosen": 3.5645883083343506, "rewards/margins": 2.556037425994873, "rewards/rejected": 1.008551001548767, "step": 2025 }, { "epoch": 1.4801826484018266, "grad_norm": 25.550103723722312, "learning_rate": 3.9684306071748983e-07, "logits/chosen": -2.6602272987365723, "logits/rejected": -2.3644192218780518, "logps/chosen": -594.6978759765625, "logps/rejected": -710.3408203125, "loss": 0.1498, "rewards/accuracies": 1.0, "rewards/chosen": 2.3176486492156982, "rewards/margins": 2.1818926334381104, "rewards/rejected": 0.13575607538223267, "step": 2026 }, { "epoch": 1.4809132420091324, "grad_norm": 37.61015721558805, "learning_rate": 3.9671394147891837e-07, "logits/chosen": -2.6555733680725098, "logits/rejected": -2.5816102027893066, "logps/chosen": -988.1665649414062, "logps/rejected": -772.031494140625, "loss": 0.187, "rewards/accuracies": 1.0, "rewards/chosen": 4.038677215576172, "rewards/margins": 3.0882058143615723, "rewards/rejected": 0.9504711031913757, "step": 2027 }, { "epoch": 1.4816438356164383, "grad_norm": 49.3420133085766, "learning_rate": 3.9658476251825286e-07, "logits/chosen": -3.427804708480835, "logits/rejected": -2.041398286819458, "logps/chosen": -876.3052978515625, "logps/rejected": -632.010009765625, "loss": 0.1998, "rewards/accuracies": 1.0, "rewards/chosen": 4.1204047203063965, "rewards/margins": 4.567910194396973, "rewards/rejected": -0.44750598073005676, "step": 2028 }, { "epoch": 1.4823744292237442, "grad_norm": 26.164595536149847, "learning_rate": 3.9645552388807757e-07, "logits/chosen": -2.990318775177002, "logits/rejected": -1.8270468711853027, "logps/chosen": -600.8782958984375, "logps/rejected": -362.5821533203125, "loss": 0.1368, "rewards/accuracies": 1.0, "rewards/chosen": 2.3174445629119873, "rewards/margins": 3.936984062194824, "rewards/rejected": -1.6195393800735474, "step": 2029 }, { "epoch": 1.4831050228310503, "grad_norm": 20.510545204835715, "learning_rate": 3.9632622564100104e-07, "logits/chosen": -2.6439976692199707, "logits/rejected": -2.021228313446045, "logps/chosen": -648.8969116210938, "logps/rejected": -471.253662109375, "loss": 0.0923, "rewards/accuracies": 1.0, "rewards/chosen": 2.9255385398864746, "rewards/margins": 3.8146724700927734, "rewards/rejected": -0.8891339898109436, "step": 2030 }, { "epoch": 1.4838356164383562, "grad_norm": 19.882712733050287, "learning_rate": 3.96196867829656e-07, "logits/chosen": -2.7992115020751953, "logits/rejected": -2.4069998264312744, "logps/chosen": -663.1290893554688, "logps/rejected": -775.4932861328125, "loss": 0.0884, "rewards/accuracies": 1.0, "rewards/chosen": 3.6103625297546387, "rewards/margins": 4.784677505493164, "rewards/rejected": -1.1743147373199463, "step": 2031 }, { "epoch": 1.484566210045662, "grad_norm": 61.783867942130534, "learning_rate": 3.9606745050669944e-07, "logits/chosen": -2.4063963890075684, "logits/rejected": -1.7956215143203735, "logps/chosen": -902.356201171875, "logps/rejected": -556.6953735351562, "loss": 0.3841, "rewards/accuracies": 1.0, "rewards/chosen": 2.772810935974121, "rewards/margins": 2.4817874431610107, "rewards/rejected": 0.2910235524177551, "step": 2032 }, { "epoch": 1.4852968036529681, "grad_norm": 52.121182601522484, "learning_rate": 3.9593797372481275e-07, "logits/chosen": -3.307703733444214, "logits/rejected": -2.1975741386413574, "logps/chosen": -620.3331298828125, "logps/rejected": -399.2884521484375, "loss": 0.2418, "rewards/accuracies": 0.875, "rewards/chosen": 2.1821305751800537, "rewards/margins": 2.8863542079925537, "rewards/rejected": -0.7042236328125, "step": 2033 }, { "epoch": 1.486027397260274, "grad_norm": 51.07645923025597, "learning_rate": 3.958084375367012e-07, "logits/chosen": -3.3962764739990234, "logits/rejected": -1.8217897415161133, "logps/chosen": -915.8225708007812, "logps/rejected": -439.8701171875, "loss": 0.3451, "rewards/accuracies": 1.0, "rewards/chosen": 4.555021286010742, "rewards/margins": 4.5982208251953125, "rewards/rejected": -0.04319979250431061, "step": 2034 }, { "epoch": 1.4867579908675799, "grad_norm": 33.262447006487285, "learning_rate": 3.9567884199509456e-07, "logits/chosen": -2.438955545425415, "logits/rejected": -1.8194870948791504, "logps/chosen": -712.3458862304688, "logps/rejected": -576.50439453125, "loss": 0.1846, "rewards/accuracies": 0.875, "rewards/chosen": 2.6154370307922363, "rewards/margins": 4.796720027923584, "rewards/rejected": -2.1812829971313477, "step": 2035 }, { "epoch": 1.4874885844748857, "grad_norm": 29.044996409127855, "learning_rate": 3.9554918715274654e-07, "logits/chosen": -2.446140766143799, "logits/rejected": -2.638371706008911, "logps/chosen": -492.7169189453125, "logps/rejected": -668.6528930664062, "loss": 0.1267, "rewards/accuracies": 0.875, "rewards/chosen": 1.3515777587890625, "rewards/margins": 2.807981491088867, "rewards/rejected": -1.4564034938812256, "step": 2036 }, { "epoch": 1.4882191780821918, "grad_norm": 36.14050847095351, "learning_rate": 3.954194730624351e-07, "logits/chosen": -2.6989974975585938, "logits/rejected": -2.9097423553466797, "logps/chosen": -352.55108642578125, "logps/rejected": -418.1647644042969, "loss": 0.1826, "rewards/accuracies": 0.875, "rewards/chosen": 0.7255959510803223, "rewards/margins": 1.0513644218444824, "rewards/rejected": -0.32576847076416016, "step": 2037 }, { "epoch": 1.4889497716894977, "grad_norm": 20.328944665450233, "learning_rate": 3.952896997769623e-07, "logits/chosen": -2.9087016582489014, "logits/rejected": -2.072800874710083, "logps/chosen": -373.9752197265625, "logps/rejected": -273.23529052734375, "loss": 0.1167, "rewards/accuracies": 1.0, "rewards/chosen": 2.454268217086792, "rewards/margins": 3.533350944519043, "rewards/rejected": -1.0790824890136719, "step": 2038 }, { "epoch": 1.4896803652968036, "grad_norm": 41.294628438704045, "learning_rate": 3.951598673491543e-07, "logits/chosen": -2.4958348274230957, "logits/rejected": -1.4374337196350098, "logps/chosen": -291.02984619140625, "logps/rejected": -215.00787353515625, "loss": 0.1629, "rewards/accuracies": 1.0, "rewards/chosen": 2.609248399734497, "rewards/margins": 6.001482009887695, "rewards/rejected": -3.3922338485717773, "step": 2039 }, { "epoch": 1.4904109589041097, "grad_norm": 43.27770038467776, "learning_rate": 3.950299758318614e-07, "logits/chosen": -2.7305097579956055, "logits/rejected": -2.738119602203369, "logps/chosen": -641.5196533203125, "logps/rejected": -592.9754638671875, "loss": 0.2006, "rewards/accuracies": 0.875, "rewards/chosen": 3.008439779281616, "rewards/margins": 3.3712782859802246, "rewards/rejected": -0.3628385663032532, "step": 2040 }, { "epoch": 1.4911415525114156, "grad_norm": 47.00347238253388, "learning_rate": 3.9490002527795763e-07, "logits/chosen": -2.3330750465393066, "logits/rejected": -2.539062023162842, "logps/chosen": -674.06298828125, "logps/rejected": -600.5253295898438, "loss": 0.2469, "rewards/accuracies": 0.875, "rewards/chosen": 2.8010025024414062, "rewards/margins": 2.2708942890167236, "rewards/rejected": 0.5301079750061035, "step": 2041 }, { "epoch": 1.4918721461187214, "grad_norm": 68.54721513763342, "learning_rate": 3.947700157403415e-07, "logits/chosen": -2.941361427307129, "logits/rejected": -1.829237461090088, "logps/chosen": -759.40869140625, "logps/rejected": -519.2645263671875, "loss": 0.3837, "rewards/accuracies": 0.75, "rewards/chosen": 1.453477382659912, "rewards/margins": 1.8805135488510132, "rewards/rejected": -0.4270361363887787, "step": 2042 }, { "epoch": 1.4926027397260273, "grad_norm": 25.855003406920584, "learning_rate": 3.946399472719353e-07, "logits/chosen": -2.009100914001465, "logits/rejected": -1.9191054105758667, "logps/chosen": -634.1023559570312, "logps/rejected": -569.5465698242188, "loss": 0.1332, "rewards/accuracies": 1.0, "rewards/chosen": 3.493508815765381, "rewards/margins": 3.970093250274658, "rewards/rejected": -0.47658422589302063, "step": 2043 }, { "epoch": 1.4933333333333334, "grad_norm": 24.63483430094784, "learning_rate": 3.9450981992568534e-07, "logits/chosen": -3.0777664184570312, "logits/rejected": -2.1887030601501465, "logps/chosen": -585.1337280273438, "logps/rejected": -398.92132568359375, "loss": 0.14, "rewards/accuracies": 0.875, "rewards/chosen": 2.1119308471679688, "rewards/margins": 2.2488036155700684, "rewards/rejected": -0.13687238097190857, "step": 2044 }, { "epoch": 1.4940639269406393, "grad_norm": 45.75613912447296, "learning_rate": 3.9437963375456184e-07, "logits/chosen": -2.973320245742798, "logits/rejected": -2.296114921569824, "logps/chosen": -715.826416015625, "logps/rejected": -557.35009765625, "loss": 0.2143, "rewards/accuracies": 1.0, "rewards/chosen": 3.456427574157715, "rewards/margins": 3.4355621337890625, "rewards/rejected": 0.02086549997329712, "step": 2045 }, { "epoch": 1.4947945205479451, "grad_norm": 50.36924449518401, "learning_rate": 3.94249388811559e-07, "logits/chosen": -2.888882637023926, "logits/rejected": -1.6878317594528198, "logps/chosen": -1295.846435546875, "logps/rejected": -628.271240234375, "loss": 0.2034, "rewards/accuracies": 0.875, "rewards/chosen": 4.587575435638428, "rewards/margins": 5.28541374206543, "rewards/rejected": -0.6978386640548706, "step": 2046 }, { "epoch": 1.4955251141552512, "grad_norm": 26.956959645698987, "learning_rate": 3.941190851496951e-07, "logits/chosen": -2.926286220550537, "logits/rejected": -2.1218819618225098, "logps/chosen": -779.5520629882812, "logps/rejected": -742.8275756835938, "loss": 0.1191, "rewards/accuracies": 1.0, "rewards/chosen": 3.731653928756714, "rewards/margins": 4.552395820617676, "rewards/rejected": -0.8207416534423828, "step": 2047 }, { "epoch": 1.4962557077625571, "grad_norm": 39.573046394946175, "learning_rate": 3.939887228220121e-07, "logits/chosen": -3.0505316257476807, "logits/rejected": -2.197049856185913, "logps/chosen": -613.0873413085938, "logps/rejected": -415.622314453125, "loss": 0.2409, "rewards/accuracies": 0.875, "rewards/chosen": 2.751866340637207, "rewards/margins": 3.001797914505005, "rewards/rejected": -0.24993157386779785, "step": 2048 }, { "epoch": 1.496986301369863, "grad_norm": 29.035460791908186, "learning_rate": 3.938583018815759e-07, "logits/chosen": -2.7758731842041016, "logits/rejected": -2.5160927772521973, "logps/chosen": -730.6234130859375, "logps/rejected": -673.0419921875, "loss": 0.1068, "rewards/accuracies": 1.0, "rewards/chosen": 3.63899564743042, "rewards/margins": 3.1038780212402344, "rewards/rejected": 0.535117506980896, "step": 2049 }, { "epoch": 1.4977168949771689, "grad_norm": 42.5206564576082, "learning_rate": 3.937278223814763e-07, "logits/chosen": -2.4335436820983887, "logits/rejected": -2.3280091285705566, "logps/chosen": -439.3822021484375, "logps/rejected": -359.7938232421875, "loss": 0.1812, "rewards/accuracies": 1.0, "rewards/chosen": 1.5680146217346191, "rewards/margins": 2.3885631561279297, "rewards/rejected": -0.820548415184021, "step": 2050 }, { "epoch": 1.498447488584475, "grad_norm": 33.16360883818097, "learning_rate": 3.935972843748269e-07, "logits/chosen": -3.1005287170410156, "logits/rejected": -2.858140707015991, "logps/chosen": -721.6197509765625, "logps/rejected": -718.3309326171875, "loss": 0.169, "rewards/accuracies": 1.0, "rewards/chosen": 2.8580970764160156, "rewards/margins": 3.312145233154297, "rewards/rejected": -0.454047828912735, "step": 2051 }, { "epoch": 1.4991780821917808, "grad_norm": 22.211397781460086, "learning_rate": 3.934666879147652e-07, "logits/chosen": -2.6840498447418213, "logits/rejected": -1.3547818660736084, "logps/chosen": -608.3017578125, "logps/rejected": -321.5207214355469, "loss": 0.0946, "rewards/accuracies": 1.0, "rewards/chosen": 4.230090141296387, "rewards/margins": 6.305905342102051, "rewards/rejected": -2.075815200805664, "step": 2052 }, { "epoch": 1.4999086757990867, "grad_norm": 18.92784552403259, "learning_rate": 3.933360330544523e-07, "logits/chosen": -2.295407772064209, "logits/rejected": -2.349919557571411, "logps/chosen": -515.318359375, "logps/rejected": -623.345947265625, "loss": 0.142, "rewards/accuracies": 0.875, "rewards/chosen": 2.0023200511932373, "rewards/margins": 3.6775612831115723, "rewards/rejected": -1.6752415895462036, "step": 2053 }, { "epoch": 1.5006392694063928, "grad_norm": 36.47576015796741, "learning_rate": 3.9320531984707347e-07, "logits/chosen": -2.754687786102295, "logits/rejected": -2.216726541519165, "logps/chosen": -348.57928466796875, "logps/rejected": -207.39376831054688, "loss": 0.2144, "rewards/accuracies": 0.875, "rewards/chosen": 1.0367307662963867, "rewards/margins": 2.589524745941162, "rewards/rejected": -1.5527942180633545, "step": 2054 }, { "epoch": 1.5013698630136987, "grad_norm": 27.502065130200336, "learning_rate": 3.930745483458372e-07, "logits/chosen": -2.374785900115967, "logits/rejected": -2.112578868865967, "logps/chosen": -480.8114318847656, "logps/rejected": -571.2554931640625, "loss": 0.1415, "rewards/accuracies": 1.0, "rewards/chosen": 3.087625026702881, "rewards/margins": 5.862153053283691, "rewards/rejected": -2.7745282649993896, "step": 2055 }, { "epoch": 1.5021004566210046, "grad_norm": 43.08903334865932, "learning_rate": 3.929437186039761e-07, "logits/chosen": -3.2199482917785645, "logits/rejected": -2.8421077728271484, "logps/chosen": -780.8740234375, "logps/rejected": -738.747314453125, "loss": 0.2095, "rewards/accuracies": 0.875, "rewards/chosen": 2.508850574493408, "rewards/margins": 2.0277438163757324, "rewards/rejected": 0.48110657930374146, "step": 2056 }, { "epoch": 1.5028310502283104, "grad_norm": 24.031095070209297, "learning_rate": 3.928128306747465e-07, "logits/chosen": -2.3179523944854736, "logits/rejected": -2.221741199493408, "logps/chosen": -672.5963745117188, "logps/rejected": -683.4457397460938, "loss": 0.1311, "rewards/accuracies": 1.0, "rewards/chosen": 2.2258970737457275, "rewards/margins": 2.539543628692627, "rewards/rejected": -0.31364643573760986, "step": 2057 }, { "epoch": 1.5035616438356163, "grad_norm": 16.8888289690349, "learning_rate": 3.926818846114279e-07, "logits/chosen": -2.964026689529419, "logits/rejected": -2.5172669887542725, "logps/chosen": -625.008056640625, "logps/rejected": -461.8682861328125, "loss": 0.0933, "rewards/accuracies": 1.0, "rewards/chosen": 3.3529841899871826, "rewards/margins": 2.98207426071167, "rewards/rejected": 0.370909720659256, "step": 2058 }, { "epoch": 1.5042922374429224, "grad_norm": 32.99132261919886, "learning_rate": 3.925508804673242e-07, "logits/chosen": -2.8317437171936035, "logits/rejected": -2.886335849761963, "logps/chosen": -460.2156982421875, "logps/rejected": -554.304931640625, "loss": 0.1815, "rewards/accuracies": 0.875, "rewards/chosen": 2.2964513301849365, "rewards/margins": 2.779303550720215, "rewards/rejected": -0.48285233974456787, "step": 2059 }, { "epoch": 1.5050228310502283, "grad_norm": 43.21326875717327, "learning_rate": 3.924198182957624e-07, "logits/chosen": -2.1596760749816895, "logits/rejected": -1.5068978071212769, "logps/chosen": -525.43017578125, "logps/rejected": -308.9991455078125, "loss": 0.2951, "rewards/accuracies": 0.875, "rewards/chosen": 1.8609659671783447, "rewards/margins": 2.5907821655273438, "rewards/rejected": -0.7298163175582886, "step": 2060 }, { "epoch": 1.5057534246575344, "grad_norm": 50.45485005954581, "learning_rate": 3.9228869815009346e-07, "logits/chosen": -2.8716611862182617, "logits/rejected": -2.7328271865844727, "logps/chosen": -708.7794189453125, "logps/rejected": -758.253173828125, "loss": 0.2407, "rewards/accuracies": 0.875, "rewards/chosen": 3.8557519912719727, "rewards/margins": 3.5031280517578125, "rewards/rejected": 0.3526240587234497, "step": 2061 }, { "epoch": 1.5064840182648402, "grad_norm": 22.1978299016762, "learning_rate": 3.921575200836916e-07, "logits/chosen": -2.729665517807007, "logits/rejected": -2.1040899753570557, "logps/chosen": -442.9503479003906, "logps/rejected": -521.8724365234375, "loss": 0.1154, "rewards/accuracies": 0.875, "rewards/chosen": 2.5919623374938965, "rewards/margins": 4.275289535522461, "rewards/rejected": -1.683327555656433, "step": 2062 }, { "epoch": 1.5072146118721461, "grad_norm": 35.36022942792245, "learning_rate": 3.9202628414995497e-07, "logits/chosen": -2.50882625579834, "logits/rejected": -2.313809394836426, "logps/chosen": -1005.94189453125, "logps/rejected": -817.3544921875, "loss": 0.1718, "rewards/accuracies": 0.875, "rewards/chosen": 2.956571340560913, "rewards/margins": 3.1957054138183594, "rewards/rejected": -0.2391340434551239, "step": 2063 }, { "epoch": 1.507945205479452, "grad_norm": 28.20853912714198, "learning_rate": 3.91894990402305e-07, "logits/chosen": -2.7756834030151367, "logits/rejected": -2.4814772605895996, "logps/chosen": -641.153076171875, "logps/rejected": -675.02001953125, "loss": 0.1485, "rewards/accuracies": 1.0, "rewards/chosen": 3.4384074211120605, "rewards/margins": 4.073886394500732, "rewards/rejected": -0.6354789733886719, "step": 2064 }, { "epoch": 1.5086757990867579, "grad_norm": 25.713177175696316, "learning_rate": 3.9176363889418677e-07, "logits/chosen": -2.486931562423706, "logits/rejected": -2.5312442779541016, "logps/chosen": -505.8447265625, "logps/rejected": -543.1812133789062, "loss": 0.1189, "rewards/accuracies": 0.75, "rewards/chosen": 0.9504318833351135, "rewards/margins": 2.158762216567993, "rewards/rejected": -1.2083302736282349, "step": 2065 }, { "epoch": 1.509406392694064, "grad_norm": 39.98045023967573, "learning_rate": 3.9163222967906897e-07, "logits/chosen": -3.0049266815185547, "logits/rejected": -2.3735008239746094, "logps/chosen": -950.3441162109375, "logps/rejected": -663.6646728515625, "loss": 0.2136, "rewards/accuracies": 0.875, "rewards/chosen": 2.8332273960113525, "rewards/margins": 2.3761849403381348, "rewards/rejected": 0.45704275369644165, "step": 2066 }, { "epoch": 1.5101369863013698, "grad_norm": 31.104719916344187, "learning_rate": 3.9150076281044355e-07, "logits/chosen": -2.8876843452453613, "logits/rejected": -2.255343198776245, "logps/chosen": -478.1247863769531, "logps/rejected": -412.1853942871094, "loss": 0.164, "rewards/accuracies": 1.0, "rewards/chosen": 2.1669392585754395, "rewards/margins": 3.754565477371216, "rewards/rejected": -1.587626338005066, "step": 2067 }, { "epoch": 1.510867579908676, "grad_norm": 35.92780439863046, "learning_rate": 3.9136923834182616e-07, "logits/chosen": -2.8743884563446045, "logits/rejected": -1.6769025325775146, "logps/chosen": -344.8359680175781, "logps/rejected": -261.67376708984375, "loss": 0.164, "rewards/accuracies": 1.0, "rewards/chosen": 3.616178035736084, "rewards/margins": 6.847119331359863, "rewards/rejected": -3.2309417724609375, "step": 2068 }, { "epoch": 1.5115981735159818, "grad_norm": 26.29932122460578, "learning_rate": 3.9123765632675574e-07, "logits/chosen": -3.1276488304138184, "logits/rejected": -2.9126267433166504, "logps/chosen": -756.0218505859375, "logps/rejected": -588.06689453125, "loss": 0.1658, "rewards/accuracies": 1.0, "rewards/chosen": 3.392162799835205, "rewards/margins": 3.447305202484131, "rewards/rejected": -0.055142223834991455, "step": 2069 }, { "epoch": 1.5123287671232877, "grad_norm": 53.391836758271154, "learning_rate": 3.9110601681879474e-07, "logits/chosen": -3.3860974311828613, "logits/rejected": -2.259260654449463, "logps/chosen": -753.0557861328125, "logps/rejected": -572.3944091796875, "loss": 0.2485, "rewards/accuracies": 1.0, "rewards/chosen": 3.7344467639923096, "rewards/margins": 4.705453872680664, "rewards/rejected": -0.9710066318511963, "step": 2070 }, { "epoch": 1.5130593607305935, "grad_norm": 38.96757685433413, "learning_rate": 3.9097431987152883e-07, "logits/chosen": -2.858487844467163, "logits/rejected": -2.434929370880127, "logps/chosen": -519.9581909179688, "logps/rejected": -554.0702514648438, "loss": 0.2368, "rewards/accuracies": 1.0, "rewards/chosen": 3.4079623222351074, "rewards/margins": 4.521728038787842, "rewards/rejected": -1.1137654781341553, "step": 2071 }, { "epoch": 1.5137899543378994, "grad_norm": 34.47580744272895, "learning_rate": 3.908425655385675e-07, "logits/chosen": -2.9188313484191895, "logits/rejected": -2.2914416790008545, "logps/chosen": -635.0799560546875, "logps/rejected": -411.8494567871094, "loss": 0.1957, "rewards/accuracies": 1.0, "rewards/chosen": 1.9883896112442017, "rewards/margins": 1.7559645175933838, "rewards/rejected": 0.23242507874965668, "step": 2072 }, { "epoch": 1.5145205479452055, "grad_norm": 56.26388066582437, "learning_rate": 3.9071075387354303e-07, "logits/chosen": -1.9874553680419922, "logits/rejected": -2.5158493518829346, "logps/chosen": -608.2017822265625, "logps/rejected": -917.2142333984375, "loss": 0.2365, "rewards/accuracies": 1.0, "rewards/chosen": 2.898181915283203, "rewards/margins": 3.7231788635253906, "rewards/rejected": -0.824997067451477, "step": 2073 }, { "epoch": 1.5152511415525114, "grad_norm": 34.64183191684134, "learning_rate": 3.9057888493011155e-07, "logits/chosen": -2.337101459503174, "logits/rejected": -1.498834252357483, "logps/chosen": -741.4012451171875, "logps/rejected": -314.5173034667969, "loss": 0.1694, "rewards/accuracies": 1.0, "rewards/chosen": 3.7603235244750977, "rewards/margins": 4.611852169036865, "rewards/rejected": -0.8515281081199646, "step": 2074 }, { "epoch": 1.5159817351598175, "grad_norm": 38.621927972493815, "learning_rate": 3.904469587619521e-07, "logits/chosen": -2.4956674575805664, "logits/rejected": -2.534233808517456, "logps/chosen": -320.4864501953125, "logps/rejected": -516.4617919921875, "loss": 0.1988, "rewards/accuracies": 1.0, "rewards/chosen": 2.480125665664673, "rewards/margins": 4.107148170471191, "rewards/rejected": -1.6270229816436768, "step": 2075 }, { "epoch": 1.5167123287671234, "grad_norm": 48.24703480202177, "learning_rate": 3.9031497542276727e-07, "logits/chosen": -3.3524749279022217, "logits/rejected": -2.489480972290039, "logps/chosen": -749.3978881835938, "logps/rejected": -619.1942138671875, "loss": 0.2292, "rewards/accuracies": 1.0, "rewards/chosen": 3.6692867279052734, "rewards/margins": 5.100183963775635, "rewards/rejected": -1.4308971166610718, "step": 2076 }, { "epoch": 1.5174429223744292, "grad_norm": 35.671281272435216, "learning_rate": 3.9018293496628287e-07, "logits/chosen": -3.5989432334899902, "logits/rejected": -1.857743263244629, "logps/chosen": -807.4669189453125, "logps/rejected": -412.60711669921875, "loss": 0.1605, "rewards/accuracies": 0.875, "rewards/chosen": 3.5370843410491943, "rewards/margins": 3.5333187580108643, "rewards/rejected": 0.003765508532524109, "step": 2077 }, { "epoch": 1.5181735159817351, "grad_norm": 19.70991330201751, "learning_rate": 3.9005083744624776e-07, "logits/chosen": -2.9218215942382812, "logits/rejected": -2.2594923973083496, "logps/chosen": -618.86083984375, "logps/rejected": -533.1600341796875, "loss": 0.1324, "rewards/accuracies": 0.875, "rewards/chosen": 2.1839101314544678, "rewards/margins": 3.126729965209961, "rewards/rejected": -0.9428198933601379, "step": 2078 }, { "epoch": 1.518904109589041, "grad_norm": 42.437163451353285, "learning_rate": 3.8991868291643446e-07, "logits/chosen": -2.8480069637298584, "logits/rejected": -1.7628886699676514, "logps/chosen": -778.4925537109375, "logps/rejected": -474.22454833984375, "loss": 0.2286, "rewards/accuracies": 0.75, "rewards/chosen": 3.7280843257904053, "rewards/margins": 3.5767269134521484, "rewards/rejected": 0.15135729312896729, "step": 2079 }, { "epoch": 1.519634703196347, "grad_norm": 46.583164458176306, "learning_rate": 3.897864714306384e-07, "logits/chosen": -2.9421818256378174, "logits/rejected": -2.310513973236084, "logps/chosen": -662.1827392578125, "logps/rejected": -495.9148254394531, "loss": 0.2352, "rewards/accuracies": 1.0, "rewards/chosen": 2.856527805328369, "rewards/margins": 2.756051540374756, "rewards/rejected": 0.10047605633735657, "step": 2080 }, { "epoch": 1.520365296803653, "grad_norm": 29.883474257378545, "learning_rate": 3.8965420304267796e-07, "logits/chosen": -2.1752333641052246, "logits/rejected": -2.0383100509643555, "logps/chosen": -398.8538818359375, "logps/rejected": -269.1472473144531, "loss": 0.1775, "rewards/accuracies": 1.0, "rewards/chosen": 1.805747389793396, "rewards/margins": 2.226008892059326, "rewards/rejected": -0.4202614724636078, "step": 2081 }, { "epoch": 1.521095890410959, "grad_norm": 17.346550132913226, "learning_rate": 3.895218778063952e-07, "logits/chosen": -2.267003059387207, "logits/rejected": -2.054842472076416, "logps/chosen": -693.9815673828125, "logps/rejected": -685.86767578125, "loss": 0.0712, "rewards/accuracies": 1.0, "rewards/chosen": 4.521918296813965, "rewards/margins": 4.423695087432861, "rewards/rejected": 0.09822356700897217, "step": 2082 }, { "epoch": 1.521826484018265, "grad_norm": 34.675800391807826, "learning_rate": 3.8938949577565516e-07, "logits/chosen": -2.6385245323181152, "logits/rejected": -2.6135997772216797, "logps/chosen": -291.1665954589844, "logps/rejected": -438.76116943359375, "loss": 0.199, "rewards/accuracies": 1.0, "rewards/chosen": 2.366621971130371, "rewards/margins": 3.755939483642578, "rewards/rejected": -1.3893176317214966, "step": 2083 }, { "epoch": 1.5225570776255708, "grad_norm": 31.616895036638443, "learning_rate": 3.8925705700434565e-07, "logits/chosen": -2.9316298961639404, "logits/rejected": -2.3243846893310547, "logps/chosen": -1002.242431640625, "logps/rejected": -808.4124145507812, "loss": 0.1031, "rewards/accuracies": 1.0, "rewards/chosen": 5.689395904541016, "rewards/margins": 5.165524005889893, "rewards/rejected": 0.5238716006278992, "step": 2084 }, { "epoch": 1.5232876712328767, "grad_norm": 36.32072169187653, "learning_rate": 3.891245615463781e-07, "logits/chosen": -3.007110357284546, "logits/rejected": -2.5373551845550537, "logps/chosen": -695.6798095703125, "logps/rejected": -751.04736328125, "loss": 0.1802, "rewards/accuracies": 0.75, "rewards/chosen": 1.844559907913208, "rewards/margins": 2.1700003147125244, "rewards/rejected": -0.3254404067993164, "step": 2085 }, { "epoch": 1.5240182648401825, "grad_norm": 71.93680766391431, "learning_rate": 3.8899200945568644e-07, "logits/chosen": -2.3836376667022705, "logits/rejected": -2.786158561706543, "logps/chosen": -189.36949157714844, "logps/rejected": -344.1628112792969, "loss": 0.2681, "rewards/accuracies": 0.875, "rewards/chosen": 1.3474031686782837, "rewards/margins": 3.398714780807495, "rewards/rejected": -2.0513112545013428, "step": 2086 }, { "epoch": 1.5247488584474884, "grad_norm": 33.30796499677761, "learning_rate": 3.888594007862283e-07, "logits/chosen": -2.7164864540100098, "logits/rejected": -2.6210808753967285, "logps/chosen": -449.13250732421875, "logps/rejected": -458.4549560546875, "loss": 0.168, "rewards/accuracies": 0.875, "rewards/chosen": 1.416174054145813, "rewards/margins": 2.313127040863037, "rewards/rejected": -0.8969526290893555, "step": 2087 }, { "epoch": 1.5254794520547945, "grad_norm": 27.10550787439531, "learning_rate": 3.8872673559198384e-07, "logits/chosen": -2.806905746459961, "logits/rejected": -2.1207756996154785, "logps/chosen": -689.8788452148438, "logps/rejected": -467.83331298828125, "loss": 0.1268, "rewards/accuracies": 1.0, "rewards/chosen": 3.39646577835083, "rewards/margins": 4.056559085845947, "rewards/rejected": -0.6600935459136963, "step": 2088 }, { "epoch": 1.5262100456621006, "grad_norm": 25.183417734862633, "learning_rate": 3.8859401392695645e-07, "logits/chosen": -2.9394426345825195, "logits/rejected": -1.8919682502746582, "logps/chosen": -896.730224609375, "logps/rejected": -450.4561462402344, "loss": 0.1394, "rewards/accuracies": 1.0, "rewards/chosen": 3.9971137046813965, "rewards/margins": 3.402620553970337, "rewards/rejected": 0.5944935083389282, "step": 2089 }, { "epoch": 1.5269406392694065, "grad_norm": 35.62879125933916, "learning_rate": 3.8846123584517244e-07, "logits/chosen": -3.5513172149658203, "logits/rejected": -2.2905023097991943, "logps/chosen": -867.3721313476562, "logps/rejected": -505.63995361328125, "loss": 0.1302, "rewards/accuracies": 1.0, "rewards/chosen": 4.704199314117432, "rewards/margins": 4.148782730102539, "rewards/rejected": 0.5554169416427612, "step": 2090 }, { "epoch": 1.5276712328767124, "grad_norm": 26.063823126635942, "learning_rate": 3.883284014006811e-07, "logits/chosen": -3.0186233520507812, "logits/rejected": -2.205317497253418, "logps/chosen": -787.27685546875, "logps/rejected": -559.2437133789062, "loss": 0.1163, "rewards/accuracies": 0.875, "rewards/chosen": 3.1233010292053223, "rewards/margins": 3.689512014389038, "rewards/rejected": -0.5662111043930054, "step": 2091 }, { "epoch": 1.5284018264840182, "grad_norm": 31.354741459813038, "learning_rate": 3.8819551064755474e-07, "logits/chosen": -2.6960747241973877, "logits/rejected": -2.138549566268921, "logps/chosen": -642.3903198242188, "logps/rejected": -396.23406982421875, "loss": 0.1664, "rewards/accuracies": 0.75, "rewards/chosen": 2.564499616622925, "rewards/margins": 2.2357778549194336, "rewards/rejected": 0.32872164249420166, "step": 2092 }, { "epoch": 1.529132420091324, "grad_norm": 59.609237794730696, "learning_rate": 3.880625636398884e-07, "logits/chosen": -3.0433380603790283, "logits/rejected": -1.9287774562835693, "logps/chosen": -1026.478271484375, "logps/rejected": -614.4116821289062, "loss": 0.2455, "rewards/accuracies": 0.875, "rewards/chosen": 5.253866195678711, "rewards/margins": 4.28134822845459, "rewards/rejected": 0.9725180864334106, "step": 2093 }, { "epoch": 1.52986301369863, "grad_norm": 28.07634137177098, "learning_rate": 3.8792956043180024e-07, "logits/chosen": -2.75240421295166, "logits/rejected": -2.5903680324554443, "logps/chosen": -542.3292846679688, "logps/rejected": -576.48681640625, "loss": 0.1641, "rewards/accuracies": 1.0, "rewards/chosen": 2.9715280532836914, "rewards/margins": 3.913952350616455, "rewards/rejected": -0.9424249529838562, "step": 2094 }, { "epoch": 1.530593607305936, "grad_norm": 26.324887365841068, "learning_rate": 3.877965010774311e-07, "logits/chosen": -2.875999927520752, "logits/rejected": -2.3490257263183594, "logps/chosen": -747.0516357421875, "logps/rejected": -634.7401123046875, "loss": 0.1137, "rewards/accuracies": 1.0, "rewards/chosen": 4.145581245422363, "rewards/margins": 5.956198692321777, "rewards/rejected": -1.8106175661087036, "step": 2095 }, { "epoch": 1.5313242009132422, "grad_norm": 43.63997884859994, "learning_rate": 3.876633856309449e-07, "logits/chosen": -3.0791826248168945, "logits/rejected": -2.0563650131225586, "logps/chosen": -755.0982666015625, "logps/rejected": -523.8614501953125, "loss": 0.1764, "rewards/accuracies": 1.0, "rewards/chosen": 3.699789047241211, "rewards/margins": 3.9376115798950195, "rewards/rejected": -0.23782263696193695, "step": 2096 }, { "epoch": 1.532054794520548, "grad_norm": 35.36508287724012, "learning_rate": 3.8753021414652814e-07, "logits/chosen": -2.8802828788757324, "logits/rejected": -2.562833786010742, "logps/chosen": -822.1615600585938, "logps/rejected": -734.5416259765625, "loss": 0.1411, "rewards/accuracies": 1.0, "rewards/chosen": 4.361935138702393, "rewards/margins": 4.434131145477295, "rewards/rejected": -0.072196364402771, "step": 2097 }, { "epoch": 1.532785388127854, "grad_norm": 54.62988929777819, "learning_rate": 3.873969866783904e-07, "logits/chosen": -2.61529803276062, "logits/rejected": -1.7530152797698975, "logps/chosen": -495.05291748046875, "logps/rejected": -384.0670471191406, "loss": 0.3213, "rewards/accuracies": 0.75, "rewards/chosen": 1.1238774061203003, "rewards/margins": 1.5226174592971802, "rewards/rejected": -0.3987400233745575, "step": 2098 }, { "epoch": 1.5335159817351598, "grad_norm": 28.595593135609963, "learning_rate": 3.8726370328076366e-07, "logits/chosen": -2.701305866241455, "logits/rejected": -2.0561370849609375, "logps/chosen": -550.8096923828125, "logps/rejected": -419.3560485839844, "loss": 0.1252, "rewards/accuracies": 1.0, "rewards/chosen": 3.088046073913574, "rewards/margins": 4.487891674041748, "rewards/rejected": -1.3998456001281738, "step": 2099 }, { "epoch": 1.5342465753424657, "grad_norm": 32.221434251058916, "learning_rate": 3.871303640079032e-07, "logits/chosen": -2.6961190700531006, "logits/rejected": -2.0754971504211426, "logps/chosen": -492.63800048828125, "logps/rejected": -348.41607666015625, "loss": 0.2072, "rewards/accuracies": 1.0, "rewards/chosen": 3.1288065910339355, "rewards/margins": 4.6065993309021, "rewards/rejected": -1.477792739868164, "step": 2100 }, { "epoch": 1.5349771689497715, "grad_norm": 46.82561982371403, "learning_rate": 3.869969689140865e-07, "logits/chosen": -3.0687968730926514, "logits/rejected": -2.1634156703948975, "logps/chosen": -705.5149536132812, "logps/rejected": -432.768310546875, "loss": 0.2198, "rewards/accuracies": 1.0, "rewards/chosen": 3.4496030807495117, "rewards/margins": 3.454193115234375, "rewards/rejected": -0.004589825868606567, "step": 2101 }, { "epoch": 1.5357077625570776, "grad_norm": 35.15903981986265, "learning_rate": 3.8686351805361424e-07, "logits/chosen": -2.616640567779541, "logits/rejected": -2.132303476333618, "logps/chosen": -669.9718017578125, "logps/rejected": -463.62408447265625, "loss": 0.1575, "rewards/accuracies": 0.75, "rewards/chosen": 1.8985153436660767, "rewards/margins": 2.907374143600464, "rewards/rejected": -1.0088587999343872, "step": 2102 }, { "epoch": 1.5364383561643835, "grad_norm": 27.377684300470886, "learning_rate": 3.867300114808094e-07, "logits/chosen": -2.4954171180725098, "logits/rejected": -1.7706326246261597, "logps/chosen": -508.3323974609375, "logps/rejected": -410.3906555175781, "loss": 0.1385, "rewards/accuracies": 1.0, "rewards/chosen": 4.156904220581055, "rewards/margins": 5.424143314361572, "rewards/rejected": -1.2672388553619385, "step": 2103 }, { "epoch": 1.5371689497716896, "grad_norm": 34.555530914701265, "learning_rate": 3.8659644925001794e-07, "logits/chosen": -2.8349146842956543, "logits/rejected": -2.7501914501190186, "logps/chosen": -439.68182373046875, "logps/rejected": -515.3914184570312, "loss": 0.1568, "rewards/accuracies": 0.875, "rewards/chosen": 2.68017315864563, "rewards/margins": 2.0891919136047363, "rewards/rejected": 0.5909812450408936, "step": 2104 }, { "epoch": 1.5378995433789955, "grad_norm": 50.78818942969787, "learning_rate": 3.864628314156083e-07, "logits/chosen": -2.5017635822296143, "logits/rejected": -2.3176474571228027, "logps/chosen": -909.204833984375, "logps/rejected": -694.6162719726562, "loss": 0.2167, "rewards/accuracies": 1.0, "rewards/chosen": 3.9827067852020264, "rewards/margins": 3.5320725440979004, "rewards/rejected": 0.45063403248786926, "step": 2105 }, { "epoch": 1.5386301369863014, "grad_norm": 23.713026318618635, "learning_rate": 3.8632915803197164e-07, "logits/chosen": -2.684340238571167, "logits/rejected": -1.9656351804733276, "logps/chosen": -839.8438720703125, "logps/rejected": -653.1931762695312, "loss": 0.1489, "rewards/accuracies": 1.0, "rewards/chosen": 3.5125179290771484, "rewards/margins": 5.636085510253906, "rewards/rejected": -2.123567581176758, "step": 2106 }, { "epoch": 1.5393607305936072, "grad_norm": 43.33243109913102, "learning_rate": 3.8619542915352164e-07, "logits/chosen": -3.006941556930542, "logits/rejected": -2.7166831493377686, "logps/chosen": -956.059326171875, "logps/rejected": -937.7825317382812, "loss": 0.1845, "rewards/accuracies": 0.875, "rewards/chosen": 4.311374187469482, "rewards/margins": 2.599632501602173, "rewards/rejected": 1.7117416858673096, "step": 2107 }, { "epoch": 1.540091324200913, "grad_norm": 38.02946213443434, "learning_rate": 3.860616448346947e-07, "logits/chosen": -2.180797815322876, "logits/rejected": -2.03407621383667, "logps/chosen": -746.4674072265625, "logps/rejected": -804.9462890625, "loss": 0.2193, "rewards/accuracies": 1.0, "rewards/chosen": 3.54396390914917, "rewards/margins": 5.418026924133301, "rewards/rejected": -1.874063491821289, "step": 2108 }, { "epoch": 1.5408219178082192, "grad_norm": 23.444655157812623, "learning_rate": 3.8592780512994967e-07, "logits/chosen": -2.9442594051361084, "logits/rejected": -1.9507555961608887, "logps/chosen": -503.28448486328125, "logps/rejected": -377.45758056640625, "loss": 0.1157, "rewards/accuracies": 1.0, "rewards/chosen": 3.579482078552246, "rewards/margins": 4.0057692527771, "rewards/rejected": -0.4262872040271759, "step": 2109 }, { "epoch": 1.541552511415525, "grad_norm": 26.486815429597467, "learning_rate": 3.85793910093768e-07, "logits/chosen": -2.9532432556152344, "logits/rejected": -2.7123804092407227, "logps/chosen": -597.2657470703125, "logps/rejected": -516.3098754882812, "loss": 0.1339, "rewards/accuracies": 1.0, "rewards/chosen": 1.7812567949295044, "rewards/margins": 1.753007173538208, "rewards/rejected": 0.02824963629245758, "step": 2110 }, { "epoch": 1.5422831050228312, "grad_norm": 20.827453922984983, "learning_rate": 3.856599597806537e-07, "logits/chosen": -2.21012020111084, "logits/rejected": -2.6511390209198, "logps/chosen": -448.4278564453125, "logps/rejected": -711.93994140625, "loss": 0.1187, "rewards/accuracies": 1.0, "rewards/chosen": 1.6222305297851562, "rewards/margins": 2.4819130897521973, "rewards/rejected": -0.8596826791763306, "step": 2111 }, { "epoch": 1.543013698630137, "grad_norm": 44.699850784496476, "learning_rate": 3.8552595424513316e-07, "logits/chosen": -3.1517670154571533, "logits/rejected": -2.427980661392212, "logps/chosen": -765.9453125, "logps/rejected": -634.33837890625, "loss": 0.2048, "rewards/accuracies": 1.0, "rewards/chosen": 4.255034446716309, "rewards/margins": 3.5810322761535645, "rewards/rejected": 0.674002468585968, "step": 2112 }, { "epoch": 1.543744292237443, "grad_norm": 42.30408226136767, "learning_rate": 3.8539189354175547e-07, "logits/chosen": -2.7334885597229004, "logits/rejected": -2.6472864151000977, "logps/chosen": -711.440185546875, "logps/rejected": -670.2012939453125, "loss": 0.2293, "rewards/accuracies": 0.875, "rewards/chosen": 3.1697139739990234, "rewards/margins": 3.406709671020508, "rewards/rejected": -0.23699572682380676, "step": 2113 }, { "epoch": 1.5444748858447488, "grad_norm": 28.7203736619364, "learning_rate": 3.8525777772509184e-07, "logits/chosen": -2.606452703475952, "logits/rejected": -2.051248550415039, "logps/chosen": -445.7181396484375, "logps/rejected": -446.39398193359375, "loss": 0.114, "rewards/accuracies": 1.0, "rewards/chosen": 3.837798595428467, "rewards/margins": 6.002185821533203, "rewards/rejected": -2.1643874645233154, "step": 2114 }, { "epoch": 1.5452054794520547, "grad_norm": 30.58596942846354, "learning_rate": 3.851236068497362e-07, "logits/chosen": -2.9642927646636963, "logits/rejected": -2.0896570682525635, "logps/chosen": -470.96441650390625, "logps/rejected": -354.9498291015625, "loss": 0.1244, "rewards/accuracies": 1.0, "rewards/chosen": 2.063732862472534, "rewards/margins": 3.853269338607788, "rewards/rejected": -1.789536714553833, "step": 2115 }, { "epoch": 1.5459360730593608, "grad_norm": 34.58525542161175, "learning_rate": 3.849893809703049e-07, "logits/chosen": -2.4755282402038574, "logits/rejected": -2.37042236328125, "logps/chosen": -481.3847961425781, "logps/rejected": -596.228515625, "loss": 0.1396, "rewards/accuracies": 0.875, "rewards/chosen": 2.558293581008911, "rewards/margins": 3.585517168045044, "rewards/rejected": -1.0272235870361328, "step": 2116 }, { "epoch": 1.5466666666666666, "grad_norm": 36.527946747041334, "learning_rate": 3.848551001414365e-07, "logits/chosen": -2.7778611183166504, "logits/rejected": -2.82987117767334, "logps/chosen": -742.51171875, "logps/rejected": -726.4317626953125, "loss": 0.1905, "rewards/accuracies": 0.875, "rewards/chosen": 2.2431087493896484, "rewards/margins": 2.3711142539978027, "rewards/rejected": -0.12800580263137817, "step": 2117 }, { "epoch": 1.5473972602739727, "grad_norm": 58.920723105950806, "learning_rate": 3.84720764417792e-07, "logits/chosen": -3.0648043155670166, "logits/rejected": -2.247535228729248, "logps/chosen": -762.9102783203125, "logps/rejected": -585.0645751953125, "loss": 0.3088, "rewards/accuracies": 0.875, "rewards/chosen": 2.108548164367676, "rewards/margins": 2.169504165649414, "rewards/rejected": -0.060956209897994995, "step": 2118 }, { "epoch": 1.5481278538812786, "grad_norm": 31.604050041156217, "learning_rate": 3.845863738540547e-07, "logits/chosen": -2.586733341217041, "logits/rejected": -2.267286539077759, "logps/chosen": -511.5462951660156, "logps/rejected": -536.4591064453125, "loss": 0.1355, "rewards/accuracies": 1.0, "rewards/chosen": 3.009537696838379, "rewards/margins": 4.4250006675720215, "rewards/rejected": -1.4154630899429321, "step": 2119 }, { "epoch": 1.5488584474885845, "grad_norm": 47.82148863628803, "learning_rate": 3.844519285049304e-07, "logits/chosen": -2.932419538497925, "logits/rejected": -2.554845094680786, "logps/chosen": -635.1802978515625, "logps/rejected": -618.942626953125, "loss": 0.2283, "rewards/accuracies": 0.875, "rewards/chosen": 2.854118824005127, "rewards/margins": 2.789341449737549, "rewards/rejected": 0.06477728486061096, "step": 2120 }, { "epoch": 1.5495890410958904, "grad_norm": 27.619946600153167, "learning_rate": 3.84317428425147e-07, "logits/chosen": -2.4243719577789307, "logits/rejected": -2.5626063346862793, "logps/chosen": -550.799560546875, "logps/rejected": -536.5814819335938, "loss": 0.1447, "rewards/accuracies": 1.0, "rewards/chosen": 0.7978070974349976, "rewards/margins": 1.480358362197876, "rewards/rejected": -0.6825512051582336, "step": 2121 }, { "epoch": 1.5503196347031962, "grad_norm": 20.29275106656585, "learning_rate": 3.841828736694548e-07, "logits/chosen": -2.957400321960449, "logits/rejected": -1.7276806831359863, "logps/chosen": -835.6493530273438, "logps/rejected": -437.4551086425781, "loss": 0.0771, "rewards/accuracies": 1.0, "rewards/chosen": 5.374288558959961, "rewards/margins": 6.571524620056152, "rewards/rejected": -1.1972365379333496, "step": 2122 }, { "epoch": 1.5510502283105023, "grad_norm": 27.433693589185406, "learning_rate": 3.840482642926263e-07, "logits/chosen": -2.9899072647094727, "logits/rejected": -2.6284019947052, "logps/chosen": -562.9577026367188, "logps/rejected": -560.2684326171875, "loss": 0.1465, "rewards/accuracies": 0.875, "rewards/chosen": 3.133214235305786, "rewards/margins": 3.5670268535614014, "rewards/rejected": -0.43381267786026, "step": 2123 }, { "epoch": 1.5517808219178082, "grad_norm": 34.69439155921404, "learning_rate": 3.839136003494562e-07, "logits/chosen": -2.349452495574951, "logits/rejected": -2.359480142593384, "logps/chosen": -622.3114013671875, "logps/rejected": -705.7979736328125, "loss": 0.1358, "rewards/accuracies": 0.875, "rewards/chosen": 3.101457357406616, "rewards/margins": 4.035547256469727, "rewards/rejected": -0.9340894222259521, "step": 2124 }, { "epoch": 1.5525114155251143, "grad_norm": 24.65621534376525, "learning_rate": 3.837788818947616e-07, "logits/chosen": -2.676940441131592, "logits/rejected": -2.3464574813842773, "logps/chosen": -928.999267578125, "logps/rejected": -722.8211669921875, "loss": 0.1157, "rewards/accuracies": 1.0, "rewards/chosen": 2.346463918685913, "rewards/margins": 2.5248563289642334, "rewards/rejected": -0.1783924102783203, "step": 2125 }, { "epoch": 1.5532420091324202, "grad_norm": 28.430846792109556, "learning_rate": 3.836441089833815e-07, "logits/chosen": -2.4097719192504883, "logits/rejected": -2.620145320892334, "logps/chosen": -423.95947265625, "logps/rejected": -463.20074462890625, "loss": 0.1362, "rewards/accuracies": 1.0, "rewards/chosen": 2.0695230960845947, "rewards/margins": 3.9021472930908203, "rewards/rejected": -1.8326244354248047, "step": 2126 }, { "epoch": 1.553972602739726, "grad_norm": 38.19963717463083, "learning_rate": 3.8350928167017724e-07, "logits/chosen": -3.06168794631958, "logits/rejected": -2.353254795074463, "logps/chosen": -526.8292846679688, "logps/rejected": -403.01007080078125, "loss": 0.216, "rewards/accuracies": 0.875, "rewards/chosen": 3.4982070922851562, "rewards/margins": 5.070396900177002, "rewards/rejected": -1.5721895694732666, "step": 2127 }, { "epoch": 1.554703196347032, "grad_norm": 24.352516172775914, "learning_rate": 3.833744000100324e-07, "logits/chosen": -2.843243360519409, "logits/rejected": -2.407939910888672, "logps/chosen": -708.5069580078125, "logps/rejected": -674.23095703125, "loss": 0.1341, "rewards/accuracies": 1.0, "rewards/chosen": 2.6378583908081055, "rewards/margins": 2.782036542892456, "rewards/rejected": -0.14417782425880432, "step": 2128 }, { "epoch": 1.5554337899543378, "grad_norm": 35.167859194333964, "learning_rate": 3.8323946405785256e-07, "logits/chosen": -2.852663993835449, "logits/rejected": -2.2962594032287598, "logps/chosen": -693.7881469726562, "logps/rejected": -598.4368286132812, "loss": 0.2125, "rewards/accuracies": 1.0, "rewards/chosen": 2.673685073852539, "rewards/margins": 3.6703312397003174, "rewards/rejected": -0.9966461658477783, "step": 2129 }, { "epoch": 1.5561643835616439, "grad_norm": 46.17456897809117, "learning_rate": 3.831044738685653e-07, "logits/chosen": -2.849273681640625, "logits/rejected": -2.4104132652282715, "logps/chosen": -661.704833984375, "logps/rejected": -600.135986328125, "loss": 0.1885, "rewards/accuracies": 1.0, "rewards/chosen": 4.285911560058594, "rewards/margins": 5.687513828277588, "rewards/rejected": -1.401601791381836, "step": 2130 }, { "epoch": 1.5568949771689498, "grad_norm": 65.50196800478594, "learning_rate": 3.829694294971204e-07, "logits/chosen": -2.8533928394317627, "logits/rejected": -2.170011281967163, "logps/chosen": -571.3439331054688, "logps/rejected": -365.36865234375, "loss": 0.2103, "rewards/accuracies": 1.0, "rewards/chosen": 2.373872995376587, "rewards/margins": 4.576124668121338, "rewards/rejected": -2.202251672744751, "step": 2131 }, { "epoch": 1.5576255707762559, "grad_norm": 33.85665836183021, "learning_rate": 3.828343309984897e-07, "logits/chosen": -2.7735021114349365, "logits/rejected": -2.325432777404785, "logps/chosen": -663.9922485351562, "logps/rejected": -572.9677734375, "loss": 0.1667, "rewards/accuracies": 0.875, "rewards/chosen": 3.449112892150879, "rewards/margins": 3.7275729179382324, "rewards/rejected": -0.27845993638038635, "step": 2132 }, { "epoch": 1.5583561643835617, "grad_norm": 35.81000652586945, "learning_rate": 3.826991784276671e-07, "logits/chosen": -2.999283790588379, "logits/rejected": -2.351518154144287, "logps/chosen": -805.9443359375, "logps/rejected": -732.58154296875, "loss": 0.2641, "rewards/accuracies": 0.875, "rewards/chosen": 2.4464688301086426, "rewards/margins": 2.085254669189453, "rewards/rejected": 0.3612141013145447, "step": 2133 }, { "epoch": 1.5590867579908676, "grad_norm": 31.656137208276355, "learning_rate": 3.825639718396684e-07, "logits/chosen": -2.3038909435272217, "logits/rejected": -2.6335906982421875, "logps/chosen": -602.7747802734375, "logps/rejected": -773.0162353515625, "loss": 0.2262, "rewards/accuracies": 0.875, "rewards/chosen": 1.9303724765777588, "rewards/margins": 2.3645424842834473, "rewards/rejected": -0.43416985869407654, "step": 2134 }, { "epoch": 1.5598173515981735, "grad_norm": 37.00243967621848, "learning_rate": 3.824287112895316e-07, "logits/chosen": -2.905600070953369, "logits/rejected": -2.271409511566162, "logps/chosen": -563.689208984375, "logps/rejected": -498.7729187011719, "loss": 0.2139, "rewards/accuracies": 1.0, "rewards/chosen": 1.3515896797180176, "rewards/margins": 2.4489166736602783, "rewards/rejected": -1.0973272323608398, "step": 2135 }, { "epoch": 1.5605479452054793, "grad_norm": 27.47143735965537, "learning_rate": 3.8229339683231633e-07, "logits/chosen": -3.2340126037597656, "logits/rejected": -2.7005720138549805, "logps/chosen": -1041.4051513671875, "logps/rejected": -842.6036376953125, "loss": 0.1213, "rewards/accuracies": 1.0, "rewards/chosen": 3.4739880561828613, "rewards/margins": 2.96545672416687, "rewards/rejected": 0.5085311532020569, "step": 2136 }, { "epoch": 1.5612785388127854, "grad_norm": 28.690310404957497, "learning_rate": 3.8215802852310444e-07, "logits/chosen": -2.602573871612549, "logits/rejected": -2.246229410171509, "logps/chosen": -400.89825439453125, "logps/rejected": -312.516845703125, "loss": 0.1667, "rewards/accuracies": 0.75, "rewards/chosen": 0.9447146654129028, "rewards/margins": 2.104613780975342, "rewards/rejected": -1.1598992347717285, "step": 2137 }, { "epoch": 1.5620091324200913, "grad_norm": 31.655720743046093, "learning_rate": 3.8202260641699957e-07, "logits/chosen": -2.304229259490967, "logits/rejected": -2.6190848350524902, "logps/chosen": -657.2273559570312, "logps/rejected": -807.0894775390625, "loss": 0.1706, "rewards/accuracies": 0.875, "rewards/chosen": 2.581171989440918, "rewards/margins": 4.22364616394043, "rewards/rejected": -1.6424739360809326, "step": 2138 }, { "epoch": 1.5627397260273974, "grad_norm": 37.11615891939368, "learning_rate": 3.818871305691274e-07, "logits/chosen": -3.1814754009246826, "logits/rejected": -2.1542046070098877, "logps/chosen": -474.59661865234375, "logps/rejected": -378.2197265625, "loss": 0.2123, "rewards/accuracies": 0.875, "rewards/chosen": 1.3948888778686523, "rewards/margins": 2.384310483932495, "rewards/rejected": -0.9894217252731323, "step": 2139 }, { "epoch": 1.5634703196347033, "grad_norm": 44.749945443402495, "learning_rate": 3.817516010346353e-07, "logits/chosen": -3.324944496154785, "logits/rejected": -1.7603623867034912, "logps/chosen": -595.8671875, "logps/rejected": -319.9993896484375, "loss": 0.1643, "rewards/accuracies": 1.0, "rewards/chosen": 3.4221975803375244, "rewards/margins": 4.7740397453308105, "rewards/rejected": -1.3518420457839966, "step": 2140 }, { "epoch": 1.5642009132420092, "grad_norm": 34.67952822248848, "learning_rate": 3.816160178686927e-07, "logits/chosen": -2.045191526412964, "logits/rejected": -1.8344042301177979, "logps/chosen": -585.0574340820312, "logps/rejected": -450.4253234863281, "loss": 0.1642, "rewards/accuracies": 0.875, "rewards/chosen": 2.595916748046875, "rewards/margins": 3.2263097763061523, "rewards/rejected": -0.6303932666778564, "step": 2141 }, { "epoch": 1.564931506849315, "grad_norm": 43.61716860894271, "learning_rate": 3.814803811264906e-07, "logits/chosen": -3.2091503143310547, "logits/rejected": -2.0181539058685303, "logps/chosen": -883.1736450195312, "logps/rejected": -492.93585205078125, "loss": 0.1962, "rewards/accuracies": 1.0, "rewards/chosen": 2.5083420276641846, "rewards/margins": 2.6923036575317383, "rewards/rejected": -0.18396149575710297, "step": 2142 }, { "epoch": 1.565662100456621, "grad_norm": 25.70877478749656, "learning_rate": 3.81344690863242e-07, "logits/chosen": -3.4030611515045166, "logits/rejected": -2.4105587005615234, "logps/chosen": -745.1329345703125, "logps/rejected": -554.2236328125, "loss": 0.135, "rewards/accuracies": 1.0, "rewards/chosen": 2.9713401794433594, "rewards/margins": 3.8504490852355957, "rewards/rejected": -0.8791090250015259, "step": 2143 }, { "epoch": 1.5663926940639268, "grad_norm": 24.472553683444286, "learning_rate": 3.812089471341817e-07, "logits/chosen": -3.0733802318573, "logits/rejected": -2.29927659034729, "logps/chosen": -591.965576171875, "logps/rejected": -487.6360778808594, "loss": 0.1178, "rewards/accuracies": 1.0, "rewards/chosen": 3.396170139312744, "rewards/margins": 3.8458914756774902, "rewards/rejected": -0.44972142577171326, "step": 2144 }, { "epoch": 1.5671232876712329, "grad_norm": 18.3989666192493, "learning_rate": 3.8107314999456613e-07, "logits/chosen": -2.428208351135254, "logits/rejected": -1.7160788774490356, "logps/chosen": -545.192138671875, "logps/rejected": -480.9190673828125, "loss": 0.1163, "rewards/accuracies": 1.0, "rewards/chosen": 2.3484044075012207, "rewards/margins": 3.2678561210632324, "rewards/rejected": -0.9194516539573669, "step": 2145 }, { "epoch": 1.567853881278539, "grad_norm": 54.726813993890374, "learning_rate": 3.809372994996737e-07, "logits/chosen": -1.8118152618408203, "logits/rejected": -2.366250991821289, "logps/chosen": -274.76727294921875, "logps/rejected": -585.3365478515625, "loss": 0.2556, "rewards/accuracies": 1.0, "rewards/chosen": 1.5689185857772827, "rewards/margins": 3.989163637161255, "rewards/rejected": -2.4202451705932617, "step": 2146 }, { "epoch": 1.5685844748858448, "grad_norm": 22.550237562192734, "learning_rate": 3.808013957048041e-07, "logits/chosen": -2.634063720703125, "logits/rejected": -1.8430075645446777, "logps/chosen": -485.1082458496094, "logps/rejected": -414.53033447265625, "loss": 0.0963, "rewards/accuracies": 1.0, "rewards/chosen": 1.783642292022705, "rewards/margins": 3.6922032833099365, "rewards/rejected": -1.9085609912872314, "step": 2147 }, { "epoch": 1.5693150684931507, "grad_norm": 35.53398289566886, "learning_rate": 3.806654386652792e-07, "logits/chosen": -2.3846559524536133, "logits/rejected": -1.8638124465942383, "logps/chosen": -638.4375610351562, "logps/rejected": -651.6934814453125, "loss": 0.1584, "rewards/accuracies": 1.0, "rewards/chosen": 3.0673065185546875, "rewards/margins": 4.174360275268555, "rewards/rejected": -1.107053518295288, "step": 2148 }, { "epoch": 1.5700456621004566, "grad_norm": 23.99693278740275, "learning_rate": 3.805294284364423e-07, "logits/chosen": -2.7079012393951416, "logits/rejected": -1.9903428554534912, "logps/chosen": -412.3992614746094, "logps/rejected": -354.1494140625, "loss": 0.1223, "rewards/accuracies": 1.0, "rewards/chosen": 2.8813092708587646, "rewards/margins": 4.387206077575684, "rewards/rejected": -1.5058965682983398, "step": 2149 }, { "epoch": 1.5707762557077625, "grad_norm": 32.799556598922756, "learning_rate": 3.8039336507365837e-07, "logits/chosen": -2.1036553382873535, "logits/rejected": -2.2504329681396484, "logps/chosen": -328.31658935546875, "logps/rejected": -537.1773681640625, "loss": 0.2517, "rewards/accuracies": 0.625, "rewards/chosen": 0.5059656500816345, "rewards/margins": 1.149040937423706, "rewards/rejected": -0.6430754065513611, "step": 2150 }, { "epoch": 1.5715068493150683, "grad_norm": 36.91066700653332, "learning_rate": 3.8025724863231403e-07, "logits/chosen": -2.617936611175537, "logits/rejected": -1.7265129089355469, "logps/chosen": -392.8316955566406, "logps/rejected": -217.27825927734375, "loss": 0.2081, "rewards/accuracies": 0.75, "rewards/chosen": 1.2697949409484863, "rewards/margins": 2.614096164703369, "rewards/rejected": -1.3443012237548828, "step": 2151 }, { "epoch": 1.5722374429223744, "grad_norm": 42.52059904062309, "learning_rate": 3.801210791678175e-07, "logits/chosen": -3.3367202281951904, "logits/rejected": -2.2365782260894775, "logps/chosen": -634.6553955078125, "logps/rejected": -491.78387451171875, "loss": 0.1589, "rewards/accuracies": 1.0, "rewards/chosen": 3.3541855812072754, "rewards/margins": 2.9515509605407715, "rewards/rejected": 0.40263473987579346, "step": 2152 }, { "epoch": 1.5729680365296803, "grad_norm": 18.85428761255522, "learning_rate": 3.7998485673559854e-07, "logits/chosen": -2.7069361209869385, "logits/rejected": -2.7758164405822754, "logps/chosen": -457.98895263671875, "logps/rejected": -565.2324829101562, "loss": 0.106, "rewards/accuracies": 1.0, "rewards/chosen": 1.3259963989257812, "rewards/margins": 2.288763999938965, "rewards/rejected": -0.9627677202224731, "step": 2153 }, { "epoch": 1.5736986301369864, "grad_norm": 32.04774346702315, "learning_rate": 3.7984858139110867e-07, "logits/chosen": -2.764944553375244, "logits/rejected": -1.516052484512329, "logps/chosen": -909.85791015625, "logps/rejected": -477.5727844238281, "loss": 0.1162, "rewards/accuracies": 1.0, "rewards/chosen": 5.541105270385742, "rewards/margins": 6.154959678649902, "rewards/rejected": -0.6138544082641602, "step": 2154 }, { "epoch": 1.5744292237442923, "grad_norm": 34.134265354465484, "learning_rate": 3.797122531898206e-07, "logits/chosen": -3.039987325668335, "logits/rejected": -2.307896614074707, "logps/chosen": -699.7493896484375, "logps/rejected": -473.20013427734375, "loss": 0.1621, "rewards/accuracies": 1.0, "rewards/chosen": 2.4418435096740723, "rewards/margins": 4.309331893920898, "rewards/rejected": -1.867487907409668, "step": 2155 }, { "epoch": 1.5751598173515982, "grad_norm": 36.64072328078503, "learning_rate": 3.7957587218722887e-07, "logits/chosen": -2.2877891063690186, "logits/rejected": -2.0354843139648438, "logps/chosen": -341.5306396484375, "logps/rejected": -439.09002685546875, "loss": 0.2349, "rewards/accuracies": 0.875, "rewards/chosen": 2.523557424545288, "rewards/margins": 3.962759017944336, "rewards/rejected": -1.4392014741897583, "step": 2156 }, { "epoch": 1.575890410958904, "grad_norm": 32.765187243737635, "learning_rate": 3.794394384388494e-07, "logits/chosen": -2.8388803005218506, "logits/rejected": -2.27262020111084, "logps/chosen": -731.2322998046875, "logps/rejected": -623.873779296875, "loss": 0.1903, "rewards/accuracies": 1.0, "rewards/chosen": 3.7437336444854736, "rewards/margins": 3.4985389709472656, "rewards/rejected": 0.245194673538208, "step": 2157 }, { "epoch": 1.57662100456621, "grad_norm": 31.50885660308215, "learning_rate": 3.7930295200021957e-07, "logits/chosen": -2.8726792335510254, "logits/rejected": -2.3730924129486084, "logps/chosen": -392.6484375, "logps/rejected": -366.897705078125, "loss": 0.1597, "rewards/accuracies": 0.875, "rewards/chosen": 1.7640553712844849, "rewards/margins": 3.0205929279327393, "rewards/rejected": -1.2565374374389648, "step": 2158 }, { "epoch": 1.577351598173516, "grad_norm": 45.00082600153371, "learning_rate": 3.791664129268982e-07, "logits/chosen": -2.8735008239746094, "logits/rejected": -2.1793460845947266, "logps/chosen": -799.3126220703125, "logps/rejected": -574.6314697265625, "loss": 0.248, "rewards/accuracies": 0.75, "rewards/chosen": 2.7355337142944336, "rewards/margins": 2.1229190826416016, "rewards/rejected": 0.612614631652832, "step": 2159 }, { "epoch": 1.5780821917808219, "grad_norm": 39.54152225726616, "learning_rate": 3.790298212744655e-07, "logits/chosen": -2.7884368896484375, "logits/rejected": -2.4812166690826416, "logps/chosen": -619.36376953125, "logps/rejected": -640.1373291015625, "loss": 0.2095, "rewards/accuracies": 1.0, "rewards/chosen": 2.0419397354125977, "rewards/margins": 2.9895918369293213, "rewards/rejected": -0.947652280330658, "step": 2160 }, { "epoch": 1.578812785388128, "grad_norm": 53.694018357931405, "learning_rate": 3.788931770985232e-07, "logits/chosen": -2.395148277282715, "logits/rejected": -2.63144588470459, "logps/chosen": -546.4022216796875, "logps/rejected": -678.6353759765625, "loss": 0.2933, "rewards/accuracies": 0.875, "rewards/chosen": 1.664250373840332, "rewards/margins": 3.870187282562256, "rewards/rejected": -2.205936908721924, "step": 2161 }, { "epoch": 1.5795433789954338, "grad_norm": 49.690612618701685, "learning_rate": 3.787564804546943e-07, "logits/chosen": -2.606689929962158, "logits/rejected": -1.886310338973999, "logps/chosen": -582.0225830078125, "logps/rejected": -444.7523498535156, "loss": 0.217, "rewards/accuracies": 1.0, "rewards/chosen": 3.9915738105773926, "rewards/margins": 5.043895721435547, "rewards/rejected": -1.0523221492767334, "step": 2162 }, { "epoch": 1.5802739726027397, "grad_norm": 24.237798708432017, "learning_rate": 3.7861973139862336e-07, "logits/chosen": -2.7138359546661377, "logits/rejected": -0.9531159400939941, "logps/chosen": -671.187255859375, "logps/rejected": -241.8540496826172, "loss": 0.1477, "rewards/accuracies": 1.0, "rewards/chosen": 3.472144603729248, "rewards/margins": 5.255445957183838, "rewards/rejected": -1.783301591873169, "step": 2163 }, { "epoch": 1.5810045662100456, "grad_norm": 36.31374715478802, "learning_rate": 3.7848292998597597e-07, "logits/chosen": -2.7726638317108154, "logits/rejected": -2.08256459236145, "logps/chosen": -616.0465698242188, "logps/rejected": -505.2021484375, "loss": 0.1753, "rewards/accuracies": 0.875, "rewards/chosen": 3.573362112045288, "rewards/margins": 4.540532112121582, "rewards/rejected": -0.9671701192855835, "step": 2164 }, { "epoch": 1.5817351598173515, "grad_norm": 22.429941928633585, "learning_rate": 3.7834607627243915e-07, "logits/chosen": -3.0804991722106934, "logits/rejected": -2.4301342964172363, "logps/chosen": -673.257568359375, "logps/rejected": -584.3799438476562, "loss": 0.0986, "rewards/accuracies": 1.0, "rewards/chosen": 2.3596372604370117, "rewards/margins": 2.8052260875701904, "rewards/rejected": -0.44558876752853394, "step": 2165 }, { "epoch": 1.5824657534246576, "grad_norm": 24.97429953386143, "learning_rate": 3.7820917031372137e-07, "logits/chosen": -3.3114168643951416, "logits/rejected": -2.6657357215881348, "logps/chosen": -968.4982299804688, "logps/rejected": -682.111328125, "loss": 0.1026, "rewards/accuracies": 1.0, "rewards/chosen": 5.836081504821777, "rewards/margins": 5.489518165588379, "rewards/rejected": 0.34656351804733276, "step": 2166 }, { "epoch": 1.5831963470319634, "grad_norm": 45.944809686289645, "learning_rate": 3.7807221216555214e-07, "logits/chosen": -2.8764514923095703, "logits/rejected": -2.2705931663513184, "logps/chosen": -768.2365112304688, "logps/rejected": -712.2527465820312, "loss": 0.2291, "rewards/accuracies": 1.0, "rewards/chosen": 4.0233073234558105, "rewards/margins": 3.517693519592285, "rewards/rejected": 0.5056138634681702, "step": 2167 }, { "epoch": 1.5839269406392695, "grad_norm": 37.379444891418935, "learning_rate": 3.7793520188368233e-07, "logits/chosen": -2.852182388305664, "logits/rejected": -2.1928915977478027, "logps/chosen": -565.192138671875, "logps/rejected": -506.89471435546875, "loss": 0.2153, "rewards/accuracies": 1.0, "rewards/chosen": 2.2639083862304688, "rewards/margins": 2.7225215435028076, "rewards/rejected": -0.4586133360862732, "step": 2168 }, { "epoch": 1.5846575342465754, "grad_norm": 27.034956167533707, "learning_rate": 3.77798139523884e-07, "logits/chosen": -2.3329498767852783, "logits/rejected": -2.0484960079193115, "logps/chosen": -531.4287719726562, "logps/rejected": -485.34088134765625, "loss": 0.1459, "rewards/accuracies": 0.875, "rewards/chosen": 3.3577029705047607, "rewards/margins": 5.532161235809326, "rewards/rejected": -2.1744582653045654, "step": 2169 }, { "epoch": 1.5853881278538813, "grad_norm": 63.56521933711076, "learning_rate": 3.776610251419505e-07, "logits/chosen": -2.6517159938812256, "logits/rejected": -2.067648410797119, "logps/chosen": -529.6209716796875, "logps/rejected": -473.5425109863281, "loss": 0.3491, "rewards/accuracies": 0.875, "rewards/chosen": 2.1230337619781494, "rewards/margins": 3.3928678035736084, "rewards/rejected": -1.269834041595459, "step": 2170 }, { "epoch": 1.5861187214611872, "grad_norm": 35.275974957181376, "learning_rate": 3.7752385879369626e-07, "logits/chosen": -2.8667709827423096, "logits/rejected": -1.7934248447418213, "logps/chosen": -500.7159423828125, "logps/rejected": -244.8832244873047, "loss": 0.2117, "rewards/accuracies": 1.0, "rewards/chosen": 1.5447038412094116, "rewards/margins": 1.769634485244751, "rewards/rejected": -0.22493073344230652, "step": 2171 }, { "epoch": 1.586849315068493, "grad_norm": 35.36305320849095, "learning_rate": 3.7738664053495685e-07, "logits/chosen": -3.041516065597534, "logits/rejected": -2.2171411514282227, "logps/chosen": -695.665771484375, "logps/rejected": -584.0693359375, "loss": 0.1909, "rewards/accuracies": 0.875, "rewards/chosen": 2.4791266918182373, "rewards/margins": 3.097043514251709, "rewards/rejected": -0.6179167032241821, "step": 2172 }, { "epoch": 1.5875799086757991, "grad_norm": 25.342278490726965, "learning_rate": 3.7724937042158896e-07, "logits/chosen": -2.620879650115967, "logits/rejected": -2.2696616649627686, "logps/chosen": -468.0501403808594, "logps/rejected": -392.5856018066406, "loss": 0.15, "rewards/accuracies": 1.0, "rewards/chosen": 1.8744783401489258, "rewards/margins": 2.2678277492523193, "rewards/rejected": -0.3933492600917816, "step": 2173 }, { "epoch": 1.588310502283105, "grad_norm": 26.41491282644912, "learning_rate": 3.7711204850947056e-07, "logits/chosen": -2.3438446521759033, "logits/rejected": -1.8643527030944824, "logps/chosen": -440.8923034667969, "logps/rejected": -411.2909851074219, "loss": 0.1532, "rewards/accuracies": 0.875, "rewards/chosen": 2.844552516937256, "rewards/margins": 4.776223182678223, "rewards/rejected": -1.9316705465316772, "step": 2174 }, { "epoch": 1.589041095890411, "grad_norm": 51.398016532893806, "learning_rate": 3.769746748545004e-07, "logits/chosen": -2.6915488243103027, "logits/rejected": -2.3508503437042236, "logps/chosen": -650.5391845703125, "logps/rejected": -686.5211791992188, "loss": 0.256, "rewards/accuracies": 0.875, "rewards/chosen": 2.4089407920837402, "rewards/margins": 2.7055020332336426, "rewards/rejected": -0.29656147956848145, "step": 2175 }, { "epoch": 1.589771689497717, "grad_norm": 34.45086798162711, "learning_rate": 3.7683724951259867e-07, "logits/chosen": -2.207275152206421, "logits/rejected": -1.8255391120910645, "logps/chosen": -296.61798095703125, "logps/rejected": -273.3734436035156, "loss": 0.1581, "rewards/accuracies": 0.75, "rewards/chosen": 0.839816689491272, "rewards/margins": 2.615149736404419, "rewards/rejected": -1.7753329277038574, "step": 2176 }, { "epoch": 1.5905022831050228, "grad_norm": 52.33045073586458, "learning_rate": 3.7669977253970626e-07, "logits/chosen": -2.8885703086853027, "logits/rejected": -2.3132777214050293, "logps/chosen": -566.6331176757812, "logps/rejected": -424.59051513671875, "loss": 0.3197, "rewards/accuracies": 0.875, "rewards/chosen": 2.8534607887268066, "rewards/margins": 3.403564691543579, "rewards/rejected": -0.5501035451889038, "step": 2177 }, { "epoch": 1.5912328767123287, "grad_norm": 29.263641567197983, "learning_rate": 3.765622439917853e-07, "logits/chosen": -2.5872411727905273, "logits/rejected": -2.061962366104126, "logps/chosen": -957.1018676757812, "logps/rejected": -665.5654296875, "loss": 0.1134, "rewards/accuracies": 0.875, "rewards/chosen": 4.430603981018066, "rewards/margins": 4.447502136230469, "rewards/rejected": -0.01689818501472473, "step": 2178 }, { "epoch": 1.5919634703196346, "grad_norm": 28.986960801097705, "learning_rate": 3.7642466392481874e-07, "logits/chosen": -3.0365490913391113, "logits/rejected": -2.6100168228149414, "logps/chosen": -530.1170043945312, "logps/rejected": -583.5352783203125, "loss": 0.1841, "rewards/accuracies": 0.875, "rewards/chosen": 1.9235094785690308, "rewards/margins": 2.594456672668457, "rewards/rejected": -0.670947253704071, "step": 2179 }, { "epoch": 1.5926940639269407, "grad_norm": 30.15535843079707, "learning_rate": 3.7628703239481066e-07, "logits/chosen": -2.6753604412078857, "logits/rejected": -1.9885804653167725, "logps/chosen": -375.82757568359375, "logps/rejected": -342.6805114746094, "loss": 0.1539, "rewards/accuracies": 0.875, "rewards/chosen": 2.9259727001190186, "rewards/margins": 5.060305118560791, "rewards/rejected": -2.1343326568603516, "step": 2180 }, { "epoch": 1.5934246575342466, "grad_norm": 47.87079524208236, "learning_rate": 3.761493494577861e-07, "logits/chosen": -2.043799638748169, "logits/rejected": -2.0062952041625977, "logps/chosen": -665.6979370117188, "logps/rejected": -564.3523559570312, "loss": 0.2951, "rewards/accuracies": 0.75, "rewards/chosen": 1.3701226711273193, "rewards/margins": 2.087035655975342, "rewards/rejected": -0.7169129252433777, "step": 2181 }, { "epoch": 1.5941552511415527, "grad_norm": 36.35523182238528, "learning_rate": 3.7601161516979074e-07, "logits/chosen": -3.159026622772217, "logits/rejected": -2.22737455368042, "logps/chosen": -549.6510620117188, "logps/rejected": -445.57330322265625, "loss": 0.1729, "rewards/accuracies": 0.875, "rewards/chosen": 2.3008642196655273, "rewards/margins": 3.357144832611084, "rewards/rejected": -1.0562806129455566, "step": 2182 }, { "epoch": 1.5948858447488585, "grad_norm": 34.2489093690405, "learning_rate": 3.758738295868916e-07, "logits/chosen": -3.183927536010742, "logits/rejected": -1.9994100332260132, "logps/chosen": -817.8360595703125, "logps/rejected": -589.8593139648438, "loss": 0.1098, "rewards/accuracies": 0.875, "rewards/chosen": 3.673760414123535, "rewards/margins": 4.1352081298828125, "rewards/rejected": -0.46144789457321167, "step": 2183 }, { "epoch": 1.5956164383561644, "grad_norm": 20.135968175269564, "learning_rate": 3.757359927651762e-07, "logits/chosen": -2.6366519927978516, "logits/rejected": -1.877968668937683, "logps/chosen": -797.8255615234375, "logps/rejected": -575.719482421875, "loss": 0.0831, "rewards/accuracies": 1.0, "rewards/chosen": 4.645515441894531, "rewards/margins": 5.168840408325195, "rewards/rejected": -0.5233244895935059, "step": 2184 }, { "epoch": 1.5963470319634703, "grad_norm": 23.693806878064148, "learning_rate": 3.7559810476075317e-07, "logits/chosen": -2.958486318588257, "logits/rejected": -1.9455273151397705, "logps/chosen": -520.12451171875, "logps/rejected": -295.6961975097656, "loss": 0.1554, "rewards/accuracies": 1.0, "rewards/chosen": 3.034464120864868, "rewards/margins": 4.319027900695801, "rewards/rejected": -1.2845637798309326, "step": 2185 }, { "epoch": 1.5970776255707761, "grad_norm": 41.16796856467644, "learning_rate": 3.7546016562975176e-07, "logits/chosen": -2.4483256340026855, "logits/rejected": -2.5546793937683105, "logps/chosen": -478.45477294921875, "logps/rejected": -485.860107421875, "loss": 0.3319, "rewards/accuracies": 0.875, "rewards/chosen": 1.2670235633850098, "rewards/margins": 1.8351013660430908, "rewards/rejected": -0.5680777430534363, "step": 2186 }, { "epoch": 1.5978082191780822, "grad_norm": 46.424657018624, "learning_rate": 3.753221754283223e-07, "logits/chosen": -3.3221702575683594, "logits/rejected": -2.690896987915039, "logps/chosen": -1061.510498046875, "logps/rejected": -796.61572265625, "loss": 0.1793, "rewards/accuracies": 1.0, "rewards/chosen": 4.540530681610107, "rewards/margins": 5.401251316070557, "rewards/rejected": -0.8607208728790283, "step": 2187 }, { "epoch": 1.5985388127853881, "grad_norm": 37.58844712703459, "learning_rate": 3.7518413421263557e-07, "logits/chosen": -3.327812433242798, "logits/rejected": -2.2094154357910156, "logps/chosen": -622.4423828125, "logps/rejected": -457.3522033691406, "loss": 0.1991, "rewards/accuracies": 0.875, "rewards/chosen": 3.3530962467193604, "rewards/margins": 5.833906173706055, "rewards/rejected": -2.4808101654052734, "step": 2188 }, { "epoch": 1.5992694063926942, "grad_norm": 33.42180560418228, "learning_rate": 3.7504604203888347e-07, "logits/chosen": -3.0440192222595215, "logits/rejected": -2.1693332195281982, "logps/chosen": -655.9948120117188, "logps/rejected": -499.0091552734375, "loss": 0.1505, "rewards/accuracies": 1.0, "rewards/chosen": 2.99627685546875, "rewards/margins": 3.57326602935791, "rewards/rejected": -0.5769893527030945, "step": 2189 }, { "epoch": 1.6, "grad_norm": 27.338564240401553, "learning_rate": 3.7490789896327833e-07, "logits/chosen": -2.8843374252319336, "logits/rejected": -1.4136048555374146, "logps/chosen": -724.7897338867188, "logps/rejected": -248.03021240234375, "loss": 0.1416, "rewards/accuracies": 0.875, "rewards/chosen": 2.656188488006592, "rewards/margins": 3.034135580062866, "rewards/rejected": -0.37794703245162964, "step": 2190 }, { "epoch": 1.600730593607306, "grad_norm": 30.816657538349897, "learning_rate": 3.747697050420534e-07, "logits/chosen": -3.0718021392822266, "logits/rejected": -1.6060982942581177, "logps/chosen": -735.4825439453125, "logps/rejected": -422.1263427734375, "loss": 0.2169, "rewards/accuracies": 1.0, "rewards/chosen": 3.2919650077819824, "rewards/margins": 4.793237686157227, "rewards/rejected": -1.5012725591659546, "step": 2191 }, { "epoch": 1.6014611872146118, "grad_norm": 45.01103651553833, "learning_rate": 3.7463146033146275e-07, "logits/chosen": -2.653020143508911, "logits/rejected": -1.7439289093017578, "logps/chosen": -619.8622436523438, "logps/rejected": -400.56640625, "loss": 0.2225, "rewards/accuracies": 1.0, "rewards/chosen": 4.083728313446045, "rewards/margins": 5.360170841217041, "rewards/rejected": -1.2764431238174438, "step": 2192 }, { "epoch": 1.6021917808219177, "grad_norm": 27.28031033605098, "learning_rate": 3.7449316488778063e-07, "logits/chosen": -3.2799553871154785, "logits/rejected": -3.0181121826171875, "logps/chosen": -738.5288696289062, "logps/rejected": -704.9248046875, "loss": 0.0984, "rewards/accuracies": 0.875, "rewards/chosen": 2.6169064044952393, "rewards/margins": 3.300870895385742, "rewards/rejected": -0.6839646100997925, "step": 2193 }, { "epoch": 1.6029223744292236, "grad_norm": 42.967338044983364, "learning_rate": 3.7435481876730255e-07, "logits/chosen": -2.5325520038604736, "logits/rejected": -1.607908844947815, "logps/chosen": -553.0101318359375, "logps/rejected": -355.2466735839844, "loss": 0.1953, "rewards/accuracies": 1.0, "rewards/chosen": 3.450369119644165, "rewards/margins": 5.21604061126709, "rewards/rejected": -1.7656716108322144, "step": 2194 }, { "epoch": 1.6036529680365297, "grad_norm": 42.636558240937, "learning_rate": 3.7421642202634417e-07, "logits/chosen": -2.5675973892211914, "logits/rejected": -2.1721014976501465, "logps/chosen": -701.3955078125, "logps/rejected": -514.955322265625, "loss": 0.2182, "rewards/accuracies": 1.0, "rewards/chosen": 2.373234272003174, "rewards/margins": 2.47351336479187, "rewards/rejected": -0.1002790629863739, "step": 2195 }, { "epoch": 1.6043835616438358, "grad_norm": 46.48515755643811, "learning_rate": 3.7407797472124197e-07, "logits/chosen": -2.873533010482788, "logits/rejected": -2.2217085361480713, "logps/chosen": -712.24560546875, "logps/rejected": -617.7743530273438, "loss": 0.2178, "rewards/accuracies": 0.75, "rewards/chosen": 2.390977382659912, "rewards/margins": 2.284882068634033, "rewards/rejected": 0.10609552264213562, "step": 2196 }, { "epoch": 1.6051141552511416, "grad_norm": 38.305483401123524, "learning_rate": 3.7393947690835303e-07, "logits/chosen": -2.4887475967407227, "logits/rejected": -2.140717029571533, "logps/chosen": -666.8692626953125, "logps/rejected": -859.3414306640625, "loss": 0.1894, "rewards/accuracies": 1.0, "rewards/chosen": 2.587031602859497, "rewards/margins": 4.490180969238281, "rewards/rejected": -1.9031490087509155, "step": 2197 }, { "epoch": 1.6058447488584475, "grad_norm": 20.12203521688428, "learning_rate": 3.73800928644055e-07, "logits/chosen": -2.764782667160034, "logits/rejected": -2.2417285442352295, "logps/chosen": -956.99072265625, "logps/rejected": -779.3636474609375, "loss": 0.0966, "rewards/accuracies": 1.0, "rewards/chosen": 3.612365961074829, "rewards/margins": 3.152770757675171, "rewards/rejected": 0.459595263004303, "step": 2198 }, { "epoch": 1.6065753424657534, "grad_norm": 47.10068068609784, "learning_rate": 3.736623299847459e-07, "logits/chosen": -2.861083984375, "logits/rejected": -2.399719476699829, "logps/chosen": -914.4564819335938, "logps/rejected": -719.789794921875, "loss": 0.1697, "rewards/accuracies": 1.0, "rewards/chosen": 1.8808071613311768, "rewards/margins": 3.2003183364868164, "rewards/rejected": -1.31951105594635, "step": 2199 }, { "epoch": 1.6073059360730593, "grad_norm": 23.648221990676433, "learning_rate": 3.7352368098684445e-07, "logits/chosen": -2.4736058712005615, "logits/rejected": -1.9250011444091797, "logps/chosen": -646.6339111328125, "logps/rejected": -566.5975341796875, "loss": 0.1173, "rewards/accuracies": 1.0, "rewards/chosen": 3.909865140914917, "rewards/margins": 5.0672502517700195, "rewards/rejected": -1.1573853492736816, "step": 2200 }, { "epoch": 1.6080365296803651, "grad_norm": 24.567737593615533, "learning_rate": 3.7338498170678974e-07, "logits/chosen": -3.269385576248169, "logits/rejected": -1.970672845840454, "logps/chosen": -724.0924682617188, "logps/rejected": -471.3143615722656, "loss": 0.1058, "rewards/accuracies": 1.0, "rewards/chosen": 2.5127220153808594, "rewards/margins": 3.272571325302124, "rewards/rejected": -0.7598493695259094, "step": 2201 }, { "epoch": 1.6087671232876712, "grad_norm": 34.32595578034754, "learning_rate": 3.7324623220104134e-07, "logits/chosen": -3.2390189170837402, "logits/rejected": -2.4450438022613525, "logps/chosen": -587.4549560546875, "logps/rejected": -453.7267761230469, "loss": 0.1484, "rewards/accuracies": 0.625, "rewards/chosen": 1.998042345046997, "rewards/margins": 2.112398862838745, "rewards/rejected": -0.11435624212026596, "step": 2202 }, { "epoch": 1.6094977168949771, "grad_norm": 21.907833896466265, "learning_rate": 3.731074325260794e-07, "logits/chosen": -3.276442527770996, "logits/rejected": -2.447021007537842, "logps/chosen": -559.6769409179688, "logps/rejected": -422.0311279296875, "loss": 0.094, "rewards/accuracies": 1.0, "rewards/chosen": 3.2251029014587402, "rewards/margins": 4.375242233276367, "rewards/rejected": -1.1501392126083374, "step": 2203 }, { "epoch": 1.6102283105022832, "grad_norm": 36.80858193548839, "learning_rate": 3.729685827384044e-07, "logits/chosen": -2.8684732913970947, "logits/rejected": -1.7644966840744019, "logps/chosen": -1140.8089599609375, "logps/rejected": -484.7603759765625, "loss": 0.1979, "rewards/accuracies": 1.0, "rewards/chosen": 3.507777214050293, "rewards/margins": 3.6605396270751953, "rewards/rejected": -0.15276211500167847, "step": 2204 }, { "epoch": 1.610958904109589, "grad_norm": 29.002834839217655, "learning_rate": 3.728296828945372e-07, "logits/chosen": -2.6057283878326416, "logits/rejected": -2.232539653778076, "logps/chosen": -836.1402587890625, "logps/rejected": -655.945068359375, "loss": 0.147, "rewards/accuracies": 1.0, "rewards/chosen": 3.060060739517212, "rewards/margins": 2.3017334938049316, "rewards/rejected": 0.7583274841308594, "step": 2205 }, { "epoch": 1.611689497716895, "grad_norm": 36.78104423264448, "learning_rate": 3.7269073305101896e-07, "logits/chosen": -2.2028589248657227, "logits/rejected": -2.374718189239502, "logps/chosen": -735.9061279296875, "logps/rejected": -748.302490234375, "loss": 0.1689, "rewards/accuracies": 0.875, "rewards/chosen": 3.183852195739746, "rewards/margins": 3.2226223945617676, "rewards/rejected": -0.03877049684524536, "step": 2206 }, { "epoch": 1.6124200913242008, "grad_norm": 34.39837983330392, "learning_rate": 3.7255173326441136e-07, "logits/chosen": -2.771657943725586, "logits/rejected": -1.5734038352966309, "logps/chosen": -293.7323913574219, "logps/rejected": -224.06593322753906, "loss": 0.1692, "rewards/accuracies": 1.0, "rewards/chosen": 2.1123669147491455, "rewards/margins": 3.863126039505005, "rewards/rejected": -1.7507588863372803, "step": 2207 }, { "epoch": 1.6131506849315067, "grad_norm": 27.316405970702846, "learning_rate": 3.724126835912963e-07, "logits/chosen": -2.9819986820220947, "logits/rejected": -1.5805087089538574, "logps/chosen": -1094.803955078125, "logps/rejected": -534.5474853515625, "loss": 0.1493, "rewards/accuracies": 0.875, "rewards/chosen": 4.011345863342285, "rewards/margins": 4.913023471832275, "rewards/rejected": -0.9016777873039246, "step": 2208 }, { "epoch": 1.6138812785388128, "grad_norm": 26.44830346823131, "learning_rate": 3.7227358408827604e-07, "logits/chosen": -2.9315695762634277, "logits/rejected": -1.9476042985916138, "logps/chosen": -424.49163818359375, "logps/rejected": -333.4074401855469, "loss": 0.1813, "rewards/accuracies": 1.0, "rewards/chosen": 2.2600491046905518, "rewards/margins": 4.132148742675781, "rewards/rejected": -1.8720996379852295, "step": 2209 }, { "epoch": 1.6146118721461187, "grad_norm": 32.450089380173516, "learning_rate": 3.7213443481197306e-07, "logits/chosen": -2.273599624633789, "logits/rejected": -1.5584666728973389, "logps/chosen": -773.0383911132812, "logps/rejected": -396.83502197265625, "loss": 0.1649, "rewards/accuracies": 1.0, "rewards/chosen": 3.503110408782959, "rewards/margins": 4.334272861480713, "rewards/rejected": -0.8311622142791748, "step": 2210 }, { "epoch": 1.6153424657534248, "grad_norm": 23.452851743078334, "learning_rate": 3.7199523581903027e-07, "logits/chosen": -2.8466787338256836, "logits/rejected": -1.8910212516784668, "logps/chosen": -306.7862548828125, "logps/rejected": -355.5995788574219, "loss": 0.1079, "rewards/accuracies": 1.0, "rewards/chosen": 3.89247989654541, "rewards/margins": 6.466109752655029, "rewards/rejected": -2.57362961769104, "step": 2211 }, { "epoch": 1.6160730593607306, "grad_norm": 31.69342334769303, "learning_rate": 3.718559871661105e-07, "logits/chosen": -2.9232053756713867, "logits/rejected": -1.9808788299560547, "logps/chosen": -585.7666015625, "logps/rejected": -331.2901611328125, "loss": 0.1717, "rewards/accuracies": 0.875, "rewards/chosen": 3.533586025238037, "rewards/margins": 4.893864154815674, "rewards/rejected": -1.3602778911590576, "step": 2212 }, { "epoch": 1.6168036529680365, "grad_norm": 17.18892572211739, "learning_rate": 3.7171668890989714e-07, "logits/chosen": -2.797146797180176, "logits/rejected": -2.1727638244628906, "logps/chosen": -878.1307983398438, "logps/rejected": -741.99169921875, "loss": 0.0994, "rewards/accuracies": 1.0, "rewards/chosen": 3.343090534210205, "rewards/margins": 3.410388469696045, "rewards/rejected": -0.0672980546951294, "step": 2213 }, { "epoch": 1.6175342465753424, "grad_norm": 33.88614935580705, "learning_rate": 3.7157734110709354e-07, "logits/chosen": -3.04848575592041, "logits/rejected": -2.2629072666168213, "logps/chosen": -996.7161865234375, "logps/rejected": -715.326416015625, "loss": 0.1405, "rewards/accuracies": 1.0, "rewards/chosen": 4.378599166870117, "rewards/margins": 3.5078039169311523, "rewards/rejected": 0.8707951307296753, "step": 2214 }, { "epoch": 1.6182648401826483, "grad_norm": 50.202643555114456, "learning_rate": 3.7143794381442334e-07, "logits/chosen": -2.896623134613037, "logits/rejected": -2.222468852996826, "logps/chosen": -672.3017578125, "logps/rejected": -623.9675903320312, "loss": 0.2147, "rewards/accuracies": 1.0, "rewards/chosen": 3.37641978263855, "rewards/margins": 3.546198844909668, "rewards/rejected": -0.1697790026664734, "step": 2215 }, { "epoch": 1.6189954337899544, "grad_norm": 33.36551099891548, "learning_rate": 3.712984970886303e-07, "logits/chosen": -3.1529083251953125, "logits/rejected": -1.9415583610534668, "logps/chosen": -583.8575439453125, "logps/rejected": -412.9114074707031, "loss": 0.2179, "rewards/accuracies": 0.875, "rewards/chosen": 2.701765298843384, "rewards/margins": 2.417234182357788, "rewards/rejected": 0.2845311164855957, "step": 2216 }, { "epoch": 1.6197260273972602, "grad_norm": 20.065580042485845, "learning_rate": 3.7115900098647826e-07, "logits/chosen": -2.6347858905792236, "logits/rejected": -2.319373846054077, "logps/chosen": -303.01171875, "logps/rejected": -392.42376708984375, "loss": 0.1188, "rewards/accuracies": 1.0, "rewards/chosen": 1.205493688583374, "rewards/margins": 3.408992290496826, "rewards/rejected": -2.203498601913452, "step": 2217 }, { "epoch": 1.6204566210045663, "grad_norm": 28.43273268762497, "learning_rate": 3.710194555647512e-07, "logits/chosen": -2.2241761684417725, "logits/rejected": -1.9642452001571655, "logps/chosen": -549.8204345703125, "logps/rejected": -504.35723876953125, "loss": 0.1866, "rewards/accuracies": 0.875, "rewards/chosen": 1.6013835668563843, "rewards/margins": 2.8219058513641357, "rewards/rejected": -1.220522165298462, "step": 2218 }, { "epoch": 1.6211872146118722, "grad_norm": 29.64466996813308, "learning_rate": 3.7087986088025307e-07, "logits/chosen": -2.5349090099334717, "logits/rejected": -2.2839303016662598, "logps/chosen": -556.1446533203125, "logps/rejected": -671.1290283203125, "loss": 0.1356, "rewards/accuracies": 0.875, "rewards/chosen": 3.226661443710327, "rewards/margins": 4.574760437011719, "rewards/rejected": -1.3480991125106812, "step": 2219 }, { "epoch": 1.621917808219178, "grad_norm": 42.889966126542205, "learning_rate": 3.7074021698980807e-07, "logits/chosen": -2.701369285583496, "logits/rejected": -2.0203964710235596, "logps/chosen": -502.6507263183594, "logps/rejected": -430.984375, "loss": 0.247, "rewards/accuracies": 1.0, "rewards/chosen": 2.696645975112915, "rewards/margins": 3.6644558906555176, "rewards/rejected": -0.9678095579147339, "step": 2220 }, { "epoch": 1.622648401826484, "grad_norm": 17.490239500452905, "learning_rate": 3.706005239502603e-07, "logits/chosen": -2.830427408218384, "logits/rejected": -1.6927251815795898, "logps/chosen": -640.3869018554688, "logps/rejected": -547.3927001953125, "loss": 0.1164, "rewards/accuracies": 1.0, "rewards/chosen": 2.789376735687256, "rewards/margins": 3.1623809337615967, "rewards/rejected": -0.3730042278766632, "step": 2221 }, { "epoch": 1.6233789954337898, "grad_norm": 26.446007191761836, "learning_rate": 3.704607818184739e-07, "logits/chosen": -3.6868011951446533, "logits/rejected": -2.335659980773926, "logps/chosen": -989.2325439453125, "logps/rejected": -546.45654296875, "loss": 0.1158, "rewards/accuracies": 1.0, "rewards/chosen": 4.030899524688721, "rewards/margins": 3.6368918418884277, "rewards/rejected": 0.39400750398635864, "step": 2222 }, { "epoch": 1.624109589041096, "grad_norm": 42.789889102158234, "learning_rate": 3.703209906513329e-07, "logits/chosen": -2.8693675994873047, "logits/rejected": -1.8004508018493652, "logps/chosen": -531.2948608398438, "logps/rejected": -395.4879150390625, "loss": 0.2122, "rewards/accuracies": 0.875, "rewards/chosen": 3.6062538623809814, "rewards/margins": 4.976397514343262, "rewards/rejected": -1.3701435327529907, "step": 2223 }, { "epoch": 1.6248401826484018, "grad_norm": 25.85537284928959, "learning_rate": 3.7018115050574155e-07, "logits/chosen": -2.1602096557617188, "logits/rejected": -1.6516226530075073, "logps/chosen": -491.0417785644531, "logps/rejected": -414.8528137207031, "loss": 0.1252, "rewards/accuracies": 0.875, "rewards/chosen": 2.6353750228881836, "rewards/margins": 4.147050857543945, "rewards/rejected": -1.5116755962371826, "step": 2224 }, { "epoch": 1.625570776255708, "grad_norm": 33.43620215063027, "learning_rate": 3.700412614386237e-07, "logits/chosen": -2.6097826957702637, "logits/rejected": -1.985588788986206, "logps/chosen": -533.076904296875, "logps/rejected": -308.21173095703125, "loss": 0.17, "rewards/accuracies": 1.0, "rewards/chosen": 2.0540571212768555, "rewards/margins": 3.784553289413452, "rewards/rejected": -1.7304964065551758, "step": 2225 }, { "epoch": 1.6263013698630138, "grad_norm": 22.645358373392437, "learning_rate": 3.699013235069233e-07, "logits/chosen": -3.1884994506835938, "logits/rejected": -1.8093106746673584, "logps/chosen": -821.1521606445312, "logps/rejected": -505.33612060546875, "loss": 0.0832, "rewards/accuracies": 1.0, "rewards/chosen": 3.869621992111206, "rewards/margins": 3.6664414405822754, "rewards/rejected": 0.2031807005405426, "step": 2226 }, { "epoch": 1.6270319634703196, "grad_norm": 51.43380957675724, "learning_rate": 3.6976133676760426e-07, "logits/chosen": -2.4538350105285645, "logits/rejected": -1.9598814249038696, "logps/chosen": -706.0501708984375, "logps/rejected": -642.0391235351562, "loss": 0.2605, "rewards/accuracies": 0.875, "rewards/chosen": 3.735149383544922, "rewards/margins": 3.683854818344116, "rewards/rejected": 0.05129486322402954, "step": 2227 }, { "epoch": 1.6277625570776255, "grad_norm": 44.12711767891075, "learning_rate": 3.696213012776501e-07, "logits/chosen": -2.3556699752807617, "logits/rejected": -2.403261184692383, "logps/chosen": -476.18536376953125, "logps/rejected": -627.9007568359375, "loss": 0.2558, "rewards/accuracies": 0.875, "rewards/chosen": 2.665083646774292, "rewards/margins": 5.4044036865234375, "rewards/rejected": -2.7393205165863037, "step": 2228 }, { "epoch": 1.6284931506849314, "grad_norm": 42.57765194691342, "learning_rate": 3.6948121709406467e-07, "logits/chosen": -2.897209644317627, "logits/rejected": -2.208754777908325, "logps/chosen": -782.11376953125, "logps/rejected": -598.266357421875, "loss": 0.2286, "rewards/accuracies": 0.75, "rewards/chosen": 2.659527540206909, "rewards/margins": 1.9978375434875488, "rewards/rejected": 0.6616900563240051, "step": 2229 }, { "epoch": 1.6292237442922375, "grad_norm": 40.82426491032937, "learning_rate": 3.693410842738709e-07, "logits/chosen": -2.7993369102478027, "logits/rejected": -2.8312158584594727, "logps/chosen": -651.4678955078125, "logps/rejected": -755.9578247070312, "loss": 0.1865, "rewards/accuracies": 0.875, "rewards/chosen": 3.6366922855377197, "rewards/margins": 3.7924888134002686, "rewards/rejected": -0.15579688549041748, "step": 2230 }, { "epoch": 1.6299543378995434, "grad_norm": 22.48453447975929, "learning_rate": 3.6920090287411226e-07, "logits/chosen": -3.172229766845703, "logits/rejected": -2.0716569423675537, "logps/chosen": -686.3880004882812, "logps/rejected": -406.1515808105469, "loss": 0.1008, "rewards/accuracies": 1.0, "rewards/chosen": 4.041447162628174, "rewards/margins": 5.094540596008301, "rewards/rejected": -1.053093671798706, "step": 2231 }, { "epoch": 1.6306849315068495, "grad_norm": 44.73772854614715, "learning_rate": 3.6906067295185153e-07, "logits/chosen": -2.540147542953491, "logits/rejected": -2.4293556213378906, "logps/chosen": -562.6212158203125, "logps/rejected": -772.18994140625, "loss": 0.1661, "rewards/accuracies": 0.75, "rewards/chosen": 1.3635904788970947, "rewards/margins": 1.9990966320037842, "rewards/rejected": -0.6355061531066895, "step": 2232 }, { "epoch": 1.6314155251141553, "grad_norm": 25.577462814046008, "learning_rate": 3.689203945641715e-07, "logits/chosen": -2.9199869632720947, "logits/rejected": -2.282052993774414, "logps/chosen": -612.04248046875, "logps/rejected": -415.6856689453125, "loss": 0.1294, "rewards/accuracies": 1.0, "rewards/chosen": 1.7708287239074707, "rewards/margins": 2.5993218421936035, "rewards/rejected": -0.8284931182861328, "step": 2233 }, { "epoch": 1.6321461187214612, "grad_norm": 37.91783655618504, "learning_rate": 3.6878006776817437e-07, "logits/chosen": -2.512596845626831, "logits/rejected": -2.273283004760742, "logps/chosen": -726.2301635742188, "logps/rejected": -535.7315673828125, "loss": 0.2031, "rewards/accuracies": 0.875, "rewards/chosen": 2.4055681228637695, "rewards/margins": 2.451878786087036, "rewards/rejected": -0.046310797333717346, "step": 2234 }, { "epoch": 1.632876712328767, "grad_norm": 21.524652585937652, "learning_rate": 3.686396926209825e-07, "logits/chosen": -2.2710416316986084, "logits/rejected": -2.022829055786133, "logps/chosen": -337.2087097167969, "logps/rejected": -369.4866027832031, "loss": 0.1242, "rewards/accuracies": 0.875, "rewards/chosen": 1.7957723140716553, "rewards/margins": 2.870833158493042, "rewards/rejected": -1.0750607252120972, "step": 2235 }, { "epoch": 1.633607305936073, "grad_norm": 31.653837904771713, "learning_rate": 3.684992691797375e-07, "logits/chosen": -3.029607057571411, "logits/rejected": -2.240187883377075, "logps/chosen": -679.0903930664062, "logps/rejected": -536.4774780273438, "loss": 0.1238, "rewards/accuracies": 1.0, "rewards/chosen": 4.021679878234863, "rewards/margins": 3.555607318878174, "rewards/rejected": 0.46607276797294617, "step": 2236 }, { "epoch": 1.634337899543379, "grad_norm": 26.9360373823486, "learning_rate": 3.683587975016009e-07, "logits/chosen": -2.5457229614257812, "logits/rejected": -2.4939115047454834, "logps/chosen": -475.60882568359375, "logps/rejected": -467.0963134765625, "loss": 0.1383, "rewards/accuracies": 1.0, "rewards/chosen": 1.92982816696167, "rewards/margins": 3.5113794803619385, "rewards/rejected": -1.5815510749816895, "step": 2237 }, { "epoch": 1.635068493150685, "grad_norm": 25.82897807499239, "learning_rate": 3.6821827764375377e-07, "logits/chosen": -3.711054801940918, "logits/rejected": -2.391289234161377, "logps/chosen": -610.6766357421875, "logps/rejected": -367.17578125, "loss": 0.1442, "rewards/accuracies": 0.875, "rewards/chosen": 2.573578119277954, "rewards/margins": 3.3704075813293457, "rewards/rejected": -0.7968294024467468, "step": 2238 }, { "epoch": 1.635799086757991, "grad_norm": 20.103660500698467, "learning_rate": 3.680777096633969e-07, "logits/chosen": -2.252257823944092, "logits/rejected": -2.2003183364868164, "logps/chosen": -451.20989990234375, "logps/rejected": -515.4338989257812, "loss": 0.0879, "rewards/accuracies": 1.0, "rewards/chosen": 1.751044750213623, "rewards/margins": 1.9573010206222534, "rewards/rejected": -0.20625616610050201, "step": 2239 }, { "epoch": 1.636529680365297, "grad_norm": 27.132962091079136, "learning_rate": 3.679370936177504e-07, "logits/chosen": -3.196775197982788, "logits/rejected": -2.853986978530884, "logps/chosen": -378.7805480957031, "logps/rejected": -359.3278503417969, "loss": 0.1461, "rewards/accuracies": 0.875, "rewards/chosen": 1.515882134437561, "rewards/margins": 2.837512969970703, "rewards/rejected": -1.321630835533142, "step": 2240 }, { "epoch": 1.6372602739726028, "grad_norm": 34.325646388669796, "learning_rate": 3.6779642956405424e-07, "logits/chosen": -2.565396308898926, "logits/rejected": -1.5667439699172974, "logps/chosen": -854.057861328125, "logps/rejected": -528.5382080078125, "loss": 0.1433, "rewards/accuracies": 1.0, "rewards/chosen": 3.6824450492858887, "rewards/margins": 4.133196830749512, "rewards/rejected": -0.4507519006729126, "step": 2241 }, { "epoch": 1.6379908675799086, "grad_norm": 38.62453067827734, "learning_rate": 3.6765571755956783e-07, "logits/chosen": -2.5841574668884277, "logits/rejected": -2.6275196075439453, "logps/chosen": -489.8212890625, "logps/rejected": -522.0794067382812, "loss": 0.1661, "rewards/accuracies": 0.875, "rewards/chosen": 2.472698211669922, "rewards/margins": 3.1750071048736572, "rewards/rejected": -0.7023090720176697, "step": 2242 }, { "epoch": 1.6387214611872145, "grad_norm": 35.71505031966128, "learning_rate": 3.6751495766156997e-07, "logits/chosen": -2.817725658416748, "logits/rejected": -2.4943299293518066, "logps/chosen": -606.832275390625, "logps/rejected": -630.502685546875, "loss": 0.2121, "rewards/accuracies": 0.875, "rewards/chosen": 3.1135852336883545, "rewards/margins": 3.3472518920898438, "rewards/rejected": -0.2336665242910385, "step": 2243 }, { "epoch": 1.6394520547945204, "grad_norm": 35.48187394862693, "learning_rate": 3.673741499273592e-07, "logits/chosen": -2.596238851547241, "logits/rejected": -2.4998860359191895, "logps/chosen": -680.0400390625, "logps/rejected": -502.0863037109375, "loss": 0.1765, "rewards/accuracies": 0.875, "rewards/chosen": 3.562380790710449, "rewards/margins": 3.924941301345825, "rewards/rejected": -0.362560510635376, "step": 2244 }, { "epoch": 1.6401826484018265, "grad_norm": 41.275711092261936, "learning_rate": 3.672332944142534e-07, "logits/chosen": -2.9555792808532715, "logits/rejected": -2.510241746902466, "logps/chosen": -587.0838623046875, "logps/rejected": -441.5660400390625, "loss": 0.2022, "rewards/accuracies": 0.875, "rewards/chosen": 2.4482290744781494, "rewards/margins": 3.121185779571533, "rewards/rejected": -0.6729567646980286, "step": 2245 }, { "epoch": 1.6409132420091326, "grad_norm": 28.541703614445368, "learning_rate": 3.6709239117958986e-07, "logits/chosen": -1.6047600507736206, "logits/rejected": -2.048497438430786, "logps/chosen": -219.6867218017578, "logps/rejected": -441.1383972167969, "loss": 0.2226, "rewards/accuracies": 1.0, "rewards/chosen": 0.45098164677619934, "rewards/margins": 2.966987133026123, "rewards/rejected": -2.516005516052246, "step": 2246 }, { "epoch": 1.6416438356164385, "grad_norm": 28.377090713174773, "learning_rate": 3.669514402807253e-07, "logits/chosen": -3.483851909637451, "logits/rejected": -2.8242945671081543, "logps/chosen": -790.1629028320312, "logps/rejected": -596.548828125, "loss": 0.1466, "rewards/accuracies": 1.0, "rewards/chosen": 2.8835549354553223, "rewards/margins": 2.245710849761963, "rewards/rejected": 0.6378442645072937, "step": 2247 }, { "epoch": 1.6423744292237443, "grad_norm": 51.97033784405293, "learning_rate": 3.6681044177503594e-07, "logits/chosen": -2.382406234741211, "logits/rejected": -1.6661709547042847, "logps/chosen": -443.4946594238281, "logps/rejected": -298.64776611328125, "loss": 0.2562, "rewards/accuracies": 1.0, "rewards/chosen": 2.7168285846710205, "rewards/margins": 4.708461761474609, "rewards/rejected": -1.9916330575942993, "step": 2248 }, { "epoch": 1.6431050228310502, "grad_norm": 26.301021147075378, "learning_rate": 3.666693957199173e-07, "logits/chosen": -3.230109214782715, "logits/rejected": -2.5213921070098877, "logps/chosen": -902.7803344726562, "logps/rejected": -753.0081787109375, "loss": 0.1626, "rewards/accuracies": 1.0, "rewards/chosen": 4.859180450439453, "rewards/margins": 4.038206100463867, "rewards/rejected": 0.8209743499755859, "step": 2249 }, { "epoch": 1.643835616438356, "grad_norm": 39.610644703515305, "learning_rate": 3.665283021727843e-07, "logits/chosen": -2.574388027191162, "logits/rejected": -1.8778049945831299, "logps/chosen": -749.01123046875, "logps/rejected": -689.2937622070312, "loss": 0.151, "rewards/accuracies": 0.875, "rewards/chosen": 3.1896371841430664, "rewards/margins": 4.371696949005127, "rewards/rejected": -1.18205988407135, "step": 2250 }, { "epoch": 1.644566210045662, "grad_norm": 34.2523878615601, "learning_rate": 3.6638716119107116e-07, "logits/chosen": -2.7382938861846924, "logits/rejected": -1.3978121280670166, "logps/chosen": -449.2664489746094, "logps/rejected": -275.58697509765625, "loss": 0.1696, "rewards/accuracies": 0.875, "rewards/chosen": 2.9257373809814453, "rewards/margins": 4.90134859085083, "rewards/rejected": -1.9756112098693848, "step": 2251 }, { "epoch": 1.645296803652968, "grad_norm": 35.82002368340489, "learning_rate": 3.6624597283223135e-07, "logits/chosen": -2.82210111618042, "logits/rejected": -2.1497719287872314, "logps/chosen": -1119.284912109375, "logps/rejected": -829.6866455078125, "loss": 0.1513, "rewards/accuracies": 1.0, "rewards/chosen": 4.222884178161621, "rewards/margins": 3.558361530303955, "rewards/rejected": 0.664522647857666, "step": 2252 }, { "epoch": 1.6460273972602741, "grad_norm": 28.056157064337054, "learning_rate": 3.661047371537378e-07, "logits/chosen": -2.5967252254486084, "logits/rejected": -2.3825440406799316, "logps/chosen": -632.5828857421875, "logps/rejected": -527.1766967773438, "loss": 0.159, "rewards/accuracies": 0.875, "rewards/chosen": 3.650562047958374, "rewards/margins": 3.2240447998046875, "rewards/rejected": 0.4265173673629761, "step": 2253 }, { "epoch": 1.64675799086758, "grad_norm": 40.346117583995145, "learning_rate": 3.6596345421308253e-07, "logits/chosen": -3.0770342350006104, "logits/rejected": -2.089634418487549, "logps/chosen": -725.8819580078125, "logps/rejected": -555.7281494140625, "loss": 0.2051, "rewards/accuracies": 0.875, "rewards/chosen": 4.245621681213379, "rewards/margins": 4.393337249755859, "rewards/rejected": -0.1477157473564148, "step": 2254 }, { "epoch": 1.6474885844748859, "grad_norm": 36.564316194207706, "learning_rate": 3.658221240677769e-07, "logits/chosen": -2.995591402053833, "logits/rejected": -3.4218077659606934, "logps/chosen": -500.4927978515625, "logps/rejected": -680.572265625, "loss": 0.1904, "rewards/accuracies": 1.0, "rewards/chosen": 2.1940975189208984, "rewards/margins": 3.397594928741455, "rewards/rejected": -1.2034976482391357, "step": 2255 }, { "epoch": 1.6482191780821918, "grad_norm": 20.640872650624463, "learning_rate": 3.656807467753514e-07, "logits/chosen": -2.4119584560394287, "logits/rejected": -1.832392930984497, "logps/chosen": -647.8765869140625, "logps/rejected": -499.93798828125, "loss": 0.0798, "rewards/accuracies": 1.0, "rewards/chosen": 4.500371932983398, "rewards/margins": 6.518949508666992, "rewards/rejected": -2.0185775756835938, "step": 2256 }, { "epoch": 1.6489497716894976, "grad_norm": 18.799914031136552, "learning_rate": 3.655393223933558e-07, "logits/chosen": -2.479689598083496, "logits/rejected": -2.2162258625030518, "logps/chosen": -469.3383483886719, "logps/rejected": -803.104248046875, "loss": 0.0817, "rewards/accuracies": 1.0, "rewards/chosen": 1.6704343557357788, "rewards/margins": 3.4728264808654785, "rewards/rejected": -1.8023922443389893, "step": 2257 }, { "epoch": 1.6496803652968035, "grad_norm": 46.13799518411452, "learning_rate": 3.6539785097935904e-07, "logits/chosen": -2.631293773651123, "logits/rejected": -2.497509241104126, "logps/chosen": -478.73150634765625, "logps/rejected": -427.720458984375, "loss": 0.2404, "rewards/accuracies": 1.0, "rewards/chosen": 1.063430666923523, "rewards/margins": 2.0012056827545166, "rewards/rejected": -0.9377750158309937, "step": 2258 }, { "epoch": 1.6504109589041096, "grad_norm": 46.21227874744168, "learning_rate": 3.652563325909491e-07, "logits/chosen": -2.6081156730651855, "logits/rejected": -1.306853175163269, "logps/chosen": -504.5645751953125, "logps/rejected": -270.4130859375, "loss": 0.1718, "rewards/accuracies": 1.0, "rewards/chosen": 3.25156307220459, "rewards/margins": 5.7531023025512695, "rewards/rejected": -2.5015389919281006, "step": 2259 }, { "epoch": 1.6511415525114155, "grad_norm": 37.70842793208668, "learning_rate": 3.651147672857331e-07, "logits/chosen": -2.5878396034240723, "logits/rejected": -2.6268017292022705, "logps/chosen": -473.9962158203125, "logps/rejected": -385.38873291015625, "loss": 0.22, "rewards/accuracies": 0.875, "rewards/chosen": 1.788435459136963, "rewards/margins": 2.5058374404907227, "rewards/rejected": -0.7174022197723389, "step": 2260 }, { "epoch": 1.6518721461187216, "grad_norm": 51.207003686629626, "learning_rate": 3.649731551213374e-07, "logits/chosen": -2.8635003566741943, "logits/rejected": -2.110250949859619, "logps/chosen": -441.46380615234375, "logps/rejected": -366.49188232421875, "loss": 0.2729, "rewards/accuracies": 0.875, "rewards/chosen": 1.8440272808074951, "rewards/margins": 2.3836793899536133, "rewards/rejected": -0.5396519899368286, "step": 2261 }, { "epoch": 1.6526027397260274, "grad_norm": 33.895193689864605, "learning_rate": 3.648314961554073e-07, "logits/chosen": -3.365633010864258, "logits/rejected": -1.718949317932129, "logps/chosen": -690.8692626953125, "logps/rejected": -293.7857360839844, "loss": 0.1262, "rewards/accuracies": 0.875, "rewards/chosen": 4.500981330871582, "rewards/margins": 5.556180000305176, "rewards/rejected": -1.0551989078521729, "step": 2262 }, { "epoch": 1.6533333333333333, "grad_norm": 25.559652528982934, "learning_rate": 3.646897904456073e-07, "logits/chosen": -2.521223306655884, "logits/rejected": -1.7091755867004395, "logps/chosen": -317.5714111328125, "logps/rejected": -230.7986602783203, "loss": 0.1557, "rewards/accuracies": 1.0, "rewards/chosen": 0.9741373658180237, "rewards/margins": 3.2651028633117676, "rewards/rejected": -2.2909655570983887, "step": 2263 }, { "epoch": 1.6540639269406392, "grad_norm": 16.14763714302295, "learning_rate": 3.6454803804962067e-07, "logits/chosen": -3.5065951347351074, "logits/rejected": -2.2028119564056396, "logps/chosen": -865.4951171875, "logps/rejected": -559.277587890625, "loss": 0.0974, "rewards/accuracies": 1.0, "rewards/chosen": 4.5269670486450195, "rewards/margins": 4.949148178100586, "rewards/rejected": -0.4221813380718231, "step": 2264 }, { "epoch": 1.654794520547945, "grad_norm": 22.057003038868192, "learning_rate": 3.6440623902514977e-07, "logits/chosen": -2.849874973297119, "logits/rejected": -2.1243984699249268, "logps/chosen": -825.4412841796875, "logps/rejected": -495.343017578125, "loss": 0.13, "rewards/accuracies": 0.875, "rewards/chosen": 2.3850595951080322, "rewards/margins": 2.3092284202575684, "rewards/rejected": 0.07583111524581909, "step": 2265 }, { "epoch": 1.6555251141552512, "grad_norm": 19.64870542188109, "learning_rate": 3.642643934299163e-07, "logits/chosen": -2.5566418170928955, "logits/rejected": -1.9458661079406738, "logps/chosen": -585.6490478515625, "logps/rejected": -289.099365234375, "loss": 0.0979, "rewards/accuracies": 1.0, "rewards/chosen": 3.24825382232666, "rewards/margins": 3.8158140182495117, "rewards/rejected": -0.5675603151321411, "step": 2266 }, { "epoch": 1.656255707762557, "grad_norm": 30.529671666278713, "learning_rate": 3.641225013216602e-07, "logits/chosen": -2.6827104091644287, "logits/rejected": -2.1579813957214355, "logps/chosen": -655.2115478515625, "logps/rejected": -421.232666015625, "loss": 0.1808, "rewards/accuracies": 1.0, "rewards/chosen": 2.322341203689575, "rewards/margins": 3.154592990875244, "rewards/rejected": -0.832251787185669, "step": 2267 }, { "epoch": 1.6569863013698631, "grad_norm": 40.72774567267534, "learning_rate": 3.639805627581412e-07, "logits/chosen": -3.1960439682006836, "logits/rejected": -2.452273368835449, "logps/chosen": -862.4222412109375, "logps/rejected": -683.012451171875, "loss": 0.228, "rewards/accuracies": 0.875, "rewards/chosen": 3.9852705001831055, "rewards/margins": 2.8864033222198486, "rewards/rejected": 1.0988670587539673, "step": 2268 }, { "epoch": 1.657716894977169, "grad_norm": 38.11264688445375, "learning_rate": 3.6383857779713723e-07, "logits/chosen": -3.1116244792938232, "logits/rejected": -2.116844654083252, "logps/chosen": -406.59185791015625, "logps/rejected": -363.470703125, "loss": 0.1586, "rewards/accuracies": 0.875, "rewards/chosen": 3.208188056945801, "rewards/margins": 4.434557914733887, "rewards/rejected": -1.226369857788086, "step": 2269 }, { "epoch": 1.6584474885844749, "grad_norm": 30.85305018254121, "learning_rate": 3.636965464964455e-07, "logits/chosen": -2.9335970878601074, "logits/rejected": -1.869659423828125, "logps/chosen": -657.057373046875, "logps/rejected": -405.1436767578125, "loss": 0.1429, "rewards/accuracies": 1.0, "rewards/chosen": 3.4813754558563232, "rewards/margins": 5.224294662475586, "rewards/rejected": -1.7429190874099731, "step": 2270 }, { "epoch": 1.6591780821917808, "grad_norm": 60.97550297365841, "learning_rate": 3.6355446891388185e-07, "logits/chosen": -2.893690586090088, "logits/rejected": -2.2462263107299805, "logps/chosen": -758.0845947265625, "logps/rejected": -667.5911865234375, "loss": 0.3416, "rewards/accuracies": 0.75, "rewards/chosen": 3.8967971801757812, "rewards/margins": 4.002519607543945, "rewards/rejected": -0.10572223365306854, "step": 2271 }, { "epoch": 1.6599086757990866, "grad_norm": 23.73166181414478, "learning_rate": 3.6341234510728126e-07, "logits/chosen": -3.195220470428467, "logits/rejected": -3.003329038619995, "logps/chosen": -780.3404541015625, "logps/rejected": -724.17724609375, "loss": 0.0746, "rewards/accuracies": 1.0, "rewards/chosen": 2.970762252807617, "rewards/margins": 3.3673386573791504, "rewards/rejected": -0.39657628536224365, "step": 2272 }, { "epoch": 1.6606392694063927, "grad_norm": 24.17574432710462, "learning_rate": 3.632701751344971e-07, "logits/chosen": -2.780182123184204, "logits/rejected": -2.1043272018432617, "logps/chosen": -878.3013305664062, "logps/rejected": -710.7891235351562, "loss": 0.1247, "rewards/accuracies": 0.875, "rewards/chosen": 9.560335159301758, "rewards/margins": 8.899773597717285, "rewards/rejected": 0.6605623364448547, "step": 2273 }, { "epoch": 1.6613698630136986, "grad_norm": 26.264679210366104, "learning_rate": 3.6312795905340204e-07, "logits/chosen": -3.149355411529541, "logits/rejected": -2.497701644897461, "logps/chosen": -730.0494384765625, "logps/rejected": -504.49969482421875, "loss": 0.1673, "rewards/accuracies": 1.0, "rewards/chosen": 2.9176442623138428, "rewards/margins": 2.787440776824951, "rewards/rejected": 0.13020363450050354, "step": 2274 }, { "epoch": 1.6621004566210047, "grad_norm": 25.527773822410072, "learning_rate": 3.62985696921887e-07, "logits/chosen": -2.747164726257324, "logits/rejected": -2.4132962226867676, "logps/chosen": -437.165283203125, "logps/rejected": -355.94366455078125, "loss": 0.1556, "rewards/accuracies": 1.0, "rewards/chosen": 2.328691005706787, "rewards/margins": 4.4155659675598145, "rewards/rejected": -2.0868749618530273, "step": 2275 }, { "epoch": 1.6628310502283106, "grad_norm": 32.511305500572604, "learning_rate": 3.6284338879786215e-07, "logits/chosen": -2.96565318107605, "logits/rejected": -2.4746227264404297, "logps/chosen": -591.9163818359375, "logps/rejected": -597.8557739257812, "loss": 0.2078, "rewards/accuracies": 0.875, "rewards/chosen": 3.2705681324005127, "rewards/margins": 3.4551641941070557, "rewards/rejected": -0.18459588289260864, "step": 2276 }, { "epoch": 1.6635616438356164, "grad_norm": 31.39443477223262, "learning_rate": 3.6270103473925587e-07, "logits/chosen": -3.0233004093170166, "logits/rejected": -2.750303268432617, "logps/chosen": -677.3053588867188, "logps/rejected": -655.4847412109375, "loss": 0.1522, "rewards/accuracies": 1.0, "rewards/chosen": 3.1440517902374268, "rewards/margins": 3.455819606781006, "rewards/rejected": -0.31176820397377014, "step": 2277 }, { "epoch": 1.6642922374429223, "grad_norm": 32.228277416492105, "learning_rate": 3.6255863480401564e-07, "logits/chosen": -2.7469100952148438, "logits/rejected": -2.2686285972595215, "logps/chosen": -443.59527587890625, "logps/rejected": -401.53436279296875, "loss": 0.2147, "rewards/accuracies": 0.75, "rewards/chosen": 2.047722816467285, "rewards/margins": 3.622581720352173, "rewards/rejected": -1.5748589038848877, "step": 2278 }, { "epoch": 1.6650228310502282, "grad_norm": 22.43115756785602, "learning_rate": 3.6241618905010754e-07, "logits/chosen": -2.9387094974517822, "logits/rejected": -2.057231903076172, "logps/chosen": -190.23928833007812, "logps/rejected": -172.7744903564453, "loss": 0.1514, "rewards/accuracies": 0.875, "rewards/chosen": 2.4050776958465576, "rewards/margins": 4.340325355529785, "rewards/rejected": -1.935247540473938, "step": 2279 }, { "epoch": 1.6657534246575343, "grad_norm": 35.18862398293659, "learning_rate": 3.622736975355161e-07, "logits/chosen": -2.6045987606048584, "logits/rejected": -2.4949564933776855, "logps/chosen": -804.8881225585938, "logps/rejected": -604.3990478515625, "loss": 0.1524, "rewards/accuracies": 0.875, "rewards/chosen": 3.2280402183532715, "rewards/margins": 2.5982418060302734, "rewards/rejected": 0.6297983527183533, "step": 2280 }, { "epoch": 1.6664840182648402, "grad_norm": 39.870859343694704, "learning_rate": 3.621311603182446e-07, "logits/chosen": -3.00363826751709, "logits/rejected": -2.8809478282928467, "logps/chosen": -858.5128784179688, "logps/rejected": -737.877685546875, "loss": 0.198, "rewards/accuracies": 0.875, "rewards/chosen": 1.8946603536605835, "rewards/margins": 2.0264830589294434, "rewards/rejected": -0.13182282447814941, "step": 2281 }, { "epoch": 1.6672146118721463, "grad_norm": 71.26522932087043, "learning_rate": 3.619885774563151e-07, "logits/chosen": -2.7737619876861572, "logits/rejected": -2.2903661727905273, "logps/chosen": -912.87353515625, "logps/rejected": -681.8081665039062, "loss": 0.2924, "rewards/accuracies": 0.5, "rewards/chosen": 3.4644529819488525, "rewards/margins": 2.3987176418304443, "rewards/rejected": 1.0657354593276978, "step": 2282 }, { "epoch": 1.6679452054794521, "grad_norm": 25.175416078131285, "learning_rate": 3.61845949007768e-07, "logits/chosen": -2.5837080478668213, "logits/rejected": -2.2346291542053223, "logps/chosen": -330.3228759765625, "logps/rejected": -285.63330078125, "loss": 0.1889, "rewards/accuracies": 0.875, "rewards/chosen": 1.6965080499649048, "rewards/margins": 3.7422640323638916, "rewards/rejected": -2.0457558631896973, "step": 2283 }, { "epoch": 1.668675799086758, "grad_norm": 22.93160261380569, "learning_rate": 3.617032750306622e-07, "logits/chosen": -2.106290340423584, "logits/rejected": -2.3640685081481934, "logps/chosen": -386.6771240234375, "logps/rejected": -447.33404541015625, "loss": 0.1297, "rewards/accuracies": 1.0, "rewards/chosen": 2.1383581161499023, "rewards/margins": 4.051172256469727, "rewards/rejected": -1.9128139019012451, "step": 2284 }, { "epoch": 1.6694063926940639, "grad_norm": 32.24905938247947, "learning_rate": 3.615605555830755e-07, "logits/chosen": -3.0213770866394043, "logits/rejected": -2.600234270095825, "logps/chosen": -744.6004028320312, "logps/rejected": -799.6088256835938, "loss": 0.1275, "rewards/accuracies": 1.0, "rewards/chosen": 4.853663921356201, "rewards/margins": 4.781949996948242, "rewards/rejected": 0.07171404361724854, "step": 2285 }, { "epoch": 1.6701369863013698, "grad_norm": 27.914827777782925, "learning_rate": 3.6141779072310376e-07, "logits/chosen": -2.7902960777282715, "logits/rejected": -2.2970471382141113, "logps/chosen": -483.4726867675781, "logps/rejected": -385.135986328125, "loss": 0.1658, "rewards/accuracies": 1.0, "rewards/chosen": 1.6166647672653198, "rewards/margins": 2.5344696044921875, "rewards/rejected": -0.9178049564361572, "step": 2286 }, { "epoch": 1.6708675799086758, "grad_norm": 14.985752205513045, "learning_rate": 3.612749805088617e-07, "logits/chosen": -2.825495958328247, "logits/rejected": -2.513314962387085, "logps/chosen": -586.4370727539062, "logps/rejected": -540.0632934570312, "loss": 0.0763, "rewards/accuracies": 1.0, "rewards/chosen": 2.5963196754455566, "rewards/margins": 3.717827558517456, "rewards/rejected": -1.121508002281189, "step": 2287 }, { "epoch": 1.6715981735159817, "grad_norm": 32.21822714173221, "learning_rate": 3.611321249984822e-07, "logits/chosen": -3.0108797550201416, "logits/rejected": -2.5821852684020996, "logps/chosen": -571.3794555664062, "logps/rejected": -447.7202453613281, "loss": 0.1315, "rewards/accuracies": 0.875, "rewards/chosen": 2.1499836444854736, "rewards/margins": 3.9297542572021484, "rewards/rejected": -1.7797706127166748, "step": 2288 }, { "epoch": 1.6723287671232878, "grad_norm": 23.36263302948348, "learning_rate": 3.6098922425011686e-07, "logits/chosen": -3.001068115234375, "logits/rejected": -2.2746520042419434, "logps/chosen": -738.5447998046875, "logps/rejected": -473.42877197265625, "loss": 0.0968, "rewards/accuracies": 1.0, "rewards/chosen": 3.4905500411987305, "rewards/margins": 3.9645209312438965, "rewards/rejected": -0.4739706516265869, "step": 2289 }, { "epoch": 1.6730593607305937, "grad_norm": 45.66713059089929, "learning_rate": 3.6084627832193547e-07, "logits/chosen": -2.8400957584381104, "logits/rejected": -1.4400917291641235, "logps/chosen": -625.9984741210938, "logps/rejected": -324.8023986816406, "loss": 0.2207, "rewards/accuracies": 0.875, "rewards/chosen": 4.109187602996826, "rewards/margins": 6.3944902420043945, "rewards/rejected": -2.2853024005889893, "step": 2290 }, { "epoch": 1.6737899543378996, "grad_norm": 25.492800425515362, "learning_rate": 3.6070328727212633e-07, "logits/chosen": -2.756256580352783, "logits/rejected": -2.495547294616699, "logps/chosen": -767.12646484375, "logps/rejected": -640.12451171875, "loss": 0.1449, "rewards/accuracies": 1.0, "rewards/chosen": 3.561511516571045, "rewards/margins": 4.450890064239502, "rewards/rejected": -0.8893786668777466, "step": 2291 }, { "epoch": 1.6745205479452054, "grad_norm": 37.9223610650761, "learning_rate": 3.6056025115889613e-07, "logits/chosen": -2.1508853435516357, "logits/rejected": -2.362323522567749, "logps/chosen": -626.3641357421875, "logps/rejected": -622.4666748046875, "loss": 0.1497, "rewards/accuracies": 0.875, "rewards/chosen": 2.5669283866882324, "rewards/margins": 3.0715994834899902, "rewards/rejected": -0.5046710968017578, "step": 2292 }, { "epoch": 1.6752511415525113, "grad_norm": 24.147161079174374, "learning_rate": 3.604171700404698e-07, "logits/chosen": -2.6753976345062256, "logits/rejected": -2.617433547973633, "logps/chosen": -885.6077880859375, "logps/rejected": -882.7734985351562, "loss": 0.1537, "rewards/accuracies": 1.0, "rewards/chosen": 3.0520682334899902, "rewards/margins": 4.125107765197754, "rewards/rejected": -1.073039174079895, "step": 2293 }, { "epoch": 1.6759817351598172, "grad_norm": 32.940535172062326, "learning_rate": 3.602740439750906e-07, "logits/chosen": -3.321533441543579, "logits/rejected": -2.906189441680908, "logps/chosen": -691.621337890625, "logps/rejected": -662.0718994140625, "loss": 0.1637, "rewards/accuracies": 1.0, "rewards/chosen": 4.0846147537231445, "rewards/margins": 4.242921829223633, "rewards/rejected": -0.15830713510513306, "step": 2294 }, { "epoch": 1.6767123287671233, "grad_norm": 36.68059853101841, "learning_rate": 3.601308730210201e-07, "logits/chosen": -2.6880078315734863, "logits/rejected": -2.0322413444519043, "logps/chosen": -788.8687744140625, "logps/rejected": -691.4403076171875, "loss": 0.1803, "rewards/accuracies": 1.0, "rewards/chosen": 2.45595121383667, "rewards/margins": 3.3712408542633057, "rewards/rejected": -0.9152897596359253, "step": 2295 }, { "epoch": 1.6774429223744294, "grad_norm": 36.61104216221009, "learning_rate": 3.5998765723653825e-07, "logits/chosen": -3.0855393409729004, "logits/rejected": -1.6741725206375122, "logps/chosen": -718.6688842773438, "logps/rejected": -374.0713806152344, "loss": 0.1632, "rewards/accuracies": 1.0, "rewards/chosen": 4.310633659362793, "rewards/margins": 6.617283821105957, "rewards/rejected": -2.306649923324585, "step": 2296 }, { "epoch": 1.6781735159817353, "grad_norm": 35.07478060337655, "learning_rate": 3.5984439667994314e-07, "logits/chosen": -2.5668039321899414, "logits/rejected": -2.529949188232422, "logps/chosen": -389.4118957519531, "logps/rejected": -487.66461181640625, "loss": 0.1967, "rewards/accuracies": 1.0, "rewards/chosen": 2.3529281616210938, "rewards/margins": 4.709140300750732, "rewards/rejected": -2.3562121391296387, "step": 2297 }, { "epoch": 1.6789041095890411, "grad_norm": 19.56217395644829, "learning_rate": 3.597010914095512e-07, "logits/chosen": -2.8909902572631836, "logits/rejected": -2.1345436573028564, "logps/chosen": -717.213134765625, "logps/rejected": -580.5338745117188, "loss": 0.1093, "rewards/accuracies": 1.0, "rewards/chosen": 2.678560495376587, "rewards/margins": 3.839991807937622, "rewards/rejected": -1.1614314317703247, "step": 2298 }, { "epoch": 1.679634703196347, "grad_norm": 36.603414274128, "learning_rate": 3.5955774148369677e-07, "logits/chosen": -3.1127371788024902, "logits/rejected": -2.5539937019348145, "logps/chosen": -683.615234375, "logps/rejected": -652.0626831054688, "loss": 0.168, "rewards/accuracies": 1.0, "rewards/chosen": 3.09523344039917, "rewards/margins": 4.639823913574219, "rewards/rejected": -1.5445901155471802, "step": 2299 }, { "epoch": 1.6803652968036529, "grad_norm": 19.04453639380802, "learning_rate": 3.594143469607328e-07, "logits/chosen": -2.379105806350708, "logits/rejected": -2.238982915878296, "logps/chosen": -582.7416381835938, "logps/rejected": -429.0097961425781, "loss": 0.0917, "rewards/accuracies": 0.875, "rewards/chosen": 1.8325942754745483, "rewards/margins": 3.66176176071167, "rewards/rejected": -1.8291679620742798, "step": 2300 }, { "epoch": 1.6810958904109587, "grad_norm": 27.026895196541606, "learning_rate": 3.5927090789902994e-07, "logits/chosen": -3.038491725921631, "logits/rejected": -2.8928444385528564, "logps/chosen": -726.3482666015625, "logps/rejected": -672.4927978515625, "loss": 0.1439, "rewards/accuracies": 1.0, "rewards/chosen": 3.2739319801330566, "rewards/margins": 1.7292966842651367, "rewards/rejected": 1.54463529586792, "step": 2301 }, { "epoch": 1.6818264840182648, "grad_norm": 45.554785976784316, "learning_rate": 3.591274243569773e-07, "logits/chosen": -2.8598341941833496, "logits/rejected": -1.7421820163726807, "logps/chosen": -839.662353515625, "logps/rejected": -499.10595703125, "loss": 0.2305, "rewards/accuracies": 1.0, "rewards/chosen": 3.721756935119629, "rewards/margins": 4.197145938873291, "rewards/rejected": -0.47538918256759644, "step": 2302 }, { "epoch": 1.682557077625571, "grad_norm": 42.62357808982826, "learning_rate": 3.589838963929821e-07, "logits/chosen": -3.101677179336548, "logits/rejected": -2.642211675643921, "logps/chosen": -677.5941162109375, "logps/rejected": -593.8793334960938, "loss": 0.265, "rewards/accuracies": 0.875, "rewards/chosen": 2.8926901817321777, "rewards/margins": 2.7374043464660645, "rewards/rejected": 0.15528559684753418, "step": 2303 }, { "epoch": 1.6832876712328768, "grad_norm": 46.623531205919775, "learning_rate": 3.5884032406546936e-07, "logits/chosen": -2.739450216293335, "logits/rejected": -2.178236722946167, "logps/chosen": -889.66455078125, "logps/rejected": -511.231201171875, "loss": 0.2605, "rewards/accuracies": 0.75, "rewards/chosen": 2.2125296592712402, "rewards/margins": 2.5412001609802246, "rewards/rejected": -0.3286705017089844, "step": 2304 }, { "epoch": 1.6840182648401827, "grad_norm": 34.738672839142836, "learning_rate": 3.586967074328825e-07, "logits/chosen": -2.589895009994507, "logits/rejected": -2.1598896980285645, "logps/chosen": -396.3406982421875, "logps/rejected": -475.7484130859375, "loss": 0.1738, "rewards/accuracies": 1.0, "rewards/chosen": 1.5266867876052856, "rewards/margins": 2.953378677368164, "rewards/rejected": -1.426692008972168, "step": 2305 }, { "epoch": 1.6847488584474886, "grad_norm": 202.74692699582403, "learning_rate": 3.585530465536827e-07, "logits/chosen": -2.5525965690612793, "logits/rejected": -2.3864126205444336, "logps/chosen": -633.6565551757812, "logps/rejected": -535.1481323242188, "loss": 0.2887, "rewards/accuracies": 0.75, "rewards/chosen": 3.1448006629943848, "rewards/margins": 3.042001247406006, "rewards/rejected": 0.1027994155883789, "step": 2306 }, { "epoch": 1.6854794520547944, "grad_norm": 34.70408528050048, "learning_rate": 3.5840934148634915e-07, "logits/chosen": -2.868194103240967, "logits/rejected": -2.024003505706787, "logps/chosen": -494.5184020996094, "logps/rejected": -398.5238037109375, "loss": 0.1764, "rewards/accuracies": 1.0, "rewards/chosen": 4.23974084854126, "rewards/margins": 6.615617275238037, "rewards/rejected": -2.3758766651153564, "step": 2307 }, { "epoch": 1.6862100456621003, "grad_norm": 20.80217534940945, "learning_rate": 3.5826559228937937e-07, "logits/chosen": -2.9941673278808594, "logits/rejected": -1.4607129096984863, "logps/chosen": -1041.4072265625, "logps/rejected": -677.53759765625, "loss": 0.0782, "rewards/accuracies": 1.0, "rewards/chosen": 4.735781669616699, "rewards/margins": 5.417259216308594, "rewards/rejected": -0.6814776659011841, "step": 2308 }, { "epoch": 1.6869406392694064, "grad_norm": 23.84144479149898, "learning_rate": 3.581217990212885e-07, "logits/chosen": -2.5382111072540283, "logits/rejected": -2.3123509883880615, "logps/chosen": -500.98687744140625, "logps/rejected": -447.814697265625, "loss": 0.0949, "rewards/accuracies": 1.0, "rewards/chosen": 4.018485069274902, "rewards/margins": 5.112698554992676, "rewards/rejected": -1.0942137241363525, "step": 2309 }, { "epoch": 1.6876712328767123, "grad_norm": 39.62440081122284, "learning_rate": 3.579779617406097e-07, "logits/chosen": -2.9095120429992676, "logits/rejected": -2.3799901008605957, "logps/chosen": -640.8726806640625, "logps/rejected": -614.7809448242188, "loss": 0.1853, "rewards/accuracies": 1.0, "rewards/chosen": 3.551966667175293, "rewards/margins": 4.749210357666016, "rewards/rejected": -1.1972440481185913, "step": 2310 }, { "epoch": 1.6884018264840184, "grad_norm": 38.37289412992262, "learning_rate": 3.5783408050589424e-07, "logits/chosen": -3.107166051864624, "logits/rejected": -2.1768605709075928, "logps/chosen": -801.1998901367188, "logps/rejected": -524.5145263671875, "loss": 0.1844, "rewards/accuracies": 0.875, "rewards/chosen": 2.704122543334961, "rewards/margins": 2.828260898590088, "rewards/rejected": -0.12413865327835083, "step": 2311 }, { "epoch": 1.6891324200913242, "grad_norm": 35.06360187941068, "learning_rate": 3.5769015537571084e-07, "logits/chosen": -2.262291431427002, "logits/rejected": -2.32916259765625, "logps/chosen": -457.7401428222656, "logps/rejected": -627.0181884765625, "loss": 0.2199, "rewards/accuracies": 0.875, "rewards/chosen": 1.522030234336853, "rewards/margins": 2.550327777862549, "rewards/rejected": -1.0282974243164062, "step": 2312 }, { "epoch": 1.6898630136986301, "grad_norm": 44.24208472198021, "learning_rate": 3.575461864086466e-07, "logits/chosen": -2.502401113510132, "logits/rejected": -2.33313250541687, "logps/chosen": -676.077880859375, "logps/rejected": -687.3773193359375, "loss": 0.1898, "rewards/accuracies": 0.75, "rewards/chosen": 2.955615520477295, "rewards/margins": 2.529909133911133, "rewards/rejected": 0.42570599913597107, "step": 2313 }, { "epoch": 1.690593607305936, "grad_norm": 26.09664444088969, "learning_rate": 3.5740217366330605e-07, "logits/chosen": -3.0248215198516846, "logits/rejected": -2.41204833984375, "logps/chosen": -386.35919189453125, "logps/rejected": -343.37664794921875, "loss": 0.1219, "rewards/accuracies": 1.0, "rewards/chosen": 1.9691389799118042, "rewards/margins": 3.754486322402954, "rewards/rejected": -1.7853474617004395, "step": 2314 }, { "epoch": 1.6913242009132419, "grad_norm": 31.48828197687941, "learning_rate": 3.572581171983119e-07, "logits/chosen": -2.577770233154297, "logits/rejected": -2.350635051727295, "logps/chosen": -652.2047119140625, "logps/rejected": -666.17138671875, "loss": 0.1209, "rewards/accuracies": 1.0, "rewards/chosen": 2.023244857788086, "rewards/margins": 2.7974514961242676, "rewards/rejected": -0.7742068767547607, "step": 2315 }, { "epoch": 1.692054794520548, "grad_norm": 19.803608821709272, "learning_rate": 3.5711401707230444e-07, "logits/chosen": -2.880751132965088, "logits/rejected": -3.170360565185547, "logps/chosen": -539.3441772460938, "logps/rejected": -612.3582763671875, "loss": 0.1094, "rewards/accuracies": 1.0, "rewards/chosen": 2.1250369548797607, "rewards/margins": 2.8992953300476074, "rewards/rejected": -0.7742583751678467, "step": 2316 }, { "epoch": 1.6927853881278538, "grad_norm": 19.461700387746745, "learning_rate": 3.569698733439416e-07, "logits/chosen": -2.4637715816497803, "logits/rejected": -2.631803274154663, "logps/chosen": -673.208740234375, "logps/rejected": -778.1461791992188, "loss": 0.1123, "rewards/accuracies": 1.0, "rewards/chosen": 3.642867088317871, "rewards/margins": 4.075980186462402, "rewards/rejected": -0.43311309814453125, "step": 2317 }, { "epoch": 1.69351598173516, "grad_norm": 27.431711198845672, "learning_rate": 3.568256860718995e-07, "logits/chosen": -2.5069210529327393, "logits/rejected": -2.1991934776306152, "logps/chosen": -778.306396484375, "logps/rejected": -672.8418579101562, "loss": 0.1193, "rewards/accuracies": 1.0, "rewards/chosen": 3.0831754207611084, "rewards/margins": 3.307474136352539, "rewards/rejected": -0.22429856657981873, "step": 2318 }, { "epoch": 1.6942465753424658, "grad_norm": 25.93925422343458, "learning_rate": 3.566814553148715e-07, "logits/chosen": -2.7481529712677, "logits/rejected": -1.8367493152618408, "logps/chosen": -773.0546875, "logps/rejected": -487.6950988769531, "loss": 0.1075, "rewards/accuracies": 1.0, "rewards/chosen": 4.508004188537598, "rewards/margins": 5.264559745788574, "rewards/rejected": -0.7565553188323975, "step": 2319 }, { "epoch": 1.6949771689497717, "grad_norm": 43.8643246784011, "learning_rate": 3.565371811315689e-07, "logits/chosen": -2.5940470695495605, "logits/rejected": -1.683816909790039, "logps/chosen": -557.1139526367188, "logps/rejected": -331.4630432128906, "loss": 0.2341, "rewards/accuracies": 0.75, "rewards/chosen": 1.6349358558654785, "rewards/margins": 2.5491223335266113, "rewards/rejected": -0.9141865968704224, "step": 2320 }, { "epoch": 1.6957077625570776, "grad_norm": 27.985031486576666, "learning_rate": 3.563928635807208e-07, "logits/chosen": -3.228241443634033, "logits/rejected": -2.146042823791504, "logps/chosen": -856.1868286132812, "logps/rejected": -632.9662475585938, "loss": 0.113, "rewards/accuracies": 1.0, "rewards/chosen": 4.142936706542969, "rewards/margins": 4.4621381759643555, "rewards/rejected": -0.3192019760608673, "step": 2321 }, { "epoch": 1.6964383561643834, "grad_norm": 30.921417063309452, "learning_rate": 3.5624850272107374e-07, "logits/chosen": -2.956404685974121, "logits/rejected": -2.424659252166748, "logps/chosen": -485.64801025390625, "logps/rejected": -342.9731140136719, "loss": 0.1803, "rewards/accuracies": 0.875, "rewards/chosen": 1.3778883218765259, "rewards/margins": 2.230487823486328, "rewards/rejected": -0.8525996208190918, "step": 2322 }, { "epoch": 1.6971689497716895, "grad_norm": 64.60412425761803, "learning_rate": 3.5610409861139183e-07, "logits/chosen": -2.555671453475952, "logits/rejected": -1.915686845779419, "logps/chosen": -483.1929931640625, "logps/rejected": -430.1083679199219, "loss": 0.2699, "rewards/accuracies": 1.0, "rewards/chosen": 2.5158305168151855, "rewards/margins": 2.9600296020507812, "rewards/rejected": -0.44419869780540466, "step": 2323 }, { "epoch": 1.6978995433789954, "grad_norm": 29.077219264843418, "learning_rate": 3.559596513104571e-07, "logits/chosen": -3.1365997791290283, "logits/rejected": -2.5306284427642822, "logps/chosen": -478.43218994140625, "logps/rejected": -382.9481506347656, "loss": 0.1419, "rewards/accuracies": 0.875, "rewards/chosen": 2.880995750427246, "rewards/margins": 3.894266128540039, "rewards/rejected": -1.0132702589035034, "step": 2324 }, { "epoch": 1.6986301369863015, "grad_norm": 39.99045754096766, "learning_rate": 3.558151608770688e-07, "logits/chosen": -2.620779275894165, "logits/rejected": -1.6783236265182495, "logps/chosen": -363.63824462890625, "logps/rejected": -278.3109130859375, "loss": 0.2544, "rewards/accuracies": 0.875, "rewards/chosen": 2.320507526397705, "rewards/margins": 3.5245745182037354, "rewards/rejected": -1.2040667533874512, "step": 2325 }, { "epoch": 1.6993607305936074, "grad_norm": 30.425886315857237, "learning_rate": 3.556706273700441e-07, "logits/chosen": -2.8897345066070557, "logits/rejected": -2.0706894397735596, "logps/chosen": -399.1197509765625, "logps/rejected": -260.2347106933594, "loss": 0.1775, "rewards/accuracies": 1.0, "rewards/chosen": 0.7100738286972046, "rewards/margins": 2.0645864009857178, "rewards/rejected": -1.3545124530792236, "step": 2326 }, { "epoch": 1.7000913242009132, "grad_norm": 50.25209154928902, "learning_rate": 3.5552605084821734e-07, "logits/chosen": -3.323704719543457, "logits/rejected": -2.273324489593506, "logps/chosen": -1049.6346435546875, "logps/rejected": -695.6348876953125, "loss": 0.2483, "rewards/accuracies": 1.0, "rewards/chosen": 4.208491325378418, "rewards/margins": 4.976726055145264, "rewards/rejected": -0.7682343125343323, "step": 2327 }, { "epoch": 1.7008219178082191, "grad_norm": 26.11117845170376, "learning_rate": 3.5538143137044065e-07, "logits/chosen": -3.150979518890381, "logits/rejected": -2.2976553440093994, "logps/chosen": -649.1647338867188, "logps/rejected": -463.9897155761719, "loss": 0.1433, "rewards/accuracies": 1.0, "rewards/chosen": 3.2136147022247314, "rewards/margins": 3.943080425262451, "rewards/rejected": -0.7294657230377197, "step": 2328 }, { "epoch": 1.701552511415525, "grad_norm": 31.68813947826142, "learning_rate": 3.552367689955836e-07, "logits/chosen": -2.1712565422058105, "logits/rejected": -2.470681667327881, "logps/chosen": -481.4704895019531, "logps/rejected": -670.9515380859375, "loss": 0.1588, "rewards/accuracies": 1.0, "rewards/chosen": 2.5170726776123047, "rewards/margins": 3.866377353668213, "rewards/rejected": -1.3493045568466187, "step": 2329 }, { "epoch": 1.702283105022831, "grad_norm": 31.047907082115657, "learning_rate": 3.5509206378253293e-07, "logits/chosen": -2.4648423194885254, "logits/rejected": -2.6145734786987305, "logps/chosen": -375.6138916015625, "logps/rejected": -357.9296569824219, "loss": 0.1665, "rewards/accuracies": 1.0, "rewards/chosen": 1.2846975326538086, "rewards/margins": 2.4001355171203613, "rewards/rejected": -1.1154381036758423, "step": 2330 }, { "epoch": 1.703013698630137, "grad_norm": 34.86306394365049, "learning_rate": 3.5494731579019326e-07, "logits/chosen": -2.813882350921631, "logits/rejected": -2.136490821838379, "logps/chosen": -784.083740234375, "logps/rejected": -672.8414306640625, "loss": 0.1402, "rewards/accuracies": 1.0, "rewards/chosen": 4.031522750854492, "rewards/margins": 5.150489330291748, "rewards/rejected": -1.118966817855835, "step": 2331 }, { "epoch": 1.703744292237443, "grad_norm": 35.85627539708443, "learning_rate": 3.5480252507748634e-07, "logits/chosen": -2.8995561599731445, "logits/rejected": -2.5790085792541504, "logps/chosen": -952.5374755859375, "logps/rejected": -908.8807373046875, "loss": 0.1822, "rewards/accuracies": 1.0, "rewards/chosen": 4.915082931518555, "rewards/margins": 5.686368465423584, "rewards/rejected": -0.7712851762771606, "step": 2332 }, { "epoch": 1.704474885844749, "grad_norm": 27.137884639648018, "learning_rate": 3.5465769170335145e-07, "logits/chosen": -2.6215875148773193, "logits/rejected": -2.7144455909729004, "logps/chosen": -627.7213745117188, "logps/rejected": -594.4718017578125, "loss": 0.1525, "rewards/accuracies": 1.0, "rewards/chosen": 2.5127336978912354, "rewards/margins": 2.8225693702697754, "rewards/rejected": -0.3098357915878296, "step": 2333 }, { "epoch": 1.7052054794520548, "grad_norm": 20.007321352815918, "learning_rate": 3.5451281572674507e-07, "logits/chosen": -2.7790918350219727, "logits/rejected": -2.016359567642212, "logps/chosen": -847.3368530273438, "logps/rejected": -538.5908203125, "loss": 0.0913, "rewards/accuracies": 1.0, "rewards/chosen": 3.5078976154327393, "rewards/margins": 3.860508441925049, "rewards/rejected": -0.35261070728302, "step": 2334 }, { "epoch": 1.7059360730593607, "grad_norm": 20.00691256273613, "learning_rate": 3.5436789720664116e-07, "logits/chosen": -3.266624689102173, "logits/rejected": -2.4432289600372314, "logps/chosen": -576.888427734375, "logps/rejected": -485.02923583984375, "loss": 0.1159, "rewards/accuracies": 0.875, "rewards/chosen": 3.2609829902648926, "rewards/margins": 3.4154040813446045, "rewards/rejected": -0.1544211506843567, "step": 2335 }, { "epoch": 1.7066666666666666, "grad_norm": 43.133373564318646, "learning_rate": 3.5422293620203103e-07, "logits/chosen": -2.766026496887207, "logits/rejected": -2.5251245498657227, "logps/chosen": -740.9315795898438, "logps/rejected": -679.0115966796875, "loss": 0.2183, "rewards/accuracies": 1.0, "rewards/chosen": 2.1808323860168457, "rewards/margins": 2.475088119506836, "rewards/rejected": -0.2942558526992798, "step": 2336 }, { "epoch": 1.7073972602739727, "grad_norm": 17.280888862354473, "learning_rate": 3.540779327719231e-07, "logits/chosen": -2.4267215728759766, "logits/rejected": -2.233579158782959, "logps/chosen": -540.4105224609375, "logps/rejected": -635.0748901367188, "loss": 0.0882, "rewards/accuracies": 1.0, "rewards/chosen": 3.6874234676361084, "rewards/margins": 3.4262876510620117, "rewards/rejected": 0.2611355185508728, "step": 2337 }, { "epoch": 1.7081278538812785, "grad_norm": 26.365723338977446, "learning_rate": 3.539328869753432e-07, "logits/chosen": -3.4833664894104004, "logits/rejected": -2.458592414855957, "logps/chosen": -391.69793701171875, "logps/rejected": -358.0077819824219, "loss": 0.1446, "rewards/accuracies": 1.0, "rewards/chosen": 2.914090633392334, "rewards/margins": 4.78006649017334, "rewards/rejected": -1.8659758567810059, "step": 2338 }, { "epoch": 1.7088584474885846, "grad_norm": 37.623496193473464, "learning_rate": 3.537877988713345e-07, "logits/chosen": -2.421665668487549, "logits/rejected": -2.7426929473876953, "logps/chosen": -633.7322387695312, "logps/rejected": -828.7554321289062, "loss": 0.2029, "rewards/accuracies": 0.875, "rewards/chosen": 2.4170942306518555, "rewards/margins": 2.367305278778076, "rewards/rejected": 0.04978868365287781, "step": 2339 }, { "epoch": 1.7095890410958905, "grad_norm": 26.635847720488687, "learning_rate": 3.536426685189572e-07, "logits/chosen": -2.567338466644287, "logits/rejected": -1.506895661354065, "logps/chosen": -413.1332092285156, "logps/rejected": -269.6418762207031, "loss": 0.1354, "rewards/accuracies": 1.0, "rewards/chosen": 3.9501876831054688, "rewards/margins": 5.16280460357666, "rewards/rejected": -1.2126169204711914, "step": 2340 }, { "epoch": 1.7103196347031964, "grad_norm": 15.927353924219295, "learning_rate": 3.5349749597728867e-07, "logits/chosen": -3.3849620819091797, "logits/rejected": -2.575310707092285, "logps/chosen": -743.0955200195312, "logps/rejected": -621.9866333007812, "loss": 0.0747, "rewards/accuracies": 1.0, "rewards/chosen": 3.310932159423828, "rewards/margins": 3.930077075958252, "rewards/rejected": -0.6191450357437134, "step": 2341 }, { "epoch": 1.7110502283105022, "grad_norm": 32.21017860038578, "learning_rate": 3.5335228130542366e-07, "logits/chosen": -2.7695655822753906, "logits/rejected": -2.804241180419922, "logps/chosen": -672.1273193359375, "logps/rejected": -558.120849609375, "loss": 0.1851, "rewards/accuracies": 1.0, "rewards/chosen": 2.87270188331604, "rewards/margins": 2.370842933654785, "rewards/rejected": 0.5018590688705444, "step": 2342 }, { "epoch": 1.7117808219178081, "grad_norm": 24.06534835008497, "learning_rate": 3.5320702456247395e-07, "logits/chosen": -2.2850918769836426, "logits/rejected": -1.784809947013855, "logps/chosen": -497.7953796386719, "logps/rejected": -443.48541259765625, "loss": 0.109, "rewards/accuracies": 1.0, "rewards/chosen": 3.109781503677368, "rewards/margins": 3.9894638061523438, "rewards/rejected": -0.8796822428703308, "step": 2343 }, { "epoch": 1.712511415525114, "grad_norm": 60.11232568466096, "learning_rate": 3.5306172580756854e-07, "logits/chosen": -2.5520663261413574, "logits/rejected": -1.3790651559829712, "logps/chosen": -736.1730346679688, "logps/rejected": -543.8162231445312, "loss": 0.2893, "rewards/accuracies": 1.0, "rewards/chosen": 4.6072797775268555, "rewards/margins": 5.425288200378418, "rewards/rejected": -0.8180090188980103, "step": 2344 }, { "epoch": 1.71324200913242, "grad_norm": 26.262320397410623, "learning_rate": 3.529163850998533e-07, "logits/chosen": -2.7749032974243164, "logits/rejected": -2.0988056659698486, "logps/chosen": -782.33251953125, "logps/rejected": -688.9988403320312, "loss": 0.143, "rewards/accuracies": 1.0, "rewards/chosen": 3.921210527420044, "rewards/margins": 3.972107172012329, "rewards/rejected": -0.05089661478996277, "step": 2345 }, { "epoch": 1.7139726027397262, "grad_norm": 24.653828897290477, "learning_rate": 3.527710024984914e-07, "logits/chosen": -3.1158392429351807, "logits/rejected": -2.4053263664245605, "logps/chosen": -827.9395751953125, "logps/rejected": -631.871337890625, "loss": 0.1384, "rewards/accuracies": 1.0, "rewards/chosen": 3.9238510131835938, "rewards/margins": 4.560293197631836, "rewards/rejected": -0.6364421844482422, "step": 2346 }, { "epoch": 1.714703196347032, "grad_norm": 25.95619972567669, "learning_rate": 3.5262557806266297e-07, "logits/chosen": -2.6996705532073975, "logits/rejected": -1.9517021179199219, "logps/chosen": -493.70074462890625, "logps/rejected": -511.1190185546875, "loss": 0.1395, "rewards/accuracies": 1.0, "rewards/chosen": 3.625702381134033, "rewards/margins": 3.223975658416748, "rewards/rejected": 0.4017266035079956, "step": 2347 }, { "epoch": 1.715433789954338, "grad_norm": 51.21286049497846, "learning_rate": 3.5248011185156523e-07, "logits/chosen": -2.9625964164733887, "logits/rejected": -2.1672708988189697, "logps/chosen": -845.4822387695312, "logps/rejected": -615.9755249023438, "loss": 0.2292, "rewards/accuracies": 0.875, "rewards/chosen": 3.564505100250244, "rewards/margins": 2.782252311706543, "rewards/rejected": 0.782253086566925, "step": 2348 }, { "epoch": 1.7161643835616438, "grad_norm": 51.88934760171157, "learning_rate": 3.5233460392441227e-07, "logits/chosen": -2.965852737426758, "logits/rejected": -1.8320879936218262, "logps/chosen": -818.9896240234375, "logps/rejected": -475.844482421875, "loss": 0.2338, "rewards/accuracies": 1.0, "rewards/chosen": 3.4422249794006348, "rewards/margins": 3.963223934173584, "rewards/rejected": -0.5209990739822388, "step": 2349 }, { "epoch": 1.7168949771689497, "grad_norm": 17.491213963529482, "learning_rate": 3.5218905434043545e-07, "logits/chosen": -2.654055595397949, "logits/rejected": -2.1129305362701416, "logps/chosen": -617.44970703125, "logps/rejected": -493.7103271484375, "loss": 0.0771, "rewards/accuracies": 1.0, "rewards/chosen": 3.827996253967285, "rewards/margins": 4.549893379211426, "rewards/rejected": -0.7218974232673645, "step": 2350 }, { "epoch": 1.7176255707762556, "grad_norm": 23.161744983141112, "learning_rate": 3.520434631588827e-07, "logits/chosen": -2.570497989654541, "logits/rejected": -2.8242745399475098, "logps/chosen": -575.6602172851562, "logps/rejected": -601.1800537109375, "loss": 0.1384, "rewards/accuracies": 1.0, "rewards/chosen": 2.4773285388946533, "rewards/margins": 3.482868194580078, "rewards/rejected": -1.0055397748947144, "step": 2351 }, { "epoch": 1.7183561643835616, "grad_norm": 26.486166744332667, "learning_rate": 3.518978304390192e-07, "logits/chosen": -2.698535680770874, "logits/rejected": -2.0930356979370117, "logps/chosen": -562.537841796875, "logps/rejected": -407.24609375, "loss": 0.0977, "rewards/accuracies": 1.0, "rewards/chosen": 3.5414772033691406, "rewards/margins": 4.783266067504883, "rewards/rejected": -1.241788387298584, "step": 2352 }, { "epoch": 1.7190867579908677, "grad_norm": 36.94829183509368, "learning_rate": 3.517521562401269e-07, "logits/chosen": -3.469115734100342, "logits/rejected": -2.3112876415252686, "logps/chosen": -693.9241333007812, "logps/rejected": -509.84423828125, "loss": 0.2101, "rewards/accuracies": 1.0, "rewards/chosen": 3.570617437362671, "rewards/margins": 5.189385890960693, "rewards/rejected": -1.618768572807312, "step": 2353 }, { "epoch": 1.7198173515981736, "grad_norm": 45.85256817030536, "learning_rate": 3.5160644062150456e-07, "logits/chosen": -2.9866104125976562, "logits/rejected": -2.1052703857421875, "logps/chosen": -447.931884765625, "logps/rejected": -391.6981201171875, "loss": 0.2393, "rewards/accuracies": 1.0, "rewards/chosen": 2.7046589851379395, "rewards/margins": 4.443159580230713, "rewards/rejected": -1.7385010719299316, "step": 2354 }, { "epoch": 1.7205479452054795, "grad_norm": 49.937553778131246, "learning_rate": 3.5146068364246797e-07, "logits/chosen": -2.5176775455474854, "logits/rejected": -2.3658649921417236, "logps/chosen": -303.7145080566406, "logps/rejected": -374.1540222167969, "loss": 0.3192, "rewards/accuracies": 0.75, "rewards/chosen": 0.6221657395362854, "rewards/margins": 1.7186368703842163, "rewards/rejected": -1.0964710712432861, "step": 2355 }, { "epoch": 1.7212785388127854, "grad_norm": 40.372108001591734, "learning_rate": 3.5131488536234966e-07, "logits/chosen": -3.0460286140441895, "logits/rejected": -2.2314682006835938, "logps/chosen": -1197.47412109375, "logps/rejected": -748.5537109375, "loss": 0.2708, "rewards/accuracies": 1.0, "rewards/chosen": 4.343299865722656, "rewards/margins": 3.159627914428711, "rewards/rejected": 1.1836724281311035, "step": 2356 }, { "epoch": 1.7220091324200912, "grad_norm": 26.797922720042592, "learning_rate": 3.51169045840499e-07, "logits/chosen": -2.812955379486084, "logits/rejected": -2.151453733444214, "logps/chosen": -814.5652465820312, "logps/rejected": -659.8865356445312, "loss": 0.1156, "rewards/accuracies": 1.0, "rewards/chosen": 3.9371180534362793, "rewards/margins": 4.469151973724365, "rewards/rejected": -0.5320334434509277, "step": 2357 }, { "epoch": 1.7227397260273971, "grad_norm": 38.42948817670406, "learning_rate": 3.510231651362821e-07, "logits/chosen": -2.9482052326202393, "logits/rejected": -2.196183443069458, "logps/chosen": -719.1666870117188, "logps/rejected": -410.6540832519531, "loss": 0.1881, "rewards/accuracies": 1.0, "rewards/chosen": 3.7151830196380615, "rewards/margins": 4.490055561065674, "rewards/rejected": -0.7748728394508362, "step": 2358 }, { "epoch": 1.7234703196347032, "grad_norm": 19.581865664196172, "learning_rate": 3.50877243309082e-07, "logits/chosen": -2.6144423484802246, "logits/rejected": -2.514406204223633, "logps/chosen": -430.226318359375, "logps/rejected": -576.7243041992188, "loss": 0.0938, "rewards/accuracies": 1.0, "rewards/chosen": 2.2587227821350098, "rewards/margins": 4.636971473693848, "rewards/rejected": -2.378248691558838, "step": 2359 }, { "epoch": 1.724200913242009, "grad_norm": 24.41383624895164, "learning_rate": 3.507312804182981e-07, "logits/chosen": -2.9924004077911377, "logits/rejected": -2.5796709060668945, "logps/chosen": -347.78045654296875, "logps/rejected": -329.9278259277344, "loss": 0.1557, "rewards/accuracies": 1.0, "rewards/chosen": 1.4832959175109863, "rewards/margins": 2.3376219272613525, "rewards/rejected": -0.854326069355011, "step": 2360 }, { "epoch": 1.7249315068493152, "grad_norm": 22.90235674195862, "learning_rate": 3.5058527652334707e-07, "logits/chosen": -3.0159106254577637, "logits/rejected": -2.535883903503418, "logps/chosen": -628.86767578125, "logps/rejected": -478.8781433105469, "loss": 0.1, "rewards/accuracies": 1.0, "rewards/chosen": 3.44144606590271, "rewards/margins": 4.316627025604248, "rewards/rejected": -0.8751810789108276, "step": 2361 }, { "epoch": 1.725662100456621, "grad_norm": 21.5170929850513, "learning_rate": 3.504392316836618e-07, "logits/chosen": -2.9162940979003906, "logits/rejected": -2.2147741317749023, "logps/chosen": -669.8714599609375, "logps/rejected": -528.4056396484375, "loss": 0.1267, "rewards/accuracies": 1.0, "rewards/chosen": 3.2233829498291016, "rewards/margins": 4.635037422180176, "rewards/rejected": -1.4116543531417847, "step": 2362 }, { "epoch": 1.726392694063927, "grad_norm": 24.543401150595702, "learning_rate": 3.5029314595869203e-07, "logits/chosen": -2.3964192867279053, "logits/rejected": -2.126957893371582, "logps/chosen": -742.0023193359375, "logps/rejected": -558.0174560546875, "loss": 0.1413, "rewards/accuracies": 1.0, "rewards/chosen": 3.8649933338165283, "rewards/margins": 3.72465443611145, "rewards/rejected": 0.14033900201320648, "step": 2363 }, { "epoch": 1.7271232876712328, "grad_norm": 37.909919378767974, "learning_rate": 3.5014701940790416e-07, "logits/chosen": -3.1824398040771484, "logits/rejected": -2.275740146636963, "logps/chosen": -664.7127685546875, "logps/rejected": -523.2274169921875, "loss": 0.1726, "rewards/accuracies": 0.875, "rewards/chosen": 3.6639184951782227, "rewards/margins": 4.114143371582031, "rewards/rejected": -0.4502250850200653, "step": 2364 }, { "epoch": 1.7278538812785387, "grad_norm": 26.460398900261076, "learning_rate": 3.500008520907811e-07, "logits/chosen": -2.6746249198913574, "logits/rejected": -2.7123594284057617, "logps/chosen": -657.7763671875, "logps/rejected": -678.354248046875, "loss": 0.1407, "rewards/accuracies": 1.0, "rewards/chosen": 2.832397937774658, "rewards/margins": 4.4123969078063965, "rewards/rejected": -1.5799989700317383, "step": 2365 }, { "epoch": 1.7285844748858448, "grad_norm": 37.91134581627989, "learning_rate": 3.4985464406682247e-07, "logits/chosen": -2.555809736251831, "logits/rejected": -2.466496706008911, "logps/chosen": -398.01837158203125, "logps/rejected": -357.8701477050781, "loss": 0.1817, "rewards/accuracies": 0.875, "rewards/chosen": 1.632291555404663, "rewards/margins": 2.242689609527588, "rewards/rejected": -0.6103982329368591, "step": 2366 }, { "epoch": 1.7293150684931506, "grad_norm": 56.49592724189838, "learning_rate": 3.4970839539554446e-07, "logits/chosen": -2.7892343997955322, "logits/rejected": -2.34649920463562, "logps/chosen": -729.6200561523438, "logps/rejected": -750.9229125976562, "loss": 0.2217, "rewards/accuracies": 0.875, "rewards/chosen": 2.998178482055664, "rewards/margins": 3.7225544452667236, "rewards/rejected": -0.7243756055831909, "step": 2367 }, { "epoch": 1.7300456621004567, "grad_norm": 57.60949154110805, "learning_rate": 3.495621061364798e-07, "logits/chosen": -2.7007699012756348, "logits/rejected": -1.7387605905532837, "logps/chosen": -549.2023315429688, "logps/rejected": -447.7432861328125, "loss": 0.3442, "rewards/accuracies": 1.0, "rewards/chosen": 3.3614161014556885, "rewards/margins": 4.663400173187256, "rewards/rejected": -1.3019840717315674, "step": 2368 }, { "epoch": 1.7307762557077626, "grad_norm": 40.29258834799832, "learning_rate": 3.494157763491776e-07, "logits/chosen": -2.957930326461792, "logits/rejected": -1.4921871423721313, "logps/chosen": -696.2994384765625, "logps/rejected": -336.3323669433594, "loss": 0.2169, "rewards/accuracies": 1.0, "rewards/chosen": 4.688338756561279, "rewards/margins": 6.641057014465332, "rewards/rejected": -1.9527182579040527, "step": 2369 }, { "epoch": 1.7315068493150685, "grad_norm": 33.485437672708, "learning_rate": 3.4926940609320377e-07, "logits/chosen": -3.2203102111816406, "logits/rejected": -2.1580917835235596, "logps/chosen": -750.4131469726562, "logps/rejected": -458.566162109375, "loss": 0.1558, "rewards/accuracies": 0.875, "rewards/chosen": 3.0898804664611816, "rewards/margins": 2.915134906768799, "rewards/rejected": 0.17474527657032013, "step": 2370 }, { "epoch": 1.7322374429223744, "grad_norm": 36.31377572791333, "learning_rate": 3.491229954281402e-07, "logits/chosen": -2.7940926551818848, "logits/rejected": -2.2669990062713623, "logps/chosen": -555.7064208984375, "logps/rejected": -511.579833984375, "loss": 0.1808, "rewards/accuracies": 1.0, "rewards/chosen": 3.436734437942505, "rewards/margins": 4.905697345733643, "rewards/rejected": -1.4689630270004272, "step": 2371 }, { "epoch": 1.7329680365296802, "grad_norm": 33.510534651318046, "learning_rate": 3.489765444135858e-07, "logits/chosen": -2.570969343185425, "logits/rejected": -2.347275495529175, "logps/chosen": -759.968017578125, "logps/rejected": -678.5217895507812, "loss": 0.1678, "rewards/accuracies": 0.875, "rewards/chosen": 3.639139413833618, "rewards/margins": 4.678478240966797, "rewards/rejected": -1.0393385887145996, "step": 2372 }, { "epoch": 1.7336986301369863, "grad_norm": 25.033940583782325, "learning_rate": 3.4883005310915546e-07, "logits/chosen": -3.1574463844299316, "logits/rejected": -2.377107620239258, "logps/chosen": -995.9912109375, "logps/rejected": -701.7293090820312, "loss": 0.0943, "rewards/accuracies": 1.0, "rewards/chosen": 3.3706440925598145, "rewards/margins": 3.3314599990844727, "rewards/rejected": 0.03918418288230896, "step": 2373 }, { "epoch": 1.7344292237442922, "grad_norm": 48.04295461648097, "learning_rate": 3.4868352157448086e-07, "logits/chosen": -2.4804673194885254, "logits/rejected": -2.40757417678833, "logps/chosen": -448.2281188964844, "logps/rejected": -410.75225830078125, "loss": 0.2594, "rewards/accuracies": 0.75, "rewards/chosen": 2.2745585441589355, "rewards/margins": 3.478139877319336, "rewards/rejected": -1.2035810947418213, "step": 2374 }, { "epoch": 1.7351598173515983, "grad_norm": 36.168849427999675, "learning_rate": 3.485369498692096e-07, "logits/chosen": -2.7063307762145996, "logits/rejected": -2.1312708854675293, "logps/chosen": -734.1387939453125, "logps/rejected": -560.93603515625, "loss": 0.1722, "rewards/accuracies": 1.0, "rewards/chosen": 1.8063359260559082, "rewards/margins": 2.688547134399414, "rewards/rejected": -0.8822112083435059, "step": 2375 }, { "epoch": 1.7358904109589042, "grad_norm": 52.337805166145195, "learning_rate": 3.48390338053006e-07, "logits/chosen": -2.704636573791504, "logits/rejected": -1.8249965906143188, "logps/chosen": -654.5361328125, "logps/rejected": -386.79498291015625, "loss": 0.3037, "rewards/accuracies": 0.875, "rewards/chosen": 0.804867148399353, "rewards/margins": 1.874713659286499, "rewards/rejected": -1.0698463916778564, "step": 2376 }, { "epoch": 1.73662100456621, "grad_norm": 34.92570413850396, "learning_rate": 3.4824368618555054e-07, "logits/chosen": -2.7649030685424805, "logits/rejected": -2.7766592502593994, "logps/chosen": -795.2744140625, "logps/rejected": -861.177734375, "loss": 0.1875, "rewards/accuracies": 1.0, "rewards/chosen": 2.859217405319214, "rewards/margins": 3.1021554470062256, "rewards/rejected": -0.24293816089630127, "step": 2377 }, { "epoch": 1.737351598173516, "grad_norm": 38.16587684402702, "learning_rate": 3.4809699432654015e-07, "logits/chosen": -2.9507875442504883, "logits/rejected": -2.386223554611206, "logps/chosen": -693.0694580078125, "logps/rejected": -685.882080078125, "loss": 0.1894, "rewards/accuracies": 0.75, "rewards/chosen": 3.4918737411499023, "rewards/margins": 2.5000925064086914, "rewards/rejected": 0.99178147315979, "step": 2378 }, { "epoch": 1.7380821917808218, "grad_norm": 26.284923754823314, "learning_rate": 3.479502625356878e-07, "logits/chosen": -3.5184998512268066, "logits/rejected": -2.8794069290161133, "logps/chosen": -610.2772827148438, "logps/rejected": -482.6055908203125, "loss": 0.168, "rewards/accuracies": 0.875, "rewards/chosen": 2.1717734336853027, "rewards/margins": 1.9890062808990479, "rewards/rejected": 0.18276730179786682, "step": 2379 }, { "epoch": 1.738812785388128, "grad_norm": 26.922182193183538, "learning_rate": 3.478034908727229e-07, "logits/chosen": -2.8534674644470215, "logits/rejected": -1.5066628456115723, "logps/chosen": -533.55615234375, "logps/rejected": -280.8845520019531, "loss": 0.1012, "rewards/accuracies": 1.0, "rewards/chosen": 3.8653035163879395, "rewards/margins": 6.298896789550781, "rewards/rejected": -2.4335930347442627, "step": 2380 }, { "epoch": 1.7395433789954338, "grad_norm": 42.3004448761328, "learning_rate": 3.476566793973911e-07, "logits/chosen": -2.82755184173584, "logits/rejected": -2.1890487670898438, "logps/chosen": -534.2855224609375, "logps/rejected": -402.63775634765625, "loss": 0.2287, "rewards/accuracies": 0.875, "rewards/chosen": 1.2100329399108887, "rewards/margins": 1.459854245185852, "rewards/rejected": -0.24982133507728577, "step": 2381 }, { "epoch": 1.7402739726027399, "grad_norm": 39.08111275313738, "learning_rate": 3.475098281694541e-07, "logits/chosen": -3.362955093383789, "logits/rejected": -2.678443193435669, "logps/chosen": -800.8054809570312, "logps/rejected": -694.5966796875, "loss": 0.172, "rewards/accuracies": 1.0, "rewards/chosen": 3.913975715637207, "rewards/margins": 4.2265143394470215, "rewards/rejected": -0.3125385046005249, "step": 2382 }, { "epoch": 1.7410045662100457, "grad_norm": 29.58224556942369, "learning_rate": 3.473629372486899e-07, "logits/chosen": -2.5004501342773438, "logits/rejected": -1.9824285507202148, "logps/chosen": -816.2023315429688, "logps/rejected": -573.46044921875, "loss": 0.1472, "rewards/accuracies": 1.0, "rewards/chosen": 2.7105987071990967, "rewards/margins": 3.732029914855957, "rewards/rejected": -1.0214309692382812, "step": 2383 }, { "epoch": 1.7417351598173516, "grad_norm": 30.073656323890848, "learning_rate": 3.472160066948927e-07, "logits/chosen": -2.7828800678253174, "logits/rejected": -1.5014792680740356, "logps/chosen": -490.5834045410156, "logps/rejected": -275.8319091796875, "loss": 0.1462, "rewards/accuracies": 1.0, "rewards/chosen": 2.5130906105041504, "rewards/margins": 4.62710428237915, "rewards/rejected": -2.114013433456421, "step": 2384 }, { "epoch": 1.7424657534246575, "grad_norm": 22.59887940996052, "learning_rate": 3.4706903656787275e-07, "logits/chosen": -2.968139171600342, "logits/rejected": -2.448350667953491, "logps/chosen": -621.9849243164062, "logps/rejected": -526.870849609375, "loss": 0.1196, "rewards/accuracies": 1.0, "rewards/chosen": 3.241501569747925, "rewards/margins": 3.8036012649536133, "rewards/rejected": -0.562099814414978, "step": 2385 }, { "epoch": 1.7431963470319634, "grad_norm": 45.14995225331432, "learning_rate": 3.469220269274563e-07, "logits/chosen": -2.5077342987060547, "logits/rejected": -2.7284293174743652, "logps/chosen": -772.4532470703125, "logps/rejected": -847.1542358398438, "loss": 0.2294, "rewards/accuracies": 1.0, "rewards/chosen": 3.7712717056274414, "rewards/margins": 3.7245187759399414, "rewards/rejected": 0.04675315320491791, "step": 2386 }, { "epoch": 1.7439269406392695, "grad_norm": 18.505954093590056, "learning_rate": 3.46774977833486e-07, "logits/chosen": -2.1432387828826904, "logits/rejected": -2.3133015632629395, "logps/chosen": -484.6473693847656, "logps/rejected": -673.22119140625, "loss": 0.0981, "rewards/accuracies": 0.875, "rewards/chosen": 0.5040010809898376, "rewards/margins": 2.2013003826141357, "rewards/rejected": -1.6972992420196533, "step": 2387 }, { "epoch": 1.7446575342465753, "grad_norm": 36.134828491057284, "learning_rate": 3.466278893458203e-07, "logits/chosen": -2.7228987216949463, "logits/rejected": -2.1129531860351562, "logps/chosen": -913.8060302734375, "logps/rejected": -614.1876220703125, "loss": 0.2475, "rewards/accuracies": 1.0, "rewards/chosen": 4.855646133422852, "rewards/margins": 4.870180606842041, "rewards/rejected": -0.014534726738929749, "step": 2388 }, { "epoch": 1.7453881278538814, "grad_norm": 35.87744302439557, "learning_rate": 3.464807615243337e-07, "logits/chosen": -3.527933359146118, "logits/rejected": -2.2970542907714844, "logps/chosen": -818.1932983398438, "logps/rejected": -458.6772155761719, "loss": 0.2335, "rewards/accuracies": 0.875, "rewards/chosen": 2.5252113342285156, "rewards/margins": 2.8431105613708496, "rewards/rejected": -0.31789904832839966, "step": 2389 }, { "epoch": 1.7461187214611873, "grad_norm": 16.84952051403049, "learning_rate": 3.463335944289168e-07, "logits/chosen": -2.5272040367126465, "logits/rejected": -2.5082898139953613, "logps/chosen": -479.81121826171875, "logps/rejected": -525.2252807617188, "loss": 0.0679, "rewards/accuracies": 1.0, "rewards/chosen": 3.559192657470703, "rewards/margins": 4.3814802169799805, "rewards/rejected": -0.8222875595092773, "step": 2390 }, { "epoch": 1.7468493150684932, "grad_norm": 35.005848423201016, "learning_rate": 3.461863881194762e-07, "logits/chosen": -2.9999783039093018, "logits/rejected": -2.211663007736206, "logps/chosen": -613.4734497070312, "logps/rejected": -493.3555603027344, "loss": 0.1453, "rewards/accuracies": 0.875, "rewards/chosen": 3.0210299491882324, "rewards/margins": 4.005263805389404, "rewards/rejected": -0.9842337965965271, "step": 2391 }, { "epoch": 1.747579908675799, "grad_norm": 38.99535009589685, "learning_rate": 3.4603914265593443e-07, "logits/chosen": -2.8409533500671387, "logits/rejected": -1.7147185802459717, "logps/chosen": -467.1488037109375, "logps/rejected": -250.57217407226562, "loss": 0.1873, "rewards/accuracies": 0.875, "rewards/chosen": 2.0896997451782227, "rewards/margins": 3.3473434448242188, "rewards/rejected": -1.257643699645996, "step": 2392 }, { "epoch": 1.748310502283105, "grad_norm": 28.276980858719252, "learning_rate": 3.4589185809822983e-07, "logits/chosen": -2.414165496826172, "logits/rejected": -1.932863473892212, "logps/chosen": -532.4133911132812, "logps/rejected": -360.59075927734375, "loss": 0.1945, "rewards/accuracies": 1.0, "rewards/chosen": 1.9800602197647095, "rewards/margins": 3.063586950302124, "rewards/rejected": -1.083526849746704, "step": 2393 }, { "epoch": 1.749041095890411, "grad_norm": 26.560595060369742, "learning_rate": 3.45744534506317e-07, "logits/chosen": -2.2883362770080566, "logits/rejected": -2.2603189945220947, "logps/chosen": -376.8776550292969, "logps/rejected": -434.4967346191406, "loss": 0.1495, "rewards/accuracies": 0.875, "rewards/chosen": 1.1847132444381714, "rewards/margins": 2.7397568225860596, "rewards/rejected": -1.5550435781478882, "step": 2394 }, { "epoch": 1.7497716894977169, "grad_norm": 37.78412707459393, "learning_rate": 3.455971719401659e-07, "logits/chosen": -2.4841244220733643, "logits/rejected": -2.127084732055664, "logps/chosen": -688.282470703125, "logps/rejected": -696.13525390625, "loss": 0.1589, "rewards/accuracies": 1.0, "rewards/chosen": 3.7691073417663574, "rewards/margins": 3.8579087257385254, "rewards/rejected": -0.08880159258842468, "step": 2395 }, { "epoch": 1.750502283105023, "grad_norm": 36.09969174578399, "learning_rate": 3.454497704597629e-07, "logits/chosen": -2.6192069053649902, "logits/rejected": -2.2379167079925537, "logps/chosen": -509.735107421875, "logps/rejected": -567.0131225585938, "loss": 0.1718, "rewards/accuracies": 0.875, "rewards/chosen": 1.406280755996704, "rewards/margins": 2.2142691612243652, "rewards/rejected": -0.8079885244369507, "step": 2396 }, { "epoch": 1.7512328767123289, "grad_norm": 35.423000161331444, "learning_rate": 3.453023301251098e-07, "logits/chosen": -2.9893267154693604, "logits/rejected": -2.3009843826293945, "logps/chosen": -680.7882080078125, "logps/rejected": -539.7552490234375, "loss": 0.1435, "rewards/accuracies": 0.875, "rewards/chosen": 3.742851734161377, "rewards/margins": 4.367298126220703, "rewards/rejected": -0.62444669008255, "step": 2397 }, { "epoch": 1.7519634703196347, "grad_norm": 52.79337209081032, "learning_rate": 3.451548509962246e-07, "logits/chosen": -2.4382944107055664, "logits/rejected": -1.8759799003601074, "logps/chosen": -369.64544677734375, "logps/rejected": -306.1518249511719, "loss": 0.2646, "rewards/accuracies": 1.0, "rewards/chosen": 3.2739133834838867, "rewards/margins": 4.629121780395508, "rewards/rejected": -1.3552087545394897, "step": 2398 }, { "epoch": 1.7526940639269406, "grad_norm": 56.15655649958093, "learning_rate": 3.450073331331406e-07, "logits/chosen": -2.6468231678009033, "logits/rejected": -2.054455041885376, "logps/chosen": -520.0426025390625, "logps/rejected": -309.76690673828125, "loss": 0.2135, "rewards/accuracies": 0.875, "rewards/chosen": 2.16399884223938, "rewards/margins": 3.4332478046417236, "rewards/rejected": -1.2692489624023438, "step": 2399 }, { "epoch": 1.7534246575342465, "grad_norm": 29.487821489742803, "learning_rate": 3.448597765959074e-07, "logits/chosen": -2.7576348781585693, "logits/rejected": -2.4639649391174316, "logps/chosen": -795.8563842773438, "logps/rejected": -665.27490234375, "loss": 0.1426, "rewards/accuracies": 1.0, "rewards/chosen": 4.425223350524902, "rewards/margins": 4.509374618530273, "rewards/rejected": -0.08415153622627258, "step": 2400 }, { "epoch": 1.7541552511415524, "grad_norm": 42.65553170182766, "learning_rate": 3.447121814445898e-07, "logits/chosen": -2.916914224624634, "logits/rejected": -2.55458664894104, "logps/chosen": -898.702880859375, "logps/rejected": -863.0145874023438, "loss": 0.1655, "rewards/accuracies": 1.0, "rewards/chosen": 4.641751289367676, "rewards/margins": 6.008576393127441, "rewards/rejected": -1.3668251037597656, "step": 2401 }, { "epoch": 1.7548858447488584, "grad_norm": 37.540983464117254, "learning_rate": 3.445645477392689e-07, "logits/chosen": -2.506659507751465, "logits/rejected": -2.439673900604248, "logps/chosen": -410.2625732421875, "logps/rejected": -393.14031982421875, "loss": 0.217, "rewards/accuracies": 0.875, "rewards/chosen": 1.8031774759292603, "rewards/margins": 2.1445465087890625, "rewards/rejected": -0.3413691520690918, "step": 2402 }, { "epoch": 1.7556164383561645, "grad_norm": 42.355900874225156, "learning_rate": 3.4441687554004105e-07, "logits/chosen": -2.942671775817871, "logits/rejected": -2.0338118076324463, "logps/chosen": -553.5812377929688, "logps/rejected": -327.82501220703125, "loss": 0.2819, "rewards/accuracies": 0.75, "rewards/chosen": 1.7451469898223877, "rewards/margins": 2.4924473762512207, "rewards/rejected": -0.747300386428833, "step": 2403 }, { "epoch": 1.7563470319634704, "grad_norm": 31.137244822506677, "learning_rate": 3.4426916490701843e-07, "logits/chosen": -3.105120897293091, "logits/rejected": -2.4337031841278076, "logps/chosen": -666.7027587890625, "logps/rejected": -519.1797485351562, "loss": 0.2153, "rewards/accuracies": 0.875, "rewards/chosen": 3.6168274879455566, "rewards/margins": 3.2360968589782715, "rewards/rejected": 0.3807307481765747, "step": 2404 }, { "epoch": 1.7570776255707763, "grad_norm": 44.41357120990371, "learning_rate": 3.4412141590032883e-07, "logits/chosen": -2.4216628074645996, "logits/rejected": -1.784422516822815, "logps/chosen": -619.5413818359375, "logps/rejected": -507.65435791015625, "loss": 0.2328, "rewards/accuracies": 0.875, "rewards/chosen": 2.442995548248291, "rewards/margins": 3.717470169067383, "rewards/rejected": -1.2744746208190918, "step": 2405 }, { "epoch": 1.7578082191780822, "grad_norm": 26.40969952505909, "learning_rate": 3.4397362858011567e-07, "logits/chosen": -2.6937026977539062, "logits/rejected": -2.3912582397460938, "logps/chosen": -608.6815185546875, "logps/rejected": -553.4418334960938, "loss": 0.1465, "rewards/accuracies": 0.875, "rewards/chosen": 3.4647693634033203, "rewards/margins": 3.2718093395233154, "rewards/rejected": 0.192959725856781, "step": 2406 }, { "epoch": 1.758538812785388, "grad_norm": 25.38131800578799, "learning_rate": 3.438258030065381e-07, "logits/chosen": -2.961935043334961, "logits/rejected": -2.4546329975128174, "logps/chosen": -819.42041015625, "logps/rejected": -579.3422241210938, "loss": 0.141, "rewards/accuracies": 1.0, "rewards/chosen": 1.8245872259140015, "rewards/margins": 2.171571731567383, "rewards/rejected": -0.3469845652580261, "step": 2407 }, { "epoch": 1.759269406392694, "grad_norm": 45.22900712968946, "learning_rate": 3.436779392397706e-07, "logits/chosen": -2.7669217586517334, "logits/rejected": -2.14823055267334, "logps/chosen": -422.4787292480469, "logps/rejected": -277.3609619140625, "loss": 0.2855, "rewards/accuracies": 0.875, "rewards/chosen": 1.6836915016174316, "rewards/margins": 2.5631930828094482, "rewards/rejected": -0.8795017004013062, "step": 2408 }, { "epoch": 1.76, "grad_norm": 23.432977428238686, "learning_rate": 3.4353003734000335e-07, "logits/chosen": -3.0940351486206055, "logits/rejected": -1.6902104616165161, "logps/chosen": -884.714111328125, "logps/rejected": -485.92840576171875, "loss": 0.1479, "rewards/accuracies": 1.0, "rewards/chosen": 4.702445983886719, "rewards/margins": 6.006263256072998, "rewards/rejected": -1.3038171529769897, "step": 2409 }, { "epoch": 1.7607305936073059, "grad_norm": 30.688152360683105, "learning_rate": 3.433820973674421e-07, "logits/chosen": -2.8560588359832764, "logits/rejected": -1.8713160753250122, "logps/chosen": -368.8000793457031, "logps/rejected": -280.2232666015625, "loss": 0.1696, "rewards/accuracies": 1.0, "rewards/chosen": 2.1746859550476074, "rewards/margins": 4.248930931091309, "rewards/rejected": -2.074244737625122, "step": 2410 }, { "epoch": 1.761461187214612, "grad_norm": 21.122072276551357, "learning_rate": 3.4323411938230784e-07, "logits/chosen": -2.914506435394287, "logits/rejected": -2.5124282836914062, "logps/chosen": -570.3173217773438, "logps/rejected": -585.1905517578125, "loss": 0.0985, "rewards/accuracies": 1.0, "rewards/chosen": 2.316812515258789, "rewards/margins": 3.65195894241333, "rewards/rejected": -1.335146427154541, "step": 2411 }, { "epoch": 1.7621917808219179, "grad_norm": 18.389978648635392, "learning_rate": 3.4308610344483733e-07, "logits/chosen": -2.4660356044769287, "logits/rejected": -2.2181780338287354, "logps/chosen": -715.09228515625, "logps/rejected": -544.0718994140625, "loss": 0.0766, "rewards/accuracies": 1.0, "rewards/chosen": 3.6664209365844727, "rewards/margins": 4.837419509887695, "rewards/rejected": -1.1709988117218018, "step": 2412 }, { "epoch": 1.7629223744292237, "grad_norm": 39.341787518072785, "learning_rate": 3.4293804961528266e-07, "logits/chosen": -3.223771572113037, "logits/rejected": -2.67399263381958, "logps/chosen": -657.6812744140625, "logps/rejected": -578.6142578125, "loss": 0.1973, "rewards/accuracies": 1.0, "rewards/chosen": 3.246551513671875, "rewards/margins": 3.402235507965088, "rewards/rejected": -0.1556837260723114, "step": 2413 }, { "epoch": 1.7636529680365296, "grad_norm": 17.088484320931414, "learning_rate": 3.427899579539113e-07, "logits/chosen": -2.882004976272583, "logits/rejected": -1.879226565361023, "logps/chosen": -905.3150634765625, "logps/rejected": -604.0751342773438, "loss": 0.0835, "rewards/accuracies": 1.0, "rewards/chosen": 4.889588356018066, "rewards/margins": 4.772865295410156, "rewards/rejected": 0.11672347784042358, "step": 2414 }, { "epoch": 1.7643835616438355, "grad_norm": 27.40751827702517, "learning_rate": 3.426418285210062e-07, "logits/chosen": -2.5442259311676025, "logits/rejected": -2.154705047607422, "logps/chosen": -697.6717529296875, "logps/rejected": -582.515869140625, "loss": 0.1517, "rewards/accuracies": 0.875, "rewards/chosen": 2.332939624786377, "rewards/margins": 3.3043289184570312, "rewards/rejected": -0.9713893532752991, "step": 2415 }, { "epoch": 1.7651141552511416, "grad_norm": 28.326658771693648, "learning_rate": 3.4249366137686575e-07, "logits/chosen": -3.2061104774475098, "logits/rejected": -2.1423516273498535, "logps/chosen": -937.8489990234375, "logps/rejected": -697.791748046875, "loss": 0.1448, "rewards/accuracies": 1.0, "rewards/chosen": 3.579615592956543, "rewards/margins": 4.367280006408691, "rewards/rejected": -0.7876642346382141, "step": 2416 }, { "epoch": 1.7658447488584474, "grad_norm": 10.020059683920847, "learning_rate": 3.4234545658180335e-07, "logits/chosen": -2.9195098876953125, "logits/rejected": -2.464189052581787, "logps/chosen": -521.3140869140625, "logps/rejected": -449.245849609375, "loss": 0.0854, "rewards/accuracies": 0.875, "rewards/chosen": 2.5662834644317627, "rewards/margins": 3.930321216583252, "rewards/rejected": -1.3640378713607788, "step": 2417 }, { "epoch": 1.7665753424657535, "grad_norm": 43.254127614183396, "learning_rate": 3.4219721419614815e-07, "logits/chosen": -2.736016035079956, "logits/rejected": -2.312654972076416, "logps/chosen": -769.0739135742188, "logps/rejected": -702.9261474609375, "loss": 0.2123, "rewards/accuracies": 0.875, "rewards/chosen": 2.941455602645874, "rewards/margins": 3.176853656768799, "rewards/rejected": -0.2353980839252472, "step": 2418 }, { "epoch": 1.7673059360730594, "grad_norm": 45.42580661089025, "learning_rate": 3.420489342802444e-07, "logits/chosen": -2.7696726322174072, "logits/rejected": -2.1678295135498047, "logps/chosen": -424.38519287109375, "logps/rejected": -283.24993896484375, "loss": 0.2032, "rewards/accuracies": 0.75, "rewards/chosen": 1.0826189517974854, "rewards/margins": 2.277235269546509, "rewards/rejected": -1.1946163177490234, "step": 2419 }, { "epoch": 1.7680365296803653, "grad_norm": 43.58817448664402, "learning_rate": 3.419006168944517e-07, "logits/chosen": -2.968903064727783, "logits/rejected": -1.990945816040039, "logps/chosen": -1022.605224609375, "logps/rejected": -776.7454833984375, "loss": 0.1577, "rewards/accuracies": 0.75, "rewards/chosen": 2.685547113418579, "rewards/margins": 4.441834449768066, "rewards/rejected": -1.7562878131866455, "step": 2420 }, { "epoch": 1.7687671232876712, "grad_norm": 37.7134533744881, "learning_rate": 3.417522620991447e-07, "logits/chosen": -2.8606395721435547, "logits/rejected": -1.8803038597106934, "logps/chosen": -683.6129760742188, "logps/rejected": -507.8256530761719, "loss": 0.1778, "rewards/accuracies": 1.0, "rewards/chosen": 4.312714576721191, "rewards/margins": 4.635143280029297, "rewards/rejected": -0.32242870330810547, "step": 2421 }, { "epoch": 1.769497716894977, "grad_norm": 29.938002548300233, "learning_rate": 3.416038699547135e-07, "logits/chosen": -2.4504640102386475, "logits/rejected": -2.4007856845855713, "logps/chosen": -400.7113037109375, "logps/rejected": -446.8008117675781, "loss": 0.1583, "rewards/accuracies": 0.875, "rewards/chosen": 2.4339044094085693, "rewards/margins": 3.784367322921753, "rewards/rejected": -1.3504626750946045, "step": 2422 }, { "epoch": 1.7702283105022831, "grad_norm": 46.877794187553505, "learning_rate": 3.4145544052156325e-07, "logits/chosen": -3.072599411010742, "logits/rejected": -2.3129825592041016, "logps/chosen": -1016.2554321289062, "logps/rejected": -614.7337646484375, "loss": 0.194, "rewards/accuracies": 1.0, "rewards/chosen": 5.136122703552246, "rewards/margins": 4.324077606201172, "rewards/rejected": 0.8120449185371399, "step": 2423 }, { "epoch": 1.770958904109589, "grad_norm": 21.797317423825344, "learning_rate": 3.4130697386011453e-07, "logits/chosen": -2.491586446762085, "logits/rejected": -1.70687997341156, "logps/chosen": -618.6046142578125, "logps/rejected": -507.35357666015625, "loss": 0.1227, "rewards/accuracies": 1.0, "rewards/chosen": 3.691854238510132, "rewards/margins": 5.903848171234131, "rewards/rejected": -2.21199369430542, "step": 2424 }, { "epoch": 1.771689497716895, "grad_norm": 48.499149275721834, "learning_rate": 3.4115847003080286e-07, "logits/chosen": -2.368116855621338, "logits/rejected": -2.0443906784057617, "logps/chosen": -485.4747314453125, "logps/rejected": -375.0700988769531, "loss": 0.301, "rewards/accuracies": 0.875, "rewards/chosen": 1.514481782913208, "rewards/margins": 2.2195310592651367, "rewards/rejected": -0.7050493955612183, "step": 2425 }, { "epoch": 1.772420091324201, "grad_norm": 45.75557887347591, "learning_rate": 3.410099290940788e-07, "logits/chosen": -3.3490023612976074, "logits/rejected": -2.692113161087036, "logps/chosen": -614.3907470703125, "logps/rejected": -425.41796875, "loss": 0.189, "rewards/accuracies": 0.875, "rewards/chosen": 2.0958733558654785, "rewards/margins": 1.6797294616699219, "rewards/rejected": 0.41614383459091187, "step": 2426 }, { "epoch": 1.7731506849315068, "grad_norm": 24.280537362853117, "learning_rate": 3.4086135111040834e-07, "logits/chosen": -2.1162383556365967, "logits/rejected": -2.428251266479492, "logps/chosen": -541.7356567382812, "logps/rejected": -619.6181640625, "loss": 0.1485, "rewards/accuracies": 0.875, "rewards/chosen": 1.592231035232544, "rewards/margins": 3.974544048309326, "rewards/rejected": -2.3823130130767822, "step": 2427 }, { "epoch": 1.7738812785388127, "grad_norm": 22.947580061175568, "learning_rate": 3.4071273614027216e-07, "logits/chosen": -2.9540939331054688, "logits/rejected": -2.116075038909912, "logps/chosen": -665.959228515625, "logps/rejected": -490.4960632324219, "loss": 0.1323, "rewards/accuracies": 1.0, "rewards/chosen": 2.8394298553466797, "rewards/margins": 4.986006736755371, "rewards/rejected": -2.1465771198272705, "step": 2428 }, { "epoch": 1.7746118721461186, "grad_norm": 26.984552556234025, "learning_rate": 3.4056408424416637e-07, "logits/chosen": -2.458078145980835, "logits/rejected": -1.7671167850494385, "logps/chosen": -579.78857421875, "logps/rejected": -444.3316345214844, "loss": 0.1269, "rewards/accuracies": 1.0, "rewards/chosen": 2.6413679122924805, "rewards/margins": 4.839797496795654, "rewards/rejected": -2.198429584503174, "step": 2429 }, { "epoch": 1.7753424657534247, "grad_norm": 24.079416486697593, "learning_rate": 3.404153954826018e-07, "logits/chosen": -3.1293630599975586, "logits/rejected": -2.558906078338623, "logps/chosen": -669.7213134765625, "logps/rejected": -435.1849670410156, "loss": 0.1062, "rewards/accuracies": 0.875, "rewards/chosen": 2.3675804138183594, "rewards/margins": 1.929117202758789, "rewards/rejected": 0.4384632110595703, "step": 2430 }, { "epoch": 1.7760730593607306, "grad_norm": 37.16440656373212, "learning_rate": 3.4026666991610457e-07, "logits/chosen": -2.7721710205078125, "logits/rejected": -2.528162717819214, "logps/chosen": -582.4481201171875, "logps/rejected": -573.6483154296875, "loss": 0.2035, "rewards/accuracies": 0.875, "rewards/chosen": 2.910198211669922, "rewards/margins": 3.6744847297668457, "rewards/rejected": -0.764286994934082, "step": 2431 }, { "epoch": 1.7768036529680367, "grad_norm": 36.03659740512501, "learning_rate": 3.401179076052155e-07, "logits/chosen": -2.5742833614349365, "logits/rejected": -2.961657762527466, "logps/chosen": -759.4423217773438, "logps/rejected": -925.5328979492188, "loss": 0.2124, "rewards/accuracies": 1.0, "rewards/chosen": 2.4832863807678223, "rewards/margins": 3.1511082649230957, "rewards/rejected": -0.6678221225738525, "step": 2432 }, { "epoch": 1.7775342465753425, "grad_norm": 49.70261415172873, "learning_rate": 3.3996910861049067e-07, "logits/chosen": -2.9136059284210205, "logits/rejected": -1.9286847114562988, "logps/chosen": -735.8353881835938, "logps/rejected": -450.8521728515625, "loss": 0.2884, "rewards/accuracies": 1.0, "rewards/chosen": 4.164819717407227, "rewards/margins": 5.828567028045654, "rewards/rejected": -1.6637474298477173, "step": 2433 }, { "epoch": 1.7782648401826484, "grad_norm": 28.862752323361114, "learning_rate": 3.3982027299250065e-07, "logits/chosen": -2.567830801010132, "logits/rejected": -2.1299521923065186, "logps/chosen": -645.2403564453125, "logps/rejected": -623.4356689453125, "loss": 0.1358, "rewards/accuracies": 1.0, "rewards/chosen": 3.7113699913024902, "rewards/margins": 4.07865047454834, "rewards/rejected": -0.3672804832458496, "step": 2434 }, { "epoch": 1.7789954337899543, "grad_norm": 37.04926710961197, "learning_rate": 3.3967140081183144e-07, "logits/chosen": -2.4633235931396484, "logits/rejected": -2.5992441177368164, "logps/chosen": -546.3362426757812, "logps/rejected": -633.2557373046875, "loss": 0.1534, "rewards/accuracies": 0.75, "rewards/chosen": 2.604055404663086, "rewards/margins": 3.5135140419006348, "rewards/rejected": -0.9094583988189697, "step": 2435 }, { "epoch": 1.7797260273972602, "grad_norm": 36.694752796721495, "learning_rate": 3.395224921290836e-07, "logits/chosen": -2.8853609561920166, "logits/rejected": -2.5149495601654053, "logps/chosen": -506.301025390625, "logps/rejected": -459.876708984375, "loss": 0.182, "rewards/accuracies": 1.0, "rewards/chosen": 2.354562520980835, "rewards/margins": 4.521773338317871, "rewards/rejected": -2.167210817337036, "step": 2436 }, { "epoch": 1.7804566210045663, "grad_norm": 37.47816584411599, "learning_rate": 3.3937354700487267e-07, "logits/chosen": -3.1113903522491455, "logits/rejected": -2.0740439891815186, "logps/chosen": -543.267578125, "logps/rejected": -404.0339660644531, "loss": 0.1767, "rewards/accuracies": 1.0, "rewards/chosen": 3.495316743850708, "rewards/margins": 4.1699018478393555, "rewards/rejected": -0.6745851039886475, "step": 2437 }, { "epoch": 1.7811872146118721, "grad_norm": 18.17531069735295, "learning_rate": 3.3922456549982883e-07, "logits/chosen": -2.868865489959717, "logits/rejected": -2.3058793544769287, "logps/chosen": -591.3631591796875, "logps/rejected": -501.4976806640625, "loss": 0.1453, "rewards/accuracies": 1.0, "rewards/chosen": 2.2589478492736816, "rewards/margins": 3.498662233352661, "rewards/rejected": -1.2397143840789795, "step": 2438 }, { "epoch": 1.7819178082191782, "grad_norm": 28.215122534705348, "learning_rate": 3.3907554767459735e-07, "logits/chosen": -2.7183475494384766, "logits/rejected": -1.92533540725708, "logps/chosen": -705.7656860351562, "logps/rejected": -557.228759765625, "loss": 0.1327, "rewards/accuracies": 1.0, "rewards/chosen": 3.040600061416626, "rewards/margins": 3.016136884689331, "rewards/rejected": 0.024463146924972534, "step": 2439 }, { "epoch": 1.782648401826484, "grad_norm": 25.077506807022303, "learning_rate": 3.389264935898382e-07, "logits/chosen": -2.8073389530181885, "logits/rejected": -2.194610118865967, "logps/chosen": -632.2532958984375, "logps/rejected": -400.7047119140625, "loss": 0.11, "rewards/accuracies": 0.875, "rewards/chosen": 2.135657787322998, "rewards/margins": 3.1821177005767822, "rewards/rejected": -1.0464599132537842, "step": 2440 }, { "epoch": 1.78337899543379, "grad_norm": 30.540195681201745, "learning_rate": 3.387774033062259e-07, "logits/chosen": -3.383779525756836, "logits/rejected": -2.183732509613037, "logps/chosen": -684.03515625, "logps/rejected": -439.1829833984375, "loss": 0.1622, "rewards/accuracies": 0.875, "rewards/chosen": 2.3643429279327393, "rewards/margins": 3.2703073024749756, "rewards/rejected": -0.9059643745422363, "step": 2441 }, { "epoch": 1.7841095890410958, "grad_norm": 29.17538769357337, "learning_rate": 3.3862827688444994e-07, "logits/chosen": -2.923624038696289, "logits/rejected": -2.5300583839416504, "logps/chosen": -935.090087890625, "logps/rejected": -900.783447265625, "loss": 0.1254, "rewards/accuracies": 1.0, "rewards/chosen": 3.916377067565918, "rewards/margins": 2.8809266090393066, "rewards/rejected": 1.0354503393173218, "step": 2442 }, { "epoch": 1.7848401826484017, "grad_norm": 34.94640497537801, "learning_rate": 3.3847911438521456e-07, "logits/chosen": -2.7218687534332275, "logits/rejected": -2.474618434906006, "logps/chosen": -389.3219909667969, "logps/rejected": -289.4414978027344, "loss": 0.1772, "rewards/accuracies": 0.875, "rewards/chosen": 3.0575060844421387, "rewards/margins": 4.157622814178467, "rewards/rejected": -1.100116491317749, "step": 2443 }, { "epoch": 1.7855707762557078, "grad_norm": 33.25982107952758, "learning_rate": 3.383299158692385e-07, "logits/chosen": -2.302170991897583, "logits/rejected": -1.680309772491455, "logps/chosen": -655.7442016601562, "logps/rejected": -569.5557250976562, "loss": 0.1086, "rewards/accuracies": 1.0, "rewards/chosen": 2.5818228721618652, "rewards/margins": 4.026756763458252, "rewards/rejected": -1.444933533668518, "step": 2444 }, { "epoch": 1.7863013698630137, "grad_norm": 43.526484001930505, "learning_rate": 3.3818068139725513e-07, "logits/chosen": -2.74934720993042, "logits/rejected": -2.306191921234131, "logps/chosen": -781.519287109375, "logps/rejected": -600.4532470703125, "loss": 0.1945, "rewards/accuracies": 0.875, "rewards/chosen": 3.0398001670837402, "rewards/margins": 3.1235766410827637, "rewards/rejected": -0.08377639949321747, "step": 2445 }, { "epoch": 1.7870319634703198, "grad_norm": 49.21051518413011, "learning_rate": 3.3803141103001276e-07, "logits/chosen": -2.895984172821045, "logits/rejected": -2.404259443283081, "logps/chosen": -588.0684814453125, "logps/rejected": -502.36578369140625, "loss": 0.2489, "rewards/accuracies": 0.875, "rewards/chosen": 3.3623597621917725, "rewards/margins": 4.3050150871276855, "rewards/rejected": -0.9426552057266235, "step": 2446 }, { "epoch": 1.7877625570776257, "grad_norm": 19.13534525396083, "learning_rate": 3.3788210482827393e-07, "logits/chosen": -2.5636682510375977, "logits/rejected": -1.6788619756698608, "logps/chosen": -443.5166931152344, "logps/rejected": -236.90008544921875, "loss": 0.0874, "rewards/accuracies": 1.0, "rewards/chosen": 2.9719057083129883, "rewards/margins": 4.710541248321533, "rewards/rejected": -1.7386353015899658, "step": 2447 }, { "epoch": 1.7884931506849315, "grad_norm": 28.21085095874292, "learning_rate": 3.37732762852816e-07, "logits/chosen": -2.388575315475464, "logits/rejected": -2.2290773391723633, "logps/chosen": -555.0181274414062, "logps/rejected": -463.80859375, "loss": 0.1852, "rewards/accuracies": 1.0, "rewards/chosen": 2.0103323459625244, "rewards/margins": 1.7453250885009766, "rewards/rejected": 0.2650071680545807, "step": 2448 }, { "epoch": 1.7892237442922374, "grad_norm": 29.08843663719475, "learning_rate": 3.37583385164431e-07, "logits/chosen": -2.6332037448883057, "logits/rejected": -2.2162089347839355, "logps/chosen": -533.3905029296875, "logps/rejected": -464.5546875, "loss": 0.1694, "rewards/accuracies": 1.0, "rewards/chosen": 2.6733627319335938, "rewards/margins": 3.2462944984436035, "rewards/rejected": -0.5729318261146545, "step": 2449 }, { "epoch": 1.7899543378995433, "grad_norm": 30.75587766964989, "learning_rate": 3.374339718239251e-07, "logits/chosen": -3.1741654872894287, "logits/rejected": -2.512179374694824, "logps/chosen": -893.0972900390625, "logps/rejected": -591.5806884765625, "loss": 0.1217, "rewards/accuracies": 1.0, "rewards/chosen": 2.2515575885772705, "rewards/margins": 1.9195735454559326, "rewards/rejected": 0.3319839537143707, "step": 2450 }, { "epoch": 1.7906849315068492, "grad_norm": 21.981129963286477, "learning_rate": 3.372845228921194e-07, "logits/chosen": -3.0279579162597656, "logits/rejected": -2.092736005783081, "logps/chosen": -689.657958984375, "logps/rejected": -590.357666015625, "loss": 0.115, "rewards/accuracies": 1.0, "rewards/chosen": 5.288409233093262, "rewards/margins": 5.825041770935059, "rewards/rejected": -0.5366321802139282, "step": 2451 }, { "epoch": 1.7914155251141552, "grad_norm": 48.46146332638311, "learning_rate": 3.371350384298493e-07, "logits/chosen": -3.175403118133545, "logits/rejected": -2.844425678253174, "logps/chosen": -624.347412109375, "logps/rejected": -689.9248046875, "loss": 0.2202, "rewards/accuracies": 1.0, "rewards/chosen": 4.416845798492432, "rewards/margins": 6.0551910400390625, "rewards/rejected": -1.638345718383789, "step": 2452 }, { "epoch": 1.7921461187214613, "grad_norm": 24.895295877654025, "learning_rate": 3.369855184979645e-07, "logits/chosen": -2.491201877593994, "logits/rejected": -2.4869701862335205, "logps/chosen": -376.0887451171875, "logps/rejected": -537.1525268554688, "loss": 0.1, "rewards/accuracies": 0.875, "rewards/chosen": 2.441786766052246, "rewards/margins": 4.535747051239014, "rewards/rejected": -2.0939605236053467, "step": 2453 }, { "epoch": 1.7928767123287672, "grad_norm": 21.581403059296505, "learning_rate": 3.3683596315732955e-07, "logits/chosen": -3.169323444366455, "logits/rejected": -1.8274872303009033, "logps/chosen": -647.92626953125, "logps/rejected": -383.65643310546875, "loss": 0.1222, "rewards/accuracies": 1.0, "rewards/chosen": 2.9991304874420166, "rewards/margins": 5.446526527404785, "rewards/rejected": -2.4473962783813477, "step": 2454 }, { "epoch": 1.793607305936073, "grad_norm": 28.47096995838206, "learning_rate": 3.366863724688231e-07, "logits/chosen": -2.139474868774414, "logits/rejected": -2.227571487426758, "logps/chosen": -294.9823913574219, "logps/rejected": -378.402587890625, "loss": 0.1561, "rewards/accuracies": 1.0, "rewards/chosen": 1.6731271743774414, "rewards/margins": 3.9889779090881348, "rewards/rejected": -2.3158507347106934, "step": 2455 }, { "epoch": 1.794337899543379, "grad_norm": 26.770455498509104, "learning_rate": 3.3653674649333816e-07, "logits/chosen": -2.544196605682373, "logits/rejected": -1.4475255012512207, "logps/chosen": -682.3396606445312, "logps/rejected": -434.3192443847656, "loss": 0.1775, "rewards/accuracies": 1.0, "rewards/chosen": 2.712345838546753, "rewards/margins": 3.1704928874969482, "rewards/rejected": -0.45814695954322815, "step": 2456 }, { "epoch": 1.7950684931506848, "grad_norm": 17.717867282630884, "learning_rate": 3.363870852917824e-07, "logits/chosen": -3.300830364227295, "logits/rejected": -2.5102689266204834, "logps/chosen": -941.9282836914062, "logps/rejected": -760.1292114257812, "loss": 0.0816, "rewards/accuracies": 1.0, "rewards/chosen": 4.9708709716796875, "rewards/margins": 4.849149227142334, "rewards/rejected": 0.12172207236289978, "step": 2457 }, { "epoch": 1.7957990867579907, "grad_norm": 32.49595076938433, "learning_rate": 3.3623738892507745e-07, "logits/chosen": -2.6005029678344727, "logits/rejected": -1.774023175239563, "logps/chosen": -650.7818603515625, "logps/rejected": -378.54791259765625, "loss": 0.2093, "rewards/accuracies": 0.875, "rewards/chosen": 1.3782598972320557, "rewards/margins": 1.9838383197784424, "rewards/rejected": -0.6055784821510315, "step": 2458 }, { "epoch": 1.7965296803652968, "grad_norm": 29.965011538524497, "learning_rate": 3.360876574541595e-07, "logits/chosen": -2.481948137283325, "logits/rejected": -2.325808048248291, "logps/chosen": -695.29052734375, "logps/rejected": -484.75921630859375, "loss": 0.1787, "rewards/accuracies": 1.0, "rewards/chosen": 2.7587437629699707, "rewards/margins": 3.7619357109069824, "rewards/rejected": -1.0031919479370117, "step": 2459 }, { "epoch": 1.7972602739726027, "grad_norm": 43.57648487636064, "learning_rate": 3.3593789093997904e-07, "logits/chosen": -2.90667462348938, "logits/rejected": -3.0416812896728516, "logps/chosen": -632.8207397460938, "logps/rejected": -666.879638671875, "loss": 0.2055, "rewards/accuracies": 0.875, "rewards/chosen": 3.266144275665283, "rewards/margins": 3.2334749698638916, "rewards/rejected": 0.0326690673828125, "step": 2460 }, { "epoch": 1.7979908675799088, "grad_norm": 27.30540503349051, "learning_rate": 3.357880894435008e-07, "logits/chosen": -2.658548355102539, "logits/rejected": -2.383168935775757, "logps/chosen": -944.2249755859375, "logps/rejected": -880.3577880859375, "loss": 0.1299, "rewards/accuracies": 1.0, "rewards/chosen": 3.9348576068878174, "rewards/margins": 4.79271936416626, "rewards/rejected": -0.8578616380691528, "step": 2461 }, { "epoch": 1.7987214611872147, "grad_norm": 34.604216249274636, "learning_rate": 3.3563825302570355e-07, "logits/chosen": -2.3139162063598633, "logits/rejected": -2.158383369445801, "logps/chosen": -750.054443359375, "logps/rejected": -995.9400024414062, "loss": 0.1419, "rewards/accuracies": 1.0, "rewards/chosen": 3.613072395324707, "rewards/margins": 4.133662700653076, "rewards/rejected": -0.5205901861190796, "step": 2462 }, { "epoch": 1.7994520547945205, "grad_norm": 39.78212145549807, "learning_rate": 3.354883817475805e-07, "logits/chosen": -3.0206007957458496, "logits/rejected": -2.2609453201293945, "logps/chosen": -686.49658203125, "logps/rejected": -456.3923034667969, "loss": 0.1512, "rewards/accuracies": 0.875, "rewards/chosen": 3.0680863857269287, "rewards/margins": 2.8587074279785156, "rewards/rejected": 0.20937861502170563, "step": 2463 }, { "epoch": 1.8001826484018264, "grad_norm": 39.27621536948284, "learning_rate": 3.35338475670139e-07, "logits/chosen": -3.052093505859375, "logits/rejected": -2.5449953079223633, "logps/chosen": -939.9057006835938, "logps/rejected": -962.6148071289062, "loss": 0.2472, "rewards/accuracies": 1.0, "rewards/chosen": 4.428621292114258, "rewards/margins": 4.097038745880127, "rewards/rejected": 0.3315829932689667, "step": 2464 }, { "epoch": 1.8009132420091323, "grad_norm": 42.546520667364064, "learning_rate": 3.3518853485440055e-07, "logits/chosen": -3.035951614379883, "logits/rejected": -2.7681236267089844, "logps/chosen": -410.685302734375, "logps/rejected": -483.9619445800781, "loss": 0.2176, "rewards/accuracies": 0.875, "rewards/chosen": 2.3966426849365234, "rewards/margins": 3.8148622512817383, "rewards/rejected": -1.4182193279266357, "step": 2465 }, { "epoch": 1.8016438356164384, "grad_norm": 22.494464786807484, "learning_rate": 3.350385593614008e-07, "logits/chosen": -3.3284668922424316, "logits/rejected": -2.459888219833374, "logps/chosen": -849.1437377929688, "logps/rejected": -609.5030517578125, "loss": 0.1246, "rewards/accuracies": 1.0, "rewards/chosen": 3.8830251693725586, "rewards/margins": 3.7751338481903076, "rewards/rejected": 0.10789167881011963, "step": 2466 }, { "epoch": 1.8023744292237442, "grad_norm": 21.321677282032496, "learning_rate": 3.3488854925218954e-07, "logits/chosen": -3.0658907890319824, "logits/rejected": -2.6850481033325195, "logps/chosen": -604.2764892578125, "logps/rejected": -588.8699951171875, "loss": 0.1247, "rewards/accuracies": 0.875, "rewards/chosen": 2.9286141395568848, "rewards/margins": 3.827240228652954, "rewards/rejected": -0.8986260890960693, "step": 2467 }, { "epoch": 1.8031050228310503, "grad_norm": 33.60686880474548, "learning_rate": 3.3473850458783056e-07, "logits/chosen": -2.9953360557556152, "logits/rejected": -1.7651126384735107, "logps/chosen": -587.9638671875, "logps/rejected": -351.08697509765625, "loss": 0.188, "rewards/accuracies": 1.0, "rewards/chosen": 3.232386589050293, "rewards/margins": 5.504848003387451, "rewards/rejected": -2.272461175918579, "step": 2468 }, { "epoch": 1.8038356164383562, "grad_norm": 22.733882350761853, "learning_rate": 3.3458842542940175e-07, "logits/chosen": -2.8592910766601562, "logits/rejected": -1.9133448600769043, "logps/chosen": -627.797119140625, "logps/rejected": -385.2864990234375, "loss": 0.1384, "rewards/accuracies": 0.875, "rewards/chosen": 1.7495553493499756, "rewards/margins": 2.912519931793213, "rewards/rejected": -1.1629647016525269, "step": 2469 }, { "epoch": 1.804566210045662, "grad_norm": 36.90168195163162, "learning_rate": 3.344383118379951e-07, "logits/chosen": -2.5908851623535156, "logits/rejected": -2.863388776779175, "logps/chosen": -841.6552124023438, "logps/rejected": -921.7263793945312, "loss": 0.1644, "rewards/accuracies": 0.875, "rewards/chosen": 2.9938502311706543, "rewards/margins": 3.317959785461426, "rewards/rejected": -0.3241100311279297, "step": 2470 }, { "epoch": 1.805296803652968, "grad_norm": 38.25409272681242, "learning_rate": 3.342881638747166e-07, "logits/chosen": -2.1102545261383057, "logits/rejected": -2.127408504486084, "logps/chosen": -465.94903564453125, "logps/rejected": -731.2962036132812, "loss": 0.2185, "rewards/accuracies": 1.0, "rewards/chosen": 2.4076216220855713, "rewards/margins": 3.836843967437744, "rewards/rejected": -1.4292222261428833, "step": 2471 }, { "epoch": 1.8060273972602738, "grad_norm": 32.19614757925725, "learning_rate": 3.341379816006863e-07, "logits/chosen": -2.5216803550720215, "logits/rejected": -2.4900619983673096, "logps/chosen": -497.20819091796875, "logps/rejected": -546.0999755859375, "loss": 0.2279, "rewards/accuracies": 1.0, "rewards/chosen": 0.7457922101020813, "rewards/margins": 1.4750524759292603, "rewards/rejected": -0.7292602062225342, "step": 2472 }, { "epoch": 1.80675799086758, "grad_norm": 51.71569099714805, "learning_rate": 3.339877650770379e-07, "logits/chosen": -2.055788040161133, "logits/rejected": -2.4848458766937256, "logps/chosen": -595.1527709960938, "logps/rejected": -819.9043579101562, "loss": 0.2545, "rewards/accuracies": 1.0, "rewards/chosen": 2.2159030437469482, "rewards/margins": 3.5554394721984863, "rewards/rejected": -1.339536428451538, "step": 2473 }, { "epoch": 1.8074885844748858, "grad_norm": 34.589534733585786, "learning_rate": 3.338375143649195e-07, "logits/chosen": -2.6533381938934326, "logits/rejected": -2.5268921852111816, "logps/chosen": -746.0360107421875, "logps/rejected": -579.833740234375, "loss": 0.1583, "rewards/accuracies": 0.75, "rewards/chosen": 2.5748345851898193, "rewards/margins": 3.538698196411133, "rewards/rejected": -0.9638637900352478, "step": 2474 }, { "epoch": 1.808219178082192, "grad_norm": 28.76996058954928, "learning_rate": 3.336872295254927e-07, "logits/chosen": -3.244553804397583, "logits/rejected": -1.6935534477233887, "logps/chosen": -834.178955078125, "logps/rejected": -436.47723388671875, "loss": 0.119, "rewards/accuracies": 1.0, "rewards/chosen": 3.9016261100769043, "rewards/margins": 3.766897678375244, "rewards/rejected": 0.13472788035869598, "step": 2475 }, { "epoch": 1.8089497716894978, "grad_norm": 27.28185740840135, "learning_rate": 3.335369106199333e-07, "logits/chosen": -2.5360445976257324, "logits/rejected": -2.3042397499084473, "logps/chosen": -510.8681945800781, "logps/rejected": -473.6712646484375, "loss": 0.1728, "rewards/accuracies": 0.875, "rewards/chosen": 1.9544196128845215, "rewards/margins": 3.6170899868011475, "rewards/rejected": -1.662670373916626, "step": 2476 }, { "epoch": 1.8096803652968037, "grad_norm": 20.362927938499308, "learning_rate": 3.3338655770943086e-07, "logits/chosen": -3.0835208892822266, "logits/rejected": -1.9086343050003052, "logps/chosen": -1026.888916015625, "logps/rejected": -716.6566772460938, "loss": 0.1003, "rewards/accuracies": 1.0, "rewards/chosen": 4.3942694664001465, "rewards/margins": 5.589848518371582, "rewards/rejected": -1.1955795288085938, "step": 2477 }, { "epoch": 1.8104109589041095, "grad_norm": 39.17086687224879, "learning_rate": 3.3323617085518867e-07, "logits/chosen": -3.0334742069244385, "logits/rejected": -2.771218776702881, "logps/chosen": -990.2203369140625, "logps/rejected": -1099.9498291015625, "loss": 0.1504, "rewards/accuracies": 0.875, "rewards/chosen": 4.334043502807617, "rewards/margins": 2.829472064971924, "rewards/rejected": 1.504571557044983, "step": 2478 }, { "epoch": 1.8111415525114154, "grad_norm": 32.61466214026197, "learning_rate": 3.330857501184241e-07, "logits/chosen": -2.3700473308563232, "logits/rejected": -1.8902254104614258, "logps/chosen": -780.9913940429688, "logps/rejected": -607.55712890625, "loss": 0.1925, "rewards/accuracies": 1.0, "rewards/chosen": 3.211087942123413, "rewards/margins": 3.7401950359344482, "rewards/rejected": -0.529106855392456, "step": 2479 }, { "epoch": 1.8118721461187215, "grad_norm": 37.52583745318017, "learning_rate": 3.32935295560368e-07, "logits/chosen": -3.0289885997772217, "logits/rejected": -2.867661476135254, "logps/chosen": -964.5809936523438, "logps/rejected": -835.8572998046875, "loss": 0.21, "rewards/accuracies": 1.0, "rewards/chosen": 5.860017776489258, "rewards/margins": 4.950498104095459, "rewards/rejected": 0.9095195531845093, "step": 2480 }, { "epoch": 1.8126027397260274, "grad_norm": 30.087765297633336, "learning_rate": 3.327848072422652e-07, "logits/chosen": -2.7213785648345947, "logits/rejected": -3.1393673419952393, "logps/chosen": -827.1468505859375, "logps/rejected": -968.4568481445312, "loss": 0.1336, "rewards/accuracies": 1.0, "rewards/chosen": 2.9323248863220215, "rewards/margins": 3.3286385536193848, "rewards/rejected": -0.39631345868110657, "step": 2481 }, { "epoch": 1.8133333333333335, "grad_norm": 43.167559954459726, "learning_rate": 3.326342852253742e-07, "logits/chosen": -2.4588255882263184, "logits/rejected": -2.1273112297058105, "logps/chosen": -698.6968383789062, "logps/rejected": -606.793701171875, "loss": 0.2271, "rewards/accuracies": 0.75, "rewards/chosen": 2.304867744445801, "rewards/margins": 2.6005682945251465, "rewards/rejected": -0.2957006096839905, "step": 2482 }, { "epoch": 1.8140639269406393, "grad_norm": 39.30661689634377, "learning_rate": 3.324837295709672e-07, "logits/chosen": -2.8621413707733154, "logits/rejected": -2.6460118293762207, "logps/chosen": -262.122802734375, "logps/rejected": -305.68304443359375, "loss": 0.3021, "rewards/accuracies": 0.625, "rewards/chosen": 1.3025182485580444, "rewards/margins": 3.3919677734375, "rewards/rejected": -2.089449405670166, "step": 2483 }, { "epoch": 1.8147945205479452, "grad_norm": 55.26786246177565, "learning_rate": 3.3233314034033013e-07, "logits/chosen": -2.9304943084716797, "logits/rejected": -2.6737215518951416, "logps/chosen": -811.7824096679688, "logps/rejected": -700.7698974609375, "loss": 0.2293, "rewards/accuracies": 0.875, "rewards/chosen": 1.8242301940917969, "rewards/margins": 2.9519832134246826, "rewards/rejected": -1.1277530193328857, "step": 2484 }, { "epoch": 1.815525114155251, "grad_norm": 50.49810485305365, "learning_rate": 3.321825175947627e-07, "logits/chosen": -3.005302667617798, "logits/rejected": -1.639209508895874, "logps/chosen": -594.3497924804688, "logps/rejected": -306.06787109375, "loss": 0.2325, "rewards/accuracies": 0.875, "rewards/chosen": 2.198636531829834, "rewards/margins": 3.737903356552124, "rewards/rejected": -1.5392670631408691, "step": 2485 }, { "epoch": 1.816255707762557, "grad_norm": 30.193919760894637, "learning_rate": 3.320318613955779e-07, "logits/chosen": -2.829692840576172, "logits/rejected": -1.688440203666687, "logps/chosen": -702.1368408203125, "logps/rejected": -414.1288146972656, "loss": 0.1308, "rewards/accuracies": 1.0, "rewards/chosen": 4.007349967956543, "rewards/margins": 4.4868693351745605, "rewards/rejected": -0.4795195460319519, "step": 2486 }, { "epoch": 1.816986301369863, "grad_norm": 23.495295926292616, "learning_rate": 3.3188117180410287e-07, "logits/chosen": -2.5166022777557373, "logits/rejected": -1.7876195907592773, "logps/chosen": -498.5935363769531, "logps/rejected": -327.59515380859375, "loss": 0.0743, "rewards/accuracies": 1.0, "rewards/chosen": 3.4973833560943604, "rewards/margins": 5.488030433654785, "rewards/rejected": -1.9906470775604248, "step": 2487 }, { "epoch": 1.817716894977169, "grad_norm": 30.289862479763237, "learning_rate": 3.317304488816777e-07, "logits/chosen": -3.0404040813446045, "logits/rejected": -2.2016420364379883, "logps/chosen": -829.620849609375, "logps/rejected": -786.0698852539062, "loss": 0.1396, "rewards/accuracies": 0.875, "rewards/chosen": 3.618499517440796, "rewards/margins": 4.3224616050720215, "rewards/rejected": -0.7039620876312256, "step": 2488 }, { "epoch": 1.818447488584475, "grad_norm": 15.28740220370012, "learning_rate": 3.3157969268965666e-07, "logits/chosen": -2.977644205093384, "logits/rejected": -2.3470897674560547, "logps/chosen": -628.55712890625, "logps/rejected": -651.7003784179688, "loss": 0.08, "rewards/accuracies": 1.0, "rewards/chosen": 3.2303004264831543, "rewards/margins": 4.428010940551758, "rewards/rejected": -1.1977102756500244, "step": 2489 }, { "epoch": 1.819178082191781, "grad_norm": 35.13110732881099, "learning_rate": 3.314289032894074e-07, "logits/chosen": -2.808765411376953, "logits/rejected": -2.0204756259918213, "logps/chosen": -637.2403564453125, "logps/rejected": -571.782958984375, "loss": 0.1853, "rewards/accuracies": 1.0, "rewards/chosen": 4.573514461517334, "rewards/margins": 7.1880574226379395, "rewards/rejected": -2.6145429611206055, "step": 2490 }, { "epoch": 1.8199086757990868, "grad_norm": 41.05470608348771, "learning_rate": 3.3127808074231067e-07, "logits/chosen": -2.9967246055603027, "logits/rejected": -2.1174535751342773, "logps/chosen": -713.7926025390625, "logps/rejected": -386.7867431640625, "loss": 0.2069, "rewards/accuracies": 0.875, "rewards/chosen": 1.5501651763916016, "rewards/margins": 1.7578084468841553, "rewards/rejected": -0.2076433300971985, "step": 2491 }, { "epoch": 1.8206392694063926, "grad_norm": 26.007183549769596, "learning_rate": 3.3112722510976125e-07, "logits/chosen": -2.9348907470703125, "logits/rejected": -2.0176637172698975, "logps/chosen": -325.28277587890625, "logps/rejected": -286.9529724121094, "loss": 0.1316, "rewards/accuracies": 0.875, "rewards/chosen": 3.3061323165893555, "rewards/margins": 5.515195369720459, "rewards/rejected": -2.2090635299682617, "step": 2492 }, { "epoch": 1.8213698630136985, "grad_norm": 35.7958519605013, "learning_rate": 3.309763364531671e-07, "logits/chosen": -2.253133773803711, "logits/rejected": -2.3913214206695557, "logps/chosen": -424.12237548828125, "logps/rejected": -645.8915405273438, "loss": 0.1806, "rewards/accuracies": 1.0, "rewards/chosen": 2.011225461959839, "rewards/margins": 4.059459686279297, "rewards/rejected": -2.048234462738037, "step": 2493 }, { "epoch": 1.8221004566210046, "grad_norm": 34.419101681288474, "learning_rate": 3.3082541483394967e-07, "logits/chosen": -2.9058547019958496, "logits/rejected": -2.0081381797790527, "logps/chosen": -706.8802490234375, "logps/rejected": -371.5479431152344, "loss": 0.1828, "rewards/accuracies": 0.75, "rewards/chosen": 2.5476787090301514, "rewards/margins": 3.0031847953796387, "rewards/rejected": -0.45550602674484253, "step": 2494 }, { "epoch": 1.8228310502283105, "grad_norm": 34.778912488837975, "learning_rate": 3.306744603135439e-07, "logits/chosen": -2.235896587371826, "logits/rejected": -2.2220699787139893, "logps/chosen": -656.08740234375, "logps/rejected": -717.7710571289062, "loss": 0.1954, "rewards/accuracies": 0.875, "rewards/chosen": 2.3043062686920166, "rewards/margins": 3.460606336593628, "rewards/rejected": -1.1563003063201904, "step": 2495 }, { "epoch": 1.8235616438356166, "grad_norm": 26.942648552715752, "learning_rate": 3.305234729533981e-07, "logits/chosen": -2.501642942428589, "logits/rejected": -2.322453022003174, "logps/chosen": -587.893798828125, "logps/rejected": -484.6654357910156, "loss": 0.1099, "rewards/accuracies": 0.875, "rewards/chosen": 3.071512460708618, "rewards/margins": 3.8593595027923584, "rewards/rejected": -0.7878471612930298, "step": 2496 }, { "epoch": 1.8242922374429225, "grad_norm": 57.17466343295669, "learning_rate": 3.303724528149739e-07, "logits/chosen": -2.804898500442505, "logits/rejected": -1.780819296836853, "logps/chosen": -499.91558837890625, "logps/rejected": -287.1393737792969, "loss": 0.2793, "rewards/accuracies": 1.0, "rewards/chosen": 3.373595714569092, "rewards/margins": 5.1961894035339355, "rewards/rejected": -1.8225938081741333, "step": 2497 }, { "epoch": 1.8250228310502283, "grad_norm": 26.93211137372774, "learning_rate": 3.302213999597463e-07, "logits/chosen": -3.043229818344116, "logits/rejected": -2.5312910079956055, "logps/chosen": -592.9701538085938, "logps/rejected": -530.2279052734375, "loss": 0.1586, "rewards/accuracies": 1.0, "rewards/chosen": 2.5128090381622314, "rewards/margins": 3.4829416275024414, "rewards/rejected": -0.9701323509216309, "step": 2498 }, { "epoch": 1.8257534246575342, "grad_norm": 31.69271079413165, "learning_rate": 3.300703144492035e-07, "logits/chosen": -2.46993350982666, "logits/rejected": -2.182999610900879, "logps/chosen": -758.596923828125, "logps/rejected": -697.8634033203125, "loss": 0.1776, "rewards/accuracies": 1.0, "rewards/chosen": 2.4975991249084473, "rewards/margins": 2.30869460105896, "rewards/rejected": 0.1889043152332306, "step": 2499 }, { "epoch": 1.82648401826484, "grad_norm": 39.02404669819783, "learning_rate": 3.299191963448472e-07, "logits/chosen": -2.886847496032715, "logits/rejected": -1.8171474933624268, "logps/chosen": -825.6801147460938, "logps/rejected": -462.8016357421875, "loss": 0.1683, "rewards/accuracies": 1.0, "rewards/chosen": 3.1317825317382812, "rewards/margins": 4.292786598205566, "rewards/rejected": -1.161003828048706, "step": 2500 }, { "epoch": 1.827214611872146, "grad_norm": 1135.8098432396155, "learning_rate": 3.2976804570819236e-07, "logits/chosen": -2.6469051837921143, "logits/rejected": -2.6001269817352295, "logps/chosen": -450.8719482421875, "logps/rejected": -492.6106262207031, "loss": 1.0872, "rewards/accuracies": 1.0, "rewards/chosen": 3.6254611015319824, "rewards/margins": 3.6017143726348877, "rewards/rejected": 0.02374666929244995, "step": 2501 }, { "epoch": 1.827945205479452, "grad_norm": 33.08503448443023, "learning_rate": 3.296168626007669e-07, "logits/chosen": -2.680997133255005, "logits/rejected": -1.8500607013702393, "logps/chosen": -569.56005859375, "logps/rejected": -365.0787353515625, "loss": 0.1155, "rewards/accuracies": 1.0, "rewards/chosen": 3.008359670639038, "rewards/margins": 3.6098408699035645, "rewards/rejected": -0.6014813780784607, "step": 2502 }, { "epoch": 1.8286757990867581, "grad_norm": 34.26073872194582, "learning_rate": 3.294656470841124e-07, "logits/chosen": -2.946981430053711, "logits/rejected": -2.7277960777282715, "logps/chosen": -834.7921142578125, "logps/rejected": -816.6387939453125, "loss": 0.136, "rewards/accuracies": 0.875, "rewards/chosen": 1.9539039134979248, "rewards/margins": 2.157508373260498, "rewards/rejected": -0.2036043107509613, "step": 2503 }, { "epoch": 1.829406392694064, "grad_norm": 51.819115229083174, "learning_rate": 3.2931439921978324e-07, "logits/chosen": -3.0661184787750244, "logits/rejected": -2.8333704471588135, "logps/chosen": -860.7561645507812, "logps/rejected": -807.022216796875, "loss": 0.2335, "rewards/accuracies": 0.875, "rewards/chosen": 2.6110692024230957, "rewards/margins": 2.8577585220336914, "rewards/rejected": -0.24668921530246735, "step": 2504 }, { "epoch": 1.83013698630137, "grad_norm": 30.364797629139467, "learning_rate": 3.2916311906934707e-07, "logits/chosen": -3.0999624729156494, "logits/rejected": -1.9885575771331787, "logps/chosen": -1032.6925048828125, "logps/rejected": -595.4088134765625, "loss": 0.1468, "rewards/accuracies": 1.0, "rewards/chosen": 4.43840217590332, "rewards/margins": 4.024676322937012, "rewards/rejected": 0.4137260615825653, "step": 2505 }, { "epoch": 1.8308675799086758, "grad_norm": 25.587241140452942, "learning_rate": 3.2901180669438474e-07, "logits/chosen": -3.0559654235839844, "logits/rejected": -2.5815184116363525, "logps/chosen": -581.9483032226562, "logps/rejected": -524.265380859375, "loss": 0.096, "rewards/accuracies": 1.0, "rewards/chosen": 2.5130398273468018, "rewards/margins": 5.411700248718262, "rewards/rejected": -2.898660182952881, "step": 2506 }, { "epoch": 1.8315981735159816, "grad_norm": 40.25481125407946, "learning_rate": 3.288604621564903e-07, "logits/chosen": -2.6587603092193604, "logits/rejected": -2.656428575515747, "logps/chosen": -489.487548828125, "logps/rejected": -514.2564697265625, "loss": 0.2484, "rewards/accuracies": 1.0, "rewards/chosen": 1.3306083679199219, "rewards/margins": 1.9459364414215088, "rewards/rejected": -0.6153278946876526, "step": 2507 }, { "epoch": 1.8323287671232875, "grad_norm": 20.59799146026781, "learning_rate": 3.287090855172708e-07, "logits/chosen": -2.7870850563049316, "logits/rejected": -1.7492237091064453, "logps/chosen": -531.3329467773438, "logps/rejected": -381.9947814941406, "loss": 0.0976, "rewards/accuracies": 1.0, "rewards/chosen": 4.232574939727783, "rewards/margins": 5.340356826782227, "rewards/rejected": -1.1077817678451538, "step": 2508 }, { "epoch": 1.8330593607305936, "grad_norm": 24.02060069632939, "learning_rate": 3.2855767683834627e-07, "logits/chosen": -3.119189977645874, "logits/rejected": -1.8273968696594238, "logps/chosen": -546.559814453125, "logps/rejected": -464.84442138671875, "loss": 0.0977, "rewards/accuracies": 1.0, "rewards/chosen": 1.7269532680511475, "rewards/margins": 3.8546671867370605, "rewards/rejected": -2.127713680267334, "step": 2509 }, { "epoch": 1.8337899543378997, "grad_norm": 36.96320059521615, "learning_rate": 3.284062361813499e-07, "logits/chosen": -2.951904773712158, "logits/rejected": -2.4436097145080566, "logps/chosen": -699.3853149414062, "logps/rejected": -476.3838806152344, "loss": 0.178, "rewards/accuracies": 0.875, "rewards/chosen": 2.6118061542510986, "rewards/margins": 2.4815499782562256, "rewards/rejected": 0.13025647401809692, "step": 2510 }, { "epoch": 1.8345205479452056, "grad_norm": 39.42985235625335, "learning_rate": 3.282547636079278e-07, "logits/chosen": -3.0227901935577393, "logits/rejected": -2.6419849395751953, "logps/chosen": -737.2290649414062, "logps/rejected": -694.53662109375, "loss": 0.2305, "rewards/accuracies": 0.875, "rewards/chosen": 2.8116049766540527, "rewards/margins": 3.4127891063690186, "rewards/rejected": -0.601184070110321, "step": 2511 }, { "epoch": 1.8352511415525115, "grad_norm": 39.61115781244203, "learning_rate": 3.281032591797392e-07, "logits/chosen": -2.9530458450317383, "logits/rejected": -2.388392210006714, "logps/chosen": -693.9696044921875, "logps/rejected": -748.493896484375, "loss": 0.1938, "rewards/accuracies": 1.0, "rewards/chosen": 3.0103790760040283, "rewards/margins": 6.86629056930542, "rewards/rejected": -3.8559117317199707, "step": 2512 }, { "epoch": 1.8359817351598173, "grad_norm": 26.127986755479863, "learning_rate": 3.279517229584563e-07, "logits/chosen": -2.948202133178711, "logits/rejected": -2.134556770324707, "logps/chosen": -700.7982788085938, "logps/rejected": -636.4481811523438, "loss": 0.1418, "rewards/accuracies": 1.0, "rewards/chosen": 4.559898853302002, "rewards/margins": 5.318097114562988, "rewards/rejected": -0.7581984400749207, "step": 2513 }, { "epoch": 1.8367123287671232, "grad_norm": 30.091353625057835, "learning_rate": 3.278001550057642e-07, "logits/chosen": -3.0514369010925293, "logits/rejected": -2.370374917984009, "logps/chosen": -924.0760498046875, "logps/rejected": -684.7808837890625, "loss": 0.2155, "rewards/accuracies": 1.0, "rewards/chosen": 4.230848789215088, "rewards/margins": 4.4521708488464355, "rewards/rejected": -0.22132226824760437, "step": 2514 }, { "epoch": 1.837442922374429, "grad_norm": 31.768661661604998, "learning_rate": 3.2764855538336065e-07, "logits/chosen": -2.573202610015869, "logits/rejected": -1.6659389734268188, "logps/chosen": -511.8691101074219, "logps/rejected": -345.00543212890625, "loss": 0.1679, "rewards/accuracies": 1.0, "rewards/chosen": 2.9801244735717773, "rewards/margins": 4.89186954498291, "rewards/rejected": -1.9117450714111328, "step": 2515 }, { "epoch": 1.8381735159817352, "grad_norm": 33.30059040451179, "learning_rate": 3.274969241529568e-07, "logits/chosen": -3.014892578125, "logits/rejected": -1.8801443576812744, "logps/chosen": -739.7379150390625, "logps/rejected": -412.2811584472656, "loss": 0.2023, "rewards/accuracies": 0.875, "rewards/chosen": 2.9556796550750732, "rewards/margins": 4.572381019592285, "rewards/rejected": -1.6167017221450806, "step": 2516 }, { "epoch": 1.838904109589041, "grad_norm": 25.217079986526493, "learning_rate": 3.2734526137627617e-07, "logits/chosen": -2.8893814086914062, "logits/rejected": -2.590762138366699, "logps/chosen": -630.5858764648438, "logps/rejected": -505.3665771484375, "loss": 0.1661, "rewards/accuracies": 1.0, "rewards/chosen": 2.4035019874572754, "rewards/margins": 1.9750815629959106, "rewards/rejected": 0.4284202754497528, "step": 2517 }, { "epoch": 1.8396347031963471, "grad_norm": 31.743951656920014, "learning_rate": 3.271935671150554e-07, "logits/chosen": -2.4317383766174316, "logits/rejected": -2.225511074066162, "logps/chosen": -465.67864990234375, "logps/rejected": -487.478759765625, "loss": 0.1685, "rewards/accuracies": 1.0, "rewards/chosen": 2.1996893882751465, "rewards/margins": 3.7224676609039307, "rewards/rejected": -1.5227785110473633, "step": 2518 }, { "epoch": 1.840365296803653, "grad_norm": 31.19192846502421, "learning_rate": 3.2704184143104406e-07, "logits/chosen": -2.072572708129883, "logits/rejected": -2.4211599826812744, "logps/chosen": -343.6796875, "logps/rejected": -466.95263671875, "loss": 0.1729, "rewards/accuracies": 1.0, "rewards/chosen": 1.6189227104187012, "rewards/margins": 3.0283195972442627, "rewards/rejected": -1.4093968868255615, "step": 2519 }, { "epoch": 1.841095890410959, "grad_norm": 49.49881134147314, "learning_rate": 3.268900843860043e-07, "logits/chosen": -2.7362864017486572, "logits/rejected": -1.6934990882873535, "logps/chosen": -933.5259399414062, "logps/rejected": -723.5689697265625, "loss": 0.2177, "rewards/accuracies": 0.875, "rewards/chosen": 4.461466312408447, "rewards/margins": 4.4050703048706055, "rewards/rejected": 0.05639582872390747, "step": 2520 }, { "epoch": 1.8418264840182648, "grad_norm": 30.423330529830945, "learning_rate": 3.26738296041711e-07, "logits/chosen": -2.4678280353546143, "logits/rejected": -2.2732954025268555, "logps/chosen": -439.6986083984375, "logps/rejected": -512.791748046875, "loss": 0.1148, "rewards/accuracies": 0.875, "rewards/chosen": 2.3739407062530518, "rewards/margins": 3.409160614013672, "rewards/rejected": -1.035219669342041, "step": 2521 }, { "epoch": 1.8425570776255706, "grad_norm": 25.04170593495604, "learning_rate": 3.2658647645995185e-07, "logits/chosen": -2.8937344551086426, "logits/rejected": -2.0335612297058105, "logps/chosen": -585.8406372070312, "logps/rejected": -454.4588623046875, "loss": 0.1038, "rewards/accuracies": 0.875, "rewards/chosen": 2.9412007331848145, "rewards/margins": 4.090259552001953, "rewards/rejected": -1.1490590572357178, "step": 2522 }, { "epoch": 1.8432876712328767, "grad_norm": 43.49310879547831, "learning_rate": 3.2643462570252726e-07, "logits/chosen": -3.150618553161621, "logits/rejected": -2.5627617835998535, "logps/chosen": -919.46923828125, "logps/rejected": -784.1763305664062, "loss": 0.2021, "rewards/accuracies": 1.0, "rewards/chosen": 2.4167275428771973, "rewards/margins": 3.331843852996826, "rewards/rejected": -0.9151161909103394, "step": 2523 }, { "epoch": 1.8440182648401826, "grad_norm": 28.376868098266346, "learning_rate": 3.2628274383125053e-07, "logits/chosen": -2.93430757522583, "logits/rejected": -2.194342851638794, "logps/chosen": -682.2376098632812, "logps/rejected": -520.5914916992188, "loss": 0.1486, "rewards/accuracies": 1.0, "rewards/chosen": 4.420439720153809, "rewards/margins": 5.059109687805176, "rewards/rejected": -0.6386703252792358, "step": 2524 }, { "epoch": 1.8447488584474887, "grad_norm": 31.055896202892676, "learning_rate": 3.2613083090794723e-07, "logits/chosen": -2.523751735687256, "logits/rejected": -2.398155689239502, "logps/chosen": -354.44342041015625, "logps/rejected": -416.3572692871094, "loss": 0.1736, "rewards/accuracies": 0.875, "rewards/chosen": 1.9258630275726318, "rewards/margins": 3.125239372253418, "rewards/rejected": -1.1993764638900757, "step": 2525 }, { "epoch": 1.8454794520547946, "grad_norm": 27.381442770663583, "learning_rate": 3.259788869944559e-07, "logits/chosen": -2.6326889991760254, "logits/rejected": -2.5412309169769287, "logps/chosen": -664.3160400390625, "logps/rejected": -598.14697265625, "loss": 0.0993, "rewards/accuracies": 1.0, "rewards/chosen": 1.3588271141052246, "rewards/margins": 3.437121629714966, "rewards/rejected": -2.0782947540283203, "step": 2526 }, { "epoch": 1.8462100456621005, "grad_norm": 35.77502504530482, "learning_rate": 3.258269121526275e-07, "logits/chosen": -2.648876667022705, "logits/rejected": -2.27333402633667, "logps/chosen": -542.647216796875, "logps/rejected": -363.67877197265625, "loss": 0.1802, "rewards/accuracies": 1.0, "rewards/chosen": 1.8145473003387451, "rewards/margins": 2.6782233715057373, "rewards/rejected": -0.8636758327484131, "step": 2527 }, { "epoch": 1.8469406392694063, "grad_norm": 17.287787140884042, "learning_rate": 3.2567490644432573e-07, "logits/chosen": -2.5164694786071777, "logits/rejected": -2.3928892612457275, "logps/chosen": -487.33929443359375, "logps/rejected": -516.8319091796875, "loss": 0.1068, "rewards/accuracies": 1.0, "rewards/chosen": 2.8347554206848145, "rewards/margins": 4.451253890991211, "rewards/rejected": -1.6164988279342651, "step": 2528 }, { "epoch": 1.8476712328767122, "grad_norm": 32.615773734330794, "learning_rate": 3.2552286993142665e-07, "logits/chosen": -2.5290465354919434, "logits/rejected": -2.7686469554901123, "logps/chosen": -406.2376403808594, "logps/rejected": -546.5740356445312, "loss": 0.1741, "rewards/accuracies": 1.0, "rewards/chosen": 2.790605068206787, "rewards/margins": 3.868391990661621, "rewards/rejected": -1.077787160873413, "step": 2529 }, { "epoch": 1.8484018264840183, "grad_norm": 33.161056069559024, "learning_rate": 3.2537080267581906e-07, "logits/chosen": -2.2628085613250732, "logits/rejected": -2.097038745880127, "logps/chosen": -668.12646484375, "logps/rejected": -757.061279296875, "loss": 0.1763, "rewards/accuracies": 1.0, "rewards/chosen": 2.704674482345581, "rewards/margins": 4.907069683074951, "rewards/rejected": -2.202395439147949, "step": 2530 }, { "epoch": 1.8491324200913242, "grad_norm": 29.79950499667528, "learning_rate": 3.252187047394043e-07, "logits/chosen": -2.970421314239502, "logits/rejected": -2.625483512878418, "logps/chosen": -541.0967407226562, "logps/rejected": -485.01849365234375, "loss": 0.1673, "rewards/accuracies": 1.0, "rewards/chosen": 2.3359522819519043, "rewards/margins": 2.3286218643188477, "rewards/rejected": 0.0073302388191223145, "step": 2531 }, { "epoch": 1.8498630136986303, "grad_norm": 26.298891875270858, "learning_rate": 3.250665761840959e-07, "logits/chosen": -2.545400619506836, "logits/rejected": -1.978081226348877, "logps/chosen": -597.5682983398438, "logps/rejected": -523.391357421875, "loss": 0.1229, "rewards/accuracies": 1.0, "rewards/chosen": 2.7706007957458496, "rewards/margins": 4.623690605163574, "rewards/rejected": -1.8530895709991455, "step": 2532 }, { "epoch": 1.8505936073059361, "grad_norm": 21.70290915681906, "learning_rate": 3.2491441707182024e-07, "logits/chosen": -2.6458213329315186, "logits/rejected": -2.2794854640960693, "logps/chosen": -701.202392578125, "logps/rejected": -607.9244384765625, "loss": 0.1183, "rewards/accuracies": 1.0, "rewards/chosen": 4.599727153778076, "rewards/margins": 5.1623616218566895, "rewards/rejected": -0.5626354813575745, "step": 2533 }, { "epoch": 1.851324200913242, "grad_norm": 37.036334368786584, "learning_rate": 3.2476222746451576e-07, "logits/chosen": -3.1275954246520996, "logits/rejected": -2.357858419418335, "logps/chosen": -711.3692016601562, "logps/rejected": -538.4617919921875, "loss": 0.1561, "rewards/accuracies": 1.0, "rewards/chosen": 4.402895927429199, "rewards/margins": 6.322117328643799, "rewards/rejected": -1.9192215204238892, "step": 2534 }, { "epoch": 1.8520547945205479, "grad_norm": 41.20415154437945, "learning_rate": 3.2461000742413366e-07, "logits/chosen": -2.809337615966797, "logits/rejected": -2.2703802585601807, "logps/chosen": -697.2780151367188, "logps/rejected": -581.8656005859375, "loss": 0.1845, "rewards/accuracies": 0.875, "rewards/chosen": 2.7830495834350586, "rewards/margins": 3.086331844329834, "rewards/rejected": -0.3032822906970978, "step": 2535 }, { "epoch": 1.8527853881278538, "grad_norm": 31.60032058746349, "learning_rate": 3.2445775701263725e-07, "logits/chosen": -2.4768917560577393, "logits/rejected": -2.179795503616333, "logps/chosen": -541.8765869140625, "logps/rejected": -595.00732421875, "loss": 0.1553, "rewards/accuracies": 0.875, "rewards/chosen": 1.358060598373413, "rewards/margins": 1.7475576400756836, "rewards/rejected": -0.3894970417022705, "step": 2536 }, { "epoch": 1.8535159817351599, "grad_norm": 50.085544497171284, "learning_rate": 3.243054762920025e-07, "logits/chosen": -2.491196870803833, "logits/rejected": -2.0382418632507324, "logps/chosen": -786.7119140625, "logps/rejected": -695.3277587890625, "loss": 0.2567, "rewards/accuracies": 0.875, "rewards/chosen": 3.5129923820495605, "rewards/margins": 3.087825059890747, "rewards/rejected": 0.4251672923564911, "step": 2537 }, { "epoch": 1.8542465753424657, "grad_norm": 22.96385072110961, "learning_rate": 3.241531653242174e-07, "logits/chosen": -2.904815435409546, "logits/rejected": -1.9639548063278198, "logps/chosen": -838.429931640625, "logps/rejected": -396.98089599609375, "loss": 0.1087, "rewards/accuracies": 0.875, "rewards/chosen": 4.045186996459961, "rewards/margins": 5.521523475646973, "rewards/rejected": -1.4763364791870117, "step": 2538 }, { "epoch": 1.8549771689497718, "grad_norm": 32.47619884178162, "learning_rate": 3.2400082417128246e-07, "logits/chosen": -2.87829852104187, "logits/rejected": -1.7674777507781982, "logps/chosen": -712.5162353515625, "logps/rejected": -581.830810546875, "loss": 0.159, "rewards/accuracies": 0.875, "rewards/chosen": 3.7171225547790527, "rewards/margins": 4.039636611938477, "rewards/rejected": -0.3225138485431671, "step": 2539 }, { "epoch": 1.8557077625570777, "grad_norm": 60.24676429005856, "learning_rate": 3.238484528952104e-07, "logits/chosen": -2.522775173187256, "logits/rejected": -1.340002179145813, "logps/chosen": -655.3597412109375, "logps/rejected": -312.037109375, "loss": 0.3249, "rewards/accuracies": 0.875, "rewards/chosen": 3.3016834259033203, "rewards/margins": 3.6020731925964355, "rewards/rejected": -0.30038943886756897, "step": 2540 }, { "epoch": 1.8564383561643836, "grad_norm": 54.95340044713453, "learning_rate": 3.2369605155802614e-07, "logits/chosen": -2.7611024379730225, "logits/rejected": -2.751904010772705, "logps/chosen": -579.6663208007812, "logps/rejected": -572.0816650390625, "loss": 0.296, "rewards/accuracies": 0.75, "rewards/chosen": 2.6653056144714355, "rewards/margins": 2.545186758041382, "rewards/rejected": 0.12011884897947311, "step": 2541 }, { "epoch": 1.8571689497716894, "grad_norm": 54.079558363696904, "learning_rate": 3.2354362022176704e-07, "logits/chosen": -2.912104606628418, "logits/rejected": -2.3278768062591553, "logps/chosen": -819.5817260742188, "logps/rejected": -661.194091796875, "loss": 0.2813, "rewards/accuracies": 1.0, "rewards/chosen": 2.7978551387786865, "rewards/margins": 2.0404322147369385, "rewards/rejected": 0.7574228048324585, "step": 2542 }, { "epoch": 1.8578995433789953, "grad_norm": 27.692863960873904, "learning_rate": 3.233911589484825e-07, "logits/chosen": -2.8716843128204346, "logits/rejected": -2.238217353820801, "logps/chosen": -499.7293701171875, "logps/rejected": -553.9776611328125, "loss": 0.1291, "rewards/accuracies": 1.0, "rewards/chosen": 2.733778238296509, "rewards/margins": 3.881829261779785, "rewards/rejected": -1.148051142692566, "step": 2543 }, { "epoch": 1.8586301369863014, "grad_norm": 32.33939621965077, "learning_rate": 3.232386678002342e-07, "logits/chosen": -2.7629339694976807, "logits/rejected": -2.3904378414154053, "logps/chosen": -573.8262329101562, "logps/rejected": -606.800537109375, "loss": 0.1617, "rewards/accuracies": 1.0, "rewards/chosen": 3.5458250045776367, "rewards/margins": 4.16127872467041, "rewards/rejected": -0.6154533624649048, "step": 2544 }, { "epoch": 1.8593607305936073, "grad_norm": 43.29194926419634, "learning_rate": 3.2308614683909573e-07, "logits/chosen": -2.719393730163574, "logits/rejected": -1.81748366355896, "logps/chosen": -936.6205444335938, "logps/rejected": -605.0198974609375, "loss": 0.2369, "rewards/accuracies": 0.875, "rewards/chosen": 1.8878084421157837, "rewards/margins": 4.193091869354248, "rewards/rejected": -2.305283308029175, "step": 2545 }, { "epoch": 1.8600913242009134, "grad_norm": 30.004380987082016, "learning_rate": 3.229335961271532e-07, "logits/chosen": -2.6089630126953125, "logits/rejected": -1.9810848236083984, "logps/chosen": -805.5315551757812, "logps/rejected": -593.4120483398438, "loss": 0.1381, "rewards/accuracies": 1.0, "rewards/chosen": 3.3861403465270996, "rewards/margins": 4.498762130737305, "rewards/rejected": -1.112621784210205, "step": 2546 }, { "epoch": 1.8608219178082193, "grad_norm": 29.179077763040464, "learning_rate": 3.2278101572650467e-07, "logits/chosen": -2.1848509311676025, "logits/rejected": -2.258228063583374, "logps/chosen": -431.9064636230469, "logps/rejected": -479.2538757324219, "loss": 0.1199, "rewards/accuracies": 0.875, "rewards/chosen": 1.6158896684646606, "rewards/margins": 3.9255213737487793, "rewards/rejected": -2.309631586074829, "step": 2547 }, { "epoch": 1.8615525114155251, "grad_norm": 59.253949383801626, "learning_rate": 3.226284056992602e-07, "logits/chosen": -2.960531234741211, "logits/rejected": -2.484327554702759, "logps/chosen": -694.3763427734375, "logps/rejected": -673.0567626953125, "loss": 0.2701, "rewards/accuracies": 0.875, "rewards/chosen": 2.757085084915161, "rewards/margins": 1.8895996809005737, "rewards/rejected": 0.8674854636192322, "step": 2548 }, { "epoch": 1.862283105022831, "grad_norm": 33.660067172694596, "learning_rate": 3.224757661075419e-07, "logits/chosen": -2.867187023162842, "logits/rejected": -2.390076160430908, "logps/chosen": -691.9193725585938, "logps/rejected": -714.4903564453125, "loss": 0.1434, "rewards/accuracies": 0.875, "rewards/chosen": 3.5835537910461426, "rewards/margins": 3.7642927169799805, "rewards/rejected": -0.18073883652687073, "step": 2549 }, { "epoch": 1.8630136986301369, "grad_norm": 28.996465007137616, "learning_rate": 3.2232309701348413e-07, "logits/chosen": -3.200896739959717, "logits/rejected": -2.6018543243408203, "logps/chosen": -785.279541015625, "logps/rejected": -756.845703125, "loss": 0.1436, "rewards/accuracies": 1.0, "rewards/chosen": 3.3817989826202393, "rewards/margins": 3.3131396770477295, "rewards/rejected": 0.06865949928760529, "step": 2550 }, { "epoch": 1.8637442922374428, "grad_norm": 35.62019396035678, "learning_rate": 3.221703984792331e-07, "logits/chosen": -2.010125160217285, "logits/rejected": -1.7673108577728271, "logps/chosen": -569.09130859375, "logps/rejected": -579.4823608398438, "loss": 0.1911, "rewards/accuracies": 1.0, "rewards/chosen": 2.8030614852905273, "rewards/margins": 3.645293712615967, "rewards/rejected": -0.842232346534729, "step": 2551 }, { "epoch": 1.8644748858447489, "grad_norm": 46.69513408134728, "learning_rate": 3.220176705669468e-07, "logits/chosen": -2.8609447479248047, "logits/rejected": -2.4491355419158936, "logps/chosen": -908.5523681640625, "logps/rejected": -855.15283203125, "loss": 0.214, "rewards/accuracies": 1.0, "rewards/chosen": 2.242658853530884, "rewards/margins": 2.4726200103759766, "rewards/rejected": -0.22996121644973755, "step": 2552 }, { "epoch": 1.865205479452055, "grad_norm": 27.5864364246159, "learning_rate": 3.2186491333879574e-07, "logits/chosen": -2.9659037590026855, "logits/rejected": -2.1572561264038086, "logps/chosen": -494.6399841308594, "logps/rejected": -319.5474548339844, "loss": 0.1757, "rewards/accuracies": 1.0, "rewards/chosen": 2.1994593143463135, "rewards/margins": 3.2224645614624023, "rewards/rejected": -1.0230053663253784, "step": 2553 }, { "epoch": 1.8659360730593608, "grad_norm": 35.425760359804734, "learning_rate": 3.2171212685696173e-07, "logits/chosen": -2.471566677093506, "logits/rejected": -2.1062138080596924, "logps/chosen": -832.5053100585938, "logps/rejected": -628.2896728515625, "loss": 0.139, "rewards/accuracies": 1.0, "rewards/chosen": 2.219808578491211, "rewards/margins": 3.186755657196045, "rewards/rejected": -0.9669471979141235, "step": 2554 }, { "epoch": 1.8666666666666667, "grad_norm": 33.95679556206313, "learning_rate": 3.2155931118363904e-07, "logits/chosen": -2.8565456867218018, "logits/rejected": -1.8283852338790894, "logps/chosen": -639.3302001953125, "logps/rejected": -381.4600830078125, "loss": 0.1478, "rewards/accuracies": 1.0, "rewards/chosen": 3.0509209632873535, "rewards/margins": 4.309543132781982, "rewards/rejected": -1.2586222887039185, "step": 2555 }, { "epoch": 1.8673972602739726, "grad_norm": 25.077166064220656, "learning_rate": 3.214064663810333e-07, "logits/chosen": -2.555299758911133, "logits/rejected": -2.3335037231445312, "logps/chosen": -735.5197143554688, "logps/rejected": -660.1614990234375, "loss": 0.1697, "rewards/accuracies": 1.0, "rewards/chosen": 3.3594820499420166, "rewards/margins": 4.298807144165039, "rewards/rejected": -0.939325213432312, "step": 2556 }, { "epoch": 1.8681278538812784, "grad_norm": 50.378237333942124, "learning_rate": 3.2125359251136253e-07, "logits/chosen": -2.8495938777923584, "logits/rejected": -2.287883758544922, "logps/chosen": -392.77789306640625, "logps/rejected": -377.78277587890625, "loss": 0.2751, "rewards/accuracies": 1.0, "rewards/chosen": 1.0836222171783447, "rewards/margins": 1.876592993736267, "rewards/rejected": -0.7929707765579224, "step": 2557 }, { "epoch": 1.8688584474885843, "grad_norm": 46.82713180878688, "learning_rate": 3.211006896368561e-07, "logits/chosen": -2.577944040298462, "logits/rejected": -2.2072651386260986, "logps/chosen": -910.2057495117188, "logps/rejected": -783.8098754882812, "loss": 0.2179, "rewards/accuracies": 0.875, "rewards/chosen": 4.09901237487793, "rewards/margins": 4.459997177124023, "rewards/rejected": -0.36098426580429077, "step": 2558 }, { "epoch": 1.8695890410958904, "grad_norm": 41.275328102402014, "learning_rate": 3.2094775781975546e-07, "logits/chosen": -2.647000789642334, "logits/rejected": -2.294607162475586, "logps/chosen": -587.1248779296875, "logps/rejected": -509.9465637207031, "loss": 0.2202, "rewards/accuracies": 0.625, "rewards/chosen": 1.5369681119918823, "rewards/margins": 2.150031566619873, "rewards/rejected": -0.613063395023346, "step": 2559 }, { "epoch": 1.8703196347031965, "grad_norm": 32.90115441613049, "learning_rate": 3.207947971223138e-07, "logits/chosen": -2.5670394897460938, "logits/rejected": -1.4924348592758179, "logps/chosen": -581.77294921875, "logps/rejected": -407.84539794921875, "loss": 0.1893, "rewards/accuracies": 1.0, "rewards/chosen": 2.3120205402374268, "rewards/margins": 3.897942304611206, "rewards/rejected": -1.5859216451644897, "step": 2560 }, { "epoch": 1.8710502283105024, "grad_norm": 35.5209211570273, "learning_rate": 3.206418076067962e-07, "logits/chosen": -2.236259698867798, "logits/rejected": -2.6321773529052734, "logps/chosen": -562.0205078125, "logps/rejected": -680.0830688476562, "loss": 0.178, "rewards/accuracies": 1.0, "rewards/chosen": 2.246880054473877, "rewards/margins": 3.40295147895813, "rewards/rejected": -1.1560711860656738, "step": 2561 }, { "epoch": 1.8717808219178083, "grad_norm": 36.69990207752604, "learning_rate": 3.2048878933547903e-07, "logits/chosen": -2.388458728790283, "logits/rejected": -2.2117667198181152, "logps/chosen": -598.1431884765625, "logps/rejected": -553.342041015625, "loss": 0.1745, "rewards/accuracies": 0.875, "rewards/chosen": 2.2874956130981445, "rewards/margins": 2.1485540866851807, "rewards/rejected": 0.13894161581993103, "step": 2562 }, { "epoch": 1.8725114155251141, "grad_norm": 48.9454839631873, "learning_rate": 3.203357423706508e-07, "logits/chosen": -2.490154981613159, "logits/rejected": -2.5353803634643555, "logps/chosen": -488.8948059082031, "logps/rejected": -604.0239868164062, "loss": 0.2208, "rewards/accuracies": 0.75, "rewards/chosen": 1.9168041944503784, "rewards/margins": 3.3383264541625977, "rewards/rejected": -1.4215223789215088, "step": 2563 }, { "epoch": 1.87324200913242, "grad_norm": 30.208802977671095, "learning_rate": 3.201826667746116e-07, "logits/chosen": -2.8510525226593018, "logits/rejected": -2.1837871074676514, "logps/chosen": -654.0027465820312, "logps/rejected": -461.0645446777344, "loss": 0.1448, "rewards/accuracies": 1.0, "rewards/chosen": 2.4437880516052246, "rewards/margins": 4.110433101654053, "rewards/rejected": -1.6666452884674072, "step": 2564 }, { "epoch": 1.8739726027397259, "grad_norm": 38.61416147991401, "learning_rate": 3.20029562609673e-07, "logits/chosen": -2.697856903076172, "logits/rejected": -1.89353609085083, "logps/chosen": -514.8805541992188, "logps/rejected": -361.8417663574219, "loss": 0.2538, "rewards/accuracies": 0.625, "rewards/chosen": 1.9594769477844238, "rewards/margins": 3.130096912384033, "rewards/rejected": -1.1706197261810303, "step": 2565 }, { "epoch": 1.874703196347032, "grad_norm": 35.25856639040008, "learning_rate": 3.1987642993815854e-07, "logits/chosen": -2.946760654449463, "logits/rejected": -1.9819672107696533, "logps/chosen": -621.6567993164062, "logps/rejected": -441.09234619140625, "loss": 0.2164, "rewards/accuracies": 1.0, "rewards/chosen": 3.7889246940612793, "rewards/margins": 4.717036247253418, "rewards/rejected": -0.9281115531921387, "step": 2566 }, { "epoch": 1.8754337899543378, "grad_norm": 37.02376143955139, "learning_rate": 3.19723268822403e-07, "logits/chosen": -2.7661428451538086, "logits/rejected": -1.953096628189087, "logps/chosen": -928.99951171875, "logps/rejected": -665.1915283203125, "loss": 0.1742, "rewards/accuracies": 0.75, "rewards/chosen": 4.061299800872803, "rewards/margins": 4.383777618408203, "rewards/rejected": -0.32247766852378845, "step": 2567 }, { "epoch": 1.876164383561644, "grad_norm": 46.3772031358417, "learning_rate": 3.1957007932475287e-07, "logits/chosen": -3.001157283782959, "logits/rejected": -1.9245694875717163, "logps/chosen": -699.2286987304688, "logps/rejected": -475.3810729980469, "loss": 0.1133, "rewards/accuracies": 1.0, "rewards/chosen": 6.100053787231445, "rewards/margins": 7.288252830505371, "rewards/rejected": -1.1881989240646362, "step": 2568 }, { "epoch": 1.8768949771689498, "grad_norm": 43.4497397323908, "learning_rate": 3.1941686150756626e-07, "logits/chosen": -2.574769973754883, "logits/rejected": -2.5405068397521973, "logps/chosen": -804.7646484375, "logps/rejected": -665.7572021484375, "loss": 0.2295, "rewards/accuracies": 0.875, "rewards/chosen": 3.659900188446045, "rewards/margins": 3.2474498748779297, "rewards/rejected": 0.4124504327774048, "step": 2569 }, { "epoch": 1.8776255707762557, "grad_norm": 24.653753388730564, "learning_rate": 3.192636154332128e-07, "logits/chosen": -3.057744026184082, "logits/rejected": -1.7604267597198486, "logps/chosen": -604.8856201171875, "logps/rejected": -346.1068115234375, "loss": 0.1034, "rewards/accuracies": 1.0, "rewards/chosen": 3.0166280269622803, "rewards/margins": 5.227137088775635, "rewards/rejected": -2.2105088233947754, "step": 2570 }, { "epoch": 1.8783561643835616, "grad_norm": 48.666720734309294, "learning_rate": 3.1911034116407353e-07, "logits/chosen": -2.221406936645508, "logits/rejected": -2.366623640060425, "logps/chosen": -490.91021728515625, "logps/rejected": -729.2714233398438, "loss": 0.2568, "rewards/accuracies": 1.0, "rewards/chosen": 2.127851724624634, "rewards/margins": 4.382823944091797, "rewards/rejected": -2.254971981048584, "step": 2571 }, { "epoch": 1.8790867579908674, "grad_norm": 48.26496994942347, "learning_rate": 3.189570387625411e-07, "logits/chosen": -2.1569032669067383, "logits/rejected": -2.527430295944214, "logps/chosen": -520.607177734375, "logps/rejected": -715.1273803710938, "loss": 0.2833, "rewards/accuracies": 0.875, "rewards/chosen": 0.7512099146842957, "rewards/margins": 1.1338930130004883, "rewards/rejected": -0.3826831579208374, "step": 2572 }, { "epoch": 1.8798173515981735, "grad_norm": 39.81651796100536, "learning_rate": 3.188037082910194e-07, "logits/chosen": -3.082050323486328, "logits/rejected": -1.6200406551361084, "logps/chosen": -927.56689453125, "logps/rejected": -477.0766296386719, "loss": 0.2049, "rewards/accuracies": 0.625, "rewards/chosen": 2.6714065074920654, "rewards/margins": 2.7834348678588867, "rewards/rejected": -0.11202839016914368, "step": 2573 }, { "epoch": 1.8805479452054794, "grad_norm": 30.829008001799213, "learning_rate": 3.1865034981192407e-07, "logits/chosen": -2.6229608058929443, "logits/rejected": -1.9028472900390625, "logps/chosen": -454.2479248046875, "logps/rejected": -487.4287414550781, "loss": 0.1975, "rewards/accuracies": 1.0, "rewards/chosen": 2.173265218734741, "rewards/margins": 3.3131370544433594, "rewards/rejected": -1.1398721933364868, "step": 2574 }, { "epoch": 1.8812785388127855, "grad_norm": 23.369374949854244, "learning_rate": 3.184969633876818e-07, "logits/chosen": -2.6888961791992188, "logits/rejected": -2.202782154083252, "logps/chosen": -433.453857421875, "logps/rejected": -308.7692565917969, "loss": 0.1162, "rewards/accuracies": 1.0, "rewards/chosen": 2.846029758453369, "rewards/margins": 4.491416931152344, "rewards/rejected": -1.6453869342803955, "step": 2575 }, { "epoch": 1.8820091324200914, "grad_norm": 33.97723344503088, "learning_rate": 3.183435490807308e-07, "logits/chosen": -3.0288546085357666, "logits/rejected": -1.8310213088989258, "logps/chosen": -546.5467529296875, "logps/rejected": -312.50457763671875, "loss": 0.1611, "rewards/accuracies": 1.0, "rewards/chosen": 2.5917410850524902, "rewards/margins": 3.919219493865967, "rewards/rejected": -1.327478289604187, "step": 2576 }, { "epoch": 1.8827397260273973, "grad_norm": 32.73671120840008, "learning_rate": 3.181901069535208e-07, "logits/chosen": -2.7887372970581055, "logits/rejected": -2.9721970558166504, "logps/chosen": -430.0021057128906, "logps/rejected": -474.21441650390625, "loss": 0.1834, "rewards/accuracies": 1.0, "rewards/chosen": 1.6584466695785522, "rewards/margins": 3.2449896335601807, "rewards/rejected": -1.5865432024002075, "step": 2577 }, { "epoch": 1.8834703196347031, "grad_norm": 26.51778750573318, "learning_rate": 3.1803663706851256e-07, "logits/chosen": -2.9810843467712402, "logits/rejected": -1.955348014831543, "logps/chosen": -404.02435302734375, "logps/rejected": -314.9716796875, "loss": 0.1572, "rewards/accuracies": 1.0, "rewards/chosen": 2.4307870864868164, "rewards/margins": 4.179549694061279, "rewards/rejected": -1.748762607574463, "step": 2578 }, { "epoch": 1.884200913242009, "grad_norm": 19.048275302299352, "learning_rate": 3.178831394881785e-07, "logits/chosen": -3.0351014137268066, "logits/rejected": -1.7769062519073486, "logps/chosen": -739.532470703125, "logps/rejected": -451.1850280761719, "loss": 0.0889, "rewards/accuracies": 1.0, "rewards/chosen": 4.367505073547363, "rewards/margins": 5.8992767333984375, "rewards/rejected": -1.5317718982696533, "step": 2579 }, { "epoch": 1.884931506849315, "grad_norm": 33.148200896221354, "learning_rate": 3.177296142750018e-07, "logits/chosen": -2.5953996181488037, "logits/rejected": -2.0313456058502197, "logps/chosen": -520.4545288085938, "logps/rejected": -500.3861083984375, "loss": 0.1431, "rewards/accuracies": 0.875, "rewards/chosen": 3.0616753101348877, "rewards/margins": 4.238535404205322, "rewards/rejected": -1.1768604516983032, "step": 2580 }, { "epoch": 1.885662100456621, "grad_norm": 35.22162671259307, "learning_rate": 3.1757606149147734e-07, "logits/chosen": -3.0685839653015137, "logits/rejected": -1.815833568572998, "logps/chosen": -875.1016845703125, "logps/rejected": -542.5608520507812, "loss": 0.1154, "rewards/accuracies": 0.875, "rewards/chosen": 4.85587215423584, "rewards/margins": 5.41999626159668, "rewards/rejected": -0.5641239881515503, "step": 2581 }, { "epoch": 1.886392694063927, "grad_norm": 31.53726172249463, "learning_rate": 3.17422481200111e-07, "logits/chosen": -2.553004741668701, "logits/rejected": -1.7506948709487915, "logps/chosen": -656.6232299804688, "logps/rejected": -463.429443359375, "loss": 0.1615, "rewards/accuracies": 1.0, "rewards/chosen": 2.176004409790039, "rewards/margins": 3.84187912940979, "rewards/rejected": -1.6658750772476196, "step": 2582 }, { "epoch": 1.887123287671233, "grad_norm": 24.37097072065837, "learning_rate": 3.1726887346342006e-07, "logits/chosen": -2.7617599964141846, "logits/rejected": -2.3572018146514893, "logps/chosen": -552.4759521484375, "logps/rejected": -518.824462890625, "loss": 0.1817, "rewards/accuracies": 1.0, "rewards/chosen": 2.000523805618286, "rewards/margins": 2.9021425247192383, "rewards/rejected": -0.9016189575195312, "step": 2583 }, { "epoch": 1.8878538812785388, "grad_norm": 30.187588962084234, "learning_rate": 3.171152383439327e-07, "logits/chosen": -2.8901031017303467, "logits/rejected": -2.3153164386749268, "logps/chosen": -728.6683349609375, "logps/rejected": -610.37646484375, "loss": 0.1506, "rewards/accuracies": 1.0, "rewards/chosen": 2.7961838245391846, "rewards/margins": 3.585287570953369, "rewards/rejected": -0.7891038656234741, "step": 2584 }, { "epoch": 1.8885844748858447, "grad_norm": 47.150326831449426, "learning_rate": 3.1696157590418856e-07, "logits/chosen": -3.1124253273010254, "logits/rejected": -2.4971847534179688, "logps/chosen": -736.340087890625, "logps/rejected": -566.9400024414062, "loss": 0.2205, "rewards/accuracies": 0.75, "rewards/chosen": 4.5464582443237305, "rewards/margins": 3.733241558074951, "rewards/rejected": 0.8132164478302002, "step": 2585 }, { "epoch": 1.8893150684931506, "grad_norm": 25.326040456323064, "learning_rate": 3.168078862067379e-07, "logits/chosen": -2.448582887649536, "logits/rejected": -2.5942680835723877, "logps/chosen": -444.46612548828125, "logps/rejected": -638.9609375, "loss": 0.1564, "rewards/accuracies": 0.75, "rewards/chosen": 2.556966781616211, "rewards/margins": 2.680589199066162, "rewards/rejected": -0.12362246215343475, "step": 2586 }, { "epoch": 1.8900456621004567, "grad_norm": 34.78511041020281, "learning_rate": 3.1665416931414276e-07, "logits/chosen": -2.8772895336151123, "logits/rejected": -1.958507776260376, "logps/chosen": -935.6824951171875, "logps/rejected": -620.5072631835938, "loss": 0.1517, "rewards/accuracies": 1.0, "rewards/chosen": 4.2978410720825195, "rewards/margins": 5.2371320724487305, "rewards/rejected": -0.9392907023429871, "step": 2587 }, { "epoch": 1.8907762557077625, "grad_norm": 57.190831955529596, "learning_rate": 3.165004252889756e-07, "logits/chosen": -2.793778657913208, "logits/rejected": -2.4688260555267334, "logps/chosen": -932.260009765625, "logps/rejected": -845.560302734375, "loss": 0.2456, "rewards/accuracies": 1.0, "rewards/chosen": 3.1869699954986572, "rewards/margins": 4.197437286376953, "rewards/rejected": -1.0104678869247437, "step": 2588 }, { "epoch": 1.8915068493150686, "grad_norm": 47.0957562197089, "learning_rate": 3.1634665419382034e-07, "logits/chosen": -2.235755681991577, "logits/rejected": -2.099142551422119, "logps/chosen": -660.406494140625, "logps/rejected": -837.4140014648438, "loss": 0.2572, "rewards/accuracies": 0.75, "rewards/chosen": 2.604125499725342, "rewards/margins": 1.921950101852417, "rewards/rejected": 0.68217533826828, "step": 2589 }, { "epoch": 1.8922374429223745, "grad_norm": 29.26047426143169, "learning_rate": 3.161928560912719e-07, "logits/chosen": -2.960601806640625, "logits/rejected": -1.7163664102554321, "logps/chosen": -538.0711669921875, "logps/rejected": -325.9388732910156, "loss": 0.1381, "rewards/accuracies": 1.0, "rewards/chosen": 2.974870204925537, "rewards/margins": 4.073451519012451, "rewards/rejected": -1.098581075668335, "step": 2590 }, { "epoch": 1.8929680365296804, "grad_norm": 39.9343761004382, "learning_rate": 3.160390310439359e-07, "logits/chosen": -2.7818446159362793, "logits/rejected": -2.2155776023864746, "logps/chosen": -338.62872314453125, "logps/rejected": -297.7384338378906, "loss": 0.2414, "rewards/accuracies": 0.875, "rewards/chosen": 3.5067873001098633, "rewards/margins": 4.8163981437683105, "rewards/rejected": -1.3096108436584473, "step": 2591 }, { "epoch": 1.8936986301369862, "grad_norm": 16.20384069055852, "learning_rate": 3.1588517911442927e-07, "logits/chosen": -2.3250277042388916, "logits/rejected": -2.522632122039795, "logps/chosen": -757.9306640625, "logps/rejected": -837.63525390625, "loss": 0.0633, "rewards/accuracies": 1.0, "rewards/chosen": 3.400606393814087, "rewards/margins": 4.383998870849609, "rewards/rejected": -0.983392596244812, "step": 2592 }, { "epoch": 1.8944292237442921, "grad_norm": 48.399808924274176, "learning_rate": 3.1573130036537956e-07, "logits/chosen": -3.003164052963257, "logits/rejected": -2.731022834777832, "logps/chosen": -526.5181884765625, "logps/rejected": -510.1338806152344, "loss": 0.2424, "rewards/accuracies": 0.75, "rewards/chosen": 1.3269015550613403, "rewards/margins": 1.5554375648498535, "rewards/rejected": -0.2285359799861908, "step": 2593 }, { "epoch": 1.8951598173515982, "grad_norm": 35.513947661667025, "learning_rate": 3.155773948594255e-07, "logits/chosen": -3.44218111038208, "logits/rejected": -2.385676860809326, "logps/chosen": -835.0296630859375, "logps/rejected": -569.115478515625, "loss": 0.1271, "rewards/accuracies": 0.875, "rewards/chosen": 4.431820869445801, "rewards/margins": 5.203072547912598, "rewards/rejected": -0.7712519764900208, "step": 2594 }, { "epoch": 1.895890410958904, "grad_norm": 31.6920502234122, "learning_rate": 3.1542346265921664e-07, "logits/chosen": -3.160351514816284, "logits/rejected": -2.793520450592041, "logps/chosen": -636.2955322265625, "logps/rejected": -584.787109375, "loss": 0.1971, "rewards/accuracies": 1.0, "rewards/chosen": 1.2522236108779907, "rewards/margins": 2.6149063110351562, "rewards/rejected": -1.362682580947876, "step": 2595 }, { "epoch": 1.8966210045662102, "grad_norm": 25.958030278461038, "learning_rate": 3.1526950382741343e-07, "logits/chosen": -2.655125856399536, "logits/rejected": -2.07281756401062, "logps/chosen": -847.8348388671875, "logps/rejected": -562.2785034179688, "loss": 0.1107, "rewards/accuracies": 0.875, "rewards/chosen": 4.141880989074707, "rewards/margins": 4.372763633728027, "rewards/rejected": -0.23088262975215912, "step": 2596 }, { "epoch": 1.897351598173516, "grad_norm": 31.1918619350715, "learning_rate": 3.1511551842668694e-07, "logits/chosen": -3.0375173091888428, "logits/rejected": -2.6758203506469727, "logps/chosen": -522.8521118164062, "logps/rejected": -479.2867126464844, "loss": 0.1332, "rewards/accuracies": 0.875, "rewards/chosen": 3.042736053466797, "rewards/margins": 2.620880126953125, "rewards/rejected": 0.4218556880950928, "step": 2597 }, { "epoch": 1.898082191780822, "grad_norm": 31.05227191863091, "learning_rate": 3.149615065197193e-07, "logits/chosen": -2.568295478820801, "logits/rejected": -2.1799263954162598, "logps/chosen": -678.266845703125, "logps/rejected": -621.3465576171875, "loss": 0.1685, "rewards/accuracies": 1.0, "rewards/chosen": 2.946012496948242, "rewards/margins": 3.842522382736206, "rewards/rejected": -0.896510124206543, "step": 2598 }, { "epoch": 1.8988127853881278, "grad_norm": 37.09902487241915, "learning_rate": 3.148074681692033e-07, "logits/chosen": -2.530186653137207, "logits/rejected": -2.014254093170166, "logps/chosen": -313.7449035644531, "logps/rejected": -423.8101806640625, "loss": 0.2209, "rewards/accuracies": 1.0, "rewards/chosen": 2.896467685699463, "rewards/margins": 4.414421081542969, "rewards/rejected": -1.517953872680664, "step": 2599 }, { "epoch": 1.8995433789954337, "grad_norm": 48.38674508178221, "learning_rate": 3.146534034378427e-07, "logits/chosen": -2.708343505859375, "logits/rejected": -2.0326406955718994, "logps/chosen": -691.3690185546875, "logps/rejected": -564.0987548828125, "loss": 0.2905, "rewards/accuracies": 0.75, "rewards/chosen": 2.272322416305542, "rewards/margins": 2.5610737800598145, "rewards/rejected": -0.28875142335891724, "step": 2600 }, { "epoch": 1.9002739726027398, "grad_norm": 34.58908653356666, "learning_rate": 3.144993123883517e-07, "logits/chosen": -2.567890167236328, "logits/rejected": -2.52493953704834, "logps/chosen": -758.5366821289062, "logps/rejected": -722.5106811523438, "loss": 0.1702, "rewards/accuracies": 1.0, "rewards/chosen": 3.810309886932373, "rewards/margins": 3.816617012023926, "rewards/rejected": -0.006306827068328857, "step": 2601 }, { "epoch": 1.9010045662100457, "grad_norm": 26.931629789254654, "learning_rate": 3.1434519508345534e-07, "logits/chosen": -2.7301013469696045, "logits/rejected": -2.283397912979126, "logps/chosen": -485.9410705566406, "logps/rejected": -495.2358093261719, "loss": 0.1447, "rewards/accuracies": 1.0, "rewards/chosen": 1.9072115421295166, "rewards/margins": 3.330218553543091, "rewards/rejected": -1.4230068922042847, "step": 2602 }, { "epoch": 1.9017351598173518, "grad_norm": 42.46030573932037, "learning_rate": 3.1419105158588955e-07, "logits/chosen": -3.154021739959717, "logits/rejected": -2.461397171020508, "logps/chosen": -586.5245361328125, "logps/rejected": -571.468017578125, "loss": 0.213, "rewards/accuracies": 0.75, "rewards/chosen": 2.7710094451904297, "rewards/margins": 3.642831325531006, "rewards/rejected": -0.8718221783638, "step": 2603 }, { "epoch": 1.9024657534246576, "grad_norm": 27.177268863355156, "learning_rate": 3.140368819584005e-07, "logits/chosen": -2.935992956161499, "logits/rejected": -2.2766597270965576, "logps/chosen": -583.7151489257812, "logps/rejected": -452.5581970214844, "loss": 0.1261, "rewards/accuracies": 1.0, "rewards/chosen": 2.39906644821167, "rewards/margins": 2.855165481567383, "rewards/rejected": -0.45609915256500244, "step": 2604 }, { "epoch": 1.9031963470319635, "grad_norm": 21.214149302181184, "learning_rate": 3.1388268626374537e-07, "logits/chosen": -2.379218578338623, "logits/rejected": -2.125715732574463, "logps/chosen": -663.4251098632812, "logps/rejected": -671.817626953125, "loss": 0.1261, "rewards/accuracies": 1.0, "rewards/chosen": 4.13170051574707, "rewards/margins": 4.7412028312683105, "rewards/rejected": -0.6095021963119507, "step": 2605 }, { "epoch": 1.9039269406392694, "grad_norm": 53.60725959466684, "learning_rate": 3.1372846456469175e-07, "logits/chosen": -2.923942804336548, "logits/rejected": -2.0482757091522217, "logps/chosen": -704.7974243164062, "logps/rejected": -598.1884155273438, "loss": 0.285, "rewards/accuracies": 0.875, "rewards/chosen": 3.4375152587890625, "rewards/margins": 4.0330352783203125, "rewards/rejected": -0.5955203175544739, "step": 2606 }, { "epoch": 1.9046575342465752, "grad_norm": 835.857531221679, "learning_rate": 3.13574216924018e-07, "logits/chosen": -2.296515464782715, "logits/rejected": -2.7534334659576416, "logps/chosen": -577.4221801757812, "logps/rejected": -794.3291625976562, "loss": 0.175, "rewards/accuracies": 1.0, "rewards/chosen": 2.5101358890533447, "rewards/margins": 4.9889726638793945, "rewards/rejected": -2.4788365364074707, "step": 2607 }, { "epoch": 1.9053881278538811, "grad_norm": 27.574060208585873, "learning_rate": 3.134199434045127e-07, "logits/chosen": -2.9614362716674805, "logits/rejected": -2.5447299480438232, "logps/chosen": -574.986083984375, "logps/rejected": -507.8302917480469, "loss": 0.137, "rewards/accuracies": 0.875, "rewards/chosen": 3.3338024616241455, "rewards/margins": 3.920962333679199, "rewards/rejected": -0.5871601700782776, "step": 2608 }, { "epoch": 1.9061187214611872, "grad_norm": 36.19368099468901, "learning_rate": 3.1326564406897545e-07, "logits/chosen": -2.774308681488037, "logits/rejected": -1.9375510215759277, "logps/chosen": -729.2027587890625, "logps/rejected": -393.1101379394531, "loss": 0.173, "rewards/accuracies": 1.0, "rewards/chosen": 4.937941074371338, "rewards/margins": 6.237311363220215, "rewards/rejected": -1.2993698120117188, "step": 2609 }, { "epoch": 1.9068493150684933, "grad_norm": 25.77674676579892, "learning_rate": 3.131113189802158e-07, "logits/chosen": -3.2589492797851562, "logits/rejected": -2.114995002746582, "logps/chosen": -755.3521118164062, "logps/rejected": -524.1924438476562, "loss": 0.1389, "rewards/accuracies": 1.0, "rewards/chosen": 4.517442226409912, "rewards/margins": 5.852578639984131, "rewards/rejected": -1.3351361751556396, "step": 2610 }, { "epoch": 1.9075799086757992, "grad_norm": 43.90893146838479, "learning_rate": 3.129569682010543e-07, "logits/chosen": -2.57358980178833, "logits/rejected": -2.371793746948242, "logps/chosen": -338.79937744140625, "logps/rejected": -524.872314453125, "loss": 0.2755, "rewards/accuracies": 1.0, "rewards/chosen": 1.1016062498092651, "rewards/margins": 3.459728240966797, "rewards/rejected": -2.3581221103668213, "step": 2611 }, { "epoch": 1.908310502283105, "grad_norm": 26.05304647193395, "learning_rate": 3.1280259179432163e-07, "logits/chosen": -2.5109853744506836, "logits/rejected": -2.5334999561309814, "logps/chosen": -461.4117736816406, "logps/rejected": -390.248291015625, "loss": 0.1588, "rewards/accuracies": 1.0, "rewards/chosen": 2.889357566833496, "rewards/margins": 3.765237331390381, "rewards/rejected": -0.8758798837661743, "step": 2612 }, { "epoch": 1.909041095890411, "grad_norm": 18.667342219162954, "learning_rate": 3.1264818982285903e-07, "logits/chosen": -2.844654083251953, "logits/rejected": -1.5933629274368286, "logps/chosen": -719.1477661132812, "logps/rejected": -386.9910888671875, "loss": 0.1121, "rewards/accuracies": 1.0, "rewards/chosen": 3.7863898277282715, "rewards/margins": 6.045113563537598, "rewards/rejected": -2.258723735809326, "step": 2613 }, { "epoch": 1.9097716894977168, "grad_norm": 23.214685672207043, "learning_rate": 3.124937623495182e-07, "logits/chosen": -2.7842214107513428, "logits/rejected": -1.9653310775756836, "logps/chosen": -1037.5224609375, "logps/rejected": -723.8997192382812, "loss": 0.0859, "rewards/accuracies": 1.0, "rewards/chosen": 5.089280128479004, "rewards/margins": 4.375473976135254, "rewards/rejected": 0.7138057947158813, "step": 2614 }, { "epoch": 1.9105022831050227, "grad_norm": 51.56921594520441, "learning_rate": 3.12339309437161e-07, "logits/chosen": -2.955200672149658, "logits/rejected": -2.191336154937744, "logps/chosen": -513.5374755859375, "logps/rejected": -400.3296203613281, "loss": 0.2723, "rewards/accuracies": 0.875, "rewards/chosen": 2.3538999557495117, "rewards/margins": 2.6039817333221436, "rewards/rejected": -0.2500816881656647, "step": 2615 }, { "epoch": 1.9112328767123288, "grad_norm": 30.93229418795943, "learning_rate": 3.121848311486598e-07, "logits/chosen": -2.798426389694214, "logits/rejected": -1.9672110080718994, "logps/chosen": -1035.016357421875, "logps/rejected": -718.0670776367188, "loss": 0.1473, "rewards/accuracies": 1.0, "rewards/chosen": 3.6435141563415527, "rewards/margins": 4.079960346221924, "rewards/rejected": -0.43644604086875916, "step": 2616 }, { "epoch": 1.9119634703196347, "grad_norm": 45.36128569864808, "learning_rate": 3.120303275468974e-07, "logits/chosen": -2.8167691230773926, "logits/rejected": -2.986013650894165, "logps/chosen": -712.2171020507812, "logps/rejected": -821.21044921875, "loss": 0.1806, "rewards/accuracies": 0.75, "rewards/chosen": 3.5000386238098145, "rewards/margins": 3.687913417816162, "rewards/rejected": -0.18787479400634766, "step": 2617 }, { "epoch": 1.9126940639269407, "grad_norm": 25.103885717653352, "learning_rate": 3.118757986947667e-07, "logits/chosen": -2.554643154144287, "logits/rejected": -2.363144636154175, "logps/chosen": -510.4095153808594, "logps/rejected": -517.6126098632812, "loss": 0.1448, "rewards/accuracies": 1.0, "rewards/chosen": 3.171241521835327, "rewards/margins": 3.62300968170166, "rewards/rejected": -0.45176830887794495, "step": 2618 }, { "epoch": 1.9134246575342466, "grad_norm": 32.451936364847704, "learning_rate": 3.1172124465517104e-07, "logits/chosen": -2.5665783882141113, "logits/rejected": -2.245429039001465, "logps/chosen": -696.135009765625, "logps/rejected": -612.7950439453125, "loss": 0.2085, "rewards/accuracies": 0.875, "rewards/chosen": 2.0640721321105957, "rewards/margins": 2.3557138442993164, "rewards/rejected": -0.29164183139801025, "step": 2619 }, { "epoch": 1.9141552511415525, "grad_norm": 24.974889733240065, "learning_rate": 3.1156666549102394e-07, "logits/chosen": -3.060175657272339, "logits/rejected": -2.4517316818237305, "logps/chosen": -922.96875, "logps/rejected": -610.3836669921875, "loss": 0.1174, "rewards/accuracies": 1.0, "rewards/chosen": 4.0067219734191895, "rewards/margins": 3.327582359313965, "rewards/rejected": 0.6791393160820007, "step": 2620 }, { "epoch": 1.9148858447488584, "grad_norm": 25.15424705729693, "learning_rate": 3.114120612652491e-07, "logits/chosen": -2.9538421630859375, "logits/rejected": -2.16741943359375, "logps/chosen": -708.07421875, "logps/rejected": -500.9626770019531, "loss": 0.1254, "rewards/accuracies": 1.0, "rewards/chosen": 4.159942150115967, "rewards/margins": 3.543325662612915, "rewards/rejected": 0.6166165471076965, "step": 2621 }, { "epoch": 1.9156164383561642, "grad_norm": 38.997370704638534, "learning_rate": 3.1125743204078035e-07, "logits/chosen": -2.861340045928955, "logits/rejected": -2.2010583877563477, "logps/chosen": -937.4450073242188, "logps/rejected": -712.7041015625, "loss": 0.1384, "rewards/accuracies": 1.0, "rewards/chosen": 4.486687660217285, "rewards/margins": 6.2424726486206055, "rewards/rejected": -1.7557848691940308, "step": 2622 }, { "epoch": 1.9163470319634703, "grad_norm": 43.612050930082674, "learning_rate": 3.1110277788056205e-07, "logits/chosen": -2.8445186614990234, "logits/rejected": -1.4331295490264893, "logps/chosen": -554.7899169921875, "logps/rejected": -245.42535400390625, "loss": 0.2291, "rewards/accuracies": 1.0, "rewards/chosen": 4.856104373931885, "rewards/margins": 6.331587791442871, "rewards/rejected": -1.4754835367202759, "step": 2623 }, { "epoch": 1.9170776255707762, "grad_norm": 22.519644744401774, "learning_rate": 3.109480988475484e-07, "logits/chosen": -2.76865816116333, "logits/rejected": -3.0709376335144043, "logps/chosen": -447.15533447265625, "logps/rejected": -607.20556640625, "loss": 0.1152, "rewards/accuracies": 1.0, "rewards/chosen": 1.719545841217041, "rewards/margins": 2.9044911861419678, "rewards/rejected": -1.1849451065063477, "step": 2624 }, { "epoch": 1.9178082191780823, "grad_norm": 37.23347342068357, "learning_rate": 3.1079339500470376e-07, "logits/chosen": -2.3443596363067627, "logits/rejected": -2.4898290634155273, "logps/chosen": -664.9352416992188, "logps/rejected": -726.8062133789062, "loss": 0.1528, "rewards/accuracies": 0.875, "rewards/chosen": 2.3104848861694336, "rewards/margins": 3.0422568321228027, "rewards/rejected": -0.7317718863487244, "step": 2625 }, { "epoch": 1.9185388127853882, "grad_norm": 46.65039418208328, "learning_rate": 3.1063866641500264e-07, "logits/chosen": -2.6342148780822754, "logits/rejected": -2.2061116695404053, "logps/chosen": -716.8065185546875, "logps/rejected": -653.15673828125, "loss": 0.2242, "rewards/accuracies": 0.875, "rewards/chosen": 3.0913708209991455, "rewards/margins": 4.545989036560059, "rewards/rejected": -1.4546180963516235, "step": 2626 }, { "epoch": 1.919269406392694, "grad_norm": 45.72708506556864, "learning_rate": 3.1048391314142964e-07, "logits/chosen": -2.743277072906494, "logits/rejected": -1.9896434545516968, "logps/chosen": -541.3377685546875, "logps/rejected": -593.716796875, "loss": 0.2428, "rewards/accuracies": 1.0, "rewards/chosen": 4.304584980010986, "rewards/margins": 6.22932767868042, "rewards/rejected": -1.924742579460144, "step": 2627 }, { "epoch": 1.92, "grad_norm": 26.274710514060022, "learning_rate": 3.103291352469794e-07, "logits/chosen": -2.364607095718384, "logits/rejected": -2.024416446685791, "logps/chosen": -405.8942565917969, "logps/rejected": -383.94219970703125, "loss": 0.1753, "rewards/accuracies": 1.0, "rewards/chosen": 1.8972101211547852, "rewards/margins": 2.810755491256714, "rewards/rejected": -0.9135454893112183, "step": 2628 }, { "epoch": 1.9207305936073058, "grad_norm": 49.619661827180465, "learning_rate": 3.101743327946565e-07, "logits/chosen": -2.6098387241363525, "logits/rejected": -2.2793242931365967, "logps/chosen": -540.2523803710938, "logps/rejected": -541.932373046875, "loss": 0.2321, "rewards/accuracies": 1.0, "rewards/chosen": 2.5281620025634766, "rewards/margins": 3.4764137268066406, "rewards/rejected": -0.9482520222663879, "step": 2629 }, { "epoch": 1.921461187214612, "grad_norm": 21.933598271927405, "learning_rate": 3.100195058474756e-07, "logits/chosen": -2.869910955429077, "logits/rejected": -2.7044739723205566, "logps/chosen": -658.349365234375, "logps/rejected": -545.9950561523438, "loss": 0.1199, "rewards/accuracies": 1.0, "rewards/chosen": 3.2344086170196533, "rewards/margins": 3.823394298553467, "rewards/rejected": -0.5889856219291687, "step": 2630 }, { "epoch": 1.9221917808219178, "grad_norm": 34.87135257584297, "learning_rate": 3.0986465446846146e-07, "logits/chosen": -2.6152098178863525, "logits/rejected": -2.023348093032837, "logps/chosen": -360.0379943847656, "logps/rejected": -341.5215148925781, "loss": 0.168, "rewards/accuracies": 1.0, "rewards/chosen": 0.6134947538375854, "rewards/margins": 3.46351957321167, "rewards/rejected": -2.850024938583374, "step": 2631 }, { "epoch": 1.9229223744292239, "grad_norm": 44.23955037500735, "learning_rate": 3.097097787206484e-07, "logits/chosen": -3.0647830963134766, "logits/rejected": -2.4654488563537598, "logps/chosen": -783.85546875, "logps/rejected": -383.6734619140625, "loss": 0.2345, "rewards/accuracies": 1.0, "rewards/chosen": 2.2008328437805176, "rewards/margins": 2.736708402633667, "rewards/rejected": -0.5358755588531494, "step": 2632 }, { "epoch": 1.9236529680365297, "grad_norm": 29.10447657389882, "learning_rate": 3.095548786670811e-07, "logits/chosen": -1.9998693466186523, "logits/rejected": -2.0833868980407715, "logps/chosen": -487.3673400878906, "logps/rejected": -740.5711669921875, "loss": 0.2053, "rewards/accuracies": 0.875, "rewards/chosen": 0.9426041841506958, "rewards/margins": 3.624330759048462, "rewards/rejected": -2.6817264556884766, "step": 2633 }, { "epoch": 1.9243835616438356, "grad_norm": 29.042090027476714, "learning_rate": 3.093999543708137e-07, "logits/chosen": -2.6626110076904297, "logits/rejected": -2.046022891998291, "logps/chosen": -872.84814453125, "logps/rejected": -517.5299072265625, "loss": 0.123, "rewards/accuracies": 1.0, "rewards/chosen": 3.4637539386749268, "rewards/margins": 4.08298921585083, "rewards/rejected": -0.6192352771759033, "step": 2634 }, { "epoch": 1.9251141552511415, "grad_norm": 40.18215057223014, "learning_rate": 3.092450058949108e-07, "logits/chosen": -2.9498822689056396, "logits/rejected": -2.1484646797180176, "logps/chosen": -597.5595092773438, "logps/rejected": -557.0521850585938, "loss": 0.1578, "rewards/accuracies": 1.0, "rewards/chosen": 3.7423720359802246, "rewards/margins": 5.104648590087891, "rewards/rejected": -1.3622760772705078, "step": 2635 }, { "epoch": 1.9258447488584474, "grad_norm": 37.24904308754119, "learning_rate": 3.090900333024461e-07, "logits/chosen": -2.844773769378662, "logits/rejected": -2.148977279663086, "logps/chosen": -545.5488891601562, "logps/rejected": -421.8282165527344, "loss": 0.1775, "rewards/accuracies": 0.75, "rewards/chosen": 3.1858479976654053, "rewards/margins": 4.49310302734375, "rewards/rejected": -1.3072550296783447, "step": 2636 }, { "epoch": 1.9265753424657535, "grad_norm": 47.143488892124594, "learning_rate": 3.0893503665650374e-07, "logits/chosen": -2.586796283721924, "logits/rejected": -2.683919906616211, "logps/chosen": -616.1549682617188, "logps/rejected": -567.177734375, "loss": 0.1785, "rewards/accuracies": 0.875, "rewards/chosen": 2.1940481662750244, "rewards/margins": 3.0530996322631836, "rewards/rejected": -0.8590513467788696, "step": 2637 }, { "epoch": 1.9273059360730593, "grad_norm": 26.839056632778657, "learning_rate": 3.087800160201774e-07, "logits/chosen": -2.7693045139312744, "logits/rejected": -2.2560558319091797, "logps/chosen": -607.2586669921875, "logps/rejected": -388.55462646484375, "loss": 0.1867, "rewards/accuracies": 0.75, "rewards/chosen": 1.0344167947769165, "rewards/margins": 1.2846477031707764, "rewards/rejected": -0.2502310276031494, "step": 2638 }, { "epoch": 1.9280365296803654, "grad_norm": 63.19557665061001, "learning_rate": 3.086249714565704e-07, "logits/chosen": -2.8672516345977783, "logits/rejected": -2.3874616622924805, "logps/chosen": -699.6040649414062, "logps/rejected": -564.9283447265625, "loss": 0.3371, "rewards/accuracies": 0.875, "rewards/chosen": 1.4929286241531372, "rewards/margins": 2.870051145553589, "rewards/rejected": -1.3771226406097412, "step": 2639 }, { "epoch": 1.9287671232876713, "grad_norm": 22.9607221834069, "learning_rate": 3.084699030287961e-07, "logits/chosen": -2.8221993446350098, "logits/rejected": -2.3700294494628906, "logps/chosen": -616.9315185546875, "logps/rejected": -610.8748168945312, "loss": 0.1041, "rewards/accuracies": 1.0, "rewards/chosen": 2.9809703826904297, "rewards/margins": 4.334532737731934, "rewards/rejected": -1.3535619974136353, "step": 2640 }, { "epoch": 1.9294977168949772, "grad_norm": 40.08193664866772, "learning_rate": 3.083148107999772e-07, "logits/chosen": -2.840703248977661, "logits/rejected": -1.2787508964538574, "logps/chosen": -644.3406982421875, "logps/rejected": -300.27813720703125, "loss": 0.1839, "rewards/accuracies": 1.0, "rewards/chosen": 3.7930448055267334, "rewards/margins": 5.707734107971191, "rewards/rejected": -1.9146888256072998, "step": 2641 }, { "epoch": 1.930228310502283, "grad_norm": 29.265628128336726, "learning_rate": 3.0815969483324656e-07, "logits/chosen": -2.7423880100250244, "logits/rejected": -2.428264617919922, "logps/chosen": -483.29705810546875, "logps/rejected": -416.560791015625, "loss": 0.1664, "rewards/accuracies": 1.0, "rewards/chosen": 2.2681238651275635, "rewards/margins": 2.988539695739746, "rewards/rejected": -0.7204160690307617, "step": 2642 }, { "epoch": 1.930958904109589, "grad_norm": 25.888631722078443, "learning_rate": 3.080045551917463e-07, "logits/chosen": -2.9766082763671875, "logits/rejected": -2.901482343673706, "logps/chosen": -621.3540649414062, "logps/rejected": -637.0260620117188, "loss": 0.1647, "rewards/accuracies": 1.0, "rewards/chosen": 2.5861494541168213, "rewards/margins": 3.6952712535858154, "rewards/rejected": -1.1091219186782837, "step": 2643 }, { "epoch": 1.931689497716895, "grad_norm": 24.59985430914683, "learning_rate": 3.0784939193862835e-07, "logits/chosen": -3.0810976028442383, "logits/rejected": -2.132377862930298, "logps/chosen": -865.33056640625, "logps/rejected": -513.759521484375, "loss": 0.1365, "rewards/accuracies": 0.875, "rewards/chosen": 3.6284117698669434, "rewards/margins": 4.191366195678711, "rewards/rejected": -0.5629549622535706, "step": 2644 }, { "epoch": 1.932420091324201, "grad_norm": 19.857543956943747, "learning_rate": 3.0769420513705414e-07, "logits/chosen": -2.787461280822754, "logits/rejected": -2.1764917373657227, "logps/chosen": -775.961669921875, "logps/rejected": -494.3525390625, "loss": 0.1228, "rewards/accuracies": 1.0, "rewards/chosen": 2.91939377784729, "rewards/margins": 3.524435520172119, "rewards/rejected": -0.6050421595573425, "step": 2645 }, { "epoch": 1.933150684931507, "grad_norm": 33.7638498781426, "learning_rate": 3.0753899485019483e-07, "logits/chosen": -2.8494880199432373, "logits/rejected": -2.302185535430908, "logps/chosen": -873.1876220703125, "logps/rejected": -659.8441162109375, "loss": 0.1573, "rewards/accuracies": 1.0, "rewards/chosen": 3.767338514328003, "rewards/margins": 3.30191969871521, "rewards/rejected": 0.4654187858104706, "step": 2646 }, { "epoch": 1.9338812785388129, "grad_norm": 28.760343333013473, "learning_rate": 3.073837611412312e-07, "logits/chosen": -3.7646641731262207, "logits/rejected": -2.297494888305664, "logps/chosen": -786.4373779296875, "logps/rejected": -475.856689453125, "loss": 0.1402, "rewards/accuracies": 1.0, "rewards/chosen": 2.5025784969329834, "rewards/margins": 2.7066125869750977, "rewards/rejected": -0.2040342390537262, "step": 2647 }, { "epoch": 1.9346118721461187, "grad_norm": 35.90375963676628, "learning_rate": 3.0722850407335327e-07, "logits/chosen": -2.4211623668670654, "logits/rejected": -2.064603090286255, "logps/chosen": -469.6168212890625, "logps/rejected": -494.03271484375, "loss": 0.157, "rewards/accuracies": 1.0, "rewards/chosen": 2.731045722961426, "rewards/margins": 3.588695526123047, "rewards/rejected": -0.8576498031616211, "step": 2648 }, { "epoch": 1.9353424657534246, "grad_norm": 27.332560210420283, "learning_rate": 3.070732237097607e-07, "logits/chosen": -3.128002405166626, "logits/rejected": -2.841439723968506, "logps/chosen": -687.0298461914062, "logps/rejected": -648.319091796875, "loss": 0.1205, "rewards/accuracies": 1.0, "rewards/chosen": 1.6982026100158691, "rewards/margins": 3.499307632446289, "rewards/rejected": -1.8011047840118408, "step": 2649 }, { "epoch": 1.9360730593607305, "grad_norm": 27.3461137371188, "learning_rate": 3.069179201136629e-07, "logits/chosen": -2.9259369373321533, "logits/rejected": -2.5285749435424805, "logps/chosen": -548.754638671875, "logps/rejected": -551.1661987304688, "loss": 0.1348, "rewards/accuracies": 1.0, "rewards/chosen": 2.0825347900390625, "rewards/margins": 3.1113126277923584, "rewards/rejected": -1.028777837753296, "step": 2650 }, { "epoch": 1.9368036529680366, "grad_norm": 41.51266249544755, "learning_rate": 3.067625933482784e-07, "logits/chosen": -2.639375686645508, "logits/rejected": -2.2411205768585205, "logps/chosen": -944.9378662109375, "logps/rejected": -784.0579223632812, "loss": 0.1879, "rewards/accuracies": 1.0, "rewards/chosen": 2.4004929065704346, "rewards/margins": 1.9937548637390137, "rewards/rejected": 0.4067380130290985, "step": 2651 }, { "epoch": 1.9375342465753425, "grad_norm": 44.451134072287736, "learning_rate": 3.0660724347683515e-07, "logits/chosen": -2.389490842819214, "logits/rejected": -2.3613104820251465, "logps/chosen": -483.9043273925781, "logps/rejected": -745.1253662109375, "loss": 0.2355, "rewards/accuracies": 1.0, "rewards/chosen": 2.6073601245880127, "rewards/margins": 6.2347092628479, "rewards/rejected": -3.627349376678467, "step": 2652 }, { "epoch": 1.9382648401826486, "grad_norm": 27.779259454841036, "learning_rate": 3.064518705625708e-07, "logits/chosen": -2.8664727210998535, "logits/rejected": -1.9301173686981201, "logps/chosen": -577.6063842773438, "logps/rejected": -396.37451171875, "loss": 0.1266, "rewards/accuracies": 1.0, "rewards/chosen": 3.51031494140625, "rewards/margins": 4.059528350830078, "rewards/rejected": -0.5492130517959595, "step": 2653 }, { "epoch": 1.9389954337899544, "grad_norm": 26.33223377620158, "learning_rate": 3.0629647466873215e-07, "logits/chosen": -2.950681209564209, "logits/rejected": -2.203338146209717, "logps/chosen": -638.773681640625, "logps/rejected": -473.471435546875, "loss": 0.1314, "rewards/accuracies": 0.875, "rewards/chosen": 4.304651737213135, "rewards/margins": 6.402467727661133, "rewards/rejected": -2.097815752029419, "step": 2654 }, { "epoch": 1.9397260273972603, "grad_norm": 43.34514124479524, "learning_rate": 3.061410558585754e-07, "logits/chosen": -3.157186508178711, "logits/rejected": -2.1113390922546387, "logps/chosen": -511.2388610839844, "logps/rejected": -229.60324096679688, "loss": 0.2095, "rewards/accuracies": 0.875, "rewards/chosen": 2.9516825675964355, "rewards/margins": 4.388864040374756, "rewards/rejected": -1.437180995941162, "step": 2655 }, { "epoch": 1.9404566210045662, "grad_norm": 23.025110220878286, "learning_rate": 3.0598561419536606e-07, "logits/chosen": -3.1267993450164795, "logits/rejected": -2.236790895462036, "logps/chosen": -852.0999755859375, "logps/rejected": -610.9959106445312, "loss": 0.135, "rewards/accuracies": 1.0, "rewards/chosen": 3.495668888092041, "rewards/margins": 2.472472906112671, "rewards/rejected": 1.0231962203979492, "step": 2656 }, { "epoch": 1.941187214611872, "grad_norm": 29.142052369532824, "learning_rate": 3.0583014974237905e-07, "logits/chosen": -2.7042016983032227, "logits/rejected": -2.0681986808776855, "logps/chosen": -936.235595703125, "logps/rejected": -695.7985229492188, "loss": 0.1704, "rewards/accuracies": 1.0, "rewards/chosen": 2.4990785121917725, "rewards/margins": 2.927649736404419, "rewards/rejected": -0.4285712242126465, "step": 2657 }, { "epoch": 1.941917808219178, "grad_norm": 47.16253219833913, "learning_rate": 3.0567466256289837e-07, "logits/chosen": -2.3649652004241943, "logits/rejected": -2.129011392593384, "logps/chosen": -423.0342102050781, "logps/rejected": -600.9241333007812, "loss": 0.3094, "rewards/accuracies": 0.75, "rewards/chosen": 1.5977224111557007, "rewards/margins": 2.9863617420196533, "rewards/rejected": -1.388639211654663, "step": 2658 }, { "epoch": 1.942648401826484, "grad_norm": 42.504865300560006, "learning_rate": 3.0551915272021755e-07, "logits/chosen": -2.3085107803344727, "logits/rejected": -2.193629264831543, "logps/chosen": -553.92041015625, "logps/rejected": -536.0416259765625, "loss": 0.2584, "rewards/accuracies": 0.875, "rewards/chosen": 2.560570240020752, "rewards/margins": 3.869239568710327, "rewards/rejected": -1.3086692094802856, "step": 2659 }, { "epoch": 1.9433789954337901, "grad_norm": 31.835129399253077, "learning_rate": 3.0536362027763906e-07, "logits/chosen": -2.955449104309082, "logits/rejected": -2.1732778549194336, "logps/chosen": -967.75537109375, "logps/rejected": -570.913818359375, "loss": 0.1254, "rewards/accuracies": 1.0, "rewards/chosen": 4.383061408996582, "rewards/margins": 4.36660099029541, "rewards/rejected": 0.016460031270980835, "step": 2660 }, { "epoch": 1.944109589041096, "grad_norm": 42.32775009062465, "learning_rate": 3.052080652984748e-07, "logits/chosen": -3.3245527744293213, "logits/rejected": -2.258572578430176, "logps/chosen": -987.0667724609375, "logps/rejected": -621.240234375, "loss": 0.2014, "rewards/accuracies": 1.0, "rewards/chosen": 3.971475839614868, "rewards/margins": 5.390533447265625, "rewards/rejected": -1.4190572500228882, "step": 2661 }, { "epoch": 1.9448401826484019, "grad_norm": 37.001090368752145, "learning_rate": 3.0505248784604564e-07, "logits/chosen": -2.772118091583252, "logits/rejected": -2.3952159881591797, "logps/chosen": -1044.145263671875, "logps/rejected": -916.8386840820312, "loss": 0.1492, "rewards/accuracies": 1.0, "rewards/chosen": 2.7762317657470703, "rewards/margins": 3.121136426925659, "rewards/rejected": -0.3449043333530426, "step": 2662 }, { "epoch": 1.9455707762557077, "grad_norm": 32.912963761924416, "learning_rate": 3.048968879836817e-07, "logits/chosen": -3.071885347366333, "logits/rejected": -2.544368267059326, "logps/chosen": -798.6612548828125, "logps/rejected": -638.802490234375, "loss": 0.1525, "rewards/accuracies": 1.0, "rewards/chosen": 3.120410442352295, "rewards/margins": 2.0250372886657715, "rewards/rejected": 1.0953731536865234, "step": 2663 }, { "epoch": 1.9463013698630136, "grad_norm": 28.349242766424776, "learning_rate": 3.047412657747223e-07, "logits/chosen": -2.3517303466796875, "logits/rejected": -2.3062939643859863, "logps/chosen": -564.9629516601562, "logps/rejected": -468.6990661621094, "loss": 0.1599, "rewards/accuracies": 1.0, "rewards/chosen": 1.2530057430267334, "rewards/margins": 2.7198433876037598, "rewards/rejected": -1.4668376445770264, "step": 2664 }, { "epoch": 1.9470319634703195, "grad_norm": 45.31129578141222, "learning_rate": 3.0458562128251577e-07, "logits/chosen": -3.0159623622894287, "logits/rejected": -2.002443313598633, "logps/chosen": -714.783447265625, "logps/rejected": -419.1029052734375, "loss": 0.1958, "rewards/accuracies": 1.0, "rewards/chosen": 3.741086959838867, "rewards/margins": 4.443551540374756, "rewards/rejected": -0.7024648189544678, "step": 2665 }, { "epoch": 1.9477625570776256, "grad_norm": 35.696010984356676, "learning_rate": 3.0442995457041943e-07, "logits/chosen": -2.6619129180908203, "logits/rejected": -2.2922258377075195, "logps/chosen": -649.969482421875, "logps/rejected": -641.41943359375, "loss": 0.1623, "rewards/accuracies": 1.0, "rewards/chosen": 3.2440831661224365, "rewards/margins": 4.049243927001953, "rewards/rejected": -0.8051607608795166, "step": 2666 }, { "epoch": 1.9484931506849315, "grad_norm": 24.22173368717648, "learning_rate": 3.042742657017998e-07, "logits/chosen": -2.7151126861572266, "logits/rejected": -2.4917752742767334, "logps/chosen": -412.73260498046875, "logps/rejected": -432.8744201660156, "loss": 0.1336, "rewards/accuracies": 1.0, "rewards/chosen": 2.232966423034668, "rewards/margins": 2.8942723274230957, "rewards/rejected": -0.661305844783783, "step": 2667 }, { "epoch": 1.9492237442922375, "grad_norm": 37.79956196526125, "learning_rate": 3.041185547400324e-07, "logits/chosen": -2.735201597213745, "logits/rejected": -2.280423164367676, "logps/chosen": -842.2175903320312, "logps/rejected": -583.845703125, "loss": 0.1561, "rewards/accuracies": 0.875, "rewards/chosen": 3.0644521713256836, "rewards/margins": 2.7905712127685547, "rewards/rejected": 0.27388086915016174, "step": 2668 }, { "epoch": 1.9499543378995434, "grad_norm": 18.89121464391997, "learning_rate": 3.0396282174850146e-07, "logits/chosen": -3.2776639461517334, "logits/rejected": -2.014284610748291, "logps/chosen": -515.1143798828125, "logps/rejected": -350.6387023925781, "loss": 0.0839, "rewards/accuracies": 1.0, "rewards/chosen": 2.201828956604004, "rewards/margins": 3.5529818534851074, "rewards/rejected": -1.3511526584625244, "step": 2669 }, { "epoch": 1.9506849315068493, "grad_norm": 30.61563452938251, "learning_rate": 3.0380706679060077e-07, "logits/chosen": -3.0737719535827637, "logits/rejected": -2.312260627746582, "logps/chosen": -471.42431640625, "logps/rejected": -405.8356018066406, "loss": 0.1334, "rewards/accuracies": 0.75, "rewards/chosen": 2.8472564220428467, "rewards/margins": 3.0005688667297363, "rewards/rejected": -0.15331238508224487, "step": 2670 }, { "epoch": 1.9514155251141552, "grad_norm": 23.734957458702265, "learning_rate": 3.0365128992973244e-07, "logits/chosen": -2.59769868850708, "logits/rejected": -2.631166934967041, "logps/chosen": -538.52392578125, "logps/rejected": -573.17529296875, "loss": 0.161, "rewards/accuracies": 1.0, "rewards/chosen": 2.4531190395355225, "rewards/margins": 4.07217264175415, "rewards/rejected": -1.619053602218628, "step": 2671 }, { "epoch": 1.952146118721461, "grad_norm": 28.944395228380895, "learning_rate": 3.034954912293079e-07, "logits/chosen": -2.780184745788574, "logits/rejected": -1.900254249572754, "logps/chosen": -586.651611328125, "logps/rejected": -442.92254638671875, "loss": 0.1313, "rewards/accuracies": 0.875, "rewards/chosen": 3.3086624145507812, "rewards/margins": 4.726552963256836, "rewards/rejected": -1.417890191078186, "step": 2672 }, { "epoch": 1.9528767123287671, "grad_norm": 50.34052499757278, "learning_rate": 3.033396707527472e-07, "logits/chosen": -3.273282766342163, "logits/rejected": -1.8812633752822876, "logps/chosen": -625.7781982421875, "logps/rejected": -390.6820983886719, "loss": 0.1823, "rewards/accuracies": 0.875, "rewards/chosen": 3.3925018310546875, "rewards/margins": 3.973318099975586, "rewards/rejected": -0.5808160901069641, "step": 2673 }, { "epoch": 1.953607305936073, "grad_norm": 37.66432097116248, "learning_rate": 3.0318382856347946e-07, "logits/chosen": -3.5001842975616455, "logits/rejected": -2.849881649017334, "logps/chosen": -680.8768920898438, "logps/rejected": -630.0870361328125, "loss": 0.2039, "rewards/accuracies": 1.0, "rewards/chosen": 4.220110893249512, "rewards/margins": 4.609020233154297, "rewards/rejected": -0.38890933990478516, "step": 2674 }, { "epoch": 1.954337899543379, "grad_norm": 33.83689623583995, "learning_rate": 3.030279647249425e-07, "logits/chosen": -2.4598517417907715, "logits/rejected": -2.116720199584961, "logps/chosen": -624.2799682617188, "logps/rejected": -524.7156372070312, "loss": 0.1423, "rewards/accuracies": 0.875, "rewards/chosen": 4.078061580657959, "rewards/margins": 5.007104873657227, "rewards/rejected": -0.929043173789978, "step": 2675 }, { "epoch": 1.955068493150685, "grad_norm": 40.64594261875578, "learning_rate": 3.028720793005832e-07, "logits/chosen": -2.716273069381714, "logits/rejected": -2.519120693206787, "logps/chosen": -461.4362487792969, "logps/rejected": -449.95843505859375, "loss": 0.2306, "rewards/accuracies": 0.875, "rewards/chosen": 2.4487063884735107, "rewards/margins": 3.8716063499450684, "rewards/rejected": -1.4229004383087158, "step": 2676 }, { "epoch": 1.9557990867579909, "grad_norm": 46.80205783069266, "learning_rate": 3.027161723538569e-07, "logits/chosen": -2.9901905059814453, "logits/rejected": -2.007086992263794, "logps/chosen": -836.3005981445312, "logps/rejected": -517.9022216796875, "loss": 0.2353, "rewards/accuracies": 1.0, "rewards/chosen": 2.5983424186706543, "rewards/margins": 1.4255479574203491, "rewards/rejected": 1.1727945804595947, "step": 2677 }, { "epoch": 1.9565296803652967, "grad_norm": 44.14840362876952, "learning_rate": 3.0256024394822783e-07, "logits/chosen": -2.6344995498657227, "logits/rejected": -2.703876256942749, "logps/chosen": -1021.141845703125, "logps/rejected": -1220.53564453125, "loss": 0.1468, "rewards/accuracies": 0.875, "rewards/chosen": 2.9670891761779785, "rewards/margins": 2.2333357334136963, "rewards/rejected": 0.7337532639503479, "step": 2678 }, { "epoch": 1.9572602739726026, "grad_norm": 41.13734023631997, "learning_rate": 3.02404294147169e-07, "logits/chosen": -2.435187339782715, "logits/rejected": -2.4996869564056396, "logps/chosen": -575.2994995117188, "logps/rejected": -742.9219970703125, "loss": 0.2211, "rewards/accuracies": 1.0, "rewards/chosen": 1.9876335859298706, "rewards/margins": 2.308570623397827, "rewards/rejected": -0.320936918258667, "step": 2679 }, { "epoch": 1.9579908675799087, "grad_norm": 20.436028383800892, "learning_rate": 3.022483230141621e-07, "logits/chosen": -2.666084051132202, "logits/rejected": -1.9135198593139648, "logps/chosen": -523.1389770507812, "logps/rejected": -388.12518310546875, "loss": 0.1198, "rewards/accuracies": 0.875, "rewards/chosen": 2.9663166999816895, "rewards/margins": 3.247540235519409, "rewards/rejected": -0.2812235355377197, "step": 2680 }, { "epoch": 1.9587214611872146, "grad_norm": 31.048369050613996, "learning_rate": 3.020923306126975e-07, "logits/chosen": -2.0587990283966064, "logits/rejected": -1.2548303604125977, "logps/chosen": -612.6849975585938, "logps/rejected": -469.5294494628906, "loss": 0.1226, "rewards/accuracies": 0.875, "rewards/chosen": 2.7754788398742676, "rewards/margins": 2.3039369583129883, "rewards/rejected": 0.47154176235198975, "step": 2681 }, { "epoch": 1.9594520547945207, "grad_norm": 42.917651434899135, "learning_rate": 3.019363170062742e-07, "logits/chosen": -3.0866544246673584, "logits/rejected": -2.831231117248535, "logps/chosen": -948.3818359375, "logps/rejected": -744.2638549804688, "loss": 0.206, "rewards/accuracies": 0.875, "rewards/chosen": 3.3213601112365723, "rewards/margins": 3.036778211593628, "rewards/rejected": 0.28458189964294434, "step": 2682 }, { "epoch": 1.9601826484018265, "grad_norm": 42.390668194780794, "learning_rate": 3.017802822583999e-07, "logits/chosen": -3.007389545440674, "logits/rejected": -1.9503233432769775, "logps/chosen": -662.9600830078125, "logps/rejected": -436.3128967285156, "loss": 0.1542, "rewards/accuracies": 1.0, "rewards/chosen": 4.354064464569092, "rewards/margins": 5.71423864364624, "rewards/rejected": -1.3601741790771484, "step": 2683 }, { "epoch": 1.9609132420091324, "grad_norm": 20.45700029535084, "learning_rate": 3.016242264325909e-07, "logits/chosen": -2.972670555114746, "logits/rejected": -2.3466837406158447, "logps/chosen": -785.5164184570312, "logps/rejected": -592.855712890625, "loss": 0.1085, "rewards/accuracies": 1.0, "rewards/chosen": 3.634916067123413, "rewards/margins": 3.855543851852417, "rewards/rejected": -0.22062790393829346, "step": 2684 }, { "epoch": 1.9616438356164383, "grad_norm": 30.92681613976726, "learning_rate": 3.01468149592372e-07, "logits/chosen": -2.037461996078491, "logits/rejected": -1.9772865772247314, "logps/chosen": -672.3179931640625, "logps/rejected": -562.9946899414062, "loss": 0.1931, "rewards/accuracies": 0.875, "rewards/chosen": 1.806341528892517, "rewards/margins": 1.9837186336517334, "rewards/rejected": -0.17737704515457153, "step": 2685 }, { "epoch": 1.9623744292237442, "grad_norm": 41.87415460900375, "learning_rate": 3.0131205180127656e-07, "logits/chosen": -3.468268394470215, "logits/rejected": -2.024907112121582, "logps/chosen": -1313.01416015625, "logps/rejected": -834.0355834960938, "loss": 0.1831, "rewards/accuracies": 0.875, "rewards/chosen": 5.1497344970703125, "rewards/margins": 3.4731760025024414, "rewards/rejected": 1.676558494567871, "step": 2686 }, { "epoch": 1.9631050228310503, "grad_norm": 33.50906841461912, "learning_rate": 3.011559331228465e-07, "logits/chosen": -2.794588327407837, "logits/rejected": -2.22562837600708, "logps/chosen": -646.6331787109375, "logps/rejected": -601.7343139648438, "loss": 0.1396, "rewards/accuracies": 0.875, "rewards/chosen": 2.3506648540496826, "rewards/margins": 3.6365137100219727, "rewards/rejected": -1.2858487367630005, "step": 2687 }, { "epoch": 1.9638356164383561, "grad_norm": 29.335036604009783, "learning_rate": 3.009997936206324e-07, "logits/chosen": -2.6095237731933594, "logits/rejected": -2.069904327392578, "logps/chosen": -367.30517578125, "logps/rejected": -324.9158935546875, "loss": 0.1874, "rewards/accuracies": 1.0, "rewards/chosen": 1.8144810199737549, "rewards/margins": 4.354301452636719, "rewards/rejected": -2.5398199558258057, "step": 2688 }, { "epoch": 1.9645662100456622, "grad_norm": 42.5147966544627, "learning_rate": 3.0084363335819306e-07, "logits/chosen": -2.5312747955322266, "logits/rejected": -1.9619786739349365, "logps/chosen": -920.228271484375, "logps/rejected": -787.64501953125, "loss": 0.186, "rewards/accuracies": 0.875, "rewards/chosen": 4.597120761871338, "rewards/margins": 4.554453372955322, "rewards/rejected": 0.04266747832298279, "step": 2689 }, { "epoch": 1.965296803652968, "grad_norm": 20.32464856058594, "learning_rate": 3.006874523990959e-07, "logits/chosen": -3.3672447204589844, "logits/rejected": -2.6849489212036133, "logps/chosen": -461.4769592285156, "logps/rejected": -389.16949462890625, "loss": 0.1282, "rewards/accuracies": 0.875, "rewards/chosen": 1.3128776550292969, "rewards/margins": 2.6427857875823975, "rewards/rejected": -1.3299081325531006, "step": 2690 }, { "epoch": 1.966027397260274, "grad_norm": 36.709666429634034, "learning_rate": 3.0053125080691655e-07, "logits/chosen": -2.5749428272247314, "logits/rejected": -2.675510883331299, "logps/chosen": -736.7855224609375, "logps/rejected": -889.9765014648438, "loss": 0.1673, "rewards/accuracies": 0.875, "rewards/chosen": 2.923536777496338, "rewards/margins": 2.3808014392852783, "rewards/rejected": 0.5427354574203491, "step": 2691 }, { "epoch": 1.9667579908675799, "grad_norm": 22.705524273170422, "learning_rate": 3.003750286452394e-07, "logits/chosen": -2.4990782737731934, "logits/rejected": -2.1062817573547363, "logps/chosen": -275.07073974609375, "logps/rejected": -271.957275390625, "loss": 0.1065, "rewards/accuracies": 0.875, "rewards/chosen": 1.5954922437667847, "rewards/margins": 4.224625110626221, "rewards/rejected": -2.6291329860687256, "step": 2692 }, { "epoch": 1.9674885844748857, "grad_norm": 43.49726385521996, "learning_rate": 3.002187859776568e-07, "logits/chosen": -3.0031044483184814, "logits/rejected": -1.903444528579712, "logps/chosen": -903.5989990234375, "logps/rejected": -512.113037109375, "loss": 0.2451, "rewards/accuracies": 0.625, "rewards/chosen": 3.139632225036621, "rewards/margins": 3.341297149658203, "rewards/rejected": -0.2016649842262268, "step": 2693 }, { "epoch": 1.9682191780821918, "grad_norm": 39.289700202370945, "learning_rate": 3.000625228677699e-07, "logits/chosen": -2.877303123474121, "logits/rejected": -2.0276479721069336, "logps/chosen": -708.671630859375, "logps/rejected": -554.5592041015625, "loss": 0.2029, "rewards/accuracies": 0.875, "rewards/chosen": 4.085434436798096, "rewards/margins": 5.944423198699951, "rewards/rejected": -1.8589890003204346, "step": 2694 }, { "epoch": 1.9689497716894977, "grad_norm": 37.351009362407126, "learning_rate": 2.999062393791877e-07, "logits/chosen": -2.461116313934326, "logits/rejected": -2.534651756286621, "logps/chosen": -579.6488037109375, "logps/rejected": -591.27490234375, "loss": 0.1665, "rewards/accuracies": 1.0, "rewards/chosen": 2.218914747238159, "rewards/margins": 2.198190212249756, "rewards/rejected": 0.02072465419769287, "step": 2695 }, { "epoch": 1.9696803652968038, "grad_norm": 24.334701011926235, "learning_rate": 2.9974993557552786e-07, "logits/chosen": -2.216493606567383, "logits/rejected": -1.9860515594482422, "logps/chosen": -693.5361938476562, "logps/rejected": -622.8790283203125, "loss": 0.1357, "rewards/accuracies": 1.0, "rewards/chosen": 2.6857876777648926, "rewards/margins": 3.5372791290283203, "rewards/rejected": -0.8514919281005859, "step": 2696 }, { "epoch": 1.9704109589041097, "grad_norm": 27.400034705369787, "learning_rate": 2.995936115204161e-07, "logits/chosen": -3.2840704917907715, "logits/rejected": -2.4694833755493164, "logps/chosen": -956.22265625, "logps/rejected": -938.9422607421875, "loss": 0.1213, "rewards/accuracies": 1.0, "rewards/chosen": 5.697064399719238, "rewards/margins": 6.144957065582275, "rewards/rejected": -0.44789260625839233, "step": 2697 }, { "epoch": 1.9711415525114155, "grad_norm": 11.947969169911524, "learning_rate": 2.994372672774865e-07, "logits/chosen": -2.4660611152648926, "logits/rejected": -2.1240761280059814, "logps/chosen": -541.103759765625, "logps/rejected": -581.4150390625, "loss": 0.0547, "rewards/accuracies": 1.0, "rewards/chosen": 3.4394969940185547, "rewards/margins": 6.60155725479126, "rewards/rejected": -3.162060260772705, "step": 2698 }, { "epoch": 1.9718721461187214, "grad_norm": 23.833746775351173, "learning_rate": 2.992809029103812e-07, "logits/chosen": -2.5205774307250977, "logits/rejected": -2.1413321495056152, "logps/chosen": -563.4327392578125, "logps/rejected": -567.9811401367188, "loss": 0.0963, "rewards/accuracies": 1.0, "rewards/chosen": 3.7355518341064453, "rewards/margins": 4.572498321533203, "rewards/rejected": -0.8369466662406921, "step": 2699 }, { "epoch": 1.9726027397260273, "grad_norm": 53.21291897738509, "learning_rate": 2.991245184827508e-07, "logits/chosen": -2.9044318199157715, "logits/rejected": -2.577606201171875, "logps/chosen": -757.2564697265625, "logps/rejected": -776.9844970703125, "loss": 0.1932, "rewards/accuracies": 1.0, "rewards/chosen": 2.232161045074463, "rewards/margins": 3.4720191955566406, "rewards/rejected": -1.2398585081100464, "step": 2700 }, { "epoch": 1.9733333333333334, "grad_norm": 23.708499668746835, "learning_rate": 2.989681140582538e-07, "logits/chosen": -3.1487174034118652, "logits/rejected": -1.899290919303894, "logps/chosen": -567.23876953125, "logps/rejected": -410.6462097167969, "loss": 0.1204, "rewards/accuracies": 1.0, "rewards/chosen": 3.490448474884033, "rewards/margins": 4.339014053344727, "rewards/rejected": -0.8485655784606934, "step": 2701 }, { "epoch": 1.9740639269406393, "grad_norm": 21.813603851456577, "learning_rate": 2.98811689700557e-07, "logits/chosen": -2.764625310897827, "logits/rejected": -2.2100613117218018, "logps/chosen": -642.972412109375, "logps/rejected": -615.2391967773438, "loss": 0.111, "rewards/accuracies": 1.0, "rewards/chosen": 2.799288749694824, "rewards/margins": 3.3683207035064697, "rewards/rejected": -0.5690320730209351, "step": 2702 }, { "epoch": 1.9747945205479454, "grad_norm": 29.470677172204258, "learning_rate": 2.986552454733353e-07, "logits/chosen": -3.063255786895752, "logits/rejected": -2.502084493637085, "logps/chosen": -763.693603515625, "logps/rejected": -729.72412109375, "loss": 0.1639, "rewards/accuracies": 0.875, "rewards/chosen": 2.4600419998168945, "rewards/margins": 2.169710874557495, "rewards/rejected": 0.290331095457077, "step": 2703 }, { "epoch": 1.9755251141552512, "grad_norm": 38.67104323600461, "learning_rate": 2.9849878144027153e-07, "logits/chosen": -2.277411937713623, "logits/rejected": -2.8175740242004395, "logps/chosen": -763.0797729492188, "logps/rejected": -1033.872802734375, "loss": 0.2144, "rewards/accuracies": 1.0, "rewards/chosen": 3.0327908992767334, "rewards/margins": 3.9530131816864014, "rewards/rejected": -0.9202220439910889, "step": 2704 }, { "epoch": 1.976255707762557, "grad_norm": 22.593504845013324, "learning_rate": 2.983422976650568e-07, "logits/chosen": -2.5930910110473633, "logits/rejected": -2.010307550430298, "logps/chosen": -782.8414306640625, "logps/rejected": -711.0288696289062, "loss": 0.1044, "rewards/accuracies": 1.0, "rewards/chosen": 3.8768482208251953, "rewards/margins": 4.35375452041626, "rewards/rejected": -0.4769059121608734, "step": 2705 }, { "epoch": 1.976986301369863, "grad_norm": 24.957363544997847, "learning_rate": 2.9818579421139014e-07, "logits/chosen": -2.7707982063293457, "logits/rejected": -2.1759912967681885, "logps/chosen": -489.77392578125, "logps/rejected": -467.1372985839844, "loss": 0.123, "rewards/accuracies": 1.0, "rewards/chosen": 3.6539831161499023, "rewards/margins": 4.184752464294434, "rewards/rejected": -0.530769407749176, "step": 2706 }, { "epoch": 1.9777168949771688, "grad_norm": 38.05349980824694, "learning_rate": 2.9802927114297866e-07, "logits/chosen": -3.0475411415100098, "logits/rejected": -2.648805618286133, "logps/chosen": -797.8602294921875, "logps/rejected": -795.776123046875, "loss": 0.1118, "rewards/accuracies": 1.0, "rewards/chosen": 2.8750572204589844, "rewards/margins": 3.9923858642578125, "rewards/rejected": -1.117328405380249, "step": 2707 }, { "epoch": 1.9784474885844747, "grad_norm": 30.351551439830676, "learning_rate": 2.978727285235373e-07, "logits/chosen": -3.43892765045166, "logits/rejected": -2.332653045654297, "logps/chosen": -707.2217407226562, "logps/rejected": -552.7167358398438, "loss": 0.1558, "rewards/accuracies": 1.0, "rewards/chosen": 4.668032646179199, "rewards/margins": 5.852144718170166, "rewards/rejected": -1.1841115951538086, "step": 2708 }, { "epoch": 1.9791780821917808, "grad_norm": 30.48540910758836, "learning_rate": 2.977161664167891e-07, "logits/chosen": -3.3412370681762695, "logits/rejected": -2.217280387878418, "logps/chosen": -840.0264892578125, "logps/rejected": -480.338134765625, "loss": 0.1235, "rewards/accuracies": 1.0, "rewards/chosen": 4.132271766662598, "rewards/margins": 5.613670349121094, "rewards/rejected": -1.4813989400863647, "step": 2709 }, { "epoch": 1.979908675799087, "grad_norm": 36.27949029431953, "learning_rate": 2.97559584886465e-07, "logits/chosen": -2.3663604259490967, "logits/rejected": -1.9810764789581299, "logps/chosen": -560.883056640625, "logps/rejected": -583.2760009765625, "loss": 0.1559, "rewards/accuracies": 0.875, "rewards/chosen": 2.0042638778686523, "rewards/margins": 4.646840572357178, "rewards/rejected": -2.6425766944885254, "step": 2710 }, { "epoch": 1.9806392694063928, "grad_norm": 32.48150765964767, "learning_rate": 2.974029839963039e-07, "logits/chosen": -3.5159947872161865, "logits/rejected": -3.239058256149292, "logps/chosen": -695.6845703125, "logps/rejected": -782.2423706054688, "loss": 0.2621, "rewards/accuracies": 1.0, "rewards/chosen": 2.924105405807495, "rewards/margins": 3.4903335571289062, "rewards/rejected": -0.5662280917167664, "step": 2711 }, { "epoch": 1.9813698630136987, "grad_norm": 54.3347327553357, "learning_rate": 2.972463638100524e-07, "logits/chosen": -3.2987701892852783, "logits/rejected": -2.2152211666107178, "logps/chosen": -757.8428344726562, "logps/rejected": -403.590576171875, "loss": 0.3014, "rewards/accuracies": 1.0, "rewards/chosen": 2.5056192874908447, "rewards/margins": 2.7935030460357666, "rewards/rejected": -0.28788384795188904, "step": 2712 }, { "epoch": 1.9821004566210045, "grad_norm": 15.406525336662344, "learning_rate": 2.9708972439146525e-07, "logits/chosen": -2.9940924644470215, "logits/rejected": -2.1484155654907227, "logps/chosen": -615.2098388671875, "logps/rejected": -474.39727783203125, "loss": 0.0698, "rewards/accuracies": 1.0, "rewards/chosen": 3.127633571624756, "rewards/margins": 4.069831848144531, "rewards/rejected": -0.9421980977058411, "step": 2713 }, { "epoch": 1.9828310502283104, "grad_norm": 53.1866439230459, "learning_rate": 2.969330658043048e-07, "logits/chosen": -2.816432237625122, "logits/rejected": -2.110788345336914, "logps/chosen": -773.4202880859375, "logps/rejected": -522.3358154296875, "loss": 0.1609, "rewards/accuracies": 1.0, "rewards/chosen": 4.224416255950928, "rewards/margins": 5.006732940673828, "rewards/rejected": -0.7823168635368347, "step": 2714 }, { "epoch": 1.9835616438356163, "grad_norm": 18.504916800698705, "learning_rate": 2.9677638811234115e-07, "logits/chosen": -3.0400478839874268, "logits/rejected": -1.8643836975097656, "logps/chosen": -593.6766357421875, "logps/rejected": -393.6023864746094, "loss": 0.0937, "rewards/accuracies": 1.0, "rewards/chosen": 3.45902943611145, "rewards/margins": 4.704776763916016, "rewards/rejected": -1.2457473278045654, "step": 2715 }, { "epoch": 1.9842922374429224, "grad_norm": 17.09241364606084, "learning_rate": 2.9661969137935234e-07, "logits/chosen": -3.1037378311157227, "logits/rejected": -1.9188923835754395, "logps/chosen": -794.3697509765625, "logps/rejected": -497.60693359375, "loss": 0.0644, "rewards/accuracies": 1.0, "rewards/chosen": 4.033730983734131, "rewards/margins": 4.784416675567627, "rewards/rejected": -0.7506857514381409, "step": 2716 }, { "epoch": 1.9850228310502285, "grad_norm": 36.40151357333494, "learning_rate": 2.964629756691241e-07, "logits/chosen": -3.1748921871185303, "logits/rejected": -2.2310805320739746, "logps/chosen": -683.721435546875, "logps/rejected": -484.32940673828125, "loss": 0.1743, "rewards/accuracies": 0.875, "rewards/chosen": 2.7202508449554443, "rewards/margins": 3.051774501800537, "rewards/rejected": -0.33152371644973755, "step": 2717 }, { "epoch": 1.9857534246575343, "grad_norm": 26.15121385955392, "learning_rate": 2.9630624104545005e-07, "logits/chosen": -2.662224292755127, "logits/rejected": -2.2656126022338867, "logps/chosen": -475.1727294921875, "logps/rejected": -477.657470703125, "loss": 0.1242, "rewards/accuracies": 0.875, "rewards/chosen": 2.0241456031799316, "rewards/margins": 2.166802406311035, "rewards/rejected": -0.1426568627357483, "step": 2718 }, { "epoch": 1.9864840182648402, "grad_norm": 26.318096662070047, "learning_rate": 2.961494875721311e-07, "logits/chosen": -2.992037773132324, "logits/rejected": -2.2279367446899414, "logps/chosen": -399.0978088378906, "logps/rejected": -352.582763671875, "loss": 0.1138, "rewards/accuracies": 0.875, "rewards/chosen": 3.2229549884796143, "rewards/margins": 4.663558006286621, "rewards/rejected": -1.4406030178070068, "step": 2719 }, { "epoch": 1.987214611872146, "grad_norm": 37.399548076000364, "learning_rate": 2.9599271531297634e-07, "logits/chosen": -2.688192844390869, "logits/rejected": -2.726468563079834, "logps/chosen": -679.2054443359375, "logps/rejected": -958.0170288085938, "loss": 0.1843, "rewards/accuracies": 0.875, "rewards/chosen": 3.2687525749206543, "rewards/margins": 2.353729248046875, "rewards/rejected": 0.915023148059845, "step": 2720 }, { "epoch": 1.987945205479452, "grad_norm": 45.6920900139592, "learning_rate": 2.9583592433180204e-07, "logits/chosen": -2.8519225120544434, "logits/rejected": -2.392547130584717, "logps/chosen": -686.9575805664062, "logps/rejected": -525.2605590820312, "loss": 0.2108, "rewards/accuracies": 0.75, "rewards/chosen": 2.195058584213257, "rewards/margins": 2.118002414703369, "rewards/rejected": 0.07705630362033844, "step": 2721 }, { "epoch": 1.9886757990867578, "grad_norm": 29.263752109051225, "learning_rate": 2.9567911469243236e-07, "logits/chosen": -2.912391424179077, "logits/rejected": -2.1937246322631836, "logps/chosen": -698.8762817382812, "logps/rejected": -582.5777587890625, "loss": 0.1473, "rewards/accuracies": 1.0, "rewards/chosen": 2.932703971862793, "rewards/margins": 4.674483299255371, "rewards/rejected": -1.7417793273925781, "step": 2722 }, { "epoch": 1.989406392694064, "grad_norm": 30.61992544054239, "learning_rate": 2.95522286458699e-07, "logits/chosen": -2.854017496109009, "logits/rejected": -2.5589797496795654, "logps/chosen": -815.7579345703125, "logps/rejected": -611.5177612304688, "loss": 0.1382, "rewards/accuracies": 1.0, "rewards/chosen": 2.739621162414551, "rewards/margins": 3.8388173580169678, "rewards/rejected": -1.099196434020996, "step": 2723 }, { "epoch": 1.9901369863013698, "grad_norm": 47.71209586464291, "learning_rate": 2.953654396944414e-07, "logits/chosen": -2.9286646842956543, "logits/rejected": -2.2090840339660645, "logps/chosen": -885.9033203125, "logps/rejected": -550.022216796875, "loss": 0.2091, "rewards/accuracies": 1.0, "rewards/chosen": 2.5535054206848145, "rewards/margins": 2.504383087158203, "rewards/rejected": 0.04912222921848297, "step": 2724 }, { "epoch": 1.990867579908676, "grad_norm": 40.13979021854826, "learning_rate": 2.9520857446350603e-07, "logits/chosen": -2.4773614406585693, "logits/rejected": -2.339726448059082, "logps/chosen": -623.5717163085938, "logps/rejected": -616.8544311523438, "loss": 0.1994, "rewards/accuracies": 1.0, "rewards/chosen": 2.771451950073242, "rewards/margins": 3.3398191928863525, "rewards/rejected": -0.5683672428131104, "step": 2725 }, { "epoch": 1.9915981735159818, "grad_norm": 24.395873224197032, "learning_rate": 2.950516908297475e-07, "logits/chosen": -2.6672635078430176, "logits/rejected": -2.286137819290161, "logps/chosen": -587.8612670898438, "logps/rejected": -623.7179565429688, "loss": 0.151, "rewards/accuracies": 1.0, "rewards/chosen": 3.196221351623535, "rewards/margins": 4.591175556182861, "rewards/rejected": -1.3949545621871948, "step": 2726 }, { "epoch": 1.9923287671232877, "grad_norm": 42.5854552176415, "learning_rate": 2.948947888570276e-07, "logits/chosen": -2.834479808807373, "logits/rejected": -2.552366256713867, "logps/chosen": -595.3133544921875, "logps/rejected": -596.3375244140625, "loss": 0.225, "rewards/accuracies": 0.875, "rewards/chosen": 3.59652042388916, "rewards/margins": 4.73179292678833, "rewards/rejected": -1.1352722644805908, "step": 2727 }, { "epoch": 1.9930593607305935, "grad_norm": 26.726297590765473, "learning_rate": 2.947378686092154e-07, "logits/chosen": -2.7826523780822754, "logits/rejected": -2.093848943710327, "logps/chosen": -693.2935791015625, "logps/rejected": -552.1055908203125, "loss": 0.1293, "rewards/accuracies": 0.875, "rewards/chosen": 2.547196626663208, "rewards/margins": 3.24426007270813, "rewards/rejected": -0.697063684463501, "step": 2728 }, { "epoch": 1.9937899543378994, "grad_norm": 29.241004016927477, "learning_rate": 2.945809301501879e-07, "logits/chosen": -2.860705852508545, "logits/rejected": -2.3100366592407227, "logps/chosen": -681.3005981445312, "logps/rejected": -639.852783203125, "loss": 0.1301, "rewards/accuracies": 1.0, "rewards/chosen": 3.2366251945495605, "rewards/margins": 4.873924255371094, "rewards/rejected": -1.6372991800308228, "step": 2729 }, { "epoch": 1.9945205479452055, "grad_norm": 18.753317577591854, "learning_rate": 2.94423973543829e-07, "logits/chosen": -2.955453395843506, "logits/rejected": -2.613588809967041, "logps/chosen": -695.6522216796875, "logps/rejected": -669.255615234375, "loss": 0.0576, "rewards/accuracies": 1.0, "rewards/chosen": 3.0520482063293457, "rewards/margins": 3.982454776763916, "rewards/rejected": -0.9304064512252808, "step": 2730 }, { "epoch": 1.9952511415525114, "grad_norm": 22.787297454787584, "learning_rate": 2.942669988540305e-07, "logits/chosen": -2.680882215499878, "logits/rejected": -1.9773231744766235, "logps/chosen": -769.709716796875, "logps/rejected": -569.6981811523438, "loss": 0.0833, "rewards/accuracies": 1.0, "rewards/chosen": 2.819856643676758, "rewards/margins": 3.7259597778320312, "rewards/rejected": -0.9061031341552734, "step": 2731 }, { "epoch": 1.9959817351598175, "grad_norm": 35.38689446397758, "learning_rate": 2.9411000614469096e-07, "logits/chosen": -2.0820472240448, "logits/rejected": -2.201920986175537, "logps/chosen": -475.8755798339844, "logps/rejected": -437.96270751953125, "loss": 0.1422, "rewards/accuracies": 1.0, "rewards/chosen": 1.553694486618042, "rewards/margins": 2.730233669281006, "rewards/rejected": -1.1765391826629639, "step": 2732 }, { "epoch": 1.9967123287671233, "grad_norm": 24.526481174611938, "learning_rate": 2.939529954797168e-07, "logits/chosen": -2.8616487979888916, "logits/rejected": -2.5384674072265625, "logps/chosen": -756.7982177734375, "logps/rejected": -764.3287353515625, "loss": 0.0878, "rewards/accuracies": 1.0, "rewards/chosen": 2.5596089363098145, "rewards/margins": 2.983816146850586, "rewards/rejected": -0.4242071509361267, "step": 2733 }, { "epoch": 1.9974429223744292, "grad_norm": 40.2683745908312, "learning_rate": 2.937959669230215e-07, "logits/chosen": -2.7656033039093018, "logits/rejected": -2.716156482696533, "logps/chosen": -719.9976196289062, "logps/rejected": -667.729736328125, "loss": 0.2025, "rewards/accuracies": 0.625, "rewards/chosen": 2.6755895614624023, "rewards/margins": 2.85040020942688, "rewards/rejected": -0.17481067776679993, "step": 2734 }, { "epoch": 1.998173515981735, "grad_norm": 35.951048736767575, "learning_rate": 2.936389205385259e-07, "logits/chosen": -3.193145751953125, "logits/rejected": -2.292015314102173, "logps/chosen": -776.4246215820312, "logps/rejected": -607.740966796875, "loss": 0.2332, "rewards/accuracies": 1.0, "rewards/chosen": 1.8889636993408203, "rewards/margins": 3.323956251144409, "rewards/rejected": -1.4349925518035889, "step": 2735 }, { "epoch": 1.998904109589041, "grad_norm": 42.364018730613985, "learning_rate": 2.9348185639015805e-07, "logits/chosen": -2.8653271198272705, "logits/rejected": -2.3151979446411133, "logps/chosen": -624.0606689453125, "logps/rejected": -513.796142578125, "loss": 0.1711, "rewards/accuracies": 0.875, "rewards/chosen": 2.0241031646728516, "rewards/margins": 2.9805169105529785, "rewards/rejected": -0.9564136266708374, "step": 2736 }, { "epoch": 1.999634703196347, "grad_norm": 38.94090763078559, "learning_rate": 2.933247745418532e-07, "logits/chosen": -2.4364871978759766, "logits/rejected": -2.002227544784546, "logps/chosen": -516.9058227539062, "logps/rejected": -404.74932861328125, "loss": 0.1542, "rewards/accuracies": 1.0, "rewards/chosen": 2.4472174644470215, "rewards/margins": 4.622048377990723, "rewards/rejected": -2.174830913543701, "step": 2737 }, { "epoch": 1.999634703196347, "eval_logits/chosen": -2.878718614578247, "eval_logits/rejected": -2.3920962810516357, "eval_logps/chosen": -706.7914428710938, "eval_logps/rejected": -587.60498046875, "eval_loss": 0.34945425391197205, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": 2.9121761322021484, "eval_rewards/margins": 3.651867389678955, "eval_rewards/rejected": -0.7396917343139648, "eval_runtime": 14.44, "eval_samples_per_second": 7.618, "eval_steps_per_second": 0.97, "step": 2737 }, { "epoch": 2.000365296803653, "grad_norm": 21.03635783186049, "learning_rate": 2.93167675057554e-07, "logits/chosen": -2.8955307006835938, "logits/rejected": -2.489542007446289, "logps/chosen": -559.50341796875, "logps/rejected": -450.29547119140625, "loss": 0.1137, "rewards/accuracies": 1.0, "rewards/chosen": 2.761516571044922, "rewards/margins": 3.1212501525878906, "rewards/rejected": -0.3597336411476135, "step": 2738 }, { "epoch": 2.001095890410959, "grad_norm": 11.527008848066286, "learning_rate": 2.930105580012099e-07, "logits/chosen": -2.7470126152038574, "logits/rejected": -2.0794572830200195, "logps/chosen": -559.9192504882812, "logps/rejected": -401.66204833984375, "loss": 0.0895, "rewards/accuracies": 0.875, "rewards/chosen": 2.334916591644287, "rewards/margins": 3.175537586212158, "rewards/rejected": -0.840620756149292, "step": 2739 }, { "epoch": 2.001826484018265, "grad_norm": 19.102132851914405, "learning_rate": 2.9285342343677795e-07, "logits/chosen": -2.557499885559082, "logits/rejected": -1.8177032470703125, "logps/chosen": -705.1431884765625, "logps/rejected": -441.24029541015625, "loss": 0.1077, "rewards/accuracies": 0.875, "rewards/chosen": 2.1463966369628906, "rewards/margins": 1.9044275283813477, "rewards/rejected": 0.24196912348270416, "step": 2740 }, { "epoch": 2.002557077625571, "grad_norm": 12.292692566209558, "learning_rate": 2.9269627142822195e-07, "logits/chosen": -2.3448288440704346, "logits/rejected": -2.6657190322875977, "logps/chosen": -274.681640625, "logps/rejected": -336.6288146972656, "loss": 0.0729, "rewards/accuracies": 1.0, "rewards/chosen": 0.6841524839401245, "rewards/margins": 3.1684467792510986, "rewards/rejected": -2.4842944145202637, "step": 2741 }, { "epoch": 2.0032876712328767, "grad_norm": 15.682906371144787, "learning_rate": 2.9253910203951316e-07, "logits/chosen": -2.6794793605804443, "logits/rejected": -1.8940579891204834, "logps/chosen": -536.353759765625, "logps/rejected": -420.37701416015625, "loss": 0.0989, "rewards/accuracies": 1.0, "rewards/chosen": 3.120185613632202, "rewards/margins": 4.6117262840271, "rewards/rejected": -1.4915411472320557, "step": 2742 }, { "epoch": 2.0040182648401825, "grad_norm": 13.122646157047972, "learning_rate": 2.923819153346295e-07, "logits/chosen": -3.320117712020874, "logits/rejected": -2.399960994720459, "logps/chosen": -821.4220581054688, "logps/rejected": -603.34423828125, "loss": 0.074, "rewards/accuracies": 1.0, "rewards/chosen": 2.533761739730835, "rewards/margins": 3.3449246883392334, "rewards/rejected": -0.8111631870269775, "step": 2743 }, { "epoch": 2.0047488584474884, "grad_norm": 12.243856329697177, "learning_rate": 2.9222471137755646e-07, "logits/chosen": -2.749847888946533, "logits/rejected": -1.9507964849472046, "logps/chosen": -572.930908203125, "logps/rejected": -481.30859375, "loss": 0.0585, "rewards/accuracies": 1.0, "rewards/chosen": 2.981344699859619, "rewards/margins": 4.459229469299316, "rewards/rejected": -1.4778848886489868, "step": 2744 }, { "epoch": 2.0054794520547947, "grad_norm": 11.821743371136417, "learning_rate": 2.920674902322858e-07, "logits/chosen": -2.8743653297424316, "logits/rejected": -1.8379285335540771, "logps/chosen": -794.663330078125, "logps/rejected": -405.5560302734375, "loss": 0.0641, "rewards/accuracies": 1.0, "rewards/chosen": 3.491152286529541, "rewards/margins": 4.177947521209717, "rewards/rejected": -0.6867950558662415, "step": 2745 }, { "epoch": 2.0062100456621006, "grad_norm": 10.341411222448189, "learning_rate": 2.9191025196281715e-07, "logits/chosen": -3.281226634979248, "logits/rejected": -2.1023740768432617, "logps/chosen": -853.669189453125, "logps/rejected": -522.776611328125, "loss": 0.0679, "rewards/accuracies": 1.0, "rewards/chosen": 3.1985878944396973, "rewards/margins": 3.83129620552063, "rewards/rejected": -0.6327087879180908, "step": 2746 }, { "epoch": 2.0069406392694065, "grad_norm": 15.367538005979824, "learning_rate": 2.9175299663315646e-07, "logits/chosen": -2.277099847793579, "logits/rejected": -2.3909354209899902, "logps/chosen": -665.0746459960938, "logps/rejected": -696.0157470703125, "loss": 0.06, "rewards/accuracies": 1.0, "rewards/chosen": 3.106912612915039, "rewards/margins": 4.512862205505371, "rewards/rejected": -1.4059498310089111, "step": 2747 }, { "epoch": 2.0076712328767123, "grad_norm": 11.788617246396257, "learning_rate": 2.91595724307317e-07, "logits/chosen": -2.4087369441986084, "logits/rejected": -2.2713749408721924, "logps/chosen": -456.2413024902344, "logps/rejected": -766.126953125, "loss": 0.078, "rewards/accuracies": 1.0, "rewards/chosen": 2.4624600410461426, "rewards/margins": 6.5111870765686035, "rewards/rejected": -4.048727035522461, "step": 2748 }, { "epoch": 2.008401826484018, "grad_norm": 14.037915793805873, "learning_rate": 2.9143843504931866e-07, "logits/chosen": -2.394484043121338, "logits/rejected": -2.4144701957702637, "logps/chosen": -553.095458984375, "logps/rejected": -529.7869873046875, "loss": 0.0703, "rewards/accuracies": 1.0, "rewards/chosen": 2.792679786682129, "rewards/margins": 5.784968376159668, "rewards/rejected": -2.99228835105896, "step": 2749 }, { "epoch": 2.009132420091324, "grad_norm": 12.913716887956957, "learning_rate": 2.912811289231884e-07, "logits/chosen": -2.9633984565734863, "logits/rejected": -2.2684996128082275, "logps/chosen": -274.5935363769531, "logps/rejected": -276.81640625, "loss": 0.06, "rewards/accuracies": 1.0, "rewards/chosen": 3.6680173873901367, "rewards/margins": 7.002631187438965, "rewards/rejected": -3.33461332321167, "step": 2750 }, { "epoch": 2.00986301369863, "grad_norm": 6.886220203168313, "learning_rate": 2.911238059929601e-07, "logits/chosen": -2.5513830184936523, "logits/rejected": -2.32148814201355, "logps/chosen": -912.2921752929688, "logps/rejected": -885.13134765625, "loss": 0.0357, "rewards/accuracies": 1.0, "rewards/chosen": 4.166134834289551, "rewards/margins": 6.467521667480469, "rewards/rejected": -2.301386833190918, "step": 2751 }, { "epoch": 2.0105936073059363, "grad_norm": 9.044230389177375, "learning_rate": 2.9096646632267435e-07, "logits/chosen": -2.7879528999328613, "logits/rejected": -2.29182767868042, "logps/chosen": -892.6749267578125, "logps/rejected": -731.8252563476562, "loss": 0.0382, "rewards/accuracies": 1.0, "rewards/chosen": 5.12061882019043, "rewards/margins": 5.159180641174316, "rewards/rejected": -0.03856205940246582, "step": 2752 }, { "epoch": 2.011324200913242, "grad_norm": 11.208462814849991, "learning_rate": 2.9080910997637863e-07, "logits/chosen": -2.4613966941833496, "logits/rejected": -2.234700918197632, "logps/chosen": -553.7579956054688, "logps/rejected": -574.3289794921875, "loss": 0.0712, "rewards/accuracies": 0.875, "rewards/chosen": 2.3404598236083984, "rewards/margins": 4.594668865203857, "rewards/rejected": -2.254209041595459, "step": 2753 }, { "epoch": 2.012054794520548, "grad_norm": 12.43969400501783, "learning_rate": 2.9065173701812717e-07, "logits/chosen": -2.053091049194336, "logits/rejected": -2.31503963470459, "logps/chosen": -583.587158203125, "logps/rejected": -859.9452514648438, "loss": 0.0492, "rewards/accuracies": 1.0, "rewards/chosen": 3.1618964672088623, "rewards/margins": 2.9893345832824707, "rewards/rejected": 0.17256194353103638, "step": 2754 }, { "epoch": 2.012785388127854, "grad_norm": 13.998110507433505, "learning_rate": 2.9049434751198107e-07, "logits/chosen": -2.6386160850524902, "logits/rejected": -2.167686700820923, "logps/chosen": -967.0457763671875, "logps/rejected": -553.3027954101562, "loss": 0.1392, "rewards/accuracies": 1.0, "rewards/chosen": 3.043975830078125, "rewards/margins": 4.557620525360107, "rewards/rejected": -1.5136445760726929, "step": 2755 }, { "epoch": 2.0135159817351598, "grad_norm": 13.12854595422015, "learning_rate": 2.9033694152200784e-07, "logits/chosen": -2.3064942359924316, "logits/rejected": -1.8969240188598633, "logps/chosen": -729.7620849609375, "logps/rejected": -625.9979858398438, "loss": 0.0828, "rewards/accuracies": 1.0, "rewards/chosen": 2.6774208545684814, "rewards/margins": 3.3327887058258057, "rewards/rejected": -0.6553680300712585, "step": 2756 }, { "epoch": 2.0142465753424657, "grad_norm": 9.014685200015089, "learning_rate": 2.9017951911228213e-07, "logits/chosen": -2.6125683784484863, "logits/rejected": -2.2138748168945312, "logps/chosen": -504.3870544433594, "logps/rejected": -463.775146484375, "loss": 0.0492, "rewards/accuracies": 1.0, "rewards/chosen": 3.7284398078918457, "rewards/margins": 5.878605365753174, "rewards/rejected": -2.150165557861328, "step": 2757 }, { "epoch": 2.0149771689497715, "grad_norm": 13.183978003659464, "learning_rate": 2.9002208034688495e-07, "logits/chosen": -3.1237192153930664, "logits/rejected": -1.8847006559371948, "logps/chosen": -394.57513427734375, "logps/rejected": -192.89674377441406, "loss": 0.0843, "rewards/accuracies": 1.0, "rewards/chosen": 2.1148252487182617, "rewards/margins": 3.7833123207092285, "rewards/rejected": -1.6684870719909668, "step": 2758 }, { "epoch": 2.015707762557078, "grad_norm": 8.036954879666363, "learning_rate": 2.898646252899043e-07, "logits/chosen": -2.5316739082336426, "logits/rejected": -2.1526927947998047, "logps/chosen": -567.2120361328125, "logps/rejected": -568.4862060546875, "loss": 0.0536, "rewards/accuracies": 1.0, "rewards/chosen": 3.249966621398926, "rewards/margins": 6.687125205993652, "rewards/rejected": -3.4371583461761475, "step": 2759 }, { "epoch": 2.0164383561643837, "grad_norm": 11.007701258642138, "learning_rate": 2.897071540054343e-07, "logits/chosen": -2.808553457260132, "logits/rejected": -1.819779396057129, "logps/chosen": -530.7363891601562, "logps/rejected": -353.3680419921875, "loss": 0.0721, "rewards/accuracies": 0.875, "rewards/chosen": 2.9894156455993652, "rewards/margins": 5.99226188659668, "rewards/rejected": -3.0028457641601562, "step": 2760 }, { "epoch": 2.0171689497716896, "grad_norm": 12.728700686519115, "learning_rate": 2.895496665575763e-07, "logits/chosen": -3.1343188285827637, "logits/rejected": -2.616680145263672, "logps/chosen": -892.3555297851562, "logps/rejected": -695.32177734375, "loss": 0.06, "rewards/accuracies": 1.0, "rewards/chosen": 4.153264045715332, "rewards/margins": 5.7681803703308105, "rewards/rejected": -1.614916205406189, "step": 2761 }, { "epoch": 2.0178995433789955, "grad_norm": 12.649970109625322, "learning_rate": 2.8939216301043765e-07, "logits/chosen": -2.7016818523406982, "logits/rejected": -2.4915573596954346, "logps/chosen": -833.524658203125, "logps/rejected": -794.7786254882812, "loss": 0.0617, "rewards/accuracies": 1.0, "rewards/chosen": 2.1529362201690674, "rewards/margins": 4.164336204528809, "rewards/rejected": -2.0114002227783203, "step": 2762 }, { "epoch": 2.0186301369863013, "grad_norm": 12.310349639711827, "learning_rate": 2.892346434281326e-07, "logits/chosen": -2.344681978225708, "logits/rejected": -2.3671367168426514, "logps/chosen": -502.96978759765625, "logps/rejected": -649.072998046875, "loss": 0.0653, "rewards/accuracies": 1.0, "rewards/chosen": 1.5152486562728882, "rewards/margins": 4.152772903442383, "rewards/rejected": -2.637524366378784, "step": 2763 }, { "epoch": 2.019360730593607, "grad_norm": 20.426309023947447, "learning_rate": 2.8907710787478194e-07, "logits/chosen": -3.062204360961914, "logits/rejected": -2.2174127101898193, "logps/chosen": -698.9284057617188, "logps/rejected": -646.5906982421875, "loss": 0.082, "rewards/accuracies": 1.0, "rewards/chosen": 3.048882484436035, "rewards/margins": 5.023674011230469, "rewards/rejected": -1.9747915267944336, "step": 2764 }, { "epoch": 2.020091324200913, "grad_norm": 15.429825948745947, "learning_rate": 2.889195564145127e-07, "logits/chosen": -3.446392059326172, "logits/rejected": -2.8813064098358154, "logps/chosen": -626.2589111328125, "logps/rejected": -526.06201171875, "loss": 0.102, "rewards/accuracies": 1.0, "rewards/chosen": 1.8048735857009888, "rewards/margins": 3.3964486122131348, "rewards/rejected": -1.5915749073028564, "step": 2765 }, { "epoch": 2.020821917808219, "grad_norm": 11.356052589695148, "learning_rate": 2.887619891114587e-07, "logits/chosen": -2.7825543880462646, "logits/rejected": -1.9613906145095825, "logps/chosen": -726.8602905273438, "logps/rejected": -541.6177368164062, "loss": 0.0666, "rewards/accuracies": 1.0, "rewards/chosen": 3.966780185699463, "rewards/margins": 4.9939727783203125, "rewards/rejected": -1.0271923542022705, "step": 2766 }, { "epoch": 2.0215525114155253, "grad_norm": 9.014672505062796, "learning_rate": 2.8860440602976e-07, "logits/chosen": -2.513643980026245, "logits/rejected": -1.7051587104797363, "logps/chosen": -400.205078125, "logps/rejected": -286.7459716796875, "loss": 0.062, "rewards/accuracies": 1.0, "rewards/chosen": 3.402366876602173, "rewards/margins": 4.798349857330322, "rewards/rejected": -1.3959826231002808, "step": 2767 }, { "epoch": 2.022283105022831, "grad_norm": 12.789917932982286, "learning_rate": 2.88446807233563e-07, "logits/chosen": -2.8726537227630615, "logits/rejected": -1.9564416408538818, "logps/chosen": -568.2840576171875, "logps/rejected": -444.5093078613281, "loss": 0.0757, "rewards/accuracies": 1.0, "rewards/chosen": 3.7353134155273438, "rewards/margins": 5.806370735168457, "rewards/rejected": -2.071057081222534, "step": 2768 }, { "epoch": 2.023013698630137, "grad_norm": 13.605958844890983, "learning_rate": 2.8828919278702085e-07, "logits/chosen": -2.8070969581604004, "logits/rejected": -2.2405683994293213, "logps/chosen": -456.4331359863281, "logps/rejected": -346.48663330078125, "loss": 0.1055, "rewards/accuracies": 1.0, "rewards/chosen": 3.1430959701538086, "rewards/margins": 4.361237049102783, "rewards/rejected": -1.2181410789489746, "step": 2769 }, { "epoch": 2.023744292237443, "grad_norm": 11.239157304770973, "learning_rate": 2.8813156275429277e-07, "logits/chosen": -2.9516115188598633, "logits/rejected": -2.5645740032196045, "logps/chosen": -1006.402587890625, "logps/rejected": -957.6138305664062, "loss": 0.0766, "rewards/accuracies": 1.0, "rewards/chosen": 4.063698768615723, "rewards/margins": 4.232636451721191, "rewards/rejected": -0.1689378023147583, "step": 2770 }, { "epoch": 2.0244748858447488, "grad_norm": 10.684520640779454, "learning_rate": 2.8797391719954447e-07, "logits/chosen": -2.9719433784484863, "logits/rejected": -2.8598203659057617, "logps/chosen": -755.4390869140625, "logps/rejected": -769.6465454101562, "loss": 0.0518, "rewards/accuracies": 1.0, "rewards/chosen": 4.848050117492676, "rewards/margins": 5.372829437255859, "rewards/rejected": -0.5247789025306702, "step": 2771 }, { "epoch": 2.0252054794520546, "grad_norm": 12.420842503276939, "learning_rate": 2.8781625618694776e-07, "logits/chosen": -3.0795812606811523, "logits/rejected": -2.657853603363037, "logps/chosen": -673.0791015625, "logps/rejected": -590.828125, "loss": 0.0506, "rewards/accuracies": 1.0, "rewards/chosen": 3.129608631134033, "rewards/margins": 4.592140197753906, "rewards/rejected": -1.4625316858291626, "step": 2772 }, { "epoch": 2.0259360730593605, "grad_norm": 12.657371791527074, "learning_rate": 2.8765857978068105e-07, "logits/chosen": -2.5671000480651855, "logits/rejected": -2.0001626014709473, "logps/chosen": -417.4136962890625, "logps/rejected": -359.4456787109375, "loss": 0.0658, "rewards/accuracies": 1.0, "rewards/chosen": 2.7617006301879883, "rewards/margins": 5.157078742980957, "rewards/rejected": -2.3953781127929688, "step": 2773 }, { "epoch": 2.026666666666667, "grad_norm": 13.429111435070709, "learning_rate": 2.875008880449288e-07, "logits/chosen": -2.5951011180877686, "logits/rejected": -2.0951297283172607, "logps/chosen": -410.6268005371094, "logps/rejected": -396.187255859375, "loss": 0.0622, "rewards/accuracies": 1.0, "rewards/chosen": 4.074591636657715, "rewards/margins": 5.828280448913574, "rewards/rejected": -1.753688931465149, "step": 2774 }, { "epoch": 2.0273972602739727, "grad_norm": 6.599666020584069, "learning_rate": 2.8734318104388174e-07, "logits/chosen": -3.2324671745300293, "logits/rejected": -2.6985530853271484, "logps/chosen": -617.0678100585938, "logps/rejected": -543.9070434570312, "loss": 0.0516, "rewards/accuracies": 1.0, "rewards/chosen": 3.5677545070648193, "rewards/margins": 5.348860740661621, "rewards/rejected": -1.78110671043396, "step": 2775 }, { "epoch": 2.0281278538812786, "grad_norm": 9.826275334565157, "learning_rate": 2.8718545884173693e-07, "logits/chosen": -2.73895001411438, "logits/rejected": -2.3295557498931885, "logps/chosen": -551.3740844726562, "logps/rejected": -475.2360534667969, "loss": 0.0491, "rewards/accuracies": 1.0, "rewards/chosen": 2.377763271331787, "rewards/margins": 3.20855975151062, "rewards/rejected": -0.8307967185974121, "step": 2776 }, { "epoch": 2.0288584474885845, "grad_norm": 11.564032045882747, "learning_rate": 2.8702772150269763e-07, "logits/chosen": -2.911332368850708, "logits/rejected": -2.3426291942596436, "logps/chosen": -936.7966918945312, "logps/rejected": -612.9821166992188, "loss": 0.0754, "rewards/accuracies": 1.0, "rewards/chosen": 5.415465831756592, "rewards/margins": 6.235922813415527, "rewards/rejected": -0.8204569220542908, "step": 2777 }, { "epoch": 2.0295890410958903, "grad_norm": 15.057043836417455, "learning_rate": 2.8686996909097295e-07, "logits/chosen": -2.6559040546417236, "logits/rejected": -1.8683634996414185, "logps/chosen": -368.423095703125, "logps/rejected": -349.9239501953125, "loss": 0.0793, "rewards/accuracies": 1.0, "rewards/chosen": 3.581751823425293, "rewards/margins": 7.182572364807129, "rewards/rejected": -3.600820541381836, "step": 2778 }, { "epoch": 2.030319634703196, "grad_norm": 9.580416780138135, "learning_rate": 2.867122016707785e-07, "logits/chosen": -2.5993027687072754, "logits/rejected": -2.5007078647613525, "logps/chosen": -684.3629150390625, "logps/rejected": -612.2815551757812, "loss": 0.048, "rewards/accuracies": 1.0, "rewards/chosen": 2.3231160640716553, "rewards/margins": 4.033052444458008, "rewards/rejected": -1.7099361419677734, "step": 2779 }, { "epoch": 2.031050228310502, "grad_norm": 14.526881588843803, "learning_rate": 2.865544193063358e-07, "logits/chosen": -2.7597644329071045, "logits/rejected": -2.050027370452881, "logps/chosen": -825.6019287109375, "logps/rejected": -762.5067749023438, "loss": 0.0689, "rewards/accuracies": 1.0, "rewards/chosen": 2.517585039138794, "rewards/margins": 4.75590705871582, "rewards/rejected": -2.2383222579956055, "step": 2780 }, { "epoch": 2.0317808219178084, "grad_norm": 9.919579043531984, "learning_rate": 2.863966220618726e-07, "logits/chosen": -2.7281999588012695, "logits/rejected": -2.260049343109131, "logps/chosen": -713.00390625, "logps/rejected": -619.6124267578125, "loss": 0.0393, "rewards/accuracies": 1.0, "rewards/chosen": 2.5164172649383545, "rewards/margins": 2.949889898300171, "rewards/rejected": -0.433472603559494, "step": 2781 }, { "epoch": 2.0325114155251143, "grad_norm": 15.007388012118867, "learning_rate": 2.862388100016225e-07, "logits/chosen": -2.1079447269439697, "logits/rejected": -2.3642337322235107, "logps/chosen": -412.3871765136719, "logps/rejected": -620.2969970703125, "loss": 0.0813, "rewards/accuracies": 1.0, "rewards/chosen": 2.0662832260131836, "rewards/margins": 4.371044158935547, "rewards/rejected": -2.3047614097595215, "step": 2782 }, { "epoch": 2.03324200913242, "grad_norm": 13.700791963950593, "learning_rate": 2.860809831898254e-07, "logits/chosen": -3.21732497215271, "logits/rejected": -1.963481068611145, "logps/chosen": -840.78759765625, "logps/rejected": -553.48046875, "loss": 0.0714, "rewards/accuracies": 1.0, "rewards/chosen": 4.131526947021484, "rewards/margins": 5.52020263671875, "rewards/rejected": -1.3886756896972656, "step": 2783 }, { "epoch": 2.033972602739726, "grad_norm": 14.05875221326364, "learning_rate": 2.8592314169072685e-07, "logits/chosen": -2.8480546474456787, "logits/rejected": -2.3911077976226807, "logps/chosen": -643.6402587890625, "logps/rejected": -520.5150756835938, "loss": 0.0583, "rewards/accuracies": 1.0, "rewards/chosen": 2.7461037635803223, "rewards/margins": 3.0814075469970703, "rewards/rejected": -0.33530378341674805, "step": 2784 }, { "epoch": 2.034703196347032, "grad_norm": 11.404959660533589, "learning_rate": 2.857652855685787e-07, "logits/chosen": -3.1899309158325195, "logits/rejected": -2.4478092193603516, "logps/chosen": -576.639404296875, "logps/rejected": -537.3720703125, "loss": 0.055, "rewards/accuracies": 1.0, "rewards/chosen": 2.6373586654663086, "rewards/margins": 4.40239143371582, "rewards/rejected": -1.7650330066680908, "step": 2785 }, { "epoch": 2.0354337899543378, "grad_norm": 12.930065728040415, "learning_rate": 2.8560741488763867e-07, "logits/chosen": -2.7164969444274902, "logits/rejected": -1.6742138862609863, "logps/chosen": -393.6080322265625, "logps/rejected": -293.52215576171875, "loss": 0.055, "rewards/accuracies": 1.0, "rewards/chosen": 1.9405474662780762, "rewards/margins": 4.50141716003418, "rewards/rejected": -2.5608699321746826, "step": 2786 }, { "epoch": 2.0361643835616436, "grad_norm": 12.54978185213324, "learning_rate": 2.854495297121703e-07, "logits/chosen": -2.6598894596099854, "logits/rejected": -2.5107672214508057, "logps/chosen": -676.6817626953125, "logps/rejected": -704.6312255859375, "loss": 0.0807, "rewards/accuracies": 1.0, "rewards/chosen": 2.580256462097168, "rewards/margins": 2.929692506790161, "rewards/rejected": -0.3494361340999603, "step": 2787 }, { "epoch": 2.03689497716895, "grad_norm": 14.439537003721274, "learning_rate": 2.8529163010644316e-07, "logits/chosen": -2.687445878982544, "logits/rejected": -2.317927598953247, "logps/chosen": -693.0025634765625, "logps/rejected": -683.201416015625, "loss": 0.0813, "rewards/accuracies": 1.0, "rewards/chosen": 2.5818288326263428, "rewards/margins": 3.9908335208892822, "rewards/rejected": -1.409004807472229, "step": 2788 }, { "epoch": 2.037625570776256, "grad_norm": 10.368132742915044, "learning_rate": 2.8513371613473255e-07, "logits/chosen": -2.3131139278411865, "logits/rejected": -2.0523622035980225, "logps/chosen": -488.7772521972656, "logps/rejected": -372.67401123046875, "loss": 0.0592, "rewards/accuracies": 1.0, "rewards/chosen": 2.190854072570801, "rewards/margins": 3.7711994647979736, "rewards/rejected": -1.5803451538085938, "step": 2789 }, { "epoch": 2.0383561643835617, "grad_norm": 8.459555501601013, "learning_rate": 2.849757878613198e-07, "logits/chosen": -2.7159857749938965, "logits/rejected": -2.346130847930908, "logps/chosen": -628.7581787109375, "logps/rejected": -839.0069580078125, "loss": 0.0469, "rewards/accuracies": 1.0, "rewards/chosen": 3.706559658050537, "rewards/margins": 6.412755012512207, "rewards/rejected": -2.7061948776245117, "step": 2790 }, { "epoch": 2.0390867579908676, "grad_norm": 16.433152236611228, "learning_rate": 2.8481784535049185e-07, "logits/chosen": -2.735243082046509, "logits/rejected": -2.7240705490112305, "logps/chosen": -534.919189453125, "logps/rejected": -618.2230224609375, "loss": 0.0685, "rewards/accuracies": 1.0, "rewards/chosen": 2.847304344177246, "rewards/margins": 4.709786891937256, "rewards/rejected": -1.862483024597168, "step": 2791 }, { "epoch": 2.0398173515981735, "grad_norm": 11.842991787034787, "learning_rate": 2.846598886665417e-07, "logits/chosen": -2.502242088317871, "logits/rejected": -1.9920685291290283, "logps/chosen": -716.6061401367188, "logps/rejected": -502.76068115234375, "loss": 0.0606, "rewards/accuracies": 1.0, "rewards/chosen": 2.9097087383270264, "rewards/margins": 3.957643985748291, "rewards/rejected": -1.0479352474212646, "step": 2792 }, { "epoch": 2.0405479452054793, "grad_norm": 8.745153338231393, "learning_rate": 2.845019178737678e-07, "logits/chosen": -2.8665266036987305, "logits/rejected": -2.4581985473632812, "logps/chosen": -687.2991333007812, "logps/rejected": -680.9486083984375, "loss": 0.04, "rewards/accuracies": 1.0, "rewards/chosen": 3.6906285285949707, "rewards/margins": 5.179945945739746, "rewards/rejected": -1.4893171787261963, "step": 2793 }, { "epoch": 2.041278538812785, "grad_norm": 16.98909364794119, "learning_rate": 2.843439330364747e-07, "logits/chosen": -3.327024459838867, "logits/rejected": -2.414316177368164, "logps/chosen": -577.0582885742188, "logps/rejected": -499.4842224121094, "loss": 0.0638, "rewards/accuracies": 1.0, "rewards/chosen": 3.456225872039795, "rewards/margins": 5.629327774047852, "rewards/rejected": -2.173102378845215, "step": 2794 }, { "epoch": 2.0420091324200915, "grad_norm": 16.01502857121144, "learning_rate": 2.841859342189723e-07, "logits/chosen": -2.6208860874176025, "logits/rejected": -1.5924036502838135, "logps/chosen": -480.6597900390625, "logps/rejected": -288.73370361328125, "loss": 0.1207, "rewards/accuracies": 1.0, "rewards/chosen": 3.5971360206604004, "rewards/margins": 5.6295928955078125, "rewards/rejected": -2.032456636428833, "step": 2795 }, { "epoch": 2.0427397260273974, "grad_norm": 12.791087051497398, "learning_rate": 2.840279214855765e-07, "logits/chosen": -2.7322287559509277, "logits/rejected": -1.3074016571044922, "logps/chosen": -632.6278686523438, "logps/rejected": -376.4090881347656, "loss": 0.063, "rewards/accuracies": 1.0, "rewards/chosen": 3.974773406982422, "rewards/margins": 6.823557376861572, "rewards/rejected": -2.8487842082977295, "step": 2796 }, { "epoch": 2.0434703196347033, "grad_norm": 17.721535653766985, "learning_rate": 2.838698949006087e-07, "logits/chosen": -2.4272682666778564, "logits/rejected": -2.4239959716796875, "logps/chosen": -530.453857421875, "logps/rejected": -499.5579833984375, "loss": 0.0838, "rewards/accuracies": 0.875, "rewards/chosen": 1.6715617179870605, "rewards/margins": 2.845762252807617, "rewards/rejected": -1.1742007732391357, "step": 2797 }, { "epoch": 2.044200913242009, "grad_norm": 7.4702649994077825, "learning_rate": 2.8371185452839593e-07, "logits/chosen": -2.818507671356201, "logits/rejected": -2.176116943359375, "logps/chosen": -309.43121337890625, "logps/rejected": -353.84796142578125, "loss": 0.0579, "rewards/accuracies": 1.0, "rewards/chosen": 1.9264557361602783, "rewards/margins": 4.1480255126953125, "rewards/rejected": -2.221569538116455, "step": 2798 }, { "epoch": 2.044931506849315, "grad_norm": 11.940694491382125, "learning_rate": 2.835538004332709e-07, "logits/chosen": -2.853574752807617, "logits/rejected": -2.7320375442504883, "logps/chosen": -761.4261474609375, "logps/rejected": -640.9293823242188, "loss": 0.0634, "rewards/accuracies": 1.0, "rewards/chosen": 2.737541437149048, "rewards/margins": 3.7332773208618164, "rewards/rejected": -0.9957361817359924, "step": 2799 }, { "epoch": 2.045662100456621, "grad_norm": 14.37076672127227, "learning_rate": 2.833957326795718e-07, "logits/chosen": -3.107070207595825, "logits/rejected": -2.1145572662353516, "logps/chosen": -480.3493957519531, "logps/rejected": -386.3134765625, "loss": 0.09, "rewards/accuracies": 1.0, "rewards/chosen": 1.8907734155654907, "rewards/margins": 3.1783010959625244, "rewards/rejected": -1.2875277996063232, "step": 2800 }, { "epoch": 2.0463926940639268, "grad_norm": 15.413215594606477, "learning_rate": 2.8323765133164277e-07, "logits/chosen": -2.5828421115875244, "logits/rejected": -1.8894484043121338, "logps/chosen": -601.4205322265625, "logps/rejected": -529.218017578125, "loss": 0.0809, "rewards/accuracies": 1.0, "rewards/chosen": 4.229267597198486, "rewards/margins": 5.748125076293945, "rewards/rejected": -1.5188579559326172, "step": 2801 }, { "epoch": 2.047123287671233, "grad_norm": 9.873856104014934, "learning_rate": 2.830795564538328e-07, "logits/chosen": -2.076244354248047, "logits/rejected": -2.1298742294311523, "logps/chosen": -753.827880859375, "logps/rejected": -845.9708862304688, "loss": 0.0407, "rewards/accuracies": 1.0, "rewards/chosen": 2.083237648010254, "rewards/margins": 3.48266339302063, "rewards/rejected": -1.3994258642196655, "step": 2802 }, { "epoch": 2.047853881278539, "grad_norm": 13.266389894393958, "learning_rate": 2.8292144811049694e-07, "logits/chosen": -2.93098783493042, "logits/rejected": -2.0357842445373535, "logps/chosen": -756.17236328125, "logps/rejected": -469.9952697753906, "loss": 0.0722, "rewards/accuracies": 1.0, "rewards/chosen": 3.4108757972717285, "rewards/margins": 4.86814022064209, "rewards/rejected": -1.4572643041610718, "step": 2803 }, { "epoch": 2.048584474885845, "grad_norm": 17.322278728043585, "learning_rate": 2.8276332636599555e-07, "logits/chosen": -2.820770263671875, "logits/rejected": -2.349514961242676, "logps/chosen": -569.971435546875, "logps/rejected": -584.5299682617188, "loss": 0.0895, "rewards/accuracies": 1.0, "rewards/chosen": 3.4129717350006104, "rewards/margins": 5.138643741607666, "rewards/rejected": -1.7256717681884766, "step": 2804 }, { "epoch": 2.0493150684931507, "grad_norm": 14.622665871096638, "learning_rate": 2.8260519128469443e-07, "logits/chosen": -3.2906084060668945, "logits/rejected": -2.0944323539733887, "logps/chosen": -866.614013671875, "logps/rejected": -486.4020080566406, "loss": 0.0407, "rewards/accuracies": 1.0, "rewards/chosen": 3.831016778945923, "rewards/margins": 4.003119468688965, "rewards/rejected": -0.17210230231285095, "step": 2805 }, { "epoch": 2.0500456621004566, "grad_norm": 15.24501231192107, "learning_rate": 2.824470429309648e-07, "logits/chosen": -2.7861270904541016, "logits/rejected": -2.00576114654541, "logps/chosen": -448.75, "logps/rejected": -346.2218017578125, "loss": 0.0624, "rewards/accuracies": 1.0, "rewards/chosen": 3.781294345855713, "rewards/margins": 6.578821659088135, "rewards/rejected": -2.797527313232422, "step": 2806 }, { "epoch": 2.0507762557077625, "grad_norm": 13.302898257552046, "learning_rate": 2.8228888136918337e-07, "logits/chosen": -2.7178235054016113, "logits/rejected": -2.785399913787842, "logps/chosen": -680.1475830078125, "logps/rejected": -804.85400390625, "loss": 0.0767, "rewards/accuracies": 1.0, "rewards/chosen": 3.6328139305114746, "rewards/margins": 4.043583869934082, "rewards/rejected": -0.41076987981796265, "step": 2807 }, { "epoch": 2.0515068493150683, "grad_norm": 12.022577028176508, "learning_rate": 2.82130706663732e-07, "logits/chosen": -2.8901875019073486, "logits/rejected": -2.194830894470215, "logps/chosen": -563.0361938476562, "logps/rejected": -461.7312927246094, "loss": 0.0626, "rewards/accuracies": 1.0, "rewards/chosen": 3.153153896331787, "rewards/margins": 5.416736125946045, "rewards/rejected": -2.263582229614258, "step": 2808 }, { "epoch": 2.0522374429223746, "grad_norm": 7.743010291483761, "learning_rate": 2.819725188789982e-07, "logits/chosen": -2.7253222465515137, "logits/rejected": -1.899350643157959, "logps/chosen": -485.7938537597656, "logps/rejected": -357.9721984863281, "loss": 0.0314, "rewards/accuracies": 1.0, "rewards/chosen": 4.752298831939697, "rewards/margins": 6.627305030822754, "rewards/rejected": -1.8750059604644775, "step": 2809 }, { "epoch": 2.0529680365296805, "grad_norm": 20.073664235749522, "learning_rate": 2.8181431807937456e-07, "logits/chosen": -2.408284902572632, "logits/rejected": -1.7224740982055664, "logps/chosen": -629.8564453125, "logps/rejected": -610.0287475585938, "loss": 0.1132, "rewards/accuracies": 0.875, "rewards/chosen": 3.24747371673584, "rewards/margins": 5.732237339019775, "rewards/rejected": -2.4847638607025146, "step": 2810 }, { "epoch": 2.0536986301369864, "grad_norm": 12.629507940739233, "learning_rate": 2.816561043292592e-07, "logits/chosen": -2.645106077194214, "logits/rejected": -2.308567523956299, "logps/chosen": -790.98046875, "logps/rejected": -596.8068237304688, "loss": 0.0501, "rewards/accuracies": 1.0, "rewards/chosen": 3.359739303588867, "rewards/margins": 4.0048980712890625, "rewards/rejected": -0.6451584100723267, "step": 2811 }, { "epoch": 2.0544292237442923, "grad_norm": 19.18156734041214, "learning_rate": 2.814978776930553e-07, "logits/chosen": -2.7673301696777344, "logits/rejected": -2.0800130367279053, "logps/chosen": -628.3873901367188, "logps/rejected": -461.1058349609375, "loss": 0.0888, "rewards/accuracies": 1.0, "rewards/chosen": 3.448598861694336, "rewards/margins": 5.339125156402588, "rewards/rejected": -1.8905258178710938, "step": 2812 }, { "epoch": 2.055159817351598, "grad_norm": 14.573172264593445, "learning_rate": 2.813396382351713e-07, "logits/chosen": -2.596759796142578, "logits/rejected": -2.26279616355896, "logps/chosen": -403.8642578125, "logps/rejected": -529.9150390625, "loss": 0.0598, "rewards/accuracies": 1.0, "rewards/chosen": 3.2424190044403076, "rewards/margins": 6.032200336456299, "rewards/rejected": -2.789781093597412, "step": 2813 }, { "epoch": 2.055890410958904, "grad_norm": 12.925679727238053, "learning_rate": 2.8118138602002114e-07, "logits/chosen": -2.7353358268737793, "logits/rejected": -2.004655361175537, "logps/chosen": -725.41748046875, "logps/rejected": -546.2952880859375, "loss": 0.0667, "rewards/accuracies": 1.0, "rewards/chosen": 3.6392040252685547, "rewards/margins": 5.012126922607422, "rewards/rejected": -1.3729228973388672, "step": 2814 }, { "epoch": 2.05662100456621, "grad_norm": 11.593054408673364, "learning_rate": 2.8102312111202345e-07, "logits/chosen": -2.974548578262329, "logits/rejected": -2.3849568367004395, "logps/chosen": -991.1032104492188, "logps/rejected": -614.7850341796875, "loss": 0.0539, "rewards/accuracies": 1.0, "rewards/chosen": 4.7317728996276855, "rewards/margins": 4.9466233253479, "rewards/rejected": -0.2148503065109253, "step": 2815 }, { "epoch": 2.0573515981735158, "grad_norm": 15.574920232822674, "learning_rate": 2.808648435756026e-07, "logits/chosen": -2.6671459674835205, "logits/rejected": -2.2769899368286133, "logps/chosen": -579.2822265625, "logps/rejected": -444.07305908203125, "loss": 0.1323, "rewards/accuracies": 0.875, "rewards/chosen": 2.246476888656616, "rewards/margins": 3.8768484592437744, "rewards/rejected": -1.6303715705871582, "step": 2816 }, { "epoch": 2.058082191780822, "grad_norm": 9.791348932400256, "learning_rate": 2.8070655347518757e-07, "logits/chosen": -2.9828498363494873, "logits/rejected": -3.0023725032806396, "logps/chosen": -222.04678344726562, "logps/rejected": -311.9112854003906, "loss": 0.052, "rewards/accuracies": 1.0, "rewards/chosen": 2.1708922386169434, "rewards/margins": 5.190472602844238, "rewards/rejected": -3.019580602645874, "step": 2817 }, { "epoch": 2.058812785388128, "grad_norm": 9.32290375493752, "learning_rate": 2.8054825087521295e-07, "logits/chosen": -3.306472063064575, "logits/rejected": -2.237179756164551, "logps/chosen": -734.2496948242188, "logps/rejected": -733.7556762695312, "loss": 0.0362, "rewards/accuracies": 1.0, "rewards/chosen": 3.3096415996551514, "rewards/margins": 4.439505577087402, "rewards/rejected": -1.1298637390136719, "step": 2818 }, { "epoch": 2.059543378995434, "grad_norm": 7.2285478163968415, "learning_rate": 2.8038993584011794e-07, "logits/chosen": -2.9165477752685547, "logits/rejected": -2.3791422843933105, "logps/chosen": -845.432373046875, "logps/rejected": -613.53369140625, "loss": 0.0553, "rewards/accuracies": 1.0, "rewards/chosen": 3.3990025520324707, "rewards/margins": 3.3587894439697266, "rewards/rejected": 0.040213242173194885, "step": 2819 }, { "epoch": 2.0602739726027397, "grad_norm": 15.037656184247098, "learning_rate": 2.802316084343472e-07, "logits/chosen": -2.780968189239502, "logits/rejected": -2.1433944702148438, "logps/chosen": -863.168212890625, "logps/rejected": -795.52490234375, "loss": 0.0587, "rewards/accuracies": 1.0, "rewards/chosen": 4.824169635772705, "rewards/margins": 5.724096298217773, "rewards/rejected": -0.8999266624450684, "step": 2820 }, { "epoch": 2.0610045662100456, "grad_norm": 9.2443983282667, "learning_rate": 2.800732687223501e-07, "logits/chosen": -3.040537118911743, "logits/rejected": -1.3597095012664795, "logps/chosen": -754.66162109375, "logps/rejected": -319.5744323730469, "loss": 0.0457, "rewards/accuracies": 1.0, "rewards/chosen": 5.182303428649902, "rewards/margins": 6.461315155029297, "rewards/rejected": -1.2790114879608154, "step": 2821 }, { "epoch": 2.0617351598173514, "grad_norm": 6.475679233818383, "learning_rate": 2.7991491676858135e-07, "logits/chosen": -2.784775972366333, "logits/rejected": -2.3017067909240723, "logps/chosen": -686.0027465820312, "logps/rejected": -592.2652587890625, "loss": 0.0201, "rewards/accuracies": 1.0, "rewards/chosen": 4.946116924285889, "rewards/margins": 6.120927333831787, "rewards/rejected": -1.1748104095458984, "step": 2822 }, { "epoch": 2.0624657534246573, "grad_norm": 8.241869532521932, "learning_rate": 2.7975655263750034e-07, "logits/chosen": -2.4493608474731445, "logits/rejected": -2.701991558074951, "logps/chosen": -670.98681640625, "logps/rejected": -988.7477416992188, "loss": 0.0331, "rewards/accuracies": 1.0, "rewards/chosen": 4.577834129333496, "rewards/margins": 5.158603668212891, "rewards/rejected": -0.5807696580886841, "step": 2823 }, { "epoch": 2.0631963470319636, "grad_norm": 12.547845444257304, "learning_rate": 2.795981763935716e-07, "logits/chosen": -3.143493175506592, "logits/rejected": -2.4418797492980957, "logps/chosen": -699.350341796875, "logps/rejected": -481.46331787109375, "loss": 0.0727, "rewards/accuracies": 1.0, "rewards/chosen": 3.7819838523864746, "rewards/margins": 4.860162734985352, "rewards/rejected": -1.0781792402267456, "step": 2824 }, { "epoch": 2.0639269406392695, "grad_norm": 11.044414672538531, "learning_rate": 2.794397881012647e-07, "logits/chosen": -2.7999484539031982, "logits/rejected": -2.1931190490722656, "logps/chosen": -556.7482299804688, "logps/rejected": -455.94842529296875, "loss": 0.0472, "rewards/accuracies": 1.0, "rewards/chosen": 3.087207317352295, "rewards/margins": 5.4626007080078125, "rewards/rejected": -2.3753933906555176, "step": 2825 }, { "epoch": 2.0646575342465754, "grad_norm": 8.514775169034072, "learning_rate": 2.7928138782505355e-07, "logits/chosen": -2.830043315887451, "logits/rejected": -1.522032618522644, "logps/chosen": -953.346923828125, "logps/rejected": -448.32366943359375, "loss": 0.0418, "rewards/accuracies": 1.0, "rewards/chosen": 3.2869277000427246, "rewards/margins": 3.474583625793457, "rewards/rejected": -0.18765594065189362, "step": 2826 }, { "epoch": 2.0653881278538813, "grad_norm": 9.746262836709596, "learning_rate": 2.7912297562941766e-07, "logits/chosen": -2.219637870788574, "logits/rejected": -2.1641464233398438, "logps/chosen": -610.3864135742188, "logps/rejected": -798.6988525390625, "loss": 0.0409, "rewards/accuracies": 1.0, "rewards/chosen": 4.265057563781738, "rewards/margins": 5.902202606201172, "rewards/rejected": -1.6371450424194336, "step": 2827 }, { "epoch": 2.066118721461187, "grad_norm": 10.077834298132958, "learning_rate": 2.78964551578841e-07, "logits/chosen": -2.589313507080078, "logits/rejected": -2.381667375564575, "logps/chosen": -695.0732421875, "logps/rejected": -768.863525390625, "loss": 0.0516, "rewards/accuracies": 1.0, "rewards/chosen": 2.0254440307617188, "rewards/margins": 3.718930959701538, "rewards/rejected": -1.6934869289398193, "step": 2828 }, { "epoch": 2.066849315068493, "grad_norm": 10.755180707377251, "learning_rate": 2.788061157378124e-07, "logits/chosen": -3.1474549770355225, "logits/rejected": -2.500870704650879, "logps/chosen": -833.7789306640625, "logps/rejected": -587.979736328125, "loss": 0.0597, "rewards/accuracies": 1.0, "rewards/chosen": 2.9509315490722656, "rewards/margins": 3.6356382369995117, "rewards/rejected": -0.6847065091133118, "step": 2829 }, { "epoch": 2.067579908675799, "grad_norm": 12.625088228729458, "learning_rate": 2.786476681708256e-07, "logits/chosen": -2.5897531509399414, "logits/rejected": -1.6311495304107666, "logps/chosen": -700.72900390625, "logps/rejected": -606.2816772460938, "loss": 0.068, "rewards/accuracies": 1.0, "rewards/chosen": 5.017858982086182, "rewards/margins": 7.628782272338867, "rewards/rejected": -2.6109232902526855, "step": 2830 }, { "epoch": 2.068310502283105, "grad_norm": 10.940081656812021, "learning_rate": 2.7848920894237904e-07, "logits/chosen": -2.874840021133423, "logits/rejected": -2.805259943008423, "logps/chosen": -509.80084228515625, "logps/rejected": -510.90789794921875, "loss": 0.0613, "rewards/accuracies": 1.0, "rewards/chosen": 2.9704463481903076, "rewards/margins": 4.852057456970215, "rewards/rejected": -1.8816107511520386, "step": 2831 }, { "epoch": 2.069041095890411, "grad_norm": 9.1617664995931, "learning_rate": 2.783307381169758e-07, "logits/chosen": -2.99375581741333, "logits/rejected": -2.3683319091796875, "logps/chosen": -730.78759765625, "logps/rejected": -593.91015625, "loss": 0.0392, "rewards/accuracies": 1.0, "rewards/chosen": 3.453695058822632, "rewards/margins": 3.8453192710876465, "rewards/rejected": -0.39162424206733704, "step": 2832 }, { "epoch": 2.069771689497717, "grad_norm": 16.532411127109718, "learning_rate": 2.781722557591238e-07, "logits/chosen": -3.0303311347961426, "logits/rejected": -2.1327292919158936, "logps/chosen": -707.552734375, "logps/rejected": -467.7052307128906, "loss": 0.1095, "rewards/accuracies": 1.0, "rewards/chosen": 2.99472713470459, "rewards/margins": 4.1635212898254395, "rewards/rejected": -1.1687942743301392, "step": 2833 }, { "epoch": 2.070502283105023, "grad_norm": 12.781891076908499, "learning_rate": 2.780137619333357e-07, "logits/chosen": -2.718979835510254, "logits/rejected": -2.3465676307678223, "logps/chosen": -592.8646240234375, "logps/rejected": -494.0007019042969, "loss": 0.096, "rewards/accuracies": 1.0, "rewards/chosen": 2.3775999546051025, "rewards/margins": 3.534968376159668, "rewards/rejected": -1.157368540763855, "step": 2834 }, { "epoch": 2.0712328767123287, "grad_norm": 16.520411033412294, "learning_rate": 2.778552567041288e-07, "logits/chosen": -2.8044216632843018, "logits/rejected": -2.713064670562744, "logps/chosen": -585.9417114257812, "logps/rejected": -575.5286254882812, "loss": 0.096, "rewards/accuracies": 1.0, "rewards/chosen": 2.5857224464416504, "rewards/margins": 2.8753294944763184, "rewards/rejected": -0.2896069586277008, "step": 2835 }, { "epoch": 2.0719634703196346, "grad_norm": 17.356284181412146, "learning_rate": 2.77696740136025e-07, "logits/chosen": -2.849046468734741, "logits/rejected": -2.008124589920044, "logps/chosen": -638.0811767578125, "logps/rejected": -485.873779296875, "loss": 0.0881, "rewards/accuracies": 1.0, "rewards/chosen": 3.9163661003112793, "rewards/margins": 5.729956150054932, "rewards/rejected": -1.8135896921157837, "step": 2836 }, { "epoch": 2.0726940639269404, "grad_norm": 13.153527637354195, "learning_rate": 2.7753821229355076e-07, "logits/chosen": -2.9094529151916504, "logits/rejected": -2.0721821784973145, "logps/chosen": -662.6350708007812, "logps/rejected": -424.9360656738281, "loss": 0.0786, "rewards/accuracies": 1.0, "rewards/chosen": 2.9440813064575195, "rewards/margins": 4.557198524475098, "rewards/rejected": -1.613116979598999, "step": 2837 }, { "epoch": 2.0734246575342468, "grad_norm": 9.740879194304144, "learning_rate": 2.773796732412373e-07, "logits/chosen": -2.8256053924560547, "logits/rejected": -1.9886685609817505, "logps/chosen": -700.5616455078125, "logps/rejected": -548.9320678710938, "loss": 0.0419, "rewards/accuracies": 1.0, "rewards/chosen": 5.491672515869141, "rewards/margins": 6.1939167976379395, "rewards/rejected": -0.7022445797920227, "step": 2838 }, { "epoch": 2.0741552511415526, "grad_norm": 10.547813543309987, "learning_rate": 2.772211230436202e-07, "logits/chosen": -2.3935279846191406, "logits/rejected": -2.1428136825561523, "logps/chosen": -776.595947265625, "logps/rejected": -988.8290405273438, "loss": 0.0449, "rewards/accuracies": 1.0, "rewards/chosen": 3.0582637786865234, "rewards/margins": 4.28659725189209, "rewards/rejected": -1.2283337116241455, "step": 2839 }, { "epoch": 2.0748858447488585, "grad_norm": 12.969907704350675, "learning_rate": 2.770625617652398e-07, "logits/chosen": -2.8624255657196045, "logits/rejected": -2.713290214538574, "logps/chosen": -883.71728515625, "logps/rejected": -798.7321166992188, "loss": 0.0568, "rewards/accuracies": 0.875, "rewards/chosen": 4.303781986236572, "rewards/margins": 4.356916427612305, "rewards/rejected": -0.05313413590192795, "step": 2840 }, { "epoch": 2.0756164383561644, "grad_norm": 11.373112448559459, "learning_rate": 2.7690398947064064e-07, "logits/chosen": -3.2947983741760254, "logits/rejected": -2.817534923553467, "logps/chosen": -1072.9852294921875, "logps/rejected": -952.4578247070312, "loss": 0.0442, "rewards/accuracies": 1.0, "rewards/chosen": 2.890988826751709, "rewards/margins": 4.974687576293945, "rewards/rejected": -2.0836987495422363, "step": 2841 }, { "epoch": 2.0763470319634703, "grad_norm": 8.439296502900415, "learning_rate": 2.767454062243722e-07, "logits/chosen": -2.510685443878174, "logits/rejected": -2.5605039596557617, "logps/chosen": -250.87246704101562, "logps/rejected": -430.3250732421875, "loss": 0.0436, "rewards/accuracies": 1.0, "rewards/chosen": 3.0542759895324707, "rewards/margins": 5.373325824737549, "rewards/rejected": -2.319049835205078, "step": 2842 }, { "epoch": 2.077077625570776, "grad_norm": 11.237506626284198, "learning_rate": 2.765868120909879e-07, "logits/chosen": -2.997992515563965, "logits/rejected": -2.6921355724334717, "logps/chosen": -1051.0635986328125, "logps/rejected": -875.0570678710938, "loss": 0.0567, "rewards/accuracies": 1.0, "rewards/chosen": 4.349395751953125, "rewards/margins": 4.41776180267334, "rewards/rejected": -0.06836624443531036, "step": 2843 }, { "epoch": 2.077808219178082, "grad_norm": 11.417517043322961, "learning_rate": 2.764282071350459e-07, "logits/chosen": -2.4333274364471436, "logits/rejected": -2.0097713470458984, "logps/chosen": -627.7901000976562, "logps/rejected": -461.2400817871094, "loss": 0.0525, "rewards/accuracies": 1.0, "rewards/chosen": 2.3010153770446777, "rewards/margins": 4.138881683349609, "rewards/rejected": -1.837866187095642, "step": 2844 }, { "epoch": 2.0785388127853883, "grad_norm": 11.669461451429807, "learning_rate": 2.762695914211088e-07, "logits/chosen": -3.0358529090881348, "logits/rejected": -2.593984365463257, "logps/chosen": -692.8508911132812, "logps/rejected": -696.610107421875, "loss": 0.0505, "rewards/accuracies": 1.0, "rewards/chosen": 2.11025071144104, "rewards/margins": 3.60366153717041, "rewards/rejected": -1.4934108257293701, "step": 2845 }, { "epoch": 2.079269406392694, "grad_norm": 11.92178028714016, "learning_rate": 2.761109650137435e-07, "logits/chosen": -2.6151485443115234, "logits/rejected": -2.140639543533325, "logps/chosen": -491.7170715332031, "logps/rejected": -374.398193359375, "loss": 0.0813, "rewards/accuracies": 1.0, "rewards/chosen": 3.098637342453003, "rewards/margins": 5.347391128540039, "rewards/rejected": -2.248753786087036, "step": 2846 }, { "epoch": 2.08, "grad_norm": 16.461520553568157, "learning_rate": 2.7595232797752113e-07, "logits/chosen": -2.8177247047424316, "logits/rejected": -2.1644933223724365, "logps/chosen": -660.60791015625, "logps/rejected": -534.325927734375, "loss": 0.0573, "rewards/accuracies": 1.0, "rewards/chosen": 4.573816299438477, "rewards/margins": 6.652486801147461, "rewards/rejected": -2.0786709785461426, "step": 2847 }, { "epoch": 2.080730593607306, "grad_norm": 14.417037929686376, "learning_rate": 2.757936803770173e-07, "logits/chosen": -3.2078707218170166, "logits/rejected": -2.6044180393218994, "logps/chosen": -902.917236328125, "logps/rejected": -824.0797119140625, "loss": 0.0734, "rewards/accuracies": 1.0, "rewards/chosen": 3.5141751766204834, "rewards/margins": 4.770003318786621, "rewards/rejected": -1.2558281421661377, "step": 2848 }, { "epoch": 2.081461187214612, "grad_norm": 15.991532469334308, "learning_rate": 2.7563502227681184e-07, "logits/chosen": -2.912709951400757, "logits/rejected": -3.031276226043701, "logps/chosen": -468.47882080078125, "logps/rejected": -477.458984375, "loss": 0.0931, "rewards/accuracies": 1.0, "rewards/chosen": 2.641663074493408, "rewards/margins": 3.561333417892456, "rewards/rejected": -0.9196702241897583, "step": 2849 }, { "epoch": 2.0821917808219177, "grad_norm": 19.119641718999095, "learning_rate": 2.7547635374148897e-07, "logits/chosen": -2.333909273147583, "logits/rejected": -1.6664237976074219, "logps/chosen": -432.322509765625, "logps/rejected": -410.5354919433594, "loss": 0.0914, "rewards/accuracies": 1.0, "rewards/chosen": 2.60856556892395, "rewards/margins": 5.5541839599609375, "rewards/rejected": -2.9456183910369873, "step": 2850 }, { "epoch": 2.0829223744292236, "grad_norm": 14.818382929869575, "learning_rate": 2.7531767483563706e-07, "logits/chosen": -2.4497897624969482, "logits/rejected": -2.0667850971221924, "logps/chosen": -349.6015319824219, "logps/rejected": -303.4981994628906, "loss": 0.0613, "rewards/accuracies": 1.0, "rewards/chosen": 3.2358479499816895, "rewards/margins": 6.725107669830322, "rewards/rejected": -3.489259719848633, "step": 2851 }, { "epoch": 2.08365296803653, "grad_norm": 10.770962879771792, "learning_rate": 2.7515898562384867e-07, "logits/chosen": -3.3279733657836914, "logits/rejected": -1.8618628978729248, "logps/chosen": -565.3404541015625, "logps/rejected": -317.1680603027344, "loss": 0.0582, "rewards/accuracies": 1.0, "rewards/chosen": 2.894556760787964, "rewards/margins": 4.686995983123779, "rewards/rejected": -1.7924389839172363, "step": 2852 }, { "epoch": 2.0843835616438358, "grad_norm": 13.019115772136034, "learning_rate": 2.750002861707207e-07, "logits/chosen": -2.524200439453125, "logits/rejected": -1.242738962173462, "logps/chosen": -493.5924377441406, "logps/rejected": -236.47784423828125, "loss": 0.0722, "rewards/accuracies": 1.0, "rewards/chosen": 3.4862618446350098, "rewards/margins": 6.24079704284668, "rewards/rejected": -2.75453519821167, "step": 2853 }, { "epoch": 2.0851141552511416, "grad_norm": 10.581061852861556, "learning_rate": 2.74841576540854e-07, "logits/chosen": -2.2112088203430176, "logits/rejected": -1.8711371421813965, "logps/chosen": -428.0736999511719, "logps/rejected": -404.17755126953125, "loss": 0.0715, "rewards/accuracies": 1.0, "rewards/chosen": 2.852184772491455, "rewards/margins": 4.47296142578125, "rewards/rejected": -1.6207767724990845, "step": 2854 }, { "epoch": 2.0858447488584475, "grad_norm": 15.25060583693496, "learning_rate": 2.746828567988538e-07, "logits/chosen": -2.9562788009643555, "logits/rejected": -1.8661020994186401, "logps/chosen": -489.2752380371094, "logps/rejected": -349.6240234375, "loss": 0.1144, "rewards/accuracies": 1.0, "rewards/chosen": 3.259594440460205, "rewards/margins": 4.940882205963135, "rewards/rejected": -1.6812875270843506, "step": 2855 }, { "epoch": 2.0865753424657534, "grad_norm": 8.43123184779018, "learning_rate": 2.7452412700932926e-07, "logits/chosen": -2.9073808193206787, "logits/rejected": -1.801117181777954, "logps/chosen": -593.8988037109375, "logps/rejected": -467.46826171875, "loss": 0.0434, "rewards/accuracies": 1.0, "rewards/chosen": 2.6274991035461426, "rewards/margins": 5.0547003746032715, "rewards/rejected": -2.427201271057129, "step": 2856 }, { "epoch": 2.0873059360730593, "grad_norm": 11.240400160321702, "learning_rate": 2.7436538723689376e-07, "logits/chosen": -2.4401395320892334, "logits/rejected": -2.48087739944458, "logps/chosen": -240.4925537109375, "logps/rejected": -329.3779602050781, "loss": 0.0624, "rewards/accuracies": 1.0, "rewards/chosen": 0.8109318017959595, "rewards/margins": 3.6307668685913086, "rewards/rejected": -2.8198351860046387, "step": 2857 }, { "epoch": 2.088036529680365, "grad_norm": 8.480885274490069, "learning_rate": 2.742066375461646e-07, "logits/chosen": -3.4116668701171875, "logits/rejected": -1.917006254196167, "logps/chosen": -600.935546875, "logps/rejected": -263.79388427734375, "loss": 0.0389, "rewards/accuracies": 1.0, "rewards/chosen": 3.802145004272461, "rewards/margins": 4.56767463684082, "rewards/rejected": -0.7655292749404907, "step": 2858 }, { "epoch": 2.0887671232876714, "grad_norm": 12.237020038548115, "learning_rate": 2.740478780017634e-07, "logits/chosen": -3.106822967529297, "logits/rejected": -2.2703640460968018, "logps/chosen": -830.7164916992188, "logps/rejected": -473.6798095703125, "loss": 0.0717, "rewards/accuracies": 1.0, "rewards/chosen": 4.477914810180664, "rewards/margins": 5.2365312576293945, "rewards/rejected": -0.7586162090301514, "step": 2859 }, { "epoch": 2.0894977168949773, "grad_norm": 12.868011059140573, "learning_rate": 2.738891086683153e-07, "logits/chosen": -2.5246589183807373, "logits/rejected": -1.5824089050292969, "logps/chosen": -493.021728515625, "logps/rejected": -378.16192626953125, "loss": 0.045, "rewards/accuracies": 1.0, "rewards/chosen": 3.5983715057373047, "rewards/margins": 6.246539115905762, "rewards/rejected": -2.648167848587036, "step": 2860 }, { "epoch": 2.090228310502283, "grad_norm": 10.589463348479425, "learning_rate": 2.7373032961044995e-07, "logits/chosen": -2.445969581604004, "logits/rejected": -2.198025703430176, "logps/chosen": -521.9286499023438, "logps/rejected": -464.0326843261719, "loss": 0.0511, "rewards/accuracies": 1.0, "rewards/chosen": 2.949031352996826, "rewards/margins": 5.769327163696289, "rewards/rejected": -2.820296049118042, "step": 2861 }, { "epoch": 2.090958904109589, "grad_norm": 12.710875076675238, "learning_rate": 2.7357154089280063e-07, "logits/chosen": -2.5595898628234863, "logits/rejected": -2.09291410446167, "logps/chosen": -351.94970703125, "logps/rejected": -399.1710510253906, "loss": 0.053, "rewards/accuracies": 1.0, "rewards/chosen": 3.080423355102539, "rewards/margins": 5.679587364196777, "rewards/rejected": -2.5991640090942383, "step": 2862 }, { "epoch": 2.091689497716895, "grad_norm": 10.0709609954112, "learning_rate": 2.7341274258000476e-07, "logits/chosen": -2.8189356327056885, "logits/rejected": -1.6208484172821045, "logps/chosen": -387.50286865234375, "logps/rejected": -272.6556396484375, "loss": 0.0621, "rewards/accuracies": 1.0, "rewards/chosen": 3.3362679481506348, "rewards/margins": 6.184452533721924, "rewards/rejected": -2.848184585571289, "step": 2863 }, { "epoch": 2.092420091324201, "grad_norm": 13.22271452627483, "learning_rate": 2.7325393473670343e-07, "logits/chosen": -2.819911479949951, "logits/rejected": -2.2794671058654785, "logps/chosen": -603.2364501953125, "logps/rejected": -547.9590454101562, "loss": 0.0823, "rewards/accuracies": 1.0, "rewards/chosen": 3.52329683303833, "rewards/margins": 4.047979354858398, "rewards/rejected": -0.5246821045875549, "step": 2864 }, { "epoch": 2.0931506849315067, "grad_norm": 12.698677438670778, "learning_rate": 2.730951174275418e-07, "logits/chosen": -2.6173760890960693, "logits/rejected": -2.111680507659912, "logps/chosen": -530.2626953125, "logps/rejected": -404.5290222167969, "loss": 0.0752, "rewards/accuracies": 1.0, "rewards/chosen": 2.7056941986083984, "rewards/margins": 3.8461461067199707, "rewards/rejected": -1.1404515504837036, "step": 2865 }, { "epoch": 2.093881278538813, "grad_norm": 10.823687283957277, "learning_rate": 2.7293629071716876e-07, "logits/chosen": -3.028290271759033, "logits/rejected": -2.6751246452331543, "logps/chosen": -590.4658203125, "logps/rejected": -662.7252807617188, "loss": 0.0697, "rewards/accuracies": 1.0, "rewards/chosen": 2.266392230987549, "rewards/margins": 3.8402764797210693, "rewards/rejected": -1.573884129524231, "step": 2866 }, { "epoch": 2.094611872146119, "grad_norm": 6.997780039131998, "learning_rate": 2.727774546702372e-07, "logits/chosen": -2.490987539291382, "logits/rejected": -2.393066167831421, "logps/chosen": -307.1809387207031, "logps/rejected": -451.2991943359375, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": 2.3593647480010986, "rewards/margins": 6.031164646148682, "rewards/rejected": -3.6718006134033203, "step": 2867 }, { "epoch": 2.0953424657534248, "grad_norm": 10.326179078692872, "learning_rate": 2.726186093514036e-07, "logits/chosen": -2.879061222076416, "logits/rejected": -2.2171716690063477, "logps/chosen": -990.6331176757812, "logps/rejected": -714.064453125, "loss": 0.0422, "rewards/accuracies": 1.0, "rewards/chosen": 4.380631923675537, "rewards/margins": 5.5188093185424805, "rewards/rejected": -1.1381776332855225, "step": 2868 }, { "epoch": 2.0960730593607306, "grad_norm": 9.678056211538111, "learning_rate": 2.724597548253283e-07, "logits/chosen": -2.9721503257751465, "logits/rejected": -2.6373934745788574, "logps/chosen": -718.1233520507812, "logps/rejected": -578.6135864257812, "loss": 0.0371, "rewards/accuracies": 1.0, "rewards/chosen": 3.0788779258728027, "rewards/margins": 4.057523250579834, "rewards/rejected": -0.9786452651023865, "step": 2869 }, { "epoch": 2.0968036529680365, "grad_norm": 16.628502039514114, "learning_rate": 2.723008911566755e-07, "logits/chosen": -3.268397331237793, "logits/rejected": -2.6709368228912354, "logps/chosen": -562.5034790039062, "logps/rejected": -597.775146484375, "loss": 0.0673, "rewards/accuracies": 1.0, "rewards/chosen": 3.252622127532959, "rewards/margins": 4.416927814483643, "rewards/rejected": -1.1643054485321045, "step": 2870 }, { "epoch": 2.0975342465753424, "grad_norm": 7.070064578267241, "learning_rate": 2.7214201841011293e-07, "logits/chosen": -2.8629708290100098, "logits/rejected": -2.568779706954956, "logps/chosen": -713.1378173828125, "logps/rejected": -563.2899780273438, "loss": 0.042, "rewards/accuracies": 1.0, "rewards/chosen": 2.780637264251709, "rewards/margins": 3.9962708950042725, "rewards/rejected": -1.2156333923339844, "step": 2871 }, { "epoch": 2.0982648401826482, "grad_norm": 11.533210236291204, "learning_rate": 2.719831366503122e-07, "logits/chosen": -2.7347288131713867, "logits/rejected": -2.2469592094421387, "logps/chosen": -673.5558471679688, "logps/rejected": -559.8635864257812, "loss": 0.058, "rewards/accuracies": 1.0, "rewards/chosen": 3.9892420768737793, "rewards/margins": 5.352745056152344, "rewards/rejected": -1.3635032176971436, "step": 2872 }, { "epoch": 2.098995433789954, "grad_norm": 17.083670760133053, "learning_rate": 2.718242459419483e-07, "logits/chosen": -3.174899101257324, "logits/rejected": -2.042390823364258, "logps/chosen": -598.4630737304688, "logps/rejected": -551.7789306640625, "loss": 0.0876, "rewards/accuracies": 1.0, "rewards/chosen": 2.818685531616211, "rewards/margins": 4.7976861000061035, "rewards/rejected": -1.9790005683898926, "step": 2873 }, { "epoch": 2.0997260273972604, "grad_norm": 10.65139758472157, "learning_rate": 2.7166534634970025e-07, "logits/chosen": -3.154475688934326, "logits/rejected": -2.2021548748016357, "logps/chosen": -700.143798828125, "logps/rejected": -479.61492919921875, "loss": 0.0459, "rewards/accuracies": 1.0, "rewards/chosen": 3.7429685592651367, "rewards/margins": 4.812596321105957, "rewards/rejected": -1.0696280002593994, "step": 2874 }, { "epoch": 2.1004566210045663, "grad_norm": 13.74852897404647, "learning_rate": 2.7150643793825053e-07, "logits/chosen": -3.16157603263855, "logits/rejected": -2.123056173324585, "logps/chosen": -854.207763671875, "logps/rejected": -602.9002685546875, "loss": 0.0625, "rewards/accuracies": 1.0, "rewards/chosen": 4.583884239196777, "rewards/margins": 4.6135711669921875, "rewards/rejected": -0.02968701720237732, "step": 2875 }, { "epoch": 2.101187214611872, "grad_norm": 10.111632388398299, "learning_rate": 2.7134752077228494e-07, "logits/chosen": -2.656038999557495, "logits/rejected": -2.5760960578918457, "logps/chosen": -652.5908813476562, "logps/rejected": -587.48388671875, "loss": 0.0481, "rewards/accuracies": 1.0, "rewards/chosen": 3.0208957195281982, "rewards/margins": 4.687455654144287, "rewards/rejected": -1.6665599346160889, "step": 2876 }, { "epoch": 2.101917808219178, "grad_norm": 22.244005402966156, "learning_rate": 2.7118859491649337e-07, "logits/chosen": -2.639852285385132, "logits/rejected": -2.0847630500793457, "logps/chosen": -576.6890869140625, "logps/rejected": -485.3173828125, "loss": 0.1713, "rewards/accuracies": 1.0, "rewards/chosen": 4.289801597595215, "rewards/margins": 6.115780353546143, "rewards/rejected": -1.825979232788086, "step": 2877 }, { "epoch": 2.102648401826484, "grad_norm": 20.06183449660426, "learning_rate": 2.710296604355687e-07, "logits/chosen": -2.9166831970214844, "logits/rejected": -2.28608775138855, "logps/chosen": -639.919677734375, "logps/rejected": -474.6485595703125, "loss": 0.0738, "rewards/accuracies": 1.0, "rewards/chosen": 3.465179681777954, "rewards/margins": 4.349239826202393, "rewards/rejected": -0.8840606212615967, "step": 2878 }, { "epoch": 2.10337899543379, "grad_norm": 10.988231953097738, "learning_rate": 2.708707173942077e-07, "logits/chosen": -2.9371798038482666, "logits/rejected": -1.7518466711044312, "logps/chosen": -606.3685302734375, "logps/rejected": -409.7336120605469, "loss": 0.0641, "rewards/accuracies": 1.0, "rewards/chosen": 2.86924409866333, "rewards/margins": 5.093676567077637, "rewards/rejected": -2.2244327068328857, "step": 2879 }, { "epoch": 2.1041095890410957, "grad_norm": 11.728111495171081, "learning_rate": 2.707117658571105e-07, "logits/chosen": -2.676046848297119, "logits/rejected": -2.0195178985595703, "logps/chosen": -587.0741577148438, "logps/rejected": -442.9754638671875, "loss": 0.0525, "rewards/accuracies": 1.0, "rewards/chosen": 4.856799125671387, "rewards/margins": 6.634403228759766, "rewards/rejected": -1.7776042222976685, "step": 2880 }, { "epoch": 2.104840182648402, "grad_norm": 16.716446248988227, "learning_rate": 2.705528058889807e-07, "logits/chosen": -2.708181858062744, "logits/rejected": -2.1064679622650146, "logps/chosen": -777.5584716796875, "logps/rejected": -623.4353637695312, "loss": 0.0619, "rewards/accuracies": 1.0, "rewards/chosen": 3.9278125762939453, "rewards/margins": 4.2850260734558105, "rewards/rejected": -0.3572137951850891, "step": 2881 }, { "epoch": 2.105570776255708, "grad_norm": 16.873069370173972, "learning_rate": 2.7039383755452523e-07, "logits/chosen": -3.1401212215423584, "logits/rejected": -2.8000717163085938, "logps/chosen": -766.6809692382812, "logps/rejected": -719.452880859375, "loss": 0.1128, "rewards/accuracies": 1.0, "rewards/chosen": 3.8323590755462646, "rewards/margins": 4.959713935852051, "rewards/rejected": -1.1273545026779175, "step": 2882 }, { "epoch": 2.1063013698630138, "grad_norm": 9.590015591637563, "learning_rate": 2.7023486091845467e-07, "logits/chosen": -2.990835189819336, "logits/rejected": -1.919644832611084, "logps/chosen": -813.7744140625, "logps/rejected": -536.6641235351562, "loss": 0.0383, "rewards/accuracies": 1.0, "rewards/chosen": 3.534069538116455, "rewards/margins": 4.630268573760986, "rewards/rejected": -1.0961990356445312, "step": 2883 }, { "epoch": 2.1070319634703196, "grad_norm": 11.654460370742315, "learning_rate": 2.7007587604548267e-07, "logits/chosen": -2.9970521926879883, "logits/rejected": -2.229076862335205, "logps/chosen": -490.3515319824219, "logps/rejected": -419.89935302734375, "loss": 0.0651, "rewards/accuracies": 1.0, "rewards/chosen": 2.717223644256592, "rewards/margins": 4.048781871795654, "rewards/rejected": -1.3315582275390625, "step": 2884 }, { "epoch": 2.1077625570776255, "grad_norm": 10.664728902598414, "learning_rate": 2.6991688300032647e-07, "logits/chosen": -2.5904295444488525, "logits/rejected": -2.558016777038574, "logps/chosen": -590.294921875, "logps/rejected": -724.0599365234375, "loss": 0.0436, "rewards/accuracies": 1.0, "rewards/chosen": 3.3992323875427246, "rewards/margins": 4.645425796508789, "rewards/rejected": -1.246193289756775, "step": 2885 }, { "epoch": 2.1084931506849314, "grad_norm": 8.39478299849358, "learning_rate": 2.697578818477065e-07, "logits/chosen": -2.5355937480926514, "logits/rejected": -2.890108108520508, "logps/chosen": -411.8811950683594, "logps/rejected": -546.157470703125, "loss": 0.0347, "rewards/accuracies": 1.0, "rewards/chosen": 0.9034607410430908, "rewards/margins": 4.578205585479736, "rewards/rejected": -3.6747450828552246, "step": 2886 }, { "epoch": 2.1092237442922372, "grad_norm": 17.329748712248758, "learning_rate": 2.695988726523466e-07, "logits/chosen": -2.831099033355713, "logits/rejected": -2.210205554962158, "logps/chosen": -541.2808837890625, "logps/rejected": -475.6600646972656, "loss": 0.0888, "rewards/accuracies": 1.0, "rewards/chosen": 2.1305525302886963, "rewards/margins": 3.6857399940490723, "rewards/rejected": -1.555187463760376, "step": 2887 }, { "epoch": 2.1099543378995436, "grad_norm": 10.675166637540503, "learning_rate": 2.694398554789739e-07, "logits/chosen": -3.0909955501556396, "logits/rejected": -2.683350086212158, "logps/chosen": -559.8866577148438, "logps/rejected": -456.826416015625, "loss": 0.0476, "rewards/accuracies": 1.0, "rewards/chosen": 3.3051958084106445, "rewards/margins": 6.176356792449951, "rewards/rejected": -2.8711609840393066, "step": 2888 }, { "epoch": 2.1106849315068494, "grad_norm": 8.177412735121504, "learning_rate": 2.692808303923186e-07, "logits/chosen": -3.080832004547119, "logits/rejected": -2.8675012588500977, "logps/chosen": -815.60498046875, "logps/rejected": -914.2713623046875, "loss": 0.0494, "rewards/accuracies": 1.0, "rewards/chosen": 4.0924787521362305, "rewards/margins": 4.6637797355651855, "rewards/rejected": -0.5713010430335999, "step": 2889 }, { "epoch": 2.1114155251141553, "grad_norm": 5.6068391726366205, "learning_rate": 2.6912179745711427e-07, "logits/chosen": -2.5755529403686523, "logits/rejected": -2.21745228767395, "logps/chosen": -809.69287109375, "logps/rejected": -873.5238037109375, "loss": 0.0232, "rewards/accuracies": 1.0, "rewards/chosen": 4.315925121307373, "rewards/margins": 6.799016952514648, "rewards/rejected": -2.4830918312072754, "step": 2890 }, { "epoch": 2.112146118721461, "grad_norm": 11.027573445624055, "learning_rate": 2.689627567380975e-07, "logits/chosen": -2.5644617080688477, "logits/rejected": -1.6376172304153442, "logps/chosen": -668.365234375, "logps/rejected": -390.469482421875, "loss": 0.0895, "rewards/accuracies": 0.875, "rewards/chosen": 2.4178054332733154, "rewards/margins": 4.526312828063965, "rewards/rejected": -2.1085071563720703, "step": 2891 }, { "epoch": 2.112876712328767, "grad_norm": 10.827655027905179, "learning_rate": 2.688037083000084e-07, "logits/chosen": -3.003324031829834, "logits/rejected": -2.2041616439819336, "logps/chosen": -751.7403564453125, "logps/rejected": -499.4893798828125, "loss": 0.0655, "rewards/accuracies": 1.0, "rewards/chosen": 3.1358275413513184, "rewards/margins": 4.240187168121338, "rewards/rejected": -1.1043596267700195, "step": 2892 }, { "epoch": 2.113607305936073, "grad_norm": 15.674734811197515, "learning_rate": 2.686446522075899e-07, "logits/chosen": -2.84648060798645, "logits/rejected": -1.8192706108093262, "logps/chosen": -654.147705078125, "logps/rejected": -437.28961181640625, "loss": 0.0738, "rewards/accuracies": 1.0, "rewards/chosen": 3.2087883949279785, "rewards/margins": 3.9692976474761963, "rewards/rejected": -0.7605091333389282, "step": 2893 }, { "epoch": 2.114337899543379, "grad_norm": 13.104574531713212, "learning_rate": 2.684855885255882e-07, "logits/chosen": -2.5280838012695312, "logits/rejected": -2.175957202911377, "logps/chosen": -623.2987670898438, "logps/rejected": -598.5313720703125, "loss": 0.065, "rewards/accuracies": 1.0, "rewards/chosen": 3.311807632446289, "rewards/margins": 4.29599142074585, "rewards/rejected": -0.9841840267181396, "step": 2894 }, { "epoch": 2.115068493150685, "grad_norm": 10.64540707406067, "learning_rate": 2.6832651731875246e-07, "logits/chosen": -2.638490915298462, "logits/rejected": -2.3380913734436035, "logps/chosen": -713.5505981445312, "logps/rejected": -733.053955078125, "loss": 0.0435, "rewards/accuracies": 1.0, "rewards/chosen": 4.868085861206055, "rewards/margins": 5.940957546234131, "rewards/rejected": -1.0728718042373657, "step": 2895 }, { "epoch": 2.115799086757991, "grad_norm": 11.729019588725023, "learning_rate": 2.6816743865183497e-07, "logits/chosen": -2.8888988494873047, "logits/rejected": -2.1713948249816895, "logps/chosen": -542.578857421875, "logps/rejected": -504.5234069824219, "loss": 0.097, "rewards/accuracies": 1.0, "rewards/chosen": 3.831448793411255, "rewards/margins": 5.945219993591309, "rewards/rejected": -2.113771438598633, "step": 2896 }, { "epoch": 2.116529680365297, "grad_norm": 15.557762068218201, "learning_rate": 2.6800835258959113e-07, "logits/chosen": -2.6245265007019043, "logits/rejected": -2.396770477294922, "logps/chosen": -533.61181640625, "logps/rejected": -457.46600341796875, "loss": 0.0869, "rewards/accuracies": 1.0, "rewards/chosen": 2.216010332107544, "rewards/margins": 5.068706512451172, "rewards/rejected": -2.852695941925049, "step": 2897 }, { "epoch": 2.1172602739726027, "grad_norm": 13.471578562109123, "learning_rate": 2.678492591967794e-07, "logits/chosen": -3.415217638015747, "logits/rejected": -2.512603282928467, "logps/chosen": -1136.3079833984375, "logps/rejected": -766.48388671875, "loss": 0.0647, "rewards/accuracies": 1.0, "rewards/chosen": 4.3772172927856445, "rewards/margins": 3.524047613143921, "rewards/rejected": 0.8531696200370789, "step": 2898 }, { "epoch": 2.1179908675799086, "grad_norm": 11.858969056993384, "learning_rate": 2.676901585381608e-07, "logits/chosen": -2.7824995517730713, "logits/rejected": -2.3157567977905273, "logps/chosen": -386.3795166015625, "logps/rejected": -337.1032409667969, "loss": 0.0842, "rewards/accuracies": 1.0, "rewards/chosen": 3.212430477142334, "rewards/margins": 4.745660781860352, "rewards/rejected": -1.533230185508728, "step": 2899 }, { "epoch": 2.1187214611872145, "grad_norm": 15.713494516246518, "learning_rate": 2.675310506785e-07, "logits/chosen": -2.579899311065674, "logits/rejected": -1.7570830583572388, "logps/chosen": -753.8355712890625, "logps/rejected": -454.06781005859375, "loss": 0.084, "rewards/accuracies": 1.0, "rewards/chosen": 3.4907026290893555, "rewards/margins": 4.779290199279785, "rewards/rejected": -1.288587212562561, "step": 2900 }, { "epoch": 2.1194520547945204, "grad_norm": 11.729403360710304, "learning_rate": 2.6737193568256395e-07, "logits/chosen": -2.813486337661743, "logits/rejected": -2.0944812297821045, "logps/chosen": -791.1646728515625, "logps/rejected": -546.1519165039062, "loss": 0.0621, "rewards/accuracies": 1.0, "rewards/chosen": 3.8891959190368652, "rewards/margins": 5.124998092651367, "rewards/rejected": -1.235802173614502, "step": 2901 }, { "epoch": 2.1201826484018267, "grad_norm": 9.056571336604156, "learning_rate": 2.672128136151228e-07, "logits/chosen": -2.8227462768554688, "logits/rejected": -1.214902639389038, "logps/chosen": -765.2940063476562, "logps/rejected": -339.8107604980469, "loss": 0.0414, "rewards/accuracies": 1.0, "rewards/chosen": 3.9603021144866943, "rewards/margins": 6.58408260345459, "rewards/rejected": -2.6237807273864746, "step": 2902 }, { "epoch": 2.1209132420091326, "grad_norm": 13.35179506727671, "learning_rate": 2.6705368454094967e-07, "logits/chosen": -2.8098480701446533, "logits/rejected": -2.1190733909606934, "logps/chosen": -567.041748046875, "logps/rejected": -568.9288940429688, "loss": 0.0587, "rewards/accuracies": 1.0, "rewards/chosen": 4.632410526275635, "rewards/margins": 6.263755798339844, "rewards/rejected": -1.631345272064209, "step": 2903 }, { "epoch": 2.1216438356164384, "grad_norm": 10.715227176538265, "learning_rate": 2.668945485248204e-07, "logits/chosen": -2.656024932861328, "logits/rejected": -2.314621925354004, "logps/chosen": -679.9314575195312, "logps/rejected": -667.3499755859375, "loss": 0.0527, "rewards/accuracies": 1.0, "rewards/chosen": 4.619641304016113, "rewards/margins": 4.960056781768799, "rewards/rejected": -0.3404158353805542, "step": 2904 }, { "epoch": 2.1223744292237443, "grad_norm": 12.101754154389406, "learning_rate": 2.667354056315137e-07, "logits/chosen": -2.707651138305664, "logits/rejected": -1.6334452629089355, "logps/chosen": -615.4617919921875, "logps/rejected": -374.19195556640625, "loss": 0.0559, "rewards/accuracies": 1.0, "rewards/chosen": 2.1851227283477783, "rewards/margins": 3.963840961456299, "rewards/rejected": -1.7787182331085205, "step": 2905 }, { "epoch": 2.12310502283105, "grad_norm": 12.547797410248254, "learning_rate": 2.665762559258109e-07, "logits/chosen": -2.9553864002227783, "logits/rejected": -3.1295127868652344, "logps/chosen": -317.1142883300781, "logps/rejected": -436.677978515625, "loss": 0.0686, "rewards/accuracies": 1.0, "rewards/chosen": 2.362022638320923, "rewards/margins": 6.036381244659424, "rewards/rejected": -3.6743581295013428, "step": 2906 }, { "epoch": 2.123835616438356, "grad_norm": 14.24535146818598, "learning_rate": 2.664170994724964e-07, "logits/chosen": -3.086822509765625, "logits/rejected": -2.5694384574890137, "logps/chosen": -923.0802001953125, "logps/rejected": -821.2051391601562, "loss": 0.0956, "rewards/accuracies": 1.0, "rewards/chosen": 4.521116256713867, "rewards/margins": 4.783915996551514, "rewards/rejected": -0.2627997398376465, "step": 2907 }, { "epoch": 2.124566210045662, "grad_norm": 13.487564576058919, "learning_rate": 2.662579363363572e-07, "logits/chosen": -2.706454038619995, "logits/rejected": -2.5705714225769043, "logps/chosen": -493.9635009765625, "logps/rejected": -601.138427734375, "loss": 0.0727, "rewards/accuracies": 1.0, "rewards/chosen": 2.083606243133545, "rewards/margins": 4.967164993286133, "rewards/rejected": -2.883558988571167, "step": 2908 }, { "epoch": 2.125296803652968, "grad_norm": 9.182341288610838, "learning_rate": 2.660987665821829e-07, "logits/chosen": -2.967378616333008, "logits/rejected": -2.148090362548828, "logps/chosen": -485.2866516113281, "logps/rejected": -533.1578979492188, "loss": 0.0513, "rewards/accuracies": 1.0, "rewards/chosen": 3.1787288188934326, "rewards/margins": 4.827203750610352, "rewards/rejected": -1.6484754085540771, "step": 2909 }, { "epoch": 2.126027397260274, "grad_norm": 11.123197945363478, "learning_rate": 2.6593959027476595e-07, "logits/chosen": -2.644507884979248, "logits/rejected": -2.0144145488739014, "logps/chosen": -575.2493896484375, "logps/rejected": -476.5867919921875, "loss": 0.044, "rewards/accuracies": 1.0, "rewards/chosen": 3.235917568206787, "rewards/margins": 4.142790794372559, "rewards/rejected": -0.9068729281425476, "step": 2910 }, { "epoch": 2.12675799086758, "grad_norm": 8.919887020610139, "learning_rate": 2.6578040747890156e-07, "logits/chosen": -2.829207420349121, "logits/rejected": -2.0587806701660156, "logps/chosen": -767.8707275390625, "logps/rejected": -547.934326171875, "loss": 0.073, "rewards/accuracies": 1.0, "rewards/chosen": 4.694872856140137, "rewards/margins": 6.133979320526123, "rewards/rejected": -1.4391069412231445, "step": 2911 }, { "epoch": 2.127488584474886, "grad_norm": 6.660635349795052, "learning_rate": 2.656212182593874e-07, "logits/chosen": -3.3391194343566895, "logits/rejected": -2.2960848808288574, "logps/chosen": -532.4107666015625, "logps/rejected": -373.28765869140625, "loss": 0.0287, "rewards/accuracies": 1.0, "rewards/chosen": 2.1171789169311523, "rewards/margins": 5.222977161407471, "rewards/rejected": -3.1057982444763184, "step": 2912 }, { "epoch": 2.1282191780821917, "grad_norm": 8.828782642816707, "learning_rate": 2.6546202268102383e-07, "logits/chosen": -2.84305477142334, "logits/rejected": -2.420304775238037, "logps/chosen": -806.2139282226562, "logps/rejected": -622.1148071289062, "loss": 0.0492, "rewards/accuracies": 1.0, "rewards/chosen": 4.054957866668701, "rewards/margins": 5.140003681182861, "rewards/rejected": -1.085045576095581, "step": 2913 }, { "epoch": 2.1289497716894976, "grad_norm": 16.892938418410314, "learning_rate": 2.653028208086137e-07, "logits/chosen": -2.455660104751587, "logits/rejected": -2.649947166442871, "logps/chosen": -473.2277526855469, "logps/rejected": -655.9326171875, "loss": 0.0879, "rewards/accuracies": 1.0, "rewards/chosen": 3.8641104698181152, "rewards/margins": 5.51788854598999, "rewards/rejected": -1.6537779569625854, "step": 2914 }, { "epoch": 2.1296803652968035, "grad_norm": 8.524094483731286, "learning_rate": 2.6514361270696253e-07, "logits/chosen": -2.5188417434692383, "logits/rejected": -2.6163241863250732, "logps/chosen": -499.6536560058594, "logps/rejected": -704.6556396484375, "loss": 0.0608, "rewards/accuracies": 1.0, "rewards/chosen": 3.538759469985962, "rewards/margins": 6.065150737762451, "rewards/rejected": -2.52639102935791, "step": 2915 }, { "epoch": 2.1304109589041094, "grad_norm": 9.465119424379887, "learning_rate": 2.649843984408784e-07, "logits/chosen": -2.405787944793701, "logits/rejected": -2.799510955810547, "logps/chosen": -607.9594116210938, "logps/rejected": -661.92431640625, "loss": 0.0415, "rewards/accuracies": 1.0, "rewards/chosen": 1.9149854183197021, "rewards/margins": 3.0630440711975098, "rewards/rejected": -1.1480584144592285, "step": 2916 }, { "epoch": 2.1311415525114157, "grad_norm": 8.161373473121632, "learning_rate": 2.648251780751718e-07, "logits/chosen": -2.5190656185150146, "logits/rejected": -2.1493029594421387, "logps/chosen": -778.5838623046875, "logps/rejected": -715.0106811523438, "loss": 0.0374, "rewards/accuracies": 1.0, "rewards/chosen": 5.172938346862793, "rewards/margins": 5.794621467590332, "rewards/rejected": -0.6216830611228943, "step": 2917 }, { "epoch": 2.1318721461187216, "grad_norm": 13.718019227048483, "learning_rate": 2.6466595167465584e-07, "logits/chosen": -2.9051990509033203, "logits/rejected": -2.5138158798217773, "logps/chosen": -492.374755859375, "logps/rejected": -494.71685791015625, "loss": 0.0699, "rewards/accuracies": 1.0, "rewards/chosen": 3.9302377700805664, "rewards/margins": 5.029046058654785, "rewards/rejected": -1.0988086462020874, "step": 2918 }, { "epoch": 2.1326027397260274, "grad_norm": 15.983641845356301, "learning_rate": 2.645067193041458e-07, "logits/chosen": -3.082303762435913, "logits/rejected": -2.3127822875976562, "logps/chosen": -742.6251220703125, "logps/rejected": -516.5982666015625, "loss": 0.0849, "rewards/accuracies": 0.875, "rewards/chosen": 1.6380324363708496, "rewards/margins": 3.296809196472168, "rewards/rejected": -1.6587766408920288, "step": 2919 }, { "epoch": 2.1333333333333333, "grad_norm": 10.896230385270252, "learning_rate": 2.643474810284597e-07, "logits/chosen": -2.589540481567383, "logits/rejected": -2.6741652488708496, "logps/chosen": -592.732666015625, "logps/rejected": -524.4219970703125, "loss": 0.0661, "rewards/accuracies": 1.0, "rewards/chosen": 1.8365718126296997, "rewards/margins": 3.7073235511779785, "rewards/rejected": -1.870751976966858, "step": 2920 }, { "epoch": 2.134063926940639, "grad_norm": 9.923988609765622, "learning_rate": 2.6418823691241795e-07, "logits/chosen": -3.0698812007904053, "logits/rejected": -1.934508204460144, "logps/chosen": -494.4476318359375, "logps/rejected": -290.3399963378906, "loss": 0.0515, "rewards/accuracies": 1.0, "rewards/chosen": 3.6060328483581543, "rewards/margins": 5.635668754577637, "rewards/rejected": -2.0296359062194824, "step": 2921 }, { "epoch": 2.134794520547945, "grad_norm": 8.923876215604169, "learning_rate": 2.640289870208431e-07, "logits/chosen": -2.4659078121185303, "logits/rejected": -3.003028631210327, "logps/chosen": -704.1190795898438, "logps/rejected": -909.0921630859375, "loss": 0.0442, "rewards/accuracies": 1.0, "rewards/chosen": 3.5439696311950684, "rewards/margins": 4.089376926422119, "rewards/rejected": -0.5454070568084717, "step": 2922 }, { "epoch": 2.135525114155251, "grad_norm": 13.459322645990294, "learning_rate": 2.6386973141856024e-07, "logits/chosen": -2.585191249847412, "logits/rejected": -2.2952427864074707, "logps/chosen": -454.3131103515625, "logps/rejected": -532.0879516601562, "loss": 0.0603, "rewards/accuracies": 1.0, "rewards/chosen": 3.1003079414367676, "rewards/margins": 4.674229621887207, "rewards/rejected": -1.5739216804504395, "step": 2923 }, { "epoch": 2.1362557077625572, "grad_norm": 10.990084253739681, "learning_rate": 2.637104701703967e-07, "logits/chosen": -3.110903024673462, "logits/rejected": -2.3039674758911133, "logps/chosen": -937.4423217773438, "logps/rejected": -744.6112060546875, "loss": 0.0577, "rewards/accuracies": 1.0, "rewards/chosen": 6.042755126953125, "rewards/margins": 5.7010393142700195, "rewards/rejected": 0.3417160212993622, "step": 2924 }, { "epoch": 2.136986301369863, "grad_norm": 9.902425804023942, "learning_rate": 2.6355120334118225e-07, "logits/chosen": -2.697343111038208, "logits/rejected": -2.525028944015503, "logps/chosen": -690.9295043945312, "logps/rejected": -609.5602416992188, "loss": 0.0486, "rewards/accuracies": 1.0, "rewards/chosen": 3.28410005569458, "rewards/margins": 5.513356685638428, "rewards/rejected": -2.2292566299438477, "step": 2925 }, { "epoch": 2.137716894977169, "grad_norm": 15.47684890967317, "learning_rate": 2.633919309957486e-07, "logits/chosen": -2.068270444869995, "logits/rejected": -2.778639793395996, "logps/chosen": -530.0078125, "logps/rejected": -622.6019287109375, "loss": 0.0719, "rewards/accuracies": 1.0, "rewards/chosen": 3.8189830780029297, "rewards/margins": 5.833942413330078, "rewards/rejected": -2.0149595737457275, "step": 2926 }, { "epoch": 2.138447488584475, "grad_norm": 31.217017750990003, "learning_rate": 2.632326531989302e-07, "logits/chosen": -2.4271068572998047, "logits/rejected": -2.1606647968292236, "logps/chosen": -497.0567626953125, "logps/rejected": -448.0310974121094, "loss": 0.1363, "rewards/accuracies": 0.875, "rewards/chosen": 1.8223059177398682, "rewards/margins": 4.113475322723389, "rewards/rejected": -2.2911696434020996, "step": 2927 }, { "epoch": 2.1391780821917807, "grad_norm": 13.159003687656517, "learning_rate": 2.630733700155633e-07, "logits/chosen": -2.9898641109466553, "logits/rejected": -2.219019651412964, "logps/chosen": -927.665283203125, "logps/rejected": -563.829833984375, "loss": 0.0767, "rewards/accuracies": 1.0, "rewards/chosen": 5.292443752288818, "rewards/margins": 6.258519649505615, "rewards/rejected": -0.966076135635376, "step": 2928 }, { "epoch": 2.1399086757990866, "grad_norm": 9.270522986085892, "learning_rate": 2.629140815104865e-07, "logits/chosen": -3.42747163772583, "logits/rejected": -2.3948700428009033, "logps/chosen": -828.9371948242188, "logps/rejected": -597.7677001953125, "loss": 0.03, "rewards/accuracies": 1.0, "rewards/chosen": 3.184845447540283, "rewards/margins": 4.473236083984375, "rewards/rejected": -1.2883906364440918, "step": 2929 }, { "epoch": 2.1406392694063925, "grad_norm": 18.570449719465685, "learning_rate": 2.6275478774854053e-07, "logits/chosen": -2.655954599380493, "logits/rejected": -2.614168882369995, "logps/chosen": -747.1703491210938, "logps/rejected": -788.8277587890625, "loss": 0.1058, "rewards/accuracies": 1.0, "rewards/chosen": 3.4487104415893555, "rewards/margins": 5.266547203063965, "rewards/rejected": -1.8178361654281616, "step": 2930 }, { "epoch": 2.141369863013699, "grad_norm": 13.29416364376882, "learning_rate": 2.625954887945684e-07, "logits/chosen": -3.1210741996765137, "logits/rejected": -2.147648334503174, "logps/chosen": -740.7954711914062, "logps/rejected": -428.10455322265625, "loss": 0.0546, "rewards/accuracies": 1.0, "rewards/chosen": 3.2406673431396484, "rewards/margins": 5.162811279296875, "rewards/rejected": -1.9221434593200684, "step": 2931 }, { "epoch": 2.1421004566210047, "grad_norm": 12.709301187316393, "learning_rate": 2.6243618471341497e-07, "logits/chosen": -2.9890496730804443, "logits/rejected": -2.581810712814331, "logps/chosen": -718.2674560546875, "logps/rejected": -557.760498046875, "loss": 0.0619, "rewards/accuracies": 1.0, "rewards/chosen": 3.6247756481170654, "rewards/margins": 5.650918960571289, "rewards/rejected": -2.0261430740356445, "step": 2932 }, { "epoch": 2.1428310502283106, "grad_norm": 12.536389293514887, "learning_rate": 2.622768755699275e-07, "logits/chosen": -2.532155990600586, "logits/rejected": -2.265590190887451, "logps/chosen": -707.8115844726562, "logps/rejected": -654.6483154296875, "loss": 0.0593, "rewards/accuracies": 1.0, "rewards/chosen": 3.339369297027588, "rewards/margins": 4.47441291809082, "rewards/rejected": -1.1350436210632324, "step": 2933 }, { "epoch": 2.1435616438356164, "grad_norm": 8.544909598205466, "learning_rate": 2.6211756142895497e-07, "logits/chosen": -2.872274160385132, "logits/rejected": -2.2397119998931885, "logps/chosen": -665.733642578125, "logps/rejected": -494.4529113769531, "loss": 0.0485, "rewards/accuracies": 1.0, "rewards/chosen": 2.1908278465270996, "rewards/margins": 3.80527925491333, "rewards/rejected": -1.6144511699676514, "step": 2934 }, { "epoch": 2.1442922374429223, "grad_norm": 7.7532260702142315, "learning_rate": 2.619582423553488e-07, "logits/chosen": -3.046149730682373, "logits/rejected": -2.614271402359009, "logps/chosen": -516.96142578125, "logps/rejected": -616.7655639648438, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": 2.9168765544891357, "rewards/margins": 5.051979064941406, "rewards/rejected": -2.1351022720336914, "step": 2935 }, { "epoch": 2.145022831050228, "grad_norm": 8.754775569703595, "learning_rate": 2.6179891841396196e-07, "logits/chosen": -3.0128653049468994, "logits/rejected": -1.6716513633728027, "logps/chosen": -732.3638305664062, "logps/rejected": -414.10577392578125, "loss": 0.0367, "rewards/accuracies": 1.0, "rewards/chosen": 3.5538837909698486, "rewards/margins": 5.3160552978515625, "rewards/rejected": -1.7621711492538452, "step": 2936 }, { "epoch": 2.145753424657534, "grad_norm": 12.991660744545602, "learning_rate": 2.6163958966964974e-07, "logits/chosen": -2.4202003479003906, "logits/rejected": -2.3388547897338867, "logps/chosen": -454.7049255371094, "logps/rejected": -437.043212890625, "loss": 0.0686, "rewards/accuracies": 1.0, "rewards/chosen": 1.8627734184265137, "rewards/margins": 4.661257266998291, "rewards/rejected": -2.7984838485717773, "step": 2937 }, { "epoch": 2.1464840182648404, "grad_norm": 12.805781393939808, "learning_rate": 2.614802561872692e-07, "logits/chosen": -3.0789337158203125, "logits/rejected": -2.8660178184509277, "logps/chosen": -555.9315185546875, "logps/rejected": -674.3228149414062, "loss": 0.0967, "rewards/accuracies": 1.0, "rewards/chosen": 2.1070775985717773, "rewards/margins": 3.352552890777588, "rewards/rejected": -1.2454754114151, "step": 2938 }, { "epoch": 2.1472146118721462, "grad_norm": 14.590362482411603, "learning_rate": 2.6132091803167957e-07, "logits/chosen": -2.8074851036071777, "logits/rejected": -2.4666943550109863, "logps/chosen": -737.8165893554688, "logps/rejected": -698.4338989257812, "loss": 0.0544, "rewards/accuracies": 1.0, "rewards/chosen": 3.1190903186798096, "rewards/margins": 4.510251045227051, "rewards/rejected": -1.3911606073379517, "step": 2939 }, { "epoch": 2.147945205479452, "grad_norm": 9.819803229262071, "learning_rate": 2.6116157526774175e-07, "logits/chosen": -2.941532611846924, "logits/rejected": -2.4303207397460938, "logps/chosen": -714.0259399414062, "logps/rejected": -663.67724609375, "loss": 0.0455, "rewards/accuracies": 1.0, "rewards/chosen": 3.5101912021636963, "rewards/margins": 3.6011009216308594, "rewards/rejected": -0.09090995788574219, "step": 2940 }, { "epoch": 2.148675799086758, "grad_norm": 9.042455776487191, "learning_rate": 2.6100222796031847e-07, "logits/chosen": -2.925769567489624, "logits/rejected": -1.5657451152801514, "logps/chosen": -535.865478515625, "logps/rejected": -317.91192626953125, "loss": 0.0377, "rewards/accuracies": 1.0, "rewards/chosen": 5.142682075500488, "rewards/margins": 7.928149223327637, "rewards/rejected": -2.7854666709899902, "step": 2941 }, { "epoch": 2.149406392694064, "grad_norm": 20.722144859971813, "learning_rate": 2.608428761742746e-07, "logits/chosen": -2.7928895950317383, "logits/rejected": -2.1664228439331055, "logps/chosen": -750.4032592773438, "logps/rejected": -445.0537109375, "loss": 0.083, "rewards/accuracies": 1.0, "rewards/chosen": 3.192723035812378, "rewards/margins": 3.084047794342041, "rewards/rejected": 0.10867512226104736, "step": 2942 }, { "epoch": 2.1501369863013697, "grad_norm": 11.59070672659243, "learning_rate": 2.6068351997447645e-07, "logits/chosen": -2.5113916397094727, "logits/rejected": -1.4121447801589966, "logps/chosen": -749.7115478515625, "logps/rejected": -448.0301208496094, "loss": 0.0672, "rewards/accuracies": 1.0, "rewards/chosen": 3.6314353942871094, "rewards/margins": 4.03048849105835, "rewards/rejected": -0.39905333518981934, "step": 2943 }, { "epoch": 2.1508675799086756, "grad_norm": 10.581540974707515, "learning_rate": 2.6052415942579246e-07, "logits/chosen": -2.34120774269104, "logits/rejected": -2.0376784801483154, "logps/chosen": -528.296630859375, "logps/rejected": -515.05712890625, "loss": 0.0482, "rewards/accuracies": 1.0, "rewards/chosen": 2.694812774658203, "rewards/margins": 5.243175983428955, "rewards/rejected": -2.548363447189331, "step": 2944 }, { "epoch": 2.151598173515982, "grad_norm": 9.585797023598857, "learning_rate": 2.603647945930928e-07, "logits/chosen": -2.302835702896118, "logits/rejected": -2.5773305892944336, "logps/chosen": -490.9560241699219, "logps/rejected": -578.9459228515625, "loss": 0.0517, "rewards/accuracies": 1.0, "rewards/chosen": 1.893578052520752, "rewards/margins": 4.715311527252197, "rewards/rejected": -2.8217334747314453, "step": 2945 }, { "epoch": 2.152328767123288, "grad_norm": 12.334414735415473, "learning_rate": 2.6020542554124913e-07, "logits/chosen": -2.370741367340088, "logits/rejected": -2.1094820499420166, "logps/chosen": -461.35443115234375, "logps/rejected": -524.98583984375, "loss": 0.0695, "rewards/accuracies": 1.0, "rewards/chosen": 3.2615599632263184, "rewards/margins": 4.818809986114502, "rewards/rejected": -1.5572500228881836, "step": 2946 }, { "epoch": 2.1530593607305937, "grad_norm": 5.975790135958879, "learning_rate": 2.600460523351351e-07, "logits/chosen": -3.0092616081237793, "logits/rejected": -2.3049535751342773, "logps/chosen": -788.3590087890625, "logps/rejected": -529.660400390625, "loss": 0.024, "rewards/accuracies": 1.0, "rewards/chosen": 3.7320454120635986, "rewards/margins": 4.346678733825684, "rewards/rejected": -0.6146329641342163, "step": 2947 }, { "epoch": 2.1537899543378995, "grad_norm": 11.384939022579417, "learning_rate": 2.598866750396259e-07, "logits/chosen": -3.4195520877838135, "logits/rejected": -2.667757511138916, "logps/chosen": -616.857421875, "logps/rejected": -475.04852294921875, "loss": 0.0519, "rewards/accuracies": 1.0, "rewards/chosen": 2.893284320831299, "rewards/margins": 3.8797788619995117, "rewards/rejected": -0.9864947199821472, "step": 2948 }, { "epoch": 2.1545205479452054, "grad_norm": 16.691595025114342, "learning_rate": 2.597272937195984e-07, "logits/chosen": -2.6376302242279053, "logits/rejected": -2.4279067516326904, "logps/chosen": -596.588134765625, "logps/rejected": -417.3347473144531, "loss": 0.0911, "rewards/accuracies": 1.0, "rewards/chosen": 1.8628947734832764, "rewards/margins": 3.748445987701416, "rewards/rejected": -1.8855514526367188, "step": 2949 }, { "epoch": 2.1552511415525113, "grad_norm": 7.802677174383577, "learning_rate": 2.595679084399312e-07, "logits/chosen": -2.6831986904144287, "logits/rejected": -1.796818733215332, "logps/chosen": -879.610595703125, "logps/rejected": -840.62646484375, "loss": 0.0354, "rewards/accuracies": 1.0, "rewards/chosen": 4.265359878540039, "rewards/margins": 4.887552261352539, "rewards/rejected": -0.6221919655799866, "step": 2950 }, { "epoch": 2.155981735159817, "grad_norm": 8.865665738775585, "learning_rate": 2.594085192655045e-07, "logits/chosen": -2.8955507278442383, "logits/rejected": -1.5650181770324707, "logps/chosen": -733.2144775390625, "logps/rejected": -364.0687561035156, "loss": 0.0419, "rewards/accuracies": 1.0, "rewards/chosen": 4.067976474761963, "rewards/margins": 5.212413787841797, "rewards/rejected": -1.1444374322891235, "step": 2951 }, { "epoch": 2.1567123287671235, "grad_norm": 8.77061692889825, "learning_rate": 2.592491262611999e-07, "logits/chosen": -2.944765090942383, "logits/rejected": -2.1033685207366943, "logps/chosen": -840.3890380859375, "logps/rejected": -734.630126953125, "loss": 0.0384, "rewards/accuracies": 1.0, "rewards/chosen": 4.01704216003418, "rewards/margins": 5.527905464172363, "rewards/rejected": -1.510862946510315, "step": 2952 }, { "epoch": 2.1574429223744294, "grad_norm": 14.827652146866562, "learning_rate": 2.5908972949190083e-07, "logits/chosen": -3.0039353370666504, "logits/rejected": -2.194945812225342, "logps/chosen": -712.6398315429688, "logps/rejected": -524.859619140625, "loss": 0.0705, "rewards/accuracies": 1.0, "rewards/chosen": 5.202521324157715, "rewards/margins": 5.405801296234131, "rewards/rejected": -0.20327982306480408, "step": 2953 }, { "epoch": 2.1581735159817352, "grad_norm": 8.903990341770799, "learning_rate": 2.58930329022492e-07, "logits/chosen": -2.583928346633911, "logits/rejected": -2.6141915321350098, "logps/chosen": -647.5594482421875, "logps/rejected": -684.4022216796875, "loss": 0.0459, "rewards/accuracies": 1.0, "rewards/chosen": 2.2330758571624756, "rewards/margins": 4.621622085571289, "rewards/rejected": -2.3885464668273926, "step": 2954 }, { "epoch": 2.158904109589041, "grad_norm": 15.536176241816932, "learning_rate": 2.587709249178598e-07, "logits/chosen": -2.397934913635254, "logits/rejected": -2.1809120178222656, "logps/chosen": -938.4284057617188, "logps/rejected": -708.9691162109375, "loss": 0.0641, "rewards/accuracies": 1.0, "rewards/chosen": 3.858161449432373, "rewards/margins": 3.985119581222534, "rewards/rejected": -0.12695828080177307, "step": 2955 }, { "epoch": 2.159634703196347, "grad_norm": 11.522757527607663, "learning_rate": 2.5861151724289205e-07, "logits/chosen": -2.625321626663208, "logits/rejected": -2.587892770767212, "logps/chosen": -421.59173583984375, "logps/rejected": -535.323486328125, "loss": 0.0532, "rewards/accuracies": 1.0, "rewards/chosen": 2.8472771644592285, "rewards/margins": 3.7542853355407715, "rewards/rejected": -0.9070079326629639, "step": 2956 }, { "epoch": 2.160365296803653, "grad_norm": 8.714477804956871, "learning_rate": 2.58452106062478e-07, "logits/chosen": -2.7059807777404785, "logits/rejected": -2.0822484493255615, "logps/chosen": -505.07476806640625, "logps/rejected": -327.851318359375, "loss": 0.0412, "rewards/accuracies": 1.0, "rewards/chosen": 1.3966107368469238, "rewards/margins": 3.247321128845215, "rewards/rejected": -1.8507105112075806, "step": 2957 }, { "epoch": 2.1610958904109587, "grad_norm": 17.34799620740417, "learning_rate": 2.5829269144150834e-07, "logits/chosen": -2.530728340148926, "logits/rejected": -1.918304204940796, "logps/chosen": -263.2116394042969, "logps/rejected": -286.351318359375, "loss": 0.0966, "rewards/accuracies": 0.875, "rewards/chosen": 3.0254859924316406, "rewards/margins": 4.777180194854736, "rewards/rejected": -1.7516943216323853, "step": 2958 }, { "epoch": 2.161826484018265, "grad_norm": 9.577139226520629, "learning_rate": 2.581332734448752e-07, "logits/chosen": -3.327181816101074, "logits/rejected": -2.2046759128570557, "logps/chosen": -911.6367797851562, "logps/rejected": -582.4961547851562, "loss": 0.0426, "rewards/accuracies": 1.0, "rewards/chosen": 2.7069857120513916, "rewards/margins": 3.4539854526519775, "rewards/rejected": -0.7470000386238098, "step": 2959 }, { "epoch": 2.162557077625571, "grad_norm": 9.962676012501769, "learning_rate": 2.5797385213747204e-07, "logits/chosen": -2.8233823776245117, "logits/rejected": -1.753644347190857, "logps/chosen": -487.22235107421875, "logps/rejected": -331.4952697753906, "loss": 0.0622, "rewards/accuracies": 1.0, "rewards/chosen": 3.7159311771392822, "rewards/margins": 5.471157550811768, "rewards/rejected": -1.755226492881775, "step": 2960 }, { "epoch": 2.163287671232877, "grad_norm": 9.705901358410728, "learning_rate": 2.5781442758419356e-07, "logits/chosen": -2.650123119354248, "logits/rejected": -1.8109028339385986, "logps/chosen": -762.0885620117188, "logps/rejected": -451.74310302734375, "loss": 0.0417, "rewards/accuracies": 1.0, "rewards/chosen": 4.026769161224365, "rewards/margins": 5.270087718963623, "rewards/rejected": -1.2433178424835205, "step": 2961 }, { "epoch": 2.1640182648401827, "grad_norm": 11.459470232424032, "learning_rate": 2.576549998499361e-07, "logits/chosen": -2.2822892665863037, "logits/rejected": -2.0889174938201904, "logps/chosen": -330.313232421875, "logps/rejected": -437.46441650390625, "loss": 0.0574, "rewards/accuracies": 1.0, "rewards/chosen": 2.2077252864837646, "rewards/margins": 4.420284748077393, "rewards/rejected": -2.212559700012207, "step": 2962 }, { "epoch": 2.1647488584474885, "grad_norm": 6.025473561017548, "learning_rate": 2.5749556899959696e-07, "logits/chosen": -3.150693893432617, "logits/rejected": -2.377253770828247, "logps/chosen": -704.6415405273438, "logps/rejected": -610.68701171875, "loss": 0.0335, "rewards/accuracies": 1.0, "rewards/chosen": 4.101377487182617, "rewards/margins": 5.353968620300293, "rewards/rejected": -1.2525912523269653, "step": 2963 }, { "epoch": 2.1654794520547944, "grad_norm": 14.44773706582599, "learning_rate": 2.5733613509807494e-07, "logits/chosen": -2.374483346939087, "logits/rejected": -2.2667856216430664, "logps/chosen": -609.670166015625, "logps/rejected": -572.6390380859375, "loss": 0.1176, "rewards/accuracies": 1.0, "rewards/chosen": 2.5959036350250244, "rewards/margins": 2.6379151344299316, "rewards/rejected": -0.04201102256774902, "step": 2964 }, { "epoch": 2.1662100456621003, "grad_norm": 6.235663488796345, "learning_rate": 2.571766982102698e-07, "logits/chosen": -2.8456521034240723, "logits/rejected": -2.365605354309082, "logps/chosen": -945.0076904296875, "logps/rejected": -857.8086547851562, "loss": 0.0206, "rewards/accuracies": 1.0, "rewards/chosen": 3.4975128173828125, "rewards/margins": 4.0498046875, "rewards/rejected": -0.5522919297218323, "step": 2965 }, { "epoch": 2.1669406392694066, "grad_norm": 7.6339931297281325, "learning_rate": 2.570172584010829e-07, "logits/chosen": -2.717073440551758, "logits/rejected": -2.256593704223633, "logps/chosen": -557.528076171875, "logps/rejected": -596.252685546875, "loss": 0.0343, "rewards/accuracies": 1.0, "rewards/chosen": 4.048643589019775, "rewards/margins": 4.953080654144287, "rewards/rejected": -0.9044373631477356, "step": 2966 }, { "epoch": 2.1676712328767125, "grad_norm": 11.722431062482237, "learning_rate": 2.5685781573541645e-07, "logits/chosen": -3.192551374435425, "logits/rejected": -2.408508777618408, "logps/chosen": -901.0634765625, "logps/rejected": -677.369873046875, "loss": 0.0681, "rewards/accuracies": 1.0, "rewards/chosen": 3.8689684867858887, "rewards/margins": 4.337912559509277, "rewards/rejected": -0.46894413232803345, "step": 2967 }, { "epoch": 2.1684018264840184, "grad_norm": 14.282096720830737, "learning_rate": 2.5669837027817407e-07, "logits/chosen": -2.8926923274993896, "logits/rejected": -2.6827750205993652, "logps/chosen": -540.9876098632812, "logps/rejected": -635.6307373046875, "loss": 0.0643, "rewards/accuracies": 1.0, "rewards/chosen": 3.4486260414123535, "rewards/margins": 4.757118225097656, "rewards/rejected": -1.3084917068481445, "step": 2968 }, { "epoch": 2.1691324200913242, "grad_norm": 14.034357691065924, "learning_rate": 2.565389220942603e-07, "logits/chosen": -2.481017589569092, "logits/rejected": -1.7322537899017334, "logps/chosen": -529.3506469726562, "logps/rejected": -451.463134765625, "loss": 0.0686, "rewards/accuracies": 1.0, "rewards/chosen": 4.42010498046875, "rewards/margins": 6.989968299865723, "rewards/rejected": -2.5698628425598145, "step": 2969 }, { "epoch": 2.16986301369863, "grad_norm": 13.16947565817744, "learning_rate": 2.56379471248581e-07, "logits/chosen": -2.854010581970215, "logits/rejected": -2.2246286869049072, "logps/chosen": -597.78271484375, "logps/rejected": -468.31640625, "loss": 0.0667, "rewards/accuracies": 1.0, "rewards/chosen": 3.076106548309326, "rewards/margins": 5.0022759437561035, "rewards/rejected": -1.9261692762374878, "step": 2970 }, { "epoch": 2.170593607305936, "grad_norm": 16.679145007278645, "learning_rate": 2.562200178060429e-07, "logits/chosen": -3.230863094329834, "logits/rejected": -2.363856792449951, "logps/chosen": -694.0703735351562, "logps/rejected": -461.45782470703125, "loss": 0.0784, "rewards/accuracies": 1.0, "rewards/chosen": 3.175595998764038, "rewards/margins": 5.073736667633057, "rewards/rejected": -1.898140549659729, "step": 2971 }, { "epoch": 2.171324200913242, "grad_norm": 11.63856223858623, "learning_rate": 2.5606056183155395e-07, "logits/chosen": -2.5573973655700684, "logits/rejected": -2.1333441734313965, "logps/chosen": -806.4195556640625, "logps/rejected": -558.7534790039062, "loss": 0.0498, "rewards/accuracies": 1.0, "rewards/chosen": 2.7520647048950195, "rewards/margins": 4.7440266609191895, "rewards/rejected": -1.99196195602417, "step": 2972 }, { "epoch": 2.172054794520548, "grad_norm": 15.566129820318466, "learning_rate": 2.559011033900231e-07, "logits/chosen": -2.6138172149658203, "logits/rejected": -2.362450122833252, "logps/chosen": -366.6408996582031, "logps/rejected": -442.3927917480469, "loss": 0.0956, "rewards/accuracies": 1.0, "rewards/chosen": 1.7703945636749268, "rewards/margins": 4.937661170959473, "rewards/rejected": -3.167266607284546, "step": 2973 }, { "epoch": 2.172785388127854, "grad_norm": 12.296972407690758, "learning_rate": 2.5574164254636025e-07, "logits/chosen": -2.6740479469299316, "logits/rejected": -1.7874705791473389, "logps/chosen": -564.2809448242188, "logps/rejected": -312.41851806640625, "loss": 0.0662, "rewards/accuracies": 1.0, "rewards/chosen": 2.3116567134857178, "rewards/margins": 2.81276535987854, "rewards/rejected": -0.5011087656021118, "step": 2974 }, { "epoch": 2.17351598173516, "grad_norm": 7.961860580760675, "learning_rate": 2.555821793654764e-07, "logits/chosen": -2.46236252784729, "logits/rejected": -2.197427272796631, "logps/chosen": -485.05706787109375, "logps/rejected": -625.0132446289062, "loss": 0.0384, "rewards/accuracies": 1.0, "rewards/chosen": 2.6692702770233154, "rewards/margins": 5.9007954597473145, "rewards/rejected": -3.23152494430542, "step": 2975 }, { "epoch": 2.174246575342466, "grad_norm": 11.713134722371038, "learning_rate": 2.5542271391228327e-07, "logits/chosen": -2.4474105834960938, "logits/rejected": -1.8759534358978271, "logps/chosen": -625.2828369140625, "logps/rejected": -478.0328369140625, "loss": 0.048, "rewards/accuracies": 1.0, "rewards/chosen": 3.0516867637634277, "rewards/margins": 5.346772193908691, "rewards/rejected": -2.2950856685638428, "step": 2976 }, { "epoch": 2.1749771689497717, "grad_norm": 14.326957763251764, "learning_rate": 2.5526324625169377e-07, "logits/chosen": -2.957868814468384, "logits/rejected": -1.7421112060546875, "logps/chosen": -605.3526611328125, "logps/rejected": -347.4512939453125, "loss": 0.0611, "rewards/accuracies": 1.0, "rewards/chosen": 2.8931853771209717, "rewards/margins": 4.551481246948242, "rewards/rejected": -1.6582961082458496, "step": 2977 }, { "epoch": 2.1757077625570775, "grad_norm": 9.901457292052905, "learning_rate": 2.5510377644862146e-07, "logits/chosen": -2.9478025436401367, "logits/rejected": -2.3508198261260986, "logps/chosen": -804.2279052734375, "logps/rejected": -725.076904296875, "loss": 0.0509, "rewards/accuracies": 1.0, "rewards/chosen": 4.198545932769775, "rewards/margins": 5.211625099182129, "rewards/rejected": -1.0130785703659058, "step": 2978 }, { "epoch": 2.1764383561643834, "grad_norm": 35.12650130243546, "learning_rate": 2.54944304567981e-07, "logits/chosen": -2.1134865283966064, "logits/rejected": -2.6705198287963867, "logps/chosen": -441.50286865234375, "logps/rejected": -705.1785888671875, "loss": 0.1423, "rewards/accuracies": 1.0, "rewards/chosen": 2.002378463745117, "rewards/margins": 3.9679436683654785, "rewards/rejected": -1.9655654430389404, "step": 2979 }, { "epoch": 2.1771689497716897, "grad_norm": 12.231037644466292, "learning_rate": 2.5478483067468774e-07, "logits/chosen": -2.93131685256958, "logits/rejected": -2.5172770023345947, "logps/chosen": -591.70556640625, "logps/rejected": -446.74755859375, "loss": 0.0556, "rewards/accuracies": 1.0, "rewards/chosen": 1.161186695098877, "rewards/margins": 3.0019547939300537, "rewards/rejected": -1.8407682180404663, "step": 2980 }, { "epoch": 2.1778995433789956, "grad_norm": 11.59200140010175, "learning_rate": 2.54625354833658e-07, "logits/chosen": -2.723090171813965, "logits/rejected": -2.2528514862060547, "logps/chosen": -672.9605712890625, "logps/rejected": -660.5506591796875, "loss": 0.0785, "rewards/accuracies": 1.0, "rewards/chosen": 2.4667599201202393, "rewards/margins": 3.572782039642334, "rewards/rejected": -1.1060221195220947, "step": 2981 }, { "epoch": 2.1786301369863015, "grad_norm": 12.51879933995519, "learning_rate": 2.544658771098086e-07, "logits/chosen": -2.97306489944458, "logits/rejected": -2.450507640838623, "logps/chosen": -665.9395141601562, "logps/rejected": -557.8930053710938, "loss": 0.0673, "rewards/accuracies": 1.0, "rewards/chosen": 3.7816038131713867, "rewards/margins": 5.2016401290893555, "rewards/rejected": -1.4200360774993896, "step": 2982 }, { "epoch": 2.1793607305936074, "grad_norm": 11.140015487797106, "learning_rate": 2.543063975680576e-07, "logits/chosen": -2.7471671104431152, "logits/rejected": -2.636178970336914, "logps/chosen": -899.2283935546875, "logps/rejected": -762.8452758789062, "loss": 0.0577, "rewards/accuracies": 1.0, "rewards/chosen": 3.841233730316162, "rewards/margins": 3.4102468490600586, "rewards/rejected": 0.4309867024421692, "step": 2983 }, { "epoch": 2.1800913242009132, "grad_norm": 20.334748932803578, "learning_rate": 2.5414691627332315e-07, "logits/chosen": -2.8402445316314697, "logits/rejected": -2.467087745666504, "logps/chosen": -761.3941650390625, "logps/rejected": -602.7911376953125, "loss": 0.085, "rewards/accuracies": 1.0, "rewards/chosen": 4.094240188598633, "rewards/margins": 6.482450485229492, "rewards/rejected": -2.3882107734680176, "step": 2984 }, { "epoch": 2.180821917808219, "grad_norm": 10.369821382049233, "learning_rate": 2.5398743329052486e-07, "logits/chosen": -3.248920440673828, "logits/rejected": -2.3978986740112305, "logps/chosen": -786.31201171875, "logps/rejected": -619.7960205078125, "loss": 0.048, "rewards/accuracies": 1.0, "rewards/chosen": 4.444301605224609, "rewards/margins": 6.251270294189453, "rewards/rejected": -1.8069688081741333, "step": 2985 }, { "epoch": 2.181552511415525, "grad_norm": 7.0949695227884435, "learning_rate": 2.538279486845824e-07, "logits/chosen": -2.648775577545166, "logits/rejected": -2.086108684539795, "logps/chosen": -756.4661865234375, "logps/rejected": -547.4819946289062, "loss": 0.0411, "rewards/accuracies": 1.0, "rewards/chosen": 3.458343505859375, "rewards/margins": 4.960326194763184, "rewards/rejected": -1.501982569694519, "step": 2986 }, { "epoch": 2.182283105022831, "grad_norm": 7.5583787517872345, "learning_rate": 2.5366846252041646e-07, "logits/chosen": -2.8244504928588867, "logits/rejected": -2.043799638748169, "logps/chosen": -838.967529296875, "logps/rejected": -823.8109130859375, "loss": 0.0341, "rewards/accuracies": 1.0, "rewards/chosen": 4.003809452056885, "rewards/margins": 4.859398365020752, "rewards/rejected": -0.8555893301963806, "step": 2987 }, { "epoch": 2.183013698630137, "grad_norm": 8.154918342710094, "learning_rate": 2.5350897486294826e-07, "logits/chosen": -3.5588340759277344, "logits/rejected": -2.0070459842681885, "logps/chosen": -637.2294311523438, "logps/rejected": -394.17767333984375, "loss": 0.0527, "rewards/accuracies": 1.0, "rewards/chosen": 4.356326580047607, "rewards/margins": 6.320773124694824, "rewards/rejected": -1.9644463062286377, "step": 2988 }, { "epoch": 2.183744292237443, "grad_norm": 16.34066504106779, "learning_rate": 2.533494857770996e-07, "logits/chosen": -3.00976300239563, "logits/rejected": -1.8756853342056274, "logps/chosen": -974.90185546875, "logps/rejected": -613.2172241210938, "loss": 0.0845, "rewards/accuracies": 1.0, "rewards/chosen": 4.030178070068359, "rewards/margins": 4.240486145019531, "rewards/rejected": -0.21030808985233307, "step": 2989 }, { "epoch": 2.184474885844749, "grad_norm": 16.836090667806744, "learning_rate": 2.531899953277929e-07, "logits/chosen": -2.9173898696899414, "logits/rejected": -1.892433524131775, "logps/chosen": -787.6861572265625, "logps/rejected": -509.70709228515625, "loss": 0.1066, "rewards/accuracies": 1.0, "rewards/chosen": 3.9259235858917236, "rewards/margins": 5.837218284606934, "rewards/rejected": -1.91129469871521, "step": 2990 }, { "epoch": 2.185205479452055, "grad_norm": 12.11600121127274, "learning_rate": 2.53030503579951e-07, "logits/chosen": -2.461866617202759, "logits/rejected": -1.8507424592971802, "logps/chosen": -638.627685546875, "logps/rejected": -512.0635986328125, "loss": 0.0658, "rewards/accuracies": 1.0, "rewards/chosen": 3.169172525405884, "rewards/margins": 3.3344898223876953, "rewards/rejected": -0.16531717777252197, "step": 2991 }, { "epoch": 2.1859360730593607, "grad_norm": 15.022964700211109, "learning_rate": 2.528710105984977e-07, "logits/chosen": -2.4928202629089355, "logits/rejected": -2.0383520126342773, "logps/chosen": -595.7982788085938, "logps/rejected": -469.8163146972656, "loss": 0.0852, "rewards/accuracies": 1.0, "rewards/chosen": 2.8429927825927734, "rewards/margins": 4.404498100280762, "rewards/rejected": -1.5615051984786987, "step": 2992 }, { "epoch": 2.1866666666666665, "grad_norm": 11.900084012800866, "learning_rate": 2.527115164483567e-07, "logits/chosen": -3.0035135746002197, "logits/rejected": -2.3001606464385986, "logps/chosen": -520.2413330078125, "logps/rejected": -326.77313232421875, "loss": 0.072, "rewards/accuracies": 0.875, "rewards/chosen": 1.5676071643829346, "rewards/margins": 2.9603347778320312, "rewards/rejected": -1.3927273750305176, "step": 2993 }, { "epoch": 2.1873972602739724, "grad_norm": 7.379545976685929, "learning_rate": 2.5255202119445255e-07, "logits/chosen": -3.1342825889587402, "logits/rejected": -2.0746614933013916, "logps/chosen": -702.3012084960938, "logps/rejected": -517.4371948242188, "loss": 0.0279, "rewards/accuracies": 1.0, "rewards/chosen": 4.762627601623535, "rewards/margins": 7.091067790985107, "rewards/rejected": -2.3284401893615723, "step": 2994 }, { "epoch": 2.1881278538812787, "grad_norm": 12.249230808320489, "learning_rate": 2.523925249017102e-07, "logits/chosen": -2.5874650478363037, "logits/rejected": -2.2109711170196533, "logps/chosen": -462.0145263671875, "logps/rejected": -426.1920471191406, "loss": 0.0735, "rewards/accuracies": 1.0, "rewards/chosen": 3.4320778846740723, "rewards/margins": 5.183476448059082, "rewards/rejected": -1.7513983249664307, "step": 2995 }, { "epoch": 2.1888584474885846, "grad_norm": 11.435696152153353, "learning_rate": 2.5223302763505496e-07, "logits/chosen": -3.1859688758850098, "logits/rejected": -2.3731794357299805, "logps/chosen": -549.3938598632812, "logps/rejected": -448.3624572753906, "loss": 0.0546, "rewards/accuracies": 1.0, "rewards/chosen": 2.9424710273742676, "rewards/margins": 3.617555618286133, "rewards/rejected": -0.6750845313072205, "step": 2996 }, { "epoch": 2.1895890410958905, "grad_norm": 10.038325301240024, "learning_rate": 2.5207352945941256e-07, "logits/chosen": -2.547966241836548, "logits/rejected": -2.298179864883423, "logps/chosen": -596.3182373046875, "logps/rejected": -538.745361328125, "loss": 0.0348, "rewards/accuracies": 1.0, "rewards/chosen": 4.357651233673096, "rewards/margins": 6.30496072769165, "rewards/rejected": -1.9473100900650024, "step": 2997 }, { "epoch": 2.1903196347031963, "grad_norm": 14.913826158748108, "learning_rate": 2.5191403043970914e-07, "logits/chosen": -2.994870662689209, "logits/rejected": -2.4100265502929688, "logps/chosen": -741.343017578125, "logps/rejected": -509.94403076171875, "loss": 0.0711, "rewards/accuracies": 1.0, "rewards/chosen": 2.692667007446289, "rewards/margins": 4.5935139656066895, "rewards/rejected": -1.9008471965789795, "step": 2998 }, { "epoch": 2.1910502283105022, "grad_norm": 11.739165160147742, "learning_rate": 2.5175453064087115e-07, "logits/chosen": -3.0850510597229004, "logits/rejected": -2.5765836238861084, "logps/chosen": -701.3330078125, "logps/rejected": -654.6541748046875, "loss": 0.0498, "rewards/accuracies": 1.0, "rewards/chosen": 2.7827115058898926, "rewards/margins": 3.897127389907837, "rewards/rejected": -1.1144158840179443, "step": 2999 }, { "epoch": 2.191780821917808, "grad_norm": 17.015955898125533, "learning_rate": 2.5159503012782535e-07, "logits/chosen": -3.0149986743927, "logits/rejected": -2.93754506111145, "logps/chosen": -606.6984252929688, "logps/rejected": -619.721435546875, "loss": 0.1346, "rewards/accuracies": 1.0, "rewards/chosen": 1.7852160930633545, "rewards/margins": 4.364218711853027, "rewards/rejected": -2.579002618789673, "step": 3000 }, { "epoch": 2.192511415525114, "grad_norm": 14.49419365747103, "learning_rate": 2.514355289654988e-07, "logits/chosen": -2.58906626701355, "logits/rejected": -2.2972347736358643, "logps/chosen": -503.508544921875, "logps/rejected": -420.4034118652344, "loss": 0.0877, "rewards/accuracies": 1.0, "rewards/chosen": 1.6511540412902832, "rewards/margins": 2.8259992599487305, "rewards/rejected": -1.1748450994491577, "step": 3001 }, { "epoch": 2.1932420091324203, "grad_norm": 8.512382015805436, "learning_rate": 2.5127602721881873e-07, "logits/chosen": -2.3095333576202393, "logits/rejected": -1.904821753501892, "logps/chosen": -358.9684143066406, "logps/rejected": -280.09234619140625, "loss": 0.0457, "rewards/accuracies": 1.0, "rewards/chosen": 1.5967192649841309, "rewards/margins": 4.222479343414307, "rewards/rejected": -2.6257598400115967, "step": 3002 }, { "epoch": 2.193972602739726, "grad_norm": 13.115893301811058, "learning_rate": 2.511165249527129e-07, "logits/chosen": -2.296743631362915, "logits/rejected": -2.751955986022949, "logps/chosen": -477.91876220703125, "logps/rejected": -621.547607421875, "loss": 0.0912, "rewards/accuracies": 1.0, "rewards/chosen": 1.8264905214309692, "rewards/margins": 3.2309868335723877, "rewards/rejected": -1.4044963121414185, "step": 3003 }, { "epoch": 2.194703196347032, "grad_norm": 13.357906649020471, "learning_rate": 2.509570222321089e-07, "logits/chosen": -2.485426425933838, "logits/rejected": -1.9387636184692383, "logps/chosen": -654.8607788085938, "logps/rejected": -542.72314453125, "loss": 0.0773, "rewards/accuracies": 1.0, "rewards/chosen": 3.2986881732940674, "rewards/margins": 5.022905349731445, "rewards/rejected": -1.724217414855957, "step": 3004 }, { "epoch": 2.195433789954338, "grad_norm": 9.583637885078124, "learning_rate": 2.507975191219348e-07, "logits/chosen": -2.76128888130188, "logits/rejected": -2.449432373046875, "logps/chosen": -654.5956420898438, "logps/rejected": -783.7598876953125, "loss": 0.0477, "rewards/accuracies": 1.0, "rewards/chosen": 3.983517646789551, "rewards/margins": 5.365731239318848, "rewards/rejected": -1.382213830947876, "step": 3005 }, { "epoch": 2.196164383561644, "grad_norm": 22.224338778259735, "learning_rate": 2.506380156871186e-07, "logits/chosen": -2.6447153091430664, "logits/rejected": -2.1796467304229736, "logps/chosen": -546.0867309570312, "logps/rejected": -474.3254699707031, "loss": 0.1097, "rewards/accuracies": 0.875, "rewards/chosen": 1.8667619228363037, "rewards/margins": 2.418738842010498, "rewards/rejected": -0.5519766807556152, "step": 3006 }, { "epoch": 2.1968949771689497, "grad_norm": 12.714236496635683, "learning_rate": 2.504785119925886e-07, "logits/chosen": -2.7936322689056396, "logits/rejected": -2.894909143447876, "logps/chosen": -474.864013671875, "logps/rejected": -491.87255859375, "loss": 0.0805, "rewards/accuracies": 1.0, "rewards/chosen": 3.037822961807251, "rewards/margins": 4.021094799041748, "rewards/rejected": -0.9832717180252075, "step": 3007 }, { "epoch": 2.1976255707762555, "grad_norm": 10.23948166965893, "learning_rate": 2.503190081032732e-07, "logits/chosen": -2.604532480239868, "logits/rejected": -2.5046067237854004, "logps/chosen": -947.777099609375, "logps/rejected": -920.310791015625, "loss": 0.0429, "rewards/accuracies": 1.0, "rewards/chosen": 2.763160467147827, "rewards/margins": 3.6870970726013184, "rewards/rejected": -0.9239364862442017, "step": 3008 }, { "epoch": 2.198356164383562, "grad_norm": 19.8263180076934, "learning_rate": 2.501595040841009e-07, "logits/chosen": -3.0114474296569824, "logits/rejected": -2.4476208686828613, "logps/chosen": -564.50537109375, "logps/rejected": -512.4981689453125, "loss": 0.0745, "rewards/accuracies": 1.0, "rewards/chosen": 3.6607584953308105, "rewards/margins": 4.788033962249756, "rewards/rejected": -1.1272757053375244, "step": 3009 }, { "epoch": 2.1990867579908677, "grad_norm": 12.233246226264797, "learning_rate": 2.5e-07, "logits/chosen": -2.7417047023773193, "logits/rejected": -2.2459373474121094, "logps/chosen": -700.0369262695312, "logps/rejected": -650.154541015625, "loss": 0.0502, "rewards/accuracies": 1.0, "rewards/chosen": 3.793567657470703, "rewards/margins": 5.495663642883301, "rewards/rejected": -1.7020961046218872, "step": 3010 }, { "epoch": 2.1998173515981736, "grad_norm": 8.127214804802346, "learning_rate": 2.4984049591589907e-07, "logits/chosen": -3.024794101715088, "logits/rejected": -2.4443750381469727, "logps/chosen": -896.6260986328125, "logps/rejected": -626.697509765625, "loss": 0.0399, "rewards/accuracies": 1.0, "rewards/chosen": 4.457253932952881, "rewards/margins": 4.305649757385254, "rewards/rejected": 0.15160444378852844, "step": 3011 }, { "epoch": 2.2005479452054795, "grad_norm": 13.372828797559182, "learning_rate": 2.496809918967267e-07, "logits/chosen": -2.6109962463378906, "logits/rejected": -2.0658323764801025, "logps/chosen": -339.4864501953125, "logps/rejected": -267.2044372558594, "loss": 0.0678, "rewards/accuracies": 1.0, "rewards/chosen": 2.1073899269104004, "rewards/margins": 3.539978265762329, "rewards/rejected": -1.4325883388519287, "step": 3012 }, { "epoch": 2.2012785388127853, "grad_norm": 15.678958102273619, "learning_rate": 2.4952148800741135e-07, "logits/chosen": -2.65472149848938, "logits/rejected": -2.031141757965088, "logps/chosen": -533.048583984375, "logps/rejected": -533.6947631835938, "loss": 0.0588, "rewards/accuracies": 1.0, "rewards/chosen": 2.2619361877441406, "rewards/margins": 3.99609375, "rewards/rejected": -1.7341578006744385, "step": 3013 }, { "epoch": 2.202009132420091, "grad_norm": 9.659935109175246, "learning_rate": 2.493619843128814e-07, "logits/chosen": -2.6170427799224854, "logits/rejected": -2.4482617378234863, "logps/chosen": -577.6409301757812, "logps/rejected": -536.6450805664062, "loss": 0.0348, "rewards/accuracies": 1.0, "rewards/chosen": 3.0183053016662598, "rewards/margins": 4.695910453796387, "rewards/rejected": -1.677605390548706, "step": 3014 }, { "epoch": 2.202739726027397, "grad_norm": 13.636692454679602, "learning_rate": 2.4920248087806525e-07, "logits/chosen": -2.6040585041046143, "logits/rejected": -2.0608103275299072, "logps/chosen": -686.4317626953125, "logps/rejected": -413.070556640625, "loss": 0.0783, "rewards/accuracies": 1.0, "rewards/chosen": 2.0306780338287354, "rewards/margins": 2.892792224884033, "rewards/rejected": -0.8621140122413635, "step": 3015 }, { "epoch": 2.203470319634703, "grad_norm": 10.828566769262089, "learning_rate": 2.4904297776789107e-07, "logits/chosen": -2.55039119720459, "logits/rejected": -1.9657409191131592, "logps/chosen": -489.501220703125, "logps/rejected": -398.87908935546875, "loss": 0.0885, "rewards/accuracies": 1.0, "rewards/chosen": 2.1422982215881348, "rewards/margins": 4.1769819259643555, "rewards/rejected": -2.0346839427948, "step": 3016 }, { "epoch": 2.2042009132420093, "grad_norm": 9.646978476475727, "learning_rate": 2.488834750472872e-07, "logits/chosen": -3.249236822128296, "logits/rejected": -2.3060264587402344, "logps/chosen": -598.5875244140625, "logps/rejected": -471.9101257324219, "loss": 0.0318, "rewards/accuracies": 1.0, "rewards/chosen": 4.696319103240967, "rewards/margins": 6.556952476501465, "rewards/rejected": -1.8606330156326294, "step": 3017 }, { "epoch": 2.204931506849315, "grad_norm": 10.139706327560956, "learning_rate": 2.4872397278118125e-07, "logits/chosen": -2.498598575592041, "logits/rejected": -1.9157649278640747, "logps/chosen": -424.79290771484375, "logps/rejected": -481.6145935058594, "loss": 0.0725, "rewards/accuracies": 0.875, "rewards/chosen": 1.914520502090454, "rewards/margins": 3.5591840744018555, "rewards/rejected": -1.6446635723114014, "step": 3018 }, { "epoch": 2.205662100456621, "grad_norm": 11.215918563525362, "learning_rate": 2.485644710345012e-07, "logits/chosen": -3.043095350265503, "logits/rejected": -2.145139694213867, "logps/chosen": -777.7094116210938, "logps/rejected": -623.3658447265625, "loss": 0.0748, "rewards/accuracies": 1.0, "rewards/chosen": 4.160789966583252, "rewards/margins": 5.030397891998291, "rewards/rejected": -0.8696079850196838, "step": 3019 }, { "epoch": 2.206392694063927, "grad_norm": 5.365237396761526, "learning_rate": 2.484049698721746e-07, "logits/chosen": -2.497025966644287, "logits/rejected": -1.9081270694732666, "logps/chosen": -508.27630615234375, "logps/rejected": -464.484375, "loss": 0.0207, "rewards/accuracies": 1.0, "rewards/chosen": 2.228712797164917, "rewards/margins": 6.029845714569092, "rewards/rejected": -3.801133155822754, "step": 3020 }, { "epoch": 2.207123287671233, "grad_norm": 13.034387151419354, "learning_rate": 2.482454693591289e-07, "logits/chosen": -2.8723580837249756, "logits/rejected": -2.462495803833008, "logps/chosen": -765.9177856445312, "logps/rejected": -591.78564453125, "loss": 0.0529, "rewards/accuracies": 1.0, "rewards/chosen": 3.8149871826171875, "rewards/margins": 4.1221604347229, "rewards/rejected": -0.30717340111732483, "step": 3021 }, { "epoch": 2.2078538812785387, "grad_norm": 13.270211405331397, "learning_rate": 2.480859695602909e-07, "logits/chosen": -2.6758649349212646, "logits/rejected": -2.1573071479797363, "logps/chosen": -581.4411010742188, "logps/rejected": -636.4889526367188, "loss": 0.0792, "rewards/accuracies": 1.0, "rewards/chosen": 2.9504380226135254, "rewards/margins": 5.727540493011475, "rewards/rejected": -2.777101993560791, "step": 3022 }, { "epoch": 2.2085844748858445, "grad_norm": 7.225576104555581, "learning_rate": 2.479264705405874e-07, "logits/chosen": -2.6081924438476562, "logits/rejected": -2.2445380687713623, "logps/chosen": -864.567626953125, "logps/rejected": -730.0657958984375, "loss": 0.028, "rewards/accuracies": 1.0, "rewards/chosen": 2.9277517795562744, "rewards/margins": 3.4811930656433105, "rewards/rejected": -0.5534414052963257, "step": 3023 }, { "epoch": 2.209315068493151, "grad_norm": 12.863612796554765, "learning_rate": 2.47766972364945e-07, "logits/chosen": -2.26391339302063, "logits/rejected": -2.653205633163452, "logps/chosen": -461.0955505371094, "logps/rejected": -534.6914672851562, "loss": 0.0559, "rewards/accuracies": 1.0, "rewards/chosen": 2.733985424041748, "rewards/margins": 3.477835178375244, "rewards/rejected": -0.7438496351242065, "step": 3024 }, { "epoch": 2.2100456621004567, "grad_norm": 9.45365805896325, "learning_rate": 2.476074750982898e-07, "logits/chosen": -2.528264284133911, "logits/rejected": -1.840235710144043, "logps/chosen": -651.0112915039062, "logps/rejected": -854.160888671875, "loss": 0.0385, "rewards/accuracies": 1.0, "rewards/chosen": 4.618956565856934, "rewards/margins": 7.450660228729248, "rewards/rejected": -2.8317041397094727, "step": 3025 }, { "epoch": 2.2107762557077626, "grad_norm": 10.89554523113869, "learning_rate": 2.474479788055475e-07, "logits/chosen": -3.183037757873535, "logits/rejected": -2.05765438079834, "logps/chosen": -646.6073608398438, "logps/rejected": -351.4300231933594, "loss": 0.0531, "rewards/accuracies": 1.0, "rewards/chosen": 3.27266526222229, "rewards/margins": 4.683383941650391, "rewards/rejected": -1.4107186794281006, "step": 3026 }, { "epoch": 2.2115068493150685, "grad_norm": 12.771981233883483, "learning_rate": 2.472884835516433e-07, "logits/chosen": -2.9544570446014404, "logits/rejected": -2.3606977462768555, "logps/chosen": -826.715087890625, "logps/rejected": -687.4456176757812, "loss": 0.0564, "rewards/accuracies": 1.0, "rewards/chosen": 5.9459710121154785, "rewards/margins": 6.261063098907471, "rewards/rejected": -0.31509196758270264, "step": 3027 }, { "epoch": 2.2122374429223743, "grad_norm": 11.45019560401119, "learning_rate": 2.471289894015023e-07, "logits/chosen": -2.8844428062438965, "logits/rejected": -1.106977939605713, "logps/chosen": -690.5374755859375, "logps/rejected": -274.8599548339844, "loss": 0.0572, "rewards/accuracies": 1.0, "rewards/chosen": 3.2408809661865234, "rewards/margins": 5.381758689880371, "rewards/rejected": -2.1408774852752686, "step": 3028 }, { "epoch": 2.21296803652968, "grad_norm": 10.336252312973082, "learning_rate": 2.469694964200489e-07, "logits/chosen": -2.8662357330322266, "logits/rejected": -2.4115610122680664, "logps/chosen": -307.00811767578125, "logps/rejected": -341.46124267578125, "loss": 0.0629, "rewards/accuracies": 1.0, "rewards/chosen": 2.0085346698760986, "rewards/margins": 4.468003273010254, "rewards/rejected": -2.4594686031341553, "step": 3029 }, { "epoch": 2.213698630136986, "grad_norm": 13.2758450628873, "learning_rate": 2.4681000467220713e-07, "logits/chosen": -2.6136937141418457, "logits/rejected": -1.7328492403030396, "logps/chosen": -601.5269165039062, "logps/rejected": -595.46142578125, "loss": 0.0584, "rewards/accuracies": 1.0, "rewards/chosen": 3.353717803955078, "rewards/margins": 7.574792861938477, "rewards/rejected": -4.221075057983398, "step": 3030 }, { "epoch": 2.2144292237442924, "grad_norm": 16.031911497987284, "learning_rate": 2.466505142229004e-07, "logits/chosen": -2.6865713596343994, "logits/rejected": -1.8902978897094727, "logps/chosen": -696.827392578125, "logps/rejected": -581.2816772460938, "loss": 0.1087, "rewards/accuracies": 1.0, "rewards/chosen": 2.5416784286499023, "rewards/margins": 4.679354190826416, "rewards/rejected": -2.1376757621765137, "step": 3031 }, { "epoch": 2.2151598173515983, "grad_norm": 11.871524342305875, "learning_rate": 2.464910251370517e-07, "logits/chosen": -2.7866666316986084, "logits/rejected": -2.1638236045837402, "logps/chosen": -402.65203857421875, "logps/rejected": -329.36956787109375, "loss": 0.0476, "rewards/accuracies": 1.0, "rewards/chosen": 2.1123368740081787, "rewards/margins": 5.007658004760742, "rewards/rejected": -2.8953211307525635, "step": 3032 }, { "epoch": 2.215890410958904, "grad_norm": 9.973360053993918, "learning_rate": 2.4633153747958346e-07, "logits/chosen": -3.1342663764953613, "logits/rejected": -2.195100784301758, "logps/chosen": -426.93408203125, "logps/rejected": -377.32806396484375, "loss": 0.044, "rewards/accuracies": 1.0, "rewards/chosen": 1.4864072799682617, "rewards/margins": 3.2641754150390625, "rewards/rejected": -1.7777681350708008, "step": 3033 }, { "epoch": 2.21662100456621, "grad_norm": 9.079864072748547, "learning_rate": 2.4617205131541767e-07, "logits/chosen": -2.7376174926757812, "logits/rejected": -2.5921411514282227, "logps/chosen": -665.9102172851562, "logps/rejected": -683.6485595703125, "loss": 0.0386, "rewards/accuracies": 1.0, "rewards/chosen": 4.1872053146362305, "rewards/margins": 4.461771488189697, "rewards/rejected": -0.2745664715766907, "step": 3034 }, { "epoch": 2.217351598173516, "grad_norm": 22.946078575056266, "learning_rate": 2.4601256670947523e-07, "logits/chosen": -2.6679534912109375, "logits/rejected": -2.641079902648926, "logps/chosen": -760.37548828125, "logps/rejected": -747.9395751953125, "loss": 0.071, "rewards/accuracies": 1.0, "rewards/chosen": 3.6224751472473145, "rewards/margins": 4.904192924499512, "rewards/rejected": -1.281717300415039, "step": 3035 }, { "epoch": 2.2180821917808218, "grad_norm": 8.327470878978465, "learning_rate": 2.458530837266769e-07, "logits/chosen": -2.4002530574798584, "logits/rejected": -2.615731954574585, "logps/chosen": -785.9205322265625, "logps/rejected": -842.126708984375, "loss": 0.0363, "rewards/accuracies": 1.0, "rewards/chosen": 3.815092086791992, "rewards/margins": 4.586178779602051, "rewards/rejected": -0.7710866928100586, "step": 3036 }, { "epoch": 2.2188127853881277, "grad_norm": 10.078576555412019, "learning_rate": 2.4569360243194245e-07, "logits/chosen": -2.6801562309265137, "logits/rejected": -2.4314818382263184, "logps/chosen": -826.2463989257812, "logps/rejected": -758.6314086914062, "loss": 0.0399, "rewards/accuracies": 1.0, "rewards/chosen": 3.500011444091797, "rewards/margins": 3.7956056594848633, "rewards/rejected": -0.2955942153930664, "step": 3037 }, { "epoch": 2.219543378995434, "grad_norm": 5.883574376918377, "learning_rate": 2.455341228901913e-07, "logits/chosen": -2.6556386947631836, "logits/rejected": -1.5971264839172363, "logps/chosen": -575.5670776367188, "logps/rejected": -358.9881896972656, "loss": 0.0237, "rewards/accuracies": 1.0, "rewards/chosen": 1.9596669673919678, "rewards/margins": 4.509365558624268, "rewards/rejected": -2.549698829650879, "step": 3038 }, { "epoch": 2.22027397260274, "grad_norm": 10.612881559919662, "learning_rate": 2.4537464516634207e-07, "logits/chosen": -2.3611888885498047, "logits/rejected": -2.5742435455322266, "logps/chosen": -492.85455322265625, "logps/rejected": -582.3247680664062, "loss": 0.0478, "rewards/accuracies": 1.0, "rewards/chosen": 2.2217202186584473, "rewards/margins": 4.693009376525879, "rewards/rejected": -2.4712891578674316, "step": 3039 }, { "epoch": 2.2210045662100457, "grad_norm": 14.262309329390755, "learning_rate": 2.452151693253123e-07, "logits/chosen": -2.712831974029541, "logits/rejected": -2.423755645751953, "logps/chosen": -1213.2340087890625, "logps/rejected": -909.1734619140625, "loss": 0.0746, "rewards/accuracies": 1.0, "rewards/chosen": 4.199054718017578, "rewards/margins": 4.135621547698975, "rewards/rejected": 0.06343311071395874, "step": 3040 }, { "epoch": 2.2217351598173516, "grad_norm": 13.391276215750441, "learning_rate": 2.45055695432019e-07, "logits/chosen": -2.2161569595336914, "logits/rejected": -2.674497604370117, "logps/chosen": -391.612548828125, "logps/rejected": -531.14306640625, "loss": 0.0741, "rewards/accuracies": 1.0, "rewards/chosen": 2.2326924800872803, "rewards/margins": 3.6270434856414795, "rewards/rejected": -1.3943510055541992, "step": 3041 }, { "epoch": 2.2224657534246575, "grad_norm": 12.068062720146212, "learning_rate": 2.448962235513785e-07, "logits/chosen": -2.6377429962158203, "logits/rejected": -2.475264072418213, "logps/chosen": -580.06884765625, "logps/rejected": -603.5582275390625, "loss": 0.0603, "rewards/accuracies": 1.0, "rewards/chosen": 2.4818999767303467, "rewards/margins": 4.013275146484375, "rewards/rejected": -1.5313751697540283, "step": 3042 }, { "epoch": 2.2231963470319633, "grad_norm": 12.740819281041091, "learning_rate": 2.447367537483063e-07, "logits/chosen": -2.288503646850586, "logits/rejected": -1.97296142578125, "logps/chosen": -389.50177001953125, "logps/rejected": -448.4491882324219, "loss": 0.0517, "rewards/accuracies": 1.0, "rewards/chosen": 1.9306045770645142, "rewards/margins": 5.338471412658691, "rewards/rejected": -3.407867431640625, "step": 3043 }, { "epoch": 2.223926940639269, "grad_norm": 13.825196105780245, "learning_rate": 2.4457728608771676e-07, "logits/chosen": -2.1941587924957275, "logits/rejected": -2.593851327896118, "logps/chosen": -482.8184509277344, "logps/rejected": -693.2920532226562, "loss": 0.0961, "rewards/accuracies": 1.0, "rewards/chosen": 2.7076187133789062, "rewards/margins": 5.372734069824219, "rewards/rejected": -2.6651148796081543, "step": 3044 }, { "epoch": 2.2246575342465755, "grad_norm": 10.971191134608373, "learning_rate": 2.444178206345236e-07, "logits/chosen": -2.3317620754241943, "logits/rejected": -2.018624782562256, "logps/chosen": -260.6221923828125, "logps/rejected": -340.4742126464844, "loss": 0.0428, "rewards/accuracies": 1.0, "rewards/chosen": 1.984409213066101, "rewards/margins": 6.363893508911133, "rewards/rejected": -4.379484176635742, "step": 3045 }, { "epoch": 2.2253881278538814, "grad_norm": 10.392735775972547, "learning_rate": 2.442583574536397e-07, "logits/chosen": -2.838071823120117, "logits/rejected": -2.0476980209350586, "logps/chosen": -704.7864379882812, "logps/rejected": -481.4229736328125, "loss": 0.0421, "rewards/accuracies": 1.0, "rewards/chosen": 3.6152446269989014, "rewards/margins": 5.597904205322266, "rewards/rejected": -1.9826602935791016, "step": 3046 }, { "epoch": 2.2261187214611873, "grad_norm": 13.076259247514296, "learning_rate": 2.440988966099769e-07, "logits/chosen": -1.6957807540893555, "logits/rejected": -2.4676239490509033, "logps/chosen": -265.570556640625, "logps/rejected": -390.29632568359375, "loss": 0.0758, "rewards/accuracies": 1.0, "rewards/chosen": 1.1126394271850586, "rewards/margins": 3.7912368774414062, "rewards/rejected": -2.6785972118377686, "step": 3047 }, { "epoch": 2.226849315068493, "grad_norm": 7.936213351519719, "learning_rate": 2.43939438168446e-07, "logits/chosen": -2.7246181964874268, "logits/rejected": -2.0766618251800537, "logps/chosen": -746.0021362304688, "logps/rejected": -390.3619384765625, "loss": 0.0275, "rewards/accuracies": 1.0, "rewards/chosen": 4.409406661987305, "rewards/margins": 6.002641677856445, "rewards/rejected": -1.5932352542877197, "step": 3048 }, { "epoch": 2.227579908675799, "grad_norm": 9.656354687564598, "learning_rate": 2.437799821939571e-07, "logits/chosen": -2.6369998455047607, "logits/rejected": -2.25974440574646, "logps/chosen": -540.1646728515625, "logps/rejected": -442.18231201171875, "loss": 0.0475, "rewards/accuracies": 1.0, "rewards/chosen": 2.4356272220611572, "rewards/margins": 5.48267126083374, "rewards/rejected": -3.047044277191162, "step": 3049 }, { "epoch": 2.228310502283105, "grad_norm": 8.735796926967724, "learning_rate": 2.43620528751419e-07, "logits/chosen": -2.196629285812378, "logits/rejected": -2.0849480628967285, "logps/chosen": -652.06640625, "logps/rejected": -554.4784545898438, "loss": 0.0292, "rewards/accuracies": 1.0, "rewards/chosen": 2.7893505096435547, "rewards/margins": 4.732219696044922, "rewards/rejected": -1.9428693056106567, "step": 3050 }, { "epoch": 2.2290410958904108, "grad_norm": 9.976036350652059, "learning_rate": 2.4346107790573966e-07, "logits/chosen": -2.7941434383392334, "logits/rejected": -2.0988521575927734, "logps/chosen": -671.4047241210938, "logps/rejected": -688.6198120117188, "loss": 0.0352, "rewards/accuracies": 1.0, "rewards/chosen": 3.7537667751312256, "rewards/margins": 6.517056941986084, "rewards/rejected": -2.7632901668548584, "step": 3051 }, { "epoch": 2.229771689497717, "grad_norm": 13.019634384905038, "learning_rate": 2.43301629721826e-07, "logits/chosen": -3.6981759071350098, "logits/rejected": -2.128772020339966, "logps/chosen": -837.5151977539062, "logps/rejected": -518.6607055664062, "loss": 0.0769, "rewards/accuracies": 1.0, "rewards/chosen": 5.777521133422852, "rewards/margins": 7.027822017669678, "rewards/rejected": -1.2503005266189575, "step": 3052 }, { "epoch": 2.230502283105023, "grad_norm": 13.217003961775374, "learning_rate": 2.431421842645835e-07, "logits/chosen": -3.262770414352417, "logits/rejected": -2.287977933883667, "logps/chosen": -574.720947265625, "logps/rejected": -380.2039489746094, "loss": 0.0562, "rewards/accuracies": 1.0, "rewards/chosen": 2.404723882675171, "rewards/margins": 4.6217546463012695, "rewards/rejected": -2.2170310020446777, "step": 3053 }, { "epoch": 2.231232876712329, "grad_norm": 14.744375934711567, "learning_rate": 2.429827415989171e-07, "logits/chosen": -2.984332323074341, "logits/rejected": -2.4364852905273438, "logps/chosen": -518.7618408203125, "logps/rejected": -483.35809326171875, "loss": 0.0703, "rewards/accuracies": 1.0, "rewards/chosen": 3.4428720474243164, "rewards/margins": 6.162856101989746, "rewards/rejected": -2.719984292984009, "step": 3054 }, { "epoch": 2.2319634703196347, "grad_norm": 20.03304003068665, "learning_rate": 2.428233017897301e-07, "logits/chosen": -3.4277169704437256, "logits/rejected": -1.4179713726043701, "logps/chosen": -900.9384765625, "logps/rejected": -364.9664611816406, "loss": 0.0971, "rewards/accuracies": 1.0, "rewards/chosen": 2.971268653869629, "rewards/margins": 3.753627061843872, "rewards/rejected": -0.7823585271835327, "step": 3055 }, { "epoch": 2.2326940639269406, "grad_norm": 15.367303674255835, "learning_rate": 2.426638649019251e-07, "logits/chosen": -2.83575177192688, "logits/rejected": -2.2672054767608643, "logps/chosen": -371.64056396484375, "logps/rejected": -392.55609130859375, "loss": 0.075, "rewards/accuracies": 1.0, "rewards/chosen": 2.1043453216552734, "rewards/margins": 4.418330669403076, "rewards/rejected": -2.3139853477478027, "step": 3056 }, { "epoch": 2.2334246575342465, "grad_norm": 15.6609367010563, "learning_rate": 2.42504431000403e-07, "logits/chosen": -3.0239241123199463, "logits/rejected": -2.371767997741699, "logps/chosen": -520.55712890625, "logps/rejected": -448.8436584472656, "loss": 0.0527, "rewards/accuracies": 1.0, "rewards/chosen": 3.6718392372131348, "rewards/margins": 6.947953224182129, "rewards/rejected": -3.276113748550415, "step": 3057 }, { "epoch": 2.2341552511415523, "grad_norm": 12.388960544928258, "learning_rate": 2.4234500015006387e-07, "logits/chosen": -2.7723283767700195, "logits/rejected": -2.346054792404175, "logps/chosen": -967.914794921875, "logps/rejected": -804.6182250976562, "loss": 0.0542, "rewards/accuracies": 1.0, "rewards/chosen": 4.62573766708374, "rewards/margins": 3.7365646362304688, "rewards/rejected": 0.8891732096672058, "step": 3058 }, { "epoch": 2.2348858447488587, "grad_norm": 16.952526451127333, "learning_rate": 2.421855724158064e-07, "logits/chosen": -2.272401809692383, "logits/rejected": -2.235868215560913, "logps/chosen": -573.04736328125, "logps/rejected": -532.5391235351562, "loss": 0.0417, "rewards/accuracies": 1.0, "rewards/chosen": 3.306342840194702, "rewards/margins": 7.406245231628418, "rewards/rejected": -4.099903106689453, "step": 3059 }, { "epoch": 2.2356164383561645, "grad_norm": 13.955332206618024, "learning_rate": 2.4202614786252794e-07, "logits/chosen": -2.4519829750061035, "logits/rejected": -1.8905179500579834, "logps/chosen": -665.079345703125, "logps/rejected": -532.1947021484375, "loss": 0.061, "rewards/accuracies": 1.0, "rewards/chosen": 3.972984552383423, "rewards/margins": 5.638147830963135, "rewards/rejected": -1.6651633977890015, "step": 3060 }, { "epoch": 2.2363470319634704, "grad_norm": 16.627019085568975, "learning_rate": 2.418667265551248e-07, "logits/chosen": -2.5728042125701904, "logits/rejected": -2.6140475273132324, "logps/chosen": -496.86932373046875, "logps/rejected": -526.2005615234375, "loss": 0.0894, "rewards/accuracies": 1.0, "rewards/chosen": 2.24220871925354, "rewards/margins": 3.283829689025879, "rewards/rejected": -1.0416209697723389, "step": 3061 }, { "epoch": 2.2370776255707763, "grad_norm": 7.662921350042904, "learning_rate": 2.4170730855849164e-07, "logits/chosen": -2.3138763904571533, "logits/rejected": -2.267672061920166, "logps/chosen": -710.5641479492188, "logps/rejected": -704.6500244140625, "loss": 0.0353, "rewards/accuracies": 1.0, "rewards/chosen": 3.3023481369018555, "rewards/margins": 4.378559112548828, "rewards/rejected": -1.076210856437683, "step": 3062 }, { "epoch": 2.237808219178082, "grad_norm": 14.068002938051977, "learning_rate": 2.41547893937522e-07, "logits/chosen": -2.985536575317383, "logits/rejected": -2.076662302017212, "logps/chosen": -559.7886962890625, "logps/rejected": -375.46466064453125, "loss": 0.0422, "rewards/accuracies": 1.0, "rewards/chosen": 3.669055461883545, "rewards/margins": 6.498149394989014, "rewards/rejected": -2.8290936946868896, "step": 3063 }, { "epoch": 2.238538812785388, "grad_norm": 9.78177453727822, "learning_rate": 2.413884827571079e-07, "logits/chosen": -2.6243643760681152, "logits/rejected": -1.9986870288848877, "logps/chosen": -670.2308959960938, "logps/rejected": -486.2882385253906, "loss": 0.0692, "rewards/accuracies": 1.0, "rewards/chosen": 2.0265283584594727, "rewards/margins": 3.8401408195495605, "rewards/rejected": -1.8136119842529297, "step": 3064 }, { "epoch": 2.239269406392694, "grad_norm": 8.605098153617915, "learning_rate": 2.412290750821402e-07, "logits/chosen": -3.192873954772949, "logits/rejected": -2.6694607734680176, "logps/chosen": -906.0306396484375, "logps/rejected": -605.9696044921875, "loss": 0.0517, "rewards/accuracies": 1.0, "rewards/chosen": 4.1926727294921875, "rewards/margins": 5.080852031707764, "rewards/rejected": -0.8881791830062866, "step": 3065 }, { "epoch": 2.24, "grad_norm": 7.59642753962322, "learning_rate": 2.41069670977508e-07, "logits/chosen": -2.700079917907715, "logits/rejected": -2.329894542694092, "logps/chosen": -835.0367431640625, "logps/rejected": -866.9383544921875, "loss": 0.0446, "rewards/accuracies": 1.0, "rewards/chosen": 3.3396358489990234, "rewards/margins": 5.468105316162109, "rewards/rejected": -2.128469467163086, "step": 3066 }, { "epoch": 2.240730593607306, "grad_norm": 13.45539154560386, "learning_rate": 2.4091027050809915e-07, "logits/chosen": -3.015026092529297, "logits/rejected": -1.9834586381912231, "logps/chosen": -552.96142578125, "logps/rejected": -484.6831359863281, "loss": 0.0606, "rewards/accuracies": 1.0, "rewards/chosen": 3.584939479827881, "rewards/margins": 5.460698127746582, "rewards/rejected": -1.8757585287094116, "step": 3067 }, { "epoch": 2.241461187214612, "grad_norm": 13.793935598724456, "learning_rate": 2.4075087373880006e-07, "logits/chosen": -2.753908634185791, "logits/rejected": -1.6840605735778809, "logps/chosen": -475.7276916503906, "logps/rejected": -512.0695190429688, "loss": 0.0544, "rewards/accuracies": 1.0, "rewards/chosen": 3.1871747970581055, "rewards/margins": 8.126646041870117, "rewards/rejected": -4.939471244812012, "step": 3068 }, { "epoch": 2.242191780821918, "grad_norm": 12.472631782390659, "learning_rate": 2.4059148073449555e-07, "logits/chosen": -2.8962652683258057, "logits/rejected": -2.7230606079101562, "logps/chosen": -761.2160034179688, "logps/rejected": -660.0110473632812, "loss": 0.0636, "rewards/accuracies": 1.0, "rewards/chosen": 4.326382637023926, "rewards/margins": 5.606403350830078, "rewards/rejected": -1.280019760131836, "step": 3069 }, { "epoch": 2.2429223744292237, "grad_norm": 8.84021831398225, "learning_rate": 2.404320915600688e-07, "logits/chosen": -2.368093729019165, "logits/rejected": -2.2603092193603516, "logps/chosen": -372.06072998046875, "logps/rejected": -554.6900634765625, "loss": 0.0367, "rewards/accuracies": 1.0, "rewards/chosen": 3.915079116821289, "rewards/margins": 7.146645545959473, "rewards/rejected": -3.2315661907196045, "step": 3070 }, { "epoch": 2.2436529680365296, "grad_norm": 23.32797194147712, "learning_rate": 2.402727062804016e-07, "logits/chosen": -2.6691348552703857, "logits/rejected": -2.5748343467712402, "logps/chosen": -549.6142578125, "logps/rejected": -649.1991577148438, "loss": 0.067, "rewards/accuracies": 0.875, "rewards/chosen": 2.214050531387329, "rewards/margins": 3.160616397857666, "rewards/rejected": -0.9465659856796265, "step": 3071 }, { "epoch": 2.2443835616438355, "grad_norm": 6.7483648509562, "learning_rate": 2.4011332496037404e-07, "logits/chosen": -2.558502674102783, "logits/rejected": -2.1050524711608887, "logps/chosen": -659.490234375, "logps/rejected": -447.2948303222656, "loss": 0.0265, "rewards/accuracies": 1.0, "rewards/chosen": 2.6591098308563232, "rewards/margins": 4.331628799438477, "rewards/rejected": -1.6725187301635742, "step": 3072 }, { "epoch": 2.2451141552511418, "grad_norm": 7.996607537993797, "learning_rate": 2.3995394766486485e-07, "logits/chosen": -2.814042091369629, "logits/rejected": -3.052700996398926, "logps/chosen": -533.54296875, "logps/rejected": -648.7806396484375, "loss": 0.0364, "rewards/accuracies": 1.0, "rewards/chosen": 2.1148300170898438, "rewards/margins": 4.7110748291015625, "rewards/rejected": -2.596245288848877, "step": 3073 }, { "epoch": 2.2458447488584476, "grad_norm": 14.104270104498994, "learning_rate": 2.397945744587509e-07, "logits/chosen": -2.4885904788970947, "logits/rejected": -2.5950427055358887, "logps/chosen": -365.8173828125, "logps/rejected": -384.715576171875, "loss": 0.0461, "rewards/accuracies": 1.0, "rewards/chosen": 2.426532506942749, "rewards/margins": 5.877685070037842, "rewards/rejected": -3.4511523246765137, "step": 3074 }, { "epoch": 2.2465753424657535, "grad_norm": 6.55715404928649, "learning_rate": 2.3963520540690723e-07, "logits/chosen": -3.247312545776367, "logits/rejected": -1.8911607265472412, "logps/chosen": -726.4578247070312, "logps/rejected": -518.1455688476562, "loss": 0.0273, "rewards/accuracies": 1.0, "rewards/chosen": 3.806878089904785, "rewards/margins": 5.882706642150879, "rewards/rejected": -2.0758283138275146, "step": 3075 }, { "epoch": 2.2473059360730594, "grad_norm": 10.425139823726024, "learning_rate": 2.394758405742075e-07, "logits/chosen": -2.4121718406677246, "logits/rejected": -2.3156802654266357, "logps/chosen": -700.5892944335938, "logps/rejected": -554.9466552734375, "loss": 0.0305, "rewards/accuracies": 1.0, "rewards/chosen": 1.6325366497039795, "rewards/margins": 4.522793292999268, "rewards/rejected": -2.890256881713867, "step": 3076 }, { "epoch": 2.2480365296803653, "grad_norm": 12.52574436313411, "learning_rate": 2.3931648002552353e-07, "logits/chosen": -2.924220323562622, "logits/rejected": -2.0053110122680664, "logps/chosen": -759.5297241210938, "logps/rejected": -688.9043579101562, "loss": 0.0458, "rewards/accuracies": 1.0, "rewards/chosen": 3.0246548652648926, "rewards/margins": 3.6361169815063477, "rewards/rejected": -0.6114619374275208, "step": 3077 }, { "epoch": 2.248767123287671, "grad_norm": 16.374401751357016, "learning_rate": 2.391571238257255e-07, "logits/chosen": -2.5963010787963867, "logits/rejected": -2.762437343597412, "logps/chosen": -417.7877197265625, "logps/rejected": -607.5491943359375, "loss": 0.0659, "rewards/accuracies": 0.875, "rewards/chosen": 2.326533794403076, "rewards/margins": 6.081949234008789, "rewards/rejected": -3.755415439605713, "step": 3078 }, { "epoch": 2.249497716894977, "grad_norm": 31.527539037695878, "learning_rate": 2.3899777203968156e-07, "logits/chosen": -2.068499803543091, "logits/rejected": -1.9304603338241577, "logps/chosen": -309.74114990234375, "logps/rejected": -431.360595703125, "loss": 0.1602, "rewards/accuracies": 1.0, "rewards/chosen": 0.80974280834198, "rewards/margins": 4.44535493850708, "rewards/rejected": -3.6356120109558105, "step": 3079 }, { "epoch": 2.2502283105022833, "grad_norm": 29.570217552687726, "learning_rate": 2.388384247322583e-07, "logits/chosen": -2.5402770042419434, "logits/rejected": -2.188675880432129, "logps/chosen": -408.0318298339844, "logps/rejected": -406.7096252441406, "loss": 0.0751, "rewards/accuracies": 1.0, "rewards/chosen": 3.765800714492798, "rewards/margins": 6.913273811340332, "rewards/rejected": -3.147473096847534, "step": 3080 }, { "epoch": 2.250958904109589, "grad_norm": 9.67256835029926, "learning_rate": 2.386790819683204e-07, "logits/chosen": -3.017230272293091, "logits/rejected": -2.607055187225342, "logps/chosen": -730.4850463867188, "logps/rejected": -762.3682250976562, "loss": 0.0416, "rewards/accuracies": 1.0, "rewards/chosen": 2.8263778686523438, "rewards/margins": 4.219854354858398, "rewards/rejected": -1.3934767246246338, "step": 3081 }, { "epoch": 2.251689497716895, "grad_norm": 12.490556124911745, "learning_rate": 2.385197438127308e-07, "logits/chosen": -2.647766351699829, "logits/rejected": -2.229233741760254, "logps/chosen": -656.36328125, "logps/rejected": -442.40557861328125, "loss": 0.0518, "rewards/accuracies": 1.0, "rewards/chosen": 2.3650319576263428, "rewards/margins": 5.348256587982178, "rewards/rejected": -2.983224630355835, "step": 3082 }, { "epoch": 2.252420091324201, "grad_norm": 13.0490828970133, "learning_rate": 2.383604103303503e-07, "logits/chosen": -2.5903124809265137, "logits/rejected": -2.0172994136810303, "logps/chosen": -479.392822265625, "logps/rejected": -503.6304931640625, "loss": 0.0556, "rewards/accuracies": 1.0, "rewards/chosen": 2.3162922859191895, "rewards/margins": 5.179887771606445, "rewards/rejected": -2.863595962524414, "step": 3083 }, { "epoch": 2.253150684931507, "grad_norm": 18.101771551447147, "learning_rate": 2.3820108158603807e-07, "logits/chosen": -2.7588136196136475, "logits/rejected": -2.0266480445861816, "logps/chosen": -552.9364013671875, "logps/rejected": -311.2945861816406, "loss": 0.1153, "rewards/accuracies": 0.875, "rewards/chosen": 2.3501734733581543, "rewards/margins": 3.5370869636535645, "rewards/rejected": -1.1869136095046997, "step": 3084 }, { "epoch": 2.2538812785388127, "grad_norm": 12.080064704165949, "learning_rate": 2.3804175764465123e-07, "logits/chosen": -3.1499884128570557, "logits/rejected": -2.639643907546997, "logps/chosen": -953.4600830078125, "logps/rejected": -885.6453247070312, "loss": 0.0676, "rewards/accuracies": 1.0, "rewards/chosen": 4.329641342163086, "rewards/margins": 4.583196640014648, "rewards/rejected": -0.2535558342933655, "step": 3085 }, { "epoch": 2.2546118721461186, "grad_norm": 21.34743717007896, "learning_rate": 2.3788243857104496e-07, "logits/chosen": -2.552309036254883, "logits/rejected": -2.122591257095337, "logps/chosen": -587.369140625, "logps/rejected": -464.396484375, "loss": 0.1272, "rewards/accuracies": 1.0, "rewards/chosen": 3.285489559173584, "rewards/margins": 6.386021614074707, "rewards/rejected": -3.100532054901123, "step": 3086 }, { "epoch": 2.255342465753425, "grad_norm": 9.704975338746802, "learning_rate": 2.3772312443007258e-07, "logits/chosen": -3.5046849250793457, "logits/rejected": -2.2932112216949463, "logps/chosen": -627.720703125, "logps/rejected": -392.4347839355469, "loss": 0.0595, "rewards/accuracies": 1.0, "rewards/chosen": 2.8689162731170654, "rewards/margins": 5.217012882232666, "rewards/rejected": -2.3480963706970215, "step": 3087 }, { "epoch": 2.2560730593607308, "grad_norm": 15.982446101083008, "learning_rate": 2.3756381528658503e-07, "logits/chosen": -2.798645257949829, "logits/rejected": -2.1523325443267822, "logps/chosen": -586.471923828125, "logps/rejected": -465.03997802734375, "loss": 0.097, "rewards/accuracies": 1.0, "rewards/chosen": 2.8370656967163086, "rewards/margins": 3.872011423110962, "rewards/rejected": -1.0349457263946533, "step": 3088 }, { "epoch": 2.2568036529680366, "grad_norm": 8.377943290381921, "learning_rate": 2.3740451120543164e-07, "logits/chosen": -2.564542055130005, "logits/rejected": -1.6966032981872559, "logps/chosen": -511.0589599609375, "logps/rejected": -364.2130432128906, "loss": 0.0326, "rewards/accuracies": 1.0, "rewards/chosen": 2.3381705284118652, "rewards/margins": 4.928631782531738, "rewards/rejected": -2.590461492538452, "step": 3089 }, { "epoch": 2.2575342465753425, "grad_norm": 11.560899778413592, "learning_rate": 2.3724521225145942e-07, "logits/chosen": -3.206014633178711, "logits/rejected": -2.838845729827881, "logps/chosen": -645.72021484375, "logps/rejected": -565.6578979492188, "loss": 0.0479, "rewards/accuracies": 1.0, "rewards/chosen": 2.9334189891815186, "rewards/margins": 3.222334146499634, "rewards/rejected": -0.28891557455062866, "step": 3090 }, { "epoch": 2.2582648401826484, "grad_norm": 9.32439957800149, "learning_rate": 2.3708591848951355e-07, "logits/chosen": -2.8536930084228516, "logits/rejected": -2.4812698364257812, "logps/chosen": -636.237548828125, "logps/rejected": -558.3812866210938, "loss": 0.0414, "rewards/accuracies": 1.0, "rewards/chosen": 3.0189197063446045, "rewards/margins": 3.8787317276000977, "rewards/rejected": -0.8598120212554932, "step": 3091 }, { "epoch": 2.2589954337899543, "grad_norm": 18.374760840280345, "learning_rate": 2.3692662998443674e-07, "logits/chosen": -2.2917966842651367, "logits/rejected": -1.9950108528137207, "logps/chosen": -593.1754150390625, "logps/rejected": -440.1452941894531, "loss": 0.0958, "rewards/accuracies": 0.875, "rewards/chosen": 3.0407652854919434, "rewards/margins": 4.104647159576416, "rewards/rejected": -1.0638818740844727, "step": 3092 }, { "epoch": 2.25972602739726, "grad_norm": 11.764644505957778, "learning_rate": 2.367673468010698e-07, "logits/chosen": -2.7637903690338135, "logits/rejected": -2.0418167114257812, "logps/chosen": -566.62255859375, "logps/rejected": -498.753662109375, "loss": 0.0483, "rewards/accuracies": 1.0, "rewards/chosen": 2.7408902645111084, "rewards/margins": 3.7153477668762207, "rewards/rejected": -0.9744571447372437, "step": 3093 }, { "epoch": 2.2604566210045665, "grad_norm": 16.884507955126164, "learning_rate": 2.3660806900425135e-07, "logits/chosen": -2.438626527786255, "logits/rejected": -2.3174896240234375, "logps/chosen": -431.6646728515625, "logps/rejected": -514.6422729492188, "loss": 0.0651, "rewards/accuracies": 1.0, "rewards/chosen": 3.400794506072998, "rewards/margins": 7.28790807723999, "rewards/rejected": -3.887113571166992, "step": 3094 }, { "epoch": 2.2611872146118723, "grad_norm": 14.8716570519791, "learning_rate": 2.3644879665881784e-07, "logits/chosen": -2.862874984741211, "logits/rejected": -1.9194920063018799, "logps/chosen": -554.7900390625, "logps/rejected": -433.1768493652344, "loss": 0.0668, "rewards/accuracies": 1.0, "rewards/chosen": 3.2518653869628906, "rewards/margins": 4.201440811157227, "rewards/rejected": -0.9495749473571777, "step": 3095 }, { "epoch": 2.261917808219178, "grad_norm": 10.812020881048397, "learning_rate": 2.3628952982960333e-07, "logits/chosen": -2.675750255584717, "logits/rejected": -1.7936367988586426, "logps/chosen": -517.7080688476562, "logps/rejected": -390.07269287109375, "loss": 0.0457, "rewards/accuracies": 1.0, "rewards/chosen": 3.397460460662842, "rewards/margins": 5.626778602600098, "rewards/rejected": -2.229318618774414, "step": 3096 }, { "epoch": 2.262648401826484, "grad_norm": 8.03581897485749, "learning_rate": 2.361302685814398e-07, "logits/chosen": -3.012935161590576, "logits/rejected": -2.6615827083587646, "logps/chosen": -715.3536376953125, "logps/rejected": -595.3857421875, "loss": 0.0263, "rewards/accuracies": 1.0, "rewards/chosen": 2.8179666996002197, "rewards/margins": 4.61037015914917, "rewards/rejected": -1.7924034595489502, "step": 3097 }, { "epoch": 2.26337899543379, "grad_norm": 12.1206487339755, "learning_rate": 2.3597101297915688e-07, "logits/chosen": -2.44863224029541, "logits/rejected": -2.2168235778808594, "logps/chosen": -565.9179077148438, "logps/rejected": -538.3678588867188, "loss": 0.0462, "rewards/accuracies": 1.0, "rewards/chosen": 2.0393731594085693, "rewards/margins": 4.503852367401123, "rewards/rejected": -2.4644792079925537, "step": 3098 }, { "epoch": 2.264109589041096, "grad_norm": 11.578890673484901, "learning_rate": 2.3581176308758202e-07, "logits/chosen": -2.878774642944336, "logits/rejected": -1.7864142656326294, "logps/chosen": -882.8489990234375, "logps/rejected": -501.13525390625, "loss": 0.0506, "rewards/accuracies": 1.0, "rewards/chosen": 4.680765151977539, "rewards/margins": 6.538257598876953, "rewards/rejected": -1.857492208480835, "step": 3099 }, { "epoch": 2.2648401826484017, "grad_norm": 15.498661106487553, "learning_rate": 2.3565251897154028e-07, "logits/chosen": -2.2777369022369385, "logits/rejected": -2.5054328441619873, "logps/chosen": -279.1280212402344, "logps/rejected": -388.8374938964844, "loss": 0.1032, "rewards/accuracies": 1.0, "rewards/chosen": 0.7949658036231995, "rewards/margins": 2.4852213859558105, "rewards/rejected": -1.6902556419372559, "step": 3100 }, { "epoch": 2.2655707762557076, "grad_norm": 15.509669579454851, "learning_rate": 2.3549328069585423e-07, "logits/chosen": -3.236095666885376, "logits/rejected": -2.2854950428009033, "logps/chosen": -391.2782897949219, "logps/rejected": -358.0157470703125, "loss": 0.0495, "rewards/accuracies": 1.0, "rewards/chosen": 1.5003211498260498, "rewards/margins": 4.487823486328125, "rewards/rejected": -2.987502098083496, "step": 3101 }, { "epoch": 2.266301369863014, "grad_norm": 10.881470257207713, "learning_rate": 2.3533404832534421e-07, "logits/chosen": -2.7650997638702393, "logits/rejected": -2.4088144302368164, "logps/chosen": -666.760498046875, "logps/rejected": -510.61041259765625, "loss": 0.0461, "rewards/accuracies": 1.0, "rewards/chosen": 3.7817764282226562, "rewards/margins": 4.762071132659912, "rewards/rejected": -0.9802944660186768, "step": 3102 }, { "epoch": 2.2670319634703198, "grad_norm": 10.327001744303884, "learning_rate": 2.3517482192482816e-07, "logits/chosen": -3.1443450450897217, "logits/rejected": -2.326390266418457, "logps/chosen": -577.4367065429688, "logps/rejected": -444.1109619140625, "loss": 0.0761, "rewards/accuracies": 1.0, "rewards/chosen": 3.947603940963745, "rewards/margins": 5.958813667297363, "rewards/rejected": -2.011209726333618, "step": 3103 }, { "epoch": 2.2677625570776256, "grad_norm": 29.75958559137396, "learning_rate": 2.3501560155912166e-07, "logits/chosen": -3.0008606910705566, "logits/rejected": -2.2646100521087646, "logps/chosen": -887.9580078125, "logps/rejected": -653.190185546875, "loss": 0.1205, "rewards/accuracies": 1.0, "rewards/chosen": 4.405051231384277, "rewards/margins": 5.953527450561523, "rewards/rejected": -1.5484765768051147, "step": 3104 }, { "epoch": 2.2684931506849315, "grad_norm": 15.36613096901856, "learning_rate": 2.3485638729303748e-07, "logits/chosen": -2.2472052574157715, "logits/rejected": -1.9502842426300049, "logps/chosen": -487.650390625, "logps/rejected": -476.3095703125, "loss": 0.0748, "rewards/accuracies": 0.875, "rewards/chosen": 3.4149718284606934, "rewards/margins": 4.812598705291748, "rewards/rejected": -1.3976272344589233, "step": 3105 }, { "epoch": 2.2692237442922374, "grad_norm": 18.420319392016694, "learning_rate": 2.3469717919138631e-07, "logits/chosen": -2.970930814743042, "logits/rejected": -2.3279855251312256, "logps/chosen": -662.36572265625, "logps/rejected": -340.41168212890625, "loss": 0.1055, "rewards/accuracies": 0.875, "rewards/chosen": 2.7709031105041504, "rewards/margins": 3.2266464233398438, "rewards/rejected": -0.4557434320449829, "step": 3106 }, { "epoch": 2.2699543378995433, "grad_norm": 10.6607062455453, "learning_rate": 2.3453797731897617e-07, "logits/chosen": -2.3085615634918213, "logits/rejected": -2.1462340354919434, "logps/chosen": -607.923828125, "logps/rejected": -545.5018920898438, "loss": 0.0358, "rewards/accuracies": 1.0, "rewards/chosen": 2.565918445587158, "rewards/margins": 3.8299365043640137, "rewards/rejected": -1.2640180587768555, "step": 3107 }, { "epoch": 2.270684931506849, "grad_norm": 8.369793996155815, "learning_rate": 2.3437878174061258e-07, "logits/chosen": -2.9721853733062744, "logits/rejected": -2.533357620239258, "logps/chosen": -951.3237915039062, "logps/rejected": -871.2843017578125, "loss": 0.0441, "rewards/accuracies": 1.0, "rewards/chosen": 5.422266006469727, "rewards/margins": 5.480483055114746, "rewards/rejected": -0.058217406272888184, "step": 3108 }, { "epoch": 2.271415525114155, "grad_norm": 14.721166327336615, "learning_rate": 2.3421959252109842e-07, "logits/chosen": -2.506721019744873, "logits/rejected": -1.810185432434082, "logps/chosen": -921.115478515625, "logps/rejected": -424.118896484375, "loss": 0.0697, "rewards/accuracies": 1.0, "rewards/chosen": 2.0862934589385986, "rewards/margins": 4.234294414520264, "rewards/rejected": -2.148001194000244, "step": 3109 }, { "epoch": 2.2721461187214613, "grad_norm": 6.074600256809439, "learning_rate": 2.3406040972523402e-07, "logits/chosen": -2.6988117694854736, "logits/rejected": -2.249835729598999, "logps/chosen": -847.0960083007812, "logps/rejected": -778.6820678710938, "loss": 0.0197, "rewards/accuracies": 1.0, "rewards/chosen": 3.977180242538452, "rewards/margins": 4.387564659118652, "rewards/rejected": -0.4103846549987793, "step": 3110 }, { "epoch": 2.272876712328767, "grad_norm": 15.779901125831417, "learning_rate": 2.3390123341781716e-07, "logits/chosen": -2.481846332550049, "logits/rejected": -1.879641056060791, "logps/chosen": -491.9884948730469, "logps/rejected": -346.2103271484375, "loss": 0.0678, "rewards/accuracies": 1.0, "rewards/chosen": 2.2340891361236572, "rewards/margins": 3.8893566131591797, "rewards/rejected": -1.6552672386169434, "step": 3111 }, { "epoch": 2.273607305936073, "grad_norm": 10.024358836382824, "learning_rate": 2.3374206366364284e-07, "logits/chosen": -2.557251214981079, "logits/rejected": -2.532776355743408, "logps/chosen": -960.4857177734375, "logps/rejected": -860.701171875, "loss": 0.0481, "rewards/accuracies": 1.0, "rewards/chosen": 4.761882305145264, "rewards/margins": 4.4177117347717285, "rewards/rejected": 0.3441706597805023, "step": 3112 }, { "epoch": 2.274337899543379, "grad_norm": 14.936922836326074, "learning_rate": 2.3358290052750365e-07, "logits/chosen": -2.927680253982544, "logits/rejected": -2.597663164138794, "logps/chosen": -607.8057250976562, "logps/rejected": -629.8041381835938, "loss": 0.0925, "rewards/accuracies": 1.0, "rewards/chosen": 3.816993474960327, "rewards/margins": 6.08394718170166, "rewards/rejected": -2.266953706741333, "step": 3113 }, { "epoch": 2.275068493150685, "grad_norm": 18.723599577868132, "learning_rate": 2.334237440741891e-07, "logits/chosen": -2.8184409141540527, "logits/rejected": -2.7361812591552734, "logps/chosen": -558.6000366210938, "logps/rejected": -519.298828125, "loss": 0.0969, "rewards/accuracies": 1.0, "rewards/chosen": 2.544214963912964, "rewards/margins": 4.475652694702148, "rewards/rejected": -1.9314379692077637, "step": 3114 }, { "epoch": 2.2757990867579907, "grad_norm": 13.122672319638015, "learning_rate": 2.3326459436848633e-07, "logits/chosen": -2.5023386478424072, "logits/rejected": -2.055203437805176, "logps/chosen": -758.7469482421875, "logps/rejected": -492.89599609375, "loss": 0.0531, "rewards/accuracies": 1.0, "rewards/chosen": 3.0666041374206543, "rewards/margins": 5.356876850128174, "rewards/rejected": -2.2902724742889404, "step": 3115 }, { "epoch": 2.2765296803652966, "grad_norm": 12.30245577479636, "learning_rate": 2.3310545147517954e-07, "logits/chosen": -2.951233148574829, "logits/rejected": -2.1674506664276123, "logps/chosen": -403.7108154296875, "logps/rejected": -414.4952697753906, "loss": 0.0561, "rewards/accuracies": 1.0, "rewards/chosen": 3.211256980895996, "rewards/margins": 6.270114898681641, "rewards/rejected": -3.0588583946228027, "step": 3116 }, { "epoch": 2.277260273972603, "grad_norm": 13.56550360965614, "learning_rate": 2.329463154590503e-07, "logits/chosen": -2.7639706134796143, "logits/rejected": -2.352367877960205, "logps/chosen": -372.8113098144531, "logps/rejected": -380.9433288574219, "loss": 0.0917, "rewards/accuracies": 1.0, "rewards/chosen": 2.8460211753845215, "rewards/margins": 4.2903265953063965, "rewards/rejected": -1.444305419921875, "step": 3117 }, { "epoch": 2.2779908675799088, "grad_norm": 11.463786285346293, "learning_rate": 2.3278718638487718e-07, "logits/chosen": -2.7682695388793945, "logits/rejected": -1.9788637161254883, "logps/chosen": -548.9547119140625, "logps/rejected": -491.54022216796875, "loss": 0.0511, "rewards/accuracies": 1.0, "rewards/chosen": 4.123069763183594, "rewards/margins": 5.923129081726074, "rewards/rejected": -1.8000595569610596, "step": 3118 }, { "epoch": 2.2787214611872146, "grad_norm": 7.243650879070673, "learning_rate": 2.3262806431743608e-07, "logits/chosen": -3.365170955657959, "logits/rejected": -1.7027440071105957, "logps/chosen": -953.6943359375, "logps/rejected": -483.6936340332031, "loss": 0.0269, "rewards/accuracies": 1.0, "rewards/chosen": 3.7967305183410645, "rewards/margins": 5.20890998840332, "rewards/rejected": -1.4121795892715454, "step": 3119 }, { "epoch": 2.2794520547945205, "grad_norm": 9.54200213202135, "learning_rate": 2.3246894932150003e-07, "logits/chosen": -2.887509822845459, "logits/rejected": -2.3405513763427734, "logps/chosen": -730.5855102539062, "logps/rejected": -544.499755859375, "loss": 0.0538, "rewards/accuracies": 1.0, "rewards/chosen": 3.1550588607788086, "rewards/margins": 4.440971851348877, "rewards/rejected": -1.2859128713607788, "step": 3120 }, { "epoch": 2.2801826484018264, "grad_norm": 11.280380257764612, "learning_rate": 2.3230984146183923e-07, "logits/chosen": -2.853663921356201, "logits/rejected": -2.309612274169922, "logps/chosen": -604.5450439453125, "logps/rejected": -569.8834228515625, "loss": 0.0611, "rewards/accuracies": 0.875, "rewards/chosen": 2.0899908542633057, "rewards/margins": 3.297210454940796, "rewards/rejected": -1.2072196006774902, "step": 3121 }, { "epoch": 2.2809132420091323, "grad_norm": 16.422848746106748, "learning_rate": 2.3215074080322073e-07, "logits/chosen": -2.808953285217285, "logits/rejected": -2.2357358932495117, "logps/chosen": -712.1658325195312, "logps/rejected": -651.1797485351562, "loss": 0.0651, "rewards/accuracies": 1.0, "rewards/chosen": 4.957790374755859, "rewards/margins": 5.2203168869018555, "rewards/rejected": -0.26252639293670654, "step": 3122 }, { "epoch": 2.281643835616438, "grad_norm": 9.17677521955157, "learning_rate": 2.319916474104089e-07, "logits/chosen": -2.7719435691833496, "logits/rejected": -2.7846786975860596, "logps/chosen": -944.2603759765625, "logps/rejected": -899.6917724609375, "loss": 0.037, "rewards/accuracies": 1.0, "rewards/chosen": 4.742986679077148, "rewards/margins": 4.940264701843262, "rewards/rejected": -0.19727808237075806, "step": 3123 }, { "epoch": 2.2823744292237444, "grad_norm": 11.60230032376056, "learning_rate": 2.3183256134816504e-07, "logits/chosen": -2.601046323776245, "logits/rejected": -1.5389492511749268, "logps/chosen": -595.7069702148438, "logps/rejected": -337.3862609863281, "loss": 0.0578, "rewards/accuracies": 0.875, "rewards/chosen": 2.5964555740356445, "rewards/margins": 4.93492317199707, "rewards/rejected": -2.338467597961426, "step": 3124 }, { "epoch": 2.2831050228310503, "grad_norm": 12.2190852985085, "learning_rate": 2.3167348268124754e-07, "logits/chosen": -2.7991859912872314, "logits/rejected": -2.292172908782959, "logps/chosen": -684.0245361328125, "logps/rejected": -540.640380859375, "loss": 0.0524, "rewards/accuracies": 1.0, "rewards/chosen": 1.9076848030090332, "rewards/margins": 4.215269565582275, "rewards/rejected": -2.307584762573242, "step": 3125 }, { "epoch": 2.283835616438356, "grad_norm": 15.741954383354603, "learning_rate": 2.3151441147441185e-07, "logits/chosen": -2.852497100830078, "logits/rejected": -1.8511691093444824, "logps/chosen": -743.0584716796875, "logps/rejected": -513.2606201171875, "loss": 0.0784, "rewards/accuracies": 1.0, "rewards/chosen": 4.607668399810791, "rewards/margins": 6.350686073303223, "rewards/rejected": -1.7430176734924316, "step": 3126 }, { "epoch": 2.284566210045662, "grad_norm": 17.60215458252954, "learning_rate": 2.313553477924101e-07, "logits/chosen": -3.159738063812256, "logits/rejected": -2.475182294845581, "logps/chosen": -1141.62646484375, "logps/rejected": -761.84130859375, "loss": 0.0644, "rewards/accuracies": 1.0, "rewards/chosen": 4.5185546875, "rewards/margins": 6.805118083953857, "rewards/rejected": -2.2865636348724365, "step": 3127 }, { "epoch": 2.285296803652968, "grad_norm": 24.100938251640226, "learning_rate": 2.3119629169999157e-07, "logits/chosen": -2.636563777923584, "logits/rejected": -2.629024028778076, "logps/chosen": -911.736083984375, "logps/rejected": -1027.953857421875, "loss": 0.1023, "rewards/accuracies": 1.0, "rewards/chosen": 5.269268989562988, "rewards/margins": 4.135618686676025, "rewards/rejected": 1.1336506605148315, "step": 3128 }, { "epoch": 2.286027397260274, "grad_norm": 10.684183598890684, "learning_rate": 2.3103724326190247e-07, "logits/chosen": -2.871812343597412, "logits/rejected": -1.529870629310608, "logps/chosen": -602.0718994140625, "logps/rejected": -339.7403259277344, "loss": 0.0583, "rewards/accuracies": 1.0, "rewards/chosen": 3.3320999145507812, "rewards/margins": 5.447383880615234, "rewards/rejected": -2.115283966064453, "step": 3129 }, { "epoch": 2.2867579908675797, "grad_norm": 22.567792023510105, "learning_rate": 2.308782025428858e-07, "logits/chosen": -2.7826714515686035, "logits/rejected": -2.1511006355285645, "logps/chosen": -753.26416015625, "logps/rejected": -692.6107788085938, "loss": 0.0782, "rewards/accuracies": 1.0, "rewards/chosen": 3.154961585998535, "rewards/margins": 4.063990592956543, "rewards/rejected": -0.9090288877487183, "step": 3130 }, { "epoch": 2.287488584474886, "grad_norm": 15.93534950258485, "learning_rate": 2.3071916960768141e-07, "logits/chosen": -2.616663932800293, "logits/rejected": -2.41286301612854, "logps/chosen": -794.6124267578125, "logps/rejected": -959.344970703125, "loss": 0.0763, "rewards/accuracies": 1.0, "rewards/chosen": 4.496408462524414, "rewards/margins": 5.5486860275268555, "rewards/rejected": -1.0522769689559937, "step": 3131 }, { "epoch": 2.288219178082192, "grad_norm": 14.92866491052506, "learning_rate": 2.305601445210261e-07, "logits/chosen": -3.1042890548706055, "logits/rejected": -2.2559683322906494, "logps/chosen": -672.3598022460938, "logps/rejected": -617.5579223632812, "loss": 0.0565, "rewards/accuracies": 1.0, "rewards/chosen": 3.4139513969421387, "rewards/margins": 5.5361552238464355, "rewards/rejected": -2.122203826904297, "step": 3132 }, { "epoch": 2.2889497716894978, "grad_norm": 10.928290975268581, "learning_rate": 2.3040112734765333e-07, "logits/chosen": -2.612513542175293, "logits/rejected": -2.023287296295166, "logps/chosen": -801.5894775390625, "logps/rejected": -576.9562377929688, "loss": 0.0652, "rewards/accuracies": 1.0, "rewards/chosen": 2.880077362060547, "rewards/margins": 4.679506778717041, "rewards/rejected": -1.7994296550750732, "step": 3133 }, { "epoch": 2.2896803652968036, "grad_norm": 8.597419316078772, "learning_rate": 2.3024211815229354e-07, "logits/chosen": -2.2919492721557617, "logits/rejected": -1.9700746536254883, "logps/chosen": -531.5130615234375, "logps/rejected": -565.0948486328125, "loss": 0.0298, "rewards/accuracies": 1.0, "rewards/chosen": 2.921165704727173, "rewards/margins": 5.2865705490112305, "rewards/rejected": -2.3654048442840576, "step": 3134 }, { "epoch": 2.2904109589041095, "grad_norm": 14.747226777663519, "learning_rate": 2.3008311699967356e-07, "logits/chosen": -2.5902762413024902, "logits/rejected": -1.5332043170928955, "logps/chosen": -514.5345458984375, "logps/rejected": -485.70654296875, "loss": 0.0722, "rewards/accuracies": 1.0, "rewards/chosen": 2.3761210441589355, "rewards/margins": 3.9387364387512207, "rewards/rejected": -1.562614917755127, "step": 3135 }, { "epoch": 2.2911415525114154, "grad_norm": 13.404611482948287, "learning_rate": 2.2992412395451736e-07, "logits/chosen": -2.1850523948669434, "logits/rejected": -1.6456142663955688, "logps/chosen": -687.5342407226562, "logps/rejected": -640.2496337890625, "loss": 0.0801, "rewards/accuracies": 0.875, "rewards/chosen": 4.3633809089660645, "rewards/margins": 6.741365909576416, "rewards/rejected": -2.3779852390289307, "step": 3136 }, { "epoch": 2.2918721461187213, "grad_norm": 8.651051416607865, "learning_rate": 2.2976513908154534e-07, "logits/chosen": -2.795466184616089, "logits/rejected": -1.963820457458496, "logps/chosen": -450.3109130859375, "logps/rejected": -435.4886169433594, "loss": 0.0473, "rewards/accuracies": 1.0, "rewards/chosen": 3.1811177730560303, "rewards/margins": 6.7462053298950195, "rewards/rejected": -3.5650877952575684, "step": 3137 }, { "epoch": 2.2926027397260276, "grad_norm": 8.694380635713614, "learning_rate": 2.296061624454747e-07, "logits/chosen": -3.4208755493164062, "logits/rejected": -1.6721560955047607, "logps/chosen": -972.9534912109375, "logps/rejected": -508.2578430175781, "loss": 0.0385, "rewards/accuracies": 1.0, "rewards/chosen": 4.8142499923706055, "rewards/margins": 4.716618537902832, "rewards/rejected": 0.09763127565383911, "step": 3138 }, { "epoch": 2.2933333333333334, "grad_norm": 9.054062276192488, "learning_rate": 2.2944719411101938e-07, "logits/chosen": -2.909635066986084, "logits/rejected": -2.656367778778076, "logps/chosen": -535.8538818359375, "logps/rejected": -588.3273315429688, "loss": 0.0374, "rewards/accuracies": 1.0, "rewards/chosen": 3.488328695297241, "rewards/margins": 5.169580459594727, "rewards/rejected": -1.6812520027160645, "step": 3139 }, { "epoch": 2.2940639269406393, "grad_norm": 12.052095501672948, "learning_rate": 2.2928823414288952e-07, "logits/chosen": -2.561760902404785, "logits/rejected": -2.8699164390563965, "logps/chosen": -634.9892578125, "logps/rejected": -705.1947631835938, "loss": 0.0559, "rewards/accuracies": 1.0, "rewards/chosen": 4.003620147705078, "rewards/margins": 3.1917884349823, "rewards/rejected": 0.8118319511413574, "step": 3140 }, { "epoch": 2.294794520547945, "grad_norm": 13.797269269033874, "learning_rate": 2.291292826057923e-07, "logits/chosen": -2.6915555000305176, "logits/rejected": -1.499861478805542, "logps/chosen": -706.4393310546875, "logps/rejected": -358.0982360839844, "loss": 0.1032, "rewards/accuracies": 1.0, "rewards/chosen": 3.996476173400879, "rewards/margins": 6.1300554275512695, "rewards/rejected": -2.1335792541503906, "step": 3141 }, { "epoch": 2.295525114155251, "grad_norm": 7.346495289727762, "learning_rate": 2.289703395644313e-07, "logits/chosen": -2.8282501697540283, "logits/rejected": -2.1790695190429688, "logps/chosen": -1084.7613525390625, "logps/rejected": -710.8348388671875, "loss": 0.0276, "rewards/accuracies": 1.0, "rewards/chosen": 5.397960186004639, "rewards/margins": 4.837070941925049, "rewards/rejected": 0.5608898401260376, "step": 3142 }, { "epoch": 2.296255707762557, "grad_norm": 4.830517025082745, "learning_rate": 2.288114050835067e-07, "logits/chosen": -2.603048801422119, "logits/rejected": -2.210824728012085, "logps/chosen": -690.95654296875, "logps/rejected": -729.0338134765625, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": 5.041374206542969, "rewards/margins": 8.654999732971191, "rewards/rejected": -3.613626003265381, "step": 3143 }, { "epoch": 2.296986301369863, "grad_norm": 13.692750246088542, "learning_rate": 2.2865247922771506e-07, "logits/chosen": -2.741002082824707, "logits/rejected": -1.3654205799102783, "logps/chosen": -661.4375610351562, "logps/rejected": -370.55908203125, "loss": 0.0667, "rewards/accuracies": 1.0, "rewards/chosen": 4.259252071380615, "rewards/margins": 6.775819778442383, "rewards/rejected": -2.5165677070617676, "step": 3144 }, { "epoch": 2.297716894977169, "grad_norm": 8.892580913339668, "learning_rate": 2.2849356206174953e-07, "logits/chosen": -2.93772554397583, "logits/rejected": -2.109929084777832, "logps/chosen": -538.5630493164062, "logps/rejected": -372.91217041015625, "loss": 0.0687, "rewards/accuracies": 0.875, "rewards/chosen": 3.2278432846069336, "rewards/margins": 4.202573299407959, "rewards/rejected": -0.9747298955917358, "step": 3145 }, { "epoch": 2.298447488584475, "grad_norm": 12.152978451964204, "learning_rate": 2.2833465365029972e-07, "logits/chosen": -2.793074369430542, "logits/rejected": -1.8726310729980469, "logps/chosen": -642.5059814453125, "logps/rejected": -595.1514282226562, "loss": 0.0598, "rewards/accuracies": 1.0, "rewards/chosen": 6.018845081329346, "rewards/margins": 7.053883075714111, "rewards/rejected": -1.0350377559661865, "step": 3146 }, { "epoch": 2.299178082191781, "grad_norm": 22.470074907694944, "learning_rate": 2.2817575405805167e-07, "logits/chosen": -2.0522971153259277, "logits/rejected": -2.457056999206543, "logps/chosen": -407.3564453125, "logps/rejected": -521.8697509765625, "loss": 0.1004, "rewards/accuracies": 1.0, "rewards/chosen": 2.595855712890625, "rewards/margins": 5.026654243469238, "rewards/rejected": -2.4307987689971924, "step": 3147 }, { "epoch": 2.2999086757990868, "grad_norm": 32.63289573098076, "learning_rate": 2.280168633496879e-07, "logits/chosen": -2.6022024154663086, "logits/rejected": -2.109328508377075, "logps/chosen": -644.6025390625, "logps/rejected": -508.9066467285156, "loss": 0.1059, "rewards/accuracies": 1.0, "rewards/chosen": 3.7476654052734375, "rewards/margins": 5.212294578552246, "rewards/rejected": -1.4646291732788086, "step": 3148 }, { "epoch": 2.3006392694063926, "grad_norm": 14.154611522177145, "learning_rate": 2.278579815898871e-07, "logits/chosen": -3.2470955848693848, "logits/rejected": -1.6969690322875977, "logps/chosen": -806.84423828125, "logps/rejected": -440.9238586425781, "loss": 0.0498, "rewards/accuracies": 1.0, "rewards/chosen": 4.464300632476807, "rewards/margins": 4.855470657348633, "rewards/rejected": -0.3911699652671814, "step": 3149 }, { "epoch": 2.3013698630136985, "grad_norm": 13.339532047053787, "learning_rate": 2.2769910884332453e-07, "logits/chosen": -2.939033031463623, "logits/rejected": -2.119109630584717, "logps/chosen": -588.824951171875, "logps/rejected": -513.5787353515625, "loss": 0.0728, "rewards/accuracies": 1.0, "rewards/chosen": 4.447937965393066, "rewards/margins": 6.3533549308776855, "rewards/rejected": -1.9054166078567505, "step": 3150 }, { "epoch": 2.3021004566210044, "grad_norm": 29.22773508239862, "learning_rate": 2.275402451746717e-07, "logits/chosen": -2.9012961387634277, "logits/rejected": -2.372316360473633, "logps/chosen": -789.0512084960938, "logps/rejected": -562.3557739257812, "loss": 0.0713, "rewards/accuracies": 1.0, "rewards/chosen": 4.816849231719971, "rewards/margins": 5.400808334350586, "rewards/rejected": -0.5839592218399048, "step": 3151 }, { "epoch": 2.3028310502283107, "grad_norm": 13.764081368296466, "learning_rate": 2.2738139064859648e-07, "logits/chosen": -2.4672069549560547, "logits/rejected": -1.9292504787445068, "logps/chosen": -565.8564453125, "logps/rejected": -429.4200134277344, "loss": 0.0681, "rewards/accuracies": 1.0, "rewards/chosen": 2.9110541343688965, "rewards/margins": 6.2535810470581055, "rewards/rejected": -3.342526912689209, "step": 3152 }, { "epoch": 2.3035616438356166, "grad_norm": 7.496217664019463, "learning_rate": 2.272225453297628e-07, "logits/chosen": -3.093367576599121, "logits/rejected": -2.143289089202881, "logps/chosen": -980.7719116210938, "logps/rejected": -576.3775634765625, "loss": 0.04, "rewards/accuracies": 1.0, "rewards/chosen": 4.697112083435059, "rewards/margins": 5.8365478515625, "rewards/rejected": -1.1394357681274414, "step": 3153 }, { "epoch": 2.3042922374429224, "grad_norm": 19.500554786028285, "learning_rate": 2.270637092828312e-07, "logits/chosen": -2.7099337577819824, "logits/rejected": -2.20207142829895, "logps/chosen": -977.8182983398438, "logps/rejected": -876.9442138671875, "loss": 0.0458, "rewards/accuracies": 1.0, "rewards/chosen": 3.780247688293457, "rewards/margins": 6.349765300750732, "rewards/rejected": -2.5695176124572754, "step": 3154 }, { "epoch": 2.3050228310502283, "grad_norm": 9.138597364584578, "learning_rate": 2.269048825724582e-07, "logits/chosen": -2.8185434341430664, "logits/rejected": -2.3291091918945312, "logps/chosen": -480.50799560546875, "logps/rejected": -500.1317138671875, "loss": 0.0504, "rewards/accuracies": 1.0, "rewards/chosen": 3.069218397140503, "rewards/margins": 6.646351337432861, "rewards/rejected": -3.5771334171295166, "step": 3155 }, { "epoch": 2.305753424657534, "grad_norm": 12.940278422973075, "learning_rate": 2.2674606526329663e-07, "logits/chosen": -2.6623034477233887, "logits/rejected": -2.141693592071533, "logps/chosen": -534.951171875, "logps/rejected": -402.849853515625, "loss": 0.0747, "rewards/accuracies": 1.0, "rewards/chosen": 2.5841097831726074, "rewards/margins": 3.976332187652588, "rewards/rejected": -1.3922226428985596, "step": 3156 }, { "epoch": 2.30648401826484, "grad_norm": 15.036143182566486, "learning_rate": 2.265872574199953e-07, "logits/chosen": -3.00832462310791, "logits/rejected": -1.8496909141540527, "logps/chosen": -685.0993041992188, "logps/rejected": -382.27813720703125, "loss": 0.0772, "rewards/accuracies": 1.0, "rewards/chosen": 2.6241283416748047, "rewards/margins": 3.832993745803833, "rewards/rejected": -1.2088654041290283, "step": 3157 }, { "epoch": 2.307214611872146, "grad_norm": 8.41935893393306, "learning_rate": 2.2642845910719934e-07, "logits/chosen": -2.8401906490325928, "logits/rejected": -1.5466586351394653, "logps/chosen": -389.5198974609375, "logps/rejected": -239.1134033203125, "loss": 0.046, "rewards/accuracies": 1.0, "rewards/chosen": 2.6971611976623535, "rewards/margins": 4.807807445526123, "rewards/rejected": -2.1106460094451904, "step": 3158 }, { "epoch": 2.3079452054794523, "grad_norm": 13.795410078478945, "learning_rate": 2.2626967038955005e-07, "logits/chosen": -2.569441795349121, "logits/rejected": -1.458561897277832, "logps/chosen": -609.0252075195312, "logps/rejected": -374.262939453125, "loss": 0.0804, "rewards/accuracies": 0.875, "rewards/chosen": 3.0611560344696045, "rewards/margins": 4.975792407989502, "rewards/rejected": -1.914636254310608, "step": 3159 }, { "epoch": 2.308675799086758, "grad_norm": 18.609026517255593, "learning_rate": 2.261108913316846e-07, "logits/chosen": -3.011862277984619, "logits/rejected": -1.85459566116333, "logps/chosen": -914.1095581054688, "logps/rejected": -560.2489013671875, "loss": 0.048, "rewards/accuracies": 1.0, "rewards/chosen": 5.487670421600342, "rewards/margins": 7.337454795837402, "rewards/rejected": -1.8497850894927979, "step": 3160 }, { "epoch": 2.309406392694064, "grad_norm": 24.05057791096843, "learning_rate": 2.259521219982367e-07, "logits/chosen": -2.185443162918091, "logits/rejected": -2.219618082046509, "logps/chosen": -450.0017395019531, "logps/rejected": -626.6937255859375, "loss": 0.0883, "rewards/accuracies": 1.0, "rewards/chosen": 2.341052293777466, "rewards/margins": 4.4247589111328125, "rewards/rejected": -2.0837063789367676, "step": 3161 }, { "epoch": 2.31013698630137, "grad_norm": 11.029074305738623, "learning_rate": 2.2579336245383536e-07, "logits/chosen": -3.0203452110290527, "logits/rejected": -1.8694652318954468, "logps/chosen": -756.0853271484375, "logps/rejected": -389.9556579589844, "loss": 0.0525, "rewards/accuracies": 1.0, "rewards/chosen": 2.8566136360168457, "rewards/margins": 4.840932846069336, "rewards/rejected": -1.9843193292617798, "step": 3162 }, { "epoch": 2.3108675799086758, "grad_norm": 7.280286471421096, "learning_rate": 2.2563461276310624e-07, "logits/chosen": -2.9363479614257812, "logits/rejected": -2.584082841873169, "logps/chosen": -660.553466796875, "logps/rejected": -725.402587890625, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": 4.692083835601807, "rewards/margins": 5.092308044433594, "rewards/rejected": -0.40022385120391846, "step": 3163 }, { "epoch": 2.3115981735159816, "grad_norm": 9.35376949925616, "learning_rate": 2.2547587299067072e-07, "logits/chosen": -2.350743055343628, "logits/rejected": -1.9067087173461914, "logps/chosen": -551.4371337890625, "logps/rejected": -532.6913452148438, "loss": 0.0375, "rewards/accuracies": 1.0, "rewards/chosen": 3.342444896697998, "rewards/margins": 6.252011299133301, "rewards/rejected": -2.9095659255981445, "step": 3164 }, { "epoch": 2.3123287671232875, "grad_norm": 6.853992618410996, "learning_rate": 2.2531714320114623e-07, "logits/chosen": -2.8824462890625, "logits/rejected": -1.8005790710449219, "logps/chosen": -868.5540161132812, "logps/rejected": -678.3992919921875, "loss": 0.0238, "rewards/accuracies": 1.0, "rewards/chosen": 4.095142364501953, "rewards/margins": 5.917638778686523, "rewards/rejected": -1.8224965333938599, "step": 3165 }, { "epoch": 2.313059360730594, "grad_norm": 22.842889646931166, "learning_rate": 2.25158423459146e-07, "logits/chosen": -3.39233660697937, "logits/rejected": -2.369504928588867, "logps/chosen": -543.8023681640625, "logps/rejected": -322.4954833984375, "loss": 0.0975, "rewards/accuracies": 1.0, "rewards/chosen": 2.950528860092163, "rewards/margins": 4.6069183349609375, "rewards/rejected": -1.6563892364501953, "step": 3166 }, { "epoch": 2.3137899543378997, "grad_norm": 15.861331371790396, "learning_rate": 2.2499971382927933e-07, "logits/chosen": -2.5316452980041504, "logits/rejected": -1.822100043296814, "logps/chosen": -763.8899536132812, "logps/rejected": -503.0246887207031, "loss": 0.0704, "rewards/accuracies": 0.875, "rewards/chosen": 3.6184804439544678, "rewards/margins": 3.74230694770813, "rewards/rejected": -0.12382686138153076, "step": 3167 }, { "epoch": 2.3145205479452056, "grad_norm": 23.965948744390445, "learning_rate": 2.248410143761513e-07, "logits/chosen": -2.3793132305145264, "logits/rejected": -1.8045401573181152, "logps/chosen": -632.008056640625, "logps/rejected": -513.8491821289062, "loss": 0.0994, "rewards/accuracies": 1.0, "rewards/chosen": 3.728886842727661, "rewards/margins": 4.610504150390625, "rewards/rejected": -0.8816171884536743, "step": 3168 }, { "epoch": 2.3152511415525114, "grad_norm": 13.868826377627396, "learning_rate": 2.2468232516436303e-07, "logits/chosen": -2.6759002208709717, "logits/rejected": -2.215381383895874, "logps/chosen": -548.7635498046875, "logps/rejected": -482.84442138671875, "loss": 0.08, "rewards/accuracies": 1.0, "rewards/chosen": 4.32856559753418, "rewards/margins": 6.783785820007324, "rewards/rejected": -2.4552202224731445, "step": 3169 }, { "epoch": 2.3159817351598173, "grad_norm": 8.390928238090071, "learning_rate": 2.2452364625851103e-07, "logits/chosen": -2.721268892288208, "logits/rejected": -2.253185749053955, "logps/chosen": -428.6570129394531, "logps/rejected": -532.9468994140625, "loss": 0.0449, "rewards/accuracies": 1.0, "rewards/chosen": 3.1922454833984375, "rewards/margins": 5.544186592102051, "rewards/rejected": -2.351940870285034, "step": 3170 }, { "epoch": 2.316712328767123, "grad_norm": 15.440746630073969, "learning_rate": 2.2436497772318816e-07, "logits/chosen": -2.779118537902832, "logits/rejected": -1.8969707489013672, "logps/chosen": -673.5040283203125, "logps/rejected": -384.5584411621094, "loss": 0.0815, "rewards/accuracies": 1.0, "rewards/chosen": 3.228811502456665, "rewards/margins": 3.9713962078094482, "rewards/rejected": -0.7425848841667175, "step": 3171 }, { "epoch": 2.317442922374429, "grad_norm": 12.265853899138046, "learning_rate": 2.2420631962298274e-07, "logits/chosen": -2.6256103515625, "logits/rejected": -2.4038214683532715, "logps/chosen": -730.5763549804688, "logps/rejected": -683.413818359375, "loss": 0.0656, "rewards/accuracies": 1.0, "rewards/chosen": 3.48577618598938, "rewards/margins": 3.357456684112549, "rewards/rejected": 0.12831956148147583, "step": 3172 }, { "epoch": 2.3181735159817354, "grad_norm": 12.697516034726966, "learning_rate": 2.2404767202247887e-07, "logits/chosen": -3.2187466621398926, "logits/rejected": -2.3204023838043213, "logps/chosen": -548.2465209960938, "logps/rejected": -460.821533203125, "loss": 0.0726, "rewards/accuracies": 1.0, "rewards/chosen": 4.2462849617004395, "rewards/margins": 4.573665618896484, "rewards/rejected": -0.32738035917282104, "step": 3173 }, { "epoch": 2.3189041095890413, "grad_norm": 15.336081769889413, "learning_rate": 2.2388903498625657e-07, "logits/chosen": -2.7160215377807617, "logits/rejected": -1.7625564336776733, "logps/chosen": -700.817138671875, "logps/rejected": -556.65234375, "loss": 0.0952, "rewards/accuracies": 1.0, "rewards/chosen": 4.4001359939575195, "rewards/margins": 5.406632900238037, "rewards/rejected": -1.0064969062805176, "step": 3174 }, { "epoch": 2.319634703196347, "grad_norm": 10.500382552671184, "learning_rate": 2.237304085788912e-07, "logits/chosen": -3.1316254138946533, "logits/rejected": -2.5134308338165283, "logps/chosen": -628.64794921875, "logps/rejected": -548.88671875, "loss": 0.0466, "rewards/accuracies": 1.0, "rewards/chosen": 4.301152229309082, "rewards/margins": 4.684581279754639, "rewards/rejected": -0.38342875242233276, "step": 3175 }, { "epoch": 2.320365296803653, "grad_norm": 12.22063242273179, "learning_rate": 2.235717928649541e-07, "logits/chosen": -2.412538766860962, "logits/rejected": -1.8180276155471802, "logps/chosen": -636.0550537109375, "logps/rejected": -585.1016845703125, "loss": 0.0631, "rewards/accuracies": 1.0, "rewards/chosen": 3.7403526306152344, "rewards/margins": 6.310585021972656, "rewards/rejected": -2.57023286819458, "step": 3176 }, { "epoch": 2.321095890410959, "grad_norm": 24.34510636866396, "learning_rate": 2.2341318790901215e-07, "logits/chosen": -2.8652231693267822, "logits/rejected": -2.315596580505371, "logps/chosen": -346.55908203125, "logps/rejected": -326.21490478515625, "loss": 0.0953, "rewards/accuracies": 1.0, "rewards/chosen": 2.968562126159668, "rewards/margins": 6.253417015075684, "rewards/rejected": -3.2848546504974365, "step": 3177 }, { "epoch": 2.3218264840182647, "grad_norm": 12.529754625552828, "learning_rate": 2.232545937756279e-07, "logits/chosen": -3.0952532291412354, "logits/rejected": -2.8779382705688477, "logps/chosen": -668.047119140625, "logps/rejected": -577.1453247070312, "loss": 0.0802, "rewards/accuracies": 1.0, "rewards/chosen": 2.6677961349487305, "rewards/margins": 2.821398973464966, "rewards/rejected": -0.15360309183597565, "step": 3178 }, { "epoch": 2.3225570776255706, "grad_norm": 9.449905023652603, "learning_rate": 2.2309601052935934e-07, "logits/chosen": -2.6898200511932373, "logits/rejected": -2.1513190269470215, "logps/chosen": -398.4613952636719, "logps/rejected": -423.92340087890625, "loss": 0.0367, "rewards/accuracies": 1.0, "rewards/chosen": 2.7544362545013428, "rewards/margins": 5.99960994720459, "rewards/rejected": -3.2451744079589844, "step": 3179 }, { "epoch": 2.323287671232877, "grad_norm": 11.053559737085973, "learning_rate": 2.2293743823476022e-07, "logits/chosen": -2.878084182739258, "logits/rejected": -2.655742645263672, "logps/chosen": -897.36328125, "logps/rejected": -911.9672241210938, "loss": 0.042, "rewards/accuracies": 1.0, "rewards/chosen": 4.016181468963623, "rewards/margins": 5.096187591552734, "rewards/rejected": -1.0800062417984009, "step": 3180 }, { "epoch": 2.324018264840183, "grad_norm": 15.88354598573803, "learning_rate": 2.2277887695637975e-07, "logits/chosen": -2.978834867477417, "logits/rejected": -1.8720937967300415, "logps/chosen": -819.06591796875, "logps/rejected": -508.09808349609375, "loss": 0.0668, "rewards/accuracies": 1.0, "rewards/chosen": 2.5365264415740967, "rewards/margins": 4.14726448059082, "rewards/rejected": -1.6107378005981445, "step": 3181 }, { "epoch": 2.3247488584474887, "grad_norm": 12.901641244499004, "learning_rate": 2.2262032675876273e-07, "logits/chosen": -2.909411668777466, "logits/rejected": -1.724516749382019, "logps/chosen": -1043.97802734375, "logps/rejected": -602.34765625, "loss": 0.0515, "rewards/accuracies": 1.0, "rewards/chosen": 5.052309036254883, "rewards/margins": 6.020556449890137, "rewards/rejected": -0.9682471752166748, "step": 3182 }, { "epoch": 2.3254794520547946, "grad_norm": 12.263763791909481, "learning_rate": 2.2246178770644921e-07, "logits/chosen": -2.9074578285217285, "logits/rejected": -2.373213768005371, "logps/chosen": -304.7958984375, "logps/rejected": -327.0278015136719, "loss": 0.0784, "rewards/accuracies": 1.0, "rewards/chosen": 1.3041080236434937, "rewards/margins": 3.382416009902954, "rewards/rejected": -2.07830810546875, "step": 3183 }, { "epoch": 2.3262100456621004, "grad_norm": 7.285201281697284, "learning_rate": 2.22303259863975e-07, "logits/chosen": -2.3317956924438477, "logits/rejected": -1.9826455116271973, "logps/chosen": -788.4825439453125, "logps/rejected": -788.0240478515625, "loss": 0.0364, "rewards/accuracies": 1.0, "rewards/chosen": 3.282850980758667, "rewards/margins": 4.034977912902832, "rewards/rejected": -0.752126932144165, "step": 3184 }, { "epoch": 2.3269406392694063, "grad_norm": 9.370111437025985, "learning_rate": 2.2214474329587115e-07, "logits/chosen": -2.9933924674987793, "logits/rejected": -1.5006824731826782, "logps/chosen": -613.73291015625, "logps/rejected": -310.4579772949219, "loss": 0.0473, "rewards/accuracies": 1.0, "rewards/chosen": 5.856230735778809, "rewards/margins": 9.32624340057373, "rewards/rejected": -3.47001314163208, "step": 3185 }, { "epoch": 2.327671232876712, "grad_norm": 10.66125835957324, "learning_rate": 2.2198623806666425e-07, "logits/chosen": -2.8480381965637207, "logits/rejected": -2.5025546550750732, "logps/chosen": -856.3052368164062, "logps/rejected": -727.810546875, "loss": 0.0611, "rewards/accuracies": 0.875, "rewards/chosen": 3.9229159355163574, "rewards/margins": 4.561304569244385, "rewards/rejected": -0.6383885741233826, "step": 3186 }, { "epoch": 2.3284018264840185, "grad_norm": 13.949493858425473, "learning_rate": 2.2182774424087628e-07, "logits/chosen": -2.753089666366577, "logits/rejected": -2.169029474258423, "logps/chosen": -474.77984619140625, "logps/rejected": -485.4716796875, "loss": 0.0553, "rewards/accuracies": 1.0, "rewards/chosen": 3.3866374492645264, "rewards/margins": 5.279306411743164, "rewards/rejected": -1.892668604850769, "step": 3187 }, { "epoch": 2.3291324200913244, "grad_norm": 9.28037121655121, "learning_rate": 2.2166926188302427e-07, "logits/chosen": -2.5558550357818604, "logits/rejected": -2.320291757583618, "logps/chosen": -717.4411010742188, "logps/rejected": -530.8742065429688, "loss": 0.0624, "rewards/accuracies": 1.0, "rewards/chosen": 4.125063419342041, "rewards/margins": 4.090633392333984, "rewards/rejected": 0.034430187195539474, "step": 3188 }, { "epoch": 2.3298630136986302, "grad_norm": 12.172880354019801, "learning_rate": 2.21510791057621e-07, "logits/chosen": -2.70039963722229, "logits/rejected": -2.323974132537842, "logps/chosen": -798.5294799804688, "logps/rejected": -775.2222900390625, "loss": 0.0819, "rewards/accuracies": 1.0, "rewards/chosen": 3.7041094303131104, "rewards/margins": 4.892511367797852, "rewards/rejected": -1.1884018182754517, "step": 3189 }, { "epoch": 2.330593607305936, "grad_norm": 17.091045939858667, "learning_rate": 2.2135233182917433e-07, "logits/chosen": -3.2231998443603516, "logits/rejected": -2.7228007316589355, "logps/chosen": -985.9825439453125, "logps/rejected": -628.6517944335938, "loss": 0.0941, "rewards/accuracies": 1.0, "rewards/chosen": 4.709447383880615, "rewards/margins": 5.041493892669678, "rewards/rejected": -0.33204588294029236, "step": 3190 }, { "epoch": 2.331324200913242, "grad_norm": 7.747376520896798, "learning_rate": 2.211938842621876e-07, "logits/chosen": -2.743389129638672, "logits/rejected": -2.51613187789917, "logps/chosen": -1076.46923828125, "logps/rejected": -894.802734375, "loss": 0.031, "rewards/accuracies": 1.0, "rewards/chosen": 3.5056586265563965, "rewards/margins": 4.365220069885254, "rewards/rejected": -0.8595612049102783, "step": 3191 }, { "epoch": 2.332054794520548, "grad_norm": 11.463785619824924, "learning_rate": 2.21035448421159e-07, "logits/chosen": -2.9664435386657715, "logits/rejected": -2.8314313888549805, "logps/chosen": -655.2960205078125, "logps/rejected": -686.99267578125, "loss": 0.0602, "rewards/accuracies": 1.0, "rewards/chosen": 3.184190273284912, "rewards/margins": 4.581912040710449, "rewards/rejected": -1.397721529006958, "step": 3192 }, { "epoch": 2.3327853881278537, "grad_norm": 14.558661351969421, "learning_rate": 2.2087702437058235e-07, "logits/chosen": -2.7515406608581543, "logits/rejected": -1.9875142574310303, "logps/chosen": -645.1941528320312, "logps/rejected": -508.8082275390625, "loss": 0.0555, "rewards/accuracies": 1.0, "rewards/chosen": 2.672800064086914, "rewards/margins": 4.572765827178955, "rewards/rejected": -1.8999660015106201, "step": 3193 }, { "epoch": 2.33351598173516, "grad_norm": 12.452781911836006, "learning_rate": 2.2071861217494645e-07, "logits/chosen": -2.943371057510376, "logits/rejected": -1.8386186361312866, "logps/chosen": -524.3131103515625, "logps/rejected": -262.839111328125, "loss": 0.0419, "rewards/accuracies": 1.0, "rewards/chosen": 3.6805009841918945, "rewards/margins": 5.926977157592773, "rewards/rejected": -2.246476173400879, "step": 3194 }, { "epoch": 2.334246575342466, "grad_norm": 13.361194366245037, "learning_rate": 2.2056021189873542e-07, "logits/chosen": -3.2730185985565186, "logits/rejected": -1.9465528726577759, "logps/chosen": -747.4405517578125, "logps/rejected": -495.0860290527344, "loss": 0.0653, "rewards/accuracies": 1.0, "rewards/chosen": 4.844476699829102, "rewards/margins": 5.769430160522461, "rewards/rejected": -0.9249529242515564, "step": 3195 }, { "epoch": 2.334977168949772, "grad_norm": 9.959935611378846, "learning_rate": 2.2040182360642838e-07, "logits/chosen": -2.9406495094299316, "logits/rejected": -2.088918447494507, "logps/chosen": -923.5286865234375, "logps/rejected": -648.0589599609375, "loss": 0.0571, "rewards/accuracies": 1.0, "rewards/chosen": 5.568696975708008, "rewards/margins": 6.571966648101807, "rewards/rejected": -1.0032695531845093, "step": 3196 }, { "epoch": 2.3357077625570777, "grad_norm": 8.97286116848066, "learning_rate": 2.202434473624996e-07, "logits/chosen": -2.720362663269043, "logits/rejected": -2.0105578899383545, "logps/chosen": -435.452392578125, "logps/rejected": -421.8283386230469, "loss": 0.0561, "rewards/accuracies": 1.0, "rewards/chosen": 3.829533576965332, "rewards/margins": 6.534416198730469, "rewards/rejected": -2.7048826217651367, "step": 3197 }, { "epoch": 2.3364383561643836, "grad_norm": 8.878189588323226, "learning_rate": 2.2008508323141862e-07, "logits/chosen": -2.6245017051696777, "logits/rejected": -1.8697569370269775, "logps/chosen": -647.0584716796875, "logps/rejected": -380.8236389160156, "loss": 0.0405, "rewards/accuracies": 1.0, "rewards/chosen": 3.444319725036621, "rewards/margins": 5.195450782775879, "rewards/rejected": -1.7511308193206787, "step": 3198 }, { "epoch": 2.3371689497716894, "grad_norm": 10.771759013101255, "learning_rate": 2.1992673127764984e-07, "logits/chosen": -2.5498485565185547, "logits/rejected": -2.319533109664917, "logps/chosen": -723.8182983398438, "logps/rejected": -706.9906616210938, "loss": 0.0465, "rewards/accuracies": 1.0, "rewards/chosen": 6.1963911056518555, "rewards/margins": 5.93373966217041, "rewards/rejected": 0.2626506984233856, "step": 3199 }, { "epoch": 2.3378995433789953, "grad_norm": 9.388220658558739, "learning_rate": 2.1976839156565287e-07, "logits/chosen": -3.215161085128784, "logits/rejected": -2.8026862144470215, "logps/chosen": -602.5238647460938, "logps/rejected": -434.9118957519531, "loss": 0.0455, "rewards/accuracies": 1.0, "rewards/chosen": 3.8253326416015625, "rewards/margins": 6.049017906188965, "rewards/rejected": -2.2236855030059814, "step": 3200 }, { "epoch": 2.338630136986301, "grad_norm": 6.037296722970589, "learning_rate": 2.1961006415988206e-07, "logits/chosen": -2.7618703842163086, "logits/rejected": -1.9383299350738525, "logps/chosen": -682.3037109375, "logps/rejected": -488.1225891113281, "loss": 0.0309, "rewards/accuracies": 1.0, "rewards/chosen": 3.6741156578063965, "rewards/margins": 5.697450160980225, "rewards/rejected": -2.023334264755249, "step": 3201 }, { "epoch": 2.3393607305936075, "grad_norm": 15.496162185537061, "learning_rate": 2.1945174912478705e-07, "logits/chosen": -2.8623714447021484, "logits/rejected": -2.308828115463257, "logps/chosen": -876.00390625, "logps/rejected": -665.408935546875, "loss": 0.1105, "rewards/accuracies": 1.0, "rewards/chosen": 3.677511692047119, "rewards/margins": 5.854890823364258, "rewards/rejected": -2.1773786544799805, "step": 3202 }, { "epoch": 2.3400913242009134, "grad_norm": 10.707005954115504, "learning_rate": 2.1929344652481238e-07, "logits/chosen": -2.261568784713745, "logits/rejected": -1.9803404808044434, "logps/chosen": -795.5465087890625, "logps/rejected": -480.2237548828125, "loss": 0.0546, "rewards/accuracies": 1.0, "rewards/chosen": 4.466181755065918, "rewards/margins": 4.614684104919434, "rewards/rejected": -0.14850181341171265, "step": 3203 }, { "epoch": 2.3408219178082192, "grad_norm": 13.958664498267567, "learning_rate": 2.1913515642439751e-07, "logits/chosen": -2.74381947517395, "logits/rejected": -2.285557746887207, "logps/chosen": -513.9188232421875, "logps/rejected": -313.08380126953125, "loss": 0.0757, "rewards/accuracies": 1.0, "rewards/chosen": 3.328747510910034, "rewards/margins": 4.598792552947998, "rewards/rejected": -1.2700451612472534, "step": 3204 }, { "epoch": 2.341552511415525, "grad_norm": 14.264042945202393, "learning_rate": 2.1897687888797658e-07, "logits/chosen": -2.923506498336792, "logits/rejected": -2.685739755630493, "logps/chosen": -565.7894287109375, "logps/rejected": -500.2327880859375, "loss": 0.0841, "rewards/accuracies": 1.0, "rewards/chosen": 3.401075839996338, "rewards/margins": 3.5335631370544434, "rewards/rejected": -0.13248729705810547, "step": 3205 }, { "epoch": 2.342283105022831, "grad_norm": 14.857869004749796, "learning_rate": 2.1881861397997892e-07, "logits/chosen": -2.7981600761413574, "logits/rejected": -2.2692933082580566, "logps/chosen": -551.0512084960938, "logps/rejected": -509.56317138671875, "loss": 0.0778, "rewards/accuracies": 1.0, "rewards/chosen": 2.875093460083008, "rewards/margins": 4.75128698348999, "rewards/rejected": -1.8761935234069824, "step": 3206 }, { "epoch": 2.343013698630137, "grad_norm": 10.18363420590353, "learning_rate": 2.1866036176482865e-07, "logits/chosen": -2.5184621810913086, "logits/rejected": -2.604179859161377, "logps/chosen": -884.51708984375, "logps/rejected": -749.681884765625, "loss": 0.0479, "rewards/accuracies": 1.0, "rewards/chosen": 4.746648788452148, "rewards/margins": 5.847291469573975, "rewards/rejected": -1.1006426811218262, "step": 3207 }, { "epoch": 2.3437442922374427, "grad_norm": 8.039821221940306, "learning_rate": 2.185021223069448e-07, "logits/chosen": -2.5484535694122314, "logits/rejected": -1.791927456855774, "logps/chosen": -527.7449340820312, "logps/rejected": -413.11700439453125, "loss": 0.0377, "rewards/accuracies": 1.0, "rewards/chosen": 4.655811786651611, "rewards/margins": 8.418611526489258, "rewards/rejected": -3.7627999782562256, "step": 3208 }, { "epoch": 2.3444748858447486, "grad_norm": 11.787904028981758, "learning_rate": 2.1834389567074086e-07, "logits/chosen": -3.2985918521881104, "logits/rejected": -2.676189661026001, "logps/chosen": -1048.26708984375, "logps/rejected": -800.79833984375, "loss": 0.042, "rewards/accuracies": 1.0, "rewards/chosen": 4.935397624969482, "rewards/margins": 5.2216715812683105, "rewards/rejected": -0.2862739562988281, "step": 3209 }, { "epoch": 2.345205479452055, "grad_norm": 13.031441446997468, "learning_rate": 2.1818568192062545e-07, "logits/chosen": -2.6223483085632324, "logits/rejected": -1.9618470668792725, "logps/chosen": -500.375732421875, "logps/rejected": -513.6287231445312, "loss": 0.0464, "rewards/accuracies": 1.0, "rewards/chosen": 3.2161507606506348, "rewards/margins": 5.391873359680176, "rewards/rejected": -2.175722122192383, "step": 3210 }, { "epoch": 2.345936073059361, "grad_norm": 15.225400501614738, "learning_rate": 2.1802748112100183e-07, "logits/chosen": -3.1818270683288574, "logits/rejected": -2.4085144996643066, "logps/chosen": -852.5225219726562, "logps/rejected": -643.8779296875, "loss": 0.097, "rewards/accuracies": 1.0, "rewards/chosen": 4.618712902069092, "rewards/margins": 4.106255531311035, "rewards/rejected": 0.5124574899673462, "step": 3211 }, { "epoch": 2.3466666666666667, "grad_norm": 8.979754246532863, "learning_rate": 2.1786929333626798e-07, "logits/chosen": -2.0300004482269287, "logits/rejected": -2.083162546157837, "logps/chosen": -671.3568115234375, "logps/rejected": -592.2293701171875, "loss": 0.0408, "rewards/accuracies": 1.0, "rewards/chosen": 3.941532850265503, "rewards/margins": 9.793181419372559, "rewards/rejected": -5.851648807525635, "step": 3212 }, { "epoch": 2.3473972602739726, "grad_norm": 7.712579276049969, "learning_rate": 2.177111186308167e-07, "logits/chosen": -2.8460845947265625, "logits/rejected": -2.2539665699005127, "logps/chosen": -817.291748046875, "logps/rejected": -768.89697265625, "loss": 0.0257, "rewards/accuracies": 1.0, "rewards/chosen": 4.058583736419678, "rewards/margins": 5.416020393371582, "rewards/rejected": -1.357437014579773, "step": 3213 }, { "epoch": 2.3481278538812784, "grad_norm": 56.7689989191394, "learning_rate": 2.1755295706903522e-07, "logits/chosen": -2.8805694580078125, "logits/rejected": -2.2307729721069336, "logps/chosen": -635.1085205078125, "logps/rejected": -395.3861083984375, "loss": 0.1318, "rewards/accuracies": 1.0, "rewards/chosen": 2.397658109664917, "rewards/margins": 4.540799140930176, "rewards/rejected": -2.143141031265259, "step": 3214 }, { "epoch": 2.3488584474885843, "grad_norm": 6.0063687538053685, "learning_rate": 2.1739480871530552e-07, "logits/chosen": -2.671456813812256, "logits/rejected": -1.8540172576904297, "logps/chosen": -611.11572265625, "logps/rejected": -399.3921203613281, "loss": 0.0259, "rewards/accuracies": 1.0, "rewards/chosen": 2.6841931343078613, "rewards/margins": 3.783191204071045, "rewards/rejected": -1.098997712135315, "step": 3215 }, { "epoch": 2.34958904109589, "grad_norm": 13.119913332583115, "learning_rate": 2.1723667363400438e-07, "logits/chosen": -2.964224338531494, "logits/rejected": -1.9608328342437744, "logps/chosen": -371.07421875, "logps/rejected": -271.3431396484375, "loss": 0.0573, "rewards/accuracies": 1.0, "rewards/chosen": 2.8622629642486572, "rewards/margins": 4.9057183265686035, "rewards/rejected": -2.043454885482788, "step": 3216 }, { "epoch": 2.3503196347031965, "grad_norm": 14.277321840783463, "learning_rate": 2.1707855188950301e-07, "logits/chosen": -2.2845988273620605, "logits/rejected": -2.293210506439209, "logps/chosen": -294.88909912109375, "logps/rejected": -493.1849365234375, "loss": 0.0633, "rewards/accuracies": 1.0, "rewards/chosen": 2.308560848236084, "rewards/margins": 6.562014102935791, "rewards/rejected": -4.253453254699707, "step": 3217 }, { "epoch": 2.3510502283105024, "grad_norm": 14.741047356712873, "learning_rate": 2.1692044354616717e-07, "logits/chosen": -3.026365280151367, "logits/rejected": -2.5507476329803467, "logps/chosen": -427.9838562011719, "logps/rejected": -339.19189453125, "loss": 0.0948, "rewards/accuracies": 1.0, "rewards/chosen": 2.2046327590942383, "rewards/margins": 4.015827655792236, "rewards/rejected": -1.8111947774887085, "step": 3218 }, { "epoch": 2.3517808219178082, "grad_norm": 8.078439968321689, "learning_rate": 2.1676234866835723e-07, "logits/chosen": -3.226841449737549, "logits/rejected": -2.297252655029297, "logps/chosen": -951.79052734375, "logps/rejected": -617.2006225585938, "loss": 0.0294, "rewards/accuracies": 1.0, "rewards/chosen": 4.986065864562988, "rewards/margins": 6.059765815734863, "rewards/rejected": -1.0736993551254272, "step": 3219 }, { "epoch": 2.352511415525114, "grad_norm": 7.602615769623462, "learning_rate": 2.1660426732042808e-07, "logits/chosen": -2.4104950428009033, "logits/rejected": -1.4729368686676025, "logps/chosen": -699.5580444335938, "logps/rejected": -447.36383056640625, "loss": 0.0325, "rewards/accuracies": 1.0, "rewards/chosen": 3.894683599472046, "rewards/margins": 4.708644866943359, "rewards/rejected": -0.8139612078666687, "step": 3220 }, { "epoch": 2.35324200913242, "grad_norm": 10.380699706262655, "learning_rate": 2.164461995667292e-07, "logits/chosen": -2.3921685218811035, "logits/rejected": -1.5284297466278076, "logps/chosen": -373.97705078125, "logps/rejected": -335.8888854980469, "loss": 0.0611, "rewards/accuracies": 0.875, "rewards/chosen": 3.9687018394470215, "rewards/margins": 6.687376976013184, "rewards/rejected": -2.7186758518218994, "step": 3221 }, { "epoch": 2.353972602739726, "grad_norm": 11.036703431361918, "learning_rate": 2.1628814547160416e-07, "logits/chosen": -2.6722376346588135, "logits/rejected": -2.7399888038635254, "logps/chosen": -334.1835632324219, "logps/rejected": -502.04986572265625, "loss": 0.0642, "rewards/accuracies": 1.0, "rewards/chosen": 1.9867687225341797, "rewards/margins": 4.321871757507324, "rewards/rejected": -2.3351032733917236, "step": 3222 }, { "epoch": 2.3547031963470317, "grad_norm": 15.458810529857946, "learning_rate": 2.161301050993913e-07, "logits/chosen": -2.6902787685394287, "logits/rejected": -2.456718921661377, "logps/chosen": -423.3415832519531, "logps/rejected": -391.4222106933594, "loss": 0.0577, "rewards/accuracies": 1.0, "rewards/chosen": 1.9088690280914307, "rewards/margins": 5.028783798217773, "rewards/rejected": -3.1199145317077637, "step": 3223 }, { "epoch": 2.355433789954338, "grad_norm": 3.6086012766483533, "learning_rate": 2.1597207851442345e-07, "logits/chosen": -2.832569122314453, "logits/rejected": -2.7099533081054688, "logps/chosen": -1045.06982421875, "logps/rejected": -936.616943359375, "loss": 0.0146, "rewards/accuracies": 1.0, "rewards/chosen": 5.18226432800293, "rewards/margins": 5.935611248016357, "rewards/rejected": -0.7533469796180725, "step": 3224 }, { "epoch": 2.356164383561644, "grad_norm": 15.369378775834699, "learning_rate": 2.1581406578102761e-07, "logits/chosen": -3.0217607021331787, "logits/rejected": -2.1313297748565674, "logps/chosen": -737.387451171875, "logps/rejected": -471.1142272949219, "loss": 0.0711, "rewards/accuracies": 1.0, "rewards/chosen": 3.235628604888916, "rewards/margins": 4.375641822814941, "rewards/rejected": -1.1400134563446045, "step": 3225 }, { "epoch": 2.35689497716895, "grad_norm": 10.681837935586222, "learning_rate": 2.1565606696352537e-07, "logits/chosen": -2.793349266052246, "logits/rejected": -2.713737964630127, "logps/chosen": -852.6478881835938, "logps/rejected": -774.0675048828125, "loss": 0.0566, "rewards/accuracies": 1.0, "rewards/chosen": 3.121737480163574, "rewards/margins": 3.621835231781006, "rewards/rejected": -0.5000976920127869, "step": 3226 }, { "epoch": 2.3576255707762557, "grad_norm": 9.931704960767531, "learning_rate": 2.1549808212623218e-07, "logits/chosen": -2.706076145172119, "logits/rejected": -2.4437646865844727, "logps/chosen": -716.0775756835938, "logps/rejected": -629.33837890625, "loss": 0.0358, "rewards/accuracies": 1.0, "rewards/chosen": 3.998255968093872, "rewards/margins": 5.211296081542969, "rewards/rejected": -1.2130403518676758, "step": 3227 }, { "epoch": 2.3583561643835615, "grad_norm": 6.529543420973395, "learning_rate": 2.1534011133345834e-07, "logits/chosen": -3.349363327026367, "logits/rejected": -2.205353021621704, "logps/chosen": -835.9329833984375, "logps/rejected": -551.1066284179688, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/chosen": 5.678954124450684, "rewards/margins": 6.825380325317383, "rewards/rejected": -1.1464265584945679, "step": 3228 }, { "epoch": 2.3590867579908674, "grad_norm": 10.776212803677215, "learning_rate": 2.1518215464950812e-07, "logits/chosen": -2.5751171112060547, "logits/rejected": -2.0601439476013184, "logps/chosen": -558.1322021484375, "logps/rejected": -474.3846130371094, "loss": 0.0659, "rewards/accuracies": 1.0, "rewards/chosen": 3.200829029083252, "rewards/margins": 5.40001106262207, "rewards/rejected": -2.1991822719573975, "step": 3229 }, { "epoch": 2.3598173515981733, "grad_norm": 6.94958817613469, "learning_rate": 2.1502421213868027e-07, "logits/chosen": -3.3044331073760986, "logits/rejected": -2.1388089656829834, "logps/chosen": -694.1693725585938, "logps/rejected": -467.3094482421875, "loss": 0.0287, "rewards/accuracies": 1.0, "rewards/chosen": 5.543830871582031, "rewards/margins": 7.48823356628418, "rewards/rejected": -1.9444023370742798, "step": 3230 }, { "epoch": 2.3605479452054796, "grad_norm": 8.968947351922772, "learning_rate": 2.1486628386526748e-07, "logits/chosen": -3.394728422164917, "logits/rejected": -2.322004795074463, "logps/chosen": -622.9041137695312, "logps/rejected": -537.1915283203125, "loss": 0.0443, "rewards/accuracies": 1.0, "rewards/chosen": 4.544680595397949, "rewards/margins": 5.927895545959473, "rewards/rejected": -1.3832144737243652, "step": 3231 }, { "epoch": 2.3612785388127855, "grad_norm": 16.200739450415046, "learning_rate": 2.1470836989355687e-07, "logits/chosen": -2.442305326461792, "logits/rejected": -1.8293378353118896, "logps/chosen": -699.3958740234375, "logps/rejected": -451.3475341796875, "loss": 0.0826, "rewards/accuracies": 1.0, "rewards/chosen": 4.8605146408081055, "rewards/margins": 6.236593246459961, "rewards/rejected": -1.3760783672332764, "step": 3232 }, { "epoch": 2.3620091324200914, "grad_norm": 18.447109188226314, "learning_rate": 2.145504702878297e-07, "logits/chosen": -3.1135408878326416, "logits/rejected": -1.9723814725875854, "logps/chosen": -509.4600830078125, "logps/rejected": -307.8126220703125, "loss": 0.1021, "rewards/accuracies": 1.0, "rewards/chosen": 2.643674612045288, "rewards/margins": 3.4632463455200195, "rewards/rejected": -0.8195716738700867, "step": 3233 }, { "epoch": 2.3627397260273972, "grad_norm": 29.04439330105834, "learning_rate": 2.143925851123613e-07, "logits/chosen": -2.8270115852355957, "logits/rejected": -1.8466556072235107, "logps/chosen": -538.5269775390625, "logps/rejected": -408.498779296875, "loss": 0.0864, "rewards/accuracies": 1.0, "rewards/chosen": 2.417232036590576, "rewards/margins": 5.221994400024414, "rewards/rejected": -2.804762363433838, "step": 3234 }, { "epoch": 2.363470319634703, "grad_norm": 8.920525924847643, "learning_rate": 2.1423471443142128e-07, "logits/chosen": -2.786372423171997, "logits/rejected": -2.38698148727417, "logps/chosen": -786.0132446289062, "logps/rejected": -790.3073120117188, "loss": 0.0332, "rewards/accuracies": 1.0, "rewards/chosen": 4.385632514953613, "rewards/margins": 6.380941867828369, "rewards/rejected": -1.9953093528747559, "step": 3235 }, { "epoch": 2.364200913242009, "grad_norm": 12.316604472318621, "learning_rate": 2.1407685830927312e-07, "logits/chosen": -2.545412063598633, "logits/rejected": -1.941904067993164, "logps/chosen": -835.1239013671875, "logps/rejected": -592.898681640625, "loss": 0.0557, "rewards/accuracies": 1.0, "rewards/chosen": 4.814752578735352, "rewards/margins": 6.046104431152344, "rewards/rejected": -1.231351613998413, "step": 3236 }, { "epoch": 2.364931506849315, "grad_norm": 10.732743362734992, "learning_rate": 2.1391901681017463e-07, "logits/chosen": -3.0332131385803223, "logits/rejected": -2.641695261001587, "logps/chosen": -918.753173828125, "logps/rejected": -835.2080688476562, "loss": 0.0416, "rewards/accuracies": 1.0, "rewards/chosen": 5.315586090087891, "rewards/margins": 6.087773323059082, "rewards/rejected": -0.7721877098083496, "step": 3237 }, { "epoch": 2.365662100456621, "grad_norm": 9.218316743659104, "learning_rate": 2.1376118999837743e-07, "logits/chosen": -2.58830189704895, "logits/rejected": -2.5492944717407227, "logps/chosen": -515.9072265625, "logps/rejected": -661.1538696289062, "loss": 0.0532, "rewards/accuracies": 1.0, "rewards/chosen": 2.952585458755493, "rewards/margins": 6.242288589477539, "rewards/rejected": -3.289702892303467, "step": 3238 }, { "epoch": 2.366392694063927, "grad_norm": 12.237113558441283, "learning_rate": 2.1360337793812745e-07, "logits/chosen": -3.256319761276245, "logits/rejected": -2.3105618953704834, "logps/chosen": -704.950439453125, "logps/rejected": -489.9397888183594, "loss": 0.0658, "rewards/accuracies": 1.0, "rewards/chosen": 4.142513751983643, "rewards/margins": 5.423303604125977, "rewards/rejected": -1.280789852142334, "step": 3239 }, { "epoch": 2.367123287671233, "grad_norm": 9.182342950364328, "learning_rate": 2.134455806936642e-07, "logits/chosen": -3.1565942764282227, "logits/rejected": -2.24660062789917, "logps/chosen": -569.3362426757812, "logps/rejected": -322.2732849121094, "loss": 0.0538, "rewards/accuracies": 1.0, "rewards/chosen": 1.7691582441329956, "rewards/margins": 2.9492478370666504, "rewards/rejected": -1.1800894737243652, "step": 3240 }, { "epoch": 2.367853881278539, "grad_norm": 11.417872529957215, "learning_rate": 2.1328779832922146e-07, "logits/chosen": -2.6016135215759277, "logits/rejected": -1.986232042312622, "logps/chosen": -279.52496337890625, "logps/rejected": -238.18475341796875, "loss": 0.0665, "rewards/accuracies": 1.0, "rewards/chosen": 0.14267438650131226, "rewards/margins": 2.7679872512817383, "rewards/rejected": -2.6253128051757812, "step": 3241 }, { "epoch": 2.3685844748858447, "grad_norm": 12.223020777961418, "learning_rate": 2.13130030909027e-07, "logits/chosen": -3.140925407409668, "logits/rejected": -2.292361259460449, "logps/chosen": -912.1324462890625, "logps/rejected": -578.8829345703125, "loss": 0.0813, "rewards/accuracies": 1.0, "rewards/chosen": 3.526099443435669, "rewards/margins": 4.5254387855529785, "rewards/rejected": -0.9993394017219543, "step": 3242 }, { "epoch": 2.3693150684931505, "grad_norm": 10.468920897042596, "learning_rate": 2.1297227849730245e-07, "logits/chosen": -2.8055033683776855, "logits/rejected": -2.4851303100585938, "logps/chosen": -427.05682373046875, "logps/rejected": -425.2106018066406, "loss": 0.0468, "rewards/accuracies": 1.0, "rewards/chosen": 3.4674360752105713, "rewards/margins": 6.820620059967041, "rewards/rejected": -3.353184223175049, "step": 3243 }, { "epoch": 2.3700456621004564, "grad_norm": 10.531775577459142, "learning_rate": 2.128145411582631e-07, "logits/chosen": -2.7296290397644043, "logits/rejected": -2.7323923110961914, "logps/chosen": -535.5946655273438, "logps/rejected": -792.9771118164062, "loss": 0.0566, "rewards/accuracies": 1.0, "rewards/chosen": 2.0228261947631836, "rewards/margins": 5.403316497802734, "rewards/rejected": -3.38049054145813, "step": 3244 }, { "epoch": 2.3707762557077627, "grad_norm": 6.44294523288269, "learning_rate": 2.126568189561183e-07, "logits/chosen": -2.7398006916046143, "logits/rejected": -1.7695155143737793, "logps/chosen": -336.963134765625, "logps/rejected": -357.97113037109375, "loss": 0.0189, "rewards/accuracies": 1.0, "rewards/chosen": 3.4049668312072754, "rewards/margins": 6.97774076461792, "rewards/rejected": -3.5727744102478027, "step": 3245 }, { "epoch": 2.3715068493150686, "grad_norm": 9.402099504773188, "learning_rate": 2.1249911195507124e-07, "logits/chosen": -2.9084808826446533, "logits/rejected": -2.11454439163208, "logps/chosen": -1069.171142578125, "logps/rejected": -589.6220092773438, "loss": 0.0441, "rewards/accuracies": 1.0, "rewards/chosen": 5.788543701171875, "rewards/margins": 5.291865348815918, "rewards/rejected": 0.4966781437397003, "step": 3246 }, { "epoch": 2.3722374429223745, "grad_norm": 12.774115401545117, "learning_rate": 2.12341420219319e-07, "logits/chosen": -3.119137763977051, "logits/rejected": -2.2067975997924805, "logps/chosen": -785.868408203125, "logps/rejected": -492.5049133300781, "loss": 0.0788, "rewards/accuracies": 1.0, "rewards/chosen": 4.8604583740234375, "rewards/margins": 5.914273262023926, "rewards/rejected": -1.0538151264190674, "step": 3247 }, { "epoch": 2.3729680365296804, "grad_norm": 14.947795815150986, "learning_rate": 2.121837438130523e-07, "logits/chosen": -2.530256748199463, "logits/rejected": -1.835095763206482, "logps/chosen": -410.65289306640625, "logps/rejected": -295.29510498046875, "loss": 0.0725, "rewards/accuracies": 1.0, "rewards/chosen": 2.298905849456787, "rewards/margins": 5.814750671386719, "rewards/rejected": -3.5158448219299316, "step": 3248 }, { "epoch": 2.3736986301369862, "grad_norm": 7.954761389241391, "learning_rate": 2.1202608280045559e-07, "logits/chosen": -2.5468573570251465, "logits/rejected": -1.811657428741455, "logps/chosen": -517.0000610351562, "logps/rejected": -445.30419921875, "loss": 0.0382, "rewards/accuracies": 1.0, "rewards/chosen": 3.5617761611938477, "rewards/margins": 7.207512855529785, "rewards/rejected": -3.6457366943359375, "step": 3249 }, { "epoch": 2.374429223744292, "grad_norm": 11.908012680175098, "learning_rate": 2.1186843724570718e-07, "logits/chosen": -2.8583784103393555, "logits/rejected": -2.411553382873535, "logps/chosen": -514.700439453125, "logps/rejected": -495.3212890625, "loss": 0.0655, "rewards/accuracies": 1.0, "rewards/chosen": 2.9843897819519043, "rewards/margins": 2.6609065532684326, "rewards/rejected": 0.32348328828811646, "step": 3250 }, { "epoch": 2.375159817351598, "grad_norm": 7.64593284050186, "learning_rate": 2.117108072129791e-07, "logits/chosen": -2.871943712234497, "logits/rejected": -2.2781219482421875, "logps/chosen": -739.1212158203125, "logps/rejected": -585.9860229492188, "loss": 0.0454, "rewards/accuracies": 1.0, "rewards/chosen": 4.359673500061035, "rewards/margins": 5.58512020111084, "rewards/rejected": -1.2254467010498047, "step": 3251 }, { "epoch": 2.3758904109589043, "grad_norm": 13.933609629055992, "learning_rate": 2.1155319276643697e-07, "logits/chosen": -2.8637380599975586, "logits/rejected": -1.817155361175537, "logps/chosen": -559.969970703125, "logps/rejected": -501.9647216796875, "loss": 0.0759, "rewards/accuracies": 1.0, "rewards/chosen": 3.931227684020996, "rewards/margins": 5.499495983123779, "rewards/rejected": -1.5682684183120728, "step": 3252 }, { "epoch": 2.37662100456621, "grad_norm": 8.798205314042567, "learning_rate": 2.1139559397024003e-07, "logits/chosen": -3.096219539642334, "logits/rejected": -2.9711966514587402, "logps/chosen": -536.8731689453125, "logps/rejected": -637.1417236328125, "loss": 0.0446, "rewards/accuracies": 1.0, "rewards/chosen": 2.1322059631347656, "rewards/margins": 3.6173229217529297, "rewards/rejected": -1.4851171970367432, "step": 3253 }, { "epoch": 2.377351598173516, "grad_norm": 14.864898585625719, "learning_rate": 2.1123801088854125e-07, "logits/chosen": -3.322150945663452, "logits/rejected": -2.0757861137390137, "logps/chosen": -647.1425170898438, "logps/rejected": -376.9111328125, "loss": 0.1104, "rewards/accuracies": 1.0, "rewards/chosen": 4.112119197845459, "rewards/margins": 5.226171493530273, "rewards/rejected": -1.1140519380569458, "step": 3254 }, { "epoch": 2.378082191780822, "grad_norm": 9.54168670074714, "learning_rate": 2.1108044358548723e-07, "logits/chosen": -2.7760252952575684, "logits/rejected": -2.4896950721740723, "logps/chosen": -694.8831176757812, "logps/rejected": -710.7523803710938, "loss": 0.0629, "rewards/accuracies": 1.0, "rewards/chosen": 3.5532758235931396, "rewards/margins": 4.479866981506348, "rewards/rejected": -0.9265913963317871, "step": 3255 }, { "epoch": 2.378812785388128, "grad_norm": 10.237678379093085, "learning_rate": 2.1092289212521815e-07, "logits/chosen": -3.0226125717163086, "logits/rejected": -1.6516752243041992, "logps/chosen": -553.533935546875, "logps/rejected": -278.1886901855469, "loss": 0.0411, "rewards/accuracies": 1.0, "rewards/chosen": 3.699557304382324, "rewards/margins": 5.891382694244385, "rewards/rejected": -2.1918253898620605, "step": 3256 }, { "epoch": 2.3795433789954337, "grad_norm": 7.348554362641888, "learning_rate": 2.1076535657186743e-07, "logits/chosen": -2.5166046619415283, "logits/rejected": -1.7160800695419312, "logps/chosen": -666.9852294921875, "logps/rejected": -442.53472900390625, "loss": 0.0298, "rewards/accuracies": 1.0, "rewards/chosen": 3.803572177886963, "rewards/margins": 6.143396377563477, "rewards/rejected": -2.339824676513672, "step": 3257 }, { "epoch": 2.3802739726027395, "grad_norm": 12.809050602747652, "learning_rate": 2.1060783698956232e-07, "logits/chosen": -3.096494197845459, "logits/rejected": -2.0032200813293457, "logps/chosen": -410.83856201171875, "logps/rejected": -293.96160888671875, "loss": 0.0482, "rewards/accuracies": 1.0, "rewards/chosen": 2.9665727615356445, "rewards/margins": 5.378962516784668, "rewards/rejected": -2.4123902320861816, "step": 3258 }, { "epoch": 2.381004566210046, "grad_norm": 21.18346445366682, "learning_rate": 2.1045033344242368e-07, "logits/chosen": -2.9561049938201904, "logits/rejected": -2.3327977657318115, "logps/chosen": -854.4967041015625, "logps/rejected": -596.026123046875, "loss": 0.1011, "rewards/accuracies": 1.0, "rewards/chosen": 2.8470699787139893, "rewards/margins": 4.08048677444458, "rewards/rejected": -1.2334166765213013, "step": 3259 }, { "epoch": 2.3817351598173517, "grad_norm": 14.37124133520304, "learning_rate": 2.1029284599456558e-07, "logits/chosen": -2.6268422603607178, "logits/rejected": -2.3547048568725586, "logps/chosen": -293.2623596191406, "logps/rejected": -382.6225891113281, "loss": 0.0651, "rewards/accuracies": 1.0, "rewards/chosen": 1.7826274633407593, "rewards/margins": 3.995192289352417, "rewards/rejected": -2.212564706802368, "step": 3260 }, { "epoch": 2.3824657534246576, "grad_norm": 29.88989193102641, "learning_rate": 2.1013537471009578e-07, "logits/chosen": -2.5879974365234375, "logits/rejected": -2.2730228900909424, "logps/chosen": -726.0223999023438, "logps/rejected": -581.909912109375, "loss": 0.0916, "rewards/accuracies": 1.0, "rewards/chosen": 3.1936755180358887, "rewards/margins": 4.549412727355957, "rewards/rejected": -1.3557369709014893, "step": 3261 }, { "epoch": 2.3831963470319635, "grad_norm": 11.662357806190366, "learning_rate": 2.0997791965311505e-07, "logits/chosen": -2.634890079498291, "logits/rejected": -2.4998788833618164, "logps/chosen": -521.8649291992188, "logps/rejected": -482.88433837890625, "loss": 0.0545, "rewards/accuracies": 1.0, "rewards/chosen": 2.7580649852752686, "rewards/margins": 3.9597115516662598, "rewards/rejected": -1.2016465663909912, "step": 3262 }, { "epoch": 2.3839269406392694, "grad_norm": 6.32449168699564, "learning_rate": 2.098204808877179e-07, "logits/chosen": -3.0488901138305664, "logits/rejected": -2.386322259902954, "logps/chosen": -842.3619995117188, "logps/rejected": -763.9119873046875, "loss": 0.031, "rewards/accuracies": 1.0, "rewards/chosen": 4.18773078918457, "rewards/margins": 5.686225414276123, "rewards/rejected": -1.4984949827194214, "step": 3263 }, { "epoch": 2.3846575342465752, "grad_norm": 9.828145571446077, "learning_rate": 2.0966305847799214e-07, "logits/chosen": -2.841519355773926, "logits/rejected": -2.2117128372192383, "logps/chosen": -604.3095092773438, "logps/rejected": -495.02447509765625, "loss": 0.0296, "rewards/accuracies": 1.0, "rewards/chosen": 4.335815906524658, "rewards/margins": 6.700581073760986, "rewards/rejected": -2.36476469039917, "step": 3264 }, { "epoch": 2.385388127853881, "grad_norm": 7.862400936115164, "learning_rate": 2.0950565248801902e-07, "logits/chosen": -2.2649736404418945, "logits/rejected": -2.3035430908203125, "logps/chosen": -836.1757202148438, "logps/rejected": -785.7737426757812, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": 3.6916699409484863, "rewards/margins": 8.13565444946289, "rewards/rejected": -4.443984508514404, "step": 3265 }, { "epoch": 2.3861187214611874, "grad_norm": 12.3088193707503, "learning_rate": 2.093482629818728e-07, "logits/chosen": -2.8691911697387695, "logits/rejected": -2.0659847259521484, "logps/chosen": -1085.189453125, "logps/rejected": -686.3433837890625, "loss": 0.0439, "rewards/accuracies": 1.0, "rewards/chosen": 4.424211502075195, "rewards/margins": 3.96451473236084, "rewards/rejected": 0.45969638228416443, "step": 3266 }, { "epoch": 2.3868493150684933, "grad_norm": 12.301972046600966, "learning_rate": 2.0919089002362135e-07, "logits/chosen": -1.8855500221252441, "logits/rejected": -2.4396896362304688, "logps/chosen": -286.52142333984375, "logps/rejected": -497.818115234375, "loss": 0.0791, "rewards/accuracies": 1.0, "rewards/chosen": 0.7534410953521729, "rewards/margins": 2.7583134174346924, "rewards/rejected": -2.0048725605010986, "step": 3267 }, { "epoch": 2.387579908675799, "grad_norm": 9.898175820954057, "learning_rate": 2.0903353367732563e-07, "logits/chosen": -3.132113456726074, "logits/rejected": -2.204155683517456, "logps/chosen": -776.6647338867188, "logps/rejected": -498.7330322265625, "loss": 0.0592, "rewards/accuracies": 1.0, "rewards/chosen": 5.8120012283325195, "rewards/margins": 6.477255344390869, "rewards/rejected": -0.6652541756629944, "step": 3268 }, { "epoch": 2.388310502283105, "grad_norm": 17.355636235265457, "learning_rate": 2.0887619400703994e-07, "logits/chosen": -3.1471166610717773, "logits/rejected": -2.912219524383545, "logps/chosen": -621.6504516601562, "logps/rejected": -653.489990234375, "loss": 0.0481, "rewards/accuracies": 1.0, "rewards/chosen": 2.218984842300415, "rewards/margins": 5.728034973144531, "rewards/rejected": -3.5090503692626953, "step": 3269 }, { "epoch": 2.389041095890411, "grad_norm": 14.816407794786368, "learning_rate": 2.0871887107681163e-07, "logits/chosen": -2.919034242630005, "logits/rejected": -2.2525293827056885, "logps/chosen": -859.5333251953125, "logps/rejected": -751.7206420898438, "loss": 0.0492, "rewards/accuracies": 1.0, "rewards/chosen": 5.600754737854004, "rewards/margins": 6.0314459800720215, "rewards/rejected": -0.4306911528110504, "step": 3270 }, { "epoch": 2.389771689497717, "grad_norm": 25.67831249170932, "learning_rate": 2.085615649506814e-07, "logits/chosen": -2.983076810836792, "logits/rejected": -2.56773042678833, "logps/chosen": -540.5064697265625, "logps/rejected": -540.784423828125, "loss": 0.0888, "rewards/accuracies": 1.0, "rewards/chosen": 3.360506534576416, "rewards/margins": 6.33820915222168, "rewards/rejected": -2.9777028560638428, "step": 3271 }, { "epoch": 2.3905022831050227, "grad_norm": 12.826960446911833, "learning_rate": 2.0840427569268304e-07, "logits/chosen": -2.8789422512054443, "logits/rejected": -2.2991909980773926, "logps/chosen": -598.8678588867188, "logps/rejected": -444.3808288574219, "loss": 0.0765, "rewards/accuracies": 1.0, "rewards/chosen": 2.599274158477783, "rewards/margins": 5.04013729095459, "rewards/rejected": -2.4408631324768066, "step": 3272 }, { "epoch": 2.391232876712329, "grad_norm": 7.732373774398889, "learning_rate": 2.0824700336684347e-07, "logits/chosen": -2.891530990600586, "logits/rejected": -2.2682697772979736, "logps/chosen": -547.36767578125, "logps/rejected": -476.8209533691406, "loss": 0.0478, "rewards/accuracies": 1.0, "rewards/chosen": 2.169098377227783, "rewards/margins": 4.486983299255371, "rewards/rejected": -2.3178844451904297, "step": 3273 }, { "epoch": 2.391963470319635, "grad_norm": 12.163269056291659, "learning_rate": 2.080897480371829e-07, "logits/chosen": -2.66314697265625, "logits/rejected": -2.275028944015503, "logps/chosen": -591.2669067382812, "logps/rejected": -531.810791015625, "loss": 0.0751, "rewards/accuracies": 1.0, "rewards/chosen": 2.501727342605591, "rewards/margins": 3.4372971057891846, "rewards/rejected": -0.9355697631835938, "step": 3274 }, { "epoch": 2.3926940639269407, "grad_norm": 10.369706975315466, "learning_rate": 2.079325097677142e-07, "logits/chosen": -2.7171475887298584, "logits/rejected": -1.9529415369033813, "logps/chosen": -936.864013671875, "logps/rejected": -609.728515625, "loss": 0.0496, "rewards/accuracies": 1.0, "rewards/chosen": 4.190535545349121, "rewards/margins": 5.619162082672119, "rewards/rejected": -1.4286266565322876, "step": 3275 }, { "epoch": 2.3934246575342466, "grad_norm": 13.901881116740439, "learning_rate": 2.077752886224436e-07, "logits/chosen": -2.4056546688079834, "logits/rejected": -1.8058457374572754, "logps/chosen": -386.0167541503906, "logps/rejected": -370.4329833984375, "loss": 0.0645, "rewards/accuracies": 1.0, "rewards/chosen": 3.481782913208008, "rewards/margins": 5.0663557052612305, "rewards/rejected": -1.5845730304718018, "step": 3276 }, { "epoch": 2.3941552511415525, "grad_norm": 5.69605525527899, "learning_rate": 2.076180846653704e-07, "logits/chosen": -3.3764514923095703, "logits/rejected": -2.484699249267578, "logps/chosen": -855.934326171875, "logps/rejected": -695.8046264648438, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": 4.720966339111328, "rewards/margins": 5.0580902099609375, "rewards/rejected": -0.3371240794658661, "step": 3277 }, { "epoch": 2.3948858447488584, "grad_norm": 35.52061873713281, "learning_rate": 2.0746089796048687e-07, "logits/chosen": -2.7216053009033203, "logits/rejected": -2.4766225814819336, "logps/chosen": -528.4219360351562, "logps/rejected": -468.7589416503906, "loss": 0.1464, "rewards/accuracies": 1.0, "rewards/chosen": 3.7133824825286865, "rewards/margins": 5.840068340301514, "rewards/rejected": -2.126685857772827, "step": 3278 }, { "epoch": 2.3956164383561642, "grad_norm": 18.981850237206444, "learning_rate": 2.0730372857177803e-07, "logits/chosen": -2.8651483058929443, "logits/rejected": -2.014737367630005, "logps/chosen": -509.8684997558594, "logps/rejected": -352.23052978515625, "loss": 0.0726, "rewards/accuracies": 1.0, "rewards/chosen": 2.806900978088379, "rewards/margins": 5.277135848999023, "rewards/rejected": -2.4702343940734863, "step": 3279 }, { "epoch": 2.3963470319634705, "grad_norm": 5.7841067292640425, "learning_rate": 2.071465765632221e-07, "logits/chosen": -3.1055030822753906, "logits/rejected": -2.447995901107788, "logps/chosen": -516.6448974609375, "logps/rejected": -379.2862548828125, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": 2.3894150257110596, "rewards/margins": 5.818399429321289, "rewards/rejected": -3.4289841651916504, "step": 3280 }, { "epoch": 2.3970776255707764, "grad_norm": 17.92523177886891, "learning_rate": 2.0698944199879008e-07, "logits/chosen": -2.557088851928711, "logits/rejected": -2.521368980407715, "logps/chosen": -728.8955078125, "logps/rejected": -639.9723510742188, "loss": 0.0622, "rewards/accuracies": 1.0, "rewards/chosen": 3.942326307296753, "rewards/margins": 6.466446876525879, "rewards/rejected": -2.524120807647705, "step": 3281 }, { "epoch": 2.3978082191780823, "grad_norm": 16.22621268279643, "learning_rate": 2.068323249424461e-07, "logits/chosen": -2.7168283462524414, "logits/rejected": -2.050342559814453, "logps/chosen": -694.8328247070312, "logps/rejected": -482.7398376464844, "loss": 0.0877, "rewards/accuracies": 1.0, "rewards/chosen": 2.7445077896118164, "rewards/margins": 3.2035374641418457, "rewards/rejected": -0.4590294361114502, "step": 3282 }, { "epoch": 2.398538812785388, "grad_norm": 10.208295881760595, "learning_rate": 2.0667522545814682e-07, "logits/chosen": -2.9899635314941406, "logits/rejected": -2.4654977321624756, "logps/chosen": -912.3071899414062, "logps/rejected": -719.4200439453125, "loss": 0.044, "rewards/accuracies": 1.0, "rewards/chosen": 5.670777320861816, "rewards/margins": 7.349211692810059, "rewards/rejected": -1.6784348487854004, "step": 3283 }, { "epoch": 2.399269406392694, "grad_norm": 8.708534299244235, "learning_rate": 2.0651814360984195e-07, "logits/chosen": -2.80727481842041, "logits/rejected": -2.7366433143615723, "logps/chosen": -626.3572998046875, "logps/rejected": -767.9189453125, "loss": 0.0465, "rewards/accuracies": 1.0, "rewards/chosen": 4.438985824584961, "rewards/margins": 6.254001617431641, "rewards/rejected": -1.8150157928466797, "step": 3284 }, { "epoch": 2.4, "grad_norm": 14.440841486285738, "learning_rate": 2.0636107946147408e-07, "logits/chosen": -2.1533005237579346, "logits/rejected": -1.9960029125213623, "logps/chosen": -776.082763671875, "logps/rejected": -718.0213012695312, "loss": 0.086, "rewards/accuracies": 0.875, "rewards/chosen": 3.515472888946533, "rewards/margins": 4.793317794799805, "rewards/rejected": -1.2778446674346924, "step": 3285 }, { "epoch": 2.400730593607306, "grad_norm": 8.833216348509206, "learning_rate": 2.0620403307697846e-07, "logits/chosen": -3.3208532333374023, "logits/rejected": -2.2296814918518066, "logps/chosen": -1054.744140625, "logps/rejected": -810.6175537109375, "loss": 0.0328, "rewards/accuracies": 1.0, "rewards/chosen": 6.117766380310059, "rewards/margins": 7.205667495727539, "rewards/rejected": -1.0879006385803223, "step": 3286 }, { "epoch": 2.401461187214612, "grad_norm": 15.93583975727452, "learning_rate": 2.060470045202832e-07, "logits/chosen": -3.1591110229492188, "logits/rejected": -2.04032301902771, "logps/chosen": -927.3074340820312, "logps/rejected": -591.7244262695312, "loss": 0.0543, "rewards/accuracies": 1.0, "rewards/chosen": 4.0870866775512695, "rewards/margins": 4.813049793243408, "rewards/rejected": -0.7259632349014282, "step": 3287 }, { "epoch": 2.402191780821918, "grad_norm": 5.781445639109522, "learning_rate": 2.0588999385530904e-07, "logits/chosen": -2.470960855484009, "logits/rejected": -2.422914505004883, "logps/chosen": -289.72686767578125, "logps/rejected": -362.72186279296875, "loss": 0.0479, "rewards/accuracies": 1.0, "rewards/chosen": 2.4570682048797607, "rewards/margins": 6.839597225189209, "rewards/rejected": -4.382529258728027, "step": 3288 }, { "epoch": 2.402922374429224, "grad_norm": 10.805603297459625, "learning_rate": 2.0573300114596954e-07, "logits/chosen": -2.541879892349243, "logits/rejected": -1.9731884002685547, "logps/chosen": -682.1115112304688, "logps/rejected": -632.2555541992188, "loss": 0.0372, "rewards/accuracies": 1.0, "rewards/chosen": 4.995461463928223, "rewards/margins": 8.100862503051758, "rewards/rejected": -3.105400800704956, "step": 3289 }, { "epoch": 2.4036529680365297, "grad_norm": 11.021488009203281, "learning_rate": 2.055760264561709e-07, "logits/chosen": -2.347231388092041, "logits/rejected": -2.3361735343933105, "logps/chosen": -741.1771240234375, "logps/rejected": -733.3976440429688, "loss": 0.0391, "rewards/accuracies": 1.0, "rewards/chosen": 3.7398011684417725, "rewards/margins": 4.865853786468506, "rewards/rejected": -1.1260526180267334, "step": 3290 }, { "epoch": 2.4043835616438356, "grad_norm": 11.182819090464559, "learning_rate": 2.0541906984981217e-07, "logits/chosen": -2.6254677772521973, "logits/rejected": -2.634521961212158, "logps/chosen": -413.1361083984375, "logps/rejected": -469.3686218261719, "loss": 0.0442, "rewards/accuracies": 1.0, "rewards/chosen": 2.696370840072632, "rewards/margins": 4.810658931732178, "rewards/rejected": -2.114287853240967, "step": 3291 }, { "epoch": 2.4051141552511415, "grad_norm": 29.95599698509622, "learning_rate": 2.052621313907846e-07, "logits/chosen": -3.1786985397338867, "logits/rejected": -2.1128268241882324, "logps/chosen": -701.5901489257812, "logps/rejected": -511.5746765136719, "loss": 0.0893, "rewards/accuracies": 1.0, "rewards/chosen": 2.7955100536346436, "rewards/margins": 4.579727649688721, "rewards/rejected": -1.7842175960540771, "step": 3292 }, { "epoch": 2.4058447488584473, "grad_norm": 30.203125, "learning_rate": 2.0510521114297247e-07, "logits/chosen": -2.568075180053711, "logits/rejected": -2.943115711212158, "logps/chosen": -483.4615478515625, "logps/rejected": -629.3409423828125, "loss": 0.0862, "rewards/accuracies": 1.0, "rewards/chosen": 2.3310306072235107, "rewards/margins": 3.4954092502593994, "rewards/rejected": -1.1643786430358887, "step": 3293 }, { "epoch": 2.4065753424657537, "grad_norm": 19.532171072031634, "learning_rate": 2.0494830917025243e-07, "logits/chosen": -2.8876564502716064, "logits/rejected": -3.12387752532959, "logps/chosen": -327.309814453125, "logps/rejected": -563.6180419921875, "loss": 0.0775, "rewards/accuracies": 1.0, "rewards/chosen": 2.4321398735046387, "rewards/margins": 7.219239711761475, "rewards/rejected": -4.787099361419678, "step": 3294 }, { "epoch": 2.4073059360730595, "grad_norm": 12.344875467876653, "learning_rate": 2.0479142553649397e-07, "logits/chosen": -2.946026563644409, "logits/rejected": -2.8330061435699463, "logps/chosen": -697.6967163085938, "logps/rejected": -601.0592651367188, "loss": 0.0916, "rewards/accuracies": 1.0, "rewards/chosen": 3.59956955909729, "rewards/margins": 3.1928298473358154, "rewards/rejected": 0.40673959255218506, "step": 3295 }, { "epoch": 2.4080365296803654, "grad_norm": 16.07988062875954, "learning_rate": 2.046345603055587e-07, "logits/chosen": -2.70786452293396, "logits/rejected": -2.0063302516937256, "logps/chosen": -687.0689086914062, "logps/rejected": -485.9326477050781, "loss": 0.0501, "rewards/accuracies": 1.0, "rewards/chosen": 4.806509017944336, "rewards/margins": 6.639956951141357, "rewards/rejected": -1.8334479331970215, "step": 3296 }, { "epoch": 2.4087671232876713, "grad_norm": 13.725686511604575, "learning_rate": 2.0447771354130096e-07, "logits/chosen": -2.9243757724761963, "logits/rejected": -2.292604684829712, "logps/chosen": -717.5719604492188, "logps/rejected": -634.489501953125, "loss": 0.0585, "rewards/accuracies": 1.0, "rewards/chosen": 4.5340399742126465, "rewards/margins": 6.97376823425293, "rewards/rejected": -2.439728260040283, "step": 3297 }, { "epoch": 2.409497716894977, "grad_norm": 10.184183716009143, "learning_rate": 2.0432088530756767e-07, "logits/chosen": -2.7873494625091553, "logits/rejected": -2.543156862258911, "logps/chosen": -657.8326416015625, "logps/rejected": -637.76806640625, "loss": 0.0499, "rewards/accuracies": 1.0, "rewards/chosen": 2.0085859298706055, "rewards/margins": 4.6087727546691895, "rewards/rejected": -2.600186824798584, "step": 3298 }, { "epoch": 2.410228310502283, "grad_norm": 6.798996447392099, "learning_rate": 2.04164075668198e-07, "logits/chosen": -3.1553220748901367, "logits/rejected": -2.5747435092926025, "logps/chosen": -792.0040893554688, "logps/rejected": -769.3431396484375, "loss": 0.0299, "rewards/accuracies": 1.0, "rewards/chosen": 3.950164794921875, "rewards/margins": 4.835516929626465, "rewards/rejected": -0.8853521347045898, "step": 3299 }, { "epoch": 2.410958904109589, "grad_norm": 11.506803407471734, "learning_rate": 2.0400728468702374e-07, "logits/chosen": -2.3796560764312744, "logits/rejected": -2.7947540283203125, "logps/chosen": -620.4978637695312, "logps/rejected": -654.2205200195312, "loss": 0.0392, "rewards/accuracies": 1.0, "rewards/chosen": 3.2359657287597656, "rewards/margins": 5.577646255493164, "rewards/rejected": -2.3416800498962402, "step": 3300 }, { "epoch": 2.4116894977168952, "grad_norm": 13.918894576104208, "learning_rate": 2.038505124278689e-07, "logits/chosen": -2.53893780708313, "logits/rejected": -1.907118558883667, "logps/chosen": -602.798828125, "logps/rejected": -576.3583984375, "loss": 0.0489, "rewards/accuracies": 1.0, "rewards/chosen": 4.53765869140625, "rewards/margins": 6.810520172119141, "rewards/rejected": -2.2728614807128906, "step": 3301 }, { "epoch": 2.412420091324201, "grad_norm": 10.088150597146294, "learning_rate": 2.0369375895454998e-07, "logits/chosen": -2.207395076751709, "logits/rejected": -1.9259730577468872, "logps/chosen": -508.6260986328125, "logps/rejected": -486.81658935546875, "loss": 0.0559, "rewards/accuracies": 1.0, "rewards/chosen": 2.32364821434021, "rewards/margins": 5.145516395568848, "rewards/rejected": -2.821868658065796, "step": 3302 }, { "epoch": 2.413150684931507, "grad_norm": 8.60137002367528, "learning_rate": 2.0353702433087583e-07, "logits/chosen": -3.364647388458252, "logits/rejected": -2.0549557209014893, "logps/chosen": -541.4755859375, "logps/rejected": -403.60467529296875, "loss": 0.0311, "rewards/accuracies": 1.0, "rewards/chosen": 3.775907039642334, "rewards/margins": 6.143289566040039, "rewards/rejected": -2.367382526397705, "step": 3303 }, { "epoch": 2.413881278538813, "grad_norm": 7.763526924747495, "learning_rate": 2.033803086206477e-07, "logits/chosen": -2.9107704162597656, "logits/rejected": -2.492485761642456, "logps/chosen": -639.4989624023438, "logps/rejected": -576.664306640625, "loss": 0.0251, "rewards/accuracies": 1.0, "rewards/chosen": 3.804586887359619, "rewards/margins": 4.419656753540039, "rewards/rejected": -0.6150695085525513, "step": 3304 }, { "epoch": 2.4146118721461187, "grad_norm": 8.906398624719328, "learning_rate": 2.032236118876589e-07, "logits/chosen": -2.5928192138671875, "logits/rejected": -1.903133511543274, "logps/chosen": -334.61956787109375, "logps/rejected": -288.4481506347656, "loss": 0.0409, "rewards/accuracies": 1.0, "rewards/chosen": 3.367849826812744, "rewards/margins": 7.115782737731934, "rewards/rejected": -3.7479329109191895, "step": 3305 }, { "epoch": 2.4153424657534246, "grad_norm": 11.540004642690587, "learning_rate": 2.0306693419569524e-07, "logits/chosen": -2.743217945098877, "logits/rejected": -2.2641053199768066, "logps/chosen": -630.159423828125, "logps/rejected": -584.7814331054688, "loss": 0.0753, "rewards/accuracies": 1.0, "rewards/chosen": 4.2459516525268555, "rewards/margins": 6.444332599639893, "rewards/rejected": -2.198380470275879, "step": 3306 }, { "epoch": 2.4160730593607305, "grad_norm": 16.471175476632844, "learning_rate": 2.0291027560853473e-07, "logits/chosen": -2.910518169403076, "logits/rejected": -2.4203879833221436, "logps/chosen": -659.872802734375, "logps/rejected": -538.6534423828125, "loss": 0.0764, "rewards/accuracies": 1.0, "rewards/chosen": 1.7029398679733276, "rewards/margins": 3.4522058963775635, "rewards/rejected": -1.7492659091949463, "step": 3307 }, { "epoch": 2.4168036529680363, "grad_norm": 14.901043316457587, "learning_rate": 2.027536361899476e-07, "logits/chosen": -2.5261764526367188, "logits/rejected": -2.0654067993164062, "logps/chosen": -608.75439453125, "logps/rejected": -521.5895385742188, "loss": 0.0595, "rewards/accuracies": 1.0, "rewards/chosen": 3.220142364501953, "rewards/margins": 4.780004978179932, "rewards/rejected": -1.5598628520965576, "step": 3308 }, { "epoch": 2.417534246575342, "grad_norm": 9.672010282146582, "learning_rate": 2.0259701600369616e-07, "logits/chosen": -2.7323970794677734, "logits/rejected": -2.7458295822143555, "logps/chosen": -398.9154357910156, "logps/rejected": -561.437255859375, "loss": 0.0404, "rewards/accuracies": 1.0, "rewards/chosen": 1.8018591403961182, "rewards/margins": 4.831790924072266, "rewards/rejected": -3.0299322605133057, "step": 3309 }, { "epoch": 2.4182648401826485, "grad_norm": 6.218662893940832, "learning_rate": 2.0244041511353505e-07, "logits/chosen": -2.475700855255127, "logits/rejected": -2.580470323562622, "logps/chosen": -612.9613037109375, "logps/rejected": -650.5975341796875, "loss": 0.024, "rewards/accuracies": 1.0, "rewards/chosen": 2.454481840133667, "rewards/margins": 4.210619926452637, "rewards/rejected": -1.7561380863189697, "step": 3310 }, { "epoch": 2.4189954337899544, "grad_norm": 9.968394114679954, "learning_rate": 2.022838335832109e-07, "logits/chosen": -2.8208096027374268, "logits/rejected": -2.5641794204711914, "logps/chosen": -813.9129028320312, "logps/rejected": -792.0380859375, "loss": 0.0708, "rewards/accuracies": 1.0, "rewards/chosen": 5.033167839050293, "rewards/margins": 4.697427749633789, "rewards/rejected": 0.33574026823043823, "step": 3311 }, { "epoch": 2.4197260273972603, "grad_norm": 8.639283226092575, "learning_rate": 2.021272714764627e-07, "logits/chosen": -3.2190537452697754, "logits/rejected": -2.763864755630493, "logps/chosen": -545.98291015625, "logps/rejected": -624.9891357421875, "loss": 0.0335, "rewards/accuracies": 1.0, "rewards/chosen": 2.8534913063049316, "rewards/margins": 5.300436973571777, "rewards/rejected": -2.4469456672668457, "step": 3312 }, { "epoch": 2.420456621004566, "grad_norm": 17.525577708693014, "learning_rate": 2.0197072885702145e-07, "logits/chosen": -2.8448328971862793, "logits/rejected": -1.741553783416748, "logps/chosen": -574.0647583007812, "logps/rejected": -402.8617858886719, "loss": 0.0718, "rewards/accuracies": 0.875, "rewards/chosen": 3.6280362606048584, "rewards/margins": 6.231081962585449, "rewards/rejected": -2.6030454635620117, "step": 3313 }, { "epoch": 2.421187214611872, "grad_norm": 10.28168447208414, "learning_rate": 2.018142057886099e-07, "logits/chosen": -2.649056911468506, "logits/rejected": -2.1158652305603027, "logps/chosen": -596.7120361328125, "logps/rejected": -663.680419921875, "loss": 0.0553, "rewards/accuracies": 1.0, "rewards/chosen": 2.5701026916503906, "rewards/margins": 5.996651649475098, "rewards/rejected": -3.426548957824707, "step": 3314 }, { "epoch": 2.421917808219178, "grad_norm": 9.529727001142719, "learning_rate": 2.016577023349432e-07, "logits/chosen": -3.256434202194214, "logits/rejected": -2.6440906524658203, "logps/chosen": -1294.41943359375, "logps/rejected": -907.7890625, "loss": 0.0414, "rewards/accuracies": 1.0, "rewards/chosen": 2.4968085289001465, "rewards/margins": 4.457881927490234, "rewards/rejected": -1.9610735177993774, "step": 3315 }, { "epoch": 2.4226484018264838, "grad_norm": 8.340744791782123, "learning_rate": 2.0150121855972845e-07, "logits/chosen": -2.5342061519622803, "logits/rejected": -2.248810291290283, "logps/chosen": -523.18701171875, "logps/rejected": -402.13916015625, "loss": 0.0428, "rewards/accuracies": 1.0, "rewards/chosen": 2.4170081615448, "rewards/margins": 4.508241653442383, "rewards/rejected": -2.091233491897583, "step": 3316 }, { "epoch": 2.42337899543379, "grad_norm": 12.900910314333503, "learning_rate": 2.0134475452666477e-07, "logits/chosen": -2.85280442237854, "logits/rejected": -2.612717628479004, "logps/chosen": -757.1589965820312, "logps/rejected": -616.5592041015625, "loss": 0.071, "rewards/accuracies": 1.0, "rewards/chosen": 3.5123605728149414, "rewards/margins": 3.429481267929077, "rewards/rejected": 0.08287939429283142, "step": 3317 }, { "epoch": 2.424109589041096, "grad_norm": 9.201085847546787, "learning_rate": 2.01188310299443e-07, "logits/chosen": -3.236400604248047, "logits/rejected": -2.2097949981689453, "logps/chosen": -902.414306640625, "logps/rejected": -729.2623901367188, "loss": 0.0294, "rewards/accuracies": 1.0, "rewards/chosen": 6.4520978927612305, "rewards/margins": 7.980191230773926, "rewards/rejected": -1.528093695640564, "step": 3318 }, { "epoch": 2.424840182648402, "grad_norm": 12.536118472534547, "learning_rate": 2.010318859417462e-07, "logits/chosen": -3.291290283203125, "logits/rejected": -2.7105162143707275, "logps/chosen": -607.0497436523438, "logps/rejected": -473.37493896484375, "loss": 0.0735, "rewards/accuracies": 1.0, "rewards/chosen": 2.490231990814209, "rewards/margins": 3.9766995906829834, "rewards/rejected": -1.486467719078064, "step": 3319 }, { "epoch": 2.4255707762557077, "grad_norm": 7.447767591429479, "learning_rate": 2.008754815172492e-07, "logits/chosen": -3.044788360595703, "logits/rejected": -2.5046799182891846, "logps/chosen": -957.6297607421875, "logps/rejected": -688.7381591796875, "loss": 0.0309, "rewards/accuracies": 1.0, "rewards/chosen": 5.112526893615723, "rewards/margins": 5.735818386077881, "rewards/rejected": -0.6232911944389343, "step": 3320 }, { "epoch": 2.4263013698630136, "grad_norm": 10.90113023970128, "learning_rate": 2.0071909708961875e-07, "logits/chosen": -2.711611747741699, "logits/rejected": -2.581718921661377, "logps/chosen": -562.1936645507812, "logps/rejected": -576.3536987304688, "loss": 0.0572, "rewards/accuracies": 1.0, "rewards/chosen": 2.392989158630371, "rewards/margins": 4.9179511070251465, "rewards/rejected": -2.5249619483947754, "step": 3321 }, { "epoch": 2.4270319634703195, "grad_norm": 19.201013633674314, "learning_rate": 2.0056273272251357e-07, "logits/chosen": -2.6964259147644043, "logits/rejected": -2.2430810928344727, "logps/chosen": -681.0314331054688, "logps/rejected": -649.3572387695312, "loss": 0.0987, "rewards/accuracies": 1.0, "rewards/chosen": 2.3760597705841064, "rewards/margins": 4.364593505859375, "rewards/rejected": -1.9885339736938477, "step": 3322 }, { "epoch": 2.4277625570776253, "grad_norm": 11.323052474221974, "learning_rate": 2.0040638847958392e-07, "logits/chosen": -2.7182774543762207, "logits/rejected": -2.5805392265319824, "logps/chosen": -713.2506713867188, "logps/rejected": -625.2772827148438, "loss": 0.0497, "rewards/accuracies": 1.0, "rewards/chosen": 3.245208740234375, "rewards/margins": 4.092101573944092, "rewards/rejected": -0.8468929529190063, "step": 3323 }, { "epoch": 2.4284931506849317, "grad_norm": 10.630233732221514, "learning_rate": 2.0025006442447212e-07, "logits/chosen": -2.673750400543213, "logits/rejected": -1.9887839555740356, "logps/chosen": -983.4672241210938, "logps/rejected": -628.5855102539062, "loss": 0.0557, "rewards/accuracies": 1.0, "rewards/chosen": 5.060407638549805, "rewards/margins": 5.77691650390625, "rewards/rejected": -0.7165088653564453, "step": 3324 }, { "epoch": 2.4292237442922375, "grad_norm": 12.246954714551428, "learning_rate": 2.0009376062081223e-07, "logits/chosen": -2.8495075702667236, "logits/rejected": -2.3896872997283936, "logps/chosen": -867.048095703125, "logps/rejected": -637.1670532226562, "loss": 0.0575, "rewards/accuracies": 1.0, "rewards/chosen": 3.9716687202453613, "rewards/margins": 4.967828750610352, "rewards/rejected": -0.9961605072021484, "step": 3325 }, { "epoch": 2.4299543378995434, "grad_norm": 13.791351284020642, "learning_rate": 1.9993747713223016e-07, "logits/chosen": -2.478926658630371, "logits/rejected": -1.7761995792388916, "logps/chosen": -606.4916381835938, "logps/rejected": -382.42578125, "loss": 0.0632, "rewards/accuracies": 1.0, "rewards/chosen": 2.546940565109253, "rewards/margins": 4.117547988891602, "rewards/rejected": -1.5706074237823486, "step": 3326 }, { "epoch": 2.4306849315068493, "grad_norm": 11.150923453780424, "learning_rate": 1.9978121402234318e-07, "logits/chosen": -2.112105369567871, "logits/rejected": -2.368325710296631, "logps/chosen": -725.072509765625, "logps/rejected": -760.8036499023438, "loss": 0.0482, "rewards/accuracies": 1.0, "rewards/chosen": 2.47302508354187, "rewards/margins": 4.847164154052734, "rewards/rejected": -2.3741393089294434, "step": 3327 }, { "epoch": 2.431415525114155, "grad_norm": 8.430685272419892, "learning_rate": 1.9962497135476063e-07, "logits/chosen": -2.8674683570861816, "logits/rejected": -2.0772080421447754, "logps/chosen": -975.982421875, "logps/rejected": -605.1265869140625, "loss": 0.0343, "rewards/accuracies": 1.0, "rewards/chosen": 6.098092555999756, "rewards/margins": 6.45150899887085, "rewards/rejected": -0.3534161150455475, "step": 3328 }, { "epoch": 2.432146118721461, "grad_norm": 6.762189525873842, "learning_rate": 1.9946874919308337e-07, "logits/chosen": -2.6847827434539795, "logits/rejected": -2.152754545211792, "logps/chosen": -667.4112548828125, "logps/rejected": -561.0045166015625, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": 3.08050537109375, "rewards/margins": 4.979426383972168, "rewards/rejected": -1.898921251296997, "step": 3329 }, { "epoch": 2.432876712328767, "grad_norm": 13.616122932426011, "learning_rate": 1.9931254760090418e-07, "logits/chosen": -2.6279428005218506, "logits/rejected": -2.348038911819458, "logps/chosen": -947.516357421875, "logps/rejected": -913.063232421875, "loss": 0.0488, "rewards/accuracies": 1.0, "rewards/chosen": 4.498388767242432, "rewards/margins": 5.159768581390381, "rewards/rejected": -0.6613799929618835, "step": 3330 }, { "epoch": 2.433607305936073, "grad_norm": 15.94453400613676, "learning_rate": 1.9915636664180697e-07, "logits/chosen": -3.220482587814331, "logits/rejected": -2.3437955379486084, "logps/chosen": -740.7330932617188, "logps/rejected": -604.0032348632812, "loss": 0.0647, "rewards/accuracies": 1.0, "rewards/chosen": 4.226768493652344, "rewards/margins": 5.752678871154785, "rewards/rejected": -1.525909662246704, "step": 3331 }, { "epoch": 2.434337899543379, "grad_norm": 15.239720616620774, "learning_rate": 1.990002063793676e-07, "logits/chosen": -3.1924424171447754, "logits/rejected": -2.6736536026000977, "logps/chosen": -841.2461547851562, "logps/rejected": -728.751708984375, "loss": 0.0519, "rewards/accuracies": 1.0, "rewards/chosen": 4.502110481262207, "rewards/margins": 3.6651690006256104, "rewards/rejected": 0.836941123008728, "step": 3332 }, { "epoch": 2.435068493150685, "grad_norm": 20.577842484535626, "learning_rate": 1.9884406687715347e-07, "logits/chosen": -2.687788486480713, "logits/rejected": -2.2070159912109375, "logps/chosen": -667.302001953125, "logps/rejected": -568.3532104492188, "loss": 0.0665, "rewards/accuracies": 1.0, "rewards/chosen": 3.480297565460205, "rewards/margins": 5.326694011688232, "rewards/rejected": -1.8463964462280273, "step": 3333 }, { "epoch": 2.435799086757991, "grad_norm": 17.033652711824065, "learning_rate": 1.9868794819872344e-07, "logits/chosen": -2.881317138671875, "logits/rejected": -2.318425416946411, "logps/chosen": -530.904052734375, "logps/rejected": -472.29669189453125, "loss": 0.0934, "rewards/accuracies": 1.0, "rewards/chosen": 3.033705234527588, "rewards/margins": 3.7857375144958496, "rewards/rejected": -0.7520323991775513, "step": 3334 }, { "epoch": 2.4365296803652967, "grad_norm": 8.097538006402731, "learning_rate": 1.9853185040762807e-07, "logits/chosen": -2.147608757019043, "logits/rejected": -2.666130304336548, "logps/chosen": -396.33258056640625, "logps/rejected": -836.1066284179688, "loss": 0.0405, "rewards/accuracies": 1.0, "rewards/chosen": 1.4470767974853516, "rewards/margins": 6.170149326324463, "rewards/rejected": -4.7230730056762695, "step": 3335 }, { "epoch": 2.4372602739726026, "grad_norm": 13.721985732978107, "learning_rate": 1.9837577356740913e-07, "logits/chosen": -3.40617036819458, "logits/rejected": -2.9943883419036865, "logps/chosen": -604.363525390625, "logps/rejected": -446.6531066894531, "loss": 0.066, "rewards/accuracies": 1.0, "rewards/chosen": 2.8533267974853516, "rewards/margins": 2.870999574661255, "rewards/rejected": -0.017672687768936157, "step": 3336 }, { "epoch": 2.4379908675799085, "grad_norm": 13.807050501634572, "learning_rate": 1.982197177416001e-07, "logits/chosen": -2.9743497371673584, "logits/rejected": -2.5704827308654785, "logps/chosen": -376.0931701660156, "logps/rejected": -470.0617370605469, "loss": 0.0829, "rewards/accuracies": 0.875, "rewards/chosen": 1.978400707244873, "rewards/margins": 3.4729700088500977, "rewards/rejected": -1.4945695400238037, "step": 3337 }, { "epoch": 2.438721461187215, "grad_norm": 9.076914721389366, "learning_rate": 1.9806368299372577e-07, "logits/chosen": -2.4612271785736084, "logits/rejected": -1.6127454042434692, "logps/chosen": -482.8551330566406, "logps/rejected": -327.3107604980469, "loss": 0.0434, "rewards/accuracies": 1.0, "rewards/chosen": 1.6991770267486572, "rewards/margins": 4.322805881500244, "rewards/rejected": -2.623628854751587, "step": 3338 }, { "epoch": 2.4394520547945207, "grad_norm": 13.832008635430538, "learning_rate": 1.9790766938730255e-07, "logits/chosen": -2.6711044311523438, "logits/rejected": -2.3702709674835205, "logps/chosen": -939.0624389648438, "logps/rejected": -909.5364990234375, "loss": 0.0712, "rewards/accuracies": 1.0, "rewards/chosen": 3.5003418922424316, "rewards/margins": 4.674969673156738, "rewards/rejected": -1.1746273040771484, "step": 3339 }, { "epoch": 2.4401826484018265, "grad_norm": 10.933379874091607, "learning_rate": 1.9775167698583793e-07, "logits/chosen": -3.179194927215576, "logits/rejected": -2.9092538356781006, "logps/chosen": -677.7127075195312, "logps/rejected": -677.5115966796875, "loss": 0.0402, "rewards/accuracies": 1.0, "rewards/chosen": 3.0497586727142334, "rewards/margins": 3.9336822032928467, "rewards/rejected": -0.8839234113693237, "step": 3340 }, { "epoch": 2.4409132420091324, "grad_norm": 6.157020481905157, "learning_rate": 1.97595705852831e-07, "logits/chosen": -2.851539373397827, "logits/rejected": -2.2509450912475586, "logps/chosen": -788.5216064453125, "logps/rejected": -653.8938598632812, "loss": 0.0523, "rewards/accuracies": 1.0, "rewards/chosen": 4.466014862060547, "rewards/margins": 5.421463966369629, "rewards/rejected": -0.9554489254951477, "step": 3341 }, { "epoch": 2.4416438356164383, "grad_norm": 8.80632766028789, "learning_rate": 1.9743975605177215e-07, "logits/chosen": -2.6107261180877686, "logits/rejected": -2.2695672512054443, "logps/chosen": -527.5960083007812, "logps/rejected": -592.097900390625, "loss": 0.0367, "rewards/accuracies": 1.0, "rewards/chosen": 3.5417585372924805, "rewards/margins": 5.68015193939209, "rewards/rejected": -2.1383931636810303, "step": 3342 }, { "epoch": 2.442374429223744, "grad_norm": 23.85229307143498, "learning_rate": 1.972838276461432e-07, "logits/chosen": -2.652226686477661, "logits/rejected": -2.0685815811157227, "logps/chosen": -1077.404052734375, "logps/rejected": -727.8995361328125, "loss": 0.0662, "rewards/accuracies": 1.0, "rewards/chosen": 2.86299729347229, "rewards/margins": 3.874199628829956, "rewards/rejected": -1.011202335357666, "step": 3343 }, { "epoch": 2.44310502283105, "grad_norm": 8.21152611100868, "learning_rate": 1.9712792069941683e-07, "logits/chosen": -2.784633159637451, "logits/rejected": -2.5020108222961426, "logps/chosen": -544.9061889648438, "logps/rejected": -596.1251220703125, "loss": 0.0328, "rewards/accuracies": 1.0, "rewards/chosen": 4.204168319702148, "rewards/margins": 7.106247901916504, "rewards/rejected": -2.9020795822143555, "step": 3344 }, { "epoch": 2.4438356164383563, "grad_norm": 9.840977257062605, "learning_rate": 1.9697203527505745e-07, "logits/chosen": -3.1180057525634766, "logits/rejected": -2.2212791442871094, "logps/chosen": -621.008544921875, "logps/rejected": -433.7398681640625, "loss": 0.0369, "rewards/accuracies": 1.0, "rewards/chosen": 3.2860841751098633, "rewards/margins": 4.737712383270264, "rewards/rejected": -1.4516282081604004, "step": 3345 }, { "epoch": 2.444566210045662, "grad_norm": 10.204008055441147, "learning_rate": 1.9681617143652057e-07, "logits/chosen": -2.3267855644226074, "logits/rejected": -2.1602864265441895, "logps/chosen": -549.0415649414062, "logps/rejected": -583.7855224609375, "loss": 0.0501, "rewards/accuracies": 1.0, "rewards/chosen": 2.9163806438446045, "rewards/margins": 4.2107930183410645, "rewards/rejected": -1.29441237449646, "step": 3346 }, { "epoch": 2.445296803652968, "grad_norm": 11.83707837942061, "learning_rate": 1.9666032924725275e-07, "logits/chosen": -2.791489839553833, "logits/rejected": -2.156538486480713, "logps/chosen": -611.195556640625, "logps/rejected": -327.9372863769531, "loss": 0.0463, "rewards/accuracies": 1.0, "rewards/chosen": 3.354588031768799, "rewards/margins": 5.093924045562744, "rewards/rejected": -1.7393360137939453, "step": 3347 }, { "epoch": 2.446027397260274, "grad_norm": 10.248272633960344, "learning_rate": 1.9650450877069218e-07, "logits/chosen": -2.638202667236328, "logits/rejected": -2.216731548309326, "logps/chosen": -705.0445556640625, "logps/rejected": -673.2044067382812, "loss": 0.0614, "rewards/accuracies": 1.0, "rewards/chosen": 4.008464813232422, "rewards/margins": 3.710723876953125, "rewards/rejected": 0.2977413535118103, "step": 3348 }, { "epoch": 2.44675799086758, "grad_norm": 11.025301531692731, "learning_rate": 1.9634871007026756e-07, "logits/chosen": -2.316418170928955, "logits/rejected": -2.3645544052124023, "logps/chosen": -752.706787109375, "logps/rejected": -742.77490234375, "loss": 0.0399, "rewards/accuracies": 1.0, "rewards/chosen": 2.5234856605529785, "rewards/margins": 6.608056545257568, "rewards/rejected": -4.08457088470459, "step": 3349 }, { "epoch": 2.4474885844748857, "grad_norm": 7.804956318881512, "learning_rate": 1.9619293320939926e-07, "logits/chosen": -2.4347450733184814, "logits/rejected": -2.4467053413391113, "logps/chosen": -658.5248413085938, "logps/rejected": -738.8488159179688, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": 3.6555581092834473, "rewards/margins": 5.6067094802856445, "rewards/rejected": -1.9511513710021973, "step": 3350 }, { "epoch": 2.4482191780821916, "grad_norm": 8.969021357774267, "learning_rate": 1.9603717825149846e-07, "logits/chosen": -2.609032154083252, "logits/rejected": -2.5097813606262207, "logps/chosen": -212.75369262695312, "logps/rejected": -314.4307556152344, "loss": 0.0423, "rewards/accuracies": 1.0, "rewards/chosen": 1.734316349029541, "rewards/margins": 5.369906425476074, "rewards/rejected": -3.635590076446533, "step": 3351 }, { "epoch": 2.448949771689498, "grad_norm": 19.751648653004104, "learning_rate": 1.958814452599677e-07, "logits/chosen": -3.0468239784240723, "logits/rejected": -2.8460516929626465, "logps/chosen": -605.621826171875, "logps/rejected": -657.994384765625, "loss": 0.1157, "rewards/accuracies": 1.0, "rewards/chosen": 2.8716959953308105, "rewards/margins": 4.7322187423706055, "rewards/rejected": -1.860522747039795, "step": 3352 }, { "epoch": 2.4496803652968038, "grad_norm": 10.49929262230783, "learning_rate": 1.9572573429820023e-07, "logits/chosen": -2.408677816390991, "logits/rejected": -1.8990604877471924, "logps/chosen": -372.47442626953125, "logps/rejected": -322.2026672363281, "loss": 0.055, "rewards/accuracies": 1.0, "rewards/chosen": 1.7878897190093994, "rewards/margins": 4.901893615722656, "rewards/rejected": -3.1140036582946777, "step": 3353 }, { "epoch": 2.4504109589041096, "grad_norm": 8.826420604276331, "learning_rate": 1.9557004542958054e-07, "logits/chosen": -2.864342212677002, "logits/rejected": -2.5039079189300537, "logps/chosen": -634.3682861328125, "logps/rejected": -638.2183837890625, "loss": 0.0336, "rewards/accuracies": 1.0, "rewards/chosen": 3.8815293312072754, "rewards/margins": 6.445239067077637, "rewards/rejected": -2.5637102127075195, "step": 3354 }, { "epoch": 2.4511415525114155, "grad_norm": 7.298525245839748, "learning_rate": 1.9541437871748423e-07, "logits/chosen": -2.636801242828369, "logits/rejected": -2.0981605052948, "logps/chosen": -769.8912963867188, "logps/rejected": -742.4061889648438, "loss": 0.0325, "rewards/accuracies": 1.0, "rewards/chosen": 3.686845302581787, "rewards/margins": 5.371156692504883, "rewards/rejected": -1.6843113899230957, "step": 3355 }, { "epoch": 2.4518721461187214, "grad_norm": 18.2330678489275, "learning_rate": 1.952587342252777e-07, "logits/chosen": -2.847881317138672, "logits/rejected": -2.043531894683838, "logps/chosen": -828.3272705078125, "logps/rejected": -417.57073974609375, "loss": 0.0957, "rewards/accuracies": 1.0, "rewards/chosen": 3.614879846572876, "rewards/margins": 5.926480770111084, "rewards/rejected": -2.311601161956787, "step": 3356 }, { "epoch": 2.4526027397260273, "grad_norm": 16.331444040657427, "learning_rate": 1.9510311201631828e-07, "logits/chosen": -3.260995864868164, "logits/rejected": -2.7739977836608887, "logps/chosen": -718.548583984375, "logps/rejected": -650.3748779296875, "loss": 0.058, "rewards/accuracies": 1.0, "rewards/chosen": 2.9493305683135986, "rewards/margins": 3.132481813430786, "rewards/rejected": -0.1831512749195099, "step": 3357 }, { "epoch": 2.453333333333333, "grad_norm": 10.859731896113676, "learning_rate": 1.9494751215395436e-07, "logits/chosen": -2.9744699001312256, "logits/rejected": -1.8661353588104248, "logps/chosen": -795.7674560546875, "logps/rejected": -515.1314086914062, "loss": 0.0492, "rewards/accuracies": 1.0, "rewards/chosen": 4.115716934204102, "rewards/margins": 4.666043281555176, "rewards/rejected": -0.550326943397522, "step": 3358 }, { "epoch": 2.4540639269406395, "grad_norm": 16.745639518128108, "learning_rate": 1.947919347015252e-07, "logits/chosen": -2.6545965671539307, "logits/rejected": -2.282719850540161, "logps/chosen": -477.91888427734375, "logps/rejected": -593.3182373046875, "loss": 0.0578, "rewards/accuracies": 1.0, "rewards/chosen": 2.7932541370391846, "rewards/margins": 18.854978561401367, "rewards/rejected": -16.061723709106445, "step": 3359 }, { "epoch": 2.4547945205479453, "grad_norm": 22.916586174101063, "learning_rate": 1.9463637972236086e-07, "logits/chosen": -2.324436664581299, "logits/rejected": -2.573838472366333, "logps/chosen": -611.7735595703125, "logps/rejected": -848.4111938476562, "loss": 0.0774, "rewards/accuracies": 1.0, "rewards/chosen": 1.9368703365325928, "rewards/margins": 3.9261531829833984, "rewards/rejected": -1.9892828464508057, "step": 3360 }, { "epoch": 2.455525114155251, "grad_norm": 19.110587879282964, "learning_rate": 1.9448084727978248e-07, "logits/chosen": -2.485644817352295, "logits/rejected": -2.150484800338745, "logps/chosen": -401.2565002441406, "logps/rejected": -435.4277648925781, "loss": 0.0644, "rewards/accuracies": 1.0, "rewards/chosen": 2.683661460876465, "rewards/margins": 6.090715408325195, "rewards/rejected": -3.4070537090301514, "step": 3361 }, { "epoch": 2.456255707762557, "grad_norm": 9.12603252265146, "learning_rate": 1.943253374371016e-07, "logits/chosen": -2.5643765926361084, "logits/rejected": -1.9088877439498901, "logps/chosen": -870.8299560546875, "logps/rejected": -649.5517578125, "loss": 0.0594, "rewards/accuracies": 1.0, "rewards/chosen": 3.4265859127044678, "rewards/margins": 4.316395282745361, "rewards/rejected": -0.8898090720176697, "step": 3362 }, { "epoch": 2.456986301369863, "grad_norm": 5.664007610186621, "learning_rate": 1.9416985025762098e-07, "logits/chosen": -2.478433609008789, "logits/rejected": -1.75409734249115, "logps/chosen": -1016.87841796875, "logps/rejected": -824.0634765625, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": 6.1229352951049805, "rewards/margins": 7.159235000610352, "rewards/rejected": -1.0363001823425293, "step": 3363 }, { "epoch": 2.457716894977169, "grad_norm": 15.108316599434565, "learning_rate": 1.9401438580463387e-07, "logits/chosen": -2.988114833831787, "logits/rejected": -2.481656074523926, "logps/chosen": -1039.7098388671875, "logps/rejected": -941.46142578125, "loss": 0.0907, "rewards/accuracies": 1.0, "rewards/chosen": 6.490373611450195, "rewards/margins": 4.934751033782959, "rewards/rejected": 1.5556230545043945, "step": 3364 }, { "epoch": 2.4584474885844747, "grad_norm": 13.092956955958648, "learning_rate": 1.9385894414142464e-07, "logits/chosen": -2.4738128185272217, "logits/rejected": -1.9676544666290283, "logps/chosen": -401.3253173828125, "logps/rejected": -425.6170349121094, "loss": 0.0497, "rewards/accuracies": 1.0, "rewards/chosen": 1.964935302734375, "rewards/margins": 7.2734527587890625, "rewards/rejected": -5.308518409729004, "step": 3365 }, { "epoch": 2.459178082191781, "grad_norm": 17.258538385877202, "learning_rate": 1.9370352533126788e-07, "logits/chosen": -2.968240261077881, "logits/rejected": -1.928729772567749, "logps/chosen": -708.4627685546875, "logps/rejected": -527.0887451171875, "loss": 0.0918, "rewards/accuracies": 0.875, "rewards/chosen": 2.6796655654907227, "rewards/margins": 5.739368915557861, "rewards/rejected": -3.0597031116485596, "step": 3366 }, { "epoch": 2.459908675799087, "grad_norm": 14.320463272358229, "learning_rate": 1.9354812943742917e-07, "logits/chosen": -2.8958230018615723, "logits/rejected": -2.8777389526367188, "logps/chosen": -449.8197021484375, "logps/rejected": -604.29345703125, "loss": 0.0605, "rewards/accuracies": 1.0, "rewards/chosen": 2.836108684539795, "rewards/margins": 5.2220001220703125, "rewards/rejected": -2.3858914375305176, "step": 3367 }, { "epoch": 2.4606392694063928, "grad_norm": 16.49159257720193, "learning_rate": 1.933927565231648e-07, "logits/chosen": -2.3350472450256348, "logits/rejected": -2.098288059234619, "logps/chosen": -527.768798828125, "logps/rejected": -463.92437744140625, "loss": 0.0706, "rewards/accuracies": 1.0, "rewards/chosen": 2.9251761436462402, "rewards/margins": 4.8657402992248535, "rewards/rejected": -1.9405641555786133, "step": 3368 }, { "epoch": 2.4613698630136986, "grad_norm": 11.386768331795997, "learning_rate": 1.9323740665172167e-07, "logits/chosen": -2.882613182067871, "logits/rejected": -2.5687735080718994, "logps/chosen": -449.3076171875, "logps/rejected": -484.7263488769531, "loss": 0.0774, "rewards/accuracies": 1.0, "rewards/chosen": 1.6854008436203003, "rewards/margins": 3.534226894378662, "rewards/rejected": -1.8488259315490723, "step": 3369 }, { "epoch": 2.4621004566210045, "grad_norm": 9.991888571198054, "learning_rate": 1.930820798863371e-07, "logits/chosen": -2.7672061920166016, "logits/rejected": -2.3829197883605957, "logps/chosen": -639.2593994140625, "logps/rejected": -465.2848815917969, "loss": 0.051, "rewards/accuracies": 1.0, "rewards/chosen": 2.6071557998657227, "rewards/margins": 4.0350542068481445, "rewards/rejected": -1.427898645401001, "step": 3370 }, { "epoch": 2.4628310502283104, "grad_norm": 9.963349508827772, "learning_rate": 1.9292677629023923e-07, "logits/chosen": -2.886706829071045, "logits/rejected": -1.8149471282958984, "logps/chosen": -627.1390380859375, "logps/rejected": -515.9738159179688, "loss": 0.0487, "rewards/accuracies": 1.0, "rewards/chosen": 4.837835311889648, "rewards/margins": 6.44425630569458, "rewards/rejected": -1.6064214706420898, "step": 3371 }, { "epoch": 2.4635616438356163, "grad_norm": 8.587161892338724, "learning_rate": 1.9277149592664673e-07, "logits/chosen": -2.4417073726654053, "logits/rejected": -2.2843823432922363, "logps/chosen": -588.4993286132812, "logps/rejected": -656.9373779296875, "loss": 0.0438, "rewards/accuracies": 1.0, "rewards/chosen": 2.0208048820495605, "rewards/margins": 5.467484951019287, "rewards/rejected": -3.4466803073883057, "step": 3372 }, { "epoch": 2.4642922374429226, "grad_norm": 14.129848492047008, "learning_rate": 1.926162388587688e-07, "logits/chosen": -2.8314318656921387, "logits/rejected": -1.7351593971252441, "logps/chosen": -325.1693115234375, "logps/rejected": -249.3162841796875, "loss": 0.0631, "rewards/accuracies": 1.0, "rewards/chosen": 2.4118165969848633, "rewards/margins": 6.38948392868042, "rewards/rejected": -3.9776673316955566, "step": 3373 }, { "epoch": 2.4650228310502285, "grad_norm": 9.14113496841296, "learning_rate": 1.9246100514980512e-07, "logits/chosen": -2.1341025829315186, "logits/rejected": -2.5742902755737305, "logps/chosen": -543.9019165039062, "logps/rejected": -535.859130859375, "loss": 0.044, "rewards/accuracies": 1.0, "rewards/chosen": 1.785274863243103, "rewards/margins": 4.073578834533691, "rewards/rejected": -2.288304328918457, "step": 3374 }, { "epoch": 2.4657534246575343, "grad_norm": 11.045832432349096, "learning_rate": 1.9230579486294586e-07, "logits/chosen": -2.8784422874450684, "logits/rejected": -2.7306525707244873, "logps/chosen": -557.0484619140625, "logps/rejected": -689.2533569335938, "loss": 0.0362, "rewards/accuracies": 1.0, "rewards/chosen": 3.1152548789978027, "rewards/margins": 5.1352410316467285, "rewards/rejected": -2.0199859142303467, "step": 3375 }, { "epoch": 2.46648401826484, "grad_norm": 14.526746614004802, "learning_rate": 1.9215060806137168e-07, "logits/chosen": -2.4268476963043213, "logits/rejected": -1.6748276948928833, "logps/chosen": -643.2498168945312, "logps/rejected": -462.57476806640625, "loss": 0.0468, "rewards/accuracies": 1.0, "rewards/chosen": 4.103207588195801, "rewards/margins": 6.076104164123535, "rewards/rejected": -1.9728962182998657, "step": 3376 }, { "epoch": 2.467214611872146, "grad_norm": 8.748654943353927, "learning_rate": 1.919954448082537e-07, "logits/chosen": -2.6677684783935547, "logits/rejected": -2.208179473876953, "logps/chosen": -837.697509765625, "logps/rejected": -657.0626220703125, "loss": 0.0382, "rewards/accuracies": 1.0, "rewards/chosen": 3.77777099609375, "rewards/margins": 6.178459167480469, "rewards/rejected": -2.4006876945495605, "step": 3377 }, { "epoch": 2.467945205479452, "grad_norm": 9.989101192758042, "learning_rate": 1.9184030516675347e-07, "logits/chosen": -2.4348299503326416, "logits/rejected": -1.8650634288787842, "logps/chosen": -431.86761474609375, "logps/rejected": -451.3792419433594, "loss": 0.0594, "rewards/accuracies": 1.0, "rewards/chosen": 4.986344337463379, "rewards/margins": 7.796473503112793, "rewards/rejected": -2.8101296424865723, "step": 3378 }, { "epoch": 2.468675799086758, "grad_norm": 9.18935245860112, "learning_rate": 1.916851892000228e-07, "logits/chosen": -2.8373804092407227, "logits/rejected": -2.349923610687256, "logps/chosen": -821.0007934570312, "logps/rejected": -724.1884155273438, "loss": 0.0432, "rewards/accuracies": 1.0, "rewards/chosen": 4.0587239265441895, "rewards/margins": 6.1952643394470215, "rewards/rejected": -2.136540412902832, "step": 3379 }, { "epoch": 2.469406392694064, "grad_norm": 10.424439442152536, "learning_rate": 1.9153009697120398e-07, "logits/chosen": -3.2646236419677734, "logits/rejected": -2.3225436210632324, "logps/chosen": -603.7802124023438, "logps/rejected": -410.9042053222656, "loss": 0.045, "rewards/accuracies": 1.0, "rewards/chosen": 3.6469719409942627, "rewards/margins": 6.430877685546875, "rewards/rejected": -2.7839059829711914, "step": 3380 }, { "epoch": 2.47013698630137, "grad_norm": 22.71839805865639, "learning_rate": 1.913750285434296e-07, "logits/chosen": -3.1242117881774902, "logits/rejected": -2.2316322326660156, "logps/chosen": -767.0523071289062, "logps/rejected": -561.3587646484375, "loss": 0.082, "rewards/accuracies": 1.0, "rewards/chosen": 4.046143531799316, "rewards/margins": 6.063960552215576, "rewards/rejected": -2.0178167819976807, "step": 3381 }, { "epoch": 2.470867579908676, "grad_norm": 21.075835853735285, "learning_rate": 1.9121998397982268e-07, "logits/chosen": -2.65775728225708, "logits/rejected": -2.3470048904418945, "logps/chosen": -576.0496826171875, "logps/rejected": -461.455322265625, "loss": 0.0838, "rewards/accuracies": 1.0, "rewards/chosen": 2.954808235168457, "rewards/margins": 6.258347988128662, "rewards/rejected": -3.303539276123047, "step": 3382 }, { "epoch": 2.4715981735159818, "grad_norm": 14.561345787312906, "learning_rate": 1.9106496334349626e-07, "logits/chosen": -3.0394442081451416, "logits/rejected": -2.2424492835998535, "logps/chosen": -866.3480224609375, "logps/rejected": -702.4381103515625, "loss": 0.0759, "rewards/accuracies": 1.0, "rewards/chosen": 3.8119988441467285, "rewards/margins": 5.567434310913086, "rewards/rejected": -1.7554359436035156, "step": 3383 }, { "epoch": 2.4723287671232876, "grad_norm": 15.123169843431675, "learning_rate": 1.9090996669755388e-07, "logits/chosen": -2.563282012939453, "logits/rejected": -2.698395252227783, "logps/chosen": -910.9918212890625, "logps/rejected": -934.9752807617188, "loss": 0.0586, "rewards/accuracies": 1.0, "rewards/chosen": 5.042696952819824, "rewards/margins": 4.4220476150512695, "rewards/rejected": 0.6206496953964233, "step": 3384 }, { "epoch": 2.4730593607305935, "grad_norm": 11.135399233474622, "learning_rate": 1.9075499410508925e-07, "logits/chosen": -2.998556613922119, "logits/rejected": -2.7481813430786133, "logps/chosen": -881.2672119140625, "logps/rejected": -689.0128173828125, "loss": 0.0506, "rewards/accuracies": 1.0, "rewards/chosen": 3.373368740081787, "rewards/margins": 4.633630275726318, "rewards/rejected": -1.2602612972259521, "step": 3385 }, { "epoch": 2.4737899543378994, "grad_norm": 17.128303599004997, "learning_rate": 1.9060004562918621e-07, "logits/chosen": -2.6744961738586426, "logits/rejected": -1.4998304843902588, "logps/chosen": -371.1846618652344, "logps/rejected": -221.0310821533203, "loss": 0.0675, "rewards/accuracies": 1.0, "rewards/chosen": 1.6350209712982178, "rewards/margins": 3.4589531421661377, "rewards/rejected": -1.8239325284957886, "step": 3386 }, { "epoch": 2.4745205479452057, "grad_norm": 10.781298474189034, "learning_rate": 1.9044512133291897e-07, "logits/chosen": -3.1004960536956787, "logits/rejected": -1.5444564819335938, "logps/chosen": -669.6903076171875, "logps/rejected": -317.1629333496094, "loss": 0.0379, "rewards/accuracies": 1.0, "rewards/chosen": 6.252297878265381, "rewards/margins": 10.732290267944336, "rewards/rejected": -4.479991912841797, "step": 3387 }, { "epoch": 2.4752511415525116, "grad_norm": 21.68376510076865, "learning_rate": 1.902902212793516e-07, "logits/chosen": -2.9371602535247803, "logits/rejected": -1.86572265625, "logps/chosen": -692.696044921875, "logps/rejected": -517.7882690429688, "loss": 0.0737, "rewards/accuracies": 1.0, "rewards/chosen": 3.174560070037842, "rewards/margins": 4.915392875671387, "rewards/rejected": -1.740833044052124, "step": 3388 }, { "epoch": 2.4759817351598175, "grad_norm": 13.556414807989945, "learning_rate": 1.9013534553153857e-07, "logits/chosen": -3.161832809448242, "logits/rejected": -2.4689087867736816, "logps/chosen": -749.8828125, "logps/rejected": -569.9259033203125, "loss": 0.0446, "rewards/accuracies": 1.0, "rewards/chosen": 3.2446186542510986, "rewards/margins": 5.561423301696777, "rewards/rejected": -2.3168044090270996, "step": 3389 }, { "epoch": 2.4767123287671233, "grad_norm": 4.7789021630275155, "learning_rate": 1.8998049415252435e-07, "logits/chosen": -2.909651756286621, "logits/rejected": -2.164189100265503, "logps/chosen": -853.3787841796875, "logps/rejected": -722.8423461914062, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": 5.131772518157959, "rewards/margins": 6.13792610168457, "rewards/rejected": -1.0061534643173218, "step": 3390 }, { "epoch": 2.477442922374429, "grad_norm": 10.126170526354402, "learning_rate": 1.8982566720534358e-07, "logits/chosen": -2.563096523284912, "logits/rejected": -1.9122947454452515, "logps/chosen": -692.9202880859375, "logps/rejected": -572.5836791992188, "loss": 0.0465, "rewards/accuracies": 1.0, "rewards/chosen": 3.4301905632019043, "rewards/margins": 5.986514568328857, "rewards/rejected": -2.556324005126953, "step": 3391 }, { "epoch": 2.478173515981735, "grad_norm": 10.201491033742245, "learning_rate": 1.8967086475302064e-07, "logits/chosen": -2.7376513481140137, "logits/rejected": -2.380607843399048, "logps/chosen": -547.6119384765625, "logps/rejected": -561.9287109375, "loss": 0.0393, "rewards/accuracies": 1.0, "rewards/chosen": 3.189448356628418, "rewards/margins": 5.256408214569092, "rewards/rejected": -2.066960096359253, "step": 3392 }, { "epoch": 2.478904109589041, "grad_norm": 9.862568043521218, "learning_rate": 1.8951608685857034e-07, "logits/chosen": -2.6732826232910156, "logits/rejected": -2.1223092079162598, "logps/chosen": -585.68994140625, "logps/rejected": -573.3558959960938, "loss": 0.0366, "rewards/accuracies": 1.0, "rewards/chosen": 4.335958957672119, "rewards/margins": 7.63216495513916, "rewards/rejected": -3.29620623588562, "step": 3393 }, { "epoch": 2.4796347031963473, "grad_norm": 14.780998114925714, "learning_rate": 1.8936133358499734e-07, "logits/chosen": -2.778333902359009, "logits/rejected": -1.9808365106582642, "logps/chosen": -493.67144775390625, "logps/rejected": -412.8513488769531, "loss": 0.0668, "rewards/accuracies": 1.0, "rewards/chosen": 2.0914435386657715, "rewards/margins": 4.9404802322387695, "rewards/rejected": -2.849036455154419, "step": 3394 }, { "epoch": 2.480365296803653, "grad_norm": 11.20735324890329, "learning_rate": 1.8920660499529632e-07, "logits/chosen": -2.7569711208343506, "logits/rejected": -1.7204515933990479, "logps/chosen": -404.42816162109375, "logps/rejected": -277.5081481933594, "loss": 0.0739, "rewards/accuracies": 1.0, "rewards/chosen": 2.7796545028686523, "rewards/margins": 4.470302581787109, "rewards/rejected": -1.6906479597091675, "step": 3395 }, { "epoch": 2.481095890410959, "grad_norm": 15.3303328079956, "learning_rate": 1.8905190115245167e-07, "logits/chosen": -2.693742036819458, "logits/rejected": -2.16312837600708, "logps/chosen": -662.9535522460938, "logps/rejected": -655.2396240234375, "loss": 0.0625, "rewards/accuracies": 1.0, "rewards/chosen": 3.6084389686584473, "rewards/margins": 5.939501762390137, "rewards/rejected": -2.3310623168945312, "step": 3396 }, { "epoch": 2.481826484018265, "grad_norm": 12.690121534891322, "learning_rate": 1.8889722211943798e-07, "logits/chosen": -3.135532855987549, "logits/rejected": -2.132028579711914, "logps/chosen": -815.424072265625, "logps/rejected": -478.5511474609375, "loss": 0.0486, "rewards/accuracies": 1.0, "rewards/chosen": 4.953272819519043, "rewards/margins": 7.3466386795043945, "rewards/rejected": -2.3933656215667725, "step": 3397 }, { "epoch": 2.4825570776255708, "grad_norm": 21.434465466030478, "learning_rate": 1.8874256795921966e-07, "logits/chosen": -2.821228504180908, "logits/rejected": -2.539346694946289, "logps/chosen": -647.8846435546875, "logps/rejected": -627.9339599609375, "loss": 0.1071, "rewards/accuracies": 1.0, "rewards/chosen": 3.614166021347046, "rewards/margins": 5.5588274002075195, "rewards/rejected": -1.9446611404418945, "step": 3398 }, { "epoch": 2.4832876712328766, "grad_norm": 22.070694133868585, "learning_rate": 1.8858793873475098e-07, "logits/chosen": -2.8339920043945312, "logits/rejected": -2.4546260833740234, "logps/chosen": -670.17919921875, "logps/rejected": -618.270263671875, "loss": 0.0963, "rewards/accuracies": 1.0, "rewards/chosen": 3.8524558544158936, "rewards/margins": 4.312237739562988, "rewards/rejected": -0.45978206396102905, "step": 3399 }, { "epoch": 2.4840182648401825, "grad_norm": 10.83359687680239, "learning_rate": 1.8843333450897614e-07, "logits/chosen": -2.318817615509033, "logits/rejected": -1.8226991891860962, "logps/chosen": -461.61328125, "logps/rejected": -426.1322021484375, "loss": 0.0456, "rewards/accuracies": 1.0, "rewards/chosen": 3.2395694255828857, "rewards/margins": 7.074958801269531, "rewards/rejected": -3.8353891372680664, "step": 3400 }, { "epoch": 2.484748858447489, "grad_norm": 8.469744071471727, "learning_rate": 1.8827875534482897e-07, "logits/chosen": -2.7918756008148193, "logits/rejected": -2.219240188598633, "logps/chosen": -809.2689208984375, "logps/rejected": -657.1224365234375, "loss": 0.031, "rewards/accuracies": 1.0, "rewards/chosen": 4.595191478729248, "rewards/margins": 6.333327293395996, "rewards/rejected": -1.738135576248169, "step": 3401 }, { "epoch": 2.4854794520547947, "grad_norm": 8.297633134647917, "learning_rate": 1.8812420130523326e-07, "logits/chosen": -2.5701072216033936, "logits/rejected": -2.6056952476501465, "logps/chosen": -569.05029296875, "logps/rejected": -578.776123046875, "loss": 0.0562, "rewards/accuracies": 1.0, "rewards/chosen": 2.4448788166046143, "rewards/margins": 4.318998336791992, "rewards/rejected": -1.8741192817687988, "step": 3402 }, { "epoch": 2.4862100456621006, "grad_norm": 7.350553956208659, "learning_rate": 1.8796967245310258e-07, "logits/chosen": -3.0453286170959473, "logits/rejected": -2.6225192546844482, "logps/chosen": -697.1634521484375, "logps/rejected": -559.522705078125, "loss": 0.0288, "rewards/accuracies": 1.0, "rewards/chosen": 4.437443733215332, "rewards/margins": 6.278725624084473, "rewards/rejected": -1.8412824869155884, "step": 3403 }, { "epoch": 2.4869406392694065, "grad_norm": 9.615797920147516, "learning_rate": 1.878151688513402e-07, "logits/chosen": -3.106754779815674, "logits/rejected": -2.3587656021118164, "logps/chosen": -590.9935302734375, "logps/rejected": -575.1444091796875, "loss": 0.0336, "rewards/accuracies": 1.0, "rewards/chosen": 3.1674516201019287, "rewards/margins": 4.647610187530518, "rewards/rejected": -1.4801585674285889, "step": 3404 }, { "epoch": 2.4876712328767123, "grad_norm": 5.931353469444574, "learning_rate": 1.8766069056283906e-07, "logits/chosen": -2.806516170501709, "logits/rejected": -1.709092140197754, "logps/chosen": -492.60540771484375, "logps/rejected": -349.40130615234375, "loss": 0.0261, "rewards/accuracies": 1.0, "rewards/chosen": 4.2657246589660645, "rewards/margins": 7.140982627868652, "rewards/rejected": -2.875257968902588, "step": 3405 }, { "epoch": 2.488401826484018, "grad_norm": 8.895526627577121, "learning_rate": 1.8750623765048183e-07, "logits/chosen": -2.587453842163086, "logits/rejected": -2.5006983280181885, "logps/chosen": -381.8294982910156, "logps/rejected": -564.2147216796875, "loss": 0.044, "rewards/accuracies": 1.0, "rewards/chosen": 2.9088950157165527, "rewards/margins": 7.737758159637451, "rewards/rejected": -4.828863620758057, "step": 3406 }, { "epoch": 2.489132420091324, "grad_norm": 12.35500685786972, "learning_rate": 1.8735181017714092e-07, "logits/chosen": -2.9524989128112793, "logits/rejected": -2.138341188430786, "logps/chosen": -700.5811157226562, "logps/rejected": -589.2733154296875, "loss": 0.054, "rewards/accuracies": 1.0, "rewards/chosen": 3.135953426361084, "rewards/margins": 4.071036338806152, "rewards/rejected": -0.9350830316543579, "step": 3407 }, { "epoch": 2.48986301369863, "grad_norm": 13.987548195327212, "learning_rate": 1.8719740820567834e-07, "logits/chosen": -2.754636287689209, "logits/rejected": -2.280225992202759, "logps/chosen": -805.7042846679688, "logps/rejected": -572.2222290039062, "loss": 0.0607, "rewards/accuracies": 1.0, "rewards/chosen": 3.9226882457733154, "rewards/margins": 5.055559158325195, "rewards/rejected": -1.1328705549240112, "step": 3408 }, { "epoch": 2.4905936073059363, "grad_norm": 15.655697871845003, "learning_rate": 1.8704303179894572e-07, "logits/chosen": -3.162229299545288, "logits/rejected": -1.6734511852264404, "logps/chosen": -823.4478149414062, "logps/rejected": -398.0358581542969, "loss": 0.0652, "rewards/accuracies": 1.0, "rewards/chosen": 2.943126678466797, "rewards/margins": 4.488868713378906, "rewards/rejected": -1.5457419157028198, "step": 3409 }, { "epoch": 2.491324200913242, "grad_norm": 12.411459494444099, "learning_rate": 1.8688868101978416e-07, "logits/chosen": -2.207874298095703, "logits/rejected": -2.2158586978912354, "logps/chosen": -603.5831298828125, "logps/rejected": -686.77294921875, "loss": 0.0551, "rewards/accuracies": 1.0, "rewards/chosen": 3.67706298828125, "rewards/margins": 5.415987014770508, "rewards/rejected": -1.7389237880706787, "step": 3410 }, { "epoch": 2.492054794520548, "grad_norm": 12.586615801670725, "learning_rate": 1.867343559310246e-07, "logits/chosen": -2.533046245574951, "logits/rejected": -1.7636762857437134, "logps/chosen": -566.6288452148438, "logps/rejected": -367.6393737792969, "loss": 0.07, "rewards/accuracies": 1.0, "rewards/chosen": 2.938588857650757, "rewards/margins": 4.332283020019531, "rewards/rejected": -1.393694281578064, "step": 3411 }, { "epoch": 2.492785388127854, "grad_norm": 6.734751679756892, "learning_rate": 1.8658005659548723e-07, "logits/chosen": -2.228015899658203, "logits/rejected": -2.466862678527832, "logps/chosen": -515.0629272460938, "logps/rejected": -619.1942138671875, "loss": 0.026, "rewards/accuracies": 1.0, "rewards/chosen": 2.2738943099975586, "rewards/margins": 5.39672327041626, "rewards/rejected": -3.122828960418701, "step": 3412 }, { "epoch": 2.4935159817351598, "grad_norm": 8.725012714190907, "learning_rate": 1.8642578307598207e-07, "logits/chosen": -2.276423454284668, "logits/rejected": -1.8074989318847656, "logps/chosen": -537.0594482421875, "logps/rejected": -451.88336181640625, "loss": 0.0397, "rewards/accuracies": 1.0, "rewards/chosen": 2.1220948696136475, "rewards/margins": 4.148797512054443, "rewards/rejected": -2.026702642440796, "step": 3413 }, { "epoch": 2.4942465753424656, "grad_norm": 7.97456417028885, "learning_rate": 1.8627153543530825e-07, "logits/chosen": -2.8564980030059814, "logits/rejected": -2.508654832839966, "logps/chosen": -415.48614501953125, "logps/rejected": -318.4113464355469, "loss": 0.0379, "rewards/accuracies": 1.0, "rewards/chosen": 1.9240977764129639, "rewards/margins": 3.613226890563965, "rewards/rejected": -1.689129114151001, "step": 3414 }, { "epoch": 2.4949771689497715, "grad_norm": 9.996524588336431, "learning_rate": 1.861173137362546e-07, "logits/chosen": -2.9116897583007812, "logits/rejected": -2.455667018890381, "logps/chosen": -744.2420043945312, "logps/rejected": -515.0162353515625, "loss": 0.0533, "rewards/accuracies": 1.0, "rewards/chosen": 3.880762815475464, "rewards/margins": 6.240419387817383, "rewards/rejected": -2.359656810760498, "step": 3415 }, { "epoch": 2.4957077625570774, "grad_norm": 13.895583273006942, "learning_rate": 1.8596311804159947e-07, "logits/chosen": -2.671717882156372, "logits/rejected": -1.9045765399932861, "logps/chosen": -716.1234130859375, "logps/rejected": -460.42303466796875, "loss": 0.0586, "rewards/accuracies": 1.0, "rewards/chosen": 4.892319679260254, "rewards/margins": 6.310751914978027, "rewards/rejected": -1.4184317588806152, "step": 3416 }, { "epoch": 2.4964383561643837, "grad_norm": 12.336178700539346, "learning_rate": 1.8580894841411048e-07, "logits/chosen": -2.872648239135742, "logits/rejected": -1.7632839679718018, "logps/chosen": -647.5095825195312, "logps/rejected": -465.0411376953125, "loss": 0.0557, "rewards/accuracies": 1.0, "rewards/chosen": 4.844038486480713, "rewards/margins": 6.078518867492676, "rewards/rejected": -1.2344801425933838, "step": 3417 }, { "epoch": 2.4971689497716896, "grad_norm": 9.089583973108958, "learning_rate": 1.856548049165446e-07, "logits/chosen": -2.7515087127685547, "logits/rejected": -2.232602596282959, "logps/chosen": -559.3002319335938, "logps/rejected": -581.343994140625, "loss": 0.0395, "rewards/accuracies": 1.0, "rewards/chosen": 2.597014904022217, "rewards/margins": 4.686913013458252, "rewards/rejected": -2.089898109436035, "step": 3418 }, { "epoch": 2.4978995433789954, "grad_norm": 16.506911246398715, "learning_rate": 1.8550068761164828e-07, "logits/chosen": -2.897636890411377, "logits/rejected": -2.0654635429382324, "logps/chosen": -659.8239135742188, "logps/rejected": -479.8329162597656, "loss": 0.0712, "rewards/accuracies": 1.0, "rewards/chosen": 3.937831401824951, "rewards/margins": 6.865767478942871, "rewards/rejected": -2.927935838699341, "step": 3419 }, { "epoch": 2.4986301369863013, "grad_norm": 17.80650341085254, "learning_rate": 1.8534659656215728e-07, "logits/chosen": -2.4154059886932373, "logits/rejected": -1.9929097890853882, "logps/chosen": -609.20556640625, "logps/rejected": -587.63623046875, "loss": 0.0615, "rewards/accuracies": 1.0, "rewards/chosen": 3.6482253074645996, "rewards/margins": 4.656982421875, "rewards/rejected": -1.0087573528289795, "step": 3420 }, { "epoch": 2.499360730593607, "grad_norm": 8.056663588982424, "learning_rate": 1.8519253183079665e-07, "logits/chosen": -3.05220365524292, "logits/rejected": -2.398916721343994, "logps/chosen": -753.5860595703125, "logps/rejected": -564.9324951171875, "loss": 0.0286, "rewards/accuracies": 1.0, "rewards/chosen": 3.9557690620422363, "rewards/margins": 5.1970086097717285, "rewards/rejected": -1.2412395477294922, "step": 3421 }, { "epoch": 2.5000913242009135, "grad_norm": 8.729305973322216, "learning_rate": 1.850384934802807e-07, "logits/chosen": -2.882354974746704, "logits/rejected": -1.6925562620162964, "logps/chosen": -599.74072265625, "logps/rejected": -421.7725830078125, "loss": 0.0348, "rewards/accuracies": 1.0, "rewards/chosen": 4.027451992034912, "rewards/margins": 5.839161396026611, "rewards/rejected": -1.811708927154541, "step": 3422 }, { "epoch": 2.500821917808219, "grad_norm": 11.171837439840708, "learning_rate": 1.848844815733131e-07, "logits/chosen": -2.535876750946045, "logits/rejected": -2.1355202198028564, "logps/chosen": -433.07489013671875, "logps/rejected": -442.24176025390625, "loss": 0.0703, "rewards/accuracies": 1.0, "rewards/chosen": 2.679821014404297, "rewards/margins": 5.452868461608887, "rewards/rejected": -2.77304744720459, "step": 3423 }, { "epoch": 2.5015525114155253, "grad_norm": 14.139931138017987, "learning_rate": 1.847304961725866e-07, "logits/chosen": -2.454281806945801, "logits/rejected": -1.9854130744934082, "logps/chosen": -380.9344787597656, "logps/rejected": -328.18389892578125, "loss": 0.0989, "rewards/accuracies": 1.0, "rewards/chosen": 2.4564437866210938, "rewards/margins": 5.718296051025391, "rewards/rejected": -3.261852502822876, "step": 3424 }, { "epoch": 2.502283105022831, "grad_norm": 15.53828280157994, "learning_rate": 1.8457653734078329e-07, "logits/chosen": -2.9069724082946777, "logits/rejected": -1.8030884265899658, "logps/chosen": -1090.9788818359375, "logps/rejected": -746.712646484375, "loss": 0.0684, "rewards/accuracies": 1.0, "rewards/chosen": 4.263952732086182, "rewards/margins": 4.817920684814453, "rewards/rejected": -0.5539675951004028, "step": 3425 }, { "epoch": 2.503013698630137, "grad_norm": 9.412327624765192, "learning_rate": 1.8442260514057457e-07, "logits/chosen": -2.841906785964966, "logits/rejected": -1.8022350072860718, "logps/chosen": -620.0310668945312, "logps/rejected": -493.3789978027344, "loss": 0.0362, "rewards/accuracies": 1.0, "rewards/chosen": 3.0084571838378906, "rewards/margins": 6.319982051849365, "rewards/rejected": -3.3115243911743164, "step": 3426 }, { "epoch": 2.503744292237443, "grad_norm": 15.31227628583389, "learning_rate": 1.8426869963462044e-07, "logits/chosen": -2.9801366329193115, "logits/rejected": -2.670057535171509, "logps/chosen": -609.395263671875, "logps/rejected": -599.0262451171875, "loss": 0.0763, "rewards/accuracies": 1.0, "rewards/chosen": 3.5546658039093018, "rewards/margins": 4.960600852966309, "rewards/rejected": -1.4059354066848755, "step": 3427 }, { "epoch": 2.5044748858447488, "grad_norm": 8.806432921633064, "learning_rate": 1.8411482088557076e-07, "logits/chosen": -2.4672188758850098, "logits/rejected": -2.125589609146118, "logps/chosen": -887.0509033203125, "logps/rejected": -831.6821899414062, "loss": 0.0475, "rewards/accuracies": 1.0, "rewards/chosen": 3.7694904804229736, "rewards/margins": 5.633858680725098, "rewards/rejected": -1.864368200302124, "step": 3428 }, { "epoch": 2.5052054794520546, "grad_norm": 8.780060494112409, "learning_rate": 1.8396096895606407e-07, "logits/chosen": -2.4212417602539062, "logits/rejected": -2.5331451892852783, "logps/chosen": -665.8247680664062, "logps/rejected": -806.6358032226562, "loss": 0.0315, "rewards/accuracies": 1.0, "rewards/chosen": 3.2758686542510986, "rewards/margins": 4.946167469024658, "rewards/rejected": -1.6702989339828491, "step": 3429 }, { "epoch": 2.5059360730593605, "grad_norm": 13.317688982843457, "learning_rate": 1.8380714390872814e-07, "logits/chosen": -3.0043094158172607, "logits/rejected": -1.8468947410583496, "logps/chosen": -411.8523864746094, "logps/rejected": -266.2368469238281, "loss": 0.0542, "rewards/accuracies": 1.0, "rewards/chosen": 3.53064227104187, "rewards/margins": 6.849156379699707, "rewards/rejected": -3.318513870239258, "step": 3430 }, { "epoch": 2.506666666666667, "grad_norm": 11.150875902198466, "learning_rate": 1.8365334580617964e-07, "logits/chosen": -2.785515308380127, "logits/rejected": -2.101693630218506, "logps/chosen": -702.6072387695312, "logps/rejected": -679.919189453125, "loss": 0.0568, "rewards/accuracies": 1.0, "rewards/chosen": 2.5269956588745117, "rewards/margins": 3.45276141166687, "rewards/rejected": -0.9257656335830688, "step": 3431 }, { "epoch": 2.5073972602739727, "grad_norm": 7.192042234065512, "learning_rate": 1.834995747110244e-07, "logits/chosen": -2.666604995727539, "logits/rejected": -1.9125018119812012, "logps/chosen": -813.9490966796875, "logps/rejected": -584.4390258789062, "loss": 0.0269, "rewards/accuracies": 1.0, "rewards/chosen": 3.493557929992676, "rewards/margins": 4.967019081115723, "rewards/rejected": -1.4734609127044678, "step": 3432 }, { "epoch": 2.5081278538812786, "grad_norm": 8.770440341242672, "learning_rate": 1.833458306858573e-07, "logits/chosen": -3.1281378269195557, "logits/rejected": -2.2257742881774902, "logps/chosen": -787.5401611328125, "logps/rejected": -589.455078125, "loss": 0.0363, "rewards/accuracies": 1.0, "rewards/chosen": 4.725954532623291, "rewards/margins": 4.669655799865723, "rewards/rejected": 0.056298792362213135, "step": 3433 }, { "epoch": 2.5088584474885844, "grad_norm": 25.45601196027915, "learning_rate": 1.8319211379326205e-07, "logits/chosen": -2.8860321044921875, "logits/rejected": -2.346477508544922, "logps/chosen": -735.8630981445312, "logps/rejected": -563.452880859375, "loss": 0.1074, "rewards/accuracies": 1.0, "rewards/chosen": 4.050145149230957, "rewards/margins": 5.350283145904541, "rewards/rejected": -1.3001381158828735, "step": 3434 }, { "epoch": 2.5095890410958903, "grad_norm": 10.546507516746065, "learning_rate": 1.8303842409581153e-07, "logits/chosen": -2.479726791381836, "logits/rejected": -1.9208076000213623, "logps/chosen": -825.7944946289062, "logps/rejected": -624.1937866210938, "loss": 0.0502, "rewards/accuracies": 1.0, "rewards/chosen": 3.9668970108032227, "rewards/margins": 4.306258201599121, "rewards/rejected": -0.339360773563385, "step": 3435 }, { "epoch": 2.510319634703196, "grad_norm": 8.033678214662215, "learning_rate": 1.8288476165606724e-07, "logits/chosen": -3.3367655277252197, "logits/rejected": -1.4894120693206787, "logps/chosen": -796.9521484375, "logps/rejected": -332.4413757324219, "loss": 0.0323, "rewards/accuracies": 1.0, "rewards/chosen": 5.00572395324707, "rewards/margins": 7.222512245178223, "rewards/rejected": -2.2167880535125732, "step": 3436 }, { "epoch": 2.511050228310502, "grad_norm": 10.163139189903845, "learning_rate": 1.8273112653657992e-07, "logits/chosen": -2.8930718898773193, "logits/rejected": -2.10978102684021, "logps/chosen": -687.4769897460938, "logps/rejected": -603.0236206054688, "loss": 0.0306, "rewards/accuracies": 1.0, "rewards/chosen": 4.940960884094238, "rewards/margins": 8.771171569824219, "rewards/rejected": -3.8302109241485596, "step": 3437 }, { "epoch": 2.5117808219178084, "grad_norm": 8.719262330237353, "learning_rate": 1.8257751879988893e-07, "logits/chosen": -2.628784418106079, "logits/rejected": -1.8568694591522217, "logps/chosen": -502.851806640625, "logps/rejected": -360.595703125, "loss": 0.0245, "rewards/accuracies": 1.0, "rewards/chosen": 3.5993950366973877, "rewards/margins": 6.116275787353516, "rewards/rejected": -2.516880989074707, "step": 3438 }, { "epoch": 2.5125114155251143, "grad_norm": 10.697976849816007, "learning_rate": 1.824239385085227e-07, "logits/chosen": -2.925809383392334, "logits/rejected": -2.208812713623047, "logps/chosen": -807.0286254882812, "logps/rejected": -623.0010375976562, "loss": 0.0425, "rewards/accuracies": 1.0, "rewards/chosen": 4.52677583694458, "rewards/margins": 4.795022010803223, "rewards/rejected": -0.26824575662612915, "step": 3439 }, { "epoch": 2.51324200913242, "grad_norm": 12.883305805632547, "learning_rate": 1.8227038572499826e-07, "logits/chosen": -2.874917984008789, "logits/rejected": -2.9957330226898193, "logps/chosen": -540.7440795898438, "logps/rejected": -652.8417358398438, "loss": 0.0503, "rewards/accuracies": 1.0, "rewards/chosen": 2.9026505947113037, "rewards/margins": 7.219030380249023, "rewards/rejected": -4.316380500793457, "step": 3440 }, { "epoch": 2.513972602739726, "grad_norm": 8.931964573927862, "learning_rate": 1.8211686051182157e-07, "logits/chosen": -2.5601277351379395, "logits/rejected": -2.2769484519958496, "logps/chosen": -567.7681274414062, "logps/rejected": -604.69482421875, "loss": 0.032, "rewards/accuracies": 1.0, "rewards/chosen": 3.3945834636688232, "rewards/margins": 6.323652744293213, "rewards/rejected": -2.9290690422058105, "step": 3441 }, { "epoch": 2.514703196347032, "grad_norm": 12.305016116643626, "learning_rate": 1.8196336293148736e-07, "logits/chosen": -2.8453218936920166, "logits/rejected": -2.1955785751342773, "logps/chosen": -795.0675048828125, "logps/rejected": -521.2341918945312, "loss": 0.0667, "rewards/accuracies": 1.0, "rewards/chosen": 3.6270244121551514, "rewards/margins": 3.740048408508301, "rewards/rejected": -0.11302384734153748, "step": 3442 }, { "epoch": 2.5154337899543378, "grad_norm": 8.073620601029937, "learning_rate": 1.8180989304647926e-07, "logits/chosen": -3.195462226867676, "logits/rejected": -2.027113676071167, "logps/chosen": -922.8898315429688, "logps/rejected": -532.6570434570312, "loss": 0.0316, "rewards/accuracies": 1.0, "rewards/chosen": 4.619185924530029, "rewards/margins": 5.472489833831787, "rewards/rejected": -0.8533041477203369, "step": 3443 }, { "epoch": 2.5161643835616436, "grad_norm": 10.332441435028837, "learning_rate": 1.8165645091926923e-07, "logits/chosen": -2.801314115524292, "logits/rejected": -2.4248416423797607, "logps/chosen": -922.1305541992188, "logps/rejected": -642.526611328125, "loss": 0.0508, "rewards/accuracies": 1.0, "rewards/chosen": 3.7101497650146484, "rewards/margins": 4.292648792266846, "rewards/rejected": -0.5824990272521973, "step": 3444 }, { "epoch": 2.51689497716895, "grad_norm": 22.861259961586764, "learning_rate": 1.8150303661231824e-07, "logits/chosen": -2.6499180793762207, "logits/rejected": -2.2760848999023438, "logps/chosen": -670.8050537109375, "logps/rejected": -565.2481689453125, "loss": 0.0972, "rewards/accuracies": 1.0, "rewards/chosen": 4.955172538757324, "rewards/margins": 6.210674285888672, "rewards/rejected": -1.2555017471313477, "step": 3445 }, { "epoch": 2.517625570776256, "grad_norm": 8.212026420420694, "learning_rate": 1.8134965018807596e-07, "logits/chosen": -2.4961671829223633, "logits/rejected": -2.1272354125976562, "logps/chosen": -391.2232360839844, "logps/rejected": -467.487548828125, "loss": 0.0344, "rewards/accuracies": 1.0, "rewards/chosen": 2.5681941509246826, "rewards/margins": 6.946516036987305, "rewards/rejected": -4.378321647644043, "step": 3446 }, { "epoch": 2.5183561643835617, "grad_norm": 9.968353167919519, "learning_rate": 1.8119629170898055e-07, "logits/chosen": -2.7093663215637207, "logits/rejected": -2.4156813621520996, "logps/chosen": -243.2164306640625, "logps/rejected": -356.13409423828125, "loss": 0.0474, "rewards/accuracies": 1.0, "rewards/chosen": 2.0587778091430664, "rewards/margins": 7.7625017166137695, "rewards/rejected": -5.703724384307861, "step": 3447 }, { "epoch": 2.5190867579908676, "grad_norm": 22.163924810111617, "learning_rate": 1.81042961237459e-07, "logits/chosen": -2.5485422611236572, "logits/rejected": -2.139007806777954, "logps/chosen": -592.0597534179688, "logps/rejected": -574.8701782226562, "loss": 0.0498, "rewards/accuracies": 1.0, "rewards/chosen": 3.6754863262176514, "rewards/margins": 6.108554363250732, "rewards/rejected": -2.433068037033081, "step": 3448 }, { "epoch": 2.5198173515981734, "grad_norm": 10.902020090479684, "learning_rate": 1.808896588359265e-07, "logits/chosen": -2.8592281341552734, "logits/rejected": -2.234053134918213, "logps/chosen": -619.7305908203125, "logps/rejected": -478.72906494140625, "loss": 0.0483, "rewards/accuracies": 1.0, "rewards/chosen": 3.0622410774230957, "rewards/margins": 3.642488718032837, "rewards/rejected": -0.580247700214386, "step": 3449 }, { "epoch": 2.5205479452054793, "grad_norm": 11.505375144973002, "learning_rate": 1.8073638456678723e-07, "logits/chosen": -2.4728798866271973, "logits/rejected": -2.8587801456451416, "logps/chosen": -723.5225830078125, "logps/rejected": -977.940185546875, "loss": 0.0445, "rewards/accuracies": 1.0, "rewards/chosen": 3.2013559341430664, "rewards/margins": 4.942010879516602, "rewards/rejected": -1.740654706954956, "step": 3450 }, { "epoch": 2.521278538812785, "grad_norm": 6.033147167614407, "learning_rate": 1.8058313849243374e-07, "logits/chosen": -3.0191547870635986, "logits/rejected": -1.726535677909851, "logps/chosen": -532.22021484375, "logps/rejected": -263.9996643066406, "loss": 0.0305, "rewards/accuracies": 1.0, "rewards/chosen": 2.8061587810516357, "rewards/margins": 4.85075044631958, "rewards/rejected": -2.0445919036865234, "step": 3451 }, { "epoch": 2.5220091324200915, "grad_norm": 6.036922337162508, "learning_rate": 1.804299206752472e-07, "logits/chosen": -3.0925662517547607, "logits/rejected": -2.0817794799804688, "logps/chosen": -558.1583862304688, "logps/rejected": -367.24444580078125, "loss": 0.0212, "rewards/accuracies": 1.0, "rewards/chosen": 1.6066845655441284, "rewards/margins": 4.338236331939697, "rewards/rejected": -2.7315516471862793, "step": 3452 }, { "epoch": 2.5227397260273974, "grad_norm": 10.520355113022639, "learning_rate": 1.8027673117759705e-07, "logits/chosen": -2.3115522861480713, "logits/rejected": -2.590052366256714, "logps/chosen": -987.2274169921875, "logps/rejected": -597.1812133789062, "loss": 0.0575, "rewards/accuracies": 1.0, "rewards/chosen": 1.6137232780456543, "rewards/margins": 2.8985843658447266, "rewards/rejected": -1.2848612070083618, "step": 3453 }, { "epoch": 2.5234703196347033, "grad_norm": 23.460867257637105, "learning_rate": 1.8012357006184149e-07, "logits/chosen": -3.2167649269104004, "logits/rejected": -2.5528564453125, "logps/chosen": -960.132568359375, "logps/rejected": -633.4893798828125, "loss": 0.101, "rewards/accuracies": 0.875, "rewards/chosen": 4.711550712585449, "rewards/margins": 3.2908377647399902, "rewards/rejected": 1.4207128286361694, "step": 3454 }, { "epoch": 2.524200913242009, "grad_norm": 16.625995806936366, "learning_rate": 1.7997043739032693e-07, "logits/chosen": -3.3174960613250732, "logits/rejected": -3.0487051010131836, "logps/chosen": -793.9564819335938, "logps/rejected": -710.3465576171875, "loss": 0.0735, "rewards/accuracies": 0.875, "rewards/chosen": 4.7310099601745605, "rewards/margins": 4.675551891326904, "rewards/rejected": 0.055457860231399536, "step": 3455 }, { "epoch": 2.524931506849315, "grad_norm": 11.695281187332832, "learning_rate": 1.7981733322538844e-07, "logits/chosen": -3.2179903984069824, "logits/rejected": -2.845376491546631, "logps/chosen": -625.8397216796875, "logps/rejected": -539.934814453125, "loss": 0.0622, "rewards/accuracies": 1.0, "rewards/chosen": 2.8271143436431885, "rewards/margins": 3.6471500396728516, "rewards/rejected": -0.8200356364250183, "step": 3456 }, { "epoch": 2.525662100456621, "grad_norm": 12.159428433487388, "learning_rate": 1.7966425762934923e-07, "logits/chosen": -3.197230339050293, "logits/rejected": -1.8531126976013184, "logps/chosen": -678.801025390625, "logps/rejected": -470.09893798828125, "loss": 0.0485, "rewards/accuracies": 1.0, "rewards/chosen": 3.726682186126709, "rewards/margins": 5.490827560424805, "rewards/rejected": -1.764145016670227, "step": 3457 }, { "epoch": 2.5263926940639267, "grad_norm": 14.360492318302441, "learning_rate": 1.7951121066452103e-07, "logits/chosen": -2.824801445007324, "logits/rejected": -2.3796916007995605, "logps/chosen": -621.114013671875, "logps/rejected": -734.8836669921875, "loss": 0.0604, "rewards/accuracies": 1.0, "rewards/chosen": 2.1692512035369873, "rewards/margins": 3.493886947631836, "rewards/rejected": -1.3246357440948486, "step": 3458 }, { "epoch": 2.527123287671233, "grad_norm": 13.765230378849912, "learning_rate": 1.7935819239320386e-07, "logits/chosen": -2.5698423385620117, "logits/rejected": -2.4019510746002197, "logps/chosen": -475.8634033203125, "logps/rejected": -527.2655029296875, "loss": 0.0685, "rewards/accuracies": 1.0, "rewards/chosen": 0.4422546327114105, "rewards/margins": 3.858819007873535, "rewards/rejected": -3.4165639877319336, "step": 3459 }, { "epoch": 2.527853881278539, "grad_norm": 12.376776057248877, "learning_rate": 1.7920520287768613e-07, "logits/chosen": -2.764007091522217, "logits/rejected": -1.8883373737335205, "logps/chosen": -545.3892822265625, "logps/rejected": -402.34820556640625, "loss": 0.051, "rewards/accuracies": 1.0, "rewards/chosen": 2.6073946952819824, "rewards/margins": 5.155332565307617, "rewards/rejected": -2.5479373931884766, "step": 3460 }, { "epoch": 2.528584474885845, "grad_norm": 16.893782949468424, "learning_rate": 1.790522421802446e-07, "logits/chosen": -2.9521288871765137, "logits/rejected": -2.2334275245666504, "logps/chosen": -669.2993774414062, "logps/rejected": -546.5267333984375, "loss": 0.0547, "rewards/accuracies": 1.0, "rewards/chosen": 5.765017032623291, "rewards/margins": 7.466748237609863, "rewards/rejected": -1.7017308473587036, "step": 3461 }, { "epoch": 2.5293150684931507, "grad_norm": 12.026198082004852, "learning_rate": 1.7889931036314391e-07, "logits/chosen": -2.670508623123169, "logits/rejected": -2.345933198928833, "logps/chosen": -783.34228515625, "logps/rejected": -605.9234008789062, "loss": 0.0705, "rewards/accuracies": 1.0, "rewards/chosen": 3.9556336402893066, "rewards/margins": 4.758051872253418, "rewards/rejected": -0.8024182319641113, "step": 3462 }, { "epoch": 2.5300456621004566, "grad_norm": 28.3838780164022, "learning_rate": 1.7874640748863745e-07, "logits/chosen": -1.733340859413147, "logits/rejected": -2.4205989837646484, "logps/chosen": -358.42486572265625, "logps/rejected": -793.6946411132812, "loss": 0.0767, "rewards/accuracies": 0.875, "rewards/chosen": 1.1548746824264526, "rewards/margins": 3.584867477416992, "rewards/rejected": -2.429993152618408, "step": 3463 }, { "epoch": 2.5307762557077624, "grad_norm": 9.641825052563352, "learning_rate": 1.7859353361896662e-07, "logits/chosen": -2.322716236114502, "logits/rejected": -2.3104934692382812, "logps/chosen": -342.6723327636719, "logps/rejected": -417.66900634765625, "loss": 0.0405, "rewards/accuracies": 1.0, "rewards/chosen": 1.4323641061782837, "rewards/margins": 4.58823299407959, "rewards/rejected": -3.155869245529175, "step": 3464 }, { "epoch": 2.5315068493150683, "grad_norm": 8.132564368663136, "learning_rate": 1.7844068881636105e-07, "logits/chosen": -2.428171157836914, "logits/rejected": -1.431719422340393, "logps/chosen": -641.2869262695312, "logps/rejected": -430.7265625, "loss": 0.0369, "rewards/accuracies": 1.0, "rewards/chosen": 3.1398825645446777, "rewards/margins": 7.486751079559326, "rewards/rejected": -4.346868991851807, "step": 3465 }, { "epoch": 2.5322374429223746, "grad_norm": 16.785263064734604, "learning_rate": 1.7828787314303825e-07, "logits/chosen": -2.576263904571533, "logits/rejected": -2.3935248851776123, "logps/chosen": -449.5871276855469, "logps/rejected": -602.2972412109375, "loss": 0.1022, "rewards/accuracies": 0.875, "rewards/chosen": 2.53788685798645, "rewards/margins": 5.853270053863525, "rewards/rejected": -3.3153834342956543, "step": 3466 }, { "epoch": 2.5329680365296805, "grad_norm": 8.127503933584387, "learning_rate": 1.7813508666120432e-07, "logits/chosen": -2.415243625640869, "logits/rejected": -2.4597291946411133, "logps/chosen": -392.81683349609375, "logps/rejected": -680.622802734375, "loss": 0.0467, "rewards/accuracies": 1.0, "rewards/chosen": 2.5055527687072754, "rewards/margins": 3.40191912651062, "rewards/rejected": -0.8963662981987, "step": 3467 }, { "epoch": 2.5336986301369864, "grad_norm": 20.692402883893767, "learning_rate": 1.7798232943305313e-07, "logits/chosen": -2.539677858352661, "logits/rejected": -2.6090586185455322, "logps/chosen": -746.222412109375, "logps/rejected": -884.7611083984375, "loss": 0.1108, "rewards/accuracies": 1.0, "rewards/chosen": 4.4166154861450195, "rewards/margins": 6.310732841491699, "rewards/rejected": -1.8941173553466797, "step": 3468 }, { "epoch": 2.5344292237442922, "grad_norm": 10.439721150967632, "learning_rate": 1.7782960152076698e-07, "logits/chosen": -2.9044737815856934, "logits/rejected": -2.55582857131958, "logps/chosen": -408.23565673828125, "logps/rejected": -476.89166259765625, "loss": 0.0357, "rewards/accuracies": 1.0, "rewards/chosen": 3.0010998249053955, "rewards/margins": 6.781717300415039, "rewards/rejected": -3.7806177139282227, "step": 3469 }, { "epoch": 2.535159817351598, "grad_norm": 9.928212174034973, "learning_rate": 1.776769029865159e-07, "logits/chosen": -2.554445743560791, "logits/rejected": -2.1113810539245605, "logps/chosen": -612.9232788085938, "logps/rejected": -550.2577514648438, "loss": 0.0617, "rewards/accuracies": 1.0, "rewards/chosen": 2.646770715713501, "rewards/margins": 3.9094622135162354, "rewards/rejected": -1.2626912593841553, "step": 3470 }, { "epoch": 2.535890410958904, "grad_norm": 13.09952477910846, "learning_rate": 1.7752423389245807e-07, "logits/chosen": -2.8506312370300293, "logits/rejected": -2.370408296585083, "logps/chosen": -587.60205078125, "logps/rejected": -423.73370361328125, "loss": 0.0613, "rewards/accuracies": 1.0, "rewards/chosen": 2.653144598007202, "rewards/margins": 4.5334038734436035, "rewards/rejected": -1.88025963306427, "step": 3471 }, { "epoch": 2.53662100456621, "grad_norm": 10.070591676332915, "learning_rate": 1.773715943007398e-07, "logits/chosen": -2.7035062313079834, "logits/rejected": -2.0366623401641846, "logps/chosen": -483.7989196777344, "logps/rejected": -488.15966796875, "loss": 0.058, "rewards/accuracies": 1.0, "rewards/chosen": 3.3354601860046387, "rewards/margins": 7.3353495597839355, "rewards/rejected": -3.999889612197876, "step": 3472 }, { "epoch": 2.537351598173516, "grad_norm": 17.784643617008026, "learning_rate": 1.772189842734953e-07, "logits/chosen": -2.392457962036133, "logits/rejected": -2.136786699295044, "logps/chosen": -298.6542663574219, "logps/rejected": -426.6314392089844, "loss": 0.0661, "rewards/accuracies": 1.0, "rewards/chosen": 2.998889684677124, "rewards/margins": 6.2084174156188965, "rewards/rejected": -3.2095274925231934, "step": 3473 }, { "epoch": 2.538082191780822, "grad_norm": 16.263271135817337, "learning_rate": 1.7706640387284676e-07, "logits/chosen": -2.9521360397338867, "logits/rejected": -2.120840549468994, "logps/chosen": -687.30126953125, "logps/rejected": -493.18377685546875, "loss": 0.0728, "rewards/accuracies": 1.0, "rewards/chosen": 3.337859630584717, "rewards/margins": 5.569231986999512, "rewards/rejected": -2.231372356414795, "step": 3474 }, { "epoch": 2.538812785388128, "grad_norm": 18.715644795395693, "learning_rate": 1.7691385316090425e-07, "logits/chosen": -2.3863518238067627, "logits/rejected": -2.0673980712890625, "logps/chosen": -400.1367492675781, "logps/rejected": -308.1884765625, "loss": 0.0711, "rewards/accuracies": 1.0, "rewards/chosen": 1.9965441226959229, "rewards/margins": 5.931415557861328, "rewards/rejected": -3.9348714351654053, "step": 3475 }, { "epoch": 2.539543378995434, "grad_norm": 15.232016043325999, "learning_rate": 1.7676133219976586e-07, "logits/chosen": -3.0728347301483154, "logits/rejected": -2.036339044570923, "logps/chosen": -718.9812622070312, "logps/rejected": -431.0638427734375, "loss": 0.0911, "rewards/accuracies": 1.0, "rewards/chosen": 3.9886956214904785, "rewards/margins": 6.326543807983398, "rewards/rejected": -2.33784818649292, "step": 3476 }, { "epoch": 2.5402739726027397, "grad_norm": 11.678644915841584, "learning_rate": 1.7660884105151747e-07, "logits/chosen": -2.723125457763672, "logits/rejected": -2.174760580062866, "logps/chosen": -590.38671875, "logps/rejected": -368.5430908203125, "loss": 0.046, "rewards/accuracies": 1.0, "rewards/chosen": 2.0801165103912354, "rewards/margins": 3.6085805892944336, "rewards/rejected": -1.5284640789031982, "step": 3477 }, { "epoch": 2.5410045662100456, "grad_norm": 6.661187717964781, "learning_rate": 1.76456379778233e-07, "logits/chosen": -2.7646515369415283, "logits/rejected": -1.9759584665298462, "logps/chosen": -265.501220703125, "logps/rejected": -305.2681884765625, "loss": 0.0498, "rewards/accuracies": 0.875, "rewards/chosen": 3.086306095123291, "rewards/margins": 7.499235153198242, "rewards/rejected": -4.412929534912109, "step": 3478 }, { "epoch": 2.5417351598173514, "grad_norm": 14.45824550341809, "learning_rate": 1.763039484419739e-07, "logits/chosen": -2.8257973194122314, "logits/rejected": -1.9549753665924072, "logps/chosen": -747.67333984375, "logps/rejected": -476.73272705078125, "loss": 0.061, "rewards/accuracies": 1.0, "rewards/chosen": 3.799281358718872, "rewards/margins": 5.406718730926514, "rewards/rejected": -1.6074376106262207, "step": 3479 }, { "epoch": 2.5424657534246577, "grad_norm": 16.586520759292394, "learning_rate": 1.761515471047896e-07, "logits/chosen": -2.1421899795532227, "logits/rejected": -2.136091947555542, "logps/chosen": -440.0922546386719, "logps/rejected": -525.7221069335938, "loss": 0.0759, "rewards/accuracies": 1.0, "rewards/chosen": 2.3057358264923096, "rewards/margins": 5.040771484375, "rewards/rejected": -2.7350356578826904, "step": 3480 }, { "epoch": 2.5431963470319636, "grad_norm": 14.853929486550225, "learning_rate": 1.7599917582871751e-07, "logits/chosen": -3.1903390884399414, "logits/rejected": -2.7278552055358887, "logps/chosen": -948.9169921875, "logps/rejected": -872.8508911132812, "loss": 0.0656, "rewards/accuracies": 1.0, "rewards/chosen": 5.015592098236084, "rewards/margins": 5.940459728240967, "rewards/rejected": -0.9248672723770142, "step": 3481 }, { "epoch": 2.5439269406392695, "grad_norm": 15.045077936560764, "learning_rate": 1.7584683467578266e-07, "logits/chosen": -3.000241994857788, "logits/rejected": -2.873767375946045, "logps/chosen": -713.0873413085938, "logps/rejected": -790.0151977539062, "loss": 0.0511, "rewards/accuracies": 1.0, "rewards/chosen": 3.211069107055664, "rewards/margins": 3.725355625152588, "rewards/rejected": -0.5142863988876343, "step": 3482 }, { "epoch": 2.5446575342465754, "grad_norm": 15.025645394832463, "learning_rate": 1.7569452370799752e-07, "logits/chosen": -2.923145294189453, "logits/rejected": -1.738608717918396, "logps/chosen": -573.7147827148438, "logps/rejected": -371.41943359375, "loss": 0.0759, "rewards/accuracies": 1.0, "rewards/chosen": 3.9913110733032227, "rewards/margins": 4.639337062835693, "rewards/rejected": -0.648026168346405, "step": 3483 }, { "epoch": 2.5453881278538812, "grad_norm": 14.768461409117002, "learning_rate": 1.7554224298736275e-07, "logits/chosen": -2.119694709777832, "logits/rejected": -1.8066004514694214, "logps/chosen": -420.68341064453125, "logps/rejected": -405.85931396484375, "loss": 0.1396, "rewards/accuracies": 0.875, "rewards/chosen": 1.629608154296875, "rewards/margins": 4.423701763153076, "rewards/rejected": -2.794093608856201, "step": 3484 }, { "epoch": 2.546118721461187, "grad_norm": 15.161559757726424, "learning_rate": 1.753899925758664e-07, "logits/chosen": -1.9944875240325928, "logits/rejected": -2.3433754444122314, "logps/chosen": -523.580810546875, "logps/rejected": -714.1953125, "loss": 0.0389, "rewards/accuracies": 1.0, "rewards/chosen": 1.6392043828964233, "rewards/margins": 4.047261714935303, "rewards/rejected": -2.408057451248169, "step": 3485 }, { "epoch": 2.546849315068493, "grad_norm": 7.981204603479817, "learning_rate": 1.7523777253548427e-07, "logits/chosen": -2.519200086593628, "logits/rejected": -2.4671332836151123, "logps/chosen": -619.6414184570312, "logps/rejected": -590.0610961914062, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": 1.8600884675979614, "rewards/margins": 4.7681379318237305, "rewards/rejected": -2.9080495834350586, "step": 3486 }, { "epoch": 2.5475799086757993, "grad_norm": 14.328778866623413, "learning_rate": 1.7508558292817987e-07, "logits/chosen": -2.8620779514312744, "logits/rejected": -2.3549306392669678, "logps/chosen": -412.76214599609375, "logps/rejected": -411.04632568359375, "loss": 0.0382, "rewards/accuracies": 1.0, "rewards/chosen": 3.4644007682800293, "rewards/margins": 6.616235256195068, "rewards/rejected": -3.151834487915039, "step": 3487 }, { "epoch": 2.548310502283105, "grad_norm": 9.874764355129491, "learning_rate": 1.7493342381590414e-07, "logits/chosen": -2.0370867252349854, "logits/rejected": -2.4554543495178223, "logps/chosen": -377.3590087890625, "logps/rejected": -687.7628173828125, "loss": 0.0426, "rewards/accuracies": 1.0, "rewards/chosen": 2.0521039962768555, "rewards/margins": 7.292336463928223, "rewards/rejected": -5.240232944488525, "step": 3488 }, { "epoch": 2.549041095890411, "grad_norm": 10.898566657500806, "learning_rate": 1.7478129526059574e-07, "logits/chosen": -2.3971400260925293, "logits/rejected": -2.0229392051696777, "logps/chosen": -513.548095703125, "logps/rejected": -480.7998962402344, "loss": 0.059, "rewards/accuracies": 1.0, "rewards/chosen": 3.7567319869995117, "rewards/margins": 5.65816068649292, "rewards/rejected": -1.9014285802841187, "step": 3489 }, { "epoch": 2.549771689497717, "grad_norm": 18.067496166447896, "learning_rate": 1.7462919732418092e-07, "logits/chosen": -2.3030824661254883, "logits/rejected": -1.9394323825836182, "logps/chosen": -361.0799560546875, "logps/rejected": -295.0603942871094, "loss": 0.0888, "rewards/accuracies": 1.0, "rewards/chosen": 2.292178153991699, "rewards/margins": 4.266096591949463, "rewards/rejected": -1.9739181995391846, "step": 3490 }, { "epoch": 2.550502283105023, "grad_norm": 16.295418049686926, "learning_rate": 1.7447713006857338e-07, "logits/chosen": -2.856084108352661, "logits/rejected": -2.737252950668335, "logps/chosen": -340.823974609375, "logps/rejected": -463.27252197265625, "loss": 0.0948, "rewards/accuracies": 1.0, "rewards/chosen": 3.010181188583374, "rewards/margins": 6.611560821533203, "rewards/rejected": -3.601379871368408, "step": 3491 }, { "epoch": 2.5512328767123287, "grad_norm": 16.733806924589906, "learning_rate": 1.743250935556743e-07, "logits/chosen": -2.637357711791992, "logits/rejected": -2.051501512527466, "logps/chosen": -709.2547607421875, "logps/rejected": -571.5702514648438, "loss": 0.068, "rewards/accuracies": 1.0, "rewards/chosen": 4.10366678237915, "rewards/margins": 6.143092632293701, "rewards/rejected": -2.039425849914551, "step": 3492 }, { "epoch": 2.5519634703196346, "grad_norm": 14.049311306728148, "learning_rate": 1.7417308784737245e-07, "logits/chosen": -3.125462055206299, "logits/rejected": -2.0509161949157715, "logps/chosen": -865.8088989257812, "logps/rejected": -560.699951171875, "loss": 0.0589, "rewards/accuracies": 1.0, "rewards/chosen": 5.108351707458496, "rewards/margins": 6.112243175506592, "rewards/rejected": -1.0038914680480957, "step": 3493 }, { "epoch": 2.552694063926941, "grad_norm": 9.038399476682676, "learning_rate": 1.7402111300554408e-07, "logits/chosen": -2.444911479949951, "logits/rejected": -2.247901439666748, "logps/chosen": -461.537353515625, "logps/rejected": -525.0523681640625, "loss": 0.0341, "rewards/accuracies": 1.0, "rewards/chosen": 2.890878438949585, "rewards/margins": 6.62315559387207, "rewards/rejected": -3.7322769165039062, "step": 3494 }, { "epoch": 2.5534246575342463, "grad_norm": 10.487984821811986, "learning_rate": 1.738691690920527e-07, "logits/chosen": -2.8422296047210693, "logits/rejected": -2.4606661796569824, "logps/chosen": -408.58160400390625, "logps/rejected": -481.6473693847656, "loss": 0.049, "rewards/accuracies": 1.0, "rewards/chosen": 3.294999361038208, "rewards/margins": 6.684930324554443, "rewards/rejected": -3.3899307250976562, "step": 3495 }, { "epoch": 2.5541552511415526, "grad_norm": 11.968462509806045, "learning_rate": 1.737172561687495e-07, "logits/chosen": -2.9669835567474365, "logits/rejected": -2.297703504562378, "logps/chosen": -671.4326171875, "logps/rejected": -558.4655151367188, "loss": 0.0684, "rewards/accuracies": 0.875, "rewards/chosen": 3.530977487564087, "rewards/margins": 3.3206348419189453, "rewards/rejected": 0.2103423774242401, "step": 3496 }, { "epoch": 2.5548858447488585, "grad_norm": 10.718523651491287, "learning_rate": 1.735653742974727e-07, "logits/chosen": -3.3439645767211914, "logits/rejected": -2.7509350776672363, "logps/chosen": -584.3768310546875, "logps/rejected": -621.193603515625, "loss": 0.0674, "rewards/accuracies": 1.0, "rewards/chosen": 2.9316844940185547, "rewards/margins": 4.385957717895508, "rewards/rejected": -1.454272985458374, "step": 3497 }, { "epoch": 2.5556164383561644, "grad_norm": 21.252424752515374, "learning_rate": 1.7341352354004813e-07, "logits/chosen": -2.3907551765441895, "logits/rejected": -2.4728827476501465, "logps/chosen": -605.3397216796875, "logps/rejected": -581.9125366210938, "loss": 0.0952, "rewards/accuracies": 0.875, "rewards/chosen": 3.036492347717285, "rewards/margins": 3.1378560066223145, "rewards/rejected": -0.10136392712593079, "step": 3498 }, { "epoch": 2.5563470319634702, "grad_norm": 11.90343083524301, "learning_rate": 1.73261703958289e-07, "logits/chosen": -2.357072114944458, "logits/rejected": -1.9300615787506104, "logps/chosen": -470.4288330078125, "logps/rejected": -365.7463684082031, "loss": 0.0606, "rewards/accuracies": 1.0, "rewards/chosen": 2.842454671859741, "rewards/margins": 5.1004486083984375, "rewards/rejected": -2.2579941749572754, "step": 3499 }, { "epoch": 2.557077625570776, "grad_norm": 8.065945620927879, "learning_rate": 1.7310991561399574e-07, "logits/chosen": -2.4073565006256104, "logits/rejected": -1.852588176727295, "logps/chosen": -618.7549438476562, "logps/rejected": -594.4891357421875, "loss": 0.0415, "rewards/accuracies": 1.0, "rewards/chosen": 3.479328155517578, "rewards/margins": 6.401315689086914, "rewards/rejected": -2.9219884872436523, "step": 3500 }, { "epoch": 2.5578082191780824, "grad_norm": 13.625965329192049, "learning_rate": 1.7295815856895592e-07, "logits/chosen": -3.2102465629577637, "logits/rejected": -2.0034561157226562, "logps/chosen": -712.4674682617188, "logps/rejected": -489.47564697265625, "loss": 0.0463, "rewards/accuracies": 1.0, "rewards/chosen": 3.187716484069824, "rewards/margins": 4.8671979904174805, "rewards/rejected": -1.6794815063476562, "step": 3501 }, { "epoch": 2.558538812785388, "grad_norm": 6.938897619283515, "learning_rate": 1.7280643288494455e-07, "logits/chosen": -2.760923147201538, "logits/rejected": -2.347951889038086, "logps/chosen": -524.3330078125, "logps/rejected": -401.407470703125, "loss": 0.0215, "rewards/accuracies": 1.0, "rewards/chosen": 2.5287461280822754, "rewards/margins": 4.657884120941162, "rewards/rejected": -2.129138231277466, "step": 3502 }, { "epoch": 2.559269406392694, "grad_norm": 12.643995441613345, "learning_rate": 1.7265473862372386e-07, "logits/chosen": -2.8675754070281982, "logits/rejected": -1.823378562927246, "logps/chosen": -664.9276733398438, "logps/rejected": -379.333251953125, "loss": 0.0792, "rewards/accuracies": 1.0, "rewards/chosen": 3.5644049644470215, "rewards/margins": 4.218321800231934, "rewards/rejected": -0.653916597366333, "step": 3503 }, { "epoch": 2.56, "grad_norm": 10.794692706371961, "learning_rate": 1.7250307584704332e-07, "logits/chosen": -1.8189234733581543, "logits/rejected": -2.5508475303649902, "logps/chosen": -370.86907958984375, "logps/rejected": -701.8798828125, "loss": 0.0451, "rewards/accuracies": 1.0, "rewards/chosen": 0.803191065788269, "rewards/margins": 8.55276107788086, "rewards/rejected": -7.749570369720459, "step": 3504 }, { "epoch": 2.560730593607306, "grad_norm": 12.526260866823385, "learning_rate": 1.7235144461663935e-07, "logits/chosen": -3.009411096572876, "logits/rejected": -1.960134506225586, "logps/chosen": -765.739501953125, "logps/rejected": -585.370849609375, "loss": 0.0555, "rewards/accuracies": 1.0, "rewards/chosen": 4.346357345581055, "rewards/margins": 6.153564453125, "rewards/rejected": -1.8072072267532349, "step": 3505 }, { "epoch": 2.561461187214612, "grad_norm": 10.195214469116825, "learning_rate": 1.7219984499423585e-07, "logits/chosen": -3.1373188495635986, "logits/rejected": -2.787659168243408, "logps/chosen": -889.857177734375, "logps/rejected": -776.7056274414062, "loss": 0.0379, "rewards/accuracies": 1.0, "rewards/chosen": 5.317035675048828, "rewards/margins": 6.547072410583496, "rewards/rejected": -1.2300368547439575, "step": 3506 }, { "epoch": 2.5621917808219177, "grad_norm": 24.160503294294053, "learning_rate": 1.720482770415436e-07, "logits/chosen": -2.558124542236328, "logits/rejected": -2.391937732696533, "logps/chosen": -544.6647338867188, "logps/rejected": -661.38623046875, "loss": 0.0571, "rewards/accuracies": 1.0, "rewards/chosen": 3.8946616649627686, "rewards/margins": 5.708076000213623, "rewards/rejected": -1.8134143352508545, "step": 3507 }, { "epoch": 2.562922374429224, "grad_norm": 22.920325228795292, "learning_rate": 1.7189674082026067e-07, "logits/chosen": -2.5068936347961426, "logits/rejected": -2.175656795501709, "logps/chosen": -642.051025390625, "logps/rejected": -448.443359375, "loss": 0.0666, "rewards/accuracies": 1.0, "rewards/chosen": 4.492753505706787, "rewards/margins": 5.75739049911499, "rewards/rejected": -1.2646371126174927, "step": 3508 }, { "epoch": 2.5636529680365294, "grad_norm": 20.527716531462865, "learning_rate": 1.7174523639207216e-07, "logits/chosen": -2.8654866218566895, "logits/rejected": -2.812410831451416, "logps/chosen": -560.3977661132812, "logps/rejected": -503.34844970703125, "loss": 0.1031, "rewards/accuracies": 1.0, "rewards/chosen": 1.6169968843460083, "rewards/margins": 3.444201946258545, "rewards/rejected": -1.827204942703247, "step": 3509 }, { "epoch": 2.5643835616438357, "grad_norm": 12.601142928345267, "learning_rate": 1.7159376381865013e-07, "logits/chosen": -2.478513717651367, "logits/rejected": -2.2094225883483887, "logps/chosen": -868.6219482421875, "logps/rejected": -736.6192016601562, "loss": 0.0489, "rewards/accuracies": 1.0, "rewards/chosen": 3.2524337768554688, "rewards/margins": 4.069156646728516, "rewards/rejected": -0.8167226910591125, "step": 3510 }, { "epoch": 2.5651141552511416, "grad_norm": 9.232558431597989, "learning_rate": 1.714423231616537e-07, "logits/chosen": -2.7038168907165527, "logits/rejected": -2.2081594467163086, "logps/chosen": -665.2026977539062, "logps/rejected": -633.0308837890625, "loss": 0.0434, "rewards/accuracies": 1.0, "rewards/chosen": 2.8926382064819336, "rewards/margins": 4.552680969238281, "rewards/rejected": -1.6600427627563477, "step": 3511 }, { "epoch": 2.5658447488584475, "grad_norm": 13.942398920949667, "learning_rate": 1.7129091448272916e-07, "logits/chosen": -2.6732211112976074, "logits/rejected": -2.5923705101013184, "logps/chosen": -432.166015625, "logps/rejected": -402.038818359375, "loss": 0.0662, "rewards/accuracies": 1.0, "rewards/chosen": 2.742046594619751, "rewards/margins": 3.9224419593811035, "rewards/rejected": -1.180395245552063, "step": 3512 }, { "epoch": 2.5665753424657534, "grad_norm": 15.196312125699436, "learning_rate": 1.711395378435097e-07, "logits/chosen": -2.9476847648620605, "logits/rejected": -2.2014577388763428, "logps/chosen": -598.7908325195312, "logps/rejected": -384.95660400390625, "loss": 0.0643, "rewards/accuracies": 1.0, "rewards/chosen": 2.4127440452575684, "rewards/margins": 4.030447006225586, "rewards/rejected": -1.617702841758728, "step": 3513 }, { "epoch": 2.5673059360730592, "grad_norm": 7.613851933940707, "learning_rate": 1.7098819330561527e-07, "logits/chosen": -2.2222211360931396, "logits/rejected": -2.3888280391693115, "logps/chosen": -472.5494384765625, "logps/rejected": -747.7283935546875, "loss": 0.0329, "rewards/accuracies": 1.0, "rewards/chosen": 2.6083099842071533, "rewards/margins": 3.6556105613708496, "rewards/rejected": -1.0473006963729858, "step": 3514 }, { "epoch": 2.5680365296803656, "grad_norm": 10.395658951534582, "learning_rate": 1.7083688093065294e-07, "logits/chosen": -3.1840577125549316, "logits/rejected": -2.6676042079925537, "logps/chosen": -568.2211303710938, "logps/rejected": -597.1668701171875, "loss": 0.0614, "rewards/accuracies": 1.0, "rewards/chosen": 3.2591469287872314, "rewards/margins": 5.028813362121582, "rewards/rejected": -1.7696666717529297, "step": 3515 }, { "epoch": 2.568767123287671, "grad_norm": 8.059840037081372, "learning_rate": 1.7068560078021677e-07, "logits/chosen": -2.809659481048584, "logits/rejected": -2.609712600708008, "logps/chosen": -487.0518798828125, "logps/rejected": -451.66326904296875, "loss": 0.0533, "rewards/accuracies": 1.0, "rewards/chosen": 3.2756712436676025, "rewards/margins": 3.2800958156585693, "rewards/rejected": -0.004424482583999634, "step": 3516 }, { "epoch": 2.5694977168949773, "grad_norm": 9.086268752694442, "learning_rate": 1.7053435291588764e-07, "logits/chosen": -2.4307126998901367, "logits/rejected": -2.5018129348754883, "logps/chosen": -576.4291381835938, "logps/rejected": -713.7456665039062, "loss": 0.0435, "rewards/accuracies": 1.0, "rewards/chosen": 2.1897525787353516, "rewards/margins": 4.483883380889893, "rewards/rejected": -2.294130802154541, "step": 3517 }, { "epoch": 2.570228310502283, "grad_norm": 9.35556864834451, "learning_rate": 1.7038313739923306e-07, "logits/chosen": -3.4600603580474854, "logits/rejected": -2.081709861755371, "logps/chosen": -697.1295166015625, "logps/rejected": -360.0853576660156, "loss": 0.0313, "rewards/accuracies": 1.0, "rewards/chosen": 3.9032554626464844, "rewards/margins": 7.074195861816406, "rewards/rejected": -3.17094087600708, "step": 3518 }, { "epoch": 2.570958904109589, "grad_norm": 26.353523079674517, "learning_rate": 1.7023195429180767e-07, "logits/chosen": -3.4962615966796875, "logits/rejected": -2.1923513412475586, "logps/chosen": -771.3909912109375, "logps/rejected": -446.5802307128906, "loss": 0.0977, "rewards/accuracies": 1.0, "rewards/chosen": 3.6926846504211426, "rewards/margins": 5.828258514404297, "rewards/rejected": -2.1355738639831543, "step": 3519 }, { "epoch": 2.571689497716895, "grad_norm": 12.073054148582676, "learning_rate": 1.7008080365515277e-07, "logits/chosen": -3.1229641437530518, "logits/rejected": -2.3673057556152344, "logps/chosen": -664.5501098632812, "logps/rejected": -514.2810668945312, "loss": 0.046, "rewards/accuracies": 1.0, "rewards/chosen": 3.348165512084961, "rewards/margins": 3.9266433715820312, "rewards/rejected": -0.5784776210784912, "step": 3520 }, { "epoch": 2.572420091324201, "grad_norm": 11.639170551963542, "learning_rate": 1.699296855507965e-07, "logits/chosen": -2.507847785949707, "logits/rejected": -2.480490207672119, "logps/chosen": -634.3485107421875, "logps/rejected": -782.472900390625, "loss": 0.0439, "rewards/accuracies": 1.0, "rewards/chosen": 4.249233245849609, "rewards/margins": 6.442728042602539, "rewards/rejected": -2.193495273590088, "step": 3521 }, { "epoch": 2.573150684931507, "grad_norm": 8.110788916658228, "learning_rate": 1.697786000402538e-07, "logits/chosen": -2.648036479949951, "logits/rejected": -1.7123713493347168, "logps/chosen": -838.624267578125, "logps/rejected": -647.3636474609375, "loss": 0.0393, "rewards/accuracies": 1.0, "rewards/chosen": 3.86049747467041, "rewards/margins": 5.574410438537598, "rewards/rejected": -1.7139132022857666, "step": 3522 }, { "epoch": 2.5738812785388125, "grad_norm": 15.614361124152012, "learning_rate": 1.6962754718502615e-07, "logits/chosen": -2.9674875736236572, "logits/rejected": -2.059278726577759, "logps/chosen": -886.175048828125, "logps/rejected": -598.2471923828125, "loss": 0.0804, "rewards/accuracies": 1.0, "rewards/chosen": 5.174866676330566, "rewards/margins": 4.899536609649658, "rewards/rejected": 0.27533042430877686, "step": 3523 }, { "epoch": 2.574611872146119, "grad_norm": 7.639129055370735, "learning_rate": 1.6947652704660188e-07, "logits/chosen": -2.5038957595825195, "logits/rejected": -1.99175226688385, "logps/chosen": -432.343017578125, "logps/rejected": -501.60626220703125, "loss": 0.0494, "rewards/accuracies": 1.0, "rewards/chosen": 2.539301633834839, "rewards/margins": 5.476639270782471, "rewards/rejected": -2.937337875366211, "step": 3524 }, { "epoch": 2.5753424657534247, "grad_norm": 7.019341043263294, "learning_rate": 1.6932553968645605e-07, "logits/chosen": -2.5967907905578613, "logits/rejected": -2.510035276412964, "logps/chosen": -648.2969970703125, "logps/rejected": -721.242919921875, "loss": 0.0309, "rewards/accuracies": 1.0, "rewards/chosen": 4.1476874351501465, "rewards/margins": 6.884624004364014, "rewards/rejected": -2.736936092376709, "step": 3525 }, { "epoch": 2.5760730593607306, "grad_norm": 14.424563205922064, "learning_rate": 1.691745851660503e-07, "logits/chosen": -2.6565933227539062, "logits/rejected": -2.605034112930298, "logps/chosen": -767.5945434570312, "logps/rejected": -651.757568359375, "loss": 0.1194, "rewards/accuracies": 1.0, "rewards/chosen": 3.063328266143799, "rewards/margins": 4.326154708862305, "rewards/rejected": -1.262826919555664, "step": 3526 }, { "epoch": 2.5768036529680365, "grad_norm": 9.372494375451558, "learning_rate": 1.6902366354683292e-07, "logits/chosen": -2.4475531578063965, "logits/rejected": -2.1270601749420166, "logps/chosen": -768.691650390625, "logps/rejected": -643.4854125976562, "loss": 0.0321, "rewards/accuracies": 1.0, "rewards/chosen": 4.348005294799805, "rewards/margins": 7.587731838226318, "rewards/rejected": -3.239727020263672, "step": 3527 }, { "epoch": 2.5775342465753424, "grad_norm": 12.718949749851904, "learning_rate": 1.6887277489023875e-07, "logits/chosen": -2.947751045227051, "logits/rejected": -2.299854278564453, "logps/chosen": -541.8956909179688, "logps/rejected": -342.937744140625, "loss": 0.0618, "rewards/accuracies": 1.0, "rewards/chosen": 3.0959839820861816, "rewards/margins": 5.491064071655273, "rewards/rejected": -2.395080089569092, "step": 3528 }, { "epoch": 2.5782648401826487, "grad_norm": 13.565767378061002, "learning_rate": 1.687219192576893e-07, "logits/chosen": -2.636698007583618, "logits/rejected": -1.8305342197418213, "logps/chosen": -556.4046630859375, "logps/rejected": -514.83056640625, "loss": 0.068, "rewards/accuracies": 1.0, "rewards/chosen": 3.0466575622558594, "rewards/margins": 5.282154083251953, "rewards/rejected": -2.235496997833252, "step": 3529 }, { "epoch": 2.578995433789954, "grad_norm": 11.911735172276671, "learning_rate": 1.6857109671059268e-07, "logits/chosen": -2.7807021141052246, "logits/rejected": -2.16862416267395, "logps/chosen": -484.3919677734375, "logps/rejected": -531.5345458984375, "loss": 0.0519, "rewards/accuracies": 1.0, "rewards/chosen": 3.863075017929077, "rewards/margins": 5.310728549957275, "rewards/rejected": -1.4476536512374878, "step": 3530 }, { "epoch": 2.5797260273972604, "grad_norm": 12.70470145603249, "learning_rate": 1.684203073103433e-07, "logits/chosen": -2.4950008392333984, "logits/rejected": -1.9658613204956055, "logps/chosen": -623.190185546875, "logps/rejected": -634.7012939453125, "loss": 0.047, "rewards/accuracies": 1.0, "rewards/chosen": 3.559809684753418, "rewards/margins": 3.931175470352173, "rewards/rejected": -0.37136560678482056, "step": 3531 }, { "epoch": 2.5804566210045663, "grad_norm": 23.549815520193704, "learning_rate": 1.682695511183223e-07, "logits/chosen": -2.6297359466552734, "logits/rejected": -2.318751811981201, "logps/chosen": -614.1175537109375, "logps/rejected": -587.2684326171875, "loss": 0.0924, "rewards/accuracies": 1.0, "rewards/chosen": 3.4652554988861084, "rewards/margins": 4.301737308502197, "rewards/rejected": -0.8364816904067993, "step": 3532 }, { "epoch": 2.581187214611872, "grad_norm": 10.255462679494661, "learning_rate": 1.6811882819589718e-07, "logits/chosen": -2.910348892211914, "logits/rejected": -1.927341103553772, "logps/chosen": -409.77606201171875, "logps/rejected": -280.7693786621094, "loss": 0.0387, "rewards/accuracies": 1.0, "rewards/chosen": 2.511258602142334, "rewards/margins": 4.636491298675537, "rewards/rejected": -2.1252329349517822, "step": 3533 }, { "epoch": 2.581917808219178, "grad_norm": 8.9589794783478, "learning_rate": 1.6796813860442202e-07, "logits/chosen": -2.772007465362549, "logits/rejected": -1.9631271362304688, "logps/chosen": -908.7734985351562, "logps/rejected": -774.7870483398438, "loss": 0.038, "rewards/accuracies": 1.0, "rewards/chosen": 6.209003448486328, "rewards/margins": 6.56273889541626, "rewards/rejected": -0.3537350296974182, "step": 3534 }, { "epoch": 2.582648401826484, "grad_norm": 20.822783624460342, "learning_rate": 1.6781748240523737e-07, "logits/chosen": -3.0861077308654785, "logits/rejected": -2.6600611209869385, "logps/chosen": -581.3689575195312, "logps/rejected": -432.7031555175781, "loss": 0.1003, "rewards/accuracies": 0.875, "rewards/chosen": 3.10459041595459, "rewards/margins": 4.889992713928223, "rewards/rejected": -1.7854024171829224, "step": 3535 }, { "epoch": 2.58337899543379, "grad_norm": 7.753321274375995, "learning_rate": 1.6766685965966987e-07, "logits/chosen": -2.7165634632110596, "logits/rejected": -2.012483835220337, "logps/chosen": -633.2221069335938, "logps/rejected": -498.39935302734375, "loss": 0.0394, "rewards/accuracies": 1.0, "rewards/chosen": 4.027205467224121, "rewards/margins": 5.05600643157959, "rewards/rejected": -1.0288012027740479, "step": 3536 }, { "epoch": 2.5841095890410957, "grad_norm": 6.241357550035314, "learning_rate": 1.6751627042903283e-07, "logits/chosen": -2.6854562759399414, "logits/rejected": -2.358578681945801, "logps/chosen": -536.8604125976562, "logps/rejected": -440.5831298828125, "loss": 0.0262, "rewards/accuracies": 1.0, "rewards/chosen": 2.5799546241760254, "rewards/margins": 4.779939651489258, "rewards/rejected": -2.1999850273132324, "step": 3537 }, { "epoch": 2.584840182648402, "grad_norm": 6.520597833477927, "learning_rate": 1.673657147746258e-07, "logits/chosen": -3.3434438705444336, "logits/rejected": -2.076491117477417, "logps/chosen": -1045.1722412109375, "logps/rejected": -524.8632202148438, "loss": 0.0208, "rewards/accuracies": 1.0, "rewards/chosen": 4.250133514404297, "rewards/margins": 6.627125263214111, "rewards/rejected": -2.3769917488098145, "step": 3538 }, { "epoch": 2.585570776255708, "grad_norm": 10.290655871546845, "learning_rate": 1.6721519275773483e-07, "logits/chosen": -2.7004244327545166, "logits/rejected": -1.8512382507324219, "logps/chosen": -584.510498046875, "logps/rejected": -385.3194885253906, "loss": 0.0488, "rewards/accuracies": 1.0, "rewards/chosen": 3.2678966522216797, "rewards/margins": 6.623065948486328, "rewards/rejected": -3.3551692962646484, "step": 3539 }, { "epoch": 2.5863013698630137, "grad_norm": 21.763500715163133, "learning_rate": 1.67064704439632e-07, "logits/chosen": -2.759230852127075, "logits/rejected": -2.0491092205047607, "logps/chosen": -834.359130859375, "logps/rejected": -540.5346069335938, "loss": 0.1269, "rewards/accuracies": 1.0, "rewards/chosen": 2.695901393890381, "rewards/margins": 4.54487943649292, "rewards/rejected": -1.848978042602539, "step": 3540 }, { "epoch": 2.5870319634703196, "grad_norm": 14.253990283905482, "learning_rate": 1.6691424988157592e-07, "logits/chosen": -3.075320243835449, "logits/rejected": -2.082012414932251, "logps/chosen": -845.3888549804688, "logps/rejected": -422.4978942871094, "loss": 0.0725, "rewards/accuracies": 1.0, "rewards/chosen": 2.8160877227783203, "rewards/margins": 4.369554042816162, "rewards/rejected": -1.553466558456421, "step": 3541 }, { "epoch": 2.5877625570776255, "grad_norm": 19.57259841939028, "learning_rate": 1.6676382914481128e-07, "logits/chosen": -3.121849775314331, "logits/rejected": -2.018139362335205, "logps/chosen": -883.8381958007812, "logps/rejected": -526.931396484375, "loss": 0.0815, "rewards/accuracies": 1.0, "rewards/chosen": 4.665197372436523, "rewards/margins": 5.589356899261475, "rewards/rejected": -0.9241594076156616, "step": 3542 }, { "epoch": 2.5884931506849314, "grad_norm": 5.747403761168732, "learning_rate": 1.6661344229056917e-07, "logits/chosen": -2.6896302700042725, "logits/rejected": -1.722487211227417, "logps/chosen": -557.7382202148438, "logps/rejected": -427.72705078125, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": 4.547546863555908, "rewards/margins": 7.545283317565918, "rewards/rejected": -2.9977362155914307, "step": 3543 }, { "epoch": 2.5892237442922372, "grad_norm": 11.127229274318122, "learning_rate": 1.664630893800667e-07, "logits/chosen": -2.919461727142334, "logits/rejected": -2.351314067840576, "logps/chosen": -905.8994140625, "logps/rejected": -581.9038696289062, "loss": 0.0509, "rewards/accuracies": 1.0, "rewards/chosen": 3.8789453506469727, "rewards/margins": 5.534663200378418, "rewards/rejected": -1.655718207359314, "step": 3544 }, { "epoch": 2.5899543378995435, "grad_norm": 18.452698317463287, "learning_rate": 1.6631277047450728e-07, "logits/chosen": -2.8434016704559326, "logits/rejected": -1.902876615524292, "logps/chosen": -394.31158447265625, "logps/rejected": -248.28515625, "loss": 0.08, "rewards/accuracies": 0.875, "rewards/chosen": 2.8353536128997803, "rewards/margins": 7.277480602264404, "rewards/rejected": -4.442127227783203, "step": 3545 }, { "epoch": 2.5906849315068494, "grad_norm": 10.543571169455399, "learning_rate": 1.6616248563508052e-07, "logits/chosen": -2.3657283782958984, "logits/rejected": -2.361867904663086, "logps/chosen": -476.29974365234375, "logps/rejected": -603.3002319335938, "loss": 0.0547, "rewards/accuracies": 1.0, "rewards/chosen": 1.8372539281845093, "rewards/margins": 4.3307719230651855, "rewards/rejected": -2.4935178756713867, "step": 3546 }, { "epoch": 2.5914155251141553, "grad_norm": 10.775231846131694, "learning_rate": 1.6601223492296206e-07, "logits/chosen": -2.620626211166382, "logits/rejected": -2.4040331840515137, "logps/chosen": -544.9815673828125, "logps/rejected": -536.9609985351562, "loss": 0.0631, "rewards/accuracies": 1.0, "rewards/chosen": 1.3004499673843384, "rewards/margins": 3.2483437061309814, "rewards/rejected": -1.9478936195373535, "step": 3547 }, { "epoch": 2.592146118721461, "grad_norm": 9.456005823950095, "learning_rate": 1.658620183993138e-07, "logits/chosen": -3.036764621734619, "logits/rejected": -2.186296224594116, "logps/chosen": -770.3209228515625, "logps/rejected": -698.0421142578125, "loss": 0.0353, "rewards/accuracies": 1.0, "rewards/chosen": 4.687179088592529, "rewards/margins": 6.455626010894775, "rewards/rejected": -1.768446445465088, "step": 3548 }, { "epoch": 2.592876712328767, "grad_norm": 17.566281600486146, "learning_rate": 1.6571183612528338e-07, "logits/chosen": -2.2960898876190186, "logits/rejected": -1.9508428573608398, "logps/chosen": -601.4609375, "logps/rejected": -630.9443969726562, "loss": 0.0823, "rewards/accuracies": 1.0, "rewards/chosen": 2.619959831237793, "rewards/margins": 6.704583168029785, "rewards/rejected": -4.084622859954834, "step": 3549 }, { "epoch": 2.593607305936073, "grad_norm": 6.835010539830194, "learning_rate": 1.655616881620049e-07, "logits/chosen": -2.9191112518310547, "logits/rejected": -2.474094867706299, "logps/chosen": -615.1170043945312, "logps/rejected": -583.1980590820312, "loss": 0.0258, "rewards/accuracies": 1.0, "rewards/chosen": 3.4115376472473145, "rewards/margins": 5.422096252441406, "rewards/rejected": -2.010558605194092, "step": 3550 }, { "epoch": 2.594337899543379, "grad_norm": 10.86712712285528, "learning_rate": 1.654115745705982e-07, "logits/chosen": -2.8767049312591553, "logits/rejected": -1.9617975950241089, "logps/chosen": -660.1470336914062, "logps/rejected": -500.1827087402344, "loss": 0.0827, "rewards/accuracies": 1.0, "rewards/chosen": 4.46120023727417, "rewards/margins": 6.7954607009887695, "rewards/rejected": -2.3342602252960205, "step": 3551 }, { "epoch": 2.595068493150685, "grad_norm": 9.689288755597877, "learning_rate": 1.652614954121695e-07, "logits/chosen": -3.072077989578247, "logits/rejected": -1.989102840423584, "logps/chosen": -746.5731201171875, "logps/rejected": -522.7906494140625, "loss": 0.0481, "rewards/accuracies": 1.0, "rewards/chosen": 3.5211966037750244, "rewards/margins": 4.886735916137695, "rewards/rejected": -1.3655391931533813, "step": 3552 }, { "epoch": 2.595799086757991, "grad_norm": 15.775080496556953, "learning_rate": 1.651114507478105e-07, "logits/chosen": -2.791445732116699, "logits/rejected": -1.958003282546997, "logps/chosen": -441.3681335449219, "logps/rejected": -279.603515625, "loss": 0.0523, "rewards/accuracies": 1.0, "rewards/chosen": 2.9441614151000977, "rewards/margins": 6.055661201477051, "rewards/rejected": -3.111499786376953, "step": 3553 }, { "epoch": 2.596529680365297, "grad_norm": 12.745134210549605, "learning_rate": 1.6496144063859918e-07, "logits/chosen": -2.6423420906066895, "logits/rejected": -2.4253687858581543, "logps/chosen": -711.651123046875, "logps/rejected": -530.2528076171875, "loss": 0.0475, "rewards/accuracies": 1.0, "rewards/chosen": 4.640811443328857, "rewards/margins": 6.069547176361084, "rewards/rejected": -1.4287357330322266, "step": 3554 }, { "epoch": 2.5972602739726027, "grad_norm": 10.894979079963028, "learning_rate": 1.6481146514559943e-07, "logits/chosen": -2.241943597793579, "logits/rejected": -2.3477437496185303, "logps/chosen": -455.359619140625, "logps/rejected": -773.7344970703125, "loss": 0.0514, "rewards/accuracies": 1.0, "rewards/chosen": 3.2115697860717773, "rewards/margins": 5.5102691650390625, "rewards/rejected": -2.298699378967285, "step": 3555 }, { "epoch": 2.5979908675799086, "grad_norm": 8.592997126112344, "learning_rate": 1.6466152432986104e-07, "logits/chosen": -2.811309337615967, "logits/rejected": -2.3711130619049072, "logps/chosen": -449.31817626953125, "logps/rejected": -437.340087890625, "loss": 0.0417, "rewards/accuracies": 1.0, "rewards/chosen": 2.165565013885498, "rewards/margins": 4.559901714324951, "rewards/rejected": -2.394336700439453, "step": 3556 }, { "epoch": 2.5987214611872145, "grad_norm": 11.677497051077756, "learning_rate": 1.6451161825241955e-07, "logits/chosen": -2.805449962615967, "logits/rejected": -2.4474642276763916, "logps/chosen": -723.3367309570312, "logps/rejected": -356.29803466796875, "loss": 0.0696, "rewards/accuracies": 1.0, "rewards/chosen": 2.8791000843048096, "rewards/margins": 4.341769218444824, "rewards/rejected": -1.462669014930725, "step": 3557 }, { "epoch": 2.5994520547945204, "grad_norm": 11.945271623540263, "learning_rate": 1.643617469742965e-07, "logits/chosen": -2.5106287002563477, "logits/rejected": -1.8104650974273682, "logps/chosen": -593.8304443359375, "logps/rejected": -574.9540405273438, "loss": 0.0479, "rewards/accuracies": 1.0, "rewards/chosen": 3.2596044540405273, "rewards/margins": 4.802481651306152, "rewards/rejected": -1.542877197265625, "step": 3558 }, { "epoch": 2.6001826484018267, "grad_norm": 8.923635546250457, "learning_rate": 1.6421191055649925e-07, "logits/chosen": -3.233914852142334, "logits/rejected": -3.0882396697998047, "logps/chosen": -713.5966796875, "logps/rejected": -788.0604248046875, "loss": 0.0394, "rewards/accuracies": 1.0, "rewards/chosen": 4.056040287017822, "rewards/margins": 5.2179107666015625, "rewards/rejected": -1.161870002746582, "step": 3559 }, { "epoch": 2.6009132420091325, "grad_norm": 24.887692074535057, "learning_rate": 1.640621090600209e-07, "logits/chosen": -2.9031612873077393, "logits/rejected": -2.3591361045837402, "logps/chosen": -846.1986694335938, "logps/rejected": -702.042724609375, "loss": 0.0835, "rewards/accuracies": 0.875, "rewards/chosen": 4.561847686767578, "rewards/margins": 5.141173362731934, "rewards/rejected": -0.5793257355690002, "step": 3560 }, { "epoch": 2.6016438356164384, "grad_norm": 8.900758140933752, "learning_rate": 1.6391234254584046e-07, "logits/chosen": -2.658454179763794, "logits/rejected": -2.662245750427246, "logps/chosen": -706.6485595703125, "logps/rejected": -790.1193237304688, "loss": 0.032, "rewards/accuracies": 1.0, "rewards/chosen": 2.819786548614502, "rewards/margins": 4.377067565917969, "rewards/rejected": -1.5572807788848877, "step": 3561 }, { "epoch": 2.6023744292237443, "grad_norm": 7.624170727270243, "learning_rate": 1.6376261107492255e-07, "logits/chosen": -2.820061206817627, "logits/rejected": -2.833596706390381, "logps/chosen": -907.0960083007812, "logps/rejected": -793.8038330078125, "loss": 0.0307, "rewards/accuracies": 1.0, "rewards/chosen": 4.888092517852783, "rewards/margins": 5.934732437133789, "rewards/rejected": -1.0466396808624268, "step": 3562 }, { "epoch": 2.60310502283105, "grad_norm": 7.889649827581416, "learning_rate": 1.636129147082176e-07, "logits/chosen": -3.0782129764556885, "logits/rejected": -2.3544063568115234, "logps/chosen": -542.5014038085938, "logps/rejected": -528.4813232421875, "loss": 0.0341, "rewards/accuracies": 1.0, "rewards/chosen": 2.581117630004883, "rewards/margins": 5.47018575668335, "rewards/rejected": -2.889068126678467, "step": 3563 }, { "epoch": 2.603835616438356, "grad_norm": 17.874715736102893, "learning_rate": 1.6346325350666176e-07, "logits/chosen": -2.996812582015991, "logits/rejected": -2.6766765117645264, "logps/chosen": -847.3071899414062, "logps/rejected": -691.6283569335938, "loss": 0.0733, "rewards/accuracies": 1.0, "rewards/chosen": 2.682393789291382, "rewards/margins": 4.513767719268799, "rewards/rejected": -1.831374168395996, "step": 3564 }, { "epoch": 2.604566210045662, "grad_norm": 15.003085518427275, "learning_rate": 1.6331362753117695e-07, "logits/chosen": -2.8382768630981445, "logits/rejected": -2.4276645183563232, "logps/chosen": -216.26220703125, "logps/rejected": -212.86737060546875, "loss": 0.09, "rewards/accuracies": 1.0, "rewards/chosen": 0.9485878348350525, "rewards/margins": 2.871182918548584, "rewards/rejected": -1.9225949048995972, "step": 3565 }, { "epoch": 2.6052968036529682, "grad_norm": 15.73758268569422, "learning_rate": 1.6316403684267043e-07, "logits/chosen": -2.459444522857666, "logits/rejected": -2.5787930488586426, "logps/chosen": -662.9869995117188, "logps/rejected": -720.05908203125, "loss": 0.0849, "rewards/accuracies": 1.0, "rewards/chosen": 2.3797996044158936, "rewards/margins": 4.7033586502075195, "rewards/rejected": -2.323558807373047, "step": 3566 }, { "epoch": 2.606027397260274, "grad_norm": 14.85191746929158, "learning_rate": 1.6301448150203545e-07, "logits/chosen": -2.6403424739837646, "logits/rejected": -2.791203498840332, "logps/chosen": -577.0678100585938, "logps/rejected": -658.54345703125, "loss": 0.0589, "rewards/accuracies": 1.0, "rewards/chosen": 2.555622100830078, "rewards/margins": 4.505005359649658, "rewards/rejected": -1.949383020401001, "step": 3567 }, { "epoch": 2.60675799086758, "grad_norm": 13.332556574765993, "learning_rate": 1.6286496157015068e-07, "logits/chosen": -2.398625373840332, "logits/rejected": -1.6899011135101318, "logps/chosen": -621.3199462890625, "logps/rejected": -566.8900146484375, "loss": 0.0709, "rewards/accuracies": 1.0, "rewards/chosen": 4.0197224617004395, "rewards/margins": 6.310145378112793, "rewards/rejected": -2.2904229164123535, "step": 3568 }, { "epoch": 2.607488584474886, "grad_norm": 15.69996470064702, "learning_rate": 1.6271547710788063e-07, "logits/chosen": -2.9469802379608154, "logits/rejected": -2.3795039653778076, "logps/chosen": -798.9644165039062, "logps/rejected": -658.3718872070312, "loss": 0.0646, "rewards/accuracies": 1.0, "rewards/chosen": 4.466236591339111, "rewards/margins": 5.681744575500488, "rewards/rejected": -1.215508222579956, "step": 3569 }, { "epoch": 2.6082191780821917, "grad_norm": 11.090276125883424, "learning_rate": 1.6256602817607493e-07, "logits/chosen": -2.618405818939209, "logits/rejected": -2.2830586433410645, "logps/chosen": -727.34228515625, "logps/rejected": -600.6311645507812, "loss": 0.0523, "rewards/accuracies": 1.0, "rewards/chosen": 3.7203476428985596, "rewards/margins": 4.268023490905762, "rewards/rejected": -0.5476762056350708, "step": 3570 }, { "epoch": 2.6089497716894976, "grad_norm": 13.601499676764737, "learning_rate": 1.6241661483556906e-07, "logits/chosen": -2.809386730194092, "logits/rejected": -2.4739394187927246, "logps/chosen": -764.0732421875, "logps/rejected": -880.739013671875, "loss": 0.0415, "rewards/accuracies": 1.0, "rewards/chosen": 3.896759033203125, "rewards/margins": 5.297738075256348, "rewards/rejected": -1.4009790420532227, "step": 3571 }, { "epoch": 2.6096803652968035, "grad_norm": 11.676317056541537, "learning_rate": 1.6226723714718398e-07, "logits/chosen": -2.65395188331604, "logits/rejected": -2.1496105194091797, "logps/chosen": -510.28369140625, "logps/rejected": -526.0653076171875, "loss": 0.0406, "rewards/accuracies": 1.0, "rewards/chosen": 3.7150728702545166, "rewards/margins": 5.582674503326416, "rewards/rejected": -1.8676016330718994, "step": 3572 }, { "epoch": 2.61041095890411, "grad_norm": 28.261589981133735, "learning_rate": 1.6211789517172607e-07, "logits/chosen": -2.710301637649536, "logits/rejected": -1.6452345848083496, "logps/chosen": -752.3734741210938, "logps/rejected": -472.94696044921875, "loss": 0.0703, "rewards/accuracies": 1.0, "rewards/chosen": 6.305537700653076, "rewards/margins": 8.630257606506348, "rewards/rejected": -2.324719190597534, "step": 3573 }, { "epoch": 2.6111415525114157, "grad_norm": 14.44334867836825, "learning_rate": 1.6196858896998732e-07, "logits/chosen": -2.4028029441833496, "logits/rejected": -3.0777950286865234, "logps/chosen": -281.8041687011719, "logps/rejected": -458.5091247558594, "loss": 0.0816, "rewards/accuracies": 0.875, "rewards/chosen": 0.9884293079376221, "rewards/margins": 4.565962791442871, "rewards/rejected": -3.57753324508667, "step": 3574 }, { "epoch": 2.6118721461187215, "grad_norm": 6.25878471985046, "learning_rate": 1.618193186027449e-07, "logits/chosen": -2.5234484672546387, "logits/rejected": -2.0842337608337402, "logps/chosen": -419.05517578125, "logps/rejected": -482.0821533203125, "loss": 0.0284, "rewards/accuracies": 1.0, "rewards/chosen": 1.9417710304260254, "rewards/margins": 4.890039920806885, "rewards/rejected": -2.9482691287994385, "step": 3575 }, { "epoch": 2.6126027397260274, "grad_norm": 17.080396049249526, "learning_rate": 1.6167008413076156e-07, "logits/chosen": -2.9520349502563477, "logits/rejected": -2.324692726135254, "logps/chosen": -635.1697998046875, "logps/rejected": -506.9693908691406, "loss": 0.088, "rewards/accuracies": 0.875, "rewards/chosen": 3.7577965259552, "rewards/margins": 4.083688735961914, "rewards/rejected": -0.32589247822761536, "step": 3576 }, { "epoch": 2.6133333333333333, "grad_norm": 8.754046131328849, "learning_rate": 1.6152088561478542e-07, "logits/chosen": -2.977228879928589, "logits/rejected": -1.968795657157898, "logps/chosen": -744.4429931640625, "logps/rejected": -481.9659118652344, "loss": 0.0326, "rewards/accuracies": 1.0, "rewards/chosen": 4.672590255737305, "rewards/margins": 5.993344306945801, "rewards/rejected": -1.320754051208496, "step": 3577 }, { "epoch": 2.614063926940639, "grad_norm": 10.95160885492312, "learning_rate": 1.6137172311555004e-07, "logits/chosen": -2.912794828414917, "logits/rejected": -2.22251558303833, "logps/chosen": -664.3663940429688, "logps/rejected": -437.8243103027344, "loss": 0.0454, "rewards/accuracies": 1.0, "rewards/chosen": 3.593999147415161, "rewards/margins": 5.225069046020508, "rewards/rejected": -1.6310696601867676, "step": 3578 }, { "epoch": 2.614794520547945, "grad_norm": 10.414361032269298, "learning_rate": 1.6122259669377412e-07, "logits/chosen": -2.6319258213043213, "logits/rejected": -2.296875, "logps/chosen": -557.4036865234375, "logps/rejected": -568.01806640625, "loss": 0.0412, "rewards/accuracies": 1.0, "rewards/chosen": 3.3020987510681152, "rewards/margins": 5.945897102355957, "rewards/rejected": -2.643798351287842, "step": 3579 }, { "epoch": 2.6155251141552514, "grad_norm": 7.419485892199069, "learning_rate": 1.6107350641016182e-07, "logits/chosen": -3.053725481033325, "logits/rejected": -2.0736629962921143, "logps/chosen": -545.9978637695312, "logps/rejected": -431.6983642578125, "loss": 0.0276, "rewards/accuracies": 1.0, "rewards/chosen": 3.136354446411133, "rewards/margins": 5.285078048706055, "rewards/rejected": -2.148723840713501, "step": 3580 }, { "epoch": 2.6162557077625572, "grad_norm": 12.878124265386111, "learning_rate": 1.609244523254026e-07, "logits/chosen": -3.008915901184082, "logits/rejected": -1.8403574228286743, "logps/chosen": -780.2929077148438, "logps/rejected": -311.0844421386719, "loss": 0.0601, "rewards/accuracies": 1.0, "rewards/chosen": 3.2170045375823975, "rewards/margins": 4.680533409118652, "rewards/rejected": -1.4635292291641235, "step": 3581 }, { "epoch": 2.616986301369863, "grad_norm": 12.246173494556945, "learning_rate": 1.6077543450017112e-07, "logits/chosen": -2.613595485687256, "logits/rejected": -2.0309574604034424, "logps/chosen": -459.6081237792969, "logps/rejected": -528.2791748046875, "loss": 0.0409, "rewards/accuracies": 1.0, "rewards/chosen": 4.361785888671875, "rewards/margins": 6.9496283531188965, "rewards/rejected": -2.5878424644470215, "step": 3582 }, { "epoch": 2.617716894977169, "grad_norm": 9.372675900074217, "learning_rate": 1.6062645299512744e-07, "logits/chosen": -2.6986396312713623, "logits/rejected": -2.1521787643432617, "logps/chosen": -682.673095703125, "logps/rejected": -628.704345703125, "loss": 0.0353, "rewards/accuracies": 1.0, "rewards/chosen": 3.628162384033203, "rewards/margins": 4.2759623527526855, "rewards/rejected": -0.6477996110916138, "step": 3583 }, { "epoch": 2.618447488584475, "grad_norm": 10.23985272063199, "learning_rate": 1.6047750787091642e-07, "logits/chosen": -3.0575900077819824, "logits/rejected": -2.9526805877685547, "logps/chosen": -690.15380859375, "logps/rejected": -622.60693359375, "loss": 0.0499, "rewards/accuracies": 1.0, "rewards/chosen": 3.1738314628601074, "rewards/margins": 3.567347764968872, "rewards/rejected": -0.39351654052734375, "step": 3584 }, { "epoch": 2.6191780821917807, "grad_norm": 21.478200106945284, "learning_rate": 1.6032859918816854e-07, "logits/chosen": -2.824798583984375, "logits/rejected": -2.717254161834717, "logps/chosen": -508.1180114746094, "logps/rejected": -477.0242614746094, "loss": 0.0938, "rewards/accuracies": 1.0, "rewards/chosen": 3.1733269691467285, "rewards/margins": 4.678919792175293, "rewards/rejected": -1.5055925846099854, "step": 3585 }, { "epoch": 2.6199086757990866, "grad_norm": 9.681104554770112, "learning_rate": 1.6017972700749927e-07, "logits/chosen": -2.450410842895508, "logits/rejected": -1.9001818895339966, "logps/chosen": -386.15362548828125, "logps/rejected": -360.15008544921875, "loss": 0.0492, "rewards/accuracies": 1.0, "rewards/chosen": 1.4175806045532227, "rewards/margins": 4.149082660675049, "rewards/rejected": -2.7315022945404053, "step": 3586 }, { "epoch": 2.620639269406393, "grad_norm": 9.4682838652539, "learning_rate": 1.6003089138950944e-07, "logits/chosen": -2.815805673599243, "logits/rejected": -2.2736170291900635, "logps/chosen": -670.7508544921875, "logps/rejected": -571.021240234375, "loss": 0.0378, "rewards/accuracies": 1.0, "rewards/chosen": 3.8889801502227783, "rewards/margins": 5.666793346405029, "rewards/rejected": -1.77781343460083, "step": 3587 }, { "epoch": 2.621369863013699, "grad_norm": 13.883146076992679, "learning_rate": 1.598820923947845e-07, "logits/chosen": -2.666883707046509, "logits/rejected": -2.194713592529297, "logps/chosen": -672.232666015625, "logps/rejected": -629.114990234375, "loss": 0.0602, "rewards/accuracies": 1.0, "rewards/chosen": 4.878304481506348, "rewards/margins": 6.072667121887207, "rewards/rejected": -1.194362998008728, "step": 3588 }, { "epoch": 2.6221004566210047, "grad_norm": 8.380996151805705, "learning_rate": 1.597333300838954e-07, "logits/chosen": -2.5992980003356934, "logits/rejected": -1.8621127605438232, "logps/chosen": -420.18389892578125, "logps/rejected": -394.5192565917969, "loss": 0.0366, "rewards/accuracies": 1.0, "rewards/chosen": 3.110745429992676, "rewards/margins": 6.172706604003906, "rewards/rejected": -3.0619609355926514, "step": 3589 }, { "epoch": 2.6228310502283105, "grad_norm": 15.00649121021909, "learning_rate": 1.5958460451739814e-07, "logits/chosen": -2.19614315032959, "logits/rejected": -2.3071959018707275, "logps/chosen": -508.70220947265625, "logps/rejected": -632.6495971679688, "loss": 0.0671, "rewards/accuracies": 1.0, "rewards/chosen": 3.133629322052002, "rewards/margins": 6.716616630554199, "rewards/rejected": -3.582986831665039, "step": 3590 }, { "epoch": 2.6235616438356164, "grad_norm": 11.374107241966634, "learning_rate": 1.5943591575583366e-07, "logits/chosen": -2.7169251441955566, "logits/rejected": -2.4330267906188965, "logps/chosen": -671.066162109375, "logps/rejected": -653.6583862304688, "loss": 0.0452, "rewards/accuracies": 1.0, "rewards/chosen": 3.549530029296875, "rewards/margins": 4.815778732299805, "rewards/rejected": -1.2662487030029297, "step": 3591 }, { "epoch": 2.6242922374429223, "grad_norm": 8.654232740326, "learning_rate": 1.5928726385972784e-07, "logits/chosen": -2.6987547874450684, "logits/rejected": -2.07369065284729, "logps/chosen": -520.7870483398438, "logps/rejected": -402.2153625488281, "loss": 0.0486, "rewards/accuracies": 1.0, "rewards/chosen": 2.6981070041656494, "rewards/margins": 4.929974555969238, "rewards/rejected": -2.231867551803589, "step": 3592 }, { "epoch": 2.625022831050228, "grad_norm": 24.131728741042362, "learning_rate": 1.591386488895917e-07, "logits/chosen": -2.992649555206299, "logits/rejected": -1.6787182092666626, "logps/chosen": -523.5440673828125, "logps/rejected": -332.00469970703125, "loss": 0.0846, "rewards/accuracies": 1.0, "rewards/chosen": 2.6077632904052734, "rewards/margins": 5.179053783416748, "rewards/rejected": -2.5712900161743164, "step": 3593 }, { "epoch": 2.6257534246575345, "grad_norm": 20.271953221195627, "learning_rate": 1.5899007090592115e-07, "logits/chosen": -2.9462783336639404, "logits/rejected": -2.1281726360321045, "logps/chosen": -709.4771118164062, "logps/rejected": -385.91424560546875, "loss": 0.1006, "rewards/accuracies": 0.875, "rewards/chosen": 3.834616184234619, "rewards/margins": 4.968543529510498, "rewards/rejected": -1.133927345275879, "step": 3594 }, { "epoch": 2.6264840182648403, "grad_norm": 23.55886286583565, "learning_rate": 1.5884152996919715e-07, "logits/chosen": -2.997615098953247, "logits/rejected": -2.5672616958618164, "logps/chosen": -724.3667602539062, "logps/rejected": -728.61865234375, "loss": 0.1029, "rewards/accuracies": 1.0, "rewards/chosen": 4.977312088012695, "rewards/margins": 5.296399116516113, "rewards/rejected": -0.3190871775150299, "step": 3595 }, { "epoch": 2.627214611872146, "grad_norm": 10.01577125478238, "learning_rate": 1.5869302613988545e-07, "logits/chosen": -3.0015172958374023, "logits/rejected": -2.4083147048950195, "logps/chosen": -640.684814453125, "logps/rejected": -504.3298034667969, "loss": 0.0406, "rewards/accuracies": 1.0, "rewards/chosen": 4.687170028686523, "rewards/margins": 5.605044841766357, "rewards/rejected": -0.9178750514984131, "step": 3596 }, { "epoch": 2.627945205479452, "grad_norm": 19.84665871275432, "learning_rate": 1.585445594784367e-07, "logits/chosen": -2.286559581756592, "logits/rejected": -2.2390055656433105, "logps/chosen": -369.32720947265625, "logps/rejected": -483.7165222167969, "loss": 0.0657, "rewards/accuracies": 1.0, "rewards/chosen": 2.5994439125061035, "rewards/margins": 4.564723968505859, "rewards/rejected": -1.9652798175811768, "step": 3597 }, { "epoch": 2.628675799086758, "grad_norm": 11.61283976834915, "learning_rate": 1.5839613004528652e-07, "logits/chosen": -2.649925470352173, "logits/rejected": -2.1072330474853516, "logps/chosen": -555.2827758789062, "logps/rejected": -466.4445495605469, "loss": 0.052, "rewards/accuracies": 1.0, "rewards/chosen": 3.5868711471557617, "rewards/margins": 5.740009307861328, "rewards/rejected": -2.1531383991241455, "step": 3598 }, { "epoch": 2.629406392694064, "grad_norm": 11.27025899405017, "learning_rate": 1.582477379008553e-07, "logits/chosen": -2.7282629013061523, "logits/rejected": -2.2595324516296387, "logps/chosen": -976.0082397460938, "logps/rejected": -1076.098876953125, "loss": 0.0459, "rewards/accuracies": 1.0, "rewards/chosen": 5.046567440032959, "rewards/margins": 5.664168357849121, "rewards/rejected": -0.6176010370254517, "step": 3599 }, { "epoch": 2.6301369863013697, "grad_norm": 25.71484345330776, "learning_rate": 1.5809938310554838e-07, "logits/chosen": -2.5191736221313477, "logits/rejected": -2.4389848709106445, "logps/chosen": -491.7181396484375, "logps/rejected": -462.4805908203125, "loss": 0.0897, "rewards/accuracies": 1.0, "rewards/chosen": 1.9763145446777344, "rewards/margins": 2.6758570671081543, "rewards/rejected": -0.6995425224304199, "step": 3600 }, { "epoch": 2.630867579908676, "grad_norm": 11.93269547048806, "learning_rate": 1.5795106571975559e-07, "logits/chosen": -2.9525129795074463, "logits/rejected": -2.6614041328430176, "logps/chosen": -607.1744384765625, "logps/rejected": -535.8624267578125, "loss": 0.0506, "rewards/accuracies": 1.0, "rewards/chosen": 3.848999261856079, "rewards/margins": 6.087523460388184, "rewards/rejected": -2.2385237216949463, "step": 3601 }, { "epoch": 2.6315981735159815, "grad_norm": 16.26670377966811, "learning_rate": 1.578027858038518e-07, "logits/chosen": -2.893958330154419, "logits/rejected": -2.1526386737823486, "logps/chosen": -761.0599365234375, "logps/rejected": -474.41156005859375, "loss": 0.0904, "rewards/accuracies": 0.875, "rewards/chosen": 3.223743438720703, "rewards/margins": 3.5350637435913086, "rewards/rejected": -0.311320424079895, "step": 3602 }, { "epoch": 2.632328767123288, "grad_norm": 9.060756331446706, "learning_rate": 1.5765454341819655e-07, "logits/chosen": -2.378981113433838, "logits/rejected": -1.881550669670105, "logps/chosen": -578.50439453125, "logps/rejected": -509.7087707519531, "loss": 0.0567, "rewards/accuracies": 1.0, "rewards/chosen": 1.8688836097717285, "rewards/margins": 4.432850360870361, "rewards/rejected": -2.5639665126800537, "step": 3603 }, { "epoch": 2.6330593607305937, "grad_norm": 14.069252393607808, "learning_rate": 1.5750633862313434e-07, "logits/chosen": -2.3891642093658447, "logits/rejected": -2.085196018218994, "logps/chosen": -796.3465576171875, "logps/rejected": -736.8224487304688, "loss": 0.0494, "rewards/accuracies": 1.0, "rewards/chosen": 4.283186912536621, "rewards/margins": 6.419593811035156, "rewards/rejected": -2.136406898498535, "step": 3604 }, { "epoch": 2.6337899543378995, "grad_norm": 7.182646066074678, "learning_rate": 1.5735817147899378e-07, "logits/chosen": -2.6742491722106934, "logits/rejected": -2.4808003902435303, "logps/chosen": -537.6572265625, "logps/rejected": -493.0632019042969, "loss": 0.0293, "rewards/accuracies": 1.0, "rewards/chosen": 3.253289222717285, "rewards/margins": 4.614385604858398, "rewards/rejected": -1.3610968589782715, "step": 3605 }, { "epoch": 2.6345205479452054, "grad_norm": 7.967199795599946, "learning_rate": 1.5721004204608871e-07, "logits/chosen": -1.9246197938919067, "logits/rejected": -2.352421522140503, "logps/chosen": -411.2973937988281, "logps/rejected": -485.1668701171875, "loss": 0.0585, "rewards/accuracies": 1.0, "rewards/chosen": 1.1902023553848267, "rewards/margins": 4.274267196655273, "rewards/rejected": -3.084064245223999, "step": 3606 }, { "epoch": 2.6352511415525113, "grad_norm": 5.230338382022152, "learning_rate": 1.5706195038471737e-07, "logits/chosen": -2.5807688236236572, "logits/rejected": -2.418792963027954, "logps/chosen": -714.1412353515625, "logps/rejected": -738.3109130859375, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": 4.43516731262207, "rewards/margins": 9.1425142288208, "rewards/rejected": -4.707347393035889, "step": 3607 }, { "epoch": 2.6359817351598176, "grad_norm": 16.348762750038038, "learning_rate": 1.569138965551627e-07, "logits/chosen": -2.9239392280578613, "logits/rejected": -2.4979500770568848, "logps/chosen": -840.927001953125, "logps/rejected": -798.6192016601562, "loss": 0.0516, "rewards/accuracies": 1.0, "rewards/chosen": 3.8757777214050293, "rewards/margins": 5.004043102264404, "rewards/rejected": -1.1282658576965332, "step": 3608 }, { "epoch": 2.636712328767123, "grad_norm": 11.92534238055892, "learning_rate": 1.5676588061769222e-07, "logits/chosen": -2.305520534515381, "logits/rejected": -2.302187204360962, "logps/chosen": -530.309326171875, "logps/rejected": -464.7548828125, "loss": 0.0477, "rewards/accuracies": 1.0, "rewards/chosen": 3.8047075271606445, "rewards/margins": 6.175369739532471, "rewards/rejected": -2.370661735534668, "step": 3609 }, { "epoch": 2.6374429223744293, "grad_norm": 10.934422348805207, "learning_rate": 1.5661790263255798e-07, "logits/chosen": -3.2095751762390137, "logits/rejected": -2.3622169494628906, "logps/chosen": -917.8927612304688, "logps/rejected": -683.4600219726562, "loss": 0.0345, "rewards/accuracies": 1.0, "rewards/chosen": 5.579586982727051, "rewards/margins": 5.986196041107178, "rewards/rejected": -0.4066096544265747, "step": 3610 }, { "epoch": 2.638173515981735, "grad_norm": 17.970007598654576, "learning_rate": 1.5646996265999663e-07, "logits/chosen": -2.6840662956237793, "logits/rejected": -2.3805878162384033, "logps/chosen": -742.3015747070312, "logps/rejected": -600.9907836914062, "loss": 0.0775, "rewards/accuracies": 1.0, "rewards/chosen": 2.6943159103393555, "rewards/margins": 4.157589912414551, "rewards/rejected": -1.4632738828659058, "step": 3611 }, { "epoch": 2.638904109589041, "grad_norm": 13.922613156462123, "learning_rate": 1.5632206076022935e-07, "logits/chosen": -2.645176410675049, "logits/rejected": -2.6732072830200195, "logps/chosen": -774.38671875, "logps/rejected": -772.9315795898438, "loss": 0.0387, "rewards/accuracies": 1.0, "rewards/chosen": 3.3652141094207764, "rewards/margins": 4.834836959838867, "rewards/rejected": -1.4696227312088013, "step": 3612 }, { "epoch": 2.639634703196347, "grad_norm": 9.354673194906175, "learning_rate": 1.561741969934619e-07, "logits/chosen": -2.595268726348877, "logits/rejected": -2.2375659942626953, "logps/chosen": -889.3335571289062, "logps/rejected": -712.9097900390625, "loss": 0.037, "rewards/accuracies": 1.0, "rewards/chosen": 4.351108551025391, "rewards/margins": 5.374545097351074, "rewards/rejected": -1.0234365463256836, "step": 3613 }, { "epoch": 2.640365296803653, "grad_norm": 6.689726271795635, "learning_rate": 1.5602637141988428e-07, "logits/chosen": -3.081721305847168, "logits/rejected": -2.8654236793518066, "logps/chosen": -671.1840209960938, "logps/rejected": -637.3817749023438, "loss": 0.0258, "rewards/accuracies": 1.0, "rewards/chosen": 3.6383581161499023, "rewards/margins": 4.642332077026367, "rewards/rejected": -1.0039739608764648, "step": 3614 }, { "epoch": 2.641095890410959, "grad_norm": 13.320220857456015, "learning_rate": 1.5587858409967118e-07, "logits/chosen": -3.2848193645477295, "logits/rejected": -2.65562105178833, "logps/chosen": -621.7989501953125, "logps/rejected": -497.55157470703125, "loss": 0.0481, "rewards/accuracies": 1.0, "rewards/chosen": 1.5383350849151611, "rewards/margins": 4.587242126464844, "rewards/rejected": -3.0489068031311035, "step": 3615 }, { "epoch": 2.6418264840182646, "grad_norm": 13.601307278801652, "learning_rate": 1.5573083509298158e-07, "logits/chosen": -2.8748230934143066, "logits/rejected": -1.5505790710449219, "logps/chosen": -516.5331420898438, "logps/rejected": -366.5856628417969, "loss": 0.0446, "rewards/accuracies": 1.0, "rewards/chosen": 3.2864370346069336, "rewards/margins": 6.260470390319824, "rewards/rejected": -2.9740331172943115, "step": 3616 }, { "epoch": 2.642557077625571, "grad_norm": 6.034174234931385, "learning_rate": 1.5558312445995903e-07, "logits/chosen": -2.9330384731292725, "logits/rejected": -2.3989944458007812, "logps/chosen": -682.877685546875, "logps/rejected": -639.8824462890625, "loss": 0.0204, "rewards/accuracies": 1.0, "rewards/chosen": 2.93086576461792, "rewards/margins": 6.070619106292725, "rewards/rejected": -3.1397533416748047, "step": 3617 }, { "epoch": 2.643287671232877, "grad_norm": 12.044786165777161, "learning_rate": 1.5543545226073113e-07, "logits/chosen": -2.517669200897217, "logits/rejected": -2.3268682956695557, "logps/chosen": -652.2993774414062, "logps/rejected": -483.7744140625, "loss": 0.0544, "rewards/accuracies": 1.0, "rewards/chosen": 2.5060131549835205, "rewards/margins": 5.113016605377197, "rewards/rejected": -2.6070032119750977, "step": 3618 }, { "epoch": 2.6440182648401827, "grad_norm": 10.095898662318511, "learning_rate": 1.5528781855541018e-07, "logits/chosen": -3.2622413635253906, "logits/rejected": -2.7760791778564453, "logps/chosen": -594.32568359375, "logps/rejected": -671.9467163085938, "loss": 0.0483, "rewards/accuracies": 1.0, "rewards/chosen": 3.821744441986084, "rewards/margins": 4.367681980133057, "rewards/rejected": -0.5459372997283936, "step": 3619 }, { "epoch": 2.6447488584474885, "grad_norm": 12.02752834078063, "learning_rate": 1.5514022340409267e-07, "logits/chosen": -2.5643112659454346, "logits/rejected": -2.002786159515381, "logps/chosen": -727.41943359375, "logps/rejected": -788.5274658203125, "loss": 0.0616, "rewards/accuracies": 1.0, "rewards/chosen": 4.757780075073242, "rewards/margins": 8.027828216552734, "rewards/rejected": -3.2700483798980713, "step": 3620 }, { "epoch": 2.6454794520547944, "grad_norm": 7.238761610055742, "learning_rate": 1.5499266686685934e-07, "logits/chosen": -3.194063186645508, "logits/rejected": -2.2093558311462402, "logps/chosen": -756.630859375, "logps/rejected": -467.59552001953125, "loss": 0.0266, "rewards/accuracies": 1.0, "rewards/chosen": 4.662125110626221, "rewards/margins": 6.925286293029785, "rewards/rejected": -2.2631614208221436, "step": 3621 }, { "epoch": 2.6462100456621007, "grad_norm": 7.460388866444847, "learning_rate": 1.5484514900377548e-07, "logits/chosen": -2.0903239250183105, "logits/rejected": -2.63277530670166, "logps/chosen": -597.3350219726562, "logps/rejected": -963.0166015625, "loss": 0.0394, "rewards/accuracies": 1.0, "rewards/chosen": 1.800764560699463, "rewards/margins": 4.554219722747803, "rewards/rejected": -2.75345516204834, "step": 3622 }, { "epoch": 2.646940639269406, "grad_norm": 14.704046011768698, "learning_rate": 1.5469766987489016e-07, "logits/chosen": -3.119823455810547, "logits/rejected": -2.826669216156006, "logps/chosen": -948.5728759765625, "logps/rejected": -1071.98974609375, "loss": 0.0484, "rewards/accuracies": 1.0, "rewards/chosen": 4.535544395446777, "rewards/margins": 6.1148834228515625, "rewards/rejected": -1.5793390274047852, "step": 3623 }, { "epoch": 2.6476712328767125, "grad_norm": 10.472515647368354, "learning_rate": 1.545502295402371e-07, "logits/chosen": -3.063141345977783, "logits/rejected": -2.6944327354431152, "logps/chosen": -589.7005615234375, "logps/rejected": -593.4876708984375, "loss": 0.0466, "rewards/accuracies": 0.875, "rewards/chosen": 2.7845211029052734, "rewards/margins": 6.331743240356445, "rewards/rejected": -3.54722261428833, "step": 3624 }, { "epoch": 2.6484018264840183, "grad_norm": 8.722833367687889, "learning_rate": 1.5440282805983406e-07, "logits/chosen": -2.3354759216308594, "logits/rejected": -2.147812843322754, "logps/chosen": -852.0138549804688, "logps/rejected": -850.5068969726562, "loss": 0.0265, "rewards/accuracies": 1.0, "rewards/chosen": 4.490207672119141, "rewards/margins": 6.597875118255615, "rewards/rejected": -2.1076669692993164, "step": 3625 }, { "epoch": 2.649132420091324, "grad_norm": 9.12909178245355, "learning_rate": 1.5425546549368306e-07, "logits/chosen": -3.047959089279175, "logits/rejected": -2.7055416107177734, "logps/chosen": -773.2518310546875, "logps/rejected": -650.50732421875, "loss": 0.0394, "rewards/accuracies": 1.0, "rewards/chosen": 3.776890277862549, "rewards/margins": 4.395233154296875, "rewards/rejected": -0.6183429956436157, "step": 3626 }, { "epoch": 2.64986301369863, "grad_norm": 6.712801975156575, "learning_rate": 1.5410814190177012e-07, "logits/chosen": -2.988586902618408, "logits/rejected": -2.22955584526062, "logps/chosen": -486.27349853515625, "logps/rejected": -425.46014404296875, "loss": 0.0263, "rewards/accuracies": 1.0, "rewards/chosen": 3.7521674633026123, "rewards/margins": 7.8282575607299805, "rewards/rejected": -4.076090335845947, "step": 3627 }, { "epoch": 2.650593607305936, "grad_norm": 8.060932332299904, "learning_rate": 1.5396085734406555e-07, "logits/chosen": -2.7217421531677246, "logits/rejected": -1.828694462776184, "logps/chosen": -674.2806396484375, "logps/rejected": -342.0587463378906, "loss": 0.0465, "rewards/accuracies": 1.0, "rewards/chosen": 3.022641658782959, "rewards/margins": 6.6773295402526855, "rewards/rejected": -3.6546878814697266, "step": 3628 }, { "epoch": 2.6513242009132423, "grad_norm": 13.654889001831679, "learning_rate": 1.5381361188052378e-07, "logits/chosen": -2.797435998916626, "logits/rejected": -2.2648885250091553, "logps/chosen": -617.1829833984375, "logps/rejected": -623.6954345703125, "loss": 0.0705, "rewards/accuracies": 1.0, "rewards/chosen": 3.309387683868408, "rewards/margins": 4.158134460449219, "rewards/rejected": -0.8487468957901001, "step": 3629 }, { "epoch": 2.6520547945205477, "grad_norm": 7.183297960706152, "learning_rate": 1.5366640557108323e-07, "logits/chosen": -3.000269889831543, "logits/rejected": -2.2273921966552734, "logps/chosen": -630.8751220703125, "logps/rejected": -561.9512329101562, "loss": 0.0308, "rewards/accuracies": 1.0, "rewards/chosen": 5.102614879608154, "rewards/margins": 6.190865516662598, "rewards/rejected": -1.0882506370544434, "step": 3630 }, { "epoch": 2.652785388127854, "grad_norm": 11.544907832052447, "learning_rate": 1.5351923847566634e-07, "logits/chosen": -2.3616790771484375, "logits/rejected": -2.2830276489257812, "logps/chosen": -874.3113403320312, "logps/rejected": -791.9216918945312, "loss": 0.0534, "rewards/accuracies": 1.0, "rewards/chosen": 3.2093796730041504, "rewards/margins": 5.109959602355957, "rewards/rejected": -1.9005796909332275, "step": 3631 }, { "epoch": 2.65351598173516, "grad_norm": 11.323377237834677, "learning_rate": 1.533721106541797e-07, "logits/chosen": -2.9426488876342773, "logits/rejected": -2.806515693664551, "logps/chosen": -780.0753173828125, "logps/rejected": -691.7626342773438, "loss": 0.043, "rewards/accuracies": 1.0, "rewards/chosen": 3.162248134613037, "rewards/margins": 4.7624101638793945, "rewards/rejected": -1.600162148475647, "step": 3632 }, { "epoch": 2.6542465753424658, "grad_norm": 11.645449791849698, "learning_rate": 1.5322502216651394e-07, "logits/chosen": -2.894554376602173, "logits/rejected": -2.2881174087524414, "logps/chosen": -601.0580444335938, "logps/rejected": -672.1430053710938, "loss": 0.0569, "rewards/accuracies": 1.0, "rewards/chosen": 2.9319448471069336, "rewards/margins": 7.0312981605529785, "rewards/rejected": -4.099353790283203, "step": 3633 }, { "epoch": 2.6549771689497716, "grad_norm": 14.69363963265388, "learning_rate": 1.530779730725436e-07, "logits/chosen": -3.0081570148468018, "logits/rejected": -2.824871063232422, "logps/chosen": -693.5340576171875, "logps/rejected": -591.4699096679688, "loss": 0.0655, "rewards/accuracies": 0.875, "rewards/chosen": 2.2323741912841797, "rewards/margins": 2.5183067321777344, "rewards/rejected": -0.2859325408935547, "step": 3634 }, { "epoch": 2.6557077625570775, "grad_norm": 12.47249292779853, "learning_rate": 1.5293096343212734e-07, "logits/chosen": -2.788494110107422, "logits/rejected": -2.015791177749634, "logps/chosen": -414.5506896972656, "logps/rejected": -363.71856689453125, "loss": 0.0524, "rewards/accuracies": 1.0, "rewards/chosen": 2.6598119735717773, "rewards/margins": 5.626628875732422, "rewards/rejected": -2.9668166637420654, "step": 3635 }, { "epoch": 2.6564383561643834, "grad_norm": 8.62668405207789, "learning_rate": 1.5278399330510733e-07, "logits/chosen": -2.4764065742492676, "logits/rejected": -2.692819833755493, "logps/chosen": -333.59478759765625, "logps/rejected": -559.1670532226562, "loss": 0.0509, "rewards/accuracies": 1.0, "rewards/chosen": 2.406228542327881, "rewards/margins": 7.431724548339844, "rewards/rejected": -5.025496006011963, "step": 3636 }, { "epoch": 2.6571689497716893, "grad_norm": 13.632242780001045, "learning_rate": 1.5263706275131008e-07, "logits/chosen": -2.9101734161376953, "logits/rejected": -1.713392734527588, "logps/chosen": -539.5958251953125, "logps/rejected": -290.3532409667969, "loss": 0.0632, "rewards/accuracies": 1.0, "rewards/chosen": 2.3361690044403076, "rewards/margins": 4.6027984619140625, "rewards/rejected": -2.266629695892334, "step": 3637 }, { "epoch": 2.6578995433789956, "grad_norm": 10.137763373437965, "learning_rate": 1.5249017183054587e-07, "logits/chosen": -2.3821864128112793, "logits/rejected": -2.24552059173584, "logps/chosen": -554.57958984375, "logps/rejected": -496.96307373046875, "loss": 0.0422, "rewards/accuracies": 1.0, "rewards/chosen": 2.3077023029327393, "rewards/margins": 4.0780839920043945, "rewards/rejected": -1.7703816890716553, "step": 3638 }, { "epoch": 2.6586301369863015, "grad_norm": 6.563020812666984, "learning_rate": 1.5234332060260894e-07, "logits/chosen": -2.85426664352417, "logits/rejected": -2.1869661808013916, "logps/chosen": -574.2625732421875, "logps/rejected": -495.4697265625, "loss": 0.0404, "rewards/accuracies": 1.0, "rewards/chosen": 3.4044628143310547, "rewards/margins": 5.384931564331055, "rewards/rejected": -1.98046875, "step": 3639 }, { "epoch": 2.6593607305936073, "grad_norm": 11.481556573802258, "learning_rate": 1.521965091272771e-07, "logits/chosen": -2.6009788513183594, "logits/rejected": -1.7425109148025513, "logps/chosen": -592.3717041015625, "logps/rejected": -408.6853332519531, "loss": 0.0335, "rewards/accuracies": 1.0, "rewards/chosen": 4.612208366394043, "rewards/margins": 6.775737762451172, "rewards/rejected": -2.163529396057129, "step": 3640 }, { "epoch": 2.660091324200913, "grad_norm": 6.106842530784692, "learning_rate": 1.520497374643122e-07, "logits/chosen": -2.6112682819366455, "logits/rejected": -2.6073498725891113, "logps/chosen": -521.4159545898438, "logps/rejected": -554.9993286132812, "loss": 0.0273, "rewards/accuracies": 1.0, "rewards/chosen": 2.9780900478363037, "rewards/margins": 4.8268232345581055, "rewards/rejected": -1.8487331867218018, "step": 3641 }, { "epoch": 2.660821917808219, "grad_norm": 20.771595942489515, "learning_rate": 1.5190300567345983e-07, "logits/chosen": -2.1731486320495605, "logits/rejected": -1.8197031021118164, "logps/chosen": -449.81146240234375, "logps/rejected": -562.2640991210938, "loss": 0.0945, "rewards/accuracies": 0.875, "rewards/chosen": 2.389418125152588, "rewards/margins": 5.476569652557373, "rewards/rejected": -3.0871520042419434, "step": 3642 }, { "epoch": 2.661552511415525, "grad_norm": 14.626140843424135, "learning_rate": 1.517563138144494e-07, "logits/chosen": -3.262892723083496, "logits/rejected": -2.4111673831939697, "logps/chosen": -790.8280639648438, "logps/rejected": -543.3414306640625, "loss": 0.0837, "rewards/accuracies": 1.0, "rewards/chosen": 4.699717998504639, "rewards/margins": 5.9058918952941895, "rewards/rejected": -1.2061740159988403, "step": 3643 }, { "epoch": 2.662283105022831, "grad_norm": 10.513766392327794, "learning_rate": 1.5160966194699399e-07, "logits/chosen": -3.394388437271118, "logits/rejected": -2.2903523445129395, "logps/chosen": -813.4415893554688, "logps/rejected": -540.3357543945312, "loss": 0.0536, "rewards/accuracies": 1.0, "rewards/chosen": 4.354897499084473, "rewards/margins": 6.076332092285156, "rewards/rejected": -1.7214345932006836, "step": 3644 }, { "epoch": 2.663013698630137, "grad_norm": 10.621378561945354, "learning_rate": 1.514630501307904e-07, "logits/chosen": -2.4212124347686768, "logits/rejected": -1.3233524560928345, "logps/chosen": -509.7850036621094, "logps/rejected": -335.5986633300781, "loss": 0.0474, "rewards/accuracies": 1.0, "rewards/chosen": 3.1595993041992188, "rewards/margins": 6.383650779724121, "rewards/rejected": -3.2240514755249023, "step": 3645 }, { "epoch": 2.663744292237443, "grad_norm": 11.58436093243346, "learning_rate": 1.5131647842551914e-07, "logits/chosen": -2.6878902912139893, "logits/rejected": -2.471040725708008, "logps/chosen": -872.269287109375, "logps/rejected": -813.9521484375, "loss": 0.0538, "rewards/accuracies": 1.0, "rewards/chosen": 5.486100673675537, "rewards/margins": 5.518594741821289, "rewards/rejected": -0.032494574785232544, "step": 3646 }, { "epoch": 2.664474885844749, "grad_norm": 7.952391345167154, "learning_rate": 1.5116994689084444e-07, "logits/chosen": -2.4503557682037354, "logits/rejected": -1.9068269729614258, "logps/chosen": -278.8934020996094, "logps/rejected": -286.49395751953125, "loss": 0.0404, "rewards/accuracies": 1.0, "rewards/chosen": 2.469764232635498, "rewards/margins": 5.796154022216797, "rewards/rejected": -3.326390266418457, "step": 3647 }, { "epoch": 2.6652054794520548, "grad_norm": 9.495879233231085, "learning_rate": 1.5102345558641427e-07, "logits/chosen": -2.9885807037353516, "logits/rejected": -2.839231014251709, "logps/chosen": -483.4900207519531, "logps/rejected": -544.8153686523438, "loss": 0.0457, "rewards/accuracies": 1.0, "rewards/chosen": 2.8257007598876953, "rewards/margins": 4.7495551109313965, "rewards/rejected": -1.923854112625122, "step": 3648 }, { "epoch": 2.6659360730593606, "grad_norm": 6.886265073861191, "learning_rate": 1.5087700457185976e-07, "logits/chosen": -2.7713513374328613, "logits/rejected": -1.374272108078003, "logps/chosen": -562.842529296875, "logps/rejected": -308.0093688964844, "loss": 0.0229, "rewards/accuracies": 1.0, "rewards/chosen": 2.1758534908294678, "rewards/margins": 5.19732666015625, "rewards/rejected": -3.0214731693267822, "step": 3649 }, { "epoch": 2.6666666666666665, "grad_norm": 16.214431099452927, "learning_rate": 1.5073059390679626e-07, "logits/chosen": -2.7290940284729004, "logits/rejected": -2.029245615005493, "logps/chosen": -805.36767578125, "logps/rejected": -718.3953857421875, "loss": 0.0684, "rewards/accuracies": 1.0, "rewards/chosen": 4.053640365600586, "rewards/margins": 5.61497163772583, "rewards/rejected": -1.5613315105438232, "step": 3650 }, { "epoch": 2.6673972602739724, "grad_norm": 13.378581200430913, "learning_rate": 1.5058422365082233e-07, "logits/chosen": -2.604119300842285, "logits/rejected": -2.2179083824157715, "logps/chosen": -627.7136840820312, "logps/rejected": -619.5101928710938, "loss": 0.0942, "rewards/accuracies": 1.0, "rewards/chosen": 3.2527589797973633, "rewards/margins": 6.012355804443359, "rewards/rejected": -2.759596824645996, "step": 3651 }, { "epoch": 2.6681278538812787, "grad_norm": 11.094152308946438, "learning_rate": 1.5043789386352023e-07, "logits/chosen": -2.791268825531006, "logits/rejected": -2.235504150390625, "logps/chosen": -281.1580810546875, "logps/rejected": -357.31243896484375, "loss": 0.0383, "rewards/accuracies": 1.0, "rewards/chosen": 2.888108015060425, "rewards/margins": 6.90985107421875, "rewards/rejected": -4.021742820739746, "step": 3652 }, { "epoch": 2.6688584474885846, "grad_norm": 15.352112148239062, "learning_rate": 1.5029160460445555e-07, "logits/chosen": -2.411245584487915, "logits/rejected": -1.59239661693573, "logps/chosen": -444.5869445800781, "logps/rejected": -597.3143920898438, "loss": 0.0672, "rewards/accuracies": 1.0, "rewards/chosen": 2.9396352767944336, "rewards/margins": 7.651537895202637, "rewards/rejected": -4.711902618408203, "step": 3653 }, { "epoch": 2.6695890410958905, "grad_norm": 15.809028093551333, "learning_rate": 1.5014535593317756e-07, "logits/chosen": -2.7899065017700195, "logits/rejected": -2.8214306831359863, "logps/chosen": -540.6470947265625, "logps/rejected": -644.2274169921875, "loss": 0.0478, "rewards/accuracies": 1.0, "rewards/chosen": 3.0946273803710938, "rewards/margins": 4.942281246185303, "rewards/rejected": -1.847654104232788, "step": 3654 }, { "epoch": 2.6703196347031963, "grad_norm": 17.292782127833078, "learning_rate": 1.4999914790921895e-07, "logits/chosen": -2.8057336807250977, "logits/rejected": -2.787546157836914, "logps/chosen": -495.71435546875, "logps/rejected": -600.4609375, "loss": 0.0927, "rewards/accuracies": 1.0, "rewards/chosen": 1.6077189445495605, "rewards/margins": 4.669258117675781, "rewards/rejected": -3.0615389347076416, "step": 3655 }, { "epoch": 2.671050228310502, "grad_norm": 6.941622222477239, "learning_rate": 1.4985298059209595e-07, "logits/chosen": -3.1738061904907227, "logits/rejected": -2.281510829925537, "logps/chosen": -935.3021850585938, "logps/rejected": -592.14404296875, "loss": 0.0243, "rewards/accuracies": 1.0, "rewards/chosen": 5.2567925453186035, "rewards/margins": 5.9020161628723145, "rewards/rejected": -0.6452236175537109, "step": 3656 }, { "epoch": 2.671780821917808, "grad_norm": 26.569391339515924, "learning_rate": 1.49706854041308e-07, "logits/chosen": -2.8948278427124023, "logits/rejected": -2.495177984237671, "logps/chosen": -551.6615600585938, "logps/rejected": -484.7100524902344, "loss": 0.0995, "rewards/accuracies": 1.0, "rewards/chosen": 2.2220044136047363, "rewards/margins": 4.117112159729004, "rewards/rejected": -1.8951081037521362, "step": 3657 }, { "epoch": 2.672511415525114, "grad_norm": 14.750569995276347, "learning_rate": 1.4956076831633825e-07, "logits/chosen": -2.921083927154541, "logits/rejected": -2.0837178230285645, "logps/chosen": -541.44873046875, "logps/rejected": -515.73486328125, "loss": 0.0385, "rewards/accuracies": 1.0, "rewards/chosen": 3.437584638595581, "rewards/margins": 6.123020172119141, "rewards/rejected": -2.6854350566864014, "step": 3658 }, { "epoch": 2.6732420091324203, "grad_norm": 8.359317501262533, "learning_rate": 1.494147234766529e-07, "logits/chosen": -3.050445079803467, "logits/rejected": -2.157274007797241, "logps/chosen": -929.652099609375, "logps/rejected": -757.8048095703125, "loss": 0.0307, "rewards/accuracies": 1.0, "rewards/chosen": 4.564513206481934, "rewards/margins": 6.221683025360107, "rewards/rejected": -1.657170057296753, "step": 3659 }, { "epoch": 2.673972602739726, "grad_norm": 11.964184400702935, "learning_rate": 1.4926871958170183e-07, "logits/chosen": -2.2926111221313477, "logits/rejected": -2.164437770843506, "logps/chosen": -598.7900390625, "logps/rejected": -685.2579345703125, "loss": 0.0676, "rewards/accuracies": 1.0, "rewards/chosen": 2.594470500946045, "rewards/margins": 4.384186267852783, "rewards/rejected": -1.7897157669067383, "step": 3660 }, { "epoch": 2.674703196347032, "grad_norm": 6.310896565964981, "learning_rate": 1.4912275669091807e-07, "logits/chosen": -3.140045642852783, "logits/rejected": -2.3355729579925537, "logps/chosen": -686.171630859375, "logps/rejected": -507.04071044921875, "loss": 0.0206, "rewards/accuracies": 1.0, "rewards/chosen": 3.2371413707733154, "rewards/margins": 4.788031101226807, "rewards/rejected": -1.5508897304534912, "step": 3661 }, { "epoch": 2.675433789954338, "grad_norm": 14.298655114588138, "learning_rate": 1.4897683486371786e-07, "logits/chosen": -3.255352020263672, "logits/rejected": -2.171720266342163, "logps/chosen": -679.002685546875, "logps/rejected": -441.3074951171875, "loss": 0.0712, "rewards/accuracies": 1.0, "rewards/chosen": 3.924405097961426, "rewards/margins": 4.386506080627441, "rewards/rejected": -0.4621010422706604, "step": 3662 }, { "epoch": 2.6761643835616438, "grad_norm": 19.021703270540733, "learning_rate": 1.48830954159501e-07, "logits/chosen": -2.5069639682769775, "logits/rejected": -2.3217878341674805, "logps/chosen": -706.9847412109375, "logps/rejected": -667.44970703125, "loss": 0.0609, "rewards/accuracies": 1.0, "rewards/chosen": 2.6943840980529785, "rewards/margins": 4.503777980804443, "rewards/rejected": -1.8093936443328857, "step": 3663 }, { "epoch": 2.6768949771689496, "grad_norm": 14.144118611938806, "learning_rate": 1.4868511463765032e-07, "logits/chosen": -2.342625141143799, "logits/rejected": -1.7390402555465698, "logps/chosen": -561.5590209960938, "logps/rejected": -453.58746337890625, "loss": 0.0383, "rewards/accuracies": 1.0, "rewards/chosen": 2.6269540786743164, "rewards/margins": 3.7244300842285156, "rewards/rejected": -1.0974760055541992, "step": 3664 }, { "epoch": 2.6776255707762555, "grad_norm": 13.639673573455843, "learning_rate": 1.4853931635753212e-07, "logits/chosen": -2.7608437538146973, "logits/rejected": -2.3569159507751465, "logps/chosen": -776.2003784179688, "logps/rejected": -634.1568603515625, "loss": 0.0509, "rewards/accuracies": 1.0, "rewards/chosen": 2.433460235595703, "rewards/margins": 4.249537467956543, "rewards/rejected": -1.816077470779419, "step": 3665 }, { "epoch": 2.678356164383562, "grad_norm": 9.648095103995512, "learning_rate": 1.4839355937849547e-07, "logits/chosen": -3.0571281909942627, "logits/rejected": -2.6652274131774902, "logps/chosen": -590.2882080078125, "logps/rejected": -554.8101806640625, "loss": 0.0579, "rewards/accuracies": 0.875, "rewards/chosen": 4.322005271911621, "rewards/margins": 4.727387428283691, "rewards/rejected": -0.4053819179534912, "step": 3666 }, { "epoch": 2.6790867579908677, "grad_norm": 6.520082408173612, "learning_rate": 1.4824784375987313e-07, "logits/chosen": -2.785775899887085, "logits/rejected": -1.8217549324035645, "logps/chosen": -790.597412109375, "logps/rejected": -780.7794799804688, "loss": 0.0275, "rewards/accuracies": 1.0, "rewards/chosen": 3.537914276123047, "rewards/margins": 5.510398864746094, "rewards/rejected": -1.9724849462509155, "step": 3667 }, { "epoch": 2.6798173515981736, "grad_norm": 8.246386112178644, "learning_rate": 1.4810216956098075e-07, "logits/chosen": -2.8067212104797363, "logits/rejected": -2.2872464656829834, "logps/chosen": -742.087158203125, "logps/rejected": -678.4555053710938, "loss": 0.043, "rewards/accuracies": 1.0, "rewards/chosen": 3.807734489440918, "rewards/margins": 5.537717819213867, "rewards/rejected": -1.7299834489822388, "step": 3668 }, { "epoch": 2.6805479452054795, "grad_norm": 6.8779429292476895, "learning_rate": 1.4795653684111734e-07, "logits/chosen": -2.520717144012451, "logits/rejected": -1.9651705026626587, "logps/chosen": -490.6662902832031, "logps/rejected": -391.1755065917969, "loss": 0.0295, "rewards/accuracies": 1.0, "rewards/chosen": 2.9540317058563232, "rewards/margins": 5.480316162109375, "rewards/rejected": -2.526284694671631, "step": 3669 }, { "epoch": 2.6812785388127853, "grad_norm": 15.68104061566193, "learning_rate": 1.4781094565956458e-07, "logits/chosen": -2.646775484085083, "logits/rejected": -2.2841286659240723, "logps/chosen": -866.383056640625, "logps/rejected": -718.823974609375, "loss": 0.062, "rewards/accuracies": 1.0, "rewards/chosen": 3.069263458251953, "rewards/margins": 4.553892135620117, "rewards/rejected": -1.484628677368164, "step": 3670 }, { "epoch": 2.682009132420091, "grad_norm": 11.229426921784532, "learning_rate": 1.476653960755877e-07, "logits/chosen": -2.755425453186035, "logits/rejected": -2.1663312911987305, "logps/chosen": -579.688720703125, "logps/rejected": -430.0440673828125, "loss": 0.0631, "rewards/accuracies": 1.0, "rewards/chosen": 3.490403890609741, "rewards/margins": 6.061972618103027, "rewards/rejected": -2.571568250656128, "step": 3671 }, { "epoch": 2.682739726027397, "grad_norm": 9.32684204889203, "learning_rate": 1.475198881484348e-07, "logits/chosen": -2.835120677947998, "logits/rejected": -2.6285035610198975, "logps/chosen": -268.7119140625, "logps/rejected": -469.7479553222656, "loss": 0.0307, "rewards/accuracies": 1.0, "rewards/chosen": 2.342703104019165, "rewards/margins": 7.504820823669434, "rewards/rejected": -5.162117958068848, "step": 3672 }, { "epoch": 2.6834703196347034, "grad_norm": 17.205957773694443, "learning_rate": 1.47374421937337e-07, "logits/chosen": -2.6122214794158936, "logits/rejected": -2.2414770126342773, "logps/chosen": -533.10693359375, "logps/rejected": -455.25933837890625, "loss": 0.1138, "rewards/accuracies": 1.0, "rewards/chosen": 2.3113038539886475, "rewards/margins": 3.9589414596557617, "rewards/rejected": -1.6476376056671143, "step": 3673 }, { "epoch": 2.6842009132420093, "grad_norm": 5.605545479385566, "learning_rate": 1.4722899750150864e-07, "logits/chosen": -2.650482416152954, "logits/rejected": -2.5769224166870117, "logps/chosen": -485.5643615722656, "logps/rejected": -539.6083374023438, "loss": 0.0222, "rewards/accuracies": 1.0, "rewards/chosen": 2.4805126190185547, "rewards/margins": 5.554682731628418, "rewards/rejected": -3.074169635772705, "step": 3674 }, { "epoch": 2.684931506849315, "grad_norm": 9.241265374274967, "learning_rate": 1.4708361490014673e-07, "logits/chosen": -3.2051849365234375, "logits/rejected": -2.746743679046631, "logps/chosen": -605.08837890625, "logps/rejected": -527.8182373046875, "loss": 0.0407, "rewards/accuracies": 1.0, "rewards/chosen": 2.2879557609558105, "rewards/margins": 4.227368354797363, "rewards/rejected": -1.9394124746322632, "step": 3675 }, { "epoch": 2.685662100456621, "grad_norm": 17.16447943422832, "learning_rate": 1.469382741924315e-07, "logits/chosen": -2.3683745861053467, "logits/rejected": -2.729668378829956, "logps/chosen": -509.91357421875, "logps/rejected": -764.931640625, "loss": 0.0896, "rewards/accuracies": 1.0, "rewards/chosen": 2.9230270385742188, "rewards/margins": 5.696713447570801, "rewards/rejected": -2.773686408996582, "step": 3676 }, { "epoch": 2.686392694063927, "grad_norm": 15.930594111185922, "learning_rate": 1.46792975437526e-07, "logits/chosen": -2.49859356880188, "logits/rejected": -2.1828227043151855, "logps/chosen": -510.6666259765625, "logps/rejected": -543.9658203125, "loss": 0.0446, "rewards/accuracies": 1.0, "rewards/chosen": 3.606703996658325, "rewards/margins": 5.510541915893555, "rewards/rejected": -1.9038381576538086, "step": 3677 }, { "epoch": 2.6871232876712328, "grad_norm": 17.091383412787, "learning_rate": 1.4664771869457632e-07, "logits/chosen": -2.915247917175293, "logits/rejected": -2.2646708488464355, "logps/chosen": -759.9313354492188, "logps/rejected": -753.2149658203125, "loss": 0.0588, "rewards/accuracies": 1.0, "rewards/chosen": 3.923778533935547, "rewards/margins": 5.427966594696045, "rewards/rejected": -1.504188060760498, "step": 3678 }, { "epoch": 2.6878538812785386, "grad_norm": 10.59484893590321, "learning_rate": 1.465025040227113e-07, "logits/chosen": -2.898573637008667, "logits/rejected": -2.4051222801208496, "logps/chosen": -557.6751708984375, "logps/rejected": -496.8310546875, "loss": 0.0424, "rewards/accuracies": 1.0, "rewards/chosen": 2.54469895362854, "rewards/margins": 3.158971071243286, "rewards/rejected": -0.6142718195915222, "step": 3679 }, { "epoch": 2.688584474885845, "grad_norm": 8.951684752721656, "learning_rate": 1.4635733148104282e-07, "logits/chosen": -2.791881561279297, "logits/rejected": -2.0650529861450195, "logps/chosen": -854.0107421875, "logps/rejected": -600.0555419921875, "loss": 0.0382, "rewards/accuracies": 1.0, "rewards/chosen": 3.3331518173217773, "rewards/margins": 4.56031608581543, "rewards/rejected": -1.2271642684936523, "step": 3680 }, { "epoch": 2.689315068493151, "grad_norm": 6.749705555463084, "learning_rate": 1.4621220112866544e-07, "logits/chosen": -2.8608195781707764, "logits/rejected": -1.9442387819290161, "logps/chosen": -459.01556396484375, "logps/rejected": -338.0052185058594, "loss": 0.0242, "rewards/accuracies": 1.0, "rewards/chosen": 1.7998894453048706, "rewards/margins": 6.227443695068359, "rewards/rejected": -4.427554130554199, "step": 3681 }, { "epoch": 2.6900456621004567, "grad_norm": 17.27258237933116, "learning_rate": 1.4606711302465673e-07, "logits/chosen": -2.6819303035736084, "logits/rejected": -2.627093553543091, "logps/chosen": -717.493896484375, "logps/rejected": -608.8271484375, "loss": 0.0829, "rewards/accuracies": 1.0, "rewards/chosen": 2.2993106842041016, "rewards/margins": 4.371639251708984, "rewards/rejected": -2.072328805923462, "step": 3682 }, { "epoch": 2.6907762557077626, "grad_norm": 12.938712390354478, "learning_rate": 1.4592206722807697e-07, "logits/chosen": -2.873594284057617, "logits/rejected": -2.1452529430389404, "logps/chosen": -951.6355590820312, "logps/rejected": -623.1153564453125, "loss": 0.057, "rewards/accuracies": 1.0, "rewards/chosen": 3.83373761177063, "rewards/margins": 6.525397777557373, "rewards/rejected": -2.6916606426239014, "step": 3683 }, { "epoch": 2.6915068493150685, "grad_norm": 10.312144510326426, "learning_rate": 1.45777063797969e-07, "logits/chosen": -2.658936023712158, "logits/rejected": -2.4462785720825195, "logps/chosen": -520.7247924804688, "logps/rejected": -480.4017333984375, "loss": 0.0395, "rewards/accuracies": 1.0, "rewards/chosen": 3.3196699619293213, "rewards/margins": 6.3312578201293945, "rewards/rejected": -3.011587381362915, "step": 3684 }, { "epoch": 2.6922374429223743, "grad_norm": 6.083119044601299, "learning_rate": 1.4563210279335887e-07, "logits/chosen": -2.7323787212371826, "logits/rejected": -2.3715944290161133, "logps/chosen": -703.7886962890625, "logps/rejected": -621.843994140625, "loss": 0.0239, "rewards/accuracies": 1.0, "rewards/chosen": 3.2595157623291016, "rewards/margins": 4.504227161407471, "rewards/rejected": -1.24471116065979, "step": 3685 }, { "epoch": 2.69296803652968, "grad_norm": 9.755789969410273, "learning_rate": 1.454871842732549e-07, "logits/chosen": -2.848328113555908, "logits/rejected": -1.8885504007339478, "logps/chosen": -982.7952880859375, "logps/rejected": -641.64111328125, "loss": 0.0373, "rewards/accuracies": 1.0, "rewards/chosen": 5.573540210723877, "rewards/margins": 7.455976486206055, "rewards/rejected": -1.8824365139007568, "step": 3686 }, { "epoch": 2.6936986301369865, "grad_norm": 7.814896849124189, "learning_rate": 1.4534230829664855e-07, "logits/chosen": -2.71132493019104, "logits/rejected": -1.8175190687179565, "logps/chosen": -550.1254272460938, "logps/rejected": -387.22735595703125, "loss": 0.0391, "rewards/accuracies": 1.0, "rewards/chosen": 3.318837881088257, "rewards/margins": 5.707523345947266, "rewards/rejected": -2.3886852264404297, "step": 3687 }, { "epoch": 2.6944292237442924, "grad_norm": 11.070107812813058, "learning_rate": 1.4519747492251367e-07, "logits/chosen": -2.3672778606414795, "logits/rejected": -1.8760772943496704, "logps/chosen": -545.270751953125, "logps/rejected": -463.4786376953125, "loss": 0.0775, "rewards/accuracies": 1.0, "rewards/chosen": 4.430201530456543, "rewards/margins": 6.358724594116211, "rewards/rejected": -1.9285229444503784, "step": 3688 }, { "epoch": 2.6951598173515983, "grad_norm": 12.412144257671283, "learning_rate": 1.450526842098067e-07, "logits/chosen": -2.9816393852233887, "logits/rejected": -2.2197723388671875, "logps/chosen": -971.0136108398438, "logps/rejected": -697.0209350585938, "loss": 0.0556, "rewards/accuracies": 1.0, "rewards/chosen": 4.890073776245117, "rewards/margins": 4.215387344360352, "rewards/rejected": 0.6746858358383179, "step": 3689 }, { "epoch": 2.695890410958904, "grad_norm": 9.714844535333704, "learning_rate": 1.4490793621746705e-07, "logits/chosen": -2.670851230621338, "logits/rejected": -2.2716622352600098, "logps/chosen": -617.70068359375, "logps/rejected": -519.956298828125, "loss": 0.0368, "rewards/accuracies": 1.0, "rewards/chosen": 1.8005948066711426, "rewards/margins": 5.385516166687012, "rewards/rejected": -3.58492112159729, "step": 3690 }, { "epoch": 2.69662100456621, "grad_norm": 12.792300793630774, "learning_rate": 1.447632310044165e-07, "logits/chosen": -3.095054864883423, "logits/rejected": -2.6179358959198, "logps/chosen": -823.0726928710938, "logps/rejected": -480.59576416015625, "loss": 0.0701, "rewards/accuracies": 1.0, "rewards/chosen": 0.9074691534042358, "rewards/margins": 3.793458938598633, "rewards/rejected": -2.8859899044036865, "step": 3691 }, { "epoch": 2.697351598173516, "grad_norm": 9.016489817165374, "learning_rate": 1.446185686295594e-07, "logits/chosen": -3.196031332015991, "logits/rejected": -2.2970404624938965, "logps/chosen": -1043.6724853515625, "logps/rejected": -692.7872924804688, "loss": 0.0392, "rewards/accuracies": 1.0, "rewards/chosen": 6.389996528625488, "rewards/margins": 6.002840042114258, "rewards/rejected": 0.38715630769729614, "step": 3692 }, { "epoch": 2.6980821917808218, "grad_norm": 8.204747328415698, "learning_rate": 1.4447394915178261e-07, "logits/chosen": -2.308133602142334, "logits/rejected": -2.0876049995422363, "logps/chosen": -495.3577575683594, "logps/rejected": -444.7911376953125, "loss": 0.0533, "rewards/accuracies": 1.0, "rewards/chosen": 2.3188552856445312, "rewards/margins": 4.575936794281006, "rewards/rejected": -2.2570815086364746, "step": 3693 }, { "epoch": 2.698812785388128, "grad_norm": 16.934888124549733, "learning_rate": 1.4432937262995584e-07, "logits/chosen": -2.335660696029663, "logits/rejected": -2.4806785583496094, "logps/chosen": -802.4456787109375, "logps/rejected": -619.2076416015625, "loss": 0.0839, "rewards/accuracies": 0.875, "rewards/chosen": 3.507465124130249, "rewards/margins": 4.369329452514648, "rewards/rejected": -0.8618640899658203, "step": 3694 }, { "epoch": 2.699543378995434, "grad_norm": 13.95190342322005, "learning_rate": 1.4418483912293115e-07, "logits/chosen": -2.6785247325897217, "logits/rejected": -2.315986156463623, "logps/chosen": -895.21826171875, "logps/rejected": -829.5880126953125, "loss": 0.0506, "rewards/accuracies": 1.0, "rewards/chosen": 5.285806655883789, "rewards/margins": 6.526261806488037, "rewards/rejected": -1.2404553890228271, "step": 3695 }, { "epoch": 2.70027397260274, "grad_norm": 6.2170484505726336, "learning_rate": 1.4404034868954292e-07, "logits/chosen": -2.7217612266540527, "logits/rejected": -2.1887736320495605, "logps/chosen": -524.5238647460938, "logps/rejected": -567.3818969726562, "loss": 0.0223, "rewards/accuracies": 1.0, "rewards/chosen": 1.9473758935928345, "rewards/margins": 7.810935020446777, "rewards/rejected": -5.863559246063232, "step": 3696 }, { "epoch": 2.7010045662100457, "grad_norm": 8.267049569316914, "learning_rate": 1.438959013886082e-07, "logits/chosen": -2.400362014770508, "logits/rejected": -1.5711860656738281, "logps/chosen": -564.8765258789062, "logps/rejected": -317.33221435546875, "loss": 0.0443, "rewards/accuracies": 1.0, "rewards/chosen": 3.537174701690674, "rewards/margins": 6.0201497077941895, "rewards/rejected": -2.4829747676849365, "step": 3697 }, { "epoch": 2.7017351598173516, "grad_norm": 8.24210789357291, "learning_rate": 1.4375149727892626e-07, "logits/chosen": -2.815343141555786, "logits/rejected": -3.3972842693328857, "logps/chosen": -1026.04541015625, "logps/rejected": -1235.5325927734375, "loss": 0.0262, "rewards/accuracies": 1.0, "rewards/chosen": 4.377814292907715, "rewards/margins": 4.2965407371521, "rewards/rejected": 0.08127367496490479, "step": 3698 }, { "epoch": 2.7024657534246574, "grad_norm": 12.492983870800993, "learning_rate": 1.4360713641927918e-07, "logits/chosen": -2.993029832839966, "logits/rejected": -2.3767666816711426, "logps/chosen": -660.191650390625, "logps/rejected": -571.9888305664062, "loss": 0.0516, "rewards/accuracies": 1.0, "rewards/chosen": 4.275179862976074, "rewards/margins": 6.069156646728516, "rewards/rejected": -1.7939765453338623, "step": 3699 }, { "epoch": 2.7031963470319633, "grad_norm": 15.099088305303972, "learning_rate": 1.4346281886843108e-07, "logits/chosen": -2.380364179611206, "logits/rejected": -2.0043280124664307, "logps/chosen": -460.906005859375, "logps/rejected": -496.76593017578125, "loss": 0.0556, "rewards/accuracies": 1.0, "rewards/chosen": 2.00864839553833, "rewards/margins": 5.65660285949707, "rewards/rejected": -3.6479544639587402, "step": 3700 }, { "epoch": 2.7039269406392696, "grad_norm": 11.320382591225384, "learning_rate": 1.433185446851285e-07, "logits/chosen": -2.5832014083862305, "logits/rejected": -2.433641195297241, "logps/chosen": -301.8038330078125, "logps/rejected": -367.1841735839844, "loss": 0.0457, "rewards/accuracies": 1.0, "rewards/chosen": 1.159497618675232, "rewards/margins": 4.453196048736572, "rewards/rejected": -3.293698310852051, "step": 3701 }, { "epoch": 2.704657534246575, "grad_norm": 6.935844430043372, "learning_rate": 1.4317431392810054e-07, "logits/chosen": -2.850109815597534, "logits/rejected": -1.9132121801376343, "logps/chosen": -860.4859008789062, "logps/rejected": -464.10699462890625, "loss": 0.0491, "rewards/accuracies": 1.0, "rewards/chosen": 4.720683574676514, "rewards/margins": 4.802882194519043, "rewards/rejected": -0.08219808340072632, "step": 3702 }, { "epoch": 2.7053881278538814, "grad_norm": 9.945081973344562, "learning_rate": 1.4303012665605832e-07, "logits/chosen": -2.698707342147827, "logits/rejected": -2.137120246887207, "logps/chosen": -558.960693359375, "logps/rejected": -491.6922607421875, "loss": 0.0382, "rewards/accuracies": 1.0, "rewards/chosen": 3.1778969764709473, "rewards/margins": 6.036367416381836, "rewards/rejected": -2.8584704399108887, "step": 3703 }, { "epoch": 2.7061187214611873, "grad_norm": 9.778356934744288, "learning_rate": 1.428859829276956e-07, "logits/chosen": -3.078052282333374, "logits/rejected": -1.8706549406051636, "logps/chosen": -936.343994140625, "logps/rejected": -559.64453125, "loss": 0.0448, "rewards/accuracies": 1.0, "rewards/chosen": 5.51960563659668, "rewards/margins": 6.797572612762451, "rewards/rejected": -1.277966856956482, "step": 3704 }, { "epoch": 2.706849315068493, "grad_norm": 17.963227751980746, "learning_rate": 1.4274188280168811e-07, "logits/chosen": -2.7949821949005127, "logits/rejected": -1.5226256847381592, "logps/chosen": -885.0787963867188, "logps/rejected": -470.7182312011719, "loss": 0.0896, "rewards/accuracies": 0.875, "rewards/chosen": 3.195496082305908, "rewards/margins": 4.210608959197998, "rewards/rejected": -1.0151128768920898, "step": 3705 }, { "epoch": 2.707579908675799, "grad_norm": 8.802341531130455, "learning_rate": 1.4259782633669387e-07, "logits/chosen": -2.459282398223877, "logits/rejected": -2.9408493041992188, "logps/chosen": -415.66448974609375, "logps/rejected": -607.5125732421875, "loss": 0.0384, "rewards/accuracies": 1.0, "rewards/chosen": 1.171140193939209, "rewards/margins": 5.270872116088867, "rewards/rejected": -4.099731922149658, "step": 3706 }, { "epoch": 2.708310502283105, "grad_norm": 17.284928935652204, "learning_rate": 1.4245381359135345e-07, "logits/chosen": -3.0842790603637695, "logits/rejected": -2.02115535736084, "logps/chosen": -642.8171997070312, "logps/rejected": -469.8482360839844, "loss": 0.0696, "rewards/accuracies": 1.0, "rewards/chosen": 2.0926451683044434, "rewards/margins": 4.250122547149658, "rewards/rejected": -2.1574771404266357, "step": 3707 }, { "epoch": 2.709041095890411, "grad_norm": 13.7164858120521, "learning_rate": 1.423098446242891e-07, "logits/chosen": -2.88287091255188, "logits/rejected": -2.3892266750335693, "logps/chosen": -521.6378784179688, "logps/rejected": -428.84381103515625, "loss": 0.0647, "rewards/accuracies": 1.0, "rewards/chosen": 1.730379343032837, "rewards/margins": 3.083772897720337, "rewards/rejected": -1.3533935546875, "step": 3708 }, { "epoch": 2.7097716894977166, "grad_norm": 18.076707789361233, "learning_rate": 1.421659194941059e-07, "logits/chosen": -2.9640138149261475, "logits/rejected": -2.645521402359009, "logps/chosen": -776.26416015625, "logps/rejected": -594.8701782226562, "loss": 0.0429, "rewards/accuracies": 1.0, "rewards/chosen": 4.358410835266113, "rewards/margins": 5.073708534240723, "rewards/rejected": -0.7152976989746094, "step": 3709 }, { "epoch": 2.710502283105023, "grad_norm": 5.930821889494921, "learning_rate": 1.4202203825939024e-07, "logits/chosen": -2.2780826091766357, "logits/rejected": -2.6199185848236084, "logps/chosen": -335.55499267578125, "logps/rejected": -374.86944580078125, "loss": 0.0267, "rewards/accuracies": 1.0, "rewards/chosen": 0.656583845615387, "rewards/margins": 3.4975032806396484, "rewards/rejected": -2.8409194946289062, "step": 3710 }, { "epoch": 2.711232876712329, "grad_norm": 8.562026407551736, "learning_rate": 1.4187820097871142e-07, "logits/chosen": -3.177589178085327, "logits/rejected": -1.9298629760742188, "logps/chosen": -1072.1109619140625, "logps/rejected": -748.8225708007812, "loss": 0.0294, "rewards/accuracies": 1.0, "rewards/chosen": 7.005078315734863, "rewards/margins": 7.418066501617432, "rewards/rejected": -0.41298848390579224, "step": 3711 }, { "epoch": 2.7119634703196347, "grad_norm": 11.588384245185456, "learning_rate": 1.4173440771062055e-07, "logits/chosen": -2.7137019634246826, "logits/rejected": -2.294062376022339, "logps/chosen": -690.8443603515625, "logps/rejected": -639.092041015625, "loss": 0.0497, "rewards/accuracies": 1.0, "rewards/chosen": 4.260869979858398, "rewards/margins": 6.584118843078613, "rewards/rejected": -2.323249101638794, "step": 3712 }, { "epoch": 2.7126940639269406, "grad_norm": 9.30022056071937, "learning_rate": 1.4159065851365083e-07, "logits/chosen": -2.2534475326538086, "logits/rejected": -2.919424295425415, "logps/chosen": -544.5825805664062, "logps/rejected": -733.5584106445312, "loss": 0.0311, "rewards/accuracies": 1.0, "rewards/chosen": 2.774678945541382, "rewards/margins": 5.035154819488525, "rewards/rejected": -2.2604756355285645, "step": 3713 }, { "epoch": 2.7134246575342464, "grad_norm": 12.151593490782778, "learning_rate": 1.414469534463174e-07, "logits/chosen": -2.7726354598999023, "logits/rejected": -2.1780195236206055, "logps/chosen": -834.8568725585938, "logps/rejected": -707.9319458007812, "loss": 0.0616, "rewards/accuracies": 1.0, "rewards/chosen": 3.1630215644836426, "rewards/margins": 3.986227035522461, "rewards/rejected": -0.8232053518295288, "step": 3714 }, { "epoch": 2.7141552511415528, "grad_norm": 6.669505500363933, "learning_rate": 1.413032925671175e-07, "logits/chosen": -3.1454291343688965, "logits/rejected": -2.3290152549743652, "logps/chosen": -569.922119140625, "logps/rejected": -376.7905578613281, "loss": 0.0299, "rewards/accuracies": 1.0, "rewards/chosen": 3.575960874557495, "rewards/margins": 5.458232879638672, "rewards/rejected": -1.8822720050811768, "step": 3715 }, { "epoch": 2.714885844748858, "grad_norm": 5.009303021404665, "learning_rate": 1.4115967593453062e-07, "logits/chosen": -2.9697437286376953, "logits/rejected": -1.8219863176345825, "logps/chosen": -346.5347595214844, "logps/rejected": -295.37261962890625, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": 3.673335552215576, "rewards/margins": 7.686095237731934, "rewards/rejected": -4.012759685516357, "step": 3716 }, { "epoch": 2.7156164383561645, "grad_norm": 11.693498836492973, "learning_rate": 1.4101610360701796e-07, "logits/chosen": -3.254289150238037, "logits/rejected": -2.4669013023376465, "logps/chosen": -683.0750122070312, "logps/rejected": -537.0213623046875, "loss": 0.0531, "rewards/accuracies": 1.0, "rewards/chosen": 3.601043224334717, "rewards/margins": 3.7753524780273438, "rewards/rejected": -0.17430934309959412, "step": 3717 }, { "epoch": 2.7163470319634704, "grad_norm": 8.305935440249801, "learning_rate": 1.4087257564302267e-07, "logits/chosen": -2.9990768432617188, "logits/rejected": -2.0893685817718506, "logps/chosen": -710.0485229492188, "logps/rejected": -509.9525146484375, "loss": 0.0553, "rewards/accuracies": 1.0, "rewards/chosen": 3.6743505001068115, "rewards/margins": 5.445405006408691, "rewards/rejected": -1.7710542678833008, "step": 3718 }, { "epoch": 2.7170776255707763, "grad_norm": 8.9413286024916, "learning_rate": 1.4072909210097012e-07, "logits/chosen": -2.9972586631774902, "logits/rejected": -2.4457473754882812, "logps/chosen": -750.025146484375, "logps/rejected": -604.85546875, "loss": 0.0648, "rewards/accuracies": 0.875, "rewards/chosen": 3.2023026943206787, "rewards/margins": 4.872668743133545, "rewards/rejected": -1.6703659296035767, "step": 3719 }, { "epoch": 2.717808219178082, "grad_norm": 24.207926008657267, "learning_rate": 1.4058565303926723e-07, "logits/chosen": -2.5455403327941895, "logits/rejected": -1.7132296562194824, "logps/chosen": -440.6705322265625, "logps/rejected": -200.71449279785156, "loss": 0.1169, "rewards/accuracies": 1.0, "rewards/chosen": 2.673858404159546, "rewards/margins": 4.818554401397705, "rewards/rejected": -2.144695997238159, "step": 3720 }, { "epoch": 2.718538812785388, "grad_norm": 12.916207450989326, "learning_rate": 1.4044225851630326e-07, "logits/chosen": -2.8453733921051025, "logits/rejected": -1.6782326698303223, "logps/chosen": -465.6047668457031, "logps/rejected": -336.7538757324219, "loss": 0.0454, "rewards/accuracies": 1.0, "rewards/chosen": 3.3298544883728027, "rewards/margins": 7.386646747589111, "rewards/rejected": -4.056792736053467, "step": 3721 }, { "epoch": 2.7192694063926943, "grad_norm": 14.215945057296413, "learning_rate": 1.402989085904489e-07, "logits/chosen": -2.5399367809295654, "logits/rejected": -2.4665603637695312, "logps/chosen": -585.127197265625, "logps/rejected": -595.3494873046875, "loss": 0.0815, "rewards/accuracies": 1.0, "rewards/chosen": 2.535905122756958, "rewards/margins": 3.2857325077056885, "rewards/rejected": -0.7498276829719543, "step": 3722 }, { "epoch": 2.7199999999999998, "grad_norm": 12.416952518185257, "learning_rate": 1.4015560332005682e-07, "logits/chosen": -2.783740520477295, "logits/rejected": -2.178504228591919, "logps/chosen": -619.5365600585938, "logps/rejected": -508.2645263671875, "loss": 0.0687, "rewards/accuracies": 1.0, "rewards/chosen": 3.7336297035217285, "rewards/margins": 4.079831123352051, "rewards/rejected": -0.3462010324001312, "step": 3723 }, { "epoch": 2.720730593607306, "grad_norm": 12.513010197164228, "learning_rate": 1.4001234276346173e-07, "logits/chosen": -2.6136183738708496, "logits/rejected": -1.9730439186096191, "logps/chosen": -944.1265869140625, "logps/rejected": -656.809326171875, "loss": 0.0603, "rewards/accuracies": 1.0, "rewards/chosen": 2.7179830074310303, "rewards/margins": 4.8284077644348145, "rewards/rejected": -2.110424518585205, "step": 3724 }, { "epoch": 2.721461187214612, "grad_norm": 15.393109828757648, "learning_rate": 1.3986912697897985e-07, "logits/chosen": -2.572007894515991, "logits/rejected": -2.299159049987793, "logps/chosen": -773.801513671875, "logps/rejected": -675.8870239257812, "loss": 0.0854, "rewards/accuracies": 1.0, "rewards/chosen": 4.842607021331787, "rewards/margins": 7.610015869140625, "rewards/rejected": -2.7674083709716797, "step": 3725 }, { "epoch": 2.722191780821918, "grad_norm": 12.043594645996283, "learning_rate": 1.3972595602490956e-07, "logits/chosen": -2.533491611480713, "logits/rejected": -1.939626932144165, "logps/chosen": -729.858154296875, "logps/rejected": -573.698486328125, "loss": 0.0574, "rewards/accuracies": 1.0, "rewards/chosen": 3.4241530895233154, "rewards/margins": 3.861845016479492, "rewards/rejected": -0.43769198656082153, "step": 3726 }, { "epoch": 2.7229223744292237, "grad_norm": 10.490548739413201, "learning_rate": 1.3958282995953025e-07, "logits/chosen": -2.835293769836426, "logits/rejected": -2.0309832096099854, "logps/chosen": -635.5601806640625, "logps/rejected": -380.8756103515625, "loss": 0.0556, "rewards/accuracies": 1.0, "rewards/chosen": 4.021894931793213, "rewards/margins": 5.783721923828125, "rewards/rejected": -1.7618271112442017, "step": 3727 }, { "epoch": 2.7236529680365296, "grad_norm": 11.731268053645538, "learning_rate": 1.3943974884110382e-07, "logits/chosen": -2.78705096244812, "logits/rejected": -2.572969675064087, "logps/chosen": -743.8424072265625, "logps/rejected": -839.5791625976562, "loss": 0.0481, "rewards/accuracies": 1.0, "rewards/chosen": 2.5457136631011963, "rewards/margins": 4.062341690063477, "rewards/rejected": -1.5166285037994385, "step": 3728 }, { "epoch": 2.724383561643836, "grad_norm": 20.598750015407866, "learning_rate": 1.3929671272787362e-07, "logits/chosen": -3.0921030044555664, "logits/rejected": -2.159658193588257, "logps/chosen": -656.9529418945312, "logps/rejected": -513.749755859375, "loss": 0.0886, "rewards/accuracies": 0.875, "rewards/chosen": 2.6228859424591064, "rewards/margins": 4.00391960144043, "rewards/rejected": -1.3810334205627441, "step": 3729 }, { "epoch": 2.7251141552511413, "grad_norm": 10.63562844439512, "learning_rate": 1.3915372167806448e-07, "logits/chosen": -3.299968957901001, "logits/rejected": -1.9300236701965332, "logps/chosen": -617.1444702148438, "logps/rejected": -369.24530029296875, "loss": 0.0549, "rewards/accuracies": 1.0, "rewards/chosen": 3.9692413806915283, "rewards/margins": 6.178124904632568, "rewards/rejected": -2.20888352394104, "step": 3730 }, { "epoch": 2.7258447488584476, "grad_norm": 12.555683469167645, "learning_rate": 1.3901077574988317e-07, "logits/chosen": -3.4423868656158447, "logits/rejected": -1.7554106712341309, "logps/chosen": -491.32647705078125, "logps/rejected": -301.657470703125, "loss": 0.0431, "rewards/accuracies": 1.0, "rewards/chosen": 4.287380695343018, "rewards/margins": 6.403162002563477, "rewards/rejected": -2.115781545639038, "step": 3731 }, { "epoch": 2.7265753424657535, "grad_norm": 9.869485208522782, "learning_rate": 1.3886787500151774e-07, "logits/chosen": -2.704164981842041, "logits/rejected": -2.3844847679138184, "logps/chosen": -350.74359130859375, "logps/rejected": -357.36700439453125, "loss": 0.0559, "rewards/accuracies": 1.0, "rewards/chosen": 2.448925018310547, "rewards/margins": 4.519717216491699, "rewards/rejected": -2.0707921981811523, "step": 3732 }, { "epoch": 2.7273059360730594, "grad_norm": 8.10677841452491, "learning_rate": 1.3872501949113835e-07, "logits/chosen": -3.1246554851531982, "logits/rejected": -1.819206714630127, "logps/chosen": -688.0118408203125, "logps/rejected": -466.84515380859375, "loss": 0.0326, "rewards/accuracies": 1.0, "rewards/chosen": 3.3656134605407715, "rewards/margins": 5.126486778259277, "rewards/rejected": -1.760873556137085, "step": 3733 }, { "epoch": 2.7280365296803653, "grad_norm": 15.321504148740841, "learning_rate": 1.3858220927689622e-07, "logits/chosen": -2.8083553314208984, "logits/rejected": -1.6421513557434082, "logps/chosen": -704.8659057617188, "logps/rejected": -410.66387939453125, "loss": 0.0673, "rewards/accuracies": 1.0, "rewards/chosen": 2.6041371822357178, "rewards/margins": 4.35993766784668, "rewards/rejected": -1.7558001279830933, "step": 3734 }, { "epoch": 2.728767123287671, "grad_norm": 12.275370580980056, "learning_rate": 1.384394444169245e-07, "logits/chosen": -2.3059043884277344, "logits/rejected": -1.7671335935592651, "logps/chosen": -671.1253662109375, "logps/rejected": -520.4459228515625, "loss": 0.0439, "rewards/accuracies": 1.0, "rewards/chosen": 3.3481838703155518, "rewards/margins": 4.4387125968933105, "rewards/rejected": -1.0905284881591797, "step": 3735 }, { "epoch": 2.7294977168949774, "grad_norm": 28.916266212292282, "learning_rate": 1.382967249693378e-07, "logits/chosen": -2.380932331085205, "logits/rejected": -2.2476963996887207, "logps/chosen": -389.6563720703125, "logps/rejected": -451.99188232421875, "loss": 0.0941, "rewards/accuracies": 1.0, "rewards/chosen": 3.8580808639526367, "rewards/margins": 5.201504707336426, "rewards/rejected": -1.3434243202209473, "step": 3736 }, { "epoch": 2.730228310502283, "grad_norm": 6.147184406632877, "learning_rate": 1.38154050992232e-07, "logits/chosen": -2.9026567935943604, "logits/rejected": -2.0989935398101807, "logps/chosen": -910.5804443359375, "logps/rejected": -676.3446044921875, "loss": 0.0217, "rewards/accuracies": 1.0, "rewards/chosen": 5.1865458488464355, "rewards/margins": 6.093360424041748, "rewards/rejected": -0.9068148136138916, "step": 3737 }, { "epoch": 2.730958904109589, "grad_norm": 12.092699581925904, "learning_rate": 1.3801142254368488e-07, "logits/chosen": -2.5018208026885986, "logits/rejected": -2.1841413974761963, "logps/chosen": -704.63623046875, "logps/rejected": -592.7359619140625, "loss": 0.0364, "rewards/accuracies": 1.0, "rewards/chosen": 3.2115540504455566, "rewards/margins": 4.407863616943359, "rewards/rejected": -1.1963098049163818, "step": 3738 }, { "epoch": 2.731689497716895, "grad_norm": 10.435084263269893, "learning_rate": 1.3786883968175538e-07, "logits/chosen": -2.715987205505371, "logits/rejected": -2.7731432914733887, "logps/chosen": -535.4608154296875, "logps/rejected": -722.4300537109375, "loss": 0.0431, "rewards/accuracies": 1.0, "rewards/chosen": 2.3420510292053223, "rewards/margins": 4.752639293670654, "rewards/rejected": -2.410588026046753, "step": 3739 }, { "epoch": 2.732420091324201, "grad_norm": 9.426453447461986, "learning_rate": 1.3772630246448392e-07, "logits/chosen": -2.9489498138427734, "logits/rejected": -1.9467240571975708, "logps/chosen": -657.664306640625, "logps/rejected": -442.3272399902344, "loss": 0.0436, "rewards/accuracies": 1.0, "rewards/chosen": 3.615431547164917, "rewards/margins": 6.7695159912109375, "rewards/rejected": -3.1540846824645996, "step": 3740 }, { "epoch": 2.733150684931507, "grad_norm": 16.043284994296, "learning_rate": 1.3758381094989252e-07, "logits/chosen": -3.2137153148651123, "logits/rejected": -1.9620000123977661, "logps/chosen": -626.842041015625, "logps/rejected": -398.762451171875, "loss": 0.0642, "rewards/accuracies": 1.0, "rewards/chosen": 1.936248779296875, "rewards/margins": 4.167662620544434, "rewards/rejected": -2.2314136028289795, "step": 3741 }, { "epoch": 2.7338812785388127, "grad_norm": 12.278741866781575, "learning_rate": 1.3744136519598428e-07, "logits/chosen": -3.1466469764709473, "logits/rejected": -2.473053216934204, "logps/chosen": -689.4400024414062, "logps/rejected": -762.4139404296875, "loss": 0.0507, "rewards/accuracies": 1.0, "rewards/chosen": 3.1777725219726562, "rewards/margins": 5.636898994445801, "rewards/rejected": -2.4591259956359863, "step": 3742 }, { "epoch": 2.7346118721461186, "grad_norm": 8.787603177459266, "learning_rate": 1.372989652607442e-07, "logits/chosen": -2.977830171585083, "logits/rejected": -2.8042402267456055, "logps/chosen": -695.8568115234375, "logps/rejected": -481.8619689941406, "loss": 0.0465, "rewards/accuracies": 1.0, "rewards/chosen": 3.2431373596191406, "rewards/margins": 5.08401346206665, "rewards/rejected": -1.8408757448196411, "step": 3743 }, { "epoch": 2.7353424657534244, "grad_norm": 10.724581808549074, "learning_rate": 1.3715661120213793e-07, "logits/chosen": -2.9471118450164795, "logits/rejected": -1.6500811576843262, "logps/chosen": -673.4283447265625, "logps/rejected": -440.3079528808594, "loss": 0.0485, "rewards/accuracies": 1.0, "rewards/chosen": 4.963791847229004, "rewards/margins": 7.518233776092529, "rewards/rejected": -2.5544419288635254, "step": 3744 }, { "epoch": 2.7360730593607308, "grad_norm": 11.060377704041336, "learning_rate": 1.37014303078113e-07, "logits/chosen": -2.7615156173706055, "logits/rejected": -1.9140446186065674, "logps/chosen": -583.283447265625, "logps/rejected": -378.9103698730469, "loss": 0.0428, "rewards/accuracies": 1.0, "rewards/chosen": 2.969696044921875, "rewards/margins": 4.837884902954102, "rewards/rejected": -1.8681889772415161, "step": 3745 }, { "epoch": 2.7368036529680366, "grad_norm": 6.93134096325005, "learning_rate": 1.36872040946598e-07, "logits/chosen": -3.3052053451538086, "logits/rejected": -2.3048739433288574, "logps/chosen": -1252.7677001953125, "logps/rejected": -618.6859741210938, "loss": 0.0293, "rewards/accuracies": 1.0, "rewards/chosen": 2.9859848022460938, "rewards/margins": 5.569059371948242, "rewards/rejected": -2.5830745697021484, "step": 3746 }, { "epoch": 2.7375342465753425, "grad_norm": 10.880647793165892, "learning_rate": 1.3672982486550277e-07, "logits/chosen": -2.5845789909362793, "logits/rejected": -2.0706799030303955, "logps/chosen": -428.96380615234375, "logps/rejected": -453.5962829589844, "loss": 0.0565, "rewards/accuracies": 1.0, "rewards/chosen": 2.707714796066284, "rewards/margins": 5.831491470336914, "rewards/rejected": -3.12377667427063, "step": 3747 }, { "epoch": 2.7382648401826484, "grad_norm": 10.112379709478471, "learning_rate": 1.3658765489271883e-07, "logits/chosen": -3.1361215114593506, "logits/rejected": -2.5408990383148193, "logps/chosen": -483.8021240234375, "logps/rejected": -358.76495361328125, "loss": 0.0488, "rewards/accuracies": 1.0, "rewards/chosen": 2.3282711505889893, "rewards/margins": 5.625941276550293, "rewards/rejected": -3.2976694107055664, "step": 3748 }, { "epoch": 2.7389954337899542, "grad_norm": 12.963562592054894, "learning_rate": 1.3644553108611813e-07, "logits/chosen": -3.161792755126953, "logits/rejected": -2.3868985176086426, "logps/chosen": -401.93841552734375, "logps/rejected": -390.1955871582031, "loss": 0.0572, "rewards/accuracies": 1.0, "rewards/chosen": 2.474489450454712, "rewards/margins": 5.425527572631836, "rewards/rejected": -2.951038360595703, "step": 3749 }, { "epoch": 2.73972602739726, "grad_norm": 24.518390837447154, "learning_rate": 1.3630345350355455e-07, "logits/chosen": -2.6690926551818848, "logits/rejected": -2.3164377212524414, "logps/chosen": -561.6114501953125, "logps/rejected": -494.5157470703125, "loss": 0.1108, "rewards/accuracies": 1.0, "rewards/chosen": 4.562791347503662, "rewards/margins": 7.15681791305542, "rewards/rejected": -2.594026803970337, "step": 3750 }, { "epoch": 2.740456621004566, "grad_norm": 8.871880641289154, "learning_rate": 1.3616142220286272e-07, "logits/chosen": -2.5822815895080566, "logits/rejected": -1.9932523965835571, "logps/chosen": -503.8553161621094, "logps/rejected": -420.05767822265625, "loss": 0.0371, "rewards/accuracies": 1.0, "rewards/chosen": 3.0737829208374023, "rewards/margins": 5.387606620788574, "rewards/rejected": -2.313823699951172, "step": 3751 }, { "epoch": 2.7411872146118723, "grad_norm": 15.011303520961713, "learning_rate": 1.3601943724185882e-07, "logits/chosen": -2.593517780303955, "logits/rejected": -1.9758009910583496, "logps/chosen": -422.3141174316406, "logps/rejected": -418.219482421875, "loss": 0.1107, "rewards/accuracies": 1.0, "rewards/chosen": 4.628647327423096, "rewards/margins": 7.760717391967773, "rewards/rejected": -3.132070541381836, "step": 3752 }, { "epoch": 2.741917808219178, "grad_norm": 11.936903054855376, "learning_rate": 1.3587749867833977e-07, "logits/chosen": -2.3224620819091797, "logits/rejected": -2.3017187118530273, "logps/chosen": -397.079833984375, "logps/rejected": -597.803955078125, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": 3.222851276397705, "rewards/margins": 6.60629415512085, "rewards/rejected": -3.3834426403045654, "step": 3753 }, { "epoch": 2.742648401826484, "grad_norm": 11.700294712626032, "learning_rate": 1.3573560657008376e-07, "logits/chosen": -3.098496675491333, "logits/rejected": -2.6999003887176514, "logps/chosen": -561.1077270507812, "logps/rejected": -511.35693359375, "loss": 0.0595, "rewards/accuracies": 1.0, "rewards/chosen": 2.76373028755188, "rewards/margins": 5.265232086181641, "rewards/rejected": -2.5015015602111816, "step": 3754 }, { "epoch": 2.74337899543379, "grad_norm": 13.942374843769612, "learning_rate": 1.355937609748502e-07, "logits/chosen": -2.143249988555908, "logits/rejected": -2.322690010070801, "logps/chosen": -412.27239990234375, "logps/rejected": -527.2103881835938, "loss": 0.0795, "rewards/accuracies": 1.0, "rewards/chosen": 2.3654985427856445, "rewards/margins": 4.033763408660889, "rewards/rejected": -1.6682649850845337, "step": 3755 }, { "epoch": 2.744109589041096, "grad_norm": 8.033870047002809, "learning_rate": 1.3545196195037944e-07, "logits/chosen": -2.8910398483276367, "logits/rejected": -2.6765966415405273, "logps/chosen": -671.6348266601562, "logps/rejected": -635.6907348632812, "loss": 0.0295, "rewards/accuracies": 1.0, "rewards/chosen": 2.0591747760772705, "rewards/margins": 4.686200141906738, "rewards/rejected": -2.627025604248047, "step": 3756 }, { "epoch": 2.7448401826484017, "grad_norm": 8.214007383571266, "learning_rate": 1.353102095543927e-07, "logits/chosen": -2.746175765991211, "logits/rejected": -1.9299074411392212, "logps/chosen": -668.420654296875, "logps/rejected": -547.36181640625, "loss": 0.028, "rewards/accuracies": 1.0, "rewards/chosen": 3.670046091079712, "rewards/margins": 6.838936805725098, "rewards/rejected": -3.168890953063965, "step": 3757 }, { "epoch": 2.7455707762557076, "grad_norm": 10.54698639987464, "learning_rate": 1.3516850384459265e-07, "logits/chosen": -2.833232879638672, "logits/rejected": -1.756248950958252, "logps/chosen": -781.7342529296875, "logps/rejected": -545.3887329101562, "loss": 0.0662, "rewards/accuracies": 1.0, "rewards/chosen": 3.949397563934326, "rewards/margins": 6.155675888061523, "rewards/rejected": -2.206277847290039, "step": 3758 }, { "epoch": 2.746301369863014, "grad_norm": 13.843300840096413, "learning_rate": 1.3502684487866253e-07, "logits/chosen": -2.8476614952087402, "logits/rejected": -1.6892361640930176, "logps/chosen": -650.2428588867188, "logps/rejected": -401.535400390625, "loss": 0.082, "rewards/accuracies": 1.0, "rewards/chosen": 5.147997856140137, "rewards/margins": 7.1684184074401855, "rewards/rejected": -2.0204203128814697, "step": 3759 }, { "epoch": 2.7470319634703197, "grad_norm": 9.682966980649054, "learning_rate": 1.3488523271426689e-07, "logits/chosen": -2.729236602783203, "logits/rejected": -2.163214683532715, "logps/chosen": -558.69970703125, "logps/rejected": -658.522705078125, "loss": 0.04, "rewards/accuracies": 1.0, "rewards/chosen": 2.395986557006836, "rewards/margins": 5.281121253967285, "rewards/rejected": -2.8851349353790283, "step": 3760 }, { "epoch": 2.7477625570776256, "grad_norm": 18.095539432779596, "learning_rate": 1.3474366740905097e-07, "logits/chosen": -2.7541229724884033, "logits/rejected": -1.9394571781158447, "logps/chosen": -836.2364501953125, "logps/rejected": -445.24053955078125, "loss": 0.0711, "rewards/accuracies": 1.0, "rewards/chosen": 3.5705747604370117, "rewards/margins": 4.605800628662109, "rewards/rejected": -1.0352264642715454, "step": 3761 }, { "epoch": 2.7484931506849315, "grad_norm": 7.610700362631919, "learning_rate": 1.3460214902064104e-07, "logits/chosen": -2.9187138080596924, "logits/rejected": -2.6577131748199463, "logps/chosen": -573.8766479492188, "logps/rejected": -483.3119201660156, "loss": 0.0263, "rewards/accuracies": 1.0, "rewards/chosen": 4.171180725097656, "rewards/margins": 5.569637775421143, "rewards/rejected": -1.3984571695327759, "step": 3762 }, { "epoch": 2.7492237442922374, "grad_norm": 9.66102694279213, "learning_rate": 1.3446067760664417e-07, "logits/chosen": -2.566969633102417, "logits/rejected": -2.4215266704559326, "logps/chosen": -598.0011596679688, "logps/rejected": -616.5615234375, "loss": 0.0431, "rewards/accuracies": 1.0, "rewards/chosen": 2.973459005355835, "rewards/margins": 4.983473777770996, "rewards/rejected": -2.0100150108337402, "step": 3763 }, { "epoch": 2.7499543378995432, "grad_norm": 17.40443967106301, "learning_rate": 1.343192532246485e-07, "logits/chosen": -2.8719139099121094, "logits/rejected": -1.3574159145355225, "logps/chosen": -540.690673828125, "logps/rejected": -250.6018829345703, "loss": 0.0628, "rewards/accuracies": 1.0, "rewards/chosen": 3.064683437347412, "rewards/margins": 4.590609550476074, "rewards/rejected": -1.5259268283843994, "step": 3764 }, { "epoch": 2.750684931506849, "grad_norm": 17.96667532180455, "learning_rate": 1.3417787593222318e-07, "logits/chosen": -2.9201836585998535, "logits/rejected": -1.7203421592712402, "logps/chosen": -644.9161376953125, "logps/rejected": -382.5693359375, "loss": 0.0557, "rewards/accuracies": 1.0, "rewards/chosen": 3.3549463748931885, "rewards/margins": 5.766767978668213, "rewards/rejected": -2.4118213653564453, "step": 3765 }, { "epoch": 2.7514155251141554, "grad_norm": 8.399249143193313, "learning_rate": 1.3403654578691747e-07, "logits/chosen": -3.221278667449951, "logits/rejected": -2.697995662689209, "logps/chosen": -566.4447021484375, "logps/rejected": -527.107177734375, "loss": 0.0405, "rewards/accuracies": 1.0, "rewards/chosen": 3.2980527877807617, "rewards/margins": 6.619054317474365, "rewards/rejected": -3.3210015296936035, "step": 3766 }, { "epoch": 2.7521461187214613, "grad_norm": 13.695236736812685, "learning_rate": 1.3389526284626225e-07, "logits/chosen": -2.4341771602630615, "logits/rejected": -2.2408230304718018, "logps/chosen": -437.9419250488281, "logps/rejected": -456.6234436035156, "loss": 0.0773, "rewards/accuracies": 1.0, "rewards/chosen": 2.2823410034179688, "rewards/margins": 4.092883586883545, "rewards/rejected": -1.8105427026748657, "step": 3767 }, { "epoch": 2.752876712328767, "grad_norm": 7.686881187778274, "learning_rate": 1.3375402716776863e-07, "logits/chosen": -2.260979652404785, "logits/rejected": -1.6484198570251465, "logps/chosen": -425.2716064453125, "logps/rejected": -428.3912353515625, "loss": 0.0322, "rewards/accuracies": 1.0, "rewards/chosen": 3.170706033706665, "rewards/margins": 6.099471569061279, "rewards/rejected": -2.928765296936035, "step": 3768 }, { "epoch": 2.753607305936073, "grad_norm": 7.2651290888387, "learning_rate": 1.3361283880892887e-07, "logits/chosen": -2.825136661529541, "logits/rejected": -2.3407819271087646, "logps/chosen": -792.4501953125, "logps/rejected": -748.0609741210938, "loss": 0.026, "rewards/accuracies": 1.0, "rewards/chosen": 3.628296375274658, "rewards/margins": 4.749641418457031, "rewards/rejected": -1.121345043182373, "step": 3769 }, { "epoch": 2.754337899543379, "grad_norm": 12.188760076085787, "learning_rate": 1.3347169782721574e-07, "logits/chosen": -2.426992416381836, "logits/rejected": -2.1304538249969482, "logps/chosen": -367.9272155761719, "logps/rejected": -358.070556640625, "loss": 0.0579, "rewards/accuracies": 1.0, "rewards/chosen": 1.8914077281951904, "rewards/margins": 4.094860553741455, "rewards/rejected": -2.2034525871276855, "step": 3770 }, { "epoch": 2.755068493150685, "grad_norm": 11.517088425711773, "learning_rate": 1.333306042800827e-07, "logits/chosen": -2.9717373847961426, "logits/rejected": -2.0981647968292236, "logps/chosen": -627.624755859375, "logps/rejected": -418.407958984375, "loss": 0.0508, "rewards/accuracies": 1.0, "rewards/chosen": 4.526639938354492, "rewards/margins": 6.453324317932129, "rewards/rejected": -1.9266841411590576, "step": 3771 }, { "epoch": 2.7557990867579907, "grad_norm": 13.074831457857007, "learning_rate": 1.331895582249641e-07, "logits/chosen": -2.7379307746887207, "logits/rejected": -2.0530269145965576, "logps/chosen": -669.7681274414062, "logps/rejected": -509.2380065917969, "loss": 0.0814, "rewards/accuracies": 0.875, "rewards/chosen": 2.8055272102355957, "rewards/margins": 5.845692157745361, "rewards/rejected": -3.0401649475097656, "step": 3772 }, { "epoch": 2.756529680365297, "grad_norm": 17.851831747148964, "learning_rate": 1.3304855971927465e-07, "logits/chosen": -2.5679640769958496, "logits/rejected": -2.5113401412963867, "logps/chosen": -684.4981689453125, "logps/rejected": -789.089599609375, "loss": 0.0777, "rewards/accuracies": 0.875, "rewards/chosen": 2.241340398788452, "rewards/margins": 2.749380350112915, "rewards/rejected": -0.508040189743042, "step": 3773 }, { "epoch": 2.757260273972603, "grad_norm": 11.46275201851155, "learning_rate": 1.3290760882041014e-07, "logits/chosen": -3.010974884033203, "logits/rejected": -1.8415708541870117, "logps/chosen": -955.8170166015625, "logps/rejected": -561.2569580078125, "loss": 0.0751, "rewards/accuracies": 1.0, "rewards/chosen": 4.681897163391113, "rewards/margins": 6.828974723815918, "rewards/rejected": -2.147078037261963, "step": 3774 }, { "epoch": 2.7579908675799087, "grad_norm": 6.11138211814995, "learning_rate": 1.3276670558574663e-07, "logits/chosen": -2.8482284545898438, "logits/rejected": -1.7166234254837036, "logps/chosen": -506.6572265625, "logps/rejected": -298.7142333984375, "loss": 0.0305, "rewards/accuracies": 1.0, "rewards/chosen": 1.2070086002349854, "rewards/margins": 4.2510528564453125, "rewards/rejected": -3.044044256210327, "step": 3775 }, { "epoch": 2.7587214611872146, "grad_norm": 14.475435166743747, "learning_rate": 1.3262585007264072e-07, "logits/chosen": -3.006761312484741, "logits/rejected": -2.4618914127349854, "logps/chosen": -703.5048828125, "logps/rejected": -608.0030517578125, "loss": 0.0645, "rewards/accuracies": 1.0, "rewards/chosen": 2.343834638595581, "rewards/margins": 3.9338910579681396, "rewards/rejected": -1.5900566577911377, "step": 3776 }, { "epoch": 2.7594520547945205, "grad_norm": 13.928632575379462, "learning_rate": 1.3248504233843e-07, "logits/chosen": -2.621920347213745, "logits/rejected": -1.8172271251678467, "logps/chosen": -409.13055419921875, "logps/rejected": -295.51922607421875, "loss": 0.0919, "rewards/accuracies": 0.875, "rewards/chosen": 2.229635000228882, "rewards/margins": 4.929171562194824, "rewards/rejected": -2.6995363235473633, "step": 3777 }, { "epoch": 2.7601826484018264, "grad_norm": 15.958657735344385, "learning_rate": 1.3234428244043223e-07, "logits/chosen": -2.8411121368408203, "logits/rejected": -2.695089340209961, "logps/chosen": -697.43896484375, "logps/rejected": -593.7880859375, "loss": 0.0538, "rewards/accuracies": 1.0, "rewards/chosen": 5.517341613769531, "rewards/margins": 6.545855522155762, "rewards/rejected": -1.0285135507583618, "step": 3778 }, { "epoch": 2.7609132420091322, "grad_norm": 7.547087040483293, "learning_rate": 1.3220357043594582e-07, "logits/chosen": -2.497384786605835, "logits/rejected": -2.5059123039245605, "logps/chosen": -584.1435546875, "logps/rejected": -717.604248046875, "loss": 0.0312, "rewards/accuracies": 1.0, "rewards/chosen": 3.480038642883301, "rewards/margins": 5.177756309509277, "rewards/rejected": -1.6977179050445557, "step": 3779 }, { "epoch": 2.7616438356164386, "grad_norm": 9.16637457179922, "learning_rate": 1.3206290638224962e-07, "logits/chosen": -2.4728403091430664, "logits/rejected": -2.3505923748016357, "logps/chosen": -432.69537353515625, "logps/rejected": -649.7974853515625, "loss": 0.0499, "rewards/accuracies": 1.0, "rewards/chosen": 2.84312105178833, "rewards/margins": 5.7275710105896, "rewards/rejected": -2.8844494819641113, "step": 3780 }, { "epoch": 2.7623744292237444, "grad_norm": 9.253830812403576, "learning_rate": 1.3192229033660306e-07, "logits/chosen": -2.487527370452881, "logits/rejected": -1.7785724401474, "logps/chosen": -477.3883361816406, "logps/rejected": -462.0332336425781, "loss": 0.0319, "rewards/accuracies": 1.0, "rewards/chosen": 4.152877330780029, "rewards/margins": 7.856095314025879, "rewards/rejected": -3.7032182216644287, "step": 3781 }, { "epoch": 2.7631050228310503, "grad_norm": 16.87942650277254, "learning_rate": 1.3178172235624618e-07, "logits/chosen": -2.481750249862671, "logits/rejected": -2.2910501956939697, "logps/chosen": -639.9229125976562, "logps/rejected": -832.8959350585938, "loss": 0.051, "rewards/accuracies": 1.0, "rewards/chosen": 3.108171224594116, "rewards/margins": 5.120766639709473, "rewards/rejected": -2.0125954151153564, "step": 3782 }, { "epoch": 2.763835616438356, "grad_norm": 10.773023922445997, "learning_rate": 1.316412024983991e-07, "logits/chosen": -2.62904691696167, "logits/rejected": -1.9175976514816284, "logps/chosen": -664.4111938476562, "logps/rejected": -485.21612548828125, "loss": 0.044, "rewards/accuracies": 1.0, "rewards/chosen": 3.7949883937835693, "rewards/margins": 7.05687141418457, "rewards/rejected": -3.261882781982422, "step": 3783 }, { "epoch": 2.764566210045662, "grad_norm": 8.883750792160203, "learning_rate": 1.3150073082026253e-07, "logits/chosen": -3.049591302871704, "logits/rejected": -2.491114616394043, "logps/chosen": -841.411865234375, "logps/rejected": -657.4503784179688, "loss": 0.0529, "rewards/accuracies": 1.0, "rewards/chosen": 3.8451614379882812, "rewards/margins": 5.88720703125, "rewards/rejected": -2.0420453548431396, "step": 3784 }, { "epoch": 2.765296803652968, "grad_norm": 12.111981196359931, "learning_rate": 1.313603073790175e-07, "logits/chosen": -2.739358425140381, "logits/rejected": -1.8615672588348389, "logps/chosen": -886.348388671875, "logps/rejected": -521.9063720703125, "loss": 0.0443, "rewards/accuracies": 1.0, "rewards/chosen": 4.63437557220459, "rewards/margins": 6.455671787261963, "rewards/rejected": -1.8212963342666626, "step": 3785 }, { "epoch": 2.766027397260274, "grad_norm": 12.974653329793451, "learning_rate": 1.312199322318256e-07, "logits/chosen": -2.9785475730895996, "logits/rejected": -2.7073512077331543, "logps/chosen": -843.9703369140625, "logps/rejected": -730.5725708007812, "loss": 0.0633, "rewards/accuracies": 1.0, "rewards/chosen": 5.545190334320068, "rewards/margins": 5.864633083343506, "rewards/rejected": -0.31944286823272705, "step": 3786 }, { "epoch": 2.76675799086758, "grad_norm": 17.84935195558581, "learning_rate": 1.3107960543582858e-07, "logits/chosen": -2.510547399520874, "logits/rejected": -2.625443458557129, "logps/chosen": -366.1885070800781, "logps/rejected": -462.6339416503906, "loss": 0.0691, "rewards/accuracies": 1.0, "rewards/chosen": 2.3698225021362305, "rewards/margins": 4.318063259124756, "rewards/rejected": -1.9482407569885254, "step": 3787 }, { "epoch": 2.767488584474886, "grad_norm": 9.71798222198399, "learning_rate": 1.3093932704814845e-07, "logits/chosen": -2.638672351837158, "logits/rejected": -2.1440489292144775, "logps/chosen": -383.18292236328125, "logps/rejected": -480.666015625, "loss": 0.0428, "rewards/accuracies": 1.0, "rewards/chosen": 1.974729299545288, "rewards/margins": 4.9604268074035645, "rewards/rejected": -2.9856975078582764, "step": 3788 }, { "epoch": 2.768219178082192, "grad_norm": 14.467210197492726, "learning_rate": 1.3079909712588777e-07, "logits/chosen": -3.050868511199951, "logits/rejected": -2.5986011028289795, "logps/chosen": -772.4013671875, "logps/rejected": -700.004638671875, "loss": 0.0828, "rewards/accuracies": 1.0, "rewards/chosen": 3.726895809173584, "rewards/margins": 3.8224401473999023, "rewards/rejected": -0.09554451704025269, "step": 3789 }, { "epoch": 2.7689497716894977, "grad_norm": 20.275453747282775, "learning_rate": 1.3065891572612901e-07, "logits/chosen": -2.7416257858276367, "logits/rejected": -2.1022775173187256, "logps/chosen": -694.9545288085938, "logps/rejected": -781.571533203125, "loss": 0.0737, "rewards/accuracies": 1.0, "rewards/chosen": 3.938275098800659, "rewards/margins": 5.590476036071777, "rewards/rejected": -1.6522008180618286, "step": 3790 }, { "epoch": 2.7696803652968036, "grad_norm": 18.293989284606912, "learning_rate": 1.305187829059354e-07, "logits/chosen": -2.6977477073669434, "logits/rejected": -2.006333351135254, "logps/chosen": -352.425537109375, "logps/rejected": -397.4286193847656, "loss": 0.0626, "rewards/accuracies": 1.0, "rewards/chosen": 3.0594279766082764, "rewards/margins": 7.277349948883057, "rewards/rejected": -4.217921257019043, "step": 3791 }, { "epoch": 2.7704109589041095, "grad_norm": 9.171225737640619, "learning_rate": 1.3037869872234985e-07, "logits/chosen": -3.102982521057129, "logits/rejected": -2.938983678817749, "logps/chosen": -676.86962890625, "logps/rejected": -751.2838134765625, "loss": 0.0298, "rewards/accuracies": 1.0, "rewards/chosen": 4.387447357177734, "rewards/margins": 5.037128448486328, "rewards/rejected": -0.6496809720993042, "step": 3792 }, { "epoch": 2.7711415525114154, "grad_norm": 15.177482912157343, "learning_rate": 1.3023866323239572e-07, "logits/chosen": -2.800053358078003, "logits/rejected": -2.204397678375244, "logps/chosen": -991.073974609375, "logps/rejected": -706.4774169921875, "loss": 0.0482, "rewards/accuracies": 1.0, "rewards/chosen": 3.0049028396606445, "rewards/margins": 4.326726913452148, "rewards/rejected": -1.3218241930007935, "step": 3793 }, { "epoch": 2.7718721461187217, "grad_norm": 7.694316540086128, "learning_rate": 1.300986764930767e-07, "logits/chosen": -2.786440849304199, "logits/rejected": -1.849249005317688, "logps/chosen": -676.4702758789062, "logps/rejected": -622.3287353515625, "loss": 0.0291, "rewards/accuracies": 1.0, "rewards/chosen": 4.809681415557861, "rewards/margins": 6.832724571228027, "rewards/rejected": -2.023043155670166, "step": 3794 }, { "epoch": 2.7726027397260276, "grad_norm": 14.723505049822492, "learning_rate": 1.2995873856137623e-07, "logits/chosen": -2.358119010925293, "logits/rejected": -2.216839075088501, "logps/chosen": -319.14862060546875, "logps/rejected": -435.79986572265625, "loss": 0.0404, "rewards/accuracies": 1.0, "rewards/chosen": 1.6109998226165771, "rewards/margins": 6.187043190002441, "rewards/rejected": -4.576042652130127, "step": 3795 }, { "epoch": 2.7733333333333334, "grad_norm": 9.542527028126225, "learning_rate": 1.2981884949425854e-07, "logits/chosen": -3.200479030609131, "logits/rejected": -2.6592459678649902, "logps/chosen": -583.505615234375, "logps/rejected": -512.9716796875, "loss": 0.0391, "rewards/accuracies": 1.0, "rewards/chosen": 3.2188668251037598, "rewards/margins": 4.6925048828125, "rewards/rejected": -1.4736382961273193, "step": 3796 }, { "epoch": 2.7740639269406393, "grad_norm": 18.34184419002123, "learning_rate": 1.2967900934866704e-07, "logits/chosen": -3.1280694007873535, "logits/rejected": -2.321392059326172, "logps/chosen": -252.06790161132812, "logps/rejected": -241.88790893554688, "loss": 0.0659, "rewards/accuracies": 1.0, "rewards/chosen": 3.9225220680236816, "rewards/margins": 7.599276542663574, "rewards/rejected": -3.6767547130584717, "step": 3797 }, { "epoch": 2.774794520547945, "grad_norm": 13.211852815164507, "learning_rate": 1.2953921818152605e-07, "logits/chosen": -2.832658529281616, "logits/rejected": -2.4075472354888916, "logps/chosen": -930.4331665039062, "logps/rejected": -854.0869140625, "loss": 0.0607, "rewards/accuracies": 1.0, "rewards/chosen": 5.067020893096924, "rewards/margins": 6.452893257141113, "rewards/rejected": -1.3858720064163208, "step": 3798 }, { "epoch": 2.775525114155251, "grad_norm": 10.563862187958147, "learning_rate": 1.2939947604973967e-07, "logits/chosen": -3.437168836593628, "logits/rejected": -2.0533976554870605, "logps/chosen": -888.15576171875, "logps/rejected": -567.3329467773438, "loss": 0.0487, "rewards/accuracies": 1.0, "rewards/chosen": 5.501194477081299, "rewards/margins": 6.567263603210449, "rewards/rejected": -1.0660691261291504, "step": 3799 }, { "epoch": 2.776255707762557, "grad_norm": 7.690726494768487, "learning_rate": 1.292597830101919e-07, "logits/chosen": -3.0995070934295654, "logits/rejected": -2.8150887489318848, "logps/chosen": -628.6766357421875, "logps/rejected": -511.53240966796875, "loss": 0.0559, "rewards/accuracies": 0.875, "rewards/chosen": 2.4873433113098145, "rewards/margins": 4.108938694000244, "rewards/rejected": -1.6215957403182983, "step": 3800 }, { "epoch": 2.7769863013698632, "grad_norm": 9.896061709261701, "learning_rate": 1.2912013911974696e-07, "logits/chosen": -2.8792076110839844, "logits/rejected": -2.667640209197998, "logps/chosen": -884.4393310546875, "logps/rejected": -894.995849609375, "loss": 0.05, "rewards/accuracies": 1.0, "rewards/chosen": 3.315251111984253, "rewards/margins": 4.2495832443237305, "rewards/rejected": -0.934332013130188, "step": 3801 }, { "epoch": 2.777716894977169, "grad_norm": 8.284999243414504, "learning_rate": 1.289805444352488e-07, "logits/chosen": -2.6537840366363525, "logits/rejected": -1.8273944854736328, "logps/chosen": -644.66064453125, "logps/rejected": -452.7530212402344, "loss": 0.0307, "rewards/accuracies": 1.0, "rewards/chosen": 3.932007312774658, "rewards/margins": 5.921329498291016, "rewards/rejected": -1.9893220663070679, "step": 3802 }, { "epoch": 2.778447488584475, "grad_norm": 8.455224077987785, "learning_rate": 1.2884099901352175e-07, "logits/chosen": -2.469191312789917, "logits/rejected": -1.8225359916687012, "logps/chosen": -465.5968017578125, "logps/rejected": -322.87890625, "loss": 0.0416, "rewards/accuracies": 1.0, "rewards/chosen": 2.893486261367798, "rewards/margins": 5.478793144226074, "rewards/rejected": -2.585306406021118, "step": 3803 }, { "epoch": 2.779178082191781, "grad_norm": 6.98639665057601, "learning_rate": 1.287015029113697e-07, "logits/chosen": -2.5696113109588623, "logits/rejected": -2.015629529953003, "logps/chosen": -477.1419982910156, "logps/rejected": -430.95318603515625, "loss": 0.0298, "rewards/accuracies": 1.0, "rewards/chosen": 2.1458280086517334, "rewards/margins": 5.3372626304626465, "rewards/rejected": -3.191434621810913, "step": 3804 }, { "epoch": 2.7799086757990867, "grad_norm": 14.134727706742355, "learning_rate": 1.285620561855766e-07, "logits/chosen": -2.684657573699951, "logits/rejected": -2.4088690280914307, "logps/chosen": -797.94677734375, "logps/rejected": -762.8113403320312, "loss": 0.0638, "rewards/accuracies": 1.0, "rewards/chosen": 5.2572221755981445, "rewards/margins": 5.361521244049072, "rewards/rejected": -0.1042989194393158, "step": 3805 }, { "epoch": 2.7806392694063926, "grad_norm": 13.762696109382738, "learning_rate": 1.2842265889290647e-07, "logits/chosen": -3.0957589149475098, "logits/rejected": -2.1931769847869873, "logps/chosen": -992.662841796875, "logps/rejected": -586.28564453125, "loss": 0.0587, "rewards/accuracies": 1.0, "rewards/chosen": 5.780493259429932, "rewards/margins": 4.837860584259033, "rewards/rejected": 0.9426324367523193, "step": 3806 }, { "epoch": 2.7813698630136985, "grad_norm": 16.941168018413755, "learning_rate": 1.2828331109010281e-07, "logits/chosen": -2.86733341217041, "logits/rejected": -2.225044012069702, "logps/chosen": -244.07911682128906, "logps/rejected": -184.1256866455078, "loss": 0.0712, "rewards/accuracies": 1.0, "rewards/chosen": 1.1931766271591187, "rewards/margins": 4.6407790184021, "rewards/rejected": -3.4476022720336914, "step": 3807 }, { "epoch": 2.782100456621005, "grad_norm": 6.196541701533032, "learning_rate": 1.2814401283388951e-07, "logits/chosen": -2.1832213401794434, "logits/rejected": -2.2559986114501953, "logps/chosen": -417.541748046875, "logps/rejected": -614.7803344726562, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": 3.1472177505493164, "rewards/margins": 6.580463409423828, "rewards/rejected": -3.4332451820373535, "step": 3808 }, { "epoch": 2.7828310502283102, "grad_norm": 6.807300228083353, "learning_rate": 1.2800476418096984e-07, "logits/chosen": -2.675147771835327, "logits/rejected": -2.129209518432617, "logps/chosen": -727.4415283203125, "logps/rejected": -523.0418701171875, "loss": 0.0271, "rewards/accuracies": 1.0, "rewards/chosen": 2.9810876846313477, "rewards/margins": 5.302677154541016, "rewards/rejected": -2.321589708328247, "step": 3809 }, { "epoch": 2.7835616438356166, "grad_norm": 16.06327325169418, "learning_rate": 1.2786556518802691e-07, "logits/chosen": -2.6684610843658447, "logits/rejected": -2.823054313659668, "logps/chosen": -367.07916259765625, "logps/rejected": -530.5562744140625, "loss": 0.0617, "rewards/accuracies": 1.0, "rewards/chosen": 2.789578914642334, "rewards/margins": 7.187591075897217, "rewards/rejected": -4.398011684417725, "step": 3810 }, { "epoch": 2.7842922374429224, "grad_norm": 6.066638595739684, "learning_rate": 1.2772641591172401e-07, "logits/chosen": -2.7846224308013916, "logits/rejected": -2.5293684005737305, "logps/chosen": -820.7040405273438, "logps/rejected": -786.9952392578125, "loss": 0.0241, "rewards/accuracies": 1.0, "rewards/chosen": 4.535624980926514, "rewards/margins": 3.7581489086151123, "rewards/rejected": 0.7774757742881775, "step": 3811 }, { "epoch": 2.7850228310502283, "grad_norm": 12.126257133202598, "learning_rate": 1.275873164087037e-07, "logits/chosen": -2.6091737747192383, "logits/rejected": -1.616807460784912, "logps/chosen": -385.5085754394531, "logps/rejected": -223.407470703125, "loss": 0.0826, "rewards/accuracies": 1.0, "rewards/chosen": 2.2540552616119385, "rewards/margins": 4.413169860839844, "rewards/rejected": -2.1591148376464844, "step": 3812 }, { "epoch": 2.785753424657534, "grad_norm": 14.087940854313963, "learning_rate": 1.2744826673558875e-07, "logits/chosen": -2.23598313331604, "logits/rejected": -2.4700047969818115, "logps/chosen": -497.4190673828125, "logps/rejected": -838.7086791992188, "loss": 0.0629, "rewards/accuracies": 1.0, "rewards/chosen": 2.6898281574249268, "rewards/margins": 7.915197849273682, "rewards/rejected": -5.225369453430176, "step": 3813 }, { "epoch": 2.78648401826484, "grad_norm": 6.974274912783553, "learning_rate": 1.273092669489811e-07, "logits/chosen": -2.914478063583374, "logits/rejected": -2.031416654586792, "logps/chosen": -602.78662109375, "logps/rejected": -521.56884765625, "loss": 0.0211, "rewards/accuracies": 1.0, "rewards/chosen": 5.486698627471924, "rewards/margins": 7.4586405754089355, "rewards/rejected": -1.9719419479370117, "step": 3814 }, { "epoch": 2.7872146118721464, "grad_norm": 13.80964127038069, "learning_rate": 1.2717031710546289e-07, "logits/chosen": -3.2970824241638184, "logits/rejected": -2.2124459743499756, "logps/chosen": -529.1149291992188, "logps/rejected": -386.0921325683594, "loss": 0.0669, "rewards/accuracies": 1.0, "rewards/chosen": 1.844716191291809, "rewards/margins": 2.7680468559265137, "rewards/rejected": -0.923330545425415, "step": 3815 }, { "epoch": 2.787945205479452, "grad_norm": 35.33354945086716, "learning_rate": 1.2703141726159556e-07, "logits/chosen": -2.4503684043884277, "logits/rejected": -2.4365217685699463, "logps/chosen": -504.2029113769531, "logps/rejected": -658.4010620117188, "loss": 0.1428, "rewards/accuracies": 1.0, "rewards/chosen": 3.43215012550354, "rewards/margins": 7.008404731750488, "rewards/rejected": -3.5762548446655273, "step": 3816 }, { "epoch": 2.788675799086758, "grad_norm": 8.928616890927785, "learning_rate": 1.2689256747392059e-07, "logits/chosen": -2.7318973541259766, "logits/rejected": -2.559537410736084, "logps/chosen": -730.5916748046875, "logps/rejected": -801.9105224609375, "loss": 0.0365, "rewards/accuracies": 1.0, "rewards/chosen": 3.374926805496216, "rewards/margins": 5.299424171447754, "rewards/rejected": -1.9244978427886963, "step": 3817 }, { "epoch": 2.789406392694064, "grad_norm": 12.802000032572424, "learning_rate": 1.267537677989587e-07, "logits/chosen": -3.0240426063537598, "logits/rejected": -1.9911946058273315, "logps/chosen": -699.740966796875, "logps/rejected": -599.2022705078125, "loss": 0.0507, "rewards/accuracies": 1.0, "rewards/chosen": 4.686134338378906, "rewards/margins": 5.307519912719727, "rewards/rejected": -0.6213855743408203, "step": 3818 }, { "epoch": 2.79013698630137, "grad_norm": 9.62778700074624, "learning_rate": 1.266150182932103e-07, "logits/chosen": -2.7747209072113037, "logits/rejected": -1.7553091049194336, "logps/chosen": -764.762939453125, "logps/rejected": -484.84368896484375, "loss": 0.0381, "rewards/accuracies": 1.0, "rewards/chosen": 3.2698540687561035, "rewards/margins": 5.370025634765625, "rewards/rejected": -2.1001713275909424, "step": 3819 }, { "epoch": 2.7908675799086757, "grad_norm": 10.14616160609095, "learning_rate": 1.264763190131556e-07, "logits/chosen": -2.4452455043792725, "logits/rejected": -2.1864449977874756, "logps/chosen": -674.654296875, "logps/rejected": -616.2264404296875, "loss": 0.0597, "rewards/accuracies": 1.0, "rewards/chosen": 3.8799901008605957, "rewards/margins": 5.769559860229492, "rewards/rejected": -1.8895694017410278, "step": 3820 }, { "epoch": 2.7915981735159816, "grad_norm": 21.858901023497914, "learning_rate": 1.2633767001525408e-07, "logits/chosen": -2.689117908477783, "logits/rejected": -2.684300422668457, "logps/chosen": -698.1671142578125, "logps/rejected": -616.0081787109375, "loss": 0.076, "rewards/accuracies": 1.0, "rewards/chosen": 2.8444223403930664, "rewards/margins": 4.166468620300293, "rewards/rejected": -1.3220463991165161, "step": 3821 }, { "epoch": 2.792328767123288, "grad_norm": 5.630486080671781, "learning_rate": 1.2619907135594503e-07, "logits/chosen": -2.91410493850708, "logits/rejected": -1.5850412845611572, "logps/chosen": -714.6490478515625, "logps/rejected": -392.94378662109375, "loss": 0.0196, "rewards/accuracies": 1.0, "rewards/chosen": 3.2263736724853516, "rewards/margins": 6.15269660949707, "rewards/rejected": -2.9263229370117188, "step": 3822 }, { "epoch": 2.7930593607305934, "grad_norm": 9.716724442255154, "learning_rate": 1.2606052309164698e-07, "logits/chosen": -2.8597826957702637, "logits/rejected": -2.415395736694336, "logps/chosen": -270.16461181640625, "logps/rejected": -287.071533203125, "loss": 0.0583, "rewards/accuracies": 1.0, "rewards/chosen": 2.3684706687927246, "rewards/margins": 5.920572280883789, "rewards/rejected": -3.5521018505096436, "step": 3823 }, { "epoch": 2.7937899543378997, "grad_norm": 6.798895454456905, "learning_rate": 1.2592202527875798e-07, "logits/chosen": -2.9753196239471436, "logits/rejected": -2.3052098751068115, "logps/chosen": -595.4570922851562, "logps/rejected": -396.193115234375, "loss": 0.0215, "rewards/accuracies": 1.0, "rewards/chosen": 3.6279869079589844, "rewards/margins": 6.289103031158447, "rewards/rejected": -2.661116600036621, "step": 3824 }, { "epoch": 2.7945205479452055, "grad_norm": 11.457395499025672, "learning_rate": 1.2578357797365586e-07, "logits/chosen": -2.7594594955444336, "logits/rejected": -2.1440205574035645, "logps/chosen": -640.7994995117188, "logps/rejected": -713.0478515625, "loss": 0.0451, "rewards/accuracies": 1.0, "rewards/chosen": 2.1126387119293213, "rewards/margins": 4.035066604614258, "rewards/rejected": -1.9224278926849365, "step": 3825 }, { "epoch": 2.7952511415525114, "grad_norm": 14.440144614554862, "learning_rate": 1.2564518123269748e-07, "logits/chosen": -2.703305721282959, "logits/rejected": -2.5449562072753906, "logps/chosen": -599.7446899414062, "logps/rejected": -515.4617309570312, "loss": 0.0607, "rewards/accuracies": 1.0, "rewards/chosen": 3.0703232288360596, "rewards/margins": 3.6257593631744385, "rewards/rejected": -0.5554359555244446, "step": 3826 }, { "epoch": 2.7959817351598173, "grad_norm": 14.099222343048249, "learning_rate": 1.255068351122193e-07, "logits/chosen": -2.5916991233825684, "logits/rejected": -2.0306477546691895, "logps/chosen": -470.6639404296875, "logps/rejected": -387.84100341796875, "loss": 0.0604, "rewards/accuracies": 1.0, "rewards/chosen": 1.8777918815612793, "rewards/margins": 6.987886905670166, "rewards/rejected": -5.110095024108887, "step": 3827 }, { "epoch": 2.796712328767123, "grad_norm": 7.373755657157015, "learning_rate": 1.2536853966853728e-07, "logits/chosen": -2.9490232467651367, "logits/rejected": -1.7414735555648804, "logps/chosen": -921.9695434570312, "logps/rejected": -516.0986328125, "loss": 0.0582, "rewards/accuracies": 1.0, "rewards/chosen": 4.243643760681152, "rewards/margins": 6.530623912811279, "rewards/rejected": -2.286980390548706, "step": 3828 }, { "epoch": 2.7974429223744295, "grad_norm": 8.779884096392944, "learning_rate": 1.2523029495794646e-07, "logits/chosen": -2.6455259323120117, "logits/rejected": -1.9274392127990723, "logps/chosen": -639.3104248046875, "logps/rejected": -581.8530883789062, "loss": 0.0473, "rewards/accuracies": 1.0, "rewards/chosen": 4.914088726043701, "rewards/margins": 7.679166793823242, "rewards/rejected": -2.765078544616699, "step": 3829 }, { "epoch": 2.798173515981735, "grad_norm": 20.856856999239334, "learning_rate": 1.2509210103672175e-07, "logits/chosen": -2.82930850982666, "logits/rejected": -2.2067649364471436, "logps/chosen": -461.4541015625, "logps/rejected": -516.1158447265625, "loss": 0.0997, "rewards/accuracies": 1.0, "rewards/chosen": 3.2033748626708984, "rewards/margins": 4.667577266693115, "rewards/rejected": -1.464202642440796, "step": 3830 }, { "epoch": 2.7989041095890412, "grad_norm": 12.385085186748125, "learning_rate": 1.2495395796111658e-07, "logits/chosen": -2.854461193084717, "logits/rejected": -2.2939066886901855, "logps/chosen": -916.02783203125, "logps/rejected": -678.1126098632812, "loss": 0.0575, "rewards/accuracies": 1.0, "rewards/chosen": 3.170673370361328, "rewards/margins": 3.8316402435302734, "rewards/rejected": -0.6609667539596558, "step": 3831 }, { "epoch": 2.799634703196347, "grad_norm": 14.987142011043323, "learning_rate": 1.2481586578736446e-07, "logits/chosen": -2.857825756072998, "logits/rejected": -2.116981029510498, "logps/chosen": -933.574951171875, "logps/rejected": -543.7156982421875, "loss": 0.0787, "rewards/accuracies": 1.0, "rewards/chosen": 4.225392818450928, "rewards/margins": 5.588865280151367, "rewards/rejected": -1.3634722232818604, "step": 3832 }, { "epoch": 2.800365296803653, "grad_norm": 11.034517404074377, "learning_rate": 1.2467782457167773e-07, "logits/chosen": -2.7183523178100586, "logits/rejected": -2.4107396602630615, "logps/chosen": -602.7830810546875, "logps/rejected": -604.9227294921875, "loss": 0.0559, "rewards/accuracies": 1.0, "rewards/chosen": 5.165683746337891, "rewards/margins": 6.987732410430908, "rewards/rejected": -1.8220494985580444, "step": 3833 }, { "epoch": 2.801095890410959, "grad_norm": 4.850322177843636, "learning_rate": 1.2453983437024814e-07, "logits/chosen": -2.4875216484069824, "logits/rejected": -1.7975316047668457, "logps/chosen": -644.4025268554688, "logps/rejected": -410.0498352050781, "loss": 0.0209, "rewards/accuracies": 1.0, "rewards/chosen": 4.356741905212402, "rewards/margins": 7.072936534881592, "rewards/rejected": -2.7161948680877686, "step": 3834 }, { "epoch": 2.8018264840182647, "grad_norm": 8.913389750577673, "learning_rate": 1.244018952392469e-07, "logits/chosen": -2.597472667694092, "logits/rejected": -2.1648921966552734, "logps/chosen": -621.538330078125, "logps/rejected": -640.475830078125, "loss": 0.0396, "rewards/accuracies": 1.0, "rewards/chosen": 1.5898666381835938, "rewards/margins": 4.385130405426025, "rewards/rejected": -2.7952635288238525, "step": 3835 }, { "epoch": 2.802557077625571, "grad_norm": 14.725566735233931, "learning_rate": 1.2426400723482377e-07, "logits/chosen": -2.7576544284820557, "logits/rejected": -2.5702359676361084, "logps/chosen": -588.3464965820312, "logps/rejected": -513.4239501953125, "loss": 0.0792, "rewards/accuracies": 1.0, "rewards/chosen": 3.0279390811920166, "rewards/margins": 4.273159980773926, "rewards/rejected": -1.2452208995819092, "step": 3836 }, { "epoch": 2.8032876712328765, "grad_norm": 15.662687946236172, "learning_rate": 1.2412617041310844e-07, "logits/chosen": -2.9749560356140137, "logits/rejected": -2.2578930854797363, "logps/chosen": -518.2501831054688, "logps/rejected": -440.175537109375, "loss": 0.0833, "rewards/accuracies": 0.875, "rewards/chosen": 2.6524648666381836, "rewards/margins": 4.782236099243164, "rewards/rejected": -2.1297712326049805, "step": 3837 }, { "epoch": 2.804018264840183, "grad_norm": 8.483115086873951, "learning_rate": 1.2398838483020918e-07, "logits/chosen": -2.9035918712615967, "logits/rejected": -2.4917407035827637, "logps/chosen": -642.2445068359375, "logps/rejected": -651.8048095703125, "loss": 0.0329, "rewards/accuracies": 1.0, "rewards/chosen": 4.1543498039245605, "rewards/margins": 6.8443756103515625, "rewards/rejected": -2.690025806427002, "step": 3838 }, { "epoch": 2.8047488584474887, "grad_norm": 11.959145619166517, "learning_rate": 1.2385065054221394e-07, "logits/chosen": -3.258566379547119, "logits/rejected": -2.9096791744232178, "logps/chosen": -679.2456665039062, "logps/rejected": -679.2241821289062, "loss": 0.0377, "rewards/accuracies": 1.0, "rewards/chosen": 4.6298112869262695, "rewards/margins": 6.7996978759765625, "rewards/rejected": -2.169886350631714, "step": 3839 }, { "epoch": 2.8054794520547945, "grad_norm": 9.565158848338344, "learning_rate": 1.2371296760518935e-07, "logits/chosen": -3.1071879863739014, "logits/rejected": -1.7513329982757568, "logps/chosen": -902.424072265625, "logps/rejected": -563.1555786132812, "loss": 0.0491, "rewards/accuracies": 1.0, "rewards/chosen": 5.1795148849487305, "rewards/margins": 6.05846643447876, "rewards/rejected": -0.8789514303207397, "step": 3840 }, { "epoch": 2.8062100456621004, "grad_norm": 8.402247954620119, "learning_rate": 1.2357533607518124e-07, "logits/chosen": -2.665160655975342, "logits/rejected": -2.2652530670166016, "logps/chosen": -607.3670654296875, "logps/rejected": -513.9888305664062, "loss": 0.0343, "rewards/accuracies": 1.0, "rewards/chosen": 3.188598394393921, "rewards/margins": 5.125107765197754, "rewards/rejected": -1.936509132385254, "step": 3841 }, { "epoch": 2.8069406392694063, "grad_norm": 16.66178324045638, "learning_rate": 1.2343775600821475e-07, "logits/chosen": -2.799549102783203, "logits/rejected": -2.439751148223877, "logps/chosen": -554.8206787109375, "logps/rejected": -452.9519958496094, "loss": 0.0862, "rewards/accuracies": 1.0, "rewards/chosen": 2.697032928466797, "rewards/margins": 5.819502353668213, "rewards/rejected": -3.122469663619995, "step": 3842 }, { "epoch": 2.807671232876712, "grad_norm": 8.888581106367464, "learning_rate": 1.2330022746029377e-07, "logits/chosen": -2.8608334064483643, "logits/rejected": -2.3002305030822754, "logps/chosen": -784.431396484375, "logps/rejected": -542.5608520507812, "loss": 0.0469, "rewards/accuracies": 1.0, "rewards/chosen": 3.9482579231262207, "rewards/margins": 5.838290214538574, "rewards/rejected": -1.8900322914123535, "step": 3843 }, { "epoch": 2.808401826484018, "grad_norm": 10.442075902698411, "learning_rate": 1.2316275048740134e-07, "logits/chosen": -2.9421346187591553, "logits/rejected": -2.3897392749786377, "logps/chosen": -618.2559814453125, "logps/rejected": -468.13916015625, "loss": 0.0577, "rewards/accuracies": 1.0, "rewards/chosen": 2.7873425483703613, "rewards/margins": 4.235265731811523, "rewards/rejected": -1.4479236602783203, "step": 3844 }, { "epoch": 2.8091324200913244, "grad_norm": 8.885310242087472, "learning_rate": 1.230253251454996e-07, "logits/chosen": -2.6778337955474854, "logits/rejected": -1.9924070835113525, "logps/chosen": -865.5018310546875, "logps/rejected": -557.1658325195312, "loss": 0.0555, "rewards/accuracies": 1.0, "rewards/chosen": 4.098793983459473, "rewards/margins": 5.699963092803955, "rewards/rejected": -1.6011688709259033, "step": 3845 }, { "epoch": 2.8098630136986302, "grad_norm": 9.118463552785444, "learning_rate": 1.2288795149052945e-07, "logits/chosen": -2.8479483127593994, "logits/rejected": -2.2894303798675537, "logps/chosen": -600.274169921875, "logps/rejected": -365.637939453125, "loss": 0.0445, "rewards/accuracies": 1.0, "rewards/chosen": 3.7660508155822754, "rewards/margins": 4.191422462463379, "rewards/rejected": -0.42537179589271545, "step": 3846 }, { "epoch": 2.810593607305936, "grad_norm": 11.343003429720042, "learning_rate": 1.2275062957841105e-07, "logits/chosen": -2.8294858932495117, "logits/rejected": -1.4955697059631348, "logps/chosen": -407.7340393066406, "logps/rejected": -366.1131591796875, "loss": 0.089, "rewards/accuracies": 0.875, "rewards/chosen": 1.2786402702331543, "rewards/margins": 3.4891879558563232, "rewards/rejected": -2.21054744720459, "step": 3847 }, { "epoch": 2.811324200913242, "grad_norm": 7.879047140999834, "learning_rate": 1.226133594650432e-07, "logits/chosen": -3.103296995162964, "logits/rejected": -2.1341552734375, "logps/chosen": -818.7136840820312, "logps/rejected": -554.5028076171875, "loss": 0.0259, "rewards/accuracies": 1.0, "rewards/chosen": 4.895334720611572, "rewards/margins": 7.120515823364258, "rewards/rejected": -2.2251808643341064, "step": 3848 }, { "epoch": 2.812054794520548, "grad_norm": 8.088549736560957, "learning_rate": 1.224761412063038e-07, "logits/chosen": -2.9679372310638428, "logits/rejected": -2.7908005714416504, "logps/chosen": -613.0567016601562, "logps/rejected": -713.193603515625, "loss": 0.026, "rewards/accuracies": 1.0, "rewards/chosen": 3.5074520111083984, "rewards/margins": 4.575976371765137, "rewards/rejected": -1.0685242414474487, "step": 3849 }, { "epoch": 2.8127853881278537, "grad_norm": 9.568282230437635, "learning_rate": 1.2233897485804944e-07, "logits/chosen": -2.7951693534851074, "logits/rejected": -2.4562902450561523, "logps/chosen": -884.4907836914062, "logps/rejected": -846.4933471679688, "loss": 0.0518, "rewards/accuracies": 1.0, "rewards/chosen": 3.6410651206970215, "rewards/margins": 5.549566268920898, "rewards/rejected": -1.9085016250610352, "step": 3850 }, { "epoch": 2.8135159817351596, "grad_norm": 8.993513949055478, "learning_rate": 1.222018604761159e-07, "logits/chosen": -2.919400930404663, "logits/rejected": -1.4918819665908813, "logps/chosen": -650.3028564453125, "logps/rejected": -299.20819091796875, "loss": 0.0353, "rewards/accuracies": 1.0, "rewards/chosen": 4.04498815536499, "rewards/margins": 7.099967002868652, "rewards/rejected": -3.054979085922241, "step": 3851 }, { "epoch": 2.814246575342466, "grad_norm": 16.570818039494817, "learning_rate": 1.2206479811631772e-07, "logits/chosen": -2.583477020263672, "logits/rejected": -2.8299858570098877, "logps/chosen": -478.19305419921875, "logps/rejected": -471.645263671875, "loss": 0.0691, "rewards/accuracies": 1.0, "rewards/chosen": 2.8719358444213867, "rewards/margins": 4.433185577392578, "rewards/rejected": -1.5612497329711914, "step": 3852 }, { "epoch": 2.814977168949772, "grad_norm": 8.731648410235088, "learning_rate": 1.2192778783444786e-07, "logits/chosen": -3.3652234077453613, "logits/rejected": -2.28528094291687, "logps/chosen": -465.0150451660156, "logps/rejected": -311.6742248535156, "loss": 0.0371, "rewards/accuracies": 1.0, "rewards/chosen": 3.0183193683624268, "rewards/margins": 5.214014053344727, "rewards/rejected": -2.1956946849823, "step": 3853 }, { "epoch": 2.8157077625570777, "grad_norm": 11.747160872918135, "learning_rate": 1.217908296862787e-07, "logits/chosen": -2.2270894050598145, "logits/rejected": -1.9932491779327393, "logps/chosen": -841.4371948242188, "logps/rejected": -868.521484375, "loss": 0.0669, "rewards/accuracies": 1.0, "rewards/chosen": 3.4592385292053223, "rewards/margins": 5.15325403213501, "rewards/rejected": -1.6940152645111084, "step": 3854 }, { "epoch": 2.8164383561643835, "grad_norm": 16.348460348354482, "learning_rate": 1.216539237275608e-07, "logits/chosen": -2.2747340202331543, "logits/rejected": -1.8915562629699707, "logps/chosen": -383.31036376953125, "logps/rejected": -322.25213623046875, "loss": 0.0876, "rewards/accuracies": 0.875, "rewards/chosen": 3.6761021614074707, "rewards/margins": 7.688957691192627, "rewards/rejected": -4.0128560066223145, "step": 3855 }, { "epoch": 2.8171689497716894, "grad_norm": 10.122426400589513, "learning_rate": 1.2151707001402406e-07, "logits/chosen": -3.441683053970337, "logits/rejected": -2.511082649230957, "logps/chosen": -673.6680908203125, "logps/rejected": -480.6170349121094, "loss": 0.0569, "rewards/accuracies": 0.875, "rewards/chosen": 2.439063549041748, "rewards/margins": 4.062013626098633, "rewards/rejected": -1.6229500770568848, "step": 3856 }, { "epoch": 2.8178995433789953, "grad_norm": 7.939505015974336, "learning_rate": 1.2138026860137668e-07, "logits/chosen": -2.643425941467285, "logits/rejected": -1.3086529970169067, "logps/chosen": -577.6109619140625, "logps/rejected": -362.3687744140625, "loss": 0.0267, "rewards/accuracies": 1.0, "rewards/chosen": 4.609151363372803, "rewards/margins": 8.768622398376465, "rewards/rejected": -4.159470558166504, "step": 3857 }, { "epoch": 2.818630136986301, "grad_norm": 10.814650128860755, "learning_rate": 1.2124351954530562e-07, "logits/chosen": -2.478107213973999, "logits/rejected": -1.5333184003829956, "logps/chosen": -802.1406860351562, "logps/rejected": -373.7185974121094, "loss": 0.0356, "rewards/accuracies": 1.0, "rewards/chosen": 5.483471870422363, "rewards/margins": 8.472880363464355, "rewards/rejected": -2.989408493041992, "step": 3858 }, { "epoch": 2.8193607305936075, "grad_norm": 10.778185776616299, "learning_rate": 1.211068229014768e-07, "logits/chosen": -3.572589635848999, "logits/rejected": -1.8233766555786133, "logps/chosen": -525.8729248046875, "logps/rejected": -220.26043701171875, "loss": 0.0323, "rewards/accuracies": 1.0, "rewards/chosen": 3.122645616531372, "rewards/margins": 4.919286727905273, "rewards/rejected": -1.7966405153274536, "step": 3859 }, { "epoch": 2.8200913242009134, "grad_norm": 8.549833186792164, "learning_rate": 1.2097017872553448e-07, "logits/chosen": -2.7217438220977783, "logits/rejected": -2.287468194961548, "logps/chosen": -828.5885009765625, "logps/rejected": -599.7352905273438, "loss": 0.0388, "rewards/accuracies": 1.0, "rewards/chosen": 3.9453325271606445, "rewards/margins": 5.426603317260742, "rewards/rejected": -1.4812707901000977, "step": 3860 }, { "epoch": 2.8208219178082192, "grad_norm": 7.844504027580118, "learning_rate": 1.2083358707310185e-07, "logits/chosen": -2.400510311126709, "logits/rejected": -1.9668338298797607, "logps/chosen": -594.857666015625, "logps/rejected": -414.5052795410156, "loss": 0.0241, "rewards/accuracies": 1.0, "rewards/chosen": 4.0925469398498535, "rewards/margins": 6.8993377685546875, "rewards/rejected": -2.806790828704834, "step": 3861 }, { "epoch": 2.821552511415525, "grad_norm": 6.162297255951154, "learning_rate": 1.206970479997805e-07, "logits/chosen": -2.8072848320007324, "logits/rejected": -1.5021286010742188, "logps/chosen": -616.9893798828125, "logps/rejected": -350.60650634765625, "loss": 0.025, "rewards/accuracies": 1.0, "rewards/chosen": 3.861114740371704, "rewards/margins": 8.354341506958008, "rewards/rejected": -4.493226528167725, "step": 3862 }, { "epoch": 2.822283105022831, "grad_norm": 13.517007922385343, "learning_rate": 1.2056056156115058e-07, "logits/chosen": -3.192795753479004, "logits/rejected": -2.5532405376434326, "logps/chosen": -654.9720458984375, "logps/rejected": -476.32281494140625, "loss": 0.0571, "rewards/accuracies": 1.0, "rewards/chosen": 3.860170364379883, "rewards/margins": 4.1057047843933105, "rewards/rejected": -0.2455342710018158, "step": 3863 }, { "epoch": 2.823013698630137, "grad_norm": 21.589345833434255, "learning_rate": 1.204241278127711e-07, "logits/chosen": -2.6432242393493652, "logits/rejected": -2.1900198459625244, "logps/chosen": -505.1936340332031, "logps/rejected": -507.01495361328125, "loss": 0.0772, "rewards/accuracies": 1.0, "rewards/chosen": 3.715825080871582, "rewards/margins": 5.71180534362793, "rewards/rejected": -1.995980143547058, "step": 3864 }, { "epoch": 2.8237442922374427, "grad_norm": 7.0049252212965865, "learning_rate": 1.2028774681017945e-07, "logits/chosen": -3.102139949798584, "logits/rejected": -2.310610055923462, "logps/chosen": -674.094970703125, "logps/rejected": -538.14111328125, "loss": 0.0572, "rewards/accuracies": 1.0, "rewards/chosen": 3.469536304473877, "rewards/margins": 4.673077583312988, "rewards/rejected": -1.2035412788391113, "step": 3865 }, { "epoch": 2.824474885844749, "grad_norm": 8.846064719577244, "learning_rate": 1.2015141860889144e-07, "logits/chosen": -2.957522392272949, "logits/rejected": -1.6950377225875854, "logps/chosen": -499.7720031738281, "logps/rejected": -273.52618408203125, "loss": 0.0503, "rewards/accuracies": 1.0, "rewards/chosen": 2.89233136177063, "rewards/margins": 5.5993852615356445, "rewards/rejected": -2.7070541381835938, "step": 3866 }, { "epoch": 2.825205479452055, "grad_norm": 16.308956302107823, "learning_rate": 1.2001514326440146e-07, "logits/chosen": -2.4744014739990234, "logits/rejected": -2.631784439086914, "logps/chosen": -587.0259399414062, "logps/rejected": -646.3878173828125, "loss": 0.0687, "rewards/accuracies": 1.0, "rewards/chosen": 2.246943950653076, "rewards/margins": 3.762434482574463, "rewards/rejected": -1.5154907703399658, "step": 3867 }, { "epoch": 2.825936073059361, "grad_norm": 6.854495735346887, "learning_rate": 1.1987892083218257e-07, "logits/chosen": -2.1024677753448486, "logits/rejected": -2.4223368167877197, "logps/chosen": -245.203857421875, "logps/rejected": -391.48541259765625, "loss": 0.0296, "rewards/accuracies": 1.0, "rewards/chosen": 1.6544982194900513, "rewards/margins": 5.420331954956055, "rewards/rejected": -3.765833616256714, "step": 3868 }, { "epoch": 2.8266666666666667, "grad_norm": 25.213594384713875, "learning_rate": 1.1974275136768598e-07, "logits/chosen": -3.1903090476989746, "logits/rejected": -2.2548887729644775, "logps/chosen": -633.7294311523438, "logps/rejected": -471.28399658203125, "loss": 0.1212, "rewards/accuracies": 1.0, "rewards/chosen": 3.173100471496582, "rewards/margins": 6.261632919311523, "rewards/rejected": -3.0885326862335205, "step": 3869 }, { "epoch": 2.8273972602739725, "grad_norm": 9.010945340267744, "learning_rate": 1.1960663492634166e-07, "logits/chosen": -2.8473665714263916, "logits/rejected": -2.0160341262817383, "logps/chosen": -628.73828125, "logps/rejected": -525.7275390625, "loss": 0.0462, "rewards/accuracies": 1.0, "rewards/chosen": 1.9975429773330688, "rewards/margins": 6.807671070098877, "rewards/rejected": -4.810128688812256, "step": 3870 }, { "epoch": 2.8281278538812784, "grad_norm": 8.347176866060812, "learning_rate": 1.1947057156355776e-07, "logits/chosen": -2.7970964908599854, "logits/rejected": -2.17183256149292, "logps/chosen": -510.70111083984375, "logps/rejected": -338.65087890625, "loss": 0.0348, "rewards/accuracies": 1.0, "rewards/chosen": 3.5072989463806152, "rewards/margins": 7.125370502471924, "rewards/rejected": -3.6180715560913086, "step": 3871 }, { "epoch": 2.8288584474885843, "grad_norm": 8.797253995221766, "learning_rate": 1.193345613347208e-07, "logits/chosen": -2.67413592338562, "logits/rejected": -2.3882579803466797, "logps/chosen": -784.4082641601562, "logps/rejected": -879.0773315429688, "loss": 0.0189, "rewards/accuracies": 1.0, "rewards/chosen": 3.6494383811950684, "rewards/margins": 5.603157043457031, "rewards/rejected": -1.9537183046340942, "step": 3872 }, { "epoch": 2.8295890410958906, "grad_norm": 9.16044067722066, "learning_rate": 1.191986042951959e-07, "logits/chosen": -2.9537765979766846, "logits/rejected": -2.1880884170532227, "logps/chosen": -899.0540771484375, "logps/rejected": -718.4349365234375, "loss": 0.0503, "rewards/accuracies": 1.0, "rewards/chosen": 4.143795013427734, "rewards/margins": 4.505382537841797, "rewards/rejected": -0.36158791184425354, "step": 3873 }, { "epoch": 2.8303196347031965, "grad_norm": 8.89459515306974, "learning_rate": 1.1906270050032641e-07, "logits/chosen": -3.0296645164489746, "logits/rejected": -1.918209433555603, "logps/chosen": -750.4969482421875, "logps/rejected": -457.0639953613281, "loss": 0.0412, "rewards/accuracies": 1.0, "rewards/chosen": 5.333080291748047, "rewards/margins": 6.1899309158325195, "rewards/rejected": -0.8568508625030518, "step": 3874 }, { "epoch": 2.8310502283105023, "grad_norm": 17.009702718515268, "learning_rate": 1.1892685000543381e-07, "logits/chosen": -2.6019082069396973, "logits/rejected": -1.9982655048370361, "logps/chosen": -592.9114990234375, "logps/rejected": -531.8541259765625, "loss": 0.0665, "rewards/accuracies": 0.875, "rewards/chosen": 1.1681984663009644, "rewards/margins": 4.315448760986328, "rewards/rejected": -3.147250175476074, "step": 3875 }, { "epoch": 2.831780821917808, "grad_norm": 11.055513578896463, "learning_rate": 1.1879105286581831e-07, "logits/chosen": -2.5761194229125977, "logits/rejected": -1.224483847618103, "logps/chosen": -749.6172485351562, "logps/rejected": -335.3331298828125, "loss": 0.0634, "rewards/accuracies": 1.0, "rewards/chosen": 5.123539924621582, "rewards/margins": 10.016626358032227, "rewards/rejected": -4.8930864334106445, "step": 3876 }, { "epoch": 2.832511415525114, "grad_norm": 9.374263480183307, "learning_rate": 1.1865530913675795e-07, "logits/chosen": -2.7665586471557617, "logits/rejected": -2.339733600616455, "logps/chosen": -271.7435607910156, "logps/rejected": -275.54815673828125, "loss": 0.0532, "rewards/accuracies": 1.0, "rewards/chosen": 0.8052855134010315, "rewards/margins": 4.74676513671875, "rewards/rejected": -3.9414796829223633, "step": 3877 }, { "epoch": 2.83324200913242, "grad_norm": 15.161708202899923, "learning_rate": 1.185196188735095e-07, "logits/chosen": -2.704817295074463, "logits/rejected": -1.8832846879959106, "logps/chosen": -694.8533325195312, "logps/rejected": -423.0845642089844, "loss": 0.0748, "rewards/accuracies": 1.0, "rewards/chosen": 3.3610897064208984, "rewards/margins": 4.159262180328369, "rewards/rejected": -0.7981722950935364, "step": 3878 }, { "epoch": 2.833972602739726, "grad_norm": 8.034000623327398, "learning_rate": 1.1838398213130735e-07, "logits/chosen": -2.873915195465088, "logits/rejected": -2.0234034061431885, "logps/chosen": -820.4750366210938, "logps/rejected": -540.1560668945312, "loss": 0.0395, "rewards/accuracies": 1.0, "rewards/chosen": 5.182500839233398, "rewards/margins": 6.356647491455078, "rewards/rejected": -1.174147367477417, "step": 3879 }, { "epoch": 2.834703196347032, "grad_norm": 8.101196163065115, "learning_rate": 1.1824839896536464e-07, "logits/chosen": -3.2681221961975098, "logits/rejected": -3.0818142890930176, "logps/chosen": -589.8621826171875, "logps/rejected": -613.1181030273438, "loss": 0.0303, "rewards/accuracies": 1.0, "rewards/chosen": 2.338592529296875, "rewards/margins": 4.062050819396973, "rewards/rejected": -1.7234582901000977, "step": 3880 }, { "epoch": 2.835433789954338, "grad_norm": 9.707431298346096, "learning_rate": 1.1811286943087259e-07, "logits/chosen": -2.6263058185577393, "logits/rejected": -2.295640468597412, "logps/chosen": -601.2022705078125, "logps/rejected": -475.9468994140625, "loss": 0.0399, "rewards/accuracies": 1.0, "rewards/chosen": 2.912707567214966, "rewards/margins": 6.049571990966797, "rewards/rejected": -3.136864423751831, "step": 3881 }, { "epoch": 2.836164383561644, "grad_norm": 11.936015890647804, "learning_rate": 1.1797739358300034e-07, "logits/chosen": -2.3457589149475098, "logits/rejected": -2.2778000831604004, "logps/chosen": -530.9508666992188, "logps/rejected": -567.8471069335938, "loss": 0.0931, "rewards/accuracies": 1.0, "rewards/chosen": 2.724778890609741, "rewards/margins": 2.811710834503174, "rewards/rejected": -0.08693177998065948, "step": 3882 }, { "epoch": 2.83689497716895, "grad_norm": 7.9734267928260705, "learning_rate": 1.1784197147689565e-07, "logits/chosen": -3.433478593826294, "logits/rejected": -2.583564281463623, "logps/chosen": -1219.332763671875, "logps/rejected": -747.8114013671875, "loss": 0.0251, "rewards/accuracies": 1.0, "rewards/chosen": 4.610982894897461, "rewards/margins": 5.350689888000488, "rewards/rejected": -0.7397071719169617, "step": 3883 }, { "epoch": 2.8376255707762557, "grad_norm": 18.6720204622995, "learning_rate": 1.1770660316768371e-07, "logits/chosen": -2.8951830863952637, "logits/rejected": -2.471405506134033, "logps/chosen": -669.4694213867188, "logps/rejected": -659.16455078125, "loss": 0.0737, "rewards/accuracies": 1.0, "rewards/chosen": 3.0965449810028076, "rewards/margins": 4.730443954467773, "rewards/rejected": -1.633899211883545, "step": 3884 }, { "epoch": 2.8383561643835615, "grad_norm": 9.867859782001672, "learning_rate": 1.1757128871046849e-07, "logits/chosen": -2.775074005126953, "logits/rejected": -2.3497016429901123, "logps/chosen": -628.4966430664062, "logps/rejected": -638.46533203125, "loss": 0.0392, "rewards/accuracies": 1.0, "rewards/chosen": 4.801117897033691, "rewards/margins": 6.347867012023926, "rewards/rejected": -1.5467491149902344, "step": 3885 }, { "epoch": 2.8390867579908674, "grad_norm": 8.975121338430517, "learning_rate": 1.1743602816033155e-07, "logits/chosen": -2.6867198944091797, "logits/rejected": -2.163982629776001, "logps/chosen": -507.771728515625, "logps/rejected": -615.2678833007812, "loss": 0.0402, "rewards/accuracies": 1.0, "rewards/chosen": 1.8565254211425781, "rewards/margins": 6.050689697265625, "rewards/rejected": -4.194164276123047, "step": 3886 }, { "epoch": 2.8398173515981737, "grad_norm": 7.8069162174584665, "learning_rate": 1.173008215723329e-07, "logits/chosen": -2.911607265472412, "logits/rejected": -2.186042547225952, "logps/chosen": -759.4046630859375, "logps/rejected": -560.3040771484375, "loss": 0.0333, "rewards/accuracies": 1.0, "rewards/chosen": 3.663170099258423, "rewards/margins": 5.039608478546143, "rewards/rejected": -1.3764386177062988, "step": 3887 }, { "epoch": 2.8405479452054796, "grad_norm": 22.324737868526867, "learning_rate": 1.1716566900151035e-07, "logits/chosen": -2.1959025859832764, "logits/rejected": -2.5140905380249023, "logps/chosen": -725.579833984375, "logps/rejected": -834.4169921875, "loss": 0.0718, "rewards/accuracies": 1.0, "rewards/chosen": 3.226480484008789, "rewards/margins": 4.647744178771973, "rewards/rejected": -1.4212632179260254, "step": 3888 }, { "epoch": 2.8412785388127855, "grad_norm": 10.737940921989683, "learning_rate": 1.1703057050287962e-07, "logits/chosen": -2.517923593521118, "logits/rejected": -1.6846139430999756, "logps/chosen": -511.6504211425781, "logps/rejected": -293.6324768066406, "loss": 0.0483, "rewards/accuracies": 1.0, "rewards/chosen": 4.554025650024414, "rewards/margins": 7.794365882873535, "rewards/rejected": -3.240339994430542, "step": 3889 }, { "epoch": 2.8420091324200913, "grad_norm": 7.967326916052577, "learning_rate": 1.1689552613143474e-07, "logits/chosen": -2.3046727180480957, "logits/rejected": -2.149184465408325, "logps/chosen": -579.5975341796875, "logps/rejected": -659.7630615234375, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": 3.427949905395508, "rewards/margins": 4.226114273071289, "rewards/rejected": -0.798164427280426, "step": 3890 }, { "epoch": 2.842739726027397, "grad_norm": 6.175026390853271, "learning_rate": 1.1676053594214752e-07, "logits/chosen": -2.3255205154418945, "logits/rejected": -1.4753780364990234, "logps/chosen": -671.9207763671875, "logps/rejected": -417.934326171875, "loss": 0.0291, "rewards/accuracies": 1.0, "rewards/chosen": 4.452671527862549, "rewards/margins": 7.923069477081299, "rewards/rejected": -3.47039794921875, "step": 3891 }, { "epoch": 2.843470319634703, "grad_norm": 12.251263611685442, "learning_rate": 1.1662559998996755e-07, "logits/chosen": -3.031644821166992, "logits/rejected": -2.088775873184204, "logps/chosen": -612.5983276367188, "logps/rejected": -547.5068969726562, "loss": 0.0426, "rewards/accuracies": 1.0, "rewards/chosen": 3.9035227298736572, "rewards/margins": 5.945014953613281, "rewards/rejected": -2.041492462158203, "step": 3892 }, { "epoch": 2.844200913242009, "grad_norm": 14.349261519407323, "learning_rate": 1.1649071832982273e-07, "logits/chosen": -3.0451560020446777, "logits/rejected": -2.057374954223633, "logps/chosen": -484.25787353515625, "logps/rejected": -425.70501708984375, "loss": 0.0666, "rewards/accuracies": 1.0, "rewards/chosen": 4.392274856567383, "rewards/margins": 7.230240345001221, "rewards/rejected": -2.837965488433838, "step": 3893 }, { "epoch": 2.8449315068493153, "grad_norm": 10.656106092090978, "learning_rate": 1.1635589101661847e-07, "logits/chosen": -3.1805996894836426, "logits/rejected": -2.234271764755249, "logps/chosen": -639.449951171875, "logps/rejected": -413.7044982910156, "loss": 0.0659, "rewards/accuracies": 0.875, "rewards/chosen": 4.176538467407227, "rewards/margins": 6.2778239250183105, "rewards/rejected": -2.101285457611084, "step": 3894 }, { "epoch": 2.845662100456621, "grad_norm": 6.327683120181793, "learning_rate": 1.1622111810523844e-07, "logits/chosen": -2.7382490634918213, "logits/rejected": -2.4539170265197754, "logps/chosen": -694.5034790039062, "logps/rejected": -665.2861328125, "loss": 0.0234, "rewards/accuracies": 1.0, "rewards/chosen": 3.7527194023132324, "rewards/margins": 5.1802825927734375, "rewards/rejected": -1.4275634288787842, "step": 3895 }, { "epoch": 2.846392694063927, "grad_norm": 10.96895205754509, "learning_rate": 1.1608639965054382e-07, "logits/chosen": -2.503023862838745, "logits/rejected": -1.8333498239517212, "logps/chosen": -626.2998046875, "logps/rejected": -470.578857421875, "loss": 0.0448, "rewards/accuracies": 1.0, "rewards/chosen": 3.2864952087402344, "rewards/margins": 5.506511688232422, "rewards/rejected": -2.2200162410736084, "step": 3896 }, { "epoch": 2.847123287671233, "grad_norm": 13.13325691265457, "learning_rate": 1.1595173570737371e-07, "logits/chosen": -2.7853095531463623, "logits/rejected": -2.5881829261779785, "logps/chosen": -480.85498046875, "logps/rejected": -445.0628662109375, "loss": 0.0806, "rewards/accuracies": 0.875, "rewards/chosen": 2.5667436122894287, "rewards/margins": 4.764683723449707, "rewards/rejected": -2.197939872741699, "step": 3897 }, { "epoch": 2.847853881278539, "grad_norm": 8.813687291888636, "learning_rate": 1.1581712633054522e-07, "logits/chosen": -2.7890143394470215, "logits/rejected": -1.9701815843582153, "logps/chosen": -449.7823486328125, "logps/rejected": -385.01995849609375, "loss": 0.034, "rewards/accuracies": 1.0, "rewards/chosen": 2.766247272491455, "rewards/margins": 5.0243706703186035, "rewards/rejected": -2.2581236362457275, "step": 3898 }, { "epoch": 2.8485844748858447, "grad_norm": 8.2324726143899, "learning_rate": 1.1568257157485295e-07, "logits/chosen": -2.7435357570648193, "logits/rejected": -2.245591878890991, "logps/chosen": -820.4598388671875, "logps/rejected": -623.1201171875, "loss": 0.0278, "rewards/accuracies": 1.0, "rewards/chosen": 3.1065714359283447, "rewards/margins": 4.560533046722412, "rewards/rejected": -1.4539616107940674, "step": 3899 }, { "epoch": 2.8493150684931505, "grad_norm": 8.175735991125327, "learning_rate": 1.1554807149506968e-07, "logits/chosen": -2.9492011070251465, "logits/rejected": -1.974225401878357, "logps/chosen": -745.30908203125, "logps/rejected": -542.6871337890625, "loss": 0.0318, "rewards/accuracies": 1.0, "rewards/chosen": 3.317854166030884, "rewards/margins": 6.9869914054870605, "rewards/rejected": -3.669137477874756, "step": 3900 }, { "epoch": 2.850045662100457, "grad_norm": 12.054667880128234, "learning_rate": 1.154136261459453e-07, "logits/chosen": -2.6121597290039062, "logits/rejected": -2.4447693824768066, "logps/chosen": -624.4085693359375, "logps/rejected": -585.4982299804688, "loss": 0.0537, "rewards/accuracies": 1.0, "rewards/chosen": 4.267273902893066, "rewards/margins": 6.136266231536865, "rewards/rejected": -1.868992805480957, "step": 3901 }, { "epoch": 2.8507762557077627, "grad_norm": 12.442291716920373, "learning_rate": 1.1527923558220806e-07, "logits/chosen": -2.831268072128296, "logits/rejected": -1.6879746913909912, "logps/chosen": -642.3138427734375, "logps/rejected": -360.32763671875, "loss": 0.0498, "rewards/accuracies": 1.0, "rewards/chosen": 4.675832748413086, "rewards/margins": 9.028298377990723, "rewards/rejected": -4.352466583251953, "step": 3902 }, { "epoch": 2.8515068493150686, "grad_norm": 21.43630844675031, "learning_rate": 1.1514489985856348e-07, "logits/chosen": -2.9440979957580566, "logits/rejected": -2.494374990463257, "logps/chosen": -682.7361450195312, "logps/rejected": -533.7930908203125, "loss": 0.0906, "rewards/accuracies": 1.0, "rewards/chosen": 2.7143232822418213, "rewards/margins": 5.103704452514648, "rewards/rejected": -2.389381170272827, "step": 3903 }, { "epoch": 2.8522374429223745, "grad_norm": 9.60613631588859, "learning_rate": 1.1501061902969509e-07, "logits/chosen": -2.732264280319214, "logits/rejected": -1.592841625213623, "logps/chosen": -979.9385986328125, "logps/rejected": -527.8822021484375, "loss": 0.0322, "rewards/accuracies": 1.0, "rewards/chosen": 5.930185317993164, "rewards/margins": 6.437654972076416, "rewards/rejected": -0.5074697732925415, "step": 3904 }, { "epoch": 2.8529680365296803, "grad_norm": 12.800073170452789, "learning_rate": 1.1487639315026379e-07, "logits/chosen": -2.694589614868164, "logits/rejected": -1.9710404872894287, "logps/chosen": -538.994140625, "logps/rejected": -387.9017639160156, "loss": 0.0452, "rewards/accuracies": 1.0, "rewards/chosen": 1.9312679767608643, "rewards/margins": 5.357199668884277, "rewards/rejected": -3.425931930541992, "step": 3905 }, { "epoch": 2.853698630136986, "grad_norm": 10.914526654410798, "learning_rate": 1.1474222227490815e-07, "logits/chosen": -2.3488454818725586, "logits/rejected": -1.9789389371871948, "logps/chosen": -637.4755859375, "logps/rejected": -719.355712890625, "loss": 0.0425, "rewards/accuracies": 1.0, "rewards/chosen": 3.612938642501831, "rewards/margins": 4.590103626251221, "rewards/rejected": -0.9771651029586792, "step": 3906 }, { "epoch": 2.854429223744292, "grad_norm": 11.239048353207766, "learning_rate": 1.1460810645824459e-07, "logits/chosen": -3.0591816902160645, "logits/rejected": -2.715977907180786, "logps/chosen": -623.5951538085938, "logps/rejected": -611.70654296875, "loss": 0.0369, "rewards/accuracies": 1.0, "rewards/chosen": 3.8617138862609863, "rewards/margins": 5.352553367614746, "rewards/rejected": -1.4908397197723389, "step": 3907 }, { "epoch": 2.8551598173515984, "grad_norm": 13.669107978922879, "learning_rate": 1.1447404575486678e-07, "logits/chosen": -2.628767251968384, "logits/rejected": -1.5118861198425293, "logps/chosen": -521.2601318359375, "logps/rejected": -367.48614501953125, "loss": 0.0485, "rewards/accuracies": 1.0, "rewards/chosen": 3.4288065433502197, "rewards/margins": 6.193272590637207, "rewards/rejected": -2.7644660472869873, "step": 3908 }, { "epoch": 2.855890410958904, "grad_norm": 20.140036326492112, "learning_rate": 1.1434004021934632e-07, "logits/chosen": -2.442420482635498, "logits/rejected": -2.5519611835479736, "logps/chosen": -448.4839782714844, "logps/rejected": -457.5364074707031, "loss": 0.0935, "rewards/accuracies": 1.0, "rewards/chosen": 1.6177375316619873, "rewards/margins": 3.559861898422241, "rewards/rejected": -1.942124366760254, "step": 3909 }, { "epoch": 2.85662100456621, "grad_norm": 9.449535653145006, "learning_rate": 1.1420608990623204e-07, "logits/chosen": -2.7416234016418457, "logits/rejected": -2.504452705383301, "logps/chosen": -284.8866882324219, "logps/rejected": -330.89508056640625, "loss": 0.0362, "rewards/accuracies": 1.0, "rewards/chosen": 1.3950802087783813, "rewards/margins": 5.032552719116211, "rewards/rejected": -3.637472152709961, "step": 3910 }, { "epoch": 2.857351598173516, "grad_norm": 9.653180372104663, "learning_rate": 1.1407219487005032e-07, "logits/chosen": -2.4391531944274902, "logits/rejected": -2.6006240844726562, "logps/chosen": -767.1504516601562, "logps/rejected": -774.0274047851562, "loss": 0.0319, "rewards/accuracies": 1.0, "rewards/chosen": 5.502730846405029, "rewards/margins": 6.283602714538574, "rewards/rejected": -0.7808719873428345, "step": 3911 }, { "epoch": 2.858082191780822, "grad_norm": 21.59502754903892, "learning_rate": 1.1393835516530531e-07, "logits/chosen": -2.614731550216675, "logits/rejected": -1.8813461065292358, "logps/chosen": -633.5116577148438, "logps/rejected": -447.18682861328125, "loss": 0.1323, "rewards/accuracies": 1.0, "rewards/chosen": 2.558654546737671, "rewards/margins": 3.8732194900512695, "rewards/rejected": -1.3145651817321777, "step": 3912 }, { "epoch": 2.8588127853881278, "grad_norm": 7.828878339811541, "learning_rate": 1.138045708464784e-07, "logits/chosen": -2.6408002376556396, "logits/rejected": -2.744621992111206, "logps/chosen": -518.10693359375, "logps/rejected": -573.876220703125, "loss": 0.0307, "rewards/accuracies": 1.0, "rewards/chosen": 2.859635829925537, "rewards/margins": 5.263771057128906, "rewards/rejected": -2.404135227203369, "step": 3913 }, { "epoch": 2.8595433789954336, "grad_norm": 8.495954111639836, "learning_rate": 1.1367084196802834e-07, "logits/chosen": -2.4739675521850586, "logits/rejected": -2.162071943283081, "logps/chosen": -662.346923828125, "logps/rejected": -716.807861328125, "loss": 0.0326, "rewards/accuracies": 1.0, "rewards/chosen": 2.593440532684326, "rewards/margins": 3.759884834289551, "rewards/rejected": -1.1664445400238037, "step": 3914 }, { "epoch": 2.86027397260274, "grad_norm": 12.582993525954734, "learning_rate": 1.1353716858439169e-07, "logits/chosen": -2.8466057777404785, "logits/rejected": -2.221921443939209, "logps/chosen": -434.054443359375, "logps/rejected": -501.1424865722656, "loss": 0.0461, "rewards/accuracies": 1.0, "rewards/chosen": 3.1553914546966553, "rewards/margins": 6.752357006072998, "rewards/rejected": -3.5969655513763428, "step": 3915 }, { "epoch": 2.8610045662100454, "grad_norm": 13.630737776404532, "learning_rate": 1.1340355074998201e-07, "logits/chosen": -2.4749014377593994, "logits/rejected": -2.119694709777832, "logps/chosen": -423.4371032714844, "logps/rejected": -498.0625, "loss": 0.0586, "rewards/accuracies": 0.875, "rewards/chosen": 1.9286317825317383, "rewards/margins": 3.501152276992798, "rewards/rejected": -1.5725204944610596, "step": 3916 }, { "epoch": 2.8617351598173517, "grad_norm": 10.870577910641556, "learning_rate": 1.1326998851919065e-07, "logits/chosen": -2.460057020187378, "logits/rejected": -1.806999921798706, "logps/chosen": -492.16448974609375, "logps/rejected": -359.0338134765625, "loss": 0.0424, "rewards/accuracies": 1.0, "rewards/chosen": 3.1679766178131104, "rewards/margins": 5.932392597198486, "rewards/rejected": -2.764415740966797, "step": 3917 }, { "epoch": 2.8624657534246576, "grad_norm": 7.196053101306049, "learning_rate": 1.1313648194638578e-07, "logits/chosen": -3.368473529815674, "logits/rejected": -2.526646614074707, "logps/chosen": -1563.7467041015625, "logps/rejected": -878.7366943359375, "loss": 0.0236, "rewards/accuracies": 1.0, "rewards/chosen": 6.580835819244385, "rewards/margins": 5.871424674987793, "rewards/rejected": 0.7094115018844604, "step": 3918 }, { "epoch": 2.8631963470319635, "grad_norm": 5.7876169147587815, "learning_rate": 1.1300303108591352e-07, "logits/chosen": -2.801640510559082, "logits/rejected": -2.0637481212615967, "logps/chosen": -693.4638061523438, "logps/rejected": -612.233642578125, "loss": 0.0243, "rewards/accuracies": 1.0, "rewards/chosen": 4.043405055999756, "rewards/margins": 5.4302520751953125, "rewards/rejected": -1.3868471384048462, "step": 3919 }, { "epoch": 2.8639269406392693, "grad_norm": 12.047373403884233, "learning_rate": 1.128696359920968e-07, "logits/chosen": -2.7255806922912598, "logits/rejected": -2.0733797550201416, "logps/chosen": -492.21673583984375, "logps/rejected": -512.516845703125, "loss": 0.0463, "rewards/accuracies": 1.0, "rewards/chosen": 2.743617057800293, "rewards/margins": 6.550416469573975, "rewards/rejected": -3.8067996501922607, "step": 3920 }, { "epoch": 2.864657534246575, "grad_norm": 17.690444108165874, "learning_rate": 1.1273629671923635e-07, "logits/chosen": -3.2720720767974854, "logits/rejected": -2.236969470977783, "logps/chosen": -441.4543151855469, "logps/rejected": -275.999267578125, "loss": 0.0423, "rewards/accuracies": 1.0, "rewards/chosen": 4.107763290405273, "rewards/margins": 8.313624382019043, "rewards/rejected": -4.2058610916137695, "step": 3921 }, { "epoch": 2.8653881278538815, "grad_norm": 13.587019593730984, "learning_rate": 1.126030133216097e-07, "logits/chosen": -2.9113285541534424, "logits/rejected": -2.3831522464752197, "logps/chosen": -650.8082885742188, "logps/rejected": -648.00341796875, "loss": 0.0518, "rewards/accuracies": 1.0, "rewards/chosen": 2.7567145824432373, "rewards/margins": 4.249565124511719, "rewards/rejected": -1.4928503036499023, "step": 3922 }, { "epoch": 2.866118721461187, "grad_norm": 10.529086923743858, "learning_rate": 1.1246978585347183e-07, "logits/chosen": -2.712376594543457, "logits/rejected": -2.7755606174468994, "logps/chosen": -883.89306640625, "logps/rejected": -803.8883056640625, "loss": 0.0442, "rewards/accuracies": 1.0, "rewards/chosen": 4.794274806976318, "rewards/margins": 5.541265487670898, "rewards/rejected": -0.7469907999038696, "step": 3923 }, { "epoch": 2.8668493150684933, "grad_norm": 5.579271727587204, "learning_rate": 1.1233661436905514e-07, "logits/chosen": -3.0544543266296387, "logits/rejected": -2.6137564182281494, "logps/chosen": -511.7379455566406, "logps/rejected": -564.8082275390625, "loss": 0.0206, "rewards/accuracies": 1.0, "rewards/chosen": 3.415119171142578, "rewards/margins": 7.808038711547852, "rewards/rejected": -4.392920017242432, "step": 3924 }, { "epoch": 2.867579908675799, "grad_norm": 13.835780344780826, "learning_rate": 1.1220349892256884e-07, "logits/chosen": -3.0677199363708496, "logits/rejected": -2.2918224334716797, "logps/chosen": -741.1405029296875, "logps/rejected": -533.6128540039062, "loss": 0.0573, "rewards/accuracies": 1.0, "rewards/chosen": 3.67563796043396, "rewards/margins": 7.021961212158203, "rewards/rejected": -3.346323013305664, "step": 3925 }, { "epoch": 2.868310502283105, "grad_norm": 22.732095568717646, "learning_rate": 1.1207043956819979e-07, "logits/chosen": -3.1686244010925293, "logits/rejected": -1.9987080097198486, "logps/chosen": -671.6029052734375, "logps/rejected": -429.8056335449219, "loss": 0.0627, "rewards/accuracies": 1.0, "rewards/chosen": 3.2582387924194336, "rewards/margins": 4.8001933097839355, "rewards/rejected": -1.5419543981552124, "step": 3926 }, { "epoch": 2.869041095890411, "grad_norm": 9.490537247237953, "learning_rate": 1.1193743636011164e-07, "logits/chosen": -2.241989850997925, "logits/rejected": -2.578141689300537, "logps/chosen": -430.62213134765625, "logps/rejected": -505.76812744140625, "loss": 0.0419, "rewards/accuracies": 1.0, "rewards/chosen": 1.432487964630127, "rewards/margins": 4.270177364349365, "rewards/rejected": -2.8376896381378174, "step": 3927 }, { "epoch": 2.8697716894977168, "grad_norm": 11.594335659483281, "learning_rate": 1.1180448935244527e-07, "logits/chosen": -2.600348949432373, "logits/rejected": -2.484116554260254, "logps/chosen": -576.0616455078125, "logps/rejected": -604.1563720703125, "loss": 0.0347, "rewards/accuracies": 1.0, "rewards/chosen": 3.3009085655212402, "rewards/margins": 7.71010684967041, "rewards/rejected": -4.40919828414917, "step": 3928 }, { "epoch": 2.870502283105023, "grad_norm": 18.6959531923752, "learning_rate": 1.1167159859931891e-07, "logits/chosen": -2.7670300006866455, "logits/rejected": -2.487367868423462, "logps/chosen": -352.0650634765625, "logps/rejected": -357.33056640625, "loss": 0.1029, "rewards/accuracies": 1.0, "rewards/chosen": 1.6885420083999634, "rewards/margins": 4.0808258056640625, "rewards/rejected": -2.3922836780548096, "step": 3929 }, { "epoch": 2.8712328767123285, "grad_norm": 15.502391015493888, "learning_rate": 1.115387641548276e-07, "logits/chosen": -3.087632179260254, "logits/rejected": -2.5496459007263184, "logps/chosen": -688.2772216796875, "logps/rejected": -552.7866821289062, "loss": 0.0665, "rewards/accuracies": 1.0, "rewards/chosen": 2.705010175704956, "rewards/margins": 4.947575569152832, "rewards/rejected": -2.242565393447876, "step": 3930 }, { "epoch": 2.871963470319635, "grad_norm": 20.070001548535146, "learning_rate": 1.1140598607304364e-07, "logits/chosen": -3.265376567840576, "logits/rejected": -2.5841259956359863, "logps/chosen": -587.5225830078125, "logps/rejected": -570.7506103515625, "loss": 0.1283, "rewards/accuracies": 0.875, "rewards/chosen": 2.2739460468292236, "rewards/margins": 3.195380926132202, "rewards/rejected": -0.9214349985122681, "step": 3931 }, { "epoch": 2.8726940639269407, "grad_norm": 16.645115664611083, "learning_rate": 1.1127326440801618e-07, "logits/chosen": -2.1146907806396484, "logits/rejected": -2.5070929527282715, "logps/chosen": -436.14068603515625, "logps/rejected": -659.540771484375, "loss": 0.051, "rewards/accuracies": 1.0, "rewards/chosen": 1.1039198637008667, "rewards/margins": 4.408575057983398, "rewards/rejected": -3.304654836654663, "step": 3932 }, { "epoch": 2.8734246575342466, "grad_norm": 6.501835783899114, "learning_rate": 1.1114059921377166e-07, "logits/chosen": -2.68162202835083, "logits/rejected": -1.8879783153533936, "logps/chosen": -645.06396484375, "logps/rejected": -457.842529296875, "loss": 0.0236, "rewards/accuracies": 1.0, "rewards/chosen": 4.735391616821289, "rewards/margins": 7.927981376647949, "rewards/rejected": -3.192589282989502, "step": 3933 }, { "epoch": 2.8741552511415525, "grad_norm": 12.495920965318417, "learning_rate": 1.1100799054431351e-07, "logits/chosen": -2.521636486053467, "logits/rejected": -1.6321625709533691, "logps/chosen": -863.075439453125, "logps/rejected": -661.5420532226562, "loss": 0.0351, "rewards/accuracies": 1.0, "rewards/chosen": 3.386168956756592, "rewards/margins": 7.333338737487793, "rewards/rejected": -3.947169780731201, "step": 3934 }, { "epoch": 2.8748858447488583, "grad_norm": 9.804312819658636, "learning_rate": 1.1087543845362199e-07, "logits/chosen": -2.488884925842285, "logits/rejected": -1.9204500913619995, "logps/chosen": -543.62841796875, "logps/rejected": -532.3126831054688, "loss": 0.0398, "rewards/accuracies": 1.0, "rewards/chosen": 4.84837007522583, "rewards/margins": 7.090230941772461, "rewards/rejected": -2.24186110496521, "step": 3935 }, { "epoch": 2.8756164383561647, "grad_norm": 21.78883770388701, "learning_rate": 1.1074294299565437e-07, "logits/chosen": -1.932815432548523, "logits/rejected": -2.3155465126037598, "logps/chosen": -424.8038024902344, "logps/rejected": -653.2371826171875, "loss": 0.081, "rewards/accuracies": 0.875, "rewards/chosen": 0.32725632190704346, "rewards/margins": 6.50148868560791, "rewards/rejected": -6.174232006072998, "step": 3936 }, { "epoch": 2.87634703196347, "grad_norm": 11.416674388873606, "learning_rate": 1.1061050422434484e-07, "logits/chosen": -2.9669206142425537, "logits/rejected": -2.670515298843384, "logps/chosen": -784.670166015625, "logps/rejected": -667.0938720703125, "loss": 0.0482, "rewards/accuracies": 1.0, "rewards/chosen": 3.610318422317505, "rewards/margins": 4.586672782897949, "rewards/rejected": -0.9763542413711548, "step": 3937 }, { "epoch": 2.8770776255707764, "grad_norm": 9.36580155835935, "learning_rate": 1.1047812219360476e-07, "logits/chosen": -3.1628127098083496, "logits/rejected": -2.547739028930664, "logps/chosen": -1091.0682373046875, "logps/rejected": -953.1964721679688, "loss": 0.0375, "rewards/accuracies": 1.0, "rewards/chosen": 5.459053039550781, "rewards/margins": 4.600955963134766, "rewards/rejected": 0.8580970764160156, "step": 3938 }, { "epoch": 2.8778082191780823, "grad_norm": 11.858600415251107, "learning_rate": 1.1034579695732205e-07, "logits/chosen": -2.7095260620117188, "logits/rejected": -1.9428399801254272, "logps/chosen": -909.4025268554688, "logps/rejected": -502.03839111328125, "loss": 0.0584, "rewards/accuracies": 1.0, "rewards/chosen": 4.307791709899902, "rewards/margins": 6.517788887023926, "rewards/rejected": -2.2099967002868652, "step": 3939 }, { "epoch": 2.878538812785388, "grad_norm": 8.660075640270982, "learning_rate": 1.1021352856936164e-07, "logits/chosen": -2.629495859146118, "logits/rejected": -2.000361442565918, "logps/chosen": -496.9178161621094, "logps/rejected": -430.3605041503906, "loss": 0.0264, "rewards/accuracies": 1.0, "rewards/chosen": 3.734039545059204, "rewards/margins": 5.663361072540283, "rewards/rejected": -1.929321527481079, "step": 3940 }, { "epoch": 2.879269406392694, "grad_norm": 15.07768376945088, "learning_rate": 1.1008131708356552e-07, "logits/chosen": -2.8401336669921875, "logits/rejected": -1.8938876390457153, "logps/chosen": -694.9503173828125, "logps/rejected": -404.6142883300781, "loss": 0.0485, "rewards/accuracies": 1.0, "rewards/chosen": 3.0368764400482178, "rewards/margins": 4.1734795570373535, "rewards/rejected": -1.1366032361984253, "step": 3941 }, { "epoch": 2.88, "grad_norm": 10.530932684852308, "learning_rate": 1.0994916255375214e-07, "logits/chosen": -3.1012275218963623, "logits/rejected": -1.9139429330825806, "logps/chosen": -566.0728149414062, "logps/rejected": -424.9624328613281, "loss": 0.0396, "rewards/accuracies": 1.0, "rewards/chosen": 3.6260952949523926, "rewards/margins": 6.081806182861328, "rewards/rejected": -2.4557108879089355, "step": 3942 }, { "epoch": 2.880730593607306, "grad_norm": 12.187757283942961, "learning_rate": 1.0981706503371716e-07, "logits/chosen": -2.978797674179077, "logits/rejected": -2.1664960384368896, "logps/chosen": -854.167724609375, "logps/rejected": -631.7840576171875, "loss": 0.0463, "rewards/accuracies": 1.0, "rewards/chosen": 5.403563499450684, "rewards/margins": 5.5707502365112305, "rewards/rejected": -0.167186439037323, "step": 3943 }, { "epoch": 2.8814611872146116, "grad_norm": 15.238688792294383, "learning_rate": 1.0968502457723277e-07, "logits/chosen": -3.0275628566741943, "logits/rejected": -2.641232490539551, "logps/chosen": -951.812255859375, "logps/rejected": -649.2388916015625, "loss": 0.0421, "rewards/accuracies": 1.0, "rewards/chosen": 2.8104279041290283, "rewards/margins": 4.362785816192627, "rewards/rejected": -1.552357792854309, "step": 3944 }, { "epoch": 2.882191780821918, "grad_norm": 21.83559718540712, "learning_rate": 1.0955304123804787e-07, "logits/chosen": -2.4330639839172363, "logits/rejected": -2.17580509185791, "logps/chosen": -646.7797241210938, "logps/rejected": -552.0946655273438, "loss": 0.0765, "rewards/accuracies": 1.0, "rewards/chosen": 4.60294771194458, "rewards/margins": 6.777096271514893, "rewards/rejected": -2.1741480827331543, "step": 3945 }, { "epoch": 2.882922374429224, "grad_norm": 8.22081333266885, "learning_rate": 1.0942111506988847e-07, "logits/chosen": -2.6711933612823486, "logits/rejected": -1.7136542797088623, "logps/chosen": -463.0662841796875, "logps/rejected": -338.47381591796875, "loss": 0.0467, "rewards/accuracies": 1.0, "rewards/chosen": 2.502394914627075, "rewards/margins": 6.4237542152404785, "rewards/rejected": -3.921359062194824, "step": 3946 }, { "epoch": 2.8836529680365297, "grad_norm": 10.217764804023043, "learning_rate": 1.0928924612645687e-07, "logits/chosen": -2.7682044506073, "logits/rejected": -1.9489970207214355, "logps/chosen": -702.9161376953125, "logps/rejected": -562.1665649414062, "loss": 0.0254, "rewards/accuracies": 1.0, "rewards/chosen": 3.9351212978363037, "rewards/margins": 5.659549236297607, "rewards/rejected": -1.7244281768798828, "step": 3947 }, { "epoch": 2.8843835616438356, "grad_norm": 12.338643623382747, "learning_rate": 1.0915743446143258e-07, "logits/chosen": -3.1083157062530518, "logits/rejected": -2.644421339035034, "logps/chosen": -659.3138427734375, "logps/rejected": -586.409912109375, "loss": 0.0389, "rewards/accuracies": 1.0, "rewards/chosen": 3.5833897590637207, "rewards/margins": 5.175848960876465, "rewards/rejected": -1.5924592018127441, "step": 3948 }, { "epoch": 2.8851141552511415, "grad_norm": 9.602982343910716, "learning_rate": 1.0902568012847113e-07, "logits/chosen": -2.8639142513275146, "logits/rejected": -2.126267910003662, "logps/chosen": -448.9453125, "logps/rejected": -491.2508239746094, "loss": 0.0516, "rewards/accuracies": 1.0, "rewards/chosen": 3.2996625900268555, "rewards/margins": 6.559280872344971, "rewards/rejected": -3.2596182823181152, "step": 3949 }, { "epoch": 2.8858447488584473, "grad_norm": 8.569608758566655, "learning_rate": 1.0889398318120524e-07, "logits/chosen": -2.2147021293640137, "logits/rejected": -2.4534363746643066, "logps/chosen": -404.8007507324219, "logps/rejected": -623.5629272460938, "loss": 0.0522, "rewards/accuracies": 1.0, "rewards/chosen": 3.647317409515381, "rewards/margins": 5.775638580322266, "rewards/rejected": -2.1283209323883057, "step": 3950 }, { "epoch": 2.886575342465753, "grad_norm": 11.999995549519031, "learning_rate": 1.0876234367324425e-07, "logits/chosen": -2.749039649963379, "logits/rejected": -1.699453592300415, "logps/chosen": -493.5972900390625, "logps/rejected": -244.96524047851562, "loss": 0.0644, "rewards/accuracies": 1.0, "rewards/chosen": 2.5151920318603516, "rewards/margins": 5.062363147735596, "rewards/rejected": -2.547170877456665, "step": 3951 }, { "epoch": 2.8873059360730595, "grad_norm": 14.307920297931252, "learning_rate": 1.0863076165817383e-07, "logits/chosen": -2.5195155143737793, "logits/rejected": -1.541845679283142, "logps/chosen": -587.5031127929688, "logps/rejected": -343.0517272949219, "loss": 0.0772, "rewards/accuracies": 0.875, "rewards/chosen": 3.9249768257141113, "rewards/margins": 6.640229225158691, "rewards/rejected": -2.715251922607422, "step": 3952 }, { "epoch": 2.8880365296803654, "grad_norm": 11.142790553135026, "learning_rate": 1.0849923718955648e-07, "logits/chosen": -2.6310713291168213, "logits/rejected": -2.6571195125579834, "logps/chosen": -476.2947998046875, "logps/rejected": -501.5503234863281, "loss": 0.045, "rewards/accuracies": 1.0, "rewards/chosen": 3.111293077468872, "rewards/margins": 5.284970283508301, "rewards/rejected": -2.1736769676208496, "step": 3953 }, { "epoch": 2.8887671232876713, "grad_norm": 15.509976037681461, "learning_rate": 1.0836777032093102e-07, "logits/chosen": -3.6553590297698975, "logits/rejected": -2.151139974594116, "logps/chosen": -547.5904541015625, "logps/rejected": -307.82525634765625, "loss": 0.0489, "rewards/accuracies": 1.0, "rewards/chosen": 3.684098958969116, "rewards/margins": 6.690247058868408, "rewards/rejected": -3.006148099899292, "step": 3954 }, { "epoch": 2.889497716894977, "grad_norm": 12.021765361511036, "learning_rate": 1.0823636110581321e-07, "logits/chosen": -3.1363985538482666, "logits/rejected": -3.0266010761260986, "logps/chosen": -538.0411987304688, "logps/rejected": -588.5354614257812, "loss": 0.0578, "rewards/accuracies": 1.0, "rewards/chosen": 2.12619686126709, "rewards/margins": 3.522097587585449, "rewards/rejected": -1.3959004878997803, "step": 3955 }, { "epoch": 2.890228310502283, "grad_norm": 6.006925083293351, "learning_rate": 1.0810500959769498e-07, "logits/chosen": -3.3706741333007812, "logits/rejected": -2.1918344497680664, "logps/chosen": -529.136474609375, "logps/rejected": -308.7482604980469, "loss": 0.0273, "rewards/accuracies": 1.0, "rewards/chosen": 2.612764358520508, "rewards/margins": 4.876598834991455, "rewards/rejected": -2.2638344764709473, "step": 3956 }, { "epoch": 2.890958904109589, "grad_norm": 9.947025364330424, "learning_rate": 1.0797371585004503e-07, "logits/chosen": -2.6428158283233643, "logits/rejected": -2.898294448852539, "logps/chosen": -646.5585327148438, "logps/rejected": -823.0567016601562, "loss": 0.0491, "rewards/accuracies": 1.0, "rewards/chosen": 2.38191556930542, "rewards/margins": 4.808413505554199, "rewards/rejected": -2.4264976978302, "step": 3957 }, { "epoch": 2.8916894977168948, "grad_norm": 7.754237093461311, "learning_rate": 1.0784247991630841e-07, "logits/chosen": -2.8153491020202637, "logits/rejected": -1.6990063190460205, "logps/chosen": -528.776123046875, "logps/rejected": -251.74588012695312, "loss": 0.0303, "rewards/accuracies": 1.0, "rewards/chosen": 2.3771228790283203, "rewards/margins": 4.8941240310668945, "rewards/rejected": -2.517000675201416, "step": 3958 }, { "epoch": 2.892420091324201, "grad_norm": 7.672494028138204, "learning_rate": 1.0771130184990652e-07, "logits/chosen": -2.673041582107544, "logits/rejected": -1.923957109451294, "logps/chosen": -842.8862915039062, "logps/rejected": -639.4927368164062, "loss": 0.0404, "rewards/accuracies": 1.0, "rewards/chosen": 3.6173365116119385, "rewards/margins": 5.339791774749756, "rewards/rejected": -1.7224550247192383, "step": 3959 }, { "epoch": 2.893150684931507, "grad_norm": 8.242379107666196, "learning_rate": 1.0758018170423755e-07, "logits/chosen": -3.082857847213745, "logits/rejected": -1.868296504020691, "logps/chosen": -839.0829467773438, "logps/rejected": -443.89874267578125, "loss": 0.0312, "rewards/accuracies": 1.0, "rewards/chosen": 3.6083242893218994, "rewards/margins": 7.892512321472168, "rewards/rejected": -4.2841877937316895, "step": 3960 }, { "epoch": 2.893881278538813, "grad_norm": 11.232449553576139, "learning_rate": 1.0744911953267583e-07, "logits/chosen": -2.558532238006592, "logits/rejected": -2.1790997982025146, "logps/chosen": -397.9217834472656, "logps/rejected": -404.5379943847656, "loss": 0.0652, "rewards/accuracies": 0.875, "rewards/chosen": 2.4008655548095703, "rewards/margins": 5.628121852874756, "rewards/rejected": -3.2272562980651855, "step": 3961 }, { "epoch": 2.8946118721461187, "grad_norm": 7.7940250302314915, "learning_rate": 1.0731811538857203e-07, "logits/chosen": -2.6647772789001465, "logits/rejected": -1.9559338092803955, "logps/chosen": -801.642578125, "logps/rejected": -660.8899536132812, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": 3.3933258056640625, "rewards/margins": 4.271576881408691, "rewards/rejected": -0.8782508373260498, "step": 3962 }, { "epoch": 2.8953424657534246, "grad_norm": 10.127496858896237, "learning_rate": 1.0718716932525357e-07, "logits/chosen": -2.631657123565674, "logits/rejected": -1.9047125577926636, "logps/chosen": -467.29156494140625, "logps/rejected": -379.8376159667969, "loss": 0.0485, "rewards/accuracies": 1.0, "rewards/chosen": 1.857704758644104, "rewards/margins": 5.1528215408325195, "rewards/rejected": -3.295116424560547, "step": 3963 }, { "epoch": 2.8960730593607305, "grad_norm": 10.843332771827393, "learning_rate": 1.070562813960238e-07, "logits/chosen": -3.0593748092651367, "logits/rejected": -2.6815669536590576, "logps/chosen": -438.65679931640625, "logps/rejected": -389.7918701171875, "loss": 0.0506, "rewards/accuracies": 1.0, "rewards/chosen": 3.09269118309021, "rewards/margins": 6.289114475250244, "rewards/rejected": -3.196423292160034, "step": 3964 }, { "epoch": 2.8968036529680363, "grad_norm": 14.89275832370489, "learning_rate": 1.0692545165416284e-07, "logits/chosen": -2.5042989253997803, "logits/rejected": -2.2486329078674316, "logps/chosen": -617.7210083007812, "logps/rejected": -615.4778442382812, "loss": 0.0445, "rewards/accuracies": 1.0, "rewards/chosen": 3.574028491973877, "rewards/margins": 5.407608985900879, "rewards/rejected": -1.833580732345581, "step": 3965 }, { "epoch": 2.8975342465753426, "grad_norm": 7.053362311079548, "learning_rate": 1.0679468015292656e-07, "logits/chosen": -2.50736141204834, "logits/rejected": -2.0038440227508545, "logps/chosen": -865.3892211914062, "logps/rejected": -623.9324951171875, "loss": 0.0295, "rewards/accuracies": 1.0, "rewards/chosen": 4.863452911376953, "rewards/margins": 5.516452789306641, "rewards/rejected": -0.6529998779296875, "step": 3966 }, { "epoch": 2.8982648401826485, "grad_norm": 12.071883114844866, "learning_rate": 1.0666396694554761e-07, "logits/chosen": -2.916083812713623, "logits/rejected": -2.407923698425293, "logps/chosen": -644.9887084960938, "logps/rejected": -531.2008056640625, "loss": 0.0581, "rewards/accuracies": 1.0, "rewards/chosen": 2.850766181945801, "rewards/margins": 5.3663763999938965, "rewards/rejected": -2.5156099796295166, "step": 3967 }, { "epoch": 2.8989954337899544, "grad_norm": 8.034603621005811, "learning_rate": 1.0653331208523483e-07, "logits/chosen": -2.708759307861328, "logits/rejected": -2.243894100189209, "logps/chosen": -380.4455871582031, "logps/rejected": -343.3380432128906, "loss": 0.0743, "rewards/accuracies": 0.875, "rewards/chosen": 2.5593068599700928, "rewards/margins": 5.499444961547852, "rewards/rejected": -2.9401378631591797, "step": 3968 }, { "epoch": 2.8997260273972603, "grad_norm": 10.052949721136734, "learning_rate": 1.0640271562517309e-07, "logits/chosen": -2.6041157245635986, "logits/rejected": -2.037231683731079, "logps/chosen": -599.589599609375, "logps/rejected": -481.2740478515625, "loss": 0.0508, "rewards/accuracies": 1.0, "rewards/chosen": 4.56021785736084, "rewards/margins": 7.811749458312988, "rewards/rejected": -3.2515311241149902, "step": 3969 }, { "epoch": 2.900456621004566, "grad_norm": 15.456994724495637, "learning_rate": 1.0627217761852383e-07, "logits/chosen": -2.727414608001709, "logits/rejected": -2.6687028408050537, "logps/chosen": -447.4239501953125, "logps/rejected": -469.54449462890625, "loss": 0.0546, "rewards/accuracies": 1.0, "rewards/chosen": 2.332318067550659, "rewards/margins": 3.675291061401367, "rewards/rejected": -1.3429728746414185, "step": 3970 }, { "epoch": 2.901187214611872, "grad_norm": 21.13992558779292, "learning_rate": 1.0614169811842417e-07, "logits/chosen": -2.951427698135376, "logits/rejected": -2.4096617698669434, "logps/chosen": -603.224609375, "logps/rejected": -562.2783203125, "loss": 0.0556, "rewards/accuracies": 1.0, "rewards/chosen": 4.326441764831543, "rewards/margins": 6.765923023223877, "rewards/rejected": -2.439481735229492, "step": 3971 }, { "epoch": 2.901917808219178, "grad_norm": 10.955134361205722, "learning_rate": 1.0601127717798797e-07, "logits/chosen": -3.0562639236450195, "logits/rejected": -2.4896128177642822, "logps/chosen": -652.3291625976562, "logps/rejected": -647.2394409179688, "loss": 0.0392, "rewards/accuracies": 1.0, "rewards/chosen": 3.03460693359375, "rewards/margins": 5.085662841796875, "rewards/rejected": -2.051055908203125, "step": 3972 }, { "epoch": 2.902648401826484, "grad_norm": 7.411576428056705, "learning_rate": 1.0588091485030488e-07, "logits/chosen": -3.209428071975708, "logits/rejected": -2.361483097076416, "logps/chosen": -948.3068237304688, "logps/rejected": -759.7571411132812, "loss": 0.0318, "rewards/accuracies": 1.0, "rewards/chosen": 7.216468811035156, "rewards/margins": 5.1966657638549805, "rewards/rejected": 2.019803285598755, "step": 3973 }, { "epoch": 2.90337899543379, "grad_norm": 31.132333159735232, "learning_rate": 1.0575061118844098e-07, "logits/chosen": -2.544095754623413, "logits/rejected": -2.3482556343078613, "logps/chosen": -678.748779296875, "logps/rejected": -623.343994140625, "loss": 0.1194, "rewards/accuracies": 1.0, "rewards/chosen": 3.6903836727142334, "rewards/margins": 5.480807304382324, "rewards/rejected": -1.7904243469238281, "step": 3974 }, { "epoch": 2.904109589041096, "grad_norm": 15.30237478139244, "learning_rate": 1.0562036624543822e-07, "logits/chosen": -2.800266742706299, "logits/rejected": -2.503181219100952, "logps/chosen": -581.2135620117188, "logps/rejected": -635.3724365234375, "loss": 0.0906, "rewards/accuracies": 1.0, "rewards/chosen": 2.1799681186676025, "rewards/margins": 3.9202699661254883, "rewards/rejected": -1.7403018474578857, "step": 3975 }, { "epoch": 2.904840182648402, "grad_norm": 7.406781527604028, "learning_rate": 1.0549018007431465e-07, "logits/chosen": -2.80202579498291, "logits/rejected": -2.2652902603149414, "logps/chosen": -771.7913818359375, "logps/rejected": -716.180908203125, "loss": 0.0249, "rewards/accuracies": 1.0, "rewards/chosen": 2.182508945465088, "rewards/margins": 4.614845275878906, "rewards/rejected": -2.4323360919952393, "step": 3976 }, { "epoch": 2.9055707762557077, "grad_norm": 14.298685528289447, "learning_rate": 1.053600527280647e-07, "logits/chosen": -2.722320556640625, "logits/rejected": -2.263218879699707, "logps/chosen": -661.528564453125, "logps/rejected": -566.068115234375, "loss": 0.0398, "rewards/accuracies": 1.0, "rewards/chosen": 3.3865909576416016, "rewards/margins": 6.052607536315918, "rewards/rejected": -2.6660170555114746, "step": 3977 }, { "epoch": 2.9063013698630136, "grad_norm": 25.90335431268386, "learning_rate": 1.0522998425965854e-07, "logits/chosen": -2.584345817565918, "logits/rejected": -1.736244559288025, "logps/chosen": -572.7550659179688, "logps/rejected": -298.39080810546875, "loss": 0.0912, "rewards/accuracies": 1.0, "rewards/chosen": 3.0325822830200195, "rewards/margins": 5.36392879486084, "rewards/rejected": -2.3313469886779785, "step": 3978 }, { "epoch": 2.9070319634703194, "grad_norm": 8.627705467605056, "learning_rate": 1.0509997472204238e-07, "logits/chosen": -2.568800449371338, "logits/rejected": -2.2105090618133545, "logps/chosen": -484.7828063964844, "logps/rejected": -507.95855712890625, "loss": 0.0323, "rewards/accuracies": 1.0, "rewards/chosen": 2.086149215698242, "rewards/margins": 5.915172576904297, "rewards/rejected": -3.8290231227874756, "step": 3979 }, { "epoch": 2.9077625570776258, "grad_norm": 7.3513160202105245, "learning_rate": 1.0497002416813869e-07, "logits/chosen": -2.4458508491516113, "logits/rejected": -2.546704053878784, "logps/chosen": -621.707275390625, "logps/rejected": -623.607421875, "loss": 0.0253, "rewards/accuracies": 1.0, "rewards/chosen": 0.6811493635177612, "rewards/margins": 5.363974571228027, "rewards/rejected": -4.682825088500977, "step": 3980 }, { "epoch": 2.9084931506849316, "grad_norm": 13.972839712592036, "learning_rate": 1.0484013265084566e-07, "logits/chosen": -3.155029296875, "logits/rejected": -1.828057885169983, "logps/chosen": -788.7029418945312, "logps/rejected": -481.36248779296875, "loss": 0.0503, "rewards/accuracies": 1.0, "rewards/chosen": 4.352670669555664, "rewards/margins": 7.01512336730957, "rewards/rejected": -2.662452220916748, "step": 3981 }, { "epoch": 2.9092237442922375, "grad_norm": 38.95892361824422, "learning_rate": 1.0471030022303768e-07, "logits/chosen": -2.9641668796539307, "logits/rejected": -2.0774571895599365, "logps/chosen": -623.0303955078125, "logps/rejected": -685.1029052734375, "loss": 0.1061, "rewards/accuracies": 1.0, "rewards/chosen": 2.6328158378601074, "rewards/margins": 6.237225532531738, "rewards/rejected": -3.60440993309021, "step": 3982 }, { "epoch": 2.9099543378995434, "grad_norm": 9.799021800520208, "learning_rate": 1.0458052693756492e-07, "logits/chosen": -2.31809139251709, "logits/rejected": -2.40059232711792, "logps/chosen": -457.910888671875, "logps/rejected": -419.8929138183594, "loss": 0.0264, "rewards/accuracies": 1.0, "rewards/chosen": 2.377918243408203, "rewards/margins": 6.04591703414917, "rewards/rejected": -3.667999267578125, "step": 3983 }, { "epoch": 2.9106849315068493, "grad_norm": 16.241884992974185, "learning_rate": 1.0445081284725354e-07, "logits/chosen": -2.896076202392578, "logits/rejected": -1.9370770454406738, "logps/chosen": -719.1357421875, "logps/rejected": -452.044921875, "loss": 0.0713, "rewards/accuracies": 1.0, "rewards/chosen": 3.1862850189208984, "rewards/margins": 4.256045341491699, "rewards/rejected": -1.0697605609893799, "step": 3984 }, { "epoch": 2.911415525114155, "grad_norm": 10.770546727945042, "learning_rate": 1.0432115800490546e-07, "logits/chosen": -3.188255548477173, "logits/rejected": -1.8309565782546997, "logps/chosen": -757.0253295898438, "logps/rejected": -431.07928466796875, "loss": 0.0626, "rewards/accuracies": 1.0, "rewards/chosen": 2.928473472595215, "rewards/margins": 5.085757732391357, "rewards/rejected": -2.1572842597961426, "step": 3985 }, { "epoch": 2.912146118721461, "grad_norm": 12.675179950106354, "learning_rate": 1.0419156246329875e-07, "logits/chosen": -2.4322056770324707, "logits/rejected": -1.396857738494873, "logps/chosen": -726.034423828125, "logps/rejected": -437.40850830078125, "loss": 0.0587, "rewards/accuracies": 1.0, "rewards/chosen": 3.8140952587127686, "rewards/margins": 6.893032073974609, "rewards/rejected": -3.078937292098999, "step": 3986 }, { "epoch": 2.9128767123287673, "grad_norm": 12.047643179342497, "learning_rate": 1.0406202627518734e-07, "logits/chosen": -2.3618547916412354, "logits/rejected": -2.008549690246582, "logps/chosen": -416.7643737792969, "logps/rejected": -445.322998046875, "loss": 0.0444, "rewards/accuracies": 1.0, "rewards/chosen": 3.560805559158325, "rewards/margins": 6.560606956481934, "rewards/rejected": -2.9998013973236084, "step": 3987 }, { "epoch": 2.913607305936073, "grad_norm": 9.548999706029013, "learning_rate": 1.0393254949330055e-07, "logits/chosen": -2.903909921646118, "logits/rejected": -2.461899518966675, "logps/chosen": -696.5020141601562, "logps/rejected": -621.9324340820312, "loss": 0.0393, "rewards/accuracies": 1.0, "rewards/chosen": 2.552225351333618, "rewards/margins": 4.108897686004639, "rewards/rejected": -1.556672215461731, "step": 3988 }, { "epoch": 2.914337899543379, "grad_norm": 11.36794456241843, "learning_rate": 1.0380313217034407e-07, "logits/chosen": -2.4285166263580322, "logits/rejected": -1.6392621994018555, "logps/chosen": -930.4915161132812, "logps/rejected": -700.7747192382812, "loss": 0.0499, "rewards/accuracies": 1.0, "rewards/chosen": 4.667752742767334, "rewards/margins": 7.42643404006958, "rewards/rejected": -2.758681297302246, "step": 3989 }, { "epoch": 2.915068493150685, "grad_norm": 16.11739294131062, "learning_rate": 1.0367377435899893e-07, "logits/chosen": -2.8570542335510254, "logits/rejected": -2.688556671142578, "logps/chosen": -708.903564453125, "logps/rejected": -569.72509765625, "loss": 0.0777, "rewards/accuracies": 1.0, "rewards/chosen": 3.3224258422851562, "rewards/margins": 4.374453544616699, "rewards/rejected": -1.0520280599594116, "step": 3990 }, { "epoch": 2.915799086757991, "grad_norm": 6.293612484532929, "learning_rate": 1.0354447611192243e-07, "logits/chosen": -2.6723997592926025, "logits/rejected": -2.237774610519409, "logps/chosen": -652.39599609375, "logps/rejected": -643.7835083007812, "loss": 0.0257, "rewards/accuracies": 1.0, "rewards/chosen": 2.513916492462158, "rewards/margins": 4.528617858886719, "rewards/rejected": -2.0147013664245605, "step": 3991 }, { "epoch": 2.9165296803652967, "grad_norm": 13.524709622973138, "learning_rate": 1.0341523748174719e-07, "logits/chosen": -3.139185667037964, "logits/rejected": -1.8584105968475342, "logps/chosen": -734.2481689453125, "logps/rejected": -479.24969482421875, "loss": 0.0504, "rewards/accuracies": 1.0, "rewards/chosen": 3.583310127258301, "rewards/margins": 6.909080505371094, "rewards/rejected": -3.325770378112793, "step": 3992 }, { "epoch": 2.9172602739726026, "grad_norm": 15.171343965989797, "learning_rate": 1.0328605852108163e-07, "logits/chosen": -2.9125349521636963, "logits/rejected": -1.8379530906677246, "logps/chosen": -719.818359375, "logps/rejected": -405.56298828125, "loss": 0.0714, "rewards/accuracies": 1.0, "rewards/chosen": 4.933257579803467, "rewards/margins": 6.259239196777344, "rewards/rejected": -1.3259817361831665, "step": 3993 }, { "epoch": 2.917990867579909, "grad_norm": 7.670739985334363, "learning_rate": 1.0315693928251018e-07, "logits/chosen": -3.0096218585968018, "logits/rejected": -2.434586524963379, "logps/chosen": -560.81689453125, "logps/rejected": -404.1188659667969, "loss": 0.0369, "rewards/accuracies": 1.0, "rewards/chosen": 4.28933048248291, "rewards/margins": 6.919519424438477, "rewards/rejected": -2.6301887035369873, "step": 3994 }, { "epoch": 2.9187214611872148, "grad_norm": 9.601297894797789, "learning_rate": 1.0302787981859254e-07, "logits/chosen": -3.2827141284942627, "logits/rejected": -2.5888993740081787, "logps/chosen": -370.95526123046875, "logps/rejected": -379.43310546875, "loss": 0.0493, "rewards/accuracies": 1.0, "rewards/chosen": 2.5690550804138184, "rewards/margins": 4.949158668518066, "rewards/rejected": -2.380103588104248, "step": 3995 }, { "epoch": 2.9194520547945206, "grad_norm": 20.06966018062152, "learning_rate": 1.0289888018186446e-07, "logits/chosen": -2.1301097869873047, "logits/rejected": -1.7156639099121094, "logps/chosen": -572.0775146484375, "logps/rejected": -573.943115234375, "loss": 0.0593, "rewards/accuracies": 1.0, "rewards/chosen": 3.6711385250091553, "rewards/margins": 8.384209632873535, "rewards/rejected": -4.713070869445801, "step": 3996 }, { "epoch": 2.9201826484018265, "grad_norm": 15.435514129291155, "learning_rate": 1.0276994042483711e-07, "logits/chosen": -2.8299520015716553, "logits/rejected": -2.6020889282226562, "logps/chosen": -413.15582275390625, "logps/rejected": -490.54791259765625, "loss": 0.0532, "rewards/accuracies": 1.0, "rewards/chosen": 2.055528402328491, "rewards/margins": 4.3105149269104, "rewards/rejected": -2.254986524581909, "step": 3997 }, { "epoch": 2.9209132420091324, "grad_norm": 7.024740639239572, "learning_rate": 1.0264106059999722e-07, "logits/chosen": -2.839172840118408, "logits/rejected": -2.1882967948913574, "logps/chosen": -366.9692077636719, "logps/rejected": -313.76177978515625, "loss": 0.0445, "rewards/accuracies": 1.0, "rewards/chosen": 0.9166768789291382, "rewards/margins": 3.368278741836548, "rewards/rejected": -2.451601982116699, "step": 3998 }, { "epoch": 2.9216438356164383, "grad_norm": 8.011509245798061, "learning_rate": 1.0251224075980744e-07, "logits/chosen": -2.71600604057312, "logits/rejected": -2.6554555892944336, "logps/chosen": -404.5811767578125, "logps/rejected": -543.7918090820312, "loss": 0.0285, "rewards/accuracies": 1.0, "rewards/chosen": 1.967405080795288, "rewards/margins": 4.788872241973877, "rewards/rejected": -2.821467161178589, "step": 3999 }, { "epoch": 2.922374429223744, "grad_norm": 17.568028350306914, "learning_rate": 1.0238348095670568e-07, "logits/chosen": -2.503366470336914, "logits/rejected": -2.1494288444519043, "logps/chosen": -571.7476806640625, "logps/rejected": -539.4432983398438, "loss": 0.0726, "rewards/accuracies": 1.0, "rewards/chosen": 2.248943328857422, "rewards/margins": 4.612128257751465, "rewards/rejected": -2.3631844520568848, "step": 4000 }, { "epoch": 2.9231050228310504, "grad_norm": 8.984153784640723, "learning_rate": 1.0225478124310555e-07, "logits/chosen": -2.5922701358795166, "logits/rejected": -2.4146339893341064, "logps/chosen": -596.3775634765625, "logps/rejected": -544.6434326171875, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": 3.6877975463867188, "rewards/margins": 5.458149433135986, "rewards/rejected": -1.7703521251678467, "step": 4001 }, { "epoch": 2.9238356164383563, "grad_norm": 6.6751672273754865, "learning_rate": 1.0212614167139613e-07, "logits/chosen": -2.9158573150634766, "logits/rejected": -2.0211269855499268, "logps/chosen": -1006.7211303710938, "logps/rejected": -597.0787963867188, "loss": 0.0375, "rewards/accuracies": 1.0, "rewards/chosen": 5.261438369750977, "rewards/margins": 6.477525234222412, "rewards/rejected": -1.2160866260528564, "step": 4002 }, { "epoch": 2.924566210045662, "grad_norm": 9.631637860508283, "learning_rate": 1.0199756229394211e-07, "logits/chosen": -2.9066481590270996, "logits/rejected": -2.698592185974121, "logps/chosen": -395.93572998046875, "logps/rejected": -416.94696044921875, "loss": 0.0407, "rewards/accuracies": 1.0, "rewards/chosen": 1.3885834217071533, "rewards/margins": 3.9621496200561523, "rewards/rejected": -2.573566436767578, "step": 4003 }, { "epoch": 2.925296803652968, "grad_norm": 14.614397427968667, "learning_rate": 1.0186904316308384e-07, "logits/chosen": -2.9698984622955322, "logits/rejected": -2.662653684616089, "logps/chosen": -843.85400390625, "logps/rejected": -693.1288452148438, "loss": 0.0307, "rewards/accuracies": 1.0, "rewards/chosen": 4.706427097320557, "rewards/margins": 6.351879119873047, "rewards/rejected": -1.6454521417617798, "step": 4004 }, { "epoch": 2.926027397260274, "grad_norm": 15.456711895709033, "learning_rate": 1.0174058433113658e-07, "logits/chosen": -3.3310184478759766, "logits/rejected": -2.3352043628692627, "logps/chosen": -583.710205078125, "logps/rejected": -395.13616943359375, "loss": 0.0757, "rewards/accuracies": 1.0, "rewards/chosen": 2.919029712677002, "rewards/margins": 4.87923526763916, "rewards/rejected": -1.9602057933807373, "step": 4005 }, { "epoch": 2.92675799086758, "grad_norm": 17.796105908261136, "learning_rate": 1.0161218585039172e-07, "logits/chosen": -2.498924732208252, "logits/rejected": -1.629321575164795, "logps/chosen": -522.5037841796875, "logps/rejected": -377.43597412109375, "loss": 0.077, "rewards/accuracies": 1.0, "rewards/chosen": 3.4521801471710205, "rewards/margins": 5.446805953979492, "rewards/rejected": -1.9946255683898926, "step": 4006 }, { "epoch": 2.9274885844748857, "grad_norm": 13.837173174105876, "learning_rate": 1.0148384777311553e-07, "logits/chosen": -2.6786954402923584, "logits/rejected": -1.8616870641708374, "logps/chosen": -656.14208984375, "logps/rejected": -462.2906188964844, "loss": 0.0796, "rewards/accuracies": 1.0, "rewards/chosen": 2.569272518157959, "rewards/margins": 6.676562786102295, "rewards/rejected": -4.107291221618652, "step": 4007 }, { "epoch": 2.928219178082192, "grad_norm": 15.22860016749811, "learning_rate": 1.0135557015155017e-07, "logits/chosen": -2.4644265174865723, "logits/rejected": -2.3670036792755127, "logps/chosen": -619.857421875, "logps/rejected": -488.4002990722656, "loss": 0.0809, "rewards/accuracies": 0.875, "rewards/chosen": 2.554300546646118, "rewards/margins": 4.168820858001709, "rewards/rejected": -1.6145204305648804, "step": 4008 }, { "epoch": 2.928949771689498, "grad_norm": 9.084335645151468, "learning_rate": 1.0122735303791283e-07, "logits/chosen": -2.5559310913085938, "logits/rejected": -1.6831104755401611, "logps/chosen": -832.8458862304688, "logps/rejected": -459.2607421875, "loss": 0.0445, "rewards/accuracies": 1.0, "rewards/chosen": 4.165166854858398, "rewards/margins": 5.335251808166504, "rewards/rejected": -1.170085072517395, "step": 4009 }, { "epoch": 2.9296803652968038, "grad_norm": 6.952298713192705, "learning_rate": 1.010991964843961e-07, "logits/chosen": -2.77508544921875, "logits/rejected": -1.9491658210754395, "logps/chosen": -530.6356201171875, "logps/rejected": -515.643798828125, "loss": 0.0319, "rewards/accuracies": 1.0, "rewards/chosen": 3.1583831310272217, "rewards/margins": 5.717772006988525, "rewards/rejected": -2.5593886375427246, "step": 4010 }, { "epoch": 2.9304109589041096, "grad_norm": 22.63805708133913, "learning_rate": 1.0097110054316823e-07, "logits/chosen": -2.8067827224731445, "logits/rejected": -2.505354404449463, "logps/chosen": -809.4511108398438, "logps/rejected": -669.0938110351562, "loss": 0.0819, "rewards/accuracies": 1.0, "rewards/chosen": 2.727810859680176, "rewards/margins": 5.029202461242676, "rewards/rejected": -2.3013916015625, "step": 4011 }, { "epoch": 2.9311415525114155, "grad_norm": 12.718193923078822, "learning_rate": 1.0084306526637238e-07, "logits/chosen": -2.5725297927856445, "logits/rejected": -2.2159831523895264, "logps/chosen": -680.5084228515625, "logps/rejected": -572.4296264648438, "loss": 0.0584, "rewards/accuracies": 1.0, "rewards/chosen": 2.4934680461883545, "rewards/margins": 4.525444030761719, "rewards/rejected": -2.0319762229919434, "step": 4012 }, { "epoch": 2.9318721461187214, "grad_norm": 15.438218073032516, "learning_rate": 1.0071509070612738e-07, "logits/chosen": -1.981144905090332, "logits/rejected": -2.387490749359131, "logps/chosen": -275.0080261230469, "logps/rejected": -340.4193115234375, "loss": 0.11, "rewards/accuracies": 0.875, "rewards/chosen": 0.5722063183784485, "rewards/margins": 2.7220406532287598, "rewards/rejected": -2.149834394454956, "step": 4013 }, { "epoch": 2.9326027397260273, "grad_norm": 9.880884276443716, "learning_rate": 1.0058717691452712e-07, "logits/chosen": -2.552614212036133, "logits/rejected": -1.7375794649124146, "logps/chosen": -482.5660095214844, "logps/rejected": -404.80718994140625, "loss": 0.0438, "rewards/accuracies": 1.0, "rewards/chosen": 3.1590864658355713, "rewards/margins": 4.852496147155762, "rewards/rejected": -1.6934093236923218, "step": 4014 }, { "epoch": 2.9333333333333336, "grad_norm": 19.3708255331552, "learning_rate": 1.0045932394364067e-07, "logits/chosen": -1.958493947982788, "logits/rejected": -2.0643250942230225, "logps/chosen": -614.02392578125, "logps/rejected": -668.1602783203125, "loss": 0.0623, "rewards/accuracies": 1.0, "rewards/chosen": 1.9783939123153687, "rewards/margins": 4.3754658699035645, "rewards/rejected": -2.3970718383789062, "step": 4015 }, { "epoch": 2.934063926940639, "grad_norm": 11.578819511474254, "learning_rate": 1.0033153184551274e-07, "logits/chosen": -2.8677284717559814, "logits/rejected": -2.3617634773254395, "logps/chosen": -965.289306640625, "logps/rejected": -804.3626098632812, "loss": 0.0497, "rewards/accuracies": 1.0, "rewards/chosen": 5.699769020080566, "rewards/margins": 5.755324363708496, "rewards/rejected": -0.05555480718612671, "step": 4016 }, { "epoch": 2.9347945205479453, "grad_norm": 10.612915347253495, "learning_rate": 1.0020380067216285e-07, "logits/chosen": -2.845248222351074, "logits/rejected": -1.882204294204712, "logps/chosen": -861.88916015625, "logps/rejected": -603.20751953125, "loss": 0.0472, "rewards/accuracies": 1.0, "rewards/chosen": 5.381100654602051, "rewards/margins": 6.1673994064331055, "rewards/rejected": -0.7862985134124756, "step": 4017 }, { "epoch": 2.935525114155251, "grad_norm": 17.48669538194279, "learning_rate": 1.0007613047558594e-07, "logits/chosen": -1.968092918395996, "logits/rejected": -2.7254326343536377, "logps/chosen": -435.000732421875, "logps/rejected": -643.552001953125, "loss": 0.0928, "rewards/accuracies": 1.0, "rewards/chosen": 2.139094591140747, "rewards/margins": 3.5077733993530273, "rewards/rejected": -1.3686788082122803, "step": 4018 }, { "epoch": 2.936255707762557, "grad_norm": 5.427326199398983, "learning_rate": 9.994852130775191e-08, "logits/chosen": -2.3659603595733643, "logits/rejected": -1.9481992721557617, "logps/chosen": -304.56536865234375, "logps/rejected": -499.7760314941406, "loss": 0.0363, "rewards/accuracies": 1.0, "rewards/chosen": 3.0893590450286865, "rewards/margins": 8.09121322631836, "rewards/rejected": -5.001854419708252, "step": 4019 }, { "epoch": 2.936986301369863, "grad_norm": 13.442628245636737, "learning_rate": 9.982097322060612e-08, "logits/chosen": -2.398216724395752, "logits/rejected": -2.0585639476776123, "logps/chosen": -768.9688720703125, "logps/rejected": -787.1676025390625, "loss": 0.0677, "rewards/accuracies": 1.0, "rewards/chosen": 1.603358268737793, "rewards/margins": 4.367762088775635, "rewards/rejected": -2.764404058456421, "step": 4020 }, { "epoch": 2.937716894977169, "grad_norm": 11.849759230602206, "learning_rate": 9.969348626606894e-08, "logits/chosen": -3.142967462539673, "logits/rejected": -2.1746134757995605, "logps/chosen": -389.67083740234375, "logps/rejected": -344.2331237792969, "loss": 0.0425, "rewards/accuracies": 1.0, "rewards/chosen": 3.496286153793335, "rewards/margins": 7.461506366729736, "rewards/rejected": -3.965219497680664, "step": 4021 }, { "epoch": 2.938447488584475, "grad_norm": 9.398792643825114, "learning_rate": 9.956606049603581e-08, "logits/chosen": -2.7512447834014893, "logits/rejected": -1.9834400415420532, "logps/chosen": -358.1206359863281, "logps/rejected": -298.99835205078125, "loss": 0.0334, "rewards/accuracies": 1.0, "rewards/chosen": 1.6029572486877441, "rewards/margins": 4.205813884735107, "rewards/rejected": -2.6028568744659424, "step": 4022 }, { "epoch": 2.9391780821917806, "grad_norm": 12.342805541399917, "learning_rate": 9.943869596237722e-08, "logits/chosen": -3.023224353790283, "logits/rejected": -2.4851813316345215, "logps/chosen": -784.8768310546875, "logps/rejected": -824.341064453125, "loss": 0.042, "rewards/accuracies": 1.0, "rewards/chosen": 3.5057263374328613, "rewards/margins": 5.562782287597656, "rewards/rejected": -2.057055950164795, "step": 4023 }, { "epoch": 2.939908675799087, "grad_norm": 10.945952021712003, "learning_rate": 9.931139271693879e-08, "logits/chosen": -2.264338970184326, "logits/rejected": -1.8868459463119507, "logps/chosen": -543.1671142578125, "logps/rejected": -485.0446472167969, "loss": 0.0532, "rewards/accuracies": 1.0, "rewards/chosen": 1.8283374309539795, "rewards/margins": 3.474459171295166, "rewards/rejected": -1.646121621131897, "step": 4024 }, { "epoch": 2.9406392694063928, "grad_norm": 12.267441147390416, "learning_rate": 9.91841508115413e-08, "logits/chosen": -2.9069206714630127, "logits/rejected": -2.832768201828003, "logps/chosen": -932.8756103515625, "logps/rejected": -916.9860229492188, "loss": 0.0808, "rewards/accuracies": 1.0, "rewards/chosen": 4.218405246734619, "rewards/margins": 4.77359676361084, "rewards/rejected": -0.5551921725273132, "step": 4025 }, { "epoch": 2.9413698630136986, "grad_norm": 13.736124146475337, "learning_rate": 9.905697029798044e-08, "logits/chosen": -3.294595241546631, "logits/rejected": -2.2055187225341797, "logps/chosen": -837.1591796875, "logps/rejected": -642.07861328125, "loss": 0.0582, "rewards/accuracies": 1.0, "rewards/chosen": 4.369385719299316, "rewards/margins": 5.902042388916016, "rewards/rejected": -1.5326566696166992, "step": 4026 }, { "epoch": 2.9421004566210045, "grad_norm": 10.714919225946348, "learning_rate": 9.89298512280268e-08, "logits/chosen": -2.9225099086761475, "logits/rejected": -2.5693321228027344, "logps/chosen": -799.309326171875, "logps/rejected": -765.8035888671875, "loss": 0.0475, "rewards/accuracies": 1.0, "rewards/chosen": 4.882809162139893, "rewards/margins": 5.076765060424805, "rewards/rejected": -0.19395571947097778, "step": 4027 }, { "epoch": 2.9428310502283104, "grad_norm": 8.231958255406182, "learning_rate": 9.880279365342625e-08, "logits/chosen": -2.827151298522949, "logits/rejected": -2.851959705352783, "logps/chosen": -690.8013305664062, "logps/rejected": -702.5657958984375, "loss": 0.034, "rewards/accuracies": 1.0, "rewards/chosen": 4.0211029052734375, "rewards/margins": 4.69368839263916, "rewards/rejected": -0.6725854277610779, "step": 4028 }, { "epoch": 2.9435616438356167, "grad_norm": 12.86899107919826, "learning_rate": 9.867579762589936e-08, "logits/chosen": -2.6612813472747803, "logits/rejected": -2.037491798400879, "logps/chosen": -663.051513671875, "logps/rejected": -558.7919921875, "loss": 0.0628, "rewards/accuracies": 1.0, "rewards/chosen": 4.489674091339111, "rewards/margins": 5.399099826812744, "rewards/rejected": -0.9094255566596985, "step": 4029 }, { "epoch": 2.944292237442922, "grad_norm": 15.39353210567596, "learning_rate": 9.854886319714187e-08, "logits/chosen": -3.19865083694458, "logits/rejected": -2.465325355529785, "logps/chosen": -548.3927612304688, "logps/rejected": -403.70086669921875, "loss": 0.0699, "rewards/accuracies": 1.0, "rewards/chosen": 3.134592056274414, "rewards/margins": 3.8972184658050537, "rewards/rejected": -0.7626262903213501, "step": 4030 }, { "epoch": 2.9450228310502284, "grad_norm": 16.215335435059135, "learning_rate": 9.842199041882426e-08, "logits/chosen": -2.6628472805023193, "logits/rejected": -2.0547361373901367, "logps/chosen": -652.6734619140625, "logps/rejected": -431.59912109375, "loss": 0.0992, "rewards/accuracies": 1.0, "rewards/chosen": 4.458591461181641, "rewards/margins": 6.450379371643066, "rewards/rejected": -1.9917879104614258, "step": 4031 }, { "epoch": 2.9457534246575343, "grad_norm": 13.528204383537878, "learning_rate": 9.829517934259191e-08, "logits/chosen": -2.9199070930480957, "logits/rejected": -2.0756566524505615, "logps/chosen": -553.648681640625, "logps/rejected": -518.9014892578125, "loss": 0.0507, "rewards/accuracies": 1.0, "rewards/chosen": 2.9589052200317383, "rewards/margins": 5.782989501953125, "rewards/rejected": -2.8240840435028076, "step": 4032 }, { "epoch": 2.94648401826484, "grad_norm": 7.247854244105549, "learning_rate": 9.81684300200653e-08, "logits/chosen": -2.4173519611358643, "logits/rejected": -2.6950159072875977, "logps/chosen": -473.8667907714844, "logps/rejected": -600.4041748046875, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": 2.594083309173584, "rewards/margins": 5.3461503982543945, "rewards/rejected": -2.7520670890808105, "step": 4033 }, { "epoch": 2.947214611872146, "grad_norm": 9.004103678692323, "learning_rate": 9.804174250283947e-08, "logits/chosen": -3.2970776557922363, "logits/rejected": -2.3195371627807617, "logps/chosen": -1289.568359375, "logps/rejected": -893.9563598632812, "loss": 0.0299, "rewards/accuracies": 1.0, "rewards/chosen": 6.173397064208984, "rewards/margins": 5.207793712615967, "rewards/rejected": 0.9656032919883728, "step": 4034 }, { "epoch": 2.947945205479452, "grad_norm": 8.511958069432746, "learning_rate": 9.791511684248474e-08, "logits/chosen": -2.6194798946380615, "logits/rejected": -2.158860206604004, "logps/chosen": -438.52069091796875, "logps/rejected": -479.05413818359375, "loss": 0.0429, "rewards/accuracies": 1.0, "rewards/chosen": 3.605797052383423, "rewards/margins": 6.8518524169921875, "rewards/rejected": -3.2460551261901855, "step": 4035 }, { "epoch": 2.9486757990867583, "grad_norm": 11.17306627444519, "learning_rate": 9.77885530905456e-08, "logits/chosen": -3.0264878273010254, "logits/rejected": -2.3780906200408936, "logps/chosen": -702.9076538085938, "logps/rejected": -587.4694213867188, "loss": 0.0459, "rewards/accuracies": 1.0, "rewards/chosen": 3.3788580894470215, "rewards/margins": 6.051031589508057, "rewards/rejected": -2.672173500061035, "step": 4036 }, { "epoch": 2.9494063926940637, "grad_norm": 10.769648847743873, "learning_rate": 9.766205129854197e-08, "logits/chosen": -3.050353527069092, "logits/rejected": -2.255739212036133, "logps/chosen": -736.181396484375, "logps/rejected": -537.3175048828125, "loss": 0.0377, "rewards/accuracies": 1.0, "rewards/chosen": 4.175040245056152, "rewards/margins": 3.9903903007507324, "rewards/rejected": 0.18465036153793335, "step": 4037 }, { "epoch": 2.95013698630137, "grad_norm": 10.166719738764417, "learning_rate": 9.753561151796812e-08, "logits/chosen": -2.7135488986968994, "logits/rejected": -2.103881359100342, "logps/chosen": -627.0540161132812, "logps/rejected": -562.4458618164062, "loss": 0.052, "rewards/accuracies": 0.875, "rewards/chosen": 2.9736928939819336, "rewards/margins": 4.946046829223633, "rewards/rejected": -1.972353458404541, "step": 4038 }, { "epoch": 2.950867579908676, "grad_norm": 15.357986092466884, "learning_rate": 9.740923380029342e-08, "logits/chosen": -2.5182607173919678, "logits/rejected": -1.8963193893432617, "logps/chosen": -623.8206176757812, "logps/rejected": -346.4385986328125, "loss": 0.0742, "rewards/accuracies": 1.0, "rewards/chosen": 3.679433822631836, "rewards/margins": 5.914515018463135, "rewards/rejected": -2.2350809574127197, "step": 4039 }, { "epoch": 2.9515981735159817, "grad_norm": 16.212340865460813, "learning_rate": 9.728291819696169e-08, "logits/chosen": -2.8399994373321533, "logits/rejected": -2.0534181594848633, "logps/chosen": -593.319091796875, "logps/rejected": -594.0498046875, "loss": 0.043, "rewards/accuracies": 1.0, "rewards/chosen": 4.842781066894531, "rewards/margins": 8.056079864501953, "rewards/rejected": -3.2132983207702637, "step": 4040 }, { "epoch": 2.9523287671232876, "grad_norm": 8.727701602859408, "learning_rate": 9.715666475939155e-08, "logits/chosen": -2.7012548446655273, "logits/rejected": -2.056382179260254, "logps/chosen": -697.5765991210938, "logps/rejected": -513.8298950195312, "loss": 0.0273, "rewards/accuracies": 1.0, "rewards/chosen": 3.6922125816345215, "rewards/margins": 5.750075340270996, "rewards/rejected": -2.0578629970550537, "step": 4041 }, { "epoch": 2.9530593607305935, "grad_norm": 6.496736734376789, "learning_rate": 9.703047353897645e-08, "logits/chosen": -3.081975221633911, "logits/rejected": -2.609971523284912, "logps/chosen": -738.7908325195312, "logps/rejected": -657.5928344726562, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": 2.9772751331329346, "rewards/margins": 5.050625324249268, "rewards/rejected": -2.073349952697754, "step": 4042 }, { "epoch": 2.9537899543379, "grad_norm": 7.201070430445621, "learning_rate": 9.690434458708424e-08, "logits/chosen": -2.6241297721862793, "logits/rejected": -2.1298623085021973, "logps/chosen": -673.297119140625, "logps/rejected": -549.682373046875, "loss": 0.0294, "rewards/accuracies": 1.0, "rewards/chosen": 3.0108540058135986, "rewards/margins": 4.08289909362793, "rewards/rejected": -1.0720453262329102, "step": 4043 }, { "epoch": 2.9545205479452052, "grad_norm": 25.683810894994608, "learning_rate": 9.677827795505783e-08, "logits/chosen": -2.659447431564331, "logits/rejected": -1.9919288158416748, "logps/chosen": -488.7297668457031, "logps/rejected": -366.56793212890625, "loss": 0.0952, "rewards/accuracies": 1.0, "rewards/chosen": 3.1217563152313232, "rewards/margins": 6.750802040100098, "rewards/rejected": -3.629045248031616, "step": 4044 }, { "epoch": 2.9552511415525116, "grad_norm": 14.597458688788896, "learning_rate": 9.665227369421433e-08, "logits/chosen": -2.8567721843719482, "logits/rejected": -2.6560206413269043, "logps/chosen": -621.1703491210938, "logps/rejected": -616.619873046875, "loss": 0.0407, "rewards/accuracies": 1.0, "rewards/chosen": 2.134002685546875, "rewards/margins": 4.181649208068848, "rewards/rejected": -2.047646999359131, "step": 4045 }, { "epoch": 2.9559817351598174, "grad_norm": 20.53300312982241, "learning_rate": 9.652633185584566e-08, "logits/chosen": -2.9245991706848145, "logits/rejected": -2.782027244567871, "logps/chosen": -732.7537231445312, "logps/rejected": -599.0556640625, "loss": 0.1259, "rewards/accuracies": 1.0, "rewards/chosen": 4.172749042510986, "rewards/margins": 4.337011337280273, "rewards/rejected": -0.16426235437393188, "step": 4046 }, { "epoch": 2.9567123287671233, "grad_norm": 19.251763597429928, "learning_rate": 9.640045249121842e-08, "logits/chosen": -2.822331666946411, "logits/rejected": -2.3234736919403076, "logps/chosen": -529.9457397460938, "logps/rejected": -436.3630676269531, "loss": 0.0714, "rewards/accuracies": 1.0, "rewards/chosen": 1.6630631685256958, "rewards/margins": 3.534503936767578, "rewards/rejected": -1.8714408874511719, "step": 4047 }, { "epoch": 2.957442922374429, "grad_norm": 9.601711883500254, "learning_rate": 9.627463565157362e-08, "logits/chosen": -2.3693554401397705, "logits/rejected": -1.749169111251831, "logps/chosen": -432.3518371582031, "logps/rejected": -412.7889404296875, "loss": 0.0561, "rewards/accuracies": 1.0, "rewards/chosen": 3.478604793548584, "rewards/margins": 7.278158187866211, "rewards/rejected": -3.7995541095733643, "step": 4048 }, { "epoch": 2.958173515981735, "grad_norm": 13.478083021106086, "learning_rate": 9.614888138812679e-08, "logits/chosen": -2.521791934967041, "logits/rejected": -2.554130792617798, "logps/chosen": -535.9151611328125, "logps/rejected": -588.898193359375, "loss": 0.0816, "rewards/accuracies": 1.0, "rewards/chosen": 2.967040538787842, "rewards/margins": 5.7053632736206055, "rewards/rejected": -2.7383224964141846, "step": 4049 }, { "epoch": 2.958904109589041, "grad_norm": 12.865887129760706, "learning_rate": 9.602318975206827e-08, "logits/chosen": -3.071537494659424, "logits/rejected": -2.7264671325683594, "logps/chosen": -830.2975463867188, "logps/rejected": -642.7760009765625, "loss": 0.0572, "rewards/accuracies": 1.0, "rewards/chosen": 1.174811601638794, "rewards/margins": 3.373873710632324, "rewards/rejected": -2.1990623474121094, "step": 4050 }, { "epoch": 2.959634703196347, "grad_norm": 25.45342474117555, "learning_rate": 9.589756079456252e-08, "logits/chosen": -2.7831687927246094, "logits/rejected": -2.092742443084717, "logps/chosen": -504.77130126953125, "logps/rejected": -437.9246826171875, "loss": 0.0598, "rewards/accuracies": 1.0, "rewards/chosen": 3.080885410308838, "rewards/margins": 6.887986660003662, "rewards/rejected": -3.807100534439087, "step": 4051 }, { "epoch": 2.960365296803653, "grad_norm": 12.868883179859665, "learning_rate": 9.577199456674892e-08, "logits/chosen": -2.3990283012390137, "logits/rejected": -2.4140148162841797, "logps/chosen": -403.85760498046875, "logps/rejected": -534.7195434570312, "loss": 0.0573, "rewards/accuracies": 1.0, "rewards/chosen": 1.7205952405929565, "rewards/margins": 5.679114818572998, "rewards/rejected": -3.95851993560791, "step": 4052 }, { "epoch": 2.961095890410959, "grad_norm": 8.356495926518596, "learning_rate": 9.564649111974074e-08, "logits/chosen": -2.3237812519073486, "logits/rejected": -2.5447440147399902, "logps/chosen": -893.1235961914062, "logps/rejected": -826.557861328125, "loss": 0.0322, "rewards/accuracies": 1.0, "rewards/chosen": 3.3187623023986816, "rewards/margins": 4.533692836761475, "rewards/rejected": -1.214930534362793, "step": 4053 }, { "epoch": 2.961826484018265, "grad_norm": 13.945215664054542, "learning_rate": 9.55210505046263e-08, "logits/chosen": -2.5348312854766846, "logits/rejected": -2.337819814682007, "logps/chosen": -603.1505737304688, "logps/rejected": -710.3673095703125, "loss": 0.0566, "rewards/accuracies": 1.0, "rewards/chosen": 2.0343079566955566, "rewards/margins": 20.21071434020996, "rewards/rejected": -18.17640495300293, "step": 4054 }, { "epoch": 2.9625570776255707, "grad_norm": 24.375118940014296, "learning_rate": 9.539567277246787e-08, "logits/chosen": -2.779634952545166, "logits/rejected": -1.8046835660934448, "logps/chosen": -523.109619140625, "logps/rejected": -374.4446105957031, "loss": 0.1175, "rewards/accuracies": 0.75, "rewards/chosen": 3.92202091217041, "rewards/margins": 6.126282691955566, "rewards/rejected": -2.2042622566223145, "step": 4055 }, { "epoch": 2.9632876712328766, "grad_norm": 14.68165972232528, "learning_rate": 9.527035797430244e-08, "logits/chosen": -2.6737594604492188, "logits/rejected": -2.823378801345825, "logps/chosen": -597.2178955078125, "logps/rejected": -794.5677490234375, "loss": 0.0612, "rewards/accuracies": 0.875, "rewards/chosen": 1.4706521034240723, "rewards/margins": 3.9029271602630615, "rewards/rejected": -2.432274580001831, "step": 4056 }, { "epoch": 2.9640182648401825, "grad_norm": 8.929402557926476, "learning_rate": 9.514510616114141e-08, "logits/chosen": -2.4603588581085205, "logits/rejected": -1.9649271965026855, "logps/chosen": -643.4998168945312, "logps/rejected": -695.2945556640625, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": 4.73271369934082, "rewards/margins": 8.158973693847656, "rewards/rejected": -3.4262592792510986, "step": 4057 }, { "epoch": 2.9647488584474884, "grad_norm": 11.443207864300375, "learning_rate": 9.501991738397008e-08, "logits/chosen": -2.888260841369629, "logits/rejected": -2.422051191329956, "logps/chosen": -559.1635131835938, "logps/rejected": -559.9110717773438, "loss": 0.0426, "rewards/accuracies": 1.0, "rewards/chosen": 2.3098554611206055, "rewards/margins": 3.2699124813079834, "rewards/rejected": -0.9600574374198914, "step": 4058 }, { "epoch": 2.9654794520547947, "grad_norm": 13.192473282836671, "learning_rate": 9.489479169374861e-08, "logits/chosen": -3.050689220428467, "logits/rejected": -2.3268237113952637, "logps/chosen": -766.8005981445312, "logps/rejected": -512.9908447265625, "loss": 0.042, "rewards/accuracies": 1.0, "rewards/chosen": 3.115293502807617, "rewards/margins": 4.478251934051514, "rewards/rejected": -1.3629586696624756, "step": 4059 }, { "epoch": 2.9662100456621006, "grad_norm": 13.984494478365304, "learning_rate": 9.476972914141118e-08, "logits/chosen": -2.430522918701172, "logits/rejected": -1.9975711107254028, "logps/chosen": -472.186767578125, "logps/rejected": -408.881103515625, "loss": 0.0506, "rewards/accuracies": 1.0, "rewards/chosen": 3.29040265083313, "rewards/margins": 5.594003677368164, "rewards/rejected": -2.303600788116455, "step": 4060 }, { "epoch": 2.9669406392694064, "grad_norm": 5.664717769208379, "learning_rate": 9.464472977786647e-08, "logits/chosen": -2.3903746604919434, "logits/rejected": -2.500950574874878, "logps/chosen": -676.1131591796875, "logps/rejected": -686.2586059570312, "loss": 0.0291, "rewards/accuracies": 1.0, "rewards/chosen": 3.768186092376709, "rewards/margins": 4.5001678466796875, "rewards/rejected": -0.7319813966751099, "step": 4061 }, { "epoch": 2.9676712328767123, "grad_norm": 7.993413359448501, "learning_rate": 9.451979365399725e-08, "logits/chosen": -2.8141090869903564, "logits/rejected": -2.0894265174865723, "logps/chosen": -752.3071899414062, "logps/rejected": -602.351806640625, "loss": 0.0307, "rewards/accuracies": 1.0, "rewards/chosen": 4.803039073944092, "rewards/margins": 7.4694294929504395, "rewards/rejected": -2.6663906574249268, "step": 4062 }, { "epoch": 2.968401826484018, "grad_norm": 11.639074849595744, "learning_rate": 9.439492082066058e-08, "logits/chosen": -3.1959142684936523, "logits/rejected": -2.386045455932617, "logps/chosen": -907.7638549804688, "logps/rejected": -766.1629638671875, "loss": 0.0489, "rewards/accuracies": 1.0, "rewards/chosen": 3.973555564880371, "rewards/margins": 4.6909894943237305, "rewards/rejected": -0.7174339294433594, "step": 4063 }, { "epoch": 2.969132420091324, "grad_norm": 17.70941508261522, "learning_rate": 9.427011132868797e-08, "logits/chosen": -2.9593026638031006, "logits/rejected": -2.7765707969665527, "logps/chosen": -425.91156005859375, "logps/rejected": -310.2928466796875, "loss": 0.0935, "rewards/accuracies": 0.875, "rewards/chosen": 2.652344226837158, "rewards/margins": 3.9021570682525635, "rewards/rejected": -1.2498130798339844, "step": 4064 }, { "epoch": 2.96986301369863, "grad_norm": 15.940136500675152, "learning_rate": 9.414536522888489e-08, "logits/chosen": -2.2695472240448, "logits/rejected": -2.2071373462677, "logps/chosen": -519.6283569335938, "logps/rejected": -543.29296875, "loss": 0.0597, "rewards/accuracies": 1.0, "rewards/chosen": 2.476630449295044, "rewards/margins": 4.561992168426514, "rewards/rejected": -2.0853617191314697, "step": 4065 }, { "epoch": 2.9705936073059362, "grad_norm": 14.437035491208507, "learning_rate": 9.4020682572031e-08, "logits/chosen": -2.410048484802246, "logits/rejected": -1.8885955810546875, "logps/chosen": -278.11444091796875, "logps/rejected": -260.41363525390625, "loss": 0.0467, "rewards/accuracies": 1.0, "rewards/chosen": 1.4234685897827148, "rewards/margins": 4.07010555267334, "rewards/rejected": -2.646636486053467, "step": 4066 }, { "epoch": 2.971324200913242, "grad_norm": 7.9818667897134485, "learning_rate": 9.38960634088804e-08, "logits/chosen": -2.384521245956421, "logits/rejected": -2.1133503913879395, "logps/chosen": -572.5892944335938, "logps/rejected": -579.9723510742188, "loss": 0.0552, "rewards/accuracies": 1.0, "rewards/chosen": 3.5638251304626465, "rewards/margins": 5.576002597808838, "rewards/rejected": -2.0121772289276123, "step": 4067 }, { "epoch": 2.972054794520548, "grad_norm": 16.198287609632537, "learning_rate": 9.377150779016102e-08, "logits/chosen": -2.749084949493408, "logits/rejected": -2.341921329498291, "logps/chosen": -367.1207275390625, "logps/rejected": -418.7050476074219, "loss": 0.0788, "rewards/accuracies": 1.0, "rewards/chosen": 2.620859384536743, "rewards/margins": 5.262676239013672, "rewards/rejected": -2.6418166160583496, "step": 4068 }, { "epoch": 2.972785388127854, "grad_norm": 11.500482300305864, "learning_rate": 9.364701576657524e-08, "logits/chosen": -2.7066965103149414, "logits/rejected": -2.1027398109436035, "logps/chosen": -498.5589599609375, "logps/rejected": -441.0814208984375, "loss": 0.0486, "rewards/accuracies": 1.0, "rewards/chosen": 3.0965254306793213, "rewards/margins": 5.041419506072998, "rewards/rejected": -1.944894552230835, "step": 4069 }, { "epoch": 2.9735159817351597, "grad_norm": 28.193592571883688, "learning_rate": 9.352258738879931e-08, "logits/chosen": -2.5679683685302734, "logits/rejected": -2.052921772003174, "logps/chosen": -505.45709228515625, "logps/rejected": -453.27490234375, "loss": 0.081, "rewards/accuracies": 1.0, "rewards/chosen": 1.6158511638641357, "rewards/margins": 4.555830001831055, "rewards/rejected": -2.93997859954834, "step": 4070 }, { "epoch": 2.9742465753424656, "grad_norm": 12.93882383493358, "learning_rate": 9.339822270748366e-08, "logits/chosen": -3.34419584274292, "logits/rejected": -2.0662543773651123, "logps/chosen": -801.1873779296875, "logps/rejected": -471.027099609375, "loss": 0.0494, "rewards/accuracies": 1.0, "rewards/chosen": 4.576627731323242, "rewards/margins": 6.942887306213379, "rewards/rejected": -2.366260051727295, "step": 4071 }, { "epoch": 2.9749771689497715, "grad_norm": 15.102048500004374, "learning_rate": 9.32739217732527e-08, "logits/chosen": -2.9247822761535645, "logits/rejected": -2.322944402694702, "logps/chosen": -689.9367065429688, "logps/rejected": -583.4912109375, "loss": 0.0949, "rewards/accuracies": 1.0, "rewards/chosen": 3.797161817550659, "rewards/margins": 4.433688163757324, "rewards/rejected": -0.6365264654159546, "step": 4072 }, { "epoch": 2.975707762557078, "grad_norm": 9.716838685525625, "learning_rate": 9.314968463670503e-08, "logits/chosen": -2.8413093090057373, "logits/rejected": -1.4062788486480713, "logps/chosen": -813.6416625976562, "logps/rejected": -413.6001892089844, "loss": 0.0299, "rewards/accuracies": 1.0, "rewards/chosen": 3.2151412963867188, "rewards/margins": 5.990759372711182, "rewards/rejected": -2.775618076324463, "step": 4073 }, { "epoch": 2.9764383561643837, "grad_norm": 3.8583038944140196, "learning_rate": 9.302551134841344e-08, "logits/chosen": -2.739938974380493, "logits/rejected": -1.9543598890304565, "logps/chosen": -802.0563354492188, "logps/rejected": -669.5562744140625, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": 4.667293548583984, "rewards/margins": 7.292746067047119, "rewards/rejected": -2.6254520416259766, "step": 4074 }, { "epoch": 2.9771689497716896, "grad_norm": 21.177338000215954, "learning_rate": 9.290140195892415e-08, "logits/chosen": -2.590855360031128, "logits/rejected": -2.0884439945220947, "logps/chosen": -522.8088989257812, "logps/rejected": -459.9551086425781, "loss": 0.0802, "rewards/accuracies": 0.875, "rewards/chosen": 2.562469005584717, "rewards/margins": 4.458176136016846, "rewards/rejected": -1.895707130432129, "step": 4075 }, { "epoch": 2.9778995433789954, "grad_norm": 19.16888857139349, "learning_rate": 9.277735651875801e-08, "logits/chosen": -2.943617105484009, "logits/rejected": -2.343740940093994, "logps/chosen": -555.3712768554688, "logps/rejected": -400.7110290527344, "loss": 0.1007, "rewards/accuracies": 0.875, "rewards/chosen": 2.6590044498443604, "rewards/margins": 3.9127731323242188, "rewards/rejected": -1.253768801689148, "step": 4076 }, { "epoch": 2.9786301369863013, "grad_norm": 13.150388739464072, "learning_rate": 9.265337507840934e-08, "logits/chosen": -3.0954132080078125, "logits/rejected": -2.5209851264953613, "logps/chosen": -277.7113037109375, "logps/rejected": -244.93243408203125, "loss": 0.071, "rewards/accuracies": 1.0, "rewards/chosen": 1.7660784721374512, "rewards/margins": 4.241031646728516, "rewards/rejected": -2.4749531745910645, "step": 4077 }, { "epoch": 2.979360730593607, "grad_norm": 9.355649789480069, "learning_rate": 9.252945768834688e-08, "logits/chosen": -2.8291993141174316, "logits/rejected": -2.2943992614746094, "logps/chosen": -649.7561645507812, "logps/rejected": -544.4171142578125, "loss": 0.0361, "rewards/accuracies": 1.0, "rewards/chosen": 1.3015192747116089, "rewards/margins": 2.9410042762756348, "rewards/rejected": -1.6394851207733154, "step": 4078 }, { "epoch": 2.980091324200913, "grad_norm": 12.01861654079568, "learning_rate": 9.24056043990129e-08, "logits/chosen": -2.598504066467285, "logits/rejected": -2.1892547607421875, "logps/chosen": -764.2202758789062, "logps/rejected": -667.3792114257812, "loss": 0.0654, "rewards/accuracies": 1.0, "rewards/chosen": 3.745107889175415, "rewards/margins": 3.5611000061035156, "rewards/rejected": 0.18400797247886658, "step": 4079 }, { "epoch": 2.9808219178082194, "grad_norm": 15.530952312602345, "learning_rate": 9.228181526082368e-08, "logits/chosen": -3.128938674926758, "logits/rejected": -2.281741142272949, "logps/chosen": -603.3817138671875, "logps/rejected": -483.1202697753906, "loss": 0.0453, "rewards/accuracies": 1.0, "rewards/chosen": 3.560457706451416, "rewards/margins": 4.461446285247803, "rewards/rejected": -0.900988757610321, "step": 4080 }, { "epoch": 2.9815525114155252, "grad_norm": 8.768739959428277, "learning_rate": 9.215809032416957e-08, "logits/chosen": -2.7116544246673584, "logits/rejected": -2.013424873352051, "logps/chosen": -607.2100830078125, "logps/rejected": -434.4600830078125, "loss": 0.0275, "rewards/accuracies": 1.0, "rewards/chosen": 2.9543423652648926, "rewards/margins": 5.428343772888184, "rewards/rejected": -2.474001407623291, "step": 4081 }, { "epoch": 2.982283105022831, "grad_norm": 8.170442240405876, "learning_rate": 9.203442963941449e-08, "logits/chosen": -2.7657761573791504, "logits/rejected": -2.4920358657836914, "logps/chosen": -458.5875549316406, "logps/rejected": -500.73834228515625, "loss": 0.0264, "rewards/accuracies": 1.0, "rewards/chosen": 2.412172555923462, "rewards/margins": 4.980618476867676, "rewards/rejected": -2.5684456825256348, "step": 4082 }, { "epoch": 2.983013698630137, "grad_norm": 5.57616417879285, "learning_rate": 9.191083325689655e-08, "logits/chosen": -2.7543160915374756, "logits/rejected": -2.700563430786133, "logps/chosen": -682.517333984375, "logps/rejected": -753.4341430664062, "loss": 0.0253, "rewards/accuracies": 1.0, "rewards/chosen": 3.323525905609131, "rewards/margins": 5.3280744552612305, "rewards/rejected": -2.0045483112335205, "step": 4083 }, { "epoch": 2.983744292237443, "grad_norm": 10.101342714401342, "learning_rate": 9.17873012269274e-08, "logits/chosen": -2.4865565299987793, "logits/rejected": -2.005749225616455, "logps/chosen": -795.52880859375, "logps/rejected": -722.6810913085938, "loss": 0.0467, "rewards/accuracies": 1.0, "rewards/chosen": 1.846976399421692, "rewards/margins": 5.975101470947266, "rewards/rejected": -4.128124713897705, "step": 4084 }, { "epoch": 2.9844748858447487, "grad_norm": 14.758120095887838, "learning_rate": 9.166383359979248e-08, "logits/chosen": -2.453503131866455, "logits/rejected": -2.130185842514038, "logps/chosen": -690.4461669921875, "logps/rejected": -637.9482421875, "loss": 0.0761, "rewards/accuracies": 1.0, "rewards/chosen": 2.673703908920288, "rewards/margins": 3.934539318084717, "rewards/rejected": -1.2608352899551392, "step": 4085 }, { "epoch": 2.9852054794520546, "grad_norm": 9.48827481730947, "learning_rate": 9.154043042575135e-08, "logits/chosen": -2.335326671600342, "logits/rejected": -2.9637343883514404, "logps/chosen": -634.3192138671875, "logps/rejected": -978.1654052734375, "loss": 0.0314, "rewards/accuracies": 1.0, "rewards/chosen": 4.185199737548828, "rewards/margins": 6.444766998291016, "rewards/rejected": -2.2595670223236084, "step": 4086 }, { "epoch": 2.985936073059361, "grad_norm": 8.052211614049982, "learning_rate": 9.141709175503698e-08, "logits/chosen": -2.908755302429199, "logits/rejected": -1.9286072254180908, "logps/chosen": -811.4696655273438, "logps/rejected": -520.1905517578125, "loss": 0.0336, "rewards/accuracies": 1.0, "rewards/chosen": 3.9722237586975098, "rewards/margins": 4.965234756469727, "rewards/rejected": -0.9930109977722168, "step": 4087 }, { "epoch": 2.986666666666667, "grad_norm": 9.251395043002931, "learning_rate": 9.129381763785621e-08, "logits/chosen": -2.3768115043640137, "logits/rejected": -1.607530117034912, "logps/chosen": -565.759521484375, "logps/rejected": -547.3614501953125, "loss": 0.0383, "rewards/accuracies": 1.0, "rewards/chosen": 4.896458625793457, "rewards/margins": 5.82890510559082, "rewards/rejected": -0.9324465394020081, "step": 4088 }, { "epoch": 2.9873972602739727, "grad_norm": 11.16911296713648, "learning_rate": 9.117060812438956e-08, "logits/chosen": -2.567636013031006, "logits/rejected": -2.035404920578003, "logps/chosen": -625.7327880859375, "logps/rejected": -517.3228759765625, "loss": 0.0451, "rewards/accuracies": 1.0, "rewards/chosen": 5.725312232971191, "rewards/margins": 6.838134288787842, "rewards/rejected": -1.1128227710723877, "step": 4089 }, { "epoch": 2.9881278538812786, "grad_norm": 9.210066090034996, "learning_rate": 9.104746326479143e-08, "logits/chosen": -2.461329460144043, "logits/rejected": -2.1085915565490723, "logps/chosen": -581.7828979492188, "logps/rejected": -605.3101196289062, "loss": 0.0323, "rewards/accuracies": 1.0, "rewards/chosen": 2.316617012023926, "rewards/margins": 4.356014728546143, "rewards/rejected": -2.039397716522217, "step": 4090 }, { "epoch": 2.9888584474885844, "grad_norm": 11.97985738780836, "learning_rate": 9.092438310918968e-08, "logits/chosen": -2.432741165161133, "logits/rejected": -2.0082578659057617, "logps/chosen": -669.8912353515625, "logps/rejected": -679.791015625, "loss": 0.0769, "rewards/accuracies": 1.0, "rewards/chosen": 3.8840274810791016, "rewards/margins": 6.026132106781006, "rewards/rejected": -2.142104387283325, "step": 4091 }, { "epoch": 2.9895890410958903, "grad_norm": 9.489526295551578, "learning_rate": 9.080136770768588e-08, "logits/chosen": -2.303166627883911, "logits/rejected": -1.7308261394500732, "logps/chosen": -391.092041015625, "logps/rejected": -320.07867431640625, "loss": 0.0524, "rewards/accuracies": 1.0, "rewards/chosen": 2.652780294418335, "rewards/margins": 5.671462059020996, "rewards/rejected": -3.018681764602661, "step": 4092 }, { "epoch": 2.990319634703196, "grad_norm": 11.285289094168782, "learning_rate": 9.06784171103554e-08, "logits/chosen": -3.4965274333953857, "logits/rejected": -2.601836919784546, "logps/chosen": -1069.898681640625, "logps/rejected": -803.735595703125, "loss": 0.0412, "rewards/accuracies": 1.0, "rewards/chosen": 5.585245132446289, "rewards/margins": 6.545803546905518, "rewards/rejected": -0.9605587124824524, "step": 4093 }, { "epoch": 2.9910502283105025, "grad_norm": 9.690852809419207, "learning_rate": 9.055553136724698e-08, "logits/chosen": -2.8674368858337402, "logits/rejected": -2.6327197551727295, "logps/chosen": -707.4451293945312, "logps/rejected": -580.6455078125, "loss": 0.0407, "rewards/accuracies": 1.0, "rewards/chosen": 3.213409662246704, "rewards/margins": 4.934564590454102, "rewards/rejected": -1.7211549282073975, "step": 4094 }, { "epoch": 2.9917808219178084, "grad_norm": 11.475494812912073, "learning_rate": 9.043271052838322e-08, "logits/chosen": -1.9962782859802246, "logits/rejected": -1.797094464302063, "logps/chosen": -430.39874267578125, "logps/rejected": -531.5303955078125, "loss": 0.0296, "rewards/accuracies": 1.0, "rewards/chosen": 3.9369351863861084, "rewards/margins": 9.104130744934082, "rewards/rejected": -5.167195796966553, "step": 4095 }, { "epoch": 2.9925114155251142, "grad_norm": 8.723139488545456, "learning_rate": 9.030995464376013e-08, "logits/chosen": -3.236783027648926, "logits/rejected": -2.4397294521331787, "logps/chosen": -666.4959106445312, "logps/rejected": -492.373779296875, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": 2.969078302383423, "rewards/margins": 4.0117340087890625, "rewards/rejected": -1.0426560640335083, "step": 4096 }, { "epoch": 2.99324200913242, "grad_norm": 6.61496593740492, "learning_rate": 9.01872637633472e-08, "logits/chosen": -2.707106590270996, "logits/rejected": -2.815980911254883, "logps/chosen": -737.27978515625, "logps/rejected": -793.8558349609375, "loss": 0.0237, "rewards/accuracies": 1.0, "rewards/chosen": 3.5418503284454346, "rewards/margins": 5.681934833526611, "rewards/rejected": -2.1400842666625977, "step": 4097 }, { "epoch": 2.993972602739726, "grad_norm": 12.431922160254738, "learning_rate": 9.006463793708777e-08, "logits/chosen": -2.6967976093292236, "logits/rejected": -1.9602842330932617, "logps/chosen": -745.0629272460938, "logps/rejected": -642.563232421875, "loss": 0.0719, "rewards/accuracies": 1.0, "rewards/chosen": 3.7757558822631836, "rewards/margins": 5.792085647583008, "rewards/rejected": -2.016329765319824, "step": 4098 }, { "epoch": 2.994703196347032, "grad_norm": 12.598249138180124, "learning_rate": 8.994207721489832e-08, "logits/chosen": -3.1362223625183105, "logits/rejected": -2.40535306930542, "logps/chosen": -735.5377197265625, "logps/rejected": -608.7020263671875, "loss": 0.0422, "rewards/accuracies": 1.0, "rewards/chosen": 3.8006715774536133, "rewards/margins": 6.181008338928223, "rewards/rejected": -2.3803369998931885, "step": 4099 }, { "epoch": 2.9954337899543377, "grad_norm": 17.325110514675025, "learning_rate": 8.981958164666922e-08, "logits/chosen": -2.5366928577423096, "logits/rejected": -2.3146209716796875, "logps/chosen": -709.644287109375, "logps/rejected": -821.10595703125, "loss": 0.0265, "rewards/accuracies": 1.0, "rewards/chosen": 2.9654111862182617, "rewards/margins": 5.88809871673584, "rewards/rejected": -2.92268705368042, "step": 4100 }, { "epoch": 2.996164383561644, "grad_norm": 14.689374568705318, "learning_rate": 8.969715128226399e-08, "logits/chosen": -2.540597438812256, "logits/rejected": -2.4692554473876953, "logps/chosen": -498.96234130859375, "logps/rejected": -661.5162963867188, "loss": 0.0609, "rewards/accuracies": 1.0, "rewards/chosen": 3.543323040008545, "rewards/margins": 4.4497857093811035, "rewards/rejected": -0.9064626097679138, "step": 4101 }, { "epoch": 2.99689497716895, "grad_norm": 15.174336823956262, "learning_rate": 8.957478617151967e-08, "logits/chosen": -2.66835355758667, "logits/rejected": -1.9749279022216797, "logps/chosen": -544.1963500976562, "logps/rejected": -489.47967529296875, "loss": 0.0598, "rewards/accuracies": 1.0, "rewards/chosen": 3.371054172515869, "rewards/margins": 3.7236504554748535, "rewards/rejected": -0.3525966703891754, "step": 4102 }, { "epoch": 2.997625570776256, "grad_norm": 16.45503720784972, "learning_rate": 8.945248636424696e-08, "logits/chosen": -2.5294713973999023, "logits/rejected": -2.4065566062927246, "logps/chosen": -980.1555786132812, "logps/rejected": -784.4453125, "loss": 0.0697, "rewards/accuracies": 1.0, "rewards/chosen": 2.9257402420043945, "rewards/margins": 4.858926296234131, "rewards/rejected": -1.9331860542297363, "step": 4103 }, { "epoch": 2.9983561643835617, "grad_norm": 15.093016549370452, "learning_rate": 8.933025191022975e-08, "logits/chosen": -3.1683952808380127, "logits/rejected": -2.199936866760254, "logps/chosen": -690.9817504882812, "logps/rejected": -522.8899536132812, "loss": 0.0587, "rewards/accuracies": 0.875, "rewards/chosen": 2.995645523071289, "rewards/margins": 4.385821342468262, "rewards/rejected": -1.390175700187683, "step": 4104 }, { "epoch": 2.9990867579908675, "grad_norm": 17.691182430550413, "learning_rate": 8.920808285922537e-08, "logits/chosen": -3.250140905380249, "logits/rejected": -2.6824793815612793, "logps/chosen": -1054.617919921875, "logps/rejected": -1002.2682495117188, "loss": 0.059, "rewards/accuracies": 1.0, "rewards/chosen": 5.188467979431152, "rewards/margins": 4.752846717834473, "rewards/rejected": 0.4356207847595215, "step": 4105 }, { "epoch": 2.9998173515981734, "grad_norm": 12.939839764270653, "learning_rate": 8.908597926096448e-08, "logits/chosen": -2.783851385116577, "logits/rejected": -1.907330870628357, "logps/chosen": -628.4356079101562, "logps/rejected": -505.7417907714844, "loss": 0.0501, "rewards/accuracies": 1.0, "rewards/chosen": 3.81552791595459, "rewards/margins": 7.848548889160156, "rewards/rejected": -4.033020973205566, "step": 4106 }, { "epoch": 2.9998173515981734, "eval_logits/chosen": -2.8675687313079834, "eval_logits/rejected": -2.3869822025299072, "eval_logps/chosen": -705.1044921875, "eval_logps/rejected": -593.3734741210938, "eval_loss": 0.38906311988830566, "eval_rewards/accuracies": 0.7142857313156128, "eval_rewards/chosen": 3.080876111984253, "eval_rewards/margins": 4.397425651550293, "eval_rewards/rejected": -1.3165498971939087, "eval_runtime": 14.4306, "eval_samples_per_second": 7.623, "eval_steps_per_second": 0.97, "step": 4106 }, { "epoch": 3.0005479452054793, "grad_norm": 8.784750244203153, "learning_rate": 8.896394116515132e-08, "logits/chosen": -2.9332704544067383, "logits/rejected": -2.627403736114502, "logps/chosen": -766.5107421875, "logps/rejected": -773.4293212890625, "loss": 0.0322, "rewards/accuracies": 1.0, "rewards/chosen": 3.527371883392334, "rewards/margins": 4.096399784088135, "rewards/rejected": -0.5690276622772217, "step": 4107 }, { "epoch": 3.001278538812785, "grad_norm": 6.628097098155558, "learning_rate": 8.884196862146318e-08, "logits/chosen": -2.369752883911133, "logits/rejected": -2.4305715560913086, "logps/chosen": -488.2215270996094, "logps/rejected": -662.1231689453125, "loss": 0.0296, "rewards/accuracies": 1.0, "rewards/chosen": 2.9652771949768066, "rewards/margins": 5.7760701179504395, "rewards/rejected": -2.8107926845550537, "step": 4108 }, { "epoch": 3.0020091324200915, "grad_norm": 6.897639503851854, "learning_rate": 8.872006167955096e-08, "logits/chosen": -2.917560577392578, "logits/rejected": -2.535820484161377, "logps/chosen": -593.7889404296875, "logps/rejected": -448.92047119140625, "loss": 0.0492, "rewards/accuracies": 1.0, "rewards/chosen": 3.118759870529175, "rewards/margins": 4.432726860046387, "rewards/rejected": -1.313967227935791, "step": 4109 }, { "epoch": 3.0027397260273974, "grad_norm": 8.257235042191313, "learning_rate": 8.859822038903855e-08, "logits/chosen": -2.635820150375366, "logits/rejected": -2.0273633003234863, "logps/chosen": -399.26385498046875, "logps/rejected": -443.94378662109375, "loss": 0.0402, "rewards/accuracies": 1.0, "rewards/chosen": 2.7783939838409424, "rewards/margins": 6.044520854949951, "rewards/rejected": -3.266127109527588, "step": 4110 }, { "epoch": 3.0034703196347032, "grad_norm": 22.8214598735865, "learning_rate": 8.847644479952327e-08, "logits/chosen": -3.165513515472412, "logits/rejected": -1.8321800231933594, "logps/chosen": -707.4790649414062, "logps/rejected": -391.76824951171875, "loss": 0.0875, "rewards/accuracies": 1.0, "rewards/chosen": 3.7796313762664795, "rewards/margins": 5.836349964141846, "rewards/rejected": -2.056718587875366, "step": 4111 }, { "epoch": 3.004200913242009, "grad_norm": 5.2322924670214315, "learning_rate": 8.83547349605758e-08, "logits/chosen": -2.9516098499298096, "logits/rejected": -2.4452288150787354, "logps/chosen": -646.737548828125, "logps/rejected": -558.2655639648438, "loss": 0.024, "rewards/accuracies": 1.0, "rewards/chosen": 3.7439637184143066, "rewards/margins": 5.506969928741455, "rewards/rejected": -1.7630059719085693, "step": 4112 }, { "epoch": 3.004931506849315, "grad_norm": 7.213959132117625, "learning_rate": 8.823309092173987e-08, "logits/chosen": -2.833256483078003, "logits/rejected": -2.23201060295105, "logps/chosen": -491.4332275390625, "logps/rejected": -475.68402099609375, "loss": 0.0375, "rewards/accuracies": 1.0, "rewards/chosen": 3.617647886276245, "rewards/margins": 6.137119770050049, "rewards/rejected": -2.519472122192383, "step": 4113 }, { "epoch": 3.005662100456621, "grad_norm": 4.607058806302191, "learning_rate": 8.811151273253239e-08, "logits/chosen": -2.810269594192505, "logits/rejected": -2.4027745723724365, "logps/chosen": -635.4117431640625, "logps/rejected": -578.3411254882812, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": 2.644066333770752, "rewards/margins": 4.0882062911987305, "rewards/rejected": -1.4441401958465576, "step": 4114 }, { "epoch": 3.0063926940639267, "grad_norm": 4.377408699726184, "learning_rate": 8.799000044244379e-08, "logits/chosen": -2.406611204147339, "logits/rejected": -2.1277472972869873, "logps/chosen": -664.17431640625, "logps/rejected": -639.5687866210938, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": 4.68740177154541, "rewards/margins": 7.257560729980469, "rewards/rejected": -2.5701584815979004, "step": 4115 }, { "epoch": 3.007123287671233, "grad_norm": 7.708686971714499, "learning_rate": 8.786855410093724e-08, "logits/chosen": -3.2101378440856934, "logits/rejected": -2.569166660308838, "logps/chosen": -855.5374755859375, "logps/rejected": -772.6115112304688, "loss": 0.0251, "rewards/accuracies": 1.0, "rewards/chosen": 3.553879976272583, "rewards/margins": 5.239414215087891, "rewards/rejected": -1.685534119606018, "step": 4116 }, { "epoch": 3.007853881278539, "grad_norm": 8.388745315816474, "learning_rate": 8.774717375744948e-08, "logits/chosen": -2.8636252880096436, "logits/rejected": -2.281586170196533, "logps/chosen": -474.3829345703125, "logps/rejected": -364.94464111328125, "loss": 0.0384, "rewards/accuracies": 1.0, "rewards/chosen": 2.138960838317871, "rewards/margins": 4.233783721923828, "rewards/rejected": -2.094822645187378, "step": 4117 }, { "epoch": 3.008584474885845, "grad_norm": 9.578315770301451, "learning_rate": 8.762585946139007e-08, "logits/chosen": -3.186779022216797, "logits/rejected": -2.591176748275757, "logps/chosen": -694.516845703125, "logps/rejected": -444.7037048339844, "loss": 0.0528, "rewards/accuracies": 1.0, "rewards/chosen": 3.907435655593872, "rewards/margins": 5.735250949859619, "rewards/rejected": -1.8278155326843262, "step": 4118 }, { "epoch": 3.0093150684931507, "grad_norm": 3.6812055497651284, "learning_rate": 8.750461126214176e-08, "logits/chosen": -3.44173526763916, "logits/rejected": -2.568946599960327, "logps/chosen": -945.89794921875, "logps/rejected": -836.488037109375, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": 6.540001392364502, "rewards/margins": 7.112587928771973, "rewards/rejected": -0.5725864171981812, "step": 4119 }, { "epoch": 3.0100456621004565, "grad_norm": 4.988536281533666, "learning_rate": 8.738342920906056e-08, "logits/chosen": -3.033723831176758, "logits/rejected": -2.377443313598633, "logps/chosen": -774.013916015625, "logps/rejected": -544.3991088867188, "loss": 0.0204, "rewards/accuracies": 1.0, "rewards/chosen": 3.944580078125, "rewards/margins": 4.055920600891113, "rewards/rejected": -0.11134093999862671, "step": 4120 }, { "epoch": 3.0107762557077624, "grad_norm": 5.431812337159052, "learning_rate": 8.72623133514753e-08, "logits/chosen": -3.02316951751709, "logits/rejected": -2.5235471725463867, "logps/chosen": -787.4395141601562, "logps/rejected": -603.9808349609375, "loss": 0.0206, "rewards/accuracies": 1.0, "rewards/chosen": 3.857055425643921, "rewards/margins": 5.229775428771973, "rewards/rejected": -1.3727202415466309, "step": 4121 }, { "epoch": 3.0115068493150683, "grad_norm": 6.115591179506158, "learning_rate": 8.714126373868821e-08, "logits/chosen": -2.6258115768432617, "logits/rejected": -2.37150239944458, "logps/chosen": -1337.107177734375, "logps/rejected": -842.5092163085938, "loss": 0.0308, "rewards/accuracies": 1.0, "rewards/chosen": 4.774725437164307, "rewards/margins": 4.116771221160889, "rewards/rejected": 0.657954216003418, "step": 4122 }, { "epoch": 3.0122374429223746, "grad_norm": 3.885270075892031, "learning_rate": 8.702028041997403e-08, "logits/chosen": -2.4070162773132324, "logits/rejected": -2.3009016513824463, "logps/chosen": -566.5843505859375, "logps/rejected": -558.3123168945312, "loss": 0.0166, "rewards/accuracies": 1.0, "rewards/chosen": 3.510502815246582, "rewards/margins": 5.9998321533203125, "rewards/rejected": -2.4893293380737305, "step": 4123 }, { "epoch": 3.0129680365296805, "grad_norm": 3.2440972643945267, "learning_rate": 8.689936344458104e-08, "logits/chosen": -2.577195167541504, "logits/rejected": -2.3734142780303955, "logps/chosen": -532.2211303710938, "logps/rejected": -648.1951293945312, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": 4.049421310424805, "rewards/margins": 6.656111717224121, "rewards/rejected": -2.6066901683807373, "step": 4124 }, { "epoch": 3.0136986301369864, "grad_norm": 5.601544453313851, "learning_rate": 8.677851286173016e-08, "logits/chosen": -2.8265273571014404, "logits/rejected": -2.0060818195343018, "logps/chosen": -678.7725830078125, "logps/rejected": -640.1431884765625, "loss": 0.0251, "rewards/accuracies": 1.0, "rewards/chosen": 3.8597750663757324, "rewards/margins": 6.154496669769287, "rewards/rejected": -2.2947216033935547, "step": 4125 }, { "epoch": 3.0144292237442922, "grad_norm": 3.0073688923075177, "learning_rate": 8.665772872061558e-08, "logits/chosen": -3.0450453758239746, "logits/rejected": -2.320467233657837, "logps/chosen": -983.4564208984375, "logps/rejected": -763.2645263671875, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": 5.2362823486328125, "rewards/margins": 5.664909362792969, "rewards/rejected": -0.428627073764801, "step": 4126 }, { "epoch": 3.015159817351598, "grad_norm": 4.628069503420536, "learning_rate": 8.653701107040418e-08, "logits/chosen": -3.1337132453918457, "logits/rejected": -1.5159039497375488, "logps/chosen": -745.9520874023438, "logps/rejected": -448.03204345703125, "loss": 0.0258, "rewards/accuracies": 1.0, "rewards/chosen": 4.7101850509643555, "rewards/margins": 7.366514205932617, "rewards/rejected": -2.6563291549682617, "step": 4127 }, { "epoch": 3.015890410958904, "grad_norm": 4.949421843085367, "learning_rate": 8.641635996023581e-08, "logits/chosen": -2.606543779373169, "logits/rejected": -2.464681625366211, "logps/chosen": -983.1818237304688, "logps/rejected": -713.569580078125, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": 5.8931450843811035, "rewards/margins": 6.833620071411133, "rewards/rejected": -0.940475344657898, "step": 4128 }, { "epoch": 3.01662100456621, "grad_norm": 8.364763979229938, "learning_rate": 8.629577543922345e-08, "logits/chosen": -2.6899943351745605, "logits/rejected": -2.152512550354004, "logps/chosen": -557.5506591796875, "logps/rejected": -410.0812683105469, "loss": 0.0489, "rewards/accuracies": 1.0, "rewards/chosen": 2.434572696685791, "rewards/margins": 4.854859352111816, "rewards/rejected": -2.4202864170074463, "step": 4129 }, { "epoch": 3.017351598173516, "grad_norm": 6.932948088335628, "learning_rate": 8.617525755645266e-08, "logits/chosen": -2.723256826400757, "logits/rejected": -2.523916006088257, "logps/chosen": -527.9317626953125, "logps/rejected": -656.180419921875, "loss": 0.0448, "rewards/accuracies": 1.0, "rewards/chosen": 2.2702748775482178, "rewards/margins": 6.202693462371826, "rewards/rejected": -3.9324190616607666, "step": 4130 }, { "epoch": 3.018082191780822, "grad_norm": 3.7064029655604416, "learning_rate": 8.605480636098225e-08, "logits/chosen": -2.7740094661712646, "logits/rejected": -2.3253579139709473, "logps/chosen": -518.9378051757812, "logps/rejected": -427.5934753417969, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": 2.6059062480926514, "rewards/margins": 5.956973075866699, "rewards/rejected": -3.3510663509368896, "step": 4131 }, { "epoch": 3.018812785388128, "grad_norm": 4.967417412327156, "learning_rate": 8.593442190184352e-08, "logits/chosen": -2.677069664001465, "logits/rejected": -2.7236580848693848, "logps/chosen": -696.734619140625, "logps/rejected": -1012.5773315429688, "loss": 0.0178, "rewards/accuracies": 1.0, "rewards/chosen": 3.0275633335113525, "rewards/margins": 4.945417404174805, "rewards/rejected": -1.9178539514541626, "step": 4132 }, { "epoch": 3.019543378995434, "grad_norm": 5.733996137114103, "learning_rate": 8.581410422804072e-08, "logits/chosen": -2.9081547260284424, "logits/rejected": -2.9487202167510986, "logps/chosen": -763.48828125, "logps/rejected": -865.453369140625, "loss": 0.0333, "rewards/accuracies": 1.0, "rewards/chosen": 2.5078961849212646, "rewards/margins": 5.238760948181152, "rewards/rejected": -2.730865001678467, "step": 4133 }, { "epoch": 3.0202739726027397, "grad_norm": 3.8829459890833125, "learning_rate": 8.569385338855112e-08, "logits/chosen": -3.037330150604248, "logits/rejected": -2.1588735580444336, "logps/chosen": -617.1031494140625, "logps/rejected": -428.9997863769531, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": 3.1945884227752686, "rewards/margins": 4.921316146850586, "rewards/rejected": -1.7267274856567383, "step": 4134 }, { "epoch": 3.0210045662100455, "grad_norm": 3.7532186047064133, "learning_rate": 8.557366943232449e-08, "logits/chosen": -2.6183273792266846, "logits/rejected": -1.4677541255950928, "logps/chosen": -682.0438232421875, "logps/rejected": -405.818115234375, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": 4.033931255340576, "rewards/margins": 8.238214492797852, "rewards/rejected": -4.204283237457275, "step": 4135 }, { "epoch": 3.0217351598173514, "grad_norm": 3.0007312201411485, "learning_rate": 8.545355240828344e-08, "logits/chosen": -2.4479100704193115, "logits/rejected": -2.305159091949463, "logps/chosen": -699.776123046875, "logps/rejected": -654.7189331054688, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": 4.552772521972656, "rewards/margins": 7.89667272567749, "rewards/rejected": -3.343899965286255, "step": 4136 }, { "epoch": 3.0224657534246577, "grad_norm": 4.765943772957993, "learning_rate": 8.533350236532358e-08, "logits/chosen": -2.487499952316284, "logits/rejected": -2.035858631134033, "logps/chosen": -486.26611328125, "logps/rejected": -384.77880859375, "loss": 0.0218, "rewards/accuracies": 1.0, "rewards/chosen": 2.5778181552886963, "rewards/margins": 5.543006896972656, "rewards/rejected": -2.965188980102539, "step": 4137 }, { "epoch": 3.0231963470319636, "grad_norm": 9.454425110751298, "learning_rate": 8.521351935231289e-08, "logits/chosen": -3.3682518005371094, "logits/rejected": -2.4587807655334473, "logps/chosen": -520.76953125, "logps/rejected": -494.7514343261719, "loss": 0.0613, "rewards/accuracies": 1.0, "rewards/chosen": 2.5974111557006836, "rewards/margins": 6.153483867645264, "rewards/rejected": -3.55607271194458, "step": 4138 }, { "epoch": 3.0239269406392695, "grad_norm": 7.627621809675381, "learning_rate": 8.509360341809244e-08, "logits/chosen": -2.8503499031066895, "logits/rejected": -2.68658709526062, "logps/chosen": -447.75335693359375, "logps/rejected": -745.7738647460938, "loss": 0.0478, "rewards/accuracies": 1.0, "rewards/chosen": 2.1770710945129395, "rewards/margins": 5.5157365798950195, "rewards/rejected": -3.3386659622192383, "step": 4139 }, { "epoch": 3.0246575342465754, "grad_norm": 6.593566313440423, "learning_rate": 8.497375461147552e-08, "logits/chosen": -2.896705150604248, "logits/rejected": -2.2874836921691895, "logps/chosen": -673.5994873046875, "logps/rejected": -470.2419128417969, "loss": 0.0435, "rewards/accuracies": 1.0, "rewards/chosen": 2.649432897567749, "rewards/margins": 4.24490213394165, "rewards/rejected": -1.595469355583191, "step": 4140 }, { "epoch": 3.0253881278538812, "grad_norm": 5.080413021208458, "learning_rate": 8.48539729812486e-08, "logits/chosen": -2.9370622634887695, "logits/rejected": -2.0635106563568115, "logps/chosen": -649.8955688476562, "logps/rejected": -434.4630432128906, "loss": 0.0224, "rewards/accuracies": 1.0, "rewards/chosen": 4.399431228637695, "rewards/margins": 6.441442966461182, "rewards/rejected": -2.0420117378234863, "step": 4141 }, { "epoch": 3.026118721461187, "grad_norm": 5.112355520729844, "learning_rate": 8.473425857617045e-08, "logits/chosen": -3.204465389251709, "logits/rejected": -2.159761905670166, "logps/chosen": -655.254150390625, "logps/rejected": -509.414794921875, "loss": 0.0203, "rewards/accuracies": 1.0, "rewards/chosen": 5.053614139556885, "rewards/margins": 7.3948469161987305, "rewards/rejected": -2.3412327766418457, "step": 4142 }, { "epoch": 3.026849315068493, "grad_norm": 6.21194254102987, "learning_rate": 8.461461144497261e-08, "logits/chosen": -2.8312854766845703, "logits/rejected": -2.1834301948547363, "logps/chosen": -759.6725463867188, "logps/rejected": -540.3016357421875, "loss": 0.0269, "rewards/accuracies": 1.0, "rewards/chosen": 5.62454891204834, "rewards/margins": 7.031187534332275, "rewards/rejected": -1.4066390991210938, "step": 4143 }, { "epoch": 3.0275799086757993, "grad_norm": 4.61517940945369, "learning_rate": 8.449503163635943e-08, "logits/chosen": -2.4149224758148193, "logits/rejected": -2.1346659660339355, "logps/chosen": -589.1240844726562, "logps/rejected": -795.4439086914062, "loss": 0.0197, "rewards/accuracies": 1.0, "rewards/chosen": 3.889939785003662, "rewards/margins": 6.985913276672363, "rewards/rejected": -3.095973253250122, "step": 4144 }, { "epoch": 3.028310502283105, "grad_norm": 3.9622826934819284, "learning_rate": 8.437551919900735e-08, "logits/chosen": -2.912317991256714, "logits/rejected": -2.890505313873291, "logps/chosen": -584.3519287109375, "logps/rejected": -611.330078125, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": 2.9014058113098145, "rewards/margins": 5.075255870819092, "rewards/rejected": -2.1738498210906982, "step": 4145 }, { "epoch": 3.029041095890411, "grad_norm": 6.558391338448948, "learning_rate": 8.425607418156588e-08, "logits/chosen": -2.561741828918457, "logits/rejected": -2.0263993740081787, "logps/chosen": -667.0792236328125, "logps/rejected": -462.7176208496094, "loss": 0.0251, "rewards/accuracies": 1.0, "rewards/chosen": 3.5742409229278564, "rewards/margins": 7.417788505554199, "rewards/rejected": -3.8435473442077637, "step": 4146 }, { "epoch": 3.029771689497717, "grad_norm": 7.957226608043034, "learning_rate": 8.413669663265677e-08, "logits/chosen": -2.85602068901062, "logits/rejected": -2.4182209968566895, "logps/chosen": -753.44189453125, "logps/rejected": -676.5701904296875, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": 3.9508910179138184, "rewards/margins": 3.642284393310547, "rewards/rejected": 0.30860674381256104, "step": 4147 }, { "epoch": 3.030502283105023, "grad_norm": 3.845717756050761, "learning_rate": 8.401738660087455e-08, "logits/chosen": -3.222785234451294, "logits/rejected": -2.725377082824707, "logps/chosen": -813.2982788085938, "logps/rejected": -701.8760986328125, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": 5.2794647216796875, "rewards/margins": 5.714014530181885, "rewards/rejected": -0.43455010652542114, "step": 4148 }, { "epoch": 3.0312328767123287, "grad_norm": 5.056002744987805, "learning_rate": 8.38981441347861e-08, "logits/chosen": -2.8413138389587402, "logits/rejected": -2.7589988708496094, "logps/chosen": -564.3961181640625, "logps/rejected": -613.78564453125, "loss": 0.0212, "rewards/accuracies": 1.0, "rewards/chosen": 1.844560980796814, "rewards/margins": 6.514638900756836, "rewards/rejected": -4.670078277587891, "step": 4149 }, { "epoch": 3.0319634703196345, "grad_norm": 7.792073393897089, "learning_rate": 8.377896928293074e-08, "logits/chosen": -2.6820967197418213, "logits/rejected": -2.068016529083252, "logps/chosen": -853.0977172851562, "logps/rejected": -613.073974609375, "loss": 0.0526, "rewards/accuracies": 1.0, "rewards/chosen": 3.5750315189361572, "rewards/margins": 4.431106090545654, "rewards/rejected": -0.8560745716094971, "step": 4150 }, { "epoch": 3.032694063926941, "grad_norm": 7.095338349504223, "learning_rate": 8.365986209382056e-08, "logits/chosen": -2.441784381866455, "logits/rejected": -1.8887455463409424, "logps/chosen": -1248.72021484375, "logps/rejected": -728.1636352539062, "loss": 0.0361, "rewards/accuracies": 1.0, "rewards/chosen": 6.51432466506958, "rewards/margins": 6.000222206115723, "rewards/rejected": 0.5141022801399231, "step": 4151 }, { "epoch": 3.0334246575342467, "grad_norm": 4.651745966790403, "learning_rate": 8.354082261593982e-08, "logits/chosen": -2.2883801460266113, "logits/rejected": -1.9860109090805054, "logps/chosen": -598.6400756835938, "logps/rejected": -594.4039916992188, "loss": 0.0199, "rewards/accuracies": 1.0, "rewards/chosen": 3.0713939666748047, "rewards/margins": 5.903441905975342, "rewards/rejected": -2.832047700881958, "step": 4152 }, { "epoch": 3.0341552511415526, "grad_norm": 8.320260233110481, "learning_rate": 8.342185089774517e-08, "logits/chosen": -2.845242977142334, "logits/rejected": -2.6238017082214355, "logps/chosen": -690.5808715820312, "logps/rejected": -576.3321533203125, "loss": 0.025, "rewards/accuracies": 1.0, "rewards/chosen": 3.6634740829467773, "rewards/margins": 5.053948402404785, "rewards/rejected": -1.3904740810394287, "step": 4153 }, { "epoch": 3.0348858447488585, "grad_norm": 7.564915294992414, "learning_rate": 8.330294698766607e-08, "logits/chosen": -2.652383804321289, "logits/rejected": -2.2876837253570557, "logps/chosen": -440.59307861328125, "logps/rejected": -363.558837890625, "loss": 0.0325, "rewards/accuracies": 1.0, "rewards/chosen": 3.110851764678955, "rewards/margins": 7.241354942321777, "rewards/rejected": -4.130502700805664, "step": 4154 }, { "epoch": 3.0356164383561643, "grad_norm": 3.382093674754869, "learning_rate": 8.318411093410391e-08, "logits/chosen": -2.9100959300994873, "logits/rejected": -1.8619675636291504, "logps/chosen": -634.673095703125, "logps/rejected": -475.3121337890625, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": 4.115002155303955, "rewards/margins": 5.829154014587402, "rewards/rejected": -1.7141523361206055, "step": 4155 }, { "epoch": 3.0363470319634702, "grad_norm": 3.9677245249492548, "learning_rate": 8.306534278543287e-08, "logits/chosen": -2.8305282592773438, "logits/rejected": -2.6471686363220215, "logps/chosen": -586.3468627929688, "logps/rejected": -638.1353759765625, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": 2.3606479167938232, "rewards/margins": 6.38966178894043, "rewards/rejected": -4.0290141105651855, "step": 4156 }, { "epoch": 3.037077625570776, "grad_norm": 10.618204254801265, "learning_rate": 8.29466425899992e-08, "logits/chosen": -2.5067670345306396, "logits/rejected": -2.613330602645874, "logps/chosen": -441.8583068847656, "logps/rejected": -636.9972534179688, "loss": 0.0605, "rewards/accuracies": 1.0, "rewards/chosen": 1.8450641632080078, "rewards/margins": 4.256659030914307, "rewards/rejected": -2.411595106124878, "step": 4157 }, { "epoch": 3.037808219178082, "grad_norm": 4.075420315836444, "learning_rate": 8.282801039612155e-08, "logits/chosen": -3.1805925369262695, "logits/rejected": -2.26588773727417, "logps/chosen": -382.5480041503906, "logps/rejected": -372.7740173339844, "loss": 0.0175, "rewards/accuracies": 1.0, "rewards/chosen": 3.9517412185668945, "rewards/margins": 9.148038864135742, "rewards/rejected": -5.196297645568848, "step": 4158 }, { "epoch": 3.0385388127853883, "grad_norm": 5.90141623358425, "learning_rate": 8.27094462520909e-08, "logits/chosen": -3.133607864379883, "logits/rejected": -1.9567375183105469, "logps/chosen": -441.4631042480469, "logps/rejected": -304.4222106933594, "loss": 0.0215, "rewards/accuracies": 1.0, "rewards/chosen": 3.2570242881774902, "rewards/margins": 6.577687740325928, "rewards/rejected": -3.3206634521484375, "step": 4159 }, { "epoch": 3.039269406392694, "grad_norm": 5.085358617509365, "learning_rate": 8.259095020617066e-08, "logits/chosen": -2.629343271255493, "logits/rejected": -2.0358574390411377, "logps/chosen": -461.7383117675781, "logps/rejected": -505.79168701171875, "loss": 0.0167, "rewards/accuracies": 1.0, "rewards/chosen": 3.3653461933135986, "rewards/margins": 6.0279059410095215, "rewards/rejected": -2.6625595092773438, "step": 4160 }, { "epoch": 3.04, "grad_norm": 4.608172554030473, "learning_rate": 8.247252230659635e-08, "logits/chosen": -3.045083522796631, "logits/rejected": -2.359365463256836, "logps/chosen": -523.567626953125, "logps/rejected": -390.80450439453125, "loss": 0.0157, "rewards/accuracies": 1.0, "rewards/chosen": 3.813312530517578, "rewards/margins": 5.888448715209961, "rewards/rejected": -2.0751359462738037, "step": 4161 }, { "epoch": 3.040730593607306, "grad_norm": 6.497361454589242, "learning_rate": 8.23541626015757e-08, "logits/chosen": -2.8700103759765625, "logits/rejected": -2.249697685241699, "logps/chosen": -411.68817138671875, "logps/rejected": -538.5269775390625, "loss": 0.0363, "rewards/accuracies": 1.0, "rewards/chosen": 3.494503974914551, "rewards/margins": 7.510169506072998, "rewards/rejected": -4.015665531158447, "step": 4162 }, { "epoch": 3.041461187214612, "grad_norm": 8.806915461654578, "learning_rate": 8.223587113928901e-08, "logits/chosen": -2.5627024173736572, "logits/rejected": -1.4498956203460693, "logps/chosen": -547.2186889648438, "logps/rejected": -311.41497802734375, "loss": 0.0399, "rewards/accuracies": 1.0, "rewards/chosen": 2.8054256439208984, "rewards/margins": 5.821713447570801, "rewards/rejected": -3.0162878036499023, "step": 4163 }, { "epoch": 3.0421917808219177, "grad_norm": 4.782450805666292, "learning_rate": 8.211764796788839e-08, "logits/chosen": -2.3821661472320557, "logits/rejected": -2.5000383853912354, "logps/chosen": -754.4673461914062, "logps/rejected": -722.669189453125, "loss": 0.0194, "rewards/accuracies": 1.0, "rewards/chosen": 4.900218963623047, "rewards/margins": 5.485771179199219, "rewards/rejected": -0.5855523347854614, "step": 4164 }, { "epoch": 3.0429223744292235, "grad_norm": 4.244905222990597, "learning_rate": 8.19994931354985e-08, "logits/chosen": -2.3215126991271973, "logits/rejected": -2.046790599822998, "logps/chosen": -752.2157592773438, "logps/rejected": -422.1564025878906, "loss": 0.0405, "rewards/accuracies": 1.0, "rewards/chosen": 2.250502109527588, "rewards/margins": 4.745702266693115, "rewards/rejected": -2.4951999187469482, "step": 4165 }, { "epoch": 3.04365296803653, "grad_norm": 8.197042480805434, "learning_rate": 8.188140669021592e-08, "logits/chosen": -2.6396000385284424, "logits/rejected": -2.610914945602417, "logps/chosen": -629.8560180664062, "logps/rejected": -674.540283203125, "loss": 0.0507, "rewards/accuracies": 0.875, "rewards/chosen": 2.943593740463257, "rewards/margins": 4.8522748947143555, "rewards/rejected": -1.9086816310882568, "step": 4166 }, { "epoch": 3.0443835616438357, "grad_norm": 5.038916581281174, "learning_rate": 8.176338868010945e-08, "logits/chosen": -2.5222294330596924, "logits/rejected": -2.7732865810394287, "logps/chosen": -333.4951171875, "logps/rejected": -479.01416015625, "loss": 0.0216, "rewards/accuracies": 1.0, "rewards/chosen": 1.6958266496658325, "rewards/margins": 4.437531471252441, "rewards/rejected": -2.7417049407958984, "step": 4167 }, { "epoch": 3.0451141552511416, "grad_norm": 4.20268820155705, "learning_rate": 8.164543915322026e-08, "logits/chosen": -2.8595962524414062, "logits/rejected": -2.195657968521118, "logps/chosen": -711.98193359375, "logps/rejected": -438.2187194824219, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": 3.47286057472229, "rewards/margins": 5.247441291809082, "rewards/rejected": -1.7745808362960815, "step": 4168 }, { "epoch": 3.0458447488584475, "grad_norm": 6.887381484436462, "learning_rate": 8.152755815756122e-08, "logits/chosen": -2.8732473850250244, "logits/rejected": -2.407083511352539, "logps/chosen": -1011.3400268554688, "logps/rejected": -823.10791015625, "loss": 0.037, "rewards/accuracies": 1.0, "rewards/chosen": 3.8218190670013428, "rewards/margins": 5.132086753845215, "rewards/rejected": -1.310267686843872, "step": 4169 }, { "epoch": 3.0465753424657533, "grad_norm": 7.817021640723721, "learning_rate": 8.140974574111786e-08, "logits/chosen": -2.5400497913360596, "logits/rejected": -2.365192413330078, "logps/chosen": -264.16912841796875, "logps/rejected": -328.1048583984375, "loss": 0.0428, "rewards/accuracies": 1.0, "rewards/chosen": 2.121685266494751, "rewards/margins": 4.136116981506348, "rewards/rejected": -2.014431953430176, "step": 4170 }, { "epoch": 3.047305936073059, "grad_norm": 11.97963448758161, "learning_rate": 8.12920019518471e-08, "logits/chosen": -2.6481430530548096, "logits/rejected": -2.2300384044647217, "logps/chosen": -629.1748657226562, "logps/rejected": -483.16387939453125, "loss": 0.0456, "rewards/accuracies": 0.875, "rewards/chosen": 2.8003010749816895, "rewards/margins": 5.321750164031982, "rewards/rejected": -2.521449089050293, "step": 4171 }, { "epoch": 3.048036529680365, "grad_norm": 5.45669202903923, "learning_rate": 8.117432683767853e-08, "logits/chosen": -2.651776075363159, "logits/rejected": -2.273134231567383, "logps/chosen": -829.9486694335938, "logps/rejected": -629.1782836914062, "loss": 0.0278, "rewards/accuracies": 1.0, "rewards/chosen": 2.500725269317627, "rewards/margins": 4.308205604553223, "rewards/rejected": -1.8074800968170166, "step": 4172 }, { "epoch": 3.0487671232876714, "grad_norm": 4.648240952383602, "learning_rate": 8.105672044651358e-08, "logits/chosen": -2.68267560005188, "logits/rejected": -1.8838051557540894, "logps/chosen": -681.983154296875, "logps/rejected": -537.2634887695312, "loss": 0.0217, "rewards/accuracies": 1.0, "rewards/chosen": 2.7448458671569824, "rewards/margins": 4.601761817932129, "rewards/rejected": -1.856915831565857, "step": 4173 }, { "epoch": 3.0494977168949773, "grad_norm": 7.0738188927061625, "learning_rate": 8.093918282622562e-08, "logits/chosen": -2.8354856967926025, "logits/rejected": -2.278940200805664, "logps/chosen": -988.6951293945312, "logps/rejected": -752.7088623046875, "loss": 0.0308, "rewards/accuracies": 1.0, "rewards/chosen": 4.2132110595703125, "rewards/margins": 6.123506546020508, "rewards/rejected": -1.910295009613037, "step": 4174 }, { "epoch": 3.050228310502283, "grad_norm": 5.429957804366637, "learning_rate": 8.08217140246601e-08, "logits/chosen": -2.824267864227295, "logits/rejected": -2.220670700073242, "logps/chosen": -590.7048950195312, "logps/rejected": -604.3680419921875, "loss": 0.0298, "rewards/accuracies": 1.0, "rewards/chosen": 5.207972526550293, "rewards/margins": 8.858260154724121, "rewards/rejected": -3.650287389755249, "step": 4175 }, { "epoch": 3.050958904109589, "grad_norm": 8.25480835403912, "learning_rate": 8.070431408963432e-08, "logits/chosen": -3.1734893321990967, "logits/rejected": -2.7442431449890137, "logps/chosen": -1022.1050415039062, "logps/rejected": -788.2396240234375, "loss": 0.0433, "rewards/accuracies": 1.0, "rewards/chosen": 3.9577860832214355, "rewards/margins": 6.469860076904297, "rewards/rejected": -2.5120739936828613, "step": 4176 }, { "epoch": 3.051689497716895, "grad_norm": 4.148062536388059, "learning_rate": 8.05869830689379e-08, "logits/chosen": -2.7734057903289795, "logits/rejected": -1.9644148349761963, "logps/chosen": -802.7613525390625, "logps/rejected": -557.480712890625, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": 4.290094375610352, "rewards/margins": 6.775646686553955, "rewards/rejected": -2.4855523109436035, "step": 4177 }, { "epoch": 3.052420091324201, "grad_norm": 5.183762088390832, "learning_rate": 8.046972101033205e-08, "logits/chosen": -2.6429970264434814, "logits/rejected": -2.3462061882019043, "logps/chosen": -556.0497436523438, "logps/rejected": -476.1501770019531, "loss": 0.0285, "rewards/accuracies": 1.0, "rewards/chosen": 3.182068347930908, "rewards/margins": 4.200055122375488, "rewards/rejected": -1.01798677444458, "step": 4178 }, { "epoch": 3.0531506849315067, "grad_norm": 4.660732492662936, "learning_rate": 8.035252796154998e-08, "logits/chosen": -2.9508588314056396, "logits/rejected": -1.7609505653381348, "logps/chosen": -599.5631103515625, "logps/rejected": -413.54681396484375, "loss": 0.0178, "rewards/accuracies": 1.0, "rewards/chosen": 2.0678188800811768, "rewards/margins": 6.1220245361328125, "rewards/rejected": -4.054205417633057, "step": 4179 }, { "epoch": 3.053881278538813, "grad_norm": 6.586743143747084, "learning_rate": 8.023540397029702e-08, "logits/chosen": -2.994058847427368, "logits/rejected": -1.7824888229370117, "logps/chosen": -545.4881591796875, "logps/rejected": -319.9891357421875, "loss": 0.0486, "rewards/accuracies": 1.0, "rewards/chosen": 3.7141013145446777, "rewards/margins": 5.634041786193848, "rewards/rejected": -1.919940710067749, "step": 4180 }, { "epoch": 3.054611872146119, "grad_norm": 6.81267078430567, "learning_rate": 8.011834908425005e-08, "logits/chosen": -2.3874566555023193, "logits/rejected": -2.1811740398406982, "logps/chosen": -578.6904296875, "logps/rejected": -593.4618530273438, "loss": 0.0317, "rewards/accuracies": 1.0, "rewards/chosen": 3.167203187942505, "rewards/margins": 5.202839374542236, "rewards/rejected": -2.0356364250183105, "step": 4181 }, { "epoch": 3.0553424657534247, "grad_norm": 5.980302585358969, "learning_rate": 8.00013633510582e-08, "logits/chosen": -2.8352911472320557, "logits/rejected": -2.12373423576355, "logps/chosen": -271.72613525390625, "logps/rejected": -271.38427734375, "loss": 0.0445, "rewards/accuracies": 1.0, "rewards/chosen": 1.5712307691574097, "rewards/margins": 4.90837287902832, "rewards/rejected": -3.3371422290802, "step": 4182 }, { "epoch": 3.0560730593607306, "grad_norm": 9.221076077039745, "learning_rate": 7.988444681834213e-08, "logits/chosen": -2.949953079223633, "logits/rejected": -2.3467729091644287, "logps/chosen": -703.4566650390625, "logps/rejected": -510.51031494140625, "loss": 0.0516, "rewards/accuracies": 1.0, "rewards/chosen": 4.029397010803223, "rewards/margins": 4.214768409729004, "rewards/rejected": -0.1853717565536499, "step": 4183 }, { "epoch": 3.0568036529680365, "grad_norm": 5.208096165661342, "learning_rate": 7.976759953369443e-08, "logits/chosen": -2.8737568855285645, "logits/rejected": -2.5893492698669434, "logps/chosen": -635.80322265625, "logps/rejected": -1091.86669921875, "loss": 0.0225, "rewards/accuracies": 1.0, "rewards/chosen": 2.5378527641296387, "rewards/margins": 5.742292881011963, "rewards/rejected": -3.204440116882324, "step": 4184 }, { "epoch": 3.0575342465753423, "grad_norm": 6.053759064400767, "learning_rate": 7.965082154467964e-08, "logits/chosen": -2.596278429031372, "logits/rejected": -1.7675963640213013, "logps/chosen": -702.4813232421875, "logps/rejected": -492.78643798828125, "loss": 0.0321, "rewards/accuracies": 1.0, "rewards/chosen": 5.243754863739014, "rewards/margins": 7.251758575439453, "rewards/rejected": -2.0080037117004395, "step": 4185 }, { "epoch": 3.058264840182648, "grad_norm": 4.08782856871794, "learning_rate": 7.953411289883385e-08, "logits/chosen": -3.2031712532043457, "logits/rejected": -2.5335042476654053, "logps/chosen": -846.0853271484375, "logps/rejected": -624.0235595703125, "loss": 0.0193, "rewards/accuracies": 1.0, "rewards/chosen": 4.1894612312316895, "rewards/margins": 5.0761213302612305, "rewards/rejected": -0.8866598606109619, "step": 4186 }, { "epoch": 3.0589954337899545, "grad_norm": 5.166667240922137, "learning_rate": 7.94174736436653e-08, "logits/chosen": -2.528805732727051, "logits/rejected": -1.8779959678649902, "logps/chosen": -522.027099609375, "logps/rejected": -428.6514892578125, "loss": 0.0325, "rewards/accuracies": 1.0, "rewards/chosen": 2.6842665672302246, "rewards/margins": 3.9631149768829346, "rewards/rejected": -1.27884840965271, "step": 4187 }, { "epoch": 3.0597260273972604, "grad_norm": 4.570414328459627, "learning_rate": 7.930090382665338e-08, "logits/chosen": -2.939586639404297, "logits/rejected": -2.2520627975463867, "logps/chosen": -996.9358520507812, "logps/rejected": -829.5233154296875, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": 6.063238143920898, "rewards/margins": 7.635959625244141, "rewards/rejected": -1.5727216005325317, "step": 4188 }, { "epoch": 3.0604566210045663, "grad_norm": 6.338621876150869, "learning_rate": 7.918440349524974e-08, "logits/chosen": -2.8398332595825195, "logits/rejected": -2.2585184574127197, "logps/chosen": -601.0916748046875, "logps/rejected": -716.0022583007812, "loss": 0.0221, "rewards/accuracies": 1.0, "rewards/chosen": 3.0474016666412354, "rewards/margins": 5.673068046569824, "rewards/rejected": -2.625666379928589, "step": 4189 }, { "epoch": 3.061187214611872, "grad_norm": 3.2022870712308515, "learning_rate": 7.906797269687768e-08, "logits/chosen": -2.804361581802368, "logits/rejected": -2.163019895553589, "logps/chosen": -719.1553955078125, "logps/rejected": -733.2616577148438, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": 5.124222755432129, "rewards/margins": 8.693371772766113, "rewards/rejected": -3.5691497325897217, "step": 4190 }, { "epoch": 3.061917808219178, "grad_norm": 5.449073138735528, "learning_rate": 7.895161147893195e-08, "logits/chosen": -3.4671995639801025, "logits/rejected": -2.5456948280334473, "logps/chosen": -804.3890991210938, "logps/rejected": -640.9644165039062, "loss": 0.0266, "rewards/accuracies": 1.0, "rewards/chosen": 4.217511177062988, "rewards/margins": 5.297323226928711, "rewards/rejected": -1.0798118114471436, "step": 4191 }, { "epoch": 3.062648401826484, "grad_norm": 6.812204617037334, "learning_rate": 7.88353198887792e-08, "logits/chosen": -2.800605058670044, "logits/rejected": -2.2987403869628906, "logps/chosen": -308.2013244628906, "logps/rejected": -257.35113525390625, "loss": 0.0456, "rewards/accuracies": 1.0, "rewards/chosen": 2.3599488735198975, "rewards/margins": 5.527642250061035, "rewards/rejected": -3.1676933765411377, "step": 4192 }, { "epoch": 3.0633789954337898, "grad_norm": 4.067686789924078, "learning_rate": 7.87190979737575e-08, "logits/chosen": -2.5907135009765625, "logits/rejected": -2.6220438480377197, "logps/chosen": -784.0360717773438, "logps/rejected": -840.7237548828125, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/chosen": 3.5776994228363037, "rewards/margins": 5.057819843292236, "rewards/rejected": -1.4801206588745117, "step": 4193 }, { "epoch": 3.064109589041096, "grad_norm": 3.9653463110909146, "learning_rate": 7.860294578117691e-08, "logits/chosen": -2.9062068462371826, "logits/rejected": -1.823634386062622, "logps/chosen": -600.7838745117188, "logps/rejected": -388.9402770996094, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": 3.198105812072754, "rewards/margins": 7.873366832733154, "rewards/rejected": -4.675260543823242, "step": 4194 }, { "epoch": 3.064840182648402, "grad_norm": 6.44367817821122, "learning_rate": 7.848686335831872e-08, "logits/chosen": -3.6364059448242188, "logits/rejected": -2.7416458129882812, "logps/chosen": -667.74951171875, "logps/rejected": -536.1796264648438, "loss": 0.0312, "rewards/accuracies": 1.0, "rewards/chosen": 4.294105529785156, "rewards/margins": 5.550414085388184, "rewards/rejected": -1.2563080787658691, "step": 4195 }, { "epoch": 3.065570776255708, "grad_norm": 6.600506398974278, "learning_rate": 7.837085075243621e-08, "logits/chosen": -2.912698745727539, "logits/rejected": -2.1791279315948486, "logps/chosen": -755.641845703125, "logps/rejected": -531.4815673828125, "loss": 0.0334, "rewards/accuracies": 1.0, "rewards/chosen": 3.731632709503174, "rewards/margins": 4.6331400871276855, "rewards/rejected": -0.9015077352523804, "step": 4196 }, { "epoch": 3.0663013698630137, "grad_norm": 9.119115316748454, "learning_rate": 7.825490801075391e-08, "logits/chosen": -2.797476291656494, "logits/rejected": -2.224403142929077, "logps/chosen": -523.0245361328125, "logps/rejected": -406.6296691894531, "loss": 0.0371, "rewards/accuracies": 1.0, "rewards/chosen": 4.169512748718262, "rewards/margins": 7.957998752593994, "rewards/rejected": -3.7884857654571533, "step": 4197 }, { "epoch": 3.0670319634703196, "grad_norm": 6.699978034140036, "learning_rate": 7.8139035180468e-08, "logits/chosen": -2.9323647022247314, "logits/rejected": -1.5754599571228027, "logps/chosen": -570.7778930664062, "logps/rejected": -318.6186828613281, "loss": 0.0394, "rewards/accuracies": 1.0, "rewards/chosen": 3.208808660507202, "rewards/margins": 5.878302574157715, "rewards/rejected": -2.6694936752319336, "step": 4198 }, { "epoch": 3.0677625570776255, "grad_norm": 4.369554018356557, "learning_rate": 7.802323230874639e-08, "logits/chosen": -2.5722720623016357, "logits/rejected": -2.676682949066162, "logps/chosen": -652.27099609375, "logps/rejected": -724.6775512695312, "loss": 0.0246, "rewards/accuracies": 1.0, "rewards/chosen": 3.458986759185791, "rewards/margins": 4.79174280166626, "rewards/rejected": -1.332756519317627, "step": 4199 }, { "epoch": 3.0684931506849313, "grad_norm": 6.630299014430039, "learning_rate": 7.790749944272826e-08, "logits/chosen": -3.0860633850097656, "logits/rejected": -1.769062876701355, "logps/chosen": -618.678955078125, "logps/rejected": -356.345703125, "loss": 0.0299, "rewards/accuracies": 1.0, "rewards/chosen": 3.3516182899475098, "rewards/margins": 6.226491451263428, "rewards/rejected": -2.874873161315918, "step": 4200 }, { "epoch": 3.0692237442922377, "grad_norm": 6.3026154432728365, "learning_rate": 7.779183662952435e-08, "logits/chosen": -2.412447452545166, "logits/rejected": -2.1982555389404297, "logps/chosen": -680.6183471679688, "logps/rejected": -789.3013305664062, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": 5.54409122467041, "rewards/margins": 5.925398826599121, "rewards/rejected": -0.3813079595565796, "step": 4201 }, { "epoch": 3.0699543378995435, "grad_norm": 4.274981510947774, "learning_rate": 7.767624391621704e-08, "logits/chosen": -2.8948326110839844, "logits/rejected": -1.8044044971466064, "logps/chosen": -572.7359619140625, "logps/rejected": -361.03558349609375, "loss": 0.0167, "rewards/accuracies": 1.0, "rewards/chosen": 3.919174909591675, "rewards/margins": 7.00140905380249, "rewards/rejected": -3.0822346210479736, "step": 4202 }, { "epoch": 3.0706849315068494, "grad_norm": 8.93657588849554, "learning_rate": 7.756072134985997e-08, "logits/chosen": -2.319488286972046, "logits/rejected": -2.113530158996582, "logps/chosen": -558.3799438476562, "logps/rejected": -611.8870849609375, "loss": 0.0467, "rewards/accuracies": 0.875, "rewards/chosen": 2.590595006942749, "rewards/margins": 5.962268829345703, "rewards/rejected": -3.371673583984375, "step": 4203 }, { "epoch": 3.0714155251141553, "grad_norm": 6.022475110070279, "learning_rate": 7.744526897747844e-08, "logits/chosen": -2.7457494735717773, "logits/rejected": -2.285710334777832, "logps/chosen": -627.8486328125, "logps/rejected": -578.8341064453125, "loss": 0.0264, "rewards/accuracies": 1.0, "rewards/chosen": 2.3761584758758545, "rewards/margins": 5.411194801330566, "rewards/rejected": -3.03503680229187, "step": 4204 }, { "epoch": 3.072146118721461, "grad_norm": 4.764752417553921, "learning_rate": 7.7329886846069e-08, "logits/chosen": -2.563202381134033, "logits/rejected": -2.012388229370117, "logps/chosen": -485.41131591796875, "logps/rejected": -532.8460083007812, "loss": 0.0278, "rewards/accuracies": 1.0, "rewards/chosen": 1.7109756469726562, "rewards/margins": 4.811878681182861, "rewards/rejected": -3.100903034210205, "step": 4205 }, { "epoch": 3.072876712328767, "grad_norm": 8.18105336092109, "learning_rate": 7.721457500259956e-08, "logits/chosen": -2.2583093643188477, "logits/rejected": -2.5869948863983154, "logps/chosen": -435.99420166015625, "logps/rejected": -462.44482421875, "loss": 0.0364, "rewards/accuracies": 1.0, "rewards/chosen": 1.0014472007751465, "rewards/margins": 5.309765815734863, "rewards/rejected": -4.308318614959717, "step": 4206 }, { "epoch": 3.073607305936073, "grad_norm": 6.872178209527592, "learning_rate": 7.709933349400968e-08, "logits/chosen": -2.745854377746582, "logits/rejected": -2.7578935623168945, "logps/chosen": -491.82489013671875, "logps/rejected": -553.1126708984375, "loss": 0.0348, "rewards/accuracies": 1.0, "rewards/chosen": 3.4567160606384277, "rewards/margins": 6.1117658615112305, "rewards/rejected": -2.655050039291382, "step": 4207 }, { "epoch": 3.074337899543379, "grad_norm": 7.670605711812072, "learning_rate": 7.698416236721e-08, "logits/chosen": -2.543210029602051, "logits/rejected": -2.5143752098083496, "logps/chosen": -611.3287963867188, "logps/rejected": -666.22705078125, "loss": 0.0433, "rewards/accuracies": 0.875, "rewards/chosen": 3.6331546306610107, "rewards/margins": 4.494087219238281, "rewards/rejected": -0.8609323501586914, "step": 4208 }, { "epoch": 3.075068493150685, "grad_norm": 8.435503688170881, "learning_rate": 7.686906166908286e-08, "logits/chosen": -2.8000311851501465, "logits/rejected": -1.5723049640655518, "logps/chosen": -571.0322265625, "logps/rejected": -419.78338623046875, "loss": 0.0589, "rewards/accuracies": 1.0, "rewards/chosen": 2.300652027130127, "rewards/margins": 4.240271091461182, "rewards/rejected": -1.9396189451217651, "step": 4209 }, { "epoch": 3.075799086757991, "grad_norm": 8.339150750627981, "learning_rate": 7.675403144648137e-08, "logits/chosen": -2.8432841300964355, "logits/rejected": -2.2208251953125, "logps/chosen": -711.5697021484375, "logps/rejected": -440.86932373046875, "loss": 0.042, "rewards/accuracies": 1.0, "rewards/chosen": 3.908132553100586, "rewards/margins": 5.315540313720703, "rewards/rejected": -1.4074077606201172, "step": 4210 }, { "epoch": 3.076529680365297, "grad_norm": 5.394519758937345, "learning_rate": 7.663907174623061e-08, "logits/chosen": -2.992643356323242, "logits/rejected": -2.3281960487365723, "logps/chosen": -632.2317504882812, "logps/rejected": -558.1183471679688, "loss": 0.0404, "rewards/accuracies": 1.0, "rewards/chosen": 5.1173906326293945, "rewards/margins": 8.46845531463623, "rewards/rejected": -3.351064682006836, "step": 4211 }, { "epoch": 3.0772602739726027, "grad_norm": 4.382274791895708, "learning_rate": 7.652418261512639e-08, "logits/chosen": -2.652843952178955, "logits/rejected": -2.1901354789733887, "logps/chosen": -706.5660400390625, "logps/rejected": -693.8193359375, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": 4.316391944885254, "rewards/margins": 6.457735538482666, "rewards/rejected": -2.141343593597412, "step": 4212 }, { "epoch": 3.0779908675799086, "grad_norm": 7.33135511708328, "learning_rate": 7.640936409993626e-08, "logits/chosen": -3.0470693111419678, "logits/rejected": -2.5214951038360596, "logps/chosen": -844.4610595703125, "logps/rejected": -804.2698364257812, "loss": 0.0399, "rewards/accuracies": 1.0, "rewards/chosen": 5.295663833618164, "rewards/margins": 6.695679664611816, "rewards/rejected": -1.4000158309936523, "step": 4213 }, { "epoch": 3.0787214611872145, "grad_norm": 7.187396239485419, "learning_rate": 7.629461624739872e-08, "logits/chosen": -2.5933923721313477, "logits/rejected": -2.3533334732055664, "logps/chosen": -798.8773193359375, "logps/rejected": -820.096923828125, "loss": 0.0399, "rewards/accuracies": 1.0, "rewards/chosen": 4.529674053192139, "rewards/margins": 5.844725131988525, "rewards/rejected": -1.3150508403778076, "step": 4214 }, { "epoch": 3.0794520547945208, "grad_norm": 6.089344799456664, "learning_rate": 7.617993910422349e-08, "logits/chosen": -2.87906551361084, "logits/rejected": -2.367050886154175, "logps/chosen": -713.3662719726562, "logps/rejected": -676.2940063476562, "loss": 0.0401, "rewards/accuracies": 1.0, "rewards/chosen": 2.9511797428131104, "rewards/margins": 4.8931732177734375, "rewards/rejected": -1.9419937133789062, "step": 4215 }, { "epoch": 3.0801826484018267, "grad_norm": 6.899002345841022, "learning_rate": 7.606533271709175e-08, "logits/chosen": -2.6385719776153564, "logits/rejected": -1.9413645267486572, "logps/chosen": -366.87017822265625, "logps/rejected": -279.05426025390625, "loss": 0.0281, "rewards/accuracies": 1.0, "rewards/chosen": 2.040393114089966, "rewards/margins": 4.903968334197998, "rewards/rejected": -2.863574981689453, "step": 4216 }, { "epoch": 3.0809132420091325, "grad_norm": 8.99776558160001, "learning_rate": 7.595079713265565e-08, "logits/chosen": -2.949948787689209, "logits/rejected": -3.065241813659668, "logps/chosen": -554.8523559570312, "logps/rejected": -639.904541015625, "loss": 0.0591, "rewards/accuracies": 1.0, "rewards/chosen": 2.0900440216064453, "rewards/margins": 3.6516976356506348, "rewards/rejected": -1.5616538524627686, "step": 4217 }, { "epoch": 3.0816438356164384, "grad_norm": 5.750162536977555, "learning_rate": 7.583633239753867e-08, "logits/chosen": -2.775381088256836, "logits/rejected": -2.0311472415924072, "logps/chosen": -686.3790283203125, "logps/rejected": -512.5784912109375, "loss": 0.0206, "rewards/accuracies": 1.0, "rewards/chosen": 3.7773492336273193, "rewards/margins": 6.752799987792969, "rewards/rejected": -2.9754505157470703, "step": 4218 }, { "epoch": 3.0823744292237443, "grad_norm": 8.96244734106196, "learning_rate": 7.57219385583354e-08, "logits/chosen": -2.772617816925049, "logits/rejected": -2.5249617099761963, "logps/chosen": -1179.707275390625, "logps/rejected": -1139.754638671875, "loss": 0.043, "rewards/accuracies": 1.0, "rewards/chosen": 4.8519816398620605, "rewards/margins": 4.7559709548950195, "rewards/rejected": 0.09601044654846191, "step": 4219 }, { "epoch": 3.08310502283105, "grad_norm": 7.109404243157576, "learning_rate": 7.560761566161139e-08, "logits/chosen": -2.4434125423431396, "logits/rejected": -2.6845860481262207, "logps/chosen": -395.84014892578125, "logps/rejected": -545.0993041992188, "loss": 0.0404, "rewards/accuracies": 1.0, "rewards/chosen": 2.5453898906707764, "rewards/margins": 5.331932067871094, "rewards/rejected": -2.7865424156188965, "step": 4220 }, { "epoch": 3.083835616438356, "grad_norm": 7.580108068495997, "learning_rate": 7.549336375390369e-08, "logits/chosen": -2.689833641052246, "logits/rejected": -1.915576457977295, "logps/chosen": -478.806640625, "logps/rejected": -470.24859619140625, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": 3.18466854095459, "rewards/margins": 5.138866901397705, "rewards/rejected": -1.9541980028152466, "step": 4221 }, { "epoch": 3.084566210045662, "grad_norm": 4.644151163550474, "learning_rate": 7.537918288172013e-08, "logits/chosen": -2.604306697845459, "logits/rejected": -2.3843703269958496, "logps/chosen": -513.2171020507812, "logps/rejected": -582.59814453125, "loss": 0.0229, "rewards/accuracies": 1.0, "rewards/chosen": 3.4513802528381348, "rewards/margins": 5.3739728927612305, "rewards/rejected": -1.9225927591323853, "step": 4222 }, { "epoch": 3.085296803652968, "grad_norm": 4.796477758007081, "learning_rate": 7.526507309153976e-08, "logits/chosen": -2.9817066192626953, "logits/rejected": -2.2199244499206543, "logps/chosen": -604.2534790039062, "logps/rejected": -425.8041076660156, "loss": 0.0276, "rewards/accuracies": 1.0, "rewards/chosen": 4.344989776611328, "rewards/margins": 8.164276123046875, "rewards/rejected": -3.819286584854126, "step": 4223 }, { "epoch": 3.086027397260274, "grad_norm": 5.974334500970982, "learning_rate": 7.515103442981258e-08, "logits/chosen": -2.3751907348632812, "logits/rejected": -1.9933080673217773, "logps/chosen": -722.111328125, "logps/rejected": -446.211669921875, "loss": 0.0274, "rewards/accuracies": 1.0, "rewards/chosen": 3.885878562927246, "rewards/margins": 5.5970988273620605, "rewards/rejected": -1.711220622062683, "step": 4224 }, { "epoch": 3.08675799086758, "grad_norm": 3.150662603776303, "learning_rate": 7.50370669429598e-08, "logits/chosen": -2.9694671630859375, "logits/rejected": -1.9461517333984375, "logps/chosen": -956.364013671875, "logps/rejected": -571.9216918945312, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": 6.20627498626709, "rewards/margins": 8.423504829406738, "rewards/rejected": -2.2172293663024902, "step": 4225 }, { "epoch": 3.087488584474886, "grad_norm": 5.995575545095179, "learning_rate": 7.492317067737375e-08, "logits/chosen": -2.493363857269287, "logits/rejected": -2.1427202224731445, "logps/chosen": -574.1619262695312, "logps/rejected": -637.4037475585938, "loss": 0.0237, "rewards/accuracies": 1.0, "rewards/chosen": 2.3947439193725586, "rewards/margins": 4.181204795837402, "rewards/rejected": -1.7864608764648438, "step": 4226 }, { "epoch": 3.0882191780821917, "grad_norm": 2.53919376914276, "learning_rate": 7.480934567941724e-08, "logits/chosen": -2.521758556365967, "logits/rejected": -2.217021942138672, "logps/chosen": -839.6099853515625, "logps/rejected": -724.1507568359375, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": 3.0413951873779297, "rewards/margins": 5.499696254730225, "rewards/rejected": -2.458300828933716, "step": 4227 }, { "epoch": 3.0889497716894976, "grad_norm": 5.711329915019212, "learning_rate": 7.469559199542475e-08, "logits/chosen": -2.8528425693511963, "logits/rejected": -2.605869770050049, "logps/chosen": -606.4913330078125, "logps/rejected": -604.3902587890625, "loss": 0.0336, "rewards/accuracies": 1.0, "rewards/chosen": 4.918333053588867, "rewards/margins": 5.098219871520996, "rewards/rejected": -0.17988717555999756, "step": 4228 }, { "epoch": 3.0896803652968035, "grad_norm": 11.350288797477042, "learning_rate": 7.458190967170119e-08, "logits/chosen": -3.02071475982666, "logits/rejected": -2.5611019134521484, "logps/chosen": -481.1209716796875, "logps/rejected": -498.3533020019531, "loss": 0.0677, "rewards/accuracies": 1.0, "rewards/chosen": 2.899226188659668, "rewards/margins": 6.385605812072754, "rewards/rejected": -3.4863791465759277, "step": 4229 }, { "epoch": 3.0904109589041098, "grad_norm": 6.424808472624854, "learning_rate": 7.446829875452279e-08, "logits/chosen": -2.7248260974884033, "logits/rejected": -2.361708641052246, "logps/chosen": -678.4075317382812, "logps/rejected": -573.867431640625, "loss": 0.0296, "rewards/accuracies": 1.0, "rewards/chosen": 3.52915096282959, "rewards/margins": 5.361242771148682, "rewards/rejected": -1.83209228515625, "step": 4230 }, { "epoch": 3.0911415525114156, "grad_norm": 9.726650351008884, "learning_rate": 7.435475929013654e-08, "logits/chosen": -2.5970335006713867, "logits/rejected": -2.002354383468628, "logps/chosen": -669.108154296875, "logps/rejected": -492.9018859863281, "loss": 0.0346, "rewards/accuracies": 1.0, "rewards/chosen": 4.443073749542236, "rewards/margins": 7.4681077003479, "rewards/rejected": -3.025033712387085, "step": 4231 }, { "epoch": 3.0918721461187215, "grad_norm": 4.839936952101482, "learning_rate": 7.424129132476026e-08, "logits/chosen": -2.368231773376465, "logits/rejected": -2.2601590156555176, "logps/chosen": -680.35986328125, "logps/rejected": -636.2972412109375, "loss": 0.0311, "rewards/accuracies": 1.0, "rewards/chosen": 3.7281577587127686, "rewards/margins": 4.5665082931518555, "rewards/rejected": -0.8383510708808899, "step": 4232 }, { "epoch": 3.0926027397260274, "grad_norm": 3.891636605864804, "learning_rate": 7.412789490458293e-08, "logits/chosen": -2.691620111465454, "logits/rejected": -2.748386859893799, "logps/chosen": -539.77197265625, "logps/rejected": -634.3111572265625, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": 3.4612202644348145, "rewards/margins": 5.408936023712158, "rewards/rejected": -1.9477155208587646, "step": 4233 }, { "epoch": 3.0933333333333333, "grad_norm": 2.7928527381117974, "learning_rate": 7.401457007576411e-08, "logits/chosen": -3.2241547107696533, "logits/rejected": -2.1011149883270264, "logps/chosen": -745.669677734375, "logps/rejected": -579.4412841796875, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": 5.001699447631836, "rewards/margins": 7.2943315505981445, "rewards/rejected": -2.2926321029663086, "step": 4234 }, { "epoch": 3.094063926940639, "grad_norm": 3.660639679045487, "learning_rate": 7.390131688443448e-08, "logits/chosen": -1.9565439224243164, "logits/rejected": -1.857893466949463, "logps/chosen": -464.019287109375, "logps/rejected": -750.0712280273438, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": 3.387964963912964, "rewards/margins": 8.710226058959961, "rewards/rejected": -5.322260856628418, "step": 4235 }, { "epoch": 3.094794520547945, "grad_norm": 6.020964237184853, "learning_rate": 7.378813537669543e-08, "logits/chosen": -2.6470742225646973, "logits/rejected": -1.2844825983047485, "logps/chosen": -797.9496459960938, "logps/rejected": -345.4873046875, "loss": 0.052, "rewards/accuracies": 1.0, "rewards/chosen": 4.63140344619751, "rewards/margins": 5.089140892028809, "rewards/rejected": -0.45773717761039734, "step": 4236 }, { "epoch": 3.0955251141552513, "grad_norm": 7.073073043180731, "learning_rate": 7.367502559861902e-08, "logits/chosen": -3.1870670318603516, "logits/rejected": -2.166465997695923, "logps/chosen": -801.866943359375, "logps/rejected": -542.3909912109375, "loss": 0.0262, "rewards/accuracies": 1.0, "rewards/chosen": 4.7085280418396, "rewards/margins": 5.553841590881348, "rewards/rejected": -0.845313549041748, "step": 4237 }, { "epoch": 3.096255707762557, "grad_norm": 4.9740324910279545, "learning_rate": 7.356198759624846e-08, "logits/chosen": -3.2901923656463623, "logits/rejected": -2.0386416912078857, "logps/chosen": -931.906982421875, "logps/rejected": -519.3350830078125, "loss": 0.0225, "rewards/accuracies": 1.0, "rewards/chosen": 4.127664566040039, "rewards/margins": 5.854881286621094, "rewards/rejected": -1.727217435836792, "step": 4238 }, { "epoch": 3.096986301369863, "grad_norm": 13.496175153517164, "learning_rate": 7.344902141559748e-08, "logits/chosen": -2.9540927410125732, "logits/rejected": -2.3569276332855225, "logps/chosen": -554.408935546875, "logps/rejected": -457.11175537109375, "loss": 0.0505, "rewards/accuracies": 1.0, "rewards/chosen": 3.6986169815063477, "rewards/margins": 5.910994529724121, "rewards/rejected": -2.2123780250549316, "step": 4239 }, { "epoch": 3.097716894977169, "grad_norm": 5.420358759137516, "learning_rate": 7.333612710265061e-08, "logits/chosen": -2.222989797592163, "logits/rejected": -2.4654488563537598, "logps/chosen": -425.0879821777344, "logps/rejected": -515.5720825195312, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": 4.544952869415283, "rewards/margins": 9.529875755310059, "rewards/rejected": -4.984923362731934, "step": 4240 }, { "epoch": 3.098447488584475, "grad_norm": 10.6655197321662, "learning_rate": 7.322330470336313e-08, "logits/chosen": -2.7183308601379395, "logits/rejected": -1.4421966075897217, "logps/chosen": -538.7998046875, "logps/rejected": -318.56927490234375, "loss": 0.0274, "rewards/accuracies": 1.0, "rewards/chosen": 4.601837158203125, "rewards/margins": 8.109818458557129, "rewards/rejected": -3.507981777191162, "step": 4241 }, { "epoch": 3.0991780821917807, "grad_norm": 6.48598348132114, "learning_rate": 7.311055426366114e-08, "logits/chosen": -2.6346304416656494, "logits/rejected": -2.502420425415039, "logps/chosen": -264.12451171875, "logps/rejected": -475.5592346191406, "loss": 0.0348, "rewards/accuracies": 1.0, "rewards/chosen": 1.4018588066101074, "rewards/margins": 5.245905876159668, "rewards/rejected": -3.8440468311309814, "step": 4242 }, { "epoch": 3.0999086757990866, "grad_norm": 4.549068171062683, "learning_rate": 7.299787582944145e-08, "logits/chosen": -2.7452330589294434, "logits/rejected": -2.1108896732330322, "logps/chosen": -659.0467529296875, "logps/rejected": -474.6816101074219, "loss": 0.0185, "rewards/accuracies": 1.0, "rewards/chosen": 2.4792685508728027, "rewards/margins": 4.440033435821533, "rewards/rejected": -1.960764765739441, "step": 4243 }, { "epoch": 3.100639269406393, "grad_norm": 10.30332140655798, "learning_rate": 7.288526944657142e-08, "logits/chosen": -3.6510400772094727, "logits/rejected": -2.344068765640259, "logps/chosen": -762.40869140625, "logps/rejected": -514.2713012695312, "loss": 0.0513, "rewards/accuracies": 1.0, "rewards/chosen": 5.4210920333862305, "rewards/margins": 7.5041399002075195, "rewards/rejected": -2.083047866821289, "step": 4244 }, { "epoch": 3.1013698630136988, "grad_norm": 5.694365664715107, "learning_rate": 7.277273516088916e-08, "logits/chosen": -2.7765302658081055, "logits/rejected": -1.548646330833435, "logps/chosen": -585.6509399414062, "logps/rejected": -349.28582763671875, "loss": 0.0342, "rewards/accuracies": 1.0, "rewards/chosen": 3.5777013301849365, "rewards/margins": 6.765895843505859, "rewards/rejected": -3.1881940364837646, "step": 4245 }, { "epoch": 3.1021004566210046, "grad_norm": 6.170527443869517, "learning_rate": 7.266027301820335e-08, "logits/chosen": -3.065122365951538, "logits/rejected": -2.0261311531066895, "logps/chosen": -725.441162109375, "logps/rejected": -531.9276733398438, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": 4.981401443481445, "rewards/margins": 6.023635387420654, "rewards/rejected": -1.0422337055206299, "step": 4246 }, { "epoch": 3.1028310502283105, "grad_norm": 3.2860728832295374, "learning_rate": 7.254788306429354e-08, "logits/chosen": -3.0573782920837402, "logits/rejected": -1.591461181640625, "logps/chosen": -878.6209716796875, "logps/rejected": -474.31256103515625, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": 4.7559895515441895, "rewards/margins": 5.909161567687988, "rewards/rejected": -1.153172492980957, "step": 4247 }, { "epoch": 3.1035616438356164, "grad_norm": 8.664131184917109, "learning_rate": 7.243556534490966e-08, "logits/chosen": -2.8926379680633545, "logits/rejected": -1.9228848218917847, "logps/chosen": -433.73675537109375, "logps/rejected": -395.3272705078125, "loss": 0.0259, "rewards/accuracies": 1.0, "rewards/chosen": 2.2135801315307617, "rewards/margins": 5.477006912231445, "rewards/rejected": -3.2634265422821045, "step": 4248 }, { "epoch": 3.1042922374429223, "grad_norm": 5.740400178895822, "learning_rate": 7.23233199057722e-08, "logits/chosen": -2.5006542205810547, "logits/rejected": -2.2245001792907715, "logps/chosen": -436.1790466308594, "logps/rejected": -638.090576171875, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": 1.5813193321228027, "rewards/margins": 5.478797912597656, "rewards/rejected": -3.8974785804748535, "step": 4249 }, { "epoch": 3.105022831050228, "grad_norm": 8.121328610120063, "learning_rate": 7.221114679257251e-08, "logits/chosen": -2.814406394958496, "logits/rejected": -1.7352464199066162, "logps/chosen": -559.6627807617188, "logps/rejected": -315.9568786621094, "loss": 0.0346, "rewards/accuracies": 1.0, "rewards/chosen": 3.5259766578674316, "rewards/margins": 5.616422176361084, "rewards/rejected": -2.0904455184936523, "step": 4250 }, { "epoch": 3.1057534246575345, "grad_norm": 3.8681777456918427, "learning_rate": 7.209904605097216e-08, "logits/chosen": -2.2457525730133057, "logits/rejected": -1.9025980234146118, "logps/chosen": -487.34564208984375, "logps/rejected": -469.86419677734375, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": 3.762929677963257, "rewards/margins": 6.961106777191162, "rewards/rejected": -3.198176622390747, "step": 4251 }, { "epoch": 3.1064840182648403, "grad_norm": 3.622136530678406, "learning_rate": 7.198701772660362e-08, "logits/chosen": -2.8784220218658447, "logits/rejected": -1.9941837787628174, "logps/chosen": -768.05517578125, "logps/rejected": -478.1717529296875, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": 4.977337837219238, "rewards/margins": 6.946232795715332, "rewards/rejected": -1.9688951969146729, "step": 4252 }, { "epoch": 3.107214611872146, "grad_norm": 4.3413272385737995, "learning_rate": 7.187506186506958e-08, "logits/chosen": -3.1121416091918945, "logits/rejected": -3.2056612968444824, "logps/chosen": -375.8202819824219, "logps/rejected": -492.4502868652344, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": 1.3086374998092651, "rewards/margins": 4.120192050933838, "rewards/rejected": -2.811554431915283, "step": 4253 }, { "epoch": 3.107945205479452, "grad_norm": 6.563584592293704, "learning_rate": 7.176317851194327e-08, "logits/chosen": -3.3830621242523193, "logits/rejected": -2.4230263233184814, "logps/chosen": -635.4860229492188, "logps/rejected": -449.89483642578125, "loss": 0.0306, "rewards/accuracies": 1.0, "rewards/chosen": 4.061895370483398, "rewards/margins": 5.292436122894287, "rewards/rejected": -1.2305408716201782, "step": 4254 }, { "epoch": 3.108675799086758, "grad_norm": 5.580070434180472, "learning_rate": 7.165136771276861e-08, "logits/chosen": -2.6381847858428955, "logits/rejected": -2.4050955772399902, "logps/chosen": -543.805419921875, "logps/rejected": -568.0244750976562, "loss": 0.0263, "rewards/accuracies": 1.0, "rewards/chosen": 2.1748244762420654, "rewards/margins": 3.435670852661133, "rewards/rejected": -1.2608463764190674, "step": 4255 }, { "epoch": 3.109406392694064, "grad_norm": 6.891114371086191, "learning_rate": 7.153962951305972e-08, "logits/chosen": -2.5569210052490234, "logits/rejected": -1.173046588897705, "logps/chosen": -634.568359375, "logps/rejected": -285.0154724121094, "loss": 0.0338, "rewards/accuracies": 1.0, "rewards/chosen": 3.3519465923309326, "rewards/margins": 5.092106342315674, "rewards/rejected": -1.740159273147583, "step": 4256 }, { "epoch": 3.1101369863013697, "grad_norm": 8.718647617846456, "learning_rate": 7.142796395830153e-08, "logits/chosen": -3.0428409576416016, "logits/rejected": -2.930234909057617, "logps/chosen": -651.5026245117188, "logps/rejected": -632.5743408203125, "loss": 0.0589, "rewards/accuracies": 1.0, "rewards/chosen": 3.2858986854553223, "rewards/margins": 3.5759057998657227, "rewards/rejected": -0.29000720381736755, "step": 4257 }, { "epoch": 3.1108675799086756, "grad_norm": 4.8016562227089, "learning_rate": 7.131637109394883e-08, "logits/chosen": -2.876704692840576, "logits/rejected": -2.190378189086914, "logps/chosen": -728.4004516601562, "logps/rejected": -551.6461791992188, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": 3.5731260776519775, "rewards/margins": 4.604809761047363, "rewards/rejected": -1.0316840410232544, "step": 4258 }, { "epoch": 3.111598173515982, "grad_norm": 5.898411954104449, "learning_rate": 7.120485096542733e-08, "logits/chosen": -3.2015981674194336, "logits/rejected": -2.180919647216797, "logps/chosen": -548.7201538085938, "logps/rejected": -392.7918395996094, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": 3.406261444091797, "rewards/margins": 6.198433876037598, "rewards/rejected": -2.792172431945801, "step": 4259 }, { "epoch": 3.1123287671232878, "grad_norm": 9.631962623447633, "learning_rate": 7.109340361813305e-08, "logits/chosen": -2.9790802001953125, "logits/rejected": -1.8279671669006348, "logps/chosen": -759.518310546875, "logps/rejected": -463.3393859863281, "loss": 0.0332, "rewards/accuracies": 1.0, "rewards/chosen": 3.4618115425109863, "rewards/margins": 6.227707862854004, "rewards/rejected": -2.7658963203430176, "step": 4260 }, { "epoch": 3.1130593607305936, "grad_norm": 7.545762384826448, "learning_rate": 7.09820290974322e-08, "logits/chosen": -2.388367176055908, "logits/rejected": -2.2207579612731934, "logps/chosen": -653.8464965820312, "logps/rejected": -663.57373046875, "loss": 0.03, "rewards/accuracies": 1.0, "rewards/chosen": 4.155123710632324, "rewards/margins": 5.574423789978027, "rewards/rejected": -1.419299602508545, "step": 4261 }, { "epoch": 3.1137899543378995, "grad_norm": 7.394257828080178, "learning_rate": 7.087072744866143e-08, "logits/chosen": -2.7593297958374023, "logits/rejected": -2.269219160079956, "logps/chosen": -679.2775268554688, "logps/rejected": -538.263671875, "loss": 0.0303, "rewards/accuracies": 1.0, "rewards/chosen": 3.741403341293335, "rewards/margins": 6.518313407897949, "rewards/rejected": -2.776909828186035, "step": 4262 }, { "epoch": 3.1145205479452054, "grad_norm": 7.211095409379482, "learning_rate": 7.075949871712766e-08, "logits/chosen": -2.696451187133789, "logits/rejected": -2.5591466426849365, "logps/chosen": -515.4395751953125, "logps/rejected": -514.6215209960938, "loss": 0.0491, "rewards/accuracies": 1.0, "rewards/chosen": 1.1366393566131592, "rewards/margins": 4.254140377044678, "rewards/rejected": -3.1175007820129395, "step": 4263 }, { "epoch": 3.1152511415525113, "grad_norm": 2.5238540352874543, "learning_rate": 7.064834294810834e-08, "logits/chosen": -2.538407325744629, "logits/rejected": -1.7576395273208618, "logps/chosen": -715.310302734375, "logps/rejected": -643.749755859375, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": 3.058605670928955, "rewards/margins": 6.942363262176514, "rewards/rejected": -3.8837578296661377, "step": 4264 }, { "epoch": 3.115981735159817, "grad_norm": 6.535242639895223, "learning_rate": 7.053726018685106e-08, "logits/chosen": -2.2212624549865723, "logits/rejected": -2.3045711517333984, "logps/chosen": -518.5128784179688, "logps/rejected": -554.29052734375, "loss": 0.0282, "rewards/accuracies": 1.0, "rewards/chosen": 2.680335521697998, "rewards/margins": 6.009601593017578, "rewards/rejected": -3.329266309738159, "step": 4265 }, { "epoch": 3.1167123287671235, "grad_norm": 8.321217490395316, "learning_rate": 7.042625047857357e-08, "logits/chosen": -3.0581228733062744, "logits/rejected": -2.509672164916992, "logps/chosen": -797.8264770507812, "logps/rejected": -687.074951171875, "loss": 0.031, "rewards/accuracies": 1.0, "rewards/chosen": 3.9003195762634277, "rewards/margins": 5.576936721801758, "rewards/rejected": -1.6766173839569092, "step": 4266 }, { "epoch": 3.1174429223744293, "grad_norm": 3.5741567137290486, "learning_rate": 7.03153138684642e-08, "logits/chosen": -3.1339805126190186, "logits/rejected": -1.5491658449172974, "logps/chosen": -759.07275390625, "logps/rejected": -490.41717529296875, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": 4.019761085510254, "rewards/margins": 7.414889335632324, "rewards/rejected": -3.3951282501220703, "step": 4267 }, { "epoch": 3.118173515981735, "grad_norm": 5.813624447582532, "learning_rate": 7.020445040168121e-08, "logits/chosen": -2.757816791534424, "logits/rejected": -2.529045343399048, "logps/chosen": -519.7386474609375, "logps/rejected": -591.4632568359375, "loss": 0.0273, "rewards/accuracies": 1.0, "rewards/chosen": 1.673980951309204, "rewards/margins": 4.660888671875, "rewards/rejected": -2.986907720565796, "step": 4268 }, { "epoch": 3.118904109589041, "grad_norm": 5.525558215633996, "learning_rate": 7.009366012335336e-08, "logits/chosen": -2.7325286865234375, "logits/rejected": -2.291934013366699, "logps/chosen": -731.6019287109375, "logps/rejected": -807.2507934570312, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": 4.96156120300293, "rewards/margins": 6.851430892944336, "rewards/rejected": -1.8898696899414062, "step": 4269 }, { "epoch": 3.119634703196347, "grad_norm": 6.021396949373445, "learning_rate": 6.998294307857941e-08, "logits/chosen": -2.680987596511841, "logits/rejected": -2.1774070262908936, "logps/chosen": -422.6852722167969, "logps/rejected": -359.817138671875, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": 3.1893932819366455, "rewards/margins": 5.029206275939941, "rewards/rejected": -1.839813232421875, "step": 4270 }, { "epoch": 3.120365296803653, "grad_norm": 4.790582661401197, "learning_rate": 6.987229931242833e-08, "logits/chosen": -2.5266335010528564, "logits/rejected": -2.256129264831543, "logps/chosen": -505.6932067871094, "logps/rejected": -480.2749938964844, "loss": 0.0242, "rewards/accuracies": 1.0, "rewards/chosen": 3.0730175971984863, "rewards/margins": 4.881698131561279, "rewards/rejected": -1.8086804151535034, "step": 4271 }, { "epoch": 3.1210958904109587, "grad_norm": 5.306397107784427, "learning_rate": 6.976172886993942e-08, "logits/chosen": -2.5758023262023926, "logits/rejected": -2.325784206390381, "logps/chosen": -590.9453125, "logps/rejected": -531.7916870117188, "loss": 0.024, "rewards/accuracies": 1.0, "rewards/chosen": 2.760328531265259, "rewards/margins": 5.416735649108887, "rewards/rejected": -2.656407356262207, "step": 4272 }, { "epoch": 3.121826484018265, "grad_norm": 4.741981966947314, "learning_rate": 6.96512317961219e-08, "logits/chosen": -2.8832767009735107, "logits/rejected": -2.345937490463257, "logps/chosen": -1296.7611083984375, "logps/rejected": -990.9432373046875, "loss": 0.0373, "rewards/accuracies": 1.0, "rewards/chosen": 7.397313594818115, "rewards/margins": 7.134805679321289, "rewards/rejected": 0.2625085115432739, "step": 4273 }, { "epoch": 3.122557077625571, "grad_norm": 4.423703735489062, "learning_rate": 6.954080813595548e-08, "logits/chosen": -2.7498769760131836, "logits/rejected": -1.879564642906189, "logps/chosen": -473.6182556152344, "logps/rejected": -439.5333557128906, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": 2.8560473918914795, "rewards/margins": 4.976214408874512, "rewards/rejected": -2.120166778564453, "step": 4274 }, { "epoch": 3.1232876712328768, "grad_norm": 7.2391468236677685, "learning_rate": 6.943045793438942e-08, "logits/chosen": -2.9390697479248047, "logits/rejected": -2.7569663524627686, "logps/chosen": -638.6983642578125, "logps/rejected": -645.6398315429688, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": 2.7301719188690186, "rewards/margins": 4.515064716339111, "rewards/rejected": -1.7848926782608032, "step": 4275 }, { "epoch": 3.1240182648401826, "grad_norm": 5.188985956460824, "learning_rate": 6.932018123634367e-08, "logits/chosen": -2.6856467723846436, "logits/rejected": -2.179234027862549, "logps/chosen": -853.2452392578125, "logps/rejected": -600.2962646484375, "loss": 0.0249, "rewards/accuracies": 1.0, "rewards/chosen": 2.6400206089019775, "rewards/margins": 3.760880470275879, "rewards/rejected": -1.1208598613739014, "step": 4276 }, { "epoch": 3.1247488584474885, "grad_norm": 7.882321057912432, "learning_rate": 6.920997808670786e-08, "logits/chosen": -2.9875123500823975, "logits/rejected": -2.4565505981445312, "logps/chosen": -507.72979736328125, "logps/rejected": -474.05279541015625, "loss": 0.0399, "rewards/accuracies": 1.0, "rewards/chosen": 4.34299373626709, "rewards/margins": 6.704373359680176, "rewards/rejected": -2.361379623413086, "step": 4277 }, { "epoch": 3.1254794520547944, "grad_norm": 4.842578377077109, "learning_rate": 6.909984853034192e-08, "logits/chosen": -2.6335363388061523, "logits/rejected": -2.554232120513916, "logps/chosen": -731.8818359375, "logps/rejected": -767.72314453125, "loss": 0.0195, "rewards/accuracies": 1.0, "rewards/chosen": 3.288461208343506, "rewards/margins": 6.348725318908691, "rewards/rejected": -3.0602643489837646, "step": 4278 }, { "epoch": 3.1262100456621003, "grad_norm": 6.066372607863091, "learning_rate": 6.898979261207572e-08, "logits/chosen": -2.7769956588745117, "logits/rejected": -2.257282257080078, "logps/chosen": -739.2388305664062, "logps/rejected": -478.5054931640625, "loss": 0.0251, "rewards/accuracies": 1.0, "rewards/chosen": 4.1895432472229, "rewards/margins": 6.520487308502197, "rewards/rejected": -2.330944061279297, "step": 4279 }, { "epoch": 3.1269406392694066, "grad_norm": 6.713300899413377, "learning_rate": 6.887981037670904e-08, "logits/chosen": -3.086261510848999, "logits/rejected": -2.808835029602051, "logps/chosen": -769.7051391601562, "logps/rejected": -762.69287109375, "loss": 0.0255, "rewards/accuracies": 1.0, "rewards/chosen": 3.9333763122558594, "rewards/margins": 5.7235493659973145, "rewards/rejected": -1.7901729345321655, "step": 4280 }, { "epoch": 3.1276712328767124, "grad_norm": 6.454071640520176, "learning_rate": 6.876990186901197e-08, "logits/chosen": -3.281815528869629, "logits/rejected": -2.0813252925872803, "logps/chosen": -749.3447875976562, "logps/rejected": -498.15277099609375, "loss": 0.0295, "rewards/accuracies": 1.0, "rewards/chosen": 4.605834007263184, "rewards/margins": 6.449526786804199, "rewards/rejected": -1.8436918258666992, "step": 4281 }, { "epoch": 3.1284018264840183, "grad_norm": 6.832976479245188, "learning_rate": 6.866006713372419e-08, "logits/chosen": -3.1237878799438477, "logits/rejected": -2.3289031982421875, "logps/chosen": -830.1759033203125, "logps/rejected": -613.7942504882812, "loss": 0.0291, "rewards/accuracies": 1.0, "rewards/chosen": 5.007022380828857, "rewards/margins": 5.098958492279053, "rewards/rejected": -0.0919359028339386, "step": 4282 }, { "epoch": 3.129132420091324, "grad_norm": 6.473313937618115, "learning_rate": 6.855030621555576e-08, "logits/chosen": -2.9044103622436523, "logits/rejected": -2.5297679901123047, "logps/chosen": -683.18115234375, "logps/rejected": -807.3479614257812, "loss": 0.026, "rewards/accuracies": 1.0, "rewards/chosen": 3.9036290645599365, "rewards/margins": 6.478204250335693, "rewards/rejected": -2.5745749473571777, "step": 4283 }, { "epoch": 3.12986301369863, "grad_norm": 5.658143121877219, "learning_rate": 6.844061915918644e-08, "logits/chosen": -2.553825855255127, "logits/rejected": -2.4622037410736084, "logps/chosen": -624.6744384765625, "logps/rejected": -588.0592041015625, "loss": 0.0396, "rewards/accuracies": 1.0, "rewards/chosen": 4.0478620529174805, "rewards/margins": 5.45961856842041, "rewards/rejected": -1.4117567539215088, "step": 4284 }, { "epoch": 3.130593607305936, "grad_norm": 5.820308895237978, "learning_rate": 6.833100600926589e-08, "logits/chosen": -3.0892770290374756, "logits/rejected": -2.007659435272217, "logps/chosen": -774.4449462890625, "logps/rejected": -445.44537353515625, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": 4.104642868041992, "rewards/margins": 8.097131729125977, "rewards/rejected": -3.9924888610839844, "step": 4285 }, { "epoch": 3.131324200913242, "grad_norm": 2.4604806627375315, "learning_rate": 6.822146681041388e-08, "logits/chosen": -2.483447551727295, "logits/rejected": -1.5592827796936035, "logps/chosen": -842.2547607421875, "logps/rejected": -556.8349609375, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": 4.742668628692627, "rewards/margins": 6.4134440422058105, "rewards/rejected": -1.670775055885315, "step": 4286 }, { "epoch": 3.132054794520548, "grad_norm": 7.662129787444133, "learning_rate": 6.811200160721995e-08, "logits/chosen": -2.685427188873291, "logits/rejected": -2.1185383796691895, "logps/chosen": -535.3922119140625, "logps/rejected": -405.40704345703125, "loss": 0.0338, "rewards/accuracies": 1.0, "rewards/chosen": 3.799160957336426, "rewards/margins": 6.31840181350708, "rewards/rejected": -2.519240379333496, "step": 4287 }, { "epoch": 3.132785388127854, "grad_norm": 5.442600007021073, "learning_rate": 6.800261044424344e-08, "logits/chosen": -2.493241786956787, "logits/rejected": -2.1929681301116943, "logps/chosen": -379.3280334472656, "logps/rejected": -465.9000244140625, "loss": 0.0195, "rewards/accuracies": 1.0, "rewards/chosen": 2.003937244415283, "rewards/margins": 5.911980152130127, "rewards/rejected": -3.9080429077148438, "step": 4288 }, { "epoch": 3.13351598173516, "grad_norm": 2.8279442492308458, "learning_rate": 6.789329336601382e-08, "logits/chosen": -2.7974917888641357, "logits/rejected": -1.9009666442871094, "logps/chosen": -860.479248046875, "logps/rejected": -513.647705078125, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": 4.447012901306152, "rewards/margins": 7.270533084869385, "rewards/rejected": -2.8235199451446533, "step": 4289 }, { "epoch": 3.1342465753424658, "grad_norm": 9.96900102544445, "learning_rate": 6.778405041703006e-08, "logits/chosen": -3.079796075820923, "logits/rejected": -2.546879291534424, "logps/chosen": -462.8922424316406, "logps/rejected": -446.4189758300781, "loss": 0.0299, "rewards/accuracies": 1.0, "rewards/chosen": 1.582311749458313, "rewards/margins": 4.035242080688477, "rewards/rejected": -2.452930212020874, "step": 4290 }, { "epoch": 3.1349771689497716, "grad_norm": 2.8247374589842362, "learning_rate": 6.76748816417613e-08, "logits/chosen": -2.9424757957458496, "logits/rejected": -2.6960086822509766, "logps/chosen": -555.3981323242188, "logps/rejected": -584.9649658203125, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": 2.1802139282226562, "rewards/margins": 6.03042459487915, "rewards/rejected": -3.850210428237915, "step": 4291 }, { "epoch": 3.1357077625570775, "grad_norm": 6.033757928622173, "learning_rate": 6.756578708464622e-08, "logits/chosen": -2.9932198524475098, "logits/rejected": -2.6611480712890625, "logps/chosen": -503.4097900390625, "logps/rejected": -547.2472534179688, "loss": 0.0229, "rewards/accuracies": 1.0, "rewards/chosen": 3.9122891426086426, "rewards/margins": 6.520936012268066, "rewards/rejected": -2.608646869659424, "step": 4292 }, { "epoch": 3.1364383561643834, "grad_norm": 6.259353657465643, "learning_rate": 6.745676679009341e-08, "logits/chosen": -2.6199517250061035, "logits/rejected": -2.3136038780212402, "logps/chosen": -581.049072265625, "logps/rejected": -784.1668090820312, "loss": 0.0567, "rewards/accuracies": 1.0, "rewards/chosen": 0.27686169743537903, "rewards/margins": 5.694728374481201, "rewards/rejected": -5.4178667068481445, "step": 4293 }, { "epoch": 3.1371689497716897, "grad_norm": 6.171074846767319, "learning_rate": 6.73478208024811e-08, "logits/chosen": -2.788111686706543, "logits/rejected": -2.3753790855407715, "logps/chosen": -786.4844970703125, "logps/rejected": -544.9666748046875, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": 3.622771978378296, "rewards/margins": 4.427420616149902, "rewards/rejected": -0.8046485781669617, "step": 4294 }, { "epoch": 3.1378995433789956, "grad_norm": 6.35123126490622, "learning_rate": 6.723894916615749e-08, "logits/chosen": -2.6659603118896484, "logits/rejected": -2.0269126892089844, "logps/chosen": -863.1373291015625, "logps/rejected": -682.2840576171875, "loss": 0.0248, "rewards/accuracies": 1.0, "rewards/chosen": 3.4487438201904297, "rewards/margins": 5.8454790115356445, "rewards/rejected": -2.396735429763794, "step": 4295 }, { "epoch": 3.1386301369863014, "grad_norm": 5.159808416432447, "learning_rate": 6.71301519254405e-08, "logits/chosen": -3.096393346786499, "logits/rejected": -2.518425226211548, "logps/chosen": -686.9306640625, "logps/rejected": -717.7335815429688, "loss": 0.0199, "rewards/accuracies": 1.0, "rewards/chosen": 3.608977794647217, "rewards/margins": 4.9167633056640625, "rewards/rejected": -1.3077855110168457, "step": 4296 }, { "epoch": 3.1393607305936073, "grad_norm": 7.96251407544196, "learning_rate": 6.702142912461739e-08, "logits/chosen": -2.864163637161255, "logits/rejected": -1.7581764459609985, "logps/chosen": -654.0703125, "logps/rejected": -416.9642639160156, "loss": 0.0252, "rewards/accuracies": 1.0, "rewards/chosen": 4.087433815002441, "rewards/margins": 6.120612144470215, "rewards/rejected": -2.0331778526306152, "step": 4297 }, { "epoch": 3.140091324200913, "grad_norm": 6.523503286064246, "learning_rate": 6.691278080794563e-08, "logits/chosen": -2.5696115493774414, "logits/rejected": -2.1662566661834717, "logps/chosen": -579.1851806640625, "logps/rejected": -526.9532470703125, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": 3.794625997543335, "rewards/margins": 6.586483955383301, "rewards/rejected": -2.791857957839966, "step": 4298 }, { "epoch": 3.140821917808219, "grad_norm": 4.744617022061439, "learning_rate": 6.680420701965198e-08, "logits/chosen": -3.1048026084899902, "logits/rejected": -2.6554534435272217, "logps/chosen": -548.56982421875, "logps/rejected": -402.00433349609375, "loss": 0.0206, "rewards/accuracies": 1.0, "rewards/chosen": 2.6521148681640625, "rewards/margins": 5.741433143615723, "rewards/rejected": -3.089317798614502, "step": 4299 }, { "epoch": 3.141552511415525, "grad_norm": 3.8981396517371976, "learning_rate": 6.669570780393316e-08, "logits/chosen": -2.2722508907318115, "logits/rejected": -2.216395139694214, "logps/chosen": -590.156005859375, "logps/rejected": -596.3342895507812, "loss": 0.0171, "rewards/accuracies": 1.0, "rewards/chosen": 2.8799338340759277, "rewards/margins": 6.352872848510742, "rewards/rejected": -3.4729387760162354, "step": 4300 }, { "epoch": 3.1422831050228313, "grad_norm": 5.9256769622497565, "learning_rate": 6.658728320495532e-08, "logits/chosen": -2.8888089656829834, "logits/rejected": -2.3718767166137695, "logps/chosen": -566.8909912109375, "logps/rejected": -522.946533203125, "loss": 0.0224, "rewards/accuracies": 1.0, "rewards/chosen": 1.9316273927688599, "rewards/margins": 5.515075206756592, "rewards/rejected": -3.5834476947784424, "step": 4301 }, { "epoch": 3.143013698630137, "grad_norm": 4.250338372616736, "learning_rate": 6.647893326685425e-08, "logits/chosen": -2.8273119926452637, "logits/rejected": -2.9975602626800537, "logps/chosen": -852.6659545898438, "logps/rejected": -926.708740234375, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": 3.225722312927246, "rewards/margins": 4.303706169128418, "rewards/rejected": -1.0779838562011719, "step": 4302 }, { "epoch": 3.143744292237443, "grad_norm": 4.19711179881859, "learning_rate": 6.63706580337355e-08, "logits/chosen": -3.456552505493164, "logits/rejected": -2.682973623275757, "logps/chosen": -1050.0450439453125, "logps/rejected": -685.6307373046875, "loss": 0.0188, "rewards/accuracies": 1.0, "rewards/chosen": 2.3639650344848633, "rewards/margins": 4.550907135009766, "rewards/rejected": -2.1869425773620605, "step": 4303 }, { "epoch": 3.144474885844749, "grad_norm": 4.518827893019759, "learning_rate": 6.626245754967403e-08, "logits/chosen": -2.592892646789551, "logits/rejected": -2.0203394889831543, "logps/chosen": -379.04541015625, "logps/rejected": -369.6560363769531, "loss": 0.0165, "rewards/accuracies": 1.0, "rewards/chosen": 3.317396640777588, "rewards/margins": 8.680119514465332, "rewards/rejected": -5.362723350524902, "step": 4304 }, { "epoch": 3.1452054794520548, "grad_norm": 4.569849235334948, "learning_rate": 6.615433185871455e-08, "logits/chosen": -2.643641710281372, "logits/rejected": -2.021207332611084, "logps/chosen": -847.30029296875, "logps/rejected": -546.5208740234375, "loss": 0.0211, "rewards/accuracies": 1.0, "rewards/chosen": 3.7002837657928467, "rewards/margins": 7.228753089904785, "rewards/rejected": -3.5284690856933594, "step": 4305 }, { "epoch": 3.1459360730593606, "grad_norm": 5.102740445338195, "learning_rate": 6.604628100487123e-08, "logits/chosen": -2.596680164337158, "logits/rejected": -1.9992049932479858, "logps/chosen": -674.656005859375, "logps/rejected": -522.0182495117188, "loss": 0.0276, "rewards/accuracies": 1.0, "rewards/chosen": 2.4485080242156982, "rewards/margins": 6.450518608093262, "rewards/rejected": -4.002010345458984, "step": 4306 }, { "epoch": 3.1466666666666665, "grad_norm": 4.565393653768226, "learning_rate": 6.59383050321276e-08, "logits/chosen": -2.566221237182617, "logits/rejected": -1.7108534574508667, "logps/chosen": -438.3160400390625, "logps/rejected": -366.3875427246094, "loss": 0.0174, "rewards/accuracies": 1.0, "rewards/chosen": 3.5369415283203125, "rewards/margins": 7.317911624908447, "rewards/rejected": -3.780970811843872, "step": 4307 }, { "epoch": 3.147397260273973, "grad_norm": 14.303386068597314, "learning_rate": 6.583040398443714e-08, "logits/chosen": -2.9320616722106934, "logits/rejected": -2.3810014724731445, "logps/chosen": -859.5382080078125, "logps/rejected": -708.4446411132812, "loss": 0.037, "rewards/accuracies": 1.0, "rewards/chosen": 4.05088996887207, "rewards/margins": 5.503500938415527, "rewards/rejected": -1.4526114463806152, "step": 4308 }, { "epoch": 3.1481278538812787, "grad_norm": 6.010631043981572, "learning_rate": 6.572257790572247e-08, "logits/chosen": -2.4676852226257324, "logits/rejected": -2.3795666694641113, "logps/chosen": -871.2095336914062, "logps/rejected": -810.1993408203125, "loss": 0.0274, "rewards/accuracies": 1.0, "rewards/chosen": 5.503649711608887, "rewards/margins": 6.52556037902832, "rewards/rejected": -1.0219107866287231, "step": 4309 }, { "epoch": 3.1488584474885846, "grad_norm": 6.437262077008804, "learning_rate": 6.561482683987577e-08, "logits/chosen": -2.751317024230957, "logits/rejected": -1.9091770648956299, "logps/chosen": -530.8472900390625, "logps/rejected": -393.71337890625, "loss": 0.0263, "rewards/accuracies": 1.0, "rewards/chosen": 3.198270320892334, "rewards/margins": 5.408031463623047, "rewards/rejected": -2.209761142730713, "step": 4310 }, { "epoch": 3.1495890410958904, "grad_norm": 7.223287370174948, "learning_rate": 6.550715083075867e-08, "logits/chosen": -2.702479600906372, "logits/rejected": -2.2542338371276855, "logps/chosen": -566.667236328125, "logps/rejected": -528.0984497070312, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": 2.6292505264282227, "rewards/margins": 5.990113258361816, "rewards/rejected": -3.360862970352173, "step": 4311 }, { "epoch": 3.1503196347031963, "grad_norm": 5.569857927904098, "learning_rate": 6.539954992220234e-08, "logits/chosen": -3.3149759769439697, "logits/rejected": -2.9709227085113525, "logps/chosen": -781.031982421875, "logps/rejected": -676.1851196289062, "loss": 0.0211, "rewards/accuracies": 1.0, "rewards/chosen": 3.945293426513672, "rewards/margins": 5.113616943359375, "rewards/rejected": -1.1683239936828613, "step": 4312 }, { "epoch": 3.151050228310502, "grad_norm": 8.57999609602548, "learning_rate": 6.529202415800752e-08, "logits/chosen": -2.8597843647003174, "logits/rejected": -2.156562566757202, "logps/chosen": -766.8844604492188, "logps/rejected": -578.85986328125, "loss": 0.0287, "rewards/accuracies": 1.0, "rewards/chosen": 4.189489364624023, "rewards/margins": 7.18247127532959, "rewards/rejected": -2.9929819107055664, "step": 4313 }, { "epoch": 3.151780821917808, "grad_norm": 5.422620914389716, "learning_rate": 6.518457358194385e-08, "logits/chosen": -3.095092296600342, "logits/rejected": -2.946377754211426, "logps/chosen": -726.19970703125, "logps/rejected": -709.4436645507812, "loss": 0.0195, "rewards/accuracies": 1.0, "rewards/chosen": 4.574183464050293, "rewards/margins": 5.206653594970703, "rewards/rejected": -0.6324697732925415, "step": 4314 }, { "epoch": 3.1525114155251144, "grad_norm": 7.377621976644876, "learning_rate": 6.507719823775094e-08, "logits/chosen": -3.254241943359375, "logits/rejected": -2.492408037185669, "logps/chosen": -563.5684814453125, "logps/rejected": -495.6755676269531, "loss": 0.027, "rewards/accuracies": 1.0, "rewards/chosen": 2.9918017387390137, "rewards/margins": 5.297995567321777, "rewards/rejected": -2.306194305419922, "step": 4315 }, { "epoch": 3.1532420091324203, "grad_norm": 6.414901028090403, "learning_rate": 6.49698981691373e-08, "logits/chosen": -1.8898181915283203, "logits/rejected": -1.6980011463165283, "logps/chosen": -534.66845703125, "logps/rejected": -669.7926635742188, "loss": 0.0371, "rewards/accuracies": 1.0, "rewards/chosen": 2.92647647857666, "rewards/margins": 4.9636101722717285, "rewards/rejected": -2.0371334552764893, "step": 4316 }, { "epoch": 3.153972602739726, "grad_norm": 9.541264510251764, "learning_rate": 6.486267341978124e-08, "logits/chosen": -2.7842893600463867, "logits/rejected": -2.694087028503418, "logps/chosen": -517.0995483398438, "logps/rejected": -527.3707885742188, "loss": 0.0356, "rewards/accuracies": 1.0, "rewards/chosen": 1.2174354791641235, "rewards/margins": 3.7204272747039795, "rewards/rejected": -2.5029919147491455, "step": 4317 }, { "epoch": 3.154703196347032, "grad_norm": 6.900091541070052, "learning_rate": 6.475552403333009e-08, "logits/chosen": -2.7137885093688965, "logits/rejected": -1.8639471530914307, "logps/chosen": -642.1240844726562, "logps/rejected": -529.92578125, "loss": 0.0295, "rewards/accuracies": 1.0, "rewards/chosen": 3.76084566116333, "rewards/margins": 7.160154342651367, "rewards/rejected": -3.399308443069458, "step": 4318 }, { "epoch": 3.155433789954338, "grad_norm": 7.233162368418145, "learning_rate": 6.464845005340056e-08, "logits/chosen": -2.1663060188293457, "logits/rejected": -2.097222328186035, "logps/chosen": -712.9544677734375, "logps/rejected": -614.9708251953125, "loss": 0.0437, "rewards/accuracies": 1.0, "rewards/chosen": 2.052887201309204, "rewards/margins": 4.211322784423828, "rewards/rejected": -2.158435583114624, "step": 4319 }, { "epoch": 3.1561643835616437, "grad_norm": 4.922916362739302, "learning_rate": 6.454145152357878e-08, "logits/chosen": -3.0387697219848633, "logits/rejected": -2.7755560874938965, "logps/chosen": -973.4833984375, "logps/rejected": -866.187255859375, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": 5.231391906738281, "rewards/margins": 6.78580904006958, "rewards/rejected": -1.5544166564941406, "step": 4320 }, { "epoch": 3.1568949771689496, "grad_norm": 6.962117550000811, "learning_rate": 6.443452848742004e-08, "logits/chosen": -2.7007317543029785, "logits/rejected": -2.8888189792633057, "logps/chosen": -921.0235595703125, "logps/rejected": -779.8897094726562, "loss": 0.0282, "rewards/accuracies": 1.0, "rewards/chosen": 3.7508997917175293, "rewards/margins": 4.8686299324035645, "rewards/rejected": -1.1177297830581665, "step": 4321 }, { "epoch": 3.157625570776256, "grad_norm": 5.3402716486840145, "learning_rate": 6.432768098844901e-08, "logits/chosen": -3.213918685913086, "logits/rejected": -2.7645673751831055, "logps/chosen": -370.72772216796875, "logps/rejected": -496.3764343261719, "loss": 0.0213, "rewards/accuracies": 1.0, "rewards/chosen": 2.408510684967041, "rewards/margins": 7.317286491394043, "rewards/rejected": -4.908775329589844, "step": 4322 }, { "epoch": 3.158356164383562, "grad_norm": 6.875102649269193, "learning_rate": 6.422090907015956e-08, "logits/chosen": -2.4881300926208496, "logits/rejected": -2.3526535034179688, "logps/chosen": -954.0374755859375, "logps/rejected": -1044.6292724609375, "loss": 0.0336, "rewards/accuracies": 1.0, "rewards/chosen": 4.528309345245361, "rewards/margins": 6.517881870269775, "rewards/rejected": -1.9895727634429932, "step": 4323 }, { "epoch": 3.1590867579908677, "grad_norm": 7.017409881350176, "learning_rate": 6.411421277601468e-08, "logits/chosen": -1.9689236879348755, "logits/rejected": -2.4770994186401367, "logps/chosen": -431.27728271484375, "logps/rejected": -886.7506103515625, "loss": 0.0265, "rewards/accuracies": 1.0, "rewards/chosen": 2.129789352416992, "rewards/margins": 5.6746978759765625, "rewards/rejected": -3.544908285140991, "step": 4324 }, { "epoch": 3.1598173515981736, "grad_norm": 7.740963743865352, "learning_rate": 6.400759214944682e-08, "logits/chosen": -2.9740633964538574, "logits/rejected": -1.9397642612457275, "logps/chosen": -572.5908813476562, "logps/rejected": -333.432861328125, "loss": 0.0451, "rewards/accuracies": 1.0, "rewards/chosen": 3.3274412155151367, "rewards/margins": 5.351381778717041, "rewards/rejected": -2.0239410400390625, "step": 4325 }, { "epoch": 3.1605479452054794, "grad_norm": 8.060991959407286, "learning_rate": 6.390104723385744e-08, "logits/chosen": -3.177680015563965, "logits/rejected": -2.2131869792938232, "logps/chosen": -672.5911865234375, "logps/rejected": -479.80645751953125, "loss": 0.0384, "rewards/accuracies": 1.0, "rewards/chosen": 2.7858309745788574, "rewards/margins": 4.696836471557617, "rewards/rejected": -1.9110056161880493, "step": 4326 }, { "epoch": 3.1612785388127853, "grad_norm": 5.0105091279421305, "learning_rate": 6.379457807261723e-08, "logits/chosen": -2.4899401664733887, "logits/rejected": -2.4609315395355225, "logps/chosen": -760.2222290039062, "logps/rejected": -883.0767211914062, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": 4.407151222229004, "rewards/margins": 7.317286491394043, "rewards/rejected": -2.910135507583618, "step": 4327 }, { "epoch": 3.162009132420091, "grad_norm": 7.687397529725472, "learning_rate": 6.368818470906598e-08, "logits/chosen": -2.5635433197021484, "logits/rejected": -2.642972946166992, "logps/chosen": -534.2933349609375, "logps/rejected": -876.9901733398438, "loss": 0.0283, "rewards/accuracies": 1.0, "rewards/chosen": 1.4607367515563965, "rewards/margins": 4.437931060791016, "rewards/rejected": -2.9771947860717773, "step": 4328 }, { "epoch": 3.162739726027397, "grad_norm": 4.507073670768037, "learning_rate": 6.358186718651282e-08, "logits/chosen": -2.0582828521728516, "logits/rejected": -1.9053218364715576, "logps/chosen": -358.1898498535156, "logps/rejected": -331.46990966796875, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": 3.004704713821411, "rewards/margins": 8.235462188720703, "rewards/rejected": -5.230757236480713, "step": 4329 }, { "epoch": 3.1634703196347034, "grad_norm": 5.4423364636966385, "learning_rate": 6.347562554823574e-08, "logits/chosen": -2.9538745880126953, "logits/rejected": -1.8806337118148804, "logps/chosen": -774.307861328125, "logps/rejected": -409.3250732421875, "loss": 0.0251, "rewards/accuracies": 1.0, "rewards/chosen": 3.7662057876586914, "rewards/margins": 6.7606964111328125, "rewards/rejected": -2.9944908618927, "step": 4330 }, { "epoch": 3.1642009132420092, "grad_norm": 8.59231166940648, "learning_rate": 6.336945983748212e-08, "logits/chosen": -2.393148422241211, "logits/rejected": -2.419290065765381, "logps/chosen": -448.74676513671875, "logps/rejected": -549.1927490234375, "loss": 0.0276, "rewards/accuracies": 1.0, "rewards/chosen": 2.705590009689331, "rewards/margins": 4.037988662719727, "rewards/rejected": -1.3323988914489746, "step": 4331 }, { "epoch": 3.164931506849315, "grad_norm": 6.815666092963437, "learning_rate": 6.326337009746826e-08, "logits/chosen": -2.230208396911621, "logits/rejected": -2.413830280303955, "logps/chosen": -311.5669860839844, "logps/rejected": -747.0027465820312, "loss": 0.0269, "rewards/accuracies": 1.0, "rewards/chosen": 1.2638672590255737, "rewards/margins": 6.974325180053711, "rewards/rejected": -5.710457801818848, "step": 4332 }, { "epoch": 3.165662100456621, "grad_norm": 4.307613091736037, "learning_rate": 6.315735637137945e-08, "logits/chosen": -2.7743287086486816, "logits/rejected": -2.688518762588501, "logps/chosen": -1100.439208984375, "logps/rejected": -990.6890258789062, "loss": 0.0192, "rewards/accuracies": 1.0, "rewards/chosen": 4.847046852111816, "rewards/margins": 8.380182266235352, "rewards/rejected": -3.533134937286377, "step": 4333 }, { "epoch": 3.166392694063927, "grad_norm": 7.070124468753157, "learning_rate": 6.305141870237033e-08, "logits/chosen": -2.94197416305542, "logits/rejected": -1.8149139881134033, "logps/chosen": -419.3424072265625, "logps/rejected": -387.88995361328125, "loss": 0.0548, "rewards/accuracies": 1.0, "rewards/chosen": 3.6202096939086914, "rewards/margins": 7.971386909484863, "rewards/rejected": -4.351177215576172, "step": 4334 }, { "epoch": 3.1671232876712327, "grad_norm": 9.078156095310343, "learning_rate": 6.29455571335643e-08, "logits/chosen": -2.5262093544006348, "logits/rejected": -1.7032182216644287, "logps/chosen": -382.66888427734375, "logps/rejected": -355.021484375, "loss": 0.073, "rewards/accuracies": 0.875, "rewards/chosen": 2.777425765991211, "rewards/margins": 8.003960609436035, "rewards/rejected": -5.226533889770508, "step": 4335 }, { "epoch": 3.1678538812785386, "grad_norm": 5.618497862796864, "learning_rate": 6.283977170805385e-08, "logits/chosen": -3.0858163833618164, "logits/rejected": -2.447829008102417, "logps/chosen": -643.6473388671875, "logps/rejected": -497.9259338378906, "loss": 0.0277, "rewards/accuracies": 1.0, "rewards/chosen": 1.8540894985198975, "rewards/margins": 4.645571231842041, "rewards/rejected": -2.7914817333221436, "step": 4336 }, { "epoch": 3.168584474885845, "grad_norm": 6.2452197295383325, "learning_rate": 6.273406246890064e-08, "logits/chosen": -3.1372666358947754, "logits/rejected": -2.5713753700256348, "logps/chosen": -315.810546875, "logps/rejected": -304.76507568359375, "loss": 0.0368, "rewards/accuracies": 1.0, "rewards/chosen": 1.55268132686615, "rewards/margins": 6.063324451446533, "rewards/rejected": -4.510643005371094, "step": 4337 }, { "epoch": 3.169315068493151, "grad_norm": 3.9681079976013054, "learning_rate": 6.262842945913505e-08, "logits/chosen": -3.2633578777313232, "logits/rejected": -2.5884757041931152, "logps/chosen": -760.67529296875, "logps/rejected": -657.414306640625, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": 5.950279235839844, "rewards/margins": 6.992923259735107, "rewards/rejected": -1.0426437854766846, "step": 4338 }, { "epoch": 3.1700456621004567, "grad_norm": 7.134944734954127, "learning_rate": 6.25228727217568e-08, "logits/chosen": -2.8816428184509277, "logits/rejected": -2.260313034057617, "logps/chosen": -1075.53662109375, "logps/rejected": -766.4883422851562, "loss": 0.0306, "rewards/accuracies": 1.0, "rewards/chosen": 5.262414932250977, "rewards/margins": 6.457676410675049, "rewards/rejected": -1.1952624320983887, "step": 4339 }, { "epoch": 3.1707762557077626, "grad_norm": 7.702583711807509, "learning_rate": 6.2417392299734e-08, "logits/chosen": -2.402942180633545, "logits/rejected": -2.4928102493286133, "logps/chosen": -718.318115234375, "logps/rejected": -802.1754150390625, "loss": 0.0359, "rewards/accuracies": 0.875, "rewards/chosen": 3.95993709564209, "rewards/margins": 4.30828332901001, "rewards/rejected": -0.34834611415863037, "step": 4340 }, { "epoch": 3.1715068493150684, "grad_norm": 6.5647523920179855, "learning_rate": 6.23119882360042e-08, "logits/chosen": -2.6548755168914795, "logits/rejected": -1.9189040660858154, "logps/chosen": -378.16796875, "logps/rejected": -391.2265319824219, "loss": 0.0393, "rewards/accuracies": 1.0, "rewards/chosen": 0.5352780818939209, "rewards/margins": 4.942200660705566, "rewards/rejected": -4.406922340393066, "step": 4341 }, { "epoch": 3.1722374429223743, "grad_norm": 6.235214750481265, "learning_rate": 6.220666057347376e-08, "logits/chosen": -2.570927619934082, "logits/rejected": -2.3045101165771484, "logps/chosen": -669.027587890625, "logps/rejected": -588.6724243164062, "loss": 0.023, "rewards/accuracies": 1.0, "rewards/chosen": 3.6771461963653564, "rewards/margins": 5.101517200469971, "rewards/rejected": -1.4243712425231934, "step": 4342 }, { "epoch": 3.17296803652968, "grad_norm": 5.428800617485337, "learning_rate": 6.210140935501774e-08, "logits/chosen": -2.74381422996521, "logits/rejected": -1.9622548818588257, "logps/chosen": -371.35101318359375, "logps/rejected": -252.08860778808594, "loss": 0.0182, "rewards/accuracies": 1.0, "rewards/chosen": 3.046936511993408, "rewards/margins": 7.57324743270874, "rewards/rejected": -4.526309967041016, "step": 4343 }, { "epoch": 3.1736986301369865, "grad_norm": 4.348379179563095, "learning_rate": 6.199623462348041e-08, "logits/chosen": -2.510237216949463, "logits/rejected": -2.098757743835449, "logps/chosen": -669.131103515625, "logps/rejected": -607.005615234375, "loss": 0.0249, "rewards/accuracies": 1.0, "rewards/chosen": 3.5649824142456055, "rewards/margins": 6.2799177169799805, "rewards/rejected": -2.714935541152954, "step": 4344 }, { "epoch": 3.1744292237442924, "grad_norm": 7.3872225068673, "learning_rate": 6.189113642167443e-08, "logits/chosen": -3.0170507431030273, "logits/rejected": -2.325634479522705, "logps/chosen": -601.1804809570312, "logps/rejected": -479.65484619140625, "loss": 0.0386, "rewards/accuracies": 1.0, "rewards/chosen": 4.358234405517578, "rewards/margins": 7.145681858062744, "rewards/rejected": -2.787446975708008, "step": 4345 }, { "epoch": 3.1751598173515982, "grad_norm": 4.412448500205728, "learning_rate": 6.178611479238182e-08, "logits/chosen": -2.9151430130004883, "logits/rejected": -2.0557861328125, "logps/chosen": -641.4442749023438, "logps/rejected": -445.75640869140625, "loss": 0.0185, "rewards/accuracies": 1.0, "rewards/chosen": 4.459095478057861, "rewards/margins": 7.678411960601807, "rewards/rejected": -3.2193167209625244, "step": 4346 }, { "epoch": 3.175890410958904, "grad_norm": 8.338050981928935, "learning_rate": 6.168116977835305e-08, "logits/chosen": -2.8674492835998535, "logits/rejected": -2.629680633544922, "logps/chosen": -614.4736328125, "logps/rejected": -618.3892822265625, "loss": 0.0364, "rewards/accuracies": 1.0, "rewards/chosen": 4.683646202087402, "rewards/margins": 7.234364032745361, "rewards/rejected": -2.5507171154022217, "step": 4347 }, { "epoch": 3.17662100456621, "grad_norm": 4.550570634544155, "learning_rate": 6.157630142230765e-08, "logits/chosen": -3.2063066959381104, "logits/rejected": -2.1812584400177, "logps/chosen": -439.1176452636719, "logps/rejected": -339.45794677734375, "loss": 0.0214, "rewards/accuracies": 1.0, "rewards/chosen": 2.456212282180786, "rewards/margins": 5.414289951324463, "rewards/rejected": -2.9580774307250977, "step": 4348 }, { "epoch": 3.177351598173516, "grad_norm": 6.52401522516641, "learning_rate": 6.147150976693386e-08, "logits/chosen": -2.63533353805542, "logits/rejected": -2.2646188735961914, "logps/chosen": -692.233642578125, "logps/rejected": -633.5953369140625, "loss": 0.0293, "rewards/accuracies": 1.0, "rewards/chosen": 3.856804847717285, "rewards/margins": 4.903131008148193, "rewards/rejected": -1.0463261604309082, "step": 4349 }, { "epoch": 3.1780821917808217, "grad_norm": 6.384342304431699, "learning_rate": 6.136679485488852e-08, "logits/chosen": -2.9376840591430664, "logits/rejected": -2.7166402339935303, "logps/chosen": -832.823486328125, "logps/rejected": -918.5222778320312, "loss": 0.0289, "rewards/accuracies": 1.0, "rewards/chosen": 4.253174781799316, "rewards/margins": 4.7912516593933105, "rewards/rejected": -0.5380769371986389, "step": 4350 }, { "epoch": 3.178812785388128, "grad_norm": 4.613243212692468, "learning_rate": 6.126215672879758e-08, "logits/chosen": -3.072122573852539, "logits/rejected": -2.0208773612976074, "logps/chosen": -617.849609375, "logps/rejected": -378.7751159667969, "loss": 0.0201, "rewards/accuracies": 1.0, "rewards/chosen": 3.955753803253174, "rewards/margins": 6.705873012542725, "rewards/rejected": -2.7501187324523926, "step": 4351 }, { "epoch": 3.179543378995434, "grad_norm": 9.578135753333074, "learning_rate": 6.115759543125551e-08, "logits/chosen": -2.8376755714416504, "logits/rejected": -2.2216427326202393, "logps/chosen": -485.4622497558594, "logps/rejected": -359.2508544921875, "loss": 0.0664, "rewards/accuracies": 1.0, "rewards/chosen": 3.889472484588623, "rewards/margins": 7.293549537658691, "rewards/rejected": -3.4040770530700684, "step": 4352 }, { "epoch": 3.18027397260274, "grad_norm": 6.553874714487099, "learning_rate": 6.10531110048254e-08, "logits/chosen": -2.638502836227417, "logits/rejected": -2.1334116458892822, "logps/chosen": -790.8767700195312, "logps/rejected": -812.5294189453125, "loss": 0.0258, "rewards/accuracies": 1.0, "rewards/chosen": 3.959596633911133, "rewards/margins": 5.203917503356934, "rewards/rejected": -1.2443208694458008, "step": 4353 }, { "epoch": 3.1810045662100457, "grad_norm": 3.0945784875528686, "learning_rate": 6.094870349203937e-08, "logits/chosen": -2.891605854034424, "logits/rejected": -1.8850113153457642, "logps/chosen": -579.1410522460938, "logps/rejected": -333.53204345703125, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": 3.8154282569885254, "rewards/margins": 7.641746520996094, "rewards/rejected": -3.8263182640075684, "step": 4354 }, { "epoch": 3.1817351598173516, "grad_norm": 12.552041326217745, "learning_rate": 6.084437293539794e-08, "logits/chosen": -2.963789701461792, "logits/rejected": -2.1843340396881104, "logps/chosen": -792.31689453125, "logps/rejected": -693.625732421875, "loss": 0.0419, "rewards/accuracies": 1.0, "rewards/chosen": 4.804342746734619, "rewards/margins": 6.442466735839844, "rewards/rejected": -1.6381242275238037, "step": 4355 }, { "epoch": 3.1824657534246574, "grad_norm": 4.7760364306416685, "learning_rate": 6.074011937737047e-08, "logits/chosen": -2.6785483360290527, "logits/rejected": -1.9849857091903687, "logps/chosen": -723.878662109375, "logps/rejected": -624.7550659179688, "loss": 0.0278, "rewards/accuracies": 1.0, "rewards/chosen": 3.453667640686035, "rewards/margins": 5.908880233764648, "rewards/rejected": -2.4552125930786133, "step": 4356 }, { "epoch": 3.1831963470319633, "grad_norm": 4.551908977087894, "learning_rate": 6.063594286039495e-08, "logits/chosen": -2.958191156387329, "logits/rejected": -2.441481590270996, "logps/chosen": -705.5887451171875, "logps/rejected": -643.15380859375, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": 3.4220266342163086, "rewards/margins": 5.925684928894043, "rewards/rejected": -2.5036582946777344, "step": 4357 }, { "epoch": 3.183926940639269, "grad_norm": 5.845182029167403, "learning_rate": 6.053184342687786e-08, "logits/chosen": -2.274639129638672, "logits/rejected": -2.572911262512207, "logps/chosen": -325.6116027832031, "logps/rejected": -507.37469482421875, "loss": 0.0279, "rewards/accuracies": 1.0, "rewards/chosen": 1.344395637512207, "rewards/margins": 5.4775495529174805, "rewards/rejected": -4.133153915405273, "step": 4358 }, { "epoch": 3.1846575342465755, "grad_norm": 10.649995014028322, "learning_rate": 6.042782111919458e-08, "logits/chosen": -2.524686813354492, "logits/rejected": -2.326345682144165, "logps/chosen": -536.2788696289062, "logps/rejected": -403.7472229003906, "loss": 0.0607, "rewards/accuracies": 0.875, "rewards/chosen": 2.954061269760132, "rewards/margins": 3.1916675567626953, "rewards/rejected": -0.23760627210140228, "step": 4359 }, { "epoch": 3.1853881278538814, "grad_norm": 3.432535644257271, "learning_rate": 6.03238759796888e-08, "logits/chosen": -2.827249526977539, "logits/rejected": -1.7460095882415771, "logps/chosen": -761.166015625, "logps/rejected": -581.5289306640625, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": 5.755035400390625, "rewards/margins": 7.000249862670898, "rewards/rejected": -1.2452143430709839, "step": 4360 }, { "epoch": 3.1861187214611872, "grad_norm": 6.012348184235221, "learning_rate": 6.022000805067312e-08, "logits/chosen": -2.9548261165618896, "logits/rejected": -2.104856014251709, "logps/chosen": -825.1539306640625, "logps/rejected": -575.58935546875, "loss": 0.0276, "rewards/accuracies": 1.0, "rewards/chosen": 4.267552852630615, "rewards/margins": 4.593854904174805, "rewards/rejected": -0.32630211114883423, "step": 4361 }, { "epoch": 3.186849315068493, "grad_norm": 4.782576433080483, "learning_rate": 6.011621737442827e-08, "logits/chosen": -2.636206865310669, "logits/rejected": -1.8052394390106201, "logps/chosen": -384.3122863769531, "logps/rejected": -299.6026611328125, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/chosen": 1.6788041591644287, "rewards/margins": 5.7119855880737305, "rewards/rejected": -4.033181667327881, "step": 4362 }, { "epoch": 3.187579908675799, "grad_norm": 9.807575131920311, "learning_rate": 6.001250399320401e-08, "logits/chosen": -2.9213831424713135, "logits/rejected": -1.9765907526016235, "logps/chosen": -856.7434692382812, "logps/rejected": -630.8990478515625, "loss": 0.0446, "rewards/accuracies": 1.0, "rewards/chosen": 5.15771484375, "rewards/margins": 6.675102233886719, "rewards/rejected": -1.517387866973877, "step": 4363 }, { "epoch": 3.188310502283105, "grad_norm": 7.513748791278392, "learning_rate": 5.990886794921827e-08, "logits/chosen": -3.1554112434387207, "logits/rejected": -2.304840564727783, "logps/chosen": -423.5997314453125, "logps/rejected": -355.61895751953125, "loss": 0.042, "rewards/accuracies": 1.0, "rewards/chosen": 3.113286018371582, "rewards/margins": 5.839712142944336, "rewards/rejected": -2.726426362991333, "step": 4364 }, { "epoch": 3.1890410958904107, "grad_norm": 6.30107033735065, "learning_rate": 5.980530928465774e-08, "logits/chosen": -2.524980306625366, "logits/rejected": -2.4235098361968994, "logps/chosen": -509.5476379394531, "logps/rejected": -433.81561279296875, "loss": 0.0372, "rewards/accuracies": 1.0, "rewards/chosen": 2.4225149154663086, "rewards/margins": 3.884484052658081, "rewards/rejected": -1.4619691371917725, "step": 4365 }, { "epoch": 3.189771689497717, "grad_norm": 6.275241996080709, "learning_rate": 5.97018280416775e-08, "logits/chosen": -2.4071319103240967, "logits/rejected": -1.2500994205474854, "logps/chosen": -633.3128662109375, "logps/rejected": -349.96978759765625, "loss": 0.0335, "rewards/accuracies": 1.0, "rewards/chosen": 3.027487277984619, "rewards/margins": 6.402413368225098, "rewards/rejected": -3.3749260902404785, "step": 4366 }, { "epoch": 3.190502283105023, "grad_norm": 7.470328575169642, "learning_rate": 5.959842426240105e-08, "logits/chosen": -2.981642246246338, "logits/rejected": -2.337489604949951, "logps/chosen": -679.556396484375, "logps/rejected": -577.4156494140625, "loss": 0.0273, "rewards/accuracies": 1.0, "rewards/chosen": 3.88165545463562, "rewards/margins": 6.5972161293029785, "rewards/rejected": -2.7155609130859375, "step": 4367 }, { "epoch": 3.191232876712329, "grad_norm": 4.4789723672050625, "learning_rate": 5.949509798892058e-08, "logits/chosen": -2.820026159286499, "logits/rejected": -2.132572889328003, "logps/chosen": -859.651123046875, "logps/rejected": -601.4842529296875, "loss": 0.0247, "rewards/accuracies": 1.0, "rewards/chosen": 3.650480270385742, "rewards/margins": 4.923616886138916, "rewards/rejected": -1.2731367349624634, "step": 4368 }, { "epoch": 3.1919634703196347, "grad_norm": 6.666381003299493, "learning_rate": 5.9391849263296434e-08, "logits/chosen": -2.3695731163024902, "logits/rejected": -2.713529586791992, "logps/chosen": -390.4182434082031, "logps/rejected": -560.9360961914062, "loss": 0.0393, "rewards/accuracies": 1.0, "rewards/chosen": 1.411500096321106, "rewards/margins": 3.767998695373535, "rewards/rejected": -2.3564987182617188, "step": 4369 }, { "epoch": 3.1926940639269406, "grad_norm": 9.303265185031938, "learning_rate": 5.9288678127557695e-08, "logits/chosen": -1.842153549194336, "logits/rejected": -1.9111218452453613, "logps/chosen": -486.938232421875, "logps/rejected": -408.3408203125, "loss": 0.0431, "rewards/accuracies": 1.0, "rewards/chosen": 0.8613222241401672, "rewards/margins": 3.9264214038848877, "rewards/rejected": -3.0650992393493652, "step": 4370 }, { "epoch": 3.1934246575342464, "grad_norm": 6.978969863470153, "learning_rate": 5.918558462370163e-08, "logits/chosen": -3.0234498977661133, "logits/rejected": -2.4162094593048096, "logps/chosen": -587.8690185546875, "logps/rejected": -605.2431640625, "loss": 0.0402, "rewards/accuracies": 1.0, "rewards/chosen": 4.0315070152282715, "rewards/margins": 6.651151657104492, "rewards/rejected": -2.6196444034576416, "step": 4371 }, { "epoch": 3.1941552511415523, "grad_norm": 6.264181603441001, "learning_rate": 5.908256879369389e-08, "logits/chosen": -3.1139652729034424, "logits/rejected": -2.3992538452148438, "logps/chosen": -395.4176940917969, "logps/rejected": -237.15719604492188, "loss": 0.0302, "rewards/accuracies": 1.0, "rewards/chosen": 3.104450225830078, "rewards/margins": 6.52046012878418, "rewards/rejected": -3.4160094261169434, "step": 4372 }, { "epoch": 3.1948858447488586, "grad_norm": 5.094339535537921, "learning_rate": 5.897963067946879e-08, "logits/chosen": -3.07944393157959, "logits/rejected": -1.9047017097473145, "logps/chosen": -769.409912109375, "logps/rejected": -526.8789672851562, "loss": 0.0219, "rewards/accuracies": 1.0, "rewards/chosen": 5.339922904968262, "rewards/margins": 6.683042049407959, "rewards/rejected": -1.3431190252304077, "step": 4373 }, { "epoch": 3.1956164383561645, "grad_norm": 4.567083695204622, "learning_rate": 5.8876770322928735e-08, "logits/chosen": -2.4112775325775146, "logits/rejected": -1.5485395193099976, "logps/chosen": -541.9989013671875, "logps/rejected": -399.86846923828125, "loss": 0.0165, "rewards/accuracies": 1.0, "rewards/chosen": 4.168258190155029, "rewards/margins": 8.785795211791992, "rewards/rejected": -4.617537498474121, "step": 4374 }, { "epoch": 3.1963470319634704, "grad_norm": 13.333705579011914, "learning_rate": 5.87739877659445e-08, "logits/chosen": -3.109116792678833, "logits/rejected": -2.390065908432007, "logps/chosen": -480.1728515625, "logps/rejected": -399.439453125, "loss": 0.0393, "rewards/accuracies": 1.0, "rewards/chosen": 3.5082192420959473, "rewards/margins": 7.062907695770264, "rewards/rejected": -3.554687976837158, "step": 4375 }, { "epoch": 3.1970776255707762, "grad_norm": 8.033199088635627, "learning_rate": 5.867128305035537e-08, "logits/chosen": -2.6119158267974854, "logits/rejected": -2.3221821784973145, "logps/chosen": -912.2923583984375, "logps/rejected": -580.0326538085938, "loss": 0.0345, "rewards/accuracies": 1.0, "rewards/chosen": 4.6474456787109375, "rewards/margins": 3.6112568378448486, "rewards/rejected": 1.0361888408660889, "step": 4376 }, { "epoch": 3.197808219178082, "grad_norm": 11.092210431793276, "learning_rate": 5.856865621796869e-08, "logits/chosen": -3.2119154930114746, "logits/rejected": -2.8157718181610107, "logps/chosen": -593.207763671875, "logps/rejected": -472.041015625, "loss": 0.0421, "rewards/accuracies": 1.0, "rewards/chosen": 3.316405773162842, "rewards/margins": 5.720388889312744, "rewards/rejected": -2.4039831161499023, "step": 4377 }, { "epoch": 3.198538812785388, "grad_norm": 6.869324994860103, "learning_rate": 5.846610731056043e-08, "logits/chosen": -2.8092544078826904, "logits/rejected": -2.594816207885742, "logps/chosen": -530.6405639648438, "logps/rejected": -544.58544921875, "loss": 0.0289, "rewards/accuracies": 1.0, "rewards/chosen": 3.6515955924987793, "rewards/margins": 5.783463478088379, "rewards/rejected": -2.1318678855895996, "step": 4378 }, { "epoch": 3.199269406392694, "grad_norm": 5.462048593545909, "learning_rate": 5.8363636369874514e-08, "logits/chosen": -2.727269172668457, "logits/rejected": -2.567669153213501, "logps/chosen": -877.635986328125, "logps/rejected": -772.9072265625, "loss": 0.0273, "rewards/accuracies": 1.0, "rewards/chosen": 2.902214765548706, "rewards/margins": 4.235649585723877, "rewards/rejected": -1.3334349393844604, "step": 4379 }, { "epoch": 3.2, "grad_norm": 3.9540569428206105, "learning_rate": 5.8261243437623314e-08, "logits/chosen": -2.923194408416748, "logits/rejected": -2.5545918941497803, "logps/chosen": -746.6755981445312, "logps/rejected": -681.2811279296875, "loss": 0.0158, "rewards/accuracies": 1.0, "rewards/chosen": 6.826425075531006, "rewards/margins": 6.790935039520264, "rewards/rejected": 0.035490453243255615, "step": 4380 }, { "epoch": 3.200730593607306, "grad_norm": 11.622300357527589, "learning_rate": 5.815892855548729e-08, "logits/chosen": -2.95465087890625, "logits/rejected": -2.2407312393188477, "logps/chosen": -422.59722900390625, "logps/rejected": -333.5140075683594, "loss": 0.0441, "rewards/accuracies": 1.0, "rewards/chosen": 1.0152933597564697, "rewards/margins": 3.8341751098632812, "rewards/rejected": -2.8188817501068115, "step": 4381 }, { "epoch": 3.201461187214612, "grad_norm": 6.450354029675759, "learning_rate": 5.8056691765115404e-08, "logits/chosen": -3.1743814945220947, "logits/rejected": -1.9720404148101807, "logps/chosen": -436.7153015136719, "logps/rejected": -377.38092041015625, "loss": 0.0295, "rewards/accuracies": 1.0, "rewards/chosen": 3.552701234817505, "rewards/margins": 7.065062046051025, "rewards/rejected": -3.5123608112335205, "step": 4382 }, { "epoch": 3.202191780821918, "grad_norm": 4.415569217056858, "learning_rate": 5.795453310812456e-08, "logits/chosen": -3.1082167625427246, "logits/rejected": -1.4582433700561523, "logps/chosen": -761.8646850585938, "logps/rejected": -389.77587890625, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": 3.1484591960906982, "rewards/margins": 4.777369976043701, "rewards/rejected": -1.6289106607437134, "step": 4383 }, { "epoch": 3.2029223744292237, "grad_norm": 4.321102021905502, "learning_rate": 5.7852452626099947e-08, "logits/chosen": -2.3831422328948975, "logits/rejected": -2.134432554244995, "logps/chosen": -713.6426391601562, "logps/rejected": -598.8294677734375, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": 2.9995317459106445, "rewards/margins": 5.822413921356201, "rewards/rejected": -2.8228821754455566, "step": 4384 }, { "epoch": 3.2036529680365295, "grad_norm": 5.431085420316406, "learning_rate": 5.775045036059503e-08, "logits/chosen": -2.355668067932129, "logits/rejected": -2.4545187950134277, "logps/chosen": -443.7926025390625, "logps/rejected": -484.78076171875, "loss": 0.0274, "rewards/accuracies": 1.0, "rewards/chosen": 2.4880316257476807, "rewards/margins": 5.5857625007629395, "rewards/rejected": -3.097731351852417, "step": 4385 }, { "epoch": 3.2043835616438354, "grad_norm": 5.27197914585973, "learning_rate": 5.764852635313125e-08, "logits/chosen": -2.916456460952759, "logits/rejected": -2.0173168182373047, "logps/chosen": -428.7001953125, "logps/rejected": -461.8497314453125, "loss": 0.0182, "rewards/accuracies": 1.0, "rewards/chosen": 3.0041067600250244, "rewards/margins": 7.033071994781494, "rewards/rejected": -4.028965950012207, "step": 4386 }, { "epoch": 3.2051141552511417, "grad_norm": 6.9400984563082355, "learning_rate": 5.7546680645198406e-08, "logits/chosen": -3.3268628120422363, "logits/rejected": -2.1767289638519287, "logps/chosen": -847.4404907226562, "logps/rejected": -442.3311767578125, "loss": 0.0354, "rewards/accuracies": 1.0, "rewards/chosen": 3.3824472427368164, "rewards/margins": 5.194808006286621, "rewards/rejected": -1.8123608827590942, "step": 4387 }, { "epoch": 3.2058447488584476, "grad_norm": 6.562004724614222, "learning_rate": 5.744491327825424e-08, "logits/chosen": -2.4480581283569336, "logits/rejected": -2.3476312160491943, "logps/chosen": -487.63946533203125, "logps/rejected": -598.2494506835938, "loss": 0.0337, "rewards/accuracies": 1.0, "rewards/chosen": 2.5500545501708984, "rewards/margins": 5.86043119430542, "rewards/rejected": -3.3103766441345215, "step": 4388 }, { "epoch": 3.2065753424657535, "grad_norm": 5.7448096691505475, "learning_rate": 5.734322429372462e-08, "logits/chosen": -3.0101234912872314, "logits/rejected": -2.1687886714935303, "logps/chosen": -945.97265625, "logps/rejected": -561.126708984375, "loss": 0.0209, "rewards/accuracies": 1.0, "rewards/chosen": 3.4450840950012207, "rewards/margins": 4.492617607116699, "rewards/rejected": -1.0475338697433472, "step": 4389 }, { "epoch": 3.2073059360730594, "grad_norm": 6.345626478357517, "learning_rate": 5.724161373300371e-08, "logits/chosen": -2.5231122970581055, "logits/rejected": -1.5859215259552002, "logps/chosen": -673.1494750976562, "logps/rejected": -433.1009521484375, "loss": 0.0215, "rewards/accuracies": 1.0, "rewards/chosen": 3.491734027862549, "rewards/margins": 6.20715856552124, "rewards/rejected": -2.7154242992401123, "step": 4390 }, { "epoch": 3.2080365296803652, "grad_norm": 6.751400837533124, "learning_rate": 5.71400816374534e-08, "logits/chosen": -2.6176156997680664, "logits/rejected": -1.8523703813552856, "logps/chosen": -675.0263061523438, "logps/rejected": -540.2457275390625, "loss": 0.0275, "rewards/accuracies": 1.0, "rewards/chosen": 2.687027931213379, "rewards/margins": 4.233039379119873, "rewards/rejected": -1.5460115671157837, "step": 4391 }, { "epoch": 3.208767123287671, "grad_norm": 6.198688909895394, "learning_rate": 5.703862804840406e-08, "logits/chosen": -2.182708263397217, "logits/rejected": -1.741965413093567, "logps/chosen": -618.6812133789062, "logps/rejected": -703.2747802734375, "loss": 0.0273, "rewards/accuracies": 1.0, "rewards/chosen": 3.420637607574463, "rewards/margins": 4.990811347961426, "rewards/rejected": -1.570173740386963, "step": 4392 }, { "epoch": 3.209497716894977, "grad_norm": 5.384911381267703, "learning_rate": 5.693725300715377e-08, "logits/chosen": -2.8677144050598145, "logits/rejected": -1.5235114097595215, "logps/chosen": -1000.4588012695312, "logps/rejected": -529.6414794921875, "loss": 0.0227, "rewards/accuracies": 1.0, "rewards/chosen": 5.7650861740112305, "rewards/margins": 8.229610443115234, "rewards/rejected": -2.464524269104004, "step": 4393 }, { "epoch": 3.2102283105022833, "grad_norm": 4.805837602123422, "learning_rate": 5.6835956554968674e-08, "logits/chosen": -2.844144105911255, "logits/rejected": -2.14750599861145, "logps/chosen": -898.1605224609375, "logps/rejected": -563.34716796875, "loss": 0.0207, "rewards/accuracies": 1.0, "rewards/chosen": 4.374862194061279, "rewards/margins": 6.508479595184326, "rewards/rejected": -2.1336169242858887, "step": 4394 }, { "epoch": 3.210958904109589, "grad_norm": 7.05503491193648, "learning_rate": 5.6734738733083154e-08, "logits/chosen": -2.501232147216797, "logits/rejected": -1.61204195022583, "logps/chosen": -387.7590637207031, "logps/rejected": -307.8074035644531, "loss": 0.0295, "rewards/accuracies": 1.0, "rewards/chosen": 2.411062717437744, "rewards/margins": 5.848906993865967, "rewards/rejected": -3.4378442764282227, "step": 4395 }, { "epoch": 3.211689497716895, "grad_norm": 4.533187136443153, "learning_rate": 5.6633599582699366e-08, "logits/chosen": -2.5293331146240234, "logits/rejected": -2.161501169204712, "logps/chosen": -595.6303100585938, "logps/rejected": -504.1553649902344, "loss": 0.0212, "rewards/accuracies": 1.0, "rewards/chosen": 3.5182125568389893, "rewards/margins": 4.938559055328369, "rewards/rejected": -1.420346736907959, "step": 4396 }, { "epoch": 3.212420091324201, "grad_norm": 5.026072806095516, "learning_rate": 5.653253914498751e-08, "logits/chosen": -2.650998592376709, "logits/rejected": -2.2141873836517334, "logps/chosen": -705.7658081054688, "logps/rejected": -635.6820068359375, "loss": 0.0305, "rewards/accuracies": 1.0, "rewards/chosen": 3.4848124980926514, "rewards/margins": 5.336591720581055, "rewards/rejected": -1.8517794609069824, "step": 4397 }, { "epoch": 3.213150684931507, "grad_norm": 4.736334568294587, "learning_rate": 5.643155746108566e-08, "logits/chosen": -2.465029239654541, "logits/rejected": -1.7446868419647217, "logps/chosen": -699.634033203125, "logps/rejected": -599.0571899414062, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": 4.105599403381348, "rewards/margins": 8.14519214630127, "rewards/rejected": -4.03959321975708, "step": 4398 }, { "epoch": 3.2138812785388127, "grad_norm": 6.1370460233586535, "learning_rate": 5.633065457210007e-08, "logits/chosen": -2.919114828109741, "logits/rejected": -1.623057246208191, "logps/chosen": -340.8140869140625, "logps/rejected": -254.33468627929688, "loss": 0.0214, "rewards/accuracies": 1.0, "rewards/chosen": 4.389450550079346, "rewards/margins": 10.282604217529297, "rewards/rejected": -5.893153190612793, "step": 4399 }, { "epoch": 3.2146118721461185, "grad_norm": 14.088484023488054, "learning_rate": 5.622983051910465e-08, "logits/chosen": -2.8282675743103027, "logits/rejected": -1.9066400527954102, "logps/chosen": -641.1013793945312, "logps/rejected": -520.15478515625, "loss": 0.0507, "rewards/accuracies": 0.875, "rewards/chosen": 3.7397775650024414, "rewards/margins": 6.089944839477539, "rewards/rejected": -2.3501665592193604, "step": 4400 }, { "epoch": 3.215342465753425, "grad_norm": 9.08799871154561, "learning_rate": 5.6129085343141315e-08, "logits/chosen": -2.621427059173584, "logits/rejected": -2.4507741928100586, "logps/chosen": -570.2294311523438, "logps/rejected": -584.6951904296875, "loss": 0.0476, "rewards/accuracies": 1.0, "rewards/chosen": 3.9942681789398193, "rewards/margins": 5.947643280029297, "rewards/rejected": -1.9533754587173462, "step": 4401 }, { "epoch": 3.2160730593607307, "grad_norm": 6.922403767455185, "learning_rate": 5.602841908522002e-08, "logits/chosen": -2.117465019226074, "logits/rejected": -2.28324031829834, "logps/chosen": -617.1494750976562, "logps/rejected": -654.2605590820312, "loss": 0.0323, "rewards/accuracies": 1.0, "rewards/chosen": 2.3024072647094727, "rewards/margins": 4.7293477058410645, "rewards/rejected": -2.426940441131592, "step": 4402 }, { "epoch": 3.2168036529680366, "grad_norm": 4.752137355784467, "learning_rate": 5.59278317863183e-08, "logits/chosen": -2.741395950317383, "logits/rejected": -2.063685178756714, "logps/chosen": -925.8375854492188, "logps/rejected": -699.8141479492188, "loss": 0.0195, "rewards/accuracies": 1.0, "rewards/chosen": 4.163774013519287, "rewards/margins": 6.104983329772949, "rewards/rejected": -1.9412095546722412, "step": 4403 }, { "epoch": 3.2175342465753425, "grad_norm": 13.041231938308206, "learning_rate": 5.582732348738184e-08, "logits/chosen": -1.5298011302947998, "logits/rejected": -2.1977059841156006, "logps/chosen": -525.561279296875, "logps/rejected": -1042.051513671875, "loss": 0.0472, "rewards/accuracies": 1.0, "rewards/chosen": 2.0801033973693848, "rewards/margins": 4.061971664428711, "rewards/rejected": -1.981868028640747, "step": 4404 }, { "epoch": 3.2182648401826484, "grad_norm": 2.998043216864625, "learning_rate": 5.5726894229324e-08, "logits/chosen": -2.9575579166412354, "logits/rejected": -2.3293163776397705, "logps/chosen": -685.1144409179688, "logps/rejected": -534.7911376953125, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": 3.081545829772949, "rewards/margins": 4.889425754547119, "rewards/rejected": -1.8078800439834595, "step": 4405 }, { "epoch": 3.2189954337899542, "grad_norm": 5.043882445306424, "learning_rate": 5.562654405302594e-08, "logits/chosen": -2.793799877166748, "logits/rejected": -2.107853412628174, "logps/chosen": -748.9046630859375, "logps/rejected": -474.7811279296875, "loss": 0.0187, "rewards/accuracies": 1.0, "rewards/chosen": 4.642228126525879, "rewards/margins": 7.937353610992432, "rewards/rejected": -3.2951250076293945, "step": 4406 }, { "epoch": 3.21972602739726, "grad_norm": 9.236120585067848, "learning_rate": 5.552627299933682e-08, "logits/chosen": -3.101778268814087, "logits/rejected": -1.7609224319458008, "logps/chosen": -774.333251953125, "logps/rejected": -685.8675537109375, "loss": 0.0338, "rewards/accuracies": 1.0, "rewards/chosen": 3.2693395614624023, "rewards/margins": 6.969869613647461, "rewards/rejected": -3.7005295753479004, "step": 4407 }, { "epoch": 3.2204566210045664, "grad_norm": 5.065491534198061, "learning_rate": 5.542608110907332e-08, "logits/chosen": -2.708771228790283, "logits/rejected": -1.7015999555587769, "logps/chosen": -520.92626953125, "logps/rejected": -422.55999755859375, "loss": 0.0325, "rewards/accuracies": 1.0, "rewards/chosen": 4.440236568450928, "rewards/margins": 7.33073616027832, "rewards/rejected": -2.890500068664551, "step": 4408 }, { "epoch": 3.2211872146118723, "grad_norm": 5.054826926146522, "learning_rate": 5.5325968423020256e-08, "logits/chosen": -2.1904587745666504, "logits/rejected": -2.552579641342163, "logps/chosen": -467.8780822753906, "logps/rejected": -760.3182373046875, "loss": 0.0235, "rewards/accuracies": 1.0, "rewards/chosen": 2.684349775314331, "rewards/margins": 4.553622722625732, "rewards/rejected": -1.869273066520691, "step": 4409 }, { "epoch": 3.221917808219178, "grad_norm": 3.855830367888055, "learning_rate": 5.522593498192971e-08, "logits/chosen": -3.049877882003784, "logits/rejected": -2.5821568965911865, "logps/chosen": -861.6297607421875, "logps/rejected": -773.7488403320312, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": 5.43422269821167, "rewards/margins": 7.186826705932617, "rewards/rejected": -1.752604603767395, "step": 4410 }, { "epoch": 3.222648401826484, "grad_norm": 7.211182429988456, "learning_rate": 5.5125980826521926e-08, "logits/chosen": -2.898380756378174, "logits/rejected": -1.753164291381836, "logps/chosen": -329.9991149902344, "logps/rejected": -356.12811279296875, "loss": 0.0333, "rewards/accuracies": 1.0, "rewards/chosen": 3.44191837310791, "rewards/margins": 9.769742012023926, "rewards/rejected": -6.327823638916016, "step": 4411 }, { "epoch": 3.22337899543379, "grad_norm": 4.869093618154565, "learning_rate": 5.5026105997484775e-08, "logits/chosen": -2.9875993728637695, "logits/rejected": -2.3183085918426514, "logps/chosen": -872.6797485351562, "logps/rejected": -731.600341796875, "loss": 0.0171, "rewards/accuracies": 1.0, "rewards/chosen": 4.213578701019287, "rewards/margins": 8.038238525390625, "rewards/rejected": -3.824659824371338, "step": 4412 }, { "epoch": 3.224109589041096, "grad_norm": 4.394282979098026, "learning_rate": 5.492631053547375e-08, "logits/chosen": -3.203287124633789, "logits/rejected": -1.9795321226119995, "logps/chosen": -803.7789306640625, "logps/rejected": -565.4219360351562, "loss": 0.0212, "rewards/accuracies": 1.0, "rewards/chosen": 3.5023834705352783, "rewards/margins": 6.506694793701172, "rewards/rejected": -3.0043113231658936, "step": 4413 }, { "epoch": 3.2248401826484017, "grad_norm": 7.40149983852051, "learning_rate": 5.482659448111207e-08, "logits/chosen": -3.2167110443115234, "logits/rejected": -2.1476731300354004, "logps/chosen": -815.2188720703125, "logps/rejected": -766.1075439453125, "loss": 0.0262, "rewards/accuracies": 1.0, "rewards/chosen": 4.171926498413086, "rewards/margins": 5.4776506423950195, "rewards/rejected": -1.3057239055633545, "step": 4414 }, { "epoch": 3.225570776255708, "grad_norm": 6.2920400075058875, "learning_rate": 5.47269578749906e-08, "logits/chosen": -2.718777656555176, "logits/rejected": -1.7236424684524536, "logps/chosen": -713.1605224609375, "logps/rejected": -547.251220703125, "loss": 0.0397, "rewards/accuracies": 1.0, "rewards/chosen": 4.350922584533691, "rewards/margins": 5.584244728088379, "rewards/rejected": -1.2333223819732666, "step": 4415 }, { "epoch": 3.226301369863014, "grad_norm": 6.037780704646563, "learning_rate": 5.462740075766797e-08, "logits/chosen": -3.148538112640381, "logits/rejected": -2.8952853679656982, "logps/chosen": -566.450439453125, "logps/rejected": -626.4072265625, "loss": 0.0325, "rewards/accuracies": 1.0, "rewards/chosen": 2.737809896469116, "rewards/margins": 6.64097261428833, "rewards/rejected": -3.903162717819214, "step": 4416 }, { "epoch": 3.2270319634703197, "grad_norm": 4.795962765712184, "learning_rate": 5.4527923169670337e-08, "logits/chosen": -2.6847457885742188, "logits/rejected": -1.8880958557128906, "logps/chosen": -960.0604248046875, "logps/rejected": -653.2186279296875, "loss": 0.0196, "rewards/accuracies": 1.0, "rewards/chosen": 4.852004051208496, "rewards/margins": 6.117954730987549, "rewards/rejected": -1.2659506797790527, "step": 4417 }, { "epoch": 3.2277625570776256, "grad_norm": 8.564371391785025, "learning_rate": 5.44285251514916e-08, "logits/chosen": -3.0443613529205322, "logits/rejected": -2.4184908866882324, "logps/chosen": -792.6697998046875, "logps/rejected": -701.6777954101562, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": 2.9332215785980225, "rewards/margins": 4.898166656494141, "rewards/rejected": -1.964944839477539, "step": 4418 }, { "epoch": 3.2284931506849315, "grad_norm": 6.807182266379742, "learning_rate": 5.4329206743593174e-08, "logits/chosen": -3.2077245712280273, "logits/rejected": -2.4675984382629395, "logps/chosen": -757.268798828125, "logps/rejected": -605.5224609375, "loss": 0.037, "rewards/accuracies": 1.0, "rewards/chosen": 3.756377696990967, "rewards/margins": 6.84250020980835, "rewards/rejected": -3.086122512817383, "step": 4419 }, { "epoch": 3.2292237442922374, "grad_norm": 5.172368420807475, "learning_rate": 5.4229967986404e-08, "logits/chosen": -2.884172201156616, "logits/rejected": -1.982043981552124, "logps/chosen": -661.5453491210938, "logps/rejected": -516.8847045898438, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": 3.913437604904175, "rewards/margins": 6.290129661560059, "rewards/rejected": -2.376692533493042, "step": 4420 }, { "epoch": 3.2299543378995432, "grad_norm": 4.661915449464843, "learning_rate": 5.413080892032085e-08, "logits/chosen": -3.460897445678711, "logits/rejected": -2.5721230506896973, "logps/chosen": -548.4326171875, "logps/rejected": -455.1730651855469, "loss": 0.0171, "rewards/accuracies": 1.0, "rewards/chosen": 3.297302722930908, "rewards/margins": 7.204914569854736, "rewards/rejected": -3.9076120853424072, "step": 4421 }, { "epoch": 3.2306849315068495, "grad_norm": 4.279734322332518, "learning_rate": 5.4031729585707845e-08, "logits/chosen": -2.510986566543579, "logits/rejected": -2.614894390106201, "logps/chosen": -642.8778076171875, "logps/rejected": -789.249267578125, "loss": 0.0165, "rewards/accuracies": 1.0, "rewards/chosen": 4.485384941101074, "rewards/margins": 5.091793537139893, "rewards/rejected": -0.6064082384109497, "step": 4422 }, { "epoch": 3.2314155251141554, "grad_norm": 5.082894019649656, "learning_rate": 5.393273002289658e-08, "logits/chosen": -2.9606404304504395, "logits/rejected": -2.2756834030151367, "logps/chosen": -1172.2493896484375, "logps/rejected": -851.1019897460938, "loss": 0.0212, "rewards/accuracies": 1.0, "rewards/chosen": 4.513463020324707, "rewards/margins": 7.0204267501831055, "rewards/rejected": -2.5069639682769775, "step": 4423 }, { "epoch": 3.2321461187214613, "grad_norm": 6.933942277772047, "learning_rate": 5.383381027218648e-08, "logits/chosen": -3.044816493988037, "logits/rejected": -2.805919647216797, "logps/chosen": -762.7039184570312, "logps/rejected": -633.4560546875, "loss": 0.0387, "rewards/accuracies": 1.0, "rewards/chosen": 4.239287376403809, "rewards/margins": 4.830534934997559, "rewards/rejected": -0.5912471413612366, "step": 4424 }, { "epoch": 3.232876712328767, "grad_norm": 6.329878952532263, "learning_rate": 5.373497037384417e-08, "logits/chosen": -2.4824702739715576, "logits/rejected": -1.9295248985290527, "logps/chosen": -456.86737060546875, "logps/rejected": -346.90667724609375, "loss": 0.0291, "rewards/accuracies": 1.0, "rewards/chosen": 2.306226968765259, "rewards/margins": 5.034750938415527, "rewards/rejected": -2.7285237312316895, "step": 4425 }, { "epoch": 3.233607305936073, "grad_norm": 4.573202878761503, "learning_rate": 5.363621036810406e-08, "logits/chosen": -2.227736234664917, "logits/rejected": -2.2914319038391113, "logps/chosen": -491.50213623046875, "logps/rejected": -491.39447021484375, "loss": 0.0248, "rewards/accuracies": 1.0, "rewards/chosen": 2.2158942222595215, "rewards/margins": 5.671590328216553, "rewards/rejected": -3.4556961059570312, "step": 4426 }, { "epoch": 3.234337899543379, "grad_norm": 8.607320162988222, "learning_rate": 5.3537530295167664e-08, "logits/chosen": -2.428483486175537, "logits/rejected": -2.2313733100891113, "logps/chosen": -492.06329345703125, "logps/rejected": -668.4161376953125, "loss": 0.0395, "rewards/accuracies": 1.0, "rewards/chosen": 4.051191329956055, "rewards/margins": 5.618574142456055, "rewards/rejected": -1.567383050918579, "step": 4427 }, { "epoch": 3.235068493150685, "grad_norm": 3.8319223682049857, "learning_rate": 5.343893019520429e-08, "logits/chosen": -2.772066593170166, "logits/rejected": -1.9679003953933716, "logps/chosen": -579.754150390625, "logps/rejected": -516.7311401367188, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": 3.048475503921509, "rewards/margins": 6.804407119750977, "rewards/rejected": -3.755931854248047, "step": 4428 }, { "epoch": 3.2357990867579907, "grad_norm": 7.70274615178305, "learning_rate": 5.334041010835064e-08, "logits/chosen": -2.6871988773345947, "logits/rejected": -2.0153870582580566, "logps/chosen": -402.9283142089844, "logps/rejected": -332.2955627441406, "loss": 0.0324, "rewards/accuracies": 1.0, "rewards/chosen": 1.9823315143585205, "rewards/margins": 4.448016166687012, "rewards/rejected": -2.465684413909912, "step": 4429 }, { "epoch": 3.236529680365297, "grad_norm": 6.369194528865735, "learning_rate": 5.324197007471063e-08, "logits/chosen": -2.9580352306365967, "logits/rejected": -2.1427578926086426, "logps/chosen": -1089.737060546875, "logps/rejected": -688.85595703125, "loss": 0.0276, "rewards/accuracies": 1.0, "rewards/chosen": 5.004962921142578, "rewards/margins": 6.326016902923584, "rewards/rejected": -1.321054220199585, "step": 4430 }, { "epoch": 3.237260273972603, "grad_norm": 5.327157836122552, "learning_rate": 5.314361013435597e-08, "logits/chosen": -3.1825647354125977, "logits/rejected": -2.201756000518799, "logps/chosen": -540.0231323242188, "logps/rejected": -408.1155700683594, "loss": 0.0214, "rewards/accuracies": 1.0, "rewards/chosen": 2.7252578735351562, "rewards/margins": 6.138764381408691, "rewards/rejected": -3.4135069847106934, "step": 4431 }, { "epoch": 3.2379908675799087, "grad_norm": 3.5665860752258074, "learning_rate": 5.304533032732528e-08, "logits/chosen": -2.9694833755493164, "logits/rejected": -2.0906364917755127, "logps/chosen": -928.4429931640625, "logps/rejected": -642.9299926757812, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": 2.527474880218506, "rewards/margins": 5.515583515167236, "rewards/rejected": -2.9881081581115723, "step": 4432 }, { "epoch": 3.2387214611872146, "grad_norm": 5.466617713491942, "learning_rate": 5.294713069362497e-08, "logits/chosen": -2.7782187461853027, "logits/rejected": -1.7705131769180298, "logps/chosen": -630.9698486328125, "logps/rejected": -429.80914306640625, "loss": 0.0246, "rewards/accuracies": 1.0, "rewards/chosen": 3.393449306488037, "rewards/margins": 6.545701503753662, "rewards/rejected": -3.1522514820098877, "step": 4433 }, { "epoch": 3.2394520547945205, "grad_norm": 5.674644949809326, "learning_rate": 5.2849011273228575e-08, "logits/chosen": -3.2830896377563477, "logits/rejected": -2.584526777267456, "logps/chosen": -410.6875305175781, "logps/rejected": -382.1310119628906, "loss": 0.0223, "rewards/accuracies": 1.0, "rewards/chosen": 3.2485995292663574, "rewards/margins": 8.292757034301758, "rewards/rejected": -5.044157981872559, "step": 4434 }, { "epoch": 3.2401826484018263, "grad_norm": 9.211644010488929, "learning_rate": 5.2750972106077177e-08, "logits/chosen": -3.15057373046875, "logits/rejected": -2.459470748901367, "logps/chosen": -811.4141845703125, "logps/rejected": -703.7783813476562, "loss": 0.0595, "rewards/accuracies": 1.0, "rewards/chosen": 4.335672378540039, "rewards/margins": 4.183979034423828, "rewards/rejected": 0.15169301629066467, "step": 4435 }, { "epoch": 3.2409132420091322, "grad_norm": 7.341131514559762, "learning_rate": 5.265301323207905e-08, "logits/chosen": -2.428863286972046, "logits/rejected": -2.3065743446350098, "logps/chosen": -257.2083435058594, "logps/rejected": -323.6369934082031, "loss": 0.0545, "rewards/accuracies": 1.0, "rewards/chosen": 1.7832908630371094, "rewards/margins": 6.588155746459961, "rewards/rejected": -4.80486536026001, "step": 4436 }, { "epoch": 3.2416438356164385, "grad_norm": 7.333920137451698, "learning_rate": 5.255513469110967e-08, "logits/chosen": -3.0618398189544678, "logits/rejected": -2.7127037048339844, "logps/chosen": -779.3713989257812, "logps/rejected": -691.8101196289062, "loss": 0.0301, "rewards/accuracies": 1.0, "rewards/chosen": 3.261547565460205, "rewards/margins": 5.493005752563477, "rewards/rejected": -2.2314579486846924, "step": 4437 }, { "epoch": 3.2423744292237444, "grad_norm": 8.732511645772634, "learning_rate": 5.245733652301215e-08, "logits/chosen": -3.189469337463379, "logits/rejected": -1.9590885639190674, "logps/chosen": -856.1639404296875, "logps/rejected": -476.6182861328125, "loss": 0.0446, "rewards/accuracies": 1.0, "rewards/chosen": 3.5212464332580566, "rewards/margins": 5.045470714569092, "rewards/rejected": -1.5242239236831665, "step": 4438 }, { "epoch": 3.2431050228310503, "grad_norm": 7.448425730845406, "learning_rate": 5.2359618767596594e-08, "logits/chosen": -2.8658652305603027, "logits/rejected": -1.921442985534668, "logps/chosen": -845.5468139648438, "logps/rejected": -566.564453125, "loss": 0.0399, "rewards/accuracies": 1.0, "rewards/chosen": 3.8286945819854736, "rewards/margins": 5.720937252044678, "rewards/rejected": -1.892242670059204, "step": 4439 }, { "epoch": 3.243835616438356, "grad_norm": 6.224403150509945, "learning_rate": 5.226198146464042e-08, "logits/chosen": -2.525632381439209, "logits/rejected": -2.5934455394744873, "logps/chosen": -412.3788146972656, "logps/rejected": -552.748291015625, "loss": 0.03, "rewards/accuracies": 1.0, "rewards/chosen": 1.823514699935913, "rewards/margins": 5.39688777923584, "rewards/rejected": -3.5733730792999268, "step": 4440 }, { "epoch": 3.244566210045662, "grad_norm": 4.102845084062214, "learning_rate": 5.2164424653888484e-08, "logits/chosen": -3.002035140991211, "logits/rejected": -1.9991445541381836, "logps/chosen": -431.47784423828125, "logps/rejected": -297.24169921875, "loss": 0.0146, "rewards/accuracies": 1.0, "rewards/chosen": 2.1312427520751953, "rewards/margins": 6.217947006225586, "rewards/rejected": -4.086704254150391, "step": 4441 }, { "epoch": 3.245296803652968, "grad_norm": 3.9875700463148704, "learning_rate": 5.206694837505257e-08, "logits/chosen": -2.7260730266571045, "logits/rejected": -2.0416033267974854, "logps/chosen": -757.8568725585938, "logps/rejected": -609.5721435546875, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": 5.284684181213379, "rewards/margins": 6.979593276977539, "rewards/rejected": -1.6949093341827393, "step": 4442 }, { "epoch": 3.246027397260274, "grad_norm": 5.83911895847153, "learning_rate": 5.196955266781203e-08, "logits/chosen": -2.4966506958007812, "logits/rejected": -2.052783250808716, "logps/chosen": -620.0440673828125, "logps/rejected": -471.97930908203125, "loss": 0.0257, "rewards/accuracies": 1.0, "rewards/chosen": 2.873948097229004, "rewards/margins": 4.934948921203613, "rewards/rejected": -2.0610008239746094, "step": 4443 }, { "epoch": 3.24675799086758, "grad_norm": 7.972992130663682, "learning_rate": 5.187223757181314e-08, "logits/chosen": -2.447216510772705, "logits/rejected": -2.541353702545166, "logps/chosen": -528.3345336914062, "logps/rejected": -507.8627014160156, "loss": 0.0495, "rewards/accuracies": 1.0, "rewards/chosen": 1.8362350463867188, "rewards/margins": 4.019979953765869, "rewards/rejected": -2.1837449073791504, "step": 4444 }, { "epoch": 3.247488584474886, "grad_norm": 4.937546741892085, "learning_rate": 5.177500312666938e-08, "logits/chosen": -2.9155049324035645, "logits/rejected": -2.163689613342285, "logps/chosen": -599.3370361328125, "logps/rejected": -449.978759765625, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": 4.441451549530029, "rewards/margins": 6.640956878662109, "rewards/rejected": -2.199505090713501, "step": 4445 }, { "epoch": 3.248219178082192, "grad_norm": 4.2179838261879805, "learning_rate": 5.167784937196165e-08, "logits/chosen": -2.785822629928589, "logits/rejected": -2.5416808128356934, "logps/chosen": -1032.992431640625, "logps/rejected": -960.5733642578125, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": 4.819465637207031, "rewards/margins": 7.428081512451172, "rewards/rejected": -2.6086161136627197, "step": 4446 }, { "epoch": 3.2489497716894977, "grad_norm": 31.155852320054773, "learning_rate": 5.158077634723765e-08, "logits/chosen": -3.1591405868530273, "logits/rejected": -2.344672203063965, "logps/chosen": -792.99169921875, "logps/rejected": -537.6014404296875, "loss": 0.0901, "rewards/accuracies": 1.0, "rewards/chosen": 5.045554161071777, "rewards/margins": 7.022958755493164, "rewards/rejected": -1.9774047136306763, "step": 4447 }, { "epoch": 3.2496803652968036, "grad_norm": 7.20909655578525, "learning_rate": 5.148378409201265e-08, "logits/chosen": -2.7249622344970703, "logits/rejected": -1.8670196533203125, "logps/chosen": -435.3029479980469, "logps/rejected": -317.5274658203125, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": 2.2642741203308105, "rewards/margins": 5.241084575653076, "rewards/rejected": -2.976810932159424, "step": 4448 }, { "epoch": 3.2504109589041095, "grad_norm": 19.83567595337612, "learning_rate": 5.138687264576849e-08, "logits/chosen": -3.069779872894287, "logits/rejected": -2.248354434967041, "logps/chosen": -475.80670166015625, "logps/rejected": -367.4786071777344, "loss": 0.0846, "rewards/accuracies": 1.0, "rewards/chosen": 2.212052583694458, "rewards/margins": 4.77511739730835, "rewards/rejected": -2.5630650520324707, "step": 4449 }, { "epoch": 3.2511415525114153, "grad_norm": 6.339512504343264, "learning_rate": 5.1290042047954616e-08, "logits/chosen": -2.9552526473999023, "logits/rejected": -2.2109038829803467, "logps/chosen": -857.6814575195312, "logps/rejected": -665.66943359375, "loss": 0.0261, "rewards/accuracies": 1.0, "rewards/chosen": 4.565227508544922, "rewards/margins": 5.167564392089844, "rewards/rejected": -0.6023367643356323, "step": 4450 }, { "epoch": 3.251872146118721, "grad_norm": 3.9480454449883626, "learning_rate": 5.1193292337987276e-08, "logits/chosen": -2.7108888626098633, "logits/rejected": -2.2986419200897217, "logps/chosen": -566.85205078125, "logps/rejected": -652.4281005859375, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": 2.8311846256256104, "rewards/margins": 5.74308443069458, "rewards/rejected": -2.9118995666503906, "step": 4451 }, { "epoch": 3.2526027397260275, "grad_norm": 5.677892262166612, "learning_rate": 5.109662355524996e-08, "logits/chosen": -3.2468717098236084, "logits/rejected": -2.3596184253692627, "logps/chosen": -896.5017700195312, "logps/rejected": -558.5394287109375, "loss": 0.0242, "rewards/accuracies": 1.0, "rewards/chosen": 4.2895355224609375, "rewards/margins": 4.606741428375244, "rewards/rejected": -0.3172060549259186, "step": 4452 }, { "epoch": 3.2533333333333334, "grad_norm": 5.14975769065506, "learning_rate": 5.100003573909309e-08, "logits/chosen": -2.861607551574707, "logits/rejected": -2.364433765411377, "logps/chosen": -666.9209594726562, "logps/rejected": -503.9356689453125, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": 3.4385488033294678, "rewards/margins": 7.560405254364014, "rewards/rejected": -4.121856212615967, "step": 4453 }, { "epoch": 3.2540639269406393, "grad_norm": 9.182965255902545, "learning_rate": 5.090352892883412e-08, "logits/chosen": -2.807950496673584, "logits/rejected": -1.867527723312378, "logps/chosen": -526.5480346679688, "logps/rejected": -360.93145751953125, "loss": 0.0352, "rewards/accuracies": 1.0, "rewards/chosen": 4.698794364929199, "rewards/margins": 8.547670364379883, "rewards/rejected": -3.8488755226135254, "step": 4454 }, { "epoch": 3.254794520547945, "grad_norm": 5.609310394170083, "learning_rate": 5.080710316375769e-08, "logits/chosen": -2.558300018310547, "logits/rejected": -2.5796990394592285, "logps/chosen": -500.152099609375, "logps/rejected": -583.1539916992188, "loss": 0.0233, "rewards/accuracies": 1.0, "rewards/chosen": 2.214672327041626, "rewards/margins": 4.7441725730896, "rewards/rejected": -2.5295002460479736, "step": 4455 }, { "epoch": 3.255525114155251, "grad_norm": 5.795334274485583, "learning_rate": 5.071075848311523e-08, "logits/chosen": -2.2392141819000244, "logits/rejected": -1.2895461320877075, "logps/chosen": -506.2963562011719, "logps/rejected": -471.6554870605469, "loss": 0.0168, "rewards/accuracies": 1.0, "rewards/chosen": 4.022622108459473, "rewards/margins": 8.572441101074219, "rewards/rejected": -4.549818515777588, "step": 4456 }, { "epoch": 3.256255707762557, "grad_norm": 3.4471242556883954, "learning_rate": 5.061449492612541e-08, "logits/chosen": -3.1485934257507324, "logits/rejected": -2.227813243865967, "logps/chosen": -850.4964599609375, "logps/rejected": -571.6692504882812, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": 5.475297927856445, "rewards/margins": 6.525531768798828, "rewards/rejected": -1.050234079360962, "step": 4457 }, { "epoch": 3.256986301369863, "grad_norm": 5.7620563990048295, "learning_rate": 5.051831253197364e-08, "logits/chosen": -2.6052112579345703, "logits/rejected": -2.0812156200408936, "logps/chosen": -504.8952331542969, "logps/rejected": -498.85443115234375, "loss": 0.0257, "rewards/accuracies": 1.0, "rewards/chosen": 3.3841803073883057, "rewards/margins": 7.480613708496094, "rewards/rejected": -4.096433639526367, "step": 4458 }, { "epoch": 3.257716894977169, "grad_norm": 4.2509365171916365, "learning_rate": 5.042221133981239e-08, "logits/chosen": -3.2415125370025635, "logits/rejected": -2.081836700439453, "logps/chosen": -566.5728759765625, "logps/rejected": -398.45928955078125, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": 3.593020439147949, "rewards/margins": 5.904117107391357, "rewards/rejected": -2.3110969066619873, "step": 4459 }, { "epoch": 3.258447488584475, "grad_norm": 6.428919731271562, "learning_rate": 5.032619138876118e-08, "logits/chosen": -2.8876593112945557, "logits/rejected": -1.953834056854248, "logps/chosen": -852.6756591796875, "logps/rejected": -442.46746826171875, "loss": 0.0293, "rewards/accuracies": 1.0, "rewards/chosen": 3.5911006927490234, "rewards/margins": 6.376735210418701, "rewards/rejected": -2.7856345176696777, "step": 4460 }, { "epoch": 3.259178082191781, "grad_norm": 5.638525236500353, "learning_rate": 5.023025271790629e-08, "logits/chosen": -3.1935040950775146, "logits/rejected": -2.317866086959839, "logps/chosen": -723.00244140625, "logps/rejected": -490.041259765625, "loss": 0.0263, "rewards/accuracies": 1.0, "rewards/chosen": 3.038180351257324, "rewards/margins": 5.884344100952148, "rewards/rejected": -2.846163749694824, "step": 4461 }, { "epoch": 3.2599086757990867, "grad_norm": 8.11603556326364, "learning_rate": 5.013439536630099e-08, "logits/chosen": -2.7192249298095703, "logits/rejected": -1.91488778591156, "logps/chosen": -934.2389526367188, "logps/rejected": -638.5487670898438, "loss": 0.0258, "rewards/accuracies": 1.0, "rewards/chosen": 4.970974922180176, "rewards/margins": 4.839399337768555, "rewards/rejected": 0.1315755844116211, "step": 4462 }, { "epoch": 3.2606392694063926, "grad_norm": 6.481894292047754, "learning_rate": 5.0038619372965385e-08, "logits/chosen": -2.749807357788086, "logits/rejected": -2.2627201080322266, "logps/chosen": -741.2200927734375, "logps/rejected": -588.6592407226562, "loss": 0.0296, "rewards/accuracies": 1.0, "rewards/chosen": 3.293491840362549, "rewards/margins": 3.9527347087860107, "rewards/rejected": -0.6592429876327515, "step": 4463 }, { "epoch": 3.2613698630136985, "grad_norm": 3.542560928240734, "learning_rate": 4.994292477688658e-08, "logits/chosen": -2.7776265144348145, "logits/rejected": -2.244149684906006, "logps/chosen": -882.38427734375, "logps/rejected": -786.741943359375, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": 4.702214241027832, "rewards/margins": 7.2767558097839355, "rewards/rejected": -2.5745415687561035, "step": 4464 }, { "epoch": 3.2621004566210043, "grad_norm": 4.219219944528564, "learning_rate": 4.984731161701855e-08, "logits/chosen": -2.496842622756958, "logits/rejected": -2.059544086456299, "logps/chosen": -761.580810546875, "logps/rejected": -662.606201171875, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": 3.2667996883392334, "rewards/margins": 5.85433292388916, "rewards/rejected": -2.5875329971313477, "step": 4465 }, { "epoch": 3.2628310502283107, "grad_norm": 6.856088046279733, "learning_rate": 4.9751779932281996e-08, "logits/chosen": -2.903657913208008, "logits/rejected": -2.196990489959717, "logps/chosen": -731.6365966796875, "logps/rejected": -448.3996276855469, "loss": 0.0423, "rewards/accuracies": 1.0, "rewards/chosen": 4.650930404663086, "rewards/margins": 6.149022579193115, "rewards/rejected": -1.4980921745300293, "step": 4466 }, { "epoch": 3.2635616438356165, "grad_norm": 7.214028668174541, "learning_rate": 4.965632976156448e-08, "logits/chosen": -2.2705771923065186, "logits/rejected": -1.859769344329834, "logps/chosen": -682.3779907226562, "logps/rejected": -468.70208740234375, "loss": 0.0397, "rewards/accuracies": 1.0, "rewards/chosen": 2.8907248973846436, "rewards/margins": 5.278532981872559, "rewards/rejected": -2.387807846069336, "step": 4467 }, { "epoch": 3.2642922374429224, "grad_norm": 8.150421281469423, "learning_rate": 4.956096114372038e-08, "logits/chosen": -2.8608598709106445, "logits/rejected": -2.4167613983154297, "logps/chosen": -797.1869506835938, "logps/rejected": -584.32666015625, "loss": 0.0329, "rewards/accuracies": 1.0, "rewards/chosen": 3.8538453578948975, "rewards/margins": 4.581598281860352, "rewards/rejected": -0.7277528047561646, "step": 4468 }, { "epoch": 3.2650228310502283, "grad_norm": 4.814658733565976, "learning_rate": 4.946567411757105e-08, "logits/chosen": -2.5920252799987793, "logits/rejected": -2.2606678009033203, "logps/chosen": -505.32196044921875, "logps/rejected": -470.28533935546875, "loss": 0.0375, "rewards/accuracies": 1.0, "rewards/chosen": 3.5500845909118652, "rewards/margins": 5.40687894821167, "rewards/rejected": -1.8567943572998047, "step": 4469 }, { "epoch": 3.265753424657534, "grad_norm": 6.11295613652272, "learning_rate": 4.93704687219044e-08, "logits/chosen": -2.4663472175598145, "logits/rejected": -2.3485817909240723, "logps/chosen": -536.6119384765625, "logps/rejected": -541.2761840820312, "loss": 0.0254, "rewards/accuracies": 1.0, "rewards/chosen": 4.409688472747803, "rewards/margins": 6.941074371337891, "rewards/rejected": -2.531386137008667, "step": 4470 }, { "epoch": 3.26648401826484, "grad_norm": 5.258617413197947, "learning_rate": 4.9275344995475173e-08, "logits/chosen": -2.4287490844726562, "logits/rejected": -2.2116289138793945, "logps/chosen": -565.0237426757812, "logps/rejected": -548.5859375, "loss": 0.0228, "rewards/accuracies": 1.0, "rewards/chosen": 3.4462997913360596, "rewards/margins": 4.486033916473389, "rewards/rejected": -1.0397342443466187, "step": 4471 }, { "epoch": 3.267214611872146, "grad_norm": 6.748456990522329, "learning_rate": 4.918030297700498e-08, "logits/chosen": -2.7695088386535645, "logits/rejected": -2.247857093811035, "logps/chosen": -610.6591796875, "logps/rejected": -555.5242919921875, "loss": 0.0201, "rewards/accuracies": 1.0, "rewards/chosen": 3.4386463165283203, "rewards/margins": 6.5985212326049805, "rewards/rejected": -3.15987491607666, "step": 4472 }, { "epoch": 3.267945205479452, "grad_norm": 11.783285840346592, "learning_rate": 4.9085342705181996e-08, "logits/chosen": -2.826820135116577, "logits/rejected": -2.6318788528442383, "logps/chosen": -547.5552368164062, "logps/rejected": -698.8336791992188, "loss": 0.05, "rewards/accuracies": 1.0, "rewards/chosen": 3.1980886459350586, "rewards/margins": 6.7786054611206055, "rewards/rejected": -3.5805165767669678, "step": 4473 }, { "epoch": 3.268675799086758, "grad_norm": 13.89626957076605, "learning_rate": 4.899046421866135e-08, "logits/chosen": -2.858828544616699, "logits/rejected": -2.498079299926758, "logps/chosen": -597.9325561523438, "logps/rejected": -580.71044921875, "loss": 0.0498, "rewards/accuracies": 1.0, "rewards/chosen": 3.2911536693573, "rewards/margins": 6.471927642822266, "rewards/rejected": -3.1807737350463867, "step": 4474 }, { "epoch": 3.269406392694064, "grad_norm": 5.640663210245407, "learning_rate": 4.889566755606464e-08, "logits/chosen": -2.8235011100769043, "logits/rejected": -2.0606558322906494, "logps/chosen": -659.5083618164062, "logps/rejected": -426.4229431152344, "loss": 0.0358, "rewards/accuracies": 1.0, "rewards/chosen": 4.707742214202881, "rewards/margins": 5.661443710327148, "rewards/rejected": -0.9537009596824646, "step": 4475 }, { "epoch": 3.27013698630137, "grad_norm": 6.245263402925759, "learning_rate": 4.8800952755980227e-08, "logits/chosen": -2.704029083251953, "logits/rejected": -2.1966586112976074, "logps/chosen": -439.960693359375, "logps/rejected": -437.1482238769531, "loss": 0.0267, "rewards/accuracies": 1.0, "rewards/chosen": 2.40059232711792, "rewards/margins": 5.281685829162598, "rewards/rejected": -2.8810932636260986, "step": 4476 }, { "epoch": 3.2708675799086757, "grad_norm": 5.507472424119925, "learning_rate": 4.87063198569633e-08, "logits/chosen": -3.002349615097046, "logits/rejected": -2.368062973022461, "logps/chosen": -537.0032348632812, "logps/rejected": -402.4429931640625, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": 2.4359874725341797, "rewards/margins": 4.690267562866211, "rewards/rejected": -2.2542800903320312, "step": 4477 }, { "epoch": 3.2715981735159816, "grad_norm": 4.931844635322367, "learning_rate": 4.861176889753543e-08, "logits/chosen": -2.8529748916625977, "logits/rejected": -2.5038723945617676, "logps/chosen": -828.1056518554688, "logps/rejected": -645.6577758789062, "loss": 0.0244, "rewards/accuracies": 1.0, "rewards/chosen": 3.1746716499328613, "rewards/margins": 4.43496561050415, "rewards/rejected": -1.2602941989898682, "step": 4478 }, { "epoch": 3.2723287671232875, "grad_norm": 7.305165060007548, "learning_rate": 4.851729991618525e-08, "logits/chosen": -2.733456611633301, "logits/rejected": -1.6137139797210693, "logps/chosen": -483.0257568359375, "logps/rejected": -376.865234375, "loss": 0.0475, "rewards/accuracies": 1.0, "rewards/chosen": 1.8415539264678955, "rewards/margins": 6.730180740356445, "rewards/rejected": -4.888626575469971, "step": 4479 }, { "epoch": 3.273059360730594, "grad_norm": 6.637370813409272, "learning_rate": 4.842291295136747e-08, "logits/chosen": -3.382510185241699, "logits/rejected": -2.465587615966797, "logps/chosen": -647.9971923828125, "logps/rejected": -511.333251953125, "loss": 0.0333, "rewards/accuracies": 1.0, "rewards/chosen": 3.197950839996338, "rewards/margins": 6.622534275054932, "rewards/rejected": -3.4245827198028564, "step": 4480 }, { "epoch": 3.2737899543378997, "grad_norm": 7.435822369797812, "learning_rate": 4.832860804150382e-08, "logits/chosen": -2.7785229682922363, "logits/rejected": -2.8768091201782227, "logps/chosen": -216.4243621826172, "logps/rejected": -395.93896484375, "loss": 0.0279, "rewards/accuracies": 1.0, "rewards/chosen": 1.4963279962539673, "rewards/margins": 6.203202247619629, "rewards/rejected": -4.706874370574951, "step": 4481 }, { "epoch": 3.2745205479452055, "grad_norm": 5.825430648667242, "learning_rate": 4.8234385224982606e-08, "logits/chosen": -2.884838104248047, "logits/rejected": -2.954608201980591, "logps/chosen": -667.661865234375, "logps/rejected": -1162.260498046875, "loss": 0.033, "rewards/accuracies": 1.0, "rewards/chosen": 3.6545209884643555, "rewards/margins": 7.1818647384643555, "rewards/rejected": -3.5273430347442627, "step": 4482 }, { "epoch": 3.2752511415525114, "grad_norm": 4.966465070759309, "learning_rate": 4.814024454015858e-08, "logits/chosen": -2.5316147804260254, "logits/rejected": -1.9927722215652466, "logps/chosen": -393.04058837890625, "logps/rejected": -369.00897216796875, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": 2.3607873916625977, "rewards/margins": 6.9190144538879395, "rewards/rejected": -4.558226585388184, "step": 4483 }, { "epoch": 3.2759817351598173, "grad_norm": 6.807933431745935, "learning_rate": 4.804618602535307e-08, "logits/chosen": -2.340529441833496, "logits/rejected": -1.8545677661895752, "logps/chosen": -593.3994140625, "logps/rejected": -619.7116088867188, "loss": 0.0406, "rewards/accuracies": 1.0, "rewards/chosen": 2.572415590286255, "rewards/margins": 4.815392971038818, "rewards/rejected": -2.2429776191711426, "step": 4484 }, { "epoch": 3.276712328767123, "grad_norm": 6.654184226831218, "learning_rate": 4.795220971885394e-08, "logits/chosen": -3.1634182929992676, "logits/rejected": -1.8852436542510986, "logps/chosen": -768.03857421875, "logps/rejected": -470.83990478515625, "loss": 0.028, "rewards/accuracies": 1.0, "rewards/chosen": 5.515518665313721, "rewards/margins": 7.5832014083862305, "rewards/rejected": -2.0676825046539307, "step": 4485 }, { "epoch": 3.277442922374429, "grad_norm": 5.070524660911735, "learning_rate": 4.785831565891579e-08, "logits/chosen": -2.810227632522583, "logits/rejected": -1.4894689321517944, "logps/chosen": -488.1103515625, "logps/rejected": -323.5629577636719, "loss": 0.0227, "rewards/accuracies": 1.0, "rewards/chosen": 2.016552686691284, "rewards/margins": 5.050207138061523, "rewards/rejected": -3.0336544513702393, "step": 4486 }, { "epoch": 3.2781735159817353, "grad_norm": 5.336236680050612, "learning_rate": 4.776450388375952e-08, "logits/chosen": -2.780518054962158, "logits/rejected": -2.2057840824127197, "logps/chosen": -604.370849609375, "logps/rejected": -553.9752197265625, "loss": 0.022, "rewards/accuracies": 1.0, "rewards/chosen": 4.017297267913818, "rewards/margins": 5.234180450439453, "rewards/rejected": -1.2168831825256348, "step": 4487 }, { "epoch": 3.278904109589041, "grad_norm": 6.940429893612809, "learning_rate": 4.767077443157258e-08, "logits/chosen": -2.5311012268066406, "logits/rejected": -2.413393020629883, "logps/chosen": -517.9437255859375, "logps/rejected": -568.2608642578125, "loss": 0.0336, "rewards/accuracies": 1.0, "rewards/chosen": 1.8727065324783325, "rewards/margins": 3.302478790283203, "rewards/rejected": -1.4297723770141602, "step": 4488 }, { "epoch": 3.279634703196347, "grad_norm": 3.205116657627529, "learning_rate": 4.7577127340509006e-08, "logits/chosen": -3.027202606201172, "logits/rejected": -2.3962786197662354, "logps/chosen": -739.4369506835938, "logps/rejected": -575.573974609375, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": 3.605191230773926, "rewards/margins": 5.264817237854004, "rewards/rejected": -1.6596260070800781, "step": 4489 }, { "epoch": 3.280365296803653, "grad_norm": 5.827483522841088, "learning_rate": 4.7483562648689134e-08, "logits/chosen": -2.570977210998535, "logits/rejected": -2.271632194519043, "logps/chosen": -482.68511962890625, "logps/rejected": -440.48944091796875, "loss": 0.0245, "rewards/accuracies": 1.0, "rewards/chosen": 3.3590822219848633, "rewards/margins": 7.329835891723633, "rewards/rejected": -3.9707539081573486, "step": 4490 }, { "epoch": 3.281095890410959, "grad_norm": 4.782283097895603, "learning_rate": 4.7390080394200003e-08, "logits/chosen": -2.922635793685913, "logits/rejected": -1.976535439491272, "logps/chosen": -561.342041015625, "logps/rejected": -466.52349853515625, "loss": 0.0199, "rewards/accuracies": 1.0, "rewards/chosen": 2.977572202682495, "rewards/margins": 4.981975078582764, "rewards/rejected": -2.0044026374816895, "step": 4491 }, { "epoch": 3.2818264840182647, "grad_norm": 6.214194311212034, "learning_rate": 4.7296680615094925e-08, "logits/chosen": -2.7046375274658203, "logits/rejected": -2.2459802627563477, "logps/chosen": -459.63885498046875, "logps/rejected": -342.00054931640625, "loss": 0.024, "rewards/accuracies": 1.0, "rewards/chosen": 0.9499139189720154, "rewards/margins": 3.924591541290283, "rewards/rejected": -2.974677562713623, "step": 4492 }, { "epoch": 3.2825570776255706, "grad_norm": 6.042798938103737, "learning_rate": 4.7203363349393536e-08, "logits/chosen": -2.664666175842285, "logits/rejected": -1.95574951171875, "logps/chosen": -588.303955078125, "logps/rejected": -483.7030029296875, "loss": 0.0246, "rewards/accuracies": 1.0, "rewards/chosen": 2.7916531562805176, "rewards/margins": 7.456326484680176, "rewards/rejected": -4.664673328399658, "step": 4493 }, { "epoch": 3.283287671232877, "grad_norm": 3.5908346541561573, "learning_rate": 4.711012863508218e-08, "logits/chosen": -2.683340072631836, "logits/rejected": -2.0171611309051514, "logps/chosen": -922.0886840820312, "logps/rejected": -588.0311889648438, "loss": 0.0357, "rewards/accuracies": 1.0, "rewards/chosen": 4.3700337409973145, "rewards/margins": 5.474917411804199, "rewards/rejected": -1.1048836708068848, "step": 4494 }, { "epoch": 3.2840182648401828, "grad_norm": 6.026431041151139, "learning_rate": 4.7016976510113326e-08, "logits/chosen": -2.7694873809814453, "logits/rejected": -2.659911632537842, "logps/chosen": -835.9627685546875, "logps/rejected": -902.6428833007812, "loss": 0.0255, "rewards/accuracies": 1.0, "rewards/chosen": 5.474888801574707, "rewards/margins": 7.007977485656738, "rewards/rejected": -1.533088207244873, "step": 4495 }, { "epoch": 3.2847488584474887, "grad_norm": 6.300227860462668, "learning_rate": 4.692390701240612e-08, "logits/chosen": -3.0047190189361572, "logits/rejected": -1.5681655406951904, "logps/chosen": -895.9763793945312, "logps/rejected": -507.36920166015625, "loss": 0.0194, "rewards/accuracies": 1.0, "rewards/chosen": 6.057475566864014, "rewards/margins": 8.447917938232422, "rewards/rejected": -2.390442132949829, "step": 4496 }, { "epoch": 3.2854794520547945, "grad_norm": 4.519778971036248, "learning_rate": 4.683092017984561e-08, "logits/chosen": -2.8876354694366455, "logits/rejected": -1.9980812072753906, "logps/chosen": -654.1952514648438, "logps/rejected": -587.4351196289062, "loss": 0.0178, "rewards/accuracies": 1.0, "rewards/chosen": 4.580196380615234, "rewards/margins": 7.3582234382629395, "rewards/rejected": -2.778027296066284, "step": 4497 }, { "epoch": 3.2862100456621004, "grad_norm": 4.747486855085434, "learning_rate": 4.673801605028357e-08, "logits/chosen": -3.0286998748779297, "logits/rejected": -2.462425708770752, "logps/chosen": -837.3447265625, "logps/rejected": -625.3621826171875, "loss": 0.0415, "rewards/accuracies": 1.0, "rewards/chosen": 2.971052646636963, "rewards/margins": 4.92750358581543, "rewards/rejected": -1.9564509391784668, "step": 4498 }, { "epoch": 3.2869406392694063, "grad_norm": 6.250693931203322, "learning_rate": 4.664519466153816e-08, "logits/chosen": -2.6050190925598145, "logits/rejected": -2.464533805847168, "logps/chosen": -647.359375, "logps/rejected": -817.1517944335938, "loss": 0.0315, "rewards/accuracies": 1.0, "rewards/chosen": 4.396019458770752, "rewards/margins": 4.46756649017334, "rewards/rejected": -0.07154744863510132, "step": 4499 }, { "epoch": 3.287671232876712, "grad_norm": 5.198148162925954, "learning_rate": 4.655245605139357e-08, "logits/chosen": -2.6286168098449707, "logits/rejected": -1.8599154949188232, "logps/chosen": -806.9679565429688, "logps/rejected": -483.4000244140625, "loss": 0.0251, "rewards/accuracies": 1.0, "rewards/chosen": 2.0408902168273926, "rewards/margins": 5.0840163230896, "rewards/rejected": -3.043126106262207, "step": 4500 }, { "epoch": 3.2884018264840185, "grad_norm": 8.256788293124663, "learning_rate": 4.6459800257600463e-08, "logits/chosen": -2.362311840057373, "logits/rejected": -2.9973695278167725, "logps/chosen": -494.2977600097656, "logps/rejected": -720.8697509765625, "loss": 0.0326, "rewards/accuracies": 1.0, "rewards/chosen": 0.6120185852050781, "rewards/margins": 5.941960334777832, "rewards/rejected": -5.329942226409912, "step": 4501 }, { "epoch": 3.2891324200913243, "grad_norm": 7.654652705272139, "learning_rate": 4.636722731787568e-08, "logits/chosen": -2.538813352584839, "logits/rejected": -2.3104496002197266, "logps/chosen": -503.4932556152344, "logps/rejected": -446.4611511230469, "loss": 0.0474, "rewards/accuracies": 1.0, "rewards/chosen": 1.5682133436203003, "rewards/margins": 4.2230072021484375, "rewards/rejected": -2.6547937393188477, "step": 4502 }, { "epoch": 3.28986301369863, "grad_norm": 6.12651728307163, "learning_rate": 4.627473726990255e-08, "logits/chosen": -3.1650593280792236, "logits/rejected": -1.8213462829589844, "logps/chosen": -428.2572326660156, "logps/rejected": -271.5996398925781, "loss": 0.0291, "rewards/accuracies": 1.0, "rewards/chosen": 3.702788829803467, "rewards/margins": 8.158802032470703, "rewards/rejected": -4.456013202667236, "step": 4503 }, { "epoch": 3.290593607305936, "grad_norm": 11.97709440579281, "learning_rate": 4.618233015133041e-08, "logits/chosen": -3.181175947189331, "logits/rejected": -2.047214984893799, "logps/chosen": -616.4439086914062, "logps/rejected": -400.5704345703125, "loss": 0.0456, "rewards/accuracies": 1.0, "rewards/chosen": 4.660349369049072, "rewards/margins": 9.144638061523438, "rewards/rejected": -4.484288692474365, "step": 4504 }, { "epoch": 3.291324200913242, "grad_norm": 8.481213171927456, "learning_rate": 4.609000599977505e-08, "logits/chosen": -2.6999282836914062, "logits/rejected": -2.043056011199951, "logps/chosen": -580.9564819335938, "logps/rejected": -521.6533203125, "loss": 0.0297, "rewards/accuracies": 1.0, "rewards/chosen": 2.1465728282928467, "rewards/margins": 4.512653827667236, "rewards/rejected": -2.3660807609558105, "step": 4505 }, { "epoch": 3.292054794520548, "grad_norm": 5.082936797804839, "learning_rate": 4.5997764852818364e-08, "logits/chosen": -2.915009021759033, "logits/rejected": -2.6185998916625977, "logps/chosen": -722.8740844726562, "logps/rejected": -822.0970458984375, "loss": 0.0341, "rewards/accuracies": 1.0, "rewards/chosen": 2.9754624366760254, "rewards/margins": 4.243675231933594, "rewards/rejected": -1.2682123184204102, "step": 4506 }, { "epoch": 3.2927853881278537, "grad_norm": 4.643818074647714, "learning_rate": 4.590560674800839e-08, "logits/chosen": -2.2550747394561768, "logits/rejected": -2.1717545986175537, "logps/chosen": -611.5298461914062, "logps/rejected": -697.2836303710938, "loss": 0.0193, "rewards/accuracies": 1.0, "rewards/chosen": 2.546032428741455, "rewards/margins": 4.3352742195129395, "rewards/rejected": -1.7892415523529053, "step": 4507 }, { "epoch": 3.29351598173516, "grad_norm": 9.871172972203858, "learning_rate": 4.581353172285959e-08, "logits/chosen": -2.6633989810943604, "logits/rejected": -2.5200681686401367, "logps/chosen": -626.8272705078125, "logps/rejected": -687.1256713867188, "loss": 0.0333, "rewards/accuracies": 1.0, "rewards/chosen": 2.6228771209716797, "rewards/margins": 6.328099250793457, "rewards/rejected": -3.7052221298217773, "step": 4508 }, { "epoch": 3.294246575342466, "grad_norm": 5.23643102548325, "learning_rate": 4.572153981485244e-08, "logits/chosen": -2.2377331256866455, "logits/rejected": -1.97516667842865, "logps/chosen": -524.8901977539062, "logps/rejected": -680.5116577148438, "loss": 0.0235, "rewards/accuracies": 1.0, "rewards/chosen": 3.1194982528686523, "rewards/margins": 4.988908290863037, "rewards/rejected": -1.8694102764129639, "step": 4509 }, { "epoch": 3.2949771689497718, "grad_norm": 9.932467739956342, "learning_rate": 4.56296310614335e-08, "logits/chosen": -2.907824993133545, "logits/rejected": -2.07523512840271, "logps/chosen": -451.8854064941406, "logps/rejected": -336.868896484375, "loss": 0.0535, "rewards/accuracies": 1.0, "rewards/chosen": 3.4471688270568848, "rewards/margins": 5.977029800415039, "rewards/rejected": -2.5298609733581543, "step": 4510 }, { "epoch": 3.2957077625570776, "grad_norm": 8.88195485291821, "learning_rate": 4.5537805500015766e-08, "logits/chosen": -2.7353098392486572, "logits/rejected": -2.406095027923584, "logps/chosen": -630.40869140625, "logps/rejected": -634.7855834960938, "loss": 0.0294, "rewards/accuracies": 1.0, "rewards/chosen": 2.7560110092163086, "rewards/margins": 4.078234672546387, "rewards/rejected": -1.3222241401672363, "step": 4511 }, { "epoch": 3.2964383561643835, "grad_norm": 8.492318665615379, "learning_rate": 4.5446063167978054e-08, "logits/chosen": -3.0017483234405518, "logits/rejected": -2.263904571533203, "logps/chosen": -519.128173828125, "logps/rejected": -432.26904296875, "loss": 0.0574, "rewards/accuracies": 1.0, "rewards/chosen": 3.0334553718566895, "rewards/margins": 5.4542951583862305, "rewards/rejected": -2.420839548110962, "step": 4512 }, { "epoch": 3.2971689497716894, "grad_norm": 7.500714077334362, "learning_rate": 4.535440410266564e-08, "logits/chosen": -3.1114501953125, "logits/rejected": -1.948906421661377, "logps/chosen": -534.9232788085938, "logps/rejected": -341.3287353515625, "loss": 0.0288, "rewards/accuracies": 1.0, "rewards/chosen": 1.4641945362091064, "rewards/margins": 5.437839984893799, "rewards/rejected": -3.9736456871032715, "step": 4513 }, { "epoch": 3.2978995433789953, "grad_norm": 6.946775267344571, "learning_rate": 4.526282834138945e-08, "logits/chosen": -2.898899793624878, "logits/rejected": -1.666073203086853, "logps/chosen": -795.581787109375, "logps/rejected": -392.92938232421875, "loss": 0.0358, "rewards/accuracies": 1.0, "rewards/chosen": 4.1600260734558105, "rewards/margins": 5.517694473266602, "rewards/rejected": -1.3576682806015015, "step": 4514 }, { "epoch": 3.2986301369863016, "grad_norm": 4.074431518967866, "learning_rate": 4.5171335921426937e-08, "logits/chosen": -2.271775245666504, "logits/rejected": -2.5341877937316895, "logps/chosen": -683.439208984375, "logps/rejected": -813.1318359375, "loss": 0.0158, "rewards/accuracies": 1.0, "rewards/chosen": 3.3002536296844482, "rewards/margins": 5.186750888824463, "rewards/rejected": -1.8864974975585938, "step": 4515 }, { "epoch": 3.2993607305936075, "grad_norm": 4.7683693910151215, "learning_rate": 4.507992688002138e-08, "logits/chosen": -2.495224714279175, "logits/rejected": -1.5754566192626953, "logps/chosen": -556.6006469726562, "logps/rejected": -398.48956298828125, "loss": 0.0188, "rewards/accuracies": 1.0, "rewards/chosen": 4.0466461181640625, "rewards/margins": 8.522953987121582, "rewards/rejected": -4.4763078689575195, "step": 4516 }, { "epoch": 3.3000913242009133, "grad_norm": 4.593073996285229, "learning_rate": 4.4988601254382194e-08, "logits/chosen": -3.310422658920288, "logits/rejected": -1.7704261541366577, "logps/chosen": -969.525146484375, "logps/rejected": -540.6173095703125, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": 5.5117082595825195, "rewards/margins": 6.484500885009766, "rewards/rejected": -0.9727917909622192, "step": 4517 }, { "epoch": 3.300821917808219, "grad_norm": 3.935123513596586, "learning_rate": 4.489735908168502e-08, "logits/chosen": -2.1585752964019775, "logits/rejected": -2.1373348236083984, "logps/chosen": -529.2896118164062, "logps/rejected": -591.1226806640625, "loss": 0.0188, "rewards/accuracies": 1.0, "rewards/chosen": 3.975741147994995, "rewards/margins": 7.990520477294922, "rewards/rejected": -4.014779090881348, "step": 4518 }, { "epoch": 3.301552511415525, "grad_norm": 5.447407257985269, "learning_rate": 4.4806200399071045e-08, "logits/chosen": -2.587979555130005, "logits/rejected": -2.0421061515808105, "logps/chosen": -557.6456298828125, "logps/rejected": -458.720947265625, "loss": 0.0185, "rewards/accuracies": 1.0, "rewards/chosen": 3.7649362087249756, "rewards/margins": 7.555741786956787, "rewards/rejected": -3.7908058166503906, "step": 4519 }, { "epoch": 3.302283105022831, "grad_norm": 5.438144864834225, "learning_rate": 4.471512524364795e-08, "logits/chosen": -2.5930323600769043, "logits/rejected": -1.7993909120559692, "logps/chosen": -586.2838134765625, "logps/rejected": -403.1107177734375, "loss": 0.0208, "rewards/accuracies": 1.0, "rewards/chosen": 3.454037666320801, "rewards/margins": 6.958140850067139, "rewards/rejected": -3.504103183746338, "step": 4520 }, { "epoch": 3.303013698630137, "grad_norm": 9.68291812954225, "learning_rate": 4.462413365248913e-08, "logits/chosen": -2.8271684646606445, "logits/rejected": -1.9696853160858154, "logps/chosen": -624.1044921875, "logps/rejected": -396.075927734375, "loss": 0.0513, "rewards/accuracies": 1.0, "rewards/chosen": 2.4415183067321777, "rewards/margins": 5.689972877502441, "rewards/rejected": -3.2484543323516846, "step": 4521 }, { "epoch": 3.303744292237443, "grad_norm": 11.314688286493261, "learning_rate": 4.453322566263421e-08, "logits/chosen": -2.8627285957336426, "logits/rejected": -2.7265238761901855, "logps/chosen": -534.429443359375, "logps/rejected": -395.8723449707031, "loss": 0.062, "rewards/accuracies": 1.0, "rewards/chosen": 2.884373426437378, "rewards/margins": 3.4992008209228516, "rewards/rejected": -0.6148272752761841, "step": 4522 }, { "epoch": 3.304474885844749, "grad_norm": 8.360068604402269, "learning_rate": 4.4442401311088534e-08, "logits/chosen": -2.841731548309326, "logits/rejected": -2.5889267921447754, "logps/chosen": -316.3095703125, "logps/rejected": -412.3368835449219, "loss": 0.0417, "rewards/accuracies": 1.0, "rewards/chosen": 0.7261412143707275, "rewards/margins": 3.9352993965148926, "rewards/rejected": -3.209158182144165, "step": 4523 }, { "epoch": 3.305205479452055, "grad_norm": 6.76391326182746, "learning_rate": 4.435166063482348e-08, "logits/chosen": -2.9943952560424805, "logits/rejected": -2.4435505867004395, "logps/chosen": -773.5970458984375, "logps/rejected": -625.4615478515625, "loss": 0.025, "rewards/accuracies": 1.0, "rewards/chosen": 4.553410053253174, "rewards/margins": 4.537590980529785, "rewards/rejected": 0.015819057822227478, "step": 4524 }, { "epoch": 3.3059360730593608, "grad_norm": 4.007376306901101, "learning_rate": 4.426100367077651e-08, "logits/chosen": -2.395233631134033, "logits/rejected": -2.038130760192871, "logps/chosen": -518.302490234375, "logps/rejected": -396.5242004394531, "loss": 0.0188, "rewards/accuracies": 1.0, "rewards/chosen": 3.0276002883911133, "rewards/margins": 5.139554977416992, "rewards/rejected": -2.111955165863037, "step": 4525 }, { "epoch": 3.3066666666666666, "grad_norm": 4.075805004153752, "learning_rate": 4.417043045585078e-08, "logits/chosen": -3.0866291522979736, "logits/rejected": -2.3801724910736084, "logps/chosen": -833.7437744140625, "logps/rejected": -578.6568603515625, "loss": 0.0369, "rewards/accuracies": 1.0, "rewards/chosen": 4.364981174468994, "rewards/margins": 7.098048686981201, "rewards/rejected": -2.733067512512207, "step": 4526 }, { "epoch": 3.3073972602739725, "grad_norm": 8.111765720422856, "learning_rate": 4.407994102691548e-08, "logits/chosen": -2.2493510246276855, "logits/rejected": -2.293592691421509, "logps/chosen": -425.85699462890625, "logps/rejected": -562.1563720703125, "loss": 0.0453, "rewards/accuracies": 1.0, "rewards/chosen": 2.0348212718963623, "rewards/margins": 6.74418830871582, "rewards/rejected": -4.709366798400879, "step": 4527 }, { "epoch": 3.3081278538812784, "grad_norm": 13.684648051197964, "learning_rate": 4.3989535420805776e-08, "logits/chosen": -2.7142744064331055, "logits/rejected": -2.1924028396606445, "logps/chosen": -475.5466003417969, "logps/rejected": -445.1685791015625, "loss": 0.0472, "rewards/accuracies": 1.0, "rewards/chosen": 3.6762661933898926, "rewards/margins": 7.429085731506348, "rewards/rejected": -3.752819776535034, "step": 4528 }, { "epoch": 3.3088584474885847, "grad_norm": 2.9091339568724437, "learning_rate": 4.3899213674322446e-08, "logits/chosen": -2.380746603012085, "logits/rejected": -2.7445809841156006, "logps/chosen": -604.4948120117188, "logps/rejected": -952.080322265625, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": 3.4395976066589355, "rewards/margins": 7.533959865570068, "rewards/rejected": -4.094362258911133, "step": 4529 }, { "epoch": 3.3095890410958906, "grad_norm": 6.302724993698472, "learning_rate": 4.380897582423249e-08, "logits/chosen": -3.106499195098877, "logits/rejected": -2.555297613143921, "logps/chosen": -774.3446044921875, "logps/rejected": -717.7193603515625, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": 2.246124505996704, "rewards/margins": 5.427347660064697, "rewards/rejected": -3.181222915649414, "step": 4530 }, { "epoch": 3.3103196347031965, "grad_norm": 7.09118579111225, "learning_rate": 4.371882190726847e-08, "logits/chosen": -2.776817560195923, "logits/rejected": -2.2518844604492188, "logps/chosen": -444.7914123535156, "logps/rejected": -436.3866882324219, "loss": 0.0293, "rewards/accuracies": 1.0, "rewards/chosen": 3.217994213104248, "rewards/margins": 6.623222827911377, "rewards/rejected": -3.405228853225708, "step": 4531 }, { "epoch": 3.3110502283105023, "grad_norm": 4.755476956198255, "learning_rate": 4.362875196012888e-08, "logits/chosen": -2.7102537155151367, "logits/rejected": -2.6972098350524902, "logps/chosen": -506.19354248046875, "logps/rejected": -679.161376953125, "loss": 0.0186, "rewards/accuracies": 1.0, "rewards/chosen": 2.6431896686553955, "rewards/margins": 4.303234100341797, "rewards/rejected": -1.6600444316864014, "step": 4532 }, { "epoch": 3.311780821917808, "grad_norm": 8.08643803176058, "learning_rate": 4.353876601947801e-08, "logits/chosen": -3.5034732818603516, "logits/rejected": -1.9958691596984863, "logps/chosen": -726.48876953125, "logps/rejected": -416.739990234375, "loss": 0.039, "rewards/accuracies": 1.0, "rewards/chosen": 4.5177154541015625, "rewards/margins": 6.285701751708984, "rewards/rejected": -1.76798677444458, "step": 4533 }, { "epoch": 3.312511415525114, "grad_norm": 5.347028830637986, "learning_rate": 4.344886412194598e-08, "logits/chosen": -2.5669729709625244, "logits/rejected": -2.359433174133301, "logps/chosen": -559.5170288085938, "logps/rejected": -524.6221923828125, "loss": 0.0255, "rewards/accuracies": 1.0, "rewards/chosen": 2.1638898849487305, "rewards/margins": 5.597218990325928, "rewards/rejected": -3.433328628540039, "step": 4534 }, { "epoch": 3.31324200913242, "grad_norm": 5.992677671036989, "learning_rate": 4.335904630412885e-08, "logits/chosen": -2.9041385650634766, "logits/rejected": -2.443450689315796, "logps/chosen": -544.8770751953125, "logps/rejected": -565.6471557617188, "loss": 0.0448, "rewards/accuracies": 1.0, "rewards/chosen": 1.5367563962936401, "rewards/margins": 5.441984176635742, "rewards/rejected": -3.9052281379699707, "step": 4535 }, { "epoch": 3.3139726027397263, "grad_norm": 4.829871395760446, "learning_rate": 4.326931260258806e-08, "logits/chosen": -3.0365519523620605, "logits/rejected": -1.334995985031128, "logps/chosen": -545.0054931640625, "logps/rejected": -230.9056396484375, "loss": 0.0208, "rewards/accuracies": 1.0, "rewards/chosen": 4.99116325378418, "rewards/margins": 7.905337810516357, "rewards/rejected": -2.9141745567321777, "step": 4536 }, { "epoch": 3.314703196347032, "grad_norm": 7.735930192050552, "learning_rate": 4.3179663053851233e-08, "logits/chosen": -2.7054784297943115, "logits/rejected": -1.800228476524353, "logps/chosen": -752.284912109375, "logps/rejected": -571.0614013671875, "loss": 0.038, "rewards/accuracies": 1.0, "rewards/chosen": 5.20860481262207, "rewards/margins": 7.672148704528809, "rewards/rejected": -2.4635438919067383, "step": 4537 }, { "epoch": 3.315433789954338, "grad_norm": 5.423696074700146, "learning_rate": 4.3090097694411406e-08, "logits/chosen": -2.801726818084717, "logits/rejected": -2.212498188018799, "logps/chosen": -461.26177978515625, "logps/rejected": -439.16259765625, "loss": 0.0234, "rewards/accuracies": 1.0, "rewards/chosen": 2.7167069911956787, "rewards/margins": 6.1331915855407715, "rewards/rejected": -3.416485071182251, "step": 4538 }, { "epoch": 3.316164383561644, "grad_norm": 5.007354667790466, "learning_rate": 4.300061656072762e-08, "logits/chosen": -2.8038992881774902, "logits/rejected": -2.31709885597229, "logps/chosen": -674.1399536132812, "logps/rejected": -686.056884765625, "loss": 0.0356, "rewards/accuracies": 1.0, "rewards/chosen": 3.3014559745788574, "rewards/margins": 6.668467044830322, "rewards/rejected": -3.367011070251465, "step": 4539 }, { "epoch": 3.3168949771689498, "grad_norm": 6.296070132629578, "learning_rate": 4.291121968922448e-08, "logits/chosen": -2.8641867637634277, "logits/rejected": -2.0690531730651855, "logps/chosen": -469.7127380371094, "logps/rejected": -461.92242431640625, "loss": 0.028, "rewards/accuracies": 1.0, "rewards/chosen": 4.782318115234375, "rewards/margins": 8.187189102172852, "rewards/rejected": -3.4048712253570557, "step": 4540 }, { "epoch": 3.3176255707762556, "grad_norm": 5.657868380350942, "learning_rate": 4.282190711629219e-08, "logits/chosen": -2.7459287643432617, "logits/rejected": -3.039487600326538, "logps/chosen": -478.4781188964844, "logps/rejected": -668.285888671875, "loss": 0.0316, "rewards/accuracies": 1.0, "rewards/chosen": 2.318406105041504, "rewards/margins": 4.919877529144287, "rewards/rejected": -2.601471424102783, "step": 4541 }, { "epoch": 3.3183561643835615, "grad_norm": 6.0594263838920135, "learning_rate": 4.273267887828694e-08, "logits/chosen": -3.1964235305786133, "logits/rejected": -1.8300665616989136, "logps/chosen": -736.6526489257812, "logps/rejected": -436.250244140625, "loss": 0.0254, "rewards/accuracies": 1.0, "rewards/chosen": 3.8760595321655273, "rewards/margins": 4.394047737121582, "rewards/rejected": -0.5179880857467651, "step": 4542 }, { "epoch": 3.3190867579908674, "grad_norm": 7.223347574574095, "learning_rate": 4.264353501153026e-08, "logits/chosen": -2.9195051193237305, "logits/rejected": -2.1416025161743164, "logps/chosen": -486.6597900390625, "logps/rejected": -372.2431640625, "loss": 0.0329, "rewards/accuracies": 1.0, "rewards/chosen": 3.080092430114746, "rewards/margins": 4.373682022094727, "rewards/rejected": -1.2935893535614014, "step": 4543 }, { "epoch": 3.3198173515981737, "grad_norm": 5.219919953006549, "learning_rate": 4.255447555230962e-08, "logits/chosen": -2.6691513061523438, "logits/rejected": -1.70163094997406, "logps/chosen": -536.696533203125, "logps/rejected": -379.90093994140625, "loss": 0.0227, "rewards/accuracies": 1.0, "rewards/chosen": 2.2250826358795166, "rewards/margins": 6.188577651977539, "rewards/rejected": -3.9634952545166016, "step": 4544 }, { "epoch": 3.3205479452054796, "grad_norm": 4.8324419493131465, "learning_rate": 4.246550053687795e-08, "logits/chosen": -2.792813301086426, "logits/rejected": -1.621465802192688, "logps/chosen": -874.6939697265625, "logps/rejected": -484.5154113769531, "loss": 0.0373, "rewards/accuracies": 0.875, "rewards/chosen": 2.9092540740966797, "rewards/margins": 5.343163967132568, "rewards/rejected": -2.4339098930358887, "step": 4545 }, { "epoch": 3.3212785388127855, "grad_norm": 6.522488060118222, "learning_rate": 4.237661000145376e-08, "logits/chosen": -2.6822495460510254, "logits/rejected": -1.8697158098220825, "logps/chosen": -615.3751220703125, "logps/rejected": -463.20770263671875, "loss": 0.0338, "rewards/accuracies": 1.0, "rewards/chosen": 3.763343572616577, "rewards/margins": 7.116442680358887, "rewards/rejected": -3.3530995845794678, "step": 4546 }, { "epoch": 3.3220091324200913, "grad_norm": 3.0303484962404887, "learning_rate": 4.2287803982221425e-08, "logits/chosen": -2.562612533569336, "logits/rejected": -2.3171310424804688, "logps/chosen": -362.385009765625, "logps/rejected": -401.22686767578125, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": 0.973806619644165, "rewards/margins": 5.664500713348389, "rewards/rejected": -4.690694332122803, "step": 4547 }, { "epoch": 3.322739726027397, "grad_norm": 6.239715965360054, "learning_rate": 4.219908251533066e-08, "logits/chosen": -2.762341022491455, "logits/rejected": -2.2652745246887207, "logps/chosen": -691.0082397460938, "logps/rejected": -738.0559692382812, "loss": 0.023, "rewards/accuracies": 1.0, "rewards/chosen": 3.1503922939300537, "rewards/margins": 6.74497127532959, "rewards/rejected": -3.594578742980957, "step": 4548 }, { "epoch": 3.323470319634703, "grad_norm": 4.690176237979821, "learning_rate": 4.211044563689689e-08, "logits/chosen": -2.4394311904907227, "logits/rejected": -2.4834721088409424, "logps/chosen": -486.6295166015625, "logps/rejected": -669.3572998046875, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": 1.9150681495666504, "rewards/margins": 4.391872406005859, "rewards/rejected": -2.476804256439209, "step": 4549 }, { "epoch": 3.324200913242009, "grad_norm": 5.198546631988688, "learning_rate": 4.2021893383000995e-08, "logits/chosen": -3.0994341373443604, "logits/rejected": -2.396463632583618, "logps/chosen": -728.797607421875, "logps/rejected": -574.7648315429688, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": 4.057022571563721, "rewards/margins": 6.2096333503723145, "rewards/rejected": -2.1526107788085938, "step": 4550 }, { "epoch": 3.324931506849315, "grad_norm": 4.490215472380287, "learning_rate": 4.1933425789689586e-08, "logits/chosen": -2.7016048431396484, "logits/rejected": -2.725600242614746, "logps/chosen": -738.39697265625, "logps/rejected": -835.7948608398438, "loss": 0.0342, "rewards/accuracies": 1.0, "rewards/chosen": 5.51529598236084, "rewards/margins": 7.170438289642334, "rewards/rejected": -1.6551421880722046, "step": 4551 }, { "epoch": 3.325662100456621, "grad_norm": 5.265782022823357, "learning_rate": 4.184504289297472e-08, "logits/chosen": -2.8054757118225098, "logits/rejected": -2.1188783645629883, "logps/chosen": -652.251953125, "logps/rejected": -489.33782958984375, "loss": 0.0196, "rewards/accuracies": 1.0, "rewards/chosen": 3.996288537979126, "rewards/margins": 5.88424015045166, "rewards/rejected": -1.8879518508911133, "step": 4552 }, { "epoch": 3.326392694063927, "grad_norm": 7.877975386182844, "learning_rate": 4.175674472883392e-08, "logits/chosen": -2.5558505058288574, "logits/rejected": -1.715119481086731, "logps/chosen": -479.1142578125, "logps/rejected": -347.8268127441406, "loss": 0.0288, "rewards/accuracies": 1.0, "rewards/chosen": 3.2098731994628906, "rewards/margins": 6.550559997558594, "rewards/rejected": -3.3406870365142822, "step": 4553 }, { "epoch": 3.327123287671233, "grad_norm": 8.377707428078894, "learning_rate": 4.16685313332103e-08, "logits/chosen": -2.360349416732788, "logits/rejected": -1.8549553155899048, "logps/chosen": -430.8531494140625, "logps/rejected": -525.8552856445312, "loss": 0.0442, "rewards/accuracies": 1.0, "rewards/chosen": 1.4903745651245117, "rewards/margins": 3.2607407569885254, "rewards/rejected": -1.7703660726547241, "step": 4554 }, { "epoch": 3.3278538812785388, "grad_norm": 10.40265838685823, "learning_rate": 4.158040274201236e-08, "logits/chosen": -1.4753539562225342, "logits/rejected": -2.475640058517456, "logps/chosen": -228.7559356689453, "logps/rejected": -656.831298828125, "loss": 0.0399, "rewards/accuracies": 1.0, "rewards/chosen": 1.5297045707702637, "rewards/margins": 9.0147705078125, "rewards/rejected": -7.48506498336792, "step": 4555 }, { "epoch": 3.3285844748858446, "grad_norm": 9.420624657914809, "learning_rate": 4.149235899111428e-08, "logits/chosen": -2.635709047317505, "logits/rejected": -2.2819693088531494, "logps/chosen": -560.0914306640625, "logps/rejected": -546.8345947265625, "loss": 0.0275, "rewards/accuracies": 1.0, "rewards/chosen": 1.2070133686065674, "rewards/margins": 6.632242202758789, "rewards/rejected": -5.425229072570801, "step": 4556 }, { "epoch": 3.3293150684931505, "grad_norm": 4.91098513165932, "learning_rate": 4.140440011635551e-08, "logits/chosen": -2.703723192214966, "logits/rejected": -2.1453096866607666, "logps/chosen": -766.0389404296875, "logps/rejected": -714.4176025390625, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": 3.679291248321533, "rewards/margins": 6.365663051605225, "rewards/rejected": -2.6863722801208496, "step": 4557 }, { "epoch": 3.3300456621004564, "grad_norm": 3.732203406822498, "learning_rate": 4.131652615354095e-08, "logits/chosen": -2.723524570465088, "logits/rejected": -2.416276454925537, "logps/chosen": -416.18072509765625, "logps/rejected": -435.16192626953125, "loss": 0.0178, "rewards/accuracies": 1.0, "rewards/chosen": 2.3306005001068115, "rewards/margins": 4.718490123748779, "rewards/rejected": -2.3878893852233887, "step": 4558 }, { "epoch": 3.3307762557077627, "grad_norm": 5.402367228403881, "learning_rate": 4.122873713844116e-08, "logits/chosen": -2.3688108921051025, "logits/rejected": -2.0026278495788574, "logps/chosen": -353.5957336425781, "logps/rejected": -458.7818603515625, "loss": 0.0224, "rewards/accuracies": 1.0, "rewards/chosen": 3.132056713104248, "rewards/margins": 9.179737091064453, "rewards/rejected": -6.047680377960205, "step": 4559 }, { "epoch": 3.3315068493150686, "grad_norm": 7.7407986562544, "learning_rate": 4.1141033106791814e-08, "logits/chosen": -2.482719898223877, "logits/rejected": -2.100647211074829, "logps/chosen": -567.2155151367188, "logps/rejected": -798.1826782226562, "loss": 0.0309, "rewards/accuracies": 1.0, "rewards/chosen": 2.4410407543182373, "rewards/margins": 7.400335311889648, "rewards/rejected": -4.95929479598999, "step": 4560 }, { "epoch": 3.3322374429223744, "grad_norm": 3.869793378060306, "learning_rate": 4.105341409429427e-08, "logits/chosen": -2.7793283462524414, "logits/rejected": -1.8793396949768066, "logps/chosen": -973.280029296875, "logps/rejected": -750.2313842773438, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": 4.1326799392700195, "rewards/margins": 5.735634803771973, "rewards/rejected": -1.6029554605484009, "step": 4561 }, { "epoch": 3.3329680365296803, "grad_norm": 9.42420842011164, "learning_rate": 4.096588013661509e-08, "logits/chosen": -2.916555166244507, "logits/rejected": -2.585358142852783, "logps/chosen": -892.5077514648438, "logps/rejected": -855.137939453125, "loss": 0.0432, "rewards/accuracies": 1.0, "rewards/chosen": 2.8265371322631836, "rewards/margins": 6.267121315002441, "rewards/rejected": -3.4405837059020996, "step": 4562 }, { "epoch": 3.333698630136986, "grad_norm": 4.610795969069092, "learning_rate": 4.087843126938623e-08, "logits/chosen": -2.2646684646606445, "logits/rejected": -1.4035630226135254, "logps/chosen": -646.418701171875, "logps/rejected": -444.57525634765625, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": 3.2293312549591064, "rewards/margins": 5.449775218963623, "rewards/rejected": -2.2204437255859375, "step": 4563 }, { "epoch": 3.334429223744292, "grad_norm": 5.58984221452416, "learning_rate": 4.0791067528205156e-08, "logits/chosen": -2.668076515197754, "logits/rejected": -2.1314120292663574, "logps/chosen": -780.427978515625, "logps/rejected": -611.6976318359375, "loss": 0.0176, "rewards/accuracies": 1.0, "rewards/chosen": 5.076780796051025, "rewards/margins": 6.895901203155518, "rewards/rejected": -1.8191204071044922, "step": 4564 }, { "epoch": 3.335159817351598, "grad_norm": 4.46963044143014, "learning_rate": 4.0703788948634493e-08, "logits/chosen": -2.4043421745300293, "logits/rejected": -2.207789421081543, "logps/chosen": -320.16815185546875, "logps/rejected": -407.74444580078125, "loss": 0.0244, "rewards/accuracies": 1.0, "rewards/chosen": 2.3397727012634277, "rewards/margins": 7.082851409912109, "rewards/rejected": -4.743078708648682, "step": 4565 }, { "epoch": 3.3358904109589043, "grad_norm": 8.202906433100118, "learning_rate": 4.0616595566202395e-08, "logits/chosen": -2.72739315032959, "logits/rejected": -2.3804497718811035, "logps/chosen": -518.90283203125, "logps/rejected": -509.1434326171875, "loss": 0.0624, "rewards/accuracies": 1.0, "rewards/chosen": 2.9613454341888428, "rewards/margins": 4.044839859008789, "rewards/rejected": -1.0834949016571045, "step": 4566 }, { "epoch": 3.33662100456621, "grad_norm": 5.5158224529768605, "learning_rate": 4.052948741640205e-08, "logits/chosen": -2.397517442703247, "logits/rejected": -2.2128376960754395, "logps/chosen": -357.25836181640625, "logps/rejected": -504.8946533203125, "loss": 0.0208, "rewards/accuracies": 1.0, "rewards/chosen": 1.8533779382705688, "rewards/margins": 5.0010986328125, "rewards/rejected": -3.1477210521698, "step": 4567 }, { "epoch": 3.337351598173516, "grad_norm": 8.548864046003468, "learning_rate": 4.04424645346923e-08, "logits/chosen": -2.696721076965332, "logits/rejected": -2.388679265975952, "logps/chosen": -904.8914794921875, "logps/rejected": -720.0238037109375, "loss": 0.0424, "rewards/accuracies": 1.0, "rewards/chosen": 4.742019176483154, "rewards/margins": 5.458930969238281, "rewards/rejected": -0.716912031173706, "step": 4568 }, { "epoch": 3.338082191780822, "grad_norm": 6.9008234721304, "learning_rate": 4.035552695649696e-08, "logits/chosen": -2.4583606719970703, "logits/rejected": -2.3660221099853516, "logps/chosen": -403.7574157714844, "logps/rejected": -573.8995971679688, "loss": 0.0344, "rewards/accuracies": 1.0, "rewards/chosen": 1.8075228929519653, "rewards/margins": 4.720890045166016, "rewards/rejected": -2.9133670330047607, "step": 4569 }, { "epoch": 3.3388127853881278, "grad_norm": 6.487975809807137, "learning_rate": 4.026867471720541e-08, "logits/chosen": -2.4726974964141846, "logits/rejected": -2.681086540222168, "logps/chosen": -405.49078369140625, "logps/rejected": -561.6099243164062, "loss": 0.0214, "rewards/accuracies": 1.0, "rewards/chosen": 3.4778189659118652, "rewards/margins": 6.184199333190918, "rewards/rejected": -2.7063798904418945, "step": 4570 }, { "epoch": 3.3395433789954336, "grad_norm": 6.944221608295432, "learning_rate": 4.018190785217207e-08, "logits/chosen": -2.2428128719329834, "logits/rejected": -1.7280199527740479, "logps/chosen": -581.1705932617188, "logps/rejected": -457.0196838378906, "loss": 0.0314, "rewards/accuracies": 1.0, "rewards/chosen": 1.9571633338928223, "rewards/margins": 4.029478549957275, "rewards/rejected": -2.0723154544830322, "step": 4571 }, { "epoch": 3.3402739726027395, "grad_norm": 6.097374410426814, "learning_rate": 4.009522639671661e-08, "logits/chosen": -3.200847625732422, "logits/rejected": -2.435620069503784, "logps/chosen": -718.889892578125, "logps/rejected": -426.13873291015625, "loss": 0.0273, "rewards/accuracies": 1.0, "rewards/chosen": 2.7527480125427246, "rewards/margins": 4.374739646911621, "rewards/rejected": -1.6219918727874756, "step": 4572 }, { "epoch": 3.341004566210046, "grad_norm": 7.074007634636295, "learning_rate": 4.0008630386124177e-08, "logits/chosen": -2.7566025257110596, "logits/rejected": -2.0453553199768066, "logps/chosen": -307.0936279296875, "logps/rejected": -319.6540222167969, "loss": 0.0283, "rewards/accuracies": 1.0, "rewards/chosen": 2.9808261394500732, "rewards/margins": 8.053915023803711, "rewards/rejected": -5.073088645935059, "step": 4573 }, { "epoch": 3.3417351598173517, "grad_norm": 5.387164522776243, "learning_rate": 3.992211985564484e-08, "logits/chosen": -3.285315752029419, "logits/rejected": -2.1017470359802246, "logps/chosen": -634.4974975585938, "logps/rejected": -486.6331481933594, "loss": 0.0389, "rewards/accuracies": 1.0, "rewards/chosen": 4.339844226837158, "rewards/margins": 7.099853515625, "rewards/rejected": -2.760009527206421, "step": 4574 }, { "epoch": 3.3424657534246576, "grad_norm": 6.477084038768037, "learning_rate": 3.983569484049398e-08, "logits/chosen": -2.712317943572998, "logits/rejected": -2.069826364517212, "logps/chosen": -543.5338134765625, "logps/rejected": -329.0491638183594, "loss": 0.0448, "rewards/accuracies": 1.0, "rewards/chosen": 2.6419496536254883, "rewards/margins": 4.527133941650391, "rewards/rejected": -1.8851845264434814, "step": 4575 }, { "epoch": 3.3431963470319634, "grad_norm": 7.33188270902078, "learning_rate": 3.974935537585233e-08, "logits/chosen": -2.9017043113708496, "logits/rejected": -2.2653307914733887, "logps/chosen": -724.2095336914062, "logps/rejected": -1003.7769775390625, "loss": 0.0717, "rewards/accuracies": 1.0, "rewards/chosen": 4.088225364685059, "rewards/margins": 5.840907096862793, "rewards/rejected": -1.7526817321777344, "step": 4576 }, { "epoch": 3.3439269406392693, "grad_norm": 4.592098847604642, "learning_rate": 3.966310149686547e-08, "logits/chosen": -2.6827571392059326, "logits/rejected": -2.0563101768493652, "logps/chosen": -738.412109375, "logps/rejected": -690.76123046875, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": 3.09243106842041, "rewards/margins": 3.7888717651367188, "rewards/rejected": -0.6964408159255981, "step": 4577 }, { "epoch": 3.344657534246575, "grad_norm": 6.830027858515344, "learning_rate": 3.95769332386445e-08, "logits/chosen": -2.654698610305786, "logits/rejected": -1.9785094261169434, "logps/chosen": -538.07568359375, "logps/rejected": -471.469970703125, "loss": 0.0431, "rewards/accuracies": 1.0, "rewards/chosen": 2.714182138442993, "rewards/margins": 6.034181594848633, "rewards/rejected": -3.3199987411499023, "step": 4578 }, { "epoch": 3.345388127853881, "grad_norm": 5.673635834736406, "learning_rate": 3.949085063626539e-08, "logits/chosen": -2.759338855743408, "logits/rejected": -1.6622021198272705, "logps/chosen": -762.1539306640625, "logps/rejected": -478.13287353515625, "loss": 0.0262, "rewards/accuracies": 1.0, "rewards/chosen": 3.36637020111084, "rewards/margins": 6.205088138580322, "rewards/rejected": -2.8387181758880615, "step": 4579 }, { "epoch": 3.3461187214611874, "grad_norm": 2.2329461924608243, "learning_rate": 3.9404853724769344e-08, "logits/chosen": -2.278615951538086, "logits/rejected": -2.1461057662963867, "logps/chosen": -373.32928466796875, "logps/rejected": -517.0120239257812, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": 2.053529977798462, "rewards/margins": 5.170845031738281, "rewards/rejected": -3.1173150539398193, "step": 4580 }, { "epoch": 3.3468493150684933, "grad_norm": 3.519760615488536, "learning_rate": 3.931894253916273e-08, "logits/chosen": -2.4718821048736572, "logits/rejected": -2.2747550010681152, "logps/chosen": -541.2476196289062, "logps/rejected": -450.1694030761719, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": 1.1801846027374268, "rewards/margins": 5.352512836456299, "rewards/rejected": -4.172328472137451, "step": 4581 }, { "epoch": 3.347579908675799, "grad_norm": 5.491097788465012, "learning_rate": 3.9233117114416905e-08, "logits/chosen": -3.3637166023254395, "logits/rejected": -2.898049831390381, "logps/chosen": -623.8014526367188, "logps/rejected": -611.8802490234375, "loss": 0.0239, "rewards/accuracies": 1.0, "rewards/chosen": 2.6546101570129395, "rewards/margins": 5.622098922729492, "rewards/rejected": -2.9674885272979736, "step": 4582 }, { "epoch": 3.348310502283105, "grad_norm": 5.708664970324341, "learning_rate": 3.914737748546856e-08, "logits/chosen": -2.533188581466675, "logits/rejected": -1.88178551197052, "logps/chosen": -949.088134765625, "logps/rejected": -764.3423461914062, "loss": 0.0267, "rewards/accuracies": 1.0, "rewards/chosen": 5.881512641906738, "rewards/margins": 6.377558708190918, "rewards/rejected": -0.49604588747024536, "step": 4583 }, { "epoch": 3.349041095890411, "grad_norm": 6.267896435513142, "learning_rate": 3.906172368721902e-08, "logits/chosen": -2.6243090629577637, "logits/rejected": -2.087440252304077, "logps/chosen": -472.2923583984375, "logps/rejected": -313.5162353515625, "loss": 0.0429, "rewards/accuracies": 1.0, "rewards/chosen": 3.3957114219665527, "rewards/margins": 5.7051191329956055, "rewards/rejected": -2.3094074726104736, "step": 4584 }, { "epoch": 3.3497716894977168, "grad_norm": 5.387245246546286, "learning_rate": 3.897615575453517e-08, "logits/chosen": -2.7715237140655518, "logits/rejected": -1.9731848239898682, "logps/chosen": -916.1856689453125, "logps/rejected": -725.2570190429688, "loss": 0.0167, "rewards/accuracies": 1.0, "rewards/chosen": 3.141965627670288, "rewards/margins": 5.509858131408691, "rewards/rejected": -2.3678927421569824, "step": 4585 }, { "epoch": 3.3505022831050226, "grad_norm": 3.3998825333442846, "learning_rate": 3.889067372224855e-08, "logits/chosen": -3.180152416229248, "logits/rejected": -2.293612003326416, "logps/chosen": -831.9620361328125, "logps/rejected": -570.0860595703125, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": 3.960648775100708, "rewards/margins": 5.483212471008301, "rewards/rejected": -1.5225634574890137, "step": 4586 }, { "epoch": 3.351232876712329, "grad_norm": 8.673968198246937, "learning_rate": 3.880527762515601e-08, "logits/chosen": -2.278764247894287, "logits/rejected": -2.6318795680999756, "logps/chosen": -410.861083984375, "logps/rejected": -563.6810302734375, "loss": 0.0324, "rewards/accuracies": 1.0, "rewards/chosen": 2.037564516067505, "rewards/margins": 6.197443008422852, "rewards/rejected": -4.159878730773926, "step": 4587 }, { "epoch": 3.351963470319635, "grad_norm": 5.165385343758171, "learning_rate": 3.871996749801926e-08, "logits/chosen": -2.675574779510498, "logits/rejected": -2.992581844329834, "logps/chosen": -692.46240234375, "logps/rejected": -937.20556640625, "loss": 0.0395, "rewards/accuracies": 1.0, "rewards/chosen": 4.307705879211426, "rewards/margins": 5.272442817687988, "rewards/rejected": -0.964736819267273, "step": 4588 }, { "epoch": 3.3526940639269407, "grad_norm": 6.435452033633478, "learning_rate": 3.8634743375564995e-08, "logits/chosen": -2.9071435928344727, "logits/rejected": -2.3773250579833984, "logps/chosen": -751.5193481445312, "logps/rejected": -602.864990234375, "loss": 0.0365, "rewards/accuracies": 1.0, "rewards/chosen": 4.003228187561035, "rewards/margins": 6.484658241271973, "rewards/rejected": -2.4814298152923584, "step": 4589 }, { "epoch": 3.3534246575342466, "grad_norm": 5.414512010109239, "learning_rate": 3.85496052924851e-08, "logits/chosen": -2.54866623878479, "logits/rejected": -2.6258764266967773, "logps/chosen": -330.789306640625, "logps/rejected": -500.779296875, "loss": 0.0249, "rewards/accuracies": 1.0, "rewards/chosen": 0.37197327613830566, "rewards/margins": 3.933037281036377, "rewards/rejected": -3.5610642433166504, "step": 4590 }, { "epoch": 3.3541552511415524, "grad_norm": 6.007990285080777, "learning_rate": 3.8464553283436144e-08, "logits/chosen": -2.7918283939361572, "logits/rejected": -1.6317743062973022, "logps/chosen": -522.8489379882812, "logps/rejected": -345.4259948730469, "loss": 0.0282, "rewards/accuracies": 1.0, "rewards/chosen": 4.088049411773682, "rewards/margins": 7.487338542938232, "rewards/rejected": -3.3992886543273926, "step": 4591 }, { "epoch": 3.3548858447488583, "grad_norm": 5.895844188810494, "learning_rate": 3.837958738303995e-08, "logits/chosen": -3.2630019187927246, "logits/rejected": -2.4674670696258545, "logps/chosen": -865.8770751953125, "logps/rejected": -615.9049682617188, "loss": 0.0189, "rewards/accuracies": 1.0, "rewards/chosen": 4.097423553466797, "rewards/margins": 5.279872417449951, "rewards/rejected": -1.1824486255645752, "step": 4592 }, { "epoch": 3.355616438356164, "grad_norm": 2.76649064993989, "learning_rate": 3.82947076258831e-08, "logits/chosen": -2.7877445220947266, "logits/rejected": -2.3254833221435547, "logps/chosen": -878.464599609375, "logps/rejected": -636.6619873046875, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": 4.894984722137451, "rewards/margins": 6.357851982116699, "rewards/rejected": -1.4628673791885376, "step": 4593 }, { "epoch": 3.3563470319634705, "grad_norm": 4.1078885367035545, "learning_rate": 3.820991404651708e-08, "logits/chosen": -2.512068271636963, "logits/rejected": -2.2861342430114746, "logps/chosen": -767.2692260742188, "logps/rejected": -782.6508178710938, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": 4.1148858070373535, "rewards/margins": 7.0742573738098145, "rewards/rejected": -2.959371566772461, "step": 4594 }, { "epoch": 3.3570776255707764, "grad_norm": 5.849040365167553, "learning_rate": 3.8125206679458535e-08, "logits/chosen": -3.039738893508911, "logits/rejected": -2.0871944427490234, "logps/chosen": -844.1913452148438, "logps/rejected": -686.5421142578125, "loss": 0.0251, "rewards/accuracies": 1.0, "rewards/chosen": 5.136964797973633, "rewards/margins": 6.212959289550781, "rewards/rejected": -1.075994610786438, "step": 4595 }, { "epoch": 3.3578082191780823, "grad_norm": 6.124816580381044, "learning_rate": 3.8040585559188765e-08, "logits/chosen": -2.5578131675720215, "logits/rejected": -2.3931145668029785, "logps/chosen": -689.448486328125, "logps/rejected": -601.8652954101562, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": 3.058448314666748, "rewards/margins": 4.161667346954346, "rewards/rejected": -1.1032192707061768, "step": 4596 }, { "epoch": 3.358538812785388, "grad_norm": 2.3265160242771294, "learning_rate": 3.795605072015401e-08, "logits/chosen": -2.872286796569824, "logits/rejected": -1.9068489074707031, "logps/chosen": -955.587646484375, "logps/rejected": -550.7040405273438, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": 4.900907516479492, "rewards/margins": 6.775423049926758, "rewards/rejected": -1.8745156526565552, "step": 4597 }, { "epoch": 3.359269406392694, "grad_norm": 6.092699791814285, "learning_rate": 3.787160219676555e-08, "logits/chosen": -2.498912811279297, "logits/rejected": -1.5893476009368896, "logps/chosen": -520.0387573242188, "logps/rejected": -330.8564453125, "loss": 0.0255, "rewards/accuracies": 1.0, "rewards/chosen": 2.1174027919769287, "rewards/margins": 5.486194610595703, "rewards/rejected": -3.3687922954559326, "step": 4598 }, { "epoch": 3.36, "grad_norm": 8.079184132544485, "learning_rate": 3.7787240023399335e-08, "logits/chosen": -2.982928514480591, "logits/rejected": -2.635645866394043, "logps/chosen": -694.4628295898438, "logps/rejected": -630.8819580078125, "loss": 0.024, "rewards/accuracies": 1.0, "rewards/chosen": 2.8530397415161133, "rewards/margins": 4.548157691955566, "rewards/rejected": -1.6951180696487427, "step": 4599 }, { "epoch": 3.3607305936073057, "grad_norm": 12.330296391670611, "learning_rate": 3.770296423439634e-08, "logits/chosen": -2.74181866645813, "logits/rejected": -2.3311519622802734, "logps/chosen": -443.0298156738281, "logps/rejected": -472.1878967285156, "loss": 0.0505, "rewards/accuracies": 1.0, "rewards/chosen": 3.3201467990875244, "rewards/margins": 4.448966026306152, "rewards/rejected": -1.1288189888000488, "step": 4600 }, { "epoch": 3.361461187214612, "grad_norm": 4.232379738844824, "learning_rate": 3.7618774864062116e-08, "logits/chosen": -2.767731189727783, "logits/rejected": -1.8917453289031982, "logps/chosen": -721.2230224609375, "logps/rejected": -620.72607421875, "loss": 0.0188, "rewards/accuracies": 1.0, "rewards/chosen": 4.7031707763671875, "rewards/margins": 6.893918514251709, "rewards/rejected": -2.1907477378845215, "step": 4601 }, { "epoch": 3.362191780821918, "grad_norm": 6.585399383969564, "learning_rate": 3.7534671946667344e-08, "logits/chosen": -2.9299304485321045, "logits/rejected": -2.130197286605835, "logps/chosen": -710.208984375, "logps/rejected": -471.9129943847656, "loss": 0.0277, "rewards/accuracies": 1.0, "rewards/chosen": 3.682593822479248, "rewards/margins": 5.036208152770996, "rewards/rejected": -1.3536145687103271, "step": 4602 }, { "epoch": 3.362922374429224, "grad_norm": 11.450259569715753, "learning_rate": 3.745065551644727e-08, "logits/chosen": -2.2759296894073486, "logits/rejected": -2.3020899295806885, "logps/chosen": -408.2063293457031, "logps/rejected": -423.5034484863281, "loss": 0.0793, "rewards/accuracies": 1.0, "rewards/chosen": 2.314934253692627, "rewards/margins": 5.471297264099121, "rewards/rejected": -3.156362533569336, "step": 4603 }, { "epoch": 3.3636529680365297, "grad_norm": 9.87472804214121, "learning_rate": 3.7366725607602066e-08, "logits/chosen": -2.4902262687683105, "logits/rejected": -1.689903736114502, "logps/chosen": -533.9158325195312, "logps/rejected": -487.59674072265625, "loss": 0.0247, "rewards/accuracies": 1.0, "rewards/chosen": 4.695468902587891, "rewards/margins": 7.379434585571289, "rewards/rejected": -2.683964967727661, "step": 4604 }, { "epoch": 3.3643835616438356, "grad_norm": 4.0130778149176205, "learning_rate": 3.7282882254296767e-08, "logits/chosen": -3.1869654655456543, "logits/rejected": -2.7413458824157715, "logps/chosen": -811.3297119140625, "logps/rejected": -749.1646118164062, "loss": 0.0168, "rewards/accuracies": 1.0, "rewards/chosen": 3.016939401626587, "rewards/margins": 5.658280372619629, "rewards/rejected": -2.641340732574463, "step": 4605 }, { "epoch": 3.3651141552511414, "grad_norm": 3.4428813080993885, "learning_rate": 3.7199125490660846e-08, "logits/chosen": -3.11794376373291, "logits/rejected": -2.3298394680023193, "logps/chosen": -720.6311645507812, "logps/rejected": -490.4734191894531, "loss": 0.0237, "rewards/accuracies": 1.0, "rewards/chosen": 3.7601661682128906, "rewards/margins": 6.679582595825195, "rewards/rejected": -2.9194161891937256, "step": 4606 }, { "epoch": 3.3658447488584473, "grad_norm": 6.91730784313022, "learning_rate": 3.7115455350788916e-08, "logits/chosen": -2.505054235458374, "logits/rejected": -2.1048660278320312, "logps/chosen": -1217.43603515625, "logps/rejected": -869.7901000976562, "loss": 0.0189, "rewards/accuracies": 1.0, "rewards/chosen": 5.046074390411377, "rewards/margins": 6.14410924911499, "rewards/rejected": -1.0980346202850342, "step": 4607 }, { "epoch": 3.3665753424657536, "grad_norm": 6.785960727891688, "learning_rate": 3.703187186874002e-08, "logits/chosen": -2.7738966941833496, "logits/rejected": -1.8075374364852905, "logps/chosen": -441.67083740234375, "logps/rejected": -249.77630615234375, "loss": 0.0293, "rewards/accuracies": 1.0, "rewards/chosen": 3.7507143020629883, "rewards/margins": 6.940027236938477, "rewards/rejected": -3.189312696456909, "step": 4608 }, { "epoch": 3.3673059360730595, "grad_norm": 4.633901947222204, "learning_rate": 3.694837507853818e-08, "logits/chosen": -2.2652335166931152, "logits/rejected": -2.3483052253723145, "logps/chosen": -496.4330749511719, "logps/rejected": -718.1738891601562, "loss": 0.0178, "rewards/accuracies": 1.0, "rewards/chosen": 2.75482177734375, "rewards/margins": 7.5490031242370605, "rewards/rejected": -4.794181823730469, "step": 4609 }, { "epoch": 3.3680365296803654, "grad_norm": 8.759622133370456, "learning_rate": 3.6864965014171965e-08, "logits/chosen": -2.856839179992676, "logits/rejected": -2.3892409801483154, "logps/chosen": -710.2335205078125, "logps/rejected": -777.390869140625, "loss": 0.0288, "rewards/accuracies": 1.0, "rewards/chosen": 3.2815327644348145, "rewards/margins": 6.327509880065918, "rewards/rejected": -3.0459768772125244, "step": 4610 }, { "epoch": 3.3687671232876713, "grad_norm": 5.276311066036211, "learning_rate": 3.6781641709594605e-08, "logits/chosen": -3.2859301567077637, "logits/rejected": -2.2773406505584717, "logps/chosen": -598.287841796875, "logps/rejected": -425.28564453125, "loss": 0.0378, "rewards/accuracies": 1.0, "rewards/chosen": 3.781135082244873, "rewards/margins": 6.751278400421143, "rewards/rejected": -2.9701435565948486, "step": 4611 }, { "epoch": 3.369497716894977, "grad_norm": 4.810379155495844, "learning_rate": 3.669840519872419e-08, "logits/chosen": -2.9203524589538574, "logits/rejected": -1.786130428314209, "logps/chosen": -671.379638671875, "logps/rejected": -567.9642333984375, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": 4.968629837036133, "rewards/margins": 7.843935966491699, "rewards/rejected": -2.875305652618408, "step": 4612 }, { "epoch": 3.370228310502283, "grad_norm": 5.465226613308404, "learning_rate": 3.66152555154434e-08, "logits/chosen": -2.8658106327056885, "logits/rejected": -2.492392063140869, "logps/chosen": -776.7716064453125, "logps/rejected": -591.12109375, "loss": 0.0206, "rewards/accuracies": 1.0, "rewards/chosen": 3.665038585662842, "rewards/margins": 4.337558269500732, "rewards/rejected": -0.6725193858146667, "step": 4613 }, { "epoch": 3.370958904109589, "grad_norm": 5.037019063932063, "learning_rate": 3.653219269359939e-08, "logits/chosen": -2.5499424934387207, "logits/rejected": -2.258824110031128, "logps/chosen": -651.189453125, "logps/rejected": -708.78662109375, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": 6.240048885345459, "rewards/margins": 9.568833351135254, "rewards/rejected": -3.328784227371216, "step": 4614 }, { "epoch": 3.371689497716895, "grad_norm": 12.074291415383088, "learning_rate": 3.6449216767004295e-08, "logits/chosen": -2.9566030502319336, "logits/rejected": -1.8248260021209717, "logps/chosen": -1000.4088745117188, "logps/rejected": -475.7897644042969, "loss": 0.0373, "rewards/accuracies": 1.0, "rewards/chosen": 2.631171703338623, "rewards/margins": 4.411708831787109, "rewards/rejected": -1.7805373668670654, "step": 4615 }, { "epoch": 3.372420091324201, "grad_norm": 2.7661143586576675, "learning_rate": 3.63663277694346e-08, "logits/chosen": -2.3200550079345703, "logits/rejected": -2.5933117866516113, "logps/chosen": -522.2706298828125, "logps/rejected": -517.354736328125, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": 2.5404374599456787, "rewards/margins": 4.360846042633057, "rewards/rejected": -1.820408821105957, "step": 4616 }, { "epoch": 3.373150684931507, "grad_norm": 10.016170778445638, "learning_rate": 3.628352573463159e-08, "logits/chosen": -3.3588151931762695, "logits/rejected": -2.0790913105010986, "logps/chosen": -558.1551513671875, "logps/rejected": -316.96636962890625, "loss": 0.0324, "rewards/accuracies": 1.0, "rewards/chosen": 2.4411606788635254, "rewards/margins": 5.33764123916626, "rewards/rejected": -2.896480083465576, "step": 4617 }, { "epoch": 3.373881278538813, "grad_norm": 5.475068545674473, "learning_rate": 3.620081069630101e-08, "logits/chosen": -2.8474197387695312, "logits/rejected": -2.21345853805542, "logps/chosen": -521.0315551757812, "logps/rejected": -558.0626220703125, "loss": 0.0273, "rewards/accuracies": 1.0, "rewards/chosen": 3.578932285308838, "rewards/margins": 5.909603595733643, "rewards/rejected": -2.3306713104248047, "step": 4618 }, { "epoch": 3.3746118721461187, "grad_norm": 5.626277354694509, "learning_rate": 3.611818268811326e-08, "logits/chosen": -2.476290464401245, "logits/rejected": -2.5493664741516113, "logps/chosen": -623.3931884765625, "logps/rejected": -982.2400512695312, "loss": 0.0282, "rewards/accuracies": 1.0, "rewards/chosen": 4.194384574890137, "rewards/margins": 5.790830612182617, "rewards/rejected": -1.596445918083191, "step": 4619 }, { "epoch": 3.3753424657534246, "grad_norm": 4.98409488094143, "learning_rate": 3.6035641743703213e-08, "logits/chosen": -3.007204532623291, "logits/rejected": -1.8611152172088623, "logps/chosen": -682.1408081054688, "logps/rejected": -490.63043212890625, "loss": 0.0174, "rewards/accuracies": 1.0, "rewards/chosen": 3.8666200637817383, "rewards/margins": 5.623822212219238, "rewards/rejected": -1.7572017908096313, "step": 4620 }, { "epoch": 3.3760730593607304, "grad_norm": 10.813224988677602, "learning_rate": 3.595318789667054e-08, "logits/chosen": -2.498553991317749, "logits/rejected": -1.9387866258621216, "logps/chosen": -494.07440185546875, "logps/rejected": -412.1837158203125, "loss": 0.05, "rewards/accuracies": 1.0, "rewards/chosen": 1.159111738204956, "rewards/margins": 4.654564380645752, "rewards/rejected": -3.495452642440796, "step": 4621 }, { "epoch": 3.3768036529680368, "grad_norm": 3.692107732414029, "learning_rate": 3.587082118057924e-08, "logits/chosen": -2.5608325004577637, "logits/rejected": -1.7544257640838623, "logps/chosen": -930.8194580078125, "logps/rejected": -688.042236328125, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": 4.012972831726074, "rewards/margins": 5.703488349914551, "rewards/rejected": -1.6905159950256348, "step": 4622 }, { "epoch": 3.3775342465753426, "grad_norm": 4.4274810732022845, "learning_rate": 3.5788541628957836e-08, "logits/chosen": -2.432887077331543, "logits/rejected": -2.138094186782837, "logps/chosen": -665.261474609375, "logps/rejected": -523.8525390625, "loss": 0.0182, "rewards/accuracies": 1.0, "rewards/chosen": 2.2785401344299316, "rewards/margins": 5.474640846252441, "rewards/rejected": -3.196100950241089, "step": 4623 }, { "epoch": 3.3782648401826485, "grad_norm": 5.053242071396235, "learning_rate": 3.570634927529958e-08, "logits/chosen": -3.2554404735565186, "logits/rejected": -3.0420117378234863, "logps/chosen": -699.9862060546875, "logps/rejected": -663.7615356445312, "loss": 0.0244, "rewards/accuracies": 1.0, "rewards/chosen": 4.047781944274902, "rewards/margins": 4.6700944900512695, "rewards/rejected": -0.6223123073577881, "step": 4624 }, { "epoch": 3.3789954337899544, "grad_norm": 6.434724376024987, "learning_rate": 3.562424415306198e-08, "logits/chosen": -2.488297700881958, "logits/rejected": -2.248443365097046, "logps/chosen": -610.8115234375, "logps/rejected": -551.6400146484375, "loss": 0.0387, "rewards/accuracies": 1.0, "rewards/chosen": 3.0196497440338135, "rewards/margins": 5.743119239807129, "rewards/rejected": -2.7234699726104736, "step": 4625 }, { "epoch": 3.3797260273972602, "grad_norm": 5.778727873096191, "learning_rate": 3.554222629566725e-08, "logits/chosen": -2.671452045440674, "logits/rejected": -2.0454742908477783, "logps/chosen": -590.845947265625, "logps/rejected": -575.203369140625, "loss": 0.0268, "rewards/accuracies": 1.0, "rewards/chosen": 4.328029632568359, "rewards/margins": 7.611851692199707, "rewards/rejected": -3.283822536468506, "step": 4626 }, { "epoch": 3.380456621004566, "grad_norm": 7.130882645555703, "learning_rate": 3.5460295736501917e-08, "logits/chosen": -2.847757577896118, "logits/rejected": -2.224513053894043, "logps/chosen": -523.3658447265625, "logps/rejected": -460.2474365234375, "loss": 0.0316, "rewards/accuracies": 1.0, "rewards/chosen": 2.305784225463867, "rewards/margins": 5.594574928283691, "rewards/rejected": -3.288790702819824, "step": 4627 }, { "epoch": 3.381187214611872, "grad_norm": 6.416775723145725, "learning_rate": 3.537845250891702e-08, "logits/chosen": -2.8313887119293213, "logits/rejected": -1.9045906066894531, "logps/chosen": -627.4291381835938, "logps/rejected": -399.6299743652344, "loss": 0.0346, "rewards/accuracies": 1.0, "rewards/chosen": 4.559929370880127, "rewards/margins": 7.431222915649414, "rewards/rejected": -2.871293544769287, "step": 4628 }, { "epoch": 3.3819178082191783, "grad_norm": 8.564075185859972, "learning_rate": 3.5296696646228114e-08, "logits/chosen": -2.4779484272003174, "logits/rejected": -1.4560474157333374, "logps/chosen": -709.3382568359375, "logps/rejected": -369.8361511230469, "loss": 0.0345, "rewards/accuracies": 1.0, "rewards/chosen": 3.910412311553955, "rewards/margins": 6.628731727600098, "rewards/rejected": -2.7183196544647217, "step": 4629 }, { "epoch": 3.382648401826484, "grad_norm": 8.32967398089083, "learning_rate": 3.521502818171507e-08, "logits/chosen": -2.522529363632202, "logits/rejected": -2.224529504776001, "logps/chosen": -497.40374755859375, "logps/rejected": -555.7589721679688, "loss": 0.0314, "rewards/accuracies": 1.0, "rewards/chosen": 3.0867202281951904, "rewards/margins": 5.88395881652832, "rewards/rejected": -2.797238826751709, "step": 4630 }, { "epoch": 3.38337899543379, "grad_norm": 4.687298376197624, "learning_rate": 3.51334471486224e-08, "logits/chosen": -2.7184524536132812, "logits/rejected": -2.6057839393615723, "logps/chosen": -728.65673828125, "logps/rejected": -741.1864013671875, "loss": 0.0201, "rewards/accuracies": 1.0, "rewards/chosen": 4.166097640991211, "rewards/margins": 5.709559440612793, "rewards/rejected": -1.5434613227844238, "step": 4631 }, { "epoch": 3.384109589041096, "grad_norm": 5.379611676180636, "learning_rate": 3.5051953580158664e-08, "logits/chosen": -3.0578534603118896, "logits/rejected": -2.2167201042175293, "logps/chosen": -660.87158203125, "logps/rejected": -525.3021850585938, "loss": 0.0498, "rewards/accuracies": 1.0, "rewards/chosen": 3.665311098098755, "rewards/margins": 6.271306037902832, "rewards/rejected": -2.6059951782226562, "step": 4632 }, { "epoch": 3.384840182648402, "grad_norm": 4.264266360152943, "learning_rate": 3.4970547509497154e-08, "logits/chosen": -2.4943130016326904, "logits/rejected": -2.371732234954834, "logps/chosen": -744.1543579101562, "logps/rejected": -794.3118286132812, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": 2.1918447017669678, "rewards/margins": 4.297001838684082, "rewards/rejected": -2.1051571369171143, "step": 4633 }, { "epoch": 3.3855707762557077, "grad_norm": 9.74497621661553, "learning_rate": 3.488922896977545e-08, "logits/chosen": -2.037950277328491, "logits/rejected": -2.5576276779174805, "logps/chosen": -452.9661865234375, "logps/rejected": -628.701416015625, "loss": 0.0425, "rewards/accuracies": 1.0, "rewards/chosen": 1.7087104320526123, "rewards/margins": 4.729349136352539, "rewards/rejected": -3.020638942718506, "step": 4634 }, { "epoch": 3.3863013698630136, "grad_norm": 6.26468946364684, "learning_rate": 3.480799799409545e-08, "logits/chosen": -2.677457332611084, "logits/rejected": -1.5480105876922607, "logps/chosen": -317.0758361816406, "logps/rejected": -198.90277099609375, "loss": 0.0245, "rewards/accuracies": 1.0, "rewards/chosen": 2.277928113937378, "rewards/margins": 4.579695701599121, "rewards/rejected": -2.301767587661743, "step": 4635 }, { "epoch": 3.38703196347032, "grad_norm": 5.719780834240771, "learning_rate": 3.472685461552341e-08, "logits/chosen": -3.112240791320801, "logits/rejected": -2.002271890640259, "logps/chosen": -755.659912109375, "logps/rejected": -433.92120361328125, "loss": 0.0222, "rewards/accuracies": 1.0, "rewards/chosen": 4.205592632293701, "rewards/margins": 5.9454193115234375, "rewards/rejected": -1.7398267984390259, "step": 4636 }, { "epoch": 3.3877625570776257, "grad_norm": 7.9000830440743774, "learning_rate": 3.4645798867089906e-08, "logits/chosen": -3.222555160522461, "logits/rejected": -2.3519186973571777, "logps/chosen": -660.23681640625, "logps/rejected": -547.33447265625, "loss": 0.0495, "rewards/accuracies": 1.0, "rewards/chosen": 4.07696008682251, "rewards/margins": 5.624094009399414, "rewards/rejected": -1.5471335649490356, "step": 4637 }, { "epoch": 3.3884931506849316, "grad_norm": 3.4370287485482525, "learning_rate": 3.456483078178998e-08, "logits/chosen": -2.9256930351257324, "logits/rejected": -2.0998334884643555, "logps/chosen": -668.5010375976562, "logps/rejected": -558.98388671875, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": 4.310235977172852, "rewards/margins": 7.753561019897461, "rewards/rejected": -3.4433252811431885, "step": 4638 }, { "epoch": 3.3892237442922375, "grad_norm": 5.17125059663079, "learning_rate": 3.4483950392582814e-08, "logits/chosen": -2.4241180419921875, "logits/rejected": -2.632266044616699, "logps/chosen": -549.3021240234375, "logps/rejected": -660.2725830078125, "loss": 0.0283, "rewards/accuracies": 1.0, "rewards/chosen": 2.131631851196289, "rewards/margins": 4.347509384155273, "rewards/rejected": -2.2158775329589844, "step": 4639 }, { "epoch": 3.3899543378995434, "grad_norm": 5.318436301603495, "learning_rate": 3.440315773239208e-08, "logits/chosen": -2.1480159759521484, "logits/rejected": -1.9032179117202759, "logps/chosen": -402.83966064453125, "logps/rejected": -372.8417663574219, "loss": 0.0258, "rewards/accuracies": 1.0, "rewards/chosen": 1.9803147315979004, "rewards/margins": 5.100928783416748, "rewards/rejected": -3.120614528656006, "step": 4640 }, { "epoch": 3.3906849315068492, "grad_norm": 5.392686178913359, "learning_rate": 3.432245283410556e-08, "logits/chosen": -2.352193832397461, "logits/rejected": -2.1979291439056396, "logps/chosen": -493.26495361328125, "logps/rejected": -637.1554565429688, "loss": 0.0208, "rewards/accuracies": 1.0, "rewards/chosen": 4.959410190582275, "rewards/margins": 9.188032150268555, "rewards/rejected": -4.2286224365234375, "step": 4641 }, { "epoch": 3.391415525114155, "grad_norm": 5.954435589721219, "learning_rate": 3.424183573057535e-08, "logits/chosen": -2.9420876502990723, "logits/rejected": -1.973575234413147, "logps/chosen": -621.788330078125, "logps/rejected": -405.9076843261719, "loss": 0.0268, "rewards/accuracies": 1.0, "rewards/chosen": 3.3463292121887207, "rewards/margins": 5.815372467041016, "rewards/rejected": -2.469043016433716, "step": 4642 }, { "epoch": 3.3921461187214614, "grad_norm": 5.489148619182811, "learning_rate": 3.4161306454617924e-08, "logits/chosen": -2.888963460922241, "logits/rejected": -2.7437944412231445, "logps/chosen": -682.99462890625, "logps/rejected": -507.428955078125, "loss": 0.0187, "rewards/accuracies": 1.0, "rewards/chosen": 3.504223346710205, "rewards/margins": 5.505281448364258, "rewards/rejected": -2.0010578632354736, "step": 4643 }, { "epoch": 3.3928767123287673, "grad_norm": 4.990472872236197, "learning_rate": 3.408086503901389e-08, "logits/chosen": -2.859318256378174, "logits/rejected": -2.1056301593780518, "logps/chosen": -771.1009521484375, "logps/rejected": -550.07373046875, "loss": 0.0214, "rewards/accuracies": 1.0, "rewards/chosen": 3.5924878120422363, "rewards/margins": 4.863547325134277, "rewards/rejected": -1.271059274673462, "step": 4644 }, { "epoch": 3.393607305936073, "grad_norm": 5.636145506410706, "learning_rate": 3.400051151650804e-08, "logits/chosen": -2.553906202316284, "logits/rejected": -1.8506450653076172, "logps/chosen": -485.24090576171875, "logps/rejected": -370.3001403808594, "loss": 0.0223, "rewards/accuracies": 1.0, "rewards/chosen": 2.157193660736084, "rewards/margins": 5.189602851867676, "rewards/rejected": -3.0324084758758545, "step": 4645 }, { "epoch": 3.394337899543379, "grad_norm": 5.855830633726412, "learning_rate": 3.392024591980963e-08, "logits/chosen": -3.165121555328369, "logits/rejected": -1.5920149087905884, "logps/chosen": -963.2482299804688, "logps/rejected": -491.2436218261719, "loss": 0.0235, "rewards/accuracies": 1.0, "rewards/chosen": 4.64469575881958, "rewards/margins": 6.279330253601074, "rewards/rejected": -1.6346347332000732, "step": 4646 }, { "epoch": 3.395068493150685, "grad_norm": 5.174492026935243, "learning_rate": 3.3840068281591834e-08, "logits/chosen": -3.0536742210388184, "logits/rejected": -2.3696725368499756, "logps/chosen": -439.278076171875, "logps/rejected": -305.0692138671875, "loss": 0.0185, "rewards/accuracies": 1.0, "rewards/chosen": 2.287487506866455, "rewards/margins": 5.088686466217041, "rewards/rejected": -2.801198720932007, "step": 4647 }, { "epoch": 3.395799086757991, "grad_norm": 5.977685441379798, "learning_rate": 3.3759978634492316e-08, "logits/chosen": -2.829432487487793, "logits/rejected": -2.5296835899353027, "logps/chosen": -555.3740234375, "logps/rejected": -573.3804321289062, "loss": 0.0311, "rewards/accuracies": 1.0, "rewards/chosen": 2.1184537410736084, "rewards/margins": 4.069246292114258, "rewards/rejected": -1.950792670249939, "step": 4648 }, { "epoch": 3.3965296803652967, "grad_norm": 9.102279595543592, "learning_rate": 3.367997701111253e-08, "logits/chosen": -2.9160985946655273, "logits/rejected": -2.131836414337158, "logps/chosen": -403.9163818359375, "logps/rejected": -385.88031005859375, "loss": 0.0565, "rewards/accuracies": 0.875, "rewards/chosen": 3.499439239501953, "rewards/margins": 7.4319562911987305, "rewards/rejected": -3.9325170516967773, "step": 4649 }, { "epoch": 3.3972602739726026, "grad_norm": 7.157511370906665, "learning_rate": 3.3600063444018416e-08, "logits/chosen": -2.956515312194824, "logits/rejected": -2.8276309967041016, "logps/chosen": -1073.552734375, "logps/rejected": -1022.764404296875, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": 6.201761245727539, "rewards/margins": 5.821336269378662, "rewards/rejected": 0.3804250657558441, "step": 4650 }, { "epoch": 3.397990867579909, "grad_norm": 6.812064340642614, "learning_rate": 3.3520237965740106e-08, "logits/chosen": -2.841198444366455, "logits/rejected": -2.0936944484710693, "logps/chosen": -650.0826416015625, "logps/rejected": -684.3797607421875, "loss": 0.027, "rewards/accuracies": 1.0, "rewards/chosen": 1.6057305335998535, "rewards/margins": 7.209159851074219, "rewards/rejected": -5.603428840637207, "step": 4651 }, { "epoch": 3.3987214611872147, "grad_norm": 4.487404680435141, "learning_rate": 3.344050060877157e-08, "logits/chosen": -3.215503215789795, "logits/rejected": -2.560051441192627, "logps/chosen": -412.5848388671875, "logps/rejected": -437.6575927734375, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": 2.5611038208007812, "rewards/margins": 6.059405326843262, "rewards/rejected": -3.4983015060424805, "step": 4652 }, { "epoch": 3.3994520547945206, "grad_norm": 4.953788448148918, "learning_rate": 3.3360851405571325e-08, "logits/chosen": -3.236581802368164, "logits/rejected": -2.719521999359131, "logps/chosen": -650.9137573242188, "logps/rejected": -538.927490234375, "loss": 0.0203, "rewards/accuracies": 1.0, "rewards/chosen": 2.716233730316162, "rewards/margins": 3.8184821605682373, "rewards/rejected": -1.1022486686706543, "step": 4653 }, { "epoch": 3.4001826484018265, "grad_norm": 4.106404089975912, "learning_rate": 3.328129038856145e-08, "logits/chosen": -2.3439314365386963, "logits/rejected": -1.8522387742996216, "logps/chosen": -709.6961059570312, "logps/rejected": -530.5870361328125, "loss": 0.0193, "rewards/accuracies": 1.0, "rewards/chosen": 4.3474836349487305, "rewards/margins": 7.760931015014648, "rewards/rejected": -3.4134480953216553, "step": 4654 }, { "epoch": 3.4009132420091324, "grad_norm": 3.250653567988996, "learning_rate": 3.320181759012869e-08, "logits/chosen": -2.947277784347534, "logits/rejected": -2.557809591293335, "logps/chosen": -732.1764526367188, "logps/rejected": -663.9613647460938, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": 2.7748637199401855, "rewards/margins": 6.270654678344727, "rewards/rejected": -3.495790481567383, "step": 4655 }, { "epoch": 3.4016438356164382, "grad_norm": 4.822421065614453, "learning_rate": 3.3122433042623465e-08, "logits/chosen": -2.691439151763916, "logits/rejected": -2.1399059295654297, "logps/chosen": -490.31591796875, "logps/rejected": -461.80047607421875, "loss": 0.0249, "rewards/accuracies": 1.0, "rewards/chosen": 2.3107616901397705, "rewards/margins": 5.502598762512207, "rewards/rejected": -3.1918373107910156, "step": 4656 }, { "epoch": 3.402374429223744, "grad_norm": 6.928645626564102, "learning_rate": 3.3043136778360595e-08, "logits/chosen": -3.1052026748657227, "logits/rejected": -2.4286179542541504, "logps/chosen": -420.40753173828125, "logps/rejected": -464.4054260253906, "loss": 0.0326, "rewards/accuracies": 1.0, "rewards/chosen": 2.1609692573547363, "rewards/margins": 5.185302734375, "rewards/rejected": -3.0243332386016846, "step": 4657 }, { "epoch": 3.40310502283105, "grad_norm": 5.736667933414961, "learning_rate": 3.296392882961871e-08, "logits/chosen": -2.31150484085083, "logits/rejected": -1.6850999593734741, "logps/chosen": -450.2352294921875, "logps/rejected": -493.09698486328125, "loss": 0.0212, "rewards/accuracies": 1.0, "rewards/chosen": 3.1907644271850586, "rewards/margins": 7.798647880554199, "rewards/rejected": -4.607883453369141, "step": 4658 }, { "epoch": 3.4038356164383563, "grad_norm": 5.038556403243106, "learning_rate": 3.288480922864054e-08, "logits/chosen": -2.600100040435791, "logits/rejected": -2.6718385219573975, "logps/chosen": -682.6178588867188, "logps/rejected": -741.6766967773438, "loss": 0.0207, "rewards/accuracies": 1.0, "rewards/chosen": 3.9835057258605957, "rewards/margins": 5.627208709716797, "rewards/rejected": -1.6437032222747803, "step": 4659 }, { "epoch": 3.404566210045662, "grad_norm": 5.063687408971969, "learning_rate": 3.280577800763301e-08, "logits/chosen": -2.6640725135803223, "logits/rejected": -1.757550597190857, "logps/chosen": -374.48681640625, "logps/rejected": -251.53982543945312, "loss": 0.0187, "rewards/accuracies": 1.0, "rewards/chosen": 3.427232027053833, "rewards/margins": 6.243170738220215, "rewards/rejected": -2.815938949584961, "step": 4660 }, { "epoch": 3.405296803652968, "grad_norm": 4.666476699958949, "learning_rate": 3.27268351987669e-08, "logits/chosen": -2.622439384460449, "logits/rejected": -1.9137221574783325, "logps/chosen": -532.7796630859375, "logps/rejected": -502.63519287109375, "loss": 0.0195, "rewards/accuracies": 1.0, "rewards/chosen": 3.454202890396118, "rewards/margins": 7.593024253845215, "rewards/rejected": -4.138820648193359, "step": 4661 }, { "epoch": 3.406027397260274, "grad_norm": 8.718008358493902, "learning_rate": 3.2647980834176995e-08, "logits/chosen": -2.2797837257385254, "logits/rejected": -2.080615997314453, "logps/chosen": -273.9947814941406, "logps/rejected": -351.9169006347656, "loss": 0.0401, "rewards/accuracies": 1.0, "rewards/chosen": 2.3349738121032715, "rewards/margins": 6.2670745849609375, "rewards/rejected": -3.932100772857666, "step": 4662 }, { "epoch": 3.40675799086758, "grad_norm": 5.09208623944032, "learning_rate": 3.256921494596227e-08, "logits/chosen": -2.469204902648926, "logits/rejected": -2.16498064994812, "logps/chosen": -773.7686157226562, "logps/rejected": -858.53466796875, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": 3.118098258972168, "rewards/margins": 7.104599952697754, "rewards/rejected": -3.986501693725586, "step": 4663 }, { "epoch": 3.4074885844748857, "grad_norm": 9.905790589085973, "learning_rate": 3.2490537566185413e-08, "logits/chosen": -3.0435609817504883, "logits/rejected": -2.114154815673828, "logps/chosen": -713.8977661132812, "logps/rejected": -459.6612854003906, "loss": 0.0469, "rewards/accuracies": 1.0, "rewards/chosen": 3.324983835220337, "rewards/margins": 5.94880485534668, "rewards/rejected": -2.623821258544922, "step": 4664 }, { "epoch": 3.4082191780821915, "grad_norm": 5.909174442367735, "learning_rate": 3.241194872687333e-08, "logits/chosen": -2.861863613128662, "logits/rejected": -1.8715051412582397, "logps/chosen": -706.4849853515625, "logps/rejected": -532.4508666992188, "loss": 0.0258, "rewards/accuracies": 1.0, "rewards/chosen": 3.0227837562561035, "rewards/margins": 5.121187210083008, "rewards/rejected": -2.0984036922454834, "step": 4665 }, { "epoch": 3.408949771689498, "grad_norm": 6.591197021317761, "learning_rate": 3.233344846001676e-08, "logits/chosen": -2.9371068477630615, "logits/rejected": -2.5241076946258545, "logps/chosen": -804.8609619140625, "logps/rejected": -666.0986328125, "loss": 0.0235, "rewards/accuracies": 1.0, "rewards/chosen": 4.379642963409424, "rewards/margins": 5.884955406188965, "rewards/rejected": -1.5053120851516724, "step": 4666 }, { "epoch": 3.4096803652968037, "grad_norm": 10.437687489544537, "learning_rate": 3.225503679757033e-08, "logits/chosen": -2.40301251411438, "logits/rejected": -2.3365626335144043, "logps/chosen": -702.4780883789062, "logps/rejected": -823.4053955078125, "loss": 0.0419, "rewards/accuracies": 1.0, "rewards/chosen": 3.385056257247925, "rewards/margins": 5.730715274810791, "rewards/rejected": -2.345659017562866, "step": 4667 }, { "epoch": 3.4104109589041096, "grad_norm": 6.252383883746769, "learning_rate": 3.217671377145278e-08, "logits/chosen": -1.9906349182128906, "logits/rejected": -2.054434061050415, "logps/chosen": -323.5807189941406, "logps/rejected": -369.9842529296875, "loss": 0.0277, "rewards/accuracies": 1.0, "rewards/chosen": 2.1873862743377686, "rewards/margins": 5.502763748168945, "rewards/rejected": -3.315377712249756, "step": 4668 }, { "epoch": 3.4111415525114155, "grad_norm": 7.105324867976987, "learning_rate": 3.209847941354657e-08, "logits/chosen": -3.1016314029693604, "logits/rejected": -2.692718029022217, "logps/chosen": -895.164306640625, "logps/rejected": -942.1400146484375, "loss": 0.0348, "rewards/accuracies": 1.0, "rewards/chosen": 2.842947244644165, "rewards/margins": 4.384281635284424, "rewards/rejected": -1.5413342714309692, "step": 4669 }, { "epoch": 3.4118721461187214, "grad_norm": 4.608607341399055, "learning_rate": 3.202033375569829e-08, "logits/chosen": -3.2150466442108154, "logits/rejected": -2.5169453620910645, "logps/chosen": -481.9061584472656, "logps/rejected": -434.8149108886719, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": 2.6172738075256348, "rewards/margins": 5.569153785705566, "rewards/rejected": -2.9518799781799316, "step": 4670 }, { "epoch": 3.4126027397260272, "grad_norm": 3.5068173090653234, "learning_rate": 3.1942276829718116e-08, "logits/chosen": -3.1296143531799316, "logits/rejected": -2.364762306213379, "logps/chosen": -818.7825927734375, "logps/rejected": -664.6688232421875, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": 4.531815528869629, "rewards/margins": 7.792258262634277, "rewards/rejected": -3.2604422569274902, "step": 4671 }, { "epoch": 3.413333333333333, "grad_norm": 5.742868728192213, "learning_rate": 3.186430866738041e-08, "logits/chosen": -3.14823579788208, "logits/rejected": -2.0743188858032227, "logps/chosen": -747.610595703125, "logps/rejected": -471.0862121582031, "loss": 0.0267, "rewards/accuracies": 1.0, "rewards/chosen": 3.4235308170318604, "rewards/margins": 6.121909141540527, "rewards/rejected": -2.698378801345825, "step": 4672 }, { "epoch": 3.4140639269406394, "grad_norm": 4.3361017969294675, "learning_rate": 3.178642930042319e-08, "logits/chosen": -2.6993625164031982, "logits/rejected": -2.0860981941223145, "logps/chosen": -488.0690002441406, "logps/rejected": -333.28143310546875, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": 1.9514260292053223, "rewards/margins": 5.387267112731934, "rewards/rejected": -3.4358408451080322, "step": 4673 }, { "epoch": 3.4147945205479453, "grad_norm": 3.199787317599195, "learning_rate": 3.1708638760548494e-08, "logits/chosen": -2.919987678527832, "logits/rejected": -2.496397018432617, "logps/chosen": -746.7901611328125, "logps/rejected": -586.598388671875, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": 3.9842612743377686, "rewards/margins": 6.7671685218811035, "rewards/rejected": -2.782907009124756, "step": 4674 }, { "epoch": 3.415525114155251, "grad_norm": 8.411514619022853, "learning_rate": 3.163093707942211e-08, "logits/chosen": -2.975332260131836, "logits/rejected": -1.5776970386505127, "logps/chosen": -923.215087890625, "logps/rejected": -516.4716186523438, "loss": 0.0248, "rewards/accuracies": 1.0, "rewards/chosen": 5.3908586502075195, "rewards/margins": 5.6709136962890625, "rewards/rejected": -0.2800554037094116, "step": 4675 }, { "epoch": 3.416255707762557, "grad_norm": 11.987677287042777, "learning_rate": 3.155332428867355e-08, "logits/chosen": -2.8538060188293457, "logits/rejected": -2.198523998260498, "logps/chosen": -446.8183288574219, "logps/rejected": -449.30938720703125, "loss": 0.0837, "rewards/accuracies": 1.0, "rewards/chosen": 2.9238343238830566, "rewards/margins": 5.69107723236084, "rewards/rejected": -2.767242908477783, "step": 4676 }, { "epoch": 3.416986301369863, "grad_norm": 6.943092909795514, "learning_rate": 3.147580041989642e-08, "logits/chosen": -2.795660972595215, "logits/rejected": -2.1420516967773438, "logps/chosen": -526.977294921875, "logps/rejected": -480.1661682128906, "loss": 0.0347, "rewards/accuracies": 1.0, "rewards/chosen": 3.2212326526641846, "rewards/margins": 5.2751874923706055, "rewards/rejected": -2.053955078125, "step": 4677 }, { "epoch": 3.417716894977169, "grad_norm": 5.130536322267055, "learning_rate": 3.139836550464783e-08, "logits/chosen": -2.2045841217041016, "logits/rejected": -1.5112318992614746, "logps/chosen": -776.6878051757812, "logps/rejected": -540.0543823242188, "loss": 0.0259, "rewards/accuracies": 1.0, "rewards/chosen": 4.17542028427124, "rewards/margins": 6.833615303039551, "rewards/rejected": -2.6581947803497314, "step": 4678 }, { "epoch": 3.4184474885844747, "grad_norm": 14.44777719897462, "learning_rate": 3.1321019574448906e-08, "logits/chosen": -2.715244770050049, "logits/rejected": -1.9867792129516602, "logps/chosen": -575.4224853515625, "logps/rejected": -492.09149169921875, "loss": 0.0487, "rewards/accuracies": 1.0, "rewards/chosen": 3.386133909225464, "rewards/margins": 7.4406585693359375, "rewards/rejected": -4.054524898529053, "step": 4679 }, { "epoch": 3.419178082191781, "grad_norm": 6.949911476504575, "learning_rate": 3.124376266078446e-08, "logits/chosen": -2.921869993209839, "logits/rejected": -2.2852230072021484, "logps/chosen": -677.20703125, "logps/rejected": -481.43267822265625, "loss": 0.0274, "rewards/accuracies": 1.0, "rewards/chosen": 4.983846664428711, "rewards/margins": 7.3571085929870605, "rewards/rejected": -2.3732621669769287, "step": 4680 }, { "epoch": 3.419908675799087, "grad_norm": 3.3174125663314635, "learning_rate": 3.1166594795102945e-08, "logits/chosen": -2.6894969940185547, "logits/rejected": -2.2290687561035156, "logps/chosen": -651.189697265625, "logps/rejected": -604.647216796875, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": 2.398160457611084, "rewards/margins": 5.85250186920166, "rewards/rejected": -3.454341411590576, "step": 4681 }, { "epoch": 3.4206392694063927, "grad_norm": 5.168005287923558, "learning_rate": 3.1089516008816846e-08, "logits/chosen": -2.5935168266296387, "logits/rejected": -2.710160255432129, "logps/chosen": -454.5743713378906, "logps/rejected": -527.7388305664062, "loss": 0.0244, "rewards/accuracies": 1.0, "rewards/chosen": 1.7333343029022217, "rewards/margins": 4.234090805053711, "rewards/rejected": -2.50075626373291, "step": 4682 }, { "epoch": 3.4213698630136986, "grad_norm": 3.205326868013705, "learning_rate": 3.101252633330217e-08, "logits/chosen": -3.066288709640503, "logits/rejected": -2.0656492710113525, "logps/chosen": -611.8516235351562, "logps/rejected": -440.7564697265625, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": 4.025914192199707, "rewards/margins": 7.222954750061035, "rewards/rejected": -3.197040319442749, "step": 4683 }, { "epoch": 3.4221004566210045, "grad_norm": 5.861273455468137, "learning_rate": 3.0935625799898697e-08, "logits/chosen": -3.1491832733154297, "logits/rejected": -2.788975954055786, "logps/chosen": -554.852294921875, "logps/rejected": -490.693115234375, "loss": 0.0277, "rewards/accuracies": 1.0, "rewards/chosen": 2.7698488235473633, "rewards/margins": 4.692625999450684, "rewards/rejected": -1.9227771759033203, "step": 4684 }, { "epoch": 3.4228310502283104, "grad_norm": 4.939937521041333, "learning_rate": 3.0858814439909895e-08, "logits/chosen": -2.7679603099823, "logits/rejected": -2.1795217990875244, "logps/chosen": -492.1087646484375, "logps/rejected": -428.9193115234375, "loss": 0.0261, "rewards/accuracies": 1.0, "rewards/chosen": 3.596885919570923, "rewards/margins": 5.577641487121582, "rewards/rejected": -1.9807555675506592, "step": 4685 }, { "epoch": 3.4235616438356162, "grad_norm": 7.486454493617774, "learning_rate": 3.078209228460302e-08, "logits/chosen": -2.475005865097046, "logits/rejected": -2.145254135131836, "logps/chosen": -704.3330688476562, "logps/rejected": -577.202392578125, "loss": 0.0393, "rewards/accuracies": 1.0, "rewards/chosen": 3.3257687091827393, "rewards/margins": 5.930349826812744, "rewards/rejected": -2.604581356048584, "step": 4686 }, { "epoch": 3.4242922374429225, "grad_norm": 3.4526851598324524, "learning_rate": 3.070545936520905e-08, "logits/chosen": -2.833653211593628, "logits/rejected": -2.48561692237854, "logps/chosen": -630.9802856445312, "logps/rejected": -582.6561889648438, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": 3.9743151664733887, "rewards/margins": 6.4568867683410645, "rewards/rejected": -2.482571601867676, "step": 4687 }, { "epoch": 3.4250228310502284, "grad_norm": 4.819251340917848, "learning_rate": 3.062891571292239e-08, "logits/chosen": -2.688206911087036, "logits/rejected": -2.1508755683898926, "logps/chosen": -644.9932861328125, "logps/rejected": -509.04669189453125, "loss": 0.0206, "rewards/accuracies": 1.0, "rewards/chosen": 2.2744357585906982, "rewards/margins": 4.822234153747559, "rewards/rejected": -2.5477983951568604, "step": 4688 }, { "epoch": 3.4257534246575343, "grad_norm": 5.347052373569325, "learning_rate": 3.0552461358901385e-08, "logits/chosen": -2.886528253555298, "logits/rejected": -2.2111892700195312, "logps/chosen": -811.4913330078125, "logps/rejected": -478.9121398925781, "loss": 0.026, "rewards/accuracies": 1.0, "rewards/chosen": 5.109646320343018, "rewards/margins": 7.400433540344238, "rewards/rejected": -2.2907872200012207, "step": 4689 }, { "epoch": 3.42648401826484, "grad_norm": 4.7002240229034316, "learning_rate": 3.047609633426784e-08, "logits/chosen": -2.8127541542053223, "logits/rejected": -2.121725559234619, "logps/chosen": -759.38037109375, "logps/rejected": -655.669189453125, "loss": 0.0174, "rewards/accuracies": 1.0, "rewards/chosen": 4.496418476104736, "rewards/margins": 5.313234806060791, "rewards/rejected": -0.8168166279792786, "step": 4690 }, { "epoch": 3.427214611872146, "grad_norm": 7.731868083304137, "learning_rate": 3.039982067010738e-08, "logits/chosen": -2.5933427810668945, "logits/rejected": -1.8063404560089111, "logps/chosen": -373.72509765625, "logps/rejected": -284.60540771484375, "loss": 0.0295, "rewards/accuracies": 1.0, "rewards/chosen": 2.436263084411621, "rewards/margins": 6.4576520919799805, "rewards/rejected": -4.021389007568359, "step": 4691 }, { "epoch": 3.427945205479452, "grad_norm": 6.5581717613834, "learning_rate": 3.03236343974691e-08, "logits/chosen": -2.7138757705688477, "logits/rejected": -2.0561840534210205, "logps/chosen": -491.61279296875, "logps/rejected": -392.4764709472656, "loss": 0.0298, "rewards/accuracies": 1.0, "rewards/chosen": 2.399136781692505, "rewards/margins": 6.355836868286133, "rewards/rejected": -3.956700563430786, "step": 4692 }, { "epoch": 3.428675799086758, "grad_norm": 5.755320533820987, "learning_rate": 3.0247537547365696e-08, "logits/chosen": -2.684756278991699, "logits/rejected": -2.0161185264587402, "logps/chosen": -574.779541015625, "logps/rejected": -469.7331848144531, "loss": 0.0279, "rewards/accuracies": 1.0, "rewards/chosen": 3.4460394382476807, "rewards/margins": 5.85983419418335, "rewards/rejected": -2.41379451751709, "step": 4693 }, { "epoch": 3.429406392694064, "grad_norm": 8.46400820028272, "learning_rate": 3.0171530150773614e-08, "logits/chosen": -2.9881319999694824, "logits/rejected": -1.7662092447280884, "logps/chosen": -629.541259765625, "logps/rejected": -366.150634765625, "loss": 0.0232, "rewards/accuracies": 1.0, "rewards/chosen": 4.788938522338867, "rewards/margins": 8.785987854003906, "rewards/rejected": -3.997048854827881, "step": 4694 }, { "epoch": 3.43013698630137, "grad_norm": 4.620662201492103, "learning_rate": 3.0095612238632715e-08, "logits/chosen": -2.799619197845459, "logits/rejected": -2.1556951999664307, "logps/chosen": -621.5906982421875, "logps/rejected": -537.3096923828125, "loss": 0.0153, "rewards/accuracies": 1.0, "rewards/chosen": 4.794719219207764, "rewards/margins": 7.702925205230713, "rewards/rejected": -2.9082064628601074, "step": 4695 }, { "epoch": 3.430867579908676, "grad_norm": 5.576805150937974, "learning_rate": 3.001978384184661e-08, "logits/chosen": -2.7185919284820557, "logits/rejected": -2.179440975189209, "logps/chosen": -588.8013916015625, "logps/rejected": -495.0675964355469, "loss": 0.0217, "rewards/accuracies": 1.0, "rewards/chosen": 2.6257331371307373, "rewards/margins": 4.642755031585693, "rewards/rejected": -2.017021656036377, "step": 4696 }, { "epoch": 3.4315981735159817, "grad_norm": 10.058909658267321, "learning_rate": 2.994404499128231e-08, "logits/chosen": -2.9320435523986816, "logits/rejected": -3.022972583770752, "logps/chosen": -678.158447265625, "logps/rejected": -664.875, "loss": 0.0402, "rewards/accuracies": 1.0, "rewards/chosen": 3.788140296936035, "rewards/margins": 4.524948596954346, "rewards/rejected": -0.7368083596229553, "step": 4697 }, { "epoch": 3.4323287671232876, "grad_norm": 7.200490341443877, "learning_rate": 2.9868395717770416e-08, "logits/chosen": -2.434274911880493, "logits/rejected": -2.653441905975342, "logps/chosen": -620.6907958984375, "logps/rejected": -601.7142333984375, "loss": 0.0309, "rewards/accuracies": 1.0, "rewards/chosen": 2.3483011722564697, "rewards/margins": 6.413158416748047, "rewards/rejected": -4.064857482910156, "step": 4698 }, { "epoch": 3.4330593607305935, "grad_norm": 7.885992945413242, "learning_rate": 2.9792836052105197e-08, "logits/chosen": -2.878103733062744, "logits/rejected": -2.758528232574463, "logps/chosen": -581.769287109375, "logps/rejected": -663.1942138671875, "loss": 0.0281, "rewards/accuracies": 1.0, "rewards/chosen": 4.260732650756836, "rewards/margins": 7.387233734130859, "rewards/rejected": -3.1265017986297607, "step": 4699 }, { "epoch": 3.4337899543378994, "grad_norm": 11.691733832697716, "learning_rate": 2.9717366025044264e-08, "logits/chosen": -2.6408908367156982, "logits/rejected": -1.9369155168533325, "logps/chosen": -433.0059814453125, "logps/rejected": -402.50408935546875, "loss": 0.037, "rewards/accuracies": 1.0, "rewards/chosen": 2.6148509979248047, "rewards/margins": 7.639563083648682, "rewards/rejected": -5.024711608886719, "step": 4700 }, { "epoch": 3.4345205479452057, "grad_norm": 4.612427402328198, "learning_rate": 2.96419856673088e-08, "logits/chosen": -2.4589877128601074, "logits/rejected": -2.327674388885498, "logps/chosen": -616.4097290039062, "logps/rejected": -623.869384765625, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": 2.5338354110717773, "rewards/margins": 6.024822235107422, "rewards/rejected": -3.4909865856170654, "step": 4701 }, { "epoch": 3.4352511415525115, "grad_norm": 4.99702594044815, "learning_rate": 2.9566695009583504e-08, "logits/chosen": -2.8894505500793457, "logits/rejected": -2.09844708442688, "logps/chosen": -521.0943603515625, "logps/rejected": -445.93597412109375, "loss": 0.0213, "rewards/accuracies": 1.0, "rewards/chosen": 2.11775541305542, "rewards/margins": 6.160772323608398, "rewards/rejected": -4.04301643371582, "step": 4702 }, { "epoch": 3.4359817351598174, "grad_norm": 8.350526950678256, "learning_rate": 2.949149408251658e-08, "logits/chosen": -2.897244453430176, "logits/rejected": -2.313723087310791, "logps/chosen": -706.38671875, "logps/rejected": -730.8945922851562, "loss": 0.0322, "rewards/accuracies": 1.0, "rewards/chosen": 4.255495071411133, "rewards/margins": 6.723208904266357, "rewards/rejected": -2.4677138328552246, "step": 4703 }, { "epoch": 3.4367123287671233, "grad_norm": 6.489859961363977, "learning_rate": 2.9416382916719744e-08, "logits/chosen": -2.3679986000061035, "logits/rejected": -2.2951207160949707, "logps/chosen": -526.3023071289062, "logps/rejected": -596.0227661132812, "loss": 0.0425, "rewards/accuracies": 1.0, "rewards/chosen": 3.5504486560821533, "rewards/margins": 6.5993242263793945, "rewards/rejected": -3.0488758087158203, "step": 4704 }, { "epoch": 3.437442922374429, "grad_norm": 7.677362472311048, "learning_rate": 2.9341361542768032e-08, "logits/chosen": -2.735999822616577, "logits/rejected": -2.624332904815674, "logps/chosen": -620.8521118164062, "logps/rejected": -642.6522216796875, "loss": 0.0234, "rewards/accuracies": 1.0, "rewards/chosen": 3.50175142288208, "rewards/margins": 5.6560821533203125, "rewards/rejected": -2.1543309688568115, "step": 4705 }, { "epoch": 3.438173515981735, "grad_norm": 4.728403688072532, "learning_rate": 2.9266429991200076e-08, "logits/chosen": -2.6103382110595703, "logits/rejected": -3.044511318206787, "logps/chosen": -429.7598876953125, "logps/rejected": -780.6627197265625, "loss": 0.0185, "rewards/accuracies": 1.0, "rewards/chosen": 1.253700613975525, "rewards/margins": 5.024890899658203, "rewards/rejected": -3.7711901664733887, "step": 4706 }, { "epoch": 3.438904109589041, "grad_norm": 8.979848554052063, "learning_rate": 2.9191588292517748e-08, "logits/chosen": -3.414492130279541, "logits/rejected": -2.212475299835205, "logps/chosen": -901.89404296875, "logps/rejected": -571.06884765625, "loss": 0.0633, "rewards/accuracies": 1.0, "rewards/chosen": 5.145528316497803, "rewards/margins": 7.027048110961914, "rewards/rejected": -1.8815197944641113, "step": 4707 }, { "epoch": 3.4396347031963472, "grad_norm": 4.579019817095309, "learning_rate": 2.9116836477186658e-08, "logits/chosen": -2.9316864013671875, "logits/rejected": -1.7541754245758057, "logps/chosen": -747.0359497070312, "logps/rejected": -493.2976989746094, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": 4.708258152008057, "rewards/margins": 8.122072219848633, "rewards/rejected": -3.413814067840576, "step": 4708 }, { "epoch": 3.440365296803653, "grad_norm": 9.684446579655402, "learning_rate": 2.9042174575635543e-08, "logits/chosen": -3.1709811687469482, "logits/rejected": -2.0809664726257324, "logps/chosen": -770.6696166992188, "logps/rejected": -506.54510498046875, "loss": 0.0733, "rewards/accuracies": 1.0, "rewards/chosen": 4.233410835266113, "rewards/margins": 6.878383636474609, "rewards/rejected": -2.644972801208496, "step": 4709 }, { "epoch": 3.441095890410959, "grad_norm": 4.2874305830693835, "learning_rate": 2.896760261825659e-08, "logits/chosen": -3.068474054336548, "logits/rejected": -1.8400540351867676, "logps/chosen": -562.9384765625, "logps/rejected": -533.7521362304688, "loss": 0.0146, "rewards/accuracies": 1.0, "rewards/chosen": 2.3060951232910156, "rewards/margins": 7.266623497009277, "rewards/rejected": -4.960528373718262, "step": 4710 }, { "epoch": 3.441826484018265, "grad_norm": 3.3021666378901187, "learning_rate": 2.8893120635405525e-08, "logits/chosen": -3.2945475578308105, "logits/rejected": -2.004859447479248, "logps/chosen": -665.7957153320312, "logps/rejected": -447.602294921875, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": 5.0865478515625, "rewards/margins": 8.014801025390625, "rewards/rejected": -2.928252696990967, "step": 4711 }, { "epoch": 3.4425570776255707, "grad_norm": 8.318424264905978, "learning_rate": 2.8818728657401286e-08, "logits/chosen": -2.5250232219696045, "logits/rejected": -2.6845481395721436, "logps/chosen": -316.0293273925781, "logps/rejected": -512.135498046875, "loss": 0.0213, "rewards/accuracies": 1.0, "rewards/chosen": 2.1204724311828613, "rewards/margins": 7.0326457023620605, "rewards/rejected": -4.912173271179199, "step": 4712 }, { "epoch": 3.4432876712328766, "grad_norm": 6.2938740207609, "learning_rate": 2.8744426714526315e-08, "logits/chosen": -2.7122929096221924, "logits/rejected": -1.4797930717468262, "logps/chosen": -808.4685668945312, "logps/rejected": -438.31658935546875, "loss": 0.0279, "rewards/accuracies": 1.0, "rewards/chosen": 3.8381595611572266, "rewards/margins": 7.677117347717285, "rewards/rejected": -3.8389573097229004, "step": 4713 }, { "epoch": 3.4440182648401825, "grad_norm": 5.248218960881162, "learning_rate": 2.8670214837026252e-08, "logits/chosen": -3.3907289505004883, "logits/rejected": -2.074474573135376, "logps/chosen": -1013.945068359375, "logps/rejected": -650.8131713867188, "loss": 0.0224, "rewards/accuracies": 1.0, "rewards/chosen": 3.6957483291625977, "rewards/margins": 5.458545684814453, "rewards/rejected": -1.7627977132797241, "step": 4714 }, { "epoch": 3.444748858447489, "grad_norm": 4.657392528070021, "learning_rate": 2.8596093055110127e-08, "logits/chosen": -3.390404462814331, "logits/rejected": -1.4171479940414429, "logps/chosen": -783.7698364257812, "logps/rejected": -244.77639770507812, "loss": 0.0207, "rewards/accuracies": 1.0, "rewards/chosen": 4.374524116516113, "rewards/margins": 6.8634934425354, "rewards/rejected": -2.488969564437866, "step": 4715 }, { "epoch": 3.4454794520547947, "grad_norm": 5.43448195612698, "learning_rate": 2.8522061398950386e-08, "logits/chosen": -3.074479103088379, "logits/rejected": -2.8288731575012207, "logps/chosen": -753.4301147460938, "logps/rejected": -795.4230346679688, "loss": 0.023, "rewards/accuracies": 1.0, "rewards/chosen": 2.9706356525421143, "rewards/margins": 6.316184043884277, "rewards/rejected": -3.345548391342163, "step": 4716 }, { "epoch": 3.4462100456621005, "grad_norm": 7.212524371709256, "learning_rate": 2.844811989868265e-08, "logits/chosen": -1.9610382318496704, "logits/rejected": -2.5563652515411377, "logps/chosen": -444.8179016113281, "logps/rejected": -705.6219482421875, "loss": 0.0548, "rewards/accuracies": 0.875, "rewards/chosen": 0.6164277791976929, "rewards/margins": 4.130039215087891, "rewards/rejected": -3.513611316680908, "step": 4717 }, { "epoch": 3.4469406392694064, "grad_norm": 3.82903145159449, "learning_rate": 2.837426858440603e-08, "logits/chosen": -3.0916030406951904, "logits/rejected": -1.816388487815857, "logps/chosen": -574.9884033203125, "logps/rejected": -421.6762390136719, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": 2.6675853729248047, "rewards/margins": 7.854227542877197, "rewards/rejected": -5.186642646789551, "step": 4718 }, { "epoch": 3.4476712328767123, "grad_norm": 3.646136258574118, "learning_rate": 2.8300507486182596e-08, "logits/chosen": -3.006452798843384, "logits/rejected": -1.6687817573547363, "logps/chosen": -924.8726806640625, "logps/rejected": -574.1134643554688, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": 5.662119388580322, "rewards/margins": 8.076388359069824, "rewards/rejected": -2.414268732070923, "step": 4719 }, { "epoch": 3.448401826484018, "grad_norm": 6.417325898362262, "learning_rate": 2.8226836634038048e-08, "logits/chosen": -2.4415862560272217, "logits/rejected": -2.370286226272583, "logps/chosen": -383.3511047363281, "logps/rejected": -390.97589111328125, "loss": 0.0255, "rewards/accuracies": 1.0, "rewards/chosen": 1.1879372596740723, "rewards/margins": 3.7544124126434326, "rewards/rejected": -2.5664749145507812, "step": 4720 }, { "epoch": 3.449132420091324, "grad_norm": 4.23935960683906, "learning_rate": 2.8153256057961207e-08, "logits/chosen": -2.707939863204956, "logits/rejected": -2.230090618133545, "logps/chosen": -430.45355224609375, "logps/rejected": -435.77886962890625, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": 4.357409954071045, "rewards/margins": 8.374523162841797, "rewards/rejected": -4.017113208770752, "step": 4721 }, { "epoch": 3.4498630136986304, "grad_norm": 4.54036999778618, "learning_rate": 2.8079765787904107e-08, "logits/chosen": -2.5254180431365967, "logits/rejected": -1.9119741916656494, "logps/chosen": -592.2894287109375, "logps/rejected": -510.353271484375, "loss": 0.0252, "rewards/accuracies": 1.0, "rewards/chosen": 2.5094282627105713, "rewards/margins": 7.2227888107299805, "rewards/rejected": -4.713360786437988, "step": 4722 }, { "epoch": 3.4505936073059362, "grad_norm": 5.933704970361517, "learning_rate": 2.800636585378205e-08, "logits/chosen": -3.1425294876098633, "logits/rejected": -2.3713951110839844, "logps/chosen": -545.7406005859375, "logps/rejected": -422.6382141113281, "loss": 0.0282, "rewards/accuracies": 1.0, "rewards/chosen": 5.477487087249756, "rewards/margins": 7.371547698974609, "rewards/rejected": -1.8940601348876953, "step": 4723 }, { "epoch": 3.451324200913242, "grad_norm": 10.275966968190763, "learning_rate": 2.7933056285473543e-08, "logits/chosen": -2.229477643966675, "logits/rejected": -2.268684148788452, "logps/chosen": -473.6647033691406, "logps/rejected": -496.0856018066406, "loss": 0.0681, "rewards/accuracies": 1.0, "rewards/chosen": 2.6644022464752197, "rewards/margins": 4.183189868927002, "rewards/rejected": -1.5187876224517822, "step": 4724 }, { "epoch": 3.452054794520548, "grad_norm": 11.138115513294878, "learning_rate": 2.7859837112820418e-08, "logits/chosen": -2.5352509021759033, "logits/rejected": -2.0028076171875, "logps/chosen": -609.3284301757812, "logps/rejected": -541.165771484375, "loss": 0.0513, "rewards/accuracies": 1.0, "rewards/chosen": 2.680838108062744, "rewards/margins": 5.6361308097839355, "rewards/rejected": -2.9552927017211914, "step": 4725 }, { "epoch": 3.452785388127854, "grad_norm": 8.33214067189636, "learning_rate": 2.778670836562752e-08, "logits/chosen": -2.5738155841827393, "logits/rejected": -2.360872745513916, "logps/chosen": -535.39453125, "logps/rejected": -518.9635009765625, "loss": 0.0438, "rewards/accuracies": 1.0, "rewards/chosen": 2.889437675476074, "rewards/margins": 6.312263488769531, "rewards/rejected": -3.422826051712036, "step": 4726 }, { "epoch": 3.4535159817351597, "grad_norm": 8.958415280935748, "learning_rate": 2.7713670073663127e-08, "logits/chosen": -2.694943904876709, "logits/rejected": -1.736358880996704, "logps/chosen": -690.6004638671875, "logps/rejected": -543.5446166992188, "loss": 0.0385, "rewards/accuracies": 1.0, "rewards/chosen": 4.501749038696289, "rewards/margins": 8.58229923248291, "rewards/rejected": -4.080550193786621, "step": 4727 }, { "epoch": 3.4542465753424656, "grad_norm": 3.934771606719493, "learning_rate": 2.764072226665848e-08, "logits/chosen": -2.762861728668213, "logits/rejected": -2.3802523612976074, "logps/chosen": -539.5117797851562, "logps/rejected": -554.0221557617188, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": 3.518372058868408, "rewards/margins": 6.305584907531738, "rewards/rejected": -2.78721284866333, "step": 4728 }, { "epoch": 3.454977168949772, "grad_norm": 9.317833928213656, "learning_rate": 2.7567864974308047e-08, "logits/chosen": -2.256573438644409, "logits/rejected": -2.2052698135375977, "logps/chosen": -528.1194458007812, "logps/rejected": -525.9132690429688, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": 2.1518447399139404, "rewards/margins": 6.341968059539795, "rewards/rejected": -4.190123558044434, "step": 4729 }, { "epoch": 3.455707762557078, "grad_norm": 6.521799066793376, "learning_rate": 2.7495098226269553e-08, "logits/chosen": -2.9157016277313232, "logits/rejected": -2.39542293548584, "logps/chosen": -341.6595458984375, "logps/rejected": -258.2449645996094, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": 1.4637796878814697, "rewards/margins": 3.9882872104644775, "rewards/rejected": -2.524507761001587, "step": 4730 }, { "epoch": 3.4564383561643837, "grad_norm": 3.7011011444016644, "learning_rate": 2.7422422052163774e-08, "logits/chosen": -3.3010194301605225, "logits/rejected": -1.9481983184814453, "logps/chosen": -764.4495239257812, "logps/rejected": -390.32525634765625, "loss": 0.0352, "rewards/accuracies": 1.0, "rewards/chosen": 4.8751373291015625, "rewards/margins": 5.4774088859558105, "rewards/rejected": -0.6022716164588928, "step": 4731 }, { "epoch": 3.4571689497716895, "grad_norm": 5.882057009714148, "learning_rate": 2.7349836481574518e-08, "logits/chosen": -2.396658182144165, "logits/rejected": -2.1703782081604004, "logps/chosen": -636.356201171875, "logps/rejected": -551.6222534179688, "loss": 0.0298, "rewards/accuracies": 1.0, "rewards/chosen": 2.309788942337036, "rewards/margins": 6.689231872558594, "rewards/rejected": -4.379443168640137, "step": 4732 }, { "epoch": 3.4578995433789954, "grad_norm": 13.913327728353217, "learning_rate": 2.7277341544048954e-08, "logits/chosen": -2.914914608001709, "logits/rejected": -2.5355443954467773, "logps/chosen": -754.6004638671875, "logps/rejected": -593.32763671875, "loss": 0.0641, "rewards/accuracies": 1.0, "rewards/chosen": 4.047469615936279, "rewards/margins": 5.200981140136719, "rewards/rejected": -1.1535115242004395, "step": 4733 }, { "epoch": 3.4586301369863013, "grad_norm": 5.0642770544620275, "learning_rate": 2.7204937269097117e-08, "logits/chosen": -2.3681111335754395, "logits/rejected": -2.2249302864074707, "logps/chosen": -724.4046020507812, "logps/rejected": -646.6983642578125, "loss": 0.0236, "rewards/accuracies": 1.0, "rewards/chosen": 2.6188220977783203, "rewards/margins": 4.640141487121582, "rewards/rejected": -2.0213193893432617, "step": 4734 }, { "epoch": 3.459360730593607, "grad_norm": 6.305386907457335, "learning_rate": 2.713262368619243e-08, "logits/chosen": -2.835934638977051, "logits/rejected": -2.182252883911133, "logps/chosen": -648.6450805664062, "logps/rejected": -465.167724609375, "loss": 0.0542, "rewards/accuracies": 1.0, "rewards/chosen": 5.114522933959961, "rewards/margins": 7.871886253356934, "rewards/rejected": -2.7573633193969727, "step": 4735 }, { "epoch": 3.4600913242009135, "grad_norm": 7.964682584425716, "learning_rate": 2.7060400824770957e-08, "logits/chosen": -2.664607286453247, "logits/rejected": -2.372692346572876, "logps/chosen": -552.1489868164062, "logps/rejected": -474.10357666015625, "loss": 0.034, "rewards/accuracies": 1.0, "rewards/chosen": 2.691545248031616, "rewards/margins": 5.646538734436035, "rewards/rejected": -2.9549930095672607, "step": 4736 }, { "epoch": 3.4608219178082194, "grad_norm": 9.09300160687637, "learning_rate": 2.6988268714232236e-08, "logits/chosen": -2.4103832244873047, "logits/rejected": -1.977738618850708, "logps/chosen": -554.7673950195312, "logps/rejected": -554.2027587890625, "loss": 0.0418, "rewards/accuracies": 1.0, "rewards/chosen": 2.7513058185577393, "rewards/margins": 5.8053178787231445, "rewards/rejected": -3.0540122985839844, "step": 4737 }, { "epoch": 3.4615525114155252, "grad_norm": 5.90389144065729, "learning_rate": 2.6916227383938727e-08, "logits/chosen": -2.533078670501709, "logits/rejected": -1.8588697910308838, "logps/chosen": -653.4017333984375, "logps/rejected": -400.36517333984375, "loss": 0.0332, "rewards/accuracies": 1.0, "rewards/chosen": 3.93662428855896, "rewards/margins": 7.080093860626221, "rewards/rejected": -3.1434693336486816, "step": 4738 }, { "epoch": 3.462283105022831, "grad_norm": 7.298951729536075, "learning_rate": 2.684427686321586e-08, "logits/chosen": -2.6467061042785645, "logits/rejected": -1.5527145862579346, "logps/chosen": -590.1481323242188, "logps/rejected": -407.6204528808594, "loss": 0.0288, "rewards/accuracies": 1.0, "rewards/chosen": 4.794213771820068, "rewards/margins": 8.263993263244629, "rewards/rejected": -3.4697792530059814, "step": 4739 }, { "epoch": 3.463013698630137, "grad_norm": 5.074454803497059, "learning_rate": 2.6772417181352312e-08, "logits/chosen": -3.3202409744262695, "logits/rejected": -2.4675567150115967, "logps/chosen": -606.6901245117188, "logps/rejected": -533.5836181640625, "loss": 0.0166, "rewards/accuracies": 1.0, "rewards/chosen": 3.2468748092651367, "rewards/margins": 4.842694282531738, "rewards/rejected": -1.5958197116851807, "step": 4740 }, { "epoch": 3.463744292237443, "grad_norm": 5.490118165959299, "learning_rate": 2.6700648367599414e-08, "logits/chosen": -3.035444974899292, "logits/rejected": -1.560113787651062, "logps/chosen": -648.0476684570312, "logps/rejected": -309.59442138671875, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": 3.3956215381622314, "rewards/margins": 6.028363227844238, "rewards/rejected": -2.632742166519165, "step": 4741 }, { "epoch": 3.4644748858447487, "grad_norm": 5.8687737120128, "learning_rate": 2.6628970451171908e-08, "logits/chosen": -2.728084087371826, "logits/rejected": -1.9201809167861938, "logps/chosen": -660.6769409179688, "logps/rejected": -546.1866455078125, "loss": 0.0323, "rewards/accuracies": 1.0, "rewards/chosen": 3.081834316253662, "rewards/margins": 5.973109722137451, "rewards/rejected": -2.891275405883789, "step": 4742 }, { "epoch": 3.465205479452055, "grad_norm": 4.839906804472227, "learning_rate": 2.6557383461247223e-08, "logits/chosen": -2.659414052963257, "logits/rejected": -2.198390007019043, "logps/chosen": -489.5558776855469, "logps/rejected": -376.30987548828125, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": 2.41568660736084, "rewards/margins": 6.025609016418457, "rewards/rejected": -3.6099226474761963, "step": 4743 }, { "epoch": 3.465936073059361, "grad_norm": 26.70766400129839, "learning_rate": 2.6485887426966032e-08, "logits/chosen": -2.8210232257843018, "logits/rejected": -1.9447941780090332, "logps/chosen": -314.9102478027344, "logps/rejected": -290.19134521484375, "loss": 0.0766, "rewards/accuracies": 1.0, "rewards/chosen": 1.9611625671386719, "rewards/margins": 5.139019012451172, "rewards/rejected": -3.177855968475342, "step": 4744 }, { "epoch": 3.466666666666667, "grad_norm": 6.658235209111609, "learning_rate": 2.6414482377431796e-08, "logits/chosen": -3.0776660442352295, "logits/rejected": -2.6031265258789062, "logps/chosen": -598.2825927734375, "logps/rejected": -549.0587768554688, "loss": 0.024, "rewards/accuracies": 1.0, "rewards/chosen": 3.1439998149871826, "rewards/margins": 5.056101322174072, "rewards/rejected": -1.9121016263961792, "step": 4745 }, { "epoch": 3.4673972602739727, "grad_norm": 8.00452914776637, "learning_rate": 2.634316834171099e-08, "logits/chosen": -3.1835155487060547, "logits/rejected": -2.165841817855835, "logps/chosen": -591.2145385742188, "logps/rejected": -433.954833984375, "loss": 0.0399, "rewards/accuracies": 1.0, "rewards/chosen": 3.2876908779144287, "rewards/margins": 7.4467267990112305, "rewards/rejected": -4.159035682678223, "step": 4746 }, { "epoch": 3.4681278538812785, "grad_norm": 8.45871627266486, "learning_rate": 2.627194534883309e-08, "logits/chosen": -2.7681219577789307, "logits/rejected": -2.288119316101074, "logps/chosen": -700.1022338867188, "logps/rejected": -587.28369140625, "loss": 0.0288, "rewards/accuracies": 1.0, "rewards/chosen": 4.004179954528809, "rewards/margins": 6.561548233032227, "rewards/rejected": -2.557368278503418, "step": 4747 }, { "epoch": 3.4688584474885844, "grad_norm": 4.3801502975386954, "learning_rate": 2.6200813427790487e-08, "logits/chosen": -2.6338112354278564, "logits/rejected": -2.203057050704956, "logps/chosen": -598.1353759765625, "logps/rejected": -486.0689697265625, "loss": 0.0239, "rewards/accuracies": 1.0, "rewards/chosen": 4.094091892242432, "rewards/margins": 7.271425247192383, "rewards/rejected": -3.177332878112793, "step": 4748 }, { "epoch": 3.4695890410958903, "grad_norm": 9.707016316638413, "learning_rate": 2.612977260753843e-08, "logits/chosen": -2.619659900665283, "logits/rejected": -2.0531795024871826, "logps/chosen": -679.5992431640625, "logps/rejected": -608.50439453125, "loss": 0.0311, "rewards/accuracies": 1.0, "rewards/chosen": 3.320589542388916, "rewards/margins": 6.4335832595825195, "rewards/rejected": -3.1129941940307617, "step": 4749 }, { "epoch": 3.470319634703196, "grad_norm": 5.42178828365227, "learning_rate": 2.605882291699521e-08, "logits/chosen": -3.3303439617156982, "logits/rejected": -2.030555248260498, "logps/chosen": -572.5025634765625, "logps/rejected": -333.92681884765625, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": 3.443058729171753, "rewards/margins": 6.175775527954102, "rewards/rejected": -2.7327165603637695, "step": 4750 }, { "epoch": 3.4710502283105025, "grad_norm": 11.148191474707378, "learning_rate": 2.598796438504186e-08, "logits/chosen": -2.2221522331237793, "logits/rejected": -1.8306734561920166, "logps/chosen": -463.71978759765625, "logps/rejected": -450.5806884765625, "loss": 0.0527, "rewards/accuracies": 1.0, "rewards/chosen": 3.9138081073760986, "rewards/margins": 6.982629299163818, "rewards/rejected": -3.0688207149505615, "step": 4751 }, { "epoch": 3.4717808219178083, "grad_norm": 6.170817378725612, "learning_rate": 2.5917197040522532e-08, "logits/chosen": -3.2040605545043945, "logits/rejected": -2.1597445011138916, "logps/chosen": -491.24444580078125, "logps/rejected": -369.57733154296875, "loss": 0.0223, "rewards/accuracies": 1.0, "rewards/chosen": 3.548168659210205, "rewards/margins": 6.05075216293335, "rewards/rejected": -2.5025839805603027, "step": 4752 }, { "epoch": 3.472511415525114, "grad_norm": 4.611425945661422, "learning_rate": 2.584652091224404e-08, "logits/chosen": -2.5859265327453613, "logits/rejected": -2.4708049297332764, "logps/chosen": -426.20281982421875, "logps/rejected": -427.2279052734375, "loss": 0.0397, "rewards/accuracies": 1.0, "rewards/chosen": 2.598757743835449, "rewards/margins": 4.548306465148926, "rewards/rejected": -1.9495489597320557, "step": 4753 }, { "epoch": 3.47324200913242, "grad_norm": 10.28180653651375, "learning_rate": 2.577593602897618e-08, "logits/chosen": -3.0230064392089844, "logits/rejected": -2.692741870880127, "logps/chosen": -797.4297485351562, "logps/rejected": -832.50537109375, "loss": 0.0377, "rewards/accuracies": 1.0, "rewards/chosen": 3.074415922164917, "rewards/margins": 5.855291843414307, "rewards/rejected": -2.7808761596679688, "step": 4754 }, { "epoch": 3.473972602739726, "grad_norm": 5.499490194101589, "learning_rate": 2.5705442419451522e-08, "logits/chosen": -2.294041395187378, "logits/rejected": -1.856673002243042, "logps/chosen": -595.5435180664062, "logps/rejected": -487.6822509765625, "loss": 0.0204, "rewards/accuracies": 1.0, "rewards/chosen": 3.224036693572998, "rewards/margins": 6.862190246582031, "rewards/rejected": -3.6381537914276123, "step": 4755 }, { "epoch": 3.474703196347032, "grad_norm": 6.042448251965882, "learning_rate": 2.5635040112365558e-08, "logits/chosen": -2.8316733837127686, "logits/rejected": -2.0146560668945312, "logps/chosen": -429.7247314453125, "logps/rejected": -315.661865234375, "loss": 0.0244, "rewards/accuracies": 1.0, "rewards/chosen": 2.8049063682556152, "rewards/margins": 5.638545989990234, "rewards/rejected": -2.833639621734619, "step": 4756 }, { "epoch": 3.4754337899543377, "grad_norm": 3.352339647797755, "learning_rate": 2.556472913637675e-08, "logits/chosen": -3.097761631011963, "logits/rejected": -1.9482543468475342, "logps/chosen": -911.6105346679688, "logps/rejected": -617.138427734375, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": 4.803898334503174, "rewards/margins": 8.230131149291992, "rewards/rejected": -3.4262330532073975, "step": 4757 }, { "epoch": 3.4761643835616436, "grad_norm": 7.787527960517371, "learning_rate": 2.549450952010601e-08, "logits/chosen": -2.3590564727783203, "logits/rejected": -1.500274896621704, "logps/chosen": -496.21087646484375, "logps/rejected": -366.6659240722656, "loss": 0.031, "rewards/accuracies": 1.0, "rewards/chosen": 2.415152072906494, "rewards/margins": 5.557201862335205, "rewards/rejected": -3.142049789428711, "step": 4758 }, { "epoch": 3.47689497716895, "grad_norm": 12.678441915483814, "learning_rate": 2.542438129213742e-08, "logits/chosen": -2.817049980163574, "logits/rejected": -2.7971298694610596, "logps/chosen": -894.1122436523438, "logps/rejected": -920.0647583007812, "loss": 0.0539, "rewards/accuracies": 1.0, "rewards/chosen": 4.102757453918457, "rewards/margins": 6.243647575378418, "rewards/rejected": -2.140890598297119, "step": 4759 }, { "epoch": 3.477625570776256, "grad_norm": 11.536147651478753, "learning_rate": 2.5354344481017613e-08, "logits/chosen": -2.4541091918945312, "logits/rejected": -2.1853911876678467, "logps/chosen": -404.16485595703125, "logps/rejected": -381.6222229003906, "loss": 0.0688, "rewards/accuracies": 1.0, "rewards/chosen": 1.9570996761322021, "rewards/margins": 5.187094688415527, "rewards/rejected": -3.2299952507019043, "step": 4760 }, { "epoch": 3.4783561643835617, "grad_norm": 6.245919383217365, "learning_rate": 2.5284399115256205e-08, "logits/chosen": -2.5137791633605957, "logits/rejected": -1.870680332183838, "logps/chosen": -376.2164306640625, "logps/rejected": -364.6240539550781, "loss": 0.027, "rewards/accuracies": 1.0, "rewards/chosen": 3.181250810623169, "rewards/margins": 6.40993595123291, "rewards/rejected": -3.228684902191162, "step": 4761 }, { "epoch": 3.4790867579908675, "grad_norm": 8.974459967132294, "learning_rate": 2.52145452233255e-08, "logits/chosen": -2.4932148456573486, "logits/rejected": -2.1628916263580322, "logps/chosen": -558.50146484375, "logps/rejected": -598.94482421875, "loss": 0.0365, "rewards/accuracies": 1.0, "rewards/chosen": 3.73996639251709, "rewards/margins": 4.478232383728027, "rewards/rejected": -0.738266110420227, "step": 4762 }, { "epoch": 3.4798173515981734, "grad_norm": 9.601642356894315, "learning_rate": 2.514478283366045e-08, "logits/chosen": -2.221230983734131, "logits/rejected": -2.4031295776367188, "logps/chosen": -826.7105712890625, "logps/rejected": -733.7662353515625, "loss": 0.0438, "rewards/accuracies": 1.0, "rewards/chosen": 2.1567368507385254, "rewards/margins": 3.8110830783843994, "rewards/rejected": -1.6543464660644531, "step": 4763 }, { "epoch": 3.4805479452054793, "grad_norm": 3.5739932796882585, "learning_rate": 2.5075111974659e-08, "logits/chosen": -3.2064483165740967, "logits/rejected": -2.589770793914795, "logps/chosen": -893.3709716796875, "logps/rejected": -739.187744140625, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": 4.2544636726379395, "rewards/margins": 4.899865627288818, "rewards/rejected": -0.6454017758369446, "step": 4764 }, { "epoch": 3.481278538812785, "grad_norm": 6.40942274472938, "learning_rate": 2.500553267468164e-08, "logits/chosen": -2.496169328689575, "logits/rejected": -2.263324499130249, "logps/chosen": -555.5203857421875, "logps/rejected": -680.5704956054688, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": 3.870039463043213, "rewards/margins": 6.644251346588135, "rewards/rejected": -2.774211883544922, "step": 4765 }, { "epoch": 3.4820091324200915, "grad_norm": 6.879816760794532, "learning_rate": 2.4936044962051733e-08, "logits/chosen": -2.5882601737976074, "logits/rejected": -1.6724027395248413, "logps/chosen": -448.15081787109375, "logps/rejected": -282.77520751953125, "loss": 0.0247, "rewards/accuracies": 1.0, "rewards/chosen": 2.6895976066589355, "rewards/margins": 6.9965009689331055, "rewards/rejected": -4.306903839111328, "step": 4766 }, { "epoch": 3.4827397260273973, "grad_norm": 5.255728185200779, "learning_rate": 2.486664886505524e-08, "logits/chosen": -2.914142608642578, "logits/rejected": -2.844327926635742, "logps/chosen": -760.046142578125, "logps/rejected": -815.0352783203125, "loss": 0.0212, "rewards/accuracies": 1.0, "rewards/chosen": 2.1261954307556152, "rewards/margins": 4.579622745513916, "rewards/rejected": -2.4534270763397217, "step": 4767 }, { "epoch": 3.483470319634703, "grad_norm": 4.209768950585005, "learning_rate": 2.4797344411940813e-08, "logits/chosen": -2.750885486602783, "logits/rejected": -1.601944923400879, "logps/chosen": -615.7374267578125, "logps/rejected": -388.82220458984375, "loss": 0.0196, "rewards/accuracies": 1.0, "rewards/chosen": 3.262396812438965, "rewards/margins": 6.520709037780762, "rewards/rejected": -3.258312463760376, "step": 4768 }, { "epoch": 3.484200913242009, "grad_norm": 5.363134194599481, "learning_rate": 2.472813163091997e-08, "logits/chosen": -2.3484978675842285, "logits/rejected": -1.7522428035736084, "logps/chosen": -928.0269165039062, "logps/rejected": -765.35986328125, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": 3.9461443424224854, "rewards/margins": 7.2282819747924805, "rewards/rejected": -3.282137870788574, "step": 4769 }, { "epoch": 3.484931506849315, "grad_norm": 5.6377253383760015, "learning_rate": 2.4659010550166755e-08, "logits/chosen": -2.8368608951568604, "logits/rejected": -2.0581154823303223, "logps/chosen": -773.9212646484375, "logps/rejected": -508.86480712890625, "loss": 0.0244, "rewards/accuracies": 1.0, "rewards/chosen": 3.0545477867126465, "rewards/margins": 6.126729488372803, "rewards/rejected": -3.072181463241577, "step": 4770 }, { "epoch": 3.485662100456621, "grad_norm": 8.40648052367469, "learning_rate": 2.458998119781794e-08, "logits/chosen": -2.9102325439453125, "logits/rejected": -2.1580748558044434, "logps/chosen": -546.7233276367188, "logps/rejected": -344.48016357421875, "loss": 0.0204, "rewards/accuracies": 1.0, "rewards/chosen": 3.4486284255981445, "rewards/margins": 6.85544490814209, "rewards/rejected": -3.406815528869629, "step": 4771 }, { "epoch": 3.4863926940639267, "grad_norm": 7.722557457005925, "learning_rate": 2.452104360197288e-08, "logits/chosen": -2.9078571796417236, "logits/rejected": -2.42838454246521, "logps/chosen": -504.3341369628906, "logps/rejected": -298.0833740234375, "loss": 0.0333, "rewards/accuracies": 1.0, "rewards/chosen": 2.5286598205566406, "rewards/margins": 5.867530822753906, "rewards/rejected": -3.3388710021972656, "step": 4772 }, { "epoch": 3.487123287671233, "grad_norm": 2.8764848606421376, "learning_rate": 2.445219779069374e-08, "logits/chosen": -2.608452081680298, "logits/rejected": -2.1396820545196533, "logps/chosen": -606.7113037109375, "logps/rejected": -467.49102783203125, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": 4.840876579284668, "rewards/margins": 8.435277938842773, "rewards/rejected": -3.5944020748138428, "step": 4773 }, { "epoch": 3.487853881278539, "grad_norm": 6.870965224512134, "learning_rate": 2.43834437920053e-08, "logits/chosen": -2.529557228088379, "logits/rejected": -2.3052361011505127, "logps/chosen": -411.2100524902344, "logps/rejected": -491.58099365234375, "loss": 0.0304, "rewards/accuracies": 1.0, "rewards/chosen": 2.5445425510406494, "rewards/margins": 5.028649806976318, "rewards/rejected": -2.484107255935669, "step": 4774 }, { "epoch": 3.4885844748858448, "grad_norm": 6.506812120646229, "learning_rate": 2.4314781633894695e-08, "logits/chosen": -2.134371280670166, "logits/rejected": -2.3120369911193848, "logps/chosen": -432.07275390625, "logps/rejected": -630.158447265625, "loss": 0.0274, "rewards/accuracies": 1.0, "rewards/chosen": 2.3277103900909424, "rewards/margins": 6.953245162963867, "rewards/rejected": -4.6255340576171875, "step": 4775 }, { "epoch": 3.4893150684931507, "grad_norm": 5.581322701761321, "learning_rate": 2.424621134431204e-08, "logits/chosen": -2.946362018585205, "logits/rejected": -1.942017912864685, "logps/chosen": -441.07177734375, "logps/rejected": -438.4039611816406, "loss": 0.0245, "rewards/accuracies": 1.0, "rewards/chosen": 3.099517822265625, "rewards/margins": 8.096325874328613, "rewards/rejected": -4.996808052062988, "step": 4776 }, { "epoch": 3.4900456621004565, "grad_norm": 6.604427424757328, "learning_rate": 2.417773295116979e-08, "logits/chosen": -2.77592396736145, "logits/rejected": -2.583927869796753, "logps/chosen": -228.96759033203125, "logps/rejected": -287.51007080078125, "loss": 0.0528, "rewards/accuracies": 1.0, "rewards/chosen": -0.10026400536298752, "rewards/margins": 3.4733595848083496, "rewards/rejected": -3.5736231803894043, "step": 4777 }, { "epoch": 3.4907762557077624, "grad_norm": 3.5646583309938182, "learning_rate": 2.4109346482343195e-08, "logits/chosen": -2.6119275093078613, "logits/rejected": -2.3110523223876953, "logps/chosen": -669.8340454101562, "logps/rejected": -615.2201538085938, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": 3.6974825859069824, "rewards/margins": 7.082116603851318, "rewards/rejected": -3.384634017944336, "step": 4778 }, { "epoch": 3.4915068493150683, "grad_norm": 5.301972493610095, "learning_rate": 2.404105196566994e-08, "logits/chosen": -2.909029006958008, "logits/rejected": -2.063642978668213, "logps/chosen": -788.30224609375, "logps/rejected": -552.724365234375, "loss": 0.0178, "rewards/accuracies": 1.0, "rewards/chosen": 2.602555513381958, "rewards/margins": 5.6680145263671875, "rewards/rejected": -3.0654592514038086, "step": 4779 }, { "epoch": 3.4922374429223746, "grad_norm": 8.69534145463508, "learning_rate": 2.397284942895028e-08, "logits/chosen": -2.806487798690796, "logits/rejected": -1.6024035215377808, "logps/chosen": -924.0943603515625, "logps/rejected": -506.03399658203125, "loss": 0.0155, "rewards/accuracies": 1.0, "rewards/chosen": 5.755157470703125, "rewards/margins": 8.98987865447998, "rewards/rejected": -3.2347211837768555, "step": 4780 }, { "epoch": 3.4929680365296805, "grad_norm": 6.627192872081627, "learning_rate": 2.3904738899947152e-08, "logits/chosen": -2.7488961219787598, "logits/rejected": -2.178953170776367, "logps/chosen": -503.8096923828125, "logps/rejected": -346.2359313964844, "loss": 0.0344, "rewards/accuracies": 1.0, "rewards/chosen": 2.5054550170898438, "rewards/margins": 4.579532623291016, "rewards/rejected": -2.0740773677825928, "step": 4781 }, { "epoch": 3.4936986301369863, "grad_norm": 5.413001634993237, "learning_rate": 2.3836720406385875e-08, "logits/chosen": -2.8627078533172607, "logits/rejected": -2.2062597274780273, "logps/chosen": -861.04443359375, "logps/rejected": -711.061767578125, "loss": 0.0237, "rewards/accuracies": 1.0, "rewards/chosen": 3.818523406982422, "rewards/margins": 4.9074625968933105, "rewards/rejected": -1.0889393091201782, "step": 4782 }, { "epoch": 3.494429223744292, "grad_norm": 4.998968017890941, "learning_rate": 2.3768793975954495e-08, "logits/chosen": -2.5469629764556885, "logits/rejected": -1.8463854789733887, "logps/chosen": -575.4296875, "logps/rejected": -481.87945556640625, "loss": 0.0168, "rewards/accuracies": 1.0, "rewards/chosen": 4.235841274261475, "rewards/margins": 7.412254810333252, "rewards/rejected": -3.176413059234619, "step": 4783 }, { "epoch": 3.495159817351598, "grad_norm": 3.037274538314531, "learning_rate": 2.370095963630339e-08, "logits/chosen": -2.9587974548339844, "logits/rejected": -2.346578598022461, "logps/chosen": -826.1288452148438, "logps/rejected": -547.2494506835938, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": 4.279499053955078, "rewards/margins": 6.533138751983643, "rewards/rejected": -2.2536396980285645, "step": 4784 }, { "epoch": 3.495890410958904, "grad_norm": 7.464243786394866, "learning_rate": 2.3633217415045565e-08, "logits/chosen": -3.1610798835754395, "logits/rejected": -2.6365652084350586, "logps/chosen": -935.7344360351562, "logps/rejected": -965.3297119140625, "loss": 0.0326, "rewards/accuracies": 1.0, "rewards/chosen": 4.8987345695495605, "rewards/margins": 4.839688301086426, "rewards/rejected": 0.059046268463134766, "step": 4785 }, { "epoch": 3.49662100456621, "grad_norm": 4.806637252068605, "learning_rate": 2.356556733975651e-08, "logits/chosen": -2.599512815475464, "logits/rejected": -2.198166847229004, "logps/chosen": -367.66851806640625, "logps/rejected": -506.5153503417969, "loss": 0.0232, "rewards/accuracies": 1.0, "rewards/chosen": 2.249556541442871, "rewards/margins": 6.059135437011719, "rewards/rejected": -3.8095784187316895, "step": 4786 }, { "epoch": 3.497351598173516, "grad_norm": 4.766410187621304, "learning_rate": 2.3498009437974197e-08, "logits/chosen": -3.027086019515991, "logits/rejected": -1.7096749544143677, "logps/chosen": -758.760009765625, "logps/rejected": -459.86993408203125, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": 4.559970855712891, "rewards/margins": 8.242411613464355, "rewards/rejected": -3.6824402809143066, "step": 4787 }, { "epoch": 3.498082191780822, "grad_norm": 5.520675292242159, "learning_rate": 2.3430543737199048e-08, "logits/chosen": -2.6413114070892334, "logits/rejected": -2.1841633319854736, "logps/chosen": -580.6397705078125, "logps/rejected": -497.4585266113281, "loss": 0.0369, "rewards/accuracies": 1.0, "rewards/chosen": 2.3352415561676025, "rewards/margins": 5.59337043762207, "rewards/rejected": -3.258129358291626, "step": 4788 }, { "epoch": 3.498812785388128, "grad_norm": 5.987095625249267, "learning_rate": 2.3363170264893983e-08, "logits/chosen": -2.70763897895813, "logits/rejected": -2.340169906616211, "logps/chosen": -749.0674438476562, "logps/rejected": -617.951416015625, "loss": 0.0237, "rewards/accuracies": 1.0, "rewards/chosen": 3.684136390686035, "rewards/margins": 6.318961143493652, "rewards/rejected": -2.6348249912261963, "step": 4789 }, { "epoch": 3.4995433789954338, "grad_norm": 6.36207227187253, "learning_rate": 2.3295889048484368e-08, "logits/chosen": -2.4547455310821533, "logits/rejected": -2.578990936279297, "logps/chosen": -427.3285217285156, "logps/rejected": -535.1726684570312, "loss": 0.0224, "rewards/accuracies": 1.0, "rewards/chosen": 2.7279882431030273, "rewards/margins": 6.539124965667725, "rewards/rejected": -3.8111367225646973, "step": 4790 }, { "epoch": 3.5002739726027396, "grad_norm": 5.9389951730719135, "learning_rate": 2.32287001153581e-08, "logits/chosen": -2.6977481842041016, "logits/rejected": -2.625520944595337, "logps/chosen": -725.3359985351562, "logps/rejected": -625.6488037109375, "loss": 0.0242, "rewards/accuracies": 1.0, "rewards/chosen": 4.172783374786377, "rewards/margins": 5.328522682189941, "rewards/rejected": -1.1557389497756958, "step": 4791 }, { "epoch": 3.5010045662100455, "grad_norm": 7.169845792636377, "learning_rate": 2.316160349286539e-08, "logits/chosen": -2.610769033432007, "logits/rejected": -2.2878623008728027, "logps/chosen": -866.9111328125, "logps/rejected": -907.8665161132812, "loss": 0.0381, "rewards/accuracies": 1.0, "rewards/chosen": 3.6526637077331543, "rewards/margins": 7.280166149139404, "rewards/rejected": -3.62750244140625, "step": 4792 }, { "epoch": 3.5017351598173514, "grad_norm": 5.3780551478068315, "learning_rate": 2.3094599208318883e-08, "logits/chosen": -2.873124599456787, "logits/rejected": -2.2085585594177246, "logps/chosen": -819.7263793945312, "logps/rejected": -621.47119140625, "loss": 0.0321, "rewards/accuracies": 1.0, "rewards/chosen": 3.6443328857421875, "rewards/margins": 5.751548767089844, "rewards/rejected": -2.1072158813476562, "step": 4793 }, { "epoch": 3.5024657534246577, "grad_norm": 5.705773143752628, "learning_rate": 2.302768728899368e-08, "logits/chosen": -3.3783962726593018, "logits/rejected": -2.8661043643951416, "logps/chosen": -746.8529052734375, "logps/rejected": -605.4376220703125, "loss": 0.0478, "rewards/accuracies": 1.0, "rewards/chosen": 2.6233019828796387, "rewards/margins": 3.939105272293091, "rewards/rejected": -1.315803050994873, "step": 4794 }, { "epoch": 3.5031963470319636, "grad_norm": 5.764294313116104, "learning_rate": 2.2960867762127328e-08, "logits/chosen": -2.4981231689453125, "logits/rejected": -2.2834417819976807, "logps/chosen": -880.9564208984375, "logps/rejected": -823.925537109375, "loss": 0.0319, "rewards/accuracies": 1.0, "rewards/chosen": 4.951549053192139, "rewards/margins": 5.883999347686768, "rewards/rejected": -0.9324503540992737, "step": 4795 }, { "epoch": 3.5039269406392695, "grad_norm": 4.2391492666721415, "learning_rate": 2.2894140654919652e-08, "logits/chosen": -2.735595941543579, "logits/rejected": -2.412020206451416, "logps/chosen": -359.932861328125, "logps/rejected": -343.1551513671875, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": 2.6274094581604004, "rewards/margins": 7.701064586639404, "rewards/rejected": -5.073655605316162, "step": 4796 }, { "epoch": 3.5046575342465753, "grad_norm": 9.223047942100548, "learning_rate": 2.2827505994532898e-08, "logits/chosen": -3.1689772605895996, "logits/rejected": -2.804810047149658, "logps/chosen": -754.05908203125, "logps/rejected": -674.94580078125, "loss": 0.0344, "rewards/accuracies": 1.0, "rewards/chosen": 3.2366602420806885, "rewards/margins": 5.893304824829102, "rewards/rejected": -2.656644344329834, "step": 4797 }, { "epoch": 3.505388127853881, "grad_norm": 6.895336244612697, "learning_rate": 2.276096380809181e-08, "logits/chosen": -2.760178804397583, "logits/rejected": -2.3038339614868164, "logps/chosen": -518.8113403320312, "logps/rejected": -377.5892333984375, "loss": 0.0441, "rewards/accuracies": 1.0, "rewards/chosen": 2.7895376682281494, "rewards/margins": 5.988842964172363, "rewards/rejected": -3.1993050575256348, "step": 4798 }, { "epoch": 3.506118721461187, "grad_norm": 3.304990497523511, "learning_rate": 2.2694514122683223e-08, "logits/chosen": -2.556828022003174, "logits/rejected": -2.415057420730591, "logps/chosen": -878.0049438476562, "logps/rejected": -708.3811645507812, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": 4.43695592880249, "rewards/margins": 4.897713661193848, "rewards/rejected": -0.460757851600647, "step": 4799 }, { "epoch": 3.506849315068493, "grad_norm": 4.280904268658887, "learning_rate": 2.262815696535658e-08, "logits/chosen": -3.29960298538208, "logits/rejected": -2.2010855674743652, "logps/chosen": -694.1152954101562, "logps/rejected": -494.38836669921875, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": 4.388607025146484, "rewards/margins": 7.466183662414551, "rewards/rejected": -3.077576160430908, "step": 4800 }, { "epoch": 3.5075799086757993, "grad_norm": 8.024486261483796, "learning_rate": 2.256189236312353e-08, "logits/chosen": -2.344433069229126, "logits/rejected": -1.8602294921875, "logps/chosen": -472.02947998046875, "logps/rejected": -478.17681884765625, "loss": 0.0438, "rewards/accuracies": 0.875, "rewards/chosen": 2.745596408843994, "rewards/margins": 6.005038261413574, "rewards/rejected": -3.25944185256958, "step": 4801 }, { "epoch": 3.508310502283105, "grad_norm": 7.266488628947885, "learning_rate": 2.2495720342958017e-08, "logits/chosen": -2.586500644683838, "logits/rejected": -2.1604177951812744, "logps/chosen": -410.6710205078125, "logps/rejected": -472.23980712890625, "loss": 0.0362, "rewards/accuracies": 1.0, "rewards/chosen": 3.711129665374756, "rewards/margins": 7.853653907775879, "rewards/rejected": -4.142523765563965, "step": 4802 }, { "epoch": 3.509041095890411, "grad_norm": 5.910447662490966, "learning_rate": 2.242964093179642e-08, "logits/chosen": -2.929168701171875, "logits/rejected": -1.900251865386963, "logps/chosen": -470.78851318359375, "logps/rejected": -282.46051025390625, "loss": 0.0268, "rewards/accuracies": 1.0, "rewards/chosen": 2.423743963241577, "rewards/margins": 7.363709926605225, "rewards/rejected": -4.939965724945068, "step": 4803 }, { "epoch": 3.509771689497717, "grad_norm": 6.661479203283701, "learning_rate": 2.2363654156537264e-08, "logits/chosen": -3.371558904647827, "logits/rejected": -2.195403814315796, "logps/chosen": -693.5596313476562, "logps/rejected": -477.3993225097656, "loss": 0.0305, "rewards/accuracies": 1.0, "rewards/chosen": 5.000728607177734, "rewards/margins": 6.373048305511475, "rewards/rejected": -1.3723196983337402, "step": 4804 }, { "epoch": 3.5105022831050228, "grad_norm": 8.081731998145184, "learning_rate": 2.2297760044041576e-08, "logits/chosen": -2.4581778049468994, "logits/rejected": -1.8602615594863892, "logps/chosen": -756.3995361328125, "logps/rejected": -567.7432861328125, "loss": 0.0356, "rewards/accuracies": 1.0, "rewards/chosen": 2.346289873123169, "rewards/margins": 3.8040056228637695, "rewards/rejected": -1.4577155113220215, "step": 4805 }, { "epoch": 3.5112328767123286, "grad_norm": 7.34286065093588, "learning_rate": 2.2231958621132364e-08, "logits/chosen": -3.2458980083465576, "logits/rejected": -2.2969391345977783, "logps/chosen": -675.54736328125, "logps/rejected": -535.7531127929688, "loss": 0.0313, "rewards/accuracies": 1.0, "rewards/chosen": 4.091396331787109, "rewards/margins": 6.924544334411621, "rewards/rejected": -2.833148241043091, "step": 4806 }, { "epoch": 3.5119634703196345, "grad_norm": 6.293943721513569, "learning_rate": 2.216624991459523e-08, "logits/chosen": -2.484036922454834, "logits/rejected": -2.0466413497924805, "logps/chosen": -772.2760009765625, "logps/rejected": -659.1752319335938, "loss": 0.0222, "rewards/accuracies": 1.0, "rewards/chosen": 3.081942081451416, "rewards/margins": 4.686716079711914, "rewards/rejected": -1.6047741174697876, "step": 4807 }, { "epoch": 3.512694063926941, "grad_norm": 4.265872487025547, "learning_rate": 2.2100633951177755e-08, "logits/chosen": -2.9636635780334473, "logits/rejected": -1.8120085000991821, "logps/chosen": -380.846435546875, "logps/rejected": -230.67166137695312, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": 4.064082145690918, "rewards/margins": 8.898735046386719, "rewards/rejected": -4.834653377532959, "step": 4808 }, { "epoch": 3.5134246575342467, "grad_norm": 4.622602305211163, "learning_rate": 2.2035110757589987e-08, "logits/chosen": -3.0730273723602295, "logits/rejected": -2.7080399990081787, "logps/chosen": -646.978515625, "logps/rejected": -535.041015625, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": 3.4220926761627197, "rewards/margins": 5.180466175079346, "rewards/rejected": -1.758373498916626, "step": 4809 }, { "epoch": 3.5141552511415526, "grad_norm": 4.979100800188964, "learning_rate": 2.1969680360504116e-08, "logits/chosen": -2.956064224243164, "logits/rejected": -2.495487689971924, "logps/chosen": -813.5658569335938, "logps/rejected": -700.517333984375, "loss": 0.0233, "rewards/accuracies": 1.0, "rewards/chosen": 3.9196627140045166, "rewards/margins": 6.552709579467773, "rewards/rejected": -2.6330466270446777, "step": 4810 }, { "epoch": 3.5148858447488585, "grad_norm": 9.431844267124902, "learning_rate": 2.1904342786554453e-08, "logits/chosen": -2.1374616622924805, "logits/rejected": -2.6516823768615723, "logps/chosen": -462.5954284667969, "logps/rejected": -482.4300231933594, "loss": 0.0279, "rewards/accuracies": 1.0, "rewards/chosen": 2.651524782180786, "rewards/margins": 4.607914924621582, "rewards/rejected": -1.9563907384872437, "step": 4811 }, { "epoch": 3.5156164383561643, "grad_norm": 4.035795739047561, "learning_rate": 2.1839098062337773e-08, "logits/chosen": -2.7963027954101562, "logits/rejected": -2.0234715938568115, "logps/chosen": -840.615478515625, "logps/rejected": -576.6029052734375, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": 5.866509437561035, "rewards/margins": 8.990662574768066, "rewards/rejected": -3.124152660369873, "step": 4812 }, { "epoch": 3.51634703196347, "grad_norm": 5.099782542660214, "learning_rate": 2.177394621441278e-08, "logits/chosen": -2.620861530303955, "logits/rejected": -2.649949312210083, "logps/chosen": -326.39056396484375, "logps/rejected": -399.7472839355469, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": 2.0849146842956543, "rewards/margins": 5.694513320922852, "rewards/rejected": -3.609598159790039, "step": 4813 }, { "epoch": 3.517077625570776, "grad_norm": 11.174934704481009, "learning_rate": 2.170888726930062e-08, "logits/chosen": -2.8563075065612793, "logits/rejected": -2.2713418006896973, "logps/chosen": -766.5202026367188, "logps/rejected": -568.2091674804688, "loss": 0.056, "rewards/accuracies": 1.0, "rewards/chosen": 3.508788824081421, "rewards/margins": 4.209385395050049, "rewards/rejected": -0.7005965709686279, "step": 4814 }, { "epoch": 3.5178082191780824, "grad_norm": 11.102590901192311, "learning_rate": 2.1643921253484466e-08, "logits/chosen": -2.3820462226867676, "logits/rejected": -2.408358097076416, "logps/chosen": -546.416259765625, "logps/rejected": -705.0278930664062, "loss": 0.0382, "rewards/accuracies": 1.0, "rewards/chosen": 1.9915562868118286, "rewards/margins": 6.416776657104492, "rewards/rejected": -4.425220489501953, "step": 4815 }, { "epoch": 3.5185388127853883, "grad_norm": 7.23436260840259, "learning_rate": 2.1579048193409637e-08, "logits/chosen": -2.7682313919067383, "logits/rejected": -2.4080307483673096, "logps/chosen": -459.85333251953125, "logps/rejected": -372.3076171875, "loss": 0.0304, "rewards/accuracies": 1.0, "rewards/chosen": 4.052987098693848, "rewards/margins": 7.576396942138672, "rewards/rejected": -3.523409843444824, "step": 4816 }, { "epoch": 3.519269406392694, "grad_norm": 10.341093984317157, "learning_rate": 2.151426811548379e-08, "logits/chosen": -3.0404186248779297, "logits/rejected": -1.941662073135376, "logps/chosen": -457.9233703613281, "logps/rejected": -345.219970703125, "loss": 0.051, "rewards/accuracies": 1.0, "rewards/chosen": 4.513494491577148, "rewards/margins": 8.846532821655273, "rewards/rejected": -4.333038330078125, "step": 4817 }, { "epoch": 3.52, "grad_norm": 9.472220914610036, "learning_rate": 2.1449581046076527e-08, "logits/chosen": -2.6681618690490723, "logits/rejected": -2.3781967163085938, "logps/chosen": -594.18408203125, "logps/rejected": -614.301513671875, "loss": 0.0425, "rewards/accuracies": 1.0, "rewards/chosen": 4.728733062744141, "rewards/margins": 6.768438339233398, "rewards/rejected": -2.039705753326416, "step": 4818 }, { "epoch": 3.520730593607306, "grad_norm": 8.392536989732212, "learning_rate": 2.1384987011519696e-08, "logits/chosen": -2.25081729888916, "logits/rejected": -2.331305980682373, "logps/chosen": -556.8212280273438, "logps/rejected": -770.988525390625, "loss": 0.0355, "rewards/accuracies": 1.0, "rewards/chosen": 2.0310683250427246, "rewards/margins": 6.6223578453063965, "rewards/rejected": -4.591289520263672, "step": 4819 }, { "epoch": 3.5214611872146118, "grad_norm": 5.180472888349463, "learning_rate": 2.1320486038107322e-08, "logits/chosen": -2.6723179817199707, "logits/rejected": -2.1251492500305176, "logps/chosen": -750.1904907226562, "logps/rejected": -451.5756530761719, "loss": 0.0179, "rewards/accuracies": 1.0, "rewards/chosen": 1.8021736145019531, "rewards/margins": 3.851365327835083, "rewards/rejected": -2.049191951751709, "step": 4820 }, { "epoch": 3.5221917808219176, "grad_norm": 4.251824155786949, "learning_rate": 2.1256078152095403e-08, "logits/chosen": -2.5056698322296143, "logits/rejected": -2.011823892593384, "logps/chosen": -640.3900146484375, "logps/rejected": -660.3648071289062, "loss": 0.0176, "rewards/accuracies": 1.0, "rewards/chosen": 1.9912583827972412, "rewards/margins": 5.940574645996094, "rewards/rejected": -3.9493160247802734, "step": 4821 }, { "epoch": 3.522922374429224, "grad_norm": 3.8236392773512544, "learning_rate": 2.1191763379702245e-08, "logits/chosen": -2.8869965076446533, "logits/rejected": -2.1688473224639893, "logps/chosen": -712.1250610351562, "logps/rejected": -546.1416625976562, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": 4.361955642700195, "rewards/margins": 6.959313869476318, "rewards/rejected": -2.597357749938965, "step": 4822 }, { "epoch": 3.52365296803653, "grad_norm": 5.825243035350408, "learning_rate": 2.1127541747107986e-08, "logits/chosen": -2.5952775478363037, "logits/rejected": -2.1989219188690186, "logps/chosen": -843.255859375, "logps/rejected": -801.7357788085938, "loss": 0.0225, "rewards/accuracies": 1.0, "rewards/chosen": 4.324331760406494, "rewards/margins": 5.835148334503174, "rewards/rejected": -1.5108168125152588, "step": 4823 }, { "epoch": 3.5243835616438357, "grad_norm": 5.172984393205593, "learning_rate": 2.1063413280455104e-08, "logits/chosen": -2.9116382598876953, "logits/rejected": -2.1764144897460938, "logps/chosen": -818.9520874023438, "logps/rejected": -597.36279296875, "loss": 0.0221, "rewards/accuracies": 1.0, "rewards/chosen": 3.9943418502807617, "rewards/margins": 5.103065490722656, "rewards/rejected": -1.1087232828140259, "step": 4824 }, { "epoch": 3.5251141552511416, "grad_norm": 4.837016686240976, "learning_rate": 2.099937800584797e-08, "logits/chosen": -2.6097934246063232, "logits/rejected": -2.2525691986083984, "logps/chosen": -400.2752685546875, "logps/rejected": -443.57574462890625, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": 3.540419578552246, "rewards/margins": 6.542486667633057, "rewards/rejected": -3.0020673274993896, "step": 4825 }, { "epoch": 3.5258447488584475, "grad_norm": 5.541702824907247, "learning_rate": 2.0935435949353152e-08, "logits/chosen": -2.7584943771362305, "logits/rejected": -1.9718295335769653, "logps/chosen": -538.688232421875, "logps/rejected": -417.3164367675781, "loss": 0.0203, "rewards/accuracies": 1.0, "rewards/chosen": 4.1522674560546875, "rewards/margins": 6.628187656402588, "rewards/rejected": -2.4759202003479004, "step": 4826 }, { "epoch": 3.5265753424657533, "grad_norm": 7.429537060615459, "learning_rate": 2.0871587136999268e-08, "logits/chosen": -2.756249189376831, "logits/rejected": -2.4561948776245117, "logps/chosen": -581.7523803710938, "logps/rejected": -479.97625732421875, "loss": 0.0425, "rewards/accuracies": 1.0, "rewards/chosen": 1.8987407684326172, "rewards/margins": 5.068214416503906, "rewards/rejected": -3.169473886489868, "step": 4827 }, { "epoch": 3.527305936073059, "grad_norm": 5.994283496077546, "learning_rate": 2.080783159477681e-08, "logits/chosen": -2.637042760848999, "logits/rejected": -2.0228970050811768, "logps/chosen": -540.7817993164062, "logps/rejected": -443.3717041015625, "loss": 0.0408, "rewards/accuracies": 1.0, "rewards/chosen": 2.395432949066162, "rewards/margins": 5.769721984863281, "rewards/rejected": -3.374289035797119, "step": 4828 }, { "epoch": 3.5280365296803655, "grad_norm": 7.191742283617692, "learning_rate": 2.0744169348638484e-08, "logits/chosen": -2.7400262355804443, "logits/rejected": -2.1736319065093994, "logps/chosen": -598.078369140625, "logps/rejected": -592.6091918945312, "loss": 0.0269, "rewards/accuracies": 1.0, "rewards/chosen": 4.0652618408203125, "rewards/margins": 6.27805757522583, "rewards/rejected": -2.212796211242676, "step": 4829 }, { "epoch": 3.5287671232876714, "grad_norm": 5.911801590105294, "learning_rate": 2.06806004244989e-08, "logits/chosen": -2.8249096870422363, "logits/rejected": -2.482743263244629, "logps/chosen": -518.3511962890625, "logps/rejected": -499.4450378417969, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": 3.5321044921875, "rewards/margins": 4.556091785430908, "rewards/rejected": -1.0239874124526978, "step": 4830 }, { "epoch": 3.5294977168949773, "grad_norm": 7.707624935556789, "learning_rate": 2.061712484823483e-08, "logits/chosen": -2.737818717956543, "logits/rejected": -1.8344216346740723, "logps/chosen": -740.8033447265625, "logps/rejected": -473.02960205078125, "loss": 0.0284, "rewards/accuracies": 1.0, "rewards/chosen": 3.0373682975769043, "rewards/margins": 5.408219337463379, "rewards/rejected": -2.3708508014678955, "step": 4831 }, { "epoch": 3.530228310502283, "grad_norm": 7.347141472946219, "learning_rate": 2.0553742645684908e-08, "logits/chosen": -2.655132293701172, "logits/rejected": -1.7570887804031372, "logps/chosen": -574.40283203125, "logps/rejected": -428.1981506347656, "loss": 0.0315, "rewards/accuracies": 1.0, "rewards/chosen": 2.174663543701172, "rewards/margins": 5.482327461242676, "rewards/rejected": -3.307664394378662, "step": 4832 }, { "epoch": 3.530958904109589, "grad_norm": 10.056135166762004, "learning_rate": 2.049045384264972e-08, "logits/chosen": -2.854896306991577, "logits/rejected": -2.486187696456909, "logps/chosen": -523.9376831054688, "logps/rejected": -467.8679504394531, "loss": 0.0244, "rewards/accuracies": 1.0, "rewards/chosen": 2.4247336387634277, "rewards/margins": 4.9552412033081055, "rewards/rejected": -2.5305073261260986, "step": 4833 }, { "epoch": 3.531689497716895, "grad_norm": 8.195666717350976, "learning_rate": 2.0427258464892072e-08, "logits/chosen": -2.884094715118408, "logits/rejected": -2.8089375495910645, "logps/chosen": -683.3502197265625, "logps/rejected": -710.241943359375, "loss": 0.043, "rewards/accuracies": 1.0, "rewards/chosen": 3.3517472743988037, "rewards/margins": 5.281445503234863, "rewards/rejected": -1.9296983480453491, "step": 4834 }, { "epoch": 3.5324200913242008, "grad_norm": 12.57571490399968, "learning_rate": 2.0364156538136472e-08, "logits/chosen": -3.0336954593658447, "logits/rejected": -2.555579900741577, "logps/chosen": -730.5423583984375, "logps/rejected": -736.2566528320312, "loss": 0.0562, "rewards/accuracies": 1.0, "rewards/chosen": 2.94384503364563, "rewards/margins": 4.765193939208984, "rewards/rejected": -1.8213491439819336, "step": 4835 }, { "epoch": 3.533150684931507, "grad_norm": 4.301459956167975, "learning_rate": 2.0301148088069515e-08, "logits/chosen": -2.8717970848083496, "logits/rejected": -2.4719834327697754, "logps/chosen": -701.9365234375, "logps/rejected": -860.810546875, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": 3.0702648162841797, "rewards/margins": 7.717846870422363, "rewards/rejected": -4.647582530975342, "step": 4836 }, { "epoch": 3.5338812785388125, "grad_norm": 6.936656453853619, "learning_rate": 2.023823314033976e-08, "logits/chosen": -2.361079216003418, "logits/rejected": -2.3299028873443604, "logps/chosen": -522.9779052734375, "logps/rejected": -508.0372314453125, "loss": 0.0419, "rewards/accuracies": 1.0, "rewards/chosen": 2.8080649375915527, "rewards/margins": 7.490571022033691, "rewards/rejected": -4.682506561279297, "step": 4837 }, { "epoch": 3.534611872146119, "grad_norm": 6.376680040573131, "learning_rate": 2.0175411720557616e-08, "logits/chosen": -2.969485282897949, "logits/rejected": -2.249668836593628, "logps/chosen": -842.0411376953125, "logps/rejected": -743.86572265625, "loss": 0.0265, "rewards/accuracies": 1.0, "rewards/chosen": 3.8230602741241455, "rewards/margins": 6.647774696350098, "rewards/rejected": -2.824714183807373, "step": 4838 }, { "epoch": 3.5353424657534247, "grad_norm": 5.851738187345058, "learning_rate": 2.011268385429557e-08, "logits/chosen": -2.87844181060791, "logits/rejected": -2.095991611480713, "logps/chosen": -689.89404296875, "logps/rejected": -555.1302490234375, "loss": 0.0396, "rewards/accuracies": 1.0, "rewards/chosen": 3.7594215869903564, "rewards/margins": 5.579658031463623, "rewards/rejected": -1.8202365636825562, "step": 4839 }, { "epoch": 3.5360730593607306, "grad_norm": 5.283994965786958, "learning_rate": 2.005004956708789e-08, "logits/chosen": -3.165895700454712, "logits/rejected": -1.521121859550476, "logps/chosen": -643.0084838867188, "logps/rejected": -311.47344970703125, "loss": 0.0229, "rewards/accuracies": 1.0, "rewards/chosen": 4.4866461753845215, "rewards/margins": 7.306154251098633, "rewards/rejected": -2.8195078372955322, "step": 4840 }, { "epoch": 3.5368036529680364, "grad_norm": 5.042489144990175, "learning_rate": 1.9987508884430764e-08, "logits/chosen": -2.685635566711426, "logits/rejected": -2.5057156085968018, "logps/chosen": -482.35064697265625, "logps/rejected": -485.4664306640625, "loss": 0.0215, "rewards/accuracies": 1.0, "rewards/chosen": 3.1916768550872803, "rewards/margins": 4.785049915313721, "rewards/rejected": -1.5933728218078613, "step": 4841 }, { "epoch": 3.5375342465753423, "grad_norm": 7.337442229529286, "learning_rate": 1.9925061831782302e-08, "logits/chosen": -2.192160129547119, "logits/rejected": -2.0246658325195312, "logps/chosen": -661.5747680664062, "logps/rejected": -715.1295166015625, "loss": 0.0442, "rewards/accuracies": 1.0, "rewards/chosen": 3.8495755195617676, "rewards/margins": 5.9444580078125, "rewards/rejected": -2.0948829650878906, "step": 4842 }, { "epoch": 3.5382648401826486, "grad_norm": 3.195882508563937, "learning_rate": 1.986270843456253e-08, "logits/chosen": -2.8209452629089355, "logits/rejected": -1.5968385934829712, "logps/chosen": -548.7590942382812, "logps/rejected": -405.05517578125, "loss": 0.0314, "rewards/accuracies": 1.0, "rewards/chosen": 4.923152923583984, "rewards/margins": 9.585693359375, "rewards/rejected": -4.662540435791016, "step": 4843 }, { "epoch": 3.538995433789954, "grad_norm": 8.378201911684226, "learning_rate": 1.9800448718153423e-08, "logits/chosen": -2.164628744125366, "logits/rejected": -1.9705135822296143, "logps/chosen": -613.7823486328125, "logps/rejected": -675.4730224609375, "loss": 0.0499, "rewards/accuracies": 0.875, "rewards/chosen": 1.4820674657821655, "rewards/margins": 5.3603925704956055, "rewards/rejected": -3.8783249855041504, "step": 4844 }, { "epoch": 3.5397260273972604, "grad_norm": 6.8219558466385815, "learning_rate": 1.973828270789854e-08, "logits/chosen": -2.627213954925537, "logits/rejected": -2.705073118209839, "logps/chosen": -585.2998657226562, "logps/rejected": -558.1766967773438, "loss": 0.0379, "rewards/accuracies": 1.0, "rewards/chosen": 1.2770286798477173, "rewards/margins": 3.961083173751831, "rewards/rejected": -2.6840546131134033, "step": 4845 }, { "epoch": 3.5404566210045663, "grad_norm": 8.626002156735634, "learning_rate": 1.9676210429103613e-08, "logits/chosen": -2.463047981262207, "logits/rejected": -2.173719882965088, "logps/chosen": -548.9241333007812, "logps/rejected": -553.9806518554688, "loss": 0.0319, "rewards/accuracies": 1.0, "rewards/chosen": 1.6505126953125, "rewards/margins": 5.052939414978027, "rewards/rejected": -3.4024267196655273, "step": 4846 }, { "epoch": 3.541187214611872, "grad_norm": 5.48826317835103, "learning_rate": 1.9614231907035984e-08, "logits/chosen": -3.236934185028076, "logits/rejected": -1.7593436241149902, "logps/chosen": -795.0912475585938, "logps/rejected": -344.80267333984375, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": 4.632387161254883, "rewards/margins": 6.68378210067749, "rewards/rejected": -2.0513949394226074, "step": 4847 }, { "epoch": 3.541917808219178, "grad_norm": 7.741326924217025, "learning_rate": 1.955234716692508e-08, "logits/chosen": -2.628802537918091, "logits/rejected": -2.1802566051483154, "logps/chosen": -514.3135375976562, "logps/rejected": -602.8670043945312, "loss": 0.0385, "rewards/accuracies": 1.0, "rewards/chosen": 2.8508803844451904, "rewards/margins": 6.560248851776123, "rewards/rejected": -3.709368944168091, "step": 4848 }, { "epoch": 3.542648401826484, "grad_norm": 4.904504356112448, "learning_rate": 1.9490556233961898e-08, "logits/chosen": -2.6659793853759766, "logits/rejected": -2.221130847930908, "logps/chosen": -488.96356201171875, "logps/rejected": -466.6416320800781, "loss": 0.0435, "rewards/accuracies": 1.0, "rewards/chosen": 4.286529541015625, "rewards/margins": 6.7467427253723145, "rewards/rejected": -2.4602127075195312, "step": 4849 }, { "epoch": 3.54337899543379, "grad_norm": 3.644179283229, "learning_rate": 1.9428859133299364e-08, "logits/chosen": -2.8078691959381104, "logits/rejected": -1.9988245964050293, "logps/chosen": -604.6416015625, "logps/rejected": -403.8801574707031, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": 4.773167133331299, "rewards/margins": 7.471510887145996, "rewards/rejected": -2.6983437538146973, "step": 4850 }, { "epoch": 3.5441095890410956, "grad_norm": 7.917532729577174, "learning_rate": 1.9367255890052225e-08, "logits/chosen": -2.6228394508361816, "logits/rejected": -2.1846139430999756, "logps/chosen": -740.9230346679688, "logps/rejected": -506.0140075683594, "loss": 0.0402, "rewards/accuracies": 1.0, "rewards/chosen": 2.471104621887207, "rewards/margins": 5.675793647766113, "rewards/rejected": -3.204688549041748, "step": 4851 }, { "epoch": 3.544840182648402, "grad_norm": 5.226774887080449, "learning_rate": 1.9305746529296978e-08, "logits/chosen": -2.9927308559417725, "logits/rejected": -2.0260009765625, "logps/chosen": -660.0760498046875, "logps/rejected": -571.130859375, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": 3.736941337585449, "rewards/margins": 6.314746856689453, "rewards/rejected": -2.577805280685425, "step": 4852 }, { "epoch": 3.545570776255708, "grad_norm": 18.727828688420143, "learning_rate": 1.9244331076071986e-08, "logits/chosen": -2.487361431121826, "logits/rejected": -1.8217188119888306, "logps/chosen": -463.7930908203125, "logps/rejected": -512.0383911132812, "loss": 0.0647, "rewards/accuracies": 1.0, "rewards/chosen": 2.7029619216918945, "rewards/margins": 5.414252281188965, "rewards/rejected": -2.7112903594970703, "step": 4853 }, { "epoch": 3.5463013698630137, "grad_norm": 5.120549013683795, "learning_rate": 1.9183009555377246e-08, "logits/chosen": -2.4006409645080566, "logits/rejected": -2.3270695209503174, "logps/chosen": -543.5421752929688, "logps/rejected": -501.03387451171875, "loss": 0.023, "rewards/accuracies": 1.0, "rewards/chosen": 3.4942467212677, "rewards/margins": 5.726053237915039, "rewards/rejected": -2.231806516647339, "step": 4854 }, { "epoch": 3.5470319634703196, "grad_norm": 5.196112411986296, "learning_rate": 1.9121781992174598e-08, "logits/chosen": -3.059696674346924, "logits/rejected": -2.2484889030456543, "logps/chosen": -752.7958984375, "logps/rejected": -598.1300048828125, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": 3.359951972961426, "rewards/margins": 6.00372314453125, "rewards/rejected": -2.643771171569824, "step": 4855 }, { "epoch": 3.5477625570776254, "grad_norm": 9.223571964545853, "learning_rate": 1.9060648411387714e-08, "logits/chosen": -3.297595977783203, "logits/rejected": -2.095306396484375, "logps/chosen": -720.916748046875, "logps/rejected": -513.8900756835938, "loss": 0.0397, "rewards/accuracies": 1.0, "rewards/chosen": 4.11453914642334, "rewards/margins": 5.912365436553955, "rewards/rejected": -1.797825813293457, "step": 4856 }, { "epoch": 3.5484931506849318, "grad_norm": 3.2364045930069114, "learning_rate": 1.8999608837901858e-08, "logits/chosen": -3.275749921798706, "logits/rejected": -2.118013381958008, "logps/chosen": -696.5105590820312, "logps/rejected": -383.6875, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": 3.0576162338256836, "rewards/margins": 5.685469627380371, "rewards/rejected": -2.6278536319732666, "step": 4857 }, { "epoch": 3.549223744292237, "grad_norm": 8.492353702662932, "learning_rate": 1.893866329656413e-08, "logits/chosen": -2.948765754699707, "logits/rejected": -1.9840844869613647, "logps/chosen": -566.9368896484375, "logps/rejected": -399.49041748046875, "loss": 0.0299, "rewards/accuracies": 1.0, "rewards/chosen": 4.34035587310791, "rewards/margins": 7.606492519378662, "rewards/rejected": -3.26613712310791, "step": 4858 }, { "epoch": 3.5499543378995435, "grad_norm": 7.04989417246937, "learning_rate": 1.8877811812183257e-08, "logits/chosen": -2.8219754695892334, "logits/rejected": -1.8864080905914307, "logps/chosen": -692.8480834960938, "logps/rejected": -496.85845947265625, "loss": 0.0268, "rewards/accuracies": 1.0, "rewards/chosen": 3.8069281578063965, "rewards/margins": 7.799125671386719, "rewards/rejected": -3.9921977519989014, "step": 4859 }, { "epoch": 3.5506849315068494, "grad_norm": 5.230137444872815, "learning_rate": 1.8817054409529825e-08, "logits/chosen": -3.127153158187866, "logits/rejected": -2.184495687484741, "logps/chosen": -733.5446166992188, "logps/rejected": -640.2113647460938, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": 5.097587585449219, "rewards/margins": 6.250665664672852, "rewards/rejected": -1.153078317642212, "step": 4860 }, { "epoch": 3.5514155251141553, "grad_norm": 6.521510697386232, "learning_rate": 1.8756391113336018e-08, "logits/chosen": -2.799314498901367, "logits/rejected": -2.2742550373077393, "logps/chosen": -522.3919067382812, "logps/rejected": -509.4454345703125, "loss": 0.0294, "rewards/accuracies": 1.0, "rewards/chosen": 2.7172980308532715, "rewards/margins": 5.372272491455078, "rewards/rejected": -2.6549744606018066, "step": 4861 }, { "epoch": 3.552146118721461, "grad_norm": 4.497618362879442, "learning_rate": 1.8695821948295663e-08, "logits/chosen": -2.8915529251098633, "logits/rejected": -2.6034677028656006, "logps/chosen": -505.66168212890625, "logps/rejected": -546.8878784179688, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": 3.1546058654785156, "rewards/margins": 5.603085517883301, "rewards/rejected": -2.448479652404785, "step": 4862 }, { "epoch": 3.552876712328767, "grad_norm": 6.32716705230033, "learning_rate": 1.8635346939064432e-08, "logits/chosen": -3.160982847213745, "logits/rejected": -3.0485267639160156, "logps/chosen": -845.1464233398438, "logps/rejected": -884.6210327148438, "loss": 0.0228, "rewards/accuracies": 1.0, "rewards/chosen": 4.034989356994629, "rewards/margins": 4.21074104309082, "rewards/rejected": -0.1757516860961914, "step": 4863 }, { "epoch": 3.5536073059360733, "grad_norm": 9.144011041509415, "learning_rate": 1.857496611025952e-08, "logits/chosen": -2.047813892364502, "logits/rejected": -1.954265832901001, "logps/chosen": -637.100341796875, "logps/rejected": -658.3924560546875, "loss": 0.0464, "rewards/accuracies": 1.0, "rewards/chosen": 3.192213773727417, "rewards/margins": 6.286571502685547, "rewards/rejected": -3.094357490539551, "step": 4864 }, { "epoch": 3.5543378995433788, "grad_norm": 7.906082814978283, "learning_rate": 1.8514679486459877e-08, "logits/chosen": -2.454864501953125, "logits/rejected": -2.1789095401763916, "logps/chosen": -701.392333984375, "logps/rejected": -571.307861328125, "loss": 0.0265, "rewards/accuracies": 1.0, "rewards/chosen": 2.0949997901916504, "rewards/margins": 4.635601043701172, "rewards/rejected": -2.5406012535095215, "step": 4865 }, { "epoch": 3.555068493150685, "grad_norm": 8.439640932083545, "learning_rate": 1.8454487092206095e-08, "logits/chosen": -2.9811758995056152, "logits/rejected": -2.4202964305877686, "logps/chosen": -745.5017700195312, "logps/rejected": -567.154052734375, "loss": 0.0313, "rewards/accuracies": 1.0, "rewards/chosen": 4.448339462280273, "rewards/margins": 5.0238189697265625, "rewards/rejected": -0.5754794478416443, "step": 4866 }, { "epoch": 3.555799086757991, "grad_norm": 3.9198334384097864, "learning_rate": 1.8394388952000295e-08, "logits/chosen": -2.9546451568603516, "logits/rejected": -2.7056539058685303, "logps/chosen": -626.948486328125, "logps/rejected": -580.9708251953125, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": 3.0859014987945557, "rewards/margins": 7.368070125579834, "rewards/rejected": -4.282168388366699, "step": 4867 }, { "epoch": 3.556529680365297, "grad_norm": 4.672620426787525, "learning_rate": 1.8334385090306382e-08, "logits/chosen": -2.930814743041992, "logits/rejected": -2.57112979888916, "logps/chosen": -894.0138549804688, "logps/rejected": -758.9475708007812, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": 4.356770038604736, "rewards/margins": 5.572525501251221, "rewards/rejected": -1.2157552242279053, "step": 4868 }, { "epoch": 3.5572602739726027, "grad_norm": 7.6412380729243985, "learning_rate": 1.8274475531549816e-08, "logits/chosen": -2.8274731636047363, "logits/rejected": -2.834334373474121, "logps/chosen": -775.7328491210938, "logps/rejected": -719.7572021484375, "loss": 0.0233, "rewards/accuracies": 1.0, "rewards/chosen": 3.693241596221924, "rewards/margins": 4.682442665100098, "rewards/rejected": -0.9892010688781738, "step": 4869 }, { "epoch": 3.5579908675799086, "grad_norm": 6.741110705763798, "learning_rate": 1.8214660300117702e-08, "logits/chosen": -2.2198212146759033, "logits/rejected": -1.857062578201294, "logps/chosen": -498.65838623046875, "logps/rejected": -576.92138671875, "loss": 0.022, "rewards/accuracies": 1.0, "rewards/chosen": 2.2059290409088135, "rewards/margins": 5.348215579986572, "rewards/rejected": -3.142286777496338, "step": 4870 }, { "epoch": 3.558721461187215, "grad_norm": 4.9522608054262385, "learning_rate": 1.8154939420358645e-08, "logits/chosen": -2.886867046356201, "logits/rejected": -2.65940523147583, "logps/chosen": -1018.687744140625, "logps/rejected": -925.7523193359375, "loss": 0.0188, "rewards/accuracies": 1.0, "rewards/chosen": 5.697323799133301, "rewards/margins": 6.646238327026367, "rewards/rejected": -0.9489138722419739, "step": 4871 }, { "epoch": 3.5594520547945203, "grad_norm": 4.6447495135715595, "learning_rate": 1.809531291658295e-08, "logits/chosen": -2.326706647872925, "logits/rejected": -2.5037806034088135, "logps/chosen": -706.2554321289062, "logps/rejected": -970.7412109375, "loss": 0.015, "rewards/accuracies": 1.0, "rewards/chosen": 3.5449156761169434, "rewards/margins": 5.348372459411621, "rewards/rejected": -1.8034563064575195, "step": 4872 }, { "epoch": 3.5601826484018266, "grad_norm": 3.7300033199101215, "learning_rate": 1.8035780813062535e-08, "logits/chosen": -2.6364662647247314, "logits/rejected": -1.8767848014831543, "logps/chosen": -672.7173461914062, "logps/rejected": -516.7789306640625, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": 3.3405160903930664, "rewards/margins": 6.120352745056152, "rewards/rejected": -2.779836893081665, "step": 4873 }, { "epoch": 3.5609132420091325, "grad_norm": 5.996396254075332, "learning_rate": 1.7976343134030763e-08, "logits/chosen": -2.617696762084961, "logits/rejected": -2.675696849822998, "logps/chosen": -686.5028076171875, "logps/rejected": -802.406982421875, "loss": 0.0337, "rewards/accuracies": 1.0, "rewards/chosen": 4.266768932342529, "rewards/margins": 4.936633586883545, "rewards/rejected": -0.6698648929595947, "step": 4874 }, { "epoch": 3.5616438356164384, "grad_norm": 5.831106941782487, "learning_rate": 1.7916999903682644e-08, "logits/chosen": -2.973832845687866, "logits/rejected": -1.9840151071548462, "logps/chosen": -643.4454956054688, "logps/rejected": -436.5862731933594, "loss": 0.0204, "rewards/accuracies": 1.0, "rewards/chosen": 1.9403502941131592, "rewards/margins": 5.312495231628418, "rewards/rejected": -3.3721446990966797, "step": 4875 }, { "epoch": 3.5623744292237443, "grad_norm": 11.08927960655618, "learning_rate": 1.785775114617466e-08, "logits/chosen": -3.003446340560913, "logits/rejected": -1.9977705478668213, "logps/chosen": -328.4969177246094, "logps/rejected": -315.00439453125, "loss": 0.0638, "rewards/accuracies": 1.0, "rewards/chosen": 1.611167073249817, "rewards/margins": 4.690725803375244, "rewards/rejected": -3.0795583724975586, "step": 4876 }, { "epoch": 3.56310502283105, "grad_norm": 7.5141148308962205, "learning_rate": 1.7798596885625045e-08, "logits/chosen": -2.7217891216278076, "logits/rejected": -1.7775447368621826, "logps/chosen": -867.6563720703125, "logps/rejected": -518.2860107421875, "loss": 0.044, "rewards/accuracies": 1.0, "rewards/chosen": 3.7642629146575928, "rewards/margins": 3.9473679065704346, "rewards/rejected": -0.18310493230819702, "step": 4877 }, { "epoch": 3.563835616438356, "grad_norm": 8.544548875787571, "learning_rate": 1.773953714611326e-08, "logits/chosen": -2.887646198272705, "logits/rejected": -2.2982640266418457, "logps/chosen": -448.5920715332031, "logps/rejected": -397.30377197265625, "loss": 0.0461, "rewards/accuracies": 1.0, "rewards/chosen": 2.5176901817321777, "rewards/margins": 5.856730937957764, "rewards/rejected": -3.339040517807007, "step": 4878 }, { "epoch": 3.564566210045662, "grad_norm": 7.907109752846149, "learning_rate": 1.7680571951680573e-08, "logits/chosen": -2.948190212249756, "logits/rejected": -2.408766746520996, "logps/chosen": -498.77520751953125, "logps/rejected": -403.5972595214844, "loss": 0.054, "rewards/accuracies": 1.0, "rewards/chosen": 2.026076316833496, "rewards/margins": 4.967981338500977, "rewards/rejected": -2.9419050216674805, "step": 4879 }, { "epoch": 3.565296803652968, "grad_norm": 17.309110784847075, "learning_rate": 1.762170132632962e-08, "logits/chosen": -2.7210636138916016, "logits/rejected": -2.399620294570923, "logps/chosen": -523.9238891601562, "logps/rejected": -571.8642578125, "loss": 0.0731, "rewards/accuracies": 1.0, "rewards/chosen": 3.0990540981292725, "rewards/margins": 6.2071123123168945, "rewards/rejected": -3.108058452606201, "step": 4880 }, { "epoch": 3.566027397260274, "grad_norm": 7.137161049085005, "learning_rate": 1.7562925294024504e-08, "logits/chosen": -2.9901721477508545, "logits/rejected": -2.20475435256958, "logps/chosen": -663.4395141601562, "logps/rejected": -559.9168701171875, "loss": 0.0296, "rewards/accuracies": 1.0, "rewards/chosen": 3.788376808166504, "rewards/margins": 6.4556193351745605, "rewards/rejected": -2.6672422885894775, "step": 4881 }, { "epoch": 3.56675799086758, "grad_norm": 7.259189438141708, "learning_rate": 1.7504243878690927e-08, "logits/chosen": -2.815439462661743, "logits/rejected": -2.338632822036743, "logps/chosen": -850.3824462890625, "logps/rejected": -584.917724609375, "loss": 0.0301, "rewards/accuracies": 1.0, "rewards/chosen": 3.5706734657287598, "rewards/margins": 5.363778114318848, "rewards/rejected": -1.793104648590088, "step": 4882 }, { "epoch": 3.567488584474886, "grad_norm": 5.876266343013496, "learning_rate": 1.744565710421608e-08, "logits/chosen": -2.579587459564209, "logits/rejected": -2.475062608718872, "logps/chosen": -357.41070556640625, "logps/rejected": -387.64093017578125, "loss": 0.0258, "rewards/accuracies": 1.0, "rewards/chosen": 3.1580400466918945, "rewards/margins": 8.297560691833496, "rewards/rejected": -5.139520168304443, "step": 4883 }, { "epoch": 3.5682191780821917, "grad_norm": 6.470890299228418, "learning_rate": 1.738716499444845e-08, "logits/chosen": -3.0125861167907715, "logits/rejected": -2.2517318725585938, "logps/chosen": -430.7841491699219, "logps/rejected": -397.0392761230469, "loss": 0.0318, "rewards/accuracies": 1.0, "rewards/chosen": 2.890169143676758, "rewards/margins": 4.333325386047363, "rewards/rejected": -1.443156123161316, "step": 4884 }, { "epoch": 3.5689497716894976, "grad_norm": 4.783726468473182, "learning_rate": 1.732876757319826e-08, "logits/chosen": -2.9166781902313232, "logits/rejected": -2.039815902709961, "logps/chosen": -647.3126220703125, "logps/rejected": -460.2417907714844, "loss": 0.0219, "rewards/accuracies": 1.0, "rewards/chosen": 4.797560691833496, "rewards/margins": 5.572994709014893, "rewards/rejected": -0.7754338979721069, "step": 4885 }, { "epoch": 3.5696803652968034, "grad_norm": 3.187994376346314, "learning_rate": 1.727046486423697e-08, "logits/chosen": -2.85599946975708, "logits/rejected": -2.3929951190948486, "logps/chosen": -668.150634765625, "logps/rejected": -632.1555786132812, "loss": 0.0291, "rewards/accuracies": 1.0, "rewards/chosen": 3.8101375102996826, "rewards/margins": 7.178520202636719, "rewards/rejected": -3.3683829307556152, "step": 4886 }, { "epoch": 3.5704109589041098, "grad_norm": 4.513616095230846, "learning_rate": 1.7212256891297656e-08, "logits/chosen": -2.5739641189575195, "logits/rejected": -1.7514822483062744, "logps/chosen": -922.6348876953125, "logps/rejected": -501.62396240234375, "loss": 0.0167, "rewards/accuracies": 1.0, "rewards/chosen": 6.333173751831055, "rewards/margins": 7.961819648742676, "rewards/rejected": -1.6286462545394897, "step": 4887 }, { "epoch": 3.5711415525114156, "grad_norm": 10.830858466307623, "learning_rate": 1.715414367807458e-08, "logits/chosen": -2.5325825214385986, "logits/rejected": -2.5764880180358887, "logps/chosen": -347.21435546875, "logps/rejected": -474.40972900390625, "loss": 0.0699, "rewards/accuracies": 1.0, "rewards/chosen": 2.047370195388794, "rewards/margins": 5.161467552185059, "rewards/rejected": -3.1140971183776855, "step": 4888 }, { "epoch": 3.5718721461187215, "grad_norm": 5.257858752292384, "learning_rate": 1.709612524822368e-08, "logits/chosen": -3.136716604232788, "logits/rejected": -2.391031265258789, "logps/chosen": -521.3920288085938, "logps/rejected": -522.3560180664062, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": 3.5360159873962402, "rewards/margins": 6.268232345581055, "rewards/rejected": -2.7322161197662354, "step": 4889 }, { "epoch": 3.5726027397260274, "grad_norm": 4.7287487680028715, "learning_rate": 1.7038201625362292e-08, "logits/chosen": -2.776254415512085, "logits/rejected": -2.5332653522491455, "logps/chosen": -435.0797119140625, "logps/rejected": -422.4189453125, "loss": 0.0179, "rewards/accuracies": 1.0, "rewards/chosen": 2.4559619426727295, "rewards/margins": 6.344265937805176, "rewards/rejected": -3.888303756713867, "step": 4890 }, { "epoch": 3.5733333333333333, "grad_norm": 9.40684654397293, "learning_rate": 1.6980372833068967e-08, "logits/chosen": -2.7872962951660156, "logits/rejected": -1.974740982055664, "logps/chosen": -655.2728271484375, "logps/rejected": -467.2845458984375, "loss": 0.0376, "rewards/accuracies": 1.0, "rewards/chosen": 4.105263710021973, "rewards/margins": 6.090040683746338, "rewards/rejected": -1.9847768545150757, "step": 4891 }, { "epoch": 3.574063926940639, "grad_norm": 5.24767633513491, "learning_rate": 1.6922638894883906e-08, "logits/chosen": -2.7268457412719727, "logits/rejected": -2.0674476623535156, "logps/chosen": -503.465576171875, "logps/rejected": -406.3929443359375, "loss": 0.036, "rewards/accuracies": 1.0, "rewards/chosen": 4.149513244628906, "rewards/margins": 6.160094261169434, "rewards/rejected": -2.0105814933776855, "step": 4892 }, { "epoch": 3.574794520547945, "grad_norm": 22.159896313806883, "learning_rate": 1.686499983430842e-08, "logits/chosen": -2.610121250152588, "logits/rejected": -1.8254640102386475, "logps/chosen": -676.0352783203125, "logps/rejected": -453.5526123046875, "loss": 0.0933, "rewards/accuracies": 1.0, "rewards/chosen": 3.301849365234375, "rewards/margins": 5.389643669128418, "rewards/rejected": -2.0877938270568848, "step": 4893 }, { "epoch": 3.5755251141552513, "grad_norm": 5.862649150325683, "learning_rate": 1.680745567480546e-08, "logits/chosen": -2.746654510498047, "logits/rejected": -2.357487440109253, "logps/chosen": -567.0794677734375, "logps/rejected": -578.034423828125, "loss": 0.0259, "rewards/accuracies": 1.0, "rewards/chosen": 3.5470027923583984, "rewards/margins": 6.652477264404297, "rewards/rejected": -3.1054739952087402, "step": 4894 }, { "epoch": 3.576255707762557, "grad_norm": 3.8804294326907023, "learning_rate": 1.6750006439799123e-08, "logits/chosen": -2.889190435409546, "logits/rejected": -2.684541940689087, "logps/chosen": -632.6590576171875, "logps/rejected": -639.3232421875, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": 3.6034154891967773, "rewards/margins": 6.547574043273926, "rewards/rejected": -2.9441583156585693, "step": 4895 }, { "epoch": 3.576986301369863, "grad_norm": 3.714411023405118, "learning_rate": 1.6692652152675125e-08, "logits/chosen": -2.581294536590576, "logits/rejected": -2.3073744773864746, "logps/chosen": -464.41607666015625, "logps/rejected": -580.3941650390625, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": 3.150664806365967, "rewards/margins": 7.715061664581299, "rewards/rejected": -4.564396858215332, "step": 4896 }, { "epoch": 3.577716894977169, "grad_norm": 4.669475845230324, "learning_rate": 1.663539283678028e-08, "logits/chosen": -3.025291681289673, "logits/rejected": -2.53090238571167, "logps/chosen": -719.5418090820312, "logps/rejected": -633.0737915039062, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": 5.302355766296387, "rewards/margins": 6.950382232666016, "rewards/rejected": -1.6480263471603394, "step": 4897 }, { "epoch": 3.578447488584475, "grad_norm": 8.551002970086818, "learning_rate": 1.6578228515422842e-08, "logits/chosen": -3.3590304851531982, "logits/rejected": -3.1999316215515137, "logps/chosen": -1048.3818359375, "logps/rejected": -978.6917114257812, "loss": 0.046, "rewards/accuracies": 1.0, "rewards/chosen": 4.979784965515137, "rewards/margins": 6.224365234375, "rewards/rejected": -1.2445809841156006, "step": 4898 }, { "epoch": 3.5791780821917807, "grad_norm": 7.448526623342537, "learning_rate": 1.6521159211872445e-08, "logits/chosen": -3.0487594604492188, "logits/rejected": -2.2916367053985596, "logps/chosen": -726.5914306640625, "logps/rejected": -599.850341796875, "loss": 0.0228, "rewards/accuracies": 1.0, "rewards/chosen": 4.956512451171875, "rewards/margins": 7.120466232299805, "rewards/rejected": -2.1639533042907715, "step": 4899 }, { "epoch": 3.5799086757990866, "grad_norm": 6.527230079706685, "learning_rate": 1.646418494935997e-08, "logits/chosen": -2.360118865966797, "logits/rejected": -2.397233486175537, "logps/chosen": -449.001953125, "logps/rejected": -548.9594116210938, "loss": 0.0259, "rewards/accuracies": 1.0, "rewards/chosen": 2.0797743797302246, "rewards/margins": 5.64009428024292, "rewards/rejected": -3.5603199005126953, "step": 4900 }, { "epoch": 3.580639269406393, "grad_norm": 8.042534288897123, "learning_rate": 1.6407305751077628e-08, "logits/chosen": -2.896800994873047, "logits/rejected": -1.9146451950073242, "logps/chosen": -473.1169128417969, "logps/rejected": -289.3771667480469, "loss": 0.0392, "rewards/accuracies": 1.0, "rewards/chosen": 3.1656696796417236, "rewards/margins": 5.5453691482543945, "rewards/rejected": -2.37969970703125, "step": 4901 }, { "epoch": 3.5813698630136988, "grad_norm": 2.881901129176219, "learning_rate": 1.6350521640179e-08, "logits/chosen": -2.9130187034606934, "logits/rejected": -2.550943374633789, "logps/chosen": -837.0110473632812, "logps/rejected": -681.0567626953125, "loss": 0.0341, "rewards/accuracies": 1.0, "rewards/chosen": 5.150650978088379, "rewards/margins": 4.8626861572265625, "rewards/rejected": 0.2879650592803955, "step": 4902 }, { "epoch": 3.5821004566210046, "grad_norm": 3.580946430435184, "learning_rate": 1.629383263977882e-08, "logits/chosen": -2.518085479736328, "logits/rejected": -1.9532195329666138, "logps/chosen": -540.214599609375, "logps/rejected": -444.5341491699219, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": 3.8166697025299072, "rewards/margins": 6.5129475593566895, "rewards/rejected": -2.696277618408203, "step": 4903 }, { "epoch": 3.5828310502283105, "grad_norm": 4.474362989724723, "learning_rate": 1.623723877295327e-08, "logits/chosen": -2.6765058040618896, "logits/rejected": -1.8453947305679321, "logps/chosen": -452.97479248046875, "logps/rejected": -343.712646484375, "loss": 0.0202, "rewards/accuracies": 1.0, "rewards/chosen": 3.7160961627960205, "rewards/margins": 8.015926361083984, "rewards/rejected": -4.299830436706543, "step": 4904 }, { "epoch": 3.5835616438356164, "grad_norm": 8.374207871966766, "learning_rate": 1.6180740062739672e-08, "logits/chosen": -2.850508213043213, "logits/rejected": -2.6727302074432373, "logps/chosen": -511.30877685546875, "logps/rejected": -510.802001953125, "loss": 0.0403, "rewards/accuracies": 1.0, "rewards/chosen": 1.4362412691116333, "rewards/margins": 4.218466758728027, "rewards/rejected": -2.7822251319885254, "step": 4905 }, { "epoch": 3.5842922374429222, "grad_norm": 7.347831208875248, "learning_rate": 1.6124336532136684e-08, "logits/chosen": -2.3990674018859863, "logits/rejected": -2.486849546432495, "logps/chosen": -658.0004272460938, "logps/rejected": -772.2501220703125, "loss": 0.0338, "rewards/accuracies": 1.0, "rewards/chosen": 2.5367186069488525, "rewards/margins": 6.798020362854004, "rewards/rejected": -4.261301517486572, "step": 4906 }, { "epoch": 3.585022831050228, "grad_norm": 5.03284234858523, "learning_rate": 1.6068028204104216e-08, "logits/chosen": -2.5344691276550293, "logits/rejected": -1.9038143157958984, "logps/chosen": -579.224365234375, "logps/rejected": -417.50189208984375, "loss": 0.0206, "rewards/accuracies": 1.0, "rewards/chosen": 2.590108871459961, "rewards/margins": 5.867135047912598, "rewards/rejected": -3.277026414871216, "step": 4907 }, { "epoch": 3.5857534246575344, "grad_norm": 4.272561717561385, "learning_rate": 1.601181510156338e-08, "logits/chosen": -2.6733081340789795, "logits/rejected": -2.134432554244995, "logps/chosen": -908.408203125, "logps/rejected": -696.169677734375, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": 6.453851222991943, "rewards/margins": 8.477476119995117, "rewards/rejected": -2.023625135421753, "step": 4908 }, { "epoch": 3.5864840182648403, "grad_norm": 5.06886306053344, "learning_rate": 1.595569724739665e-08, "logits/chosen": -3.290571451187134, "logits/rejected": -2.2041285037994385, "logps/chosen": -808.386962890625, "logps/rejected": -551.8763427734375, "loss": 0.0235, "rewards/accuracies": 1.0, "rewards/chosen": 5.2730584144592285, "rewards/margins": 7.240603446960449, "rewards/rejected": -1.9675450325012207, "step": 4909 }, { "epoch": 3.587214611872146, "grad_norm": 4.611697061427274, "learning_rate": 1.589967466444747e-08, "logits/chosen": -3.0974936485290527, "logits/rejected": -2.374222755432129, "logps/chosen": -789.5791625976562, "logps/rejected": -690.27294921875, "loss": 0.0206, "rewards/accuracies": 1.0, "rewards/chosen": 3.256450891494751, "rewards/margins": 5.45888614654541, "rewards/rejected": -2.2024354934692383, "step": 4910 }, { "epoch": 3.587945205479452, "grad_norm": 4.970407077689345, "learning_rate": 1.58437473755208e-08, "logits/chosen": -2.847419261932373, "logits/rejected": -1.813666582107544, "logps/chosen": -642.5481567382812, "logps/rejected": -381.18292236328125, "loss": 0.0234, "rewards/accuracies": 1.0, "rewards/chosen": 3.2054920196533203, "rewards/margins": 5.180058479309082, "rewards/rejected": -1.97456693649292, "step": 4911 }, { "epoch": 3.588675799086758, "grad_norm": 5.040601108066343, "learning_rate": 1.578791540338259e-08, "logits/chosen": -2.3250532150268555, "logits/rejected": -2.3150863647460938, "logps/chosen": -573.2464599609375, "logps/rejected": -706.762451171875, "loss": 0.0201, "rewards/accuracies": 1.0, "rewards/chosen": 2.807178258895874, "rewards/margins": 5.576594352722168, "rewards/rejected": -2.769416093826294, "step": 4912 }, { "epoch": 3.589406392694064, "grad_norm": 6.297470783985146, "learning_rate": 1.5732178770760164e-08, "logits/chosen": -2.4044063091278076, "logits/rejected": -1.7791881561279297, "logps/chosen": -614.2791748046875, "logps/rejected": -523.5413818359375, "loss": 0.0207, "rewards/accuracies": 1.0, "rewards/chosen": 3.1145360469818115, "rewards/margins": 6.350857734680176, "rewards/rejected": -3.236321449279785, "step": 4913 }, { "epoch": 3.5901369863013697, "grad_norm": 7.032729065355031, "learning_rate": 1.5676537500341895e-08, "logits/chosen": -3.3829030990600586, "logits/rejected": -2.873345375061035, "logps/chosen": -796.1717529296875, "logps/rejected": -733.8903198242188, "loss": 0.0373, "rewards/accuracies": 1.0, "rewards/chosen": 4.153195381164551, "rewards/margins": 4.627471446990967, "rewards/rejected": -0.47427594661712646, "step": 4914 }, { "epoch": 3.590867579908676, "grad_norm": 2.673189074301579, "learning_rate": 1.562099161477737e-08, "logits/chosen": -2.9885377883911133, "logits/rejected": -1.7618751525878906, "logps/chosen": -797.9766845703125, "logps/rejected": -492.27581787109375, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": 3.654284954071045, "rewards/margins": 6.56683874130249, "rewards/rejected": -2.9125537872314453, "step": 4915 }, { "epoch": 3.591598173515982, "grad_norm": 5.009389263092316, "learning_rate": 1.5565541136677406e-08, "logits/chosen": -2.095522880554199, "logits/rejected": -1.4848482608795166, "logps/chosen": -474.5289306640625, "logps/rejected": -488.5196533203125, "loss": 0.0211, "rewards/accuracies": 1.0, "rewards/chosen": 3.429502010345459, "rewards/margins": 7.830309867858887, "rewards/rejected": -4.4008073806762695, "step": 4916 }, { "epoch": 3.5923287671232877, "grad_norm": 10.914106889114278, "learning_rate": 1.551018608861393e-08, "logits/chosen": -2.8711230754852295, "logits/rejected": -1.5272393226623535, "logps/chosen": -618.21435546875, "logps/rejected": -348.3147888183594, "loss": 0.0534, "rewards/accuracies": 1.0, "rewards/chosen": 4.637611389160156, "rewards/margins": 7.911177635192871, "rewards/rejected": -3.2735657691955566, "step": 4917 }, { "epoch": 3.5930593607305936, "grad_norm": 8.75087799026966, "learning_rate": 1.545492649312005e-08, "logits/chosen": -2.4560368061065674, "logits/rejected": -2.0903167724609375, "logps/chosen": -686.449462890625, "logps/rejected": -579.2957153320312, "loss": 0.0276, "rewards/accuracies": 1.0, "rewards/chosen": 2.0934364795684814, "rewards/margins": 4.039799690246582, "rewards/rejected": -1.9463633298873901, "step": 4918 }, { "epoch": 3.5937899543378995, "grad_norm": 4.131670788733019, "learning_rate": 1.539976237269003e-08, "logits/chosen": -2.140322208404541, "logits/rejected": -2.3378705978393555, "logps/chosen": -547.8108520507812, "logps/rejected": -681.5611572265625, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": 3.0952885150909424, "rewards/margins": 7.622241497039795, "rewards/rejected": -4.526952743530273, "step": 4919 }, { "epoch": 3.5945205479452054, "grad_norm": 6.308106054581645, "learning_rate": 1.5344693749779146e-08, "logits/chosen": -2.8602399826049805, "logits/rejected": -2.5237083435058594, "logps/chosen": -787.0771484375, "logps/rejected": -704.6786499023438, "loss": 0.0279, "rewards/accuracies": 1.0, "rewards/chosen": 4.59580135345459, "rewards/margins": 6.6047163009643555, "rewards/rejected": -2.008915662765503, "step": 4920 }, { "epoch": 3.5952511415525112, "grad_norm": 7.202394394634964, "learning_rate": 1.5289720646804033e-08, "logits/chosen": -2.8988289833068848, "logits/rejected": -2.099982976913452, "logps/chosen": -1147.819580078125, "logps/rejected": -829.509765625, "loss": 0.036, "rewards/accuracies": 1.0, "rewards/chosen": 5.019123077392578, "rewards/margins": 9.181215286254883, "rewards/rejected": -4.162093162536621, "step": 4921 }, { "epoch": 3.5959817351598176, "grad_norm": 9.651045794965613, "learning_rate": 1.5234843086142258e-08, "logits/chosen": -2.7400121688842773, "logits/rejected": -2.2908313274383545, "logps/chosen": -494.3288269042969, "logps/rejected": -714.34521484375, "loss": 0.0401, "rewards/accuracies": 1.0, "rewards/chosen": 2.657857894897461, "rewards/margins": 5.838477611541748, "rewards/rejected": -3.180619716644287, "step": 4922 }, { "epoch": 3.5967123287671234, "grad_norm": 7.723758200181003, "learning_rate": 1.5180061090132505e-08, "logits/chosen": -2.5287742614746094, "logits/rejected": -2.2543160915374756, "logps/chosen": -547.1033935546875, "logps/rejected": -549.699951171875, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": 2.736309051513672, "rewards/margins": 6.426928520202637, "rewards/rejected": -3.6906192302703857, "step": 4923 }, { "epoch": 3.5974429223744293, "grad_norm": 3.2668086922840187, "learning_rate": 1.5125374681074637e-08, "logits/chosen": -2.942697525024414, "logits/rejected": -2.3850347995758057, "logps/chosen": -686.1607055664062, "logps/rejected": -665.4669189453125, "loss": 0.031, "rewards/accuracies": 1.0, "rewards/chosen": 5.057311534881592, "rewards/margins": 8.19879150390625, "rewards/rejected": -3.1414804458618164, "step": 4924 }, { "epoch": 3.598173515981735, "grad_norm": 3.372962336538322, "learning_rate": 1.5070783881229537e-08, "logits/chosen": -3.0053257942199707, "logits/rejected": -1.455701470375061, "logps/chosen": -803.572021484375, "logps/rejected": -450.62945556640625, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": 2.7127010822296143, "rewards/margins": 6.903635501861572, "rewards/rejected": -4.190934181213379, "step": 4925 }, { "epoch": 3.598904109589041, "grad_norm": 4.265285827790328, "learning_rate": 1.5016288712819292e-08, "logits/chosen": -2.786501407623291, "logits/rejected": -2.091097593307495, "logps/chosen": -484.48992919921875, "logps/rejected": -463.6345520019531, "loss": 0.0146, "rewards/accuracies": 1.0, "rewards/chosen": 3.6482954025268555, "rewards/margins": 6.094715118408203, "rewards/rejected": -2.4464194774627686, "step": 4926 }, { "epoch": 3.599634703196347, "grad_norm": 4.0783401374048225, "learning_rate": 1.4961889198026932e-08, "logits/chosen": -2.480146884918213, "logits/rejected": -1.8131487369537354, "logps/chosen": -587.7929077148438, "logps/rejected": -291.56201171875, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": 3.584686279296875, "rewards/margins": 6.378373146057129, "rewards/rejected": -2.793686866760254, "step": 4927 }, { "epoch": 3.600365296803653, "grad_norm": 5.4021974048666035, "learning_rate": 1.4907585358996555e-08, "logits/chosen": -2.71903920173645, "logits/rejected": -2.1971030235290527, "logps/chosen": -598.340087890625, "logps/rejected": -480.6671447753906, "loss": 0.0325, "rewards/accuracies": 1.0, "rewards/chosen": 2.713291883468628, "rewards/margins": 4.921570301055908, "rewards/rejected": -2.208278179168701, "step": 4928 }, { "epoch": 3.601095890410959, "grad_norm": 3.900530847465356, "learning_rate": 1.4853377217833308e-08, "logits/chosen": -2.5251991748809814, "logits/rejected": -2.177750825881958, "logps/chosen": -658.3442993164062, "logps/rejected": -637.48388671875, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": 3.634279251098633, "rewards/margins": 5.6939005851745605, "rewards/rejected": -2.0596213340759277, "step": 4929 }, { "epoch": 3.601826484018265, "grad_norm": 5.548777251394735, "learning_rate": 1.479926479660354e-08, "logits/chosen": -3.0815672874450684, "logits/rejected": -2.3279876708984375, "logps/chosen": -428.8382568359375, "logps/rejected": -395.64239501953125, "loss": 0.0239, "rewards/accuracies": 1.0, "rewards/chosen": 2.686600685119629, "rewards/margins": 6.36191463470459, "rewards/rejected": -3.675313711166382, "step": 4930 }, { "epoch": 3.602557077625571, "grad_norm": 6.594336027164536, "learning_rate": 1.4745248117334463e-08, "logits/chosen": -2.7068121433258057, "logits/rejected": -2.081106424331665, "logps/chosen": -704.3719482421875, "logps/rejected": -577.7283935546875, "loss": 0.041, "rewards/accuracies": 1.0, "rewards/chosen": 4.630320072174072, "rewards/margins": 7.348093032836914, "rewards/rejected": -2.7177727222442627, "step": 4931 }, { "epoch": 3.6032876712328767, "grad_norm": 8.698389048892814, "learning_rate": 1.4691327202014298e-08, "logits/chosen": -3.076620101928711, "logits/rejected": -2.777165412902832, "logps/chosen": -729.5391845703125, "logps/rejected": -669.199951171875, "loss": 0.0403, "rewards/accuracies": 1.0, "rewards/chosen": 2.0385284423828125, "rewards/margins": 4.688797473907471, "rewards/rejected": -2.6502685546875, "step": 4932 }, { "epoch": 3.6040182648401826, "grad_norm": 5.805448787666696, "learning_rate": 1.4637502072592484e-08, "logits/chosen": -2.42877459526062, "logits/rejected": -2.193941116333008, "logps/chosen": -615.9039306640625, "logps/rejected": -650.2303466796875, "loss": 0.0404, "rewards/accuracies": 1.0, "rewards/chosen": 3.770371675491333, "rewards/margins": 5.6415114402771, "rewards/rejected": -1.8711395263671875, "step": 4933 }, { "epoch": 3.6047488584474885, "grad_norm": 4.153053603408953, "learning_rate": 1.4583772750979246e-08, "logits/chosen": -2.4772775173187256, "logits/rejected": -2.3363921642303467, "logps/chosen": -588.1773071289062, "logps/rejected": -583.0360107421875, "loss": 0.0166, "rewards/accuracies": 1.0, "rewards/chosen": 2.339474678039551, "rewards/margins": 5.841885566711426, "rewards/rejected": -3.502410888671875, "step": 4934 }, { "epoch": 3.6054794520547944, "grad_norm": 3.8605239949315036, "learning_rate": 1.4530139259045949e-08, "logits/chosen": -2.9320054054260254, "logits/rejected": -1.7614599466323853, "logps/chosen": -1157.4149169921875, "logps/rejected": -673.194580078125, "loss": 0.057, "rewards/accuracies": 1.0, "rewards/chosen": 6.001422882080078, "rewards/margins": 5.506832599639893, "rewards/rejected": 0.49459078907966614, "step": 4935 }, { "epoch": 3.6062100456621007, "grad_norm": 5.1542134280025085, "learning_rate": 1.4476601618624906e-08, "logits/chosen": -2.4123313426971436, "logits/rejected": -2.5348763465881348, "logps/chosen": -551.796142578125, "logps/rejected": -579.2904052734375, "loss": 0.0202, "rewards/accuracies": 1.0, "rewards/chosen": 1.2983862161636353, "rewards/margins": 4.487605094909668, "rewards/rejected": -3.189218759536743, "step": 4936 }, { "epoch": 3.6069406392694066, "grad_norm": 22.64764518707257, "learning_rate": 1.4423159851509381e-08, "logits/chosen": -2.7184789180755615, "logits/rejected": -2.1777751445770264, "logps/chosen": -605.9635009765625, "logps/rejected": -645.1378173828125, "loss": 0.0386, "rewards/accuracies": 1.0, "rewards/chosen": 3.227332592010498, "rewards/margins": 9.833824157714844, "rewards/rejected": -6.6064910888671875, "step": 4937 }, { "epoch": 3.6076712328767124, "grad_norm": 3.7263833568707447, "learning_rate": 1.4369813979453665e-08, "logits/chosen": -2.958972692489624, "logits/rejected": -2.4528427124023438, "logps/chosen": -706.7979125976562, "logps/rejected": -576.266357421875, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": 4.251306533813477, "rewards/margins": 5.532644271850586, "rewards/rejected": -1.2813377380371094, "step": 4938 }, { "epoch": 3.6084018264840183, "grad_norm": 5.94176354171013, "learning_rate": 1.4316564024172972e-08, "logits/chosen": -2.6601104736328125, "logits/rejected": -2.4280929565429688, "logps/chosen": -424.94146728515625, "logps/rejected": -502.9349060058594, "loss": 0.0268, "rewards/accuracies": 1.0, "rewards/chosen": 3.1454098224639893, "rewards/margins": 5.903762340545654, "rewards/rejected": -2.758352279663086, "step": 4939 }, { "epoch": 3.609132420091324, "grad_norm": 4.127767385952035, "learning_rate": 1.4263410007343546e-08, "logits/chosen": -2.750270366668701, "logits/rejected": -2.4334793090820312, "logps/chosen": -543.7855224609375, "logps/rejected": -610.4910888671875, "loss": 0.0157, "rewards/accuracies": 1.0, "rewards/chosen": 2.8297414779663086, "rewards/margins": 5.369838237762451, "rewards/rejected": -2.5400962829589844, "step": 4940 }, { "epoch": 3.60986301369863, "grad_norm": 5.29680766285093, "learning_rate": 1.4210351950602411e-08, "logits/chosen": -2.694340944290161, "logits/rejected": -1.7536299228668213, "logps/chosen": -563.4503784179688, "logps/rejected": -271.2481689453125, "loss": 0.0376, "rewards/accuracies": 1.0, "rewards/chosen": 2.112499952316284, "rewards/margins": 3.153938055038452, "rewards/rejected": -1.041438341140747, "step": 4941 }, { "epoch": 3.610593607305936, "grad_norm": 9.417708302729118, "learning_rate": 1.415738987554771e-08, "logits/chosen": -2.2814159393310547, "logits/rejected": -2.4195876121520996, "logps/chosen": -428.4518737792969, "logps/rejected": -653.7402954101562, "loss": 0.0442, "rewards/accuracies": 1.0, "rewards/chosen": 1.6833388805389404, "rewards/margins": 7.379876613616943, "rewards/rejected": -5.696537971496582, "step": 4942 }, { "epoch": 3.6113242009132422, "grad_norm": 6.9978680770139565, "learning_rate": 1.4104523803738444e-08, "logits/chosen": -3.0603702068328857, "logits/rejected": -2.0634803771972656, "logps/chosen": -376.5545959472656, "logps/rejected": -243.95254516601562, "loss": 0.0265, "rewards/accuracies": 1.0, "rewards/chosen": 1.5098463296890259, "rewards/margins": 5.015449523925781, "rewards/rejected": -3.5056025981903076, "step": 4943 }, { "epoch": 3.6120547945205477, "grad_norm": 9.759938578803235, "learning_rate": 1.4051753756694567e-08, "logits/chosen": -3.4869821071624756, "logits/rejected": -2.7221784591674805, "logps/chosen": -688.7457275390625, "logps/rejected": -677.357421875, "loss": 0.0407, "rewards/accuracies": 1.0, "rewards/chosen": 2.632488965988159, "rewards/margins": 4.54912805557251, "rewards/rejected": -1.9166390895843506, "step": 4944 }, { "epoch": 3.612785388127854, "grad_norm": 4.430499290074493, "learning_rate": 1.3999079755896841e-08, "logits/chosen": -2.3179399967193604, "logits/rejected": -2.0514044761657715, "logps/chosen": -479.42047119140625, "logps/rejected": -414.65216064453125, "loss": 0.0199, "rewards/accuracies": 1.0, "rewards/chosen": 2.3721225261688232, "rewards/margins": 6.423916816711426, "rewards/rejected": -4.05179500579834, "step": 4945 }, { "epoch": 3.61351598173516, "grad_norm": 3.3766375207223676, "learning_rate": 1.3946501822787005e-08, "logits/chosen": -2.8434810638427734, "logits/rejected": -2.1425728797912598, "logps/chosen": -777.9695434570312, "logps/rejected": -659.5164184570312, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": 5.7906599044799805, "rewards/margins": 7.026459693908691, "rewards/rejected": -1.2357995510101318, "step": 4946 }, { "epoch": 3.6142465753424657, "grad_norm": 4.744328978416688, "learning_rate": 1.3894019978767768e-08, "logits/chosen": -2.6811342239379883, "logits/rejected": -2.476076364517212, "logps/chosen": -741.80517578125, "logps/rejected": -736.726806640625, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": 2.288625955581665, "rewards/margins": 5.298202991485596, "rewards/rejected": -3.0095770359039307, "step": 4947 }, { "epoch": 3.6149771689497716, "grad_norm": 5.938234303894485, "learning_rate": 1.3841634245202572e-08, "logits/chosen": -2.70212459564209, "logits/rejected": -2.630253314971924, "logps/chosen": -627.7560424804688, "logps/rejected": -647.461669921875, "loss": 0.0283, "rewards/accuracies": 1.0, "rewards/chosen": 2.109879970550537, "rewards/margins": 3.959348678588867, "rewards/rejected": -1.84946870803833, "step": 4948 }, { "epoch": 3.6157077625570775, "grad_norm": 4.553098843197645, "learning_rate": 1.3789344643415801e-08, "logits/chosen": -2.9690914154052734, "logits/rejected": -2.472815990447998, "logps/chosen": -1105.2318115234375, "logps/rejected": -896.9271850585938, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": 4.378971099853516, "rewards/margins": 6.56414794921875, "rewards/rejected": -2.1851768493652344, "step": 4949 }, { "epoch": 3.616438356164384, "grad_norm": 3.2050977633082085, "learning_rate": 1.3737151194692792e-08, "logits/chosen": -2.4156651496887207, "logits/rejected": -2.4721527099609375, "logps/chosen": -494.01434326171875, "logps/rejected": -606.6868896484375, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": 3.994915008544922, "rewards/margins": 7.985865592956543, "rewards/rejected": -3.9909510612487793, "step": 4950 }, { "epoch": 3.6171689497716892, "grad_norm": 8.050199366404982, "learning_rate": 1.3685053920279577e-08, "logits/chosen": -2.531856060028076, "logits/rejected": -2.0835039615631104, "logps/chosen": -652.43212890625, "logps/rejected": -597.5093994140625, "loss": 0.0305, "rewards/accuracies": 1.0, "rewards/chosen": 4.920926570892334, "rewards/margins": 7.181002616882324, "rewards/rejected": -2.2600765228271484, "step": 4951 }, { "epoch": 3.6178995433789956, "grad_norm": 3.5771596347592975, "learning_rate": 1.363305284138322e-08, "logits/chosen": -2.689711093902588, "logits/rejected": -1.8034050464630127, "logps/chosen": -590.4208984375, "logps/rejected": -292.2442321777344, "loss": 0.0166, "rewards/accuracies": 1.0, "rewards/chosen": 2.583934783935547, "rewards/margins": 5.2344818115234375, "rewards/rejected": -2.6505470275878906, "step": 4952 }, { "epoch": 3.6186301369863014, "grad_norm": 3.944320826018651, "learning_rate": 1.3581147979171482e-08, "logits/chosen": -2.6328325271606445, "logits/rejected": -1.9900513887405396, "logps/chosen": -1004.3038330078125, "logps/rejected": -614.4046630859375, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": 5.050418853759766, "rewards/margins": 5.765948295593262, "rewards/rejected": -0.7155298590660095, "step": 4953 }, { "epoch": 3.6193607305936073, "grad_norm": 5.088369136867181, "learning_rate": 1.3529339354772963e-08, "logits/chosen": -3.3538615703582764, "logits/rejected": -1.9224574565887451, "logps/chosen": -355.56011962890625, "logps/rejected": -219.252197265625, "loss": 0.0359, "rewards/accuracies": 1.0, "rewards/chosen": 2.8815176486968994, "rewards/margins": 7.4639081954956055, "rewards/rejected": -4.582391262054443, "step": 4954 }, { "epoch": 3.620091324200913, "grad_norm": 4.273358053134908, "learning_rate": 1.3477626989277235e-08, "logits/chosen": -2.697906732559204, "logits/rejected": -2.3901782035827637, "logps/chosen": -474.09112548828125, "logps/rejected": -533.9302368164062, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": 3.53340744972229, "rewards/margins": 6.711707592010498, "rewards/rejected": -3.178300380706787, "step": 4955 }, { "epoch": 3.620821917808219, "grad_norm": 5.127855947295218, "learning_rate": 1.3426010903734491e-08, "logits/chosen": -2.7272696495056152, "logits/rejected": -2.1327109336853027, "logps/chosen": -617.5863647460938, "logps/rejected": -364.83929443359375, "loss": 0.0215, "rewards/accuracies": 1.0, "rewards/chosen": 3.19571852684021, "rewards/margins": 5.962785720825195, "rewards/rejected": -2.7670669555664062, "step": 4956 }, { "epoch": 3.6215525114155254, "grad_norm": 5.5725240503736, "learning_rate": 1.337449111915595e-08, "logits/chosen": -2.5288779735565186, "logits/rejected": -2.047586441040039, "logps/chosen": -420.2648010253906, "logps/rejected": -316.1480407714844, "loss": 0.023, "rewards/accuracies": 1.0, "rewards/chosen": 3.2208380699157715, "rewards/margins": 7.8068013191223145, "rewards/rejected": -4.585963249206543, "step": 4957 }, { "epoch": 3.622283105022831, "grad_norm": 6.840700070878476, "learning_rate": 1.3323067656513365e-08, "logits/chosen": -2.451873540878296, "logits/rejected": -1.6555516719818115, "logps/chosen": -745.0344848632812, "logps/rejected": -496.76043701171875, "loss": 0.0405, "rewards/accuracies": 1.0, "rewards/chosen": 3.0292937755584717, "rewards/margins": 4.628366947174072, "rewards/rejected": -1.5990735292434692, "step": 4958 }, { "epoch": 3.623013698630137, "grad_norm": 9.026283033474556, "learning_rate": 1.3271740536739461e-08, "logits/chosen": -3.091524600982666, "logits/rejected": -3.0044901371002197, "logps/chosen": -677.6026000976562, "logps/rejected": -618.42626953125, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": 1.9941725730895996, "rewards/margins": 4.894344329833984, "rewards/rejected": -2.9001717567443848, "step": 4959 }, { "epoch": 3.623744292237443, "grad_norm": 4.421153734241423, "learning_rate": 1.322050978072778e-08, "logits/chosen": -2.6559348106384277, "logits/rejected": -2.093665599822998, "logps/chosen": -666.587158203125, "logps/rejected": -565.078857421875, "loss": 0.0251, "rewards/accuracies": 1.0, "rewards/chosen": 3.058643102645874, "rewards/margins": 4.240514755249023, "rewards/rejected": -1.1818721294403076, "step": 4960 }, { "epoch": 3.624474885844749, "grad_norm": 6.663250461131089, "learning_rate": 1.3169375409332522e-08, "logits/chosen": -2.7479207515716553, "logits/rejected": -2.266472339630127, "logps/chosen": -496.8941650390625, "logps/rejected": -406.0262756347656, "loss": 0.0261, "rewards/accuracies": 1.0, "rewards/chosen": 1.2300984859466553, "rewards/margins": 4.370258331298828, "rewards/rejected": -3.1401596069335938, "step": 4961 }, { "epoch": 3.6252054794520547, "grad_norm": 4.681394746424465, "learning_rate": 1.3118337443368683e-08, "logits/chosen": -2.479274272918701, "logits/rejected": -1.8501332998275757, "logps/chosen": -591.069580078125, "logps/rejected": -507.423828125, "loss": 0.0388, "rewards/accuracies": 1.0, "rewards/chosen": 2.927041530609131, "rewards/margins": 5.88829231262207, "rewards/rejected": -2.9612510204315186, "step": 4962 }, { "epoch": 3.6259360730593606, "grad_norm": 5.825294113920076, "learning_rate": 1.3067395903612e-08, "logits/chosen": -3.115133285522461, "logits/rejected": -2.4177699089050293, "logps/chosen": -652.1610107421875, "logps/rejected": -540.2487182617188, "loss": 0.0445, "rewards/accuracies": 1.0, "rewards/chosen": 4.3379011154174805, "rewards/margins": 7.972149848937988, "rewards/rejected": -3.634248733520508, "step": 4963 }, { "epoch": 3.626666666666667, "grad_norm": 4.838168564760343, "learning_rate": 1.3016550810799054e-08, "logits/chosen": -2.762763261795044, "logits/rejected": -1.7890461683273315, "logps/chosen": -530.1361694335938, "logps/rejected": -437.7515563964844, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": 4.360979080200195, "rewards/margins": 6.129787445068359, "rewards/rejected": -1.7688078880310059, "step": 4964 }, { "epoch": 3.6273972602739724, "grad_norm": 8.362952377787675, "learning_rate": 1.2965802185627012e-08, "logits/chosen": -2.879387378692627, "logits/rejected": -2.156440258026123, "logps/chosen": -724.29443359375, "logps/rejected": -521.943359375, "loss": 0.0524, "rewards/accuracies": 1.0, "rewards/chosen": 3.9365200996398926, "rewards/margins": 4.483674049377441, "rewards/rejected": -0.5471538305282593, "step": 4965 }, { "epoch": 3.6281278538812787, "grad_norm": 4.821797888603383, "learning_rate": 1.2915150048753959e-08, "logits/chosen": -2.601569652557373, "logits/rejected": -1.7517011165618896, "logps/chosen": -702.8552856445312, "logps/rejected": -458.84637451171875, "loss": 0.0186, "rewards/accuracies": 1.0, "rewards/chosen": 3.4237594604492188, "rewards/margins": 5.349665641784668, "rewards/rejected": -1.9259058237075806, "step": 4966 }, { "epoch": 3.6288584474885845, "grad_norm": 6.178810877298469, "learning_rate": 1.2864594420798541e-08, "logits/chosen": -2.767418146133423, "logits/rejected": -2.360182285308838, "logps/chosen": -574.9163818359375, "logps/rejected": -481.7040100097656, "loss": 0.0249, "rewards/accuracies": 1.0, "rewards/chosen": 3.8834755420684814, "rewards/margins": 6.738829612731934, "rewards/rejected": -2.855354070663452, "step": 4967 }, { "epoch": 3.6295890410958904, "grad_norm": 11.69902310979993, "learning_rate": 1.281413532234013e-08, "logits/chosen": -2.4465556144714355, "logits/rejected": -2.206759452819824, "logps/chosen": -724.1290283203125, "logps/rejected": -760.5428466796875, "loss": 0.0436, "rewards/accuracies": 1.0, "rewards/chosen": 2.8107268810272217, "rewards/margins": 4.091342449188232, "rewards/rejected": -1.2806155681610107, "step": 4968 }, { "epoch": 3.6303196347031963, "grad_norm": 6.024426964065655, "learning_rate": 1.2763772773918962e-08, "logits/chosen": -2.8919434547424316, "logits/rejected": -2.1327898502349854, "logps/chosen": -698.1096801757812, "logps/rejected": -691.4432373046875, "loss": 0.0328, "rewards/accuracies": 1.0, "rewards/chosen": 5.333352088928223, "rewards/margins": 5.508305549621582, "rewards/rejected": -0.1749529242515564, "step": 4969 }, { "epoch": 3.631050228310502, "grad_norm": 5.090746971434826, "learning_rate": 1.2713506796035805e-08, "logits/chosen": -2.527717351913452, "logits/rejected": -2.198263168334961, "logps/chosen": -778.7467041015625, "logps/rejected": -744.8602294921875, "loss": 0.0219, "rewards/accuracies": 1.0, "rewards/chosen": 3.9868085384368896, "rewards/margins": 4.994714260101318, "rewards/rejected": -1.0079063177108765, "step": 4970 }, { "epoch": 3.6317808219178085, "grad_norm": 4.964373118306447, "learning_rate": 1.2663337409152152e-08, "logits/chosen": -2.6394248008728027, "logits/rejected": -2.440413236618042, "logps/chosen": -593.75439453125, "logps/rejected": -601.484130859375, "loss": 0.0227, "rewards/accuracies": 1.0, "rewards/chosen": 2.3491270542144775, "rewards/margins": 4.706028938293457, "rewards/rejected": -2.3569018840789795, "step": 4971 }, { "epoch": 3.632511415525114, "grad_norm": 20.994720521953177, "learning_rate": 1.2613264633690251e-08, "logits/chosen": -2.4414944648742676, "logits/rejected": -2.43949031829834, "logps/chosen": -602.1036376953125, "logps/rejected": -529.75, "loss": 0.0385, "rewards/accuracies": 0.875, "rewards/chosen": 3.366373062133789, "rewards/margins": 4.892458915710449, "rewards/rejected": -1.526085376739502, "step": 4972 }, { "epoch": 3.6332420091324202, "grad_norm": 4.779703126889909, "learning_rate": 1.2563288490032908e-08, "logits/chosen": -2.6154637336730957, "logits/rejected": -2.490981101989746, "logps/chosen": -430.9783630371094, "logps/rejected": -457.033203125, "loss": 0.0348, "rewards/accuracies": 1.0, "rewards/chosen": 1.9435690641403198, "rewards/margins": 6.892870903015137, "rewards/rejected": -4.949301719665527, "step": 4973 }, { "epoch": 3.633972602739726, "grad_norm": 5.028863376341632, "learning_rate": 1.2513408998523766e-08, "logits/chosen": -2.7714080810546875, "logits/rejected": -1.8111541271209717, "logps/chosen": -606.610595703125, "logps/rejected": -515.3637084960938, "loss": 0.0261, "rewards/accuracies": 1.0, "rewards/chosen": 2.486809730529785, "rewards/margins": 4.129487991333008, "rewards/rejected": -1.6426780223846436, "step": 4974 }, { "epoch": 3.634703196347032, "grad_norm": 8.62447941632634, "learning_rate": 1.2463626179466862e-08, "logits/chosen": -2.82859206199646, "logits/rejected": -2.2128055095672607, "logps/chosen": -730.5551147460938, "logps/rejected": -570.25390625, "loss": 0.03, "rewards/accuracies": 1.0, "rewards/chosen": 3.17203688621521, "rewards/margins": 6.717136383056641, "rewards/rejected": -3.5450997352600098, "step": 4975 }, { "epoch": 3.635433789954338, "grad_norm": 4.605680375094357, "learning_rate": 1.2413940053127152e-08, "logits/chosen": -2.755937099456787, "logits/rejected": -1.9866008758544922, "logps/chosen": -905.93701171875, "logps/rejected": -673.9881591796875, "loss": 0.0316, "rewards/accuracies": 1.0, "rewards/chosen": 4.484188556671143, "rewards/margins": 6.212436676025391, "rewards/rejected": -1.728247880935669, "step": 4976 }, { "epoch": 3.6361643835616437, "grad_norm": 5.983773861166716, "learning_rate": 1.2364350639730042e-08, "logits/chosen": -2.8310351371765137, "logits/rejected": -2.9169158935546875, "logps/chosen": -502.45977783203125, "logps/rejected": -670.9161376953125, "loss": 0.0373, "rewards/accuracies": 1.0, "rewards/chosen": 2.297565460205078, "rewards/margins": 6.091044902801514, "rewards/rejected": -3.7934794425964355, "step": 4977 }, { "epoch": 3.6368949771689496, "grad_norm": 4.061805078785401, "learning_rate": 1.231485795946166e-08, "logits/chosen": -2.489978790283203, "logits/rejected": -1.8111284971237183, "logps/chosen": -504.44769287109375, "logps/rejected": -479.1871337890625, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": 4.706709861755371, "rewards/margins": 9.793432235717773, "rewards/rejected": -5.0867228507995605, "step": 4978 }, { "epoch": 3.6376255707762555, "grad_norm": 6.572413921525285, "learning_rate": 1.2265462032468837e-08, "logits/chosen": -2.5088276863098145, "logits/rejected": -2.2071290016174316, "logps/chosen": -507.25164794921875, "logps/rejected": -449.140625, "loss": 0.0277, "rewards/accuracies": 1.0, "rewards/chosen": 1.902218222618103, "rewards/margins": 4.910658836364746, "rewards/rejected": -3.0084404945373535, "step": 4979 }, { "epoch": 3.638356164383562, "grad_norm": 6.87981842422407, "learning_rate": 1.2216162878858766e-08, "logits/chosen": -2.9053659439086914, "logits/rejected": -2.2742605209350586, "logps/chosen": -682.93701171875, "logps/rejected": -656.45849609375, "loss": 0.0279, "rewards/accuracies": 1.0, "rewards/chosen": 3.4096474647521973, "rewards/margins": 6.298251628875732, "rewards/rejected": -2.888603687286377, "step": 4980 }, { "epoch": 3.6390867579908677, "grad_norm": 7.734969545882674, "learning_rate": 1.2166960518699476e-08, "logits/chosen": -2.5555145740509033, "logits/rejected": -2.030280113220215, "logps/chosen": -662.6485595703125, "logps/rejected": -443.122314453125, "loss": 0.0262, "rewards/accuracies": 1.0, "rewards/chosen": 2.8420424461364746, "rewards/margins": 5.088775634765625, "rewards/rejected": -2.2467329502105713, "step": 4981 }, { "epoch": 3.6398173515981735, "grad_norm": 6.815262820464939, "learning_rate": 1.2117854972019504e-08, "logits/chosen": -2.494877815246582, "logits/rejected": -2.06886887550354, "logps/chosen": -438.3285217285156, "logps/rejected": -349.3526611328125, "loss": 0.037, "rewards/accuracies": 1.0, "rewards/chosen": 3.0063390731811523, "rewards/margins": 6.595122814178467, "rewards/rejected": -3.5887837409973145, "step": 4982 }, { "epoch": 3.6405479452054794, "grad_norm": 4.866174387311311, "learning_rate": 1.2068846258808053e-08, "logits/chosen": -2.5969972610473633, "logits/rejected": -2.057610034942627, "logps/chosen": -719.48876953125, "logps/rejected": -516.5559692382812, "loss": 0.0262, "rewards/accuracies": 1.0, "rewards/chosen": 3.1342883110046387, "rewards/margins": 7.562150955200195, "rewards/rejected": -4.427862644195557, "step": 4983 }, { "epoch": 3.6412785388127853, "grad_norm": 6.824301201115388, "learning_rate": 1.2019934399014775e-08, "logits/chosen": -2.730226516723633, "logits/rejected": -1.7602723836898804, "logps/chosen": -736.7938842773438, "logps/rejected": -464.6603698730469, "loss": 0.0269, "rewards/accuracies": 1.0, "rewards/chosen": 5.25307035446167, "rewards/margins": 8.24604320526123, "rewards/rejected": -2.9929730892181396, "step": 4984 }, { "epoch": 3.642009132420091, "grad_norm": 5.387653626555019, "learning_rate": 1.1971119412549968e-08, "logits/chosen": -2.9423961639404297, "logits/rejected": -2.243894100189209, "logps/chosen": -367.57196044921875, "logps/rejected": -316.951416015625, "loss": 0.0234, "rewards/accuracies": 1.0, "rewards/chosen": 3.4940381050109863, "rewards/margins": 7.372396469116211, "rewards/rejected": -3.8783578872680664, "step": 4985 }, { "epoch": 3.642739726027397, "grad_norm": 4.346195547943833, "learning_rate": 1.1922401319284541e-08, "logits/chosen": -2.6520771980285645, "logits/rejected": -1.8140076398849487, "logps/chosen": -673.7935180664062, "logps/rejected": -517.4631958007812, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": 3.2812232971191406, "rewards/margins": 7.3910040855407715, "rewards/rejected": -4.109781265258789, "step": 4986 }, { "epoch": 3.6434703196347034, "grad_norm": 6.090965221565648, "learning_rate": 1.1873780139049938e-08, "logits/chosen": -2.2286362648010254, "logits/rejected": -2.2299914360046387, "logps/chosen": -384.91107177734375, "logps/rejected": -519.7764282226562, "loss": 0.0308, "rewards/accuracies": 1.0, "rewards/chosen": 1.8030693531036377, "rewards/margins": 5.252211570739746, "rewards/rejected": -3.4491419792175293, "step": 4987 }, { "epoch": 3.6442009132420092, "grad_norm": 4.431616307709286, "learning_rate": 1.1825255891638047e-08, "logits/chosen": -2.8287863731384277, "logits/rejected": -1.8730733394622803, "logps/chosen": -357.5176696777344, "logps/rejected": -286.6629333496094, "loss": 0.0331, "rewards/accuracies": 1.0, "rewards/chosen": 2.7830159664154053, "rewards/margins": 5.721095085144043, "rewards/rejected": -2.938079357147217, "step": 4988 }, { "epoch": 3.644931506849315, "grad_norm": 5.104303952401441, "learning_rate": 1.1776828596801486e-08, "logits/chosen": -3.1557559967041016, "logits/rejected": -1.8025275468826294, "logps/chosen": -454.87835693359375, "logps/rejected": -270.68798828125, "loss": 0.0292, "rewards/accuracies": 1.0, "rewards/chosen": 3.9384517669677734, "rewards/margins": 6.076869964599609, "rewards/rejected": -2.138417959213257, "step": 4989 }, { "epoch": 3.645662100456621, "grad_norm": 23.241154126187183, "learning_rate": 1.1728498274253207e-08, "logits/chosen": -2.766709804534912, "logits/rejected": -1.921900749206543, "logps/chosen": -337.01165771484375, "logps/rejected": -244.45591735839844, "loss": 0.1086, "rewards/accuracies": 0.875, "rewards/chosen": 1.8135473728179932, "rewards/margins": 5.842642784118652, "rewards/rejected": -4.029094696044922, "step": 4990 }, { "epoch": 3.646392694063927, "grad_norm": 4.946986489114997, "learning_rate": 1.168026494366689e-08, "logits/chosen": -2.5711050033569336, "logits/rejected": -2.3076632022857666, "logps/chosen": -642.8716430664062, "logps/rejected": -727.2966918945312, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": 4.418731212615967, "rewards/margins": 5.500716209411621, "rewards/rejected": -1.0819849967956543, "step": 4991 }, { "epoch": 3.6471232876712327, "grad_norm": 3.8163957546874623, "learning_rate": 1.1632128624676579e-08, "logits/chosen": -2.49723482131958, "logits/rejected": -2.022437572479248, "logps/chosen": -515.4949951171875, "logps/rejected": -635.9481811523438, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": 3.5298516750335693, "rewards/margins": 9.15672492980957, "rewards/rejected": -5.6268720626831055, "step": 4992 }, { "epoch": 3.6478538812785386, "grad_norm": 5.15865496411791, "learning_rate": 1.1584089336876878e-08, "logits/chosen": -2.9760546684265137, "logits/rejected": -2.042757034301758, "logps/chosen": -781.492431640625, "logps/rejected": -627.048828125, "loss": 0.0387, "rewards/accuracies": 1.0, "rewards/chosen": 3.78318452835083, "rewards/margins": 5.278162956237793, "rewards/rejected": -1.4949781894683838, "step": 4993 }, { "epoch": 3.648584474885845, "grad_norm": 10.96296108759424, "learning_rate": 1.153614709982284e-08, "logits/chosen": -2.888509750366211, "logits/rejected": -2.1270081996917725, "logps/chosen": -322.24554443359375, "logps/rejected": -405.07647705078125, "loss": 0.0703, "rewards/accuracies": 1.0, "rewards/chosen": 3.2514896392822266, "rewards/margins": 7.629096984863281, "rewards/rejected": -4.377607345581055, "step": 4994 }, { "epoch": 3.649315068493151, "grad_norm": 8.134846985262403, "learning_rate": 1.148830193303016e-08, "logits/chosen": -2.9876837730407715, "logits/rejected": -2.4236247539520264, "logps/chosen": -804.34130859375, "logps/rejected": -661.3295288085938, "loss": 0.0419, "rewards/accuracies": 1.0, "rewards/chosen": 4.147801399230957, "rewards/margins": 7.04709529876709, "rewards/rejected": -2.899293899536133, "step": 4995 }, { "epoch": 3.6500456621004567, "grad_norm": 9.888604558888494, "learning_rate": 1.1440553855974921e-08, "logits/chosen": -2.6522483825683594, "logits/rejected": -2.0315802097320557, "logps/chosen": -438.24737548828125, "logps/rejected": -325.1896057128906, "loss": 0.0469, "rewards/accuracies": 1.0, "rewards/chosen": 3.414846658706665, "rewards/margins": 5.464251518249512, "rewards/rejected": -2.0494046211242676, "step": 4996 }, { "epoch": 3.6507762557077625, "grad_norm": 7.08548392765192, "learning_rate": 1.1392902888093609e-08, "logits/chosen": -2.796035051345825, "logits/rejected": -1.6291238069534302, "logps/chosen": -447.4072265625, "logps/rejected": -234.86109924316406, "loss": 0.0306, "rewards/accuracies": 1.0, "rewards/chosen": 2.404801607131958, "rewards/margins": 5.602197647094727, "rewards/rejected": -3.1973958015441895, "step": 4997 }, { "epoch": 3.6515068493150684, "grad_norm": 5.612322954473709, "learning_rate": 1.1345349048783343e-08, "logits/chosen": -3.1737120151519775, "logits/rejected": -2.2382586002349854, "logps/chosen": -871.295654296875, "logps/rejected": -706.2567749023438, "loss": 0.0194, "rewards/accuracies": 1.0, "rewards/chosen": 5.5728759765625, "rewards/margins": 6.437324523925781, "rewards/rejected": -0.864449143409729, "step": 4998 }, { "epoch": 3.6522374429223743, "grad_norm": 5.589280202036742, "learning_rate": 1.1297892357401557e-08, "logits/chosen": -2.5547709465026855, "logits/rejected": -2.4644925594329834, "logps/chosen": -358.62640380859375, "logps/rejected": -325.69683837890625, "loss": 0.023, "rewards/accuracies": 1.0, "rewards/chosen": 2.02415132522583, "rewards/margins": 6.632464408874512, "rewards/rejected": -4.608313083648682, "step": 4999 }, { "epoch": 3.65296803652968, "grad_norm": 6.109171442021235, "learning_rate": 1.125053283326624e-08, "logits/chosen": -2.312709331512451, "logits/rejected": -2.216336727142334, "logps/chosen": -563.1953125, "logps/rejected": -604.1397094726562, "loss": 0.0248, "rewards/accuracies": 1.0, "rewards/chosen": 3.045668601989746, "rewards/margins": 7.10048770904541, "rewards/rejected": -4.054819107055664, "step": 5000 }, { "epoch": 3.6536986301369865, "grad_norm": 10.774484474258394, "learning_rate": 1.120327049565581e-08, "logits/chosen": -2.65822172164917, "logits/rejected": -2.4944167137145996, "logps/chosen": -399.1329650878906, "logps/rejected": -442.3031921386719, "loss": 0.034, "rewards/accuracies": 1.0, "rewards/chosen": 3.4078099727630615, "rewards/margins": 9.393660545349121, "rewards/rejected": -5.985850811004639, "step": 5001 }, { "epoch": 3.6544292237442924, "grad_norm": 5.212516122331088, "learning_rate": 1.1156105363809038e-08, "logits/chosen": -3.213745355606079, "logits/rejected": -2.632384777069092, "logps/chosen": -755.2295532226562, "logps/rejected": -686.6834106445312, "loss": 0.0194, "rewards/accuracies": 1.0, "rewards/chosen": 5.03738260269165, "rewards/margins": 7.800621032714844, "rewards/rejected": -2.7632381916046143, "step": 5002 }, { "epoch": 3.6551598173515982, "grad_norm": 7.637376590112551, "learning_rate": 1.1109037456925264e-08, "logits/chosen": -2.6473302841186523, "logits/rejected": -1.7815258502960205, "logps/chosen": -597.8436279296875, "logps/rejected": -495.144775390625, "loss": 0.0263, "rewards/accuracies": 1.0, "rewards/chosen": 3.888540506362915, "rewards/margins": 6.449294090270996, "rewards/rejected": -2.56075382232666, "step": 5003 }, { "epoch": 3.655890410958904, "grad_norm": 2.4875266759007575, "learning_rate": 1.1062066794164105e-08, "logits/chosen": -3.220269203186035, "logits/rejected": -2.6763052940368652, "logps/chosen": -696.7132568359375, "logps/rejected": -561.9117431640625, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": 3.7878189086914062, "rewards/margins": 5.676016330718994, "rewards/rejected": -1.888197660446167, "step": 5004 }, { "epoch": 3.65662100456621, "grad_norm": 6.429330030217081, "learning_rate": 1.101519339464574e-08, "logits/chosen": -2.4469704627990723, "logits/rejected": -2.2502031326293945, "logps/chosen": -617.1973876953125, "logps/rejected": -748.163818359375, "loss": 0.0265, "rewards/accuracies": 1.0, "rewards/chosen": 1.8087329864501953, "rewards/margins": 5.208615303039551, "rewards/rejected": -3.3998827934265137, "step": 5005 }, { "epoch": 3.657351598173516, "grad_norm": 4.392084171107023, "learning_rate": 1.0968417277450681e-08, "logits/chosen": -2.889047145843506, "logits/rejected": -1.9222571849822998, "logps/chosen": -741.5457763671875, "logps/rejected": -559.9583740234375, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": 3.67885160446167, "rewards/margins": 6.773281097412109, "rewards/rejected": -3.0944290161132812, "step": 5006 }, { "epoch": 3.6580821917808217, "grad_norm": 14.264924382097268, "learning_rate": 1.0921738461619784e-08, "logits/chosen": -2.8421785831451416, "logits/rejected": -1.8360848426818848, "logps/chosen": -709.569580078125, "logps/rejected": -348.006591796875, "loss": 0.065, "rewards/accuracies": 1.0, "rewards/chosen": 3.2817211151123047, "rewards/margins": 4.430834770202637, "rewards/rejected": -1.149113655090332, "step": 5007 }, { "epoch": 3.658812785388128, "grad_norm": 5.755228402173248, "learning_rate": 1.0875156966154403e-08, "logits/chosen": -2.768486261367798, "logits/rejected": -1.5652518272399902, "logps/chosen": -478.0400390625, "logps/rejected": -452.94891357421875, "loss": 0.0276, "rewards/accuracies": 1.0, "rewards/chosen": 4.313446044921875, "rewards/margins": 8.550421714782715, "rewards/rejected": -4.236974239349365, "step": 5008 }, { "epoch": 3.659543378995434, "grad_norm": 4.522807919134824, "learning_rate": 1.0828672810016231e-08, "logits/chosen": -3.2082014083862305, "logits/rejected": -2.030862331390381, "logps/chosen": -429.34698486328125, "logps/rejected": -299.21533203125, "loss": 0.0176, "rewards/accuracies": 1.0, "rewards/chosen": 2.8899245262145996, "rewards/margins": 6.714264869689941, "rewards/rejected": -3.824340581893921, "step": 5009 }, { "epoch": 3.66027397260274, "grad_norm": 6.995201100750529, "learning_rate": 1.0782286012127328e-08, "logits/chosen": -3.0526461601257324, "logits/rejected": -2.303250312805176, "logps/chosen": -1052.0784912109375, "logps/rejected": -733.8595581054688, "loss": 0.0286, "rewards/accuracies": 1.0, "rewards/chosen": 5.817885875701904, "rewards/margins": 6.6353607177734375, "rewards/rejected": -0.8174750804901123, "step": 5010 }, { "epoch": 3.6610045662100457, "grad_norm": 4.611870352094826, "learning_rate": 1.0735996591370089e-08, "logits/chosen": -2.4474825859069824, "logits/rejected": -2.843782663345337, "logps/chosen": -328.3268127441406, "logps/rejected": -494.6016845703125, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": 2.4212841987609863, "rewards/margins": 5.297910690307617, "rewards/rejected": -2.876626491546631, "step": 5011 }, { "epoch": 3.6617351598173515, "grad_norm": 4.4422083845798195, "learning_rate": 1.0689804566587329e-08, "logits/chosen": -2.788879156112671, "logits/rejected": -2.2933897972106934, "logps/chosen": -741.582275390625, "logps/rejected": -704.850341796875, "loss": 0.0189, "rewards/accuracies": 1.0, "rewards/chosen": 4.1613450050354, "rewards/margins": 6.157814979553223, "rewards/rejected": -1.9964697360992432, "step": 5012 }, { "epoch": 3.6624657534246574, "grad_norm": 5.934684648113882, "learning_rate": 1.0643709956582259e-08, "logits/chosen": -2.726708173751831, "logits/rejected": -2.6094868183135986, "logps/chosen": -562.7911987304688, "logps/rejected": -746.3187866210938, "loss": 0.028, "rewards/accuracies": 1.0, "rewards/chosen": 2.9765002727508545, "rewards/margins": 4.796983242034912, "rewards/rejected": -1.8204832077026367, "step": 5013 }, { "epoch": 3.6631963470319633, "grad_norm": 12.64077709658187, "learning_rate": 1.059771278011834e-08, "logits/chosen": -2.94533371925354, "logits/rejected": -2.081666946411133, "logps/chosen": -791.6185302734375, "logps/rejected": -776.4025268554688, "loss": 0.0496, "rewards/accuracies": 1.0, "rewards/chosen": 3.048682928085327, "rewards/margins": 6.212512969970703, "rewards/rejected": -3.163829803466797, "step": 5014 }, { "epoch": 3.6639269406392696, "grad_norm": 7.865785566652984, "learning_rate": 1.05518130559194e-08, "logits/chosen": -2.454007863998413, "logits/rejected": -1.813387393951416, "logps/chosen": -450.84942626953125, "logps/rejected": -327.8338623046875, "loss": 0.0264, "rewards/accuracies": 1.0, "rewards/chosen": 1.2528247833251953, "rewards/margins": 5.485146522521973, "rewards/rejected": -4.232321262359619, "step": 5015 }, { "epoch": 3.6646575342465755, "grad_norm": 2.4425907282928883, "learning_rate": 1.050601080266958e-08, "logits/chosen": -2.337956428527832, "logits/rejected": -2.1703686714172363, "logps/chosen": -432.38873291015625, "logps/rejected": -589.2066650390625, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": 4.073193550109863, "rewards/margins": 8.074756622314453, "rewards/rejected": -4.00156307220459, "step": 5016 }, { "epoch": 3.6653881278538814, "grad_norm": 4.972496299838171, "learning_rate": 1.0460306039013433e-08, "logits/chosen": -3.0314087867736816, "logits/rejected": -1.9709181785583496, "logps/chosen": -895.0570068359375, "logps/rejected": -583.5997314453125, "loss": 0.0208, "rewards/accuracies": 1.0, "rewards/chosen": 5.030900001525879, "rewards/margins": 7.429391860961914, "rewards/rejected": -2.398491382598877, "step": 5017 }, { "epoch": 3.6661187214611872, "grad_norm": 4.558682869871148, "learning_rate": 1.0414698783555692e-08, "logits/chosen": -2.775031805038452, "logits/rejected": -2.356797218322754, "logps/chosen": -605.9387817382812, "logps/rejected": -659.4324340820312, "loss": 0.0174, "rewards/accuracies": 1.0, "rewards/chosen": 3.6190271377563477, "rewards/margins": 7.0377020835876465, "rewards/rejected": -3.4186747074127197, "step": 5018 }, { "epoch": 3.666849315068493, "grad_norm": 6.209715448270261, "learning_rate": 1.0369189054861504e-08, "logits/chosen": -2.3677597045898438, "logits/rejected": -1.680794596672058, "logps/chosen": -567.0107421875, "logps/rejected": -416.6210632324219, "loss": 0.0218, "rewards/accuracies": 1.0, "rewards/chosen": 4.080602645874023, "rewards/margins": 6.195878028869629, "rewards/rejected": -2.1152756214141846, "step": 5019 }, { "epoch": 3.667579908675799, "grad_norm": 8.738726956251577, "learning_rate": 1.0323776871456302e-08, "logits/chosen": -2.9042718410491943, "logits/rejected": -2.3305392265319824, "logps/chosen": -353.13922119140625, "logps/rejected": -269.0190124511719, "loss": 0.0319, "rewards/accuracies": 1.0, "rewards/chosen": 2.225142002105713, "rewards/margins": 4.923352241516113, "rewards/rejected": -2.6982102394104004, "step": 5020 }, { "epoch": 3.668310502283105, "grad_norm": 10.206937442761625, "learning_rate": 1.0278462251825742e-08, "logits/chosen": -3.380833148956299, "logits/rejected": -2.038905143737793, "logps/chosen": -881.7860107421875, "logps/rejected": -447.4192810058594, "loss": 0.0616, "rewards/accuracies": 1.0, "rewards/chosen": 4.28671932220459, "rewards/margins": 5.249815940856934, "rewards/rejected": -0.9630969166755676, "step": 5021 }, { "epoch": 3.669041095890411, "grad_norm": 5.574969427597997, "learning_rate": 1.0233245214415904e-08, "logits/chosen": -2.7984695434570312, "logits/rejected": -1.8546152114868164, "logps/chosen": -622.156005859375, "logps/rejected": -374.0086364746094, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/chosen": 3.581740379333496, "rewards/margins": 6.892285346984863, "rewards/rejected": -3.310544729232788, "step": 5022 }, { "epoch": 3.669771689497717, "grad_norm": 7.621445124203808, "learning_rate": 1.018812577763295e-08, "logits/chosen": -2.7129485607147217, "logits/rejected": -1.839579701423645, "logps/chosen": -220.93130493164062, "logps/rejected": -188.08241271972656, "loss": 0.0424, "rewards/accuracies": 1.0, "rewards/chosen": 1.882807731628418, "rewards/margins": 5.164862632751465, "rewards/rejected": -3.282054901123047, "step": 5023 }, { "epoch": 3.670502283105023, "grad_norm": 5.346247039430203, "learning_rate": 1.014310395984344e-08, "logits/chosen": -2.892454147338867, "logits/rejected": -2.0929346084594727, "logps/chosen": -481.2275390625, "logps/rejected": -438.8293762207031, "loss": 0.0457, "rewards/accuracies": 1.0, "rewards/chosen": 3.302556037902832, "rewards/margins": 6.5696611404418945, "rewards/rejected": -3.2671053409576416, "step": 5024 }, { "epoch": 3.671232876712329, "grad_norm": 9.205145881155788, "learning_rate": 1.0098179779374245e-08, "logits/chosen": -2.565753698348999, "logits/rejected": -1.817800760269165, "logps/chosen": -395.488037109375, "logps/rejected": -342.58123779296875, "loss": 0.037, "rewards/accuracies": 1.0, "rewards/chosen": 1.5134303569793701, "rewards/margins": 4.409404277801514, "rewards/rejected": -2.8959739208221436, "step": 5025 }, { "epoch": 3.6719634703196347, "grad_norm": 7.29279273647735, "learning_rate": 1.0053353254512343e-08, "logits/chosen": -3.2475247383117676, "logits/rejected": -2.071342945098877, "logps/chosen": -629.945068359375, "logps/rejected": -526.7030029296875, "loss": 0.0288, "rewards/accuracies": 1.0, "rewards/chosen": 3.6097495555877686, "rewards/margins": 3.9947264194488525, "rewards/rejected": -0.3849765658378601, "step": 5026 }, { "epoch": 3.6726940639269405, "grad_norm": 4.05925703457289, "learning_rate": 1.000862440350514e-08, "logits/chosen": -2.5609498023986816, "logits/rejected": -1.5062772035598755, "logps/chosen": -858.53076171875, "logps/rejected": -414.47509765625, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": 2.9030656814575195, "rewards/margins": 4.787032127380371, "rewards/rejected": -1.8839665651321411, "step": 5027 }, { "epoch": 3.6734246575342464, "grad_norm": 8.412894987041517, "learning_rate": 9.963993244560043e-09, "logits/chosen": -2.6715633869171143, "logits/rejected": -2.0171985626220703, "logps/chosen": -578.77587890625, "logps/rejected": -388.9113464355469, "loss": 0.0358, "rewards/accuracies": 1.0, "rewards/chosen": 4.188660144805908, "rewards/margins": 6.832197666168213, "rewards/rejected": -2.6435372829437256, "step": 5028 }, { "epoch": 3.6741552511415527, "grad_norm": 3.7038290263978317, "learning_rate": 9.919459795844882e-09, "logits/chosen": -3.074591636657715, "logits/rejected": -2.3832831382751465, "logps/chosen": -814.585205078125, "logps/rejected": -658.4573364257812, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": 3.299025297164917, "rewards/margins": 5.372915267944336, "rewards/rejected": -2.07388973236084, "step": 5029 }, { "epoch": 3.6748858447488586, "grad_norm": 5.134804813646101, "learning_rate": 9.875024075487737e-09, "logits/chosen": -2.66650652885437, "logits/rejected": -2.23826265335083, "logps/chosen": -398.7612609863281, "logps/rejected": -309.9295349121094, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": 2.371500015258789, "rewards/margins": 5.053155899047852, "rewards/rejected": -2.6816558837890625, "step": 5030 }, { "epoch": 3.6756164383561645, "grad_norm": 9.698971361499703, "learning_rate": 9.830686101576752e-09, "logits/chosen": -3.498220443725586, "logits/rejected": -2.3192574977874756, "logps/chosen": -482.06231689453125, "logps/rejected": -376.4936828613281, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": 4.230009078979492, "rewards/margins": 7.388519763946533, "rewards/rejected": -3.158510684967041, "step": 5031 }, { "epoch": 3.6763470319634703, "grad_norm": 10.949803696357735, "learning_rate": 9.786445892160378e-09, "logits/chosen": -3.0255391597747803, "logits/rejected": -2.1837518215179443, "logps/chosen": -822.0245971679688, "logps/rejected": -640.118896484375, "loss": 0.0402, "rewards/accuracies": 1.0, "rewards/chosen": 3.637388229370117, "rewards/margins": 6.272444725036621, "rewards/rejected": -2.635056495666504, "step": 5032 }, { "epoch": 3.677077625570776, "grad_norm": 6.246325213620918, "learning_rate": 9.742303465247209e-09, "logits/chosen": -2.90877628326416, "logits/rejected": -2.0492947101593018, "logps/chosen": -602.4557495117188, "logps/rejected": -475.2513122558594, "loss": 0.0451, "rewards/accuracies": 1.0, "rewards/chosen": 3.1231882572174072, "rewards/margins": 4.883848190307617, "rewards/rejected": -1.760660171508789, "step": 5033 }, { "epoch": 3.677808219178082, "grad_norm": 3.556908351786739, "learning_rate": 9.698258838806151e-09, "logits/chosen": -2.5687813758850098, "logits/rejected": -2.41752290725708, "logps/chosen": -572.8116455078125, "logps/rejected": -491.2940368652344, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": 3.194937229156494, "rewards/margins": 6.96864652633667, "rewards/rejected": -3.773709535598755, "step": 5034 }, { "epoch": 3.678538812785388, "grad_norm": 7.345844895268467, "learning_rate": 9.654312030766192e-09, "logits/chosen": -3.188194990158081, "logits/rejected": -2.194950580596924, "logps/chosen": -677.454833984375, "logps/rejected": -509.7347717285156, "loss": 0.0278, "rewards/accuracies": 1.0, "rewards/chosen": 3.1455800533294678, "rewards/margins": 5.550493240356445, "rewards/rejected": -2.4049134254455566, "step": 5035 }, { "epoch": 3.6792694063926943, "grad_norm": 6.073364904316744, "learning_rate": 9.610463059016528e-09, "logits/chosen": -3.434969902038574, "logits/rejected": -2.3412585258483887, "logps/chosen": -824.503173828125, "logps/rejected": -496.74993896484375, "loss": 0.0363, "rewards/accuracies": 1.0, "rewards/chosen": 3.523646354675293, "rewards/margins": 5.063518047332764, "rewards/rejected": -1.5398718118667603, "step": 5036 }, { "epoch": 3.68, "grad_norm": 13.910022421077645, "learning_rate": 9.566711941406542e-09, "logits/chosen": -2.8088018894195557, "logits/rejected": -1.959862232208252, "logps/chosen": -645.4884643554688, "logps/rejected": -554.910888671875, "loss": 0.0407, "rewards/accuracies": 1.0, "rewards/chosen": 3.638185501098633, "rewards/margins": 8.221769332885742, "rewards/rejected": -4.583584308624268, "step": 5037 }, { "epoch": 3.680730593607306, "grad_norm": 5.025755921247674, "learning_rate": 9.523058695745766e-09, "logits/chosen": -3.400773048400879, "logits/rejected": -2.1231000423431396, "logps/chosen": -615.6619873046875, "logps/rejected": -348.7728271484375, "loss": 0.0219, "rewards/accuracies": 1.0, "rewards/chosen": 3.8900856971740723, "rewards/margins": 5.978219509124756, "rewards/rejected": -2.0881340503692627, "step": 5038 }, { "epoch": 3.681461187214612, "grad_norm": 6.036167176152993, "learning_rate": 9.479503339803934e-09, "logits/chosen": -2.5731418132781982, "logits/rejected": -1.9697538614273071, "logps/chosen": -426.8103332519531, "logps/rejected": -418.65777587890625, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/chosen": 2.2693281173706055, "rewards/margins": 4.581599235534668, "rewards/rejected": -2.3122715950012207, "step": 5039 }, { "epoch": 3.682191780821918, "grad_norm": 5.209665214633237, "learning_rate": 9.436045891310862e-09, "logits/chosen": -2.618593692779541, "logits/rejected": -2.0185794830322266, "logps/chosen": -551.0594482421875, "logps/rejected": -597.8787231445312, "loss": 0.0214, "rewards/accuracies": 1.0, "rewards/chosen": 3.793001890182495, "rewards/margins": 6.971484184265137, "rewards/rejected": -3.1784822940826416, "step": 5040 }, { "epoch": 3.6829223744292237, "grad_norm": 7.924384387441202, "learning_rate": 9.392686367956564e-09, "logits/chosen": -2.882692337036133, "logits/rejected": -2.1158251762390137, "logps/chosen": -789.023681640625, "logps/rejected": -590.9784545898438, "loss": 0.041, "rewards/accuracies": 1.0, "rewards/chosen": 5.377256393432617, "rewards/margins": 6.239599227905273, "rewards/rejected": -0.8623433113098145, "step": 5041 }, { "epoch": 3.6836529680365295, "grad_norm": 9.369225109388216, "learning_rate": 9.349424787391231e-09, "logits/chosen": -2.8556370735168457, "logits/rejected": -2.5064218044281006, "logps/chosen": -593.1741333007812, "logps/rejected": -527.498046875, "loss": 0.0359, "rewards/accuracies": 1.0, "rewards/chosen": 2.222100257873535, "rewards/margins": 5.836189270019531, "rewards/rejected": -3.614089012145996, "step": 5042 }, { "epoch": 3.684383561643836, "grad_norm": 9.707296509831414, "learning_rate": 9.306261167225049e-09, "logits/chosen": -3.049774169921875, "logits/rejected": -1.7836401462554932, "logps/chosen": -636.2188110351562, "logps/rejected": -428.2032775878906, "loss": 0.0635, "rewards/accuracies": 1.0, "rewards/chosen": 3.273406982421875, "rewards/margins": 5.591802597045898, "rewards/rejected": -2.3183958530426025, "step": 5043 }, { "epoch": 3.6851141552511413, "grad_norm": 3.6672535484422144, "learning_rate": 9.263195525028495e-09, "logits/chosen": -2.685478925704956, "logits/rejected": -2.0220727920532227, "logps/chosen": -492.8677978515625, "logps/rejected": -403.22357177734375, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": 4.2809648513793945, "rewards/margins": 6.985711097717285, "rewards/rejected": -2.7047457695007324, "step": 5044 }, { "epoch": 3.6858447488584476, "grad_norm": 6.314942444503559, "learning_rate": 9.220227878331988e-09, "logits/chosen": -2.4019529819488525, "logits/rejected": -1.6351741552352905, "logps/chosen": -756.2339477539062, "logps/rejected": -649.398681640625, "loss": 0.0355, "rewards/accuracies": 1.0, "rewards/chosen": 3.084904432296753, "rewards/margins": 4.793069839477539, "rewards/rejected": -1.708164930343628, "step": 5045 }, { "epoch": 3.6865753424657535, "grad_norm": 6.8559030422445595, "learning_rate": 9.177358244626232e-09, "logits/chosen": -2.7916312217712402, "logits/rejected": -2.1924214363098145, "logps/chosen": -530.0902709960938, "logps/rejected": -499.3133239746094, "loss": 0.0332, "rewards/accuracies": 1.0, "rewards/chosen": 2.675196647644043, "rewards/margins": 6.515890121459961, "rewards/rejected": -3.840693712234497, "step": 5046 }, { "epoch": 3.6873059360730593, "grad_norm": 4.797447344405029, "learning_rate": 9.13458664136188e-09, "logits/chosen": -2.7058844566345215, "logits/rejected": -2.4431376457214355, "logps/chosen": -509.8936462402344, "logps/rejected": -602.3110961914062, "loss": 0.0245, "rewards/accuracies": 1.0, "rewards/chosen": 3.4713711738586426, "rewards/margins": 6.627808570861816, "rewards/rejected": -3.156437635421753, "step": 5047 }, { "epoch": 3.688036529680365, "grad_norm": 11.037088810313083, "learning_rate": 9.091913085949838e-09, "logits/chosen": -2.8711934089660645, "logits/rejected": -2.1356937885284424, "logps/chosen": -559.8658447265625, "logps/rejected": -465.31622314453125, "loss": 0.0409, "rewards/accuracies": 1.0, "rewards/chosen": 2.3381471633911133, "rewards/margins": 4.147368431091309, "rewards/rejected": -1.8092211484909058, "step": 5048 }, { "epoch": 3.688767123287671, "grad_norm": 5.77526975059509, "learning_rate": 9.049337595760932e-09, "logits/chosen": -2.835561752319336, "logits/rejected": -2.9276955127716064, "logps/chosen": -577.837646484375, "logps/rejected": -605.9193115234375, "loss": 0.0341, "rewards/accuracies": 1.0, "rewards/chosen": 1.2904380559921265, "rewards/margins": 4.520406723022461, "rewards/rejected": -3.229968309402466, "step": 5049 }, { "epoch": 3.6894977168949774, "grad_norm": 3.2992972926249116, "learning_rate": 9.006860188126159e-09, "logits/chosen": -2.5798468589782715, "logits/rejected": -1.629941463470459, "logps/chosen": -534.8768310546875, "logps/rejected": -365.4146423339844, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": 3.3678386211395264, "rewards/margins": 7.375581741333008, "rewards/rejected": -4.007742881774902, "step": 5050 }, { "epoch": 3.690228310502283, "grad_norm": 7.215088547383326, "learning_rate": 8.964480880336634e-09, "logits/chosen": -3.1546475887298584, "logits/rejected": -2.3447060585021973, "logps/chosen": -808.6317138671875, "logps/rejected": -585.6484985351562, "loss": 0.0281, "rewards/accuracies": 1.0, "rewards/chosen": 5.648570537567139, "rewards/margins": 6.499456882476807, "rewards/rejected": -0.8508871793746948, "step": 5051 }, { "epoch": 3.690958904109589, "grad_norm": 7.827006063992876, "learning_rate": 8.922199689643389e-09, "logits/chosen": -2.3888022899627686, "logits/rejected": -2.3374671936035156, "logps/chosen": -470.3553771972656, "logps/rejected": -516.9263916015625, "loss": 0.0288, "rewards/accuracies": 1.0, "rewards/chosen": 2.149846076965332, "rewards/margins": 4.708239555358887, "rewards/rejected": -2.5583934783935547, "step": 5052 }, { "epoch": 3.691689497716895, "grad_norm": 4.367097529540591, "learning_rate": 8.880016633257742e-09, "logits/chosen": -2.672499656677246, "logits/rejected": -2.248317241668701, "logps/chosen": -701.4856567382812, "logps/rejected": -642.4390258789062, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": 3.154810667037964, "rewards/margins": 6.309783935546875, "rewards/rejected": -3.154973030090332, "step": 5053 }, { "epoch": 3.692420091324201, "grad_norm": 6.119303954690646, "learning_rate": 8.837931728350845e-09, "logits/chosen": -2.4049274921417236, "logits/rejected": -2.692828416824341, "logps/chosen": -512.72412109375, "logps/rejected": -630.3656616210938, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": 1.8608218431472778, "rewards/margins": 4.717554569244385, "rewards/rejected": -2.8567328453063965, "step": 5054 }, { "epoch": 3.6931506849315068, "grad_norm": 4.898209123900282, "learning_rate": 8.795944992053966e-09, "logits/chosen": -2.7830114364624023, "logits/rejected": -2.185962677001953, "logps/chosen": -808.3312377929688, "logps/rejected": -497.28619384765625, "loss": 0.0167, "rewards/accuracies": 1.0, "rewards/chosen": 4.3887457847595215, "rewards/margins": 5.654379844665527, "rewards/rejected": -1.2656335830688477, "step": 5055 }, { "epoch": 3.6938812785388127, "grad_norm": 4.3271498700559015, "learning_rate": 8.754056441458519e-09, "logits/chosen": -3.0049712657928467, "logits/rejected": -2.341064691543579, "logps/chosen": -775.647216796875, "logps/rejected": -621.7024536132812, "loss": 0.0158, "rewards/accuracies": 1.0, "rewards/chosen": 2.8144755363464355, "rewards/margins": 4.356524467468262, "rewards/rejected": -1.5420490503311157, "step": 5056 }, { "epoch": 3.694611872146119, "grad_norm": 5.449652235502284, "learning_rate": 8.712266093615778e-09, "logits/chosen": -3.0058913230895996, "logits/rejected": -2.531808853149414, "logps/chosen": -712.953857421875, "logps/rejected": -604.2498168945312, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": 4.31389045715332, "rewards/margins": 5.136478424072266, "rewards/rejected": -0.8225875496864319, "step": 5057 }, { "epoch": 3.6953424657534244, "grad_norm": 4.710246324840542, "learning_rate": 8.670573965537164e-09, "logits/chosen": -3.154130220413208, "logits/rejected": -2.6591227054595947, "logps/chosen": -606.7923583984375, "logps/rejected": -533.64794921875, "loss": 0.0167, "rewards/accuracies": 1.0, "rewards/chosen": 3.798262357711792, "rewards/margins": 6.286894798278809, "rewards/rejected": -2.488632917404175, "step": 5058 }, { "epoch": 3.6960730593607307, "grad_norm": 5.084355027547012, "learning_rate": 8.628980074194103e-09, "logits/chosen": -2.8742759227752686, "logits/rejected": -1.7649638652801514, "logps/chosen": -660.1542358398438, "logps/rejected": -443.21826171875, "loss": 0.0224, "rewards/accuracies": 1.0, "rewards/chosen": 3.7479300498962402, "rewards/margins": 8.036016464233398, "rewards/rejected": -4.288085460662842, "step": 5059 }, { "epoch": 3.6968036529680366, "grad_norm": 4.681151095925632, "learning_rate": 8.58748443651794e-09, "logits/chosen": -2.569105625152588, "logits/rejected": -2.4298624992370605, "logps/chosen": -319.1777038574219, "logps/rejected": -373.2764892578125, "loss": 0.0232, "rewards/accuracies": 1.0, "rewards/chosen": 1.9039833545684814, "rewards/margins": 4.757515907287598, "rewards/rejected": -2.8535327911376953, "step": 5060 }, { "epoch": 3.6975342465753425, "grad_norm": 7.491136272298514, "learning_rate": 8.546087069400188e-09, "logits/chosen": -2.6237387657165527, "logits/rejected": -2.477534770965576, "logps/chosen": -687.9176025390625, "logps/rejected": -771.757568359375, "loss": 0.0234, "rewards/accuracies": 1.0, "rewards/chosen": 3.1955008506774902, "rewards/margins": 5.6241655349731445, "rewards/rejected": -2.428664445877075, "step": 5061 }, { "epoch": 3.6982648401826483, "grad_norm": 6.341232025584218, "learning_rate": 8.504787989692148e-09, "logits/chosen": -2.7330679893493652, "logits/rejected": -2.249089002609253, "logps/chosen": -581.3637084960938, "logps/rejected": -464.7267761230469, "loss": 0.0321, "rewards/accuracies": 1.0, "rewards/chosen": 2.838322639465332, "rewards/margins": 4.925880432128906, "rewards/rejected": -2.0875582695007324, "step": 5062 }, { "epoch": 3.698995433789954, "grad_norm": 10.072995227295657, "learning_rate": 8.463587214205315e-09, "logits/chosen": -2.177626609802246, "logits/rejected": -2.050426959991455, "logps/chosen": -538.1229248046875, "logps/rejected": -586.8942260742188, "loss": 0.0359, "rewards/accuracies": 1.0, "rewards/chosen": 1.2070196866989136, "rewards/margins": 5.524992942810059, "rewards/rejected": -4.317973613739014, "step": 5063 }, { "epoch": 3.6997260273972605, "grad_norm": 3.95447236873491, "learning_rate": 8.422484759710995e-09, "logits/chosen": -2.5176734924316406, "logits/rejected": -2.249882698059082, "logps/chosen": -405.72271728515625, "logps/rejected": -525.605224609375, "loss": 0.0209, "rewards/accuracies": 1.0, "rewards/chosen": 1.464372158050537, "rewards/margins": 6.044486999511719, "rewards/rejected": -4.580114841461182, "step": 5064 }, { "epoch": 3.700456621004566, "grad_norm": 7.167467412645008, "learning_rate": 8.381480642940613e-09, "logits/chosen": -2.8144264221191406, "logits/rejected": -2.797603130340576, "logps/chosen": -484.5921936035156, "logps/rejected": -604.9864501953125, "loss": 0.0388, "rewards/accuracies": 1.0, "rewards/chosen": 2.3235840797424316, "rewards/margins": 4.975071430206299, "rewards/rejected": -2.651487112045288, "step": 5065 }, { "epoch": 3.7011872146118723, "grad_norm": 8.839784198042889, "learning_rate": 8.340574880585565e-09, "logits/chosen": -2.625257730484009, "logits/rejected": -2.603688955307007, "logps/chosen": -475.04010009765625, "logps/rejected": -579.0245971679688, "loss": 0.0363, "rewards/accuracies": 1.0, "rewards/chosen": 2.8958091735839844, "rewards/margins": 5.843235492706299, "rewards/rejected": -2.9474258422851562, "step": 5066 }, { "epoch": 3.701917808219178, "grad_norm": 4.701737817124277, "learning_rate": 8.299767489297033e-09, "logits/chosen": -2.2155067920684814, "logits/rejected": -2.4938650131225586, "logps/chosen": -412.4917297363281, "logps/rejected": -563.496337890625, "loss": 0.0192, "rewards/accuracies": 1.0, "rewards/chosen": 1.466621994972229, "rewards/margins": 3.6370911598205566, "rewards/rejected": -2.170469284057617, "step": 5067 }, { "epoch": 3.702648401826484, "grad_norm": 4.104398433641061, "learning_rate": 8.259058485686338e-09, "logits/chosen": -2.592960834503174, "logits/rejected": -1.9607031345367432, "logps/chosen": -512.3428955078125, "logps/rejected": -526.190673828125, "loss": 0.0187, "rewards/accuracies": 1.0, "rewards/chosen": 2.1612601280212402, "rewards/margins": 5.701000213623047, "rewards/rejected": -3.5397400856018066, "step": 5068 }, { "epoch": 3.70337899543379, "grad_norm": 5.091869170828335, "learning_rate": 8.218447886324642e-09, "logits/chosen": -3.170189380645752, "logits/rejected": -2.268812894821167, "logps/chosen": -888.031005859375, "logps/rejected": -752.5208740234375, "loss": 0.0185, "rewards/accuracies": 1.0, "rewards/chosen": 4.929838180541992, "rewards/margins": 6.091067314147949, "rewards/rejected": -1.1612286567687988, "step": 5069 }, { "epoch": 3.7041095890410958, "grad_norm": 3.950929899249874, "learning_rate": 8.177935707743166e-09, "logits/chosen": -3.029808521270752, "logits/rejected": -2.224611282348633, "logps/chosen": -588.7421875, "logps/rejected": -409.0157470703125, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": 3.9010443687438965, "rewards/margins": 7.460653305053711, "rewards/rejected": -3.5596089363098145, "step": 5070 }, { "epoch": 3.704840182648402, "grad_norm": 5.502292502068598, "learning_rate": 8.137521966432943e-09, "logits/chosen": -2.603707790374756, "logits/rejected": -2.4564530849456787, "logps/chosen": -838.802490234375, "logps/rejected": -914.10888671875, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": 3.9759745597839355, "rewards/margins": 3.904309034347534, "rewards/rejected": 0.07166527211666107, "step": 5071 }, { "epoch": 3.7055707762557075, "grad_norm": 9.084236963336805, "learning_rate": 8.097206678844948e-09, "logits/chosen": -2.2606945037841797, "logits/rejected": -1.9299852848052979, "logps/chosen": -436.77752685546875, "logps/rejected": -491.685546875, "loss": 0.0486, "rewards/accuracies": 1.0, "rewards/chosen": 2.6302127838134766, "rewards/margins": 5.53331184387207, "rewards/rejected": -2.9030985832214355, "step": 5072 }, { "epoch": 3.706301369863014, "grad_norm": 8.257437388003428, "learning_rate": 8.056989861390224e-09, "logits/chosen": -2.9869303703308105, "logits/rejected": -2.6468913555145264, "logps/chosen": -802.6857299804688, "logps/rejected": -590.25, "loss": 0.0576, "rewards/accuracies": 1.0, "rewards/chosen": 5.67018461227417, "rewards/margins": 6.25648307800293, "rewards/rejected": -0.5862980484962463, "step": 5073 }, { "epoch": 3.7070319634703197, "grad_norm": 4.421501871818018, "learning_rate": 8.016871530439562e-09, "logits/chosen": -2.4008946418762207, "logits/rejected": -1.9380593299865723, "logps/chosen": -614.7699584960938, "logps/rejected": -462.58502197265625, "loss": 0.0185, "rewards/accuracies": 1.0, "rewards/chosen": 2.45828914642334, "rewards/margins": 5.9088215827941895, "rewards/rejected": -3.4505324363708496, "step": 5074 }, { "epoch": 3.7077625570776256, "grad_norm": 8.59452189360117, "learning_rate": 7.976851702323678e-09, "logits/chosen": -2.613285541534424, "logits/rejected": -2.372197389602661, "logps/chosen": -706.485595703125, "logps/rejected": -714.3635864257812, "loss": 0.0367, "rewards/accuracies": 1.0, "rewards/chosen": 4.440736293792725, "rewards/margins": 7.805972099304199, "rewards/rejected": -3.3652360439300537, "step": 5075 }, { "epoch": 3.7084931506849315, "grad_norm": 4.050291526568836, "learning_rate": 7.936930393333346e-09, "logits/chosen": -2.744561195373535, "logits/rejected": -2.145289421081543, "logps/chosen": -606.6604614257812, "logps/rejected": -545.4505004882812, "loss": 0.0146, "rewards/accuracies": 1.0, "rewards/chosen": 3.153050184249878, "rewards/margins": 6.796442031860352, "rewards/rejected": -3.6433916091918945, "step": 5076 }, { "epoch": 3.7092237442922373, "grad_norm": 3.6533678220480716, "learning_rate": 7.89710761971904e-09, "logits/chosen": -3.063739776611328, "logits/rejected": -2.370452404022217, "logps/chosen": -461.9040832519531, "logps/rejected": -407.93072509765625, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": 2.679072856903076, "rewards/margins": 6.202139854431152, "rewards/rejected": -3.523066997528076, "step": 5077 }, { "epoch": 3.7099543378995437, "grad_norm": 4.394254548580422, "learning_rate": 7.857383397691291e-09, "logits/chosen": -3.2160277366638184, "logits/rejected": -2.458526134490967, "logps/chosen": -699.3499145507812, "logps/rejected": -591.102294921875, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": 4.4851975440979, "rewards/margins": 8.930644035339355, "rewards/rejected": -4.445446014404297, "step": 5078 }, { "epoch": 3.710684931506849, "grad_norm": 6.976012136658777, "learning_rate": 7.81775774342039e-09, "logits/chosen": -2.9662563800811768, "logits/rejected": -1.5400795936584473, "logps/chosen": -502.540283203125, "logps/rejected": -271.12750244140625, "loss": 0.0305, "rewards/accuracies": 1.0, "rewards/chosen": 4.133876323699951, "rewards/margins": 8.755796432495117, "rewards/rejected": -4.62191915512085, "step": 5079 }, { "epoch": 3.7114155251141554, "grad_norm": 10.886775864915661, "learning_rate": 7.77823067303654e-09, "logits/chosen": -2.3981058597564697, "logits/rejected": -2.0092146396636963, "logps/chosen": -652.9555053710938, "logps/rejected": -493.0001525878906, "loss": 0.0272, "rewards/accuracies": 1.0, "rewards/chosen": 3.402529716491699, "rewards/margins": 7.071486949920654, "rewards/rejected": -3.668957471847534, "step": 5080 }, { "epoch": 3.7121461187214613, "grad_norm": 4.836329528990791, "learning_rate": 7.738802202629818e-09, "logits/chosen": -2.644071102142334, "logits/rejected": -2.5845603942871094, "logps/chosen": -940.265380859375, "logps/rejected": -658.072021484375, "loss": 0.024, "rewards/accuracies": 1.0, "rewards/chosen": 3.538025379180908, "rewards/margins": 4.192035675048828, "rewards/rejected": -0.6540101766586304, "step": 5081 }, { "epoch": 3.712876712328767, "grad_norm": 6.6147081578609255, "learning_rate": 7.699472348250191e-09, "logits/chosen": -2.948201894760132, "logits/rejected": -1.9489357471466064, "logps/chosen": -369.9847717285156, "logps/rejected": -325.1393127441406, "loss": 0.0315, "rewards/accuracies": 1.0, "rewards/chosen": 2.362766742706299, "rewards/margins": 7.055796146392822, "rewards/rejected": -4.693029403686523, "step": 5082 }, { "epoch": 3.713607305936073, "grad_norm": 8.527660019759093, "learning_rate": 7.660241125907518e-09, "logits/chosen": -3.302248239517212, "logits/rejected": -2.019807815551758, "logps/chosen": -825.5071411132812, "logps/rejected": -727.3031005859375, "loss": 0.0279, "rewards/accuracies": 1.0, "rewards/chosen": 3.979224681854248, "rewards/margins": 7.772692680358887, "rewards/rejected": -3.7934682369232178, "step": 5083 }, { "epoch": 3.714337899543379, "grad_norm": 4.586573801948755, "learning_rate": 7.621108551571332e-09, "logits/chosen": -2.656813859939575, "logits/rejected": -1.2723143100738525, "logps/chosen": -424.7811584472656, "logps/rejected": -286.2162170410156, "loss": 0.0209, "rewards/accuracies": 1.0, "rewards/chosen": 3.742940902709961, "rewards/margins": 7.718925952911377, "rewards/rejected": -3.975984573364258, "step": 5084 }, { "epoch": 3.7150684931506848, "grad_norm": 4.566523619775847, "learning_rate": 7.582074641171193e-09, "logits/chosen": -2.7881383895874023, "logits/rejected": -1.9369020462036133, "logps/chosen": -716.2423095703125, "logps/rejected": -463.05731201171875, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": 2.0290913581848145, "rewards/margins": 5.5343918800354, "rewards/rejected": -3.505300521850586, "step": 5085 }, { "epoch": 3.7157990867579906, "grad_norm": 4.0607991105820425, "learning_rate": 7.54313941059645e-09, "logits/chosen": -3.0158212184906006, "logits/rejected": -2.7470767498016357, "logps/chosen": -850.751708984375, "logps/rejected": -815.0722045898438, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": 4.558330535888672, "rewards/margins": 6.604321002960205, "rewards/rejected": -2.045990467071533, "step": 5086 }, { "epoch": 3.716529680365297, "grad_norm": 3.1648727321498358, "learning_rate": 7.504302875696255e-09, "logits/chosen": -2.8860983848571777, "logits/rejected": -1.8144338130950928, "logps/chosen": -816.8818969726562, "logps/rejected": -348.695068359375, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": 2.439476251602173, "rewards/margins": 5.9949541091918945, "rewards/rejected": -3.5554776191711426, "step": 5087 }, { "epoch": 3.717260273972603, "grad_norm": 4.663686868183559, "learning_rate": 7.465565052279576e-09, "logits/chosen": -2.405628204345703, "logits/rejected": -2.083636999130249, "logps/chosen": -412.9337158203125, "logps/rejected": -467.9239196777344, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": 2.1587088108062744, "rewards/margins": 6.340037822723389, "rewards/rejected": -4.181328773498535, "step": 5088 }, { "epoch": 3.7179908675799087, "grad_norm": 4.769581637479543, "learning_rate": 7.426925956115243e-09, "logits/chosen": -3.0792064666748047, "logits/rejected": -2.659092426300049, "logps/chosen": -718.056884765625, "logps/rejected": -661.9677734375, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": 3.5212202072143555, "rewards/margins": 6.266656875610352, "rewards/rejected": -2.745436668395996, "step": 5089 }, { "epoch": 3.7187214611872146, "grad_norm": 5.879959610261838, "learning_rate": 7.3883856029318956e-09, "logits/chosen": -2.483488082885742, "logits/rejected": -2.0755178928375244, "logps/chosen": -938.1800537109375, "logps/rejected": -670.1099853515625, "loss": 0.0263, "rewards/accuracies": 1.0, "rewards/chosen": 5.521704196929932, "rewards/margins": 5.017334461212158, "rewards/rejected": 0.5043697953224182, "step": 5090 }, { "epoch": 3.7194520547945205, "grad_norm": 4.125076755618748, "learning_rate": 7.349944008417902e-09, "logits/chosen": -2.5369436740875244, "logits/rejected": -2.3626348972320557, "logps/chosen": -363.21917724609375, "logps/rejected": -369.86187744140625, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": 1.7978274822235107, "rewards/margins": 5.967123031616211, "rewards/rejected": -4.169295310974121, "step": 5091 }, { "epoch": 3.7201826484018263, "grad_norm": 7.733762897969637, "learning_rate": 7.311601188221522e-09, "logits/chosen": -3.1929256916046143, "logits/rejected": -2.1674697399139404, "logps/chosen": -619.7457885742188, "logps/rejected": -407.32208251953125, "loss": 0.0467, "rewards/accuracies": 1.0, "rewards/chosen": 2.4050564765930176, "rewards/margins": 4.1820831298828125, "rewards/rejected": -1.7770265340805054, "step": 5092 }, { "epoch": 3.720913242009132, "grad_norm": 6.298215813261715, "learning_rate": 7.273357157950799e-09, "logits/chosen": -2.45630145072937, "logits/rejected": -2.413308620452881, "logps/chosen": -525.9450073242188, "logps/rejected": -636.7787475585938, "loss": 0.0244, "rewards/accuracies": 1.0, "rewards/chosen": 2.800288438796997, "rewards/margins": 5.244524955749512, "rewards/rejected": -2.4442362785339355, "step": 5093 }, { "epoch": 3.7216438356164385, "grad_norm": 8.62327890224356, "learning_rate": 7.235211933173446e-09, "logits/chosen": -2.609365940093994, "logits/rejected": -2.231405258178711, "logps/chosen": -649.0304565429688, "logps/rejected": -633.1611938476562, "loss": 0.0286, "rewards/accuracies": 1.0, "rewards/chosen": 3.650658130645752, "rewards/margins": 5.669842720031738, "rewards/rejected": -2.0191845893859863, "step": 5094 }, { "epoch": 3.7223744292237444, "grad_norm": 6.421011242473927, "learning_rate": 7.197165529417154e-09, "logits/chosen": -2.9865429401397705, "logits/rejected": -2.017261505126953, "logps/chosen": -613.39208984375, "logps/rejected": -466.5948486328125, "loss": 0.0229, "rewards/accuracies": 1.0, "rewards/chosen": 3.482133388519287, "rewards/margins": 5.9161376953125, "rewards/rejected": -2.434004306793213, "step": 5095 }, { "epoch": 3.7231050228310503, "grad_norm": 4.319030902742278, "learning_rate": 7.159217962169229e-09, "logits/chosen": -2.709670066833496, "logits/rejected": -2.3804845809936523, "logps/chosen": -922.4927978515625, "logps/rejected": -833.0275268554688, "loss": 0.0186, "rewards/accuracies": 1.0, "rewards/chosen": 4.31215238571167, "rewards/margins": 5.348480224609375, "rewards/rejected": -1.0363280773162842, "step": 5096 }, { "epoch": 3.723835616438356, "grad_norm": 5.699917494862727, "learning_rate": 7.121369246876757e-09, "logits/chosen": -2.4691152572631836, "logits/rejected": -1.5377204418182373, "logps/chosen": -417.51220703125, "logps/rejected": -314.47186279296875, "loss": 0.0235, "rewards/accuracies": 1.0, "rewards/chosen": 2.3750147819519043, "rewards/margins": 6.64589786529541, "rewards/rejected": -4.270883083343506, "step": 5097 }, { "epoch": 3.724566210045662, "grad_norm": 12.057356770657567, "learning_rate": 7.083619398946667e-09, "logits/chosen": -2.600818634033203, "logits/rejected": -2.0366594791412354, "logps/chosen": -898.0039672851562, "logps/rejected": -665.27783203125, "loss": 0.0443, "rewards/accuracies": 1.0, "rewards/chosen": 3.427705764770508, "rewards/margins": 5.708762168884277, "rewards/rejected": -2.2810561656951904, "step": 5098 }, { "epoch": 3.725296803652968, "grad_norm": 6.37506043648589, "learning_rate": 7.0459684337455806e-09, "logits/chosen": -2.2671518325805664, "logits/rejected": -1.71909761428833, "logps/chosen": -414.21697998046875, "logps/rejected": -345.7305603027344, "loss": 0.0246, "rewards/accuracies": 1.0, "rewards/chosen": 2.8307268619537354, "rewards/margins": 6.863399982452393, "rewards/rejected": -4.032673358917236, "step": 5099 }, { "epoch": 3.7260273972602738, "grad_norm": 6.597096249454039, "learning_rate": 7.008416366599851e-09, "logits/chosen": -2.992377758026123, "logits/rejected": -2.665470600128174, "logps/chosen": -619.439697265625, "logps/rejected": -679.1497192382812, "loss": 0.0253, "rewards/accuracies": 1.0, "rewards/chosen": 4.31355094909668, "rewards/margins": 7.132406234741211, "rewards/rejected": -2.8188552856445312, "step": 5100 }, { "epoch": 3.72675799086758, "grad_norm": 6.251873803581467, "learning_rate": 6.970963212795694e-09, "logits/chosen": -3.123159646987915, "logits/rejected": -1.9817333221435547, "logps/chosen": -415.8374328613281, "logps/rejected": -268.5133056640625, "loss": 0.0322, "rewards/accuracies": 1.0, "rewards/chosen": 3.1039321422576904, "rewards/margins": 6.636877059936523, "rewards/rejected": -3.532944917678833, "step": 5101 }, { "epoch": 3.727488584474886, "grad_norm": 6.208510667286523, "learning_rate": 6.933608987578915e-09, "logits/chosen": -2.6125478744506836, "logits/rejected": -2.087252616882324, "logps/chosen": -663.6724243164062, "logps/rejected": -562.3287353515625, "loss": 0.0219, "rewards/accuracies": 1.0, "rewards/chosen": 3.2261953353881836, "rewards/margins": 5.9099884033203125, "rewards/rejected": -2.683793306350708, "step": 5102 }, { "epoch": 3.728219178082192, "grad_norm": 4.252134796733265, "learning_rate": 6.896353706155073e-09, "logits/chosen": -3.2371058464050293, "logits/rejected": -2.065404176712036, "logps/chosen": -830.2171630859375, "logps/rejected": -623.5980224609375, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": 3.2832400798797607, "rewards/margins": 7.5412092208862305, "rewards/rejected": -4.257968425750732, "step": 5103 }, { "epoch": 3.7289497716894977, "grad_norm": 7.7934274026998125, "learning_rate": 6.859197383689563e-09, "logits/chosen": -2.325915813446045, "logits/rejected": -1.9939959049224854, "logps/chosen": -736.9437866210938, "logps/rejected": -668.8948364257812, "loss": 0.0358, "rewards/accuracies": 1.0, "rewards/chosen": 5.198722839355469, "rewards/margins": 10.255955696105957, "rewards/rejected": -5.0572333335876465, "step": 5104 }, { "epoch": 3.7296803652968036, "grad_norm": 7.932549802838412, "learning_rate": 6.8221400353074e-09, "logits/chosen": -2.9066579341888428, "logits/rejected": -2.5110068321228027, "logps/chosen": -416.2682800292969, "logps/rejected": -479.75140380859375, "loss": 0.0566, "rewards/accuracies": 0.875, "rewards/chosen": 1.8602830171585083, "rewards/margins": 3.0826823711395264, "rewards/rejected": -1.2223994731903076, "step": 5105 }, { "epoch": 3.7304109589041095, "grad_norm": 4.462970757964945, "learning_rate": 6.785181676093238e-09, "logits/chosen": -3.2582693099975586, "logits/rejected": -2.041839122772217, "logps/chosen": -676.477294921875, "logps/rejected": -403.8673400878906, "loss": 0.0171, "rewards/accuracies": 1.0, "rewards/chosen": 3.945931911468506, "rewards/margins": 6.203205108642578, "rewards/rejected": -2.2572736740112305, "step": 5106 }, { "epoch": 3.7311415525114153, "grad_norm": 8.225175905520672, "learning_rate": 6.748322321091654e-09, "logits/chosen": -2.846987247467041, "logits/rejected": -2.0065157413482666, "logps/chosen": -640.17333984375, "logps/rejected": -420.6883544921875, "loss": 0.0321, "rewards/accuracies": 1.0, "rewards/chosen": 3.370133638381958, "rewards/margins": 5.869946479797363, "rewards/rejected": -2.499812602996826, "step": 5107 }, { "epoch": 3.7318721461187216, "grad_norm": 7.480101828876484, "learning_rate": 6.71156198530673e-09, "logits/chosen": -1.9997341632843018, "logits/rejected": -2.4122824668884277, "logps/chosen": -413.1813049316406, "logps/rejected": -498.0959167480469, "loss": 0.0278, "rewards/accuracies": 1.0, "rewards/chosen": 1.7552613019943237, "rewards/margins": 5.50410795211792, "rewards/rejected": -3.7488465309143066, "step": 5108 }, { "epoch": 3.7326027397260275, "grad_norm": 6.777919250151352, "learning_rate": 6.674900683702356e-09, "logits/chosen": -3.0379161834716797, "logits/rejected": -2.192465305328369, "logps/chosen": -583.462890625, "logps/rejected": -463.1513671875, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": 4.003178596496582, "rewards/margins": 8.495306015014648, "rewards/rejected": -4.492127418518066, "step": 5109 }, { "epoch": 3.7333333333333334, "grad_norm": 5.934628083194418, "learning_rate": 6.638338431201984e-09, "logits/chosen": -2.501500129699707, "logits/rejected": -2.1944642066955566, "logps/chosen": -732.1004638671875, "logps/rejected": -729.7444458007812, "loss": 0.0267, "rewards/accuracies": 1.0, "rewards/chosen": 3.358598232269287, "rewards/margins": 7.04538106918335, "rewards/rejected": -3.6867823600769043, "step": 5110 }, { "epoch": 3.7340639269406393, "grad_norm": 5.306394411959352, "learning_rate": 6.601875242688848e-09, "logits/chosen": -2.594242811203003, "logits/rejected": -1.7229927778244019, "logps/chosen": -351.3848876953125, "logps/rejected": -233.3987579345703, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/chosen": 2.9767262935638428, "rewards/margins": 8.023309707641602, "rewards/rejected": -5.04658317565918, "step": 5111 }, { "epoch": 3.734794520547945, "grad_norm": 5.242165851121934, "learning_rate": 6.56551113300588e-09, "logits/chosen": -2.672478437423706, "logits/rejected": -2.321223735809326, "logps/chosen": -888.9583740234375, "logps/rejected": -752.5204467773438, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": 4.98089599609375, "rewards/margins": 5.567633628845215, "rewards/rejected": -0.5867384672164917, "step": 5112 }, { "epoch": 3.735525114155251, "grad_norm": 4.154782602657063, "learning_rate": 6.529246116955572e-09, "logits/chosen": -2.5570151805877686, "logits/rejected": -1.6345703601837158, "logps/chosen": -736.3905639648438, "logps/rejected": -548.4368286132812, "loss": 0.0342, "rewards/accuracies": 1.0, "rewards/chosen": 3.2051331996917725, "rewards/margins": 8.570262908935547, "rewards/rejected": -5.365130424499512, "step": 5113 }, { "epoch": 3.736255707762557, "grad_norm": 4.668980547052524, "learning_rate": 6.493080209300228e-09, "logits/chosen": -3.323869228363037, "logits/rejected": -1.9487318992614746, "logps/chosen": -504.2711181640625, "logps/rejected": -308.618896484375, "loss": 0.0201, "rewards/accuracies": 1.0, "rewards/chosen": 1.7253409624099731, "rewards/margins": 5.419184684753418, "rewards/rejected": -3.693843364715576, "step": 5114 }, { "epoch": 3.736986301369863, "grad_norm": 4.655582047242686, "learning_rate": 6.457013424761598e-09, "logits/chosen": -2.9781553745269775, "logits/rejected": -2.7338573932647705, "logps/chosen": -531.755126953125, "logps/rejected": -622.0084838867188, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": 2.459681749343872, "rewards/margins": 6.256024360656738, "rewards/rejected": -3.7963428497314453, "step": 5115 }, { "epoch": 3.737716894977169, "grad_norm": 6.002491115959104, "learning_rate": 6.4210457780213e-09, "logits/chosen": -3.315680742263794, "logits/rejected": -2.7301523685455322, "logps/chosen": -790.8153076171875, "logps/rejected": -637.4943237304688, "loss": 0.0258, "rewards/accuracies": 1.0, "rewards/chosen": 4.534184455871582, "rewards/margins": 6.060452461242676, "rewards/rejected": -1.5262683629989624, "step": 5116 }, { "epoch": 3.738447488584475, "grad_norm": 4.406343066808588, "learning_rate": 6.385177283720455e-09, "logits/chosen": -2.648550510406494, "logits/rejected": -2.484412670135498, "logps/chosen": -717.3243408203125, "logps/rejected": -724.3602905273438, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": 3.9871668815612793, "rewards/margins": 6.8972368240356445, "rewards/rejected": -2.9100699424743652, "step": 5117 }, { "epoch": 3.739178082191781, "grad_norm": 8.899654476016497, "learning_rate": 6.349407956459857e-09, "logits/chosen": -2.6663599014282227, "logits/rejected": -1.9968523979187012, "logps/chosen": -583.7122802734375, "logps/rejected": -526.6596069335938, "loss": 0.0389, "rewards/accuracies": 1.0, "rewards/chosen": 2.789536476135254, "rewards/margins": 5.720430850982666, "rewards/rejected": -2.930894374847412, "step": 5118 }, { "epoch": 3.7399086757990867, "grad_norm": 5.113502445201339, "learning_rate": 6.313737810799996e-09, "logits/chosen": -2.99849271774292, "logits/rejected": -1.9239342212677002, "logps/chosen": -754.9656982421875, "logps/rejected": -500.41436767578125, "loss": 0.0318, "rewards/accuracies": 1.0, "rewards/chosen": 3.9486987590789795, "rewards/margins": 5.947648525238037, "rewards/rejected": -1.9989497661590576, "step": 5119 }, { "epoch": 3.7406392694063926, "grad_norm": 12.618745066399482, "learning_rate": 6.27816686126087e-09, "logits/chosen": -2.671642303466797, "logits/rejected": -1.7951678037643433, "logps/chosen": -538.0097045898438, "logps/rejected": -320.97296142578125, "loss": 0.0812, "rewards/accuracies": 1.0, "rewards/chosen": 2.9995062351226807, "rewards/margins": 7.398487091064453, "rewards/rejected": -4.398981094360352, "step": 5120 }, { "epoch": 3.7413698630136984, "grad_norm": 6.774834617482428, "learning_rate": 6.242695122322228e-09, "logits/chosen": -2.9752085208892822, "logits/rejected": -2.653841257095337, "logps/chosen": -607.8367919921875, "logps/rejected": -670.99609375, "loss": 0.0521, "rewards/accuracies": 1.0, "rewards/chosen": 5.756037712097168, "rewards/margins": 7.10850715637207, "rewards/rejected": -1.3524692058563232, "step": 5121 }, { "epoch": 3.7421004566210048, "grad_norm": 7.740148866784183, "learning_rate": 6.207322608423327e-09, "logits/chosen": -3.131885528564453, "logits/rejected": -2.6038310527801514, "logps/chosen": -612.6951293945312, "logps/rejected": -459.3956298828125, "loss": 0.0392, "rewards/accuracies": 1.0, "rewards/chosen": 3.563872814178467, "rewards/margins": 5.675292015075684, "rewards/rejected": -2.111419200897217, "step": 5122 }, { "epoch": 3.7428310502283106, "grad_norm": 4.570694762325697, "learning_rate": 6.172049333963064e-09, "logits/chosen": -2.773271083831787, "logits/rejected": -2.160305976867676, "logps/chosen": -833.0020141601562, "logps/rejected": -731.8004150390625, "loss": 0.0179, "rewards/accuracies": 1.0, "rewards/chosen": 5.484953880310059, "rewards/margins": 7.991246223449707, "rewards/rejected": -2.5062925815582275, "step": 5123 }, { "epoch": 3.7435616438356165, "grad_norm": 12.459731892109163, "learning_rate": 6.136875313299983e-09, "logits/chosen": -2.7231242656707764, "logits/rejected": -2.1727960109710693, "logps/chosen": -847.2528076171875, "logps/rejected": -544.2550048828125, "loss": 0.0512, "rewards/accuracies": 1.0, "rewards/chosen": 2.8014960289001465, "rewards/margins": 5.426286220550537, "rewards/rejected": -2.6247901916503906, "step": 5124 }, { "epoch": 3.7442922374429224, "grad_norm": 11.347527833825882, "learning_rate": 6.101800560752185e-09, "logits/chosen": -2.5517804622650146, "logits/rejected": -1.9044983386993408, "logps/chosen": -651.2333374023438, "logps/rejected": -491.1385192871094, "loss": 0.0526, "rewards/accuracies": 1.0, "rewards/chosen": 3.2801437377929688, "rewards/margins": 4.189387798309326, "rewards/rejected": -0.909244179725647, "step": 5125 }, { "epoch": 3.7450228310502283, "grad_norm": 4.383373204232445, "learning_rate": 6.066825090597389e-09, "logits/chosen": -3.192718029022217, "logits/rejected": -3.0046029090881348, "logps/chosen": -498.13916015625, "logps/rejected": -521.3046875, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": 2.174786329269409, "rewards/margins": 5.48266077041626, "rewards/rejected": -3.3078746795654297, "step": 5126 }, { "epoch": 3.745753424657534, "grad_norm": 5.0368284019154155, "learning_rate": 6.031948917072843e-09, "logits/chosen": -2.8864996433258057, "logits/rejected": -2.910834312438965, "logps/chosen": -669.197998046875, "logps/rejected": -861.2554931640625, "loss": 0.0179, "rewards/accuracies": 1.0, "rewards/chosen": 3.631131172180176, "rewards/margins": 6.673171043395996, "rewards/rejected": -3.042039394378662, "step": 5127 }, { "epoch": 3.74648401826484, "grad_norm": 7.56681461049558, "learning_rate": 5.997172054375416e-09, "logits/chosen": -2.558763027191162, "logits/rejected": -1.353428602218628, "logps/chosen": -742.8408203125, "logps/rejected": -484.75921630859375, "loss": 0.0372, "rewards/accuracies": 1.0, "rewards/chosen": 6.017031669616699, "rewards/margins": 9.68010425567627, "rewards/rejected": -3.663072109222412, "step": 5128 }, { "epoch": 3.7472146118721463, "grad_norm": 5.9577170455653174, "learning_rate": 5.9624945166615595e-09, "logits/chosen": -2.5625808238983154, "logits/rejected": -2.2631683349609375, "logps/chosen": -596.8356323242188, "logps/rejected": -526.49609375, "loss": 0.0233, "rewards/accuracies": 1.0, "rewards/chosen": 2.7193384170532227, "rewards/margins": 3.90476655960083, "rewards/rejected": -1.1854280233383179, "step": 5129 }, { "epoch": 3.747945205479452, "grad_norm": 5.918652948794302, "learning_rate": 5.927916318047288e-09, "logits/chosen": -3.3322343826293945, "logits/rejected": -2.0993359088897705, "logps/chosen": -732.4210205078125, "logps/rejected": -446.8671875, "loss": 0.0383, "rewards/accuracies": 1.0, "rewards/chosen": 3.5652382373809814, "rewards/margins": 6.601967811584473, "rewards/rejected": -3.036729574203491, "step": 5130 }, { "epoch": 3.748675799086758, "grad_norm": 4.977723278209007, "learning_rate": 5.8934374726082034e-09, "logits/chosen": -2.807435989379883, "logits/rejected": -2.4270687103271484, "logps/chosen": -371.2173156738281, "logps/rejected": -311.77178955078125, "loss": 0.0189, "rewards/accuracies": 1.0, "rewards/chosen": 3.0445055961608887, "rewards/margins": 5.7840375900268555, "rewards/rejected": -2.7395315170288086, "step": 5131 }, { "epoch": 3.749406392694064, "grad_norm": 3.7265055590104046, "learning_rate": 5.859057994379357e-09, "logits/chosen": -2.525338888168335, "logits/rejected": -1.874752163887024, "logps/chosen": -500.6813659667969, "logps/rejected": -435.2798767089844, "loss": 0.015, "rewards/accuracies": 1.0, "rewards/chosen": 3.735661029815674, "rewards/margins": 6.110260009765625, "rewards/rejected": -2.3745994567871094, "step": 5132 }, { "epoch": 3.75013698630137, "grad_norm": 4.205566604669173, "learning_rate": 5.824777897355471e-09, "logits/chosen": -3.0289571285247803, "logits/rejected": -2.2514519691467285, "logps/chosen": -656.8477783203125, "logps/rejected": -494.6942138671875, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": 2.2969143390655518, "rewards/margins": 5.00663423538208, "rewards/rejected": -2.7097198963165283, "step": 5133 }, { "epoch": 3.7508675799086757, "grad_norm": 9.261883911565109, "learning_rate": 5.790597195490771e-09, "logits/chosen": -3.280158042907715, "logits/rejected": -2.5333707332611084, "logps/chosen": -359.4461669921875, "logps/rejected": -347.8981628417969, "loss": 0.0412, "rewards/accuracies": 1.0, "rewards/chosen": 2.1804251670837402, "rewards/margins": 4.7478742599487305, "rewards/rejected": -2.5674490928649902, "step": 5134 }, { "epoch": 3.7515981735159816, "grad_norm": 3.6395831930025477, "learning_rate": 5.756515902699016e-09, "logits/chosen": -2.5464487075805664, "logits/rejected": -2.1986732482910156, "logps/chosen": -762.0135498046875, "logps/rejected": -769.3565673828125, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": 3.1904842853546143, "rewards/margins": 6.651331901550293, "rewards/rejected": -3.4608473777770996, "step": 5135 }, { "epoch": 3.752328767123288, "grad_norm": 3.521312538054755, "learning_rate": 5.7225340328534985e-09, "logits/chosen": -2.5473105907440186, "logits/rejected": -2.6887638568878174, "logps/chosen": -502.9797058105469, "logps/rejected": -559.5946044921875, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": 2.5192248821258545, "rewards/margins": 6.55029296875, "rewards/rejected": -4.031068325042725, "step": 5136 }, { "epoch": 3.7530593607305938, "grad_norm": 8.241240503949955, "learning_rate": 5.6886515997870145e-09, "logits/chosen": -2.8504397869110107, "logits/rejected": -1.9878592491149902, "logps/chosen": -367.5754699707031, "logps/rejected": -294.44989013671875, "loss": 0.0456, "rewards/accuracies": 1.0, "rewards/chosen": 3.5277419090270996, "rewards/margins": 7.729416370391846, "rewards/rejected": -4.201674461364746, "step": 5137 }, { "epoch": 3.7537899543378996, "grad_norm": 7.200052727400248, "learning_rate": 5.6548686172920026e-09, "logits/chosen": -2.998897075653076, "logits/rejected": -2.127689838409424, "logps/chosen": -604.9366455078125, "logps/rejected": -445.8284606933594, "loss": 0.0415, "rewards/accuracies": 1.0, "rewards/chosen": 2.675339698791504, "rewards/margins": 4.077101230621338, "rewards/rejected": -1.4017616510391235, "step": 5138 }, { "epoch": 3.7545205479452055, "grad_norm": 5.523147596982336, "learning_rate": 5.62118509912024e-09, "logits/chosen": -3.5781638622283936, "logits/rejected": -2.295947790145874, "logps/chosen": -690.9613037109375, "logps/rejected": -486.45611572265625, "loss": 0.0294, "rewards/accuracies": 1.0, "rewards/chosen": 4.579733848571777, "rewards/margins": 7.543274879455566, "rewards/rejected": -2.963541030883789, "step": 5139 }, { "epoch": 3.7552511415525114, "grad_norm": 7.159348529182295, "learning_rate": 5.587601058983149e-09, "logits/chosen": -2.744692087173462, "logits/rejected": -1.8719069957733154, "logps/chosen": -863.9039306640625, "logps/rejected": -666.1290283203125, "loss": 0.0497, "rewards/accuracies": 1.0, "rewards/chosen": 4.282778739929199, "rewards/margins": 5.974922180175781, "rewards/rejected": -1.6921430826187134, "step": 5140 }, { "epoch": 3.7559817351598173, "grad_norm": 3.708954419762855, "learning_rate": 5.554116510551598e-09, "logits/chosen": -2.649930000305176, "logits/rejected": -2.3163669109344482, "logps/chosen": -692.0838012695312, "logps/rejected": -637.8392333984375, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": 2.4195070266723633, "rewards/margins": 5.538094997406006, "rewards/rejected": -3.1185879707336426, "step": 5141 }, { "epoch": 3.756712328767123, "grad_norm": 4.556183505521516, "learning_rate": 5.520731467455963e-09, "logits/chosen": -3.0701990127563477, "logits/rejected": -2.2373790740966797, "logps/chosen": -575.3937377929688, "logps/rejected": -520.67822265625, "loss": 0.0222, "rewards/accuracies": 1.0, "rewards/chosen": 2.9019362926483154, "rewards/margins": 5.333853721618652, "rewards/rejected": -2.431917190551758, "step": 5142 }, { "epoch": 3.7574429223744295, "grad_norm": 4.484351606673521, "learning_rate": 5.487445943286123e-09, "logits/chosen": -2.571146011352539, "logits/rejected": -2.4087302684783936, "logps/chosen": -806.5402221679688, "logps/rejected": -614.2059936523438, "loss": 0.0186, "rewards/accuracies": 1.0, "rewards/chosen": 3.926326036453247, "rewards/margins": 5.2581892013549805, "rewards/rejected": -1.331863522529602, "step": 5143 }, { "epoch": 3.7581735159817353, "grad_norm": 5.062749785866087, "learning_rate": 5.454259951591489e-09, "logits/chosen": -2.8596010208129883, "logits/rejected": -2.8832650184631348, "logps/chosen": -595.228515625, "logps/rejected": -636.0923461914062, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": 2.427536964416504, "rewards/margins": 3.873537540435791, "rewards/rejected": -1.4460006952285767, "step": 5144 }, { "epoch": 3.758904109589041, "grad_norm": 4.041062352466668, "learning_rate": 5.421173505880866e-09, "logits/chosen": -2.8604636192321777, "logits/rejected": -2.619408130645752, "logps/chosen": -250.33193969726562, "logps/rejected": -325.8836669921875, "loss": 0.0182, "rewards/accuracies": 1.0, "rewards/chosen": 1.4033403396606445, "rewards/margins": 5.762284278869629, "rewards/rejected": -4.358943939208984, "step": 5145 }, { "epoch": 3.759634703196347, "grad_norm": 7.424946424502892, "learning_rate": 5.388186619622592e-09, "logits/chosen": -2.7437186241149902, "logits/rejected": -2.0068178176879883, "logps/chosen": -619.0962524414062, "logps/rejected": -631.9202880859375, "loss": 0.0279, "rewards/accuracies": 1.0, "rewards/chosen": 5.94633674621582, "rewards/margins": 8.121573448181152, "rewards/rejected": -2.175236940383911, "step": 5146 }, { "epoch": 3.760365296803653, "grad_norm": 5.751340626875412, "learning_rate": 5.355299306244482e-09, "logits/chosen": -2.618961811065674, "logits/rejected": -2.1863021850585938, "logps/chosen": -901.0029296875, "logps/rejected": -598.877685546875, "loss": 0.0233, "rewards/accuracies": 1.0, "rewards/chosen": 3.867187976837158, "rewards/margins": 4.9665021896362305, "rewards/rejected": -1.0993142127990723, "step": 5147 }, { "epoch": 3.761095890410959, "grad_norm": 5.799941740072259, "learning_rate": 5.322511579133826e-09, "logits/chosen": -2.7151951789855957, "logits/rejected": -1.9247602224349976, "logps/chosen": -575.1453247070312, "logps/rejected": -417.6304016113281, "loss": 0.025, "rewards/accuracies": 1.0, "rewards/chosen": 3.188011407852173, "rewards/margins": 7.0483622550964355, "rewards/rejected": -3.860351085662842, "step": 5148 }, { "epoch": 3.7618264840182647, "grad_norm": 4.921203373694657, "learning_rate": 5.289823451637282e-09, "logits/chosen": -2.7517237663269043, "logits/rejected": -1.8118008375167847, "logps/chosen": -500.8332824707031, "logps/rejected": -319.5081481933594, "loss": 0.0206, "rewards/accuracies": 1.0, "rewards/chosen": 4.269218444824219, "rewards/margins": 8.466647148132324, "rewards/rejected": -4.1974287033081055, "step": 5149 }, { "epoch": 3.762557077625571, "grad_norm": 6.523981603899384, "learning_rate": 5.2572349370611225e-09, "logits/chosen": -3.1680731773376465, "logits/rejected": -2.2464523315429688, "logps/chosen": -862.7791748046875, "logps/rejected": -557.00341796875, "loss": 0.0348, "rewards/accuracies": 1.0, "rewards/chosen": 4.624202728271484, "rewards/margins": 5.960102081298828, "rewards/rejected": -1.3358995914459229, "step": 5150 }, { "epoch": 3.7632876712328764, "grad_norm": 7.5636706746596944, "learning_rate": 5.224746048670931e-09, "logits/chosen": -2.947460412979126, "logits/rejected": -2.317664623260498, "logps/chosen": -697.4237670898438, "logps/rejected": -656.5736694335938, "loss": 0.0326, "rewards/accuracies": 1.0, "rewards/chosen": 4.552335262298584, "rewards/margins": 5.962742328643799, "rewards/rejected": -1.4104069471359253, "step": 5151 }, { "epoch": 3.7640182648401828, "grad_norm": 5.115192426266998, "learning_rate": 5.1923567996918494e-09, "logits/chosen": -2.5361502170562744, "logits/rejected": -1.8066716194152832, "logps/chosen": -856.3123779296875, "logps/rejected": -506.05255126953125, "loss": 0.0237, "rewards/accuracies": 1.0, "rewards/chosen": 2.795863151550293, "rewards/margins": 4.774993896484375, "rewards/rejected": -1.979130506515503, "step": 5152 }, { "epoch": 3.7647488584474886, "grad_norm": 6.045945364687246, "learning_rate": 5.1600672033083604e-09, "logits/chosen": -2.989612579345703, "logits/rejected": -2.342419147491455, "logps/chosen": -353.04071044921875, "logps/rejected": -362.96185302734375, "loss": 0.0243, "rewards/accuracies": 1.0, "rewards/chosen": 3.0475728511810303, "rewards/margins": 5.752796173095703, "rewards/rejected": -2.705223560333252, "step": 5153 }, { "epoch": 3.7654794520547945, "grad_norm": 6.624377815252462, "learning_rate": 5.127877272664393e-09, "logits/chosen": -2.675968647003174, "logits/rejected": -2.6076502799987793, "logps/chosen": -497.00177001953125, "logps/rejected": -388.7302551269531, "loss": 0.039, "rewards/accuracies": 1.0, "rewards/chosen": 3.7609453201293945, "rewards/margins": 5.59678840637207, "rewards/rejected": -1.8358432054519653, "step": 5154 }, { "epoch": 3.7662100456621004, "grad_norm": 4.342361996862076, "learning_rate": 5.095787020863412e-09, "logits/chosen": -2.9099810123443604, "logits/rejected": -2.1256937980651855, "logps/chosen": -760.2141723632812, "logps/rejected": -619.0924072265625, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": 4.3188982009887695, "rewards/margins": 6.774139881134033, "rewards/rejected": -2.455242156982422, "step": 5155 }, { "epoch": 3.7669406392694063, "grad_norm": 7.032626547110478, "learning_rate": 5.0637964609681894e-09, "logits/chosen": -2.9659154415130615, "logits/rejected": -1.9605319499969482, "logps/chosen": -784.02783203125, "logps/rejected": -458.7486572265625, "loss": 0.0227, "rewards/accuracies": 1.0, "rewards/chosen": 4.181698322296143, "rewards/margins": 5.94858980178833, "rewards/rejected": -1.766891598701477, "step": 5156 }, { "epoch": 3.7676712328767126, "grad_norm": 3.9818808733960127, "learning_rate": 5.031905606000974e-09, "logits/chosen": -3.078535795211792, "logits/rejected": -2.3290271759033203, "logps/chosen": -846.5670776367188, "logps/rejected": -606.43896484375, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": 3.8852410316467285, "rewards/margins": 4.762623310089111, "rewards/rejected": -0.8773825168609619, "step": 5157 }, { "epoch": 3.768401826484018, "grad_norm": 5.0486161857111895, "learning_rate": 5.00011446894344e-09, "logits/chosen": -3.0140609741210938, "logits/rejected": -1.6751325130462646, "logps/chosen": -851.15087890625, "logps/rejected": -590.8020629882812, "loss": 0.0248, "rewards/accuracies": 1.0, "rewards/chosen": 4.473629474639893, "rewards/margins": 8.27279281616211, "rewards/rejected": -3.7991623878479004, "step": 5158 }, { "epoch": 3.7691324200913243, "grad_norm": 7.084334657989183, "learning_rate": 4.968423062736565e-09, "logits/chosen": -3.0504324436187744, "logits/rejected": -1.8815587759017944, "logps/chosen": -740.1048583984375, "logps/rejected": -475.55560302734375, "loss": 0.032, "rewards/accuracies": 1.0, "rewards/chosen": 4.523033142089844, "rewards/margins": 5.2007646560668945, "rewards/rejected": -0.6777316927909851, "step": 5159 }, { "epoch": 3.76986301369863, "grad_norm": 4.74981809568942, "learning_rate": 4.9368314002808665e-09, "logits/chosen": -2.3597047328948975, "logits/rejected": -2.289824962615967, "logps/chosen": -718.7372436523438, "logps/rejected": -599.8426513671875, "loss": 0.0221, "rewards/accuracies": 1.0, "rewards/chosen": 2.667668104171753, "rewards/margins": 4.846795082092285, "rewards/rejected": -2.1791272163391113, "step": 5160 }, { "epoch": 3.770593607305936, "grad_norm": 5.366831405354196, "learning_rate": 4.905339494436194e-09, "logits/chosen": -3.3298025131225586, "logits/rejected": -1.8174329996109009, "logps/chosen": -669.5576171875, "logps/rejected": -278.2811584472656, "loss": 0.0234, "rewards/accuracies": 1.0, "rewards/chosen": 5.546020984649658, "rewards/margins": 9.274900436401367, "rewards/rejected": -3.7288804054260254, "step": 5161 }, { "epoch": 3.771324200913242, "grad_norm": 5.810220579073589, "learning_rate": 4.873947358021796e-09, "logits/chosen": -2.8636083602905273, "logits/rejected": -2.5424540042877197, "logps/chosen": -515.410888671875, "logps/rejected": -508.1827697753906, "loss": 0.0225, "rewards/accuracies": 1.0, "rewards/chosen": 3.6263108253479004, "rewards/margins": 5.342717170715332, "rewards/rejected": -1.7164063453674316, "step": 5162 }, { "epoch": 3.772054794520548, "grad_norm": 8.689711474511332, "learning_rate": 4.842655003816281e-09, "logits/chosen": -3.288473129272461, "logits/rejected": -2.5719199180603027, "logps/chosen": -869.6942749023438, "logps/rejected": -781.533447265625, "loss": 0.0284, "rewards/accuracies": 1.0, "rewards/chosen": 4.450631141662598, "rewards/margins": 6.939783573150635, "rewards/rejected": -2.489152669906616, "step": 5163 }, { "epoch": 3.772785388127854, "grad_norm": 3.920848940184149, "learning_rate": 4.811462444557713e-09, "logits/chosen": -2.767894744873047, "logits/rejected": -2.231898784637451, "logps/chosen": -584.400390625, "logps/rejected": -488.0439758300781, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": 3.5513181686401367, "rewards/margins": 7.820854663848877, "rewards/rejected": -4.269536018371582, "step": 5164 }, { "epoch": 3.7735159817351596, "grad_norm": 5.839683382517981, "learning_rate": 4.780369692943459e-09, "logits/chosen": -3.068566083908081, "logits/rejected": -1.9840298891067505, "logps/chosen": -584.673828125, "logps/rejected": -481.0462646484375, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": 4.633589744567871, "rewards/margins": 8.143031120300293, "rewards/rejected": -3.5094408988952637, "step": 5165 }, { "epoch": 3.774246575342466, "grad_norm": 3.6082358936664964, "learning_rate": 4.749376761630286e-09, "logits/chosen": -2.485765218734741, "logits/rejected": -1.7189143896102905, "logps/chosen": -729.8111572265625, "logps/rejected": -506.7640380859375, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": 2.202085256576538, "rewards/margins": 5.048608303070068, "rewards/rejected": -2.8465232849121094, "step": 5166 }, { "epoch": 3.7749771689497718, "grad_norm": 5.386508951396227, "learning_rate": 4.718483663234351e-09, "logits/chosen": -2.5623860359191895, "logits/rejected": -2.258786678314209, "logps/chosen": -844.8350219726562, "logps/rejected": -842.1151733398438, "loss": 0.0209, "rewards/accuracies": 1.0, "rewards/chosen": 4.048466205596924, "rewards/margins": 4.643453598022461, "rewards/rejected": -0.594987154006958, "step": 5167 }, { "epoch": 3.7757077625570776, "grad_norm": 8.62962814219469, "learning_rate": 4.687690410331096e-09, "logits/chosen": -2.879694700241089, "logits/rejected": -1.9916489124298096, "logps/chosen": -568.322265625, "logps/rejected": -399.9197692871094, "loss": 0.0498, "rewards/accuracies": 1.0, "rewards/chosen": 4.201554775238037, "rewards/margins": 8.115274429321289, "rewards/rejected": -3.9137191772460938, "step": 5168 }, { "epoch": 3.7764383561643835, "grad_norm": 7.089185683130492, "learning_rate": 4.656997015455439e-09, "logits/chosen": -2.806800127029419, "logits/rejected": -2.1650443077087402, "logps/chosen": -510.18011474609375, "logps/rejected": -410.3210754394531, "loss": 0.0175, "rewards/accuracies": 1.0, "rewards/chosen": 2.7208547592163086, "rewards/margins": 5.214028835296631, "rewards/rejected": -2.4931747913360596, "step": 5169 }, { "epoch": 3.7771689497716894, "grad_norm": 6.6652161291727055, "learning_rate": 4.626403491101577e-09, "logits/chosen": -2.950509786605835, "logits/rejected": -2.2281723022460938, "logps/chosen": -283.4637451171875, "logps/rejected": -362.3650817871094, "loss": 0.0315, "rewards/accuracies": 1.0, "rewards/chosen": 2.811486005783081, "rewards/margins": 8.181905746459961, "rewards/rejected": -5.370420455932617, "step": 5170 }, { "epoch": 3.7778995433789957, "grad_norm": 5.6434415511187455, "learning_rate": 4.595909849722995e-09, "logits/chosen": -3.4021315574645996, "logits/rejected": -2.426774740219116, "logps/chosen": -1035.1746826171875, "logps/rejected": -670.8990478515625, "loss": 0.026, "rewards/accuracies": 1.0, "rewards/chosen": 5.464775085449219, "rewards/margins": 5.676264762878418, "rewards/rejected": -0.21148988604545593, "step": 5171 }, { "epoch": 3.778630136986301, "grad_norm": 7.0799994607698915, "learning_rate": 4.565516103732625e-09, "logits/chosen": -2.47373628616333, "logits/rejected": -2.1393368244171143, "logps/chosen": -428.04302978515625, "logps/rejected": -449.145263671875, "loss": 0.0304, "rewards/accuracies": 1.0, "rewards/chosen": 1.9976624250411987, "rewards/margins": 5.074213981628418, "rewards/rejected": -3.0765514373779297, "step": 5172 }, { "epoch": 3.7793607305936074, "grad_norm": 11.886793845268869, "learning_rate": 4.535222265502708e-09, "logits/chosen": -2.7876386642456055, "logits/rejected": -2.221972703933716, "logps/chosen": -655.7671508789062, "logps/rejected": -546.284912109375, "loss": 0.0587, "rewards/accuracies": 1.0, "rewards/chosen": 3.5940520763397217, "rewards/margins": 7.025294303894043, "rewards/rejected": -3.4312424659729004, "step": 5173 }, { "epoch": 3.7800913242009133, "grad_norm": 4.387192005095651, "learning_rate": 4.505028347364797e-09, "logits/chosen": -2.8303253650665283, "logits/rejected": -1.919360637664795, "logps/chosen": -481.4100646972656, "logps/rejected": -331.3756103515625, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": 3.5508627891540527, "rewards/margins": 7.673192501068115, "rewards/rejected": -4.1223297119140625, "step": 5174 }, { "epoch": 3.780821917808219, "grad_norm": 7.2208301800256685, "learning_rate": 4.4749343616097555e-09, "logits/chosen": -2.802170515060425, "logits/rejected": -1.9280034303665161, "logps/chosen": -619.4110717773438, "logps/rejected": -539.3590698242188, "loss": 0.0304, "rewards/accuracies": 1.0, "rewards/chosen": 3.4437170028686523, "rewards/margins": 7.067459583282471, "rewards/rejected": -3.6237423419952393, "step": 5175 }, { "epoch": 3.781552511415525, "grad_norm": 4.928861570012164, "learning_rate": 4.444940320487783e-09, "logits/chosen": -2.5017611980438232, "logits/rejected": -2.4220104217529297, "logps/chosen": -802.7794189453125, "logps/rejected": -1066.1810302734375, "loss": 0.0286, "rewards/accuracies": 1.0, "rewards/chosen": 3.052877187728882, "rewards/margins": 5.6449875831604, "rewards/rejected": -2.5921101570129395, "step": 5176 }, { "epoch": 3.782283105022831, "grad_norm": 19.247013801709684, "learning_rate": 4.4150462362084474e-09, "logits/chosen": -2.889707088470459, "logits/rejected": -2.1501624584198, "logps/chosen": -888.6146850585938, "logps/rejected": -620.2723388671875, "loss": 0.0557, "rewards/accuracies": 1.0, "rewards/chosen": 5.728757858276367, "rewards/margins": 8.375174522399902, "rewards/rejected": -2.6464171409606934, "step": 5177 }, { "epoch": 3.7830136986301373, "grad_norm": 3.5224437454095976, "learning_rate": 4.3852521209405414e-09, "logits/chosen": -2.5882105827331543, "logits/rejected": -2.4707982540130615, "logps/chosen": -615.0635986328125, "logps/rejected": -812.9676513671875, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": 2.2207541465759277, "rewards/margins": 5.667597770690918, "rewards/rejected": -3.4468438625335693, "step": 5178 }, { "epoch": 3.7837442922374427, "grad_norm": 4.9247397184577855, "learning_rate": 4.355557986812225e-09, "logits/chosen": -2.6529464721679688, "logits/rejected": -1.9033067226409912, "logps/chosen": -576.0217895507812, "logps/rejected": -398.6743469238281, "loss": 0.015, "rewards/accuracies": 1.0, "rewards/chosen": 4.438760757446289, "rewards/margins": 10.043415069580078, "rewards/rejected": -5.604653835296631, "step": 5179 }, { "epoch": 3.784474885844749, "grad_norm": 5.133244094238094, "learning_rate": 4.325963845910913e-09, "logits/chosen": -2.500535726547241, "logits/rejected": -1.9603557586669922, "logps/chosen": -598.67431640625, "logps/rejected": -561.9541625976562, "loss": 0.0194, "rewards/accuracies": 1.0, "rewards/chosen": 3.5279223918914795, "rewards/margins": 6.1386260986328125, "rewards/rejected": -2.610703706741333, "step": 5180 }, { "epoch": 3.785205479452055, "grad_norm": 5.4816555660448945, "learning_rate": 4.296469710283329e-09, "logits/chosen": -2.653956413269043, "logits/rejected": -1.9110565185546875, "logps/chosen": -904.4849853515625, "logps/rejected": -726.289794921875, "loss": 0.0285, "rewards/accuracies": 1.0, "rewards/chosen": 4.104782581329346, "rewards/margins": 5.5885210037231445, "rewards/rejected": -1.4837381839752197, "step": 5181 }, { "epoch": 3.7859360730593608, "grad_norm": 7.453133956935787, "learning_rate": 4.267075591935565e-09, "logits/chosen": -2.906431198120117, "logits/rejected": -1.7982869148254395, "logps/chosen": -398.8836975097656, "logps/rejected": -217.98748779296875, "loss": 0.0396, "rewards/accuracies": 1.0, "rewards/chosen": 2.6033592224121094, "rewards/margins": 5.909095764160156, "rewards/rejected": -3.305736541748047, "step": 5182 }, { "epoch": 3.7866666666666666, "grad_norm": 7.441412657887661, "learning_rate": 4.237781502832882e-09, "logits/chosen": -2.9439609050750732, "logits/rejected": -2.6841697692871094, "logps/chosen": -740.1029052734375, "logps/rejected": -608.6629638671875, "loss": 0.0368, "rewards/accuracies": 1.0, "rewards/chosen": 3.572831153869629, "rewards/margins": 6.497228622436523, "rewards/rejected": -2.9243969917297363, "step": 5183 }, { "epoch": 3.7873972602739725, "grad_norm": 13.285356097805877, "learning_rate": 4.208587454899881e-09, "logits/chosen": -2.199430465698242, "logits/rejected": -2.493457317352295, "logps/chosen": -369.1839599609375, "logps/rejected": -544.260009765625, "loss": 0.0719, "rewards/accuracies": 0.875, "rewards/chosen": 2.193368434906006, "rewards/margins": 4.2047834396362305, "rewards/rejected": -2.0114152431488037, "step": 5184 }, { "epoch": 3.7881278538812784, "grad_norm": 5.359312007077866, "learning_rate": 4.179493460020417e-09, "logits/chosen": -3.2937917709350586, "logits/rejected": -2.106405735015869, "logps/chosen": -1074.837890625, "logps/rejected": -662.7816162109375, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": 7.128200531005859, "rewards/margins": 7.259602069854736, "rewards/rejected": -0.1314016580581665, "step": 5185 }, { "epoch": 3.7888584474885842, "grad_norm": 7.286031700079977, "learning_rate": 4.150499530037682e-09, "logits/chosen": -2.4804906845092773, "logits/rejected": -2.325435161590576, "logps/chosen": -257.56427001953125, "logps/rejected": -369.02117919921875, "loss": 0.0349, "rewards/accuracies": 1.0, "rewards/chosen": 1.729555606842041, "rewards/margins": 5.408360958099365, "rewards/rejected": -3.6788055896759033, "step": 5186 }, { "epoch": 3.7895890410958906, "grad_norm": 8.180454163569747, "learning_rate": 4.1216056767540415e-09, "logits/chosen": -2.9349701404571533, "logits/rejected": -2.620460033416748, "logps/chosen": -509.3611145019531, "logps/rejected": -483.3476867675781, "loss": 0.0302, "rewards/accuracies": 1.0, "rewards/chosen": 2.234036922454834, "rewards/margins": 4.21108865737915, "rewards/rejected": -1.977051854133606, "step": 5187 }, { "epoch": 3.7903196347031964, "grad_norm": 4.458061827951786, "learning_rate": 4.092811911931199e-09, "logits/chosen": -2.8115029335021973, "logits/rejected": -2.112464427947998, "logps/chosen": -605.660400390625, "logps/rejected": -470.6495361328125, "loss": 0.0367, "rewards/accuracies": 1.0, "rewards/chosen": 3.244685411453247, "rewards/margins": 5.618669509887695, "rewards/rejected": -2.37398362159729, "step": 5188 }, { "epoch": 3.7910502283105023, "grad_norm": 4.352621336620038, "learning_rate": 4.064118247290082e-09, "logits/chosen": -2.8518903255462646, "logits/rejected": -1.8372654914855957, "logps/chosen": -575.7039794921875, "logps/rejected": -444.5540771484375, "loss": 0.0196, "rewards/accuracies": 1.0, "rewards/chosen": 3.6134514808654785, "rewards/margins": 6.504453659057617, "rewards/rejected": -2.8910024166107178, "step": 5189 }, { "epoch": 3.791780821917808, "grad_norm": 5.865851238686038, "learning_rate": 4.03552469451085e-09, "logits/chosen": -3.124622344970703, "logits/rejected": -2.0254673957824707, "logps/chosen": -763.186767578125, "logps/rejected": -506.2679443359375, "loss": 0.0238, "rewards/accuracies": 1.0, "rewards/chosen": 4.519408702850342, "rewards/margins": 7.526961803436279, "rewards/rejected": -3.0075533390045166, "step": 5190 }, { "epoch": 3.792511415525114, "grad_norm": 7.935009182574338, "learning_rate": 4.0070312652329685e-09, "logits/chosen": -2.7768754959106445, "logits/rejected": -2.068654775619507, "logps/chosen": -638.2225341796875, "logps/rejected": -521.9623413085938, "loss": 0.0542, "rewards/accuracies": 0.875, "rewards/chosen": 2.636143445968628, "rewards/margins": 4.820520401000977, "rewards/rejected": -2.1843771934509277, "step": 5191 }, { "epoch": 3.79324200913242, "grad_norm": 12.139916123127149, "learning_rate": 3.978637971055104e-09, "logits/chosen": -2.769181728363037, "logits/rejected": -2.0675573348999023, "logps/chosen": -654.4132080078125, "logps/rejected": -547.8516845703125, "loss": 0.0346, "rewards/accuracies": 1.0, "rewards/chosen": 3.8860795497894287, "rewards/margins": 7.188772201538086, "rewards/rejected": -3.302692413330078, "step": 5192 }, { "epoch": 3.793972602739726, "grad_norm": 4.508381138311317, "learning_rate": 3.950344823535124e-09, "logits/chosen": -2.9553911685943604, "logits/rejected": -2.218346357345581, "logps/chosen": -812.6791381835938, "logps/rejected": -639.96142578125, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": 4.655959606170654, "rewards/margins": 6.683279991149902, "rewards/rejected": -2.027320146560669, "step": 5193 }, { "epoch": 3.794703196347032, "grad_norm": 7.3717370007815255, "learning_rate": 3.922151834190229e-09, "logits/chosen": -2.748141288757324, "logits/rejected": -2.243384599685669, "logps/chosen": -797.6971435546875, "logps/rejected": -554.1070556640625, "loss": 0.0619, "rewards/accuracies": 1.0, "rewards/chosen": 3.9941461086273193, "rewards/margins": 5.760445594787598, "rewards/rejected": -1.7662992477416992, "step": 5194 }, { "epoch": 3.795433789954338, "grad_norm": 4.613013535111518, "learning_rate": 3.89405901449677e-09, "logits/chosen": -2.8808138370513916, "logits/rejected": -2.3791136741638184, "logps/chosen": -725.4290771484375, "logps/rejected": -717.81591796875, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": 3.1229400634765625, "rewards/margins": 4.602346420288086, "rewards/rejected": -1.4794063568115234, "step": 5195 }, { "epoch": 3.796164383561644, "grad_norm": 9.828678861455444, "learning_rate": 3.8660663758904034e-09, "logits/chosen": -2.6691274642944336, "logits/rejected": -2.191728115081787, "logps/chosen": -522.9266357421875, "logps/rejected": -464.0645446777344, "loss": 0.052, "rewards/accuracies": 1.0, "rewards/chosen": 2.298126697540283, "rewards/margins": 6.860717296600342, "rewards/rejected": -4.562591075897217, "step": 5196 }, { "epoch": 3.7968949771689497, "grad_norm": 5.339337406555241, "learning_rate": 3.83817392976582e-09, "logits/chosen": -2.2292702198028564, "logits/rejected": -2.5228919982910156, "logps/chosen": -549.2205200195312, "logps/rejected": -678.1697998046875, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": 2.7400753498077393, "rewards/margins": 5.101473331451416, "rewards/rejected": -2.3613977432250977, "step": 5197 }, { "epoch": 3.7976255707762556, "grad_norm": 7.744374109815667, "learning_rate": 3.810381687477188e-09, "logits/chosen": -3.064822196960449, "logits/rejected": -2.6601648330688477, "logps/chosen": -720.8853759765625, "logps/rejected": -849.5413208007812, "loss": 0.0479, "rewards/accuracies": 1.0, "rewards/chosen": 4.611515998840332, "rewards/margins": 7.573864936828613, "rewards/rejected": -2.9623489379882812, "step": 5198 }, { "epoch": 3.7983561643835615, "grad_norm": 7.528535345729892, "learning_rate": 3.782689660337679e-09, "logits/chosen": -3.184319019317627, "logits/rejected": -2.3669159412384033, "logps/chosen": -693.05908203125, "logps/rejected": -512.50634765625, "loss": 0.0246, "rewards/accuracies": 1.0, "rewards/chosen": 4.142409324645996, "rewards/margins": 6.375925064086914, "rewards/rejected": -2.233515739440918, "step": 5199 }, { "epoch": 3.7990867579908674, "grad_norm": 5.575762422029301, "learning_rate": 3.755097859619749e-09, "logits/chosen": -3.1497244834899902, "logits/rejected": -2.573895215988159, "logps/chosen": -1130.961669921875, "logps/rejected": -861.355712890625, "loss": 0.0203, "rewards/accuracies": 1.0, "rewards/chosen": 5.404280662536621, "rewards/margins": 5.354145050048828, "rewards/rejected": 0.05013507604598999, "step": 5200 }, { "epoch": 3.7998173515981737, "grad_norm": 9.571208105415323, "learning_rate": 3.727606296555136e-09, "logits/chosen": -2.847665309906006, "logits/rejected": -2.5768918991088867, "logps/chosen": -531.5465698242188, "logps/rejected": -511.943359375, "loss": 0.0623, "rewards/accuracies": 0.875, "rewards/chosen": 1.5130605697631836, "rewards/margins": 3.348870038986206, "rewards/rejected": -1.835809588432312, "step": 5201 }, { "epoch": 3.8005479452054796, "grad_norm": 8.994433058831707, "learning_rate": 3.700214982334554e-09, "logits/chosen": -2.22697377204895, "logits/rejected": -2.279198408126831, "logps/chosen": -443.3799743652344, "logps/rejected": -617.073486328125, "loss": 0.0686, "rewards/accuracies": 0.875, "rewards/chosen": 1.8771159648895264, "rewards/margins": 2.9470021724700928, "rewards/rejected": -1.069886326789856, "step": 5202 }, { "epoch": 3.8012785388127854, "grad_norm": 7.863425331671774, "learning_rate": 3.672923928108168e-09, "logits/chosen": -2.904902696609497, "logits/rejected": -2.600597381591797, "logps/chosen": -556.0178833007812, "logps/rejected": -530.5030517578125, "loss": 0.0248, "rewards/accuracies": 1.0, "rewards/chosen": 4.6124773025512695, "rewards/margins": 7.00604772567749, "rewards/rejected": -2.3935704231262207, "step": 5203 }, { "epoch": 3.8020091324200913, "grad_norm": 6.254128275259346, "learning_rate": 3.6457331449851193e-09, "logits/chosen": -2.7160372734069824, "logits/rejected": -3.0150539875030518, "logps/chosen": -697.81396484375, "logps/rejected": -775.7838134765625, "loss": 0.0268, "rewards/accuracies": 1.0, "rewards/chosen": 3.4218578338623047, "rewards/margins": 5.642228603363037, "rewards/rejected": -2.2203707695007324, "step": 5204 }, { "epoch": 3.802739726027397, "grad_norm": 4.740476396981311, "learning_rate": 3.618642644033859e-09, "logits/chosen": -2.396299362182617, "logits/rejected": -2.3826985359191895, "logps/chosen": -700.6870727539062, "logps/rejected": -816.7591552734375, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": 3.3179564476013184, "rewards/margins": 6.056096076965332, "rewards/rejected": -2.7381396293640137, "step": 5205 }, { "epoch": 3.803470319634703, "grad_norm": 4.058957242429886, "learning_rate": 3.5916524362819834e-09, "logits/chosen": -2.492201805114746, "logits/rejected": -2.1105315685272217, "logps/chosen": -587.9850463867188, "logps/rejected": -483.92718505859375, "loss": 0.0287, "rewards/accuracies": 1.0, "rewards/chosen": 2.687990665435791, "rewards/margins": 6.052621841430664, "rewards/rejected": -3.364631175994873, "step": 5206 }, { "epoch": 3.804200913242009, "grad_norm": 5.393686503260918, "learning_rate": 3.564762532716231e-09, "logits/chosen": -2.601369857788086, "logits/rejected": -2.1903185844421387, "logps/chosen": -451.07061767578125, "logps/rejected": -424.3819274902344, "loss": 0.0216, "rewards/accuracies": 1.0, "rewards/chosen": 2.7737350463867188, "rewards/margins": 5.553728103637695, "rewards/rejected": -2.7799928188323975, "step": 5207 }, { "epoch": 3.8049315068493152, "grad_norm": 4.707911897325968, "learning_rate": 3.5379729442825967e-09, "logits/chosen": -2.4145054817199707, "logits/rejected": -1.9781016111373901, "logps/chosen": -636.4164428710938, "logps/rejected": -521.3028564453125, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": 2.889857292175293, "rewards/margins": 6.197828769683838, "rewards/rejected": -3.307971477508545, "step": 5208 }, { "epoch": 3.805662100456621, "grad_norm": 4.280307638217515, "learning_rate": 3.5112836818861345e-09, "logits/chosen": -2.6592462062835693, "logits/rejected": -2.392157554626465, "logps/chosen": -732.759765625, "logps/rejected": -779.0654907226562, "loss": 0.0174, "rewards/accuracies": 1.0, "rewards/chosen": 3.6583266258239746, "rewards/margins": 5.814702987670898, "rewards/rejected": -2.156376838684082, "step": 5209 }, { "epoch": 3.806392694063927, "grad_norm": 5.921689153890361, "learning_rate": 3.484694756391071e-09, "logits/chosen": -2.3727352619171143, "logits/rejected": -1.6165635585784912, "logps/chosen": -662.6681518554688, "logps/rejected": -590.5341796875, "loss": 0.0203, "rewards/accuracies": 1.0, "rewards/chosen": 4.036504745483398, "rewards/margins": 8.550558090209961, "rewards/rejected": -4.514052391052246, "step": 5210 }, { "epoch": 3.807123287671233, "grad_norm": 6.262794825757987, "learning_rate": 3.458206178620915e-09, "logits/chosen": -2.9562182426452637, "logits/rejected": -2.6414921283721924, "logps/chosen": -880.1319580078125, "logps/rejected": -793.2999267578125, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": 4.046532154083252, "rewards/margins": 4.834083557128906, "rewards/rejected": -0.7875512838363647, "step": 5211 }, { "epoch": 3.8078538812785387, "grad_norm": 5.750981454430972, "learning_rate": 3.431817959358152e-09, "logits/chosen": -3.0045065879821777, "logits/rejected": -2.2553884983062744, "logps/chosen": -714.0697021484375, "logps/rejected": -608.8748779296875, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": 2.902721881866455, "rewards/margins": 7.413932800292969, "rewards/rejected": -4.511210918426514, "step": 5212 }, { "epoch": 3.8085844748858446, "grad_norm": 6.647542066150483, "learning_rate": 3.405530109344551e-09, "logits/chosen": -3.060776710510254, "logits/rejected": -2.5829169750213623, "logps/chosen": -697.486572265625, "logps/rejected": -556.9002685546875, "loss": 0.0321, "rewards/accuracies": 1.0, "rewards/chosen": 4.10626220703125, "rewards/margins": 6.353505611419678, "rewards/rejected": -2.2472434043884277, "step": 5213 }, { "epoch": 3.8093150684931505, "grad_norm": 6.178876010937986, "learning_rate": 3.379342639280969e-09, "logits/chosen": -2.807565689086914, "logits/rejected": -2.072032928466797, "logps/chosen": -508.45758056640625, "logps/rejected": -346.14825439453125, "loss": 0.0307, "rewards/accuracies": 1.0, "rewards/chosen": 2.5130929946899414, "rewards/margins": 6.0061726570129395, "rewards/rejected": -3.493079662322998, "step": 5214 }, { "epoch": 3.810045662100457, "grad_norm": 3.1278655075077384, "learning_rate": 3.3532555598273783e-09, "logits/chosen": -3.181079387664795, "logits/rejected": -2.273641586303711, "logps/chosen": -598.685302734375, "logps/rejected": -547.9608154296875, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": 4.806204319000244, "rewards/margins": 10.492132186889648, "rewards/rejected": -5.685927867889404, "step": 5215 }, { "epoch": 3.8107762557077627, "grad_norm": 4.655113542362329, "learning_rate": 3.3272688816029236e-09, "logits/chosen": -3.2597179412841797, "logits/rejected": -1.846008539199829, "logps/chosen": -868.9325561523438, "logps/rejected": -494.16925048828125, "loss": 0.0183, "rewards/accuracies": 1.0, "rewards/chosen": 5.238460540771484, "rewards/margins": 6.8612141609191895, "rewards/rejected": -1.6227537393569946, "step": 5216 }, { "epoch": 3.8115068493150686, "grad_norm": 5.223874802332025, "learning_rate": 3.301382615185866e-09, "logits/chosen": -2.6102499961853027, "logits/rejected": -2.090876579284668, "logps/chosen": -462.8048095703125, "logps/rejected": -462.7817687988281, "loss": 0.0286, "rewards/accuracies": 1.0, "rewards/chosen": 3.079562187194824, "rewards/margins": 5.635000228881836, "rewards/rejected": -2.5554380416870117, "step": 5217 }, { "epoch": 3.8122374429223744, "grad_norm": 3.692362279060534, "learning_rate": 3.2755967711136366e-09, "logits/chosen": -3.074223756790161, "logits/rejected": -2.2298927307128906, "logps/chosen": -843.734130859375, "logps/rejected": -622.911865234375, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": 3.725541591644287, "rewards/margins": 7.583191394805908, "rewards/rejected": -3.857649087905884, "step": 5218 }, { "epoch": 3.8129680365296803, "grad_norm": 5.6502503668752615, "learning_rate": 3.2499113598826734e-09, "logits/chosen": -2.399674892425537, "logits/rejected": -2.1401443481445312, "logps/chosen": -356.9766845703125, "logps/rejected": -370.1672668457031, "loss": 0.0245, "rewards/accuracies": 1.0, "rewards/chosen": 2.083890438079834, "rewards/margins": 4.9438886642456055, "rewards/rejected": -2.8599977493286133, "step": 5219 }, { "epoch": 3.813698630136986, "grad_norm": 6.074647982724075, "learning_rate": 3.2243263919486676e-09, "logits/chosen": -2.970709800720215, "logits/rejected": -2.6480801105499268, "logps/chosen": -429.0413513183594, "logps/rejected": -468.41961669921875, "loss": 0.0289, "rewards/accuracies": 1.0, "rewards/chosen": 2.2275145053863525, "rewards/margins": 4.830199241638184, "rewards/rejected": -2.602684497833252, "step": 5220 }, { "epoch": 3.814429223744292, "grad_norm": 6.96864298237965, "learning_rate": 3.1988418777263437e-09, "logits/chosen": -3.3144006729125977, "logits/rejected": -2.865018367767334, "logps/chosen": -593.716064453125, "logps/rejected": -509.0058288574219, "loss": 0.0228, "rewards/accuracies": 1.0, "rewards/chosen": 3.2111129760742188, "rewards/margins": 6.831942558288574, "rewards/rejected": -3.6208291053771973, "step": 5221 }, { "epoch": 3.8151598173515984, "grad_norm": 6.2910229014627275, "learning_rate": 3.173457827589543e-09, "logits/chosen": -3.0890164375305176, "logits/rejected": -2.2987709045410156, "logps/chosen": -632.9613037109375, "logps/rejected": -457.38323974609375, "loss": 0.0441, "rewards/accuracies": 1.0, "rewards/chosen": 3.214035987854004, "rewards/margins": 4.789552211761475, "rewards/rejected": -1.5755162239074707, "step": 5222 }, { "epoch": 3.8158904109589042, "grad_norm": 4.621432397579294, "learning_rate": 3.148174251871222e-09, "logits/chosen": -2.706218719482422, "logits/rejected": -2.6127169132232666, "logps/chosen": -812.1680908203125, "logps/rejected": -892.8673095703125, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": 5.319980144500732, "rewards/margins": 7.179752349853516, "rewards/rejected": -1.8597720861434937, "step": 5223 }, { "epoch": 3.81662100456621, "grad_norm": 3.6561096686143792, "learning_rate": 3.122991160863453e-09, "logits/chosen": -2.7091383934020996, "logits/rejected": -2.1006367206573486, "logps/chosen": -684.5999755859375, "logps/rejected": -576.1605224609375, "loss": 0.0194, "rewards/accuracies": 1.0, "rewards/chosen": 5.7573089599609375, "rewards/margins": 7.925378799438477, "rewards/rejected": -2.168069839477539, "step": 5224 }, { "epoch": 3.817351598173516, "grad_norm": 5.754887369679385, "learning_rate": 3.09790856481737e-09, "logits/chosen": -3.0501444339752197, "logits/rejected": -2.316279888153076, "logps/chosen": -647.9722900390625, "logps/rejected": -595.6656494140625, "loss": 0.0211, "rewards/accuracies": 1.0, "rewards/chosen": 1.743711233139038, "rewards/margins": 4.260889053344727, "rewards/rejected": -2.5171780586242676, "step": 5225 }, { "epoch": 3.818082191780822, "grad_norm": 3.732051749170138, "learning_rate": 3.072926473943194e-09, "logits/chosen": -2.492083787918091, "logits/rejected": -2.4164090156555176, "logps/chosen": -584.0874633789062, "logps/rejected": -621.1441040039062, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": 4.587890625, "rewards/margins": 7.936004638671875, "rewards/rejected": -3.348114013671875, "step": 5226 }, { "epoch": 3.8188127853881277, "grad_norm": 4.814764109504824, "learning_rate": 3.048044898410318e-09, "logits/chosen": -2.6651525497436523, "logits/rejected": -2.3075122833251953, "logps/chosen": -504.91644287109375, "logps/rejected": -505.13116455078125, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": 3.0246522426605225, "rewards/margins": 6.776699066162109, "rewards/rejected": -3.752047061920166, "step": 5227 }, { "epoch": 3.8195433789954336, "grad_norm": 7.674410461249409, "learning_rate": 3.0232638483471407e-09, "logits/chosen": -2.834707260131836, "logits/rejected": -2.3588709831237793, "logps/chosen": -552.1444091796875, "logps/rejected": -610.840087890625, "loss": 0.0484, "rewards/accuracies": 1.0, "rewards/chosen": 3.9833219051361084, "rewards/margins": 4.644701957702637, "rewards/rejected": -0.6613801121711731, "step": 5228 }, { "epoch": 3.82027397260274, "grad_norm": 7.984252928754341, "learning_rate": 2.9985833338411202e-09, "logits/chosen": -2.6893343925476074, "logits/rejected": -2.7177157402038574, "logps/chosen": -852.16015625, "logps/rejected": -942.69384765625, "loss": 0.0371, "rewards/accuracies": 1.0, "rewards/chosen": 4.038556098937988, "rewards/margins": 5.859444618225098, "rewards/rejected": -1.8208881616592407, "step": 5229 }, { "epoch": 3.821004566210046, "grad_norm": 4.230046724101985, "learning_rate": 2.9740033649388596e-09, "logits/chosen": -2.4911999702453613, "logits/rejected": -2.066964864730835, "logps/chosen": -840.8041381835938, "logps/rejected": -629.3493041992188, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": 2.4576573371887207, "rewards/margins": 4.375649452209473, "rewards/rejected": -1.9179916381835938, "step": 5230 }, { "epoch": 3.8217351598173517, "grad_norm": 5.6739511604560215, "learning_rate": 2.9495239516459934e-09, "logits/chosen": -2.731813669204712, "logits/rejected": -2.526482582092285, "logps/chosen": -812.2105712890625, "logps/rejected": -819.2666015625, "loss": 0.0193, "rewards/accuracies": 1.0, "rewards/chosen": 3.9355719089508057, "rewards/margins": 5.140102863311768, "rewards/rejected": -1.204531192779541, "step": 5231 }, { "epoch": 3.8224657534246576, "grad_norm": 5.194648872382871, "learning_rate": 2.925145103927218e-09, "logits/chosen": -2.8472959995269775, "logits/rejected": -2.6334335803985596, "logps/chosen": -492.9873962402344, "logps/rejected": -783.2279663085938, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": 2.9789047241210938, "rewards/margins": 6.649497985839844, "rewards/rejected": -3.67059326171875, "step": 5232 }, { "epoch": 3.8231963470319634, "grad_norm": 6.926442725214986, "learning_rate": 2.900866831706289e-09, "logits/chosen": -2.6000847816467285, "logits/rejected": -1.888007402420044, "logps/chosen": -761.0650634765625, "logps/rejected": -458.0749206542969, "loss": 0.0303, "rewards/accuracies": 1.0, "rewards/chosen": 4.971622467041016, "rewards/margins": 5.910091400146484, "rewards/rejected": -0.9384692907333374, "step": 5233 }, { "epoch": 3.8239269406392693, "grad_norm": 7.33324365850008, "learning_rate": 2.876689144866107e-09, "logits/chosen": -3.2566113471984863, "logits/rejected": -2.246425151824951, "logps/chosen": -481.97430419921875, "logps/rejected": -289.7890319824219, "loss": 0.0365, "rewards/accuracies": 1.0, "rewards/chosen": 1.8848772048950195, "rewards/margins": 4.5640668869018555, "rewards/rejected": -2.679189682006836, "step": 5234 }, { "epoch": 3.824657534246575, "grad_norm": 5.145021176308518, "learning_rate": 2.852612053248521e-09, "logits/chosen": -2.812753200531006, "logits/rejected": -2.3604280948638916, "logps/chosen": -701.9542846679688, "logps/rejected": -598.568115234375, "loss": 0.0182, "rewards/accuracies": 1.0, "rewards/chosen": 4.244508266448975, "rewards/margins": 6.745742321014404, "rewards/rejected": -2.5012340545654297, "step": 5235 }, { "epoch": 3.8253881278538815, "grad_norm": 8.131900615304364, "learning_rate": 2.8286355666544413e-09, "logits/chosen": -2.7199041843414307, "logits/rejected": -1.8558025360107422, "logps/chosen": -715.6644287109375, "logps/rejected": -482.4815368652344, "loss": 0.0277, "rewards/accuracies": 1.0, "rewards/chosen": 3.9224720001220703, "rewards/margins": 6.631243705749512, "rewards/rejected": -2.7087714672088623, "step": 5236 }, { "epoch": 3.8261187214611874, "grad_norm": 6.4465330875500015, "learning_rate": 2.8047596948439223e-09, "logits/chosen": -2.4793245792388916, "logits/rejected": -1.8944756984710693, "logps/chosen": -439.795166015625, "logps/rejected": -411.685302734375, "loss": 0.0273, "rewards/accuracies": 1.0, "rewards/chosen": 2.616572380065918, "rewards/margins": 5.155879974365234, "rewards/rejected": -2.5393073558807373, "step": 5237 }, { "epoch": 3.8268493150684932, "grad_norm": 6.955016146873527, "learning_rate": 2.780984447535911e-09, "logits/chosen": -2.6635549068450928, "logits/rejected": -2.258091926574707, "logps/chosen": -589.5205688476562, "logps/rejected": -496.5456848144531, "loss": 0.0413, "rewards/accuracies": 1.0, "rewards/chosen": 3.121493339538574, "rewards/margins": 5.950862407684326, "rewards/rejected": -2.82936954498291, "step": 5238 }, { "epoch": 3.827579908675799, "grad_norm": 5.945965644560105, "learning_rate": 2.757309834408528e-09, "logits/chosen": -3.112118721008301, "logits/rejected": -2.3712778091430664, "logps/chosen": -715.546142578125, "logps/rejected": -574.0830688476562, "loss": 0.0455, "rewards/accuracies": 1.0, "rewards/chosen": 3.7770943641662598, "rewards/margins": 6.123271942138672, "rewards/rejected": -2.346177816390991, "step": 5239 }, { "epoch": 3.828310502283105, "grad_norm": 7.105655845959661, "learning_rate": 2.7337358650988686e-09, "logits/chosen": -2.9216434955596924, "logits/rejected": -2.2986581325531006, "logps/chosen": -744.6755981445312, "logps/rejected": -677.470947265625, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": 4.97965669631958, "rewards/margins": 7.830007076263428, "rewards/rejected": -2.8503506183624268, "step": 5240 }, { "epoch": 3.829041095890411, "grad_norm": 6.779241167662216, "learning_rate": 2.710262549203063e-09, "logits/chosen": -2.6154935359954834, "logits/rejected": -1.8498114347457886, "logps/chosen": -607.2868041992188, "logps/rejected": -499.66180419921875, "loss": 0.0218, "rewards/accuracies": 1.0, "rewards/chosen": 4.559902191162109, "rewards/margins": 6.508238792419434, "rewards/rejected": -1.9483364820480347, "step": 5241 }, { "epoch": 3.8297716894977167, "grad_norm": 5.7670741133910965, "learning_rate": 2.686889896276273e-09, "logits/chosen": -2.4837164878845215, "logits/rejected": -2.876283884048462, "logps/chosen": -657.7582397460938, "logps/rejected": -839.1428833007812, "loss": 0.0216, "rewards/accuracies": 1.0, "rewards/chosen": 2.9515321254730225, "rewards/margins": 5.32562255859375, "rewards/rejected": -2.3740904331207275, "step": 5242 }, { "epoch": 3.830502283105023, "grad_norm": 5.194641528864462, "learning_rate": 2.6636179158326668e-09, "logits/chosen": -2.8169612884521484, "logits/rejected": -2.0658133029937744, "logps/chosen": -942.9879150390625, "logps/rejected": -674.994873046875, "loss": 0.0186, "rewards/accuracies": 1.0, "rewards/chosen": 5.456801891326904, "rewards/margins": 6.152978420257568, "rewards/rejected": -0.696176290512085, "step": 5243 }, { "epoch": 3.831232876712329, "grad_norm": 6.115891515414602, "learning_rate": 2.6404466173454986e-09, "logits/chosen": -3.0393576622009277, "logits/rejected": -2.6499695777893066, "logps/chosen": -708.08447265625, "logps/rejected": -606.4956665039062, "loss": 0.0536, "rewards/accuracies": 1.0, "rewards/chosen": 3.035054922103882, "rewards/margins": 7.141269683837891, "rewards/rejected": -4.106215476989746, "step": 5244 }, { "epoch": 3.831963470319635, "grad_norm": 6.413655685180401, "learning_rate": 2.6173760102469743e-09, "logits/chosen": -2.4699389934539795, "logits/rejected": -2.562906503677368, "logps/chosen": -459.02899169921875, "logps/rejected": -689.6075439453125, "loss": 0.0363, "rewards/accuracies": 1.0, "rewards/chosen": 3.939563035964966, "rewards/margins": 7.197142124176025, "rewards/rejected": -3.2575790882110596, "step": 5245 }, { "epoch": 3.8326940639269407, "grad_norm": 7.816914280248063, "learning_rate": 2.594406103928276e-09, "logits/chosen": -2.762704849243164, "logits/rejected": -2.5482230186462402, "logps/chosen": -457.5686340332031, "logps/rejected": -534.8461303710938, "loss": 0.0347, "rewards/accuracies": 1.0, "rewards/chosen": 2.813941478729248, "rewards/margins": 4.6194167137146, "rewards/rejected": -1.8054746389389038, "step": 5246 }, { "epoch": 3.8334246575342465, "grad_norm": 4.069327852201291, "learning_rate": 2.571536907739702e-09, "logits/chosen": -2.9221372604370117, "logits/rejected": -2.0616421699523926, "logps/chosen": -1034.330810546875, "logps/rejected": -773.209716796875, "loss": 0.0167, "rewards/accuracies": 1.0, "rewards/chosen": 6.423801422119141, "rewards/margins": 8.37324047088623, "rewards/rejected": -1.949439525604248, "step": 5247 }, { "epoch": 3.8341552511415524, "grad_norm": 15.4800626444966, "learning_rate": 2.5487684309905e-09, "logits/chosen": -2.380464553833008, "logits/rejected": -2.197984218597412, "logps/chosen": -299.91796875, "logps/rejected": -297.166748046875, "loss": 0.0804, "rewards/accuracies": 1.0, "rewards/chosen": 2.1244752407073975, "rewards/margins": 4.941379547119141, "rewards/rejected": -2.816904067993164, "step": 5248 }, { "epoch": 3.8348858447488583, "grad_norm": 6.654387450770409, "learning_rate": 2.526100682948895e-09, "logits/chosen": -3.057969570159912, "logits/rejected": -2.7429072856903076, "logps/chosen": -453.75982666015625, "logps/rejected": -317.21612548828125, "loss": 0.0284, "rewards/accuracies": 1.0, "rewards/chosen": 0.887670636177063, "rewards/margins": 3.987915515899658, "rewards/rejected": -3.1002447605133057, "step": 5249 }, { "epoch": 3.8356164383561646, "grad_norm": 6.645603315464569, "learning_rate": 2.5035336728421177e-09, "logits/chosen": -2.467844009399414, "logits/rejected": -2.1159310340881348, "logps/chosen": -567.4073486328125, "logps/rejected": -504.0792236328125, "loss": 0.0396, "rewards/accuracies": 1.0, "rewards/chosen": 3.6973557472229004, "rewards/margins": 8.680526733398438, "rewards/rejected": -4.983171463012695, "step": 5250 }, { "epoch": 3.83634703196347, "grad_norm": 9.92666092434579, "learning_rate": 2.48106740985643e-09, "logits/chosen": -3.026670455932617, "logits/rejected": -1.9222476482391357, "logps/chosen": -771.4266967773438, "logps/rejected": -517.7152099609375, "loss": 0.0506, "rewards/accuracies": 1.0, "rewards/chosen": 5.647372245788574, "rewards/margins": 7.614675998687744, "rewards/rejected": -1.9673035144805908, "step": 5251 }, { "epoch": 3.8370776255707764, "grad_norm": 9.47105173266684, "learning_rate": 2.4587019031370725e-09, "logits/chosen": -3.035889148712158, "logits/rejected": -2.3144283294677734, "logps/chosen": -527.0588989257812, "logps/rejected": -353.50579833984375, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": 2.6661674976348877, "rewards/margins": 6.117983818054199, "rewards/rejected": -3.4518167972564697, "step": 5252 }, { "epoch": 3.8378082191780822, "grad_norm": 2.2399910725688144, "learning_rate": 2.436437161788235e-09, "logits/chosen": -2.7304975986480713, "logits/rejected": -2.0213470458984375, "logps/chosen": -581.7985229492188, "logps/rejected": -409.1255798339844, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": 3.831435203552246, "rewards/margins": 6.740970611572266, "rewards/rejected": -2.9095349311828613, "step": 5253 }, { "epoch": 3.838538812785388, "grad_norm": 6.7305186264837955, "learning_rate": 2.414273194873112e-09, "logits/chosen": -2.6972875595092773, "logits/rejected": -2.0110952854156494, "logps/chosen": -476.6329650878906, "logps/rejected": -474.2299499511719, "loss": 0.0333, "rewards/accuracies": 1.0, "rewards/chosen": 3.086911916732788, "rewards/margins": 8.531997680664062, "rewards/rejected": -5.445086479187012, "step": 5254 }, { "epoch": 3.839269406392694, "grad_norm": 6.212178040418624, "learning_rate": 2.392210011413848e-09, "logits/chosen": -2.226910352706909, "logits/rejected": -2.379427433013916, "logps/chosen": -377.79681396484375, "logps/rejected": -392.1246643066406, "loss": 0.038, "rewards/accuracies": 1.0, "rewards/chosen": 2.664052724838257, "rewards/margins": 4.853682518005371, "rewards/rejected": -2.1896300315856934, "step": 5255 }, { "epoch": 3.84, "grad_norm": 6.200022654338179, "learning_rate": 2.3702476203916745e-09, "logits/chosen": -2.7298943996429443, "logits/rejected": -2.3748795986175537, "logps/chosen": -743.7947387695312, "logps/rejected": -694.5921020507812, "loss": 0.0204, "rewards/accuracies": 1.0, "rewards/chosen": 2.4829211235046387, "rewards/margins": 5.294037818908691, "rewards/rejected": -2.8111166954040527, "step": 5256 }, { "epoch": 3.840730593607306, "grad_norm": 3.8883236126070346, "learning_rate": 2.348386030746635e-09, "logits/chosen": -3.3234763145446777, "logits/rejected": -2.367082118988037, "logps/chosen": -694.7268676757812, "logps/rejected": -589.4625244140625, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": 3.432522773742676, "rewards/margins": 6.305003643035889, "rewards/rejected": -2.872480869293213, "step": 5257 }, { "epoch": 3.8414611872146116, "grad_norm": 4.896840396410722, "learning_rate": 2.326625251377806e-09, "logits/chosen": -3.1293535232543945, "logits/rejected": -1.7655140161514282, "logps/chosen": -527.9343872070312, "logps/rejected": -401.551025390625, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": 4.644782066345215, "rewards/margins": 8.743817329406738, "rewards/rejected": -4.099034309387207, "step": 5258 }, { "epoch": 3.842191780821918, "grad_norm": 3.1782472240010087, "learning_rate": 2.3049652911433237e-09, "logits/chosen": -2.7295925617218018, "logits/rejected": -1.9090304374694824, "logps/chosen": -595.7210083007812, "logps/rejected": -482.5689392089844, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": 3.348572015762329, "rewards/margins": 6.835639953613281, "rewards/rejected": -3.4870681762695312, "step": 5259 }, { "epoch": 3.842922374429224, "grad_norm": 2.9709824048949387, "learning_rate": 2.2834061588600793e-09, "logits/chosen": -2.870372772216797, "logits/rejected": -2.0704331398010254, "logps/chosen": -511.3919982910156, "logps/rejected": -384.8826904296875, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": 3.673490524291992, "rewards/margins": 7.92473840713501, "rewards/rejected": -4.251247406005859, "step": 5260 }, { "epoch": 3.8436529680365297, "grad_norm": 7.846260307418278, "learning_rate": 2.2619478633041643e-09, "logits/chosen": -2.9188895225524902, "logits/rejected": -1.757091760635376, "logps/chosen": -510.54541015625, "logps/rejected": -410.8735046386719, "loss": 0.0344, "rewards/accuracies": 1.0, "rewards/chosen": 1.720239520072937, "rewards/margins": 5.532722473144531, "rewards/rejected": -3.812483310699463, "step": 5261 }, { "epoch": 3.8443835616438355, "grad_norm": 5.809167706596133, "learning_rate": 2.240590413210397e-09, "logits/chosen": -2.7393321990966797, "logits/rejected": -1.8531897068023682, "logps/chosen": -684.847900390625, "logps/rejected": -446.90118408203125, "loss": 0.0338, "rewards/accuracies": 1.0, "rewards/chosen": 3.0491676330566406, "rewards/margins": 4.973120212554932, "rewards/rejected": -1.923952579498291, "step": 5262 }, { "epoch": 3.8451141552511414, "grad_norm": 7.4941698938215255, "learning_rate": 2.219333817272684e-09, "logits/chosen": -2.518155097961426, "logits/rejected": -2.6919403076171875, "logps/chosen": -623.5297241210938, "logps/rejected": -614.510498046875, "loss": 0.0311, "rewards/accuracies": 1.0, "rewards/chosen": 3.8135271072387695, "rewards/margins": 6.27589225769043, "rewards/rejected": -2.46236515045166, "step": 5263 }, { "epoch": 3.8458447488584477, "grad_norm": 8.721080837634892, "learning_rate": 2.1981780841438536e-09, "logits/chosen": -3.7582924365997314, "logits/rejected": -2.484363317489624, "logps/chosen": -478.96026611328125, "logps/rejected": -298.7608337402344, "loss": 0.0363, "rewards/accuracies": 1.0, "rewards/chosen": 2.645456075668335, "rewards/margins": 5.491926193237305, "rewards/rejected": -2.846470355987549, "step": 5264 }, { "epoch": 3.846575342465753, "grad_norm": 8.79624012473689, "learning_rate": 2.1771232224356282e-09, "logits/chosen": -2.4980592727661133, "logits/rejected": -2.414987802505493, "logps/chosen": -439.876708984375, "logps/rejected": -431.4012451171875, "loss": 0.0468, "rewards/accuracies": 1.0, "rewards/chosen": 2.4847075939178467, "rewards/margins": 6.0729780197143555, "rewards/rejected": -3.588270664215088, "step": 5265 }, { "epoch": 3.8473059360730595, "grad_norm": 9.519759659498204, "learning_rate": 2.156169240718736e-09, "logits/chosen": -2.378816604614258, "logits/rejected": -2.269125461578369, "logps/chosen": -490.6075439453125, "logps/rejected": -433.0450439453125, "loss": 0.0664, "rewards/accuracies": 0.875, "rewards/chosen": 1.3947608470916748, "rewards/margins": 4.064693927764893, "rewards/rejected": -2.6699330806732178, "step": 5266 }, { "epoch": 3.8480365296803654, "grad_norm": 12.657047500915436, "learning_rate": 2.13531614752277e-09, "logits/chosen": -2.4589734077453613, "logits/rejected": -1.59918212890625, "logps/chosen": -489.2373046875, "logps/rejected": -364.95098876953125, "loss": 0.0364, "rewards/accuracies": 1.0, "rewards/chosen": 2.7674973011016846, "rewards/margins": 6.376322269439697, "rewards/rejected": -3.608825445175171, "step": 5267 }, { "epoch": 3.8487671232876712, "grad_norm": 4.388564525695322, "learning_rate": 2.114563951336329e-09, "logits/chosen": -2.6710665225982666, "logits/rejected": -2.2488882541656494, "logps/chosen": -471.61846923828125, "logps/rejected": -421.7451171875, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": 3.6153459548950195, "rewards/margins": 5.257635116577148, "rewards/rejected": -1.6422895193099976, "step": 5268 }, { "epoch": 3.849497716894977, "grad_norm": 6.177352136056499, "learning_rate": 2.093912660606878e-09, "logits/chosen": -2.700434446334839, "logits/rejected": -2.2084412574768066, "logps/chosen": -1032.2587890625, "logps/rejected": -812.318359375, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": 5.038930416107178, "rewards/margins": 6.432521343231201, "rewards/rejected": -1.3935909271240234, "step": 5269 }, { "epoch": 3.850228310502283, "grad_norm": 7.633141595467431, "learning_rate": 2.073362283740859e-09, "logits/chosen": -2.8375136852264404, "logits/rejected": -2.5996956825256348, "logps/chosen": -310.53350830078125, "logps/rejected": -338.002685546875, "loss": 0.0658, "rewards/accuracies": 1.0, "rewards/chosen": 1.3300080299377441, "rewards/margins": 4.247592449188232, "rewards/rejected": -2.91758394241333, "step": 5270 }, { "epoch": 3.8509589041095893, "grad_norm": 3.2285879949730623, "learning_rate": 2.0529128291036093e-09, "logits/chosen": -2.7293174266815186, "logits/rejected": -2.690619945526123, "logps/chosen": -795.4890747070312, "logps/rejected": -978.388427734375, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": 2.7662854194641113, "rewards/margins": 6.708857536315918, "rewards/rejected": -3.942572593688965, "step": 5271 }, { "epoch": 3.8516894977168947, "grad_norm": 4.56492696513553, "learning_rate": 2.03256430501933e-09, "logits/chosen": -2.4124958515167236, "logits/rejected": -2.0772976875305176, "logps/chosen": -552.2783203125, "logps/rejected": -553.3615112304688, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": 3.664337635040283, "rewards/margins": 6.562137126922607, "rewards/rejected": -2.897799491882324, "step": 5272 }, { "epoch": 3.852420091324201, "grad_norm": 6.221367519955671, "learning_rate": 2.012316719771229e-09, "logits/chosen": -2.7202212810516357, "logits/rejected": -2.5118722915649414, "logps/chosen": -648.3757934570312, "logps/rejected": -619.566650390625, "loss": 0.0254, "rewards/accuracies": 1.0, "rewards/chosen": 4.4204535484313965, "rewards/margins": 6.232489585876465, "rewards/rejected": -1.8120356798171997, "step": 5273 }, { "epoch": 3.853150684931507, "grad_norm": 7.086701104889977, "learning_rate": 1.9921700816013796e-09, "logits/chosen": -2.8271026611328125, "logits/rejected": -2.1732568740844727, "logps/chosen": -481.01934814453125, "logps/rejected": -420.7645568847656, "loss": 0.0344, "rewards/accuracies": 1.0, "rewards/chosen": 1.948785662651062, "rewards/margins": 5.292504787445068, "rewards/rejected": -3.343719005584717, "step": 5274 }, { "epoch": 3.853881278538813, "grad_norm": 4.958273344985997, "learning_rate": 1.972124398710806e-09, "logits/chosen": -2.65535569190979, "logits/rejected": -1.7875313758850098, "logps/chosen": -604.9908447265625, "logps/rejected": -467.7086181640625, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": 3.837264060974121, "rewards/margins": 6.621866226196289, "rewards/rejected": -2.784602165222168, "step": 5275 }, { "epoch": 3.8546118721461187, "grad_norm": 5.639859726155686, "learning_rate": 1.9521796792593694e-09, "logits/chosen": -2.370422601699829, "logits/rejected": -2.577848434448242, "logps/chosen": -591.3310546875, "logps/rejected": -618.7339477539062, "loss": 0.0241, "rewards/accuracies": 1.0, "rewards/chosen": 5.2683539390563965, "rewards/margins": 6.5351362228393555, "rewards/rejected": -1.266782522201538, "step": 5276 }, { "epoch": 3.8553424657534245, "grad_norm": 5.178346025794936, "learning_rate": 1.932335931365853e-09, "logits/chosen": -2.634406566619873, "logits/rejected": -1.7417141199111938, "logps/chosen": -376.078125, "logps/rejected": -311.1354064941406, "loss": 0.0281, "rewards/accuracies": 1.0, "rewards/chosen": 3.5902585983276367, "rewards/margins": 7.577323913574219, "rewards/rejected": -3.9870657920837402, "step": 5277 }, { "epoch": 3.856073059360731, "grad_norm": 8.098723190835537, "learning_rate": 1.9125931631079615e-09, "logits/chosen": -2.6154537200927734, "logits/rejected": -1.9365661144256592, "logps/chosen": -530.426025390625, "logps/rejected": -534.40771484375, "loss": 0.0314, "rewards/accuracies": 1.0, "rewards/chosen": 2.89064359664917, "rewards/margins": 5.006731986999512, "rewards/rejected": -2.1160888671875, "step": 5278 }, { "epoch": 3.8568036529680363, "grad_norm": 6.297575880852699, "learning_rate": 1.8929513825222955e-09, "logits/chosen": -2.8948566913604736, "logits/rejected": -2.180363178253174, "logps/chosen": -664.0488891601562, "logps/rejected": -529.5733032226562, "loss": 0.0308, "rewards/accuracies": 1.0, "rewards/chosen": 3.735186815261841, "rewards/margins": 5.498239517211914, "rewards/rejected": -1.7630534172058105, "step": 5279 }, { "epoch": 3.8575342465753426, "grad_norm": 4.124897117487707, "learning_rate": 1.873410597604319e-09, "logits/chosen": -2.4724411964416504, "logits/rejected": -2.1404531002044678, "logps/chosen": -763.3971557617188, "logps/rejected": -640.8035888671875, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": 4.301029682159424, "rewards/margins": 8.731067657470703, "rewards/rejected": -4.4300384521484375, "step": 5280 }, { "epoch": 3.8582648401826485, "grad_norm": 15.550880353238597, "learning_rate": 1.85397081630842e-09, "logits/chosen": -3.003566265106201, "logits/rejected": -2.801835298538208, "logps/chosen": -671.6346435546875, "logps/rejected": -623.3953857421875, "loss": 0.0404, "rewards/accuracies": 1.0, "rewards/chosen": 2.4966912269592285, "rewards/margins": 4.350244045257568, "rewards/rejected": -1.8535528182983398, "step": 5281 }, { "epoch": 3.8589954337899544, "grad_norm": 10.600646348356769, "learning_rate": 1.8346320465478237e-09, "logits/chosen": -2.75527286529541, "logits/rejected": -2.2314841747283936, "logps/chosen": -513.8711547851562, "logps/rejected": -526.9970703125, "loss": 0.0579, "rewards/accuracies": 0.875, "rewards/chosen": 1.8674328327178955, "rewards/margins": 4.5439677238464355, "rewards/rejected": -2.676534652709961, "step": 5282 }, { "epoch": 3.8597260273972602, "grad_norm": 4.442065402247224, "learning_rate": 1.8153942961947055e-09, "logits/chosen": -3.2591142654418945, "logits/rejected": -2.4663612842559814, "logps/chosen": -508.85943603515625, "logps/rejected": -376.035888671875, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": 2.9080026149749756, "rewards/margins": 6.5687360763549805, "rewards/rejected": -3.660733699798584, "step": 5283 }, { "epoch": 3.860456621004566, "grad_norm": 4.287873428142044, "learning_rate": 1.7962575730799955e-09, "logits/chosen": -2.701746940612793, "logits/rejected": -2.064239740371704, "logps/chosen": -843.0045166015625, "logps/rejected": -716.26611328125, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": 4.415875434875488, "rewards/margins": 6.701472759246826, "rewards/rejected": -2.285597324371338, "step": 5284 }, { "epoch": 3.8611872146118724, "grad_norm": 3.087975246903427, "learning_rate": 1.777221884993685e-09, "logits/chosen": -2.7175445556640625, "logits/rejected": -2.9544243812561035, "logps/chosen": -624.4144287109375, "logps/rejected": -681.1010131835938, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": 3.5893988609313965, "rewards/margins": 5.276727676391602, "rewards/rejected": -1.6873292922973633, "step": 5285 }, { "epoch": 3.861917808219178, "grad_norm": 4.095411356664657, "learning_rate": 1.7582872396844362e-09, "logits/chosen": -2.9584577083587646, "logits/rejected": -1.8137977123260498, "logps/chosen": -742.295166015625, "logps/rejected": -549.892822265625, "loss": 0.0153, "rewards/accuracies": 1.0, "rewards/chosen": 5.929100036621094, "rewards/margins": 7.742117881774902, "rewards/rejected": -1.8130183219909668, "step": 5286 }, { "epoch": 3.862648401826484, "grad_norm": 9.205138421786922, "learning_rate": 1.7394536448599451e-09, "logits/chosen": -2.6780080795288086, "logits/rejected": -1.9950779676437378, "logps/chosen": -670.4388427734375, "logps/rejected": -539.2437133789062, "loss": 0.0343, "rewards/accuracies": 1.0, "rewards/chosen": 2.780271530151367, "rewards/margins": 6.840703010559082, "rewards/rejected": -4.060431480407715, "step": 5287 }, { "epoch": 3.86337899543379, "grad_norm": 5.222117907697463, "learning_rate": 1.720721108186718e-09, "logits/chosen": -2.355945348739624, "logits/rejected": -2.255124092102051, "logps/chosen": -506.4473876953125, "logps/rejected": -816.7392578125, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": 2.7807343006134033, "rewards/margins": 8.233304023742676, "rewards/rejected": -5.452569007873535, "step": 5288 }, { "epoch": 3.864109589041096, "grad_norm": 5.355398053816293, "learning_rate": 1.702089637290044e-09, "logits/chosen": -2.5922021865844727, "logits/rejected": -1.9192326068878174, "logps/chosen": -648.8111572265625, "logps/rejected": -549.3316650390625, "loss": 0.0263, "rewards/accuracies": 1.0, "rewards/chosen": 3.253237009048462, "rewards/margins": 6.554808139801025, "rewards/rejected": -3.3015708923339844, "step": 5289 }, { "epoch": 3.864840182648402, "grad_norm": 5.025688936443308, "learning_rate": 1.6835592397542176e-09, "logits/chosen": -2.8416595458984375, "logits/rejected": -2.257702589035034, "logps/chosen": -513.3139038085938, "logps/rejected": -549.6953735351562, "loss": 0.0342, "rewards/accuracies": 1.0, "rewards/chosen": 2.232560634613037, "rewards/margins": 4.728062629699707, "rewards/rejected": -2.49550199508667, "step": 5290 }, { "epoch": 3.8655707762557077, "grad_norm": 10.510049642777732, "learning_rate": 1.6651299231222326e-09, "logits/chosen": -3.0196971893310547, "logits/rejected": -2.1036808490753174, "logps/chosen": -788.5883178710938, "logps/rejected": -643.2088623046875, "loss": 0.0538, "rewards/accuracies": 1.0, "rewards/chosen": 4.127843379974365, "rewards/margins": 7.787239074707031, "rewards/rejected": -3.659395217895508, "step": 5291 }, { "epoch": 3.8663013698630135, "grad_norm": 6.212086543672281, "learning_rate": 1.6468016948960883e-09, "logits/chosen": -3.428182601928711, "logits/rejected": -2.641767978668213, "logps/chosen": -831.4796142578125, "logps/rejected": -710.6408081054688, "loss": 0.0287, "rewards/accuracies": 1.0, "rewards/chosen": 3.4225943088531494, "rewards/margins": 6.121147155761719, "rewards/rejected": -2.6985526084899902, "step": 5292 }, { "epoch": 3.8670319634703194, "grad_norm": 8.320379437817593, "learning_rate": 1.6285745625365387e-09, "logits/chosen": -2.4121994972229004, "logits/rejected": -2.6213040351867676, "logps/chosen": -647.534423828125, "logps/rejected": -1036.57958984375, "loss": 0.0352, "rewards/accuracies": 1.0, "rewards/chosen": 3.4358086585998535, "rewards/margins": 6.473837852478027, "rewards/rejected": -3.038029432296753, "step": 5293 }, { "epoch": 3.8677625570776257, "grad_norm": 8.699127100858714, "learning_rate": 1.6104485334631767e-09, "logits/chosen": -2.4032015800476074, "logits/rejected": -2.672471761703491, "logps/chosen": -527.0780029296875, "logps/rejected": -758.1159057617188, "loss": 0.0391, "rewards/accuracies": 1.0, "rewards/chosen": 1.8483445644378662, "rewards/margins": 4.16773796081543, "rewards/rejected": -2.3193933963775635, "step": 5294 }, { "epoch": 3.8684931506849316, "grad_norm": 6.138979159804317, "learning_rate": 1.5924236150545445e-09, "logits/chosen": -2.4145824909210205, "logits/rejected": -2.5705509185791016, "logps/chosen": -553.5245971679688, "logps/rejected": -613.711181640625, "loss": 0.0249, "rewards/accuracies": 1.0, "rewards/chosen": 2.6766204833984375, "rewards/margins": 3.9947376251220703, "rewards/rejected": -1.3181171417236328, "step": 5295 }, { "epoch": 3.8692237442922375, "grad_norm": 3.8442036430807223, "learning_rate": 1.5744998146478839e-09, "logits/chosen": -2.6751084327697754, "logits/rejected": -2.437016487121582, "logps/chosen": -784.3072509765625, "logps/rejected": -736.8070678710938, "loss": 0.0338, "rewards/accuracies": 0.875, "rewards/chosen": 4.2072672843933105, "rewards/margins": 5.787822723388672, "rewards/rejected": -1.5805550813674927, "step": 5296 }, { "epoch": 3.8699543378995434, "grad_norm": 5.76220701056187, "learning_rate": 1.5566771395393585e-09, "logits/chosen": -2.28950834274292, "logits/rejected": -2.1935224533081055, "logps/chosen": -272.4101867675781, "logps/rejected": -470.6439208984375, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": 2.826810121536255, "rewards/margins": 9.998830795288086, "rewards/rejected": -7.17202091217041, "step": 5297 }, { "epoch": 3.8706849315068492, "grad_norm": 5.714033339240604, "learning_rate": 1.5389555969839707e-09, "logits/chosen": -3.7558321952819824, "logits/rejected": -2.29727840423584, "logps/chosen": -794.0768432617188, "logps/rejected": -464.688232421875, "loss": 0.0196, "rewards/accuracies": 1.0, "rewards/chosen": 4.128149509429932, "rewards/margins": 6.465209007263184, "rewards/rejected": -2.337059497833252, "step": 5298 }, { "epoch": 3.871415525114155, "grad_norm": 6.6738195511358445, "learning_rate": 1.5213351941955332e-09, "logits/chosen": -2.2795352935791016, "logits/rejected": -2.562257766723633, "logps/chosen": -646.1198120117188, "logps/rejected": -763.64013671875, "loss": 0.0275, "rewards/accuracies": 1.0, "rewards/chosen": 2.809168815612793, "rewards/margins": 4.1040520668029785, "rewards/rejected": -1.294883131980896, "step": 5299 }, { "epoch": 3.872146118721461, "grad_norm": 8.405094109802038, "learning_rate": 1.5038159383466976e-09, "logits/chosen": -2.563643455505371, "logits/rejected": -1.8799312114715576, "logps/chosen": -374.55108642578125, "logps/rejected": -325.4872131347656, "loss": 0.0233, "rewards/accuracies": 1.0, "rewards/chosen": 3.193803310394287, "rewards/margins": 6.366426467895508, "rewards/rejected": -3.1726229190826416, "step": 5300 }, { "epoch": 3.8728767123287673, "grad_norm": 7.513718583328956, "learning_rate": 1.4863978365689533e-09, "logits/chosen": -3.4216299057006836, "logits/rejected": -2.0191650390625, "logps/chosen": -420.6685791015625, "logps/rejected": -366.3792724609375, "loss": 0.0285, "rewards/accuracies": 1.0, "rewards/chosen": 4.653346061706543, "rewards/margins": 8.806998252868652, "rewards/rejected": -4.153651714324951, "step": 5301 }, { "epoch": 3.873607305936073, "grad_norm": 4.449936109791419, "learning_rate": 1.4690808959525458e-09, "logits/chosen": -2.968743324279785, "logits/rejected": -2.8973939418792725, "logps/chosen": -424.9508972167969, "logps/rejected": -514.3642578125, "loss": 0.0196, "rewards/accuracies": 1.0, "rewards/chosen": 2.5869476795196533, "rewards/margins": 5.284086227416992, "rewards/rejected": -2.697138786315918, "step": 5302 }, { "epoch": 3.874337899543379, "grad_norm": 6.069844938556915, "learning_rate": 1.4518651235466418e-09, "logits/chosen": -3.1423678398132324, "logits/rejected": -2.383068084716797, "logps/chosen": -725.7162475585938, "logps/rejected": -461.6709899902344, "loss": 0.0224, "rewards/accuracies": 1.0, "rewards/chosen": 3.4393773078918457, "rewards/margins": 5.169164657592773, "rewards/rejected": -1.729786992073059, "step": 5303 }, { "epoch": 3.875068493150685, "grad_norm": 3.493428600725468, "learning_rate": 1.4347505263591353e-09, "logits/chosen": -2.5463781356811523, "logits/rejected": -2.053135633468628, "logps/chosen": -553.0117797851562, "logps/rejected": -497.47406005859375, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": 4.5828857421875, "rewards/margins": 7.429698944091797, "rewards/rejected": -2.846813678741455, "step": 5304 }, { "epoch": 3.875799086757991, "grad_norm": 5.549890692381941, "learning_rate": 1.4177371113568426e-09, "logits/chosen": -3.0990967750549316, "logits/rejected": -1.8373956680297852, "logps/chosen": -690.961669921875, "logps/rejected": -403.10931396484375, "loss": 0.0438, "rewards/accuracies": 1.0, "rewards/chosen": 5.344723701477051, "rewards/margins": 7.207256317138672, "rewards/rejected": -1.8625333309173584, "step": 5305 }, { "epoch": 3.8765296803652967, "grad_norm": 5.300190026097568, "learning_rate": 1.4008248854652515e-09, "logits/chosen": -2.6771559715270996, "logits/rejected": -2.4935569763183594, "logps/chosen": -522.9790649414062, "logps/rejected": -615.356689453125, "loss": 0.0214, "rewards/accuracies": 1.0, "rewards/chosen": 3.614434242248535, "rewards/margins": 5.679876327514648, "rewards/rejected": -2.0654423236846924, "step": 5306 }, { "epoch": 3.8772602739726025, "grad_norm": 5.347574928618346, "learning_rate": 1.3840138555687998e-09, "logits/chosen": -2.4107189178466797, "logits/rejected": -2.5789666175842285, "logps/chosen": -513.872314453125, "logps/rejected": -841.1797485351562, "loss": 0.0269, "rewards/accuracies": 1.0, "rewards/chosen": 3.6562509536743164, "rewards/margins": 6.619801044464111, "rewards/rejected": -2.963550567626953, "step": 5307 }, { "epoch": 3.877990867579909, "grad_norm": 6.87535871090116, "learning_rate": 1.367304028510624e-09, "logits/chosen": -2.6964833736419678, "logits/rejected": -2.087524175643921, "logps/chosen": -456.14141845703125, "logps/rejected": -378.2062683105469, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": 3.1953675746917725, "rewards/margins": 6.7797441482543945, "rewards/rejected": -3.584376335144043, "step": 5308 }, { "epoch": 3.8787214611872147, "grad_norm": 3.7160711817827403, "learning_rate": 1.350695411092756e-09, "logits/chosen": -2.988396167755127, "logits/rejected": -2.441267967224121, "logps/chosen": -785.8348999023438, "logps/rejected": -669.9849243164062, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": 3.3898816108703613, "rewards/margins": 5.6761980056762695, "rewards/rejected": -2.286316394805908, "step": 5309 }, { "epoch": 3.8794520547945206, "grad_norm": 4.72240745829375, "learning_rate": 1.3341880100759262e-09, "logits/chosen": -2.793722629547119, "logits/rejected": -2.284579277038574, "logps/chosen": -587.6516723632812, "logps/rejected": -543.410888671875, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": 3.859463691711426, "rewards/margins": 7.9935431480407715, "rewards/rejected": -4.134079456329346, "step": 5310 }, { "epoch": 3.8801826484018265, "grad_norm": 7.146931674133755, "learning_rate": 1.3177818321797318e-09, "logits/chosen": -3.10871958732605, "logits/rejected": -2.222611665725708, "logps/chosen": -819.303466796875, "logps/rejected": -539.2064819335938, "loss": 0.0269, "rewards/accuracies": 1.0, "rewards/chosen": 3.5828161239624023, "rewards/margins": 4.868401527404785, "rewards/rejected": -1.285585641860962, "step": 5311 }, { "epoch": 3.8809132420091323, "grad_norm": 7.233310827170428, "learning_rate": 1.301476884082553e-09, "logits/chosen": -3.3894004821777344, "logits/rejected": -2.1206305027008057, "logps/chosen": -762.4293823242188, "logps/rejected": -537.659912109375, "loss": 0.041, "rewards/accuracies": 1.0, "rewards/chosen": 4.907464981079102, "rewards/margins": 6.293430805206299, "rewards/rejected": -1.385965347290039, "step": 5312 }, { "epoch": 3.881643835616438, "grad_norm": 6.621528435675776, "learning_rate": 1.2852731724215805e-09, "logits/chosen": -2.893435478210449, "logits/rejected": -2.126159429550171, "logps/chosen": -735.34765625, "logps/rejected": -490.13970947265625, "loss": 0.0268, "rewards/accuracies": 1.0, "rewards/chosen": 1.711076021194458, "rewards/margins": 4.344757556915283, "rewards/rejected": -2.633681297302246, "step": 5313 }, { "epoch": 3.882374429223744, "grad_norm": 8.38843244892604, "learning_rate": 1.2691707037927878e-09, "logits/chosen": -2.6263511180877686, "logits/rejected": -1.3697383403778076, "logps/chosen": -565.4732666015625, "logps/rejected": -327.24981689453125, "loss": 0.0342, "rewards/accuracies": 1.0, "rewards/chosen": 4.205363750457764, "rewards/margins": 9.544024467468262, "rewards/rejected": -5.33866024017334, "step": 5314 }, { "epoch": 3.8831050228310504, "grad_norm": 6.703531604163404, "learning_rate": 1.2531694847508768e-09, "logits/chosen": -2.8667750358581543, "logits/rejected": -2.578582763671875, "logps/chosen": -436.6588134765625, "logps/rejected": -416.68328857421875, "loss": 0.0364, "rewards/accuracies": 1.0, "rewards/chosen": 2.0300660133361816, "rewards/margins": 3.6678223609924316, "rewards/rejected": -1.6377564668655396, "step": 5315 }, { "epoch": 3.8838356164383563, "grad_norm": 6.8110081150273745, "learning_rate": 1.2372695218094143e-09, "logits/chosen": -2.6003034114837646, "logits/rejected": -1.7665815353393555, "logps/chosen": -619.5506591796875, "logps/rejected": -418.5453186035156, "loss": 0.0456, "rewards/accuracies": 1.0, "rewards/chosen": 2.475917339324951, "rewards/margins": 4.91632080078125, "rewards/rejected": -2.440403699874878, "step": 5316 }, { "epoch": 3.884566210045662, "grad_norm": 10.954861012233943, "learning_rate": 1.2214708214406956e-09, "logits/chosen": -3.091003894805908, "logits/rejected": -1.9623754024505615, "logps/chosen": -559.1675415039062, "logps/rejected": -374.476806640625, "loss": 0.0273, "rewards/accuracies": 1.0, "rewards/chosen": 3.8753104209899902, "rewards/margins": 7.45311975479126, "rewards/rejected": -3.5778090953826904, "step": 5317 }, { "epoch": 3.885296803652968, "grad_norm": 4.575136843443097, "learning_rate": 1.2057733900758538e-09, "logits/chosen": -2.7488009929656982, "logits/rejected": -1.8859307765960693, "logps/chosen": -716.0728149414062, "logps/rejected": -609.9943237304688, "loss": 0.0155, "rewards/accuracies": 1.0, "rewards/chosen": 5.1848063468933105, "rewards/margins": 7.704015731811523, "rewards/rejected": -2.519209384918213, "step": 5318 }, { "epoch": 3.886027397260274, "grad_norm": 4.72046918056542, "learning_rate": 1.19017723410475e-09, "logits/chosen": -3.266158103942871, "logits/rejected": -2.522644281387329, "logps/chosen": -810.2605590820312, "logps/rejected": -686.6796264648438, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": 4.479077339172363, "rewards/margins": 4.7810750007629395, "rewards/rejected": -0.30199748277664185, "step": 5319 }, { "epoch": 3.88675799086758, "grad_norm": 8.214396088610894, "learning_rate": 1.1746823598759726e-09, "logits/chosen": -3.0778276920318604, "logits/rejected": -2.370201587677002, "logps/chosen": -922.5819091796875, "logps/rejected": -686.8038330078125, "loss": 0.0377, "rewards/accuracies": 1.0, "rewards/chosen": 3.490873336791992, "rewards/margins": 4.889963150024414, "rewards/rejected": -1.3990901708602905, "step": 5320 }, { "epoch": 3.8874885844748857, "grad_norm": 6.073246505792202, "learning_rate": 1.1592887736970314e-09, "logits/chosen": -3.0625362396240234, "logits/rejected": -2.1487889289855957, "logps/chosen": -938.3579711914062, "logps/rejected": -576.3662719726562, "loss": 0.0249, "rewards/accuracies": 1.0, "rewards/chosen": 4.484158515930176, "rewards/margins": 5.258959770202637, "rewards/rejected": -0.7748017311096191, "step": 5321 }, { "epoch": 3.888219178082192, "grad_norm": 5.005872520283657, "learning_rate": 1.143996481834053e-09, "logits/chosen": -3.0071632862091064, "logits/rejected": -2.2092151641845703, "logps/chosen": -415.6402282714844, "logps/rejected": -381.8406066894531, "loss": 0.022, "rewards/accuracies": 1.0, "rewards/chosen": 1.8642487525939941, "rewards/margins": 4.724877834320068, "rewards/rejected": -2.8606293201446533, "step": 5322 }, { "epoch": 3.888949771689498, "grad_norm": 6.610359280931562, "learning_rate": 1.1288054905120303e-09, "logits/chosen": -2.6385457515716553, "logits/rejected": -2.0349740982055664, "logps/chosen": -525.227294921875, "logps/rejected": -489.4810791015625, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": 3.214527130126953, "rewards/margins": 7.9630303382873535, "rewards/rejected": -4.7485032081604, "step": 5323 }, { "epoch": 3.8896803652968037, "grad_norm": 6.617795661803613, "learning_rate": 1.1137158059146556e-09, "logits/chosen": -2.9631948471069336, "logits/rejected": -1.5545576810836792, "logps/chosen": -479.0025939941406, "logps/rejected": -308.3868713378906, "loss": 0.0286, "rewards/accuracies": 1.0, "rewards/chosen": 2.3502237796783447, "rewards/margins": 4.995733261108398, "rewards/rejected": -2.6455090045928955, "step": 5324 }, { "epoch": 3.8904109589041096, "grad_norm": 5.867917925884113, "learning_rate": 1.0987274341844043e-09, "logits/chosen": -3.223403215408325, "logits/rejected": -2.339263677597046, "logps/chosen": -609.76171875, "logps/rejected": -453.9064025878906, "loss": 0.0343, "rewards/accuracies": 1.0, "rewards/chosen": 3.4833803176879883, "rewards/margins": 5.584743499755859, "rewards/rejected": -2.101363182067871, "step": 5325 }, { "epoch": 3.8911415525114155, "grad_norm": 5.2388035501752785, "learning_rate": 1.083840381422535e-09, "logits/chosen": -3.0927412509918213, "logits/rejected": -2.3879494667053223, "logps/chosen": -714.4210815429688, "logps/rejected": -446.42523193359375, "loss": 0.0179, "rewards/accuracies": 1.0, "rewards/chosen": 4.727773189544678, "rewards/margins": 7.162486553192139, "rewards/rejected": -2.434713363647461, "step": 5326 }, { "epoch": 3.8918721461187213, "grad_norm": 8.804255964912828, "learning_rate": 1.0690546536890332e-09, "logits/chosen": -3.2786366939544678, "logits/rejected": -2.2120778560638428, "logps/chosen": -967.20947265625, "logps/rejected": -830.8525390625, "loss": 0.0341, "rewards/accuracies": 1.0, "rewards/chosen": 3.521345615386963, "rewards/margins": 5.622835159301758, "rewards/rejected": -2.101489782333374, "step": 5327 }, { "epoch": 3.892602739726027, "grad_norm": 4.5645937361338245, "learning_rate": 1.0543702570026681e-09, "logits/chosen": -3.226619005203247, "logits/rejected": -2.223217010498047, "logps/chosen": -575.8312377929688, "logps/rejected": -469.0270080566406, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": 4.595711708068848, "rewards/margins": 9.300226211547852, "rewards/rejected": -4.704514980316162, "step": 5328 }, { "epoch": 3.8933333333333335, "grad_norm": 10.164578165964006, "learning_rate": 1.0397871973409356e-09, "logits/chosen": -2.901933193206787, "logits/rejected": -2.023768424987793, "logps/chosen": -810.1957397460938, "logps/rejected": -551.0218505859375, "loss": 0.0427, "rewards/accuracies": 1.0, "rewards/chosen": 2.316629409790039, "rewards/margins": 5.51013708114624, "rewards/rejected": -3.193507432937622, "step": 5329 }, { "epoch": 3.8940639269406394, "grad_norm": 7.410998146280292, "learning_rate": 1.0253054806400597e-09, "logits/chosen": -2.4101574420928955, "logits/rejected": -1.6869914531707764, "logps/chosen": -475.98748779296875, "logps/rejected": -348.62396240234375, "loss": 0.0366, "rewards/accuracies": 1.0, "rewards/chosen": 2.419642448425293, "rewards/margins": 5.58308219909668, "rewards/rejected": -3.163440465927124, "step": 5330 }, { "epoch": 3.8947945205479453, "grad_norm": 5.268566905830364, "learning_rate": 1.0109251127950746e-09, "logits/chosen": -2.9275317192077637, "logits/rejected": -2.688323974609375, "logps/chosen": -475.7634582519531, "logps/rejected": -505.3776550292969, "loss": 0.0552, "rewards/accuracies": 1.0, "rewards/chosen": 2.2796435356140137, "rewards/margins": 5.08094596862793, "rewards/rejected": -2.801301956176758, "step": 5331 }, { "epoch": 3.895525114155251, "grad_norm": 8.52405644450123, "learning_rate": 9.966460996597147e-10, "logits/chosen": -2.731013774871826, "logits/rejected": -2.172733783721924, "logps/chosen": -684.50146484375, "logps/rejected": -560.359619140625, "loss": 0.0393, "rewards/accuracies": 1.0, "rewards/chosen": 2.94205379486084, "rewards/margins": 4.3521575927734375, "rewards/rejected": -1.4101040363311768, "step": 5332 }, { "epoch": 3.896255707762557, "grad_norm": 5.558317519379424, "learning_rate": 9.824684470464418e-10, "logits/chosen": -2.6149144172668457, "logits/rejected": -2.3470709323883057, "logps/chosen": -546.1055297851562, "logps/rejected": -440.3127746582031, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": 2.3309125900268555, "rewards/margins": 4.404772758483887, "rewards/rejected": -2.0738604068756104, "step": 5333 }, { "epoch": 3.896986301369863, "grad_norm": 11.723016150548272, "learning_rate": 9.683921607265e-10, "logits/chosen": -2.9280195236206055, "logits/rejected": -2.0696394443511963, "logps/chosen": -611.5593872070312, "logps/rejected": -656.15283203125, "loss": 0.0738, "rewards/accuracies": 1.0, "rewards/chosen": 2.209357261657715, "rewards/margins": 4.222980976104736, "rewards/rejected": -2.0136239528656006, "step": 5334 }, { "epoch": 3.8977168949771688, "grad_norm": 3.438806874932568, "learning_rate": 9.544172464298616e-10, "logits/chosen": -2.936245918273926, "logits/rejected": -2.356804609298706, "logps/chosen": -632.1387939453125, "logps/rejected": -566.64892578125, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": 4.159936904907227, "rewards/margins": 6.309918403625488, "rewards/rejected": -2.1499810218811035, "step": 5335 }, { "epoch": 3.898447488584475, "grad_norm": 4.147217537043427, "learning_rate": 9.405437098451984e-10, "logits/chosen": -3.1675825119018555, "logits/rejected": -2.316798686981201, "logps/chosen": -433.7240905761719, "logps/rejected": -326.8531494140625, "loss": 0.0175, "rewards/accuracies": 1.0, "rewards/chosen": 2.1153862476348877, "rewards/margins": 3.969066619873047, "rewards/rejected": -1.8536804914474487, "step": 5336 }, { "epoch": 3.899178082191781, "grad_norm": 6.631902014400792, "learning_rate": 9.267715566199652e-10, "logits/chosen": -3.018054485321045, "logits/rejected": -2.2016971111297607, "logps/chosen": -703.0811767578125, "logps/rejected": -582.3229370117188, "loss": 0.0269, "rewards/accuracies": 1.0, "rewards/chosen": 3.7416977882385254, "rewards/margins": 5.876553058624268, "rewards/rejected": -2.134855270385742, "step": 5337 }, { "epoch": 3.899908675799087, "grad_norm": 3.3120229125589438, "learning_rate": 9.131007923602885e-10, "logits/chosen": -2.773958683013916, "logits/rejected": -1.717670202255249, "logps/chosen": -567.205810546875, "logps/rejected": -409.2973327636719, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": 3.805881977081299, "rewards/margins": 7.931492328643799, "rewards/rejected": -4.1256103515625, "step": 5338 }, { "epoch": 3.9006392694063927, "grad_norm": 7.447345532124417, "learning_rate": 8.995314226311057e-10, "logits/chosen": -2.7334632873535156, "logits/rejected": -2.368579387664795, "logps/chosen": -932.88916015625, "logps/rejected": -668.6273193359375, "loss": 0.0369, "rewards/accuracies": 1.0, "rewards/chosen": 4.660361289978027, "rewards/margins": 6.08365535736084, "rewards/rejected": -1.4232938289642334, "step": 5339 }, { "epoch": 3.9013698630136986, "grad_norm": 2.676796913905169, "learning_rate": 8.860634529559708e-10, "logits/chosen": -2.643618583679199, "logits/rejected": -2.436194896697998, "logps/chosen": -629.1181030273438, "logps/rejected": -629.25341796875, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": 3.3516619205474854, "rewards/margins": 7.1017680168151855, "rewards/rejected": -3.750105857849121, "step": 5340 }, { "epoch": 3.9021004566210045, "grad_norm": 14.846984125473776, "learning_rate": 8.726968888172759e-10, "logits/chosen": -2.876459836959839, "logits/rejected": -1.4857423305511475, "logps/chosen": -702.511474609375, "logps/rejected": -312.97027587890625, "loss": 0.0367, "rewards/accuracies": 1.0, "rewards/chosen": 5.562835693359375, "rewards/margins": 7.8320207595825195, "rewards/rejected": -2.2691845893859863, "step": 5341 }, { "epoch": 3.9028310502283103, "grad_norm": 19.528173976524837, "learning_rate": 8.594317356560299e-10, "logits/chosen": -2.12947154045105, "logits/rejected": -2.4426703453063965, "logps/chosen": -361.556884765625, "logps/rejected": -567.9530639648438, "loss": 0.0443, "rewards/accuracies": 0.875, "rewards/chosen": 2.0786008834838867, "rewards/margins": 6.271665573120117, "rewards/rejected": -4.193065166473389, "step": 5342 }, { "epoch": 3.9035616438356167, "grad_norm": 5.139175720916157, "learning_rate": 8.4626799887208e-10, "logits/chosen": -2.487196207046509, "logits/rejected": -2.3086485862731934, "logps/chosen": -631.39794921875, "logps/rejected": -461.4183349609375, "loss": 0.0405, "rewards/accuracies": 1.0, "rewards/chosen": 3.870493173599243, "rewards/margins": 6.160915851593018, "rewards/rejected": -2.2904231548309326, "step": 5343 }, { "epoch": 3.9042922374429225, "grad_norm": 10.910675112975294, "learning_rate": 8.332056838238343e-10, "logits/chosen": -2.5992884635925293, "logits/rejected": -2.2691497802734375, "logps/chosen": -688.0043334960938, "logps/rejected": -638.9627685546875, "loss": 0.0424, "rewards/accuracies": 1.0, "rewards/chosen": 5.259498119354248, "rewards/margins": 6.943492889404297, "rewards/rejected": -1.683995246887207, "step": 5344 }, { "epoch": 3.9050228310502284, "grad_norm": 3.364589742333574, "learning_rate": 8.202447958285674e-10, "logits/chosen": -2.638399839401245, "logits/rejected": -2.384355068206787, "logps/chosen": -497.3291015625, "logps/rejected": -575.1845092773438, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": 3.9493608474731445, "rewards/margins": 5.704837799072266, "rewards/rejected": -1.7554773092269897, "step": 5345 }, { "epoch": 3.9057534246575343, "grad_norm": 10.305860566877751, "learning_rate": 8.073853401621978e-10, "logits/chosen": -2.5052809715270996, "logits/rejected": -2.338609218597412, "logps/chosen": -698.0946044921875, "logps/rejected": -641.1531372070312, "loss": 0.0614, "rewards/accuracies": 1.0, "rewards/chosen": 3.2963080406188965, "rewards/margins": 3.8074843883514404, "rewards/rejected": -0.511176347732544, "step": 5346 }, { "epoch": 3.90648401826484, "grad_norm": 5.691881106327192, "learning_rate": 7.946273220593158e-10, "logits/chosen": -2.717209815979004, "logits/rejected": -2.4732909202575684, "logps/chosen": -559.504638671875, "logps/rejected": -573.357666015625, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/chosen": 2.715811252593994, "rewards/margins": 5.181816577911377, "rewards/rejected": -2.4660050868988037, "step": 5347 }, { "epoch": 3.907214611872146, "grad_norm": 3.877537665513147, "learning_rate": 7.819707467132952e-10, "logits/chosen": -2.8605237007141113, "logits/rejected": -2.001485824584961, "logps/chosen": -579.1041259765625, "logps/rejected": -387.18316650390625, "loss": 0.0388, "rewards/accuracies": 1.0, "rewards/chosen": 3.4230611324310303, "rewards/margins": 6.753978252410889, "rewards/rejected": -3.3309168815612793, "step": 5348 }, { "epoch": 3.907945205479452, "grad_norm": 2.6772558451086548, "learning_rate": 7.694156192761813e-10, "logits/chosen": -2.8327479362487793, "logits/rejected": -1.8979072570800781, "logps/chosen": -808.7808837890625, "logps/rejected": -668.5404663085938, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": 3.0199368000030518, "rewards/margins": 6.563165664672852, "rewards/rejected": -3.5432286262512207, "step": 5349 }, { "epoch": 3.908675799086758, "grad_norm": 7.355049174006302, "learning_rate": 7.569619448587194e-10, "logits/chosen": -2.754868745803833, "logits/rejected": -2.236849308013916, "logps/chosen": -789.8167724609375, "logps/rejected": -639.5980834960938, "loss": 0.0372, "rewards/accuracies": 1.0, "rewards/chosen": 4.285012245178223, "rewards/margins": 6.880387306213379, "rewards/rejected": -2.5953755378723145, "step": 5350 }, { "epoch": 3.909406392694064, "grad_norm": 5.300556355191457, "learning_rate": 7.446097285303543e-10, "logits/chosen": -2.7134554386138916, "logits/rejected": -2.2907161712646484, "logps/chosen": -599.482421875, "logps/rejected": -492.4972229003906, "loss": 0.0176, "rewards/accuracies": 1.0, "rewards/chosen": 2.621575355529785, "rewards/margins": 4.330104827880859, "rewards/rejected": -1.7085298299789429, "step": 5351 }, { "epoch": 3.91013698630137, "grad_norm": 5.271614991782123, "learning_rate": 7.323589753192582e-10, "logits/chosen": -2.6891682147979736, "logits/rejected": -2.113438606262207, "logps/chosen": -611.45947265625, "logps/rejected": -584.1151123046875, "loss": 0.0218, "rewards/accuracies": 1.0, "rewards/chosen": 3.4237520694732666, "rewards/margins": 6.390506267547607, "rewards/rejected": -2.96675443649292, "step": 5352 }, { "epoch": 3.910867579908676, "grad_norm": 8.827626766222645, "learning_rate": 7.202096902122756e-10, "logits/chosen": -3.252737283706665, "logits/rejected": -2.9279489517211914, "logps/chosen": -574.1126708984375, "logps/rejected": -486.59918212890625, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": 2.312973976135254, "rewards/margins": 4.008462905883789, "rewards/rejected": -1.6954889297485352, "step": 5353 }, { "epoch": 3.9115981735159817, "grad_norm": 6.066675065981461, "learning_rate": 7.081618781549503e-10, "logits/chosen": -2.5771865844726562, "logits/rejected": -1.8004037141799927, "logps/chosen": -611.5759887695312, "logps/rejected": -538.67236328125, "loss": 0.0249, "rewards/accuracies": 1.0, "rewards/chosen": 2.409691333770752, "rewards/margins": 4.765842914581299, "rewards/rejected": -2.356151819229126, "step": 5354 }, { "epoch": 3.9123287671232876, "grad_norm": 6.424508624453222, "learning_rate": 6.962155440515261e-10, "logits/chosen": -2.8166356086730957, "logits/rejected": -1.9632272720336914, "logps/chosen": -591.2838134765625, "logps/rejected": -548.3179931640625, "loss": 0.0314, "rewards/accuracies": 1.0, "rewards/chosen": 3.744938373565674, "rewards/margins": 5.724153518676758, "rewards/rejected": -1.9792149066925049, "step": 5355 }, { "epoch": 3.9130593607305935, "grad_norm": 4.220278992673467, "learning_rate": 6.843706927649462e-10, "logits/chosen": -3.0317132472991943, "logits/rejected": -2.8776235580444336, "logps/chosen": -954.2828369140625, "logps/rejected": -804.5346069335938, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": 6.01793098449707, "rewards/margins": 8.234569549560547, "rewards/rejected": -2.2166388034820557, "step": 5356 }, { "epoch": 3.9137899543379, "grad_norm": 5.629850034461172, "learning_rate": 6.726273291168261e-10, "logits/chosen": -2.950538158416748, "logits/rejected": -1.9598238468170166, "logps/chosen": -692.17431640625, "logps/rejected": -463.8402404785156, "loss": 0.0239, "rewards/accuracies": 1.0, "rewards/chosen": 5.174871444702148, "rewards/margins": 7.8626837730407715, "rewards/rejected": -2.687812089920044, "step": 5357 }, { "epoch": 3.914520547945205, "grad_norm": 6.174784532254983, "learning_rate": 6.609854578874529e-10, "logits/chosen": -2.601503849029541, "logits/rejected": -2.3739728927612305, "logps/chosen": -612.7796630859375, "logps/rejected": -577.1383056640625, "loss": 0.0264, "rewards/accuracies": 1.0, "rewards/chosen": 2.7954633235931396, "rewards/margins": 6.1475300788879395, "rewards/rejected": -3.3520665168762207, "step": 5358 }, { "epoch": 3.9152511415525115, "grad_norm": 4.178977150136419, "learning_rate": 6.494450838158416e-10, "logits/chosen": -2.943697214126587, "logits/rejected": -2.677666425704956, "logps/chosen": -539.912841796875, "logps/rejected": -597.9142456054688, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": 3.0104353427886963, "rewards/margins": 5.434275150299072, "rewards/rejected": -2.423839569091797, "step": 5359 }, { "epoch": 3.9159817351598174, "grad_norm": 7.2890868355200675, "learning_rate": 6.380062115997064e-10, "logits/chosen": -2.8649556636810303, "logits/rejected": -2.3805179595947266, "logps/chosen": -1054.2767333984375, "logps/rejected": -817.29248046875, "loss": 0.0245, "rewards/accuracies": 1.0, "rewards/chosen": 4.983492374420166, "rewards/margins": 5.3659210205078125, "rewards/rejected": -0.38242876529693604, "step": 5360 }, { "epoch": 3.9167123287671233, "grad_norm": 12.324941168406749, "learning_rate": 6.266688458953506e-10, "logits/chosen": -2.977950096130371, "logits/rejected": -2.5601654052734375, "logps/chosen": -814.2347412109375, "logps/rejected": -691.1135864257812, "loss": 0.0312, "rewards/accuracies": 1.0, "rewards/chosen": 5.9912109375, "rewards/margins": 6.841256618499756, "rewards/rejected": -0.8500463962554932, "step": 5361 }, { "epoch": 3.917442922374429, "grad_norm": 8.934545068499833, "learning_rate": 6.154329913178602e-10, "logits/chosen": -2.825422525405884, "logits/rejected": -2.1287877559661865, "logps/chosen": -600.9450073242188, "logps/rejected": -432.7018127441406, "loss": 0.0248, "rewards/accuracies": 1.0, "rewards/chosen": 3.4152188301086426, "rewards/margins": 5.179142951965332, "rewards/rejected": -1.763924241065979, "step": 5362 }, { "epoch": 3.918173515981735, "grad_norm": 5.495022515484237, "learning_rate": 6.042986524409655e-10, "logits/chosen": -2.832988739013672, "logits/rejected": -1.741516351699829, "logps/chosen": -629.6716918945312, "logps/rejected": -413.1755676269531, "loss": 0.0222, "rewards/accuracies": 1.0, "rewards/chosen": 3.8191280364990234, "rewards/margins": 6.6154704093933105, "rewards/rejected": -2.7963428497314453, "step": 5363 }, { "epoch": 3.9189041095890413, "grad_norm": 4.570252403451039, "learning_rate": 5.932658337970132e-10, "logits/chosen": -2.4867501258850098, "logits/rejected": -2.4385132789611816, "logps/chosen": -480.32196044921875, "logps/rejected": -598.2657470703125, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": 3.031032085418701, "rewards/margins": 5.552348613739014, "rewards/rejected": -2.5213165283203125, "step": 5364 }, { "epoch": 3.9196347031963468, "grad_norm": 6.89623075724896, "learning_rate": 5.823345398771329e-10, "logits/chosen": -2.248018741607666, "logits/rejected": -2.356630325317383, "logps/chosen": -279.4163513183594, "logps/rejected": -404.5098876953125, "loss": 0.0458, "rewards/accuracies": 1.0, "rewards/chosen": 1.2112016677856445, "rewards/margins": 5.4498138427734375, "rewards/rejected": -4.238612651824951, "step": 5365 }, { "epoch": 3.920365296803653, "grad_norm": 2.838495619280796, "learning_rate": 5.715047751310154e-10, "logits/chosen": -2.820404052734375, "logits/rejected": -2.2010111808776855, "logps/chosen": -659.6793212890625, "logps/rejected": -558.3323974609375, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": 4.176329135894775, "rewards/margins": 7.884477138519287, "rewards/rejected": -3.7081480026245117, "step": 5366 }, { "epoch": 3.921095890410959, "grad_norm": 3.5668272540819, "learning_rate": 5.607765439671341e-10, "logits/chosen": -2.1706533432006836, "logits/rejected": -2.087279796600342, "logps/chosen": -335.4061279296875, "logps/rejected": -454.3001708984375, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": 1.2271349430084229, "rewards/margins": 6.9798665046691895, "rewards/rejected": -5.752731800079346, "step": 5367 }, { "epoch": 3.921826484018265, "grad_norm": 7.563214481115389, "learning_rate": 5.501498507525237e-10, "logits/chosen": -3.1388182640075684, "logits/rejected": -2.4119815826416016, "logps/chosen": -651.5216064453125, "logps/rejected": -510.9938049316406, "loss": 0.0403, "rewards/accuracies": 1.0, "rewards/chosen": 2.5465188026428223, "rewards/margins": 5.041275978088379, "rewards/rejected": -2.494757652282715, "step": 5368 }, { "epoch": 3.9225570776255707, "grad_norm": 6.497356170553624, "learning_rate": 5.396246998129738e-10, "logits/chosen": -2.859389305114746, "logits/rejected": -2.695397138595581, "logps/chosen": -677.3308715820312, "logps/rejected": -666.5791625976562, "loss": 0.0245, "rewards/accuracies": 1.0, "rewards/chosen": 3.989210367202759, "rewards/margins": 7.7830986976623535, "rewards/rejected": -3.793889045715332, "step": 5369 }, { "epoch": 3.9232876712328766, "grad_norm": 3.427718098919691, "learning_rate": 5.292010954329184e-10, "logits/chosen": -3.0884618759155273, "logits/rejected": -2.385254144668579, "logps/chosen": -495.7743225097656, "logps/rejected": -435.8443603515625, "loss": 0.0146, "rewards/accuracies": 1.0, "rewards/chosen": 2.7631282806396484, "rewards/margins": 5.406328201293945, "rewards/rejected": -2.6431996822357178, "step": 5370 }, { "epoch": 3.924018264840183, "grad_norm": 5.034199105048633, "learning_rate": 5.188790418553801e-10, "logits/chosen": -2.868452787399292, "logits/rejected": -2.8713345527648926, "logps/chosen": -834.4910888671875, "logps/rejected": -712.129638671875, "loss": 0.0196, "rewards/accuracies": 1.0, "rewards/chosen": 5.397859573364258, "rewards/margins": 5.845069408416748, "rewards/rejected": -0.4472099244594574, "step": 5371 }, { "epoch": 3.9247488584474883, "grad_norm": 6.73750986300226, "learning_rate": 5.086585432821366e-10, "logits/chosen": -2.4918508529663086, "logits/rejected": -2.362231731414795, "logps/chosen": -811.1219482421875, "logps/rejected": -725.069091796875, "loss": 0.0314, "rewards/accuracies": 1.0, "rewards/chosen": 3.7478227615356445, "rewards/margins": 4.67462158203125, "rewards/rejected": -0.9267988204956055, "step": 5372 }, { "epoch": 3.9254794520547946, "grad_norm": 5.191917285207416, "learning_rate": 4.985396038736101e-10, "logits/chosen": -2.4515161514282227, "logits/rejected": -2.109910249710083, "logps/chosen": -764.9625244140625, "logps/rejected": -558.9114379882812, "loss": 0.0281, "rewards/accuracies": 1.0, "rewards/chosen": 3.218491554260254, "rewards/margins": 6.729406356811523, "rewards/rejected": -3.510915517807007, "step": 5373 }, { "epoch": 3.9262100456621005, "grad_norm": 5.606179349830491, "learning_rate": 4.885222277488388e-10, "logits/chosen": -2.3828001022338867, "logits/rejected": -2.5326948165893555, "logps/chosen": -649.70458984375, "logps/rejected": -752.900146484375, "loss": 0.0213, "rewards/accuracies": 1.0, "rewards/chosen": 2.464405059814453, "rewards/margins": 5.66707706451416, "rewards/rejected": -3.202671527862549, "step": 5374 }, { "epoch": 3.9269406392694064, "grad_norm": 2.863167504188928, "learning_rate": 4.786064189855888e-10, "logits/chosen": -2.9328606128692627, "logits/rejected": -2.2146410942077637, "logps/chosen": -603.0797119140625, "logps/rejected": -589.637451171875, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": 4.7488203048706055, "rewards/margins": 6.4272027015686035, "rewards/rejected": -1.678382158279419, "step": 5375 }, { "epoch": 3.9276712328767123, "grad_norm": 7.820851737075096, "learning_rate": 4.687921816201868e-10, "logits/chosen": -3.1054296493530273, "logits/rejected": -1.4273196458816528, "logps/chosen": -575.2369384765625, "logps/rejected": -386.84521484375, "loss": 0.0309, "rewards/accuracies": 1.0, "rewards/chosen": 5.053996562957764, "rewards/margins": 8.769241333007812, "rewards/rejected": -3.7152457237243652, "step": 5376 }, { "epoch": 3.928401826484018, "grad_norm": 4.115711273239167, "learning_rate": 4.590795196476871e-10, "logits/chosen": -2.4635000228881836, "logits/rejected": -2.383026599884033, "logps/chosen": -752.731201171875, "logps/rejected": -557.7794799804688, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": 2.5763986110687256, "rewards/margins": 5.778547286987305, "rewards/rejected": -3.202147960662842, "step": 5377 }, { "epoch": 3.9291324200913245, "grad_norm": 7.099810326420177, "learning_rate": 4.4946843702176053e-10, "logits/chosen": -2.9343292713165283, "logits/rejected": -2.2207982540130615, "logps/chosen": -513.990966796875, "logps/rejected": -465.52203369140625, "loss": 0.0324, "rewards/accuracies": 1.0, "rewards/chosen": 3.561281442642212, "rewards/margins": 6.23052978515625, "rewards/rejected": -2.669248580932617, "step": 5378 }, { "epoch": 3.92986301369863, "grad_norm": 3.1821144833017665, "learning_rate": 4.399589376547497e-10, "logits/chosen": -2.6797218322753906, "logits/rejected": -1.7428529262542725, "logps/chosen": -612.4639282226562, "logps/rejected": -459.235595703125, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": 4.121356010437012, "rewards/margins": 5.827922821044922, "rewards/rejected": -1.7065670490264893, "step": 5379 }, { "epoch": 3.930593607305936, "grad_norm": 4.555684472924535, "learning_rate": 4.305510254176692e-10, "logits/chosen": -3.017953395843506, "logits/rejected": -2.4631688594818115, "logps/chosen": -876.717529296875, "logps/rejected": -693.2710571289062, "loss": 0.0174, "rewards/accuracies": 1.0, "rewards/chosen": 4.425589561462402, "rewards/margins": 5.870938301086426, "rewards/rejected": -1.445348858833313, "step": 5380 }, { "epoch": 3.931324200913242, "grad_norm": 6.698922927597583, "learning_rate": 4.2124470414009463e-10, "logits/chosen": -2.968825340270996, "logits/rejected": -2.1481642723083496, "logps/chosen": -853.6544189453125, "logps/rejected": -601.8321533203125, "loss": 0.0269, "rewards/accuracies": 1.0, "rewards/chosen": 4.236800193786621, "rewards/margins": 5.596499443054199, "rewards/rejected": -1.3596997261047363, "step": 5381 }, { "epoch": 3.932054794520548, "grad_norm": 4.80260371479953, "learning_rate": 4.1203997761032895e-10, "logits/chosen": -3.0771851539611816, "logits/rejected": -1.670474886894226, "logps/chosen": -912.401123046875, "logps/rejected": -482.39056396484375, "loss": 0.0174, "rewards/accuracies": 1.0, "rewards/chosen": 6.289819717407227, "rewards/margins": 8.205854415893555, "rewards/rejected": -1.9160341024398804, "step": 5382 }, { "epoch": 3.932785388127854, "grad_norm": 8.03699664828297, "learning_rate": 4.029368495752916e-10, "logits/chosen": -2.845916748046875, "logits/rejected": -2.2287492752075195, "logps/chosen": -508.34613037109375, "logps/rejected": -416.67742919921875, "loss": 0.0349, "rewards/accuracies": 1.0, "rewards/chosen": 2.49528169631958, "rewards/margins": 5.3827056884765625, "rewards/rejected": -2.8874237537384033, "step": 5383 }, { "epoch": 3.9335159817351597, "grad_norm": 7.021023650565966, "learning_rate": 3.9393532374054627e-10, "logits/chosen": -2.810053586959839, "logits/rejected": -2.3737683296203613, "logps/chosen": -405.0205078125, "logps/rejected": -544.66796875, "loss": 0.0278, "rewards/accuracies": 1.0, "rewards/chosen": 2.6894962787628174, "rewards/margins": 6.206811904907227, "rewards/rejected": -3.51731538772583, "step": 5384 }, { "epoch": 3.934246575342466, "grad_norm": 18.804575521581842, "learning_rate": 3.8503540377030074e-10, "logits/chosen": -2.8361635208129883, "logits/rejected": -2.3021905422210693, "logps/chosen": -367.0701599121094, "logps/rejected": -317.63970947265625, "loss": 0.0459, "rewards/accuracies": 1.0, "rewards/chosen": 3.7616207599639893, "rewards/margins": 7.254035472869873, "rewards/rejected": -3.492414712905884, "step": 5385 }, { "epoch": 3.9349771689497715, "grad_norm": 7.1817909455304205, "learning_rate": 3.7623709328740704e-10, "logits/chosen": -2.712346076965332, "logits/rejected": -2.012589454650879, "logps/chosen": -637.2399291992188, "logps/rejected": -358.5943603515625, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": 3.2934298515319824, "rewards/margins": 4.831630229949951, "rewards/rejected": -1.5382004976272583, "step": 5386 }, { "epoch": 3.9357077625570778, "grad_norm": 5.230220774554927, "learning_rate": 3.6754039587333363e-10, "logits/chosen": -2.404841184616089, "logits/rejected": -1.4957152605056763, "logps/chosen": -1027.288818359375, "logps/rejected": -503.1990051269531, "loss": 0.0253, "rewards/accuracies": 1.0, "rewards/chosen": 5.097640514373779, "rewards/margins": 8.438567161560059, "rewards/rejected": -3.3409268856048584, "step": 5387 }, { "epoch": 3.9364383561643836, "grad_norm": 8.62157737160154, "learning_rate": 3.589453150682209e-10, "logits/chosen": -2.9071457386016846, "logits/rejected": -2.1363892555236816, "logps/chosen": -847.4134521484375, "logps/rejected": -588.9088134765625, "loss": 0.0306, "rewards/accuracies": 1.0, "rewards/chosen": 4.483107089996338, "rewards/margins": 5.148258686065674, "rewards/rejected": -0.6651512384414673, "step": 5388 }, { "epoch": 3.9371689497716895, "grad_norm": 4.70463238762711, "learning_rate": 3.504518543707702e-10, "logits/chosen": -2.8664603233337402, "logits/rejected": -1.9904851913452148, "logps/chosen": -357.16845703125, "logps/rejected": -273.7854309082031, "loss": 0.015, "rewards/accuracies": 1.0, "rewards/chosen": 2.606907844543457, "rewards/margins": 5.920811653137207, "rewards/rejected": -3.31390380859375, "step": 5389 }, { "epoch": 3.9378995433789954, "grad_norm": 7.294337744350895, "learning_rate": 3.4206001723843803e-10, "logits/chosen": -2.821152448654175, "logits/rejected": -1.7244781255722046, "logps/chosen": -532.660888671875, "logps/rejected": -563.0922241210938, "loss": 0.0264, "rewards/accuracies": 1.0, "rewards/chosen": 2.4310972690582275, "rewards/margins": 6.115140914916992, "rewards/rejected": -3.6840436458587646, "step": 5390 }, { "epoch": 3.9386301369863013, "grad_norm": 7.315046861332885, "learning_rate": 3.337698070872141e-10, "logits/chosen": -2.5296759605407715, "logits/rejected": -1.6941947937011719, "logps/chosen": -541.8681640625, "logps/rejected": -338.24365234375, "loss": 0.0429, "rewards/accuracies": 1.0, "rewards/chosen": 2.0922889709472656, "rewards/margins": 4.934842109680176, "rewards/rejected": -2.84255313873291, "step": 5391 }, { "epoch": 3.939360730593607, "grad_norm": 4.1983576969672995, "learning_rate": 3.2558122729178795e-10, "logits/chosen": -2.7594826221466064, "logits/rejected": -2.49452543258667, "logps/chosen": -726.0076293945312, "logps/rejected": -637.7311401367188, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": 3.0731289386749268, "rewards/margins": 5.29767370223999, "rewards/rejected": -2.2245447635650635, "step": 5392 }, { "epoch": 3.940091324200913, "grad_norm": 3.9681578667523736, "learning_rate": 3.1749428118535426e-10, "logits/chosen": -2.763509750366211, "logits/rejected": -2.039738655090332, "logps/chosen": -760.3202514648438, "logps/rejected": -526.6734619140625, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": 4.031948089599609, "rewards/margins": 6.236634254455566, "rewards/rejected": -2.2046866416931152, "step": 5393 }, { "epoch": 3.9408219178082193, "grad_norm": 7.65201448332896, "learning_rate": 3.0950897205991864e-10, "logits/chosen": -2.504145860671997, "logits/rejected": -2.2617993354797363, "logps/chosen": -436.5337829589844, "logps/rejected": -461.5801696777344, "loss": 0.0261, "rewards/accuracies": 1.0, "rewards/chosen": 1.6110997200012207, "rewards/margins": 4.935736656188965, "rewards/rejected": -3.324636697769165, "step": 5394 }, { "epoch": 3.941552511415525, "grad_norm": 6.088931012220621, "learning_rate": 3.016253031659921e-10, "logits/chosen": -2.831549644470215, "logits/rejected": -2.2884905338287354, "logps/chosen": -977.6428833007812, "logps/rejected": -753.4409790039062, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": 4.4867377281188965, "rewards/margins": 4.510962009429932, "rewards/rejected": -0.024224117398262024, "step": 5395 }, { "epoch": 3.942283105022831, "grad_norm": 7.358602106608219, "learning_rate": 2.93843277712702e-10, "logits/chosen": -3.1393117904663086, "logits/rejected": -2.5691208839416504, "logps/chosen": -493.718505859375, "logps/rejected": -393.56951904296875, "loss": 0.0413, "rewards/accuracies": 1.0, "rewards/chosen": 3.0521106719970703, "rewards/margins": 5.800114631652832, "rewards/rejected": -2.7480039596557617, "step": 5396 }, { "epoch": 3.943013698630137, "grad_norm": 5.519903409361584, "learning_rate": 2.8616289886790326e-10, "logits/chosen": -2.65708589553833, "logits/rejected": -2.3137354850769043, "logps/chosen": -710.7166748046875, "logps/rejected": -759.2265014648438, "loss": 0.0188, "rewards/accuracies": 1.0, "rewards/chosen": 3.3901009559631348, "rewards/margins": 7.510647773742676, "rewards/rejected": -4.120546817779541, "step": 5397 }, { "epoch": 3.943744292237443, "grad_norm": 14.288091243815654, "learning_rate": 2.7858416975792833e-10, "logits/chosen": -2.7197985649108887, "logits/rejected": -2.365601062774658, "logps/chosen": -667.3524169921875, "logps/rejected": -636.993896484375, "loss": 0.0373, "rewards/accuracies": 1.0, "rewards/chosen": 3.4466958045959473, "rewards/margins": 4.74374532699585, "rewards/rejected": -1.2970497608184814, "step": 5398 }, { "epoch": 3.9444748858447487, "grad_norm": 5.196839347712475, "learning_rate": 2.7110709346789275e-10, "logits/chosen": -2.500073194503784, "logits/rejected": -1.9334348440170288, "logps/chosen": -601.3750610351562, "logps/rejected": -468.3092041015625, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": 4.285622596740723, "rewards/margins": 8.05140209197998, "rewards/rejected": -3.765779972076416, "step": 5399 }, { "epoch": 3.9452054794520546, "grad_norm": 3.711026417520253, "learning_rate": 2.6373167304138964e-10, "logits/chosen": -2.5875821113586426, "logits/rejected": -1.967263102531433, "logps/chosen": -409.1018371582031, "logps/rejected": -373.6333312988281, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": 2.300797939300537, "rewards/margins": 6.529389381408691, "rewards/rejected": -4.2285919189453125, "step": 5400 }, { "epoch": 3.945936073059361, "grad_norm": 10.688707428482605, "learning_rate": 2.564579114807397e-10, "logits/chosen": -2.6709787845611572, "logits/rejected": -2.3490757942199707, "logps/chosen": -615.5294799804688, "logps/rejected": -683.5772094726562, "loss": 0.0388, "rewards/accuracies": 1.0, "rewards/chosen": 5.811744689941406, "rewards/margins": 6.471778869628906, "rewards/rejected": -0.6600348353385925, "step": 5401 }, { "epoch": 3.9466666666666668, "grad_norm": 6.827269895594792, "learning_rate": 2.492858117467966e-10, "logits/chosen": -2.3730170726776123, "logits/rejected": -2.3314602375030518, "logps/chosen": -353.2565612792969, "logps/rejected": -575.7201538085938, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": 3.890800952911377, "rewards/margins": 9.904580116271973, "rewards/rejected": -6.013780117034912, "step": 5402 }, { "epoch": 3.9473972602739726, "grad_norm": 3.107143655786271, "learning_rate": 2.4221537675911397e-10, "logits/chosen": -2.714261293411255, "logits/rejected": -2.626521587371826, "logps/chosen": -723.9854736328125, "logps/rejected": -729.7376708984375, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": 4.119337558746338, "rewards/margins": 5.635930061340332, "rewards/rejected": -1.516593337059021, "step": 5403 }, { "epoch": 3.9481278538812785, "grad_norm": 2.857278023656041, "learning_rate": 2.3524660939577836e-10, "logits/chosen": -2.6835696697235107, "logits/rejected": -2.1387314796447754, "logps/chosen": -415.1343688964844, "logps/rejected": -419.86859130859375, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": 2.8994007110595703, "rewards/margins": 6.743505001068115, "rewards/rejected": -3.844104290008545, "step": 5404 }, { "epoch": 3.9488584474885844, "grad_norm": 20.334898257876855, "learning_rate": 2.283795124935206e-10, "logits/chosen": -2.9308018684387207, "logits/rejected": -2.415428400039673, "logps/chosen": -615.3639526367188, "logps/rejected": -560.1380004882812, "loss": 0.0904, "rewards/accuracies": 1.0, "rewards/chosen": 2.981654405593872, "rewards/margins": 5.018695831298828, "rewards/rejected": -2.037041664123535, "step": 5405 }, { "epoch": 3.9495890410958903, "grad_norm": 7.795457240917347, "learning_rate": 2.2161408884774358e-10, "logits/chosen": -2.7252461910247803, "logits/rejected": -1.9316167831420898, "logps/chosen": -686.2274169921875, "logps/rejected": -553.1387939453125, "loss": 0.0315, "rewards/accuracies": 1.0, "rewards/chosen": 4.2276177406311035, "rewards/margins": 6.452551364898682, "rewards/rejected": -2.2249338626861572, "step": 5406 }, { "epoch": 3.950319634703196, "grad_norm": 12.850134680372456, "learning_rate": 2.149503412123832e-10, "logits/chosen": -2.9460902214050293, "logits/rejected": -1.9010357856750488, "logps/chosen": -468.6228332519531, "logps/rejected": -367.9486083984375, "loss": 0.0532, "rewards/accuracies": 1.0, "rewards/chosen": 4.313974380493164, "rewards/margins": 8.324539184570312, "rewards/rejected": -4.010564804077148, "step": 5407 }, { "epoch": 3.9510502283105025, "grad_norm": 5.221619142830921, "learning_rate": 2.0838827230001965e-10, "logits/chosen": -2.6012892723083496, "logits/rejected": -2.261457920074463, "logps/chosen": -639.6734619140625, "logps/rejected": -634.572265625, "loss": 0.0201, "rewards/accuracies": 1.0, "rewards/chosen": 4.154118537902832, "rewards/margins": 7.525142669677734, "rewards/rejected": -3.3710250854492188, "step": 5408 }, { "epoch": 3.9517808219178083, "grad_norm": 3.992444054409265, "learning_rate": 2.0192788478184953e-10, "logits/chosen": -3.1344223022460938, "logits/rejected": -1.8553845882415771, "logps/chosen": -953.319580078125, "logps/rejected": -564.5642700195312, "loss": 0.0383, "rewards/accuracies": 1.0, "rewards/chosen": 3.3436899185180664, "rewards/margins": 6.92979097366333, "rewards/rejected": -3.5861012935638428, "step": 5409 }, { "epoch": 3.952511415525114, "grad_norm": 10.366901224146536, "learning_rate": 1.955691812876581e-10, "logits/chosen": -2.6355197429656982, "logits/rejected": -1.760981798171997, "logps/chosen": -379.8152770996094, "logps/rejected": -280.8251953125, "loss": 0.0357, "rewards/accuracies": 1.0, "rewards/chosen": 2.1616063117980957, "rewards/margins": 5.946251392364502, "rewards/rejected": -3.7846450805664062, "step": 5410 }, { "epoch": 3.95324200913242, "grad_norm": 6.872090781841228, "learning_rate": 1.8931216440587483e-10, "logits/chosen": -2.646695137023926, "logits/rejected": -2.4906468391418457, "logps/chosen": -468.31805419921875, "logps/rejected": -610.7024536132812, "loss": 0.0304, "rewards/accuracies": 1.0, "rewards/chosen": 1.635939359664917, "rewards/margins": 9.64058780670166, "rewards/rejected": -8.00464916229248, "step": 5411 }, { "epoch": 3.953972602739726, "grad_norm": 5.52874198994066, "learning_rate": 1.8315683668346238e-10, "logits/chosen": -2.3959169387817383, "logits/rejected": -1.8927276134490967, "logps/chosen": -223.28778076171875, "logps/rejected": -369.3941955566406, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": 2.84480619430542, "rewards/margins": 9.154504776000977, "rewards/rejected": -6.30969762802124, "step": 5412 }, { "epoch": 3.954703196347032, "grad_norm": 4.224831725612419, "learning_rate": 1.7710320062608308e-10, "logits/chosen": -2.680114507675171, "logits/rejected": -2.126474142074585, "logps/chosen": -631.8067626953125, "logps/rejected": -464.6422424316406, "loss": 0.0266, "rewards/accuracies": 1.0, "rewards/chosen": 5.322053909301758, "rewards/margins": 7.367826461791992, "rewards/rejected": -2.0457727909088135, "step": 5413 }, { "epoch": 3.9554337899543377, "grad_norm": 4.063225784025871, "learning_rate": 1.711512586979602e-10, "logits/chosen": -2.1414732933044434, "logits/rejected": -2.1639904975891113, "logps/chosen": -577.976318359375, "logps/rejected": -734.0145263671875, "loss": 0.0153, "rewards/accuracies": 1.0, "rewards/chosen": 2.442276954650879, "rewards/margins": 5.635517120361328, "rewards/rejected": -3.193239688873291, "step": 5414 }, { "epoch": 3.956164383561644, "grad_norm": 8.296668558372117, "learning_rate": 1.6530101332187795e-10, "logits/chosen": -2.8148224353790283, "logits/rejected": -1.4690260887145996, "logps/chosen": -829.7319946289062, "logps/rejected": -431.9751892089844, "loss": 0.0501, "rewards/accuracies": 1.0, "rewards/chosen": 4.315634727478027, "rewards/margins": 7.129340171813965, "rewards/rejected": -2.8137059211730957, "step": 5415 }, { "epoch": 3.95689497716895, "grad_norm": 5.002981632044164, "learning_rate": 1.5955246687929247e-10, "logits/chosen": -2.553903579711914, "logits/rejected": -2.3756673336029053, "logps/chosen": -587.2225341796875, "logps/rejected": -579.7732543945312, "loss": 0.0183, "rewards/accuracies": 1.0, "rewards/chosen": 4.388964653015137, "rewards/margins": 7.367908477783203, "rewards/rejected": -2.9789440631866455, "step": 5416 }, { "epoch": 3.9576255707762558, "grad_norm": 5.863774064794702, "learning_rate": 1.5390562171024857e-10, "logits/chosen": -3.2779369354248047, "logits/rejected": -3.098659038543701, "logps/chosen": -524.8482055664062, "logps/rejected": -581.0903930664062, "loss": 0.0333, "rewards/accuracies": 1.0, "rewards/chosen": 3.4948348999023438, "rewards/margins": 5.9645586013793945, "rewards/rejected": -2.469723701477051, "step": 5417 }, { "epoch": 3.9583561643835616, "grad_norm": 8.6799195513535, "learning_rate": 1.4836048011337977e-10, "logits/chosen": -3.3204245567321777, "logits/rejected": -2.6461551189422607, "logps/chosen": -660.6962890625, "logps/rejected": -564.5708618164062, "loss": 0.0398, "rewards/accuracies": 1.0, "rewards/chosen": 3.5319619178771973, "rewards/margins": 4.695102691650391, "rewards/rejected": -1.1631412506103516, "step": 5418 }, { "epoch": 3.9590867579908675, "grad_norm": 9.178322704395157, "learning_rate": 1.429170443458805e-10, "logits/chosen": -2.643707275390625, "logits/rejected": -2.2196462154388428, "logps/chosen": -642.763671875, "logps/rejected": -382.8275146484375, "loss": 0.0354, "rewards/accuracies": 1.0, "rewards/chosen": 2.5328545570373535, "rewards/margins": 4.635429859161377, "rewards/rejected": -2.1025750637054443, "step": 5419 }, { "epoch": 3.9598173515981734, "grad_norm": 4.9608250402484515, "learning_rate": 1.375753166236171e-10, "logits/chosen": -2.597749710083008, "logits/rejected": -1.8076961040496826, "logps/chosen": -689.263671875, "logps/rejected": -501.21441650390625, "loss": 0.0222, "rewards/accuracies": 1.0, "rewards/chosen": 3.645914316177368, "rewards/margins": 6.7348480224609375, "rewards/rejected": -3.0889339447021484, "step": 5420 }, { "epoch": 3.9605479452054793, "grad_norm": 8.518161444927859, "learning_rate": 1.3233529912101692e-10, "logits/chosen": -2.6843180656433105, "logits/rejected": -1.9541839361190796, "logps/chosen": -403.4430236816406, "logps/rejected": -350.38458251953125, "loss": 0.0543, "rewards/accuracies": 1.0, "rewards/chosen": 2.1213183403015137, "rewards/margins": 5.683503150939941, "rewards/rejected": -3.5621848106384277, "step": 5421 }, { "epoch": 3.9612785388127856, "grad_norm": 3.9866484495106103, "learning_rate": 1.2719699397109595e-10, "logits/chosen": -3.0141265392303467, "logits/rejected": -1.9958158731460571, "logps/chosen": -491.6615905761719, "logps/rejected": -381.2454528808594, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": 1.8754757642745972, "rewards/margins": 6.079928398132324, "rewards/rejected": -4.2044525146484375, "step": 5422 }, { "epoch": 3.9620091324200915, "grad_norm": 12.31891043068415, "learning_rate": 1.2216040326545886e-10, "logits/chosen": -3.158668041229248, "logits/rejected": -2.84955096244812, "logps/chosen": -770.3092041015625, "logps/rejected": -730.255615234375, "loss": 0.0517, "rewards/accuracies": 1.0, "rewards/chosen": 2.133335590362549, "rewards/margins": 4.421932220458984, "rewards/rejected": -2.2885963916778564, "step": 5423 }, { "epoch": 3.9627397260273973, "grad_norm": 4.4713890979300706, "learning_rate": 1.172255290543822e-10, "logits/chosen": -2.9423134326934814, "logits/rejected": -2.4002480506896973, "logps/chosen": -763.4177856445312, "logps/rejected": -473.92327880859375, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": 3.955143451690674, "rewards/margins": 5.363472938537598, "rewards/rejected": -1.408328890800476, "step": 5424 }, { "epoch": 3.963470319634703, "grad_norm": 5.235978066091736, "learning_rate": 1.1239237334662033e-10, "logits/chosen": -2.2737224102020264, "logits/rejected": -1.5309351682662964, "logps/chosen": -423.2452392578125, "logps/rejected": -326.49951171875, "loss": 0.0209, "rewards/accuracies": 1.0, "rewards/chosen": 3.258169174194336, "rewards/margins": 8.767221450805664, "rewards/rejected": -5.50905179977417, "step": 5425 }, { "epoch": 3.964200913242009, "grad_norm": 7.865059042852672, "learning_rate": 1.0766093810959942e-10, "logits/chosen": -2.9979443550109863, "logits/rejected": -2.4312524795532227, "logps/chosen": -750.0955810546875, "logps/rejected": -545.8612060546875, "loss": 0.0343, "rewards/accuracies": 1.0, "rewards/chosen": 3.9351253509521484, "rewards/margins": 5.107686996459961, "rewards/rejected": -1.1725623607635498, "step": 5426 }, { "epoch": 3.964931506849315, "grad_norm": 4.227984684478824, "learning_rate": 1.0303122526933438e-10, "logits/chosen": -2.8783833980560303, "logits/rejected": -2.5649831295013428, "logps/chosen": -656.8167114257812, "logps/rejected": -657.2739868164062, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": 3.038317918777466, "rewards/margins": 5.509960651397705, "rewards/rejected": -2.471642255783081, "step": 5427 }, { "epoch": 3.965662100456621, "grad_norm": 7.042748302325048, "learning_rate": 9.850323671042882e-11, "logits/chosen": -3.2677621841430664, "logits/rejected": -2.5047402381896973, "logps/chosen": -400.342041015625, "logps/rejected": -670.3975219726562, "loss": 0.0276, "rewards/accuracies": 1.0, "rewards/chosen": 1.728749394416809, "rewards/margins": 4.233342170715332, "rewards/rejected": -2.5045924186706543, "step": 5428 }, { "epoch": 3.966392694063927, "grad_norm": 6.545505280249898, "learning_rate": 9.407697427601947e-11, "logits/chosen": -2.604079008102417, "logits/rejected": -2.438361644744873, "logps/chosen": -780.58935546875, "logps/rejected": -744.8601684570312, "loss": 0.0225, "rewards/accuracies": 1.0, "rewards/chosen": 5.721445083618164, "rewards/margins": 6.779885292053223, "rewards/rejected": -1.05843985080719, "step": 5429 }, { "epoch": 3.967123287671233, "grad_norm": 4.5705363944223505, "learning_rate": 8.975243976794278e-11, "logits/chosen": -2.9214236736297607, "logits/rejected": -2.308743715286255, "logps/chosen": -806.09765625, "logps/rejected": -679.715087890625, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": 4.1261162757873535, "rewards/margins": 6.472740173339844, "rewards/rejected": -2.3466238975524902, "step": 5430 }, { "epoch": 3.967853881278539, "grad_norm": 5.634124814623635, "learning_rate": 8.552963494651289e-11, "logits/chosen": -2.8330352306365967, "logits/rejected": -2.500143051147461, "logps/chosen": -855.311279296875, "logps/rejected": -768.1032104492188, "loss": 0.0289, "rewards/accuracies": 1.0, "rewards/chosen": 4.892239570617676, "rewards/margins": 6.135550022125244, "rewards/rejected": -1.2433103322982788, "step": 5431 }, { "epoch": 3.9685844748858448, "grad_norm": 7.555197224191804, "learning_rate": 8.140856153071585e-11, "logits/chosen": -2.9183759689331055, "logits/rejected": -2.385927438735962, "logps/chosen": -686.978515625, "logps/rejected": -776.0762329101562, "loss": 0.0309, "rewards/accuracies": 1.0, "rewards/chosen": 4.316518306732178, "rewards/margins": 5.296457767486572, "rewards/rejected": -0.9799398183822632, "step": 5432 }, { "epoch": 3.9693150684931506, "grad_norm": 13.358723942468972, "learning_rate": 7.738922119809865e-11, "logits/chosen": -2.9558463096618652, "logits/rejected": -2.7875266075134277, "logps/chosen": -756.4384155273438, "logps/rejected": -738.4744873046875, "loss": 0.0691, "rewards/accuracies": 1.0, "rewards/chosen": 3.7545197010040283, "rewards/margins": 5.377548694610596, "rewards/rejected": -1.623029112815857, "step": 5433 }, { "epoch": 3.9700456621004565, "grad_norm": 5.151681806891136, "learning_rate": 7.347161558476922e-11, "logits/chosen": -2.8653781414031982, "logits/rejected": -2.47015118598938, "logps/chosen": -329.6590270996094, "logps/rejected": -362.34759521484375, "loss": 0.0249, "rewards/accuracies": 1.0, "rewards/chosen": 2.328312635421753, "rewards/margins": 4.316023349761963, "rewards/rejected": -1.9877103567123413, "step": 5434 }, { "epoch": 3.9707762557077624, "grad_norm": 3.7020204079682197, "learning_rate": 6.965574628547966e-11, "logits/chosen": -2.5043742656707764, "logits/rejected": -2.4254794120788574, "logps/chosen": -778.13916015625, "logps/rejected": -937.1304931640625, "loss": 0.0366, "rewards/accuracies": 1.0, "rewards/chosen": 5.450675010681152, "rewards/margins": 6.080358505249023, "rewards/rejected": -0.6296836137771606, "step": 5435 }, { "epoch": 3.9715068493150687, "grad_norm": 13.734106692665083, "learning_rate": 6.594161485348748e-11, "logits/chosen": -2.928415536880493, "logits/rejected": -2.455698013305664, "logps/chosen": -606.9005737304688, "logps/rejected": -565.4832153320312, "loss": 0.0592, "rewards/accuracies": 1.0, "rewards/chosen": 3.534665584564209, "rewards/margins": 5.367812633514404, "rewards/rejected": -1.8331470489501953, "step": 5436 }, { "epoch": 3.9722374429223746, "grad_norm": 5.5262102346449975, "learning_rate": 6.232922280072216e-11, "logits/chosen": -3.028827667236328, "logits/rejected": -1.8772231340408325, "logps/chosen": -833.8421630859375, "logps/rejected": -483.89471435546875, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/chosen": 4.1994452476501465, "rewards/margins": 4.63809061050415, "rewards/rejected": -0.43864527344703674, "step": 5437 }, { "epoch": 3.9729680365296804, "grad_norm": 8.70548762744184, "learning_rate": 5.881857159767411e-11, "logits/chosen": -2.9879531860351562, "logits/rejected": -2.6191928386688232, "logps/chosen": -620.3667602539062, "logps/rejected": -561.0512084960938, "loss": 0.0349, "rewards/accuracies": 1.0, "rewards/chosen": 5.901645183563232, "rewards/margins": 7.604538917541504, "rewards/rejected": -1.7028937339782715, "step": 5438 }, { "epoch": 3.9736986301369863, "grad_norm": 5.056358473926241, "learning_rate": 5.5409662673366886e-11, "logits/chosen": -3.1671276092529297, "logits/rejected": -1.8554134368896484, "logps/chosen": -478.93603515625, "logps/rejected": -307.9844055175781, "loss": 0.0176, "rewards/accuracies": 1.0, "rewards/chosen": 1.4544637203216553, "rewards/margins": 4.772249698638916, "rewards/rejected": -3.31778621673584, "step": 5439 }, { "epoch": 3.974429223744292, "grad_norm": 5.156744546004208, "learning_rate": 5.210249741546824e-11, "logits/chosen": -2.574601173400879, "logits/rejected": -2.070136308670044, "logps/chosen": -585.9793090820312, "logps/rejected": -577.0608520507812, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": 2.9231655597686768, "rewards/margins": 7.745108127593994, "rewards/rejected": -4.8219428062438965, "step": 5440 }, { "epoch": 3.975159817351598, "grad_norm": 6.835481848653501, "learning_rate": 4.889707717023461e-11, "logits/chosen": -2.4592185020446777, "logits/rejected": -1.6463947296142578, "logps/chosen": -464.2015380859375, "logps/rejected": -467.48663330078125, "loss": 0.0247, "rewards/accuracies": 1.0, "rewards/chosen": 4.345851421356201, "rewards/margins": 7.205883502960205, "rewards/rejected": -2.860032558441162, "step": 5441 }, { "epoch": 3.975890410958904, "grad_norm": 6.547022123663868, "learning_rate": 4.579340324242786e-11, "logits/chosen": -2.249692440032959, "logits/rejected": -2.504042148590088, "logps/chosen": -593.1033935546875, "logps/rejected": -813.21142578125, "loss": 0.0324, "rewards/accuracies": 1.0, "rewards/chosen": 2.1429882049560547, "rewards/margins": 4.1061506271362305, "rewards/rejected": -1.9631623029708862, "step": 5442 }, { "epoch": 3.9766210045662103, "grad_norm": 6.211316136748698, "learning_rate": 4.2791476895481795e-11, "logits/chosen": -2.7202084064483643, "logits/rejected": -1.904079556465149, "logps/chosen": -590.1998291015625, "logps/rejected": -523.5303344726562, "loss": 0.0197, "rewards/accuracies": 1.0, "rewards/chosen": 3.077418327331543, "rewards/margins": 5.555369853973389, "rewards/rejected": -2.4779515266418457, "step": 5443 }, { "epoch": 3.977351598173516, "grad_norm": 6.910297571336133, "learning_rate": 3.9891299351363374e-11, "logits/chosen": -2.5632236003875732, "logits/rejected": -1.8117921352386475, "logps/chosen": -556.7494506835938, "logps/rejected": -379.2086486816406, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": 2.949199676513672, "rewards/margins": 6.429286956787109, "rewards/rejected": -3.4800870418548584, "step": 5444 }, { "epoch": 3.978082191780822, "grad_norm": 5.356326645933708, "learning_rate": 3.709287179062825e-11, "logits/chosen": -2.5104403495788574, "logits/rejected": -2.184948444366455, "logps/chosen": -554.52490234375, "logps/rejected": -455.95361328125, "loss": 0.0241, "rewards/accuracies": 1.0, "rewards/chosen": 2.7989742755889893, "rewards/margins": 6.529273986816406, "rewards/rejected": -3.730299949645996, "step": 5445 }, { "epoch": 3.978812785388128, "grad_norm": 5.320069070812229, "learning_rate": 3.439619535242078e-11, "logits/chosen": -2.5269527435302734, "logits/rejected": -2.537626028060913, "logps/chosen": -671.744140625, "logps/rejected": -883.6298217773438, "loss": 0.0187, "rewards/accuracies": 1.0, "rewards/chosen": 3.3518118858337402, "rewards/margins": 7.993755340576172, "rewards/rejected": -4.641943454742432, "step": 5446 }, { "epoch": 3.9795433789954338, "grad_norm": 15.41253249264747, "learning_rate": 3.180127113447395e-11, "logits/chosen": -3.1468753814697266, "logits/rejected": -2.7712955474853516, "logps/chosen": -828.900390625, "logps/rejected": -634.859619140625, "loss": 0.0292, "rewards/accuracies": 1.0, "rewards/chosen": 4.909877300262451, "rewards/margins": 5.962212562561035, "rewards/rejected": -1.0523350238800049, "step": 5447 }, { "epoch": 3.9802739726027396, "grad_norm": 4.089486500932433, "learning_rate": 2.9308100193053966e-11, "logits/chosen": -3.179882049560547, "logits/rejected": -2.0888872146606445, "logps/chosen": -678.8282470703125, "logps/rejected": -464.1260986328125, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": 3.1452043056488037, "rewards/margins": 5.619173049926758, "rewards/rejected": -2.473968505859375, "step": 5448 }, { "epoch": 3.9810045662100455, "grad_norm": 3.785978906950554, "learning_rate": 2.691668354309895e-11, "logits/chosen": -3.318032741546631, "logits/rejected": -3.0870273113250732, "logps/chosen": -966.6524658203125, "logps/rejected": -762.257080078125, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": 3.7007064819335938, "rewards/margins": 4.690695762634277, "rewards/rejected": -0.9899894595146179, "step": 5449 }, { "epoch": 3.981735159817352, "grad_norm": 5.276739417655181, "learning_rate": 2.4627022158052456e-11, "logits/chosen": -2.4836955070495605, "logits/rejected": -2.8588600158691406, "logps/chosen": -621.6263427734375, "logps/rejected": -633.9710693359375, "loss": 0.0222, "rewards/accuracies": 1.0, "rewards/chosen": 2.7574100494384766, "rewards/margins": 4.9218430519104, "rewards/rejected": -2.1644325256347656, "step": 5450 }, { "epoch": 3.9824657534246577, "grad_norm": 3.1357335197735234, "learning_rate": 2.243911696991896e-11, "logits/chosen": -2.99196195602417, "logits/rejected": -2.058746814727783, "logps/chosen": -724.8944091796875, "logps/rejected": -546.4383544921875, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": 4.288005352020264, "rewards/margins": 8.142865180969238, "rewards/rejected": -3.854860305786133, "step": 5451 }, { "epoch": 3.9831963470319636, "grad_norm": 3.05037343600092, "learning_rate": 2.0352968869374876e-11, "logits/chosen": -3.1932601928710938, "logits/rejected": -1.6239668130874634, "logps/chosen": -671.5821533203125, "logps/rejected": -321.16131591796875, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": 3.621413230895996, "rewards/margins": 6.769004821777344, "rewards/rejected": -3.1475911140441895, "step": 5452 }, { "epoch": 3.9839269406392694, "grad_norm": 4.391445096897205, "learning_rate": 1.836857870557429e-11, "logits/chosen": -2.827066421508789, "logits/rejected": -2.135916233062744, "logps/chosen": -913.7877197265625, "logps/rejected": -577.3447265625, "loss": 0.0218, "rewards/accuracies": 1.0, "rewards/chosen": 4.8926496505737305, "rewards/margins": 6.320702075958252, "rewards/rejected": -1.4280521869659424, "step": 5453 }, { "epoch": 3.9846575342465753, "grad_norm": 4.777419407043503, "learning_rate": 1.6485947286287716e-11, "logits/chosen": -2.598641872406006, "logits/rejected": -2.2537360191345215, "logps/chosen": -510.55718994140625, "logps/rejected": -505.71002197265625, "loss": 0.0182, "rewards/accuracies": 1.0, "rewards/chosen": 3.646766185760498, "rewards/margins": 7.6080002784729, "rewards/rejected": -3.961233615875244, "step": 5454 }, { "epoch": 3.985388127853881, "grad_norm": 20.63398167952981, "learning_rate": 1.470507537790211e-11, "logits/chosen": -2.7999722957611084, "logits/rejected": -1.9563541412353516, "logps/chosen": -581.9244995117188, "logps/rejected": -459.76385498046875, "loss": 0.0411, "rewards/accuracies": 1.0, "rewards/chosen": 2.704115629196167, "rewards/margins": 5.894595146179199, "rewards/rejected": -3.190479278564453, "step": 5455 }, { "epoch": 3.986118721461187, "grad_norm": 3.6349004836279777, "learning_rate": 1.3025963705337595e-11, "logits/chosen": -2.601423740386963, "logits/rejected": -2.0863630771636963, "logps/chosen": -798.2518310546875, "logps/rejected": -618.6640625, "loss": 0.0217, "rewards/accuracies": 1.0, "rewards/chosen": 3.90187406539917, "rewards/margins": 5.951649188995361, "rewards/rejected": -2.0497751235961914, "step": 5456 }, { "epoch": 3.9868493150684934, "grad_norm": 8.630740259630064, "learning_rate": 1.144861295207522e-11, "logits/chosen": -2.8667469024658203, "logits/rejected": -2.26111102104187, "logps/chosen": -756.4758911132812, "logps/rejected": -652.3935546875, "loss": 0.0444, "rewards/accuracies": 1.0, "rewards/chosen": 5.858659744262695, "rewards/margins": 7.691890716552734, "rewards/rejected": -1.8332308530807495, "step": 5457 }, { "epoch": 3.987579908675799, "grad_norm": 3.404037176787608, "learning_rate": 9.973023760240229e-12, "logits/chosen": -2.8165524005889893, "logits/rejected": -2.45028018951416, "logps/chosen": -756.4385986328125, "logps/rejected": -718.6859130859375, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": 4.311727523803711, "rewards/margins": 5.399837493896484, "rewards/rejected": -1.0881097316741943, "step": 5458 }, { "epoch": 3.988310502283105, "grad_norm": 4.575207506487218, "learning_rate": 8.599196730463276e-12, "logits/chosen": -3.4421544075012207, "logits/rejected": -1.8176548480987549, "logps/chosen": -841.6519775390625, "logps/rejected": -399.6744384765625, "loss": 0.0187, "rewards/accuracies": 1.0, "rewards/chosen": 5.424970626831055, "rewards/margins": 7.202879428863525, "rewards/rejected": -1.7779086828231812, "step": 5459 }, { "epoch": 3.989041095890411, "grad_norm": 4.086922113007304, "learning_rate": 7.327132422019211e-12, "logits/chosen": -3.0472397804260254, "logits/rejected": -1.9590022563934326, "logps/chosen": -932.37451171875, "logps/rejected": -500.343017578125, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": 4.373081207275391, "rewards/margins": 6.711955547332764, "rewards/rejected": -2.338874340057373, "step": 5460 }, { "epoch": 3.989771689497717, "grad_norm": 6.2990518976724035, "learning_rate": 6.156831352660541e-12, "logits/chosen": -2.552326202392578, "logits/rejected": -2.0854921340942383, "logps/chosen": -376.394775390625, "logps/rejected": -469.2660827636719, "loss": 0.0242, "rewards/accuracies": 1.0, "rewards/chosen": 2.70027494430542, "rewards/margins": 8.990965843200684, "rewards/rejected": -6.2906904220581055, "step": 5461 }, { "epoch": 3.9905022831050228, "grad_norm": 6.9157958191030655, "learning_rate": 5.088293998811721e-12, "logits/chosen": -2.757695198059082, "logits/rejected": -2.5947980880737305, "logps/chosen": -603.3355712890625, "logps/rejected": -614.5173950195312, "loss": 0.0273, "rewards/accuracies": 1.0, "rewards/chosen": 3.571560859680176, "rewards/margins": 6.629910945892334, "rewards/rejected": -3.0583503246307373, "step": 5462 }, { "epoch": 3.9912328767123286, "grad_norm": 5.714733944070684, "learning_rate": 4.121520795430377e-12, "logits/chosen": -2.8395638465881348, "logits/rejected": -2.0930235385894775, "logps/chosen": -709.9227294921875, "logps/rejected": -581.1326904296875, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": 4.042820930480957, "rewards/margins": 4.564574718475342, "rewards/rejected": -0.5217536687850952, "step": 5463 }, { "epoch": 3.991963470319635, "grad_norm": 3.6083720081068638, "learning_rate": 3.2565121360628167e-12, "logits/chosen": -2.41422438621521, "logits/rejected": -2.438929796218872, "logps/chosen": -690.8007202148438, "logps/rejected": -688.4367065429688, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": 3.0744595527648926, "rewards/margins": 5.0126543045043945, "rewards/rejected": -1.938194751739502, "step": 5464 }, { "epoch": 3.9926940639269404, "grad_norm": 5.380247349211399, "learning_rate": 2.4932683728440262e-12, "logits/chosen": -2.4217238426208496, "logits/rejected": -2.3199214935302734, "logps/chosen": -503.43536376953125, "logps/rejected": -745.0977783203125, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": 3.259431838989258, "rewards/margins": 6.030531406402588, "rewards/rejected": -2.771099328994751, "step": 5465 }, { "epoch": 3.9934246575342467, "grad_norm": 5.4073897907738475, "learning_rate": 1.8317898164144086e-12, "logits/chosen": -2.512361526489258, "logits/rejected": -2.339570999145508, "logps/chosen": -252.4790496826172, "logps/rejected": -344.9925231933594, "loss": 0.0281, "rewards/accuracies": 1.0, "rewards/chosen": 1.9222263097763062, "rewards/margins": 4.990472793579102, "rewards/rejected": -3.068246364593506, "step": 5466 }, { "epoch": 3.9941552511415526, "grad_norm": 4.733742448202898, "learning_rate": 1.2720767360585583e-12, "logits/chosen": -3.350712299346924, "logits/rejected": -2.6595687866210938, "logps/chosen": -965.2703857421875, "logps/rejected": -733.264892578125, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": 4.267724990844727, "rewards/margins": 5.757676601409912, "rewards/rejected": -1.4899519681930542, "step": 5467 }, { "epoch": 3.9948858447488584, "grad_norm": 5.969372142702796, "learning_rate": 8.141293596219956e-13, "logits/chosen": -3.184602975845337, "logits/rejected": -1.916162371635437, "logps/chosen": -673.4010009765625, "logps/rejected": -465.35882568359375, "loss": 0.0324, "rewards/accuracies": 1.0, "rewards/chosen": 2.549086093902588, "rewards/margins": 5.640641212463379, "rewards/rejected": -3.091555595397949, "step": 5468 }, { "epoch": 3.9956164383561643, "grad_norm": 9.770308252038097, "learning_rate": 4.579478735389219e-13, "logits/chosen": -2.5169129371643066, "logits/rejected": -2.245260238647461, "logps/chosen": -582.3985595703125, "logps/rejected": -556.478759765625, "loss": 0.0497, "rewards/accuracies": 1.0, "rewards/chosen": 2.2845940589904785, "rewards/margins": 7.249733924865723, "rewards/rejected": -4.965140342712402, "step": 5469 }, { "epoch": 3.99634703196347, "grad_norm": 5.484624699399378, "learning_rate": 2.0353242274895322e-13, "logits/chosen": -2.810373306274414, "logits/rejected": -2.152010440826416, "logps/chosen": -578.916748046875, "logps/rejected": -528.0346069335938, "loss": 0.0316, "rewards/accuracies": 1.0, "rewards/chosen": 5.265854835510254, "rewards/margins": 7.719869613647461, "rewards/rejected": -2.4540140628814697, "step": 5470 }, { "epoch": 3.9970776255707765, "grad_norm": 4.5087422117922715, "learning_rate": 5.0883110863653157e-14, "logits/chosen": -3.049499034881592, "logits/rejected": -1.9139463901519775, "logps/chosen": -793.2178955078125, "logps/rejected": -436.4674377441406, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": 5.109677314758301, "rewards/margins": 7.697728633880615, "rewards/rejected": -2.5880513191223145, "step": 5471 }, { "epoch": 3.997808219178082, "grad_norm": 3.3185149948335413, "learning_rate": 0.0, "logits/chosen": -3.069558620452881, "logits/rejected": -2.073305606842041, "logps/chosen": -925.1841430664062, "logps/rejected": -677.47412109375, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": 4.810154914855957, "rewards/margins": 7.006922721862793, "rewards/rejected": -2.196767807006836, "step": 5472 }, { "epoch": 3.997808219178082, "eval_logits/chosen": -2.860849618911743, "eval_logits/rejected": -2.3769683837890625, "eval_logps/chosen": -706.0082397460938, "eval_logps/rejected": -596.4307861328125, "eval_loss": 0.40534183382987976, "eval_rewards/accuracies": 0.7142857313156128, "eval_rewards/chosen": 2.9905052185058594, "eval_rewards/margins": 4.6127777099609375, "eval_rewards/rejected": -1.6222723722457886, "eval_runtime": 14.4187, "eval_samples_per_second": 7.629, "eval_steps_per_second": 0.971, "step": 5472 }, { "epoch": 3.997808219178082, "step": 5472, "total_flos": 519662108934144.0, "train_loss": 0.17924007512469045, "train_runtime": 53882.8337, "train_samples_per_second": 3.251, "train_steps_per_second": 0.102 } ], "logging_steps": 1, "max_steps": 5472, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 519662108934144.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }