{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9959514170040484, "eval_steps": 500, "global_step": 1110, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01349527665317139, "grad_norm": 85.5, "learning_rate": 2.2522522522522524e-07, "logits/chosen": -1.500240683555603, "logits/rejected": -1.5190627574920654, "logps/chosen": -159.05484008789062, "logps/rejected": -164.59542846679688, "loss": 0.6946, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.006750366650521755, "rewards/margins": -0.002313111675903201, "rewards/rejected": 0.0090634785592556, "step": 5 }, { "epoch": 0.02699055330634278, "grad_norm": 92.5, "learning_rate": 4.504504504504505e-07, "logits/chosen": -1.4508098363876343, "logits/rejected": -1.4352288246154785, "logps/chosen": -141.31773376464844, "logps/rejected": -167.95175170898438, "loss": 0.7035, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.00739473570138216, "rewards/margins": -0.01960981823503971, "rewards/rejected": 0.01221508253365755, "step": 10 }, { "epoch": 0.04048582995951417, "grad_norm": 74.0, "learning_rate": 6.756756756756758e-07, "logits/chosen": -1.3884494304656982, "logits/rejected": -1.3975419998168945, "logps/chosen": -192.84548950195312, "logps/rejected": -180.82046508789062, "loss": 0.6966, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.004980484023690224, "rewards/margins": -0.006102551706135273, "rewards/rejected": 0.011083034798502922, "step": 15 }, { "epoch": 0.05398110661268556, "grad_norm": 99.0, "learning_rate": 9.00900900900901e-07, "logits/chosen": -1.4855096340179443, "logits/rejected": -1.4922425746917725, "logps/chosen": -148.1718292236328, "logps/rejected": -152.18133544921875, "loss": 0.6843, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.002431074623018503, "rewards/margins": 0.018751021474599838, "rewards/rejected": -0.016319945454597473, "step": 20 }, { "epoch": 0.06747638326585695, "grad_norm": 113.0, "learning_rate": 1.1261261261261262e-06, "logits/chosen": -1.4175087213516235, "logits/rejected": -1.4836245775222778, "logps/chosen": -264.17132568359375, "logps/rejected": -193.3080596923828, "loss": 0.6911, "rewards/accuracies": 0.625, "rewards/chosen": 0.002699580043554306, "rewards/margins": 0.005426598247140646, "rewards/rejected": -0.00272701820358634, "step": 25 }, { "epoch": 0.08097165991902834, "grad_norm": 89.0, "learning_rate": 1.3513513513513515e-06, "logits/chosen": -1.3333433866500854, "logits/rejected": -1.4199435710906982, "logps/chosen": -220.9799041748047, "logps/rejected": -186.35690307617188, "loss": 0.688, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.009898080490529537, "rewards/margins": 0.012090040370821953, "rewards/rejected": -0.0021919584833085537, "step": 30 }, { "epoch": 0.09446693657219973, "grad_norm": 66.5, "learning_rate": 1.5765765765765766e-06, "logits/chosen": -1.5576092004776, "logits/rejected": -1.493931770324707, "logps/chosen": -148.85377502441406, "logps/rejected": -168.85574340820312, "loss": 0.6811, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.014485938474535942, "rewards/margins": 0.025426441803574562, "rewards/rejected": -0.010940502397716045, "step": 35 }, { "epoch": 0.10796221322537113, "grad_norm": 87.5, "learning_rate": 1.801801801801802e-06, "logits/chosen": -1.460998296737671, "logits/rejected": -1.4714558124542236, "logps/chosen": -165.34341430664062, "logps/rejected": -167.67092895507812, "loss": 0.6808, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.018663501366972923, "rewards/margins": 0.027817577123641968, "rewards/rejected": -0.009154075756669044, "step": 40 }, { "epoch": 0.1214574898785425, "grad_norm": 93.0, "learning_rate": 2.0270270270270273e-06, "logits/chosen": -1.3859444856643677, "logits/rejected": -1.4024606943130493, "logps/chosen": -162.58734130859375, "logps/rejected": -191.04025268554688, "loss": 0.6846, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.009018613025546074, "rewards/margins": 0.019761864095926285, "rewards/rejected": -0.010743250139057636, "step": 45 }, { "epoch": 0.1349527665317139, "grad_norm": 89.5, "learning_rate": 2.2522522522522524e-06, "logits/chosen": -1.4222023487091064, "logits/rejected": -1.54598069190979, "logps/chosen": -285.5871276855469, "logps/rejected": -167.19281005859375, "loss": 0.6684, "rewards/accuracies": 0.75, "rewards/chosen": 0.02634511888027191, "rewards/margins": 0.052618540823459625, "rewards/rejected": -0.026273420080542564, "step": 50 }, { "epoch": 0.1484480431848853, "grad_norm": 69.5, "learning_rate": 2.4774774774774775e-06, "logits/chosen": -1.5841736793518066, "logits/rejected": -1.516913890838623, "logps/chosen": -170.33505249023438, "logps/rejected": -188.19314575195312, "loss": 0.6639, "rewards/accuracies": 0.75, "rewards/chosen": -0.004526221659034491, "rewards/margins": 0.06425820291042328, "rewards/rejected": -0.06878442317247391, "step": 55 }, { "epoch": 0.16194331983805668, "grad_norm": 72.0, "learning_rate": 2.702702702702703e-06, "logits/chosen": -1.438759207725525, "logits/rejected": -1.3985353708267212, "logps/chosen": -198.15411376953125, "logps/rejected": -208.3758544921875, "loss": 0.6501, "rewards/accuracies": 0.75, "rewards/chosen": 0.047606997191905975, "rewards/margins": 0.09706764668226242, "rewards/rejected": -0.049460653215646744, "step": 60 }, { "epoch": 0.17543859649122806, "grad_norm": 164.0, "learning_rate": 2.927927927927928e-06, "logits/chosen": -1.4191879034042358, "logits/rejected": -1.5293009281158447, "logps/chosen": -217.4423370361328, "logps/rejected": -202.1327362060547, "loss": 0.6846, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.014217356219887733, "rewards/margins": 0.027354473248124123, "rewards/rejected": -0.013137114234268665, "step": 65 }, { "epoch": 0.18893387314439947, "grad_norm": 75.5, "learning_rate": 3.1531531531531532e-06, "logits/chosen": -1.510615587234497, "logits/rejected": -1.5524317026138306, "logps/chosen": -277.9597473144531, "logps/rejected": -174.99221801757812, "loss": 0.6538, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.01016303151845932, "rewards/margins": 0.08965723216533661, "rewards/rejected": -0.07949419319629669, "step": 70 }, { "epoch": 0.20242914979757085, "grad_norm": 127.5, "learning_rate": 3.3783783783783788e-06, "logits/chosen": -1.5467108488082886, "logits/rejected": -1.7057151794433594, "logps/chosen": -236.87759399414062, "logps/rejected": -171.19088745117188, "loss": 0.6316, "rewards/accuracies": 0.75, "rewards/chosen": 0.024651767686009407, "rewards/margins": 0.13629736006259918, "rewards/rejected": -0.11164556443691254, "step": 75 }, { "epoch": 0.21592442645074225, "grad_norm": 67.0, "learning_rate": 3.603603603603604e-06, "logits/chosen": -1.3438420295715332, "logits/rejected": -1.5014269351959229, "logps/chosen": -211.7142791748047, "logps/rejected": -149.79403686523438, "loss": 0.6296, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.016034509986639023, "rewards/margins": 0.1428973227739334, "rewards/rejected": -0.12686282396316528, "step": 80 }, { "epoch": 0.22941970310391363, "grad_norm": 67.0, "learning_rate": 3.828828828828829e-06, "logits/chosen": -1.580759048461914, "logits/rejected": -1.5942776203155518, "logps/chosen": -186.5341339111328, "logps/rejected": -198.06871032714844, "loss": 0.6112, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.011369394138455391, "rewards/margins": 0.18740348517894745, "rewards/rejected": -0.1987728774547577, "step": 85 }, { "epoch": 0.242914979757085, "grad_norm": 104.5, "learning_rate": 4.0540540540540545e-06, "logits/chosen": -1.5142263174057007, "logits/rejected": -1.526908040046692, "logps/chosen": -172.0498504638672, "logps/rejected": -204.1090545654297, "loss": 0.5947, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.001767634996213019, "rewards/margins": 0.23096399009227753, "rewards/rejected": -0.2327316552400589, "step": 90 }, { "epoch": 0.2564102564102564, "grad_norm": 67.0, "learning_rate": 4.27927927927928e-06, "logits/chosen": -1.2964483499526978, "logits/rejected": -1.287847876548767, "logps/chosen": -152.49652099609375, "logps/rejected": -162.25242614746094, "loss": 0.6261, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.031346581876277924, "rewards/margins": 0.17656004428863525, "rewards/rejected": -0.20790663361549377, "step": 95 }, { "epoch": 0.2699055330634278, "grad_norm": 122.0, "learning_rate": 4.504504504504505e-06, "logits/chosen": -1.6146646738052368, "logits/rejected": -1.6288648843765259, "logps/chosen": -245.85440063476562, "logps/rejected": -252.33163452148438, "loss": 0.5388, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.008112089708447456, "rewards/margins": 0.4617583155632019, "rewards/rejected": -0.4536462426185608, "step": 100 }, { "epoch": 0.2834008097165992, "grad_norm": 54.75, "learning_rate": 4.72972972972973e-06, "logits/chosen": -1.7181060314178467, "logits/rejected": -1.6348508596420288, "logps/chosen": -181.34054565429688, "logps/rejected": -187.49969482421875, "loss": 0.5332, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.032877303659915924, "rewards/margins": 0.5002557635307312, "rewards/rejected": -0.5331330895423889, "step": 105 }, { "epoch": 0.2968960863697706, "grad_norm": 93.5, "learning_rate": 4.954954954954955e-06, "logits/chosen": -1.471880555152893, "logits/rejected": -1.4882009029388428, "logps/chosen": -239.46017456054688, "logps/rejected": -203.43408203125, "loss": 0.639, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.20699986815452576, "rewards/margins": 0.2872315049171448, "rewards/rejected": -0.49423137307167053, "step": 110 }, { "epoch": 0.31039136302294196, "grad_norm": 83.5, "learning_rate": 4.999802215142814e-06, "logits/chosen": -1.572249174118042, "logits/rejected": -1.5214914083480835, "logps/chosen": -181.75244140625, "logps/rejected": -206.9883270263672, "loss": 0.4953, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.2786533534526825, "rewards/margins": 0.6539293527603149, "rewards/rejected": -0.932582676410675, "step": 115 }, { "epoch": 0.32388663967611336, "grad_norm": 63.25, "learning_rate": 4.998998767795805e-06, "logits/chosen": -1.3965647220611572, "logits/rejected": -1.5122724771499634, "logps/chosen": -185.1367645263672, "logps/rejected": -141.9375457763672, "loss": 0.5188, "rewards/accuracies": 0.75, "rewards/chosen": -0.12487339973449707, "rewards/margins": 0.5116696357727051, "rewards/rejected": -0.6365430951118469, "step": 120 }, { "epoch": 0.33738191632928477, "grad_norm": 94.5, "learning_rate": 4.9975774948882615e-06, "logits/chosen": -1.5592033863067627, "logits/rejected": -1.5545122623443604, "logps/chosen": -134.59095764160156, "logps/rejected": -159.44424438476562, "loss": 0.5878, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.218244269490242, "rewards/margins": 0.560061514377594, "rewards/rejected": -0.7783057689666748, "step": 125 }, { "epoch": 0.3508771929824561, "grad_norm": 159.0, "learning_rate": 4.995538747800403e-06, "logits/chosen": -1.5116926431655884, "logits/rejected": -1.5991663932800293, "logps/chosen": -196.37417602539062, "logps/rejected": -162.26467895507812, "loss": 0.555, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6864209175109863, "rewards/margins": 0.5580738186836243, "rewards/rejected": -1.2444946765899658, "step": 130 }, { "epoch": 0.3643724696356275, "grad_norm": 77.5, "learning_rate": 4.9928830305701164e-06, "logits/chosen": -1.4444091320037842, "logits/rejected": -1.404262661933899, "logps/chosen": -185.04042053222656, "logps/rejected": -186.958740234375, "loss": 0.4598, "rewards/accuracies": 0.75, "rewards/chosen": -0.22133490443229675, "rewards/margins": 0.7992109060287476, "rewards/rejected": -1.0205457210540771, "step": 135 }, { "epoch": 0.37786774628879893, "grad_norm": 50.25, "learning_rate": 4.98961099976835e-06, "logits/chosen": -1.5445549488067627, "logits/rejected": -1.586544156074524, "logps/chosen": -199.28408813476562, "logps/rejected": -183.11032104492188, "loss": 0.4536, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.06479507684707642, "rewards/margins": 0.9296582341194153, "rewards/rejected": -0.9944533109664917, "step": 140 }, { "epoch": 0.3913630229419703, "grad_norm": 68.0, "learning_rate": 4.985723464336783e-06, "logits/chosen": -1.4274847507476807, "logits/rejected": -1.4104160070419312, "logps/chosen": -185.9368896484375, "logps/rejected": -188.2207489013672, "loss": 0.4902, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.17553560435771942, "rewards/margins": 0.6832131743431091, "rewards/rejected": -0.8587487936019897, "step": 145 }, { "epoch": 0.4048582995951417, "grad_norm": 65.0, "learning_rate": 4.9812213853878376e-06, "logits/chosen": -1.6410919427871704, "logits/rejected": -1.6832342147827148, "logps/chosen": -168.22726440429688, "logps/rejected": -165.28591918945312, "loss": 0.4942, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.19691412150859833, "rewards/margins": 0.8052200078964233, "rewards/rejected": -1.002134084701538, "step": 150 }, { "epoch": 0.4183535762483131, "grad_norm": 84.0, "learning_rate": 4.9761058759670625e-06, "logits/chosen": -1.4086945056915283, "logits/rejected": -1.3933309316635132, "logps/chosen": -200.54226684570312, "logps/rejected": -191.30516052246094, "loss": 0.5805, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.38961219787597656, "rewards/margins": 0.6619648337364197, "rewards/rejected": -1.051577091217041, "step": 155 }, { "epoch": 0.4318488529014845, "grad_norm": 48.75, "learning_rate": 4.970378200777949e-06, "logits/chosen": -1.4240281581878662, "logits/rejected": -1.5275284051895142, "logps/chosen": -149.6121826171875, "logps/rejected": -153.7329864501953, "loss": 0.3726, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.22904136776924133, "rewards/margins": 1.2087788581848145, "rewards/rejected": -1.4378201961517334, "step": 160 }, { "epoch": 0.44534412955465585, "grad_norm": 57.5, "learning_rate": 4.964039775869271e-06, "logits/chosen": -1.5353929996490479, "logits/rejected": -1.5400171279907227, "logps/chosen": -172.69320678710938, "logps/rejected": -186.09596252441406, "loss": 0.4821, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14687059819698334, "rewards/margins": 1.0381742715835571, "rewards/rejected": -1.1850450038909912, "step": 165 }, { "epoch": 0.45883940620782726, "grad_norm": 68.5, "learning_rate": 4.957092168284987e-06, "logits/chosen": -1.5351091623306274, "logits/rejected": -1.480067253112793, "logps/chosen": -224.7134246826172, "logps/rejected": -280.2825012207031, "loss": 0.4522, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.15150094032287598, "rewards/margins": 0.8322998881340027, "rewards/rejected": -0.9838007092475891, "step": 170 }, { "epoch": 0.47233468286099867, "grad_norm": 47.25, "learning_rate": 4.949537095676824e-06, "logits/chosen": -1.5415345430374146, "logits/rejected": -1.4604427814483643, "logps/chosen": -173.94085693359375, "logps/rejected": -215.93075561523438, "loss": 0.45, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.3776322901248932, "rewards/margins": 1.5937398672103882, "rewards/rejected": -1.9713722467422485, "step": 175 }, { "epoch": 0.48582995951417, "grad_norm": 95.5, "learning_rate": 4.9413764258796236e-06, "logits/chosen": -1.5088344812393188, "logits/rejected": -1.6158044338226318, "logps/chosen": -273.03594970703125, "logps/rejected": -221.93997192382812, "loss": 0.5881, "rewards/accuracies": 0.625, "rewards/chosen": -0.25630080699920654, "rewards/margins": 0.5983410477638245, "rewards/rejected": -0.8546417951583862, "step": 180 }, { "epoch": 0.4993252361673414, "grad_norm": 83.0, "learning_rate": 4.93261217644956e-06, "logits/chosen": -1.3866004943847656, "logits/rejected": -1.363396406173706, "logps/chosen": -211.2840576171875, "logps/rejected": -256.87811279296875, "loss": 0.4912, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.24753907322883606, "rewards/margins": 0.9087351560592651, "rewards/rejected": -1.1562741994857788, "step": 185 }, { "epoch": 0.5128205128205128, "grad_norm": 79.0, "learning_rate": 4.923246514165339e-06, "logits/chosen": -1.357788324356079, "logits/rejected": -1.322389841079712, "logps/chosen": -221.6494598388672, "logps/rejected": -238.56637573242188, "loss": 0.3841, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.21661829948425293, "rewards/margins": 1.6020748615264893, "rewards/rejected": -1.8186931610107422, "step": 190 }, { "epoch": 0.5263157894736842, "grad_norm": 78.0, "learning_rate": 4.913281754492509e-06, "logits/chosen": -1.5164716243743896, "logits/rejected": -1.5658130645751953, "logps/chosen": -211.942138671875, "logps/rejected": -251.4232177734375, "loss": 0.439, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.2759682238101959, "rewards/margins": 1.2201299667358398, "rewards/rejected": -1.4960981607437134, "step": 195 }, { "epoch": 0.5398110661268556, "grad_norm": 68.0, "learning_rate": 4.902720361011007e-06, "logits/chosen": -1.43938148021698, "logits/rejected": -1.4012665748596191, "logps/chosen": -198.0753936767578, "logps/rejected": -230.1431121826172, "loss": 0.436, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.4660988748073578, "rewards/margins": 1.3129799365997314, "rewards/rejected": -1.7790788412094116, "step": 200 }, { "epoch": 0.553306342780027, "grad_norm": 116.0, "learning_rate": 4.891564944806095e-06, "logits/chosen": -1.3829123973846436, "logits/rejected": -1.4532912969589233, "logps/chosen": -204.92056274414062, "logps/rejected": -184.2178192138672, "loss": 0.4408, "rewards/accuracies": 0.75, "rewards/chosen": -0.4832437038421631, "rewards/margins": 1.4000451564788818, "rewards/rejected": -1.8832887411117554, "step": 205 }, { "epoch": 0.5668016194331984, "grad_norm": 39.0, "learning_rate": 4.879818263822816e-06, "logits/chosen": -1.5301909446716309, "logits/rejected": -1.4669263362884521, "logps/chosen": -176.71139526367188, "logps/rejected": -210.8941192626953, "loss": 0.4359, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7508934140205383, "rewards/margins": 1.5884822607040405, "rewards/rejected": -2.3393757343292236, "step": 210 }, { "epoch": 0.5802968960863698, "grad_norm": 118.5, "learning_rate": 4.867483222184158e-06, "logits/chosen": -1.4969114065170288, "logits/rejected": -1.4513076543807983, "logps/chosen": -183.51742553710938, "logps/rejected": -234.21078491210938, "loss": 0.4083, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.1092641353607178, "rewards/margins": 2.7672932147979736, "rewards/rejected": -4.876556873321533, "step": 215 }, { "epoch": 0.5937921727395412, "grad_norm": 82.5, "learning_rate": 4.854562869473063e-06, "logits/chosen": -1.6114156246185303, "logits/rejected": -1.6086403131484985, "logps/chosen": -158.5917510986328, "logps/rejected": -182.981689453125, "loss": 0.5288, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8133976459503174, "rewards/margins": 2.3693175315856934, "rewards/rejected": -4.182714939117432, "step": 220 }, { "epoch": 0.6072874493927125, "grad_norm": 64.5, "learning_rate": 4.841060399978481e-06, "logits/chosen": -1.4258265495300293, "logits/rejected": -1.5041557550430298, "logps/chosen": -203.29505920410156, "logps/rejected": -173.55667114257812, "loss": 0.467, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.451561838388443, "rewards/margins": 0.9895628094673157, "rewards/rejected": -1.4411247968673706, "step": 225 }, { "epoch": 0.6207827260458839, "grad_norm": 53.75, "learning_rate": 4.826979151905655e-06, "logits/chosen": -1.3954380750656128, "logits/rejected": -1.4369020462036133, "logps/chosen": -133.7052764892578, "logps/rejected": -152.63189697265625, "loss": 0.3819, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.21247024834156036, "rewards/margins": 1.1218936443328857, "rewards/rejected": -1.3343639373779297, "step": 230 }, { "epoch": 0.6342780026990553, "grad_norm": 34.25, "learning_rate": 4.812322606550813e-06, "logits/chosen": -1.477416753768921, "logits/rejected": -1.35099196434021, "logps/chosen": -183.8603057861328, "logps/rejected": -200.47122192382812, "loss": 0.403, "rewards/accuracies": 0.875, "rewards/chosen": -0.22856561839580536, "rewards/margins": 1.1782000064849854, "rewards/rejected": -1.4067654609680176, "step": 235 }, { "epoch": 0.6477732793522267, "grad_norm": 142.0, "learning_rate": 4.7970943874404904e-06, "logits/chosen": -1.5746204853057861, "logits/rejected": -1.5317301750183105, "logps/chosen": -132.62966918945312, "logps/rejected": -169.4604034423828, "loss": 0.4905, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.2887588441371918, "rewards/margins": 1.0178512334823608, "rewards/rejected": -1.3066102266311646, "step": 240 }, { "epoch": 0.6612685560053981, "grad_norm": 81.5, "learning_rate": 4.781298259435691e-06, "logits/chosen": -1.4620139598846436, "logits/rejected": -1.5366100072860718, "logps/chosen": -207.0232696533203, "logps/rejected": -182.5987548828125, "loss": 0.3498, "rewards/accuracies": 0.875, "rewards/chosen": -0.38011789321899414, "rewards/margins": 1.517073392868042, "rewards/rejected": -1.8971912860870361, "step": 245 }, { "epoch": 0.6747638326585695, "grad_norm": 59.0, "learning_rate": 4.7649381278011e-06, "logits/chosen": -1.525059700012207, "logits/rejected": -1.4892899990081787, "logps/chosen": -132.02548217773438, "logps/rejected": -172.75595092773438, "loss": 0.4596, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.47625675797462463, "rewards/margins": 1.6200672388076782, "rewards/rejected": -2.0963237285614014, "step": 250 }, { "epoch": 0.6882591093117408, "grad_norm": 93.5, "learning_rate": 4.748018037239592e-06, "logits/chosen": -1.6185624599456787, "logits/rejected": -1.6007747650146484, "logps/chosen": -190.04196166992188, "logps/rejected": -271.9373474121094, "loss": 0.377, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.29186195135116577, "rewards/margins": 1.4247747659683228, "rewards/rejected": -1.7166366577148438, "step": 255 }, { "epoch": 0.7017543859649122, "grad_norm": 54.75, "learning_rate": 4.7305421708922596e-06, "logits/chosen": -1.5387685298919678, "logits/rejected": -1.4462766647338867, "logps/chosen": -199.54568481445312, "logps/rejected": -219.14901733398438, "loss": 0.5013, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.4976847767829895, "rewards/margins": 1.649714708328247, "rewards/rejected": -2.147399425506592, "step": 260 }, { "epoch": 0.7152496626180836, "grad_norm": 92.0, "learning_rate": 4.712514849304219e-06, "logits/chosen": -1.4592026472091675, "logits/rejected": -1.5086675882339478, "logps/chosen": -203.43939208984375, "logps/rejected": -182.27008056640625, "loss": 0.3704, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.30615222454071045, "rewards/margins": 1.7558097839355469, "rewards/rejected": -2.0619618892669678, "step": 265 }, { "epoch": 0.728744939271255, "grad_norm": 94.0, "learning_rate": 4.693940529356444e-06, "logits/chosen": -1.5462654829025269, "logits/rejected": -1.5494886636734009, "logps/chosen": -204.8282470703125, "logps/rejected": -262.1166076660156, "loss": 0.4081, "rewards/accuracies": 0.875, "rewards/chosen": -0.18543025851249695, "rewards/margins": 1.581555724143982, "rewards/rejected": -1.7669861316680908, "step": 270 }, { "epoch": 0.7422402159244265, "grad_norm": 49.5, "learning_rate": 4.674823803163899e-06, "logits/chosen": -1.5121240615844727, "logits/rejected": -1.378418207168579, "logps/chosen": -176.5196533203125, "logps/rejected": -259.83154296875, "loss": 0.2792, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.2915424704551697, "rewards/margins": 2.276181697845459, "rewards/rejected": -2.5677244663238525, "step": 275 }, { "epoch": 0.7557354925775979, "grad_norm": 63.5, "learning_rate": 4.655169396940229e-06, "logits/chosen": -1.488743782043457, "logits/rejected": -1.4984915256500244, "logps/chosen": -227.04574584960938, "logps/rejected": -223.5692596435547, "loss": 0.3756, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.3987753987312317, "rewards/margins": 1.6648337841033936, "rewards/rejected": -2.0636088848114014, "step": 280 }, { "epoch": 0.7692307692307693, "grad_norm": 62.75, "learning_rate": 4.6349821698293025e-06, "logits/chosen": -1.4782928228378296, "logits/rejected": -1.480554223060608, "logps/chosen": -168.77146911621094, "logps/rejected": -283.3312683105469, "loss": 0.3639, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.32490164041519165, "rewards/margins": 1.6070611476898193, "rewards/rejected": -1.9319626092910767, "step": 285 }, { "epoch": 0.7827260458839406, "grad_norm": 85.0, "learning_rate": 4.6142671127038905e-06, "logits/chosen": -1.5204181671142578, "logits/rejected": -1.4846007823944092, "logps/chosen": -122.49859619140625, "logps/rejected": -159.67666625976562, "loss": 0.3855, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.5428152680397034, "rewards/margins": 1.4056587219238281, "rewards/rejected": -1.9484741687774658, "step": 290 }, { "epoch": 0.796221322537112, "grad_norm": 124.5, "learning_rate": 4.593029346931777e-06, "logits/chosen": -1.5233218669891357, "logits/rejected": -1.4880311489105225, "logps/chosen": -190.8978271484375, "logps/rejected": -212.50808715820312, "loss": 0.4094, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.5791584253311157, "rewards/margins": 1.7821210622787476, "rewards/rejected": -2.3612794876098633, "step": 295 }, { "epoch": 0.8097165991902834, "grad_norm": 121.0, "learning_rate": 4.571274123109606e-06, "logits/chosen": -1.5600152015686035, "logits/rejected": -1.5772325992584229, "logps/chosen": -211.6980438232422, "logps/rejected": -159.11520385742188, "loss": 0.5103, "rewards/accuracies": 0.75, "rewards/chosen": -0.5033570528030396, "rewards/margins": 1.3233957290649414, "rewards/rejected": -1.8267529010772705, "step": 300 }, { "epoch": 0.8232118758434548, "grad_norm": 87.0, "learning_rate": 4.549006819764779e-06, "logits/chosen": -1.3667839765548706, "logits/rejected": -1.408111333847046, "logps/chosen": -252.8665008544922, "logps/rejected": -246.56600952148438, "loss": 0.6645, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4156951904296875, "rewards/margins": 0.9969050288200378, "rewards/rejected": -1.4126002788543701, "step": 305 }, { "epoch": 0.8367071524966262, "grad_norm": 65.0, "learning_rate": 4.52623294202573e-06, "logits/chosen": -1.5357733964920044, "logits/rejected": -1.6000627279281616, "logps/chosen": -203.2954864501953, "logps/rejected": -178.47378540039062, "loss": 0.3625, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.1179068312048912, "rewards/margins": 1.5459201335906982, "rewards/rejected": -1.6638271808624268, "step": 310 }, { "epoch": 0.8502024291497976, "grad_norm": 38.75, "learning_rate": 4.502958120260894e-06, "logits/chosen": -1.4177687168121338, "logits/rejected": -1.466953992843628, "logps/chosen": -208.93142700195312, "logps/rejected": -204.0532989501953, "loss": 0.3943, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10137398540973663, "rewards/margins": 1.5527517795562744, "rewards/rejected": -1.6541255712509155, "step": 315 }, { "epoch": 0.863697705802969, "grad_norm": 94.5, "learning_rate": 4.479188108686714e-06, "logits/chosen": -1.543738603591919, "logits/rejected": -1.5562658309936523, "logps/chosen": -195.75601196289062, "logps/rejected": -243.9476776123047, "loss": 0.393, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09615819901227951, "rewards/margins": 1.808638334274292, "rewards/rejected": -1.9047966003417969, "step": 320 }, { "epoch": 0.8771929824561403, "grad_norm": 53.25, "learning_rate": 4.454928783945033e-06, "logits/chosen": -1.4368815422058105, "logits/rejected": -1.465288519859314, "logps/chosen": -182.02488708496094, "logps/rejected": -166.5155487060547, "loss": 0.3673, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.09929310530424118, "rewards/margins": 1.477452039718628, "rewards/rejected": -1.5767452716827393, "step": 325 }, { "epoch": 0.8906882591093117, "grad_norm": 94.5, "learning_rate": 4.430186143650216e-06, "logits/chosen": -1.3891671895980835, "logits/rejected": -1.3638372421264648, "logps/chosen": -167.63204956054688, "logps/rejected": -166.39913940429688, "loss": 0.4332, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.18427793681621552, "rewards/margins": 1.2914403676986694, "rewards/rejected": -1.4757182598114014, "step": 330 }, { "epoch": 0.9041835357624831, "grad_norm": 68.5, "learning_rate": 4.404966304906363e-06, "logits/chosen": -1.5300304889678955, "logits/rejected": -1.541245698928833, "logps/chosen": -237.1887969970703, "logps/rejected": -258.4833984375, "loss": 0.2851, "rewards/accuracies": 0.875, "rewards/chosen": -0.2509092092514038, "rewards/margins": 2.2454378604888916, "rewards/rejected": -2.496346950531006, "step": 335 }, { "epoch": 0.9176788124156545, "grad_norm": 91.5, "learning_rate": 4.379275502794984e-06, "logits/chosen": -1.4159671068191528, "logits/rejected": -1.3942148685455322, "logps/chosen": -204.76268005371094, "logps/rejected": -194.83755493164062, "loss": 0.3974, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.590856671333313, "rewards/margins": 1.8947960138320923, "rewards/rejected": -2.4856529235839844, "step": 340 }, { "epoch": 0.9311740890688259, "grad_norm": 24.875, "learning_rate": 4.3531200888335015e-06, "logits/chosen": -1.499260663986206, "logits/rejected": -1.5041369199752808, "logps/chosen": -158.403076171875, "logps/rejected": -188.42300415039062, "loss": 0.3399, "rewards/accuracies": 0.875, "rewards/chosen": -0.4716406464576721, "rewards/margins": 2.255904197692871, "rewards/rejected": -2.7275447845458984, "step": 345 }, { "epoch": 0.9446693657219973, "grad_norm": 49.0, "learning_rate": 4.326506529404973e-06, "logits/chosen": -1.4987239837646484, "logits/rejected": -1.5489791631698608, "logps/chosen": -228.030517578125, "logps/rejected": -199.24453735351562, "loss": 0.4954, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.5366212129592896, "rewards/margins": 1.576836347579956, "rewards/rejected": -2.113457441329956, "step": 350 }, { "epoch": 0.9581646423751687, "grad_norm": 50.5, "learning_rate": 4.299441404159409e-06, "logits/chosen": -1.4427543878555298, "logits/rejected": -1.4410443305969238, "logps/chosen": -142.67196655273438, "logps/rejected": -182.15530395507812, "loss": 0.3882, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.45489731431007385, "rewards/margins": 1.885206937789917, "rewards/rejected": -2.340104341506958, "step": 355 }, { "epoch": 0.97165991902834, "grad_norm": 71.0, "learning_rate": 4.271931404387096e-06, "logits/chosen": -1.4958666563034058, "logits/rejected": -1.4852968454360962, "logps/chosen": -203.7172088623047, "logps/rejected": -223.72958374023438, "loss": 0.3129, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.4084866940975189, "rewards/margins": 2.0505545139312744, "rewards/rejected": -2.459041118621826, "step": 360 }, { "epoch": 0.9851551956815114, "grad_norm": 72.0, "learning_rate": 4.243983331364307e-06, "logits/chosen": -1.6051279306411743, "logits/rejected": -1.5763704776763916, "logps/chosen": -156.02700805664062, "logps/rejected": -212.16317749023438, "loss": 0.4821, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6169974207878113, "rewards/margins": 1.195291519165039, "rewards/rejected": -1.8122888803482056, "step": 365 }, { "epoch": 0.9986504723346828, "grad_norm": 91.0, "learning_rate": 4.215604094671835e-06, "logits/chosen": -1.5946276187896729, "logits/rejected": -1.525407075881958, "logps/chosen": -190.231689453125, "logps/rejected": -210.0182342529297, "loss": 0.4743, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.5886441469192505, "rewards/margins": 1.6572681665420532, "rewards/rejected": -2.245912551879883, "step": 370 }, { "epoch": 1.0121457489878543, "grad_norm": 71.5, "learning_rate": 4.186800710486732e-06, "logits/chosen": -1.503097414970398, "logits/rejected": -1.4615429639816284, "logps/chosen": -177.4516143798828, "logps/rejected": -223.7339324951172, "loss": 0.2691, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.2226639688014984, "rewards/margins": 2.2762439250946045, "rewards/rejected": -2.4989078044891357, "step": 375 }, { "epoch": 1.0256410256410255, "grad_norm": 16.75, "learning_rate": 4.157580299847717e-06, "logits/chosen": -1.4365036487579346, "logits/rejected": -1.4489128589630127, "logps/chosen": -185.9925994873047, "logps/rejected": -210.19802856445312, "loss": 0.126, "rewards/accuracies": 1.0, "rewards/chosen": -0.1753823310136795, "rewards/margins": 3.160768508911133, "rewards/rejected": -3.336151123046875, "step": 380 }, { "epoch": 1.039136302294197, "grad_norm": 27.125, "learning_rate": 4.12795008689464e-06, "logits/chosen": -1.4434540271759033, "logits/rejected": -1.5021578073501587, "logps/chosen": -210.2549591064453, "logps/rejected": -247.6964569091797, "loss": 0.2329, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.21405327320098877, "rewards/margins": 2.4333832263946533, "rewards/rejected": -2.219329833984375, "step": 385 }, { "epoch": 1.0526315789473684, "grad_norm": 29.5, "learning_rate": 4.0979173970824626e-06, "logits/chosen": -1.5133657455444336, "logits/rejected": -1.5038350820541382, "logps/chosen": -187.3416290283203, "logps/rejected": -197.63766479492188, "loss": 0.1885, "rewards/accuracies": 1.0, "rewards/chosen": 0.0815470814704895, "rewards/margins": 2.5452542304992676, "rewards/rejected": -2.463707447052002, "step": 390 }, { "epoch": 1.0661268556005399, "grad_norm": 11.3125, "learning_rate": 4.067489655370197e-06, "logits/chosen": -1.486011028289795, "logits/rejected": -1.5427876710891724, "logps/chosen": -248.8966064453125, "logps/rejected": -205.6848602294922, "loss": 0.1103, "rewards/accuracies": 1.0, "rewards/chosen": 0.543197751045227, "rewards/margins": 3.468106746673584, "rewards/rejected": -2.9249091148376465, "step": 395 }, { "epoch": 1.0796221322537112, "grad_norm": 21.625, "learning_rate": 4.0366743843852315e-06, "logits/chosen": -1.4536128044128418, "logits/rejected": -1.39426851272583, "logps/chosen": -157.4046173095703, "logps/rejected": -206.4637451171875, "loss": 0.1189, "rewards/accuracies": 1.0, "rewards/chosen": -0.14299368858337402, "rewards/margins": 3.642580032348633, "rewards/rejected": -3.7855734825134277, "step": 400 }, { "epoch": 1.0931174089068827, "grad_norm": 73.0, "learning_rate": 4.005479202563524e-06, "logits/chosen": -1.4207379817962646, "logits/rejected": -1.4653427600860596, "logps/chosen": -175.64657592773438, "logps/rejected": -188.96347045898438, "loss": 0.113, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.22152826189994812, "rewards/margins": 3.9064407348632812, "rewards/rejected": -4.127968788146973, "step": 405 }, { "epoch": 1.106612685560054, "grad_norm": 22.5, "learning_rate": 3.973911822266099e-06, "logits/chosen": -1.3683284521102905, "logits/rejected": -1.4073810577392578, "logps/chosen": -200.2495880126953, "logps/rejected": -196.02499389648438, "loss": 0.1506, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.4190312922000885, "rewards/margins": 3.017284393310547, "rewards/rejected": -3.4363160133361816, "step": 410 }, { "epoch": 1.1201079622132253, "grad_norm": 61.0, "learning_rate": 3.941980047872324e-06, "logits/chosen": -1.3142037391662598, "logits/rejected": -1.3677208423614502, "logps/chosen": -200.49827575683594, "logps/rejected": -213.0048828125, "loss": 0.2229, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.29499849677085876, "rewards/margins": 2.430476188659668, "rewards/rejected": -2.7254748344421387, "step": 415 }, { "epoch": 1.1336032388663968, "grad_norm": 33.5, "learning_rate": 3.9096917738504445e-06, "logits/chosen": -1.5029326677322388, "logits/rejected": -1.522037386894226, "logps/chosen": -211.3799285888672, "logps/rejected": -195.49777221679688, "loss": 0.2023, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.20138970017433167, "rewards/margins": 3.0471653938293457, "rewards/rejected": -3.2485554218292236, "step": 420 }, { "epoch": 1.147098515519568, "grad_norm": 67.5, "learning_rate": 3.877054982805835e-06, "logits/chosen": -1.503327488899231, "logits/rejected": -1.5182857513427734, "logps/chosen": -206.69345092773438, "logps/rejected": -220.8511505126953, "loss": 0.2, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.075591079890728, "rewards/margins": 3.3199775218963623, "rewards/rejected": -3.39556884765625, "step": 425 }, { "epoch": 1.1605937921727396, "grad_norm": 41.25, "learning_rate": 3.844077743507468e-06, "logits/chosen": -1.4972890615463257, "logits/rejected": -1.4547359943389893, "logps/chosen": -190.38272094726562, "logps/rejected": -237.7483367919922, "loss": 0.1763, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.00510750338435173, "rewards/margins": 3.4418201446533203, "rewards/rejected": -3.446927309036255, "step": 430 }, { "epoch": 1.174089068825911, "grad_norm": 43.0, "learning_rate": 3.8107682088930797e-06, "logits/chosen": -1.5898491144180298, "logits/rejected": -1.628394365310669, "logps/chosen": -209.7681884765625, "logps/rejected": -223.9811248779297, "loss": 0.2875, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.15720273554325104, "rewards/margins": 2.540311336517334, "rewards/rejected": -2.697514057159424, "step": 435 }, { "epoch": 1.1875843454790824, "grad_norm": 19.875, "learning_rate": 3.777134614053522e-06, "logits/chosen": -1.3833550214767456, "logits/rejected": -1.3048458099365234, "logps/chosen": -153.44886779785156, "logps/rejected": -187.23211669921875, "loss": 0.2094, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.15450401604175568, "rewards/margins": 2.7406177520751953, "rewards/rejected": -2.8951218128204346, "step": 440 }, { "epoch": 1.2010796221322537, "grad_norm": 25.25, "learning_rate": 3.7431852741968104e-06, "logits/chosen": -1.5894601345062256, "logits/rejected": -1.4398654699325562, "logps/chosen": -161.95870971679688, "logps/rejected": -259.89544677734375, "loss": 0.2674, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.3261250853538513, "rewards/margins": 2.652719497680664, "rewards/rejected": -2.9788451194763184, "step": 445 }, { "epoch": 1.214574898785425, "grad_norm": 25.625, "learning_rate": 3.7089285825923614e-06, "logits/chosen": -1.481194257736206, "logits/rejected": -1.4744828939437866, "logps/chosen": -136.75341796875, "logps/rejected": -182.98255920410156, "loss": 0.216, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.15366610884666443, "rewards/margins": 2.4346675872802734, "rewards/rejected": -2.5883336067199707, "step": 450 }, { "epoch": 1.2280701754385965, "grad_norm": 59.0, "learning_rate": 3.6743730084959275e-06, "logits/chosen": -1.4641847610473633, "logits/rejected": -1.4495608806610107, "logps/chosen": -226.5570068359375, "logps/rejected": -231.99484252929688, "loss": 0.1606, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0351928249001503, "rewards/margins": 2.678950071334839, "rewards/rejected": -2.6437573432922363, "step": 455 }, { "epoch": 1.2415654520917678, "grad_norm": 29.125, "learning_rate": 3.639527095055753e-06, "logits/chosen": -1.4890583753585815, "logits/rejected": -1.4146323204040527, "logps/chosen": -211.8848419189453, "logps/rejected": -223.7265167236328, "loss": 0.1515, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.1912011355161667, "rewards/margins": 3.216825008392334, "rewards/rejected": -3.4080262184143066, "step": 460 }, { "epoch": 1.2550607287449393, "grad_norm": 28.5, "learning_rate": 3.604399457200458e-06, "logits/chosen": -1.5582194328308105, "logits/rejected": -1.530056357383728, "logps/chosen": -174.59786987304688, "logps/rejected": -235.314697265625, "loss": 0.1586, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.02259807661175728, "rewards/margins": 3.3205082416534424, "rewards/rejected": -3.343106508255005, "step": 465 }, { "epoch": 1.2685560053981106, "grad_norm": 47.0, "learning_rate": 3.5689987795091735e-06, "logits/chosen": -1.5336169004440308, "logits/rejected": -1.5555146932601929, "logps/chosen": -192.9527587890625, "logps/rejected": -217.05029296875, "loss": 0.1666, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.11079835891723633, "rewards/margins": 2.9511632919311523, "rewards/rejected": -3.0619616508483887, "step": 470 }, { "epoch": 1.282051282051282, "grad_norm": 31.5, "learning_rate": 3.5333338140644602e-06, "logits/chosen": -1.567378044128418, "logits/rejected": -1.5020748376846313, "logps/chosen": -151.2008819580078, "logps/rejected": -193.5251007080078, "loss": 0.1562, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.06739845871925354, "rewards/margins": 2.88545560836792, "rewards/rejected": -2.81805682182312, "step": 475 }, { "epoch": 1.2955465587044535, "grad_norm": 27.625, "learning_rate": 3.497413378288541e-06, "logits/chosen": -1.558091402053833, "logits/rejected": -1.5880284309387207, "logps/chosen": -208.2618408203125, "logps/rejected": -215.33065795898438, "loss": 0.1537, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.08185993134975433, "rewards/margins": 2.7448792457580566, "rewards/rejected": -2.8267390727996826, "step": 480 }, { "epoch": 1.3090418353576248, "grad_norm": 21.0, "learning_rate": 3.4612463527633728e-06, "logits/chosen": -1.517230749130249, "logits/rejected": -1.5125114917755127, "logps/chosen": -165.6942138671875, "logps/rejected": -177.20965576171875, "loss": 0.1097, "rewards/accuracies": 1.0, "rewards/chosen": -0.21497321128845215, "rewards/margins": 3.283679485321045, "rewards/rejected": -3.498652935028076, "step": 485 }, { "epoch": 1.3225371120107963, "grad_norm": 58.25, "learning_rate": 3.4248416790351086e-06, "logits/chosen": -1.4563219547271729, "logits/rejected": -1.4463237524032593, "logps/chosen": -222.70803833007812, "logps/rejected": -276.1205139160156, "loss": 0.1741, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.18195849657058716, "rewards/margins": 3.079150438308716, "rewards/rejected": -3.2611091136932373, "step": 490 }, { "epoch": 1.3360323886639676, "grad_norm": 26.5, "learning_rate": 3.3882083574034847e-06, "logits/chosen": -1.495981216430664, "logits/rejected": -1.510833501815796, "logps/chosen": -217.92416381835938, "logps/rejected": -232.9659881591797, "loss": 0.1075, "rewards/accuracies": 1.0, "rewards/chosen": 0.07798905670642853, "rewards/margins": 3.7004494667053223, "rewards/rejected": -3.6224606037139893, "step": 495 }, { "epoch": 1.349527665317139, "grad_norm": 11.625, "learning_rate": 3.3513554446966846e-06, "logits/chosen": -1.607877492904663, "logits/rejected": -1.5209126472473145, "logps/chosen": -145.24710083007812, "logps/rejected": -269.81951904296875, "loss": 0.0835, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.1404910534620285, "rewards/margins": 3.947847843170166, "rewards/rejected": -4.088338375091553, "step": 500 }, { "epoch": 1.349527665317139, "eval_logits/chosen": -1.5215187072753906, "eval_logits/rejected": -1.5562808513641357, "eval_logps/chosen": -190.62527465820312, "eval_logps/rejected": -222.86770629882812, "eval_loss": 0.3281523883342743, "eval_rewards/accuracies": 0.849397599697113, "eval_rewards/chosen": -0.6181024312973022, "eval_rewards/margins": 2.1862471103668213, "eval_rewards/rejected": -2.804349660873413, "eval_runtime": 23.4839, "eval_samples_per_second": 14.052, "eval_steps_per_second": 3.534, "step": 500 }, { "epoch": 1.3630229419703104, "grad_norm": 25.625, "learning_rate": 3.314292052032227e-06, "logits/chosen": -1.4269988536834717, "logits/rejected": -1.5553017854690552, "logps/chosen": -245.88330078125, "logps/rejected": -144.62518310546875, "loss": 0.2057, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.03630426153540611, "rewards/margins": 2.8089230060577393, "rewards/rejected": -2.8452274799346924, "step": 505 }, { "epoch": 1.376518218623482, "grad_norm": 42.75, "learning_rate": 3.2770273425644285e-06, "logits/chosen": -1.3818541765213013, "logits/rejected": -1.31718909740448, "logps/chosen": -194.84194946289062, "logps/rejected": -197.08505249023438, "loss": 0.1862, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.11487498134374619, "rewards/margins": 3.057730197906494, "rewards/rejected": -3.172605276107788, "step": 510 }, { "epoch": 1.3900134952766532, "grad_norm": 29.0, "learning_rate": 3.2395705292190067e-06, "logits/chosen": -1.467614769935608, "logits/rejected": -1.438024640083313, "logps/chosen": -180.7233428955078, "logps/rejected": -217.57559204101562, "loss": 0.1711, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.10827420651912689, "rewards/margins": 3.1877129077911377, "rewards/rejected": -3.295987367630005, "step": 515 }, { "epoch": 1.4035087719298245, "grad_norm": 12.125, "learning_rate": 3.2019308724153743e-06, "logits/chosen": -1.4175347089767456, "logits/rejected": -1.5785712003707886, "logps/chosen": -196.76730346679688, "logps/rejected": -179.5243377685547, "loss": 0.114, "rewards/accuracies": 1.0, "rewards/chosen": 0.12531700730323792, "rewards/margins": 3.2615838050842285, "rewards/rejected": -3.1362667083740234, "step": 520 }, { "epoch": 1.417004048582996, "grad_norm": 27.375, "learning_rate": 3.164117677777191e-06, "logits/chosen": -1.5264801979064941, "logits/rejected": -1.6040115356445312, "logps/chosen": -150.361328125, "logps/rejected": -164.02816772460938, "loss": 0.1757, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.4109339118003845, "rewards/margins": 3.098153591156006, "rewards/rejected": -3.509087324142456, "step": 525 }, { "epoch": 1.4304993252361673, "grad_norm": 38.25, "learning_rate": 3.1261402938317465e-06, "logits/chosen": -1.5730303525924683, "logits/rejected": -1.6026499271392822, "logps/chosen": -164.3070831298828, "logps/rejected": -246.06338500976562, "loss": 0.1532, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.005527207162231207, "rewards/margins": 3.9187304973602295, "rewards/rejected": -3.913203477859497, "step": 530 }, { "epoch": 1.4439946018893388, "grad_norm": 20.375, "learning_rate": 3.088008109698726e-06, "logits/chosen": -1.444838285446167, "logits/rejected": -1.5232534408569336, "logps/chosen": -194.70555114746094, "logps/rejected": -218.77590942382812, "loss": 0.1892, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.10898719727993011, "rewards/margins": 3.2815093994140625, "rewards/rejected": -3.1725223064422607, "step": 535 }, { "epoch": 1.45748987854251, "grad_norm": 43.0, "learning_rate": 3.0497305527689446e-06, "logits/chosen": -1.4176692962646484, "logits/rejected": -1.4581646919250488, "logps/chosen": -190.53550720214844, "logps/rejected": -202.92530822753906, "loss": 0.1852, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.12854930758476257, "rewards/margins": 3.113678216934204, "rewards/rejected": -3.242227554321289, "step": 540 }, { "epoch": 1.4709851551956814, "grad_norm": 42.0, "learning_rate": 3.011317086373628e-06, "logits/chosen": -1.4024337530136108, "logits/rejected": -1.4260265827178955, "logps/chosen": -222.62124633789062, "logps/rejected": -228.56295776367188, "loss": 0.1847, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.02214776910841465, "rewards/margins": 3.127570629119873, "rewards/rejected": -3.1497180461883545, "step": 545 }, { "epoch": 1.484480431848853, "grad_norm": 38.5, "learning_rate": 2.9727772074447916e-06, "logits/chosen": -1.4362146854400635, "logits/rejected": -1.4737937450408936, "logps/chosen": -190.13218688964844, "logps/rejected": -182.9353790283203, "loss": 0.1473, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.015029204078018665, "rewards/margins": 3.5559723377227783, "rewards/rejected": -3.5710015296936035, "step": 550 }, { "epoch": 1.4979757085020242, "grad_norm": 105.0, "learning_rate": 2.9341204441673267e-06, "logits/chosen": -1.5892771482467651, "logits/rejected": -1.5846550464630127, "logps/chosen": -128.55142211914062, "logps/rejected": -169.93356323242188, "loss": 0.2029, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.536339282989502, "rewards/margins": 2.9536054134368896, "rewards/rejected": -3.4899444580078125, "step": 555 }, { "epoch": 1.5114709851551957, "grad_norm": 49.75, "learning_rate": 2.8953563536233525e-06, "logits/chosen": -1.650007963180542, "logits/rejected": -1.6943776607513428, "logps/chosen": -168.49082946777344, "logps/rejected": -202.83924865722656, "loss": 0.186, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5661638379096985, "rewards/margins": 3.2901504039764404, "rewards/rejected": -3.856314182281494, "step": 560 }, { "epoch": 1.524966261808367, "grad_norm": 21.0, "learning_rate": 2.8564945194294273e-06, "logits/chosen": -1.5658307075500488, "logits/rejected": -1.46593177318573, "logps/chosen": -162.1931915283203, "logps/rejected": -254.7098388671875, "loss": 0.168, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5101041793823242, "rewards/margins": 3.1685078144073486, "rewards/rejected": -3.6786117553710938, "step": 565 }, { "epoch": 1.5384615384615383, "grad_norm": 13.125, "learning_rate": 2.817544549367197e-06, "logits/chosen": -1.4567762613296509, "logits/rejected": -1.4438632726669312, "logps/chosen": -173.05821228027344, "logps/rejected": -226.567626953125, "loss": 0.1935, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.4246034622192383, "rewards/margins": 3.5222201347351074, "rewards/rejected": -3.946824312210083, "step": 570 }, { "epoch": 1.5519568151147098, "grad_norm": 18.875, "learning_rate": 2.778516073008071e-06, "logits/chosen": -1.3770719766616821, "logits/rejected": -1.4858124256134033, "logps/chosen": -178.8583221435547, "logps/rejected": -180.4306640625, "loss": 0.2049, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.34754911065101624, "rewards/margins": 2.8492796421051025, "rewards/rejected": -3.196829080581665, "step": 575 }, { "epoch": 1.5654520917678814, "grad_norm": 51.0, "learning_rate": 2.7394187393325107e-06, "logits/chosen": -1.4935017824172974, "logits/rejected": -1.482154130935669, "logps/chosen": -183.38815307617188, "logps/rejected": -203.4325714111328, "loss": 0.2601, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.4926990866661072, "rewards/margins": 2.8543787002563477, "rewards/rejected": -3.347078323364258, "step": 580 }, { "epoch": 1.5789473684210527, "grad_norm": 11.875, "learning_rate": 2.7002622143445177e-06, "logits/chosen": -1.5763792991638184, "logits/rejected": -1.581122875213623, "logps/chosen": -230.5819854736328, "logps/rejected": -290.4792175292969, "loss": 0.1305, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.07175219804048538, "rewards/margins": 4.184114933013916, "rewards/rejected": -4.112362861633301, "step": 585 }, { "epoch": 1.592442645074224, "grad_norm": 46.75, "learning_rate": 2.6610561786819207e-06, "logits/chosen": -1.6340926885604858, "logits/rejected": -1.5590074062347412, "logps/chosen": -145.62442016601562, "logps/rejected": -248.79403686523438, "loss": 0.1715, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.3683429956436157, "rewards/margins": 3.4242138862609863, "rewards/rejected": -3.7925562858581543, "step": 590 }, { "epoch": 1.6059379217273952, "grad_norm": 8.5625, "learning_rate": 2.6218103252230302e-06, "logits/chosen": -1.5815064907073975, "logits/rejected": -1.558189868927002, "logps/chosen": -145.986572265625, "logps/rejected": -209.48776245117188, "loss": 0.1382, "rewards/accuracies": 1.0, "rewards/chosen": -0.15136297047138214, "rewards/margins": 3.156534194946289, "rewards/rejected": -3.3078970909118652, "step": 595 }, { "epoch": 1.6194331983805668, "grad_norm": 33.5, "learning_rate": 2.582534356690284e-06, "logits/chosen": -1.4829189777374268, "logits/rejected": -1.5618332624435425, "logps/chosen": -280.50482177734375, "logps/rejected": -227.37191772460938, "loss": 0.111, "rewards/accuracies": 1.0, "rewards/chosen": 0.014963224530220032, "rewards/margins": 3.7380282878875732, "rewards/rejected": -3.723065137863159, "step": 600 }, { "epoch": 1.6329284750337383, "grad_norm": 19.25, "learning_rate": 2.5432379832514437e-06, "logits/chosen": -1.5892632007598877, "logits/rejected": -1.6352291107177734, "logps/chosen": -158.56002807617188, "logps/rejected": -202.90060424804688, "loss": 0.2301, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.49658140540122986, "rewards/margins": 3.0457987785339355, "rewards/rejected": -3.5423800945281982, "step": 605 }, { "epoch": 1.6464237516869096, "grad_norm": 18.5, "learning_rate": 2.5039309201189618e-06, "logits/chosen": -1.6018474102020264, "logits/rejected": -1.6965217590332031, "logps/chosen": -161.53518676757812, "logps/rejected": -185.10025024414062, "loss": 0.1597, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.14256028831005096, "rewards/margins": 3.0957140922546387, "rewards/rejected": -3.238274335861206, "step": 610 }, { "epoch": 1.6599190283400809, "grad_norm": 22.375, "learning_rate": 2.4646228851480957e-06, "logits/chosen": -1.391078233718872, "logits/rejected": -1.3691911697387695, "logps/chosen": -206.93734741210938, "logps/rejected": -213.29428100585938, "loss": 0.2172, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.01679610088467598, "rewards/margins": 2.988704204559326, "rewards/rejected": -2.9719078540802, "step": 615 }, { "epoch": 1.6734143049932524, "grad_norm": 13.25, "learning_rate": 2.4253235964343677e-06, "logits/chosen": -1.590201497077942, "logits/rejected": -1.4947328567504883, "logps/chosen": -162.37301635742188, "logps/rejected": -259.95294189453125, "loss": 0.1116, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.33501359820365906, "rewards/margins": 4.118770599365234, "rewards/rejected": -4.453783988952637, "step": 620 }, { "epoch": 1.686909581646424, "grad_norm": 73.0, "learning_rate": 2.3860427699109726e-06, "logits/chosen": -1.6217790842056274, "logits/rejected": -1.6454839706420898, "logps/chosen": -172.94483947753906, "logps/rejected": -205.34475708007812, "loss": 0.2869, "rewards/accuracies": 0.875, "rewards/chosen": -0.9174480438232422, "rewards/margins": 3.128140449523926, "rewards/rejected": -4.045588493347168, "step": 625 }, { "epoch": 1.7004048582995952, "grad_norm": 32.5, "learning_rate": 2.34679011694671e-06, "logits/chosen": -1.5026500225067139, "logits/rejected": -1.6494897603988647, "logps/chosen": -268.9452209472656, "logps/rejected": -212.0578155517578, "loss": 0.1194, "rewards/accuracies": 1.0, "rewards/chosen": -0.23328566551208496, "rewards/margins": 4.139514446258545, "rewards/rejected": -4.372800350189209, "step": 630 }, { "epoch": 1.7139001349527665, "grad_norm": 70.5, "learning_rate": 2.3075753419450524e-06, "logits/chosen": -1.5526963472366333, "logits/rejected": -1.6195096969604492, "logps/chosen": -205.20431518554688, "logps/rejected": -197.59744262695312, "loss": 0.2026, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.3273767828941345, "rewards/margins": 2.9169745445251465, "rewards/rejected": -3.2443511486053467, "step": 635 }, { "epoch": 1.7273954116059378, "grad_norm": 38.5, "learning_rate": 2.2684081399449327e-06, "logits/chosen": -1.4865336418151855, "logits/rejected": -1.479229211807251, "logps/chosen": -188.85787963867188, "logps/rejected": -203.17514038085938, "loss": 0.269, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.778891921043396, "rewards/margins": 3.5117366313934326, "rewards/rejected": -4.290627956390381, "step": 640 }, { "epoch": 1.7408906882591093, "grad_norm": 116.0, "learning_rate": 2.2292981942238454e-06, "logits/chosen": -1.598434329032898, "logits/rejected": -1.6193567514419556, "logps/chosen": -170.999267578125, "logps/rejected": -234.42391967773438, "loss": 0.3528, "rewards/accuracies": 0.875, "rewards/chosen": -0.586583137512207, "rewards/margins": 3.1737523078918457, "rewards/rejected": -3.7603354454040527, "step": 645 }, { "epoch": 1.7543859649122808, "grad_norm": 36.0, "learning_rate": 2.1902551739038624e-06, "logits/chosen": -1.5177044868469238, "logits/rejected": -1.4585306644439697, "logps/chosen": -171.92758178710938, "logps/rejected": -219.8982696533203, "loss": 0.2386, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5028108954429626, "rewards/margins": 3.118129253387451, "rewards/rejected": -3.6209399700164795, "step": 650 }, { "epoch": 1.7678812415654521, "grad_norm": 11.0625, "learning_rate": 2.151288731561136e-06, "logits/chosen": -1.532063364982605, "logits/rejected": -1.4071648120880127, "logps/chosen": -211.4221649169922, "logps/rejected": -240.8402099609375, "loss": 0.1651, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.42295771837234497, "rewards/margins": 3.7488913536071777, "rewards/rejected": -4.171849250793457, "step": 655 }, { "epoch": 1.7813765182186234, "grad_norm": 23.625, "learning_rate": 2.1124085008395056e-06, "logits/chosen": -1.4962142705917358, "logits/rejected": -1.4677404165267944, "logps/chosen": -197.39447021484375, "logps/rejected": -263.4613342285156, "loss": 0.1999, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.16278569400310516, "rewards/margins": 3.5397281646728516, "rewards/rejected": -3.7025134563446045, "step": 660 }, { "epoch": 1.7948717948717947, "grad_norm": 20.625, "learning_rate": 2.073624094068776e-06, "logits/chosen": -1.5467997789382935, "logits/rejected": -1.540650725364685, "logps/chosen": -186.6321563720703, "logps/rejected": -259.65045166015625, "loss": 0.2781, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.22982105612754822, "rewards/margins": 3.7238681316375732, "rewards/rejected": -3.9536895751953125, "step": 665 }, { "epoch": 1.8083670715249662, "grad_norm": 12.875, "learning_rate": 2.03494509988827e-06, "logits/chosen": -1.6044431924819946, "logits/rejected": -1.627730131149292, "logps/chosen": -184.64320373535156, "logps/rejected": -204.9185791015625, "loss": 0.1212, "rewards/accuracies": 1.0, "rewards/chosen": -0.07731951773166656, "rewards/margins": 3.6050572395324707, "rewards/rejected": -3.6823768615722656, "step": 670 }, { "epoch": 1.8218623481781377, "grad_norm": 22.375, "learning_rate": 1.996381080876237e-06, "logits/chosen": -1.6212413311004639, "logits/rejected": -1.5563671588897705, "logps/chosen": -219.73171997070312, "logps/rejected": -281.0826721191406, "loss": 0.1177, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.023561427369713783, "rewards/margins": 3.5450756549835205, "rewards/rejected": -3.521514415740967, "step": 675 }, { "epoch": 1.835357624831309, "grad_norm": 42.5, "learning_rate": 1.957941571185702e-06, "logits/chosen": -1.4472072124481201, "logits/rejected": -1.5231066942214966, "logps/chosen": -256.3811950683594, "logps/rejected": -225.1781768798828, "loss": 0.2672, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.01850978098809719, "rewards/margins": 3.1582770347595215, "rewards/rejected": -3.1767868995666504, "step": 680 }, { "epoch": 1.8488529014844803, "grad_norm": 30.625, "learning_rate": 1.919636074187346e-06, "logits/chosen": -1.388319730758667, "logits/rejected": -1.4473168849945068, "logps/chosen": -253.48312377929688, "logps/rejected": -212.169189453125, "loss": 0.1468, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.18813356757164001, "rewards/margins": 3.097019672393799, "rewards/rejected": -2.908886194229126, "step": 685 }, { "epoch": 1.8623481781376519, "grad_norm": 90.0, "learning_rate": 1.8814740601199943e-06, "logits/chosen": -1.4006351232528687, "logits/rejected": -1.4068963527679443, "logps/chosen": -164.6719970703125, "logps/rejected": -193.83538818359375, "loss": 0.2666, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.42647585272789, "rewards/margins": 2.7546064853668213, "rewards/rejected": -3.181082248687744, "step": 690 }, { "epoch": 1.8758434547908234, "grad_norm": 25.75, "learning_rate": 1.8434649637492952e-06, "logits/chosen": -1.341395616531372, "logits/rejected": -1.3592100143432617, "logps/chosen": -181.58978271484375, "logps/rejected": -235.27456665039062, "loss": 0.1718, "rewards/accuracies": 1.0, "rewards/chosen": -0.15408626198768616, "rewards/margins": 3.203856945037842, "rewards/rejected": -3.357943296432495, "step": 695 }, { "epoch": 1.8893387314439947, "grad_norm": 18.625, "learning_rate": 1.8056181820351737e-06, "logits/chosen": -1.565199613571167, "logits/rejected": -1.5012518167495728, "logps/chosen": -241.5365753173828, "logps/rejected": -229.1800079345703, "loss": 0.1734, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.34816664457321167, "rewards/margins": 4.178171634674072, "rewards/rejected": -3.830005168914795, "step": 700 }, { "epoch": 1.902834008097166, "grad_norm": 8.875, "learning_rate": 1.7679430718086244e-06, "logits/chosen": -1.5023219585418701, "logits/rejected": -1.4059240818023682, "logps/chosen": -240.8516082763672, "logps/rejected": -287.47955322265625, "loss": 0.0641, "rewards/accuracies": 1.0, "rewards/chosen": 0.22036854922771454, "rewards/margins": 4.166906833648682, "rewards/rejected": -3.946538209915161, "step": 705 }, { "epoch": 1.9163292847503373, "grad_norm": 35.5, "learning_rate": 1.7304489474584307e-06, "logits/chosen": -1.565582036972046, "logits/rejected": -1.4994531869888306, "logps/chosen": -148.25338745117188, "logps/rejected": -231.37741088867188, "loss": 0.123, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.1026371493935585, "rewards/margins": 3.7467575073242188, "rewards/rejected": -3.6441197395324707, "step": 710 }, { "epoch": 1.9298245614035088, "grad_norm": 28.125, "learning_rate": 1.693145078628377e-06, "logits/chosen": -1.6054456233978271, "logits/rejected": -1.6087411642074585, "logps/chosen": -159.12234497070312, "logps/rejected": -214.5330352783203, "loss": 0.1255, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.13751724362373352, "rewards/margins": 3.8032824993133545, "rewards/rejected": -3.940800428390503, "step": 715 }, { "epoch": 1.9433198380566803, "grad_norm": 18.375, "learning_rate": 1.6560406879255192e-06, "logits/chosen": -1.615686058998108, "logits/rejected": -1.678998351097107, "logps/chosen": -179.3768768310547, "logps/rejected": -188.79124450683594, "loss": 0.1608, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.13511483371257782, "rewards/margins": 3.1098551750183105, "rewards/rejected": -3.2449698448181152, "step": 720 }, { "epoch": 1.9568151147098516, "grad_norm": 20.5, "learning_rate": 1.6191449486400893e-06, "logits/chosen": -1.5641348361968994, "logits/rejected": -1.5269627571105957, "logps/chosen": -190.90200805664062, "logps/rejected": -200.14797973632812, "loss": 0.1858, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.057850100100040436, "rewards/margins": 3.392789363861084, "rewards/rejected": -3.4506402015686035, "step": 725 }, { "epoch": 1.9703103913630229, "grad_norm": 46.25, "learning_rate": 1.5824669824775868e-06, "logits/chosen": -1.6585397720336914, "logits/rejected": -1.6145107746124268, "logps/chosen": -153.5370330810547, "logps/rejected": -246.87869262695312, "loss": 0.1935, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.1705460101366043, "rewards/margins": 3.2724738121032715, "rewards/rejected": -3.4430203437805176, "step": 730 }, { "epoch": 1.9838056680161942, "grad_norm": 21.125, "learning_rate": 1.5460158573036288e-06, "logits/chosen": -1.425318956375122, "logits/rejected": -1.5616633892059326, "logps/chosen": -228.63955688476562, "logps/rejected": -232.26235961914062, "loss": 0.1763, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.15325972437858582, "rewards/margins": 2.676074504852295, "rewards/rejected": -2.8293344974517822, "step": 735 }, { "epoch": 1.9973009446693657, "grad_norm": 57.75, "learning_rate": 1.509800584902108e-06, "logits/chosen": -1.4701238870620728, "logits/rejected": -1.335039496421814, "logps/chosen": -165.36788940429688, "logps/rejected": -248.84915161132812, "loss": 0.2088, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.21872563660144806, "rewards/margins": 3.6630032062530518, "rewards/rejected": -3.8817286491394043, "step": 740 }, { "epoch": 2.010796221322537, "grad_norm": 13.0625, "learning_rate": 1.473830118747216e-06, "logits/chosen": -1.3533880710601807, "logits/rejected": -1.4392606019973755, "logps/chosen": -173.4610595703125, "logps/rejected": -189.3846435546875, "loss": 0.1035, "rewards/accuracies": 1.0, "rewards/chosen": -0.04286568984389305, "rewards/margins": 3.4945671558380127, "rewards/rejected": -3.537432909011841, "step": 745 }, { "epoch": 2.0242914979757085, "grad_norm": 5.46875, "learning_rate": 1.4381133517898803e-06, "logits/chosen": -1.5612472295761108, "logits/rejected": -1.6096746921539307, "logps/chosen": -244.1045684814453, "logps/rejected": -227.0, "loss": 0.068, "rewards/accuracies": 1.0, "rewards/chosen": 0.28070664405822754, "rewards/margins": 4.070573329925537, "rewards/rejected": -3.7898666858673096, "step": 750 }, { "epoch": 2.03778677462888, "grad_norm": 46.75, "learning_rate": 1.4026591142591733e-06, "logits/chosen": -1.4181170463562012, "logits/rejected": -1.5695334672927856, "logps/chosen": -218.1271514892578, "logps/rejected": -171.77000427246094, "loss": 0.1633, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.2111760377883911, "rewards/margins": 2.9384093284606934, "rewards/rejected": -3.149585247039795, "step": 755 }, { "epoch": 2.051282051282051, "grad_norm": 24.5, "learning_rate": 1.3674761714792153e-06, "logits/chosen": -1.5777294635772705, "logits/rejected": -1.6976985931396484, "logps/chosen": -224.3392791748047, "logps/rejected": -254.0798797607422, "loss": 0.0739, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.15005287528038025, "rewards/margins": 4.0651535987854, "rewards/rejected": -3.91510009765625, "step": 760 }, { "epoch": 2.064777327935223, "grad_norm": 23.75, "learning_rate": 1.33257322170213e-06, "logits/chosen": -1.4911249876022339, "logits/rejected": -1.500860571861267, "logps/chosen": -172.9776611328125, "logps/rejected": -201.8260040283203, "loss": 0.1002, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.21206021308898926, "rewards/margins": 3.810685634613037, "rewards/rejected": -3.598625659942627, "step": 765 }, { "epoch": 2.078272604588394, "grad_norm": 28.375, "learning_rate": 1.2979588939575879e-06, "logits/chosen": -1.5784046649932861, "logits/rejected": -1.5579355955123901, "logps/chosen": -192.16024780273438, "logps/rejected": -219.4779510498047, "loss": 0.1696, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.06577552855014801, "rewards/margins": 3.573701858520508, "rewards/rejected": -3.5079262256622314, "step": 770 }, { "epoch": 2.0917678812415654, "grad_norm": 14.8125, "learning_rate": 1.2636417459194536e-06, "logits/chosen": -1.5944167375564575, "logits/rejected": -1.6392465829849243, "logps/chosen": -235.58633422851562, "logps/rejected": -274.0408630371094, "loss": 0.0593, "rewards/accuracies": 1.0, "rewards/chosen": -0.08247147500514984, "rewards/margins": 4.281358242034912, "rewards/rejected": -4.363830089569092, "step": 775 }, { "epoch": 2.1052631578947367, "grad_norm": 5.21875, "learning_rate": 1.2296302617900772e-06, "logits/chosen": -1.5774985551834106, "logits/rejected": -1.6413581371307373, "logps/chosen": -171.0308074951172, "logps/rejected": -183.9725341796875, "loss": 0.0845, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.016073107719421387, "rewards/margins": 3.9465243816375732, "rewards/rejected": -3.9304511547088623, "step": 780 }, { "epoch": 2.118758434547908, "grad_norm": 15.0, "learning_rate": 1.1959328502027556e-06, "logits/chosen": -1.5672693252563477, "logits/rejected": -1.5724976062774658, "logps/chosen": -161.8846435546875, "logps/rejected": -190.6571807861328, "loss": 0.1138, "rewards/accuracies": 1.0, "rewards/chosen": 0.02019577845931053, "rewards/margins": 3.7148475646972656, "rewards/rejected": -3.6946518421173096, "step": 785 }, { "epoch": 2.1322537112010798, "grad_norm": 19.125, "learning_rate": 1.1625578421428714e-06, "logits/chosen": -1.4088555574417114, "logits/rejected": -1.331659197807312, "logps/chosen": -197.23593139648438, "logps/rejected": -279.7657470703125, "loss": 0.1239, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.08394167572259903, "rewards/margins": 3.702916383743286, "rewards/rejected": -3.786858081817627, "step": 790 }, { "epoch": 2.145748987854251, "grad_norm": 19.625, "learning_rate": 1.1295134888882258e-06, "logits/chosen": -1.5858689546585083, "logits/rejected": -1.6758959293365479, "logps/chosen": -194.56253051757812, "logps/rejected": -206.4073028564453, "loss": 0.0922, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.14167055487632751, "rewards/margins": 3.8033957481384277, "rewards/rejected": -3.945065975189209, "step": 795 }, { "epoch": 2.1592442645074224, "grad_norm": 16.25, "learning_rate": 1.0968079599690872e-06, "logits/chosen": -1.5427080392837524, "logits/rejected": -1.509251356124878, "logps/chosen": -227.91281127929688, "logps/rejected": -196.93661499023438, "loss": 0.112, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.0069570960476994514, "rewards/margins": 3.6783218383789062, "rewards/rejected": -3.6852786540985107, "step": 800 }, { "epoch": 2.1727395411605936, "grad_norm": 19.875, "learning_rate": 1.064449341148442e-06, "logits/chosen": -1.624629020690918, "logits/rejected": -1.647383689880371, "logps/chosen": -203.95071411132812, "logps/rejected": -221.9706573486328, "loss": 0.1216, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.1412554681301117, "rewards/margins": 3.5126278400421143, "rewards/rejected": -3.6538829803466797, "step": 805 }, { "epoch": 2.1862348178137654, "grad_norm": 14.5, "learning_rate": 1.0324456324229536e-06, "logits/chosen": -1.4194597005844116, "logits/rejected": -1.3489387035369873, "logps/chosen": -166.34426879882812, "logps/rejected": -239.3138427734375, "loss": 0.0775, "rewards/accuracies": 1.0, "rewards/chosen": -0.0588313452899456, "rewards/margins": 3.9181437492370605, "rewards/rejected": -3.976975202560425, "step": 810 }, { "epoch": 2.1997300944669367, "grad_norm": 35.5, "learning_rate": 1.000804746045138e-06, "logits/chosen": -1.3923031091690063, "logits/rejected": -1.4646499156951904, "logps/chosen": -191.46279907226562, "logps/rejected": -184.79953002929688, "loss": 0.1111, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.008677695877850056, "rewards/margins": 3.193809986114502, "rewards/rejected": -3.2024874687194824, "step": 815 }, { "epoch": 2.213225371120108, "grad_norm": 16.0, "learning_rate": 9.695345045672167e-07, "logits/chosen": -1.4313310384750366, "logits/rejected": -1.4792088270187378, "logps/chosen": -191.17092895507812, "logps/rejected": -196.5364532470703, "loss": 0.118, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.33248454332351685, "rewards/margins": 3.7640583515167236, "rewards/rejected": -4.096542835235596, "step": 820 }, { "epoch": 2.2267206477732793, "grad_norm": 15.5, "learning_rate": 9.386426389071532e-07, "logits/chosen": -1.4152162075042725, "logits/rejected": -1.363843321800232, "logps/chosen": -229.3914031982422, "logps/rejected": -278.37847900390625, "loss": 0.0961, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.30344587564468384, "rewards/margins": 4.63069486618042, "rewards/rejected": -4.934141635894775, "step": 825 }, { "epoch": 2.2402159244264506, "grad_norm": 17.625, "learning_rate": 9.081367864373489e-07, "logits/chosen": -1.3973594903945923, "logits/rejected": -1.524677038192749, "logps/chosen": -168.33126831054688, "logps/rejected": -156.55892944335938, "loss": 0.0944, "rewards/accuracies": 1.0, "rewards/chosen": -0.1414262354373932, "rewards/margins": 3.3840813636779785, "rewards/rejected": -3.5255074501037598, "step": 830 }, { "epoch": 2.2537112010796223, "grad_norm": 11.8125, "learning_rate": 8.780244890964567e-07, "logits/chosen": -1.4209728240966797, "logits/rejected": -1.2569080591201782, "logps/chosen": -177.04782104492188, "logps/rejected": -275.0938415527344, "loss": 0.0698, "rewards/accuracies": 1.0, "rewards/chosen": 0.16639022529125214, "rewards/margins": 3.9153380393981934, "rewards/rejected": -3.748948335647583, "step": 835 }, { "epoch": 2.2672064777327936, "grad_norm": 10.625, "learning_rate": 8.483131915247969e-07, "logits/chosen": -1.563407301902771, "logits/rejected": -1.534883975982666, "logps/chosen": -171.35104370117188, "logps/rejected": -242.4336700439453, "loss": 0.0949, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.2416602075099945, "rewards/margins": 4.914166450500488, "rewards/rejected": -5.155826568603516, "step": 840 }, { "epoch": 2.280701754385965, "grad_norm": 19.75, "learning_rate": 8.190102392238191e-07, "logits/chosen": -1.4438880681991577, "logits/rejected": -1.4186255931854248, "logps/chosen": -154.63705444335938, "logps/rejected": -207.8048858642578, "loss": 0.0783, "rewards/accuracies": 1.0, "rewards/chosen": -0.18597714602947235, "rewards/margins": 4.108304500579834, "rewards/rejected": -4.294281959533691, "step": 845 }, { "epoch": 2.294197031039136, "grad_norm": 32.25, "learning_rate": 7.90122876740086e-07, "logits/chosen": -1.63836669921875, "logits/rejected": -1.5565919876098633, "logps/chosen": -226.85037231445312, "logps/rejected": -326.13421630859375, "loss": 0.0577, "rewards/accuracies": 1.0, "rewards/chosen": 0.0758393257856369, "rewards/margins": 4.579066276550293, "rewards/rejected": -4.503227233886719, "step": 850 }, { "epoch": 2.3076923076923075, "grad_norm": 5.0625, "learning_rate": 7.616582458742059e-07, "logits/chosen": -1.4565999507904053, "logits/rejected": -1.455143928527832, "logps/chosen": -212.2303009033203, "logps/rejected": -276.86834716796875, "loss": 0.0519, "rewards/accuracies": 1.0, "rewards/chosen": -0.1173635721206665, "rewards/margins": 4.344286918640137, "rewards/rejected": -4.46165132522583, "step": 855 }, { "epoch": 2.3211875843454792, "grad_norm": 9.6875, "learning_rate": 7.336233839151693e-07, "logits/chosen": -1.6497745513916016, "logits/rejected": -1.6588242053985596, "logps/chosen": -169.42959594726562, "logps/rejected": -258.19207763671875, "loss": 0.1057, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.21658802032470703, "rewards/margins": 3.805851697921753, "rewards/rejected": -4.022439479827881, "step": 860 }, { "epoch": 2.3346828609986505, "grad_norm": 21.5, "learning_rate": 7.060252219005304e-07, "logits/chosen": -1.520618200302124, "logits/rejected": -1.5337458848953247, "logps/chosen": -227.05679321289062, "logps/rejected": -317.5985107421875, "loss": 0.0774, "rewards/accuracies": 1.0, "rewards/chosen": -0.06503160297870636, "rewards/margins": 4.4666852951049805, "rewards/rejected": -4.531716823577881, "step": 865 }, { "epoch": 2.348178137651822, "grad_norm": 12.5, "learning_rate": 6.788705829028483e-07, "logits/chosen": -1.5424460172653198, "logits/rejected": -1.527999997138977, "logps/chosen": -186.46414184570312, "logps/rejected": -190.83157348632812, "loss": 0.1022, "rewards/accuracies": 1.0, "rewards/chosen": 0.1064692884683609, "rewards/margins": 3.359034776687622, "rewards/rejected": -3.2525649070739746, "step": 870 }, { "epoch": 2.361673414304993, "grad_norm": 66.5, "learning_rate": 6.521661803428225e-07, "logits/chosen": -1.5013136863708496, "logits/rejected": -1.5206286907196045, "logps/chosen": -201.0956268310547, "logps/rejected": -198.01573181152344, "loss": 0.0955, "rewards/accuracies": 1.0, "rewards/chosen": -0.13122853636741638, "rewards/margins": 3.767671585083008, "rewards/rejected": -3.898899793624878, "step": 875 }, { "epoch": 2.375168690958165, "grad_norm": 11.3125, "learning_rate": 6.259186163295439e-07, "logits/chosen": -1.2552602291107178, "logits/rejected": -1.3482682704925537, "logps/chosen": -246.9757080078125, "logps/rejected": -239.8274383544922, "loss": 0.0983, "rewards/accuracies": 1.0, "rewards/chosen": -0.1879548728466034, "rewards/margins": 3.7479751110076904, "rewards/rejected": -3.935929775238037, "step": 880 }, { "epoch": 2.388663967611336, "grad_norm": 16.0, "learning_rate": 6.001343800282569e-07, "logits/chosen": -1.5184439420700073, "logits/rejected": -1.4158121347427368, "logps/chosen": -145.63616943359375, "logps/rejected": -212.58468627929688, "loss": 0.0783, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.3523162603378296, "rewards/margins": 4.166034698486328, "rewards/rejected": -4.5183515548706055, "step": 885 }, { "epoch": 2.4021592442645074, "grad_norm": 12.0625, "learning_rate": 5.748198460560475e-07, "logits/chosen": -1.602419137954712, "logits/rejected": -1.6869083642959595, "logps/chosen": -211.70947265625, "logps/rejected": -220.8863525390625, "loss": 0.0806, "rewards/accuracies": 1.0, "rewards/chosen": 0.16532480716705322, "rewards/margins": 4.41878080368042, "rewards/rejected": -4.253456115722656, "step": 890 }, { "epoch": 2.4156545209176787, "grad_norm": 32.75, "learning_rate": 5.499812729058546e-07, "logits/chosen": -1.56089186668396, "logits/rejected": -1.5883516073226929, "logps/chosen": -181.11459350585938, "logps/rejected": -161.60299682617188, "loss": 0.1433, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.2707998752593994, "rewards/margins": 3.216136932373047, "rewards/rejected": -3.4869370460510254, "step": 895 }, { "epoch": 2.42914979757085, "grad_norm": 14.8125, "learning_rate": 5.256248013991857e-07, "logits/chosen": -1.5014961957931519, "logits/rejected": -1.4206339120864868, "logps/chosen": -226.8283233642578, "logps/rejected": -266.60333251953125, "loss": 0.0715, "rewards/accuracies": 1.0, "rewards/chosen": 0.00030528902425430715, "rewards/margins": 4.552371978759766, "rewards/rejected": -4.552066802978516, "step": 900 }, { "epoch": 2.4426450742240218, "grad_norm": 23.0, "learning_rate": 5.01756453167925e-07, "logits/chosen": -1.5279182195663452, "logits/rejected": -1.5130751132965088, "logps/chosen": -199.68397521972656, "logps/rejected": -246.5128936767578, "loss": 0.0683, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.14631351828575134, "rewards/margins": 4.73899507522583, "rewards/rejected": -4.592680931091309, "step": 905 }, { "epoch": 2.456140350877193, "grad_norm": 21.375, "learning_rate": 4.78382129165613e-07, "logits/chosen": -1.4500765800476074, "logits/rejected": -1.5014575719833374, "logps/chosen": -185.51475524902344, "logps/rejected": -181.7137908935547, "loss": 0.1049, "rewards/accuracies": 1.0, "rewards/chosen": 0.09951256215572357, "rewards/margins": 3.4707932472229004, "rewards/rejected": -3.371281147003174, "step": 910 }, { "epoch": 2.4696356275303644, "grad_norm": 32.5, "learning_rate": 4.5550760820855633e-07, "logits/chosen": -1.557877779006958, "logits/rejected": -1.4586069583892822, "logps/chosen": -209.05062866210938, "logps/rejected": -308.66424560546875, "loss": 0.118, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.2664136290550232, "rewards/margins": 4.0513434410095215, "rewards/rejected": -4.3177571296691895, "step": 915 }, { "epoch": 2.4831309041835357, "grad_norm": 22.5, "learning_rate": 4.3313854554713457e-07, "logits/chosen": -1.5593338012695312, "logits/rejected": -1.5647127628326416, "logps/chosen": -197.6747283935547, "logps/rejected": -253.01876831054688, "loss": 0.0716, "rewards/accuracies": 1.0, "rewards/chosen": 0.0987640991806984, "rewards/margins": 4.090095043182373, "rewards/rejected": -3.9913315773010254, "step": 920 }, { "epoch": 2.4966261808367074, "grad_norm": 20.125, "learning_rate": 4.1128047146765936e-07, "logits/chosen": -1.435847520828247, "logits/rejected": -1.453253149986267, "logps/chosen": -141.46656799316406, "logps/rejected": -162.93905639648438, "loss": 0.1007, "rewards/accuracies": 1.0, "rewards/chosen": 0.20369374752044678, "rewards/margins": 3.790607452392578, "rewards/rejected": -3.586913585662842, "step": 925 }, { "epoch": 2.5101214574898787, "grad_norm": 32.5, "learning_rate": 3.899387899251242e-07, "logits/chosen": -1.499912142753601, "logits/rejected": -1.5055288076400757, "logps/chosen": -179.4788360595703, "logps/rejected": -202.9369354248047, "loss": 0.0808, "rewards/accuracies": 1.0, "rewards/chosen": -0.04291580989956856, "rewards/margins": 3.4943645000457764, "rewards/rejected": -3.537280321121216, "step": 930 }, { "epoch": 2.52361673414305, "grad_norm": 6.59375, "learning_rate": 3.6911877720719053e-07, "logits/chosen": -1.6243568658828735, "logits/rejected": -1.5396671295166016, "logps/chosen": -155.4473419189453, "logps/rejected": -191.9477081298828, "loss": 0.1245, "rewards/accuracies": 1.0, "rewards/chosen": -0.33543360233306885, "rewards/margins": 4.113525867462158, "rewards/rejected": -4.4489593505859375, "step": 935 }, { "epoch": 2.5371120107962213, "grad_norm": 10.3125, "learning_rate": 3.488255806297311e-07, "logits/chosen": -1.4612650871276855, "logits/rejected": -1.6070709228515625, "logps/chosen": -164.7592010498047, "logps/rejected": -161.7231903076172, "loss": 0.1901, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.06939269602298737, "rewards/margins": 3.406930446624756, "rewards/rejected": -3.3375372886657715, "step": 940 }, { "epoch": 2.5506072874493926, "grad_norm": 7.46875, "learning_rate": 3.2906421726426857e-07, "logits/chosen": -1.4703078269958496, "logits/rejected": -1.4379500150680542, "logps/chosen": -204.19473266601562, "logps/rejected": -244.11965942382812, "loss": 0.0904, "rewards/accuracies": 1.0, "rewards/chosen": -0.6960457563400269, "rewards/margins": 4.154335975646973, "rewards/rejected": -4.850381851196289, "step": 945 }, { "epoch": 2.564102564102564, "grad_norm": 17.375, "learning_rate": 3.09839572697605e-07, "logits/chosen": -1.560767412185669, "logits/rejected": -1.4427921772003174, "logps/chosen": -243.10568237304688, "logps/rejected": -232.52108764648438, "loss": 0.0844, "rewards/accuracies": 1.0, "rewards/chosen": -0.049421075731515884, "rewards/margins": 4.088489055633545, "rewards/rejected": -4.137909889221191, "step": 950 }, { "epoch": 2.5775978407557356, "grad_norm": 19.75, "learning_rate": 2.9115639982396166e-07, "logits/chosen": -1.515772819519043, "logits/rejected": -1.6191974878311157, "logps/chosen": -210.3816375732422, "logps/rejected": -198.30801391601562, "loss": 0.1289, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.32715946435928345, "rewards/margins": 3.6732399463653564, "rewards/rejected": -4.000399589538574, "step": 955 }, { "epoch": 2.591093117408907, "grad_norm": 16.875, "learning_rate": 2.7301931766992916e-07, "logits/chosen": -1.53992760181427, "logits/rejected": -1.6426169872283936, "logps/chosen": -202.2464599609375, "logps/rejected": -200.73020935058594, "loss": 0.0916, "rewards/accuracies": 1.0, "rewards/chosen": 0.2096923142671585, "rewards/margins": 3.49652361869812, "rewards/rejected": -3.2868313789367676, "step": 960 }, { "epoch": 2.604588394062078, "grad_norm": 15.875, "learning_rate": 2.554328102525022e-07, "logits/chosen": -1.468806505203247, "logits/rejected": -1.5037376880645752, "logps/chosen": -225.407470703125, "logps/rejected": -265.16326904296875, "loss": 0.0516, "rewards/accuracies": 1.0, "rewards/chosen": 0.1482563018798828, "rewards/margins": 3.908936023712158, "rewards/rejected": -3.760679244995117, "step": 965 }, { "epoch": 2.6180836707152495, "grad_norm": 28.25, "learning_rate": 2.3840122547050482e-07, "logits/chosen": -1.4675546884536743, "logits/rejected": -1.427056074142456, "logps/chosen": -189.55482482910156, "logps/rejected": -238.43399047851562, "loss": 0.128, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.15342381596565247, "rewards/margins": 4.185477256774902, "rewards/rejected": -4.338901042938232, "step": 970 }, { "epoch": 2.6315789473684212, "grad_norm": 10.6875, "learning_rate": 2.219287740296605e-07, "logits/chosen": -1.5017975568771362, "logits/rejected": -1.5283129215240479, "logps/chosen": -185.2952117919922, "logps/rejected": -218.5054168701172, "loss": 0.0971, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.2838120460510254, "rewards/margins": 4.120657444000244, "rewards/rejected": -4.4044694900512695, "step": 975 }, { "epoch": 2.6450742240215925, "grad_norm": 21.0, "learning_rate": 2.060195284015837e-07, "logits/chosen": -1.662113904953003, "logits/rejected": -1.6862503290176392, "logps/chosen": -150.606689453125, "logps/rejected": -198.61793518066406, "loss": 0.1152, "rewards/accuracies": 1.0, "rewards/chosen": -0.2786501944065094, "rewards/margins": 3.8265221118927, "rewards/rejected": -4.105172157287598, "step": 980 }, { "epoch": 2.658569500674764, "grad_norm": 19.75, "learning_rate": 1.9067742181694353e-07, "logits/chosen": -1.4568703174591064, "logits/rejected": -1.4512639045715332, "logps/chosen": -171.15443420410156, "logps/rejected": -221.99526977539062, "loss": 0.096, "rewards/accuracies": 1.0, "rewards/chosen": -0.17055651545524597, "rewards/margins": 5.160454273223877, "rewards/rejected": -5.3310112953186035, "step": 985 }, { "epoch": 2.672064777327935, "grad_norm": 93.0, "learning_rate": 1.75906247293057e-07, "logits/chosen": -1.6594133377075195, "logits/rejected": -1.5529086589813232, "logps/chosen": -156.86392211914062, "logps/rejected": -285.59197998046875, "loss": 0.118, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5136551856994629, "rewards/margins": 4.625790596008301, "rewards/rejected": -5.139446258544922, "step": 990 }, { "epoch": 2.6855600539811064, "grad_norm": 11.3125, "learning_rate": 1.617096566961429e-07, "logits/chosen": -1.466498613357544, "logits/rejected": -1.4549661874771118, "logps/chosen": -155.0102081298828, "logps/rejected": -232.1795654296875, "loss": 0.152, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.09109257161617279, "rewards/margins": 3.467794418334961, "rewards/rejected": -3.558886766433716, "step": 995 }, { "epoch": 2.699055330634278, "grad_norm": 15.9375, "learning_rate": 1.4809115983847267e-07, "logits/chosen": -1.377762794494629, "logits/rejected": -1.3253929615020752, "logps/chosen": -148.2834014892578, "logps/rejected": -208.0382080078125, "loss": 0.1151, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.24366268515586853, "rewards/margins": 3.6103515625, "rewards/rejected": -3.8540141582489014, "step": 1000 }, { "epoch": 2.699055330634278, "eval_logits/chosen": -1.536294937133789, "eval_logits/rejected": -1.5776937007904053, "eval_logps/chosen": -191.7211456298828, "eval_logps/rejected": -226.05455017089844, "eval_loss": 0.31860384345054626, "eval_rewards/accuracies": 0.8524096608161926, "eval_rewards/chosen": -0.7276893258094788, "eval_rewards/margins": 2.395343065261841, "eval_rewards/rejected": -3.1230320930480957, "eval_runtime": 23.3449, "eval_samples_per_second": 14.136, "eval_steps_per_second": 3.555, "step": 1000 }, { "epoch": 2.7125506072874495, "grad_norm": 23.625, "learning_rate": 1.3505412361064395e-07, "logits/chosen": -1.4981733560562134, "logits/rejected": -1.5207927227020264, "logps/chosen": -192.99154663085938, "logps/rejected": -194.6613311767578, "loss": 0.0649, "rewards/accuracies": 1.0, "rewards/chosen": 0.07562440633773804, "rewards/margins": 4.270889759063721, "rewards/rejected": -4.195265769958496, "step": 1005 }, { "epoch": 2.7260458839406208, "grad_norm": 20.5, "learning_rate": 1.226017711491867e-07, "logits/chosen": -1.5061196088790894, "logits/rejected": -1.5956671237945557, "logps/chosen": -170.25169372558594, "logps/rejected": -240.0498046875, "loss": 0.0805, "rewards/accuracies": 1.0, "rewards/chosen": -0.27517637610435486, "rewards/margins": 3.623337507247925, "rewards/rejected": -3.89851450920105, "step": 1010 }, { "epoch": 2.739541160593792, "grad_norm": 31.25, "learning_rate": 1.107371810397076e-07, "logits/chosen": -1.4881411790847778, "logits/rejected": -1.5475780963897705, "logps/chosen": -237.45504760742188, "logps/rejected": -212.13330078125, "loss": 0.0625, "rewards/accuracies": 1.0, "rewards/chosen": -0.10645435005426407, "rewards/margins": 4.086081027984619, "rewards/rejected": -4.192535400390625, "step": 1015 }, { "epoch": 2.753036437246964, "grad_norm": 16.75, "learning_rate": 9.946328655577625e-08, "logits/chosen": -1.5837833881378174, "logits/rejected": -1.6130040884017944, "logps/chosen": -137.10398864746094, "logps/rejected": -171.19357299804688, "loss": 0.0725, "rewards/accuracies": 1.0, "rewards/chosen": -0.27897781133651733, "rewards/margins": 3.8964107036590576, "rewards/rejected": -4.175389289855957, "step": 1020 }, { "epoch": 2.766531713900135, "grad_norm": 28.25, "learning_rate": 8.878287493373245e-08, "logits/chosen": -1.5753690004348755, "logits/rejected": -1.6070302724838257, "logps/chosen": -214.03018188476562, "logps/rejected": -189.55850219726562, "loss": 0.1188, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.10192601382732391, "rewards/margins": 3.4568443298339844, "rewards/rejected": -3.558769941329956, "step": 1025 }, { "epoch": 2.7800269905533064, "grad_norm": 26.625, "learning_rate": 7.869858668360042e-08, "logits/chosen": -1.4193127155303955, "logits/rejected": -1.2717030048370361, "logps/chosen": -187.0641632080078, "logps/rejected": -224.65066528320312, "loss": 0.1028, "rewards/accuracies": 1.0, "rewards/chosen": -0.18936040997505188, "rewards/margins": 4.242377758026123, "rewards/rejected": -4.43173885345459, "step": 1030 }, { "epoch": 2.7935222672064777, "grad_norm": 24.75, "learning_rate": 6.921291493627747e-08, "logits/chosen": -1.6177479028701782, "logits/rejected": -1.6725289821624756, "logps/chosen": -248.9903564453125, "logps/rejected": -230.86611938476562, "loss": 0.0856, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.29994627833366394, "rewards/margins": 3.9232945442199707, "rewards/rejected": -3.6233487129211426, "step": 1035 }, { "epoch": 2.807017543859649, "grad_norm": 42.5, "learning_rate": 6.032820482716001e-08, "logits/chosen": -1.5851434469223022, "logits/rejected": -1.5880482196807861, "logps/chosen": -155.3755340576172, "logps/rejected": -186.6389617919922, "loss": 0.1754, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.19228528439998627, "rewards/margins": 3.5754799842834473, "rewards/rejected": -3.7677650451660156, "step": 1040 }, { "epoch": 2.8205128205128203, "grad_norm": 43.0, "learning_rate": 5.204665291635519e-08, "logits/chosen": -1.496819019317627, "logits/rejected": -1.5007538795471191, "logps/chosen": -179.5200653076172, "logps/rejected": -266.001953125, "loss": 0.1038, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.35474246740341187, "rewards/margins": 3.8987841606140137, "rewards/rejected": -4.253526210784912, "step": 1045 }, { "epoch": 2.834008097165992, "grad_norm": 27.5, "learning_rate": 4.437030664562969e-08, "logits/chosen": -1.470956563949585, "logits/rejected": -1.52825927734375, "logps/chosen": -203.93551635742188, "logps/rejected": -220.02639770507812, "loss": 0.1639, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.0977933406829834, "rewards/margins": 3.205706834793091, "rewards/rejected": -3.3035004138946533, "step": 1050 }, { "epoch": 2.8475033738191633, "grad_norm": 65.0, "learning_rate": 3.730106383222132e-08, "logits/chosen": -1.5251743793487549, "logits/rejected": -1.3242510557174683, "logps/chosen": -186.79141235351562, "logps/rejected": -250.46566772460938, "loss": 0.0909, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.299465537071228, "rewards/margins": 4.545691967010498, "rewards/rejected": -4.845158100128174, "step": 1055 }, { "epoch": 2.8609986504723346, "grad_norm": 19.75, "learning_rate": 3.084067219964182e-08, "logits/chosen": -1.527754783630371, "logits/rejected": -1.5058457851409912, "logps/chosen": -173.50900268554688, "logps/rejected": -246.65628051757812, "loss": 0.2529, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.42231351137161255, "rewards/margins": 3.4638805389404297, "rewards/rejected": -3.8861937522888184, "step": 1060 }, { "epoch": 2.8744939271255063, "grad_norm": 43.75, "learning_rate": 2.499072894559057e-08, "logits/chosen": -1.6412513256072998, "logits/rejected": -1.6829668283462524, "logps/chosen": -180.06788635253906, "logps/rejected": -219.94528198242188, "loss": 0.1089, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.23302574455738068, "rewards/margins": 3.369854688644409, "rewards/rejected": -3.6028804779052734, "step": 1065 }, { "epoch": 2.8879892037786776, "grad_norm": 13.75, "learning_rate": 1.975268034707878e-08, "logits/chosen": -1.4751927852630615, "logits/rejected": -1.5141003131866455, "logps/chosen": -204.79470825195312, "logps/rejected": -223.97509765625, "loss": 0.0822, "rewards/accuracies": 1.0, "rewards/chosen": 0.15369097888469696, "rewards/margins": 3.9829258918762207, "rewards/rejected": -3.8292346000671387, "step": 1070 }, { "epoch": 2.901484480431849, "grad_norm": 39.0, "learning_rate": 1.512782140286939e-08, "logits/chosen": -1.4587006568908691, "logits/rejected": -1.5042657852172852, "logps/chosen": -156.6952667236328, "logps/rejected": -263.0159912109375, "loss": 0.0959, "rewards/accuracies": 1.0, "rewards/chosen": -0.11610189825296402, "rewards/margins": 3.9884142875671387, "rewards/rejected": -4.10451602935791, "step": 1075 }, { "epoch": 2.91497975708502, "grad_norm": 17.75, "learning_rate": 1.1117295513313475e-08, "logits/chosen": -1.665400743484497, "logits/rejected": -1.6617343425750732, "logps/chosen": -161.07443237304688, "logps/rejected": -207.7931671142578, "loss": 0.0872, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.10158289968967438, "rewards/margins": 4.066722869873047, "rewards/rejected": -3.965139865875244, "step": 1080 }, { "epoch": 2.9284750337381915, "grad_norm": 20.75, "learning_rate": 7.72209419766995e-09, "logits/chosen": -1.4860131740570068, "logits/rejected": -1.3406977653503418, "logps/chosen": -168.0951690673828, "logps/rejected": -274.35113525390625, "loss": 0.1053, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.44807571172714233, "rewards/margins": 3.924337863922119, "rewards/rejected": -4.372413635253906, "step": 1085 }, { "epoch": 2.941970310391363, "grad_norm": 16.25, "learning_rate": 4.943056848972227e-09, "logits/chosen": -1.493690848350525, "logits/rejected": -1.5224257707595825, "logps/chosen": -209.3112335205078, "logps/rejected": -208.22988891601562, "loss": 0.0878, "rewards/accuracies": 1.0, "rewards/chosen": 0.035886406898498535, "rewards/margins": 3.8605358600616455, "rewards/rejected": -3.8246493339538574, "step": 1090 }, { "epoch": 2.9554655870445345, "grad_norm": 14.8125, "learning_rate": 2.7808705265053305e-09, "logits/chosen": -1.571223497390747, "logits/rejected": -1.5577231645584106, "logps/chosen": -169.42562866210938, "logps/rejected": -181.50631713867188, "loss": 0.1109, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.13199149072170258, "rewards/margins": 3.612278699874878, "rewards/rejected": -3.744269847869873, "step": 1095 }, { "epoch": 2.968960863697706, "grad_norm": 27.75, "learning_rate": 1.2360697859462035e-09, "logits/chosen": -1.5886671543121338, "logits/rejected": -1.562727928161621, "logps/chosen": -162.84046936035156, "logps/rejected": -219.8025360107422, "loss": 0.118, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.3454614281654358, "rewards/margins": 4.1069793701171875, "rewards/rejected": -4.4524407386779785, "step": 1100 }, { "epoch": 2.982456140350877, "grad_norm": 13.0625, "learning_rate": 3.090365472041557e-10, "logits/chosen": -1.5336341857910156, "logits/rejected": -1.5714600086212158, "logps/chosen": -217.091064453125, "logps/rejected": -239.0583953857422, "loss": 0.1793, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.2769380509853363, "rewards/margins": 3.7623977661132812, "rewards/rejected": -4.039335250854492, "step": 1105 }, { "epoch": 2.9959514170040484, "grad_norm": 16.75, "learning_rate": 0.0, "logits/chosen": -1.4733049869537354, "logits/rejected": -1.4821723699569702, "logps/chosen": -191.77438354492188, "logps/rejected": -275.19158935546875, "loss": 0.0661, "rewards/accuracies": 1.0, "rewards/chosen": 0.11918088048696518, "rewards/margins": 4.132817268371582, "rewards/rejected": -4.013636589050293, "step": 1110 }, { "epoch": 2.9959514170040484, "step": 1110, "total_flos": 4.5615607240812134e+17, "train_loss": 0.26195224279218965, "train_runtime": 3105.2921, "train_samples_per_second": 2.862, "train_steps_per_second": 0.357 } ], "logging_steps": 5, "max_steps": 1110, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.5615607240812134e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }