diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3404 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9959514170040484, + "eval_steps": 500, + "global_step": 1110, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01349527665317139, + "grad_norm": 85.5, + "learning_rate": 2.2522522522522524e-07, + "logits/chosen": -1.500240683555603, + "logits/rejected": -1.5190627574920654, + "logps/chosen": -159.05484008789062, + "logps/rejected": -164.59542846679688, + "loss": 0.6946, + "rewards/accuracies": 0.3499999940395355, + "rewards/chosen": 0.006750366650521755, + "rewards/margins": -0.002313111675903201, + "rewards/rejected": 0.0090634785592556, + "step": 5 + }, + { + "epoch": 0.02699055330634278, + "grad_norm": 92.5, + "learning_rate": 4.504504504504505e-07, + "logits/chosen": -1.4508098363876343, + "logits/rejected": -1.4352288246154785, + "logps/chosen": -141.31773376464844, + "logps/rejected": -167.95175170898438, + "loss": 0.7035, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.00739473570138216, + "rewards/margins": -0.01960981823503971, + "rewards/rejected": 0.01221508253365755, + "step": 10 + }, + { + "epoch": 0.04048582995951417, + "grad_norm": 74.0, + "learning_rate": 6.756756756756758e-07, + "logits/chosen": -1.3884494304656982, + "logits/rejected": -1.3975419998168945, + "logps/chosen": -192.84548950195312, + "logps/rejected": -180.82046508789062, + "loss": 0.6966, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.004980484023690224, + "rewards/margins": -0.006102551706135273, + "rewards/rejected": 0.011083034798502922, + "step": 15 + }, + { + "epoch": 0.05398110661268556, + "grad_norm": 99.0, + "learning_rate": 9.00900900900901e-07, + "logits/chosen": -1.4855096340179443, + "logits/rejected": -1.4922425746917725, + "logps/chosen": -148.1718292236328, + "logps/rejected": -152.18133544921875, + "loss": 0.6843, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.002431074623018503, + "rewards/margins": 0.018751021474599838, + "rewards/rejected": -0.016319945454597473, + "step": 20 + }, + { + "epoch": 0.06747638326585695, + "grad_norm": 113.0, + "learning_rate": 1.1261261261261262e-06, + "logits/chosen": -1.4175087213516235, + "logits/rejected": -1.4836245775222778, + "logps/chosen": -264.17132568359375, + "logps/rejected": -193.3080596923828, + "loss": 0.6911, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.002699580043554306, + "rewards/margins": 0.005426598247140646, + "rewards/rejected": -0.00272701820358634, + "step": 25 + }, + { + "epoch": 0.08097165991902834, + "grad_norm": 89.0, + "learning_rate": 1.3513513513513515e-06, + "logits/chosen": -1.3333433866500854, + "logits/rejected": -1.4199435710906982, + "logps/chosen": -220.9799041748047, + "logps/rejected": -186.35690307617188, + "loss": 0.688, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.009898080490529537, + "rewards/margins": 0.012090040370821953, + "rewards/rejected": -0.0021919584833085537, + "step": 30 + }, + { + "epoch": 0.09446693657219973, + "grad_norm": 66.5, + "learning_rate": 1.5765765765765766e-06, + "logits/chosen": -1.5576092004776, + "logits/rejected": -1.493931770324707, + "logps/chosen": -148.85377502441406, + "logps/rejected": -168.85574340820312, + "loss": 0.6811, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.014485938474535942, + "rewards/margins": 0.025426441803574562, + "rewards/rejected": -0.010940502397716045, + "step": 35 + }, + { + "epoch": 0.10796221322537113, + "grad_norm": 87.5, + "learning_rate": 1.801801801801802e-06, + "logits/chosen": -1.460998296737671, + "logits/rejected": -1.4714558124542236, + "logps/chosen": -165.34341430664062, + "logps/rejected": -167.67092895507812, + "loss": 0.6808, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.018663501366972923, + "rewards/margins": 0.027817577123641968, + "rewards/rejected": -0.009154075756669044, + "step": 40 + }, + { + "epoch": 0.1214574898785425, + "grad_norm": 93.0, + "learning_rate": 2.0270270270270273e-06, + "logits/chosen": -1.3859444856643677, + "logits/rejected": -1.4024606943130493, + "logps/chosen": -162.58734130859375, + "logps/rejected": -191.04025268554688, + "loss": 0.6846, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.009018613025546074, + "rewards/margins": 0.019761864095926285, + "rewards/rejected": -0.010743250139057636, + "step": 45 + }, + { + "epoch": 0.1349527665317139, + "grad_norm": 89.5, + "learning_rate": 2.2522522522522524e-06, + "logits/chosen": -1.4222023487091064, + "logits/rejected": -1.54598069190979, + "logps/chosen": -285.5871276855469, + "logps/rejected": -167.19281005859375, + "loss": 0.6684, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02634511888027191, + "rewards/margins": 0.052618540823459625, + "rewards/rejected": -0.026273420080542564, + "step": 50 + }, + { + "epoch": 0.1484480431848853, + "grad_norm": 69.5, + "learning_rate": 2.4774774774774775e-06, + "logits/chosen": -1.5841736793518066, + "logits/rejected": -1.516913890838623, + "logps/chosen": -170.33505249023438, + "logps/rejected": -188.19314575195312, + "loss": 0.6639, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.004526221659034491, + "rewards/margins": 0.06425820291042328, + "rewards/rejected": -0.06878442317247391, + "step": 55 + }, + { + "epoch": 0.16194331983805668, + "grad_norm": 72.0, + "learning_rate": 2.702702702702703e-06, + "logits/chosen": -1.438759207725525, + "logits/rejected": -1.3985353708267212, + "logps/chosen": -198.15411376953125, + "logps/rejected": -208.3758544921875, + "loss": 0.6501, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.047606997191905975, + "rewards/margins": 0.09706764668226242, + "rewards/rejected": -0.049460653215646744, + "step": 60 + }, + { + "epoch": 0.17543859649122806, + "grad_norm": 164.0, + "learning_rate": 2.927927927927928e-06, + "logits/chosen": -1.4191879034042358, + "logits/rejected": -1.5293009281158447, + "logps/chosen": -217.4423370361328, + "logps/rejected": -202.1327362060547, + "loss": 0.6846, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.014217356219887733, + "rewards/margins": 0.027354473248124123, + "rewards/rejected": -0.013137114234268665, + "step": 65 + }, + { + "epoch": 0.18893387314439947, + "grad_norm": 75.5, + "learning_rate": 3.1531531531531532e-06, + "logits/chosen": -1.510615587234497, + "logits/rejected": -1.5524317026138306, + "logps/chosen": -277.9597473144531, + "logps/rejected": -174.99221801757812, + "loss": 0.6538, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.01016303151845932, + "rewards/margins": 0.08965723216533661, + "rewards/rejected": -0.07949419319629669, + "step": 70 + }, + { + "epoch": 0.20242914979757085, + "grad_norm": 127.5, + "learning_rate": 3.3783783783783788e-06, + "logits/chosen": -1.5467108488082886, + "logits/rejected": -1.7057151794433594, + "logps/chosen": -236.87759399414062, + "logps/rejected": -171.19088745117188, + "loss": 0.6316, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.024651767686009407, + "rewards/margins": 0.13629736006259918, + "rewards/rejected": -0.11164556443691254, + "step": 75 + }, + { + "epoch": 0.21592442645074225, + "grad_norm": 67.0, + "learning_rate": 3.603603603603604e-06, + "logits/chosen": -1.3438420295715332, + "logits/rejected": -1.5014269351959229, + "logps/chosen": -211.7142791748047, + "logps/rejected": -149.79403686523438, + "loss": 0.6296, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.016034509986639023, + "rewards/margins": 0.1428973227739334, + "rewards/rejected": -0.12686282396316528, + "step": 80 + }, + { + "epoch": 0.22941970310391363, + "grad_norm": 67.0, + "learning_rate": 3.828828828828829e-06, + "logits/chosen": -1.580759048461914, + "logits/rejected": -1.5942776203155518, + "logps/chosen": -186.5341339111328, + "logps/rejected": -198.06871032714844, + "loss": 0.6112, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.011369394138455391, + "rewards/margins": 0.18740348517894745, + "rewards/rejected": -0.1987728774547577, + "step": 85 + }, + { + "epoch": 0.242914979757085, + "grad_norm": 104.5, + "learning_rate": 4.0540540540540545e-06, + "logits/chosen": -1.5142263174057007, + "logits/rejected": -1.526908040046692, + "logps/chosen": -172.0498504638672, + "logps/rejected": -204.1090545654297, + "loss": 0.5947, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.001767634996213019, + "rewards/margins": 0.23096399009227753, + "rewards/rejected": -0.2327316552400589, + "step": 90 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 67.0, + "learning_rate": 4.27927927927928e-06, + "logits/chosen": -1.2964483499526978, + "logits/rejected": -1.287847876548767, + "logps/chosen": -152.49652099609375, + "logps/rejected": -162.25242614746094, + "loss": 0.6261, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.031346581876277924, + "rewards/margins": 0.17656004428863525, + "rewards/rejected": -0.20790663361549377, + "step": 95 + }, + { + "epoch": 0.2699055330634278, + "grad_norm": 122.0, + "learning_rate": 4.504504504504505e-06, + "logits/chosen": -1.6146646738052368, + "logits/rejected": -1.6288648843765259, + "logps/chosen": -245.85440063476562, + "logps/rejected": -252.33163452148438, + "loss": 0.5388, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.008112089708447456, + "rewards/margins": 0.4617583155632019, + "rewards/rejected": -0.4536462426185608, + "step": 100 + }, + { + "epoch": 0.2834008097165992, + "grad_norm": 54.75, + "learning_rate": 4.72972972972973e-06, + "logits/chosen": -1.7181060314178467, + "logits/rejected": -1.6348508596420288, + "logps/chosen": -181.34054565429688, + "logps/rejected": -187.49969482421875, + "loss": 0.5332, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.032877303659915924, + "rewards/margins": 0.5002557635307312, + "rewards/rejected": -0.5331330895423889, + "step": 105 + }, + { + "epoch": 0.2968960863697706, + "grad_norm": 93.5, + "learning_rate": 4.954954954954955e-06, + "logits/chosen": -1.471880555152893, + "logits/rejected": -1.4882009029388428, + "logps/chosen": -239.46017456054688, + "logps/rejected": -203.43408203125, + "loss": 0.639, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.20699986815452576, + "rewards/margins": 0.2872315049171448, + "rewards/rejected": -0.49423137307167053, + "step": 110 + }, + { + "epoch": 0.31039136302294196, + "grad_norm": 83.5, + "learning_rate": 4.999802215142814e-06, + "logits/chosen": -1.572249174118042, + "logits/rejected": -1.5214914083480835, + "logps/chosen": -181.75244140625, + "logps/rejected": -206.9883270263672, + "loss": 0.4953, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2786533534526825, + "rewards/margins": 0.6539293527603149, + "rewards/rejected": -0.932582676410675, + "step": 115 + }, + { + "epoch": 0.32388663967611336, + "grad_norm": 63.25, + "learning_rate": 4.998998767795805e-06, + "logits/chosen": -1.3965647220611572, + "logits/rejected": -1.5122724771499634, + "logps/chosen": -185.1367645263672, + "logps/rejected": -141.9375457763672, + "loss": 0.5188, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12487339973449707, + "rewards/margins": 0.5116696357727051, + "rewards/rejected": -0.6365430951118469, + "step": 120 + }, + { + "epoch": 0.33738191632928477, + "grad_norm": 94.5, + "learning_rate": 4.9975774948882615e-06, + "logits/chosen": -1.5592033863067627, + "logits/rejected": -1.5545122623443604, + "logps/chosen": -134.59095764160156, + "logps/rejected": -159.44424438476562, + "loss": 0.5878, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.218244269490242, + "rewards/margins": 0.560061514377594, + "rewards/rejected": -0.7783057689666748, + "step": 125 + }, + { + "epoch": 0.3508771929824561, + "grad_norm": 159.0, + "learning_rate": 4.995538747800403e-06, + "logits/chosen": -1.5116926431655884, + "logits/rejected": -1.5991663932800293, + "logps/chosen": -196.37417602539062, + "logps/rejected": -162.26467895507812, + "loss": 0.555, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6864209175109863, + "rewards/margins": 0.5580738186836243, + "rewards/rejected": -1.2444946765899658, + "step": 130 + }, + { + "epoch": 0.3643724696356275, + "grad_norm": 77.5, + "learning_rate": 4.9928830305701164e-06, + "logits/chosen": -1.4444091320037842, + "logits/rejected": -1.404262661933899, + "logps/chosen": -185.04042053222656, + "logps/rejected": -186.958740234375, + "loss": 0.4598, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22133490443229675, + "rewards/margins": 0.7992109060287476, + "rewards/rejected": -1.0205457210540771, + "step": 135 + }, + { + "epoch": 0.37786774628879893, + "grad_norm": 50.25, + "learning_rate": 4.98961099976835e-06, + "logits/chosen": -1.5445549488067627, + "logits/rejected": -1.586544156074524, + "logps/chosen": -199.28408813476562, + "logps/rejected": -183.11032104492188, + "loss": 0.4536, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.06479507684707642, + "rewards/margins": 0.9296582341194153, + "rewards/rejected": -0.9944533109664917, + "step": 140 + }, + { + "epoch": 0.3913630229419703, + "grad_norm": 68.0, + "learning_rate": 4.985723464336783e-06, + "logits/chosen": -1.4274847507476807, + "logits/rejected": -1.4104160070419312, + "logps/chosen": -185.9368896484375, + "logps/rejected": -188.2207489013672, + "loss": 0.4902, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.17553560435771942, + "rewards/margins": 0.6832131743431091, + "rewards/rejected": -0.8587487936019897, + "step": 145 + }, + { + "epoch": 0.4048582995951417, + "grad_norm": 65.0, + "learning_rate": 4.9812213853878376e-06, + "logits/chosen": -1.6410919427871704, + "logits/rejected": -1.6832342147827148, + "logps/chosen": -168.22726440429688, + "logps/rejected": -165.28591918945312, + "loss": 0.4942, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.19691412150859833, + "rewards/margins": 0.8052200078964233, + "rewards/rejected": -1.002134084701538, + "step": 150 + }, + { + "epoch": 0.4183535762483131, + "grad_norm": 84.0, + "learning_rate": 4.9761058759670625e-06, + "logits/chosen": -1.4086945056915283, + "logits/rejected": -1.3933309316635132, + "logps/chosen": -200.54226684570312, + "logps/rejected": -191.30516052246094, + "loss": 0.5805, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.38961219787597656, + "rewards/margins": 0.6619648337364197, + "rewards/rejected": -1.051577091217041, + "step": 155 + }, + { + "epoch": 0.4318488529014845, + "grad_norm": 48.75, + "learning_rate": 4.970378200777949e-06, + "logits/chosen": -1.4240281581878662, + "logits/rejected": -1.5275284051895142, + "logps/chosen": -149.6121826171875, + "logps/rejected": -153.7329864501953, + "loss": 0.3726, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.22904136776924133, + "rewards/margins": 1.2087788581848145, + "rewards/rejected": -1.4378201961517334, + "step": 160 + }, + { + "epoch": 0.44534412955465585, + "grad_norm": 57.5, + "learning_rate": 4.964039775869271e-06, + "logits/chosen": -1.5353929996490479, + "logits/rejected": -1.5400171279907227, + "logps/chosen": -172.69320678710938, + "logps/rejected": -186.09596252441406, + "loss": 0.4821, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14687059819698334, + "rewards/margins": 1.0381742715835571, + "rewards/rejected": -1.1850450038909912, + "step": 165 + }, + { + "epoch": 0.45883940620782726, + "grad_norm": 68.5, + "learning_rate": 4.957092168284987e-06, + "logits/chosen": -1.5351091623306274, + "logits/rejected": -1.480067253112793, + "logps/chosen": -224.7134246826172, + "logps/rejected": -280.2825012207031, + "loss": 0.4522, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.15150094032287598, + "rewards/margins": 0.8322998881340027, + "rewards/rejected": -0.9838007092475891, + "step": 170 + }, + { + "epoch": 0.47233468286099867, + "grad_norm": 47.25, + "learning_rate": 4.949537095676824e-06, + "logits/chosen": -1.5415345430374146, + "logits/rejected": -1.4604427814483643, + "logps/chosen": -173.94085693359375, + "logps/rejected": -215.93075561523438, + "loss": 0.45, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3776322901248932, + "rewards/margins": 1.5937398672103882, + "rewards/rejected": -1.9713722467422485, + "step": 175 + }, + { + "epoch": 0.48582995951417, + "grad_norm": 95.5, + "learning_rate": 4.9413764258796236e-06, + "logits/chosen": -1.5088344812393188, + "logits/rejected": -1.6158044338226318, + "logps/chosen": -273.03594970703125, + "logps/rejected": -221.93997192382812, + "loss": 0.5881, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25630080699920654, + "rewards/margins": 0.5983410477638245, + "rewards/rejected": -0.8546417951583862, + "step": 180 + }, + { + "epoch": 0.4993252361673414, + "grad_norm": 83.0, + "learning_rate": 4.93261217644956e-06, + "logits/chosen": -1.3866004943847656, + "logits/rejected": -1.363396406173706, + "logps/chosen": -211.2840576171875, + "logps/rejected": -256.87811279296875, + "loss": 0.4912, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.24753907322883606, + "rewards/margins": 0.9087351560592651, + "rewards/rejected": -1.1562741994857788, + "step": 185 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 79.0, + "learning_rate": 4.923246514165339e-06, + "logits/chosen": -1.357788324356079, + "logits/rejected": -1.322389841079712, + "logps/chosen": -221.6494598388672, + "logps/rejected": -238.56637573242188, + "loss": 0.3841, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.21661829948425293, + "rewards/margins": 1.6020748615264893, + "rewards/rejected": -1.8186931610107422, + "step": 190 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 78.0, + "learning_rate": 4.913281754492509e-06, + "logits/chosen": -1.5164716243743896, + "logits/rejected": -1.5658130645751953, + "logps/chosen": -211.942138671875, + "logps/rejected": -251.4232177734375, + "loss": 0.439, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2759682238101959, + "rewards/margins": 1.2201299667358398, + "rewards/rejected": -1.4960981607437134, + "step": 195 + }, + { + "epoch": 0.5398110661268556, + "grad_norm": 68.0, + "learning_rate": 4.902720361011007e-06, + "logits/chosen": -1.43938148021698, + "logits/rejected": -1.4012665748596191, + "logps/chosen": -198.0753936767578, + "logps/rejected": -230.1431121826172, + "loss": 0.436, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.4660988748073578, + "rewards/margins": 1.3129799365997314, + "rewards/rejected": -1.7790788412094116, + "step": 200 + }, + { + "epoch": 0.553306342780027, + "grad_norm": 116.0, + "learning_rate": 4.891564944806095e-06, + "logits/chosen": -1.3829123973846436, + "logits/rejected": -1.4532912969589233, + "logps/chosen": -204.92056274414062, + "logps/rejected": -184.2178192138672, + "loss": 0.4408, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4832437038421631, + "rewards/margins": 1.4000451564788818, + "rewards/rejected": -1.8832887411117554, + "step": 205 + }, + { + "epoch": 0.5668016194331984, + "grad_norm": 39.0, + "learning_rate": 4.879818263822816e-06, + "logits/chosen": -1.5301909446716309, + "logits/rejected": -1.4669263362884521, + "logps/chosen": -176.71139526367188, + "logps/rejected": -210.8941192626953, + "loss": 0.4359, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.7508934140205383, + "rewards/margins": 1.5884822607040405, + "rewards/rejected": -2.3393757343292236, + "step": 210 + }, + { + "epoch": 0.5802968960863698, + "grad_norm": 118.5, + "learning_rate": 4.867483222184158e-06, + "logits/chosen": -1.4969114065170288, + "logits/rejected": -1.4513076543807983, + "logps/chosen": -183.51742553710938, + "logps/rejected": -234.21078491210938, + "loss": 0.4083, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.1092641353607178, + "rewards/margins": 2.7672932147979736, + "rewards/rejected": -4.876556873321533, + "step": 215 + }, + { + "epoch": 0.5937921727395412, + "grad_norm": 82.5, + "learning_rate": 4.854562869473063e-06, + "logits/chosen": -1.6114156246185303, + "logits/rejected": -1.6086403131484985, + "logps/chosen": -158.5917510986328, + "logps/rejected": -182.981689453125, + "loss": 0.5288, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8133976459503174, + "rewards/margins": 2.3693175315856934, + "rewards/rejected": -4.182714939117432, + "step": 220 + }, + { + "epoch": 0.6072874493927125, + "grad_norm": 64.5, + "learning_rate": 4.841060399978481e-06, + "logits/chosen": -1.4258265495300293, + "logits/rejected": -1.5041557550430298, + "logps/chosen": -203.29505920410156, + "logps/rejected": -173.55667114257812, + "loss": 0.467, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.451561838388443, + "rewards/margins": 0.9895628094673157, + "rewards/rejected": -1.4411247968673706, + "step": 225 + }, + { + "epoch": 0.6207827260458839, + "grad_norm": 53.75, + "learning_rate": 4.826979151905655e-06, + "logits/chosen": -1.3954380750656128, + "logits/rejected": -1.4369020462036133, + "logps/chosen": -133.7052764892578, + "logps/rejected": -152.63189697265625, + "loss": 0.3819, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.21247024834156036, + "rewards/margins": 1.1218936443328857, + "rewards/rejected": -1.3343639373779297, + "step": 230 + }, + { + "epoch": 0.6342780026990553, + "grad_norm": 34.25, + "learning_rate": 4.812322606550813e-06, + "logits/chosen": -1.477416753768921, + "logits/rejected": -1.35099196434021, + "logps/chosen": -183.8603057861328, + "logps/rejected": -200.47122192382812, + "loss": 0.403, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22856561839580536, + "rewards/margins": 1.1782000064849854, + "rewards/rejected": -1.4067654609680176, + "step": 235 + }, + { + "epoch": 0.6477732793522267, + "grad_norm": 142.0, + "learning_rate": 4.7970943874404904e-06, + "logits/chosen": -1.5746204853057861, + "logits/rejected": -1.5317301750183105, + "logps/chosen": -132.62966918945312, + "logps/rejected": -169.4604034423828, + "loss": 0.4905, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2887588441371918, + "rewards/margins": 1.0178512334823608, + "rewards/rejected": -1.3066102266311646, + "step": 240 + }, + { + "epoch": 0.6612685560053981, + "grad_norm": 81.5, + "learning_rate": 4.781298259435691e-06, + "logits/chosen": -1.4620139598846436, + "logits/rejected": -1.5366100072860718, + "logps/chosen": -207.0232696533203, + "logps/rejected": -182.5987548828125, + "loss": 0.3498, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.38011789321899414, + "rewards/margins": 1.517073392868042, + "rewards/rejected": -1.8971912860870361, + "step": 245 + }, + { + "epoch": 0.6747638326585695, + "grad_norm": 59.0, + "learning_rate": 4.7649381278011e-06, + "logits/chosen": -1.525059700012207, + "logits/rejected": -1.4892899990081787, + "logps/chosen": -132.02548217773438, + "logps/rejected": -172.75595092773438, + "loss": 0.4596, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.47625675797462463, + "rewards/margins": 1.6200672388076782, + "rewards/rejected": -2.0963237285614014, + "step": 250 + }, + { + "epoch": 0.6882591093117408, + "grad_norm": 93.5, + "learning_rate": 4.748018037239592e-06, + "logits/chosen": -1.6185624599456787, + "logits/rejected": -1.6007747650146484, + "logps/chosen": -190.04196166992188, + "logps/rejected": -271.9373474121094, + "loss": 0.377, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.29186195135116577, + "rewards/margins": 1.4247747659683228, + "rewards/rejected": -1.7166366577148438, + "step": 255 + }, + { + "epoch": 0.7017543859649122, + "grad_norm": 54.75, + "learning_rate": 4.7305421708922596e-06, + "logits/chosen": -1.5387685298919678, + "logits/rejected": -1.4462766647338867, + "logps/chosen": -199.54568481445312, + "logps/rejected": -219.14901733398438, + "loss": 0.5013, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.4976847767829895, + "rewards/margins": 1.649714708328247, + "rewards/rejected": -2.147399425506592, + "step": 260 + }, + { + "epoch": 0.7152496626180836, + "grad_norm": 92.0, + "learning_rate": 4.712514849304219e-06, + "logits/chosen": -1.4592026472091675, + "logits/rejected": -1.5086675882339478, + "logps/chosen": -203.43939208984375, + "logps/rejected": -182.27008056640625, + "loss": 0.3704, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.30615222454071045, + "rewards/margins": 1.7558097839355469, + "rewards/rejected": -2.0619618892669678, + "step": 265 + }, + { + "epoch": 0.728744939271255, + "grad_norm": 94.0, + "learning_rate": 4.693940529356444e-06, + "logits/chosen": -1.5462654829025269, + "logits/rejected": -1.5494886636734009, + "logps/chosen": -204.8282470703125, + "logps/rejected": -262.1166076660156, + "loss": 0.4081, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18543025851249695, + "rewards/margins": 1.581555724143982, + "rewards/rejected": -1.7669861316680908, + "step": 270 + }, + { + "epoch": 0.7422402159244265, + "grad_norm": 49.5, + "learning_rate": 4.674823803163899e-06, + "logits/chosen": -1.5121240615844727, + "logits/rejected": -1.378418207168579, + "logps/chosen": -176.5196533203125, + "logps/rejected": -259.83154296875, + "loss": 0.2792, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2915424704551697, + "rewards/margins": 2.276181697845459, + "rewards/rejected": -2.5677244663238525, + "step": 275 + }, + { + "epoch": 0.7557354925775979, + "grad_norm": 63.5, + "learning_rate": 4.655169396940229e-06, + "logits/chosen": -1.488743782043457, + "logits/rejected": -1.4984915256500244, + "logps/chosen": -227.04574584960938, + "logps/rejected": -223.5692596435547, + "loss": 0.3756, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.3987753987312317, + "rewards/margins": 1.6648337841033936, + "rewards/rejected": -2.0636088848114014, + "step": 280 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 62.75, + "learning_rate": 4.6349821698293025e-06, + "logits/chosen": -1.4782928228378296, + "logits/rejected": -1.480554223060608, + "logps/chosen": -168.77146911621094, + "logps/rejected": -283.3312683105469, + "loss": 0.3639, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.32490164041519165, + "rewards/margins": 1.6070611476898193, + "rewards/rejected": -1.9319626092910767, + "step": 285 + }, + { + "epoch": 0.7827260458839406, + "grad_norm": 85.0, + "learning_rate": 4.6142671127038905e-06, + "logits/chosen": -1.5204181671142578, + "logits/rejected": -1.4846007823944092, + "logps/chosen": -122.49859619140625, + "logps/rejected": -159.67666625976562, + "loss": 0.3855, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.5428152680397034, + "rewards/margins": 1.4056587219238281, + "rewards/rejected": -1.9484741687774658, + "step": 290 + }, + { + "epoch": 0.796221322537112, + "grad_norm": 124.5, + "learning_rate": 4.593029346931777e-06, + "logits/chosen": -1.5233218669891357, + "logits/rejected": -1.4880311489105225, + "logps/chosen": -190.8978271484375, + "logps/rejected": -212.50808715820312, + "loss": 0.4094, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.5791584253311157, + "rewards/margins": 1.7821210622787476, + "rewards/rejected": -2.3612794876098633, + "step": 295 + }, + { + "epoch": 0.8097165991902834, + "grad_norm": 121.0, + "learning_rate": 4.571274123109606e-06, + "logits/chosen": -1.5600152015686035, + "logits/rejected": -1.5772325992584229, + "logps/chosen": -211.6980438232422, + "logps/rejected": -159.11520385742188, + "loss": 0.5103, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5033570528030396, + "rewards/margins": 1.3233957290649414, + "rewards/rejected": -1.8267529010772705, + "step": 300 + }, + { + "epoch": 0.8232118758434548, + "grad_norm": 87.0, + "learning_rate": 4.549006819764779e-06, + "logits/chosen": -1.3667839765548706, + "logits/rejected": -1.408111333847046, + "logps/chosen": -252.8665008544922, + "logps/rejected": -246.56600952148438, + "loss": 0.6645, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4156951904296875, + "rewards/margins": 0.9969050288200378, + "rewards/rejected": -1.4126002788543701, + "step": 305 + }, + { + "epoch": 0.8367071524966262, + "grad_norm": 65.0, + "learning_rate": 4.52623294202573e-06, + "logits/chosen": -1.5357733964920044, + "logits/rejected": -1.6000627279281616, + "logps/chosen": -203.2954864501953, + "logps/rejected": -178.47378540039062, + "loss": 0.3625, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1179068312048912, + "rewards/margins": 1.5459201335906982, + "rewards/rejected": -1.6638271808624268, + "step": 310 + }, + { + "epoch": 0.8502024291497976, + "grad_norm": 38.75, + "learning_rate": 4.502958120260894e-06, + "logits/chosen": -1.4177687168121338, + "logits/rejected": -1.466953992843628, + "logps/chosen": -208.93142700195312, + "logps/rejected": -204.0532989501953, + "loss": 0.3943, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10137398540973663, + "rewards/margins": 1.5527517795562744, + "rewards/rejected": -1.6541255712509155, + "step": 315 + }, + { + "epoch": 0.863697705802969, + "grad_norm": 94.5, + "learning_rate": 4.479188108686714e-06, + "logits/chosen": -1.543738603591919, + "logits/rejected": -1.5562658309936523, + "logps/chosen": -195.75601196289062, + "logps/rejected": -243.9476776123047, + "loss": 0.393, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.09615819901227951, + "rewards/margins": 1.808638334274292, + "rewards/rejected": -1.9047966003417969, + "step": 320 + }, + { + "epoch": 0.8771929824561403, + "grad_norm": 53.25, + "learning_rate": 4.454928783945033e-06, + "logits/chosen": -1.4368815422058105, + "logits/rejected": -1.465288519859314, + "logps/chosen": -182.02488708496094, + "logps/rejected": -166.5155487060547, + "loss": 0.3673, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.09929310530424118, + "rewards/margins": 1.477452039718628, + "rewards/rejected": -1.5767452716827393, + "step": 325 + }, + { + "epoch": 0.8906882591093117, + "grad_norm": 94.5, + "learning_rate": 4.430186143650216e-06, + "logits/chosen": -1.3891671895980835, + "logits/rejected": -1.3638372421264648, + "logps/chosen": -167.63204956054688, + "logps/rejected": -166.39913940429688, + "loss": 0.4332, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.18427793681621552, + "rewards/margins": 1.2914403676986694, + "rewards/rejected": -1.4757182598114014, + "step": 330 + }, + { + "epoch": 0.9041835357624831, + "grad_norm": 68.5, + "learning_rate": 4.404966304906363e-06, + "logits/chosen": -1.5300304889678955, + "logits/rejected": -1.541245698928833, + "logps/chosen": -237.1887969970703, + "logps/rejected": -258.4833984375, + "loss": 0.2851, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2509092092514038, + "rewards/margins": 2.2454378604888916, + "rewards/rejected": -2.496346950531006, + "step": 335 + }, + { + "epoch": 0.9176788124156545, + "grad_norm": 91.5, + "learning_rate": 4.379275502794984e-06, + "logits/chosen": -1.4159671068191528, + "logits/rejected": -1.3942148685455322, + "logps/chosen": -204.76268005371094, + "logps/rejected": -194.83755493164062, + "loss": 0.3974, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.590856671333313, + "rewards/margins": 1.8947960138320923, + "rewards/rejected": -2.4856529235839844, + "step": 340 + }, + { + "epoch": 0.9311740890688259, + "grad_norm": 24.875, + "learning_rate": 4.3531200888335015e-06, + "logits/chosen": -1.499260663986206, + "logits/rejected": -1.5041369199752808, + "logps/chosen": -158.403076171875, + "logps/rejected": -188.42300415039062, + "loss": 0.3399, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4716406464576721, + "rewards/margins": 2.255904197692871, + "rewards/rejected": -2.7275447845458984, + "step": 345 + }, + { + "epoch": 0.9446693657219973, + "grad_norm": 49.0, + "learning_rate": 4.326506529404973e-06, + "logits/chosen": -1.4987239837646484, + "logits/rejected": -1.5489791631698608, + "logps/chosen": -228.030517578125, + "logps/rejected": -199.24453735351562, + "loss": 0.4954, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.5366212129592896, + "rewards/margins": 1.576836347579956, + "rewards/rejected": -2.113457441329956, + "step": 350 + }, + { + "epoch": 0.9581646423751687, + "grad_norm": 50.5, + "learning_rate": 4.299441404159409e-06, + "logits/chosen": -1.4427543878555298, + "logits/rejected": -1.4410443305969238, + "logps/chosen": -142.67196655273438, + "logps/rejected": -182.15530395507812, + "loss": 0.3882, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.45489731431007385, + "rewards/margins": 1.885206937789917, + "rewards/rejected": -2.340104341506958, + "step": 355 + }, + { + "epoch": 0.97165991902834, + "grad_norm": 71.0, + "learning_rate": 4.271931404387096e-06, + "logits/chosen": -1.4958666563034058, + "logits/rejected": -1.4852968454360962, + "logps/chosen": -203.7172088623047, + "logps/rejected": -223.72958374023438, + "loss": 0.3129, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.4084866940975189, + "rewards/margins": 2.0505545139312744, + "rewards/rejected": -2.459041118621826, + "step": 360 + }, + { + "epoch": 0.9851551956815114, + "grad_norm": 72.0, + "learning_rate": 4.243983331364307e-06, + "logits/chosen": -1.6051279306411743, + "logits/rejected": -1.5763704776763916, + "logps/chosen": -156.02700805664062, + "logps/rejected": -212.16317749023438, + "loss": 0.4821, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6169974207878113, + "rewards/margins": 1.195291519165039, + "rewards/rejected": -1.8122888803482056, + "step": 365 + }, + { + "epoch": 0.9986504723346828, + "grad_norm": 91.0, + "learning_rate": 4.215604094671835e-06, + "logits/chosen": -1.5946276187896729, + "logits/rejected": -1.525407075881958, + "logps/chosen": -190.231689453125, + "logps/rejected": -210.0182342529297, + "loss": 0.4743, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.5886441469192505, + "rewards/margins": 1.6572681665420532, + "rewards/rejected": -2.245912551879883, + "step": 370 + }, + { + "epoch": 1.0121457489878543, + "grad_norm": 71.5, + "learning_rate": 4.186800710486732e-06, + "logits/chosen": -1.503097414970398, + "logits/rejected": -1.4615429639816284, + "logps/chosen": -177.4516143798828, + "logps/rejected": -223.7339324951172, + "loss": 0.2691, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.2226639688014984, + "rewards/margins": 2.2762439250946045, + "rewards/rejected": -2.4989078044891357, + "step": 375 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 16.75, + "learning_rate": 4.157580299847717e-06, + "logits/chosen": -1.4365036487579346, + "logits/rejected": -1.4489128589630127, + "logps/chosen": -185.9925994873047, + "logps/rejected": -210.19802856445312, + "loss": 0.126, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1753823310136795, + "rewards/margins": 3.160768508911133, + "rewards/rejected": -3.336151123046875, + "step": 380 + }, + { + "epoch": 1.039136302294197, + "grad_norm": 27.125, + "learning_rate": 4.12795008689464e-06, + "logits/chosen": -1.4434540271759033, + "logits/rejected": -1.5021578073501587, + "logps/chosen": -210.2549591064453, + "logps/rejected": -247.6964569091797, + "loss": 0.2329, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.21405327320098877, + "rewards/margins": 2.4333832263946533, + "rewards/rejected": -2.219329833984375, + "step": 385 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 29.5, + "learning_rate": 4.0979173970824626e-06, + "logits/chosen": -1.5133657455444336, + "logits/rejected": -1.5038350820541382, + "logps/chosen": -187.3416290283203, + "logps/rejected": -197.63766479492188, + "loss": 0.1885, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0815470814704895, + "rewards/margins": 2.5452542304992676, + "rewards/rejected": -2.463707447052002, + "step": 390 + }, + { + "epoch": 1.0661268556005399, + "grad_norm": 11.3125, + "learning_rate": 4.067489655370197e-06, + "logits/chosen": -1.486011028289795, + "logits/rejected": -1.5427876710891724, + "logps/chosen": -248.8966064453125, + "logps/rejected": -205.6848602294922, + "loss": 0.1103, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.543197751045227, + "rewards/margins": 3.468106746673584, + "rewards/rejected": -2.9249091148376465, + "step": 395 + }, + { + "epoch": 1.0796221322537112, + "grad_norm": 21.625, + "learning_rate": 4.0366743843852315e-06, + "logits/chosen": -1.4536128044128418, + "logits/rejected": -1.39426851272583, + "logps/chosen": -157.4046173095703, + "logps/rejected": -206.4637451171875, + "loss": 0.1189, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14299368858337402, + "rewards/margins": 3.642580032348633, + "rewards/rejected": -3.7855734825134277, + "step": 400 + }, + { + "epoch": 1.0931174089068827, + "grad_norm": 73.0, + "learning_rate": 4.005479202563524e-06, + "logits/chosen": -1.4207379817962646, + "logits/rejected": -1.4653427600860596, + "logps/chosen": -175.64657592773438, + "logps/rejected": -188.96347045898438, + "loss": 0.113, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.22152826189994812, + "rewards/margins": 3.9064407348632812, + "rewards/rejected": -4.127968788146973, + "step": 405 + }, + { + "epoch": 1.106612685560054, + "grad_norm": 22.5, + "learning_rate": 3.973911822266099e-06, + "logits/chosen": -1.3683284521102905, + "logits/rejected": -1.4073810577392578, + "logps/chosen": -200.2495880126953, + "logps/rejected": -196.02499389648438, + "loss": 0.1506, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.4190312922000885, + "rewards/margins": 3.017284393310547, + "rewards/rejected": -3.4363160133361816, + "step": 410 + }, + { + "epoch": 1.1201079622132253, + "grad_norm": 61.0, + "learning_rate": 3.941980047872324e-06, + "logits/chosen": -1.3142037391662598, + "logits/rejected": -1.3677208423614502, + "logps/chosen": -200.49827575683594, + "logps/rejected": -213.0048828125, + "loss": 0.2229, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.29499849677085876, + "rewards/margins": 2.430476188659668, + "rewards/rejected": -2.7254748344421387, + "step": 415 + }, + { + "epoch": 1.1336032388663968, + "grad_norm": 33.5, + "learning_rate": 3.9096917738504445e-06, + "logits/chosen": -1.5029326677322388, + "logits/rejected": -1.522037386894226, + "logps/chosen": -211.3799285888672, + "logps/rejected": -195.49777221679688, + "loss": 0.2023, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.20138970017433167, + "rewards/margins": 3.0471653938293457, + "rewards/rejected": -3.2485554218292236, + "step": 420 + }, + { + "epoch": 1.147098515519568, + "grad_norm": 67.5, + "learning_rate": 3.877054982805835e-06, + "logits/chosen": -1.503327488899231, + "logits/rejected": -1.5182857513427734, + "logps/chosen": -206.69345092773438, + "logps/rejected": -220.8511505126953, + "loss": 0.2, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.075591079890728, + "rewards/margins": 3.3199775218963623, + "rewards/rejected": -3.39556884765625, + "step": 425 + }, + { + "epoch": 1.1605937921727396, + "grad_norm": 41.25, + "learning_rate": 3.844077743507468e-06, + "logits/chosen": -1.4972890615463257, + "logits/rejected": -1.4547359943389893, + "logps/chosen": -190.38272094726562, + "logps/rejected": -237.7483367919922, + "loss": 0.1763, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.00510750338435173, + "rewards/margins": 3.4418201446533203, + "rewards/rejected": -3.446927309036255, + "step": 430 + }, + { + "epoch": 1.174089068825911, + "grad_norm": 43.0, + "learning_rate": 3.8107682088930797e-06, + "logits/chosen": -1.5898491144180298, + "logits/rejected": -1.628394365310669, + "logps/chosen": -209.7681884765625, + "logps/rejected": -223.9811248779297, + "loss": 0.2875, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15720273554325104, + "rewards/margins": 2.540311336517334, + "rewards/rejected": -2.697514057159424, + "step": 435 + }, + { + "epoch": 1.1875843454790824, + "grad_norm": 19.875, + "learning_rate": 3.777134614053522e-06, + "logits/chosen": -1.3833550214767456, + "logits/rejected": -1.3048458099365234, + "logps/chosen": -153.44886779785156, + "logps/rejected": -187.23211669921875, + "loss": 0.2094, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.15450401604175568, + "rewards/margins": 2.7406177520751953, + "rewards/rejected": -2.8951218128204346, + "step": 440 + }, + { + "epoch": 1.2010796221322537, + "grad_norm": 25.25, + "learning_rate": 3.7431852741968104e-06, + "logits/chosen": -1.5894601345062256, + "logits/rejected": -1.4398654699325562, + "logps/chosen": -161.95870971679688, + "logps/rejected": -259.89544677734375, + "loss": 0.2674, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.3261250853538513, + "rewards/margins": 2.652719497680664, + "rewards/rejected": -2.9788451194763184, + "step": 445 + }, + { + "epoch": 1.214574898785425, + "grad_norm": 25.625, + "learning_rate": 3.7089285825923614e-06, + "logits/chosen": -1.481194257736206, + "logits/rejected": -1.4744828939437866, + "logps/chosen": -136.75341796875, + "logps/rejected": -182.98255920410156, + "loss": 0.216, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.15366610884666443, + "rewards/margins": 2.4346675872802734, + "rewards/rejected": -2.5883336067199707, + "step": 450 + }, + { + "epoch": 1.2280701754385965, + "grad_norm": 59.0, + "learning_rate": 3.6743730084959275e-06, + "logits/chosen": -1.4641847610473633, + "logits/rejected": -1.4495608806610107, + "logps/chosen": -226.5570068359375, + "logps/rejected": -231.99484252929688, + "loss": 0.1606, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0351928249001503, + "rewards/margins": 2.678950071334839, + "rewards/rejected": -2.6437573432922363, + "step": 455 + }, + { + "epoch": 1.2415654520917678, + "grad_norm": 29.125, + "learning_rate": 3.639527095055753e-06, + "logits/chosen": -1.4890583753585815, + "logits/rejected": -1.4146323204040527, + "logps/chosen": -211.8848419189453, + "logps/rejected": -223.7265167236328, + "loss": 0.1515, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.1912011355161667, + "rewards/margins": 3.216825008392334, + "rewards/rejected": -3.4080262184143066, + "step": 460 + }, + { + "epoch": 1.2550607287449393, + "grad_norm": 28.5, + "learning_rate": 3.604399457200458e-06, + "logits/chosen": -1.5582194328308105, + "logits/rejected": -1.530056357383728, + "logps/chosen": -174.59786987304688, + "logps/rejected": -235.314697265625, + "loss": 0.1586, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.02259807661175728, + "rewards/margins": 3.3205082416534424, + "rewards/rejected": -3.343106508255005, + "step": 465 + }, + { + "epoch": 1.2685560053981106, + "grad_norm": 47.0, + "learning_rate": 3.5689987795091735e-06, + "logits/chosen": -1.5336169004440308, + "logits/rejected": -1.5555146932601929, + "logps/chosen": -192.9527587890625, + "logps/rejected": -217.05029296875, + "loss": 0.1666, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.11079835891723633, + "rewards/margins": 2.9511632919311523, + "rewards/rejected": -3.0619616508483887, + "step": 470 + }, + { + "epoch": 1.282051282051282, + "grad_norm": 31.5, + "learning_rate": 3.5333338140644602e-06, + "logits/chosen": -1.567378044128418, + "logits/rejected": -1.5020748376846313, + "logps/chosen": -151.2008819580078, + "logps/rejected": -193.5251007080078, + "loss": 0.1562, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.06739845871925354, + "rewards/margins": 2.88545560836792, + "rewards/rejected": -2.81805682182312, + "step": 475 + }, + { + "epoch": 1.2955465587044535, + "grad_norm": 27.625, + "learning_rate": 3.497413378288541e-06, + "logits/chosen": -1.558091402053833, + "logits/rejected": -1.5880284309387207, + "logps/chosen": -208.2618408203125, + "logps/rejected": -215.33065795898438, + "loss": 0.1537, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.08185993134975433, + "rewards/margins": 2.7448792457580566, + "rewards/rejected": -2.8267390727996826, + "step": 480 + }, + { + "epoch": 1.3090418353576248, + "grad_norm": 21.0, + "learning_rate": 3.4612463527633728e-06, + "logits/chosen": -1.517230749130249, + "logits/rejected": -1.5125114917755127, + "logps/chosen": -165.6942138671875, + "logps/rejected": -177.20965576171875, + "loss": 0.1097, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21497321128845215, + "rewards/margins": 3.283679485321045, + "rewards/rejected": -3.498652935028076, + "step": 485 + }, + { + "epoch": 1.3225371120107963, + "grad_norm": 58.25, + "learning_rate": 3.4248416790351086e-06, + "logits/chosen": -1.4563219547271729, + "logits/rejected": -1.4463237524032593, + "logps/chosen": -222.70803833007812, + "logps/rejected": -276.1205139160156, + "loss": 0.1741, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.18195849657058716, + "rewards/margins": 3.079150438308716, + "rewards/rejected": -3.2611091136932373, + "step": 490 + }, + { + "epoch": 1.3360323886639676, + "grad_norm": 26.5, + "learning_rate": 3.3882083574034847e-06, + "logits/chosen": -1.495981216430664, + "logits/rejected": -1.510833501815796, + "logps/chosen": -217.92416381835938, + "logps/rejected": -232.9659881591797, + "loss": 0.1075, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07798905670642853, + "rewards/margins": 3.7004494667053223, + "rewards/rejected": -3.6224606037139893, + "step": 495 + }, + { + "epoch": 1.349527665317139, + "grad_norm": 11.625, + "learning_rate": 3.3513554446966846e-06, + "logits/chosen": -1.607877492904663, + "logits/rejected": -1.5209126472473145, + "logps/chosen": -145.24710083007812, + "logps/rejected": -269.81951904296875, + "loss": 0.0835, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.1404910534620285, + "rewards/margins": 3.947847843170166, + "rewards/rejected": -4.088338375091553, + "step": 500 + }, + { + "epoch": 1.349527665317139, + "eval_logits/chosen": -1.5215187072753906, + "eval_logits/rejected": -1.5562808513641357, + "eval_logps/chosen": -190.62527465820312, + "eval_logps/rejected": -222.86770629882812, + "eval_loss": 0.3281523883342743, + "eval_rewards/accuracies": 0.849397599697113, + "eval_rewards/chosen": -0.6181024312973022, + "eval_rewards/margins": 2.1862471103668213, + "eval_rewards/rejected": -2.804349660873413, + "eval_runtime": 23.4839, + "eval_samples_per_second": 14.052, + "eval_steps_per_second": 3.534, + "step": 500 + }, + { + "epoch": 1.3630229419703104, + "grad_norm": 25.625, + "learning_rate": 3.314292052032227e-06, + "logits/chosen": -1.4269988536834717, + "logits/rejected": -1.5553017854690552, + "logps/chosen": -245.88330078125, + "logps/rejected": -144.62518310546875, + "loss": 0.2057, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.03630426153540611, + "rewards/margins": 2.8089230060577393, + "rewards/rejected": -2.8452274799346924, + "step": 505 + }, + { + "epoch": 1.376518218623482, + "grad_norm": 42.75, + "learning_rate": 3.2770273425644285e-06, + "logits/chosen": -1.3818541765213013, + "logits/rejected": -1.31718909740448, + "logps/chosen": -194.84194946289062, + "logps/rejected": -197.08505249023438, + "loss": 0.1862, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.11487498134374619, + "rewards/margins": 3.057730197906494, + "rewards/rejected": -3.172605276107788, + "step": 510 + }, + { + "epoch": 1.3900134952766532, + "grad_norm": 29.0, + "learning_rate": 3.2395705292190067e-06, + "logits/chosen": -1.467614769935608, + "logits/rejected": -1.438024640083313, + "logps/chosen": -180.7233428955078, + "logps/rejected": -217.57559204101562, + "loss": 0.1711, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.10827420651912689, + "rewards/margins": 3.1877129077911377, + "rewards/rejected": -3.295987367630005, + "step": 515 + }, + { + "epoch": 1.4035087719298245, + "grad_norm": 12.125, + "learning_rate": 3.2019308724153743e-06, + "logits/chosen": -1.4175347089767456, + "logits/rejected": -1.5785712003707886, + "logps/chosen": -196.76730346679688, + "logps/rejected": -179.5243377685547, + "loss": 0.114, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12531700730323792, + "rewards/margins": 3.2615838050842285, + "rewards/rejected": -3.1362667083740234, + "step": 520 + }, + { + "epoch": 1.417004048582996, + "grad_norm": 27.375, + "learning_rate": 3.164117677777191e-06, + "logits/chosen": -1.5264801979064941, + "logits/rejected": -1.6040115356445312, + "logps/chosen": -150.361328125, + "logps/rejected": -164.02816772460938, + "loss": 0.1757, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.4109339118003845, + "rewards/margins": 3.098153591156006, + "rewards/rejected": -3.509087324142456, + "step": 525 + }, + { + "epoch": 1.4304993252361673, + "grad_norm": 38.25, + "learning_rate": 3.1261402938317465e-06, + "logits/chosen": -1.5730303525924683, + "logits/rejected": -1.6026499271392822, + "logps/chosen": -164.3070831298828, + "logps/rejected": -246.06338500976562, + "loss": 0.1532, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.005527207162231207, + "rewards/margins": 3.9187304973602295, + "rewards/rejected": -3.913203477859497, + "step": 530 + }, + { + "epoch": 1.4439946018893388, + "grad_norm": 20.375, + "learning_rate": 3.088008109698726e-06, + "logits/chosen": -1.444838285446167, + "logits/rejected": -1.5232534408569336, + "logps/chosen": -194.70555114746094, + "logps/rejected": -218.77590942382812, + "loss": 0.1892, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.10898719727993011, + "rewards/margins": 3.2815093994140625, + "rewards/rejected": -3.1725223064422607, + "step": 535 + }, + { + "epoch": 1.45748987854251, + "grad_norm": 43.0, + "learning_rate": 3.0497305527689446e-06, + "logits/chosen": -1.4176692962646484, + "logits/rejected": -1.4581646919250488, + "logps/chosen": -190.53550720214844, + "logps/rejected": -202.92530822753906, + "loss": 0.1852, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.12854930758476257, + "rewards/margins": 3.113678216934204, + "rewards/rejected": -3.242227554321289, + "step": 540 + }, + { + "epoch": 1.4709851551956814, + "grad_norm": 42.0, + "learning_rate": 3.011317086373628e-06, + "logits/chosen": -1.4024337530136108, + "logits/rejected": -1.4260265827178955, + "logps/chosen": -222.62124633789062, + "logps/rejected": -228.56295776367188, + "loss": 0.1847, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.02214776910841465, + "rewards/margins": 3.127570629119873, + "rewards/rejected": -3.1497180461883545, + "step": 545 + }, + { + "epoch": 1.484480431848853, + "grad_norm": 38.5, + "learning_rate": 2.9727772074447916e-06, + "logits/chosen": -1.4362146854400635, + "logits/rejected": -1.4737937450408936, + "logps/chosen": -190.13218688964844, + "logps/rejected": -182.9353790283203, + "loss": 0.1473, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.015029204078018665, + "rewards/margins": 3.5559723377227783, + "rewards/rejected": -3.5710015296936035, + "step": 550 + }, + { + "epoch": 1.4979757085020242, + "grad_norm": 105.0, + "learning_rate": 2.9341204441673267e-06, + "logits/chosen": -1.5892771482467651, + "logits/rejected": -1.5846550464630127, + "logps/chosen": -128.55142211914062, + "logps/rejected": -169.93356323242188, + "loss": 0.2029, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.536339282989502, + "rewards/margins": 2.9536054134368896, + "rewards/rejected": -3.4899444580078125, + "step": 555 + }, + { + "epoch": 1.5114709851551957, + "grad_norm": 49.75, + "learning_rate": 2.8953563536233525e-06, + "logits/chosen": -1.650007963180542, + "logits/rejected": -1.6943776607513428, + "logps/chosen": -168.49082946777344, + "logps/rejected": -202.83924865722656, + "loss": 0.186, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.5661638379096985, + "rewards/margins": 3.2901504039764404, + "rewards/rejected": -3.856314182281494, + "step": 560 + }, + { + "epoch": 1.524966261808367, + "grad_norm": 21.0, + "learning_rate": 2.8564945194294273e-06, + "logits/chosen": -1.5658307075500488, + "logits/rejected": -1.46593177318573, + "logps/chosen": -162.1931915283203, + "logps/rejected": -254.7098388671875, + "loss": 0.168, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5101041793823242, + "rewards/margins": 3.1685078144073486, + "rewards/rejected": -3.6786117553710938, + "step": 565 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 13.125, + "learning_rate": 2.817544549367197e-06, + "logits/chosen": -1.4567762613296509, + "logits/rejected": -1.4438632726669312, + "logps/chosen": -173.05821228027344, + "logps/rejected": -226.567626953125, + "loss": 0.1935, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.4246034622192383, + "rewards/margins": 3.5222201347351074, + "rewards/rejected": -3.946824312210083, + "step": 570 + }, + { + "epoch": 1.5519568151147098, + "grad_norm": 18.875, + "learning_rate": 2.778516073008071e-06, + "logits/chosen": -1.3770719766616821, + "logits/rejected": -1.4858124256134033, + "logps/chosen": -178.8583221435547, + "logps/rejected": -180.4306640625, + "loss": 0.2049, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.34754911065101624, + "rewards/margins": 2.8492796421051025, + "rewards/rejected": -3.196829080581665, + "step": 575 + }, + { + "epoch": 1.5654520917678814, + "grad_norm": 51.0, + "learning_rate": 2.7394187393325107e-06, + "logits/chosen": -1.4935017824172974, + "logits/rejected": -1.482154130935669, + "logps/chosen": -183.38815307617188, + "logps/rejected": -203.4325714111328, + "loss": 0.2601, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.4926990866661072, + "rewards/margins": 2.8543787002563477, + "rewards/rejected": -3.347078323364258, + "step": 580 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 11.875, + "learning_rate": 2.7002622143445177e-06, + "logits/chosen": -1.5763792991638184, + "logits/rejected": -1.581122875213623, + "logps/chosen": -230.5819854736328, + "logps/rejected": -290.4792175292969, + "loss": 0.1305, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.07175219804048538, + "rewards/margins": 4.184114933013916, + "rewards/rejected": -4.112362861633301, + "step": 585 + }, + { + "epoch": 1.592442645074224, + "grad_norm": 46.75, + "learning_rate": 2.6610561786819207e-06, + "logits/chosen": -1.6340926885604858, + "logits/rejected": -1.5590074062347412, + "logps/chosen": -145.62442016601562, + "logps/rejected": -248.79403686523438, + "loss": 0.1715, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.3683429956436157, + "rewards/margins": 3.4242138862609863, + "rewards/rejected": -3.7925562858581543, + "step": 590 + }, + { + "epoch": 1.6059379217273952, + "grad_norm": 8.5625, + "learning_rate": 2.6218103252230302e-06, + "logits/chosen": -1.5815064907073975, + "logits/rejected": -1.558189868927002, + "logps/chosen": -145.986572265625, + "logps/rejected": -209.48776245117188, + "loss": 0.1382, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15136297047138214, + "rewards/margins": 3.156534194946289, + "rewards/rejected": -3.3078970909118652, + "step": 595 + }, + { + "epoch": 1.6194331983805668, + "grad_norm": 33.5, + "learning_rate": 2.582534356690284e-06, + "logits/chosen": -1.4829189777374268, + "logits/rejected": -1.5618332624435425, + "logps/chosen": -280.50482177734375, + "logps/rejected": -227.37191772460938, + "loss": 0.111, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.014963224530220032, + "rewards/margins": 3.7380282878875732, + "rewards/rejected": -3.723065137863159, + "step": 600 + }, + { + "epoch": 1.6329284750337383, + "grad_norm": 19.25, + "learning_rate": 2.5432379832514437e-06, + "logits/chosen": -1.5892632007598877, + "logits/rejected": -1.6352291107177734, + "logps/chosen": -158.56002807617188, + "logps/rejected": -202.90060424804688, + "loss": 0.2301, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.49658140540122986, + "rewards/margins": 3.0457987785339355, + "rewards/rejected": -3.5423800945281982, + "step": 605 + }, + { + "epoch": 1.6464237516869096, + "grad_norm": 18.5, + "learning_rate": 2.5039309201189618e-06, + "logits/chosen": -1.6018474102020264, + "logits/rejected": -1.6965217590332031, + "logps/chosen": -161.53518676757812, + "logps/rejected": -185.10025024414062, + "loss": 0.1597, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.14256028831005096, + "rewards/margins": 3.0957140922546387, + "rewards/rejected": -3.238274335861206, + "step": 610 + }, + { + "epoch": 1.6599190283400809, + "grad_norm": 22.375, + "learning_rate": 2.4646228851480957e-06, + "logits/chosen": -1.391078233718872, + "logits/rejected": -1.3691911697387695, + "logps/chosen": -206.93734741210938, + "logps/rejected": -213.29428100585938, + "loss": 0.2172, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.01679610088467598, + "rewards/margins": 2.988704204559326, + "rewards/rejected": -2.9719078540802, + "step": 615 + }, + { + "epoch": 1.6734143049932524, + "grad_norm": 13.25, + "learning_rate": 2.4253235964343677e-06, + "logits/chosen": -1.590201497077942, + "logits/rejected": -1.4947328567504883, + "logps/chosen": -162.37301635742188, + "logps/rejected": -259.95294189453125, + "loss": 0.1116, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.33501359820365906, + "rewards/margins": 4.118770599365234, + "rewards/rejected": -4.453783988952637, + "step": 620 + }, + { + "epoch": 1.686909581646424, + "grad_norm": 73.0, + "learning_rate": 2.3860427699109726e-06, + "logits/chosen": -1.6217790842056274, + "logits/rejected": -1.6454839706420898, + "logps/chosen": -172.94483947753906, + "logps/rejected": -205.34475708007812, + "loss": 0.2869, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9174480438232422, + "rewards/margins": 3.128140449523926, + "rewards/rejected": -4.045588493347168, + "step": 625 + }, + { + "epoch": 1.7004048582995952, + "grad_norm": 32.5, + "learning_rate": 2.34679011694671e-06, + "logits/chosen": -1.5026500225067139, + "logits/rejected": -1.6494897603988647, + "logps/chosen": -268.9452209472656, + "logps/rejected": -212.0578155517578, + "loss": 0.1194, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23328566551208496, + "rewards/margins": 4.139514446258545, + "rewards/rejected": -4.372800350189209, + "step": 630 + }, + { + "epoch": 1.7139001349527665, + "grad_norm": 70.5, + "learning_rate": 2.3075753419450524e-06, + "logits/chosen": -1.5526963472366333, + "logits/rejected": -1.6195096969604492, + "logps/chosen": -205.20431518554688, + "logps/rejected": -197.59744262695312, + "loss": 0.2026, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.3273767828941345, + "rewards/margins": 2.9169745445251465, + "rewards/rejected": -3.2443511486053467, + "step": 635 + }, + { + "epoch": 1.7273954116059378, + "grad_norm": 38.5, + "learning_rate": 2.2684081399449327e-06, + "logits/chosen": -1.4865336418151855, + "logits/rejected": -1.479229211807251, + "logps/chosen": -188.85787963867188, + "logps/rejected": -203.17514038085938, + "loss": 0.269, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.778891921043396, + "rewards/margins": 3.5117366313934326, + "rewards/rejected": -4.290627956390381, + "step": 640 + }, + { + "epoch": 1.7408906882591093, + "grad_norm": 116.0, + "learning_rate": 2.2292981942238454e-06, + "logits/chosen": -1.598434329032898, + "logits/rejected": -1.6193567514419556, + "logps/chosen": -170.999267578125, + "logps/rejected": -234.42391967773438, + "loss": 0.3528, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.586583137512207, + "rewards/margins": 3.1737523078918457, + "rewards/rejected": -3.7603354454040527, + "step": 645 + }, + { + "epoch": 1.7543859649122808, + "grad_norm": 36.0, + "learning_rate": 2.1902551739038624e-06, + "logits/chosen": -1.5177044868469238, + "logits/rejected": -1.4585306644439697, + "logps/chosen": -171.92758178710938, + "logps/rejected": -219.8982696533203, + "loss": 0.2386, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5028108954429626, + "rewards/margins": 3.118129253387451, + "rewards/rejected": -3.6209399700164795, + "step": 650 + }, + { + "epoch": 1.7678812415654521, + "grad_norm": 11.0625, + "learning_rate": 2.151288731561136e-06, + "logits/chosen": -1.532063364982605, + "logits/rejected": -1.4071648120880127, + "logps/chosen": -211.4221649169922, + "logps/rejected": -240.8402099609375, + "loss": 0.1651, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.42295771837234497, + "rewards/margins": 3.7488913536071777, + "rewards/rejected": -4.171849250793457, + "step": 655 + }, + { + "epoch": 1.7813765182186234, + "grad_norm": 23.625, + "learning_rate": 2.1124085008395056e-06, + "logits/chosen": -1.4962142705917358, + "logits/rejected": -1.4677404165267944, + "logps/chosen": -197.39447021484375, + "logps/rejected": -263.4613342285156, + "loss": 0.1999, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.16278569400310516, + "rewards/margins": 3.5397281646728516, + "rewards/rejected": -3.7025134563446045, + "step": 660 + }, + { + "epoch": 1.7948717948717947, + "grad_norm": 20.625, + "learning_rate": 2.073624094068776e-06, + "logits/chosen": -1.5467997789382935, + "logits/rejected": -1.540650725364685, + "logps/chosen": -186.6321563720703, + "logps/rejected": -259.65045166015625, + "loss": 0.2781, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.22982105612754822, + "rewards/margins": 3.7238681316375732, + "rewards/rejected": -3.9536895751953125, + "step": 665 + }, + { + "epoch": 1.8083670715249662, + "grad_norm": 12.875, + "learning_rate": 2.03494509988827e-06, + "logits/chosen": -1.6044431924819946, + "logits/rejected": -1.627730131149292, + "logps/chosen": -184.64320373535156, + "logps/rejected": -204.9185791015625, + "loss": 0.1212, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07731951773166656, + "rewards/margins": 3.6050572395324707, + "rewards/rejected": -3.6823768615722656, + "step": 670 + }, + { + "epoch": 1.8218623481781377, + "grad_norm": 22.375, + "learning_rate": 1.996381080876237e-06, + "logits/chosen": -1.6212413311004639, + "logits/rejected": -1.5563671588897705, + "logps/chosen": -219.73171997070312, + "logps/rejected": -281.0826721191406, + "loss": 0.1177, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.023561427369713783, + "rewards/margins": 3.5450756549835205, + "rewards/rejected": -3.521514415740967, + "step": 675 + }, + { + "epoch": 1.835357624831309, + "grad_norm": 42.5, + "learning_rate": 1.957941571185702e-06, + "logits/chosen": -1.4472072124481201, + "logits/rejected": -1.5231066942214966, + "logps/chosen": -256.3811950683594, + "logps/rejected": -225.1781768798828, + "loss": 0.2672, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.01850978098809719, + "rewards/margins": 3.1582770347595215, + "rewards/rejected": -3.1767868995666504, + "step": 680 + }, + { + "epoch": 1.8488529014844803, + "grad_norm": 30.625, + "learning_rate": 1.919636074187346e-06, + "logits/chosen": -1.388319730758667, + "logits/rejected": -1.4473168849945068, + "logps/chosen": -253.48312377929688, + "logps/rejected": -212.169189453125, + "loss": 0.1468, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.18813356757164001, + "rewards/margins": 3.097019672393799, + "rewards/rejected": -2.908886194229126, + "step": 685 + }, + { + "epoch": 1.8623481781376519, + "grad_norm": 90.0, + "learning_rate": 1.8814740601199943e-06, + "logits/chosen": -1.4006351232528687, + "logits/rejected": -1.4068963527679443, + "logps/chosen": -164.6719970703125, + "logps/rejected": -193.83538818359375, + "loss": 0.2666, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.42647585272789, + "rewards/margins": 2.7546064853668213, + "rewards/rejected": -3.181082248687744, + "step": 690 + }, + { + "epoch": 1.8758434547908234, + "grad_norm": 25.75, + "learning_rate": 1.8434649637492952e-06, + "logits/chosen": -1.341395616531372, + "logits/rejected": -1.3592100143432617, + "logps/chosen": -181.58978271484375, + "logps/rejected": -235.27456665039062, + "loss": 0.1718, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15408626198768616, + "rewards/margins": 3.203856945037842, + "rewards/rejected": -3.357943296432495, + "step": 695 + }, + { + "epoch": 1.8893387314439947, + "grad_norm": 18.625, + "learning_rate": 1.8056181820351737e-06, + "logits/chosen": -1.565199613571167, + "logits/rejected": -1.5012518167495728, + "logps/chosen": -241.5365753173828, + "logps/rejected": -229.1800079345703, + "loss": 0.1734, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.34816664457321167, + "rewards/margins": 4.178171634674072, + "rewards/rejected": -3.830005168914795, + "step": 700 + }, + { + "epoch": 1.902834008097166, + "grad_norm": 8.875, + "learning_rate": 1.7679430718086244e-06, + "logits/chosen": -1.5023219585418701, + "logits/rejected": -1.4059240818023682, + "logps/chosen": -240.8516082763672, + "logps/rejected": -287.47955322265625, + "loss": 0.0641, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22036854922771454, + "rewards/margins": 4.166906833648682, + "rewards/rejected": -3.946538209915161, + "step": 705 + }, + { + "epoch": 1.9163292847503373, + "grad_norm": 35.5, + "learning_rate": 1.7304489474584307e-06, + "logits/chosen": -1.565582036972046, + "logits/rejected": -1.4994531869888306, + "logps/chosen": -148.25338745117188, + "logps/rejected": -231.37741088867188, + "loss": 0.123, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.1026371493935585, + "rewards/margins": 3.7467575073242188, + "rewards/rejected": -3.6441197395324707, + "step": 710 + }, + { + "epoch": 1.9298245614035088, + "grad_norm": 28.125, + "learning_rate": 1.693145078628377e-06, + "logits/chosen": -1.6054456233978271, + "logits/rejected": -1.6087411642074585, + "logps/chosen": -159.12234497070312, + "logps/rejected": -214.5330352783203, + "loss": 0.1255, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.13751724362373352, + "rewards/margins": 3.8032824993133545, + "rewards/rejected": -3.940800428390503, + "step": 715 + }, + { + "epoch": 1.9433198380566803, + "grad_norm": 18.375, + "learning_rate": 1.6560406879255192e-06, + "logits/chosen": -1.615686058998108, + "logits/rejected": -1.678998351097107, + "logps/chosen": -179.3768768310547, + "logps/rejected": -188.79124450683594, + "loss": 0.1608, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.13511483371257782, + "rewards/margins": 3.1098551750183105, + "rewards/rejected": -3.2449698448181152, + "step": 720 + }, + { + "epoch": 1.9568151147098516, + "grad_norm": 20.5, + "learning_rate": 1.6191449486400893e-06, + "logits/chosen": -1.5641348361968994, + "logits/rejected": -1.5269627571105957, + "logps/chosen": -190.90200805664062, + "logps/rejected": -200.14797973632812, + "loss": 0.1858, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.057850100100040436, + "rewards/margins": 3.392789363861084, + "rewards/rejected": -3.4506402015686035, + "step": 725 + }, + { + "epoch": 1.9703103913630229, + "grad_norm": 46.25, + "learning_rate": 1.5824669824775868e-06, + "logits/chosen": -1.6585397720336914, + "logits/rejected": -1.6145107746124268, + "logps/chosen": -153.5370330810547, + "logps/rejected": -246.87869262695312, + "loss": 0.1935, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.1705460101366043, + "rewards/margins": 3.2724738121032715, + "rewards/rejected": -3.4430203437805176, + "step": 730 + }, + { + "epoch": 1.9838056680161942, + "grad_norm": 21.125, + "learning_rate": 1.5460158573036288e-06, + "logits/chosen": -1.425318956375122, + "logits/rejected": -1.5616633892059326, + "logps/chosen": -228.63955688476562, + "logps/rejected": -232.26235961914062, + "loss": 0.1763, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.15325972437858582, + "rewards/margins": 2.676074504852295, + "rewards/rejected": -2.8293344974517822, + "step": 735 + }, + { + "epoch": 1.9973009446693657, + "grad_norm": 57.75, + "learning_rate": 1.509800584902108e-06, + "logits/chosen": -1.4701238870620728, + "logits/rejected": -1.335039496421814, + "logps/chosen": -165.36788940429688, + "logps/rejected": -248.84915161132812, + "loss": 0.2088, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.21872563660144806, + "rewards/margins": 3.6630032062530518, + "rewards/rejected": -3.8817286491394043, + "step": 740 + }, + { + "epoch": 2.010796221322537, + "grad_norm": 13.0625, + "learning_rate": 1.473830118747216e-06, + "logits/chosen": -1.3533880710601807, + "logits/rejected": -1.4392606019973755, + "logps/chosen": -173.4610595703125, + "logps/rejected": -189.3846435546875, + "loss": 0.1035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04286568984389305, + "rewards/margins": 3.4945671558380127, + "rewards/rejected": -3.537432909011841, + "step": 745 + }, + { + "epoch": 2.0242914979757085, + "grad_norm": 5.46875, + "learning_rate": 1.4381133517898803e-06, + "logits/chosen": -1.5612472295761108, + "logits/rejected": -1.6096746921539307, + "logps/chosen": -244.1045684814453, + "logps/rejected": -227.0, + "loss": 0.068, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28070664405822754, + "rewards/margins": 4.070573329925537, + "rewards/rejected": -3.7898666858673096, + "step": 750 + }, + { + "epoch": 2.03778677462888, + "grad_norm": 46.75, + "learning_rate": 1.4026591142591733e-06, + "logits/chosen": -1.4181170463562012, + "logits/rejected": -1.5695334672927856, + "logps/chosen": -218.1271514892578, + "logps/rejected": -171.77000427246094, + "loss": 0.1633, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.2111760377883911, + "rewards/margins": 2.9384093284606934, + "rewards/rejected": -3.149585247039795, + "step": 755 + }, + { + "epoch": 2.051282051282051, + "grad_norm": 24.5, + "learning_rate": 1.3674761714792153e-06, + "logits/chosen": -1.5777294635772705, + "logits/rejected": -1.6976985931396484, + "logps/chosen": -224.3392791748047, + "logps/rejected": -254.0798797607422, + "loss": 0.0739, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.15005287528038025, + "rewards/margins": 4.0651535987854, + "rewards/rejected": -3.91510009765625, + "step": 760 + }, + { + "epoch": 2.064777327935223, + "grad_norm": 23.75, + "learning_rate": 1.33257322170213e-06, + "logits/chosen": -1.4911249876022339, + "logits/rejected": -1.500860571861267, + "logps/chosen": -172.9776611328125, + "logps/rejected": -201.8260040283203, + "loss": 0.1002, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.21206021308898926, + "rewards/margins": 3.810685634613037, + "rewards/rejected": -3.598625659942627, + "step": 765 + }, + { + "epoch": 2.078272604588394, + "grad_norm": 28.375, + "learning_rate": 1.2979588939575879e-06, + "logits/chosen": -1.5784046649932861, + "logits/rejected": -1.5579355955123901, + "logps/chosen": -192.16024780273438, + "logps/rejected": -219.4779510498047, + "loss": 0.1696, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.06577552855014801, + "rewards/margins": 3.573701858520508, + "rewards/rejected": -3.5079262256622314, + "step": 770 + }, + { + "epoch": 2.0917678812415654, + "grad_norm": 14.8125, + "learning_rate": 1.2636417459194536e-06, + "logits/chosen": -1.5944167375564575, + "logits/rejected": -1.6392465829849243, + "logps/chosen": -235.58633422851562, + "logps/rejected": -274.0408630371094, + "loss": 0.0593, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08247147500514984, + "rewards/margins": 4.281358242034912, + "rewards/rejected": -4.363830089569092, + "step": 775 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 5.21875, + "learning_rate": 1.2296302617900772e-06, + "logits/chosen": -1.5774985551834106, + "logits/rejected": -1.6413581371307373, + "logps/chosen": -171.0308074951172, + "logps/rejected": -183.9725341796875, + "loss": 0.0845, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.016073107719421387, + "rewards/margins": 3.9465243816375732, + "rewards/rejected": -3.9304511547088623, + "step": 780 + }, + { + "epoch": 2.118758434547908, + "grad_norm": 15.0, + "learning_rate": 1.1959328502027556e-06, + "logits/chosen": -1.5672693252563477, + "logits/rejected": -1.5724976062774658, + "logps/chosen": -161.8846435546875, + "logps/rejected": -190.6571807861328, + "loss": 0.1138, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02019577845931053, + "rewards/margins": 3.7148475646972656, + "rewards/rejected": -3.6946518421173096, + "step": 785 + }, + { + "epoch": 2.1322537112010798, + "grad_norm": 19.125, + "learning_rate": 1.1625578421428714e-06, + "logits/chosen": -1.4088555574417114, + "logits/rejected": -1.331659197807312, + "logps/chosen": -197.23593139648438, + "logps/rejected": -279.7657470703125, + "loss": 0.1239, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.08394167572259903, + "rewards/margins": 3.702916383743286, + "rewards/rejected": -3.786858081817627, + "step": 790 + }, + { + "epoch": 2.145748987854251, + "grad_norm": 19.625, + "learning_rate": 1.1295134888882258e-06, + "logits/chosen": -1.5858689546585083, + "logits/rejected": -1.6758959293365479, + "logps/chosen": -194.56253051757812, + "logps/rejected": -206.4073028564453, + "loss": 0.0922, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.14167055487632751, + "rewards/margins": 3.8033957481384277, + "rewards/rejected": -3.945065975189209, + "step": 795 + }, + { + "epoch": 2.1592442645074224, + "grad_norm": 16.25, + "learning_rate": 1.0968079599690872e-06, + "logits/chosen": -1.5427080392837524, + "logits/rejected": -1.509251356124878, + "logps/chosen": -227.91281127929688, + "logps/rejected": -196.93661499023438, + "loss": 0.112, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.0069570960476994514, + "rewards/margins": 3.6783218383789062, + "rewards/rejected": -3.6852786540985107, + "step": 800 + }, + { + "epoch": 2.1727395411605936, + "grad_norm": 19.875, + "learning_rate": 1.064449341148442e-06, + "logits/chosen": -1.624629020690918, + "logits/rejected": -1.647383689880371, + "logps/chosen": -203.95071411132812, + "logps/rejected": -221.9706573486328, + "loss": 0.1216, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.1412554681301117, + "rewards/margins": 3.5126278400421143, + "rewards/rejected": -3.6538829803466797, + "step": 805 + }, + { + "epoch": 2.1862348178137654, + "grad_norm": 14.5, + "learning_rate": 1.0324456324229536e-06, + "logits/chosen": -1.4194597005844116, + "logits/rejected": -1.3489387035369873, + "logps/chosen": -166.34426879882812, + "logps/rejected": -239.3138427734375, + "loss": 0.0775, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0588313452899456, + "rewards/margins": 3.9181437492370605, + "rewards/rejected": -3.976975202560425, + "step": 810 + }, + { + "epoch": 2.1997300944669367, + "grad_norm": 35.5, + "learning_rate": 1.000804746045138e-06, + "logits/chosen": -1.3923031091690063, + "logits/rejected": -1.4646499156951904, + "logps/chosen": -191.46279907226562, + "logps/rejected": -184.79953002929688, + "loss": 0.1111, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.008677695877850056, + "rewards/margins": 3.193809986114502, + "rewards/rejected": -3.2024874687194824, + "step": 815 + }, + { + "epoch": 2.213225371120108, + "grad_norm": 16.0, + "learning_rate": 9.695345045672167e-07, + "logits/chosen": -1.4313310384750366, + "logits/rejected": -1.4792088270187378, + "logps/chosen": -191.17092895507812, + "logps/rejected": -196.5364532470703, + "loss": 0.118, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.33248454332351685, + "rewards/margins": 3.7640583515167236, + "rewards/rejected": -4.096542835235596, + "step": 820 + }, + { + "epoch": 2.2267206477732793, + "grad_norm": 15.5, + "learning_rate": 9.386426389071532e-07, + "logits/chosen": -1.4152162075042725, + "logits/rejected": -1.363843321800232, + "logps/chosen": -229.3914031982422, + "logps/rejected": -278.37847900390625, + "loss": 0.0961, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.30344587564468384, + "rewards/margins": 4.63069486618042, + "rewards/rejected": -4.934141635894775, + "step": 825 + }, + { + "epoch": 2.2402159244264506, + "grad_norm": 17.625, + "learning_rate": 9.081367864373489e-07, + "logits/chosen": -1.3973594903945923, + "logits/rejected": -1.524677038192749, + "logps/chosen": -168.33126831054688, + "logps/rejected": -156.55892944335938, + "loss": 0.0944, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1414262354373932, + "rewards/margins": 3.3840813636779785, + "rewards/rejected": -3.5255074501037598, + "step": 830 + }, + { + "epoch": 2.2537112010796223, + "grad_norm": 11.8125, + "learning_rate": 8.780244890964567e-07, + "logits/chosen": -1.4209728240966797, + "logits/rejected": -1.2569080591201782, + "logps/chosen": -177.04782104492188, + "logps/rejected": -275.0938415527344, + "loss": 0.0698, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16639022529125214, + "rewards/margins": 3.9153380393981934, + "rewards/rejected": -3.748948335647583, + "step": 835 + }, + { + "epoch": 2.2672064777327936, + "grad_norm": 10.625, + "learning_rate": 8.483131915247969e-07, + "logits/chosen": -1.563407301902771, + "logits/rejected": -1.534883975982666, + "logps/chosen": -171.35104370117188, + "logps/rejected": -242.4336700439453, + "loss": 0.0949, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.2416602075099945, + "rewards/margins": 4.914166450500488, + "rewards/rejected": -5.155826568603516, + "step": 840 + }, + { + "epoch": 2.280701754385965, + "grad_norm": 19.75, + "learning_rate": 8.190102392238191e-07, + "logits/chosen": -1.4438880681991577, + "logits/rejected": -1.4186255931854248, + "logps/chosen": -154.63705444335938, + "logps/rejected": -207.8048858642578, + "loss": 0.0783, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18597714602947235, + "rewards/margins": 4.108304500579834, + "rewards/rejected": -4.294281959533691, + "step": 845 + }, + { + "epoch": 2.294197031039136, + "grad_norm": 32.25, + "learning_rate": 7.90122876740086e-07, + "logits/chosen": -1.63836669921875, + "logits/rejected": -1.5565919876098633, + "logps/chosen": -226.85037231445312, + "logps/rejected": -326.13421630859375, + "loss": 0.0577, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0758393257856369, + "rewards/margins": 4.579066276550293, + "rewards/rejected": -4.503227233886719, + "step": 850 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 5.0625, + "learning_rate": 7.616582458742059e-07, + "logits/chosen": -1.4565999507904053, + "logits/rejected": -1.455143928527832, + "logps/chosen": -212.2303009033203, + "logps/rejected": -276.86834716796875, + "loss": 0.0519, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1173635721206665, + "rewards/margins": 4.344286918640137, + "rewards/rejected": -4.46165132522583, + "step": 855 + }, + { + "epoch": 2.3211875843454792, + "grad_norm": 9.6875, + "learning_rate": 7.336233839151693e-07, + "logits/chosen": -1.6497745513916016, + "logits/rejected": -1.6588242053985596, + "logps/chosen": -169.42959594726562, + "logps/rejected": -258.19207763671875, + "loss": 0.1057, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.21658802032470703, + "rewards/margins": 3.805851697921753, + "rewards/rejected": -4.022439479827881, + "step": 860 + }, + { + "epoch": 2.3346828609986505, + "grad_norm": 21.5, + "learning_rate": 7.060252219005304e-07, + "logits/chosen": -1.520618200302124, + "logits/rejected": -1.5337458848953247, + "logps/chosen": -227.05679321289062, + "logps/rejected": -317.5985107421875, + "loss": 0.0774, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06503160297870636, + "rewards/margins": 4.4666852951049805, + "rewards/rejected": -4.531716823577881, + "step": 865 + }, + { + "epoch": 2.348178137651822, + "grad_norm": 12.5, + "learning_rate": 6.788705829028483e-07, + "logits/chosen": -1.5424460172653198, + "logits/rejected": -1.527999997138977, + "logps/chosen": -186.46414184570312, + "logps/rejected": -190.83157348632812, + "loss": 0.1022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1064692884683609, + "rewards/margins": 3.359034776687622, + "rewards/rejected": -3.2525649070739746, + "step": 870 + }, + { + "epoch": 2.361673414304993, + "grad_norm": 66.5, + "learning_rate": 6.521661803428225e-07, + "logits/chosen": -1.5013136863708496, + "logits/rejected": -1.5206286907196045, + "logps/chosen": -201.0956268310547, + "logps/rejected": -198.01573181152344, + "loss": 0.0955, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13122853636741638, + "rewards/margins": 3.767671585083008, + "rewards/rejected": -3.898899793624878, + "step": 875 + }, + { + "epoch": 2.375168690958165, + "grad_norm": 11.3125, + "learning_rate": 6.259186163295439e-07, + "logits/chosen": -1.2552602291107178, + "logits/rejected": -1.3482682704925537, + "logps/chosen": -246.9757080078125, + "logps/rejected": -239.8274383544922, + "loss": 0.0983, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1879548728466034, + "rewards/margins": 3.7479751110076904, + "rewards/rejected": -3.935929775238037, + "step": 880 + }, + { + "epoch": 2.388663967611336, + "grad_norm": 16.0, + "learning_rate": 6.001343800282569e-07, + "logits/chosen": -1.5184439420700073, + "logits/rejected": -1.4158121347427368, + "logps/chosen": -145.63616943359375, + "logps/rejected": -212.58468627929688, + "loss": 0.0783, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.3523162603378296, + "rewards/margins": 4.166034698486328, + "rewards/rejected": -4.5183515548706055, + "step": 885 + }, + { + "epoch": 2.4021592442645074, + "grad_norm": 12.0625, + "learning_rate": 5.748198460560475e-07, + "logits/chosen": -1.602419137954712, + "logits/rejected": -1.6869083642959595, + "logps/chosen": -211.70947265625, + "logps/rejected": -220.8863525390625, + "loss": 0.0806, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16532480716705322, + "rewards/margins": 4.41878080368042, + "rewards/rejected": -4.253456115722656, + "step": 890 + }, + { + "epoch": 2.4156545209176787, + "grad_norm": 32.75, + "learning_rate": 5.499812729058546e-07, + "logits/chosen": -1.56089186668396, + "logits/rejected": -1.5883516073226929, + "logps/chosen": -181.11459350585938, + "logps/rejected": -161.60299682617188, + "loss": 0.1433, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.2707998752593994, + "rewards/margins": 3.216136932373047, + "rewards/rejected": -3.4869370460510254, + "step": 895 + }, + { + "epoch": 2.42914979757085, + "grad_norm": 14.8125, + "learning_rate": 5.256248013991857e-07, + "logits/chosen": -1.5014961957931519, + "logits/rejected": -1.4206339120864868, + "logps/chosen": -226.8283233642578, + "logps/rejected": -266.60333251953125, + "loss": 0.0715, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.00030528902425430715, + "rewards/margins": 4.552371978759766, + "rewards/rejected": -4.552066802978516, + "step": 900 + }, + { + "epoch": 2.4426450742240218, + "grad_norm": 23.0, + "learning_rate": 5.01756453167925e-07, + "logits/chosen": -1.5279182195663452, + "logits/rejected": -1.5130751132965088, + "logps/chosen": -199.68397521972656, + "logps/rejected": -246.5128936767578, + "loss": 0.0683, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.14631351828575134, + "rewards/margins": 4.73899507522583, + "rewards/rejected": -4.592680931091309, + "step": 905 + }, + { + "epoch": 2.456140350877193, + "grad_norm": 21.375, + "learning_rate": 4.78382129165613e-07, + "logits/chosen": -1.4500765800476074, + "logits/rejected": -1.5014575719833374, + "logps/chosen": -185.51475524902344, + "logps/rejected": -181.7137908935547, + "loss": 0.1049, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09951256215572357, + "rewards/margins": 3.4707932472229004, + "rewards/rejected": -3.371281147003174, + "step": 910 + }, + { + "epoch": 2.4696356275303644, + "grad_norm": 32.5, + "learning_rate": 4.5550760820855633e-07, + "logits/chosen": -1.557877779006958, + "logits/rejected": -1.4586069583892822, + "logps/chosen": -209.05062866210938, + "logps/rejected": -308.66424560546875, + "loss": 0.118, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.2664136290550232, + "rewards/margins": 4.0513434410095215, + "rewards/rejected": -4.3177571296691895, + "step": 915 + }, + { + "epoch": 2.4831309041835357, + "grad_norm": 22.5, + "learning_rate": 4.3313854554713457e-07, + "logits/chosen": -1.5593338012695312, + "logits/rejected": -1.5647127628326416, + "logps/chosen": -197.6747283935547, + "logps/rejected": -253.01876831054688, + "loss": 0.0716, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0987640991806984, + "rewards/margins": 4.090095043182373, + "rewards/rejected": -3.9913315773010254, + "step": 920 + }, + { + "epoch": 2.4966261808367074, + "grad_norm": 20.125, + "learning_rate": 4.1128047146765936e-07, + "logits/chosen": -1.435847520828247, + "logits/rejected": -1.453253149986267, + "logps/chosen": -141.46656799316406, + "logps/rejected": -162.93905639648438, + "loss": 0.1007, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20369374752044678, + "rewards/margins": 3.790607452392578, + "rewards/rejected": -3.586913585662842, + "step": 925 + }, + { + "epoch": 2.5101214574898787, + "grad_norm": 32.5, + "learning_rate": 3.899387899251242e-07, + "logits/chosen": -1.499912142753601, + "logits/rejected": -1.5055288076400757, + "logps/chosen": -179.4788360595703, + "logps/rejected": -202.9369354248047, + "loss": 0.0808, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04291580989956856, + "rewards/margins": 3.4943645000457764, + "rewards/rejected": -3.537280321121216, + "step": 930 + }, + { + "epoch": 2.52361673414305, + "grad_norm": 6.59375, + "learning_rate": 3.6911877720719053e-07, + "logits/chosen": -1.6243568658828735, + "logits/rejected": -1.5396671295166016, + "logps/chosen": -155.4473419189453, + "logps/rejected": -191.9477081298828, + "loss": 0.1245, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33543360233306885, + "rewards/margins": 4.113525867462158, + "rewards/rejected": -4.4489593505859375, + "step": 935 + }, + { + "epoch": 2.5371120107962213, + "grad_norm": 10.3125, + "learning_rate": 3.488255806297311e-07, + "logits/chosen": -1.4612650871276855, + "logits/rejected": -1.6070709228515625, + "logps/chosen": -164.7592010498047, + "logps/rejected": -161.7231903076172, + "loss": 0.1901, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.06939269602298737, + "rewards/margins": 3.406930446624756, + "rewards/rejected": -3.3375372886657715, + "step": 940 + }, + { + "epoch": 2.5506072874493926, + "grad_norm": 7.46875, + "learning_rate": 3.2906421726426857e-07, + "logits/chosen": -1.4703078269958496, + "logits/rejected": -1.4379500150680542, + "logps/chosen": -204.19473266601562, + "logps/rejected": -244.11965942382812, + "loss": 0.0904, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6960457563400269, + "rewards/margins": 4.154335975646973, + "rewards/rejected": -4.850381851196289, + "step": 945 + }, + { + "epoch": 2.564102564102564, + "grad_norm": 17.375, + "learning_rate": 3.09839572697605e-07, + "logits/chosen": -1.560767412185669, + "logits/rejected": -1.4427921772003174, + "logps/chosen": -243.10568237304688, + "logps/rejected": -232.52108764648438, + "loss": 0.0844, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.049421075731515884, + "rewards/margins": 4.088489055633545, + "rewards/rejected": -4.137909889221191, + "step": 950 + }, + { + "epoch": 2.5775978407557356, + "grad_norm": 19.75, + "learning_rate": 2.9115639982396166e-07, + "logits/chosen": -1.515772819519043, + "logits/rejected": -1.6191974878311157, + "logps/chosen": -210.3816375732422, + "logps/rejected": -198.30801391601562, + "loss": 0.1289, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.32715946435928345, + "rewards/margins": 3.6732399463653564, + "rewards/rejected": -4.000399589538574, + "step": 955 + }, + { + "epoch": 2.591093117408907, + "grad_norm": 16.875, + "learning_rate": 2.7301931766992916e-07, + "logits/chosen": -1.53992760181427, + "logits/rejected": -1.6426169872283936, + "logps/chosen": -202.2464599609375, + "logps/rejected": -200.73020935058594, + "loss": 0.0916, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2096923142671585, + "rewards/margins": 3.49652361869812, + "rewards/rejected": -3.2868313789367676, + "step": 960 + }, + { + "epoch": 2.604588394062078, + "grad_norm": 15.875, + "learning_rate": 2.554328102525022e-07, + "logits/chosen": -1.468806505203247, + "logits/rejected": -1.5037376880645752, + "logps/chosen": -225.407470703125, + "logps/rejected": -265.16326904296875, + "loss": 0.0516, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1482563018798828, + "rewards/margins": 3.908936023712158, + "rewards/rejected": -3.760679244995117, + "step": 965 + }, + { + "epoch": 2.6180836707152495, + "grad_norm": 28.25, + "learning_rate": 2.3840122547050482e-07, + "logits/chosen": -1.4675546884536743, + "logits/rejected": -1.427056074142456, + "logps/chosen": -189.55482482910156, + "logps/rejected": -238.43399047851562, + "loss": 0.128, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.15342381596565247, + "rewards/margins": 4.185477256774902, + "rewards/rejected": -4.338901042938232, + "step": 970 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 10.6875, + "learning_rate": 2.219287740296605e-07, + "logits/chosen": -1.5017975568771362, + "logits/rejected": -1.5283129215240479, + "logps/chosen": -185.2952117919922, + "logps/rejected": -218.5054168701172, + "loss": 0.0971, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.2838120460510254, + "rewards/margins": 4.120657444000244, + "rewards/rejected": -4.4044694900512695, + "step": 975 + }, + { + "epoch": 2.6450742240215925, + "grad_norm": 21.0, + "learning_rate": 2.060195284015837e-07, + "logits/chosen": -1.662113904953003, + "logits/rejected": -1.6862503290176392, + "logps/chosen": -150.606689453125, + "logps/rejected": -198.61793518066406, + "loss": 0.1152, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2786501944065094, + "rewards/margins": 3.8265221118927, + "rewards/rejected": -4.105172157287598, + "step": 980 + }, + { + "epoch": 2.658569500674764, + "grad_norm": 19.75, + "learning_rate": 1.9067742181694353e-07, + "logits/chosen": -1.4568703174591064, + "logits/rejected": -1.4512639045715332, + "logps/chosen": -171.15443420410156, + "logps/rejected": -221.99526977539062, + "loss": 0.096, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17055651545524597, + "rewards/margins": 5.160454273223877, + "rewards/rejected": -5.3310112953186035, + "step": 985 + }, + { + "epoch": 2.672064777327935, + "grad_norm": 93.0, + "learning_rate": 1.75906247293057e-07, + "logits/chosen": -1.6594133377075195, + "logits/rejected": -1.5529086589813232, + "logps/chosen": -156.86392211914062, + "logps/rejected": -285.59197998046875, + "loss": 0.118, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5136551856994629, + "rewards/margins": 4.625790596008301, + "rewards/rejected": -5.139446258544922, + "step": 990 + }, + { + "epoch": 2.6855600539811064, + "grad_norm": 11.3125, + "learning_rate": 1.617096566961429e-07, + "logits/chosen": -1.466498613357544, + "logits/rejected": -1.4549661874771118, + "logps/chosen": -155.0102081298828, + "logps/rejected": -232.1795654296875, + "loss": 0.152, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.09109257161617279, + "rewards/margins": 3.467794418334961, + "rewards/rejected": -3.558886766433716, + "step": 995 + }, + { + "epoch": 2.699055330634278, + "grad_norm": 15.9375, + "learning_rate": 1.4809115983847267e-07, + "logits/chosen": -1.377762794494629, + "logits/rejected": -1.3253929615020752, + "logps/chosen": -148.2834014892578, + "logps/rejected": -208.0382080078125, + "loss": 0.1151, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.24366268515586853, + "rewards/margins": 3.6103515625, + "rewards/rejected": -3.8540141582489014, + "step": 1000 + }, + { + "epoch": 2.699055330634278, + "eval_logits/chosen": -1.536294937133789, + "eval_logits/rejected": -1.5776937007904053, + "eval_logps/chosen": -191.7211456298828, + "eval_logps/rejected": -226.05455017089844, + "eval_loss": 0.31860384345054626, + "eval_rewards/accuracies": 0.8524096608161926, + "eval_rewards/chosen": -0.7276893258094788, + "eval_rewards/margins": 2.395343065261841, + "eval_rewards/rejected": -3.1230320930480957, + "eval_runtime": 23.3449, + "eval_samples_per_second": 14.136, + "eval_steps_per_second": 3.555, + "step": 1000 + }, + { + "epoch": 2.7125506072874495, + "grad_norm": 23.625, + "learning_rate": 1.3505412361064395e-07, + "logits/chosen": -1.4981733560562134, + "logits/rejected": -1.5207927227020264, + "logps/chosen": -192.99154663085938, + "logps/rejected": -194.6613311767578, + "loss": 0.0649, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07562440633773804, + "rewards/margins": 4.270889759063721, + "rewards/rejected": -4.195265769958496, + "step": 1005 + }, + { + "epoch": 2.7260458839406208, + "grad_norm": 20.5, + "learning_rate": 1.226017711491867e-07, + "logits/chosen": -1.5061196088790894, + "logits/rejected": -1.5956671237945557, + "logps/chosen": -170.25169372558594, + "logps/rejected": -240.0498046875, + "loss": 0.0805, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27517637610435486, + "rewards/margins": 3.623337507247925, + "rewards/rejected": -3.89851450920105, + "step": 1010 + }, + { + "epoch": 2.739541160593792, + "grad_norm": 31.25, + "learning_rate": 1.107371810397076e-07, + "logits/chosen": -1.4881411790847778, + "logits/rejected": -1.5475780963897705, + "logps/chosen": -237.45504760742188, + "logps/rejected": -212.13330078125, + "loss": 0.0625, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10645435005426407, + "rewards/margins": 4.086081027984619, + "rewards/rejected": -4.192535400390625, + "step": 1015 + }, + { + "epoch": 2.753036437246964, + "grad_norm": 16.75, + "learning_rate": 9.946328655577625e-08, + "logits/chosen": -1.5837833881378174, + "logits/rejected": -1.6130040884017944, + "logps/chosen": -137.10398864746094, + "logps/rejected": -171.19357299804688, + "loss": 0.0725, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27897781133651733, + "rewards/margins": 3.8964107036590576, + "rewards/rejected": -4.175389289855957, + "step": 1020 + }, + { + "epoch": 2.766531713900135, + "grad_norm": 28.25, + "learning_rate": 8.878287493373245e-08, + "logits/chosen": -1.5753690004348755, + "logits/rejected": -1.6070302724838257, + "logps/chosen": -214.03018188476562, + "logps/rejected": -189.55850219726562, + "loss": 0.1188, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.10192601382732391, + "rewards/margins": 3.4568443298339844, + "rewards/rejected": -3.558769941329956, + "step": 1025 + }, + { + "epoch": 2.7800269905533064, + "grad_norm": 26.625, + "learning_rate": 7.869858668360042e-08, + "logits/chosen": -1.4193127155303955, + "logits/rejected": -1.2717030048370361, + "logps/chosen": -187.0641632080078, + "logps/rejected": -224.65066528320312, + "loss": 0.1028, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18936040997505188, + "rewards/margins": 4.242377758026123, + "rewards/rejected": -4.43173885345459, + "step": 1030 + }, + { + "epoch": 2.7935222672064777, + "grad_norm": 24.75, + "learning_rate": 6.921291493627747e-08, + "logits/chosen": -1.6177479028701782, + "logits/rejected": -1.6725289821624756, + "logps/chosen": -248.9903564453125, + "logps/rejected": -230.86611938476562, + "loss": 0.0856, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.29994627833366394, + "rewards/margins": 3.9232945442199707, + "rewards/rejected": -3.6233487129211426, + "step": 1035 + }, + { + "epoch": 2.807017543859649, + "grad_norm": 42.5, + "learning_rate": 6.032820482716001e-08, + "logits/chosen": -1.5851434469223022, + "logits/rejected": -1.5880482196807861, + "logps/chosen": -155.3755340576172, + "logps/rejected": -186.6389617919922, + "loss": 0.1754, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.19228528439998627, + "rewards/margins": 3.5754799842834473, + "rewards/rejected": -3.7677650451660156, + "step": 1040 + }, + { + "epoch": 2.8205128205128203, + "grad_norm": 43.0, + "learning_rate": 5.204665291635519e-08, + "logits/chosen": -1.496819019317627, + "logits/rejected": -1.5007538795471191, + "logps/chosen": -179.5200653076172, + "logps/rejected": -266.001953125, + "loss": 0.1038, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.35474246740341187, + "rewards/margins": 3.8987841606140137, + "rewards/rejected": -4.253526210784912, + "step": 1045 + }, + { + "epoch": 2.834008097165992, + "grad_norm": 27.5, + "learning_rate": 4.437030664562969e-08, + "logits/chosen": -1.470956563949585, + "logits/rejected": -1.52825927734375, + "logps/chosen": -203.93551635742188, + "logps/rejected": -220.02639770507812, + "loss": 0.1639, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.0977933406829834, + "rewards/margins": 3.205706834793091, + "rewards/rejected": -3.3035004138946533, + "step": 1050 + }, + { + "epoch": 2.8475033738191633, + "grad_norm": 65.0, + "learning_rate": 3.730106383222132e-08, + "logits/chosen": -1.5251743793487549, + "logits/rejected": -1.3242510557174683, + "logps/chosen": -186.79141235351562, + "logps/rejected": -250.46566772460938, + "loss": 0.0909, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.299465537071228, + "rewards/margins": 4.545691967010498, + "rewards/rejected": -4.845158100128174, + "step": 1055 + }, + { + "epoch": 2.8609986504723346, + "grad_norm": 19.75, + "learning_rate": 3.084067219964182e-08, + "logits/chosen": -1.527754783630371, + "logits/rejected": -1.5058457851409912, + "logps/chosen": -173.50900268554688, + "logps/rejected": -246.65628051757812, + "loss": 0.2529, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.42231351137161255, + "rewards/margins": 3.4638805389404297, + "rewards/rejected": -3.8861937522888184, + "step": 1060 + }, + { + "epoch": 2.8744939271255063, + "grad_norm": 43.75, + "learning_rate": 2.499072894559057e-08, + "logits/chosen": -1.6412513256072998, + "logits/rejected": -1.6829668283462524, + "logps/chosen": -180.06788635253906, + "logps/rejected": -219.94528198242188, + "loss": 0.1089, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.23302574455738068, + "rewards/margins": 3.369854688644409, + "rewards/rejected": -3.6028804779052734, + "step": 1065 + }, + { + "epoch": 2.8879892037786776, + "grad_norm": 13.75, + "learning_rate": 1.975268034707878e-08, + "logits/chosen": -1.4751927852630615, + "logits/rejected": -1.5141003131866455, + "logps/chosen": -204.79470825195312, + "logps/rejected": -223.97509765625, + "loss": 0.0822, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15369097888469696, + "rewards/margins": 3.9829258918762207, + "rewards/rejected": -3.8292346000671387, + "step": 1070 + }, + { + "epoch": 2.901484480431849, + "grad_norm": 39.0, + "learning_rate": 1.512782140286939e-08, + "logits/chosen": -1.4587006568908691, + "logits/rejected": -1.5042657852172852, + "logps/chosen": -156.6952667236328, + "logps/rejected": -263.0159912109375, + "loss": 0.0959, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11610189825296402, + "rewards/margins": 3.9884142875671387, + "rewards/rejected": -4.10451602935791, + "step": 1075 + }, + { + "epoch": 2.91497975708502, + "grad_norm": 17.75, + "learning_rate": 1.1117295513313475e-08, + "logits/chosen": -1.665400743484497, + "logits/rejected": -1.6617343425750732, + "logps/chosen": -161.07443237304688, + "logps/rejected": -207.7931671142578, + "loss": 0.0872, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.10158289968967438, + "rewards/margins": 4.066722869873047, + "rewards/rejected": -3.965139865875244, + "step": 1080 + }, + { + "epoch": 2.9284750337381915, + "grad_norm": 20.75, + "learning_rate": 7.72209419766995e-09, + "logits/chosen": -1.4860131740570068, + "logits/rejected": -1.3406977653503418, + "logps/chosen": -168.0951690673828, + "logps/rejected": -274.35113525390625, + "loss": 0.1053, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.44807571172714233, + "rewards/margins": 3.924337863922119, + "rewards/rejected": -4.372413635253906, + "step": 1085 + }, + { + "epoch": 2.941970310391363, + "grad_norm": 16.25, + "learning_rate": 4.943056848972227e-09, + "logits/chosen": -1.493690848350525, + "logits/rejected": -1.5224257707595825, + "logps/chosen": -209.3112335205078, + "logps/rejected": -208.22988891601562, + "loss": 0.0878, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.035886406898498535, + "rewards/margins": 3.8605358600616455, + "rewards/rejected": -3.8246493339538574, + "step": 1090 + }, + { + "epoch": 2.9554655870445345, + "grad_norm": 14.8125, + "learning_rate": 2.7808705265053305e-09, + "logits/chosen": -1.571223497390747, + "logits/rejected": -1.5577231645584106, + "logps/chosen": -169.42562866210938, + "logps/rejected": -181.50631713867188, + "loss": 0.1109, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.13199149072170258, + "rewards/margins": 3.612278699874878, + "rewards/rejected": -3.744269847869873, + "step": 1095 + }, + { + "epoch": 2.968960863697706, + "grad_norm": 27.75, + "learning_rate": 1.2360697859462035e-09, + "logits/chosen": -1.5886671543121338, + "logits/rejected": -1.562727928161621, + "logps/chosen": -162.84046936035156, + "logps/rejected": -219.8025360107422, + "loss": 0.118, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.3454614281654358, + "rewards/margins": 4.1069793701171875, + "rewards/rejected": -4.4524407386779785, + "step": 1100 + }, + { + "epoch": 2.982456140350877, + "grad_norm": 13.0625, + "learning_rate": 3.090365472041557e-10, + "logits/chosen": -1.5336341857910156, + "logits/rejected": -1.5714600086212158, + "logps/chosen": -217.091064453125, + "logps/rejected": -239.0583953857422, + "loss": 0.1793, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.2769380509853363, + "rewards/margins": 3.7623977661132812, + "rewards/rejected": -4.039335250854492, + "step": 1105 + }, + { + "epoch": 2.9959514170040484, + "grad_norm": 16.75, + "learning_rate": 0.0, + "logits/chosen": -1.4733049869537354, + "logits/rejected": -1.4821723699569702, + "logps/chosen": -191.77438354492188, + "logps/rejected": -275.19158935546875, + "loss": 0.0661, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11918088048696518, + "rewards/margins": 4.132817268371582, + "rewards/rejected": -4.013636589050293, + "step": 1110 + }, + { + "epoch": 2.9959514170040484, + "step": 1110, + "total_flos": 4.5615607240812134e+17, + "train_loss": 0.26195224279218965, + "train_runtime": 3105.2921, + "train_samples_per_second": 2.862, + "train_steps_per_second": 0.357 + } + ], + "logging_steps": 5, + "max_steps": 1110, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.5615607240812134e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}