diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,3246 +1,1516 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.20013254392163232, + "epoch": 0.028088778094264185, "eval_steps": 5000, - "global_step": 1900, + "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.001053329178534907, - "grad_norm": 217.68748474121094, - "learning_rate": 5.263157894736842e-06, - "logits/chosen": -2.8389856815338135, - "logits/rejected": -2.8396875858306885, - "logps/chosen": -5.6633830070495605, - "logps/rejected": -6.1086931228637695, - "loss": 5.7387, - "odds_ratio_loss": 12.78534984588623, - "rewards/accuracies": 0.6208333373069763, - "rewards/chosen": -0.5663383603096008, - "rewards/margins": 0.04453102499246597, - "rewards/rejected": -0.6108693480491638, - "sft_loss": 4.460188388824463, + "epoch": 0.00028088778094264183, + "grad_norm": 1.578125, + "learning_rate": 1.4044943820224718e-07, + "logits/chosen": -6.981852054595947, + "logits/rejected": -6.981904029846191, + "logps/chosen": -0.7819840312004089, + "logps/rejected": -3.407517910003662, + "loss": 0.8348, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -0.0781984031200409, + "rewards/margins": 0.2625533938407898, + "rewards/rejected": -0.3407517671585083, + "step": 1 + }, + { + "epoch": 0.0005617755618852837, + "grad_norm": 1.90625, + "learning_rate": 2.8089887640449437e-07, + "logits/chosen": -6.981129169464111, + "logits/rejected": -6.981235027313232, + "logps/chosen": -0.5972335338592529, + "logps/rejected": -3.7509422302246094, + "loss": 0.8099, + "rewards/accuracies": 0.8203125, + "rewards/chosen": -0.059723351150751114, + "rewards/margins": 0.3153708577156067, + "rewards/rejected": -0.3750942647457123, + "step": 2 + }, + { + "epoch": 0.0008426633428279256, + "grad_norm": 1.328125, + "learning_rate": 4.213483146067416e-07, + "logits/chosen": -6.982983589172363, + "logits/rejected": -6.983180522918701, + "logps/chosen": -0.6279873847961426, + "logps/rejected": -3.3199615478515625, + "loss": 0.8299, + "rewards/accuracies": 0.8671875, + "rewards/chosen": -0.06279874593019485, + "rewards/margins": 0.26919740438461304, + "rewards/rejected": -0.3319961428642273, + "step": 3 + }, + { + "epoch": 0.0011235511237705673, + "grad_norm": 1.046875, + "learning_rate": 5.617977528089887e-07, + "logits/chosen": -6.978228569030762, + "logits/rejected": -6.978395462036133, + "logps/chosen": -0.663998007774353, + "logps/rejected": -3.635097026824951, + "loss": 0.818, + "rewards/accuracies": 0.9140625, + "rewards/chosen": -0.06639980524778366, + "rewards/margins": 0.2971099317073822, + "rewards/rejected": -0.3635097146034241, + "step": 4 + }, + { + "epoch": 0.0014044389047132091, + "grad_norm": 1.9921875, + "learning_rate": 7.02247191011236e-07, + "logits/chosen": -6.984366416931152, + "logits/rejected": -6.984519004821777, + "logps/chosen": -0.5929118990898132, + "logps/rejected": -3.171377420425415, + "loss": 0.833, + "rewards/accuracies": 0.828125, + "rewards/chosen": -0.05929119512438774, + "rewards/margins": 0.25784653425216675, + "rewards/rejected": -0.317137748003006, + "step": 5 + }, + { + "epoch": 0.0016853266856558511, + "grad_norm": 0.81640625, + "learning_rate": 8.426966292134832e-07, + "logits/chosen": -6.9845099449157715, + "logits/rejected": -6.9846272468566895, + "logps/chosen": -0.6544881463050842, + "logps/rejected": -3.7314887046813965, + "loss": 0.8135, + "rewards/accuracies": 0.8515625, + "rewards/chosen": -0.0654488131403923, + "rewards/margins": 0.3077000379562378, + "rewards/rejected": -0.3731488585472107, + "step": 6 + }, + { + "epoch": 0.001966214466598493, + "grad_norm": 2.0625, + "learning_rate": 9.831460674157304e-07, + "logits/chosen": -6.983869552612305, + "logits/rejected": -6.98401403427124, + "logps/chosen": -0.6330832839012146, + "logps/rejected": -3.969212055206299, + "loss": 0.7987, + "rewards/accuracies": 0.8984375, + "rewards/chosen": -0.06330832839012146, + "rewards/margins": 0.3336128890514374, + "rewards/rejected": -0.39692118763923645, + "step": 7 + }, + { + "epoch": 0.0022471022475411347, + "grad_norm": 1.2265625, + "learning_rate": 1.1235955056179775e-06, + "logits/chosen": -6.984411239624023, + "logits/rejected": -6.984560012817383, + "logps/chosen": -0.5015445351600647, + "logps/rejected": -3.5077507495880127, + "loss": 0.812, + "rewards/accuracies": 0.890625, + "rewards/chosen": -0.05015445500612259, + "rewards/margins": 0.30062058568000793, + "rewards/rejected": -0.3507750630378723, + "step": 8 + }, + { + "epoch": 0.0025279900284837765, + "grad_norm": 3.484375, + "learning_rate": 1.2640449438202247e-06, + "logits/chosen": -6.9832258224487305, + "logits/rejected": -6.9833831787109375, + "logps/chosen": -0.5916701555252075, + "logps/rejected": -3.901883125305176, + "loss": 0.7984, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05916702002286911, + "rewards/margins": 0.33102133870124817, + "rewards/rejected": -0.3901883363723755, + "step": 9 + }, + { + "epoch": 0.0028088778094264182, + "grad_norm": 1.6484375, + "learning_rate": 1.404494382022472e-06, + "logits/chosen": -6.982822418212891, + "logits/rejected": -6.982968330383301, + "logps/chosen": -0.6599135994911194, + "logps/rejected": -3.5486698150634766, + "loss": 0.8177, + "rewards/accuracies": 0.8671875, + "rewards/chosen": -0.0659913569688797, + "rewards/margins": 0.28887563943862915, + "rewards/rejected": -0.35486698150634766, "step": 10 }, { - "epoch": 0.002106658357069814, - "grad_norm": 59.99885177612305, - "learning_rate": 1.0526315789473684e-05, - "logits/chosen": -3.0334506034851074, - "logits/rejected": -3.034069299697876, - "logps/chosen": -2.313845634460449, - "logps/rejected": -3.318277597427368, - "loss": 2.3687, - "odds_ratio_loss": 3.849175214767456, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.23138457536697388, - "rewards/margins": 0.10044321417808533, - "rewards/rejected": -0.3318277597427368, - "sft_loss": 1.9837348461151123, - "step": 20 - }, - { - "epoch": 0.0031599875356047207, - "grad_norm": 21.754234313964844, - "learning_rate": 1.5789473684210526e-05, - "logits/chosen": -3.226034641265869, - "logits/rejected": -3.2263522148132324, - "logps/chosen": -1.0712947845458984, - "logps/rejected": -2.3490335941314697, - "loss": 1.1116, - "odds_ratio_loss": 2.9476191997528076, - "rewards/accuracies": 0.8083333373069763, - "rewards/chosen": -0.10712946206331253, - "rewards/margins": 0.12777391076087952, - "rewards/rejected": -0.23490336537361145, - "sft_loss": 0.816817581653595, - "step": 30 + "epoch": 0.00308976559036906, + "grad_norm": 0.8125, + "learning_rate": 1.544943820224719e-06, + "logits/chosen": -6.987037658691406, + "logits/rejected": -6.987265586853027, + "logps/chosen": -0.5249046087265015, + "logps/rejected": -3.0017249584198, + "loss": 0.8425, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.05249045789241791, + "rewards/margins": 0.24768203496932983, + "rewards/rejected": -0.30017250776290894, + "step": 11 + }, + { + "epoch": 0.0033706533713117022, + "grad_norm": 1.546875, + "learning_rate": 1.6853932584269663e-06, + "logits/chosen": -6.9844279289245605, + "logits/rejected": -6.984550952911377, + "logps/chosen": -0.6450465321540833, + "logps/rejected": -3.3710899353027344, + "loss": 0.8281, + "rewards/accuracies": 0.859375, + "rewards/chosen": -0.06450465321540833, + "rewards/margins": 0.2726043164730072, + "rewards/rejected": -0.3371089994907379, + "step": 12 + }, + { + "epoch": 0.003651541152254344, + "grad_norm": 1.390625, + "learning_rate": 1.8258426966292136e-06, + "logits/chosen": -6.985495567321777, + "logits/rejected": -6.985678672790527, + "logps/chosen": -0.6660882234573364, + "logps/rejected": -3.941418170928955, + "loss": 0.8009, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.06660880893468857, + "rewards/margins": 0.3275330066680908, + "rewards/rejected": -0.3941417932510376, + "step": 13 + }, + { + "epoch": 0.003932428933196986, + "grad_norm": 1.6015625, + "learning_rate": 1.966292134831461e-06, + "logits/chosen": -6.982288837432861, + "logits/rejected": -6.982420921325684, + "logps/chosen": -0.7159062623977661, + "logps/rejected": -3.4875435829162598, + "loss": 0.8243, + "rewards/accuracies": 0.8984375, + "rewards/chosen": -0.07159063965082169, + "rewards/margins": 0.27716371417045593, + "rewards/rejected": -0.348754346370697, + "step": 14 }, { "epoch": 0.004213316714139628, - "grad_norm": 7.782005310058594, - "learning_rate": 2.105263157894737e-05, - "logits/chosen": -3.4438676834106445, - "logits/rejected": -3.4442031383514404, - "logps/chosen": -0.8733291625976562, - "logps/rejected": -2.5004069805145264, - "loss": 0.914, - "odds_ratio_loss": 3.1110053062438965, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.08733292669057846, - "rewards/margins": 0.16270779073238373, - "rewards/rejected": -0.2500407099723816, - "sft_loss": 0.6028769016265869, - "step": 40 + "grad_norm": 1.4296875, + "learning_rate": 2.106741573033708e-06, + "logits/chosen": -6.986008644104004, + "logits/rejected": -6.986215114593506, + "logps/chosen": -0.6564575433731079, + "logps/rejected": -3.4806642532348633, + "loss": 0.8223, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.0656457468867302, + "rewards/margins": 0.28242069482803345, + "rewards/rejected": -0.34806641936302185, + "step": 15 + }, + { + "epoch": 0.004494204495082269, + "grad_norm": 0.94921875, + "learning_rate": 2.247191011235955e-06, + "logits/chosen": -6.985368251800537, + "logits/rejected": -6.985543727874756, + "logps/chosen": -0.5949140787124634, + "logps/rejected": -3.1833853721618652, + "loss": 0.8354, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05949141085147858, + "rewards/margins": 0.25884711742401123, + "rewards/rejected": -0.3183385133743286, + "step": 16 + }, + { + "epoch": 0.004775092276024911, + "grad_norm": 1.53125, + "learning_rate": 2.3876404494382022e-06, + "logits/chosen": -6.983895778656006, + "logits/rejected": -6.984076023101807, + "logps/chosen": -0.6407002806663513, + "logps/rejected": -3.6408045291900635, + "loss": 0.8183, + "rewards/accuracies": 0.8671875, + "rewards/chosen": -0.06407003104686737, + "rewards/margins": 0.30001044273376465, + "rewards/rejected": -0.3640804588794708, + "step": 17 + }, + { + "epoch": 0.005055980056967553, + "grad_norm": 0.91796875, + "learning_rate": 2.5280898876404495e-06, + "logits/chosen": -6.984615802764893, + "logits/rejected": -6.984807014465332, + "logps/chosen": -0.5760031938552856, + "logps/rejected": -2.942106008529663, + "loss": 0.8485, + "rewards/accuracies": 0.828125, + "rewards/chosen": -0.057600319385528564, + "rewards/margins": 0.2366103082895279, + "rewards/rejected": -0.29421064257621765, + "step": 18 + }, + { + "epoch": 0.005336867837910195, + "grad_norm": 2.765625, + "learning_rate": 2.6685393258426968e-06, + "logits/chosen": -6.985834121704102, + "logits/rejected": -6.986002445220947, + "logps/chosen": -0.603966474533081, + "logps/rejected": -3.6589133739471436, + "loss": 0.8121, + "rewards/accuracies": 0.8984375, + "rewards/chosen": -0.06039665639400482, + "rewards/margins": 0.3054946959018707, + "rewards/rejected": -0.36589139699935913, + "step": 19 + }, + { + "epoch": 0.0056177556188528365, + "grad_norm": 1.1484375, + "learning_rate": 2.808988764044944e-06, + "logits/chosen": -6.985572814941406, + "logits/rejected": -6.985714912414551, + "logps/chosen": -0.520312488079071, + "logps/rejected": -2.815075159072876, + "loss": 0.8505, + "rewards/accuracies": 0.8046875, + "rewards/chosen": -0.0520312525331974, + "rewards/margins": 0.22947625815868378, + "rewards/rejected": -0.2815075218677521, + "step": 20 }, { - "epoch": 0.0052666458926745345, - "grad_norm": 9.219217300415039, - "learning_rate": 2.631578947368421e-05, - "logits/chosen": -3.4509921073913574, - "logits/rejected": -3.451155662536621, - "logps/chosen": -0.7789952158927917, - "logps/rejected": -2.3700900077819824, - "loss": 0.8189, - "odds_ratio_loss": 2.722813367843628, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -0.0778995230793953, - "rewards/margins": 0.1591094583272934, - "rewards/rejected": -0.23700900375843048, - "sft_loss": 0.5466489195823669, - "step": 50 + "epoch": 0.005898643399795478, + "grad_norm": 3.421875, + "learning_rate": 2.9494382022471913e-06, + "logits/chosen": -6.986955165863037, + "logits/rejected": -6.987088680267334, + "logps/chosen": -0.6575434803962708, + "logps/rejected": -3.354898691177368, + "loss": 0.8309, + "rewards/accuracies": 0.8515625, + "rewards/chosen": -0.06575435400009155, + "rewards/margins": 0.26973551511764526, + "rewards/rejected": -0.3354898989200592, + "step": 21 + }, + { + "epoch": 0.00617953118073812, + "grad_norm": 1.359375, + "learning_rate": 3.089887640449438e-06, + "logits/chosen": -6.985095500946045, + "logits/rejected": -6.985231876373291, + "logps/chosen": -0.6375535726547241, + "logps/rejected": -3.722036600112915, + "loss": 0.8155, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06375535577535629, + "rewards/margins": 0.30844831466674805, + "rewards/rejected": -0.37220367789268494, + "step": 22 + }, + { + "epoch": 0.006460418961680763, + "grad_norm": 1.3359375, + "learning_rate": 3.230337078651686e-06, + "logits/chosen": -6.985586166381836, + "logits/rejected": -6.985759258270264, + "logps/chosen": -0.6368163824081421, + "logps/rejected": -3.551453113555908, + "loss": 0.8186, + "rewards/accuracies": 0.828125, + "rewards/chosen": -0.06368163973093033, + "rewards/margins": 0.2914636731147766, + "rewards/rejected": -0.35514530539512634, + "step": 23 + }, + { + "epoch": 0.0067413067426234045, + "grad_norm": 0.9453125, + "learning_rate": 3.3707865168539327e-06, + "logits/chosen": -6.983944892883301, + "logits/rejected": -6.984124660491943, + "logps/chosen": -0.6929716467857361, + "logps/rejected": -3.0338144302368164, + "loss": 0.8463, + "rewards/accuracies": 0.828125, + "rewards/chosen": -0.06929716467857361, + "rewards/margins": 0.23408427834510803, + "rewards/rejected": -0.30338141322135925, + "step": 24 + }, + { + "epoch": 0.007022194523566046, + "grad_norm": 2.15625, + "learning_rate": 3.5112359550561803e-06, + "logits/chosen": -6.988375663757324, + "logits/rejected": -6.988600730895996, + "logps/chosen": -0.5282725691795349, + "logps/rejected": -2.989157199859619, + "loss": 0.84, + "rewards/accuracies": 0.8671875, + "rewards/chosen": -0.05282726138830185, + "rewards/margins": 0.24608850479125977, + "rewards/rejected": -0.2989157438278198, + "step": 25 + }, + { + "epoch": 0.007303082304508688, + "grad_norm": 0.90234375, + "learning_rate": 3.651685393258427e-06, + "logits/chosen": -6.9826226234436035, + "logits/rejected": -6.982814311981201, + "logps/chosen": -0.576172411441803, + "logps/rejected": -3.1249589920043945, + "loss": 0.8391, + "rewards/accuracies": 0.8984375, + "rewards/chosen": -0.05761724337935448, + "rewards/margins": 0.2548786401748657, + "rewards/rejected": -0.3124958872795105, + "step": 26 + }, + { + "epoch": 0.00758397008545133, + "grad_norm": 0.8984375, + "learning_rate": 3.7921348314606744e-06, + "logits/chosen": -6.98524284362793, + "logits/rejected": -6.985445499420166, + "logps/chosen": -0.567965567111969, + "logps/rejected": -3.0560202598571777, + "loss": 0.8399, + "rewards/accuracies": 0.890625, + "rewards/chosen": -0.05679655820131302, + "rewards/margins": 0.2488054633140564, + "rewards/rejected": -0.3056020438671112, + "step": 27 + }, + { + "epoch": 0.007864857866393972, + "grad_norm": 2.046875, + "learning_rate": 3.932584269662922e-06, + "logits/chosen": -6.987497806549072, + "logits/rejected": -6.9876604080200195, + "logps/chosen": -0.671168863773346, + "logps/rejected": -3.572070837020874, + "loss": 0.8193, + "rewards/accuracies": 0.8515625, + "rewards/chosen": -0.0671168863773346, + "rewards/margins": 0.2900902330875397, + "rewards/rejected": -0.3572070896625519, + "step": 28 + }, + { + "epoch": 0.008145745647336613, + "grad_norm": 1.7578125, + "learning_rate": 4.073033707865169e-06, + "logits/chosen": -6.984347343444824, + "logits/rejected": -6.98446798324585, + "logps/chosen": -0.7242225408554077, + "logps/rejected": -3.0529520511627197, + "loss": 0.8501, + "rewards/accuracies": 0.8671875, + "rewards/chosen": -0.07242225855588913, + "rewards/margins": 0.23287296295166016, + "rewards/rejected": -0.30529525876045227, + "step": 29 }, { - "epoch": 0.006319975071209441, - "grad_norm": 7.359012603759766, - "learning_rate": 3.157894736842105e-05, - "logits/chosen": -3.5323359966278076, - "logits/rejected": -3.532480001449585, - "logps/chosen": -0.7335668802261353, - "logps/rejected": -2.5415773391723633, - "loss": 0.7747, - "odds_ratio_loss": 2.7141709327697754, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.07335668057203293, - "rewards/margins": 0.18080104887485504, - "rewards/rejected": -0.2541577219963074, - "sft_loss": 0.5032956600189209, - "step": 60 + "epoch": 0.008426633428279255, + "grad_norm": 0.90234375, + "learning_rate": 4.213483146067416e-06, + "logits/chosen": -6.987656116485596, + "logits/rejected": -6.987755298614502, + "logps/chosen": -0.5501436591148376, + "logps/rejected": -2.847120761871338, + "loss": 0.8488, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05501437187194824, + "rewards/margins": 0.22969767451286316, + "rewards/rejected": -0.2847120463848114, + "step": 30 }, { - "epoch": 0.007373304249744348, - "grad_norm": 16.543472290039062, - "learning_rate": 3.6842105263157895e-05, - "logits/chosen": -3.2315316200256348, - "logits/rejected": -3.2316715717315674, - "logps/chosen": -0.6829439997673035, - "logps/rejected": -2.482815980911255, - "loss": 0.7205, - "odds_ratio_loss": 2.6182825565338135, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -0.06829439848661423, - "rewards/margins": 0.17998717725276947, - "rewards/rejected": -0.2482815831899643, - "sft_loss": 0.4587023854255676, - "step": 70 + "epoch": 0.008707521209221898, + "grad_norm": 1.765625, + "learning_rate": 4.3539325842696635e-06, + "logits/chosen": -6.9837751388549805, + "logits/rejected": -6.98394775390625, + "logps/chosen": -0.6806322932243347, + "logps/rejected": -3.402733325958252, + "loss": 0.826, + "rewards/accuracies": 0.8671875, + "rewards/chosen": -0.06806322932243347, + "rewards/margins": 0.27221009135246277, + "rewards/rejected": -0.34027332067489624, + "step": 31 + }, + { + "epoch": 0.008988408990164539, + "grad_norm": 1.28125, + "learning_rate": 4.49438202247191e-06, + "logits/chosen": -6.984766960144043, + "logits/rejected": -6.984991073608398, + "logps/chosen": -0.5861184597015381, + "logps/rejected": -3.729720115661621, + "loss": 0.8089, + "rewards/accuracies": 0.9140625, + "rewards/chosen": -0.05861184746026993, + "rewards/margins": 0.3143601715564728, + "rewards/rejected": -0.3729720413684845, + "step": 32 + }, + { + "epoch": 0.009269296771107181, + "grad_norm": 1.046875, + "learning_rate": 4.634831460674158e-06, + "logits/chosen": -6.988133907318115, + "logits/rejected": -6.988280296325684, + "logps/chosen": -0.6212756037712097, + "logps/rejected": -3.2797980308532715, + "loss": 0.8298, + "rewards/accuracies": 0.859375, + "rewards/chosen": -0.06212755665183067, + "rewards/margins": 0.26585227251052856, + "rewards/rejected": -0.32797983288764954, + "step": 33 + }, + { + "epoch": 0.009550184552049822, + "grad_norm": 1.21875, + "learning_rate": 4.7752808988764044e-06, + "logits/chosen": -6.988953113555908, + "logits/rejected": -6.989142894744873, + "logps/chosen": -0.5206266641616821, + "logps/rejected": -3.0407469272613525, + "loss": 0.8377, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05206267163157463, + "rewards/margins": 0.2520120143890381, + "rewards/rejected": -0.3040746748447418, + "step": 34 + }, + { + "epoch": 0.009831072332992465, + "grad_norm": 1.0390625, + "learning_rate": 4.915730337078652e-06, + "logits/chosen": -6.989229202270508, + "logits/rejected": -6.989412307739258, + "logps/chosen": -0.5842424631118774, + "logps/rejected": -3.2295570373535156, + "loss": 0.8331, + "rewards/accuracies": 0.8828125, + "rewards/chosen": -0.05842424929141998, + "rewards/margins": 0.2645314335823059, + "rewards/rejected": -0.3229556977748871, + "step": 35 + }, + { + "epoch": 0.010111960113935106, + "grad_norm": 1.6015625, + "learning_rate": 5.056179775280899e-06, + "logits/chosen": -6.989651203155518, + "logits/rejected": -6.989835739135742, + "logps/chosen": -0.5448687672615051, + "logps/rejected": -3.445011615753174, + "loss": 0.8174, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05448687821626663, + "rewards/margins": 0.2900142967700958, + "rewards/rejected": -0.34450116753578186, + "step": 36 + }, + { + "epoch": 0.010392847894877748, + "grad_norm": 0.97265625, + "learning_rate": 5.196629213483146e-06, + "logits/chosen": -6.987741470336914, + "logits/rejected": -6.987899303436279, + "logps/chosen": -0.6063218116760254, + "logps/rejected": -3.657203197479248, + "loss": 0.8103, + "rewards/accuracies": 0.8671875, + "rewards/chosen": -0.06063217669725418, + "rewards/margins": 0.3050881326198578, + "rewards/rejected": -0.36572033166885376, + "step": 37 + }, + { + "epoch": 0.01067373567582039, + "grad_norm": 1.3125, + "learning_rate": 5.3370786516853935e-06, + "logits/chosen": -6.992892265319824, + "logits/rejected": -6.993043899536133, + "logps/chosen": -0.6687143445014954, + "logps/rejected": -3.4205684661865234, + "loss": 0.8262, + "rewards/accuracies": 0.921875, + "rewards/chosen": -0.06687143445014954, + "rewards/margins": 0.2751854360103607, + "rewards/rejected": -0.34205687046051025, + "step": 38 + }, + { + "epoch": 0.010954623456763032, + "grad_norm": 1.4453125, + "learning_rate": 5.477528089887641e-06, + "logits/chosen": -6.9873948097229, + "logits/rejected": -6.987576484680176, + "logps/chosen": -0.6130951046943665, + "logps/rejected": -3.9130940437316895, + "loss": 0.8023, + "rewards/accuracies": 0.8828125, + "rewards/chosen": -0.061309512704610825, + "rewards/margins": 0.3299998939037323, + "rewards/rejected": -0.3913094103336334, + "step": 39 + }, + { + "epoch": 0.011235511237705673, + "grad_norm": 0.98828125, + "learning_rate": 5.617977528089888e-06, + "logits/chosen": -6.99265718460083, + "logits/rejected": -6.992844104766846, + "logps/chosen": -0.6354062557220459, + "logps/rejected": -2.818499803543091, + "loss": 0.8565, + "rewards/accuracies": 0.7890625, + "rewards/chosen": -0.06354063004255295, + "rewards/margins": 0.21830937266349792, + "rewards/rejected": -0.2818499803543091, + "step": 40 }, { - "epoch": 0.008426633428279255, - "grad_norm": 5.6111907958984375, - "learning_rate": 4.210526315789474e-05, - "logits/chosen": -2.9154586791992188, - "logits/rejected": -2.9156744480133057, - "logps/chosen": -0.7533618807792664, - "logps/rejected": -2.669802665710449, - "loss": 0.7912, - "odds_ratio_loss": 2.688143730163574, - "rewards/accuracies": 0.8333333134651184, - "rewards/chosen": -0.07533618807792664, - "rewards/margins": 0.19164405763149261, - "rewards/rejected": -0.26698026061058044, - "sft_loss": 0.5223438739776611, - "step": 80 + "epoch": 0.011516399018648316, + "grad_norm": 0.80078125, + "learning_rate": 5.758426966292135e-06, + "logits/chosen": -6.9895339012146, + "logits/rejected": -6.989684104919434, + "logps/chosen": -0.6787976026535034, + "logps/rejected": -3.6232690811157227, + "loss": 0.8171, + "rewards/accuracies": 0.890625, + "rewards/chosen": -0.06787975877523422, + "rewards/margins": 0.2944471836090088, + "rewards/rejected": -0.3623269200325012, + "step": 41 + }, + { + "epoch": 0.011797286799590957, + "grad_norm": 0.77734375, + "learning_rate": 5.8988764044943826e-06, + "logits/chosen": -6.996128082275391, + "logits/rejected": -6.996224403381348, + "logps/chosen": -0.666965901851654, + "logps/rejected": -3.1539041996002197, + "loss": 0.8416, + "rewards/accuracies": 0.796875, + "rewards/chosen": -0.06669659167528152, + "rewards/margins": 0.2486938238143921, + "rewards/rejected": -0.315390408039093, + "step": 42 + }, + { + "epoch": 0.0120781745805336, + "grad_norm": 3.265625, + "learning_rate": 6.03932584269663e-06, + "logits/chosen": -6.990922927856445, + "logits/rejected": -6.991119861602783, + "logps/chosen": -0.6920310258865356, + "logps/rejected": -3.7208621501922607, + "loss": 0.8148, + "rewards/accuracies": 0.8203125, + "rewards/chosen": -0.06920310109853745, + "rewards/margins": 0.3028830885887146, + "rewards/rejected": -0.37208619713783264, + "step": 43 + }, + { + "epoch": 0.01235906236147624, + "grad_norm": 0.953125, + "learning_rate": 6.179775280898876e-06, + "logits/chosen": -6.995143413543701, + "logits/rejected": -6.995302200317383, + "logps/chosen": -0.6558566093444824, + "logps/rejected": -3.2403218746185303, + "loss": 0.8344, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -0.065585657954216, + "rewards/margins": 0.2584465444087982, + "rewards/rejected": -0.3240322172641754, + "step": 44 }, { - "epoch": 0.009479962606814161, - "grad_norm": 3.4394092559814453, - "learning_rate": 4.736842105263158e-05, - "logits/chosen": -3.0740275382995605, - "logits/rejected": -3.0744776725769043, - "logps/chosen": -0.6786951422691345, - "logps/rejected": -3.1611907482147217, - "loss": 0.7145, - "odds_ratio_loss": 2.766305446624756, - "rewards/accuracies": 0.8270833492279053, - "rewards/chosen": -0.06786951422691345, - "rewards/margins": 0.2482495754957199, - "rewards/rejected": -0.31611910462379456, - "sft_loss": 0.4378434419631958, - "step": 90 + "epoch": 0.012639950142418883, + "grad_norm": 0.94921875, + "learning_rate": 6.320224719101124e-06, + "logits/chosen": -6.992753028869629, + "logits/rejected": -6.992949962615967, + "logps/chosen": -0.6340925097465515, + "logps/rejected": -4.1242756843566895, + "loss": 0.7913, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.06340925395488739, + "rewards/margins": 0.34901827573776245, + "rewards/rejected": -0.41242751479148865, + "step": 45 + }, + { + "epoch": 0.012920837923361525, + "grad_norm": 1.140625, + "learning_rate": 6.460674157303372e-06, + "logits/chosen": -6.998230934143066, + "logits/rejected": -6.998466491699219, + "logps/chosen": -0.48865756392478943, + "logps/rejected": -3.8250622749328613, + "loss": 0.7962, + "rewards/accuracies": 0.9296875, + "rewards/chosen": -0.048865754157304764, + "rewards/margins": 0.333640456199646, + "rewards/rejected": -0.38250622153282166, + "step": 46 + }, + { + "epoch": 0.013201725704304166, + "grad_norm": 2.296875, + "learning_rate": 6.601123595505618e-06, + "logits/chosen": -6.998146057128906, + "logits/rejected": -6.998294830322266, + "logps/chosen": -0.6883760690689087, + "logps/rejected": -3.535900831222534, + "loss": 0.8262, + "rewards/accuracies": 0.796875, + "rewards/chosen": -0.06883761286735535, + "rewards/margins": 0.2847524583339691, + "rewards/rejected": -0.35359007120132446, + "step": 47 + }, + { + "epoch": 0.013482613485246809, + "grad_norm": 1.0, + "learning_rate": 6.741573033707865e-06, + "logits/chosen": -6.998913288116455, + "logits/rejected": -6.9990620613098145, + "logps/chosen": -0.5589190721511841, + "logps/rejected": -2.9924392700195312, + "loss": 0.8434, + "rewards/accuracies": 0.8671875, + "rewards/chosen": -0.055891912430524826, + "rewards/margins": 0.2433520257472992, + "rewards/rejected": -0.2992439270019531, + "step": 48 + }, + { + "epoch": 0.01376350126618945, + "grad_norm": 2.234375, + "learning_rate": 6.8820224719101126e-06, + "logits/chosen": -6.999466419219971, + "logits/rejected": -6.999658107757568, + "logps/chosen": -0.5700675845146179, + "logps/rejected": -3.315387487411499, + "loss": 0.8248, + "rewards/accuracies": 0.890625, + "rewards/chosen": -0.05700676143169403, + "rewards/margins": 0.2745319902896881, + "rewards/rejected": -0.33153873682022095, + "step": 49 + }, + { + "epoch": 0.014044389047132093, + "grad_norm": 1.078125, + "learning_rate": 7.022471910112361e-06, + "logits/chosen": -7.00152587890625, + "logits/rejected": -7.001701831817627, + "logps/chosen": -0.6014066934585571, + "logps/rejected": -3.370955228805542, + "loss": 0.8292, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.060140661895275116, + "rewards/margins": 0.27695485949516296, + "rewards/rejected": -0.3370955288410187, + "step": 50 }, { - "epoch": 0.010533291785349069, - "grad_norm": 9.483699798583984, - "learning_rate": 5.263157894736842e-05, - "logits/chosen": -3.197289228439331, - "logits/rejected": -3.19758677482605, - "logps/chosen": -0.6806226372718811, - "logps/rejected": -3.0961530208587646, - "loss": 0.7172, - "odds_ratio_loss": 2.700289726257324, + "epoch": 0.014325276828074733, + "grad_norm": 1.84375, + "learning_rate": 7.162921348314606e-06, + "logits/chosen": -7.0027384757995605, + "logits/rejected": -7.002865791320801, + "logps/chosen": -0.6463361978530884, + "logps/rejected": -3.259683609008789, + "loss": 0.8333, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06463362276554108, + "rewards/margins": 0.26133471727371216, + "rewards/rejected": -0.32596835494041443, + "step": 51 + }, + { + "epoch": 0.014606164609017376, + "grad_norm": 1.0703125, + "learning_rate": 7.303370786516854e-06, + "logits/chosen": -7.000398635864258, + "logits/rejected": -7.000590801239014, + "logps/chosen": -0.650733470916748, + "logps/rejected": -3.551283359527588, + "loss": 0.8207, + "rewards/accuracies": 0.859375, + "rewards/chosen": -0.06507335603237152, + "rewards/margins": 0.29005497694015503, + "rewards/rejected": -0.35512834787368774, + "step": 52 + }, + { + "epoch": 0.014887052389960017, + "grad_norm": 4.78125, + "learning_rate": 7.443820224719102e-06, + "logits/chosen": -7.001314640045166, + "logits/rejected": -7.001479148864746, + "logps/chosen": -0.6439567804336548, + "logps/rejected": -3.231163263320923, + "loss": 0.8379, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -0.06439567357301712, + "rewards/margins": 0.2587206959724426, + "rewards/rejected": -0.32311639189720154, + "step": 53 + }, + { + "epoch": 0.01516794017090266, + "grad_norm": 9.125, + "learning_rate": 7.584269662921349e-06, + "logits/chosen": -7.009494304656982, + "logits/rejected": -7.009639739990234, + "logps/chosen": -0.6828871965408325, + "logps/rejected": -3.5422186851501465, + "loss": 0.8226, "rewards/accuracies": 0.84375, - "rewards/chosen": -0.06806226074695587, - "rewards/margins": 0.24155308306217194, - "rewards/rejected": -0.3096153140068054, - "sft_loss": 0.44717150926589966, - "step": 100 + "rewards/chosen": -0.06828872114419937, + "rewards/margins": 0.28593313694000244, + "rewards/rejected": -0.3542218804359436, + "step": 54 + }, + { + "epoch": 0.0154488279518453, + "grad_norm": 1.9765625, + "learning_rate": 7.724719101123595e-06, + "logits/chosen": -7.009527206420898, + "logits/rejected": -7.009775161743164, + "logps/chosen": -0.5451503992080688, + "logps/rejected": -3.2060601711273193, + "loss": 0.831, + "rewards/accuracies": 0.8828125, + "rewards/chosen": -0.054515041410923004, + "rewards/margins": 0.2660909593105316, + "rewards/rejected": -0.3206060528755188, + "step": 55 + }, + { + "epoch": 0.015729715732787943, + "grad_norm": 1.203125, + "learning_rate": 7.865168539325843e-06, + "logits/chosen": -7.00971794128418, + "logits/rejected": -7.009875774383545, + "logps/chosen": -0.6500095129013062, + "logps/rejected": -3.940423011779785, + "loss": 0.8, + "rewards/accuracies": 0.8984375, + "rewards/chosen": -0.06500095129013062, + "rewards/margins": 0.32904139161109924, + "rewards/rejected": -0.39404234290122986, + "step": 56 + }, + { + "epoch": 0.016010603513730584, + "grad_norm": 2.703125, + "learning_rate": 8.00561797752809e-06, + "logits/chosen": -7.013581275939941, + "logits/rejected": -7.013791561126709, + "logps/chosen": -0.5986226797103882, + "logps/rejected": -3.3548402786254883, + "loss": 0.8256, + "rewards/accuracies": 0.828125, + "rewards/chosen": -0.05986226722598076, + "rewards/margins": 0.2756217420101166, + "rewards/rejected": -0.33548396825790405, + "step": 57 + }, + { + "epoch": 0.016291491294673225, + "grad_norm": 1.2109375, + "learning_rate": 8.146067415730338e-06, + "logits/chosen": -7.008817672729492, + "logits/rejected": -7.008937358856201, + "logps/chosen": -0.5546457171440125, + "logps/rejected": -3.4342713356018066, + "loss": 0.8211, + "rewards/accuracies": 0.8671875, + "rewards/chosen": -0.055464569479227066, + "rewards/margins": 0.28796258568763733, + "rewards/rejected": -0.3434271812438965, + "step": 58 + }, + { + "epoch": 0.01657237907561587, + "grad_norm": 1.7109375, + "learning_rate": 8.286516853932584e-06, + "logits/chosen": -7.012298583984375, + "logits/rejected": -7.012528896331787, + "logps/chosen": -0.5672339797019958, + "logps/rejected": -3.319044828414917, + "loss": 0.8259, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.05672340467572212, + "rewards/margins": 0.2751810550689697, + "rewards/rejected": -0.33190447092056274, + "step": 59 }, { - "epoch": 0.011586620963883975, - "grad_norm": 7.847203731536865, - "learning_rate": 5.789473684210527e-05, - "logits/chosen": -3.01198673248291, - "logits/rejected": -3.0121328830718994, - "logps/chosen": -0.7313061356544495, - "logps/rejected": -3.1134066581726074, - "loss": 0.7701, - "odds_ratio_loss": 2.80288028717041, - "rewards/accuracies": 0.8416666388511658, - "rewards/chosen": -0.07313062250614166, - "rewards/margins": 0.2382100373506546, - "rewards/rejected": -0.31134068965911865, - "sft_loss": 0.489812970161438, - "step": 110 + "epoch": 0.01685326685655851, + "grad_norm": 1.265625, + "learning_rate": 8.426966292134832e-06, + "logits/chosen": -7.013304710388184, + "logits/rejected": -7.013603210449219, + "logps/chosen": -0.5733633637428284, + "logps/rejected": -3.944350004196167, + "loss": 0.796, + "rewards/accuracies": 0.921875, + "rewards/chosen": -0.057336337864398956, + "rewards/margins": 0.33709871768951416, + "rewards/rejected": -0.3944350481033325, + "step": 60 }, { - "epoch": 0.012639950142418883, - "grad_norm": 9.357610702514648, - "learning_rate": 6.31578947368421e-05, - "logits/chosen": -2.793389320373535, - "logits/rejected": -2.793593406677246, - "logps/chosen": -0.744411826133728, - "logps/rejected": -3.2574427127838135, - "loss": 0.781, - "odds_ratio_loss": 2.4383304119110107, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -0.07444119453430176, - "rewards/margins": 0.251303106546402, - "rewards/rejected": -0.32574430108070374, - "sft_loss": 0.5372061133384705, - "step": 120 - }, - { - "epoch": 0.013693279320953789, - "grad_norm": 4.926955699920654, - "learning_rate": 6.842105263157896e-05, - "logits/chosen": -2.829728603363037, - "logits/rejected": -2.829943895339966, - "logps/chosen": -0.5513553619384766, - "logps/rejected": -2.877117872238159, - "loss": 0.5817, - "odds_ratio_loss": 2.001354694366455, + "epoch": 0.01713415463750115, + "grad_norm": 2.609375, + "learning_rate": 8.567415730337079e-06, + "logits/chosen": -7.01167106628418, + "logits/rejected": -7.011828899383545, + "logps/chosen": -0.6211044192314148, + "logps/rejected": -3.431169033050537, + "loss": 0.8257, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06211044266819954, + "rewards/margins": 0.28100642561912537, + "rewards/rejected": -0.3431169092655182, + "step": 61 + }, + { + "epoch": 0.017415042418443796, + "grad_norm": 4.15625, + "learning_rate": 8.707865168539327e-06, + "logits/chosen": -7.013251781463623, + "logits/rejected": -7.013393878936768, + "logps/chosen": -0.6521888971328735, + "logps/rejected": -3.21026873588562, + "loss": 0.8371, + "rewards/accuracies": 0.8671875, + "rewards/chosen": -0.06521890312433243, + "rewards/margins": 0.2558079659938812, + "rewards/rejected": -0.32102692127227783, + "step": 62 + }, + { + "epoch": 0.017695930199386437, + "grad_norm": 1.1796875, + "learning_rate": 8.848314606741573e-06, + "logits/chosen": -7.01942253112793, + "logits/rejected": -7.019570350646973, + "logps/chosen": -0.5423337817192078, + "logps/rejected": -3.554921865463257, + "loss": 0.8147, "rewards/accuracies": 0.875, - "rewards/chosen": -0.055135536938905716, - "rewards/margins": 0.23257622122764587, - "rewards/rejected": -0.2877117693424225, - "sft_loss": 0.38156595826148987, - "step": 130 - }, - { - "epoch": 0.014746608499488697, - "grad_norm": 8.740342140197754, - "learning_rate": 7.368421052631579e-05, - "logits/chosen": -2.9767470359802246, - "logits/rejected": -2.9770071506500244, - "logps/chosen": -0.5641220211982727, - "logps/rejected": -4.819352149963379, - "loss": 0.589, - "odds_ratio_loss": 2.0714950561523438, - "rewards/accuracies": 0.8833333253860474, - "rewards/chosen": -0.05641220510005951, - "rewards/margins": 0.4255230724811554, - "rewards/rejected": -0.48193517327308655, - "sft_loss": 0.3818718194961548, - "step": 140 - }, - { - "epoch": 0.015799937678023603, - "grad_norm": 21.316011428833008, - "learning_rate": 7.894736842105263e-05, - "logits/chosen": -2.9424824714660645, - "logits/rejected": -2.942646026611328, - "logps/chosen": -0.6596536040306091, - "logps/rejected": -5.643968105316162, - "loss": 0.6873, - "odds_ratio_loss": 2.2379026412963867, - "rewards/accuracies": 0.8833333253860474, - "rewards/chosen": -0.06596536934375763, - "rewards/margins": 0.4984314739704132, - "rewards/rejected": -0.5643967986106873, - "sft_loss": 0.46346980333328247, - "step": 150 + "rewards/chosen": -0.054233379662036896, + "rewards/margins": 0.3012588322162628, + "rewards/rejected": -0.3554922044277191, + "step": 63 + }, + { + "epoch": 0.017976817980329077, + "grad_norm": 0.84375, + "learning_rate": 8.98876404494382e-06, + "logits/chosen": -7.020681858062744, + "logits/rejected": -7.020779132843018, + "logps/chosen": -0.6097487807273865, + "logps/rejected": -3.4758450984954834, + "loss": 0.8241, + "rewards/accuracies": 0.8671875, + "rewards/chosen": -0.06097487360239029, + "rewards/margins": 0.28660961985588074, + "rewards/rejected": -0.3475845158100128, + "step": 64 + }, + { + "epoch": 0.01825770576127172, + "grad_norm": 1.0703125, + "learning_rate": 9.129213483146068e-06, + "logits/chosen": -7.021762847900391, + "logits/rejected": -7.022034645080566, + "logps/chosen": -0.6843651533126831, + "logps/rejected": -4.37304162979126, + "loss": 0.7824, + "rewards/accuracies": 0.921875, + "rewards/chosen": -0.06843651831150055, + "rewards/margins": 0.3688676655292511, + "rewards/rejected": -0.43730416893959045, + "step": 65 + }, + { + "epoch": 0.018538593542214363, + "grad_norm": 1.359375, + "learning_rate": 9.269662921348316e-06, + "logits/chosen": -7.02295446395874, + "logits/rejected": -7.023041248321533, + "logps/chosen": -0.6040046215057373, + "logps/rejected": -3.583864688873291, + "loss": 0.8154, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06040046364068985, + "rewards/margins": 0.2979860007762909, + "rewards/rejected": -0.3583865165710449, + "step": 66 + }, + { + "epoch": 0.018819481323157004, + "grad_norm": 1.15625, + "learning_rate": 9.410112359550562e-06, + "logits/chosen": -7.029426097869873, + "logits/rejected": -7.0295729637146, + "logps/chosen": -0.5566189289093018, + "logps/rejected": -3.2764642238616943, + "loss": 0.8283, + "rewards/accuracies": 0.859375, + "rewards/chosen": -0.055661890655756, + "rewards/margins": 0.2719845473766327, + "rewards/rejected": -0.3276464343070984, + "step": 67 + }, + { + "epoch": 0.019100369104099645, + "grad_norm": 1.234375, + "learning_rate": 9.550561797752809e-06, + "logits/chosen": -7.031888961791992, + "logits/rejected": -7.032125473022461, + "logps/chosen": -0.5691148638725281, + "logps/rejected": -3.4460644721984863, + "loss": 0.821, + "rewards/accuracies": 0.890625, + "rewards/chosen": -0.05691148713231087, + "rewards/margins": 0.2876949906349182, + "rewards/rejected": -0.3446064591407776, + "step": 68 + }, + { + "epoch": 0.019381256885042285, + "grad_norm": 1.59375, + "learning_rate": 9.691011235955057e-06, + "logits/chosen": -7.036633014678955, + "logits/rejected": -7.036778926849365, + "logps/chosen": -0.6520639061927795, + "logps/rejected": -3.434640645980835, + "loss": 0.8252, + "rewards/accuracies": 0.8671875, + "rewards/chosen": -0.06520639359951019, + "rewards/margins": 0.27825766801834106, + "rewards/rejected": -0.34346407651901245, + "step": 69 + }, + { + "epoch": 0.01966214466598493, + "grad_norm": 1.046875, + "learning_rate": 9.831460674157303e-06, + "logits/chosen": -7.03872013092041, + "logits/rejected": -7.038879871368408, + "logps/chosen": -0.5827143788337708, + "logps/rejected": -3.6182808876037598, + "loss": 0.8103, + "rewards/accuracies": 0.9296875, + "rewards/chosen": -0.058271437883377075, + "rewards/margins": 0.3035566508769989, + "rewards/rejected": -0.36182811856269836, + "step": 70 }, { - "epoch": 0.01685326685655851, - "grad_norm": 8.542074203491211, - "learning_rate": 8.421052631578948e-05, - "logits/chosen": -2.788573741912842, - "logits/rejected": -2.7887728214263916, - "logps/chosen": -0.5694094300270081, - "logps/rejected": -4.549073219299316, - "loss": 0.5953, - "odds_ratio_loss": 2.08962082862854, - "rewards/accuracies": 0.9083333611488342, - "rewards/chosen": -0.05694093927741051, - "rewards/margins": 0.3979664146900177, - "rewards/rejected": -0.4549073278903961, - "sft_loss": 0.3863413631916046, - "step": 160 - }, - { - "epoch": 0.017906596035093418, - "grad_norm": 10.086228370666504, - "learning_rate": 8.947368421052632e-05, - "logits/chosen": -2.747136354446411, - "logits/rejected": -2.7473561763763428, - "logps/chosen": -0.5511812567710876, - "logps/rejected": -4.04136848449707, - "loss": 0.5864, - "odds_ratio_loss": 2.00999116897583, - "rewards/accuracies": 0.8479166626930237, - "rewards/chosen": -0.0551181361079216, - "rewards/margins": 0.34901875257492065, - "rewards/rejected": -0.40413692593574524, - "sft_loss": 0.3853704035282135, - "step": 170 - }, - { - "epoch": 0.018959925213628322, - "grad_norm": 6.374107837677002, - "learning_rate": 9.473684210526316e-05, - "logits/chosen": -2.6502909660339355, - "logits/rejected": -2.650648593902588, - "logps/chosen": -0.5040014982223511, - "logps/rejected": -4.205493450164795, - "loss": 0.5236, - "odds_ratio_loss": 1.6211795806884766, - "rewards/accuracies": 0.9208333492279053, - "rewards/chosen": -0.050400152802467346, - "rewards/margins": 0.3701492249965668, - "rewards/rejected": -0.4205494225025177, - "sft_loss": 0.3614722490310669, - "step": 180 - }, - { - "epoch": 0.02001325439216323, - "grad_norm": 2.717724323272705, - "learning_rate": 0.0001, - "logits/chosen": -2.7677037715911865, - "logits/rejected": -2.7680699825286865, - "logps/chosen": -0.5834678411483765, - "logps/rejected": -4.945567607879639, - "loss": 0.6101, - "odds_ratio_loss": 2.1636292934417725, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -0.05834679678082466, - "rewards/margins": 0.4362100064754486, - "rewards/rejected": -0.49455681443214417, - "sft_loss": 0.39376381039619446, - "step": 190 + "epoch": 0.01994303244692757, + "grad_norm": 1.0078125, + "learning_rate": 9.97191011235955e-06, + "logits/chosen": -7.044363021850586, + "logits/rejected": -7.044580936431885, + "logps/chosen": -0.6912685036659241, + "logps/rejected": -3.794344425201416, + "loss": 0.8079, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -0.06912684440612793, + "rewards/margins": 0.3103076219558716, + "rewards/rejected": -0.3794344663619995, + "step": 71 + }, + { + "epoch": 0.02022392022787021, + "grad_norm": 3.609375, + "learning_rate": 1.0112359550561798e-05, + "logits/chosen": -7.046081066131592, + "logits/rejected": -7.046180725097656, + "logps/chosen": -0.683323085308075, + "logps/rejected": -3.082217216491699, + "loss": 0.843, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06833230704069138, + "rewards/margins": 0.23988942801952362, + "rewards/rejected": -0.3082217574119568, + "step": 72 + }, + { + "epoch": 0.020504808008812853, + "grad_norm": 1.1484375, + "learning_rate": 1.0252808988764046e-05, + "logits/chosen": -7.047407150268555, + "logits/rejected": -7.047583103179932, + "logps/chosen": -0.6924074292182922, + "logps/rejected": -3.506157159805298, + "loss": 0.8246, + "rewards/accuracies": 0.8046875, + "rewards/chosen": -0.0692407488822937, + "rewards/margins": 0.281374990940094, + "rewards/rejected": -0.3506157100200653, + "step": 73 + }, + { + "epoch": 0.020785695789755497, + "grad_norm": 74.5, + "learning_rate": 1.0393258426966292e-05, + "logits/chosen": -7.055050373077393, + "logits/rejected": -7.055255889892578, + "logps/chosen": -0.4969649910926819, + "logps/rejected": -3.5049824714660645, + "loss": 0.8142, + "rewards/accuracies": 0.8984375, + "rewards/chosen": -0.04969649761915207, + "rewards/margins": 0.30080172419548035, + "rewards/rejected": -0.350498229265213, + "step": 74 }, { "epoch": 0.021066583570698138, - "grad_norm": 6.805058479309082, - "learning_rate": 0.00010526315789473683, - "logits/chosen": -2.722885847091675, - "logits/rejected": -2.723130226135254, - "logps/chosen": -0.6606806516647339, - "logps/rejected": -4.050003528594971, - "loss": 0.6941, - "odds_ratio_loss": 2.383312940597534, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -0.06606806069612503, - "rewards/margins": 0.3389323055744171, - "rewards/rejected": -0.4050002992153168, - "sft_loss": 0.4557226896286011, - "step": 200 - }, - { - "epoch": 0.022119912749233046, - "grad_norm": 5.377903938293457, - "learning_rate": 0.00011052631578947368, - "logits/chosen": -2.777975559234619, - "logits/rejected": -2.778069257736206, - "logps/chosen": -0.663443922996521, - "logps/rejected": -3.8639333248138428, - "loss": 0.69, - "odds_ratio_loss": 1.9626834392547607, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -0.06634439527988434, - "rewards/margins": 0.3200489580631256, - "rewards/rejected": -0.38639336824417114, - "sft_loss": 0.4936945140361786, - "step": 210 - }, - { - "epoch": 0.02317324192776795, - "grad_norm": 11.14415454864502, - "learning_rate": 0.00011578947368421053, - "logits/chosen": -2.69752836227417, - "logits/rejected": -2.6975817680358887, - "logps/chosen": -0.7092010378837585, - "logps/rejected": -3.6659300327301025, - "loss": 0.7441, - "odds_ratio_loss": 2.298079013824463, - "rewards/accuracies": 0.8354166746139526, - "rewards/chosen": -0.07092010229825974, - "rewards/margins": 0.29567286372184753, - "rewards/rejected": -0.36659297347068787, - "sft_loss": 0.5143173933029175, - "step": 220 - }, - { - "epoch": 0.024226571106302858, - "grad_norm": 17.70037269592285, - "learning_rate": 0.00012105263157894738, - "logits/chosen": -2.8756678104400635, - "logits/rejected": -2.875657796859741, - "logps/chosen": -0.639348566532135, - "logps/rejected": -4.930140018463135, - "loss": 0.6653, - "odds_ratio_loss": 2.042109489440918, - "rewards/accuracies": 0.8958333134651184, - "rewards/chosen": -0.06393485516309738, - "rewards/margins": 0.42907920479774475, - "rewards/rejected": -0.49301406741142273, - "sft_loss": 0.4610413908958435, - "step": 230 + "grad_norm": 1.3515625, + "learning_rate": 1.0533707865168539e-05, + "logits/chosen": -7.057081699371338, + "logits/rejected": -7.057204723358154, + "logps/chosen": -0.6645674705505371, + "logps/rejected": -3.872537136077881, + "loss": 0.8049, + "rewards/accuracies": 0.890625, + "rewards/chosen": -0.06645674258470535, + "rewards/margins": 0.32079699635505676, + "rewards/rejected": -0.38725370168685913, + "step": 75 + }, + { + "epoch": 0.02134747135164078, + "grad_norm": 0.984375, + "learning_rate": 1.0674157303370787e-05, + "logits/chosen": -7.062127113342285, + "logits/rejected": -7.0622992515563965, + "logps/chosen": -0.683305561542511, + "logps/rejected": -3.850494861602783, + "loss": 0.8067, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0683305561542511, + "rewards/margins": 0.3167189657688141, + "rewards/rejected": -0.3850494921207428, + "step": 76 + }, + { + "epoch": 0.021628359132583423, + "grad_norm": 0.7890625, + "learning_rate": 1.0814606741573033e-05, + "logits/chosen": -7.0633769035339355, + "logits/rejected": -7.0634965896606445, + "logps/chosen": -0.6324968338012695, + "logps/rejected": -3.4045989513397217, + "loss": 0.8247, + "rewards/accuracies": 0.890625, + "rewards/chosen": -0.06324967741966248, + "rewards/margins": 0.27721020579338074, + "rewards/rejected": -0.3404598832130432, + "step": 77 + }, + { + "epoch": 0.021909246913526064, + "grad_norm": 1.1875, + "learning_rate": 1.0955056179775282e-05, + "logits/chosen": -7.069698333740234, + "logits/rejected": -7.0697784423828125, + "logps/chosen": -0.6205027103424072, + "logps/rejected": -3.5221495628356934, + "loss": 0.8222, + "rewards/accuracies": 0.8515625, + "rewards/chosen": -0.06205027177929878, + "rewards/margins": 0.2901647090911865, + "rewards/rejected": -0.3522149622440338, + "step": 78 + }, + { + "epoch": 0.022190134694468705, + "grad_norm": 1.2109375, + "learning_rate": 1.1095505617977528e-05, + "logits/chosen": -7.070184707641602, + "logits/rejected": -7.070287227630615, + "logps/chosen": -0.5837754607200623, + "logps/rejected": -3.566255807876587, + "loss": 0.8126, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.058377549052238464, + "rewards/margins": 0.2982480823993683, + "rewards/rejected": -0.35662561655044556, + "step": 79 + }, + { + "epoch": 0.022471022475411346, + "grad_norm": 1.5859375, + "learning_rate": 1.1235955056179776e-05, + "logits/chosen": -7.073416709899902, + "logits/rejected": -7.073558807373047, + "logps/chosen": -0.6057207584381104, + "logps/rejected": -3.7733519077301025, + "loss": 0.8087, + "rewards/accuracies": 0.8515625, + "rewards/chosen": -0.060572076588869095, + "rewards/margins": 0.31676313281059265, + "rewards/rejected": -0.37733519077301025, + "step": 80 }, { - "epoch": 0.025279900284837765, - "grad_norm": 22.880346298217773, - "learning_rate": 0.0001263157894736842, - "logits/chosen": -3.3157336711883545, - "logits/rejected": -3.3157567977905273, - "logps/chosen": -1.2456268072128296, - "logps/rejected": -5.491827964782715, - "loss": 1.3155, - "odds_ratio_loss": 4.847614765167236, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -0.12456268817186356, - "rewards/margins": 0.4246201515197754, - "rewards/rejected": -0.5491827726364136, - "sft_loss": 0.8306990265846252, - "step": 240 - }, - { - "epoch": 0.026333229463372673, - "grad_norm": 5.729049205780029, - "learning_rate": 0.00013157894736842105, - "logits/chosen": -2.5473344326019287, - "logits/rejected": -2.5469789505004883, - "logps/chosen": -0.749646008014679, - "logps/rejected": -6.155911445617676, - "loss": 0.7811, - "odds_ratio_loss": 2.4923181533813477, - "rewards/accuracies": 0.8645833134651184, - "rewards/chosen": -0.07496459782123566, - "rewards/margins": 0.5406264662742615, - "rewards/rejected": -0.6155910491943359, - "sft_loss": 0.5319061875343323, - "step": 250 - }, - { - "epoch": 0.027386558641907578, - "grad_norm": 11.868535995483398, - "learning_rate": 0.00013684210526315792, - "logits/chosen": -2.9207849502563477, - "logits/rejected": -2.9205822944641113, - "logps/chosen": -0.8412100076675415, - "logps/rejected": -5.425389289855957, - "loss": 0.8748, - "odds_ratio_loss": 3.032189130783081, - "rewards/accuracies": 0.8479166626930237, - "rewards/chosen": -0.08412099629640579, - "rewards/margins": 0.4584178924560547, - "rewards/rejected": -0.5425389409065247, - "sft_loss": 0.5715639591217041, - "step": 260 - }, - { - "epoch": 0.028439887820442485, - "grad_norm": 9.387285232543945, - "learning_rate": 0.00014210526315789474, - "logits/chosen": -3.1293420791625977, - "logits/rejected": -3.129204273223877, - "logps/chosen": -0.7858380079269409, - "logps/rejected": -4.375970363616943, - "loss": 0.8281, - "odds_ratio_loss": 2.820624351501465, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -0.07858379930257797, - "rewards/margins": 0.35901322960853577, - "rewards/rejected": -0.43759700655937195, - "sft_loss": 0.5460221171379089, - "step": 270 - }, - { - "epoch": 0.029493216998977393, - "grad_norm": 5.87777042388916, - "learning_rate": 0.00014736842105263158, - "logits/chosen": -2.9154765605926514, - "logits/rejected": -2.91546368598938, - "logps/chosen": -0.595488965511322, - "logps/rejected": -3.7737627029418945, - "loss": 0.6283, - "odds_ratio_loss": 2.1527862548828125, - "rewards/accuracies": 0.8604166507720947, - "rewards/chosen": -0.05954889953136444, - "rewards/margins": 0.31782734394073486, - "rewards/rejected": -0.3773762583732605, - "sft_loss": 0.4130483567714691, - "step": 280 - }, - { - "epoch": 0.0305465461775123, - "grad_norm": 10.7889986038208, - "learning_rate": 0.00015263157894736842, - "logits/chosen": -3.0789036750793457, - "logits/rejected": -3.079068899154663, - "logps/chosen": -0.5851417779922485, - "logps/rejected": -3.892369031906128, - "loss": 0.6135, - "odds_ratio_loss": 2.1990480422973633, - "rewards/accuracies": 0.8791666626930237, - "rewards/chosen": -0.05851416662335396, - "rewards/margins": 0.33072274923324585, - "rewards/rejected": -0.3892369568347931, - "sft_loss": 0.3936450183391571, - "step": 290 - }, - { - "epoch": 0.031599875356047205, - "grad_norm": 10.757394790649414, - "learning_rate": 0.00015789473684210527, - "logits/chosen": -3.1641581058502197, - "logits/rejected": -3.1641595363616943, - "logps/chosen": -0.625824511051178, - "logps/rejected": -4.4615254402160645, - "loss": 0.6551, - "odds_ratio_loss": 2.074664354324341, - "rewards/accuracies": 0.8708333373069763, - "rewards/chosen": -0.06258244812488556, - "rewards/margins": 0.3835701644420624, - "rewards/rejected": -0.44615259766578674, - "sft_loss": 0.44765299558639526, - "step": 300 - }, - { - "epoch": 0.03265320453458211, - "grad_norm": 8.260001182556152, - "learning_rate": 0.0001631578947368421, - "logits/chosen": -3.1696364879608154, - "logits/rejected": -3.169647693634033, - "logps/chosen": -0.6334646940231323, - "logps/rejected": -4.125209331512451, - "loss": 0.6653, - "odds_ratio_loss": 2.147244691848755, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -0.06334646791219711, - "rewards/margins": 0.34917446970939636, - "rewards/rejected": -0.4125209450721741, - "sft_loss": 0.45062482357025146, - "step": 310 - }, - { - "epoch": 0.03370653371311702, - "grad_norm": 17.23076057434082, - "learning_rate": 0.00016842105263157895, - "logits/chosen": -3.175907611846924, - "logits/rejected": -3.175673007965088, - "logps/chosen": -0.8329946994781494, - "logps/rejected": -4.718179702758789, - "loss": 0.8649, - "odds_ratio_loss": 2.6120009422302246, - "rewards/accuracies": 0.8729166388511658, - "rewards/chosen": -0.08329946547746658, - "rewards/margins": 0.3885185122489929, - "rewards/rejected": -0.4718180298805237, - "sft_loss": 0.6036695837974548, - "step": 320 - }, - { - "epoch": 0.03475986289165193, - "grad_norm": 8.57013988494873, - "learning_rate": 0.0001736842105263158, - "logits/chosen": -2.885397434234619, - "logits/rejected": -2.8853094577789307, - "logps/chosen": -0.7634103298187256, - "logps/rejected": -3.892472982406616, - "loss": 0.7919, - "odds_ratio_loss": 2.4835994243621826, - "rewards/accuracies": 0.8770833611488342, - "rewards/chosen": -0.07634103298187256, - "rewards/margins": 0.31290626525878906, - "rewards/rejected": -0.389247328042984, - "sft_loss": 0.5435259938240051, - "step": 330 - }, - { - "epoch": 0.035813192070186836, - "grad_norm": 6.161098957061768, - "learning_rate": 0.00017894736842105264, - "logits/chosen": -2.7474565505981445, - "logits/rejected": -2.747159004211426, - "logps/chosen": -0.6635507345199585, - "logps/rejected": -4.673018455505371, - "loss": 0.6934, - "odds_ratio_loss": 2.4976134300231934, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -0.06635507941246033, - "rewards/margins": 0.4009467363357544, - "rewards/rejected": -0.4673018753528595, - "sft_loss": 0.44364967942237854, - "step": 340 - }, - { - "epoch": 0.036866521248721744, - "grad_norm": 6.6998372077941895, - "learning_rate": 0.00018421052631578948, - "logits/chosen": -3.3279902935028076, - "logits/rejected": -3.327728509902954, - "logps/chosen": -0.7464654445648193, - "logps/rejected": -4.424903869628906, - "loss": 0.7811, - "odds_ratio_loss": 2.4744033813476562, - "rewards/accuracies": 0.8270833492279053, - "rewards/chosen": -0.07464654743671417, - "rewards/margins": 0.3678438365459442, - "rewards/rejected": -0.4424903988838196, - "sft_loss": 0.5336239337921143, - "step": 350 - }, - { - "epoch": 0.037919850427256645, - "grad_norm": 4.3592400550842285, - "learning_rate": 0.00018947368421052632, - "logits/chosen": -3.632689952850342, - "logits/rejected": -3.6323180198669434, - "logps/chosen": -0.6197668313980103, - "logps/rejected": -5.021815299987793, - "loss": 0.647, - "odds_ratio_loss": 2.0902163982391357, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -0.061976686120033264, - "rewards/margins": 0.44020482897758484, - "rewards/rejected": -0.5021815299987793, - "sft_loss": 0.43794170022010803, - "step": 360 - }, - { - "epoch": 0.03897317960579155, - "grad_norm": 9.133977890014648, - "learning_rate": 0.00019473684210526317, - "logits/chosen": -3.6677591800689697, - "logits/rejected": -3.667369842529297, - "logps/chosen": -0.6474730372428894, - "logps/rejected": -5.439915180206299, - "loss": 0.6779, - "odds_ratio_loss": 2.1945674419403076, - "rewards/accuracies": 0.8666666746139526, - "rewards/chosen": -0.06474730372428894, - "rewards/margins": 0.4792442321777344, - "rewards/rejected": -0.5439915060997009, - "sft_loss": 0.45841965079307556, - "step": 370 - }, - { - "epoch": 0.04002650878432646, - "grad_norm": 9.33304214477539, - "learning_rate": 0.0002, - "logits/chosen": -3.5276687145233154, - "logits/rejected": -3.527397632598877, - "logps/chosen": -0.6519566178321838, - "logps/rejected": -4.4450178146362305, - "loss": 0.6857, - "odds_ratio_loss": 2.411801338195801, - "rewards/accuracies": 0.84375, - "rewards/chosen": -0.06519566476345062, - "rewards/margins": 0.3793061375617981, - "rewards/rejected": -0.4445018172264099, - "sft_loss": 0.44453203678131104, - "step": 380 - }, - { - "epoch": 0.04107983796286137, - "grad_norm": 8.269370079040527, - "learning_rate": 0.00020526315789473685, - "logits/chosen": -3.345468282699585, - "logits/rejected": -3.3452601432800293, - "logps/chosen": -0.7482808232307434, - "logps/rejected": -4.133052349090576, - "loss": 0.7849, - "odds_ratio_loss": 2.473043918609619, - "rewards/accuracies": 0.8395833373069763, - "rewards/chosen": -0.07482809573411942, - "rewards/margins": 0.33847716450691223, - "rewards/rejected": -0.41330528259277344, - "sft_loss": 0.5376084446907043, - "step": 390 - }, - { - "epoch": 0.042133167141396276, - "grad_norm": 3.1917130947113037, - "learning_rate": 0.00021052631578947367, - "logits/chosen": -3.498554229736328, - "logits/rejected": -3.4982807636260986, - "logps/chosen": -0.6910140514373779, - "logps/rejected": -4.305008888244629, - "loss": 0.7229, - "odds_ratio_loss": 2.5834543704986572, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -0.06910141557455063, - "rewards/margins": 0.36139950156211853, - "rewards/rejected": -0.43050095438957214, - "sft_loss": 0.46452444791793823, - "step": 400 - }, - { - "epoch": 0.043186496319931184, - "grad_norm": 8.981714248657227, - "learning_rate": 0.00021578947368421054, - "logits/chosen": -3.482508420944214, - "logits/rejected": -3.482311725616455, - "logps/chosen": -0.666519045829773, - "logps/rejected": -3.838574171066284, - "loss": 0.7004, - "odds_ratio_loss": 2.59932017326355, - "rewards/accuracies": 0.84375, - "rewards/chosen": -0.06665190309286118, - "rewards/margins": 0.3172055184841156, - "rewards/rejected": -0.3838574290275574, - "sft_loss": 0.4405144453048706, - "step": 410 - }, - { - "epoch": 0.04423982549846609, - "grad_norm": 5.946087837219238, - "learning_rate": 0.00022105263157894735, - "logits/chosen": -3.5680463314056396, - "logits/rejected": -3.5679588317871094, - "logps/chosen": -0.6861178874969482, - "logps/rejected": -3.294382333755493, - "loss": 0.7209, - "odds_ratio_loss": 2.4581611156463623, - "rewards/accuracies": 0.8520833253860474, - "rewards/chosen": -0.06861178576946259, - "rewards/margins": 0.26082643866539, - "rewards/rejected": -0.3294382095336914, - "sft_loss": 0.4750979244709015, - "step": 420 - }, - { - "epoch": 0.045293154677001, - "grad_norm": 16.975387573242188, - "learning_rate": 0.00022631578947368422, - "logits/chosen": -3.8716492652893066, - "logits/rejected": -3.871539831161499, - "logps/chosen": -0.7186715602874756, - "logps/rejected": -3.2046849727630615, - "loss": 0.7559, - "odds_ratio_loss": 2.6692261695861816, - "rewards/accuracies": 0.8458333611488342, - "rewards/chosen": -0.07186715304851532, - "rewards/margins": 0.24860134720802307, - "rewards/rejected": -0.3204684853553772, - "sft_loss": 0.48894843459129333, - "step": 430 - }, - { - "epoch": 0.0463464838555359, - "grad_norm": 3.843916416168213, - "learning_rate": 0.00023157894736842107, - "logits/chosen": -3.794214963912964, - "logits/rejected": -3.794062852859497, - "logps/chosen": -0.6966003179550171, - "logps/rejected": -3.6082844734191895, - "loss": 0.7316, - "odds_ratio_loss": 2.6561334133148193, - "rewards/accuracies": 0.8479166626930237, - "rewards/chosen": -0.06966003775596619, - "rewards/margins": 0.2911684215068817, - "rewards/rejected": -0.3608284592628479, - "sft_loss": 0.4659655690193176, - "step": 440 - }, - { - "epoch": 0.04739981303407081, - "grad_norm": 14.617210388183594, - "learning_rate": 0.00023684210526315788, - "logits/chosen": -3.84993052482605, - "logits/rejected": -3.8500382900238037, - "logps/chosen": -0.7132828831672668, - "logps/rejected": -3.116370916366577, - "loss": 0.7449, - "odds_ratio_loss": 2.349879264831543, - "rewards/accuracies": 0.8583333492279053, - "rewards/chosen": -0.0713282972574234, - "rewards/margins": 0.24030880630016327, - "rewards/rejected": -0.31163710355758667, - "sft_loss": 0.5099204182624817, - "step": 450 - }, - { - "epoch": 0.048453142212605715, - "grad_norm": 15.630524635314941, - "learning_rate": 0.00024210526315789475, - "logits/chosen": -4.3313679695129395, - "logits/rejected": -4.331648349761963, - "logps/chosen": -0.7833544611930847, - "logps/rejected": -2.8526246547698975, - "loss": 0.8191, - "odds_ratio_loss": 2.5849623680114746, - "rewards/accuracies": 0.8354166746139526, - "rewards/chosen": -0.07833544164896011, - "rewards/margins": 0.20692706108093262, - "rewards/rejected": -0.28526249527931213, - "sft_loss": 0.5606356263160706, - "step": 460 - }, - { - "epoch": 0.04950647139114062, - "grad_norm": 4.825496196746826, - "learning_rate": 0.0002473684210526316, - "logits/chosen": -4.020305156707764, - "logits/rejected": -4.020514965057373, - "logps/chosen": -0.7084909677505493, - "logps/rejected": -2.901973009109497, - "loss": 0.745, - "odds_ratio_loss": 2.5733158588409424, - "rewards/accuracies": 0.8479166626930237, - "rewards/chosen": -0.07084909081459045, - "rewards/margins": 0.21934820711612701, - "rewards/rejected": -0.29019731283187866, - "sft_loss": 0.48766499757766724, - "step": 470 - }, - { - "epoch": 0.05055980056967553, - "grad_norm": 6.267645835876465, - "learning_rate": 0.0002526315789473684, - "logits/chosen": -3.936088800430298, - "logits/rejected": -3.936236619949341, - "logps/chosen": -0.7358769774436951, - "logps/rejected": -2.6652441024780273, - "loss": 0.7689, - "odds_ratio_loss": 2.5294764041900635, - "rewards/accuracies": 0.8541666865348816, - "rewards/chosen": -0.07358769327402115, - "rewards/margins": 0.1929367184638977, - "rewards/rejected": -0.26652440428733826, - "sft_loss": 0.5159851312637329, - "step": 480 - }, - { - "epoch": 0.05161312974821044, - "grad_norm": 7.438229084014893, - "learning_rate": 0.0002578947368421053, - "logits/chosen": -4.008545875549316, - "logits/rejected": -4.008641242980957, - "logps/chosen": -0.7306921482086182, - "logps/rejected": -2.7273244857788086, - "loss": 0.7645, - "odds_ratio_loss": 2.6694443225860596, - "rewards/accuracies": 0.8291666507720947, - "rewards/chosen": -0.07306921482086182, - "rewards/margins": 0.1996632218360901, - "rewards/rejected": -0.2727324366569519, - "sft_loss": 0.49753716588020325, - "step": 490 - }, - { - "epoch": 0.052666458926745346, - "grad_norm": 5.6936469078063965, - "learning_rate": 0.0002631578947368421, - "logits/chosen": -3.8969054222106934, - "logits/rejected": -3.896923303604126, - "logps/chosen": -0.7155380249023438, - "logps/rejected": -3.1710591316223145, - "loss": 0.7467, - "odds_ratio_loss": 2.616429567337036, - "rewards/accuracies": 0.8729166388511658, - "rewards/chosen": -0.07155381143093109, - "rewards/margins": 0.24555210769176483, - "rewards/rejected": -0.3171059191226959, - "sft_loss": 0.48508700728416443, - "step": 500 - }, - { - "epoch": 0.053719788105280254, - "grad_norm": 4.272115230560303, - "learning_rate": 0.00026842105263157897, - "logits/chosen": -3.8689732551574707, - "logits/rejected": -3.868974208831787, - "logps/chosen": -0.6541014313697815, - "logps/rejected": -3.1265270709991455, - "loss": 0.6821, - "odds_ratio_loss": 2.2681777477264404, - "rewards/accuracies": 0.8854166865348816, - "rewards/chosen": -0.06541014462709427, - "rewards/margins": 0.2472425401210785, - "rewards/rejected": -0.31265270709991455, - "sft_loss": 0.4552646279335022, - "step": 510 - }, - { - "epoch": 0.054773117283815155, - "grad_norm": 5.510837078094482, - "learning_rate": 0.00027368421052631584, - "logits/chosen": -3.8620245456695557, - "logits/rejected": -3.862044334411621, - "logps/chosen": -0.6386537551879883, - "logps/rejected": -3.322967767715454, - "loss": 0.6712, - "odds_ratio_loss": 2.311323881149292, - "rewards/accuracies": 0.8583333492279053, - "rewards/chosen": -0.06386537849903107, - "rewards/margins": 0.2684313654899597, - "rewards/rejected": -0.33229681849479675, - "sft_loss": 0.4400910437107086, - "step": 520 - }, - { - "epoch": 0.05582644646235006, - "grad_norm": 12.38877010345459, - "learning_rate": 0.0002789473684210526, - "logits/chosen": -4.60584020614624, - "logits/rejected": -4.6057939529418945, - "logps/chosen": -0.7113536596298218, - "logps/rejected": -3.330021381378174, - "loss": 0.7496, - "odds_ratio_loss": 2.427239179611206, - "rewards/accuracies": 0.8354166746139526, - "rewards/chosen": -0.07113537192344666, - "rewards/margins": 0.2618667781352997, - "rewards/rejected": -0.33300215005874634, - "sft_loss": 0.5068832635879517, - "step": 530 - }, - { - "epoch": 0.05687977564088497, - "grad_norm": 2.2653727531433105, - "learning_rate": 0.00028421052631578947, - "logits/chosen": -5.099688529968262, - "logits/rejected": -5.09957218170166, - "logps/chosen": -0.6874160170555115, - "logps/rejected": -3.100078582763672, - "loss": 0.7231, - "odds_ratio_loss": 2.652129650115967, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.06874160468578339, - "rewards/margins": 0.24126628041267395, - "rewards/rejected": -0.31000787019729614, - "sft_loss": 0.45785781741142273, - "step": 540 - }, - { - "epoch": 0.05793310481941988, - "grad_norm": 6.484382152557373, - "learning_rate": 0.00028947368421052634, - "logits/chosen": -4.181884288787842, - "logits/rejected": -4.181893348693848, - "logps/chosen": -0.7184228897094727, - "logps/rejected": -3.2252790927886963, - "loss": 0.7563, - "odds_ratio_loss": 2.4872865676879883, - "rewards/accuracies": 0.8458333611488342, - "rewards/chosen": -0.07184228301048279, - "rewards/margins": 0.25068560242652893, - "rewards/rejected": -0.3225278854370117, - "sft_loss": 0.5075890421867371, - "step": 550 - }, - { - "epoch": 0.058986433997954786, - "grad_norm": 6.237575531005859, - "learning_rate": 0.00029473684210526316, - "logits/chosen": -4.042048931121826, - "logits/rejected": -4.042147159576416, - "logps/chosen": -0.6820612549781799, - "logps/rejected": -2.6241307258605957, - "loss": 0.7178, - "odds_ratio_loss": 2.6058013439178467, - "rewards/accuracies": 0.8604166507720947, - "rewards/chosen": -0.06820613890886307, - "rewards/margins": 0.19420695304870605, - "rewards/rejected": -0.26241305470466614, - "sft_loss": 0.45718762278556824, - "step": 560 - }, - { - "epoch": 0.060039763176489694, - "grad_norm": 5.729897499084473, - "learning_rate": 0.0003, - "logits/chosen": -3.9665284156799316, - "logits/rejected": -3.966668128967285, - "logps/chosen": -0.7161160111427307, - "logps/rejected": -2.8060250282287598, - "loss": 0.7529, - "odds_ratio_loss": 2.4470136165618896, - "rewards/accuracies": 0.8520833253860474, - "rewards/chosen": -0.07161159813404083, - "rewards/margins": 0.20899087190628052, - "rewards/rejected": -0.28060245513916016, - "sft_loss": 0.5082017779350281, - "step": 570 - }, - { - "epoch": 0.0610930923550246, - "grad_norm": 5.065602779388428, - "learning_rate": 0.00030526315789473684, - "logits/chosen": -3.9091081619262695, - "logits/rejected": -3.9092376232147217, - "logps/chosen": -0.6755971908569336, - "logps/rejected": -2.8741674423217773, - "loss": 0.7078, - "odds_ratio_loss": 2.4316818714141846, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -0.06755972653627396, - "rewards/margins": 0.21985705196857452, - "rewards/rejected": -0.2874167859554291, - "sft_loss": 0.4646414816379547, - "step": 580 - }, - { - "epoch": 0.06214642153355951, - "grad_norm": 2.45158314704895, - "learning_rate": 0.0003105263157894737, - "logits/chosen": -3.9886550903320312, - "logits/rejected": -3.9887642860412598, - "logps/chosen": -0.649567186832428, - "logps/rejected": -3.0265567302703857, - "loss": 0.6791, - "odds_ratio_loss": 2.3147711753845215, - "rewards/accuracies": 0.8729166388511658, - "rewards/chosen": -0.06495673209428787, - "rewards/margins": 0.2376989722251892, - "rewards/rejected": -0.3026556670665741, - "sft_loss": 0.4476209580898285, - "step": 590 - }, - { - "epoch": 0.06319975071209441, - "grad_norm": 15.312357902526855, - "learning_rate": 0.00031578947368421053, - "logits/chosen": -3.9752919673919678, - "logits/rejected": -3.9754388332366943, - "logps/chosen": -0.696826159954071, - "logps/rejected": -3.0343518257141113, - "loss": 0.732, - "odds_ratio_loss": 2.443164587020874, - "rewards/accuracies": 0.8479166626930237, - "rewards/chosen": -0.06968262046575546, - "rewards/margins": 0.23375259339809418, - "rewards/rejected": -0.30343523621559143, - "sft_loss": 0.48767518997192383, - "step": 600 - }, - { - "epoch": 0.06425307989062932, - "grad_norm": 9.758230209350586, - "learning_rate": 0.0003210526315789474, - "logits/chosen": -3.76411509513855, - "logits/rejected": -3.763446092605591, - "logps/chosen": -0.7242849469184875, - "logps/rejected": -5.800142288208008, - "loss": 0.7592, - "odds_ratio_loss": 2.657700300216675, - "rewards/accuracies": 0.8458333611488342, - "rewards/chosen": -0.07242848724126816, - "rewards/margins": 0.5075857639312744, - "rewards/rejected": -0.5800142884254456, - "sft_loss": 0.49342209100723267, - "step": 610 - }, - { - "epoch": 0.06530640906916423, - "grad_norm": 7.555414199829102, - "learning_rate": 0.0003263157894736842, - "logits/chosen": -4.165302753448486, - "logits/rejected": -4.1650519371032715, - "logps/chosen": -0.7384843230247498, - "logps/rejected": -3.710164785385132, - "loss": 0.7768, - "odds_ratio_loss": 2.5719528198242188, - "rewards/accuracies": 0.8520833253860474, - "rewards/chosen": -0.0738484337925911, - "rewards/margins": 0.297168105840683, - "rewards/rejected": -0.3710165023803711, - "sft_loss": 0.5196101665496826, - "step": 620 - }, - { - "epoch": 0.06635973824769914, - "grad_norm": 4.273881435394287, - "learning_rate": 0.00033157894736842103, - "logits/chosen": -4.187811374664307, - "logits/rejected": -4.186800479888916, - "logps/chosen": -0.6481006145477295, - "logps/rejected": -5.178854942321777, - "loss": 0.6803, - "odds_ratio_loss": 2.3059561252593994, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -0.06481005996465683, - "rewards/margins": 0.45307546854019165, - "rewards/rejected": -0.5178855061531067, - "sft_loss": 0.4497505724430084, - "step": 630 - }, - { - "epoch": 0.06741306742623404, - "grad_norm": 6.665101528167725, - "learning_rate": 0.0003368421052631579, - "logits/chosen": -4.168524265289307, - "logits/rejected": -4.1669230461120605, - "logps/chosen": -0.658748984336853, - "logps/rejected": -6.14946174621582, - "loss": 0.6936, - "odds_ratio_loss": 2.615469455718994, - "rewards/accuracies": 0.8520833253860474, - "rewards/chosen": -0.06587490439414978, - "rewards/margins": 0.5490713715553284, - "rewards/rejected": -0.614946186542511, - "sft_loss": 0.432014137506485, - "step": 640 - }, - { - "epoch": 0.06846639660476894, - "grad_norm": 5.859743118286133, - "learning_rate": 0.00034210526315789477, - "logits/chosen": -4.12244176864624, - "logits/rejected": -4.120962619781494, - "logps/chosen": -0.703795850276947, - "logps/rejected": -5.927857875823975, - "loss": 0.739, - "odds_ratio_loss": 2.5700552463531494, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -0.0703795999288559, - "rewards/margins": 0.5224061608314514, - "rewards/rejected": -0.5927857756614685, - "sft_loss": 0.48198458552360535, - "step": 650 - }, - { - "epoch": 0.06951972578330386, - "grad_norm": 3.937659502029419, - "learning_rate": 0.0003473684210526316, - "logits/chosen": -4.144687175750732, - "logits/rejected": -4.143020153045654, - "logps/chosen": -0.6716140508651733, - "logps/rejected": -6.169389247894287, - "loss": 0.704, - "odds_ratio_loss": 2.5956499576568604, - "rewards/accuracies": 0.8583333492279053, - "rewards/chosen": -0.06716141104698181, - "rewards/margins": 0.549777626991272, - "rewards/rejected": -0.6169389486312866, - "sft_loss": 0.44441157579421997, - "step": 660 - }, - { - "epoch": 0.07057305496183876, - "grad_norm": 4.0990681648254395, - "learning_rate": 0.0003526315789473684, - "logits/chosen": -4.208241939544678, - "logits/rejected": -4.206976413726807, - "logps/chosen": -0.6446244120597839, - "logps/rejected": -5.427404403686523, - "loss": 0.6791, - "odds_ratio_loss": 2.412100315093994, - "rewards/accuracies": 0.8520833253860474, - "rewards/chosen": -0.06446244567632675, - "rewards/margins": 0.4782780110836029, - "rewards/rejected": -0.5427404642105103, - "sft_loss": 0.4378568232059479, - "step": 670 - }, - { - "epoch": 0.07162638414037367, - "grad_norm": 4.258831977844238, - "learning_rate": 0.0003578947368421053, - "logits/chosen": -4.341937065124512, - "logits/rejected": -4.341104984283447, - "logps/chosen": -0.7450679540634155, - "logps/rejected": -4.367857933044434, - "loss": 0.7874, - "odds_ratio_loss": 2.708036184310913, - "rewards/accuracies": 0.8083333373069763, - "rewards/chosen": -0.07450678944587708, - "rewards/margins": 0.3622789978981018, - "rewards/rejected": -0.4367857873439789, - "sft_loss": 0.5165507793426514, - "step": 680 - }, - { - "epoch": 0.07267971331890857, - "grad_norm": 10.723002433776855, - "learning_rate": 0.00036315789473684214, - "logits/chosen": -4.344449996948242, - "logits/rejected": -4.344136714935303, - "logps/chosen": -0.8118324279785156, - "logps/rejected": -3.4473140239715576, - "loss": 0.852, - "odds_ratio_loss": 2.871811628341675, - "rewards/accuracies": 0.8104166388511658, - "rewards/chosen": -0.08118324726819992, - "rewards/margins": 0.26354819536209106, - "rewards/rejected": -0.3447313904762268, - "sft_loss": 0.5648209452629089, - "step": 690 - }, - { - "epoch": 0.07373304249744349, - "grad_norm": 5.821114540100098, - "learning_rate": 0.00036842105263157896, - "logits/chosen": -4.07045316696167, - "logits/rejected": -4.069707870483398, - "logps/chosen": -0.8850536942481995, - "logps/rejected": -5.450161933898926, - "loss": 0.9181, - "odds_ratio_loss": 3.165844678878784, - "rewards/accuracies": 0.8416666388511658, - "rewards/chosen": -0.08850537240505219, - "rewards/margins": 0.4565107524394989, - "rewards/rejected": -0.5450161695480347, - "sft_loss": 0.6015486121177673, - "step": 700 - }, - { - "epoch": 0.07478637167597839, - "grad_norm": 2.219165563583374, - "learning_rate": 0.0003736842105263158, - "logits/chosen": -3.7920498847961426, - "logits/rejected": -3.7913458347320557, - "logps/chosen": -0.7324831485748291, - "logps/rejected": -4.996405601501465, - "loss": 0.7707, - "odds_ratio_loss": 2.7621920108795166, - "rewards/accuracies": 0.8041666746139526, - "rewards/chosen": -0.07324830442667007, - "rewards/margins": 0.42639225721359253, - "rewards/rejected": -0.499640554189682, - "sft_loss": 0.49450069665908813, - "step": 710 - }, - { - "epoch": 0.07583970085451329, - "grad_norm": 5.435701370239258, - "learning_rate": 0.00037894736842105265, - "logits/chosen": -4.781533718109131, - "logits/rejected": -4.781356334686279, - "logps/chosen": -0.6917392611503601, - "logps/rejected": -4.243617057800293, - "loss": 0.7244, - "odds_ratio_loss": 2.5724165439605713, - "rewards/accuracies": 0.8645833134651184, - "rewards/chosen": -0.06917393207550049, - "rewards/margins": 0.3551878333091736, - "rewards/rejected": -0.4243617355823517, - "sft_loss": 0.4671470522880554, - "step": 720 - }, - { - "epoch": 0.0768930300330482, - "grad_norm": 4.722170352935791, - "learning_rate": 0.00038421052631578946, - "logits/chosen": -4.822556495666504, - "logits/rejected": -4.822704792022705, - "logps/chosen": -0.6918298006057739, - "logps/rejected": -3.5728962421417236, - "loss": 0.7267, - "odds_ratio_loss": 2.5833892822265625, - "rewards/accuracies": 0.8416666388511658, - "rewards/chosen": -0.06918298453092575, - "rewards/margins": 0.28810662031173706, - "rewards/rejected": -0.3572896420955658, - "sft_loss": 0.4683450758457184, - "step": 730 - }, - { - "epoch": 0.0779463592115831, - "grad_norm": 2.800881862640381, - "learning_rate": 0.00038947368421052633, - "logits/chosen": -4.779958248138428, - "logits/rejected": -4.780096530914307, - "logps/chosen": -0.6186120510101318, - "logps/rejected": -3.642204761505127, - "loss": 0.6549, - "odds_ratio_loss": 2.427229881286621, - "rewards/accuracies": 0.8583333492279053, - "rewards/chosen": -0.06186120584607124, - "rewards/margins": 0.3023592531681061, - "rewards/rejected": -0.3642204701900482, - "sft_loss": 0.41214191913604736, - "step": 740 - }, - { - "epoch": 0.07899968839011802, - "grad_norm": 5.068697452545166, - "learning_rate": 0.00039473684210526315, - "logits/chosen": -4.596142768859863, - "logits/rejected": -4.595941066741943, - "logps/chosen": -0.7380008697509766, - "logps/rejected": -4.162142276763916, - "loss": 0.7749, - "odds_ratio_loss": 2.4714393615722656, - "rewards/accuracies": 0.8333333134651184, - "rewards/chosen": -0.07380008697509766, - "rewards/margins": 0.34241411089897156, - "rewards/rejected": -0.4162141978740692, - "sft_loss": 0.527804970741272, - "step": 750 - }, - { - "epoch": 0.08005301756865292, - "grad_norm": 3.4697628021240234, - "learning_rate": 0.0004, - "logits/chosen": -4.708858013153076, - "logits/rejected": -4.708543300628662, - "logps/chosen": -0.675973653793335, - "logps/rejected": -4.2291083335876465, - "loss": 0.7074, - "odds_ratio_loss": 2.3750479221343994, - "rewards/accuracies": 0.8416666388511658, - "rewards/chosen": -0.06759736686944962, - "rewards/margins": 0.3553134799003601, - "rewards/rejected": -0.4229108393192291, - "sft_loss": 0.4698618948459625, - "step": 760 - }, - { - "epoch": 0.08110634674718784, - "grad_norm": 11.160131454467773, - "learning_rate": 0.00040526315789473684, - "logits/chosen": -5.051191329956055, - "logits/rejected": -5.050747871398926, - "logps/chosen": -0.7793533802032471, - "logps/rejected": -5.09091854095459, - "loss": 0.8153, - "odds_ratio_loss": 2.829737901687622, - "rewards/accuracies": 0.8291666507720947, - "rewards/chosen": -0.07793533802032471, - "rewards/margins": 0.43115654587745667, - "rewards/rejected": -0.509091854095459, - "sft_loss": 0.5323660969734192, - "step": 770 - }, - { - "epoch": 0.08215967592572274, - "grad_norm": 3.8492166996002197, - "learning_rate": 0.0004105263157894737, - "logits/chosen": -4.681753158569336, - "logits/rejected": -4.681027889251709, - "logps/chosen": -0.67795729637146, - "logps/rejected": -5.4289870262146, - "loss": 0.7104, - "odds_ratio_loss": 2.6001367568969727, - "rewards/accuracies": 0.8520833253860474, - "rewards/chosen": -0.06779572367668152, - "rewards/margins": 0.4751029908657074, - "rewards/rejected": -0.5428987145423889, - "sft_loss": 0.45040473341941833, - "step": 780 - }, - { - "epoch": 0.08321300510425765, - "grad_norm": 4.350924491882324, - "learning_rate": 0.0004157894736842106, - "logits/chosen": -5.090719699859619, - "logits/rejected": -5.0898871421813965, - "logps/chosen": -0.6309987902641296, - "logps/rejected": -6.083089828491211, - "loss": 0.6608, - "odds_ratio_loss": 2.363413095474243, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -0.06309988349676132, - "rewards/margins": 0.5452090501785278, - "rewards/rejected": -0.6083090305328369, - "sft_loss": 0.4244639277458191, - "step": 790 - }, - { - "epoch": 0.08426633428279255, - "grad_norm": 4.629517078399658, - "learning_rate": 0.00042105263157894734, - "logits/chosen": -5.171376705169678, - "logits/rejected": -5.170820713043213, - "logps/chosen": -0.7821296453475952, - "logps/rejected": -4.942056655883789, - "loss": 0.8229, - "odds_ratio_loss": 2.6525399684906006, - "rewards/accuracies": 0.8208333253860474, - "rewards/chosen": -0.07821296900510788, - "rewards/margins": 0.41599270701408386, - "rewards/rejected": -0.49420568346977234, - "sft_loss": 0.557674765586853, - "step": 800 - }, - { - "epoch": 0.08531966346132745, - "grad_norm": 5.111828327178955, - "learning_rate": 0.0004263157894736842, - "logits/chosen": -4.80694580078125, - "logits/rejected": -4.806312561035156, - "logps/chosen": -0.7565589547157288, - "logps/rejected": -5.1447882652282715, - "loss": 0.791, - "odds_ratio_loss": 2.5595006942749023, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -0.07565589994192123, - "rewards/margins": 0.4388229250907898, - "rewards/rejected": -0.514478862285614, - "sft_loss": 0.5350964665412903, - "step": 810 - }, - { - "epoch": 0.08637299263986237, - "grad_norm": 4.786795616149902, - "learning_rate": 0.0004315789473684211, - "logits/chosen": -4.826291561126709, - "logits/rejected": -4.825577259063721, - "logps/chosen": -0.6888704895973206, - "logps/rejected": -5.567938804626465, - "loss": 0.7202, - "odds_ratio_loss": 2.4763171672821045, - "rewards/accuracies": 0.8541666865348816, - "rewards/chosen": -0.06888704746961594, - "rewards/margins": 0.487906813621521, - "rewards/rejected": -0.5567939281463623, - "sft_loss": 0.472540020942688, - "step": 820 - }, - { - "epoch": 0.08742632181839727, - "grad_norm": 7.63191556930542, - "learning_rate": 0.00043684210526315795, - "logits/chosen": -4.981302738189697, - "logits/rejected": -4.980616569519043, - "logps/chosen": -0.7095519304275513, - "logps/rejected": -5.726536273956299, - "loss": 0.7455, - "odds_ratio_loss": 2.7154994010925293, - "rewards/accuracies": 0.8458333611488342, - "rewards/chosen": -0.07095518708229065, - "rewards/margins": 0.5016984343528748, - "rewards/rejected": -0.5726536512374878, - "sft_loss": 0.4739212989807129, - "step": 830 - }, - { - "epoch": 0.08847965099693218, - "grad_norm": 3.9395651817321777, - "learning_rate": 0.0004421052631578947, - "logits/chosen": -5.055291652679443, - "logits/rejected": -5.054480075836182, - "logps/chosen": -0.6979976892471313, - "logps/rejected": -5.470063209533691, - "loss": 0.7345, - "odds_ratio_loss": 2.5650830268859863, - "rewards/accuracies": 0.8354166746139526, - "rewards/chosen": -0.0697997659444809, - "rewards/margins": 0.4772065579891205, - "rewards/rejected": -0.547006368637085, - "sft_loss": 0.477975457906723, - "step": 840 - }, - { - "epoch": 0.08953298017546708, - "grad_norm": 9.380526542663574, - "learning_rate": 0.0004473684210526316, - "logits/chosen": -4.9736008644104, - "logits/rejected": -4.972882270812988, - "logps/chosen": -0.6816462874412537, - "logps/rejected": -5.888026714324951, - "loss": 0.715, - "odds_ratio_loss": 2.5402190685272217, - "rewards/accuracies": 0.8583333492279053, - "rewards/chosen": -0.0681646317243576, - "rewards/margins": 0.520638108253479, - "rewards/rejected": -0.588802695274353, - "sft_loss": 0.46102267503738403, - "step": 850 - }, - { - "epoch": 0.090586309354002, - "grad_norm": 3.032940626144409, - "learning_rate": 0.00045263157894736845, - "logits/chosen": -4.86979866027832, - "logits/rejected": -4.869546413421631, - "logps/chosen": -0.8415181636810303, - "logps/rejected": -4.487551689147949, - "loss": 0.8781, - "odds_ratio_loss": 2.631443977355957, - "rewards/accuracies": 0.8229166865348816, - "rewards/chosen": -0.08415181934833527, - "rewards/margins": 0.3646034300327301, - "rewards/rejected": -0.44875526428222656, - "sft_loss": 0.6149870157241821, - "step": 860 - }, - { - "epoch": 0.0916396385325369, - "grad_norm": 5.457218170166016, - "learning_rate": 0.00045789473684210527, - "logits/chosen": -4.5640482902526855, - "logits/rejected": -4.5640788078308105, - "logps/chosen": -0.7579687833786011, - "logps/rejected": -3.3095312118530273, - "loss": 0.7946, - "odds_ratio_loss": 2.6955649852752686, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -0.07579687237739563, - "rewards/margins": 0.2551562488079071, - "rewards/rejected": -0.33095312118530273, - "sft_loss": 0.5250447988510132, - "step": 870 - }, - { - "epoch": 0.0926929677110718, - "grad_norm": 4.475607872009277, - "learning_rate": 0.00046315789473684214, - "logits/chosen": -4.721373558044434, - "logits/rejected": -4.7214860916137695, - "logps/chosen": -0.7569971680641174, - "logps/rejected": -3.347615957260132, - "loss": 0.7905, - "odds_ratio_loss": 2.5438392162323, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -0.07569971680641174, - "rewards/margins": 0.25906190276145935, - "rewards/rejected": -0.3347616195678711, - "sft_loss": 0.536092221736908, - "step": 880 - }, - { - "epoch": 0.09374629688960671, - "grad_norm": 31.67135238647461, - "learning_rate": 0.00046842105263157895, - "logits/chosen": -4.7270989418029785, - "logits/rejected": -4.727247714996338, - "logps/chosen": -0.7946822047233582, - "logps/rejected": -3.15295147895813, - "loss": 0.829, - "odds_ratio_loss": 2.443174123764038, - "rewards/accuracies": 0.8520833253860474, - "rewards/chosen": -0.07946821302175522, - "rewards/margins": 0.23582692444324493, - "rewards/rejected": -0.31529513001441956, - "sft_loss": 0.5846543908119202, - "step": 890 - }, - { - "epoch": 0.09479962606814162, - "grad_norm": 3.2339320182800293, - "learning_rate": 0.00047368421052631577, - "logits/chosen": -5.0870866775512695, - "logits/rejected": -5.087241172790527, - "logps/chosen": -0.6878632307052612, - "logps/rejected": -2.8736085891723633, - "loss": 0.7248, - "odds_ratio_loss": 2.5279555320739746, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -0.06878631561994553, - "rewards/margins": 0.21857453882694244, - "rewards/rejected": -0.2873608469963074, - "sft_loss": 0.47198787331581116, - "step": 900 - }, - { - "epoch": 0.09585295524667653, - "grad_norm": 2.4642884731292725, - "learning_rate": 0.00047894736842105264, - "logits/chosen": -5.1205644607543945, - "logits/rejected": -5.120718479156494, - "logps/chosen": -0.6843523383140564, - "logps/rejected": -3.210320472717285, - "loss": 0.7126, - "odds_ratio_loss": 2.3976242542266846, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -0.06843523681163788, - "rewards/margins": 0.25259679555892944, - "rewards/rejected": -0.3210320770740509, - "sft_loss": 0.47283756732940674, - "step": 910 - }, - { - "epoch": 0.09690628442521143, - "grad_norm": 5.920956611633301, - "learning_rate": 0.0004842105263157895, - "logits/chosen": -5.090200424194336, - "logits/rejected": -5.090282917022705, - "logps/chosen": -0.6722908616065979, - "logps/rejected": -3.273742198944092, - "loss": 0.7063, - "odds_ratio_loss": 2.530402421951294, - "rewards/accuracies": 0.84375, - "rewards/chosen": -0.06722908467054367, - "rewards/margins": 0.2601451575756073, - "rewards/rejected": -0.3273741900920868, - "sft_loss": 0.45327988266944885, - "step": 920 - }, - { - "epoch": 0.09795961360374635, - "grad_norm": 4.960987567901611, - "learning_rate": 0.0004894736842105264, - "logits/chosen": -5.087591648101807, - "logits/rejected": -5.087601661682129, - "logps/chosen": -0.7244377136230469, - "logps/rejected": -3.0884244441986084, - "loss": 0.7571, - "odds_ratio_loss": 2.6207199096679688, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.07244376838207245, - "rewards/margins": 0.23639869689941406, - "rewards/rejected": -0.3088424801826477, - "sft_loss": 0.4949897825717926, - "step": 930 - }, - { - "epoch": 0.09901294278228125, - "grad_norm": 5.539714336395264, - "learning_rate": 0.0004947368421052632, - "logits/chosen": -5.29771614074707, - "logits/rejected": -5.297707557678223, - "logps/chosen": -0.6988152265548706, - "logps/rejected": -3.315657615661621, - "loss": 0.7308, - "odds_ratio_loss": 2.3126912117004395, - "rewards/accuracies": 0.8645833134651184, - "rewards/chosen": -0.06988153606653214, - "rewards/margins": 0.26168423891067505, - "rewards/rejected": -0.3315657675266266, - "sft_loss": 0.4995124638080597, - "step": 940 - }, - { - "epoch": 0.10006627196081616, - "grad_norm": 6.586909294128418, - "learning_rate": 0.0005, - "logits/chosen": -5.285855293273926, - "logits/rejected": -5.2858428955078125, - "logps/chosen": -0.6867055296897888, - "logps/rejected": -3.43638277053833, - "loss": 0.7161, - "odds_ratio_loss": 2.5117852687835693, - "rewards/accuracies": 0.8791666626930237, - "rewards/chosen": -0.06867055594921112, - "rewards/margins": 0.2749677002429962, - "rewards/rejected": -0.34363824129104614, - "sft_loss": 0.4649271070957184, - "step": 950 - }, - { - "epoch": 0.10111960113935106, - "grad_norm": 6.756776809692383, - "learning_rate": 0.0004999983096040005, - "logits/chosen": -5.550118923187256, - "logits/rejected": -5.5501179695129395, - "logps/chosen": -0.7224279642105103, - "logps/rejected": -3.376439332962036, - "loss": 0.7555, - "odds_ratio_loss": 2.5149753093719482, - "rewards/accuracies": 0.8604166507720947, - "rewards/chosen": -0.07224280387163162, - "rewards/margins": 0.2654011845588684, - "rewards/rejected": -0.3376440107822418, - "sft_loss": 0.5040432214736938, - "step": 960 - }, - { - "epoch": 0.10217293031788596, - "grad_norm": 161.19760131835938, - "learning_rate": 0.0004999932384388613, - "logits/chosen": -5.225105285644531, - "logits/rejected": -5.225213527679443, - "logps/chosen": -1.5466647148132324, - "logps/rejected": -3.935602903366089, - "loss": 1.5839, - "odds_ratio_loss": 3.059565782546997, - "rewards/accuracies": 0.8604166507720947, - "rewards/chosen": -0.154666468501091, - "rewards/margins": 0.2388937920331955, - "rewards/rejected": -0.3935602605342865, - "sft_loss": 1.277909755706787, - "step": 970 - }, - { - "epoch": 0.10322625949642088, - "grad_norm": 4.612696170806885, - "learning_rate": 0.000499984786573161, - "logits/chosen": -5.253983020782471, - "logits/rejected": -5.254087448120117, - "logps/chosen": -0.6865792870521545, - "logps/rejected": -3.219088077545166, - "loss": 0.7214, - "odds_ratio_loss": 2.43359375, - "rewards/accuracies": 0.8166666626930237, - "rewards/chosen": -0.06865792721509933, - "rewards/margins": 0.2532508671283722, - "rewards/rejected": -0.3219088315963745, - "sft_loss": 0.4780765473842621, - "step": 980 - }, - { - "epoch": 0.10427958867495578, - "grad_norm": 4.7582621574401855, - "learning_rate": 0.0004999729541211952, - "logits/chosen": -5.1987152099609375, - "logits/rejected": -5.198534965515137, - "logps/chosen": -0.8549334406852722, - "logps/rejected": -4.291516304016113, - "loss": 0.8956, - "odds_ratio_loss": 2.6601970195770264, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.08549333363771439, - "rewards/margins": 0.3436582684516907, - "rewards/rejected": -0.42915162444114685, - "sft_loss": 0.6296234726905823, - "step": 990 - }, - { - "epoch": 0.10533291785349069, - "grad_norm": 5.924813747406006, - "learning_rate": 0.0004999577412429764, - "logits/chosen": -5.115817070007324, - "logits/rejected": -5.115783214569092, - "logps/chosen": -0.6959132552146912, - "logps/rejected": -3.6701784133911133, - "loss": 0.7282, - "odds_ratio_loss": 2.448162078857422, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -0.06959132105112076, - "rewards/margins": 0.29742658138275146, - "rewards/rejected": -0.36701786518096924, - "sft_loss": 0.48338958621025085, - "step": 1000 - }, - { - "epoch": 0.1063862470320256, - "grad_norm": 3.775995969772339, - "learning_rate": 0.0004999391481442307, - "logits/chosen": -5.038882732391357, - "logits/rejected": -5.039083003997803, - "logps/chosen": -0.6505192518234253, - "logps/rejected": -2.7321646213531494, - "loss": 0.686, - "odds_ratio_loss": 2.5259392261505127, - "rewards/accuracies": 0.8395833373069763, - "rewards/chosen": -0.06505192071199417, - "rewards/margins": 0.2081645280122757, - "rewards/rejected": -0.2732164263725281, - "sft_loss": 0.43344053626060486, - "step": 1010 - }, - { - "epoch": 0.10743957621056051, - "grad_norm": 3.577996253967285, - "learning_rate": 0.0004999171750763959, - "logits/chosen": -4.925287246704102, - "logits/rejected": -4.925505638122559, - "logps/chosen": -0.6491485238075256, - "logps/rejected": -2.673661947250366, - "loss": 0.683, - "odds_ratio_loss": 2.427180528640747, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -0.06491485238075256, - "rewards/margins": 0.20245136320590973, - "rewards/rejected": -0.2673662006855011, - "sft_loss": 0.44026196002960205, - "step": 1020 - }, - { - "epoch": 0.10849290538909541, - "grad_norm": 4.37986946105957, - "learning_rate": 0.0004998918223366173, - "logits/chosen": -5.010101318359375, - "logits/rejected": -5.010295391082764, - "logps/chosen": -0.7345671057701111, - "logps/rejected": -2.6488146781921387, - "loss": 0.7723, - "odds_ratio_loss": 2.661365509033203, - "rewards/accuracies": 0.8291666507720947, - "rewards/chosen": -0.07345671951770782, - "rewards/margins": 0.19142475724220276, - "rewards/rejected": -0.2648814618587494, - "sft_loss": 0.5061719417572021, - "step": 1030 - }, - { - "epoch": 0.10954623456763031, - "grad_norm": 8.366073608398438, - "learning_rate": 0.0004998630902677444, - "logits/chosen": -5.033807277679443, - "logits/rejected": -5.034041881561279, - "logps/chosen": -0.7631211876869202, - "logps/rejected": -2.8217647075653076, - "loss": 0.8007, - "odds_ratio_loss": 2.807353973388672, - "rewards/accuracies": 0.84375, - "rewards/chosen": -0.0763121098279953, - "rewards/margins": 0.2058643400669098, - "rewards/rejected": -0.2821764647960663, - "sft_loss": 0.5199962854385376, - "step": 1040 - }, - { - "epoch": 0.11059956374616522, - "grad_norm": 3.063091993331909, - "learning_rate": 0.0004998309792583257, - "logits/chosen": -5.056707859039307, - "logits/rejected": -5.056928634643555, - "logps/chosen": -0.7930384278297424, - "logps/rejected": -3.048496723175049, - "loss": 0.8372, - "odds_ratio_loss": 2.895695209503174, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.07930383831262589, - "rewards/margins": 0.22554583847522736, - "rewards/rejected": -0.30484965443611145, - "sft_loss": 0.5476340651512146, - "step": 1050 - }, - { - "epoch": 0.11165289292470013, - "grad_norm": 4.035704612731934, - "learning_rate": 0.0004997954897426039, - "logits/chosen": -4.731100559234619, - "logits/rejected": -4.731292247772217, - "logps/chosen": -0.7024089694023132, - "logps/rejected": -3.2537832260131836, - "loss": 0.7407, - "odds_ratio_loss": 2.768791675567627, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -0.07024088501930237, - "rewards/margins": 0.25513747334480286, - "rewards/rejected": -0.3253783583641052, - "sft_loss": 0.46386364102363586, - "step": 1060 - }, - { - "epoch": 0.11270622210323504, - "grad_norm": 5.625176906585693, - "learning_rate": 0.0004997566222005095, - "logits/chosen": -5.09245491027832, - "logits/rejected": -5.0925984382629395, - "logps/chosen": -0.7186325192451477, - "logps/rejected": -3.2234888076782227, - "loss": 0.7539, - "odds_ratio_loss": 2.5938258171081543, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -0.07186325639486313, - "rewards/margins": 0.2504856288433075, - "rewards/rejected": -0.3223489224910736, - "sft_loss": 0.4945569932460785, - "step": 1070 - }, - { - "epoch": 0.11375955128176994, - "grad_norm": 8.753211975097656, - "learning_rate": 0.0004997143771576551, - "logits/chosen": -5.339606761932373, - "logits/rejected": -5.339755058288574, - "logps/chosen": -0.6703072190284729, - "logps/rejected": -3.089592933654785, - "loss": 0.706, - "odds_ratio_loss": 2.5606048107147217, - "rewards/accuracies": 0.8208333253860474, - "rewards/chosen": -0.06703073531389236, - "rewards/margins": 0.2419285774230957, - "rewards/rejected": -0.30895933508872986, - "sft_loss": 0.44997820258140564, - "step": 1080 - }, - { - "epoch": 0.11481288046030486, - "grad_norm": 7.652149677276611, - "learning_rate": 0.0004996687551853271, - "logits/chosen": -5.191481113433838, - "logits/rejected": -5.191573619842529, - "logps/chosen": -0.7030736207962036, - "logps/rejected": -3.310757875442505, - "loss": 0.7353, - "odds_ratio_loss": 2.5654568672180176, - "rewards/accuracies": 0.8645833134651184, - "rewards/chosen": -0.07030736654996872, - "rewards/margins": 0.26076844334602356, - "rewards/rejected": -0.3310757875442505, - "sft_loss": 0.47873547673225403, - "step": 1090 - }, - { - "epoch": 0.11586620963883976, - "grad_norm": 4.607894420623779, - "learning_rate": 0.0004996197569004794, - "logits/chosen": -4.798411846160889, - "logits/rejected": -4.798523426055908, - "logps/chosen": -0.6611460447311401, - "logps/rejected": -2.983532428741455, - "loss": 0.6988, - "odds_ratio_loss": 2.426917314529419, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.06611461192369461, - "rewards/margins": 0.23223866522312164, - "rewards/rejected": -0.29835325479507446, - "sft_loss": 0.4561263918876648, - "step": 1100 - }, - { - "epoch": 0.11691953881737467, - "grad_norm": 6.003838539123535, - "learning_rate": 0.000499567382965724, - "logits/chosen": -4.689076900482178, - "logits/rejected": -4.689194202423096, - "logps/chosen": -0.6927405595779419, - "logps/rejected": -3.072282552719116, - "loss": 0.7266, - "odds_ratio_loss": 2.494777202606201, - "rewards/accuracies": 0.8520833253860474, - "rewards/chosen": -0.06927405297756195, - "rewards/margins": 0.23795419931411743, - "rewards/rejected": -0.3072282373905182, - "sft_loss": 0.477167546749115, - "step": 1110 - }, - { - "epoch": 0.11797286799590957, - "grad_norm": 3.377394676208496, - "learning_rate": 0.0004995116340893223, - "logits/chosen": -4.666645526885986, - "logits/rejected": -4.666871547698975, - "logps/chosen": -0.6164705157279968, - "logps/rejected": -3.2369017601013184, - "loss": 0.6451, - "odds_ratio_loss": 2.411543369293213, - "rewards/accuracies": 0.8645833134651184, - "rewards/chosen": -0.061647046357393265, - "rewards/margins": 0.26204314827919006, - "rewards/rejected": -0.3236902058124542, - "sft_loss": 0.40390655398368835, - "step": 1120 - }, - { - "epoch": 0.11902619717444447, - "grad_norm": 3.650819778442383, - "learning_rate": 0.0004994525110251759, - "logits/chosen": -5.011782169342041, - "logits/rejected": -5.012125492095947, - "logps/chosen": -0.8005008101463318, - "logps/rejected": -3.1773478984832764, - "loss": 0.8349, - "odds_ratio_loss": 2.8024778366088867, - "rewards/accuracies": 0.8354166746139526, - "rewards/chosen": -0.08005008101463318, - "rewards/margins": 0.2376846969127655, - "rewards/rejected": -0.3177347779273987, - "sft_loss": 0.5546395182609558, - "step": 1130 - }, - { - "epoch": 0.12007952635297939, - "grad_norm": 5.192956447601318, - "learning_rate": 0.0004993900145728157, - "logits/chosen": -5.2029805183410645, - "logits/rejected": -5.203344821929932, - "logps/chosen": -0.7520886063575745, - "logps/rejected": -3.3132715225219727, - "loss": 0.7852, - "odds_ratio_loss": 2.5903329849243164, + "epoch": 0.02275191025635399, + "grad_norm": 1.0390625, + "learning_rate": 1.1376404494382022e-05, + "logits/chosen": -7.079671859741211, + "logits/rejected": -7.079834461212158, + "logps/chosen": -0.7357921004295349, + "logps/rejected": -3.681584596633911, + "loss": 0.8192, "rewards/accuracies": 0.84375, - "rewards/chosen": -0.07520885765552521, - "rewards/margins": 0.2561182677745819, - "rewards/rejected": -0.3313271403312683, - "sft_loss": 0.5261538624763489, - "step": 1140 - }, - { - "epoch": 0.12113285553151429, - "grad_norm": 7.4040985107421875, - "learning_rate": 0.0004993241455773918, - "logits/chosen": -5.165520191192627, - "logits/rejected": -5.165550708770752, - "logps/chosen": -0.7126467227935791, - "logps/rejected": -3.5545897483825684, - "loss": 0.748, - "odds_ratio_loss": 2.594726800918579, - "rewards/accuracies": 0.8354166746139526, - "rewards/chosen": -0.07126467674970627, - "rewards/margins": 0.28419435024261475, - "rewards/rejected": -0.3554590046405792, - "sft_loss": 0.48852840065956116, - "step": 1150 - }, - { - "epoch": 0.1221861847100492, - "grad_norm": 4.905418872833252, - "learning_rate": 0.0004992549049296619, - "logits/chosen": -5.190316200256348, - "logits/rejected": -5.190418720245361, - "logps/chosen": -0.6702563762664795, - "logps/rejected": -3.4835519790649414, - "loss": 0.7052, - "odds_ratio_loss": 2.497361183166504, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.06702563911676407, - "rewards/margins": 0.28132954239845276, - "rewards/rejected": -0.3483552038669586, - "sft_loss": 0.4554961621761322, - "step": 1160 - }, - { - "epoch": 0.1232395138885841, - "grad_norm": 4.1152238845825195, - "learning_rate": 0.0004991822935659786, - "logits/chosen": -5.477373123168945, - "logits/rejected": -5.477328300476074, - "logps/chosen": -0.8472169637680054, - "logps/rejected": -3.917269229888916, - "loss": 0.8817, - "odds_ratio_loss": 3.221430540084839, - "rewards/accuracies": 0.8416666388511658, - "rewards/chosen": -0.08472169190645218, - "rewards/margins": 0.30700525641441345, - "rewards/rejected": -0.39172691106796265, - "sft_loss": 0.5595788955688477, - "step": 1170 - }, - { - "epoch": 0.12429284306711902, - "grad_norm": 5.008730888366699, - "learning_rate": 0.0004991063124682778, - "logits/chosen": -5.323733329772949, - "logits/rejected": -5.323288917541504, - "logps/chosen": -0.757644534111023, - "logps/rejected": -5.853792190551758, - "loss": 0.7925, - "odds_ratio_loss": 2.670191526412964, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.07576445490121841, - "rewards/margins": 0.5096147656440735, - "rewards/rejected": -0.5853793025016785, - "sft_loss": 0.5255211591720581, - "step": 1180 - }, - { - "epoch": 0.12534617224565392, - "grad_norm": 31.133501052856445, - "learning_rate": 0.0004990269626640645, - "logits/chosen": -5.64047384262085, - "logits/rejected": -5.64005184173584, - "logps/chosen": -0.768204391002655, - "logps/rejected": -5.164267063140869, - "loss": 0.8058, - "odds_ratio_loss": 2.815326452255249, - "rewards/accuracies": 0.8354166746139526, - "rewards/chosen": -0.07682044059038162, - "rewards/margins": 0.43960627913475037, - "rewards/rejected": -0.516426682472229, - "sft_loss": 0.5242764949798584, - "step": 1190 - }, - { - "epoch": 0.12639950142418882, - "grad_norm": 6.6835222244262695, - "learning_rate": 0.0004989442452263996, - "logits/chosen": -5.151331424713135, - "logits/rejected": -5.1510162353515625, - "logps/chosen": -0.8430954217910767, - "logps/rejected": -4.105188369750977, - "loss": 0.8846, - "odds_ratio_loss": 3.0806498527526855, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.08430954068899155, - "rewards/margins": 0.32620927691459656, - "rewards/rejected": -0.4105188250541687, - "sft_loss": 0.5764933228492737, - "step": 1200 - }, - { - "epoch": 0.12745283060272372, - "grad_norm": 6.060980796813965, - "learning_rate": 0.0004988581612738847, - "logits/chosen": -5.232106685638428, - "logits/rejected": -5.2317962646484375, - "logps/chosen": -0.7580560445785522, - "logps/rejected": -3.9481935501098633, - "loss": 0.7955, - "odds_ratio_loss": 2.8234329223632812, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -0.07580561190843582, - "rewards/margins": 0.319013774394989, - "rewards/rejected": -0.39481934905052185, - "sft_loss": 0.5131634473800659, - "step": 1210 - }, - { - "epoch": 0.12850615978125865, - "grad_norm": 4.154183864593506, - "learning_rate": 0.0004987687119706477, - "logits/chosen": -5.385165214538574, - "logits/rejected": -5.385090351104736, - "logps/chosen": -0.7838355302810669, - "logps/rejected": -3.81575608253479, - "loss": 0.8198, - "odds_ratio_loss": 2.7132718563079834, - "rewards/accuracies": 0.8479166626930237, - "rewards/chosen": -0.07838355004787445, - "rewards/margins": 0.30319201946258545, - "rewards/rejected": -0.3815755844116211, - "sft_loss": 0.5484524965286255, - "step": 1220 - }, - { - "epoch": 0.12955948895979355, - "grad_norm": 4.047881603240967, - "learning_rate": 0.0004986758985263265, - "logits/chosen": -5.267871379852295, - "logits/rejected": -5.267872333526611, - "logps/chosen": -0.7567169666290283, - "logps/rejected": -2.9766650199890137, - "loss": 0.7939, - "odds_ratio_loss": 2.620311975479126, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -0.07567168772220612, - "rewards/margins": 0.2219947874546051, - "rewards/rejected": -0.2976664900779724, - "sft_loss": 0.5318555235862732, - "step": 1230 - }, - { - "epoch": 0.13061281813832845, - "grad_norm": 2.7552692890167236, - "learning_rate": 0.0004985797221960529, - "logits/chosen": -5.264489650726318, - "logits/rejected": -5.264598369598389, - "logps/chosen": -0.7095692753791809, - "logps/rejected": -3.2434911727905273, - "loss": 0.7445, - "odds_ratio_loss": 2.7399401664733887, - "rewards/accuracies": 0.8458333611488342, - "rewards/chosen": -0.07095693051815033, - "rewards/margins": 0.25339218974113464, - "rewards/rejected": -0.3243491053581238, - "sft_loss": 0.47051993012428284, - "step": 1240 - }, - { - "epoch": 0.13166614731686335, - "grad_norm": 3.3210740089416504, - "learning_rate": 0.0004984801842804357, - "logits/chosen": -5.190452575683594, - "logits/rejected": -5.190454959869385, - "logps/chosen": -0.6608388423919678, - "logps/rejected": -3.0926926136016846, - "loss": 0.6955, - "odds_ratio_loss": 2.5877368450164795, - "rewards/accuracies": 0.8541666865348816, - "rewards/chosen": -0.0660838857293129, - "rewards/margins": 0.2431853860616684, - "rewards/rejected": -0.3092692792415619, - "sft_loss": 0.43669426441192627, - "step": 1250 - }, - { - "epoch": 0.13271947649539828, - "grad_norm": 2.113543748855591, - "learning_rate": 0.0004983772861255426, - "logits/chosen": -5.355245113372803, - "logits/rejected": -5.3552327156066895, - "logps/chosen": -0.7177728414535522, - "logps/rejected": -3.0373685359954834, - "loss": 0.7558, - "odds_ratio_loss": 2.9420340061187744, - "rewards/accuracies": 0.8020833134651184, - "rewards/chosen": -0.07177729159593582, - "rewards/margins": 0.23195955157279968, - "rewards/rejected": -0.3037368357181549, - "sft_loss": 0.4615623354911804, - "step": 1260 - }, - { - "epoch": 0.13377280567393318, - "grad_norm": 2.6125612258911133, - "learning_rate": 0.0004982710291228828, - "logits/chosen": -5.5682549476623535, - "logits/rejected": -5.568284034729004, - "logps/chosen": -0.7032897472381592, - "logps/rejected": -3.1011478900909424, - "loss": 0.7454, - "odds_ratio_loss": 2.6539957523345947, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -0.0703289657831192, - "rewards/margins": 0.23978586494922638, - "rewards/rejected": -0.3101148307323456, - "sft_loss": 0.4799610376358032, - "step": 1270 - }, - { - "epoch": 0.13482613485246808, - "grad_norm": 4.396284580230713, - "learning_rate": 0.0004981614147093875, - "logits/chosen": -5.787298202514648, - "logits/rejected": -5.787331581115723, - "logps/chosen": -0.7411842942237854, - "logps/rejected": -3.3715906143188477, - "loss": 0.7797, - "odds_ratio_loss": 2.705798625946045, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.07411842048168182, - "rewards/margins": 0.2630406320095062, - "rewards/rejected": -0.33715906739234924, - "sft_loss": 0.5091153979301453, - "step": 1280 - }, - { - "epoch": 0.13587946403100298, - "grad_norm": 5.317102909088135, - "learning_rate": 0.000498048444367391, - "logits/chosen": -5.471971035003662, - "logits/rejected": -5.472008228302002, - "logps/chosen": -0.7457516193389893, - "logps/rejected": -3.6155009269714355, - "loss": 0.7782, - "odds_ratio_loss": 2.801055908203125, - "rewards/accuracies": 0.8583333492279053, - "rewards/chosen": -0.07457517087459564, - "rewards/margins": 0.2869749367237091, - "rewards/rejected": -0.36155006289482117, - "sft_loss": 0.49809518456459045, - "step": 1290 - }, - { - "epoch": 0.13693279320953788, - "grad_norm": 4.3287506103515625, - "learning_rate": 0.00049793211962461, - "logits/chosen": -5.262281894683838, - "logits/rejected": -5.262419700622559, - "logps/chosen": -0.7028716802597046, - "logps/rejected": -2.8387629985809326, - "loss": 0.7409, - "odds_ratio_loss": 2.748593807220459, - "rewards/accuracies": 0.8208333253860474, - "rewards/chosen": -0.07028716057538986, - "rewards/margins": 0.2135891616344452, - "rewards/rejected": -0.28387632966041565, - "sft_loss": 0.4660036265850067, - "step": 1300 - }, - { - "epoch": 0.1379861223880728, - "grad_norm": 1.7063987255096436, - "learning_rate": 0.0004978124420541238, - "logits/chosen": -5.180874347686768, - "logits/rejected": -5.181042671203613, - "logps/chosen": -0.7432348132133484, - "logps/rejected": -2.8490519523620605, - "loss": 0.7859, - "odds_ratio_loss": 2.679919958114624, - "rewards/accuracies": 0.8229166865348816, - "rewards/chosen": -0.07432348281145096, - "rewards/margins": 0.2105817198753357, - "rewards/rejected": -0.28490516543388367, - "sft_loss": 0.5179334282875061, - "step": 1310 - }, - { - "epoch": 0.1390394515666077, - "grad_norm": 2.908730983734131, - "learning_rate": 0.0004976894132743521, - "logits/chosen": -5.495538234710693, - "logits/rejected": -5.49543571472168, - "logps/chosen": -0.696144700050354, - "logps/rejected": -3.018319845199585, - "loss": 0.7357, - "odds_ratio_loss": 3.030001163482666, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -0.0696144625544548, - "rewards/margins": 0.23221756517887115, - "rewards/rejected": -0.30183205008506775, - "sft_loss": 0.4326884150505066, - "step": 1320 - }, - { - "epoch": 0.14009278074514261, - "grad_norm": 3.4632487297058105, - "learning_rate": 0.0004975630349490338, - "logits/chosen": -5.525754928588867, - "logits/rejected": -5.52562141418457, - "logps/chosen": -0.7210444808006287, - "logps/rejected": -3.5407607555389404, - "loss": 0.7551, - "odds_ratio_loss": 2.435321092605591, - "rewards/accuracies": 0.8229166865348816, - "rewards/chosen": -0.07210444658994675, - "rewards/margins": 0.28197160363197327, - "rewards/rejected": -0.3540760576725006, - "sft_loss": 0.5115490555763245, - "step": 1330 - }, - { - "epoch": 0.14114610992367752, - "grad_norm": 6.958703517913818, - "learning_rate": 0.0004974333087872041, - "logits/chosen": -5.362403869628906, - "logits/rejected": -5.362163066864014, - "logps/chosen": -0.732792854309082, - "logps/rejected": -4.104527950286865, - "loss": 0.7646, - "odds_ratio_loss": 2.719597339630127, - "rewards/accuracies": 0.8479166626930237, - "rewards/chosen": -0.07327928394079208, - "rewards/margins": 0.3371734619140625, - "rewards/rejected": -0.4104527533054352, - "sft_loss": 0.4926711320877075, - "step": 1340 - }, - { - "epoch": 0.14219943910221244, - "grad_norm": 2.859614849090576, - "learning_rate": 0.0004973002365431719, - "logits/chosen": -5.708899021148682, - "logits/rejected": -5.708664894104004, - "logps/chosen": -0.6528833508491516, - "logps/rejected": -4.244791030883789, - "loss": 0.6818, - "odds_ratio_loss": 2.3199055194854736, - "rewards/accuracies": 0.8770833611488342, - "rewards/chosen": -0.06528832763433456, - "rewards/margins": 0.35919085144996643, - "rewards/rejected": -0.4244791567325592, - "sft_loss": 0.44977909326553345, - "step": 1350 - }, - { - "epoch": 0.14325276828074734, - "grad_norm": 4.355047702789307, - "learning_rate": 0.0004971638200164954, - "logits/chosen": -6.141923427581787, - "logits/rejected": -6.141890525817871, - "logps/chosen": -0.7438533902168274, - "logps/rejected": -3.756343126296997, - "loss": 0.7845, - "odds_ratio_loss": 2.7992961406707764, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.07438533753156662, - "rewards/margins": 0.3012489676475525, - "rewards/rejected": -0.3756342828273773, - "sft_loss": 0.5046018362045288, - "step": 1360 - }, - { - "epoch": 0.14430609745928225, - "grad_norm": 2.1691360473632812, - "learning_rate": 0.0004970240610519582, - "logits/chosen": -5.513346195220947, - "logits/rejected": -5.513165473937988, - "logps/chosen": -0.7122442722320557, - "logps/rejected": -4.005341529846191, - "loss": 0.7452, - "odds_ratio_loss": 2.3780596256256104, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.07122442126274109, - "rewards/margins": 0.32930976152420044, - "rewards/rejected": -0.40053418278694153, - "sft_loss": 0.5073560476303101, - "step": 1370 - }, - { - "epoch": 0.14535942663781715, - "grad_norm": 8.108336448669434, - "learning_rate": 0.0004968809615395443, - "logits/chosen": -5.834110736846924, - "logits/rejected": -5.834160327911377, - "logps/chosen": -0.7287462949752808, - "logps/rejected": -3.322920560836792, - "loss": 0.7683, - "odds_ratio_loss": 2.7808871269226074, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.07287462800741196, - "rewards/margins": 0.25941744446754456, - "rewards/rejected": -0.3322920799255371, - "sft_loss": 0.4901922941207886, - "step": 1380 - }, - { - "epoch": 0.14641275581635205, - "grad_norm": 4.8621392250061035, - "learning_rate": 0.0004967345234144125, - "logits/chosen": -5.492813587188721, - "logits/rejected": -5.492822647094727, - "logps/chosen": -0.6987211108207703, - "logps/rejected": -3.542156934738159, - "loss": 0.7353, - "odds_ratio_loss": 2.4634013175964355, - "rewards/accuracies": 0.8333333134651184, - "rewards/chosen": -0.06987211108207703, - "rewards/margins": 0.2843436002731323, - "rewards/rejected": -0.35421568155288696, - "sft_loss": 0.48899564146995544, - "step": 1390 - }, - { - "epoch": 0.14746608499488698, - "grad_norm": 4.177161693572998, - "learning_rate": 0.00049658474865687, - "logits/chosen": -5.457418441772461, - "logits/rejected": -5.457381248474121, - "logps/chosen": -0.6720997095108032, - "logps/rejected": -3.794857978820801, - "loss": 0.705, - "odds_ratio_loss": 2.515441417694092, - "rewards/accuracies": 0.8520833253860474, - "rewards/chosen": -0.06720996648073196, - "rewards/margins": 0.31227582693099976, - "rewards/rejected": -0.3794857859611511, - "sft_loss": 0.4534277021884918, - "step": 1400 - }, - { - "epoch": 0.14851941417342188, - "grad_norm": 6.939654350280762, - "learning_rate": 0.000496431639292346, - "logits/chosen": -5.591572284698486, - "logits/rejected": -5.591520309448242, - "logps/chosen": -0.6898292899131775, - "logps/rejected": -3.8443169593811035, - "loss": 0.7205, - "odds_ratio_loss": 2.573276996612549, - "rewards/accuracies": 0.8583333492279053, - "rewards/chosen": -0.06898292899131775, - "rewards/margins": 0.3154487609863281, - "rewards/rejected": -0.3844316899776459, - "sft_loss": 0.46317487955093384, - "step": 1410 - }, - { - "epoch": 0.14957274335195678, - "grad_norm": 3.3419246673583984, - "learning_rate": 0.0004962751973913644, - "logits/chosen": -5.660191059112549, - "logits/rejected": -5.6601338386535645, - "logps/chosen": -0.7214117050170898, - "logps/rejected": -3.800171375274658, - "loss": 0.7539, - "odds_ratio_loss": 2.2002346515655518, - "rewards/accuracies": 0.8354166746139526, - "rewards/chosen": -0.07214117050170898, - "rewards/margins": 0.30787599086761475, - "rewards/rejected": -0.38001713156700134, - "sft_loss": 0.5338917374610901, - "step": 1420 - }, - { - "epoch": 0.15062607253049168, - "grad_norm": 6.627447128295898, - "learning_rate": 0.0004961154250695152, - "logits/chosen": -5.646604537963867, - "logits/rejected": -5.646506309509277, - "logps/chosen": -0.6789618730545044, - "logps/rejected": -3.7556862831115723, - "loss": 0.7131, - "odds_ratio_loss": 2.4358396530151367, - "rewards/accuracies": 0.8395833373069763, - "rewards/chosen": -0.06789619475603104, - "rewards/margins": 0.3076724112033844, - "rewards/rejected": -0.3755686581134796, - "sft_loss": 0.46949687600135803, - "step": 1430 - }, - { - "epoch": 0.15167940170902658, - "grad_norm": 7.264041900634766, - "learning_rate": 0.0004959523244874262, - "logits/chosen": -5.661590576171875, - "logits/rejected": -5.661508560180664, - "logps/chosen": -0.6989915370941162, - "logps/rejected": -3.6030194759368896, - "loss": 0.7324, - "odds_ratio_loss": 2.595428943634033, - "rewards/accuracies": 0.8458333611488342, - "rewards/chosen": -0.06989916414022446, - "rewards/margins": 0.2904028296470642, - "rewards/rejected": -0.3603019714355469, - "sft_loss": 0.4728315472602844, - "step": 1440 - }, - { - "epoch": 0.1527327308875615, - "grad_norm": 3.4570446014404297, - "learning_rate": 0.0004957858978507342, - "logits/chosen": -5.628535270690918, - "logits/rejected": -5.628504276275635, - "logps/chosen": -0.6590794324874878, - "logps/rejected": -3.3306422233581543, - "loss": 0.6931, - "odds_ratio_loss": 2.6064822673797607, - "rewards/accuracies": 0.8416666388511658, - "rewards/chosen": -0.06590793281793594, - "rewards/margins": 0.2671562433242798, - "rewards/rejected": -0.3330641984939575, - "sft_loss": 0.43240654468536377, - "step": 1450 - }, - { - "epoch": 0.1537860600660964, - "grad_norm": 7.473481178283691, - "learning_rate": 0.0004956161474100544, - "logits/chosen": -5.7138261795043945, - "logits/rejected": -5.713827133178711, - "logps/chosen": -0.6599766612052917, - "logps/rejected": -3.381110668182373, - "loss": 0.6939, - "odds_ratio_loss": 2.4460840225219727, - "rewards/accuracies": 0.84375, - "rewards/chosen": -0.06599767506122589, - "rewards/margins": 0.2721133828163147, - "rewards/rejected": -0.3381110727787018, - "sft_loss": 0.44932422041893005, - "step": 1460 - }, - { - "epoch": 0.1548393892446313, - "grad_norm": 6.867354869842529, - "learning_rate": 0.0004954430754609506, - "logits/chosen": -5.79508638381958, - "logits/rejected": -5.795089244842529, - "logps/chosen": -0.6903258562088013, - "logps/rejected": -3.085076332092285, - "loss": 0.7338, - "odds_ratio_loss": 2.651606798171997, - "rewards/accuracies": 0.8041666746139526, - "rewards/chosen": -0.06903257966041565, - "rewards/margins": 0.2394750565290451, - "rewards/rejected": -0.30850762128829956, - "sft_loss": 0.4686751961708069, - "step": 1470 - }, - { - "epoch": 0.1558927184231662, - "grad_norm": 3.0605275630950928, - "learning_rate": 0.0004952666843439038, - "logits/chosen": -5.6379008293151855, - "logits/rejected": -5.6378703117370605, - "logps/chosen": -0.6344018578529358, - "logps/rejected": -3.5333213806152344, - "loss": 0.6667, - "odds_ratio_loss": 2.371415853500366, - "rewards/accuracies": 0.8354166746139526, - "rewards/chosen": -0.06344018131494522, - "rewards/margins": 0.2898919880390167, - "rewards/rejected": -0.35333216190338135, - "sft_loss": 0.42956602573394775, - "step": 1480 - }, - { - "epoch": 0.15694604760170114, - "grad_norm": 3.5998635292053223, - "learning_rate": 0.0004950869764442807, - "logits/chosen": -5.513609886169434, - "logits/rejected": -5.513594627380371, - "logps/chosen": -0.6546897888183594, - "logps/rejected": -3.4388887882232666, - "loss": 0.6841, - "odds_ratio_loss": 2.4476258754730225, - "rewards/accuracies": 0.8854166865348816, - "rewards/chosen": -0.06546898186206818, - "rewards/margins": 0.2784199118614197, - "rewards/rejected": -0.34388887882232666, - "sft_loss": 0.43934836983680725, - "step": 1490 - }, - { - "epoch": 0.15799937678023604, - "grad_norm": 3.7529213428497314, - "learning_rate": 0.0004949039541923015, - "logits/chosen": -5.581011772155762, - "logits/rejected": -5.580976486206055, - "logps/chosen": -0.6702675223350525, - "logps/rejected": -3.7115352153778076, - "loss": 0.7055, - "odds_ratio_loss": 2.6884007453918457, - "rewards/accuracies": 0.8416666388511658, - "rewards/chosen": -0.06702675670385361, - "rewards/margins": 0.3041267395019531, - "rewards/rejected": -0.37115350365638733, - "sft_loss": 0.43668031692504883, - "step": 1500 - }, - { - "epoch": 0.15905270595877094, - "grad_norm": 6.72556734085083, - "learning_rate": 0.0004947176200630068, - "logits/chosen": -5.502162456512451, - "logits/rejected": -5.502138137817383, - "logps/chosen": -0.6218239068984985, - "logps/rejected": -3.428339719772339, - "loss": 0.6499, - "odds_ratio_loss": 2.3972864151000977, - "rewards/accuracies": 0.8708333373069763, - "rewards/chosen": -0.06218238174915314, - "rewards/margins": 0.28065159916877747, - "rewards/rejected": -0.3428339660167694, - "sft_loss": 0.4101923108100891, - "step": 1510 - }, - { - "epoch": 0.16010603513730584, - "grad_norm": 3.806243658065796, - "learning_rate": 0.0004945279765762243, - "logits/chosen": -5.590113639831543, - "logits/rejected": -5.590085029602051, - "logps/chosen": -0.6871938109397888, - "logps/rejected": -3.6291747093200684, - "loss": 0.7198, - "odds_ratio_loss": 2.472882032394409, - "rewards/accuracies": 0.8458333611488342, - "rewards/chosen": -0.06871937960386276, - "rewards/margins": 0.2941981554031372, - "rewards/rejected": -0.3629175126552582, - "sft_loss": 0.4725038409233093, - "step": 1520 - }, - { - "epoch": 0.16115936431584074, - "grad_norm": 5.523288249969482, - "learning_rate": 0.0004943350262965349, - "logits/chosen": -5.691066265106201, - "logits/rejected": -5.6911163330078125, - "logps/chosen": -0.6510820984840393, - "logps/rejected": -3.059138774871826, - "loss": 0.685, - "odds_ratio_loss": 2.56689715385437, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -0.06510820984840393, - "rewards/margins": 0.24080567061901093, - "rewards/rejected": -0.30591386556625366, - "sft_loss": 0.42830324172973633, - "step": 1530 - }, - { - "epoch": 0.16221269349437567, - "grad_norm": 4.537405967712402, - "learning_rate": 0.0004941387718332374, - "logits/chosen": -5.746434688568115, - "logits/rejected": -5.746466159820557, - "logps/chosen": -0.6964403390884399, - "logps/rejected": -3.4062578678131104, - "loss": 0.7309, - "odds_ratio_loss": 2.425229787826538, - "rewards/accuracies": 0.8416666388511658, - "rewards/chosen": -0.0696440264582634, - "rewards/margins": 0.2709817886352539, - "rewards/rejected": -0.3406257629394531, - "sft_loss": 0.4884008467197418, - "step": 1540 - }, - { - "epoch": 0.16326602267291057, - "grad_norm": 1.987900972366333, - "learning_rate": 0.000493939215840314, - "logits/chosen": -5.694365978240967, - "logits/rejected": -5.694273471832275, - "logps/chosen": -0.6464301347732544, - "logps/rejected": -3.67587947845459, - "loss": 0.6811, - "odds_ratio_loss": 2.441976308822632, - "rewards/accuracies": 0.84375, - "rewards/chosen": -0.0646430179476738, - "rewards/margins": 0.3029448688030243, - "rewards/rejected": -0.36758795380592346, - "sft_loss": 0.43692201375961304, - "step": 1550 - }, - { - "epoch": 0.16431935185144547, - "grad_norm": 4.507101058959961, - "learning_rate": 0.000493736361016394, - "logits/chosen": -5.841778755187988, - "logits/rejected": -5.841654300689697, - "logps/chosen": -0.6818705797195435, - "logps/rejected": -3.587505578994751, - "loss": 0.7136, - "odds_ratio_loss": 2.5504865646362305, - "rewards/accuracies": 0.8291666507720947, - "rewards/chosen": -0.06818706542253494, - "rewards/margins": 0.29056352376937866, - "rewards/rejected": -0.3587505519390106, - "sft_loss": 0.45857658982276917, - "step": 1560 - }, - { - "epoch": 0.16537268102998037, - "grad_norm": 3.6050593852996826, - "learning_rate": 0.0004935302101047171, - "logits/chosen": -5.996950149536133, - "logits/rejected": -5.996947288513184, - "logps/chosen": -0.6442388296127319, - "logps/rejected": -3.464613437652588, - "loss": 0.673, - "odds_ratio_loss": 2.08451247215271, - "rewards/accuracies": 0.8520833253860474, - "rewards/chosen": -0.06442389637231827, - "rewards/margins": 0.2820374071598053, - "rewards/rejected": -0.34646129608154297, - "sft_loss": 0.4645636975765228, - "step": 1570 - }, - { - "epoch": 0.1664260102085153, - "grad_norm": 6.22938871383667, - "learning_rate": 0.0004933207658930968, - "logits/chosen": -6.110846996307373, - "logits/rejected": -6.1108527183532715, - "logps/chosen": -0.5905119776725769, - "logps/rejected": -3.7033638954162598, - "loss": 0.6196, - "odds_ratio_loss": 2.2230594158172607, - "rewards/accuracies": 0.8708333373069763, - "rewards/chosen": -0.05905119329690933, - "rewards/margins": 0.3112851679325104, - "rewards/rejected": -0.3703364133834839, - "sft_loss": 0.3972512185573578, - "step": 1580 - }, - { - "epoch": 0.1674793393870502, - "grad_norm": 5.692537307739258, - "learning_rate": 0.0004931080312138824, - "logits/chosen": -5.9748077392578125, - "logits/rejected": -5.974870681762695, - "logps/chosen": -0.6414510607719421, - "logps/rejected": -3.1401894092559814, - "loss": 0.6755, - "odds_ratio_loss": 2.408728837966919, - "rewards/accuracies": 0.8416666388511658, - "rewards/chosen": -0.06414511054754257, - "rewards/margins": 0.24987384676933289, - "rewards/rejected": -0.31401893496513367, - "sft_loss": 0.4346589744091034, - "step": 1590 - }, - { - "epoch": 0.1685326685655851, - "grad_norm": 4.23068380355835, - "learning_rate": 0.0004928920089439206, - "logits/chosen": -5.843720436096191, - "logits/rejected": -5.84379768371582, - "logps/chosen": -0.7081334590911865, - "logps/rejected": -3.097430467605591, - "loss": 0.7425, - "odds_ratio_loss": 2.416752338409424, - "rewards/accuracies": 0.8354166746139526, - "rewards/chosen": -0.07081333547830582, - "rewards/margins": 0.23892968893051147, - "rewards/rejected": -0.3097430169582367, - "sft_loss": 0.5008493661880493, - "step": 1600 - }, - { - "epoch": 0.16958599774412, - "grad_norm": 7.679098606109619, - "learning_rate": 0.000492672702004517, - "logits/chosen": -5.8228349685668945, - "logits/rejected": -5.822881698608398, - "logps/chosen": -0.6390455365180969, - "logps/rejected": -2.928208112716675, - "loss": 0.6714, - "odds_ratio_loss": 2.5421833992004395, - "rewards/accuracies": 0.8479166626930237, - "rewards/chosen": -0.0639045462012291, - "rewards/margins": 0.2289162576198578, - "rewards/rejected": -0.29282084107398987, - "sft_loss": 0.4171499013900757, - "step": 1610 - }, - { - "epoch": 0.1706393269226549, - "grad_norm": 2.961296558380127, - "learning_rate": 0.000492450113361396, - "logits/chosen": -5.76025915145874, - "logits/rejected": -5.760366916656494, - "logps/chosen": -0.707727313041687, - "logps/rejected": -2.787848472595215, - "loss": 0.7488, - "odds_ratio_loss": 2.6859564781188965, - "rewards/accuracies": 0.8083333373069763, - "rewards/chosen": -0.07077272981405258, - "rewards/margins": 0.20801211893558502, - "rewards/rejected": -0.278784841299057, - "sft_loss": 0.48024895787239075, - "step": 1620 - }, - { - "epoch": 0.17169265610118983, - "grad_norm": 5.1038432121276855, - "learning_rate": 0.0004922242460246613, - "logits/chosen": -5.724485397338867, - "logits/rejected": -5.724647521972656, - "logps/chosen": -0.6975029110908508, - "logps/rejected": -2.6335670948028564, - "loss": 0.7302, - "odds_ratio_loss": 2.552466869354248, - "rewards/accuracies": 0.8729166388511658, - "rewards/chosen": -0.06975029408931732, - "rewards/margins": 0.1936063915491104, - "rewards/rejected": -0.26335668563842773, - "sft_loss": 0.4749198257923126, - "step": 1630 - }, - { - "epoch": 0.17274598527972473, - "grad_norm": 4.588533401489258, - "learning_rate": 0.0004919951030487549, - "logits/chosen": -5.752465724945068, - "logits/rejected": -5.75269079208374, - "logps/chosen": -0.7377220392227173, - "logps/rejected": -2.5051889419555664, - "loss": 0.7742, - "odds_ratio_loss": 2.6720080375671387, - "rewards/accuracies": 0.8354166746139526, - "rewards/chosen": -0.07377220690250397, - "rewards/margins": 0.1767466962337494, - "rewards/rejected": -0.25051891803741455, - "sft_loss": 0.5070357918739319, - "step": 1640 - }, - { - "epoch": 0.17379931445825963, - "grad_norm": 4.702722072601318, - "learning_rate": 0.0004917626875324156, - "logits/chosen": -5.999307155609131, - "logits/rejected": -5.999597072601318, - "logps/chosen": -0.6588828563690186, - "logps/rejected": -3.105346202850342, - "loss": 0.6907, - "odds_ratio_loss": 2.3203392028808594, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -0.06588829308748245, - "rewards/margins": 0.244646355509758, - "rewards/rejected": -0.31053462624549866, - "sft_loss": 0.458629310131073, - "step": 1650 - }, - { - "epoch": 0.17485264363679454, - "grad_norm": 4.34697961807251, - "learning_rate": 0.0004915270026186377, - "logits/chosen": -6.0448760986328125, - "logits/rejected": -6.045097827911377, - "logps/chosen": -0.6159750819206238, - "logps/rejected": -3.314389705657959, - "loss": 0.6451, - "odds_ratio_loss": 2.2701919078826904, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -0.06159750744700432, - "rewards/margins": 0.2698414921760559, - "rewards/rejected": -0.33143898844718933, - "sft_loss": 0.4180779755115509, - "step": 1660 - }, - { - "epoch": 0.17590597281532946, - "grad_norm": 6.3104119300842285, - "learning_rate": 0.0004912880514946277, - "logits/chosen": -6.198366165161133, - "logits/rejected": -6.198540210723877, - "logps/chosen": -0.6512912511825562, - "logps/rejected": -3.2115368843078613, - "loss": 0.6825, - "odds_ratio_loss": 2.350752353668213, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.0651291236281395, - "rewards/margins": 0.256024569272995, - "rewards/rejected": -0.3211536705493927, - "sft_loss": 0.4474564790725708, - "step": 1670 - }, - { - "epoch": 0.17695930199386437, - "grad_norm": 4.557435989379883, - "learning_rate": 0.0004910458373917618, - "logits/chosen": -5.757941722869873, - "logits/rejected": -5.75801944732666, - "logps/chosen": -0.7286573648452759, - "logps/rejected": -3.038480043411255, - "loss": 0.7631, - "odds_ratio_loss": 2.563901662826538, - "rewards/accuracies": 0.8229166865348816, - "rewards/chosen": -0.07286573201417923, - "rewards/margins": 0.23098230361938477, - "rewards/rejected": -0.303847998380661, - "sft_loss": 0.5067596435546875, - "step": 1680 - }, - { - "epoch": 0.17801263117239927, - "grad_norm": 4.517858028411865, - "learning_rate": 0.0004908003635855421, - "logits/chosen": -5.7685866355896, - "logits/rejected": -5.768723011016846, - "logps/chosen": -0.6709701418876648, - "logps/rejected": -3.045161724090576, - "loss": 0.7057, - "odds_ratio_loss": 2.353038787841797, - "rewards/accuracies": 0.8270833492279053, - "rewards/chosen": -0.0670970231294632, - "rewards/margins": 0.23741915822029114, - "rewards/rejected": -0.30451616644859314, - "sft_loss": 0.4703839421272278, - "step": 1690 - }, - { - "epoch": 0.17906596035093417, - "grad_norm": 3.8505797386169434, - "learning_rate": 0.0004905516333955521, - "logits/chosen": -5.820653915405273, - "logits/rejected": -5.820913791656494, - "logps/chosen": -0.6014044880867004, - "logps/rejected": -2.9712061882019043, - "loss": 0.6348, - "odds_ratio_loss": 2.333472728729248, - "rewards/accuracies": 0.84375, - "rewards/chosen": -0.060140449553728104, - "rewards/margins": 0.23698018491268158, - "rewards/rejected": -0.2971206307411194, - "sft_loss": 0.40144431591033936, - "step": 1700 - }, - { - "epoch": 0.18011928952946907, - "grad_norm": 5.725916385650635, - "learning_rate": 0.0004902996501854119, - "logits/chosen": -6.354620933532715, - "logits/rejected": -6.355036735534668, - "logps/chosen": -1.4475494623184204, - "logps/rejected": -3.9082717895507812, - "loss": 1.4845, - "odds_ratio_loss": 4.034452438354492, - "rewards/accuracies": 0.8291666507720947, - "rewards/chosen": -0.14475493133068085, - "rewards/margins": 0.24607227742671967, - "rewards/rejected": -0.3908271789550781, - "sft_loss": 1.0810879468917847, - "step": 1710 - }, - { - "epoch": 0.181172618708004, - "grad_norm": 8.74376392364502, - "learning_rate": 0.0004900444173627328, - "logits/chosen": -6.625903129577637, - "logits/rejected": -6.62622594833374, - "logps/chosen": -0.7164724469184875, - "logps/rejected": -3.1040403842926025, - "loss": 0.7538, - "odds_ratio_loss": 2.710744619369507, - "rewards/accuracies": 0.8208333253860474, - "rewards/chosen": -0.07164724916219711, - "rewards/margins": 0.23875676095485687, - "rewards/rejected": -0.3104040026664734, - "sft_loss": 0.4827170670032501, - "step": 1720 - }, - { - "epoch": 0.1822259478865389, - "grad_norm": 2.9171664714813232, - "learning_rate": 0.0004897859383790711, - "logits/chosen": -6.8558220863342285, - "logits/rejected": -6.856238842010498, - "logps/chosen": -0.6772664785385132, - "logps/rejected": -3.0685346126556396, - "loss": 0.7145, - "odds_ratio_loss": 2.5943028926849365, - "rewards/accuracies": 0.8145833611488342, - "rewards/chosen": -0.06772664934396744, - "rewards/margins": 0.23912683129310608, - "rewards/rejected": -0.3068534731864929, - "sft_loss": 0.4550252854824066, - "step": 1730 - }, - { - "epoch": 0.1832792770650738, - "grad_norm": 4.223478317260742, - "learning_rate": 0.0004895242167298816, - "logits/chosen": -6.91244649887085, - "logits/rejected": -6.912972927093506, - "logps/chosen": -0.6928088665008545, - "logps/rejected": -3.333386182785034, - "loss": 0.724, - "odds_ratio_loss": 2.613675117492676, - "rewards/accuracies": 0.8291666507720947, - "rewards/chosen": -0.06928088515996933, - "rewards/margins": 0.2640577256679535, - "rewards/rejected": -0.3333386480808258, - "sft_loss": 0.4625937044620514, - "step": 1740 - }, - { - "epoch": 0.1843326062436087, - "grad_norm": 6.387345790863037, - "learning_rate": 0.0004892592559544702, - "logits/chosen": -6.475886821746826, - "logits/rejected": -6.476265907287598, - "logps/chosen": -0.689757764339447, - "logps/rejected": -2.9596078395843506, - "loss": 0.7284, - "odds_ratio_loss": 2.5536398887634277, - "rewards/accuracies": 0.8083333373069763, - "rewards/chosen": -0.0689757764339447, - "rewards/margins": 0.22698503732681274, - "rewards/rejected": -0.29596078395843506, - "sft_loss": 0.47301238775253296, - "step": 1750 - }, - { - "epoch": 0.1853859354221436, - "grad_norm": 4.752090930938721, - "learning_rate": 0.0004889910596359457, - "logits/chosen": -6.3866801261901855, - "logits/rejected": -6.3870625495910645, - "logps/chosen": -0.6425169110298157, - "logps/rejected": -3.2402262687683105, - "loss": 0.6768, - "odds_ratio_loss": 2.416498899459839, - "rewards/accuracies": 0.8583333492279053, - "rewards/chosen": -0.06425168365240097, - "rewards/margins": 0.259770929813385, - "rewards/rejected": -0.3240226209163666, - "sft_loss": 0.43517401814460754, - "step": 1760 - }, - { - "epoch": 0.18643926460067853, - "grad_norm": 4.994302272796631, - "learning_rate": 0.0004887196314011722, - "logits/chosen": -6.208808422088623, - "logits/rejected": -6.209136009216309, - "logps/chosen": -0.6876904964447021, - "logps/rejected": -3.412658929824829, - "loss": 0.7195, - "odds_ratio_loss": 2.441316843032837, - "rewards/accuracies": 0.8479166626930237, - "rewards/chosen": -0.06876904517412186, - "rewards/margins": 0.2724968492984772, - "rewards/rejected": -0.34126585721969604, - "sft_loss": 0.4754055142402649, - "step": 1770 - }, - { - "epoch": 0.18749259377921343, - "grad_norm": 6.439792156219482, - "learning_rate": 0.0004884449749207192, - "logits/chosen": -6.457438945770264, - "logits/rejected": -6.457731246948242, - "logps/chosen": -0.6336179375648499, - "logps/rejected": -2.840827703475952, - "loss": 0.6657, - "odds_ratio_loss": 2.448340892791748, - "rewards/accuracies": 0.8708333373069763, - "rewards/chosen": -0.06336179375648499, - "rewards/margins": 0.22072099149227142, - "rewards/rejected": -0.2840828001499176, - "sft_loss": 0.420904278755188, - "step": 1780 - }, - { - "epoch": 0.18854592295774833, - "grad_norm": 4.342998027801514, - "learning_rate": 0.00048816709390881266, - "logits/chosen": -6.21989631652832, - "logits/rejected": -6.220187664031982, - "logps/chosen": -0.6857010722160339, - "logps/rejected": -2.874756336212158, - "loss": 0.7182, - "odds_ratio_loss": 2.453400135040283, - "rewards/accuracies": 0.8520833253860474, - "rewards/chosen": -0.06857011467218399, - "rewards/margins": 0.218905508518219, - "rewards/rejected": -0.2874756455421448, - "sft_loss": 0.47288984060287476, - "step": 1790 - }, - { - "epoch": 0.18959925213628323, - "grad_norm": 33.773277282714844, - "learning_rate": 0.0004878859921232839, - "logits/chosen": -5.917886257171631, - "logits/rejected": -5.9181623458862305, - "logps/chosen": -0.7129290103912354, - "logps/rejected": -2.9589409828186035, - "loss": 0.7486, - "odds_ratio_loss": 2.4278337955474854, - "rewards/accuracies": 0.8166666626930237, - "rewards/chosen": -0.07129290699958801, - "rewards/margins": 0.22460119426250458, - "rewards/rejected": -0.2958941161632538, - "sft_loss": 0.5058320760726929, - "step": 1800 - }, - { - "epoch": 0.19065258131481816, - "grad_norm": 4.040477275848389, - "learning_rate": 0.00048760167336551964, - "logits/chosen": -5.841413974761963, - "logits/rejected": -5.8417158126831055, - "logps/chosen": -0.6335561275482178, - "logps/rejected": -3.0441653728485107, - "loss": 0.6684, - "odds_ratio_loss": 2.3195104598999023, - "rewards/accuracies": 0.8354166746139526, - "rewards/chosen": -0.06335561722517014, - "rewards/margins": 0.24106094241142273, - "rewards/rejected": -0.30441656708717346, - "sft_loss": 0.4364630877971649, - "step": 1810 - }, - { - "epoch": 0.19170591049335306, - "grad_norm": 6.749710559844971, - "learning_rate": 0.0004873141414804103, - "logits/chosen": -5.7162394523620605, - "logits/rejected": -5.716516017913818, - "logps/chosen": -0.6518925428390503, - "logps/rejected": -3.0878660678863525, - "loss": 0.6886, - "odds_ratio_loss": 2.5920615196228027, - "rewards/accuracies": 0.8458333611488342, - "rewards/chosen": -0.06518926471471786, - "rewards/margins": 0.2435973733663559, - "rewards/rejected": -0.30878666043281555, - "sft_loss": 0.4293573498725891, - "step": 1820 - }, - { - "epoch": 0.19275923967188796, - "grad_norm": 2.045081615447998, - "learning_rate": 0.00048702340035629787, - "logits/chosen": -5.856993198394775, - "logits/rejected": -5.857146263122559, - "logps/chosen": -0.6080865263938904, - "logps/rejected": -2.7589311599731445, - "loss": 0.6369, - "odds_ratio_loss": 2.007796287536621, - "rewards/accuracies": 0.8666666746139526, - "rewards/chosen": -0.06080865487456322, - "rewards/margins": 0.21508444845676422, - "rewards/rejected": -0.27589309215545654, - "sft_loss": 0.43609535694122314, - "step": 1830 - }, - { - "epoch": 0.19381256885042286, - "grad_norm": 4.158146381378174, - "learning_rate": 0.0004867294539249234, - "logits/chosen": -6.230529308319092, - "logits/rejected": -6.230895519256592, - "logps/chosen": -0.6979438066482544, - "logps/rejected": -3.5587799549102783, - "loss": 0.7295, - "odds_ratio_loss": 2.502671718597412, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -0.06979438662528992, - "rewards/margins": 0.2860836088657379, - "rewards/rejected": -0.35587799549102783, - "sft_loss": 0.4792328178882599, - "step": 1840 - }, - { - "epoch": 0.19486589802895776, - "grad_norm": 2.7599072456359863, - "learning_rate": 0.0004864323061613738, - "logits/chosen": -6.244935512542725, - "logits/rejected": -6.245189189910889, - "logps/chosen": -0.6155544519424438, - "logps/rejected": -3.0617711544036865, - "loss": 0.6473, - "odds_ratio_loss": 2.4084250926971436, - "rewards/accuracies": 0.84375, - "rewards/chosen": -0.06155544891953468, - "rewards/margins": 0.2446216493844986, - "rewards/rejected": -0.3061771094799042, - "sft_loss": 0.4064619243144989, - "step": 1850 - }, - { - "epoch": 0.1959192272074927, - "grad_norm": 4.056497573852539, - "learning_rate": 0.0004861319610840282, - "logits/chosen": -5.854410648345947, - "logits/rejected": -5.8545074462890625, - "logps/chosen": -0.7075474262237549, - "logps/rejected": -3.4564712047576904, - "loss": 0.7458, - "odds_ratio_loss": 2.5600531101226807, - "rewards/accuracies": 0.8333333134651184, - "rewards/chosen": -0.07075474411249161, - "rewards/margins": 0.2748924195766449, - "rewards/rejected": -0.3456471860408783, - "sft_loss": 0.4897785782814026, - "step": 1860 - }, - { - "epoch": 0.1969725563860276, - "grad_norm": 7.0494489669799805, - "learning_rate": 0.00048582842275450366, - "logits/chosen": -5.870307922363281, - "logits/rejected": -5.870253086090088, - "logps/chosen": -0.6499666571617126, - "logps/rejected": -3.4302499294281006, - "loss": 0.6847, - "odds_ratio_loss": 2.5706870555877686, - "rewards/accuracies": 0.8520833253860474, - "rewards/chosen": -0.06499668210744858, - "rewards/margins": 0.27802836894989014, - "rewards/rejected": -0.3430250287055969, - "sft_loss": 0.4276408553123474, - "step": 1870 - }, - { - "epoch": 0.1980258855645625, - "grad_norm": 9.770491600036621, - "learning_rate": 0.0004855216952775999, - "logits/chosen": -6.05530309677124, - "logits/rejected": -6.05518102645874, - "logps/chosen": -0.6710807681083679, - "logps/rejected": -3.7501718997955322, - "loss": 0.7031, - "odds_ratio_loss": 2.5512893199920654, - "rewards/accuracies": 0.8583333492279053, - "rewards/chosen": -0.06710807234048843, - "rewards/margins": 0.3079090714454651, - "rewards/rejected": -0.3750171661376953, - "sft_loss": 0.4479447305202484, - "step": 1880 - }, - { - "epoch": 0.1990792147430974, - "grad_norm": 46.68679428100586, - "learning_rate": 0.0004852117828012441, - "logits/chosen": -6.125611782073975, - "logits/rejected": -6.125678539276123, - "logps/chosen": -0.8755971789360046, - "logps/rejected": -4.0310163497924805, - "loss": 0.9086, - "odds_ratio_loss": 3.206876516342163, - "rewards/accuracies": 0.8416666388511658, - "rewards/chosen": -0.08755972236394882, - "rewards/margins": 0.31554192304611206, - "rewards/rejected": -0.4031016528606415, - "sft_loss": 0.5878926515579224, - "step": 1890 - }, - { - "epoch": 0.20013254392163232, - "grad_norm": 7.289094924926758, - "learning_rate": 0.00048489868951643477, - "logits/chosen": -6.526234149932861, - "logits/rejected": -6.526541233062744, - "logps/chosen": -1.0017836093902588, - "logps/rejected": -3.509293556213379, - "loss": 1.0477, - "odds_ratio_loss": 3.5900070667266846, - "rewards/accuracies": 0.8270833492279053, - "rewards/chosen": -0.10017836093902588, - "rewards/margins": 0.2507510483264923, - "rewards/rejected": -0.3509294092655182, - "sft_loss": 0.6887442469596863, - "step": 1900 + "rewards/chosen": -0.07357922196388245, + "rewards/margins": 0.29457923769950867, + "rewards/rejected": -0.3681584596633911, + "step": 81 + }, + { + "epoch": 0.02303279803729663, + "grad_norm": 1.5625, + "learning_rate": 1.151685393258427e-05, + "logits/chosen": -7.0821213722229, + "logits/rejected": -7.082340240478516, + "logps/chosen": -0.628500759601593, + "logps/rejected": -3.7704579830169678, + "loss": 0.8099, + "rewards/accuracies": 0.890625, + "rewards/chosen": -0.06285007297992706, + "rewards/margins": 0.31419575214385986, + "rewards/rejected": -0.37704581022262573, + "step": 82 + }, + { + "epoch": 0.023313685818239272, + "grad_norm": 0.9765625, + "learning_rate": 1.1657303370786517e-05, + "logits/chosen": -7.087411880493164, + "logits/rejected": -7.0875701904296875, + "logps/chosen": -0.7774583101272583, + "logps/rejected": -3.098719358444214, + "loss": 0.8513, + "rewards/accuracies": 0.7890625, + "rewards/chosen": -0.07774583995342255, + "rewards/margins": 0.2321261167526245, + "rewards/rejected": -0.3098719120025635, + "step": 83 + }, + { + "epoch": 0.023594573599181913, + "grad_norm": 0.96484375, + "learning_rate": 1.1797752808988765e-05, + "logits/chosen": -7.087563991546631, + "logits/rejected": -7.087766647338867, + "logps/chosen": -0.5942208766937256, + "logps/rejected": -3.334862232208252, + "loss": 0.8292, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0594220906496048, + "rewards/margins": 0.2740641236305237, + "rewards/rejected": -0.3334861993789673, + "step": 84 + }, + { + "epoch": 0.023875461380124557, + "grad_norm": 1.21875, + "learning_rate": 1.1938202247191012e-05, + "logits/chosen": -7.094630241394043, + "logits/rejected": -7.094793319702148, + "logps/chosen": -0.6228974461555481, + "logps/rejected": -4.061489105224609, + "loss": 0.7951, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06228974461555481, + "rewards/margins": 0.34385910630226135, + "rewards/rejected": -0.40614888072013855, + "step": 85 + }, + { + "epoch": 0.0241563491610672, + "grad_norm": 1.6484375, + "learning_rate": 1.207865168539326e-05, + "logits/chosen": -7.10193395614624, + "logits/rejected": -7.1020917892456055, + "logps/chosen": -0.6275684237480164, + "logps/rejected": -3.7310783863067627, + "loss": 0.8106, + "rewards/accuracies": 0.8984375, + "rewards/chosen": -0.06275683641433716, + "rewards/margins": 0.31035101413726807, + "rewards/rejected": -0.3731078505516052, + "step": 86 + }, + { + "epoch": 0.02443723694200984, + "grad_norm": 1.1171875, + "learning_rate": 1.2219101123595506e-05, + "logits/chosen": -7.102446556091309, + "logits/rejected": -7.102678298950195, + "logps/chosen": -0.6499467492103577, + "logps/rejected": -3.5946946144104004, + "loss": 0.8177, + "rewards/accuracies": 0.8828125, + "rewards/chosen": -0.064994677901268, + "rewards/margins": 0.2944748103618622, + "rewards/rejected": -0.3594695031642914, + "step": 87 + }, + { + "epoch": 0.02471812472295248, + "grad_norm": 1.4921875, + "learning_rate": 1.2359550561797752e-05, + "logits/chosen": -7.104351997375488, + "logits/rejected": -7.104528427124023, + "logps/chosen": -0.6288101077079773, + "logps/rejected": -3.6652326583862305, + "loss": 0.814, + "rewards/accuracies": 0.8671875, + "rewards/chosen": -0.0628810003399849, + "rewards/margins": 0.303642213344574, + "rewards/rejected": -0.36652323603630066, + "step": 88 + }, + { + "epoch": 0.024999012503895125, + "grad_norm": 2.78125, + "learning_rate": 1.25e-05, + "logits/chosen": -7.110116004943848, + "logits/rejected": -7.110256671905518, + "logps/chosen": -0.6746590733528137, + "logps/rejected": -3.6932754516601562, + "loss": 0.8167, + "rewards/accuracies": 0.8515625, + "rewards/chosen": -0.06746591627597809, + "rewards/margins": 0.30186164379119873, + "rewards/rejected": -0.3693275451660156, + "step": 89 + }, + { + "epoch": 0.025279900284837765, + "grad_norm": 1.4140625, + "learning_rate": 1.2640449438202249e-05, + "logits/chosen": -7.113439559936523, + "logits/rejected": -7.113628387451172, + "logps/chosen": -0.6546343564987183, + "logps/rejected": -3.9948794841766357, + "loss": 0.7989, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06546343863010406, + "rewards/margins": 0.3340245485305786, + "rewards/rejected": -0.3994879722595215, + "step": 90 + }, + { + "epoch": 0.025560788065780406, + "grad_norm": 1.1171875, + "learning_rate": 1.2780898876404495e-05, + "logits/chosen": -7.119630336761475, + "logits/rejected": -7.119820594787598, + "logps/chosen": -0.5485891103744507, + "logps/rejected": -3.8539154529571533, + "loss": 0.8, + "rewards/accuracies": 0.890625, + "rewards/chosen": -0.054858915507793427, + "rewards/margins": 0.33053261041641235, + "rewards/rejected": -0.38539156317710876, + "step": 91 + }, + { + "epoch": 0.02584167584672305, + "grad_norm": 1.84375, + "learning_rate": 1.2921348314606743e-05, + "logits/chosen": -7.124433994293213, + "logits/rejected": -7.124541282653809, + "logps/chosen": -0.5675304532051086, + "logps/rejected": -3.8615882396698, + "loss": 0.8041, + "rewards/accuracies": 0.8828125, + "rewards/chosen": -0.056753043085336685, + "rewards/margins": 0.3294057846069336, + "rewards/rejected": -0.38615882396698, + "step": 92 + }, + { + "epoch": 0.02612256362766569, + "grad_norm": 1.1875, + "learning_rate": 1.3061797752808991e-05, + "logits/chosen": -7.122183799743652, + "logits/rejected": -7.122403621673584, + "logps/chosen": -0.6670414805412292, + "logps/rejected": -4.148705959320068, + "loss": 0.7911, + "rewards/accuracies": 0.8984375, + "rewards/chosen": -0.0667041540145874, + "rewards/margins": 0.34816643595695496, + "rewards/rejected": -0.41487061977386475, + "step": 93 + }, + { + "epoch": 0.026403451408608333, + "grad_norm": 4.0625, + "learning_rate": 1.3202247191011236e-05, + "logits/chosen": -7.128363609313965, + "logits/rejected": -7.128509998321533, + "logps/chosen": -0.6008510589599609, + "logps/rejected": -4.413697719573975, + "loss": 0.7794, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06008511036634445, + "rewards/margins": 0.3812846839427948, + "rewards/rejected": -0.44136980175971985, + "step": 94 + }, + { + "epoch": 0.026684339189550974, + "grad_norm": 2.171875, + "learning_rate": 1.3342696629213482e-05, + "logits/chosen": -7.137217044830322, + "logits/rejected": -7.137389183044434, + "logps/chosen": -0.6029202938079834, + "logps/rejected": -3.891530990600586, + "loss": 0.801, + "rewards/accuracies": 0.8828125, + "rewards/chosen": -0.06029203534126282, + "rewards/margins": 0.3288610577583313, + "rewards/rejected": -0.3891531229019165, + "step": 95 + }, + { + "epoch": 0.026965226970493618, + "grad_norm": 1.921875, + "learning_rate": 1.348314606741573e-05, + "logits/chosen": -7.1387410163879395, + "logits/rejected": -7.1389546394348145, + "logps/chosen": -0.6319260597229004, + "logps/rejected": -4.000391006469727, + "loss": 0.7998, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06319259852170944, + "rewards/margins": 0.3368465006351471, + "rewards/rejected": -0.40003910660743713, + "step": 96 + }, + { + "epoch": 0.02724611475143626, + "grad_norm": 1.0078125, + "learning_rate": 1.3623595505617979e-05, + "logits/chosen": -7.144716739654541, + "logits/rejected": -7.144957065582275, + "logps/chosen": -0.5787003040313721, + "logps/rejected": -3.752794027328491, + "loss": 0.8073, + "rewards/accuracies": 0.8984375, + "rewards/chosen": -0.05787003040313721, + "rewards/margins": 0.31740936636924744, + "rewards/rejected": -0.37527936697006226, + "step": 97 + }, + { + "epoch": 0.0275270025323789, + "grad_norm": 1.1875, + "learning_rate": 1.3764044943820225e-05, + "logits/chosen": -7.144606113433838, + "logits/rejected": -7.144704818725586, + "logps/chosen": -0.5937943458557129, + "logps/rejected": -3.984668493270874, + "loss": 0.7967, + "rewards/accuracies": 0.890625, + "rewards/chosen": -0.05937943235039711, + "rewards/margins": 0.33908742666244507, + "rewards/rejected": -0.3984668552875519, + "step": 98 + }, + { + "epoch": 0.02780789031332154, + "grad_norm": 1.1875, + "learning_rate": 1.3904494382022473e-05, + "logits/chosen": -7.152622222900391, + "logits/rejected": -7.152801990509033, + "logps/chosen": -0.5818836688995361, + "logps/rejected": -3.607451915740967, + "loss": 0.8172, + "rewards/accuracies": 0.8828125, + "rewards/chosen": -0.05818836763501167, + "rewards/margins": 0.3025568425655365, + "rewards/rejected": -0.3607451915740967, + "step": 99 + }, + { + "epoch": 0.028088778094264185, + "grad_norm": 1.0625, + "learning_rate": 1.4044943820224721e-05, + "logits/chosen": -7.15784215927124, + "logits/rejected": -7.1580023765563965, + "logps/chosen": -0.6804911494255066, + "logps/rejected": -4.50457239151001, + "loss": 0.7831, + "rewards/accuracies": 0.8515625, + "rewards/chosen": -0.0680491179227829, + "rewards/margins": 0.38240814208984375, + "rewards/rejected": -0.45045721530914307, + "step": 100 } ], - "logging_steps": 10, - "max_steps": 9493, + "logging_steps": 1, + "max_steps": 3560, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, @@ -3256,7 +1526,7 @@ "attributes": {} } }, - "total_flos": 1.4024826693511741e+18, + "total_flos": 1.966690847510692e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null