diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,17 +1,17 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 2.998430141287284, + "epoch": 0.9994767137624281, "eval_steps": 100, - "global_step": 2865, + "global_step": 955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010465724751439038, - "grad_norm": 127.35001565973752, - "learning_rate": 1.7421602787456446e-09, + "grad_norm": 127.36319300622706, + "learning_rate": 5.208333333333333e-09, "logits/chosen": -2.9235777854919434, "logits/rejected": -2.7912116050720215, "logps/chosen": -380.68548583984375, @@ -25,4756 +25,1587 @@ }, { "epoch": 0.010465724751439037, - "grad_norm": 108.97762663697972, - "learning_rate": 1.7421602787456446e-08, - "logits/chosen": -2.59572696685791, - "logits/rejected": -2.569275379180908, - "logps/chosen": -256.58428955078125, - "logps/rejected": -234.87478637695312, - "loss": 0.6943, + "grad_norm": 107.9740860657244, + "learning_rate": 5.208333333333333e-08, + "logits/chosen": -2.595984935760498, + "logits/rejected": -2.56972074508667, + "logps/chosen": -256.59259033203125, + "logps/rejected": -234.86553955078125, + "loss": 0.6936, "rewards/accuracies": 0.4513888955116272, - "rewards/chosen": 0.0009517045691609383, - "rewards/margins": 0.0016916098538786173, - "rewards/rejected": -0.0007399055757559836, + "rewards/chosen": 0.00012609982513822615, + "rewards/margins": -5.737816172768362e-05, + "rewards/rejected": 0.00018347776494920254, "step": 10 }, { "epoch": 0.020931449502878074, - "grad_norm": 127.24164727471926, - "learning_rate": 3.484320557491289e-08, - "logits/chosen": -2.6139628887176514, - "logits/rejected": -2.5763537883758545, - "logps/chosen": -283.115478515625, - "logps/rejected": -282.2696838378906, - "loss": 0.6911, - "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": -0.0010749094653874636, - "rewards/margins": -0.000987284118309617, - "rewards/rejected": -8.762532524997368e-05, + "grad_norm": 123.40563647833262, + "learning_rate": 1.0416666666666667e-07, + "logits/chosen": -2.6136789321899414, + "logits/rejected": -2.5759799480438232, + "logps/chosen": -283.09326171875, + "logps/rejected": -282.3014221191406, + "loss": 0.6899, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.0011462386464700103, + "rewards/margins": 0.004408253822475672, + "rewards/rejected": -0.0032620157580822706, "step": 20 }, { "epoch": 0.03139717425431711, - "grad_norm": 121.77845448939686, - "learning_rate": 5.2264808362369334e-08, - "logits/chosen": -2.693781614303589, - "logits/rejected": -2.6697208881378174, - "logps/chosen": -270.2797546386719, - "logps/rejected": -276.697509765625, - "loss": 0.6928, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": 0.002786443568766117, - "rewards/margins": 0.00608491338789463, - "rewards/rejected": -0.0032984702847898006, + "grad_norm": 121.3818362170214, + "learning_rate": 1.5624999999999999e-07, + "logits/chosen": -2.6919732093811035, + "logits/rejected": -2.667656660079956, + "logps/chosen": -270.1833801269531, + "logps/rejected": -276.7445068359375, + "loss": 0.683, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.01242474652826786, + "rewards/margins": 0.02042328752577305, + "rewards/rejected": -0.007998539134860039, "step": 30 }, { "epoch": 0.04186289900575615, - "grad_norm": 118.52470749345107, - "learning_rate": 6.968641114982578e-08, - "logits/chosen": -2.6702141761779785, - "logits/rejected": -2.5954573154449463, - "logps/chosen": -290.90484619140625, - "logps/rejected": -282.1049499511719, - "loss": 0.6889, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": 0.007351614534854889, - "rewards/margins": 0.01041644997894764, - "rewards/rejected": -0.003064836375415325, + "grad_norm": 115.66860622860622, + "learning_rate": 2.0833333333333333e-07, + "logits/chosen": -2.665289878845215, + "logits/rejected": -2.5902962684631348, + "logps/chosen": -290.4902648925781, + "logps/rejected": -282.1328125, + "loss": 0.6686, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.04880904406309128, + "rewards/margins": 0.05465905740857124, + "rewards/rejected": -0.005850008223205805, "step": 40 }, { "epoch": 0.052328623757195186, - "grad_norm": 134.5179695335488, - "learning_rate": 8.710801393728223e-08, - "logits/chosen": -2.6866862773895264, - "logits/rejected": -2.603309154510498, - "logps/chosen": -267.50653076171875, - "logps/rejected": -236.64370727539062, - "loss": 0.6841, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.015602147206664085, - "rewards/margins": 0.018949907273054123, - "rewards/rejected": -0.0033477633260190487, + "grad_norm": 113.05355403051037, + "learning_rate": 2.604166666666667e-07, + "logits/chosen": -2.672785997390747, + "logits/rejected": -2.589231014251709, + "logps/chosen": -266.2040100097656, + "logps/rejected": -236.48739624023438, + "loss": 0.641, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.14585763216018677, + "rewards/margins": 0.13357332348823547, + "rewards/rejected": 0.012284321710467339, "step": 50 }, { "epoch": 0.06279434850863422, - "grad_norm": 114.25724612835627, - "learning_rate": 1.0452961672473867e-07, - "logits/chosen": -2.6629409790039062, - "logits/rejected": -2.62890887260437, - "logps/chosen": -303.4552307128906, - "logps/rejected": -275.6412658691406, - "loss": 0.6733, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": 0.05119643360376358, - "rewards/margins": 0.0428791344165802, - "rewards/rejected": 0.00831730104982853, + "grad_norm": 106.6001336411059, + "learning_rate": 3.1249999999999997e-07, + "logits/chosen": -2.6317365169525146, + "logits/rejected": -2.596348285675049, + "logps/chosen": -300.1541442871094, + "logps/rejected": -274.40191650390625, + "loss": 0.6134, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.3813043236732483, + "rewards/margins": 0.24904949963092804, + "rewards/rejected": 0.13225486874580383, "step": 60 }, { "epoch": 0.07326007326007326, - "grad_norm": 107.20903578910969, - "learning_rate": 1.219512195121951e-07, - "logits/chosen": -2.5873584747314453, - "logits/rejected": -2.5813212394714355, - "logps/chosen": -258.8870849609375, - "logps/rejected": -258.4106140136719, - "loss": 0.6633, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": 0.07917213439941406, - "rewards/margins": 0.09942146390676498, - "rewards/rejected": -0.020249325782060623, + "grad_norm": 105.44232045239558, + "learning_rate": 3.645833333333333e-07, + "logits/chosen": -2.5406157970428467, + "logits/rejected": -2.5337507724761963, + "logps/chosen": -256.82513427734375, + "logps/rejected": -259.8224792480469, + "loss": 0.5958, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.2853667438030243, + "rewards/margins": 0.44680237770080566, + "rewards/rejected": -0.16143563389778137, "step": 70 }, { "epoch": 0.0837257980115123, - "grad_norm": 109.13637260374338, - "learning_rate": 1.3937282229965157e-07, - "logits/chosen": -2.6257143020629883, - "logits/rejected": -2.5616321563720703, - "logps/chosen": -274.7749938964844, - "logps/rejected": -251.4767303466797, - "loss": 0.6425, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.12770769000053406, - "rewards/margins": 0.1433565765619278, - "rewards/rejected": -0.01564888283610344, + "grad_norm": 118.11259244631633, + "learning_rate": 4.1666666666666667e-07, + "logits/chosen": -2.583888292312622, + "logits/rejected": -2.516160249710083, + "logps/chosen": -271.23541259765625, + "logps/rejected": -252.84814453125, + "loss": 0.5843, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": 0.4816661775112152, + "rewards/margins": 0.6344618797302246, + "rewards/rejected": -0.15279565751552582, "step": 80 }, { "epoch": 0.09419152276295134, - "grad_norm": 100.81061745977243, - "learning_rate": 1.56794425087108e-07, - "logits/chosen": -2.598877429962158, - "logits/rejected": -2.5678374767303467, - "logps/chosen": -253.87411499023438, - "logps/rejected": -263.8825378417969, - "loss": 0.6372, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.17327770590782166, - "rewards/margins": 0.09231527149677277, - "rewards/rejected": 0.08096243441104889, + "grad_norm": 90.59677124687741, + "learning_rate": 4.6874999999999996e-07, + "logits/chosen": -2.549140691757202, + "logits/rejected": -2.518716335296631, + "logps/chosen": -249.97048950195312, + "logps/rejected": -263.16864013671875, + "loss": 0.5806, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.5636419057846069, + "rewards/margins": 0.4112882614135742, + "rewards/rejected": 0.1523536890745163, "step": 90 }, { "epoch": 0.10465724751439037, - "grad_norm": 93.89925441673365, - "learning_rate": 1.7421602787456445e-07, - "logits/chosen": -2.597304582595825, - "logits/rejected": -2.532872200012207, - "logps/chosen": -296.5617370605469, - "logps/rejected": -279.34783935546875, - "loss": 0.6044, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": 0.3293375074863434, - "rewards/margins": 0.26606154441833496, - "rewards/rejected": 0.06327597796916962, + "grad_norm": 89.93368824706333, + "learning_rate": 4.999732492681437e-07, + "logits/chosen": -2.5437731742858887, + "logits/rejected": -2.4748952388763428, + "logps/chosen": -290.6165771484375, + "logps/rejected": -277.58087158203125, + "loss": 0.5446, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.9238502383232117, + "rewards/margins": 0.6838744878768921, + "rewards/rejected": 0.23997588455677032, "step": 100 }, { "epoch": 0.10465724751439037, - "eval_logits/chosen": -2.6369240283966064, - "eval_logits/rejected": -2.5834383964538574, - "eval_logps/chosen": -278.35797119140625, - "eval_logps/rejected": -259.34893798828125, - "eval_loss": 0.6128642559051514, - "eval_rewards/accuracies": 0.7301587462425232, - "eval_rewards/chosen": 0.35960179567337036, - "eval_rewards/margins": 0.2726038694381714, - "eval_rewards/rejected": 0.08699791878461838, - "eval_runtime": 195.7508, - "eval_samples_per_second": 10.217, - "eval_steps_per_second": 0.322, + "eval_logits/chosen": -2.5742745399475098, + "eval_logits/rejected": -2.516094923019409, + "eval_logps/chosen": -271.84344482421875, + "eval_logps/rejected": -256.6897888183594, + "eval_loss": 0.5752763748168945, + "eval_rewards/accuracies": 0.7242063283920288, + "eval_rewards/chosen": 1.0110565423965454, + "eval_rewards/margins": 0.6581434011459351, + "eval_rewards/rejected": 0.35291311144828796, + "eval_runtime": 206.982, + "eval_samples_per_second": 9.663, + "eval_steps_per_second": 0.304, "step": 100 }, { "epoch": 0.1151229722658294, - "grad_norm": 98.52534823895529, - "learning_rate": 1.916376306620209e-07, - "logits/chosen": -2.646341323852539, - "logits/rejected": -2.599412202835083, - "logps/chosen": -238.52932739257812, - "logps/rejected": -236.8010711669922, - "loss": 0.6152, - "rewards/accuracies": 0.65625, - "rewards/chosen": 0.29135388135910034, - "rewards/margins": 0.24636240303516388, - "rewards/rejected": 0.04499145597219467, + "grad_norm": 91.21630112817637, + "learning_rate": 4.996723692767926e-07, + "logits/chosen": -2.5750699043273926, + "logits/rejected": -2.528003215789795, + "logps/chosen": -234.3852081298828, + "logps/rejected": -236.4423370361328, + "loss": 0.5648, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.705768346786499, + "rewards/margins": 0.6249028444290161, + "rewards/rejected": 0.08086539804935455, "step": 110 }, { "epoch": 0.12558869701726844, - "grad_norm": 105.21775376810443, - "learning_rate": 2.0905923344947734e-07, - "logits/chosen": -2.598649024963379, - "logits/rejected": -2.5564463138580322, - "logps/chosen": -258.513427734375, - "logps/rejected": -287.64447021484375, - "loss": 0.5889, - "rewards/accuracies": 0.71875, - "rewards/chosen": 0.4047703742980957, - "rewards/margins": 0.32032063603401184, - "rewards/rejected": 0.08444973081350327, + "grad_norm": 106.85996872671201, + "learning_rate": 4.990375746213598e-07, + "logits/chosen": -2.524847984313965, + "logits/rejected": -2.4740397930145264, + "logps/chosen": -258.1869201660156, + "logps/rejected": -292.03643798828125, + "loss": 0.5378, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.43742185831069946, + "rewards/margins": 0.7921646237373352, + "rewards/rejected": -0.3547428548336029, "step": 120 }, { "epoch": 0.1360544217687075, - "grad_norm": 104.25642251564453, - "learning_rate": 2.264808362369338e-07, - "logits/chosen": -2.5888803005218506, - "logits/rejected": -2.5576508045196533, - "logps/chosen": -278.2981262207031, - "logps/rejected": -256.62762451171875, - "loss": 0.5752, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.2916451096534729, - "rewards/margins": 0.5158523917198181, - "rewards/rejected": -0.22420723736286163, + "grad_norm": 113.00637141782764, + "learning_rate": 4.980697142834314e-07, + "logits/chosen": -2.520475387573242, + "logits/rejected": -2.4845001697540283, + "logps/chosen": -278.5224304199219, + "logps/rejected": -261.1920166015625, + "loss": 0.5472, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": 0.26921743154525757, + "rewards/margins": 0.9498673677444458, + "rewards/rejected": -0.6806498765945435, "step": 130 }, { "epoch": 0.14652014652014653, - "grad_norm": 96.76913531891151, - "learning_rate": 2.439024390243902e-07, - "logits/chosen": -2.5127837657928467, - "logits/rejected": -2.4813475608825684, - "logps/chosen": -288.41888427734375, - "logps/rejected": -291.90740966796875, - "loss": 0.5622, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.26953125, - "rewards/margins": 0.5665478706359863, - "rewards/rejected": -0.2970166504383087, + "grad_norm": 98.25615957326066, + "learning_rate": 4.967700826904229e-07, + "logits/chosen": -2.4557089805603027, + "logits/rejected": -2.4235308170318604, + "logps/chosen": -284.69012451171875, + "logps/rejected": -292.3227233886719, + "loss": 0.5192, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.6424055099487305, + "rewards/margins": 0.9809519648551941, + "rewards/rejected": -0.3385465741157532, "step": 140 }, { "epoch": 0.15698587127158556, - "grad_norm": 85.05713660390295, - "learning_rate": 2.613240418118467e-07, - "logits/chosen": -2.638923168182373, - "logits/rejected": -2.586540699005127, - "logps/chosen": -284.3011474609375, - "logps/rejected": -248.72805786132812, - "loss": 0.5763, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": 0.3531322777271271, - "rewards/margins": 0.5276108384132385, - "rewards/rejected": -0.17447853088378906, + "grad_norm": 99.56093992482657, + "learning_rate": 4.951404179843962e-07, + "logits/chosen": -2.601367473602295, + "logits/rejected": -2.5492353439331055, + "logps/chosen": -286.1108093261719, + "logps/rejected": -252.67074584960938, + "loss": 0.5511, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.17216625809669495, + "rewards/margins": 0.7409141659736633, + "rewards/rejected": -0.568747878074646, "step": 150 }, { "epoch": 0.1674515960230246, - "grad_norm": 114.10789477230233, - "learning_rate": 2.7874564459930313e-07, - "logits/chosen": -2.589647054672241, - "logits/rejected": -2.503873348236084, - "logps/chosen": -291.9870910644531, - "logps/rejected": -258.3062438964844, - "loss": 0.5495, + "grad_norm": 115.6693954933169, + "learning_rate": 4.931828996974498e-07, + "logits/chosen": -2.568312883377075, + "logits/rejected": -2.4805312156677246, + "logps/chosen": -291.2640380859375, + "logps/rejected": -259.77508544921875, + "loss": 0.5535, "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": 0.383622944355011, - "rewards/margins": 0.6132623553276062, - "rewards/rejected": -0.22963948547840118, + "rewards/chosen": 0.4559265077114105, + "rewards/margins": 0.8324508666992188, + "rewards/rejected": -0.3765243887901306, "step": 160 }, { "epoch": 0.17791732077446362, - "grad_norm": 103.11401162930947, - "learning_rate": 2.961672473867596e-07, - "logits/chosen": -2.5706489086151123, - "logits/rejected": -2.488593578338623, - "logps/chosen": -275.73333740234375, - "logps/rejected": -245.1799774169922, - "loss": 0.5418, - "rewards/accuracies": 0.71875, - "rewards/chosen": 0.2719510495662689, - "rewards/margins": 0.7100328207015991, - "rewards/rejected": -0.4380817413330078, + "grad_norm": 108.75023514338689, + "learning_rate": 4.909001458367866e-07, + "logits/chosen": -2.556863784790039, + "logits/rejected": -2.4760687351226807, + "logps/chosen": -274.7050476074219, + "logps/rejected": -246.76708984375, + "loss": 0.5534, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.37477999925613403, + "rewards/margins": 0.9715734720230103, + "rewards/rejected": -0.5967934727668762, "step": 170 }, { "epoch": 0.18838304552590268, - "grad_norm": 100.15036421868844, - "learning_rate": 3.13588850174216e-07, - "logits/chosen": -2.56353497505188, - "logits/rejected": -2.52207612991333, - "logps/chosen": -240.8281707763672, - "logps/rejected": -254.623291015625, - "loss": 0.5309, + "grad_norm": 99.91541524541664, + "learning_rate": 4.882952093833627e-07, + "logits/chosen": -2.543196201324463, + "logits/rejected": -2.501505136489868, + "logps/chosen": -242.51748657226562, + "logps/rejected": -259.04241943359375, + "loss": 0.5136, "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.09020280838012695, - "rewards/margins": 0.6936953663825989, - "rewards/rejected": -0.7838981747627258, + "rewards/chosen": -0.2591323256492615, + "rewards/margins": 0.9666780233383179, + "rewards/rejected": -1.2258104085922241, "step": 180 }, { "epoch": 0.1988487702773417, - "grad_norm": 99.03103065699287, - "learning_rate": 3.3101045296167245e-07, - "logits/chosen": -2.4599368572235107, - "logits/rejected": -2.4445507526397705, - "logps/chosen": -236.59841918945312, - "logps/rejected": -257.88421630859375, - "loss": 0.5291, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.0578870065510273, - "rewards/margins": 0.7402381896972656, - "rewards/rejected": -0.798125147819519, + "grad_norm": 115.88419375588242, + "learning_rate": 4.853715742087946e-07, + "logits/chosen": -2.4405717849731445, + "logits/rejected": -2.4257616996765137, + "logps/chosen": -240.6494140625, + "logps/rejected": -264.52215576171875, + "loss": 0.5006, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.46298712491989136, + "rewards/margins": 0.9989321827888489, + "rewards/rejected": -1.4619193077087402, "step": 190 }, { "epoch": 0.20931449502878074, - "grad_norm": 104.67870733462854, - "learning_rate": 3.484320557491289e-07, - "logits/chosen": -2.5791962146759033, - "logits/rejected": -2.499197483062744, - "logps/chosen": -307.86517333984375, - "logps/rejected": -262.0858459472656, - "loss": 0.57, + "grad_norm": 105.94881559115358, + "learning_rate": 4.821331504159906e-07, + "logits/chosen": -2.5927906036376953, + "logits/rejected": -2.517674684524536, + "logps/chosen": -309.7959289550781, + "logps/rejected": -265.2268371582031, + "loss": 0.5475, "rewards/accuracies": 0.731249988079071, - "rewards/chosen": 0.27781373262405396, - "rewards/margins": 0.7760672569274902, - "rewards/rejected": -0.4982534348964691, + "rewards/chosen": 0.08473672717809677, + "rewards/margins": 0.8970876932144165, + "rewards/rejected": -0.812350869178772, "step": 200 }, { "epoch": 0.20931449502878074, - "eval_logits/chosen": -2.546539306640625, - "eval_logits/rejected": -2.4866795539855957, - "eval_logps/chosen": -276.03204345703125, - "eval_logps/rejected": -261.8945007324219, - "eval_loss": 0.5571414828300476, - "eval_rewards/accuracies": 0.7539682388305664, - "eval_rewards/chosen": 0.5921932458877563, - "eval_rewards/margins": 0.7597501873970032, - "eval_rewards/rejected": -0.16755692660808563, - "eval_runtime": 195.4694, - "eval_samples_per_second": 10.232, - "eval_steps_per_second": 0.322, + "eval_logits/chosen": -2.5923333168029785, + "eval_logits/rejected": -2.537993907928467, + "eval_logps/chosen": -277.6067810058594, + "eval_logps/rejected": -265.0432434082031, + "eval_loss": 0.5464460849761963, + "eval_rewards/accuracies": 0.7638888955116272, + "eval_rewards/chosen": 0.4347245991230011, + "eval_rewards/margins": 0.9171593189239502, + "eval_rewards/rejected": -0.48243483901023865, + "eval_runtime": 206.9145, + "eval_samples_per_second": 9.666, + "eval_steps_per_second": 0.304, "step": 200 }, { "epoch": 0.21978021978021978, - "grad_norm": 123.83522939040155, - "learning_rate": 3.6585365853658536e-07, - "logits/chosen": -2.535982608795166, - "logits/rejected": -2.4645702838897705, - "logps/chosen": -277.9667053222656, - "logps/rejected": -273.4307556152344, - "loss": 0.5694, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.48403400182724, - "rewards/margins": 0.5986848473548889, - "rewards/rejected": -0.11465080827474594, + "grad_norm": 118.18724049066705, + "learning_rate": 4.785842691097342e-07, + "logits/chosen": -2.5883002281188965, + "logits/rejected": -2.524261951446533, + "logps/chosen": -278.22625732421875, + "logps/rejected": -275.1640625, + "loss": 0.5643, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.45807909965515137, + "rewards/margins": 0.7460596561431885, + "rewards/rejected": -0.2879805266857147, "step": 210 }, { "epoch": 0.2302459445316588, - "grad_norm": 107.83552757357393, - "learning_rate": 3.832752613240418e-07, - "logits/chosen": -2.5283524990081787, - "logits/rejected": -2.4452483654022217, - "logps/chosen": -285.9525146484375, - "logps/rejected": -242.55825805664062, - "loss": 0.5304, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.12322857230901718, - "rewards/margins": 0.7635325193405151, - "rewards/rejected": -0.640303909778595, + "grad_norm": 156.31813233608617, + "learning_rate": 4.7472967660421603e-07, + "logits/chosen": -2.5972952842712402, + "logits/rejected": -2.5259041786193848, + "logps/chosen": -284.1155700683594, + "logps/rejected": -242.37039184570312, + "loss": 0.5191, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.30692246556282043, + "rewards/margins": 0.9284420013427734, + "rewards/rejected": -0.6215195655822754, "step": 220 }, { "epoch": 0.24071166928309787, - "grad_norm": 95.0494443146835, - "learning_rate": 4.006968641114982e-07, - "logits/chosen": -2.465329647064209, - "logits/rejected": -2.4375150203704834, - "logps/chosen": -291.64776611328125, - "logps/rejected": -277.65496826171875, - "loss": 0.5349, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.29114001989364624, - "rewards/margins": 0.6186447739601135, - "rewards/rejected": -0.9097847938537598, + "grad_norm": 103.00603061075961, + "learning_rate": 4.705745280752585e-07, + "logits/chosen": -2.527318000793457, + "logits/rejected": -2.505685329437256, + "logps/chosen": -287.8115539550781, + "logps/rejected": -275.6258850097656, + "loss": 0.5225, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.09247669577598572, + "rewards/margins": 0.7993501424789429, + "rewards/rejected": -0.7068735361099243, "step": 230 }, { "epoch": 0.25117739403453687, - "grad_norm": 111.07206552609104, - "learning_rate": 4.1811846689895467e-07, - "logits/chosen": -2.459812641143799, - "logits/rejected": -2.440061569213867, - "logps/chosen": -267.8942565917969, - "logps/rejected": -263.02056884765625, - "loss": 0.5127, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.03198814019560814, - "rewards/margins": 0.8522968292236328, - "rewards/rejected": -0.8842849731445312, + "grad_norm": 187.82863477728128, + "learning_rate": 4.6612438066572555e-07, + "logits/chosen": -2.536621570587158, + "logits/rejected": -2.525778293609619, + "logps/chosen": -265.84332275390625, + "logps/rejected": -262.84210205078125, + "loss": 0.517, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.17310741543769836, + "rewards/margins": 1.0395452976226807, + "rewards/rejected": -0.8664379119873047, "step": 240 }, { "epoch": 0.2616431187859759, - "grad_norm": 102.32651543625639, - "learning_rate": 4.3554006968641113e-07, - "logits/chosen": -2.4430692195892334, - "logits/rejected": -2.4185352325439453, - "logps/chosen": -275.0945739746094, - "logps/rejected": -263.59759521484375, - "loss": 0.5012, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": 0.26749125123023987, - "rewards/margins": 0.868768036365509, - "rewards/rejected": -0.6012767553329468, + "grad_norm": 98.47328118938334, + "learning_rate": 4.6138518605333664e-07, + "logits/chosen": -2.5257980823516846, + "logits/rejected": -2.50608229637146, + "logps/chosen": -275.45831298828125, + "logps/rejected": -265.85247802734375, + "loss": 0.4916, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.23112091422080994, + "rewards/margins": 1.057886004447937, + "rewards/rejected": -0.8267651796340942, "step": 250 }, { "epoch": 0.272108843537415, - "grad_norm": 117.33999105545253, - "learning_rate": 4.529616724738676e-07, - "logits/chosen": -2.4992964267730713, - "logits/rejected": -2.443032741546631, - "logps/chosen": -291.96759033203125, - "logps/rejected": -292.431640625, - "loss": 0.519, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.2992609739303589, - "rewards/margins": 0.716291606426239, - "rewards/rejected": -1.0155525207519531, + "grad_norm": 132.81269847647957, + "learning_rate": 4.5636328249082514e-07, + "logits/chosen": -2.6113829612731934, + "logits/rejected": -2.5672502517700195, + "logps/chosen": -288.1708068847656, + "logps/rejected": -290.2517395019531, + "loss": 0.5332, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.08042006194591522, + "rewards/margins": 0.8779838681221008, + "rewards/rejected": -0.7975638508796692, "step": 260 }, { "epoch": 0.282574568288854, - "grad_norm": 110.37872577613058, - "learning_rate": 4.7038327526132404e-07, - "logits/chosen": -2.3981385231018066, - "logits/rejected": -2.371910572052002, - "logps/chosen": -295.07244873046875, - "logps/rejected": -257.9580993652344, - "loss": 0.5053, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": 0.46228867769241333, - "rewards/margins": 0.9018086194992065, - "rewards/rejected": -0.4395199418067932, + "grad_norm": 109.69739978094053, + "learning_rate": 4.510653863290871e-07, + "logits/chosen": -2.5312352180480957, + "logits/rejected": -2.514791965484619, + "logps/chosen": -294.85821533203125, + "logps/rejected": -259.7037658691406, + "loss": 0.4994, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": 0.4837077260017395, + "rewards/margins": 1.097794771194458, + "rewards/rejected": -0.614086925983429, "step": 270 }, { "epoch": 0.29304029304029305, - "grad_norm": 108.54717406795075, - "learning_rate": 4.878048780487804e-07, - "logits/chosen": -2.4483561515808105, - "logits/rejected": -2.3909499645233154, - "logps/chosen": -289.1899108886719, - "logps/rejected": -287.1288146972656, - "loss": 0.5408, + "grad_norm": 112.60908808433207, + "learning_rate": 4.4549858303465737e-07, + "logits/chosen": -2.577178478240967, + "logits/rejected": -2.527773380279541, + "logps/chosen": -293.6539001464844, + "logps/rejected": -294.2825927734375, + "loss": 0.5343, "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": 0.7032612562179565, - "rewards/margins": 1.0894885063171387, - "rewards/rejected": -0.3862271308898926, + "rewards/chosen": 0.25686317682266235, + "rewards/margins": 1.3584675788879395, + "rewards/rejected": -1.1016044616699219, "step": 280 }, { "epoch": 0.3035060177917321, - "grad_norm": 115.08372508805712, - "learning_rate": 4.999983293481417e-07, - "logits/chosen": -2.4171204566955566, - "logits/rejected": -2.322211742401123, - "logps/chosen": -273.3341369628906, - "logps/rejected": -245.93264770507812, - "loss": 0.5142, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.057748716324567795, - "rewards/margins": 0.8324533700942993, - "rewards/rejected": -0.8902019262313843, + "grad_norm": 136.22152391063145, + "learning_rate": 4.396703177135261e-07, + "logits/chosen": -2.5472209453582764, + "logits/rejected": -2.470147132873535, + "logps/chosen": -272.1517028808594, + "logps/rejected": -246.69384765625, + "loss": 0.5047, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": 0.06049398332834244, + "rewards/margins": 1.0268157720565796, + "rewards/rejected": -0.9663218259811401, "step": 290 }, { "epoch": 0.3139717425431711, - "grad_norm": 104.03336908036505, - "learning_rate": 4.999686294918159e-07, - "logits/chosen": -2.432267904281616, - "logits/rejected": -2.4175007343292236, - "logps/chosen": -271.7186279296875, - "logps/rejected": -278.4363098144531, - "loss": 0.5429, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.27266231179237366, - "rewards/margins": 0.796421468257904, - "rewards/rejected": -1.0690839290618896, + "grad_norm": 103.27642265277515, + "learning_rate": 4.335883851539693e-07, + "logits/chosen": -2.571641206741333, + "logits/rejected": -2.5657002925872803, + "logps/chosen": -269.2393493652344, + "logps/rejected": -277.0874938964844, + "loss": 0.5359, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.024735305458307266, + "rewards/margins": 0.9094675183296204, + "rewards/rejected": -0.9342027902603149, "step": 300 }, { "epoch": 0.3139717425431711, - "eval_logits/chosen": -2.420806884765625, - "eval_logits/rejected": -2.354243040084839, - "eval_logps/chosen": -281.93511962890625, - "eval_logps/rejected": -269.8440246582031, - "eval_loss": 0.5365566611289978, - "eval_rewards/accuracies": 0.7539682388305664, - "eval_rewards/chosen": 0.0018868001643568277, - "eval_rewards/margins": 0.9643990993499756, - "eval_rewards/rejected": -0.9625123143196106, - "eval_runtime": 193.0603, - "eval_samples_per_second": 10.359, - "eval_steps_per_second": 0.326, + "eval_logits/chosen": -2.559617757797241, + "eval_logits/rejected": -2.5065526962280273, + "eval_logps/chosen": -281.25714111328125, + "eval_logps/rejected": -270.3889465332031, + "eval_loss": 0.5473023653030396, + "eval_rewards/accuracies": 0.7579365372657776, + "eval_rewards/chosen": 0.0696859061717987, + "eval_rewards/margins": 1.0866899490356445, + "eval_rewards/rejected": -1.017004132270813, + "eval_runtime": 206.5677, + "eval_samples_per_second": 9.682, + "eval_steps_per_second": 0.305, "step": 300 }, { "epoch": 0.32443746729461015, - "grad_norm": 115.97305483559151, - "learning_rate": 4.999018091152689e-07, - "logits/chosen": -2.475944995880127, - "logits/rejected": -2.3476412296295166, - "logps/chosen": -282.55377197265625, - "logps/rejected": -241.09091186523438, - "loss": 0.5453, - "rewards/accuracies": 0.71875, - "rewards/chosen": 0.11820511519908905, - "rewards/margins": 0.9483598470687866, - "rewards/rejected": -0.8301547169685364, + "grad_norm": 113.48472631776741, + "learning_rate": 4.272609194017105e-07, + "logits/chosen": -2.5980334281921387, + "logits/rejected": -2.5018410682678223, + "logps/chosen": -284.5079650878906, + "logps/rejected": -245.64804077148438, + "loss": 0.5341, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.07721372693777084, + "rewards/margins": 1.208656907081604, + "rewards/rejected": -1.2858705520629883, "step": 310 }, { "epoch": 0.3349031920460492, - "grad_norm": 97.14026918766356, - "learning_rate": 4.997978781413745e-07, - "logits/chosen": -2.4234707355499268, - "logits/rejected": -2.375223398208618, - "logps/chosen": -248.39590454101562, - "logps/rejected": -255.823486328125, - "loss": 0.585, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": 0.4650595784187317, - "rewards/margins": 0.9892406463623047, - "rewards/rejected": -0.5241810083389282, + "grad_norm": 94.90226304989493, + "learning_rate": 4.2069638288135547e-07, + "logits/chosen": -2.53539776802063, + "logits/rejected": -2.496711254119873, + "logps/chosen": -254.16464233398438, + "logps/rejected": -263.13336181640625, + "loss": 0.5831, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.1118149608373642, + "rewards/margins": 1.1433535814285278, + "rewards/rejected": -1.2551685571670532, "step": 320 }, { "epoch": 0.3453689167974882, - "grad_norm": 108.17949568858647, - "learning_rate": 4.996568520039564e-07, - "logits/chosen": -2.4319825172424316, - "logits/rejected": -2.3865997791290283, - "logps/chosen": -259.1747741699219, - "logps/rejected": -243.63912963867188, - "loss": 0.5364, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.28245288133621216, - "rewards/margins": 0.8709386587142944, - "rewards/rejected": -1.1533915996551514, + "grad_norm": 111.02501519646566, + "learning_rate": 4.139035550786494e-07, + "logits/chosen": -2.503291606903076, + "logits/rejected": -2.466763973236084, + "logps/chosen": -260.0853271484375, + "logps/rejected": -246.05831909179688, + "loss": 0.5226, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.3735058307647705, + "rewards/margins": 1.021803379058838, + "rewards/rejected": -1.395309329032898, "step": 330 }, { "epoch": 0.35583464154892724, - "grad_norm": 126.45669783557696, - "learning_rate": 4.994787516454973e-07, - "logits/chosen": -2.3520426750183105, - "logits/rejected": -2.303215742111206, - "logps/chosen": -266.7975158691406, - "logps/rejected": -241.9561004638672, - "loss": 0.5753, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": 0.27472639083862305, - "rewards/margins": 0.8663428425788879, - "rewards/rejected": -0.5916165113449097, + "grad_norm": 120.37840185839579, + "learning_rate": 4.0689152079869306e-07, + "logits/chosen": -2.419663667678833, + "logits/rejected": -2.3767175674438477, + "logps/chosen": -270.08642578125, + "logps/rejected": -246.24227905273438, + "loss": 0.6048, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.054160237312316895, + "rewards/margins": 0.966072678565979, + "rewards/rejected": -1.0202327966690063, "step": 340 }, { "epoch": 0.3663003663003663, - "grad_norm": 122.45007669335854, - "learning_rate": 4.992636035140273e-07, - "logits/chosen": -2.42226505279541, - "logits/rejected": -2.3810067176818848, - "logps/chosen": -288.61932373046875, - "logps/rejected": -261.0976867675781, - "loss": 0.5433, + "grad_norm": 135.62587519571315, + "learning_rate": 3.99669658015821e-07, + "logits/chosen": -2.512122631072998, + "logits/rejected": -2.480046510696411, + "logps/chosen": -289.42449951171875, + "logps/rejected": -262.649169921875, + "loss": 0.537, "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.2599612772464752, - "rewards/margins": 0.7570372819900513, - "rewards/rejected": -0.49707597494125366, + "rewards/chosen": 0.17944186925888062, + "rewards/margins": 0.8316653370857239, + "rewards/rejected": -0.6522234678268433, "step": 350 }, { "epoch": 0.37676609105180536, - "grad_norm": 102.33707449256083, - "learning_rate": 4.990114395591979e-07, - "logits/chosen": -2.370319366455078, - "logits/rejected": -2.350492477416992, - "logps/chosen": -280.61785888671875, - "logps/rejected": -269.88177490234375, - "loss": 0.4794, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": 0.528896152973175, - "rewards/margins": 1.2394490242004395, - "rewards/rejected": -0.7105528116226196, + "grad_norm": 95.8864031595669, + "learning_rate": 3.92247625331392e-07, + "logits/chosen": -2.4578399658203125, + "logits/rejected": -2.4470133781433105, + "logps/chosen": -281.417236328125, + "logps/rejected": -271.0404357910156, + "loss": 0.4816, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.4489572048187256, + "rewards/margins": 1.2753773927688599, + "rewards/rejected": -0.8264200091362, "step": 360 }, { "epoch": 0.3872318158032444, - "grad_norm": 100.71654133555701, - "learning_rate": 4.987222972275368e-07, - "logits/chosen": -2.420348644256592, - "logits/rejected": -2.373081922531128, - "logps/chosen": -261.0738525390625, - "logps/rejected": -242.97256469726562, - "loss": 0.5247, + "grad_norm": 100.50656645709512, + "learning_rate": 3.846353490562664e-07, + "logits/chosen": -2.5093467235565186, + "logits/rejected": -2.4688096046447754, + "logps/chosen": -260.3323669433594, + "logps/rejected": -242.1940155029297, + "loss": 0.5303, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.22305448353290558, - "rewards/margins": 0.9546422958374023, - "rewards/rejected": -0.7315877676010132, + "rewards/chosen": 0.2971992492675781, + "rewards/margins": 0.9509322047233582, + "rewards/rejected": -0.65373295545578, "step": 370 }, { "epoch": 0.3976975405546834, - "grad_norm": 110.22879578680318, - "learning_rate": 4.983962194568871e-07, - "logits/chosen": -2.3985788822174072, - "logits/rejected": -2.356171131134033, - "logps/chosen": -255.7781524658203, - "logps/rejected": -257.33551025390625, - "loss": 0.5119, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.11307580769062042, - "rewards/margins": 0.9896215200424194, - "rewards/rejected": -1.1026971340179443, + "grad_norm": 106.58256559416274, + "learning_rate": 3.768430099352445e-07, + "logits/chosen": -2.487880229949951, + "logits/rejected": -2.454012870788574, + "logps/chosen": -255.39425659179688, + "logps/rejected": -257.49078369140625, + "loss": 0.5166, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.07468878477811813, + "rewards/margins": 1.0435364246368408, + "rewards/rejected": -1.118225336074829, "step": 380 }, { "epoch": 0.40816326530612246, - "grad_norm": 99.85541487699506, - "learning_rate": 4.980332546700308e-07, - "logits/chosen": -2.4487264156341553, - "logits/rejected": -2.4068520069122314, - "logps/chosen": -281.5476989746094, - "logps/rejected": -277.7716979980469, - "loss": 0.5407, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.005253917071968317, - "rewards/margins": 0.9466168284416199, - "rewards/rejected": -0.9413629770278931, + "grad_norm": 101.08281656389065, + "learning_rate": 3.6888102953122304e-07, + "logits/chosen": -2.5439352989196777, + "logits/rejected": -2.506777286529541, + "logps/chosen": -287.0313720703125, + "logps/rejected": -284.038818359375, + "loss": 0.5404, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5431143045425415, + "rewards/margins": 1.0249578952789307, + "rewards/rejected": -1.5680720806121826, "step": 390 }, { "epoch": 0.4186289900575615, - "grad_norm": 113.31542793413445, - "learning_rate": 4.976334567674982e-07, - "logits/chosen": -2.4607863426208496, - "logits/rejected": -2.4668262004852295, - "logps/chosen": -268.7435302734375, - "logps/rejected": -275.6502380371094, - "loss": 0.5168, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": 0.1197589784860611, - "rewards/margins": 0.9163272976875305, - "rewards/rejected": -0.7965682744979858, + "grad_norm": 295.883680569105, + "learning_rate": 3.607600562872785e-07, + "logits/chosen": -2.568805456161499, + "logits/rejected": -2.5805678367614746, + "logps/chosen": -274.0014953613281, + "logps/rejected": -281.6232604980469, + "loss": 0.5228, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.4060361385345459, + "rewards/margins": 0.9878374338150024, + "rewards/rejected": -1.3938738107681274, "step": 400 }, { "epoch": 0.4186289900575615, - "eval_logits/chosen": -2.53886342048645, - "eval_logits/rejected": -2.475952625274658, - "eval_logps/chosen": -280.3628845214844, - "eval_logps/rejected": -269.0635070800781, - "eval_loss": 0.545230507850647, - "eval_rewards/accuracies": 0.7599206566810608, - "eval_rewards/chosen": 0.1591119021177292, - "eval_rewards/margins": 1.043568730354309, - "eval_rewards/rejected": -0.8844567537307739, - "eval_runtime": 195.558, - "eval_samples_per_second": 10.227, - "eval_steps_per_second": 0.322, + "eval_logits/chosen": -2.6470746994018555, + "eval_logits/rejected": -2.59334397315979, + "eval_logps/chosen": -284.2651672363281, + "eval_logps/rejected": -273.2837219238281, + "eval_loss": 0.5321290493011475, + "eval_rewards/accuracies": 0.7539682388305664, + "eval_rewards/chosen": -0.23111547529697418, + "eval_rewards/margins": 1.0753650665283203, + "eval_rewards/rejected": -1.3064805269241333, + "eval_runtime": 206.5734, + "eval_samples_per_second": 9.682, + "eval_steps_per_second": 0.305, "step": 400 }, { "epoch": 0.4290947148090005, - "grad_norm": 97.88452536385499, - "learning_rate": 4.971968851195637e-07, - "logits/chosen": -2.4723410606384277, - "logits/rejected": -2.4187119007110596, - "logps/chosen": -272.06732177734375, - "logps/rejected": -256.6286315917969, - "loss": 0.5457, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.032803989946842194, - "rewards/margins": 0.9476572275161743, - "rewards/rejected": -0.9804611206054688, + "grad_norm": 103.72761100708047, + "learning_rate": 3.5249095128531856e-07, + "logits/chosen": -2.588397741317749, + "logits/rejected": -2.5449459552764893, + "logps/chosen": -274.9619140625, + "logps/rejected": -259.3703308105469, + "loss": 0.5364, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.32226118445396423, + "rewards/margins": 0.9323698878288269, + "rewards/rejected": -1.2546310424804688, "step": 410 }, { "epoch": 0.43956043956043955, - "grad_norm": 130.43769934733578, - "learning_rate": 4.967236045574293e-07, - "logits/chosen": -2.50521183013916, - "logits/rejected": -2.401068687438965, - "logps/chosen": -279.63104248046875, - "logps/rejected": -262.1017761230469, - "loss": 0.5377, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.256000816822052, - "rewards/margins": 1.261556625366211, - "rewards/rejected": -1.0055558681488037, + "grad_norm": 143.18448896096157, + "learning_rate": 3.4408477372034736e-07, + "logits/chosen": -2.6011955738067627, + "logits/rejected": -2.511427402496338, + "logps/chosen": -281.31475830078125, + "logps/rejected": -264.31573486328125, + "loss": 0.5605, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.08763083815574646, + "rewards/margins": 1.3145806789398193, + "rewards/rejected": -1.226949691772461, "step": 420 }, { "epoch": 0.4500261643118786, - "grad_norm": 107.9734205604773, - "learning_rate": 4.962136853635969e-07, - "logits/chosen": -2.472808361053467, - "logits/rejected": -2.44633412361145, - "logps/chosen": -251.77755737304688, - "logps/rejected": -259.7610778808594, - "loss": 0.5102, + "grad_norm": 105.39381768622819, + "learning_rate": 3.3555276610977276e-07, + "logits/chosen": -2.582556962966919, + "logits/rejected": -2.563215732574463, + "logps/chosen": -251.7899932861328, + "logps/rejected": -261.0627746582031, + "loss": 0.5028, "rewards/accuracies": 0.768750011920929, - "rewards/chosen": 0.20002666115760803, - "rewards/margins": 1.1591206789016724, - "rewards/rejected": -0.9590939283370972, + "rewards/chosen": 0.19878244400024414, + "rewards/margins": 1.2880423069000244, + "rewards/rejected": -1.0892596244812012, "step": 430 }, { "epoch": 0.4604918890633176, - "grad_norm": 94.62480991940093, - "learning_rate": 4.956672032614314e-07, - "logits/chosen": -2.4571681022644043, - "logits/rejected": -2.4182329177856445, - "logps/chosen": -291.9977722167969, - "logps/rejected": -258.6444396972656, - "loss": 0.5978, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.32614707946777344, - "rewards/margins": 0.6479467153549194, - "rewards/rejected": -0.3217995762825012, + "grad_norm": 97.95625671060284, + "learning_rate": 3.269063392575352e-07, + "logits/chosen": -2.5603010654449463, + "logits/rejected": -2.5315120220184326, + "logps/chosen": -294.2579040527344, + "logps/rejected": -261.4208679199219, + "loss": 0.5781, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.100133016705513, + "rewards/margins": 0.6995735168457031, + "rewards/rejected": -0.5994404554367065, "step": 440 }, { "epoch": 0.47095761381475665, - "grad_norm": 112.97905708045955, - "learning_rate": 4.950842394039156e-07, - "logits/chosen": -2.486661195755005, - "logits/rejected": -2.446424961090088, - "logps/chosen": -279.9918518066406, - "logps/rejected": -287.51617431640625, - "loss": 0.5248, + "grad_norm": 115.32814881542778, + "learning_rate": 3.1815705699316964e-07, + "logits/chosen": -2.5920796394348145, + "logits/rejected": -2.5575454235076904, + "logps/chosen": -281.134033203125, + "logps/rejected": -289.2639465332031, + "loss": 0.5217, "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.24052366614341736, - "rewards/margins": 1.1665834188461304, - "rewards/rejected": -0.9260597229003906, + "rewards/chosen": 0.12630310654640198, + "rewards/margins": 1.2271454334259033, + "rewards/rejected": -1.1008423566818237, "step": 450 }, { "epoch": 0.48142333856619574, - "grad_norm": 112.28420222121628, - "learning_rate": 4.944648803615994e-07, - "logits/chosen": -2.4559218883514404, - "logits/rejected": -2.394252300262451, - "logps/chosen": -256.81353759765625, - "logps/rejected": -255.33944702148438, - "loss": 0.5298, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.06617209315299988, - "rewards/margins": 1.1002776622772217, - "rewards/rejected": -1.0341055393218994, + "grad_norm": 118.44955170701016, + "learning_rate": 3.0931662070620794e-07, + "logits/chosen": -2.5641932487487793, + "logits/rejected": -2.5097055435180664, + "logps/chosen": -257.33160400390625, + "logps/rejected": -255.80221557617188, + "loss": 0.5178, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.01436941884458065, + "rewards/margins": 1.0947532653808594, + "rewards/rejected": -1.0803838968276978, "step": 460 }, { "epoch": 0.49188906331763477, - "grad_norm": 100.27588641056836, - "learning_rate": 4.938092181097433e-07, - "logits/chosen": -2.516767978668213, - "logits/rejected": -2.4646008014678955, - "logps/chosen": -283.0987854003906, - "logps/rejected": -259.1934814453125, - "loss": 0.4971, + "grad_norm": 95.30482890686108, + "learning_rate": 3.003968536966078e-07, + "logits/chosen": -2.6018574237823486, + "logits/rejected": -2.559516429901123, + "logps/chosen": -282.80145263671875, + "logps/rejected": -259.4977111816406, + "loss": 0.4768, "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": 0.06888873130083084, - "rewards/margins": 0.9940488934516907, - "rewards/rejected": -0.9251600503921509, + "rewards/chosen": 0.09861959517002106, + "rewards/margins": 1.0542025566101074, + "rewards/rejected": -0.9555829763412476, "step": 470 }, { "epoch": 0.5023547880690737, - "grad_norm": 101.96214848421158, - "learning_rate": 4.931173500146607e-07, - "logits/chosen": -2.467817783355713, - "logits/rejected": -2.3924126625061035, - "logps/chosen": -267.6799011230469, - "logps/rejected": -265.552978515625, - "loss": 0.4989, + "grad_norm": 105.76775426891116, + "learning_rate": 2.9140968536213693e-07, + "logits/chosen": -2.546515464782715, + "logits/rejected": -2.4800972938537598, + "logps/chosen": -268.1258850097656, + "logps/rejected": -265.9749450683594, + "loss": 0.4923, "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.04674670845270157, - "rewards/margins": 1.3773844242095947, - "rewards/rejected": -1.424131155014038, + "rewards/chosen": -0.09133951365947723, + "rewards/margins": 1.3749868869781494, + "rewards/rejected": -1.466326355934143, "step": 480 }, { "epoch": 0.5128205128205128, - "grad_norm": 100.79667193803812, - "learning_rate": 4.923893788192582e-07, - "logits/chosen": -2.515101671218872, - "logits/rejected": -2.3672587871551514, - "logps/chosen": -310.18133544921875, - "logps/rejected": -267.71246337890625, - "loss": 0.5085, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.17435257136821747, - "rewards/margins": 1.3074657917022705, - "rewards/rejected": -1.1331132650375366, + "grad_norm": 99.04770574478776, + "learning_rate": 2.823671352438608e-07, + "logits/chosen": -2.5936622619628906, + "logits/rejected": -2.4636528491973877, + "logps/chosen": -310.77911376953125, + "logps/rejected": -268.76055908203125, + "loss": 0.5039, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.11457942426204681, + "rewards/margins": 1.352496862411499, + "rewards/rejected": -1.237917423248291, "step": 490 }, { "epoch": 0.5232862375719518, - "grad_norm": 95.12474841185956, - "learning_rate": 4.91625412627779e-07, - "logits/chosen": -2.3883442878723145, - "logits/rejected": -2.361886978149414, - "logps/chosen": -258.3727722167969, - "logps/rejected": -237.2646026611328, - "loss": 0.5337, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": 0.03052714094519615, - "rewards/margins": 1.0152196884155273, - "rewards/rejected": -0.9846924543380737, + "grad_norm": 89.42678222589365, + "learning_rate": 2.73281296951072e-07, + "logits/chosen": -2.482358694076538, + "logits/rejected": -2.4588165283203125, + "logps/chosen": -258.74224853515625, + "logps/rejected": -238.3992156982422, + "loss": 0.5217, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.006418359465897083, + "rewards/margins": 1.0917348861694336, + "rewards/rejected": -1.0981531143188477, "step": 500 }, { "epoch": 0.5232862375719518, - "eval_logits/chosen": -2.4845328330993652, - "eval_logits/rejected": -2.422492265701294, - "eval_logps/chosen": -280.5832824707031, - "eval_logps/rejected": -270.8497314453125, - "eval_loss": 0.5323790311813354, - "eval_rewards/accuracies": 0.7777777910232544, - "eval_rewards/chosen": 0.13707374036312103, - "eval_rewards/margins": 1.200156569480896, - "eval_rewards/rejected": -1.0630826950073242, - "eval_runtime": 195.6981, - "eval_samples_per_second": 10.22, - "eval_steps_per_second": 0.322, + "eval_logits/chosen": -2.5772807598114014, + "eval_logits/rejected": -2.5195107460021973, + "eval_logps/chosen": -281.8111267089844, + "eval_logps/rejected": -272.2919006347656, + "eval_loss": 0.526000440120697, + "eval_rewards/accuracies": 0.7876983880996704, + "eval_rewards/chosen": 0.014284107834100723, + "eval_rewards/margins": 1.2215818166732788, + "eval_rewards/rejected": -1.2072975635528564, + "eval_runtime": 206.5403, + "eval_samples_per_second": 9.683, + "eval_steps_per_second": 0.305, "step": 500 }, { "epoch": 0.533751962323391, - "grad_norm": 92.18682869134534, - "learning_rate": 4.908255648897489e-07, - "logits/chosen": -2.4914324283599854, - "logits/rejected": -2.473914384841919, - "logps/chosen": -280.83990478515625, - "logps/rejected": -274.2536315917969, - "loss": 0.4869, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.3094209134578705, - "rewards/margins": 1.3579093217849731, - "rewards/rejected": -1.0484883785247803, + "grad_norm": 95.80547597223038, + "learning_rate": 2.641643219871597e-07, + "logits/chosen": -2.5779619216918945, + "logits/rejected": -2.567397356033325, + "logps/chosen": -281.93707275390625, + "logps/rejected": -275.4586486816406, + "loss": 0.4822, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.19970571994781494, + "rewards/margins": 1.36869215965271, + "rewards/rejected": -1.1689863204956055, "step": 510 }, { "epoch": 0.54421768707483, - "grad_norm": 87.27964904963777, - "learning_rate": 4.899899543831288e-07, - "logits/chosen": -2.5276341438293457, - "logits/rejected": -2.4992339611053467, - "logps/chosen": -275.48486328125, - "logps/rejected": -264.1493835449219, - "loss": 0.5374, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.34065061807632446, - "rewards/margins": 1.0787056684494019, - "rewards/rejected": -1.419356107711792, + "grad_norm": 88.16712559993171, + "learning_rate": 2.550284034980507e-07, + "logits/chosen": -2.5944199562072754, + "logits/rejected": -2.571303129196167, + "logps/chosen": -274.64373779296875, + "logps/rejected": -263.65179443359375, + "loss": 0.5305, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.2565365731716156, + "rewards/margins": 1.113057255744934, + "rewards/rejected": -1.369593858718872, "step": 520 }, { "epoch": 0.554683411826269, - "grad_norm": 132.95325103521267, - "learning_rate": 4.891187051966765e-07, - "logits/chosen": -2.558088779449463, - "logits/rejected": -2.5096819400787354, - "logps/chosen": -271.92816162109375, - "logps/rejected": -269.7146911621094, - "loss": 0.5551, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.31684479117393494, - "rewards/margins": 1.1543093919754028, - "rewards/rejected": -1.4711540937423706, + "grad_norm": 130.8715598080221, + "learning_rate": 2.4588575996495794e-07, + "logits/chosen": -2.616055965423584, + "logits/rejected": -2.5736825466156006, + "logps/chosen": -271.31500244140625, + "logps/rejected": -268.7001037597656, + "loss": 0.5681, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.25552767515182495, + "rewards/margins": 1.114168405532837, + "rewards/rejected": -1.3696961402893066, "step": 530 }, { "epoch": 0.565149136577708, - "grad_norm": 106.84429554381373, - "learning_rate": 4.882119467115191e-07, - "logits/chosen": -2.5741055011749268, - "logits/rejected": -2.5109028816223145, - "logps/chosen": -290.3707275390625, - "logps/rejected": -271.1365661621094, - "loss": 0.5195, + "grad_norm": 105.92360698073946, + "learning_rate": 2.367486188632446e-07, + "logits/chosen": -2.6313700675964355, + "logits/rejected": -2.5749824047088623, + "logps/chosen": -290.64703369140625, + "logps/rejected": -272.3555908203125, + "loss": 0.517, "rewards/accuracies": 0.706250011920929, - "rewards/chosen": 0.0011930823093280196, - "rewards/margins": 1.0513924360275269, - "rewards/rejected": -1.0501995086669922, + "rewards/chosen": -0.02643487975001335, + "rewards/margins": 1.1456679105758667, + "rewards/rejected": -1.172102689743042, "step": 540 }, { "epoch": 0.5756148613291471, - "grad_norm": 106.99668899607207, - "learning_rate": 4.872698135819402e-07, - "logits/chosen": -2.5081093311309814, - "logits/rejected": -2.458326816558838, - "logps/chosen": -273.5782165527344, - "logps/rejected": -242.58462524414062, - "loss": 0.5932, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.2305355817079544, - "rewards/margins": 1.097015142440796, - "rewards/rejected": -1.3275506496429443, + "grad_norm": 125.07827114011454, + "learning_rate": 2.276292003092593e-07, + "logits/chosen": -2.5747551918029785, + "logits/rejected": -2.5299413204193115, + "logps/chosen": -273.75311279296875, + "logps/rejected": -243.8401641845703, + "loss": 0.566, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.24802279472351074, + "rewards/margins": 1.2050869464874268, + "rewards/rejected": -1.453109860420227, "step": 550 }, { "epoch": 0.5860805860805861, - "grad_norm": 117.06485032371232, - "learning_rate": 4.862924457153831e-07, - "logits/chosen": -2.483041524887085, - "logits/rejected": -2.421715497970581, - "logps/chosen": -285.08026123046875, - "logps/rejected": -268.50653076171875, - "loss": 0.5679, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.22517618536949158, - "rewards/margins": 1.1347671747207642, - "rewards/rejected": -1.359943151473999, + "grad_norm": 119.35091354587506, + "learning_rate": 2.185397007170141e-07, + "logits/chosen": -2.539616584777832, + "logits/rejected": -2.483868360519409, + "logps/chosen": -284.36090087890625, + "logps/rejected": -269.1393127441406, + "loss": 0.5529, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.15324077010154724, + "rewards/margins": 1.269974708557129, + "rewards/rejected": -1.4232155084609985, "step": 560 }, { "epoch": 0.5965463108320251, - "grad_norm": 129.61805488811393, - "learning_rate": 4.852799882516748e-07, - "logits/chosen": -2.546429395675659, - "logits/rejected": -2.476639747619629, - "logps/chosen": -283.28375244140625, - "logps/rejected": -280.09674072265625, - "loss": 0.5192, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -0.3800364136695862, - "rewards/margins": 1.0123282670974731, - "rewards/rejected": -1.3923646211624146, + "grad_norm": 121.35724227176418, + "learning_rate": 2.094922764865619e-07, + "logits/chosen": -2.620558977127075, + "logits/rejected": -2.552006244659424, + "logps/chosen": -282.87921142578125, + "logps/rejected": -279.60931396484375, + "loss": 0.5355, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.3395797610282898, + "rewards/margins": 1.0040457248687744, + "rewards/rejected": -1.343625545501709, "step": 570 }, { "epoch": 0.6070120355834642, - "grad_norm": 113.73382366678065, - "learning_rate": 4.842325915414727e-07, - "logits/chosen": -2.5227150917053223, - "logits/rejected": -2.455122232437134, - "logps/chosen": -253.94906616210938, - "logps/rejected": -224.01223754882812, - "loss": 0.5453, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.06417503952980042, - "rewards/margins": 1.0516060590744019, - "rewards/rejected": -1.1157810688018799, + "grad_norm": 114.95741828344511, + "learning_rate": 2.0049902774588797e-07, + "logits/chosen": -2.604160785675049, + "logits/rejected": -2.53875994682312, + "logps/chosen": -254.87548828125, + "logps/rejected": -224.6983184814453, + "loss": 0.5335, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.15681779384613037, + "rewards/margins": 1.0275707244873047, + "rewards/rejected": -1.1843883991241455, "step": 580 }, { "epoch": 0.6174777603349032, - "grad_norm": 128.3521211514605, - "learning_rate": 4.831504111239373e-07, - "logits/chosen": -2.502279043197632, - "logits/rejected": -2.4321484565734863, - "logps/chosen": -283.06298828125, - "logps/rejected": -284.6265563964844, - "loss": 0.5285, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.5076113343238831, - "rewards/margins": 1.0840386152267456, - "rewards/rejected": -1.5916500091552734, + "grad_norm": 116.63197017075697, + "learning_rate": 1.9157198216806238e-07, + "logits/chosen": -2.5892062187194824, + "logits/rejected": -2.523660659790039, + "logps/chosen": -282.70709228515625, + "logps/rejected": -283.9153137207031, + "loss": 0.5121, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.4720216691493988, + "rewards/margins": 1.0485048294067383, + "rewards/rejected": -1.52052640914917, "step": 590 }, { "epoch": 0.6279434850863422, - "grad_norm": 88.98605239316043, - "learning_rate": 4.820336077036342e-07, - "logits/chosen": -2.424705743789673, - "logits/rejected": -2.3662896156311035, - "logps/chosen": -289.51470947265625, - "logps/rejected": -276.53240966796875, - "loss": 0.5163, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.6964026689529419, - "rewards/margins": 0.9501320123672485, - "rewards/rejected": -1.6465345621109009, + "grad_norm": 90.93991377080742, + "learning_rate": 1.8272307888529274e-07, + "logits/chosen": -2.502859592437744, + "logits/rejected": -2.44142746925354, + "logps/chosen": -288.04864501953125, + "logps/rejected": -274.6844787597656, + "loss": 0.517, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5497933626174927, + "rewards/margins": 0.9119491577148438, + "rewards/rejected": -1.461742639541626, "step": 600 }, { "epoch": 0.6279434850863422, - "eval_logits/chosen": -2.4911673069000244, - "eval_logits/rejected": -2.4333341121673584, - "eval_logps/chosen": -285.7394104003906, - "eval_logps/rejected": -275.6128845214844, - "eval_loss": 0.536916971206665, - "eval_rewards/accuracies": 0.7559523582458496, - "eval_rewards/chosen": -0.378540575504303, - "eval_rewards/margins": 1.1608545780181885, - "eval_rewards/rejected": -1.5393950939178467, - "eval_runtime": 195.4858, - "eval_samples_per_second": 10.231, - "eval_steps_per_second": 0.322, + "eval_logits/chosen": -2.5743534564971924, + "eval_logits/rejected": -2.518336296081543, + "eval_logps/chosen": -284.8755187988281, + "eval_logps/rejected": -274.78082275390625, + "eval_loss": 0.5262279510498047, + "eval_rewards/accuracies": 0.7698412537574768, + "eval_rewards/chosen": -0.29215142130851746, + "eval_rewards/margins": 1.1640383005142212, + "eval_rewards/rejected": -1.4561898708343506, + "eval_runtime": 206.851, + "eval_samples_per_second": 9.669, + "eval_steps_per_second": 0.305, "step": 600 }, { "epoch": 0.6384092098377813, - "grad_norm": 108.48457616115262, - "learning_rate": 4.808823471266701e-07, - "logits/chosen": -2.4642491340637207, - "logits/rejected": -2.379899740219116, - "logps/chosen": -308.4049377441406, - "logps/rejected": -259.427490234375, - "loss": 0.5422, + "grad_norm": 109.87505024379428, + "learning_rate": 1.7396415252139288e-07, + "logits/chosen": -2.54246187210083, + "logits/rejected": -2.4643430709838867, + "logps/chosen": -308.33099365234375, + "logps/rejected": -259.29583740234375, + "loss": 0.5252, "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.2815133035182953, - "rewards/margins": 1.1930127143859863, - "rewards/rejected": -1.4745259284973145, + "rewards/chosen": -0.2741188406944275, + "rewards/margins": 1.187243103981018, + "rewards/rejected": -1.4613618850708008, "step": 610 }, { "epoch": 0.6488749345892203, - "grad_norm": 92.91181383765685, - "learning_rate": 4.796968003560638e-07, - "logits/chosen": -2.4167888164520264, - "logits/rejected": -2.4008824825286865, - "logps/chosen": -270.0111389160156, - "logps/rejected": -247.5885772705078, - "loss": 0.4721, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.10089156776666641, - "rewards/margins": 1.219185709953308, - "rewards/rejected": -1.1182941198349, + "grad_norm": 92.62549956674485, + "learning_rate": 1.6530691736402316e-07, + "logits/chosen": -2.4948151111602783, + "logits/rejected": -2.4823691844940186, + "logps/chosen": -271.5950927734375, + "logps/rejected": -249.6556396484375, + "loss": 0.4543, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.05750391632318497, + "rewards/margins": 1.2674978971481323, + "rewards/rejected": -1.3250019550323486, "step": 620 }, { "epoch": 0.6593406593406593, - "grad_norm": 114.29651849047085, - "learning_rate": 4.784771434463586e-07, - "logits/chosen": -2.5169453620910645, - "logits/rejected": -2.43233060836792, - "logps/chosen": -271.96343994140625, - "logps/rejected": -248.26620483398438, - "loss": 0.4974, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.2741064429283142, - "rewards/margins": 1.503544569015503, - "rewards/rejected": -1.229438066482544, + "grad_norm": 115.38512973551775, + "learning_rate": 1.5676295169786864e-07, + "logits/chosen": -2.588836193084717, + "logits/rejected": -2.5063748359680176, + "logps/chosen": -272.53277587890625, + "logps/rejected": -246.722900390625, + "loss": 0.4912, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.21716764569282532, + "rewards/margins": 1.2922747135162354, + "rewards/rejected": -1.0751070976257324, "step": 630 }, { "epoch": 0.6698063840920984, - "grad_norm": 92.55126354825397, - "learning_rate": 4.772235575174775e-07, - "logits/chosen": -2.430208683013916, - "logits/rejected": -2.370039463043213, - "logps/chosen": -267.5420227050781, - "logps/rejected": -241.68505859375, - "loss": 0.4778, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.053665466606616974, - "rewards/margins": 1.2480392456054688, - "rewards/rejected": -1.30170476436615, + "grad_norm": 98.58039473996509, + "learning_rate": 1.483436823197092e-07, + "logits/chosen": -2.5013747215270996, + "logits/rejected": -2.4492290019989014, + "logps/chosen": -267.44415283203125, + "logps/rejected": -240.54727172851562, + "loss": 0.4593, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.04387538880109787, + "rewards/margins": 1.1440480947494507, + "rewards/rejected": -1.1879234313964844, "step": 640 }, { "epoch": 0.6802721088435374, - "grad_norm": 109.16465900306048, - "learning_rate": 4.7593622872782745e-07, - "logits/chosen": -2.47455096244812, - "logits/rejected": -2.4582881927490234, - "logps/chosen": -261.575439453125, - "logps/rejected": -276.32830810546875, - "loss": 0.5439, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": 0.6074545383453369, - "rewards/margins": 1.0647180080413818, - "rewards/rejected": -0.45726341009140015, + "grad_norm": 108.19817847647745, + "learning_rate": 1.4006036925609243e-07, + "logits/chosen": -2.545503616333008, + "logits/rejected": -2.5265164375305176, + "logps/chosen": -264.9919738769531, + "logps/rejected": -279.6017150878906, + "loss": 0.5124, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.26580336689949036, + "rewards/margins": 1.0504064559936523, + "rewards/rejected": -0.7846031188964844, "step": 650 }, { "epoch": 0.6907378335949764, - "grad_norm": 107.53957587248918, - "learning_rate": 4.7461534824665415e-07, - "logits/chosen": -2.4683327674865723, - "logits/rejected": -2.399282932281494, - "logps/chosen": -283.3675842285156, - "logps/rejected": -272.4090576171875, - "loss": 0.5112, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.11449335515499115, - "rewards/margins": 1.3809350728988647, - "rewards/rejected": -1.4954285621643066, + "grad_norm": 103.96694126480867, + "learning_rate": 1.319240907040458e-07, + "logits/chosen": -2.545365571975708, + "logits/rejected": -2.4856820106506348, + "logps/chosen": -279.41656494140625, + "logps/rejected": -267.95294189453125, + "loss": 0.5078, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.28060275316238403, + "rewards/margins": 1.3304187059402466, + "rewards/rejected": -1.0498158931732178, "step": 660 }, { "epoch": 0.7012035583464155, - "grad_norm": 108.88129395121551, - "learning_rate": 4.7326111222565365e-07, - "logits/chosen": -2.4932992458343506, - "logits/rejected": -2.4070651531219482, - "logps/chosen": -297.422119140625, - "logps/rejected": -304.24102783203125, - "loss": 0.5035, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.8884851336479187, - "rewards/margins": 1.2241568565368652, - "rewards/rejected": -2.112642288208008, + "grad_norm": 126.63379779239241, + "learning_rate": 1.239457282149695e-07, + "logits/chosen": -2.5796685218811035, + "logits/rejected": -2.5014591217041016, + "logps/chosen": -288.4901428222656, + "logps/rejected": -296.0643615722656, + "loss": 0.4813, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.004715624265372753, + "rewards/margins": 1.2996948957443237, + "rewards/rejected": -1.2949790954589844, "step": 670 }, { "epoch": 0.7116692830978545, - "grad_norm": 119.57222177110934, - "learning_rate": 4.718737217698434e-07, - "logits/chosen": -2.478400468826294, - "logits/rejected": -2.452972888946533, - "logps/chosen": -316.11932373046875, - "logps/rejected": -306.0750427246094, - "loss": 0.5363, + "grad_norm": 104.43312059048667, + "learning_rate": 1.1613595214152711e-07, + "logits/chosen": -2.5504367351531982, + "logits/rejected": -2.536426067352295, + "logps/chosen": -310.7039489746094, + "logps/rejected": -300.8048400878906, + "loss": 0.5083, "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.4414599537849426, - "rewards/margins": 1.373000979423523, - "rewards/rejected": -1.8144609928131104, + "rewards/chosen": 0.10008208453655243, + "rewards/margins": 1.3875254392623901, + "rewards/rejected": -1.2874433994293213, "step": 680 }, { "epoch": 0.7221350078492935, - "grad_norm": 89.19339487572331, - "learning_rate": 4.7045338290769816e-07, - "logits/chosen": -2.4800658226013184, - "logits/rejected": -2.3848013877868652, - "logps/chosen": -274.92669677734375, - "logps/rejected": -248.93276977539062, - "loss": 0.4761, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.4272380769252777, - "rewards/margins": 1.198631763458252, - "rewards/rejected": -1.6258699893951416, + "grad_norm": 86.78408797627893, + "learning_rate": 1.0850520736699362e-07, + "logits/chosen": -2.5602102279663086, + "logits/rejected": -2.476207971572876, + "logps/chosen": -272.1448974609375, + "logps/rejected": -247.1821746826172, + "loss": 0.4535, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.14905604720115662, + "rewards/margins": 1.301754355430603, + "rewards/rejected": -1.4508103132247925, "step": 690 }, { "epoch": 0.7326007326007326, - "grad_norm": 103.16959491509996, - "learning_rate": 4.690003065605545e-07, - "logits/chosen": -2.4219014644622803, - "logits/rejected": -2.3752682209014893, - "logps/chosen": -255.3638153076172, - "logps/rejected": -250.96163940429688, - "loss": 0.4881, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.12730266153812408, - "rewards/margins": 1.1120960712432861, - "rewards/rejected": -1.2393988370895386, + "grad_norm": 101.551211026177, + "learning_rate": 1.0106369933615042e-07, + "logits/chosen": -2.5397322177886963, + "logits/rejected": -2.493654251098633, + "logps/chosen": -256.18463134765625, + "logps/rejected": -252.2621612548828, + "loss": 0.4766, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2093852460384369, + "rewards/margins": 1.1600673198699951, + "rewards/rejected": -1.369452714920044, "step": 700 }, { "epoch": 0.7326007326007326, - "eval_logits/chosen": -2.4505491256713867, - "eval_logits/rejected": -2.3891587257385254, - "eval_logps/chosen": -280.71136474609375, - "eval_logps/rejected": -272.34771728515625, - "eval_loss": 0.5379975438117981, - "eval_rewards/accuracies": 0.7678571343421936, - "eval_rewards/chosen": 0.12425977736711502, - "eval_rewards/margins": 1.3371385335922241, - "eval_rewards/rejected": -1.212878704071045, - "eval_runtime": 194.8514, - "eval_samples_per_second": 10.264, - "eval_steps_per_second": 0.323, + "eval_logits/chosen": -2.575119972229004, + "eval_logits/rejected": -2.5193893909454346, + "eval_logps/chosen": -282.1366271972656, + "eval_logps/rejected": -273.1544494628906, + "eval_loss": 0.527851402759552, + "eval_rewards/accuracies": 0.7797619104385376, + "eval_rewards/chosen": -0.018264232203364372, + "eval_rewards/margins": 1.2752907276153564, + "eval_rewards/rejected": -1.2935550212860107, + "eval_runtime": 206.8622, + "eval_samples_per_second": 9.668, + "eval_steps_per_second": 0.305, "step": 700 }, { "epoch": 0.7430664573521716, - "grad_norm": 176.34005081961848, - "learning_rate": 4.6751470851128906e-07, - "logits/chosen": -2.3656280040740967, - "logits/rejected": -2.2906768321990967, - "logps/chosen": -258.83599853515625, - "logps/rejected": -255.7509002685547, - "loss": 0.65, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.03799005225300789, - "rewards/margins": 1.2437125444412231, - "rewards/rejected": -1.2817026376724243, + "grad_norm": 195.86678751064096, + "learning_rate": 9.382138040640714e-08, + "logits/chosen": -2.5042312145233154, + "logits/rejected": -2.4359371662139893, + "logps/chosen": -261.2381896972656, + "logps/rejected": -257.90142822265625, + "loss": 0.5532, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.27821090817451477, + "rewards/margins": 1.218544602394104, + "rewards/rejected": -1.4967554807662964, "step": 710 }, { "epoch": 0.7535321821036107, - "grad_norm": 97.99595491212375, - "learning_rate": 4.6599680937227446e-07, - "logits/chosen": -2.4037389755249023, - "logits/rejected": -2.3628785610198975, - "logps/chosen": -280.8328857421875, - "logps/rejected": -257.66644287109375, - "loss": 0.526, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.2932104766368866, - "rewards/margins": 1.3130133152008057, - "rewards/rejected": -1.0198028087615967, + "grad_norm": 97.34497217574805, + "learning_rate": 8.678793653740632e-08, + "logits/chosen": -2.5442166328430176, + "logits/rejected": -2.506136417388916, + "logps/chosen": -283.2102966308594, + "logps/rejected": -260.00079345703125, + "loss": 0.5108, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": 0.05546964332461357, + "rewards/margins": 1.3087044954299927, + "rewards/rejected": -1.25323486328125, "step": 720 }, { "epoch": 0.7639979068550498, - "grad_norm": 118.27606340854193, - "learning_rate": 4.6444683455261823e-07, - "logits/chosen": -2.450195550918579, - "logits/rejected": -2.382481813430786, - "logps/chosen": -271.068359375, - "logps/rejected": -252.5572509765625, - "loss": 0.5127, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.5691794157028198, - "rewards/margins": 0.9860371351242065, - "rewards/rejected": -1.5552165508270264, + "grad_norm": 112.03077715841201, + "learning_rate": 7.997277433690983e-08, + "logits/chosen": -2.5620975494384766, + "logits/rejected": -2.50207781791687, + "logps/chosen": -268.3021545410156, + "logps/rejected": -249.5152587890625, + "loss": 0.4936, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.2925592064857483, + "rewards/margins": 0.9584562182426453, + "rewards/rejected": -1.251015543937683, "step": 730 }, { "epoch": 0.7744636316064888, - "grad_norm": 101.80090219260086, - "learning_rate": 4.6286501422468934e-07, - "logits/chosen": -2.420738458633423, - "logits/rejected": -2.3926711082458496, - "logps/chosen": -261.02239990234375, - "logps/rejected": -250.47390747070312, - "loss": 0.5761, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.8172445297241211, - "rewards/margins": 1.006648302078247, - "rewards/rejected": -1.8238928318023682, + "grad_norm": 104.0919829579398, + "learning_rate": 7.338500848029602e-08, + "logits/chosen": -2.5124528408050537, + "logits/rejected": -2.4893691539764404, + "logps/chosen": -255.79995727539062, + "logps/rejected": -245.2656707763672, + "loss": 0.557, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2949948012828827, + "rewards/margins": 1.0080726146697998, + "rewards/rejected": -1.3030673265457153, "step": 740 }, { "epoch": 0.7849293563579278, - "grad_norm": 99.92749052732131, - "learning_rate": 4.6125158328993763e-07, - "logits/chosen": -2.3929853439331055, - "logits/rejected": -2.355663537979126, - "logps/chosen": -254.00296020507812, - "logps/rejected": -262.77093505859375, - "loss": 0.4823, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.713224470615387, - "rewards/margins": 1.1954909563064575, - "rewards/rejected": -1.9087154865264893, + "grad_norm": 99.74133982899394, + "learning_rate": 6.70334495204884e-08, + "logits/chosen": -2.4846601486206055, + "logits/rejected": -2.44720458984375, + "logps/chosen": -250.17013549804688, + "logps/rejected": -258.7621765136719, + "loss": 0.4753, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.3299402594566345, + "rewards/margins": 1.1778974533081055, + "rewards/rejected": -1.5078377723693848, "step": 750 }, { "epoch": 0.7953950811093669, - "grad_norm": 72.14490267875367, - "learning_rate": 4.596067813440106e-07, - "logits/chosen": -2.4292824268341064, - "logits/rejected": -2.397916555404663, - "logps/chosen": -246.23904418945312, - "logps/rejected": -260.45623779296875, - "loss": 0.5252, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.48775267601013184, - "rewards/margins": 1.2581268548965454, - "rewards/rejected": -1.7458795309066772, + "grad_norm": 73.11096892518727, + "learning_rate": 6.092659210462231e-08, + "logits/chosen": -2.5241293907165527, + "logits/rejected": -2.492896318435669, + "logps/chosen": -244.87533569335938, + "logps/rejected": -258.718017578125, + "loss": 0.5085, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.351382315158844, + "rewards/margins": 1.2206757068634033, + "rewards/rejected": -1.5720579624176025, "step": 760 }, { "epoch": 0.8058608058608059, - "grad_norm": 78.10757160148016, - "learning_rate": 4.579308526411734e-07, - "logits/chosen": -2.473436117172241, - "logits/rejected": -2.466076374053955, - "logps/chosen": -305.9208984375, - "logps/rejected": -335.34722900390625, - "loss": 0.5094, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.2560064196586609, - "rewards/margins": 1.0847007036209106, - "rewards/rejected": -1.3407068252563477, + "grad_norm": 75.76619723672131, + "learning_rate": 5.507260361320737e-08, + "logits/chosen": -2.559986114501953, + "logits/rejected": -2.5477547645568848, + "logps/chosen": -305.89483642578125, + "logps/rejected": -336.5893859863281, + "loss": 0.4732, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.25339916348457336, + "rewards/margins": 1.2115254402160645, + "rewards/rejected": -1.4649248123168945, "step": 770 }, { "epoch": 0.8163265306122449, - "grad_norm": 107.92956426114232, - "learning_rate": 4.5622404605803664e-07, - "logits/chosen": -2.4524974822998047, - "logits/rejected": -2.4113879203796387, - "logps/chosen": -319.4300842285156, - "logps/rejected": -277.6269226074219, - "loss": 0.5672, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.04860789328813553, - "rewards/margins": 0.9609671831130981, - "rewards/rejected": -1.0095751285552979, + "grad_norm": 105.8401947481118, + "learning_rate": 4.947931323697982e-08, + "logits/chosen": -2.517803430557251, + "logits/rejected": -2.4777727127075195, + "logps/chosen": -319.5518493652344, + "logps/rejected": -279.0348815917969, + "loss": 0.5237, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0607873797416687, + "rewards/margins": 1.0895849466323853, + "rewards/rejected": -1.1503721475601196, "step": 780 }, { "epoch": 0.826792255363684, - "grad_norm": 110.13616048195198, - "learning_rate": 4.5448661505659847e-07, - "logits/chosen": -2.535991668701172, - "logits/rejected": -2.42181134223938, - "logps/chosen": -279.4062805175781, - "logps/rejected": -258.510009765625, - "loss": 0.5393, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.0836770236492157, - "rewards/margins": 1.238987684249878, - "rewards/rejected": -1.3226646184921265, + "grad_norm": 113.35027564232854, + "learning_rate": 4.415420150605398e-08, + "logits/chosen": -2.5854082107543945, + "logits/rejected": -2.4739928245544434, + "logps/chosen": -280.592529296875, + "logps/rejected": -259.7514343261719, + "loss": 0.5226, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.20229856669902802, + "rewards/margins": 1.2445086240768433, + "rewards/rejected": -1.4468071460723877, "step": 790 }, { "epoch": 0.837257980115123, - "grad_norm": 115.98200848981845, - "learning_rate": 4.5271881764660504e-07, - "logits/chosen": -2.4115824699401855, - "logits/rejected": -2.3959667682647705, - "logps/chosen": -266.58087158203125, - "logps/rejected": -276.32843017578125, - "loss": 0.49, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -0.0452110692858696, - "rewards/margins": 1.2522789239883423, - "rewards/rejected": -1.297490119934082, + "grad_norm": 115.9328163130989, + "learning_rate": 3.9104390285376374e-08, + "logits/chosen": -2.4794726371765137, + "logits/rejected": -2.463261365890503, + "logps/chosen": -268.74591064453125, + "logps/rejected": -277.9842834472656, + "loss": 0.4894, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2617167830467224, + "rewards/margins": 1.2013636827468872, + "rewards/rejected": -1.4630804061889648, "step": 800 }, { "epoch": 0.837257980115123, - "eval_logits/chosen": -2.5043623447418213, - "eval_logits/rejected": -2.4478914737701416, - "eval_logps/chosen": -280.8053894042969, - "eval_logps/rejected": -270.5943603515625, - "eval_loss": 0.5410587191581726, - "eval_rewards/accuracies": 0.7638888955116272, - "eval_rewards/chosen": 0.11485897749662399, - "eval_rewards/margins": 1.152403473854065, - "eval_rewards/rejected": -1.03754460811615, - "eval_runtime": 194.4603, - "eval_samples_per_second": 10.285, - "eval_steps_per_second": 0.324, + "eval_logits/chosen": -2.5850720405578613, + "eval_logits/rejected": -2.531074047088623, + "eval_logps/chosen": -282.5210876464844, + "eval_logps/rejected": -272.81268310546875, + "eval_loss": 0.5257476568222046, + "eval_rewards/accuracies": 0.7777777910232544, + "eval_rewards/chosen": -0.05670681595802307, + "eval_rewards/margins": 1.2026678323745728, + "eval_rewards/rejected": -1.259374737739563, + "eval_runtime": 207.4453, + "eval_samples_per_second": 9.641, + "eval_steps_per_second": 0.304, "step": 800 }, { "epoch": 0.847723704866562, - "grad_norm": 88.25800981530219, - "learning_rate": 4.509209163472356e-07, - "logits/chosen": -2.4694581031799316, - "logits/rejected": -2.4082190990448, - "logps/chosen": -291.17242431640625, - "logps/rejected": -257.36126708984375, - "loss": 0.5203, + "grad_norm": 87.80187909830617, + "learning_rate": 3.433663324986208e-08, + "logits/chosen": -2.5537991523742676, + "logits/rejected": -2.495201826095581, + "logps/chosen": -292.41900634765625, + "logps/rejected": -258.72625732421875, + "loss": 0.5129, "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.004567861557006836, - "rewards/margins": 1.0958082675933838, - "rewards/rejected": -1.091240406036377, + "rewards/chosen": -0.12009086459875107, + "rewards/margins": 1.1076496839523315, + "rewards/rejected": -1.2277406454086304, "step": 810 }, { "epoch": 0.858189429618001, - "grad_norm": 108.72305702203586, - "learning_rate": 4.490931781481189e-07, - "logits/chosen": -2.492886781692505, - "logits/rejected": -2.468400478363037, - "logps/chosen": -304.05010986328125, - "logps/rejected": -295.14208984375, - "loss": 0.5197, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.07160592824220657, - "rewards/margins": 1.270275354385376, - "rewards/rejected": -1.19866943359375, + "grad_norm": 113.52567438362996, + "learning_rate": 2.9857306851953897e-08, + "logits/chosen": -2.5736160278320312, + "logits/rejected": -2.5441737174987793, + "logps/chosen": -305.28387451171875, + "logps/rejected": -295.17120361328125, + "loss": 0.508, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05176614597439766, + "rewards/margins": 1.149815320968628, + "rewards/rejected": -1.201581358909607, "step": 820 }, { "epoch": 0.8686551543694401, - "grad_norm": 118.01868116884087, - "learning_rate": 4.472358744696844e-07, - "logits/chosen": -2.466876268386841, - "logits/rejected": -2.4468750953674316, - "logps/chosen": -258.73577880859375, - "logps/rejected": -262.7125549316406, - "loss": 0.5499, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.06087448447942734, - "rewards/margins": 1.233014702796936, - "rewards/rejected": -1.2938891649246216, + "grad_norm": 94.35513967454145, + "learning_rate": 2.567240179368185e-08, + "logits/chosen": -2.5325002670288086, + "logits/rejected": -2.5152266025543213, + "logps/chosen": -259.08319091796875, + "logps/rejected": -262.1970520019531, + "loss": 0.5463, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.09561798721551895, + "rewards/margins": 1.1467230319976807, + "rewards/rejected": -1.242341160774231, "step": 830 }, { "epoch": 0.8791208791208791, - "grad_norm": 132.854490862069, - "learning_rate": 4.453492811228564e-07, - "logits/chosen": -2.4807190895080566, - "logits/rejected": -2.4279937744140625, - "logps/chosen": -280.91009521484375, - "logps/rejected": -278.7124938964844, - "loss": 0.4802, + "grad_norm": 129.97386022506407, + "learning_rate": 2.1787515014630357e-08, + "logits/chosen": -2.540358304977417, + "logits/rejected": -2.4877097606658936, + "logps/chosen": -282.8505859375, + "logps/rejected": -278.73236083984375, + "loss": 0.4725, "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": 0.13477404415607452, - "rewards/margins": 1.3992303609848022, - "rewards/rejected": -1.2644561529159546, + "rewards/chosen": -0.05927306413650513, + "rewards/margins": 1.2071725130081177, + "rewards/rejected": -1.266445517539978, "step": 840 }, { "epoch": 0.8895866038723181, - "grad_norm": 87.98848494339458, - "learning_rate": 4.43433678268096e-07, - "logits/chosen": -2.4316108226776123, - "logits/rejected": -2.368696689605713, - "logps/chosen": -283.5729064941406, - "logps/rejected": -250.20700073242188, - "loss": 0.511, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.27316465973854065, - "rewards/margins": 1.3995450735092163, - "rewards/rejected": -1.6727097034454346, + "grad_norm": 79.91861676789057, + "learning_rate": 1.820784220652766e-08, + "logits/chosen": -2.480196714401245, + "logits/rejected": -2.413062572479248, + "logps/chosen": -282.0146179199219, + "logps/rejected": -246.9482421875, + "loss": 0.4622, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.1173391118645668, + "rewards/margins": 1.2294926643371582, + "rewards/rejected": -1.3468319177627563, "step": 850 }, { "epoch": 0.9000523286237572, - "grad_norm": 88.46206995973934, - "learning_rate": 4.4148935037379674e-07, - "logits/chosen": -2.492147922515869, - "logits/rejected": -2.43426775932312, - "logps/chosen": -299.02227783203125, - "logps/rejected": -285.74609375, - "loss": 0.5116, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.0740816593170166, - "rewards/margins": 1.3668744564056396, - "rewards/rejected": -2.440955877304077, + "grad_norm": 108.90343136375496, + "learning_rate": 1.4938170864468636e-08, + "logits/chosen": -2.517470359802246, + "logits/rejected": -2.458211898803711, + "logps/chosen": -289.65374755859375, + "logps/rejected": -274.5342102050781, + "loss": 0.4958, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.1372273564338684, + "rewards/margins": 1.1825356483459473, + "rewards/rejected": -1.3197630643844604, "step": 860 }, { "epoch": 0.9105180533751962, - "grad_norm": 172.42125575951448, - "learning_rate": 4.395165861740413e-07, - "logits/chosen": -2.5659687519073486, - "logits/rejected": -2.498225450515747, - "logps/chosen": -279.9397277832031, - "logps/rejected": -267.6497497558594, - "loss": 0.5475, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.8144890069961548, - "rewards/margins": 1.2995784282684326, - "rewards/rejected": -2.114067554473877, + "grad_norm": 153.358042630482, + "learning_rate": 1.1982873884064465e-08, + "logits/chosen": -2.578099012374878, + "logits/rejected": -2.503145933151245, + "logps/chosen": -271.9369201660156, + "logps/rejected": -258.82427978515625, + "loss": 0.5166, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.014210554771125317, + "rewards/margins": 1.2173117399215698, + "rewards/rejected": -1.2315223217010498, "step": 870 }, { "epoch": 0.9209837781266352, - "grad_norm": 142.93679575396322, - "learning_rate": 4.3751567862572405e-07, - "logits/chosen": -2.5635287761688232, - "logits/rejected": -2.5526645183563232, - "logps/chosen": -310.57391357421875, - "logps/rejected": -312.0995788574219, - "loss": 0.5207, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.6461750864982605, - "rewards/margins": 1.0978899002075195, - "rewards/rejected": -1.7440650463104248, + "grad_norm": 88.58371346729444, + "learning_rate": 9.345903713082304e-09, + "logits/chosen": -2.5621767044067383, + "logits/rejected": -2.545060396194458, + "logps/chosen": -303.8287658691406, + "logps/rejected": -306.15374755859375, + "loss": 0.4932, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": 0.02834125980734825, + "rewards/margins": 1.1778244972229004, + "rewards/rejected": -1.149483323097229, "step": 880 }, { "epoch": 0.9314495028780743, - "grad_norm": 92.59614312640235, - "learning_rate": 4.354869248650466e-07, - "logits/chosen": -2.5237598419189453, - "logits/rejected": -2.5179736614227295, - "logps/chosen": -293.27667236328125, - "logps/rejected": -297.16265869140625, - "loss": 0.5403, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.5625289678573608, - "rewards/margins": 1.1325795650482178, - "rewards/rejected": -1.6951086521148682, + "grad_norm": 123.70760581405742, + "learning_rate": 7.030787065396865e-09, + "logits/chosen": -2.5063490867614746, + "logits/rejected": -2.4980926513671875, + "logps/chosen": -288.7598571777344, + "logps/rejected": -291.5402526855469, + "loss": 0.5139, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.11085305362939835, + "rewards/margins": 1.0220136642456055, + "rewards/rejected": -1.132866621017456, "step": 890 }, { "epoch": 0.9419152276295133, - "grad_norm": 158.52814706340763, - "learning_rate": 4.3343062616339306e-07, - "logits/chosen": -2.505187749862671, - "logits/rejected": -2.5016567707061768, - "logps/chosen": -284.1822204589844, - "logps/rejected": -296.31854248046875, - "loss": 0.5097, + "grad_norm": 123.75723552086023, + "learning_rate": 5.04062020432286e-09, + "logits/chosen": -2.5003843307495117, + "logits/rejected": -2.4942610263824463, + "logps/chosen": -281.16094970703125, + "logps/rejected": -294.83892822265625, + "loss": 0.4722, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.38374561071395874, - "rewards/margins": 1.1031537055969238, - "rewards/rejected": -1.486899495124817, + "rewards/chosen": -0.08161640912294388, + "rewards/margins": 1.2573249340057373, + "rewards/rejected": -1.3389413356781006, "step": 900 }, { "epoch": 0.9419152276295133, - "eval_logits/chosen": -2.582000970840454, - "eval_logits/rejected": -2.5297691822052, - "eval_logps/chosen": -283.95635986328125, - "eval_logps/rejected": -274.88885498046875, - "eval_loss": 0.5622031688690186, - "eval_rewards/accuracies": 0.7698412537574768, - "eval_rewards/chosen": -0.2002360075712204, - "eval_rewards/margins": 1.2667559385299683, - "eval_rewards/rejected": -1.4669920206069946, - "eval_runtime": 195.8426, - "eval_samples_per_second": 10.212, - "eval_steps_per_second": 0.322, + "eval_logits/chosen": -2.590116024017334, + "eval_logits/rejected": -2.5362370014190674, + "eval_logps/chosen": -282.11407470703125, + "eval_logps/rejected": -272.7223205566406, + "eval_loss": 0.5279979109764099, + "eval_rewards/accuracies": 0.7797619104385376, + "eval_rewards/chosen": -0.01600908488035202, + "eval_rewards/margins": 1.2343299388885498, + "eval_rewards/rejected": -1.2503389120101929, + "eval_runtime": 207.5627, + "eval_samples_per_second": 9.636, + "eval_steps_per_second": 0.304, "step": 900 }, { "epoch": 0.9523809523809523, - "grad_norm": 121.79029128909083, - "learning_rate": 4.31347087882591e-07, - "logits/chosen": -2.46720552444458, - "logits/rejected": -2.441413164138794, - "logps/chosen": -224.29696655273438, - "logps/rejected": -225.60366821289062, - "loss": 0.5443, + "grad_norm": 117.42904817244555, + "learning_rate": 3.3780648016376866e-09, + "logits/chosen": -2.4798412322998047, + "logits/rejected": -2.4555060863494873, + "logps/chosen": -224.1981658935547, + "logps/rejected": -224.94448852539062, + "loss": 0.4853, "rewards/accuracies": 0.78125, - "rewards/chosen": -0.08442769944667816, - "rewards/margins": 1.188298225402832, - "rewards/rejected": -1.2727259397506714, + "rewards/chosen": -0.0745500773191452, + "rewards/margins": 1.1322568655014038, + "rewards/rejected": -1.206807017326355, "step": 910 }, { "epoch": 0.9628466771323915, - "grad_norm": 116.42372556269152, - "learning_rate": 4.2923661942956515e-07, - "logits/chosen": -2.5624608993530273, - "logits/rejected": -2.5129048824310303, - "logps/chosen": -286.4587097167969, - "logps/rejected": -264.3240966796875, - "loss": 0.547, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": 0.018908865749835968, - "rewards/margins": 1.2915865182876587, - "rewards/rejected": -1.2726776599884033, + "grad_norm": 118.07550601245399, + "learning_rate": 2.0453443778310766e-09, + "logits/chosen": -2.59162974357605, + "logits/rejected": -2.536531925201416, + "logps/chosen": -287.6225280761719, + "logps/rejected": -264.23828125, + "loss": 0.5345, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.09747320413589478, + "rewards/margins": 1.166621208190918, + "rewards/rejected": -1.264094591140747, "step": 920 }, { "epoch": 0.9733124018838305, - "grad_norm": 78.35136647705365, - "learning_rate": 4.270995342103896e-07, - "logits/chosen": -2.520020008087158, - "logits/rejected": -2.4239940643310547, - "logps/chosen": -312.50701904296875, - "logps/rejected": -276.9075622558594, - "loss": 0.4914, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.22941641509532928, - "rewards/margins": 1.4956104755401611, - "rewards/rejected": -1.2661937475204468, + "grad_norm": 76.103065820101, + "learning_rate": 1.0442413283435758e-09, + "logits/chosen": -2.567437171936035, + "logits/rejected": -2.467308521270752, + "logps/chosen": -315.1402893066406, + "logps/rejected": -279.08343505859375, + "loss": 0.4831, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.03391196206212044, + "rewards/margins": 1.449866533279419, + "rewards/rejected": -1.4837785959243774, "step": 930 }, { "epoch": 0.9837781266352695, - "grad_norm": 109.1159513218319, - "learning_rate": 4.2493614958374774e-07, - "logits/chosen": -2.52817964553833, - "logits/rejected": -2.499875068664551, - "logps/chosen": -262.322509765625, - "logps/rejected": -271.85650634765625, - "loss": 0.4871, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.08858039230108261, - "rewards/margins": 1.3023322820663452, - "rewards/rejected": -1.2137519121170044, + "grad_norm": 105.2080573269562, + "learning_rate": 3.760945397705828e-10, + "logits/chosen": -2.5792009830474854, + "logits/rejected": -2.542511463165283, + "logps/chosen": -261.95806884765625, + "logps/rejected": -271.60552978515625, + "loss": 0.4913, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.12502527236938477, + "rewards/margins": 1.313679575920105, + "rewards/rejected": -1.1886541843414307, "step": 940 }, { "epoch": 0.9942438513867086, - "grad_norm": 107.81485992510846, - "learning_rate": 4.227467868138035e-07, - "logits/chosen": -2.5245165824890137, - "logits/rejected": -2.5067315101623535, - "logps/chosen": -304.98712158203125, - "logps/rejected": -299.4005126953125, - "loss": 0.4825, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.2150495946407318, - "rewards/margins": 1.2283554077148438, - "rewards/rejected": -1.443405032157898, + "grad_norm": 121.4273155412146, + "learning_rate": 4.17975992204056e-11, + "logits/chosen": -2.5649020671844482, + "logits/rejected": -2.5450055599212646, + "logps/chosen": -303.64208984375, + "logps/rejected": -295.95794677734375, + "loss": 0.4961, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.08054888248443604, + "rewards/margins": 1.018601417541504, + "rewards/rejected": -1.0991504192352295, "step": 950 }, { - "epoch": 1.0047095761381475, - "grad_norm": 50.46088335404094, - "learning_rate": 4.2053177102249374e-07, - "logits/chosen": -2.511298418045044, - "logits/rejected": -2.5007259845733643, - "logps/chosen": -276.84228515625, - "logps/rejected": -267.48919677734375, - "loss": 0.3287, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": 0.3138599097728729, - "rewards/margins": 2.2567601203918457, - "rewards/rejected": -1.9429000616073608, - "step": 960 - }, - { - "epoch": 1.0151753008895865, - "grad_norm": 27.442740844820033, - "learning_rate": 4.182914311412473e-07, - "logits/chosen": -2.5494847297668457, - "logits/rejected": -2.4884865283966064, - "logps/chosen": -302.3250427246094, - "logps/rejected": -312.5084228515625, - "loss": 0.1001, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.2510488033294678, - "rewards/margins": 5.093658447265625, - "rewards/rejected": -3.842609405517578, - "step": 970 - }, - { - "epoch": 1.0256410256410255, - "grad_norm": 80.00922596327213, - "learning_rate": 4.1602609986213865e-07, - "logits/chosen": -2.538651704788208, - "logits/rejected": -2.4874634742736816, - "logps/chosen": -256.922607421875, - "logps/rejected": -267.7474670410156, - "loss": 0.1043, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": 0.2808031737804413, - "rewards/margins": 4.430321216583252, - "rewards/rejected": -4.149518013000488, - "step": 980 - }, - { - "epoch": 1.0361067503924646, - "grad_norm": 23.7454721878054, - "learning_rate": 4.1373611358848237e-07, - "logits/chosen": -2.6277410984039307, - "logits/rejected": -2.567559242248535, - "logps/chosen": -282.07623291015625, - "logps/rejected": -295.5177307128906, - "loss": 0.1006, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.47978025674819946, - "rewards/margins": 4.933806419372559, - "rewards/rejected": -4.454026222229004, - "step": 990 - }, - { - "epoch": 1.0465724751439036, - "grad_norm": 21.033765214056462, - "learning_rate": 4.114218123848777e-07, - "logits/chosen": -2.6434526443481445, - "logits/rejected": -2.555600166320801, - "logps/chosen": -293.57879638671875, - "logps/rejected": -307.98382568359375, - "loss": 0.1144, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": 1.0712214708328247, - "rewards/margins": 4.741218090057373, - "rewards/rejected": -3.669996976852417, - "step": 1000 - }, - { - "epoch": 1.0465724751439036, - "eval_logits/chosen": -2.608029365539551, - "eval_logits/rejected": -2.549466371536255, - "eval_logps/chosen": -284.90142822265625, - "eval_logps/rejected": -278.99273681640625, - "eval_loss": 0.5713591575622559, - "eval_rewards/accuracies": 0.7638888955116272, - "eval_rewards/chosen": -0.2947400212287903, - "eval_rewards/margins": 1.582641363143921, - "eval_rewards/rejected": -1.8773810863494873, - "eval_runtime": 195.3231, - "eval_samples_per_second": 10.239, - "eval_steps_per_second": 0.323, - "step": 1000 - }, - { - "epoch": 1.0570381998953426, - "grad_norm": 44.560432953678024, - "learning_rate": 4.090835399267082e-07, - "logits/chosen": -2.564392566680908, - "logits/rejected": -2.4928183555603027, - "logps/chosen": -260.095458984375, - "logps/rejected": -265.59423828125, - "loss": 0.1385, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.9047374725341797, - "rewards/margins": 4.782032489776611, - "rewards/rejected": -3.8772950172424316, - "step": 1010 - }, - { - "epoch": 1.0675039246467817, - "grad_norm": 18.87740280109327, - "learning_rate": 4.067216434491059e-07, - "logits/chosen": -2.5619444847106934, - "logits/rejected": -2.5070197582244873, - "logps/chosen": -302.210693359375, - "logps/rejected": -324.6355895996094, - "loss": 0.0931, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.6377789378166199, - "rewards/margins": 5.321119785308838, - "rewards/rejected": -4.683340549468994, - "step": 1020 - }, - { - "epoch": 1.077969649398221, - "grad_norm": 21.19160105675347, - "learning_rate": 4.0433647369538657e-07, - "logits/chosen": -2.571237087249756, - "logits/rejected": -2.521949052810669, - "logps/chosen": -272.63629150390625, - "logps/rejected": -305.75042724609375, - "loss": 0.0921, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.30905666947364807, - "rewards/margins": 5.351611137390137, - "rewards/rejected": -5.042553901672363, - "step": 1030 - }, - { - "epoch": 1.08843537414966, - "grad_norm": 29.65068310700299, - "learning_rate": 4.019283848649643e-07, - "logits/chosen": -2.555471658706665, - "logits/rejected": -2.458915948867798, - "logps/chosen": -254.86093139648438, - "logps/rejected": -280.85406494140625, - "loss": 0.1119, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": 0.7559573650360107, - "rewards/margins": 4.866219520568848, - "rewards/rejected": -4.110262393951416, - "step": 1040 - }, - { - "epoch": 1.098901098901099, - "grad_norm": 32.51020918580123, - "learning_rate": 3.9949773456075215e-07, - "logits/chosen": -2.5701072216033936, - "logits/rejected": -2.50352144241333, - "logps/chosen": -243.7562255859375, - "logps/rejected": -282.5508728027344, - "loss": 0.0846, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.8431166410446167, - "rewards/margins": 5.474893569946289, - "rewards/rejected": -4.631776332855225, - "step": 1050 - }, - { - "epoch": 1.109366823652538, - "grad_norm": 38.488350951276, - "learning_rate": 3.9704488373605844e-07, - "logits/chosen": -2.532349109649658, - "logits/rejected": -2.4308629035949707, - "logps/chosen": -260.8913269042969, - "logps/rejected": -285.77276611328125, - "loss": 0.1018, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.9833381772041321, - "rewards/margins": 5.060244083404541, - "rewards/rejected": -4.076905727386475, - "step": 1060 - }, - { - "epoch": 1.119832548403977, - "grad_norm": 66.60901013561211, - "learning_rate": 3.9457019664098455e-07, - "logits/chosen": -2.5843162536621094, - "logits/rejected": -2.499265670776367, - "logps/chosen": -272.5025939941406, - "logps/rejected": -289.4697570800781, - "loss": 0.1227, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": 0.6055439710617065, - "rewards/margins": 5.325932502746582, - "rewards/rejected": -4.720389366149902, - "step": 1070 - }, - { - "epoch": 1.130298273155416, - "grad_norm": 54.60599750254155, - "learning_rate": 3.920740407683337e-07, - "logits/chosen": -2.5222134590148926, - "logits/rejected": -2.504809617996216, - "logps/chosen": -249.7719268798828, - "logps/rejected": -309.17645263671875, - "loss": 0.1273, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": 0.009540790691971779, - "rewards/margins": 5.185042858123779, - "rewards/rejected": -5.175501823425293, - "step": 1080 - }, - { - "epoch": 1.1407639979068551, - "grad_norm": 12.663744761283516, - "learning_rate": 3.895567867990379e-07, - "logits/chosen": -2.59201717376709, - "logits/rejected": -2.5475192070007324, - "logps/chosen": -271.7298889160156, - "logps/rejected": -310.5108337402344, - "loss": 0.102, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": 0.14892436563968658, - "rewards/margins": 5.022067546844482, - "rewards/rejected": -4.873143196105957, - "step": 1090 - }, - { - "epoch": 1.1512297226582942, - "grad_norm": 27.975053801647217, - "learning_rate": 3.8701880854711134e-07, - "logits/chosen": -2.5267632007598877, - "logits/rejected": -2.469308376312256, - "logps/chosen": -278.9061584472656, - "logps/rejected": -303.2061462402344, - "loss": 0.087, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.6444324254989624, - "rewards/margins": 5.443302154541016, - "rewards/rejected": -4.798869609832764, - "step": 1100 - }, - { - "epoch": 1.1512297226582942, - "eval_logits/chosen": -2.5698976516723633, - "eval_logits/rejected": -2.5036232471466064, - "eval_logps/chosen": -288.8863830566406, - "eval_logps/rejected": -286.51995849609375, - "eval_loss": 0.5959874987602234, - "eval_rewards/accuracies": 0.783730149269104, - "eval_rewards/chosen": -0.6932358145713806, - "eval_rewards/margins": 1.9368664026260376, - "eval_rewards/rejected": -2.6301023960113525, - "eval_runtime": 192.6966, - "eval_samples_per_second": 10.379, - "eval_steps_per_second": 0.327, - "step": 1100 - }, - { - "epoch": 1.1616954474097332, - "grad_norm": 19.596534344919895, - "learning_rate": 3.844604829041395e-07, - "logits/chosen": -2.5460801124572754, - "logits/rejected": -2.445225477218628, - "logps/chosen": -251.8176727294922, - "logps/rejected": -285.4109191894531, - "loss": 0.0692, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 0.9998615980148315, - "rewards/margins": 5.435625076293945, - "rewards/rejected": -4.435763359069824, - "step": 1110 - }, - { - "epoch": 1.1721611721611722, - "grad_norm": 42.58016727610291, - "learning_rate": 3.8188218978330947e-07, - "logits/chosen": -2.534914970397949, - "logits/rejected": -2.478555202484131, - "logps/chosen": -267.4512634277344, - "logps/rejected": -303.4002380371094, - "loss": 0.1227, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 0.9088047742843628, - "rewards/margins": 5.357703685760498, - "rewards/rejected": -4.448899269104004, - "step": 1120 - }, - { - "epoch": 1.1826268969126112, - "grad_norm": 22.48299581222999, - "learning_rate": 3.792843120629935e-07, - "logits/chosen": -2.5710811614990234, - "logits/rejected": -2.4828689098358154, - "logps/chosen": -276.33062744140625, - "logps/rejected": -318.22406005859375, - "loss": 0.0953, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": 0.9937319755554199, - "rewards/margins": 5.263495445251465, - "rewards/rejected": -4.269762992858887, - "step": 1130 - }, - { - "epoch": 1.1930926216640503, - "grad_norm": 24.33337948216996, - "learning_rate": 3.7666723552989066e-07, - "logits/chosen": -2.5102903842926025, - "logits/rejected": -2.472074031829834, - "logps/chosen": -292.93792724609375, - "logps/rejected": -316.83441162109375, - "loss": 0.101, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.5052931308746338, - "rewards/margins": 5.799196243286133, - "rewards/rejected": -4.293902397155762, - "step": 1140 - }, - { - "epoch": 1.2035583464154893, - "grad_norm": 18.383546295236997, - "learning_rate": 3.7403134882173725e-07, - "logits/chosen": -2.5045714378356934, - "logits/rejected": -2.4667632579803467, - "logps/chosen": -257.2259216308594, - "logps/rejected": -312.82965087890625, - "loss": 0.0831, - "rewards/accuracies": 0.96875, - "rewards/chosen": 1.1549804210662842, - "rewards/margins": 5.819235801696777, - "rewards/rejected": -4.6642560958862305, - "step": 1150 - }, - { - "epoch": 1.2140240711669283, - "grad_norm": 119.62235983983288, - "learning_rate": 3.713770433695946e-07, - "logits/chosen": -2.516573905944824, - "logits/rejected": -2.401930570602417, - "logps/chosen": -269.4717102050781, - "logps/rejected": -293.58111572265625, - "loss": 0.1385, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.3582627773284912, - "rewards/margins": 5.877196788787842, - "rewards/rejected": -5.5189337730407715, - "step": 1160 - }, - { - "epoch": 1.2244897959183674, - "grad_norm": 18.315344691249365, - "learning_rate": 3.687047133397201e-07, - "logits/chosen": -2.491701126098633, - "logits/rejected": -2.448899984359741, - "logps/chosen": -261.7595520019531, - "logps/rejected": -290.7538757324219, - "loss": 0.0949, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.5927351713180542, - "rewards/margins": 5.714540958404541, - "rewards/rejected": -5.1218061447143555, - "step": 1170 - }, - { - "epoch": 1.2349555206698064, - "grad_norm": 51.3201349443732, - "learning_rate": 3.6601475557503407e-07, - "logits/chosen": -2.524080753326416, - "logits/rejected": -2.4736952781677246, - "logps/chosen": -273.5008850097656, - "logps/rejected": -296.5593566894531, - "loss": 0.1184, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.5895344018936157, - "rewards/margins": 4.980040550231934, - "rewards/rejected": -4.390506267547607, - "step": 1180 - }, - { - "epoch": 1.2454212454212454, - "grad_norm": 39.70185923450293, - "learning_rate": 3.633075695361881e-07, - "logits/chosen": -2.565307855606079, - "logits/rejected": -2.4783263206481934, - "logps/chosen": -280.61737060546875, - "logps/rejected": -301.8477783203125, - "loss": 0.101, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.5513092875480652, - "rewards/margins": 5.298879623413086, - "rewards/rejected": -4.747570037841797, - "step": 1190 - }, - { - "epoch": 1.2558869701726845, - "grad_norm": 49.898229559254574, - "learning_rate": 3.6058355724224475e-07, - "logits/chosen": -2.471615791320801, - "logits/rejected": -2.4181594848632812, - "logps/chosen": -323.2919921875, - "logps/rejected": -349.9242248535156, - "loss": 0.1122, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": 0.39504244923591614, - "rewards/margins": 5.795102119445801, - "rewards/rejected": -5.400059700012207, - "step": 1200 - }, - { - "epoch": 1.2558869701726845, - "eval_logits/chosen": -2.476499080657959, - "eval_logits/rejected": -2.4062724113464355, - "eval_logps/chosen": -297.6088562011719, - "eval_logps/rejected": -296.8384094238281, - "eval_loss": 0.6133444309234619, - "eval_rewards/accuracies": 0.7539682388305664, - "eval_rewards/chosen": -1.565487265586853, - "eval_rewards/margins": 2.096463203430176, - "eval_rewards/rejected": -3.6619503498077393, - "eval_runtime": 193.7272, - "eval_samples_per_second": 10.324, - "eval_steps_per_second": 0.325, - "step": 1200 - }, - { - "epoch": 1.2663526949241235, - "grad_norm": 46.48432196843033, - "learning_rate": 3.578431232109774e-07, - "logits/chosen": -2.5137457847595215, - "logits/rejected": -2.385891914367676, - "logps/chosen": -340.22967529296875, - "logps/rejected": -326.50567626953125, - "loss": 0.0996, - "rewards/accuracies": 0.96875, - "rewards/chosen": 0.1203179806470871, - "rewards/margins": 5.927081108093262, - "rewards/rejected": -5.806763172149658, - "step": 1210 - }, - { - "epoch": 1.2768184196755625, - "grad_norm": 44.38844795235724, - "learning_rate": 3.5508667439879935e-07, - "logits/chosen": -2.4454009532928467, - "logits/rejected": -2.3734889030456543, - "logps/chosen": -297.68267822265625, - "logps/rejected": -300.8840026855469, - "loss": 0.1137, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": 0.07286659628152847, - "rewards/margins": 5.4494757652282715, - "rewards/rejected": -5.3766093254089355, - "step": 1220 - }, - { - "epoch": 1.2872841444270016, - "grad_norm": 27.61446068416563, - "learning_rate": 3.523146201403302e-07, - "logits/chosen": -2.458700656890869, - "logits/rejected": -2.406069755554199, - "logps/chosen": -260.40350341796875, - "logps/rejected": -278.19775390625, - "loss": 0.0992, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.23116512596607208, - "rewards/margins": 5.250620365142822, - "rewards/rejected": -5.019455432891846, - "step": 1230 - }, - { - "epoch": 1.2977498691784406, - "grad_norm": 47.38326324552, - "learning_rate": 3.4952737208760944e-07, - "logits/chosen": -2.416764736175537, - "logits/rejected": -2.358327865600586, - "logps/chosen": -251.84603881835938, - "logps/rejected": -289.4906921386719, - "loss": 0.1261, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": 0.7051903009414673, - "rewards/margins": 5.252205848693848, - "rewards/rejected": -4.547015190124512, - "step": 1240 - }, - { - "epoch": 1.3082155939298796, - "grad_norm": 58.42403978763642, - "learning_rate": 3.4672534414896613e-07, - "logits/chosen": -2.458853244781494, - "logits/rejected": -2.362720012664795, - "logps/chosen": -283.8580017089844, - "logps/rejected": -293.7142639160156, - "loss": 0.0983, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": 0.9454904794692993, - "rewards/margins": 5.443618297576904, - "rewards/rejected": -4.4981279373168945, - "step": 1250 - }, - { - "epoch": 1.3186813186813187, - "grad_norm": 124.79543111458564, - "learning_rate": 3.4390895242755256e-07, - "logits/chosen": -2.4527194499969482, - "logits/rejected": -2.401614189147949, - "logps/chosen": -256.9396667480469, - "logps/rejected": -305.99017333984375, - "loss": 0.1026, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": 0.19380363821983337, - "rewards/margins": 5.0710296630859375, - "rewards/rejected": -4.877225875854492, - "step": 1260 - }, - { - "epoch": 1.3291470434327577, - "grad_norm": 53.969365288650714, - "learning_rate": 3.4107861515955325e-07, - "logits/chosen": -2.406191349029541, - "logits/rejected": -2.3340790271759033, - "logps/chosen": -268.2250061035156, - "logps/rejected": -308.24176025390625, - "loss": 0.0982, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": 0.4892198443412781, - "rewards/margins": 5.7689127922058105, - "rewards/rejected": -5.279693603515625, - "step": 1270 - }, - { - "epoch": 1.3396127681841967, - "grad_norm": 23.925696976595415, - "learning_rate": 3.382347526520765e-07, - "logits/chosen": -2.4319324493408203, - "logits/rejected": -2.300276041030884, - "logps/chosen": -270.14642333984375, - "logps/rejected": -293.6103515625, - "loss": 0.0761, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.40858253836631775, - "rewards/margins": 5.94081449508667, - "rewards/rejected": -5.532231330871582, - "step": 1280 - }, - { - "epoch": 1.3500784929356358, - "grad_norm": 59.86150351928805, - "learning_rate": 3.3537778722073805e-07, - "logits/chosen": -2.4228928089141846, - "logits/rejected": -2.3317923545837402, - "logps/chosen": -285.8216247558594, - "logps/rejected": -307.4306640625, - "loss": 0.0972, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.25407934188842773, - "rewards/margins": 5.552480697631836, - "rewards/rejected": -5.298401832580566, - "step": 1290 - }, - { - "epoch": 1.3605442176870748, - "grad_norm": 50.51655456201444, - "learning_rate": 3.3250814312694747e-07, - "logits/chosen": -2.4078121185302734, - "logits/rejected": -2.3337295055389404, - "logps/chosen": -266.6385803222656, - "logps/rejected": -324.4608459472656, - "loss": 0.1303, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": -0.1993413120508194, - "rewards/margins": 5.515454292297363, - "rewards/rejected": -5.714795112609863, - "step": 1300 - }, - { - "epoch": 1.3605442176870748, - "eval_logits/chosen": -2.4469552040100098, - "eval_logits/rejected": -2.3747079372406006, - "eval_logps/chosen": -299.5291442871094, - "eval_logps/rejected": -297.04644775390625, - "eval_loss": 0.6040070652961731, - "eval_rewards/accuracies": 0.783730149269104, - "eval_rewards/chosen": -1.757515788078308, - "eval_rewards/margins": 1.9252358675003052, - "eval_rewards/rejected": -3.682751417160034, - "eval_runtime": 193.4257, - "eval_samples_per_second": 10.34, - "eval_steps_per_second": 0.326, - "step": 1300 - }, - { - "epoch": 1.3710099424385138, - "grad_norm": 34.42094107606306, - "learning_rate": 3.2962624651490456e-07, - "logits/chosen": -2.4732913970947266, - "logits/rejected": -2.345318078994751, - "logps/chosen": -282.5798645019531, - "logps/rejected": -292.8085632324219, - "loss": 0.1105, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": -0.08338163048028946, - "rewards/margins": 5.2257795333862305, - "rewards/rejected": -5.309161186218262, - "step": 1310 - }, - { - "epoch": 1.3814756671899528, - "grad_norm": 41.60487159244909, - "learning_rate": 3.2673252534831685e-07, - "logits/chosen": -2.402320384979248, - "logits/rejected": -2.3370251655578613, - "logps/chosen": -247.3248291015625, - "logps/rejected": -294.50274658203125, - "loss": 0.0824, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -0.07617280632257462, - "rewards/margins": 5.308609962463379, - "rewards/rejected": -5.384782791137695, - "step": 1320 - }, - { - "epoch": 1.3919413919413919, - "grad_norm": 42.03988467894461, - "learning_rate": 3.2382740934684695e-07, - "logits/chosen": -2.3768885135650635, - "logits/rejected": -2.3301796913146973, - "logps/chosen": -269.8595886230469, - "logps/rejected": -339.62530517578125, - "loss": 0.1227, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": 0.3696175217628479, - "rewards/margins": 5.659558296203613, - "rewards/rejected": -5.28994083404541, - "step": 1330 - }, - { - "epoch": 1.402407116692831, - "grad_norm": 77.1245747683099, - "learning_rate": 3.209113299222982e-07, - "logits/chosen": -2.3635897636413574, - "logits/rejected": -2.269571304321289, - "logps/chosen": -287.84014892578125, - "logps/rejected": -302.1590270996094, - "loss": 0.0855, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.5934638977050781, - "rewards/margins": 5.421599388122559, - "rewards/rejected": -4.828135967254639, - "step": 1340 - }, - { - "epoch": 1.41287284144427, - "grad_norm": 48.52563486984949, - "learning_rate": 3.179847201145505e-07, - "logits/chosen": -2.328540802001953, - "logits/rejected": -2.27812123298645, - "logps/chosen": -263.614990234375, - "logps/rejected": -309.95037841796875, - "loss": 0.0965, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.25371187925338745, - "rewards/margins": 5.66549825668335, - "rewards/rejected": -5.4117865562438965, - "step": 1350 - }, - { - "epoch": 1.423338566195709, - "grad_norm": 53.647777507183335, - "learning_rate": 3.1504801452725276e-07, - "logits/chosen": -2.3678460121154785, - "logits/rejected": -2.2927908897399902, - "logps/chosen": -276.91546630859375, - "logps/rejected": -322.7835693359375, - "loss": 0.1184, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.38571494817733765, - "rewards/margins": 5.667319297790527, - "rewards/rejected": -5.281604290008545, - "step": 1360 - }, - { - "epoch": 1.433804290947148, - "grad_norm": 61.873474693990616, - "learning_rate": 3.121016492632848e-07, - "logits/chosen": -2.2989566326141357, - "logits/rejected": -2.235711097717285, - "logps/chosen": -270.10260009765625, - "logps/rejected": -297.0763244628906, - "loss": 0.1233, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": -0.17630472779273987, - "rewards/margins": 5.2244720458984375, - "rewards/rejected": -5.4007768630981445, - "step": 1370 - }, - { - "epoch": 1.4442700156985873, - "grad_norm": 38.23882234820376, - "learning_rate": 3.091460618599951e-07, - "logits/chosen": -2.316293716430664, - "logits/rejected": -2.254382610321045, - "logps/chosen": -266.8833312988281, - "logps/rejected": -295.21923828125, - "loss": 0.1005, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.2151879370212555, - "rewards/margins": 5.430513381958008, - "rewards/rejected": -5.215325355529785, - "step": 1380 - }, - { - "epoch": 1.454735740450026, - "grad_norm": 39.818438997209, - "learning_rate": 3.0618169122422646e-07, - "logits/chosen": -2.380506753921509, - "logits/rejected": -2.278787136077881, - "logps/chosen": -284.90618896484375, - "logps/rejected": -290.0193786621094, - "loss": 0.1215, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.08821509033441544, - "rewards/margins": 4.949409484863281, - "rewards/rejected": -4.861193656921387, - "step": 1390 - }, - { - "epoch": 1.4652014652014653, - "grad_norm": 45.04129416327594, - "learning_rate": 3.032089775671378e-07, - "logits/chosen": -2.381953239440918, - "logits/rejected": -2.381547451019287, - "logps/chosen": -285.6064147949219, - "logps/rejected": -350.67840576171875, - "loss": 0.0884, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": -0.008377855643630028, - "rewards/margins": 5.490458965301514, - "rewards/rejected": -5.498836994171143, - "step": 1400 - }, - { - "epoch": 1.4652014652014653, - "eval_logits/chosen": -2.4553167819976807, - "eval_logits/rejected": -2.3840415477752686, - "eval_logps/chosen": -296.1571350097656, - "eval_logps/rejected": -292.8251037597656, - "eval_loss": 0.6035022139549255, - "eval_rewards/accuracies": 0.7797619104385376, - "eval_rewards/chosen": -1.4203155040740967, - "eval_rewards/margins": 1.8403037786483765, - "eval_rewards/rejected": -3.2606194019317627, - "eval_runtime": 191.3187, - "eval_samples_per_second": 10.454, - "eval_steps_per_second": 0.329, - "step": 1400 - }, - { - "epoch": 1.4756671899529041, - "grad_norm": 32.4336229298901, - "learning_rate": 3.0022836233883316e-07, - "logits/chosen": -2.4722976684570312, - "logits/rejected": -2.4245364665985107, - "logps/chosen": -266.31195068359375, - "logps/rejected": -314.67242431640625, - "loss": 0.0874, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.17366066575050354, - "rewards/margins": 5.68074893951416, - "rewards/rejected": -5.507088661193848, - "step": 1410 - }, - { - "epoch": 1.4861329147043434, - "grad_norm": 58.40705461879319, - "learning_rate": 2.9724028816280505e-07, - "logits/chosen": -2.374878168106079, - "logits/rejected": -2.3210766315460205, - "logps/chosen": -283.2925720214844, - "logps/rejected": -314.82550048828125, - "loss": 0.097, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.6519819498062134, - "rewards/margins": 6.13173246383667, - "rewards/rejected": -5.479750633239746, - "step": 1420 - }, - { - "epoch": 1.4965986394557822, - "grad_norm": 105.10608440201732, - "learning_rate": 2.942451987702052e-07, - "logits/chosen": -2.4517791271209717, - "logits/rejected": -2.390284538269043, - "logps/chosen": -292.65838623046875, - "logps/rejected": -332.29803466796875, - "loss": 0.1129, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.4461434483528137, - "rewards/margins": 6.248325347900391, - "rewards/rejected": -5.802182197570801, - "step": 1430 - }, - { - "epoch": 1.5070643642072215, - "grad_norm": 55.55020039744771, - "learning_rate": 2.9124353893395036e-07, - "logits/chosen": -2.4190707206726074, - "logits/rejected": -2.338974952697754, - "logps/chosen": -287.1531677246094, - "logps/rejected": -327.5191955566406, - "loss": 0.1102, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.318783164024353, - "rewards/margins": 6.298137187957764, - "rewards/rejected": -5.979353904724121, - "step": 1440 - }, - { - "epoch": 1.5175300889586603, - "grad_norm": 51.17354471400776, - "learning_rate": 2.882357544026722e-07, - "logits/chosen": -2.4564871788024902, - "logits/rejected": -2.3731117248535156, - "logps/chosen": -309.31085205078125, - "logps/rejected": -336.7893371582031, - "loss": 0.1154, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": 0.6580182313919067, - "rewards/margins": 6.26528263092041, - "rewards/rejected": -5.607264995574951, - "step": 1450 - }, - { - "epoch": 1.5279958137100995, - "grad_norm": 68.34311770981486, - "learning_rate": 2.852222918345246e-07, - "logits/chosen": -2.4634900093078613, - "logits/rejected": -2.4269614219665527, - "logps/chosen": -275.28338623046875, - "logps/rejected": -338.93743896484375, - "loss": 0.0784, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": -0.05179625004529953, - "rewards/margins": 5.689550399780273, - "rewards/rejected": -5.741346836090088, - "step": 1460 - }, - { - "epoch": 1.5384615384615383, - "grad_norm": 33.36265269718653, - "learning_rate": 2.822035987308537e-07, - "logits/chosen": -2.558387517929077, - "logits/rejected": -2.51924204826355, - "logps/chosen": -308.51898193359375, - "logps/rejected": -365.987060546875, - "loss": 0.1043, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.17806850373744965, - "rewards/margins": 6.266432762145996, - "rewards/rejected": -6.4445013999938965, - "step": 1470 - }, - { - "epoch": 1.5489272632129776, - "grad_norm": 32.80530058684236, - "learning_rate": 2.791801233697438e-07, - "logits/chosen": -2.493621349334717, - "logits/rejected": -2.4007058143615723, - "logps/chosen": -267.80426025390625, - "logps/rejected": -270.6562194824219, - "loss": 0.1135, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -0.8583701252937317, - "rewards/margins": 5.2491888999938965, - "rewards/rejected": -6.107558727264404, - "step": 1480 - }, - { - "epoch": 1.5593929879644164, - "grad_norm": 25.705212725772036, - "learning_rate": 2.761523147394481e-07, - "logits/chosen": -2.4753096103668213, - "logits/rejected": -2.392556667327881, - "logps/chosen": -295.1965637207031, - "logps/rejected": -316.6902770996094, - "loss": 0.0898, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": -0.2624974250793457, - "rewards/margins": 5.262028694152832, - "rewards/rejected": -5.524526119232178, - "step": 1490 - }, - { - "epoch": 1.5698587127158556, - "grad_norm": 29.00493809729549, - "learning_rate": 2.7312062247171326e-07, - "logits/chosen": -2.4782490730285645, - "logits/rejected": -2.4300010204315186, - "logps/chosen": -288.5757751464844, - "logps/rejected": -342.921875, - "loss": 0.0807, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.026790831238031387, - "rewards/margins": 6.348857879638672, - "rewards/rejected": -6.322066307067871, - "step": 1500 - }, - { - "epoch": 1.5698587127158556, - "eval_logits/chosen": -2.4730660915374756, - "eval_logits/rejected": -2.396198272705078, - "eval_logps/chosen": -300.2314147949219, - "eval_logps/rejected": -299.35986328125, - "eval_loss": 0.6032547354698181, - "eval_rewards/accuracies": 0.7876983880996704, - "eval_rewards/chosen": -1.8277430534362793, - "eval_rewards/margins": 2.086350202560425, - "eval_rewards/rejected": -3.914093494415283, - "eval_runtime": 193.6286, - "eval_samples_per_second": 10.329, - "eval_steps_per_second": 0.325, - "step": 1500 - }, - { - "epoch": 1.5803244374672945, - "grad_norm": 39.13459417429113, - "learning_rate": 2.7008549677500876e-07, - "logits/chosen": -2.486649990081787, - "logits/rejected": -2.4319205284118652, - "logps/chosen": -306.03790283203125, - "logps/rejected": -346.36248779296875, - "loss": 0.0769, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -0.14326974749565125, - "rewards/margins": 5.877261161804199, - "rewards/rejected": -6.020531177520752, - "step": 1510 - }, - { - "epoch": 1.5907901622187337, - "grad_norm": 41.03219380725487, - "learning_rate": 2.670473883676709e-07, - "logits/chosen": -2.3917407989501953, - "logits/rejected": -2.3191471099853516, - "logps/chosen": -287.2149963378906, - "logps/rejected": -343.78857421875, - "loss": 0.0833, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.2559010982513428, - "rewards/margins": 6.58474063873291, - "rewards/rejected": -6.328840255737305, - "step": 1520 - }, - { - "epoch": 1.6012558869701727, - "grad_norm": 12.341459448492067, - "learning_rate": 2.640067484109707e-07, - "logits/chosen": -2.479145050048828, - "logits/rejected": -2.354341745376587, - "logps/chosen": -285.10528564453125, - "logps/rejected": -285.71026611328125, - "loss": 0.077, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.27475470304489136, - "rewards/margins": 5.8808722496032715, - "rewards/rejected": -5.60611629486084, - "step": 1530 - }, - { - "epoch": 1.6117216117216118, - "grad_norm": 23.006939126230655, - "learning_rate": 2.609640284421158e-07, - "logits/chosen": -2.4949090480804443, - "logits/rejected": -2.3601808547973633, - "logps/chosen": -296.86700439453125, - "logps/rejected": -319.2213134765625, - "loss": 0.0803, - "rewards/accuracies": 0.96875, - "rewards/chosen": 0.36580049991607666, - "rewards/margins": 6.237917900085449, - "rewards/rejected": -5.872117042541504, - "step": 1540 - }, - { - "epoch": 1.6221873364730508, - "grad_norm": 28.621662488917778, - "learning_rate": 2.579196803071972e-07, - "logits/chosen": -2.4438681602478027, - "logits/rejected": -2.335808277130127, - "logps/chosen": -305.5447082519531, - "logps/rejected": -356.4336242675781, - "loss": 0.093, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.3216351568698883, - "rewards/margins": 6.547228813171387, - "rewards/rejected": -6.225594520568848, - "step": 1550 - }, - { - "epoch": 1.6326530612244898, - "grad_norm": 66.4897168190457, - "learning_rate": 2.548741560940902e-07, - "logits/chosen": -2.4033031463623047, - "logits/rejected": -2.3117175102233887, - "logps/chosen": -299.1517333984375, - "logps/rejected": -327.88018798828125, - "loss": 0.1107, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.5267201662063599, - "rewards/margins": 6.344514846801758, - "rewards/rejected": -5.817794322967529, - "step": 1560 - }, - { - "epoch": 1.6431187859759289, - "grad_norm": 34.61787633547149, - "learning_rate": 2.518279080653178e-07, - "logits/chosen": -2.453005313873291, - "logits/rejected": -2.315548896789551, - "logps/chosen": -260.16815185546875, - "logps/rejected": -298.7721862792969, - "loss": 0.1008, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.07265006005764008, - "rewards/margins": 6.0829572677612305, - "rewards/rejected": -6.010307312011719, - "step": 1570 - }, - { - "epoch": 1.653584510727368, - "grad_norm": 51.625900209289306, - "learning_rate": 2.487813885908907e-07, - "logits/chosen": -2.4198195934295654, - "logits/rejected": -2.312120199203491, - "logps/chosen": -279.27093505859375, - "logps/rejected": -306.5099792480469, - "loss": 0.1052, - "rewards/accuracies": 0.96875, - "rewards/chosen": 0.3075508177280426, - "rewards/margins": 5.736119270324707, - "rewards/rejected": -5.4285688400268555, - "step": 1580 - }, - { - "epoch": 1.664050235478807, - "grad_norm": 56.34908237360435, - "learning_rate": 2.457350500811292e-07, - "logits/chosen": -2.4117214679718018, - "logits/rejected": -2.316828727722168, - "logps/chosen": -287.1158447265625, - "logps/rejected": -307.75054931640625, - "loss": 0.0895, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.04768422618508339, - "rewards/margins": 5.654821872711182, - "rewards/rejected": -5.607137203216553, - "step": 1590 - }, - { - "epoch": 1.674515960230246, - "grad_norm": 61.225685795529785, - "learning_rate": 2.4268934491948027e-07, - "logits/chosen": -2.4021828174591064, - "logits/rejected": -2.3125617504119873, - "logps/chosen": -301.65057373046875, - "logps/rejected": -314.3941345214844, - "loss": 0.1027, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.3099140524864197, - "rewards/margins": 5.4058637619018555, - "rewards/rejected": -5.715777397155762, - "step": 1600 - }, - { - "epoch": 1.674515960230246, - "eval_logits/chosen": -2.453608512878418, - "eval_logits/rejected": -2.3745977878570557, - "eval_logps/chosen": -295.36798095703125, - "eval_logps/rejected": -293.9023742675781, - "eval_loss": 0.6156607270240784, - "eval_rewards/accuracies": 0.7857142686843872, - "eval_rewards/chosen": -1.3413996696472168, - "eval_rewards/margins": 2.0269458293914795, - "eval_rewards/rejected": -3.368345260620117, - "eval_runtime": 193.6194, - "eval_samples_per_second": 10.33, - "eval_steps_per_second": 0.325, - "step": 1600 - }, - { - "epoch": 1.684981684981685, - "grad_norm": 34.855706161054364, - "learning_rate": 2.396447253953385e-07, - "logits/chosen": -2.4529919624328613, - "logits/rejected": -2.3209946155548096, - "logps/chosen": -307.96795654296875, - "logps/rejected": -346.8514099121094, - "loss": 0.0987, - "rewards/accuracies": 0.96875, - "rewards/chosen": 0.2908426821231842, - "rewards/margins": 5.942322731018066, - "rewards/rejected": -5.651480197906494, - "step": 1610 - }, - { - "epoch": 1.695447409733124, - "grad_norm": 40.23950338201641, - "learning_rate": 2.3660164363687996e-07, - "logits/chosen": -2.429110288619995, - "logits/rejected": -2.344552755355835, - "logps/chosen": -274.6294250488281, - "logps/rejected": -306.53521728515625, - "loss": 0.1027, - "rewards/accuracies": 0.96875, - "rewards/chosen": 0.3539965748786926, - "rewards/margins": 5.67824649810791, - "rewards/rejected": -5.324250221252441, - "step": 1620 - }, - { - "epoch": 1.705913134484563, - "grad_norm": 81.42567945329304, - "learning_rate": 2.33560551543922e-07, - "logits/chosen": -2.405832290649414, - "logits/rejected": -2.3714067935943604, - "logps/chosen": -248.69699096679688, - "logps/rejected": -296.99969482421875, - "loss": 0.1197, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.35418736934661865, - "rewards/margins": 5.518715858459473, - "rewards/rejected": -5.164528846740723, - "step": 1630 - }, - { - "epoch": 1.716378859236002, - "grad_norm": 59.10810526638296, - "learning_rate": 2.3052190072081489e-07, - "logits/chosen": -2.4959118366241455, - "logits/rejected": -2.437864065170288, - "logps/chosen": -296.5282897949219, - "logps/rejected": -358.8733825683594, - "loss": 0.1046, - "rewards/accuracies": 0.96875, - "rewards/chosen": 1.0068142414093018, - "rewards/margins": 6.6231889724731445, - "rewards/rejected": -5.6163740158081055, - "step": 1640 - }, - { - "epoch": 1.7268445839874411, - "grad_norm": 29.236935457263172, - "learning_rate": 2.2748614240937864e-07, - "logits/chosen": -2.3946421146392822, - "logits/rejected": -2.356879234313965, - "logps/chosen": -288.42291259765625, - "logps/rejected": -323.4318542480469, - "loss": 0.0757, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 0.8861092329025269, - "rewards/margins": 6.604270935058594, - "rewards/rejected": -5.718161582946777, - "step": 1650 - }, - { - "epoch": 1.7373103087388801, - "grad_norm": 59.24639025370486, - "learning_rate": 2.2445372742189332e-07, - "logits/chosen": -2.3799214363098145, - "logits/rejected": -2.3509440422058105, - "logps/chosen": -265.31610107421875, - "logps/rejected": -308.52093505859375, - "loss": 0.0939, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.22912676632404327, - "rewards/margins": 5.795788764953613, - "rewards/rejected": -5.566662311553955, - "step": 1660 - }, - { - "epoch": 1.7477760334903192, - "grad_norm": 40.77320234047342, - "learning_rate": 2.2142510607415276e-07, - "logits/chosen": -2.4008963108062744, - "logits/rejected": -2.3019137382507324, - "logps/chosen": -277.7632141113281, - "logps/rejected": -322.4698486328125, - "loss": 0.0878, - "rewards/accuracies": 0.96875, - "rewards/chosen": 0.13216844201087952, - "rewards/margins": 6.392604827880859, - "rewards/rejected": -6.260436534881592, - "step": 1670 - }, - { - "epoch": 1.7582417582417582, - "grad_norm": 33.86820139777179, - "learning_rate": 2.18400728118593e-07, - "logits/chosen": -2.360358715057373, - "logits/rejected": -2.309007167816162, - "logps/chosen": -249.20718383789062, - "logps/rejected": -292.47479248046875, - "loss": 0.0914, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.06997944414615631, - "rewards/margins": 6.330134391784668, - "rewards/rejected": -6.26015567779541, - "step": 1680 - }, - { - "epoch": 1.7687074829931972, - "grad_norm": 48.17300599757862, - "learning_rate": 2.1538104267750283e-07, - "logits/chosen": -2.442073106765747, - "logits/rejected": -2.325488328933716, - "logps/chosen": -298.17694091796875, - "logps/rejected": -304.7314147949219, - "loss": 0.0866, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.5448241233825684, - "rewards/margins": 6.185843467712402, - "rewards/rejected": -5.64102029800415, - "step": 1690 - }, - { - "epoch": 1.7791732077446363, - "grad_norm": 71.33509713420423, - "learning_rate": 2.123664981763295e-07, - "logits/chosen": -2.4249722957611084, - "logits/rejected": -2.3569416999816895, - "logps/chosen": -267.69305419921875, - "logps/rejected": -307.974609375, - "loss": 0.0989, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.22023609280586243, - "rewards/margins": 5.313014030456543, - "rewards/rejected": -5.092778205871582, - "step": 1700 - }, - { - "epoch": 1.7791732077446363, - "eval_logits/chosen": -2.4547817707061768, - "eval_logits/rejected": -2.3749890327453613, - "eval_logps/chosen": -296.099609375, - "eval_logps/rejected": -296.1083068847656, - "eval_loss": 0.6008522510528564, - "eval_rewards/accuracies": 0.7916666865348816, - "eval_rewards/chosen": -1.4145591259002686, - "eval_rewards/margins": 2.174380302429199, - "eval_rewards/rejected": -3.5889394283294678, - "eval_runtime": 191.8668, - "eval_samples_per_second": 10.424, - "eval_steps_per_second": 0.328, - "step": 1700 - }, - { - "epoch": 1.7896389324960753, - "grad_norm": 47.57590603677416, - "learning_rate": 2.0935754227708716e-07, - "logits/chosen": -2.453197956085205, - "logits/rejected": -2.342824935913086, - "logps/chosen": -315.5529479980469, - "logps/rejected": -307.0747375488281, - "loss": 0.1077, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.2278057038784027, - "rewards/margins": 5.9148478507995605, - "rewards/rejected": -5.687043190002441, - "step": 1710 - }, - { - "epoch": 1.8001046572475143, - "grad_norm": 38.626666570004126, - "learning_rate": 2.0635462181187827e-07, - "logits/chosen": -2.38702392578125, - "logits/rejected": -2.3546884059906006, - "logps/chosen": -292.1382751464844, - "logps/rejected": -320.6051330566406, - "loss": 0.0806, - "rewards/accuracies": 0.96875, - "rewards/chosen": 0.27204078435897827, - "rewards/margins": 5.993593692779541, - "rewards/rejected": -5.721553802490234, - "step": 1720 - }, - { - "epoch": 1.8105703819989536, - "grad_norm": 31.445724457121557, - "learning_rate": 2.0335818271653914e-07, - "logits/chosen": -2.4119772911071777, - "logits/rejected": -2.32378888130188, - "logps/chosen": -282.2355651855469, - "logps/rejected": -308.63714599609375, - "loss": 0.0947, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": 0.42752543091773987, - "rewards/margins": 5.950605869293213, - "rewards/rejected": -5.523080825805664, - "step": 1730 - }, - { - "epoch": 1.8210361067503924, - "grad_norm": 47.82561883326015, - "learning_rate": 2.0036866996441814e-07, - "logits/chosen": -2.427154064178467, - "logits/rejected": -2.3411049842834473, - "logps/chosen": -300.0530700683594, - "logps/rejected": -309.24481201171875, - "loss": 0.0861, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -0.07376101613044739, - "rewards/margins": 5.65637731552124, - "rewards/rejected": -5.730139255523682, - "step": 1740 - }, - { - "epoch": 1.8315018315018317, - "grad_norm": 44.905680663643835, - "learning_rate": 1.9738652750029675e-07, - "logits/chosen": -2.4551262855529785, - "logits/rejected": -2.3360395431518555, - "logps/chosen": -296.11468505859375, - "logps/rejected": -302.42071533203125, - "loss": 0.11, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": -0.3061297833919525, - "rewards/margins": 5.58739709854126, - "rewards/rejected": -5.893526077270508, - "step": 1750 - }, - { - "epoch": 1.8419675562532705, - "grad_norm": 35.45129484378483, - "learning_rate": 1.9441219817446307e-07, - "logits/chosen": -2.398743152618408, - "logits/rejected": -2.345670700073242, - "logps/chosen": -276.93304443359375, - "logps/rejected": -317.46563720703125, - "loss": 0.0837, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.40583688020706177, - "rewards/margins": 5.691088676452637, - "rewards/rejected": -6.096926212310791, - "step": 1760 - }, - { - "epoch": 1.8524332810047097, - "grad_norm": 30.223237604432892, - "learning_rate": 1.9144612367694884e-07, - "logits/chosen": -2.466519594192505, - "logits/rejected": -2.3764290809631348, - "logps/chosen": -312.2805480957031, - "logps/rejected": -322.6864929199219, - "loss": 0.0644, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": 0.20971712470054626, - "rewards/margins": 6.1681904792785645, - "rewards/rejected": -5.958473205566406, - "step": 1770 - }, - { - "epoch": 1.8628990057561485, - "grad_norm": 28.54612239609294, - "learning_rate": 1.8848874447193802e-07, - "logits/chosen": -2.473593235015869, - "logits/rejected": -2.3854522705078125, - "logps/chosen": -303.9466552734375, - "logps/rejected": -358.9847412109375, - "loss": 0.09, - "rewards/accuracies": 0.96875, - "rewards/chosen": 0.4601580500602722, - "rewards/margins": 6.7925896644592285, - "rewards/rejected": -6.332431316375732, - "step": 1780 - }, - { - "epoch": 1.8733647305075878, - "grad_norm": 46.66403544331058, - "learning_rate": 1.855404997323571e-07, - "logits/chosen": -2.400825023651123, - "logits/rejected": -2.321739673614502, - "logps/chosen": -280.7015686035156, - "logps/rejected": -310.62188720703125, - "loss": 0.0994, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.31265783309936523, - "rewards/margins": 6.252267360687256, - "rewards/rejected": -5.939610481262207, - "step": 1790 - }, - { - "epoch": 1.8838304552590266, - "grad_norm": 80.41499572795993, - "learning_rate": 1.8260182727465797e-07, - "logits/chosen": -2.386293888092041, - "logits/rejected": -2.3238525390625, - "logps/chosen": -258.31787109375, - "logps/rejected": -297.99102783203125, - "loss": 0.0945, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.28738999366760254, - "rewards/margins": 6.135641574859619, - "rewards/rejected": -5.848252296447754, - "step": 1800 - }, - { - "epoch": 1.8838304552590266, - "eval_logits/chosen": -2.482546329498291, - "eval_logits/rejected": -2.40509295463562, - "eval_logps/chosen": -293.2389831542969, - "eval_logps/rejected": -293.4879455566406, - "eval_loss": 0.6108795404434204, - "eval_rewards/accuracies": 0.7876983880996704, - "eval_rewards/chosen": -1.128499984741211, - "eval_rewards/margins": 2.1984012126922607, - "eval_rewards/rejected": -3.3269011974334717, - "eval_runtime": 193.8616, - "eval_samples_per_second": 10.317, - "eval_steps_per_second": 0.325, - "step": 1800 - }, - { - "epoch": 1.8942961800104658, - "grad_norm": 27.383614395627195, - "learning_rate": 1.7967316349380214e-07, - "logits/chosen": -2.412998676300049, - "logits/rejected": -2.3707902431488037, - "logps/chosen": -257.7221374511719, - "logps/rejected": -307.52484130859375, - "loss": 0.0943, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.579398512840271, - "rewards/margins": 6.259312629699707, - "rewards/rejected": -5.6799139976501465, - "step": 1810 - }, - { - "epoch": 1.9047619047619047, - "grad_norm": 48.266806873504315, - "learning_rate": 1.7675494329845513e-07, - "logits/chosen": -2.436744451522827, - "logits/rejected": -2.345109701156616, - "logps/chosen": -266.7190246582031, - "logps/rejected": -300.0904235839844, - "loss": 0.1103, - "rewards/accuracies": 0.96875, - "rewards/chosen": -0.014025735668838024, - "rewards/margins": 5.596121311187744, - "rewards/rejected": -5.610146522521973, - "step": 1820 - }, - { - "epoch": 1.915227629513344, - "grad_norm": 71.1074135786704, - "learning_rate": 1.738476000464026e-07, - "logits/chosen": -2.357640504837036, - "logits/rejected": -2.3518869876861572, - "logps/chosen": -255.0594024658203, - "logps/rejected": -329.00897216796875, - "loss": 0.1026, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": -0.4222487807273865, - "rewards/margins": 5.6480183601379395, - "rewards/rejected": -6.0702667236328125, - "step": 1830 - }, - { - "epoch": 1.9256933542647827, - "grad_norm": 55.65973781688108, - "learning_rate": 1.7095156548019647e-07, - "logits/chosen": -2.4681243896484375, - "logits/rejected": -2.3697564601898193, - "logps/chosen": -297.0665588378906, - "logps/rejected": -322.86883544921875, - "loss": 0.0862, - "rewards/accuracies": 0.96875, - "rewards/chosen": 0.17328354716300964, - "rewards/margins": 6.246537208557129, - "rewards/rejected": -6.073253631591797, - "step": 1840 - }, - { - "epoch": 1.936159079016222, - "grad_norm": 28.817492756337316, - "learning_rate": 1.680672696630406e-07, - "logits/chosen": -2.4129433631896973, - "logits/rejected": -2.3727798461914062, - "logps/chosen": -272.39434814453125, - "logps/rejected": -314.386474609375, - "loss": 0.0882, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.09276209771633148, - "rewards/margins": 6.109572410583496, - "rewards/rejected": -6.016811370849609, - "step": 1850 - }, - { - "epoch": 1.9466248037676608, - "grad_norm": 68.40192852895274, - "learning_rate": 1.6519514091492623e-07, - "logits/chosen": -2.426889657974243, - "logits/rejected": -2.355790615081787, - "logps/chosen": -260.6510925292969, - "logps/rejected": -285.44830322265625, - "loss": 0.0909, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": -0.22806808352470398, - "rewards/margins": 5.616031169891357, - "rewards/rejected": -5.844099998474121, - "step": 1860 - }, - { - "epoch": 1.9570905285191, - "grad_norm": 37.94842058687102, - "learning_rate": 1.6233560574902626e-07, - "logits/chosen": -2.4496002197265625, - "logits/rejected": -2.332244396209717, - "logps/chosen": -328.71527099609375, - "logps/rejected": -331.65313720703125, - "loss": 0.0766, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.32529911398887634, - "rewards/margins": 6.428679466247559, - "rewards/rejected": -6.103381156921387, - "step": 1870 - }, - { - "epoch": 1.9675562532705388, - "grad_norm": 30.54467205248471, - "learning_rate": 1.594890888083575e-07, - "logits/chosen": -2.4548861980438232, - "logits/rejected": -2.358835458755493, - "logps/chosen": -285.5658874511719, - "logps/rejected": -315.0328674316406, - "loss": 0.1235, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": -0.2686571478843689, - "rewards/margins": 5.998961925506592, - "rewards/rejected": -6.267618656158447, - "step": 1880 - }, - { - "epoch": 1.978021978021978, - "grad_norm": 87.06094295773572, - "learning_rate": 1.5665601280272123e-07, - "logits/chosen": -2.46909236907959, - "logits/rejected": -2.3773446083068848, - "logps/chosen": -280.7239074707031, - "logps/rejected": -317.36962890625, - "loss": 0.1072, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.33357730507850647, - "rewards/margins": 5.834200382232666, - "rewards/rejected": -6.1677775382995605, - "step": 1890 - }, - { - "epoch": 1.988487702773417, - "grad_norm": 47.70335393543009, - "learning_rate": 1.5383679844592977e-07, - "logits/chosen": -2.394463300704956, - "logits/rejected": -2.3512821197509766, - "logps/chosen": -302.1070251464844, - "logps/rejected": -340.91241455078125, - "loss": 0.0789, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.27979159355163574, - "rewards/margins": 6.587284088134766, - "rewards/rejected": -6.8670759201049805, - "step": 1900 - }, - { - "epoch": 1.988487702773417, - "eval_logits/chosen": -2.4730138778686523, - "eval_logits/rejected": -2.3967626094818115, - "eval_logps/chosen": -301.0693664550781, - "eval_logps/rejected": -300.8061828613281, - "eval_loss": 0.6093136072158813, - "eval_rewards/accuracies": 0.783730149269104, - "eval_rewards/chosen": -1.9115347862243652, - "eval_rewards/margins": 2.147188663482666, - "eval_rewards/rejected": -4.0587239265441895, - "eval_runtime": 191.7766, - "eval_samples_per_second": 10.429, - "eval_steps_per_second": 0.329, - "step": 1900 - }, - { - "epoch": 1.9989534275248562, - "grad_norm": 73.49626340948943, - "learning_rate": 1.5103186439333132e-07, - "logits/chosen": -2.4204254150390625, - "logits/rejected": -2.335169792175293, - "logps/chosen": -295.8714599609375, - "logps/rejected": -315.05218505859375, - "loss": 0.0859, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.012365707196295261, - "rewards/margins": 6.092530727386475, - "rewards/rejected": -6.080164909362793, - "step": 1910 - }, - { - "epoch": 2.009419152276295, - "grad_norm": 10.023101379447539, - "learning_rate": 1.4824162717963828e-07, - "logits/chosen": -2.4575653076171875, - "logits/rejected": -2.3501172065734863, - "logps/chosen": -310.2493896484375, - "logps/rejected": -336.82562255859375, - "loss": 0.0208, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.22803792357444763, - "rewards/margins": 7.252997398376465, - "rewards/rejected": -7.024959564208984, - "step": 1920 - }, - { - "epoch": 2.0198848770277342, - "grad_norm": 8.777634189198675, - "learning_rate": 1.4546650115707191e-07, - "logits/chosen": -2.3002705574035645, - "logits/rejected": -2.226264715194702, - "logps/chosen": -253.7710418701172, - "logps/rejected": -312.8944396972656, - "loss": 0.0167, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.1650373786687851, - "rewards/margins": 7.585174560546875, - "rewards/rejected": -7.42013692855835, - "step": 1930 - }, - { - "epoch": 2.030350601779173, - "grad_norm": 4.861056063440996, - "learning_rate": 1.427068984338311e-07, - "logits/chosen": -2.3367066383361816, - "logits/rejected": -2.218111515045166, - "logps/chosen": -257.58807373046875, - "logps/rejected": -307.8265075683594, - "loss": 0.0122, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.4628918170928955, - "rewards/margins": 7.316593170166016, - "rewards/rejected": -6.853701591491699, - "step": 1940 - }, - { - "epoch": 2.0408163265306123, - "grad_norm": 15.609050314334675, - "learning_rate": 1.3996322881289347e-07, - "logits/chosen": -2.2831313610076904, - "logits/rejected": -2.1279468536376953, - "logps/chosen": -264.9062805175781, - "logps/rejected": -320.8538513183594, - "loss": 0.0108, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": 0.3485656976699829, - "rewards/margins": 8.686336517333984, - "rewards/rejected": -8.33777141571045, - "step": 1950 - }, - { - "epoch": 2.051282051282051, - "grad_norm": 3.1932464693276406, - "learning_rate": 1.372358997311596e-07, - "logits/chosen": -2.2604432106018066, - "logits/rejected": -2.2114975452423096, - "logps/chosen": -255.83029174804688, - "logps/rejected": -348.0851135253906, - "loss": 0.0131, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.9002828598022461, - "rewards/margins": 8.180410385131836, - "rewards/rejected": -9.080694198608398, - "step": 1960 - }, - { - "epoch": 2.0617477760334904, - "grad_norm": 6.384456561185668, - "learning_rate": 1.3452531619894835e-07, - "logits/chosen": -2.378364086151123, - "logits/rejected": -2.2661073207855225, - "logps/chosen": -285.37298583984375, - "logps/rejected": -339.8665771484375, - "loss": 0.0142, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.3006087839603424, - "rewards/margins": 8.448820114135742, - "rewards/rejected": -8.749428749084473, - "step": 1970 - }, - { - "epoch": 2.072213500784929, - "grad_norm": 10.566574954605159, - "learning_rate": 1.3183188073985244e-07, - "logits/chosen": -2.2682971954345703, - "logits/rejected": -2.2013564109802246, - "logps/chosen": -287.0935363769531, - "logps/rejected": -348.1131286621094, - "loss": 0.0102, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -0.9138514399528503, - "rewards/margins": 8.52752685546875, - "rewards/rejected": -9.441377639770508, - "step": 1980 - }, - { - "epoch": 2.0826792255363684, - "grad_norm": 7.135645379116354, - "learning_rate": 1.291559933309635e-07, - "logits/chosen": -2.2837555408477783, - "logits/rejected": -2.175523042678833, - "logps/chosen": -293.1258239746094, - "logps/rejected": -335.9425964355469, - "loss": 0.0109, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -0.6258302927017212, - "rewards/margins": 8.172767639160156, - "rewards/rejected": -8.79859733581543, - "step": 1990 - }, - { - "epoch": 2.0931449502878072, - "grad_norm": 3.305652171586157, - "learning_rate": 1.264980513434752e-07, - "logits/chosen": -2.2141549587249756, - "logits/rejected": -2.087204694747925, - "logps/chosen": -264.577392578125, - "logps/rejected": -326.2967224121094, - "loss": 0.0086, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -0.38176727294921875, - "rewards/margins": 8.374670028686523, - "rewards/rejected": -8.756438255310059, - "step": 2000 - }, - { - "epoch": 2.0931449502878072, - "eval_logits/chosen": -2.2928240299224854, - "eval_logits/rejected": -2.2016477584838867, - "eval_logps/chosen": -311.0745849609375, - "eval_logps/rejected": -319.6028747558594, - "eval_loss": 0.7413998246192932, - "eval_rewards/accuracies": 0.7757936716079712, - "eval_rewards/chosen": -2.912059783935547, - "eval_rewards/margins": 3.026334762573242, - "eval_rewards/rejected": -5.938394546508789, - "eval_runtime": 191.7802, - "eval_samples_per_second": 10.429, - "eval_steps_per_second": 0.329, - "step": 2000 - }, - { - "epoch": 2.1036106750392465, - "grad_norm": 11.047275176108153, - "learning_rate": 1.2385844948367321e-07, - "logits/chosen": -2.3096377849578857, - "logits/rejected": -2.196176767349243, - "logps/chosen": -294.1839904785156, - "logps/rejected": -373.6404724121094, - "loss": 0.0131, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.2498817890882492, - "rewards/margins": 9.5932035446167, - "rewards/rejected": -9.843087196350098, - "step": 2010 - }, - { - "epoch": 2.1140763997906853, - "grad_norm": 8.298731363990367, - "learning_rate": 1.2123757973432113e-07, - "logits/chosen": -2.3028080463409424, - "logits/rejected": -2.1726481914520264, - "logps/chosen": -280.67083740234375, - "logps/rejected": -351.8200988769531, - "loss": 0.0189, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.9443599581718445, - "rewards/margins": 8.549309730529785, - "rewards/rejected": -9.493669509887695, - "step": 2020 - }, - { - "epoch": 2.1245421245421245, - "grad_norm": 3.1054936554627157, - "learning_rate": 1.1863583129645105e-07, - "logits/chosen": -2.293182373046875, - "logits/rejected": -2.1630449295043945, - "logps/chosen": -288.399658203125, - "logps/rejected": -340.930908203125, - "loss": 0.0086, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6419817805290222, - "rewards/margins": 8.76051139831543, - "rewards/rejected": -9.40249252319336, - "step": 2030 - }, - { - "epoch": 2.1350078492935634, - "grad_norm": 11.83036388382499, - "learning_rate": 1.1605359053156604e-07, - "logits/chosen": -2.1931464672088623, - "logits/rejected": -2.1602439880371094, - "logps/chosen": -268.1726989746094, - "logps/rejected": -363.6659240722656, - "loss": 0.014, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.9340217709541321, - "rewards/margins": 10.068492889404297, - "rewards/rejected": -11.002513885498047, - "step": 2040 - }, - { - "epoch": 2.1454735740450026, - "grad_norm": 2.32191720890261, - "learning_rate": 1.1349124090426642e-07, - "logits/chosen": -2.1487176418304443, - "logits/rejected": -1.9843820333480835, - "logps/chosen": -285.9366149902344, - "logps/rejected": -362.80267333984375, - "loss": 0.0159, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -1.2288988828659058, - "rewards/margins": 9.116064071655273, - "rewards/rejected": -10.344964027404785, - "step": 2050 - }, - { - "epoch": 2.155939298796442, - "grad_norm": 13.947515739568049, - "learning_rate": 1.1094916292530403e-07, - "logits/chosen": -2.0874814987182617, - "logits/rejected": -1.886475920677185, - "logps/chosen": -320.4478454589844, - "logps/rejected": -398.4905090332031, - "loss": 0.0104, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8671395182609558, - "rewards/margins": 10.20057201385498, - "rewards/rejected": -11.067710876464844, - "step": 2060 - }, - { - "epoch": 2.1664050235478807, - "grad_norm": 9.732415516645181, - "learning_rate": 1.0842773409507622e-07, - "logits/chosen": -2.029383420944214, - "logits/rejected": -1.9133367538452148, - "logps/chosen": -287.4421081542969, - "logps/rejected": -371.32940673828125, - "loss": 0.0083, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.849840521812439, - "rewards/margins": 9.739965438842773, - "rewards/rejected": -11.589804649353027, - "step": 2070 - }, - { - "epoch": 2.17687074829932, - "grad_norm": 1.6637388619758282, - "learning_rate": 1.0592732884756752e-07, - "logits/chosen": -2.1483490467071533, - "logits/rejected": -1.991890549659729, - "logps/chosen": -302.6157531738281, - "logps/rejected": -376.6459655761719, - "loss": 0.0129, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -1.3996049165725708, - "rewards/margins": 9.1718168258667, - "rewards/rejected": -10.571422576904297, - "step": 2080 - }, - { - "epoch": 2.1873364730507587, - "grad_norm": 2.556458493098877, - "learning_rate": 1.0344831849474505e-07, - "logits/chosen": -2.1448283195495605, - "logits/rejected": -1.9991111755371094, - "logps/chosen": -316.14898681640625, - "logps/rejected": -398.9246826171875, - "loss": 0.0125, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.7466624975204468, - "rewards/margins": 9.658205032348633, - "rewards/rejected": -11.404867172241211, - "step": 2090 - }, - { - "epoch": 2.197802197802198, - "grad_norm": 23.14889132999627, - "learning_rate": 1.0099107117141878e-07, - "logits/chosen": -2.0124361515045166, - "logits/rejected": -1.8575055599212646, - "logps/chosen": -301.7145080566406, - "logps/rejected": -376.8993225097656, - "loss": 0.0137, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -1.6709038019180298, - "rewards/margins": 10.484903335571289, - "rewards/rejected": -12.155807495117188, - "step": 2100 - }, - { - "epoch": 2.197802197802198, - "eval_logits/chosen": -2.0338311195373535, - "eval_logits/rejected": -1.892431378364563, - "eval_logps/chosen": -328.73358154296875, - "eval_logps/rejected": -342.078857421875, - "eval_loss": 0.8116357922554016, - "eval_rewards/accuracies": 0.7678571343421936, - "eval_rewards/chosen": -4.6779584884643555, - "eval_rewards/margins": 3.5080323219299316, - "eval_rewards/rejected": -8.185990333557129, - "eval_runtime": 193.4644, - "eval_samples_per_second": 10.338, - "eval_steps_per_second": 0.326, - "step": 2100 - }, - { - "epoch": 2.208267922553637, - "grad_norm": 2.129843347093792, - "learning_rate": 9.855595178057333e-08, - "logits/chosen": -2.061807155609131, - "logits/rejected": -1.8693170547485352, - "logps/chosen": -312.75799560546875, - "logps/rejected": -367.29925537109375, - "loss": 0.0117, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -1.609450340270996, - "rewards/margins": 9.72584342956543, - "rewards/rejected": -11.33529281616211, - "step": 2110 - }, - { - "epoch": 2.218733647305076, - "grad_norm": 29.654494828140166, - "learning_rate": 9.614332193917948e-08, - "logits/chosen": -1.9340378046035767, - "logits/rejected": -1.7978109121322632, - "logps/chosen": -297.70806884765625, - "logps/rejected": -382.2371520996094, - "loss": 0.011, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.8321090936660767, - "rewards/margins": 10.517792701721191, - "rewards/rejected": -12.34990119934082, - "step": 2120 - }, - { - "epoch": 2.229199372056515, - "grad_norm": 6.016653975168096, - "learning_rate": 9.375353992449383e-08, - "logits/chosen": -1.8777296543121338, - "logits/rejected": -1.7320115566253662, - "logps/chosen": -295.0875549316406, - "logps/rejected": -379.01568603515625, - "loss": 0.0166, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -1.6746835708618164, - "rewards/margins": 9.687846183776855, - "rewards/rejected": -11.362528800964355, - "step": 2130 - }, - { - "epoch": 2.239665096807954, - "grad_norm": 2.7871156197845615, - "learning_rate": 9.138696062085441e-08, - "logits/chosen": -1.884777307510376, - "logits/rejected": -1.6580861806869507, - "logps/chosen": -290.1960754394531, - "logps/rejected": -390.84027099609375, - "loss": 0.0079, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -2.2465689182281494, - "rewards/margins": 10.349069595336914, - "rewards/rejected": -12.5956392288208, - "step": 2140 - }, - { - "epoch": 2.250130821559393, - "grad_norm": 15.049492418394964, - "learning_rate": 8.904393546698005e-08, - "logits/chosen": -1.9832146167755127, - "logits/rejected": -1.7372066974639893, - "logps/chosen": -340.45526123046875, - "logps/rejected": -406.02020263671875, - "loss": 0.0081, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.5490401983261108, - "rewards/margins": 11.160773277282715, - "rewards/rejected": -12.709813117980957, - "step": 2150 - }, - { - "epoch": 2.260596546310832, - "grad_norm": 6.87573391996733, - "learning_rate": 8.672481240378141e-08, - "logits/chosen": -1.9584165811538696, - "logits/rejected": -1.773619294166565, - "logps/chosen": -296.46905517578125, - "logps/rejected": -394.05767822265625, - "loss": 0.0213, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -2.0312249660491943, - "rewards/margins": 10.553927421569824, - "rewards/rejected": -12.585153579711914, - "step": 2160 - }, - { - "epoch": 2.271062271062271, - "grad_norm": 2.037653022862868, - "learning_rate": 8.442993582269189e-08, - "logits/chosen": -2.0515332221984863, - "logits/rejected": -1.8242855072021484, - "logps/chosen": -317.90216064453125, - "logps/rejected": -361.9333190917969, - "loss": 0.0154, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -1.6946855783462524, - "rewards/margins": 9.678606986999512, - "rewards/rejected": -11.37329387664795, - "step": 2170 - }, - { - "epoch": 2.2815279958137102, - "grad_norm": 1.809689134605664, - "learning_rate": 8.215964651452455e-08, - "logits/chosen": -1.993316888809204, - "logits/rejected": -1.7858413457870483, - "logps/chosen": -339.15869140625, - "logps/rejected": -389.7720031738281, - "loss": 0.0116, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -1.9753690958023071, - "rewards/margins": 9.869874954223633, - "rewards/rejected": -11.845242500305176, - "step": 2180 - }, - { - "epoch": 2.291993720565149, - "grad_norm": 10.729248233698183, - "learning_rate": 7.991428161886502e-08, - "logits/chosen": -1.86545729637146, - "logits/rejected": -1.7574052810668945, - "logps/chosen": -277.6654357910156, - "logps/rejected": -404.5018005371094, - "loss": 0.016, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -1.674983263015747, - "rewards/margins": 10.156793594360352, - "rewards/rejected": -11.831775665283203, - "step": 2190 - }, - { - "epoch": 2.3024594453165883, - "grad_norm": 18.093227564514844, - "learning_rate": 7.76941745740061e-08, - "logits/chosen": -1.9233520030975342, - "logits/rejected": -1.757142424583435, - "logps/chosen": -291.3281555175781, - "logps/rejected": -330.958740234375, - "loss": 0.0152, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -1.8258177042007446, - "rewards/margins": 8.812068939208984, - "rewards/rejected": -10.637887001037598, - "step": 2200 - }, - { - "epoch": 2.3024594453165883, - "eval_logits/chosen": -1.9886796474456787, - "eval_logits/rejected": -1.8207371234893799, - "eval_logps/chosen": -332.9471130371094, - "eval_logps/rejected": -347.8080139160156, - "eval_loss": 0.8371008038520813, - "eval_rewards/accuracies": 0.7678571343421936, - "eval_rewards/chosen": -5.099310874938965, - "eval_rewards/margins": 3.6595966815948486, - "eval_rewards/rejected": -8.758907318115234, - "eval_runtime": 192.7239, - "eval_samples_per_second": 10.378, - "eval_steps_per_second": 0.327, - "step": 2200 - }, - { - "epoch": 2.312925170068027, - "grad_norm": 6.062095729802051, - "learning_rate": 7.549965506743122e-08, - "logits/chosen": -1.9530508518218994, - "logits/rejected": -1.699387788772583, - "logps/chosen": -301.16607666015625, - "logps/rejected": -349.4542236328125, - "loss": 0.013, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -2.1856892108917236, - "rewards/margins": 9.574801445007324, - "rewards/rejected": -11.760492324829102, - "step": 2210 - }, - { - "epoch": 2.3233908948194664, - "grad_norm": 5.950267866846336, - "learning_rate": 7.33310489868563e-08, - "logits/chosen": -1.9227104187011719, - "logits/rejected": -1.7536914348602295, - "logps/chosen": -319.16436767578125, - "logps/rejected": -397.6893005371094, - "loss": 0.0115, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -2.0652170181274414, - "rewards/margins": 10.093847274780273, - "rewards/rejected": -12.159064292907715, - "step": 2220 - }, - { - "epoch": 2.333856619570905, - "grad_norm": 42.42599405374929, - "learning_rate": 7.118867837183518e-08, - "logits/chosen": -1.8385636806488037, - "logits/rejected": -1.6841402053833008, - "logps/chosen": -293.6424560546875, - "logps/rejected": -379.22174072265625, - "loss": 0.0131, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -1.6550343036651611, - "rewards/margins": 10.858930587768555, - "rewards/rejected": -12.51396369934082, - "step": 2230 - }, - { - "epoch": 2.3443223443223444, - "grad_norm": 46.34517067922723, - "learning_rate": 6.907286136593605e-08, - "logits/chosen": -1.7662725448608398, - "logits/rejected": -1.5930817127227783, - "logps/chosen": -272.4427490234375, - "logps/rejected": -377.41552734375, - "loss": 0.0082, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -2.1439077854156494, - "rewards/margins": 10.517159461975098, - "rewards/rejected": -12.661067008972168, - "step": 2240 - }, - { - "epoch": 2.3547880690737832, - "grad_norm": 160.40461472786274, - "learning_rate": 6.698391216949701e-08, - "logits/chosen": -1.9438517093658447, - "logits/rejected": -1.6640784740447998, - "logps/chosen": -318.5687255859375, - "logps/rejected": -393.029052734375, - "loss": 0.0107, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.306821346282959, - "rewards/margins": 10.74700927734375, - "rewards/rejected": -13.053831100463867, - "step": 2250 - }, - { - "epoch": 2.3652537938252225, - "grad_norm": 5.534201969041167, - "learning_rate": 6.49221409929677e-08, - "logits/chosen": -1.9769662618637085, - "logits/rejected": -1.794873833656311, - "logps/chosen": -336.00177001953125, - "logps/rejected": -409.899169921875, - "loss": 0.0141, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.9429311752319336, - "rewards/margins": 10.622076034545898, - "rewards/rejected": -12.565008163452148, - "step": 2260 - }, - { - "epoch": 2.3757195185766613, - "grad_norm": 206.55079490948867, - "learning_rate": 6.288785401084206e-08, - "logits/chosen": -1.8087565898895264, - "logits/rejected": -1.544429898262024, - "logps/chosen": -318.8955993652344, - "logps/rejected": -380.7987976074219, - "loss": 0.0113, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -2.9285528659820557, - "rewards/margins": 10.339279174804688, - "rewards/rejected": -13.267831802368164, - "step": 2270 - }, - { - "epoch": 2.3861852433281006, - "grad_norm": 2.634840792094369, - "learning_rate": 6.088135331619138e-08, - "logits/chosen": -1.8126347064971924, - "logits/rejected": -1.5803256034851074, - "logps/chosen": -324.7929992675781, - "logps/rejected": -419.17254638671875, - "loss": 0.0039, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.214232921600342, - "rewards/margins": 10.513631820678711, - "rewards/rejected": -13.727865219116211, - "step": 2280 - }, - { - "epoch": 2.3966509680795394, - "grad_norm": 3.65930801608112, - "learning_rate": 5.8902936875803805e-08, - "logits/chosen": -1.7584556341171265, - "logits/rejected": -1.5765407085418701, - "logps/chosen": -323.23590087890625, - "logps/rejected": -401.53790283203125, - "loss": 0.0162, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -3.28875470161438, - "rewards/margins": 9.740612030029297, - "rewards/rejected": -13.029367446899414, - "step": 2290 - }, - { - "epoch": 2.4071166928309786, - "grad_norm": 4.988238090829977, - "learning_rate": 5.695289848593532e-08, - "logits/chosen": -1.7769734859466553, - "logits/rejected": -1.5633699893951416, - "logps/chosen": -297.45477294921875, - "logps/rejected": -397.2643127441406, - "loss": 0.0062, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.732020854949951, - "rewards/margins": 11.00343132019043, - "rewards/rejected": -13.735452651977539, - "step": 2300 - }, - { - "epoch": 2.4071166928309786, - "eval_logits/chosen": -1.8085733652114868, - "eval_logits/rejected": -1.5897276401519775, - "eval_logps/chosen": -344.485595703125, - "eval_logps/rejected": -361.6346130371094, - "eval_loss": 0.8704150319099426, - "eval_rewards/accuracies": 0.7678571343421936, - "eval_rewards/chosen": -6.253159523010254, - "eval_rewards/margins": 3.88840651512146, - "eval_rewards/rejected": -10.141566276550293, - "eval_runtime": 193.6771, - "eval_samples_per_second": 10.326, - "eval_steps_per_second": 0.325, - "step": 2300 - }, - { - "epoch": 2.4175824175824174, - "grad_norm": 1.9558136349019484, - "learning_rate": 5.5031527728681193e-08, - "logits/chosen": -1.6991350650787354, - "logits/rejected": -1.5171782970428467, - "logps/chosen": -286.3030700683594, - "logps/rejected": -416.25811767578125, - "loss": 0.0094, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.3869826793670654, - "rewards/margins": 11.138927459716797, - "rewards/rejected": -14.525911331176758, - "step": 2310 - }, - { - "epoch": 2.4280481423338567, - "grad_norm": 16.526705275598278, - "learning_rate": 5.3139109928972806e-08, - "logits/chosen": -1.7334638833999634, - "logits/rejected": -1.4885704517364502, - "logps/chosen": -343.65899658203125, - "logps/rejected": -410.31793212890625, - "loss": 0.0109, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -2.301802158355713, - "rewards/margins": 11.803378105163574, - "rewards/rejected": -14.105180740356445, - "step": 2320 - }, - { - "epoch": 2.4385138670852955, - "grad_norm": 2.2627183182177513, - "learning_rate": 5.127592611220657e-08, - "logits/chosen": -1.786730408668518, - "logits/rejected": -1.5299816131591797, - "logps/chosen": -335.2308654785156, - "logps/rejected": -386.974853515625, - "loss": 0.0105, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -2.8129029273986816, - "rewards/margins": 10.42974853515625, - "rewards/rejected": -13.242650985717773, - "step": 2330 - }, - { - "epoch": 2.4489795918367347, - "grad_norm": 5.282250601405888, - "learning_rate": 4.944225296251159e-08, - "logits/chosen": -1.9082715511322021, - "logits/rejected": -1.6149944067001343, - "logps/chosen": -368.7810974121094, - "logps/rejected": -424.2177734375, - "loss": 0.0112, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.2932369709014893, - "rewards/margins": 10.5211763381958, - "rewards/rejected": -12.814413070678711, - "step": 2340 - }, - { - "epoch": 2.4594453165881736, - "grad_norm": 9.164754272643682, - "learning_rate": 4.7638362781661726e-08, - "logits/chosen": -1.7273809909820557, - "logits/rejected": -1.4639962911605835, - "logps/chosen": -300.2232360839844, - "logps/rejected": -391.49896240234375, - "loss": 0.009, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -2.921433210372925, - "rewards/margins": 10.661535263061523, - "rewards/rejected": -13.582969665527344, - "step": 2350 - }, - { - "epoch": 2.469911041339613, - "grad_norm": 15.596501475627925, - "learning_rate": 4.5864523448638754e-08, - "logits/chosen": -1.8010714054107666, - "logits/rejected": -1.5301591157913208, - "logps/chosen": -301.37347412109375, - "logps/rejected": -360.9327392578125, - "loss": 0.0151, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -3.202887773513794, - "rewards/margins": 10.092283248901367, - "rewards/rejected": -13.295171737670898, - "step": 2360 - }, - { - "epoch": 2.4803767660910516, - "grad_norm": 16.07892238404221, - "learning_rate": 4.412099837985192e-08, - "logits/chosen": -1.7746025323867798, - "logits/rejected": -1.5011335611343384, - "logps/chosen": -295.3475646972656, - "logps/rejected": -369.8910217285156, - "loss": 0.0077, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -1.929955244064331, - "rewards/margins": 10.22883415222168, - "rewards/rejected": -12.15878963470459, - "step": 2370 - }, - { - "epoch": 2.490842490842491, - "grad_norm": 5.394194754534313, - "learning_rate": 4.240804649002089e-08, - "logits/chosen": -1.7836806774139404, - "logits/rejected": -1.6082366704940796, - "logps/chosen": -282.34515380859375, - "logps/rejected": -376.56634521484375, - "loss": 0.0095, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -2.5700106620788574, - "rewards/margins": 10.335325241088867, - "rewards/rejected": -12.905336380004883, - "step": 2380 - }, - { - "epoch": 2.50130821559393, - "grad_norm": 4.286657602803862, - "learning_rate": 4.072592215372597e-08, - "logits/chosen": -1.6599235534667969, - "logits/rejected": -1.4786574840545654, - "logps/chosen": -289.55511474609375, - "logps/rejected": -359.83355712890625, - "loss": 0.0085, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -2.3360424041748047, - "rewards/margins": 10.60867977142334, - "rewards/rejected": -12.944722175598145, - "step": 2390 - }, - { - "epoch": 2.511773940345369, - "grad_norm": 10.63509603780654, - "learning_rate": 3.907487516763389e-08, - "logits/chosen": -1.8115570545196533, - "logits/rejected": -1.5182044506072998, - "logps/chosen": -321.276611328125, - "logps/rejected": -390.94329833984375, - "loss": 0.0124, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.664477825164795, - "rewards/margins": 10.350967407226562, - "rewards/rejected": -13.015446662902832, - "step": 2400 - }, - { - "epoch": 2.511773940345369, - "eval_logits/chosen": -1.775066614151001, - "eval_logits/rejected": -1.5561261177062988, - "eval_logps/chosen": -338.5582275390625, - "eval_logps/rejected": -356.9429016113281, - "eval_loss": 0.884819746017456, - "eval_rewards/accuracies": 0.7698412537574768, - "eval_rewards/chosen": -5.660420894622803, - "eval_rewards/margins": 4.01197624206543, - "eval_rewards/rejected": -9.67239761352539, - "eval_runtime": 193.882, - "eval_samples_per_second": 10.316, - "eval_steps_per_second": 0.325, - "step": 2400 - }, - { - "epoch": 2.5222396650968077, - "grad_norm": 6.163253123092362, - "learning_rate": 3.7455150713402854e-08, - "logits/chosen": -1.7800722122192383, - "logits/rejected": -1.544559121131897, - "logps/chosen": -330.23443603515625, - "logps/rejected": -393.90606689453125, - "loss": 0.011, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -2.6713662147521973, - "rewards/margins": 10.72212028503418, - "rewards/rejected": -13.393487930297852, - "step": 2410 - }, - { - "epoch": 2.532705389848247, - "grad_norm": 84.19487135751818, - "learning_rate": 3.586698932127236e-08, - "logits/chosen": -1.764854073524475, - "logits/rejected": -1.5328432321548462, - "logps/chosen": -301.03314208984375, - "logps/rejected": -393.0440368652344, - "loss": 0.0182, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -2.6621508598327637, - "rewards/margins": 10.602301597595215, - "rewards/rejected": -13.264452934265137, - "step": 2420 - }, - { - "epoch": 2.5431711145996863, - "grad_norm": 11.339655694754711, - "learning_rate": 3.431062683434474e-08, - "logits/chosen": -1.7327712774276733, - "logits/rejected": -1.5323649644851685, - "logps/chosen": -325.77471923828125, - "logps/rejected": -399.5340576171875, - "loss": 0.0063, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.9265735149383545, - "rewards/margins": 11.067084312438965, - "rewards/rejected": -13.993657112121582, - "step": 2430 - }, - { - "epoch": 2.553636839351125, - "grad_norm": 2.4986624020646278, - "learning_rate": 3.278629437356234e-08, - "logits/chosen": -1.7188389301300049, - "logits/rejected": -1.4560762643814087, - "logps/chosen": -303.47564697265625, - "logps/rejected": -395.47222900390625, - "loss": 0.0143, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -3.004666805267334, - "rewards/margins": 10.12282943725586, - "rewards/rejected": -13.127496719360352, - "step": 2440 - }, - { - "epoch": 2.564102564102564, - "grad_norm": 30.233163151020317, - "learning_rate": 3.129421830338552e-08, - "logits/chosen": -1.8167226314544678, - "logits/rejected": -1.605737328529358, - "logps/chosen": -305.89794921875, - "logps/rejected": -394.5679931640625, - "loss": 0.0106, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -2.1378867626190186, - "rewards/margins": 10.694021224975586, - "rewards/rejected": -12.831907272338867, - "step": 2450 - }, - { - "epoch": 2.574568288854003, - "grad_norm": 3.323634602147803, - "learning_rate": 2.983462019817748e-08, - "logits/chosen": -1.7865597009658813, - "logits/rejected": -1.556998372077942, - "logps/chosen": -337.3326721191406, - "logps/rejected": -401.4388122558594, - "loss": 0.0073, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.34912371635437, - "rewards/margins": 10.564788818359375, - "rewards/rejected": -12.913911819458008, - "step": 2460 - }, - { - "epoch": 2.5850340136054424, - "grad_norm": 2.455850799846819, - "learning_rate": 2.840771680930068e-08, - "logits/chosen": -1.7107175588607788, - "logits/rejected": -1.5839600563049316, - "logps/chosen": -268.41363525390625, - "logps/rejected": -396.84222412109375, - "loss": 0.0171, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -2.7059414386749268, - "rewards/margins": 10.727618217468262, - "rewards/rejected": -13.433561325073242, - "step": 2470 - }, - { - "epoch": 2.595499738356881, - "grad_norm": 3.723369741968513, - "learning_rate": 2.7013720032928677e-08, - "logits/chosen": -1.652012825012207, - "logits/rejected": -1.565966248512268, - "logps/chosen": -314.6357421875, - "logps/rejected": -428.3789978027344, - "loss": 0.0135, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -2.7459170818328857, - "rewards/margins": 10.685503005981445, - "rewards/rejected": -13.431419372558594, - "step": 2480 - }, - { - "epoch": 2.60596546310832, - "grad_norm": 13.674302397919858, - "learning_rate": 2.5652836878579497e-08, - "logits/chosen": -1.7104835510253906, - "logits/rejected": -1.5401417016983032, - "logps/chosen": -319.35687255859375, - "logps/rejected": -436.4452209472656, - "loss": 0.0107, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -2.9921905994415283, - "rewards/margins": 11.111845016479492, - "rewards/rejected": -14.104036331176758, - "step": 2490 - }, - { - "epoch": 2.6164311878597593, - "grad_norm": 8.04395961479556, - "learning_rate": 2.4325269438374706e-08, - "logits/chosen": -1.6020452976226807, - "logits/rejected": -1.3771822452545166, - "logps/chosen": -287.2886657714844, - "logps/rejected": -392.15411376953125, - "loss": 0.0078, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -2.868680715560913, - "rewards/margins": 10.790876388549805, - "rewards/rejected": -13.659558296203613, - "step": 2500 - }, - { - "epoch": 2.6164311878597593, - "eval_logits/chosen": -1.6589971780776978, - "eval_logits/rejected": -1.418144702911377, - "eval_logps/chosen": -343.6351623535156, - "eval_logps/rejected": -362.63360595703125, - "eval_loss": 0.8925788402557373, - "eval_rewards/accuracies": 0.7678571343421936, - "eval_rewards/chosen": -6.16811990737915, - "eval_rewards/margins": 4.07335090637207, - "eval_rewards/rejected": -10.241469383239746, - "eval_runtime": 193.3444, - "eval_samples_per_second": 10.344, - "eval_steps_per_second": 0.326, - "step": 2500 - }, - { - "epoch": 2.6268969126111985, - "grad_norm": 23.518385312479104, - "learning_rate": 2.303121485702844e-08, - "logits/chosen": -1.6294246912002563, - "logits/rejected": -1.3117210865020752, - "logps/chosen": -333.60137939453125, - "logps/rejected": -397.86468505859375, - "loss": 0.0107, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -3.2120442390441895, - "rewards/margins": 10.750337600708008, - "rewards/rejected": -13.962382316589355, - "step": 2510 - }, - { - "epoch": 2.6373626373626373, - "grad_norm": 5.139125461027446, - "learning_rate": 2.1770865302571386e-08, - "logits/chosen": -1.467590093612671, - "logits/rejected": -1.2534291744232178, - "logps/chosen": -307.50433349609375, - "logps/rejected": -409.88775634765625, - "loss": 0.0173, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -3.5433502197265625, - "rewards/margins": 11.32188606262207, - "rewards/rejected": -14.865236282348633, - "step": 2520 - }, - { - "epoch": 2.647828362114076, - "grad_norm": 4.351699332459416, - "learning_rate": 2.0544407937813534e-08, - "logits/chosen": -1.578022837638855, - "logits/rejected": -1.3664928674697876, - "logps/chosen": -291.37152099609375, - "logps/rejected": -396.4104919433594, - "loss": 0.0101, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -2.8613739013671875, - "rewards/margins": 11.001996994018555, - "rewards/rejected": -13.863372802734375, - "step": 2530 - }, - { - "epoch": 2.6582940868655154, - "grad_norm": 18.60730819733523, - "learning_rate": 1.9352024892550583e-08, - "logits/chosen": -1.60271418094635, - "logits/rejected": -1.2357113361358643, - "logps/chosen": -318.61920166015625, - "logps/rejected": -398.6212463378906, - "loss": 0.0181, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -3.277348279953003, - "rewards/margins": 11.363245010375977, - "rewards/rejected": -14.640594482421875, - "step": 2540 - }, - { - "epoch": 2.6687598116169546, - "grad_norm": 10.21343230776909, - "learning_rate": 1.8193893236517377e-08, - "logits/chosen": -1.6024513244628906, - "logits/rejected": -1.3179681301116943, - "logps/chosen": -313.63189697265625, - "logps/rejected": -399.43035888671875, - "loss": 0.0187, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -3.163684368133545, - "rewards/margins": 10.75622844696045, - "rewards/rejected": -13.919912338256836, - "step": 2550 - }, - { - "epoch": 2.6792255363683934, - "grad_norm": 23.537841996961767, - "learning_rate": 1.7070184953092953e-08, - "logits/chosen": -1.579572319984436, - "logits/rejected": -1.325355052947998, - "logps/chosen": -314.2366638183594, - "logps/rejected": -416.67193603515625, - "loss": 0.0054, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.367757797241211, - "rewards/margins": 11.030823707580566, - "rewards/rejected": -14.398580551147461, - "step": 2560 - }, - { - "epoch": 2.6896912611198327, - "grad_norm": 1.099111302618137, - "learning_rate": 1.5981066913760916e-08, - "logits/chosen": -1.585573434829712, - "logits/rejected": -1.3345189094543457, - "logps/chosen": -337.53094482421875, - "logps/rejected": -402.87799072265625, - "loss": 0.0106, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -2.7730913162231445, - "rewards/margins": 11.045392990112305, - "rewards/rejected": -13.81848430633545, - "step": 2570 - }, - { - "epoch": 2.7001569858712715, - "grad_norm": 2.1638416195971195, - "learning_rate": 1.492670085332884e-08, - "logits/chosen": -1.6148006916046143, - "logits/rejected": -1.331849217414856, - "logps/chosen": -304.6117248535156, - "logps/rejected": -374.53485107421875, - "loss": 0.0083, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -3.7536914348602295, - "rewards/margins": 10.357696533203125, - "rewards/rejected": -14.11138916015625, - "step": 2580 - }, - { - "epoch": 2.7106227106227108, - "grad_norm": 1.7801652299795674, - "learning_rate": 1.3907243345910785e-08, - "logits/chosen": -1.4681947231292725, - "logits/rejected": -1.2752296924591064, - "logps/chosen": -295.4445495605469, - "logps/rejected": -398.19134521484375, - "loss": 0.0175, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -3.3213725090026855, - "rewards/margins": 11.644938468933105, - "rewards/rejected": -14.96631145477295, - "step": 2590 - }, - { - "epoch": 2.7210884353741496, - "grad_norm": 8.693200592883448, - "learning_rate": 1.2922845781675378e-08, - "logits/chosen": -1.5907632112503052, - "logits/rejected": -1.4056947231292725, - "logps/chosen": -316.6394348144531, - "logps/rejected": -393.81158447265625, - "loss": 0.0083, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -2.9686014652252197, - "rewards/margins": 10.71739673614502, - "rewards/rejected": -13.685998916625977, - "step": 2600 - }, - { - "epoch": 2.7210884353741496, - "eval_logits/chosen": -1.649336338043213, - "eval_logits/rejected": -1.392931342124939, - "eval_logps/chosen": -347.2772521972656, - "eval_logps/rejected": -366.7602233886719, - "eval_loss": 0.90024334192276, - "eval_rewards/accuracies": 0.7658730149269104, - "eval_rewards/chosen": -6.532324314117432, - "eval_rewards/margins": 4.12180757522583, - "eval_rewards/rejected": -10.654129981994629, - "eval_runtime": 196.4578, - "eval_samples_per_second": 10.18, - "eval_steps_per_second": 0.321, - "step": 2600 - }, - { - "epoch": 2.731554160125589, - "grad_norm": 11.926887440041698, - "learning_rate": 1.1973654344364925e-08, - "logits/chosen": -1.6159374713897705, - "logits/rejected": -1.4075515270233154, - "logps/chosen": -287.9294738769531, - "logps/rejected": -385.4781188964844, - "loss": 0.0138, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -3.2662291526794434, - "rewards/margins": 10.173772811889648, - "rewards/rejected": -13.44000244140625, - "step": 2610 - }, - { - "epoch": 2.7420198848770276, - "grad_norm": 4.934145568750915, - "learning_rate": 1.1059809989586582e-08, - "logits/chosen": -1.491999864578247, - "logits/rejected": -1.1634150743484497, - "logps/chosen": -285.71112060546875, - "logps/rejected": -391.3531188964844, - "loss": 0.0176, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.645512819290161, - "rewards/margins": 11.22826099395752, - "rewards/rejected": -14.873773574829102, - "step": 2620 - }, - { - "epoch": 2.752485609628467, - "grad_norm": 28.780969559212785, - "learning_rate": 1.018144842388044e-08, - "logits/chosen": -1.532060980796814, - "logits/rejected": -1.307739496231079, - "logps/chosen": -295.8886413574219, - "logps/rejected": -399.0740051269531, - "loss": 0.012, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -3.4307899475097656, - "rewards/margins": 11.213778495788574, - "rewards/rejected": -14.644567489624023, - "step": 2630 - }, - { - "epoch": 2.7629513343799057, - "grad_norm": 4.414419872688541, - "learning_rate": 9.338700084567108e-09, - "logits/chosen": -1.5409033298492432, - "logits/rejected": -1.2314783334732056, - "logps/chosen": -322.3118896484375, - "logps/rejected": -394.9079284667969, - "loss": 0.0208, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -3.6526527404785156, - "rewards/margins": 10.730294227600098, - "rewards/rejected": -14.382946968078613, - "step": 2640 - }, - { - "epoch": 2.773417059131345, - "grad_norm": 13.45724219243782, - "learning_rate": 8.531690120377605e-09, - "logits/chosen": -1.5432606935501099, - "logits/rejected": -1.2715123891830444, - "logps/chosen": -310.42034912109375, - "logps/rejected": -399.38037109375, - "loss": 0.0093, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -3.7229926586151123, - "rewards/margins": 11.18032455444336, - "rewards/rejected": -14.903315544128418, - "step": 2650 - }, - { - "epoch": 2.7838827838827838, - "grad_norm": 5.061007206005282, - "learning_rate": 7.760538372868636e-09, - "logits/chosen": -1.6668756008148193, - "logits/rejected": -1.3258527517318726, - "logps/chosen": -294.17266845703125, - "logps/rejected": -378.208984375, - "loss": 0.0128, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": -3.9430460929870605, - "rewards/margins": 10.386548042297363, - "rewards/rejected": -14.329594612121582, - "step": 2660 - }, - { - "epoch": 2.794348508634223, - "grad_norm": 4.718121425945516, - "learning_rate": 7.025359358626165e-09, - "logits/chosen": -1.5457098484039307, - "logits/rejected": -1.1619064807891846, - "logps/chosen": -300.71075439453125, - "logps/rejected": -372.9494934082031, - "loss": 0.0123, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -3.5143496990203857, - "rewards/margins": 11.366539001464844, - "rewards/rejected": -14.880889892578125, - "step": 2670 - }, - { - "epoch": 2.804814233385662, - "grad_norm": 6.736015826996016, - "learning_rate": 6.326262252259462e-09, - "logits/chosen": -1.6261476278305054, - "logits/rejected": -1.4010369777679443, - "logps/chosen": -346.00750732421875, - "logps/rejected": -412.39404296875, - "loss": 0.009, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.332373857498169, - "rewards/margins": 10.390043258666992, - "rewards/rejected": -13.722416877746582, - "step": 2680 - }, - { - "epoch": 2.815279958137101, - "grad_norm": 10.634095011721135, - "learning_rate": 5.66335087018871e-09, - "logits/chosen": -1.5246877670288086, - "logits/rejected": -1.316731572151184, - "logps/chosen": -304.30908203125, - "logps/rejected": -400.41058349609375, - "loss": 0.0105, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.6065049171447754, - "rewards/margins": 10.88361930847168, - "rewards/rejected": -14.49012565612793, - "step": 2690 - }, - { - "epoch": 2.82574568288854, - "grad_norm": 12.08938896117685, - "learning_rate": 5.036723655228225e-09, - "logits/chosen": -1.6640583276748657, - "logits/rejected": -1.2813664674758911, - "logps/chosen": -317.0228271484375, - "logps/rejected": -400.35809326171875, - "loss": 0.0115, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -2.855902671813965, - "rewards/margins": 10.84324836730957, - "rewards/rejected": -13.699151992797852, - "step": 2700 - }, - { - "epoch": 2.82574568288854, - "eval_logits/chosen": -1.6631838083267212, - "eval_logits/rejected": -1.404729962348938, - "eval_logps/chosen": -346.2244873046875, - "eval_logps/rejected": -366.2515869140625, - "eval_loss": 0.9076420664787292, - "eval_rewards/accuracies": 0.7638888955116272, - "eval_rewards/chosen": -6.427051067352295, - "eval_rewards/margins": 4.17621374130249, - "eval_rewards/rejected": -10.603264808654785, - "eval_runtime": 196.8938, - "eval_samples_per_second": 10.158, - "eval_steps_per_second": 0.32, - "step": 2700 - }, - { - "epoch": 2.836211407639979, - "grad_norm": 2.156664953744044, - "learning_rate": 4.446473661967432e-09, - "logits/chosen": -1.642991304397583, - "logits/rejected": -1.4424149990081787, - "logps/chosen": -342.7087097167969, - "logps/rejected": -409.0138854980469, - "loss": 0.0053, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.8689379692077637, - "rewards/margins": 11.032378196716309, - "rewards/rejected": -13.90131664276123, - "step": 2710 - }, - { - "epoch": 2.846677132391418, - "grad_norm": 2.9762664960012635, - "learning_rate": 3.89268854295241e-09, - "logits/chosen": -1.6240724325180054, - "logits/rejected": -1.3690745830535889, - "logps/chosen": -311.7180480957031, - "logps/rejected": -395.76275634765625, - "loss": 0.0128, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -3.589081287384033, - "rewards/margins": 10.571843147277832, - "rewards/rejected": -14.160923957824707, - "step": 2720 - }, - { - "epoch": 2.857142857142857, - "grad_norm": 5.553350884657269, - "learning_rate": 3.3754505356693996e-09, - "logits/chosen": -1.5529824495315552, - "logits/rejected": -1.302966833114624, - "logps/chosen": -300.986572265625, - "logps/rejected": -436.4410705566406, - "loss": 0.0106, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.5891246795654297, - "rewards/margins": 11.44792366027832, - "rewards/rejected": -15.03704833984375, - "step": 2730 - }, - { - "epoch": 2.867608581894296, - "grad_norm": 0.8805528554169163, - "learning_rate": 2.8948364503322276e-09, - "logits/chosen": -1.6217581033706665, - "logits/rejected": -1.4575835466384888, - "logps/chosen": -301.9267883300781, - "logps/rejected": -406.52203369140625, - "loss": 0.0094, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -3.3556251525878906, - "rewards/margins": 11.159490585327148, - "rewards/rejected": -14.515115737915039, - "step": 2740 - }, - { - "epoch": 2.8780743066457353, - "grad_norm": 347.2983615733762, - "learning_rate": 2.4509176584762704e-09, - "logits/chosen": -1.6258951425552368, - "logits/rejected": -1.2514183521270752, - "logps/chosen": -311.1775207519531, - "logps/rejected": -387.1222229003906, - "loss": 0.0193, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -3.150230884552002, - "rewards/margins": 11.10479736328125, - "rewards/rejected": -14.255029678344727, - "step": 2750 - }, - { - "epoch": 2.8885400313971745, - "grad_norm": 6.119092588030237, - "learning_rate": 2.043760082359569e-09, - "logits/chosen": -1.5979597568511963, - "logits/rejected": -1.2118130922317505, - "logps/chosen": -312.3309631347656, - "logps/rejected": -399.59332275390625, - "loss": 0.0083, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -3.5269381999969482, - "rewards/margins": 10.837129592895508, - "rewards/rejected": -14.364068984985352, - "step": 2760 - }, - { - "epoch": 2.8990057561486133, - "grad_norm": 26.03020843719191, - "learning_rate": 1.6734241851733e-09, - "logits/chosen": -1.7141611576080322, - "logits/rejected": -1.3291828632354736, - "logps/chosen": -329.8605651855469, - "logps/rejected": -416.9443359375, - "loss": 0.0069, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -2.871325969696045, - "rewards/margins": 11.196578025817871, - "rewards/rejected": -14.067903518676758, - "step": 2770 - }, - { - "epoch": 2.909471480900052, - "grad_norm": 3.4120485839469863, - "learning_rate": 1.3399649620629839e-09, - "logits/chosen": -1.625018835067749, - "logits/rejected": -1.2956981658935547, - "logps/chosen": -333.72509765625, - "logps/rejected": -425.624755859375, - "loss": 0.0126, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -3.050673007965088, - "rewards/margins": 10.871179580688477, - "rewards/rejected": -13.921852111816406, - "step": 2780 - }, - { - "epoch": 2.9199372056514914, - "grad_norm": 2.063297047179136, - "learning_rate": 1.0434319319617135e-09, - "logits/chosen": -1.6166296005249023, - "logits/rejected": -1.4035065174102783, - "logps/chosen": -313.02099609375, - "logps/rejected": -405.0020751953125, - "loss": 0.0078, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -2.9148199558258057, - "rewards/margins": 11.087327003479004, - "rewards/rejected": -14.002148628234863, - "step": 2790 - }, - { - "epoch": 2.9304029304029307, - "grad_norm": 8.556647505190979, - "learning_rate": 7.838691302365364e-10, - "logits/chosen": -1.4856208562850952, - "logits/rejected": -1.2040634155273438, - "logps/chosen": -310.12823486328125, - "logps/rejected": -383.26019287109375, - "loss": 0.0134, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -3.5830254554748535, - "rewards/margins": 11.645868301391602, - "rewards/rejected": -15.22889232635498, - "step": 2800 - }, - { - "epoch": 2.9304029304029307, - "eval_logits/chosen": -1.6525452136993408, - "eval_logits/rejected": -1.3900270462036133, - "eval_logps/chosen": -345.9360656738281, - "eval_logps/rejected": -366.1889343261719, - "eval_loss": 0.9106400609016418, - "eval_rewards/accuracies": 0.7638888955116272, - "eval_rewards/chosen": -6.398204803466797, - "eval_rewards/margins": 4.198793411254883, - "eval_rewards/rejected": -10.596999168395996, - "eval_runtime": 196.8627, - "eval_samples_per_second": 10.159, - "eval_steps_per_second": 0.32, - "step": 2800 - }, - { - "epoch": 2.9408686551543695, - "grad_norm": 4.161096651151409, - "learning_rate": 5.613151021490459e-10, - "logits/chosen": -1.6516437530517578, - "logits/rejected": -1.3479902744293213, - "logps/chosen": -319.8536376953125, - "logps/rejected": -426.85870361328125, - "loss": 0.0112, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -3.3773441314697266, - "rewards/margins": 12.091669082641602, - "rewards/rejected": -15.469012260437012, - "step": 2810 - }, - { - "epoch": 2.9513343799058083, - "grad_norm": 0.6324232604716057, - "learning_rate": 3.758028971315996e-10, - "logits/chosen": -1.570770025253296, - "logits/rejected": -1.2533477544784546, - "logps/chosen": -299.7353820800781, - "logps/rejected": -409.45428466796875, - "loss": 0.0093, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -3.395376682281494, - "rewards/margins": 11.752998352050781, - "rewards/rejected": -15.148374557495117, - "step": 2820 - }, - { - "epoch": 2.9618001046572475, - "grad_norm": 39.551836958387554, - "learning_rate": 2.2736006387927275e-10, - "logits/chosen": -1.7062339782714844, - "logits/rejected": -1.3970075845718384, - "logps/chosen": -321.1328430175781, - "logps/rejected": -398.12353515625, - "loss": 0.014, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -3.26806640625, - "rewards/margins": 10.62768268585205, - "rewards/rejected": -13.89574909210205, - "step": 2830 - }, - { - "epoch": 2.9722658294086868, - "grad_norm": 7.385644121693272, - "learning_rate": 1.1600864625893603e-10, - "logits/chosen": -1.5698139667510986, - "logits/rejected": -1.1535699367523193, - "logps/chosen": -319.61083984375, - "logps/rejected": -400.57659912109375, - "loss": 0.0124, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -3.564915418624878, - "rewards/margins": 11.524139404296875, - "rewards/rejected": -15.0890531539917, - "step": 2840 - }, - { - "epoch": 2.9827315541601256, - "grad_norm": 5.3636311137847255, - "learning_rate": 4.176518003573548e-11, - "logits/chosen": -1.5674631595611572, - "logits/rejected": -1.4830777645111084, - "logps/chosen": -304.5412902832031, - "logps/rejected": -414.52679443359375, - "loss": 0.0164, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -3.2556285858154297, - "rewards/margins": 11.08967113494873, - "rewards/rejected": -14.345301628112793, - "step": 2850 - }, - { - "epoch": 2.9931972789115644, - "grad_norm": 0.6366721800954926, - "learning_rate": 4.640690417528903e-12, - "logits/chosen": -1.4274837970733643, - "logits/rejected": -1.2798718214035034, - "logps/chosen": -337.44549560546875, - "logps/rejected": -460.02264404296875, - "loss": 0.007, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -3.261587619781494, - "rewards/margins": 11.79706859588623, - "rewards/rejected": -15.058656692504883, - "step": 2860 - }, - { - "epoch": 2.998430141287284, - "step": 2865, + "epoch": 0.9994767137624281, + "step": 955, "total_flos": 0.0, - "train_loss": 0.21882489114921755, - "train_runtime": 49934.9402, - "train_samples_per_second": 3.673, - "train_steps_per_second": 0.057 + "train_loss": 0.528570184657711, + "train_runtime": 17823.4182, + "train_samples_per_second": 3.43, + "train_steps_per_second": 0.054 } ], "logging_steps": 10, - "max_steps": 2865, + "max_steps": 955, "num_input_tokens_seen": 0, - "num_train_epochs": 3, + "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8,