{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 3821, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "dpo_losses": 0.6931471824645996, "epoch": 0.0, "grad_norm": 2.436493005352286, "learning_rate": 1.3054830287206268e-08, "logits/chosen": -2.1143321990966797, "logits/rejected": -2.472040891647339, "logps/chosen": -177.52757263183594, "logps/rejected": -252.707275390625, "loss": 0.6931, "positive_losses": 0.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "dpo_losses": 0.6926161050796509, "epoch": 0.0, "grad_norm": 36.02958311408835, "learning_rate": 1.3054830287206266e-07, "logits/chosen": -2.578505277633667, "logits/rejected": -2.2738986015319824, "logps/chosen": -276.4295959472656, "logps/rejected": -184.18495178222656, "loss": 0.6991, "positive_losses": 0.039088089019060135, "rewards/accuracies": 0.4722222089767456, "rewards/chosen": 0.0003647406992968172, "rewards/margins": 0.0010644833091646433, "rewards/margins_max": 0.0041068182326853275, "rewards/margins_min": -0.0015654838643968105, "rewards/margins_std": 0.002455304143950343, "rewards/rejected": -0.0006997426389716566, "step": 10 }, { "dpo_losses": 0.6933375000953674, "epoch": 0.01, "grad_norm": 22.215581096541932, "learning_rate": 2.610966057441253e-07, "logits/chosen": -2.6722965240478516, "logits/rejected": -2.3532378673553467, "logps/chosen": -336.49383544921875, "logps/rejected": -271.5801086425781, "loss": 0.6997, "positive_losses": 0.08401088416576385, "rewards/accuracies": 0.5, "rewards/chosen": 0.00075221445877105, "rewards/margins": -0.00037500550388358533, "rewards/margins_max": 0.004432193469256163, "rewards/margins_min": -0.004654556512832642, "rewards/margins_std": 0.004250099416822195, "rewards/rejected": 0.0011272199917584658, "step": 20 }, { "dpo_losses": 0.6925877928733826, "epoch": 0.01, "grad_norm": 9.651890829367645, "learning_rate": 3.9164490861618804e-07, "logits/chosen": -2.322463274002075, "logits/rejected": -2.3930299282073975, "logps/chosen": -246.892822265625, "logps/rejected": -251.5536346435547, "loss": 0.696, "positive_losses": 0.016086244955658913, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.003448696807026863, "rewards/margins": 0.0011251671239733696, "rewards/margins_max": 0.00628508348017931, "rewards/margins_min": -0.003383347298949957, "rewards/margins_std": 0.00429148506373167, "rewards/rejected": 0.00232352945022285, "step": 30 }, { "dpo_losses": 0.6933262944221497, "epoch": 0.01, "grad_norm": 15.193574169527125, "learning_rate": 5.221932114882506e-07, "logits/chosen": -2.356933116912842, "logits/rejected": -2.3318867683410645, "logps/chosen": -211.1941680908203, "logps/rejected": -239.860595703125, "loss": 0.6944, "positive_losses": 0.015000438317656517, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.0036029077600687742, "rewards/margins": -0.0003518062294460833, "rewards/margins_max": 0.004791459534317255, "rewards/margins_min": -0.004789828788489103, "rewards/margins_std": 0.00436857994645834, "rewards/rejected": 0.00395471416413784, "step": 40 }, { "dpo_losses": 0.6933200359344482, "epoch": 0.01, "grad_norm": 2.861740551281531, "learning_rate": 6.527415143603135e-07, "logits/chosen": -2.6225380897521973, "logits/rejected": -2.4836673736572266, "logps/chosen": -330.4644470214844, "logps/rejected": -305.0365295410156, "loss": 0.694, "positive_losses": 0.0016630649333819747, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.006856194697320461, "rewards/margins": -0.00034048277302645147, "rewards/margins_max": 0.004664260894060135, "rewards/margins_min": -0.005454545840620995, "rewards/margins_std": 0.004431026987731457, "rewards/rejected": 0.007196678314357996, "step": 50 }, { "dpo_losses": 0.6928949952125549, "epoch": 0.02, "grad_norm": 2.8390579920613805, "learning_rate": 7.832898172323761e-07, "logits/chosen": -2.3425657749176025, "logits/rejected": -2.265756607055664, "logps/chosen": -235.59115600585938, "logps/rejected": -240.47305297851562, "loss": 0.6934, "positive_losses": 0.010653781704604626, "rewards/accuracies": 0.5, "rewards/chosen": 0.00621157418936491, "rewards/margins": 0.0005123416776768863, "rewards/margins_max": 0.00648439209908247, "rewards/margins_min": -0.005089015234261751, "rewards/margins_std": 0.005148830823600292, "rewards/rejected": 0.005699232220649719, "step": 60 }, { "dpo_losses": 0.6919016242027283, "epoch": 0.02, "grad_norm": 10.169959947175231, "learning_rate": 9.138381201044387e-07, "logits/chosen": -2.430279493331909, "logits/rejected": -2.300011157989502, "logps/chosen": -246.1517333984375, "logps/rejected": -245.7523651123047, "loss": 0.6937, "positive_losses": 0.008912896737456322, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.008372459560632706, "rewards/margins": 0.002516259206458926, "rewards/margins_max": 0.014019829221069813, "rewards/margins_min": -0.006390625145286322, "rewards/margins_std": 0.009104877710342407, "rewards/rejected": 0.005856200121343136, "step": 70 }, { "dpo_losses": 0.6920778751373291, "epoch": 0.02, "grad_norm": 3.3745896831741122, "learning_rate": 1.0443864229765013e-06, "logits/chosen": -2.458425998687744, "logits/rejected": -2.366384983062744, "logps/chosen": -279.7979431152344, "logps/rejected": -237.0563201904297, "loss": 0.6926, "positive_losses": 0.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.00952855497598648, "rewards/margins": 0.0021492778323590755, "rewards/margins_max": 0.009919574484229088, "rewards/margins_min": -0.004271012265235186, "rewards/margins_std": 0.006084291730076075, "rewards/rejected": 0.007379277143627405, "step": 80 }, { "dpo_losses": 0.6924630403518677, "epoch": 0.02, "grad_norm": 20.33193781498704, "learning_rate": 1.1749347258485642e-06, "logits/chosen": -2.5910892486572266, "logits/rejected": -2.5497522354125977, "logps/chosen": -343.89093017578125, "logps/rejected": -322.6294860839844, "loss": 0.6935, "positive_losses": 0.017118072137236595, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.012896155938506126, "rewards/margins": 0.0013928873231634498, "rewards/margins_max": 0.01210005208849907, "rewards/margins_min": -0.009835563600063324, "rewards/margins_std": 0.00964871421456337, "rewards/rejected": 0.011503269895911217, "step": 90 }, { "dpo_losses": 0.6911870837211609, "epoch": 0.03, "grad_norm": 29.82351449111192, "learning_rate": 1.305483028720627e-06, "logits/chosen": -2.2807178497314453, "logits/rejected": -2.168356418609619, "logps/chosen": -275.6390075683594, "logps/rejected": -220.49844360351562, "loss": 0.6938, "positive_losses": 0.03862800449132919, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.011399135924875736, "rewards/margins": 0.003962818533182144, "rewards/margins_max": 0.017177527770400047, "rewards/margins_min": -0.00563895795494318, "rewards/margins_std": 0.010453316383063793, "rewards/rejected": 0.0074363164603710175, "step": 100 }, { "epoch": 0.03, "eval_dpo_losses": 0.6916342973709106, "eval_logits/chosen": -2.341143846511841, "eval_logits/rejected": -2.230705499649048, "eval_logps/chosen": -274.44976806640625, "eval_logps/rejected": -261.8032531738281, "eval_loss": 0.6935612559318542, "eval_positive_losses": 0.02029024064540863, "eval_rewards/accuracies": 0.5694444179534912, "eval_rewards/chosen": 0.01324367430061102, "eval_rewards/margins": 0.003073295811191201, "eval_rewards/margins_max": 0.024846632033586502, "eval_rewards/margins_min": -0.01431260071694851, "eval_rewards/margins_std": 0.012665905058383942, "eval_rewards/rejected": 0.010170378722250462, "eval_runtime": 388.3897, "eval_samples_per_second": 5.149, "eval_steps_per_second": 0.162, "step": 100 }, { "dpo_losses": 0.6931635141372681, "epoch": 0.03, "grad_norm": 2.9742291132929513, "learning_rate": 1.4360313315926894e-06, "logits/chosen": -2.4229214191436768, "logits/rejected": -2.3891592025756836, "logps/chosen": -252.0586395263672, "logps/rejected": -261.424560546875, "loss": 0.6922, "positive_losses": 0.012239265255630016, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.014577746391296387, "rewards/margins": 5.038874405727256e-06, "rewards/margins_max": 0.013945160433650017, "rewards/margins_min": -0.011510682292282581, "rewards/margins_std": 0.011192189529538155, "rewards/rejected": 0.014572707004845142, "step": 110 }, { "dpo_losses": 0.6913285255432129, "epoch": 0.03, "grad_norm": 2.9337520632840195, "learning_rate": 1.5665796344647521e-06, "logits/chosen": -2.3784403800964355, "logits/rejected": -2.3113296031951904, "logps/chosen": -230.49819946289062, "logps/rejected": -233.60287475585938, "loss": 0.6929, "positive_losses": 0.04759025573730469, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.015536767430603504, "rewards/margins": 0.0036879326216876507, "rewards/margins_max": 0.02044644206762314, "rewards/margins_min": -0.010087795555591583, "rewards/margins_std": 0.01347220130264759, "rewards/rejected": 0.011848834343254566, "step": 120 }, { "dpo_losses": 0.693742036819458, "epoch": 0.03, "grad_norm": 2.7693489993972675, "learning_rate": 1.6971279373368146e-06, "logits/chosen": -2.398063898086548, "logits/rejected": -2.3056235313415527, "logps/chosen": -266.774169921875, "logps/rejected": -357.10162353515625, "loss": 0.6927, "positive_losses": 0.009470367804169655, "rewards/accuracies": 0.5, "rewards/chosen": 0.018754294142127037, "rewards/margins": -0.00105818803422153, "rewards/margins_max": 0.019026655703783035, "rewards/margins_min": -0.026388054713606834, "rewards/margins_std": 0.020254066213965416, "rewards/rejected": 0.019812483340501785, "step": 130 }, { "dpo_losses": 0.6886600255966187, "epoch": 0.04, "grad_norm": 18.145407927556267, "learning_rate": 1.8276762402088774e-06, "logits/chosen": -2.3262314796447754, "logits/rejected": -2.1723906993865967, "logps/chosen": -213.9654083251953, "logps/rejected": -206.73922729492188, "loss": 0.696, "positive_losses": 0.01792144775390625, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.018916286528110504, "rewards/margins": 0.009153150022029877, "rewards/margins_max": 0.039224009960889816, "rewards/margins_min": -0.00752085167914629, "rewards/margins_std": 0.021640609949827194, "rewards/rejected": 0.009763132780790329, "step": 140 }, { "dpo_losses": 0.6908050775527954, "epoch": 0.04, "grad_norm": 2.624476789375122, "learning_rate": 1.9582245430809403e-06, "logits/chosen": -2.3353309631347656, "logits/rejected": -2.2895712852478027, "logps/chosen": -213.6117706298828, "logps/rejected": -242.24832153320312, "loss": 0.6921, "positive_losses": 0.0013246536254882812, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.020151834934949875, "rewards/margins": 0.004766896832734346, "rewards/margins_max": 0.024007394909858704, "rewards/margins_min": -0.01263777632266283, "rewards/margins_std": 0.016176212579011917, "rewards/rejected": 0.01538493949919939, "step": 150 }, { "dpo_losses": 0.6894623041152954, "epoch": 0.04, "grad_norm": 4.405549988028905, "learning_rate": 2.0887728459530026e-06, "logits/chosen": -2.4915030002593994, "logits/rejected": -2.319269895553589, "logps/chosen": -254.7397003173828, "logps/rejected": -223.55178833007812, "loss": 0.6911, "positive_losses": 0.0026970624458044767, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.025369063019752502, "rewards/margins": 0.00744537403807044, "rewards/margins_max": 0.02455688826739788, "rewards/margins_min": -0.009821002371609211, "rewards/margins_std": 0.015232970006763935, "rewards/rejected": 0.017923690378665924, "step": 160 }, { "dpo_losses": 0.6887125372886658, "epoch": 0.04, "grad_norm": 13.085819728947653, "learning_rate": 2.2193211488250653e-06, "logits/chosen": -2.2926387786865234, "logits/rejected": -2.3033576011657715, "logps/chosen": -226.3302459716797, "logps/rejected": -203.73971557617188, "loss": 0.6918, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.023752663284540176, "rewards/margins": 0.008978691883385181, "rewards/margins_max": 0.030928168445825577, "rewards/margins_min": -0.008593680337071419, "rewards/margins_std": 0.018045030534267426, "rewards/rejected": 0.01477397233247757, "step": 170 }, { "dpo_losses": 0.6858507394790649, "epoch": 0.05, "grad_norm": 23.83060950162102, "learning_rate": 2.3498694516971284e-06, "logits/chosen": -2.480717182159424, "logits/rejected": -2.255446672439575, "logps/chosen": -299.66571044921875, "logps/rejected": -242.9132537841797, "loss": 0.6958, "positive_losses": 0.029059791937470436, "rewards/accuracies": 0.75, "rewards/chosen": 0.02899104915559292, "rewards/margins": 0.014792178757488728, "rewards/margins_max": 0.03641856461763382, "rewards/margins_min": -0.006799428258091211, "rewards/margins_std": 0.019825013354420662, "rewards/rejected": 0.014198869466781616, "step": 180 }, { "dpo_losses": 0.6841143369674683, "epoch": 0.05, "grad_norm": 2.775216233360337, "learning_rate": 2.4804177545691907e-06, "logits/chosen": -2.6274118423461914, "logits/rejected": -2.3566718101501465, "logps/chosen": -267.79119873046875, "logps/rejected": -205.7725372314453, "loss": 0.6902, "positive_losses": 8.94546537892893e-05, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0354822501540184, "rewards/margins": 0.018354758620262146, "rewards/margins_max": 0.049285221844911575, "rewards/margins_min": -0.006443038582801819, "rewards/margins_std": 0.02506195567548275, "rewards/rejected": 0.017127489671111107, "step": 190 }, { "dpo_losses": 0.6827012300491333, "epoch": 0.05, "grad_norm": 13.31831111247488, "learning_rate": 2.610966057441254e-06, "logits/chosen": -2.3919177055358887, "logits/rejected": -2.240403175354004, "logps/chosen": -301.1272277832031, "logps/rejected": -250.2685546875, "loss": 0.6869, "positive_losses": 0.0, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.04041924327611923, "rewards/margins": 0.0211967583745718, "rewards/margins_max": 0.052608318626880646, "rewards/margins_min": -0.004013159312307835, "rewards/margins_std": 0.02568567916750908, "rewards/rejected": 0.019222481176257133, "step": 200 }, { "epoch": 0.05, "eval_dpo_losses": 0.686770498752594, "eval_logits/chosen": -2.3372015953063965, "eval_logits/rejected": -2.2271885871887207, "eval_logps/chosen": -272.1819152832031, "eval_logps/rejected": -260.5340576171875, "eval_loss": 0.6912068128585815, "eval_positive_losses": 0.03335093334317207, "eval_rewards/accuracies": 0.6646825671195984, "eval_rewards/chosen": 0.03592230752110481, "eval_rewards/margins": 0.013060340657830238, "eval_rewards/margins_max": 0.06305618584156036, "eval_rewards/margins_min": -0.03152156248688698, "eval_rewards/margins_std": 0.030760591849684715, "eval_rewards/rejected": 0.022861965000629425, "eval_runtime": 399.7005, "eval_samples_per_second": 5.004, "eval_steps_per_second": 0.158, "step": 200 }, { "dpo_losses": 0.6853702068328857, "epoch": 0.05, "grad_norm": 2.968156614731641, "learning_rate": 2.741514360313316e-06, "logits/chosen": -2.4707226753234863, "logits/rejected": -2.259084701538086, "logps/chosen": -301.0274658203125, "logps/rejected": -272.90789794921875, "loss": 0.6929, "positive_losses": 0.04770316928625107, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.04160173237323761, "rewards/margins": 0.015960896387696266, "rewards/margins_max": 0.05990996211767197, "rewards/margins_min": -0.021231159567832947, "rewards/margins_std": 0.035254962742328644, "rewards/rejected": 0.025640835985541344, "step": 210 }, { "dpo_losses": 0.6867796182632446, "epoch": 0.06, "grad_norm": 9.552806054538372, "learning_rate": 2.872062663185379e-06, "logits/chosen": -2.2375309467315674, "logits/rejected": -2.1978111267089844, "logps/chosen": -249.0443572998047, "logps/rejected": -284.0054931640625, "loss": 0.6883, "positive_losses": 0.0650840774178505, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.04898852854967117, "rewards/margins": 0.013053612783551216, "rewards/margins_max": 0.0420328751206398, "rewards/margins_min": -0.0185235682874918, "rewards/margins_std": 0.026452433317899704, "rewards/rejected": 0.03593491390347481, "step": 220 }, { "dpo_losses": 0.6791070699691772, "epoch": 0.06, "grad_norm": 15.295810619570497, "learning_rate": 3.0026109660574416e-06, "logits/chosen": -2.457859516143799, "logits/rejected": -2.316096782684326, "logps/chosen": -294.4407043457031, "logps/rejected": -250.66259765625, "loss": 0.6887, "positive_losses": 0.012937545776367188, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.06922395527362823, "rewards/margins": 0.02883235737681389, "rewards/margins_max": 0.08588799834251404, "rewards/margins_min": -0.01900091953575611, "rewards/margins_std": 0.045562244951725006, "rewards/rejected": 0.04039159044623375, "step": 230 }, { "dpo_losses": 0.6845782399177551, "epoch": 0.06, "grad_norm": 2.830011739537992, "learning_rate": 3.1331592689295043e-06, "logits/chosen": -2.333503246307373, "logits/rejected": -2.0535292625427246, "logps/chosen": -251.6215362548828, "logps/rejected": -214.29605102539062, "loss": 0.6811, "positive_losses": 0.00905685406178236, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.06861050426959991, "rewards/margins": 0.017773425206542015, "rewards/margins_max": 0.07205475866794586, "rewards/margins_min": -0.021642131730914116, "rewards/margins_std": 0.04276832193136215, "rewards/rejected": 0.05083707720041275, "step": 240 }, { "dpo_losses": 0.6792364120483398, "epoch": 0.07, "grad_norm": 12.833382610540202, "learning_rate": 3.263707571801567e-06, "logits/chosen": -2.4070284366607666, "logits/rejected": -2.384169816970825, "logps/chosen": -260.6197204589844, "logps/rejected": -249.26040649414062, "loss": 0.6897, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.06667246669530869, "rewards/margins": 0.028825070708990097, "rewards/margins_max": 0.08307720720767975, "rewards/margins_min": -0.012393154203891754, "rewards/margins_std": 0.04193580895662308, "rewards/rejected": 0.03784739598631859, "step": 250 }, { "dpo_losses": 0.6748965978622437, "epoch": 0.07, "grad_norm": 2.6022964679381664, "learning_rate": 3.3942558746736293e-06, "logits/chosen": -2.355379343032837, "logits/rejected": -2.2721242904663086, "logps/chosen": -325.04339599609375, "logps/rejected": -321.17681884765625, "loss": 0.6799, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.08676992356777191, "rewards/margins": 0.03794393688440323, "rewards/margins_max": 0.11332438141107559, "rewards/margins_min": -0.019129080697894096, "rewards/margins_std": 0.059720832854509354, "rewards/rejected": 0.04882598668336868, "step": 260 }, { "dpo_losses": 0.6814028024673462, "epoch": 0.07, "grad_norm": 2.5397356754848657, "learning_rate": 3.524804177545692e-06, "logits/chosen": -2.318666934967041, "logits/rejected": -2.2338852882385254, "logps/chosen": -297.05511474609375, "logps/rejected": -285.2984924316406, "loss": 0.6818, "positive_losses": 0.018982697278261185, "rewards/accuracies": 0.5, "rewards/chosen": 0.1056601032614708, "rewards/margins": 0.025041431188583374, "rewards/margins_max": 0.10871823877096176, "rewards/margins_min": -0.04196007549762726, "rewards/margins_std": 0.06698973476886749, "rewards/rejected": 0.08061867207288742, "step": 270 }, { "dpo_losses": 0.6854309439659119, "epoch": 0.07, "grad_norm": 2.414591413412544, "learning_rate": 3.6553524804177547e-06, "logits/chosen": -2.429506301879883, "logits/rejected": -2.4180550575256348, "logps/chosen": -258.20751953125, "logps/rejected": -233.81112670898438, "loss": 0.7195, "positive_losses": 0.2982654571533203, "rewards/accuracies": 0.5, "rewards/chosen": 0.10437098890542984, "rewards/margins": 0.018108632415533066, "rewards/margins_max": 0.11589519679546356, "rewards/margins_min": -0.08476543426513672, "rewards/margins_std": 0.08821991831064224, "rewards/rejected": 0.08626236021518707, "step": 280 }, { "dpo_losses": 0.6730380058288574, "epoch": 0.08, "grad_norm": 2.862189570089975, "learning_rate": 3.7859007832898174e-06, "logits/chosen": -2.4280779361724854, "logits/rejected": -2.2283263206481934, "logps/chosen": -288.5790100097656, "logps/rejected": -232.5244140625, "loss": 0.6883, "positive_losses": 0.0, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1280081868171692, "rewards/margins": 0.041770853102207184, "rewards/margins_max": 0.10442335903644562, "rewards/margins_min": -0.022371429949998856, "rewards/margins_std": 0.05768733471632004, "rewards/rejected": 0.08623732626438141, "step": 290 }, { "dpo_losses": 0.6745599508285522, "epoch": 0.08, "grad_norm": 12.921590412147294, "learning_rate": 3.9164490861618806e-06, "logits/chosen": -2.6218185424804688, "logits/rejected": -2.42942476272583, "logps/chosen": -296.28118896484375, "logps/rejected": -232.6905059814453, "loss": 0.6821, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.14077244699001312, "rewards/margins": 0.03875672444701195, "rewards/margins_max": 0.1144590824842453, "rewards/margins_min": -0.03806469962000847, "rewards/margins_std": 0.06972731649875641, "rewards/rejected": 0.10201573371887207, "step": 300 }, { "epoch": 0.08, "eval_dpo_losses": 0.6765538454055786, "eval_logits/chosen": -2.3237428665161133, "eval_logits/rejected": -2.21431040763855, "eval_logps/chosen": -262.7723693847656, "eval_logps/rejected": -253.2898712158203, "eval_loss": 0.679607093334198, "eval_positive_losses": 0.018218902871012688, "eval_rewards/accuracies": 0.6984127163887024, "eval_rewards/chosen": 0.1300174593925476, "eval_rewards/margins": 0.03471315652132034, "eval_rewards/margins_max": 0.14281484484672546, "eval_rewards/margins_min": -0.06296098977327347, "eval_rewards/margins_std": 0.06745360791683197, "eval_rewards/rejected": 0.09530431777238846, "eval_runtime": 387.0424, "eval_samples_per_second": 5.167, "eval_steps_per_second": 0.163, "step": 300 }, { "dpo_losses": 0.6738269329071045, "epoch": 0.08, "grad_norm": 3.0335645196308554, "learning_rate": 4.046997389033943e-06, "logits/chosen": -2.4384329319000244, "logits/rejected": -2.4015088081359863, "logps/chosen": -266.3841247558594, "logps/rejected": -292.3809814453125, "loss": 0.6769, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.12135608494281769, "rewards/margins": 0.04019695892930031, "rewards/margins_max": 0.10660040378570557, "rewards/margins_min": -0.021759014576673508, "rewards/margins_std": 0.05759907513856888, "rewards/rejected": 0.08115912973880768, "step": 310 }, { "dpo_losses": 0.668108344078064, "epoch": 0.08, "grad_norm": 2.693233849447548, "learning_rate": 4.177545691906005e-06, "logits/chosen": -2.4366071224212646, "logits/rejected": -2.339660167694092, "logps/chosen": -288.8175354003906, "logps/rejected": -256.23565673828125, "loss": 0.6929, "positive_losses": 0.1860916167497635, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1333363950252533, "rewards/margins": 0.053958844393491745, "rewards/margins_max": 0.17244751751422882, "rewards/margins_min": -0.061126988381147385, "rewards/margins_std": 0.10477306693792343, "rewards/rejected": 0.07937754690647125, "step": 320 }, { "dpo_losses": 0.6810218691825867, "epoch": 0.09, "grad_norm": 2.685426666343741, "learning_rate": 4.308093994778068e-06, "logits/chosen": -2.3714699745178223, "logits/rejected": -2.2585549354553223, "logps/chosen": -261.8404846191406, "logps/rejected": -277.73687744140625, "loss": 0.6823, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1293381154537201, "rewards/margins": 0.026614252477884293, "rewards/margins_max": 0.12214706838130951, "rewards/margins_min": -0.06969315558671951, "rewards/margins_std": 0.08482696861028671, "rewards/rejected": 0.10272388160228729, "step": 330 }, { "dpo_losses": 0.6689623594284058, "epoch": 0.09, "grad_norm": 3.0156613678789888, "learning_rate": 4.4386422976501306e-06, "logits/chosen": -2.2923426628112793, "logits/rejected": -2.284660816192627, "logps/chosen": -280.08709716796875, "logps/rejected": -274.1883850097656, "loss": 0.6747, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.1294107437133789, "rewards/margins": 0.050875864923000336, "rewards/margins_max": 0.14234112203121185, "rewards/margins_min": -0.01888411119580269, "rewards/margins_std": 0.07344356924295425, "rewards/rejected": 0.07853487133979797, "step": 340 }, { "dpo_losses": 0.6817235946655273, "epoch": 0.09, "grad_norm": 11.50219838235079, "learning_rate": 4.569190600522193e-06, "logits/chosen": -2.2793664932250977, "logits/rejected": -2.1003825664520264, "logps/chosen": -254.3335723876953, "logps/rejected": -225.3618927001953, "loss": 0.706, "positive_losses": 0.24355831742286682, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.10696812719106674, "rewards/margins": 0.026087790727615356, "rewards/margins_max": 0.1430610865354538, "rewards/margins_min": -0.10218159109354019, "rewards/margins_std": 0.10859046131372452, "rewards/rejected": 0.08088032901287079, "step": 350 }, { "dpo_losses": 0.6678518056869507, "epoch": 0.09, "grad_norm": 8.67371531711433, "learning_rate": 4.699738903394257e-06, "logits/chosen": -2.6383492946624756, "logits/rejected": -2.403627872467041, "logps/chosen": -346.515869140625, "logps/rejected": -272.076416015625, "loss": 0.6808, "positive_losses": 0.27331846952438354, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.17022207379341125, "rewards/margins": 0.05378316715359688, "rewards/margins_max": 0.1587308943271637, "rewards/margins_min": -0.05835150554776192, "rewards/margins_std": 0.09476341307163239, "rewards/rejected": 0.11643888801336288, "step": 360 }, { "dpo_losses": 0.6787279844284058, "epoch": 0.1, "grad_norm": 2.415861181875224, "learning_rate": 4.8302872062663196e-06, "logits/chosen": -2.2024409770965576, "logits/rejected": -2.2221169471740723, "logps/chosen": -188.47567749023438, "logps/rejected": -224.8198699951172, "loss": 0.6839, "positive_losses": 0.03471221774816513, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.12595216929912567, "rewards/margins": 0.030537093058228493, "rewards/margins_max": 0.1175287589430809, "rewards/margins_min": -0.04461822658777237, "rewards/margins_std": 0.07086901366710663, "rewards/rejected": 0.09541507065296173, "step": 370 }, { "dpo_losses": 0.6720711588859558, "epoch": 0.1, "grad_norm": 2.5270318571436814, "learning_rate": 4.9608355091383814e-06, "logits/chosen": -2.385875940322876, "logits/rejected": -2.0670740604400635, "logps/chosen": -249.42141723632812, "logps/rejected": -245.84146118164062, "loss": 0.6769, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.13377264142036438, "rewards/margins": 0.04521537199616432, "rewards/margins_max": 0.15654854476451874, "rewards/margins_min": -0.06165168806910515, "rewards/margins_std": 0.09616450220346451, "rewards/rejected": 0.08855727314949036, "step": 380 }, { "dpo_losses": 0.6597688794136047, "epoch": 0.1, "grad_norm": 14.069604867512439, "learning_rate": 4.9999488562447675e-06, "logits/chosen": -2.5476415157318115, "logits/rejected": -2.402465343475342, "logps/chosen": -253.4579620361328, "logps/rejected": -245.3377685546875, "loss": 0.6739, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.15567433834075928, "rewards/margins": 0.06992227584123611, "rewards/margins_max": 0.1612171232700348, "rewards/margins_min": -0.019116202369332314, "rewards/margins_std": 0.08076639473438263, "rewards/rejected": 0.08575205504894257, "step": 390 }, { "dpo_losses": 0.6859565377235413, "epoch": 0.1, "grad_norm": 2.4604770774649514, "learning_rate": 4.999698361256577e-06, "logits/chosen": -2.3254785537719727, "logits/rejected": -2.209468364715576, "logps/chosen": -219.5220947265625, "logps/rejected": -215.3175048828125, "loss": 0.6907, "positive_losses": 0.3594726622104645, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.10298417508602142, "rewards/margins": 0.017944559454917908, "rewards/margins_max": 0.13592465221881866, "rewards/margins_min": -0.11202068626880646, "rewards/margins_std": 0.11073519289493561, "rewards/rejected": 0.08503963053226471, "step": 400 }, { "epoch": 0.1, "eval_dpo_losses": 0.6648116111755371, "eval_logits/chosen": -2.2944986820220947, "eval_logits/rejected": -2.186610221862793, "eval_logps/chosen": -263.6666259765625, "eval_logps/rejected": -256.7861328125, "eval_loss": 0.6964106559753418, "eval_positive_losses": 0.25473934412002563, "eval_rewards/accuracies": 0.7341269850730896, "eval_rewards/chosen": 0.1210751160979271, "eval_rewards/margins": 0.060733649879693985, "eval_rewards/margins_max": 0.23180438578128815, "eval_rewards/margins_min": -0.09239085763692856, "eval_rewards/margins_std": 0.10815587639808655, "eval_rewards/rejected": 0.06034146249294281, "eval_runtime": 387.8046, "eval_samples_per_second": 5.157, "eval_steps_per_second": 0.162, "step": 400 }, { "dpo_losses": 0.6753194332122803, "epoch": 0.11, "grad_norm": 13.349243462660745, "learning_rate": 4.999239142174581e-06, "logits/chosen": -2.279818058013916, "logits/rejected": -2.0927653312683105, "logps/chosen": -282.8083190917969, "logps/rejected": -286.2363586425781, "loss": 0.6957, "positive_losses": 0.19796772301197052, "rewards/accuracies": 0.625, "rewards/chosen": 0.10469593852758408, "rewards/margins": 0.03812658041715622, "rewards/margins_max": 0.12132169306278229, "rewards/margins_min": -0.06464502960443497, "rewards/margins_std": 0.0828610509634018, "rewards/rejected": 0.06656936556100845, "step": 410 }, { "dpo_losses": 0.6530672907829285, "epoch": 0.11, "grad_norm": 9.502928433052148, "learning_rate": 4.99857123734344e-06, "logits/chosen": -2.3241848945617676, "logits/rejected": -2.1916375160217285, "logps/chosen": -247.09542846679688, "logps/rejected": -225.3440704345703, "loss": 0.6798, "positive_losses": 0.07038631290197372, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.12835900485515594, "rewards/margins": 0.08442454040050507, "rewards/margins_max": 0.20049388706684113, "rewards/margins_min": -0.000227789583732374, "rewards/margins_std": 0.0884331688284874, "rewards/rejected": 0.04393446445465088, "step": 420 }, { "dpo_losses": 0.6647271513938904, "epoch": 0.11, "grad_norm": 30.45088919608147, "learning_rate": 4.997694702533016e-06, "logits/chosen": -2.3110671043395996, "logits/rejected": -2.070655107498169, "logps/chosen": -274.56878662109375, "logps/rejected": -207.313720703125, "loss": 0.6796, "positive_losses": 0.10166008770465851, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.15044531226158142, "rewards/margins": 0.060526229441165924, "rewards/margins_max": 0.1840842068195343, "rewards/margins_min": -0.03285117819905281, "rewards/margins_std": 0.09493841230869293, "rewards/rejected": 0.0899190753698349, "step": 430 }, { "dpo_losses": 0.6653302311897278, "epoch": 0.12, "grad_norm": 2.715796258265014, "learning_rate": 4.996609610933713e-06, "logits/chosen": -2.388329267501831, "logits/rejected": -2.3687074184417725, "logps/chosen": -257.00103759765625, "logps/rejected": -256.05413818359375, "loss": 0.699, "positive_losses": 0.5066024661064148, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.13530997931957245, "rewards/margins": 0.060604095458984375, "rewards/margins_max": 0.19812436401844025, "rewards/margins_min": -0.05112669989466667, "rewards/margins_std": 0.1107383519411087, "rewards/rejected": 0.07470588386058807, "step": 440 }, { "dpo_losses": 0.6691263914108276, "epoch": 0.12, "grad_norm": 2.373659860065595, "learning_rate": 4.995316053150366e-06, "logits/chosen": -2.498375415802002, "logits/rejected": -2.2869575023651123, "logps/chosen": -263.6067199707031, "logps/rejected": -221.4702606201172, "loss": 0.6952, "positive_losses": 0.3391250669956207, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.13136933743953705, "rewards/margins": 0.0507415235042572, "rewards/margins_max": 0.14853163063526154, "rewards/margins_min": -0.043901462107896805, "rewards/margins_std": 0.08198720961809158, "rewards/rejected": 0.08062782138586044, "step": 450 }, { "dpo_losses": 0.665686309337616, "epoch": 0.12, "grad_norm": 6.985271763249108, "learning_rate": 4.9938141371946815e-06, "logits/chosen": -2.2182416915893555, "logits/rejected": -2.19655704498291, "logps/chosen": -226.240234375, "logps/rejected": -250.16201782226562, "loss": 0.6809, "positive_losses": 0.20003890991210938, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.12023647874593735, "rewards/margins": 0.05803869292140007, "rewards/margins_max": 0.16543307900428772, "rewards/margins_min": -0.03566453233361244, "rewards/margins_std": 0.09299205243587494, "rewards/rejected": 0.06219778582453728, "step": 460 }, { "dpo_losses": 0.674501359462738, "epoch": 0.12, "grad_norm": 13.987007651983344, "learning_rate": 4.992103988476206e-06, "logits/chosen": -2.3932113647460938, "logits/rejected": -2.3694698810577393, "logps/chosen": -269.54296875, "logps/rejected": -271.8418884277344, "loss": 0.6983, "positive_losses": 0.5444073677062988, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.11291886866092682, "rewards/margins": 0.041192490607500076, "rewards/margins_max": 0.149806946516037, "rewards/margins_min": -0.0638364925980568, "rewards/margins_std": 0.09488866478204727, "rewards/rejected": 0.07172638922929764, "step": 470 }, { "dpo_losses": 0.6648268699645996, "epoch": 0.13, "grad_norm": 13.186551172916197, "learning_rate": 4.990185749791866e-06, "logits/chosen": -2.3940088748931885, "logits/rejected": -2.2588846683502197, "logps/chosen": -264.1141662597656, "logps/rejected": -227.1199188232422, "loss": 0.6747, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.14029274880886078, "rewards/margins": 0.06220408156514168, "rewards/margins_max": 0.1558995395898819, "rewards/margins_min": -0.07043421268463135, "rewards/margins_std": 0.10141684859991074, "rewards/rejected": 0.07808864861726761, "step": 480 }, { "dpo_losses": 0.6573482751846313, "epoch": 0.13, "grad_norm": 16.430013481965805, "learning_rate": 4.9880595813140395e-06, "logits/chosen": -2.359638214111328, "logits/rejected": -2.35622501373291, "logps/chosen": -292.9980163574219, "logps/rejected": -299.01513671875, "loss": 0.6991, "positive_losses": 0.18225860595703125, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.15383556485176086, "rewards/margins": 0.07642128318548203, "rewards/margins_max": 0.22237035632133484, "rewards/margins_min": -0.05203705281019211, "rewards/margins_std": 0.11964057385921478, "rewards/rejected": 0.07741429656744003, "step": 490 }, { "dpo_losses": 0.6754809617996216, "epoch": 0.13, "grad_norm": 12.093163291243192, "learning_rate": 4.985725660577184e-06, "logits/chosen": -2.369516372680664, "logits/rejected": -2.342146396636963, "logps/chosen": -267.4049072265625, "logps/rejected": -229.26321411132812, "loss": 0.7003, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15567676723003387, "rewards/margins": 0.038981594145298004, "rewards/margins_max": 0.1621088981628418, "rewards/margins_min": -0.09022364765405655, "rewards/margins_std": 0.11141878366470337, "rewards/rejected": 0.11669518798589706, "step": 500 }, { "epoch": 0.13, "eval_dpo_losses": 0.6721124649047852, "eval_logits/chosen": -2.2857210636138916, "eval_logits/rejected": -2.177556276321411, "eval_logps/chosen": -259.9059753417969, "eval_logps/rejected": -251.4864501953125, "eval_loss": 0.6789628267288208, "eval_positive_losses": 0.0658472552895546, "eval_rewards/accuracies": 0.6686508059501648, "eval_rewards/chosen": 0.1586819589138031, "eval_rewards/margins": 0.04534366726875305, "eval_rewards/margins_max": 0.20670504868030548, "eval_rewards/margins_min": -0.09800657629966736, "eval_rewards/margins_std": 0.102239228785038, "eval_rewards/rejected": 0.11333829164505005, "eval_runtime": 387.3091, "eval_samples_per_second": 5.164, "eval_steps_per_second": 0.163, "step": 500 }, { "dpo_losses": 0.6778862476348877, "epoch": 0.13, "grad_norm": 12.655104597089716, "learning_rate": 4.983184182463009e-06, "logits/chosen": -2.2541165351867676, "logits/rejected": -2.139129161834717, "logps/chosen": -241.78829956054688, "logps/rejected": -235.052734375, "loss": 0.6821, "positive_losses": 0.1570049226284027, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.13940033316612244, "rewards/margins": 0.03358124941587448, "rewards/margins_max": 0.1504804790019989, "rewards/margins_min": -0.06657135486602783, "rewards/margins_std": 0.09891293942928314, "rewards/rejected": 0.10581908375024796, "step": 510 }, { "dpo_losses": 0.6701198816299438, "epoch": 0.14, "grad_norm": 20.612354077712716, "learning_rate": 4.980435359184203e-06, "logits/chosen": -2.224423408508301, "logits/rejected": -1.9810459613800049, "logps/chosen": -247.1111297607422, "logps/rejected": -225.3288116455078, "loss": 0.6858, "positive_losses": 0.06976928561925888, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1302551031112671, "rewards/margins": 0.05010036751627922, "rewards/margins_max": 0.1707076132297516, "rewards/margins_min": -0.053237248212099075, "rewards/margins_std": 0.10070963948965073, "rewards/rejected": 0.08015473932027817, "step": 520 }, { "dpo_losses": 0.6654530167579651, "epoch": 0.14, "grad_norm": 2.807764092624076, "learning_rate": 4.9774794202667236e-06, "logits/chosen": -2.4467451572418213, "logits/rejected": -2.328883409500122, "logps/chosen": -300.3257751464844, "logps/rejected": -288.34832763671875, "loss": 0.6959, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.1512792855501175, "rewards/margins": 0.058304183185100555, "rewards/margins_max": 0.15108561515808105, "rewards/margins_min": -0.011961912736296654, "rewards/margins_std": 0.07505235821008682, "rewards/rejected": 0.09297507256269455, "step": 530 }, { "dpo_losses": 0.6859993934631348, "epoch": 0.14, "grad_norm": 11.307038564263854, "learning_rate": 4.974316612530615e-06, "logits/chosen": -2.2917912006378174, "logits/rejected": -2.251622200012207, "logps/chosen": -254.5506591796875, "logps/rejected": -232.3438720703125, "loss": 0.6778, "positive_losses": 0.11064491420984268, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.1402551233768463, "rewards/margins": 0.01737521030008793, "rewards/margins_max": 0.11270274966955185, "rewards/margins_min": -0.09471074491739273, "rewards/margins_std": 0.09167486429214478, "rewards/rejected": 0.12287990003824234, "step": 540 }, { "dpo_losses": 0.6665059924125671, "epoch": 0.14, "grad_norm": 2.1732535618299003, "learning_rate": 4.970947200069416e-06, "logits/chosen": -2.516829490661621, "logits/rejected": -2.407902240753174, "logps/chosen": -276.01934814453125, "logps/rejected": -250.8043975830078, "loss": 0.6899, "positive_losses": 0.04767026752233505, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.15557391941547394, "rewards/margins": 0.05617040395736694, "rewards/margins_max": 0.16392546892166138, "rewards/margins_min": -0.037529654800891876, "rewards/margins_std": 0.08721224963665009, "rewards/rejected": 0.099403515458107, "step": 550 }, { "dpo_losses": 0.6779977083206177, "epoch": 0.15, "grad_norm": 2.392481064676673, "learning_rate": 4.967371464228096e-06, "logits/chosen": -2.370774745941162, "logits/rejected": -2.404670238494873, "logps/chosen": -268.37286376953125, "logps/rejected": -250.9393310546875, "loss": 0.6862, "positive_losses": 0.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.16901841759681702, "rewards/margins": 0.03337451070547104, "rewards/margins_max": 0.1328039914369583, "rewards/margins_min": -0.06189457327127457, "rewards/margins_std": 0.08851297199726105, "rewards/rejected": 0.13564392924308777, "step": 560 }, { "dpo_losses": 0.6760483980178833, "epoch": 0.15, "grad_norm": 2.2470071809430685, "learning_rate": 4.963589703579569e-06, "logits/chosen": -2.349813938140869, "logits/rejected": -2.1958553791046143, "logps/chosen": -249.5431365966797, "logps/rejected": -222.64614868164062, "loss": 0.6866, "positive_losses": 0.1816307008266449, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.14338871836662292, "rewards/margins": 0.03716425970196724, "rewards/margins_max": 0.1451178640127182, "rewards/margins_min": -0.051981497555971146, "rewards/margins_std": 0.08835052698850632, "rewards/rejected": 0.10622447729110718, "step": 570 }, { "dpo_losses": 0.6414459943771362, "epoch": 0.15, "grad_norm": 2.713884933539946, "learning_rate": 4.9596022338997615e-06, "logits/chosen": -2.7107632160186768, "logits/rejected": -2.4209001064300537, "logps/chosen": -337.2410583496094, "logps/rejected": -255.7893524169922, "loss": 0.7011, "positive_losses": 0.0, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.18762512505054474, "rewards/margins": 0.10993113368749619, "rewards/margins_max": 0.23535898327827454, "rewards/margins_min": 0.013100062496960163, "rewards/margins_std": 0.10084857046604156, "rewards/rejected": 0.07769399881362915, "step": 580 }, { "dpo_losses": 0.6847223043441772, "epoch": 0.15, "grad_norm": 2.0753814254777994, "learning_rate": 4.955409388141243e-06, "logits/chosen": -2.3709237575531006, "logits/rejected": -2.2699685096740723, "logps/chosen": -231.6881866455078, "logps/rejected": -251.21640014648438, "loss": 0.6778, "positive_losses": 0.06961822509765625, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.11436762660741806, "rewards/margins": 0.019310925155878067, "rewards/margins_max": 0.1262083649635315, "rewards/margins_min": -0.07558675855398178, "rewards/margins_std": 0.08758808672428131, "rewards/rejected": 0.0950566977262497, "step": 590 }, { "dpo_losses": 0.6626082062721252, "epoch": 0.16, "grad_norm": 2.1367524473891075, "learning_rate": 4.951011516405429e-06, "logits/chosen": -2.479884624481201, "logits/rejected": -2.3370730876922607, "logps/chosen": -373.1979675292969, "logps/rejected": -300.1449279785156, "loss": 0.6682, "positive_losses": 0.01121444720774889, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.16548365354537964, "rewards/margins": 0.0666542649269104, "rewards/margins_max": 0.23898963630199432, "rewards/margins_min": -0.07070523500442505, "rewards/margins_std": 0.13495507836341858, "rewards/rejected": 0.09882939606904984, "step": 600 }, { "epoch": 0.16, "eval_dpo_losses": 0.6655822992324829, "eval_logits/chosen": -2.2719125747680664, "eval_logits/rejected": -2.1644198894500732, "eval_logps/chosen": -262.09979248046875, "eval_logps/rejected": -255.08895874023438, "eval_loss": 0.6923016309738159, "eval_positive_losses": 0.20340295135974884, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 0.1367436647415161, "eval_rewards/margins": 0.05943042412400246, "eval_rewards/margins_max": 0.24141547083854675, "eval_rewards/margins_min": -0.09355739504098892, "eval_rewards/margins_std": 0.1135849580168724, "eval_rewards/rejected": 0.07731323689222336, "eval_runtime": 387.7718, "eval_samples_per_second": 5.158, "eval_steps_per_second": 0.162, "step": 600 }, { "dpo_losses": 0.6690826416015625, "epoch": 0.16, "grad_norm": 15.99828043280404, "learning_rate": 4.946408985913344e-06, "logits/chosen": -2.3795905113220215, "logits/rejected": -2.176065683364868, "logps/chosen": -314.34039306640625, "logps/rejected": -256.96234130859375, "loss": 0.6776, "positive_losses": 0.10370178520679474, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.14271800220012665, "rewards/margins": 0.051775336265563965, "rewards/margins_max": 0.1486392468214035, "rewards/margins_min": -0.03890662267804146, "rewards/margins_std": 0.08334605395793915, "rewards/rejected": 0.09094268828630447, "step": 610 }, { "dpo_losses": 0.6567118763923645, "epoch": 0.16, "grad_norm": 2.495205668991317, "learning_rate": 4.941602180974958e-06, "logits/chosen": -2.3387537002563477, "logits/rejected": -2.2407264709472656, "logps/chosen": -248.9082489013672, "logps/rejected": -225.10806274414062, "loss": 0.6682, "positive_losses": 0.21484032273292542, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1359931379556656, "rewards/margins": 0.08235027641057968, "rewards/margins_max": 0.2500799596309662, "rewards/margins_min": -0.06781473010778427, "rewards/margins_std": 0.14302915334701538, "rewards/rejected": 0.05364285781979561, "step": 620 }, { "dpo_losses": 0.6537377238273621, "epoch": 0.16, "grad_norm": 2.1405377892319786, "learning_rate": 4.936591502957101e-06, "logits/chosen": -2.459953784942627, "logits/rejected": -2.34082293510437, "logps/chosen": -260.98773193359375, "logps/rejected": -259.555419921875, "loss": 0.6936, "positive_losses": 0.12968750298023224, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1627923846244812, "rewards/margins": 0.08589986711740494, "rewards/margins_max": 0.23806282877922058, "rewards/margins_min": -0.019510917365550995, "rewards/margins_std": 0.1139247864484787, "rewards/rejected": 0.07689251005649567, "step": 630 }, { "dpo_losses": 0.6614780426025391, "epoch": 0.17, "grad_norm": 20.047083555520768, "learning_rate": 4.931377370249946e-06, "logits/chosen": -2.385593891143799, "logits/rejected": -2.247202157974243, "logps/chosen": -307.6918640136719, "logps/rejected": -248.19534301757812, "loss": 0.7049, "positive_losses": 0.4248853623867035, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1519450843334198, "rewards/margins": 0.06993581354618073, "rewards/margins_max": 0.21718063950538635, "rewards/margins_min": -0.0518064871430397, "rewards/margins_std": 0.11829618364572525, "rewards/rejected": 0.08200927823781967, "step": 640 }, { "dpo_losses": 0.6777046918869019, "epoch": 0.17, "grad_norm": 12.200265625110061, "learning_rate": 4.925960218232073e-06, "logits/chosen": -2.39190673828125, "logits/rejected": -2.3569204807281494, "logps/chosen": -255.3836669921875, "logps/rejected": -269.2209777832031, "loss": 0.6955, "positive_losses": 0.48981934785842896, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.13370266556739807, "rewards/margins": 0.037743326276540756, "rewards/margins_max": 0.2125106304883957, "rewards/margins_min": -0.1132003664970398, "rewards/margins_std": 0.14416250586509705, "rewards/rejected": 0.09595932811498642, "step": 650 }, { "dpo_losses": 0.6684109568595886, "epoch": 0.17, "grad_norm": 14.95131921975893, "learning_rate": 4.920340499234116e-06, "logits/chosen": -2.3995680809020996, "logits/rejected": -2.2585902214050293, "logps/chosen": -207.7253875732422, "logps/rejected": -234.0863800048828, "loss": 0.6887, "positive_losses": 0.22086945176124573, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.143407940864563, "rewards/margins": 0.053105246275663376, "rewards/margins_max": 0.16075178980827332, "rewards/margins_min": -0.07296352088451385, "rewards/margins_std": 0.10269738733768463, "rewards/rejected": 0.09030269086360931, "step": 660 }, { "dpo_losses": 0.672214925289154, "epoch": 0.18, "grad_norm": 2.27040301337442, "learning_rate": 4.914518682500995e-06, "logits/chosen": -2.4030909538269043, "logits/rejected": -2.356276035308838, "logps/chosen": -223.3278045654297, "logps/rejected": -229.46792602539062, "loss": 0.6759, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.15397925674915314, "rewards/margins": 0.0457574762403965, "rewards/margins_max": 0.17481981217861176, "rewards/margins_min": -0.0956016555428505, "rewards/margins_std": 0.1210639700293541, "rewards/rejected": 0.10822176933288574, "step": 670 }, { "dpo_losses": 0.6679674386978149, "epoch": 0.18, "grad_norm": 12.540053626132906, "learning_rate": 4.9084952541527315e-06, "logits/chosen": -2.2560207843780518, "logits/rejected": -2.0700149536132812, "logps/chosen": -306.978271484375, "logps/rejected": -280.5941162109375, "loss": 0.6757, "positive_losses": 0.1108788475394249, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1706356704235077, "rewards/margins": 0.05449013039469719, "rewards/margins_max": 0.1885974407196045, "rewards/margins_min": -0.08131690323352814, "rewards/margins_std": 0.11809859424829483, "rewards/rejected": 0.1161455288529396, "step": 680 }, { "dpo_losses": 0.6393178105354309, "epoch": 0.18, "grad_norm": 2.1228303491815472, "learning_rate": 4.902270717143858e-06, "logits/chosen": -2.351646661758423, "logits/rejected": -2.1488397121429443, "logps/chosen": -260.0979919433594, "logps/rejected": -233.5791015625, "loss": 0.6946, "positive_losses": 0.22238540649414062, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.14163188636302948, "rewards/margins": 0.11493770778179169, "rewards/margins_max": 0.240229532122612, "rewards/margins_min": 0.004443532321602106, "rewards/margins_std": 0.10304765403270721, "rewards/rejected": 0.02669418975710869, "step": 690 }, { "dpo_losses": 0.6753467917442322, "epoch": 0.18, "grad_norm": 2.17702877091902, "learning_rate": 4.895845591221427e-06, "logits/chosen": -2.292858123779297, "logits/rejected": -2.2389369010925293, "logps/chosen": -245.1870880126953, "logps/rejected": -249.68881225585938, "loss": 0.7018, "positive_losses": 0.19409790635108948, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1421625316143036, "rewards/margins": 0.039503492414951324, "rewards/margins_max": 0.16869139671325684, "rewards/margins_min": -0.08940212428569794, "rewards/margins_std": 0.11191286891698837, "rewards/rejected": 0.10265902429819107, "step": 700 }, { "epoch": 0.18, "eval_dpo_losses": 0.6687398552894592, "eval_logits/chosen": -2.253567934036255, "eval_logits/rejected": -2.1459801197052, "eval_logps/chosen": -260.107421875, "eval_logps/rejected": -252.4310302734375, "eval_loss": 0.6776766777038574, "eval_positive_losses": 0.08308552950620651, "eval_rewards/accuracies": 0.682539701461792, "eval_rewards/chosen": 0.1566673070192337, "eval_rewards/margins": 0.05277470499277115, "eval_rewards/margins_max": 0.23339077830314636, "eval_rewards/margins_min": -0.10097137093544006, "eval_rewards/margins_std": 0.11228429526090622, "eval_rewards/rejected": 0.10389261692762375, "eval_runtime": 386.7037, "eval_samples_per_second": 5.172, "eval_steps_per_second": 0.163, "step": 700 }, { "dpo_losses": 0.6727067232131958, "epoch": 0.19, "grad_norm": 2.1448456302750465, "learning_rate": 4.8892204128816e-06, "logits/chosen": -2.324993848800659, "logits/rejected": -2.2222707271575928, "logps/chosen": -206.90878295898438, "logps/rejected": -177.45217895507812, "loss": 0.6796, "positive_losses": 0.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.1342659443616867, "rewards/margins": 0.04264216870069504, "rewards/margins_max": 0.11368715763092041, "rewards/margins_min": -0.023314783349633217, "rewards/margins_std": 0.06210507079958916, "rewards/rejected": 0.09162376821041107, "step": 710 }, { "dpo_losses": 0.6426345705986023, "epoch": 0.19, "grad_norm": 7.79717784591578, "learning_rate": 4.882395735324864e-06, "logits/chosen": -2.311656951904297, "logits/rejected": -2.0881175994873047, "logps/chosen": -327.39935302734375, "logps/rejected": -286.40753173828125, "loss": 0.6637, "positive_losses": 0.0, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.1701394021511078, "rewards/margins": 0.10840602219104767, "rewards/margins_max": 0.26491349935531616, "rewards/margins_min": -0.013164414092898369, "rewards/margins_std": 0.12765491008758545, "rewards/rejected": 0.06173338741064072, "step": 720 }, { "dpo_losses": 0.6573339700698853, "epoch": 0.19, "grad_norm": 8.101079383799247, "learning_rate": 4.87537212840983e-06, "logits/chosen": -2.5441060066223145, "logits/rejected": -2.288764715194702, "logps/chosen": -293.4589538574219, "logps/rejected": -260.5919189453125, "loss": 0.6727, "positive_losses": 0.05285034328699112, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1285373568534851, "rewards/margins": 0.07752709090709686, "rewards/margins_max": 0.21534352004528046, "rewards/margins_min": -0.06429310142993927, "rewards/margins_std": 0.12748174369335175, "rewards/rejected": 0.05101025104522705, "step": 730 }, { "dpo_losses": 0.665749192237854, "epoch": 0.19, "grad_norm": 2.860175349055325, "learning_rate": 4.8681501786056545e-06, "logits/chosen": -2.1872236728668213, "logits/rejected": -2.3543593883514404, "logps/chosen": -240.6953582763672, "logps/rejected": -316.33203125, "loss": 0.7149, "positive_losses": 0.7752044796943665, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.15433523058891296, "rewards/margins": 0.061246126890182495, "rewards/margins_max": 0.22508402168750763, "rewards/margins_min": -0.07986617833375931, "rewards/margins_std": 0.13370192050933838, "rewards/rejected": 0.09308910369873047, "step": 740 }, { "dpo_losses": 0.675001323223114, "epoch": 0.2, "grad_norm": 2.4577929554796953, "learning_rate": 4.860730488943068e-06, "logits/chosen": -2.381234645843506, "logits/rejected": -2.408716917037964, "logps/chosen": -225.5152587890625, "logps/rejected": -250.5948028564453, "loss": 0.6832, "positive_losses": 0.4588395953178406, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.14705222845077515, "rewards/margins": 0.040194448083639145, "rewards/margins_max": 0.1591082215309143, "rewards/margins_min": -0.053133536130189896, "rewards/margins_std": 0.09478326141834259, "rewards/rejected": 0.1068577915430069, "step": 750 }, { "dpo_losses": 0.649374783039093, "epoch": 0.2, "grad_norm": 2.461240525738415, "learning_rate": 4.853113678964022e-06, "logits/chosen": -2.4449493885040283, "logits/rejected": -2.254509449005127, "logps/chosen": -259.0587463378906, "logps/rejected": -231.087646484375, "loss": 0.6679, "positive_losses": 0.04253082349896431, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1678490936756134, "rewards/margins": 0.0919952243566513, "rewards/margins_max": 0.19063004851341248, "rewards/margins_min": 0.007611794862896204, "rewards/margins_std": 0.08293931186199188, "rewards/rejected": 0.0758538693189621, "step": 760 }, { "dpo_losses": 0.6608075499534607, "epoch": 0.2, "grad_norm": 15.947113847448277, "learning_rate": 4.845300384669958e-06, "logits/chosen": -2.3778998851776123, "logits/rejected": -2.146644115447998, "logps/chosen": -287.42401123046875, "logps/rejected": -265.72198486328125, "loss": 0.691, "positive_losses": 0.4739212095737457, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.12427745759487152, "rewards/margins": 0.0702078565955162, "rewards/margins_max": 0.23464974761009216, "rewards/margins_min": -0.06977568566799164, "rewards/margins_std": 0.13273942470550537, "rewards/rejected": 0.054069600999355316, "step": 770 }, { "dpo_losses": 0.6654810905456543, "epoch": 0.2, "grad_norm": 2.5770026335934086, "learning_rate": 4.837291258468701e-06, "logits/chosen": -2.3677866458892822, "logits/rejected": -2.251971960067749, "logps/chosen": -282.57391357421875, "logps/rejected": -264.45458984375, "loss": 0.682, "positive_losses": 0.07812710106372833, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.16855712234973907, "rewards/margins": 0.06081680208444595, "rewards/margins_max": 0.20666389167308807, "rewards/margins_min": -0.10391966998577118, "rewards/margins_std": 0.13577458262443542, "rewards/rejected": 0.10774032026529312, "step": 780 }, { "dpo_losses": 0.6754828095436096, "epoch": 0.21, "grad_norm": 2.4665431528699195, "learning_rate": 4.829086969119984e-06, "logits/chosen": -2.3838953971862793, "logits/rejected": -2.343912124633789, "logps/chosen": -260.1561584472656, "logps/rejected": -272.9075622558594, "loss": 0.6795, "positive_losses": 0.07786712795495987, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.16216763854026794, "rewards/margins": 0.038654766976833344, "rewards/margins_max": 0.1642107516527176, "rewards/margins_min": -0.07301913946866989, "rewards/margins_std": 0.10424380004405975, "rewards/rejected": 0.1235128790140152, "step": 790 }, { "dpo_losses": 0.6584834456443787, "epoch": 0.21, "grad_norm": 2.4850340155071398, "learning_rate": 4.820688201679605e-06, "logits/chosen": -2.189183473587036, "logits/rejected": -2.0696215629577637, "logps/chosen": -307.51336669921875, "logps/rejected": -230.66067504882812, "loss": 0.6821, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.16072562336921692, "rewards/margins": 0.07449884712696075, "rewards/margins_max": 0.21343155205249786, "rewards/margins_min": -0.04383747652173042, "rewards/margins_std": 0.11556984484195709, "rewards/rejected": 0.08622678369283676, "step": 800 }, { "epoch": 0.21, "eval_dpo_losses": 0.6652129888534546, "eval_logits/chosen": -2.2634823322296143, "eval_logits/rejected": -2.1533122062683105, "eval_logps/chosen": -259.8833923339844, "eval_logps/rejected": -253.0055389404297, "eval_loss": 0.6809996366500854, "eval_positive_losses": 0.12352155894041061, "eval_rewards/accuracies": 0.6746031641960144, "eval_rewards/chosen": 0.15890754759311676, "eval_rewards/margins": 0.06075997278094292, "eval_rewards/margins_max": 0.26078224182128906, "eval_rewards/margins_min": -0.11133670806884766, "eval_rewards/margins_std": 0.12388527393341064, "eval_rewards/rejected": 0.09814754873514175, "eval_runtime": 394.1939, "eval_samples_per_second": 5.074, "eval_steps_per_second": 0.16, "step": 800 }, { "dpo_losses": 0.6675550937652588, "epoch": 0.21, "grad_norm": 10.102902635510135, "learning_rate": 4.8120956574422315e-06, "logits/chosen": -2.353468418121338, "logits/rejected": -2.230372667312622, "logps/chosen": -275.5753479003906, "logps/rejected": -259.0949401855469, "loss": 0.6949, "positive_losses": 0.26875075697898865, "rewards/accuracies": 0.625, "rewards/chosen": 0.1548309326171875, "rewards/margins": 0.05591664835810661, "rewards/margins_max": 0.19111007452011108, "rewards/margins_min": -0.0724906176328659, "rewards/margins_std": 0.11679482460021973, "rewards/rejected": 0.09891428053379059, "step": 810 }, { "dpo_losses": 0.6544283032417297, "epoch": 0.21, "grad_norm": 18.390357833127357, "learning_rate": 4.803310053882831e-06, "logits/chosen": -2.2659404277801514, "logits/rejected": -2.1163411140441895, "logps/chosen": -237.2716064453125, "logps/rejected": -204.75767517089844, "loss": 0.6854, "positive_losses": 0.1876932680606842, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.14170509576797485, "rewards/margins": 0.08338084071874619, "rewards/margins_max": 0.23509807884693146, "rewards/margins_min": -0.035363998264074326, "rewards/margins_std": 0.12274130433797836, "rewards/rejected": 0.058324266225099564, "step": 820 }, { "dpo_losses": 0.6639114618301392, "epoch": 0.22, "grad_norm": 2.671523578577587, "learning_rate": 4.794332124596775e-06, "logits/chosen": -2.388666868209839, "logits/rejected": -2.2445638179779053, "logps/chosen": -287.84686279296875, "logps/rejected": -282.60980224609375, "loss": 0.6961, "positive_losses": 0.049111176282167435, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.15192775428295135, "rewards/margins": 0.06462886929512024, "rewards/margins_max": 0.2050425261259079, "rewards/margins_min": -0.07998664677143097, "rewards/margins_std": 0.12840968370437622, "rewards/rejected": 0.08729889243841171, "step": 830 }, { "dpo_losses": 0.6760834455490112, "epoch": 0.22, "grad_norm": 14.477696596160204, "learning_rate": 4.785162619238575e-06, "logits/chosen": -2.5569658279418945, "logits/rejected": -2.4249210357666016, "logps/chosen": -241.2959747314453, "logps/rejected": -205.107666015625, "loss": 0.7118, "positive_losses": 0.9432792663574219, "rewards/accuracies": 0.625, "rewards/chosen": 0.14538225531578064, "rewards/margins": 0.03826458007097244, "rewards/margins_max": 0.16456389427185059, "rewards/margins_min": -0.07554511725902557, "rewards/margins_std": 0.10714679956436157, "rewards/rejected": 0.1071176752448082, "step": 840 }, { "dpo_losses": 0.6707090139389038, "epoch": 0.22, "grad_norm": 2.595491123655335, "learning_rate": 4.775802303459288e-06, "logits/chosen": -2.271686315536499, "logits/rejected": -2.2093093395233154, "logps/chosen": -210.01296997070312, "logps/rejected": -225.61801147460938, "loss": 0.6842, "positive_losses": 0.40496253967285156, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.11106234788894653, "rewards/margins": 0.048898521810770035, "rewards/margins_max": 0.16908064484596252, "rewards/margins_min": -0.05571809411048889, "rewards/margins_std": 0.10174580663442612, "rewards/rejected": 0.062163837254047394, "step": 850 }, { "dpo_losses": 0.6654592752456665, "epoch": 0.23, "grad_norm": 2.4975375445868866, "learning_rate": 4.766251958842589e-06, "logits/chosen": -2.1895527839660645, "logits/rejected": -2.341291666030884, "logps/chosen": -129.0610809326172, "logps/rejected": -192.2652587890625, "loss": 0.6894, "positive_losses": 0.5391143560409546, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.123971126973629, "rewards/margins": 0.060204893350601196, "rewards/margins_max": 0.20584020018577576, "rewards/margins_min": -0.06416536867618561, "rewards/margins_std": 0.12060312926769257, "rewards/rejected": 0.0637662336230278, "step": 860 }, { "dpo_losses": 0.656313419342041, "epoch": 0.23, "grad_norm": 17.205691722122616, "learning_rate": 4.7565123828395066e-06, "logits/chosen": -2.392832040786743, "logits/rejected": -2.131091594696045, "logps/chosen": -280.2188415527344, "logps/rejected": -223.3489990234375, "loss": 0.7131, "positive_losses": 0.6413799524307251, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.15697380900382996, "rewards/margins": 0.08226948976516724, "rewards/margins_max": 0.25493258237838745, "rewards/margins_min": -0.0931466594338417, "rewards/margins_std": 0.15836051106452942, "rewards/rejected": 0.07470433413982391, "step": 870 }, { "dpo_losses": 0.6767135858535767, "epoch": 0.23, "grad_norm": 10.008043106364383, "learning_rate": 4.746584388701831e-06, "logits/chosen": -2.3035080432891846, "logits/rejected": -2.296736717224121, "logps/chosen": -249.7926025390625, "logps/rejected": -303.38104248046875, "loss": 0.7071, "positive_losses": 0.4137893617153168, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.13377834856510162, "rewards/margins": 0.03808373212814331, "rewards/margins_max": 0.15827365219593048, "rewards/margins_min": -0.1503000259399414, "rewards/margins_std": 0.13915404677391052, "rewards/rejected": 0.09569462388753891, "step": 880 }, { "dpo_losses": 0.6645438075065613, "epoch": 0.23, "grad_norm": 2.7000225984545994, "learning_rate": 4.736468805414218e-06, "logits/chosen": -2.268113613128662, "logits/rejected": -2.100329875946045, "logps/chosen": -284.2929992675781, "logps/rejected": -245.8045196533203, "loss": 0.7009, "positive_losses": 0.26146888732910156, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1436806619167328, "rewards/margins": 0.06280405819416046, "rewards/margins_max": 0.20422759652137756, "rewards/margins_min": -0.0720982551574707, "rewards/margins_std": 0.12289199978113174, "rewards/rejected": 0.08087659627199173, "step": 890 }, { "dpo_losses": 0.6946014165878296, "epoch": 0.24, "grad_norm": 2.116500236195804, "learning_rate": 4.7261664776249595e-06, "logits/chosen": -2.346662998199463, "logits/rejected": -2.295959949493408, "logps/chosen": -200.938720703125, "logps/rejected": -254.0438995361328, "loss": 0.688, "positive_losses": 0.2874298095703125, "rewards/accuracies": 0.5, "rewards/chosen": 0.13553032279014587, "rewards/margins": 0.000744523829780519, "rewards/margins_max": 0.12407539784908295, "rewards/margins_min": -0.13311627507209778, "rewards/margins_std": 0.11493979394435883, "rewards/rejected": 0.13478581607341766, "step": 900 }, { "epoch": 0.24, "eval_dpo_losses": 0.6641383171081543, "eval_logits/chosen": -2.256582498550415, "eval_logits/rejected": -2.1457910537719727, "eval_logps/chosen": -259.82415771484375, "eval_logps/rejected": -253.1665496826172, "eval_loss": 0.6810576319694519, "eval_positive_losses": 0.13925646245479584, "eval_rewards/accuracies": 0.6726190447807312, "eval_rewards/chosen": 0.15949979424476624, "eval_rewards/margins": 0.06296224892139435, "eval_rewards/margins_max": 0.26004937291145325, "eval_rewards/margins_min": -0.10504385083913803, "eval_rewards/margins_std": 0.1222677081823349, "eval_rewards/rejected": 0.0965375229716301, "eval_runtime": 387.0608, "eval_samples_per_second": 5.167, "eval_steps_per_second": 0.163, "step": 900 }, { "dpo_losses": 0.6637071967124939, "epoch": 0.24, "grad_norm": 17.340772548480214, "learning_rate": 4.715678265575463e-06, "logits/chosen": -2.3154962062835693, "logits/rejected": -2.196873188018799, "logps/chosen": -243.58984375, "logps/rejected": -231.0081329345703, "loss": 0.6834, "positive_losses": 0.0, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1703028529882431, "rewards/margins": 0.06537614017724991, "rewards/margins_max": 0.23286600410938263, "rewards/margins_min": -0.08773735165596008, "rewards/margins_std": 0.14007434248924255, "rewards/rejected": 0.10492672026157379, "step": 910 }, { "dpo_losses": 0.6645714044570923, "epoch": 0.24, "grad_norm": 2.184503519318641, "learning_rate": 4.705005045028415e-06, "logits/chosen": -2.344594955444336, "logits/rejected": -2.1285653114318848, "logps/chosen": -269.5585632324219, "logps/rejected": -224.4385223388672, "loss": 0.6977, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.16074909269809723, "rewards/margins": 0.06163681671023369, "rewards/margins_max": 0.164463609457016, "rewards/margins_min": -0.07055871188640594, "rewards/margins_std": 0.10524662584066391, "rewards/rejected": 0.09911226481199265, "step": 920 }, { "dpo_losses": 0.6457914113998413, "epoch": 0.24, "grad_norm": 2.592807092825355, "learning_rate": 4.694147707194659e-06, "logits/chosen": -2.219250440597534, "logits/rejected": -1.9482553005218506, "logps/chosen": -304.4398498535156, "logps/rejected": -281.15496826171875, "loss": 0.6884, "positive_losses": 0.0, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.16592565178871155, "rewards/margins": 0.10221840441226959, "rewards/margins_max": 0.23177412152290344, "rewards/margins_min": -0.05666229873895645, "rewards/margins_std": 0.13097290694713593, "rewards/rejected": 0.06370726227760315, "step": 930 }, { "dpo_losses": 0.662357747554779, "epoch": 0.25, "grad_norm": 22.740797256544926, "learning_rate": 4.683107158658782e-06, "logits/chosen": -2.277939558029175, "logits/rejected": -2.2073493003845215, "logps/chosen": -280.28985595703125, "logps/rejected": -266.9888610839844, "loss": 0.6893, "positive_losses": 0.3534319996833801, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15780262649059296, "rewards/margins": 0.06693222373723984, "rewards/margins_max": 0.2032538205385208, "rewards/margins_min": -0.06463401019573212, "rewards/margins_std": 0.12193477153778076, "rewards/rejected": 0.09087041020393372, "step": 940 }, { "dpo_losses": 0.6652836799621582, "epoch": 0.25, "grad_norm": 2.329754457592499, "learning_rate": 4.671884321303407e-06, "logits/chosen": -2.3597216606140137, "logits/rejected": -2.3833189010620117, "logps/chosen": -247.15576171875, "logps/rejected": -297.06500244140625, "loss": 0.6907, "positive_losses": 0.5967577695846558, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1415858119726181, "rewards/margins": 0.06350874155759811, "rewards/margins_max": 0.24594661593437195, "rewards/margins_min": -0.12001018226146698, "rewards/margins_std": 0.16381381452083588, "rewards/rejected": 0.0780770480632782, "step": 950 }, { "dpo_losses": 0.6882269978523254, "epoch": 0.25, "grad_norm": 2.5325440289811403, "learning_rate": 4.660480132232224e-06, "logits/chosen": -2.3346667289733887, "logits/rejected": -2.132582664489746, "logps/chosen": -348.41534423828125, "logps/rejected": -286.09393310546875, "loss": 0.6741, "positive_losses": 0.038466643542051315, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.1551962047815323, "rewards/margins": 0.013931773602962494, "rewards/margins_max": 0.15722933411598206, "rewards/margins_min": -0.11236351728439331, "rewards/margins_std": 0.12054040282964706, "rewards/rejected": 0.14126445353031158, "step": 960 }, { "dpo_losses": 0.6646562814712524, "epoch": 0.25, "grad_norm": 7.090452797531582, "learning_rate": 4.6488955436917414e-06, "logits/chosen": -2.2712275981903076, "logits/rejected": -2.1604058742523193, "logps/chosen": -311.5713806152344, "logps/rejected": -271.99212646484375, "loss": 0.6851, "positive_losses": 0.1743614226579666, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.17403236031532288, "rewards/margins": 0.06217183545231819, "rewards/margins_max": 0.21675007045269012, "rewards/margins_min": -0.06949727982282639, "rewards/margins_std": 0.12961900234222412, "rewards/rejected": 0.11186055094003677, "step": 970 }, { "dpo_losses": 0.649638831615448, "epoch": 0.26, "grad_norm": 2.199664489131489, "learning_rate": 4.6371315229917644e-06, "logits/chosen": -2.4195556640625, "logits/rejected": -2.314803123474121, "logps/chosen": -280.59136962890625, "logps/rejected": -234.5943145751953, "loss": 0.6643, "positive_losses": 0.17922744154930115, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.1437879502773285, "rewards/margins": 0.09260457754135132, "rewards/margins_max": 0.21585097908973694, "rewards/margins_min": -0.011479884386062622, "rewards/margins_std": 0.10187037289142609, "rewards/rejected": 0.051183342933654785, "step": 980 }, { "dpo_losses": 0.6588505506515503, "epoch": 0.26, "grad_norm": 11.86322980109028, "learning_rate": 4.625189052424638e-06, "logits/chosen": -2.3081791400909424, "logits/rejected": -1.9642865657806396, "logps/chosen": -274.06903076171875, "logps/rejected": -240.0980224609375, "loss": 0.6781, "positive_losses": 0.24162845313549042, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1613207310438156, "rewards/margins": 0.07505827397108078, "rewards/margins_max": 0.24122989177703857, "rewards/margins_min": -0.10169516503810883, "rewards/margins_std": 0.14784538745880127, "rewards/rejected": 0.08626247942447662, "step": 990 }, { "dpo_losses": 0.6666866540908813, "epoch": 0.26, "grad_norm": 8.722757710448876, "learning_rate": 4.613069129183218e-06, "logits/chosen": -2.31239652633667, "logits/rejected": -2.1944682598114014, "logps/chosen": -219.4092254638672, "logps/rejected": -202.46139526367188, "loss": 0.7002, "positive_losses": 0.05709972232580185, "rewards/accuracies": 0.625, "rewards/chosen": 0.12596020102500916, "rewards/margins": 0.05940049886703491, "rewards/margins_max": 0.22839348018169403, "rewards/margins_min": -0.08162297308444977, "rewards/margins_std": 0.14441663026809692, "rewards/rejected": 0.06655970215797424, "step": 1000 }, { "epoch": 0.26, "eval_dpo_losses": 0.663709819316864, "eval_logits/chosen": -2.2487740516662598, "eval_logits/rejected": -2.1384687423706055, "eval_logps/chosen": -259.4406433105469, "eval_logps/rejected": -252.89996337890625, "eval_loss": 0.6804019212722778, "eval_positive_losses": 0.10123474150896072, "eval_rewards/accuracies": 0.6805555820465088, "eval_rewards/chosen": 0.1633351594209671, "eval_rewards/margins": 0.06413190066814423, "eval_rewards/margins_max": 0.2711329460144043, "eval_rewards/margins_min": -0.10919316112995148, "eval_rewards/margins_std": 0.12785594165325165, "eval_rewards/rejected": 0.09920325875282288, "eval_runtime": 387.7456, "eval_samples_per_second": 5.158, "eval_steps_per_second": 0.162, "step": 1000 }, { "dpo_losses": 0.6627888083457947, "epoch": 0.26, "grad_norm": 9.78393361872739, "learning_rate": 4.600772765277607e-06, "logits/chosen": -2.4124324321746826, "logits/rejected": -2.2471697330474854, "logps/chosen": -258.18798828125, "logps/rejected": -264.5705261230469, "loss": 0.6902, "positive_losses": 0.12382545322179794, "rewards/accuracies": 0.75, "rewards/chosen": 0.13427311182022095, "rewards/margins": 0.06638023257255554, "rewards/margins_max": 0.21689562499523163, "rewards/margins_min": -0.04900385066866875, "rewards/margins_std": 0.12105952203273773, "rewards/rejected": 0.067892886698246, "step": 1010 }, { "dpo_losses": 0.6693023443222046, "epoch": 0.27, "grad_norm": 10.980952330600205, "learning_rate": 4.588300987450652e-06, "logits/chosen": -2.4499268531799316, "logits/rejected": -2.360106945037842, "logps/chosen": -243.27615356445312, "logps/rejected": -258.1858825683594, "loss": 0.6683, "positive_losses": 0.06820545345544815, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.16206596791744232, "rewards/margins": 0.052915751934051514, "rewards/margins_max": 0.20177678763866425, "rewards/margins_min": -0.08179645240306854, "rewards/margins_std": 0.12339916080236435, "rewards/rejected": 0.10915021598339081, "step": 1020 }, { "dpo_losses": 0.6665711998939514, "epoch": 0.27, "grad_norm": 14.246429531448616, "learning_rate": 4.5756548370922136e-06, "logits/chosen": -2.2398667335510254, "logits/rejected": -2.224827527999878, "logps/chosen": -268.6308898925781, "logps/rejected": -284.24700927734375, "loss": 0.6738, "positive_losses": 0.015036058612167835, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1669318974018097, "rewards/margins": 0.05966324359178543, "rewards/margins_max": 0.22333300113677979, "rewards/margins_min": -0.09002655744552612, "rewards/margins_std": 0.14175747334957123, "rewards/rejected": 0.10726865381002426, "step": 1030 }, { "dpo_losses": 0.663563072681427, "epoch": 0.27, "grad_norm": 9.50999727748507, "learning_rate": 4.562835370152206e-06, "logits/chosen": -2.260150909423828, "logits/rejected": -2.0543112754821777, "logps/chosen": -259.80364990234375, "logps/rejected": -250.2171173095703, "loss": 0.6765, "positive_losses": 0.23957176506519318, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.1509713977575302, "rewards/margins": 0.06638696044683456, "rewards/margins_max": 0.25140640139579773, "rewards/margins_min": -0.07849059998989105, "rewards/margins_std": 0.14595940709114075, "rewards/rejected": 0.08458445221185684, "step": 1040 }, { "dpo_losses": 0.6645854115486145, "epoch": 0.27, "grad_norm": 17.231767389709244, "learning_rate": 4.54984365705243e-06, "logits/chosen": -2.3229148387908936, "logits/rejected": -2.123594284057617, "logps/chosen": -289.7521057128906, "logps/rejected": -280.2361755371094, "loss": 0.6815, "positive_losses": 0.3443160951137543, "rewards/accuracies": 0.625, "rewards/chosen": 0.1624918282032013, "rewards/margins": 0.06275049597024918, "rewards/margins_max": 0.24426603317260742, "rewards/margins_min": -0.048405326902866364, "rewards/margins_std": 0.1337272822856903, "rewards/rejected": 0.09974134713411331, "step": 1050 }, { "dpo_losses": 0.6497806310653687, "epoch": 0.28, "grad_norm": 2.4614874331292795, "learning_rate": 4.536680782597191e-06, "logits/chosen": -2.3885538578033447, "logits/rejected": -2.3042359352111816, "logps/chosen": -325.8906555175781, "logps/rejected": -320.53704833984375, "loss": 0.6564, "positive_losses": 0.0, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1824796497821808, "rewards/margins": 0.09465797990560532, "rewards/margins_max": 0.24030208587646484, "rewards/margins_min": -0.038389913737773895, "rewards/margins_std": 0.12726078927516937, "rewards/rejected": 0.08782166242599487, "step": 1060 }, { "dpo_losses": 0.6591795086860657, "epoch": 0.28, "grad_norm": 6.143888348935831, "learning_rate": 4.523347845882718e-06, "logits/chosen": -2.255825996398926, "logits/rejected": -2.2352421283721924, "logps/chosen": -231.74862670898438, "logps/rejected": -243.02685546875, "loss": 0.6884, "positive_losses": 0.015968704596161842, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15174275636672974, "rewards/margins": 0.07359109818935394, "rewards/margins_max": 0.22740980982780457, "rewards/margins_min": -0.09200847893953323, "rewards/margins_std": 0.13872823119163513, "rewards/rejected": 0.0781516581773758, "step": 1070 }, { "dpo_losses": 0.664664626121521, "epoch": 0.28, "grad_norm": 2.225197249680979, "learning_rate": 4.50984596020539e-06, "logits/chosen": -2.4682116508483887, "logits/rejected": -2.2945234775543213, "logps/chosen": -291.2568664550781, "logps/rejected": -302.60516357421875, "loss": 0.6911, "positive_losses": 0.6719337701797485, "rewards/accuracies": 0.625, "rewards/chosen": 0.1476764678955078, "rewards/margins": 0.06609688699245453, "rewards/margins_max": 0.24901673197746277, "rewards/margins_min": -0.1592492312192917, "rewards/margins_std": 0.17742705345153809, "rewards/rejected": 0.08157958835363388, "step": 1080 }, { "dpo_losses": 0.6614287495613098, "epoch": 0.29, "grad_norm": 2.6354651968085, "learning_rate": 4.4961762529687745e-06, "logits/chosen": -2.3438379764556885, "logits/rejected": -2.176189422607422, "logps/chosen": -227.27798461914062, "logps/rejected": -200.7275848388672, "loss": 0.67, "positive_losses": 0.07575416564941406, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.162602037191391, "rewards/margins": 0.07273516803979874, "rewards/margins_max": 0.2560585141181946, "rewards/margins_min": -0.07066643983125687, "rewards/margins_std": 0.14869292080402374, "rewards/rejected": 0.08986687660217285, "step": 1090 }, { "dpo_losses": 0.6672337055206299, "epoch": 0.29, "grad_norm": 19.85746865235589, "learning_rate": 4.482339865589492e-06, "logits/chosen": -2.1834332942962646, "logits/rejected": -2.203415870666504, "logps/chosen": -261.6937561035156, "logps/rejected": -255.4010467529297, "loss": 0.7049, "positive_losses": 0.8625373840332031, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.13648399710655212, "rewards/margins": 0.06020275875926018, "rewards/margins_max": 0.22401699423789978, "rewards/margins_min": -0.10347610712051392, "rewards/margins_std": 0.14890322089195251, "rewards/rejected": 0.07628123462200165, "step": 1100 }, { "epoch": 0.29, "eval_dpo_losses": 0.6610326170921326, "eval_logits/chosen": -2.250674247741699, "eval_logits/rejected": -2.1421589851379395, "eval_logps/chosen": -259.6789245605469, "eval_logps/rejected": -253.78704833984375, "eval_loss": 0.686665415763855, "eval_positive_losses": 0.17053718864917755, "eval_rewards/accuracies": 0.6726190447807312, "eval_rewards/chosen": 0.16095228493213654, "eval_rewards/margins": 0.07061993330717087, "eval_rewards/margins_max": 0.2987801730632782, "eval_rewards/margins_min": -0.11839765310287476, "eval_rewards/margins_std": 0.1405448615550995, "eval_rewards/rejected": 0.09033234417438507, "eval_runtime": 387.2671, "eval_samples_per_second": 5.164, "eval_steps_per_second": 0.163, "step": 1100 }, { "dpo_losses": 0.6476628184318542, "epoch": 0.29, "grad_norm": 2.7261254252674347, "learning_rate": 4.468337953401909e-06, "logits/chosen": -2.3605847358703613, "logits/rejected": -2.2457211017608643, "logps/chosen": -243.8014373779297, "logps/rejected": -249.6242218017578, "loss": 0.6762, "positive_losses": 0.3656042218208313, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.16719859838485718, "rewards/margins": 0.0984012633562088, "rewards/margins_max": 0.23489364981651306, "rewards/margins_min": -0.061882637441158295, "rewards/margins_std": 0.13354650139808655, "rewards/rejected": 0.06879732012748718, "step": 1110 }, { "dpo_losses": 0.6547914743423462, "epoch": 0.29, "grad_norm": 2.717604998935058, "learning_rate": 4.45417168556166e-06, "logits/chosen": -2.3577358722686768, "logits/rejected": -2.196436643600464, "logps/chosen": -240.9995574951172, "logps/rejected": -189.7039794921875, "loss": 0.6723, "positive_losses": 0.07942962646484375, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.18919794261455536, "rewards/margins": 0.08417442440986633, "rewards/margins_max": 0.27299022674560547, "rewards/margins_min": -0.05165041238069534, "rewards/margins_std": 0.13926398754119873, "rewards/rejected": 0.10502351820468903, "step": 1120 }, { "dpo_losses": 0.6596941947937012, "epoch": 0.3, "grad_norm": 2.5434575468089564, "learning_rate": 4.439842244948036e-06, "logits/chosen": -2.414846897125244, "logits/rejected": -2.3808791637420654, "logps/chosen": -264.6458435058594, "logps/rejected": -250.7647247314453, "loss": 0.6808, "positive_losses": 0.28480416536331177, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.16445598006248474, "rewards/margins": 0.07282321155071259, "rewards/margins_max": 0.21273484826087952, "rewards/margins_min": -0.06943021714687347, "rewards/margins_std": 0.1274661421775818, "rewards/rejected": 0.09163276851177216, "step": 1130 }, { "dpo_losses": 0.6460368633270264, "epoch": 0.3, "grad_norm": 14.857809439576664, "learning_rate": 4.425350828065204e-06, "logits/chosen": -2.2652177810668945, "logits/rejected": -2.147026777267456, "logps/chosen": -215.55078125, "logps/rejected": -209.05740356445312, "loss": 0.6979, "positive_losses": 0.7609390020370483, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.16093167662620544, "rewards/margins": 0.10260282456874847, "rewards/margins_max": 0.30866894125938416, "rewards/margins_min": -0.04153290390968323, "rewards/margins_std": 0.15257690846920013, "rewards/rejected": 0.05832885578274727, "step": 1140 }, { "dpo_losses": 0.6390611529350281, "epoch": 0.3, "grad_norm": 2.7542828241535484, "learning_rate": 4.410698644942303e-06, "logits/chosen": -2.447237014770508, "logits/rejected": -2.332554340362549, "logps/chosen": -270.4691467285156, "logps/rejected": -241.2642059326172, "loss": 0.6908, "positive_losses": 0.1799442321062088, "rewards/accuracies": 0.75, "rewards/chosen": 0.17177441716194153, "rewards/margins": 0.11705859750509262, "rewards/margins_max": 0.3035069406032562, "rewards/margins_min": -0.03952892869710922, "rewards/margins_std": 0.15131911635398865, "rewards/rejected": 0.05471581220626831, "step": 1150 }, { "dpo_losses": 0.664864182472229, "epoch": 0.3, "grad_norm": 12.011495805857846, "learning_rate": 4.395886919032406e-06, "logits/chosen": -2.231477975845337, "logits/rejected": -2.1597814559936523, "logps/chosen": -167.5512237548828, "logps/rejected": -183.29306030273438, "loss": 0.6667, "positive_losses": 0.23996391892433167, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1413363814353943, "rewards/margins": 0.06302407383918762, "rewards/margins_max": 0.20785093307495117, "rewards/margins_min": -0.08397276699542999, "rewards/margins_std": 0.13264100253582, "rewards/rejected": 0.07831232249736786, "step": 1160 }, { "dpo_losses": 0.6543959379196167, "epoch": 0.31, "grad_norm": 2.523036298952617, "learning_rate": 4.380916887110366e-06, "logits/chosen": -2.245565176010132, "logits/rejected": -2.249809741973877, "logps/chosen": -222.7660369873047, "logps/rejected": -244.51724243164062, "loss": 0.6729, "positive_losses": 0.07111816108226776, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1590961366891861, "rewards/margins": 0.08477087318897247, "rewards/margins_max": 0.2326078861951828, "rewards/margins_min": -0.06254646182060242, "rewards/margins_std": 0.13829925656318665, "rewards/rejected": 0.07432525604963303, "step": 1170 }, { "dpo_losses": 0.6608009934425354, "epoch": 0.31, "grad_norm": 8.77742371737048, "learning_rate": 4.365789799169539e-06, "logits/chosen": -2.2703909873962402, "logits/rejected": -2.092261791229248, "logps/chosen": -231.94284057617188, "logps/rejected": -227.1644287109375, "loss": 0.6988, "positive_losses": 0.2732559144496918, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1458846926689148, "rewards/margins": 0.070404052734375, "rewards/margins_max": 0.2065153419971466, "rewards/margins_min": -0.05929439514875412, "rewards/margins_std": 0.11762721836566925, "rewards/rejected": 0.07548064738512039, "step": 1180 }, { "dpo_losses": 0.678370475769043, "epoch": 0.31, "grad_norm": 2.4239255038733423, "learning_rate": 4.350506918317416e-06, "logits/chosen": -2.402472734451294, "logits/rejected": -2.3434395790100098, "logps/chosen": -229.9500274658203, "logps/rejected": -205.379638671875, "loss": 0.7284, "positive_losses": 0.4407051205635071, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.11117472499608994, "rewards/margins": 0.032065004110336304, "rewards/margins_max": 0.11911328881978989, "rewards/margins_min": -0.02809775434434414, "rewards/margins_std": 0.06709595024585724, "rewards/rejected": 0.07910971343517303, "step": 1190 }, { "dpo_losses": 0.6790968179702759, "epoch": 0.31, "grad_norm": 2.34384867142558, "learning_rate": 4.335069520670149e-06, "logits/chosen": -2.1453909873962402, "logits/rejected": -2.128192901611328, "logps/chosen": -193.49462890625, "logps/rejected": -205.179443359375, "loss": 0.6695, "positive_losses": 0.05309867858886719, "rewards/accuracies": 0.625, "rewards/chosen": 0.14820554852485657, "rewards/margins": 0.03212646394968033, "rewards/margins_max": 0.15427207946777344, "rewards/margins_min": -0.08756106346845627, "rewards/margins_std": 0.10824175179004669, "rewards/rejected": 0.11607907712459564, "step": 1200 }, { "epoch": 0.31, "eval_dpo_losses": 0.6609339118003845, "eval_logits/chosen": -2.2481796741485596, "eval_logits/rejected": -2.140394926071167, "eval_logps/chosen": -259.26153564453125, "eval_logps/rejected": -253.34384155273438, "eval_loss": 0.6800269484519958, "eval_positive_losses": 0.09799966216087341, "eval_rewards/accuracies": 0.6785714030265808, "eval_rewards/chosen": 0.1651260107755661, "eval_rewards/margins": 0.07036175578832626, "eval_rewards/margins_max": 0.2943589985370636, "eval_rewards/margins_min": -0.10625045746564865, "eval_rewards/margins_std": 0.1347654163837433, "eval_rewards/rejected": 0.09476425498723984, "eval_runtime": 387.5158, "eval_samples_per_second": 5.161, "eval_steps_per_second": 0.163, "step": 1200 }, { "dpo_losses": 0.6740719079971313, "epoch": 0.32, "grad_norm": 8.081111746871818, "learning_rate": 4.319478895246e-06, "logits/chosen": -2.432292938232422, "logits/rejected": -2.2795419692993164, "logps/chosen": -273.97955322265625, "logps/rejected": -243.25146484375, "loss": 0.6749, "positive_losses": 0.3476318418979645, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.15843451023101807, "rewards/margins": 0.04401896521449089, "rewards/margins_max": 0.22885560989379883, "rewards/margins_min": -0.11295966804027557, "rewards/margins_std": 0.14958584308624268, "rewards/rejected": 0.11441554874181747, "step": 1210 }, { "dpo_losses": 0.657934308052063, "epoch": 0.32, "grad_norm": 9.68965016775813, "learning_rate": 4.303736343857704e-06, "logits/chosen": -2.3647541999816895, "logits/rejected": -2.1458818912506104, "logps/chosen": -263.87945556640625, "logps/rejected": -254.51113891601562, "loss": 0.7111, "positive_losses": 0.4801223874092102, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1476506143808365, "rewards/margins": 0.07711522281169891, "rewards/margins_max": 0.2491142302751541, "rewards/margins_min": -0.07323641330003738, "rewards/margins_std": 0.13967391848564148, "rewards/rejected": 0.07053537666797638, "step": 1220 }, { "dpo_losses": 0.6634490489959717, "epoch": 0.32, "grad_norm": 6.628913155565054, "learning_rate": 4.287843181003772e-06, "logits/chosen": -2.264528751373291, "logits/rejected": -2.212719202041626, "logps/chosen": -250.247802734375, "logps/rejected": -260.8634033203125, "loss": 0.6766, "positive_losses": 0.00949172955006361, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1584172546863556, "rewards/margins": 0.06412375718355179, "rewards/margins_max": 0.1835576444864273, "rewards/margins_min": -0.0519113652408123, "rewards/margins_std": 0.10713416337966919, "rewards/rejected": 0.0942934900522232, "step": 1230 }, { "dpo_losses": 0.6569926142692566, "epoch": 0.32, "grad_norm": 2.058800239992248, "learning_rate": 4.27180073375873e-06, "logits/chosen": -2.0869901180267334, "logits/rejected": -2.1009068489074707, "logps/chosen": -216.7119140625, "logps/rejected": -204.34628295898438, "loss": 0.6685, "positive_losses": 0.10867004096508026, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1584130972623825, "rewards/margins": 0.0783301517367363, "rewards/margins_max": 0.21663177013397217, "rewards/margins_min": -0.03814803808927536, "rewards/margins_std": 0.11310204118490219, "rewards/rejected": 0.0800829604268074, "step": 1240 }, { "dpo_losses": 0.6640318632125854, "epoch": 0.33, "grad_norm": 2.4706448393569667, "learning_rate": 4.255610341662304e-06, "logits/chosen": -2.218728542327881, "logits/rejected": -2.159151554107666, "logps/chosen": -245.96957397460938, "logps/rejected": -243.5109405517578, "loss": 0.6925, "positive_losses": 1.367981195449829, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.13404865562915802, "rewards/margins": 0.06507338583469391, "rewards/margins_max": 0.20603609085083008, "rewards/margins_min": -0.061202965676784515, "rewards/margins_std": 0.12070804834365845, "rewards/rejected": 0.06897525489330292, "step": 1250 }, { "dpo_losses": 0.6569708585739136, "epoch": 0.33, "grad_norm": 13.037256699776199, "learning_rate": 4.2392733566075764e-06, "logits/chosen": -2.214778423309326, "logits/rejected": -1.9950603246688843, "logps/chosen": -205.86575317382812, "logps/rejected": -197.5122833251953, "loss": 0.713, "positive_losses": 0.5701795816421509, "rewards/accuracies": 0.75, "rewards/chosen": 0.12042121589183807, "rewards/margins": 0.07956317067146301, "rewards/margins_max": 0.20144668221473694, "rewards/margins_min": -0.030759120360016823, "rewards/margins_std": 0.10372885316610336, "rewards/rejected": 0.04085804522037506, "step": 1260 }, { "dpo_losses": 0.6675158143043518, "epoch": 0.33, "grad_norm": 14.043294719791152, "learning_rate": 4.2227911427280975e-06, "logits/chosen": -2.283082962036133, "logits/rejected": -2.309471607208252, "logps/chosen": -259.65362548828125, "logps/rejected": -279.0824279785156, "loss": 0.7093, "positive_losses": 0.09183263778686523, "rewards/accuracies": 0.625, "rewards/chosen": 0.14191466569900513, "rewards/margins": 0.056323401629924774, "rewards/margins_max": 0.20242862403392792, "rewards/margins_min": -0.055060409009456635, "rewards/margins_std": 0.11829537153244019, "rewards/rejected": 0.08559127897024155, "step": 1270 }, { "dpo_losses": 0.6720221638679504, "epoch": 0.33, "grad_norm": 2.412034253637924, "learning_rate": 4.206165076283983e-06, "logits/chosen": -2.189272165298462, "logits/rejected": -2.277871608734131, "logps/chosen": -197.14346313476562, "logps/rejected": -231.35336303710938, "loss": 0.6751, "positive_losses": 0.01348266564309597, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.14842286705970764, "rewards/margins": 0.04617368429899216, "rewards/margins_max": 0.18874576687812805, "rewards/margins_min": -0.08526624739170074, "rewards/margins_std": 0.12275786697864532, "rewards/rejected": 0.10224918276071548, "step": 1280 }, { "dpo_losses": 0.6644900441169739, "epoch": 0.34, "grad_norm": 9.724370674226977, "learning_rate": 4.189396545546995e-06, "logits/chosen": -2.2466421127319336, "logits/rejected": -2.141737461090088, "logps/chosen": -212.48080444335938, "logps/rejected": -233.0284881591797, "loss": 0.6896, "positive_losses": 0.6066713929176331, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.12284813821315765, "rewards/margins": 0.06304603815078735, "rewards/margins_max": 0.20019695162773132, "rewards/margins_min": -0.05893053859472275, "rewards/margins_std": 0.11821810901165009, "rewards/rejected": 0.0598021075129509, "step": 1290 }, { "dpo_losses": 0.6792903542518616, "epoch": 0.34, "grad_norm": 21.680244221455467, "learning_rate": 4.172486950684627e-06, "logits/chosen": -2.3272290229797363, "logits/rejected": -2.164533853530884, "logps/chosen": -186.06436157226562, "logps/rejected": -220.2150115966797, "loss": 0.7072, "positive_losses": 1.3802852630615234, "rewards/accuracies": 0.625, "rewards/chosen": 0.11806647479534149, "rewards/margins": 0.03358602523803711, "rewards/margins_max": 0.1967659741640091, "rewards/margins_min": -0.12723815441131592, "rewards/margins_std": 0.1433938890695572, "rewards/rejected": 0.08448044955730438, "step": 1300 }, { "epoch": 0.34, "eval_dpo_losses": 0.6559665203094482, "eval_logits/chosen": -2.230766534805298, "eval_logits/rejected": -2.1223793029785156, "eval_logps/chosen": -261.4336242675781, "eval_logps/rejected": -256.6372375488281, "eval_loss": 0.6937506794929504, "eval_positive_losses": 0.24659039080142975, "eval_rewards/accuracies": 0.6984127163887024, "eval_rewards/chosen": 0.14340533316135406, "eval_rewards/margins": 0.08157505095005035, "eval_rewards/margins_max": 0.3216693103313446, "eval_rewards/margins_min": -0.11065223813056946, "eval_rewards/margins_std": 0.14679890871047974, "eval_rewards/rejected": 0.06183028593659401, "eval_runtime": 387.9894, "eval_samples_per_second": 5.155, "eval_steps_per_second": 0.162, "step": 1300 }, { "dpo_losses": 0.6723489761352539, "epoch": 0.34, "grad_norm": 22.694833794435052, "learning_rate": 4.155437703643182e-06, "logits/chosen": -2.2930028438568115, "logits/rejected": -2.190011978149414, "logps/chosen": -268.75738525390625, "logps/rejected": -257.1799011230469, "loss": 0.679, "positive_losses": 0.3650621473789215, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1240684762597084, "rewards/margins": 0.046475570648908615, "rewards/margins_max": 0.18167918920516968, "rewards/margins_min": -0.0799814760684967, "rewards/margins_std": 0.11639727652072906, "rewards/rejected": 0.07759290188550949, "step": 1310 }, { "dpo_losses": 0.6589926481246948, "epoch": 0.35, "grad_norm": 2.730673764544938, "learning_rate": 4.138250228029882e-06, "logits/chosen": -2.274446964263916, "logits/rejected": -2.2229180335998535, "logps/chosen": -243.6131591796875, "logps/rejected": -243.33944702148438, "loss": 0.6756, "positive_losses": 0.2796749174594879, "rewards/accuracies": 0.75, "rewards/chosen": 0.14495351910591125, "rewards/margins": 0.07322598248720169, "rewards/margins_max": 0.20433540642261505, "rewards/margins_min": -0.056198012083768845, "rewards/margins_std": 0.11583195626735687, "rewards/rejected": 0.07172755897045135, "step": 1320 }, { "dpo_losses": 0.6617782115936279, "epoch": 0.35, "grad_norm": 2.3057199008486338, "learning_rate": 4.120925958993994e-06, "logits/chosen": -2.420835018157959, "logits/rejected": -2.289306640625, "logps/chosen": -244.6587371826172, "logps/rejected": -250.67672729492188, "loss": 0.7032, "positive_losses": 0.33096009492874146, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15069489181041718, "rewards/margins": 0.06795565038919449, "rewards/margins_max": 0.2162495106458664, "rewards/margins_min": -0.0716993436217308, "rewards/margins_std": 0.12497158348560333, "rewards/rejected": 0.08273923397064209, "step": 1330 }, { "dpo_losses": 0.6517545580863953, "epoch": 0.35, "grad_norm": 9.793503955508594, "learning_rate": 4.103466343106999e-06, "logits/chosen": -2.160895824432373, "logits/rejected": -2.0393996238708496, "logps/chosen": -301.50714111328125, "logps/rejected": -256.36956787109375, "loss": 0.6737, "positive_losses": 0.05755653232336044, "rewards/accuracies": 0.75, "rewards/chosen": 0.1462530791759491, "rewards/margins": 0.08966291695833206, "rewards/margins_max": 0.240148663520813, "rewards/margins_min": -0.05157693102955818, "rewards/margins_std": 0.13575977087020874, "rewards/rejected": 0.056590158492326736, "step": 1340 }, { "dpo_losses": 0.6564763784408569, "epoch": 0.35, "grad_norm": 12.153394035094296, "learning_rate": 4.085872838241797e-06, "logits/chosen": -2.232252359390259, "logits/rejected": -2.1454672813415527, "logps/chosen": -271.2743835449219, "logps/rejected": -237.84988403320312, "loss": 0.6783, "positive_losses": 0.3564796447753906, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.15949495136737823, "rewards/margins": 0.07960118353366852, "rewards/margins_max": 0.2421347200870514, "rewards/margins_min": -0.06724490970373154, "rewards/margins_std": 0.13933594524860382, "rewards/rejected": 0.07989376783370972, "step": 1350 }, { "dpo_losses": 0.6681951284408569, "epoch": 0.36, "grad_norm": 2.7539685830711593, "learning_rate": 4.06814691345098e-06, "logits/chosen": -2.3236818313598633, "logits/rejected": -2.3378896713256836, "logps/chosen": -193.13453674316406, "logps/rejected": -186.54464721679688, "loss": 0.6992, "positive_losses": 0.34249573945999146, "rewards/accuracies": 0.625, "rewards/chosen": 0.15839411318302155, "rewards/margins": 0.05530413240194321, "rewards/margins_max": 0.21971502900123596, "rewards/margins_min": -0.09977023303508759, "rewards/margins_std": 0.14228175580501556, "rewards/rejected": 0.10308997333049774, "step": 1360 }, { "dpo_losses": 0.6513376832008362, "epoch": 0.36, "grad_norm": 2.2982709107233528, "learning_rate": 4.050290048844171e-06, "logits/chosen": -2.433276414871216, "logits/rejected": -2.227302074432373, "logps/chosen": -251.3153533935547, "logps/rejected": -234.02688598632812, "loss": 0.6622, "positive_losses": 0.06426239013671875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.16175240278244019, "rewards/margins": 0.09088386595249176, "rewards/margins_max": 0.2658461928367615, "rewards/margins_min": -0.05776233598589897, "rewards/margins_std": 0.1440446525812149, "rewards/rejected": 0.07086853682994843, "step": 1370 }, { "dpo_losses": 0.6311546564102173, "epoch": 0.36, "grad_norm": 6.110619615769177, "learning_rate": 4.032303735464422e-06, "logits/chosen": -2.233980655670166, "logits/rejected": -2.1485273838043213, "logps/chosen": -251.48648071289062, "logps/rejected": -264.5433044433594, "loss": 0.6618, "positive_losses": 0.11382284015417099, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.17915266752243042, "rewards/margins": 0.13619256019592285, "rewards/margins_max": 0.3460856080055237, "rewards/margins_min": -0.05254535749554634, "rewards/margins_std": 0.17662225663661957, "rewards/rejected": 0.04296010732650757, "step": 1380 }, { "dpo_losses": 0.6420263051986694, "epoch": 0.36, "grad_norm": 10.566199743823093, "learning_rate": 4.014189475163727e-06, "logits/chosen": -2.4261655807495117, "logits/rejected": -2.1915037631988525, "logps/chosen": -243.33676147460938, "logps/rejected": -207.6195831298828, "loss": 0.697, "positive_losses": 0.2987831234931946, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.13450759649276733, "rewards/margins": 0.10947465896606445, "rewards/margins_max": 0.25402912497520447, "rewards/margins_min": -0.021803459152579308, "rewards/margins_std": 0.12463472783565521, "rewards/rejected": 0.025032931938767433, "step": 1390 }, { "dpo_losses": 0.6637845635414124, "epoch": 0.37, "grad_norm": 6.095866456879401, "learning_rate": 3.995948780477605e-06, "logits/chosen": -2.2400317192077637, "logits/rejected": -2.017141819000244, "logps/chosen": -251.0870361328125, "logps/rejected": -210.7526397705078, "loss": 0.711, "positive_losses": 1.0670568943023682, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.13232043385505676, "rewards/margins": 0.06537578999996185, "rewards/margins_max": 0.21851429343223572, "rewards/margins_min": -0.0990295335650444, "rewards/margins_std": 0.13912370800971985, "rewards/rejected": 0.06694462895393372, "step": 1400 }, { "epoch": 0.37, "eval_dpo_losses": 0.6585971117019653, "eval_logits/chosen": -2.2128803730010986, "eval_logits/rejected": -2.10371470451355, "eval_logps/chosen": -259.81817626953125, "eval_logps/rejected": -254.48196411132812, "eval_loss": 0.6854003667831421, "eval_positive_losses": 0.13627225160598755, "eval_rewards/accuracies": 0.6904761791229248, "eval_rewards/chosen": 0.15955978631973267, "eval_rewards/margins": 0.07617643475532532, "eval_rewards/margins_max": 0.3199954628944397, "eval_rewards/margins_min": -0.11948077380657196, "eval_rewards/margins_std": 0.1463845819234848, "eval_rewards/rejected": 0.08338334411382675, "eval_runtime": 387.2474, "eval_samples_per_second": 5.165, "eval_steps_per_second": 0.163, "step": 1400 }, { "dpo_losses": 0.6694769859313965, "epoch": 0.37, "grad_norm": 2.57917107831817, "learning_rate": 3.977583174498816e-06, "logits/chosen": -2.2356762886047363, "logits/rejected": -2.1949288845062256, "logps/chosen": -210.61404418945312, "logps/rejected": -231.76010131835938, "loss": 0.6759, "positive_losses": 0.059041596949100494, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.14265164732933044, "rewards/margins": 0.052664853632450104, "rewards/margins_max": 0.1973382979631424, "rewards/margins_min": -0.08420625329017639, "rewards/margins_std": 0.12456656992435455, "rewards/rejected": 0.08998680859804153, "step": 1410 }, { "dpo_losses": 0.6566817164421082, "epoch": 0.37, "grad_norm": 10.674919353243011, "learning_rate": 3.959094190750172e-06, "logits/chosen": -2.2339484691619873, "logits/rejected": -1.9982763528823853, "logps/chosen": -220.60336303710938, "logps/rejected": -180.81517028808594, "loss": 0.6751, "positive_losses": 0.06749725341796875, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.14831414818763733, "rewards/margins": 0.0779358446598053, "rewards/margins_max": 0.23278650641441345, "rewards/margins_min": -0.04039284959435463, "rewards/margins_std": 0.11831691116094589, "rewards/rejected": 0.07037831097841263, "step": 1420 }, { "dpo_losses": 0.6597561240196228, "epoch": 0.37, "grad_norm": 12.138240545685436, "learning_rate": 3.9404833730564975e-06, "logits/chosen": -2.3191170692443848, "logits/rejected": -2.2896511554718018, "logps/chosen": -190.17869567871094, "logps/rejected": -224.36618041992188, "loss": 0.6936, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.15538226068019867, "rewards/margins": 0.07535049319267273, "rewards/margins_max": 0.2349153459072113, "rewards/margins_min": -0.0715322345495224, "rewards/margins_std": 0.1350911557674408, "rewards/rejected": 0.08003176748752594, "step": 1430 }, { "dpo_losses": 0.6527693867683411, "epoch": 0.38, "grad_norm": 6.6027325377704065, "learning_rate": 3.921752275415712e-06, "logits/chosen": -2.302039623260498, "logits/rejected": -2.1102588176727295, "logps/chosen": -286.7153625488281, "logps/rejected": -314.29864501953125, "loss": 0.6748, "positive_losses": 0.10076923668384552, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.16684961318969727, "rewards/margins": 0.08727109432220459, "rewards/margins_max": 0.25249677896499634, "rewards/margins_min": -0.05451441928744316, "rewards/margins_std": 0.13182449340820312, "rewards/rejected": 0.07957851886749268, "step": 1440 }, { "dpo_losses": 0.6378606557846069, "epoch": 0.38, "grad_norm": 2.8079244057459576, "learning_rate": 3.902902461869079e-06, "logits/chosen": -2.381296396255493, "logits/rejected": -2.246230363845825, "logps/chosen": -296.21807861328125, "logps/rejected": -259.53314208984375, "loss": 0.7121, "positive_losses": 0.5923923254013062, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.14990513026714325, "rewards/margins": 0.11951925605535507, "rewards/margins_max": 0.2996339499950409, "rewards/margins_min": -0.030710767954587936, "rewards/margins_std": 0.1475505530834198, "rewards/rejected": 0.030385861173272133, "step": 1450 }, { "dpo_losses": 0.6424838900566101, "epoch": 0.38, "grad_norm": 2.2958896106207924, "learning_rate": 3.883935506370605e-06, "logits/chosen": -2.2176663875579834, "logits/rejected": -2.043203592300415, "logps/chosen": -268.8602294921875, "logps/rejected": -233.1497802734375, "loss": 0.6607, "positive_losses": 0.032093048095703125, "rewards/accuracies": 0.75, "rewards/chosen": 0.1543729156255722, "rewards/margins": 0.1102604866027832, "rewards/margins_max": 0.2886444926261902, "rewards/margins_min": -0.05657818913459778, "rewards/margins_std": 0.1558775007724762, "rewards/rejected": 0.0441124327480793, "step": 1460 }, { "dpo_losses": 0.6538541913032532, "epoch": 0.38, "grad_norm": 19.229417771022085, "learning_rate": 3.864852992655617e-06, "logits/chosen": -2.4004173278808594, "logits/rejected": -2.26729154586792, "logps/chosen": -237.31088256835938, "logps/rejected": -232.22421264648438, "loss": 0.6815, "positive_losses": 0.35954970121383667, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.1483347862958908, "rewards/margins": 0.08406121283769608, "rewards/margins_max": 0.22949786484241486, "rewards/margins_min": -0.014123158529400826, "rewards/margins_std": 0.10779520124197006, "rewards/rejected": 0.06427358835935593, "step": 1470 }, { "dpo_losses": 0.6409989595413208, "epoch": 0.39, "grad_norm": 14.658604804831146, "learning_rate": 3.845656514108516e-06, "logits/chosen": -2.3393166065216064, "logits/rejected": -2.290844202041626, "logps/chosen": -214.43820190429688, "logps/rejected": -257.7483825683594, "loss": 0.6608, "positive_losses": 0.5421035885810852, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.13432268798351288, "rewards/margins": 0.11384852230548859, "rewards/margins_max": 0.2886359691619873, "rewards/margins_min": -0.015134045854210854, "rewards/margins_std": 0.13748089969158173, "rewards/rejected": 0.020474178716540337, "step": 1480 }, { "dpo_losses": 0.6370208859443665, "epoch": 0.39, "grad_norm": 12.378857560451737, "learning_rate": 3.826347673629738e-06, "logits/chosen": -2.227959632873535, "logits/rejected": -2.067913770675659, "logps/chosen": -242.8267059326172, "logps/rejected": -244.0117645263672, "loss": 0.7478, "positive_losses": 0.2992299199104309, "rewards/accuracies": 0.75, "rewards/chosen": 0.1250033676624298, "rewards/margins": 0.12451610714197159, "rewards/margins_max": 0.36837467551231384, "rewards/margins_min": -0.03669679909944534, "rewards/margins_std": 0.17886798083782196, "rewards/rejected": 0.0004872471035923809, "step": 1490 }, { "dpo_losses": 0.6506232023239136, "epoch": 0.39, "grad_norm": 2.095459425267379, "learning_rate": 3.8069280835019062e-06, "logits/chosen": -2.3164889812469482, "logits/rejected": -2.159947156906128, "logps/chosen": -218.90591430664062, "logps/rejected": -196.047119140625, "loss": 0.6953, "positive_losses": 0.47218209505081177, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.16028736531734467, "rewards/margins": 0.0963880717754364, "rewards/margins_max": 0.28781890869140625, "rewards/margins_min": -0.07303518056869507, "rewards/margins_std": 0.1648676097393036, "rewards/rejected": 0.06389929354190826, "step": 1500 }, { "epoch": 0.39, "eval_dpo_losses": 0.6537300944328308, "eval_logits/chosen": -2.229872465133667, "eval_logits/rejected": -2.1212284564971924, "eval_logps/chosen": -260.6038818359375, "eval_logps/rejected": -256.4078369140625, "eval_loss": 0.7055920958518982, "eval_positive_losses": 0.38133206963539124, "eval_rewards/accuracies": 0.692460298538208, "eval_rewards/chosen": 0.15170258283615112, "eval_rewards/margins": 0.08757810294628143, "eval_rewards/margins_max": 0.35322973132133484, "eval_rewards/margins_min": -0.11925424635410309, "eval_rewards/margins_std": 0.16037528216838837, "eval_rewards/rejected": 0.06412447243928909, "eval_runtime": 387.6196, "eval_samples_per_second": 5.16, "eval_steps_per_second": 0.163, "step": 1500 }, { "dpo_losses": 0.6619753241539001, "epoch": 0.4, "grad_norm": 2.528291173879744, "learning_rate": 3.7873993652552077e-06, "logits/chosen": -2.4148833751678467, "logits/rejected": -2.160015344619751, "logps/chosen": -279.8267517089844, "logps/rejected": -249.9979248046875, "loss": 0.7063, "positive_losses": 0.16603966057300568, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1563212126493454, "rewards/margins": 0.07162059098482132, "rewards/margins_max": 0.2594786584377289, "rewards/margins_min": -0.12171760946512222, "rewards/margins_std": 0.17017123103141785, "rewards/rejected": 0.08470062911510468, "step": 1510 }, { "dpo_losses": 0.6537727117538452, "epoch": 0.4, "grad_norm": 6.113484983126728, "learning_rate": 3.7677631495319953e-06, "logits/chosen": -2.3113043308258057, "logits/rejected": -2.016031265258789, "logps/chosen": -256.72967529296875, "logps/rejected": -207.41012573242188, "loss": 0.7137, "positive_losses": 0.8440361022949219, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1625308096408844, "rewards/margins": 0.08701851963996887, "rewards/margins_max": 0.2605285346508026, "rewards/margins_min": -0.06920422613620758, "rewards/margins_std": 0.14851629734039307, "rewards/rejected": 0.07551229000091553, "step": 1520 }, { "dpo_losses": 0.6691297292709351, "epoch": 0.4, "grad_norm": 2.19772325963483, "learning_rate": 3.748021075950633e-06, "logits/chosen": -2.2782042026519775, "logits/rejected": -2.1705007553100586, "logps/chosen": -253.73611450195312, "logps/rejected": -222.47140502929688, "loss": 0.6674, "positive_losses": 0.3496719300746918, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.16310924291610718, "rewards/margins": 0.052569448947906494, "rewards/margins_max": 0.17783726751804352, "rewards/margins_min": -0.0822906643152237, "rewards/margins_std": 0.11528579145669937, "rewards/rejected": 0.11053977906703949, "step": 1530 }, { "dpo_losses": 0.6753464341163635, "epoch": 0.4, "grad_norm": 2.3986068814884667, "learning_rate": 3.7281747929685824e-06, "logits/chosen": -2.3216331005096436, "logits/rejected": -2.1723740100860596, "logps/chosen": -300.3550109863281, "logps/rejected": -276.2474670410156, "loss": 0.6994, "positive_losses": 0.02894439734518528, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.17750614881515503, "rewards/margins": 0.04148155823349953, "rewards/margins_max": 0.1745886504650116, "rewards/margins_min": -0.08291731029748917, "rewards/margins_std": 0.11950113624334335, "rewards/rejected": 0.1360245943069458, "step": 1540 }, { "dpo_losses": 0.6556013226509094, "epoch": 0.41, "grad_norm": 22.240326225028785, "learning_rate": 3.7082259577447604e-06, "logits/chosen": -2.221151828765869, "logits/rejected": -2.2277069091796875, "logps/chosen": -274.70928955078125, "logps/rejected": -269.41094970703125, "loss": 0.6788, "positive_losses": 0.34530869126319885, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1900779902935028, "rewards/margins": 0.08317329734563828, "rewards/margins_max": 0.25127747654914856, "rewards/margins_min": -0.044112998992204666, "rewards/margins_std": 0.13341939449310303, "rewards/rejected": 0.10690467059612274, "step": 1550 }, { "dpo_losses": 0.6573207378387451, "epoch": 0.41, "grad_norm": 10.47684213832614, "learning_rate": 3.6881762360011688e-06, "logits/chosen": -2.191758394241333, "logits/rejected": -2.0990564823150635, "logps/chosen": -236.7447052001953, "logps/rejected": -232.09390258789062, "loss": 0.6809, "positive_losses": 0.03408203274011612, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.16853897273540497, "rewards/margins": 0.08065663278102875, "rewards/margins_max": 0.2781978249549866, "rewards/margins_min": -0.08269564807415009, "rewards/margins_std": 0.1651892066001892, "rewards/rejected": 0.08788233995437622, "step": 1560 }, { "dpo_losses": 0.6495047807693481, "epoch": 0.41, "grad_norm": 2.4187953934857935, "learning_rate": 3.668027301883802e-06, "logits/chosen": -2.323568344116211, "logits/rejected": -2.1562869548797607, "logps/chosen": -320.42852783203125, "logps/rejected": -251.86837768554688, "loss": 0.6759, "positive_losses": 0.21250610053539276, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.19269652664661407, "rewards/margins": 0.09307434409856796, "rewards/margins_max": 0.24820411205291748, "rewards/margins_min": -0.028641974553465843, "rewards/margins_std": 0.1217184066772461, "rewards/rejected": 0.09962216764688492, "step": 1570 }, { "dpo_losses": 0.6656565070152283, "epoch": 0.41, "grad_norm": 2.221026475057259, "learning_rate": 3.64778083782286e-06, "logits/chosen": -2.318321466445923, "logits/rejected": -2.337523937225342, "logps/chosen": -244.8104705810547, "logps/rejected": -239.976318359375, "loss": 0.6718, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.15679460763931274, "rewards/margins": 0.05957472324371338, "rewards/margins_max": 0.18666084110736847, "rewards/margins_min": -0.07175968587398529, "rewards/margins_std": 0.11472679674625397, "rewards/rejected": 0.09721989929676056, "step": 1580 }, { "dpo_losses": 0.664269208908081, "epoch": 0.42, "grad_norm": 19.774621223458364, "learning_rate": 3.627438534392268e-06, "logits/chosen": -2.3398044109344482, "logits/rejected": -2.3922080993652344, "logps/chosen": -285.7788391113281, "logps/rejected": -278.5696105957031, "loss": 0.7088, "positive_losses": 0.29485243558883667, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15554800629615784, "rewards/margins": 0.06451208889484406, "rewards/margins_max": 0.24171264469623566, "rewards/margins_min": -0.0985206887125969, "rewards/margins_std": 0.1506597101688385, "rewards/rejected": 0.09103591740131378, "step": 1590 }, { "dpo_losses": 0.6609795689582825, "epoch": 0.42, "grad_norm": 2.123273933853367, "learning_rate": 3.607002090168506e-06, "logits/chosen": -2.197411298751831, "logits/rejected": -2.1134510040283203, "logps/chosen": -231.3763885498047, "logps/rejected": -206.7642822265625, "loss": 0.7031, "positive_losses": 0.7530155181884766, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.13760001957416534, "rewards/margins": 0.07041819393634796, "rewards/margins_max": 0.20062389969825745, "rewards/margins_min": -0.06463692337274551, "rewards/margins_std": 0.11847081035375595, "rewards/rejected": 0.06718180328607559, "step": 1600 }, { "epoch": 0.42, "eval_dpo_losses": 0.6578581929206848, "eval_logits/chosen": -2.21309232711792, "eval_logits/rejected": -2.1056692600250244, "eval_logps/chosen": -259.9510192871094, "eval_logps/rejected": -254.76768493652344, "eval_loss": 0.6876051425933838, "eval_positive_losses": 0.18803879618644714, "eval_rewards/accuracies": 0.682539701461792, "eval_rewards/chosen": 0.15823133289813995, "eval_rewards/margins": 0.07770541310310364, "eval_rewards/margins_max": 0.31716811656951904, "eval_rewards/margins_min": -0.1122400313615799, "eval_rewards/margins_std": 0.1461840569972992, "eval_rewards/rejected": 0.08052590489387512, "eval_runtime": 387.9175, "eval_samples_per_second": 5.156, "eval_steps_per_second": 0.162, "step": 1600 }, { "dpo_losses": 0.6362642645835876, "epoch": 0.42, "grad_norm": 1.9886530620319751, "learning_rate": 3.586473211588787e-06, "logits/chosen": -2.3695764541625977, "logits/rejected": -2.2071399688720703, "logps/chosen": -239.9380340576172, "logps/rejected": -233.781005859375, "loss": 0.6734, "positive_losses": 0.0, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.18069012463092804, "rewards/margins": 0.12568405270576477, "rewards/margins_max": 0.3078451156616211, "rewards/margins_min": -0.01436980627477169, "rewards/margins_std": 0.14986862242221832, "rewards/rejected": 0.05500606447458267, "step": 1610 }, { "dpo_losses": 0.6587170362472534, "epoch": 0.42, "grad_norm": 2.516640215433662, "learning_rate": 3.5658536128085623e-06, "logits/chosen": -2.2276418209075928, "logits/rejected": -2.024360179901123, "logps/chosen": -214.1223907470703, "logps/rejected": -204.4665985107422, "loss": 0.6749, "positive_losses": 0.04087943956255913, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.1617608368396759, "rewards/margins": 0.07760385423898697, "rewards/margins_max": 0.28894880414009094, "rewards/margins_min": -0.07395356893539429, "rewards/margins_std": 0.16520926356315613, "rewards/rejected": 0.08415697515010834, "step": 1620 }, { "dpo_losses": 0.6373803615570068, "epoch": 0.43, "grad_norm": 5.828616860423684, "learning_rate": 3.545145015558399e-06, "logits/chosen": -2.485524892807007, "logits/rejected": -2.200207471847534, "logps/chosen": -271.67840576171875, "logps/rejected": -251.66830444335938, "loss": 0.6787, "positive_losses": 0.08732955157756805, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1597469300031662, "rewards/margins": 0.12127149105072021, "rewards/margins_max": 0.3024597465991974, "rewards/margins_min": -0.036076612770557404, "rewards/margins_std": 0.15198369324207306, "rewards/rejected": 0.03847543150186539, "step": 1630 }, { "dpo_losses": 0.655339241027832, "epoch": 0.43, "grad_norm": 2.4907997596531097, "learning_rate": 3.5243491490002056e-06, "logits/chosen": -2.3760619163513184, "logits/rejected": -2.159356117248535, "logps/chosen": -258.41058349609375, "logps/rejected": -222.9991912841797, "loss": 0.6698, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1644005924463272, "rewards/margins": 0.08215393126010895, "rewards/margins_max": 0.24081382155418396, "rewards/margins_min": -0.0419418066740036, "rewards/margins_std": 0.1269606053829193, "rewards/rejected": 0.08224667608737946, "step": 1640 }, { "dpo_losses": 0.6539028882980347, "epoch": 0.43, "grad_norm": 2.7336971641705827, "learning_rate": 3.503467749582857e-06, "logits/chosen": -2.294990062713623, "logits/rejected": -2.0262155532836914, "logps/chosen": -356.5543518066406, "logps/rejected": -271.4464111328125, "loss": 0.6856, "positive_losses": 0.15073546767234802, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.16452430188655853, "rewards/margins": 0.0868738666176796, "rewards/margins_max": 0.24911335110664368, "rewards/margins_min": -0.0847848579287529, "rewards/margins_std": 0.15053974092006683, "rewards/rejected": 0.07765044271945953, "step": 1650 }, { "dpo_losses": 0.6361522674560547, "epoch": 0.43, "grad_norm": 13.190196841768866, "learning_rate": 3.4825025608971947e-06, "logits/chosen": -2.2501466274261475, "logits/rejected": -2.236222743988037, "logps/chosen": -221.6660614013672, "logps/rejected": -280.033935546875, "loss": 0.6546, "positive_losses": 0.0, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.17570729553699493, "rewards/margins": 0.12653592228889465, "rewards/margins_max": 0.3315494656562805, "rewards/margins_min": -0.04361531138420105, "rewards/margins_std": 0.17155036330223083, "rewards/rejected": 0.04917137324810028, "step": 1660 }, { "dpo_losses": 0.6731664538383484, "epoch": 0.44, "grad_norm": 2.2952154483103753, "learning_rate": 3.4614553335304407e-06, "logits/chosen": -2.4266159534454346, "logits/rejected": -2.3008029460906982, "logps/chosen": -236.4641876220703, "logps/rejected": -222.02224731445312, "loss": 0.6957, "positive_losses": 0.7988578677177429, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.134002685546875, "rewards/margins": 0.04398564621806145, "rewards/margins_max": 0.1630837619304657, "rewards/margins_min": -0.08370241522789001, "rewards/margins_std": 0.11168815940618515, "rewards/rejected": 0.09001703560352325, "step": 1670 }, { "dpo_losses": 0.6721925735473633, "epoch": 0.44, "grad_norm": 18.676358477144223, "learning_rate": 3.4403278249200222e-06, "logits/chosen": -2.2974770069122314, "logits/rejected": -2.1660468578338623, "logps/chosen": -286.3014221191406, "logps/rejected": -239.70083618164062, "loss": 0.7216, "positive_losses": 0.9073417782783508, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.12841558456420898, "rewards/margins": 0.048945244401693344, "rewards/margins_max": 0.2239234447479248, "rewards/margins_min": -0.1302177757024765, "rewards/margins_std": 0.15237346291542053, "rewards/rejected": 0.07947034388780594, "step": 1680 }, { "dpo_losses": 0.6647047400474548, "epoch": 0.44, "grad_norm": 2.2420235284800873, "learning_rate": 3.4191217992068293e-06, "logits/chosen": -2.19769549369812, "logits/rejected": -2.1188807487487793, "logps/chosen": -213.783935546875, "logps/rejected": -242.63302612304688, "loss": 0.6811, "positive_losses": 0.3910190463066101, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.13422329723834991, "rewards/margins": 0.06350506842136383, "rewards/margins_max": 0.2252282351255417, "rewards/margins_min": -0.09613536298274994, "rewards/margins_std": 0.1466381847858429, "rewards/rejected": 0.07071822136640549, "step": 1690 }, { "dpo_losses": 0.681535005569458, "epoch": 0.44, "grad_norm": 2.2484500632819833, "learning_rate": 3.3978390270879056e-06, "logits/chosen": -2.2266082763671875, "logits/rejected": -2.22533917427063, "logps/chosen": -188.8800506591797, "logps/rejected": -245.88037109375, "loss": 0.6709, "positive_losses": 0.16224956512451172, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.12615133821964264, "rewards/margins": 0.02792949602007866, "rewards/margins_max": 0.1845114827156067, "rewards/margins_min": -0.08707179129123688, "rewards/margins_std": 0.12447158247232437, "rewards/rejected": 0.09822182357311249, "step": 1700 }, { "epoch": 0.44, "eval_dpo_losses": 0.6572098731994629, "eval_logits/chosen": -2.2161550521850586, "eval_logits/rejected": -2.1065783500671387, "eval_logps/chosen": -259.6551818847656, "eval_logps/rejected": -254.58656311035156, "eval_loss": 0.6839549541473389, "eval_positive_losses": 0.11838395893573761, "eval_rewards/accuracies": 0.6944444179534912, "eval_rewards/chosen": 0.16118983924388885, "eval_rewards/margins": 0.07885253429412842, "eval_rewards/margins_max": 0.31419461965560913, "eval_rewards/margins_min": -0.10979805141687393, "eval_rewards/margins_std": 0.1436033397912979, "eval_rewards/rejected": 0.08233729749917984, "eval_runtime": 390.7008, "eval_samples_per_second": 5.119, "eval_steps_per_second": 0.161, "step": 1700 }, { "dpo_losses": 0.6609781980514526, "epoch": 0.45, "grad_norm": 23.4304611826244, "learning_rate": 3.3764812856685995e-06, "logits/chosen": -2.3463547229766846, "logits/rejected": -2.3056178092956543, "logps/chosen": -227.5908966064453, "logps/rejected": -222.55093383789062, "loss": 0.6742, "positive_losses": 0.07393493503332138, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.17003372311592102, "rewards/margins": 0.07245069742202759, "rewards/margins_max": 0.23304156959056854, "rewards/margins_min": -0.06605934351682663, "rewards/margins_std": 0.1329105794429779, "rewards/rejected": 0.09758303314447403, "step": 1710 }, { "dpo_losses": 0.6497678756713867, "epoch": 0.45, "grad_norm": 2.475072755611206, "learning_rate": 3.3550503583141726e-06, "logits/chosen": -2.3156237602233887, "logits/rejected": -2.210573673248291, "logps/chosen": -229.82821655273438, "logps/rejected": -265.3238220214844, "loss": 0.6843, "positive_losses": 0.000385284423828125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.17360106110572815, "rewards/margins": 0.09454257786273956, "rewards/margins_max": 0.3015786111354828, "rewards/margins_min": -0.039262037724256516, "rewards/margins_std": 0.15579503774642944, "rewards/rejected": 0.07905846834182739, "step": 1720 }, { "dpo_losses": 0.6504519581794739, "epoch": 0.45, "grad_norm": 2.408728375973761, "learning_rate": 3.3335480345008907e-06, "logits/chosen": -2.333217144012451, "logits/rejected": -2.183610200881958, "logps/chosen": -258.3575744628906, "logps/rejected": -285.5169372558594, "loss": 0.7472, "positive_losses": 0.5097106695175171, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.17115990817546844, "rewards/margins": 0.09332086145877838, "rewards/margins_max": 0.2633286714553833, "rewards/margins_min": -0.06344510614871979, "rewards/margins_std": 0.14474806189537048, "rewards/rejected": 0.07783903926610947, "step": 1730 }, { "dpo_losses": 0.6426266431808472, "epoch": 0.46, "grad_norm": 13.855777656374636, "learning_rate": 3.3119761096666055e-06, "logits/chosen": -2.252967119216919, "logits/rejected": -2.0604300498962402, "logps/chosen": -286.0965270996094, "logps/rejected": -224.70693969726562, "loss": 0.6781, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.19771504402160645, "rewards/margins": 0.10869884490966797, "rewards/margins_max": 0.2351720631122589, "rewards/margins_min": -0.012729622423648834, "rewards/margins_std": 0.11419552564620972, "rewards/rejected": 0.08901620656251907, "step": 1740 }, { "dpo_losses": 0.6526317596435547, "epoch": 0.46, "grad_norm": 2.24432186394182, "learning_rate": 3.290336385060832e-06, "logits/chosen": -2.327709913253784, "logits/rejected": -2.277622699737549, "logps/chosen": -242.6006622314453, "logps/rejected": -258.0498962402344, "loss": 0.6727, "positive_losses": 0.209736630320549, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.16469606757164001, "rewards/margins": 0.08771221339702606, "rewards/margins_max": 0.2150670737028122, "rewards/margins_min": -0.030314141884446144, "rewards/margins_std": 0.11024855077266693, "rewards/rejected": 0.07698385417461395, "step": 1750 }, { "dpo_losses": 0.6428591012954712, "epoch": 0.46, "grad_norm": 2.265143612988673, "learning_rate": 3.268630667594348e-06, "logits/chosen": -2.1345322132110596, "logits/rejected": -1.9872815608978271, "logps/chosen": -239.9320068359375, "logps/rejected": -232.318603515625, "loss": 0.6884, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.17053855955600739, "rewards/margins": 0.11081834882497787, "rewards/margins_max": 0.33126169443130493, "rewards/margins_min": -0.04989037662744522, "rewards/margins_std": 0.17013967037200928, "rewards/rejected": 0.05972020700573921, "step": 1760 }, { "dpo_losses": 0.6694139242172241, "epoch": 0.46, "grad_norm": 11.331969983245273, "learning_rate": 3.2468607696883147e-06, "logits/chosen": -2.3787145614624023, "logits/rejected": -2.344045639038086, "logps/chosen": -292.93231201171875, "logps/rejected": -294.61865234375, "loss": 0.6798, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.17151954770088196, "rewards/margins": 0.05364648252725601, "rewards/margins_max": 0.2067890614271164, "rewards/margins_min": -0.08926218748092651, "rewards/margins_std": 0.13876472413539886, "rewards/rejected": 0.11787305027246475, "step": 1770 }, { "dpo_losses": 0.6557838320732117, "epoch": 0.47, "grad_norm": 8.309806215159057, "learning_rate": 3.225028509122944e-06, "logits/chosen": -2.3050711154937744, "logits/rejected": -2.195180654525757, "logps/chosen": -273.392578125, "logps/rejected": -248.3610076904297, "loss": 0.6715, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.15661221742630005, "rewards/margins": 0.08068086951971054, "rewards/margins_max": 0.2283354252576828, "rewards/margins_min": -0.05743633583188057, "rewards/margins_std": 0.12780140340328217, "rewards/rejected": 0.07593134790658951, "step": 1780 }, { "dpo_losses": 0.6662956476211548, "epoch": 0.47, "grad_norm": 17.793083235773285, "learning_rate": 3.2031357088857083e-06, "logits/chosen": -2.2426159381866455, "logits/rejected": -2.1609861850738525, "logps/chosen": -254.09133911132812, "logps/rejected": -214.1595458984375, "loss": 0.6973, "positive_losses": 0.2452464997768402, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.14974237978458405, "rewards/margins": 0.05911676958203316, "rewards/margins_max": 0.2234044075012207, "rewards/margins_min": -0.06955133378505707, "rewards/margins_std": 0.132836252450943, "rewards/rejected": 0.09062561392784119, "step": 1790 }, { "dpo_losses": 0.6537083387374878, "epoch": 0.47, "grad_norm": 2.607666644205318, "learning_rate": 3.181184197019127e-06, "logits/chosen": -2.5173232555389404, "logits/rejected": -2.1202733516693115, "logps/chosen": -322.5250549316406, "logps/rejected": -288.473876953125, "loss": 0.743, "positive_losses": 0.42681044340133667, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.137128084897995, "rewards/margins": 0.0858563631772995, "rewards/margins_max": 0.22768381237983704, "rewards/margins_min": -0.024022815749049187, "rewards/margins_std": 0.11366715282201767, "rewards/rejected": 0.051271725445985794, "step": 1800 }, { "epoch": 0.47, "eval_dpo_losses": 0.6559661626815796, "eval_logits/chosen": -2.2307217121124268, "eval_logits/rejected": -2.122577428817749, "eval_logps/chosen": -259.6641845703125, "eval_logps/rejected": -254.88400268554688, "eval_loss": 0.6829984188079834, "eval_positive_losses": 0.11839014291763306, "eval_rewards/accuracies": 0.6984127163887024, "eval_rewards/chosen": 0.16109980642795563, "eval_rewards/margins": 0.08173713833093643, "eval_rewards/margins_max": 0.3243264853954315, "eval_rewards/margins_min": -0.11155477911233902, "eval_rewards/margins_std": 0.14689843356609344, "eval_rewards/rejected": 0.0793626680970192, "eval_runtime": 387.0549, "eval_samples_per_second": 5.167, "eval_steps_per_second": 0.163, "step": 1800 }, { "dpo_losses": 0.6600796580314636, "epoch": 0.47, "grad_norm": 16.677213663151477, "learning_rate": 3.159175806468126e-06, "logits/chosen": -2.4094319343566895, "logits/rejected": -2.207319498062134, "logps/chosen": -308.32244873046875, "logps/rejected": -284.3188171386719, "loss": 0.6972, "positive_losses": 0.3622092306613922, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.16681766510009766, "rewards/margins": 0.07201824337244034, "rewards/margins_max": 0.23398518562316895, "rewards/margins_min": -0.05544561892747879, "rewards/margins_std": 0.1270267218351364, "rewards/rejected": 0.09479942172765732, "step": 1810 }, { "dpo_losses": 0.6437441110610962, "epoch": 0.48, "grad_norm": 2.656285902790451, "learning_rate": 3.1371123749269804e-06, "logits/chosen": -2.243051052093506, "logits/rejected": -2.098654270172119, "logps/chosen": -262.8346252441406, "logps/rejected": -278.67388916015625, "loss": 0.6836, "positive_losses": 0.048346709460020065, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.18207576870918274, "rewards/margins": 0.10702575743198395, "rewards/margins_max": 0.25208669900894165, "rewards/margins_min": -0.019060485064983368, "rewards/margins_std": 0.12055227905511856, "rewards/rejected": 0.07505004107952118, "step": 1820 }, { "dpo_losses": 0.6380342245101929, "epoch": 0.48, "grad_norm": 2.1907564175485765, "learning_rate": 3.114995744685877e-06, "logits/chosen": -2.2460262775421143, "logits/rejected": -2.1322672367095947, "logps/chosen": -252.00595092773438, "logps/rejected": -317.4335632324219, "loss": 0.6686, "positive_losses": 0.0, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.18621371686458588, "rewards/margins": 0.12512195110321045, "rewards/margins_max": 0.3873857855796814, "rewards/margins_min": -0.0639919564127922, "rewards/margins_std": 0.20073077082633972, "rewards/rejected": 0.061091743409633636, "step": 1830 }, { "dpo_losses": 0.648234486579895, "epoch": 0.48, "grad_norm": 11.853914514059928, "learning_rate": 3.0928277624770743e-06, "logits/chosen": -2.4729294776916504, "logits/rejected": -2.2658793926239014, "logps/chosen": -219.70498657226562, "logps/rejected": -230.059814453125, "loss": 0.6899, "positive_losses": 0.3261658549308777, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1800881326198578, "rewards/margins": 0.09756244719028473, "rewards/margins_max": 0.2512146532535553, "rewards/margins_min": -0.07729899883270264, "rewards/margins_std": 0.14372311532497406, "rewards/rejected": 0.08252566307783127, "step": 1840 }, { "dpo_losses": 0.666070818901062, "epoch": 0.48, "grad_norm": 4.825609063956048, "learning_rate": 3.070610279320708e-06, "logits/chosen": -2.332650661468506, "logits/rejected": -2.098080635070801, "logps/chosen": -214.677978515625, "logps/rejected": -206.3356475830078, "loss": 0.6697, "positive_losses": 0.2964407205581665, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.14598266780376434, "rewards/margins": 0.05982706695795059, "rewards/margins_max": 0.20257607102394104, "rewards/margins_min": -0.07006332278251648, "rewards/margins_std": 0.12039034068584442, "rewards/rejected": 0.08615561574697495, "step": 1850 }, { "dpo_losses": 0.6466644406318665, "epoch": 0.49, "grad_norm": 13.016998914580407, "learning_rate": 3.0483451503702264e-06, "logits/chosen": -2.1851696968078613, "logits/rejected": -2.1019253730773926, "logps/chosen": -212.0569610595703, "logps/rejected": -236.5753936767578, "loss": 0.6818, "positive_losses": 0.1128692626953125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.17612609267234802, "rewards/margins": 0.10288868099451065, "rewards/margins_max": 0.2922040522098541, "rewards/margins_min": -0.05712326616048813, "rewards/margins_std": 0.15590299665927887, "rewards/rejected": 0.07323741912841797, "step": 1860 }, { "dpo_losses": 0.6581683158874512, "epoch": 0.49, "grad_norm": 2.3744062384196996, "learning_rate": 3.0260342347574916e-06, "logits/chosen": -2.1926872730255127, "logits/rejected": -2.1632323265075684, "logps/chosen": -243.56008911132812, "logps/rejected": -275.03338623046875, "loss": 0.6783, "positive_losses": 0.26922979950904846, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1426573097705841, "rewards/margins": 0.07918456196784973, "rewards/margins_max": 0.29523831605911255, "rewards/margins_min": -0.10316663980484009, "rewards/margins_std": 0.17539319396018982, "rewards/rejected": 0.06347275525331497, "step": 1870 }, { "dpo_losses": 0.6489076614379883, "epoch": 0.49, "grad_norm": 19.829528614622312, "learning_rate": 3.0036793954375358e-06, "logits/chosen": -2.3742499351501465, "logits/rejected": -2.176572322845459, "logps/chosen": -235.2572021484375, "logps/rejected": -233.92575073242188, "loss": 0.6678, "positive_losses": 0.046978045254945755, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.17792953550815582, "rewards/margins": 0.09410925209522247, "rewards/margins_max": 0.24029035866260529, "rewards/margins_min": -0.028395721688866615, "rewards/margins_std": 0.119581438601017, "rewards/rejected": 0.08382028341293335, "step": 1880 }, { "dpo_losses": 0.681326687335968, "epoch": 0.49, "grad_norm": 7.3090687061733455, "learning_rate": 2.981282499033009e-06, "logits/chosen": -2.405606985092163, "logits/rejected": -2.4267818927764893, "logps/chosen": -272.88177490234375, "logps/rejected": -341.9512939453125, "loss": 0.6875, "positive_losses": 0.2878372073173523, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.13740622997283936, "rewards/margins": 0.032390616834163666, "rewards/margins_max": 0.2132464349269867, "rewards/margins_min": -0.14967162907123566, "rewards/margins_std": 0.16698478162288666, "rewards/rejected": 0.10501561313867569, "step": 1890 }, { "dpo_losses": 0.6579803824424744, "epoch": 0.5, "grad_norm": 1.9783981303417906, "learning_rate": 2.9588454156783163e-06, "logits/chosen": -2.4266490936279297, "logits/rejected": -2.144906520843506, "logps/chosen": -248.9272918701172, "logps/rejected": -218.65530395507812, "loss": 0.7089, "positive_losses": 0.7414811849594116, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1529330611228943, "rewards/margins": 0.07830923050642014, "rewards/margins_max": 0.245284765958786, "rewards/margins_min": -0.09717442840337753, "rewards/margins_std": 0.1546204388141632, "rewards/rejected": 0.07462382316589355, "step": 1900 }, { "epoch": 0.5, "eval_dpo_losses": 0.6563791036605835, "eval_logits/chosen": -2.239947557449341, "eval_logits/rejected": -2.133819580078125, "eval_logps/chosen": -259.76629638671875, "eval_logps/rejected": -254.8997039794922, "eval_loss": 0.6842896938323975, "eval_positive_losses": 0.13663482666015625, "eval_rewards/accuracies": 0.6984127163887024, "eval_rewards/chosen": 0.16007843613624573, "eval_rewards/margins": 0.08087295293807983, "eval_rewards/margins_max": 0.32715752720832825, "eval_rewards/margins_min": -0.11086796969175339, "eval_rewards/margins_std": 0.1472061723470688, "eval_rewards/rejected": 0.07920549809932709, "eval_runtime": 387.2734, "eval_samples_per_second": 5.164, "eval_steps_per_second": 0.163, "step": 1900 }, { "dpo_losses": 0.6610921025276184, "epoch": 0.5, "grad_norm": 16.118584827939106, "learning_rate": 2.9363700188634597e-06, "logits/chosen": -2.3087778091430664, "logits/rejected": -2.1755776405334473, "logps/chosen": -226.97897338867188, "logps/rejected": -234.13583374023438, "loss": 0.6781, "positive_losses": 0.04137077182531357, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1706680953502655, "rewards/margins": 0.06953239440917969, "rewards/margins_max": 0.21511049568653107, "rewards/margins_min": -0.05151674151420593, "rewards/margins_std": 0.12065571546554565, "rewards/rejected": 0.10113570839166641, "step": 1910 }, { "dpo_losses": 0.6788076758384705, "epoch": 0.5, "grad_norm": 16.64449412179032, "learning_rate": 2.9138581852776053e-06, "logits/chosen": -2.3122291564941406, "logits/rejected": -2.1988909244537354, "logps/chosen": -246.87002563476562, "logps/rejected": -194.48733520507812, "loss": 0.6909, "positive_losses": 0.17411652207374573, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.15158802270889282, "rewards/margins": 0.03324580192565918, "rewards/margins_max": 0.17186763882637024, "rewards/margins_min": -0.08952119946479797, "rewards/margins_std": 0.11633072793483734, "rewards/rejected": 0.11834220588207245, "step": 1920 }, { "dpo_losses": 0.6391749978065491, "epoch": 0.51, "grad_norm": 12.807366063647953, "learning_rate": 2.8913117946523805e-06, "logits/chosen": -2.3553452491760254, "logits/rejected": -2.094841480255127, "logps/chosen": -320.3630676269531, "logps/rejected": -231.02212524414062, "loss": 0.6706, "positive_losses": 0.0036911009810864925, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.16602584719657898, "rewards/margins": 0.11495769023895264, "rewards/margins_max": 0.24866096675395966, "rewards/margins_min": 0.009748214855790138, "rewards/margins_std": 0.11054448783397675, "rewards/rejected": 0.05106815695762634, "step": 1930 }, { "dpo_losses": 0.666602373123169, "epoch": 0.51, "grad_norm": 21.71926005471224, "learning_rate": 2.8687327296049126e-06, "logits/chosen": -2.288761615753174, "logits/rejected": -2.2334160804748535, "logps/chosen": -241.0114288330078, "logps/rejected": -248.7958984375, "loss": 0.6837, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1628572791814804, "rewards/margins": 0.057786233723163605, "rewards/margins_max": 0.18463234603405, "rewards/margins_min": -0.08051592111587524, "rewards/margins_std": 0.11834029853343964, "rewards/rejected": 0.1050710529088974, "step": 1940 }, { "dpo_losses": 0.6563688516616821, "epoch": 0.51, "grad_norm": 2.5622964173369662, "learning_rate": 2.8461228754806376e-06, "logits/chosen": -2.2827160358428955, "logits/rejected": -2.2857627868652344, "logps/chosen": -277.69683837890625, "logps/rejected": -268.86944580078125, "loss": 0.6783, "positive_losses": 0.09722747653722763, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.17104503512382507, "rewards/margins": 0.0823342427611351, "rewards/margins_max": 0.2681478261947632, "rewards/margins_min": -0.0553547628223896, "rewards/margins_std": 0.14788714051246643, "rewards/rejected": 0.08871077746152878, "step": 1950 }, { "dpo_losses": 0.6291549801826477, "epoch": 0.51, "grad_norm": 2.5025850282718607, "learning_rate": 2.823484120195865e-06, "logits/chosen": -2.5488483905792236, "logits/rejected": -2.289984703063965, "logps/chosen": -275.1360168457031, "logps/rejected": -251.43606567382812, "loss": 0.6745, "positive_losses": 0.17697659134864807, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.2031518965959549, "rewards/margins": 0.1412176489830017, "rewards/margins_max": 0.330684095621109, "rewards/margins_min": -0.047308262437582016, "rewards/margins_std": 0.17111757397651672, "rewards/rejected": 0.06193426251411438, "step": 1960 }, { "dpo_losses": 0.6590481996536255, "epoch": 0.52, "grad_norm": 12.492408583487688, "learning_rate": 2.8008183540801486e-06, "logits/chosen": -2.4189035892486572, "logits/rejected": -2.27512264251709, "logps/chosen": -262.02996826171875, "logps/rejected": -251.7520294189453, "loss": 0.6919, "positive_losses": 0.32400742173194885, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1722312867641449, "rewards/margins": 0.07811994850635529, "rewards/margins_max": 0.2386968582868576, "rewards/margins_min": -0.0985770896077156, "rewards/margins_std": 0.15485592186450958, "rewards/rejected": 0.09411133825778961, "step": 1970 }, { "dpo_losses": 0.6748195886611938, "epoch": 0.52, "grad_norm": 2.4167272187462743, "learning_rate": 2.7781274697184353e-06, "logits/chosen": -2.297257423400879, "logits/rejected": -2.1935324668884277, "logps/chosen": -187.9236602783203, "logps/rejected": -204.40139770507812, "loss": 0.6814, "positive_losses": 0.01688995398581028, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.1636572927236557, "rewards/margins": 0.040947090834379196, "rewards/margins_max": 0.18551024794578552, "rewards/margins_min": -0.0641537457704544, "rewards/margins_std": 0.10928811877965927, "rewards/rejected": 0.12271019071340561, "step": 1980 }, { "dpo_losses": 0.6567361354827881, "epoch": 0.52, "grad_norm": 2.17081452969872, "learning_rate": 2.7554133617930397e-06, "logits/chosen": -2.4658613204956055, "logits/rejected": -2.224078416824341, "logps/chosen": -239.8988037109375, "logps/rejected": -237.00241088867188, "loss": 0.6815, "positive_losses": 0.06911468505859375, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.15757226943969727, "rewards/margins": 0.0781024843454361, "rewards/margins_max": 0.213197261095047, "rewards/margins_min": -0.045259878039360046, "rewards/margins_std": 0.11204835027456284, "rewards/rejected": 0.07946979254484177, "step": 1990 }, { "dpo_losses": 0.6754752397537231, "epoch": 0.52, "grad_norm": 2.0431411116120266, "learning_rate": 2.7326779269254363e-06, "logits/chosen": -2.28262996673584, "logits/rejected": -2.2627787590026855, "logps/chosen": -215.33834838867188, "logps/rejected": -218.91213989257812, "loss": 0.667, "positive_losses": 0.0, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.1724465936422348, "rewards/margins": 0.04107920452952385, "rewards/margins_max": 0.20003566145896912, "rewards/margins_min": -0.09646781533956528, "rewards/margins_std": 0.12854884564876556, "rewards/rejected": 0.13136737048625946, "step": 2000 }, { "epoch": 0.52, "eval_dpo_losses": 0.6585911512374878, "eval_logits/chosen": -2.254525899887085, "eval_logits/rejected": -2.1472699642181396, "eval_logps/chosen": -258.5389099121094, "eval_logps/rejected": -253.20980834960938, "eval_loss": 0.6773356795310974, "eval_positive_losses": 0.07739488035440445, "eval_rewards/accuracies": 0.670634925365448, "eval_rewards/chosen": 0.17235229909420013, "eval_rewards/margins": 0.0762476995587349, "eval_rewards/margins_max": 0.32138708233833313, "eval_rewards/margins_min": -0.11841771751642227, "eval_rewards/margins_std": 0.14760389924049377, "eval_rewards/rejected": 0.09610462188720703, "eval_runtime": 408.1712, "eval_samples_per_second": 4.9, "eval_steps_per_second": 0.154, "step": 2000 }, { "dpo_losses": 0.6521593928337097, "epoch": 0.53, "grad_norm": 2.519916073294699, "learning_rate": 2.7099230635178954e-06, "logits/chosen": -2.272141456604004, "logits/rejected": -2.258521556854248, "logps/chosen": -224.4058837890625, "logps/rejected": -218.7519073486328, "loss": 0.6556, "positive_losses": 0.0, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.17160876095294952, "rewards/margins": 0.08633692562580109, "rewards/margins_max": 0.19217197597026825, "rewards/margins_min": 0.0013048506807535887, "rewards/margins_std": 0.0837632268667221, "rewards/rejected": 0.08527182042598724, "step": 2010 }, { "dpo_losses": 0.6603204607963562, "epoch": 0.53, "grad_norm": 14.476808614846659, "learning_rate": 2.6871506715949608e-06, "logits/chosen": -2.5525927543640137, "logits/rejected": -2.3581275939941406, "logps/chosen": -277.409423828125, "logps/rejected": -233.7879638671875, "loss": 0.6771, "positive_losses": 0.1558856964111328, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.15855351090431213, "rewards/margins": 0.07552589476108551, "rewards/margins_max": 0.2811165750026703, "rewards/margins_min": -0.09371216595172882, "rewards/margins_std": 0.17102788388729095, "rewards/rejected": 0.08302764594554901, "step": 2020 }, { "dpo_losses": 0.6583741903305054, "epoch": 0.53, "grad_norm": 2.3994140625, "learning_rate": 2.6643626526448063e-06, "logits/chosen": -2.4063515663146973, "logits/rejected": -2.3350014686584473, "logps/chosen": -258.9729919433594, "logps/rejected": -257.6073303222656, "loss": 0.7459, "positive_losses": 0.16091489791870117, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.142665833234787, "rewards/margins": 0.0760270208120346, "rewards/margins_max": 0.23376016318798065, "rewards/margins_min": -0.06926386058330536, "rewards/margins_std": 0.13635453581809998, "rewards/rejected": 0.06663882732391357, "step": 2030 }, { "dpo_losses": 0.6505128145217896, "epoch": 0.53, "grad_norm": 12.536801295336462, "learning_rate": 2.6415609094604562e-06, "logits/chosen": -2.076948642730713, "logits/rejected": -2.029512882232666, "logps/chosen": -262.8587341308594, "logps/rejected": -209.3634796142578, "loss": 0.7157, "positive_losses": 0.8242809176445007, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.12950512766838074, "rewards/margins": 0.0949791893362999, "rewards/margins_max": 0.29250580072402954, "rewards/margins_min": -0.09627943485975266, "rewards/margins_std": 0.16977520287036896, "rewards/rejected": 0.03452594205737114, "step": 2040 }, { "dpo_losses": 0.6484048366546631, "epoch": 0.54, "grad_norm": 25.91004052863122, "learning_rate": 2.618747345980904e-06, "logits/chosen": -2.4091012477874756, "logits/rejected": -2.300563097000122, "logps/chosen": -250.53189086914062, "logps/rejected": -245.96273803710938, "loss": 0.7019, "positive_losses": 0.30422669649124146, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1503020077943802, "rewards/margins": 0.09670291841030121, "rewards/margins_max": 0.26473844051361084, "rewards/margins_min": -0.03165449574589729, "rewards/margins_std": 0.13367241621017456, "rewards/rejected": 0.053599096834659576, "step": 2050 }, { "dpo_losses": 0.6560009121894836, "epoch": 0.54, "grad_norm": 2.587478777664799, "learning_rate": 2.595923867132136e-06, "logits/chosen": -2.3362202644348145, "logits/rejected": -2.2678093910217285, "logps/chosen": -267.53009033203125, "logps/rejected": -241.83889770507812, "loss": 0.6607, "positive_losses": 0.0, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1875811070203781, "rewards/margins": 0.07941807806491852, "rewards/margins_max": 0.19589532911777496, "rewards/margins_min": -0.03498644009232521, "rewards/margins_std": 0.10155224800109863, "rewards/rejected": 0.10816304385662079, "step": 2060 }, { "dpo_losses": 0.6506420373916626, "epoch": 0.54, "grad_norm": 2.1289124095022705, "learning_rate": 2.5730923786680672e-06, "logits/chosen": -2.2264554500579834, "logits/rejected": -2.251516103744507, "logps/chosen": -205.99502563476562, "logps/rejected": -262.0281066894531, "loss": 0.6729, "positive_losses": 0.018190670758485794, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.17610502243041992, "rewards/margins": 0.09245802462100983, "rewards/margins_max": 0.2544223666191101, "rewards/margins_min": -0.0524260513484478, "rewards/margins_std": 0.13549388945102692, "rewards/rejected": 0.0836469978094101, "step": 2070 }, { "dpo_losses": 0.6619730591773987, "epoch": 0.54, "grad_norm": 2.3911505227502365, "learning_rate": 2.5502547870114137e-06, "logits/chosen": -2.440260410308838, "logits/rejected": -2.3453919887542725, "logps/chosen": -198.92776489257812, "logps/rejected": -218.76953125, "loss": 0.6758, "positive_losses": 0.29948052763938904, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.15793924033641815, "rewards/margins": 0.06796392053365707, "rewards/margins_max": 0.21925139427185059, "rewards/margins_min": -0.0832114964723587, "rewards/margins_std": 0.13561256229877472, "rewards/rejected": 0.08997531980276108, "step": 2080 }, { "dpo_losses": 0.6462734937667847, "epoch": 0.55, "grad_norm": 8.823808921425936, "learning_rate": 2.527412999094507e-06, "logits/chosen": -2.2284703254699707, "logits/rejected": -2.2002511024475098, "logps/chosen": -250.41259765625, "logps/rejected": -304.85382080078125, "loss": 0.6803, "positive_losses": 0.4008301794528961, "rewards/accuracies": 0.75, "rewards/chosen": 0.14126747846603394, "rewards/margins": 0.10547207295894623, "rewards/margins_max": 0.2907482981681824, "rewards/margins_min": -0.04991523548960686, "rewards/margins_std": 0.15157726407051086, "rewards/rejected": 0.03579540550708771, "step": 2090 }, { "dpo_losses": 0.6568211317062378, "epoch": 0.55, "grad_norm": 2.1124797436346805, "learning_rate": 2.504568922200064e-06, "logits/chosen": -2.3519229888916016, "logits/rejected": -2.2333531379699707, "logps/chosen": -226.63345336914062, "logps/rejected": -200.7207794189453, "loss": 0.6863, "positive_losses": 0.2710380554199219, "rewards/accuracies": 0.625, "rewards/chosen": 0.15608185529708862, "rewards/margins": 0.07943911850452423, "rewards/margins_max": 0.25659435987472534, "rewards/margins_min": -0.04794612526893616, "rewards/margins_std": 0.13517385721206665, "rewards/rejected": 0.07664273679256439, "step": 2100 }, { "epoch": 0.55, "eval_dpo_losses": 0.6553297638893127, "eval_logits/chosen": -2.2443695068359375, "eval_logits/rejected": -2.138110876083374, "eval_logps/chosen": -259.83306884765625, "eval_logps/rejected": -255.2269287109375, "eval_loss": 0.686160147190094, "eval_positive_losses": 0.14275741577148438, "eval_rewards/accuracies": 0.692460298538208, "eval_rewards/chosen": 0.15941113233566284, "eval_rewards/margins": 0.08347754925489426, "eval_rewards/margins_max": 0.33785706758499146, "eval_rewards/margins_min": -0.11630003899335861, "eval_rewards/margins_std": 0.1520967334508896, "eval_rewards/rejected": 0.07593357563018799, "eval_runtime": 386.459, "eval_samples_per_second": 5.175, "eval_steps_per_second": 0.163, "step": 2100 }, { "dpo_losses": 0.6585019826889038, "epoch": 0.55, "grad_norm": 1.9768479936403094, "learning_rate": 2.4817244638019333e-06, "logits/chosen": -2.2777047157287598, "logits/rejected": -1.962303876876831, "logps/chosen": -234.02420043945312, "logps/rejected": -209.2008819580078, "loss": 0.7141, "positive_losses": 0.050678253173828125, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.14699319005012512, "rewards/margins": 0.07718072086572647, "rewards/margins_max": 0.27803340554237366, "rewards/margins_min": -0.0683688074350357, "rewards/margins_std": 0.15312418341636658, "rewards/rejected": 0.06981248408555984, "step": 2110 }, { "dpo_losses": 0.6425925493240356, "epoch": 0.55, "grad_norm": 19.579088289350643, "learning_rate": 2.4588815314058155e-06, "logits/chosen": -2.2205238342285156, "logits/rejected": -2.2000303268432617, "logps/chosen": -212.6324920654297, "logps/rejected": -247.48867797851562, "loss": 0.7043, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.1731795370578766, "rewards/margins": 0.10820887982845306, "rewards/margins_max": 0.24850232899188995, "rewards/margins_min": -0.008411906659603119, "rewards/margins_std": 0.11912062019109726, "rewards/rejected": 0.06497064977884293, "step": 2120 }, { "dpo_losses": 0.6860312223434448, "epoch": 0.56, "grad_norm": 2.355493852219062, "learning_rate": 2.4360420323899922e-06, "logits/chosen": -2.3546016216278076, "logits/rejected": -2.444478750228882, "logps/chosen": -178.6026153564453, "logps/rejected": -240.92257690429688, "loss": 0.6829, "positive_losses": 0.3143409788608551, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.13544535636901855, "rewards/margins": 0.01912139728665352, "rewards/margins_max": 0.15571314096450806, "rewards/margins_min": -0.13922038674354553, "rewards/margins_std": 0.13142560422420502, "rewards/rejected": 0.11632396280765533, "step": 2130 }, { "dpo_losses": 0.6597676873207092, "epoch": 0.56, "grad_norm": 27.79115227066431, "learning_rate": 2.4132078738460585e-06, "logits/chosen": -2.4666554927825928, "logits/rejected": -2.3963735103607178, "logps/chosen": -264.1156311035156, "logps/rejected": -262.232421875, "loss": 0.6912, "positive_losses": 0.15466919541358948, "rewards/accuracies": 0.625, "rewards/chosen": 0.1620219349861145, "rewards/margins": 0.0729297325015068, "rewards/margins_max": 0.21023695170879364, "rewards/margins_min": -0.05655983090400696, "rewards/margins_std": 0.1222340315580368, "rewards/rejected": 0.08909222483634949, "step": 2140 }, { "dpo_losses": 0.6541584730148315, "epoch": 0.56, "grad_norm": 2.2257776851498305, "learning_rate": 2.3903809624196826e-06, "logits/chosen": -2.422748565673828, "logits/rejected": -2.186906099319458, "logps/chosen": -254.19998168945312, "logps/rejected": -203.87844848632812, "loss": 0.6968, "positive_losses": 0.676135241985321, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.16899213194847107, "rewards/margins": 0.08468370139598846, "rewards/margins_max": 0.2252078354358673, "rewards/margins_min": -0.05006546899676323, "rewards/margins_std": 0.12473394721746445, "rewards/rejected": 0.0843084305524826, "step": 2150 }, { "dpo_losses": 0.6540201306343079, "epoch": 0.57, "grad_norm": 26.496714262323522, "learning_rate": 2.3675632041513978e-06, "logits/chosen": -2.0788064002990723, "logits/rejected": -2.0171713829040527, "logps/chosen": -190.94192504882812, "logps/rejected": -220.2122802734375, "loss": 0.6979, "positive_losses": 0.09005127102136612, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.16166457533836365, "rewards/margins": 0.08768502622842789, "rewards/margins_max": 0.2841649055480957, "rewards/margins_min": -0.04412169009447098, "rewards/margins_std": 0.14577646553516388, "rewards/rejected": 0.07397954165935516, "step": 2160 }, { "dpo_losses": 0.651910662651062, "epoch": 0.57, "grad_norm": 1.8282939474174966, "learning_rate": 2.3447565043174533e-06, "logits/chosen": -2.0645394325256348, "logits/rejected": -2.0284571647644043, "logps/chosen": -212.9246368408203, "logps/rejected": -232.7741241455078, "loss": 0.6632, "positive_losses": 0.09863968193531036, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.16998614370822906, "rewards/margins": 0.08791140466928482, "rewards/margins_max": 0.22409109771251678, "rewards/margins_min": -0.038312822580337524, "rewards/margins_std": 0.11691458523273468, "rewards/rejected": 0.08207472413778305, "step": 2170 }, { "dpo_losses": 0.6686201095581055, "epoch": 0.57, "grad_norm": 15.348893494129817, "learning_rate": 2.321962767270724e-06, "logits/chosen": -2.6147117614746094, "logits/rejected": -2.3883659839630127, "logps/chosen": -316.5950927734375, "logps/rejected": -242.1833038330078, "loss": 0.6892, "positive_losses": 0.5988219976425171, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.13277402520179749, "rewards/margins": 0.05517308786511421, "rewards/margins_max": 0.19286704063415527, "rewards/margins_min": -0.08148355036973953, "rewards/margins_std": 0.11973325908184052, "rewards/rejected": 0.07760094106197357, "step": 2180 }, { "dpo_losses": 0.6531265377998352, "epoch": 0.57, "grad_norm": 21.105234142343104, "learning_rate": 2.299183896281692e-06, "logits/chosen": -2.3792638778686523, "logits/rejected": -2.457986354827881, "logps/chosen": -263.7500915527344, "logps/rejected": -324.11932373046875, "loss": 0.673, "positive_losses": 0.2141517698764801, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1407163441181183, "rewards/margins": 0.08956310898065567, "rewards/margins_max": 0.30031102895736694, "rewards/margins_min": -0.09678220748901367, "rewards/margins_std": 0.1736389547586441, "rewards/rejected": 0.05115324258804321, "step": 2190 }, { "dpo_losses": 0.6388326287269592, "epoch": 0.58, "grad_norm": 2.366529820296026, "learning_rate": 2.2764217933795297e-06, "logits/chosen": -2.453784227371216, "logits/rejected": -2.1842854022979736, "logps/chosen": -332.912841796875, "logps/rejected": -281.45703125, "loss": 0.6773, "positive_losses": 0.39421844482421875, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.18071512877941132, "rewards/margins": 0.12147901207208633, "rewards/margins_max": 0.34408822655677795, "rewards/margins_min": -0.04213566333055496, "rewards/margins_std": 0.17345568537712097, "rewards/rejected": 0.05923609808087349, "step": 2200 }, { "epoch": 0.58, "eval_dpo_losses": 0.653840184211731, "eval_logits/chosen": -2.2498533725738525, "eval_logits/rejected": -2.1444947719573975, "eval_logps/chosen": -260.26995849609375, "eval_logps/rejected": -256.0005798339844, "eval_loss": 0.6892291307449341, "eval_positive_losses": 0.18925876915454865, "eval_rewards/accuracies": 0.7063491940498352, "eval_rewards/chosen": 0.15504179894924164, "eval_rewards/margins": 0.08684448897838593, "eval_rewards/margins_max": 0.34502720832824707, "eval_rewards/margins_min": -0.11554267257452011, "eval_rewards/margins_std": 0.1544053852558136, "eval_rewards/rejected": 0.06819730997085571, "eval_runtime": 387.0225, "eval_samples_per_second": 5.168, "eval_steps_per_second": 0.163, "step": 2200 }, { "dpo_losses": 0.6598193049430847, "epoch": 0.58, "grad_norm": 2.062548087744335, "learning_rate": 2.2536783591932786e-06, "logits/chosen": -2.367823600769043, "logits/rejected": -2.31829571723938, "logps/chosen": -235.9591522216797, "logps/rejected": -243.7229461669922, "loss": 0.6731, "positive_losses": 0.3742210268974304, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.14884069561958313, "rewards/margins": 0.07413595914840698, "rewards/margins_max": 0.23931702971458435, "rewards/margins_min": -0.07351360470056534, "rewards/margins_std": 0.14227581024169922, "rewards/rejected": 0.07470472902059555, "step": 2210 }, { "dpo_losses": 0.6491401791572571, "epoch": 0.58, "grad_norm": 18.24199009529165, "learning_rate": 2.230955492793149e-06, "logits/chosen": -2.4262213706970215, "logits/rejected": -2.266186237335205, "logps/chosen": -291.1212158203125, "logps/rejected": -264.0152282714844, "loss": 0.7074, "positive_losses": 0.15470580756664276, "rewards/accuracies": 0.75, "rewards/chosen": 0.16550055146217346, "rewards/margins": 0.09727510064840317, "rewards/margins_max": 0.27264532446861267, "rewards/margins_min": -0.11392829567193985, "rewards/margins_std": 0.17097152769565582, "rewards/rejected": 0.06822545826435089, "step": 2220 }, { "dpo_losses": 0.6427727937698364, "epoch": 0.58, "grad_norm": 19.66987368195491, "learning_rate": 2.208255091531947e-06, "logits/chosen": -2.2821335792541504, "logits/rejected": -2.2873048782348633, "logps/chosen": -227.7251739501953, "logps/rejected": -259.9873352050781, "loss": 0.6572, "positive_losses": 0.01116332970559597, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.15087153017520905, "rewards/margins": 0.1119416207075119, "rewards/margins_max": 0.2820807099342346, "rewards/margins_min": -0.05207523703575134, "rewards/margins_std": 0.1523815542459488, "rewards/rejected": 0.038929905742406845, "step": 2230 }, { "dpo_losses": 0.6642849445343018, "epoch": 0.59, "grad_norm": 2.3934761946534078, "learning_rate": 2.1855790508866435e-06, "logits/chosen": -2.304441213607788, "logits/rejected": -2.177365303039551, "logps/chosen": -261.0813903808594, "logps/rejected": -222.5238037109375, "loss": 0.721, "positive_losses": 0.04360236972570419, "rewards/accuracies": 0.625, "rewards/chosen": 0.15033815801143646, "rewards/margins": 0.06477586925029755, "rewards/margins_max": 0.2120368778705597, "rewards/margins_min": -0.0854678601026535, "rewards/margins_std": 0.13209500908851624, "rewards/rejected": 0.08556229621171951, "step": 2240 }, { "dpo_losses": 0.6582245230674744, "epoch": 0.59, "grad_norm": 6.295832015146168, "learning_rate": 2.162929264300107e-06, "logits/chosen": -2.347304105758667, "logits/rejected": -2.205922842025757, "logps/chosen": -258.48138427734375, "logps/rejected": -233.47805786132812, "loss": 0.7022, "positive_losses": 0.20507431030273438, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1643645316362381, "rewards/margins": 0.07669254392385483, "rewards/margins_max": 0.24145369231700897, "rewards/margins_min": -0.06831179559230804, "rewards/margins_std": 0.14044266939163208, "rewards/rejected": 0.08767198026180267, "step": 2250 }, { "dpo_losses": 0.6525287628173828, "epoch": 0.59, "grad_norm": 13.151068675199165, "learning_rate": 2.1403076230230006e-06, "logits/chosen": -2.212770700454712, "logits/rejected": -2.0876238346099854, "logps/chosen": -221.83743286132812, "logps/rejected": -227.41580200195312, "loss": 0.7003, "positive_losses": 0.08033790439367294, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.159569650888443, "rewards/margins": 0.08753874897956848, "rewards/margins_max": 0.246320441365242, "rewards/margins_min": -0.039417896419763565, "rewards/margins_std": 0.12804748117923737, "rewards/rejected": 0.0720309242606163, "step": 2260 }, { "dpo_losses": 0.6604429483413696, "epoch": 0.59, "grad_norm": 9.98785998635011, "learning_rate": 2.11771601595586e-06, "logits/chosen": -2.343064308166504, "logits/rejected": -2.1542677879333496, "logps/chosen": -228.5521697998047, "logps/rejected": -236.6898193359375, "loss": 0.6751, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.15450337529182434, "rewards/margins": 0.07203666865825653, "rewards/margins_max": 0.23137226700782776, "rewards/margins_min": -0.07542747259140015, "rewards/margins_std": 0.13655950129032135, "rewards/rejected": 0.08246669918298721, "step": 2270 }, { "dpo_losses": 0.6473335027694702, "epoch": 0.6, "grad_norm": 10.805701792255075, "learning_rate": 2.0951563294913737e-06, "logits/chosen": -2.3738865852355957, "logits/rejected": -2.309765577316284, "logps/chosen": -229.6358642578125, "logps/rejected": -252.7618408203125, "loss": 0.6743, "positive_losses": 0.176319882273674, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.16395223140716553, "rewards/margins": 0.10114537179470062, "rewards/margins_max": 0.27519017457962036, "rewards/margins_min": -0.059026725590229034, "rewards/margins_std": 0.14918819069862366, "rewards/rejected": 0.0628068596124649, "step": 2280 }, { "dpo_losses": 0.6592585444450378, "epoch": 0.6, "grad_norm": 5.4701076021368245, "learning_rate": 2.0726304473568693e-06, "logits/chosen": -2.2610745429992676, "logits/rejected": -2.1019999980926514, "logps/chosen": -195.2284698486328, "logps/rejected": -204.8556365966797, "loss": 0.6943, "positive_losses": 0.005021190736442804, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15064308047294617, "rewards/margins": 0.07348679006099701, "rewards/margins_max": 0.20557072758674622, "rewards/margins_min": -0.04632128030061722, "rewards/margins_std": 0.11287045478820801, "rewards/rejected": 0.07715629041194916, "step": 2290 }, { "dpo_losses": 0.6648882031440735, "epoch": 0.6, "grad_norm": 2.4646253292561204, "learning_rate": 2.050140250457023e-06, "logits/chosen": -2.196101188659668, "logits/rejected": -2.1919500827789307, "logps/chosen": -219.4390411376953, "logps/rejected": -233.34765625, "loss": 0.6967, "positive_losses": 0.010335922241210938, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1766515076160431, "rewards/margins": 0.06181100755929947, "rewards/margins_max": 0.19367459416389465, "rewards/margins_min": -0.08046714961528778, "rewards/margins_std": 0.12206308543682098, "rewards/rejected": 0.11484047025442123, "step": 2300 }, { "epoch": 0.6, "eval_dpo_losses": 0.6542055010795593, "eval_logits/chosen": -2.248225212097168, "eval_logits/rejected": -2.1438400745391846, "eval_logps/chosen": -259.7584533691406, "eval_logps/rejected": -255.399169921875, "eval_loss": 0.6864062547683716, "eval_positive_losses": 0.17344222962856293, "eval_rewards/accuracies": 0.6964285969734192, "eval_rewards/chosen": 0.1601567566394806, "eval_rewards/margins": 0.0859455093741417, "eval_rewards/margins_max": 0.3410240113735199, "eval_rewards/margins_min": -0.1155787855386734, "eval_rewards/margins_std": 0.15319597721099854, "eval_rewards/rejected": 0.0742112472653389, "eval_runtime": 387.2507, "eval_samples_per_second": 5.165, "eval_steps_per_second": 0.163, "step": 2300 }, { "dpo_losses": 0.6528568863868713, "epoch": 0.6, "grad_norm": 2.2841811487944237, "learning_rate": 2.0276876167168042e-06, "logits/chosen": -2.427232265472412, "logits/rejected": -2.1577353477478027, "logps/chosen": -298.0076904296875, "logps/rejected": -240.8087921142578, "loss": 0.6649, "positive_losses": 0.025879669934511185, "rewards/accuracies": 0.625, "rewards/chosen": 0.1474526822566986, "rewards/margins": 0.08844628930091858, "rewards/margins_max": 0.28476443886756897, "rewards/margins_min": -0.07260879129171371, "rewards/margins_std": 0.160629004240036, "rewards/rejected": 0.059006400406360626, "step": 2310 }, { "dpo_losses": 0.6599723100662231, "epoch": 0.61, "grad_norm": 15.71791125992891, "learning_rate": 2.0052744209246682e-06, "logits/chosen": -2.26005482673645, "logits/rejected": -2.2196738719940186, "logps/chosen": -245.84048461914062, "logps/rejected": -246.4224090576172, "loss": 0.7116, "positive_losses": 0.4199165403842926, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.16618305444717407, "rewards/margins": 0.07166745513677597, "rewards/margins_max": 0.22913792729377747, "rewards/margins_min": -0.07279949635267258, "rewards/margins_std": 0.13110610842704773, "rewards/rejected": 0.0945156067609787, "step": 2320 }, { "dpo_losses": 0.6400814652442932, "epoch": 0.61, "grad_norm": 7.780448412684285, "learning_rate": 1.9829025345760127e-06, "logits/chosen": -2.2666420936584473, "logits/rejected": -2.1920011043548584, "logps/chosen": -280.20098876953125, "logps/rejected": -311.0518798828125, "loss": 0.6919, "positive_losses": 0.23571816086769104, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.17709824442863464, "rewards/margins": 0.11623726785182953, "rewards/margins_max": 0.29110187292099, "rewards/margins_min": -0.05250846594572067, "rewards/margins_std": 0.150735005736351, "rewards/rejected": 0.060860972851514816, "step": 2330 }, { "dpo_losses": 0.6592649817466736, "epoch": 0.61, "grad_norm": 5.937490684100422, "learning_rate": 1.9605738257169115e-06, "logits/chosen": -2.2879064083099365, "logits/rejected": -2.1984965801239014, "logps/chosen": -319.3297119140625, "logps/rejected": -315.752685546875, "loss": 0.6822, "positive_losses": 0.2728019654750824, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.13883593678474426, "rewards/margins": 0.0735674649477005, "rewards/margins_max": 0.2363482415676117, "rewards/margins_min": -0.05464771389961243, "rewards/margins_std": 0.1320185512304306, "rewards/rejected": 0.06526847928762436, "step": 2340 }, { "dpo_losses": 0.6482247114181519, "epoch": 0.62, "grad_norm": 21.378750299198238, "learning_rate": 1.9382901587881275e-06, "logits/chosen": -2.399590253829956, "logits/rejected": -2.2812163829803467, "logps/chosen": -275.9439392089844, "logps/rejected": -239.9832763671875, "loss": 0.6709, "positive_losses": 0.0, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.19270841777324677, "rewards/margins": 0.0943981260061264, "rewards/margins_max": 0.20143957436084747, "rewards/margins_min": -0.003161539090797305, "rewards/margins_std": 0.0917639285326004, "rewards/rejected": 0.09831027686595917, "step": 2350 }, { "dpo_losses": 0.6602179408073425, "epoch": 0.62, "grad_norm": 13.819512015752062, "learning_rate": 1.916053394469437e-06, "logits/chosen": -2.535148859024048, "logits/rejected": -2.346242904663086, "logps/chosen": -249.9375457763672, "logps/rejected": -203.2711944580078, "loss": 0.6743, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1673075407743454, "rewards/margins": 0.07183058559894562, "rewards/margins_max": 0.23388834297657013, "rewards/margins_min": -0.049382101744413376, "rewards/margins_std": 0.1247815489768982, "rewards/rejected": 0.09547694772481918, "step": 2360 }, { "dpo_losses": 0.6746606826782227, "epoch": 0.62, "grad_norm": 12.283777496161159, "learning_rate": 1.8938653895242604e-06, "logits/chosen": -2.439030647277832, "logits/rejected": -2.2991063594818115, "logps/chosen": -228.607177734375, "logps/rejected": -222.484375, "loss": 0.6793, "positive_losses": 0.42002028226852417, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.16483189165592194, "rewards/margins": 0.045864470303058624, "rewards/margins_max": 0.22576327621936798, "rewards/margins_min": -0.14611127972602844, "rewards/margins_std": 0.16603752970695496, "rewards/rejected": 0.11896741390228271, "step": 2370 }, { "dpo_losses": 0.6637119054794312, "epoch": 0.62, "grad_norm": 2.2035225583924114, "learning_rate": 1.8717279966446267e-06, "logits/chosen": -2.4245810508728027, "logits/rejected": -2.3400917053222656, "logps/chosen": -216.8575897216797, "logps/rejected": -202.5941619873047, "loss": 0.6585, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15102484822273254, "rewards/margins": 0.06429088860750198, "rewards/margins_max": 0.2087424248456955, "rewards/margins_min": -0.06031946465373039, "rewards/margins_std": 0.11977015435695648, "rewards/rejected": 0.08673397451639175, "step": 2380 }, { "dpo_losses": 0.6618281602859497, "epoch": 0.63, "grad_norm": 17.106925837832282, "learning_rate": 1.8496430642964698e-06, "logits/chosen": -2.373640537261963, "logits/rejected": -2.3986754417419434, "logps/chosen": -250.9483642578125, "logps/rejected": -288.2162170410156, "loss": 0.6706, "positive_losses": 0.10676345974206924, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.17134235799312592, "rewards/margins": 0.07012984156608582, "rewards/margins_max": 0.2648313641548157, "rewards/margins_min": -0.10279300063848495, "rewards/margins_std": 0.16330289840698242, "rewards/rejected": 0.1012125164270401, "step": 2390 }, { "dpo_losses": 0.6392374634742737, "epoch": 0.63, "grad_norm": 10.919237683415178, "learning_rate": 1.827612436565286e-06, "logits/chosen": -2.353942632675171, "logits/rejected": -2.1905226707458496, "logps/chosen": -247.59970092773438, "logps/rejected": -292.2537841796875, "loss": 0.6674, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.19166508316993713, "rewards/margins": 0.11626045405864716, "rewards/margins_max": 0.26691120862960815, "rewards/margins_min": -0.016501018777489662, "rewards/margins_std": 0.12931865453720093, "rewards/rejected": 0.07540462911128998, "step": 2400 }, { "epoch": 0.63, "eval_dpo_losses": 0.6543847322463989, "eval_logits/chosen": -2.239903211593628, "eval_logits/rejected": -2.1347789764404297, "eval_logps/chosen": -259.18359375, "eval_logps/rejected": -254.7984161376953, "eval_loss": 0.6829746961593628, "eval_positive_losses": 0.13217762112617493, "eval_rewards/accuracies": 0.6964285969734192, "eval_rewards/chosen": 0.16590562462806702, "eval_rewards/margins": 0.08568684011697769, "eval_rewards/margins_max": 0.3405982553958893, "eval_rewards/margins_min": -0.1206393837928772, "eval_rewards/margins_std": 0.15430031716823578, "eval_rewards/rejected": 0.08021882176399231, "eval_runtime": 386.6342, "eval_samples_per_second": 5.173, "eval_steps_per_second": 0.163, "step": 2400 }, { "dpo_losses": 0.6597193479537964, "epoch": 0.63, "grad_norm": 22.64174508745889, "learning_rate": 1.8056379530021492e-06, "logits/chosen": -2.352130174636841, "logits/rejected": -2.2774384021759033, "logps/chosen": -275.99884033203125, "logps/rejected": -244.3888702392578, "loss": 0.6534, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.16923965513706207, "rewards/margins": 0.07458443194627762, "rewards/margins_max": 0.24472546577453613, "rewards/margins_min": -0.08073264360427856, "rewards/margins_std": 0.1445024609565735, "rewards/rejected": 0.09465523064136505, "step": 2410 }, { "dpo_losses": 0.670413076877594, "epoch": 0.63, "grad_norm": 11.913254327097587, "learning_rate": 1.7837214484701154e-06, "logits/chosen": -2.4123101234436035, "logits/rejected": -2.223991870880127, "logps/chosen": -318.64239501953125, "logps/rejected": -270.60089111328125, "loss": 0.6862, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.17370271682739258, "rewards/margins": 0.05205560848116875, "rewards/margins_max": 0.20840521156787872, "rewards/margins_min": -0.09155932068824768, "rewards/margins_std": 0.1330285221338272, "rewards/rejected": 0.12164710462093353, "step": 2420 }, { "dpo_losses": 0.6464499235153198, "epoch": 0.64, "grad_norm": 6.715805473050531, "learning_rate": 1.7618647529910043e-06, "logits/chosen": -2.345921277999878, "logits/rejected": -2.2066192626953125, "logps/chosen": -266.8060302734375, "logps/rejected": -242.70285034179688, "loss": 0.6655, "positive_losses": 0.48648756742477417, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15442386269569397, "rewards/margins": 0.10020202398300171, "rewards/margins_max": 0.25283271074295044, "rewards/margins_min": -0.036137789487838745, "rewards/margins_std": 0.1275724619626999, "rewards/rejected": 0.05422184616327286, "step": 2430 }, { "dpo_losses": 0.6536785364151001, "epoch": 0.64, "grad_norm": 5.052402551797527, "learning_rate": 1.7400696915925996e-06, "logits/chosen": -2.254143238067627, "logits/rejected": -2.198411226272583, "logps/chosen": -271.36077880859375, "logps/rejected": -256.7943115234375, "loss": 0.6666, "positive_losses": 0.06517486274242401, "rewards/accuracies": 0.75, "rewards/chosen": 0.15615098178386688, "rewards/margins": 0.08732055127620697, "rewards/margins_max": 0.2473866492509842, "rewards/margins_min": -0.07213245332241058, "rewards/margins_std": 0.13965485990047455, "rewards/rejected": 0.06883043050765991, "step": 2440 }, { "dpo_losses": 0.6582575440406799, "epoch": 0.64, "grad_norm": 4.054727484298276, "learning_rate": 1.718338084156254e-06, "logits/chosen": -2.333944082260132, "logits/rejected": -2.3637521266937256, "logps/chosen": -270.36810302734375, "logps/rejected": -334.80419921875, "loss": 0.6904, "positive_losses": 0.5169219970703125, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.14250943064689636, "rewards/margins": 0.07800460606813431, "rewards/margins_max": 0.2533722519874573, "rewards/margins_min": -0.10011126101016998, "rewards/margins_std": 0.1587589681148529, "rewards/rejected": 0.06450481712818146, "step": 2450 }, { "dpo_losses": 0.6609666347503662, "epoch": 0.64, "grad_norm": 9.7553567109617, "learning_rate": 1.6966717452649372e-06, "logits/chosen": -2.223315715789795, "logits/rejected": -2.1969823837280273, "logps/chosen": -220.9146270751953, "logps/rejected": -231.94677734375, "loss": 0.6833, "positive_losses": 0.23555946350097656, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.14648443460464478, "rewards/margins": 0.07381759583950043, "rewards/margins_max": 0.2866845726966858, "rewards/margins_min": -0.1064581423997879, "rewards/margins_std": 0.17279109358787537, "rewards/rejected": 0.07266684621572495, "step": 2460 }, { "dpo_losses": 0.6552813649177551, "epoch": 0.65, "grad_norm": 2.0617978605911476, "learning_rate": 1.6750724840517103e-06, "logits/chosen": -2.333803415298462, "logits/rejected": -2.2953193187713623, "logps/chosen": -313.409912109375, "logps/rejected": -285.7817077636719, "loss": 0.6903, "positive_losses": 0.5005233883857727, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.15589001774787903, "rewards/margins": 0.08735939115285873, "rewards/margins_max": 0.30748414993286133, "rewards/margins_min": -0.10609878599643707, "rewards/margins_std": 0.1882353127002716, "rewards/rejected": 0.0685306116938591, "step": 2470 }, { "dpo_losses": 0.6604397892951965, "epoch": 0.65, "grad_norm": 11.579494215176105, "learning_rate": 1.6535421040486686e-06, "logits/chosen": -2.412940263748169, "logits/rejected": -2.298880100250244, "logps/chosen": -279.41546630859375, "logps/rejected": -234.7178192138672, "loss": 0.6711, "positive_losses": 0.22225990891456604, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.16586124897003174, "rewards/margins": 0.0713425725698471, "rewards/margins_max": 0.190806582570076, "rewards/margins_min": -0.05704197287559509, "rewards/margins_std": 0.11491812765598297, "rewards/rejected": 0.09451869130134583, "step": 2480 }, { "dpo_losses": 0.6267030835151672, "epoch": 0.65, "grad_norm": 9.989822167387924, "learning_rate": 1.6320824030363458e-06, "logits/chosen": -2.3187882900238037, "logits/rejected": -2.0554635524749756, "logps/chosen": -292.9251708984375, "logps/rejected": -246.4881134033203, "loss": 0.6497, "positive_losses": 0.014348601922392845, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.21200427412986755, "rewards/margins": 0.14321297407150269, "rewards/margins_max": 0.3055918216705322, "rewards/margins_min": -0.0032319724559783936, "rewards/margins_std": 0.1415683478116989, "rewards/rejected": 0.06879128515720367, "step": 2490 }, { "dpo_losses": 0.6509294509887695, "epoch": 0.65, "grad_norm": 2.2469296168876918, "learning_rate": 1.6106951728936028e-06, "logits/chosen": -2.2784674167633057, "logits/rejected": -2.29398512840271, "logps/chosen": -230.30709838867188, "logps/rejected": -281.54779052734375, "loss": 0.6892, "positive_losses": 0.4901771545410156, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.17643043398857117, "rewards/margins": 0.09405721724033356, "rewards/margins_max": 0.2858222424983978, "rewards/margins_min": -0.0642620176076889, "rewards/margins_std": 0.15505994856357574, "rewards/rejected": 0.0823732241988182, "step": 2500 }, { "epoch": 0.65, "eval_dpo_losses": 0.6547347903251648, "eval_logits/chosen": -2.2405202388763428, "eval_logits/rejected": -2.1352145671844482, "eval_logps/chosen": -258.6753234863281, "eval_logps/rejected": -254.2301025390625, "eval_loss": 0.6822869777679443, "eval_positive_losses": 0.13067425787448883, "eval_rewards/accuracies": 0.6845238208770752, "eval_rewards/chosen": 0.1709882616996765, "eval_rewards/margins": 0.08508633077144623, "eval_rewards/margins_max": 0.3441563844680786, "eval_rewards/margins_min": -0.12116732448339462, "eval_rewards/margins_std": 0.15571796894073486, "eval_rewards/rejected": 0.08590193837881088, "eval_runtime": 386.593, "eval_samples_per_second": 5.173, "eval_steps_per_second": 0.163, "step": 2500 }, { "dpo_losses": 0.6511312127113342, "epoch": 0.66, "grad_norm": 2.2041598655821706, "learning_rate": 1.5893821994479996e-06, "logits/chosen": -2.2733168601989746, "logits/rejected": -2.1043715476989746, "logps/chosen": -224.7740478515625, "logps/rejected": -230.84384155273438, "loss": 0.6549, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.17908117175102234, "rewards/margins": 0.0930861234664917, "rewards/margins_max": 0.2508627772331238, "rewards/margins_min": -0.04661761596798897, "rewards/margins_std": 0.13155964016914368, "rewards/rejected": 0.08599505573511124, "step": 2510 }, { "dpo_losses": 0.6298661231994629, "epoch": 0.66, "grad_norm": 1.713049364727071, "learning_rate": 1.5681452623266868e-06, "logits/chosen": -2.392409563064575, "logits/rejected": -2.2570481300354004, "logps/chosen": -230.9207305908203, "logps/rejected": -228.6148681640625, "loss": 0.7055, "positive_losses": 0.03975028917193413, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.17922814190387726, "rewards/margins": 0.14048965275287628, "rewards/margins_max": 0.34951239824295044, "rewards/margins_min": -0.037560805678367615, "rewards/margins_std": 0.17873182892799377, "rewards/rejected": 0.03873848170042038, "step": 2520 }, { "dpo_losses": 0.6607402563095093, "epoch": 0.66, "grad_norm": 8.763718668041445, "learning_rate": 1.5469861348078014e-06, "logits/chosen": -2.321742534637451, "logits/rejected": -2.3663437366485596, "logps/chosen": -254.3001708984375, "logps/rejected": -290.60772705078125, "loss": 0.6871, "positive_losses": 0.6780862808227539, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.14342905580997467, "rewards/margins": 0.07178018987178802, "rewards/margins_max": 0.21852734684944153, "rewards/margins_min": -0.08084462583065033, "rewards/margins_std": 0.1346113234758377, "rewards/rejected": 0.07164885848760605, "step": 2530 }, { "dpo_losses": 0.651390790939331, "epoch": 0.66, "grad_norm": 14.955716696770587, "learning_rate": 1.5259065836724035e-06, "logits/chosen": -2.2039706707000732, "logits/rejected": -2.3029284477233887, "logps/chosen": -256.8654479980469, "logps/rejected": -316.02545166015625, "loss": 0.6893, "positive_losses": 0.04978217929601669, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.17799420654773712, "rewards/margins": 0.0939134731888771, "rewards/margins_max": 0.27904585003852844, "rewards/margins_min": -0.11202855408191681, "rewards/margins_std": 0.1759130358695984, "rewards/rejected": 0.08408074080944061, "step": 2540 }, { "dpo_losses": 0.6456555724143982, "epoch": 0.67, "grad_norm": 2.486302809425322, "learning_rate": 1.5049083690569456e-06, "logits/chosen": -2.4829952716827393, "logits/rejected": -2.282881736755371, "logps/chosen": -251.9456787109375, "logps/rejected": -278.27716064453125, "loss": 0.6762, "positive_losses": 0.8203288912773132, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.161819726228714, "rewards/margins": 0.1074310764670372, "rewards/margins_max": 0.26294535398483276, "rewards/margins_min": -0.047836970537900925, "rewards/margins_std": 0.14247296750545502, "rewards/rejected": 0.05438864231109619, "step": 2550 }, { "dpo_losses": 0.6591287851333618, "epoch": 0.67, "grad_norm": 2.33551399193772, "learning_rate": 1.4839932443063057e-06, "logits/chosen": -2.1252949237823486, "logits/rejected": -2.081033229827881, "logps/chosen": -175.4127960205078, "logps/rejected": -172.68276977539062, "loss": 0.6606, "positive_losses": 0.05720243602991104, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15542718768119812, "rewards/margins": 0.07314060628414154, "rewards/margins_max": 0.19430318474769592, "rewards/margins_min": -0.054354071617126465, "rewards/margins_std": 0.1059526577591896, "rewards/rejected": 0.08228656649589539, "step": 2560 }, { "dpo_losses": 0.6454457640647888, "epoch": 0.67, "grad_norm": 7.797851990548673, "learning_rate": 1.4631629558273803e-06, "logits/chosen": -2.3393354415893555, "logits/rejected": -2.2406973838806152, "logps/chosen": -256.0242004394531, "logps/rejected": -272.27032470703125, "loss": 0.6794, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.18352380394935608, "rewards/margins": 0.1055334210395813, "rewards/margins_max": 0.28466230630874634, "rewards/margins_min": -0.07258038967847824, "rewards/margins_std": 0.1630893498659134, "rewards/rejected": 0.07799038290977478, "step": 2570 }, { "dpo_losses": 0.6473389863967896, "epoch": 0.68, "grad_norm": 2.389667593532742, "learning_rate": 1.4424192429432657e-06, "logits/chosen": -2.3204522132873535, "logits/rejected": -2.2108471393585205, "logps/chosen": -237.8162384033203, "logps/rejected": -262.1719055175781, "loss": 0.6693, "positive_losses": 0.06007995456457138, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1857767552137375, "rewards/margins": 0.10074867308139801, "rewards/margins_max": 0.31079235672950745, "rewards/margins_min": -0.05753164738416672, "rewards/margins_std": 0.1656728982925415, "rewards/rejected": 0.08502806723117828, "step": 2580 }, { "dpo_losses": 0.6396852731704712, "epoch": 0.68, "grad_norm": 5.499078326594066, "learning_rate": 1.421763837748016e-06, "logits/chosen": -2.5509209632873535, "logits/rejected": -2.346020221710205, "logps/chosen": -271.3934326171875, "logps/rejected": -230.72525024414062, "loss": 0.6893, "positive_losses": 0.569193959236145, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.17547360062599182, "rewards/margins": 0.11968740075826645, "rewards/margins_max": 0.3073948323726654, "rewards/margins_min": -0.0861777514219284, "rewards/margins_std": 0.1823941469192505, "rewards/rejected": 0.05578618496656418, "step": 2590 }, { "dpo_losses": 0.6720633506774902, "epoch": 0.68, "grad_norm": 2.4531726225099852, "learning_rate": 1.401198464962021e-06, "logits/chosen": -2.094071626663208, "logits/rejected": -2.2020092010498047, "logps/chosen": -214.7558135986328, "logps/rejected": -257.4778747558594, "loss": 0.6957, "positive_losses": 0.5747789144515991, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.12981782853603363, "rewards/margins": 0.04834100231528282, "rewards/margins_max": 0.22302313148975372, "rewards/margins_min": -0.10227296501398087, "rewards/margins_std": 0.14092636108398438, "rewards/rejected": 0.08147682249546051, "step": 2600 }, { "epoch": 0.68, "eval_dpo_losses": 0.6549044251441956, "eval_logits/chosen": -2.2436580657958984, "eval_logits/rejected": -2.1391208171844482, "eval_logps/chosen": -258.9982604980469, "eval_logps/rejected": -254.5000457763672, "eval_loss": 0.6825661659240723, "eval_positive_losses": 0.12786059081554413, "eval_rewards/accuracies": 0.6984127163887024, "eval_rewards/chosen": 0.16775895655155182, "eval_rewards/margins": 0.08455661684274673, "eval_rewards/margins_max": 0.33997035026550293, "eval_rewards/margins_min": -0.11716578155755997, "eval_rewards/margins_std": 0.1527850180864334, "eval_rewards/rejected": 0.08320236206054688, "eval_runtime": 387.0043, "eval_samples_per_second": 5.168, "eval_steps_per_second": 0.163, "step": 2600 }, { "dpo_losses": 0.6692937016487122, "epoch": 0.68, "grad_norm": 2.684931392539035, "learning_rate": 1.3807248417879896e-06, "logits/chosen": -2.368227481842041, "logits/rejected": -2.1184744834899902, "logps/chosen": -244.9230499267578, "logps/rejected": -174.0605926513672, "loss": 0.678, "positive_losses": 0.08151092380285263, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.16569675505161285, "rewards/margins": 0.05241062119603157, "rewards/margins_max": 0.20387116074562073, "rewards/margins_min": -0.09062186628580093, "rewards/margins_std": 0.13162575662136078, "rewards/rejected": 0.11328613758087158, "step": 2610 }, { "dpo_losses": 0.6513010859489441, "epoch": 0.69, "grad_norm": 12.244098721066036, "learning_rate": 1.3603446777675665e-06, "logits/chosen": -2.1435139179229736, "logits/rejected": -2.178621530532837, "logps/chosen": -218.70468139648438, "logps/rejected": -246.75314331054688, "loss": 0.6782, "positive_losses": 0.07565002143383026, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.16712693870067596, "rewards/margins": 0.0913182869553566, "rewards/margins_max": 0.2443009614944458, "rewards/margins_min": -0.05475162714719772, "rewards/margins_std": 0.13221421837806702, "rewards/rejected": 0.07580865919589996, "step": 2620 }, { "dpo_losses": 0.6594210267066956, "epoch": 0.69, "grad_norm": 29.26339753936434, "learning_rate": 1.3400596746385817e-06, "logits/chosen": -2.4458351135253906, "logits/rejected": -2.3435826301574707, "logps/chosen": -233.05245971679688, "logps/rejected": -236.0742645263672, "loss": 0.7104, "positive_losses": 0.10796470940113068, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.14475899934768677, "rewards/margins": 0.07429394125938416, "rewards/margins_max": 0.24970436096191406, "rewards/margins_min": -0.08126474916934967, "rewards/margins_std": 0.14938676357269287, "rewards/rejected": 0.07046505063772202, "step": 2630 }, { "dpo_losses": 0.682369589805603, "epoch": 0.69, "grad_norm": 9.327559819823339, "learning_rate": 1.3198715261929587e-06, "logits/chosen": -2.3018815517425537, "logits/rejected": -2.323288917541504, "logps/chosen": -306.16351318359375, "logps/rejected": -286.18609619140625, "loss": 0.7423, "positive_losses": 1.196443796157837, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.14743389189243317, "rewards/margins": 0.029096532613039017, "rewards/margins_max": 0.2234838306903839, "rewards/margins_min": -0.1615041196346283, "rewards/margins_std": 0.17085576057434082, "rewards/rejected": 0.11833735555410385, "step": 2640 }, { "dpo_losses": 0.63266521692276, "epoch": 0.69, "grad_norm": 1.9724582686077101, "learning_rate": 1.2997819181352823e-06, "logits/chosen": -2.3811697959899902, "logits/rejected": -2.1694116592407227, "logps/chosen": -214.98696899414062, "logps/rejected": -211.55300903320312, "loss": 0.6428, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.18273723125457764, "rewards/margins": 0.1316065937280655, "rewards/margins_max": 0.325876921415329, "rewards/margins_min": -0.028294021263718605, "rewards/margins_std": 0.16020527482032776, "rewards/rejected": 0.051130641251802444, "step": 2650 }, { "dpo_losses": 0.6455888748168945, "epoch": 0.7, "grad_norm": 15.436842684323334, "learning_rate": 1.2797925279420454e-06, "logits/chosen": -2.480480432510376, "logits/rejected": -2.399625778198242, "logps/chosen": -298.2550354003906, "logps/rejected": -240.13980102539062, "loss": 0.6592, "positive_losses": 0.06563720852136612, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.19139285385608673, "rewards/margins": 0.11018925905227661, "rewards/margins_max": 0.28477951884269714, "rewards/margins_min": -0.05193904787302017, "rewards/margins_std": 0.15335839986801147, "rewards/rejected": 0.08120360225439072, "step": 2660 }, { "dpo_losses": 0.6599230766296387, "epoch": 0.7, "grad_norm": 11.93267948625453, "learning_rate": 1.2599050247215764e-06, "logits/chosen": -2.483884811401367, "logits/rejected": -2.2498438358306885, "logps/chosen": -260.6651611328125, "logps/rejected": -225.16128540039062, "loss": 0.6837, "positive_losses": 0.6628273129463196, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.14891615509986877, "rewards/margins": 0.07099025696516037, "rewards/margins_max": 0.17174911499023438, "rewards/margins_min": -0.04231322556734085, "rewards/margins_std": 0.0940045565366745, "rewards/rejected": 0.07792589068412781, "step": 2670 }, { "dpo_losses": 0.6397324800491333, "epoch": 0.7, "grad_norm": 10.16187532013519, "learning_rate": 1.2401210690746705e-06, "logits/chosen": -2.4015774726867676, "logits/rejected": -2.314923048019409, "logps/chosen": -312.27947998046875, "logps/rejected": -353.54986572265625, "loss": 0.6985, "positive_losses": 1.038459062576294, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.1520489603281021, "rewards/margins": 0.12213818728923798, "rewards/margins_max": 0.371204674243927, "rewards/margins_min": -0.08915247023105621, "rewards/margins_std": 0.20714953541755676, "rewards/rejected": 0.02991076372563839, "step": 2680 }, { "dpo_losses": 0.6301525831222534, "epoch": 0.7, "grad_norm": 2.396153459031987, "learning_rate": 1.2204423129559306e-06, "logits/chosen": -2.241032123565674, "logits/rejected": -2.203106164932251, "logps/chosen": -169.1468048095703, "logps/rejected": -223.8304443359375, "loss": 0.6929, "positive_losses": 0.570334255695343, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.1512703001499176, "rewards/margins": 0.13741326332092285, "rewards/margins_max": 0.3291873037815094, "rewards/margins_min": -0.03459559381008148, "rewards/margins_std": 0.16125375032424927, "rewards/rejected": 0.013857054524123669, "step": 2690 }, { "dpo_losses": 0.6673535108566284, "epoch": 0.71, "grad_norm": 19.897738902391257, "learning_rate": 1.20087039953583e-06, "logits/chosen": -2.2144763469696045, "logits/rejected": -2.166745185852051, "logps/chosen": -224.12844848632812, "logps/rejected": -209.28652954101562, "loss": 0.6711, "positive_losses": 0.09066619724035263, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1556396186351776, "rewards/margins": 0.05799253657460213, "rewards/margins_max": 0.21496224403381348, "rewards/margins_min": -0.09004837274551392, "rewards/margins_std": 0.13549737632274628, "rewards/rejected": 0.09764708578586578, "step": 2700 }, { "epoch": 0.71, "eval_dpo_losses": 0.6560174822807312, "eval_logits/chosen": -2.2367265224456787, "eval_logits/rejected": -2.1324782371520996, "eval_logps/chosen": -258.6187438964844, "eval_logps/rejected": -253.8745574951172, "eval_loss": 0.681713879108429, "eval_positive_losses": 0.11478157341480255, "eval_rewards/accuracies": 0.6904761791229248, "eval_rewards/chosen": 0.17155404388904572, "eval_rewards/margins": 0.08209677040576935, "eval_rewards/margins_max": 0.3345986008644104, "eval_rewards/margins_min": -0.1177971214056015, "eval_rewards/margins_std": 0.1513931006193161, "eval_rewards/rejected": 0.08945729583501816, "eval_runtime": 434.0982, "eval_samples_per_second": 4.607, "eval_steps_per_second": 0.145, "step": 2700 }, { "dpo_losses": 0.6578797698020935, "epoch": 0.71, "grad_norm": 14.34189090025184, "learning_rate": 1.181406963063507e-06, "logits/chosen": -2.5197956562042236, "logits/rejected": -2.3595120906829834, "logps/chosen": -263.73321533203125, "logps/rejected": -264.922119140625, "loss": 0.6873, "positive_losses": 0.18321609497070312, "rewards/accuracies": 0.625, "rewards/chosen": 0.15809258818626404, "rewards/margins": 0.07585270702838898, "rewards/margins_max": 0.22914262115955353, "rewards/margins_min": -0.03902244567871094, "rewards/margins_std": 0.12546536326408386, "rewards/rejected": 0.08223988115787506, "step": 2710 }, { "dpo_losses": 0.6598614454269409, "epoch": 0.71, "grad_norm": 8.96142661930692, "learning_rate": 1.1620536287303052e-06, "logits/chosen": -2.2102255821228027, "logits/rejected": -2.1026530265808105, "logps/chosen": -190.5597686767578, "logps/rejected": -213.94906616210938, "loss": 0.6763, "positive_losses": 0.2107589691877365, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.16230681538581848, "rewards/margins": 0.07546380907297134, "rewards/margins_max": 0.2334224283695221, "rewards/margins_min": -0.09442691504955292, "rewards/margins_std": 0.14420178532600403, "rewards/rejected": 0.08684299886226654, "step": 2720 }, { "dpo_losses": 0.643836498260498, "epoch": 0.71, "grad_norm": 2.2256143257637153, "learning_rate": 1.1428120125340717e-06, "logits/chosen": -2.2484631538391113, "logits/rejected": -2.216881275177002, "logps/chosen": -285.47894287109375, "logps/rejected": -280.4811096191406, "loss": 0.6589, "positive_losses": 0.061797332018613815, "rewards/accuracies": 0.875, "rewards/chosen": 0.2055526226758957, "rewards/margins": 0.10586044937372208, "rewards/margins_max": 0.2709989547729492, "rewards/margins_min": -0.0479423925280571, "rewards/margins_std": 0.13987375795841217, "rewards/rejected": 0.09969218075275421, "step": 2730 }, { "dpo_losses": 0.6633583903312683, "epoch": 0.72, "grad_norm": 9.969648840869668, "learning_rate": 1.123683721144223e-06, "logits/chosen": -2.340733766555786, "logits/rejected": -2.3486671447753906, "logps/chosen": -198.47938537597656, "logps/rejected": -248.7616729736328, "loss": 0.6896, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.15433421730995178, "rewards/margins": 0.06829866766929626, "rewards/margins_max": 0.23929543793201447, "rewards/margins_min": -0.088859423995018, "rewards/margins_std": 0.1427856981754303, "rewards/rejected": 0.08603556454181671, "step": 2740 }, { "dpo_losses": 0.6630204319953918, "epoch": 0.72, "grad_norm": 2.639796019678925, "learning_rate": 1.1046703517675848e-06, "logits/chosen": -2.3711884021759033, "logits/rejected": -2.2541847229003906, "logps/chosen": -240.79312133789062, "logps/rejected": -219.089111328125, "loss": 0.6817, "positive_losses": 0.0, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.17425379157066345, "rewards/margins": 0.06687882542610168, "rewards/margins_max": 0.2347218096256256, "rewards/margins_min": -0.08911930024623871, "rewards/margins_std": 0.1443357616662979, "rewards/rejected": 0.10737496614456177, "step": 2750 }, { "dpo_losses": 0.6397862434387207, "epoch": 0.72, "grad_norm": 2.3382645565294835, "learning_rate": 1.085773492015028e-06, "logits/chosen": -2.4397153854370117, "logits/rejected": -2.1719274520874023, "logps/chosen": -341.91192626953125, "logps/rejected": -298.88702392578125, "loss": 0.651, "positive_losses": 0.0, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.19096066057682037, "rewards/margins": 0.11804171651601791, "rewards/margins_max": 0.3371199071407318, "rewards/margins_min": -0.05634661763906479, "rewards/margins_std": 0.17567841708660126, "rewards/rejected": 0.07291895896196365, "step": 2760 }, { "dpo_losses": 0.6509621143341064, "epoch": 0.72, "grad_norm": 2.4352195048630083, "learning_rate": 1.0669947197689034e-06, "logits/chosen": -2.5231716632843018, "logits/rejected": -2.2585768699645996, "logps/chosen": -281.86865234375, "logps/rejected": -260.9451599121094, "loss": 0.6804, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1748836636543274, "rewards/margins": 0.09597839415073395, "rewards/margins_max": 0.32797059416770935, "rewards/margins_min": -0.04887733608484268, "rewards/margins_std": 0.17323651909828186, "rewards/rejected": 0.07890526950359344, "step": 2770 }, { "dpo_losses": 0.6601817011833191, "epoch": 0.73, "grad_norm": 10.844167915667994, "learning_rate": 1.048335603051291e-06, "logits/chosen": -2.370856523513794, "logits/rejected": -2.1673877239227295, "logps/chosen": -301.0946960449219, "logps/rejected": -210.18258666992188, "loss": 0.6958, "positive_losses": 0.4529266357421875, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.18078163266181946, "rewards/margins": 0.07245569676160812, "rewards/margins_max": 0.24224159121513367, "rewards/margins_min": -0.06918230652809143, "rewards/margins_std": 0.1352653056383133, "rewards/rejected": 0.10832594335079193, "step": 2780 }, { "dpo_losses": 0.6436033248901367, "epoch": 0.73, "grad_norm": 1.7966775536900308, "learning_rate": 1.0297976998930665e-06, "logits/chosen": -2.514495372772217, "logits/rejected": -2.2174785137176514, "logps/chosen": -327.05224609375, "logps/rejected": -272.4554443359375, "loss": 0.6733, "positive_losses": 0.061385344713926315, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.19684699177742004, "rewards/margins": 0.10876120626926422, "rewards/margins_max": 0.28177350759506226, "rewards/margins_min": -0.016671407967805862, "rewards/margins_std": 0.13238796591758728, "rewards/rejected": 0.08808580785989761, "step": 2790 }, { "dpo_losses": 0.6563596725463867, "epoch": 0.73, "grad_norm": 2.22692996556542, "learning_rate": 1.0113825582038078e-06, "logits/chosen": -2.367077112197876, "logits/rejected": -2.2473807334899902, "logps/chosen": -218.83718872070312, "logps/rejected": -223.6019287109375, "loss": 0.6669, "positive_losses": 0.19268837571144104, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.17192740738391876, "rewards/margins": 0.08053543418645859, "rewards/margins_max": 0.25530171394348145, "rewards/margins_min": -0.05144239217042923, "rewards/margins_std": 0.1355515420436859, "rewards/rejected": 0.09139196574687958, "step": 2800 }, { "epoch": 0.73, "eval_dpo_losses": 0.6557942628860474, "eval_logits/chosen": -2.241514205932617, "eval_logits/rejected": -2.137483835220337, "eval_logps/chosen": -258.3899841308594, "eval_logps/rejected": -253.72315979003906, "eval_loss": 0.6795781254768372, "eval_positive_losses": 0.1073906272649765, "eval_rewards/accuracies": 0.692460298538208, "eval_rewards/chosen": 0.1738419085741043, "eval_rewards/margins": 0.08287054300308228, "eval_rewards/margins_max": 0.3400602638721466, "eval_rewards/margins_min": -0.12301057577133179, "eval_rewards/margins_std": 0.155066579580307, "eval_rewards/rejected": 0.09097136557102203, "eval_runtime": 386.5201, "eval_samples_per_second": 5.174, "eval_steps_per_second": 0.163, "step": 2800 }, { "dpo_losses": 0.6442911028862, "epoch": 0.74, "grad_norm": 2.4783170724252797, "learning_rate": 9.930917156425477e-07, "logits/chosen": -2.1939587593078613, "logits/rejected": -2.177405595779419, "logps/chosen": -204.1696014404297, "logps/rejected": -202.10458374023438, "loss": 0.6903, "positive_losses": 0.0, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.16616372764110565, "rewards/margins": 0.1043115109205246, "rewards/margins_max": 0.2585856318473816, "rewards/margins_min": -0.01893065869808197, "rewards/margins_std": 0.12061973661184311, "rewards/rejected": 0.06185222789645195, "step": 2810 }, { "dpo_losses": 0.6672787666320801, "epoch": 0.74, "grad_norm": 8.874511705317229, "learning_rate": 9.749266994893756e-07, "logits/chosen": -2.285464286804199, "logits/rejected": -2.196565628051758, "logps/chosen": -215.01611328125, "logps/rejected": -251.84860229492188, "loss": 0.6637, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.16061432659626007, "rewards/margins": 0.05914852023124695, "rewards/margins_max": 0.2060922086238861, "rewards/margins_min": -0.11268939077854156, "rewards/margins_std": 0.13965968787670135, "rewards/rejected": 0.10146580636501312, "step": 2820 }, { "dpo_losses": 0.646509051322937, "epoch": 0.74, "grad_norm": 13.984493932804313, "learning_rate": 9.56889026517913e-07, "logits/chosen": -2.163336992263794, "logits/rejected": -2.107482433319092, "logps/chosen": -256.30706787109375, "logps/rejected": -312.94549560546875, "loss": 0.7015, "positive_losses": 1.3790088891983032, "rewards/accuracies": 0.75, "rewards/chosen": 0.16067567467689514, "rewards/margins": 0.10382004082202911, "rewards/margins_max": 0.3095715641975403, "rewards/margins_min": -0.06091379001736641, "rewards/margins_std": 0.1614278256893158, "rewards/rejected": 0.056855618953704834, "step": 2830 }, { "dpo_losses": 0.6709100604057312, "epoch": 0.74, "grad_norm": 2.403021789636137, "learning_rate": 9.389802028686617e-07, "logits/chosen": -2.2739999294281006, "logits/rejected": -2.244316816329956, "logps/chosen": -182.1898651123047, "logps/rejected": -175.9180450439453, "loss": 0.7041, "positive_losses": 0.030075836926698685, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.14807407557964325, "rewards/margins": 0.047214582562446594, "rewards/margins_max": 0.14521318674087524, "rewards/margins_min": -0.04553777724504471, "rewards/margins_std": 0.08190377056598663, "rewards/rejected": 0.10085948556661606, "step": 2840 }, { "dpo_losses": 0.6599880456924438, "epoch": 0.75, "grad_norm": 2.1808026923579322, "learning_rate": 9.212017239232427e-07, "logits/chosen": -2.2557995319366455, "logits/rejected": -2.253403663635254, "logps/chosen": -252.77685546875, "logps/rejected": -237.7715606689453, "loss": 0.6966, "positive_losses": 0.05833854526281357, "rewards/accuracies": 0.75, "rewards/chosen": 0.15123063325881958, "rewards/margins": 0.07374230027198792, "rewards/margins_max": 0.23325583338737488, "rewards/margins_min": -0.1191643625497818, "rewards/margins_std": 0.15287819504737854, "rewards/rejected": 0.07748834043741226, "step": 2850 }, { "dpo_losses": 0.6836689114570618, "epoch": 0.75, "grad_norm": 6.061106413503276, "learning_rate": 9.03555074179533e-07, "logits/chosen": -2.2998619079589844, "logits/rejected": -2.207089900970459, "logps/chosen": -231.2566680908203, "logps/rejected": -227.62179565429688, "loss": 0.662, "positive_losses": 0.009832000359892845, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.1641579568386078, "rewards/margins": 0.024136612191796303, "rewards/margins_max": 0.17477421462535858, "rewards/margins_min": -0.10811509937047958, "rewards/margins_std": 0.126048743724823, "rewards/rejected": 0.14002135396003723, "step": 2860 }, { "dpo_losses": 0.6449126601219177, "epoch": 0.75, "grad_norm": 2.810213134122018, "learning_rate": 8.860417271277067e-07, "logits/chosen": -2.1638543605804443, "logits/rejected": -2.040700674057007, "logps/chosen": -264.4967041015625, "logps/rejected": -250.8966064453125, "loss": 0.6774, "positive_losses": 0.11832503974437714, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.16353316605091095, "rewards/margins": 0.10419987142086029, "rewards/margins_max": 0.26601141691207886, "rewards/margins_min": -0.03732285648584366, "rewards/margins_std": 0.13528510928153992, "rewards/rejected": 0.05933328717947006, "step": 2870 }, { "dpo_losses": 0.652980625629425, "epoch": 0.75, "grad_norm": 2.595149019111713, "learning_rate": 8.686631451272029e-07, "logits/chosen": -2.4004759788513184, "logits/rejected": -2.070020914077759, "logps/chosen": -278.42523193359375, "logps/rejected": -235.72659301757812, "loss": 0.6804, "positive_losses": 0.45129984617233276, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.17295101284980774, "rewards/margins": 0.09022224694490433, "rewards/margins_max": 0.25402548909187317, "rewards/margins_min": -0.06448952108621597, "rewards/margins_std": 0.14810165762901306, "rewards/rejected": 0.08272875845432281, "step": 2880 }, { "dpo_losses": 0.6570742726325989, "epoch": 0.76, "grad_norm": 2.0081893626589067, "learning_rate": 8.514207792846168e-07, "logits/chosen": -2.248448371887207, "logits/rejected": -2.2126903533935547, "logps/chosen": -258.3336486816406, "logps/rejected": -286.007080078125, "loss": 0.6548, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.15458668768405914, "rewards/margins": 0.0792255848646164, "rewards/margins_max": 0.25035232305526733, "rewards/margins_min": -0.07467710971832275, "rewards/margins_std": 0.1483931541442871, "rewards/rejected": 0.07536109536886215, "step": 2890 }, { "dpo_losses": 0.6553142666816711, "epoch": 0.76, "grad_norm": 13.861558056295364, "learning_rate": 8.343160693325356e-07, "logits/chosen": -2.3606836795806885, "logits/rejected": -2.1785192489624023, "logps/chosen": -273.04254150390625, "logps/rejected": -247.1999053955078, "loss": 0.6942, "positive_losses": 0.5457504391670227, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15291614830493927, "rewards/margins": 0.0847366526722908, "rewards/margins_max": 0.234330415725708, "rewards/margins_min": -0.08198666572570801, "rewards/margins_std": 0.1436898410320282, "rewards/rejected": 0.06817950308322906, "step": 2900 }, { "epoch": 0.76, "eval_dpo_losses": 0.6533595323562622, "eval_logits/chosen": -2.2419686317443848, "eval_logits/rejected": -2.137899160385132, "eval_logps/chosen": -258.7987976074219, "eval_logps/rejected": -254.67906188964844, "eval_loss": 0.6826818585395813, "eval_positive_losses": 0.13755901157855988, "eval_rewards/accuracies": 0.6984127163887024, "eval_rewards/chosen": 0.16975350677967072, "eval_rewards/margins": 0.08834144473075867, "eval_rewards/margins_max": 0.35349351167678833, "eval_rewards/margins_min": -0.12288575619459152, "eval_rewards/margins_std": 0.1597186177968979, "eval_rewards/rejected": 0.08141203969717026, "eval_runtime": 386.8244, "eval_samples_per_second": 5.17, "eval_steps_per_second": 0.163, "step": 2900 }, { "dpo_losses": 0.6503573060035706, "epoch": 0.76, "grad_norm": 2.4431464501173874, "learning_rate": 8.173504435093174e-07, "logits/chosen": -2.2749648094177246, "logits/rejected": -2.1129987239837646, "logps/chosen": -244.6068572998047, "logps/rejected": -213.86056518554688, "loss": 0.6828, "positive_losses": 0.20812682807445526, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1681584119796753, "rewards/margins": 0.09518470615148544, "rewards/margins_max": 0.30862218141555786, "rewards/margins_min": -0.055295854806900024, "rewards/margins_std": 0.16082781553268433, "rewards/rejected": 0.07297371327877045, "step": 2910 }, { "dpo_losses": 0.6556156873703003, "epoch": 0.76, "grad_norm": 24.8059952903275, "learning_rate": 8.00525318439836e-07, "logits/chosen": -2.3019073009490967, "logits/rejected": -2.1130080223083496, "logps/chosen": -248.23806762695312, "logps/rejected": -195.24069213867188, "loss": 0.6835, "positive_losses": 1.1095945835113525, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.16012874245643616, "rewards/margins": 0.0838567391037941, "rewards/margins_max": 0.27189338207244873, "rewards/margins_min": -0.0858188271522522, "rewards/margins_std": 0.15910735726356506, "rewards/rejected": 0.07627199590206146, "step": 2920 }, { "dpo_losses": 0.6502605676651001, "epoch": 0.77, "grad_norm": 20.973905063280792, "learning_rate": 7.838420990171927e-07, "logits/chosen": -2.2429561614990234, "logits/rejected": -2.136467695236206, "logps/chosen": -277.0298156738281, "logps/rejected": -236.4407501220703, "loss": 0.7029, "positive_losses": 0.0, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.18447257578372955, "rewards/margins": 0.09331153333187103, "rewards/margins_max": 0.2775861918926239, "rewards/margins_min": -0.05593902990221977, "rewards/margins_std": 0.15071845054626465, "rewards/rejected": 0.09116105735301971, "step": 2930 }, { "dpo_losses": 0.6580758094787598, "epoch": 0.77, "grad_norm": 1.8912407368939272, "learning_rate": 7.673021782854084e-07, "logits/chosen": -2.439190626144409, "logits/rejected": -2.4232096672058105, "logps/chosen": -269.99078369140625, "logps/rejected": -273.3539123535156, "loss": 0.6932, "positive_losses": 0.0600738525390625, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1767464578151703, "rewards/margins": 0.07703068107366562, "rewards/margins_max": 0.25914904475212097, "rewards/margins_min": -0.07336901873350143, "rewards/margins_std": 0.14645086228847504, "rewards/rejected": 0.09971576184034348, "step": 2940 }, { "dpo_losses": 0.6559703946113586, "epoch": 0.77, "grad_norm": 13.642280469417495, "learning_rate": 7.509069373231039e-07, "logits/chosen": -2.5095577239990234, "logits/rejected": -2.394094944000244, "logps/chosen": -258.40924072265625, "logps/rejected": -281.8453063964844, "loss": 0.6839, "positive_losses": 1.1155738830566406, "rewards/accuracies": 0.75, "rewards/chosen": 0.14290133118629456, "rewards/margins": 0.08364812284708023, "rewards/margins_max": 0.2652481198310852, "rewards/margins_min": -0.0900956392288208, "rewards/margins_std": 0.15815143287181854, "rewards/rejected": 0.05925322324037552, "step": 2950 }, { "dpo_losses": 0.6857680082321167, "epoch": 0.77, "grad_norm": 10.9300944516358, "learning_rate": 7.346577451281822e-07, "logits/chosen": -2.139549493789673, "logits/rejected": -2.1975948810577393, "logps/chosen": -224.23046875, "logps/rejected": -219.50631713867188, "loss": 0.7089, "positive_losses": 1.1742143630981445, "rewards/accuracies": 0.625, "rewards/chosen": 0.12446574866771698, "rewards/margins": 0.021695952862501144, "rewards/margins_max": 0.18259833753108978, "rewards/margins_min": -0.1411958932876587, "rewards/margins_std": 0.14416971802711487, "rewards/rejected": 0.10276981443166733, "step": 2960 }, { "dpo_losses": 0.6511141657829285, "epoch": 0.78, "grad_norm": 19.545237179085824, "learning_rate": 7.185559585035138e-07, "logits/chosen": -2.404660701751709, "logits/rejected": -2.2767269611358643, "logps/chosen": -206.1131591796875, "logps/rejected": -189.3387451171875, "loss": 0.7102, "positive_losses": 0.090673066675663, "rewards/accuracies": 0.625, "rewards/chosen": 0.16719654202461243, "rewards/margins": 0.09269991517066956, "rewards/margins_max": 0.3110290467739105, "rewards/margins_min": -0.052907079458236694, "rewards/margins_std": 0.16249682009220123, "rewards/rejected": 0.07449664920568466, "step": 2970 }, { "dpo_losses": 0.6411671042442322, "epoch": 0.78, "grad_norm": 2.2137110267624736, "learning_rate": 7.026029219436504e-07, "logits/chosen": -2.2068686485290527, "logits/rejected": -2.166290521621704, "logps/chosen": -280.74774169921875, "logps/rejected": -293.70086669921875, "loss": 0.656, "positive_losses": 0.03380317613482475, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1751025766134262, "rewards/margins": 0.11968563497066498, "rewards/margins_max": 0.3710842430591583, "rewards/margins_min": -0.06859045475721359, "rewards/margins_std": 0.19778503477573395, "rewards/rejected": 0.055416930466890335, "step": 2980 }, { "dpo_losses": 0.6529301404953003, "epoch": 0.78, "grad_norm": 8.697121540814704, "learning_rate": 6.867999675225523e-07, "logits/chosen": -2.355289936065674, "logits/rejected": -2.2426371574401855, "logps/chosen": -277.53985595703125, "logps/rejected": -297.69012451171875, "loss": 0.6792, "positive_losses": 0.2433311492204666, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1514471471309662, "rewards/margins": 0.08627411723136902, "rewards/margins_max": 0.21687212586402893, "rewards/margins_min": -0.0320710688829422, "rewards/margins_std": 0.11254825443029404, "rewards/rejected": 0.06517302989959717, "step": 2990 }, { "dpo_losses": 0.6534997224807739, "epoch": 0.79, "grad_norm": 2.734847109773537, "learning_rate": 6.711484147823663e-07, "logits/chosen": -2.3748514652252197, "logits/rejected": -2.25462007522583, "logps/chosen": -215.39404296875, "logps/rejected": -248.48434448242188, "loss": 0.6631, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1545630395412445, "rewards/margins": 0.09078287333250046, "rewards/margins_max": 0.2895895838737488, "rewards/margins_min": -0.05770108103752136, "rewards/margins_std": 0.16436031460762024, "rewards/rejected": 0.06378016620874405, "step": 3000 }, { "epoch": 0.79, "eval_dpo_losses": 0.6525840163230896, "eval_logits/chosen": -2.2451510429382324, "eval_logits/rejected": -2.1417453289031982, "eval_logps/chosen": -258.98388671875, "eval_logps/rejected": -255.03399658203125, "eval_loss": 0.6842887997627258, "eval_positive_losses": 0.15195737779140472, "eval_rewards/accuracies": 0.692460298538208, "eval_rewards/chosen": 0.16790270805358887, "eval_rewards/margins": 0.09003976732492447, "eval_rewards/margins_max": 0.3580784797668457, "eval_rewards/margins_min": -0.1202986016869545, "eval_rewards/margins_std": 0.1604534536600113, "eval_rewards/rejected": 0.0778629407286644, "eval_runtime": 387.0341, "eval_samples_per_second": 5.168, "eval_steps_per_second": 0.163, "step": 3000 }, { "dpo_losses": 0.6444215774536133, "epoch": 0.79, "grad_norm": 2.149106341202291, "learning_rate": 6.556495706232413e-07, "logits/chosen": -2.4604363441467285, "logits/rejected": -2.2537124156951904, "logps/chosen": -278.8143615722656, "logps/rejected": -248.9881591796875, "loss": 0.6802, "positive_losses": 0.5926286578178406, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.14932508766651154, "rewards/margins": 0.10600350797176361, "rewards/margins_max": 0.2735843062400818, "rewards/margins_min": -0.050291866064071655, "rewards/margins_std": 0.13988161087036133, "rewards/rejected": 0.043321575969457626, "step": 3010 }, { "dpo_losses": 0.6592887043952942, "epoch": 0.79, "grad_norm": 17.807857296545833, "learning_rate": 6.403047291942057e-07, "logits/chosen": -2.311393976211548, "logits/rejected": -2.181124210357666, "logps/chosen": -204.22738647460938, "logps/rejected": -178.9779510498047, "loss": 0.6998, "positive_losses": 0.5409892797470093, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.15512463450431824, "rewards/margins": 0.07736720144748688, "rewards/margins_max": 0.2487223893404007, "rewards/margins_min": -0.12204670906066895, "rewards/margins_std": 0.16226565837860107, "rewards/rejected": 0.07775743305683136, "step": 3020 }, { "dpo_losses": 0.6397817134857178, "epoch": 0.79, "grad_norm": 2.1200470141829855, "learning_rate": 6.251151717851023e-07, "logits/chosen": -2.236409902572632, "logits/rejected": -2.2558975219726562, "logps/chosen": -200.06234741210938, "logps/rejected": -277.60626220703125, "loss": 0.6689, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.18225884437561035, "rewards/margins": 0.11746920645236969, "rewards/margins_max": 0.29077091813087463, "rewards/margins_min": -0.0012919344007968903, "rewards/margins_std": 0.13081949949264526, "rewards/rejected": 0.06478965282440186, "step": 3030 }, { "dpo_losses": 0.654072105884552, "epoch": 0.8, "grad_norm": 2.761540559580292, "learning_rate": 6.100821667196041e-07, "logits/chosen": -2.2519009113311768, "logits/rejected": -2.144467830657959, "logps/chosen": -273.92266845703125, "logps/rejected": -291.9794006347656, "loss": 0.6637, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.19264277815818787, "rewards/margins": 0.08743550628423691, "rewards/margins_max": 0.27152904868125916, "rewards/margins_min": -0.08487816154956818, "rewards/margins_std": 0.16320733726024628, "rewards/rejected": 0.10520727932453156, "step": 3040 }, { "dpo_losses": 0.6647396683692932, "epoch": 0.8, "grad_norm": 23.810293020266204, "learning_rate": 5.952069692493062e-07, "logits/chosen": -2.2673792839050293, "logits/rejected": -2.1899237632751465, "logps/chosen": -267.8157958984375, "logps/rejected": -270.7992248535156, "loss": 0.6767, "positive_losses": 0.3106445372104645, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1567973792552948, "rewards/margins": 0.06685099750757217, "rewards/margins_max": 0.2384394407272339, "rewards/margins_min": -0.11775145679712296, "rewards/margins_std": 0.1661355197429657, "rewards/rejected": 0.08994637429714203, "step": 3050 }, { "dpo_losses": 0.6583305597305298, "epoch": 0.8, "grad_norm": 10.689039331418575, "learning_rate": 5.80490821448918e-07, "logits/chosen": -2.3916077613830566, "logits/rejected": -2.321349859237671, "logps/chosen": -276.8563537597656, "logps/rejected": -264.2248229980469, "loss": 0.675, "positive_losses": 0.14002494513988495, "rewards/accuracies": 0.75, "rewards/chosen": 0.15911553800106049, "rewards/margins": 0.07481556385755539, "rewards/margins_max": 0.21512219309806824, "rewards/margins_min": -0.03177332133054733, "rewards/margins_std": 0.10979966819286346, "rewards/rejected": 0.08429998904466629, "step": 3060 }, { "dpo_losses": 0.6622790098190308, "epoch": 0.8, "grad_norm": 8.861515600631439, "learning_rate": 5.659349521125459e-07, "logits/chosen": -2.180570125579834, "logits/rejected": -2.0564029216766357, "logps/chosen": -221.5233612060547, "logps/rejected": -207.24246215820312, "loss": 0.678, "positive_losses": 0.3228427767753601, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.14501142501831055, "rewards/margins": 0.06833904981613159, "rewards/margins_max": 0.23418673872947693, "rewards/margins_min": -0.0759841725230217, "rewards/margins_std": 0.13753478229045868, "rewards/rejected": 0.07667239010334015, "step": 3070 }, { "dpo_losses": 0.658730149269104, "epoch": 0.81, "grad_norm": 2.537576187568532, "learning_rate": 5.5154057665109e-07, "logits/chosen": -2.259204149246216, "logits/rejected": -2.2626352310180664, "logps/chosen": -213.51614379882812, "logps/rejected": -253.5718536376953, "loss": 0.6807, "positive_losses": 0.21948127448558807, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.16080555319786072, "rewards/margins": 0.07590626180171967, "rewards/margins_max": 0.21234968304634094, "rewards/margins_min": -0.06352119892835617, "rewards/margins_std": 0.12795665860176086, "rewards/rejected": 0.08489931374788284, "step": 3080 }, { "dpo_losses": 0.6643017530441284, "epoch": 0.81, "grad_norm": 6.8578732873530575, "learning_rate": 5.373088969907586e-07, "logits/chosen": -2.448826789855957, "logits/rejected": -2.2087972164154053, "logps/chosen": -287.5294494628906, "logps/rejected": -251.1128387451172, "loss": 0.7028, "positive_losses": 0.8749073147773743, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.13391676545143127, "rewards/margins": 0.06617481261491776, "rewards/margins_max": 0.21680596470832825, "rewards/margins_min": -0.10882861912250519, "rewards/margins_std": 0.14915981888771057, "rewards/rejected": 0.06774194538593292, "step": 3090 }, { "dpo_losses": 0.6736847758293152, "epoch": 0.81, "grad_norm": 2.3103746493491535, "learning_rate": 5.23241101472709e-07, "logits/chosen": -2.319197177886963, "logits/rejected": -2.3016042709350586, "logps/chosen": -265.5721435546875, "logps/rejected": -271.1224670410156, "loss": 0.6838, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.17576611042022705, "rewards/margins": 0.04620806872844696, "rewards/margins_max": 0.21499133110046387, "rewards/margins_min": -0.09205774962902069, "rewards/margins_std": 0.13699552416801453, "rewards/rejected": 0.12955807149410248, "step": 3100 }, { "epoch": 0.81, "eval_dpo_losses": 0.6537554264068604, "eval_logits/chosen": -2.2381534576416016, "eval_logits/rejected": -2.1339519023895264, "eval_logps/chosen": -258.3243408203125, "eval_logps/rejected": -254.1182403564453, "eval_loss": 0.6810853481292725, "eval_positive_losses": 0.1294495016336441, "eval_rewards/accuracies": 0.6904761791229248, "eval_rewards/chosen": 0.17449788749217987, "eval_rewards/margins": 0.08747726678848267, "eval_rewards/margins_max": 0.35538461804389954, "eval_rewards/margins_min": -0.1213645339012146, "eval_rewards/margins_std": 0.15946441888809204, "eval_rewards/rejected": 0.0870206207036972, "eval_runtime": 399.0593, "eval_samples_per_second": 5.012, "eval_steps_per_second": 0.158, "step": 3100 }, { "dpo_losses": 0.662432074546814, "epoch": 0.81, "grad_norm": 23.514902137976115, "learning_rate": 5.09338364753818e-07, "logits/chosen": -2.2971441745758057, "logits/rejected": -2.123034715652466, "logps/chosen": -306.0448303222656, "logps/rejected": -271.21490478515625, "loss": 0.6809, "positive_losses": 0.39984816312789917, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.16501955687999725, "rewards/margins": 0.06859545409679413, "rewards/margins_max": 0.2637186646461487, "rewards/margins_min": -0.09291068464517593, "rewards/margins_std": 0.1540498286485672, "rewards/rejected": 0.09642409533262253, "step": 3110 }, { "dpo_losses": 0.6436980962753296, "epoch": 0.82, "grad_norm": 2.105578474536834, "learning_rate": 4.956018477086005e-07, "logits/chosen": -2.3031558990478516, "logits/rejected": -2.166726589202881, "logps/chosen": -243.03384399414062, "logps/rejected": -247.81600952148438, "loss": 0.6877, "positive_losses": 0.04438161849975586, "rewards/accuracies": 0.75, "rewards/chosen": 0.1725742369890213, "rewards/margins": 0.10747237503528595, "rewards/margins_max": 0.29282379150390625, "rewards/margins_min": -0.04991322010755539, "rewards/margins_std": 0.14813265204429626, "rewards/rejected": 0.06510186195373535, "step": 3120 }, { "dpo_losses": 0.661555826663971, "epoch": 0.82, "grad_norm": 13.286299591531778, "learning_rate": 4.820326973322764e-07, "logits/chosen": -2.3249430656433105, "logits/rejected": -2.3343217372894287, "logps/chosen": -230.4161834716797, "logps/rejected": -221.0846405029297, "loss": 0.6711, "positive_losses": 0.2990850508213043, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1598578244447708, "rewards/margins": 0.07453365623950958, "rewards/margins_max": 0.2871330678462982, "rewards/margins_min": -0.08729489892721176, "rewards/margins_std": 0.16927851736545563, "rewards/rejected": 0.08532416075468063, "step": 3130 }, { "dpo_losses": 0.6583113670349121, "epoch": 0.82, "grad_norm": 2.01552717577689, "learning_rate": 4.686320466449981e-07, "logits/chosen": -2.1339659690856934, "logits/rejected": -2.1851744651794434, "logps/chosen": -195.4499969482422, "logps/rejected": -204.33255004882812, "loss": 0.6718, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1526738703250885, "rewards/margins": 0.07782270014286041, "rewards/margins_max": 0.2588769793510437, "rewards/margins_min": -0.07749754190444946, "rewards/margins_std": 0.15506020188331604, "rewards/rejected": 0.07485117018222809, "step": 3140 }, { "dpo_losses": 0.6674398183822632, "epoch": 0.82, "grad_norm": 17.33038882163307, "learning_rate": 4.554010145972418e-07, "logits/chosen": -2.2771804332733154, "logits/rejected": -2.2395780086517334, "logps/chosen": -261.2558898925781, "logps/rejected": -277.98828125, "loss": 0.6687, "positive_losses": 0.09732933342456818, "rewards/accuracies": 0.625, "rewards/chosen": 0.1672058403491974, "rewards/margins": 0.0582907609641552, "rewards/margins_max": 0.23328760266304016, "rewards/margins_min": -0.09914519637823105, "rewards/margins_std": 0.1508348286151886, "rewards/rejected": 0.1089150682091713, "step": 3150 }, { "dpo_losses": 0.6414269804954529, "epoch": 0.83, "grad_norm": 5.42217717716566, "learning_rate": 4.4234070597637455e-07, "logits/chosen": -2.4382681846618652, "logits/rejected": -2.1755149364471436, "logps/chosen": -294.73699951171875, "logps/rejected": -212.7322540283203, "loss": 0.689, "positive_losses": 0.05511055141687393, "rewards/accuracies": 0.75, "rewards/chosen": 0.2085982859134674, "rewards/margins": 0.11408412456512451, "rewards/margins_max": 0.2852762043476105, "rewards/margins_min": -0.06618374586105347, "rewards/margins_std": 0.15638698637485504, "rewards/rejected": 0.0945141464471817, "step": 3160 }, { "dpo_losses": 0.6603347659111023, "epoch": 0.83, "grad_norm": 11.442776489517852, "learning_rate": 4.2945221131440783e-07, "logits/chosen": -2.418555498123169, "logits/rejected": -2.2951455116271973, "logps/chosen": -292.9923400878906, "logps/rejected": -282.74896240234375, "loss": 0.6788, "positive_losses": 0.5385681390762329, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.16707170009613037, "rewards/margins": 0.07161282002925873, "rewards/margins_max": 0.2092454731464386, "rewards/margins_min": -0.06386003643274307, "rewards/margins_std": 0.12432940304279327, "rewards/rejected": 0.09545888751745224, "step": 3170 }, { "dpo_losses": 0.6621330976486206, "epoch": 0.83, "grad_norm": 24.583675817485446, "learning_rate": 4.167366067969381e-07, "logits/chosen": -2.1327767372131348, "logits/rejected": -2.1718926429748535, "logps/chosen": -252.2990264892578, "logps/rejected": -229.47988891601562, "loss": 0.6997, "positive_losses": 0.4635990262031555, "rewards/accuracies": 0.75, "rewards/chosen": 0.17453351616859436, "rewards/margins": 0.07015625387430191, "rewards/margins_max": 0.22704768180847168, "rewards/margins_min": -0.13893255591392517, "rewards/margins_std": 0.1603839248418808, "rewards/rejected": 0.10437728464603424, "step": 3180 }, { "dpo_losses": 0.664763331413269, "epoch": 0.83, "grad_norm": 2.367027755735733, "learning_rate": 4.041949541732826e-07, "logits/chosen": -2.175737142562866, "logits/rejected": -2.022587537765503, "logps/chosen": -219.670654296875, "logps/rejected": -214.9852294921875, "loss": 0.6661, "positive_losses": 0.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.14674416184425354, "rewards/margins": 0.06417163461446762, "rewards/margins_max": 0.24690458178520203, "rewards/margins_min": -0.09494888037443161, "rewards/margins_std": 0.15239901840686798, "rewards/rejected": 0.08257253468036652, "step": 3190 }, { "dpo_losses": 0.6554974913597107, "epoch": 0.84, "grad_norm": 6.677644972803848, "learning_rate": 3.9182830066782614e-07, "logits/chosen": -2.1366796493530273, "logits/rejected": -2.1481008529663086, "logps/chosen": -245.388671875, "logps/rejected": -234.8861083984375, "loss": 0.6598, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.17017970979213715, "rewards/margins": 0.08330049365758896, "rewards/margins_max": 0.2756052613258362, "rewards/margins_min": -0.06858067214488983, "rewards/margins_std": 0.15181994438171387, "rewards/rejected": 0.08687921613454819, "step": 3200 }, { "epoch": 0.84, "eval_dpo_losses": 0.6535687446594238, "eval_logits/chosen": -2.238140344619751, "eval_logits/rejected": -2.1344997882843018, "eval_logps/chosen": -258.42059326171875, "eval_logps/rejected": -254.2526092529297, "eval_loss": 0.6810405254364014, "eval_positive_losses": 0.13592833280563354, "eval_rewards/accuracies": 0.6944444179534912, "eval_rewards/chosen": 0.1735353022813797, "eval_rewards/margins": 0.08785834908485413, "eval_rewards/margins_max": 0.3550361096858978, "eval_rewards/margins_min": -0.1213151291012764, "eval_rewards/margins_std": 0.15950633585453033, "eval_rewards/rejected": 0.08567693084478378, "eval_runtime": 386.483, "eval_samples_per_second": 5.175, "eval_steps_per_second": 0.163, "step": 3200 }, { "dpo_losses": 0.6457203030586243, "epoch": 0.84, "grad_norm": 11.165854201485354, "learning_rate": 3.796376788925771e-07, "logits/chosen": -2.283282995223999, "logits/rejected": -2.2262542247772217, "logps/chosen": -232.2800750732422, "logps/rejected": -265.31207275390625, "loss": 0.6818, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1805500090122223, "rewards/margins": 0.10865093767642975, "rewards/margins_max": 0.3225840628147125, "rewards/margins_min": -0.04238703101873398, "rewards/margins_std": 0.1681584119796753, "rewards/rejected": 0.07189907878637314, "step": 3210 }, { "dpo_losses": 0.665763258934021, "epoch": 0.84, "grad_norm": 13.125204611500347, "learning_rate": 3.676241067609465e-07, "logits/chosen": -2.2513158321380615, "logits/rejected": -2.1509430408477783, "logps/chosen": -225.428466796875, "logps/rejected": -241.76535034179688, "loss": 0.6756, "positive_losses": 0.5663833618164062, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1532856523990631, "rewards/margins": 0.06156473606824875, "rewards/margins_max": 0.21126067638397217, "rewards/margins_min": -0.0775647684931755, "rewards/margins_std": 0.12892693281173706, "rewards/rejected": 0.09172092378139496, "step": 3220 }, { "dpo_losses": 0.662002444267273, "epoch": 0.85, "grad_norm": 2.4972427422011987, "learning_rate": 3.5578858740274976e-07, "logits/chosen": -2.31249737739563, "logits/rejected": -2.2492175102233887, "logps/chosen": -286.94110107421875, "logps/rejected": -301.15325927734375, "loss": 0.7051, "positive_losses": 0.4719085693359375, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.1620463877916336, "rewards/margins": 0.07036799192428589, "rewards/margins_max": 0.2971928119659424, "rewards/margins_min": -0.07245415449142456, "rewards/margins_std": 0.16775697469711304, "rewards/rejected": 0.09167838841676712, "step": 3230 }, { "dpo_losses": 0.6361449956893921, "epoch": 0.85, "grad_norm": 17.08639474125265, "learning_rate": 3.44132109080447e-07, "logits/chosen": -2.435424566268921, "logits/rejected": -2.238687038421631, "logps/chosen": -236.94552612304688, "logps/rejected": -291.427490234375, "loss": 0.6722, "positive_losses": 0.03195800632238388, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.17111745476722717, "rewards/margins": 0.12512411177158356, "rewards/margins_max": 0.3150358498096466, "rewards/margins_min": -0.03759292885661125, "rewards/margins_std": 0.15489307045936584, "rewards/rejected": 0.04599333927035332, "step": 3240 }, { "dpo_losses": 0.6538858413696289, "epoch": 0.85, "grad_norm": 2.425001624195302, "learning_rate": 3.3265564510662344e-07, "logits/chosen": -2.4212875366210938, "logits/rejected": -2.3288378715515137, "logps/chosen": -266.2784423828125, "logps/rejected": -269.1116943359375, "loss": 0.6863, "positive_losses": 0.24813231825828552, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.17173567414283752, "rewards/margins": 0.08545049279928207, "rewards/margins_max": 0.223008394241333, "rewards/margins_min": -0.04507576674222946, "rewards/margins_std": 0.1168411374092102, "rewards/rejected": 0.08628518879413605, "step": 3250 }, { "dpo_losses": 0.6383971571922302, "epoch": 0.85, "grad_norm": 10.425349489935803, "learning_rate": 3.213601537627195e-07, "logits/chosen": -2.315153121948242, "logits/rejected": -2.2009711265563965, "logps/chosen": -233.04861450195312, "logps/rejected": -264.72393798828125, "loss": 0.673, "positive_losses": 0.10766830295324326, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.16209791600704193, "rewards/margins": 0.12161654233932495, "rewards/margins_max": 0.319131076335907, "rewards/margins_min": -0.04576132819056511, "rewards/margins_std": 0.16405105590820312, "rewards/rejected": 0.04048134759068489, "step": 3260 }, { "dpo_losses": 0.6697072386741638, "epoch": 0.86, "grad_norm": 6.385803642391433, "learning_rate": 3.1024657821901063e-07, "logits/chosen": -2.2490181922912598, "logits/rejected": -2.146900177001953, "logps/chosen": -222.7403106689453, "logps/rejected": -251.56723022460938, "loss": 0.6686, "positive_losses": 0.06131019443273544, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.13475115597248077, "rewards/margins": 0.054383594542741776, "rewards/margins_max": 0.20636025071144104, "rewards/margins_min": -0.10669461637735367, "rewards/margins_std": 0.13830003142356873, "rewards/rejected": 0.0803675502538681, "step": 3270 }, { "dpo_losses": 0.6649834513664246, "epoch": 0.86, "grad_norm": 14.369461186659779, "learning_rate": 2.9931584645585654e-07, "logits/chosen": -2.410416841506958, "logits/rejected": -2.2019524574279785, "logps/chosen": -186.33419799804688, "logps/rejected": -173.55006408691406, "loss": 0.6736, "positive_losses": 0.08618392795324326, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.14030560851097107, "rewards/margins": 0.06636351346969604, "rewards/margins_max": 0.25486403703689575, "rewards/margins_min": -0.11576934903860092, "rewards/margins_std": 0.1644243746995926, "rewards/rejected": 0.07394208014011383, "step": 3280 }, { "dpo_losses": 0.630217969417572, "epoch": 0.86, "grad_norm": 2.3470946797085874, "learning_rate": 2.885688711862136e-07, "logits/chosen": -2.309390068054199, "logits/rejected": -2.132350206375122, "logps/chosen": -264.3833312988281, "logps/rejected": -279.4195861816406, "loss": 0.6646, "positive_losses": 0.19128647446632385, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.19541771709918976, "rewards/margins": 0.14199557900428772, "rewards/margins_max": 0.35998716950416565, "rewards/margins_min": -0.09135288745164871, "rewards/margins_std": 0.20254869759082794, "rewards/rejected": 0.05342211574316025, "step": 3290 }, { "dpo_losses": 0.6573060154914856, "epoch": 0.86, "grad_norm": 2.31887557108711, "learning_rate": 2.7800654977942486e-07, "logits/chosen": -2.463292121887207, "logits/rejected": -2.333122730255127, "logps/chosen": -310.8419494628906, "logps/rejected": -323.2276916503906, "loss": 0.6749, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.18466229736804962, "rewards/margins": 0.07884453237056732, "rewards/margins_max": 0.24194669723510742, "rewards/margins_min": -0.06480497866868973, "rewards/margins_std": 0.1407381147146225, "rewards/rejected": 0.1058177724480629, "step": 3300 }, { "epoch": 0.86, "eval_dpo_losses": 0.6533048152923584, "eval_logits/chosen": -2.234851598739624, "eval_logits/rejected": -2.130990505218506, "eval_logps/chosen": -258.4991760253906, "eval_logps/rejected": -254.3860321044922, "eval_loss": 0.6815329194068909, "eval_positive_losses": 0.13988177478313446, "eval_rewards/accuracies": 0.692460298538208, "eval_rewards/chosen": 0.17274963855743408, "eval_rewards/margins": 0.08840711414813995, "eval_rewards/margins_max": 0.35538098216056824, "eval_rewards/margins_min": -0.12075136601924896, "eval_rewards/margins_std": 0.15941965579986572, "eval_rewards/rejected": 0.08434252440929413, "eval_runtime": 386.7816, "eval_samples_per_second": 5.171, "eval_steps_per_second": 0.163, "step": 3300 }, { "dpo_losses": 0.6246877908706665, "epoch": 0.87, "grad_norm": 19.1239978801995, "learning_rate": 2.6762976418628797e-07, "logits/chosen": -2.249816656112671, "logits/rejected": -2.1182568073272705, "logps/chosen": -190.5952911376953, "logps/rejected": -217.44155883789062, "loss": 0.7001, "positive_losses": 0.06870155036449432, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.16738249361515045, "rewards/margins": 0.15625303983688354, "rewards/margins_max": 0.43447503447532654, "rewards/margins_min": -0.014380864799022675, "rewards/margins_std": 0.20568692684173584, "rewards/rejected": 0.011129467748105526, "step": 3310 }, { "dpo_losses": 0.6498314142227173, "epoch": 0.87, "grad_norm": 20.15948375222565, "learning_rate": 2.5743938086541354e-07, "logits/chosen": -2.117475986480713, "logits/rejected": -2.069697380065918, "logps/chosen": -271.4492492675781, "logps/rejected": -228.81588745117188, "loss": 0.6718, "positive_losses": 0.11292801052331924, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.15503479540348053, "rewards/margins": 0.09339234977960587, "rewards/margins_max": 0.2410856932401657, "rewards/margins_min": -0.042519111186265945, "rewards/margins_std": 0.12763680517673492, "rewards/rejected": 0.06164243072271347, "step": 3320 }, { "dpo_losses": 0.6674187183380127, "epoch": 0.87, "grad_norm": 8.427142079780813, "learning_rate": 2.4743625071087574e-07, "logits/chosen": -2.291749954223633, "logits/rejected": -2.3545637130737305, "logps/chosen": -257.85369873046875, "logps/rejected": -297.4468688964844, "loss": 0.6709, "positive_losses": 0.3090629577636719, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.16220709681510925, "rewards/margins": 0.05781088396906853, "rewards/margins_max": 0.21202406287193298, "rewards/margins_min": -0.08543402701616287, "rewards/margins_std": 0.13548211753368378, "rewards/rejected": 0.10439622402191162, "step": 3330 }, { "dpo_losses": 0.6553888916969299, "epoch": 0.87, "grad_norm": 2.3281260240795776, "learning_rate": 2.3762120898116498e-07, "logits/chosen": -2.3886215686798096, "logits/rejected": -2.3799920082092285, "logps/chosen": -258.31231689453125, "logps/rejected": -325.18646240234375, "loss": 0.6522, "positive_losses": 0.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.15233513712882996, "rewards/margins": 0.08192898333072662, "rewards/margins_max": 0.21695943176746368, "rewards/margins_min": -0.04254608601331711, "rewards/margins_std": 0.12258054316043854, "rewards/rejected": 0.07040613889694214, "step": 3340 }, { "dpo_losses": 0.650645911693573, "epoch": 0.88, "grad_norm": 7.156903232928195, "learning_rate": 2.2799507522944048e-07, "logits/chosen": -2.410691976547241, "logits/rejected": -2.104086399078369, "logps/chosen": -265.23699951171875, "logps/rejected": -246.97579956054688, "loss": 0.6676, "positive_losses": 0.011097717098891735, "rewards/accuracies": 0.75, "rewards/chosen": 0.16835041344165802, "rewards/margins": 0.0982150286436081, "rewards/margins_max": 0.33052539825439453, "rewards/margins_min": -0.11177612841129303, "rewards/margins_std": 0.19844050705432892, "rewards/rejected": 0.07013537734746933, "step": 3350 }, { "dpo_losses": 0.6471047401428223, "epoch": 0.88, "grad_norm": 5.274044742236762, "learning_rate": 2.1855865323510056e-07, "logits/chosen": -2.4150993824005127, "logits/rejected": -2.3032796382904053, "logps/chosen": -256.17572021484375, "logps/rejected": -321.46368408203125, "loss": 0.6848, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.17062309384346008, "rewards/margins": 0.10880155861377716, "rewards/margins_max": 0.3457118272781372, "rewards/margins_min": -0.05264957994222641, "rewards/margins_std": 0.1773463487625122, "rewards/rejected": 0.06182154268026352, "step": 3360 }, { "dpo_losses": 0.6665820479393005, "epoch": 0.88, "grad_norm": 7.915805515500524, "learning_rate": 2.0931273093666575e-07, "logits/chosen": -2.3872427940368652, "logits/rejected": -2.384542226791382, "logps/chosen": -244.8478546142578, "logps/rejected": -261.66412353515625, "loss": 0.677, "positive_losses": 0.28215521574020386, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.16023533046245575, "rewards/margins": 0.06339271366596222, "rewards/margins_max": 0.3065606951713562, "rewards/margins_min": -0.12356342375278473, "rewards/margins_std": 0.19092825055122375, "rewards/rejected": 0.09684261679649353, "step": 3370 }, { "dpo_losses": 0.6603595018386841, "epoch": 0.88, "grad_norm": 1.8693566909959658, "learning_rate": 2.002580803659873e-07, "logits/chosen": -2.3584275245666504, "logits/rejected": -2.2111449241638184, "logps/chosen": -258.8819885253906, "logps/rejected": -223.0545196533203, "loss": 0.7143, "positive_losses": 0.1812393218278885, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1700710505247116, "rewards/margins": 0.07348418980836868, "rewards/margins_max": 0.22153392434120178, "rewards/margins_min": -0.07975874841213226, "rewards/margins_std": 0.1389322429895401, "rewards/rejected": 0.09658686071634293, "step": 3380 }, { "dpo_losses": 0.6535211801528931, "epoch": 0.89, "grad_norm": 2.1563105920211716, "learning_rate": 1.913954575837826e-07, "logits/chosen": -2.1887354850769043, "logits/rejected": -2.1103200912475586, "logps/chosen": -233.25534057617188, "logps/rejected": -255.188720703125, "loss": 0.6445, "positive_losses": 0.03745412826538086, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1456550657749176, "rewards/margins": 0.09550122916698456, "rewards/margins_max": 0.3543938994407654, "rewards/margins_min": -0.1219588965177536, "rewards/margins_std": 0.2061467170715332, "rewards/rejected": 0.05015384405851364, "step": 3390 }, { "dpo_losses": 0.6510832905769348, "epoch": 0.89, "grad_norm": 2.165443062674317, "learning_rate": 1.827256026165028e-07, "logits/chosen": -2.3058199882507324, "logits/rejected": -2.17576265335083, "logps/chosen": -259.0904846191406, "logps/rejected": -287.29486083984375, "loss": 0.675, "positive_losses": 0.03086547926068306, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.16996337473392487, "rewards/margins": 0.09885217994451523, "rewards/margins_max": 0.375033974647522, "rewards/margins_min": -0.08358651399612427, "rewards/margins_std": 0.20886187255382538, "rewards/rejected": 0.07111118733882904, "step": 3400 }, { "epoch": 0.89, "eval_dpo_losses": 0.6527442336082458, "eval_logits/chosen": -2.235651731491089, "eval_logits/rejected": -2.1317570209503174, "eval_logps/chosen": -258.57366943359375, "eval_logps/rejected": -254.59271240234375, "eval_loss": 0.6825324296951294, "eval_positive_losses": 0.15060579776763916, "eval_rewards/accuracies": 0.6984127163887024, "eval_rewards/chosen": 0.17200466990470886, "eval_rewards/margins": 0.08972898870706558, "eval_rewards/margins_max": 0.359048068523407, "eval_rewards/margins_min": -0.12174738198518753, "eval_rewards/margins_std": 0.16102150082588196, "eval_rewards/rejected": 0.08227568864822388, "eval_runtime": 386.2619, "eval_samples_per_second": 5.178, "eval_steps_per_second": 0.163, "step": 3400 }, { "dpo_losses": 0.654764711856842, "epoch": 0.89, "grad_norm": 1.9779380264023572, "learning_rate": 1.7424923939454274e-07, "logits/chosen": -2.096106767654419, "logits/rejected": -2.036606788635254, "logps/chosen": -221.55642700195312, "logps/rejected": -262.0559387207031, "loss": 0.6689, "positive_losses": 0.368595689535141, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.14159539341926575, "rewards/margins": 0.08518040925264359, "rewards/margins_max": 0.22402985394001007, "rewards/margins_min": -0.08319288492202759, "rewards/margins_std": 0.14345747232437134, "rewards/rejected": 0.056414999067783356, "step": 3410 }, { "dpo_losses": 0.63566654920578, "epoch": 0.9, "grad_norm": 16.20975993464036, "learning_rate": 1.6596707569179304e-07, "logits/chosen": -2.4435386657714844, "logits/rejected": -2.0861752033233643, "logps/chosen": -325.83502197265625, "logps/rejected": -279.63336181640625, "loss": 0.6701, "positive_losses": 0.062273599207401276, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.19472917914390564, "rewards/margins": 0.12625108659267426, "rewards/margins_max": 0.31810396909713745, "rewards/margins_min": -0.05376405641436577, "rewards/margins_std": 0.1653144359588623, "rewards/rejected": 0.06847808510065079, "step": 3420 }, { "dpo_losses": 0.6542297005653381, "epoch": 0.9, "grad_norm": 2.396364092431464, "learning_rate": 1.578798030665385e-07, "logits/chosen": -2.3601245880126953, "logits/rejected": -2.255420446395874, "logps/chosen": -296.8489990234375, "logps/rejected": -274.9451904296875, "loss": 0.6687, "positive_losses": 0.03637237474322319, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.16607016324996948, "rewards/margins": 0.0869467705488205, "rewards/margins_max": 0.2556857466697693, "rewards/margins_min": -0.10844133794307709, "rewards/margins_std": 0.15820132195949554, "rewards/rejected": 0.07912340760231018, "step": 3430 }, { "dpo_losses": 0.6591772437095642, "epoch": 0.9, "grad_norm": 1.721560451903123, "learning_rate": 1.499880968037165e-07, "logits/chosen": -2.3040645122528076, "logits/rejected": -2.340810537338257, "logps/chosen": -245.3950958251953, "logps/rejected": -267.0332336425781, "loss": 0.673, "positive_losses": 0.05183257907629013, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1617814600467682, "rewards/margins": 0.07359369844198227, "rewards/margins_max": 0.22743456065654755, "rewards/margins_min": -0.059500329196453094, "rewards/margins_std": 0.12445168197154999, "rewards/rejected": 0.08818773925304413, "step": 3440 }, { "dpo_losses": 0.647858738899231, "epoch": 0.9, "grad_norm": 9.361374375602939, "learning_rate": 1.4229261585852805e-07, "logits/chosen": -2.326876163482666, "logits/rejected": -2.236635446548462, "logps/chosen": -241.5677490234375, "logps/rejected": -248.3171844482422, "loss": 0.6656, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.18497946858406067, "rewards/margins": 0.09752850234508514, "rewards/margins_max": 0.27518752217292786, "rewards/margins_min": -0.02556411363184452, "rewards/margins_std": 0.1336226910352707, "rewards/rejected": 0.08745095878839493, "step": 3450 }, { "dpo_losses": 0.6455780863761902, "epoch": 0.91, "grad_norm": 2.235661703037212, "learning_rate": 1.3479400280141886e-07, "logits/chosen": -2.3478381633758545, "logits/rejected": -2.1198596954345703, "logps/chosen": -283.96575927734375, "logps/rejected": -211.55972290039062, "loss": 0.646, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.19400246441364288, "rewards/margins": 0.10352899134159088, "rewards/margins_max": 0.27262037992477417, "rewards/margins_min": -0.05394411087036133, "rewards/margins_std": 0.14614689350128174, "rewards/rejected": 0.0904734805226326, "step": 3460 }, { "dpo_losses": 0.6586331725120544, "epoch": 0.91, "grad_norm": 11.567394973149476, "learning_rate": 1.2749288376442044e-07, "logits/chosen": -2.295335292816162, "logits/rejected": -2.1246066093444824, "logps/chosen": -255.32846069335938, "logps/rejected": -278.50323486328125, "loss": 0.6765, "positive_losses": 0.06229591369628906, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15667331218719482, "rewards/margins": 0.07661958038806915, "rewards/margins_max": 0.2290104329586029, "rewards/margins_min": -0.07687706500291824, "rewards/margins_std": 0.13829876482486725, "rewards/rejected": 0.08005372434854507, "step": 3470 }, { "dpo_losses": 0.6619609594345093, "epoch": 0.91, "grad_norm": 8.3013731406622, "learning_rate": 1.203898683888713e-07, "logits/chosen": -2.378535747528076, "logits/rejected": -2.1960971355438232, "logps/chosen": -207.6337432861328, "logps/rejected": -221.1319580078125, "loss": 0.7189, "positive_losses": 1.0735082626342773, "rewards/accuracies": 0.625, "rewards/chosen": 0.13812896609306335, "rewards/margins": 0.0729888528585434, "rewards/margins_max": 0.2558894455432892, "rewards/margins_min": -0.12884943187236786, "rewards/margins_std": 0.17273090779781342, "rewards/rejected": 0.06514010578393936, "step": 3480 }, { "dpo_losses": 0.6734636425971985, "epoch": 0.91, "grad_norm": 9.881173823507565, "learning_rate": 1.1348554977451132e-07, "logits/chosen": -2.192690372467041, "logits/rejected": -2.264064311981201, "logps/chosen": -232.96322631835938, "logps/rejected": -210.9683837890625, "loss": 0.6872, "positive_losses": 0.10573215782642365, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.15231771767139435, "rewards/margins": 0.04238463565707207, "rewards/margins_max": 0.16154329478740692, "rewards/margins_min": -0.0827672928571701, "rewards/margins_std": 0.10678144544363022, "rewards/rejected": 0.10993307828903198, "step": 3490 }, { "dpo_losses": 0.6373180150985718, "epoch": 0.92, "grad_norm": 16.2692278445599, "learning_rate": 1.0678050442995802e-07, "logits/chosen": -2.36617112159729, "logits/rejected": -2.2015204429626465, "logps/chosen": -330.5494384765625, "logps/rejected": -270.83013916015625, "loss": 0.7038, "positive_losses": 0.3278042674064636, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.17825230956077576, "rewards/margins": 0.12509088218212128, "rewards/margins_max": 0.3482973873615265, "rewards/margins_min": -0.058735769242048264, "rewards/margins_std": 0.1799793541431427, "rewards/rejected": 0.05316141992807388, "step": 3500 }, { "epoch": 0.92, "eval_dpo_losses": 0.6516081094741821, "eval_logits/chosen": -2.237894058227539, "eval_logits/rejected": -2.134103298187256, "eval_logps/chosen": -258.9013366699219, "eval_logps/rejected": -255.18002319335938, "eval_loss": 0.6850085258483887, "eval_positive_losses": 0.16901761293411255, "eval_rewards/accuracies": 0.692460298538208, "eval_rewards/chosen": 0.16872814297676086, "eval_rewards/margins": 0.09232571721076965, "eval_rewards/margins_max": 0.36539673805236816, "eval_rewards/margins_min": -0.12090327590703964, "eval_rewards/margins_std": 0.16323718428611755, "eval_rewards/rejected": 0.0764024406671524, "eval_runtime": 385.8355, "eval_samples_per_second": 5.184, "eval_steps_per_second": 0.163, "step": 3500 }, { "dpo_losses": 0.6591561436653137, "epoch": 0.92, "grad_norm": 8.855899512571144, "learning_rate": 1.0027529222456755e-07, "logits/chosen": -2.33859920501709, "logits/rejected": -2.1733481884002686, "logps/chosen": -265.84515380859375, "logps/rejected": -253.9714813232422, "loss": 0.691, "positive_losses": 0.5303298830986023, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.16335991024971008, "rewards/margins": 0.07397004216909409, "rewards/margins_max": 0.2372910976409912, "rewards/margins_min": -0.05326114222407341, "rewards/margins_std": 0.12948307394981384, "rewards/rejected": 0.08938989043235779, "step": 3510 }, { "dpo_losses": 0.6582337617874146, "epoch": 0.92, "grad_norm": 13.692992619402085, "learning_rate": 9.397045634168766e-08, "logits/chosen": -2.242560625076294, "logits/rejected": -2.2196292877197266, "logps/chosen": -251.29421997070312, "logps/rejected": -244.8548126220703, "loss": 0.7162, "positive_losses": 1.0794401168823242, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.13431352376937866, "rewards/margins": 0.08366844803094864, "rewards/margins_max": 0.3396510183811188, "rewards/margins_min": -0.12880495190620422, "rewards/margins_std": 0.20290382206439972, "rewards/rejected": 0.05064509063959122, "step": 3520 }, { "dpo_losses": 0.658118486404419, "epoch": 0.92, "grad_norm": 10.169727385851553, "learning_rate": 8.78665232332998e-08, "logits/chosen": -2.477908134460449, "logits/rejected": -2.30541729927063, "logps/chosen": -257.14410400390625, "logps/rejected": -237.76708984375, "loss": 0.6622, "positive_losses": 0.36432725191116333, "rewards/accuracies": 0.625, "rewards/chosen": 0.160700723528862, "rewards/margins": 0.07876841723918915, "rewards/margins_max": 0.29171493649482727, "rewards/margins_min": -0.06884454935789108, "rewards/margins_std": 0.16209475696086884, "rewards/rejected": 0.08193229883909225, "step": 3530 }, { "dpo_losses": 0.6545093059539795, "epoch": 0.93, "grad_norm": 4.311408305548019, "learning_rate": 8.196400257606208e-08, "logits/chosen": -2.306016445159912, "logits/rejected": -2.0804665088653564, "logps/chosen": -270.492431640625, "logps/rejected": -249.11758422851562, "loss": 0.6941, "positive_losses": 0.3300871253013611, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.16537818312644958, "rewards/margins": 0.08601883053779602, "rewards/margins_max": 0.25100621581077576, "rewards/margins_min": -0.063410684466362, "rewards/margins_std": 0.13855870068073273, "rewards/rejected": 0.07935934513807297, "step": 3540 }, { "dpo_losses": 0.6559299230575562, "epoch": 0.93, "grad_norm": 2.20971289914531, "learning_rate": 7.626338722875076e-08, "logits/chosen": -2.1281886100769043, "logits/rejected": -2.084740161895752, "logps/chosen": -235.03054809570312, "logps/rejected": -250.90560913085938, "loss": 0.688, "positive_losses": 0.006858205888420343, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1704014539718628, "rewards/margins": 0.08176849037408829, "rewards/margins_max": 0.2720128893852234, "rewards/margins_min": -0.0708046406507492, "rewards/margins_std": 0.1572929173707962, "rewards/rejected": 0.08863295614719391, "step": 3550 }, { "dpo_losses": 0.6421164274215698, "epoch": 0.93, "grad_norm": 2.301286781095836, "learning_rate": 7.076515319110688e-08, "logits/chosen": -2.3332972526550293, "logits/rejected": -2.100236415863037, "logps/chosen": -261.58038330078125, "logps/rejected": -251.2709197998047, "loss": 0.6656, "positive_losses": 0.2422019988298416, "rewards/accuracies": 0.75, "rewards/chosen": 0.13788476586341858, "rewards/margins": 0.11643823236227036, "rewards/margins_max": 0.35397812724113464, "rewards/margins_min": -0.07436250150203705, "rewards/margins_std": 0.18804042041301727, "rewards/rejected": 0.02144654653966427, "step": 3560 }, { "dpo_losses": 0.6480227112770081, "epoch": 0.93, "grad_norm": 11.663736756586763, "learning_rate": 6.54697595640899e-08, "logits/chosen": -2.3693957328796387, "logits/rejected": -2.2666051387786865, "logps/chosen": -292.54205322265625, "logps/rejected": -256.78643798828125, "loss": 0.7036, "positive_losses": 0.4908958375453949, "rewards/accuracies": 0.75, "rewards/chosen": 0.16833141446113586, "rewards/margins": 0.0978071391582489, "rewards/margins_max": 0.25479069352149963, "rewards/margins_min": -0.03925397992134094, "rewards/margins_std": 0.13049368560314178, "rewards/rejected": 0.07052429020404816, "step": 3570 }, { "dpo_losses": 0.6262181401252747, "epoch": 0.94, "grad_norm": 2.6123711294120606, "learning_rate": 6.037764851154426e-08, "logits/chosen": -2.344728946685791, "logits/rejected": -2.0930678844451904, "logps/chosen": -283.3323059082031, "logps/rejected": -259.57696533203125, "loss": 0.6848, "positive_losses": 0.2706981599330902, "rewards/accuracies": 0.75, "rewards/chosen": 0.2072361409664154, "rewards/margins": 0.14926394820213318, "rewards/margins_max": 0.36749666929244995, "rewards/margins_min": -0.04141984134912491, "rewards/margins_std": 0.18114466965198517, "rewards/rejected": 0.05797220394015312, "step": 3580 }, { "dpo_losses": 0.6570135354995728, "epoch": 0.94, "grad_norm": 2.382270526222289, "learning_rate": 5.548924522327748e-08, "logits/chosen": -2.2625138759613037, "logits/rejected": -2.1901559829711914, "logps/chosen": -180.02627563476562, "logps/rejected": -194.6327667236328, "loss": 0.6782, "positive_losses": 0.025394439697265625, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.14451250433921814, "rewards/margins": 0.08130757510662079, "rewards/margins_max": 0.25572913885116577, "rewards/margins_min": -0.10066118091344833, "rewards/margins_std": 0.16043666005134583, "rewards/rejected": 0.06320494413375854, "step": 3590 }, { "dpo_losses": 0.6422615051269531, "epoch": 0.94, "grad_norm": 7.758169482911929, "learning_rate": 5.0804957879556915e-08, "logits/chosen": -2.1195802688598633, "logits/rejected": -2.1484951972961426, "logps/chosen": -225.63186645507812, "logps/rejected": -272.039306640625, "loss": 0.6684, "positive_losses": 0.6606330871582031, "rewards/accuracies": 0.75, "rewards/chosen": 0.16965535283088684, "rewards/margins": 0.11303885281085968, "rewards/margins_max": 0.27875059843063354, "rewards/margins_min": -0.057497747242450714, "rewards/margins_std": 0.1491389125585556, "rewards/rejected": 0.056616514921188354, "step": 3600 }, { "epoch": 0.94, "eval_dpo_losses": 0.6521257162094116, "eval_logits/chosen": -2.23319411277771, "eval_logits/rejected": -2.1291911602020264, "eval_logps/chosen": -258.7999572753906, "eval_logps/rejected": -254.9619598388672, "eval_loss": 0.6840330362319946, "eval_positive_losses": 0.1637362688779831, "eval_rewards/accuracies": 0.6944444179534912, "eval_rewards/chosen": 0.16974212229251862, "eval_rewards/margins": 0.09115886688232422, "eval_rewards/margins_max": 0.3632105886936188, "eval_rewards/margins_min": -0.12115152925252914, "eval_rewards/margins_std": 0.16247817873954773, "eval_rewards/rejected": 0.0785832330584526, "eval_runtime": 386.4015, "eval_samples_per_second": 5.176, "eval_steps_per_second": 0.163, "step": 3600 }, { "dpo_losses": 0.6644371151924133, "epoch": 0.94, "grad_norm": 8.558760000239475, "learning_rate": 4.632517761702815e-08, "logits/chosen": -2.233320713043213, "logits/rejected": -2.2563061714172363, "logps/chosen": -209.6776885986328, "logps/rejected": -215.22421264648438, "loss": 0.669, "positive_losses": 0.1309986114501953, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.15478673577308655, "rewards/margins": 0.06250542402267456, "rewards/margins_max": 0.20296648144721985, "rewards/margins_min": -0.05720805004239082, "rewards/margins_std": 0.11538243293762207, "rewards/rejected": 0.09228130429983139, "step": 3610 }, { "dpo_losses": 0.6656601428985596, "epoch": 0.95, "grad_norm": 8.928774542541825, "learning_rate": 4.205027849605359e-08, "logits/chosen": -2.4145822525024414, "logits/rejected": -2.2265400886535645, "logps/chosen": -207.8376922607422, "logps/rejected": -231.404296875, "loss": 0.6735, "positive_losses": 0.22031936049461365, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.16390538215637207, "rewards/margins": 0.060680996626615524, "rewards/margins_max": 0.18728139996528625, "rewards/margins_min": -0.06372333317995071, "rewards/margins_std": 0.11280860006809235, "rewards/rejected": 0.10322437435388565, "step": 3620 }, { "dpo_losses": 0.648389458656311, "epoch": 0.95, "grad_norm": 2.2151612822841504, "learning_rate": 3.798061746947995e-08, "logits/chosen": -2.1811726093292236, "logits/rejected": -2.113131046295166, "logps/chosen": -226.98007202148438, "logps/rejected": -296.51043701171875, "loss": 0.6646, "positive_losses": 0.34091687202453613, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.14185377955436707, "rewards/margins": 0.09637492150068283, "rewards/margins_max": 0.24663469195365906, "rewards/margins_min": -0.046325553208589554, "rewards/margins_std": 0.13547521829605103, "rewards/rejected": 0.04547884315252304, "step": 3630 }, { "dpo_losses": 0.6773545742034912, "epoch": 0.95, "grad_norm": 2.0741349542793093, "learning_rate": 3.411653435283158e-08, "logits/chosen": -2.389756679534912, "logits/rejected": -2.318364143371582, "logps/chosen": -241.6837615966797, "logps/rejected": -222.1800537109375, "loss": 0.7063, "positive_losses": 0.21475906670093536, "rewards/accuracies": 0.625, "rewards/chosen": 0.15981486439704895, "rewards/margins": 0.037009142339229584, "rewards/margins_max": 0.19361883401870728, "rewards/margins_min": -0.10754145681858063, "rewards/margins_std": 0.13640299439430237, "rewards/rejected": 0.12280573695898056, "step": 3640 }, { "dpo_losses": 0.6529147624969482, "epoch": 0.96, "grad_norm": 16.41910582622612, "learning_rate": 3.04583517959367e-08, "logits/chosen": -2.190483570098877, "logits/rejected": -2.195648670196533, "logps/chosen": -256.9494323730469, "logps/rejected": -311.24322509765625, "loss": 0.6826, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.19076752662658691, "rewards/margins": 0.08783547580242157, "rewards/margins_max": 0.2549024820327759, "rewards/margins_min": -0.06040471792221069, "rewards/margins_std": 0.1367541253566742, "rewards/rejected": 0.10293205827474594, "step": 3650 }, { "dpo_losses": 0.6718894243240356, "epoch": 0.96, "grad_norm": 10.286900784920066, "learning_rate": 2.7006375255985984e-08, "logits/chosen": -2.563291311264038, "logits/rejected": -2.453547954559326, "logps/chosen": -253.66720581054688, "logps/rejected": -250.96963500976562, "loss": 0.7056, "positive_losses": 0.2768881916999817, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.17926418781280518, "rewards/margins": 0.046900905668735504, "rewards/margins_max": 0.18268300592899323, "rewards/margins_min": -0.07423814386129379, "rewards/margins_std": 0.11312292516231537, "rewards/rejected": 0.13236327469348907, "step": 3660 }, { "dpo_losses": 0.6506290435791016, "epoch": 0.96, "grad_norm": 2.215781791960788, "learning_rate": 2.3760892972027328e-08, "logits/chosen": -2.332125663757324, "logits/rejected": -2.2531609535217285, "logps/chosen": -258.5568542480469, "logps/rejected": -293.6180419921875, "loss": 0.6855, "positive_losses": 0.27732858061790466, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.16688136756420135, "rewards/margins": 0.09657464921474457, "rewards/margins_max": 0.349087655544281, "rewards/margins_min": -0.06213757395744324, "rewards/margins_std": 0.19168011844158173, "rewards/rejected": 0.0703067034482956, "step": 3670 }, { "dpo_losses": 0.6428881883621216, "epoch": 0.96, "grad_norm": 10.39045832220638, "learning_rate": 2.072217594089765e-08, "logits/chosen": -2.2591280937194824, "logits/rejected": -2.2007009983062744, "logps/chosen": -267.62371826171875, "logps/rejected": -271.640869140625, "loss": 0.6737, "positive_losses": 0.2508724331855774, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.17502333223819733, "rewards/margins": 0.1083843857049942, "rewards/margins_max": 0.25783079862594604, "rewards/margins_min": -0.030369117856025696, "rewards/margins_std": 0.12855744361877441, "rewards/rejected": 0.06663894653320312, "step": 3680 }, { "dpo_losses": 0.6458489298820496, "epoch": 0.97, "grad_norm": 12.175581988263628, "learning_rate": 1.789047789459375e-08, "logits/chosen": -2.192356586456299, "logits/rejected": -2.0730724334716797, "logps/chosen": -295.0837707519531, "logps/rejected": -232.6221466064453, "loss": 0.6806, "positive_losses": 0.39788514375686646, "rewards/accuracies": 0.75, "rewards/chosen": 0.18995991349220276, "rewards/margins": 0.10518516600131989, "rewards/margins_max": 0.2766581177711487, "rewards/margins_min": -0.0712372437119484, "rewards/margins_std": 0.1480109542608261, "rewards/rejected": 0.08477479219436646, "step": 3690 }, { "dpo_losses": 0.6435426473617554, "epoch": 0.97, "grad_norm": 8.65035380262822, "learning_rate": 1.5266035279088708e-08, "logits/chosen": -2.4487831592559814, "logits/rejected": -2.3731191158294678, "logps/chosen": -304.1348876953125, "logps/rejected": -273.0533142089844, "loss": 0.6882, "positive_losses": 0.5730873346328735, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.18121850490570068, "rewards/margins": 0.10878153145313263, "rewards/margins_max": 0.2810076177120209, "rewards/margins_min": -0.05754469707608223, "rewards/margins_std": 0.16096261143684387, "rewards/rejected": 0.07243697345256805, "step": 3700 }, { "epoch": 0.97, "eval_dpo_losses": 0.6521113514900208, "eval_logits/chosen": -2.2388391494750977, "eval_logits/rejected": -2.1349871158599854, "eval_logps/chosen": -258.7601013183594, "eval_logps/rejected": -254.92567443847656, "eval_loss": 0.6840153932571411, "eval_positive_losses": 0.1614445000886917, "eval_rewards/accuracies": 0.6944444179534912, "eval_rewards/chosen": 0.17014054954051971, "eval_rewards/margins": 0.09119454771280289, "eval_rewards/margins_max": 0.36288222670555115, "eval_rewards/margins_min": -0.12096142023801804, "eval_rewards/margins_std": 0.1623527854681015, "eval_rewards/rejected": 0.07894602417945862, "eval_runtime": 386.0282, "eval_samples_per_second": 5.181, "eval_steps_per_second": 0.163, "step": 3700 }, { "dpo_losses": 0.6406093835830688, "epoch": 0.97, "grad_norm": 2.2405489473723375, "learning_rate": 1.2849067234584623e-08, "logits/chosen": -2.221004009246826, "logits/rejected": -2.1221065521240234, "logps/chosen": -273.83209228515625, "logps/rejected": -267.626708984375, "loss": 0.6572, "positive_losses": 0.03246307373046875, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.20061473548412323, "rewards/margins": 0.11160682141780853, "rewards/margins_max": 0.24180695414543152, "rewards/margins_min": -0.0029508545994758606, "rewards/margins_std": 0.11144082248210907, "rewards/rejected": 0.0890078917145729, "step": 3710 }, { "dpo_losses": 0.6355922818183899, "epoch": 0.97, "grad_norm": 5.337184508958447, "learning_rate": 1.0639775577218625e-08, "logits/chosen": -2.4390177726745605, "logits/rejected": -2.3144328594207764, "logps/chosen": -268.61566162109375, "logps/rejected": -243.4637451171875, "loss": 0.6707, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.19714899361133575, "rewards/margins": 0.12269027531147003, "rewards/margins_max": 0.2553178668022156, "rewards/margins_min": -0.002566099166870117, "rewards/margins_std": 0.11562588065862656, "rewards/rejected": 0.07445873320102692, "step": 3720 }, { "dpo_losses": 0.6400421857833862, "epoch": 0.98, "grad_norm": 15.788064161596985, "learning_rate": 8.638344782207486e-09, "logits/chosen": -2.445316791534424, "logits/rejected": -2.296851873397827, "logps/chosen": -302.14892578125, "logps/rejected": -251.044921875, "loss": 0.7544, "positive_losses": 0.1412469893693924, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.18697282671928406, "rewards/margins": 0.11634068191051483, "rewards/margins_max": 0.3019865155220032, "rewards/margins_min": -0.044070713222026825, "rewards/margins_std": 0.15469469130039215, "rewards/rejected": 0.07063215970993042, "step": 3730 }, { "dpo_losses": 0.6285609006881714, "epoch": 0.98, "grad_norm": 1.9615804522466027, "learning_rate": 6.84494196844715e-09, "logits/chosen": -2.3738036155700684, "logits/rejected": -2.261535167694092, "logps/chosen": -304.20733642578125, "logps/rejected": -311.68255615234375, "loss": 0.6499, "positive_losses": 0.0, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.20539335906505585, "rewards/margins": 0.1412840485572815, "rewards/margins_max": 0.3032866418361664, "rewards/margins_min": -0.009407620877027512, "rewards/margins_std": 0.14868082106113434, "rewards/rejected": 0.06410931050777435, "step": 3740 }, { "dpo_losses": 0.6710334420204163, "epoch": 0.98, "grad_norm": 8.092597555350883, "learning_rate": 5.259716884556121e-09, "logits/chosen": -2.18589448928833, "logits/rejected": -2.1594722270965576, "logps/chosen": -267.77764892578125, "logps/rejected": -252.1927947998047, "loss": 0.7153, "positive_losses": 0.9474433064460754, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1605871617794037, "rewards/margins": 0.051191043108701706, "rewards/margins_max": 0.19923296570777893, "rewards/margins_min": -0.1430814266204834, "rewards/margins_std": 0.15550334751605988, "rewards/rejected": 0.10939611494541168, "step": 3750 }, { "dpo_losses": 0.6612724661827087, "epoch": 0.98, "grad_norm": 2.175892140492185, "learning_rate": 3.882801896372967e-09, "logits/chosen": -2.2251269817352295, "logits/rejected": -2.211683750152588, "logps/chosen": -208.31350708007812, "logps/rejected": -229.2930145263672, "loss": 0.6778, "positive_losses": 0.35350674390792847, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.14576348662376404, "rewards/margins": 0.07249745726585388, "rewards/margins_max": 0.2510822117328644, "rewards/margins_min": -0.08735796064138412, "rewards/margins_std": 0.15778590738773346, "rewards/rejected": 0.07326604425907135, "step": 3760 }, { "dpo_losses": 0.6476324796676636, "epoch": 0.99, "grad_norm": 12.06158791686119, "learning_rate": 2.7143119759026614e-09, "logits/chosen": -2.3815693855285645, "logits/rejected": -2.2842819690704346, "logps/chosen": -276.88446044921875, "logps/rejected": -304.99310302734375, "loss": 0.6623, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.184347003698349, "rewards/margins": 0.09787255525588989, "rewards/margins_max": 0.2605125308036804, "rewards/margins_min": -0.011675780639052391, "rewards/margins_std": 0.12482529878616333, "rewards/rejected": 0.0864744558930397, "step": 3770 }, { "dpo_losses": 0.6382454633712769, "epoch": 0.99, "grad_norm": 17.837788723464822, "learning_rate": 1.754344691717591e-09, "logits/chosen": -2.330889940261841, "logits/rejected": -2.0647213459014893, "logps/chosen": -276.5771789550781, "logps/rejected": -258.5731201171875, "loss": 0.6871, "positive_losses": 0.4069229066371918, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.17648881673812866, "rewards/margins": 0.1236453503370285, "rewards/margins_max": 0.3527640402317047, "rewards/margins_min": -0.08053845167160034, "rewards/margins_std": 0.1910148411989212, "rewards/rejected": 0.05284346267580986, "step": 3780 }, { "dpo_losses": 0.6739862561225891, "epoch": 0.99, "grad_norm": 17.41410280670526, "learning_rate": 1.0029802008096335e-09, "logits/chosen": -2.270875930786133, "logits/rejected": -2.257889747619629, "logps/chosen": -231.9406280517578, "logps/rejected": -211.40771484375, "loss": 0.7196, "positive_losses": 0.8542770147323608, "rewards/accuracies": 0.625, "rewards/chosen": 0.1338658630847931, "rewards/margins": 0.0462508425116539, "rewards/margins_max": 0.2182837426662445, "rewards/margins_min": -0.11107391119003296, "rewards/margins_std": 0.14643257856369019, "rewards/rejected": 0.08761502802371979, "step": 3790 }, { "dpo_losses": 0.652928352355957, "epoch": 0.99, "grad_norm": 10.369804092366572, "learning_rate": 4.602812418974534e-10, "logits/chosen": -2.2093427181243896, "logits/rejected": -2.136033058166504, "logps/chosen": -197.088623046875, "logps/rejected": -175.34713745117188, "loss": 0.687, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1494404375553131, "rewards/margins": 0.08840851485729218, "rewards/margins_max": 0.2532631754875183, "rewards/margins_min": -0.03992040082812309, "rewards/margins_std": 0.13011029362678528, "rewards/rejected": 0.06103191897273064, "step": 3800 }, { "epoch": 0.99, "eval_dpo_losses": 0.6520495414733887, "eval_logits/chosen": -2.236772060394287, "eval_logits/rejected": -2.1328725814819336, "eval_logps/chosen": -258.76507568359375, "eval_logps/rejected": -254.94320678710938, "eval_loss": 0.6839171051979065, "eval_positive_losses": 0.16110260784626007, "eval_rewards/accuracies": 0.6984127163887024, "eval_rewards/chosen": 0.17009054124355316, "eval_rewards/margins": 0.09131984412670135, "eval_rewards/margins_max": 0.36288949847221375, "eval_rewards/margins_min": -0.12122222781181335, "eval_rewards/margins_std": 0.16238048672676086, "eval_rewards/rejected": 0.07877067476511002, "eval_runtime": 386.129, "eval_samples_per_second": 5.18, "eval_steps_per_second": 0.163, "step": 3800 }, { "dpo_losses": 0.6445170044898987, "epoch": 1.0, "grad_norm": 2.090060270982713, "learning_rate": 1.2629313018819312e-10, "logits/chosen": -2.3199119567871094, "logits/rejected": -2.181281566619873, "logps/chosen": -281.1825256347656, "logps/rejected": -256.9422302246094, "loss": 0.6751, "positive_losses": 0.34783935546875, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.1826791763305664, "rewards/margins": 0.10701730102300644, "rewards/margins_max": 0.30159538984298706, "rewards/margins_min": -0.037402741611003876, "rewards/margins_std": 0.15129396319389343, "rewards/rejected": 0.07566186785697937, "step": 3810 }, { "dpo_losses": 0.6556090116500854, "epoch": 1.0, "grad_norm": 13.825318615296109, "learning_rate": 1.0437535929996855e-12, "logits/chosen": -2.2659218311309814, "logits/rejected": -2.137295722961426, "logps/chosen": -287.13433837890625, "logps/rejected": -208.0966033935547, "loss": 0.7044, "positive_losses": 1.1326688528060913, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.14959761500358582, "rewards/margins": 0.08305417746305466, "rewards/margins_max": 0.2696264982223511, "rewards/margins_min": -0.07530739158391953, "rewards/margins_std": 0.15788157284259796, "rewards/rejected": 0.06654343754053116, "step": 3820 }, { "epoch": 1.0, "step": 3821, "total_flos": 0.0, "train_loss": 0.6847423079499896, "train_runtime": 42109.9082, "train_samples_per_second": 1.452, "train_steps_per_second": 0.091 } ], "logging_steps": 10, "max_steps": 3821, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }