{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 3821, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "dpo_losses": 0.6931471824645996, "epoch": 0.0, "grad_norm": 2.436919118673123, "learning_rate": 1.3054830287206266e-09, "logits/chosen": -2.1143321990966797, "logits/rejected": -2.472040891647339, "logps/chosen": -177.52757263183594, "logps/rejected": -252.707275390625, "loss": 0.6931, "positive_losses": 0.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "dpo_losses": 0.6930272579193115, "epoch": 0.0, "grad_norm": 37.21145991523898, "learning_rate": 1.3054830287206264e-08, "logits/chosen": -2.580538749694824, "logits/rejected": -2.2765724658966064, "logps/chosen": -276.4545593261719, "logps/rejected": -184.12747192382812, "loss": 0.6974, "positive_losses": 0.020280519500374794, "rewards/accuracies": 0.1111111119389534, "rewards/chosen": 0.00011522188287926838, "rewards/margins": 0.00024025494349189103, "rewards/margins_max": 0.0011821322841569781, "rewards/margins_min": -0.0003236473712604493, "rewards/margins_std": 0.0006685936823487282, "rewards/rejected": -0.00012503305333666503, "step": 10 }, { "dpo_losses": 0.6933600902557373, "epoch": 0.01, "grad_norm": 43.16625489341689, "learning_rate": 2.610966057441253e-08, "logits/chosen": -2.6730246543884277, "logits/rejected": -2.354184627532959, "logps/chosen": -336.60894775390625, "logps/rejected": -271.6905822753906, "loss": 0.7019, "positive_losses": 0.131869837641716, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0003984665381722152, "rewards/margins": -0.000420931086409837, "rewards/margins_max": 0.004248641896992922, "rewards/margins_min": -0.005044418387115002, "rewards/margins_std": 0.004089266061782837, "rewards/rejected": 2.2464513676823117e-05, "step": 20 }, { "dpo_losses": 0.6926755905151367, "epoch": 0.01, "grad_norm": 32.23362138600393, "learning_rate": 3.91644908616188e-08, "logits/chosen": -2.3213353157043457, "logits/rejected": -2.391751766204834, "logps/chosen": -247.16177368164062, "logps/rejected": -251.8048095703125, "loss": 0.7034, "positive_losses": 0.055048275738954544, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0007592567126266658, "rewards/margins": 0.0009473847458139062, "rewards/margins_max": 0.0048844716511666775, "rewards/margins_min": -0.002753495005890727, "rewards/margins_std": 0.0034001737367361784, "rewards/rejected": -0.00018812823691405356, "step": 30 }, { "dpo_losses": 0.6931294202804565, "epoch": 0.01, "grad_norm": 28.948820005096927, "learning_rate": 5.221932114882506e-08, "logits/chosen": -2.3566269874572754, "logits/rejected": -2.3309898376464844, "logps/chosen": -211.5314483642578, "logps/rejected": -240.2368927001953, "loss": 0.7035, "positive_losses": 0.05449333041906357, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.00023043225519359112, "rewards/margins": 3.878644201904535e-05, "rewards/margins_max": 0.003696088446304202, "rewards/margins_min": -0.002856578677892685, "rewards/margins_std": 0.002960009966045618, "rewards/rejected": 0.00019164584227837622, "step": 40 }, { "dpo_losses": 0.6936127543449402, "epoch": 0.01, "grad_norm": 31.6642273080463, "learning_rate": 6.527415143603133e-08, "logits/chosen": -2.6226742267608643, "logits/rejected": -2.483947992324829, "logps/chosen": -331.0611877441406, "logps/rejected": -305.57452392578125, "loss": 0.6996, "positive_losses": 0.04957117885351181, "rewards/accuracies": 0.5, "rewards/chosen": 0.0008887395961210132, "rewards/margins": -0.0009281095117330551, "rewards/margins_max": 0.001778390840627253, "rewards/margins_min": -0.004149734042584896, "rewards/margins_std": 0.0026368508115410805, "rewards/rejected": 0.0018168489914387465, "step": 50 }, { "dpo_losses": 0.6931079626083374, "epoch": 0.02, "grad_norm": 15.572720155358516, "learning_rate": 7.83289817232376e-08, "logits/chosen": -2.3426220417022705, "logits/rejected": -2.266212224960327, "logps/chosen": -236.0840606689453, "logps/rejected": -240.9228057861328, "loss": 0.6985, "positive_losses": 0.023387432098388672, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0012827671598643064, "rewards/margins": 8.110237831715494e-05, "rewards/margins_max": 0.003486787434667349, "rewards/margins_min": -0.0033876230008900166, "rewards/margins_std": 0.0030694655142724514, "rewards/rejected": 0.0012016647960990667, "step": 60 }, { "dpo_losses": 0.693344235420227, "epoch": 0.02, "grad_norm": 7.8731478980587575, "learning_rate": 9.138381201044386e-08, "logits/chosen": -2.431896924972534, "logits/rejected": -2.300759792327881, "logps/chosen": -246.80429077148438, "logps/rejected": -246.1142120361328, "loss": 0.696, "positive_losses": 0.03448672220110893, "rewards/accuracies": 0.375, "rewards/chosen": 0.0018469663336873055, "rewards/margins": -0.00039082911098375916, "rewards/margins_max": 0.003382150549441576, "rewards/margins_min": -0.004007203970104456, "rewards/margins_std": 0.0031986695248633623, "rewards/rejected": 0.0022377956192940474, "step": 70 }, { "dpo_losses": 0.6932621598243713, "epoch": 0.02, "grad_norm": 19.99624598557502, "learning_rate": 1.0443864229765012e-07, "logits/chosen": -2.462129831314087, "logits/rejected": -2.3701767921447754, "logps/chosen": -280.59197998046875, "logps/rejected": -237.6129913330078, "loss": 0.6969, "positive_losses": 0.05826077610254288, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.0015884635504335165, "rewards/margins": -0.00022426671057473868, "rewards/margins_max": 0.00464463047683239, "rewards/margins_min": -0.005269146058708429, "rewards/margins_std": 0.004375453107059002, "rewards/rejected": 0.0018127303337678313, "step": 80 }, { "dpo_losses": 0.6935017704963684, "epoch": 0.02, "grad_norm": 7.214313415301501, "learning_rate": 1.174934725848564e-07, "logits/chosen": -2.5947582721710205, "logits/rejected": -2.553162097930908, "logps/chosen": -344.86138916015625, "logps/rejected": -323.3901672363281, "loss": 0.6965, "positive_losses": 0.022647667676210403, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.0031913493294268847, "rewards/margins": -0.0007054595043882728, "rewards/margins_max": 0.0029357299208641052, "rewards/margins_min": -0.004324203822761774, "rewards/margins_std": 0.003250499488785863, "rewards/rejected": 0.0038968082517385483, "step": 90 }, { "dpo_losses": 0.6929136514663696, "epoch": 0.03, "grad_norm": 29.353944415263427, "learning_rate": 1.3054830287206266e-07, "logits/chosen": -2.2842743396759033, "logits/rejected": -2.171790361404419, "logps/chosen": -276.45654296875, "logps/rejected": -220.96701049804688, "loss": 0.696, "positive_losses": 0.029300499707460403, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.003223895560950041, "rewards/margins": 0.00047344350605271757, "rewards/margins_max": 0.006351976189762354, "rewards/margins_min": -0.004781234078109264, "rewards/margins_std": 0.0050557455979287624, "rewards/rejected": 0.0027504523750394583, "step": 100 }, { "epoch": 0.03, "eval_dpo_losses": 0.6932076215744019, "eval_logits/chosen": -2.3446600437164307, "eval_logits/rejected": -2.234032154083252, "eval_logps/chosen": -275.4396667480469, "eval_logps/rejected": -262.4742431640625, "eval_loss": 0.6946702599525452, "eval_positive_losses": 0.01234098058193922, "eval_rewards/accuracies": 0.4722222089767456, "eval_rewards/chosen": 0.00334473280236125, "eval_rewards/margins": -0.00011571276263566688, "eval_rewards/margins_max": 0.006645516026765108, "eval_rewards/margins_min": -0.006240579299628735, "eval_rewards/margins_std": 0.004307589493691921, "eval_rewards/rejected": 0.0034604459069669247, "eval_runtime": 390.9327, "eval_samples_per_second": 5.116, "eval_steps_per_second": 0.161, "step": 100 }, { "dpo_losses": 0.6934612989425659, "epoch": 0.03, "grad_norm": 13.810811901934839, "learning_rate": 1.4360313315926893e-07, "logits/chosen": -2.4272146224975586, "logits/rejected": -2.393319606781006, "logps/chosen": -253.1317901611328, "logps/rejected": -262.434814453125, "loss": 0.6953, "positive_losses": 0.011834526434540749, "rewards/accuracies": 0.5, "rewards/chosen": 0.003846388775855303, "rewards/margins": -0.0006237152847461402, "rewards/margins_max": 0.0041875895112752914, "rewards/margins_min": -0.0053040506318211555, "rewards/margins_std": 0.004180489107966423, "rewards/rejected": 0.00447010388597846, "step": 110 }, { "dpo_losses": 0.6928579211235046, "epoch": 0.03, "grad_norm": 25.408335169659384, "learning_rate": 1.566579634464752e-07, "logits/chosen": -2.383654832839966, "logits/rejected": -2.3157191276550293, "logps/chosen": -231.6434326171875, "logps/rejected": -234.43820190429688, "loss": 0.697, "positive_losses": 0.08562879264354706, "rewards/accuracies": 0.5, "rewards/chosen": 0.004084297455847263, "rewards/margins": 0.0005887853330932558, "rewards/margins_max": 0.006473775953054428, "rewards/margins_min": -0.006034852471202612, "rewards/margins_std": 0.005430928431451321, "rewards/rejected": 0.003495512530207634, "step": 120 }, { "dpo_losses": 0.6938505172729492, "epoch": 0.03, "grad_norm": 12.728654533776709, "learning_rate": 1.6971279373368143e-07, "logits/chosen": -2.3995563983917236, "logits/rejected": -2.3070030212402344, "logps/chosen": -268.192626953125, "logps/rejected": -358.4860534667969, "loss": 0.6945, "positive_losses": 0.020833205431699753, "rewards/accuracies": 0.375, "rewards/chosen": 0.0045697325840592384, "rewards/margins": -0.0013981324154883623, "rewards/margins_max": 0.0037638768553733826, "rewards/margins_min": -0.007319015916436911, "rewards/margins_std": 0.004900245927274227, "rewards/rejected": 0.005967865232378244, "step": 130 }, { "dpo_losses": 0.6928057074546814, "epoch": 0.04, "grad_norm": 17.691217793349296, "learning_rate": 1.8276762402088773e-07, "logits/chosen": -2.3290584087371826, "logits/rejected": -2.174755573272705, "logps/chosen": -215.482666015625, "logps/rejected": -207.40994262695312, "loss": 0.6944, "positive_losses": 0.0052035809494555, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.0037437081336975098, "rewards/margins": 0.0006875363760627806, "rewards/margins_max": 0.005510732997208834, "rewards/margins_min": -0.0035748309455811977, "rewards/margins_std": 0.0041796499863266945, "rewards/rejected": 0.00305617181584239, "step": 140 }, { "dpo_losses": 0.6926361322402954, "epoch": 0.04, "grad_norm": 5.784324034925397, "learning_rate": 1.95822454308094e-07, "logits/chosen": -2.3415417671203613, "logits/rejected": -2.294668674468994, "logps/chosen": -215.10275268554688, "logps/rejected": -243.3657684326172, "loss": 0.6939, "positive_losses": 0.0035098553635179996, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.005242037586867809, "rewards/margins": 0.0010315107647329569, "rewards/margins_max": 0.007911418564617634, "rewards/margins_min": -0.0046582394279539585, "rewards/margins_std": 0.005558903329074383, "rewards/rejected": 0.004210526589304209, "step": 150 }, { "dpo_losses": 0.6928949952125549, "epoch": 0.04, "grad_norm": 2.3046564649658285, "learning_rate": 2.0887728459530023e-07, "logits/chosen": -2.4997358322143555, "logits/rejected": -2.3269925117492676, "logps/chosen": -256.75543212890625, "logps/rejected": -224.8738250732422, "loss": 0.6945, "positive_losses": 0.01890692673623562, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0052116429433226585, "rewards/margins": 0.000508451194036752, "rewards/margins_max": 0.0048551904037594795, "rewards/margins_min": -0.003729505930095911, "rewards/margins_std": 0.00386296259239316, "rewards/rejected": 0.004703192505985498, "step": 160 }, { "dpo_losses": 0.6930381655693054, "epoch": 0.04, "grad_norm": 16.806212884046968, "learning_rate": 2.2193211488250652e-07, "logits/chosen": -2.3000152111053467, "logits/rejected": -2.310934543609619, "logps/chosen": -228.2405242919922, "logps/rejected": -204.77462768554688, "loss": 0.694, "positive_losses": 0.01548094768077135, "rewards/accuracies": 0.5, "rewards/chosen": 0.004649649374186993, "rewards/margins": 0.00022466508380603045, "rewards/margins_max": 0.0046968720853328705, "rewards/margins_min": -0.005565372295677662, "rewards/margins_std": 0.0046022022143006325, "rewards/rejected": 0.004424984101206064, "step": 170 }, { "dpo_losses": 0.692747175693512, "epoch": 0.05, "grad_norm": 3.012373678637585, "learning_rate": 2.349869451697128e-07, "logits/chosen": -2.4903554916381836, "logits/rejected": -2.263298988342285, "logps/chosen": -302.0119934082031, "logps/rejected": -243.8614501953125, "loss": 0.6937, "positive_losses": 0.013303923420608044, "rewards/accuracies": 0.625, "rewards/chosen": 0.0055279238149523735, "rewards/margins": 0.0008107582107186317, "rewards/margins_max": 0.007508331444114447, "rewards/margins_min": -0.005641286727041006, "rewards/margins_std": 0.0057433731853961945, "rewards/rejected": 0.004717165604233742, "step": 180 }, { "dpo_losses": 0.6919572949409485, "epoch": 0.05, "grad_norm": 2.8785860667757444, "learning_rate": 2.4804177545691903e-07, "logits/chosen": -2.6338706016540527, "logits/rejected": -2.3610117435455322, "logps/chosen": -270.59356689453125, "logps/rejected": -206.9786376953125, "loss": 0.6935, "positive_losses": 0.008061980828642845, "rewards/accuracies": 0.625, "rewards/chosen": 0.007459124084562063, "rewards/margins": 0.0023925146088004112, "rewards/margins_max": 0.00886436365544796, "rewards/margins_min": -0.004327102564275265, "rewards/margins_std": 0.006158038042485714, "rewards/rejected": 0.005066608544439077, "step": 190 }, { "dpo_losses": 0.6921755075454712, "epoch": 0.05, "grad_norm": 13.314281075837563, "learning_rate": 2.610966057441253e-07, "logits/chosen": -2.3962268829345703, "logits/rejected": -2.245490789413452, "logps/chosen": -304.4664306640625, "logps/rejected": -251.6834716796875, "loss": 0.694, "positive_losses": 0.01467971783131361, "rewards/accuracies": 0.625, "rewards/chosen": 0.00702693173661828, "rewards/margins": 0.001953268889337778, "rewards/margins_max": 0.007922893390059471, "rewards/margins_min": -0.003625961486250162, "rewards/margins_std": 0.0052861375734210014, "rewards/rejected": 0.005073662847280502, "step": 200 }, { "epoch": 0.05, "eval_dpo_losses": 0.6929535269737244, "eval_logits/chosen": -2.3497445583343506, "eval_logits/rejected": -2.239522695541382, "eval_logps/chosen": -275.1669006347656, "eval_logps/rejected": -262.252685546875, "eval_loss": 0.6935574412345886, "eval_positive_losses": 0.00973793026059866, "eval_rewards/accuracies": 0.5138888955116272, "eval_rewards/chosen": 0.006072176620364189, "eval_rewards/margins": 0.0003964357019867748, "eval_rewards/margins_max": 0.009460194036364555, "eval_rewards/margins_min": -0.008211031556129456, "eval_rewards/margins_std": 0.005887583363801241, "eval_rewards/rejected": 0.005675741471350193, "eval_runtime": 390.3681, "eval_samples_per_second": 5.123, "eval_steps_per_second": 0.161, "step": 200 }, { "dpo_losses": 0.692274808883667, "epoch": 0.05, "grad_norm": 17.537881015328292, "learning_rate": 2.7415143603133156e-07, "logits/chosen": -2.4792137145996094, "logits/rejected": -2.26755690574646, "logps/chosen": -304.48876953125, "logps/rejected": -274.9486083984375, "loss": 0.6935, "positive_losses": 0.0014587402110919356, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0069889710284769535, "rewards/margins": 0.0017549883341416717, "rewards/margins_max": 0.008708182722330093, "rewards/margins_min": -0.004553269594907761, "rewards/margins_std": 0.005933467298746109, "rewards/rejected": 0.005233983509242535, "step": 210 }, { "dpo_losses": 0.6924992799758911, "epoch": 0.06, "grad_norm": 2.718816909843638, "learning_rate": 2.8720626631853785e-07, "logits/chosen": -2.2473559379577637, "logits/rejected": -2.2075276374816895, "logps/chosen": -253.273681640625, "logps/rejected": -287.05999755859375, "loss": 0.6929, "positive_losses": 0.017268944531679153, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0066956751979887486, "rewards/margins": 0.0013056481257081032, "rewards/margins_max": 0.008818237110972404, "rewards/margins_min": -0.004259868524968624, "rewards/margins_std": 0.0058462671004235744, "rewards/rejected": 0.005390027537941933, "step": 220 }, { "dpo_losses": 0.6914334297180176, "epoch": 0.06, "grad_norm": 2.597889182754257, "learning_rate": 3.002610966057441e-07, "logits/chosen": -2.4690744876861572, "logits/rejected": -2.32529616355896, "logps/chosen": -300.3739013671875, "logps/rejected": -254.05679321289062, "loss": 0.6929, "positive_losses": 0.0044761658646166325, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.009891889058053493, "rewards/margins": 0.003442333545535803, "rewards/margins_max": 0.012979629449546337, "rewards/margins_min": -0.002517446642741561, "rewards/margins_std": 0.007012346293777227, "rewards/rejected": 0.006449555512517691, "step": 230 }, { "dpo_losses": 0.6925778388977051, "epoch": 0.06, "grad_norm": 15.046706154134439, "learning_rate": 3.133159268929504e-07, "logits/chosen": -2.3476109504699707, "logits/rejected": -2.0642662048339844, "logps/chosen": -257.81329345703125, "logps/rejected": -218.8253631591797, "loss": 0.693, "positive_losses": 0.012066555209457874, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.006692859344184399, "rewards/margins": 0.001148747862316668, "rewards/margins_max": 0.007607417646795511, "rewards/margins_min": -0.005777723155915737, "rewards/margins_std": 0.006123958621174097, "rewards/rejected": 0.00554411206394434, "step": 240 }, { "dpo_losses": 0.6926454305648804, "epoch": 0.07, "grad_norm": 11.877075656383642, "learning_rate": 3.263707571801567e-07, "logits/chosen": -2.424232006072998, "logits/rejected": -2.399569511413574, "logps/chosen": -266.6890869140625, "logps/rejected": -252.54904174804688, "loss": 0.6936, "positive_losses": 0.011177492327988148, "rewards/accuracies": 0.5, "rewards/chosen": 0.005979129578918219, "rewards/margins": 0.001018165610730648, "rewards/margins_max": 0.0069590238854289055, "rewards/margins_min": -0.006120038218796253, "rewards/margins_std": 0.005888332612812519, "rewards/rejected": 0.004960964433848858, "step": 250 }, { "dpo_losses": 0.6918977499008179, "epoch": 0.07, "grad_norm": 2.5763562232657495, "learning_rate": 3.3942558746736286e-07, "logits/chosen": -2.3855857849121094, "logits/rejected": -2.3012237548828125, "logps/chosen": -332.80828857421875, "logps/rejected": -325.3997802734375, "loss": 0.6924, "positive_losses": 0.0011383056407794356, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.009121356531977654, "rewards/margins": 0.0025252080522477627, "rewards/margins_max": 0.012511787004768848, "rewards/margins_min": -0.004578470252454281, "rewards/margins_std": 0.007813150063157082, "rewards/rejected": 0.006596148945391178, "step": 260 }, { "dpo_losses": 0.6923853158950806, "epoch": 0.07, "grad_norm": 7.226922499479053, "learning_rate": 3.5248041775456916e-07, "logits/chosen": -2.350959300994873, "logits/rejected": -2.2645444869995117, "logps/chosen": -306.7738342285156, "logps/rejected": -292.66705322265625, "loss": 0.6932, "positive_losses": 0.015529441647231579, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.008473177440464497, "rewards/margins": 0.0015398891409859061, "rewards/margins_max": 0.009197860024869442, "rewards/margins_min": -0.0058059715665876865, "rewards/margins_std": 0.00660657649859786, "rewards/rejected": 0.006933287717401981, "step": 270 }, { "dpo_losses": 0.6923667192459106, "epoch": 0.07, "grad_norm": 2.4144727241079833, "learning_rate": 3.6553524804177545e-07, "logits/chosen": -2.458653211593628, "logits/rejected": -2.449664831161499, "logps/chosen": -267.655029296875, "logps/rejected": -241.6058349609375, "loss": 0.6936, "positive_losses": 0.0, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.009895781055092812, "rewards/margins": 0.0015804242575541139, "rewards/margins_max": 0.008207702077925205, "rewards/margins_min": -0.004959435667842627, "rewards/margins_std": 0.005886612925678492, "rewards/rejected": 0.008315357379615307, "step": 280 }, { "dpo_losses": 0.6915370225906372, "epoch": 0.08, "grad_norm": 11.719442687861164, "learning_rate": 3.785900783289817e-07, "logits/chosen": -2.4494004249572754, "logits/rejected": -2.2430827617645264, "logps/chosen": -300.3384704589844, "logps/rejected": -240.4301300048828, "loss": 0.6926, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.010413247160613537, "rewards/margins": 0.0032329782843589783, "rewards/margins_max": 0.010191191919147968, "rewards/margins_min": -0.0028379459399729967, "rewards/margins_std": 0.005778302438557148, "rewards/rejected": 0.007180268410593271, "step": 290 }, { "dpo_losses": 0.6912565231323242, "epoch": 0.08, "grad_norm": 2.7832201719598557, "learning_rate": 3.91644908616188e-07, "logits/chosen": -2.6444625854492188, "logits/rejected": -2.4465010166168213, "logps/chosen": -309.1859436035156, "logps/rejected": -242.1009063720703, "loss": 0.6925, "positive_losses": 0.006983947940170765, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.01172476913779974, "rewards/margins": 0.00381329539231956, "rewards/margins_max": 0.015631547197699547, "rewards/margins_min": -0.004602078814059496, "rewards/margins_std": 0.009133655577898026, "rewards/rejected": 0.007911473512649536, "step": 300 }, { "epoch": 0.08, "eval_dpo_losses": 0.6923103928565979, "eval_logits/chosen": -2.344533681869507, "eval_logits/rejected": -2.234121561050415, "eval_logps/chosen": -274.72064208984375, "eval_logps/rejected": -261.9361877441406, "eval_loss": 0.692529559135437, "eval_positive_losses": 0.006553869228810072, "eval_rewards/accuracies": 0.5595238208770752, "eval_rewards/chosen": 0.010534894652664661, "eval_rewards/margins": 0.001694104983471334, "eval_rewards/margins_max": 0.015569796785712242, "eval_rewards/margins_min": -0.010307504795491695, "eval_rewards/margins_std": 0.008523602038621902, "eval_rewards/rejected": 0.008840790018439293, "eval_runtime": 390.8632, "eval_samples_per_second": 5.117, "eval_steps_per_second": 0.161, "step": 300 }, { "dpo_losses": 0.6937541961669922, "epoch": 0.08, "grad_norm": 3.088577261725526, "learning_rate": 4.046997389033943e-07, "logits/chosen": -2.4557738304138184, "logits/rejected": -2.4186770915985107, "logps/chosen": -277.6371765136719, "logps/rejected": -299.49462890625, "loss": 0.6932, "positive_losses": 0.005030012223869562, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.008825590834021568, "rewards/margins": -0.0011969914194196463, "rewards/margins_max": 0.0061055333353579044, "rewards/margins_min": -0.01069082785397768, "rewards/margins_std": 0.0073821828700602055, "rewards/rejected": 0.010022582486271858, "step": 310 }, { "dpo_losses": 0.6908339262008667, "epoch": 0.08, "grad_norm": 9.829216391607567, "learning_rate": 4.1775456919060046e-07, "logits/chosen": -2.4529805183410645, "logits/rejected": -2.3554484844207764, "logps/chosen": -300.8166809082031, "logps/rejected": -263.30596923828125, "loss": 0.6923, "positive_losses": 0.0026596069801598787, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.013345083221793175, "rewards/margins": 0.004670474678277969, "rewards/margins_max": 0.017103437334299088, "rewards/margins_min": -0.007430749479681253, "rewards/margins_std": 0.010712040588259697, "rewards/rejected": 0.008674608543515205, "step": 320 }, { "dpo_losses": 0.6927045583724976, "epoch": 0.09, "grad_norm": 2.7521626031987645, "learning_rate": 4.3080939947780675e-07, "logits/chosen": -2.393118381500244, "logits/rejected": -2.278369426727295, "logps/chosen": -273.70330810546875, "logps/rejected": -287.0301818847656, "loss": 0.6931, "positive_losses": 0.009592438116669655, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.010709630325436592, "rewards/margins": 0.0009189089760184288, "rewards/margins_max": 0.012000239454209805, "rewards/margins_min": -0.010556844994425774, "rewards/margins_std": 0.010111270472407341, "rewards/rejected": 0.009790720418095589, "step": 330 }, { "dpo_losses": 0.6898068189620972, "epoch": 0.09, "grad_norm": 3.160895224650102, "learning_rate": 4.4386422976501305e-07, "logits/chosen": -2.3239378929138184, "logits/rejected": -2.3136210441589355, "logps/chosen": -291.5858154296875, "logps/rejected": -281.27252197265625, "loss": 0.693, "positive_losses": 0.0, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.014423887245357037, "rewards/margins": 0.006730523891746998, "rewards/margins_max": 0.020521607249975204, "rewards/margins_min": -0.004177724476903677, "rewards/margins_std": 0.011026902124285698, "rewards/rejected": 0.007693366147577763, "step": 340 }, { "dpo_losses": 0.691826343536377, "epoch": 0.09, "grad_norm": 6.181606684346002, "learning_rate": 4.569190600522193e-07, "logits/chosen": -2.317305326461792, "logits/rejected": -2.13057804107666, "logps/chosen": -263.7362976074219, "logps/rejected": -232.4236297607422, "loss": 0.6932, "positive_losses": 0.0, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.012940694577991962, "rewards/margins": 0.0026776641607284546, "rewards/margins_max": 0.014089075848460197, "rewards/margins_min": -0.008894523605704308, "rewards/margins_std": 0.010269769467413425, "rewards/rejected": 0.010263030417263508, "step": 350 }, { "dpo_losses": 0.6904358863830566, "epoch": 0.09, "grad_norm": 3.1469547386635965, "learning_rate": 4.699738903394256e-07, "logits/chosen": -2.6787221431732178, "logits/rejected": -2.4300901889801025, "logps/chosen": -361.8331298828125, "logps/rejected": -282.56341552734375, "loss": 0.6921, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.01704949513077736, "rewards/margins": 0.005480791442096233, "rewards/margins_max": 0.020162995904684067, "rewards/margins_min": -0.007818843238055706, "rewards/margins_std": 0.013104942627251148, "rewards/rejected": 0.011568702757358551, "step": 360 }, { "dpo_losses": 0.6934413909912109, "epoch": 0.1, "grad_norm": 2.511112976589279, "learning_rate": 4.830287206266319e-07, "logits/chosen": -2.240891218185425, "logits/rejected": -2.2595362663269043, "logps/chosen": -200.19371032714844, "logps/rejected": -233.42678833007812, "loss": 0.6934, "positive_losses": 0.045371245592832565, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.00877201184630394, "rewards/margins": -0.0005743166548199952, "rewards/margins_max": 0.0075528621673583984, "rewards/margins_min": -0.008113959804177284, "rewards/margins_std": 0.00696157943457365, "rewards/rejected": 0.009346329607069492, "step": 370 }, { "dpo_losses": 0.6904364824295044, "epoch": 0.1, "grad_norm": 8.38590270977505, "learning_rate": 4.960835509138381e-07, "logits/chosen": -2.435319185256958, "logits/rejected": -2.1107144355773926, "logps/chosen": -261.4594421386719, "logps/rejected": -253.9053192138672, "loss": 0.692, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.013392363674938679, "rewards/margins": 0.00547355692833662, "rewards/margins_max": 0.019673766568303108, "rewards/margins_min": -0.004629914648830891, "rewards/margins_std": 0.011092177592217922, "rewards/rejected": 0.007918806746602058, "step": 380 }, { "dpo_losses": 0.6912089586257935, "epoch": 0.1, "grad_norm": 2.4234607058409523, "learning_rate": 4.999948856244767e-07, "logits/chosen": -2.5964977741241455, "logits/rejected": -2.4474472999572754, "logps/chosen": -267.3672790527344, "logps/rejected": -252.64822387695312, "loss": 0.6914, "positive_losses": 0.0031883239280432463, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.016581350937485695, "rewards/margins": 0.003933814354240894, "rewards/margins_max": 0.02252252958714962, "rewards/margins_min": -0.0087641142308712, "rewards/margins_std": 0.014020366594195366, "rewards/rejected": 0.012647537514567375, "step": 390 }, { "dpo_losses": 0.6926881670951843, "epoch": 0.1, "grad_norm": 2.364769635584474, "learning_rate": 4.999698361256577e-07, "logits/chosen": -2.380741596221924, "logits/rejected": -2.257896900177002, "logps/chosen": -228.54287719726562, "logps/rejected": -222.64077758789062, "loss": 0.6922, "positive_losses": 0.0, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.012776264920830727, "rewards/margins": 0.0009694203035905957, "rewards/margins_max": 0.0137071143835783, "rewards/margins_min": -0.012924333103001118, "rewards/margins_std": 0.012163150124251842, "rewards/rejected": 0.011806843802332878, "step": 400 }, { "epoch": 0.1, "eval_dpo_losses": 0.6913719773292542, "eval_logits/chosen": -2.3339977264404297, "eval_logits/rejected": -2.2231147289276123, "eval_logps/chosen": -274.14910888671875, "eval_logps/rejected": -261.5552673339844, "eval_loss": 0.6916861534118652, "eval_positive_losses": 0.006425357889384031, "eval_rewards/accuracies": 0.591269850730896, "eval_rewards/chosen": 0.0162503644824028, "eval_rewards/margins": 0.0036002290435135365, "eval_rewards/margins_max": 0.02482009492814541, "eval_rewards/margins_min": -0.014531257562339306, "eval_rewards/margins_std": 0.012822597287595272, "eval_rewards/rejected": 0.012650134041905403, "eval_runtime": 390.5651, "eval_samples_per_second": 5.121, "eval_steps_per_second": 0.161, "step": 400 }, { "dpo_losses": 0.6915867328643799, "epoch": 0.11, "grad_norm": 9.14541474400389, "learning_rate": 4.99923914217458e-07, "logits/chosen": -2.330832004547119, "logits/rejected": -2.1426444053649902, "logps/chosen": -291.74066162109375, "logps/rejected": -291.671630859375, "loss": 0.692, "positive_losses": 0.00031595228938385844, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.015372270718216896, "rewards/margins": 0.0031558140181005, "rewards/margins_max": 0.014808600768446922, "rewards/margins_min": -0.009504411369562149, "rewards/margins_std": 0.010665502399206161, "rewards/rejected": 0.012216455303132534, "step": 410 }, { "dpo_losses": 0.69011390209198, "epoch": 0.11, "grad_norm": 2.2472459043940147, "learning_rate": 4.99857123734344e-07, "logits/chosen": -2.380620002746582, "logits/rejected": -2.242215394973755, "logps/chosen": -258.4724426269531, "logps/rejected": -228.89181518554688, "loss": 0.6906, "positive_losses": 0.0019479751354083419, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.014588996767997742, "rewards/margins": 0.006132002919912338, "rewards/margins_max": 0.024467330425977707, "rewards/margins_min": -0.009631652384996414, "rewards/margins_std": 0.015156319364905357, "rewards/rejected": 0.008456994779407978, "step": 420 }, { "dpo_losses": 0.6890771985054016, "epoch": 0.11, "grad_norm": 23.393899549515307, "learning_rate": 4.997694702533016e-07, "logits/chosen": -2.370934009552002, "logits/rejected": -2.1254513263702393, "logps/chosen": -287.72265625, "logps/rejected": -215.23715209960938, "loss": 0.6911, "positive_losses": 0.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.018906597048044205, "rewards/margins": 0.008222071453928947, "rewards/margins_max": 0.024109777063131332, "rewards/margins_min": -0.004399652127176523, "rewards/margins_std": 0.013075257651507854, "rewards/rejected": 0.010684525594115257, "step": 430 }, { "dpo_losses": 0.6907385587692261, "epoch": 0.12, "grad_norm": 2.7484314519957707, "learning_rate": 4.996609610933712e-07, "logits/chosen": -2.4527649879455566, "logits/rejected": -2.432299852371216, "logps/chosen": -268.7482604980469, "logps/rejected": -262.2334289550781, "loss": 0.6907, "positive_losses": 0.019094085320830345, "rewards/accuracies": 0.625, "rewards/chosen": 0.017837559804320335, "rewards/margins": 0.00492429081350565, "rewards/margins_max": 0.02515985444188118, "rewards/margins_min": -0.011089869774878025, "rewards/margins_std": 0.016181860119104385, "rewards/rejected": 0.01291326992213726, "step": 440 }, { "dpo_losses": 0.6903314590454102, "epoch": 0.12, "grad_norm": 2.634886244719784, "learning_rate": 4.995316053150366e-07, "logits/chosen": -2.5657308101654053, "logits/rejected": -2.346353054046631, "logps/chosen": -274.9120178222656, "logps/rejected": -228.2688446044922, "loss": 0.6923, "positive_losses": 0.022286225110292435, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.01831643097102642, "rewards/margins": 0.005674588028341532, "rewards/margins_max": 0.018724936991930008, "rewards/margins_min": -0.0074019655585289, "rewards/margins_std": 0.011426335200667381, "rewards/rejected": 0.012641841545701027, "step": 450 }, { "dpo_losses": 0.6908451914787292, "epoch": 0.12, "grad_norm": 6.632279049974339, "learning_rate": 4.99381413719468e-07, "logits/chosen": -2.284437656402588, "logits/rejected": -2.2629213333129883, "logps/chosen": -236.77194213867188, "logps/rejected": -255.356201171875, "loss": 0.6904, "positive_losses": 0.02016448974609375, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.014919276349246502, "rewards/margins": 0.004663817584514618, "rewards/margins_max": 0.019028306007385254, "rewards/margins_min": -0.006994474679231644, "rewards/margins_std": 0.011909430846571922, "rewards/rejected": 0.010255459696054459, "step": 460 }, { "dpo_losses": 0.6905598044395447, "epoch": 0.12, "grad_norm": 1.9464485497405548, "learning_rate": 4.992103988476205e-07, "logits/chosen": -2.467581033706665, "logits/rejected": -2.435335636138916, "logps/chosen": -278.9448547363281, "logps/rejected": -277.6481628417969, "loss": 0.6937, "positive_losses": 0.051589202135801315, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.018899690359830856, "rewards/margins": 0.005235827527940273, "rewards/margins_max": 0.019831741228699684, "rewards/margins_min": -0.010511765256524086, "rewards/margins_std": 0.013238822109997272, "rewards/rejected": 0.013663860969245434, "step": 470 }, { "dpo_losses": 0.6905876398086548, "epoch": 0.13, "grad_norm": 11.440945465926182, "learning_rate": 4.990185749791864e-07, "logits/chosen": -2.4389989376068115, "logits/rejected": -2.3032586574554443, "logps/chosen": -276.0810546875, "logps/rejected": -233.3933868408203, "loss": 0.6902, "positive_losses": 0.0, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.020623551681637764, "rewards/margins": 0.005269557237625122, "rewards/margins_max": 0.02258582040667534, "rewards/margins_min": -0.01461974997073412, "rewards/margins_std": 0.016966843977570534, "rewards/rejected": 0.015353994444012642, "step": 480 }, { "dpo_losses": 0.6883918046951294, "epoch": 0.13, "grad_norm": 8.84117105037518, "learning_rate": 4.988059581314039e-07, "logits/chosen": -2.414215087890625, "logits/rejected": -2.4071717262268066, "logps/chosen": -305.7737121582031, "logps/rejected": -305.1197814941406, "loss": 0.6915, "positive_losses": 0.019093703478574753, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.026078814640641212, "rewards/margins": 0.009710654616355896, "rewards/margins_max": 0.03801741451025009, "rewards/margins_min": -0.015532249584794044, "rewards/margins_std": 0.02291879430413246, "rewards/rejected": 0.016368160024285316, "step": 490 }, { "dpo_losses": 0.6884673833847046, "epoch": 0.13, "grad_norm": 11.875548661256008, "learning_rate": 4.985725660577184e-07, "logits/chosen": -2.4249682426452637, "logits/rejected": -2.3980660438537598, "logps/chosen": -280.6375427246094, "logps/rejected": -239.54928588867188, "loss": 0.6911, "positive_losses": 0.02021026611328125, "rewards/accuracies": 0.625, "rewards/chosen": 0.02335011586546898, "rewards/margins": 0.009515838697552681, "rewards/margins_max": 0.035986822098493576, "rewards/margins_min": -0.010793705470860004, "rewards/margins_std": 0.02099209651350975, "rewards/rejected": 0.013834277167916298, "step": 500 }, { "epoch": 0.13, "eval_dpo_losses": 0.6901781558990479, "eval_logits/chosen": -2.338940143585205, "eval_logits/rejected": -2.228980541229248, "eval_logps/chosen": -273.4386901855469, "eval_logps/rejected": -261.0899353027344, "eval_loss": 0.6910428404808044, "eval_positive_losses": 0.013966274447739124, "eval_rewards/accuracies": 0.591269850730896, "eval_rewards/chosen": 0.023354750126600266, "eval_rewards/margins": 0.0060513378120958805, "eval_rewards/margins_max": 0.038669101893901825, "eval_rewards/margins_min": -0.02098749577999115, "eval_rewards/margins_std": 0.019231565296649933, "eval_rewards/rejected": 0.017303410917520523, "eval_runtime": 390.4963, "eval_samples_per_second": 5.122, "eval_steps_per_second": 0.161, "step": 500 }, { "dpo_losses": 0.6901510953903198, "epoch": 0.13, "grad_norm": 12.491752650342695, "learning_rate": 4.983184182463008e-07, "logits/chosen": -2.3167169094085693, "logits/rejected": -2.192013740539551, "logps/chosen": -253.66543579101562, "logps/rejected": -244.1859893798828, "loss": 0.6921, "positive_losses": 0.053206730633974075, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.020629312843084335, "rewards/margins": 0.0061425017192959785, "rewards/margins_max": 0.030502531677484512, "rewards/margins_min": -0.013592461124062538, "rewards/margins_std": 0.019782716408371925, "rewards/rejected": 0.014486810192465782, "step": 510 }, { "dpo_losses": 0.6902118921279907, "epoch": 0.14, "grad_norm": 10.94952603363854, "learning_rate": 4.980435359184203e-07, "logits/chosen": -2.2864110469818115, "logits/rejected": -2.039227247238159, "logps/chosen": -258.0054626464844, "logps/rejected": -231.8078155517578, "loss": 0.6915, "positive_losses": 0.02971954271197319, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.021311722695827484, "rewards/margins": 0.005946998484432697, "rewards/margins_max": 0.022834371775388718, "rewards/margins_min": -0.009917364455759525, "rewards/margins_std": 0.014656739309430122, "rewards/rejected": 0.015364723280072212, "step": 520 }, { "dpo_losses": 0.6898611187934875, "epoch": 0.14, "grad_norm": 2.981149258300992, "learning_rate": 4.977479420266723e-07, "logits/chosen": -2.507845401763916, "logits/rejected": -2.3874363899230957, "logps/chosen": -312.6935729980469, "logps/rejected": -295.5491027832031, "loss": 0.6915, "positive_losses": 0.0047630309127271175, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.027600998058915138, "rewards/margins": 0.006633379962295294, "rewards/margins_max": 0.01997458003461361, "rewards/margins_min": -0.005872023291885853, "rewards/margins_std": 0.011955427937209606, "rewards/rejected": 0.02096761390566826, "step": 530 }, { "dpo_losses": 0.6923002004623413, "epoch": 0.14, "grad_norm": 8.849342761446769, "learning_rate": 4.974316612530614e-07, "logits/chosen": -2.3498849868774414, "logits/rejected": -2.3088784217834473, "logps/chosen": -266.21124267578125, "logps/rejected": -242.4516143798828, "loss": 0.6907, "positive_losses": 0.0, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.02364932931959629, "rewards/margins": 0.0018468145281076431, "rewards/margins_max": 0.024594077840447426, "rewards/margins_min": -0.021861569955945015, "rewards/margins_std": 0.02070799469947815, "rewards/rejected": 0.021802512928843498, "step": 540 }, { "dpo_losses": 0.6895166635513306, "epoch": 0.14, "grad_norm": 11.315253328394473, "learning_rate": 4.970947200069415e-07, "logits/chosen": -2.5822155475616455, "logits/rejected": -2.475471019744873, "logps/chosen": -288.88934326171875, "logps/rejected": -258.79620361328125, "loss": 0.6908, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.026873474940657616, "rewards/margins": 0.007388119585812092, "rewards/margins_max": 0.030571982264518738, "rewards/margins_min": -0.012608656659722328, "rewards/margins_std": 0.019486630335450172, "rewards/rejected": 0.01948535442352295, "step": 550 }, { "dpo_losses": 0.689440131187439, "epoch": 0.15, "grad_norm": 3.151088005965224, "learning_rate": 4.967371464228095e-07, "logits/chosen": -2.439462661743164, "logits/rejected": -2.4694623947143555, "logps/chosen": -282.3801574707031, "logps/rejected": -262.3647155761719, "loss": 0.6918, "positive_losses": 0.0, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.02894566021859646, "rewards/margins": 0.0075555541552603245, "rewards/margins_max": 0.03021211549639702, "rewards/margins_min": -0.011896403506398201, "rewards/margins_std": 0.018885262310504913, "rewards/rejected": 0.02139010652899742, "step": 560 }, { "dpo_losses": 0.6894875764846802, "epoch": 0.15, "grad_norm": 2.410633602507762, "learning_rate": 4.963589703579569e-07, "logits/chosen": -2.4311766624450684, "logits/rejected": -2.2668728828430176, "logps/chosen": -261.2833557128906, "logps/rejected": -231.4168701171875, "loss": 0.6914, "positive_losses": 0.0, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.025986915454268456, "rewards/margins": 0.007469603326171637, "rewards/margins_max": 0.029773946851491928, "rewards/margins_min": -0.00879929680377245, "rewards/margins_std": 0.017354438081383705, "rewards/rejected": 0.018517309799790382, "step": 570 }, { "dpo_losses": 0.6805204153060913, "epoch": 0.15, "grad_norm": 2.8906466096637358, "learning_rate": 4.959602233899761e-07, "logits/chosen": -2.794351100921631, "logits/rejected": -2.490011215209961, "logps/chosen": -351.5090637207031, "logps/rejected": -261.6322326660156, "loss": 0.6875, "positive_losses": 0.0, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.04494539275765419, "rewards/margins": 0.025680119171738625, "rewards/margins_max": 0.06106184050440788, "rewards/margins_min": 0.0007383271004073322, "rewards/margins_std": 0.027541790157556534, "rewards/rejected": 0.019265275448560715, "step": 580 }, { "dpo_losses": 0.69092857837677, "epoch": 0.15, "grad_norm": 2.294669824922033, "learning_rate": 4.955409388141243e-07, "logits/chosen": -2.4268391132354736, "logits/rejected": -2.3248438835144043, "logps/chosen": -241.16159057617188, "logps/rejected": -259.2159118652344, "loss": 0.69, "positive_losses": 0.03714141994714737, "rewards/accuracies": 0.625, "rewards/chosen": 0.019633669406175613, "rewards/margins": 0.0045719086192548275, "rewards/margins_max": 0.029149528592824936, "rewards/margins_min": -0.018686216324567795, "rewards/margins_std": 0.02102431282401085, "rewards/rejected": 0.015061760321259499, "step": 590 }, { "dpo_losses": 0.6852045059204102, "epoch": 0.16, "grad_norm": 2.3149794611833348, "learning_rate": 4.951011516405429e-07, "logits/chosen": -2.5475382804870605, "logits/rejected": -2.3969569206237793, "logps/chosen": -385.9237365722656, "logps/rejected": -307.8304138183594, "loss": 0.6871, "positive_losses": 0.0025314330123364925, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.038226108998060226, "rewards/margins": 0.0162517037242651, "rewards/margins_max": 0.060403406620025635, "rewards/margins_min": -0.016597673296928406, "rewards/margins_std": 0.03403856232762337, "rewards/rejected": 0.021974410861730576, "step": 600 }, { "epoch": 0.16, "eval_dpo_losses": 0.6889464855194092, "eval_logits/chosen": -2.3304007053375244, "eval_logits/rejected": -2.220170021057129, "eval_logps/chosen": -272.81805419921875, "eval_logps/rejected": -260.724365234375, "eval_loss": 0.6908224821090698, "eval_positive_losses": 0.022509079426527023, "eval_rewards/accuracies": 0.6111111044883728, "eval_rewards/chosen": 0.029561027884483337, "eval_rewards/margins": 0.008601733483374119, "eval_rewards/margins_max": 0.05209686607122421, "eval_rewards/margins_min": -0.026402389630675316, "eval_rewards/margins_std": 0.02516203001141548, "eval_rewards/rejected": 0.020959289744496346, "eval_runtime": 390.609, "eval_samples_per_second": 5.12, "eval_steps_per_second": 0.161, "step": 600 }, { "dpo_losses": 0.6866358518600464, "epoch": 0.16, "grad_norm": 9.671348840948728, "learning_rate": 4.946408985913344e-07, "logits/chosen": -2.4539263248443604, "logits/rejected": -2.2442119121551514, "logps/chosen": -325.1873474121094, "logps/rejected": -263.95623779296875, "loss": 0.6887, "positive_losses": 0.01163635216653347, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.03424832224845886, "rewards/margins": 0.013244586996734142, "rewards/margins_max": 0.04095269739627838, "rewards/margins_min": -0.012006950564682484, "rewards/margins_std": 0.024460995569825172, "rewards/rejected": 0.021003730595111847, "step": 610 }, { "dpo_losses": 0.6868307590484619, "epoch": 0.16, "grad_norm": 13.93417250252336, "learning_rate": 4.941602180974958e-07, "logits/chosen": -2.4058642387390137, "logits/rejected": -2.305898904800415, "logps/chosen": -259.3505554199219, "logps/rejected": -228.6035919189453, "loss": 0.6878, "positive_losses": 0.0, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.03157050162553787, "rewards/margins": 0.012882744893431664, "rewards/margins_max": 0.04328671097755432, "rewards/margins_min": -0.01362523902207613, "rewards/margins_std": 0.024857234209775925, "rewards/rejected": 0.01868775486946106, "step": 620 }, { "dpo_losses": 0.6849768757820129, "epoch": 0.16, "grad_norm": 2.251428468567982, "learning_rate": 4.936591502957101e-07, "logits/chosen": -2.540921449661255, "logits/rejected": -2.415156841278076, "logps/chosen": -273.33978271484375, "logps/rejected": -264.990478515625, "loss": 0.691, "positive_losses": 0.0054916380904614925, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.03927159309387207, "rewards/margins": 0.01672983542084694, "rewards/margins_max": 0.05854141712188721, "rewards/margins_min": -0.008396012708544731, "rewards/margins_std": 0.03006715141236782, "rewards/rejected": 0.02254176139831543, "step": 630 }, { "dpo_losses": 0.6834777593612671, "epoch": 0.17, "grad_norm": 2.9978873443406626, "learning_rate": 4.931377370249945e-07, "logits/chosen": -2.465360164642334, "logits/rejected": -2.323390483856201, "logps/chosen": -318.85009765625, "logps/rejected": -254.32638549804688, "loss": 0.6921, "positive_losses": 0.008511352352797985, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.04036317020654678, "rewards/margins": 0.019664334133267403, "rewards/margins_max": 0.05634418874979019, "rewards/margins_min": -0.006656390614807606, "rewards/margins_std": 0.028288107365369797, "rewards/rejected": 0.02069883793592453, "step": 640 }, { "dpo_losses": 0.690898597240448, "epoch": 0.17, "grad_norm": 2.9666367890071452, "learning_rate": 4.925960218232072e-07, "logits/chosen": -2.475574254989624, "logits/rejected": -2.438917875289917, "logps/chosen": -265.4050598144531, "logps/rejected": -275.9569396972656, "loss": 0.6888, "positive_losses": 0.00336112966760993, "rewards/accuracies": 0.5, "rewards/chosen": 0.03348889201879501, "rewards/margins": 0.004889338277280331, "rewards/margins_max": 0.045515276491642, "rewards/margins_min": -0.032374169677495956, "rewards/margins_std": 0.03493247181177139, "rewards/rejected": 0.02859955094754696, "step": 650 }, { "dpo_losses": 0.6893867254257202, "epoch": 0.17, "grad_norm": 13.383728031658093, "learning_rate": 4.920340499234116e-07, "logits/chosen": -2.479072093963623, "logits/rejected": -2.3308908939361572, "logps/chosen": -219.2537384033203, "logps/rejected": -241.07998657226562, "loss": 0.6899, "positive_losses": 0.013996887020766735, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.028124403208494186, "rewards/margins": 0.0077577875927090645, "rewards/margins_max": 0.03608463332056999, "rewards/margins_min": -0.02335934340953827, "rewards/margins_std": 0.026355575770139694, "rewards/rejected": 0.020366612821817398, "step": 660 }, { "dpo_losses": 0.6918702125549316, "epoch": 0.18, "grad_norm": 2.571766172681382, "learning_rate": 4.914518682500995e-07, "logits/chosen": -2.4899744987487793, "logits/rejected": -2.4399514198303223, "logps/chosen": -235.8241424560547, "logps/rejected": -237.6702423095703, "loss": 0.6903, "positive_losses": 0.02091064490377903, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.029015814885497093, "rewards/margins": 0.002817349974066019, "rewards/margins_max": 0.032551538199186325, "rewards/margins_min": -0.02978183887898922, "rewards/margins_std": 0.02787242829799652, "rewards/rejected": 0.02619846537709236, "step": 670 }, { "dpo_losses": 0.6856767535209656, "epoch": 0.18, "grad_norm": 2.5807895646917203, "learning_rate": 4.90849525415273e-07, "logits/chosen": -2.346619129180908, "logits/rejected": -2.15134859085083, "logps/chosen": -319.52606201171875, "logps/rejected": -289.216796875, "loss": 0.6872, "positive_losses": 0.01070404052734375, "rewards/accuracies": 0.625, "rewards/chosen": 0.04515766352415085, "rewards/margins": 0.015239179134368896, "rewards/margins_max": 0.055382005870342255, "rewards/margins_min": -0.01504664309322834, "rewards/margins_std": 0.03229757770895958, "rewards/rejected": 0.02991848811507225, "step": 680 }, { "dpo_losses": 0.6813660264015198, "epoch": 0.18, "grad_norm": 2.44720288414687, "learning_rate": 4.902270717143858e-07, "logits/chosen": -2.451616048812866, "logits/rejected": -2.238523006439209, "logps/chosen": -269.92657470703125, "logps/rejected": -234.30697631835938, "loss": 0.6866, "positive_losses": 0.0, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.04334636777639389, "rewards/margins": 0.02393108420073986, "rewards/margins_max": 0.05526786297559738, "rewards/margins_min": -0.007282085716724396, "rewards/margins_std": 0.026958834379911423, "rewards/rejected": 0.01941528357565403, "step": 690 }, { "dpo_losses": 0.6902865171432495, "epoch": 0.18, "grad_norm": 2.4027807815131177, "learning_rate": 4.895845591221426e-07, "logits/chosen": -2.377312421798706, "logits/rejected": -2.3240437507629395, "logps/chosen": -255.6237030029297, "logps/rejected": -256.7688903808594, "loss": 0.689, "positive_losses": 0.010212135501205921, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.037796467542648315, "rewards/margins": 0.005938265006989241, "rewards/margins_max": 0.03653832525014877, "rewards/margins_min": -0.018000982701778412, "rewards/margins_std": 0.024616271257400513, "rewards/rejected": 0.03185820206999779, "step": 700 }, { "epoch": 0.18, "eval_dpo_losses": 0.6875638365745544, "eval_logits/chosen": -2.3290038108825684, "eval_logits/rejected": -2.2190423011779785, "eval_logps/chosen": -271.92645263671875, "eval_logps/rejected": -260.12255859375, "eval_loss": 0.6901445984840393, "eval_positive_losses": 0.027367547154426575, "eval_rewards/accuracies": 0.6230158805847168, "eval_rewards/chosen": 0.03847659006714821, "eval_rewards/margins": 0.011499141342937946, "eval_rewards/margins_max": 0.06611426919698715, "eval_rewards/margins_min": -0.03457736596465111, "eval_rewards/margins_std": 0.032335616648197174, "eval_rewards/rejected": 0.02697744406759739, "eval_runtime": 390.2443, "eval_samples_per_second": 5.125, "eval_steps_per_second": 0.161, "step": 700 }, { "dpo_losses": 0.688839316368103, "epoch": 0.19, "grad_norm": 2.3384729614586925, "learning_rate": 4.8892204128816e-07, "logits/chosen": -2.3998520374298096, "logits/rejected": -2.2950470447540283, "logps/chosen": -217.37109375, "logps/rejected": -184.52944946289062, "loss": 0.6886, "positive_losses": 0.0, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.029643137007951736, "rewards/margins": 0.008792102336883545, "rewards/margins_max": 0.03336808830499649, "rewards/margins_min": -0.012953135184943676, "rewards/margins_std": 0.02139660343527794, "rewards/rejected": 0.020851030945777893, "step": 710 }, { "dpo_losses": 0.6816563606262207, "epoch": 0.19, "grad_norm": 15.246645699239062, "learning_rate": 4.882395735324863e-07, "logits/chosen": -2.4097650051116943, "logits/rejected": -2.1812243461608887, "logps/chosen": -339.34832763671875, "logps/rejected": -289.8691101074219, "loss": 0.687, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.05065031722187996, "rewards/margins": 0.02353280782699585, "rewards/margins_max": 0.07415162026882172, "rewards/margins_min": -0.010938909836113453, "rewards/margins_std": 0.039117004722356796, "rewards/rejected": 0.02711750566959381, "step": 720 }, { "dpo_losses": 0.6858643889427185, "epoch": 0.19, "grad_norm": 2.821074744693753, "learning_rate": 4.875372128409829e-07, "logits/chosen": -2.6435656547546387, "logits/rejected": -2.3795571327209473, "logps/chosen": -302.74102783203125, "logps/rejected": -263.6440734863281, "loss": 0.6926, "positive_losses": 0.17191104590892792, "rewards/accuracies": 0.625, "rewards/chosen": 0.03571704775094986, "rewards/margins": 0.0152281504124403, "rewards/margins_max": 0.06637503206729889, "rewards/margins_min": -0.0372738391160965, "rewards/margins_std": 0.04622985050082207, "rewards/rejected": 0.02048889361321926, "step": 730 }, { "dpo_losses": 0.6894487142562866, "epoch": 0.19, "grad_norm": 2.1808845760441105, "learning_rate": 4.868150178605653e-07, "logits/chosen": -2.269254207611084, "logits/rejected": -2.438769817352295, "logps/chosen": -252.0069580078125, "logps/rejected": -322.34429931640625, "loss": 0.6936, "positive_losses": 0.10730285942554474, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.04121936112642288, "rewards/margins": 0.008252440020442009, "rewards/margins_max": 0.0700400248169899, "rewards/margins_min": -0.05098434165120125, "rewards/margins_std": 0.05312617868185043, "rewards/rejected": 0.032966919243335724, "step": 740 }, { "dpo_losses": 0.6892884969711304, "epoch": 0.2, "grad_norm": 2.6276569543885078, "learning_rate": 4.860730488943068e-07, "logits/chosen": -2.4644899368286133, "logits/rejected": -2.486419200897217, "logps/chosen": -236.35067749023438, "logps/rejected": -258.2120056152344, "loss": 0.6911, "positive_losses": 0.028100013732910156, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.03869793191552162, "rewards/margins": 0.008012340404093266, "rewards/margins_max": 0.04519136995077133, "rewards/margins_min": -0.01695234328508377, "rewards/margins_std": 0.0280009713023901, "rewards/rejected": 0.030685584992170334, "step": 750 }, { "dpo_losses": 0.6833207011222839, "epoch": 0.2, "grad_norm": 2.6070379881847483, "learning_rate": 4.853113678964021e-07, "logits/chosen": -2.5191140174865723, "logits/rejected": -2.322978973388672, "logps/chosen": -271.30255126953125, "logps/rejected": -236.1392364501953, "loss": 0.6896, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.04541131108999252, "rewards/margins": 0.020073089748620987, "rewards/margins_max": 0.055327899754047394, "rewards/margins_min": -0.0054566748440265656, "rewards/margins_std": 0.02755601704120636, "rewards/rejected": 0.025338217616081238, "step": 760 }, { "dpo_losses": 0.6883939504623413, "epoch": 0.2, "grad_norm": 17.86778359029702, "learning_rate": 4.845300384669957e-07, "logits/chosen": -2.4559013843536377, "logits/rejected": -2.2184033393859863, "logps/chosen": -295.0942687988281, "logps/rejected": -267.3595275878906, "loss": 0.6894, "positive_losses": 0.027328873053193092, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.04757463186979294, "rewards/margins": 0.009880626574158669, "rewards/margins_max": 0.05066379904747009, "rewards/margins_min": -0.028831705451011658, "rewards/margins_std": 0.035041529685258865, "rewards/rejected": 0.03769400343298912, "step": 770 }, { "dpo_losses": 0.6819594502449036, "epoch": 0.2, "grad_norm": 2.7145057961863515, "learning_rate": 4.8372912584687e-07, "logits/chosen": -2.4299702644348145, "logits/rejected": -2.314842939376831, "logps/chosen": -293.68243408203125, "logps/rejected": -271.77008056640625, "loss": 0.6853, "positive_losses": 0.01267166156321764, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.05747154355049133, "rewards/margins": 0.022886093705892563, "rewards/margins_max": 0.06950096040964127, "rewards/margins_min": -0.012445281259715557, "rewards/margins_std": 0.036640461534261703, "rewards/rejected": 0.03458544611930847, "step": 780 }, { "dpo_losses": 0.6872497200965881, "epoch": 0.21, "grad_norm": 2.6720034919761635, "learning_rate": 4.829086969119983e-07, "logits/chosen": -2.436641216278076, "logits/rejected": -2.3978874683380127, "logps/chosen": -271.9613952636719, "logps/rejected": -282.0673828125, "loss": 0.6917, "positive_losses": 0.08474578708410263, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.04411480575799942, "rewards/margins": 0.012200284749269485, "rewards/margins_max": 0.05773855373263359, "rewards/margins_min": -0.025964293628931046, "rewards/margins_std": 0.036045514047145844, "rewards/rejected": 0.03191452473402023, "step": 790 }, { "dpo_losses": 0.682244598865509, "epoch": 0.21, "grad_norm": 2.7240400040723745, "learning_rate": 4.820688201679605e-07, "logits/chosen": -2.253638744354248, "logits/rejected": -2.132657527923584, "logps/chosen": -318.6755065917969, "logps/rejected": -236.60183715820312, "loss": 0.6859, "positive_losses": 0.01424484234303236, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.04910421371459961, "rewards/margins": 0.022288832813501358, "rewards/margins_max": 0.06530847400426865, "rewards/margins_min": -0.009823386557400227, "rewards/margins_std": 0.03450342267751694, "rewards/rejected": 0.02681538462638855, "step": 800 }, { "epoch": 0.21, "eval_dpo_losses": 0.6860873103141785, "eval_logits/chosen": -2.3311781883239746, "eval_logits/rejected": -2.2215147018432617, "eval_logps/chosen": -270.8916320800781, "eval_logps/rejected": -259.393310546875, "eval_loss": 0.6882888078689575, "eval_positive_losses": 0.016842538490891457, "eval_rewards/accuracies": 0.6349206566810608, "eval_rewards/chosen": 0.04882540926337242, "eval_rewards/margins": 0.014555813744664192, "eval_rewards/margins_max": 0.0764850303530693, "eval_rewards/margins_min": -0.0370931476354599, "eval_rewards/margins_std": 0.036580607295036316, "eval_rewards/rejected": 0.03426959738135338, "eval_runtime": 390.0239, "eval_samples_per_second": 5.128, "eval_steps_per_second": 0.162, "step": 800 }, { "dpo_losses": 0.684611439704895, "epoch": 0.21, "grad_norm": 11.333237666305978, "learning_rate": 4.812095657442231e-07, "logits/chosen": -2.4185566902160645, "logits/rejected": -2.295619487762451, "logps/chosen": -285.80169677734375, "logps/rejected": -265.4990234375, "loss": 0.6899, "positive_losses": 0.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.05256735533475876, "rewards/margins": 0.017693892121315002, "rewards/margins_max": 0.0627375990152359, "rewards/margins_min": -0.022355688735842705, "rewards/margins_std": 0.03840577229857445, "rewards/rejected": 0.034873463213443756, "step": 810 }, { "dpo_losses": 0.6822260022163391, "epoch": 0.21, "grad_norm": 15.946948792206301, "learning_rate": 4.803310053882831e-07, "logits/chosen": -2.3436923027038574, "logits/rejected": -2.1960482597351074, "logps/chosen": -246.3326416015625, "logps/rejected": -207.7164764404297, "loss": 0.6918, "positive_losses": 0.050566863268613815, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.051094900816679, "rewards/margins": 0.022358717396855354, "rewards/margins_max": 0.06413952261209488, "rewards/margins_min": -0.018008124083280563, "rewards/margins_std": 0.03576374053955078, "rewards/rejected": 0.028736192733049393, "step": 820 }, { "dpo_losses": 0.6871287226676941, "epoch": 0.22, "grad_norm": 2.534352038334735, "learning_rate": 4.794332124596775e-07, "logits/chosen": -2.462862730026245, "logits/rejected": -2.317910671234131, "logps/chosen": -297.66363525390625, "logps/rejected": -287.2362060546875, "loss": 0.6857, "positive_losses": 0.0, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.05376025289297104, "rewards/margins": 0.012725532054901123, "rewards/margins_max": 0.06634513288736343, "rewards/margins_min": -0.031268827617168427, "rewards/margins_std": 0.04368848353624344, "rewards/rejected": 0.041034720838069916, "step": 830 }, { "dpo_losses": 0.6884168386459351, "epoch": 0.22, "grad_norm": 10.012635449887494, "learning_rate": 4.785162619238574e-07, "logits/chosen": -2.6376395225524902, "logits/rejected": -2.5037364959716797, "logps/chosen": -251.16427612304688, "logps/rejected": -212.1234588623047, "loss": 0.6883, "positive_losses": 0.044121552258729935, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.04669896513223648, "rewards/margins": 0.009739209897816181, "rewards/margins_max": 0.04596342146396637, "rewards/margins_min": -0.020430846139788628, "rewards/margins_std": 0.029439836740493774, "rewards/rejected": 0.036959752440452576, "step": 840 }, { "dpo_losses": 0.6918618083000183, "epoch": 0.22, "grad_norm": 2.782353385908179, "learning_rate": 4.775802303459287e-07, "logits/chosen": -2.3455281257629395, "logits/rejected": -2.286647081375122, "logps/chosen": -217.6573028564453, "logps/rejected": -228.6517333984375, "loss": 0.6925, "positive_losses": 0.14215087890625, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.034618720412254333, "rewards/margins": 0.0027922955341637135, "rewards/margins_max": 0.030602851882576942, "rewards/margins_min": -0.02602304145693779, "rewards/margins_std": 0.025757908821105957, "rewards/rejected": 0.031826429069042206, "step": 850 }, { "dpo_losses": 0.6878683567047119, "epoch": 0.23, "grad_norm": 2.7103301585468986, "learning_rate": 4.766251958842589e-07, "logits/chosen": -2.2674427032470703, "logits/rejected": -2.4135782718658447, "logps/chosen": -137.48617553710938, "logps/rejected": -195.75880432128906, "loss": 0.6911, "positive_losses": 0.01347961463034153, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.03972017019987106, "rewards/margins": 0.010889324359595776, "rewards/margins_max": 0.047764308750629425, "rewards/margins_min": -0.021479088813066483, "rewards/margins_std": 0.03014972247183323, "rewards/rejected": 0.028830837458372116, "step": 860 }, { "dpo_losses": 0.6808698773384094, "epoch": 0.23, "grad_norm": 2.6327920341262234, "learning_rate": 4.756512382839506e-07, "logits/chosen": -2.4804461002349854, "logits/rejected": -2.210893392562866, "logps/chosen": -289.2867431640625, "logps/rejected": -226.7284698486328, "loss": 0.6872, "positive_losses": 0.015552520751953125, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.06629444658756256, "rewards/margins": 0.025384655222296715, "rewards/margins_max": 0.07790140062570572, "rewards/margins_min": -0.021799257025122643, "rewards/margins_std": 0.04600118473172188, "rewards/rejected": 0.0409097857773304, "step": 870 }, { "dpo_losses": 0.6934110522270203, "epoch": 0.23, "grad_norm": 2.6180027844827296, "learning_rate": 4.746584388701831e-07, "logits/chosen": -2.379020929336548, "logits/rejected": -2.372100353240967, "logps/chosen": -256.7723083496094, "logps/rejected": -306.55596923828125, "loss": 0.6952, "positive_losses": 0.0, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.06398121267557144, "rewards/margins": 3.6085395549889654e-05, "rewards/margins_max": 0.04678868502378464, "rewards/margins_min": -0.05195774883031845, "rewards/margins_std": 0.044627320021390915, "rewards/rejected": 0.06394512206315994, "step": 880 }, { "dpo_losses": 0.6875602006912231, "epoch": 0.23, "grad_norm": 3.1864858771824798, "learning_rate": 4.736468805414218e-07, "logits/chosen": -2.34584379196167, "logits/rejected": -2.185379981994629, "logps/chosen": -291.78399658203125, "logps/rejected": -248.1912384033203, "loss": 0.6857, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0687704011797905, "rewards/margins": 0.011760897934436798, "rewards/margins_max": 0.06032535433769226, "rewards/margins_min": -0.03584817424416542, "rewards/margins_std": 0.04264114052057266, "rewards/rejected": 0.057009506970644, "step": 890 }, { "dpo_losses": 0.6938089728355408, "epoch": 0.24, "grad_norm": 2.4839252568669736, "learning_rate": 4.7261664776249595e-07, "logits/chosen": -2.429117441177368, "logits/rejected": -2.375800371170044, "logps/chosen": -207.78659057617188, "logps/rejected": -260.76129150390625, "loss": 0.691, "positive_losses": 0.0021507262717932463, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.06705164909362793, "rewards/margins": -0.0005602512392215431, "rewards/margins_max": 0.06103014945983887, "rewards/margins_min": -0.0632304698228836, "rewards/margins_std": 0.0541420578956604, "rewards/rejected": 0.06761191040277481, "step": 900 }, { "epoch": 0.24, "eval_dpo_losses": 0.6836631894111633, "eval_logits/chosen": -2.3281569480895996, "eval_logits/rejected": -2.2185821533203125, "eval_logps/chosen": -268.1935119628906, "eval_logps/rejected": -257.20367431640625, "eval_loss": 0.6864961981773376, "eval_positive_losses": 0.01226631086319685, "eval_rewards/accuracies": 0.6746031641960144, "eval_rewards/chosen": 0.07580607384443283, "eval_rewards/margins": 0.019639918580651283, "eval_rewards/margins_max": 0.09394823759794235, "eval_rewards/margins_min": -0.04419327154755592, "eval_rewards/margins_std": 0.04506434500217438, "eval_rewards/rejected": 0.0561661571264267, "eval_runtime": 390.1465, "eval_samples_per_second": 5.126, "eval_steps_per_second": 0.161, "step": 900 }, { "dpo_losses": 0.6845285892486572, "epoch": 0.24, "grad_norm": 9.600829851522764, "learning_rate": 4.7156782655754624e-07, "logits/chosen": -2.390500545501709, "logits/rejected": -2.2705318927764893, "logps/chosen": -253.17721557617188, "logps/rejected": -235.8699188232422, "loss": 0.6859, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.07442884147167206, "rewards/margins": 0.018119780346751213, "rewards/margins_max": 0.08612563461065292, "rewards/margins_min": -0.03710592910647392, "rewards/margins_std": 0.05383007973432541, "rewards/rejected": 0.05630906671285629, "step": 910 }, { "dpo_losses": 0.6831806898117065, "epoch": 0.24, "grad_norm": 2.345363315994964, "learning_rate": 4.705005045028414e-07, "logits/chosen": -2.4191765785217285, "logits/rejected": -2.20536470413208, "logps/chosen": -276.8003234863281, "logps/rejected": -227.61459350585938, "loss": 0.6889, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.08833175897598267, "rewards/margins": 0.020980171859264374, "rewards/margins_max": 0.08479996770620346, "rewards/margins_min": -0.04129518195986748, "rewards/margins_std": 0.05555069446563721, "rewards/rejected": 0.06735159456729889, "step": 920 }, { "dpo_losses": 0.682255208492279, "epoch": 0.24, "grad_norm": 2.828505685380789, "learning_rate": 4.694147707194659e-07, "logits/chosen": -2.3006973266601562, "logits/rejected": -2.0301570892333984, "logps/chosen": -311.56146240234375, "logps/rejected": -280.30413818359375, "loss": 0.682, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.09470949321985245, "rewards/margins": 0.022493673488497734, "rewards/margins_max": 0.07522673159837723, "rewards/margins_min": -0.025179479271173477, "rewards/margins_std": 0.044799186289310455, "rewards/rejected": 0.07221582531929016, "step": 930 }, { "dpo_losses": 0.6797299385070801, "epoch": 0.25, "grad_norm": 2.753419224305331, "learning_rate": 4.683107158658781e-07, "logits/chosen": -2.3657236099243164, "logits/rejected": -2.2920451164245605, "logps/chosen": -284.9367370605469, "logps/rejected": -267.7127380371094, "loss": 0.6873, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.11133380234241486, "rewards/margins": 0.02770196832716465, "rewards/margins_max": 0.0943271666765213, "rewards/margins_min": -0.020491529256105423, "rewards/margins_std": 0.05047357827425003, "rewards/rejected": 0.08363182842731476, "step": 940 }, { "dpo_losses": 0.6831534504890442, "epoch": 0.25, "grad_norm": 8.460491136944729, "learning_rate": 4.6718843213034066e-07, "logits/chosen": -2.443721294403076, "logits/rejected": -2.462355136871338, "logps/chosen": -251.26730346679688, "logps/rejected": -296.9676818847656, "loss": 0.6865, "positive_losses": 0.020296860486268997, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.10047026723623276, "rewards/margins": 0.021419592201709747, "rewards/margins_max": 0.10218329727649689, "rewards/margins_min": -0.054422251880168915, "rewards/margins_std": 0.06877554953098297, "rewards/rejected": 0.07905067503452301, "step": 950 }, { "dpo_losses": 0.6926820874214172, "epoch": 0.25, "grad_norm": 2.7768169606334028, "learning_rate": 4.660480132232224e-07, "logits/chosen": -2.415651559829712, "logits/rejected": -2.216207981109619, "logps/chosen": -354.13201904296875, "logps/rejected": -290.62115478515625, "loss": 0.6847, "positive_losses": 0.01096954382956028, "rewards/accuracies": 0.625, "rewards/chosen": 0.09802886843681335, "rewards/margins": 0.002036420162767172, "rewards/margins_max": 0.0750807598233223, "rewards/margins_min": -0.07417962700128555, "rewards/margins_std": 0.06501518934965134, "rewards/rejected": 0.09599246084690094, "step": 960 }, { "dpo_losses": 0.6813235282897949, "epoch": 0.25, "grad_norm": 2.637386706481696, "learning_rate": 4.64889554369174e-07, "logits/chosen": -2.353144884109497, "logits/rejected": -2.239154577255249, "logps/chosen": -318.57257080078125, "logps/rejected": -275.2332458496094, "loss": 0.6883, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1040201187133789, "rewards/margins": 0.024571260437369347, "rewards/margins_max": 0.08051761984825134, "rewards/margins_min": -0.023823332041502, "rewards/margins_std": 0.046752989292144775, "rewards/rejected": 0.07944886386394501, "step": 970 }, { "dpo_losses": 0.6813982725143433, "epoch": 0.26, "grad_norm": 2.3756200081507606, "learning_rate": 4.637131522991764e-07, "logits/chosen": -2.491522789001465, "logits/rejected": -2.385634183883667, "logps/chosen": -286.08795166015625, "logps/rejected": -233.29443359375, "loss": 0.682, "positive_losses": 0.017777632921934128, "rewards/accuracies": 0.625, "rewards/chosen": 0.08882251381874084, "rewards/margins": 0.024640141054987907, "rewards/margins_max": 0.08776797354221344, "rewards/margins_min": -0.02915186807513237, "rewards/margins_std": 0.05259226635098457, "rewards/rejected": 0.06418237090110779, "step": 980 }, { "dpo_losses": 0.6840972304344177, "epoch": 0.26, "grad_norm": 2.36533537569499, "learning_rate": 4.6251890524246375e-07, "logits/chosen": -2.390099048614502, "logits/rejected": -2.0463039875030518, "logps/chosen": -279.87225341796875, "logps/rejected": -240.3599090576172, "loss": 0.6806, "positive_losses": 0.0, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.10328897088766098, "rewards/margins": 0.01964501291513443, "rewards/margins_max": 0.09536108374595642, "rewards/margins_min": -0.06421180069446564, "rewards/margins_std": 0.0703578069806099, "rewards/rejected": 0.08364395797252655, "step": 990 }, { "dpo_losses": 0.690109133720398, "epoch": 0.26, "grad_norm": 17.902738802781684, "learning_rate": 4.613069129183218e-07, "logits/chosen": -2.3880062103271484, "logits/rejected": -2.2725625038146973, "logps/chosen": -224.428955078125, "logps/rejected": -202.2260284423828, "loss": 0.6913, "positive_losses": 0.14122924208641052, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.075763039290905, "rewards/margins": 0.006849632598459721, "rewards/margins_max": 0.06722959131002426, "rewards/margins_min": -0.048183660954236984, "rewards/margins_std": 0.052964307367801666, "rewards/rejected": 0.06891341507434845, "step": 1000 }, { "epoch": 0.26, "eval_dpo_losses": 0.6793550848960876, "eval_logits/chosen": -2.3236756324768066, "eval_logits/rejected": -2.2138757705688477, "eval_logps/chosen": -265.4327697753906, "eval_logps/rejected": -255.3634796142578, "eval_loss": 0.6848495006561279, "eval_positive_losses": 0.028416674584150314, "eval_rewards/accuracies": 0.6845238208770752, "eval_rewards/chosen": 0.10341382771730423, "eval_rewards/margins": 0.028845757246017456, "eval_rewards/margins_max": 0.1287689357995987, "eval_rewards/margins_min": -0.05748141184449196, "eval_rewards/margins_std": 0.06155591458082199, "eval_rewards/rejected": 0.07456808537244797, "eval_runtime": 389.7712, "eval_samples_per_second": 5.131, "eval_steps_per_second": 0.162, "step": 1000 }, { "dpo_losses": 0.67509925365448, "epoch": 0.26, "grad_norm": 2.418275878332405, "learning_rate": 4.6007727652776065e-07, "logits/chosen": -2.482348680496216, "logits/rejected": -2.31712007522583, "logps/chosen": -261.5564270019531, "logps/rejected": -265.0421142578125, "loss": 0.6798, "positive_losses": 0.00772171001881361, "rewards/accuracies": 0.75, "rewards/chosen": 0.10058889538049698, "rewards/margins": 0.037412021309137344, "rewards/margins_max": 0.11190427839756012, "rewards/margins_min": -0.011948691681027412, "rewards/margins_std": 0.05710822343826294, "rewards/rejected": 0.06317687034606934, "step": 1010 }, { "dpo_losses": 0.6917736530303955, "epoch": 0.27, "grad_norm": 2.1977395322270725, "learning_rate": 4.588300987450652e-07, "logits/chosen": -2.5221619606018066, "logits/rejected": -2.4375834465026855, "logps/chosen": -250.0184326171875, "logps/rejected": -259.99285888671875, "loss": 0.6854, "positive_losses": 0.04188041761517525, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.09464290738105774, "rewards/margins": 0.00356256659142673, "rewards/margins_max": 0.06899553537368774, "rewards/margins_min": -0.05539632588624954, "rewards/margins_std": 0.05616170912981033, "rewards/rejected": 0.09108033776283264, "step": 1020 }, { "dpo_losses": 0.6744934916496277, "epoch": 0.27, "grad_norm": 3.063547383475696, "learning_rate": 4.5756548370922134e-07, "logits/chosen": -2.3202242851257324, "logits/rejected": -2.3008053302764893, "logps/chosen": -274.3762512207031, "logps/rejected": -287.91693115234375, "loss": 0.6801, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.1094784364104271, "rewards/margins": 0.038908950984478, "rewards/margins_max": 0.12030893564224243, "rewards/margins_min": -0.028052043169736862, "rewards/margins_std": 0.06532229483127594, "rewards/rejected": 0.0705694779753685, "step": 1030 }, { "dpo_losses": 0.6799553632736206, "epoch": 0.27, "grad_norm": 2.419745314886675, "learning_rate": 4.5628353701522047e-07, "logits/chosen": -2.3585898876190186, "logits/rejected": -2.149651527404785, "logps/chosen": -264.49993896484375, "logps/rejected": -251.07373046875, "loss": 0.684, "positive_losses": 0.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.10400895774364471, "rewards/margins": 0.027990808710455894, "rewards/margins_max": 0.11783566325902939, "rewards/margins_min": -0.05716601759195328, "rewards/margins_std": 0.07723238319158554, "rewards/rejected": 0.07601813971996307, "step": 1040 }, { "dpo_losses": 0.6819077730178833, "epoch": 0.27, "grad_norm": 2.5511667798537103, "learning_rate": 4.549843657052429e-07, "logits/chosen": -2.42150616645813, "logits/rejected": -2.220043659210205, "logps/chosen": -294.95892333984375, "logps/rejected": -281.5714111328125, "loss": 0.6862, "positive_losses": 0.07329864799976349, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.11042330414056778, "rewards/margins": 0.02403411641716957, "rewards/margins_max": 0.11498866230249405, "rewards/margins_min": -0.04870307445526123, "rewards/margins_std": 0.0733790472149849, "rewards/rejected": 0.08638918399810791, "step": 1050 }, { "dpo_losses": 0.6759483814239502, "epoch": 0.28, "grad_norm": 2.8498076591504966, "learning_rate": 4.5366807825971907e-07, "logits/chosen": -2.479825496673584, "logits/rejected": -2.3990323543548584, "logps/chosen": -332.65167236328125, "logps/rejected": -321.4006042480469, "loss": 0.6796, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1148696094751358, "rewards/margins": 0.035683564841747284, "rewards/margins_max": 0.11110599339008331, "rewards/margins_min": -0.023787712678313255, "rewards/margins_std": 0.0587044358253479, "rewards/rejected": 0.07918603718280792, "step": 1060 }, { "dpo_losses": 0.6762247681617737, "epoch": 0.28, "grad_norm": 3.0030568761174488, "learning_rate": 4.5233478458827176e-07, "logits/chosen": -2.3397724628448486, "logits/rejected": -2.3156626224517822, "logps/chosen": -235.5967254638672, "logps/rejected": -243.0796661376953, "loss": 0.6847, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.11326183378696442, "rewards/margins": 0.03563816845417023, "rewards/margins_max": 0.1274457573890686, "rewards/margins_min": -0.03707501292228699, "rewards/margins_std": 0.07458946853876114, "rewards/rejected": 0.07762367278337479, "step": 1070 }, { "dpo_losses": 0.689909815788269, "epoch": 0.28, "grad_norm": 14.748818560338359, "learning_rate": 4.509845960205389e-07, "logits/chosen": -2.5546841621398926, "logits/rejected": -2.385857582092285, "logps/chosen": -295.5920104980469, "logps/rejected": -301.12994384765625, "loss": 0.6927, "positive_losses": 0.10746002197265625, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.1043245792388916, "rewards/margins": 0.007992265745997429, "rewards/margins_max": 0.08492609113454819, "rewards/margins_min": -0.06998135894536972, "rewards/margins_std": 0.07033375650644302, "rewards/rejected": 0.09633230417966843, "step": 1080 }, { "dpo_losses": 0.6789954900741577, "epoch": 0.29, "grad_norm": 2.909147069670702, "learning_rate": 4.4961762529687736e-07, "logits/chosen": -2.4266538619995117, "logits/rejected": -2.261120080947876, "logps/chosen": -232.55166625976562, "logps/rejected": -201.7569122314453, "loss": 0.6796, "positive_losses": 0.007260131649672985, "rewards/accuracies": 0.75, "rewards/chosen": 0.10986524820327759, "rewards/margins": 0.030291426926851273, "rewards/margins_max": 0.12055738270282745, "rewards/margins_min": -0.05291157215833664, "rewards/margins_std": 0.07624989002943039, "rewards/rejected": 0.07957382500171661, "step": 1090 }, { "dpo_losses": 0.6826087832450867, "epoch": 0.29, "grad_norm": 7.154394450604047, "learning_rate": 4.482339865589492e-07, "logits/chosen": -2.262277603149414, "logits/rejected": -2.283386468887329, "logps/chosen": -266.8435363769531, "logps/rejected": -256.75628662109375, "loss": 0.6909, "positive_losses": 0.24166718125343323, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.08498607575893402, "rewards/margins": 0.022256899625062943, "rewards/margins_max": 0.08648417890071869, "rewards/margins_min": -0.04861097037792206, "rewards/margins_std": 0.061483729630708694, "rewards/rejected": 0.06272917985916138, "step": 1100 }, { "epoch": 0.29, "eval_dpo_losses": 0.6771906018257141, "eval_logits/chosen": -2.321810007095337, "eval_logits/rejected": -2.211862564086914, "eval_logps/chosen": -265.0727233886719, "eval_logps/rejected": -255.46434020996094, "eval_loss": 0.6845595836639404, "eval_positive_losses": 0.04408329352736473, "eval_rewards/accuracies": 0.7003968358039856, "eval_rewards/chosen": 0.10701427608728409, "eval_rewards/margins": 0.03345479443669319, "eval_rewards/margins_max": 0.14162880182266235, "eval_rewards/margins_min": -0.06219448149204254, "eval_rewards/margins_std": 0.06762981414794922, "eval_rewards/rejected": 0.07355947047472, "eval_runtime": 389.8475, "eval_samples_per_second": 5.13, "eval_steps_per_second": 0.162, "step": 1100 }, { "dpo_losses": 0.6702350974082947, "epoch": 0.29, "grad_norm": 3.1168908764435015, "learning_rate": 4.4683379534019076e-07, "logits/chosen": -2.4398868083953857, "logits/rejected": -2.318588972091675, "logps/chosen": -249.0113983154297, "logps/rejected": -249.75717163085938, "loss": 0.6799, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.11509887129068375, "rewards/margins": 0.04763125628232956, "rewards/margins_max": 0.12711192667484283, "rewards/margins_min": -0.011348049156367779, "rewards/margins_std": 0.06408245861530304, "rewards/rejected": 0.06746760755777359, "step": 1110 }, { "dpo_losses": 0.6797469258308411, "epoch": 0.29, "grad_norm": 3.152825943797844, "learning_rate": 4.4541716855616593e-07, "logits/chosen": -2.433220863342285, "logits/rejected": -2.2725555896759033, "logps/chosen": -249.1158905029297, "logps/rejected": -192.24551391601562, "loss": 0.6802, "positive_losses": 0.0, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.10803476721048355, "rewards/margins": 0.02842649817466736, "rewards/margins_max": 0.10135779529809952, "rewards/margins_min": -0.04152476042509079, "rewards/margins_std": 0.06397048383951187, "rewards/rejected": 0.0796082466840744, "step": 1120 }, { "dpo_losses": 0.6687802076339722, "epoch": 0.3, "grad_norm": 2.825997242412585, "learning_rate": 4.4398422449480357e-07, "logits/chosen": -2.494903802871704, "logits/rejected": -2.4573416709899902, "logps/chosen": -269.8377685546875, "logps/rejected": -253.7794647216797, "loss": 0.681, "positive_losses": 0.05566215515136719, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.11253684759140015, "rewards/margins": 0.05105144903063774, "rewards/margins_max": 0.13689115643501282, "rewards/margins_min": -0.02360970340669155, "rewards/margins_std": 0.07145430147647858, "rewards/rejected": 0.06148539111018181, "step": 1130 }, { "dpo_losses": 0.6732393503189087, "epoch": 0.3, "grad_norm": 2.6479588979199136, "learning_rate": 4.4253508280652036e-07, "logits/chosen": -2.3493645191192627, "logits/rejected": -2.228364944458008, "logps/chosen": -221.1791534423828, "logps/rejected": -208.58544921875, "loss": 0.6772, "positive_losses": 0.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.10464813560247421, "rewards/margins": 0.041599713265895844, "rewards/margins_max": 0.12741993367671967, "rewards/margins_min": -0.012737281620502472, "rewards/margins_std": 0.06424719840288162, "rewards/rejected": 0.06304843723773956, "step": 1140 }, { "dpo_losses": 0.6573927998542786, "epoch": 0.3, "grad_norm": 3.2040435218379204, "learning_rate": 4.410698644942302e-07, "logits/chosen": -2.532921075820923, "logits/rejected": -2.408984422683716, "logps/chosen": -274.9655456542969, "logps/rejected": -241.4488525390625, "loss": 0.6721, "positive_losses": 0.0, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.12681028246879578, "rewards/margins": 0.07394054532051086, "rewards/margins_max": 0.15211491286754608, "rewards/margins_min": 0.006267952267080545, "rewards/margins_std": 0.06822885572910309, "rewards/rejected": 0.05286973714828491, "step": 1150 }, { "dpo_losses": 0.6720808148384094, "epoch": 0.3, "grad_norm": 2.6056441629775233, "learning_rate": 4.3958869190324057e-07, "logits/chosen": -2.3161792755126953, "logits/rejected": -2.2409677505493164, "logps/chosen": -171.29046630859375, "logps/rejected": -185.1177215576172, "loss": 0.6781, "positive_losses": 0.04304046556353569, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.10394398868083954, "rewards/margins": 0.04387819021940231, "rewards/margins_max": 0.11931698024272919, "rewards/margins_min": -0.028122887015342712, "rewards/margins_std": 0.06602666527032852, "rewards/rejected": 0.06006580591201782, "step": 1160 }, { "dpo_losses": 0.6745734810829163, "epoch": 0.31, "grad_norm": 14.910242213860128, "learning_rate": 4.380916887110365e-07, "logits/chosen": -2.3279192447662354, "logits/rejected": -2.334415912628174, "logps/chosen": -228.3085479736328, "logps/rejected": -245.5347442626953, "loss": 0.6852, "positive_losses": 0.1813919097185135, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.10367108881473541, "rewards/margins": 0.039520520716905594, "rewards/margins_max": 0.1438857913017273, "rewards/margins_min": -0.05429115146398544, "rewards/margins_std": 0.08662428706884384, "rewards/rejected": 0.06415055692195892, "step": 1170 }, { "dpo_losses": 0.6766068339347839, "epoch": 0.31, "grad_norm": 2.6680611500859843, "learning_rate": 4.3657897991695394e-07, "logits/chosen": -2.3505563735961914, "logits/rejected": -2.169110059738159, "logps/chosen": -236.2076416015625, "logps/rejected": -227.8813018798828, "loss": 0.6825, "positive_losses": 0.06520233303308487, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1032366156578064, "rewards/margins": 0.034924477338790894, "rewards/margins_max": 0.10433157533407211, "rewards/margins_min": -0.03459693118929863, "rewards/margins_std": 0.061243802309036255, "rewards/rejected": 0.0683121383190155, "step": 1180 }, { "dpo_losses": 0.6801928281784058, "epoch": 0.31, "grad_norm": 12.68960087863225, "learning_rate": 4.350506918317416e-07, "logits/chosen": -2.476472854614258, "logits/rejected": -2.415755033493042, "logps/chosen": -231.51318359375, "logps/rejected": -206.41708374023438, "loss": 0.6859, "positive_losses": 0.09468536078929901, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.09554317593574524, "rewards/margins": 0.026808012276887894, "rewards/margins_max": 0.09269646555185318, "rewards/margins_min": -0.019989343360066414, "rewards/margins_std": 0.04987224191427231, "rewards/rejected": 0.06873515248298645, "step": 1190 }, { "dpo_losses": 0.6868709921836853, "epoch": 0.31, "grad_norm": 2.734858704426741, "learning_rate": 4.335069520670149e-07, "logits/chosen": -2.217108964920044, "logits/rejected": -2.203026294708252, "logps/chosen": -199.20884704589844, "logps/rejected": -209.1021728515625, "loss": 0.68, "positive_losses": 0.0884883850812912, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.09106338024139404, "rewards/margins": 0.014211786910891533, "rewards/margins_max": 0.1051938533782959, "rewards/margins_min": -0.05359530448913574, "rewards/margins_std": 0.07011215388774872, "rewards/rejected": 0.07685159146785736, "step": 1200 }, { "epoch": 0.31, "eval_dpo_losses": 0.6751182079315186, "eval_logits/chosen": -2.32558536529541, "eval_logits/rejected": -2.21626353263855, "eval_logps/chosen": -264.9368896484375, "eval_logps/rejected": -255.78553771972656, "eval_loss": 0.6870678663253784, "eval_positive_losses": 0.09414191544055939, "eval_rewards/accuracies": 0.7103174328804016, "eval_rewards/chosen": 0.10837253928184509, "eval_rewards/margins": 0.03802526742219925, "eval_rewards/margins_max": 0.16127105057239532, "eval_rewards/margins_min": -0.06981559842824936, "eval_rewards/margins_std": 0.07652898877859116, "eval_rewards/rejected": 0.07034727185964584, "eval_runtime": 389.4084, "eval_samples_per_second": 5.136, "eval_steps_per_second": 0.162, "step": 1200 }, { "dpo_losses": 0.6863173842430115, "epoch": 0.32, "grad_norm": 8.72998853894647, "learning_rate": 4.319478895245999e-07, "logits/chosen": -2.508650302886963, "logits/rejected": -2.350947380065918, "logps/chosen": -279.8328552246094, "logps/rejected": -246.3408966064453, "loss": 0.6815, "positive_losses": 0.11141548305749893, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.09990166127681732, "rewards/margins": 0.01638060063123703, "rewards/margins_max": 0.13084599375724792, "rewards/margins_min": -0.0891188532114029, "rewards/margins_std": 0.09604878723621368, "rewards/rejected": 0.0835210531949997, "step": 1210 }, { "dpo_losses": 0.6801341772079468, "epoch": 0.32, "grad_norm": 2.7972889173028292, "learning_rate": 4.3037363438577036e-07, "logits/chosen": -2.441908121109009, "logits/rejected": -2.222297430038452, "logps/chosen": -268.83551025390625, "logps/rejected": -254.4813995361328, "loss": 0.6825, "positive_losses": 0.06684646755456924, "rewards/accuracies": 0.75, "rewards/chosen": 0.0980902686715126, "rewards/margins": 0.027257755398750305, "rewards/margins_max": 0.08557780086994171, "rewards/margins_min": -0.026540720835328102, "rewards/margins_std": 0.050947390496730804, "rewards/rejected": 0.0708325058221817, "step": 1220 }, { "dpo_losses": 0.6762003302574158, "epoch": 0.32, "grad_norm": 2.3569515489171327, "learning_rate": 4.2878431810037716e-07, "logits/chosen": -2.345717668533325, "logits/rejected": -2.2899441719055176, "logps/chosen": -254.9907684326172, "logps/rejected": -262.7395935058594, "loss": 0.6844, "positive_losses": 0.024697113782167435, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.11098732799291611, "rewards/margins": 0.03545626252889633, "rewards/margins_max": 0.09782389551401138, "rewards/margins_min": -0.035419661551713943, "rewards/margins_std": 0.05959220603108406, "rewards/rejected": 0.07553107291460037, "step": 1230 }, { "dpo_losses": 0.6743085980415344, "epoch": 0.32, "grad_norm": 2.396415031692105, "learning_rate": 4.271800733758729e-07, "logits/chosen": -2.1780688762664795, "logits/rejected": -2.187371015548706, "logps/chosen": -221.46206665039062, "logps/rejected": -205.1850128173828, "loss": 0.6799, "positive_losses": 0.19806060194969177, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1109117865562439, "rewards/margins": 0.03921615332365036, "rewards/margins_max": 0.11014548689126968, "rewards/margins_min": -0.01798471435904503, "rewards/margins_std": 0.05748777464032173, "rewards/rejected": 0.07169563323259354, "step": 1240 }, { "dpo_losses": 0.6800273656845093, "epoch": 0.33, "grad_norm": 32.530240588326045, "learning_rate": 4.255610341662304e-07, "logits/chosen": -2.3170695304870605, "logits/rejected": -2.254488468170166, "logps/chosen": -248.8870391845703, "logps/rejected": -242.7115936279297, "loss": 0.6924, "positive_losses": 0.24381713569164276, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.10487429797649384, "rewards/margins": 0.027905315160751343, "rewards/margins_max": 0.10394702851772308, "rewards/margins_min": -0.04022867977619171, "rewards/margins_std": 0.06330694258213043, "rewards/rejected": 0.07696898281574249, "step": 1250 }, { "dpo_losses": 0.6769441962242126, "epoch": 0.33, "grad_norm": 2.7025242838090535, "learning_rate": 4.2392733566075757e-07, "logits/chosen": -2.3135523796081543, "logits/rejected": -2.0908892154693604, "logps/chosen": -208.30215454101562, "logps/rejected": -195.39869689941406, "loss": 0.6914, "positive_losses": 0.095672607421875, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.09605704247951508, "rewards/margins": 0.034063082188367844, "rewards/margins_max": 0.11588318645954132, "rewards/margins_min": -0.03785210847854614, "rewards/margins_std": 0.06705339252948761, "rewards/rejected": 0.06199396774172783, "step": 1260 }, { "dpo_losses": 0.6793417930603027, "epoch": 0.33, "grad_norm": 16.8445363046987, "learning_rate": 4.2227911427280973e-07, "logits/chosen": -2.3784537315368652, "logits/rejected": -2.3995862007141113, "logps/chosen": -263.38592529296875, "logps/rejected": -280.08380126953125, "loss": 0.6882, "positive_losses": 0.044260405004024506, "rewards/accuracies": 0.625, "rewards/chosen": 0.10459186881780624, "rewards/margins": 0.029014548286795616, "rewards/margins_max": 0.1086081862449646, "rewards/margins_min": -0.03850003704428673, "rewards/margins_std": 0.06705694645643234, "rewards/rejected": 0.07557731866836548, "step": 1270 }, { "dpo_losses": 0.6836634278297424, "epoch": 0.33, "grad_norm": 14.466799907393026, "learning_rate": 4.206165076283982e-07, "logits/chosen": -2.2906830310821533, "logits/rejected": -2.373886823654175, "logps/chosen": -202.0186767578125, "logps/rejected": -233.62167358398438, "loss": 0.6812, "positive_losses": 0.0921836867928505, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.09967093169689178, "rewards/margins": 0.02010520175099373, "rewards/margins_max": 0.0820559710264206, "rewards/margins_min": -0.0423809289932251, "rewards/margins_std": 0.05496206879615784, "rewards/rejected": 0.07956572622060776, "step": 1280 }, { "dpo_losses": 0.6749547123908997, "epoch": 0.34, "grad_norm": 2.6130072616844138, "learning_rate": 4.1893965455469946e-07, "logits/chosen": -2.341592311859131, "logits/rejected": -2.234506368637085, "logps/chosen": -214.84976196289062, "logps/rejected": -232.9533233642578, "loss": 0.6793, "positive_losses": 0.026287078857421875, "rewards/accuracies": 0.625, "rewards/chosen": 0.09915883839130402, "rewards/margins": 0.038605134934186935, "rewards/margins_max": 0.14530403912067413, "rewards/margins_min": -0.04796471446752548, "rewards/margins_std": 0.08534155040979385, "rewards/rejected": 0.06055371090769768, "step": 1290 }, { "dpo_losses": 0.6844350099563599, "epoch": 0.34, "grad_norm": 16.29174513281669, "learning_rate": 4.172486950684626e-07, "logits/chosen": -2.4190704822540283, "logits/rejected": -2.251585006713867, "logps/chosen": -188.2867889404297, "logps/rejected": -220.9339599609375, "loss": 0.695, "positive_losses": 0.15784378349781036, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.09584221243858337, "rewards/margins": 0.018551377579569817, "rewards/margins_max": 0.08650018274784088, "rewards/margins_min": -0.04825694113969803, "rewards/margins_std": 0.05878395587205887, "rewards/rejected": 0.07729082554578781, "step": 1300 }, { "epoch": 0.34, "eval_dpo_losses": 0.674910843372345, "eval_logits/chosen": -2.3277697563171387, "eval_logits/rejected": -2.2191708087921143, "eval_logps/chosen": -264.4297180175781, "eval_logps/rejected": -255.3223114013672, "eval_loss": 0.68409264087677, "eval_positive_losses": 0.07105972617864609, "eval_rewards/accuracies": 0.7023809552192688, "eval_rewards/chosen": 0.11344455927610397, "eval_rewards/margins": 0.03846484795212746, "eval_rewards/margins_max": 0.16221289336681366, "eval_rewards/margins_min": -0.07032685726881027, "eval_rewards/margins_std": 0.07698249071836472, "eval_rewards/rejected": 0.07497970759868622, "eval_runtime": 389.5224, "eval_samples_per_second": 5.134, "eval_steps_per_second": 0.162, "step": 1300 }, { "dpo_losses": 0.6815450191497803, "epoch": 0.34, "grad_norm": 16.563810015998545, "learning_rate": 4.155437703643181e-07, "logits/chosen": -2.3886966705322266, "logits/rejected": -2.282742977142334, "logps/chosen": -269.80523681640625, "logps/rejected": -256.05316162109375, "loss": 0.6806, "positive_losses": 0.014493560418486595, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.11359025537967682, "rewards/margins": 0.024729851633310318, "rewards/margins_max": 0.10749141871929169, "rewards/margins_min": -0.03529422730207443, "rewards/margins_std": 0.06559218466281891, "rewards/rejected": 0.0888604000210762, "step": 1310 }, { "dpo_losses": 0.6790980100631714, "epoch": 0.35, "grad_norm": 3.055096142843397, "learning_rate": 4.138250228029881e-07, "logits/chosen": -2.369089365005493, "logits/rejected": -2.319018602371216, "logps/chosen": -246.2501983642578, "logps/rejected": -241.6671905517578, "loss": 0.6806, "positive_losses": 0.020523834973573685, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.11858324706554413, "rewards/margins": 0.030133020132780075, "rewards/margins_max": 0.12184000015258789, "rewards/margins_min": -0.04813048988580704, "rewards/margins_std": 0.07563811540603638, "rewards/rejected": 0.08845021575689316, "step": 1320 }, { "dpo_losses": 0.674498438835144, "epoch": 0.35, "grad_norm": 2.626328177725137, "learning_rate": 4.1209259589939935e-07, "logits/chosen": -2.5157132148742676, "logits/rejected": -2.3793091773986816, "logps/chosen": -247.8560028076172, "logps/rejected": -250.97457885742188, "loss": 0.6792, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1187223419547081, "rewards/margins": 0.03896196931600571, "rewards/margins_max": 0.12327287346124649, "rewards/margins_min": -0.03530266880989075, "rewards/margins_std": 0.06953944265842438, "rewards/rejected": 0.07976037263870239, "step": 1330 }, { "dpo_losses": 0.6801000833511353, "epoch": 0.35, "grad_norm": 3.0942507251924503, "learning_rate": 4.103466343106998e-07, "logits/chosen": -2.2623391151428223, "logits/rejected": -2.145129680633545, "logps/chosen": -305.9637145996094, "logps/rejected": -254.6493377685547, "loss": 0.6828, "positive_losses": 0.0639011412858963, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.10168752819299698, "rewards/margins": 0.027894895523786545, "rewards/margins_max": 0.11847718060016632, "rewards/margins_min": -0.05172067880630493, "rewards/margins_std": 0.07555267959833145, "rewards/rejected": 0.07379263639450073, "step": 1340 }, { "dpo_losses": 0.6693710684776306, "epoch": 0.35, "grad_norm": 3.0028618831267853, "learning_rate": 4.085872838241796e-07, "logits/chosen": -2.321078300476074, "logits/rejected": -2.2318615913391113, "logps/chosen": -275.1156921386719, "logps/rejected": -238.68948364257812, "loss": 0.6802, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.12108185142278671, "rewards/margins": 0.04958399385213852, "rewards/margins_max": 0.13786712288856506, "rewards/margins_min": -0.03485984355211258, "rewards/margins_std": 0.07707887887954712, "rewards/rejected": 0.07149787247180939, "step": 1350 }, { "dpo_losses": 0.6868189573287964, "epoch": 0.36, "grad_norm": 2.9635005114512922, "learning_rate": 4.06814691345098e-07, "logits/chosen": -2.424273729324341, "logits/rejected": -2.4362246990203857, "logps/chosen": -198.31723022460938, "logps/rejected": -187.5654754638672, "loss": 0.6846, "positive_losses": 0.0, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.10656698793172836, "rewards/margins": 0.013685437850654125, "rewards/margins_max": 0.0941050723195076, "rewards/margins_min": -0.04866738244891167, "rewards/margins_std": 0.06302470713853836, "rewards/rejected": 0.09288156032562256, "step": 1360 }, { "dpo_losses": 0.672343909740448, "epoch": 0.36, "grad_norm": 2.6751130766454256, "learning_rate": 4.0502900488441707e-07, "logits/chosen": -2.538100242614746, "logits/rejected": -2.3293769359588623, "logps/chosen": -255.2715301513672, "logps/rejected": -233.25204467773438, "loss": 0.6775, "positive_losses": 0.015187072567641735, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.12219059467315674, "rewards/margins": 0.0435735359787941, "rewards/margins_max": 0.12608644366264343, "rewards/margins_min": -0.023978596553206444, "rewards/margins_std": 0.06800293922424316, "rewards/rejected": 0.07861705869436264, "step": 1370 }, { "dpo_losses": 0.6585070490837097, "epoch": 0.36, "grad_norm": 2.280806824996194, "learning_rate": 4.032303735464422e-07, "logits/chosen": -2.337446689605713, "logits/rejected": -2.246135711669922, "logps/chosen": -256.72125244140625, "logps/rejected": -263.3708190917969, "loss": 0.6717, "positive_losses": 0.0, "rewards/accuracies": 0.875, "rewards/chosen": 0.12680482864379883, "rewards/margins": 0.07212004065513611, "rewards/margins_max": 0.1662723273038864, "rewards/margins_min": -0.008478707633912563, "rewards/margins_std": 0.07421205937862396, "rewards/rejected": 0.05468479543924332, "step": 1380 }, { "dpo_losses": 0.6683043241500854, "epoch": 0.36, "grad_norm": 2.755891298234886, "learning_rate": 4.014189475163726e-07, "logits/chosen": -2.5238687992095947, "logits/rejected": -2.290893793106079, "logps/chosen": -245.83871459960938, "logps/rejected": -204.3573455810547, "loss": 0.6867, "positive_losses": 0.0009101867908611894, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.10948805510997772, "rewards/margins": 0.0518328920006752, "rewards/margins_max": 0.13620294630527496, "rewards/margins_min": -0.023577053099870682, "rewards/margins_std": 0.07064683735370636, "rewards/rejected": 0.057655174285173416, "step": 1390 }, { "dpo_losses": 0.671380341053009, "epoch": 0.37, "grad_norm": 14.200473852713563, "learning_rate": 3.995948780477605e-07, "logits/chosen": -2.341438055038452, "logits/rejected": -2.1200079917907715, "logps/chosen": -253.569091796875, "logps/rejected": -211.28964233398438, "loss": 0.6805, "positive_losses": 0.22973403334617615, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.10750003904104233, "rewards/margins": 0.04592563211917877, "rewards/margins_max": 0.13387855887413025, "rewards/margins_min": -0.03239128738641739, "rewards/margins_std": 0.07682503759860992, "rewards/rejected": 0.061574406921863556, "step": 1400 }, { "epoch": 0.37, "eval_dpo_losses": 0.6733129620552063, "eval_logits/chosen": -2.31960129737854, "eval_logits/rejected": -2.2104132175445557, "eval_logps/chosen": -264.52569580078125, "eval_logps/rejected": -255.7659912109375, "eval_loss": 0.6853117942810059, "eval_positive_losses": 0.098435178399086, "eval_rewards/accuracies": 0.7103174328804016, "eval_rewards/chosen": 0.11248450726270676, "eval_rewards/margins": 0.0419418029487133, "eval_rewards/margins_max": 0.17346851527690887, "eval_rewards/margins_min": -0.07407856732606888, "eval_rewards/margins_std": 0.08184295147657394, "eval_rewards/rejected": 0.07054270058870316, "eval_runtime": 389.4354, "eval_samples_per_second": 5.136, "eval_steps_per_second": 0.162, "step": 1400 }, { "dpo_losses": 0.6753364205360413, "epoch": 0.37, "grad_norm": 3.0322654045922035, "learning_rate": 3.977583174498816e-07, "logits/chosen": -2.343217372894287, "logits/rejected": -2.297837734222412, "logps/chosen": -213.8505401611328, "logps/rejected": -233.4495849609375, "loss": 0.6745, "positive_losses": 0.0008827209239825606, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.11028649657964706, "rewards/margins": 0.03719424083828926, "rewards/margins_max": 0.10987532138824463, "rewards/margins_min": -0.03628981485962868, "rewards/margins_std": 0.06563820689916611, "rewards/rejected": 0.0730922520160675, "step": 1410 }, { "dpo_losses": 0.67552250623703, "epoch": 0.37, "grad_norm": 2.255044791713521, "learning_rate": 3.9590941907501717e-07, "logits/chosen": -2.334129810333252, "logits/rejected": -2.1003570556640625, "logps/chosen": -224.73300170898438, "logps/rejected": -180.84889221191406, "loss": 0.6849, "positive_losses": 0.15928420424461365, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.10701780021190643, "rewards/margins": 0.03697657585144043, "rewards/margins_max": 0.11473357677459717, "rewards/margins_min": -0.0419112853705883, "rewards/margins_std": 0.07193051278591156, "rewards/rejected": 0.0700412169098854, "step": 1420 }, { "dpo_losses": 0.6714733839035034, "epoch": 0.37, "grad_norm": 3.53053533023217, "learning_rate": 3.9404833730564974e-07, "logits/chosen": -2.4286975860595703, "logits/rejected": -2.394972324371338, "logps/chosen": -195.36651611328125, "logps/rejected": -226.495849609375, "loss": 0.6798, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.10350419580936432, "rewards/margins": 0.04476894810795784, "rewards/margins_max": 0.10779400169849396, "rewards/margins_min": -0.004785294644534588, "rewards/margins_std": 0.050304114818573, "rewards/rejected": 0.05873524025082588, "step": 1430 }, { "dpo_losses": 0.6698949933052063, "epoch": 0.38, "grad_norm": 11.444267230426169, "learning_rate": 3.9217522754157117e-07, "logits/chosen": -2.4153077602386475, "logits/rejected": -2.2203705310821533, "logps/chosen": -291.612548828125, "logps/rejected": -315.343994140625, "loss": 0.6727, "positive_losses": 0.02120056189596653, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.11787764728069305, "rewards/margins": 0.04875284433364868, "rewards/margins_max": 0.14001592993736267, "rewards/margins_min": -0.025638461112976074, "rewards/margins_std": 0.07346327602863312, "rewards/rejected": 0.06912480294704437, "step": 1440 }, { "dpo_losses": 0.660051167011261, "epoch": 0.38, "grad_norm": 3.224230794327213, "learning_rate": 3.9029024618690785e-07, "logits/chosen": -2.49568772315979, "logits/rejected": -2.363964796066284, "logps/chosen": -299.9936828613281, "logps/rejected": -258.32733154296875, "loss": 0.6927, "positive_losses": 0.1351947784423828, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.11214890331029892, "rewards/margins": 0.06970526278018951, "rewards/margins_max": 0.18195927143096924, "rewards/margins_min": -0.027921024709939957, "rewards/margins_std": 0.08997403085231781, "rewards/rejected": 0.0424436517059803, "step": 1450 }, { "dpo_losses": 0.665652871131897, "epoch": 0.38, "grad_norm": 2.5443194152712567, "learning_rate": 3.883935506370605e-07, "logits/chosen": -2.3224480152130127, "logits/rejected": -2.1485037803649902, "logps/chosen": -274.13409423828125, "logps/rejected": -233.2168426513672, "loss": 0.6806, "positive_losses": 0.08241119235754013, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.10163430869579315, "rewards/margins": 0.05819261819124222, "rewards/margins_max": 0.17397622764110565, "rewards/margins_min": -0.029304545372724533, "rewards/margins_std": 0.09268315881490707, "rewards/rejected": 0.043441690504550934, "step": 1460 }, { "dpo_losses": 0.6763372421264648, "epoch": 0.38, "grad_norm": 2.8681931564003627, "learning_rate": 3.864852992655616e-07, "logits/chosen": -2.5024819374084473, "logits/rejected": -2.3657126426696777, "logps/chosen": -242.416015625, "logps/rejected": -232.46585083007812, "loss": 0.6768, "positive_losses": 0.18492698669433594, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.09728354960680008, "rewards/margins": 0.03542621061205864, "rewards/margins_max": 0.11199555546045303, "rewards/margins_min": -0.04032611474394798, "rewards/margins_std": 0.06558045744895935, "rewards/rejected": 0.061857350170612335, "step": 1470 }, { "dpo_losses": 0.675035834312439, "epoch": 0.39, "grad_norm": 21.29311744639421, "learning_rate": 3.845656514108515e-07, "logits/chosen": -2.441222667694092, "logits/rejected": -2.3946261405944824, "logps/chosen": -218.35134887695312, "logps/rejected": -254.07388305664062, "loss": 0.6848, "positive_losses": 0.14456062018871307, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.09519155323505402, "rewards/margins": 0.03797183185815811, "rewards/margins_max": 0.11314894258975983, "rewards/margins_min": -0.023918112739920616, "rewards/margins_std": 0.06559471786022186, "rewards/rejected": 0.057219721376895905, "step": 1480 }, { "dpo_losses": 0.6662026047706604, "epoch": 0.39, "grad_norm": 2.814250147195734, "learning_rate": 3.8263476736297375e-07, "logits/chosen": -2.3352134227752686, "logits/rejected": -2.1717777252197266, "logps/chosen": -244.5308380126953, "logps/rejected": -238.9051971435547, "loss": 0.6829, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.10796190798282623, "rewards/margins": 0.05640899017453194, "rewards/margins_max": 0.16118457913398743, "rewards/margins_min": -0.02142479456961155, "rewards/margins_std": 0.08248274028301239, "rewards/rejected": 0.0515529103577137, "step": 1490 }, { "dpo_losses": 0.6708699464797974, "epoch": 0.39, "grad_norm": 2.3807248835642314, "learning_rate": 3.8069280835019055e-07, "logits/chosen": -2.4066412448883057, "logits/rejected": -2.2527923583984375, "logps/chosen": -223.0205078125, "logps/rejected": -195.32302856445312, "loss": 0.6848, "positive_losses": 0.06665535271167755, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1191413402557373, "rewards/margins": 0.048001233488321304, "rewards/margins_max": 0.17757447063922882, "rewards/margins_min": -0.05411670356988907, "rewards/margins_std": 0.10390839725732803, "rewards/rejected": 0.0711401104927063, "step": 1500 }, { "epoch": 0.39, "eval_dpo_losses": 0.672677755355835, "eval_logits/chosen": -2.3127338886260986, "eval_logits/rejected": -2.203519582748413, "eval_logps/chosen": -264.2325744628906, "eval_logps/rejected": -255.61822509765625, "eval_loss": 0.6856805682182312, "eval_positive_losses": 0.1049688458442688, "eval_rewards/accuracies": 0.7023809552192688, "eval_rewards/chosen": 0.1154155433177948, "eval_rewards/margins": 0.043394919484853745, "eval_rewards/margins_max": 0.18140065670013428, "eval_rewards/margins_min": -0.07706259191036224, "eval_rewards/margins_std": 0.08532083034515381, "eval_rewards/rejected": 0.07202062010765076, "eval_runtime": 389.1905, "eval_samples_per_second": 5.139, "eval_steps_per_second": 0.162, "step": 1500 }, { "dpo_losses": 0.6771584749221802, "epoch": 0.4, "grad_norm": 19.61959706179481, "learning_rate": 3.7873993652552073e-07, "logits/chosen": -2.4989778995513916, "logits/rejected": -2.2445225715637207, "logps/chosen": -284.41064453125, "logps/rejected": -250.82083129882812, "loss": 0.6979, "positive_losses": 0.19508323073387146, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.11048240959644318, "rewards/margins": 0.03401091694831848, "rewards/margins_max": 0.11306263506412506, "rewards/margins_min": -0.04341377690434456, "rewards/margins_std": 0.07349542528390884, "rewards/rejected": 0.07647150754928589, "step": 1510 }, { "dpo_losses": 0.6729769110679626, "epoch": 0.4, "grad_norm": 2.8311486703958657, "learning_rate": 3.767763149531995e-07, "logits/chosen": -2.3952064514160156, "logits/rejected": -2.0987696647644043, "logps/chosen": -260.8228759765625, "logps/rejected": -207.1134796142578, "loss": 0.6814, "positive_losses": 0.07905330508947372, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.12159881740808487, "rewards/margins": 0.04311979562044144, "rewards/margins_max": 0.16991779208183289, "rewards/margins_min": -0.04385266453027725, "rewards/margins_std": 0.09640352427959442, "rewards/rejected": 0.07847900688648224, "step": 1520 }, { "dpo_losses": 0.6735920906066895, "epoch": 0.4, "grad_norm": 8.015347541037336, "learning_rate": 3.7480210759506326e-07, "logits/chosen": -2.360323429107666, "logits/rejected": -2.247368335723877, "logps/chosen": -257.65106201171875, "logps/rejected": -225.16799926757812, "loss": 0.6711, "positive_losses": 0.01614837720990181, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.12395969778299332, "rewards/margins": 0.040385790169239044, "rewards/margins_max": 0.10268676280975342, "rewards/margins_min": -0.01606556586921215, "rewards/margins_std": 0.05205771327018738, "rewards/rejected": 0.08357391506433487, "step": 1530 }, { "dpo_losses": 0.6802448034286499, "epoch": 0.4, "grad_norm": 17.12508375606799, "learning_rate": 3.728174792968582e-07, "logits/chosen": -2.4028515815734863, "logits/rejected": -2.249748468399048, "logps/chosen": -306.1953430175781, "logps/rejected": -280.787353515625, "loss": 0.6792, "positive_losses": 0.033181000500917435, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.11910251528024673, "rewards/margins": 0.028476929292082787, "rewards/margins_max": 0.12485118955373764, "rewards/margins_min": -0.05928092077374458, "rewards/margins_std": 0.08486507087945938, "rewards/rejected": 0.0906255841255188, "step": 1540 }, { "dpo_losses": 0.6680120229721069, "epoch": 0.41, "grad_norm": 25.594272258865605, "learning_rate": 3.70822595774476e-07, "logits/chosen": -2.3109679222106934, "logits/rejected": -2.3119215965270996, "logps/chosen": -281.07476806640625, "logps/rejected": -272.7761535644531, "loss": 0.6777, "positive_losses": 0.1575981080532074, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.12642286717891693, "rewards/margins": 0.05317026376724243, "rewards/margins_max": 0.13846921920776367, "rewards/margins_min": -0.03054303303360939, "rewards/margins_std": 0.07902370393276215, "rewards/rejected": 0.0732526183128357, "step": 1550 }, { "dpo_losses": 0.6812738180160522, "epoch": 0.41, "grad_norm": 2.727018389257601, "learning_rate": 3.688176236001168e-07, "logits/chosen": -2.281219959259033, "logits/rejected": -2.1885035037994385, "logps/chosen": -243.2242431640625, "logps/rejected": -233.10806274414062, "loss": 0.6853, "positive_losses": 0.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.10374365001916885, "rewards/margins": 0.026002958416938782, "rewards/margins_max": 0.13738763332366943, "rewards/margins_min": -0.07183308899402618, "rewards/margins_std": 0.09161803126335144, "rewards/rejected": 0.07774068415164948, "step": 1560 }, { "dpo_losses": 0.6652746796607971, "epoch": 0.41, "grad_norm": 16.85484817441029, "learning_rate": 3.6680273018838016e-07, "logits/chosen": -2.417208194732666, "logits/rejected": -2.2463088035583496, "logps/chosen": -326.94342041015625, "logps/rejected": -255.0051727294922, "loss": 0.6843, "positive_losses": 0.20616760849952698, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.12754754722118378, "rewards/margins": 0.05929331108927727, "rewards/margins_max": 0.18207277357578278, "rewards/margins_min": -0.04680194333195686, "rewards/margins_std": 0.10363826900720596, "rewards/rejected": 0.0682542473077774, "step": 1570 }, { "dpo_losses": 0.6775561571121216, "epoch": 0.41, "grad_norm": 12.220544395495056, "learning_rate": 3.6477808378228596e-07, "logits/chosen": -2.4180045127868652, "logits/rejected": -2.4326188564300537, "logps/chosen": -250.2310791015625, "logps/rejected": -242.7777099609375, "loss": 0.6819, "positive_losses": 0.40742263197898865, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.10258843004703522, "rewards/margins": 0.033382825553417206, "rewards/margins_max": 0.12542590498924255, "rewards/margins_min": -0.04625628516077995, "rewards/margins_std": 0.07698404788970947, "rewards/rejected": 0.06920559704303741, "step": 1580 }, { "dpo_losses": 0.6771516799926758, "epoch": 0.42, "grad_norm": 21.24608333620355, "learning_rate": 3.6274385343922674e-07, "logits/chosen": -2.443016290664673, "logits/rejected": -2.4929726123809814, "logps/chosen": -288.58599853515625, "logps/rejected": -278.4659729003906, "loss": 0.6859, "positive_losses": 0.11496429145336151, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.12747585773468018, "rewards/margins": 0.035403721034526825, "rewards/margins_max": 0.1638483852148056, "rewards/margins_min": -0.08708731085062027, "rewards/margins_std": 0.11001044511795044, "rewards/rejected": 0.09207214415073395, "step": 1590 }, { "dpo_losses": 0.6759015321731567, "epoch": 0.42, "grad_norm": 2.5031879126635173, "learning_rate": 3.6070020901685057e-07, "logits/chosen": -2.3058149814605713, "logits/rejected": -2.2169103622436523, "logps/chosen": -233.39419555664062, "logps/rejected": -205.3389434814453, "loss": 0.6808, "positive_losses": 0.03036193922162056, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.11742230504751205, "rewards/margins": 0.03598679229617119, "rewards/margins_max": 0.10207675397396088, "rewards/margins_min": -0.03041153773665428, "rewards/margins_std": 0.05977478623390198, "rewards/rejected": 0.08143551647663116, "step": 1600 }, { "epoch": 0.42, "eval_dpo_losses": 0.6729524731636047, "eval_logits/chosen": -2.321192979812622, "eval_logits/rejected": -2.2128562927246094, "eval_logps/chosen": -263.70458984375, "eval_logps/rejected": -255.02305603027344, "eval_loss": 0.6826204657554626, "eval_positive_losses": 0.07956277579069138, "eval_rewards/accuracies": 0.7142857313156128, "eval_rewards/chosen": 0.12069569528102875, "eval_rewards/margins": 0.042723335325717926, "eval_rewards/margins_max": 0.17632700502872467, "eval_rewards/margins_min": -0.07449869066476822, "eval_rewards/margins_std": 0.08286270499229431, "eval_rewards/rejected": 0.07797236740589142, "eval_runtime": 389.4235, "eval_samples_per_second": 5.136, "eval_steps_per_second": 0.162, "step": 1600 }, { "dpo_losses": 0.6691166758537292, "epoch": 0.42, "grad_norm": 2.279580432869315, "learning_rate": 3.5864732115887863e-07, "logits/chosen": -2.477266788482666, "logits/rejected": -2.306077718734741, "logps/chosen": -246.0018310546875, "logps/rejected": -232.3708953857422, "loss": 0.6817, "positive_losses": 0.0, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.12005207687616348, "rewards/margins": 0.05094514042139053, "rewards/margins_max": 0.144468754529953, "rewards/margins_min": -0.019564500078558922, "rewards/margins_std": 0.07487691938877106, "rewards/rejected": 0.06910693645477295, "step": 1610 }, { "dpo_losses": 0.6736332178115845, "epoch": 0.42, "grad_norm": 2.8177240809613426, "learning_rate": 3.565853612808562e-07, "logits/chosen": -2.3282740116119385, "logits/rejected": -2.124765157699585, "logps/chosen": -218.62118530273438, "logps/rejected": -205.39071655273438, "loss": 0.6778, "positive_losses": 0.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.11677243560552597, "rewards/margins": 0.041856471449136734, "rewards/margins_max": 0.148008793592453, "rewards/margins_min": -0.03521239012479782, "rewards/margins_std": 0.08438049256801605, "rewards/rejected": 0.07491596788167953, "step": 1620 }, { "dpo_losses": 0.6642592549324036, "epoch": 0.43, "grad_norm": 2.4007329735462495, "learning_rate": 3.5451450155583984e-07, "logits/chosen": -2.588942289352417, "logits/rejected": -2.303483486175537, "logps/chosen": -274.9673767089844, "logps/rejected": -248.8934326171875, "loss": 0.6743, "positive_losses": 0.032569121569395065, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.12685735523700714, "rewards/margins": 0.06063305586576462, "rewards/margins_max": 0.14821195602416992, "rewards/margins_min": -0.029791027307510376, "rewards/margins_std": 0.07603773474693298, "rewards/rejected": 0.06622429937124252, "step": 1630 }, { "dpo_losses": 0.672497570514679, "epoch": 0.43, "grad_norm": 2.9223868997714124, "learning_rate": 3.5243491490002055e-07, "logits/chosen": -2.475343942642212, "logits/rejected": -2.256777286529541, "logps/chosen": -261.9552307128906, "logps/rejected": -222.695068359375, "loss": 0.6745, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.12895427644252777, "rewards/margins": 0.043666500598192215, "rewards/margins_max": 0.13691024482250214, "rewards/margins_min": -0.03571964055299759, "rewards/margins_std": 0.07629676908254623, "rewards/rejected": 0.08528777211904526, "step": 1640 }, { "dpo_losses": 0.6632629632949829, "epoch": 0.43, "grad_norm": 3.1697521904676726, "learning_rate": 3.503467749582857e-07, "logits/chosen": -2.391340732574463, "logits/rejected": -2.1258997917175293, "logps/chosen": -359.4380798339844, "logps/rejected": -271.98785400390625, "loss": 0.6761, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.1356867551803589, "rewards/margins": 0.06345061957836151, "rewards/margins_max": 0.15397801995277405, "rewards/margins_min": -0.05107613280415535, "rewards/margins_std": 0.09376613795757294, "rewards/rejected": 0.07223614305257797, "step": 1650 }, { "dpo_losses": 0.6636583209037781, "epoch": 0.43, "grad_norm": 14.156037116397382, "learning_rate": 3.482502560897194e-07, "logits/chosen": -2.3435370922088623, "logits/rejected": -2.3291754722595215, "logps/chosen": -226.92581176757812, "logps/rejected": -278.88116455078125, "loss": 0.6776, "positive_losses": 0.029582977294921875, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.12310995906591415, "rewards/margins": 0.062410563230514526, "rewards/margins_max": 0.1651877909898758, "rewards/margins_min": -0.01980203576385975, "rewards/margins_std": 0.08251117169857025, "rewards/rejected": 0.060699403285980225, "step": 1660 }, { "dpo_losses": 0.6810007095336914, "epoch": 0.44, "grad_norm": 2.5215519808120663, "learning_rate": 3.4614553335304403e-07, "logits/chosen": -2.5205461978912354, "logits/rejected": -2.3930211067199707, "logps/chosen": -239.0262908935547, "logps/rejected": -222.79150390625, "loss": 0.6813, "positive_losses": 0.21976470947265625, "rewards/accuracies": 0.625, "rewards/chosen": 0.10838184505701065, "rewards/margins": 0.026057234033942223, "rewards/margins_max": 0.10835113376379013, "rewards/margins_min": -0.06028624251484871, "rewards/margins_std": 0.07714466750621796, "rewards/rejected": 0.08232460916042328, "step": 1670 }, { "dpo_losses": 0.6774951219558716, "epoch": 0.44, "grad_norm": 18.965644695367924, "learning_rate": 3.440327824920022e-07, "logits/chosen": -2.4017586708068848, "logits/rejected": -2.2704827785491943, "logps/chosen": -287.504638671875, "logps/rejected": -239.3936309814453, "loss": 0.6885, "positive_losses": 0.11305618286132812, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.11638344824314117, "rewards/margins": 0.03384127467870712, "rewards/margins_max": 0.13455340266227722, "rewards/margins_min": -0.07597328722476959, "rewards/margins_std": 0.09197097271680832, "rewards/rejected": 0.08254217356443405, "step": 1680 }, { "dpo_losses": 0.679038941860199, "epoch": 0.44, "grad_norm": 2.5404753055554794, "learning_rate": 3.4191217992068287e-07, "logits/chosen": -2.2950174808502197, "logits/rejected": -2.211327075958252, "logps/chosen": -216.05142211914062, "logps/rejected": -241.6428680419922, "loss": 0.6914, "positive_losses": 0.019229888916015625, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1115482822060585, "rewards/margins": 0.03092820569872856, "rewards/margins_max": 0.12419779598712921, "rewards/margins_min": -0.06095464155077934, "rewards/margins_std": 0.08323143422603607, "rewards/rejected": 0.08062006533145905, "step": 1690 }, { "dpo_losses": 0.6862069368362427, "epoch": 0.44, "grad_norm": 2.758654673856194, "learning_rate": 3.3978390270879056e-07, "logits/chosen": -2.3210432529449463, "logits/rejected": -2.318216562271118, "logps/chosen": -191.92111206054688, "logps/rejected": -247.7636260986328, "loss": 0.6732, "positive_losses": 0.017992209643125534, "rewards/accuracies": 0.5, "rewards/chosen": 0.09574076533317566, "rewards/margins": 0.016351768746972084, "rewards/margins_max": 0.11797819286584854, "rewards/margins_min": -0.08319463580846786, "rewards/margins_std": 0.08919573575258255, "rewards/rejected": 0.07938901335000992, "step": 1700 }, { "epoch": 0.44, "eval_dpo_losses": 0.67174232006073, "eval_logits/chosen": -2.314293622970581, "eval_logits/rejected": -2.205415725708008, "eval_logps/chosen": -263.95953369140625, "eval_logps/rejected": -255.54295349121094, "eval_loss": 0.6845545768737793, "eval_positive_losses": 0.10288190096616745, "eval_rewards/accuracies": 0.704365074634552, "eval_rewards/chosen": 0.11814623326063156, "eval_rewards/margins": 0.04537297785282135, "eval_rewards/margins_max": 0.1850803643465042, "eval_rewards/margins_min": -0.07645407319068909, "eval_rewards/margins_std": 0.08680056035518646, "eval_rewards/rejected": 0.07277326285839081, "eval_runtime": 389.1742, "eval_samples_per_second": 5.139, "eval_steps_per_second": 0.162, "step": 1700 }, { "dpo_losses": 0.6764692068099976, "epoch": 0.45, "grad_norm": 34.120270677322445, "learning_rate": 3.376481285668599e-07, "logits/chosen": -2.4371509552001953, "logits/rejected": -2.397617816925049, "logps/chosen": -232.47000122070312, "logps/rejected": -223.736083984375, "loss": 0.6939, "positive_losses": 0.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.12124262750148773, "rewards/margins": 0.03551114350557327, "rewards/margins_max": 0.1231713742017746, "rewards/margins_min": -0.042569421231746674, "rewards/margins_std": 0.0739036500453949, "rewards/rejected": 0.08573149144649506, "step": 1710 }, { "dpo_losses": 0.6769440174102783, "epoch": 0.45, "grad_norm": 2.8659043104192246, "learning_rate": 3.355050358314172e-07, "logits/chosen": -2.4147379398345947, "logits/rejected": -2.3089993000030518, "logps/chosen": -234.2282257080078, "logps/rejected": -263.7807922363281, "loss": 0.6837, "positive_losses": 0.01812870427966118, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.12960085272789001, "rewards/margins": 0.03511188179254532, "rewards/margins_max": 0.14957959949970245, "rewards/margins_min": -0.07076962292194366, "rewards/margins_std": 0.09437619149684906, "rewards/rejected": 0.0944889634847641, "step": 1720 }, { "dpo_losses": 0.6762871742248535, "epoch": 0.45, "grad_norm": 2.668998463698481, "learning_rate": 3.33354803450089e-07, "logits/chosen": -2.4239468574523926, "logits/rejected": -2.2775349617004395, "logps/chosen": -263.69525146484375, "logps/rejected": -285.1688537597656, "loss": 0.7093, "positive_losses": 0.341064453125, "rewards/accuracies": 0.75, "rewards/chosen": 0.11778296530246735, "rewards/margins": 0.036462798714637756, "rewards/margins_max": 0.12930986285209656, "rewards/margins_min": -0.05971219390630722, "rewards/margins_std": 0.08384320139884949, "rewards/rejected": 0.0813201516866684, "step": 1730 }, { "dpo_losses": 0.6578310132026672, "epoch": 0.46, "grad_norm": 67.87122313386948, "learning_rate": 3.311976109666605e-07, "logits/chosen": -2.3400423526763916, "logits/rejected": -2.144742012023926, "logps/chosen": -292.26275634765625, "logps/rejected": -227.45693969726562, "loss": 0.6822, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.1360526978969574, "rewards/margins": 0.07453630864620209, "rewards/margins_max": 0.19015610218048096, "rewards/margins_min": -0.027337830513715744, "rewards/margins_std": 0.09655106067657471, "rewards/rejected": 0.061516374349594116, "step": 1740 }, { "dpo_losses": 0.6718469858169556, "epoch": 0.46, "grad_norm": 2.616414334548666, "learning_rate": 3.2903363850608317e-07, "logits/chosen": -2.418524980545044, "logits/rejected": -2.367490768432617, "logps/chosen": -247.1109161376953, "logps/rejected": -258.22576904296875, "loss": 0.677, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.11959341913461685, "rewards/margins": 0.04436866194009781, "rewards/margins_max": 0.11880648136138916, "rewards/margins_min": -0.018970290198922157, "rewards/margins_std": 0.061818283051252365, "rewards/rejected": 0.07522475719451904, "step": 1750 }, { "dpo_losses": 0.656428337097168, "epoch": 0.46, "grad_norm": 2.6165900160462194, "learning_rate": 3.2686306675943477e-07, "logits/chosen": -2.227027416229248, "logits/rejected": -2.0776469707489014, "logps/chosen": -243.53445434570312, "logps/rejected": -232.5414581298828, "loss": 0.6839, "positive_losses": 0.033985137939453125, "rewards/accuracies": 0.75, "rewards/chosen": 0.13451433181762695, "rewards/margins": 0.07702252268791199, "rewards/margins_max": 0.19696083664894104, "rewards/margins_min": -0.006645015440881252, "rewards/margins_std": 0.09024585038423538, "rewards/rejected": 0.057491790503263474, "step": 1760 }, { "dpo_losses": 0.673287034034729, "epoch": 0.46, "grad_norm": 10.725563841894862, "learning_rate": 3.2468607696883145e-07, "logits/chosen": -2.464648485183716, "logits/rejected": -2.426926374435425, "logps/chosen": -296.7885437011719, "logps/rejected": -297.307373046875, "loss": 0.6815, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.13295727968215942, "rewards/margins": 0.041971445083618164, "rewards/margins_max": 0.13836826384067535, "rewards/margins_min": -0.04463455453515053, "rewards/margins_std": 0.07990754395723343, "rewards/rejected": 0.09098584949970245, "step": 1770 }, { "dpo_losses": 0.6736095547676086, "epoch": 0.47, "grad_norm": 2.7613899866196236, "learning_rate": 3.2250285091229435e-07, "logits/chosen": -2.3886301517486572, "logits/rejected": -2.279801845550537, "logps/chosen": -277.1134338378906, "logps/rejected": -248.1231689453125, "loss": 0.6797, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.11940373480319977, "rewards/margins": 0.04109395295381546, "rewards/margins_max": 0.12384055554866791, "rewards/margins_min": -0.037078097462654114, "rewards/margins_std": 0.07173661887645721, "rewards/rejected": 0.07830978184938431, "step": 1780 }, { "dpo_losses": 0.6730136275291443, "epoch": 0.47, "grad_norm": 3.207521278031283, "learning_rate": 3.2031357088857083e-07, "logits/chosen": -2.3274896144866943, "logits/rejected": -2.250530242919922, "logps/chosen": -257.1026611328125, "logps/rejected": -215.4867401123047, "loss": 0.681, "positive_losses": 0.03467712551355362, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.11962921917438507, "rewards/margins": 0.04227549582719803, "rewards/margins_max": 0.1462966650724411, "rewards/margins_min": -0.03869865462183952, "rewards/margins_std": 0.0816623792052269, "rewards/rejected": 0.07735371589660645, "step": 1790 }, { "dpo_losses": 0.6722658276557922, "epoch": 0.47, "grad_norm": 3.0706491843866206, "learning_rate": 3.1811841970191267e-07, "logits/chosen": -2.6015467643737793, "logits/rejected": -2.2039971351623535, "logps/chosen": -324.97552490234375, "logps/rejected": -286.8111572265625, "loss": 0.6995, "positive_losses": 0.21692581474781036, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.11262290179729462, "rewards/margins": 0.04472409188747406, "rewards/margins_max": 0.14744366705417633, "rewards/margins_min": -0.05651304870843887, "rewards/margins_std": 0.08603670448064804, "rewards/rejected": 0.06789880990982056, "step": 1800 }, { "epoch": 0.47, "eval_dpo_losses": 0.6722102165222168, "eval_logits/chosen": -2.313633680343628, "eval_logits/rejected": -2.204257011413574, "eval_logps/chosen": -263.4484558105469, "eval_logps/rejected": -254.92019653320312, "eval_loss": 0.6804162859916687, "eval_positive_losses": 0.06985494494438171, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 0.12325708568096161, "eval_rewards/margins": 0.044256262481212616, "eval_rewards/margins_max": 0.17775839567184448, "eval_rewards/margins_min": -0.07445338368415833, "eval_rewards/margins_std": 0.0832548514008522, "eval_rewards/rejected": 0.07900082319974899, "eval_runtime": 389.1873, "eval_samples_per_second": 5.139, "eval_steps_per_second": 0.162, "step": 1800 }, { "dpo_losses": 0.6654368042945862, "epoch": 0.47, "grad_norm": 2.804312593272828, "learning_rate": 3.1591758064681257e-07, "logits/chosen": -2.4925196170806885, "logits/rejected": -2.2858967781066895, "logps/chosen": -311.110107421875, "logps/rejected": -285.68389892578125, "loss": 0.6767, "positive_losses": 0.0401611328125, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.13894125819206238, "rewards/margins": 0.05779231712222099, "rewards/margins_max": 0.13284161686897278, "rewards/margins_min": -0.006430783774703741, "rewards/margins_std": 0.06498508155345917, "rewards/rejected": 0.0811489149928093, "step": 1810 }, { "dpo_losses": 0.6682026982307434, "epoch": 0.48, "grad_norm": 3.0229646377034687, "learning_rate": 3.13711237492698e-07, "logits/chosen": -2.322195529937744, "logits/rejected": -2.178256034851074, "logps/chosen": -268.0097961425781, "logps/rejected": -278.3845520019531, "loss": 0.679, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.13032406568527222, "rewards/margins": 0.05238030105829239, "rewards/margins_max": 0.13246887922286987, "rewards/margins_min": -0.026665234938263893, "rewards/margins_std": 0.06984977424144745, "rewards/rejected": 0.07794377207756042, "step": 1820 }, { "dpo_losses": 0.6673954129219055, "epoch": 0.48, "grad_norm": 2.605782325553638, "learning_rate": 3.1149957446858767e-07, "logits/chosen": -2.3244545459747314, "logits/rejected": -2.207878351211548, "logps/chosen": -256.9566650390625, "logps/rejected": -315.4747009277344, "loss": 0.6753, "positive_losses": 0.01052780169993639, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1367064267396927, "rewards/margins": 0.056025873869657516, "rewards/margins_max": 0.18707223236560822, "rewards/margins_min": -0.07942967116832733, "rewards/margins_std": 0.1153632178902626, "rewards/rejected": 0.08068054169416428, "step": 1830 }, { "dpo_losses": 0.6703895926475525, "epoch": 0.48, "grad_norm": 12.439256093637027, "learning_rate": 3.0928277624770736e-07, "logits/chosen": -2.5592637062072754, "logits/rejected": -2.347291946411133, "logps/chosen": -225.4854736328125, "logps/rejected": -230.8401641845703, "loss": 0.6731, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.12228335440158844, "rewards/margins": 0.04756082966923714, "rewards/margins_max": 0.12534084916114807, "rewards/margins_min": -0.04288121312856674, "rewards/margins_std": 0.07646225392818451, "rewards/rejected": 0.074722521007061, "step": 1840 }, { "dpo_losses": 0.6799191832542419, "epoch": 0.48, "grad_norm": 11.875014134448826, "learning_rate": 3.0706102793207073e-07, "logits/chosen": -2.4091084003448486, "logits/rejected": -2.1726622581481934, "logps/chosen": -218.6199493408203, "logps/rejected": -207.18002319335938, "loss": 0.6789, "positive_losses": 0.18042020499706268, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.10656299442052841, "rewards/margins": 0.02885150909423828, "rewards/margins_max": 0.11726919561624527, "rewards/margins_min": -0.05387432500720024, "rewards/margins_std": 0.07662785053253174, "rewards/rejected": 0.07771147787570953, "step": 1850 }, { "dpo_losses": 0.6633358597755432, "epoch": 0.49, "grad_norm": 9.270947630674176, "learning_rate": 3.048345150370226e-07, "logits/chosen": -2.264721632003784, "logits/rejected": -2.177180051803589, "logps/chosen": -216.2780303955078, "logps/rejected": -236.80899047851562, "loss": 0.6849, "positive_losses": 0.07617492973804474, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.13391555845737457, "rewards/margins": 0.0630137100815773, "rewards/margins_max": 0.1710374355316162, "rewards/margins_min": -0.018290380015969276, "rewards/margins_std": 0.08563703298568726, "rewards/rejected": 0.07090185582637787, "step": 1860 }, { "dpo_losses": 0.6713906526565552, "epoch": 0.49, "grad_norm": 2.711731016790175, "learning_rate": 3.0260342347574913e-07, "logits/chosen": -2.2666735649108887, "logits/rejected": -2.233799457550049, "logps/chosen": -246.74520874023438, "logps/rejected": -274.8251953125, "loss": 0.6706, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.11080590635538101, "rewards/margins": 0.04525129497051239, "rewards/margins_max": 0.12506887316703796, "rewards/margins_min": -0.017419980838894844, "rewards/margins_std": 0.06518755108118057, "rewards/rejected": 0.06555460393428802, "step": 1870 }, { "dpo_losses": 0.673332691192627, "epoch": 0.49, "grad_norm": 12.78618737248966, "learning_rate": 3.0036793954375357e-07, "logits/chosen": -2.4488892555236816, "logits/rejected": -2.2490506172180176, "logps/chosen": -240.43649291992188, "logps/rejected": -233.9011688232422, "loss": 0.6831, "positive_losses": 0.09395217895507812, "rewards/accuracies": 0.625, "rewards/chosen": 0.12613634765148163, "rewards/margins": 0.04207058995962143, "rewards/margins_max": 0.14051197469234467, "rewards/margins_min": -0.04489634186029434, "rewards/margins_std": 0.08194707334041595, "rewards/rejected": 0.0840657576918602, "step": 1880 }, { "dpo_losses": 0.6874047517776489, "epoch": 0.49, "grad_norm": 2.6999201656830545, "learning_rate": 2.9812824990330085e-07, "logits/chosen": -2.4783337116241455, "logits/rejected": -2.496032476425171, "logps/chosen": -276.6015625, "logps/rejected": -343.79962158203125, "loss": 0.684, "positive_losses": 0.17296981811523438, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.10020841658115387, "rewards/margins": 0.013676243834197521, "rewards/margins_max": 0.10636010020971298, "rewards/margins_min": -0.06935359537601471, "rewards/margins_std": 0.07755988091230392, "rewards/rejected": 0.08653218299150467, "step": 1890 }, { "dpo_losses": 0.6682941913604736, "epoch": 0.5, "grad_norm": 2.3790518680095927, "learning_rate": 2.958845415678316e-07, "logits/chosen": -2.501760721206665, "logits/rejected": -2.2126975059509277, "logps/chosen": -251.99496459960938, "logps/rejected": -219.154296875, "loss": 0.6824, "positive_losses": 0.19085045158863068, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1222560778260231, "rewards/margins": 0.05262229964137077, "rewards/margins_max": 0.1504717767238617, "rewards/margins_min": -0.031698085367679596, "rewards/margins_std": 0.08034897595643997, "rewards/rejected": 0.06963379681110382, "step": 1900 }, { "epoch": 0.5, "eval_dpo_losses": 0.6712539792060852, "eval_logits/chosen": -2.316337823867798, "eval_logits/rejected": -2.207723379135132, "eval_logps/chosen": -263.53961181640625, "eval_logps/rejected": -255.22503662109375, "eval_loss": 0.681625247001648, "eval_positive_losses": 0.08613637834787369, "eval_rewards/accuracies": 0.7063491940498352, "eval_rewards/chosen": 0.12234490364789963, "eval_rewards/margins": 0.04639248922467232, "eval_rewards/margins_max": 0.18661029636859894, "eval_rewards/margins_min": -0.07671674340963364, "eval_rewards/margins_std": 0.08719097077846527, "eval_rewards/rejected": 0.07595241814851761, "eval_runtime": 389.0253, "eval_samples_per_second": 5.141, "eval_steps_per_second": 0.162, "step": 1900 }, { "dpo_losses": 0.6703254580497742, "epoch": 0.5, "grad_norm": 16.490605309586503, "learning_rate": 2.936370018863459e-07, "logits/chosen": -2.382408618927002, "logits/rejected": -2.2437782287597656, "logps/chosen": -230.81478881835938, "logps/rejected": -235.80105590820312, "loss": 0.6796, "positive_losses": 0.0, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.13231012225151062, "rewards/margins": 0.04782681167125702, "rewards/margins_max": 0.11688504368066788, "rewards/margins_min": -0.02766653336584568, "rewards/margins_std": 0.0663471594452858, "rewards/rejected": 0.0844833180308342, "step": 1910 }, { "dpo_losses": 0.6762624382972717, "epoch": 0.5, "grad_norm": 10.921214158744982, "learning_rate": 2.913858185277605e-07, "logits/chosen": -2.3846967220306396, "logits/rejected": -2.2659671306610107, "logps/chosen": -251.1505584716797, "logps/rejected": -199.11154174804688, "loss": 0.6897, "positive_losses": 0.04620208591222763, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1087828055024147, "rewards/margins": 0.03668287768959999, "rewards/margins_max": 0.15579333901405334, "rewards/margins_min": -0.051136285066604614, "rewards/margins_std": 0.0928848534822464, "rewards/rejected": 0.07209992408752441, "step": 1920 }, { "dpo_losses": 0.6675348281860352, "epoch": 0.51, "grad_norm": 13.143769131880989, "learning_rate": 2.89131179465238e-07, "logits/chosen": -2.4216268062591553, "logits/rejected": -2.158876657485962, "logps/chosen": -325.57855224609375, "logps/rejected": -230.1248016357422, "loss": 0.6831, "positive_losses": 0.09407653659582138, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.11387120187282562, "rewards/margins": 0.05382996052503586, "rewards/margins_max": 0.144154354929924, "rewards/margins_min": -0.01063024252653122, "rewards/margins_std": 0.07056508958339691, "rewards/rejected": 0.06004124879837036, "step": 1930 }, { "dpo_losses": 0.672429621219635, "epoch": 0.51, "grad_norm": 2.8120893390512443, "learning_rate": 2.8687327296049125e-07, "logits/chosen": -2.3585832118988037, "logits/rejected": -2.296858072280884, "logps/chosen": -244.4750518798828, "logps/rejected": -250.869873046875, "loss": 0.6773, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.12822110950946808, "rewards/margins": 0.043890226632356644, "rewards/margins_max": 0.1373916119337082, "rewards/margins_min": -0.02716038189828396, "rewards/margins_std": 0.07384473830461502, "rewards/rejected": 0.08433087915182114, "step": 1940 }, { "dpo_losses": 0.6721124649047852, "epoch": 0.51, "grad_norm": 2.8988357640484343, "learning_rate": 2.846122875480637e-07, "logits/chosen": -2.354259729385376, "logits/rejected": -2.3565616607666016, "logps/chosen": -282.6304626464844, "logps/rejected": -270.01202392578125, "loss": 0.6758, "positive_losses": 0.05466156080365181, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.12170840799808502, "rewards/margins": 0.0444231852889061, "rewards/margins_max": 0.1315532624721527, "rewards/margins_min": -0.031354110687971115, "rewards/margins_std": 0.07245416939258575, "rewards/rejected": 0.07728521525859833, "step": 1950 }, { "dpo_losses": 0.6553946733474731, "epoch": 0.51, "grad_norm": 2.8012205494971383, "learning_rate": 2.8234841201958647e-07, "logits/chosen": -2.6282870769500732, "logits/rejected": -2.361875534057617, "logps/chosen": -280.6455078125, "logps/rejected": -250.8061981201172, "loss": 0.685, "positive_losses": 0.031368255615234375, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.14805732667446136, "rewards/margins": 0.07982449233531952, "rewards/margins_max": 0.1970626413822174, "rewards/margins_min": -0.013273512944579124, "rewards/margins_std": 0.09581952542066574, "rewards/rejected": 0.06823284924030304, "step": 1960 }, { "dpo_losses": 0.6719330549240112, "epoch": 0.52, "grad_norm": 25.136510840729144, "learning_rate": 2.800818354080148e-07, "logits/chosen": -2.4884390830993652, "logits/rejected": -2.339339017868042, "logps/chosen": -266.663818359375, "logps/rejected": -253.10494995117188, "loss": 0.6919, "positive_losses": 0.075963594019413, "rewards/accuracies": 0.75, "rewards/chosen": 0.12589292228221893, "rewards/margins": 0.04531095176935196, "rewards/margins_max": 0.12366080284118652, "rewards/margins_min": -0.054113052785396576, "rewards/margins_std": 0.07959363609552383, "rewards/rejected": 0.08058197051286697, "step": 1970 }, { "dpo_losses": 0.6841176748275757, "epoch": 0.52, "grad_norm": 2.7675665812766486, "learning_rate": 2.778127469718435e-07, "logits/chosen": -2.3658573627471924, "logits/rejected": -2.261683225631714, "logps/chosen": -191.81185913085938, "logps/rejected": -206.1306915283203, "loss": 0.6862, "positive_losses": 0.0, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.12477536499500275, "rewards/margins": 0.01935800537467003, "rewards/margins_max": 0.10074321180582047, "rewards/margins_min": -0.03507170453667641, "rewards/margins_std": 0.061387162655591965, "rewards/rejected": 0.10541733354330063, "step": 1980 }, { "dpo_losses": 0.6662879586219788, "epoch": 0.52, "grad_norm": 2.4847661316272474, "learning_rate": 2.755413361793039e-07, "logits/chosen": -2.5276575088500977, "logits/rejected": -2.290635585784912, "logps/chosen": -243.6441192626953, "logps/rejected": -238.5072021484375, "loss": 0.6801, "positive_losses": 0.148579403758049, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.12011927366256714, "rewards/margins": 0.05569753795862198, "rewards/margins_max": 0.13553032279014587, "rewards/margins_min": -0.0066137490794062614, "rewards/margins_std": 0.06399749964475632, "rewards/rejected": 0.06442175805568695, "step": 1990 }, { "dpo_losses": 0.6880080699920654, "epoch": 0.52, "grad_norm": 2.4419212347476376, "learning_rate": 2.7326779269254356e-07, "logits/chosen": -2.3399906158447266, "logits/rejected": -2.323331117630005, "logps/chosen": -221.29183959960938, "logps/rejected": -221.89608764648438, "loss": 0.6805, "positive_losses": 0.024726103991270065, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1129118800163269, "rewards/margins": 0.011383811011910439, "rewards/margins_max": 0.07983001321554184, "rewards/margins_min": -0.06014372035861015, "rewards/margins_std": 0.062172554433345795, "rewards/rejected": 0.10152806341648102, "step": 2000 }, { "epoch": 0.52, "eval_dpo_losses": 0.6721857190132141, "eval_logits/chosen": -2.3145906925201416, "eval_logits/rejected": -2.2056307792663574, "eval_logps/chosen": -262.8673400878906, "eval_logps/rejected": -254.33956909179688, "eval_loss": 0.6786460280418396, "eval_positive_losses": 0.052308522164821625, "eval_rewards/accuracies": 0.7123016119003296, "eval_rewards/chosen": 0.12906791269779205, "eval_rewards/margins": 0.04426078498363495, "eval_rewards/margins_max": 0.17582374811172485, "eval_rewards/margins_min": -0.07321862876415253, "eval_rewards/margins_std": 0.08221457898616791, "eval_rewards/rejected": 0.08480710536241531, "eval_runtime": 388.8707, "eval_samples_per_second": 5.143, "eval_steps_per_second": 0.162, "step": 2000 }, { "dpo_losses": 0.672469973564148, "epoch": 0.53, "grad_norm": 2.7756196369822828, "learning_rate": 2.709923063517895e-07, "logits/chosen": -2.3407835960388184, "logits/rejected": -2.3254504203796387, "logps/chosen": -228.28256225585938, "logps/rejected": -218.29135131835938, "loss": 0.676, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.13284185528755188, "rewards/margins": 0.042964477092027664, "rewards/margins_max": 0.10426248610019684, "rewards/margins_min": -0.012125581502914429, "rewards/margins_std": 0.05200987309217453, "rewards/rejected": 0.08987738192081451, "step": 2010 }, { "dpo_losses": 0.6716371774673462, "epoch": 0.53, "grad_norm": 10.877571240461199, "learning_rate": 2.68715067159496e-07, "logits/chosen": -2.620425224304199, "logits/rejected": -2.426180839538574, "logps/chosen": -278.9306335449219, "logps/rejected": -232.401123046875, "loss": 0.6746, "positive_losses": 0.0, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.14334070682525635, "rewards/margins": 0.04644431546330452, "rewards/margins_max": 0.17115263640880585, "rewards/margins_min": -0.06203880161046982, "rewards/margins_std": 0.10304610431194305, "rewards/rejected": 0.09689638018608093, "step": 2020 }, { "dpo_losses": 0.6739299297332764, "epoch": 0.53, "grad_norm": 2.8145317368445286, "learning_rate": 2.664362652644806e-07, "logits/chosen": -2.4717283248901367, "logits/rejected": -2.396667003631592, "logps/chosen": -260.51544189453125, "logps/rejected": -255.59207153320312, "loss": 0.6802, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.12724120914936066, "rewards/margins": 0.040449660271406174, "rewards/margins_max": 0.13347701728343964, "rewards/margins_min": -0.03219344839453697, "rewards/margins_std": 0.0735369548201561, "rewards/rejected": 0.08679153770208359, "step": 2030 }, { "dpo_losses": 0.666767954826355, "epoch": 0.53, "grad_norm": 2.7956799559348036, "learning_rate": 2.6415609094604555e-07, "logits/chosen": -2.1382689476013184, "logits/rejected": -2.0867698192596436, "logps/chosen": -264.2413024902344, "logps/rejected": -206.7948455810547, "loss": 0.6775, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.11567956209182739, "rewards/margins": 0.05546707659959793, "rewards/margins_max": 0.1475561261177063, "rewards/margins_min": -0.03252142667770386, "rewards/margins_std": 0.08140332251787186, "rewards/rejected": 0.06021248549222946, "step": 2040 }, { "dpo_losses": 0.6652515530586243, "epoch": 0.54, "grad_norm": 10.960606869790631, "learning_rate": 2.618747345980904e-07, "logits/chosen": -2.47698974609375, "logits/rejected": -2.3628809452056885, "logps/chosen": -253.8305206298828, "logps/rejected": -245.3777618408203, "loss": 0.6852, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.11731608211994171, "rewards/margins": 0.05786700174212456, "rewards/margins_max": 0.1389814019203186, "rewards/margins_min": -0.004924899898469448, "rewards/margins_std": 0.06482270359992981, "rewards/rejected": 0.05944906920194626, "step": 2050 }, { "dpo_losses": 0.6709498167037964, "epoch": 0.54, "grad_norm": 18.281949439077025, "learning_rate": 2.595923867132136e-07, "logits/chosen": -2.40461802482605, "logits/rejected": -2.332186222076416, "logps/chosen": -272.1275634765625, "logps/rejected": -243.11367797851562, "loss": 0.6799, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.14160621166229248, "rewards/margins": 0.046191029250621796, "rewards/margins_max": 0.11678475141525269, "rewards/margins_min": -0.01861092820763588, "rewards/margins_std": 0.05945184826850891, "rewards/rejected": 0.09541517496109009, "step": 2060 }, { "dpo_losses": 0.673790693283081, "epoch": 0.54, "grad_norm": 2.5234911650156633, "learning_rate": 2.5730923786680667e-07, "logits/chosen": -2.2907633781433105, "logits/rejected": -2.3143460750579834, "logps/chosen": -211.24154663085938, "logps/rejected": -262.0733947753906, "loss": 0.6733, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.12364006042480469, "rewards/margins": 0.04044589027762413, "rewards/margins_max": 0.13304606080055237, "rewards/margins_min": -0.033517368137836456, "rewards/margins_std": 0.07433114945888519, "rewards/rejected": 0.08319418132305145, "step": 2070 }, { "dpo_losses": 0.6776586771011353, "epoch": 0.54, "grad_norm": 2.6801462031098127, "learning_rate": 2.5502547870114135e-07, "logits/chosen": -2.5126452445983887, "logits/rejected": -2.420867443084717, "logps/chosen": -202.7610626220703, "logps/rejected": -219.1664276123047, "loss": 0.6753, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.119606152176857, "rewards/margins": 0.03359980508685112, "rewards/margins_max": 0.1414192169904709, "rewards/margins_min": -0.07246644794940948, "rewards/margins_std": 0.09257280081510544, "rewards/rejected": 0.08600634336471558, "step": 2080 }, { "dpo_losses": 0.6733182668685913, "epoch": 0.55, "grad_norm": 2.919970702716745, "learning_rate": 2.527412999094506e-07, "logits/chosen": -2.296156406402588, "logits/rejected": -2.265514850616455, "logps/chosen": -253.04330444335938, "logps/rejected": -301.1412048339844, "loss": 0.6712, "positive_losses": 0.01033172570168972, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.11496015638113022, "rewards/margins": 0.042038727551698685, "rewards/margins_max": 0.1336946338415146, "rewards/margins_min": -0.03470680117607117, "rewards/margins_std": 0.07306559383869171, "rewards/rejected": 0.07292143255472183, "step": 2090 }, { "dpo_losses": 0.6688312292098999, "epoch": 0.55, "grad_norm": 11.297778588084944, "learning_rate": 2.5045689222000636e-07, "logits/chosen": -2.4182441234588623, "logits/rejected": -2.292839527130127, "logps/chosen": -230.51708984375, "logps/rejected": -201.8252716064453, "loss": 0.6827, "positive_losses": 0.039516448974609375, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.11724548041820526, "rewards/margins": 0.05164783447980881, "rewards/margins_max": 0.15772123634815216, "rewards/margins_min": -0.02960938774049282, "rewards/margins_std": 0.08144444972276688, "rewards/rejected": 0.06559765338897705, "step": 2100 }, { "epoch": 0.55, "eval_dpo_losses": 0.6709840297698975, "eval_logits/chosen": -2.3177263736724854, "eval_logits/rejected": -2.2091636657714844, "eval_logps/chosen": -263.2407531738281, "eval_logps/rejected": -254.97987365722656, "eval_loss": 0.6801783442497253, "eval_positive_losses": 0.07079866528511047, "eval_rewards/accuracies": 0.7123016119003296, "eval_rewards/chosen": 0.12533384561538696, "eval_rewards/margins": 0.04692983627319336, "eval_rewards/margins_max": 0.1860412210226059, "eval_rewards/margins_min": -0.0765845999121666, "eval_rewards/margins_std": 0.08679856359958649, "eval_rewards/rejected": 0.078404001891613, "eval_runtime": 389.1772, "eval_samples_per_second": 5.139, "eval_steps_per_second": 0.162, "step": 2100 }, { "dpo_losses": 0.6726669073104858, "epoch": 0.55, "grad_norm": 2.352185239743905, "learning_rate": 2.481724463801933e-07, "logits/chosen": -2.3458216190338135, "logits/rejected": -2.0222859382629395, "logps/chosen": -236.3656005859375, "logps/rejected": -208.16036987304688, "loss": 0.6858, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.12357939779758453, "rewards/margins": 0.04336198419332504, "rewards/margins_max": 0.1470601111650467, "rewards/margins_min": -0.037931304425001144, "rewards/margins_std": 0.08214269578456879, "rewards/rejected": 0.08021741360425949, "step": 2110 }, { "dpo_losses": 0.6705645322799683, "epoch": 0.55, "grad_norm": 2.833547546666325, "learning_rate": 2.4588815314058154e-07, "logits/chosen": -2.2835419178009033, "logits/rejected": -2.261867046356201, "logps/chosen": -217.6439208984375, "logps/rejected": -246.447021484375, "loss": 0.6772, "positive_losses": 0.09856720268726349, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.12306518852710724, "rewards/margins": 0.04767810180783272, "rewards/margins_max": 0.14357921481132507, "rewards/margins_min": -0.03136060759425163, "rewards/margins_std": 0.08332129567861557, "rewards/rejected": 0.07538709044456482, "step": 2120 }, { "dpo_losses": 0.6898881196975708, "epoch": 0.56, "grad_norm": 2.6953278914302388, "learning_rate": 2.4360420323899917e-07, "logits/chosen": -2.420711040496826, "logits/rejected": -2.5133652687072754, "logps/chosen": -181.68017578125, "logps/rejected": -242.8861541748047, "loss": 0.6856, "positive_losses": 0.15330390632152557, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.10466961562633514, "rewards/margins": 0.00798152107745409, "rewards/margins_max": 0.08101270347833633, "rewards/margins_min": -0.07644171267747879, "rewards/margins_std": 0.07264996320009232, "rewards/rejected": 0.09668810665607452, "step": 2130 }, { "dpo_losses": 0.6729011535644531, "epoch": 0.56, "grad_norm": 3.0771947704084797, "learning_rate": 2.4132078738460583e-07, "logits/chosen": -2.5381364822387695, "logits/rejected": -2.462954044342041, "logps/chosen": -267.4510498046875, "logps/rejected": -262.56036376953125, "loss": 0.689, "positive_losses": 0.034569550305604935, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.12866781651973724, "rewards/margins": 0.04285497963428497, "rewards/margins_max": 0.1201673299074173, "rewards/margins_min": -0.035310883074998856, "rewards/margins_std": 0.07186194509267807, "rewards/rejected": 0.08581284433603287, "step": 2140 }, { "dpo_losses": 0.671280026435852, "epoch": 0.56, "grad_norm": 2.6584486893485315, "learning_rate": 2.390380962419682e-07, "logits/chosen": -2.4874727725982666, "logits/rejected": -2.2423481941223145, "logps/chosen": -258.41033935546875, "logps/rejected": -204.3238525390625, "loss": 0.6751, "positive_losses": 0.02422790601849556, "rewards/accuracies": 0.625, "rewards/chosen": 0.12688805162906647, "rewards/margins": 0.047033827751874924, "rewards/margins_max": 0.149917334318161, "rewards/margins_min": -0.049349941313266754, "rewards/margins_std": 0.0899084135890007, "rewards/rejected": 0.07985422015190125, "step": 2150 }, { "dpo_losses": 0.6717740297317505, "epoch": 0.57, "grad_norm": 32.11724546181829, "learning_rate": 2.3675632041513977e-07, "logits/chosen": -2.140286922454834, "logits/rejected": -2.076103925704956, "logps/chosen": -195.2900390625, "logps/rejected": -220.3128204345703, "loss": 0.6906, "positive_losses": 0.1071346253156662, "rewards/accuracies": 0.625, "rewards/chosen": 0.11818341165781021, "rewards/margins": 0.04520890489220619, "rewards/margins_max": 0.1480513960123062, "rewards/margins_min": -0.029786264523863792, "rewards/margins_std": 0.08025858551263809, "rewards/rejected": 0.07297449558973312, "step": 2160 }, { "dpo_losses": 0.6730810403823853, "epoch": 0.57, "grad_norm": 2.17583110759051, "learning_rate": 2.344756504317453e-07, "logits/chosen": -2.1259207725524902, "logits/rejected": -2.0879054069519043, "logps/chosen": -217.6492919921875, "logps/rejected": -232.9006805419922, "loss": 0.6726, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.12273970991373062, "rewards/margins": 0.04193046689033508, "rewards/margins_max": 0.13667893409729004, "rewards/margins_min": -0.027661597356200218, "rewards/margins_std": 0.07526539266109467, "rewards/rejected": 0.08080923557281494, "step": 2170 }, { "dpo_losses": 0.6780750751495361, "epoch": 0.57, "grad_norm": 2.4349726754913297, "learning_rate": 2.3219627672707237e-07, "logits/chosen": -2.6811447143554688, "logits/rejected": -2.4451870918273926, "logps/chosen": -318.3858947753906, "logps/rejected": -241.76412963867188, "loss": 0.6884, "positive_losses": 0.26033860445022583, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.11486611515283585, "rewards/margins": 0.03307346627116203, "rewards/margins_max": 0.13917610049247742, "rewards/margins_min": -0.06652870774269104, "rewards/margins_std": 0.08933931589126587, "rewards/rejected": 0.08179265260696411, "step": 2180 }, { "dpo_losses": 0.6739374399185181, "epoch": 0.57, "grad_norm": 16.798659970564337, "learning_rate": 2.2991838962816918e-07, "logits/chosen": -2.4422969818115234, "logits/rejected": -2.5227487087249756, "logps/chosen": -265.8934631347656, "logps/rejected": -321.3507385253906, "loss": 0.677, "positive_losses": 0.0023105621803551912, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.11928291618824005, "rewards/margins": 0.04044344648718834, "rewards/margins_max": 0.11650732904672623, "rewards/margins_min": -0.04066140577197075, "rewards/margins_std": 0.06925653666257858, "rewards/rejected": 0.07883947342634201, "step": 2190 }, { "dpo_losses": 0.6611309051513672, "epoch": 0.58, "grad_norm": 2.7457706700894358, "learning_rate": 2.2764217933795297e-07, "logits/chosen": -2.524014711380005, "logits/rejected": -2.2457149028778076, "logps/chosen": -335.1786804199219, "logps/rejected": -278.33154296875, "loss": 0.6746, "positive_losses": 0.09897804260253906, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.15805689990520477, "rewards/margins": 0.06756599247455597, "rewards/margins_max": 0.18138262629508972, "rewards/margins_min": -0.016255531460046768, "rewards/margins_std": 0.09083538502454758, "rewards/rejected": 0.0904908999800682, "step": 2200 }, { "epoch": 0.58, "eval_dpo_losses": 0.6712497472763062, "eval_logits/chosen": -2.3152434825897217, "eval_logits/rejected": -2.20694899559021, "eval_logps/chosen": -263.0406494140625, "eval_logps/rejected": -254.72311401367188, "eval_loss": 0.6794271469116211, "eval_positive_losses": 0.06798940896987915, "eval_rewards/accuracies": 0.7103174328804016, "eval_rewards/chosen": 0.12733520567417145, "eval_rewards/margins": 0.046363554894924164, "eval_rewards/margins_max": 0.1843395233154297, "eval_rewards/margins_min": -0.07644502073526382, "eval_rewards/margins_std": 0.08635833114385605, "eval_rewards/rejected": 0.08097164332866669, "eval_runtime": 388.8379, "eval_samples_per_second": 5.144, "eval_steps_per_second": 0.162, "step": 2200 }, { "dpo_losses": 0.6732332110404968, "epoch": 0.58, "grad_norm": 2.326260223120231, "learning_rate": 2.253678359193278e-07, "logits/chosen": -2.4300689697265625, "logits/rejected": -2.375605821609497, "logps/chosen": -238.16653442382812, "logps/rejected": -242.73190307617188, "loss": 0.6773, "positive_losses": 0.11408233642578125, "rewards/accuracies": 0.75, "rewards/chosen": 0.12676678597927094, "rewards/margins": 0.0421518012881279, "rewards/margins_max": 0.13364621996879578, "rewards/margins_min": -0.049227677285671234, "rewards/margins_std": 0.08051840960979462, "rewards/rejected": 0.08461497724056244, "step": 2210 }, { "dpo_losses": 0.6675369143486023, "epoch": 0.58, "grad_norm": 20.649716635323127, "learning_rate": 2.230955492793149e-07, "logits/chosen": -2.4923653602600098, "logits/rejected": -2.3206570148468018, "logps/chosen": -294.13677978515625, "logps/rejected": -262.75714111328125, "loss": 0.6949, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.13534510135650635, "rewards/margins": 0.0545387864112854, "rewards/margins_max": 0.17115472257137299, "rewards/margins_min": -0.06179576367139816, "rewards/margins_std": 0.10468991100788116, "rewards/rejected": 0.08080631494522095, "step": 2220 }, { "dpo_losses": 0.6687606573104858, "epoch": 0.58, "grad_norm": 2.712144742282089, "learning_rate": 2.2082550915319468e-07, "logits/chosen": -2.346778392791748, "logits/rejected": -2.3491933345794678, "logps/chosen": -231.1897735595703, "logps/rejected": -257.394775390625, "loss": 0.6745, "positive_losses": 0.021025847643613815, "rewards/accuracies": 0.75, "rewards/chosen": 0.11622549593448639, "rewards/margins": 0.051369886845350266, "rewards/margins_max": 0.1383776217699051, "rewards/margins_min": -0.04924733191728592, "rewards/margins_std": 0.08356579393148422, "rewards/rejected": 0.06485561281442642, "step": 2230 }, { "dpo_losses": 0.6752229928970337, "epoch": 0.59, "grad_norm": 2.7399724725050523, "learning_rate": 2.1855790508866433e-07, "logits/chosen": -2.366572618484497, "logits/rejected": -2.2383673191070557, "logps/chosen": -263.64044189453125, "logps/rejected": -222.43954467773438, "loss": 0.6844, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.12474701553583145, "rewards/margins": 0.03834204748272896, "rewards/margins_max": 0.13281366229057312, "rewards/margins_min": -0.04047427326440811, "rewards/margins_std": 0.07687701284885406, "rewards/rejected": 0.0864049643278122, "step": 2240 }, { "dpo_losses": 0.6729758977890015, "epoch": 0.59, "grad_norm": 6.598517696499391, "learning_rate": 2.162929264300107e-07, "logits/chosen": -2.4142298698425293, "logits/rejected": -2.26248836517334, "logps/chosen": -262.2449951171875, "logps/rejected": -233.86508178710938, "loss": 0.6878, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.12672877311706543, "rewards/margins": 0.04292669892311096, "rewards/margins_max": 0.15457038581371307, "rewards/margins_min": -0.039324142038822174, "rewards/margins_std": 0.08759310096502304, "rewards/rejected": 0.08380208909511566, "step": 2250 }, { "dpo_losses": 0.6780350208282471, "epoch": 0.59, "grad_norm": 2.607583898875135, "learning_rate": 2.1403076230230005e-07, "logits/chosen": -2.2764439582824707, "logits/rejected": -2.150205612182617, "logps/chosen": -226.9442901611328, "logps/rejected": -227.02200317382812, "loss": 0.69, "positive_losses": 0.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.10850127041339874, "rewards/margins": 0.03253210335969925, "rewards/margins_max": 0.1258852481842041, "rewards/margins_min": -0.04920916259288788, "rewards/margins_std": 0.08082172274589539, "rewards/rejected": 0.0759691670536995, "step": 2260 }, { "dpo_losses": 0.6667267084121704, "epoch": 0.59, "grad_norm": 12.403204325458098, "learning_rate": 2.1177160159558596e-07, "logits/chosen": -2.4153366088867188, "logits/rejected": -2.2211289405822754, "logps/chosen": -230.8486328125, "logps/rejected": -237.3624267578125, "loss": 0.6762, "positive_losses": 0.09792785346508026, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1315385401248932, "rewards/margins": 0.05579804256558418, "rewards/margins_max": 0.15570712089538574, "rewards/margins_min": -0.040628574788570404, "rewards/margins_std": 0.085642009973526, "rewards/rejected": 0.0757405012845993, "step": 2270 }, { "dpo_losses": 0.6641010642051697, "epoch": 0.6, "grad_norm": 11.561666592642, "learning_rate": 2.0951563294913734e-07, "logits/chosen": -2.439711332321167, "logits/rejected": -2.373257875442505, "logps/chosen": -232.34732055664062, "logps/rejected": -251.49862670898438, "loss": 0.6728, "positive_losses": 0.012303161434829235, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.13683752715587616, "rewards/margins": 0.061398476362228394, "rewards/margins_max": 0.15952324867248535, "rewards/margins_min": -0.03735595569014549, "rewards/margins_std": 0.08912477642297745, "rewards/rejected": 0.07543905824422836, "step": 2280 }, { "dpo_losses": 0.6715140342712402, "epoch": 0.6, "grad_norm": 2.0970057894399874, "learning_rate": 2.072630447356869e-07, "logits/chosen": -2.328179359436035, "logits/rejected": -2.1677708625793457, "logps/chosen": -197.54293823242188, "logps/rejected": -204.33871459960938, "loss": 0.6866, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.12749826908111572, "rewards/margins": 0.04517274349927902, "rewards/margins_max": 0.12831640243530273, "rewards/margins_min": -0.029996544122695923, "rewards/margins_std": 0.0713895931839943, "rewards/rejected": 0.0823255404829979, "step": 2290 }, { "dpo_losses": 0.6821847558021545, "epoch": 0.6, "grad_norm": 3.025753110345526, "learning_rate": 2.0501402504570232e-07, "logits/chosen": -2.2562546730041504, "logits/rejected": -2.2500576972961426, "logps/chosen": -224.04733276367188, "logps/rejected": -234.13052368164062, "loss": 0.6785, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.13056842982769012, "rewards/margins": 0.023556387051939964, "rewards/margins_max": 0.11165548861026764, "rewards/margins_min": -0.05098745971918106, "rewards/margins_std": 0.06972166150808334, "rewards/rejected": 0.10701203346252441, "step": 2300 }, { "epoch": 0.6, "eval_dpo_losses": 0.6707358956336975, "eval_logits/chosen": -2.3101794719696045, "eval_logits/rejected": -2.201512098312378, "eval_logps/chosen": -263.1322937011719, "eval_logps/rejected": -254.92745971679688, "eval_loss": 0.6799682974815369, "eval_positive_losses": 0.07333074510097504, "eval_rewards/accuracies": 0.704365074634552, "eval_rewards/chosen": 0.12641893327236176, "eval_rewards/margins": 0.047490689903497696, "eval_rewards/margins_max": 0.18837027251720428, "eval_rewards/margins_min": -0.07741330564022064, "eval_rewards/margins_std": 0.08799280226230621, "eval_rewards/rejected": 0.07892823964357376, "eval_runtime": 388.8029, "eval_samples_per_second": 5.144, "eval_steps_per_second": 0.162, "step": 2300 }, { "dpo_losses": 0.6713213324546814, "epoch": 0.6, "grad_norm": 2.7476638928335024, "learning_rate": 2.027687616716804e-07, "logits/chosen": -2.4975926876068115, "logits/rejected": -2.2198967933654785, "logps/chosen": -299.9280090332031, "logps/rejected": -238.59115600585938, "loss": 0.6737, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.12824943661689758, "rewards/margins": 0.04706652835011482, "rewards/margins_max": 0.18506288528442383, "rewards/margins_min": -0.06329890340566635, "rewards/margins_std": 0.10903745889663696, "rewards/rejected": 0.08118291944265366, "step": 2310 }, { "dpo_losses": 0.6747066974639893, "epoch": 0.61, "grad_norm": 15.271816470083001, "learning_rate": 2.005274420924668e-07, "logits/chosen": -2.328023672103882, "logits/rejected": -2.2872633934020996, "logps/chosen": -249.0967559814453, "logps/rejected": -246.4385223388672, "loss": 0.6817, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.13362038135528564, "rewards/margins": 0.0392659530043602, "rewards/margins_max": 0.14235100150108337, "rewards/margins_min": -0.055910874158144, "rewards/margins_std": 0.08719529211521149, "rewards/rejected": 0.09435443580150604, "step": 2320 }, { "dpo_losses": 0.6650981307029724, "epoch": 0.61, "grad_norm": 8.847047009239377, "learning_rate": 1.9829025345760121e-07, "logits/chosen": -2.331576108932495, "logits/rejected": -2.253425121307373, "logps/chosen": -284.6048889160156, "logps/rejected": -309.8121337890625, "loss": 0.6916, "positive_losses": 0.0721282958984375, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.13305923342704773, "rewards/margins": 0.059800952672958374, "rewards/margins_max": 0.15308743715286255, "rewards/margins_min": -0.046781525015830994, "rewards/margins_std": 0.09087027609348297, "rewards/rejected": 0.07325827330350876, "step": 2330 }, { "dpo_losses": 0.6801132559776306, "epoch": 0.61, "grad_norm": 3.172819851863312, "learning_rate": 1.960573825716911e-07, "logits/chosen": -2.35091233253479, "logits/rejected": -2.2602930068969727, "logps/chosen": -321.7744140625, "logps/rejected": -313.6804504394531, "loss": 0.6786, "positive_losses": 0.11025962978601456, "rewards/accuracies": 0.625, "rewards/chosen": 0.11438898742198944, "rewards/margins": 0.028398191556334496, "rewards/margins_max": 0.13593852519989014, "rewards/margins_min": -0.0712394043803215, "rewards/margins_std": 0.09077819436788559, "rewards/rejected": 0.08599081635475159, "step": 2340 }, { "dpo_losses": 0.6714805364608765, "epoch": 0.62, "grad_norm": 8.961820790026913, "learning_rate": 1.9382901587881273e-07, "logits/chosen": -2.4663190841674805, "logits/rejected": -2.3418736457824707, "logps/chosen": -281.6524963378906, "logps/rejected": -240.7716522216797, "loss": 0.6776, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.13562333583831787, "rewards/margins": 0.045196667313575745, "rewards/margins_max": 0.13108943402767181, "rewards/margins_min": -0.016351500526070595, "rewards/margins_std": 0.06597861647605896, "rewards/rejected": 0.09042666852474213, "step": 2350 }, { "dpo_losses": 0.66993248462677, "epoch": 0.62, "grad_norm": 20.045493455171616, "learning_rate": 1.9160533944694364e-07, "logits/chosen": -2.611262798309326, "logits/rejected": -2.4175140857696533, "logps/chosen": -253.954833984375, "logps/rejected": -204.962646484375, "loss": 0.6842, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1271345317363739, "rewards/margins": 0.04857195168733597, "rewards/margins_max": 0.13428984582424164, "rewards/margins_min": -0.025959158316254616, "rewards/margins_std": 0.07212872803211212, "rewards/rejected": 0.07856258004903793, "step": 2360 }, { "dpo_losses": 0.6835199594497681, "epoch": 0.62, "grad_norm": 13.932795941029747, "learning_rate": 1.8938653895242602e-07, "logits/chosen": -2.508631706237793, "logits/rejected": -2.361909866333008, "logps/chosen": -233.99069213867188, "logps/rejected": -225.4585418701172, "loss": 0.6863, "positive_losses": 0.18389339745044708, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.11099698394536972, "rewards/margins": 0.021771330386400223, "rewards/margins_max": 0.1108565554022789, "rewards/margins_min": -0.0883224681019783, "rewards/margins_std": 0.08869551122188568, "rewards/rejected": 0.0892256572842598, "step": 2370 }, { "dpo_losses": 0.674932062625885, "epoch": 0.62, "grad_norm": 2.6553149204303725, "learning_rate": 1.8717279966446264e-07, "logits/chosen": -2.4951369762420654, "logits/rejected": -2.4066832065582275, "logps/chosen": -220.5588836669922, "logps/rejected": -203.70608520507812, "loss": 0.6724, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.11401192098855972, "rewards/margins": 0.03839712589979172, "rewards/margins_max": 0.12840159237384796, "rewards/margins_min": -0.04321750998497009, "rewards/margins_std": 0.07752354443073273, "rewards/rejected": 0.075614795088768, "step": 2380 }, { "dpo_losses": 0.6778360605239868, "epoch": 0.63, "grad_norm": 7.518509975289994, "learning_rate": 1.8496430642964694e-07, "logits/chosen": -2.449381113052368, "logits/rejected": -2.475433349609375, "logps/chosen": -254.767578125, "logps/rejected": -288.427490234375, "loss": 0.6783, "positive_losses": 0.171234130859375, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.13315029442310333, "rewards/margins": 0.034050993621349335, "rewards/margins_max": 0.1621081382036209, "rewards/margins_min": -0.09972251951694489, "rewards/margins_std": 0.11383918672800064, "rewards/rejected": 0.099099300801754, "step": 2390 }, { "dpo_losses": 0.666256308555603, "epoch": 0.63, "grad_norm": 2.662767112120377, "learning_rate": 1.8276124365652855e-07, "logits/chosen": -2.428035259246826, "logits/rejected": -2.2614102363586426, "logps/chosen": -252.599365234375, "logps/rejected": -291.35784912109375, "loss": 0.6814, "positive_losses": 0.16289862990379333, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.1416683942079544, "rewards/margins": 0.05730510875582695, "rewards/margins_max": 0.15901552140712738, "rewards/margins_min": -0.03400059789419174, "rewards/margins_std": 0.08542235940694809, "rewards/rejected": 0.08436329662799835, "step": 2400 }, { "epoch": 0.63, "eval_dpo_losses": 0.6701316833496094, "eval_logits/chosen": -2.3060834407806396, "eval_logits/rejected": -2.197547674179077, "eval_logps/chosen": -263.0727233886719, "eval_logps/rejected": -254.9967803955078, "eval_loss": 0.6801121830940247, "eval_positive_losses": 0.07649845629930496, "eval_rewards/accuracies": 0.7103174328804016, "eval_rewards/chosen": 0.12701408565044403, "eval_rewards/margins": 0.048778824508190155, "eval_rewards/margins_max": 0.1911194771528244, "eval_rewards/margins_min": -0.0774940699338913, "eval_rewards/margins_std": 0.0890553891658783, "eval_rewards/rejected": 0.07823526114225388, "eval_runtime": 388.9658, "eval_samples_per_second": 5.142, "eval_steps_per_second": 0.162, "step": 2400 }, { "dpo_losses": 0.6746547222137451, "epoch": 0.63, "grad_norm": 10.829212833162783, "learning_rate": 1.805637953002149e-07, "logits/chosen": -2.428637981414795, "logits/rejected": -2.3510046005249023, "logps/chosen": -280.0422668457031, "logps/rejected": -244.94064331054688, "loss": 0.6703, "positive_losses": 0.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.12880542874336243, "rewards/margins": 0.039667874574661255, "rewards/margins_max": 0.1417379230260849, "rewards/margins_min": -0.05121506005525589, "rewards/margins_std": 0.0838494524359703, "rewards/rejected": 0.08913756906986237, "step": 2410 }, { "dpo_losses": 0.6781811714172363, "epoch": 0.63, "grad_norm": 11.031408379041876, "learning_rate": 1.7837214484701153e-07, "logits/chosen": -2.4880175590515137, "logits/rejected": -2.2961878776550293, "logps/chosen": -321.85601806640625, "logps/rejected": -271.87152099609375, "loss": 0.6736, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1415662169456482, "rewards/margins": 0.032625462859869, "rewards/margins_max": 0.12503772974014282, "rewards/margins_min": -0.07475106418132782, "rewards/margins_std": 0.08933347463607788, "rewards/rejected": 0.10894075781106949, "step": 2420 }, { "dpo_losses": 0.6657954454421997, "epoch": 0.64, "grad_norm": 2.323322129607951, "learning_rate": 1.761864752991004e-07, "logits/chosen": -2.4241466522216797, "logits/rejected": -2.278221607208252, "logps/chosen": -268.7740478515625, "logps/rejected": -240.4455108642578, "loss": 0.6704, "positive_losses": 0.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.1347435712814331, "rewards/margins": 0.05794829875230789, "rewards/margins_max": 0.19111862778663635, "rewards/margins_min": -0.0388403944671154, "rewards/margins_std": 0.10241158306598663, "rewards/rejected": 0.07679527252912521, "step": 2430 }, { "dpo_losses": 0.6735921502113342, "epoch": 0.64, "grad_norm": 2.7595032434266455, "learning_rate": 1.7400696915925995e-07, "logits/chosen": -2.3249740600585938, "logits/rejected": -2.2678728103637695, "logps/chosen": -274.99957275390625, "logps/rejected": -255.86026000976562, "loss": 0.6796, "positive_losses": 0.00613059988245368, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1197628527879715, "rewards/margins": 0.041591838002204895, "rewards/margins_max": 0.1378866732120514, "rewards/margins_min": -0.04991874098777771, "rewards/margins_std": 0.08247729390859604, "rewards/rejected": 0.07817099988460541, "step": 2440 }, { "dpo_losses": 0.6738299131393433, "epoch": 0.64, "grad_norm": 2.714027951843656, "learning_rate": 1.718338084156254e-07, "logits/chosen": -2.4030306339263916, "logits/rejected": -2.434103488922119, "logps/chosen": -273.5843811035156, "logps/rejected": -334.3334655761719, "loss": 0.6794, "positive_losses": 0.12439880520105362, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.11034686863422394, "rewards/margins": 0.04113452509045601, "rewards/margins_max": 0.1379338800907135, "rewards/margins_min": -0.06139503791928291, "rewards/margins_std": 0.08562412112951279, "rewards/rejected": 0.06921233981847763, "step": 2450 }, { "dpo_losses": 0.6804717183113098, "epoch": 0.64, "grad_norm": 2.6055240196302387, "learning_rate": 1.696671745264937e-07, "logits/chosen": -2.2881839275360107, "logits/rejected": -2.2594962120056152, "logps/chosen": -225.520751953125, "logps/rejected": -231.9641876220703, "loss": 0.6792, "positive_losses": 0.06221923977136612, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.10042314231395721, "rewards/margins": 0.02793046459555626, "rewards/margins_max": 0.1297374665737152, "rewards/margins_min": -0.07130490243434906, "rewards/margins_std": 0.09099586308002472, "rewards/rejected": 0.07249267399311066, "step": 2460 }, { "dpo_losses": 0.6655539870262146, "epoch": 0.65, "grad_norm": 2.4355956243241716, "learning_rate": 1.67507248405171e-07, "logits/chosen": -2.402937412261963, "logits/rejected": -2.359192371368408, "logps/chosen": -316.0345458984375, "logps/rejected": -285.56927490234375, "loss": 0.6783, "positive_losses": 0.08771057426929474, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.12964379787445068, "rewards/margins": 0.05898932367563248, "rewards/margins_max": 0.18386808037757874, "rewards/margins_min": -0.03832029178738594, "rewards/margins_std": 0.09967012703418732, "rewards/rejected": 0.0706544741988182, "step": 2470 }, { "dpo_losses": 0.6730169057846069, "epoch": 0.65, "grad_norm": 13.51043523916097, "learning_rate": 1.6535421040486683e-07, "logits/chosen": -2.480891466140747, "logits/rejected": -2.3601253032684326, "logps/chosen": -282.91156005859375, "logps/rejected": -235.36572265625, "loss": 0.6731, "positive_losses": 0.09273681789636612, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1309002786874771, "rewards/margins": 0.04286051541566849, "rewards/margins_max": 0.134964257478714, "rewards/margins_min": -0.062332116067409515, "rewards/margins_std": 0.08761082589626312, "rewards/rejected": 0.08803976327180862, "step": 2480 }, { "dpo_losses": 0.6507342457771301, "epoch": 0.65, "grad_norm": 2.6458179944011775, "learning_rate": 1.6320824030363456e-07, "logits/chosen": -2.396559238433838, "logits/rejected": -2.125082492828369, "logps/chosen": -298.47674560546875, "logps/rejected": -246.6115264892578, "loss": 0.6724, "positive_losses": 0.0, "rewards/accuracies": 0.875, "rewards/chosen": 0.1564883589744568, "rewards/margins": 0.08893148601055145, "rewards/margins_max": 0.19699367880821228, "rewards/margins_min": -0.01525606494396925, "rewards/margins_std": 0.09315965324640274, "rewards/rejected": 0.06755688041448593, "step": 2490 }, { "dpo_losses": 0.6684159636497498, "epoch": 0.65, "grad_norm": 2.548627379247408, "learning_rate": 1.6106951728936024e-07, "logits/chosen": -2.344459056854248, "logits/rejected": -2.356534719467163, "logps/chosen": -235.946533203125, "logps/rejected": -282.99542236328125, "loss": 0.6871, "positive_losses": 0.2631763517856598, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.12003586441278458, "rewards/margins": 0.05213887244462967, "rewards/margins_max": 0.14503471553325653, "rewards/margins_min": -0.03775560110807419, "rewards/margins_std": 0.08332803100347519, "rewards/rejected": 0.06789698451757431, "step": 2500 }, { "epoch": 0.65, "eval_dpo_losses": 0.6696481704711914, "eval_logits/chosen": -2.3111836910247803, "eval_logits/rejected": -2.202989101409912, "eval_logps/chosen": -263.2015380859375, "eval_logps/rejected": -255.2311553955078, "eval_loss": 0.6803688406944275, "eval_positive_losses": 0.08171971887350082, "eval_rewards/accuracies": 0.7103174328804016, "eval_rewards/chosen": 0.12572623789310455, "eval_rewards/margins": 0.04983469471335411, "eval_rewards/margins_max": 0.19349558651447296, "eval_rewards/margins_min": -0.07847902178764343, "eval_rewards/margins_std": 0.09043386578559875, "eval_rewards/rejected": 0.07589154690504074, "eval_runtime": 389.1292, "eval_samples_per_second": 5.14, "eval_steps_per_second": 0.162, "step": 2500 }, { "dpo_losses": 0.6641864776611328, "epoch": 0.66, "grad_norm": 2.5793948658537955, "learning_rate": 1.5893821994479994e-07, "logits/chosen": -2.346575975418091, "logits/rejected": -2.171170473098755, "logps/chosen": -229.090576171875, "logps/rejected": -232.0622100830078, "loss": 0.6709, "positive_losses": 0.006843948271125555, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.13591636717319489, "rewards/margins": 0.062105000019073486, "rewards/margins_max": 0.15014702081680298, "rewards/margins_min": -0.05036243051290512, "rewards/margins_std": 0.091130331158638, "rewards/rejected": 0.0738113671541214, "step": 2510 }, { "dpo_losses": 0.6601074934005737, "epoch": 0.66, "grad_norm": 2.0939537063115856, "learning_rate": 1.5681452623266867e-07, "logits/chosen": -2.458527088165283, "logits/rejected": -2.3202872276306152, "logps/chosen": -236.30508422851562, "logps/rejected": -226.9938201904297, "loss": 0.6893, "positive_losses": 0.15527038276195526, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.12538447976112366, "rewards/margins": 0.07043556869029999, "rewards/margins_max": 0.18808217346668243, "rewards/margins_min": -0.029224693775177002, "rewards/margins_std": 0.10209884494543076, "rewards/rejected": 0.05494891479611397, "step": 2520 }, { "dpo_losses": 0.6646263599395752, "epoch": 0.66, "grad_norm": 2.521217905349977, "learning_rate": 1.546986134807801e-07, "logits/chosen": -2.389547824859619, "logits/rejected": -2.431588649749756, "logps/chosen": -256.923583984375, "logps/rejected": -292.1583557128906, "loss": 0.6855, "positive_losses": 0.13802985846996307, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.117195263504982, "rewards/margins": 0.06105233356356621, "rewards/margins_max": 0.18727239966392517, "rewards/margins_min": -0.045372117310762405, "rewards/margins_std": 0.10130174458026886, "rewards/rejected": 0.056142933666706085, "step": 2530 }, { "dpo_losses": 0.6761118769645691, "epoch": 0.66, "grad_norm": 18.88153569328297, "learning_rate": 1.5259065836724034e-07, "logits/chosen": -2.2753195762634277, "logits/rejected": -2.3748373985290527, "logps/chosen": -261.7287902832031, "logps/rejected": -315.1612854003906, "loss": 0.6914, "positive_losses": 0.0997081771492958, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.12936082482337952, "rewards/margins": 0.03663833811879158, "rewards/margins_max": 0.11799211800098419, "rewards/margins_min": -0.0659271627664566, "rewards/margins_std": 0.08513730019330978, "rewards/rejected": 0.09272248297929764, "step": 2540 }, { "dpo_losses": 0.665381133556366, "epoch": 0.67, "grad_norm": 3.0080352304404285, "learning_rate": 1.5049083690569454e-07, "logits/chosen": -2.5593276023864746, "logits/rejected": -2.34718656539917, "logps/chosen": -255.66213989257812, "logps/rejected": -277.16033935546875, "loss": 0.6828, "positive_losses": 0.41350212693214417, "rewards/accuracies": 0.75, "rewards/chosen": 0.12465520948171616, "rewards/margins": 0.05909818410873413, "rewards/margins_max": 0.1476270705461502, "rewards/margins_min": -0.04015301540493965, "rewards/margins_std": 0.08696560561656952, "rewards/rejected": 0.06555704027414322, "step": 2550 }, { "dpo_losses": 0.6722395420074463, "epoch": 0.67, "grad_norm": 6.343958357973756, "learning_rate": 1.4839932443063056e-07, "logits/chosen": -2.1935677528381348, "logits/rejected": -2.148071765899658, "logps/chosen": -179.27767944335938, "logps/rejected": -173.64285278320312, "loss": 0.6745, "positive_losses": 0.011631393805146217, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.11677847802639008, "rewards/margins": 0.044092949479818344, "rewards/margins_max": 0.13412240147590637, "rewards/margins_min": -0.023376774042844772, "rewards/margins_std": 0.06923139095306396, "rewards/rejected": 0.07268551737070084, "step": 2560 }, { "dpo_losses": 0.6732044816017151, "epoch": 0.67, "grad_norm": 2.8788307211089683, "learning_rate": 1.46316295582738e-07, "logits/chosen": -2.402930498123169, "logits/rejected": -2.302464723587036, "logps/chosen": -261.242919921875, "logps/rejected": -271.2743225097656, "loss": 0.6781, "positive_losses": 0.0, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.13133680820465088, "rewards/margins": 0.04338634014129639, "rewards/margins_max": 0.16494791209697723, "rewards/margins_min": -0.05951229855418205, "rewards/margins_std": 0.09981429576873779, "rewards/rejected": 0.0879504457116127, "step": 2570 }, { "dpo_losses": 0.6682552099227905, "epoch": 0.68, "grad_norm": 11.051924483547609, "learning_rate": 1.4424192429432655e-07, "logits/chosen": -2.3908326625823975, "logits/rejected": -2.2768425941467285, "logps/chosen": -243.1500701904297, "logps/rejected": -262.7445068359375, "loss": 0.6769, "positive_losses": 0.14676514267921448, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.13243842124938965, "rewards/margins": 0.053136877715587616, "rewards/margins_max": 0.1762290596961975, "rewards/margins_min": -0.045566756278276443, "rewards/margins_std": 0.09685839712619781, "rewards/rejected": 0.07930152863264084, "step": 2580 }, { "dpo_losses": 0.65812087059021, "epoch": 0.68, "grad_norm": 5.208328348793188, "learning_rate": 1.4217638377480158e-07, "logits/chosen": -2.6276822090148926, "logits/rejected": -2.414039373397827, "logps/chosen": -276.1061706542969, "logps/rejected": -230.8770294189453, "loss": 0.6708, "positive_losses": 0.007232666015625, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.12834641337394714, "rewards/margins": 0.0740780383348465, "rewards/margins_max": 0.18319042026996613, "rewards/margins_min": -0.046623922884464264, "rewards/margins_std": 0.1039138063788414, "rewards/rejected": 0.054268382489681244, "step": 2590 }, { "dpo_losses": 0.6800216436386108, "epoch": 0.68, "grad_norm": 2.8180281668528484, "learning_rate": 1.401198464962021e-07, "logits/chosen": -2.147932291030884, "logits/rejected": -2.2562713623046875, "logps/chosen": -218.607177734375, "logps/rejected": -259.3116760253906, "loss": 0.6898, "positive_losses": 0.11958718299865723, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.09130442887544632, "rewards/margins": 0.028165409341454506, "rewards/margins_max": 0.12180614471435547, "rewards/margins_min": -0.04717284440994263, "rewards/margins_std": 0.07551318407058716, "rewards/rejected": 0.06313902884721756, "step": 2600 }, { "epoch": 0.68, "eval_dpo_losses": 0.6693880558013916, "eval_logits/chosen": -2.309218645095825, "eval_logits/rejected": -2.201179027557373, "eval_logps/chosen": -263.2164306640625, "eval_logps/rejected": -255.30506896972656, "eval_loss": 0.680719256401062, "eval_positive_losses": 0.08781720697879791, "eval_rewards/accuracies": 0.7142857313156128, "eval_rewards/chosen": 0.12557688355445862, "eval_rewards/margins": 0.050424735993146896, "eval_rewards/margins_max": 0.1973823755979538, "eval_rewards/margins_min": -0.07862609624862671, "eval_rewards/margins_std": 0.0916237160563469, "eval_rewards/rejected": 0.07515214383602142, "eval_runtime": 389.1756, "eval_samples_per_second": 5.139, "eval_steps_per_second": 0.162, "step": 2600 }, { "dpo_losses": 0.6712017059326172, "epoch": 0.68, "grad_norm": 13.665690007241619, "learning_rate": 1.3807248417879894e-07, "logits/chosen": -2.4296982288360596, "logits/rejected": -2.1721301078796387, "logps/chosen": -248.73971557617188, "logps/rejected": -177.23849487304688, "loss": 0.6936, "positive_losses": 0.027557373046875, "rewards/accuracies": 0.75, "rewards/chosen": 0.12752988934516907, "rewards/margins": 0.04602277651429176, "rewards/margins_max": 0.12316372245550156, "rewards/margins_min": -0.03338465839624405, "rewards/margins_std": 0.07344406098127365, "rewards/rejected": 0.0815071314573288, "step": 2610 }, { "dpo_losses": 0.6694207191467285, "epoch": 0.69, "grad_norm": 2.603351516301946, "learning_rate": 1.3603446777675665e-07, "logits/chosen": -2.207489013671875, "logits/rejected": -2.2407240867614746, "logps/chosen": -222.8031005859375, "logps/rejected": -246.66348266601562, "loss": 0.6817, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.12614263594150543, "rewards/margins": 0.04943731054663658, "rewards/margins_max": 0.1236383467912674, "rewards/margins_min": -0.02230791375041008, "rewards/margins_std": 0.06495092064142227, "rewards/rejected": 0.07670532166957855, "step": 2620 }, { "dpo_losses": 0.6737352609634399, "epoch": 0.69, "grad_norm": 32.98071835486797, "learning_rate": 1.3400596746385814e-07, "logits/chosen": -2.5135321617126465, "logits/rejected": -2.407353401184082, "logps/chosen": -236.05911254882812, "logps/rejected": -235.73892211914062, "loss": 0.6844, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1146928071975708, "rewards/margins": 0.0408744290471077, "rewards/margins_max": 0.12695711851119995, "rewards/margins_min": -0.04190366342663765, "rewards/margins_std": 0.07236522436141968, "rewards/rejected": 0.07381837069988251, "step": 2630 }, { "dpo_losses": 0.6835433840751648, "epoch": 0.69, "grad_norm": 2.680426581864648, "learning_rate": 1.3198715261929586e-07, "logits/chosen": -2.3674778938293457, "logits/rejected": -2.3864409923553467, "logps/chosen": -309.6078186035156, "logps/rejected": -288.9012145996094, "loss": 0.6913, "positive_losses": 0.3510604798793793, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.11299104988574982, "rewards/margins": 0.02180514857172966, "rewards/margins_max": 0.15122799575328827, "rewards/margins_min": -0.0634220615029335, "rewards/margins_std": 0.09901756048202515, "rewards/rejected": 0.09118588268756866, "step": 2640 }, { "dpo_losses": 0.6556983590126038, "epoch": 0.69, "grad_norm": 2.2362532823857313, "learning_rate": 1.299781918135282e-07, "logits/chosen": -2.447723865509033, "logits/rejected": -2.2306768894195557, "logps/chosen": -219.0023956298828, "logps/rejected": -210.27975463867188, "loss": 0.6654, "positive_losses": 0.0, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.14258310198783875, "rewards/margins": 0.07871986925601959, "rewards/margins_max": 0.19488683342933655, "rewards/margins_min": -0.02206130139529705, "rewards/margins_std": 0.09514255821704865, "rewards/rejected": 0.06386323273181915, "step": 2650 }, { "dpo_losses": 0.6649500727653503, "epoch": 0.7, "grad_norm": 3.106277840257375, "learning_rate": 1.279792527942045e-07, "logits/chosen": -2.552910089492798, "logits/rejected": -2.4695918560028076, "logps/chosen": -303.94598388671875, "logps/rejected": -240.98593139648438, "loss": 0.6758, "positive_losses": 0.4118766784667969, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.13448257744312286, "rewards/margins": 0.061740435659885406, "rewards/margins_max": 0.17445023357868195, "rewards/margins_min": -0.05959685519337654, "rewards/margins_std": 0.10694795846939087, "rewards/rejected": 0.07274213433265686, "step": 2660 }, { "dpo_losses": 0.6732437014579773, "epoch": 0.7, "grad_norm": 7.140526167772002, "learning_rate": 1.259905024721576e-07, "logits/chosen": -2.5550179481506348, "logits/rejected": -2.314347267150879, "logps/chosen": -263.9970703125, "logps/rejected": -225.591552734375, "loss": 0.6789, "positive_losses": 0.13650360703468323, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.11559703201055527, "rewards/margins": 0.041973777115345, "rewards/margins_max": 0.1362651139497757, "rewards/margins_min": -0.0336526595056057, "rewards/margins_std": 0.07619912922382355, "rewards/rejected": 0.07362325489521027, "step": 2670 }, { "dpo_losses": 0.6671115756034851, "epoch": 0.7, "grad_norm": 3.0541407883724343, "learning_rate": 1.2401210690746703e-07, "logits/chosen": -2.469423532485962, "logits/rejected": -2.379824638366699, "logps/chosen": -315.61468505859375, "logps/rejected": -350.3507385253906, "loss": 0.7045, "positive_losses": 0.5094150304794312, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1186969131231308, "rewards/margins": 0.05679485946893692, "rewards/margins_max": 0.20409195125102997, "rewards/margins_min": -0.06501797586679459, "rewards/margins_std": 0.11827214062213898, "rewards/rejected": 0.061902064830064774, "step": 2680 }, { "dpo_losses": 0.6645984649658203, "epoch": 0.7, "grad_norm": 2.9071397957429204, "learning_rate": 1.2204423129559305e-07, "logits/chosen": -2.3132965564727783, "logits/rejected": -2.273155689239502, "logps/chosen": -173.6273956298828, "logps/rejected": -220.5270233154297, "loss": 0.6847, "positive_losses": 0.175892636179924, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.10646476596593857, "rewards/margins": 0.05957334488630295, "rewards/margins_max": 0.13360820710659027, "rewards/margins_min": -0.032420530915260315, "rewards/margins_std": 0.0760737806558609, "rewards/rejected": 0.04689141735434532, "step": 2690 }, { "dpo_losses": 0.6778229475021362, "epoch": 0.71, "grad_norm": 16.196307404317697, "learning_rate": 1.2008703995358299e-07, "logits/chosen": -2.278902769088745, "logits/rejected": -2.228820323944092, "logps/chosen": -228.1091766357422, "logps/rejected": -210.7903594970703, "loss": 0.6743, "positive_losses": 0.08758354187011719, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.11583232879638672, "rewards/margins": 0.033223554491996765, "rewards/margins_max": 0.10955943912267685, "rewards/margins_min": -0.05547773838043213, "rewards/margins_std": 0.07437022030353546, "rewards/rejected": 0.08260877430438995, "step": 2700 }, { "epoch": 0.71, "eval_dpo_losses": 0.6693156361579895, "eval_logits/chosen": -2.307466506958008, "eval_logits/rejected": -2.1993889808654785, "eval_logps/chosen": -263.1350402832031, "eval_logps/rejected": -255.23927307128906, "eval_loss": 0.6807480454444885, "eval_positive_losses": 0.09035903960466385, "eval_rewards/accuracies": 0.7123016119003296, "eval_rewards/chosen": 0.1263909488916397, "eval_rewards/margins": 0.05058103799819946, "eval_rewards/margins_max": 0.19718614220619202, "eval_rewards/margins_min": -0.0788588598370552, "eval_rewards/margins_std": 0.09179635345935822, "eval_rewards/rejected": 0.07580989599227905, "eval_runtime": 389.4037, "eval_samples_per_second": 5.136, "eval_steps_per_second": 0.162, "step": 2700 }, { "dpo_losses": 0.6733942627906799, "epoch": 0.71, "grad_norm": 10.070786754251595, "learning_rate": 1.1814069630635068e-07, "logits/chosen": -2.5925991535186768, "logits/rejected": -2.4262733459472656, "logps/chosen": -267.6837463378906, "logps/rejected": -265.426025390625, "loss": 0.6872, "positive_losses": 0.1328105926513672, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.11858787387609482, "rewards/margins": 0.041387103497982025, "rewards/margins_max": 0.11525280773639679, "rewards/margins_min": -0.032838840037584305, "rewards/margins_std": 0.06790928542613983, "rewards/rejected": 0.07720077037811279, "step": 2710 }, { "dpo_losses": 0.675045371055603, "epoch": 0.71, "grad_norm": 2.7117783179701447, "learning_rate": 1.1620536287303051e-07, "logits/chosen": -2.2699389457702637, "logits/rejected": -2.1596310138702393, "logps/chosen": -195.565673828125, "logps/rejected": -215.2549285888672, "loss": 0.6807, "positive_losses": 0.3760528564453125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.11224798113107681, "rewards/margins": 0.038463763892650604, "rewards/margins_max": 0.12004473060369492, "rewards/margins_min": -0.05318959429860115, "rewards/margins_std": 0.07698283344507217, "rewards/rejected": 0.07378420978784561, "step": 2720 }, { "dpo_losses": 0.6693524122238159, "epoch": 0.71, "grad_norm": 2.70680171989791, "learning_rate": 1.1428120125340716e-07, "logits/chosen": -2.3123159408569336, "logits/rejected": -2.2779717445373535, "logps/chosen": -292.3013610839844, "logps/rejected": -281.864501953125, "loss": 0.6792, "positive_losses": 0.15024718642234802, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1373281180858612, "rewards/margins": 0.05147003009915352, "rewards/margins_max": 0.1808009147644043, "rewards/margins_min": -0.07780647277832031, "rewards/margins_std": 0.11627540737390518, "rewards/rejected": 0.08585809171199799, "step": 2730 }, { "dpo_losses": 0.6748474836349487, "epoch": 0.72, "grad_norm": 12.068780874709667, "learning_rate": 1.123683721144223e-07, "logits/chosen": -2.408970594406128, "logits/rejected": -2.4148025512695312, "logps/chosen": -201.9468231201172, "logps/rejected": -249.34799194335938, "loss": 0.6818, "positive_losses": 0.07415314018726349, "rewards/accuracies": 0.625, "rewards/chosen": 0.11965974420309067, "rewards/margins": 0.03948744758963585, "rewards/margins_max": 0.14163735508918762, "rewards/margins_min": -0.05036713555455208, "rewards/margins_std": 0.08382929116487503, "rewards/rejected": 0.08017229288816452, "step": 2740 }, { "dpo_losses": 0.6700785756111145, "epoch": 0.72, "grad_norm": 2.996473624009116, "learning_rate": 1.1046703517675845e-07, "logits/chosen": -2.4366610050201416, "logits/rejected": -2.3128981590270996, "logps/chosen": -244.5033721923828, "logps/rejected": -221.0505828857422, "loss": 0.6917, "positive_losses": 0.10491104423999786, "rewards/accuracies": 0.625, "rewards/chosen": 0.13715149462223053, "rewards/margins": 0.04939151555299759, "rewards/margins_max": 0.16166019439697266, "rewards/margins_min": -0.037866491824388504, "rewards/margins_std": 0.09216253459453583, "rewards/rejected": 0.08775997906923294, "step": 2750 }, { "dpo_losses": 0.6587422490119934, "epoch": 0.72, "grad_norm": 2.729519090878146, "learning_rate": 1.085773492015028e-07, "logits/chosen": -2.5133657455444336, "logits/rejected": -2.2355284690856934, "logps/chosen": -347.0047302246094, "logps/rejected": -299.4930114746094, "loss": 0.6688, "positive_losses": 0.11064758151769638, "rewards/accuracies": 0.75, "rewards/chosen": 0.14003266394138336, "rewards/margins": 0.07317359745502472, "rewards/margins_max": 0.19682635366916656, "rewards/margins_min": -0.036624275147914886, "rewards/margins_std": 0.10397710651159286, "rewards/rejected": 0.06685905158519745, "step": 2760 }, { "dpo_losses": 0.6638213396072388, "epoch": 0.72, "grad_norm": 4.519114693557289, "learning_rate": 1.0669947197689033e-07, "logits/chosen": -2.590829372406006, "logits/rejected": -2.3142385482788086, "logps/chosen": -286.30450439453125, "logps/rejected": -262.106689453125, "loss": 0.6838, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.1305249035358429, "rewards/margins": 0.06323496997356415, "rewards/margins_max": 0.2091999053955078, "rewards/margins_min": -0.04065699130296707, "rewards/margins_std": 0.11020747572183609, "rewards/rejected": 0.06728993356227875, "step": 2770 }, { "dpo_losses": 0.664034366607666, "epoch": 0.73, "grad_norm": 2.221119434853578, "learning_rate": 1.048335603051291e-07, "logits/chosen": -2.443502902984619, "logits/rejected": -2.2299230098724365, "logps/chosen": -304.90411376953125, "logps/rejected": -212.9366455078125, "loss": 0.6806, "positive_losses": 0.19652633368968964, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.14268755912780762, "rewards/margins": 0.06190212815999985, "rewards/margins_max": 0.16873760521411896, "rewards/margins_min": -0.04047098755836487, "rewards/margins_std": 0.09722321480512619, "rewards/rejected": 0.08078540861606598, "step": 2780 }, { "dpo_losses": 0.6702266931533813, "epoch": 0.73, "grad_norm": 2.2525791644581283, "learning_rate": 1.0297976998930663e-07, "logits/chosen": -2.5825071334838867, "logits/rejected": -2.2750227451324463, "logps/chosen": -333.5815734863281, "logps/rejected": -272.9296569824219, "loss": 0.6926, "positive_losses": 0.31713980436325073, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.13155362010002136, "rewards/margins": 0.0482100248336792, "rewards/margins_max": 0.1293967217206955, "rewards/margins_min": -0.029747311025857925, "rewards/margins_std": 0.07054877281188965, "rewards/rejected": 0.08334358781576157, "step": 2790 }, { "dpo_losses": 0.6763724684715271, "epoch": 0.73, "grad_norm": 2.7234766420581664, "learning_rate": 1.0113825582038077e-07, "logits/chosen": -2.434692859649658, "logits/rejected": -2.3092548847198486, "logps/chosen": -223.5378875732422, "logps/rejected": -223.94400024414062, "loss": 0.6738, "positive_losses": 0.0, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.12492053210735321, "rewards/margins": 0.03694877028465271, "rewards/margins_max": 0.16444940865039825, "rewards/margins_min": -0.06462839990854263, "rewards/margins_std": 0.10002219676971436, "rewards/rejected": 0.0879717618227005, "step": 2800 }, { "epoch": 0.73, "eval_dpo_losses": 0.6697705388069153, "eval_logits/chosen": -2.3041625022888184, "eval_logits/rejected": -2.196119546890259, "eval_logps/chosen": -262.9600830078125, "eval_logps/rejected": -254.96617126464844, "eval_loss": 0.679980993270874, "eval_positive_losses": 0.08064993470907211, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 0.12814079225063324, "eval_rewards/margins": 0.04959971457719803, "eval_rewards/margins_max": 0.1936854124069214, "eval_rewards/margins_min": -0.07892739772796631, "eval_rewards/margins_std": 0.09076903760433197, "eval_rewards/rejected": 0.0785410925745964, "eval_runtime": 389.4066, "eval_samples_per_second": 5.136, "eval_steps_per_second": 0.162, "step": 2800 }, { "dpo_losses": 0.6656321287155151, "epoch": 0.74, "grad_norm": 2.920798363433187, "learning_rate": 9.930917156425475e-08, "logits/chosen": -2.257516384124756, "logits/rejected": -2.234851598739624, "logps/chosen": -208.3054962158203, "logps/rejected": -201.53509521484375, "loss": 0.6786, "positive_losses": 0.0, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1248052716255188, "rewards/margins": 0.057258110493421555, "rewards/margins_max": 0.13672688603401184, "rewards/margins_min": -0.022176122292876244, "rewards/margins_std": 0.07319865375757217, "rewards/rejected": 0.06754714995622635, "step": 2810 }, { "dpo_losses": 0.67621248960495, "epoch": 0.74, "grad_norm": 3.0739801840208614, "learning_rate": 9.749266994893754e-08, "logits/chosen": -2.3510212898254395, "logits/rejected": -2.258446455001831, "logps/chosen": -219.2156219482422, "logps/rejected": -253.7711944580078, "loss": 0.675, "positive_losses": 0.12092037498950958, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.11861914396286011, "rewards/margins": 0.03637947514653206, "rewards/margins_max": 0.1337026208639145, "rewards/margins_min": -0.0735204666852951, "rewards/margins_std": 0.08902983367443085, "rewards/rejected": 0.08223967254161835, "step": 2820 }, { "dpo_losses": 0.664071261882782, "epoch": 0.74, "grad_norm": 2.7038895953397617, "learning_rate": 9.568890265179128e-08, "logits/chosen": -2.2189993858337402, "logits/rejected": -2.162444829940796, "logps/chosen": -259.46728515625, "logps/rejected": -311.8764953613281, "loss": 0.6752, "positive_losses": 0.166178897023201, "rewards/accuracies": 0.75, "rewards/chosen": 0.12907323241233826, "rewards/margins": 0.06152749061584473, "rewards/margins_max": 0.16760775446891785, "rewards/margins_min": -0.03343154117465019, "rewards/margins_std": 0.0909598246216774, "rewards/rejected": 0.06754572689533234, "step": 2830 }, { "dpo_losses": 0.6727863550186157, "epoch": 0.74, "grad_norm": 2.7086100412499956, "learning_rate": 9.389802028686616e-08, "logits/chosen": -2.338531970977783, "logits/rejected": -2.305568218231201, "logps/chosen": -185.4831085205078, "logps/rejected": -178.7330780029297, "loss": 0.6861, "positive_losses": 0.09433136135339737, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.11514159291982651, "rewards/margins": 0.0424327626824379, "rewards/margins_max": 0.12293638288974762, "rewards/margins_min": -0.0355571024119854, "rewards/margins_std": 0.07008004188537598, "rewards/rejected": 0.07270883023738861, "step": 2840 }, { "dpo_losses": 0.6721526980400085, "epoch": 0.75, "grad_norm": 2.702726389871361, "learning_rate": 9.212017239232426e-08, "logits/chosen": -2.310347557067871, "logits/rejected": -2.3053088188171387, "logps/chosen": -256.51812744140625, "logps/rejected": -238.6575927734375, "loss": 0.6795, "positive_losses": 0.018294906243681908, "rewards/accuracies": 0.75, "rewards/chosen": 0.11381785571575165, "rewards/margins": 0.04518987983465195, "rewards/margins_max": 0.14602318406105042, "rewards/margins_min": -0.08707142621278763, "rewards/margins_std": 0.104322150349617, "rewards/rejected": 0.0686279758810997, "step": 2850 }, { "dpo_losses": 0.6838535070419312, "epoch": 0.75, "grad_norm": 2.7645896223543374, "learning_rate": 9.035550741795328e-08, "logits/chosen": -2.3564000129699707, "logits/rejected": -2.261359691619873, "logps/chosen": -236.6603546142578, "logps/rejected": -232.6334991455078, "loss": 0.6731, "positive_losses": 0.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.11012081801891327, "rewards/margins": 0.020216383039951324, "rewards/margins_max": 0.09920711815357208, "rewards/margins_min": -0.06468682736158371, "rewards/margins_std": 0.07363492995500565, "rewards/rejected": 0.08990444988012314, "step": 2860 }, { "dpo_losses": 0.6664173603057861, "epoch": 0.75, "grad_norm": 3.3644912439425974, "learning_rate": 8.860417271277065e-08, "logits/chosen": -2.22172212600708, "logits/rejected": -2.0948173999786377, "logps/chosen": -269.0731506347656, "logps/rejected": -250.63851928710938, "loss": 0.6774, "positive_losses": 0.17580795288085938, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.11776889860630035, "rewards/margins": 0.05585477501153946, "rewards/margins_max": 0.13254714012145996, "rewards/margins_min": -0.02687937021255493, "rewards/margins_std": 0.06907229125499725, "rewards/rejected": 0.061914123594760895, "step": 2870 }, { "dpo_losses": 0.6714795231819153, "epoch": 0.75, "grad_norm": 3.0722586406766608, "learning_rate": 8.686631451272029e-08, "logits/chosen": -2.468583583831787, "logits/rejected": -2.1245083808898926, "logps/chosen": -282.9653625488281, "logps/rejected": -235.885498046875, "loss": 0.674, "positive_losses": 0.09384231269359589, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.12755025923252106, "rewards/margins": 0.04641108959913254, "rewards/margins_max": 0.14898782968521118, "rewards/margins_min": -0.05662735551595688, "rewards/margins_std": 0.09192202240228653, "rewards/rejected": 0.08113916963338852, "step": 2880 }, { "dpo_losses": 0.6747239828109741, "epoch": 0.76, "grad_norm": 2.3987755552742525, "learning_rate": 8.514207792846168e-08, "logits/chosen": -2.3091626167297363, "logits/rejected": -2.2703776359558105, "logps/chosen": -262.51751708984375, "logps/rejected": -286.2556457519531, "loss": 0.6704, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.11274804174900055, "rewards/margins": 0.03987208753824234, "rewards/margins_max": 0.15627378225326538, "rewards/margins_min": -0.07009749859571457, "rewards/margins_std": 0.10128549486398697, "rewards/rejected": 0.0728759616613388, "step": 2890 }, { "dpo_losses": 0.6669967174530029, "epoch": 0.76, "grad_norm": 17.873455701155606, "learning_rate": 8.343160693325355e-08, "logits/chosen": -2.4240801334381104, "logits/rejected": -2.231790542602539, "logps/chosen": -276.09466552734375, "logps/rejected": -247.3150634765625, "loss": 0.6842, "positive_losses": 0.12964782118797302, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.12239459902048111, "rewards/margins": 0.05536676570773125, "rewards/margins_max": 0.1523353010416031, "rewards/margins_min": -0.0335552953183651, "rewards/margins_std": 0.0856507197022438, "rewards/rejected": 0.06702783703804016, "step": 2900 }, { "epoch": 0.76, "eval_dpo_losses": 0.6693070530891418, "eval_logits/chosen": -2.3029189109802246, "eval_logits/rejected": -2.194601058959961, "eval_logps/chosen": -263.0264587402344, "eval_logps/rejected": -255.1313934326172, "eval_loss": 0.6801213026046753, "eval_positive_losses": 0.08509287983179092, "eval_rewards/accuracies": 0.7182539701461792, "eval_rewards/chosen": 0.12747682631015778, "eval_rewards/margins": 0.05058792605996132, "eval_rewards/margins_max": 0.1963043510913849, "eval_rewards/margins_min": -0.07937076687812805, "eval_rewards/margins_std": 0.09169505536556244, "eval_rewards/rejected": 0.07688891142606735, "eval_runtime": 389.4812, "eval_samples_per_second": 5.135, "eval_steps_per_second": 0.162, "step": 2900 }, { "dpo_losses": 0.6685654520988464, "epoch": 0.76, "grad_norm": 2.9558015834098676, "learning_rate": 8.173504435093173e-08, "logits/chosen": -2.335361957550049, "logits/rejected": -2.170654296875, "logps/chosen": -248.86172485351562, "logps/rejected": -213.89352416992188, "loss": 0.6877, "positive_losses": 0.1840469390153885, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.12560966610908508, "rewards/margins": 0.05296555906534195, "rewards/margins_max": 0.18911658227443695, "rewards/margins_min": -0.05661248415708542, "rewards/margins_std": 0.11027349531650543, "rewards/rejected": 0.07264409959316254, "step": 2910 }, { "dpo_losses": 0.6721591949462891, "epoch": 0.76, "grad_norm": 27.251567830470783, "learning_rate": 8.005253184398359e-08, "logits/chosen": -2.3661603927612305, "logits/rejected": -2.1698505878448486, "logps/chosen": -252.70751953125, "logps/rejected": -195.7733917236328, "loss": 0.691, "positive_losses": 0.7153045535087585, "rewards/accuracies": 0.75, "rewards/chosen": 0.11543408781290054, "rewards/margins": 0.04448876529932022, "rewards/margins_max": 0.14110511541366577, "rewards/margins_min": -0.04499402269721031, "rewards/margins_std": 0.08315658569335938, "rewards/rejected": 0.07094533741474152, "step": 2920 }, { "dpo_losses": 0.6571344137191772, "epoch": 0.77, "grad_norm": 19.287494557071494, "learning_rate": 7.838420990171926e-08, "logits/chosen": -2.3155629634857178, "logits/rejected": -2.202726364135742, "logps/chosen": -280.87408447265625, "logps/rejected": -238.4647979736328, "loss": 0.6797, "positive_losses": 0.0, "rewards/accuracies": 0.875, "rewards/chosen": 0.14602993428707123, "rewards/margins": 0.0751093178987503, "rewards/margins_max": 0.17614969611167908, "rewards/margins_min": -0.005204112268984318, "rewards/margins_std": 0.08325017243623734, "rewards/rejected": 0.07092062383890152, "step": 2930 }, { "dpo_losses": 0.6676996946334839, "epoch": 0.77, "grad_norm": 9.589217217949928, "learning_rate": 7.673021782854083e-08, "logits/chosen": -2.514594078063965, "logits/rejected": -2.4897377490997314, "logps/chosen": -273.68731689453125, "logps/rejected": -274.6900634765625, "loss": 0.6785, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.13978087902069092, "rewards/margins": 0.05342685058712959, "rewards/margins_max": 0.16125664114952087, "rewards/margins_min": -0.03897722437977791, "rewards/margins_std": 0.08939844369888306, "rewards/rejected": 0.08635403215885162, "step": 2940 }, { "dpo_losses": 0.6699270009994507, "epoch": 0.77, "grad_norm": 13.84854655988451, "learning_rate": 7.509069373231039e-08, "logits/chosen": -2.573387622833252, "logits/rejected": -2.454108238220215, "logps/chosen": -260.8175354003906, "logps/rejected": -280.77752685546875, "loss": 0.6826, "positive_losses": 0.2681739926338196, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.11881864070892334, "rewards/margins": 0.04888763278722763, "rewards/margins_max": 0.13266468048095703, "rewards/margins_min": -0.03170103579759598, "rewards/margins_std": 0.07664258778095245, "rewards/rejected": 0.06993099302053452, "step": 2950 }, { "dpo_losses": 0.6843770742416382, "epoch": 0.77, "grad_norm": 12.867418742229951, "learning_rate": 7.346577451281821e-08, "logits/chosen": -2.198758602142334, "logits/rejected": -2.257113218307495, "logps/chosen": -226.89138793945312, "logps/rejected": -221.955810546875, "loss": 0.6852, "positive_losses": 0.4668550491333008, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.09785674512386322, "rewards/margins": 0.01958160661160946, "rewards/margins_max": 0.10453460365533829, "rewards/margins_min": -0.07532242685556412, "rewards/margins_std": 0.08237345516681671, "rewards/rejected": 0.07827513664960861, "step": 2960 }, { "dpo_losses": 0.6725844144821167, "epoch": 0.78, "grad_norm": 21.863798750463772, "learning_rate": 7.185559585035136e-08, "logits/chosen": -2.4711601734161377, "logits/rejected": -2.3374249935150146, "logps/chosen": -211.1033935546875, "logps/rejected": -189.42324829101562, "loss": 0.6937, "positive_losses": 0.08393194526433945, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.11729426681995392, "rewards/margins": 0.04364239424467087, "rewards/margins_max": 0.14692465960979462, "rewards/margins_min": -0.04448354244232178, "rewards/margins_std": 0.08650805801153183, "rewards/rejected": 0.07365186512470245, "step": 2970 }, { "dpo_losses": 0.670464813709259, "epoch": 0.78, "grad_norm": 2.582822000494144, "learning_rate": 7.026029219436502e-08, "logits/chosen": -2.2779898643493652, "logits/rejected": -2.2378458976745605, "logps/chosen": -285.8140869140625, "logps/rejected": -291.67828369140625, "loss": 0.6707, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.12443889677524567, "rewards/margins": 0.048795659095048904, "rewards/margins_max": 0.1751028150320053, "rewards/margins_min": -0.04355131834745407, "rewards/margins_std": 0.09665794670581818, "rewards/rejected": 0.07564322650432587, "step": 2980 }, { "dpo_losses": 0.6771678328514099, "epoch": 0.78, "grad_norm": 2.761628274905905, "learning_rate": 6.867999675225522e-08, "logits/chosen": -2.4259865283966064, "logits/rejected": -2.309704303741455, "logps/chosen": -281.1209716796875, "logps/rejected": -296.069580078125, "loss": 0.6731, "positive_losses": 0.0049114227294921875, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.11563529074192047, "rewards/margins": 0.03425656259059906, "rewards/margins_max": 0.1351454108953476, "rewards/margins_min": -0.04455222561955452, "rewards/margins_std": 0.0793975293636322, "rewards/rejected": 0.08137871325016022, "step": 2990 }, { "dpo_losses": 0.6674336194992065, "epoch": 0.79, "grad_norm": 3.1045654518623795, "learning_rate": 6.711484147823662e-08, "logits/chosen": -2.4439868927001953, "logits/rejected": -2.317187786102295, "logps/chosen": -218.49215698242188, "logps/rejected": -247.92922973632812, "loss": 0.6734, "positive_losses": 0.018839264288544655, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.12358206510543823, "rewards/margins": 0.05425044894218445, "rewards/margins_max": 0.1507342904806137, "rewards/margins_min": -0.03602418303489685, "rewards/margins_std": 0.08498506247997284, "rewards/rejected": 0.06933162361383438, "step": 3000 }, { "epoch": 0.79, "eval_dpo_losses": 0.6692048907279968, "eval_logits/chosen": -2.2995188236236572, "eval_logits/rejected": -2.191080093383789, "eval_logps/chosen": -262.9806823730469, "eval_logps/rejected": -255.109375, "eval_loss": 0.6801201105117798, "eval_positive_losses": 0.08543862402439117, "eval_rewards/accuracies": 0.716269850730896, "eval_rewards/chosen": 0.1279347687959671, "eval_rewards/margins": 0.05082550644874573, "eval_rewards/margins_max": 0.1971311718225479, "eval_rewards/margins_min": -0.07978010177612305, "eval_rewards/margins_std": 0.09212593734264374, "eval_rewards/rejected": 0.07710926979780197, "eval_runtime": 389.3657, "eval_samples_per_second": 5.137, "eval_steps_per_second": 0.162, "step": 3000 }, { "dpo_losses": 0.6623255014419556, "epoch": 0.79, "grad_norm": 2.635460311668974, "learning_rate": 6.556495706232412e-08, "logits/chosen": -2.537750720977783, "logits/rejected": -2.3223774433135986, "logps/chosen": -280.81939697265625, "logps/rejected": -246.88784790039062, "loss": 0.6836, "positive_losses": 0.11644306033849716, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.12927424907684326, "rewards/margins": 0.06494958698749542, "rewards/margins_max": 0.13848480582237244, "rewards/margins_min": -0.037068434059619904, "rewards/margins_std": 0.0810275748372078, "rewards/rejected": 0.06432466953992844, "step": 3010 }, { "dpo_losses": 0.6677768230438232, "epoch": 0.79, "grad_norm": 19.186179845667883, "learning_rate": 6.403047291942057e-08, "logits/chosen": -2.3835415840148926, "logits/rejected": -2.2464287281036377, "logps/chosen": -207.40292358398438, "logps/rejected": -179.82437133789062, "loss": 0.6881, "positive_losses": 0.04441719129681587, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.12336908280849457, "rewards/margins": 0.05407561734318733, "rewards/margins_max": 0.14613750576972961, "rewards/margins_min": -0.04651116952300072, "rewards/margins_std": 0.08561340719461441, "rewards/rejected": 0.06929346174001694, "step": 3020 }, { "dpo_losses": 0.6659911274909973, "epoch": 0.79, "grad_norm": 2.5309983116575325, "learning_rate": 6.251151717851021e-08, "logits/chosen": -2.2995262145996094, "logits/rejected": -2.3188467025756836, "logps/chosen": -205.2932891845703, "logps/rejected": -276.8324279785156, "loss": 0.6837, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.12994948029518127, "rewards/margins": 0.057421375066041946, "rewards/margins_max": 0.14615169167518616, "rewards/margins_min": -0.02146311104297638, "rewards/margins_std": 0.07583276182413101, "rewards/rejected": 0.07252810895442963, "step": 3030 }, { "dpo_losses": 0.666481614112854, "epoch": 0.8, "grad_norm": 3.481352722734416, "learning_rate": 6.100821667196041e-08, "logits/chosen": -2.322502851486206, "logits/rejected": -2.2097604274749756, "logps/chosen": -278.5419006347656, "logps/rejected": -293.50054931640625, "loss": 0.6657, "positive_losses": 0.0007812500116415322, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1464504450559616, "rewards/margins": 0.05645475536584854, "rewards/margins_max": 0.16323360800743103, "rewards/margins_min": -0.04700779914855957, "rewards/margins_std": 0.09214611351490021, "rewards/rejected": 0.08999571949243546, "step": 3040 }, { "dpo_losses": 0.6730872392654419, "epoch": 0.8, "grad_norm": 14.952160651701709, "learning_rate": 5.952069692493061e-08, "logits/chosen": -2.3322887420654297, "logits/rejected": -2.2521557807922363, "logps/chosen": -270.5457763671875, "logps/rejected": -271.17230224609375, "loss": 0.6798, "positive_losses": 0.2617286741733551, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.12949788570404053, "rewards/margins": 0.04328242316842079, "rewards/margins_max": 0.1422918140888214, "rewards/margins_min": -0.06964142620563507, "rewards/margins_std": 0.09423469007015228, "rewards/rejected": 0.08621545881032944, "step": 3050 }, { "dpo_losses": 0.6729440689086914, "epoch": 0.8, "grad_norm": 9.19456324607824, "learning_rate": 5.8049082144891794e-08, "logits/chosen": -2.4647738933563232, "logits/rejected": -2.390613555908203, "logps/chosen": -281.032470703125, "logps/rejected": -265.1967468261719, "loss": 0.6778, "positive_losses": 0.05382842943072319, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.11735423654317856, "rewards/margins": 0.04277355968952179, "rewards/margins_max": 0.14710049331188202, "rewards/margins_min": -0.04388534650206566, "rewards/margins_std": 0.08554147928953171, "rewards/rejected": 0.07458066940307617, "step": 3060 }, { "dpo_losses": 0.6724769473075867, "epoch": 0.8, "grad_norm": 9.65769340319505, "learning_rate": 5.659349521125459e-08, "logits/chosen": -2.2546300888061523, "logits/rejected": -2.123145341873169, "logps/chosen": -224.6599884033203, "logps/rejected": -207.84115600585938, "loss": 0.682, "positive_losses": 0.18030662834644318, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.11364501714706421, "rewards/margins": 0.04295942932367325, "rewards/margins_max": 0.11281891167163849, "rewards/margins_min": -0.03324463218450546, "rewards/margins_std": 0.06412409245967865, "rewards/rejected": 0.07068559527397156, "step": 3070 }, { "dpo_losses": 0.6758753061294556, "epoch": 0.81, "grad_norm": 2.690289757165303, "learning_rate": 5.5154057665109e-08, "logits/chosen": -2.3207614421844482, "logits/rejected": -2.3253307342529297, "logps/chosen": -217.6414031982422, "logps/rejected": -253.79635620117188, "loss": 0.6807, "positive_losses": 0.02252960205078125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.11955288797616959, "rewards/margins": 0.03689891844987869, "rewards/margins_max": 0.139844611287117, "rewards/margins_min": -0.04306263476610184, "rewards/margins_std": 0.08311165124177933, "rewards/rejected": 0.0826539620757103, "step": 3080 }, { "dpo_losses": 0.677670419216156, "epoch": 0.81, "grad_norm": 2.237941962552355, "learning_rate": 5.3730889699075853e-08, "logits/chosen": -2.5178394317626953, "logits/rejected": -2.275176525115967, "logps/chosen": -289.77960205078125, "logps/rejected": -250.08154296875, "loss": 0.6936, "positive_losses": 0.3494514524936676, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.11141493171453476, "rewards/margins": 0.033360108733177185, "rewards/margins_max": 0.12383078038692474, "rewards/margins_min": -0.05329901725053787, "rewards/margins_std": 0.07991122454404831, "rewards/rejected": 0.07805482298135757, "step": 3090 }, { "dpo_losses": 0.679115355014801, "epoch": 0.81, "grad_norm": 12.885502059682352, "learning_rate": 5.2324110147270893e-08, "logits/chosen": -2.384154796600342, "logits/rejected": -2.3629403114318848, "logps/chosen": -270.0539245605469, "logps/rejected": -274.03106689453125, "loss": 0.6772, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1309482753276825, "rewards/margins": 0.03047696128487587, "rewards/margins_max": 0.11086331307888031, "rewards/margins_min": -0.06656692922115326, "rewards/margins_std": 0.07912115007638931, "rewards/rejected": 0.10047130286693573, "step": 3100 }, { "epoch": 0.81, "eval_dpo_losses": 0.6692146062850952, "eval_logits/chosen": -2.305426836013794, "eval_logits/rejected": -2.197521448135376, "eval_logps/chosen": -262.9377136230469, "eval_logps/rejected": -255.0641632080078, "eval_loss": 0.680048942565918, "eval_positive_losses": 0.08660812675952911, "eval_rewards/accuracies": 0.7123016119003296, "eval_rewards/chosen": 0.12836430966854095, "eval_rewards/margins": 0.05080313980579376, "eval_rewards/margins_max": 0.19701167941093445, "eval_rewards/margins_min": -0.07985077053308487, "eval_rewards/margins_std": 0.09216360747814178, "eval_rewards/rejected": 0.07756116986274719, "eval_runtime": 389.2628, "eval_samples_per_second": 5.138, "eval_steps_per_second": 0.162, "step": 3100 }, { "dpo_losses": 0.6768172979354858, "epoch": 0.81, "grad_norm": 2.664154140652573, "learning_rate": 5.0933836475381795e-08, "logits/chosen": -2.35669207572937, "logits/rejected": -2.1809933185577393, "logps/chosen": -309.72637939453125, "logps/rejected": -271.519287109375, "loss": 0.682, "positive_losses": 0.11478348076343536, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.128203347325325, "rewards/margins": 0.03482294827699661, "rewards/margins_max": 0.13301284611225128, "rewards/margins_min": -0.0655655562877655, "rewards/margins_std": 0.08626042306423187, "rewards/rejected": 0.0933803841471672, "step": 3110 }, { "dpo_losses": 0.6689193844795227, "epoch": 0.82, "grad_norm": 2.4317220038465948, "learning_rate": 4.956018477086005e-08, "logits/chosen": -2.369124174118042, "logits/rejected": -2.2284798622131348, "logps/chosen": -247.3609619140625, "logps/rejected": -246.45242309570312, "loss": 0.6867, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.12930302321910858, "rewards/margins": 0.050565313547849655, "rewards/margins_max": 0.13973358273506165, "rewards/margins_min": -0.02274639531970024, "rewards/margins_std": 0.07152976840734482, "rewards/rejected": 0.07873772084712982, "step": 3120 }, { "dpo_losses": 0.6681303977966309, "epoch": 0.82, "grad_norm": 2.5105283298166565, "learning_rate": 4.820326973322763e-08, "logits/chosen": -2.3948864936828613, "logits/rejected": -2.400576114654541, "logps/chosen": -234.1851806640625, "logps/rejected": -222.6758270263672, "loss": 0.6799, "positive_losses": 0.14835719764232635, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.12216782569885254, "rewards/margins": 0.05275552719831467, "rewards/margins_max": 0.14772121608257294, "rewards/margins_min": -0.04417845979332924, "rewards/margins_std": 0.08869819343090057, "rewards/rejected": 0.06941230595111847, "step": 3130 }, { "dpo_losses": 0.6695515513420105, "epoch": 0.82, "grad_norm": 2.352499942921426, "learning_rate": 4.686320466449981e-08, "logits/chosen": -2.1995625495910645, "logits/rejected": -2.2488417625427246, "logps/chosen": -199.1984100341797, "logps/rejected": -205.27658081054688, "loss": 0.672, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.11518961191177368, "rewards/margins": 0.04977904260158539, "rewards/margins_max": 0.14333482086658478, "rewards/margins_min": -0.03746955469250679, "rewards/margins_std": 0.08041433990001678, "rewards/rejected": 0.06541057676076889, "step": 3140 }, { "dpo_losses": 0.6795169711112976, "epoch": 0.82, "grad_norm": 11.913559799845393, "learning_rate": 4.554010145972417e-08, "logits/chosen": -2.3467938899993896, "logits/rejected": -2.303445816040039, "logps/chosen": -266.292236328125, "logps/rejected": -280.1981201171875, "loss": 0.6835, "positive_losses": 0.026194382458925247, "rewards/accuracies": 0.625, "rewards/chosen": 0.11684246361255646, "rewards/margins": 0.030025780200958252, "rewards/margins_max": 0.12344875186681747, "rewards/margins_min": -0.06171383708715439, "rewards/margins_std": 0.08395267277956009, "rewards/rejected": 0.08681667596101761, "step": 3150 }, { "dpo_losses": 0.6605914831161499, "epoch": 0.83, "grad_norm": 7.15045224340085, "learning_rate": 4.423407059763745e-08, "logits/chosen": -2.5058696269989014, "logits/rejected": -2.232790470123291, "logps/chosen": -300.78326416015625, "logps/rejected": -214.3233184814453, "loss": 0.6845, "positive_losses": 0.10122756659984589, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.14813533425331116, "rewards/margins": 0.06953185796737671, "rewards/margins_max": 0.19648176431655884, "rewards/margins_min": -0.045553501695394516, "rewards/margins_std": 0.10774551331996918, "rewards/rejected": 0.07860346138477325, "step": 3160 }, { "dpo_losses": 0.6713321805000305, "epoch": 0.83, "grad_norm": 2.599052432592171, "learning_rate": 4.294522113144078e-08, "logits/chosen": -2.4957058429718018, "logits/rejected": -2.3634119033813477, "logps/chosen": -296.5467224121094, "logps/rejected": -283.74835205078125, "loss": 0.6817, "positive_losses": 0.35028839111328125, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.13152767717838287, "rewards/margins": 0.04606260731816292, "rewards/margins_max": 0.14536167681217194, "rewards/margins_min": -0.04244931787252426, "rewards/margins_std": 0.08641714602708817, "rewards/rejected": 0.08546505868434906, "step": 3170 }, { "dpo_losses": 0.6689661741256714, "epoch": 0.83, "grad_norm": 22.810802323388774, "learning_rate": 4.1673660679693804e-08, "logits/chosen": -2.204338788986206, "logits/rejected": -2.2384581565856934, "logps/chosen": -256.31414794921875, "logps/rejected": -231.65786743164062, "loss": 0.6877, "positive_losses": 0.1391722708940506, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.13438187539577484, "rewards/margins": 0.05178465694189072, "rewards/margins_max": 0.1554839313030243, "rewards/margins_min": -0.07032118737697601, "rewards/margins_std": 0.09987451881170273, "rewards/rejected": 0.08259721845388412, "step": 3180 }, { "dpo_losses": 0.6721521615982056, "epoch": 0.83, "grad_norm": 2.7674389079310964, "learning_rate": 4.041949541732825e-08, "logits/chosen": -2.2454986572265625, "logits/rejected": -2.087000608444214, "logps/chosen": -222.91934204101562, "logps/rejected": -216.2421875, "loss": 0.6771, "positive_losses": 0.029915237799286842, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.11425735801458359, "rewards/margins": 0.04425473138689995, "rewards/margins_max": 0.13472576439380646, "rewards/margins_min": -0.02775680460035801, "rewards/margins_std": 0.07394196838140488, "rewards/rejected": 0.07000264525413513, "step": 3190 }, { "dpo_losses": 0.6709674000740051, "epoch": 0.84, "grad_norm": 2.2333066293438946, "learning_rate": 3.9182830066782605e-08, "logits/chosen": -2.2033145427703857, "logits/rejected": -2.212981700897217, "logps/chosen": -249.73922729492188, "logps/rejected": -235.5943603515625, "loss": 0.6748, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.126674085855484, "rewards/margins": 0.04687775298953056, "rewards/margins_max": 0.136430025100708, "rewards/margins_min": -0.05149704962968826, "rewards/margins_std": 0.08441311120986938, "rewards/rejected": 0.07979633659124374, "step": 3200 }, { "epoch": 0.84, "eval_dpo_losses": 0.6692778468132019, "eval_logits/chosen": -2.3046810626983643, "eval_logits/rejected": -2.196681499481201, "eval_logps/chosen": -262.84710693359375, "eval_logps/rejected": -254.9594268798828, "eval_loss": 0.6797298789024353, "eval_positive_losses": 0.07960080355405807, "eval_rewards/accuracies": 0.716269850730896, "eval_rewards/chosen": 0.12927043437957764, "eval_rewards/margins": 0.0506620891392231, "eval_rewards/margins_max": 0.19603802263736725, "eval_rewards/margins_min": -0.07980389147996902, "eval_rewards/margins_std": 0.09178327769041061, "eval_rewards/rejected": 0.07860833406448364, "eval_runtime": 389.1068, "eval_samples_per_second": 5.14, "eval_steps_per_second": 0.162, "step": 3200 }, { "dpo_losses": 0.6701821088790894, "epoch": 0.84, "grad_norm": 2.6191304296967863, "learning_rate": 3.79637678892577e-08, "logits/chosen": -2.352324962615967, "logits/rejected": -2.2913012504577637, "logps/chosen": -237.4392547607422, "logps/rejected": -264.421142578125, "loss": 0.6766, "positive_losses": 0.017442893236875534, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.12895804643630981, "rewards/margins": 0.04814993590116501, "rewards/margins_max": 0.14973807334899902, "rewards/margins_min": -0.027895677834749222, "rewards/margins_std": 0.07947418093681335, "rewards/rejected": 0.0808081179857254, "step": 3210 }, { "dpo_losses": 0.6761429309844971, "epoch": 0.84, "grad_norm": 15.27265123455721, "learning_rate": 3.6762410676094645e-08, "logits/chosen": -2.3249361515045166, "logits/rejected": -2.21986985206604, "logps/chosen": -228.9075164794922, "logps/rejected": -242.70150756835938, "loss": 0.6787, "positive_losses": 0.09328155219554901, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.11849500238895416, "rewards/margins": 0.03613581135869026, "rewards/margins_max": 0.12373298406600952, "rewards/margins_min": -0.05344996601343155, "rewards/margins_std": 0.08006436377763748, "rewards/rejected": 0.082359179854393, "step": 3220 }, { "dpo_losses": 0.6709215641021729, "epoch": 0.85, "grad_norm": 3.079176360146454, "learning_rate": 3.557885874027497e-08, "logits/chosen": -2.382995367050171, "logits/rejected": -2.316608428955078, "logps/chosen": -290.6761474609375, "logps/rejected": -302.59991455078125, "loss": 0.6859, "positive_losses": 0.031456757336854935, "rewards/accuracies": 0.625, "rewards/chosen": 0.12469639629125595, "rewards/margins": 0.04748505353927612, "rewards/margins_max": 0.16519136726856232, "rewards/margins_min": -0.0583454966545105, "rewards/margins_std": 0.09840714186429977, "rewards/rejected": 0.07721133530139923, "step": 3230 }, { "dpo_losses": 0.6677560806274414, "epoch": 0.85, "grad_norm": 20.951384447188836, "learning_rate": 3.441321090804469e-08, "logits/chosen": -2.503652811050415, "logits/rejected": -2.3028178215026855, "logps/chosen": -241.68765258789062, "logps/rejected": -289.0697326660156, "loss": 0.6865, "positive_losses": 0.12185287475585938, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.12369632720947266, "rewards/margins": 0.05412539094686508, "rewards/margins_max": 0.1644526869058609, "rewards/margins_min": -0.03049234114587307, "rewards/margins_std": 0.0892515480518341, "rewards/rejected": 0.06957095116376877, "step": 3240 }, { "dpo_losses": 0.6692062616348267, "epoch": 0.85, "grad_norm": 2.7222918536131977, "learning_rate": 3.326556451066234e-08, "logits/chosen": -2.4931464195251465, "logits/rejected": -2.3961825370788574, "logps/chosen": -270.8192138671875, "logps/rejected": -270.1750183105469, "loss": 0.6828, "positive_losses": 0.0018508911598473787, "rewards/accuracies": 0.75, "rewards/chosen": 0.12632817029953003, "rewards/margins": 0.05067627876996994, "rewards/margins_max": 0.14526931941509247, "rewards/margins_min": -0.031427957117557526, "rewards/margins_std": 0.07805519551038742, "rewards/rejected": 0.07565189152956009, "step": 3250 }, { "dpo_losses": 0.6653386950492859, "epoch": 0.85, "grad_norm": 12.21629398570275, "learning_rate": 3.2136015376271946e-08, "logits/chosen": -2.3881750106811523, "logits/rejected": -2.264289617538452, "logps/chosen": -236.799072265625, "logps/rejected": -262.1990661621094, "loss": 0.6737, "positive_losses": 0.05254364013671875, "rewards/accuracies": 0.75, "rewards/chosen": 0.12459317594766617, "rewards/margins": 0.05886339396238327, "rewards/margins_max": 0.16286525130271912, "rewards/margins_min": -0.04101995378732681, "rewards/margins_std": 0.08917327225208282, "rewards/rejected": 0.0657297894358635, "step": 3260 }, { "dpo_losses": 0.678049623966217, "epoch": 0.86, "grad_norm": 2.4228037376250278, "learning_rate": 3.102465782190106e-08, "logits/chosen": -2.3185598850250244, "logits/rejected": -2.212437152862549, "logps/chosen": -224.92428588867188, "logps/rejected": -251.5836944580078, "loss": 0.6827, "positive_losses": 0.08734703063964844, "rewards/accuracies": 0.75, "rewards/chosen": 0.11291112005710602, "rewards/margins": 0.03270808234810829, "rewards/margins_max": 0.12682226300239563, "rewards/margins_min": -0.06739183515310287, "rewards/margins_std": 0.08744670450687408, "rewards/rejected": 0.08020304143428802, "step": 3270 }, { "dpo_losses": 0.6745136976242065, "epoch": 0.86, "grad_norm": 2.741217634616196, "learning_rate": 2.993158464558565e-08, "logits/chosen": -2.4807307720184326, "logits/rejected": -2.267157793045044, "logps/chosen": -189.45870971679688, "logps/rejected": -174.0417938232422, "loss": 0.6769, "positive_losses": 0.015582275576889515, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.10906050354242325, "rewards/margins": 0.04003576934337616, "rewards/margins_max": 0.14708790183067322, "rewards/margins_min": -0.05873081088066101, "rewards/margins_std": 0.08886677771806717, "rewards/rejected": 0.06902472674846649, "step": 3280 }, { "dpo_losses": 0.6528472304344177, "epoch": 0.86, "grad_norm": 2.7834090521405734, "learning_rate": 2.8856887118621358e-08, "logits/chosen": -2.3837623596191406, "logits/rejected": -2.1955437660217285, "logps/chosen": -269.6961364746094, "logps/rejected": -279.1293029785156, "loss": 0.682, "positive_losses": 0.24052810668945312, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.14228954911231995, "rewards/margins": 0.08596445620059967, "rewards/margins_max": 0.20559343695640564, "rewards/margins_min": -0.029758477583527565, "rewards/margins_std": 0.10836313664913177, "rewards/rejected": 0.05632506683468819, "step": 3290 }, { "dpo_losses": 0.6663740277290344, "epoch": 0.86, "grad_norm": 2.7576992454723928, "learning_rate": 2.7800654977942482e-08, "logits/chosen": -2.5414488315582275, "logits/rejected": -2.4038424491882324, "logps/chosen": -315.38531494140625, "logps/rejected": -325.5350646972656, "loss": 0.6821, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.13922825455665588, "rewards/margins": 0.05648420378565788, "rewards/margins_max": 0.15471987426280975, "rewards/margins_min": -0.03397979587316513, "rewards/margins_std": 0.08410502225160599, "rewards/rejected": 0.0827440470457077, "step": 3300 }, { "epoch": 0.86, "eval_dpo_losses": 0.6692480444908142, "eval_logits/chosen": -2.3090004920959473, "eval_logits/rejected": -2.201423168182373, "eval_logps/chosen": -262.878662109375, "eval_logps/rejected": -254.99827575683594, "eval_loss": 0.6797296404838562, "eval_positive_losses": 0.08235513418912888, "eval_rewards/accuracies": 0.7103174328804016, "eval_rewards/chosen": 0.1289546936750412, "eval_rewards/margins": 0.05073468014597893, "eval_rewards/margins_max": 0.1960068792104721, "eval_rewards/margins_min": -0.08024682849645615, "eval_rewards/margins_std": 0.09202095121145248, "eval_rewards/rejected": 0.07822001725435257, "eval_runtime": 389.2171, "eval_samples_per_second": 5.139, "eval_steps_per_second": 0.162, "step": 3300 }, { "dpo_losses": 0.659342885017395, "epoch": 0.87, "grad_norm": 13.715400582366, "learning_rate": 2.676297641862879e-08, "logits/chosen": -2.3234944343566895, "logits/rejected": -2.187084674835205, "logps/chosen": -194.15719604492188, "logps/rejected": -212.4752655029297, "loss": 0.6875, "positive_losses": 0.23284760117530823, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.13176333904266357, "rewards/margins": 0.07097126543521881, "rewards/margins_max": 0.17156919836997986, "rewards/margins_min": -0.005562918726354837, "rewards/margins_std": 0.08076272159814835, "rewards/rejected": 0.06079208105802536, "step": 3310 }, { "dpo_losses": 0.6678709983825684, "epoch": 0.87, "grad_norm": 3.272656623000695, "learning_rate": 2.5743938086541352e-08, "logits/chosen": -2.1880550384521484, "logits/rejected": -2.1361775398254395, "logps/chosen": -275.4288635253906, "logps/rejected": -228.80368041992188, "loss": 0.6774, "positive_losses": 0.08111687004566193, "rewards/accuracies": 0.625, "rewards/chosen": 0.11523906141519547, "rewards/margins": 0.05347437411546707, "rewards/margins_max": 0.1482856124639511, "rewards/margins_min": -0.03432339429855347, "rewards/margins_std": 0.08084601163864136, "rewards/rejected": 0.06176469475030899, "step": 3320 }, { "dpo_losses": 0.6768721342086792, "epoch": 0.87, "grad_norm": 10.810526937933288, "learning_rate": 2.474362507108757e-08, "logits/chosen": -2.3585729598999023, "logits/rejected": -2.417057514190674, "logps/chosen": -261.7973327636719, "logps/rejected": -299.03839111328125, "loss": 0.6708, "positive_losses": 0.06487121433019638, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.12277094274759293, "rewards/margins": 0.03429005667567253, "rewards/margins_max": 0.11022315919399261, "rewards/margins_min": -0.03451364487409592, "rewards/margins_std": 0.06537089496850967, "rewards/rejected": 0.0884808897972107, "step": 3330 }, { "dpo_losses": 0.6758958101272583, "epoch": 0.87, "grad_norm": 2.764390314220604, "learning_rate": 2.3762120898116495e-08, "logits/chosen": -2.459319829940796, "logits/rejected": -2.4472625255584717, "logps/chosen": -262.14654541015625, "logps/rejected": -324.5309143066406, "loss": 0.6736, "positive_losses": 0.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.11399255692958832, "rewards/margins": 0.037031032145023346, "rewards/margins_max": 0.13497795164585114, "rewards/margins_min": -0.06016576290130615, "rewards/margins_std": 0.08762778341770172, "rewards/rejected": 0.07696153968572617, "step": 3340 }, { "dpo_losses": 0.6766647100448608, "epoch": 0.88, "grad_norm": 7.985707390204869, "learning_rate": 2.2799507522944044e-08, "logits/chosen": -2.4830164909362793, "logits/rejected": -2.170753240585327, "logps/chosen": -269.501953125, "logps/rejected": -245.11135864257812, "loss": 0.686, "positive_losses": 0.0970970168709755, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.12570074200630188, "rewards/margins": 0.03692053258419037, "rewards/margins_max": 0.1613270491361618, "rewards/margins_min": -0.06142609566450119, "rewards/margins_std": 0.10042493045330048, "rewards/rejected": 0.08878020942211151, "step": 3350 }, { "dpo_losses": 0.6793362498283386, "epoch": 0.88, "grad_norm": 4.584897808903795, "learning_rate": 2.1855865323510054e-08, "logits/chosen": -2.4830827713012695, "logits/rejected": -2.368726968765259, "logps/chosen": -261.88482666015625, "logps/rejected": -319.2535705566406, "loss": 0.6797, "positive_losses": 0.011059570126235485, "rewards/accuracies": 0.625, "rewards/chosen": 0.11353154480457306, "rewards/margins": 0.02960852161049843, "rewards/margins_max": 0.11623834073543549, "rewards/margins_min": -0.0545518696308136, "rewards/margins_std": 0.07567659020423889, "rewards/rejected": 0.08392303436994553, "step": 3360 }, { "dpo_losses": 0.6746928691864014, "epoch": 0.88, "grad_norm": 9.611543438644452, "learning_rate": 2.0931273093666573e-08, "logits/chosen": -2.463020086288452, "logits/rejected": -2.4596176147460938, "logps/chosen": -248.17269897460938, "logps/rejected": -262.7024841308594, "loss": 0.6766, "positive_losses": 0.11037788540124893, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.12698687613010406, "rewards/margins": 0.0405283086001873, "rewards/margins_max": 0.17926689982414246, "rewards/margins_min": -0.07436909526586533, "rewards/margins_std": 0.11250394582748413, "rewards/rejected": 0.08645856380462646, "step": 3370 }, { "dpo_losses": 0.68145751953125, "epoch": 0.88, "grad_norm": 2.3251110255486576, "learning_rate": 2.002580803659873e-08, "logits/chosen": -2.4229302406311035, "logits/rejected": -2.2744932174682617, "logps/chosen": -264.92926025390625, "logps/rejected": -224.35107421875, "loss": 0.6996, "positive_losses": 0.3258255124092102, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.10959847271442413, "rewards/margins": 0.025977253913879395, "rewards/margins_max": 0.13311061263084412, "rewards/margins_min": -0.06495875865221024, "rewards/margins_std": 0.08804834634065628, "rewards/rejected": 0.08362121880054474, "step": 3380 }, { "dpo_losses": 0.6634842753410339, "epoch": 0.89, "grad_norm": 2.5356101642162403, "learning_rate": 1.9139545758378256e-08, "logits/chosen": -2.262960910797119, "logits/rejected": -2.182276725769043, "logps/chosen": -235.3115997314453, "logps/rejected": -254.09158325195312, "loss": 0.6645, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.12509270012378693, "rewards/margins": 0.06396766006946564, "rewards/margins_max": 0.21244347095489502, "rewards/margins_min": -0.06330318748950958, "rewards/margins_std": 0.11905509233474731, "rewards/rejected": 0.06112504005432129, "step": 3390 }, { "dpo_losses": 0.6705407500267029, "epoch": 0.89, "grad_norm": 2.607130718762831, "learning_rate": 1.8272560261650277e-08, "logits/chosen": -2.3748831748962402, "logits/rejected": -2.241793632507324, "logps/chosen": -264.50860595703125, "logps/rejected": -287.6324768066406, "loss": 0.6759, "positive_losses": 0.022732162848114967, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1157820075750351, "rewards/margins": 0.048046939074993134, "rewards/margins_max": 0.16383512318134308, "rewards/margins_min": -0.04165526479482651, "rewards/margins_std": 0.09161083400249481, "rewards/rejected": 0.06773505359888077, "step": 3400 }, { "epoch": 0.89, "eval_dpo_losses": 0.6691598296165466, "eval_logits/chosen": -2.305250883102417, "eval_logits/rejected": -2.1973135471343994, "eval_logps/chosen": -262.86993408203125, "eval_logps/rejected": -255.00863647460938, "eval_loss": 0.6796736121177673, "eval_positive_losses": 0.08117539435625076, "eval_rewards/accuracies": 0.7142857313156128, "eval_rewards/chosen": 0.12904208898544312, "eval_rewards/margins": 0.05092561990022659, "eval_rewards/margins_max": 0.1970774531364441, "eval_rewards/margins_min": -0.08038254827260971, "eval_rewards/margins_std": 0.09222856163978577, "eval_rewards/rejected": 0.07811646908521652, "eval_runtime": 389.3373, "eval_samples_per_second": 5.137, "eval_steps_per_second": 0.162, "step": 3400 }, { "dpo_losses": 0.6751166582107544, "epoch": 0.89, "grad_norm": 9.016997498972946, "learning_rate": 1.742492393945427e-08, "logits/chosen": -2.169116497039795, "logits/rejected": -2.10593581199646, "logps/chosen": -225.00881958007812, "logps/rejected": -260.89276123046875, "loss": 0.6787, "positive_losses": 0.083196260035038, "rewards/accuracies": 0.625, "rewards/chosen": 0.10707148164510727, "rewards/margins": 0.039024822413921356, "rewards/margins_max": 0.12156794965267181, "rewards/margins_min": -0.06058493256568909, "rewards/margins_std": 0.0827067568898201, "rewards/rejected": 0.06804665923118591, "step": 3410 }, { "dpo_losses": 0.6496423482894897, "epoch": 0.9, "grad_norm": 13.819403256728211, "learning_rate": 1.6596707569179302e-08, "logits/chosen": -2.520373821258545, "logits/rejected": -2.146146059036255, "logps/chosen": -329.33758544921875, "logps/rejected": -279.7669372558594, "loss": 0.6787, "positive_losses": 0.0009946823120117188, "rewards/accuracies": 0.875, "rewards/chosen": 0.15970297157764435, "rewards/margins": 0.09256087243556976, "rewards/margins_max": 0.2187737673521042, "rewards/margins_min": -0.011989710852503777, "rewards/margins_std": 0.1066315546631813, "rewards/rejected": 0.06714209169149399, "step": 3420 }, { "dpo_losses": 0.6721950769424438, "epoch": 0.9, "grad_norm": 2.8442089003981414, "learning_rate": 1.5787980306653848e-08, "logits/chosen": -2.4318490028381348, "logits/rejected": -2.323869228363037, "logps/chosen": -300.0576477050781, "logps/rejected": -274.08441162109375, "loss": 0.6725, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.13398399949073792, "rewards/margins": 0.04625312238931656, "rewards/margins_max": 0.17304448783397675, "rewards/margins_min": -0.08346472680568695, "rewards/margins_std": 0.11451027542352676, "rewards/rejected": 0.08773088455200195, "step": 3430 }, { "dpo_losses": 0.6695524454116821, "epoch": 0.9, "grad_norm": 1.9424919301065455, "learning_rate": 1.499880968037165e-08, "logits/chosen": -2.375246524810791, "logits/rejected": -2.410353183746338, "logps/chosen": -249.32101440429688, "logps/rejected": -268.6033630371094, "loss": 0.6743, "positive_losses": 0.04729504510760307, "rewards/accuracies": 0.75, "rewards/chosen": 0.12252242863178253, "rewards/margins": 0.05003643035888672, "rewards/margins_max": 0.1606391817331314, "rewards/margins_min": -0.050567395985126495, "rewards/margins_std": 0.09126288443803787, "rewards/rejected": 0.07248599082231522, "step": 3440 }, { "dpo_losses": 0.6693820357322693, "epoch": 0.9, "grad_norm": 5.458834132547894, "learning_rate": 1.4229261585852803e-08, "logits/chosen": -2.400480270385742, "logits/rejected": -2.3023390769958496, "logps/chosen": -246.43112182617188, "logps/rejected": -248.4359588623047, "loss": 0.6771, "positive_losses": 0.14210395514965057, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1363454908132553, "rewards/margins": 0.05008237436413765, "rewards/margins_max": 0.16186223924160004, "rewards/margins_min": -0.036306947469711304, "rewards/margins_std": 0.08560968190431595, "rewards/rejected": 0.08626312017440796, "step": 3450 }, { "dpo_losses": 0.662241518497467, "epoch": 0.91, "grad_norm": 2.907407796616291, "learning_rate": 1.3479400280141883e-08, "logits/chosen": -2.4223241806030273, "logits/rejected": -2.1820554733276367, "logps/chosen": -289.401611328125, "logps/rejected": -213.1032257080078, "loss": 0.6707, "positive_losses": 0.015072250738739967, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.1396438181400299, "rewards/margins": 0.0646057277917862, "rewards/margins_max": 0.16545650362968445, "rewards/margins_min": -0.0057312725111842155, "rewards/margins_std": 0.07797079533338547, "rewards/rejected": 0.07503808289766312, "step": 3460 }, { "dpo_losses": 0.6651964783668518, "epoch": 0.91, "grad_norm": 8.394677574050455, "learning_rate": 1.2749288376442042e-08, "logits/chosen": -2.3686633110046387, "logits/rejected": -2.193319082260132, "logps/chosen": -258.24859619140625, "logps/rejected": -279.73223876953125, "loss": 0.6788, "positive_losses": 0.017699431627988815, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1274716705083847, "rewards/margins": 0.0597081184387207, "rewards/margins_max": 0.1584540456533432, "rewards/margins_min": -0.03655258193612099, "rewards/margins_std": 0.08701176941394806, "rewards/rejected": 0.0677635669708252, "step": 3470 }, { "dpo_losses": 0.6753323674201965, "epoch": 0.91, "grad_norm": 2.7949624795050148, "learning_rate": 1.2038986838887127e-08, "logits/chosen": -2.4512059688568115, "logits/rejected": -2.2673470973968506, "logps/chosen": -210.4167938232422, "logps/rejected": -220.53466796875, "loss": 0.6883, "positive_losses": 0.232188418507576, "rewards/accuracies": 0.625, "rewards/chosen": 0.11029820144176483, "rewards/margins": 0.03918548673391342, "rewards/margins_max": 0.15071064233779907, "rewards/margins_min": -0.06958399713039398, "rewards/margins_std": 0.0983327254652977, "rewards/rejected": 0.071112722158432, "step": 3480 }, { "dpo_losses": 0.6772063970565796, "epoch": 0.91, "grad_norm": 11.472429480436729, "learning_rate": 1.1348554977451131e-08, "logits/chosen": -2.262913227081299, "logits/rejected": -2.333108425140381, "logps/chosen": -236.9550323486328, "logps/rejected": -214.06472778320312, "loss": 0.6848, "positive_losses": 0.07017497718334198, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.11239944398403168, "rewards/margins": 0.03342963010072708, "rewards/margins_max": 0.10693303495645523, "rewards/margins_min": -0.03752649575471878, "rewards/margins_std": 0.06555505096912384, "rewards/rejected": 0.07896982133388519, "step": 3490 }, { "dpo_losses": 0.6629253625869751, "epoch": 0.92, "grad_norm": 2.956102111859424, "learning_rate": 1.06780504429958e-08, "logits/chosen": -2.4380486011505127, "logits/rejected": -2.265533924102783, "logps/chosen": -334.5301208496094, "logps/rejected": -268.69451904296875, "loss": 0.678, "positive_losses": 0.07878541946411133, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.13844558596611023, "rewards/margins": 0.06392788887023926, "rewards/margins_max": 0.17385877668857574, "rewards/margins_min": -0.042621415108442307, "rewards/margins_std": 0.09444503486156464, "rewards/rejected": 0.07451769709587097, "step": 3500 }, { "epoch": 0.92, "eval_dpo_losses": 0.669166624546051, "eval_logits/chosen": -2.3025174140930176, "eval_logits/rejected": -2.1943209171295166, "eval_logps/chosen": -262.8900146484375, "eval_logps/rejected": -255.0269317626953, "eval_loss": 0.6798329949378967, "eval_positive_losses": 0.08230466395616531, "eval_rewards/accuracies": 0.7142857313156128, "eval_rewards/chosen": 0.1288415491580963, "eval_rewards/margins": 0.05090819671750069, "eval_rewards/margins_max": 0.19710253179073334, "eval_rewards/margins_min": -0.08030140399932861, "eval_rewards/margins_std": 0.09232944250106812, "eval_rewards/rejected": 0.07793334871530533, "eval_runtime": 389.4074, "eval_samples_per_second": 5.136, "eval_steps_per_second": 0.162, "step": 3500 }, { "dpo_losses": 0.6746149659156799, "epoch": 0.92, "grad_norm": 9.259435429345293, "learning_rate": 1.0027529222456754e-08, "logits/chosen": -2.4104716777801514, "logits/rejected": -2.2399020195007324, "logps/chosen": -270.034423828125, "logps/rejected": -254.6239013671875, "loss": 0.6905, "positive_losses": 0.23281840980052948, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.12146767228841782, "rewards/margins": 0.03860211372375488, "rewards/margins_max": 0.105097696185112, "rewards/margins_min": -0.049825794994831085, "rewards/margins_std": 0.06890954077243805, "rewards/rejected": 0.08286555111408234, "step": 3510 }, { "dpo_losses": 0.6734747886657715, "epoch": 0.92, "grad_norm": 15.642280483379114, "learning_rate": 9.397045634168766e-09, "logits/chosen": -2.3103578090667725, "logits/rejected": -2.281399965286255, "logps/chosen": -253.24765014648438, "logps/rejected": -242.67373657226562, "loss": 0.6861, "positive_losses": 0.15676459670066833, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.11477897316217422, "rewards/margins": 0.04232285916805267, "rewards/margins_max": 0.15683066844940186, "rewards/margins_min": -0.05869164317846298, "rewards/margins_std": 0.09568311274051666, "rewards/rejected": 0.07245610654354095, "step": 3520 }, { "dpo_losses": 0.679439902305603, "epoch": 0.92, "grad_norm": 11.869270307677668, "learning_rate": 8.78665232332998e-09, "logits/chosen": -2.5483803749084473, "logits/rejected": -2.3675687313079834, "logps/chosen": -261.5073547363281, "logps/rejected": -237.2522735595703, "loss": 0.6794, "positive_losses": 0.14146193861961365, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.11706838756799698, "rewards/margins": 0.029987787827849388, "rewards/margins_max": 0.15094950795173645, "rewards/margins_min": -0.05403905361890793, "rewards/margins_std": 0.0912812277674675, "rewards/rejected": 0.08708060532808304, "step": 3530 }, { "dpo_losses": 0.6692111492156982, "epoch": 0.93, "grad_norm": 21.906836486888775, "learning_rate": 8.196400257606206e-09, "logits/chosen": -2.378215789794922, "logits/rejected": -2.1472201347351074, "logps/chosen": -274.83172607421875, "logps/rejected": -249.9633331298828, "loss": 0.6897, "positive_losses": 0.2881126403808594, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.12198509275913239, "rewards/margins": 0.051083408296108246, "rewards/margins_max": 0.16817298531532288, "rewards/margins_min": -0.0438561737537384, "rewards/margins_std": 0.0920911356806755, "rewards/rejected": 0.07090168446302414, "step": 3540 }, { "dpo_losses": 0.6670487523078918, "epoch": 0.93, "grad_norm": 2.591588050001513, "learning_rate": 7.626338722875075e-09, "logits/chosen": -2.1925442218780518, "logits/rejected": -2.1438047885894775, "logps/chosen": -238.2859344482422, "logps/rejected": -251.4809112548828, "loss": 0.6909, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.13784736394882202, "rewards/margins": 0.05496755987405777, "rewards/margins_max": 0.16254086792469025, "rewards/margins_min": -0.03475554287433624, "rewards/margins_std": 0.09163057804107666, "rewards/rejected": 0.08287978172302246, "step": 3550 }, { "dpo_losses": 0.6672071218490601, "epoch": 0.93, "grad_norm": 2.6743384710867417, "learning_rate": 7.0765153191106875e-09, "logits/chosen": -2.404132843017578, "logits/rejected": -2.167569637298584, "logps/chosen": -264.3001403808594, "logps/rejected": -247.8280792236328, "loss": 0.67, "positive_losses": 0.03965425491333008, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.11068712174892426, "rewards/margins": 0.05481187626719475, "rewards/margins_max": 0.15251585841178894, "rewards/margins_min": -0.034066714346408844, "rewards/margins_std": 0.08707704395055771, "rewards/rejected": 0.05587524175643921, "step": 3560 }, { "dpo_losses": 0.6626065373420715, "epoch": 0.93, "grad_norm": 8.843119352790355, "learning_rate": 6.54697595640899e-09, "logits/chosen": -2.4443838596343994, "logits/rejected": -2.335523843765259, "logps/chosen": -296.3941650390625, "logps/rejected": -257.3419494628906, "loss": 0.6911, "positive_losses": 0.07377395778894424, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1298106461763382, "rewards/margins": 0.06484152376651764, "rewards/margins_max": 0.18660303950309753, "rewards/margins_min": -0.0270791407674551, "rewards/margins_std": 0.09408458322286606, "rewards/rejected": 0.06496910750865936, "step": 3570 }, { "dpo_losses": 0.6592499017715454, "epoch": 0.94, "grad_norm": 3.0564075515172515, "learning_rate": 6.037764851154425e-09, "logits/chosen": -2.4130449295043945, "logits/rejected": -2.1542978286743164, "logps/chosen": -290.1929016113281, "logps/rejected": -258.75164794921875, "loss": 0.6841, "positive_losses": 0.042176056653261185, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.13862988352775574, "rewards/margins": 0.07240472733974457, "rewards/margins_max": 0.19755132496356964, "rewards/margins_min": -0.046934641897678375, "rewards/margins_std": 0.10821268707513809, "rewards/rejected": 0.06622515618801117, "step": 3580 }, { "dpo_losses": 0.6759646534919739, "epoch": 0.94, "grad_norm": 2.9111007079536333, "learning_rate": 5.548924522327747e-09, "logits/chosen": -2.3286805152893066, "logits/rejected": -2.251971960067749, "logps/chosen": -183.4307861328125, "logps/rejected": -193.52877807617188, "loss": 0.6755, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.11046738922595978, "rewards/margins": 0.0362229123711586, "rewards/margins_max": 0.12659592926502228, "rewards/margins_min": -0.04409477114677429, "rewards/margins_std": 0.07543198019266129, "rewards/rejected": 0.07424446940422058, "step": 3590 }, { "dpo_losses": 0.6715174913406372, "epoch": 0.94, "grad_norm": 2.675839700547726, "learning_rate": 5.080495787955691e-09, "logits/chosen": -2.188714027404785, "logits/rejected": -2.219529867172241, "logps/chosen": -230.4191131591797, "logps/rejected": -270.1329650878906, "loss": 0.6798, "positive_losses": 0.3534698486328125, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.12178288400173187, "rewards/margins": 0.04610290005803108, "rewards/margins_max": 0.13876459002494812, "rewards/margins_min": -0.05036674812436104, "rewards/margins_std": 0.08440079540014267, "rewards/rejected": 0.07567998766899109, "step": 3600 }, { "epoch": 0.94, "eval_dpo_losses": 0.669320821762085, "eval_logits/chosen": -2.3045456409454346, "eval_logits/rejected": -2.1965110301971436, "eval_logps/chosen": -262.8558349609375, "eval_logps/rejected": -254.95993041992188, "eval_loss": 0.6795217394828796, "eval_positive_losses": 0.0807814821600914, "eval_rewards/accuracies": 0.7142857313156128, "eval_rewards/chosen": 0.12918297946453094, "eval_rewards/margins": 0.05057960003614426, "eval_rewards/margins_max": 0.19568683207035065, "eval_rewards/margins_min": -0.08033297955989838, "eval_rewards/margins_std": 0.0919150710105896, "eval_rewards/rejected": 0.07860337197780609, "eval_runtime": 389.0033, "eval_samples_per_second": 5.141, "eval_steps_per_second": 0.162, "step": 3600 }, { "dpo_losses": 0.6795822978019714, "epoch": 0.94, "grad_norm": 8.805145008078018, "learning_rate": 4.632517761702814e-09, "logits/chosen": -2.3028149604797363, "logits/rejected": -2.3220579624176025, "logps/chosen": -213.9730224609375, "logps/rejected": -216.1724853515625, "loss": 0.6707, "positive_losses": 0.06297607719898224, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.11183343827724457, "rewards/margins": 0.02903483808040619, "rewards/margins_max": 0.11653663963079453, "rewards/margins_min": -0.05309765413403511, "rewards/margins_std": 0.07721646130084991, "rewards/rejected": 0.08279859274625778, "step": 3610 }, { "dpo_losses": 0.675351619720459, "epoch": 0.95, "grad_norm": 2.776990307344219, "learning_rate": 4.205027849605358e-09, "logits/chosen": -2.492180585861206, "logits/rejected": -2.2991082668304443, "logps/chosen": -211.90835571289062, "logps/rejected": -233.210205078125, "loss": 0.6801, "positive_losses": 0.17425231635570526, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.12319853156805038, "rewards/margins": 0.03803318738937378, "rewards/margins_max": 0.1315273493528366, "rewards/margins_min": -0.0496831089258194, "rewards/margins_std": 0.08257023245096207, "rewards/rejected": 0.0851653516292572, "step": 3620 }, { "dpo_losses": 0.6789863109588623, "epoch": 0.95, "grad_norm": 8.75134877299949, "learning_rate": 3.798061746947995e-09, "logits/chosen": -2.250687599182129, "logits/rejected": -2.1806836128234863, "logps/chosen": -231.2392120361328, "logps/rejected": -294.1446838378906, "loss": 0.6801, "positive_losses": 0.20766349136829376, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.09926270693540573, "rewards/margins": 0.03012619912624359, "rewards/margins_max": 0.11989504098892212, "rewards/margins_min": -0.05875443294644356, "rewards/margins_std": 0.07929670810699463, "rewards/rejected": 0.06913651525974274, "step": 3630 }, { "dpo_losses": 0.6807145476341248, "epoch": 0.95, "grad_norm": 2.4098069294880133, "learning_rate": 3.411653435283157e-09, "logits/chosen": -2.46113657951355, "logits/rejected": -2.38702130317688, "logps/chosen": -246.1498565673828, "logps/rejected": -225.646484375, "loss": 0.6948, "positive_losses": 0.14888039231300354, "rewards/accuracies": 0.625, "rewards/chosen": 0.115153968334198, "rewards/margins": 0.02701282501220703, "rewards/margins_max": 0.12020417302846909, "rewards/margins_min": -0.07684946060180664, "rewards/margins_std": 0.08799809962511063, "rewards/rejected": 0.08814114332199097, "step": 3640 }, { "dpo_losses": 0.6742995977401733, "epoch": 0.96, "grad_norm": 17.815931344388616, "learning_rate": 3.0458351795936698e-09, "logits/chosen": -2.254080295562744, "logits/rejected": -2.258018970489502, "logps/chosen": -263.5292663574219, "logps/rejected": -313.0960998535156, "loss": 0.6873, "positive_losses": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.12496940791606903, "rewards/margins": 0.04056607559323311, "rewards/margins_max": 0.1460079848766327, "rewards/margins_min": -0.06665924936532974, "rewards/margins_std": 0.09792429953813553, "rewards/rejected": 0.08440332114696503, "step": 3650 }, { "dpo_losses": 0.6802531480789185, "epoch": 0.96, "grad_norm": 17.390995265038566, "learning_rate": 2.700637525598598e-09, "logits/chosen": -2.6376395225524902, "logits/rejected": -2.5229780673980713, "logps/chosen": -258.58477783203125, "logps/rejected": -253.9266815185547, "loss": 0.6918, "positive_losses": 0.05217475816607475, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.13008873164653778, "rewards/margins": 0.027296090498566628, "rewards/margins_max": 0.10404376685619354, "rewards/margins_min": -0.04743499681353569, "rewards/margins_std": 0.06597328931093216, "rewards/rejected": 0.1027926579117775, "step": 3660 }, { "dpo_losses": 0.6713598370552063, "epoch": 0.96, "grad_norm": 2.511044899828085, "learning_rate": 2.3760892972027324e-09, "logits/chosen": -2.4055914878845215, "logits/rejected": -2.3222835063934326, "logps/chosen": -262.60980224609375, "logps/rejected": -292.75567626953125, "loss": 0.6854, "positive_losses": 0.0884777083992958, "rewards/accuracies": 0.625, "rewards/chosen": 0.12635168433189392, "rewards/margins": 0.04742127284407616, "rewards/margins_max": 0.19884729385375977, "rewards/margins_min": -0.053927112370729446, "rewards/margins_std": 0.11563923209905624, "rewards/rejected": 0.07893041521310806, "step": 3670 }, { "dpo_losses": 0.6599666476249695, "epoch": 0.96, "grad_norm": 11.98651255179324, "learning_rate": 2.0722175940897645e-09, "logits/chosen": -2.3357038497924805, "logits/rejected": -2.2685706615448, "logps/chosen": -270.7154541015625, "logps/rejected": -270.9029846191406, "loss": 0.6786, "positive_losses": 0.026680756360292435, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.14410603046417236, "rewards/margins": 0.07008855789899826, "rewards/margins_max": 0.16702865064144135, "rewards/margins_min": -0.02863515354692936, "rewards/margins_std": 0.08886677026748657, "rewards/rejected": 0.0740174800157547, "step": 3680 }, { "dpo_losses": 0.6617531180381775, "epoch": 0.97, "grad_norm": 8.957252283442633, "learning_rate": 1.7890477894593748e-09, "logits/chosen": -2.2614758014678955, "logits/rejected": -2.133333444595337, "logps/chosen": -300.1104431152344, "logps/rejected": -233.7286376953125, "loss": 0.6754, "positive_losses": 0.0, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.13969308137893677, "rewards/margins": 0.06598304212093353, "rewards/margins_max": 0.1569063663482666, "rewards/margins_min": -0.026309896260499954, "rewards/margins_std": 0.08132576197385788, "rewards/rejected": 0.07371003925800323, "step": 3690 }, { "dpo_losses": 0.6660200357437134, "epoch": 0.97, "grad_norm": 9.22354797679686, "learning_rate": 1.5266035279088708e-09, "logits/chosen": -2.5149595737457275, "logits/rejected": -2.436842918395996, "logps/chosen": -309.20263671875, "logps/rejected": -273.0166320800781, "loss": 0.6848, "positive_losses": 0.17203445732593536, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.13054092228412628, "rewards/margins": 0.057736676186323166, "rewards/margins_max": 0.17761030793190002, "rewards/margins_min": -0.057766757905483246, "rewards/margins_std": 0.10507609695196152, "rewards/rejected": 0.07280426472425461, "step": 3700 }, { "epoch": 0.97, "eval_dpo_losses": 0.6691861152648926, "eval_logits/chosen": -2.309345245361328, "eval_logits/rejected": -2.2016751766204834, "eval_logps/chosen": -262.8503723144531, "eval_logps/rejected": -254.9833526611328, "eval_loss": 0.6797061562538147, "eval_positive_losses": 0.0805898979306221, "eval_rewards/accuracies": 0.7123016119003296, "eval_rewards/chosen": 0.12923771142959595, "eval_rewards/margins": 0.050868432968854904, "eval_rewards/margins_max": 0.19673700630664825, "eval_rewards/margins_min": -0.08014870434999466, "eval_rewards/margins_std": 0.09213271737098694, "eval_rewards/rejected": 0.07836927473545074, "eval_runtime": 388.9516, "eval_samples_per_second": 5.142, "eval_steps_per_second": 0.162, "step": 3700 }, { "dpo_losses": 0.6625914573669434, "epoch": 0.97, "grad_norm": 2.642306001556098, "learning_rate": 1.2849067234584621e-09, "logits/chosen": -2.2955644130706787, "logits/rejected": -2.190904378890991, "logps/chosen": -279.26654052734375, "logps/rejected": -268.289794921875, "loss": 0.675, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.14626960456371307, "rewards/margins": 0.06389258801937103, "rewards/margins_max": 0.15590433776378632, "rewards/margins_min": -0.010839484632015228, "rewards/margins_std": 0.07458408176898956, "rewards/rejected": 0.08237699419260025, "step": 3710 }, { "dpo_losses": 0.6633309125900269, "epoch": 0.97, "grad_norm": 8.606130109629252, "learning_rate": 1.0639775577218625e-09, "logits/chosen": -2.5142621994018555, "logits/rejected": -2.3808085918426514, "logps/chosen": -274.4912109375, "logps/rejected": -243.31387329101562, "loss": 0.6814, "positive_losses": 0.009181213565170765, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.13839319348335266, "rewards/margins": 0.06243567913770676, "rewards/margins_max": 0.15030893683433533, "rewards/margins_min": -0.022112851962447166, "rewards/margins_std": 0.08007488399744034, "rewards/rejected": 0.07595750689506531, "step": 3720 }, { "dpo_losses": 0.6643810272216797, "epoch": 0.98, "grad_norm": 12.901249764398736, "learning_rate": 8.638344782207485e-10, "logits/chosen": -2.5149521827697754, "logits/rejected": -2.3692288398742676, "logps/chosen": -307.21429443359375, "logps/rejected": -250.5802459716797, "loss": 0.6816, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.13631896674633026, "rewards/margins": 0.06103982776403427, "rewards/margins_max": 0.17756375670433044, "rewards/margins_min": -0.03455282002687454, "rewards/margins_std": 0.09665904194116592, "rewards/rejected": 0.0752791315317154, "step": 3730 }, { "dpo_losses": 0.6682798862457275, "epoch": 0.98, "grad_norm": 2.3338036630812113, "learning_rate": 6.844941968447149e-10, "logits/chosen": -2.4491734504699707, "logits/rejected": -2.3357410430908203, "logps/chosen": -310.2236633300781, "logps/rejected": -308.8087463378906, "loss": 0.6614, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.14522995054721832, "rewards/margins": 0.05238284543156624, "rewards/margins_max": 0.13315540552139282, "rewards/margins_min": -0.03716592863202095, "rewards/margins_std": 0.07564219832420349, "rewards/rejected": 0.09284709393978119, "step": 3740 }, { "dpo_losses": 0.6789427995681763, "epoch": 0.98, "grad_norm": 2.7949141123068735, "learning_rate": 5.25971688455612e-10, "logits/chosen": -2.256833076477051, "logits/rejected": -2.222848892211914, "logps/chosen": -270.87579345703125, "logps/rejected": -253.28817749023438, "loss": 0.6882, "positive_losses": 0.12509965896606445, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1296057254076004, "rewards/margins": 0.031163418665528297, "rewards/margins_max": 0.14388689398765564, "rewards/margins_min": -0.0796196237206459, "rewards/margins_std": 0.09771333634853363, "rewards/rejected": 0.09844230115413666, "step": 3750 }, { "dpo_losses": 0.6741542220115662, "epoch": 0.98, "grad_norm": 2.590522872697187, "learning_rate": 3.882801896372967e-10, "logits/chosen": -2.291642904281616, "logits/rejected": -2.274657726287842, "logps/chosen": -211.21560668945312, "logps/rejected": -228.9896697998047, "loss": 0.6754, "positive_losses": 0.1516435593366623, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.11674255132675171, "rewards/margins": 0.04044301062822342, "rewards/margins_max": 0.1501026451587677, "rewards/margins_min": -0.04160268232226372, "rewards/margins_std": 0.08655392378568649, "rewards/rejected": 0.07629954069852829, "step": 3760 }, { "dpo_losses": 0.6695734262466431, "epoch": 0.99, "grad_norm": 12.66894136048125, "learning_rate": 2.714311975902661e-10, "logits/chosen": -2.4541516304016113, "logits/rejected": -2.352698802947998, "logps/chosen": -281.77685546875, "logps/rejected": -305.18212890625, "loss": 0.6795, "positive_losses": 0.03063812293112278, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1354237198829651, "rewards/margins": 0.05083948373794556, "rewards/margins_max": 0.18954767286777496, "rewards/margins_min": -0.06128822639584541, "rewards/margins_std": 0.11173137277364731, "rewards/rejected": 0.08458424359560013, "step": 3770 }, { "dpo_losses": 0.6644253134727478, "epoch": 0.99, "grad_norm": 21.446046777046842, "learning_rate": 1.754344691717591e-10, "logits/chosen": -2.4007885456085205, "logits/rejected": -2.124936580657959, "logps/chosen": -280.6167297363281, "logps/rejected": -256.33551025390625, "loss": 0.6874, "positive_losses": 0.07916869968175888, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.13609318435192108, "rewards/margins": 0.06087359040975571, "rewards/margins_max": 0.17766180634498596, "rewards/margins_min": -0.05126393958926201, "rewards/margins_std": 0.10541017353534698, "rewards/rejected": 0.07521959394216537, "step": 3780 }, { "dpo_losses": 0.6822197437286377, "epoch": 0.99, "grad_norm": 18.896183884077214, "learning_rate": 1.0029802008096333e-10, "logits/chosen": -2.3358957767486572, "logits/rejected": -2.3211677074432373, "logps/chosen": -235.7646484375, "logps/rejected": -213.0660400390625, "loss": 0.6923, "positive_losses": 0.2177116423845291, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.09562576562166214, "rewards/margins": 0.024593904614448547, "rewards/margins_max": 0.13694313168525696, "rewards/margins_min": -0.0745038390159607, "rewards/margins_std": 0.09195156395435333, "rewards/rejected": 0.07103186845779419, "step": 3790 }, { "dpo_losses": 0.6712885499000549, "epoch": 0.99, "grad_norm": 10.494289843706932, "learning_rate": 4.602812418974533e-11, "logits/chosen": -2.279733180999756, "logits/rejected": -2.2047605514526367, "logps/chosen": -200.5308837890625, "logps/rejected": -174.5224151611328, "loss": 0.6734, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.11501780897378922, "rewards/margins": 0.04573873057961464, "rewards/margins_max": 0.1347006857395172, "rewards/margins_min": -0.021916985511779785, "rewards/margins_std": 0.07183204591274261, "rewards/rejected": 0.06927908957004547, "step": 3800 }, { "epoch": 0.99, "eval_dpo_losses": 0.6691918969154358, "eval_logits/chosen": -2.302849054336548, "eval_logits/rejected": -2.194676160812378, "eval_logps/chosen": -262.8234558105469, "eval_logps/rejected": -254.95423889160156, "eval_loss": 0.6797086000442505, "eval_positive_losses": 0.08223303407430649, "eval_rewards/accuracies": 0.7123016119003296, "eval_rewards/chosen": 0.12950687110424042, "eval_rewards/margins": 0.05084652826189995, "eval_rewards/margins_max": 0.19665119051933289, "eval_rewards/margins_min": -0.07997436076402664, "eval_rewards/margins_std": 0.09202685952186584, "eval_rewards/rejected": 0.07866034656763077, "eval_runtime": 388.8897, "eval_samples_per_second": 5.143, "eval_steps_per_second": 0.162, "step": 3800 }, { "dpo_losses": 0.6714563369750977, "epoch": 1.0, "grad_norm": 2.387999145897436, "learning_rate": 1.2629313018819309e-11, "logits/chosen": -2.3879127502441406, "logits/rejected": -2.243809461593628, "logps/chosen": -285.85205078125, "logps/rejected": -255.5814971923828, "loss": 0.684, "positive_losses": 0.28295212984085083, "rewards/accuracies": 0.75, "rewards/chosen": 0.13598409295082092, "rewards/margins": 0.04671488329768181, "rewards/margins_max": 0.17115409672260284, "rewards/margins_min": -0.04593339562416077, "rewards/margins_std": 0.09708617627620697, "rewards/rejected": 0.08926919847726822, "step": 3810 }, { "dpo_losses": 0.6675957441329956, "epoch": 1.0, "grad_norm": 2.6377664476651415, "learning_rate": 1.0437535929996855e-13, "logits/chosen": -2.335951328277588, "logits/rejected": -2.2002310752868652, "logps/chosen": -289.9017639160156, "logps/rejected": -208.04306030273438, "loss": 0.6864, "positive_losses": 0.45768141746520996, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.12192322313785553, "rewards/margins": 0.05484456568956375, "rewards/margins_max": 0.19565747678279877, "rewards/margins_min": -0.04932503029704094, "rewards/margins_std": 0.11101311445236206, "rewards/rejected": 0.06707865744829178, "step": 3820 }, { "epoch": 1.0, "step": 3821, "total_flos": 0.0, "train_loss": 0.68390608707419, "train_runtime": 41762.7734, "train_samples_per_second": 1.464, "train_steps_per_second": 0.091 } ], "logging_steps": 10, "max_steps": 3821, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }