{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 306, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 16.160893330879862, "learning_rate": 1.6129032258064514e-08, "logits/chosen": -1.6982225179672241, "logits/rejected": -1.086500644683838, "logps/chosen": -208.50250244140625, "logps/rejected": -262.22808837890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 18.69135932886243, "learning_rate": 1.6129032258064515e-07, "logits/chosen": -1.4697270393371582, "logits/rejected": -0.9334302544593811, "logps/chosen": -234.84239196777344, "logps/rejected": -317.2652893066406, "loss": 0.6929, "rewards/accuracies": 0.4375, "rewards/chosen": -0.000248217984335497, "rewards/margins": 0.00017482459952589124, "rewards/rejected": -0.00042304262751713395, "step": 10 }, { "epoch": 0.07, "grad_norm": 16.031117031534944, "learning_rate": 3.225806451612903e-07, "logits/chosen": -1.3034999370574951, "logits/rejected": -1.0336174964904785, "logps/chosen": -224.30307006835938, "logps/rejected": -348.02825927734375, "loss": 0.6916, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.0018210510024800897, "rewards/margins": 0.0035554722417145967, "rewards/rejected": -0.0017344218213111162, "step": 20 }, { "epoch": 0.1, "grad_norm": 16.925592105056882, "learning_rate": 4.838709677419355e-07, "logits/chosen": -1.0461599826812744, "logits/rejected": -0.9585116505622864, "logps/chosen": -237.54373168945312, "logps/rejected": -275.940673828125, "loss": 0.6822, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.01179418247193098, "rewards/margins": 0.023266470059752464, "rewards/rejected": -0.011472286656498909, "step": 30 }, { "epoch": 0.13, "grad_norm": 17.790603933181593, "learning_rate": 4.986797785768295e-07, "logits/chosen": -1.2466868162155151, "logits/rejected": -1.0099724531173706, "logps/chosen": -226.1925506591797, "logps/rejected": -308.3481750488281, "loss": 0.6584, "rewards/accuracies": 0.8125, "rewards/chosen": 0.006962643004953861, "rewards/margins": 0.06420420855283737, "rewards/rejected": -0.057241566479206085, "step": 40 }, { "epoch": 0.16, "grad_norm": 18.832438856630475, "learning_rate": 4.941339491514909e-07, "logits/chosen": -1.0786056518554688, "logits/rejected": -0.8200371861457825, "logps/chosen": -250.77627563476562, "logps/rejected": -305.92352294921875, "loss": 0.5989, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03331167623400688, "rewards/margins": 0.29684731364250183, "rewards/rejected": -0.3301590085029602, "step": 50 }, { "epoch": 0.2, "grad_norm": 26.859354757154694, "learning_rate": 4.864054603442063e-07, "logits/chosen": -0.8682538270950317, "logits/rejected": -0.8277397155761719, "logps/chosen": -281.60577392578125, "logps/rejected": -436.7176208496094, "loss": 0.5358, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1347092092037201, "rewards/margins": 0.7372199296951294, "rewards/rejected": -0.8719291687011719, "step": 60 }, { "epoch": 0.23, "grad_norm": 19.728868151669563, "learning_rate": 4.755950648257788e-07, "logits/chosen": -0.9148917198181152, "logits/rejected": -0.5669609308242798, "logps/chosen": -303.3214416503906, "logps/rejected": -471.29345703125, "loss": 0.4898, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.24109110236167908, "rewards/margins": 1.1684527397155762, "rewards/rejected": -1.4095438718795776, "step": 70 }, { "epoch": 0.26, "grad_norm": 17.67886600787927, "learning_rate": 4.618436926341606e-07, "logits/chosen": -0.6920875310897827, "logits/rejected": -0.1917627602815628, "logps/chosen": -238.93624877929688, "logps/rejected": -427.3094177246094, "loss": 0.4322, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.18579021096229553, "rewards/margins": 1.348921298980713, "rewards/rejected": -1.5347115993499756, "step": 80 }, { "epoch": 0.29, "grad_norm": 20.470351398393845, "learning_rate": 4.4533061393588276e-07, "logits/chosen": -0.9842801094055176, "logits/rejected": -0.17733868956565857, "logps/chosen": -269.7094421386719, "logps/rejected": -537.8287963867188, "loss": 0.4163, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.2889798879623413, "rewards/margins": 1.93475341796875, "rewards/rejected": -2.223733425140381, "step": 90 }, { "epoch": 0.33, "grad_norm": 22.11813905754231, "learning_rate": 4.262711019652764e-07, "logits/chosen": -0.5012297630310059, "logits/rejected": -9.913742542266846e-05, "logps/chosen": -328.4926452636719, "logps/rejected": -584.0737915039062, "loss": 0.3672, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.3633125424385071, "rewards/margins": 2.0779290199279785, "rewards/rejected": -2.4412412643432617, "step": 100 }, { "epoch": 0.33, "eval_logits/chosen": -1.7892314195632935, "eval_logits/rejected": 0.012374745681881905, "eval_logps/chosen": -325.3137512207031, "eval_logps/rejected": -259.1305847167969, "eval_loss": 0.5652258396148682, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": -0.27210739254951477, "eval_rewards/margins": 0.3719515800476074, "eval_rewards/rejected": -0.6440589427947998, "eval_runtime": 62.3299, "eval_samples_per_second": 9.113, "eval_steps_per_second": 0.289, "step": 100 }, { "epoch": 0.36, "grad_norm": 27.459187382986496, "learning_rate": 4.0491362660864523e-07, "logits/chosen": -0.4417840838432312, "logits/rejected": 0.22044658660888672, "logps/chosen": -216.5067138671875, "logps/rejected": -570.1358032226562, "loss": 0.3445, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.2089937925338745, "rewards/margins": 2.619229555130005, "rewards/rejected": -2.82822322845459, "step": 110 }, { "epoch": 0.39, "grad_norm": 22.673980281488344, "learning_rate": 3.8153661521931215e-07, "logits/chosen": -0.25229763984680176, "logits/rejected": 0.42893147468566895, "logps/chosen": -287.91815185546875, "logps/rejected": -586.4707641601562, "loss": 0.3152, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.3577408790588379, "rewards/margins": 2.5164005756378174, "rewards/rejected": -2.8741414546966553, "step": 120 }, { "epoch": 0.42, "grad_norm": 22.548222714710292, "learning_rate": 3.5644482289126813e-07, "logits/chosen": -0.36232301592826843, "logits/rejected": 0.7560933828353882, "logps/chosen": -271.9062805175781, "logps/rejected": -581.7931518554688, "loss": 0.3195, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.2464122772216797, "rewards/margins": 2.863015651702881, "rewards/rejected": -3.1094279289245605, "step": 130 }, { "epoch": 0.46, "grad_norm": 23.598569359547252, "learning_rate": 3.299653595104602e-07, "logits/chosen": 0.2891393005847931, "logits/rejected": 1.1849420070648193, "logps/chosen": -278.166748046875, "logps/rejected": -591.746337890625, "loss": 0.2842, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.2587340772151947, "rewards/margins": 2.8643851280212402, "rewards/rejected": -3.123119354248047, "step": 140 }, { "epoch": 0.49, "grad_norm": 21.260621719260254, "learning_rate": 3.024434253771773e-07, "logits/chosen": -0.052896756678819656, "logits/rejected": 1.054487943649292, "logps/chosen": -255.85391235351562, "logps/rejected": -814.2733154296875, "loss": 0.2754, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.3424001932144165, "rewards/margins": 4.599584579467773, "rewards/rejected": -4.9419846534729, "step": 150 }, { "epoch": 0.52, "grad_norm": 24.166711565305622, "learning_rate": 2.7423781099222037e-07, "logits/chosen": -0.3343699276447296, "logits/rejected": 1.2091766595840454, "logps/chosen": -241.1891326904297, "logps/rejected": -667.7901000976562, "loss": 0.2519, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1985635608434677, "rewards/margins": 3.7136471271514893, "rewards/rejected": -3.912210464477539, "step": 160 }, { "epoch": 0.56, "grad_norm": 47.56416189863626, "learning_rate": 2.4571621967402515e-07, "logits/chosen": -0.14796659350395203, "logits/rejected": 0.9326593279838562, "logps/chosen": -231.9830322265625, "logps/rejected": -778.2721557617188, "loss": 0.2593, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.2619406580924988, "rewards/margins": 4.371499538421631, "rewards/rejected": -4.633440017700195, "step": 170 }, { "epoch": 0.59, "grad_norm": 23.682645133331448, "learning_rate": 2.1725047398357676e-07, "logits/chosen": 0.07808978855609894, "logits/rejected": 0.8551836013793945, "logps/chosen": -299.97503662109375, "logps/rejected": -720.2726440429688, "loss": 0.2309, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.3632424473762512, "rewards/margins": 3.9580280780792236, "rewards/rejected": -4.321269989013672, "step": 180 }, { "epoch": 0.62, "grad_norm": 24.593709800613684, "learning_rate": 1.892116684486976e-07, "logits/chosen": -0.4507158398628235, "logits/rejected": 1.0840203762054443, "logps/chosen": -231.7598114013672, "logps/rejected": -695.33837890625, "loss": 0.2105, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.3032680153846741, "rewards/margins": 4.107699871063232, "rewards/rejected": -4.410967826843262, "step": 190 }, { "epoch": 0.65, "grad_norm": 26.182421304771378, "learning_rate": 1.619653317793613e-07, "logits/chosen": -0.4781159460544586, "logits/rejected": 1.3689903020858765, "logps/chosen": -250.3883819580078, "logps/rejected": -800.0301513671875, "loss": 0.2495, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.2692957818508148, "rewards/margins": 4.869724750518799, "rewards/rejected": -5.139020919799805, "step": 200 }, { "epoch": 0.65, "eval_logits/chosen": -1.4423691034317017, "eval_logits/rejected": 0.8730748295783997, "eval_logps/chosen": -366.8614196777344, "eval_logps/rejected": -347.8152160644531, "eval_loss": 0.5144294500350952, "eval_rewards/accuracies": 0.7152777910232544, "eval_rewards/chosen": -0.6875841617584229, "eval_rewards/margins": 0.8433213233947754, "eval_rewards/rejected": -1.5309053659439087, "eval_runtime": 61.5529, "eval_samples_per_second": 9.228, "eval_steps_per_second": 0.292, "step": 200 }, { "epoch": 0.69, "grad_norm": 26.71444019699555, "learning_rate": 1.3586666164195438e-07, "logits/chosen": -0.11332446336746216, "logits/rejected": 1.1528918743133545, "logps/chosen": -242.18594360351562, "logps/rejected": -730.3687744140625, "loss": 0.2355, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.17774315178394318, "rewards/margins": 4.097687721252441, "rewards/rejected": -4.275431156158447, "step": 210 }, { "epoch": 0.72, "grad_norm": 18.78114156072228, "learning_rate": 1.1125589411448994e-07, "logits/chosen": -0.16574744880199432, "logits/rejected": 0.8906081914901733, "logps/chosen": -257.7942810058594, "logps/rejected": -799.7039794921875, "loss": 0.2147, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.10207457840442657, "rewards/margins": 4.533341407775879, "rewards/rejected": -4.635416507720947, "step": 220 }, { "epoch": 0.75, "grad_norm": 25.01487938388404, "learning_rate": 8.845386818900646e-08, "logits/chosen": -0.45102643966674805, "logits/rejected": 0.8137510418891907, "logps/chosen": -249.04348754882812, "logps/rejected": -713.5521240234375, "loss": 0.1934, "rewards/accuracies": 0.96875, "rewards/chosen": -0.15099892020225525, "rewards/margins": 4.111520290374756, "rewards/rejected": -4.262519359588623, "step": 230 }, { "epoch": 0.78, "grad_norm": 30.245012228652286, "learning_rate": 6.775784314464716e-08, "logits/chosen": -0.3785732388496399, "logits/rejected": 1.2067172527313232, "logps/chosen": -247.9203338623047, "logps/rejected": -789.9756469726562, "loss": 0.2121, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.31377214193344116, "rewards/margins": 4.594644546508789, "rewards/rejected": -4.908417701721191, "step": 240 }, { "epoch": 0.82, "grad_norm": 16.348157142642147, "learning_rate": 4.943762331835621e-08, "logits/chosen": -0.47478023171424866, "logits/rejected": 1.148115873336792, "logps/chosen": -280.11419677734375, "logps/rejected": -811.0616455078125, "loss": 0.1801, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.2764636278152466, "rewards/margins": 4.763599395751953, "rewards/rejected": -5.040062427520752, "step": 250 }, { "epoch": 0.85, "grad_norm": 35.57913356579002, "learning_rate": 3.373204079273473e-08, "logits/chosen": -0.3408397138118744, "logits/rejected": 1.2824140787124634, "logps/chosen": -244.1248321533203, "logps/rejected": -754.9166259765625, "loss": 0.2096, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.23486094176769257, "rewards/margins": 4.0956854820251465, "rewards/rejected": -4.3305463790893555, "step": 260 }, { "epoch": 0.88, "grad_norm": 22.72907844280726, "learning_rate": 2.084584185459709e-08, "logits/chosen": -0.27476102113723755, "logits/rejected": 1.213181495666504, "logps/chosen": -285.0547790527344, "logps/rejected": -713.6915283203125, "loss": 0.1874, "rewards/accuracies": 0.96875, "rewards/chosen": -0.21095514297485352, "rewards/margins": 4.196396827697754, "rewards/rejected": -4.407351493835449, "step": 270 }, { "epoch": 0.92, "grad_norm": 22.905550437426676, "learning_rate": 1.0947017814003257e-08, "logits/chosen": 0.07453560829162598, "logits/rejected": 1.153464913368225, "logps/chosen": -254.3527374267578, "logps/rejected": -810.6804809570312, "loss": 0.1795, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.2759082615375519, "rewards/margins": 4.727480411529541, "rewards/rejected": -5.003388404846191, "step": 280 }, { "epoch": 0.95, "grad_norm": 27.524134609799987, "learning_rate": 4.164614980622677e-09, "logits/chosen": -0.3413197100162506, "logits/rejected": 1.2102278470993042, "logps/chosen": -236.2258758544922, "logps/rejected": -899.1121215820312, "loss": 0.1885, "rewards/accuracies": 0.96875, "rewards/chosen": -0.2925792336463928, "rewards/margins": 5.496912956237793, "rewards/rejected": -5.789492130279541, "step": 290 }, { "epoch": 0.98, "grad_norm": 20.178235335586955, "learning_rate": 5.870523477368439e-10, "logits/chosen": 0.39968985319137573, "logits/rejected": 0.5744360089302063, "logps/chosen": -251.2977752685547, "logps/rejected": -803.5077514648438, "loss": 0.1708, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.24827781319618225, "rewards/margins": 4.899833679199219, "rewards/rejected": -5.148111343383789, "step": 300 }, { "epoch": 0.98, "eval_logits/chosen": -1.3915222883224487, "eval_logits/rejected": 1.0055533647537231, "eval_logps/chosen": -385.5208435058594, "eval_logps/rejected": -374.23699951171875, "eval_loss": 0.534546434879303, "eval_rewards/accuracies": 0.6527777910232544, "eval_rewards/chosen": -0.8741780519485474, "eval_rewards/margins": 0.9209451675415039, "eval_rewards/rejected": -1.7951232194900513, "eval_runtime": 60.8637, "eval_samples_per_second": 9.332, "eval_steps_per_second": 0.296, "step": 300 }, { "epoch": 1.0, "step": 306, "total_flos": 0.0, "train_loss": 0.004050073670405967, "train_runtime": 90.6565, "train_samples_per_second": 215.892, "train_steps_per_second": 3.375 } ], "logging_steps": 10, "max_steps": 306, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }