diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,2725 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 1359, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 6.209654163226836, + "learning_rate": 3.676470588235294e-09, + "logits/chosen": -1.4681403636932373, + "logits/rejected": -0.8821791410446167, + "logps/chosen": -326.7279052734375, + "logps/rejected": -393.66143798828125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/margins_max": 0.0, + "rewards/margins_min": 0.0, + "rewards/margins_std": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.01, + "grad_norm": 7.833527457219724, + "learning_rate": 3.676470588235294e-08, + "logits/chosen": -1.1554194688796997, + "logits/rejected": -1.069737434387207, + "logps/chosen": -260.11224365234375, + "logps/rejected": -278.21954345703125, + "loss": 0.693, + "rewards/accuracies": 0.3888888955116272, + "rewards/chosen": 0.001127632916904986, + "rewards/margins": 0.001941706403158605, + "rewards/margins_max": 0.0066660139709711075, + "rewards/margins_min": -0.0027826009318232536, + "rewards/margins_std": 0.006681179627776146, + "rewards/rejected": -0.0008140733698382974, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 5.539912592900294, + "learning_rate": 7.352941176470588e-08, + "logits/chosen": -1.1387906074523926, + "logits/rejected": -1.2151895761489868, + "logps/chosen": -226.5954132080078, + "logps/rejected": -194.97735595703125, + "loss": 0.6928, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0004003068897873163, + "rewards/margins": 0.0006232298910617828, + "rewards/margins_max": 0.0029323583003133535, + "rewards/margins_min": -0.0016858980525285006, + "rewards/margins_std": 0.0032655999530106783, + "rewards/rejected": -0.0002229233068646863, + "step": 20 + }, + { + "epoch": 0.02, + "grad_norm": 9.074297699065875, + "learning_rate": 1.1029411764705881e-07, + "logits/chosen": -0.9134622812271118, + "logits/rejected": -1.1061055660247803, + "logps/chosen": -286.9056091308594, + "logps/rejected": -306.0609130859375, + "loss": 0.693, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0033608167432248592, + "rewards/margins": 0.0010996473720297217, + "rewards/margins_max": 0.003882316406816244, + "rewards/margins_min": -0.0016830215463414788, + "rewards/margins_std": 0.003935288172215223, + "rewards/rejected": 0.0022611692547798157, + "step": 30 + }, + { + "epoch": 0.03, + "grad_norm": 6.630354149069055, + "learning_rate": 1.4705882352941175e-07, + "logits/chosen": -0.9963411092758179, + "logits/rejected": -1.3301975727081299, + "logps/chosen": -237.13650512695312, + "logps/rejected": -233.420654296875, + "loss": 0.6928, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0003052559623029083, + "rewards/margins": -0.00031435777782462537, + "rewards/margins_max": 0.003875983878970146, + "rewards/margins_min": -0.0045046997256577015, + "rewards/margins_std": 0.005926038138568401, + "rewards/rejected": 9.101861905946862e-06, + "step": 40 + }, + { + "epoch": 0.04, + "grad_norm": 5.11233300662631, + "learning_rate": 1.8382352941176472e-07, + "logits/chosen": -0.9264333844184875, + "logits/rejected": -1.0728222131729126, + "logps/chosen": -219.332763671875, + "logps/rejected": -220.7531280517578, + "loss": 0.6927, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0008567962795495987, + "rewards/margins": 0.0023684161715209484, + "rewards/margins_max": 0.00644815806299448, + "rewards/margins_min": -0.001711326651275158, + "rewards/margins_std": 0.005769627168774605, + "rewards/rejected": -0.001511619659140706, + "step": 50 + }, + { + "epoch": 0.04, + "grad_norm": 6.3634169311373245, + "learning_rate": 2.2058823529411763e-07, + "logits/chosen": -1.1445600986480713, + "logits/rejected": -1.3254610300064087, + "logps/chosen": -269.0830993652344, + "logps/rejected": -234.78726196289062, + "loss": 0.6913, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0016944237286224961, + "rewards/margins": 0.004126362036913633, + "rewards/margins_max": 0.006431617774069309, + "rewards/margins_min": 0.0018211060669273138, + "rewards/margins_std": 0.0032601244747638702, + "rewards/rejected": -0.0024319379590451717, + "step": 60 + }, + { + "epoch": 0.05, + "grad_norm": 5.891925253538908, + "learning_rate": 2.5735294117647057e-07, + "logits/chosen": -1.414535403251648, + "logits/rejected": -1.5020934343338013, + "logps/chosen": -295.0069580078125, + "logps/rejected": -283.39984130859375, + "loss": 0.6904, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.003248781431466341, + "rewards/margins": 0.005960130598396063, + "rewards/margins_max": 0.01083610113710165, + "rewards/margins_min": 0.0010841598268598318, + "rewards/margins_std": 0.006895663682371378, + "rewards/rejected": -0.0027113493997603655, + "step": 70 + }, + { + "epoch": 0.06, + "grad_norm": 27.08598358859191, + "learning_rate": 2.941176470588235e-07, + "logits/chosen": -1.1046959161758423, + "logits/rejected": -1.121512770652771, + "logps/chosen": -233.47909545898438, + "logps/rejected": -228.24447631835938, + "loss": 0.6879, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.0034522090572863817, + "rewards/margins": 0.010937942191958427, + "rewards/margins_max": 0.015041169710457325, + "rewards/margins_min": 0.006834716536104679, + "rewards/margins_std": 0.005802837200462818, + "rewards/rejected": -0.007485733367502689, + "step": 80 + }, + { + "epoch": 0.07, + "grad_norm": 5.043695997862729, + "learning_rate": 3.3088235294117644e-07, + "logits/chosen": -1.1739518642425537, + "logits/rejected": -1.1855499744415283, + "logps/chosen": -201.79940795898438, + "logps/rejected": -239.0184783935547, + "loss": 0.6861, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.005821605678647757, + "rewards/margins": 0.016343099996447563, + "rewards/margins_max": 0.022502990439534187, + "rewards/margins_min": 0.010183211416006088, + "rewards/margins_std": 0.008711399510502815, + "rewards/rejected": -0.010521495714783669, + "step": 90 + }, + { + "epoch": 0.07, + "grad_norm": 6.240948244130348, + "learning_rate": 3.6764705882352943e-07, + "logits/chosen": -1.226905345916748, + "logits/rejected": -1.402093529701233, + "logps/chosen": -276.8337707519531, + "logps/rejected": -248.4552459716797, + "loss": 0.6816, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.008135917596518993, + "rewards/margins": 0.024016622453927994, + "rewards/margins_max": 0.033059027045965195, + "rewards/margins_min": 0.014974219724535942, + "rewards/margins_std": 0.012787890620529652, + "rewards/rejected": -0.015880707651376724, + "step": 100 + }, + { + "epoch": 0.07, + "eval_logits/chosen": -1.1694660186767578, + "eval_logits/rejected": -1.1956290006637573, + "eval_logps/chosen": -345.8330993652344, + "eval_logps/rejected": -336.38427734375, + "eval_loss": 0.6919357776641846, + "eval_rewards/accuracies": 0.5416666865348816, + "eval_rewards/chosen": 2.3678861907683313e-05, + "eval_rewards/margins": 0.002057413337752223, + "eval_rewards/margins_max": 0.027664856985211372, + "eval_rewards/margins_min": -0.02450541965663433, + "eval_rewards/margins_std": 0.017513444647192955, + "eval_rewards/rejected": -0.002033734694123268, + "eval_runtime": 419.0939, + "eval_samples_per_second": 9.544, + "eval_steps_per_second": 0.15, + "step": 100 + }, + { + "epoch": 0.08, + "grad_norm": 5.566894084924568, + "learning_rate": 4.044117647058823e-07, + "logits/chosen": -1.3186091184616089, + "logits/rejected": -1.2772490978240967, + "logps/chosen": -379.5386657714844, + "logps/rejected": -246.4805450439453, + "loss": 0.6773, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.007909432053565979, + "rewards/margins": 0.024571493268013, + "rewards/margins_max": 0.036393627524375916, + "rewards/margins_min": 0.012749359011650085, + "rewards/margins_std": 0.016719024628400803, + "rewards/rejected": -0.01666206307709217, + "step": 110 + }, + { + "epoch": 0.09, + "grad_norm": 5.5380569300473175, + "learning_rate": 4.4117647058823526e-07, + "logits/chosen": -0.9861418008804321, + "logits/rejected": -1.2131096124649048, + "logps/chosen": -280.57135009765625, + "logps/rejected": -222.57217407226562, + "loss": 0.6696, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.017649073153734207, + "rewards/margins": 0.06698472797870636, + "rewards/margins_max": 0.10137734562158585, + "rewards/margins_min": 0.032592128962278366, + "rewards/margins_std": 0.0486384816467762, + "rewards/rejected": -0.04933566227555275, + "step": 120 + }, + { + "epoch": 0.1, + "grad_norm": 15.361998761868088, + "learning_rate": 4.779411764705882e-07, + "logits/chosen": -1.0785776376724243, + "logits/rejected": -0.898257851600647, + "logps/chosen": -283.1363525390625, + "logps/rejected": -214.15316772460938, + "loss": 0.6611, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02150227688252926, + "rewards/margins": 0.08108994364738464, + "rewards/margins_max": 0.10384353250265121, + "rewards/margins_min": 0.05833636596798897, + "rewards/margins_std": 0.032178424298763275, + "rewards/rejected": -0.059587668627500534, + "step": 130 + }, + { + "epoch": 0.1, + "grad_norm": 5.451118326133565, + "learning_rate": 4.999868030671756e-07, + "logits/chosen": -0.9526296854019165, + "logits/rejected": -0.9190389513969421, + "logps/chosen": -236.9579620361328, + "logps/rejected": -269.78240966796875, + "loss": 0.6586, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.015391260385513306, + "rewards/margins": 0.07113742083311081, + "rewards/margins_max": 0.10363912582397461, + "rewards/margins_min": 0.0386357307434082, + "rewards/margins_std": 0.04596434161067009, + "rewards/rejected": -0.0557461753487587, + "step": 140 + }, + { + "epoch": 0.11, + "grad_norm": 5.767935898839982, + "learning_rate": 4.998383535732973e-07, + "logits/chosen": -1.1545963287353516, + "logits/rejected": -1.3083815574645996, + "logps/chosen": -272.58392333984375, + "logps/rejected": -251.5518798828125, + "loss": 0.637, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.027133097872138023, + "rewards/margins": 0.1410999596118927, + "rewards/margins_max": 0.18325701355934143, + "rewards/margins_min": 0.09894292801618576, + "rewards/margins_std": 0.05961906909942627, + "rewards/rejected": -0.11396688222885132, + "step": 150 + }, + { + "epoch": 0.12, + "grad_norm": 9.075069261969173, + "learning_rate": 4.995250566954361e-07, + "logits/chosen": -1.2339075803756714, + "logits/rejected": -1.3427120447158813, + "logps/chosen": -278.045654296875, + "logps/rejected": -249.33016967773438, + "loss": 0.621, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.021242624148726463, + "rewards/margins": 0.1352781355381012, + "rewards/margins_max": 0.18264132738113403, + "rewards/margins_min": 0.08791494369506836, + "rewards/margins_std": 0.06698166579008102, + "rewards/rejected": -0.11403550952672958, + "step": 160 + }, + { + "epoch": 0.13, + "grad_norm": 5.288881821825863, + "learning_rate": 4.990471191519357e-07, + "logits/chosen": -1.2296701669692993, + "logits/rejected": -1.3137729167938232, + "logps/chosen": -271.8497009277344, + "logps/rejected": -257.36285400390625, + "loss": 0.6231, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.027015607804059982, + "rewards/margins": 0.2098924219608307, + "rewards/margins_max": 0.28914040327072144, + "rewards/margins_min": 0.13064439594745636, + "rewards/margins_std": 0.1120736226439476, + "rewards/rejected": -0.1828767955303192, + "step": 170 + }, + { + "epoch": 0.13, + "grad_norm": 4.722529871025577, + "learning_rate": 4.984048562937129e-07, + "logits/chosen": -1.104107141494751, + "logits/rejected": -1.2799243927001953, + "logps/chosen": -267.16131591796875, + "logps/rejected": -320.7081298828125, + "loss": 0.599, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.009208987466990948, + "rewards/margins": 0.15969006717205048, + "rewards/margins_max": 0.21888110041618347, + "rewards/margins_min": 0.1004989966750145, + "rewards/margins_std": 0.08370877802371979, + "rewards/rejected": -0.15048107504844666, + "step": 180 + }, + { + "epoch": 0.14, + "grad_norm": 4.706882294745904, + "learning_rate": 4.975986918961825e-07, + "logits/chosen": -1.1564669609069824, + "logits/rejected": -1.3084397315979004, + "logps/chosen": -287.58294677734375, + "logps/rejected": -235.0350799560547, + "loss": 0.5751, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.015282683074474335, + "rewards/margins": 0.2435847818851471, + "rewards/margins_max": 0.34512418508529663, + "rewards/margins_min": 0.14204536378383636, + "rewards/margins_std": 0.14359840750694275, + "rewards/rejected": -0.22830209136009216, + "step": 190 + }, + { + "epoch": 0.15, + "grad_norm": 6.10064839157769, + "learning_rate": 4.966291578796448e-07, + "logits/chosen": -1.2383778095245361, + "logits/rejected": -1.2699321508407593, + "logps/chosen": -246.54550170898438, + "logps/rejected": -299.7005920410156, + "loss": 0.5468, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.00011487379379104823, + "rewards/margins": 0.3421292304992676, + "rewards/margins_max": 0.5104770064353943, + "rewards/margins_min": 0.17378148436546326, + "rewards/margins_std": 0.23807969689369202, + "rewards/rejected": -0.3420143723487854, + "step": 200 + }, + { + "epoch": 0.15, + "eval_logits/chosen": -1.1466065645217896, + "eval_logits/rejected": -1.1508780717849731, + "eval_logps/chosen": -357.1989440917969, + "eval_logps/rejected": -350.5012512207031, + "eval_loss": 0.679348349571228, + "eval_rewards/accuracies": 0.579365074634552, + "eval_rewards/chosen": -0.11363494396209717, + "eval_rewards/margins": 0.029568513855338097, + "eval_rewards/margins_max": 0.24946285784244537, + "eval_rewards/margins_min": -0.1965206265449524, + "eval_rewards/margins_std": 0.1510881930589676, + "eval_rewards/rejected": -0.143203467130661, + "eval_runtime": 417.1858, + "eval_samples_per_second": 9.588, + "eval_steps_per_second": 0.151, + "step": 200 + }, + { + "epoch": 0.15, + "grad_norm": 7.130935509068585, + "learning_rate": 4.954968939583149e-07, + "logits/chosen": -0.82276850938797, + "logits/rejected": -1.0703377723693848, + "logps/chosen": -308.36981201171875, + "logps/rejected": -285.35321044921875, + "loss": 0.5356, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.06309916079044342, + "rewards/margins": 0.28919515013694763, + "rewards/margins_max": 0.4261881709098816, + "rewards/margins_min": 0.1522020846605301, + "rewards/margins_std": 0.1937374323606491, + "rewards/rejected": -0.35229426622390747, + "step": 210 + }, + { + "epoch": 0.16, + "grad_norm": 5.18218978578797, + "learning_rate": 4.942026472182297e-07, + "logits/chosen": -1.133894681930542, + "logits/rejected": -0.9819344282150269, + "logps/chosen": -357.5079345703125, + "logps/rejected": -290.6125183105469, + "loss": 0.5253, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12442765384912491, + "rewards/margins": 0.420942485332489, + "rewards/margins_max": 0.6367592215538025, + "rewards/margins_min": 0.20512573421001434, + "rewards/margins_std": 0.30521097779273987, + "rewards/rejected": -0.5453701615333557, + "step": 220 + }, + { + "epoch": 0.17, + "grad_norm": 6.451783890738213, + "learning_rate": 4.92747271624308e-07, + "logits/chosen": -1.1002264022827148, + "logits/rejected": -1.1289845705032349, + "logps/chosen": -307.14483642578125, + "logps/rejected": -330.2859802246094, + "loss": 0.494, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.14728474617004395, + "rewards/margins": 0.6978201866149902, + "rewards/margins_max": 1.030912160873413, + "rewards/margins_min": 0.36472827196121216, + "rewards/margins_std": 0.47106313705444336, + "rewards/rejected": -0.845104992389679, + "step": 230 + }, + { + "epoch": 0.18, + "grad_norm": 6.092745398297892, + "learning_rate": 4.911317274568909e-07, + "logits/chosen": -1.1411150693893433, + "logits/rejected": -1.1094478368759155, + "logps/chosen": -294.82550048828125, + "logps/rejected": -408.50970458984375, + "loss": 0.4335, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.1695319563150406, + "rewards/margins": 0.7641543745994568, + "rewards/margins_max": 1.1791099309921265, + "rewards/margins_min": 0.34919896721839905, + "rewards/margins_std": 0.586835503578186, + "rewards/rejected": -0.933686375617981, + "step": 240 + }, + { + "epoch": 0.18, + "grad_norm": 13.168192652840903, + "learning_rate": 4.89357080678133e-07, + "logits/chosen": -1.0950664281845093, + "logits/rejected": -1.240697979927063, + "logps/chosen": -269.51092529296875, + "logps/rejected": -296.14837646484375, + "loss": 0.4457, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.32782456278800964, + "rewards/margins": 0.6738765835762024, + "rewards/margins_max": 0.9242515563964844, + "rewards/margins_min": 0.423501580953598, + "rewards/margins_std": 0.35408374667167664, + "rewards/rejected": -1.0017011165618896, + "step": 250 + }, + { + "epoch": 0.19, + "grad_norm": 5.63734344760071, + "learning_rate": 4.874245022286637e-07, + "logits/chosen": -1.1380219459533691, + "logits/rejected": -0.8845139741897583, + "logps/chosen": -245.44686889648438, + "logps/rejected": -377.0203552246094, + "loss": 0.4311, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39118385314941406, + "rewards/margins": 0.8953431844711304, + "rewards/margins_max": 1.3878755569458008, + "rewards/margins_min": 0.40281087160110474, + "rewards/margins_std": 0.6965457797050476, + "rewards/rejected": -1.2865270376205444, + "step": 260 + }, + { + "epoch": 0.2, + "grad_norm": 5.2265046259602705, + "learning_rate": 4.853352672549815e-07, + "logits/chosen": -0.9493010640144348, + "logits/rejected": -0.9017621874809265, + "logps/chosen": -434.3206481933594, + "logps/rejected": -370.5262451171875, + "loss": 0.4015, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5151541829109192, + "rewards/margins": 0.754838764667511, + "rewards/margins_max": 1.1408073902130127, + "rewards/margins_min": 0.3688700795173645, + "rewards/margins_std": 0.5458420515060425, + "rewards/rejected": -1.2699930667877197, + "step": 270 + }, + { + "epoch": 0.21, + "grad_norm": 9.133504257567045, + "learning_rate": 4.830907542680918e-07, + "logits/chosen": -1.0836373567581177, + "logits/rejected": -0.9045012593269348, + "logps/chosen": -264.9966125488281, + "logps/rejected": -428.46539306640625, + "loss": 0.3691, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.5287370681762695, + "rewards/margins": 2.0548110008239746, + "rewards/margins_max": 3.620879650115967, + "rewards/margins_min": 0.48874226212501526, + "rewards/margins_std": 2.2147555351257324, + "rewards/rejected": -2.583548069000244, + "step": 280 + }, + { + "epoch": 0.21, + "grad_norm": 5.917073426239516, + "learning_rate": 4.806924442339425e-07, + "logits/chosen": -1.0086328983306885, + "logits/rejected": -0.8821426630020142, + "logps/chosen": -305.4242248535156, + "logps/rejected": -435.61737060546875, + "loss": 0.3813, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40681153535842896, + "rewards/margins": 1.1146458387374878, + "rewards/margins_max": 1.7330601215362549, + "rewards/margins_min": 0.4962318539619446, + "rewards/margins_std": 0.8745697140693665, + "rewards/rejected": -1.5214574337005615, + "step": 290 + }, + { + "epoch": 0.22, + "grad_norm": 7.285903481113855, + "learning_rate": 4.781419195962598e-07, + "logits/chosen": -0.997855544090271, + "logits/rejected": -0.9541902542114258, + "logps/chosen": -299.9017639160156, + "logps/rejected": -388.34246826171875, + "loss": 0.3597, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.6117764711380005, + "rewards/margins": 0.7978827953338623, + "rewards/margins_max": 1.1113024950027466, + "rewards/margins_min": 0.4844631552696228, + "rewards/margins_std": 0.44324231147766113, + "rewards/rejected": -1.4096593856811523, + "step": 300 + }, + { + "epoch": 0.22, + "eval_logits/chosen": -1.0628585815429688, + "eval_logits/rejected": -1.051159143447876, + "eval_logps/chosen": -439.3020324707031, + "eval_logps/rejected": -442.590576171875, + "eval_loss": 0.6787940859794617, + "eval_rewards/accuracies": 0.5714285969734192, + "eval_rewards/chosen": -0.9346656203269958, + "eval_rewards/margins": 0.12943138182163239, + "eval_rewards/margins_max": 1.008405089378357, + "eval_rewards/margins_min": -0.7319620251655579, + "eval_rewards/margins_std": 0.5778602361679077, + "eval_rewards/rejected": -1.0640968084335327, + "eval_runtime": 418.5023, + "eval_samples_per_second": 9.558, + "eval_steps_per_second": 0.151, + "step": 300 + }, + { + "epoch": 0.23, + "grad_norm": 10.782754556563047, + "learning_rate": 4.754408632324253e-07, + "logits/chosen": -1.1973422765731812, + "logits/rejected": -0.9350277781486511, + "logps/chosen": -318.24627685546875, + "logps/rejected": -525.6881103515625, + "loss": 0.3331, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.7586840987205505, + "rewards/margins": 2.185839891433716, + "rewards/margins_max": 2.856729745864868, + "rewards/margins_min": 1.5149496793746948, + "rewards/margins_std": 0.9487816691398621, + "rewards/rejected": -2.944523811340332, + "step": 310 + }, + { + "epoch": 0.24, + "grad_norm": 5.923061735298404, + "learning_rate": 4.725910573430866e-07, + "logits/chosen": -1.0679926872253418, + "logits/rejected": -0.945013165473938, + "logps/chosen": -365.65472412109375, + "logps/rejected": -405.1241760253906, + "loss": 0.3627, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8854155540466309, + "rewards/margins": 0.7383102178573608, + "rewards/margins_max": 1.1103546619415283, + "rewards/margins_min": 0.3662659227848053, + "rewards/margins_std": 0.5261501669883728, + "rewards/rejected": -1.6237256526947021, + "step": 320 + }, + { + "epoch": 0.24, + "grad_norm": 7.098229956454526, + "learning_rate": 4.6959438227623293e-07, + "logits/chosen": -1.1373931169509888, + "logits/rejected": -0.862761378288269, + "logps/chosen": -276.69671630859375, + "logps/rejected": -535.3623046875, + "loss": 0.2945, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7917782068252563, + "rewards/margins": 2.616485595703125, + "rewards/margins_max": 4.648871421813965, + "rewards/margins_min": 0.5841000080108643, + "rewards/margins_std": 2.874227523803711, + "rewards/rejected": -3.40826416015625, + "step": 330 + }, + { + "epoch": 0.25, + "grad_norm": 8.024599779277368, + "learning_rate": 4.664528152865105e-07, + "logits/chosen": -0.7721256613731384, + "logits/rejected": -0.8172466158866882, + "logps/chosen": -349.3388671875, + "logps/rejected": -486.68597412109375, + "loss": 0.2734, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.1078470945358276, + "rewards/margins": 1.6050605773925781, + "rewards/margins_max": 2.6409249305725098, + "rewards/margins_min": 0.5691961646080017, + "rewards/margins_std": 1.4649332761764526, + "rewards/rejected": -2.712907552719116, + "step": 340 + }, + { + "epoch": 0.26, + "grad_norm": 7.882556555729322, + "learning_rate": 4.6316842923059816e-07, + "logits/chosen": -1.0482970476150513, + "logits/rejected": -0.8200104832649231, + "logps/chosen": -331.43133544921875, + "logps/rejected": -785.7190551757812, + "loss": 0.3029, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9469194412231445, + "rewards/margins": 4.725480556488037, + "rewards/margins_max": 8.024388313293457, + "rewards/margins_min": 1.4265724420547485, + "rewards/margins_std": 4.665360450744629, + "rewards/rejected": -5.672399997711182, + "step": 350 + }, + { + "epoch": 0.26, + "grad_norm": 11.928720576155937, + "learning_rate": 4.5974339119950334e-07, + "logits/chosen": -0.9947048425674438, + "logits/rejected": -0.8432388305664062, + "logps/chosen": -433.1314392089844, + "logps/rejected": -583.6080932617188, + "loss": 0.2952, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.5369694232940674, + "rewards/margins": 1.8583523035049438, + "rewards/margins_max": 2.645268678665161, + "rewards/margins_min": 1.0714359283447266, + "rewards/margins_std": 1.1128677129745483, + "rewards/rejected": -3.3953216075897217, + "step": 360 + }, + { + "epoch": 0.27, + "grad_norm": 5.540434948793406, + "learning_rate": 4.5617996108867997e-07, + "logits/chosen": -0.8581298589706421, + "logits/rejected": -0.3961424231529236, + "logps/chosen": -412.405517578125, + "logps/rejected": -812.7819213867188, + "loss": 0.2262, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.617913007736206, + "rewards/margins": 4.383803844451904, + "rewards/margins_max": 7.294039249420166, + "rewards/margins_min": 1.4735687971115112, + "rewards/margins_std": 4.115694522857666, + "rewards/rejected": -6.001717567443848, + "step": 370 + }, + { + "epoch": 0.28, + "grad_norm": 17.370609516247765, + "learning_rate": 4.5248049010691304e-07, + "logits/chosen": -1.0891549587249756, + "logits/rejected": -0.69083172082901, + "logps/chosen": -347.2943420410156, + "logps/rejected": -703.4866943359375, + "loss": 0.2504, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3099722862243652, + "rewards/margins": 3.3620052337646484, + "rewards/margins_max": 5.631108283996582, + "rewards/margins_min": 1.092902421951294, + "rewards/margins_std": 3.208995819091797, + "rewards/rejected": -4.6719770431518555, + "step": 380 + }, + { + "epoch": 0.29, + "grad_norm": 9.292751258662012, + "learning_rate": 4.486474192249533e-07, + "logits/chosen": -1.0247005224227905, + "logits/rejected": -0.6028069853782654, + "logps/chosen": -442.56671142578125, + "logps/rejected": -660.4315185546875, + "loss": 0.2063, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4350707530975342, + "rewards/margins": 2.895498514175415, + "rewards/margins_max": 3.9468486309051514, + "rewards/margins_min": 1.8441476821899414, + "rewards/margins_std": 1.4868338108062744, + "rewards/rejected": -4.330569267272949, + "step": 390 + }, + { + "epoch": 0.29, + "grad_norm": 5.8794814274178755, + "learning_rate": 4.4468327756492504e-07, + "logits/chosen": -0.7380314469337463, + "logits/rejected": -0.5135469436645508, + "logps/chosen": -366.635986328125, + "logps/rejected": -607.7274780273438, + "loss": 0.2059, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.5760023593902588, + "rewards/margins": 2.5897469520568848, + "rewards/margins_max": 3.876375913619995, + "rewards/margins_min": 1.3031187057495117, + "rewards/margins_std": 1.8195674419403076, + "rewards/rejected": -4.165749549865723, + "step": 400 + }, + { + "epoch": 0.29, + "eval_logits/chosen": -0.8807379603385925, + "eval_logits/rejected": -0.8695055842399597, + "eval_logps/chosen": -542.6320190429688, + "eval_logps/rejected": -566.7861938476562, + "eval_loss": 0.7172051072120667, + "eval_rewards/accuracies": 0.5972222089767456, + "eval_rewards/chosen": -1.9679654836654663, + "eval_rewards/margins": 0.3380873501300812, + "eval_rewards/margins_max": 2.344252109527588, + "eval_rewards/margins_min": -1.388581395149231, + "eval_rewards/margins_std": 1.2205023765563965, + "eval_rewards/rejected": -2.3060529232025146, + "eval_runtime": 415.548, + "eval_samples_per_second": 9.626, + "eval_steps_per_second": 0.152, + "step": 400 + }, + { + "epoch": 0.3, + "grad_norm": 11.887878225278437, + "learning_rate": 4.405906807315705e-07, + "logits/chosen": -0.7631363868713379, + "logits/rejected": -0.14442148804664612, + "logps/chosen": -412.6502990722656, + "logps/rejected": -617.9203491210938, + "loss": 0.1867, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8327445983886719, + "rewards/margins": 2.406796932220459, + "rewards/margins_max": 3.8745861053466797, + "rewards/margins_min": 0.9390074014663696, + "rewards/margins_std": 2.075767755508423, + "rewards/rejected": -4.239541530609131, + "step": 410 + }, + { + "epoch": 0.31, + "grad_norm": 7.6667134274072195, + "learning_rate": 4.363723290864314e-07, + "logits/chosen": -0.8663452863693237, + "logits/rejected": -0.10104439407587051, + "logps/chosen": -507.49078369140625, + "logps/rejected": -824.9513549804688, + "loss": 0.221, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1396355628967285, + "rewards/margins": 4.1518402099609375, + "rewards/margins_max": 6.126175403594971, + "rewards/margins_min": 2.1775054931640625, + "rewards/margins_std": 2.792131185531616, + "rewards/rejected": -6.291476249694824, + "step": 420 + }, + { + "epoch": 0.32, + "grad_norm": 15.491732727187143, + "learning_rate": 4.3203100596610723e-07, + "logits/chosen": -0.5918745398521423, + "logits/rejected": -0.1715858429670334, + "logps/chosen": -453.0254821777344, + "logps/rejected": -597.4471435546875, + "loss": 0.1938, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7686259746551514, + "rewards/margins": 2.167701005935669, + "rewards/margins_max": 3.2129874229431152, + "rewards/margins_min": 1.1224141120910645, + "rewards/margins_std": 1.4782588481903076, + "rewards/rejected": -3.9363269805908203, + "step": 430 + }, + { + "epoch": 0.32, + "grad_norm": 8.708872027507127, + "learning_rate": 4.2756957584576436e-07, + "logits/chosen": -0.584081768989563, + "logits/rejected": 0.096702441573143, + "logps/chosen": -451.47509765625, + "logps/rejected": -978.1886596679688, + "loss": 0.2022, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9495502710342407, + "rewards/margins": 5.0804595947265625, + "rewards/margins_max": 8.66343879699707, + "rewards/margins_min": 1.4974806308746338, + "rewards/margins_std": 5.0670976638793945, + "rewards/rejected": -7.0300092697143555, + "step": 440 + }, + { + "epoch": 0.33, + "grad_norm": 7.538469505929578, + "learning_rate": 4.22990982449109e-07, + "logits/chosen": -0.6104982495307922, + "logits/rejected": -0.21484926342964172, + "logps/chosen": -472.439453125, + "logps/rejected": -757.8245849609375, + "loss": 0.1417, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.049053192138672, + "rewards/margins": 3.2169277667999268, + "rewards/margins_max": 4.436863422393799, + "rewards/margins_min": 1.9969921112060547, + "rewards/margins_std": 1.7252495288848877, + "rewards/rejected": -5.2659807205200195, + "step": 450 + }, + { + "epoch": 0.34, + "grad_norm": 10.825278124877386, + "learning_rate": 4.1829824680607104e-07, + "logits/chosen": -0.419607937335968, + "logits/rejected": 0.11389993131160736, + "logps/chosen": -435.0726623535156, + "logps/rejected": -784.734130859375, + "loss": 0.166, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.990121841430664, + "rewards/margins": 3.3705692291259766, + "rewards/margins_max": 5.105216026306152, + "rewards/margins_min": 1.6359226703643799, + "rewards/margins_std": 2.4531607627868652, + "rewards/rejected": -5.360690593719482, + "step": 460 + }, + { + "epoch": 0.35, + "grad_norm": 9.234625136932591, + "learning_rate": 4.134944652594794e-07, + "logits/chosen": -0.5118550062179565, + "logits/rejected": 0.10812608152627945, + "logps/chosen": -453.38848876953125, + "logps/rejected": -1031.4366455078125, + "loss": 0.1243, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.346630096435547, + "rewards/margins": 5.6441521644592285, + "rewards/margins_max": 8.810213088989258, + "rewards/margins_min": 2.4780914783477783, + "rewards/margins_std": 4.4774861335754395, + "rewards/rejected": -7.990782260894775, + "step": 470 + }, + { + "epoch": 0.35, + "grad_norm": 6.8345938121765775, + "learning_rate": 4.085828074220451e-07, + "logits/chosen": -0.4821593165397644, + "logits/rejected": 0.33621591329574585, + "logps/chosen": -612.152587890625, + "logps/rejected": -944.8914794921875, + "loss": 0.142, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.5179717540740967, + "rewards/margins": 4.587340354919434, + "rewards/margins_max": 7.297093868255615, + "rewards/margins_min": 1.8775880336761475, + "rewards/margins_std": 3.8321690559387207, + "rewards/rejected": -7.105312347412109, + "step": 480 + }, + { + "epoch": 0.36, + "grad_norm": 11.122230946658236, + "learning_rate": 4.035665140849994e-07, + "logits/chosen": -0.2719888985157013, + "logits/rejected": 0.40051668882369995, + "logps/chosen": -519.849365234375, + "logps/rejected": -941.7233276367188, + "loss": 0.1233, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.556196689605713, + "rewards/margins": 4.605846881866455, + "rewards/margins_max": 6.096743106842041, + "rewards/margins_min": 3.1149520874023438, + "rewards/margins_std": 2.1084442138671875, + "rewards/rejected": -7.162044525146484, + "step": 490 + }, + { + "epoch": 0.37, + "grad_norm": 4.543083572509446, + "learning_rate": 3.984488950797678e-07, + "logits/chosen": -0.19994431734085083, + "logits/rejected": 0.6510161757469177, + "logps/chosen": -450.4979553222656, + "logps/rejected": -926.5679931640625, + "loss": 0.1354, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4891421794891357, + "rewards/margins": 4.708044052124023, + "rewards/margins_max": 7.425878047943115, + "rewards/margins_min": 1.9902098178863525, + "rewards/margins_std": 3.8435981273651123, + "rewards/rejected": -7.197185516357422, + "step": 500 + }, + { + "epoch": 0.37, + "eval_logits/chosen": -0.25537678599357605, + "eval_logits/rejected": -0.16171453893184662, + "eval_logps/chosen": -661.367431640625, + "eval_logps/rejected": -714.6080322265625, + "eval_loss": 0.8081530928611755, + "eval_rewards/accuracies": 0.6190476417541504, + "eval_rewards/chosen": -3.155320167541504, + "eval_rewards/margins": 0.6289510130882263, + "eval_rewards/margins_max": 4.081821918487549, + "eval_rewards/margins_min": -2.2017109394073486, + "eval_rewards/margins_std": 2.03205943107605, + "eval_rewards/rejected": -3.784270763397217, + "eval_runtime": 416.2564, + "eval_samples_per_second": 9.609, + "eval_steps_per_second": 0.151, + "step": 500 + }, + { + "epoch": 0.38, + "grad_norm": 11.627659490001143, + "learning_rate": 3.9323332709408904e-07, + "logits/chosen": -0.09876732528209686, + "logits/rejected": 1.3991271257400513, + "logps/chosen": -600.6998291015625, + "logps/rejected": -968.8531494140625, + "loss": 0.1308, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.3052432537078857, + "rewards/margins": 4.294064998626709, + "rewards/margins_max": 6.973275184631348, + "rewards/margins_min": 1.614854097366333, + "rewards/margins_std": 3.788975954055786, + "rewards/rejected": -7.599307060241699, + "step": 510 + }, + { + "epoch": 0.38, + "grad_norm": 8.873005540995397, + "learning_rate": 3.879232514440227e-07, + "logits/chosen": -0.3379233479499817, + "logits/rejected": 0.6603206992149353, + "logps/chosen": -618.7060546875, + "logps/rejected": -1049.278076171875, + "loss": 0.1475, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.1586403846740723, + "rewards/margins": 4.834142208099365, + "rewards/margins_max": 6.6787214279174805, + "rewards/margins_min": 2.989562511444092, + "rewards/margins_std": 2.6086299419403076, + "rewards/rejected": -7.992783546447754, + "step": 520 + }, + { + "epoch": 0.39, + "grad_norm": 17.879342011641224, + "learning_rate": 3.825221718033129e-07, + "logits/chosen": 0.0034618079662323, + "logits/rejected": 0.864820122718811, + "logps/chosen": -471.9354553222656, + "logps/rejected": -985.2346801757812, + "loss": 0.1082, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.823118209838867, + "rewards/margins": 5.269505023956299, + "rewards/margins_max": 8.90275764465332, + "rewards/margins_min": 1.6362518072128296, + "rewards/margins_std": 5.138195991516113, + "rewards/rejected": -8.092622756958008, + "step": 530 + }, + { + "epoch": 0.4, + "grad_norm": 6.390466873902363, + "learning_rate": 3.7703365189160746e-07, + "logits/chosen": -0.07338769733905792, + "logits/rejected": 1.4749701023101807, + "logps/chosen": -539.89697265625, + "logps/rejected": -1210.6910400390625, + "loss": 0.089, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.919481039047241, + "rewards/margins": 7.211228370666504, + "rewards/margins_max": 11.77415943145752, + "rewards/margins_min": 2.6482949256896973, + "rewards/margins_std": 6.452960968017578, + "rewards/rejected": -10.130708694458008, + "step": 540 + }, + { + "epoch": 0.4, + "grad_norm": 12.554873275869042, + "learning_rate": 3.714613131230587e-07, + "logits/chosen": -0.22135767340660095, + "logits/rejected": 1.1000282764434814, + "logps/chosen": -720.9986572265625, + "logps/rejected": -1223.421630859375, + "loss": 0.1223, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.7738468647003174, + "rewards/margins": 5.606228828430176, + "rewards/margins_max": 8.233736038208008, + "rewards/margins_min": 2.9787204265594482, + "rewards/margins_std": 3.715857744216919, + "rewards/rejected": -9.380073547363281, + "step": 550 + }, + { + "epoch": 0.41, + "grad_norm": 40.923616793220184, + "learning_rate": 3.6580883221685533e-07, + "logits/chosen": -0.0870949998497963, + "logits/rejected": 1.078148603439331, + "logps/chosen": -505.99774169921875, + "logps/rejected": -1176.008544921875, + "loss": 0.0862, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.822312831878662, + "rewards/margins": 5.737250328063965, + "rewards/margins_max": 8.857365608215332, + "rewards/margins_min": 2.6171350479125977, + "rewards/margins_std": 4.412509918212891, + "rewards/rejected": -8.559562683105469, + "step": 560 + }, + { + "epoch": 0.42, + "grad_norm": 2.377000403316867, + "learning_rate": 3.6007993877126386e-07, + "logits/chosen": 0.25743845105171204, + "logits/rejected": 2.0459682941436768, + "logps/chosen": -640.0938110351562, + "logps/rejected": -1272.0159912109375, + "loss": 0.1269, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.934041976928711, + "rewards/margins": 6.4811530113220215, + "rewards/margins_max": 10.410442352294922, + "rewards/margins_min": 2.5518646240234375, + "rewards/margins_std": 5.556853294372559, + "rewards/rejected": -10.415196418762207, + "step": 570 + }, + { + "epoch": 0.43, + "grad_norm": 6.765929979770598, + "learning_rate": 3.5427841280277937e-07, + "logits/chosen": 0.19738076627254486, + "logits/rejected": 1.5706841945648193, + "logps/chosen": -643.2400512695312, + "logps/rejected": -1103.7618408203125, + "loss": 0.1024, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.439357280731201, + "rewards/margins": 4.518318176269531, + "rewards/margins_max": 6.311240196228027, + "rewards/margins_min": 2.725395441055298, + "rewards/margins_std": 2.5355746746063232, + "rewards/rejected": -7.957674980163574, + "step": 580 + }, + { + "epoch": 0.43, + "grad_norm": 2.3572788749229394, + "learning_rate": 3.484080822520096e-07, + "logits/chosen": 0.4655560553073883, + "logits/rejected": 1.286608099937439, + "logps/chosen": -555.6957397460938, + "logps/rejected": -1019.0916748046875, + "loss": 0.1491, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.497023820877075, + "rewards/margins": 4.476337432861328, + "rewards/margins_max": 6.756206512451172, + "rewards/margins_min": 2.1964690685272217, + "rewards/margins_std": 3.2242209911346436, + "rewards/rejected": -7.973361968994141, + "step": 590 + }, + { + "epoch": 0.44, + "grad_norm": 8.25918903118385, + "learning_rate": 3.4247282045793797e-07, + "logits/chosen": 0.2085554599761963, + "logits/rejected": 1.3560742139816284, + "logps/chosen": -595.1603393554688, + "logps/rejected": -1199.1165771484375, + "loss": 0.1327, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.877821445465088, + "rewards/margins": 6.124663352966309, + "rewards/margins_max": 9.755376815795898, + "rewards/margins_min": 2.493950843811035, + "rewards/margins_std": 5.134603500366211, + "rewards/rejected": -10.002485275268555, + "step": 600 + }, + { + "epoch": 0.44, + "eval_logits/chosen": 0.017259376123547554, + "eval_logits/rejected": 0.1599506437778473, + "eval_logps/chosen": -731.00927734375, + "eval_logps/rejected": -798.1055908203125, + "eval_loss": 0.8436357378959656, + "eval_rewards/accuracies": 0.6190476417541504, + "eval_rewards/chosen": -3.851738452911377, + "eval_rewards/margins": 0.7675079107284546, + "eval_rewards/margins_max": 4.83132266998291, + "eval_rewards/margins_min": -2.431659460067749, + "eval_rewards/margins_std": 2.352627992630005, + "eval_rewards/rejected": -4.619246482849121, + "eval_runtime": 415.8421, + "eval_samples_per_second": 9.619, + "eval_steps_per_second": 0.151, + "step": 600 + }, + { + "epoch": 0.45, + "grad_norm": 13.982869383101132, + "learning_rate": 3.3647654360223144e-07, + "logits/chosen": -0.18186531960964203, + "logits/rejected": 1.947683572769165, + "logps/chosen": -636.12548828125, + "logps/rejected": -1468.92333984375, + "loss": 0.08, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.338965654373169, + "rewards/margins": 8.887590408325195, + "rewards/margins_max": 12.813148498535156, + "rewards/margins_min": 4.962031364440918, + "rewards/margins_std": 5.551577568054199, + "rewards/rejected": -12.226556777954102, + "step": 610 + }, + { + "epoch": 0.46, + "grad_norm": 24.68214704261548, + "learning_rate": 3.30423208125281e-07, + "logits/chosen": -0.13235849142074585, + "logits/rejected": 1.7915821075439453, + "logps/chosen": -697.5199584960938, + "logps/rejected": -1485.5936279296875, + "loss": 0.0765, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4117112159729004, + "rewards/margins": 8.921293258666992, + "rewards/margins_max": 12.249357223510742, + "rewards/margins_min": 5.593228340148926, + "rewards/margins_std": 4.7065935134887695, + "rewards/rejected": -12.333003044128418, + "step": 620 + }, + { + "epoch": 0.46, + "grad_norm": 10.905617995495655, + "learning_rate": 3.2431680811567833e-07, + "logits/chosen": -0.12053610384464264, + "logits/rejected": 1.8949730396270752, + "logps/chosen": -630.9464111328125, + "logps/rejected": -1220.925048828125, + "loss": 0.1229, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.5450587272644043, + "rewards/margins": 6.405971527099609, + "rewards/margins_max": 10.655710220336914, + "rewards/margins_min": 2.1562342643737793, + "rewards/margins_std": 6.010036945343018, + "rewards/rejected": -9.951030731201172, + "step": 630 + }, + { + "epoch": 0.47, + "grad_norm": 10.94150157360822, + "learning_rate": 3.1816137267485136e-07, + "logits/chosen": 0.027946263551712036, + "logits/rejected": 1.485925555229187, + "logps/chosen": -646.646728515625, + "logps/rejected": -1238.3758544921875, + "loss": 0.1477, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6477348804473877, + "rewards/margins": 6.220660209655762, + "rewards/margins_max": 9.265599250793457, + "rewards/margins_min": 3.1757211685180664, + "rewards/margins_std": 4.306193828582764, + "rewards/rejected": -9.86839485168457, + "step": 640 + }, + { + "epoch": 0.48, + "grad_norm": 17.595942677326722, + "learning_rate": 3.1196096325859815e-07, + "logits/chosen": -0.05433236435055733, + "logits/rejected": 2.2038755416870117, + "logps/chosen": -578.5730590820312, + "logps/rejected": -1498.58203125, + "loss": 0.1156, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.005871534347534, + "rewards/margins": 9.676332473754883, + "rewards/margins_max": 15.599041938781738, + "rewards/margins_min": 3.753622531890869, + "rewards/margins_std": 8.3759765625, + "rewards/rejected": -12.68220329284668, + "step": 650 + }, + { + "epoch": 0.49, + "grad_norm": 7.356684331269297, + "learning_rate": 3.057196709972727e-07, + "logits/chosen": 0.11046739667654037, + "logits/rejected": 2.175269365310669, + "logps/chosen": -674.2919921875, + "logps/rejected": -1267.6500244140625, + "loss": 0.0959, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1146275997161865, + "rewards/margins": 7.452083587646484, + "rewards/margins_max": 10.800088882446289, + "rewards/margins_min": 4.104078769683838, + "rewards/margins_std": 4.734793663024902, + "rewards/rejected": -10.56671142578125, + "step": 660 + }, + { + "epoch": 0.49, + "grad_norm": 7.038311259187171, + "learning_rate": 2.9944161399639086e-07, + "logits/chosen": 0.21353694796562195, + "logits/rejected": 1.7908731698989868, + "logps/chosen": -616.1519165039062, + "logps/rejected": -1157.595947265625, + "loss": 0.0791, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5106310844421387, + "rewards/margins": 5.999436855316162, + "rewards/margins_max": 8.261363983154297, + "rewards/margins_min": 3.7375106811523438, + "rewards/margins_std": 3.198847532272339, + "rewards/rejected": -9.510068893432617, + "step": 670 + }, + { + "epoch": 0.5, + "grad_norm": 3.3985205014158293, + "learning_rate": 2.9313093461943824e-07, + "logits/chosen": 0.07152876257896423, + "logits/rejected": 1.9080642461776733, + "logps/chosen": -658.859619140625, + "logps/rejected": -1418.1920166015625, + "loss": 0.0643, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6242880821228027, + "rewards/margins": 8.017306327819824, + "rewards/margins_max": 11.628385543823242, + "rewards/margins_min": 4.40622615814209, + "rewards/margins_std": 5.106837272644043, + "rewards/rejected": -11.641593933105469, + "step": 680 + }, + { + "epoch": 0.51, + "grad_norm": 12.970507914933444, + "learning_rate": 2.8679179675467104e-07, + "logits/chosen": 0.5070677995681763, + "logits/rejected": 2.8454136848449707, + "logps/chosen": -661.779296875, + "logps/rejected": -1588.948974609375, + "loss": 0.0704, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.405068397521973, + "rewards/margins": 9.361727714538574, + "rewards/margins_max": 15.706764221191406, + "rewards/margins_min": 3.016690731048584, + "rewards/margins_std": 8.973237037658691, + "rewards/rejected": -13.766797065734863, + "step": 690 + }, + { + "epoch": 0.52, + "grad_norm": 25.37176614242638, + "learning_rate": 2.80428383067716e-07, + "logits/chosen": -0.056868601590394974, + "logits/rejected": 2.1195578575134277, + "logps/chosen": -643.5035400390625, + "logps/rejected": -1405.5491943359375, + "loss": 0.0777, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8768184185028076, + "rewards/margins": 7.676672458648682, + "rewards/margins_max": 11.38581657409668, + "rewards/margins_min": 3.967529296875, + "rewards/margins_std": 5.245521545410156, + "rewards/rejected": -11.553489685058594, + "step": 700 + }, + { + "epoch": 0.52, + "eval_logits/chosen": 0.4162614345550537, + "eval_logits/rejected": 0.6300503015518188, + "eval_logps/chosen": -840.1605224609375, + "eval_logps/rejected": -929.0051879882812, + "eval_loss": 0.9893194437026978, + "eval_rewards/accuracies": 0.6190476417541504, + "eval_rewards/chosen": -4.943249225616455, + "eval_rewards/margins": 0.9849926233291626, + "eval_rewards/margins_max": 6.353243827819824, + "eval_rewards/margins_min": -3.295872688293457, + "eval_rewards/margins_std": 3.1250360012054443, + "eval_rewards/rejected": -5.9282426834106445, + "eval_runtime": 421.7747, + "eval_samples_per_second": 9.484, + "eval_steps_per_second": 0.149, + "step": 700 + }, + { + "epoch": 0.52, + "grad_norm": 8.759540837366416, + "learning_rate": 2.7404489224177973e-07, + "logits/chosen": 0.6560094356536865, + "logits/rejected": 3.2553603649139404, + "logps/chosen": -783.5775756835938, + "logps/rejected": -1650.650146484375, + "loss": 0.1101, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.931197166442871, + "rewards/margins": 8.854988098144531, + "rewards/margins_max": 12.33712100982666, + "rewards/margins_min": 5.372857093811035, + "rewards/margins_std": 4.924478054046631, + "rewards/rejected": -13.786186218261719, + "step": 710 + }, + { + "epoch": 0.53, + "grad_norm": 53.838974553307395, + "learning_rate": 2.676455362072894e-07, + "logits/chosen": 0.9320627450942993, + "logits/rejected": 3.438016414642334, + "logps/chosen": -699.7535400390625, + "logps/rejected": -1655.8385009765625, + "loss": 0.0852, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.045767307281494, + "rewards/margins": 9.726736068725586, + "rewards/margins_max": 13.513631820678711, + "rewards/margins_min": 5.939839839935303, + "rewards/margins_std": 5.355479717254639, + "rewards/rejected": -14.772501945495605, + "step": 720 + }, + { + "epoch": 0.54, + "grad_norm": 0.8391615669250567, + "learning_rate": 2.612345373627937e-07, + "logits/chosen": 0.2621687650680542, + "logits/rejected": 1.9230273962020874, + "logps/chosen": -639.4342041015625, + "logps/rejected": -1445.03271484375, + "loss": 0.1804, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.050388336181641, + "rewards/margins": 8.062755584716797, + "rewards/margins_max": 11.662395477294922, + "rewards/margins_min": 4.4631171226501465, + "rewards/margins_std": 5.090658664703369, + "rewards/rejected": -12.113143920898438, + "step": 730 + }, + { + "epoch": 0.54, + "grad_norm": 18.77671464276547, + "learning_rate": 2.54816125788955e-07, + "logits/chosen": 0.5534690022468567, + "logits/rejected": 2.526615858078003, + "logps/chosen": -709.9898681640625, + "logps/rejected": -1459.970947265625, + "loss": 0.1361, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.665217399597168, + "rewards/margins": 7.5482282638549805, + "rewards/margins_max": 12.083941459655762, + "rewards/margins_min": 3.0125153064727783, + "rewards/margins_std": 6.414466857910156, + "rewards/rejected": -12.213445663452148, + "step": 740 + }, + { + "epoch": 0.55, + "grad_norm": 6.37822813578148, + "learning_rate": 2.4839453645747467e-07, + "logits/chosen": 0.2104567587375641, + "logits/rejected": 1.8120098114013672, + "logps/chosen": -643.4108276367188, + "logps/rejected": -1417.44921875, + "loss": 0.1312, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.7755866050720215, + "rewards/margins": 8.031917572021484, + "rewards/margins_max": 12.62381362915039, + "rewards/margins_min": 3.440018892288208, + "rewards/margins_std": 6.493924140930176, + "rewards/rejected": -11.807502746582031, + "step": 750 + }, + { + "epoch": 0.56, + "grad_norm": 16.8248388008373, + "learning_rate": 2.4197400643678987e-07, + "logits/chosen": 0.24539189040660858, + "logits/rejected": 1.6847679615020752, + "logps/chosen": -639.7948608398438, + "logps/rejected": -1011.7283935546875, + "loss": 0.0821, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.634861707687378, + "rewards/margins": 4.383325576782227, + "rewards/margins_max": 7.218289852142334, + "rewards/margins_min": 1.5483614206314087, + "rewards/margins_std": 4.009244918823242, + "rewards/rejected": -8.018186569213867, + "step": 760 + }, + { + "epoch": 0.57, + "grad_norm": 7.954736320138308, + "learning_rate": 2.3555877209638726e-07, + "logits/chosen": 0.0611066035926342, + "logits/rejected": 1.33302640914917, + "logps/chosen": -672.7412719726562, + "logps/rejected": -1782.3125, + "loss": 0.0906, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4519970417022705, + "rewards/margins": 11.562549591064453, + "rewards/margins_max": 20.07329559326172, + "rewards/margins_min": 3.0518016815185547, + "rewards/margins_std": 12.036015510559082, + "rewards/rejected": -15.014546394348145, + "step": 770 + }, + { + "epoch": 0.57, + "grad_norm": 11.05108228058454, + "learning_rate": 2.2915306631157817e-07, + "logits/chosen": 0.2885664105415344, + "logits/rejected": 2.206385612487793, + "logps/chosen": -648.3999633789062, + "logps/rejected": -1299.401123046875, + "loss": 0.1085, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7839324474334717, + "rewards/margins": 7.008673191070557, + "rewards/margins_max": 9.869766235351562, + "rewards/margins_min": 4.147579669952393, + "rewards/margins_std": 4.046196937561035, + "rewards/rejected": -10.792604446411133, + "step": 780 + }, + { + "epoch": 0.58, + "grad_norm": 32.49887802957626, + "learning_rate": 2.2276111567057887e-07, + "logits/chosen": 0.22940261662006378, + "logits/rejected": 1.6958719491958618, + "logps/chosen": -593.3724365234375, + "logps/rejected": -1174.9674072265625, + "loss": 0.1111, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.6294262409210205, + "rewards/margins": 5.80316162109375, + "rewards/margins_max": 8.796818733215332, + "rewards/margins_min": 2.8095040321350098, + "rewards/margins_std": 4.233671188354492, + "rewards/rejected": -9.432588577270508, + "step": 790 + }, + { + "epoch": 0.59, + "grad_norm": 7.06163362995566, + "learning_rate": 2.1638713768573936e-07, + "logits/chosen": 0.06335971504449844, + "logits/rejected": 1.4285287857055664, + "logps/chosen": -595.5140380859375, + "logps/rejected": -1295.677490234375, + "loss": 0.0638, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.581036329269409, + "rewards/margins": 7.286231994628906, + "rewards/margins_max": 11.22960090637207, + "rewards/margins_min": 3.342862606048584, + "rewards/margins_std": 5.576765537261963, + "rewards/rejected": -10.867268562316895, + "step": 800 + }, + { + "epoch": 0.59, + "eval_logits/chosen": 0.12438549101352692, + "eval_logits/rejected": 0.28890377283096313, + "eval_logps/chosen": -732.38525390625, + "eval_logps/rejected": -799.7516479492188, + "eval_loss": 0.8086485862731934, + "eval_rewards/accuracies": 0.6190476417541504, + "eval_rewards/chosen": -3.8654978275299072, + "eval_rewards/margins": 0.7702099680900574, + "eval_rewards/margins_max": 4.502103328704834, + "eval_rewards/margins_min": -2.291940450668335, + "eval_rewards/margins_std": 2.2426791191101074, + "eval_rewards/rejected": -4.635707378387451, + "eval_runtime": 417.0386, + "eval_samples_per_second": 9.591, + "eval_steps_per_second": 0.151, + "step": 800 + }, + { + "epoch": 0.6, + "grad_norm": 5.3968517354258125, + "learning_rate": 2.100353380107609e-07, + "logits/chosen": 0.23273587226867676, + "logits/rejected": 1.9462811946868896, + "logps/chosen": -776.3011474609375, + "logps/rejected": -1441.837158203125, + "loss": 0.1, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.670359134674072, + "rewards/margins": 7.1968560218811035, + "rewards/margins_max": 11.221755981445312, + "rewards/margins_min": 3.171954393386841, + "rewards/margins_std": 5.6920695304870605, + "rewards/rejected": -11.86721420288086, + "step": 810 + }, + { + "epoch": 0.6, + "grad_norm": 15.03932252297939, + "learning_rate": 2.0370990766573698e-07, + "logits/chosen": -0.10733046382665634, + "logits/rejected": 1.8043702840805054, + "logps/chosen": -650.6616821289062, + "logps/rejected": -1616.010986328125, + "loss": 0.0713, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1738812923431396, + "rewards/margins": 10.541234970092773, + "rewards/margins_max": 15.04127311706543, + "rewards/margins_min": 6.041195392608643, + "rewards/margins_std": 6.364017009735107, + "rewards/rejected": -13.715115547180176, + "step": 820 + }, + { + "epoch": 0.61, + "grad_norm": 31.097886733723477, + "learning_rate": 1.974150202718513e-07, + "logits/chosen": 0.08039845526218414, + "logits/rejected": 2.343336582183838, + "logps/chosen": -534.8485717773438, + "logps/rejected": -1418.825439453125, + "loss": 0.0511, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1201188564300537, + "rewards/margins": 8.804471969604492, + "rewards/margins_max": 12.42898178100586, + "rewards/margins_min": 5.179962635040283, + "rewards/margins_std": 5.12583065032959, + "rewards/rejected": -11.924591064453125, + "step": 830 + }, + { + "epoch": 0.62, + "grad_norm": 16.60986174297272, + "learning_rate": 1.9115482929755445e-07, + "logits/chosen": 0.24223566055297852, + "logits/rejected": 1.6932157278060913, + "logps/chosen": -570.802978515625, + "logps/rejected": -1331.71533203125, + "loss": 0.0856, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.493419647216797, + "rewards/margins": 7.847373962402344, + "rewards/margins_max": 11.789865493774414, + "rewards/margins_min": 3.9048819541931152, + "rewards/margins_std": 5.575525760650635, + "rewards/rejected": -11.34079360961914, + "step": 840 + }, + { + "epoch": 0.63, + "grad_norm": 13.502668975247548, + "learning_rate": 1.8493346531803887e-07, + "logits/chosen": 0.48027992248535156, + "logits/rejected": 2.202148675918579, + "logps/chosen": -596.4915161132812, + "logps/rejected": -1282.6644287109375, + "loss": 0.0983, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.9702115058898926, + "rewards/margins": 6.948336124420166, + "rewards/margins_max": 9.718558311462402, + "rewards/margins_min": 4.178112506866455, + "rewards/margins_std": 3.9176864624023438, + "rewards/rejected": -10.918546676635742, + "step": 850 + }, + { + "epoch": 0.63, + "grad_norm": 23.139494234389517, + "learning_rate": 1.7875503328981807e-07, + "logits/chosen": 0.3601033091545105, + "logits/rejected": 2.474608898162842, + "logps/chosen": -652.9142456054688, + "logps/rejected": -1604.696533203125, + "loss": 0.0605, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.9101157188415527, + "rewards/margins": 9.86131477355957, + "rewards/margins_max": 14.181074142456055, + "rewards/margins_min": 5.541555881500244, + "rewards/margins_std": 6.1090617179870605, + "rewards/rejected": -13.771429061889648, + "step": 860 + }, + { + "epoch": 0.64, + "grad_norm": 14.349796836286524, + "learning_rate": 1.7262360984221006e-07, + "logits/chosen": 0.012769157998263836, + "logits/rejected": 1.9421314001083374, + "logps/chosen": -664.3881225585938, + "logps/rejected": -1434.8948974609375, + "loss": 0.1274, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.80120849609375, + "rewards/margins": 8.072778701782227, + "rewards/margins_max": 11.85603141784668, + "rewards/margins_min": 4.289526462554932, + "rewards/margins_std": 5.350326061248779, + "rewards/rejected": -11.873987197875977, + "step": 870 + }, + { + "epoch": 0.65, + "grad_norm": 15.485607186999017, + "learning_rate": 1.6654324058751175e-07, + "logits/chosen": 0.3775918483734131, + "logits/rejected": 1.973515510559082, + "logps/chosen": -713.2658081054688, + "logps/rejected": -1631.826904296875, + "loss": 0.0662, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.8164262771606445, + "rewards/margins": 9.047931671142578, + "rewards/margins_max": 12.981298446655273, + "rewards/margins_min": 5.114563941955566, + "rewards/margins_std": 5.562621116638184, + "rewards/rejected": -13.864356994628906, + "step": 880 + }, + { + "epoch": 0.65, + "grad_norm": 15.607832046954224, + "learning_rate": 1.6051793745163812e-07, + "logits/chosen": 0.6472679376602173, + "logits/rejected": 2.5574803352355957, + "logps/chosen": -689.5281982421875, + "logps/rejected": -1642.12109375, + "loss": 0.1011, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.642444610595703, + "rewards/margins": 9.717334747314453, + "rewards/margins_max": 15.204099655151367, + "rewards/margins_min": 4.230566501617432, + "rewards/margins_std": 7.75946044921875, + "rewards/rejected": -14.359777450561523, + "step": 890 + }, + { + "epoch": 0.66, + "grad_norm": 45.565136882193066, + "learning_rate": 1.5455167602698915e-07, + "logits/chosen": 0.06020700931549072, + "logits/rejected": 2.2921700477600098, + "logps/chosen": -727.0872192382812, + "logps/rejected": -1482.33837890625, + "loss": 0.0997, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.481110572814941, + "rewards/margins": 7.914282321929932, + "rewards/margins_max": 10.535211563110352, + "rewards/margins_min": 5.293350696563721, + "rewards/margins_std": 3.7065558433532715, + "rewards/rejected": -12.395392417907715, + "step": 900 + }, + { + "epoch": 0.66, + "eval_logits/chosen": 0.20550121366977692, + "eval_logits/rejected": 0.3917555809020996, + "eval_logps/chosen": -789.8953857421875, + "eval_logps/rejected": -866.7603149414062, + "eval_loss": 0.8639366030693054, + "eval_rewards/accuracies": 0.6269841194152832, + "eval_rewards/chosen": -4.4405999183654785, + "eval_rewards/margins": 0.8651944398880005, + "eval_rewards/margins_max": 5.159237861633301, + "eval_rewards/margins_min": -2.6377525329589844, + "eval_rewards/margins_std": 2.5658202171325684, + "eval_rewards/rejected": -5.305793762207031, + "eval_runtime": 419.7425, + "eval_samples_per_second": 9.53, + "eval_steps_per_second": 0.15, + "step": 900 + }, + { + "epoch": 0.67, + "grad_norm": 7.254262825774854, + "learning_rate": 1.4864839294928924e-07, + "logits/chosen": 0.2960719168186188, + "logits/rejected": 2.519636392593384, + "logps/chosen": -667.0858154296875, + "logps/rejected": -2054.51953125, + "loss": 0.1092, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.099366664886475, + "rewards/margins": 13.694231033325195, + "rewards/margins_max": 20.22653579711914, + "rewards/margins_min": 7.161923408508301, + "rewards/margins_std": 9.238077163696289, + "rewards/rejected": -17.793596267700195, + "step": 910 + }, + { + "epoch": 0.68, + "grad_norm": 19.00706315113973, + "learning_rate": 1.428119833001315e-07, + "logits/chosen": 0.011763498187065125, + "logits/rejected": 2.5436980724334717, + "logps/chosen": -683.8145751953125, + "logps/rejected": -1476.839111328125, + "loss": 0.0479, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.7376608848571777, + "rewards/margins": 9.0157470703125, + "rewards/margins_max": 13.42829418182373, + "rewards/margins_min": 4.603199481964111, + "rewards/margins_std": 6.2402849197387695, + "rewards/rejected": -12.75340747833252, + "step": 920 + }, + { + "epoch": 0.68, + "grad_norm": 2.449628285920275, + "learning_rate": 1.370462980369401e-07, + "logits/chosen": 0.11705155670642853, + "logits/rejected": 1.5357266664505005, + "logps/chosen": -766.4974365234375, + "logps/rejected": -1289.979248046875, + "loss": 0.0636, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.693875312805176, + "rewards/margins": 5.769114017486572, + "rewards/margins_max": 7.741427421569824, + "rewards/margins_min": 3.796800136566162, + "rewards/margins_std": 2.7892730236053467, + "rewards/rejected": -10.462987899780273, + "step": 930 + }, + { + "epoch": 0.69, + "grad_norm": 11.213992357762015, + "learning_rate": 1.3135514145204606e-07, + "logits/chosen": 0.21615874767303467, + "logits/rejected": 2.0779476165771484, + "logps/chosen": -605.4188232421875, + "logps/rejected": -1608.945556640625, + "loss": 0.0971, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8029799461364746, + "rewards/margins": 10.208128929138184, + "rewards/margins_max": 17.389694213867188, + "rewards/margins_min": 3.0265650749206543, + "rewards/margins_std": 10.15626335144043, + "rewards/rejected": -14.011110305786133, + "step": 940 + }, + { + "epoch": 0.7, + "grad_norm": 4.371361045173521, + "learning_rate": 1.257422686625539e-07, + "logits/chosen": 0.16180220246315002, + "logits/rejected": 2.055144786834717, + "logps/chosen": -682.2508544921875, + "logps/rejected": -1589.9964599609375, + "loss": 0.0906, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9322009086608887, + "rewards/margins": 9.378369331359863, + "rewards/margins_max": 14.781808853149414, + "rewards/margins_min": 3.9749279022216797, + "rewards/margins_std": 7.641619682312012, + "rewards/rejected": -13.310567855834961, + "step": 950 + }, + { + "epoch": 0.71, + "grad_norm": 3.4893980542106102, + "learning_rate": 1.2021138313265444e-07, + "logits/chosen": 0.11532745510339737, + "logits/rejected": 1.866121530532837, + "logps/chosen": -634.554931640625, + "logps/rejected": -1674.252197265625, + "loss": 0.1202, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.853950023651123, + "rewards/margins": 10.867055892944336, + "rewards/margins_max": 18.760677337646484, + "rewards/margins_min": 2.97343373298645, + "rewards/margins_std": 11.163267135620117, + "rewards/rejected": -14.7210054397583, + "step": 960 + }, + { + "epoch": 0.71, + "grad_norm": 14.447489915734623, + "learning_rate": 1.1476613423001974e-07, + "logits/chosen": 0.17886893451213837, + "logits/rejected": 1.89533269405365, + "logps/chosen": -677.0606079101562, + "logps/rejected": -1261.5345458984375, + "loss": 0.0658, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.194746971130371, + "rewards/margins": 5.769103050231934, + "rewards/margins_max": 7.97817325592041, + "rewards/margins_min": 3.560032606124878, + "rewards/margins_std": 3.1240971088409424, + "rewards/rejected": -9.963850021362305, + "step": 970 + }, + { + "epoch": 0.72, + "grad_norm": 3.9798658979228856, + "learning_rate": 1.0941011481789042e-07, + "logits/chosen": 0.034214410930871964, + "logits/rejected": 2.867272138595581, + "logps/chosen": -702.2564697265625, + "logps/rejected": -1857.8795166015625, + "loss": 0.0935, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.056910037994385, + "rewards/margins": 11.857443809509277, + "rewards/margins_max": 18.30853843688965, + "rewards/margins_min": 5.406346797943115, + "rewards/margins_std": 9.123228073120117, + "rewards/rejected": -15.91435432434082, + "step": 980 + }, + { + "epoch": 0.73, + "grad_norm": 27.849338662173917, + "learning_rate": 1.041468588844476e-07, + "logits/chosen": 0.4994427263736725, + "logits/rejected": 2.539013385772705, + "logps/chosen": -599.5453491210938, + "logps/rejected": -1590.7774658203125, + "loss": 0.0699, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.911860704421997, + "rewards/margins": 10.174264907836914, + "rewards/margins_max": 15.016085624694824, + "rewards/margins_min": 5.332446098327637, + "rewards/margins_std": 6.8473663330078125, + "rewards/rejected": -14.086126327514648, + "step": 990 + }, + { + "epoch": 0.74, + "grad_norm": 4.2150247037639375, + "learning_rate": 9.897983921102954e-08, + "logits/chosen": -0.2390742003917694, + "logits/rejected": 2.2101035118103027, + "logps/chosen": -670.8737182617188, + "logps/rejected": -1509.370361328125, + "loss": 0.0708, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.90120005607605, + "rewards/margins": 8.870689392089844, + "rewards/margins_max": 11.843083381652832, + "rewards/margins_min": 5.89829683303833, + "rewards/margins_std": 4.203598976135254, + "rewards/rejected": -12.771888732910156, + "step": 1000 + }, + { + "epoch": 0.74, + "eval_logits/chosen": 0.21985697746276855, + "eval_logits/rejected": 0.4062546491622925, + "eval_logps/chosen": -791.2946166992188, + "eval_logps/rejected": -865.1302490234375, + "eval_loss": 0.8618067502975464, + "eval_rewards/accuracies": 0.6230158805847168, + "eval_rewards/chosen": -4.454591751098633, + "eval_rewards/margins": 0.8349014520645142, + "eval_rewards/margins_max": 5.060412406921387, + "eval_rewards/margins_min": -2.622389078140259, + "eval_rewards/margins_std": 2.52128529548645, + "eval_rewards/rejected": -5.289493083953857, + "eval_runtime": 419.5466, + "eval_samples_per_second": 9.534, + "eval_steps_per_second": 0.15, + "step": 1000 + }, + { + "epoch": 0.74, + "grad_norm": 12.140211164365056, + "learning_rate": 9.391246508073433e-08, + "logits/chosen": 0.13034725189208984, + "logits/rejected": 2.0794267654418945, + "logps/chosen": -724.4019775390625, + "logps/rejected": -1571.5511474609375, + "loss": 0.0777, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.304561614990234, + "rewards/margins": 8.91108512878418, + "rewards/margins_max": 12.622480392456055, + "rewards/margins_min": 5.199688911437988, + "rewards/margins_std": 5.248705863952637, + "rewards/rejected": -13.215646743774414, + "step": 1010 + }, + { + "epoch": 0.75, + "grad_norm": 72.48315962813399, + "learning_rate": 8.894808002892037e-08, + "logits/chosen": 0.19714145362377167, + "logits/rejected": 2.8781895637512207, + "logps/chosen": -689.0614624023438, + "logps/rejected": -1635.4539794921875, + "loss": 0.0641, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.8019371032714844, + "rewards/margins": 10.48505687713623, + "rewards/margins_max": 15.840913772583008, + "rewards/margins_min": 5.129199981689453, + "rewards/margins_std": 7.5743255615234375, + "rewards/rejected": -14.286993026733398, + "step": 1020 + }, + { + "epoch": 0.76, + "grad_norm": 20.88616124929115, + "learning_rate": 8.408995963708756e-08, + "logits/chosen": -0.0833059698343277, + "logits/rejected": 2.3186755180358887, + "logps/chosen": -681.8640747070312, + "logps/rejected": -1602.0863037109375, + "loss": 0.0246, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.856149673461914, + "rewards/margins": 9.128388404846191, + "rewards/margins_max": 12.01569652557373, + "rewards/margins_min": 6.241078853607178, + "rewards/margins_std": 4.0832719802856445, + "rewards/rejected": -12.984537124633789, + "step": 1030 + }, + { + "epoch": 0.77, + "grad_norm": 9.834472583209813, + "learning_rate": 7.934130937159508e-08, + "logits/chosen": 0.17558620870113373, + "logits/rejected": 2.297236442565918, + "logps/chosen": -637.3060302734375, + "logps/rejected": -1326.9390869140625, + "loss": 0.0496, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.932767152786255, + "rewards/margins": 7.218419075012207, + "rewards/margins_max": 9.979570388793945, + "rewards/margins_min": 4.457267761230469, + "rewards/margins_std": 3.904857635498047, + "rewards/rejected": -11.151185989379883, + "step": 1040 + }, + { + "epoch": 0.77, + "grad_norm": 5.026095263611361, + "learning_rate": 7.470526246864364e-08, + "logits/chosen": 0.39160841703414917, + "logits/rejected": 2.559542179107666, + "logps/chosen": -693.7269287109375, + "logps/rejected": -1849.744873046875, + "loss": 0.0552, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.237768650054932, + "rewards/margins": 12.45046329498291, + "rewards/margins_max": 19.93360710144043, + "rewards/margins_min": 4.967319488525391, + "rewards/margins_std": 10.582763671875, + "rewards/rejected": -16.688232421875, + "step": 1050 + }, + { + "epoch": 0.78, + "grad_norm": 0.6591285293800628, + "learning_rate": 7.018487786691512e-08, + "logits/chosen": 0.43399763107299805, + "logits/rejected": 2.060253381729126, + "logps/chosen": -745.4591674804688, + "logps/rejected": -1831.240478515625, + "loss": 0.0678, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.024683475494385, + "rewards/margins": 10.875140190124512, + "rewards/margins_max": 17.002622604370117, + "rewards/margins_min": 4.747661113739014, + "rewards/margins_std": 8.66556453704834, + "rewards/rejected": -15.899823188781738, + "step": 1060 + }, + { + "epoch": 0.79, + "grad_norm": 4.119017563303306, + "learning_rate": 6.578313818923559e-08, + "logits/chosen": -0.07052882760763168, + "logits/rejected": 1.8699405193328857, + "logps/chosen": -909.0846557617188, + "logps/rejected": -1548.6923828125, + "loss": 0.0634, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.350946426391602, + "rewards/margins": 7.820859432220459, + "rewards/margins_max": 11.624895095825195, + "rewards/margins_min": 4.016822338104248, + "rewards/margins_std": 5.379720211029053, + "rewards/rejected": -13.171804428100586, + "step": 1070 + }, + { + "epoch": 0.79, + "grad_norm": 16.860241482971446, + "learning_rate": 6.15029477745925e-08, + "logits/chosen": 0.48959070444107056, + "logits/rejected": 2.1462438106536865, + "logps/chosen": -734.9025268554688, + "logps/rejected": -1803.1939697265625, + "loss": 0.0725, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.121659755706787, + "rewards/margins": 10.34645938873291, + "rewards/margins_max": 14.924234390258789, + "rewards/margins_min": 5.768682479858398, + "rewards/margins_std": 6.473954200744629, + "rewards/rejected": -15.468118667602539, + "step": 1080 + }, + { + "epoch": 0.8, + "grad_norm": 18.379765722708388, + "learning_rate": 5.734713076180486e-08, + "logits/chosen": 0.46901997923851013, + "logits/rejected": 3.454606294631958, + "logps/chosen": -741.1581420898438, + "logps/rejected": -1905.183349609375, + "loss": 0.0713, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.039034843444824, + "rewards/margins": 12.11182975769043, + "rewards/margins_max": 19.285795211791992, + "rewards/margins_min": 4.937865257263184, + "rewards/margins_std": 10.145517349243164, + "rewards/rejected": -17.15086555480957, + "step": 1090 + }, + { + "epoch": 0.81, + "grad_norm": 4.317359176747138, + "learning_rate": 5.3318429226110875e-08, + "logits/chosen": 0.19755136966705322, + "logits/rejected": 2.050144672393799, + "logps/chosen": -604.0868530273438, + "logps/rejected": -1733.5550537109375, + "loss": 0.141, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.046292304992676, + "rewards/margins": 11.252501487731934, + "rewards/margins_max": 16.934438705444336, + "rewards/margins_min": 5.57056188583374, + "rewards/margins_std": 8.03547477722168, + "rewards/rejected": -15.298794746398926, + "step": 1100 + }, + { + "epoch": 0.81, + "eval_logits/chosen": 0.3016913831233978, + "eval_logits/rejected": 0.5082818865776062, + "eval_logps/chosen": -832.3104858398438, + "eval_logps/rejected": -915.954833984375, + "eval_loss": 0.9049465656280518, + "eval_rewards/accuracies": 0.6190476417541504, + "eval_rewards/chosen": -4.864750385284424, + "eval_rewards/margins": 0.9329892992973328, + "eval_rewards/margins_max": 5.632690906524658, + "eval_rewards/margins_min": -2.8439128398895264, + "eval_rewards/margins_std": 2.7856106758117676, + "eval_rewards/rejected": -5.7977399826049805, + "eval_runtime": 414.0109, + "eval_samples_per_second": 9.662, + "eval_steps_per_second": 0.152, + "step": 1100 + }, + { + "epoch": 0.82, + "grad_norm": 13.4435984156611, + "learning_rate": 4.9419501369902026e-08, + "logits/chosen": 0.08746049553155899, + "logits/rejected": 2.6451172828674316, + "logps/chosen": -771.4244384765625, + "logps/rejected": -2024.484619140625, + "loss": 0.1408, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.441189289093018, + "rewards/margins": 13.507779121398926, + "rewards/margins_max": 19.457698822021484, + "rewards/margins_min": 7.557857513427734, + "rewards/margins_std": 8.414458274841309, + "rewards/rejected": -17.9489688873291, + "step": 1110 + }, + { + "epoch": 0.82, + "grad_norm": 1.4128692999239585, + "learning_rate": 4.5652919768798896e-08, + "logits/chosen": 0.4677937924861908, + "logits/rejected": 2.3705403804779053, + "logps/chosen": -793.5311279296875, + "logps/rejected": -1775.6380615234375, + "loss": 0.0608, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.224045276641846, + "rewards/margins": 10.134596824645996, + "rewards/margins_max": 15.679702758789062, + "rewards/margins_min": 4.589491844177246, + "rewards/margins_std": 7.8419623374938965, + "rewards/rejected": -15.358640670776367, + "step": 1120 + }, + { + "epoch": 0.83, + "grad_norm": 16.039453526164788, + "learning_rate": 4.2021169674223536e-08, + "logits/chosen": 0.2930324077606201, + "logits/rejected": 2.399545431137085, + "logps/chosen": -655.0755615234375, + "logps/rejected": -1648.030029296875, + "loss": 0.0511, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.887598752975464, + "rewards/margins": 10.443506240844727, + "rewards/margins_max": 14.695414543151855, + "rewards/margins_min": 6.191596984863281, + "rewards/margins_std": 6.013107776641846, + "rewards/rejected": -14.331106185913086, + "step": 1130 + }, + { + "epoch": 0.84, + "grad_norm": 26.479285274862587, + "learning_rate": 3.852664737359046e-08, + "logits/chosen": 0.3496669828891754, + "logits/rejected": 1.97479248046875, + "logps/chosen": -852.40380859375, + "logps/rejected": -1573.5230712890625, + "loss": 0.0768, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.772242546081543, + "rewards/margins": 7.9194207191467285, + "rewards/margins_max": 12.818387985229492, + "rewards/margins_min": 3.0204524993896484, + "rewards/margins_std": 6.928186893463135, + "rewards/rejected": -13.691662788391113, + "step": 1140 + }, + { + "epoch": 0.85, + "grad_norm": 0.7263792166932626, + "learning_rate": 3.5171658609197824e-08, + "logits/chosen": 0.1613047868013382, + "logits/rejected": 2.029664993286133, + "logps/chosen": -742.6275024414062, + "logps/rejected": -1609.7635498046875, + "loss": 0.1096, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.6217732429504395, + "rewards/margins": 8.67860221862793, + "rewards/margins_max": 13.895421981811523, + "rewards/margins_min": 3.4617819786071777, + "rewards/margins_std": 7.377697944641113, + "rewards/rejected": -13.300374984741211, + "step": 1150 + }, + { + "epoch": 0.85, + "grad_norm": 3.13150099340305, + "learning_rate": 3.195841705686139e-08, + "logits/chosen": 0.460742712020874, + "logits/rejected": 2.694736957550049, + "logps/chosen": -821.4349365234375, + "logps/rejected": -1898.295654296875, + "loss": 0.0821, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.314339637756348, + "rewards/margins": 11.54991626739502, + "rewards/margins_max": 18.18251609802246, + "rewards/margins_min": 4.917316436767578, + "rewards/margins_std": 9.379911422729492, + "rewards/rejected": -16.864253997802734, + "step": 1160 + }, + { + "epoch": 0.86, + "grad_norm": 16.312675595535207, + "learning_rate": 2.8889042865294837e-08, + "logits/chosen": 0.13087859749794006, + "logits/rejected": 2.484839916229248, + "logps/chosen": -702.7008056640625, + "logps/rejected": -1441.55078125, + "loss": 0.0469, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.257961750030518, + "rewards/margins": 7.842066287994385, + "rewards/margins_max": 10.642562866210938, + "rewards/margins_min": 5.041568756103516, + "rewards/margins_std": 3.960501194000244, + "rewards/rejected": -12.100028991699219, + "step": 1170 + }, + { + "epoch": 0.87, + "grad_norm": 9.055687386628646, + "learning_rate": 2.5965561257202036e-08, + "logits/chosen": 0.1169591173529625, + "logits/rejected": 2.362281560897827, + "logps/chosen": -763.2276611328125, + "logps/rejected": -1660.2099609375, + "loss": 0.0572, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.720892906188965, + "rewards/margins": 9.803942680358887, + "rewards/margins_max": 15.876733779907227, + "rewards/margins_min": 3.731149196624756, + "rewards/margins_std": 8.588226318359375, + "rewards/rejected": -14.524835586547852, + "step": 1180 + }, + { + "epoch": 0.88, + "grad_norm": 22.841895074273324, + "learning_rate": 2.318990119300218e-08, + "logits/chosen": 0.10627205669879913, + "logits/rejected": 1.2642805576324463, + "logps/chosen": -798.917724609375, + "logps/rejected": -2165.9775390625, + "loss": 0.0395, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.4411396980285645, + "rewards/margins": 13.126449584960938, + "rewards/margins_max": 20.357501983642578, + "rewards/margins_min": 5.8953962326049805, + "rewards/margins_std": 10.226253509521484, + "rewards/rejected": -18.567590713500977, + "step": 1190 + }, + { + "epoch": 0.88, + "grad_norm": 17.973583296727792, + "learning_rate": 2.0563894098070216e-08, + "logits/chosen": 0.15934190154075623, + "logits/rejected": 2.1497673988342285, + "logps/chosen": -712.0560302734375, + "logps/rejected": -1505.4547119140625, + "loss": 0.0775, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.412214756011963, + "rewards/margins": 8.303590774536133, + "rewards/margins_max": 12.088435173034668, + "rewards/margins_min": 4.5187482833862305, + "rewards/margins_std": 5.352576732635498, + "rewards/rejected": -12.715806007385254, + "step": 1200 + }, + { + "epoch": 0.88, + "eval_logits/chosen": 0.30742567777633667, + "eval_logits/rejected": 0.5172090530395508, + "eval_logps/chosen": -836.2312622070312, + "eval_logps/rejected": -922.0319213867188, + "eval_loss": 0.9049317836761475, + "eval_rewards/accuracies": 0.6210317611694336, + "eval_rewards/chosen": -4.903958320617676, + "eval_rewards/margins": 0.9545530080795288, + "eval_rewards/margins_max": 5.713037014007568, + "eval_rewards/margins_min": -2.831618309020996, + "eval_rewards/margins_std": 2.813220262527466, + "eval_rewards/rejected": -5.858510971069336, + "eval_runtime": 422.5993, + "eval_samples_per_second": 9.465, + "eval_steps_per_second": 0.149, + "step": 1200 + }, + { + "epoch": 0.89, + "grad_norm": 2.7225416780438763, + "learning_rate": 1.8089272654333353e-08, + "logits/chosen": 0.28706851601600647, + "logits/rejected": 1.9062206745147705, + "logps/chosen": -866.8541259765625, + "logps/rejected": -1701.005615234375, + "loss": 0.0693, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.099188804626465, + "rewards/margins": 9.302727699279785, + "rewards/margins_max": 13.995088577270508, + "rewards/margins_min": 4.61036491394043, + "rewards/margins_std": 6.6360015869140625, + "rewards/rejected": -14.40191650390625, + "step": 1210 + }, + { + "epoch": 0.9, + "grad_norm": 7.492427847668467, + "learning_rate": 1.5767669657019005e-08, + "logits/chosen": 0.21484322845935822, + "logits/rejected": 2.9490137100219727, + "logps/chosen": -665.4578857421875, + "logps/rejected": -1718.431640625, + "loss": 0.0694, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.03716516494751, + "rewards/margins": 11.106006622314453, + "rewards/margins_max": 14.720375061035156, + "rewards/margins_min": 7.491639137268066, + "rewards/margins_std": 5.11148738861084, + "rewards/rejected": -15.143171310424805, + "step": 1220 + }, + { + "epoch": 0.91, + "grad_norm": 14.252457056430137, + "learning_rate": 1.3600616937310267e-08, + "logits/chosen": 0.3399500250816345, + "logits/rejected": 2.5051798820495605, + "logps/chosen": -776.6029663085938, + "logps/rejected": -1890.706298828125, + "loss": 0.0533, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.925044059753418, + "rewards/margins": 11.241573333740234, + "rewards/margins_max": 16.437541961669922, + "rewards/margins_min": 6.045604228973389, + "rewards/margins_std": 7.348209381103516, + "rewards/rejected": -16.166616439819336, + "step": 1230 + }, + { + "epoch": 0.91, + "grad_norm": 3.254929425883996, + "learning_rate": 1.1589544351619047e-08, + "logits/chosen": 0.8039329648017883, + "logits/rejected": 3.354154109954834, + "logps/chosen": -724.2069091796875, + "logps/rejected": -2016.739501953125, + "loss": 0.0701, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.160962104797363, + "rewards/margins": 12.87867259979248, + "rewards/margins_max": 20.457225799560547, + "rewards/margins_min": 5.300119400024414, + "rewards/margins_std": 10.717691421508789, + "rewards/rejected": -18.03963279724121, + "step": 1240 + }, + { + "epoch": 0.92, + "grad_norm": 9.743725835120221, + "learning_rate": 9.735778838143749e-09, + "logits/chosen": 0.17006321251392365, + "logits/rejected": 3.252281904220581, + "logps/chosen": -771.3798828125, + "logps/rejected": -2618.41943359375, + "loss": 0.0852, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.744952201843262, + "rewards/margins": 18.770408630371094, + "rewards/margins_max": 27.798681259155273, + "rewards/margins_min": 9.742134094238281, + "rewards/margins_std": 12.767908096313477, + "rewards/rejected": -23.51535987854004, + "step": 1250 + }, + { + "epoch": 0.93, + "grad_norm": 9.000437498002796, + "learning_rate": 8.040543541333655e-09, + "logits/chosen": 0.2970607578754425, + "logits/rejected": 3.4422898292541504, + "logps/chosen": -716.0152587890625, + "logps/rejected": -1930.673095703125, + "loss": 0.0554, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.7243733406066895, + "rewards/margins": 12.035941123962402, + "rewards/margins_max": 17.656423568725586, + "rewards/margins_min": 6.415456295013428, + "rewards/margins_std": 7.9485650062561035, + "rewards/rejected": -16.760313034057617, + "step": 1260 + }, + { + "epoch": 0.93, + "grad_norm": 16.466144409333417, + "learning_rate": 6.504957004838746e-09, + "logits/chosen": -0.05619863420724869, + "logits/rejected": 1.9224863052368164, + "logps/chosen": -841.8850708007812, + "logps/rejected": -1936.7113037109375, + "loss": 0.0456, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.742644309997559, + "rewards/margins": 11.516191482543945, + "rewards/margins_max": 15.864044189453125, + "rewards/margins_min": 7.168337821960449, + "rewards/margins_std": 6.148792266845703, + "rewards/rejected": -16.258834838867188, + "step": 1270 + }, + { + "epoch": 0.94, + "grad_norm": 2.654070322101592, + "learning_rate": 5.130032433476483e-09, + "logits/chosen": 0.3038169741630554, + "logits/rejected": 2.8313422203063965, + "logps/chosen": -728.2089233398438, + "logps/rejected": -1743.801513671875, + "loss": 0.069, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.6006269454956055, + "rewards/margins": 11.062047004699707, + "rewards/margins_max": 16.25905418395996, + "rewards/margins_min": 5.8650407791137695, + "rewards/margins_std": 7.349676609039307, + "rewards/rejected": -15.662673950195312, + "step": 1280 + }, + { + "epoch": 0.95, + "grad_norm": 4.94775999947406, + "learning_rate": 3.916677024702858e-09, + "logits/chosen": 0.1287023425102234, + "logits/rejected": 2.0298779010772705, + "logps/chosen": -667.8201904296875, + "logps/rejected": -1396.031005859375, + "loss": 0.0567, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.1791486740112305, + "rewards/margins": 7.460590362548828, + "rewards/margins_max": 10.305280685424805, + "rewards/margins_min": 4.615899562835693, + "rewards/margins_std": 4.023000240325928, + "rewards/rejected": -11.639739990234375, + "step": 1290 + }, + { + "epoch": 0.96, + "grad_norm": 62.41949761633163, + "learning_rate": 2.865691370028761e-09, + "logits/chosen": 0.3163800835609436, + "logits/rejected": 2.587982416152954, + "logps/chosen": -711.7886962890625, + "logps/rejected": -1568.990234375, + "loss": 0.0464, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.575112342834473, + "rewards/margins": 9.028793334960938, + "rewards/margins_max": 13.794398307800293, + "rewards/margins_min": 4.263186454772949, + "rewards/margins_std": 6.739584922790527, + "rewards/rejected": -13.603904724121094, + "step": 1300 + }, + { + "epoch": 0.96, + "eval_logits/chosen": 0.2898733615875244, + "eval_logits/rejected": 0.49572646617889404, + "eval_logps/chosen": -832.4283447265625, + "eval_logps/rejected": -916.66357421875, + "eval_loss": 0.9016607403755188, + "eval_rewards/accuracies": 0.6230158805847168, + "eval_rewards/chosen": -4.8659281730651855, + "eval_rewards/margins": 0.9388992786407471, + "eval_rewards/margins_max": 5.651630401611328, + "eval_rewards/margins_min": -2.8163363933563232, + "eval_rewards/margins_std": 2.78544020652771, + "eval_rewards/rejected": -5.8048272132873535, + "eval_runtime": 417.6061, + "eval_samples_per_second": 9.578, + "eval_steps_per_second": 0.151, + "step": 1300 + }, + { + "epoch": 0.96, + "grad_norm": 1.1392819952008515, + "learning_rate": 1.977768926776896e-09, + "logits/chosen": 0.29715052247047424, + "logits/rejected": 2.052577018737793, + "logps/chosen": -763.3764038085938, + "logps/rejected": -1313.391845703125, + "loss": 0.0621, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.99444055557251, + "rewards/margins": 5.8611674308776855, + "rewards/margins_max": 7.457464694976807, + "rewards/margins_min": 4.264869213104248, + "rewards/margins_std": 2.2575066089630127, + "rewards/rejected": -10.855607986450195, + "step": 1310 + }, + { + "epoch": 0.97, + "grad_norm": 2.4664515698911433, + "learning_rate": 1.2534955605274233e-09, + "logits/chosen": 0.4122096002101898, + "logits/rejected": 3.4274659156799316, + "logps/chosen": -771.2412719726562, + "logps/rejected": -1840.775390625, + "loss": 0.0786, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.00337028503418, + "rewards/margins": 10.93709659576416, + "rewards/margins_max": 16.795883178710938, + "rewards/margins_min": 5.078312873840332, + "rewards/margins_std": 8.285572052001953, + "rewards/rejected": -15.940465927124023, + "step": 1320 + }, + { + "epoch": 0.98, + "grad_norm": 5.80559652045183, + "learning_rate": 6.933491585542351e-10, + "logits/chosen": 0.37182289361953735, + "logits/rejected": 3.1600046157836914, + "logps/chosen": -680.2762451171875, + "logps/rejected": -1665.175537109375, + "loss": 0.1695, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.444371223449707, + "rewards/margins": 10.164952278137207, + "rewards/margins_max": 14.587198257446289, + "rewards/margins_min": 5.742705821990967, + "rewards/margins_std": 6.254001140594482, + "rewards/rejected": -14.60932445526123, + "step": 1330 + }, + { + "epoch": 0.99, + "grad_norm": 8.610945717452623, + "learning_rate": 2.9769931450737694e-10, + "logits/chosen": 0.1386619508266449, + "logits/rejected": 2.0141379833221436, + "logps/chosen": -799.5162353515625, + "logps/rejected": -1834.666015625, + "loss": 0.0767, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.182085990905762, + "rewards/margins": 10.368020057678223, + "rewards/margins_max": 15.239529609680176, + "rewards/margins_min": 5.496510028839111, + "rewards/margins_std": 6.889355659484863, + "rewards/rejected": -15.550105094909668, + "step": 1340 + }, + { + "epoch": 0.99, + "grad_norm": 33.03425737131523, + "learning_rate": 6.680708454906425e-11, + "logits/chosen": 0.2811238169670105, + "logits/rejected": 1.941292405128479, + "logps/chosen": -751.3411254882812, + "logps/rejected": -1699.15625, + "loss": 0.0687, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.085223197937012, + "rewards/margins": 9.496126174926758, + "rewards/margins_max": 13.1726713180542, + "rewards/margins_min": 5.819581508636475, + "rewards/margins_std": 5.199418544769287, + "rewards/rejected": -14.58134937286377, + "step": 1350 + }, + { + "epoch": 1.0, + "step": 1359, + "total_flos": 0.0, + "train_loss": 0.21785820982226384, + "train_runtime": 12082.0351, + "train_samples_per_second": 1.8, + "train_steps_per_second": 0.112 + } + ], + "logging_steps": 10, + "max_steps": 1359, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}