{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 1346, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 23.545113700609754, "learning_rate": 3.7037037037037036e-09, "logits/chosen": -2.017277240753174, "logits/rejected": -1.9505600929260254, "logps/chosen": -342.8155212402344, "logps/rejected": -264.6424865722656, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "grad_norm": 23.704110924178444, "learning_rate": 3.7037037037037036e-08, "logits/chosen": -1.852867603302002, "logits/rejected": -1.7641547918319702, "logps/chosen": -243.63710021972656, "logps/rejected": -215.13551330566406, "loss": 0.6933, "rewards/accuracies": 0.4027777910232544, "rewards/chosen": -0.0004846964729949832, "rewards/margins": -0.001089173136278987, "rewards/rejected": 0.0006044767214916646, "step": 10 }, { "epoch": 0.01, "grad_norm": 27.48286479448467, "learning_rate": 7.407407407407407e-08, "logits/chosen": -1.9755146503448486, "logits/rejected": -1.8412548303604126, "logps/chosen": -241.4310302734375, "logps/rejected": -210.738037109375, "loss": 0.6927, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0005561274592764676, "rewards/margins": 0.0004348217917140573, "rewards/rejected": 0.00012130556569900364, "step": 20 }, { "epoch": 0.02, "grad_norm": 23.49895713678948, "learning_rate": 1.111111111111111e-07, "logits/chosen": -1.8477449417114258, "logits/rejected": -1.781266450881958, "logps/chosen": -277.84527587890625, "logps/rejected": -244.1582489013672, "loss": 0.6915, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.005596889648586512, "rewards/margins": 0.0021990840323269367, "rewards/rejected": 0.003397804917767644, "step": 30 }, { "epoch": 0.03, "grad_norm": 21.952979365752906, "learning_rate": 1.4814814814814815e-07, "logits/chosen": -1.8662084341049194, "logits/rejected": -1.8252031803131104, "logps/chosen": -279.81585693359375, "logps/rejected": -256.37322998046875, "loss": 0.6867, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.026755522936582565, "rewards/margins": 0.01376323588192463, "rewards/rejected": 0.01299228798598051, "step": 40 }, { "epoch": 0.04, "grad_norm": 22.515894719363914, "learning_rate": 1.8518518518518516e-07, "logits/chosen": -1.886828064918518, "logits/rejected": -1.796974539756775, "logps/chosen": -245.1302490234375, "logps/rejected": -207.6703338623047, "loss": 0.68, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.05396401137113571, "rewards/margins": 0.03148679807782173, "rewards/rejected": 0.02247721515595913, "step": 50 }, { "epoch": 0.04, "grad_norm": 21.11853715417876, "learning_rate": 2.222222222222222e-07, "logits/chosen": -1.8658056259155273, "logits/rejected": -1.7990939617156982, "logps/chosen": -245.4588623046875, "logps/rejected": -228.79067993164062, "loss": 0.6687, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.0710381492972374, "rewards/margins": 0.053314320743083954, "rewards/rejected": 0.01772383041679859, "step": 60 }, { "epoch": 0.05, "grad_norm": 21.639022509531838, "learning_rate": 2.5925925925925923e-07, "logits/chosen": -1.8920536041259766, "logits/rejected": -1.8345096111297607, "logps/chosen": -223.96511840820312, "logps/rejected": -196.08775329589844, "loss": 0.6547, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.06073574349284172, "rewards/margins": 0.08626440167427063, "rewards/rejected": -0.02552866004407406, "step": 70 }, { "epoch": 0.06, "grad_norm": 22.179495576107882, "learning_rate": 2.962962962962963e-07, "logits/chosen": -1.8825687170028687, "logits/rejected": -1.847541093826294, "logps/chosen": -232.0540313720703, "logps/rejected": -240.20120239257812, "loss": 0.6407, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.03458085656166077, "rewards/margins": 0.1154135912656784, "rewards/rejected": -0.08083274215459824, "step": 80 }, { "epoch": 0.07, "grad_norm": 21.88163995061792, "learning_rate": 3.333333333333333e-07, "logits/chosen": -1.9384691715240479, "logits/rejected": -1.922488808631897, "logps/chosen": -248.4744415283203, "logps/rejected": -261.0725402832031, "loss": 0.6135, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.029202425852417946, "rewards/margins": 0.2103302925825119, "rewards/rejected": -0.2395327389240265, "step": 90 }, { "epoch": 0.07, "grad_norm": 27.693123307166786, "learning_rate": 3.703703703703703e-07, "logits/chosen": -1.9232885837554932, "logits/rejected": -1.9198648929595947, "logps/chosen": -245.3694610595703, "logps/rejected": -275.853515625, "loss": 0.5905, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.23111872375011444, "rewards/margins": 0.2522026598453522, "rewards/rejected": -0.4833213686943054, "step": 100 }, { "epoch": 0.07, "eval_logits/chosen": -1.787776231765747, "eval_logits/rejected": -1.7244033813476562, "eval_logps/chosen": -325.57440185546875, "eval_logps/rejected": -351.93182373046875, "eval_loss": 0.6428781747817993, "eval_rewards/accuracies": 0.671875, "eval_rewards/chosen": -0.13797907531261444, "eval_rewards/margins": 0.2060878425836563, "eval_rewards/rejected": -0.34406691789627075, "eval_runtime": 97.6555, "eval_samples_per_second": 20.48, "eval_steps_per_second": 0.328, "step": 100 }, { "epoch": 0.08, "grad_norm": 33.52938589908786, "learning_rate": 4.0740740740740737e-07, "logits/chosen": -1.8354734182357788, "logits/rejected": -1.7754793167114258, "logps/chosen": -295.2403869628906, "logps/rejected": -316.46923828125, "loss": 0.5723, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.5448485016822815, "rewards/margins": 0.3984159529209137, "rewards/rejected": -0.943264365196228, "step": 110 }, { "epoch": 0.09, "grad_norm": 32.42547027840792, "learning_rate": 4.444444444444444e-07, "logits/chosen": -1.7011499404907227, "logits/rejected": -1.708805799484253, "logps/chosen": -307.11334228515625, "logps/rejected": -348.78729248046875, "loss": 0.5442, "rewards/accuracies": 0.75, "rewards/chosen": -0.5989453196525574, "rewards/margins": 0.6007151007652283, "rewards/rejected": -1.1996605396270752, "step": 120 }, { "epoch": 0.1, "grad_norm": 33.08064593315955, "learning_rate": 4.814814814814814e-07, "logits/chosen": -1.70786452293396, "logits/rejected": -1.6745007038116455, "logps/chosen": -290.42498779296875, "logps/rejected": -343.42510986328125, "loss": 0.5139, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7598094344139099, "rewards/margins": 0.6571252346038818, "rewards/rejected": -1.4169347286224365, "step": 130 }, { "epoch": 0.1, "grad_norm": 33.94320124887001, "learning_rate": 4.999789692194508e-07, "logits/chosen": -1.8099472522735596, "logits/rejected": -1.754595398902893, "logps/chosen": -314.9842224121094, "logps/rejected": -356.81011962890625, "loss": 0.5172, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.812475860118866, "rewards/margins": 0.6942508816719055, "rewards/rejected": -1.5067269802093506, "step": 140 }, { "epoch": 0.11, "grad_norm": 39.07047935152003, "learning_rate": 4.998107442045616e-07, "logits/chosen": -1.6377861499786377, "logits/rejected": -1.6226139068603516, "logps/chosen": -304.92840576171875, "logps/rejected": -393.1883239746094, "loss": 0.5094, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.8283722996711731, "rewards/margins": 0.8278924822807312, "rewards/rejected": -1.6562646627426147, "step": 150 }, { "epoch": 0.12, "grad_norm": 42.785505208166626, "learning_rate": 4.994744073829293e-07, "logits/chosen": -1.5746722221374512, "logits/rejected": -1.4142063856124878, "logps/chosen": -343.25823974609375, "logps/rejected": -402.02691650390625, "loss": 0.5011, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.8369730710983276, "rewards/margins": 0.8556060791015625, "rewards/rejected": -1.6925792694091797, "step": 160 }, { "epoch": 0.13, "grad_norm": 48.274083606893925, "learning_rate": 4.989701850946613e-07, "logits/chosen": -1.5056556463241577, "logits/rejected": -1.3766965866088867, "logps/chosen": -335.7103271484375, "logps/rejected": -388.94097900390625, "loss": 0.4643, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.9376843571662903, "rewards/margins": 0.8313838243484497, "rewards/rejected": -1.7690680027008057, "step": 170 }, { "epoch": 0.13, "grad_norm": 46.176765511998994, "learning_rate": 4.982984166595104e-07, "logits/chosen": -1.4761296510696411, "logits/rejected": -1.3599636554718018, "logps/chosen": -408.171630859375, "logps/rejected": -472.0873107910156, "loss": 0.4577, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.2097257375717163, "rewards/margins": 1.240505576133728, "rewards/rejected": -2.4502310752868652, "step": 180 }, { "epoch": 0.14, "grad_norm": 43.28509926988276, "learning_rate": 4.974595541485259e-07, "logits/chosen": -1.3221380710601807, "logits/rejected": -1.204590082168579, "logps/chosen": -335.5089416503906, "logps/rejected": -428.30621337890625, "loss": 0.4635, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.076790452003479, "rewards/margins": 1.0969324111938477, "rewards/rejected": -2.173722743988037, "step": 190 }, { "epoch": 0.15, "grad_norm": 56.09927596713516, "learning_rate": 4.964541620798307e-07, "logits/chosen": -1.2160365581512451, "logits/rejected": -1.118375539779663, "logps/chosen": -348.90753173828125, "logps/rejected": -468.21563720703125, "loss": 0.4495, "rewards/accuracies": 0.78125, "rewards/chosen": -1.2727657556533813, "rewards/margins": 1.1830675601959229, "rewards/rejected": -2.4558334350585938, "step": 200 }, { "epoch": 0.15, "eval_logits/chosen": -1.4371435642242432, "eval_logits/rejected": -1.366525650024414, "eval_logps/chosen": -361.1814880371094, "eval_logps/rejected": -427.2509765625, "eval_loss": 0.559985339641571, "eval_rewards/accuracies": 0.74609375, "eval_rewards/chosen": -0.4940495491027832, "eval_rewards/margins": 0.6032084226608276, "eval_rewards/rejected": -1.0972579717636108, "eval_runtime": 97.4901, "eval_samples_per_second": 20.515, "eval_steps_per_second": 0.328, "step": 200 }, { "epoch": 0.16, "grad_norm": 49.36366262587358, "learning_rate": 4.952829170387241e-07, "logits/chosen": -1.1800302267074585, "logits/rejected": -1.0126550197601318, "logps/chosen": -380.48828125, "logps/rejected": -450.0765075683594, "loss": 0.4458, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3101383447647095, "rewards/margins": 0.9806028604507446, "rewards/rejected": -2.290741443634033, "step": 210 }, { "epoch": 0.16, "grad_norm": 57.25684926546983, "learning_rate": 4.939466072223697e-07, "logits/chosen": -1.2157623767852783, "logits/rejected": -1.0489680767059326, "logps/chosen": -372.591064453125, "logps/rejected": -468.7542419433594, "loss": 0.4545, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3519532680511475, "rewards/margins": 1.1502256393432617, "rewards/rejected": -2.50217866897583, "step": 220 }, { "epoch": 0.17, "grad_norm": 40.98752146946231, "learning_rate": 4.924461319093725e-07, "logits/chosen": -1.1049861907958984, "logits/rejected": -1.0018864870071411, "logps/chosen": -361.7793884277344, "logps/rejected": -487.15460205078125, "loss": 0.4436, "rewards/accuracies": 0.75, "rewards/chosen": -1.1743983030319214, "rewards/margins": 1.1021788120269775, "rewards/rejected": -2.2765772342681885, "step": 230 }, { "epoch": 0.18, "grad_norm": 57.39176618017778, "learning_rate": 4.907825008546038e-07, "logits/chosen": -0.7271394729614258, "logits/rejected": -0.6813848614692688, "logps/chosen": -377.90118408203125, "logps/rejected": -523.9625244140625, "loss": 0.4333, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4791629314422607, "rewards/margins": 1.4326350688934326, "rewards/rejected": -2.9117980003356934, "step": 240 }, { "epoch": 0.19, "grad_norm": 51.26102709104704, "learning_rate": 4.889568336096795e-07, "logits/chosen": -0.5312275290489197, "logits/rejected": -0.37771934270858765, "logps/chosen": -381.1251220703125, "logps/rejected": -479.7431640625, "loss": 0.4272, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5479203462600708, "rewards/margins": 1.1352421045303345, "rewards/rejected": -2.6831624507904053, "step": 250 }, { "epoch": 0.19, "grad_norm": 46.69946748969463, "learning_rate": 4.869703587695508e-07, "logits/chosen": -0.44748228788375854, "logits/rejected": -0.18481455743312836, "logps/chosen": -379.5589904785156, "logps/rejected": -527.2100830078125, "loss": 0.4464, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.412641167640686, "rewards/margins": 1.667824149131775, "rewards/rejected": -3.080465793609619, "step": 260 }, { "epoch": 0.2, "grad_norm": 40.8957837906737, "learning_rate": 4.848244131457127e-07, "logits/chosen": -0.9530747532844543, "logits/rejected": -0.6137160062789917, "logps/chosen": -400.1986083984375, "logps/rejected": -499.60308837890625, "loss": 0.4211, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.4335994720458984, "rewards/margins": 1.4832035303115845, "rewards/rejected": -2.9168028831481934, "step": 270 }, { "epoch": 0.21, "grad_norm": 45.308995144235396, "learning_rate": 4.825204408665877e-07, "logits/chosen": -1.2076747417449951, "logits/rejected": -0.9289032220840454, "logps/chosen": -426.99114990234375, "logps/rejected": -532.0573120117188, "loss": 0.4124, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.4818888902664185, "rewards/margins": 1.4990845918655396, "rewards/rejected": -2.980973720550537, "step": 280 }, { "epoch": 0.22, "grad_norm": 57.75176826411474, "learning_rate": 4.800599924056907e-07, "logits/chosen": -0.7638604044914246, "logits/rejected": -0.7332445383071899, "logps/chosen": -383.2490539550781, "logps/rejected": -556.2003784179688, "loss": 0.3833, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5847924947738647, "rewards/margins": 1.5942741632461548, "rewards/rejected": -3.1790668964385986, "step": 290 }, { "epoch": 0.22, "grad_norm": 45.582764097748154, "learning_rate": 4.774447235382259e-07, "logits/chosen": -0.5798165202140808, "logits/rejected": -0.5653051733970642, "logps/chosen": -411.58154296875, "logps/rejected": -582.2734375, "loss": 0.3963, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.766920804977417, "rewards/margins": 1.7389370203018188, "rewards/rejected": -3.5058579444885254, "step": 300 }, { "epoch": 0.22, "eval_logits/chosen": -1.4608731269836426, "eval_logits/rejected": -1.2769949436187744, "eval_logps/chosen": -423.00341796875, "eval_logps/rejected": -521.115478515625, "eval_loss": 0.5291498303413391, "eval_rewards/accuracies": 0.7421875, "eval_rewards/chosen": -1.1122692823410034, "eval_rewards/margins": 0.9236345291137695, "eval_rewards/rejected": -2.0359039306640625, "eval_runtime": 97.2217, "eval_samples_per_second": 20.572, "eval_steps_per_second": 0.329, "step": 300 }, { "epoch": 0.23, "grad_norm": 42.82644939529418, "learning_rate": 4.7467639422682426e-07, "logits/chosen": -0.6843788623809814, "logits/rejected": -0.46269315481185913, "logps/chosen": -417.7638244628906, "logps/rejected": -573.83837890625, "loss": 0.4006, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.8430830240249634, "rewards/margins": 1.669550895690918, "rewards/rejected": -3.512633800506592, "step": 310 }, { "epoch": 0.24, "grad_norm": 55.146360598406936, "learning_rate": 4.7175686743716223e-07, "logits/chosen": -1.140579104423523, "logits/rejected": -0.8973017930984497, "logps/chosen": -419.18048095703125, "logps/rejected": -527.0257568359375, "loss": 0.405, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4635722637176514, "rewards/margins": 1.3773781061172485, "rewards/rejected": -2.8409504890441895, "step": 320 }, { "epoch": 0.25, "grad_norm": 45.88101703811544, "learning_rate": 4.686881078842688e-07, "logits/chosen": -1.0653458833694458, "logits/rejected": -0.8751330375671387, "logps/chosen": -386.37335205078125, "logps/rejected": -510.29949951171875, "loss": 0.3899, "rewards/accuracies": 0.78125, "rewards/chosen": -1.47976553440094, "rewards/margins": 1.366317868232727, "rewards/rejected": -2.846083164215088, "step": 330 }, { "epoch": 0.25, "grad_norm": 58.11307992254104, "learning_rate": 4.654721807103558e-07, "logits/chosen": -0.5151967406272888, "logits/rejected": -0.14977958798408508, "logps/chosen": -400.7736511230469, "logps/rejected": -529.3316650390625, "loss": 0.3938, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7082515954971313, "rewards/margins": 1.6958554983139038, "rewards/rejected": -3.404106855392456, "step": 340 }, { "epoch": 0.26, "grad_norm": 48.499175539211535, "learning_rate": 4.621112500950678e-07, "logits/chosen": -0.8198322057723999, "logits/rejected": -0.5934363603591919, "logps/chosen": -429.72113037109375, "logps/rejected": -547.5772705078125, "loss": 0.3843, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.8615728616714478, "rewards/margins": 1.499329924583435, "rewards/rejected": -3.3609023094177246, "step": 350 }, { "epoch": 0.27, "grad_norm": 55.599844022581365, "learning_rate": 4.5860757779908225e-07, "logits/chosen": -1.0455310344696045, "logits/rejected": -0.6826554536819458, "logps/chosen": -413.38739013671875, "logps/rejected": -542.2623291015625, "loss": 0.3736, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5897157192230225, "rewards/margins": 1.6853986978530884, "rewards/rejected": -3.2751145362854004, "step": 360 }, { "epoch": 0.27, "grad_norm": 74.71151634556864, "learning_rate": 4.5496352164204304e-07, "logits/chosen": -0.4619407057762146, "logits/rejected": -0.23415322601795197, "logps/chosen": -426.197998046875, "logps/rejected": -620.7210693359375, "loss": 0.3997, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.0138180255889893, "rewards/margins": 2.0114035606384277, "rewards/rejected": -4.025221347808838, "step": 370 }, { "epoch": 0.28, "grad_norm": 46.835706945950214, "learning_rate": 4.5118153391584966e-07, "logits/chosen": -0.7893734574317932, "logits/rejected": -0.5286726951599121, "logps/chosen": -348.12554931640625, "logps/rejected": -483.89215087890625, "loss": 0.3909, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.0020155906677246, "rewards/margins": 1.7324419021606445, "rewards/rejected": -2.734457492828369, "step": 380 }, { "epoch": 0.29, "grad_norm": 51.06658825135186, "learning_rate": 4.472641597343713e-07, "logits/chosen": -0.5109713077545166, "logits/rejected": -0.07112047076225281, "logps/chosen": -389.3044738769531, "logps/rejected": -567.7926635742188, "loss": 0.3846, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.6159217357635498, "rewards/margins": 1.9207748174667358, "rewards/rejected": -3.536696672439575, "step": 390 }, { "epoch": 0.3, "grad_norm": 44.181665144710905, "learning_rate": 4.4321403532069523e-07, "logits/chosen": -0.5097373127937317, "logits/rejected": -0.2719523012638092, "logps/chosen": -353.91278076171875, "logps/rejected": -517.2376708984375, "loss": 0.4012, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5704162120819092, "rewards/margins": 1.8435367345809937, "rewards/rejected": -3.4139533042907715, "step": 400 }, { "epoch": 0.3, "eval_logits/chosen": -1.3372514247894287, "eval_logits/rejected": -1.1222751140594482, "eval_logps/chosen": -417.65863037109375, "eval_logps/rejected": -516.7505493164062, "eval_loss": 0.5314938426017761, "eval_rewards/accuracies": 0.7734375, "eval_rewards/chosen": -1.058821201324463, "eval_rewards/margins": 0.9334329962730408, "eval_rewards/rejected": -1.9922541379928589, "eval_runtime": 97.4658, "eval_samples_per_second": 20.52, "eval_steps_per_second": 0.328, "step": 400 }, { "epoch": 0.3, "grad_norm": 50.26869622592037, "learning_rate": 4.390338862330631e-07, "logits/chosen": -0.7592865824699402, "logits/rejected": -0.4464483857154846, "logps/chosen": -401.47607421875, "logps/rejected": -523.3784790039062, "loss": 0.3803, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7566916942596436, "rewards/margins": 1.5606569051742554, "rewards/rejected": -3.3173484802246094, "step": 410 }, { "epoch": 0.31, "grad_norm": 51.57934206296598, "learning_rate": 4.3472652553068835e-07, "logits/chosen": -0.6644355654716492, "logits/rejected": -0.23346371948719025, "logps/chosen": -404.8458557128906, "logps/rejected": -540.8956298828125, "loss": 0.3797, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7600839138031006, "rewards/margins": 1.6869585514068604, "rewards/rejected": -3.4470419883728027, "step": 420 }, { "epoch": 0.32, "grad_norm": 73.04228089758476, "learning_rate": 4.3029485188068895e-07, "logits/chosen": 0.10370206832885742, "logits/rejected": 0.39608412981033325, "logps/chosen": -385.42498779296875, "logps/rejected": -570.5172729492188, "loss": 0.3655, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.839719533920288, "rewards/margins": 1.714897871017456, "rewards/rejected": -3.5546176433563232, "step": 430 }, { "epoch": 0.33, "grad_norm": 54.512857623037554, "learning_rate": 4.257418476074103e-07, "logits/chosen": -0.023069072514772415, "logits/rejected": 0.3960541784763336, "logps/chosen": -423.490478515625, "logps/rejected": -592.7897338867188, "loss": 0.3638, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7877943515777588, "rewards/margins": 2.115088701248169, "rewards/rejected": -3.9028830528259277, "step": 440 }, { "epoch": 0.33, "grad_norm": 55.7162708155443, "learning_rate": 4.210705766854504e-07, "logits/chosen": 0.15324774384498596, "logits/rejected": 0.521506667137146, "logps/chosen": -456.01776123046875, "logps/rejected": -625.3338623046875, "loss": 0.352, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.096989870071411, "rewards/margins": 1.874829649925232, "rewards/rejected": -3.9718196392059326, "step": 450 }, { "epoch": 0.34, "grad_norm": 51.50110954656292, "learning_rate": 4.16284182677737e-07, "logits/chosen": 0.3847750127315521, "logits/rejected": 0.9687877893447876, "logps/chosen": -421.48321533203125, "logps/rejected": -571.6495361328125, "loss": 0.3771, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7761863470077515, "rewards/margins": 1.777931809425354, "rewards/rejected": -3.5541183948516846, "step": 460 }, { "epoch": 0.35, "grad_norm": 42.17081561639591, "learning_rate": 4.113858866200466e-07, "logits/chosen": 0.5899291634559631, "logits/rejected": 0.9651363492012024, "logps/chosen": -411.4060974121094, "logps/rejected": -587.0046997070312, "loss": 0.3551, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.751307725906372, "rewards/margins": 1.814639687538147, "rewards/rejected": -3.5659472942352295, "step": 470 }, { "epoch": 0.36, "grad_norm": 48.02610054790726, "learning_rate": 4.063789848533865e-07, "logits/chosen": 0.46232396364212036, "logits/rejected": 1.0872290134429932, "logps/chosen": -472.24139404296875, "logps/rejected": -634.9567260742188, "loss": 0.374, "rewards/accuracies": 0.78125, "rewards/chosen": -2.287501573562622, "rewards/margins": 1.8356859683990479, "rewards/rejected": -4.123187065124512, "step": 480 }, { "epoch": 0.36, "grad_norm": 45.88835702974933, "learning_rate": 4.0126684680570074e-07, "logits/chosen": -0.3817380368709564, "logits/rejected": 0.1566486358642578, "logps/chosen": -461.13934326171875, "logps/rejected": -592.1519165039062, "loss": 0.334, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.8447940349578857, "rewards/margins": 1.7669038772583008, "rewards/rejected": -3.6116981506347656, "step": 490 }, { "epoch": 0.37, "grad_norm": 53.85769217498667, "learning_rate": 3.960529127243902e-07, "logits/chosen": -0.31509625911712646, "logits/rejected": -0.04504912719130516, "logps/chosen": -477.027099609375, "logps/rejected": -654.2672119140625, "loss": 0.3559, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.053821086883545, "rewards/margins": 2.070889711380005, "rewards/rejected": -4.124711036682129, "step": 500 }, { "epoch": 0.37, "eval_logits/chosen": -1.0066841840744019, "eval_logits/rejected": -0.6833571791648865, "eval_logps/chosen": -456.0086364746094, "eval_logps/rejected": -568.9822387695312, "eval_loss": 0.5275729894638062, "eval_rewards/accuracies": 0.7578125, "eval_rewards/chosen": -1.4423211812973022, "eval_rewards/margins": 1.0722503662109375, "eval_rewards/rejected": -2.5145716667175293, "eval_runtime": 97.6519, "eval_samples_per_second": 20.481, "eval_steps_per_second": 0.328, "step": 500 }, { "epoch": 0.38, "grad_norm": 53.47947486686438, "learning_rate": 3.9074069136117594e-07, "logits/chosen": -0.6587181687355042, "logits/rejected": -0.11707913875579834, "logps/chosen": -478.9352111816406, "logps/rejected": -631.669921875, "loss": 0.35, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.0555968284606934, "rewards/margins": 1.9847618341445923, "rewards/rejected": -4.040358543395996, "step": 510 }, { "epoch": 0.39, "grad_norm": 48.01190508303512, "learning_rate": 3.8533375761086094e-07, "logits/chosen": -0.6520954966545105, "logits/rejected": -0.19666698575019836, "logps/chosen": -399.66455078125, "logps/rejected": -589.08251953125, "loss": 0.3518, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.5765998363494873, "rewards/margins": 2.0024795532226562, "rewards/rejected": -3.5790793895721436, "step": 520 }, { "epoch": 0.39, "grad_norm": 58.201909693922666, "learning_rate": 3.79835750105581e-07, "logits/chosen": -0.015231219120323658, "logits/rejected": 0.524590253829956, "logps/chosen": -425.837890625, "logps/rejected": -576.46630859375, "loss": 0.364, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.9596973657608032, "rewards/margins": 1.918087363243103, "rewards/rejected": -3.8777847290039062, "step": 530 }, { "epoch": 0.4, "grad_norm": 53.67325387574443, "learning_rate": 3.742503687661627e-07, "logits/chosen": 0.3345823585987091, "logits/rejected": 0.8041492700576782, "logps/chosen": -436.06170654296875, "logps/rejected": -628.6650390625, "loss": 0.3413, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.163074016571045, "rewards/margins": 2.0728249549865723, "rewards/rejected": -4.235899925231934, "step": 540 }, { "epoch": 0.41, "grad_norm": 54.5126564713129, "learning_rate": 3.685813723122372e-07, "logits/chosen": 0.6497628688812256, "logits/rejected": 1.1682524681091309, "logps/chosen": -425.30157470703125, "logps/rejected": -617.69482421875, "loss": 0.3365, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.9300180673599243, "rewards/margins": 2.057875394821167, "rewards/rejected": -3.987893581390381, "step": 550 }, { "epoch": 0.42, "grad_norm": 62.74924566191948, "learning_rate": 3.6283257573278466e-07, "logits/chosen": 0.867998480796814, "logits/rejected": 1.330685019493103, "logps/chosen": -455.71124267578125, "logps/rejected": -659.052978515625, "loss": 0.3223, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.0765323638916016, "rewards/margins": 2.156247615814209, "rewards/rejected": -4.2327799797058105, "step": 560 }, { "epoch": 0.42, "grad_norm": 48.6969642598068, "learning_rate": 3.5700784771881224e-07, "logits/chosen": 1.0166234970092773, "logits/rejected": 1.6870880126953125, "logps/chosen": -478.86407470703125, "logps/rejected": -635.7424926757812, "loss": 0.3382, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.4357941150665283, "rewards/margins": 1.9054218530654907, "rewards/rejected": -4.341216087341309, "step": 570 }, { "epoch": 0.43, "grad_norm": 43.243072977055355, "learning_rate": 3.511111080598925e-07, "logits/chosen": 0.6339820623397827, "logits/rejected": 1.3627948760986328, "logps/chosen": -447.268798828125, "logps/rejected": -636.5888671875, "loss": 0.3276, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.9055280685424805, "rewards/margins": 2.3114867210388184, "rewards/rejected": -4.217014312744141, "step": 580 }, { "epoch": 0.44, "grad_norm": 69.40196325230258, "learning_rate": 3.451463250063146e-07, "logits/chosen": 0.8395903706550598, "logits/rejected": 1.488012671470642, "logps/chosen": -432.853271484375, "logps/rejected": -630.223876953125, "loss": 0.3378, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.9797086715698242, "rewards/margins": 2.143889904022217, "rewards/rejected": -4.123598098754883, "step": 590 }, { "epoch": 0.45, "grad_norm": 59.19017069860126, "learning_rate": 3.3911751259862403e-07, "logits/chosen": 0.9315579533576965, "logits/rejected": 1.3961995840072632, "logps/chosen": -493.1189880371094, "logps/rejected": -684.4100341796875, "loss": 0.3291, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.291141986846924, "rewards/margins": 2.0969302654266357, "rewards/rejected": -4.3880720138549805, "step": 600 }, { "epoch": 0.45, "eval_logits/chosen": -0.2334394007921219, "eval_logits/rejected": 0.188625305891037, "eval_logps/chosen": -477.9444580078125, "eval_logps/rejected": -595.6332397460938, "eval_loss": 0.5102677941322327, "eval_rewards/accuracies": 0.76953125, "eval_rewards/chosen": -1.6616793870925903, "eval_rewards/margins": 1.1194015741348267, "eval_rewards/rejected": -2.781080961227417, "eval_runtime": 97.2562, "eval_samples_per_second": 20.564, "eval_steps_per_second": 0.329, "step": 600 }, { "epoch": 0.45, "grad_norm": 37.653590501774474, "learning_rate": 3.3302872796634754e-07, "logits/chosen": 0.9580332040786743, "logits/rejected": 1.3357497453689575, "logps/chosen": -427.964111328125, "logps/rejected": -620.7327880859375, "loss": 0.3122, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.95559823513031, "rewards/margins": 2.1169991493225098, "rewards/rejected": -4.072597503662109, "step": 610 }, { "epoch": 0.46, "grad_norm": 47.96131506831022, "learning_rate": 3.2688406859772035e-07, "logits/chosen": 0.8878351449966431, "logits/rejected": 1.4351171255111694, "logps/chosen": -489.7989196777344, "logps/rejected": -665.8047485351562, "loss": 0.3224, "rewards/accuracies": 0.84375, "rewards/chosen": -2.195067882537842, "rewards/margins": 2.1086602210998535, "rewards/rejected": -4.3037285804748535, "step": 620 }, { "epoch": 0.47, "grad_norm": 65.32009143781127, "learning_rate": 3.206876695822541e-07, "logits/chosen": 1.3710159063339233, "logits/rejected": 1.7163244485855103, "logps/chosen": -493.956298828125, "logps/rejected": -688.6646728515625, "loss": 0.3129, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.438476085662842, "rewards/margins": 2.2680106163024902, "rewards/rejected": -4.706486701965332, "step": 630 }, { "epoch": 0.48, "grad_norm": 66.03238810693847, "learning_rate": 3.144437008280012e-07, "logits/chosen": 0.709919273853302, "logits/rejected": 1.0818461179733276, "logps/chosen": -468.56890869140625, "logps/rejected": -691.1434326171875, "loss": 0.3232, "rewards/accuracies": 0.90625, "rewards/chosen": -2.252897262573242, "rewards/margins": 2.3767807483673096, "rewards/rejected": -4.629677772521973, "step": 640 }, { "epoch": 0.48, "grad_norm": 47.885060646853404, "learning_rate": 3.0815636425538665e-07, "logits/chosen": 1.0194989442825317, "logits/rejected": 1.571274995803833, "logps/chosen": -446.6681213378906, "logps/rejected": -611.84033203125, "loss": 0.3429, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.190187454223633, "rewards/margins": 2.0423951148986816, "rewards/rejected": -4.232582092285156, "step": 650 }, { "epoch": 0.49, "grad_norm": 59.75526535732341, "learning_rate": 3.018298909694986e-07, "logits/chosen": 1.3580573797225952, "logits/rejected": 1.913851022720337, "logps/chosen": -489.56982421875, "logps/rejected": -673.2572021484375, "loss": 0.3288, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.595083236694336, "rewards/margins": 2.0307328701019287, "rewards/rejected": -4.6258158683776855, "step": 660 }, { "epoch": 0.5, "grad_norm": 51.20761564052719, "learning_rate": 2.954685384127371e-07, "logits/chosen": 0.8674410581588745, "logits/rejected": 1.4072096347808838, "logps/chosen": -482.65789794921875, "logps/rejected": -649.311279296875, "loss": 0.301, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.194945812225342, "rewards/margins": 2.093947172164917, "rewards/rejected": -4.288893222808838, "step": 670 }, { "epoch": 0.51, "grad_norm": 62.65952308868226, "learning_rate": 2.8907658749974054e-07, "logits/chosen": 0.9979363679885864, "logits/rejected": 1.4131087064743042, "logps/chosen": -457.8363342285156, "logps/rejected": -703.2235107421875, "loss": 0.2929, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.264411449432373, "rewards/margins": 2.5431039333343506, "rewards/rejected": -4.807515621185303, "step": 680 }, { "epoch": 0.51, "grad_norm": 49.65473672539794, "learning_rate": 2.8265833973651503e-07, "logits/chosen": 0.6275979280471802, "logits/rejected": 1.0561200380325317, "logps/chosen": -459.69976806640625, "logps/rejected": -684.1864013671875, "loss": 0.2859, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.8421128988265991, "rewards/margins": 2.5259382724761963, "rewards/rejected": -4.368051528930664, "step": 690 }, { "epoch": 0.52, "grad_norm": 48.72864396453521, "learning_rate": 2.7621811432570736e-07, "logits/chosen": 0.8585799336433411, "logits/rejected": 1.5937745571136475, "logps/chosen": -518.5455932617188, "logps/rejected": -734.5382690429688, "loss": 0.2735, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.441080093383789, "rewards/margins": 2.6617679595947266, "rewards/rejected": -5.102847576141357, "step": 700 }, { "epoch": 0.52, "eval_logits/chosen": 0.18704134225845337, "eval_logits/rejected": 0.6721899509429932, "eval_logps/chosen": -541.279541015625, "eval_logps/rejected": -687.587158203125, "eval_loss": 0.5288776159286499, "eval_rewards/accuracies": 0.76171875, "eval_rewards/chosen": -2.2950310707092285, "eval_rewards/margins": 1.40558922290802, "eval_rewards/rejected": -3.70061993598938, "eval_runtime": 97.5006, "eval_samples_per_second": 20.513, "eval_steps_per_second": 0.328, "step": 700 }, { "epoch": 0.53, "grad_norm": 50.62866425523001, "learning_rate": 2.6976024525996917e-07, "logits/chosen": 1.1524347066879272, "logits/rejected": 1.7467842102050781, "logps/chosen": -503.6927795410156, "logps/rejected": -780.6187744140625, "loss": 0.286, "rewards/accuracies": 0.90625, "rewards/chosen": -2.7125723361968994, "rewards/margins": 2.8134512901306152, "rewards/rejected": -5.5260233879089355, "step": 710 }, { "epoch": 0.53, "grad_norm": 56.03367218705217, "learning_rate": 2.6328907840536706e-07, "logits/chosen": 0.7062090039253235, "logits/rejected": 1.2199087142944336, "logps/chosen": -460.45794677734375, "logps/rejected": -685.5617065429688, "loss": 0.3244, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.43827748298645, "rewards/margins": 2.252427577972412, "rewards/rejected": -4.690704822540283, "step": 720 }, { "epoch": 0.54, "grad_norm": 57.82647372234183, "learning_rate": 2.568089685768038e-07, "logits/chosen": 0.6572129130363464, "logits/rejected": 1.0754339694976807, "logps/chosen": -530.2496337890625, "logps/rejected": -698.03662109375, "loss": 0.313, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.59128475189209, "rewards/margins": 2.117705821990967, "rewards/rejected": -4.708990573883057, "step": 730 }, { "epoch": 0.55, "grad_norm": 50.473574423912424, "learning_rate": 2.503242766074156e-07, "logits/chosen": 0.42826253175735474, "logits/rejected": 1.0195951461791992, "logps/chosen": -451.046142578125, "logps/rejected": -653.2913818359375, "loss": 0.2898, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.9979403018951416, "rewards/margins": 2.318507432937622, "rewards/rejected": -4.316447734832764, "step": 740 }, { "epoch": 0.56, "grad_norm": 61.13648555404995, "learning_rate": 2.4383936641392136e-07, "logits/chosen": 0.6429548859596252, "logits/rejected": 1.103127360343933, "logps/chosen": -467.82049560546875, "logps/rejected": -702.5692749023438, "loss": 0.2975, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.0785393714904785, "rewards/margins": 2.386026382446289, "rewards/rejected": -4.464566230773926, "step": 750 }, { "epoch": 0.56, "grad_norm": 51.760001565819636, "learning_rate": 2.3735860205989493e-07, "logits/chosen": 0.7451823353767395, "logits/rejected": 1.1489431858062744, "logps/chosen": -462.767333984375, "logps/rejected": -706.5615234375, "loss": 0.2627, "rewards/accuracies": 0.90625, "rewards/chosen": -2.312885284423828, "rewards/margins": 2.6091692447662354, "rewards/rejected": -4.922054767608643, "step": 760 }, { "epoch": 0.57, "grad_norm": 56.13632726849474, "learning_rate": 2.308863448189402e-07, "logits/chosen": 0.5960752367973328, "logits/rejected": 1.0421712398529053, "logps/chosen": -498.1941833496094, "logps/rejected": -695.0504760742188, "loss": 0.2811, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.29612398147583, "rewards/margins": 2.4551825523376465, "rewards/rejected": -4.751306533813477, "step": 770 }, { "epoch": 0.58, "grad_norm": 67.7549300842345, "learning_rate": 2.2442695023974246e-07, "logits/chosen": 0.6856900453567505, "logits/rejected": 1.3306076526641846, "logps/chosen": -444.3168029785156, "logps/rejected": -679.816650390625, "loss": 0.2713, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.0717947483062744, "rewards/margins": 2.6752490997314453, "rewards/rejected": -4.747043609619141, "step": 780 }, { "epoch": 0.59, "grad_norm": 55.628538802719504, "learning_rate": 2.179847652149729e-07, "logits/chosen": 0.7401930093765259, "logits/rejected": 1.288172960281372, "logps/chosen": -496.6468811035156, "logps/rejected": -687.7960205078125, "loss": 0.295, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.4609100818634033, "rewards/margins": 2.223629951477051, "rewards/rejected": -4.684540271759033, "step": 790 }, { "epoch": 0.59, "grad_norm": 63.651106043315345, "learning_rate": 2.115641250560183e-07, "logits/chosen": 0.8801604509353638, "logits/rejected": 1.5266039371490479, "logps/chosen": -473.2115173339844, "logps/rejected": -701.8800659179688, "loss": 0.2752, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.4201507568359375, "rewards/margins": 2.4442293643951416, "rewards/rejected": -4.864380836486816, "step": 800 }, { "epoch": 0.59, "eval_logits/chosen": -0.16280797123908997, "eval_logits/rejected": 0.2751551866531372, "eval_logps/chosen": -533.1201782226562, "eval_logps/rejected": -668.2235717773438, "eval_loss": 0.5228938460350037, "eval_rewards/accuracies": 0.765625, "eval_rewards/chosen": -2.2134366035461426, "eval_rewards/margins": 1.2935477495193481, "eval_rewards/rejected": -3.506984233856201, "eval_runtime": 97.387, "eval_samples_per_second": 20.537, "eval_steps_per_second": 0.329, "step": 800 }, { "epoch": 0.6, "grad_norm": 70.2608582618962, "learning_rate": 2.051693505755042e-07, "logits/chosen": 0.8354732394218445, "logits/rejected": 1.2750941514968872, "logps/chosen": -461.49786376953125, "logps/rejected": -705.8599853515625, "loss": 0.2946, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.4096267223358154, "rewards/margins": 2.483677864074707, "rewards/rejected": -4.893305778503418, "step": 810 }, { "epoch": 0.61, "grad_norm": 49.246802198712466, "learning_rate": 1.9880474517957542e-07, "logits/chosen": 0.9254199862480164, "logits/rejected": 1.563522458076477, "logps/chosen": -481.2748107910156, "logps/rejected": -658.328125, "loss": 0.2674, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.385385036468506, "rewards/margins": 2.1492881774902344, "rewards/rejected": -4.53467321395874, "step": 820 }, { "epoch": 0.62, "grad_norm": 88.28145029556197, "learning_rate": 1.9247459197189e-07, "logits/chosen": 0.8668380975723267, "logits/rejected": 1.5001232624053955, "logps/chosen": -488.27685546875, "logps/rejected": -680.9069213867188, "loss": 0.2652, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.6699295043945312, "rewards/margins": 2.2055306434631348, "rewards/rejected": -4.875459671020508, "step": 830 }, { "epoch": 0.62, "grad_norm": 43.13543734061108, "learning_rate": 1.8618315087127602e-07, "logits/chosen": 0.6826521754264832, "logits/rejected": 1.2443543672561646, "logps/chosen": -499.20892333984375, "logps/rejected": -706.3511962890625, "loss": 0.2563, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.4423558712005615, "rewards/margins": 2.461874485015869, "rewards/rejected": -4.904230117797852, "step": 840 }, { "epoch": 0.63, "grad_norm": 56.63843357010467, "learning_rate": 1.7993465574499102e-07, "logits/chosen": 0.5323538184165955, "logits/rejected": 1.2176125049591064, "logps/chosen": -463.47857666015625, "logps/rejected": -663.4465942382812, "loss": 0.2759, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.189335823059082, "rewards/margins": 2.420409679412842, "rewards/rejected": -4.609745502471924, "step": 850 }, { "epoch": 0.64, "grad_norm": 56.31423994279339, "learning_rate": 1.7373331155951233e-07, "logits/chosen": 0.8688204884529114, "logits/rejected": 1.4698970317840576, "logps/chosen": -510.4227600097656, "logps/rejected": -748.5259399414062, "loss": 0.2649, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.550417900085449, "rewards/margins": 2.730776309967041, "rewards/rejected": -5.28119421005249, "step": 860 }, { "epoch": 0.65, "grad_norm": 50.688626621321205, "learning_rate": 1.6758329155077743e-07, "logits/chosen": 1.0613950490951538, "logits/rejected": 1.5818780660629272, "logps/chosen": -495.5560607910156, "logps/rejected": -708.2391967773438, "loss": 0.2711, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.642883777618408, "rewards/margins": 2.6204209327697754, "rewards/rejected": -5.263304710388184, "step": 870 }, { "epoch": 0.65, "grad_norm": 46.10359729315069, "learning_rate": 1.6148873441577662e-07, "logits/chosen": 1.0479947328567505, "logits/rejected": 1.5524357557296753, "logps/chosen": -480.2462463378906, "logps/rejected": -707.98681640625, "loss": 0.2699, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.261603355407715, "rewards/margins": 2.4961774349212646, "rewards/rejected": -4.757781028747559, "step": 880 }, { "epoch": 0.66, "grad_norm": 41.346767344116245, "learning_rate": 1.5545374152738934e-07, "logits/chosen": 1.1905092000961304, "logits/rejected": 1.6182410717010498, "logps/chosen": -468.92083740234375, "logps/rejected": -689.1092529296875, "loss": 0.2722, "rewards/accuracies": 0.875, "rewards/chosen": -2.264604091644287, "rewards/margins": 2.391749143600464, "rewards/rejected": -4.65635347366333, "step": 890 }, { "epoch": 0.67, "grad_norm": 60.48896334839974, "learning_rate": 1.4948237417433775e-07, "logits/chosen": 1.380293369293213, "logits/rejected": 2.2697908878326416, "logps/chosen": -436.1393127441406, "logps/rejected": -673.2228393554688, "loss": 0.2492, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.151729106903076, "rewards/margins": 2.624401330947876, "rewards/rejected": -4.776130676269531, "step": 900 }, { "epoch": 0.67, "eval_logits/chosen": 0.5183509588241577, "eval_logits/rejected": 1.0725551843643188, "eval_logps/chosen": -518.2382202148438, "eval_logps/rejected": -652.8116455078125, "eval_loss": 0.5152209997177124, "eval_rewards/accuracies": 0.7734375, "eval_rewards/chosen": -2.064617395401001, "eval_rewards/margins": 1.2882475852966309, "eval_rewards/rejected": -3.352864980697632, "eval_runtime": 97.3137, "eval_samples_per_second": 20.552, "eval_steps_per_second": 0.329, "step": 900 }, { "epoch": 0.68, "grad_norm": 59.39383985362304, "learning_rate": 1.435786508281158e-07, "logits/chosen": 1.9009380340576172, "logits/rejected": 2.567354679107666, "logps/chosen": -482.70513916015625, "logps/rejected": -720.0316162109375, "loss": 0.2499, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.3441195487976074, "rewards/margins": 2.6516547203063965, "rewards/rejected": -4.995774269104004, "step": 910 }, { "epoch": 0.68, "grad_norm": 58.953283614647454, "learning_rate": 1.3774654443873174e-07, "logits/chosen": 1.749333381652832, "logits/rejected": 2.4905173778533936, "logps/chosen": -512.65625, "logps/rejected": -763.8499145507812, "loss": 0.2542, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.783947467803955, "rewards/margins": 2.989567756652832, "rewards/rejected": -5.773515224456787, "step": 920 }, { "epoch": 0.69, "grad_norm": 57.229551980352035, "learning_rate": 1.31989979761085e-07, "logits/chosen": 1.3056137561798096, "logits/rejected": 2.2303478717803955, "logps/chosen": -465.61627197265625, "logps/rejected": -746.7559814453125, "loss": 0.2416, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -2.5093438625335693, "rewards/margins": 3.106735944747925, "rewards/rejected": -5.616079807281494, "step": 930 }, { "epoch": 0.7, "grad_norm": 53.92751444407525, "learning_rate": 1.2631283071377618e-07, "logits/chosen": 1.6224052906036377, "logits/rejected": 1.9630991220474243, "logps/chosen": -458.9669494628906, "logps/rejected": -742.6818237304688, "loss": 0.2429, "rewards/accuracies": 0.90625, "rewards/chosen": -2.4590606689453125, "rewards/margins": 2.7507693767547607, "rewards/rejected": -5.209830284118652, "step": 940 }, { "epoch": 0.71, "grad_norm": 48.183067890071925, "learning_rate": 1.2071891777212744e-07, "logits/chosen": 1.061023235321045, "logits/rejected": 1.9151092767715454, "logps/chosen": -507.06744384765625, "logps/rejected": -707.039794921875, "loss": 0.253, "rewards/accuracies": 0.875, "rewards/chosen": -2.448425054550171, "rewards/margins": 2.3641083240509033, "rewards/rejected": -4.812533855438232, "step": 950 }, { "epoch": 0.71, "grad_norm": 48.31856194766799, "learning_rate": 1.1521200539716874e-07, "logits/chosen": 1.2143045663833618, "logits/rejected": 1.9916166067123413, "logps/chosen": -500.71038818359375, "logps/rejected": -771.3677978515625, "loss": 0.2426, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.3821799755096436, "rewards/margins": 3.1737558841705322, "rewards/rejected": -5.555935859680176, "step": 960 }, { "epoch": 0.72, "grad_norm": 57.66373376149326, "learning_rate": 1.0979579950231821e-07, "logits/chosen": 1.1112618446350098, "logits/rejected": 2.246898889541626, "logps/chosen": -502.126220703125, "logps/rejected": -734.8248901367188, "loss": 0.241, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.395838737487793, "rewards/margins": 2.6420400142669678, "rewards/rejected": -5.03787899017334, "step": 970 }, { "epoch": 0.73, "grad_norm": 55.20670800594472, "learning_rate": 1.0447394495946291e-07, "logits/chosen": 1.387683391571045, "logits/rejected": 2.400949478149414, "logps/chosen": -515.9779052734375, "logps/rejected": -765.4949340820312, "loss": 0.2468, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.70845365524292, "rewards/margins": 2.7117531299591064, "rewards/rejected": -5.420206546783447, "step": 980 }, { "epoch": 0.74, "grad_norm": 45.9412294534277, "learning_rate": 9.925002314611841e-08, "logits/chosen": 1.8099420070648193, "logits/rejected": 2.5098319053649902, "logps/chosen": -484.7242736816406, "logps/rejected": -777.49169921875, "loss": 0.2383, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.564988374710083, "rewards/margins": 2.9337170124053955, "rewards/rejected": -5.498705863952637, "step": 990 }, { "epoch": 0.74, "grad_norm": 64.863814963629, "learning_rate": 9.412754953531663e-08, "logits/chosen": 1.5222892761230469, "logits/rejected": 2.5317773818969727, "logps/chosen": -507.424072265625, "logps/rejected": -756.7098388671875, "loss": 0.262, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.656026601791382, "rewards/margins": 2.7969748973846436, "rewards/rejected": -5.453001976013184, "step": 1000 }, { "epoch": 0.74, "eval_logits/chosen": 0.6804571151733398, "eval_logits/rejected": 1.3123811483383179, "eval_logps/chosen": -556.8264770507812, "eval_logps/rejected": -703.1602783203125, "eval_loss": 0.5241079330444336, "eval_rewards/accuracies": 0.76171875, "eval_rewards/chosen": -2.4504995346069336, "eval_rewards/margins": 1.405852198600769, "eval_rewards/rejected": -3.856351613998413, "eval_runtime": 97.4441, "eval_samples_per_second": 20.525, "eval_steps_per_second": 0.328, "step": 1000 }, { "epoch": 0.75, "grad_norm": 69.68207773392557, "learning_rate": 8.910997132984479e-08, "logits/chosen": 1.820955514907837, "logits/rejected": 2.952479839324951, "logps/chosen": -544.1399536132812, "logps/rejected": -808.0184936523438, "loss": 0.2504, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.861184597015381, "rewards/margins": 3.071931838989258, "rewards/rejected": -5.933116436004639, "step": 1010 }, { "epoch": 0.76, "grad_norm": 50.59071094029437, "learning_rate": 8.42006651424274e-08, "logits/chosen": 1.8404204845428467, "logits/rejected": 2.6863815784454346, "logps/chosen": -461.4169921875, "logps/rejected": -703.1361083984375, "loss": 0.2318, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.4329962730407715, "rewards/margins": 2.7300188541412354, "rewards/rejected": -5.163014888763428, "step": 1020 }, { "epoch": 0.77, "grad_norm": 57.22762908033313, "learning_rate": 7.940293472341217e-08, "logits/chosen": 2.013861894607544, "logits/rejected": 2.7502970695495605, "logps/chosen": -477.7572326660156, "logps/rejected": -773.4556884765625, "loss": 0.2276, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.6210336685180664, "rewards/margins": 3.139965057373047, "rewards/rejected": -5.7609992027282715, "step": 1030 }, { "epoch": 0.77, "grad_norm": 55.15868922046573, "learning_rate": 7.472000873748918e-08, "logits/chosen": 2.0298519134521484, "logits/rejected": 2.990135431289673, "logps/chosen": -528.5840454101562, "logps/rejected": -781.4909057617188, "loss": 0.2487, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.6361494064331055, "rewards/margins": 2.9660372734069824, "rewards/rejected": -5.602187156677246, "step": 1040 }, { "epoch": 0.78, "grad_norm": 43.438291077124795, "learning_rate": 7.015503859093927e-08, "logits/chosen": 2.1326801776885986, "logits/rejected": 2.5511794090270996, "logps/chosen": -486.6455078125, "logps/rejected": -757.7630004882812, "loss": 0.2148, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.617185592651367, "rewards/margins": 2.795973062515259, "rewards/rejected": -5.413158893585205, "step": 1050 }, { "epoch": 0.79, "grad_norm": 63.14016572546011, "learning_rate": 6.571109631087451e-08, "logits/chosen": 2.417752742767334, "logits/rejected": 3.036146402359009, "logps/chosen": -494.73046875, "logps/rejected": -811.0126953125, "loss": 0.2112, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -2.588284492492676, "rewards/margins": 3.300442934036255, "rewards/rejected": -5.888727188110352, "step": 1060 }, { "epoch": 0.79, "grad_norm": 58.89863039830767, "learning_rate": 6.139117247789687e-08, "logits/chosen": 2.5516977310180664, "logits/rejected": 3.055995464324951, "logps/chosen": -535.7842407226562, "logps/rejected": -800.0374145507812, "loss": 0.2248, "rewards/accuracies": 0.90625, "rewards/chosen": -2.956123113632202, "rewards/margins": 2.720890998840332, "rewards/rejected": -5.677014350891113, "step": 1070 }, { "epoch": 0.8, "grad_norm": 41.21215573686561, "learning_rate": 5.719817421356685e-08, "logits/chosen": 1.9021530151367188, "logits/rejected": 2.7421538829803467, "logps/chosen": -549.5343017578125, "logps/rejected": -820.0753784179688, "loss": 0.2033, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -2.7265052795410156, "rewards/margins": 3.281470537185669, "rewards/rejected": -6.007976055145264, "step": 1080 }, { "epoch": 0.81, "grad_norm": 58.39711865385947, "learning_rate": 5.313492322403701e-08, "logits/chosen": 2.2018539905548096, "logits/rejected": 2.951138496398926, "logps/chosen": -533.9331665039062, "logps/rejected": -891.0558471679688, "loss": 0.1937, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -2.8866357803344727, "rewards/margins": 3.6149306297302246, "rewards/rejected": -6.501566410064697, "step": 1090 }, { "epoch": 0.82, "grad_norm": 51.18256501676837, "learning_rate": 4.9204153901165805e-08, "logits/chosen": 1.9893665313720703, "logits/rejected": 2.7781219482421875, "logps/chosen": -530.7794189453125, "logps/rejected": -824.0559692382812, "loss": 0.2299, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.8573508262634277, "rewards/margins": 3.2189173698425293, "rewards/rejected": -6.076268196105957, "step": 1100 }, { "epoch": 0.82, "eval_logits/chosen": 0.8391125202178955, "eval_logits/rejected": 1.4834216833114624, "eval_logps/chosen": -588.2494506835938, "eval_logps/rejected": -741.857421875, "eval_loss": 0.5312901139259338, "eval_rewards/accuracies": 0.7578125, "eval_rewards/chosen": -2.7647294998168945, "eval_rewards/margins": 1.4785932302474976, "eval_rewards/rejected": -4.243322849273682, "eval_runtime": 97.5423, "eval_samples_per_second": 20.504, "eval_steps_per_second": 0.328, "step": 1100 }, { "epoch": 0.82, "grad_norm": 68.60925195657734, "learning_rate": 4.540851148239036e-08, "logits/chosen": 1.7061752080917358, "logits/rejected": 2.698995351791382, "logps/chosen": -537.1931762695312, "logps/rejected": -848.33154296875, "loss": 0.2129, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.7809014320373535, "rewards/margins": 3.3348469734191895, "rewards/rejected": -6.115748405456543, "step": 1110 }, { "epoch": 0.83, "grad_norm": 48.80096479357628, "learning_rate": 4.1750550270596206e-08, "logits/chosen": 1.531884789466858, "logits/rejected": 2.923696994781494, "logps/chosen": -509.5885314941406, "logps/rejected": -794.9307250976562, "loss": 0.1954, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.630959987640381, "rewards/margins": 3.3725571632385254, "rewards/rejected": -6.003516674041748, "step": 1120 }, { "epoch": 0.84, "grad_norm": 68.79197398198284, "learning_rate": 3.823273191518234e-08, "logits/chosen": 1.5292671918869019, "logits/rejected": 2.3230159282684326, "logps/chosen": -568.5833740234375, "logps/rejected": -835.826171875, "loss": 0.2178, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.0106937885284424, "rewards/margins": 3.2017643451690674, "rewards/rejected": -6.212458610534668, "step": 1130 }, { "epoch": 0.85, "grad_norm": 59.434543375011025, "learning_rate": 3.485742375547745e-08, "logits/chosen": 1.4421080350875854, "logits/rejected": 2.442089796066284, "logps/chosen": -553.727294921875, "logps/rejected": -822.7138671875, "loss": 0.2009, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.1090734004974365, "rewards/margins": 2.9853668212890625, "rewards/rejected": -6.094440460205078, "step": 1140 }, { "epoch": 0.85, "grad_norm": 38.888275757403804, "learning_rate": 3.162689722762365e-08, "logits/chosen": 1.5811113119125366, "logits/rejected": 2.2564284801483154, "logps/chosen": -543.1163940429688, "logps/rejected": -842.681640625, "loss": 0.2095, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -2.9668571949005127, "rewards/margins": 3.10882830619812, "rewards/rejected": -6.075685024261475, "step": 1150 }, { "epoch": 0.86, "grad_norm": 42.47551430381964, "learning_rate": 2.8543326335997904e-08, "logits/chosen": 1.768690824508667, "logits/rejected": 2.4484939575195312, "logps/chosen": -556.0635375976562, "logps/rejected": -805.807373046875, "loss": 0.2046, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -2.865739107131958, "rewards/margins": 2.8989548683166504, "rewards/rejected": -5.764693737030029, "step": 1160 }, { "epoch": 0.87, "grad_norm": 59.36158165544989, "learning_rate": 2.560878619020157e-08, "logits/chosen": 1.9017894268035889, "logits/rejected": 2.7026009559631348, "logps/chosen": -521.269287109375, "logps/rejected": -813.7127685546875, "loss": 0.1964, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -2.9693474769592285, "rewards/margins": 3.1322848796844482, "rewards/rejected": -6.101632595062256, "step": 1170 }, { "epoch": 0.88, "grad_norm": 49.475189963130575, "learning_rate": 2.2825251608601466e-08, "logits/chosen": 1.8870357275009155, "logits/rejected": 2.8944287300109863, "logps/chosen": -558.059814453125, "logps/rejected": -868.568359375, "loss": 0.1891, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.1376397609710693, "rewards/margins": 3.2884891033172607, "rewards/rejected": -6.426129341125488, "step": 1180 }, { "epoch": 0.88, "grad_norm": 85.599165147591, "learning_rate": 2.0194595789362474e-08, "logits/chosen": 1.9095745086669922, "logits/rejected": 2.530900478363037, "logps/chosen": -577.1746826171875, "logps/rejected": -892.88623046875, "loss": 0.2027, "rewards/accuracies": 0.9375, "rewards/chosen": -3.0735995769500732, "rewards/margins": 3.377427339553833, "rewards/rejected": -6.451026916503906, "step": 1190 }, { "epoch": 0.89, "grad_norm": 45.52491787365754, "learning_rate": 1.7718589049866728e-08, "logits/chosen": 2.376490592956543, "logits/rejected": 3.1364424228668213, "logps/chosen": -510.269287109375, "logps/rejected": -829.1940307617188, "loss": 0.1974, "rewards/accuracies": 0.9375, "rewards/chosen": -2.9278645515441895, "rewards/margins": 3.433408737182617, "rewards/rejected": -6.361273765563965, "step": 1200 }, { "epoch": 0.89, "eval_logits/chosen": 0.8963963389396667, "eval_logits/rejected": 1.5457934141159058, "eval_logps/chosen": -606.617431640625, "eval_logps/rejected": -764.6512451171875, "eval_loss": 0.5366576910018921, "eval_rewards/accuracies": 0.76171875, "eval_rewards/chosen": -2.948409080505371, "eval_rewards/margins": 1.5228519439697266, "eval_rewards/rejected": -4.471261024475098, "eval_runtime": 97.4355, "eval_samples_per_second": 20.526, "eval_steps_per_second": 0.328, "step": 1200 }, { "epoch": 0.9, "grad_norm": 56.7147448955845, "learning_rate": 1.539889763536645e-08, "logits/chosen": 1.9441492557525635, "logits/rejected": 3.0478804111480713, "logps/chosen": -538.355224609375, "logps/rejected": -856.01416015625, "loss": 0.2187, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.842240571975708, "rewards/margins": 3.5280959606170654, "rewards/rejected": -6.370336055755615, "step": 1210 }, { "epoch": 0.91, "grad_norm": 60.258963508413004, "learning_rate": 1.3237082597673172e-08, "logits/chosen": 2.1856608390808105, "logits/rejected": 2.853616237640381, "logps/chosen": -517.0845947265625, "logps/rejected": -845.6990966796875, "loss": 0.204, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.0185937881469727, "rewards/margins": 3.2306289672851562, "rewards/rejected": -6.249222755432129, "step": 1220 }, { "epoch": 0.91, "grad_norm": 71.41232139420377, "learning_rate": 1.1234598744637502e-08, "logits/chosen": 1.5448696613311768, "logits/rejected": 2.610525608062744, "logps/chosen": -545.0371704101562, "logps/rejected": -821.2421875, "loss": 0.2063, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.1403965950012207, "rewards/margins": 3.1843514442443848, "rewards/rejected": -6.3247480392456055, "step": 1230 }, { "epoch": 0.92, "grad_norm": 57.959377016977456, "learning_rate": 9.392793661126414e-09, "logits/chosen": 1.898782730102539, "logits/rejected": 2.7061781883239746, "logps/chosen": -582.9857177734375, "logps/rejected": -879.3019409179688, "loss": 0.1979, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.2453556060791016, "rewards/margins": 3.297309160232544, "rewards/rejected": -6.542665004730225, "step": 1240 }, { "epoch": 0.93, "grad_norm": 50.86760187147993, "learning_rate": 7.71290680215711e-09, "logits/chosen": 2.0340778827667236, "logits/rejected": 2.8080642223358154, "logps/chosen": -558.147705078125, "logps/rejected": -874.9266357421875, "loss": 0.1974, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.0640769004821777, "rewards/margins": 3.380338668823242, "rewards/rejected": -6.444415092468262, "step": 1250 }, { "epoch": 0.94, "grad_norm": 61.973766270626015, "learning_rate": 6.196068658797543e-09, "logits/chosen": 1.8814232349395752, "logits/rejected": 2.7813236713409424, "logps/chosen": -551.5777587890625, "logps/rejected": -826.7698974609375, "loss": 0.1971, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.9602150917053223, "rewards/margins": 3.0024728775024414, "rewards/rejected": -5.9626874923706055, "step": 1260 }, { "epoch": 0.94, "grad_norm": 67.6695850405579, "learning_rate": 4.843299997394717e-09, "logits/chosen": 1.856507658958435, "logits/rejected": 2.7601516246795654, "logps/chosen": -540.268310546875, "logps/rejected": -846.9691162109375, "loss": 0.2067, "rewards/accuracies": 0.9375, "rewards/chosen": -3.077454090118408, "rewards/margins": 3.414836883544922, "rewards/rejected": -6.492290496826172, "step": 1270 }, { "epoch": 0.95, "grad_norm": 68.73319089653008, "learning_rate": 3.655511172643372e-09, "logits/chosen": 1.932074785232544, "logits/rejected": 2.437225818634033, "logps/chosen": -531.4140625, "logps/rejected": -836.9505615234375, "loss": 0.1876, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.8276994228363037, "rewards/margins": 3.25665020942688, "rewards/rejected": -6.084350109100342, "step": 1280 }, { "epoch": 0.96, "grad_norm": 50.423800165908794, "learning_rate": 2.633501514956532e-09, "logits/chosen": 1.9169034957885742, "logits/rejected": 2.7369441986083984, "logps/chosen": -586.8289794921875, "logps/rejected": -896.8014526367188, "loss": 0.2044, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -3.1295228004455566, "rewards/margins": 3.5232949256896973, "rewards/rejected": -6.652817726135254, "step": 1290 }, { "epoch": 0.97, "grad_norm": 57.31903342529662, "learning_rate": 1.777958792550993e-09, "logits/chosen": 1.5464543104171753, "logits/rejected": 2.9688878059387207, "logps/chosen": -587.2015380859375, "logps/rejected": -853.0357666015625, "loss": 0.1842, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -2.988502025604248, "rewards/margins": 3.156489372253418, "rewards/rejected": -6.144991397857666, "step": 1300 }, { "epoch": 0.97, "eval_logits/chosen": 0.9558575749397278, "eval_logits/rejected": 1.609464406967163, "eval_logps/chosen": -609.159423828125, "eval_logps/rejected": -767.4317016601562, "eval_loss": 0.5365558862686157, "eval_rewards/accuracies": 0.76171875, "eval_rewards/chosen": -2.9738292694091797, "eval_rewards/margins": 1.5252362489700317, "eval_rewards/rejected": -4.499065399169922, "eval_runtime": 97.3239, "eval_samples_per_second": 20.55, "eval_steps_per_second": 0.329, "step": 1300 }, { "epoch": 0.97, "grad_norm": 66.21886288694567, "learning_rate": 1.0894587486089125e-09, "logits/chosen": 1.8931999206542969, "logits/rejected": 2.824298858642578, "logps/chosen": -563.06201171875, "logps/rejected": -834.8709716796875, "loss": 0.2157, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.2370285987854004, "rewards/margins": 3.035515546798706, "rewards/rejected": -6.272543430328369, "step": 1310 }, { "epoch": 0.98, "grad_norm": 45.779926433395936, "learning_rate": 5.684647138277098e-10, "logits/chosen": 1.7055333852767944, "logits/rejected": 2.308079719543457, "logps/chosen": -531.0139770507812, "logps/rejected": -862.2609252929688, "loss": 0.1974, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -2.956573486328125, "rewards/margins": 3.375626802444458, "rewards/rejected": -6.332200050354004, "step": 1320 }, { "epoch": 0.99, "grad_norm": 58.05458328657747, "learning_rate": 2.153272946184559e-10, "logits/chosen": 1.735358476638794, "logits/rejected": 2.259385585784912, "logps/chosen": -585.9295043945312, "logps/rejected": -861.4645385742188, "loss": 0.1738, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -3.10073184967041, "rewards/margins": 2.996291399002075, "rewards/rejected": -6.097023010253906, "step": 1330 }, { "epoch": 1.0, "grad_norm": 46.42702960995785, "learning_rate": 3.0284137163189004e-11, "logits/chosen": 2.000138759613037, "logits/rejected": 2.7859671115875244, "logps/chosen": -530.1033935546875, "logps/rejected": -878.3465576171875, "loss": 0.1884, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.1844658851623535, "rewards/margins": 3.3884029388427734, "rewards/rejected": -6.572869300842285, "step": 1340 }, { "epoch": 1.0, "step": 1346, "total_flos": 0.0, "train_loss": 0.335402155391883, "train_runtime": 21644.3608, "train_samples_per_second": 7.959, "train_steps_per_second": 0.062 } ], "logging_steps": 10, "max_steps": 1346, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }