diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18200 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.999297541394882, + "eval_steps": 400, + "global_step": 5604, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002676032781401572, + "grad_norm": 6.1621459865713515, + "learning_rate": 8.9126559714795e-09, + "logits/chosen": -0.06070180982351303, + "logits/rejected": 0.14738903939723969, + "logps/chosen": -1.716059684753418, + "logps/rejected": -1.8892710208892822, + "loss": 1.0429, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.716059684753418, + "rewards/margins": 0.1732112467288971, + "rewards/rejected": -1.8892710208892822, + "semantic_entropy": 0.6584457159042358, + "step": 5 + }, + { + "epoch": 0.005352065562803144, + "grad_norm": 9.137033794779027, + "learning_rate": 1.7825311942959e-08, + "logits/chosen": -0.0036977827548980713, + "logits/rejected": 0.11409668624401093, + "logps/chosen": -1.8028045892715454, + "logps/rejected": -1.8464124202728271, + "loss": 1.1233, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.8028045892715454, + "rewards/margins": 0.0436079278588295, + "rewards/rejected": -1.8464124202728271, + "semantic_entropy": 0.6394152641296387, + "step": 10 + }, + { + "epoch": 0.008028098344204716, + "grad_norm": 9.22389226014171, + "learning_rate": 2.67379679144385e-08, + "logits/chosen": -0.029309600591659546, + "logits/rejected": 0.06751412898302078, + "logps/chosen": -1.6355518102645874, + "logps/rejected": -1.7657592296600342, + "loss": 1.1344, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.6355518102645874, + "rewards/margins": 0.13020756840705872, + "rewards/rejected": -1.7657592296600342, + "semantic_entropy": 0.6930069923400879, + "step": 15 + }, + { + "epoch": 0.010704131125606288, + "grad_norm": 6.704632465419751, + "learning_rate": 3.5650623885918e-08, + "logits/chosen": -0.03660174086689949, + "logits/rejected": 0.049360670149326324, + "logps/chosen": -1.724509596824646, + "logps/rejected": -1.8065202236175537, + "loss": 1.145, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.724509596824646, + "rewards/margins": 0.08201076835393906, + "rewards/rejected": -1.8065202236175537, + "semantic_entropy": 0.6685421466827393, + "step": 20 + }, + { + "epoch": 0.013380163907007862, + "grad_norm": 13.950567091423647, + "learning_rate": 4.45632798573975e-08, + "logits/chosen": -0.04136265441775322, + "logits/rejected": 0.044629622250795364, + "logps/chosen": -1.869329810142517, + "logps/rejected": -1.7786051034927368, + "loss": 1.2712, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.869329810142517, + "rewards/margins": -0.09072484076023102, + "rewards/rejected": -1.7786051034927368, + "semantic_entropy": 0.6433960795402527, + "step": 25 + }, + { + "epoch": 0.016056196688409432, + "grad_norm": 7.520127719976578, + "learning_rate": 5.3475935828877e-08, + "logits/chosen": -0.07225209474563599, + "logits/rejected": 0.020951146259903908, + "logps/chosen": -1.9089466333389282, + "logps/rejected": -1.832271933555603, + "loss": 1.1721, + "rewards/accuracies": 0.4437499940395355, + "rewards/chosen": -1.9089466333389282, + "rewards/margins": -0.07667465507984161, + "rewards/rejected": -1.832271933555603, + "semantic_entropy": 0.6176777482032776, + "step": 30 + }, + { + "epoch": 0.018732229469811006, + "grad_norm": 8.288075347283838, + "learning_rate": 6.23885918003565e-08, + "logits/chosen": -0.05746116489171982, + "logits/rejected": 0.10160557925701141, + "logps/chosen": -1.845741629600525, + "logps/rejected": -1.9970605373382568, + "loss": 1.1629, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.845741629600525, + "rewards/margins": 0.1513189673423767, + "rewards/rejected": -1.9970605373382568, + "semantic_entropy": 0.6350187063217163, + "step": 35 + }, + { + "epoch": 0.021408262251212576, + "grad_norm": 7.5458186716671465, + "learning_rate": 7.1301247771836e-08, + "logits/chosen": 0.05770735815167427, + "logits/rejected": 0.23583391308784485, + "logps/chosen": -1.880816102027893, + "logps/rejected": -1.743043303489685, + "loss": 1.2132, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -1.880816102027893, + "rewards/margins": -0.1377728283405304, + "rewards/rejected": -1.743043303489685, + "semantic_entropy": 0.6431102752685547, + "step": 40 + }, + { + "epoch": 0.02408429503261415, + "grad_norm": 12.928036650171752, + "learning_rate": 8.021390374331551e-08, + "logits/chosen": 0.049303993582725525, + "logits/rejected": 0.25262051820755005, + "logps/chosen": -1.837459921836853, + "logps/rejected": -1.8713966608047485, + "loss": 1.1798, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.837459921836853, + "rewards/margins": 0.03393695876002312, + "rewards/rejected": -1.8713966608047485, + "semantic_entropy": 0.649166464805603, + "step": 45 + }, + { + "epoch": 0.026760327814015723, + "grad_norm": 10.160669036683966, + "learning_rate": 8.9126559714795e-08, + "logits/chosen": -0.027670959010720253, + "logits/rejected": 0.1239209994673729, + "logps/chosen": -1.8993823528289795, + "logps/rejected": -1.7789846658706665, + "loss": 1.2256, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.8993823528289795, + "rewards/margins": -0.1203979030251503, + "rewards/rejected": -1.7789846658706665, + "semantic_entropy": 0.6335883140563965, + "step": 50 + }, + { + "epoch": 0.029436360595417294, + "grad_norm": 7.047012193835533, + "learning_rate": 9.80392156862745e-08, + "logits/chosen": -0.10063391923904419, + "logits/rejected": 0.12058229744434357, + "logps/chosen": -1.8336282968521118, + "logps/rejected": -1.8673959970474243, + "loss": 1.1935, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.8336282968521118, + "rewards/margins": 0.03376791998744011, + "rewards/rejected": -1.8673959970474243, + "semantic_entropy": 0.6438094973564148, + "step": 55 + }, + { + "epoch": 0.032112393376818864, + "grad_norm": 7.199053435790905, + "learning_rate": 1.06951871657754e-07, + "logits/chosen": -0.08423934876918793, + "logits/rejected": 0.10448728501796722, + "logps/chosen": -1.789345145225525, + "logps/rejected": -1.894176721572876, + "loss": 1.1008, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.789345145225525, + "rewards/margins": 0.10483156144618988, + "rewards/rejected": -1.894176721572876, + "semantic_entropy": 0.6360429525375366, + "step": 60 + }, + { + "epoch": 0.03478842615822044, + "grad_norm": 5.878839162191842, + "learning_rate": 1.158645276292335e-07, + "logits/chosen": -0.04232923686504364, + "logits/rejected": 0.10366680473089218, + "logps/chosen": -1.6381199359893799, + "logps/rejected": -1.7684608697891235, + "loss": 1.0888, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.6381199359893799, + "rewards/margins": 0.13034099340438843, + "rewards/rejected": -1.7684608697891235, + "semantic_entropy": 0.6962206959724426, + "step": 65 + }, + { + "epoch": 0.03746445893962201, + "grad_norm": 11.097796193507412, + "learning_rate": 1.24777183600713e-07, + "logits/chosen": -0.07627397030591965, + "logits/rejected": 0.07312844693660736, + "logps/chosen": -1.766296148300171, + "logps/rejected": -1.8135309219360352, + "loss": 1.1905, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -1.766296148300171, + "rewards/margins": 0.047234609723091125, + "rewards/rejected": -1.8135309219360352, + "semantic_entropy": 0.6539437770843506, + "step": 70 + }, + { + "epoch": 0.04014049172102358, + "grad_norm": 11.180823699806128, + "learning_rate": 1.3368983957219251e-07, + "logits/chosen": -0.043935492634773254, + "logits/rejected": 0.1390921175479889, + "logps/chosen": -1.7772403955459595, + "logps/rejected": -2.038160562515259, + "loss": 1.0594, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.7772403955459595, + "rewards/margins": 0.2609199583530426, + "rewards/rejected": -2.038160562515259, + "semantic_entropy": 0.6338866353034973, + "step": 75 + }, + { + "epoch": 0.04281652450242515, + "grad_norm": 7.729614400854603, + "learning_rate": 1.42602495543672e-07, + "logits/chosen": 0.009521784260869026, + "logits/rejected": 0.11359156668186188, + "logps/chosen": -1.7183939218521118, + "logps/rejected": -1.7508172988891602, + "loss": 1.1522, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.7183939218521118, + "rewards/margins": 0.0324234738945961, + "rewards/rejected": -1.7508172988891602, + "semantic_entropy": 0.6691663265228271, + "step": 80 + }, + { + "epoch": 0.04549255728382673, + "grad_norm": 5.774164895526498, + "learning_rate": 1.5151515151515152e-07, + "logits/chosen": -0.16618943214416504, + "logits/rejected": 0.07412171363830566, + "logps/chosen": -1.7912899255752563, + "logps/rejected": -1.9684991836547852, + "loss": 1.1099, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.7912899255752563, + "rewards/margins": 0.17720915377140045, + "rewards/rejected": -1.9684991836547852, + "semantic_entropy": 0.6479779481887817, + "step": 85 + }, + { + "epoch": 0.0481685900652283, + "grad_norm": 13.994171190985876, + "learning_rate": 1.6042780748663102e-07, + "logits/chosen": 0.08484308421611786, + "logits/rejected": 0.04691457375884056, + "logps/chosen": -1.750454306602478, + "logps/rejected": -1.7775003910064697, + "loss": 1.1925, + "rewards/accuracies": 0.46875, + "rewards/chosen": -1.750454306602478, + "rewards/margins": 0.027046024799346924, + "rewards/rejected": -1.7775003910064697, + "semantic_entropy": 0.668484091758728, + "step": 90 + }, + { + "epoch": 0.05084462284662987, + "grad_norm": 5.179734454416302, + "learning_rate": 1.693404634581105e-07, + "logits/chosen": -0.0784115418791771, + "logits/rejected": 0.06837181746959686, + "logps/chosen": -1.805314302444458, + "logps/rejected": -1.9120498895645142, + "loss": 1.1394, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.805314302444458, + "rewards/margins": 0.10673556476831436, + "rewards/rejected": -1.9120498895645142, + "semantic_entropy": 0.6409928202629089, + "step": 95 + }, + { + "epoch": 0.05352065562803145, + "grad_norm": 6.303816361495141, + "learning_rate": 1.7825311942959e-07, + "logits/chosen": -0.0438729003071785, + "logits/rejected": 0.01818550005555153, + "logps/chosen": -1.6925382614135742, + "logps/rejected": -1.8010832071304321, + "loss": 1.104, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.6925382614135742, + "rewards/margins": 0.10854510962963104, + "rewards/rejected": -1.8010832071304321, + "semantic_entropy": 0.6733208298683167, + "step": 100 + }, + { + "epoch": 0.05619668840943302, + "grad_norm": 8.250362407815302, + "learning_rate": 1.8716577540106952e-07, + "logits/chosen": 0.04747066646814346, + "logits/rejected": 0.07233314961194992, + "logps/chosen": -1.6426517963409424, + "logps/rejected": -1.8100935220718384, + "loss": 1.0833, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.6426517963409424, + "rewards/margins": 0.16744166612625122, + "rewards/rejected": -1.8100935220718384, + "semantic_entropy": 0.6844531297683716, + "step": 105 + }, + { + "epoch": 0.05887272119083459, + "grad_norm": 6.750093995633937, + "learning_rate": 1.96078431372549e-07, + "logits/chosen": 0.0031594126485288143, + "logits/rejected": 0.09811054170131683, + "logps/chosen": -1.6814390420913696, + "logps/rejected": -1.7384357452392578, + "loss": 1.1586, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.6814390420913696, + "rewards/margins": 0.05699686333537102, + "rewards/rejected": -1.7384357452392578, + "semantic_entropy": 0.6790895462036133, + "step": 110 + }, + { + "epoch": 0.06154875397223616, + "grad_norm": 8.991359972391313, + "learning_rate": 2.049910873440285e-07, + "logits/chosen": 0.024249624460935593, + "logits/rejected": 0.23187024891376495, + "logps/chosen": -1.6709773540496826, + "logps/rejected": -1.9569326639175415, + "loss": 1.0366, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.6709773540496826, + "rewards/margins": 0.28595516085624695, + "rewards/rejected": -1.9569326639175415, + "semantic_entropy": 0.6562684178352356, + "step": 115 + }, + { + "epoch": 0.06422478675363773, + "grad_norm": 5.808107193049869, + "learning_rate": 2.13903743315508e-07, + "logits/chosen": -0.07225940376520157, + "logits/rejected": 0.10119612514972687, + "logps/chosen": -1.7596435546875, + "logps/rejected": -1.8809823989868164, + "loss": 1.0991, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.7596435546875, + "rewards/margins": 0.12133894115686417, + "rewards/rejected": -1.8809823989868164, + "semantic_entropy": 0.6526800990104675, + "step": 120 + }, + { + "epoch": 0.0669008195350393, + "grad_norm": 6.833917796547652, + "learning_rate": 2.2281639928698751e-07, + "logits/chosen": -0.07721801102161407, + "logits/rejected": 0.05250721424818039, + "logps/chosen": -1.6813856363296509, + "logps/rejected": -1.6302525997161865, + "loss": 1.1922, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.6813856363296509, + "rewards/margins": -0.051132846623659134, + "rewards/rejected": -1.6302525997161865, + "semantic_entropy": 0.6917638778686523, + "step": 125 + }, + { + "epoch": 0.06957685231644088, + "grad_norm": 8.414614080726308, + "learning_rate": 2.31729055258467e-07, + "logits/chosen": 0.02672005072236061, + "logits/rejected": 0.15939494967460632, + "logps/chosen": -1.7269341945648193, + "logps/rejected": -1.847845435142517, + "loss": 1.0565, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.7269341945648193, + "rewards/margins": 0.12091119587421417, + "rewards/rejected": -1.847845435142517, + "semantic_entropy": 0.6518223881721497, + "step": 130 + }, + { + "epoch": 0.07225288509784245, + "grad_norm": 15.502747011489308, + "learning_rate": 2.406417112299465e-07, + "logits/chosen": -0.045921992510557175, + "logits/rejected": 0.06921950727701187, + "logps/chosen": -1.7797927856445312, + "logps/rejected": -1.7936254739761353, + "loss": 1.1683, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.7797927856445312, + "rewards/margins": 0.013832822442054749, + "rewards/rejected": -1.7936254739761353, + "semantic_entropy": 0.6467072367668152, + "step": 135 + }, + { + "epoch": 0.07492891787924402, + "grad_norm": 10.934153827167037, + "learning_rate": 2.49554367201426e-07, + "logits/chosen": -0.029972439631819725, + "logits/rejected": 0.13952571153640747, + "logps/chosen": -1.7365680932998657, + "logps/rejected": -1.8929036855697632, + "loss": 1.0664, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.7365680932998657, + "rewards/margins": 0.15633563697338104, + "rewards/rejected": -1.8929036855697632, + "semantic_entropy": 0.6409581899642944, + "step": 140 + }, + { + "epoch": 0.0776049506606456, + "grad_norm": 9.72307827733719, + "learning_rate": 2.5846702317290554e-07, + "logits/chosen": -0.0310811810195446, + "logits/rejected": 0.11993386596441269, + "logps/chosen": -1.651533842086792, + "logps/rejected": -1.7689011096954346, + "loss": 1.0902, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -1.651533842086792, + "rewards/margins": 0.1173669844865799, + "rewards/rejected": -1.7689011096954346, + "semantic_entropy": 0.6721662282943726, + "step": 145 + }, + { + "epoch": 0.08028098344204716, + "grad_norm": 10.946409292553673, + "learning_rate": 2.6737967914438503e-07, + "logits/chosen": -0.06234356015920639, + "logits/rejected": 0.0975189134478569, + "logps/chosen": -1.6039183139801025, + "logps/rejected": -1.6026118993759155, + "loss": 1.1726, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.6039183139801025, + "rewards/margins": -0.0013063341611996293, + "rewards/rejected": -1.6026118993759155, + "semantic_entropy": 0.7188035249710083, + "step": 150 + }, + { + "epoch": 0.08295701622344874, + "grad_norm": 8.966979965135215, + "learning_rate": 2.762923351158645e-07, + "logits/chosen": -0.053622614592313766, + "logits/rejected": -0.006728078238666058, + "logps/chosen": -1.6239417791366577, + "logps/rejected": -1.7132408618927002, + "loss": 1.1056, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.6239417791366577, + "rewards/margins": 0.0892990455031395, + "rewards/rejected": -1.7132408618927002, + "semantic_entropy": 0.6902952790260315, + "step": 155 + }, + { + "epoch": 0.0856330490048503, + "grad_norm": 7.433737051457635, + "learning_rate": 2.85204991087344e-07, + "logits/chosen": -0.1430046111345291, + "logits/rejected": -0.0034906647633761168, + "logps/chosen": -1.7533985376358032, + "logps/rejected": -1.7312949895858765, + "loss": 1.1909, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.7533985376358032, + "rewards/margins": -0.022103413939476013, + "rewards/rejected": -1.7312949895858765, + "semantic_entropy": 0.6651071310043335, + "step": 160 + }, + { + "epoch": 0.08830908178625188, + "grad_norm": 8.024498031822322, + "learning_rate": 2.941176470588235e-07, + "logits/chosen": -0.05941913276910782, + "logits/rejected": 0.11122976243495941, + "logps/chosen": -1.5744872093200684, + "logps/rejected": -1.7276694774627686, + "loss": 1.1042, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.5744872093200684, + "rewards/margins": 0.1531822681427002, + "rewards/rejected": -1.7276694774627686, + "semantic_entropy": 0.6984173059463501, + "step": 165 + }, + { + "epoch": 0.09098511456765346, + "grad_norm": 12.469922507489153, + "learning_rate": 3.0303030303030305e-07, + "logits/chosen": -0.0928221344947815, + "logits/rejected": -0.041338033974170685, + "logps/chosen": -1.7328227758407593, + "logps/rejected": -1.7776778936386108, + "loss": 1.1493, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.7328227758407593, + "rewards/margins": 0.04485485702753067, + "rewards/rejected": -1.7776778936386108, + "semantic_entropy": 0.6588774919509888, + "step": 170 + }, + { + "epoch": 0.09366114734905502, + "grad_norm": 10.037232387665194, + "learning_rate": 3.1194295900178254e-07, + "logits/chosen": 0.05204933136701584, + "logits/rejected": 0.04736893251538277, + "logps/chosen": -1.6078169345855713, + "logps/rejected": -1.7068147659301758, + "loss": 1.135, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6078169345855713, + "rewards/margins": 0.09899773448705673, + "rewards/rejected": -1.7068147659301758, + "semantic_entropy": 0.6981975436210632, + "step": 175 + }, + { + "epoch": 0.0963371801304566, + "grad_norm": 8.220031140397653, + "learning_rate": 3.2085561497326203e-07, + "logits/chosen": 0.015208420343697071, + "logits/rejected": 0.01307359803467989, + "logps/chosen": -1.633329153060913, + "logps/rejected": -1.777130365371704, + "loss": 1.1206, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.633329153060913, + "rewards/margins": 0.14380115270614624, + "rewards/rejected": -1.777130365371704, + "semantic_entropy": 0.6907540559768677, + "step": 180 + }, + { + "epoch": 0.09901321291185818, + "grad_norm": 8.44277458445029, + "learning_rate": 3.297682709447415e-07, + "logits/chosen": -0.12780144810676575, + "logits/rejected": -0.040175847709178925, + "logps/chosen": -1.5936999320983887, + "logps/rejected": -1.6630268096923828, + "loss": 1.1614, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.5936999320983887, + "rewards/margins": 0.06932689249515533, + "rewards/rejected": -1.6630268096923828, + "semantic_entropy": 0.7057312726974487, + "step": 185 + }, + { + "epoch": 0.10168924569325974, + "grad_norm": 8.214598410169415, + "learning_rate": 3.38680926916221e-07, + "logits/chosen": -0.06513194739818573, + "logits/rejected": 0.05364646762609482, + "logps/chosen": -1.6519644260406494, + "logps/rejected": -1.7357133626937866, + "loss": 1.0944, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.6519644260406494, + "rewards/margins": 0.08374904841184616, + "rewards/rejected": -1.7357133626937866, + "semantic_entropy": 0.6681785583496094, + "step": 190 + }, + { + "epoch": 0.10436527847466132, + "grad_norm": 5.81056993834838, + "learning_rate": 3.475935828877005e-07, + "logits/chosen": 0.021021168678998947, + "logits/rejected": 0.17753520607948303, + "logps/chosen": -1.4594143629074097, + "logps/rejected": -1.6115144491195679, + "loss": 1.0913, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.4594143629074097, + "rewards/margins": 0.15210004150867462, + "rewards/rejected": -1.6115144491195679, + "semantic_entropy": 0.7464505434036255, + "step": 195 + }, + { + "epoch": 0.1070413112560629, + "grad_norm": 12.873598411605691, + "learning_rate": 3.5650623885918e-07, + "logits/chosen": -0.06861015409231186, + "logits/rejected": 0.07104112207889557, + "logps/chosen": -1.5893186330795288, + "logps/rejected": -1.601284384727478, + "loss": 1.1435, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.5893186330795288, + "rewards/margins": 0.011965674348175526, + "rewards/rejected": -1.601284384727478, + "semantic_entropy": 0.7116156816482544, + "step": 200 + }, + { + "epoch": 0.10971734403746446, + "grad_norm": 13.584024992116543, + "learning_rate": 3.654188948306595e-07, + "logits/chosen": -0.05508657544851303, + "logits/rejected": 0.08913681656122208, + "logps/chosen": -1.518781065940857, + "logps/rejected": -1.5591602325439453, + "loss": 1.1321, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.518781065940857, + "rewards/margins": 0.04037924110889435, + "rewards/rejected": -1.5591602325439453, + "semantic_entropy": 0.7272243499755859, + "step": 205 + }, + { + "epoch": 0.11239337681886603, + "grad_norm": 12.81406377653034, + "learning_rate": 3.7433155080213904e-07, + "logits/chosen": -0.135735422372818, + "logits/rejected": 0.05884036421775818, + "logps/chosen": -1.5534254312515259, + "logps/rejected": -1.7356328964233398, + "loss": 1.0671, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.5534254312515259, + "rewards/margins": 0.1822076290845871, + "rewards/rejected": -1.7356328964233398, + "semantic_entropy": 0.7095759510993958, + "step": 210 + }, + { + "epoch": 0.1150694096002676, + "grad_norm": 7.357321246011275, + "learning_rate": 3.8324420677361853e-07, + "logits/chosen": -0.18377165496349335, + "logits/rejected": 0.060975439846515656, + "logps/chosen": -1.5331767797470093, + "logps/rejected": -1.627393126487732, + "loss": 1.0754, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.5331767797470093, + "rewards/margins": 0.09421636164188385, + "rewards/rejected": -1.627393126487732, + "semantic_entropy": 0.7281553149223328, + "step": 215 + }, + { + "epoch": 0.11774544238166917, + "grad_norm": 16.502921098288432, + "learning_rate": 3.92156862745098e-07, + "logits/chosen": 0.04538556560873985, + "logits/rejected": 0.14347299933433533, + "logps/chosen": -1.5174219608306885, + "logps/rejected": -1.7298427820205688, + "loss": 1.0482, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.5174219608306885, + "rewards/margins": 0.21242070198059082, + "rewards/rejected": -1.7298427820205688, + "semantic_entropy": 0.7120253443717957, + "step": 220 + }, + { + "epoch": 0.12042147516307075, + "grad_norm": 6.370433761880968, + "learning_rate": 4.010695187165775e-07, + "logits/chosen": -0.0893501490354538, + "logits/rejected": 0.08341099321842194, + "logps/chosen": -1.4963688850402832, + "logps/rejected": -1.64535391330719, + "loss": 1.0625, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.4963688850402832, + "rewards/margins": 0.14898499846458435, + "rewards/rejected": -1.64535391330719, + "semantic_entropy": 0.7224361300468445, + "step": 225 + }, + { + "epoch": 0.12309750794447231, + "grad_norm": 6.037012104027165, + "learning_rate": 4.09982174688057e-07, + "logits/chosen": -0.012658950872719288, + "logits/rejected": 0.06327076256275177, + "logps/chosen": -1.5707639455795288, + "logps/rejected": -1.736476182937622, + "loss": 1.0823, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.5707639455795288, + "rewards/margins": 0.16571208834648132, + "rewards/rejected": -1.736476182937622, + "semantic_entropy": 0.7066926956176758, + "step": 230 + }, + { + "epoch": 0.1257735407258739, + "grad_norm": 10.864711575015095, + "learning_rate": 4.188948306595365e-07, + "logits/chosen": 0.02206835150718689, + "logits/rejected": 0.1628737896680832, + "logps/chosen": -1.5150396823883057, + "logps/rejected": -1.692229986190796, + "loss": 1.0396, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5150396823883057, + "rewards/margins": 0.17719021439552307, + "rewards/rejected": -1.692229986190796, + "semantic_entropy": 0.7193215489387512, + "step": 235 + }, + { + "epoch": 0.12844957350727546, + "grad_norm": 7.074774903340499, + "learning_rate": 4.27807486631016e-07, + "logits/chosen": -0.02384335733950138, + "logits/rejected": 0.1055992841720581, + "logps/chosen": -1.5265928506851196, + "logps/rejected": -1.719167947769165, + "loss": 1.0547, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.5265928506851196, + "rewards/margins": 0.19257517158985138, + "rewards/rejected": -1.719167947769165, + "semantic_entropy": 0.7045563459396362, + "step": 240 + }, + { + "epoch": 0.13112560628867703, + "grad_norm": 7.662272971348944, + "learning_rate": 4.3672014260249554e-07, + "logits/chosen": 0.036410488188266754, + "logits/rejected": 0.1562315970659256, + "logps/chosen": -1.6148598194122314, + "logps/rejected": -1.7143230438232422, + "loss": 1.0858, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.6148598194122314, + "rewards/margins": 0.09946312010288239, + "rewards/rejected": -1.7143230438232422, + "semantic_entropy": 0.6894456744194031, + "step": 245 + }, + { + "epoch": 0.1338016390700786, + "grad_norm": 11.864125002258264, + "learning_rate": 4.4563279857397503e-07, + "logits/chosen": -0.048241887241601944, + "logits/rejected": 0.11782636493444443, + "logps/chosen": -1.6349153518676758, + "logps/rejected": -1.6976579427719116, + "loss": 1.1406, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.6349153518676758, + "rewards/margins": 0.06274263560771942, + "rewards/rejected": -1.6976579427719116, + "semantic_entropy": 0.6956798434257507, + "step": 250 + }, + { + "epoch": 0.1364776718514802, + "grad_norm": 7.796195965000263, + "learning_rate": 4.545454545454545e-07, + "logits/chosen": -0.020806463435292244, + "logits/rejected": 0.12357542663812637, + "logps/chosen": -1.4866034984588623, + "logps/rejected": -1.6658039093017578, + "loss": 1.056, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.4866034984588623, + "rewards/margins": 0.17920050024986267, + "rewards/rejected": -1.6658039093017578, + "semantic_entropy": 0.7214481830596924, + "step": 255 + }, + { + "epoch": 0.13915370463288176, + "grad_norm": 7.563331790101729, + "learning_rate": 4.63458110516934e-07, + "logits/chosen": -0.21241986751556396, + "logits/rejected": -0.10534496605396271, + "logps/chosen": -1.653738260269165, + "logps/rejected": -1.7724952697753906, + "loss": 1.0457, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.653738260269165, + "rewards/margins": 0.1187569871544838, + "rewards/rejected": -1.7724952697753906, + "semantic_entropy": 0.6744239330291748, + "step": 260 + }, + { + "epoch": 0.1418297374142833, + "grad_norm": 16.160498491276236, + "learning_rate": 4.723707664884135e-07, + "logits/chosen": -0.07287711650133133, + "logits/rejected": 0.009031775407493114, + "logps/chosen": -1.675157904624939, + "logps/rejected": -1.7961061000823975, + "loss": 1.0988, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.675157904624939, + "rewards/margins": 0.12094844877719879, + "rewards/rejected": -1.7961061000823975, + "semantic_entropy": 0.6546419262886047, + "step": 265 + }, + { + "epoch": 0.1445057701956849, + "grad_norm": 5.654659760480054, + "learning_rate": 4.81283422459893e-07, + "logits/chosen": -0.08118149638175964, + "logits/rejected": 0.0527944378554821, + "logps/chosen": -1.5596070289611816, + "logps/rejected": -1.6867377758026123, + "loss": 1.0573, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.5596070289611816, + "rewards/margins": 0.12713071703910828, + "rewards/rejected": -1.6867377758026123, + "semantic_entropy": 0.6952215433120728, + "step": 270 + }, + { + "epoch": 0.14718180297708647, + "grad_norm": 7.923835022429222, + "learning_rate": 4.901960784313725e-07, + "logits/chosen": -0.02389593794941902, + "logits/rejected": 0.07080022990703583, + "logps/chosen": -1.520407795906067, + "logps/rejected": -1.754201889038086, + "loss": 1.0602, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.520407795906067, + "rewards/margins": 0.23379412293434143, + "rewards/rejected": -1.754201889038086, + "semantic_entropy": 0.7215244770050049, + "step": 275 + }, + { + "epoch": 0.14985783575848804, + "grad_norm": 14.712512665656517, + "learning_rate": 4.99108734402852e-07, + "logits/chosen": -0.10531296581029892, + "logits/rejected": 0.0524088516831398, + "logps/chosen": -1.6230707168579102, + "logps/rejected": -1.7438873052597046, + "loss": 1.0823, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.6230707168579102, + "rewards/margins": 0.12081663310527802, + "rewards/rejected": -1.7438873052597046, + "semantic_entropy": 0.679011344909668, + "step": 280 + }, + { + "epoch": 0.15253386853988962, + "grad_norm": 6.69243253514015, + "learning_rate": 5.080213903743315e-07, + "logits/chosen": -0.0718010812997818, + "logits/rejected": 0.06791789084672928, + "logps/chosen": -1.597745656967163, + "logps/rejected": -1.7144542932510376, + "loss": 1.106, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.597745656967163, + "rewards/margins": 0.11670851707458496, + "rewards/rejected": -1.7144542932510376, + "semantic_entropy": 0.6957116723060608, + "step": 285 + }, + { + "epoch": 0.1552099013212912, + "grad_norm": 7.7782211257781055, + "learning_rate": 5.169340463458111e-07, + "logits/chosen": -0.11950629949569702, + "logits/rejected": 0.17797723412513733, + "logps/chosen": -1.5724703073501587, + "logps/rejected": -1.766248345375061, + "loss": 1.0197, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.5724703073501587, + "rewards/margins": 0.1937781125307083, + "rewards/rejected": -1.766248345375061, + "semantic_entropy": 0.6906196475028992, + "step": 290 + }, + { + "epoch": 0.15788593410269275, + "grad_norm": 11.79027110371697, + "learning_rate": 5.258467023172905e-07, + "logits/chosen": -0.04648825526237488, + "logits/rejected": 0.012335294857621193, + "logps/chosen": -1.5410258769989014, + "logps/rejected": -1.663569688796997, + "loss": 1.063, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.5410258769989014, + "rewards/margins": 0.12254378944635391, + "rewards/rejected": -1.663569688796997, + "semantic_entropy": 0.7017509341239929, + "step": 295 + }, + { + "epoch": 0.16056196688409433, + "grad_norm": 8.835849573199795, + "learning_rate": 5.347593582887701e-07, + "logits/chosen": -0.07654620707035065, + "logits/rejected": 0.09046686440706253, + "logps/chosen": -1.5949294567108154, + "logps/rejected": -1.71002197265625, + "loss": 1.0829, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.5949294567108154, + "rewards/margins": 0.11509259045124054, + "rewards/rejected": -1.71002197265625, + "semantic_entropy": 0.6955488920211792, + "step": 300 + }, + { + "epoch": 0.1632379996654959, + "grad_norm": 6.983366890384154, + "learning_rate": 5.436720142602496e-07, + "logits/chosen": -0.02877117693424225, + "logits/rejected": 0.04091879725456238, + "logps/chosen": -1.6932131052017212, + "logps/rejected": -1.6878843307495117, + "loss": 1.1315, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -1.6932131052017212, + "rewards/margins": -0.005328828003257513, + "rewards/rejected": -1.6878843307495117, + "semantic_entropy": 0.6642959713935852, + "step": 305 + }, + { + "epoch": 0.16591403244689748, + "grad_norm": 9.615112381569704, + "learning_rate": 5.52584670231729e-07, + "logits/chosen": -0.2014521062374115, + "logits/rejected": -0.11008793115615845, + "logps/chosen": -1.6740272045135498, + "logps/rejected": -1.7929702997207642, + "loss": 1.0864, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.6740272045135498, + "rewards/margins": 0.11894307285547256, + "rewards/rejected": -1.7929702997207642, + "semantic_entropy": 0.6629990339279175, + "step": 310 + }, + { + "epoch": 0.16859006522829906, + "grad_norm": 12.292656026710143, + "learning_rate": 5.614973262032086e-07, + "logits/chosen": -0.01841166988015175, + "logits/rejected": 0.14304211735725403, + "logps/chosen": -1.6711599826812744, + "logps/rejected": -1.8580175638198853, + "loss": 1.0557, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.6711599826812744, + "rewards/margins": 0.186857670545578, + "rewards/rejected": -1.8580175638198853, + "semantic_entropy": 0.6592516899108887, + "step": 315 + }, + { + "epoch": 0.1712660980097006, + "grad_norm": 6.646670707784385, + "learning_rate": 5.70409982174688e-07, + "logits/chosen": -0.059432487934827805, + "logits/rejected": 0.0704459697008133, + "logps/chosen": -1.5968337059020996, + "logps/rejected": -1.6530053615570068, + "loss": 1.0909, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.5968337059020996, + "rewards/margins": 0.05617170408368111, + "rewards/rejected": -1.6530053615570068, + "semantic_entropy": 0.6955806612968445, + "step": 320 + }, + { + "epoch": 0.17394213079110218, + "grad_norm": 9.457657582105599, + "learning_rate": 5.793226381461676e-07, + "logits/chosen": -0.10694797337055206, + "logits/rejected": 0.012209171429276466, + "logps/chosen": -1.6328935623168945, + "logps/rejected": -1.9439910650253296, + "loss": 1.0218, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.6328935623168945, + "rewards/margins": 0.3110976219177246, + "rewards/rejected": -1.9439910650253296, + "semantic_entropy": 0.6683646440505981, + "step": 325 + }, + { + "epoch": 0.17661816357250376, + "grad_norm": 12.926849465905892, + "learning_rate": 5.88235294117647e-07, + "logits/chosen": -0.005584185477346182, + "logits/rejected": 0.14117801189422607, + "logps/chosen": -1.6041080951690674, + "logps/rejected": -1.904754638671875, + "loss": 0.995, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.6041080951690674, + "rewards/margins": 0.30064669251441956, + "rewards/rejected": -1.904754638671875, + "semantic_entropy": 0.6687676906585693, + "step": 330 + }, + { + "epoch": 0.17929419635390534, + "grad_norm": 20.510474921917158, + "learning_rate": 5.971479500891266e-07, + "logits/chosen": 0.04617486149072647, + "logits/rejected": 0.151122584939003, + "logps/chosen": -1.688357949256897, + "logps/rejected": -1.7622886896133423, + "loss": 1.0828, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.688357949256897, + "rewards/margins": 0.07393099367618561, + "rewards/rejected": -1.7622886896133423, + "semantic_entropy": 0.66343754529953, + "step": 335 + }, + { + "epoch": 0.18197022913530692, + "grad_norm": 17.51474938670172, + "learning_rate": 6.060606060606061e-07, + "logits/chosen": -0.02019418589770794, + "logits/rejected": 0.12733808159828186, + "logps/chosen": -1.763425588607788, + "logps/rejected": -1.875178575515747, + "loss": 1.1057, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.763425588607788, + "rewards/margins": 0.11175310611724854, + "rewards/rejected": -1.875178575515747, + "semantic_entropy": 0.6318264007568359, + "step": 340 + }, + { + "epoch": 0.1846462619167085, + "grad_norm": 11.458477109078752, + "learning_rate": 6.149732620320855e-07, + "logits/chosen": 0.050300829112529755, + "logits/rejected": 0.08116074651479721, + "logps/chosen": -1.6531829833984375, + "logps/rejected": -1.8452335596084595, + "loss": 1.0593, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.6531829833984375, + "rewards/margins": 0.1920507401227951, + "rewards/rejected": -1.8452335596084595, + "semantic_entropy": 0.6621311902999878, + "step": 345 + }, + { + "epoch": 0.18732229469811004, + "grad_norm": 11.329624892763588, + "learning_rate": 6.238859180035651e-07, + "logits/chosen": 0.015283575281500816, + "logits/rejected": 0.1078251451253891, + "logps/chosen": -1.609297752380371, + "logps/rejected": -1.7397960424423218, + "loss": 1.0848, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.609297752380371, + "rewards/margins": 0.13049837946891785, + "rewards/rejected": -1.7397960424423218, + "semantic_entropy": 0.6860819458961487, + "step": 350 + }, + { + "epoch": 0.18999832747951162, + "grad_norm": 9.749373970268978, + "learning_rate": 6.327985739750445e-07, + "logits/chosen": -0.07602973282337189, + "logits/rejected": 0.14425484836101532, + "logps/chosen": -1.6898406744003296, + "logps/rejected": -1.7698333263397217, + "loss": 1.0882, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.6898406744003296, + "rewards/margins": 0.07999298721551895, + "rewards/rejected": -1.7698333263397217, + "semantic_entropy": 0.6580514311790466, + "step": 355 + }, + { + "epoch": 0.1926743602609132, + "grad_norm": 9.515555465754796, + "learning_rate": 6.417112299465241e-07, + "logits/chosen": -0.05237163230776787, + "logits/rejected": 0.025818347930908203, + "logps/chosen": -1.6769046783447266, + "logps/rejected": -1.8567724227905273, + "loss": 1.0509, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.6769046783447266, + "rewards/margins": 0.17986764013767242, + "rewards/rejected": -1.8567724227905273, + "semantic_entropy": 0.6746851205825806, + "step": 360 + }, + { + "epoch": 0.19535039304231477, + "grad_norm": 15.086499304257604, + "learning_rate": 6.506238859180035e-07, + "logits/chosen": 0.019944345578551292, + "logits/rejected": 0.10378506034612656, + "logps/chosen": -1.6250860691070557, + "logps/rejected": -1.7133582830429077, + "loss": 1.0988, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.6250860691070557, + "rewards/margins": 0.08827227354049683, + "rewards/rejected": -1.7133582830429077, + "semantic_entropy": 0.6908445358276367, + "step": 365 + }, + { + "epoch": 0.19802642582371635, + "grad_norm": 13.830072743833517, + "learning_rate": 6.59536541889483e-07, + "logits/chosen": -0.03349882736802101, + "logits/rejected": 0.05777007341384888, + "logps/chosen": -1.6009029150009155, + "logps/rejected": -1.7026888132095337, + "loss": 1.1096, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.6009029150009155, + "rewards/margins": 0.10178569704294205, + "rewards/rejected": -1.7026888132095337, + "semantic_entropy": 0.7001045346260071, + "step": 370 + }, + { + "epoch": 0.2007024586051179, + "grad_norm": 11.699656608660698, + "learning_rate": 6.684491978609626e-07, + "logits/chosen": -0.07194206863641739, + "logits/rejected": 0.07688136398792267, + "logps/chosen": -1.6611896753311157, + "logps/rejected": -1.9261270761489868, + "loss": 1.0219, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.6611896753311157, + "rewards/margins": 0.2649373412132263, + "rewards/rejected": -1.9261270761489868, + "semantic_entropy": 0.6614036560058594, + "step": 375 + }, + { + "epoch": 0.20337849138651948, + "grad_norm": 8.432293708205574, + "learning_rate": 6.77361853832442e-07, + "logits/chosen": -0.03905266523361206, + "logits/rejected": 0.04177533835172653, + "logps/chosen": -1.8525673151016235, + "logps/rejected": -2.073270082473755, + "loss": 1.0041, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.8525673151016235, + "rewards/margins": 0.22070245444774628, + "rewards/rejected": -2.073270082473755, + "semantic_entropy": 0.6032805442810059, + "step": 380 + }, + { + "epoch": 0.20605452416792105, + "grad_norm": 6.558363447038307, + "learning_rate": 6.862745098039216e-07, + "logits/chosen": -0.004520825110375881, + "logits/rejected": 0.06874732673168182, + "logps/chosen": -1.8738794326782227, + "logps/rejected": -1.971431016921997, + "loss": 1.0507, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.8738794326782227, + "rewards/margins": 0.09755153954029083, + "rewards/rejected": -1.971431016921997, + "semantic_entropy": 0.5936424732208252, + "step": 385 + }, + { + "epoch": 0.20873055694932263, + "grad_norm": 11.9814690281164, + "learning_rate": 6.95187165775401e-07, + "logits/chosen": 0.056425292044878006, + "logits/rejected": 0.21399247646331787, + "logps/chosen": -1.9227275848388672, + "logps/rejected": -2.138357400894165, + "loss": 1.014, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.9227275848388672, + "rewards/margins": 0.21562990546226501, + "rewards/rejected": -2.138357400894165, + "semantic_entropy": 0.5690654516220093, + "step": 390 + }, + { + "epoch": 0.2114065897307242, + "grad_norm": 11.659848810770669, + "learning_rate": 7.040998217468806e-07, + "logits/chosen": -0.04911988228559494, + "logits/rejected": 0.10953982919454575, + "logps/chosen": -1.8240041732788086, + "logps/rejected": -1.974021553993225, + "loss": 1.0404, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.8240041732788086, + "rewards/margins": 0.1500171720981598, + "rewards/rejected": -1.974021553993225, + "semantic_entropy": 0.608680009841919, + "step": 395 + }, + { + "epoch": 0.2140826225121258, + "grad_norm": 17.654443437190544, + "learning_rate": 7.1301247771836e-07, + "logits/chosen": 0.07092205435037613, + "logits/rejected": 0.15978960692882538, + "logps/chosen": -1.84757399559021, + "logps/rejected": -2.0060219764709473, + "loss": 1.0119, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.84757399559021, + "rewards/margins": 0.158447727560997, + "rewards/rejected": -2.0060219764709473, + "semantic_entropy": 0.5995103120803833, + "step": 400 + }, + { + "epoch": 0.2140826225121258, + "eval_logits/chosen": 0.27376288175582886, + "eval_logits/rejected": 0.35996997356414795, + "eval_logps/chosen": -1.7929844856262207, + "eval_logps/rejected": -2.0279812812805176, + "eval_loss": 1.013211965560913, + "eval_rewards/accuracies": 0.5660237669944763, + "eval_rewards/chosen": -1.7929844856262207, + "eval_rewards/margins": 0.23499667644500732, + "eval_rewards/rejected": -2.0279812812805176, + "eval_runtime": 35.6332, + "eval_samples_per_second": 37.746, + "eval_semantic_entropy": 0.6131907105445862, + "eval_steps_per_second": 9.457, + "step": 400 + }, + { + "epoch": 0.21675865529352734, + "grad_norm": 10.075168755240373, + "learning_rate": 7.219251336898395e-07, + "logits/chosen": -0.0024316341150552034, + "logits/rejected": 0.0858984887599945, + "logps/chosen": -1.8473918437957764, + "logps/rejected": -2.068129062652588, + "loss": 1.0621, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.8473918437957764, + "rewards/margins": 0.22073736786842346, + "rewards/rejected": -2.068129062652588, + "semantic_entropy": 0.6077993512153625, + "step": 405 + }, + { + "epoch": 0.2194346880749289, + "grad_norm": 18.950538852606062, + "learning_rate": 7.30837789661319e-07, + "logits/chosen": 0.04571036621928215, + "logits/rejected": 0.16462978720664978, + "logps/chosen": -1.7759612798690796, + "logps/rejected": -1.9969593286514282, + "loss": 1.012, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.7759612798690796, + "rewards/margins": 0.22099807858467102, + "rewards/rejected": -1.9969593286514282, + "semantic_entropy": 0.6101277470588684, + "step": 410 + }, + { + "epoch": 0.2221107208563305, + "grad_norm": 12.639518148578636, + "learning_rate": 7.397504456327985e-07, + "logits/chosen": 0.04078054428100586, + "logits/rejected": 0.08592768013477325, + "logps/chosen": -1.8478095531463623, + "logps/rejected": -2.025573492050171, + "loss": 1.0192, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.8478095531463623, + "rewards/margins": 0.17776378989219666, + "rewards/rejected": -2.025573492050171, + "semantic_entropy": 0.5999530553817749, + "step": 415 + }, + { + "epoch": 0.22478675363773207, + "grad_norm": 10.190575297401363, + "learning_rate": 7.486631016042781e-07, + "logits/chosen": 0.0002573668898548931, + "logits/rejected": 0.18812724947929382, + "logps/chosen": -1.6988885402679443, + "logps/rejected": -1.8715341091156006, + "loss": 1.0463, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.6988885402679443, + "rewards/margins": 0.17264559864997864, + "rewards/rejected": -1.8715341091156006, + "semantic_entropy": 0.6426397562026978, + "step": 420 + }, + { + "epoch": 0.22746278641913364, + "grad_norm": 11.953799492318076, + "learning_rate": 7.575757575757575e-07, + "logits/chosen": 0.0010619193781167269, + "logits/rejected": 0.19794592261314392, + "logps/chosen": -1.7909456491470337, + "logps/rejected": -2.094650983810425, + "loss": 0.9602, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.7909456491470337, + "rewards/margins": 0.30370545387268066, + "rewards/rejected": -2.094650983810425, + "semantic_entropy": 0.6076642274856567, + "step": 425 + }, + { + "epoch": 0.2301388192005352, + "grad_norm": 10.263106298016128, + "learning_rate": 7.664884135472371e-07, + "logits/chosen": -0.02113468013703823, + "logits/rejected": 0.18052729964256287, + "logps/chosen": -1.803195595741272, + "logps/rejected": -2.2266640663146973, + "loss": 0.9559, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.803195595741272, + "rewards/margins": 0.4234686493873596, + "rewards/rejected": -2.2266640663146973, + "semantic_entropy": 0.6058921813964844, + "step": 430 + }, + { + "epoch": 0.23281485198193677, + "grad_norm": 23.770582060225383, + "learning_rate": 7.754010695187165e-07, + "logits/chosen": 0.07859322428703308, + "logits/rejected": 0.17100855708122253, + "logps/chosen": -1.8036648035049438, + "logps/rejected": -1.972700834274292, + "loss": 1.0131, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.8036648035049438, + "rewards/margins": 0.1690361052751541, + "rewards/rejected": -1.972700834274292, + "semantic_entropy": 0.6105883717536926, + "step": 435 + }, + { + "epoch": 0.23549088476333835, + "grad_norm": 8.942531726492792, + "learning_rate": 7.84313725490196e-07, + "logits/chosen": 0.027078593149781227, + "logits/rejected": 0.11831989139318466, + "logps/chosen": -1.8217464685440063, + "logps/rejected": -2.0932116508483887, + "loss": 0.9838, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.8217464685440063, + "rewards/margins": 0.2714650630950928, + "rewards/rejected": -2.0932116508483887, + "semantic_entropy": 0.5999422669410706, + "step": 440 + }, + { + "epoch": 0.23816691754473993, + "grad_norm": 12.264776032319165, + "learning_rate": 7.932263814616755e-07, + "logits/chosen": 0.01442508865147829, + "logits/rejected": 0.12444069236516953, + "logps/chosen": -1.918039083480835, + "logps/rejected": -2.2938976287841797, + "loss": 0.9688, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.918039083480835, + "rewards/margins": 0.37585827708244324, + "rewards/rejected": -2.2938976287841797, + "semantic_entropy": 0.5600379705429077, + "step": 445 + }, + { + "epoch": 0.2408429503261415, + "grad_norm": 20.147606211927062, + "learning_rate": 8.02139037433155e-07, + "logits/chosen": 0.08424471318721771, + "logits/rejected": 0.2129439115524292, + "logps/chosen": -1.9648675918579102, + "logps/rejected": -2.2125535011291504, + "loss": 0.9685, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.9648675918579102, + "rewards/margins": 0.24768579006195068, + "rewards/rejected": -2.2125535011291504, + "semantic_entropy": 0.555503249168396, + "step": 450 + }, + { + "epoch": 0.24351898310754308, + "grad_norm": 10.39177189011024, + "learning_rate": 8.110516934046346e-07, + "logits/chosen": 0.1032770648598671, + "logits/rejected": 0.18853013217449188, + "logps/chosen": -1.8158471584320068, + "logps/rejected": -2.1788058280944824, + "loss": 0.9408, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.8158471584320068, + "rewards/margins": 0.3629588484764099, + "rewards/rejected": -2.1788058280944824, + "semantic_entropy": 0.5897886157035828, + "step": 455 + }, + { + "epoch": 0.24619501588894463, + "grad_norm": 9.700606050871391, + "learning_rate": 8.19964349376114e-07, + "logits/chosen": -0.019435208290815353, + "logits/rejected": 0.10268989950418472, + "logps/chosen": -1.9720776081085205, + "logps/rejected": -2.17942476272583, + "loss": 0.9789, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.9720776081085205, + "rewards/margins": 0.2073473036289215, + "rewards/rejected": -2.17942476272583, + "semantic_entropy": 0.5492539405822754, + "step": 460 + }, + { + "epoch": 0.2488710486703462, + "grad_norm": 13.542184802712866, + "learning_rate": 8.288770053475936e-07, + "logits/chosen": 0.25124862790107727, + "logits/rejected": 0.2735592722892761, + "logps/chosen": -2.041325807571411, + "logps/rejected": -2.2813258171081543, + "loss": 0.9751, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.041325807571411, + "rewards/margins": 0.24000012874603271, + "rewards/rejected": -2.2813258171081543, + "semantic_entropy": 0.5254560112953186, + "step": 465 + }, + { + "epoch": 0.2515470814517478, + "grad_norm": 9.43369236776717, + "learning_rate": 8.37789661319073e-07, + "logits/chosen": 0.28713709115982056, + "logits/rejected": 0.24178346991539001, + "logps/chosen": -2.011660575866699, + "logps/rejected": -2.2471611499786377, + "loss": 0.9972, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.011660575866699, + "rewards/margins": 0.2355005294084549, + "rewards/rejected": -2.2471611499786377, + "semantic_entropy": 0.5238825678825378, + "step": 470 + }, + { + "epoch": 0.25422311423314936, + "grad_norm": 8.440890172534203, + "learning_rate": 8.467023172905525e-07, + "logits/chosen": 0.06835935264825821, + "logits/rejected": 0.2116996943950653, + "logps/chosen": -1.9710376262664795, + "logps/rejected": -2.4928886890411377, + "loss": 0.8891, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.9710376262664795, + "rewards/margins": 0.5218510031700134, + "rewards/rejected": -2.4928886890411377, + "semantic_entropy": 0.5133862495422363, + "step": 475 + }, + { + "epoch": 0.2568991470145509, + "grad_norm": 13.271352574157751, + "learning_rate": 8.55614973262032e-07, + "logits/chosen": 0.11900024116039276, + "logits/rejected": 0.3160308599472046, + "logps/chosen": -2.046262502670288, + "logps/rejected": -2.303492307662964, + "loss": 0.9389, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.046262502670288, + "rewards/margins": 0.25722989439964294, + "rewards/rejected": -2.303492307662964, + "semantic_entropy": 0.5193344354629517, + "step": 480 + }, + { + "epoch": 0.2595751797959525, + "grad_norm": 17.40327979933669, + "learning_rate": 8.645276292335115e-07, + "logits/chosen": 0.11675455421209335, + "logits/rejected": 0.16688722372055054, + "logps/chosen": -2.2786717414855957, + "logps/rejected": -2.450490951538086, + "loss": 0.9805, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.2786717414855957, + "rewards/margins": 0.17181938886642456, + "rewards/rejected": -2.450490951538086, + "semantic_entropy": 0.45640721917152405, + "step": 485 + }, + { + "epoch": 0.26225121257735406, + "grad_norm": 10.5637127371766, + "learning_rate": 8.734402852049911e-07, + "logits/chosen": 0.14526286721229553, + "logits/rejected": 0.21295936405658722, + "logps/chosen": -2.2720694541931152, + "logps/rejected": -2.437973737716675, + "loss": 0.9629, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.2720694541931152, + "rewards/margins": 0.16590480506420135, + "rewards/rejected": -2.437973737716675, + "semantic_entropy": 0.4487149715423584, + "step": 490 + }, + { + "epoch": 0.26492724535875567, + "grad_norm": 13.968933074481987, + "learning_rate": 8.823529411764705e-07, + "logits/chosen": 0.10887887328863144, + "logits/rejected": 0.1369287371635437, + "logps/chosen": -2.321854829788208, + "logps/rejected": -2.471646785736084, + "loss": 0.9461, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -2.321854829788208, + "rewards/margins": 0.14979204535484314, + "rewards/rejected": -2.471646785736084, + "semantic_entropy": 0.42514246702194214, + "step": 495 + }, + { + "epoch": 0.2676032781401572, + "grad_norm": 15.91869715362306, + "learning_rate": 8.912655971479501e-07, + "logits/chosen": 0.11035114526748657, + "logits/rejected": 0.19290763139724731, + "logps/chosen": -2.324972152709961, + "logps/rejected": -2.60076642036438, + "loss": 0.8935, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.324972152709961, + "rewards/margins": 0.2757939100265503, + "rewards/rejected": -2.60076642036438, + "semantic_entropy": 0.4203091263771057, + "step": 500 + }, + { + "epoch": 0.27027931092155877, + "grad_norm": 17.82851785747078, + "learning_rate": 9.001782531194295e-07, + "logits/chosen": 0.10254337638616562, + "logits/rejected": 0.23869287967681885, + "logps/chosen": -2.434830904006958, + "logps/rejected": -2.6683614253997803, + "loss": 0.88, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.434830904006958, + "rewards/margins": 0.23353052139282227, + "rewards/rejected": -2.6683614253997803, + "semantic_entropy": 0.38676854968070984, + "step": 505 + }, + { + "epoch": 0.2729553437029604, + "grad_norm": 9.222919526301315, + "learning_rate": 9.09090909090909e-07, + "logits/chosen": 0.2949961721897125, + "logits/rejected": 0.34191757440567017, + "logps/chosen": -2.5707197189331055, + "logps/rejected": -2.9306082725524902, + "loss": 0.8353, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.5707197189331055, + "rewards/margins": 0.35988861322402954, + "rewards/rejected": -2.9306082725524902, + "semantic_entropy": 0.3533479571342468, + "step": 510 + }, + { + "epoch": 0.2756313764843619, + "grad_norm": 13.125321055251398, + "learning_rate": 9.180035650623885e-07, + "logits/chosen": 0.26439735293388367, + "logits/rejected": 0.3643534779548645, + "logps/chosen": -2.6091151237487793, + "logps/rejected": -2.9702847003936768, + "loss": 0.8275, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.6091151237487793, + "rewards/margins": 0.3611697256565094, + "rewards/rejected": -2.9702847003936768, + "semantic_entropy": 0.3468519449234009, + "step": 515 + }, + { + "epoch": 0.27830740926576353, + "grad_norm": 13.837098088359745, + "learning_rate": 9.26916221033868e-07, + "logits/chosen": 0.1988576203584671, + "logits/rejected": 0.3169275224208832, + "logps/chosen": -2.997946262359619, + "logps/rejected": -3.395313262939453, + "loss": 0.824, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.997946262359619, + "rewards/margins": 0.39736661314964294, + "rewards/rejected": -3.395313262939453, + "semantic_entropy": 0.2663891315460205, + "step": 520 + }, + { + "epoch": 0.2809834420471651, + "grad_norm": 23.212683763316665, + "learning_rate": 9.358288770053476e-07, + "logits/chosen": 0.362657368183136, + "logits/rejected": 0.431486040353775, + "logps/chosen": -3.46795654296875, + "logps/rejected": -3.9383537769317627, + "loss": 0.7851, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.46795654296875, + "rewards/margins": 0.4703969359397888, + "rewards/rejected": -3.9383537769317627, + "semantic_entropy": 0.19957469403743744, + "step": 525 + }, + { + "epoch": 0.2836594748285666, + "grad_norm": 19.427149448565736, + "learning_rate": 9.44741532976827e-07, + "logits/chosen": 0.3255121111869812, + "logits/rejected": 0.39444050192832947, + "logps/chosen": -3.581265926361084, + "logps/rejected": -4.079986095428467, + "loss": 0.7894, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.581265926361084, + "rewards/margins": 0.4987207055091858, + "rewards/rejected": -4.079986095428467, + "semantic_entropy": 0.19464334845542908, + "step": 530 + }, + { + "epoch": 0.28633550760996823, + "grad_norm": 17.35259546926179, + "learning_rate": 9.536541889483066e-07, + "logits/chosen": 0.20121872425079346, + "logits/rejected": 0.4174925684928894, + "logps/chosen": -3.966810941696167, + "logps/rejected": -4.486771583557129, + "loss": 0.721, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -3.966810941696167, + "rewards/margins": 0.5199612379074097, + "rewards/rejected": -4.486771583557129, + "semantic_entropy": 0.14531609416007996, + "step": 535 + }, + { + "epoch": 0.2890115403913698, + "grad_norm": 26.596890402490573, + "learning_rate": 9.62566844919786e-07, + "logits/chosen": 0.2888971269130707, + "logits/rejected": 0.3459900915622711, + "logps/chosen": -4.495975971221924, + "logps/rejected": -4.9110822677612305, + "loss": 0.7492, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.495975971221924, + "rewards/margins": 0.4151054322719574, + "rewards/rejected": -4.9110822677612305, + "semantic_entropy": 0.09809108078479767, + "step": 540 + }, + { + "epoch": 0.2916875731727714, + "grad_norm": 18.02680952570327, + "learning_rate": 9.714795008912655e-07, + "logits/chosen": 0.20895743370056152, + "logits/rejected": 0.3623776435852051, + "logps/chosen": -4.741501808166504, + "logps/rejected": -5.503144264221191, + "loss": 0.6348, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.741501808166504, + "rewards/margins": 0.7616419196128845, + "rewards/rejected": -5.503144264221191, + "semantic_entropy": 0.08251913636922836, + "step": 545 + }, + { + "epoch": 0.29436360595417294, + "grad_norm": 20.727150313169535, + "learning_rate": 9.80392156862745e-07, + "logits/chosen": 0.30375415086746216, + "logits/rejected": 0.3442818522453308, + "logps/chosen": -5.456840991973877, + "logps/rejected": -6.024032115936279, + "loss": 0.7065, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -5.456840991973877, + "rewards/margins": 0.5671912431716919, + "rewards/rejected": -6.024032115936279, + "semantic_entropy": 0.06045646220445633, + "step": 550 + }, + { + "epoch": 0.2970396387355745, + "grad_norm": 27.134124904666713, + "learning_rate": 9.893048128342244e-07, + "logits/chosen": 0.28694844245910645, + "logits/rejected": 0.3676696717739105, + "logps/chosen": -5.522095680236816, + "logps/rejected": -5.709345817565918, + "loss": 0.8281, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -5.522095680236816, + "rewards/margins": 0.18725113570690155, + "rewards/rejected": -5.709345817565918, + "semantic_entropy": 0.05345703288912773, + "step": 555 + }, + { + "epoch": 0.2997156715169761, + "grad_norm": 31.40474528083335, + "learning_rate": 9.98217468805704e-07, + "logits/chosen": 0.3329199552536011, + "logits/rejected": 0.3428536355495453, + "logps/chosen": -4.907380104064941, + "logps/rejected": -5.42898416519165, + "loss": 0.684, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -4.907380104064941, + "rewards/margins": 0.5216037034988403, + "rewards/rejected": -5.42898416519165, + "semantic_entropy": 0.08699695765972137, + "step": 560 + }, + { + "epoch": 0.30239170429837764, + "grad_norm": 15.731040510232038, + "learning_rate": 9.999984476788462e-07, + "logits/chosen": 0.32874903082847595, + "logits/rejected": 0.37473997473716736, + "logps/chosen": -4.960855960845947, + "logps/rejected": -5.458104610443115, + "loss": 0.6559, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.960855960845947, + "rewards/margins": 0.49724894762039185, + "rewards/rejected": -5.458104610443115, + "semantic_entropy": 0.07233523577451706, + "step": 565 + }, + { + "epoch": 0.30506773707977924, + "grad_norm": 26.591918700910675, + "learning_rate": 9.999921413906797e-07, + "logits/chosen": 0.2645338177680969, + "logits/rejected": 0.4236833453178406, + "logps/chosen": -4.993116855621338, + "logps/rejected": -5.463040828704834, + "loss": 0.6672, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.993116855621338, + "rewards/margins": 0.4699248671531677, + "rewards/rejected": -5.463040828704834, + "semantic_entropy": 0.06868621706962585, + "step": 570 + }, + { + "epoch": 0.3077437698611808, + "grad_norm": 17.123203800580065, + "learning_rate": 9.999809841765644e-07, + "logits/chosen": 0.26640480756759644, + "logits/rejected": 0.3024447560310364, + "logps/chosen": -4.769078731536865, + "logps/rejected": -5.275615692138672, + "loss": 0.6851, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.769078731536865, + "rewards/margins": 0.5065367817878723, + "rewards/rejected": -5.275615692138672, + "semantic_entropy": 0.07678450644016266, + "step": 575 + }, + { + "epoch": 0.3104198026425824, + "grad_norm": 16.862219340555285, + "learning_rate": 9.999649761447477e-07, + "logits/chosen": 0.2403547316789627, + "logits/rejected": 0.35913315415382385, + "logps/chosen": -4.844521999359131, + "logps/rejected": -5.310414791107178, + "loss": 0.6817, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.844521999359131, + "rewards/margins": 0.4658929407596588, + "rewards/rejected": -5.310414791107178, + "semantic_entropy": 0.07107989490032196, + "step": 580 + }, + { + "epoch": 0.31309583542398395, + "grad_norm": 20.48877444397547, + "learning_rate": 9.999441174505398e-07, + "logits/chosen": 0.2463439702987671, + "logits/rejected": 0.3041590750217438, + "logps/chosen": -5.344332218170166, + "logps/rejected": -5.5950727462768555, + "loss": 0.7775, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -5.344332218170166, + "rewards/margins": 0.250741183757782, + "rewards/rejected": -5.5950727462768555, + "semantic_entropy": 0.05236155912280083, + "step": 585 + }, + { + "epoch": 0.3157718682053855, + "grad_norm": 33.45307601229143, + "learning_rate": 9.999184082963116e-07, + "logits/chosen": 0.2851886749267578, + "logits/rejected": 0.37152618169784546, + "logps/chosen": -5.112926006317139, + "logps/rejected": -5.441379547119141, + "loss": 0.7305, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -5.112926006317139, + "rewards/margins": 0.3284529149532318, + "rewards/rejected": -5.441379547119141, + "semantic_entropy": 0.05966230109333992, + "step": 590 + }, + { + "epoch": 0.3184479009867871, + "grad_norm": 28.772414597800683, + "learning_rate": 9.998878489314937e-07, + "logits/chosen": 0.3128214478492737, + "logits/rejected": 0.4057738184928894, + "logps/chosen": -5.047120094299316, + "logps/rejected": -5.475900173187256, + "loss": 0.6587, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -5.047120094299316, + "rewards/margins": 0.4287797510623932, + "rewards/rejected": -5.475900173187256, + "semantic_entropy": 0.056463856250047684, + "step": 595 + }, + { + "epoch": 0.32112393376818865, + "grad_norm": 21.595817825496415, + "learning_rate": 9.99852439652573e-07, + "logits/chosen": 0.27063342928886414, + "logits/rejected": 0.3820621371269226, + "logps/chosen": -5.453131198883057, + "logps/rejected": -5.779126167297363, + "loss": 0.6993, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -5.453131198883057, + "rewards/margins": 0.32599514722824097, + "rewards/rejected": -5.779126167297363, + "semantic_entropy": 0.04167807847261429, + "step": 600 + }, + { + "epoch": 0.32379996654959026, + "grad_norm": 26.050724529880945, + "learning_rate": 9.998121808030904e-07, + "logits/chosen": 0.24278345704078674, + "logits/rejected": 0.3093962073326111, + "logps/chosen": -5.513918876647949, + "logps/rejected": -5.7660112380981445, + "loss": 0.8, + "rewards/accuracies": 0.59375, + "rewards/chosen": -5.513918876647949, + "rewards/margins": 0.25209134817123413, + "rewards/rejected": -5.7660112380981445, + "semantic_entropy": 0.04349964112043381, + "step": 605 + }, + { + "epoch": 0.3264759993309918, + "grad_norm": 45.051043777132236, + "learning_rate": 9.997670727736379e-07, + "logits/chosen": 0.31496500968933105, + "logits/rejected": 0.47544389963150024, + "logps/chosen": -5.2426862716674805, + "logps/rejected": -5.6988115310668945, + "loss": 0.6803, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -5.2426862716674805, + "rewards/margins": 0.4561251699924469, + "rewards/rejected": -5.6988115310668945, + "semantic_entropy": 0.05658901482820511, + "step": 610 + }, + { + "epoch": 0.32915203211239336, + "grad_norm": 23.856980522226188, + "learning_rate": 9.99717116001853e-07, + "logits/chosen": 0.32243964076042175, + "logits/rejected": 0.39280059933662415, + "logps/chosen": -5.77672815322876, + "logps/rejected": -6.442985534667969, + "loss": 0.5973, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -5.77672815322876, + "rewards/margins": 0.6662576794624329, + "rewards/rejected": -6.442985534667969, + "semantic_entropy": 0.03704090788960457, + "step": 615 + }, + { + "epoch": 0.33182806489379496, + "grad_norm": 15.346139192682116, + "learning_rate": 9.996623109724173e-07, + "logits/chosen": 0.4094429016113281, + "logits/rejected": 0.44762665033340454, + "logps/chosen": -6.212830543518066, + "logps/rejected": -6.7759881019592285, + "loss": 0.6226, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -6.212830543518066, + "rewards/margins": 0.5631579160690308, + "rewards/rejected": -6.7759881019592285, + "semantic_entropy": 0.03154679387807846, + "step": 620 + }, + { + "epoch": 0.3345040976751965, + "grad_norm": 18.787286491671612, + "learning_rate": 9.996026582170488e-07, + "logits/chosen": 0.41264209151268005, + "logits/rejected": 0.5199421048164368, + "logps/chosen": -6.266766548156738, + "logps/rejected": -6.7597222328186035, + "loss": 0.6553, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -6.266766548156738, + "rewards/margins": 0.49295586347579956, + "rewards/rejected": -6.7597222328186035, + "semantic_entropy": 0.03314858675003052, + "step": 625 + }, + { + "epoch": 0.3371801304565981, + "grad_norm": 23.697424686130283, + "learning_rate": 9.995381583144996e-07, + "logits/chosen": 0.3286336362361908, + "logits/rejected": 0.416952908039093, + "logps/chosen": -6.489420413970947, + "logps/rejected": -7.091695308685303, + "loss": 0.6025, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -6.489420413970947, + "rewards/margins": 0.6022747755050659, + "rewards/rejected": -7.091695308685303, + "semantic_entropy": 0.019754167646169662, + "step": 630 + }, + { + "epoch": 0.33985616323799966, + "grad_norm": 23.346401547821337, + "learning_rate": 9.994688118905471e-07, + "logits/chosen": 0.39607900381088257, + "logits/rejected": 0.5640990138053894, + "logps/chosen": -6.5967607498168945, + "logps/rejected": -7.059272766113281, + "loss": 0.674, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -6.5967607498168945, + "rewards/margins": 0.4625115990638733, + "rewards/rejected": -7.059272766113281, + "semantic_entropy": 0.023461516946554184, + "step": 635 + }, + { + "epoch": 0.3425321960194012, + "grad_norm": 28.41310343878819, + "learning_rate": 9.993946196179912e-07, + "logits/chosen": 0.37280526757240295, + "logits/rejected": 0.5077700018882751, + "logps/chosen": -6.618893623352051, + "logps/rejected": -6.993128776550293, + "loss": 0.6847, + "rewards/accuracies": 0.625, + "rewards/chosen": -6.618893623352051, + "rewards/margins": 0.3742350935935974, + "rewards/rejected": -6.993128776550293, + "semantic_entropy": 0.016319947317242622, + "step": 640 + }, + { + "epoch": 0.3452082288008028, + "grad_norm": 17.075034812283786, + "learning_rate": 9.993155822166455e-07, + "logits/chosen": 0.4683307707309723, + "logits/rejected": 0.5001148581504822, + "logps/chosen": -6.166420936584473, + "logps/rejected": -6.613181114196777, + "loss": 0.6588, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -6.166420936584473, + "rewards/margins": 0.4467601776123047, + "rewards/rejected": -6.613181114196777, + "semantic_entropy": 0.023232873529195786, + "step": 645 + }, + { + "epoch": 0.34788426158220437, + "grad_norm": 22.115212411816152, + "learning_rate": 9.992317004533313e-07, + "logits/chosen": 0.49777716398239136, + "logits/rejected": 0.5633991360664368, + "logps/chosen": -6.14475154876709, + "logps/rejected": -6.610726833343506, + "loss": 0.6416, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -6.14475154876709, + "rewards/margins": 0.4659750461578369, + "rewards/rejected": -6.610726833343506, + "semantic_entropy": 0.023257287219166756, + "step": 650 + }, + { + "epoch": 0.350560294363606, + "grad_norm": 20.090310906010423, + "learning_rate": 9.991429751418696e-07, + "logits/chosen": 0.4952174127101898, + "logits/rejected": 0.5142993927001953, + "logps/chosen": -5.922252655029297, + "logps/rejected": -6.414206504821777, + "loss": 0.6695, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -5.922252655029297, + "rewards/margins": 0.49195390939712524, + "rewards/rejected": -6.414206504821777, + "semantic_entropy": 0.027981286868453026, + "step": 655 + }, + { + "epoch": 0.3532363271450075, + "grad_norm": 19.79614358859752, + "learning_rate": 9.99049407143074e-07, + "logits/chosen": 0.5208943486213684, + "logits/rejected": 0.5800861120223999, + "logps/chosen": -6.255806922912598, + "logps/rejected": -6.602316856384277, + "loss": 0.697, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -6.255806922912598, + "rewards/margins": 0.34651073813438416, + "rewards/rejected": -6.602316856384277, + "semantic_entropy": 0.02483288012444973, + "step": 660 + }, + { + "epoch": 0.35591235992640907, + "grad_norm": 19.579221986465477, + "learning_rate": 9.989509973647416e-07, + "logits/chosen": 0.5133857727050781, + "logits/rejected": 0.58699631690979, + "logps/chosen": -6.382502555847168, + "logps/rejected": -6.7586350440979, + "loss": 0.6752, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -6.382502555847168, + "rewards/margins": 0.3761317729949951, + "rewards/rejected": -6.7586350440979, + "semantic_entropy": 0.019113317131996155, + "step": 665 + }, + { + "epoch": 0.3585883927078107, + "grad_norm": 19.24827550379443, + "learning_rate": 9.988477467616445e-07, + "logits/chosen": 0.5319421291351318, + "logits/rejected": 0.6193274259567261, + "logps/chosen": -6.479439735412598, + "logps/rejected": -6.937412261962891, + "loss": 0.5944, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -6.479439735412598, + "rewards/margins": 0.4579733908176422, + "rewards/rejected": -6.937412261962891, + "semantic_entropy": 0.016540968790650368, + "step": 670 + }, + { + "epoch": 0.3612644254892122, + "grad_norm": 20.36938516339038, + "learning_rate": 9.987396563355205e-07, + "logits/chosen": 0.5054916143417358, + "logits/rejected": 0.5468543171882629, + "logps/chosen": -6.396731376647949, + "logps/rejected": -6.9033098220825195, + "loss": 0.5913, + "rewards/accuracies": 0.71875, + "rewards/chosen": -6.396731376647949, + "rewards/margins": 0.5065786242485046, + "rewards/rejected": -6.9033098220825195, + "semantic_entropy": 0.019933702424168587, + "step": 675 + }, + { + "epoch": 0.36394045827061383, + "grad_norm": 25.694663813660224, + "learning_rate": 9.986267271350631e-07, + "logits/chosen": 0.4442412257194519, + "logits/rejected": 0.5351762771606445, + "logps/chosen": -6.232950687408447, + "logps/rejected": -6.665299892425537, + "loss": 0.6833, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -6.232950687408447, + "rewards/margins": 0.4323497414588928, + "rewards/rejected": -6.665299892425537, + "semantic_entropy": 0.02159653976559639, + "step": 680 + }, + { + "epoch": 0.3666164910520154, + "grad_norm": 26.644324005748285, + "learning_rate": 9.985089602559123e-07, + "logits/chosen": 0.49901971220970154, + "logits/rejected": 0.5869329571723938, + "logps/chosen": -6.33712911605835, + "logps/rejected": -6.925673007965088, + "loss": 0.5808, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -6.33712911605835, + "rewards/margins": 0.588544487953186, + "rewards/rejected": -6.925673007965088, + "semantic_entropy": 0.01862289011478424, + "step": 685 + }, + { + "epoch": 0.369292523833417, + "grad_norm": 20.404114370621034, + "learning_rate": 9.983863568406428e-07, + "logits/chosen": 0.5496017336845398, + "logits/rejected": 0.5704872012138367, + "logps/chosen": -6.48660135269165, + "logps/rejected": -6.933077812194824, + "loss": 0.6603, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -6.48660135269165, + "rewards/margins": 0.44647669792175293, + "rewards/rejected": -6.933077812194824, + "semantic_entropy": 0.017888184636831284, + "step": 690 + }, + { + "epoch": 0.37196855661481854, + "grad_norm": 20.958217633474742, + "learning_rate": 9.982589180787532e-07, + "logits/chosen": 0.4869377613067627, + "logits/rejected": 0.5502743721008301, + "logps/chosen": -6.533112525939941, + "logps/rejected": -6.993691921234131, + "loss": 0.6216, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -6.533112525939941, + "rewards/margins": 0.4605790674686432, + "rewards/rejected": -6.993691921234131, + "semantic_entropy": 0.01869816519320011, + "step": 695 + }, + { + "epoch": 0.3746445893962201, + "grad_norm": 28.100366278994386, + "learning_rate": 9.981266452066553e-07, + "logits/chosen": 0.38128662109375, + "logits/rejected": 0.45251068472862244, + "logps/chosen": -6.801139831542969, + "logps/rejected": -7.043248176574707, + "loss": 0.6849, + "rewards/accuracies": 0.59375, + "rewards/chosen": -6.801139831542969, + "rewards/margins": 0.2421083003282547, + "rewards/rejected": -7.043248176574707, + "semantic_entropy": 0.012132355943322182, + "step": 700 + }, + { + "epoch": 0.3773206221776217, + "grad_norm": 20.82013187037515, + "learning_rate": 9.979895395076608e-07, + "logits/chosen": 0.30956459045410156, + "logits/rejected": 0.44323819875717163, + "logps/chosen": -6.558831691741943, + "logps/rejected": -7.164409637451172, + "loss": 0.575, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -6.558831691741943, + "rewards/margins": 0.6055777668952942, + "rewards/rejected": -7.164409637451172, + "semantic_entropy": 0.01754785142838955, + "step": 705 + }, + { + "epoch": 0.37999665495902324, + "grad_norm": 21.30045872363481, + "learning_rate": 9.9784760231197e-07, + "logits/chosen": 0.3447558283805847, + "logits/rejected": 0.4274185299873352, + "logps/chosen": -6.733677864074707, + "logps/rejected": -7.2757744789123535, + "loss": 0.6037, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -6.733677864074707, + "rewards/margins": 0.5420972108840942, + "rewards/rejected": -7.2757744789123535, + "semantic_entropy": 0.015751570463180542, + "step": 710 + }, + { + "epoch": 0.38267268774042484, + "grad_norm": 30.288720792247716, + "learning_rate": 9.97700834996658e-07, + "logits/chosen": 0.3281204402446747, + "logits/rejected": 0.449833482503891, + "logps/chosen": -7.066946983337402, + "logps/rejected": -7.522922515869141, + "loss": 0.6432, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -7.066946983337402, + "rewards/margins": 0.4559754431247711, + "rewards/rejected": -7.522922515869141, + "semantic_entropy": 0.010822773911058903, + "step": 715 + }, + { + "epoch": 0.3853487205218264, + "grad_norm": 21.97533323049887, + "learning_rate": 9.97549238985662e-07, + "logits/chosen": 0.4738125205039978, + "logits/rejected": 0.6201906204223633, + "logps/chosen": -6.687346458435059, + "logps/rejected": -7.375749111175537, + "loss": 0.5906, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -6.687346458435059, + "rewards/margins": 0.6884029507637024, + "rewards/rejected": -7.375749111175537, + "semantic_entropy": 0.01416093111038208, + "step": 720 + }, + { + "epoch": 0.38802475330322794, + "grad_norm": 16.154939819122443, + "learning_rate": 9.973928157497674e-07, + "logits/chosen": 0.397294819355011, + "logits/rejected": 0.4953377842903137, + "logps/chosen": -6.555293083190918, + "logps/rejected": -7.133930206298828, + "loss": 0.5751, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -6.555293083190918, + "rewards/margins": 0.5786372423171997, + "rewards/rejected": -7.133930206298828, + "semantic_entropy": 0.016166144981980324, + "step": 725 + }, + { + "epoch": 0.39070078608462955, + "grad_norm": 18.66427550708316, + "learning_rate": 9.972315668065927e-07, + "logits/chosen": 0.39967241883277893, + "logits/rejected": 0.4871141314506531, + "logps/chosen": -6.588616371154785, + "logps/rejected": -7.018074989318848, + "loss": 0.6501, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -6.588616371154785, + "rewards/margins": 0.4294595718383789, + "rewards/rejected": -7.018074989318848, + "semantic_entropy": 0.015043877065181732, + "step": 730 + }, + { + "epoch": 0.3933768188660311, + "grad_norm": 19.58743135959893, + "learning_rate": 9.97065493720576e-07, + "logits/chosen": 0.424797385931015, + "logits/rejected": 0.5106293559074402, + "logps/chosen": -6.42412805557251, + "logps/rejected": -6.823407173156738, + "loss": 0.661, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -6.42412805557251, + "rewards/margins": 0.3992784023284912, + "rewards/rejected": -6.823407173156738, + "semantic_entropy": 0.017190445214509964, + "step": 735 + }, + { + "epoch": 0.3960528516474327, + "grad_norm": 19.710530643393298, + "learning_rate": 9.968945981029594e-07, + "logits/chosen": 0.5481308102607727, + "logits/rejected": 0.6441117525100708, + "logps/chosen": -6.604589939117432, + "logps/rejected": -7.156881809234619, + "loss": 0.5894, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -6.604589939117432, + "rewards/margins": 0.5522912740707397, + "rewards/rejected": -7.156881809234619, + "semantic_entropy": 0.014214654453098774, + "step": 740 + }, + { + "epoch": 0.39872888442883425, + "grad_norm": 17.871273849433326, + "learning_rate": 9.967188816117726e-07, + "logits/chosen": 0.6293722987174988, + "logits/rejected": 0.6717933416366577, + "logps/chosen": -6.835976600646973, + "logps/rejected": -7.2494215965271, + "loss": 0.6631, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -6.835976600646973, + "rewards/margins": 0.4134441316127777, + "rewards/rejected": -7.2494215965271, + "semantic_entropy": 0.01153610274195671, + "step": 745 + }, + { + "epoch": 0.4014049172102358, + "grad_norm": 16.865148721700457, + "learning_rate": 9.965383459518179e-07, + "logits/chosen": 0.541202962398529, + "logits/rejected": 0.6529287695884705, + "logps/chosen": -6.675736904144287, + "logps/rejected": -7.145176887512207, + "loss": 0.6284, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -6.675736904144287, + "rewards/margins": 0.4694399833679199, + "rewards/rejected": -7.145176887512207, + "semantic_entropy": 0.013248731382191181, + "step": 750 + }, + { + "epoch": 0.4040809499916374, + "grad_norm": 23.498295974070185, + "learning_rate": 9.963529928746533e-07, + "logits/chosen": 0.5743650197982788, + "logits/rejected": 0.663760781288147, + "logps/chosen": -6.699901580810547, + "logps/rejected": -7.10396671295166, + "loss": 0.6749, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -6.699901580810547, + "rewards/margins": 0.4040653109550476, + "rewards/rejected": -7.10396671295166, + "semantic_entropy": 0.014644038863480091, + "step": 755 + }, + { + "epoch": 0.40675698277303896, + "grad_norm": 15.10343166970609, + "learning_rate": 9.961628241785746e-07, + "logits/chosen": 0.4620290696620941, + "logits/rejected": 0.5227762460708618, + "logps/chosen": -6.713381767272949, + "logps/rejected": -7.118601322174072, + "loss": 0.6779, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -6.713381767272949, + "rewards/margins": 0.4052188992500305, + "rewards/rejected": -7.118601322174072, + "semantic_entropy": 0.014408141374588013, + "step": 760 + }, + { + "epoch": 0.40943301555444056, + "grad_norm": 16.82158650124028, + "learning_rate": 9.959678417085998e-07, + "logits/chosen": 0.4263577461242676, + "logits/rejected": 0.4895492494106293, + "logps/chosen": -6.6601409912109375, + "logps/rejected": -7.117767333984375, + "loss": 0.6173, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -6.6601409912109375, + "rewards/margins": 0.45762643218040466, + "rewards/rejected": -7.117767333984375, + "semantic_entropy": 0.014361525885760784, + "step": 765 + }, + { + "epoch": 0.4121090483358421, + "grad_norm": 13.610647877557357, + "learning_rate": 9.957680473564493e-07, + "logits/chosen": 0.5320577621459961, + "logits/rejected": 0.6060940027236938, + "logps/chosen": -6.88693904876709, + "logps/rejected": -7.462642669677734, + "loss": 0.5833, + "rewards/accuracies": 0.6875, + "rewards/chosen": -6.88693904876709, + "rewards/margins": 0.5757043957710266, + "rewards/rejected": -7.462642669677734, + "semantic_entropy": 0.011815531179308891, + "step": 770 + }, + { + "epoch": 0.41478508111724366, + "grad_norm": 10.55729867477025, + "learning_rate": 9.95563443060529e-07, + "logits/chosen": 0.4426754415035248, + "logits/rejected": 0.527428150177002, + "logps/chosen": -6.9573655128479, + "logps/rejected": -7.31267786026001, + "loss": 0.6804, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -6.9573655128479, + "rewards/margins": 0.35531362891197205, + "rewards/rejected": -7.31267786026001, + "semantic_entropy": 0.011011673137545586, + "step": 775 + }, + { + "epoch": 0.41746111389864526, + "grad_norm": 20.225405830042792, + "learning_rate": 9.95354030805911e-07, + "logits/chosen": 0.38837724924087524, + "logits/rejected": 0.4707297384738922, + "logps/chosen": -6.923590660095215, + "logps/rejected": -7.26629638671875, + "loss": 0.6241, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -6.923590660095215, + "rewards/margins": 0.34270578622817993, + "rewards/rejected": -7.26629638671875, + "semantic_entropy": 0.010305705480277538, + "step": 780 + }, + { + "epoch": 0.4201371466800468, + "grad_norm": 24.361936444306224, + "learning_rate": 9.951398126243133e-07, + "logits/chosen": 0.49288463592529297, + "logits/rejected": 0.5485498309135437, + "logps/chosen": -6.930272102355957, + "logps/rejected": -7.437797546386719, + "loss": 0.6253, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -6.930272102355957, + "rewards/margins": 0.5075257420539856, + "rewards/rejected": -7.437797546386719, + "semantic_entropy": 0.0111698554828763, + "step": 785 + }, + { + "epoch": 0.4228131794614484, + "grad_norm": 17.05301230734048, + "learning_rate": 9.94920790594082e-07, + "logits/chosen": 0.3991442918777466, + "logits/rejected": 0.45411986112594604, + "logps/chosen": -6.633962154388428, + "logps/rejected": -7.100059509277344, + "loss": 0.6083, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -6.633962154388428, + "rewards/margins": 0.4660969376564026, + "rewards/rejected": -7.100059509277344, + "semantic_entropy": 0.014079605229198933, + "step": 790 + }, + { + "epoch": 0.42548921224284997, + "grad_norm": 16.409030978097405, + "learning_rate": 9.946969668401696e-07, + "logits/chosen": 0.2830341160297394, + "logits/rejected": 0.3877353072166443, + "logps/chosen": -6.539282321929932, + "logps/rejected": -7.096798896789551, + "loss": 0.6047, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -6.539282321929932, + "rewards/margins": 0.5575160384178162, + "rewards/rejected": -7.096798896789551, + "semantic_entropy": 0.015337374992668629, + "step": 795 + }, + { + "epoch": 0.4281652450242516, + "grad_norm": 13.519739236635578, + "learning_rate": 9.944683435341155e-07, + "logits/chosen": 0.30631715059280396, + "logits/rejected": 0.35199958086013794, + "logps/chosen": -6.5635480880737305, + "logps/rejected": -7.057257175445557, + "loss": 0.5924, + "rewards/accuracies": 0.6875, + "rewards/chosen": -6.5635480880737305, + "rewards/margins": 0.49370861053466797, + "rewards/rejected": -7.057257175445557, + "semantic_entropy": 0.015060871839523315, + "step": 800 + }, + { + "epoch": 0.4281652450242516, + "eval_logits/chosen": 0.4883604347705841, + "eval_logits/rejected": 0.5385698676109314, + "eval_logps/chosen": -6.740940093994141, + "eval_logps/rejected": -7.298737049102783, + "eval_loss": 0.5850783586502075, + "eval_rewards/accuracies": 0.6810088753700256, + "eval_rewards/chosen": -6.740940093994141, + "eval_rewards/margins": 0.5577963590621948, + "eval_rewards/rejected": -7.298737049102783, + "eval_runtime": 34.813, + "eval_samples_per_second": 38.635, + "eval_semantic_entropy": 0.013112816959619522, + "eval_steps_per_second": 9.68, + "step": 800 + }, + { + "epoch": 0.4308412778056531, + "grad_norm": 19.237636205467812, + "learning_rate": 9.942349228940236e-07, + "logits/chosen": 0.30846095085144043, + "logits/rejected": 0.3973791003227234, + "logps/chosen": -6.796361446380615, + "logps/rejected": -7.3480072021484375, + "loss": 0.5769, + "rewards/accuracies": 0.71875, + "rewards/chosen": -6.796361446380615, + "rewards/margins": 0.551645040512085, + "rewards/rejected": -7.3480072021484375, + "semantic_entropy": 0.01173459179699421, + "step": 805 + }, + { + "epoch": 0.43351731058705467, + "grad_norm": 17.157117885795127, + "learning_rate": 9.939967071845424e-07, + "logits/chosen": 0.30237749218940735, + "logits/rejected": 0.3430071473121643, + "logps/chosen": -6.8641839027404785, + "logps/rejected": -7.332343101501465, + "loss": 0.6095, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -6.8641839027404785, + "rewards/margins": 0.4681592583656311, + "rewards/rejected": -7.332343101501465, + "semantic_entropy": 0.013705052435398102, + "step": 810 + }, + { + "epoch": 0.4361933433684563, + "grad_norm": 17.86668130034969, + "learning_rate": 9.937536987168413e-07, + "logits/chosen": 0.31386134028434753, + "logits/rejected": 0.3876408636569977, + "logps/chosen": -6.719006538391113, + "logps/rejected": -7.485579490661621, + "loss": 0.5959, + "rewards/accuracies": 0.71875, + "rewards/chosen": -6.719006538391113, + "rewards/margins": 0.7665729522705078, + "rewards/rejected": -7.485579490661621, + "semantic_entropy": 0.016674160957336426, + "step": 815 + }, + { + "epoch": 0.4388693761498578, + "grad_norm": 14.614768670805427, + "learning_rate": 9.935058998485896e-07, + "logits/chosen": 0.39349859952926636, + "logits/rejected": 0.4051267206668854, + "logps/chosen": -7.036497592926025, + "logps/rejected": -7.662347316741943, + "loss": 0.5724, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -7.036497592926025, + "rewards/margins": 0.6258499622344971, + "rewards/rejected": -7.662347316741943, + "semantic_entropy": 0.012881157919764519, + "step": 820 + }, + { + "epoch": 0.44154540893125943, + "grad_norm": 30.403962010078292, + "learning_rate": 9.932533129839333e-07, + "logits/chosen": 0.3851960599422455, + "logits/rejected": 0.45596402883529663, + "logps/chosen": -7.236742973327637, + "logps/rejected": -7.798255920410156, + "loss": 0.6115, + "rewards/accuracies": 0.71875, + "rewards/chosen": -7.236742973327637, + "rewards/margins": 0.5615121126174927, + "rewards/rejected": -7.798255920410156, + "semantic_entropy": 0.013946113176643848, + "step": 825 + }, + { + "epoch": 0.444221441712661, + "grad_norm": 20.869180068401533, + "learning_rate": 9.929959405734711e-07, + "logits/chosen": 0.45160192251205444, + "logits/rejected": 0.5339521765708923, + "logps/chosen": -7.302011966705322, + "logps/rejected": -7.983066558837891, + "loss": 0.5467, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -7.302011966705322, + "rewards/margins": 0.6810555458068848, + "rewards/rejected": -7.983066558837891, + "semantic_entropy": 0.010290712118148804, + "step": 830 + }, + { + "epoch": 0.44689747449406253, + "grad_norm": 19.2525609808125, + "learning_rate": 9.927337851142314e-07, + "logits/chosen": 0.5376949906349182, + "logits/rejected": 0.588058590888977, + "logps/chosen": -7.635756492614746, + "logps/rejected": -8.161395072937012, + "loss": 0.6001, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -7.635756492614746, + "rewards/margins": 0.5256373286247253, + "rewards/rejected": -8.161395072937012, + "semantic_entropy": 0.008404644206166267, + "step": 835 + }, + { + "epoch": 0.44957350727546413, + "grad_norm": 19.454601896686906, + "learning_rate": 9.924668491496474e-07, + "logits/chosen": 0.5822176933288574, + "logits/rejected": 0.6964151263237, + "logps/chosen": -7.75359582901001, + "logps/rejected": -8.28437614440918, + "loss": 0.6008, + "rewards/accuracies": 0.6875, + "rewards/chosen": -7.75359582901001, + "rewards/margins": 0.5307798981666565, + "rewards/rejected": -8.28437614440918, + "semantic_entropy": 0.006352287717163563, + "step": 840 + }, + { + "epoch": 0.4522495400568657, + "grad_norm": 16.509437642167754, + "learning_rate": 9.92195135269533e-07, + "logits/chosen": 0.6624695658683777, + "logits/rejected": 0.6998416185379028, + "logps/chosen": -7.631247043609619, + "logps/rejected": -8.026775360107422, + "loss": 0.6553, + "rewards/accuracies": 0.65625, + "rewards/chosen": -7.631247043609619, + "rewards/margins": 0.39552828669548035, + "rewards/rejected": -8.026775360107422, + "semantic_entropy": 0.00823633000254631, + "step": 845 + }, + { + "epoch": 0.4549255728382673, + "grad_norm": 18.796921800518653, + "learning_rate": 9.919186461100574e-07, + "logits/chosen": 0.6312128305435181, + "logits/rejected": 0.6908556222915649, + "logps/chosen": -7.528157711029053, + "logps/rejected": -8.073545455932617, + "loss": 0.565, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -7.528157711029053, + "rewards/margins": 0.545387864112854, + "rewards/rejected": -8.073545455932617, + "semantic_entropy": 0.007035645190626383, + "step": 850 + }, + { + "epoch": 0.45760160561966884, + "grad_norm": 27.677336964086486, + "learning_rate": 9.9163738435372e-07, + "logits/chosen": 0.5352962017059326, + "logits/rejected": 0.6097812056541443, + "logps/chosen": -7.2810773849487305, + "logps/rejected": -7.931620121002197, + "loss": 0.6214, + "rewards/accuracies": 0.71875, + "rewards/chosen": -7.2810773849487305, + "rewards/margins": 0.6505423188209534, + "rewards/rejected": -7.931620121002197, + "semantic_entropy": 0.009111289866268635, + "step": 855 + }, + { + "epoch": 0.4602776384010704, + "grad_norm": 14.272226362354068, + "learning_rate": 9.913513527293234e-07, + "logits/chosen": 0.34996968507766724, + "logits/rejected": 0.4666077494621277, + "logps/chosen": -7.2218828201293945, + "logps/rejected": -7.859616756439209, + "loss": 0.6001, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -7.2218828201293945, + "rewards/margins": 0.6377342939376831, + "rewards/rejected": -7.859616756439209, + "semantic_entropy": 0.009801121428608894, + "step": 860 + }, + { + "epoch": 0.462953671182472, + "grad_norm": 26.063412987584922, + "learning_rate": 9.910605540119474e-07, + "logits/chosen": 0.33111342787742615, + "logits/rejected": 0.40587443113327026, + "logps/chosen": -7.022481441497803, + "logps/rejected": -7.678803443908691, + "loss": 0.605, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -7.022481441497803, + "rewards/margins": 0.6563228368759155, + "rewards/rejected": -7.678803443908691, + "semantic_entropy": 0.013630586676299572, + "step": 865 + }, + { + "epoch": 0.46562970396387354, + "grad_norm": 13.43476713833555, + "learning_rate": 9.907649910229227e-07, + "logits/chosen": 0.22445161640644073, + "logits/rejected": 0.39778199791908264, + "logps/chosen": -6.874536037445068, + "logps/rejected": -7.5805511474609375, + "loss": 0.5545, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -6.874536037445068, + "rewards/margins": 0.7060148119926453, + "rewards/rejected": -7.5805511474609375, + "semantic_entropy": 0.014564545825123787, + "step": 870 + }, + { + "epoch": 0.46830573674527515, + "grad_norm": 20.657905051442476, + "learning_rate": 9.90464666629803e-07, + "logits/chosen": 0.3672102391719818, + "logits/rejected": 0.41505926847457886, + "logps/chosen": -7.1059699058532715, + "logps/rejected": -7.5586981773376465, + "loss": 0.6699, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -7.1059699058532715, + "rewards/margins": 0.4527283310890198, + "rewards/rejected": -7.5586981773376465, + "semantic_entropy": 0.01175951398909092, + "step": 875 + }, + { + "epoch": 0.4709817695266767, + "grad_norm": 10.963959193632688, + "learning_rate": 9.901595837463363e-07, + "logits/chosen": 0.3955201506614685, + "logits/rejected": 0.5143112540245056, + "logps/chosen": -7.272347927093506, + "logps/rejected": -7.984310150146484, + "loss": 0.5331, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -7.272347927093506, + "rewards/margins": 0.7119626998901367, + "rewards/rejected": -7.984310150146484, + "semantic_entropy": 0.009277241304516792, + "step": 880 + }, + { + "epoch": 0.47365780230807825, + "grad_norm": 17.54527652296308, + "learning_rate": 9.898497453324384e-07, + "logits/chosen": 0.330959290266037, + "logits/rejected": 0.38617414236068726, + "logps/chosen": -7.402396202087402, + "logps/rejected": -7.947201728820801, + "loss": 0.5931, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -7.402396202087402, + "rewards/margins": 0.5448045134544373, + "rewards/rejected": -7.947201728820801, + "semantic_entropy": 0.008298173546791077, + "step": 885 + }, + { + "epoch": 0.47633383508947985, + "grad_norm": 18.303499456390117, + "learning_rate": 9.895351543941628e-07, + "logits/chosen": 0.2591246962547302, + "logits/rejected": 0.33385053277015686, + "logps/chosen": -7.262864589691162, + "logps/rejected": -7.755476951599121, + "loss": 0.6183, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -7.262864589691162, + "rewards/margins": 0.49261218309402466, + "rewards/rejected": -7.755476951599121, + "semantic_entropy": 0.012104134075343609, + "step": 890 + }, + { + "epoch": 0.4790098678708814, + "grad_norm": 17.671561073979735, + "learning_rate": 9.892158139836724e-07, + "logits/chosen": 0.348507821559906, + "logits/rejected": 0.4041469991207123, + "logps/chosen": -7.320245265960693, + "logps/rejected": -7.844791412353516, + "loss": 0.6116, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -7.320245265960693, + "rewards/margins": 0.5245463848114014, + "rewards/rejected": -7.844791412353516, + "semantic_entropy": 0.012385739013552666, + "step": 895 + }, + { + "epoch": 0.481685900652283, + "grad_norm": 19.73543179549833, + "learning_rate": 9.88891727199209e-07, + "logits/chosen": 0.2475900948047638, + "logits/rejected": 0.30757012963294983, + "logps/chosen": -7.321754455566406, + "logps/rejected": -7.892062187194824, + "loss": 0.6304, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -7.321754455566406, + "rewards/margins": 0.5703079104423523, + "rewards/rejected": -7.892062187194824, + "semantic_entropy": 0.011291766539216042, + "step": 900 + }, + { + "epoch": 0.48436193343368455, + "grad_norm": 23.01329046657235, + "learning_rate": 9.885628971850641e-07, + "logits/chosen": 0.3327587842941284, + "logits/rejected": 0.4450058043003082, + "logps/chosen": -7.170090675354004, + "logps/rejected": -7.9321088790893555, + "loss": 0.5644, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -7.170090675354004, + "rewards/margins": 0.7620194554328918, + "rewards/rejected": -7.9321088790893555, + "semantic_entropy": 0.013015474192798138, + "step": 905 + }, + { + "epoch": 0.48703796621508616, + "grad_norm": 13.383948275630102, + "learning_rate": 9.882293271315481e-07, + "logits/chosen": 0.3371312916278839, + "logits/rejected": 0.39455828070640564, + "logps/chosen": -7.155638217926025, + "logps/rejected": -7.695284843444824, + "loss": 0.6107, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -7.155638217926025, + "rewards/margins": 0.5396467447280884, + "rewards/rejected": -7.695284843444824, + "semantic_entropy": 0.010182186029851437, + "step": 910 + }, + { + "epoch": 0.4897139989964877, + "grad_norm": 17.28977001865095, + "learning_rate": 9.878910202749589e-07, + "logits/chosen": 0.3446193337440491, + "logits/rejected": 0.4533708095550537, + "logps/chosen": -7.136691093444824, + "logps/rejected": -7.7163190841674805, + "loss": 0.6031, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -7.136691093444824, + "rewards/margins": 0.5796278715133667, + "rewards/rejected": -7.7163190841674805, + "semantic_entropy": 0.01082690805196762, + "step": 915 + }, + { + "epoch": 0.49239003177788926, + "grad_norm": 16.51391506460225, + "learning_rate": 9.875479798975512e-07, + "logits/chosen": 0.3450031876564026, + "logits/rejected": 0.4413267970085144, + "logps/chosen": -6.917219638824463, + "logps/rejected": -7.611997127532959, + "loss": 0.5868, + "rewards/accuracies": 0.71875, + "rewards/chosen": -6.917219638824463, + "rewards/margins": 0.6947778463363647, + "rewards/rejected": -7.611997127532959, + "semantic_entropy": 0.013024079613387585, + "step": 920 + }, + { + "epoch": 0.49506606455929086, + "grad_norm": 22.468621981222572, + "learning_rate": 9.87200209327504e-07, + "logits/chosen": 0.30431827902793884, + "logits/rejected": 0.4026539921760559, + "logps/chosen": -7.283698081970215, + "logps/rejected": -7.741362571716309, + "loss": 0.6606, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -7.283698081970215, + "rewards/margins": 0.4576646387577057, + "rewards/rejected": -7.741362571716309, + "semantic_entropy": 0.009632373228669167, + "step": 925 + }, + { + "epoch": 0.4977420973406924, + "grad_norm": 22.170244658612646, + "learning_rate": 9.868477119388894e-07, + "logits/chosen": 0.29118505120277405, + "logits/rejected": 0.34077757596969604, + "logps/chosen": -7.040729522705078, + "logps/rejected": -7.742720127105713, + "loss": 0.5839, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -7.040729522705078, + "rewards/margins": 0.7019898295402527, + "rewards/rejected": -7.742720127105713, + "semantic_entropy": 0.01227110717445612, + "step": 930 + }, + { + "epoch": 0.500418130122094, + "grad_norm": 17.301303488197902, + "learning_rate": 9.864904911516383e-07, + "logits/chosen": 0.285023957490921, + "logits/rejected": 0.3279130458831787, + "logps/chosen": -7.310843467712402, + "logps/rejected": -7.921121120452881, + "loss": 0.5644, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -7.310843467712402, + "rewards/margins": 0.6102767586708069, + "rewards/rejected": -7.921121120452881, + "semantic_entropy": 0.011266985908150673, + "step": 935 + }, + { + "epoch": 0.5030941629034956, + "grad_norm": 17.308136534374302, + "learning_rate": 9.861285504315084e-07, + "logits/chosen": 0.2767130434513092, + "logits/rejected": 0.3372814357280731, + "logps/chosen": -7.225625038146973, + "logps/rejected": -7.890871524810791, + "loss": 0.5472, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -7.225625038146973, + "rewards/margins": 0.6652467846870422, + "rewards/rejected": -7.890871524810791, + "semantic_entropy": 0.011087710037827492, + "step": 940 + }, + { + "epoch": 0.5057701956848971, + "grad_norm": 18.83655138445813, + "learning_rate": 9.857618932900502e-07, + "logits/chosen": 0.25278183817863464, + "logits/rejected": 0.37249675393104553, + "logps/chosen": -7.409969329833984, + "logps/rejected": -7.930933475494385, + "loss": 0.6172, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -7.409969329833984, + "rewards/margins": 0.5209641456604004, + "rewards/rejected": -7.930933475494385, + "semantic_entropy": 0.012186022475361824, + "step": 945 + }, + { + "epoch": 0.5084462284662987, + "grad_norm": 22.919603952974256, + "learning_rate": 9.853905232845727e-07, + "logits/chosen": 0.22130601108074188, + "logits/rejected": 0.3324377238750458, + "logps/chosen": -7.2493391036987305, + "logps/rejected": -7.837095737457275, + "loss": 0.614, + "rewards/accuracies": 0.71875, + "rewards/chosen": -7.2493391036987305, + "rewards/margins": 0.5877568125724792, + "rewards/rejected": -7.837095737457275, + "semantic_entropy": 0.013909459114074707, + "step": 950 + }, + { + "epoch": 0.5111222612477003, + "grad_norm": 20.299888419343265, + "learning_rate": 9.850144440181095e-07, + "logits/chosen": 0.2346227467060089, + "logits/rejected": 0.38050609827041626, + "logps/chosen": -7.475827217102051, + "logps/rejected": -8.115577697753906, + "loss": 0.5694, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -7.475827217102051, + "rewards/margins": 0.639750599861145, + "rewards/rejected": -8.115577697753906, + "semantic_entropy": 0.011748342774808407, + "step": 955 + }, + { + "epoch": 0.5137982940291018, + "grad_norm": 26.80684693873056, + "learning_rate": 9.846336591393832e-07, + "logits/chosen": 0.26571953296661377, + "logits/rejected": 0.3704363703727722, + "logps/chosen": -7.530020713806152, + "logps/rejected": -8.231939315795898, + "loss": 0.5966, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -7.530020713806152, + "rewards/margins": 0.7019174098968506, + "rewards/rejected": -8.231939315795898, + "semantic_entropy": 0.010797923430800438, + "step": 960 + }, + { + "epoch": 0.5164743268105034, + "grad_norm": 22.781706412422338, + "learning_rate": 9.842481723427704e-07, + "logits/chosen": 0.3323562741279602, + "logits/rejected": 0.3556436598300934, + "logps/chosen": -7.796743869781494, + "logps/rejected": -8.327234268188477, + "loss": 0.6751, + "rewards/accuracies": 0.65625, + "rewards/chosen": -7.796743869781494, + "rewards/margins": 0.5304909944534302, + "rewards/rejected": -8.327234268188477, + "semantic_entropy": 0.0087089529260993, + "step": 965 + }, + { + "epoch": 0.519150359591905, + "grad_norm": 15.79393005727687, + "learning_rate": 9.838579873682658e-07, + "logits/chosen": 0.37312960624694824, + "logits/rejected": 0.370185911655426, + "logps/chosen": -7.5613603591918945, + "logps/rejected": -8.011152267456055, + "loss": 0.6467, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -7.5613603591918945, + "rewards/margins": 0.4497918486595154, + "rewards/rejected": -8.011152267456055, + "semantic_entropy": 0.008952843025326729, + "step": 970 + }, + { + "epoch": 0.5218263923733065, + "grad_norm": 11.274721147056358, + "learning_rate": 9.834631080014457e-07, + "logits/chosen": 0.34255561232566833, + "logits/rejected": 0.4821571409702301, + "logps/chosen": -7.382586479187012, + "logps/rejected": -7.989335060119629, + "loss": 0.5608, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -7.382586479187012, + "rewards/margins": 0.6067487001419067, + "rewards/rejected": -7.989335060119629, + "semantic_entropy": 0.00937967374920845, + "step": 975 + }, + { + "epoch": 0.5245024251547081, + "grad_norm": 18.0683203101528, + "learning_rate": 9.830635380734312e-07, + "logits/chosen": 0.3637096583843231, + "logits/rejected": 0.4652346074581146, + "logps/chosen": -7.228192329406738, + "logps/rejected": -7.7642951011657715, + "loss": 0.5798, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -7.228192329406738, + "rewards/margins": 0.536103367805481, + "rewards/rejected": -7.7642951011657715, + "semantic_entropy": 0.010401034727692604, + "step": 980 + }, + { + "epoch": 0.5271784579361097, + "grad_norm": 15.686670446006769, + "learning_rate": 9.826592814608517e-07, + "logits/chosen": 0.508113443851471, + "logits/rejected": 0.6232683062553406, + "logps/chosen": -7.083076477050781, + "logps/rejected": -7.62778377532959, + "loss": 0.5897, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -7.083076477050781, + "rewards/margins": 0.5447085499763489, + "rewards/rejected": -7.62778377532959, + "semantic_entropy": 0.010786814615130424, + "step": 985 + }, + { + "epoch": 0.5298544907175113, + "grad_norm": 11.8792829377083, + "learning_rate": 9.822503420858067e-07, + "logits/chosen": 0.5830814242362976, + "logits/rejected": 0.602809488773346, + "logps/chosen": -7.003039360046387, + "logps/rejected": -7.556340217590332, + "loss": 0.5887, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -7.003039360046387, + "rewards/margins": 0.5533004999160767, + "rewards/rejected": -7.556340217590332, + "semantic_entropy": 0.01125816348940134, + "step": 990 + }, + { + "epoch": 0.5325305234989128, + "grad_norm": 13.602049737746205, + "learning_rate": 9.818367239158277e-07, + "logits/chosen": 0.5771272778511047, + "logits/rejected": 0.6160858869552612, + "logps/chosen": -7.035143852233887, + "logps/rejected": -7.554785251617432, + "loss": 0.6097, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -7.035143852233887, + "rewards/margins": 0.5196409225463867, + "rewards/rejected": -7.554785251617432, + "semantic_entropy": 0.010585736483335495, + "step": 995 + }, + { + "epoch": 0.5352065562803144, + "grad_norm": 13.516147889237553, + "learning_rate": 9.8141843096384e-07, + "logits/chosen": 0.594735860824585, + "logits/rejected": 0.6816811561584473, + "logps/chosen": -7.359915256500244, + "logps/rejected": -7.887757778167725, + "loss": 0.5661, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -7.359915256500244, + "rewards/margins": 0.52784264087677, + "rewards/rejected": -7.887757778167725, + "semantic_entropy": 0.00826399214565754, + "step": 1000 + }, + { + "epoch": 0.537882589061716, + "grad_norm": 21.398826323428153, + "learning_rate": 9.809954672881237e-07, + "logits/chosen": 0.5744519829750061, + "logits/rejected": 0.676822304725647, + "logps/chosen": -7.203065395355225, + "logps/rejected": -7.726864814758301, + "loss": 0.5962, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -7.203065395355225, + "rewards/margins": 0.5237992405891418, + "rewards/rejected": -7.726864814758301, + "semantic_entropy": 0.01075592078268528, + "step": 1005 + }, + { + "epoch": 0.5405586218431175, + "grad_norm": 13.803390343561437, + "learning_rate": 9.80567836992274e-07, + "logits/chosen": 0.5930423140525818, + "logits/rejected": 0.6964749693870544, + "logps/chosen": -6.960592746734619, + "logps/rejected": -7.653602600097656, + "loss": 0.5627, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -6.960592746734619, + "rewards/margins": 0.6930093765258789, + "rewards/rejected": -7.653602600097656, + "semantic_entropy": 0.012550493702292442, + "step": 1010 + }, + { + "epoch": 0.5432346546245191, + "grad_norm": 17.08463390545894, + "learning_rate": 9.801355442251625e-07, + "logits/chosen": 0.5665202140808105, + "logits/rejected": 0.6539164781570435, + "logps/chosen": -6.99094295501709, + "logps/rejected": -7.582823276519775, + "loss": 0.5839, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -6.99094295501709, + "rewards/margins": 0.5918795466423035, + "rewards/rejected": -7.582823276519775, + "semantic_entropy": 0.011810271069407463, + "step": 1015 + }, + { + "epoch": 0.5459106874059207, + "grad_norm": 16.08279102674936, + "learning_rate": 9.796985931808949e-07, + "logits/chosen": 0.5616046786308289, + "logits/rejected": 0.6401379704475403, + "logps/chosen": -6.898770809173584, + "logps/rejected": -7.535937309265137, + "loss": 0.5637, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -6.898770809173584, + "rewards/margins": 0.6371673941612244, + "rewards/rejected": -7.535937309265137, + "semantic_entropy": 0.012174823321402073, + "step": 1020 + }, + { + "epoch": 0.5485867201873222, + "grad_norm": 17.72669785551067, + "learning_rate": 9.792569880987724e-07, + "logits/chosen": 0.5178264379501343, + "logits/rejected": 0.5918501019477844, + "logps/chosen": -7.055424690246582, + "logps/rejected": -7.835641384124756, + "loss": 0.5346, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -7.055424690246582, + "rewards/margins": 0.7802165150642395, + "rewards/rejected": -7.835641384124756, + "semantic_entropy": 0.012361900880932808, + "step": 1025 + }, + { + "epoch": 0.5512627529687238, + "grad_norm": 23.667023672408078, + "learning_rate": 9.788107332632493e-07, + "logits/chosen": 0.5643856525421143, + "logits/rejected": 0.615861713886261, + "logps/chosen": -7.197871208190918, + "logps/rejected": -7.710868835449219, + "loss": 0.6595, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -7.197871208190918, + "rewards/margins": 0.512997031211853, + "rewards/rejected": -7.710868835449219, + "semantic_entropy": 0.010254869237542152, + "step": 1030 + }, + { + "epoch": 0.5539387857501255, + "grad_norm": 18.12568347350743, + "learning_rate": 9.783598330038924e-07, + "logits/chosen": 0.607509434223175, + "logits/rejected": 0.6717751622200012, + "logps/chosen": -7.640361785888672, + "logps/rejected": -8.166301727294922, + "loss": 0.5919, + "rewards/accuracies": 0.6875, + "rewards/chosen": -7.640361785888672, + "rewards/margins": 0.525938868522644, + "rewards/rejected": -8.166301727294922, + "semantic_entropy": 0.0061937421560287476, + "step": 1035 + }, + { + "epoch": 0.5566148185315271, + "grad_norm": 16.521524018519518, + "learning_rate": 9.779042916953376e-07, + "logits/chosen": 0.6564599871635437, + "logits/rejected": 0.7573956251144409, + "logps/chosen": -7.6685791015625, + "logps/rejected": -8.591699600219727, + "loss": 0.4835, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -7.6685791015625, + "rewards/margins": 0.923120379447937, + "rewards/rejected": -8.591699600219727, + "semantic_entropy": 0.006848378572613001, + "step": 1040 + }, + { + "epoch": 0.5592908513129285, + "grad_norm": 23.54333369460797, + "learning_rate": 9.774441137572487e-07, + "logits/chosen": 0.6087485551834106, + "logits/rejected": 0.704781711101532, + "logps/chosen": -8.048932075500488, + "logps/rejected": -8.719579696655273, + "loss": 0.5672, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -8.048932075500488, + "rewards/margins": 0.6706476211547852, + "rewards/rejected": -8.719579696655273, + "semantic_entropy": 0.005030449479818344, + "step": 1045 + }, + { + "epoch": 0.5619668840943302, + "grad_norm": 18.562047799738053, + "learning_rate": 9.76979303654274e-07, + "logits/chosen": 0.531555712223053, + "logits/rejected": 0.5957599878311157, + "logps/chosen": -8.234556198120117, + "logps/rejected": -8.946748733520508, + "loss": 0.5617, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -8.234556198120117, + "rewards/margins": 0.7121928930282593, + "rewards/rejected": -8.946748733520508, + "semantic_entropy": 0.003957569133490324, + "step": 1050 + }, + { + "epoch": 0.5646429168757318, + "grad_norm": 20.633835966884693, + "learning_rate": 9.765098658960035e-07, + "logits/chosen": 0.5291253924369812, + "logits/rejected": 0.5597686767578125, + "logps/chosen": -8.142400741577148, + "logps/rejected": -8.771413803100586, + "loss": 0.5741, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -8.142400741577148, + "rewards/margins": 0.6290136575698853, + "rewards/rejected": -8.771413803100586, + "semantic_entropy": 0.004258748609572649, + "step": 1055 + }, + { + "epoch": 0.5673189496571333, + "grad_norm": 34.54645017218859, + "learning_rate": 9.76035805036924e-07, + "logits/chosen": 0.5386477112770081, + "logits/rejected": 0.6480933427810669, + "logps/chosen": -8.24083137512207, + "logps/rejected": -8.844766616821289, + "loss": 0.5895, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -8.24083137512207, + "rewards/margins": 0.6039354801177979, + "rewards/rejected": -8.844766616821289, + "semantic_entropy": 0.004295586608350277, + "step": 1060 + }, + { + "epoch": 0.5699949824385349, + "grad_norm": 18.991335444449422, + "learning_rate": 9.755571256763764e-07, + "logits/chosen": 0.5871809124946594, + "logits/rejected": 0.6752435564994812, + "logps/chosen": -8.063031196594238, + "logps/rejected": -8.696462631225586, + "loss": 0.5889, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -8.063031196594238, + "rewards/margins": 0.6334304809570312, + "rewards/rejected": -8.696462631225586, + "semantic_entropy": 0.005001295357942581, + "step": 1065 + }, + { + "epoch": 0.5726710152199365, + "grad_norm": 13.986507130299717, + "learning_rate": 9.750738324585097e-07, + "logits/chosen": 0.5075832605361938, + "logits/rejected": 0.65854811668396, + "logps/chosen": -8.004261016845703, + "logps/rejected": -8.6107177734375, + "loss": 0.5822, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -8.004261016845703, + "rewards/margins": 0.6064566373825073, + "rewards/rejected": -8.6107177734375, + "semantic_entropy": 0.004056010395288467, + "step": 1070 + }, + { + "epoch": 0.5753470480013381, + "grad_norm": 13.26346189390983, + "learning_rate": 9.74585930072237e-07, + "logits/chosen": 0.5564194917678833, + "logits/rejected": 0.6411615014076233, + "logps/chosen": -7.78427791595459, + "logps/rejected": -8.535164833068848, + "loss": 0.5486, + "rewards/accuracies": 0.75, + "rewards/chosen": -7.78427791595459, + "rewards/margins": 0.7508861422538757, + "rewards/rejected": -8.535164833068848, + "semantic_entropy": 0.00624846201390028, + "step": 1075 + }, + { + "epoch": 0.5780230807827396, + "grad_norm": 17.424932509279063, + "learning_rate": 9.740934232511892e-07, + "logits/chosen": 0.5387696623802185, + "logits/rejected": 0.6152251362800598, + "logps/chosen": -7.787422180175781, + "logps/rejected": -8.456579208374023, + "loss": 0.5961, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -7.787422180175781, + "rewards/margins": 0.6691574454307556, + "rewards/rejected": -8.456579208374023, + "semantic_entropy": 0.005864334292709827, + "step": 1080 + }, + { + "epoch": 0.5806991135641412, + "grad_norm": 13.701215181738062, + "learning_rate": 9.735963167736698e-07, + "logits/chosen": 0.6226884126663208, + "logits/rejected": 0.7190333008766174, + "logps/chosen": -7.892449855804443, + "logps/rejected": -8.544393539428711, + "loss": 0.5802, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -7.892449855804443, + "rewards/margins": 0.6519426107406616, + "rewards/rejected": -8.544393539428711, + "semantic_entropy": 0.005305818282067776, + "step": 1085 + }, + { + "epoch": 0.5833751463455428, + "grad_norm": 24.146957684848896, + "learning_rate": 9.730946154626078e-07, + "logits/chosen": 0.6268946528434753, + "logits/rejected": 0.6841186285018921, + "logps/chosen": -7.797842502593994, + "logps/rejected": -8.302523612976074, + "loss": 0.6651, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -7.797842502593994, + "rewards/margins": 0.5046811699867249, + "rewards/rejected": -8.302523612976074, + "semantic_entropy": 0.006395612843334675, + "step": 1090 + }, + { + "epoch": 0.5860511791269443, + "grad_norm": 18.196449287747534, + "learning_rate": 9.725883241855117e-07, + "logits/chosen": 0.5718734264373779, + "logits/rejected": 0.6677632331848145, + "logps/chosen": -7.862264156341553, + "logps/rejected": -8.452044486999512, + "loss": 0.5838, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -7.862264156341553, + "rewards/margins": 0.5897812843322754, + "rewards/rejected": -8.452044486999512, + "semantic_entropy": 0.005370546132326126, + "step": 1095 + }, + { + "epoch": 0.5887272119083459, + "grad_norm": 18.197375532898803, + "learning_rate": 9.720774478544218e-07, + "logits/chosen": 0.6339142322540283, + "logits/rejected": 0.7179350852966309, + "logps/chosen": -7.516765594482422, + "logps/rejected": -8.200715065002441, + "loss": 0.5646, + "rewards/accuracies": 0.6875, + "rewards/chosen": -7.516765594482422, + "rewards/margins": 0.6839491128921509, + "rewards/rejected": -8.200715065002441, + "semantic_entropy": 0.007220913656055927, + "step": 1100 + }, + { + "epoch": 0.5914032446897475, + "grad_norm": 16.846636546681744, + "learning_rate": 9.715619914258624e-07, + "logits/chosen": 0.570271909236908, + "logits/rejected": 0.6238844990730286, + "logps/chosen": -7.6180243492126465, + "logps/rejected": -8.167299270629883, + "loss": 0.5952, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -7.6180243492126465, + "rewards/margins": 0.549274206161499, + "rewards/rejected": -8.167299270629883, + "semantic_entropy": 0.006116692908108234, + "step": 1105 + }, + { + "epoch": 0.594079277471149, + "grad_norm": 26.633505176750862, + "learning_rate": 9.710419599007937e-07, + "logits/chosen": 0.638900101184845, + "logits/rejected": 0.7319290637969971, + "logps/chosen": -7.582394599914551, + "logps/rejected": -8.110502243041992, + "loss": 0.5966, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -7.582394599914551, + "rewards/margins": 0.5281090140342712, + "rewards/rejected": -8.110502243041992, + "semantic_entropy": 0.006717337761074305, + "step": 1110 + }, + { + "epoch": 0.5967553102525506, + "grad_norm": 27.411766077174338, + "learning_rate": 9.705173583245643e-07, + "logits/chosen": 0.6096881628036499, + "logits/rejected": 0.7123531699180603, + "logps/chosen": -7.56555700302124, + "logps/rejected": -7.999688625335693, + "loss": 0.6675, + "rewards/accuracies": 0.65625, + "rewards/chosen": -7.56555700302124, + "rewards/margins": 0.4341324269771576, + "rewards/rejected": -7.999688625335693, + "semantic_entropy": 0.0064805252477526665, + "step": 1115 + }, + { + "epoch": 0.5994313430339522, + "grad_norm": 13.615665322183913, + "learning_rate": 9.699881917868609e-07, + "logits/chosen": 0.5667654275894165, + "logits/rejected": 0.6309981346130371, + "logps/chosen": -7.366901397705078, + "logps/rejected": -7.9114861488342285, + "loss": 0.6056, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -7.366901397705078, + "rewards/margins": 0.5445848703384399, + "rewards/rejected": -7.9114861488342285, + "semantic_entropy": 0.007139952387660742, + "step": 1120 + }, + { + "epoch": 0.6021073758153538, + "grad_norm": 15.675473698672269, + "learning_rate": 9.694544654216594e-07, + "logits/chosen": 0.5247001647949219, + "logits/rejected": 0.6292780041694641, + "logps/chosen": -7.417148590087891, + "logps/rejected": -7.97311544418335, + "loss": 0.5796, + "rewards/accuracies": 0.71875, + "rewards/chosen": -7.417148590087891, + "rewards/margins": 0.5559675097465515, + "rewards/rejected": -7.97311544418335, + "semantic_entropy": 0.006966522429138422, + "step": 1125 + }, + { + "epoch": 0.6047834085967553, + "grad_norm": 16.21958228068165, + "learning_rate": 9.689161844071755e-07, + "logits/chosen": 0.6028557419776917, + "logits/rejected": 0.6466434597969055, + "logps/chosen": -7.145249366760254, + "logps/rejected": -7.668765068054199, + "loss": 0.5902, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -7.145249366760254, + "rewards/margins": 0.5235155820846558, + "rewards/rejected": -7.668765068054199, + "semantic_entropy": 0.008800549432635307, + "step": 1130 + }, + { + "epoch": 0.6074594413781569, + "grad_norm": 15.585684935656099, + "learning_rate": 9.683733539658138e-07, + "logits/chosen": 0.6030339002609253, + "logits/rejected": 0.7146845459938049, + "logps/chosen": -7.388113975524902, + "logps/rejected": -7.941763401031494, + "loss": 0.5785, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -7.388113975524902, + "rewards/margins": 0.5536485910415649, + "rewards/rejected": -7.941763401031494, + "semantic_entropy": 0.007055189460515976, + "step": 1135 + }, + { + "epoch": 0.6101354741595585, + "grad_norm": 15.602057313200888, + "learning_rate": 9.678259793641178e-07, + "logits/chosen": 0.5913205742835999, + "logits/rejected": 0.602319598197937, + "logps/chosen": -7.421736717224121, + "logps/rejected": -7.810797214508057, + "loss": 0.6116, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -7.421736717224121, + "rewards/margins": 0.38906151056289673, + "rewards/rejected": -7.810797214508057, + "semantic_entropy": 0.00688566267490387, + "step": 1140 + }, + { + "epoch": 0.61281150694096, + "grad_norm": 15.626443610629703, + "learning_rate": 9.672740659127183e-07, + "logits/chosen": 0.5252998471260071, + "logits/rejected": 0.5994977355003357, + "logps/chosen": -7.5483551025390625, + "logps/rejected": -8.214573860168457, + "loss": 0.5531, + "rewards/accuracies": 0.71875, + "rewards/chosen": -7.5483551025390625, + "rewards/margins": 0.6662176251411438, + "rewards/rejected": -8.214573860168457, + "semantic_entropy": 0.007115071173757315, + "step": 1145 + }, + { + "epoch": 0.6154875397223616, + "grad_norm": 16.005271466438277, + "learning_rate": 9.667176189662818e-07, + "logits/chosen": 0.5813131332397461, + "logits/rejected": 0.640332818031311, + "logps/chosen": -7.900903224945068, + "logps/rejected": -8.512435913085938, + "loss": 0.5717, + "rewards/accuracies": 0.71875, + "rewards/chosen": -7.900903224945068, + "rewards/margins": 0.6115323901176453, + "rewards/rejected": -8.512435913085938, + "semantic_entropy": 0.0051747518591582775, + "step": 1150 + }, + { + "epoch": 0.6181635725037632, + "grad_norm": 10.625567401564435, + "learning_rate": 9.661566439234592e-07, + "logits/chosen": 0.6257847547531128, + "logits/rejected": 0.6622999906539917, + "logps/chosen": -7.946097373962402, + "logps/rejected": -8.47764778137207, + "loss": 0.6041, + "rewards/accuracies": 0.6875, + "rewards/chosen": -7.946097373962402, + "rewards/margins": 0.5315494537353516, + "rewards/rejected": -8.47764778137207, + "semantic_entropy": 0.005170217715203762, + "step": 1155 + }, + { + "epoch": 0.6208396052851648, + "grad_norm": 13.397344602674059, + "learning_rate": 9.655911462268327e-07, + "logits/chosen": 0.6595005989074707, + "logits/rejected": 0.7116304636001587, + "logps/chosen": -7.946617126464844, + "logps/rejected": -8.630485534667969, + "loss": 0.5581, + "rewards/accuracies": 0.6875, + "rewards/chosen": -7.946617126464844, + "rewards/margins": 0.6838675737380981, + "rewards/rejected": -8.630485534667969, + "semantic_entropy": 0.005823346786201, + "step": 1160 + }, + { + "epoch": 0.6235156380665663, + "grad_norm": 15.865578560547265, + "learning_rate": 9.650211313628636e-07, + "logits/chosen": 0.5803102254867554, + "logits/rejected": 0.6273818016052246, + "logps/chosen": -7.978617191314697, + "logps/rejected": -8.42393684387207, + "loss": 0.6414, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -7.978617191314697, + "rewards/margins": 0.44531959295272827, + "rewards/rejected": -8.42393684387207, + "semantic_entropy": 0.0065127527341246605, + "step": 1165 + }, + { + "epoch": 0.6261916708479679, + "grad_norm": 15.673714953896777, + "learning_rate": 9.644466048618386e-07, + "logits/chosen": 0.5762825608253479, + "logits/rejected": 0.6548576354980469, + "logps/chosen": -8.140003204345703, + "logps/rejected": -8.694659233093262, + "loss": 0.5987, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -8.140003204345703, + "rewards/margins": 0.5546567440032959, + "rewards/rejected": -8.694659233093262, + "semantic_entropy": 0.005027764476835728, + "step": 1170 + }, + { + "epoch": 0.6288677036293695, + "grad_norm": 14.354746897825013, + "learning_rate": 9.63867572297816e-07, + "logits/chosen": 0.5808348655700684, + "logits/rejected": 0.6876333355903625, + "logps/chosen": -7.959421634674072, + "logps/rejected": -8.607701301574707, + "loss": 0.5651, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -7.959421634674072, + "rewards/margins": 0.6482798457145691, + "rewards/rejected": -8.607701301574707, + "semantic_entropy": 0.006594679318368435, + "step": 1175 + }, + { + "epoch": 0.631543736410771, + "grad_norm": 12.98455385650067, + "learning_rate": 9.632840392885727e-07, + "logits/chosen": 0.5893815755844116, + "logits/rejected": 0.661659836769104, + "logps/chosen": -7.993622779846191, + "logps/rejected": -8.655765533447266, + "loss": 0.5828, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -7.993622779846191, + "rewards/margins": 0.6621420979499817, + "rewards/rejected": -8.655765533447266, + "semantic_entropy": 0.00508379889652133, + "step": 1180 + }, + { + "epoch": 0.6342197691921726, + "grad_norm": 14.115881656600934, + "learning_rate": 9.626960114955483e-07, + "logits/chosen": 0.6585602164268494, + "logits/rejected": 0.7358173131942749, + "logps/chosen": -7.811059474945068, + "logps/rejected": -8.613174438476562, + "loss": 0.5195, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -7.811059474945068, + "rewards/margins": 0.8021153211593628, + "rewards/rejected": -8.613174438476562, + "semantic_entropy": 0.006301518529653549, + "step": 1185 + }, + { + "epoch": 0.6368958019735742, + "grad_norm": 16.283892207062348, + "learning_rate": 9.621034946237909e-07, + "logits/chosen": 0.6387815475463867, + "logits/rejected": 0.7039491534233093, + "logps/chosen": -7.9904046058654785, + "logps/rejected": -8.646058082580566, + "loss": 0.5645, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -7.9904046058654785, + "rewards/margins": 0.655653715133667, + "rewards/rejected": -8.646058082580566, + "semantic_entropy": 0.005252276547253132, + "step": 1190 + }, + { + "epoch": 0.6395718347549757, + "grad_norm": 15.655836290539325, + "learning_rate": 9.615064944219021e-07, + "logits/chosen": 0.6491774320602417, + "logits/rejected": 0.7328025698661804, + "logps/chosen": -7.836843013763428, + "logps/rejected": -8.513358116149902, + "loss": 0.5399, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -7.836843013763428, + "rewards/margins": 0.6765137910842896, + "rewards/rejected": -8.513358116149902, + "semantic_entropy": 0.005473036784678698, + "step": 1195 + }, + { + "epoch": 0.6422478675363773, + "grad_norm": 22.62708626022183, + "learning_rate": 9.609050166819803e-07, + "logits/chosen": 0.5962838530540466, + "logits/rejected": 0.6391795873641968, + "logps/chosen": -8.035821914672852, + "logps/rejected": -8.608399391174316, + "loss": 0.5951, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -8.035821914672852, + "rewards/margins": 0.572577953338623, + "rewards/rejected": -8.608399391174316, + "semantic_entropy": 0.005160279106348753, + "step": 1200 + }, + { + "epoch": 0.6422478675363773, + "eval_logits/chosen": 0.7506793141365051, + "eval_logits/rejected": 0.7968686819076538, + "eval_logps/chosen": -7.988265514373779, + "eval_logps/rejected": -8.681319236755371, + "eval_loss": 0.5522213578224182, + "eval_rewards/accuracies": 0.7062314748764038, + "eval_rewards/chosen": -7.988265514373779, + "eval_rewards/margins": 0.6930533647537231, + "eval_rewards/rejected": -8.681319236755371, + "eval_runtime": 35.081, + "eval_samples_per_second": 38.34, + "eval_semantic_entropy": 0.004989789333194494, + "eval_steps_per_second": 9.606, + "step": 1200 + }, + { + "epoch": 0.6449239003177789, + "grad_norm": 18.377278845630318, + "learning_rate": 9.602990672395653e-07, + "logits/chosen": 0.582940399646759, + "logits/rejected": 0.6627975106239319, + "logps/chosen": -8.004450798034668, + "logps/rejected": -8.656949043273926, + "loss": 0.5505, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -8.004450798034668, + "rewards/margins": 0.6524981260299683, + "rewards/rejected": -8.656949043273926, + "semantic_entropy": 0.004912947304546833, + "step": 1205 + }, + { + "epoch": 0.6475999330991805, + "grad_norm": 13.700239190708754, + "learning_rate": 9.59688651973581e-07, + "logits/chosen": 0.7021932601928711, + "logits/rejected": 0.7906457185745239, + "logps/chosen": -8.091392517089844, + "logps/rejected": -8.654991149902344, + "loss": 0.587, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -8.091392517089844, + "rewards/margins": 0.563599169254303, + "rewards/rejected": -8.654991149902344, + "semantic_entropy": 0.004794766195118427, + "step": 1210 + }, + { + "epoch": 0.650275965880582, + "grad_norm": 15.628975304693077, + "learning_rate": 9.590737768062792e-07, + "logits/chosen": 0.6097584962844849, + "logits/rejected": 0.6667622327804565, + "logps/chosen": -8.029305458068848, + "logps/rejected": -8.514669418334961, + "loss": 0.619, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -8.029305458068848, + "rewards/margins": 0.4853641390800476, + "rewards/rejected": -8.514669418334961, + "semantic_entropy": 0.004363791085779667, + "step": 1215 + }, + { + "epoch": 0.6529519986619836, + "grad_norm": 14.141900238974408, + "learning_rate": 9.584544477031816e-07, + "logits/chosen": 0.7649446725845337, + "logits/rejected": 0.8241230249404907, + "logps/chosen": -7.659104824066162, + "logps/rejected": -8.234978675842285, + "loss": 0.5818, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -7.659104824066162, + "rewards/margins": 0.5758742094039917, + "rewards/rejected": -8.234978675842285, + "semantic_entropy": 0.006544353906065226, + "step": 1220 + }, + { + "epoch": 0.6556280314433852, + "grad_norm": 17.44536516233375, + "learning_rate": 9.578306706730215e-07, + "logits/chosen": 0.6202625036239624, + "logits/rejected": 0.7067128419876099, + "logps/chosen": -7.734452724456787, + "logps/rejected": -8.231317520141602, + "loss": 0.6291, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -7.734452724456787, + "rewards/margins": 0.4968656599521637, + "rewards/rejected": -8.231317520141602, + "semantic_entropy": 0.006045544985681772, + "step": 1225 + }, + { + "epoch": 0.6583040642247867, + "grad_norm": 14.058747132134709, + "learning_rate": 9.572024517676865e-07, + "logits/chosen": 0.6863638162612915, + "logits/rejected": 0.7385177612304688, + "logps/chosen": -7.626795768737793, + "logps/rejected": -8.15473461151123, + "loss": 0.6068, + "rewards/accuracies": 0.65625, + "rewards/chosen": -7.626795768737793, + "rewards/margins": 0.5279384851455688, + "rewards/rejected": -8.15473461151123, + "semantic_entropy": 0.006082098465412855, + "step": 1230 + }, + { + "epoch": 0.6609800970061883, + "grad_norm": 15.524786458902534, + "learning_rate": 9.565697970821593e-07, + "logits/chosen": 0.6960703134536743, + "logits/rejected": 0.7752768397331238, + "logps/chosen": -7.594348907470703, + "logps/rejected": -8.13396167755127, + "loss": 0.5959, + "rewards/accuracies": 0.65625, + "rewards/chosen": -7.594348907470703, + "rewards/margins": 0.5396129488945007, + "rewards/rejected": -8.13396167755127, + "semantic_entropy": 0.0065464479848742485, + "step": 1235 + }, + { + "epoch": 0.6636561297875899, + "grad_norm": 10.898687489202283, + "learning_rate": 9.559327127544585e-07, + "logits/chosen": 0.6455325484275818, + "logits/rejected": 0.7051125764846802, + "logps/chosen": -7.510709285736084, + "logps/rejected": -8.048765182495117, + "loss": 0.5766, + "rewards/accuracies": 0.65625, + "rewards/chosen": -7.510709285736084, + "rewards/margins": 0.5380562543869019, + "rewards/rejected": -8.048765182495117, + "semantic_entropy": 0.007156215608119965, + "step": 1240 + }, + { + "epoch": 0.6663321625689914, + "grad_norm": 18.786409391603485, + "learning_rate": 9.552912049655789e-07, + "logits/chosen": 0.6517975330352783, + "logits/rejected": 0.7252013683319092, + "logps/chosen": -7.325045108795166, + "logps/rejected": -7.9806952476501465, + "loss": 0.5695, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -7.325045108795166, + "rewards/margins": 0.6556496620178223, + "rewards/rejected": -7.9806952476501465, + "semantic_entropy": 0.00768858939409256, + "step": 1245 + }, + { + "epoch": 0.669008195350393, + "grad_norm": 24.445945587094794, + "learning_rate": 9.546452799394315e-07, + "logits/chosen": 0.6680857539176941, + "logits/rejected": 0.7646031975746155, + "logps/chosen": -7.549722194671631, + "logps/rejected": -8.019618034362793, + "loss": 0.645, + "rewards/accuracies": 0.625, + "rewards/chosen": -7.549722194671631, + "rewards/margins": 0.46989649534225464, + "rewards/rejected": -8.019618034362793, + "semantic_entropy": 0.006873616483062506, + "step": 1250 + }, + { + "epoch": 0.6716842281317946, + "grad_norm": 15.014444640765525, + "learning_rate": 9.539949439427846e-07, + "logits/chosen": 0.6218008995056152, + "logits/rejected": 0.6838528513908386, + "logps/chosen": -7.445742607116699, + "logps/rejected": -8.093426704406738, + "loss": 0.5457, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -7.445742607116699, + "rewards/margins": 0.6476832628250122, + "rewards/rejected": -8.093426704406738, + "semantic_entropy": 0.007571948226541281, + "step": 1255 + }, + { + "epoch": 0.6743602609131962, + "grad_norm": 12.345959930188625, + "learning_rate": 9.533402032852002e-07, + "logits/chosen": 0.5849351286888123, + "logits/rejected": 0.6522541642189026, + "logps/chosen": -7.5591864585876465, + "logps/rejected": -8.265511512756348, + "loss": 0.5424, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -7.5591864585876465, + "rewards/margins": 0.7063250541687012, + "rewards/rejected": -8.265511512756348, + "semantic_entropy": 0.006167138926684856, + "step": 1260 + }, + { + "epoch": 0.6770362936945977, + "grad_norm": 16.266967829326354, + "learning_rate": 9.526810643189754e-07, + "logits/chosen": 0.6240657567977905, + "logits/rejected": 0.7110647559165955, + "logps/chosen": -7.6305341720581055, + "logps/rejected": -8.264394760131836, + "loss": 0.5468, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -7.6305341720581055, + "rewards/margins": 0.6338610053062439, + "rewards/rejected": -8.264394760131836, + "semantic_entropy": 0.006204391364008188, + "step": 1265 + }, + { + "epoch": 0.6797123264759993, + "grad_norm": 16.69670345070714, + "learning_rate": 9.52017533439079e-07, + "logits/chosen": 0.5465856790542603, + "logits/rejected": 0.5944739580154419, + "logps/chosen": -7.692608833312988, + "logps/rejected": -8.17878532409668, + "loss": 0.6284, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -7.692608833312988, + "rewards/margins": 0.48617634177207947, + "rewards/rejected": -8.17878532409668, + "semantic_entropy": 0.0065203020349144936, + "step": 1270 + }, + { + "epoch": 0.6823883592574009, + "grad_norm": 10.756532347044482, + "learning_rate": 9.513496170830909e-07, + "logits/chosen": 0.5842273235321045, + "logits/rejected": 0.6651209592819214, + "logps/chosen": -7.88360595703125, + "logps/rejected": -8.369672775268555, + "loss": 0.6449, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -7.88360595703125, + "rewards/margins": 0.4860672950744629, + "rewards/rejected": -8.369672775268555, + "semantic_entropy": 0.005015389062464237, + "step": 1275 + }, + { + "epoch": 0.6850643920388024, + "grad_norm": 21.16131993716241, + "learning_rate": 9.506773217311382e-07, + "logits/chosen": 0.6626430153846741, + "logits/rejected": 0.7511327862739563, + "logps/chosen": -7.711289882659912, + "logps/rejected": -8.375436782836914, + "loss": 0.5492, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -7.711289882659912, + "rewards/margins": 0.6641460657119751, + "rewards/rejected": -8.375436782836914, + "semantic_entropy": 0.006019088439643383, + "step": 1280 + }, + { + "epoch": 0.687740424820204, + "grad_norm": 17.008433131854304, + "learning_rate": 9.500006539058334e-07, + "logits/chosen": 0.73247891664505, + "logits/rejected": 0.7920357584953308, + "logps/chosen": -7.982652187347412, + "logps/rejected": -8.343966484069824, + "loss": 0.6391, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -7.982652187347412, + "rewards/margins": 0.3613142967224121, + "rewards/rejected": -8.343966484069824, + "semantic_entropy": 0.004631609655916691, + "step": 1285 + }, + { + "epoch": 0.6904164576016056, + "grad_norm": 13.160928071624928, + "learning_rate": 9.493196201722109e-07, + "logits/chosen": 0.6529003977775574, + "logits/rejected": 0.7320042252540588, + "logps/chosen": -7.793301582336426, + "logps/rejected": -8.294574737548828, + "loss": 0.6074, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -7.793301582336426, + "rewards/margins": 0.5012733340263367, + "rewards/rejected": -8.294574737548828, + "semantic_entropy": 0.0051393527537584305, + "step": 1290 + }, + { + "epoch": 0.6930924903830072, + "grad_norm": 14.183716262535388, + "learning_rate": 9.486342271376628e-07, + "logits/chosen": 0.6803663969039917, + "logits/rejected": 0.6954035758972168, + "logps/chosen": -7.670355796813965, + "logps/rejected": -8.38364028930664, + "loss": 0.5344, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -7.670355796813965, + "rewards/margins": 0.7132849097251892, + "rewards/rejected": -8.38364028930664, + "semantic_entropy": 0.006493359804153442, + "step": 1295 + }, + { + "epoch": 0.6957685231644087, + "grad_norm": 14.518556999601335, + "learning_rate": 9.479444814518755e-07, + "logits/chosen": 0.7013619542121887, + "logits/rejected": 0.8164475560188293, + "logps/chosen": -7.910555839538574, + "logps/rejected": -8.657155990600586, + "loss": 0.5453, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -7.910555839538574, + "rewards/margins": 0.746599555015564, + "rewards/rejected": -8.657155990600586, + "semantic_entropy": 0.004653572104871273, + "step": 1300 + }, + { + "epoch": 0.6984445559458103, + "grad_norm": 12.861320996249733, + "learning_rate": 9.472503898067645e-07, + "logits/chosen": 0.7577477693557739, + "logits/rejected": 0.7888853549957275, + "logps/chosen": -7.878431797027588, + "logps/rejected": -8.5064058303833, + "loss": 0.5883, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -7.878431797027588, + "rewards/margins": 0.6279749870300293, + "rewards/rejected": -8.5064058303833, + "semantic_entropy": 0.004988783039152622, + "step": 1305 + }, + { + "epoch": 0.701120588727212, + "grad_norm": 14.945074518060963, + "learning_rate": 9.465519589364099e-07, + "logits/chosen": 0.8046930432319641, + "logits/rejected": 0.8452129364013672, + "logps/chosen": -7.966567039489746, + "logps/rejected": -8.664255142211914, + "loss": 0.5506, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -7.966567039489746, + "rewards/margins": 0.6976876258850098, + "rewards/rejected": -8.664255142211914, + "semantic_entropy": 0.004865294322371483, + "step": 1310 + }, + { + "epoch": 0.7037966215086134, + "grad_norm": 18.100914575781236, + "learning_rate": 9.458491956169914e-07, + "logits/chosen": 0.8275071382522583, + "logits/rejected": 0.8813290596008301, + "logps/chosen": -8.298616409301758, + "logps/rejected": -8.859460830688477, + "loss": 0.6166, + "rewards/accuracies": 0.6875, + "rewards/chosen": -8.298616409301758, + "rewards/margins": 0.5608429312705994, + "rewards/rejected": -8.859460830688477, + "semantic_entropy": 0.003682538866996765, + "step": 1315 + }, + { + "epoch": 0.706472654290015, + "grad_norm": 15.788405153142321, + "learning_rate": 9.451421066667215e-07, + "logits/chosen": 0.7420376539230347, + "logits/rejected": 0.8037020564079285, + "logps/chosen": -8.253267288208008, + "logps/rejected": -8.862098693847656, + "loss": 0.5702, + "rewards/accuracies": 0.71875, + "rewards/chosen": -8.253267288208008, + "rewards/margins": 0.6088317036628723, + "rewards/rejected": -8.862098693847656, + "semantic_entropy": 0.003497874829918146, + "step": 1320 + }, + { + "epoch": 0.7091486870714167, + "grad_norm": 20.04954090545044, + "learning_rate": 9.444306989457805e-07, + "logits/chosen": 0.8426326513290405, + "logits/rejected": 0.8867173194885254, + "logps/chosen": -8.014307022094727, + "logps/rejected": -8.588244438171387, + "loss": 0.6257, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -8.014307022094727, + "rewards/margins": 0.5739374756813049, + "rewards/rejected": -8.588244438171387, + "semantic_entropy": 0.0046114143915474415, + "step": 1325 + }, + { + "epoch": 0.7118247198528181, + "grad_norm": 16.08088418888514, + "learning_rate": 9.437149793562489e-07, + "logits/chosen": 0.8074777722358704, + "logits/rejected": 0.8401328921318054, + "logps/chosen": -7.99387264251709, + "logps/rejected": -8.621038436889648, + "loss": 0.5737, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -7.99387264251709, + "rewards/margins": 0.6271660327911377, + "rewards/rejected": -8.621038436889648, + "semantic_entropy": 0.004982014186680317, + "step": 1330 + }, + { + "epoch": 0.7145007526342197, + "grad_norm": 17.648574778030703, + "learning_rate": 9.429949548420417e-07, + "logits/chosen": 0.7622288465499878, + "logits/rejected": 0.7995889782905579, + "logps/chosen": -8.03836441040039, + "logps/rejected": -8.615036010742188, + "loss": 0.5845, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -8.03836441040039, + "rewards/margins": 0.5766717195510864, + "rewards/rejected": -8.615036010742188, + "semantic_entropy": 0.005048284772783518, + "step": 1335 + }, + { + "epoch": 0.7171767854156214, + "grad_norm": 13.251467288565097, + "learning_rate": 9.422706323888396e-07, + "logits/chosen": 0.7949849963188171, + "logits/rejected": 0.835627555847168, + "logps/chosen": -8.13330078125, + "logps/rejected": -8.713802337646484, + "loss": 0.5896, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -8.13330078125, + "rewards/margins": 0.580501139163971, + "rewards/rejected": -8.713802337646484, + "semantic_entropy": 0.00421832874417305, + "step": 1340 + }, + { + "epoch": 0.719852818197023, + "grad_norm": 12.727013846864125, + "learning_rate": 9.415420190240225e-07, + "logits/chosen": 0.8075268864631653, + "logits/rejected": 0.8839607238769531, + "logps/chosen": -8.259916305541992, + "logps/rejected": -9.057500839233398, + "loss": 0.4887, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -8.259916305541992, + "rewards/margins": 0.7975843548774719, + "rewards/rejected": -9.057500839233398, + "semantic_entropy": 0.003385394811630249, + "step": 1345 + }, + { + "epoch": 0.7225288509784245, + "grad_norm": 19.33189686664916, + "learning_rate": 9.408091218166002e-07, + "logits/chosen": 0.7991722226142883, + "logits/rejected": 0.829816997051239, + "logps/chosen": -8.155218124389648, + "logps/rejected": -8.655525207519531, + "loss": 0.6021, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -8.155218124389648, + "rewards/margins": 0.5003066062927246, + "rewards/rejected": -8.655525207519531, + "semantic_entropy": 0.004503914155066013, + "step": 1350 + }, + { + "epoch": 0.7252048837598261, + "grad_norm": 21.351677483542304, + "learning_rate": 9.400719478771449e-07, + "logits/chosen": 0.7611302137374878, + "logits/rejected": 0.8729363679885864, + "logps/chosen": -8.371480941772461, + "logps/rejected": -8.982285499572754, + "loss": 0.5771, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -8.371480941772461, + "rewards/margins": 0.6108050346374512, + "rewards/rejected": -8.982285499572754, + "semantic_entropy": 0.004243707284331322, + "step": 1355 + }, + { + "epoch": 0.7278809165412277, + "grad_norm": 15.476926041346282, + "learning_rate": 9.393305043577209e-07, + "logits/chosen": 0.7315706610679626, + "logits/rejected": 0.7855316400527954, + "logps/chosen": -8.193710327148438, + "logps/rejected": -9.018750190734863, + "loss": 0.5218, + "rewards/accuracies": 0.71875, + "rewards/chosen": -8.193710327148438, + "rewards/margins": 0.8250393867492676, + "rewards/rejected": -9.018750190734863, + "semantic_entropy": 0.004040227737277746, + "step": 1360 + }, + { + "epoch": 0.7305569493226292, + "grad_norm": 11.327037956728795, + "learning_rate": 9.38584798451817e-07, + "logits/chosen": 0.6972507238388062, + "logits/rejected": 0.7701762318611145, + "logps/chosen": -8.05665397644043, + "logps/rejected": -8.655900955200195, + "loss": 0.5916, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -8.05665397644043, + "rewards/margins": 0.5992475748062134, + "rewards/rejected": -8.655900955200195, + "semantic_entropy": 0.0046203965321183205, + "step": 1365 + }, + { + "epoch": 0.7332329821040308, + "grad_norm": 26.290588474950017, + "learning_rate": 9.37834837394275e-07, + "logits/chosen": 0.6830715537071228, + "logits/rejected": 0.7537237405776978, + "logps/chosen": -8.189208030700684, + "logps/rejected": -9.016082763671875, + "loss": 0.5602, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -8.189208030700684, + "rewards/margins": 0.8268746137619019, + "rewards/rejected": -9.016082763671875, + "semantic_entropy": 0.004340589977800846, + "step": 1370 + }, + { + "epoch": 0.7359090148854324, + "grad_norm": 14.319339627993806, + "learning_rate": 9.370806284612203e-07, + "logits/chosen": 0.6698434352874756, + "logits/rejected": 0.7308276295661926, + "logps/chosen": -8.377470016479492, + "logps/rejected": -9.087924003601074, + "loss": 0.53, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -8.377470016479492, + "rewards/margins": 0.7104541063308716, + "rewards/rejected": -9.087924003601074, + "semantic_entropy": 0.0037077039014548063, + "step": 1375 + }, + { + "epoch": 0.738585047666834, + "grad_norm": 14.5676067902578, + "learning_rate": 9.363221789699912e-07, + "logits/chosen": 0.6635018587112427, + "logits/rejected": 0.7242141962051392, + "logps/chosen": -8.505064010620117, + "logps/rejected": -9.010017395019531, + "loss": 0.6233, + "rewards/accuracies": 0.65625, + "rewards/chosen": -8.505064010620117, + "rewards/margins": 0.5049545168876648, + "rewards/rejected": -9.010017395019531, + "semantic_entropy": 0.003181255189701915, + "step": 1380 + }, + { + "epoch": 0.7412610804482355, + "grad_norm": 16.866071551135633, + "learning_rate": 9.355594962790682e-07, + "logits/chosen": 0.6800563335418701, + "logits/rejected": 0.7338107228279114, + "logps/chosen": -8.420328140258789, + "logps/rejected": -9.087446212768555, + "loss": 0.5497, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -8.420328140258789, + "rewards/margins": 0.6671197414398193, + "rewards/rejected": -9.087446212768555, + "semantic_entropy": 0.0032719075679779053, + "step": 1385 + }, + { + "epoch": 0.7439371132296371, + "grad_norm": 15.614567180768711, + "learning_rate": 9.34792587788002e-07, + "logits/chosen": 0.737398624420166, + "logits/rejected": 0.7930010557174683, + "logps/chosen": -8.552255630493164, + "logps/rejected": -9.13463020324707, + "loss": 0.5937, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -8.552255630493164, + "rewards/margins": 0.5823749899864197, + "rewards/rejected": -9.13463020324707, + "semantic_entropy": 0.002839865395799279, + "step": 1390 + }, + { + "epoch": 0.7466131460110387, + "grad_norm": 17.35671690242798, + "learning_rate": 9.34021460937342e-07, + "logits/chosen": 0.7323909997940063, + "logits/rejected": 0.7694789171218872, + "logps/chosen": -8.701313972473145, + "logps/rejected": -9.22875690460205, + "loss": 0.5945, + "rewards/accuracies": 0.6875, + "rewards/chosen": -8.701313972473145, + "rewards/margins": 0.5274431109428406, + "rewards/rejected": -9.22875690460205, + "semantic_entropy": 0.002377058146521449, + "step": 1395 + }, + { + "epoch": 0.7492891787924402, + "grad_norm": 10.711809648309112, + "learning_rate": 9.332461232085646e-07, + "logits/chosen": 0.6817182302474976, + "logits/rejected": 0.7388890385627747, + "logps/chosen": -8.783699989318848, + "logps/rejected": -9.290229797363281, + "loss": 0.5884, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -8.783699989318848, + "rewards/margins": 0.5065295696258545, + "rewards/rejected": -9.290229797363281, + "semantic_entropy": 0.0022387620992958546, + "step": 1400 + }, + { + "epoch": 0.7519652115738418, + "grad_norm": 15.614698518847275, + "learning_rate": 9.324665821239998e-07, + "logits/chosen": 0.6605618000030518, + "logits/rejected": 0.7475873827934265, + "logps/chosen": -8.55317497253418, + "logps/rejected": -9.264518737792969, + "loss": 0.5843, + "rewards/accuracies": 0.6875, + "rewards/chosen": -8.55317497253418, + "rewards/margins": 0.7113439440727234, + "rewards/rejected": -9.264518737792969, + "semantic_entropy": 0.002737089293077588, + "step": 1405 + }, + { + "epoch": 0.7546412443552434, + "grad_norm": 13.033534778573415, + "learning_rate": 9.316828452467583e-07, + "logits/chosen": 0.6980472803115845, + "logits/rejected": 0.7700284719467163, + "logps/chosen": -8.742466926574707, + "logps/rejected": -9.366756439208984, + "loss": 0.5443, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -8.742466926574707, + "rewards/margins": 0.6242889165878296, + "rewards/rejected": -9.366756439208984, + "semantic_entropy": 0.0023838577326387167, + "step": 1410 + }, + { + "epoch": 0.7573172771366449, + "grad_norm": 38.89935633989603, + "learning_rate": 9.30894920180659e-07, + "logits/chosen": 0.7200860977172852, + "logits/rejected": 0.7739099264144897, + "logps/chosen": -8.618478775024414, + "logps/rejected": -9.061718940734863, + "loss": 0.6104, + "rewards/accuracies": 0.65625, + "rewards/chosen": -8.618478775024414, + "rewards/margins": 0.4432406425476074, + "rewards/rejected": -9.061718940734863, + "semantic_entropy": 0.002750970423221588, + "step": 1415 + }, + { + "epoch": 0.7599933099180465, + "grad_norm": 12.996110838480991, + "learning_rate": 9.301028145701543e-07, + "logits/chosen": 0.6858905553817749, + "logits/rejected": 0.7582074403762817, + "logps/chosen": -8.569108963012695, + "logps/rejected": -9.244118690490723, + "loss": 0.5774, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -8.569108963012695, + "rewards/margins": 0.6750102043151855, + "rewards/rejected": -9.244118690490723, + "semantic_entropy": 0.0030036987736821175, + "step": 1420 + }, + { + "epoch": 0.7626693426994481, + "grad_norm": 11.513246333141913, + "learning_rate": 9.293065361002563e-07, + "logits/chosen": 0.6833176612854004, + "logits/rejected": 0.7447593212127686, + "logps/chosen": -8.49552059173584, + "logps/rejected": -9.083105087280273, + "loss": 0.5847, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -8.49552059173584, + "rewards/margins": 0.5875846147537231, + "rewards/rejected": -9.083105087280273, + "semantic_entropy": 0.003222426865249872, + "step": 1425 + }, + { + "epoch": 0.7653453754808497, + "grad_norm": 17.050277712672678, + "learning_rate": 9.285060924964622e-07, + "logits/chosen": 0.6484526991844177, + "logits/rejected": 0.7113555669784546, + "logps/chosen": -8.497220993041992, + "logps/rejected": -9.029525756835938, + "loss": 0.5907, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -8.497220993041992, + "rewards/margins": 0.5323046445846558, + "rewards/rejected": -9.029525756835938, + "semantic_entropy": 0.002952256705611944, + "step": 1430 + }, + { + "epoch": 0.7680214082622512, + "grad_norm": 14.721214698389861, + "learning_rate": 9.277014915246792e-07, + "logits/chosen": 0.7316317558288574, + "logits/rejected": 0.7572312355041504, + "logps/chosen": -8.28238296508789, + "logps/rejected": -8.961918830871582, + "loss": 0.5423, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -8.28238296508789, + "rewards/margins": 0.6795355677604675, + "rewards/rejected": -8.961918830871582, + "semantic_entropy": 0.004207999911159277, + "step": 1435 + }, + { + "epoch": 0.7706974410436528, + "grad_norm": 12.207483167169574, + "learning_rate": 9.268927409911498e-07, + "logits/chosen": 0.703294038772583, + "logits/rejected": 0.7658201456069946, + "logps/chosen": -8.169378280639648, + "logps/rejected": -8.818994522094727, + "loss": 0.5517, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -8.169378280639648, + "rewards/margins": 0.6496168971061707, + "rewards/rejected": -8.818994522094727, + "semantic_entropy": 0.0045172530226409435, + "step": 1440 + }, + { + "epoch": 0.7733734738250544, + "grad_norm": 31.132649463038923, + "learning_rate": 9.260798487423749e-07, + "logits/chosen": 0.6745550036430359, + "logits/rejected": 0.7686847448348999, + "logps/chosen": -8.215496063232422, + "logps/rejected": -8.745055198669434, + "loss": 0.5833, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -8.215496063232422, + "rewards/margins": 0.5295597910881042, + "rewards/rejected": -8.745055198669434, + "semantic_entropy": 0.004211473278701305, + "step": 1445 + }, + { + "epoch": 0.7760495066064559, + "grad_norm": 20.17196061549839, + "learning_rate": 9.252628226650389e-07, + "logits/chosen": 0.68059903383255, + "logits/rejected": 0.7309106588363647, + "logps/chosen": -8.216412544250488, + "logps/rejected": -8.789416313171387, + "loss": 0.5969, + "rewards/accuracies": 0.65625, + "rewards/chosen": -8.216412544250488, + "rewards/margins": 0.5730043649673462, + "rewards/rejected": -8.789416313171387, + "semantic_entropy": 0.004476086236536503, + "step": 1450 + }, + { + "epoch": 0.7787255393878575, + "grad_norm": 17.688599393649028, + "learning_rate": 9.244416706859321e-07, + "logits/chosen": 0.6764446496963501, + "logits/rejected": 0.74993497133255, + "logps/chosen": -8.021484375, + "logps/rejected": -8.647329330444336, + "loss": 0.5842, + "rewards/accuracies": 0.71875, + "rewards/chosen": -8.021484375, + "rewards/margins": 0.6258445978164673, + "rewards/rejected": -8.647329330444336, + "semantic_entropy": 0.005138213746249676, + "step": 1455 + }, + { + "epoch": 0.7814015721692591, + "grad_norm": 14.905907733509586, + "learning_rate": 9.23616400771875e-07, + "logits/chosen": 0.6466782689094543, + "logits/rejected": 0.7279826402664185, + "logps/chosen": -7.967951774597168, + "logps/rejected": -8.646588325500488, + "loss": 0.5654, + "rewards/accuracies": 0.71875, + "rewards/chosen": -7.967951774597168, + "rewards/margins": 0.6786371469497681, + "rewards/rejected": -8.646588325500488, + "semantic_entropy": 0.004834360908716917, + "step": 1460 + }, + { + "epoch": 0.7840776049506607, + "grad_norm": 11.247618506337135, + "learning_rate": 9.227870209296395e-07, + "logits/chosen": 0.6892117857933044, + "logits/rejected": 0.7534765601158142, + "logps/chosen": -8.057080268859863, + "logps/rejected": -8.565564155578613, + "loss": 0.6136, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -8.057080268859863, + "rewards/margins": 0.5084843039512634, + "rewards/rejected": -8.565564155578613, + "semantic_entropy": 0.00426045898348093, + "step": 1465 + }, + { + "epoch": 0.7867536377320622, + "grad_norm": 13.37630930170591, + "learning_rate": 9.219535392058728e-07, + "logits/chosen": 0.6549677848815918, + "logits/rejected": 0.6734327077865601, + "logps/chosen": -8.016042709350586, + "logps/rejected": -8.552094459533691, + "loss": 0.6187, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -8.016042709350586, + "rewards/margins": 0.5360512137413025, + "rewards/rejected": -8.552094459533691, + "semantic_entropy": 0.005072770640254021, + "step": 1470 + }, + { + "epoch": 0.7894296705134638, + "grad_norm": 13.216764994602103, + "learning_rate": 9.211159636870181e-07, + "logits/chosen": 0.717325747013092, + "logits/rejected": 0.8044508695602417, + "logps/chosen": -8.261899948120117, + "logps/rejected": -8.891412734985352, + "loss": 0.5755, + "rewards/accuracies": 0.6875, + "rewards/chosen": -8.261899948120117, + "rewards/margins": 0.6295128464698792, + "rewards/rejected": -8.891412734985352, + "semantic_entropy": 0.0033580393064767122, + "step": 1475 + }, + { + "epoch": 0.7921057032948654, + "grad_norm": 15.240573495892372, + "learning_rate": 9.202743024992367e-07, + "logits/chosen": 0.8391082882881165, + "logits/rejected": 0.8751834034919739, + "logps/chosen": -8.093847274780273, + "logps/rejected": -8.829301834106445, + "loss": 0.5484, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -8.093847274780273, + "rewards/margins": 0.7354532480239868, + "rewards/rejected": -8.829301834106445, + "semantic_entropy": 0.004077838733792305, + "step": 1480 + }, + { + "epoch": 0.7947817360762669, + "grad_norm": 14.604017721504034, + "learning_rate": 9.194285638083293e-07, + "logits/chosen": 0.8139607310295105, + "logits/rejected": 0.8734992742538452, + "logps/chosen": -8.397181510925293, + "logps/rejected": -9.134657859802246, + "loss": 0.5343, + "rewards/accuracies": 0.71875, + "rewards/chosen": -8.397181510925293, + "rewards/margins": 0.7374764680862427, + "rewards/rejected": -9.134657859802246, + "semantic_entropy": 0.0033265065867453814, + "step": 1485 + }, + { + "epoch": 0.7974577688576685, + "grad_norm": 17.08531394574266, + "learning_rate": 9.185787558196562e-07, + "logits/chosen": 0.8408036231994629, + "logits/rejected": 0.882840633392334, + "logps/chosen": -8.239664077758789, + "logps/rejected": -8.969578742980957, + "loss": 0.5681, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -8.239664077758789, + "rewards/margins": 0.7299133539199829, + "rewards/rejected": -8.969578742980957, + "semantic_entropy": 0.0042380583472549915, + "step": 1490 + }, + { + "epoch": 0.8001338016390701, + "grad_norm": 14.477147607243182, + "learning_rate": 9.177248867780583e-07, + "logits/chosen": 0.8981844186782837, + "logits/rejected": 0.9408555030822754, + "logps/chosen": -8.414166450500488, + "logps/rejected": -8.8630952835083, + "loss": 0.6356, + "rewards/accuracies": 0.65625, + "rewards/chosen": -8.414166450500488, + "rewards/margins": 0.44892817735671997, + "rewards/rejected": -8.8630952835083, + "semantic_entropy": 0.003574197646230459, + "step": 1495 + }, + { + "epoch": 0.8028098344204716, + "grad_norm": 13.661227346120718, + "learning_rate": 9.168669649677769e-07, + "logits/chosen": 0.8391574621200562, + "logits/rejected": 0.8985759019851685, + "logps/chosen": -8.088191032409668, + "logps/rejected": -8.639988899230957, + "loss": 0.6163, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -8.088191032409668, + "rewards/margins": 0.5517988801002502, + "rewards/rejected": -8.639988899230957, + "semantic_entropy": 0.00466513354331255, + "step": 1500 + }, + { + "epoch": 0.8054858672018732, + "grad_norm": 14.061529510910384, + "learning_rate": 9.16004998712373e-07, + "logits/chosen": 0.8700096011161804, + "logits/rejected": 0.9046772718429565, + "logps/chosen": -8.150907516479492, + "logps/rejected": -8.664621353149414, + "loss": 0.6186, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -8.150907516479492, + "rewards/margins": 0.5137127637863159, + "rewards/rejected": -8.664621353149414, + "semantic_entropy": 0.004039828199893236, + "step": 1505 + }, + { + "epoch": 0.8081618999832748, + "grad_norm": 12.998907380952517, + "learning_rate": 9.151389963746472e-07, + "logits/chosen": 0.8303213119506836, + "logits/rejected": 0.9604493379592896, + "logps/chosen": -8.156909942626953, + "logps/rejected": -8.906213760375977, + "loss": 0.5157, + "rewards/accuracies": 0.78125, + "rewards/chosen": -8.156909942626953, + "rewards/margins": 0.7493036985397339, + "rewards/rejected": -8.906213760375977, + "semantic_entropy": 0.004079463891685009, + "step": 1510 + }, + { + "epoch": 0.8108379327646764, + "grad_norm": 11.673455076007292, + "learning_rate": 9.142689663565577e-07, + "logits/chosen": 0.8863071203231812, + "logits/rejected": 0.9234801530838013, + "logps/chosen": -8.100628852844238, + "logps/rejected": -8.736692428588867, + "loss": 0.5499, + "rewards/accuracies": 0.6875, + "rewards/chosen": -8.100628852844238, + "rewards/margins": 0.6360650062561035, + "rewards/rejected": -8.736692428588867, + "semantic_entropy": 0.0043214112520217896, + "step": 1515 + }, + { + "epoch": 0.8135139655460779, + "grad_norm": 14.906715664230957, + "learning_rate": 9.133949170991397e-07, + "logits/chosen": 0.8381370306015015, + "logits/rejected": 0.8832274675369263, + "logps/chosen": -8.123575210571289, + "logps/rejected": -8.772272109985352, + "loss": 0.5631, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -8.123575210571289, + "rewards/margins": 0.6486952900886536, + "rewards/rejected": -8.772272109985352, + "semantic_entropy": 0.003872636239975691, + "step": 1520 + }, + { + "epoch": 0.8161899983274795, + "grad_norm": 13.00308584021274, + "learning_rate": 9.125168570824231e-07, + "logits/chosen": 0.826133131980896, + "logits/rejected": 0.8961697816848755, + "logps/chosen": -8.128072738647461, + "logps/rejected": -8.749670028686523, + "loss": 0.5707, + "rewards/accuracies": 0.65625, + "rewards/chosen": -8.128072738647461, + "rewards/margins": 0.621599555015564, + "rewards/rejected": -8.749670028686523, + "semantic_entropy": 0.00478363037109375, + "step": 1525 + }, + { + "epoch": 0.8188660311088811, + "grad_norm": 22.234033697271784, + "learning_rate": 9.116347948253496e-07, + "logits/chosen": 0.7835792303085327, + "logits/rejected": 0.8497790098190308, + "logps/chosen": -8.275663375854492, + "logps/rejected": -8.82735538482666, + "loss": 0.5884, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -8.275663375854492, + "rewards/margins": 0.5516918301582336, + "rewards/rejected": -8.82735538482666, + "semantic_entropy": 0.003584084566682577, + "step": 1530 + }, + { + "epoch": 0.8215420638902826, + "grad_norm": 13.347377014963383, + "learning_rate": 9.107487388856916e-07, + "logits/chosen": 0.7705615758895874, + "logits/rejected": 0.8761787414550781, + "logps/chosen": -8.147015571594238, + "logps/rejected": -8.853046417236328, + "loss": 0.5173, + "rewards/accuracies": 0.78125, + "rewards/chosen": -8.147015571594238, + "rewards/margins": 0.7060302495956421, + "rewards/rejected": -8.853046417236328, + "semantic_entropy": 0.004314957652240992, + "step": 1535 + }, + { + "epoch": 0.8242180966716842, + "grad_norm": 18.743981589501008, + "learning_rate": 9.098586978599673e-07, + "logits/chosen": 0.7425702214241028, + "logits/rejected": 0.8426684141159058, + "logps/chosen": -8.143719673156738, + "logps/rejected": -8.96298599243164, + "loss": 0.5714, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -8.143719673156738, + "rewards/margins": 0.8192659616470337, + "rewards/rejected": -8.96298599243164, + "semantic_entropy": 0.00467184092849493, + "step": 1540 + }, + { + "epoch": 0.8268941294530858, + "grad_norm": 17.857816830477596, + "learning_rate": 9.089646803833588e-07, + "logits/chosen": 0.6718012094497681, + "logits/rejected": 0.7780871987342834, + "logps/chosen": -8.064419746398926, + "logps/rejected": -8.771439552307129, + "loss": 0.5654, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -8.064419746398926, + "rewards/margins": 0.7070209383964539, + "rewards/rejected": -8.771439552307129, + "semantic_entropy": 0.004769052378833294, + "step": 1545 + }, + { + "epoch": 0.8295701622344873, + "grad_norm": 15.883695166387948, + "learning_rate": 9.080666951296276e-07, + "logits/chosen": 0.523202121257782, + "logits/rejected": 0.7221616506576538, + "logps/chosen": -7.929041385650635, + "logps/rejected": -8.971317291259766, + "loss": 0.4605, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -7.929041385650635, + "rewards/margins": 1.0422756671905518, + "rewards/rejected": -8.971317291259766, + "semantic_entropy": 0.0057184770703315735, + "step": 1550 + }, + { + "epoch": 0.8322461950158889, + "grad_norm": 11.62870813642443, + "learning_rate": 9.071647508110305e-07, + "logits/chosen": 0.5561312437057495, + "logits/rejected": 0.7267721891403198, + "logps/chosen": -7.7915754318237305, + "logps/rejected": -8.73208999633789, + "loss": 0.524, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -7.7915754318237305, + "rewards/margins": 0.9405128359794617, + "rewards/rejected": -8.73208999633789, + "semantic_entropy": 0.005991402082145214, + "step": 1555 + }, + { + "epoch": 0.8349222277972905, + "grad_norm": 12.493629792798085, + "learning_rate": 9.062588561782354e-07, + "logits/chosen": 0.6039088368415833, + "logits/rejected": 0.6618218421936035, + "logps/chosen": -8.060002326965332, + "logps/rejected": -8.698019981384277, + "loss": 0.5877, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -8.060002326965332, + "rewards/margins": 0.6380175352096558, + "rewards/rejected": -8.698019981384277, + "semantic_entropy": 0.004686708562076092, + "step": 1560 + }, + { + "epoch": 0.8375982605786921, + "grad_norm": 11.295471536198134, + "learning_rate": 9.053490200202358e-07, + "logits/chosen": 0.7054456472396851, + "logits/rejected": 0.763306736946106, + "logps/chosen": -8.17889404296875, + "logps/rejected": -8.809242248535156, + "loss": 0.5912, + "rewards/accuracies": 0.6875, + "rewards/chosen": -8.17889404296875, + "rewards/margins": 0.6303480863571167, + "rewards/rejected": -8.809242248535156, + "semantic_entropy": 0.004566199611872435, + "step": 1565 + }, + { + "epoch": 0.8402742933600936, + "grad_norm": 18.547048805065355, + "learning_rate": 9.044352511642661e-07, + "logits/chosen": 0.7248358726501465, + "logits/rejected": 0.7658167481422424, + "logps/chosen": -8.291397094726562, + "logps/rejected": -8.84311294555664, + "loss": 0.6214, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -8.291397094726562, + "rewards/margins": 0.5517162680625916, + "rewards/rejected": -8.84311294555664, + "semantic_entropy": 0.003865548875182867, + "step": 1570 + }, + { + "epoch": 0.8429503261414952, + "grad_norm": 12.961728802829933, + "learning_rate": 9.03517558475716e-07, + "logits/chosen": 0.721314549446106, + "logits/rejected": 0.7908953428268433, + "logps/chosen": -8.195411682128906, + "logps/rejected": -8.733312606811523, + "loss": 0.5662, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -8.195411682128906, + "rewards/margins": 0.5379008650779724, + "rewards/rejected": -8.733312606811523, + "semantic_entropy": 0.004040508531033993, + "step": 1575 + }, + { + "epoch": 0.8456263589228968, + "grad_norm": 13.388709273345874, + "learning_rate": 9.025959508580436e-07, + "logits/chosen": 0.7946035861968994, + "logits/rejected": 0.9081939458847046, + "logps/chosen": -8.505581855773926, + "logps/rejected": -9.189567565917969, + "loss": 0.5373, + "rewards/accuracies": 0.71875, + "rewards/chosen": -8.505581855773926, + "rewards/margins": 0.6839855313301086, + "rewards/rejected": -9.189567565917969, + "semantic_entropy": 0.003315441310405731, + "step": 1580 + }, + { + "epoch": 0.8483023917042983, + "grad_norm": 13.04328623863774, + "learning_rate": 9.016704372526905e-07, + "logits/chosen": 0.7168788313865662, + "logits/rejected": 0.8062397837638855, + "logps/chosen": -8.312705039978027, + "logps/rejected": -8.956674575805664, + "loss": 0.5598, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -8.312705039978027, + "rewards/margins": 0.6439692378044128, + "rewards/rejected": -8.956674575805664, + "semantic_entropy": 0.004072139970958233, + "step": 1585 + }, + { + "epoch": 0.8509784244856999, + "grad_norm": 19.229092305162187, + "learning_rate": 9.007410266389934e-07, + "logits/chosen": 0.6322071552276611, + "logits/rejected": 0.6842302680015564, + "logps/chosen": -8.209632873535156, + "logps/rejected": -8.76137638092041, + "loss": 0.5783, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -8.209632873535156, + "rewards/margins": 0.5517433881759644, + "rewards/rejected": -8.76137638092041, + "semantic_entropy": 0.003791673108935356, + "step": 1590 + }, + { + "epoch": 0.8536544572671015, + "grad_norm": 15.721412896614648, + "learning_rate": 8.998077280340981e-07, + "logits/chosen": 0.6889594793319702, + "logits/rejected": 0.7277365922927856, + "logps/chosen": -8.424080848693848, + "logps/rejected": -8.998977661132812, + "loss": 0.5645, + "rewards/accuracies": 0.6875, + "rewards/chosen": -8.424080848693848, + "rewards/margins": 0.5748964548110962, + "rewards/rejected": -8.998977661132812, + "semantic_entropy": 0.0032078386284410954, + "step": 1595 + }, + { + "epoch": 0.8563304900485031, + "grad_norm": 12.71386552957631, + "learning_rate": 8.988705504928722e-07, + "logits/chosen": 0.6700653433799744, + "logits/rejected": 0.7809063196182251, + "logps/chosen": -8.39610481262207, + "logps/rejected": -9.334978103637695, + "loss": 0.4796, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -8.39610481262207, + "rewards/margins": 0.938875675201416, + "rewards/rejected": -9.334978103637695, + "semantic_entropy": 0.00389484572224319, + "step": 1600 + }, + { + "epoch": 0.8563304900485031, + "eval_logits/chosen": 0.8517152070999146, + "eval_logits/rejected": 0.9157667756080627, + "eval_logps/chosen": -8.478996276855469, + "eval_logps/rejected": -9.19737434387207, + "eval_loss": 0.5405778884887695, + "eval_rewards/accuracies": 0.7047477960586548, + "eval_rewards/chosen": -8.478996276855469, + "eval_rewards/margins": 0.7183785438537598, + "eval_rewards/rejected": -9.19737434387207, + "eval_runtime": 35.1436, + "eval_samples_per_second": 38.272, + "eval_semantic_entropy": 0.0034910058602690697, + "eval_steps_per_second": 9.589, + "step": 1600 + }, + { + "epoch": 0.8590065228299046, + "grad_norm": 15.698469595260914, + "learning_rate": 8.979295031078157e-07, + "logits/chosen": 0.6854676008224487, + "logits/rejected": 0.8206149935722351, + "logps/chosen": -8.568761825561523, + "logps/rejected": -9.31121826171875, + "loss": 0.5156, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -8.568761825561523, + "rewards/margins": 0.742457389831543, + "rewards/rejected": -9.31121826171875, + "semantic_entropy": 0.003175111021846533, + "step": 1605 + }, + { + "epoch": 0.8616825556113062, + "grad_norm": 13.477417972760096, + "learning_rate": 8.969845950089751e-07, + "logits/chosen": 0.699101448059082, + "logits/rejected": 0.8196004033088684, + "logps/chosen": -8.327180862426758, + "logps/rejected": -9.16191577911377, + "loss": 0.51, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -8.327180862426758, + "rewards/margins": 0.8347347974777222, + "rewards/rejected": -9.16191577911377, + "semantic_entropy": 0.0040628439746797085, + "step": 1610 + }, + { + "epoch": 0.8643585883927078, + "grad_norm": 20.385879298528565, + "learning_rate": 8.960358353638526e-07, + "logits/chosen": 0.7844869494438171, + "logits/rejected": 0.8709976077079773, + "logps/chosen": -8.346048355102539, + "logps/rejected": -9.043893814086914, + "loss": 0.5844, + "rewards/accuracies": 0.6875, + "rewards/chosen": -8.346048355102539, + "rewards/margins": 0.6978455781936646, + "rewards/rejected": -9.043893814086914, + "semantic_entropy": 0.004238657653331757, + "step": 1615 + }, + { + "epoch": 0.8670346211741093, + "grad_norm": 16.839936129408876, + "learning_rate": 8.950832333773184e-07, + "logits/chosen": 0.8071925044059753, + "logits/rejected": 0.9016444087028503, + "logps/chosen": -8.5030517578125, + "logps/rejected": -9.184589385986328, + "loss": 0.5976, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -8.5030517578125, + "rewards/margins": 0.6815375685691833, + "rewards/rejected": -9.184589385986328, + "semantic_entropy": 0.0040657538920640945, + "step": 1620 + }, + { + "epoch": 0.869710653955511, + "grad_norm": 16.82819858205259, + "learning_rate": 8.941267982915213e-07, + "logits/chosen": 0.8722259402275085, + "logits/rejected": 0.9098442196846008, + "logps/chosen": -8.707246780395508, + "logps/rejected": -9.02901554107666, + "loss": 0.7021, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -8.707246780395508, + "rewards/margins": 0.32176870107650757, + "rewards/rejected": -9.02901554107666, + "semantic_entropy": 0.0030822004191577435, + "step": 1625 + }, + { + "epoch": 0.8723866867369126, + "grad_norm": 12.54191217498305, + "learning_rate": 8.931665393857983e-07, + "logits/chosen": 0.853954017162323, + "logits/rejected": 0.9320189356803894, + "logps/chosen": -8.5809965133667, + "logps/rejected": -9.205659866333008, + "loss": 0.5758, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -8.5809965133667, + "rewards/margins": 0.6246632933616638, + "rewards/rejected": -9.205659866333008, + "semantic_entropy": 0.002861475106328726, + "step": 1630 + }, + { + "epoch": 0.875062719518314, + "grad_norm": 13.279557320703779, + "learning_rate": 8.922024659765861e-07, + "logits/chosen": 0.830333411693573, + "logits/rejected": 0.9043375849723816, + "logps/chosen": -8.427899360656738, + "logps/rejected": -9.160634994506836, + "loss": 0.5253, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -8.427899360656738, + "rewards/margins": 0.7327350378036499, + "rewards/rejected": -9.160634994506836, + "semantic_entropy": 0.0032138600945472717, + "step": 1635 + }, + { + "epoch": 0.8777387522997157, + "grad_norm": 19.39467151368381, + "learning_rate": 8.912345874173288e-07, + "logits/chosen": 0.8193842172622681, + "logits/rejected": 0.8876082301139832, + "logps/chosen": -8.680830001831055, + "logps/rejected": -9.321008682250977, + "loss": 0.5834, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -8.680830001831055, + "rewards/margins": 0.6401779651641846, + "rewards/rejected": -9.321008682250977, + "semantic_entropy": 0.0026057157665491104, + "step": 1640 + }, + { + "epoch": 0.8804147850811173, + "grad_norm": 15.252904015477414, + "learning_rate": 8.902629130983885e-07, + "logits/chosen": 0.8152815103530884, + "logits/rejected": 0.8414192199707031, + "logps/chosen": -8.817136764526367, + "logps/rejected": -9.296814918518066, + "loss": 0.6108, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -8.817136764526367, + "rewards/margins": 0.4796779751777649, + "rewards/rejected": -9.296814918518066, + "semantic_entropy": 0.002496039029210806, + "step": 1645 + }, + { + "epoch": 0.8830908178625189, + "grad_norm": 17.953373871726004, + "learning_rate": 8.892874524469537e-07, + "logits/chosen": 0.8935707211494446, + "logits/rejected": 0.9353858232498169, + "logps/chosen": -8.69524097442627, + "logps/rejected": -9.359209060668945, + "loss": 0.5249, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -8.69524097442627, + "rewards/margins": 0.6639670133590698, + "rewards/rejected": -9.359209060668945, + "semantic_entropy": 0.002959498204290867, + "step": 1650 + }, + { + "epoch": 0.8857668506439204, + "grad_norm": 15.625644253516462, + "learning_rate": 8.883082149269478e-07, + "logits/chosen": 0.8291314840316772, + "logits/rejected": 0.8965352177619934, + "logps/chosen": -8.816374778747559, + "logps/rejected": -9.487445831298828, + "loss": 0.5349, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -8.816374778747559, + "rewards/margins": 0.6710702180862427, + "rewards/rejected": -9.487445831298828, + "semantic_entropy": 0.0024764954578131437, + "step": 1655 + }, + { + "epoch": 0.888442883425322, + "grad_norm": 15.866078991034088, + "learning_rate": 8.873252100389377e-07, + "logits/chosen": 0.8331910371780396, + "logits/rejected": 0.8664076924324036, + "logps/chosen": -8.80284595489502, + "logps/rejected": -9.485407829284668, + "loss": 0.5391, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -8.80284595489502, + "rewards/margins": 0.6825627088546753, + "rewards/rejected": -9.485407829284668, + "semantic_entropy": 0.002537056338042021, + "step": 1660 + }, + { + "epoch": 0.8911189162067236, + "grad_norm": 18.091028298007316, + "learning_rate": 8.863384473200411e-07, + "logits/chosen": 0.8735591769218445, + "logits/rejected": 0.8835130929946899, + "logps/chosen": -9.021527290344238, + "logps/rejected": -9.55382251739502, + "loss": 0.5901, + "rewards/accuracies": 0.6875, + "rewards/chosen": -9.021527290344238, + "rewards/margins": 0.532294750213623, + "rewards/rejected": -9.55382251739502, + "semantic_entropy": 0.0023630578070878983, + "step": 1665 + }, + { + "epoch": 0.8937949489881251, + "grad_norm": 15.178387048780932, + "learning_rate": 8.853479363438342e-07, + "logits/chosen": 0.8976732492446899, + "logits/rejected": 0.9629983901977539, + "logps/chosen": -9.05078411102295, + "logps/rejected": -9.503996849060059, + "loss": 0.63, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -9.05078411102295, + "rewards/margins": 0.45321202278137207, + "rewards/rejected": -9.503996849060059, + "semantic_entropy": 0.0022809661459177732, + "step": 1670 + }, + { + "epoch": 0.8964709817695267, + "grad_norm": 16.07101375873566, + "learning_rate": 8.843536867202588e-07, + "logits/chosen": 0.8819114565849304, + "logits/rejected": 0.9694005250930786, + "logps/chosen": -8.932337760925293, + "logps/rejected": -9.645790100097656, + "loss": 0.5404, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -8.932337760925293, + "rewards/margins": 0.7134513258934021, + "rewards/rejected": -9.645790100097656, + "semantic_entropy": 0.002526444150134921, + "step": 1675 + }, + { + "epoch": 0.8991470145509283, + "grad_norm": 22.308829335036855, + "learning_rate": 8.833557080955292e-07, + "logits/chosen": 0.8551505208015442, + "logits/rejected": 0.8981190919876099, + "logps/chosen": -8.808084487915039, + "logps/rejected": -9.255620956420898, + "loss": 0.644, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -8.808084487915039, + "rewards/margins": 0.4475362300872803, + "rewards/rejected": -9.255620956420898, + "semantic_entropy": 0.0026800683699548244, + "step": 1680 + }, + { + "epoch": 0.9018230473323299, + "grad_norm": 20.755871823674013, + "learning_rate": 8.823540101520381e-07, + "logits/chosen": 0.8553838729858398, + "logits/rejected": 0.9643619656562805, + "logps/chosen": -8.639958381652832, + "logps/rejected": -9.357548713684082, + "loss": 0.5769, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -8.639958381652832, + "rewards/margins": 0.7175900340080261, + "rewards/rejected": -9.357548713684082, + "semantic_entropy": 0.002625108230859041, + "step": 1685 + }, + { + "epoch": 0.9044990801137314, + "grad_norm": 17.716986972701864, + "learning_rate": 8.813486026082637e-07, + "logits/chosen": 0.8817728161811829, + "logits/rejected": 0.9899671673774719, + "logps/chosen": -8.551309585571289, + "logps/rejected": -9.368635177612305, + "loss": 0.5262, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -8.551309585571289, + "rewards/margins": 0.8173257112503052, + "rewards/rejected": -9.368635177612305, + "semantic_entropy": 0.0030341236852109432, + "step": 1690 + }, + { + "epoch": 0.907175112895133, + "grad_norm": 26.70901823794411, + "learning_rate": 8.803394952186742e-07, + "logits/chosen": 0.7727741003036499, + "logits/rejected": 0.8659934997558594, + "logps/chosen": -8.45335578918457, + "logps/rejected": -9.134082794189453, + "loss": 0.528, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -8.45335578918457, + "rewards/margins": 0.6807276606559753, + "rewards/rejected": -9.134082794189453, + "semantic_entropy": 0.0030217047315090895, + "step": 1695 + }, + { + "epoch": 0.9098511456765346, + "grad_norm": 15.766523978337645, + "learning_rate": 8.793266977736342e-07, + "logits/chosen": 0.8686197996139526, + "logits/rejected": 0.8411453366279602, + "logps/chosen": -8.678540229797363, + "logps/rejected": -9.106678009033203, + "loss": 0.6239, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -8.678540229797363, + "rewards/margins": 0.42813801765441895, + "rewards/rejected": -9.106678009033203, + "semantic_entropy": 0.0028703988064080477, + "step": 1700 + }, + { + "epoch": 0.9125271784579361, + "grad_norm": 18.30778845018795, + "learning_rate": 8.783102200993085e-07, + "logits/chosen": 0.8415244817733765, + "logits/rejected": 0.9102290868759155, + "logps/chosen": -8.705827713012695, + "logps/rejected": -9.459232330322266, + "loss": 0.5234, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.705827713012695, + "rewards/margins": 0.753406286239624, + "rewards/rejected": -9.459232330322266, + "semantic_entropy": 0.002563622547313571, + "step": 1705 + }, + { + "epoch": 0.9152032112393377, + "grad_norm": 14.122552458570945, + "learning_rate": 8.772900720575683e-07, + "logits/chosen": 0.8687243461608887, + "logits/rejected": 0.9228528738021851, + "logps/chosen": -8.917773246765137, + "logps/rejected": -9.463602066040039, + "loss": 0.5956, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -8.917773246765137, + "rewards/margins": 0.5458282828330994, + "rewards/rejected": -9.463602066040039, + "semantic_entropy": 0.002534933853894472, + "step": 1710 + }, + { + "epoch": 0.9178792440207393, + "grad_norm": 22.1831103043282, + "learning_rate": 8.762662635458944e-07, + "logits/chosen": 0.8289508819580078, + "logits/rejected": 0.9157236218452454, + "logps/chosen": -8.972761154174805, + "logps/rejected": -9.611922264099121, + "loss": 0.6294, + "rewards/accuracies": 0.6875, + "rewards/chosen": -8.972761154174805, + "rewards/margins": 0.6391609907150269, + "rewards/rejected": -9.611922264099121, + "semantic_entropy": 0.0023491496685892344, + "step": 1715 + }, + { + "epoch": 0.9205552768021408, + "grad_norm": 18.48437033985848, + "learning_rate": 8.752388044972811e-07, + "logits/chosen": 0.8286212086677551, + "logits/rejected": 0.8758748769760132, + "logps/chosen": -8.496380805969238, + "logps/rejected": -9.228724479675293, + "loss": 0.5515, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -8.496380805969238, + "rewards/margins": 0.7323442101478577, + "rewards/rejected": -9.228724479675293, + "semantic_entropy": 0.003955576568841934, + "step": 1720 + }, + { + "epoch": 0.9232313095835424, + "grad_norm": 15.320431126329213, + "learning_rate": 8.74207704880141e-07, + "logits/chosen": 0.7788019180297852, + "logits/rejected": 0.8485990762710571, + "logps/chosen": -8.44409465789795, + "logps/rejected": -9.369488716125488, + "loss": 0.4862, + "rewards/accuracies": 0.78125, + "rewards/chosen": -8.44409465789795, + "rewards/margins": 0.9253931045532227, + "rewards/rejected": -9.369488716125488, + "semantic_entropy": 0.0033842413686215878, + "step": 1725 + }, + { + "epoch": 0.925907342364944, + "grad_norm": 13.032187521778193, + "learning_rate": 8.731729746982068e-07, + "logits/chosen": 0.7908933758735657, + "logits/rejected": 0.839871883392334, + "logps/chosen": -8.113957405090332, + "logps/rejected": -8.812549591064453, + "loss": 0.5337, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.113957405090332, + "rewards/margins": 0.6985923647880554, + "rewards/rejected": -8.812549591064453, + "semantic_entropy": 0.004230237565934658, + "step": 1730 + }, + { + "epoch": 0.9285833751463456, + "grad_norm": 19.494387499479178, + "learning_rate": 8.721346239904355e-07, + "logits/chosen": 0.72081059217453, + "logits/rejected": 0.8257828950881958, + "logps/chosen": -8.163381576538086, + "logps/rejected": -8.904546737670898, + "loss": 0.5976, + "rewards/accuracies": 0.625, + "rewards/chosen": -8.163381576538086, + "rewards/margins": 0.7411641478538513, + "rewards/rejected": -8.904546737670898, + "semantic_entropy": 0.00452050007879734, + "step": 1735 + }, + { + "epoch": 0.9312594079277471, + "grad_norm": 18.390962651772945, + "learning_rate": 8.710926628309101e-07, + "logits/chosen": 0.7455258965492249, + "logits/rejected": 0.8390616178512573, + "logps/chosen": -8.214960098266602, + "logps/rejected": -8.860664367675781, + "loss": 0.5546, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -8.214960098266602, + "rewards/margins": 0.6457030177116394, + "rewards/rejected": -8.860664367675781, + "semantic_entropy": 0.004450926091521978, + "step": 1740 + }, + { + "epoch": 0.9339354407091487, + "grad_norm": 12.45958358529586, + "learning_rate": 8.700471013287424e-07, + "logits/chosen": 0.779666543006897, + "logits/rejected": 0.7992655038833618, + "logps/chosen": -7.90356969833374, + "logps/rejected": -8.546361923217773, + "loss": 0.5456, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -7.90356969833374, + "rewards/margins": 0.6427920460700989, + "rewards/rejected": -8.546361923217773, + "semantic_entropy": 0.005241268780082464, + "step": 1745 + }, + { + "epoch": 0.9366114734905503, + "grad_norm": 20.787825790509054, + "learning_rate": 8.689979496279746e-07, + "logits/chosen": 0.7400572896003723, + "logits/rejected": 0.7830491065979004, + "logps/chosen": -8.003057479858398, + "logps/rejected": -8.49720573425293, + "loss": 0.6642, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -8.003057479858398, + "rewards/margins": 0.4941479563713074, + "rewards/rejected": -8.49720573425293, + "semantic_entropy": 0.005025799386203289, + "step": 1750 + }, + { + "epoch": 0.9392875062719518, + "grad_norm": 13.730264245914015, + "learning_rate": 8.679452179074811e-07, + "logits/chosen": 0.7920068502426147, + "logits/rejected": 0.8651610612869263, + "logps/chosen": -7.900570869445801, + "logps/rejected": -8.649713516235352, + "loss": 0.5124, + "rewards/accuracies": 0.75, + "rewards/chosen": -7.900570869445801, + "rewards/margins": 0.7491430640220642, + "rewards/rejected": -8.649713516235352, + "semantic_entropy": 0.005135712679475546, + "step": 1755 + }, + { + "epoch": 0.9419635390533534, + "grad_norm": 13.626163938056823, + "learning_rate": 8.668889163808698e-07, + "logits/chosen": 0.7864473462104797, + "logits/rejected": 0.8614629507064819, + "logps/chosen": -7.7179975509643555, + "logps/rejected": -8.315633773803711, + "loss": 0.5636, + "rewards/accuracies": 0.6875, + "rewards/chosen": -7.7179975509643555, + "rewards/margins": 0.5976354479789734, + "rewards/rejected": -8.315633773803711, + "semantic_entropy": 0.0065610273741185665, + "step": 1760 + }, + { + "epoch": 0.944639571834755, + "grad_norm": 15.396621437912136, + "learning_rate": 8.658290552963827e-07, + "logits/chosen": 0.7842726111412048, + "logits/rejected": 0.8139573335647583, + "logps/chosen": -7.729952812194824, + "logps/rejected": -8.440402030944824, + "loss": 0.5618, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -7.729952812194824, + "rewards/margins": 0.710450291633606, + "rewards/rejected": -8.440402030944824, + "semantic_entropy": 0.005991552956402302, + "step": 1765 + }, + { + "epoch": 0.9473156046161565, + "grad_norm": 11.779338418294584, + "learning_rate": 8.647656449367966e-07, + "logits/chosen": 0.7771416306495667, + "logits/rejected": 0.8638502359390259, + "logps/chosen": -7.753976345062256, + "logps/rejected": -8.381518363952637, + "loss": 0.5732, + "rewards/accuracies": 0.71875, + "rewards/chosen": -7.753976345062256, + "rewards/margins": 0.6275419592857361, + "rewards/rejected": -8.381518363952637, + "semantic_entropy": 0.0064716823399066925, + "step": 1770 + }, + { + "epoch": 0.9499916373975581, + "grad_norm": 11.790444019030152, + "learning_rate": 8.636986956193235e-07, + "logits/chosen": 0.7170445919036865, + "logits/rejected": 0.7984222173690796, + "logps/chosen": -7.559231758117676, + "logps/rejected": -8.215555191040039, + "loss": 0.5718, + "rewards/accuracies": 0.6875, + "rewards/chosen": -7.559231758117676, + "rewards/margins": 0.6563239097595215, + "rewards/rejected": -8.215555191040039, + "semantic_entropy": 0.007783152163028717, + "step": 1775 + }, + { + "epoch": 0.9526676701789597, + "grad_norm": 13.251274223031999, + "learning_rate": 8.626282176955104e-07, + "logits/chosen": 0.7697458863258362, + "logits/rejected": 0.8472963571548462, + "logps/chosen": -7.661177635192871, + "logps/rejected": -8.409696578979492, + "loss": 0.5248, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -7.661177635192871, + "rewards/margins": 0.7485184073448181, + "rewards/rejected": -8.409696578979492, + "semantic_entropy": 0.006556454114615917, + "step": 1780 + }, + { + "epoch": 0.9553437029603613, + "grad_norm": 16.317970460951365, + "learning_rate": 8.615542215511389e-07, + "logits/chosen": 0.7925196886062622, + "logits/rejected": 0.827374279499054, + "logps/chosen": -7.7944183349609375, + "logps/rejected": -8.290719985961914, + "loss": 0.5983, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -7.7944183349609375, + "rewards/margins": 0.4963007867336273, + "rewards/rejected": -8.290719985961914, + "semantic_entropy": 0.006235038861632347, + "step": 1785 + }, + { + "epoch": 0.9580197357417628, + "grad_norm": 17.388652579583002, + "learning_rate": 8.604767176061241e-07, + "logits/chosen": 0.7371417284011841, + "logits/rejected": 0.7948885560035706, + "logps/chosen": -7.8774542808532715, + "logps/rejected": -8.463842391967773, + "loss": 0.5767, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -7.8774542808532715, + "rewards/margins": 0.5863882303237915, + "rewards/rejected": -8.463842391967773, + "semantic_entropy": 0.0055058179423213005, + "step": 1790 + }, + { + "epoch": 0.9606957685231644, + "grad_norm": 10.102522428025908, + "learning_rate": 8.593957163144141e-07, + "logits/chosen": 0.701524555683136, + "logits/rejected": 0.7919615507125854, + "logps/chosen": -7.6824631690979, + "logps/rejected": -8.461966514587402, + "loss": 0.516, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -7.6824631690979, + "rewards/margins": 0.7795030474662781, + "rewards/rejected": -8.461966514587402, + "semantic_entropy": 0.007335428148508072, + "step": 1795 + }, + { + "epoch": 0.963371801304566, + "grad_norm": 13.049955014081108, + "learning_rate": 8.58311228163888e-07, + "logits/chosen": 0.723731279373169, + "logits/rejected": 0.7662105560302734, + "logps/chosen": -7.902833461761475, + "logps/rejected": -8.512142181396484, + "loss": 0.5491, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -7.902833461761475, + "rewards/margins": 0.6093090772628784, + "rewards/rejected": -8.512142181396484, + "semantic_entropy": 0.00509651331230998, + "step": 1800 + }, + { + "epoch": 0.9660478340859675, + "grad_norm": 17.055444027861736, + "learning_rate": 8.57223263676255e-07, + "logits/chosen": 0.651114821434021, + "logits/rejected": 0.7337725758552551, + "logps/chosen": -7.82892370223999, + "logps/rejected": -8.779696464538574, + "loss": 0.4542, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -7.82892370223999, + "rewards/margins": 0.950772762298584, + "rewards/rejected": -8.779696464538574, + "semantic_entropy": 0.0056858672760427, + "step": 1805 + }, + { + "epoch": 0.9687238668673691, + "grad_norm": 12.069977661137925, + "learning_rate": 8.561318334069511e-07, + "logits/chosen": 0.722413182258606, + "logits/rejected": 0.8114809989929199, + "logps/chosen": -8.02531623840332, + "logps/rejected": -8.714741706848145, + "loss": 0.5643, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -8.02531623840332, + "rewards/margins": 0.6894262433052063, + "rewards/rejected": -8.714741706848145, + "semantic_entropy": 0.004859632812440395, + "step": 1810 + }, + { + "epoch": 0.9713998996487707, + "grad_norm": 14.829212830302785, + "learning_rate": 8.550369479450375e-07, + "logits/chosen": 0.7346758842468262, + "logits/rejected": 0.8068816065788269, + "logps/chosen": -8.147111892700195, + "logps/rejected": -8.884883880615234, + "loss": 0.5406, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -8.147111892700195, + "rewards/margins": 0.7377720475196838, + "rewards/rejected": -8.884883880615234, + "semantic_entropy": 0.00493080448359251, + "step": 1815 + }, + { + "epoch": 0.9740759324301723, + "grad_norm": 16.49209421591941, + "learning_rate": 8.539386179130977e-07, + "logits/chosen": 0.7819596529006958, + "logits/rejected": 0.8145734667778015, + "logps/chosen": -8.00898551940918, + "logps/rejected": -8.68702507019043, + "loss": 0.5593, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -8.00898551940918, + "rewards/margins": 0.6780385971069336, + "rewards/rejected": -8.68702507019043, + "semantic_entropy": 0.005897555500268936, + "step": 1820 + }, + { + "epoch": 0.9767519652115738, + "grad_norm": 14.239228008160856, + "learning_rate": 8.528368539671347e-07, + "logits/chosen": 0.7752368450164795, + "logits/rejected": 0.863335132598877, + "logps/chosen": -7.959936618804932, + "logps/rejected": -9.050023078918457, + "loss": 0.4822, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -7.959936618804932, + "rewards/margins": 1.0900851488113403, + "rewards/rejected": -9.050023078918457, + "semantic_entropy": 0.005652183201164007, + "step": 1825 + }, + { + "epoch": 0.9794279979929754, + "grad_norm": 16.436302978410623, + "learning_rate": 8.51731666796467e-07, + "logits/chosen": 0.8048080205917358, + "logits/rejected": 0.8463269472122192, + "logps/chosen": -8.244328498840332, + "logps/rejected": -8.958102226257324, + "loss": 0.5714, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -8.244328498840332, + "rewards/margins": 0.7137740254402161, + "rewards/rejected": -8.958102226257324, + "semantic_entropy": 0.0040356675162911415, + "step": 1830 + }, + { + "epoch": 0.982104030774377, + "grad_norm": 17.79215643845262, + "learning_rate": 8.506230671236254e-07, + "logits/chosen": 0.7882435917854309, + "logits/rejected": 0.8229848146438599, + "logps/chosen": -8.396721839904785, + "logps/rejected": -8.957392692565918, + "loss": 0.5884, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -8.396721839904785, + "rewards/margins": 0.5606712102890015, + "rewards/rejected": -8.957392692565918, + "semantic_entropy": 0.0036423238925635815, + "step": 1835 + }, + { + "epoch": 0.9847800635557785, + "grad_norm": 14.625489667473667, + "learning_rate": 8.495110657042488e-07, + "logits/chosen": 0.8869732618331909, + "logits/rejected": 0.9640114903450012, + "logps/chosen": -8.529989242553711, + "logps/rejected": -9.27656364440918, + "loss": 0.5253, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -8.529989242553711, + "rewards/margins": 0.7465731501579285, + "rewards/rejected": -9.27656364440918, + "semantic_entropy": 0.0031088325195014477, + "step": 1840 + }, + { + "epoch": 0.9874560963371801, + "grad_norm": 18.188969568475773, + "learning_rate": 8.483956733269799e-07, + "logits/chosen": 0.8915464282035828, + "logits/rejected": 0.9507058262825012, + "logps/chosen": -8.5012845993042, + "logps/rejected": -9.236323356628418, + "loss": 0.5562, + "rewards/accuracies": 0.71875, + "rewards/chosen": -8.5012845993042, + "rewards/margins": 0.7350392937660217, + "rewards/rejected": -9.236323356628418, + "semantic_entropy": 0.002896857215091586, + "step": 1845 + }, + { + "epoch": 0.9901321291185817, + "grad_norm": 17.9922914014973, + "learning_rate": 8.472769008133602e-07, + "logits/chosen": 0.8699381947517395, + "logits/rejected": 0.953484833240509, + "logps/chosen": -8.702693939208984, + "logps/rejected": -9.392245292663574, + "loss": 0.5587, + "rewards/accuracies": 0.6875, + "rewards/chosen": -8.702693939208984, + "rewards/margins": 0.6895512342453003, + "rewards/rejected": -9.392245292663574, + "semantic_entropy": 0.00236605666577816, + "step": 1850 + }, + { + "epoch": 0.9928081618999832, + "grad_norm": 15.27885989370606, + "learning_rate": 8.461547590177259e-07, + "logits/chosen": 0.9513596296310425, + "logits/rejected": 1.0096721649169922, + "logps/chosen": -8.60840129852295, + "logps/rejected": -9.362217903137207, + "loss": 0.5943, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -8.60840129852295, + "rewards/margins": 0.7538172006607056, + "rewards/rejected": -9.362217903137207, + "semantic_entropy": 0.0031571455765515566, + "step": 1855 + }, + { + "epoch": 0.9954841946813848, + "grad_norm": 16.3301144819492, + "learning_rate": 8.450292588271014e-07, + "logits/chosen": 0.9395162463188171, + "logits/rejected": 0.9991067051887512, + "logps/chosen": -8.834760665893555, + "logps/rejected": -9.530847549438477, + "loss": 0.5574, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -8.834760665893555, + "rewards/margins": 0.696088433265686, + "rewards/rejected": -9.530847549438477, + "semantic_entropy": 0.0025766813196241856, + "step": 1860 + }, + { + "epoch": 0.9981602274627864, + "grad_norm": 14.909090646149107, + "learning_rate": 8.439004111610945e-07, + "logits/chosen": 0.9531529545783997, + "logits/rejected": 0.9883922338485718, + "logps/chosen": -8.549825668334961, + "logps/rejected": -9.276273727416992, + "loss": 0.5704, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -8.549825668334961, + "rewards/margins": 0.7264472246170044, + "rewards/rejected": -9.276273727416992, + "semantic_entropy": 0.003057825844734907, + "step": 1865 + }, + { + "epoch": 1.000836260244188, + "grad_norm": 14.551680179512683, + "learning_rate": 8.427682269717901e-07, + "logits/chosen": 0.918908953666687, + "logits/rejected": 0.9724335670471191, + "logps/chosen": -8.606492042541504, + "logps/rejected": -9.455463409423828, + "loss": 0.495, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -8.606492042541504, + "rewards/margins": 0.8489717245101929, + "rewards/rejected": -9.455463409423828, + "semantic_entropy": 0.002914209384471178, + "step": 1870 + }, + { + "epoch": 1.0035122930255895, + "grad_norm": 17.126300719381895, + "learning_rate": 8.416327172436446e-07, + "logits/chosen": 0.9382045865058899, + "logits/rejected": 1.0026956796646118, + "logps/chosen": -8.67430591583252, + "logps/rejected": -9.235125541687012, + "loss": 0.5954, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -8.67430591583252, + "rewards/margins": 0.5608205795288086, + "rewards/rejected": -9.235125541687012, + "semantic_entropy": 0.0024842366110533476, + "step": 1875 + }, + { + "epoch": 1.0061883258069912, + "grad_norm": 12.526549153278241, + "learning_rate": 8.404938929933778e-07, + "logits/chosen": 0.9702759981155396, + "logits/rejected": 1.0298935174942017, + "logps/chosen": -8.516494750976562, + "logps/rejected": -9.477919578552246, + "loss": 0.4751, + "rewards/accuracies": 0.78125, + "rewards/chosen": -8.516494750976562, + "rewards/margins": 0.961426854133606, + "rewards/rejected": -9.477919578552246, + "semantic_entropy": 0.0030000859405845404, + "step": 1880 + }, + { + "epoch": 1.0088643585883927, + "grad_norm": 13.104074462534614, + "learning_rate": 8.39351765269868e-07, + "logits/chosen": 0.9352337121963501, + "logits/rejected": 0.9791328310966492, + "logps/chosen": -8.481460571289062, + "logps/rejected": -9.137152671813965, + "loss": 0.5852, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -8.481460571289062, + "rewards/margins": 0.6556928157806396, + "rewards/rejected": -9.137152671813965, + "semantic_entropy": 0.0034713305067270994, + "step": 1885 + }, + { + "epoch": 1.0115403913697942, + "grad_norm": 15.985094320283427, + "learning_rate": 8.382063451540431e-07, + "logits/chosen": 0.9075764417648315, + "logits/rejected": 1.0042946338653564, + "logps/chosen": -8.522588729858398, + "logps/rejected": -9.330839157104492, + "loss": 0.4979, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.522588729858398, + "rewards/margins": 0.8082510828971863, + "rewards/rejected": -9.330839157104492, + "semantic_entropy": 0.0028249945025891066, + "step": 1890 + }, + { + "epoch": 1.014216424151196, + "grad_norm": 12.915214195374903, + "learning_rate": 8.370576437587742e-07, + "logits/chosen": 0.8950628042221069, + "logits/rejected": 0.924281120300293, + "logps/chosen": -8.34730052947998, + "logps/rejected": -9.118095397949219, + "loss": 0.5147, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -8.34730052947998, + "rewards/margins": 0.7707957625389099, + "rewards/rejected": -9.118095397949219, + "semantic_entropy": 0.0034280649852007627, + "step": 1895 + }, + { + "epoch": 1.0168924569325974, + "grad_norm": 14.453317743532807, + "learning_rate": 8.359056722287674e-07, + "logits/chosen": 0.8320645093917847, + "logits/rejected": 0.9602710604667664, + "logps/chosen": -8.307376861572266, + "logps/rejected": -9.152776718139648, + "loss": 0.5081, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.307376861572266, + "rewards/margins": 0.845400333404541, + "rewards/rejected": -9.152776718139648, + "semantic_entropy": 0.0037023150362074375, + "step": 1900 + }, + { + "epoch": 1.019568489713999, + "grad_norm": 13.51505578799361, + "learning_rate": 8.347504417404553e-07, + "logits/chosen": 0.8122785687446594, + "logits/rejected": 0.8963130712509155, + "logps/chosen": -8.318288803100586, + "logps/rejected": -9.081267356872559, + "loss": 0.536, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -8.318288803100586, + "rewards/margins": 0.7629793882369995, + "rewards/rejected": -9.081267356872559, + "semantic_entropy": 0.003525532316416502, + "step": 1905 + }, + { + "epoch": 1.0222445224954007, + "grad_norm": 11.191552343163695, + "learning_rate": 8.335919635018893e-07, + "logits/chosen": 0.7361363172531128, + "logits/rejected": 0.8019342422485352, + "logps/chosen": -8.195878982543945, + "logps/rejected": -8.886285781860352, + "loss": 0.5351, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.195878982543945, + "rewards/margins": 0.6904064416885376, + "rewards/rejected": -8.886285781860352, + "semantic_entropy": 0.004457551054656506, + "step": 1910 + }, + { + "epoch": 1.0249205552768021, + "grad_norm": 12.647769499958207, + "learning_rate": 8.324302487526303e-07, + "logits/chosen": 0.7044271230697632, + "logits/rejected": 0.77665114402771, + "logps/chosen": -8.380681037902832, + "logps/rejected": -9.184738159179688, + "loss": 0.5034, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.380681037902832, + "rewards/margins": 0.8040567636489868, + "rewards/rejected": -9.184738159179688, + "semantic_entropy": 0.003701858688145876, + "step": 1915 + }, + { + "epoch": 1.0275965880582036, + "grad_norm": 13.95644498738611, + "learning_rate": 8.312653087636398e-07, + "logits/chosen": 0.722461998462677, + "logits/rejected": 0.7603663206100464, + "logps/chosen": -8.306981086730957, + "logps/rejected": -9.107414245605469, + "loss": 0.5306, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -8.306981086730957, + "rewards/margins": 0.8004336357116699, + "rewards/rejected": -9.107414245605469, + "semantic_entropy": 0.004512041341513395, + "step": 1920 + }, + { + "epoch": 1.0302726208396054, + "grad_norm": 18.459332801354297, + "learning_rate": 8.300971548371711e-07, + "logits/chosen": 0.5903456211090088, + "logits/rejected": 0.7183451056480408, + "logps/chosen": -8.510897636413574, + "logps/rejected": -9.23041820526123, + "loss": 0.5363, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -8.510897636413574, + "rewards/margins": 0.719520628452301, + "rewards/rejected": -9.23041820526123, + "semantic_entropy": 0.003166732145473361, + "step": 1925 + }, + { + "epoch": 1.0329486536210069, + "grad_norm": 18.340027693624933, + "learning_rate": 8.289257983066582e-07, + "logits/chosen": 0.6703733205795288, + "logits/rejected": 0.7412980198860168, + "logps/chosen": -8.390009880065918, + "logps/rejected": -9.196538925170898, + "loss": 0.5283, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -8.390009880065918, + "rewards/margins": 0.8065292239189148, + "rewards/rejected": -9.196538925170898, + "semantic_entropy": 0.004317262209951878, + "step": 1930 + }, + { + "epoch": 1.0356246864024083, + "grad_norm": 14.802116730649294, + "learning_rate": 8.277512505366077e-07, + "logits/chosen": 0.6543900966644287, + "logits/rejected": 0.7763436436653137, + "logps/chosen": -8.447009086608887, + "logps/rejected": -9.306153297424316, + "loss": 0.5238, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -8.447009086608887, + "rewards/margins": 0.8591440916061401, + "rewards/rejected": -9.306153297424316, + "semantic_entropy": 0.003334530396386981, + "step": 1935 + }, + { + "epoch": 1.03830071918381, + "grad_norm": 15.098311821125256, + "learning_rate": 8.265735229224868e-07, + "logits/chosen": 0.6818082928657532, + "logits/rejected": 0.7659986615180969, + "logps/chosen": -8.268132209777832, + "logps/rejected": -9.30879020690918, + "loss": 0.4737, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -8.268132209777832, + "rewards/margins": 1.040657877922058, + "rewards/rejected": -9.30879020690918, + "semantic_entropy": 0.003615723457187414, + "step": 1940 + }, + { + "epoch": 1.0409767519652116, + "grad_norm": 14.723077590553624, + "learning_rate": 8.253926268906144e-07, + "logits/chosen": 0.6228159666061401, + "logits/rejected": 0.688185453414917, + "logps/chosen": -8.453712463378906, + "logps/rejected": -9.435036659240723, + "loss": 0.4647, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -8.453712463378906, + "rewards/margins": 0.9813230633735657, + "rewards/rejected": -9.435036659240723, + "semantic_entropy": 0.0034209657460451126, + "step": 1945 + }, + { + "epoch": 1.043652784746613, + "grad_norm": 13.627077288321304, + "learning_rate": 8.242085738980487e-07, + "logits/chosen": 0.7107739448547363, + "logits/rejected": 0.8460835218429565, + "logps/chosen": -8.611701965332031, + "logps/rejected": -9.478960990905762, + "loss": 0.5379, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -8.611701965332031, + "rewards/margins": 0.8672583699226379, + "rewards/rejected": -9.478960990905762, + "semantic_entropy": 0.0030593627598136663, + "step": 1950 + }, + { + "epoch": 1.0463288175280148, + "grad_norm": 19.757986263536445, + "learning_rate": 8.230213754324772e-07, + "logits/chosen": 0.6388633847236633, + "logits/rejected": 0.6890886425971985, + "logps/chosen": -8.633177757263184, + "logps/rejected": -9.407966613769531, + "loss": 0.51, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -8.633177757263184, + "rewards/margins": 0.7747882604598999, + "rewards/rejected": -9.407966613769531, + "semantic_entropy": 0.002916950499638915, + "step": 1955 + }, + { + "epoch": 1.0490048503094163, + "grad_norm": 16.381811199534628, + "learning_rate": 8.218310430121045e-07, + "logits/chosen": 0.6950886845588684, + "logits/rejected": 0.720539927482605, + "logps/chosen": -8.707314491271973, + "logps/rejected": -9.449724197387695, + "loss": 0.558, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -8.707314491271973, + "rewards/margins": 0.74241042137146, + "rewards/rejected": -9.449724197387695, + "semantic_entropy": 0.0032343785278499126, + "step": 1960 + }, + { + "epoch": 1.051680883090818, + "grad_norm": 13.044302764643096, + "learning_rate": 8.20637588185541e-07, + "logits/chosen": 0.6273518800735474, + "logits/rejected": 0.6855801939964294, + "logps/chosen": -8.859047889709473, + "logps/rejected": -9.979398727416992, + "loss": 0.4405, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -8.859047889709473, + "rewards/margins": 1.1203503608703613, + "rewards/rejected": -9.979398727416992, + "semantic_entropy": 0.0030110005754977465, + "step": 1965 + }, + { + "epoch": 1.0543569158722195, + "grad_norm": 16.03642878238039, + "learning_rate": 8.194410225316906e-07, + "logits/chosen": 0.5873863697052002, + "logits/rejected": 0.6804260015487671, + "logps/chosen": -8.77137565612793, + "logps/rejected": -9.576631546020508, + "loss": 0.5406, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -8.77137565612793, + "rewards/margins": 0.8052547574043274, + "rewards/rejected": -9.576631546020508, + "semantic_entropy": 0.0030320039950311184, + "step": 1970 + }, + { + "epoch": 1.057032948653621, + "grad_norm": 22.084325288130714, + "learning_rate": 8.182413576596385e-07, + "logits/chosen": 0.6454890370368958, + "logits/rejected": 0.6792441606521606, + "logps/chosen": -8.825540542602539, + "logps/rejected": -9.584020614624023, + "loss": 0.5592, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -8.825540542602539, + "rewards/margins": 0.7584813833236694, + "rewards/rejected": -9.584020614624023, + "semantic_entropy": 0.0030581161845475435, + "step": 1975 + }, + { + "epoch": 1.0597089814350227, + "grad_norm": 19.767902079352925, + "learning_rate": 8.170386052085389e-07, + "logits/chosen": 0.5810804963111877, + "logits/rejected": 0.6741968989372253, + "logps/chosen": -8.81079387664795, + "logps/rejected": -9.655781745910645, + "loss": 0.5413, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -8.81079387664795, + "rewards/margins": 0.8449875712394714, + "rewards/rejected": -9.655781745910645, + "semantic_entropy": 0.0033272195141762495, + "step": 1980 + }, + { + "epoch": 1.0623850142164242, + "grad_norm": 16.00497801864881, + "learning_rate": 8.158327768475008e-07, + "logits/chosen": 0.5558470487594604, + "logits/rejected": 0.6562130451202393, + "logps/chosen": -8.712007522583008, + "logps/rejected": -9.437994003295898, + "loss": 0.566, + "rewards/accuracies": 0.71875, + "rewards/chosen": -8.712007522583008, + "rewards/margins": 0.7259871959686279, + "rewards/rejected": -9.437994003295898, + "semantic_entropy": 0.004049594048410654, + "step": 1985 + }, + { + "epoch": 1.0650610469978257, + "grad_norm": 22.86662817638401, + "learning_rate": 8.146238842754767e-07, + "logits/chosen": 0.48547202348709106, + "logits/rejected": 0.5629103779792786, + "logps/chosen": -8.967915534973145, + "logps/rejected": -9.580000877380371, + "loss": 0.5851, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -8.967915534973145, + "rewards/margins": 0.6120861172676086, + "rewards/rejected": -9.580000877380371, + "semantic_entropy": 0.002668407978489995, + "step": 1990 + }, + { + "epoch": 1.0677370797792274, + "grad_norm": 23.23762612453471, + "learning_rate": 8.134119392211476e-07, + "logits/chosen": 0.5937298536300659, + "logits/rejected": 0.7143954038619995, + "logps/chosen": -8.771955490112305, + "logps/rejected": -9.69508171081543, + "loss": 0.5095, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.771955490112305, + "rewards/margins": 0.9231254458427429, + "rewards/rejected": -9.69508171081543, + "semantic_entropy": 0.002983763115480542, + "step": 1995 + }, + { + "epoch": 1.0704131125606289, + "grad_norm": 20.894254936050405, + "learning_rate": 8.121969534428094e-07, + "logits/chosen": 0.5293421745300293, + "logits/rejected": 0.6653727293014526, + "logps/chosen": -8.856660842895508, + "logps/rejected": -9.56396198272705, + "loss": 0.5834, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -8.856660842895508, + "rewards/margins": 0.7073008418083191, + "rewards/rejected": -9.56396198272705, + "semantic_entropy": 0.002809601603075862, + "step": 2000 + }, + { + "epoch": 1.0704131125606289, + "eval_logits/chosen": 0.7784110307693481, + "eval_logits/rejected": 0.8619949221611023, + "eval_logps/chosen": -8.725646018981934, + "eval_logps/rejected": -9.51314926147461, + "eval_loss": 0.5343691110610962, + "eval_rewards/accuracies": 0.7158753871917725, + "eval_rewards/chosen": -8.725646018981934, + "eval_rewards/margins": 0.7875038385391235, + "eval_rewards/rejected": -9.51314926147461, + "eval_runtime": 34.7505, + "eval_samples_per_second": 38.704, + "eval_semantic_entropy": 0.00273532303981483, + "eval_steps_per_second": 9.698, + "step": 2000 + }, + { + "epoch": 1.0730891453420304, + "grad_norm": 16.92593879734535, + "learning_rate": 8.109789387282599e-07, + "logits/chosen": 0.5764074921607971, + "logits/rejected": 0.6182007193565369, + "logps/chosen": -8.691645622253418, + "logps/rejected": -9.37775707244873, + "loss": 0.5661, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -8.691645622253418, + "rewards/margins": 0.6861115097999573, + "rewards/rejected": -9.37775707244873, + "semantic_entropy": 0.003028175327926874, + "step": 2005 + }, + { + "epoch": 1.075765178123432, + "grad_norm": 18.60708769092857, + "learning_rate": 8.097579068946827e-07, + "logits/chosen": 0.5846803784370422, + "logits/rejected": 0.6755378842353821, + "logps/chosen": -8.488496780395508, + "logps/rejected": -9.236639022827148, + "loss": 0.5124, + "rewards/accuracies": 0.71875, + "rewards/chosen": -8.488496780395508, + "rewards/margins": 0.7481436729431152, + "rewards/rejected": -9.236639022827148, + "semantic_entropy": 0.0031055829022079706, + "step": 2010 + }, + { + "epoch": 1.0784412109048336, + "grad_norm": 16.81295395917736, + "learning_rate": 8.085338697885344e-07, + "logits/chosen": 0.5960233807563782, + "logits/rejected": 0.6982234120368958, + "logps/chosen": -8.587759017944336, + "logps/rejected": -9.308615684509277, + "loss": 0.5304, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.587759017944336, + "rewards/margins": 0.720857560634613, + "rewards/rejected": -9.308615684509277, + "semantic_entropy": 0.003149865660816431, + "step": 2015 + }, + { + "epoch": 1.081117243686235, + "grad_norm": 19.584362200308394, + "learning_rate": 8.073068392854282e-07, + "logits/chosen": 0.4914863705635071, + "logits/rejected": 0.6277307868003845, + "logps/chosen": -8.720789909362793, + "logps/rejected": -9.521505355834961, + "loss": 0.4904, + "rewards/accuracies": 0.78125, + "rewards/chosen": -8.720789909362793, + "rewards/margins": 0.8007165789604187, + "rewards/rejected": -9.521505355834961, + "semantic_entropy": 0.0029073634650558233, + "step": 2020 + }, + { + "epoch": 1.0837932764676368, + "grad_norm": 14.905704270703776, + "learning_rate": 8.060768272900193e-07, + "logits/chosen": 0.5698509812355042, + "logits/rejected": 0.6809700727462769, + "logps/chosen": -8.530683517456055, + "logps/rejected": -9.367597579956055, + "loss": 0.5261, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -8.530683517456055, + "rewards/margins": 0.8369154930114746, + "rewards/rejected": -9.367597579956055, + "semantic_entropy": 0.0036535891704261303, + "step": 2025 + }, + { + "epoch": 1.0864693092490383, + "grad_norm": 11.142809207007566, + "learning_rate": 8.0484384573589e-07, + "logits/chosen": 0.4973204731941223, + "logits/rejected": 0.5573136210441589, + "logps/chosen": -8.422048568725586, + "logps/rejected": -9.212953567504883, + "loss": 0.5264, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.422048568725586, + "rewards/margins": 0.7909058332443237, + "rewards/rejected": -9.212953567504883, + "semantic_entropy": 0.003728007199242711, + "step": 2030 + }, + { + "epoch": 1.0891453420304398, + "grad_norm": 18.70125115826522, + "learning_rate": 8.03607906585432e-07, + "logits/chosen": 0.5369696617126465, + "logits/rejected": 0.6485335230827332, + "logps/chosen": -8.6622896194458, + "logps/rejected": -9.386190414428711, + "loss": 0.5708, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -8.6622896194458, + "rewards/margins": 0.7239011526107788, + "rewards/rejected": -9.386190414428711, + "semantic_entropy": 0.0035473487805575132, + "step": 2035 + }, + { + "epoch": 1.0918213748118415, + "grad_norm": 26.024840250891568, + "learning_rate": 8.023690218297329e-07, + "logits/chosen": 0.47266292572021484, + "logits/rejected": 0.526736855506897, + "logps/chosen": -8.556685447692871, + "logps/rejected": -9.491031646728516, + "loss": 0.4911, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.556685447692871, + "rewards/margins": 0.9343463778495789, + "rewards/rejected": -9.491031646728516, + "semantic_entropy": 0.0032796214800328016, + "step": 2040 + }, + { + "epoch": 1.094497407593243, + "grad_norm": 18.59073371986523, + "learning_rate": 8.01127203488458e-07, + "logits/chosen": 0.5553776025772095, + "logits/rejected": 0.6106212735176086, + "logps/chosen": -8.6795015335083, + "logps/rejected": -9.444005012512207, + "loss": 0.5309, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -8.6795015335083, + "rewards/margins": 0.7645029425621033, + "rewards/rejected": -9.444005012512207, + "semantic_entropy": 0.0029942230321466923, + "step": 2045 + }, + { + "epoch": 1.0971734403746445, + "grad_norm": 19.661856792543684, + "learning_rate": 7.998824636097339e-07, + "logits/chosen": 0.5739470720291138, + "logits/rejected": 0.6971379518508911, + "logps/chosen": -8.599574089050293, + "logps/rejected": -9.433286666870117, + "loss": 0.5109, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -8.599574089050293, + "rewards/margins": 0.833710789680481, + "rewards/rejected": -9.433286666870117, + "semantic_entropy": 0.002899765968322754, + "step": 2050 + }, + { + "epoch": 1.0998494731560462, + "grad_norm": 16.83932620346083, + "learning_rate": 7.986348142700328e-07, + "logits/chosen": 0.5915915966033936, + "logits/rejected": 0.7208577394485474, + "logps/chosen": -8.551434516906738, + "logps/rejected": -9.56495189666748, + "loss": 0.4975, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -8.551434516906738, + "rewards/margins": 1.0135180950164795, + "rewards/rejected": -9.56495189666748, + "semantic_entropy": 0.004057818092405796, + "step": 2055 + }, + { + "epoch": 1.1025255059374477, + "grad_norm": 19.759575014791515, + "learning_rate": 7.973842675740539e-07, + "logits/chosen": 0.644290566444397, + "logits/rejected": 0.7044304609298706, + "logps/chosen": -8.437231063842773, + "logps/rejected": -9.365800857543945, + "loss": 0.4995, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.437231063842773, + "rewards/margins": 0.928569495677948, + "rewards/rejected": -9.365800857543945, + "semantic_entropy": 0.00467148469761014, + "step": 2060 + }, + { + "epoch": 1.1052015387188494, + "grad_norm": 19.3959994469509, + "learning_rate": 7.961308356546066e-07, + "logits/chosen": 0.5765253305435181, + "logits/rejected": 0.7118976712226868, + "logps/chosen": -8.473932266235352, + "logps/rejected": -9.513734817504883, + "loss": 0.4958, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -8.473932266235352, + "rewards/margins": 1.0398019552230835, + "rewards/rejected": -9.513734817504883, + "semantic_entropy": 0.003963841591030359, + "step": 2065 + }, + { + "epoch": 1.107877571500251, + "grad_norm": 19.393093668750392, + "learning_rate": 7.948745306724931e-07, + "logits/chosen": 0.6232589483261108, + "logits/rejected": 0.7551737427711487, + "logps/chosen": -8.12829875946045, + "logps/rejected": -9.182195663452148, + "loss": 0.4412, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -8.12829875946045, + "rewards/margins": 1.0538949966430664, + "rewards/rejected": -9.182195663452148, + "semantic_entropy": 0.004817788954824209, + "step": 2070 + }, + { + "epoch": 1.1105536042816524, + "grad_norm": 23.64668063780086, + "learning_rate": 7.936153648163897e-07, + "logits/chosen": 0.5677531957626343, + "logits/rejected": 0.6550413966178894, + "logps/chosen": -8.326519966125488, + "logps/rejected": -9.14603042602539, + "loss": 0.5172, + "rewards/accuracies": 0.71875, + "rewards/chosen": -8.326519966125488, + "rewards/margins": 0.8195114135742188, + "rewards/rejected": -9.14603042602539, + "semantic_entropy": 0.0040381476283073425, + "step": 2075 + }, + { + "epoch": 1.1132296370630541, + "grad_norm": 19.95159503167207, + "learning_rate": 7.92353350302729e-07, + "logits/chosen": 0.5089942216873169, + "logits/rejected": 0.6331297159194946, + "logps/chosen": -8.021492004394531, + "logps/rejected": -8.944520950317383, + "loss": 0.5098, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -8.021492004394531, + "rewards/margins": 0.9230290651321411, + "rewards/rejected": -8.944520950317383, + "semantic_entropy": 0.005194402299821377, + "step": 2080 + }, + { + "epoch": 1.1159056698444556, + "grad_norm": 25.2963455688314, + "learning_rate": 7.910884993755816e-07, + "logits/chosen": 0.6509027481079102, + "logits/rejected": 0.7161253690719604, + "logps/chosen": -8.10318660736084, + "logps/rejected": -9.13819408416748, + "loss": 0.4955, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -8.10318660736084, + "rewards/margins": 1.0350077152252197, + "rewards/rejected": -9.13819408416748, + "semantic_entropy": 0.004798793233931065, + "step": 2085 + }, + { + "epoch": 1.118581702625857, + "grad_norm": 16.477568206890176, + "learning_rate": 7.898208243065367e-07, + "logits/chosen": 0.6596091389656067, + "logits/rejected": 0.6896553635597229, + "logps/chosen": -8.11032772064209, + "logps/rejected": -8.861970901489258, + "loss": 0.533, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.11032772064209, + "rewards/margins": 0.7516436576843262, + "rewards/rejected": -8.861970901489258, + "semantic_entropy": 0.004430143162608147, + "step": 2090 + }, + { + "epoch": 1.1212577354072588, + "grad_norm": 16.367388918717808, + "learning_rate": 7.88550337394583e-07, + "logits/chosen": 0.640828013420105, + "logits/rejected": 0.7348512411117554, + "logps/chosen": -8.398119926452637, + "logps/rejected": -9.17949104309082, + "loss": 0.5304, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -8.398119926452637, + "rewards/margins": 0.7813706398010254, + "rewards/rejected": -9.17949104309082, + "semantic_entropy": 0.0035932317841798067, + "step": 2095 + }, + { + "epoch": 1.1239337681886603, + "grad_norm": 22.724539658375086, + "learning_rate": 7.872770509659905e-07, + "logits/chosen": 0.7362472414970398, + "logits/rejected": 0.7698075771331787, + "logps/chosen": -8.4552583694458, + "logps/rejected": -9.216978073120117, + "loss": 0.5361, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.4552583694458, + "rewards/margins": 0.7617195844650269, + "rewards/rejected": -9.216978073120117, + "semantic_entropy": 0.003288673236966133, + "step": 2100 + }, + { + "epoch": 1.1266098009700618, + "grad_norm": 17.17232989165224, + "learning_rate": 7.860009773741896e-07, + "logits/chosen": 0.8084769248962402, + "logits/rejected": 0.9146261215209961, + "logps/chosen": -8.417569160461426, + "logps/rejected": -9.373042106628418, + "loss": 0.4631, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.417569160461426, + "rewards/margins": 0.9554733037948608, + "rewards/rejected": -9.373042106628418, + "semantic_entropy": 0.0028464009519666433, + "step": 2105 + }, + { + "epoch": 1.1292858337514635, + "grad_norm": 17.323667244270915, + "learning_rate": 7.84722128999652e-07, + "logits/chosen": 0.767966091632843, + "logits/rejected": 0.8326314687728882, + "logps/chosen": -8.67313003540039, + "logps/rejected": -9.694366455078125, + "loss": 0.4904, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -8.67313003540039, + "rewards/margins": 1.0212359428405762, + "rewards/rejected": -9.694366455078125, + "semantic_entropy": 0.0024180663749575615, + "step": 2110 + }, + { + "epoch": 1.131961866532865, + "grad_norm": 18.88990514293712, + "learning_rate": 7.834405182497699e-07, + "logits/chosen": 0.8208998441696167, + "logits/rejected": 0.8627697229385376, + "logps/chosen": -8.815618515014648, + "logps/rejected": -9.641824722290039, + "loss": 0.5307, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.815618515014648, + "rewards/margins": 0.826204776763916, + "rewards/rejected": -9.641824722290039, + "semantic_entropy": 0.0024063908495008945, + "step": 2115 + }, + { + "epoch": 1.1346378993142665, + "grad_norm": 22.075179293885157, + "learning_rate": 7.821561575587368e-07, + "logits/chosen": 0.772208571434021, + "logits/rejected": 0.8185374140739441, + "logps/chosen": -8.663274765014648, + "logps/rejected": -9.404105186462402, + "loss": 0.5304, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -8.663274765014648, + "rewards/margins": 0.7408307790756226, + "rewards/rejected": -9.404105186462402, + "semantic_entropy": 0.0030980452429503202, + "step": 2120 + }, + { + "epoch": 1.1373139320956682, + "grad_norm": 16.278122487967558, + "learning_rate": 7.808690593874254e-07, + "logits/chosen": 0.745190441608429, + "logits/rejected": 0.8001850247383118, + "logps/chosen": -8.882969856262207, + "logps/rejected": -9.755064010620117, + "loss": 0.5305, + "rewards/accuracies": 0.6875, + "rewards/chosen": -8.882969856262207, + "rewards/margins": 0.872094452381134, + "rewards/rejected": -9.755064010620117, + "semantic_entropy": 0.0022930700797587633, + "step": 2125 + }, + { + "epoch": 1.1399899648770697, + "grad_norm": 20.293385998629976, + "learning_rate": 7.79579236223268e-07, + "logits/chosen": 0.8587775230407715, + "logits/rejected": 0.9622253179550171, + "logps/chosen": -8.695469856262207, + "logps/rejected": -9.686747550964355, + "loss": 0.4915, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.695469856262207, + "rewards/margins": 0.991279125213623, + "rewards/rejected": -9.686747550964355, + "semantic_entropy": 0.0027502470184117556, + "step": 2130 + }, + { + "epoch": 1.1426659976584714, + "grad_norm": 20.703488040967667, + "learning_rate": 7.782867005801346e-07, + "logits/chosen": 0.765255868434906, + "logits/rejected": 0.8885319828987122, + "logps/chosen": -8.589404106140137, + "logps/rejected": -9.656288146972656, + "loss": 0.4855, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -8.589404106140137, + "rewards/margins": 1.0668823719024658, + "rewards/rejected": -9.656288146972656, + "semantic_entropy": 0.0031393137760460377, + "step": 2135 + }, + { + "epoch": 1.145342030439873, + "grad_norm": 23.797314324550555, + "learning_rate": 7.769914649982117e-07, + "logits/chosen": 0.8055821657180786, + "logits/rejected": 0.8668516874313354, + "logps/chosen": -8.526637077331543, + "logps/rejected": -9.445337295532227, + "loss": 0.4964, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -8.526637077331543, + "rewards/margins": 0.9186998605728149, + "rewards/rejected": -9.445337295532227, + "semantic_entropy": 0.003491030540317297, + "step": 2140 + }, + { + "epoch": 1.1480180632212744, + "grad_norm": 15.411161024417602, + "learning_rate": 7.756935420438803e-07, + "logits/chosen": 0.8090022206306458, + "logits/rejected": 0.8830445408821106, + "logps/chosen": -8.554264068603516, + "logps/rejected": -9.823812484741211, + "loss": 0.4472, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -8.554264068603516, + "rewards/margins": 1.269547462463379, + "rewards/rejected": -9.823812484741211, + "semantic_entropy": 0.0031209487933665514, + "step": 2145 + }, + { + "epoch": 1.1506940960026761, + "grad_norm": 16.571465015989386, + "learning_rate": 7.743929443095951e-07, + "logits/chosen": 0.773921549320221, + "logits/rejected": 0.8259444236755371, + "logps/chosen": -8.577144622802734, + "logps/rejected": -9.52406120300293, + "loss": 0.4723, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -8.577144622802734, + "rewards/margins": 0.9469181895256042, + "rewards/rejected": -9.52406120300293, + "semantic_entropy": 0.0030519163701683283, + "step": 2150 + }, + { + "epoch": 1.1533701287840776, + "grad_norm": 19.150879536804933, + "learning_rate": 7.730896844137609e-07, + "logits/chosen": 0.7496171593666077, + "logits/rejected": 0.8101975321769714, + "logps/chosen": -8.777600288391113, + "logps/rejected": -9.443353652954102, + "loss": 0.5967, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -8.777600288391113, + "rewards/margins": 0.6657532453536987, + "rewards/rejected": -9.443353652954102, + "semantic_entropy": 0.002408596221357584, + "step": 2155 + }, + { + "epoch": 1.1560461615654791, + "grad_norm": 20.084289617276934, + "learning_rate": 7.717837750006106e-07, + "logits/chosen": 0.7996751666069031, + "logits/rejected": 0.8572956919670105, + "logps/chosen": -8.599963188171387, + "logps/rejected": -9.580442428588867, + "loss": 0.5205, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -8.599963188171387, + "rewards/margins": 0.9804786443710327, + "rewards/rejected": -9.580442428588867, + "semantic_entropy": 0.0031619679648429155, + "step": 2160 + }, + { + "epoch": 1.1587221943468808, + "grad_norm": 19.904342457753952, + "learning_rate": 7.704752287400832e-07, + "logits/chosen": 0.7399067282676697, + "logits/rejected": 0.8611429333686829, + "logps/chosen": -8.674205780029297, + "logps/rejected": -9.685873985290527, + "loss": 0.5023, + "rewards/accuracies": 0.78125, + "rewards/chosen": -8.674205780029297, + "rewards/margins": 1.0116674900054932, + "rewards/rejected": -9.685873985290527, + "semantic_entropy": 0.0029065976850688457, + "step": 2165 + }, + { + "epoch": 1.1613982271282823, + "grad_norm": 11.68066385401199, + "learning_rate": 7.691640583277004e-07, + "logits/chosen": 0.8236852884292603, + "logits/rejected": 0.8967201113700867, + "logps/chosen": -8.778889656066895, + "logps/rejected": -9.750692367553711, + "loss": 0.5151, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -8.778889656066895, + "rewards/margins": 0.9718036651611328, + "rewards/rejected": -9.750692367553711, + "semantic_entropy": 0.0026112559717148542, + "step": 2170 + }, + { + "epoch": 1.1640742599096838, + "grad_norm": 13.781442467832825, + "learning_rate": 7.678502764844433e-07, + "logits/chosen": 0.7699551582336426, + "logits/rejected": 0.8938447833061218, + "logps/chosen": -8.977490425109863, + "logps/rejected": -9.788980484008789, + "loss": 0.5165, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.977490425109863, + "rewards/margins": 0.8114897012710571, + "rewards/rejected": -9.788980484008789, + "semantic_entropy": 0.0019960529170930386, + "step": 2175 + }, + { + "epoch": 1.1667502926910855, + "grad_norm": 14.44343492374134, + "learning_rate": 7.665338959566288e-07, + "logits/chosen": 0.8235516548156738, + "logits/rejected": 0.8966760635375977, + "logps/chosen": -9.135007858276367, + "logps/rejected": -10.119426727294922, + "loss": 0.4607, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -9.135007858276367, + "rewards/margins": 0.9844182133674622, + "rewards/rejected": -10.119426727294922, + "semantic_entropy": 0.0018916798289865255, + "step": 2180 + }, + { + "epoch": 1.169426325472487, + "grad_norm": 17.800127012280676, + "learning_rate": 7.652149295157868e-07, + "logits/chosen": 0.8629690408706665, + "logits/rejected": 0.9295064210891724, + "logps/chosen": -9.345608711242676, + "logps/rejected": -10.031997680664062, + "loss": 0.5446, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -9.345608711242676, + "rewards/margins": 0.6863887906074524, + "rewards/rejected": -10.031997680664062, + "semantic_entropy": 0.0015552560798823833, + "step": 2185 + }, + { + "epoch": 1.1721023582538885, + "grad_norm": 22.042952861910784, + "learning_rate": 7.638933899585354e-07, + "logits/chosen": 0.9068318605422974, + "logits/rejected": 0.924595832824707, + "logps/chosen": -9.173646926879883, + "logps/rejected": -10.000932693481445, + "loss": 0.5156, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -9.173646926879883, + "rewards/margins": 0.8272865414619446, + "rewards/rejected": -10.000932693481445, + "semantic_entropy": 0.0017204980831593275, + "step": 2190 + }, + { + "epoch": 1.1747783910352902, + "grad_norm": 18.715280386882995, + "learning_rate": 7.625692901064573e-07, + "logits/chosen": 0.8207842707633972, + "logits/rejected": 0.9047282934188843, + "logps/chosen": -9.131489753723145, + "logps/rejected": -9.951040267944336, + "loss": 0.5302, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -9.131489753723145, + "rewards/margins": 0.8195503950119019, + "rewards/rejected": -9.951040267944336, + "semantic_entropy": 0.0020564752630889416, + "step": 2195 + }, + { + "epoch": 1.1774544238166917, + "grad_norm": 14.79847154131496, + "learning_rate": 7.61242642805975e-07, + "logits/chosen": 0.8447543382644653, + "logits/rejected": 0.8663375973701477, + "logps/chosen": -9.094636917114258, + "logps/rejected": -9.878302574157715, + "loss": 0.534, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -9.094636917114258, + "rewards/margins": 0.7836667895317078, + "rewards/rejected": -9.878302574157715, + "semantic_entropy": 0.0019203886622563004, + "step": 2200 + }, + { + "epoch": 1.1801304565980932, + "grad_norm": 17.15954179734088, + "learning_rate": 7.599134609282266e-07, + "logits/chosen": 0.7871206998825073, + "logits/rejected": 0.8676565289497375, + "logps/chosen": -9.28339672088623, + "logps/rejected": -10.069405555725098, + "loss": 0.5129, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -9.28339672088623, + "rewards/margins": 0.7860093712806702, + "rewards/rejected": -10.069405555725098, + "semantic_entropy": 0.001820198493078351, + "step": 2205 + }, + { + "epoch": 1.182806489379495, + "grad_norm": 24.404592657138192, + "learning_rate": 7.585817573689402e-07, + "logits/chosen": 0.7938421368598938, + "logits/rejected": 0.8801844716072083, + "logps/chosen": -8.840426445007324, + "logps/rejected": -9.785604476928711, + "loss": 0.4784, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.840426445007324, + "rewards/margins": 0.9451776742935181, + "rewards/rejected": -9.785604476928711, + "semantic_entropy": 0.002649650676175952, + "step": 2210 + }, + { + "epoch": 1.1854825221608964, + "grad_norm": 17.132762998778745, + "learning_rate": 7.572475450483098e-07, + "logits/chosen": 0.7745561003684998, + "logits/rejected": 0.8122493624687195, + "logps/chosen": -8.980504035949707, + "logps/rejected": -9.769770622253418, + "loss": 0.5316, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.980504035949707, + "rewards/margins": 0.7892670035362244, + "rewards/rejected": -9.769770622253418, + "semantic_entropy": 0.0022570898290723562, + "step": 2215 + }, + { + "epoch": 1.188158554942298, + "grad_norm": 20.513738139152867, + "learning_rate": 7.559108369108689e-07, + "logits/chosen": 0.7253280878067017, + "logits/rejected": 0.7848079204559326, + "logps/chosen": -8.66881275177002, + "logps/rejected": -9.506206512451172, + "loss": 0.5316, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -8.66881275177002, + "rewards/margins": 0.8373939394950867, + "rewards/rejected": -9.506206512451172, + "semantic_entropy": 0.0028397340793162584, + "step": 2220 + }, + { + "epoch": 1.1908345877236997, + "grad_norm": 13.082835254565163, + "learning_rate": 7.54571645925366e-07, + "logits/chosen": 0.6793020367622375, + "logits/rejected": 0.8425678014755249, + "logps/chosen": -8.629182815551758, + "logps/rejected": -9.746764183044434, + "loss": 0.4487, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -8.629182815551758, + "rewards/margins": 1.1175806522369385, + "rewards/rejected": -9.746764183044434, + "semantic_entropy": 0.003014157759025693, + "step": 2225 + }, + { + "epoch": 1.1935106205051011, + "grad_norm": 15.319269039008326, + "learning_rate": 7.532299850846378e-07, + "logits/chosen": 0.6559053659439087, + "logits/rejected": 0.7742191553115845, + "logps/chosen": -8.408263206481934, + "logps/rejected": -9.492993354797363, + "loss": 0.4948, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -8.408263206481934, + "rewards/margins": 1.084729790687561, + "rewards/rejected": -9.492993354797363, + "semantic_entropy": 0.0036600581370294094, + "step": 2230 + }, + { + "epoch": 1.1961866532865026, + "grad_norm": 22.81260636310479, + "learning_rate": 7.518858674054838e-07, + "logits/chosen": 0.6717875003814697, + "logits/rejected": 0.8029670715332031, + "logps/chosen": -8.644887924194336, + "logps/rejected": -9.598337173461914, + "loss": 0.5115, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.644887924194336, + "rewards/margins": 0.9534481763839722, + "rewards/rejected": -9.598337173461914, + "semantic_entropy": 0.002926050452515483, + "step": 2235 + }, + { + "epoch": 1.1988626860679044, + "grad_norm": 17.071928193449306, + "learning_rate": 7.505393059285394e-07, + "logits/chosen": 0.6294863224029541, + "logits/rejected": 0.7521852254867554, + "logps/chosen": -8.822403907775879, + "logps/rejected": -9.73637866973877, + "loss": 0.5241, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -8.822403907775879, + "rewards/margins": 0.9139748811721802, + "rewards/rejected": -9.73637866973877, + "semantic_entropy": 0.003058222122490406, + "step": 2240 + }, + { + "epoch": 1.2015387188493059, + "grad_norm": 21.59100967195749, + "learning_rate": 7.491903137181501e-07, + "logits/chosen": 0.6673406362533569, + "logits/rejected": 0.6980730295181274, + "logps/chosen": -8.757534980773926, + "logps/rejected": -9.63608169555664, + "loss": 0.4955, + "rewards/accuracies": 0.78125, + "rewards/chosen": -8.757534980773926, + "rewards/margins": 0.8785461187362671, + "rewards/rejected": -9.63608169555664, + "semantic_entropy": 0.003113445593044162, + "step": 2245 + }, + { + "epoch": 1.2042147516307076, + "grad_norm": 17.12424676715307, + "learning_rate": 7.478389038622441e-07, + "logits/chosen": 0.6984297633171082, + "logits/rejected": 0.7338518500328064, + "logps/chosen": -8.893332481384277, + "logps/rejected": -9.793367385864258, + "loss": 0.527, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -8.893332481384277, + "rewards/margins": 0.9000345468521118, + "rewards/rejected": -9.793367385864258, + "semantic_entropy": 0.002758896443992853, + "step": 2250 + }, + { + "epoch": 1.206890784412109, + "grad_norm": 26.22447421233564, + "learning_rate": 7.46485089472206e-07, + "logits/chosen": 0.6646834015846252, + "logits/rejected": 0.7228942513465881, + "logps/chosen": -8.950407028198242, + "logps/rejected": -9.782625198364258, + "loss": 0.5624, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -8.950407028198242, + "rewards/margins": 0.8322180509567261, + "rewards/rejected": -9.782625198364258, + "semantic_entropy": 0.0024748151190578938, + "step": 2255 + }, + { + "epoch": 1.2095668171935106, + "grad_norm": 17.893944580761662, + "learning_rate": 7.451288836827487e-07, + "logits/chosen": 0.7343819737434387, + "logits/rejected": 0.763200044631958, + "logps/chosen": -8.684735298156738, + "logps/rejected": -9.369165420532227, + "loss": 0.5689, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -8.684735298156738, + "rewards/margins": 0.6844292283058167, + "rewards/rejected": -9.369165420532227, + "semantic_entropy": 0.003174789249897003, + "step": 2260 + }, + { + "epoch": 1.2122428499749123, + "grad_norm": 15.807684902195147, + "learning_rate": 7.437702996517869e-07, + "logits/chosen": 0.6750258207321167, + "logits/rejected": 0.7452644109725952, + "logps/chosen": -8.592541694641113, + "logps/rejected": -9.485219955444336, + "loss": 0.5089, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -8.592541694641113, + "rewards/margins": 0.8926795721054077, + "rewards/rejected": -9.485219955444336, + "semantic_entropy": 0.0034456239081919193, + "step": 2265 + }, + { + "epoch": 1.2149188827563138, + "grad_norm": 18.6126390663135, + "learning_rate": 7.424093505603087e-07, + "logits/chosen": 0.6281952857971191, + "logits/rejected": 0.7401161789894104, + "logps/chosen": -8.631272315979004, + "logps/rejected": -9.659812927246094, + "loss": 0.4665, + "rewards/accuracies": 0.78125, + "rewards/chosen": -8.631272315979004, + "rewards/margins": 1.0285407304763794, + "rewards/rejected": -9.659812927246094, + "semantic_entropy": 0.0035267819184809923, + "step": 2270 + }, + { + "epoch": 1.2175949155377153, + "grad_norm": 18.495386475386976, + "learning_rate": 7.410460496122482e-07, + "logits/chosen": 0.6814571619033813, + "logits/rejected": 0.7883174419403076, + "logps/chosen": -8.451251983642578, + "logps/rejected": -9.589815139770508, + "loss": 0.4347, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -8.451251983642578, + "rewards/margins": 1.1385620832443237, + "rewards/rejected": -9.589815139770508, + "semantic_entropy": 0.0035903877578675747, + "step": 2275 + }, + { + "epoch": 1.220270948319117, + "grad_norm": 20.866364773443532, + "learning_rate": 7.396804100343572e-07, + "logits/chosen": 0.6894387602806091, + "logits/rejected": 0.7951668500900269, + "logps/chosen": -8.350536346435547, + "logps/rejected": -9.263737678527832, + "loss": 0.492, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -8.350536346435547, + "rewards/margins": 0.9132000207901001, + "rewards/rejected": -9.263737678527832, + "semantic_entropy": 0.003823335049673915, + "step": 2280 + }, + { + "epoch": 1.2229469811005185, + "grad_norm": 11.808941649198584, + "learning_rate": 7.383124450760768e-07, + "logits/chosen": 0.7374765276908875, + "logits/rejected": 0.8545964956283569, + "logps/chosen": -8.481078147888184, + "logps/rejected": -9.478879928588867, + "loss": 0.4777, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -8.481078147888184, + "rewards/margins": 0.9978022575378418, + "rewards/rejected": -9.478879928588867, + "semantic_entropy": 0.003645769553259015, + "step": 2285 + }, + { + "epoch": 1.22562301388192, + "grad_norm": 17.775252169557806, + "learning_rate": 7.369421680094091e-07, + "logits/chosen": 0.6624468564987183, + "logits/rejected": 0.7552576661109924, + "logps/chosen": -8.490701675415039, + "logps/rejected": -9.470184326171875, + "loss": 0.5227, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.490701675415039, + "rewards/margins": 0.9794837832450867, + "rewards/rejected": -9.470184326171875, + "semantic_entropy": 0.0034333504736423492, + "step": 2290 + }, + { + "epoch": 1.2282990466633217, + "grad_norm": 23.97776160790946, + "learning_rate": 7.355695921287881e-07, + "logits/chosen": 0.6835793256759644, + "logits/rejected": 0.7225985527038574, + "logps/chosen": -8.687114715576172, + "logps/rejected": -9.38883113861084, + "loss": 0.6041, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -8.687114715576172, + "rewards/margins": 0.7017166018486023, + "rewards/rejected": -9.38883113861084, + "semantic_entropy": 0.003160933731123805, + "step": 2295 + }, + { + "epoch": 1.2309750794447232, + "grad_norm": 21.69249472023334, + "learning_rate": 7.341947307509513e-07, + "logits/chosen": 0.7158384919166565, + "logits/rejected": 0.8074569702148438, + "logps/chosen": -8.510350227355957, + "logps/rejected": -9.450441360473633, + "loss": 0.5061, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -8.510350227355957, + "rewards/margins": 0.940090537071228, + "rewards/rejected": -9.450441360473633, + "semantic_entropy": 0.00320886867120862, + "step": 2300 + }, + { + "epoch": 1.233651112226125, + "grad_norm": 17.13076671528079, + "learning_rate": 7.328175972148094e-07, + "logits/chosen": 0.7047310471534729, + "logits/rejected": 0.7689910531044006, + "logps/chosen": -8.937776565551758, + "logps/rejected": -9.842035293579102, + "loss": 0.5066, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -8.937776565551758, + "rewards/margins": 0.9042595624923706, + "rewards/rejected": -9.842035293579102, + "semantic_entropy": 0.0021909018978476524, + "step": 2305 + }, + { + "epoch": 1.2363271450075264, + "grad_norm": 22.05914195430034, + "learning_rate": 7.314382048813185e-07, + "logits/chosen": 0.7231523394584656, + "logits/rejected": 0.8367801904678345, + "logps/chosen": -8.771172523498535, + "logps/rejected": -9.783547401428223, + "loss": 0.4775, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -8.771172523498535, + "rewards/margins": 1.0123744010925293, + "rewards/rejected": -9.783547401428223, + "semantic_entropy": 0.0027366154827177525, + "step": 2310 + }, + { + "epoch": 1.2390031777889279, + "grad_norm": 15.488052133555222, + "learning_rate": 7.300565671333486e-07, + "logits/chosen": 0.6668115854263306, + "logits/rejected": 0.7803434133529663, + "logps/chosen": -8.952492713928223, + "logps/rejected": -9.73788070678711, + "loss": 0.5417, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.952492713928223, + "rewards/margins": 0.7853885293006897, + "rewards/rejected": -9.73788070678711, + "semantic_entropy": 0.002661502454429865, + "step": 2315 + }, + { + "epoch": 1.2416792105703296, + "grad_norm": 15.301361368412175, + "learning_rate": 7.286726973755554e-07, + "logits/chosen": 0.7436283826828003, + "logits/rejected": 0.783458411693573, + "logps/chosen": -8.722562789916992, + "logps/rejected": -9.623026847839355, + "loss": 0.4961, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -8.722562789916992, + "rewards/margins": 0.9004641771316528, + "rewards/rejected": -9.623026847839355, + "semantic_entropy": 0.0026164718437939882, + "step": 2320 + }, + { + "epoch": 1.244355243351731, + "grad_norm": 18.344895783311472, + "learning_rate": 7.272866090342493e-07, + "logits/chosen": 0.7868816256523132, + "logits/rejected": 0.8121258020401001, + "logps/chosen": -8.369720458984375, + "logps/rejected": -9.339384078979492, + "loss": 0.4349, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -8.369720458984375, + "rewards/margins": 0.9696633219718933, + "rewards/rejected": -9.339384078979492, + "semantic_entropy": 0.004334195517003536, + "step": 2325 + }, + { + "epoch": 1.2470312761331326, + "grad_norm": 20.284036778511826, + "learning_rate": 7.258983155572656e-07, + "logits/chosen": 0.662312388420105, + "logits/rejected": 0.7393311262130737, + "logps/chosen": -8.260697364807129, + "logps/rejected": -9.110601425170898, + "loss": 0.5587, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -8.260697364807129, + "rewards/margins": 0.8499045372009277, + "rewards/rejected": -9.110601425170898, + "semantic_entropy": 0.0039521572180092335, + "step": 2330 + }, + { + "epoch": 1.2497073089145343, + "grad_norm": 13.687544225959545, + "learning_rate": 7.245078304138335e-07, + "logits/chosen": 0.695865273475647, + "logits/rejected": 0.759333074092865, + "logps/chosen": -8.318536758422852, + "logps/rejected": -9.270764350891113, + "loss": 0.4915, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.318536758422852, + "rewards/margins": 0.9522277116775513, + "rewards/rejected": -9.270764350891113, + "semantic_entropy": 0.003750443458557129, + "step": 2335 + }, + { + "epoch": 1.2523833416959358, + "grad_norm": 14.99477345835548, + "learning_rate": 7.231151670944462e-07, + "logits/chosen": 0.5629149079322815, + "logits/rejected": 0.659963846206665, + "logps/chosen": -8.367746353149414, + "logps/rejected": -9.242141723632812, + "loss": 0.5076, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -8.367746353149414, + "rewards/margins": 0.8743956685066223, + "rewards/rejected": -9.242141723632812, + "semantic_entropy": 0.0034209941513836384, + "step": 2340 + }, + { + "epoch": 1.2550593744773373, + "grad_norm": 14.622457962908232, + "learning_rate": 7.217203391107291e-07, + "logits/chosen": 0.6555184721946716, + "logits/rejected": 0.7649224996566772, + "logps/chosen": -8.27055549621582, + "logps/rejected": -9.224145889282227, + "loss": 0.5084, + "rewards/accuracies": 0.78125, + "rewards/chosen": -8.27055549621582, + "rewards/margins": 0.9535905122756958, + "rewards/rejected": -9.224145889282227, + "semantic_entropy": 0.0038777173031121492, + "step": 2345 + }, + { + "epoch": 1.257735407258739, + "grad_norm": 18.63345646684996, + "learning_rate": 7.203233599953096e-07, + "logits/chosen": 0.6671181917190552, + "logits/rejected": 0.7599374651908875, + "logps/chosen": -8.387980461120605, + "logps/rejected": -9.266815185546875, + "loss": 0.4867, + "rewards/accuracies": 0.78125, + "rewards/chosen": -8.387980461120605, + "rewards/margins": 0.8788350820541382, + "rewards/rejected": -9.266815185546875, + "semantic_entropy": 0.0032208203338086605, + "step": 2350 + }, + { + "epoch": 1.2604114400401405, + "grad_norm": 16.89717027672504, + "learning_rate": 7.189242433016852e-07, + "logits/chosen": 0.685912013053894, + "logits/rejected": 0.7816206812858582, + "logps/chosen": -8.186650276184082, + "logps/rejected": -9.200610160827637, + "loss": 0.4687, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -8.186650276184082, + "rewards/margins": 1.0139598846435547, + "rewards/rejected": -9.200610160827637, + "semantic_entropy": 0.004348042421042919, + "step": 2355 + }, + { + "epoch": 1.263087472821542, + "grad_norm": 16.86807483534206, + "learning_rate": 7.17523002604092e-07, + "logits/chosen": 0.6663065552711487, + "logits/rejected": 0.7573191523551941, + "logps/chosen": -8.505255699157715, + "logps/rejected": -9.44536304473877, + "loss": 0.4819, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -8.505255699157715, + "rewards/margins": 0.940106987953186, + "rewards/rejected": -9.44536304473877, + "semantic_entropy": 0.0034768693149089813, + "step": 2360 + }, + { + "epoch": 1.2657635056029437, + "grad_norm": 17.687247649811177, + "learning_rate": 7.161196514973734e-07, + "logits/chosen": 0.7061843276023865, + "logits/rejected": 0.7796521186828613, + "logps/chosen": -8.41321086883545, + "logps/rejected": -9.373991012573242, + "loss": 0.5037, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -8.41321086883545, + "rewards/margins": 0.9607791900634766, + "rewards/rejected": -9.373991012573242, + "semantic_entropy": 0.0037853557150810957, + "step": 2365 + }, + { + "epoch": 1.2684395383843452, + "grad_norm": 19.15884763927205, + "learning_rate": 7.147142035968483e-07, + "logits/chosen": 0.7049607038497925, + "logits/rejected": 0.8010439872741699, + "logps/chosen": -8.644028663635254, + "logps/rejected": -9.527328491210938, + "loss": 0.4998, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.644028663635254, + "rewards/margins": 0.8833004832267761, + "rewards/rejected": -9.527328491210938, + "semantic_entropy": 0.0030520078726112843, + "step": 2370 + }, + { + "epoch": 1.2711155711657467, + "grad_norm": 16.73014781307649, + "learning_rate": 7.133066725381781e-07, + "logits/chosen": 0.637940526008606, + "logits/rejected": 0.7165664434432983, + "logps/chosen": -8.474000930786133, + "logps/rejected": -9.344751358032227, + "loss": 0.5156, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -8.474000930786133, + "rewards/margins": 0.8707484006881714, + "rewards/rejected": -9.344751358032227, + "semantic_entropy": 0.003291874658316374, + "step": 2375 + }, + { + "epoch": 1.2737916039471484, + "grad_norm": 20.729681567322082, + "learning_rate": 7.118970719772354e-07, + "logits/chosen": 0.6396089792251587, + "logits/rejected": 0.747488796710968, + "logps/chosen": -8.582317352294922, + "logps/rejected": -9.629173278808594, + "loss": 0.4984, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -8.582317352294922, + "rewards/margins": 1.0468562841415405, + "rewards/rejected": -9.629173278808594, + "semantic_entropy": 0.0034858197905123234, + "step": 2380 + }, + { + "epoch": 1.27646763672855, + "grad_norm": 19.291131856489386, + "learning_rate": 7.104854155899711e-07, + "logits/chosen": 0.6974250078201294, + "logits/rejected": 0.7831848859786987, + "logps/chosen": -8.711091041564941, + "logps/rejected": -9.662050247192383, + "loss": 0.5122, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -8.711091041564941, + "rewards/margins": 0.9509603381156921, + "rewards/rejected": -9.662050247192383, + "semantic_entropy": 0.0031486363150179386, + "step": 2385 + }, + { + "epoch": 1.2791436695099514, + "grad_norm": 18.66005580137364, + "learning_rate": 7.090717170722817e-07, + "logits/chosen": 0.6889894008636475, + "logits/rejected": 0.7326347231864929, + "logps/chosen": -8.706632614135742, + "logps/rejected": -9.896500587463379, + "loss": 0.4453, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -8.706632614135742, + "rewards/margins": 1.1898666620254517, + "rewards/rejected": -9.896500587463379, + "semantic_entropy": 0.002780457027256489, + "step": 2390 + }, + { + "epoch": 1.2818197022913531, + "grad_norm": 22.671781708487075, + "learning_rate": 7.076559901398762e-07, + "logits/chosen": 0.6582309603691101, + "logits/rejected": 0.7270200252532959, + "logps/chosen": -8.679912567138672, + "logps/rejected": -9.480433464050293, + "loss": 0.5314, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -8.679912567138672, + "rewards/margins": 0.8005210161209106, + "rewards/rejected": -9.480433464050293, + "semantic_entropy": 0.002767809433862567, + "step": 2395 + }, + { + "epoch": 1.2844957350727546, + "grad_norm": 22.234310993156075, + "learning_rate": 7.062382485281436e-07, + "logits/chosen": 0.6792951822280884, + "logits/rejected": 0.7309907674789429, + "logps/chosen": -8.538030624389648, + "logps/rejected": -9.394686698913574, + "loss": 0.5261, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.538030624389648, + "rewards/margins": 0.8566561937332153, + "rewards/rejected": -9.394686698913574, + "semantic_entropy": 0.0033909387420862913, + "step": 2400 + }, + { + "epoch": 1.2844957350727546, + "eval_logits/chosen": 0.8011821508407593, + "eval_logits/rejected": 0.872314453125, + "eval_logps/chosen": -8.710298538208008, + "eval_logps/rejected": -9.651128768920898, + "eval_loss": 0.5312913060188293, + "eval_rewards/accuracies": 0.7136498689651489, + "eval_rewards/chosen": -8.710298538208008, + "eval_rewards/margins": 0.9408305883407593, + "eval_rewards/rejected": -9.651128768920898, + "eval_runtime": 34.8607, + "eval_samples_per_second": 38.582, + "eval_semantic_entropy": 0.002928712172433734, + "eval_steps_per_second": 9.667, + "step": 2400 + }, + { + "epoch": 1.287171767854156, + "grad_norm": 14.920939180127396, + "learning_rate": 7.048185059920193e-07, + "logits/chosen": 0.6384707093238831, + "logits/rejected": 0.7600412368774414, + "logps/chosen": -8.579252243041992, + "logps/rejected": -9.70583724975586, + "loss": 0.4806, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.579252243041992, + "rewards/margins": 1.1265841722488403, + "rewards/rejected": -9.70583724975586, + "semantic_entropy": 0.0032008637208491564, + "step": 2405 + }, + { + "epoch": 1.2898478006355578, + "grad_norm": 18.47727128635857, + "learning_rate": 7.033967763058516e-07, + "logits/chosen": 0.5698826313018799, + "logits/rejected": 0.6842392683029175, + "logps/chosen": -8.608453750610352, + "logps/rejected": -9.420400619506836, + "loss": 0.5163, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -8.608453750610352, + "rewards/margins": 0.8119487762451172, + "rewards/rejected": -9.420400619506836, + "semantic_entropy": 0.0028821511659771204, + "step": 2410 + }, + { + "epoch": 1.2925238334169593, + "grad_norm": 16.15173827430261, + "learning_rate": 7.019730732632681e-07, + "logits/chosen": 0.6563664078712463, + "logits/rejected": 0.7400893568992615, + "logps/chosen": -8.490577697753906, + "logps/rejected": -9.582011222839355, + "loss": 0.4587, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.490577697753906, + "rewards/margins": 1.091435194015503, + "rewards/rejected": -9.582011222839355, + "semantic_entropy": 0.003659659530967474, + "step": 2415 + }, + { + "epoch": 1.2951998661983608, + "grad_norm": 20.007292439163773, + "learning_rate": 7.005474106770418e-07, + "logits/chosen": 0.57745361328125, + "logits/rejected": 0.6826112866401672, + "logps/chosen": -8.502618789672852, + "logps/rejected": -9.526583671569824, + "loss": 0.5005, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -8.502618789672852, + "rewards/margins": 1.0239640474319458, + "rewards/rejected": -9.526583671569824, + "semantic_entropy": 0.0039854454807937145, + "step": 2420 + }, + { + "epoch": 1.2978758989797625, + "grad_norm": 16.126086508254385, + "learning_rate": 6.991198023789577e-07, + "logits/chosen": 0.6350833177566528, + "logits/rejected": 0.7082042098045349, + "logps/chosen": -8.247810363769531, + "logps/rejected": -9.1130952835083, + "loss": 0.5063, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -8.247810363769531, + "rewards/margins": 0.8652847409248352, + "rewards/rejected": -9.1130952835083, + "semantic_entropy": 0.0047439588233828545, + "step": 2425 + }, + { + "epoch": 1.300551931761164, + "grad_norm": 23.717409436049728, + "learning_rate": 6.976902622196776e-07, + "logits/chosen": 0.5765770077705383, + "logits/rejected": 0.6420444250106812, + "logps/chosen": -8.346854209899902, + "logps/rejected": -9.272150993347168, + "loss": 0.5377, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.346854209899902, + "rewards/margins": 0.9252961277961731, + "rewards/rejected": -9.272150993347168, + "semantic_entropy": 0.003574087517336011, + "step": 2430 + }, + { + "epoch": 1.3032279645425655, + "grad_norm": 17.97651242704695, + "learning_rate": 6.962588040686064e-07, + "logits/chosen": 0.5552124381065369, + "logits/rejected": 0.658178448677063, + "logps/chosen": -8.291497230529785, + "logps/rejected": -9.098273277282715, + "loss": 0.5842, + "rewards/accuracies": 0.6875, + "rewards/chosen": -8.291497230529785, + "rewards/margins": 0.806775689125061, + "rewards/rejected": -9.098273277282715, + "semantic_entropy": 0.004253287799656391, + "step": 2435 + }, + { + "epoch": 1.3059039973239672, + "grad_norm": 20.217954122529044, + "learning_rate": 6.948254418137573e-07, + "logits/chosen": 0.5669525861740112, + "logits/rejected": 0.6433640718460083, + "logps/chosen": -8.215181350708008, + "logps/rejected": -9.121828079223633, + "loss": 0.5425, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -8.215181350708008, + "rewards/margins": 0.9066460728645325, + "rewards/rejected": -9.121828079223633, + "semantic_entropy": 0.004315282683819532, + "step": 2440 + }, + { + "epoch": 1.3085800301053687, + "grad_norm": 21.969818944329187, + "learning_rate": 6.933901893616174e-07, + "logits/chosen": 0.5023918151855469, + "logits/rejected": 0.614323616027832, + "logps/chosen": -8.214559555053711, + "logps/rejected": -9.092178344726562, + "loss": 0.5167, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -8.214559555053711, + "rewards/margins": 0.8776181936264038, + "rewards/rejected": -9.092178344726562, + "semantic_entropy": 0.004376448690891266, + "step": 2445 + }, + { + "epoch": 1.3112560628867704, + "grad_norm": 21.680469755063655, + "learning_rate": 6.919530606370121e-07, + "logits/chosen": 0.48196372389793396, + "logits/rejected": 0.5732806921005249, + "logps/chosen": -8.17034912109375, + "logps/rejected": -9.072924613952637, + "loss": 0.5107, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -8.17034912109375, + "rewards/margins": 0.9025766253471375, + "rewards/rejected": -9.072924613952637, + "semantic_entropy": 0.004180104471743107, + "step": 2450 + }, + { + "epoch": 1.313932095668172, + "grad_norm": 14.07946177566356, + "learning_rate": 6.905140695829706e-07, + "logits/chosen": 0.47136348485946655, + "logits/rejected": 0.6471112370491028, + "logps/chosen": -8.491350173950195, + "logps/rejected": -9.42007827758789, + "loss": 0.4935, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -8.491350173950195, + "rewards/margins": 0.9287282228469849, + "rewards/rejected": -9.42007827758789, + "semantic_entropy": 0.003645123215392232, + "step": 2455 + }, + { + "epoch": 1.3166081284495736, + "grad_norm": 23.554632267605992, + "learning_rate": 6.890732301605904e-07, + "logits/chosen": 0.5830351114273071, + "logits/rejected": 0.6560341119766235, + "logps/chosen": -8.401416778564453, + "logps/rejected": -9.3002290725708, + "loss": 0.5216, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -8.401416778564453, + "rewards/margins": 0.8988133668899536, + "rewards/rejected": -9.3002290725708, + "semantic_entropy": 0.0037794325035065413, + "step": 2460 + }, + { + "epoch": 1.3192841612309751, + "grad_norm": 18.146924951726284, + "learning_rate": 6.876305563489021e-07, + "logits/chosen": 0.5521279573440552, + "logits/rejected": 0.6386257410049438, + "logps/chosen": -8.719072341918945, + "logps/rejected": -9.739156723022461, + "loss": 0.4651, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -8.719072341918945, + "rewards/margins": 1.0200841426849365, + "rewards/rejected": -9.739156723022461, + "semantic_entropy": 0.002688236068934202, + "step": 2465 + }, + { + "epoch": 1.3219601940123766, + "grad_norm": 20.103380904418128, + "learning_rate": 6.861860621447331e-07, + "logits/chosen": 0.5402216911315918, + "logits/rejected": 0.6326644420623779, + "logps/chosen": -8.76352596282959, + "logps/rejected": -9.569908142089844, + "loss": 0.5324, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.76352596282959, + "rewards/margins": 0.8063834309577942, + "rewards/rejected": -9.569908142089844, + "semantic_entropy": 0.0027508633211255074, + "step": 2470 + }, + { + "epoch": 1.3246362267937783, + "grad_norm": 19.93100548564064, + "learning_rate": 6.847397615625725e-07, + "logits/chosen": 0.6381164789199829, + "logits/rejected": 0.6684954762458801, + "logps/chosen": -8.71910572052002, + "logps/rejected": -9.549886703491211, + "loss": 0.5264, + "rewards/accuracies": 0.71875, + "rewards/chosen": -8.71910572052002, + "rewards/margins": 0.8307819366455078, + "rewards/rejected": -9.549886703491211, + "semantic_entropy": 0.0028809071518480778, + "step": 2475 + }, + { + "epoch": 1.3273122595751798, + "grad_norm": 15.7435684829995, + "learning_rate": 6.83291668634435e-07, + "logits/chosen": 0.6417192220687866, + "logits/rejected": 0.7400007843971252, + "logps/chosen": -8.722017288208008, + "logps/rejected": -9.792933464050293, + "loss": 0.4668, + "rewards/accuracies": 0.78125, + "rewards/chosen": -8.722017288208008, + "rewards/margins": 1.0709177255630493, + "rewards/rejected": -9.792933464050293, + "semantic_entropy": 0.003136158687993884, + "step": 2480 + }, + { + "epoch": 1.3299882923565813, + "grad_norm": 19.570820563611054, + "learning_rate": 6.818417974097246e-07, + "logits/chosen": 0.7284759283065796, + "logits/rejected": 0.8398087620735168, + "logps/chosen": -8.61033821105957, + "logps/rejected": -9.802716255187988, + "loss": 0.4663, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.61033821105957, + "rewards/margins": 1.1923778057098389, + "rewards/rejected": -9.802716255187988, + "semantic_entropy": 0.003474020166322589, + "step": 2485 + }, + { + "epoch": 1.332664325137983, + "grad_norm": 19.36403331204118, + "learning_rate": 6.803901619550981e-07, + "logits/chosen": 0.6692131757736206, + "logits/rejected": 0.7076988220214844, + "logps/chosen": -8.740633964538574, + "logps/rejected": -9.593387603759766, + "loss": 0.504, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -8.740633964538574, + "rewards/margins": 0.8527532815933228, + "rewards/rejected": -9.593387603759766, + "semantic_entropy": 0.0032423834782093763, + "step": 2490 + }, + { + "epoch": 1.3353403579193845, + "grad_norm": 18.303427693985547, + "learning_rate": 6.789367763543292e-07, + "logits/chosen": 0.7160294651985168, + "logits/rejected": 0.7507287859916687, + "logps/chosen": -8.617566108703613, + "logps/rejected": -9.467153549194336, + "loss": 0.5475, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -8.617566108703613, + "rewards/margins": 0.8495874404907227, + "rewards/rejected": -9.467153549194336, + "semantic_entropy": 0.003361668437719345, + "step": 2495 + }, + { + "epoch": 1.338016390700786, + "grad_norm": 23.850604033392393, + "learning_rate": 6.774816547081714e-07, + "logits/chosen": 0.6444199681282043, + "logits/rejected": 0.7431300282478333, + "logps/chosen": -8.590035438537598, + "logps/rejected": -9.325902938842773, + "loss": 0.5461, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -8.590035438537598, + "rewards/margins": 0.7358676791191101, + "rewards/rejected": -9.325902938842773, + "semantic_entropy": 0.0030975653789937496, + "step": 2500 + }, + { + "epoch": 1.3406924234821878, + "grad_norm": 18.77168242134909, + "learning_rate": 6.760248111342211e-07, + "logits/chosen": 0.6908949017524719, + "logits/rejected": 0.7892520427703857, + "logps/chosen": -8.384283065795898, + "logps/rejected": -9.46776008605957, + "loss": 0.468, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -8.384283065795898, + "rewards/margins": 1.0834753513336182, + "rewards/rejected": -9.46776008605957, + "semantic_entropy": 0.003537180367857218, + "step": 2505 + }, + { + "epoch": 1.3433684562635893, + "grad_norm": 18.137558397174647, + "learning_rate": 6.745662597667813e-07, + "logits/chosen": 0.6804380416870117, + "logits/rejected": 0.7819782495498657, + "logps/chosen": -8.316720962524414, + "logps/rejected": -9.37002944946289, + "loss": 0.4474, + "rewards/accuracies": 0.78125, + "rewards/chosen": -8.316720962524414, + "rewards/margins": 1.0533078908920288, + "rewards/rejected": -9.37002944946289, + "semantic_entropy": 0.0034739505499601364, + "step": 2510 + }, + { + "epoch": 1.3460444890449907, + "grad_norm": 15.709292621392008, + "learning_rate": 6.731060147567236e-07, + "logits/chosen": 0.7852478623390198, + "logits/rejected": 0.8401540517807007, + "logps/chosen": -8.328946113586426, + "logps/rejected": -9.2665433883667, + "loss": 0.4953, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.328946113586426, + "rewards/margins": 0.937597393989563, + "rewards/rejected": -9.2665433883667, + "semantic_entropy": 0.003876983653753996, + "step": 2515 + }, + { + "epoch": 1.3487205218263925, + "grad_norm": 16.260852565852623, + "learning_rate": 6.716440902713515e-07, + "logits/chosen": 0.720770001411438, + "logits/rejected": 0.7988892793655396, + "logps/chosen": -8.437568664550781, + "logps/rejected": -9.330516815185547, + "loss": 0.4806, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -8.437568664550781, + "rewards/margins": 0.8929487466812134, + "rewards/rejected": -9.330516815185547, + "semantic_entropy": 0.0032981105614453554, + "step": 2520 + }, + { + "epoch": 1.351396554607794, + "grad_norm": 20.213224180218113, + "learning_rate": 6.701805004942627e-07, + "logits/chosen": 0.7619292140007019, + "logits/rejected": 0.8122636079788208, + "logps/chosen": -8.559782981872559, + "logps/rejected": -9.494768142700195, + "loss": 0.5013, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -8.559782981872559, + "rewards/margins": 0.9349856376647949, + "rewards/rejected": -9.494768142700195, + "semantic_entropy": 0.0034507550299167633, + "step": 2525 + }, + { + "epoch": 1.3540725873891954, + "grad_norm": 23.935038029395674, + "learning_rate": 6.687152596252119e-07, + "logits/chosen": 0.8029264211654663, + "logits/rejected": 0.8429144620895386, + "logps/chosen": -8.917330741882324, + "logps/rejected": -9.750707626342773, + "loss": 0.5555, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -8.917330741882324, + "rewards/margins": 0.8333770632743835, + "rewards/rejected": -9.750707626342773, + "semantic_entropy": 0.0024835984222590923, + "step": 2530 + }, + { + "epoch": 1.3567486201705972, + "grad_norm": 26.97837329749874, + "learning_rate": 6.672483818799722e-07, + "logits/chosen": 0.756155788898468, + "logits/rejected": 0.8390409350395203, + "logps/chosen": -9.110207557678223, + "logps/rejected": -9.918048858642578, + "loss": 0.5293, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -9.110207557678223, + "rewards/margins": 0.8078413009643555, + "rewards/rejected": -9.918048858642578, + "semantic_entropy": 0.0021289088763296604, + "step": 2535 + }, + { + "epoch": 1.3594246529519987, + "grad_norm": 17.786890203549685, + "learning_rate": 6.657798814901978e-07, + "logits/chosen": 0.7632160186767578, + "logits/rejected": 0.8699263334274292, + "logps/chosen": -9.151754379272461, + "logps/rejected": -10.03592586517334, + "loss": 0.499, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.151754379272461, + "rewards/margins": 0.884171187877655, + "rewards/rejected": -10.03592586517334, + "semantic_entropy": 0.0021057447884231806, + "step": 2540 + }, + { + "epoch": 1.3621006857334002, + "grad_norm": 20.119224807141038, + "learning_rate": 6.643097727032863e-07, + "logits/chosen": 0.7189488410949707, + "logits/rejected": 0.8399428129196167, + "logps/chosen": -9.119746208190918, + "logps/rejected": -10.26237964630127, + "loss": 0.4471, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -9.119746208190918, + "rewards/margins": 1.1426328420639038, + "rewards/rejected": -10.26237964630127, + "semantic_entropy": 0.0021095951087772846, + "step": 2545 + }, + { + "epoch": 1.3647767185148019, + "grad_norm": 20.813656956825653, + "learning_rate": 6.628380697822392e-07, + "logits/chosen": 0.7267470955848694, + "logits/rejected": 0.823375403881073, + "logps/chosen": -9.255183219909668, + "logps/rejected": -9.974876403808594, + "loss": 0.5697, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -9.255183219909668, + "rewards/margins": 0.7196929454803467, + "rewards/rejected": -9.974876403808594, + "semantic_entropy": 0.0019825948402285576, + "step": 2550 + }, + { + "epoch": 1.3674527512962034, + "grad_norm": 21.495996998491723, + "learning_rate": 6.61364787005525e-07, + "logits/chosen": 0.7436253428459167, + "logits/rejected": 0.8189195394515991, + "logps/chosen": -8.933206558227539, + "logps/rejected": -10.065174102783203, + "loss": 0.4548, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -8.933206558227539, + "rewards/margins": 1.1319692134857178, + "rewards/rejected": -10.065174102783203, + "semantic_entropy": 0.0025200708769261837, + "step": 2555 + }, + { + "epoch": 1.3701287840776049, + "grad_norm": 20.492917918741757, + "learning_rate": 6.598899386669395e-07, + "logits/chosen": 0.6491128206253052, + "logits/rejected": 0.7273428440093994, + "logps/chosen": -8.97862434387207, + "logps/rejected": -9.844133377075195, + "loss": 0.5339, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.97862434387207, + "rewards/margins": 0.8655084371566772, + "rewards/rejected": -9.844133377075195, + "semantic_entropy": 0.0024933055974543095, + "step": 2560 + }, + { + "epoch": 1.3728048168590066, + "grad_norm": 29.52052844095437, + "learning_rate": 6.584135390754679e-07, + "logits/chosen": 0.618812620639801, + "logits/rejected": 0.7136391997337341, + "logps/chosen": -8.866046905517578, + "logps/rejected": -9.910287857055664, + "loss": 0.495, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -8.866046905517578, + "rewards/margins": 1.04423987865448, + "rewards/rejected": -9.910287857055664, + "semantic_entropy": 0.002750510349869728, + "step": 2565 + }, + { + "epoch": 1.375480849640408, + "grad_norm": 14.963175375540626, + "learning_rate": 6.569356025551454e-07, + "logits/chosen": 0.6298393607139587, + "logits/rejected": 0.6999781727790833, + "logps/chosen": -8.907299995422363, + "logps/rejected": -9.864768981933594, + "loss": 0.5256, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -8.907299995422363, + "rewards/margins": 0.9574697613716125, + "rewards/rejected": -9.864768981933594, + "semantic_entropy": 0.002827054588124156, + "step": 2570 + }, + { + "epoch": 1.3781568824218096, + "grad_norm": 24.51017226672691, + "learning_rate": 6.554561434449186e-07, + "logits/chosen": 0.6173363327980042, + "logits/rejected": 0.7241848111152649, + "logps/chosen": -8.95418930053711, + "logps/rejected": -9.835868835449219, + "loss": 0.523, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -8.95418930053711, + "rewards/margins": 0.8816791772842407, + "rewards/rejected": -9.835868835449219, + "semantic_entropy": 0.0022052470594644547, + "step": 2575 + }, + { + "epoch": 1.3808329152032113, + "grad_norm": 23.404717225980484, + "learning_rate": 6.539751760985063e-07, + "logits/chosen": 0.6575708985328674, + "logits/rejected": 0.7401934266090393, + "logps/chosen": -9.168517112731934, + "logps/rejected": -9.852693557739258, + "loss": 0.5703, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -9.168517112731934, + "rewards/margins": 0.6841762661933899, + "rewards/rejected": -9.852693557739258, + "semantic_entropy": 0.0024021922145038843, + "step": 2580 + }, + { + "epoch": 1.3835089479846128, + "grad_norm": 20.395741155582876, + "learning_rate": 6.524927148842602e-07, + "logits/chosen": 0.6744663119316101, + "logits/rejected": 0.7450428009033203, + "logps/chosen": -9.043600082397461, + "logps/rejected": -9.901880264282227, + "loss": 0.5319, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -9.043600082397461, + "rewards/margins": 0.8582803010940552, + "rewards/rejected": -9.901880264282227, + "semantic_entropy": 0.002837617415934801, + "step": 2585 + }, + { + "epoch": 1.3861849807660143, + "grad_norm": 20.903114229317822, + "learning_rate": 6.510087741850254e-07, + "logits/chosen": 0.6738818287849426, + "logits/rejected": 0.7437289953231812, + "logps/chosen": -8.894235610961914, + "logps/rejected": -9.801753044128418, + "loss": 0.5257, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -8.894235610961914, + "rewards/margins": 0.907518744468689, + "rewards/rejected": -9.801753044128418, + "semantic_entropy": 0.002832833444699645, + "step": 2590 + }, + { + "epoch": 1.388861013547416, + "grad_norm": 25.8050758741644, + "learning_rate": 6.495233683980012e-07, + "logits/chosen": 0.6168414354324341, + "logits/rejected": 0.659568190574646, + "logps/chosen": -9.102640151977539, + "logps/rejected": -9.882707595825195, + "loss": 0.5363, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -9.102640151977539, + "rewards/margins": 0.780068576335907, + "rewards/rejected": -9.882707595825195, + "semantic_entropy": 0.002202157862484455, + "step": 2595 + }, + { + "epoch": 1.3915370463288175, + "grad_norm": 22.37455211473883, + "learning_rate": 6.480365119346011e-07, + "logits/chosen": 0.6996050477027893, + "logits/rejected": 0.7838853597640991, + "logps/chosen": -8.78913402557373, + "logps/rejected": -9.713356018066406, + "loss": 0.4855, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.78913402557373, + "rewards/margins": 0.924220085144043, + "rewards/rejected": -9.713356018066406, + "semantic_entropy": 0.003083221148699522, + "step": 2600 + }, + { + "epoch": 1.394213079110219, + "grad_norm": 15.384141820184274, + "learning_rate": 6.465482192203129e-07, + "logits/chosen": 0.705297589302063, + "logits/rejected": 0.7580437660217285, + "logps/chosen": -8.660966873168945, + "logps/rejected": -9.467788696289062, + "loss": 0.5043, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.660966873168945, + "rewards/margins": 0.8068218231201172, + "rewards/rejected": -9.467788696289062, + "semantic_entropy": 0.003117068437859416, + "step": 2605 + }, + { + "epoch": 1.3968891118916207, + "grad_norm": 19.949668972130336, + "learning_rate": 6.45058504694559e-07, + "logits/chosen": 0.6838169097900391, + "logits/rejected": 0.7196789383888245, + "logps/chosen": -8.577276229858398, + "logps/rejected": -9.5936861038208, + "loss": 0.4727, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -8.577276229858398, + "rewards/margins": 1.01641047000885, + "rewards/rejected": -9.5936861038208, + "semantic_entropy": 0.003384160343557596, + "step": 2610 + }, + { + "epoch": 1.3995651446730222, + "grad_norm": 23.363596001347148, + "learning_rate": 6.435673828105564e-07, + "logits/chosen": 0.6700709462165833, + "logits/rejected": 0.7268036007881165, + "logps/chosen": -8.639312744140625, + "logps/rejected": -9.69861888885498, + "loss": 0.4863, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -8.639312744140625, + "rewards/margins": 1.0593070983886719, + "rewards/rejected": -9.69861888885498, + "semantic_entropy": 0.003042886033654213, + "step": 2615 + }, + { + "epoch": 1.402241177454424, + "grad_norm": 16.823068494049753, + "learning_rate": 6.420748680351763e-07, + "logits/chosen": 0.7304331064224243, + "logits/rejected": 0.7038922309875488, + "logps/chosen": -8.778966903686523, + "logps/rejected": -9.502559661865234, + "loss": 0.5525, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -8.778966903686523, + "rewards/margins": 0.7235932350158691, + "rewards/rejected": -9.502559661865234, + "semantic_entropy": 0.0028461969923228025, + "step": 2620 + }, + { + "epoch": 1.4049172102358254, + "grad_norm": 24.59681543759587, + "learning_rate": 6.405809748488032e-07, + "logits/chosen": 0.6792068481445312, + "logits/rejected": 0.7667452096939087, + "logps/chosen": -8.76710319519043, + "logps/rejected": -9.83530330657959, + "loss": 0.4983, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -8.76710319519043, + "rewards/margins": 1.0682008266448975, + "rewards/rejected": -9.83530330657959, + "semantic_entropy": 0.0029692454263567924, + "step": 2625 + }, + { + "epoch": 1.4075932430172269, + "grad_norm": 19.14917858388336, + "learning_rate": 6.390857177451956e-07, + "logits/chosen": 0.5627522468566895, + "logits/rejected": 0.6816359758377075, + "logps/chosen": -8.73742389678955, + "logps/rejected": -9.567540168762207, + "loss": 0.5112, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.73742389678955, + "rewards/margins": 0.8301169276237488, + "rewards/rejected": -9.567540168762207, + "semantic_entropy": 0.0030936195980757475, + "step": 2630 + }, + { + "epoch": 1.4102692757986286, + "grad_norm": 17.47734554681335, + "learning_rate": 6.375891112313445e-07, + "logits/chosen": 0.6170838475227356, + "logits/rejected": 0.6742144823074341, + "logps/chosen": -8.985275268554688, + "logps/rejected": -9.946958541870117, + "loss": 0.4664, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -8.985275268554688, + "rewards/margins": 0.9616818428039551, + "rewards/rejected": -9.946958541870117, + "semantic_entropy": 0.002239787485450506, + "step": 2635 + }, + { + "epoch": 1.41294530858003, + "grad_norm": 18.087276533577075, + "learning_rate": 6.360911698273326e-07, + "logits/chosen": 0.6644759774208069, + "logits/rejected": 0.7190378904342651, + "logps/chosen": -9.075571060180664, + "logps/rejected": -9.81869888305664, + "loss": 0.5604, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -9.075571060180664, + "rewards/margins": 0.7431273460388184, + "rewards/rejected": -9.81869888305664, + "semantic_entropy": 0.002120462479069829, + "step": 2640 + }, + { + "epoch": 1.4156213413614318, + "grad_norm": 17.443905406025888, + "learning_rate": 6.345919080661944e-07, + "logits/chosen": 0.6211899518966675, + "logits/rejected": 0.6783931851387024, + "logps/chosen": -8.705963134765625, + "logps/rejected": -9.728338241577148, + "loss": 0.4588, + "rewards/accuracies": 0.78125, + "rewards/chosen": -8.705963134765625, + "rewards/margins": 1.0223755836486816, + "rewards/rejected": -9.728338241577148, + "semantic_entropy": 0.0033381134271621704, + "step": 2645 + }, + { + "epoch": 1.4182973741428333, + "grad_norm": 18.870609176206806, + "learning_rate": 6.330913404937737e-07, + "logits/chosen": 0.6599079370498657, + "logits/rejected": 0.736240565776825, + "logps/chosen": -8.820722579956055, + "logps/rejected": -9.812540054321289, + "loss": 0.49, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.820722579956055, + "rewards/margins": 0.9918166995048523, + "rewards/rejected": -9.812540054321289, + "semantic_entropy": 0.002813478233292699, + "step": 2650 + }, + { + "epoch": 1.4209734069242348, + "grad_norm": 20.325954850202407, + "learning_rate": 6.315894816685838e-07, + "logits/chosen": 0.6205192804336548, + "logits/rejected": 0.6998498439788818, + "logps/chosen": -8.68531608581543, + "logps/rejected": -9.498059272766113, + "loss": 0.5063, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.68531608581543, + "rewards/margins": 0.8127420544624329, + "rewards/rejected": -9.498059272766113, + "semantic_entropy": 0.0029031294398009777, + "step": 2655 + }, + { + "epoch": 1.4236494397056365, + "grad_norm": 17.30912171919916, + "learning_rate": 6.300863461616657e-07, + "logits/chosen": 0.6297236680984497, + "logits/rejected": 0.6760424971580505, + "logps/chosen": -8.35009765625, + "logps/rejected": -9.166117668151855, + "loss": 0.5628, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -8.35009765625, + "rewards/margins": 0.816020131111145, + "rewards/rejected": -9.166117668151855, + "semantic_entropy": 0.003771452931687236, + "step": 2660 + }, + { + "epoch": 1.426325472487038, + "grad_norm": 15.64812739081427, + "learning_rate": 6.285819485564465e-07, + "logits/chosen": 0.5272424817085266, + "logits/rejected": 0.5996197462081909, + "logps/chosen": -8.598016738891602, + "logps/rejected": -9.50200080871582, + "loss": 0.4918, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.598016738891602, + "rewards/margins": 0.9039848446846008, + "rewards/rejected": -9.50200080871582, + "semantic_entropy": 0.003393507096916437, + "step": 2665 + }, + { + "epoch": 1.4290015052684395, + "grad_norm": 22.54029918814073, + "learning_rate": 6.270763034485986e-07, + "logits/chosen": 0.6191005706787109, + "logits/rejected": 0.6678223609924316, + "logps/chosen": -8.656926155090332, + "logps/rejected": -9.598608016967773, + "loss": 0.5089, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -8.656926155090332, + "rewards/margins": 0.9416826963424683, + "rewards/rejected": -9.598608016967773, + "semantic_entropy": 0.0036796010099351406, + "step": 2670 + }, + { + "epoch": 1.4316775380498412, + "grad_norm": 38.66235795954998, + "learning_rate": 6.255694254458972e-07, + "logits/chosen": 0.5672577619552612, + "logits/rejected": 0.6477295160293579, + "logps/chosen": -8.749332427978516, + "logps/rejected": -9.726335525512695, + "loss": 0.5089, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.749332427978516, + "rewards/margins": 0.9770025014877319, + "rewards/rejected": -9.726335525512695, + "semantic_entropy": 0.003146649803966284, + "step": 2675 + }, + { + "epoch": 1.4343535708312427, + "grad_norm": 24.587723329790435, + "learning_rate": 6.240613291680795e-07, + "logits/chosen": 0.532563328742981, + "logits/rejected": 0.6375452280044556, + "logps/chosen": -8.473979949951172, + "logps/rejected": -9.374165534973145, + "loss": 0.5416, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -8.473979949951172, + "rewards/margins": 0.9001848101615906, + "rewards/rejected": -9.374165534973145, + "semantic_entropy": 0.004033363424241543, + "step": 2680 + }, + { + "epoch": 1.4370296036126442, + "grad_norm": 17.100229454274775, + "learning_rate": 6.225520292467021e-07, + "logits/chosen": 0.5713559985160828, + "logits/rejected": 0.6829615831375122, + "logps/chosen": -8.452000617980957, + "logps/rejected": -9.610584259033203, + "loss": 0.4244, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -8.452000617980957, + "rewards/margins": 1.1585838794708252, + "rewards/rejected": -9.610584259033203, + "semantic_entropy": 0.0038808733224868774, + "step": 2685 + }, + { + "epoch": 1.439705636394046, + "grad_norm": 22.766099328641907, + "learning_rate": 6.210415403249993e-07, + "logits/chosen": 0.5507108569145203, + "logits/rejected": 0.6832348704338074, + "logps/chosen": -8.426264762878418, + "logps/rejected": -9.474390029907227, + "loss": 0.49, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -8.426264762878418, + "rewards/margins": 1.0481255054473877, + "rewards/rejected": -9.474390029907227, + "semantic_entropy": 0.004706330597400665, + "step": 2690 + }, + { + "epoch": 1.4423816691754474, + "grad_norm": 21.28132833379913, + "learning_rate": 6.195298770577415e-07, + "logits/chosen": 0.6715400815010071, + "logits/rejected": 0.6841408610343933, + "logps/chosen": -8.57282829284668, + "logps/rejected": -9.571008682250977, + "loss": 0.5092, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.57282829284668, + "rewards/margins": 0.9981800317764282, + "rewards/rejected": -9.571008682250977, + "semantic_entropy": 0.0033726401161402464, + "step": 2695 + }, + { + "epoch": 1.445057701956849, + "grad_norm": 17.178979708018453, + "learning_rate": 6.180170541110923e-07, + "logits/chosen": 0.644763708114624, + "logits/rejected": 0.746247410774231, + "logps/chosen": -8.67873477935791, + "logps/rejected": -9.667515754699707, + "loss": 0.4689, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -8.67873477935791, + "rewards/margins": 0.9887820482254028, + "rewards/rejected": -9.667515754699707, + "semantic_entropy": 0.0032643512822687626, + "step": 2700 + }, + { + "epoch": 1.4477337347382506, + "grad_norm": 18.421052634418867, + "learning_rate": 6.165030861624663e-07, + "logits/chosen": 0.5887877345085144, + "logits/rejected": 0.7074635624885559, + "logps/chosen": -8.820067405700684, + "logps/rejected": -10.023954391479492, + "loss": 0.4421, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -8.820067405700684, + "rewards/margins": 1.2038882970809937, + "rewards/rejected": -10.023954391479492, + "semantic_entropy": 0.002584748435765505, + "step": 2705 + }, + { + "epoch": 1.4504097675196521, + "grad_norm": 18.781234298542874, + "learning_rate": 6.149879879003876e-07, + "logits/chosen": 0.7198264598846436, + "logits/rejected": 0.7411429286003113, + "logps/chosen": -8.748977661132812, + "logps/rejected": -9.773608207702637, + "loss": 0.4872, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -8.748977661132812, + "rewards/margins": 1.0246312618255615, + "rewards/rejected": -9.773608207702637, + "semantic_entropy": 0.003493456868454814, + "step": 2710 + }, + { + "epoch": 1.4530858003010536, + "grad_norm": 16.626832554859085, + "learning_rate": 6.13471774024346e-07, + "logits/chosen": 0.6111767292022705, + "logits/rejected": 0.6929227113723755, + "logps/chosen": -8.618370056152344, + "logps/rejected": -9.645769119262695, + "loss": 0.4634, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -8.618370056152344, + "rewards/margins": 1.0273983478546143, + "rewards/rejected": -9.645769119262695, + "semantic_entropy": 0.003336191177368164, + "step": 2715 + }, + { + "epoch": 1.4557618330824553, + "grad_norm": 15.275820526471591, + "learning_rate": 6.119544592446551e-07, + "logits/chosen": 0.6123021841049194, + "logits/rejected": 0.6909358501434326, + "logps/chosen": -8.73341178894043, + "logps/rejected": -9.467869758605957, + "loss": 0.5598, + "rewards/accuracies": 0.6875, + "rewards/chosen": -8.73341178894043, + "rewards/margins": 0.7344561815261841, + "rewards/rejected": -9.467869758605957, + "semantic_entropy": 0.0026931720785796642, + "step": 2720 + }, + { + "epoch": 1.4584378658638568, + "grad_norm": 22.157104502252846, + "learning_rate": 6.104360582823096e-07, + "logits/chosen": 0.7188630104064941, + "logits/rejected": 0.7658997774124146, + "logps/chosen": -8.619566917419434, + "logps/rejected": -9.543168067932129, + "loss": 0.4784, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -8.619566917419434, + "rewards/margins": 0.9236003160476685, + "rewards/rejected": -9.543168067932129, + "semantic_entropy": 0.003137335879728198, + "step": 2725 + }, + { + "epoch": 1.4611138986452583, + "grad_norm": 20.352841774469262, + "learning_rate": 6.089165858688423e-07, + "logits/chosen": 0.6846107244491577, + "logits/rejected": 0.7849973440170288, + "logps/chosen": -8.482339859008789, + "logps/rejected": -9.482155799865723, + "loss": 0.5162, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -8.482339859008789, + "rewards/margins": 0.9998153448104858, + "rewards/rejected": -9.482155799865723, + "semantic_entropy": 0.0033546772319823503, + "step": 2730 + }, + { + "epoch": 1.46378993142666, + "grad_norm": 12.410524435742113, + "learning_rate": 6.073960567461811e-07, + "logits/chosen": 0.7148826718330383, + "logits/rejected": 0.8275319933891296, + "logps/chosen": -8.265142440795898, + "logps/rejected": -9.430082321166992, + "loss": 0.417, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -8.265142440795898, + "rewards/margins": 1.1649402379989624, + "rewards/rejected": -9.430082321166992, + "semantic_entropy": 0.0043460773304104805, + "step": 2735 + }, + { + "epoch": 1.4664659642080615, + "grad_norm": 19.113883533650075, + "learning_rate": 6.058744856665065e-07, + "logits/chosen": 0.63080894947052, + "logits/rejected": 0.6792045831680298, + "logps/chosen": -8.33402156829834, + "logps/rejected": -9.474609375, + "loss": 0.4612, + "rewards/accuracies": 0.78125, + "rewards/chosen": -8.33402156829834, + "rewards/margins": 1.140586256980896, + "rewards/rejected": -9.474609375, + "semantic_entropy": 0.0038649775087833405, + "step": 2740 + }, + { + "epoch": 1.469141996989463, + "grad_norm": 19.573081780536924, + "learning_rate": 6.043518873921074e-07, + "logits/chosen": 0.6687484979629517, + "logits/rejected": 0.7621025443077087, + "logps/chosen": -8.205583572387695, + "logps/rejected": -9.156460762023926, + "loss": 0.4883, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.205583572387695, + "rewards/margins": 0.9508770108222961, + "rewards/rejected": -9.156460762023926, + "semantic_entropy": 0.004209198523312807, + "step": 2745 + }, + { + "epoch": 1.4718180297708647, + "grad_norm": 19.22285858261932, + "learning_rate": 6.028282766952393e-07, + "logits/chosen": 0.6872994303703308, + "logits/rejected": 0.7447125315666199, + "logps/chosen": -8.262472152709961, + "logps/rejected": -9.375197410583496, + "loss": 0.4668, + "rewards/accuracies": 0.8125, + "rewards/chosen": -8.262472152709961, + "rewards/margins": 1.1127252578735352, + "rewards/rejected": -9.375197410583496, + "semantic_entropy": 0.004294519778341055, + "step": 2750 + }, + { + "epoch": 1.4744940625522662, + "grad_norm": 28.965956806202943, + "learning_rate": 6.013036683579798e-07, + "logits/chosen": 0.7001906633377075, + "logits/rejected": 0.7653275728225708, + "logps/chosen": -8.254480361938477, + "logps/rejected": -9.233253479003906, + "loss": 0.5039, + "rewards/accuracies": 0.78125, + "rewards/chosen": -8.254480361938477, + "rewards/margins": 0.9787724614143372, + "rewards/rejected": -9.233253479003906, + "semantic_entropy": 0.00447105010971427, + "step": 2755 + }, + { + "epoch": 1.4771700953336677, + "grad_norm": 19.46901251194481, + "learning_rate": 5.997780771720854e-07, + "logits/chosen": 0.6296931505203247, + "logits/rejected": 0.7145162224769592, + "logps/chosen": -8.382627487182617, + "logps/rejected": -9.445598602294922, + "loss": 0.4638, + "rewards/accuracies": 0.78125, + "rewards/chosen": -8.382627487182617, + "rewards/margins": 1.0629713535308838, + "rewards/rejected": -9.445598602294922, + "semantic_entropy": 0.004158531315624714, + "step": 2760 + }, + { + "epoch": 1.4798461281150694, + "grad_norm": 19.53223501670479, + "learning_rate": 5.982515179388486e-07, + "logits/chosen": 0.7034457325935364, + "logits/rejected": 0.7711877226829529, + "logps/chosen": -8.465707778930664, + "logps/rejected": -9.426530838012695, + "loss": 0.5123, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.465707778930664, + "rewards/margins": 0.9608221054077148, + "rewards/rejected": -9.426530838012695, + "semantic_entropy": 0.003798137651756406, + "step": 2765 + }, + { + "epoch": 1.482522160896471, + "grad_norm": 14.100323670272886, + "learning_rate": 5.967240054689541e-07, + "logits/chosen": 0.6083649396896362, + "logits/rejected": 0.6571983098983765, + "logps/chosen": -8.479659080505371, + "logps/rejected": -9.488851547241211, + "loss": 0.4886, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -8.479659080505371, + "rewards/margins": 1.0091919898986816, + "rewards/rejected": -9.488851547241211, + "semantic_entropy": 0.0037951588165014982, + "step": 2770 + }, + { + "epoch": 1.4851981936778724, + "grad_norm": 19.799070365162166, + "learning_rate": 5.951955545823342e-07, + "logits/chosen": 0.6102844476699829, + "logits/rejected": 0.6613792181015015, + "logps/chosen": -8.798731803894043, + "logps/rejected": -9.752110481262207, + "loss": 0.5194, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -8.798731803894043, + "rewards/margins": 0.9533787965774536, + "rewards/rejected": -9.752110481262207, + "semantic_entropy": 0.003143253503367305, + "step": 2775 + }, + { + "epoch": 1.4878742264592741, + "grad_norm": 17.219558141254097, + "learning_rate": 5.936661801080263e-07, + "logits/chosen": 0.5687362551689148, + "logits/rejected": 0.634920597076416, + "logps/chosen": -8.662237167358398, + "logps/rejected": -9.508859634399414, + "loss": 0.5463, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -8.662237167358398, + "rewards/margins": 0.8466218709945679, + "rewards/rejected": -9.508859634399414, + "semantic_entropy": 0.0033071953803300858, + "step": 2780 + }, + { + "epoch": 1.4905502592406756, + "grad_norm": 20.01873928855486, + "learning_rate": 5.92135896884028e-07, + "logits/chosen": 0.6002562642097473, + "logits/rejected": 0.696466326713562, + "logps/chosen": -8.673624992370605, + "logps/rejected": -9.823293685913086, + "loss": 0.455, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -8.673624992370605, + "rewards/margins": 1.1496690511703491, + "rewards/rejected": -9.823293685913086, + "semantic_entropy": 0.003196306060999632, + "step": 2785 + }, + { + "epoch": 1.4932262920220774, + "grad_norm": 30.437982634244353, + "learning_rate": 5.906047197571541e-07, + "logits/chosen": 0.5805534720420837, + "logits/rejected": 0.5821332931518555, + "logps/chosen": -8.47557258605957, + "logps/rejected": -9.357189178466797, + "loss": 0.5343, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -8.47557258605957, + "rewards/margins": 0.8816182017326355, + "rewards/rejected": -9.357189178466797, + "semantic_entropy": 0.0038487245328724384, + "step": 2790 + }, + { + "epoch": 1.4959023248034788, + "grad_norm": 14.831624104584504, + "learning_rate": 5.890726635828919e-07, + "logits/chosen": 0.5990924835205078, + "logits/rejected": 0.6138975024223328, + "logps/chosen": -8.312957763671875, + "logps/rejected": -9.31361198425293, + "loss": 0.5031, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.312957763671875, + "rewards/margins": 1.0006548166275024, + "rewards/rejected": -9.31361198425293, + "semantic_entropy": 0.004499537404626608, + "step": 2795 + }, + { + "epoch": 1.4985783575848803, + "grad_norm": 21.33910911582425, + "learning_rate": 5.875397432252569e-07, + "logits/chosen": 0.5481540560722351, + "logits/rejected": 0.6002416610717773, + "logps/chosen": -8.367044448852539, + "logps/rejected": -9.348922729492188, + "loss": 0.4879, + "rewards/accuracies": 0.78125, + "rewards/chosen": -8.367044448852539, + "rewards/margins": 0.9818779230117798, + "rewards/rejected": -9.348922729492188, + "semantic_entropy": 0.004131897818297148, + "step": 2800 + }, + { + "epoch": 1.4985783575848803, + "eval_logits/chosen": 0.6895690560340881, + "eval_logits/rejected": 0.749624490737915, + "eval_logps/chosen": -8.626724243164062, + "eval_logps/rejected": -9.53298282623291, + "eval_loss": 0.5264463424682617, + "eval_rewards/accuracies": 0.7218101024627686, + "eval_rewards/chosen": -8.626724243164062, + "eval_rewards/margins": 0.9062579870223999, + "eval_rewards/rejected": -9.53298282623291, + "eval_runtime": 35.1374, + "eval_samples_per_second": 38.278, + "eval_semantic_entropy": 0.0033146331552416086, + "eval_steps_per_second": 9.591, + "step": 2800 + }, + { + "epoch": 1.5012543903662818, + "grad_norm": 15.846955607641894, + "learning_rate": 5.860059735566491e-07, + "logits/chosen": 0.4758935868740082, + "logits/rejected": 0.5631710290908813, + "logps/chosen": -8.500029563903809, + "logps/rejected": -9.449440002441406, + "loss": 0.5011, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -8.500029563903809, + "rewards/margins": 0.9494104385375977, + "rewards/rejected": -9.449440002441406, + "semantic_entropy": 0.004371006041765213, + "step": 2805 + }, + { + "epoch": 1.5039304231476835, + "grad_norm": 24.705548492382672, + "learning_rate": 5.844713694577087e-07, + "logits/chosen": 0.5791555643081665, + "logits/rejected": 0.6237837672233582, + "logps/chosen": -8.626651763916016, + "logps/rejected": -9.473905563354492, + "loss": 0.5144, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -8.626651763916016, + "rewards/margins": 0.8472524881362915, + "rewards/rejected": -9.473905563354492, + "semantic_entropy": 0.003329088445752859, + "step": 2810 + }, + { + "epoch": 1.5066064559290853, + "grad_norm": 14.693823746449954, + "learning_rate": 5.829359458171714e-07, + "logits/chosen": 0.5436751842498779, + "logits/rejected": 0.5992386341094971, + "logps/chosen": -8.613186836242676, + "logps/rejected": -9.71760368347168, + "loss": 0.4307, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -8.613186836242676, + "rewards/margins": 1.1044175624847412, + "rewards/rejected": -9.71760368347168, + "semantic_entropy": 0.003140996443107724, + "step": 2815 + }, + { + "epoch": 1.5092824887104868, + "grad_norm": 15.499555019749941, + "learning_rate": 5.81399717531724e-07, + "logits/chosen": 0.5179445147514343, + "logits/rejected": 0.6111316084861755, + "logps/chosen": -8.66978645324707, + "logps/rejected": -9.439632415771484, + "loss": 0.5827, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -8.66978645324707, + "rewards/margins": 0.7698466777801514, + "rewards/rejected": -9.439632415771484, + "semantic_entropy": 0.0032921708188951015, + "step": 2820 + }, + { + "epoch": 1.5119585214918883, + "grad_norm": 16.093368752669864, + "learning_rate": 5.798626995058602e-07, + "logits/chosen": 0.5145665407180786, + "logits/rejected": 0.6263571977615356, + "logps/chosen": -8.70081901550293, + "logps/rejected": -9.660847663879395, + "loss": 0.4992, + "rewards/accuracies": 0.71875, + "rewards/chosen": -8.70081901550293, + "rewards/margins": 0.9600294232368469, + "rewards/rejected": -9.660847663879395, + "semantic_entropy": 0.002892556134611368, + "step": 2825 + }, + { + "epoch": 1.51463455427329, + "grad_norm": 13.79559161731438, + "learning_rate": 5.783249066517354e-07, + "logits/chosen": 0.5084502696990967, + "logits/rejected": 0.5790830850601196, + "logps/chosen": -8.357169151306152, + "logps/rejected": -9.426858901977539, + "loss": 0.4507, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -8.357169151306152, + "rewards/margins": 1.0696887969970703, + "rewards/rejected": -9.426858901977539, + "semantic_entropy": 0.0035817469470202923, + "step": 2830 + }, + { + "epoch": 1.5173105870546915, + "grad_norm": 17.910427446812193, + "learning_rate": 5.767863538890228e-07, + "logits/chosen": 0.5757554769515991, + "logits/rejected": 0.6622332334518433, + "logps/chosen": -8.620783805847168, + "logps/rejected": -9.76025390625, + "loss": 0.4271, + "rewards/accuracies": 0.8125, + "rewards/chosen": -8.620783805847168, + "rewards/margins": 1.1394703388214111, + "rewards/rejected": -9.76025390625, + "semantic_entropy": 0.0031910459510982037, + "step": 2835 + }, + { + "epoch": 1.519986619836093, + "grad_norm": 17.705706566190077, + "learning_rate": 5.75247056144768e-07, + "logits/chosen": 0.5833605527877808, + "logits/rejected": 0.6117344498634338, + "logps/chosen": -8.490338325500488, + "logps/rejected": -9.415372848510742, + "loss": 0.5481, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.490338325500488, + "rewards/margins": 0.925035834312439, + "rewards/rejected": -9.415372848510742, + "semantic_entropy": 0.0037457395810633898, + "step": 2840 + }, + { + "epoch": 1.5226626526174947, + "grad_norm": 19.220022114950083, + "learning_rate": 5.737070283532444e-07, + "logits/chosen": 0.6395395994186401, + "logits/rejected": 0.6757252812385559, + "logps/chosen": -8.574124336242676, + "logps/rejected": -9.431645393371582, + "loss": 0.5835, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -8.574124336242676, + "rewards/margins": 0.8575227856636047, + "rewards/rejected": -9.431645393371582, + "semantic_entropy": 0.0034476309083402157, + "step": 2845 + }, + { + "epoch": 1.5253386853988962, + "grad_norm": 16.469084913870834, + "learning_rate": 5.721662854558084e-07, + "logits/chosen": 0.5754357576370239, + "logits/rejected": 0.6329125165939331, + "logps/chosen": -8.597195625305176, + "logps/rejected": -9.660018920898438, + "loss": 0.4696, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.597195625305176, + "rewards/margins": 1.0628234148025513, + "rewards/rejected": -9.660018920898438, + "semantic_entropy": 0.003124454291537404, + "step": 2850 + }, + { + "epoch": 1.5280147181802977, + "grad_norm": 18.533405817853968, + "learning_rate": 5.706248424007545e-07, + "logits/chosen": 0.4836948812007904, + "logits/rejected": 0.5998759269714355, + "logps/chosen": -8.543926239013672, + "logps/rejected": -9.415602684020996, + "loss": 0.5262, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -8.543926239013672, + "rewards/margins": 0.871677577495575, + "rewards/rejected": -9.415602684020996, + "semantic_entropy": 0.0034175370819866657, + "step": 2855 + }, + { + "epoch": 1.5306907509616994, + "grad_norm": 16.66857458253813, + "learning_rate": 5.690827141431699e-07, + "logits/chosen": 0.5200189352035522, + "logits/rejected": 0.6539384126663208, + "logps/chosen": -8.513689041137695, + "logps/rejected": -9.264158248901367, + "loss": 0.5351, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -8.513689041137695, + "rewards/margins": 0.7504681348800659, + "rewards/rejected": -9.264158248901367, + "semantic_entropy": 0.00305316224694252, + "step": 2860 + }, + { + "epoch": 1.5333667837431009, + "grad_norm": 21.309556253730577, + "learning_rate": 5.675399156447897e-07, + "logits/chosen": 0.5738528370857239, + "logits/rejected": 0.6424544453620911, + "logps/chosen": -8.373230934143066, + "logps/rejected": -9.177302360534668, + "loss": 0.5274, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -8.373230934143066, + "rewards/margins": 0.8040705919265747, + "rewards/rejected": -9.177302360534668, + "semantic_entropy": 0.0038657269906252623, + "step": 2865 + }, + { + "epoch": 1.5360428165245024, + "grad_norm": 18.48692651527053, + "learning_rate": 5.659964618738515e-07, + "logits/chosen": 0.5925968289375305, + "logits/rejected": 0.6500064730644226, + "logps/chosen": -8.42739486694336, + "logps/rejected": -9.342714309692383, + "loss": 0.524, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -8.42739486694336, + "rewards/margins": 0.9153194427490234, + "rewards/rejected": -9.342714309692383, + "semantic_entropy": 0.0032528643496334553, + "step": 2870 + }, + { + "epoch": 1.538718849305904, + "grad_norm": 18.694567668937122, + "learning_rate": 5.644523678049509e-07, + "logits/chosen": 0.5311469435691833, + "logits/rejected": 0.6227244138717651, + "logps/chosen": -8.448528289794922, + "logps/rejected": -9.284102439880371, + "loss": 0.5258, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.448528289794922, + "rewards/margins": 0.8355741500854492, + "rewards/rejected": -9.284102439880371, + "semantic_entropy": 0.0036838327068835497, + "step": 2875 + }, + { + "epoch": 1.5413948820873056, + "grad_norm": 20.341898488782054, + "learning_rate": 5.629076484188952e-07, + "logits/chosen": 0.6697776913642883, + "logits/rejected": 0.7325208187103271, + "logps/chosen": -8.417803764343262, + "logps/rejected": -9.420540809631348, + "loss": 0.4782, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -8.417803764343262, + "rewards/margins": 1.0027358531951904, + "rewards/rejected": -9.420540809631348, + "semantic_entropy": 0.0036022099666297436, + "step": 2880 + }, + { + "epoch": 1.544070914868707, + "grad_norm": 17.610305324362457, + "learning_rate": 5.613623187025587e-07, + "logits/chosen": 0.5705369710922241, + "logits/rejected": 0.6492313146591187, + "logps/chosen": -8.502610206604004, + "logps/rejected": -9.509003639221191, + "loss": 0.4879, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.502610206604004, + "rewards/margins": 1.0063927173614502, + "rewards/rejected": -9.509003639221191, + "semantic_entropy": 0.003285133745521307, + "step": 2885 + }, + { + "epoch": 1.5467469476501088, + "grad_norm": 17.881369724017024, + "learning_rate": 5.598163936487369e-07, + "logits/chosen": 0.573552131652832, + "logits/rejected": 0.6860645413398743, + "logps/chosen": -8.498343467712402, + "logps/rejected": -9.553579330444336, + "loss": 0.4739, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.498343467712402, + "rewards/margins": 1.055237054824829, + "rewards/rejected": -9.553579330444336, + "semantic_entropy": 0.0032834571320563555, + "step": 2890 + }, + { + "epoch": 1.5494229804315103, + "grad_norm": 17.628648766982945, + "learning_rate": 5.582698882560017e-07, + "logits/chosen": 0.6237468719482422, + "logits/rejected": 0.7165063619613647, + "logps/chosen": -8.482809066772461, + "logps/rejected": -9.492294311523438, + "loss": 0.4927, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -8.482809066772461, + "rewards/margins": 1.0094853639602661, + "rewards/rejected": -9.492294311523438, + "semantic_entropy": 0.003665131749585271, + "step": 2895 + }, + { + "epoch": 1.5520990132129118, + "grad_norm": 17.76187822982553, + "learning_rate": 5.567228175285549e-07, + "logits/chosen": 0.6243129968643188, + "logits/rejected": 0.7127053737640381, + "logps/chosen": -8.367454528808594, + "logps/rejected": -9.463773727416992, + "loss": 0.455, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.367454528808594, + "rewards/margins": 1.0963184833526611, + "rewards/rejected": -9.463773727416992, + "semantic_entropy": 0.003789290087297559, + "step": 2900 + }, + { + "epoch": 1.5547750459943135, + "grad_norm": 22.486981787052432, + "learning_rate": 5.551751964760838e-07, + "logits/chosen": 0.7026504278182983, + "logits/rejected": 0.7289483547210693, + "logps/chosen": -8.396336555480957, + "logps/rejected": -9.454975128173828, + "loss": 0.4576, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -8.396336555480957, + "rewards/margins": 1.058638334274292, + "rewards/rejected": -9.454975128173828, + "semantic_entropy": 0.003976074513047934, + "step": 2905 + }, + { + "epoch": 1.557451078775715, + "grad_norm": 21.34998407613261, + "learning_rate": 5.536270401136145e-07, + "logits/chosen": 0.6059376001358032, + "logits/rejected": 0.6654237508773804, + "logps/chosen": -8.55673599243164, + "logps/rejected": -9.484312057495117, + "loss": 0.4952, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.55673599243164, + "rewards/margins": 0.927575945854187, + "rewards/rejected": -9.484312057495117, + "semantic_entropy": 0.0034220025409013033, + "step": 2910 + }, + { + "epoch": 1.5601271115571165, + "grad_norm": 25.338171297276553, + "learning_rate": 5.520783634613667e-07, + "logits/chosen": 0.6434666514396667, + "logits/rejected": 0.7561715841293335, + "logps/chosen": -8.727119445800781, + "logps/rejected": -9.780064582824707, + "loss": 0.5051, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -8.727119445800781, + "rewards/margins": 1.0529462099075317, + "rewards/rejected": -9.780064582824707, + "semantic_entropy": 0.002783454256132245, + "step": 2915 + }, + { + "epoch": 1.5628031443385182, + "grad_norm": 19.446663229697716, + "learning_rate": 5.505291815446082e-07, + "logits/chosen": 0.622826099395752, + "logits/rejected": 0.6913628578186035, + "logps/chosen": -8.684412002563477, + "logps/rejected": -9.680010795593262, + "loss": 0.5004, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.684412002563477, + "rewards/margins": 0.9955987930297852, + "rewards/rejected": -9.680010795593262, + "semantic_entropy": 0.0030565441120415926, + "step": 2920 + }, + { + "epoch": 1.5654791771199197, + "grad_norm": 21.499839435107912, + "learning_rate": 5.489795093935089e-07, + "logits/chosen": 0.66752690076828, + "logits/rejected": 0.7305563688278198, + "logps/chosen": -8.636419296264648, + "logps/rejected": -9.558130264282227, + "loss": 0.5297, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -8.636419296264648, + "rewards/margins": 0.9217103123664856, + "rewards/rejected": -9.558130264282227, + "semantic_entropy": 0.0032252557575702667, + "step": 2925 + }, + { + "epoch": 1.5681552099013212, + "grad_norm": 17.043070253231157, + "learning_rate": 5.474293620429946e-07, + "logits/chosen": 0.6017109155654907, + "logits/rejected": 0.6921178698539734, + "logps/chosen": -8.539863586425781, + "logps/rejected": -9.874781608581543, + "loss": 0.455, + "rewards/accuracies": 0.8125, + "rewards/chosen": -8.539863586425781, + "rewards/margins": 1.334917426109314, + "rewards/rejected": -9.874781608581543, + "semantic_entropy": 0.0031048119999468327, + "step": 2930 + }, + { + "epoch": 1.570831242682723, + "grad_norm": 17.31470565155956, + "learning_rate": 5.458787545326018e-07, + "logits/chosen": 0.6002456545829773, + "logits/rejected": 0.6670821905136108, + "logps/chosen": -8.838860511779785, + "logps/rejected": -9.790765762329102, + "loss": 0.4888, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -8.838860511779785, + "rewards/margins": 0.9519071578979492, + "rewards/rejected": -9.790765762329102, + "semantic_entropy": 0.0028922937344759703, + "step": 2935 + }, + { + "epoch": 1.5735072754641244, + "grad_norm": 18.420355066758702, + "learning_rate": 5.443277019063311e-07, + "logits/chosen": 0.6272684335708618, + "logits/rejected": 0.7411568760871887, + "logps/chosen": -8.951470375061035, + "logps/rejected": -10.118718147277832, + "loss": 0.4787, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -8.951470375061035, + "rewards/margins": 1.1672481298446655, + "rewards/rejected": -10.118718147277832, + "semantic_entropy": 0.0028697990346699953, + "step": 2940 + }, + { + "epoch": 1.5761833082455259, + "grad_norm": 24.156663129325842, + "learning_rate": 5.427762192125023e-07, + "logits/chosen": 0.6460695862770081, + "logits/rejected": 0.7259084582328796, + "logps/chosen": -8.902268409729004, + "logps/rejected": -9.860254287719727, + "loss": 0.5189, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -8.902268409729004, + "rewards/margins": 0.9579856991767883, + "rewards/rejected": -9.860254287719727, + "semantic_entropy": 0.0026515666395425797, + "step": 2945 + }, + { + "epoch": 1.5788593410269276, + "grad_norm": 28.217781936607324, + "learning_rate": 5.41224321503607e-07, + "logits/chosen": 0.6646770238876343, + "logits/rejected": 0.7777436375617981, + "logps/chosen": -8.907236099243164, + "logps/rejected": -10.02210807800293, + "loss": 0.422, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -8.907236099243164, + "rewards/margins": 1.1148706674575806, + "rewards/rejected": -10.02210807800293, + "semantic_entropy": 0.0026082415133714676, + "step": 2950 + }, + { + "epoch": 1.5815353738083293, + "grad_norm": 22.254732967367147, + "learning_rate": 5.396720238361637e-07, + "logits/chosen": 0.7216917872428894, + "logits/rejected": 0.781305193901062, + "logps/chosen": -8.932952880859375, + "logps/rejected": -9.814626693725586, + "loss": 0.5321, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -8.932952880859375, + "rewards/margins": 0.8816744089126587, + "rewards/rejected": -9.814626693725586, + "semantic_entropy": 0.0031717985402792692, + "step": 2955 + }, + { + "epoch": 1.5842114065897306, + "grad_norm": 17.792398556391674, + "learning_rate": 5.381193412705711e-07, + "logits/chosen": 0.6349023580551147, + "logits/rejected": 0.7170180678367615, + "logps/chosen": -8.807271003723145, + "logps/rejected": -9.855276107788086, + "loss": 0.4582, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -8.807271003723145, + "rewards/margins": 1.0480067729949951, + "rewards/rejected": -9.855276107788086, + "semantic_entropy": 0.0029146361630409956, + "step": 2960 + }, + { + "epoch": 1.5868874393711323, + "grad_norm": 16.167553601713156, + "learning_rate": 5.365662888709622e-07, + "logits/chosen": 0.6512748599052429, + "logits/rejected": 0.7206937074661255, + "logps/chosen": -8.949135780334473, + "logps/rejected": -10.088407516479492, + "loss": 0.4511, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -8.949135780334473, + "rewards/margins": 1.139272689819336, + "rewards/rejected": -10.088407516479492, + "semantic_entropy": 0.0026951334439218044, + "step": 2965 + }, + { + "epoch": 1.589563472152534, + "grad_norm": 21.334679501529834, + "learning_rate": 5.350128817050585e-07, + "logits/chosen": 0.6061184406280518, + "logits/rejected": 0.6971312761306763, + "logps/chosen": -9.021596908569336, + "logps/rejected": -9.995620727539062, + "loss": 0.5042, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -9.021596908569336, + "rewards/margins": 0.974023163318634, + "rewards/rejected": -9.995620727539062, + "semantic_entropy": 0.002106505911797285, + "step": 2970 + }, + { + "epoch": 1.5922395049339353, + "grad_norm": 27.712943840731256, + "learning_rate": 5.334591348440229e-07, + "logits/chosen": 0.6605676412582397, + "logits/rejected": 0.7498981952667236, + "logps/chosen": -8.881559371948242, + "logps/rejected": -9.735440254211426, + "loss": 0.5282, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -8.881559371948242, + "rewards/margins": 0.8538818359375, + "rewards/rejected": -9.735440254211426, + "semantic_entropy": 0.0024334299378097057, + "step": 2975 + }, + { + "epoch": 1.594915537715337, + "grad_norm": 14.991037570020522, + "learning_rate": 5.319050633623141e-07, + "logits/chosen": 0.6337238550186157, + "logits/rejected": 0.7245572805404663, + "logps/chosen": -8.76764965057373, + "logps/rejected": -9.690933227539062, + "loss": 0.482, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.76764965057373, + "rewards/margins": 0.9232838749885559, + "rewards/rejected": -9.690933227539062, + "semantic_entropy": 0.002793360035866499, + "step": 2980 + }, + { + "epoch": 1.5975915704967387, + "grad_norm": 24.077369910851743, + "learning_rate": 5.303506823375409e-07, + "logits/chosen": 0.5908278226852417, + "logits/rejected": 0.7130194902420044, + "logps/chosen": -8.76014518737793, + "logps/rejected": -9.954129219055176, + "loss": 0.5029, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -8.76014518737793, + "rewards/margins": 1.1939831972122192, + "rewards/rejected": -9.954129219055176, + "semantic_entropy": 0.003040383802726865, + "step": 2985 + }, + { + "epoch": 1.60026760327814, + "grad_norm": 15.211762910677832, + "learning_rate": 5.287960068503143e-07, + "logits/chosen": 0.6387141942977905, + "logits/rejected": 0.7284534573554993, + "logps/chosen": -8.648200035095215, + "logps/rejected": -9.790531158447266, + "loss": 0.4399, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -8.648200035095215, + "rewards/margins": 1.1423308849334717, + "rewards/rejected": -9.790531158447266, + "semantic_entropy": 0.003230876522138715, + "step": 2990 + }, + { + "epoch": 1.6029436360595417, + "grad_norm": 20.214549437081253, + "learning_rate": 5.272410519841032e-07, + "logits/chosen": 0.6860362887382507, + "logits/rejected": 0.7700978517532349, + "logps/chosen": -8.748394012451172, + "logps/rejected": -9.925196647644043, + "loss": 0.4639, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -8.748394012451172, + "rewards/margins": 1.1768031120300293, + "rewards/rejected": -9.925196647644043, + "semantic_entropy": 0.002992126392200589, + "step": 2995 + }, + { + "epoch": 1.6056196688409434, + "grad_norm": 13.676962647539636, + "learning_rate": 5.256858328250861e-07, + "logits/chosen": 0.6779240965843201, + "logits/rejected": 0.7815280556678772, + "logps/chosen": -8.563261985778809, + "logps/rejected": -9.472494125366211, + "loss": 0.5373, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.563261985778809, + "rewards/margins": 0.9092334508895874, + "rewards/rejected": -9.472494125366211, + "semantic_entropy": 0.0036033024080097675, + "step": 3000 + }, + { + "epoch": 1.608295701622345, + "grad_norm": 35.682884824697986, + "learning_rate": 5.241303644620063e-07, + "logits/chosen": 0.6307097673416138, + "logits/rejected": 0.7322098612785339, + "logps/chosen": -8.694342613220215, + "logps/rejected": -9.436209678649902, + "loss": 0.5628, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -8.694342613220215, + "rewards/margins": 0.7418667078018188, + "rewards/rejected": -9.436209678649902, + "semantic_entropy": 0.003090116661041975, + "step": 3005 + }, + { + "epoch": 1.6109717344037464, + "grad_norm": 20.559441487883948, + "learning_rate": 5.225746619860248e-07, + "logits/chosen": 0.651374101638794, + "logits/rejected": 0.7265350222587585, + "logps/chosen": -8.542802810668945, + "logps/rejected": -9.415166854858398, + "loss": 0.5648, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.542802810668945, + "rewards/margins": 0.8723649978637695, + "rewards/rejected": -9.415166854858398, + "semantic_entropy": 0.0034926377702504396, + "step": 3010 + }, + { + "epoch": 1.6136477671851481, + "grad_norm": 24.620427653936158, + "learning_rate": 5.210187404905735e-07, + "logits/chosen": 0.7381612658500671, + "logits/rejected": 0.7867849469184875, + "logps/chosen": -8.523360252380371, + "logps/rejected": -9.46071720123291, + "loss": 0.5041, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.523360252380371, + "rewards/margins": 0.9373563528060913, + "rewards/rejected": -9.46071720123291, + "semantic_entropy": 0.003742937697097659, + "step": 3015 + }, + { + "epoch": 1.6163237999665496, + "grad_norm": 20.150742954590974, + "learning_rate": 5.194626150712098e-07, + "logits/chosen": 0.6840203404426575, + "logits/rejected": 0.7242007851600647, + "logps/chosen": -8.462261199951172, + "logps/rejected": -9.2748384475708, + "loss": 0.5223, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -8.462261199951172, + "rewards/margins": 0.8125771284103394, + "rewards/rejected": -9.2748384475708, + "semantic_entropy": 0.0034177147317677736, + "step": 3020 + }, + { + "epoch": 1.6189998327479511, + "grad_norm": 22.925549219305793, + "learning_rate": 5.179063008254695e-07, + "logits/chosen": 0.6633858680725098, + "logits/rejected": 0.7617511749267578, + "logps/chosen": -8.355466842651367, + "logps/rejected": -9.252098083496094, + "loss": 0.5333, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -8.355466842651367, + "rewards/margins": 0.896629810333252, + "rewards/rejected": -9.252098083496094, + "semantic_entropy": 0.003791496157646179, + "step": 3025 + }, + { + "epoch": 1.6216758655293528, + "grad_norm": 24.09406066747849, + "learning_rate": 5.163498128527199e-07, + "logits/chosen": 0.6741195917129517, + "logits/rejected": 0.7528184056282043, + "logps/chosen": -8.633956909179688, + "logps/rejected": -9.625297546386719, + "loss": 0.5164, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.633956909179688, + "rewards/margins": 0.9913405179977417, + "rewards/rejected": -9.625297546386719, + "semantic_entropy": 0.0035090327728539705, + "step": 3030 + }, + { + "epoch": 1.6243518983107543, + "grad_norm": 17.245493463371307, + "learning_rate": 5.147931662540144e-07, + "logits/chosen": 0.7293022871017456, + "logits/rejected": 0.8059667348861694, + "logps/chosen": -8.407529830932617, + "logps/rejected": -9.319184303283691, + "loss": 0.5002, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -8.407529830932617, + "rewards/margins": 0.9116536378860474, + "rewards/rejected": -9.319184303283691, + "semantic_entropy": 0.004189362749457359, + "step": 3035 + }, + { + "epoch": 1.6270279310921558, + "grad_norm": 22.579101949094103, + "learning_rate": 5.132363761319449e-07, + "logits/chosen": 0.6055505275726318, + "logits/rejected": 0.6722275018692017, + "logps/chosen": -8.391597747802734, + "logps/rejected": -9.499109268188477, + "loss": 0.4719, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -8.391597747802734, + "rewards/margins": 1.1075109243392944, + "rewards/rejected": -9.499109268188477, + "semantic_entropy": 0.003600142430514097, + "step": 3040 + }, + { + "epoch": 1.6297039638735575, + "grad_norm": 42.73415984103176, + "learning_rate": 5.116794575904962e-07, + "logits/chosen": 0.6817172765731812, + "logits/rejected": 0.7639212608337402, + "logps/chosen": -8.343725204467773, + "logps/rejected": -9.283025741577148, + "loss": 0.5163, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.343725204467773, + "rewards/margins": 0.939300537109375, + "rewards/rejected": -9.283025741577148, + "semantic_entropy": 0.004117668606340885, + "step": 3045 + }, + { + "epoch": 1.632379996654959, + "grad_norm": 13.671217913035836, + "learning_rate": 5.101224257348987e-07, + "logits/chosen": 0.6588679552078247, + "logits/rejected": 0.7599143385887146, + "logps/chosen": -8.530394554138184, + "logps/rejected": -9.683368682861328, + "loss": 0.4385, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -8.530394554138184, + "rewards/margins": 1.1529743671417236, + "rewards/rejected": -9.683368682861328, + "semantic_entropy": 0.003528149798512459, + "step": 3050 + }, + { + "epoch": 1.6350560294363605, + "grad_norm": 17.926663757907765, + "learning_rate": 5.085652956714823e-07, + "logits/chosen": 0.6311002373695374, + "logits/rejected": 0.7291450500488281, + "logps/chosen": -8.84853744506836, + "logps/rejected": -9.728075981140137, + "loss": 0.5199, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -8.84853744506836, + "rewards/margins": 0.8795391917228699, + "rewards/rejected": -9.728075981140137, + "semantic_entropy": 0.0028904026839882135, + "step": 3055 + }, + { + "epoch": 1.6377320622177622, + "grad_norm": 17.340322531594975, + "learning_rate": 5.070080825075298e-07, + "logits/chosen": 0.7018830180168152, + "logits/rejected": 0.8387192487716675, + "logps/chosen": -8.556188583374023, + "logps/rejected": -9.5839204788208, + "loss": 0.5109, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.556188583374023, + "rewards/margins": 1.0277318954467773, + "rewards/rejected": -9.5839204788208, + "semantic_entropy": 0.004027285613119602, + "step": 3060 + }, + { + "epoch": 1.6404080949991637, + "grad_norm": 15.73972180449081, + "learning_rate": 5.0545080135113e-07, + "logits/chosen": 0.6703477501869202, + "logits/rejected": 0.7229039669036865, + "logps/chosen": -8.628973007202148, + "logps/rejected": -9.581435203552246, + "loss": 0.5456, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -8.628973007202148, + "rewards/margins": 0.9524634480476379, + "rewards/rejected": -9.581435203552246, + "semantic_entropy": 0.003708144649863243, + "step": 3065 + }, + { + "epoch": 1.6430841277805652, + "grad_norm": 23.480077098467, + "learning_rate": 5.038934673110316e-07, + "logits/chosen": 0.6456252336502075, + "logits/rejected": 0.7398085594177246, + "logps/chosen": -8.677080154418945, + "logps/rejected": -9.685178756713867, + "loss": 0.5225, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -8.677080154418945, + "rewards/margins": 1.0080986022949219, + "rewards/rejected": -9.685178756713867, + "semantic_entropy": 0.0029380209743976593, + "step": 3070 + }, + { + "epoch": 1.645760160561967, + "grad_norm": 15.433407858304248, + "learning_rate": 5.023360954964963e-07, + "logits/chosen": 0.6237664222717285, + "logits/rejected": 0.6907469630241394, + "logps/chosen": -8.563250541687012, + "logps/rejected": -9.583332061767578, + "loss": 0.4477, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -8.563250541687012, + "rewards/margins": 1.0200810432434082, + "rewards/rejected": -9.583332061767578, + "semantic_entropy": 0.0032678351271897554, + "step": 3075 + }, + { + "epoch": 1.6484361933433684, + "grad_norm": 15.623034117961172, + "learning_rate": 5.007787010171524e-07, + "logits/chosen": 0.5593664050102234, + "logits/rejected": 0.6882971525192261, + "logps/chosen": -8.571015357971191, + "logps/rejected": -9.626784324645996, + "loss": 0.4314, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -8.571015357971191, + "rewards/margins": 1.055769681930542, + "rewards/rejected": -9.626784324645996, + "semantic_entropy": 0.003013583132997155, + "step": 3080 + }, + { + "epoch": 1.65111222612477, + "grad_norm": 19.95888030099492, + "learning_rate": 4.992212989828477e-07, + "logits/chosen": 0.6781376004219055, + "logits/rejected": 0.7037637233734131, + "logps/chosen": -8.676929473876953, + "logps/rejected": -9.475358009338379, + "loss": 0.523, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.676929473876953, + "rewards/margins": 0.7984285354614258, + "rewards/rejected": -9.475358009338379, + "semantic_entropy": 0.003011090215295553, + "step": 3085 + }, + { + "epoch": 1.6537882589061716, + "grad_norm": 23.790232395129802, + "learning_rate": 4.976639045035036e-07, + "logits/chosen": 0.6791437268257141, + "logits/rejected": 0.7153705358505249, + "logps/chosen": -8.594088554382324, + "logps/rejected": -9.416778564453125, + "loss": 0.5839, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -8.594088554382324, + "rewards/margins": 0.8226897120475769, + "rewards/rejected": -9.416778564453125, + "semantic_entropy": 0.0035625225864350796, + "step": 3090 + }, + { + "epoch": 1.6564642916875731, + "grad_norm": 19.641758817894523, + "learning_rate": 4.961065326889683e-07, + "logits/chosen": 0.6901504397392273, + "logits/rejected": 0.7671633958816528, + "logps/chosen": -8.59467601776123, + "logps/rejected": -9.425036430358887, + "loss": 0.52, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -8.59467601776123, + "rewards/margins": 0.8303607702255249, + "rewards/rejected": -9.425036430358887, + "semantic_entropy": 0.003136052517220378, + "step": 3095 + }, + { + "epoch": 1.6591403244689746, + "grad_norm": 21.418887952150815, + "learning_rate": 4.9454919864887e-07, + "logits/chosen": 0.6037132143974304, + "logits/rejected": 0.7032198905944824, + "logps/chosen": -8.467050552368164, + "logps/rejected": -9.3862886428833, + "loss": 0.52, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -8.467050552368164, + "rewards/margins": 0.9192383885383606, + "rewards/rejected": -9.3862886428833, + "semantic_entropy": 0.0032492957543581724, + "step": 3100 + }, + { + "epoch": 1.6618163572503764, + "grad_norm": 23.776890869160376, + "learning_rate": 4.929919174924701e-07, + "logits/chosen": 0.6591798663139343, + "logits/rejected": 0.7749283909797668, + "logps/chosen": -8.419390678405762, + "logps/rejected": -9.24329948425293, + "loss": 0.5334, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.419390678405762, + "rewards/margins": 0.8239078521728516, + "rewards/rejected": -9.24329948425293, + "semantic_entropy": 0.003245703876018524, + "step": 3105 + }, + { + "epoch": 1.6644923900317778, + "grad_norm": 14.314831569908868, + "learning_rate": 4.914347043285177e-07, + "logits/chosen": 0.695237398147583, + "logits/rejected": 0.7862176299095154, + "logps/chosen": -8.383251190185547, + "logps/rejected": -9.337328910827637, + "loss": 0.474, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -8.383251190185547, + "rewards/margins": 0.954079270362854, + "rewards/rejected": -9.337328910827637, + "semantic_entropy": 0.0036495565436780453, + "step": 3110 + }, + { + "epoch": 1.6671684228131793, + "grad_norm": 17.93218545649754, + "learning_rate": 4.898775742651013e-07, + "logits/chosen": 0.6778665781021118, + "logits/rejected": 0.7646596431732178, + "logps/chosen": -8.393171310424805, + "logps/rejected": -9.438430786132812, + "loss": 0.4243, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -8.393171310424805, + "rewards/margins": 1.0452605485916138, + "rewards/rejected": -9.438430786132812, + "semantic_entropy": 0.0038520165253430605, + "step": 3115 + }, + { + "epoch": 1.669844455594581, + "grad_norm": 11.631955236946794, + "learning_rate": 4.883205424095037e-07, + "logits/chosen": 0.6586011648178101, + "logits/rejected": 0.7384502291679382, + "logps/chosen": -8.368680000305176, + "logps/rejected": -9.391559600830078, + "loss": 0.4636, + "rewards/accuracies": 0.78125, + "rewards/chosen": -8.368680000305176, + "rewards/margins": 1.0228804349899292, + "rewards/rejected": -9.391559600830078, + "semantic_entropy": 0.0038888491690158844, + "step": 3120 + }, + { + "epoch": 1.6725204883759828, + "grad_norm": 17.817638170727182, + "learning_rate": 4.86763623868055e-07, + "logits/chosen": 0.7192140817642212, + "logits/rejected": 0.7774937152862549, + "logps/chosen": -8.442755699157715, + "logps/rejected": -9.343069076538086, + "loss": 0.5072, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -8.442755699157715, + "rewards/margins": 0.9003141522407532, + "rewards/rejected": -9.343069076538086, + "semantic_entropy": 0.0035094446502625942, + "step": 3125 + }, + { + "epoch": 1.675196521157384, + "grad_norm": 15.880665450221413, + "learning_rate": 4.852068337459856e-07, + "logits/chosen": 0.7191354036331177, + "logits/rejected": 0.788988471031189, + "logps/chosen": -8.542181015014648, + "logps/rejected": -9.512245178222656, + "loss": 0.4687, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.542181015014648, + "rewards/margins": 0.9700649380683899, + "rewards/rejected": -9.512245178222656, + "semantic_entropy": 0.0031049910467118025, + "step": 3130 + }, + { + "epoch": 1.6778725539387858, + "grad_norm": 20.022219719640713, + "learning_rate": 4.8365018714728e-07, + "logits/chosen": 0.798575222492218, + "logits/rejected": 0.8416634798049927, + "logps/chosen": -8.635334968566895, + "logps/rejected": -9.463285446166992, + "loss": 0.5202, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.635334968566895, + "rewards/margins": 0.8279510736465454, + "rewards/rejected": -9.463285446166992, + "semantic_entropy": 0.0029447092674672604, + "step": 3135 + }, + { + "epoch": 1.6805485867201875, + "grad_norm": 22.26373435182509, + "learning_rate": 4.820936991745304e-07, + "logits/chosen": 0.6276187896728516, + "logits/rejected": 0.6913308501243591, + "logps/chosen": -8.587217330932617, + "logps/rejected": -9.444761276245117, + "loss": 0.5068, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -8.587217330932617, + "rewards/margins": 0.8575426936149597, + "rewards/rejected": -9.444761276245117, + "semantic_entropy": 0.003062673145905137, + "step": 3140 + }, + { + "epoch": 1.6832246195015887, + "grad_norm": 26.04794148992061, + "learning_rate": 4.8053738492879e-07, + "logits/chosen": 0.6948825120925903, + "logits/rejected": 0.7602331042289734, + "logps/chosen": -8.406000137329102, + "logps/rejected": -9.530774116516113, + "loss": 0.4554, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -8.406000137329102, + "rewards/margins": 1.1247742176055908, + "rewards/rejected": -9.530774116516113, + "semantic_entropy": 0.0036121797747910023, + "step": 3145 + }, + { + "epoch": 1.6859006522829905, + "grad_norm": 23.24113283268114, + "learning_rate": 4.789812595094265e-07, + "logits/chosen": 0.6636757254600525, + "logits/rejected": 0.7241615653038025, + "logps/chosen": -8.501133918762207, + "logps/rejected": -9.556479454040527, + "loss": 0.4467, + "rewards/accuracies": 0.78125, + "rewards/chosen": -8.501133918762207, + "rewards/margins": 1.0553454160690308, + "rewards/rejected": -9.556479454040527, + "semantic_entropy": 0.00418940931558609, + "step": 3150 + }, + { + "epoch": 1.6885766850643922, + "grad_norm": 17.478492942232236, + "learning_rate": 4.774253380139752e-07, + "logits/chosen": 0.6438261270523071, + "logits/rejected": 0.7361315488815308, + "logps/chosen": -8.412601470947266, + "logps/rejected": -9.485505104064941, + "loss": 0.4467, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.412601470947266, + "rewards/margins": 1.0729031562805176, + "rewards/rejected": -9.485505104064941, + "semantic_entropy": 0.0037474199198186398, + "step": 3155 + }, + { + "epoch": 1.6912527178457935, + "grad_norm": 19.822571481610865, + "learning_rate": 4.758696355379936e-07, + "logits/chosen": 0.7401809692382812, + "logits/rejected": 0.7346007227897644, + "logps/chosen": -8.39743423461914, + "logps/rejected": -9.354679107666016, + "loss": 0.4803, + "rewards/accuracies": 0.78125, + "rewards/chosen": -8.39743423461914, + "rewards/margins": 0.9572445154190063, + "rewards/rejected": -9.354679107666016, + "semantic_entropy": 0.004037821665406227, + "step": 3160 + }, + { + "epoch": 1.6939287506271952, + "grad_norm": 18.508878104944426, + "learning_rate": 4.743141671749138e-07, + "logits/chosen": 0.6463350057601929, + "logits/rejected": 0.7294069528579712, + "logps/chosen": -8.58276081085205, + "logps/rejected": -9.354612350463867, + "loss": 0.5592, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -8.58276081085205, + "rewards/margins": 0.7718508243560791, + "rewards/rejected": -9.354612350463867, + "semantic_entropy": 0.0035912543535232544, + "step": 3165 + }, + { + "epoch": 1.6966047834085969, + "grad_norm": 19.828630412175407, + "learning_rate": 4.727589480158968e-07, + "logits/chosen": 0.6823207139968872, + "logits/rejected": 0.7240070104598999, + "logps/chosen": -8.653319358825684, + "logps/rejected": -9.661191940307617, + "loss": 0.4801, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -8.653319358825684, + "rewards/margins": 1.0078718662261963, + "rewards/rejected": -9.661191940307617, + "semantic_entropy": 0.0033484199084341526, + "step": 3170 + }, + { + "epoch": 1.6992808161899984, + "grad_norm": 20.43246886248836, + "learning_rate": 4.712039931496855e-07, + "logits/chosen": 0.6765194535255432, + "logits/rejected": 0.7426118850708008, + "logps/chosen": -8.664289474487305, + "logps/rejected": -9.393746376037598, + "loss": 0.5722, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -8.664289474487305, + "rewards/margins": 0.7294565439224243, + "rewards/rejected": -9.393746376037598, + "semantic_entropy": 0.003087881486862898, + "step": 3175 + }, + { + "epoch": 1.7019568489713999, + "grad_norm": 21.003656385946787, + "learning_rate": 4.6964931766245905e-07, + "logits/chosen": 0.7278314828872681, + "logits/rejected": 0.7725498080253601, + "logps/chosen": -8.796308517456055, + "logps/rejected": -9.755891799926758, + "loss": 0.4998, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.796308517456055, + "rewards/margins": 0.9595831036567688, + "rewards/rejected": -9.755891799926758, + "semantic_entropy": 0.002779710106551647, + "step": 3180 + }, + { + "epoch": 1.7046328817528016, + "grad_norm": 21.709608881311866, + "learning_rate": 4.6809493663768575e-07, + "logits/chosen": 0.6481348276138306, + "logits/rejected": 0.6695024967193604, + "logps/chosen": -8.799505233764648, + "logps/rejected": -9.460579872131348, + "loss": 0.5856, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -8.799505233764648, + "rewards/margins": 0.6610761880874634, + "rewards/rejected": -9.460579872131348, + "semantic_entropy": 0.0028663822449743748, + "step": 3185 + }, + { + "epoch": 1.707308914534203, + "grad_norm": 16.96070709578509, + "learning_rate": 4.6654086515597716e-07, + "logits/chosen": 0.59629225730896, + "logits/rejected": 0.6733515858650208, + "logps/chosen": -8.772279739379883, + "logps/rejected": -9.883055686950684, + "loss": 0.4559, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -8.772279739379883, + "rewards/margins": 1.1107757091522217, + "rewards/rejected": -9.883055686950684, + "semantic_entropy": 0.0028946802485734224, + "step": 3190 + }, + { + "epoch": 1.7099849473156046, + "grad_norm": 15.2495463629594, + "learning_rate": 4.6498711829494154e-07, + "logits/chosen": 0.6147344708442688, + "logits/rejected": 0.6999740600585938, + "logps/chosen": -8.856141090393066, + "logps/rejected": -9.772600173950195, + "loss": 0.5042, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -8.856141090393066, + "rewards/margins": 0.916458010673523, + "rewards/rejected": -9.772600173950195, + "semantic_entropy": 0.002791165839880705, + "step": 3195 + }, + { + "epoch": 1.7126609800970063, + "grad_norm": 17.706370672383937, + "learning_rate": 4.6343371112903777e-07, + "logits/chosen": 0.7594738006591797, + "logits/rejected": 0.8430054783821106, + "logps/chosen": -8.928936958312988, + "logps/rejected": -9.845270156860352, + "loss": 0.5524, + "rewards/accuracies": 0.65625, + "rewards/chosen": -8.928936958312988, + "rewards/margins": 0.916333019733429, + "rewards/rejected": -9.845270156860352, + "semantic_entropy": 0.002889876952394843, + "step": 3200 + }, + { + "epoch": 1.7126609800970063, + "eval_logits/chosen": 0.8485396504402161, + "eval_logits/rejected": 0.9051938652992249, + "eval_logps/chosen": -8.875685691833496, + "eval_logps/rejected": -9.834607124328613, + "eval_loss": 0.5206592679023743, + "eval_rewards/accuracies": 0.716617226600647, + "eval_rewards/chosen": -8.875685691833496, + "eval_rewards/margins": 0.9589214324951172, + "eval_rewards/rejected": -9.834607124328613, + "eval_runtime": 35.3345, + "eval_samples_per_second": 38.065, + "eval_semantic_entropy": 0.0029725246131420135, + "eval_steps_per_second": 9.537, + "step": 3200 + }, + { + "epoch": 1.7153370128784078, + "grad_norm": 16.489166146612195, + "learning_rate": 4.618806587294291e-07, + "logits/chosen": 0.6345168948173523, + "logits/rejected": 0.724500834941864, + "logps/chosen": -8.844565391540527, + "logps/rejected": -9.868181228637695, + "loss": 0.491, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -8.844565391540527, + "rewards/margins": 1.023616075515747, + "rewards/rejected": -9.868181228637695, + "semantic_entropy": 0.0029267659410834312, + "step": 3205 + }, + { + "epoch": 1.7180130456598093, + "grad_norm": 20.284904352984434, + "learning_rate": 4.603279761638365e-07, + "logits/chosen": 0.6574803590774536, + "logits/rejected": 0.7301944494247437, + "logps/chosen": -8.73315143585205, + "logps/rejected": -9.601076126098633, + "loss": 0.5384, + "rewards/accuracies": 0.71875, + "rewards/chosen": -8.73315143585205, + "rewards/margins": 0.8679240942001343, + "rewards/rejected": -9.601076126098633, + "semantic_entropy": 0.003197681624442339, + "step": 3210 + }, + { + "epoch": 1.720689078441211, + "grad_norm": 18.178656885884827, + "learning_rate": 4.5877567849639315e-07, + "logits/chosen": 0.7295519709587097, + "logits/rejected": 0.775715708732605, + "logps/chosen": -8.844693183898926, + "logps/rejected": -9.844103813171387, + "loss": 0.4747, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -8.844693183898926, + "rewards/margins": 0.9994112253189087, + "rewards/rejected": -9.844103813171387, + "semantic_entropy": 0.003269757376983762, + "step": 3215 + }, + { + "epoch": 1.7233651112226125, + "grad_norm": 18.18536875280265, + "learning_rate": 4.572237807874979e-07, + "logits/chosen": 0.7071816325187683, + "logits/rejected": 0.8377809524536133, + "logps/chosen": -9.233766555786133, + "logps/rejected": -10.121223449707031, + "loss": 0.5734, + "rewards/accuracies": 0.6875, + "rewards/chosen": -9.233766555786133, + "rewards/margins": 0.887457549571991, + "rewards/rejected": -10.121223449707031, + "semantic_entropy": 0.0021587200462818146, + "step": 3220 + }, + { + "epoch": 1.726041144004014, + "grad_norm": 19.728824808262466, + "learning_rate": 4.5567229809366895e-07, + "logits/chosen": 0.7191265225410461, + "logits/rejected": 0.7812397480010986, + "logps/chosen": -8.780452728271484, + "logps/rejected": -9.705193519592285, + "loss": 0.5172, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.780452728271484, + "rewards/margins": 0.9247404932975769, + "rewards/rejected": -9.705193519592285, + "semantic_entropy": 0.0030050217173993587, + "step": 3225 + }, + { + "epoch": 1.7287171767854157, + "grad_norm": 22.902320646291393, + "learning_rate": 4.541212454673984e-07, + "logits/chosen": 0.7195814847946167, + "logits/rejected": 0.7792760133743286, + "logps/chosen": -9.003668785095215, + "logps/rejected": -10.163053512573242, + "loss": 0.4755, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -9.003668785095215, + "rewards/margins": 1.159385085105896, + "rewards/rejected": -10.163053512573242, + "semantic_entropy": 0.0027733384631574154, + "step": 3230 + }, + { + "epoch": 1.7313932095668172, + "grad_norm": 21.27644304238722, + "learning_rate": 4.525706379570055e-07, + "logits/chosen": 0.754095196723938, + "logits/rejected": 0.8056744337081909, + "logps/chosen": -8.933822631835938, + "logps/rejected": -9.91698169708252, + "loss": 0.5001, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -8.933822631835938, + "rewards/margins": 0.983159065246582, + "rewards/rejected": -9.91698169708252, + "semantic_entropy": 0.002852677833288908, + "step": 3235 + }, + { + "epoch": 1.7340692423482187, + "grad_norm": 16.305218184451377, + "learning_rate": 4.510204906064911e-07, + "logits/chosen": 0.7781286239624023, + "logits/rejected": 0.8381627798080444, + "logps/chosen": -9.009490013122559, + "logps/rejected": -10.12246036529541, + "loss": 0.4383, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.009490013122559, + "rewards/margins": 1.1129701137542725, + "rewards/rejected": -10.12246036529541, + "semantic_entropy": 0.0021688812412321568, + "step": 3240 + }, + { + "epoch": 1.7367452751296204, + "grad_norm": 21.167999522279032, + "learning_rate": 4.4947081845539177e-07, + "logits/chosen": 0.7233031988143921, + "logits/rejected": 0.7836328148841858, + "logps/chosen": -9.125692367553711, + "logps/rejected": -10.023492813110352, + "loss": 0.5164, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -9.125692367553711, + "rewards/margins": 0.8977994918823242, + "rewards/rejected": -10.023492813110352, + "semantic_entropy": 0.0024925144389271736, + "step": 3245 + }, + { + "epoch": 1.739421307911022, + "grad_norm": 17.847444177521478, + "learning_rate": 4.479216365386333e-07, + "logits/chosen": 0.7969452142715454, + "logits/rejected": 0.8838433027267456, + "logps/chosen": -9.0504789352417, + "logps/rejected": -10.168962478637695, + "loss": 0.445, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -9.0504789352417, + "rewards/margins": 1.1184842586517334, + "rewards/rejected": -10.168962478637695, + "semantic_entropy": 0.00228295405395329, + "step": 3250 + }, + { + "epoch": 1.7420973406924234, + "grad_norm": 13.863230971283963, + "learning_rate": 4.4637295988638555e-07, + "logits/chosen": 0.7870410680770874, + "logits/rejected": 0.8611448407173157, + "logps/chosen": -8.87813663482666, + "logps/rejected": -9.888373374938965, + "loss": 0.4735, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.87813663482666, + "rewards/margins": 1.0102384090423584, + "rewards/rejected": -9.888373374938965, + "semantic_entropy": 0.00256515690125525, + "step": 3255 + }, + { + "epoch": 1.744773373473825, + "grad_norm": 23.662117895070335, + "learning_rate": 4.4482480352391623e-07, + "logits/chosen": 0.6543598175048828, + "logits/rejected": 0.7426427006721497, + "logps/chosen": -9.041504859924316, + "logps/rejected": -10.04463005065918, + "loss": 0.4806, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -9.041504859924316, + "rewards/margins": 1.0031250715255737, + "rewards/rejected": -10.04463005065918, + "semantic_entropy": 0.0023853727616369724, + "step": 3260 + }, + { + "epoch": 1.7474494062552266, + "grad_norm": 24.256153218206705, + "learning_rate": 4.4327718247144507e-07, + "logits/chosen": 0.7513245940208435, + "logits/rejected": 0.8328276872634888, + "logps/chosen": -9.090994834899902, + "logps/rejected": -10.088811874389648, + "loss": 0.4761, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.090994834899902, + "rewards/margins": 0.9978184700012207, + "rewards/rejected": -10.088811874389648, + "semantic_entropy": 0.002217040164396167, + "step": 3265 + }, + { + "epoch": 1.750125439036628, + "grad_norm": 26.314796306829628, + "learning_rate": 4.417301117439984e-07, + "logits/chosen": 0.7460024356842041, + "logits/rejected": 0.8103092312812805, + "logps/chosen": -9.169168472290039, + "logps/rejected": -10.078702926635742, + "loss": 0.5253, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -9.169168472290039, + "rewards/margins": 0.9095350503921509, + "rewards/rejected": -10.078702926635742, + "semantic_entropy": 0.0022672966588288546, + "step": 3270 + }, + { + "epoch": 1.7528014718180298, + "grad_norm": 18.707979583798863, + "learning_rate": 4.401836063512631e-07, + "logits/chosen": 0.7222810983657837, + "logits/rejected": 0.8605899810791016, + "logps/chosen": -8.943084716796875, + "logps/rejected": -10.04680347442627, + "loss": 0.4723, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -8.943084716796875, + "rewards/margins": 1.1037187576293945, + "rewards/rejected": -10.04680347442627, + "semantic_entropy": 0.002709039021283388, + "step": 3275 + }, + { + "epoch": 1.7554775045994313, + "grad_norm": 24.168153990326203, + "learning_rate": 4.386376812974413e-07, + "logits/chosen": 0.6883140802383423, + "logits/rejected": 0.7442909479141235, + "logps/chosen": -8.88626766204834, + "logps/rejected": -9.887288093566895, + "loss": 0.4843, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -8.88626766204834, + "rewards/margins": 1.0010201930999756, + "rewards/rejected": -9.887288093566895, + "semantic_entropy": 0.002547713927924633, + "step": 3280 + }, + { + "epoch": 1.7581535373808328, + "grad_norm": 21.306584019538054, + "learning_rate": 4.370923515811048e-07, + "logits/chosen": 0.7414734363555908, + "logits/rejected": 0.8382769823074341, + "logps/chosen": -9.08434009552002, + "logps/rejected": -10.091033935546875, + "loss": 0.4818, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -9.08434009552002, + "rewards/margins": 1.0066949129104614, + "rewards/rejected": -10.091033935546875, + "semantic_entropy": 0.002196715446189046, + "step": 3285 + }, + { + "epoch": 1.7608295701622345, + "grad_norm": 17.567070937529156, + "learning_rate": 4.35547632195049e-07, + "logits/chosen": 0.7264934778213501, + "logits/rejected": 0.8058233261108398, + "logps/chosen": -8.905702590942383, + "logps/rejected": -9.89527416229248, + "loss": 0.456, + "rewards/accuracies": 0.78125, + "rewards/chosen": -8.905702590942383, + "rewards/margins": 0.9895727038383484, + "rewards/rejected": -9.89527416229248, + "semantic_entropy": 0.0023129256442189217, + "step": 3290 + }, + { + "epoch": 1.763505602943636, + "grad_norm": 21.091679239749574, + "learning_rate": 4.340035381261484e-07, + "logits/chosen": 0.7000614404678345, + "logits/rejected": 0.7599838972091675, + "logps/chosen": -9.041738510131836, + "logps/rejected": -10.077522277832031, + "loss": 0.4989, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -9.041738510131836, + "rewards/margins": 1.0357847213745117, + "rewards/rejected": -10.077522277832031, + "semantic_entropy": 0.002556400140747428, + "step": 3295 + }, + { + "epoch": 1.7661816357250375, + "grad_norm": 19.4750798931357, + "learning_rate": 4.324600843552104e-07, + "logits/chosen": 0.61224764585495, + "logits/rejected": 0.694617509841919, + "logps/chosen": -9.045601844787598, + "logps/rejected": -10.111780166625977, + "loss": 0.5129, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.045601844787598, + "rewards/margins": 1.0661789178848267, + "rewards/rejected": -10.111780166625977, + "semantic_entropy": 0.002750884275883436, + "step": 3300 + }, + { + "epoch": 1.7688576685064392, + "grad_norm": 27.700841135900863, + "learning_rate": 4.309172858568302e-07, + "logits/chosen": 0.6138121485710144, + "logits/rejected": 0.7267636060714722, + "logps/chosen": -8.853018760681152, + "logps/rejected": -9.896512985229492, + "loss": 0.4664, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -8.853018760681152, + "rewards/margins": 1.0434927940368652, + "rewards/rejected": -9.896512985229492, + "semantic_entropy": 0.002849545329809189, + "step": 3305 + }, + { + "epoch": 1.771533701287841, + "grad_norm": 17.137464902365757, + "learning_rate": 4.293751575992455e-07, + "logits/chosen": 0.7429224848747253, + "logits/rejected": 0.792006254196167, + "logps/chosen": -8.867963790893555, + "logps/rejected": -9.816696166992188, + "loss": 0.4852, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.867963790893555, + "rewards/margins": 0.9487320184707642, + "rewards/rejected": -9.816696166992188, + "semantic_entropy": 0.0024321440141648054, + "step": 3310 + }, + { + "epoch": 1.7742097340692422, + "grad_norm": 23.093031626174174, + "learning_rate": 4.278337145441916e-07, + "logits/chosen": 0.703718900680542, + "logits/rejected": 0.7882632613182068, + "logps/chosen": -8.92485237121582, + "logps/rejected": -9.829282760620117, + "loss": 0.4997, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -8.92485237121582, + "rewards/margins": 0.904431939125061, + "rewards/rejected": -9.829282760620117, + "semantic_entropy": 0.00211041746661067, + "step": 3315 + }, + { + "epoch": 1.776885766850644, + "grad_norm": 14.81174004133826, + "learning_rate": 4.262929716467556e-07, + "logits/chosen": 0.7307204008102417, + "logits/rejected": 0.828132152557373, + "logps/chosen": -8.699949264526367, + "logps/rejected": -9.868879318237305, + "loss": 0.4528, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -8.699949264526367, + "rewards/margins": 1.1689304113388062, + "rewards/rejected": -9.868879318237305, + "semantic_entropy": 0.0027522039599716663, + "step": 3320 + }, + { + "epoch": 1.7795617996320456, + "grad_norm": 21.68742926157539, + "learning_rate": 4.247529438552321e-07, + "logits/chosen": 0.6795674562454224, + "logits/rejected": 0.7630687355995178, + "logps/chosen": -8.83338451385498, + "logps/rejected": -9.718835830688477, + "loss": 0.5331, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -8.83338451385498, + "rewards/margins": 0.8854507207870483, + "rewards/rejected": -9.718835830688477, + "semantic_entropy": 0.0027334585320204496, + "step": 3325 + }, + { + "epoch": 1.782237832413447, + "grad_norm": 17.78803459405486, + "learning_rate": 4.232136461109773e-07, + "logits/chosen": 0.6920473575592041, + "logits/rejected": 0.7552872896194458, + "logps/chosen": -8.73144245147705, + "logps/rejected": -9.89186954498291, + "loss": 0.4425, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -8.73144245147705, + "rewards/margins": 1.1604268550872803, + "rewards/rejected": -9.89186954498291, + "semantic_entropy": 0.0029787137173116207, + "step": 3330 + }, + { + "epoch": 1.7849138651948486, + "grad_norm": 26.15974232065996, + "learning_rate": 4.216750933482646e-07, + "logits/chosen": 0.6749182939529419, + "logits/rejected": 0.7685472965240479, + "logps/chosen": -8.99049186706543, + "logps/rejected": -9.847522735595703, + "loss": 0.5483, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -8.99049186706543, + "rewards/margins": 0.8570305705070496, + "rewards/rejected": -9.847522735595703, + "semantic_entropy": 0.002466335194185376, + "step": 3335 + }, + { + "epoch": 1.7875898979762503, + "grad_norm": 27.88902855871644, + "learning_rate": 4.2013730049413986e-07, + "logits/chosen": 0.7373770475387573, + "logits/rejected": 0.8123876452445984, + "logps/chosen": -8.785151481628418, + "logps/rejected": -9.975650787353516, + "loss": 0.4473, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -8.785151481628418, + "rewards/margins": 1.1904983520507812, + "rewards/rejected": -9.975650787353516, + "semantic_entropy": 0.0027192619163542986, + "step": 3340 + }, + { + "epoch": 1.7902659307576518, + "grad_norm": 18.711785224251184, + "learning_rate": 4.1860028246827594e-07, + "logits/chosen": 0.7438098788261414, + "logits/rejected": 0.8496102094650269, + "logps/chosen": -8.687559127807617, + "logps/rejected": -9.705583572387695, + "loss": 0.4863, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -8.687559127807617, + "rewards/margins": 1.0180258750915527, + "rewards/rejected": -9.705583572387695, + "semantic_entropy": 0.0030276733450591564, + "step": 3345 + }, + { + "epoch": 1.7929419635390533, + "grad_norm": 14.740013723796874, + "learning_rate": 4.170640541828285e-07, + "logits/chosen": 0.6757484674453735, + "logits/rejected": 0.7701447606086731, + "logps/chosen": -8.937789916992188, + "logps/rejected": -9.954288482666016, + "loss": 0.4742, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -8.937789916992188, + "rewards/margins": 1.0164979696273804, + "rewards/rejected": -9.954288482666016, + "semantic_entropy": 0.0024529777001589537, + "step": 3350 + }, + { + "epoch": 1.795617996320455, + "grad_norm": 23.48077638997515, + "learning_rate": 4.1552863054229116e-07, + "logits/chosen": 0.7250600457191467, + "logits/rejected": 0.7637051343917847, + "logps/chosen": -8.986780166625977, + "logps/rejected": -9.95530891418457, + "loss": 0.5204, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -8.986780166625977, + "rewards/margins": 0.968528151512146, + "rewards/rejected": -9.95530891418457, + "semantic_entropy": 0.0026002321392297745, + "step": 3355 + }, + { + "epoch": 1.7982940291018565, + "grad_norm": 21.480655584717656, + "learning_rate": 4.139940264433508e-07, + "logits/chosen": 0.6162451505661011, + "logits/rejected": 0.6867518424987793, + "logps/chosen": -8.727703094482422, + "logps/rejected": -9.817054748535156, + "loss": 0.485, + "rewards/accuracies": 0.71875, + "rewards/chosen": -8.727703094482422, + "rewards/margins": 1.0893512964248657, + "rewards/rejected": -9.817054748535156, + "semantic_entropy": 0.0029442054219543934, + "step": 3360 + }, + { + "epoch": 1.800970061883258, + "grad_norm": 18.329533144366998, + "learning_rate": 4.1246025677474303e-07, + "logits/chosen": 0.6584054231643677, + "logits/rejected": 0.7416545152664185, + "logps/chosen": -8.870896339416504, + "logps/rejected": -9.782048225402832, + "loss": 0.4965, + "rewards/accuracies": 0.8125, + "rewards/chosen": -8.870896339416504, + "rewards/margins": 0.9111523628234863, + "rewards/rejected": -9.782048225402832, + "semantic_entropy": 0.0025828261859714985, + "step": 3365 + }, + { + "epoch": 1.8036460946646597, + "grad_norm": 20.53246303343791, + "learning_rate": 4.10927336417108e-07, + "logits/chosen": 0.699885368347168, + "logits/rejected": 0.7778645753860474, + "logps/chosen": -9.00536060333252, + "logps/rejected": -9.712576866149902, + "loss": 0.6029, + "rewards/accuracies": 0.71875, + "rewards/chosen": -9.00536060333252, + "rewards/margins": 0.7072166204452515, + "rewards/rejected": -9.712576866149902, + "semantic_entropy": 0.0022258516401052475, + "step": 3370 + }, + { + "epoch": 1.8063221274460612, + "grad_norm": 17.92382157104711, + "learning_rate": 4.093952802428457e-07, + "logits/chosen": 0.7124849557876587, + "logits/rejected": 0.773395836353302, + "logps/chosen": -9.165143013000488, + "logps/rejected": -10.00512409210205, + "loss": 0.5968, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -9.165143013000488, + "rewards/margins": 0.8399818539619446, + "rewards/rejected": -10.00512409210205, + "semantic_entropy": 0.0018791807815432549, + "step": 3375 + }, + { + "epoch": 1.8089981602274627, + "grad_norm": 16.031873167807106, + "learning_rate": 4.0786410311597184e-07, + "logits/chosen": 0.6675196886062622, + "logits/rejected": 0.7558413743972778, + "logps/chosen": -8.87452220916748, + "logps/rejected": -9.87469482421875, + "loss": 0.5014, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.87452220916748, + "rewards/margins": 1.0001723766326904, + "rewards/rejected": -9.87469482421875, + "semantic_entropy": 0.0024310979060828686, + "step": 3380 + }, + { + "epoch": 1.8116741930088645, + "grad_norm": 17.18330270756532, + "learning_rate": 4.063338198919737e-07, + "logits/chosen": 0.6833704710006714, + "logits/rejected": 0.6954981684684753, + "logps/chosen": -8.974740982055664, + "logps/rejected": -9.834104537963867, + "loss": 0.515, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -8.974740982055664, + "rewards/margins": 0.8593646883964539, + "rewards/rejected": -9.834104537963867, + "semantic_entropy": 0.002467888640239835, + "step": 3385 + }, + { + "epoch": 1.814350225790266, + "grad_norm": 30.761929745103753, + "learning_rate": 4.0480444541766575e-07, + "logits/chosen": 0.7065908908843994, + "logits/rejected": 0.761638343334198, + "logps/chosen": -9.091318130493164, + "logps/rejected": -9.837724685668945, + "loss": 0.5966, + "rewards/accuracies": 0.6875, + "rewards/chosen": -9.091318130493164, + "rewards/margins": 0.7464063763618469, + "rewards/rejected": -9.837724685668945, + "semantic_entropy": 0.002267292933538556, + "step": 3390 + }, + { + "epoch": 1.8170262585716674, + "grad_norm": 17.437367219946186, + "learning_rate": 4.0327599453104606e-07, + "logits/chosen": 0.6314225792884827, + "logits/rejected": 0.7021452784538269, + "logps/chosen": -8.934675216674805, + "logps/rejected": -9.954093933105469, + "loss": 0.4614, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.934675216674805, + "rewards/margins": 1.019417405128479, + "rewards/rejected": -9.954093933105469, + "semantic_entropy": 0.002328323433175683, + "step": 3395 + }, + { + "epoch": 1.8197022913530692, + "grad_norm": 23.666717824573407, + "learning_rate": 4.017484820611514e-07, + "logits/chosen": 0.6827374696731567, + "logits/rejected": 0.7666773796081543, + "logps/chosen": -9.024335861206055, + "logps/rejected": -9.977819442749023, + "loss": 0.499, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -9.024335861206055, + "rewards/margins": 0.9534839391708374, + "rewards/rejected": -9.977819442749023, + "semantic_entropy": 0.002623113337904215, + "step": 3400 + }, + { + "epoch": 1.8223783241344707, + "grad_norm": 19.47814600029652, + "learning_rate": 4.002219228279148e-07, + "logits/chosen": 0.6472792029380798, + "logits/rejected": 0.7226368188858032, + "logps/chosen": -9.10517406463623, + "logps/rejected": -9.983365058898926, + "loss": 0.4792, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -9.10517406463623, + "rewards/margins": 0.878190815448761, + "rewards/rejected": -9.983365058898926, + "semantic_entropy": 0.001885834732092917, + "step": 3405 + }, + { + "epoch": 1.8250543569158721, + "grad_norm": 15.863322204582039, + "learning_rate": 3.9869633164202045e-07, + "logits/chosen": 0.6499922871589661, + "logits/rejected": 0.7298166751861572, + "logps/chosen": -9.109966278076172, + "logps/rejected": -10.095043182373047, + "loss": 0.4607, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.109966278076172, + "rewards/margins": 0.9850764274597168, + "rewards/rejected": -10.095043182373047, + "semantic_entropy": 0.00205561937764287, + "step": 3410 + }, + { + "epoch": 1.8277303896972739, + "grad_norm": 20.471803353130138, + "learning_rate": 3.9717172330476077e-07, + "logits/chosen": 0.6554244756698608, + "logits/rejected": 0.7187341451644897, + "logps/chosen": -8.939603805541992, + "logps/rejected": -9.954214096069336, + "loss": 0.4732, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -8.939603805541992, + "rewards/margins": 1.0146093368530273, + "rewards/rejected": -9.954214096069336, + "semantic_entropy": 0.002255493775010109, + "step": 3415 + }, + { + "epoch": 1.8304064224786754, + "grad_norm": 17.77008986004475, + "learning_rate": 3.956481126078927e-07, + "logits/chosen": 0.7610489726066589, + "logits/rejected": 0.8220788836479187, + "logps/chosen": -8.944317817687988, + "logps/rejected": -9.942276000976562, + "loss": 0.5332, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -8.944317817687988, + "rewards/margins": 0.9979581832885742, + "rewards/rejected": -9.942276000976562, + "semantic_entropy": 0.0026616621762514114, + "step": 3420 + }, + { + "epoch": 1.8330824552600768, + "grad_norm": 17.655564242212915, + "learning_rate": 3.941255143334937e-07, + "logits/chosen": 0.6452963948249817, + "logits/rejected": 0.6714794039726257, + "logps/chosen": -9.091032028198242, + "logps/rejected": -10.047563552856445, + "loss": 0.4901, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -9.091032028198242, + "rewards/margins": 0.9565309286117554, + "rewards/rejected": -10.047563552856445, + "semantic_entropy": 0.0020148297771811485, + "step": 3425 + }, + { + "epoch": 1.8357584880414786, + "grad_norm": 21.687629209942305, + "learning_rate": 3.9260394325381895e-07, + "logits/chosen": 0.6120941638946533, + "logits/rejected": 0.6991773843765259, + "logps/chosen": -8.957076072692871, + "logps/rejected": -10.24323844909668, + "loss": 0.4464, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -8.957076072692871, + "rewards/margins": 1.286162257194519, + "rewards/rejected": -10.24323844909668, + "semantic_entropy": 0.002235526219010353, + "step": 3430 + }, + { + "epoch": 1.83843452082288, + "grad_norm": 20.343261070644665, + "learning_rate": 3.9108341413115784e-07, + "logits/chosen": 0.6617427468299866, + "logits/rejected": 0.7312533259391785, + "logps/chosen": -9.039822578430176, + "logps/rejected": -10.031728744506836, + "loss": 0.4638, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.039822578430176, + "rewards/margins": 0.9919074177742004, + "rewards/rejected": -10.031728744506836, + "semantic_entropy": 0.002043725224211812, + "step": 3435 + }, + { + "epoch": 1.8411105536042816, + "grad_norm": 23.236450227403648, + "learning_rate": 3.895639417176905e-07, + "logits/chosen": 0.6440222859382629, + "logits/rejected": 0.7037031054496765, + "logps/chosen": -9.063264846801758, + "logps/rejected": -10.010323524475098, + "loss": 0.5479, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -9.063264846801758, + "rewards/margins": 0.9470599889755249, + "rewards/rejected": -10.010323524475098, + "semantic_entropy": 0.0022424368653446436, + "step": 3440 + }, + { + "epoch": 1.8437865863856833, + "grad_norm": 19.62330937214821, + "learning_rate": 3.8804554075534497e-07, + "logits/chosen": 0.6262876987457275, + "logits/rejected": 0.7453981041908264, + "logps/chosen": -8.971453666687012, + "logps/rejected": -10.038634300231934, + "loss": 0.5028, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -8.971453666687012, + "rewards/margins": 1.067180871963501, + "rewards/rejected": -10.038634300231934, + "semantic_entropy": 0.002401644829660654, + "step": 3445 + }, + { + "epoch": 1.8464626191670848, + "grad_norm": 20.685899363455015, + "learning_rate": 3.8652822597565403e-07, + "logits/chosen": 0.6409908533096313, + "logits/rejected": 0.7288905382156372, + "logps/chosen": -9.046746253967285, + "logps/rejected": -10.134294509887695, + "loss": 0.4441, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.046746253967285, + "rewards/margins": 1.0875482559204102, + "rewards/rejected": -10.134294509887695, + "semantic_entropy": 0.0022094310261309147, + "step": 3450 + }, + { + "epoch": 1.8491386519484863, + "grad_norm": 21.553543305017367, + "learning_rate": 3.850120120996123e-07, + "logits/chosen": 0.6723691821098328, + "logits/rejected": 0.797301173210144, + "logps/chosen": -9.249448776245117, + "logps/rejected": -10.260098457336426, + "loss": 0.525, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -9.249448776245117, + "rewards/margins": 1.0106487274169922, + "rewards/rejected": -10.260098457336426, + "semantic_entropy": 0.0019731963984668255, + "step": 3455 + }, + { + "epoch": 1.851814684729888, + "grad_norm": 18.840448032000573, + "learning_rate": 3.8349691383753356e-07, + "logits/chosen": 0.7298885583877563, + "logits/rejected": 0.7910041213035583, + "logps/chosen": -8.926676750183105, + "logps/rejected": -9.932170867919922, + "loss": 0.4829, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.926676750183105, + "rewards/margins": 1.0054935216903687, + "rewards/rejected": -9.932170867919922, + "semantic_entropy": 0.0026156664825975895, + "step": 3460 + }, + { + "epoch": 1.8544907175112895, + "grad_norm": 22.35639759207699, + "learning_rate": 3.819829458889078e-07, + "logits/chosen": 0.6428291201591492, + "logits/rejected": 0.6888445615768433, + "logps/chosen": -9.04511833190918, + "logps/rejected": -10.008955001831055, + "loss": 0.5023, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -9.04511833190918, + "rewards/margins": 0.9638371467590332, + "rewards/rejected": -10.008955001831055, + "semantic_entropy": 0.002010942902415991, + "step": 3465 + }, + { + "epoch": 1.857166750292691, + "grad_norm": 18.756301563038658, + "learning_rate": 3.804701229422585e-07, + "logits/chosen": 0.6267444491386414, + "logits/rejected": 0.7092264890670776, + "logps/chosen": -8.965785026550293, + "logps/rejected": -10.16661548614502, + "loss": 0.4493, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.965785026550293, + "rewards/margins": 1.2008302211761475, + "rewards/rejected": -10.16661548614502, + "semantic_entropy": 0.002212436404079199, + "step": 3470 + }, + { + "epoch": 1.8598427830740927, + "grad_norm": 20.509740408539155, + "learning_rate": 3.789584596750007e-07, + "logits/chosen": 0.644954264163971, + "logits/rejected": 0.6691696047782898, + "logps/chosen": -9.087681770324707, + "logps/rejected": -10.077864646911621, + "loss": 0.5039, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -9.087681770324707, + "rewards/margins": 0.9901838302612305, + "rewards/rejected": -10.077864646911621, + "semantic_entropy": 0.0026786925736814737, + "step": 3475 + }, + { + "epoch": 1.8625188158554944, + "grad_norm": 21.56561893288092, + "learning_rate": 3.77447970753298e-07, + "logits/chosen": 0.6968456506729126, + "logits/rejected": 0.7207155823707581, + "logps/chosen": -9.34924602508545, + "logps/rejected": -10.384513854980469, + "loss": 0.5044, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -9.34924602508545, + "rewards/margins": 1.0352654457092285, + "rewards/rejected": -10.384513854980469, + "semantic_entropy": 0.001824896433390677, + "step": 3480 + }, + { + "epoch": 1.8651948486368957, + "grad_norm": 27.331504916633676, + "learning_rate": 3.7593867083192057e-07, + "logits/chosen": 0.6225256323814392, + "logits/rejected": 0.7147785425186157, + "logps/chosen": -9.113728523254395, + "logps/rejected": -10.127340316772461, + "loss": 0.4956, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -9.113728523254395, + "rewards/margins": 1.0136115550994873, + "rewards/rejected": -10.127340316772461, + "semantic_entropy": 0.002069010864943266, + "step": 3485 + }, + { + "epoch": 1.8678708814182974, + "grad_norm": 24.045050734996725, + "learning_rate": 3.7443057455410276e-07, + "logits/chosen": 0.7259531617164612, + "logits/rejected": 0.767625093460083, + "logps/chosen": -9.031217575073242, + "logps/rejected": -10.065279006958008, + "loss": 0.453, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.031217575073242, + "rewards/margins": 1.034061312675476, + "rewards/rejected": -10.065279006958008, + "semantic_entropy": 0.002121392637491226, + "step": 3490 + }, + { + "epoch": 1.870546914199699, + "grad_norm": 22.0898928139177, + "learning_rate": 3.7292369655140145e-07, + "logits/chosen": 0.6421200037002563, + "logits/rejected": 0.7500838041305542, + "logps/chosen": -9.124689102172852, + "logps/rejected": -10.039737701416016, + "loss": 0.4787, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.124689102172852, + "rewards/margins": 0.915047824382782, + "rewards/rejected": -10.039737701416016, + "semantic_entropy": 0.0020720604807138443, + "step": 3495 + }, + { + "epoch": 1.8732229469811004, + "grad_norm": 17.368463245273844, + "learning_rate": 3.714180514435534e-07, + "logits/chosen": 0.6820253133773804, + "logits/rejected": 0.767578125, + "logps/chosen": -8.85074520111084, + "logps/rejected": -9.993677139282227, + "loss": 0.4724, + "rewards/accuracies": 0.78125, + "rewards/chosen": -8.85074520111084, + "rewards/margins": 1.1429319381713867, + "rewards/rejected": -9.993677139282227, + "semantic_entropy": 0.0029044263064861298, + "step": 3500 + }, + { + "epoch": 1.875898979762502, + "grad_norm": 26.111048656274622, + "learning_rate": 3.6991365383833426e-07, + "logits/chosen": 0.6787170767784119, + "logits/rejected": 0.7552027702331543, + "logps/chosen": -8.890897750854492, + "logps/rejected": -9.884078025817871, + "loss": 0.476, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.890897750854492, + "rewards/margins": 0.9931808710098267, + "rewards/rejected": -9.884078025817871, + "semantic_entropy": 0.0023928822483867407, + "step": 3505 + }, + { + "epoch": 1.8785750125439038, + "grad_norm": 25.720344446595462, + "learning_rate": 3.684105183314162e-07, + "logits/chosen": 0.6534699201583862, + "logits/rejected": 0.7134217619895935, + "logps/chosen": -8.654134750366211, + "logps/rejected": -9.688114166259766, + "loss": 0.4572, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -8.654134750366211, + "rewards/margins": 1.0339783430099487, + "rewards/rejected": -9.688114166259766, + "semantic_entropy": 0.0033304274547845125, + "step": 3510 + }, + { + "epoch": 1.881251045325305, + "grad_norm": 26.719905564320833, + "learning_rate": 3.669086595062263e-07, + "logits/chosen": 0.6928398609161377, + "logits/rejected": 0.7959692478179932, + "logps/chosen": -8.998773574829102, + "logps/rejected": -10.005119323730469, + "loss": 0.4712, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -8.998773574829102, + "rewards/margins": 1.0063453912734985, + "rewards/rejected": -10.005119323730469, + "semantic_entropy": 0.002207712968811393, + "step": 3515 + }, + { + "epoch": 1.8839270781067068, + "grad_norm": 18.310966271031408, + "learning_rate": 3.654080919338056e-07, + "logits/chosen": 0.6792198419570923, + "logits/rejected": 0.7416011691093445, + "logps/chosen": -9.011211395263672, + "logps/rejected": -9.959385871887207, + "loss": 0.5049, + "rewards/accuracies": 0.71875, + "rewards/chosen": -9.011211395263672, + "rewards/margins": 0.9481745958328247, + "rewards/rejected": -9.959385871887207, + "semantic_entropy": 0.0023789291735738516, + "step": 3520 + }, + { + "epoch": 1.8866031108881085, + "grad_norm": 23.54294511956383, + "learning_rate": 3.639088301726673e-07, + "logits/chosen": 0.7045190334320068, + "logits/rejected": 0.814649224281311, + "logps/chosen": -9.096199035644531, + "logps/rejected": -9.993253707885742, + "loss": 0.5336, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -9.096199035644531, + "rewards/margins": 0.8970546722412109, + "rewards/rejected": -9.993253707885742, + "semantic_entropy": 0.0019266394665464759, + "step": 3525 + }, + { + "epoch": 1.88927914366951, + "grad_norm": 25.00575901129148, + "learning_rate": 3.624108887686556e-07, + "logits/chosen": 0.717838704586029, + "logits/rejected": 0.7664039134979248, + "logps/chosen": -9.02385139465332, + "logps/rejected": -9.899523735046387, + "loss": 0.4944, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -9.02385139465332, + "rewards/margins": 0.8756723403930664, + "rewards/rejected": -9.899523735046387, + "semantic_entropy": 0.00239885738119483, + "step": 3530 + }, + { + "epoch": 1.8919551764509115, + "grad_norm": 14.206266277834583, + "learning_rate": 3.6091428225480433e-07, + "logits/chosen": 0.6777101755142212, + "logits/rejected": 0.7591882944107056, + "logps/chosen": -8.996365547180176, + "logps/rejected": -10.051039695739746, + "loss": 0.4775, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.996365547180176, + "rewards/margins": 1.0546749830245972, + "rewards/rejected": -10.051039695739746, + "semantic_entropy": 0.0023488677106797695, + "step": 3535 + }, + { + "epoch": 1.8946312092323132, + "grad_norm": 24.582046979640456, + "learning_rate": 3.5941902515119674e-07, + "logits/chosen": 0.6657333374023438, + "logits/rejected": 0.7678895592689514, + "logps/chosen": -9.090019226074219, + "logps/rejected": -9.867273330688477, + "loss": 0.5413, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -9.090019226074219, + "rewards/margins": 0.7772535085678101, + "rewards/rejected": -9.867273330688477, + "semantic_entropy": 0.0021225649397820234, + "step": 3540 + }, + { + "epoch": 1.8973072420137147, + "grad_norm": 22.002900126766963, + "learning_rate": 3.5792513196482373e-07, + "logits/chosen": 0.6315397620201111, + "logits/rejected": 0.7467874884605408, + "logps/chosen": -8.85982894897461, + "logps/rejected": -9.882316589355469, + "loss": 0.4531, + "rewards/accuracies": 0.8125, + "rewards/chosen": -8.85982894897461, + "rewards/margins": 1.0224884748458862, + "rewards/rejected": -9.882316589355469, + "semantic_entropy": 0.0025396724231541157, + "step": 3545 + }, + { + "epoch": 1.8999832747951162, + "grad_norm": 18.88844359088285, + "learning_rate": 3.5643261718944346e-07, + "logits/chosen": 0.7221022844314575, + "logits/rejected": 0.779187798500061, + "logps/chosen": -9.089310646057129, + "logps/rejected": -9.891887664794922, + "loss": 0.5786, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -9.089310646057129, + "rewards/margins": 0.802577018737793, + "rewards/rejected": -9.891887664794922, + "semantic_entropy": 0.002080023754388094, + "step": 3550 + }, + { + "epoch": 1.902659307576518, + "grad_norm": 14.948819514875911, + "learning_rate": 3.5494149530544087e-07, + "logits/chosen": 0.6752597093582153, + "logits/rejected": 0.7371557354927063, + "logps/chosen": -8.859451293945312, + "logps/rejected": -9.911179542541504, + "loss": 0.483, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.859451293945312, + "rewards/margins": 1.051727294921875, + "rewards/rejected": -9.911179542541504, + "semantic_entropy": 0.0026133800856769085, + "step": 3555 + }, + { + "epoch": 1.9053353403579194, + "grad_norm": 24.637227593137656, + "learning_rate": 3.534517807796871e-07, + "logits/chosen": 0.6935003995895386, + "logits/rejected": 0.7413294315338135, + "logps/chosen": -8.910869598388672, + "logps/rejected": -9.782427787780762, + "loss": 0.5241, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -8.910869598388672, + "rewards/margins": 0.8715595006942749, + "rewards/rejected": -9.782427787780762, + "semantic_entropy": 0.002564162714406848, + "step": 3560 + }, + { + "epoch": 1.908011373139321, + "grad_norm": 16.09008068793547, + "learning_rate": 3.519634880653988e-07, + "logits/chosen": 0.7049506902694702, + "logits/rejected": 0.7636333703994751, + "logps/chosen": -9.050897598266602, + "logps/rejected": -10.201239585876465, + "loss": 0.4495, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -9.050897598266602, + "rewards/margins": 1.150342583656311, + "rewards/rejected": -10.201239585876465, + "semantic_entropy": 0.0020480218809098005, + "step": 3565 + }, + { + "epoch": 1.9106874059207226, + "grad_norm": 17.29460063325407, + "learning_rate": 3.504766316019987e-07, + "logits/chosen": 0.6761201024055481, + "logits/rejected": 0.7888168096542358, + "logps/chosen": -8.835293769836426, + "logps/rejected": -9.917633056640625, + "loss": 0.454, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -8.835293769836426, + "rewards/margins": 1.0823395252227783, + "rewards/rejected": -9.917633056640625, + "semantic_entropy": 0.002739850664511323, + "step": 3570 + }, + { + "epoch": 1.913363438702124, + "grad_norm": 15.951273378502073, + "learning_rate": 3.489912258149745e-07, + "logits/chosen": 0.7415227293968201, + "logits/rejected": 0.8035387992858887, + "logps/chosen": -8.881102561950684, + "logps/rejected": -9.969578742980957, + "loss": 0.4531, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -8.881102561950684, + "rewards/margins": 1.0884764194488525, + "rewards/rejected": -9.969578742980957, + "semantic_entropy": 0.0023522416595369577, + "step": 3575 + }, + { + "epoch": 1.9160394714835256, + "grad_norm": 18.18795560922933, + "learning_rate": 3.475072851157397e-07, + "logits/chosen": 0.7050553560256958, + "logits/rejected": 0.7514214515686035, + "logps/chosen": -8.872556686401367, + "logps/rejected": -9.893532752990723, + "loss": 0.4645, + "rewards/accuracies": 0.78125, + "rewards/chosen": -8.872556686401367, + "rewards/margins": 1.0209757089614868, + "rewards/rejected": -9.893532752990723, + "semantic_entropy": 0.0025408435612916946, + "step": 3580 + }, + { + "epoch": 1.9187155042649273, + "grad_norm": 15.738805690279383, + "learning_rate": 3.460248239014936e-07, + "logits/chosen": 0.7101159691810608, + "logits/rejected": 0.7551933526992798, + "logps/chosen": -9.055107116699219, + "logps/rejected": -10.217333793640137, + "loss": 0.4421, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -9.055107116699219, + "rewards/margins": 1.1622273921966553, + "rewards/rejected": -10.217333793640137, + "semantic_entropy": 0.002511825645342469, + "step": 3585 + }, + { + "epoch": 1.9213915370463288, + "grad_norm": 19.41964183644154, + "learning_rate": 3.4454385655508134e-07, + "logits/chosen": 0.7462642788887024, + "logits/rejected": 0.7571308016777039, + "logps/chosen": -9.074853897094727, + "logps/rejected": -9.919515609741211, + "loss": 0.5562, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -9.074853897094727, + "rewards/margins": 0.8446613550186157, + "rewards/rejected": -9.919515609741211, + "semantic_entropy": 0.002418497810140252, + "step": 3590 + }, + { + "epoch": 1.9240675698277303, + "grad_norm": 15.49567951743004, + "learning_rate": 3.4306439744485447e-07, + "logits/chosen": 0.6995843052864075, + "logits/rejected": 0.7880675792694092, + "logps/chosen": -9.252038955688477, + "logps/rejected": -10.221213340759277, + "loss": 0.5099, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -9.252038955688477, + "rewards/margins": 0.969176173210144, + "rewards/rejected": -10.221213340759277, + "semantic_entropy": 0.0019108497072011232, + "step": 3595 + }, + { + "epoch": 1.926743602609132, + "grad_norm": 21.391951740269207, + "learning_rate": 3.415864609245322e-07, + "logits/chosen": 0.7241548895835876, + "logits/rejected": 0.7919793725013733, + "logps/chosen": -9.234020233154297, + "logps/rejected": -10.174389839172363, + "loss": 0.5311, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -9.234020233154297, + "rewards/margins": 0.9403679966926575, + "rewards/rejected": -10.174389839172363, + "semantic_entropy": 0.0019412841647863388, + "step": 3600 + }, + { + "epoch": 1.926743602609132, + "eval_logits/chosen": 0.7883932590484619, + "eval_logits/rejected": 0.8341716527938843, + "eval_logps/chosen": -9.098273277282715, + "eval_logps/rejected": -10.07473087310791, + "eval_loss": 0.5169808268547058, + "eval_rewards/accuracies": 0.7232937812805176, + "eval_rewards/chosen": -9.098273277282715, + "eval_rewards/margins": 0.9764575362205505, + "eval_rewards/rejected": -10.07473087310791, + "eval_runtime": 35.2413, + "eval_samples_per_second": 38.165, + "eval_semantic_entropy": 0.0023804251104593277, + "eval_steps_per_second": 9.563, + "step": 3600 + }, + { + "epoch": 1.9294196353905335, + "grad_norm": 20.553397352111812, + "learning_rate": 3.401100613330605e-07, + "logits/chosen": 0.7208374738693237, + "logits/rejected": 0.7372707724571228, + "logps/chosen": -8.946023941040039, + "logps/rejected": -9.88565731048584, + "loss": 0.5125, + "rewards/accuracies": 0.71875, + "rewards/chosen": -8.946023941040039, + "rewards/margins": 0.9396332502365112, + "rewards/rejected": -9.88565731048584, + "semantic_entropy": 0.0025065175723284483, + "step": 3605 + }, + { + "epoch": 1.932095668171935, + "grad_norm": 14.899765335539588, + "learning_rate": 3.3863521299447514e-07, + "logits/chosen": 0.66487056016922, + "logits/rejected": 0.7429142594337463, + "logps/chosen": -8.882848739624023, + "logps/rejected": -9.959342956542969, + "loss": 0.4247, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -8.882848739624023, + "rewards/margins": 1.0764933824539185, + "rewards/rejected": -9.959342956542969, + "semantic_entropy": 0.0027935917023569345, + "step": 3610 + }, + { + "epoch": 1.9347717009533367, + "grad_norm": 18.125423863526265, + "learning_rate": 3.371619302177609e-07, + "logits/chosen": 0.7205886840820312, + "logits/rejected": 0.783849835395813, + "logps/chosen": -9.081689834594727, + "logps/rejected": -10.110027313232422, + "loss": 0.493, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -9.081689834594727, + "rewards/margins": 1.0283381938934326, + "rewards/rejected": -10.110027313232422, + "semantic_entropy": 0.0018970107194036245, + "step": 3615 + }, + { + "epoch": 1.9374477337347382, + "grad_norm": 22.214798729957145, + "learning_rate": 3.3569022729671393e-07, + "logits/chosen": 0.7102506160736084, + "logits/rejected": 0.7653765678405762, + "logps/chosen": -9.172645568847656, + "logps/rejected": -10.060796737670898, + "loss": 0.5162, + "rewards/accuracies": 0.71875, + "rewards/chosen": -9.172645568847656, + "rewards/margins": 0.8881510496139526, + "rewards/rejected": -10.060796737670898, + "semantic_entropy": 0.002231413032859564, + "step": 3620 + }, + { + "epoch": 1.9401237665161397, + "grad_norm": 16.808694866109835, + "learning_rate": 3.342201185098024e-07, + "logits/chosen": 0.7016305923461914, + "logits/rejected": 0.7090336084365845, + "logps/chosen": -8.874353408813477, + "logps/rejected": -9.903203010559082, + "loss": 0.4595, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -8.874353408813477, + "rewards/margins": 1.028850793838501, + "rewards/rejected": -9.903203010559082, + "semantic_entropy": 0.0032983936835080385, + "step": 3625 + }, + { + "epoch": 1.9427997992975414, + "grad_norm": 19.764545534044544, + "learning_rate": 3.3275161812002807e-07, + "logits/chosen": 0.6539800763130188, + "logits/rejected": 0.6890634894371033, + "logps/chosen": -8.991025924682617, + "logps/rejected": -10.090238571166992, + "loss": 0.4918, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.991025924682617, + "rewards/margins": 1.0992109775543213, + "rewards/rejected": -10.090238571166992, + "semantic_entropy": 0.002287736628204584, + "step": 3630 + }, + { + "epoch": 1.945475832078943, + "grad_norm": 22.455179400576377, + "learning_rate": 3.312847403747883e-07, + "logits/chosen": 0.6598862409591675, + "logits/rejected": 0.7323023676872253, + "logps/chosen": -8.853995323181152, + "logps/rejected": -10.00835132598877, + "loss": 0.4441, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -8.853995323181152, + "rewards/margins": 1.1543556451797485, + "rewards/rejected": -10.00835132598877, + "semantic_entropy": 0.0028118849731981754, + "step": 3635 + }, + { + "epoch": 1.9481518648603444, + "grad_norm": 20.333660341959458, + "learning_rate": 3.2981949950573733e-07, + "logits/chosen": 0.6398320198059082, + "logits/rejected": 0.6992667317390442, + "logps/chosen": -9.079477310180664, + "logps/rejected": -10.012245178222656, + "loss": 0.4811, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -9.079477310180664, + "rewards/margins": 0.9327686429023743, + "rewards/rejected": -10.012245178222656, + "semantic_entropy": 0.002612376119941473, + "step": 3640 + }, + { + "epoch": 1.9508278976417461, + "grad_norm": 17.83037025874042, + "learning_rate": 3.283559097286486e-07, + "logits/chosen": 0.6089428663253784, + "logits/rejected": 0.6763657331466675, + "logps/chosen": -9.060758590698242, + "logps/rejected": -9.848979949951172, + "loss": 0.5207, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -9.060758590698242, + "rewards/margins": 0.7882214188575745, + "rewards/rejected": -9.848979949951172, + "semantic_entropy": 0.002590155927464366, + "step": 3645 + }, + { + "epoch": 1.9535039304231478, + "grad_norm": 18.640006592880273, + "learning_rate": 3.268939852432765e-07, + "logits/chosen": 0.6610291600227356, + "logits/rejected": 0.6989740133285522, + "logps/chosen": -9.217550277709961, + "logps/rejected": -9.963292121887207, + "loss": 0.5338, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -9.217550277709961, + "rewards/margins": 0.7457407712936401, + "rewards/rejected": -9.963292121887207, + "semantic_entropy": 0.002440792042762041, + "step": 3650 + }, + { + "epoch": 1.9561799632045491, + "grad_norm": 23.51490179151936, + "learning_rate": 3.254337402332187e-07, + "logits/chosen": 0.7316364049911499, + "logits/rejected": 0.7886163592338562, + "logps/chosen": -9.14958381652832, + "logps/rejected": -10.033146858215332, + "loss": 0.5228, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -9.14958381652832, + "rewards/margins": 0.8835636377334595, + "rewards/rejected": -10.033146858215332, + "semantic_entropy": 0.0022072389256209135, + "step": 3655 + }, + { + "epoch": 1.9588559959859508, + "grad_norm": 21.709681200914343, + "learning_rate": 3.239751888657788e-07, + "logits/chosen": 0.7047960162162781, + "logits/rejected": 0.7564027309417725, + "logps/chosen": -9.247949600219727, + "logps/rejected": -10.113713264465332, + "loss": 0.5189, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -9.247949600219727, + "rewards/margins": 0.8657627105712891, + "rewards/rejected": -10.113713264465332, + "semantic_entropy": 0.002100490964949131, + "step": 3660 + }, + { + "epoch": 1.9615320287673526, + "grad_norm": 20.917519916910795, + "learning_rate": 3.2251834529182856e-07, + "logits/chosen": 0.6687744855880737, + "logits/rejected": 0.7209922671318054, + "logps/chosen": -8.913464546203613, + "logps/rejected": -10.03473949432373, + "loss": 0.4826, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -8.913464546203613, + "rewards/margins": 1.1212753057479858, + "rewards/rejected": -10.03473949432373, + "semantic_entropy": 0.0026697556022554636, + "step": 3665 + }, + { + "epoch": 1.9642080615487538, + "grad_norm": 24.85961354517434, + "learning_rate": 3.2106322364567075e-07, + "logits/chosen": 0.7192034721374512, + "logits/rejected": 0.776824951171875, + "logps/chosen": -8.97942066192627, + "logps/rejected": -10.088353157043457, + "loss": 0.4491, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -8.97942066192627, + "rewards/margins": 1.1089332103729248, + "rewards/rejected": -10.088353157043457, + "semantic_entropy": 0.002531954552978277, + "step": 3670 + }, + { + "epoch": 1.9668840943301555, + "grad_norm": 18.73523000620898, + "learning_rate": 3.1960983804490183e-07, + "logits/chosen": 0.6787633895874023, + "logits/rejected": 0.7581242322921753, + "logps/chosen": -9.208142280578613, + "logps/rejected": -10.221251487731934, + "loss": 0.5371, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -9.208142280578613, + "rewards/margins": 1.0131086111068726, + "rewards/rejected": -10.221251487731934, + "semantic_entropy": 0.0021549214143306017, + "step": 3675 + }, + { + "epoch": 1.9695601271115573, + "grad_norm": 16.839870791359772, + "learning_rate": 3.1815820259027537e-07, + "logits/chosen": 0.6967512369155884, + "logits/rejected": 0.7626298666000366, + "logps/chosen": -8.988113403320312, + "logps/rejected": -10.06352710723877, + "loss": 0.439, + "rewards/accuracies": 0.78125, + "rewards/chosen": -8.988113403320312, + "rewards/margins": 1.0754133462905884, + "rewards/rejected": -10.06352710723877, + "semantic_entropy": 0.002239447785541415, + "step": 3680 + }, + { + "epoch": 1.9722361598929585, + "grad_norm": 25.890088694071558, + "learning_rate": 3.16708331365565e-07, + "logits/chosen": 0.7034773826599121, + "logits/rejected": 0.7456248998641968, + "logps/chosen": -9.344053268432617, + "logps/rejected": -10.413859367370605, + "loss": 0.4768, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -9.344053268432617, + "rewards/margins": 1.0698063373565674, + "rewards/rejected": -10.413859367370605, + "semantic_entropy": 0.0019446806982159615, + "step": 3685 + }, + { + "epoch": 1.9749121926743602, + "grad_norm": 20.622779505918793, + "learning_rate": 3.152602384374275e-07, + "logits/chosen": 0.77290278673172, + "logits/rejected": 0.8412041664123535, + "logps/chosen": -9.321812629699707, + "logps/rejected": -10.298714637756348, + "loss": 0.4964, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.321812629699707, + "rewards/margins": 0.9769018292427063, + "rewards/rejected": -10.298714637756348, + "semantic_entropy": 0.0020173420198261738, + "step": 3690 + }, + { + "epoch": 1.977588225455762, + "grad_norm": 20.085568144464975, + "learning_rate": 3.1381393785526697e-07, + "logits/chosen": 0.7355653643608093, + "logits/rejected": 0.7890772819519043, + "logps/chosen": -9.242449760437012, + "logps/rejected": -10.24592399597168, + "loss": 0.476, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -9.242449760437012, + "rewards/margins": 1.0034732818603516, + "rewards/rejected": -10.24592399597168, + "semantic_entropy": 0.0019115330651402473, + "step": 3695 + }, + { + "epoch": 1.9802642582371635, + "grad_norm": 19.738688834879806, + "learning_rate": 3.123694436510979e-07, + "logits/chosen": 0.7584089040756226, + "logits/rejected": 0.8401368260383606, + "logps/chosen": -9.147109985351562, + "logps/rejected": -10.11845874786377, + "loss": 0.4871, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -9.147109985351562, + "rewards/margins": 0.9713494181632996, + "rewards/rejected": -10.11845874786377, + "semantic_entropy": 0.002204468008130789, + "step": 3700 + }, + { + "epoch": 1.982940291018565, + "grad_norm": 23.899312142624215, + "learning_rate": 3.1092676983940946e-07, + "logits/chosen": 0.8023883700370789, + "logits/rejected": 0.8330841064453125, + "logps/chosen": -9.159601211547852, + "logps/rejected": -10.233617782592773, + "loss": 0.4659, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -9.159601211547852, + "rewards/margins": 1.0740149021148682, + "rewards/rejected": -10.233617782592773, + "semantic_entropy": 0.0021205353550612926, + "step": 3705 + }, + { + "epoch": 1.9856163237999667, + "grad_norm": 19.69531748298203, + "learning_rate": 3.094859304170293e-07, + "logits/chosen": 0.8703521490097046, + "logits/rejected": 0.9046932458877563, + "logps/chosen": -9.088752746582031, + "logps/rejected": -10.052389144897461, + "loss": 0.514, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -9.088752746582031, + "rewards/margins": 0.9636358022689819, + "rewards/rejected": -10.052389144897461, + "semantic_entropy": 0.0021650404669344425, + "step": 3710 + }, + { + "epoch": 1.9882923565813682, + "grad_norm": 16.58014663580926, + "learning_rate": 3.0804693936298795e-07, + "logits/chosen": 0.8061652183532715, + "logits/rejected": 0.8440017700195312, + "logps/chosen": -9.173129081726074, + "logps/rejected": -10.378541946411133, + "loss": 0.4549, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -9.173129081726074, + "rewards/margins": 1.2054128646850586, + "rewards/rejected": -10.378541946411133, + "semantic_entropy": 0.002410900080576539, + "step": 3715 + }, + { + "epoch": 1.9909683893627697, + "grad_norm": 19.2115007374387, + "learning_rate": 3.066098106383826e-07, + "logits/chosen": 0.7924807071685791, + "logits/rejected": 0.8539689183235168, + "logps/chosen": -9.085798263549805, + "logps/rejected": -10.019353866577148, + "loss": 0.4867, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -9.085798263549805, + "rewards/margins": 0.9335559010505676, + "rewards/rejected": -10.019353866577148, + "semantic_entropy": 0.0022169214207679033, + "step": 3720 + }, + { + "epoch": 1.9936444221441714, + "grad_norm": 15.73085596537741, + "learning_rate": 3.0517455818624263e-07, + "logits/chosen": 0.728915810585022, + "logits/rejected": 0.7807096838951111, + "logps/chosen": -9.136758804321289, + "logps/rejected": -10.273028373718262, + "loss": 0.4235, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -9.136758804321289, + "rewards/margins": 1.1362701654434204, + "rewards/rejected": -10.273028373718262, + "semantic_entropy": 0.0019926291424781084, + "step": 3725 + }, + { + "epoch": 1.9963204549255729, + "grad_norm": 21.48912266471965, + "learning_rate": 3.037411959313936e-07, + "logits/chosen": 0.8049052357673645, + "logits/rejected": 0.8560088872909546, + "logps/chosen": -9.200352668762207, + "logps/rejected": -10.1701078414917, + "loss": 0.4887, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -9.200352668762207, + "rewards/margins": 0.9697545766830444, + "rewards/rejected": -10.1701078414917, + "semantic_entropy": 0.0019432473927736282, + "step": 3730 + }, + { + "epoch": 1.9989964877069744, + "grad_norm": 29.255894045857996, + "learning_rate": 3.023097377803224e-07, + "logits/chosen": 0.8145462870597839, + "logits/rejected": 0.8601492047309875, + "logps/chosen": -9.245951652526855, + "logps/rejected": -10.164546012878418, + "loss": 0.5493, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -9.245951652526855, + "rewards/margins": 0.9185951352119446, + "rewards/rejected": -10.164546012878418, + "semantic_entropy": 0.0018242119112983346, + "step": 3735 + }, + { + "epoch": 2.001672520488376, + "grad_norm": 20.524507990126196, + "learning_rate": 3.008801976210423e-07, + "logits/chosen": 0.8181111216545105, + "logits/rejected": 0.8561455607414246, + "logps/chosen": -9.23983383178711, + "logps/rejected": -10.13349723815918, + "loss": 0.4809, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -9.23983383178711, + "rewards/margins": 0.8936625719070435, + "rewards/rejected": -10.13349723815918, + "semantic_entropy": 0.002180408453568816, + "step": 3740 + }, + { + "epoch": 2.0043485532697773, + "grad_norm": 17.0373176165906, + "learning_rate": 2.994525893229581e-07, + "logits/chosen": 0.8141145706176758, + "logits/rejected": 0.8517176508903503, + "logps/chosen": -9.19636344909668, + "logps/rejected": -10.442273139953613, + "loss": 0.3877, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -9.19636344909668, + "rewards/margins": 1.2459100484848022, + "rewards/rejected": -10.442273139953613, + "semantic_entropy": 0.002024973975494504, + "step": 3745 + }, + { + "epoch": 2.007024586051179, + "grad_norm": 15.697982397516139, + "learning_rate": 2.98026926736732e-07, + "logits/chosen": 0.7669566869735718, + "logits/rejected": 0.811779797077179, + "logps/chosen": -8.987443923950195, + "logps/rejected": -10.217406272888184, + "loss": 0.4174, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -8.987443923950195, + "rewards/margins": 1.2299631834030151, + "rewards/rejected": -10.217406272888184, + "semantic_entropy": 0.0029276900459080935, + "step": 3750 + }, + { + "epoch": 2.0097006188325808, + "grad_norm": 14.632163572843812, + "learning_rate": 2.9660322369414846e-07, + "logits/chosen": 0.8088932037353516, + "logits/rejected": 0.8847886323928833, + "logps/chosen": -9.229738235473633, + "logps/rejected": -10.462305068969727, + "loss": 0.4008, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.229738235473633, + "rewards/margins": 1.2325657606124878, + "rewards/rejected": -10.462305068969727, + "semantic_entropy": 0.0019744504243135452, + "step": 3755 + }, + { + "epoch": 2.0123766516139825, + "grad_norm": 13.938257971384212, + "learning_rate": 2.9518149400798063e-07, + "logits/chosen": 0.7997492551803589, + "logits/rejected": 0.8458935022354126, + "logps/chosen": -9.3661470413208, + "logps/rejected": -10.73242473602295, + "loss": 0.4011, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.3661470413208, + "rewards/margins": 1.3662781715393066, + "rewards/rejected": -10.73242473602295, + "semantic_entropy": 0.0021405040752142668, + "step": 3760 + }, + { + "epoch": 2.0150526843953838, + "grad_norm": 21.92333227216617, + "learning_rate": 2.9376175147185633e-07, + "logits/chosen": 0.7903780937194824, + "logits/rejected": 0.8987275958061218, + "logps/chosen": -9.55534553527832, + "logps/rejected": -10.687799453735352, + "loss": 0.4631, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.55534553527832, + "rewards/margins": 1.132454514503479, + "rewards/rejected": -10.687799453735352, + "semantic_entropy": 0.0018273256719112396, + "step": 3765 + }, + { + "epoch": 2.0177287171767855, + "grad_norm": 21.21705630664952, + "learning_rate": 2.9234400986012376e-07, + "logits/chosen": 0.7476860284805298, + "logits/rejected": 0.8315266370773315, + "logps/chosen": -9.206350326538086, + "logps/rejected": -10.599584579467773, + "loss": 0.3865, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -9.206350326538086, + "rewards/margins": 1.3932336568832397, + "rewards/rejected": -10.599584579467773, + "semantic_entropy": 0.002457220805808902, + "step": 3770 + }, + { + "epoch": 2.020404749958187, + "grad_norm": 23.717702676896593, + "learning_rate": 2.9092828292771817e-07, + "logits/chosen": 0.8404645919799805, + "logits/rejected": 0.8653911352157593, + "logps/chosen": -9.488239288330078, + "logps/rejected": -10.67754077911377, + "loss": 0.4216, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -9.488239288330078, + "rewards/margins": 1.1893017292022705, + "rewards/rejected": -10.67754077911377, + "semantic_entropy": 0.001736976788379252, + "step": 3775 + }, + { + "epoch": 2.0230807827395885, + "grad_norm": 18.026440727909602, + "learning_rate": 2.8951458441002875e-07, + "logits/chosen": 0.7588644027709961, + "logits/rejected": 0.804205596446991, + "logps/chosen": -9.2258882522583, + "logps/rejected": -10.435041427612305, + "loss": 0.4317, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -9.2258882522583, + "rewards/margins": 1.2091554403305054, + "rewards/rejected": -10.435041427612305, + "semantic_entropy": 0.0019264190923422575, + "step": 3780 + }, + { + "epoch": 2.02575681552099, + "grad_norm": 17.35712096542329, + "learning_rate": 2.881029280227643e-07, + "logits/chosen": 0.7276099324226379, + "logits/rejected": 0.8241073489189148, + "logps/chosen": -9.210673332214355, + "logps/rejected": -10.420351028442383, + "loss": 0.443, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.210673332214355, + "rewards/margins": 1.2096776962280273, + "rewards/rejected": -10.420351028442383, + "semantic_entropy": 0.0021508794743567705, + "step": 3785 + }, + { + "epoch": 2.028432848302392, + "grad_norm": 13.520740192725968, + "learning_rate": 2.8669332746182177e-07, + "logits/chosen": 0.6945358514785767, + "logits/rejected": 0.7733569741249084, + "logps/chosen": -9.187652587890625, + "logps/rejected": -10.517842292785645, + "loss": 0.3952, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -9.187652587890625, + "rewards/margins": 1.3301887512207031, + "rewards/rejected": -10.517842292785645, + "semantic_entropy": 0.0021430773194879293, + "step": 3790 + }, + { + "epoch": 2.031108881083793, + "grad_norm": 20.76831890859162, + "learning_rate": 2.8528579640315156e-07, + "logits/chosen": 0.7405019998550415, + "logits/rejected": 0.7900283336639404, + "logps/chosen": -9.107033729553223, + "logps/rejected": -10.247450828552246, + "loss": 0.4329, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -9.107033729553223, + "rewards/margins": 1.140415906906128, + "rewards/rejected": -10.247450828552246, + "semantic_entropy": 0.0022071374114602804, + "step": 3795 + }, + { + "epoch": 2.033784913865195, + "grad_norm": 20.516974992601128, + "learning_rate": 2.8388034850262646e-07, + "logits/chosen": 0.7259657382965088, + "logits/rejected": 0.807550311088562, + "logps/chosen": -9.008337020874023, + "logps/rejected": -10.242483139038086, + "loss": 0.4152, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -9.008337020874023, + "rewards/margins": 1.2341454029083252, + "rewards/rejected": -10.242483139038086, + "semantic_entropy": 0.0025359108112752438, + "step": 3800 + }, + { + "epoch": 2.0364609466465966, + "grad_norm": 24.572088168328634, + "learning_rate": 2.824769973959079e-07, + "logits/chosen": 0.7538091540336609, + "logits/rejected": 0.8418930172920227, + "logps/chosen": -9.286577224731445, + "logps/rejected": -10.427810668945312, + "loss": 0.4257, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.286577224731445, + "rewards/margins": 1.1412330865859985, + "rewards/rejected": -10.427810668945312, + "semantic_entropy": 0.0017364490777254105, + "step": 3805 + }, + { + "epoch": 2.039136979427998, + "grad_norm": 18.5404432038831, + "learning_rate": 2.81075756698315e-07, + "logits/chosen": 0.7726496458053589, + "logits/rejected": 0.855174720287323, + "logps/chosen": -9.180562019348145, + "logps/rejected": -10.476685523986816, + "loss": 0.3802, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -9.180562019348145, + "rewards/margins": 1.29612398147583, + "rewards/rejected": -10.476685523986816, + "semantic_entropy": 0.0018982533365488052, + "step": 3810 + }, + { + "epoch": 2.0418130122093996, + "grad_norm": 18.915627570533157, + "learning_rate": 2.7967664000469035e-07, + "logits/chosen": 0.721420407295227, + "logits/rejected": 0.7838973999023438, + "logps/chosen": -9.233001708984375, + "logps/rejected": -10.553727149963379, + "loss": 0.3627, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -9.233001708984375, + "rewards/margins": 1.320725679397583, + "rewards/rejected": -10.553727149963379, + "semantic_entropy": 0.0020444700494408607, + "step": 3815 + }, + { + "epoch": 2.0444890449908013, + "grad_norm": 18.14358020288826, + "learning_rate": 2.7827966088927095e-07, + "logits/chosen": 0.6938169598579407, + "logits/rejected": 0.8017823100090027, + "logps/chosen": -9.465526580810547, + "logps/rejected": -10.78095817565918, + "loss": 0.3999, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.465526580810547, + "rewards/margins": 1.315431833267212, + "rewards/rejected": -10.78095817565918, + "semantic_entropy": 0.001522608334198594, + "step": 3820 + }, + { + "epoch": 2.0471650777722026, + "grad_norm": 17.57535140312262, + "learning_rate": 2.768848329055538e-07, + "logits/chosen": 0.7879313230514526, + "logits/rejected": 0.8248831629753113, + "logps/chosen": -9.305051803588867, + "logps/rejected": -10.57546615600586, + "loss": 0.3852, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -9.305051803588867, + "rewards/margins": 1.2704143524169922, + "rewards/rejected": -10.57546615600586, + "semantic_entropy": 0.0018570246174931526, + "step": 3825 + }, + { + "epoch": 2.0498411105536043, + "grad_norm": 20.119447880874766, + "learning_rate": 2.7549216958616657e-07, + "logits/chosen": 0.7350586652755737, + "logits/rejected": 0.8105939030647278, + "logps/chosen": -9.496885299682617, + "logps/rejected": -10.835896492004395, + "loss": 0.3968, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -9.496885299682617, + "rewards/margins": 1.33901047706604, + "rewards/rejected": -10.835896492004395, + "semantic_entropy": 0.0016353337559849024, + "step": 3830 + }, + { + "epoch": 2.052517143335006, + "grad_norm": 15.071984684362736, + "learning_rate": 2.741016844427344e-07, + "logits/chosen": 0.7667199969291687, + "logits/rejected": 0.8489478826522827, + "logps/chosen": -9.403945922851562, + "logps/rejected": -10.778000831604004, + "loss": 0.3713, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -9.403945922851562, + "rewards/margins": 1.3740556240081787, + "rewards/rejected": -10.778000831604004, + "semantic_entropy": 0.001894004992209375, + "step": 3835 + }, + { + "epoch": 2.0551931761164073, + "grad_norm": 17.801746580163428, + "learning_rate": 2.7271339096575073e-07, + "logits/chosen": 0.7659896612167358, + "logits/rejected": 0.8438106775283813, + "logps/chosen": -9.306072235107422, + "logps/rejected": -10.509511947631836, + "loss": 0.4364, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -9.306072235107422, + "rewards/margins": 1.2034391164779663, + "rewards/rejected": -10.509511947631836, + "semantic_entropy": 0.00206328509375453, + "step": 3840 + }, + { + "epoch": 2.057869208897809, + "grad_norm": 16.02571773917183, + "learning_rate": 2.713273026244446e-07, + "logits/chosen": 0.7731425166130066, + "logits/rejected": 0.860715389251709, + "logps/chosen": -9.523012161254883, + "logps/rejected": -10.883203506469727, + "loss": 0.3804, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -9.523012161254883, + "rewards/margins": 1.3601921796798706, + "rewards/rejected": -10.883203506469727, + "semantic_entropy": 0.001517820986919105, + "step": 3845 + }, + { + "epoch": 2.0605452416792107, + "grad_norm": 17.59552798096768, + "learning_rate": 2.6994343286665156e-07, + "logits/chosen": 0.7494341731071472, + "logits/rejected": 0.8325299024581909, + "logps/chosen": -9.523847579956055, + "logps/rejected": -10.638973236083984, + "loss": 0.4481, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -9.523847579956055, + "rewards/margins": 1.1151244640350342, + "rewards/rejected": -10.638973236083984, + "semantic_entropy": 0.0015663004014641047, + "step": 3850 + }, + { + "epoch": 2.063221274460612, + "grad_norm": 21.242971901873577, + "learning_rate": 2.6856179511868156e-07, + "logits/chosen": 0.7531827092170715, + "logits/rejected": 0.8257854580879211, + "logps/chosen": -9.41790771484375, + "logps/rejected": -10.846906661987305, + "loss": 0.428, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -9.41790771484375, + "rewards/margins": 1.428999423980713, + "rewards/rejected": -10.846906661987305, + "semantic_entropy": 0.001963115995749831, + "step": 3855 + }, + { + "epoch": 2.0658973072420137, + "grad_norm": 23.548898424244427, + "learning_rate": 2.6718240278519056e-07, + "logits/chosen": 0.7559301853179932, + "logits/rejected": 0.8003666996955872, + "logps/chosen": -9.45875072479248, + "logps/rejected": -10.853038787841797, + "loss": 0.4169, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.45875072479248, + "rewards/margins": 1.3942878246307373, + "rewards/rejected": -10.853038787841797, + "semantic_entropy": 0.0015923971077427268, + "step": 3860 + }, + { + "epoch": 2.0685733400234154, + "grad_norm": 20.46304047002647, + "learning_rate": 2.6580526924904866e-07, + "logits/chosen": 0.6976224780082703, + "logits/rejected": 0.7671376466751099, + "logps/chosen": -9.384844779968262, + "logps/rejected": -10.741189956665039, + "loss": 0.3793, + "rewards/accuracies": 0.84375, + "rewards/chosen": -9.384844779968262, + "rewards/margins": 1.3563454151153564, + "rewards/rejected": -10.741189956665039, + "semantic_entropy": 0.001644113683141768, + "step": 3865 + }, + { + "epoch": 2.0712493728048167, + "grad_norm": 24.88011754633229, + "learning_rate": 2.6443040787121186e-07, + "logits/chosen": 0.6563600301742554, + "logits/rejected": 0.6912602782249451, + "logps/chosen": -9.349275588989258, + "logps/rejected": -10.556272506713867, + "loss": 0.4195, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -9.349275588989258, + "rewards/margins": 1.206997036933899, + "rewards/rejected": -10.556272506713867, + "semantic_entropy": 0.0016969643766060472, + "step": 3870 + }, + { + "epoch": 2.0739254055862184, + "grad_norm": 24.129234255509385, + "learning_rate": 2.6305783199059084e-07, + "logits/chosen": 0.7805946469306946, + "logits/rejected": 0.8489789962768555, + "logps/chosen": -9.523481369018555, + "logps/rejected": -10.783955574035645, + "loss": 0.4536, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -9.523481369018555, + "rewards/margins": 1.2604728937149048, + "rewards/rejected": -10.783955574035645, + "semantic_entropy": 0.001705177710391581, + "step": 3875 + }, + { + "epoch": 2.07660143836762, + "grad_norm": 20.770855194305607, + "learning_rate": 2.6168755492392324e-07, + "logits/chosen": 0.7947415113449097, + "logits/rejected": 0.8732272982597351, + "logps/chosen": -9.23397159576416, + "logps/rejected": -10.698512077331543, + "loss": 0.3445, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -9.23397159576416, + "rewards/margins": 1.4645414352416992, + "rewards/rejected": -10.698512077331543, + "semantic_entropy": 0.0018425941234454513, + "step": 3880 + }, + { + "epoch": 2.0792774711490214, + "grad_norm": 19.657773873554152, + "learning_rate": 2.6031958996564274e-07, + "logits/chosen": 0.7870718240737915, + "logits/rejected": 0.8257455825805664, + "logps/chosen": -9.316213607788086, + "logps/rejected": -10.864585876464844, + "loss": 0.3707, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -9.316213607788086, + "rewards/margins": 1.5483721494674683, + "rewards/rejected": -10.864585876464844, + "semantic_entropy": 0.001938262372277677, + "step": 3885 + }, + { + "epoch": 2.081953503930423, + "grad_norm": 29.398133134663638, + "learning_rate": 2.589539503877518e-07, + "logits/chosen": 0.7874363660812378, + "logits/rejected": 0.8331443667411804, + "logps/chosen": -9.478517532348633, + "logps/rejected": -10.828798294067383, + "loss": 0.4304, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.478517532348633, + "rewards/margins": 1.3502806425094604, + "rewards/rejected": -10.828798294067383, + "semantic_entropy": 0.0018109595403075218, + "step": 3890 + }, + { + "epoch": 2.084629536711825, + "grad_norm": 17.54106321474661, + "learning_rate": 2.5759064943969125e-07, + "logits/chosen": 0.7402059435844421, + "logits/rejected": 0.8292325735092163, + "logps/chosen": -9.550467491149902, + "logps/rejected": -10.937161445617676, + "loss": 0.3934, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.550467491149902, + "rewards/margins": 1.3866939544677734, + "rewards/rejected": -10.937161445617676, + "semantic_entropy": 0.0017161194700747728, + "step": 3895 + }, + { + "epoch": 2.087305569493226, + "grad_norm": 17.162272687965423, + "learning_rate": 2.562297003482131e-07, + "logits/chosen": 0.800572395324707, + "logits/rejected": 0.8477448225021362, + "logps/chosen": -9.5453462600708, + "logps/rejected": -10.9487943649292, + "loss": 0.3582, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -9.5453462600708, + "rewards/margins": 1.4034483432769775, + "rewards/rejected": -10.9487943649292, + "semantic_entropy": 0.0017864892724901438, + "step": 3900 + }, + { + "epoch": 2.089981602274628, + "grad_norm": 18.16669999802219, + "learning_rate": 2.548711163172512e-07, + "logits/chosen": 0.7785830497741699, + "logits/rejected": 0.8476356267929077, + "logps/chosen": -9.759795188903809, + "logps/rejected": -11.034205436706543, + "loss": 0.4168, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -9.759795188903809, + "rewards/margins": 1.2744102478027344, + "rewards/rejected": -11.034205436706543, + "semantic_entropy": 0.0020329877734184265, + "step": 3905 + }, + { + "epoch": 2.0926576350560295, + "grad_norm": 21.502409164431704, + "learning_rate": 2.53514910527794e-07, + "logits/chosen": 0.8269200325012207, + "logits/rejected": 0.8657910227775574, + "logps/chosen": -9.461301803588867, + "logps/rejected": -10.762018203735352, + "loss": 0.3961, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -9.461301803588867, + "rewards/margins": 1.300715446472168, + "rewards/rejected": -10.762018203735352, + "semantic_entropy": 0.0016499152407050133, + "step": 3910 + }, + { + "epoch": 2.095333667837431, + "grad_norm": 22.127771721745866, + "learning_rate": 2.5216109613775573e-07, + "logits/chosen": 0.7920838594436646, + "logits/rejected": 0.8535796403884888, + "logps/chosen": -9.840039253234863, + "logps/rejected": -11.020828247070312, + "loss": 0.4512, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.840039253234863, + "rewards/margins": 1.1807891130447388, + "rewards/rejected": -11.020828247070312, + "semantic_entropy": 0.0013382106553763151, + "step": 3915 + }, + { + "epoch": 2.0980097006188325, + "grad_norm": 21.03906625443135, + "learning_rate": 2.5080968628184993e-07, + "logits/chosen": 0.7727931141853333, + "logits/rejected": 0.8665952682495117, + "logps/chosen": -9.525362968444824, + "logps/rejected": -11.037253379821777, + "loss": 0.3669, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.525362968444824, + "rewards/margins": 1.5118907690048218, + "rewards/rejected": -11.037253379821777, + "semantic_entropy": 0.0015195768792182207, + "step": 3920 + }, + { + "epoch": 2.1006857334002342, + "grad_norm": 17.048956060445978, + "learning_rate": 2.494606940714605e-07, + "logits/chosen": 0.7970033884048462, + "logits/rejected": 0.8288514018058777, + "logps/chosen": -9.431905746459961, + "logps/rejected": -10.85214900970459, + "loss": 0.3827, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.431905746459961, + "rewards/margins": 1.4202440977096558, + "rewards/rejected": -10.85214900970459, + "semantic_entropy": 0.001813689828850329, + "step": 3925 + }, + { + "epoch": 2.103361766181636, + "grad_norm": 15.3880445844123, + "learning_rate": 2.4811413259451625e-07, + "logits/chosen": 0.7811511158943176, + "logits/rejected": 0.860381007194519, + "logps/chosen": -9.466936111450195, + "logps/rejected": -10.93088436126709, + "loss": 0.376, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -9.466936111450195, + "rewards/margins": 1.4639488458633423, + "rewards/rejected": -10.93088436126709, + "semantic_entropy": 0.0018378589302301407, + "step": 3930 + }, + { + "epoch": 2.106037798963037, + "grad_norm": 15.913978912005007, + "learning_rate": 2.46770014915362e-07, + "logits/chosen": 0.7653626203536987, + "logits/rejected": 0.8481870889663696, + "logps/chosen": -9.565362930297852, + "logps/rejected": -10.939167022705078, + "loss": 0.3977, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -9.565362930297852, + "rewards/margins": 1.3738042116165161, + "rewards/rejected": -10.939167022705078, + "semantic_entropy": 0.0015822149580344558, + "step": 3935 + }, + { + "epoch": 2.108713831744439, + "grad_norm": 27.05791308561908, + "learning_rate": 2.45428354074634e-07, + "logits/chosen": 0.7272459268569946, + "logits/rejected": 0.768274188041687, + "logps/chosen": -9.656865119934082, + "logps/rejected": -10.986165046691895, + "loss": 0.4415, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -9.656865119934082, + "rewards/margins": 1.3292994499206543, + "rewards/rejected": -10.986165046691895, + "semantic_entropy": 0.0016760114813223481, + "step": 3940 + }, + { + "epoch": 2.1113898645258407, + "grad_norm": 24.89803710573063, + "learning_rate": 2.4408916308913105e-07, + "logits/chosen": 0.7583307027816772, + "logits/rejected": 0.8247106671333313, + "logps/chosen": -9.691811561584473, + "logps/rejected": -10.66722297668457, + "loss": 0.4955, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -9.691811561584473, + "rewards/margins": 0.9754101634025574, + "rewards/rejected": -10.66722297668457, + "semantic_entropy": 0.0014091429766267538, + "step": 3945 + }, + { + "epoch": 2.114065897307242, + "grad_norm": 27.703608412459293, + "learning_rate": 2.4275245495169025e-07, + "logits/chosen": 0.8197698593139648, + "logits/rejected": 0.9059907793998718, + "logps/chosen": -9.506436347961426, + "logps/rejected": -10.890914916992188, + "loss": 0.4066, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -9.506436347961426, + "rewards/margins": 1.3844783306121826, + "rewards/rejected": -10.890914916992188, + "semantic_entropy": 0.0022954349406063557, + "step": 3950 + }, + { + "epoch": 2.1167419300886436, + "grad_norm": 23.22557014778233, + "learning_rate": 2.414182426310597e-07, + "logits/chosen": 0.758955180644989, + "logits/rejected": 0.8061805963516235, + "logps/chosen": -9.516363143920898, + "logps/rejected": -10.984567642211914, + "loss": 0.4001, + "rewards/accuracies": 0.84375, + "rewards/chosen": -9.516363143920898, + "rewards/margins": 1.4682044982910156, + "rewards/rejected": -10.984567642211914, + "semantic_entropy": 0.0018032476073130965, + "step": 3955 + }, + { + "epoch": 2.1194179628700454, + "grad_norm": 13.701261098953237, + "learning_rate": 2.400865390717734e-07, + "logits/chosen": 0.7926728129386902, + "logits/rejected": 0.8731076121330261, + "logps/chosen": -9.494871139526367, + "logps/rejected": -11.167115211486816, + "loss": 0.3389, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.494871139526367, + "rewards/margins": 1.6722424030303955, + "rewards/rejected": -11.167115211486816, + "semantic_entropy": 0.0018036758992820978, + "step": 3960 + }, + { + "epoch": 2.1220939956514466, + "grad_norm": 20.712815017204772, + "learning_rate": 2.3875735719402475e-07, + "logits/chosen": 0.7888078093528748, + "logits/rejected": 0.8680068850517273, + "logps/chosen": -9.749283790588379, + "logps/rejected": -11.215142250061035, + "loss": 0.3831, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.749283790588379, + "rewards/margins": 1.4658589363098145, + "rewards/rejected": -11.215142250061035, + "semantic_entropy": 0.0016434881836175919, + "step": 3965 + }, + { + "epoch": 2.1247700284328483, + "grad_norm": 20.343949879193467, + "learning_rate": 2.3743070989354258e-07, + "logits/chosen": 0.8662029504776001, + "logits/rejected": 0.9218491315841675, + "logps/chosen": -9.647726058959961, + "logps/rejected": -11.065472602844238, + "loss": 0.4375, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -9.647726058959961, + "rewards/margins": 1.4177464246749878, + "rewards/rejected": -11.065472602844238, + "semantic_entropy": 0.001984253991395235, + "step": 3970 + }, + { + "epoch": 2.12744606121425, + "grad_norm": 24.494673317414936, + "learning_rate": 2.3610661004146454e-07, + "logits/chosen": 0.8529459834098816, + "logits/rejected": 0.9132159352302551, + "logps/chosen": -9.46276569366455, + "logps/rejected": -10.809822082519531, + "loss": 0.3669, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -9.46276569366455, + "rewards/margins": 1.3470571041107178, + "rewards/rejected": -10.809822082519531, + "semantic_entropy": 0.0018992737168446183, + "step": 3975 + }, + { + "epoch": 2.1301220939956513, + "grad_norm": 19.20248865933138, + "learning_rate": 2.3478507048421314e-07, + "logits/chosen": 0.8148723840713501, + "logits/rejected": 0.8485409617424011, + "logps/chosen": -9.510710716247559, + "logps/rejected": -11.025351524353027, + "loss": 0.3994, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.510710716247559, + "rewards/margins": 1.5146404504776, + "rewards/rejected": -11.025351524353027, + "semantic_entropy": 0.0019136825576424599, + "step": 3980 + }, + { + "epoch": 2.132798126777053, + "grad_norm": 28.472850279807695, + "learning_rate": 2.334661040433713e-07, + "logits/chosen": 0.7736892700195312, + "logits/rejected": 0.8397472500801086, + "logps/chosen": -9.476969718933105, + "logps/rejected": -10.861968040466309, + "loss": 0.3791, + "rewards/accuracies": 0.84375, + "rewards/chosen": -9.476969718933105, + "rewards/margins": 1.3849985599517822, + "rewards/rejected": -10.861968040466309, + "semantic_entropy": 0.0017747702077031136, + "step": 3985 + }, + { + "epoch": 2.1354741595584548, + "grad_norm": 19.047738461075404, + "learning_rate": 2.321497235155568e-07, + "logits/chosen": 0.7408386468887329, + "logits/rejected": 0.7888758778572083, + "logps/chosen": -9.377801895141602, + "logps/rejected": -10.882084846496582, + "loss": 0.3476, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -9.377801895141602, + "rewards/margins": 1.5042815208435059, + "rewards/rejected": -10.882084846496582, + "semantic_entropy": 0.00191340665332973, + "step": 3990 + }, + { + "epoch": 2.138150192339856, + "grad_norm": 28.183809111461034, + "learning_rate": 2.3083594167229965e-07, + "logits/chosen": 0.748810887336731, + "logits/rejected": 0.8732229471206665, + "logps/chosen": -9.67158317565918, + "logps/rejected": -11.065845489501953, + "loss": 0.4378, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -9.67158317565918, + "rewards/margins": 1.3942630290985107, + "rewards/rejected": -11.065845489501953, + "semantic_entropy": 0.00166232546325773, + "step": 3995 + }, + { + "epoch": 2.1408262251212578, + "grad_norm": 25.264626186096123, + "learning_rate": 2.295247712599167e-07, + "logits/chosen": 0.806961178779602, + "logits/rejected": 0.8473097681999207, + "logps/chosen": -9.56501579284668, + "logps/rejected": -10.97465705871582, + "loss": 0.3953, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.56501579284668, + "rewards/margins": 1.4096405506134033, + "rewards/rejected": -10.97465705871582, + "semantic_entropy": 0.0016302301082760096, + "step": 4000 + }, + { + "epoch": 2.1408262251212578, + "eval_logits/chosen": 0.9285687804222107, + "eval_logits/rejected": 0.978207528591156, + "eval_logps/chosen": -9.8406982421875, + "eval_logps/rejected": -10.940929412841797, + "eval_loss": 0.526120126247406, + "eval_rewards/accuracies": 0.719584584236145, + "eval_rewards/chosen": -9.8406982421875, + "eval_rewards/margins": 1.100231647491455, + "eval_rewards/rejected": -10.940929412841797, + "eval_runtime": 35.0954, + "eval_samples_per_second": 38.324, + "eval_semantic_entropy": 0.001473370473831892, + "eval_steps_per_second": 9.602, + "step": 4000 + }, + { + "epoch": 2.1435022579026595, + "grad_norm": 23.667192452025375, + "learning_rate": 2.2821622499938948e-07, + "logits/chosen": 0.8243509531021118, + "logits/rejected": 0.9055337905883789, + "logps/chosen": -9.871681213378906, + "logps/rejected": -11.096675872802734, + "loss": 0.4597, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.871681213378906, + "rewards/margins": 1.224994421005249, + "rewards/rejected": -11.096675872802734, + "semantic_entropy": 0.0016837811563163996, + "step": 4005 + }, + { + "epoch": 2.1461782906840607, + "grad_norm": 27.290675701525487, + "learning_rate": 2.269103155862391e-07, + "logits/chosen": 0.76947021484375, + "logits/rejected": 0.8418231010437012, + "logps/chosen": -9.789541244506836, + "logps/rejected": -10.954092979431152, + "loss": 0.4684, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -9.789541244506836, + "rewards/margins": 1.1645511388778687, + "rewards/rejected": -10.954092979431152, + "semantic_entropy": 0.0013261919375509024, + "step": 4010 + }, + { + "epoch": 2.1488543234654625, + "grad_norm": 22.016730657163862, + "learning_rate": 2.2560705569040483e-07, + "logits/chosen": 0.7831665873527527, + "logits/rejected": 0.8772950172424316, + "logps/chosen": -9.797338485717773, + "logps/rejected": -11.046935081481934, + "loss": 0.4435, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.797338485717773, + "rewards/margins": 1.249597191810608, + "rewards/rejected": -11.046935081481934, + "semantic_entropy": 0.0014855300541967154, + "step": 4015 + }, + { + "epoch": 2.151530356246864, + "grad_norm": 18.024857287329926, + "learning_rate": 2.2430645795611963e-07, + "logits/chosen": 0.7459646463394165, + "logits/rejected": 0.8185451626777649, + "logps/chosen": -9.737576484680176, + "logps/rejected": -11.164031982421875, + "loss": 0.3774, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.737576484680176, + "rewards/margins": 1.4264552593231201, + "rewards/rejected": -11.164031982421875, + "semantic_entropy": 0.0017724098870530725, + "step": 4020 + }, + { + "epoch": 2.1542063890282654, + "grad_norm": 26.64615086075229, + "learning_rate": 2.230085350017884e-07, + "logits/chosen": 0.8524463772773743, + "logits/rejected": 0.8867918848991394, + "logps/chosen": -9.587358474731445, + "logps/rejected": -10.673149108886719, + "loss": 0.4612, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.587358474731445, + "rewards/margins": 1.0857917070388794, + "rewards/rejected": -10.673149108886719, + "semantic_entropy": 0.0018163727363571525, + "step": 4025 + }, + { + "epoch": 2.156882421809667, + "grad_norm": 16.629223181883795, + "learning_rate": 2.2171329941986554e-07, + "logits/chosen": 0.7545696496963501, + "logits/rejected": 0.8315639495849609, + "logps/chosen": -9.440114974975586, + "logps/rejected": -10.97568130493164, + "loss": 0.35, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -9.440114974975586, + "rewards/margins": 1.5355665683746338, + "rewards/rejected": -10.97568130493164, + "semantic_entropy": 0.0017560431733727455, + "step": 4030 + }, + { + "epoch": 2.159558454591069, + "grad_norm": 16.933778427501522, + "learning_rate": 2.2042076377673202e-07, + "logits/chosen": 0.7635836005210876, + "logits/rejected": 0.7833055853843689, + "logps/chosen": -9.408586502075195, + "logps/rejected": -10.607271194458008, + "loss": 0.4138, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.408586502075195, + "rewards/margins": 1.1986857652664185, + "rewards/rejected": -10.607271194458008, + "semantic_entropy": 0.0017117311945185065, + "step": 4035 + }, + { + "epoch": 2.16223448737247, + "grad_norm": 23.28336770400718, + "learning_rate": 2.1913094061257476e-07, + "logits/chosen": 0.8093854784965515, + "logits/rejected": 0.8086700439453125, + "logps/chosen": -9.568441390991211, + "logps/rejected": -10.82945442199707, + "loss": 0.4363, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.568441390991211, + "rewards/margins": 1.2610145807266235, + "rewards/rejected": -10.82945442199707, + "semantic_entropy": 0.0016031649429351091, + "step": 4040 + }, + { + "epoch": 2.164910520153872, + "grad_norm": 21.12058624108258, + "learning_rate": 2.178438424412633e-07, + "logits/chosen": 0.8389409780502319, + "logits/rejected": 0.8938524127006531, + "logps/chosen": -9.587557792663574, + "logps/rejected": -10.771153450012207, + "loss": 0.4503, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -9.587557792663574, + "rewards/margins": 1.1835949420928955, + "rewards/rejected": -10.771153450012207, + "semantic_entropy": 0.0016682265559211373, + "step": 4045 + }, + { + "epoch": 2.1675865529352736, + "grad_norm": 31.758164864561788, + "learning_rate": 2.165594817502302e-07, + "logits/chosen": 0.8181732892990112, + "logits/rejected": 0.8720429539680481, + "logps/chosen": -9.709211349487305, + "logps/rejected": -10.695291519165039, + "loss": 0.5042, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -9.709211349487305, + "rewards/margins": 0.9860790371894836, + "rewards/rejected": -10.695291519165039, + "semantic_entropy": 0.0015186185482889414, + "step": 4050 + }, + { + "epoch": 2.170262585716675, + "grad_norm": 22.613433139861957, + "learning_rate": 2.1527787100034806e-07, + "logits/chosen": 0.8588092923164368, + "logits/rejected": 0.8922932744026184, + "logps/chosen": -9.441000938415527, + "logps/rejected": -10.503214836120605, + "loss": 0.4521, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.441000938415527, + "rewards/margins": 1.062213659286499, + "rewards/rejected": -10.503214836120605, + "semantic_entropy": 0.0017032899195328355, + "step": 4055 + }, + { + "epoch": 2.1729386184980766, + "grad_norm": 18.889543716610927, + "learning_rate": 2.1399902262581037e-07, + "logits/chosen": 0.9263399243354797, + "logits/rejected": 0.967937171459198, + "logps/chosen": -9.547457695007324, + "logps/rejected": -10.7252836227417, + "loss": 0.4549, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.547457695007324, + "rewards/margins": 1.177826166152954, + "rewards/rejected": -10.7252836227417, + "semantic_entropy": 0.0017254750709980726, + "step": 4060 + }, + { + "epoch": 2.1756146512794783, + "grad_norm": 20.68930586575862, + "learning_rate": 2.127229490340094e-07, + "logits/chosen": 0.7730456590652466, + "logits/rejected": 0.801898181438446, + "logps/chosen": -9.515592575073242, + "logps/rejected": -10.984514236450195, + "loss": 0.3715, + "rewards/accuracies": 0.84375, + "rewards/chosen": -9.515592575073242, + "rewards/margins": 1.4689228534698486, + "rewards/rejected": -10.984514236450195, + "semantic_entropy": 0.0015030469512566924, + "step": 4065 + }, + { + "epoch": 2.1782906840608796, + "grad_norm": 24.806502277573657, + "learning_rate": 2.1144966260541698e-07, + "logits/chosen": 0.8492151498794556, + "logits/rejected": 0.9006759524345398, + "logps/chosen": -9.50521183013916, + "logps/rejected": -11.008363723754883, + "loss": 0.4001, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.50521183013916, + "rewards/margins": 1.5031511783599854, + "rewards/rejected": -11.008363723754883, + "semantic_entropy": 0.0017372198635712266, + "step": 4070 + }, + { + "epoch": 2.1809667168422813, + "grad_norm": 19.324917536101047, + "learning_rate": 2.1017917569346332e-07, + "logits/chosen": 0.814143180847168, + "logits/rejected": 0.8865806460380554, + "logps/chosen": -9.452108383178711, + "logps/rejected": -10.87096118927002, + "loss": 0.3736, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.452108383178711, + "rewards/margins": 1.4188525676727295, + "rewards/rejected": -10.87096118927002, + "semantic_entropy": 0.0015151125844568014, + "step": 4075 + }, + { + "epoch": 2.183642749623683, + "grad_norm": 18.81790999844241, + "learning_rate": 2.0891150062441837e-07, + "logits/chosen": 0.7656540870666504, + "logits/rejected": 0.8226898312568665, + "logps/chosen": -9.545916557312012, + "logps/rejected": -10.923840522766113, + "loss": 0.4004, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.545916557312012, + "rewards/margins": 1.3779232501983643, + "rewards/rejected": -10.923840522766113, + "semantic_entropy": 0.0021084630861878395, + "step": 4080 + }, + { + "epoch": 2.1863187824050843, + "grad_norm": 21.06645650645646, + "learning_rate": 2.0764664969727086e-07, + "logits/chosen": 0.8369568586349487, + "logits/rejected": 0.9141233563423157, + "logps/chosen": -9.500614166259766, + "logps/rejected": -10.795511245727539, + "loss": 0.3764, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -9.500614166259766, + "rewards/margins": 1.2948954105377197, + "rewards/rejected": -10.795511245727539, + "semantic_entropy": 0.0015911769587546587, + "step": 4085 + }, + { + "epoch": 2.188994815186486, + "grad_norm": 21.33908508690175, + "learning_rate": 2.0638463518361033e-07, + "logits/chosen": 0.7690576314926147, + "logits/rejected": 0.8658930063247681, + "logps/chosen": -9.389853477478027, + "logps/rejected": -10.788896560668945, + "loss": 0.3887, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.389853477478027, + "rewards/margins": 1.3990432024002075, + "rewards/rejected": -10.788896560668945, + "semantic_entropy": 0.002215514425188303, + "step": 4090 + }, + { + "epoch": 2.1916708479678877, + "grad_norm": 22.867041410821965, + "learning_rate": 2.0512546932750702e-07, + "logits/chosen": 0.7939780950546265, + "logits/rejected": 0.8460676074028015, + "logps/chosen": -9.5419282913208, + "logps/rejected": -10.808481216430664, + "loss": 0.3914, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -9.5419282913208, + "rewards/margins": 1.2665529251098633, + "rewards/rejected": -10.808481216430664, + "semantic_entropy": 0.0017644502222537994, + "step": 4095 + }, + { + "epoch": 2.194346880749289, + "grad_norm": 23.636550449380298, + "learning_rate": 2.0386916434539343e-07, + "logits/chosen": 0.8169394731521606, + "logits/rejected": 0.8834837675094604, + "logps/chosen": -9.410249710083008, + "logps/rejected": -10.827165603637695, + "loss": 0.3956, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -9.410249710083008, + "rewards/margins": 1.416915774345398, + "rewards/rejected": -10.827165603637695, + "semantic_entropy": 0.001839539734646678, + "step": 4100 + }, + { + "epoch": 2.1970229135306907, + "grad_norm": 23.012604021963845, + "learning_rate": 2.0261573242594627e-07, + "logits/chosen": 0.853449821472168, + "logits/rejected": 0.9539750218391418, + "logps/chosen": -9.770658493041992, + "logps/rejected": -11.06202507019043, + "loss": 0.4337, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.770658493041992, + "rewards/margins": 1.2913668155670166, + "rewards/rejected": -11.06202507019043, + "semantic_entropy": 0.0014862673124298453, + "step": 4105 + }, + { + "epoch": 2.1996989463120924, + "grad_norm": 25.51169802963628, + "learning_rate": 2.0136518572996724e-07, + "logits/chosen": 0.7688170671463013, + "logits/rejected": 0.8904584646224976, + "logps/chosen": -9.553049087524414, + "logps/rejected": -11.032699584960938, + "loss": 0.3851, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -9.553049087524414, + "rewards/margins": 1.4796515703201294, + "rewards/rejected": -11.032699584960938, + "semantic_entropy": 0.0020472349133342505, + "step": 4110 + }, + { + "epoch": 2.202374979093494, + "grad_norm": 23.510129434710546, + "learning_rate": 2.0011753639026617e-07, + "logits/chosen": 0.7789877653121948, + "logits/rejected": 0.8432193994522095, + "logps/chosen": -9.628904342651367, + "logps/rejected": -10.928987503051758, + "loss": 0.4125, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.628904342651367, + "rewards/margins": 1.3000822067260742, + "rewards/rejected": -10.928987503051758, + "semantic_entropy": 0.001494646305218339, + "step": 4115 + }, + { + "epoch": 2.2050510118748954, + "grad_norm": 25.695966110067108, + "learning_rate": 1.988727965115421e-07, + "logits/chosen": 0.8373724222183228, + "logits/rejected": 0.8486580848693848, + "logps/chosen": -9.40330696105957, + "logps/rejected": -10.690084457397461, + "loss": 0.4098, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.40330696105957, + "rewards/margins": 1.2867774963378906, + "rewards/rejected": -10.690084457397461, + "semantic_entropy": 0.0017762102652341127, + "step": 4120 + }, + { + "epoch": 2.207727044656297, + "grad_norm": 17.76197982327494, + "learning_rate": 1.9763097817026713e-07, + "logits/chosen": 0.7693505883216858, + "logits/rejected": 0.8610559701919556, + "logps/chosen": -9.42241382598877, + "logps/rejected": -11.041936874389648, + "loss": 0.3312, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -9.42241382598877, + "rewards/margins": 1.619523286819458, + "rewards/rejected": -11.041936874389648, + "semantic_entropy": 0.0017702898476272821, + "step": 4125 + }, + { + "epoch": 2.210403077437699, + "grad_norm": 17.95604315957035, + "learning_rate": 1.9639209341456796e-07, + "logits/chosen": 0.8016265630722046, + "logits/rejected": 0.8611480593681335, + "logps/chosen": -9.537622451782227, + "logps/rejected": -10.907397270202637, + "loss": 0.4239, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -9.537622451782227, + "rewards/margins": 1.3697750568389893, + "rewards/rejected": -10.907397270202637, + "semantic_entropy": 0.001632682979106903, + "step": 4130 + }, + { + "epoch": 2.2130791102191, + "grad_norm": 16.93384833519254, + "learning_rate": 1.951561542641102e-07, + "logits/chosen": 0.8083820343017578, + "logits/rejected": 0.8523054122924805, + "logps/chosen": -9.715473175048828, + "logps/rejected": -11.148602485656738, + "loss": 0.4369, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.715473175048828, + "rewards/margins": 1.4331295490264893, + "rewards/rejected": -11.148602485656738, + "semantic_entropy": 0.0015578053425997496, + "step": 4135 + }, + { + "epoch": 2.215755143000502, + "grad_norm": 18.601513607697683, + "learning_rate": 1.939231727099806e-07, + "logits/chosen": 0.7638577818870544, + "logits/rejected": 0.7983411550521851, + "logps/chosen": -9.586159706115723, + "logps/rejected": -10.826416015625, + "loss": 0.4351, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.586159706115723, + "rewards/margins": 1.2402559518814087, + "rewards/rejected": -10.826416015625, + "semantic_entropy": 0.001649503014050424, + "step": 4140 + }, + { + "epoch": 2.2184311757819035, + "grad_norm": 24.016266301167416, + "learning_rate": 1.926931607145719e-07, + "logits/chosen": 0.8312221765518188, + "logits/rejected": 0.8952839970588684, + "logps/chosen": -9.727499008178711, + "logps/rejected": -11.005497932434082, + "loss": 0.4144, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -9.727499008178711, + "rewards/margins": 1.2779988050460815, + "rewards/rejected": -11.005497932434082, + "semantic_entropy": 0.0015145648503676057, + "step": 4145 + }, + { + "epoch": 2.221107208563305, + "grad_norm": 20.566896327558037, + "learning_rate": 1.9146613021146564e-07, + "logits/chosen": 0.8225449323654175, + "logits/rejected": 0.8508152961730957, + "logps/chosen": -9.408650398254395, + "logps/rejected": -10.707399368286133, + "loss": 0.4173, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -9.408650398254395, + "rewards/margins": 1.2987501621246338, + "rewards/rejected": -10.707399368286133, + "semantic_entropy": 0.0019972771406173706, + "step": 4150 + }, + { + "epoch": 2.2237832413447065, + "grad_norm": 22.344243213376547, + "learning_rate": 1.9024209310531736e-07, + "logits/chosen": 0.847356915473938, + "logits/rejected": 0.8640506863594055, + "logps/chosen": -9.579663276672363, + "logps/rejected": -10.968865394592285, + "loss": 0.4137, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.579663276672363, + "rewards/margins": 1.3892011642456055, + "rewards/rejected": -10.968865394592285, + "semantic_entropy": 0.0021100840531289577, + "step": 4155 + }, + { + "epoch": 2.2264592741261082, + "grad_norm": 19.57624931515473, + "learning_rate": 1.890210612717401e-07, + "logits/chosen": 0.8184317350387573, + "logits/rejected": 0.88373863697052, + "logps/chosen": -9.572199821472168, + "logps/rejected": -11.031925201416016, + "loss": 0.3741, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -9.572199821472168, + "rewards/margins": 1.4597254991531372, + "rewards/rejected": -11.031925201416016, + "semantic_entropy": 0.0016687295865267515, + "step": 4160 + }, + { + "epoch": 2.2291353069075095, + "grad_norm": 24.504791180268253, + "learning_rate": 1.8780304655719054e-07, + "logits/chosen": 0.8567901849746704, + "logits/rejected": 0.9107440114021301, + "logps/chosen": -9.613670349121094, + "logps/rejected": -11.115751266479492, + "loss": 0.3757, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -9.613670349121094, + "rewards/margins": 1.5020800828933716, + "rewards/rejected": -11.115751266479492, + "semantic_entropy": 0.0012760651297867298, + "step": 4165 + }, + { + "epoch": 2.231811339688911, + "grad_norm": 35.6519577235443, + "learning_rate": 1.865880607788523e-07, + "logits/chosen": 0.8858783841133118, + "logits/rejected": 0.9201458096504211, + "logps/chosen": -9.616140365600586, + "logps/rejected": -10.997381210327148, + "loss": 0.4086, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -9.616140365600586, + "rewards/margins": 1.3812413215637207, + "rewards/rejected": -10.997381210327148, + "semantic_entropy": 0.0018040050053969026, + "step": 4170 + }, + { + "epoch": 2.234487372470313, + "grad_norm": 26.229983342336126, + "learning_rate": 1.8537611572452316e-07, + "logits/chosen": 0.8341430425643921, + "logits/rejected": 0.8626706004142761, + "logps/chosen": -9.763750076293945, + "logps/rejected": -10.994816780090332, + "loss": 0.4061, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.763750076293945, + "rewards/margins": 1.2310662269592285, + "rewards/rejected": -10.994816780090332, + "semantic_entropy": 0.001325559918768704, + "step": 4175 + }, + { + "epoch": 2.237163405251714, + "grad_norm": 19.660501965854834, + "learning_rate": 1.84167223152499e-07, + "logits/chosen": 0.8540051579475403, + "logits/rejected": 0.913652777671814, + "logps/chosen": -9.744295120239258, + "logps/rejected": -11.07103157043457, + "loss": 0.4022, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -9.744295120239258, + "rewards/margins": 1.3267360925674438, + "rewards/rejected": -11.07103157043457, + "semantic_entropy": 0.0015319742960855365, + "step": 4180 + }, + { + "epoch": 2.239839438033116, + "grad_norm": 22.89745562199727, + "learning_rate": 1.8296139479146112e-07, + "logits/chosen": 0.7796264886856079, + "logits/rejected": 0.8492299318313599, + "logps/chosen": -9.544806480407715, + "logps/rejected": -10.937907218933105, + "loss": 0.3939, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -9.544806480407715, + "rewards/margins": 1.3931005001068115, + "rewards/rejected": -10.937907218933105, + "semantic_entropy": 0.0017392231384292245, + "step": 4185 + }, + { + "epoch": 2.2425154708145176, + "grad_norm": 21.62752883795121, + "learning_rate": 1.8175864234036132e-07, + "logits/chosen": 0.8781774640083313, + "logits/rejected": 0.9093042612075806, + "logps/chosen": -9.595394134521484, + "logps/rejected": -10.983304977416992, + "loss": 0.417, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.595394134521484, + "rewards/margins": 1.3879096508026123, + "rewards/rejected": -10.983304977416992, + "semantic_entropy": 0.0014014368643984199, + "step": 4190 + }, + { + "epoch": 2.245191503595919, + "grad_norm": 22.389579813909606, + "learning_rate": 1.805589774683094e-07, + "logits/chosen": 0.7380444407463074, + "logits/rejected": 0.7925786972045898, + "logps/chosen": -9.564143180847168, + "logps/rejected": -10.847589492797852, + "loss": 0.3963, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.564143180847168, + "rewards/margins": 1.2834450006484985, + "rewards/rejected": -10.847589492797852, + "semantic_entropy": 0.0015661569777876139, + "step": 4195 + }, + { + "epoch": 2.2478675363773206, + "grad_norm": 23.059520479950827, + "learning_rate": 1.79362411814459e-07, + "logits/chosen": 0.8489105105400085, + "logits/rejected": 0.8289289474487305, + "logps/chosen": -9.792860984802246, + "logps/rejected": -10.936185836791992, + "loss": 0.4414, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.792860984802246, + "rewards/margins": 1.1433252096176147, + "rewards/rejected": -10.936185836791992, + "semantic_entropy": 0.0015545317437499762, + "step": 4200 + }, + { + "epoch": 2.2505435691587223, + "grad_norm": 20.613769102957825, + "learning_rate": 1.7816895698789552e-07, + "logits/chosen": 0.7959033250808716, + "logits/rejected": 0.8639119267463684, + "logps/chosen": -9.70136833190918, + "logps/rejected": -10.945596694946289, + "loss": 0.4202, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.70136833190918, + "rewards/margins": 1.244227409362793, + "rewards/rejected": -10.945596694946289, + "semantic_entropy": 0.0013822594191879034, + "step": 4205 + }, + { + "epoch": 2.2532196019401236, + "grad_norm": 17.406251682106838, + "learning_rate": 1.7697862456752271e-07, + "logits/chosen": 0.7929319143295288, + "logits/rejected": 0.8579456210136414, + "logps/chosen": -9.719596862792969, + "logps/rejected": -11.37035083770752, + "loss": 0.3609, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -9.719596862792969, + "rewards/margins": 1.6507545709609985, + "rewards/rejected": -11.37035083770752, + "semantic_entropy": 0.0013113311724737287, + "step": 4210 + }, + { + "epoch": 2.2558956347215253, + "grad_norm": 20.15920926644943, + "learning_rate": 1.7579142610195124e-07, + "logits/chosen": 0.7851302623748779, + "logits/rejected": 0.8529064059257507, + "logps/chosen": -9.851489067077637, + "logps/rejected": -11.166707038879395, + "loss": 0.4203, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.851489067077637, + "rewards/margins": 1.3152183294296265, + "rewards/rejected": -11.166707038879395, + "semantic_entropy": 0.0013941864017397165, + "step": 4215 + }, + { + "epoch": 2.258571667502927, + "grad_norm": 19.09139226174037, + "learning_rate": 1.7460737310938568e-07, + "logits/chosen": 0.8212282061576843, + "logits/rejected": 0.877540111541748, + "logps/chosen": -9.683355331420898, + "logps/rejected": -11.168909072875977, + "loss": 0.3788, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.683355331420898, + "rewards/margins": 1.4855531454086304, + "rewards/rejected": -11.168909072875977, + "semantic_entropy": 0.0017290354007855058, + "step": 4220 + }, + { + "epoch": 2.2612477002843283, + "grad_norm": 19.755486537220495, + "learning_rate": 1.734264770775133e-07, + "logits/chosen": 0.7770802974700928, + "logits/rejected": 0.8633508682250977, + "logps/chosen": -9.642881393432617, + "logps/rejected": -11.013988494873047, + "loss": 0.4055, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -9.642881393432617, + "rewards/margins": 1.3711069822311401, + "rewards/rejected": -11.013988494873047, + "semantic_entropy": 0.0014985213056206703, + "step": 4225 + }, + { + "epoch": 2.26392373306573, + "grad_norm": 21.927676822159636, + "learning_rate": 1.7224874946339241e-07, + "logits/chosen": 0.8036985397338867, + "logits/rejected": 0.8095115423202515, + "logps/chosen": -9.7802152633667, + "logps/rejected": -11.069761276245117, + "loss": 0.4268, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.7802152633667, + "rewards/margins": 1.289547085762024, + "rewards/rejected": -11.069761276245117, + "semantic_entropy": 0.0012104662600904703, + "step": 4230 + }, + { + "epoch": 2.2665997658471317, + "grad_norm": 16.00842803469047, + "learning_rate": 1.7107420169334186e-07, + "logits/chosen": 0.7866020202636719, + "logits/rejected": 0.8434419631958008, + "logps/chosen": -9.776571273803711, + "logps/rejected": -11.023462295532227, + "loss": 0.4244, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.776571273803711, + "rewards/margins": 1.2468903064727783, + "rewards/rejected": -11.023462295532227, + "semantic_entropy": 0.0012111186515539885, + "step": 4235 + }, + { + "epoch": 2.269275798628533, + "grad_norm": 17.323870558007282, + "learning_rate": 1.6990284516282893e-07, + "logits/chosen": 0.8010492324829102, + "logits/rejected": 0.8359723091125488, + "logps/chosen": -9.49112319946289, + "logps/rejected": -10.895849227905273, + "loss": 0.3877, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -9.49112319946289, + "rewards/margins": 1.404726266860962, + "rewards/rejected": -10.895849227905273, + "semantic_entropy": 0.0014989904593676329, + "step": 4240 + }, + { + "epoch": 2.2719518314099347, + "grad_norm": 17.259101259042957, + "learning_rate": 1.687346912363602e-07, + "logits/chosen": 0.8071710467338562, + "logits/rejected": 0.8544157147407532, + "logps/chosen": -9.638090133666992, + "logps/rejected": -11.011279106140137, + "loss": 0.3814, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -9.638090133666992, + "rewards/margins": 1.3731900453567505, + "rewards/rejected": -11.011279106140137, + "semantic_entropy": 0.0015442619333043694, + "step": 4245 + }, + { + "epoch": 2.2746278641913364, + "grad_norm": 15.415019825942553, + "learning_rate": 1.675697512473697e-07, + "logits/chosen": 0.8083289861679077, + "logits/rejected": 0.9057637453079224, + "logps/chosen": -9.574909210205078, + "logps/rejected": -10.998074531555176, + "loss": 0.3687, + "rewards/accuracies": 0.84375, + "rewards/chosen": -9.574909210205078, + "rewards/margins": 1.4231641292572021, + "rewards/rejected": -10.998074531555176, + "semantic_entropy": 0.00155646784696728, + "step": 4250 + }, + { + "epoch": 2.2773038969727377, + "grad_norm": 21.576286115755277, + "learning_rate": 1.6640803649811087e-07, + "logits/chosen": 0.8501211404800415, + "logits/rejected": 0.9308522343635559, + "logps/chosen": -9.679555892944336, + "logps/rejected": -11.19702434539795, + "loss": 0.3827, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.679555892944336, + "rewards/margins": 1.517469048500061, + "rewards/rejected": -11.19702434539795, + "semantic_entropy": 0.0016793437534943223, + "step": 4255 + }, + { + "epoch": 2.2799799297541394, + "grad_norm": 26.719598028515872, + "learning_rate": 1.6524955825954472e-07, + "logits/chosen": 0.8302766680717468, + "logits/rejected": 0.8800037503242493, + "logps/chosen": -9.66600513458252, + "logps/rejected": -10.949085235595703, + "loss": 0.4151, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.66600513458252, + "rewards/margins": 1.2830795049667358, + "rewards/rejected": -10.949085235595703, + "semantic_entropy": 0.001497269026003778, + "step": 4260 + }, + { + "epoch": 2.282655962535541, + "grad_norm": 18.483844690586892, + "learning_rate": 1.6409432777123277e-07, + "logits/chosen": 0.8208200335502625, + "logits/rejected": 0.8599546551704407, + "logps/chosen": -9.824455261230469, + "logps/rejected": -11.335619926452637, + "loss": 0.3885, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.824455261230469, + "rewards/margins": 1.511163353919983, + "rewards/rejected": -11.335619926452637, + "semantic_entropy": 0.0013760743895545602, + "step": 4265 + }, + { + "epoch": 2.285331995316943, + "grad_norm": 20.65468077960879, + "learning_rate": 1.6294235624122577e-07, + "logits/chosen": 0.8452394604682922, + "logits/rejected": 0.9035156965255737, + "logps/chosen": -9.813318252563477, + "logps/rejected": -11.151365280151367, + "loss": 0.4044, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.813318252563477, + "rewards/margins": 1.338047742843628, + "rewards/rejected": -11.151365280151367, + "semantic_entropy": 0.0013139288639649749, + "step": 4270 + }, + { + "epoch": 2.288008028098344, + "grad_norm": 24.257011022001592, + "learning_rate": 1.6179365484595697e-07, + "logits/chosen": 0.7976378202438354, + "logits/rejected": 0.8221977353096008, + "logps/chosen": -9.780439376831055, + "logps/rejected": -11.116656303405762, + "loss": 0.4225, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.780439376831055, + "rewards/margins": 1.3362162113189697, + "rewards/rejected": -11.116656303405762, + "semantic_entropy": 0.0013957961928099394, + "step": 4275 + }, + { + "epoch": 2.290684060879746, + "grad_norm": 22.65516964126615, + "learning_rate": 1.60648234730132e-07, + "logits/chosen": 0.8350450396537781, + "logits/rejected": 0.8612421154975891, + "logps/chosen": -9.680601119995117, + "logps/rejected": -11.20177936553955, + "loss": 0.3626, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -9.680601119995117, + "rewards/margins": 1.5211775302886963, + "rewards/rejected": -11.20177936553955, + "semantic_entropy": 0.0013702240539714694, + "step": 4280 + }, + { + "epoch": 2.293360093661147, + "grad_norm": 29.58219434395317, + "learning_rate": 1.595061070066222e-07, + "logits/chosen": 0.8323311805725098, + "logits/rejected": 0.8706264495849609, + "logps/chosen": -9.795225143432617, + "logps/rejected": -11.236984252929688, + "loss": 0.3767, + "rewards/accuracies": 0.84375, + "rewards/chosen": -9.795225143432617, + "rewards/margins": 1.441759467124939, + "rewards/rejected": -11.236984252929688, + "semantic_entropy": 0.0013441203627735376, + "step": 4285 + }, + { + "epoch": 2.296036126442549, + "grad_norm": 29.102509702206316, + "learning_rate": 1.5836728275635542e-07, + "logits/chosen": 0.7748151421546936, + "logits/rejected": 0.81391441822052, + "logps/chosen": -9.950288772583008, + "logps/rejected": -11.165694236755371, + "loss": 0.4377, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -9.950288772583008, + "rewards/margins": 1.215405821800232, + "rewards/rejected": -11.165694236755371, + "semantic_entropy": 0.0010313175152987242, + "step": 4290 + }, + { + "epoch": 2.2987121592239506, + "grad_norm": 21.581567035471323, + "learning_rate": 1.5723177302820984e-07, + "logits/chosen": 0.8050596117973328, + "logits/rejected": 0.8407198190689087, + "logps/chosen": -9.88306999206543, + "logps/rejected": -11.050642013549805, + "loss": 0.4321, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.88306999206543, + "rewards/margins": 1.167571783065796, + "rewards/rejected": -11.050642013549805, + "semantic_entropy": 0.0010738309938460588, + "step": 4295 + }, + { + "epoch": 2.3013881920053523, + "grad_norm": 18.06623114943259, + "learning_rate": 1.5609958883890544e-07, + "logits/chosen": 0.8042596578598022, + "logits/rejected": 0.8785734176635742, + "logps/chosen": -9.795085906982422, + "logps/rejected": -11.104022979736328, + "loss": 0.3878, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -9.795085906982422, + "rewards/margins": 1.308937907218933, + "rewards/rejected": -11.104022979736328, + "semantic_entropy": 0.0012584684882313013, + "step": 4300 + }, + { + "epoch": 2.3040642247867535, + "grad_norm": 24.33590640447122, + "learning_rate": 1.5497074117289865e-07, + "logits/chosen": 0.7726608514785767, + "logits/rejected": 0.8293962478637695, + "logps/chosen": -9.73291015625, + "logps/rejected": -11.144505500793457, + "loss": 0.4062, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.73291015625, + "rewards/margins": 1.4115943908691406, + "rewards/rejected": -11.144505500793457, + "semantic_entropy": 0.002003467408940196, + "step": 4305 + }, + { + "epoch": 2.3067402575681553, + "grad_norm": 19.852698261404328, + "learning_rate": 1.5384524098227402e-07, + "logits/chosen": 0.8046091198921204, + "logits/rejected": 0.8712922930717468, + "logps/chosen": -9.885152816772461, + "logps/rejected": -11.513572692871094, + "loss": 0.3311, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -9.885152816772461, + "rewards/margins": 1.6284195184707642, + "rewards/rejected": -11.513572692871094, + "semantic_entropy": 0.0014156540855765343, + "step": 4310 + }, + { + "epoch": 2.3094162903495565, + "grad_norm": 23.28791831420902, + "learning_rate": 1.5272309918663974e-07, + "logits/chosen": 0.7911036610603333, + "logits/rejected": 0.8605779409408569, + "logps/chosen": -9.974563598632812, + "logps/rejected": -11.249377250671387, + "loss": 0.467, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.974563598632812, + "rewards/margins": 1.2748134136199951, + "rewards/rejected": -11.249377250671387, + "semantic_entropy": 0.0012652326840907335, + "step": 4315 + }, + { + "epoch": 2.3120923231309582, + "grad_norm": 18.031587200544166, + "learning_rate": 1.516043266730201e-07, + "logits/chosen": 0.8097645044326782, + "logits/rejected": 0.8588771820068359, + "logps/chosen": -9.800325393676758, + "logps/rejected": -11.185141563415527, + "loss": 0.393, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -9.800325393676758, + "rewards/margins": 1.3848176002502441, + "rewards/rejected": -11.185141563415527, + "semantic_entropy": 0.0018925167387351394, + "step": 4320 + }, + { + "epoch": 2.31476835591236, + "grad_norm": 29.399301801147324, + "learning_rate": 1.504889342957512e-07, + "logits/chosen": 0.7945131063461304, + "logits/rejected": 0.8561771512031555, + "logps/chosen": -9.814886093139648, + "logps/rejected": -11.265652656555176, + "loss": 0.4417, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -9.814886093139648, + "rewards/margins": 1.45076584815979, + "rewards/rejected": -11.265652656555176, + "semantic_entropy": 0.0013577769277617335, + "step": 4325 + }, + { + "epoch": 2.3174443886937617, + "grad_norm": 18.818414850816456, + "learning_rate": 1.4937693287637453e-07, + "logits/chosen": 0.782823920249939, + "logits/rejected": 0.8478276133537292, + "logps/chosen": -9.825902938842773, + "logps/rejected": -11.110780715942383, + "loss": 0.4206, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -9.825902938842773, + "rewards/margins": 1.2848764657974243, + "rewards/rejected": -11.110780715942383, + "semantic_entropy": 0.0011613890528678894, + "step": 4330 + }, + { + "epoch": 2.320120421475163, + "grad_norm": 23.834103955908468, + "learning_rate": 1.4826833320353305e-07, + "logits/chosen": 0.7609673142433167, + "logits/rejected": 0.8239792585372925, + "logps/chosen": -9.851580619812012, + "logps/rejected": -11.248353958129883, + "loss": 0.385, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -9.851580619812012, + "rewards/margins": 1.3967727422714233, + "rewards/rejected": -11.248353958129883, + "semantic_entropy": 0.0012605976080521941, + "step": 4335 + }, + { + "epoch": 2.3227964542565647, + "grad_norm": 23.686138337629355, + "learning_rate": 1.4716314603286528e-07, + "logits/chosen": 0.8113320469856262, + "logits/rejected": 0.853225588798523, + "logps/chosen": -9.844433784484863, + "logps/rejected": -11.266097068786621, + "loss": 0.4029, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.844433784484863, + "rewards/margins": 1.4216625690460205, + "rewards/rejected": -11.266097068786621, + "semantic_entropy": 0.0013559302315115929, + "step": 4340 + }, + { + "epoch": 2.3254724870379664, + "grad_norm": 25.669925692295536, + "learning_rate": 1.4606138208690233e-07, + "logits/chosen": 0.7266643643379211, + "logits/rejected": 0.8064903020858765, + "logps/chosen": -9.897780418395996, + "logps/rejected": -11.229866981506348, + "loss": 0.4261, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.897780418395996, + "rewards/margins": 1.3320866823196411, + "rewards/rejected": -11.229866981506348, + "semantic_entropy": 0.0012010873761028051, + "step": 4345 + }, + { + "epoch": 2.3281485198193677, + "grad_norm": 24.94777596239184, + "learning_rate": 1.4496305205496251e-07, + "logits/chosen": 0.796136200428009, + "logits/rejected": 0.8669074177742004, + "logps/chosen": -9.997017860412598, + "logps/rejected": -11.391059875488281, + "loss": 0.3978, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.997017860412598, + "rewards/margins": 1.3940420150756836, + "rewards/rejected": -11.391059875488281, + "semantic_entropy": 0.0011204956099390984, + "step": 4350 + }, + { + "epoch": 2.3308245526007694, + "grad_norm": 25.67221825991835, + "learning_rate": 1.4386816659304895e-07, + "logits/chosen": 0.7781258225440979, + "logits/rejected": 0.8242195248603821, + "logps/chosen": -9.795356750488281, + "logps/rejected": -11.114994049072266, + "loss": 0.3966, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -9.795356750488281, + "rewards/margins": 1.3196370601654053, + "rewards/rejected": -11.114994049072266, + "semantic_entropy": 0.0012639164924621582, + "step": 4355 + }, + { + "epoch": 2.333500585382171, + "grad_norm": 29.57080046877201, + "learning_rate": 1.4277673632374492e-07, + "logits/chosen": 0.7557036876678467, + "logits/rejected": 0.8132128715515137, + "logps/chosen": -9.90876579284668, + "logps/rejected": -11.25818920135498, + "loss": 0.4118, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -9.90876579284668, + "rewards/margins": 1.3494237661361694, + "rewards/rejected": -11.25818920135498, + "semantic_entropy": 0.0013076277682557702, + "step": 4360 + }, + { + "epoch": 2.3361766181635724, + "grad_norm": 20.78894701815076, + "learning_rate": 1.416887718361119e-07, + "logits/chosen": 0.8289060592651367, + "logits/rejected": 0.8358928561210632, + "logps/chosen": -9.96793270111084, + "logps/rejected": -11.24679183959961, + "loss": 0.4306, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.96793270111084, + "rewards/margins": 1.2788599729537964, + "rewards/rejected": -11.24679183959961, + "semantic_entropy": 0.0013636414660140872, + "step": 4365 + }, + { + "epoch": 2.338852650944974, + "grad_norm": 26.519372471909644, + "learning_rate": 1.406042836855859e-07, + "logits/chosen": 0.8788352012634277, + "logits/rejected": 0.9096766710281372, + "logps/chosen": -9.882646560668945, + "logps/rejected": -11.346869468688965, + "loss": 0.3805, + "rewards/accuracies": 0.84375, + "rewards/chosen": -9.882646560668945, + "rewards/margins": 1.464221715927124, + "rewards/rejected": -11.346869468688965, + "semantic_entropy": 0.0013449579710140824, + "step": 4370 + }, + { + "epoch": 2.341528683726376, + "grad_norm": 26.805430797019667, + "learning_rate": 1.3952328239387595e-07, + "logits/chosen": 0.7472053170204163, + "logits/rejected": 0.8391642570495605, + "logps/chosen": -9.822053909301758, + "logps/rejected": -11.334062576293945, + "loss": 0.3744, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -9.822053909301758, + "rewards/margins": 1.5120099782943726, + "rewards/rejected": -11.334062576293945, + "semantic_entropy": 0.0012804374564439058, + "step": 4375 + }, + { + "epoch": 2.344204716507777, + "grad_norm": 23.745463280246494, + "learning_rate": 1.3844577844886109e-07, + "logits/chosen": 0.8212148547172546, + "logits/rejected": 0.9034450650215149, + "logps/chosen": -9.849560737609863, + "logps/rejected": -11.316389083862305, + "loss": 0.3922, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -9.849560737609863, + "rewards/margins": 1.4668283462524414, + "rewards/rejected": -11.316389083862305, + "semantic_entropy": 0.0014727965462952852, + "step": 4380 + }, + { + "epoch": 2.346880749289179, + "grad_norm": 24.765118352803277, + "learning_rate": 1.3737178230448955e-07, + "logits/chosen": 0.706335723400116, + "logits/rejected": 0.7600988149642944, + "logps/chosen": -10.072949409484863, + "logps/rejected": -11.255311965942383, + "loss": 0.4585, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -10.072949409484863, + "rewards/margins": 1.1823631525039673, + "rewards/rejected": -11.255311965942383, + "semantic_entropy": 0.0010486546671018004, + "step": 4385 + }, + { + "epoch": 2.3495567820705805, + "grad_norm": 16.53037049152132, + "learning_rate": 1.363013043806764e-07, + "logits/chosen": 0.8160565495491028, + "logits/rejected": 0.8825929760932922, + "logps/chosen": -9.718530654907227, + "logps/rejected": -11.098315238952637, + "loss": 0.375, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -9.718530654907227, + "rewards/margins": 1.3797847032546997, + "rewards/rejected": -11.098315238952637, + "semantic_entropy": 0.0014672328252345324, + "step": 4390 + }, + { + "epoch": 2.3522328148519818, + "grad_norm": 19.343996568570635, + "learning_rate": 1.352343550632034e-07, + "logits/chosen": 0.8211394548416138, + "logits/rejected": 0.8589351773262024, + "logps/chosen": -9.79681396484375, + "logps/rejected": -11.286918640136719, + "loss": 0.4092, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.79681396484375, + "rewards/margins": 1.4901044368743896, + "rewards/rejected": -11.286918640136719, + "semantic_entropy": 0.0013513191370293498, + "step": 4395 + }, + { + "epoch": 2.3549088476333835, + "grad_norm": 18.946006963734423, + "learning_rate": 1.3417094470361722e-07, + "logits/chosen": 0.777470052242279, + "logits/rejected": 0.8312585949897766, + "logps/chosen": -9.910394668579102, + "logps/rejected": -11.160832405090332, + "loss": 0.428, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.910394668579102, + "rewards/margins": 1.2504370212554932, + "rewards/rejected": -11.160832405090332, + "semantic_entropy": 0.0012193446746096015, + "step": 4400 + }, + { + "epoch": 2.3549088476333835, + "eval_logits/chosen": 0.9214699268341064, + "eval_logits/rejected": 0.9721218943595886, + "eval_logps/chosen": -9.951480865478516, + "eval_logps/rejected": -11.088980674743652, + "eval_loss": 0.5250210762023926, + "eval_rewards/accuracies": 0.721068263053894, + "eval_rewards/chosen": -9.951480865478516, + "eval_rewards/margins": 1.1374988555908203, + "eval_rewards/rejected": -11.088980674743652, + "eval_runtime": 35.1208, + "eval_samples_per_second": 38.296, + "eval_semantic_entropy": 0.0012979113962501287, + "eval_steps_per_second": 9.595, + "step": 4400 + }, + { + "epoch": 2.357584880414785, + "grad_norm": 24.25396278476502, + "learning_rate": 1.3311108361913015e-07, + "logits/chosen": 0.7317711114883423, + "logits/rejected": 0.7822341322898865, + "logps/chosen": -9.765314102172852, + "logps/rejected": -11.11705207824707, + "loss": 0.3892, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.765314102172852, + "rewards/margins": 1.3517379760742188, + "rewards/rejected": -11.11705207824707, + "semantic_entropy": 0.001415650942362845, + "step": 4405 + }, + { + "epoch": 2.3602609131961865, + "grad_norm": 16.799881334103528, + "learning_rate": 1.3205478209251874e-07, + "logits/chosen": 0.8066733479499817, + "logits/rejected": 0.9000295400619507, + "logps/chosen": -9.963193893432617, + "logps/rejected": -11.469663619995117, + "loss": 0.3841, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -9.963193893432617, + "rewards/margins": 1.5064703226089478, + "rewards/rejected": -11.469663619995117, + "semantic_entropy": 0.001393306301906705, + "step": 4410 + }, + { + "epoch": 2.362936945977588, + "grad_norm": 22.10955918100729, + "learning_rate": 1.310020503720254e-07, + "logits/chosen": 0.7781058549880981, + "logits/rejected": 0.822067141532898, + "logps/chosen": -9.866239547729492, + "logps/rejected": -11.280614852905273, + "loss": 0.4068, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.866239547729492, + "rewards/margins": 1.4143754243850708, + "rewards/rejected": -11.280614852905273, + "semantic_entropy": 0.0012620962224900723, + "step": 4415 + }, + { + "epoch": 2.36561297875899, + "grad_norm": 24.95675503053961, + "learning_rate": 1.2995289867125752e-07, + "logits/chosen": 0.7621157765388489, + "logits/rejected": 0.7959357500076294, + "logps/chosen": -9.751391410827637, + "logps/rejected": -10.94892692565918, + "loss": 0.4499, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.751391410827637, + "rewards/margins": 1.1975345611572266, + "rewards/rejected": -10.94892692565918, + "semantic_entropy": 0.001342722331173718, + "step": 4420 + }, + { + "epoch": 2.368289011540391, + "grad_norm": 22.719615672561844, + "learning_rate": 1.2890733716908986e-07, + "logits/chosen": 0.7777091264724731, + "logits/rejected": 0.860866367816925, + "logps/chosen": -9.617403030395508, + "logps/rejected": -11.080734252929688, + "loss": 0.3296, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.617403030395508, + "rewards/margins": 1.4633299112319946, + "rewards/rejected": -11.080734252929688, + "semantic_entropy": 0.0015527913346886635, + "step": 4425 + }, + { + "epoch": 2.370965044321793, + "grad_norm": 22.213207378400554, + "learning_rate": 1.2786537600956454e-07, + "logits/chosen": 0.7948800325393677, + "logits/rejected": 0.8321939706802368, + "logps/chosen": -9.664536476135254, + "logps/rejected": -11.117993354797363, + "loss": 0.3919, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.664536476135254, + "rewards/margins": 1.4534571170806885, + "rewards/rejected": -11.117993354797363, + "semantic_entropy": 0.0012703756801784039, + "step": 4430 + }, + { + "epoch": 2.3736410771031946, + "grad_norm": 17.553736783320986, + "learning_rate": 1.268270253017933e-07, + "logits/chosen": 0.8016083836555481, + "logits/rejected": 0.8781582117080688, + "logps/chosen": -9.703906059265137, + "logps/rejected": -11.203841209411621, + "loss": 0.3737, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -9.703906059265137, + "rewards/margins": 1.4999356269836426, + "rewards/rejected": -11.203841209411621, + "semantic_entropy": 0.0014606801560148597, + "step": 4435 + }, + { + "epoch": 2.376317109884596, + "grad_norm": 20.144746908461087, + "learning_rate": 1.257922951198591e-07, + "logits/chosen": 0.7229622602462769, + "logits/rejected": 0.8273760676383972, + "logps/chosen": -9.712352752685547, + "logps/rejected": -11.075556755065918, + "loss": 0.4149, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.712352752685547, + "rewards/margins": 1.363203763961792, + "rewards/rejected": -11.075556755065918, + "semantic_entropy": 0.0015043210005387664, + "step": 4440 + }, + { + "epoch": 2.3789931426659976, + "grad_norm": 21.29344289082141, + "learning_rate": 1.24761195502719e-07, + "logits/chosen": 0.765488862991333, + "logits/rejected": 0.8269311189651489, + "logps/chosen": -9.819905281066895, + "logps/rejected": -11.093830108642578, + "loss": 0.4315, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -9.819905281066895, + "rewards/margins": 1.2739253044128418, + "rewards/rejected": -11.093830108642578, + "semantic_entropy": 0.0011897517833858728, + "step": 4445 + }, + { + "epoch": 2.3816691754473993, + "grad_norm": 31.858698430890097, + "learning_rate": 1.2373373645410573e-07, + "logits/chosen": 0.8017476797103882, + "logits/rejected": 0.8760465383529663, + "logps/chosen": -9.925119400024414, + "logps/rejected": -11.373433113098145, + "loss": 0.4195, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.925119400024414, + "rewards/margins": 1.4483143091201782, + "rewards/rejected": -11.373433113098145, + "semantic_entropy": 0.001272709690965712, + "step": 4450 + }, + { + "epoch": 2.384345208228801, + "grad_norm": 21.93569257046853, + "learning_rate": 1.2270992794243175e-07, + "logits/chosen": 0.7383990287780762, + "logits/rejected": 0.8073341250419617, + "logps/chosen": -9.690180778503418, + "logps/rejected": -11.078446388244629, + "loss": 0.4162, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.690180778503418, + "rewards/margins": 1.3882659673690796, + "rewards/rejected": -11.078446388244629, + "semantic_entropy": 0.0013964849058538675, + "step": 4455 + }, + { + "epoch": 2.3870212410102023, + "grad_norm": 19.97322050449576, + "learning_rate": 1.2168977990069147e-07, + "logits/chosen": 0.7342582941055298, + "logits/rejected": 0.7973549962043762, + "logps/chosen": -9.67171573638916, + "logps/rejected": -10.98813533782959, + "loss": 0.4229, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.67171573638916, + "rewards/margins": 1.316420316696167, + "rewards/rejected": -10.98813533782959, + "semantic_entropy": 0.001446625916287303, + "step": 4460 + }, + { + "epoch": 2.389697273791604, + "grad_norm": 20.71353603983738, + "learning_rate": 1.206733022263659e-07, + "logits/chosen": 0.7267228960990906, + "logits/rejected": 0.8412584066390991, + "logps/chosen": -9.8917875289917, + "logps/rejected": -11.24048137664795, + "loss": 0.4238, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -9.8917875289917, + "rewards/margins": 1.3486926555633545, + "rewards/rejected": -11.24048137664795, + "semantic_entropy": 0.0015950720990076661, + "step": 4465 + }, + { + "epoch": 2.3923733065730053, + "grad_norm": 24.253708429812765, + "learning_rate": 1.1966050478132572e-07, + "logits/chosen": 0.7586138844490051, + "logits/rejected": 0.8224746584892273, + "logps/chosen": -9.68405532836914, + "logps/rejected": -11.180296897888184, + "loss": 0.3864, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -9.68405532836914, + "rewards/margins": 1.4962437152862549, + "rewards/rejected": -11.180296897888184, + "semantic_entropy": 0.0014107396127656102, + "step": 4470 + }, + { + "epoch": 2.395049339354407, + "grad_norm": 21.753207135798707, + "learning_rate": 1.1865139739173635e-07, + "logits/chosen": 0.7383859753608704, + "logits/rejected": 0.8205004930496216, + "logps/chosen": -9.776418685913086, + "logps/rejected": -11.116586685180664, + "loss": 0.4023, + "rewards/accuracies": 0.84375, + "rewards/chosen": -9.776418685913086, + "rewards/margins": 1.34016752243042, + "rewards/rejected": -11.116586685180664, + "semantic_entropy": 0.0015196467284113169, + "step": 4475 + }, + { + "epoch": 2.3977253721358087, + "grad_norm": 37.45298900218617, + "learning_rate": 1.1764598984796187e-07, + "logits/chosen": 0.7673597931861877, + "logits/rejected": 0.8331373333930969, + "logps/chosen": -9.831031799316406, + "logps/rejected": -11.022607803344727, + "loss": 0.4243, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.831031799316406, + "rewards/margins": 1.1915762424468994, + "rewards/rejected": -11.022607803344727, + "semantic_entropy": 0.001290981424972415, + "step": 4480 + }, + { + "epoch": 2.4004014049172104, + "grad_norm": 29.063993723072343, + "learning_rate": 1.1664429190447095e-07, + "logits/chosen": 0.7792296409606934, + "logits/rejected": 0.8261978030204773, + "logps/chosen": -9.763978958129883, + "logps/rejected": -11.260164260864258, + "loss": 0.3626, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -9.763978958129883, + "rewards/margins": 1.496183156967163, + "rewards/rejected": -11.260164260864258, + "semantic_entropy": 0.0017484973650425673, + "step": 4485 + }, + { + "epoch": 2.4030774376986117, + "grad_norm": 21.46210084893414, + "learning_rate": 1.1564631327974122e-07, + "logits/chosen": 0.7814306616783142, + "logits/rejected": 0.8631542921066284, + "logps/chosen": -9.915163040161133, + "logps/rejected": -11.228841781616211, + "loss": 0.4225, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.915163040161133, + "rewards/margins": 1.3136794567108154, + "rewards/rejected": -11.228841781616211, + "semantic_entropy": 0.0011776359751820564, + "step": 4490 + }, + { + "epoch": 2.4057534704800134, + "grad_norm": 20.87464472480962, + "learning_rate": 1.1465206365616587e-07, + "logits/chosen": 0.6937421560287476, + "logits/rejected": 0.7896022796630859, + "logps/chosen": -9.791691780090332, + "logps/rejected": -11.08985710144043, + "loss": 0.4082, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -9.791691780090332, + "rewards/margins": 1.2981641292572021, + "rewards/rejected": -11.08985710144043, + "semantic_entropy": 0.0014528365572914481, + "step": 4495 + }, + { + "epoch": 2.408429503261415, + "grad_norm": 22.141016604754245, + "learning_rate": 1.1366155267995887e-07, + "logits/chosen": 0.8213682174682617, + "logits/rejected": 0.8454049825668335, + "logps/chosen": -9.781519889831543, + "logps/rejected": -11.107604026794434, + "loss": 0.3978, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.781519889831543, + "rewards/margins": 1.3260858058929443, + "rewards/rejected": -11.107604026794434, + "semantic_entropy": 0.0014172891387715936, + "step": 4500 + }, + { + "epoch": 2.4111055360428164, + "grad_norm": 20.81249553944227, + "learning_rate": 1.1267478996106228e-07, + "logits/chosen": 0.8247060775756836, + "logits/rejected": 0.9181084632873535, + "logps/chosen": -9.852422714233398, + "logps/rejected": -11.161924362182617, + "loss": 0.4153, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -9.852422714233398, + "rewards/margins": 1.3095014095306396, + "rewards/rejected": -11.161924362182617, + "semantic_entropy": 0.0011818426428362727, + "step": 4505 + }, + { + "epoch": 2.413781568824218, + "grad_norm": 22.065604053110732, + "learning_rate": 1.116917850730521e-07, + "logits/chosen": 0.7819138765335083, + "logits/rejected": 0.8265460133552551, + "logps/chosen": -9.933201789855957, + "logps/rejected": -11.143549919128418, + "loss": 0.5036, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.933201789855957, + "rewards/margins": 1.2103482484817505, + "rewards/rejected": -11.143549919128418, + "semantic_entropy": 0.0012393039651215076, + "step": 4510 + }, + { + "epoch": 2.41645760160562, + "grad_norm": 17.547878202239335, + "learning_rate": 1.1071254755304637e-07, + "logits/chosen": 0.7445524334907532, + "logits/rejected": 0.7736660242080688, + "logps/chosen": -9.694409370422363, + "logps/rejected": -11.00683307647705, + "loss": 0.42, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.694409370422363, + "rewards/margins": 1.3124234676361084, + "rewards/rejected": -11.00683307647705, + "semantic_entropy": 0.001383893541060388, + "step": 4515 + }, + { + "epoch": 2.419133634387021, + "grad_norm": 23.631179882315553, + "learning_rate": 1.0973708690161143e-07, + "logits/chosen": 0.792984664440155, + "logits/rejected": 0.8252687454223633, + "logps/chosen": -9.863832473754883, + "logps/rejected": -11.273421287536621, + "loss": 0.3964, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.863832473754883, + "rewards/margins": 1.409589409828186, + "rewards/rejected": -11.273421287536621, + "semantic_entropy": 0.001443797373212874, + "step": 4520 + }, + { + "epoch": 2.421809667168423, + "grad_norm": 31.566956401950897, + "learning_rate": 1.0876541258267119e-07, + "logits/chosen": 0.7816181182861328, + "logits/rejected": 0.873005747795105, + "logps/chosen": -9.954813003540039, + "logps/rejected": -11.395962715148926, + "loss": 0.3918, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -9.954813003540039, + "rewards/margins": 1.4411489963531494, + "rewards/rejected": -11.395962715148926, + "semantic_entropy": 0.0011875508353114128, + "step": 4525 + }, + { + "epoch": 2.4244856999498245, + "grad_norm": 25.748276815440583, + "learning_rate": 1.0779753402341379e-07, + "logits/chosen": 0.7940434813499451, + "logits/rejected": 0.840873122215271, + "logps/chosen": -9.88911247253418, + "logps/rejected": -11.033079147338867, + "loss": 0.4659, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -9.88911247253418, + "rewards/margins": 1.1439659595489502, + "rewards/rejected": -11.033079147338867, + "semantic_entropy": 0.0013449281686916947, + "step": 4530 + }, + { + "epoch": 2.427161732731226, + "grad_norm": 23.532526327852437, + "learning_rate": 1.0683346061420157e-07, + "logits/chosen": 0.8855890035629272, + "logits/rejected": 0.8981055021286011, + "logps/chosen": -9.778970718383789, + "logps/rejected": -11.14280891418457, + "loss": 0.4195, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -9.778970718383789, + "rewards/margins": 1.363840103149414, + "rewards/rejected": -11.14280891418457, + "semantic_entropy": 0.001345223980024457, + "step": 4535 + }, + { + "epoch": 2.4298377655126275, + "grad_norm": 23.144715325749804, + "learning_rate": 1.0587320170847874e-07, + "logits/chosen": 0.7933780550956726, + "logits/rejected": 0.8676679730415344, + "logps/chosen": -9.759759902954102, + "logps/rejected": -10.895282745361328, + "loss": 0.4728, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -9.759759902954102, + "rewards/margins": 1.135524034500122, + "rewards/rejected": -10.895282745361328, + "semantic_entropy": 0.001288101659156382, + "step": 4540 + }, + { + "epoch": 2.4325137982940293, + "grad_norm": 21.014853352663092, + "learning_rate": 1.0491676662268156e-07, + "logits/chosen": 0.8033139109611511, + "logits/rejected": 0.8606590032577515, + "logps/chosen": -9.787649154663086, + "logps/rejected": -11.041508674621582, + "loss": 0.4454, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.787649154663086, + "rewards/margins": 1.2538607120513916, + "rewards/rejected": -11.041508674621582, + "semantic_entropy": 0.0012479587458074093, + "step": 4545 + }, + { + "epoch": 2.4351898310754305, + "grad_norm": 25.574697677651937, + "learning_rate": 1.0396416463614732e-07, + "logits/chosen": 0.7537301182746887, + "logits/rejected": 0.814228355884552, + "logps/chosen": -9.702180862426758, + "logps/rejected": -11.04191780090332, + "loss": 0.42, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -9.702180862426758, + "rewards/margins": 1.3397365808486938, + "rewards/rejected": -11.04191780090332, + "semantic_entropy": 0.001320059527643025, + "step": 4550 + }, + { + "epoch": 2.4378658638568322, + "grad_norm": 24.618729215771705, + "learning_rate": 1.0301540499102479e-07, + "logits/chosen": 0.7519547343254089, + "logits/rejected": 0.8329681158065796, + "logps/chosen": -9.94709587097168, + "logps/rejected": -10.988082885742188, + "loss": 0.4853, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -9.94709587097168, + "rewards/margins": 1.0409865379333496, + "rewards/rejected": -10.988082885742188, + "semantic_entropy": 0.0011719849426299334, + "step": 4555 + }, + { + "epoch": 2.440541896638234, + "grad_norm": 26.17462652294696, + "learning_rate": 1.0207049689218405e-07, + "logits/chosen": 0.7849665284156799, + "logits/rejected": 0.845086932182312, + "logps/chosen": -9.825363159179688, + "logps/rejected": -11.26807975769043, + "loss": 0.4079, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -9.825363159179688, + "rewards/margins": 1.4427168369293213, + "rewards/rejected": -11.26807975769043, + "semantic_entropy": 0.0016312900697812438, + "step": 4560 + }, + { + "epoch": 2.4432179294196352, + "grad_norm": 19.057899173583554, + "learning_rate": 1.0112944950712782e-07, + "logits/chosen": 0.7082661986351013, + "logits/rejected": 0.7645975947380066, + "logps/chosen": -9.70044994354248, + "logps/rejected": -11.13469123840332, + "loss": 0.3632, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.70044994354248, + "rewards/margins": 1.434242606163025, + "rewards/rejected": -11.13469123840332, + "semantic_entropy": 0.001436132937669754, + "step": 4565 + }, + { + "epoch": 2.445893962201037, + "grad_norm": 22.488252138217725, + "learning_rate": 1.0019227196590174e-07, + "logits/chosen": 0.8336771130561829, + "logits/rejected": 0.8841003179550171, + "logps/chosen": -9.908154487609863, + "logps/rejected": -11.13020133972168, + "loss": 0.4829, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -9.908154487609863, + "rewards/margins": 1.2220475673675537, + "rewards/rejected": -11.13020133972168, + "semantic_entropy": 0.0014596920227631927, + "step": 4570 + }, + { + "epoch": 2.4485699949824387, + "grad_norm": 23.13587206070571, + "learning_rate": 9.925897336100664e-08, + "logits/chosen": 0.8057346343994141, + "logits/rejected": 0.850749135017395, + "logps/chosen": -9.688154220581055, + "logps/rejected": -11.131349563598633, + "loss": 0.3866, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.688154220581055, + "rewards/margins": 1.4431952238082886, + "rewards/rejected": -11.131349563598633, + "semantic_entropy": 0.0014849931467324495, + "step": 4575 + }, + { + "epoch": 2.45124602776384, + "grad_norm": 23.357693931056765, + "learning_rate": 9.832956274730946e-08, + "logits/chosen": 0.7591571807861328, + "logits/rejected": 0.7910270094871521, + "logps/chosen": -9.584843635559082, + "logps/rejected": -10.765449523925781, + "loss": 0.4539, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -9.584843635559082, + "rewards/margins": 1.1806063652038574, + "rewards/rejected": -10.765449523925781, + "semantic_entropy": 0.0015608349349349737, + "step": 4580 + }, + { + "epoch": 2.4539220605452416, + "grad_norm": 20.637887868629868, + "learning_rate": 9.740404914195633e-08, + "logits/chosen": 0.7534157037734985, + "logits/rejected": 0.842387855052948, + "logps/chosen": -9.789365768432617, + "logps/rejected": -11.138287544250488, + "loss": 0.4176, + "rewards/accuracies": 0.84375, + "rewards/chosen": -9.789365768432617, + "rewards/margins": 1.3489205837249756, + "rewards/rejected": -11.138287544250488, + "semantic_entropy": 0.0012669655261561275, + "step": 4585 + }, + { + "epoch": 2.4565980933266434, + "grad_norm": 18.543572299518026, + "learning_rate": 9.648244152428392e-08, + "logits/chosen": 0.7632014751434326, + "logits/rejected": 0.8216020464897156, + "logps/chosen": -9.654337882995605, + "logps/rejected": -10.88366413116455, + "loss": 0.4224, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.654337882995605, + "rewards/margins": 1.229326605796814, + "rewards/rejected": -10.88366413116455, + "semantic_entropy": 0.0014172986848279834, + "step": 4590 + }, + { + "epoch": 2.4592741261080446, + "grad_norm": 19.149278895897503, + "learning_rate": 9.556474883573379e-08, + "logits/chosen": 0.7528376579284668, + "logits/rejected": 0.8152421116828918, + "logps/chosen": -9.646097183227539, + "logps/rejected": -11.058183670043945, + "loss": 0.4187, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.646097183227539, + "rewards/margins": 1.412088394165039, + "rewards/rejected": -11.058183670043945, + "semantic_entropy": 0.0015735877677798271, + "step": 4595 + }, + { + "epoch": 2.4619501588894463, + "grad_norm": 21.187119688281765, + "learning_rate": 9.465097997976412e-08, + "logits/chosen": 0.7996068000793457, + "logits/rejected": 0.8711563348770142, + "logps/chosen": -9.832134246826172, + "logps/rejected": -11.303122520446777, + "loss": 0.3744, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -9.832134246826172, + "rewards/margins": 1.4709880352020264, + "rewards/rejected": -11.303122520446777, + "semantic_entropy": 0.0013540387153625488, + "step": 4600 + }, + { + "epoch": 2.464626191670848, + "grad_norm": 21.004241219678757, + "learning_rate": 9.374114382176457e-08, + "logits/chosen": 0.7817031741142273, + "logits/rejected": 0.8433337211608887, + "logps/chosen": -9.771738052368164, + "logps/rejected": -11.173564910888672, + "loss": 0.3894, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -9.771738052368164, + "rewards/margins": 1.4018254280090332, + "rewards/rejected": -11.173564910888672, + "semantic_entropy": 0.0012005962198600173, + "step": 4605 + }, + { + "epoch": 2.46730222445225, + "grad_norm": 27.55976528492471, + "learning_rate": 9.283524918896945e-08, + "logits/chosen": 0.7919789552688599, + "logits/rejected": 0.8178110122680664, + "logps/chosen": -9.770918846130371, + "logps/rejected": -11.095891952514648, + "loss": 0.437, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.770918846130371, + "rewards/margins": 1.3249746561050415, + "rewards/rejected": -11.095891952514648, + "semantic_entropy": 0.0011873061303049326, + "step": 4610 + }, + { + "epoch": 2.469978257233651, + "grad_norm": 20.74798992950919, + "learning_rate": 9.193330487037232e-08, + "logits/chosen": 0.814818263053894, + "logits/rejected": 0.8907683491706848, + "logps/chosen": -9.838384628295898, + "logps/rejected": -11.240675926208496, + "loss": 0.3874, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.838384628295898, + "rewards/margins": 1.402291178703308, + "rewards/rejected": -11.240675926208496, + "semantic_entropy": 0.0014131965581327677, + "step": 4615 + }, + { + "epoch": 2.4726542900150528, + "grad_norm": 17.77642999066438, + "learning_rate": 9.103531961664118e-08, + "logits/chosen": 0.7889447808265686, + "logits/rejected": 0.8778635859489441, + "logps/chosen": -9.612691879272461, + "logps/rejected": -10.984048843383789, + "loss": 0.3752, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -9.612691879272461, + "rewards/margins": 1.3713561296463013, + "rewards/rejected": -10.984048843383789, + "semantic_entropy": 0.0014132572105154395, + "step": 4620 + }, + { + "epoch": 2.475330322796454, + "grad_norm": 19.68502083527493, + "learning_rate": 9.014130214003269e-08, + "logits/chosen": 0.7648957967758179, + "logits/rejected": 0.7786797881126404, + "logps/chosen": -9.624971389770508, + "logps/rejected": -11.085968971252441, + "loss": 0.3885, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -9.624971389770508, + "rewards/margins": 1.460997223854065, + "rewards/rejected": -11.085968971252441, + "semantic_entropy": 0.0014543391298502684, + "step": 4625 + }, + { + "epoch": 2.4780063555778558, + "grad_norm": 23.013655732727454, + "learning_rate": 8.925126111430848e-08, + "logits/chosen": 0.7716919183731079, + "logits/rejected": 0.8139322996139526, + "logps/chosen": -9.422433853149414, + "logps/rejected": -10.843701362609863, + "loss": 0.4095, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.422433853149414, + "rewards/margins": 1.4212672710418701, + "rewards/rejected": -10.843701362609863, + "semantic_entropy": 0.001683591166511178, + "step": 4630 + }, + { + "epoch": 2.4806823883592575, + "grad_norm": 24.587592896382663, + "learning_rate": 8.83652051746504e-08, + "logits/chosen": 0.9284713864326477, + "logits/rejected": 0.9669192433357239, + "logps/chosen": -9.819892883300781, + "logps/rejected": -11.273547172546387, + "loss": 0.3939, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.819892883300781, + "rewards/margins": 1.4536547660827637, + "rewards/rejected": -11.273547172546387, + "semantic_entropy": 0.0012030914658680558, + "step": 4635 + }, + { + "epoch": 2.483358421140659, + "grad_norm": 22.016927492062443, + "learning_rate": 8.748314291757696e-08, + "logits/chosen": 0.7996488213539124, + "logits/rejected": 0.8577510714530945, + "logps/chosen": -9.611989974975586, + "logps/rejected": -10.817550659179688, + "loss": 0.4296, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -9.611989974975586, + "rewards/margins": 1.2055622339248657, + "rewards/rejected": -10.817550659179688, + "semantic_entropy": 0.0014349967241287231, + "step": 4640 + }, + { + "epoch": 2.4860344539220605, + "grad_norm": 20.553055577709493, + "learning_rate": 8.660508290086032e-08, + "logits/chosen": 0.8431406021118164, + "logits/rejected": 0.930561900138855, + "logps/chosen": -9.651968002319336, + "logps/rejected": -11.155494689941406, + "loss": 0.3923, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.651968002319336, + "rewards/margins": 1.5035268068313599, + "rewards/rejected": -11.155494689941406, + "semantic_entropy": 0.001411119825206697, + "step": 4645 + }, + { + "epoch": 2.488710486703462, + "grad_norm": 29.66982937376926, + "learning_rate": 8.573103364344231e-08, + "logits/chosen": 0.7703269124031067, + "logits/rejected": 0.8427858352661133, + "logps/chosen": -9.545438766479492, + "logps/rejected": -10.993854522705078, + "loss": 0.3839, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.545438766479492, + "rewards/margins": 1.4484152793884277, + "rewards/rejected": -10.993854522705078, + "semantic_entropy": 0.0015610662521794438, + "step": 4650 + }, + { + "epoch": 2.4913865194848634, + "grad_norm": 24.578094627007825, + "learning_rate": 8.486100362535292e-08, + "logits/chosen": 0.7740985751152039, + "logits/rejected": 0.851282000541687, + "logps/chosen": -9.825765609741211, + "logps/rejected": -10.998079299926758, + "loss": 0.4317, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -9.825765609741211, + "rewards/margins": 1.1723124980926514, + "rewards/rejected": -10.998079299926758, + "semantic_entropy": 0.0011104957666248083, + "step": 4655 + }, + { + "epoch": 2.494062552266265, + "grad_norm": 17.212403265377716, + "learning_rate": 8.399500128762693e-08, + "logits/chosen": 0.7384323477745056, + "logits/rejected": 0.809241771697998, + "logps/chosen": -9.808893203735352, + "logps/rejected": -11.134016036987305, + "loss": 0.4052, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.808893203735352, + "rewards/margins": 1.3251229524612427, + "rewards/rejected": -11.134016036987305, + "semantic_entropy": 0.001255923300050199, + "step": 4660 + }, + { + "epoch": 2.496738585047667, + "grad_norm": 24.084031431969617, + "learning_rate": 8.313303503222313e-08, + "logits/chosen": 0.8113842010498047, + "logits/rejected": 0.8640966415405273, + "logps/chosen": -9.65959644317627, + "logps/rejected": -10.88883113861084, + "loss": 0.4217, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.65959644317627, + "rewards/margins": 1.2292344570159912, + "rewards/rejected": -10.88883113861084, + "semantic_entropy": 0.0014709953684359789, + "step": 4665 + }, + { + "epoch": 2.4994146178290686, + "grad_norm": 23.42979933597834, + "learning_rate": 8.227511322194164e-08, + "logits/chosen": 0.8243430256843567, + "logits/rejected": 0.8697830438613892, + "logps/chosen": -9.677289962768555, + "logps/rejected": -10.889700889587402, + "loss": 0.4429, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -9.677289962768555, + "rewards/margins": 1.212410807609558, + "rewards/rejected": -10.889700889587402, + "semantic_entropy": 0.001211336930282414, + "step": 4670 + }, + { + "epoch": 2.50209065061047, + "grad_norm": 19.33238977643887, + "learning_rate": 8.142124418034385e-08, + "logits/chosen": 0.830100417137146, + "logits/rejected": 0.8942376971244812, + "logps/chosen": -9.691996574401855, + "logps/rejected": -11.031505584716797, + "loss": 0.4332, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.691996574401855, + "rewards/margins": 1.3395094871520996, + "rewards/rejected": -11.031505584716797, + "semantic_entropy": 0.0013690624618902802, + "step": 4675 + }, + { + "epoch": 2.5047666833918716, + "grad_norm": 23.119082399990962, + "learning_rate": 8.057143619167073e-08, + "logits/chosen": 0.8294227719306946, + "logits/rejected": 0.8717254400253296, + "logps/chosen": -9.509129524230957, + "logps/rejected": -10.85628890991211, + "loss": 0.41, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.509129524230957, + "rewards/margins": 1.3471596240997314, + "rewards/rejected": -10.85628890991211, + "semantic_entropy": 0.0018529357621446252, + "step": 4680 + }, + { + "epoch": 2.507442716173273, + "grad_norm": 18.56557158174028, + "learning_rate": 7.97256975007633e-08, + "logits/chosen": 0.795819878578186, + "logits/rejected": 0.88841712474823, + "logps/chosen": -9.543716430664062, + "logps/rejected": -10.928507804870605, + "loss": 0.3959, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.543716430664062, + "rewards/margins": 1.3847920894622803, + "rewards/rejected": -10.928507804870605, + "semantic_entropy": 0.0014773935545235872, + "step": 4685 + }, + { + "epoch": 2.5101187489546746, + "grad_norm": 26.78815904135646, + "learning_rate": 7.888403631298186e-08, + "logits/chosen": 0.7813885807991028, + "logits/rejected": 0.8535317182540894, + "logps/chosen": -9.660378456115723, + "logps/rejected": -10.954367637634277, + "loss": 0.4392, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.660378456115723, + "rewards/margins": 1.2939906120300293, + "rewards/rejected": -10.954367637634277, + "semantic_entropy": 0.0014350914862006903, + "step": 4690 + }, + { + "epoch": 2.5127947817360763, + "grad_norm": 20.289843532833586, + "learning_rate": 7.804646079412719e-08, + "logits/chosen": 0.8242961168289185, + "logits/rejected": 0.9029603004455566, + "logps/chosen": -9.712553977966309, + "logps/rejected": -11.0477933883667, + "loss": 0.4122, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -9.712553977966309, + "rewards/margins": 1.335240125656128, + "rewards/rejected": -11.0477933883667, + "semantic_entropy": 0.0014915402280166745, + "step": 4695 + }, + { + "epoch": 2.515470814517478, + "grad_norm": 21.47761754271104, + "learning_rate": 7.72129790703604e-08, + "logits/chosen": 0.787671685218811, + "logits/rejected": 0.8374601602554321, + "logps/chosen": -9.7283353805542, + "logps/rejected": -10.967700004577637, + "loss": 0.4222, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -9.7283353805542, + "rewards/margins": 1.2393652200698853, + "rewards/rejected": -10.967700004577637, + "semantic_entropy": 0.001365487463772297, + "step": 4700 + }, + { + "epoch": 2.5181468472988793, + "grad_norm": 28.03895601653979, + "learning_rate": 7.638359922812504e-08, + "logits/chosen": 0.776307225227356, + "logits/rejected": 0.8199490308761597, + "logps/chosen": -9.557718276977539, + "logps/rejected": -10.877527236938477, + "loss": 0.4126, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -9.557718276977539, + "rewards/margins": 1.3198084831237793, + "rewards/rejected": -10.877527236938477, + "semantic_entropy": 0.0016608207952231169, + "step": 4705 + }, + { + "epoch": 2.520822880080281, + "grad_norm": 32.868150571212254, + "learning_rate": 7.555832931406774e-08, + "logits/chosen": 0.7730585336685181, + "logits/rejected": 0.8562465906143188, + "logps/chosen": -9.679393768310547, + "logps/rejected": -11.042935371398926, + "loss": 0.4244, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -9.679393768310547, + "rewards/margins": 1.3635411262512207, + "rewards/rejected": -11.042935371398926, + "semantic_entropy": 0.0014725655782967806, + "step": 4710 + }, + { + "epoch": 2.5234989128616827, + "grad_norm": 18.781224798445212, + "learning_rate": 7.47371773349611e-08, + "logits/chosen": 0.8205526471138, + "logits/rejected": 0.8565570712089539, + "logps/chosen": -9.75233268737793, + "logps/rejected": -11.291008949279785, + "loss": 0.3533, + "rewards/accuracies": 0.84375, + "rewards/chosen": -9.75233268737793, + "rewards/margins": 1.5386755466461182, + "rewards/rejected": -11.291008949279785, + "semantic_entropy": 0.0013560467632487416, + "step": 4715 + }, + { + "epoch": 2.526174945643084, + "grad_norm": 28.052854143232477, + "learning_rate": 7.392015125762496e-08, + "logits/chosen": 0.7241109609603882, + "logits/rejected": 0.8222753405570984, + "logps/chosen": -9.689494132995605, + "logps/rejected": -11.097482681274414, + "loss": 0.3869, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -9.689494132995605, + "rewards/margins": 1.4079889059066772, + "rewards/rejected": -11.097482681274414, + "semantic_entropy": 0.0011607788037508726, + "step": 4720 + }, + { + "epoch": 2.5288509784244857, + "grad_norm": 25.150937293751188, + "learning_rate": 7.310725900885018e-08, + "logits/chosen": 0.7780320048332214, + "logits/rejected": 0.8369150161743164, + "logps/chosen": -9.633565902709961, + "logps/rejected": -10.885080337524414, + "loss": 0.4725, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -9.633565902709961, + "rewards/margins": 1.2515143156051636, + "rewards/rejected": -10.885080337524414, + "semantic_entropy": 0.001628419035114348, + "step": 4725 + }, + { + "epoch": 2.5315270112058874, + "grad_norm": 22.549685697406634, + "learning_rate": 7.229850847532076e-08, + "logits/chosen": 0.8130934834480286, + "logits/rejected": 0.9138733744621277, + "logps/chosen": -9.572754859924316, + "logps/rejected": -11.118395805358887, + "loss": 0.3436, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -9.572754859924316, + "rewards/margins": 1.5456407070159912, + "rewards/rejected": -11.118395805358887, + "semantic_entropy": 0.0017241360619664192, + "step": 4730 + }, + { + "epoch": 2.5342030439872887, + "grad_norm": 22.715970370218447, + "learning_rate": 7.149390750353779e-08, + "logits/chosen": 0.8037542104721069, + "logits/rejected": 0.8449680209159851, + "logps/chosen": -9.80845832824707, + "logps/rejected": -11.075779914855957, + "loss": 0.4049, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -9.80845832824707, + "rewards/margins": 1.2673219442367554, + "rewards/rejected": -11.075779914855957, + "semantic_entropy": 0.0013479054905474186, + "step": 4735 + }, + { + "epoch": 2.5368790767686904, + "grad_norm": 21.870424101570975, + "learning_rate": 7.069346389974374e-08, + "logits/chosen": 0.80865877866745, + "logits/rejected": 0.8610594868659973, + "logps/chosen": -9.720281600952148, + "logps/rejected": -11.03128433227539, + "loss": 0.3992, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.720281600952148, + "rewards/margins": 1.3110027313232422, + "rewards/rejected": -11.03128433227539, + "semantic_entropy": 0.0014346633106470108, + "step": 4740 + }, + { + "epoch": 2.539555109550092, + "grad_norm": 25.565083159355556, + "learning_rate": 6.989718542984563e-08, + "logits/chosen": 0.7875592708587646, + "logits/rejected": 0.8164467811584473, + "logps/chosen": -9.870200157165527, + "logps/rejected": -11.153292655944824, + "loss": 0.4386, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -9.870200157165527, + "rewards/margins": 1.2830924987792969, + "rewards/rejected": -11.153292655944824, + "semantic_entropy": 0.0011865177657455206, + "step": 4745 + }, + { + "epoch": 2.5422311423314934, + "grad_norm": 23.70159485340149, + "learning_rate": 6.9105079819341e-08, + "logits/chosen": 0.7858568429946899, + "logits/rejected": 0.8397903442382812, + "logps/chosen": -9.606379508972168, + "logps/rejected": -11.173693656921387, + "loss": 0.3656, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.606379508972168, + "rewards/margins": 1.5673143863677979, + "rewards/rejected": -11.173693656921387, + "semantic_entropy": 0.0015011833747848868, + "step": 4750 + }, + { + "epoch": 2.544907175112895, + "grad_norm": 20.488177720347142, + "learning_rate": 6.831715475324163e-08, + "logits/chosen": 0.7883397936820984, + "logits/rejected": 0.8317297101020813, + "logps/chosen": -9.790312767028809, + "logps/rejected": -11.212163925170898, + "loss": 0.4019, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.790312767028809, + "rewards/margins": 1.4218522310256958, + "rewards/rejected": -11.212163925170898, + "semantic_entropy": 0.0015070982044562697, + "step": 4755 + }, + { + "epoch": 2.547583207894297, + "grad_norm": 19.639448727845746, + "learning_rate": 6.753341787600026e-08, + "logits/chosen": 0.7966683506965637, + "logits/rejected": 0.8559118509292603, + "logps/chosen": -9.566210746765137, + "logps/rejected": -11.087980270385742, + "loss": 0.3618, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -9.566210746765137, + "rewards/margins": 1.5217713117599487, + "rewards/rejected": -11.087980270385742, + "semantic_entropy": 0.0015061668818816543, + "step": 4760 + }, + { + "epoch": 2.5502592406756985, + "grad_norm": 23.858385047679327, + "learning_rate": 6.67538767914353e-08, + "logits/chosen": 0.791594922542572, + "logits/rejected": 0.8555682301521301, + "logps/chosen": -9.7157564163208, + "logps/rejected": -10.862947463989258, + "loss": 0.4575, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.7157564163208, + "rewards/margins": 1.1471917629241943, + "rewards/rejected": -10.862947463989258, + "semantic_entropy": 0.0014351477148011327, + "step": 4765 + }, + { + "epoch": 2.5529352734571, + "grad_norm": 23.875748432774547, + "learning_rate": 6.597853906265793e-08, + "logits/chosen": 0.8073896169662476, + "logits/rejected": 0.850189208984375, + "logps/chosen": -9.71528434753418, + "logps/rejected": -11.275640487670898, + "loss": 0.3913, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -9.71528434753418, + "rewards/margins": 1.56035578250885, + "rewards/rejected": -11.275640487670898, + "semantic_entropy": 0.0013745089527219534, + "step": 4770 + }, + { + "epoch": 2.5556113062385015, + "grad_norm": 24.624000051473708, + "learning_rate": 6.5207412211998e-08, + "logits/chosen": 0.8650287389755249, + "logits/rejected": 0.9158161282539368, + "logps/chosen": -9.759730339050293, + "logps/rejected": -11.222911834716797, + "loss": 0.4193, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.759730339050293, + "rewards/margins": 1.463181495666504, + "rewards/rejected": -11.222911834716797, + "semantic_entropy": 0.0013576913625001907, + "step": 4775 + }, + { + "epoch": 2.558287339019903, + "grad_norm": 18.880924959816383, + "learning_rate": 6.444050372093186e-08, + "logits/chosen": 0.753667950630188, + "logits/rejected": 0.8446556329727173, + "logps/chosen": -9.776894569396973, + "logps/rejected": -11.069523811340332, + "loss": 0.4017, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.776894569396973, + "rewards/margins": 1.292628526687622, + "rewards/rejected": -11.069523811340332, + "semantic_entropy": 0.0013450583210214972, + "step": 4780 + }, + { + "epoch": 2.5609633718013045, + "grad_norm": 26.78240030796786, + "learning_rate": 6.367782103000873e-08, + "logits/chosen": 0.8099533319473267, + "logits/rejected": 0.8473021388053894, + "logps/chosen": -9.640462875366211, + "logps/rejected": -10.797101974487305, + "loss": 0.4493, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.640462875366211, + "rewards/margins": 1.1566379070281982, + "rewards/rejected": -10.797101974487305, + "semantic_entropy": 0.0015895968535915017, + "step": 4785 + }, + { + "epoch": 2.5636394045827062, + "grad_norm": 27.727951802633132, + "learning_rate": 6.29193715387798e-08, + "logits/chosen": 0.7754964828491211, + "logits/rejected": 0.8192489743232727, + "logps/chosen": -9.690164566040039, + "logps/rejected": -11.208440780639648, + "loss": 0.4116, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.690164566040039, + "rewards/margins": 1.5182764530181885, + "rewards/rejected": -11.208440780639648, + "semantic_entropy": 0.0017889321316033602, + "step": 4790 + }, + { + "epoch": 2.566315437364108, + "grad_norm": 28.149971668259315, + "learning_rate": 6.216516260572502e-08, + "logits/chosen": 0.7809394001960754, + "logits/rejected": 0.8340757489204407, + "logps/chosen": -9.840954780578613, + "logps/rejected": -11.24518871307373, + "loss": 0.3951, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.840954780578613, + "rewards/margins": 1.4042353630065918, + "rewards/rejected": -11.24518871307373, + "semantic_entropy": 0.0015198871260508895, + "step": 4795 + }, + { + "epoch": 2.568991470145509, + "grad_norm": 17.6357809314226, + "learning_rate": 6.141520154818297e-08, + "logits/chosen": 0.8027510643005371, + "logits/rejected": 0.8235558271408081, + "logps/chosen": -9.668852806091309, + "logps/rejected": -10.926243782043457, + "loss": 0.4394, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -9.668852806091309, + "rewards/margins": 1.257389783859253, + "rewards/rejected": -10.926243782043457, + "semantic_entropy": 0.001651085214689374, + "step": 4800 + }, + { + "epoch": 2.568991470145509, + "eval_logits/chosen": 0.8550450205802917, + "eval_logits/rejected": 0.8955670595169067, + "eval_logps/chosen": -9.817285537719727, + "eval_logps/rejected": -10.942052841186523, + "eval_loss": 0.5237716436386108, + "eval_rewards/accuracies": 0.7255192995071411, + "eval_rewards/chosen": -9.817285537719727, + "eval_rewards/margins": 1.1247663497924805, + "eval_rewards/rejected": -10.942052841186523, + "eval_runtime": 35.1465, + "eval_samples_per_second": 38.268, + "eval_semantic_entropy": 0.0013976304326206446, + "eval_steps_per_second": 9.588, + "step": 4800 + }, + { + "epoch": 2.571667502926911, + "grad_norm": 25.642642407075105, + "learning_rate": 6.066949564227897e-08, + "logits/chosen": 0.7796936631202698, + "logits/rejected": 0.817895233631134, + "logps/chosen": -9.595781326293945, + "logps/rejected": -10.862689018249512, + "loss": 0.4646, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -9.595781326293945, + "rewards/margins": 1.2669070959091187, + "rewards/rejected": -10.862689018249512, + "semantic_entropy": 0.0013227377785369754, + "step": 4805 + }, + { + "epoch": 2.574343535708312, + "grad_norm": 20.93852604421033, + "learning_rate": 5.992805212285523e-08, + "logits/chosen": 0.777945339679718, + "logits/rejected": 0.7996780872344971, + "logps/chosen": -9.620034217834473, + "logps/rejected": -11.079931259155273, + "loss": 0.3832, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -9.620034217834473, + "rewards/margins": 1.4598976373672485, + "rewards/rejected": -11.079931259155273, + "semantic_entropy": 0.0017585292225703597, + "step": 4810 + }, + { + "epoch": 2.577019568489714, + "grad_norm": 24.39765980999833, + "learning_rate": 5.9190878183399684e-08, + "logits/chosen": 0.8418199419975281, + "logits/rejected": 0.8500019311904907, + "logps/chosen": -9.530296325683594, + "logps/rejected": -10.991520881652832, + "loss": 0.4437, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.530296325683594, + "rewards/margins": 1.4612245559692383, + "rewards/rejected": -10.991520881652832, + "semantic_entropy": 0.0018125723581761122, + "step": 4815 + }, + { + "epoch": 2.5796956012711156, + "grad_norm": 31.63637936690902, + "learning_rate": 5.845798097597748e-08, + "logits/chosen": 0.8116466403007507, + "logits/rejected": 0.8796448707580566, + "logps/chosen": -9.765870094299316, + "logps/rejected": -10.956972122192383, + "loss": 0.4451, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.765870094299316, + "rewards/margins": 1.191102385520935, + "rewards/rejected": -10.956972122192383, + "semantic_entropy": 0.0012761508114635944, + "step": 4820 + }, + { + "epoch": 2.5823716340525174, + "grad_norm": 27.871272030425462, + "learning_rate": 5.772936761116026e-08, + "logits/chosen": 0.8403164148330688, + "logits/rejected": 0.900818943977356, + "logps/chosen": -9.660100936889648, + "logps/rejected": -10.994760513305664, + "loss": 0.4122, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -9.660100936889648, + "rewards/margins": 1.3346589803695679, + "rewards/rejected": -10.994760513305664, + "semantic_entropy": 0.001608129939995706, + "step": 4825 + }, + { + "epoch": 2.5850476668339186, + "grad_norm": 28.248867607037017, + "learning_rate": 5.700504515795829e-08, + "logits/chosen": 0.8395519256591797, + "logits/rejected": 0.8937468528747559, + "logps/chosen": -9.703470230102539, + "logps/rejected": -11.014467239379883, + "loss": 0.413, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.703470230102539, + "rewards/margins": 1.3109973669052124, + "rewards/rejected": -11.014467239379883, + "semantic_entropy": 0.0014707682421430945, + "step": 4830 + }, + { + "epoch": 2.5877236996153203, + "grad_norm": 25.030373000459893, + "learning_rate": 5.628502064375101e-08, + "logits/chosen": 0.7156926393508911, + "logits/rejected": 0.783743679523468, + "logps/chosen": -9.621664047241211, + "logps/rejected": -11.060919761657715, + "loss": 0.351, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -9.621664047241211, + "rewards/margins": 1.439256191253662, + "rewards/rejected": -11.060919761657715, + "semantic_entropy": 0.001427180483005941, + "step": 4835 + }, + { + "epoch": 2.5903997323967216, + "grad_norm": 26.228171272523518, + "learning_rate": 5.55693010542197e-08, + "logits/chosen": 0.7665778398513794, + "logits/rejected": 0.8383312225341797, + "logps/chosen": -9.543096542358398, + "logps/rejected": -11.02253532409668, + "loss": 0.3674, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -9.543096542358398, + "rewards/margins": 1.4794379472732544, + "rewards/rejected": -11.02253532409668, + "semantic_entropy": 0.001682286150753498, + "step": 4840 + }, + { + "epoch": 2.5930757651781233, + "grad_norm": 28.09304993416799, + "learning_rate": 5.485789333327856e-08, + "logits/chosen": 0.7801726460456848, + "logits/rejected": 0.8014837503433228, + "logps/chosen": -9.684420585632324, + "logps/rejected": -10.936495780944824, + "loss": 0.4075, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.684420585632324, + "rewards/margins": 1.2520757913589478, + "rewards/rejected": -10.936495780944824, + "semantic_entropy": 0.0015050426591187716, + "step": 4845 + }, + { + "epoch": 2.595751797959525, + "grad_norm": 23.215944833323043, + "learning_rate": 5.4150804383008675e-08, + "logits/chosen": 0.7506409883499146, + "logits/rejected": 0.8083317875862122, + "logps/chosen": -9.687183380126953, + "logps/rejected": -11.075380325317383, + "loss": 0.4257, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -9.687183380126953, + "rewards/margins": 1.3881968259811401, + "rewards/rejected": -11.075380325317383, + "semantic_entropy": 0.0012998328311368823, + "step": 4850 + }, + { + "epoch": 2.5984278307409268, + "grad_norm": 26.1051895667939, + "learning_rate": 5.344804106359002e-08, + "logits/chosen": 0.8464560508728027, + "logits/rejected": 0.884229302406311, + "logps/chosen": -9.554250717163086, + "logps/rejected": -10.965102195739746, + "loss": 0.3927, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -9.554250717163086, + "rewards/margins": 1.410851240158081, + "rewards/rejected": -10.965102195739746, + "semantic_entropy": 0.0016194203635677695, + "step": 4855 + }, + { + "epoch": 2.601103863522328, + "grad_norm": 29.62614133583001, + "learning_rate": 5.274961019323559e-08, + "logits/chosen": 0.7584127187728882, + "logits/rejected": 0.7833465337753296, + "logps/chosen": -9.556783676147461, + "logps/rejected": -10.777464866638184, + "loss": 0.4411, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.556783676147461, + "rewards/margins": 1.2206814289093018, + "rewards/rejected": -10.777464866638184, + "semantic_entropy": 0.002015589503571391, + "step": 4860 + }, + { + "epoch": 2.6037798963037297, + "grad_norm": 19.381379819748208, + "learning_rate": 5.205551854812451e-08, + "logits/chosen": 0.8153206706047058, + "logits/rejected": 0.8267370462417603, + "logps/chosen": -9.816014289855957, + "logps/rejected": -11.20833683013916, + "loss": 0.4169, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -9.816014289855957, + "rewards/margins": 1.3923231363296509, + "rewards/rejected": -11.20833683013916, + "semantic_entropy": 0.001474303426221013, + "step": 4865 + }, + { + "epoch": 2.606455929085131, + "grad_norm": 16.778112141427197, + "learning_rate": 5.1365772862337177e-08, + "logits/chosen": 0.8045045137405396, + "logits/rejected": 0.8898431658744812, + "logps/chosen": -9.492764472961426, + "logps/rejected": -11.15455436706543, + "loss": 0.3243, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -9.492764472961426, + "rewards/margins": 1.6617908477783203, + "rewards/rejected": -11.15455436706543, + "semantic_entropy": 0.001532680937089026, + "step": 4870 + }, + { + "epoch": 2.6091319618665327, + "grad_norm": 22.352970563490228, + "learning_rate": 5.068037982778905e-08, + "logits/chosen": 0.8180361986160278, + "logits/rejected": 0.8790004849433899, + "logps/chosen": -9.450529098510742, + "logps/rejected": -10.990362167358398, + "loss": 0.3905, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.450529098510742, + "rewards/margins": 1.539833426475525, + "rewards/rejected": -10.990362167358398, + "semantic_entropy": 0.0015029583591967821, + "step": 4875 + }, + { + "epoch": 2.6118079946479344, + "grad_norm": 20.68826893781038, + "learning_rate": 4.999934609416656e-08, + "logits/chosen": 0.9028242826461792, + "logits/rejected": 0.9264825582504272, + "logps/chosen": -9.678912162780762, + "logps/rejected": -11.107706069946289, + "loss": 0.3986, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -9.678912162780762, + "rewards/margins": 1.4287939071655273, + "rewards/rejected": -11.107706069946289, + "semantic_entropy": 0.0014561197021976113, + "step": 4880 + }, + { + "epoch": 2.614484027429336, + "grad_norm": 23.193032984362397, + "learning_rate": 4.932267826886183e-08, + "logits/chosen": 0.8201519846916199, + "logits/rejected": 0.8877601623535156, + "logps/chosen": -9.788244247436523, + "logps/rejected": -11.236588478088379, + "loss": 0.3943, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -9.788244247436523, + "rewards/margins": 1.4483439922332764, + "rewards/rejected": -11.236588478088379, + "semantic_entropy": 0.0011878965888172388, + "step": 4885 + }, + { + "epoch": 2.6171600602107374, + "grad_norm": 22.557556114089028, + "learning_rate": 4.8650382916909206e-08, + "logits/chosen": 0.7988881468772888, + "logits/rejected": 0.8320202827453613, + "logps/chosen": -9.691407203674316, + "logps/rejected": -11.068517684936523, + "loss": 0.4254, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -9.691407203674316, + "rewards/margins": 1.3771107196807861, + "rewards/rejected": -11.068517684936523, + "semantic_entropy": 0.001317240297794342, + "step": 4890 + }, + { + "epoch": 2.619836092992139, + "grad_norm": 19.898200763361565, + "learning_rate": 4.7982466560920976e-08, + "logits/chosen": 0.7807987928390503, + "logits/rejected": 0.8615278005599976, + "logps/chosen": -9.802709579467773, + "logps/rejected": -11.011950492858887, + "loss": 0.4279, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -9.802709579467773, + "rewards/margins": 1.2092421054840088, + "rewards/rejected": -11.011950492858887, + "semantic_entropy": 0.0015492916572839022, + "step": 4895 + }, + { + "epoch": 2.622512125773541, + "grad_norm": 23.69089776610765, + "learning_rate": 4.7318935681024685e-08, + "logits/chosen": 0.7918484807014465, + "logits/rejected": 0.8997529149055481, + "logps/chosen": -9.777336120605469, + "logps/rejected": -11.20526123046875, + "loss": 0.374, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.777336120605469, + "rewards/margins": 1.4279241561889648, + "rewards/rejected": -11.20526123046875, + "semantic_entropy": 0.0012134136632084846, + "step": 4900 + }, + { + "epoch": 2.625188158554942, + "grad_norm": 20.406979801252664, + "learning_rate": 4.6659796714799745e-08, + "logits/chosen": 0.795091986656189, + "logits/rejected": 0.8641031384468079, + "logps/chosen": -9.73538589477539, + "logps/rejected": -11.23878288269043, + "loss": 0.3381, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.73538589477539, + "rewards/margins": 1.5033972263336182, + "rewards/rejected": -11.23878288269043, + "semantic_entropy": 0.0015479883877560496, + "step": 4905 + }, + { + "epoch": 2.627864191336344, + "grad_norm": 19.21550189332208, + "learning_rate": 4.60050560572155e-08, + "logits/chosen": 0.7698862552642822, + "logits/rejected": 0.8082722425460815, + "logps/chosen": -9.636996269226074, + "logps/rejected": -11.186447143554688, + "loss": 0.3999, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.636996269226074, + "rewards/margins": 1.5494511127471924, + "rewards/rejected": -11.186447143554688, + "semantic_entropy": 0.0014376682229340076, + "step": 4910 + }, + { + "epoch": 2.6305402241177456, + "grad_norm": 23.312400032128476, + "learning_rate": 4.535472006056834e-08, + "logits/chosen": 0.8084294199943542, + "logits/rejected": 0.8658881187438965, + "logps/chosen": -9.69934368133545, + "logps/rejected": -10.961128234863281, + "loss": 0.4383, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.69934368133545, + "rewards/margins": 1.261784553527832, + "rewards/rejected": -10.961128234863281, + "semantic_entropy": 0.0016576785128563643, + "step": 4915 + }, + { + "epoch": 2.6332162568991473, + "grad_norm": 22.36582731335697, + "learning_rate": 4.470879503442132e-08, + "logits/chosen": 0.8091555833816528, + "logits/rejected": 0.8557069897651672, + "logps/chosen": -9.803995132446289, + "logps/rejected": -11.197736740112305, + "loss": 0.3964, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.803995132446289, + "rewards/margins": 1.3937435150146484, + "rewards/rejected": -11.197736740112305, + "semantic_entropy": 0.0012877520639449358, + "step": 4920 + }, + { + "epoch": 2.6358922896805486, + "grad_norm": 18.952340559608157, + "learning_rate": 4.406728724554154e-08, + "logits/chosen": 0.7486631870269775, + "logits/rejected": 0.8488380312919617, + "logps/chosen": -9.719161987304688, + "logps/rejected": -11.249533653259277, + "loss": 0.3694, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.719161987304688, + "rewards/margins": 1.5303723812103271, + "rewards/rejected": -11.249533653259277, + "semantic_entropy": 0.001469378243200481, + "step": 4925 + }, + { + "epoch": 2.6385683224619503, + "grad_norm": 18.013629249072153, + "learning_rate": 4.3430202917840664e-08, + "logits/chosen": 0.8312109708786011, + "logits/rejected": 0.9005948901176453, + "logps/chosen": -9.830782890319824, + "logps/rejected": -11.317447662353516, + "loss": 0.3873, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.830782890319824, + "rewards/margins": 1.4866645336151123, + "rewards/rejected": -11.317447662353516, + "semantic_entropy": 0.0013012022245675325, + "step": 4930 + }, + { + "epoch": 2.6412443552433515, + "grad_norm": 26.68369648236472, + "learning_rate": 4.279754823231346e-08, + "logits/chosen": 0.8236324191093445, + "logits/rejected": 0.898714542388916, + "logps/chosen": -9.688699722290039, + "logps/rejected": -11.057371139526367, + "loss": 0.4296, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.688699722290039, + "rewards/margins": 1.368671178817749, + "rewards/rejected": -11.057371139526367, + "semantic_entropy": 0.0014394777826964855, + "step": 4935 + }, + { + "epoch": 2.6439203880247533, + "grad_norm": 19.214329700454428, + "learning_rate": 4.216932932697859e-08, + "logits/chosen": 0.7843598127365112, + "logits/rejected": 0.8269468545913696, + "logps/chosen": -9.618779182434082, + "logps/rejected": -10.83703899383545, + "loss": 0.4075, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.618779182434082, + "rewards/margins": 1.2182590961456299, + "rewards/rejected": -10.83703899383545, + "semantic_entropy": 0.0018105891067534685, + "step": 4940 + }, + { + "epoch": 2.646596420806155, + "grad_norm": 25.567099589733154, + "learning_rate": 4.154555229681844e-08, + "logits/chosen": 0.771405041217804, + "logits/rejected": 0.867265522480011, + "logps/chosen": -9.69874382019043, + "logps/rejected": -11.158080101013184, + "loss": 0.3747, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.69874382019043, + "rewards/margins": 1.4593359231948853, + "rewards/rejected": -11.158080101013184, + "semantic_entropy": 0.0013225203147158027, + "step": 4945 + }, + { + "epoch": 2.6492724535875567, + "grad_norm": 21.387358904048636, + "learning_rate": 4.092622319372069e-08, + "logits/chosen": 0.8330507278442383, + "logits/rejected": 0.8846317529678345, + "logps/chosen": -9.71510124206543, + "logps/rejected": -11.14268684387207, + "loss": 0.3984, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.71510124206543, + "rewards/margins": 1.4275856018066406, + "rewards/rejected": -11.14268684387207, + "semantic_entropy": 0.0013751887017861009, + "step": 4950 + }, + { + "epoch": 2.651948486368958, + "grad_norm": 23.151467653095995, + "learning_rate": 4.031134802641889e-08, + "logits/chosen": 0.8044828176498413, + "logits/rejected": 0.8498908877372742, + "logps/chosen": -9.883355140686035, + "logps/rejected": -11.208585739135742, + "loss": 0.4105, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -9.883355140686035, + "rewards/margins": 1.325231909751892, + "rewards/rejected": -11.208585739135742, + "semantic_entropy": 0.0015358685050159693, + "step": 4955 + }, + { + "epoch": 2.6546245191503597, + "grad_norm": 21.04274720664362, + "learning_rate": 3.970093276043468e-08, + "logits/chosen": 0.8240159749984741, + "logits/rejected": 0.9003788828849792, + "logps/chosen": -9.615509986877441, + "logps/rejected": -11.031620025634766, + "loss": 0.3856, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -9.615509986877441, + "rewards/margins": 1.4161105155944824, + "rewards/rejected": -11.031620025634766, + "semantic_entropy": 0.0017490362515673041, + "step": 4960 + }, + { + "epoch": 2.657300551931761, + "grad_norm": 39.33191380791592, + "learning_rate": 3.9094983318019584e-08, + "logits/chosen": 0.7968525290489197, + "logits/rejected": 0.8308472633361816, + "logps/chosen": -9.674779891967773, + "logps/rejected": -11.145748138427734, + "loss": 0.371, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -9.674779891967773, + "rewards/margins": 1.4709681272506714, + "rewards/rejected": -11.145748138427734, + "semantic_entropy": 0.0014223111793398857, + "step": 4965 + }, + { + "epoch": 2.6599765847131627, + "grad_norm": 20.85099205117232, + "learning_rate": 3.849350557809789e-08, + "logits/chosen": 0.8421157002449036, + "logits/rejected": 0.8929288983345032, + "logps/chosen": -9.53125, + "logps/rejected": -10.943530082702637, + "loss": 0.3826, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -9.53125, + "rewards/margins": 1.4122816324234009, + "rewards/rejected": -10.943530082702637, + "semantic_entropy": 0.0014043385162949562, + "step": 4970 + }, + { + "epoch": 2.6626526174945644, + "grad_norm": 22.939506081463986, + "learning_rate": 3.789650537620903e-08, + "logits/chosen": 0.8108006715774536, + "logits/rejected": 0.8519940376281738, + "logps/chosen": -9.818662643432617, + "logps/rejected": -11.12476921081543, + "loss": 0.3931, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.818662643432617, + "rewards/margins": 1.3061046600341797, + "rewards/rejected": -11.12476921081543, + "semantic_entropy": 0.0010739094577729702, + "step": 4975 + }, + { + "epoch": 2.665328650275966, + "grad_norm": 22.129521362188676, + "learning_rate": 3.730398850445182e-08, + "logits/chosen": 0.822609543800354, + "logits/rejected": 0.8529809713363647, + "logps/chosen": -9.925312995910645, + "logps/rejected": -11.192630767822266, + "loss": 0.4489, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.925312995910645, + "rewards/margins": 1.2673180103302002, + "rewards/rejected": -11.192630767822266, + "semantic_entropy": 0.0011812245938926935, + "step": 4980 + }, + { + "epoch": 2.6680046830573674, + "grad_norm": 24.19733690749803, + "learning_rate": 3.671596071142735e-08, + "logits/chosen": 0.8324605226516724, + "logits/rejected": 0.9051470756530762, + "logps/chosen": -9.69133472442627, + "logps/rejected": -11.151666641235352, + "loss": 0.4529, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -9.69133472442627, + "rewards/margins": 1.460331678390503, + "rewards/rejected": -11.151666641235352, + "semantic_entropy": 0.0018878221744671464, + "step": 4985 + }, + { + "epoch": 2.670680715838769, + "grad_norm": 31.252423984045087, + "learning_rate": 3.6132427702183996e-08, + "logits/chosen": 0.8355720639228821, + "logits/rejected": 0.8954976797103882, + "logps/chosen": -9.62411880493164, + "logps/rejected": -11.132128715515137, + "loss": 0.361, + "rewards/accuracies": 0.84375, + "rewards/chosen": -9.62411880493164, + "rewards/margins": 1.5080082416534424, + "rewards/rejected": -11.132128715515137, + "semantic_entropy": 0.0016464665532112122, + "step": 4990 + }, + { + "epoch": 2.6733567486201704, + "grad_norm": 21.68508018940414, + "learning_rate": 3.555339513816147e-08, + "logits/chosen": 0.7898616790771484, + "logits/rejected": 0.8546016812324524, + "logps/chosen": -9.857865333557129, + "logps/rejected": -11.005608558654785, + "loss": 0.4645, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.857865333557129, + "rewards/margins": 1.147742748260498, + "rewards/rejected": -11.005608558654785, + "semantic_entropy": 0.0011803485685959458, + "step": 4995 + }, + { + "epoch": 2.676032781401572, + "grad_norm": 21.328951358859364, + "learning_rate": 3.497886863713639e-08, + "logits/chosen": 0.8253191113471985, + "logits/rejected": 0.8587236404418945, + "logps/chosen": -9.829444885253906, + "logps/rejected": -11.261645317077637, + "loss": 0.4094, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.829444885253906, + "rewards/margins": 1.4322013854980469, + "rewards/rejected": -11.261645317077637, + "semantic_entropy": 0.0012271823361515999, + "step": 5000 + }, + { + "epoch": 2.678708814182974, + "grad_norm": 25.75553736028734, + "learning_rate": 3.440885377316721e-08, + "logits/chosen": 0.8507275581359863, + "logits/rejected": 0.8877654075622559, + "logps/chosen": -9.808802604675293, + "logps/rejected": -10.979841232299805, + "loss": 0.438, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.808802604675293, + "rewards/margins": 1.1710389852523804, + "rewards/rejected": -10.979841232299805, + "semantic_entropy": 0.0013294884702190757, + "step": 5005 + }, + { + "epoch": 2.6813848469643755, + "grad_norm": 29.51695593361045, + "learning_rate": 3.384335607654082e-08, + "logits/chosen": 0.8268327713012695, + "logits/rejected": 0.8833521008491516, + "logps/chosen": -9.724630355834961, + "logps/rejected": -11.137059211730957, + "loss": 0.3859, + "rewards/accuracies": 0.84375, + "rewards/chosen": -9.724630355834961, + "rewards/margins": 1.412428617477417, + "rewards/rejected": -11.137059211730957, + "semantic_entropy": 0.0016626717988401651, + "step": 5010 + }, + { + "epoch": 2.684060879745777, + "grad_norm": 20.483800947742463, + "learning_rate": 3.328238103371811e-08, + "logits/chosen": 0.8099279403686523, + "logits/rejected": 0.8540660738945007, + "logps/chosen": -9.710822105407715, + "logps/rejected": -11.191596984863281, + "loss": 0.3804, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.710822105407715, + "rewards/margins": 1.4807744026184082, + "rewards/rejected": -11.191596984863281, + "semantic_entropy": 0.0015045705949887633, + "step": 5015 + }, + { + "epoch": 2.6867369125271785, + "grad_norm": 26.229172740059667, + "learning_rate": 3.272593408728169e-08, + "logits/chosen": 0.8173542022705078, + "logits/rejected": 0.8689044117927551, + "logps/chosen": -9.657730102539062, + "logps/rejected": -10.887810707092285, + "loss": 0.4424, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -9.657730102539062, + "rewards/margins": 1.230080008506775, + "rewards/rejected": -10.887810707092285, + "semantic_entropy": 0.0013948578853160143, + "step": 5020 + }, + { + "epoch": 2.6894129453085798, + "grad_norm": 19.40648296473702, + "learning_rate": 3.217402063588204e-08, + "logits/chosen": 0.7883289456367493, + "logits/rejected": 0.8523383140563965, + "logps/chosen": -9.800715446472168, + "logps/rejected": -11.160974502563477, + "loss": 0.4105, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -9.800715446472168, + "rewards/margins": 1.360258936882019, + "rewards/rejected": -11.160974502563477, + "semantic_entropy": 0.00112335872836411, + "step": 5025 + }, + { + "epoch": 2.6920889780899815, + "grad_norm": 20.18654274201843, + "learning_rate": 3.162664603418608e-08, + "logits/chosen": 0.8560435175895691, + "logits/rejected": 0.8896482586860657, + "logps/chosen": -9.66085147857666, + "logps/rejected": -11.121121406555176, + "loss": 0.3676, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -9.66085147857666, + "rewards/margins": 1.4602700471878052, + "rewards/rejected": -11.121121406555176, + "semantic_entropy": 0.0015098705189302564, + "step": 5030 + }, + { + "epoch": 2.694765010871383, + "grad_norm": 27.638003566187923, + "learning_rate": 3.1083815592824416e-08, + "logits/chosen": 0.8065202832221985, + "logits/rejected": 0.8954984545707703, + "logps/chosen": -9.99959945678711, + "logps/rejected": -11.314790725708008, + "loss": 0.4151, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -9.99959945678711, + "rewards/margins": 1.3151907920837402, + "rewards/rejected": -11.314790725708008, + "semantic_entropy": 0.0012680039508268237, + "step": 5035 + }, + { + "epoch": 2.697441043652785, + "grad_norm": 22.574139063646903, + "learning_rate": 3.054553457834053e-08, + "logits/chosen": 0.8925463557243347, + "logits/rejected": 0.9036859273910522, + "logps/chosen": -9.90015983581543, + "logps/rejected": -11.163106918334961, + "loss": 0.4213, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -9.90015983581543, + "rewards/margins": 1.262947678565979, + "rewards/rejected": -11.163106918334961, + "semantic_entropy": 0.0010416943114250898, + "step": 5040 + }, + { + "epoch": 2.700117076434186, + "grad_norm": 23.894727539187592, + "learning_rate": 3.0011808213139036e-08, + "logits/chosen": 0.8361862301826477, + "logits/rejected": 0.8620640635490417, + "logps/chosen": -9.725111961364746, + "logps/rejected": -11.048177719116211, + "loss": 0.3983, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -9.725111961364746, + "rewards/margins": 1.3230668306350708, + "rewards/rejected": -11.048177719116211, + "semantic_entropy": 0.001452545402571559, + "step": 5045 + }, + { + "epoch": 2.702793109215588, + "grad_norm": 22.35978287732217, + "learning_rate": 2.948264167543568e-08, + "logits/chosen": 0.7902384996414185, + "logits/rejected": 0.8329121470451355, + "logps/chosen": -9.640339851379395, + "logps/rejected": -10.89880657196045, + "loss": 0.4069, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.640339851379395, + "rewards/margins": 1.258466362953186, + "rewards/rejected": -10.89880657196045, + "semantic_entropy": 0.0015184081858024001, + "step": 5050 + }, + { + "epoch": 2.7054691419969896, + "grad_norm": 21.602439650044992, + "learning_rate": 2.8958040099206216e-08, + "logits/chosen": 0.7853751182556152, + "logits/rejected": 0.8521712422370911, + "logps/chosen": -9.592992782592773, + "logps/rejected": -11.00520133972168, + "loss": 0.3884, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.592992782592773, + "rewards/margins": 1.4122079610824585, + "rewards/rejected": -11.00520133972168, + "semantic_entropy": 0.0015211288118734956, + "step": 5055 + }, + { + "epoch": 2.708145174778391, + "grad_norm": 24.292700666304096, + "learning_rate": 2.843800857413775e-08, + "logits/chosen": 0.8210417628288269, + "logits/rejected": 0.8545898199081421, + "logps/chosen": -9.68048095703125, + "logps/rejected": -10.930859565734863, + "loss": 0.4632, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.68048095703125, + "rewards/margins": 1.2503786087036133, + "rewards/rejected": -10.930859565734863, + "semantic_entropy": 0.001456740777939558, + "step": 5060 + }, + { + "epoch": 2.7108212075597926, + "grad_norm": 23.853271664665353, + "learning_rate": 2.7922552145578203e-08, + "logits/chosen": 0.8457640409469604, + "logits/rejected": 0.8957823514938354, + "logps/chosen": -9.459394454956055, + "logps/rejected": -10.84212589263916, + "loss": 0.4069, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.459394454956055, + "rewards/margins": 1.3827307224273682, + "rewards/rejected": -10.84212589263916, + "semantic_entropy": 0.0017677752766758204, + "step": 5065 + }, + { + "epoch": 2.7134972403411943, + "grad_norm": 27.366391781200583, + "learning_rate": 2.7411675814488277e-08, + "logits/chosen": 0.8697333335876465, + "logits/rejected": 0.9225956201553345, + "logps/chosen": -9.678709983825684, + "logps/rejected": -10.979662895202637, + "loss": 0.3868, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.678709983825684, + "rewards/margins": 1.3009527921676636, + "rewards/rejected": -10.979662895202637, + "semantic_entropy": 0.0014675845159217715, + "step": 5070 + }, + { + "epoch": 2.7161732731225956, + "grad_norm": 29.099080644827772, + "learning_rate": 2.690538453739216e-08, + "logits/chosen": 0.8708797693252563, + "logits/rejected": 0.8982332348823547, + "logps/chosen": -9.632844924926758, + "logps/rejected": -10.764936447143555, + "loss": 0.4623, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -9.632844924926758, + "rewards/margins": 1.132093071937561, + "rewards/rejected": -10.764936447143555, + "semantic_entropy": 0.0013683564029633999, + "step": 5075 + }, + { + "epoch": 2.7188493059039973, + "grad_norm": 23.320063008009125, + "learning_rate": 2.6403683226330298e-08, + "logits/chosen": 0.8062912225723267, + "logits/rejected": 0.8842616081237793, + "logps/chosen": -9.81375503540039, + "logps/rejected": -11.180780410766602, + "loss": 0.3998, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -9.81375503540039, + "rewards/margins": 1.3670246601104736, + "rewards/rejected": -11.180780410766602, + "semantic_entropy": 0.0012563010677695274, + "step": 5080 + }, + { + "epoch": 2.721525338685399, + "grad_norm": 27.02131423176778, + "learning_rate": 2.5906576748810804e-08, + "logits/chosen": 0.8369554281234741, + "logits/rejected": 0.8760555386543274, + "logps/chosen": -9.69217300415039, + "logps/rejected": -11.273096084594727, + "loss": 0.3291, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -9.69217300415039, + "rewards/margins": 1.580923318862915, + "rewards/rejected": -11.273096084594727, + "semantic_entropy": 0.0013889706460759044, + "step": 5085 + }, + { + "epoch": 2.7242013714668003, + "grad_norm": 26.902505403388286, + "learning_rate": 2.5414069927763016e-08, + "logits/chosen": 0.8435298800468445, + "logits/rejected": 0.8922918438911438, + "logps/chosen": -9.850217819213867, + "logps/rejected": -11.199501037597656, + "loss": 0.4054, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -9.850217819213867, + "rewards/margins": 1.3492811918258667, + "rewards/rejected": -11.199501037597656, + "semantic_entropy": 0.0012866712640970945, + "step": 5090 + }, + { + "epoch": 2.726877404248202, + "grad_norm": 23.139963064688857, + "learning_rate": 2.4926167541490185e-08, + "logits/chosen": 0.7457908987998962, + "logits/rejected": 0.8107954859733582, + "logps/chosen": -9.701014518737793, + "logps/rejected": -11.19543743133545, + "loss": 0.4072, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.701014518737793, + "rewards/margins": 1.4944229125976562, + "rewards/rejected": -11.19543743133545, + "semantic_entropy": 0.001561012351885438, + "step": 5095 + }, + { + "epoch": 2.7295534370296037, + "grad_norm": 18.505427836623596, + "learning_rate": 2.4442874323623574e-08, + "logits/chosen": 0.8288080096244812, + "logits/rejected": 0.852368950843811, + "logps/chosen": -9.831713676452637, + "logps/rejected": -11.238375663757324, + "loss": 0.4368, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -9.831713676452637, + "rewards/margins": 1.4066613912582397, + "rewards/rejected": -11.238375663757324, + "semantic_entropy": 0.0013758750865235925, + "step": 5100 + }, + { + "epoch": 2.7322294698110055, + "grad_norm": 24.20695474770458, + "learning_rate": 2.396419496307589e-08, + "logits/chosen": 0.7989641427993774, + "logits/rejected": 0.8495559692382812, + "logps/chosen": -9.894341468811035, + "logps/rejected": -11.228631019592285, + "loss": 0.3974, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -9.894341468811035, + "rewards/margins": 1.3342888355255127, + "rewards/rejected": -11.228631019592285, + "semantic_entropy": 0.0011284537613391876, + "step": 5105 + }, + { + "epoch": 2.7349055025924067, + "grad_norm": 19.561878478981807, + "learning_rate": 2.349013410399653e-08, + "logits/chosen": 0.7845159769058228, + "logits/rejected": 0.8294118046760559, + "logps/chosen": -9.763102531433105, + "logps/rejected": -11.050474166870117, + "loss": 0.4596, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -9.763102531433105, + "rewards/margins": 1.2873718738555908, + "rewards/rejected": -11.050474166870117, + "semantic_entropy": 0.0012428943300619721, + "step": 5110 + }, + { + "epoch": 2.7375815353738084, + "grad_norm": 26.484826469549404, + "learning_rate": 2.3020696345725954e-08, + "logits/chosen": 0.7876384258270264, + "logits/rejected": 0.8591636419296265, + "logps/chosen": -9.837101936340332, + "logps/rejected": -11.317276954650879, + "loss": 0.348, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -9.837101936340332, + "rewards/margins": 1.4801758527755737, + "rewards/rejected": -11.317276954650879, + "semantic_entropy": 0.0012740811798721552, + "step": 5115 + }, + { + "epoch": 2.7402575681552097, + "grad_norm": 26.768124375085126, + "learning_rate": 2.2555886242751398e-08, + "logits/chosen": 0.8366681933403015, + "logits/rejected": 0.8936346173286438, + "logps/chosen": -9.770502090454102, + "logps/rejected": -11.170347213745117, + "loss": 0.3935, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -9.770502090454102, + "rewards/margins": 1.3998456001281738, + "rewards/rejected": -11.170347213745117, + "semantic_entropy": 0.0012818884570151567, + "step": 5120 + }, + { + "epoch": 2.7429336009366114, + "grad_norm": 34.15098903260704, + "learning_rate": 2.2095708304662453e-08, + "logits/chosen": 0.7642577886581421, + "logits/rejected": 0.8753819465637207, + "logps/chosen": -9.696569442749023, + "logps/rejected": -11.150983810424805, + "loss": 0.3862, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -9.696569442749023, + "rewards/margins": 1.454413652420044, + "rewards/rejected": -11.150983810424805, + "semantic_entropy": 0.0013747283956035972, + "step": 5125 + }, + { + "epoch": 2.745609633718013, + "grad_norm": 28.8525710173051, + "learning_rate": 2.16401669961076e-08, + "logits/chosen": 0.7833540439605713, + "logits/rejected": 0.855305552482605, + "logps/chosen": -9.691442489624023, + "logps/rejected": -11.090289115905762, + "loss": 0.4098, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.691442489624023, + "rewards/margins": 1.3988467454910278, + "rewards/rejected": -11.090289115905762, + "semantic_entropy": 0.001538719516247511, + "step": 5130 + }, + { + "epoch": 2.748285666499415, + "grad_norm": 30.720388291606127, + "learning_rate": 2.1189266736750532e-08, + "logits/chosen": 0.8673465847969055, + "logits/rejected": 0.9190985560417175, + "logps/chosen": -9.79682445526123, + "logps/rejected": -11.017160415649414, + "loss": 0.4352, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -9.79682445526123, + "rewards/margins": 1.2203348875045776, + "rewards/rejected": -11.017160415649414, + "semantic_entropy": 0.0019034147262573242, + "step": 5135 + }, + { + "epoch": 2.750961699280816, + "grad_norm": 19.26413991827598, + "learning_rate": 2.0743011901227623e-08, + "logits/chosen": 0.8775045275688171, + "logits/rejected": 0.948703944683075, + "logps/chosen": -9.918733596801758, + "logps/rejected": -11.272817611694336, + "loss": 0.3995, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.918733596801758, + "rewards/margins": 1.3540844917297363, + "rewards/rejected": -11.272817611694336, + "semantic_entropy": 0.0010751333320513368, + "step": 5140 + }, + { + "epoch": 2.753637732062218, + "grad_norm": 27.791159491160563, + "learning_rate": 2.030140681910508e-08, + "logits/chosen": 0.8398303985595703, + "logits/rejected": 0.8916828036308289, + "logps/chosen": -9.831799507141113, + "logps/rejected": -11.197306632995605, + "loss": 0.4334, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -9.831799507141113, + "rewards/margins": 1.3655065298080444, + "rewards/rejected": -11.197306632995605, + "semantic_entropy": 0.0013978518545627594, + "step": 5145 + }, + { + "epoch": 2.756313764843619, + "grad_norm": 26.235313395338775, + "learning_rate": 1.986445577483753e-08, + "logits/chosen": 0.8134158849716187, + "logits/rejected": 0.8556219935417175, + "logps/chosen": -9.731077194213867, + "logps/rejected": -11.126041412353516, + "loss": 0.4181, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.731077194213867, + "rewards/margins": 1.394963026046753, + "rewards/rejected": -11.126041412353516, + "semantic_entropy": 0.00145871308632195, + "step": 5150 + }, + { + "epoch": 2.758989797625021, + "grad_norm": 19.605206487487308, + "learning_rate": 1.9432163007725765e-08, + "logits/chosen": 0.7870944738388062, + "logits/rejected": 0.8299382925033569, + "logps/chosen": -9.642583847045898, + "logps/rejected": -11.064803123474121, + "loss": 0.4027, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.642583847045898, + "rewards/margins": 1.4222198724746704, + "rewards/rejected": -11.064803123474121, + "semantic_entropy": 0.0014933927450329065, + "step": 5155 + }, + { + "epoch": 2.7616658304064226, + "grad_norm": 18.551143666002904, + "learning_rate": 1.9004532711876297e-08, + "logits/chosen": 0.7672029137611389, + "logits/rejected": 0.8150334358215332, + "logps/chosen": -9.65953254699707, + "logps/rejected": -11.068132400512695, + "loss": 0.3776, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -9.65953254699707, + "rewards/margins": 1.4086004495620728, + "rewards/rejected": -11.068132400512695, + "semantic_entropy": 0.0015167773235589266, + "step": 5160 + }, + { + "epoch": 2.7643418631878243, + "grad_norm": 28.399641966852776, + "learning_rate": 1.8581569036159928e-08, + "logits/chosen": 0.8048108220100403, + "logits/rejected": 0.8365411758422852, + "logps/chosen": -9.653702735900879, + "logps/rejected": -11.053384780883789, + "loss": 0.4278, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.653702735900879, + "rewards/margins": 1.3996822834014893, + "rewards/rejected": -11.053384780883789, + "semantic_entropy": 0.0016002919292077422, + "step": 5165 + }, + { + "epoch": 2.7670178959692255, + "grad_norm": 22.352042394265855, + "learning_rate": 1.8163276084172285e-08, + "logits/chosen": 0.8579298257827759, + "logits/rejected": 0.9403360486030579, + "logps/chosen": -10.103005409240723, + "logps/rejected": -11.47459602355957, + "loss": 0.3877, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -10.103005409240723, + "rewards/margins": 1.3715909719467163, + "rewards/rejected": -11.47459602355957, + "semantic_entropy": 0.001051284489221871, + "step": 5170 + }, + { + "epoch": 2.7696939287506273, + "grad_norm": 25.499811878639434, + "learning_rate": 1.7749657914193194e-08, + "logits/chosen": 0.8347901105880737, + "logits/rejected": 0.9103593826293945, + "logps/chosen": -9.974831581115723, + "logps/rejected": -11.45885944366455, + "loss": 0.3708, + "rewards/accuracies": 0.84375, + "rewards/chosen": -9.974831581115723, + "rewards/margins": 1.4840264320373535, + "rewards/rejected": -11.45885944366455, + "semantic_entropy": 0.0011410152073949575, + "step": 5175 + }, + { + "epoch": 2.7723699615320285, + "grad_norm": 28.6122272486666, + "learning_rate": 1.7340718539148203e-08, + "logits/chosen": 0.8332939147949219, + "logits/rejected": 0.8270009756088257, + "logps/chosen": -9.950929641723633, + "logps/rejected": -11.199871063232422, + "loss": 0.4188, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.950929641723633, + "rewards/margins": 1.2489430904388428, + "rewards/rejected": -11.199871063232422, + "semantic_entropy": 0.001222481718286872, + "step": 5180 + }, + { + "epoch": 2.7750459943134302, + "grad_norm": 17.901053138739112, + "learning_rate": 1.6936461926568724e-08, + "logits/chosen": 0.8614856600761414, + "logits/rejected": 0.8957780599594116, + "logps/chosen": -9.616273880004883, + "logps/rejected": -11.097272872924805, + "loss": 0.3995, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -9.616273880004883, + "rewards/margins": 1.4809997081756592, + "rewards/rejected": -11.097272872924805, + "semantic_entropy": 0.001978642772883177, + "step": 5185 + }, + { + "epoch": 2.777722027094832, + "grad_norm": 23.833732221773882, + "learning_rate": 1.6536891998554346e-08, + "logits/chosen": 0.7540593147277832, + "logits/rejected": 0.8192625045776367, + "logps/chosen": -9.756368637084961, + "logps/rejected": -11.14604663848877, + "loss": 0.3821, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -9.756368637084961, + "rewards/margins": 1.3896772861480713, + "rewards/rejected": -11.14604663848877, + "semantic_entropy": 0.0011760034831240773, + "step": 5190 + }, + { + "epoch": 2.7803980598762337, + "grad_norm": 22.006338516815813, + "learning_rate": 1.6142012631734093e-08, + "logits/chosen": 0.8480769991874695, + "logits/rejected": 0.9198252558708191, + "logps/chosen": -9.712282180786133, + "logps/rejected": -11.163978576660156, + "loss": 0.3873, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.712282180786133, + "rewards/margins": 1.4516950845718384, + "rewards/rejected": -11.163978576660156, + "semantic_entropy": 0.0014456122880801558, + "step": 5195 + }, + { + "epoch": 2.783074092657635, + "grad_norm": 29.759216354153438, + "learning_rate": 1.575182765722949e-08, + "logits/chosen": 0.7646334171295166, + "logits/rejected": 0.8091537356376648, + "logps/chosen": -9.873659133911133, + "logps/rejected": -11.170156478881836, + "loss": 0.4221, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.873659133911133, + "rewards/margins": 1.2964979410171509, + "rewards/rejected": -11.170156478881836, + "semantic_entropy": 0.001071856007911265, + "step": 5200 + }, + { + "epoch": 2.783074092657635, + "eval_logits/chosen": 0.8672059774398804, + "eval_logits/rejected": 0.9047586917877197, + "eval_logps/chosen": -9.958077430725098, + "eval_logps/rejected": -11.086060523986816, + "eval_loss": 0.5239496231079102, + "eval_rewards/accuracies": 0.7247774600982666, + "eval_rewards/chosen": -9.958077430725098, + "eval_rewards/margins": 1.1279836893081665, + "eval_rewards/rejected": -11.086060523986816, + "eval_runtime": 35.0763, + "eval_samples_per_second": 38.345, + "eval_semantic_entropy": 0.0012691987212747335, + "eval_steps_per_second": 9.608, + "step": 5200 + }, + { + "epoch": 2.7857501254390367, + "grad_norm": 19.396392953978733, + "learning_rate": 1.536634086061672e-08, + "logits/chosen": 0.8649656176567078, + "logits/rejected": 0.8771345019340515, + "logps/chosen": -9.787662506103516, + "logps/rejected": -11.10372543334961, + "loss": 0.4402, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -9.787662506103516, + "rewards/margins": 1.316063642501831, + "rewards/rejected": -11.10372543334961, + "semantic_entropy": 0.001424965332262218, + "step": 5205 + }, + { + "epoch": 2.788426158220438, + "grad_norm": 26.94949255337376, + "learning_rate": 1.4985555981890495e-08, + "logits/chosen": 0.8623428344726562, + "logits/rejected": 0.8985008001327515, + "logps/chosen": -9.891159057617188, + "logps/rejected": -11.298178672790527, + "loss": 0.4101, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.891159057617188, + "rewards/margins": 1.4070203304290771, + "rewards/rejected": -11.298178672790527, + "semantic_entropy": 0.0011413523461669683, + "step": 5210 + }, + { + "epoch": 2.7911021910018396, + "grad_norm": 19.482056056108192, + "learning_rate": 1.4609476715427226e-08, + "logits/chosen": 0.8556788563728333, + "logits/rejected": 0.8989516496658325, + "logps/chosen": -9.591584205627441, + "logps/rejected": -11.054668426513672, + "loss": 0.3868, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -9.591584205627441, + "rewards/margins": 1.4630842208862305, + "rewards/rejected": -11.054668426513672, + "semantic_entropy": 0.0015841536223888397, + "step": 5215 + }, + { + "epoch": 2.7937782237832414, + "grad_norm": 24.45269364730528, + "learning_rate": 1.4238106709949792e-08, + "logits/chosen": 0.7989322543144226, + "logits/rejected": 0.8648680448532104, + "logps/chosen": -9.795249938964844, + "logps/rejected": -11.304253578186035, + "loss": 0.3443, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -9.795249938964844, + "rewards/margins": 1.5090038776397705, + "rewards/rejected": -11.304253578186035, + "semantic_entropy": 0.0012715930351987481, + "step": 5220 + }, + { + "epoch": 2.796454256564643, + "grad_norm": 29.960130426736338, + "learning_rate": 1.3871449568491511e-08, + "logits/chosen": 0.7782562971115112, + "logits/rejected": 0.8624800443649292, + "logps/chosen": -9.840250968933105, + "logps/rejected": -11.1703462600708, + "loss": 0.4095, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -9.840250968933105, + "rewards/margins": 1.3300951719284058, + "rewards/rejected": -11.1703462600708, + "semantic_entropy": 0.0011493575293570757, + "step": 5225 + }, + { + "epoch": 2.7991302893460444, + "grad_norm": 17.26742704065323, + "learning_rate": 1.3509508848361606e-08, + "logits/chosen": 0.7447667121887207, + "logits/rejected": 0.7923721075057983, + "logps/chosen": -9.699943542480469, + "logps/rejected": -11.134923934936523, + "loss": 0.3732, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -9.699943542480469, + "rewards/margins": 1.4349799156188965, + "rewards/rejected": -11.134923934936523, + "semantic_entropy": 0.001493643270805478, + "step": 5230 + }, + { + "epoch": 2.801806322127446, + "grad_norm": 18.96292537799569, + "learning_rate": 1.3152288061110517e-08, + "logits/chosen": 0.7624896764755249, + "logits/rejected": 0.8235493898391724, + "logps/chosen": -9.626928329467773, + "logps/rejected": -11.030915260314941, + "loss": 0.3852, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -9.626928329467773, + "rewards/margins": 1.4039862155914307, + "rewards/rejected": -11.030915260314941, + "semantic_entropy": 0.0014617822598665953, + "step": 5235 + }, + { + "epoch": 2.804482354908848, + "grad_norm": 21.629798181433536, + "learning_rate": 1.2799790672495814e-08, + "logits/chosen": 0.8226927518844604, + "logits/rejected": 0.88921058177948, + "logps/chosen": -9.711584091186523, + "logps/rejected": -11.16446590423584, + "loss": 0.3962, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.711584091186523, + "rewards/margins": 1.4528809785842896, + "rewards/rejected": -11.16446590423584, + "semantic_entropy": 0.0014884325210005045, + "step": 5240 + }, + { + "epoch": 2.807158387690249, + "grad_norm": 25.58854679978555, + "learning_rate": 1.2452020102448835e-08, + "logits/chosen": 0.844528317451477, + "logits/rejected": 0.8750749826431274, + "logps/chosen": -9.763383865356445, + "logps/rejected": -11.062616348266602, + "loss": 0.4045, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.763383865356445, + "rewards/margins": 1.2992339134216309, + "rewards/rejected": -11.062616348266602, + "semantic_entropy": 0.001275677583180368, + "step": 5245 + }, + { + "epoch": 2.8098344204716508, + "grad_norm": 30.415412249344016, + "learning_rate": 1.2108979725041103e-08, + "logits/chosen": 0.7932205200195312, + "logits/rejected": 0.8923565745353699, + "logps/chosen": -9.760113716125488, + "logps/rejected": -11.208516120910645, + "loss": 0.4172, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.760113716125488, + "rewards/margins": 1.4484022855758667, + "rewards/rejected": -11.208516120910645, + "semantic_entropy": 0.0014725803630426526, + "step": 5250 + }, + { + "epoch": 2.8125104532530525, + "grad_norm": 22.204446838947426, + "learning_rate": 1.1770672868451958e-08, + "logits/chosen": 0.8339746594429016, + "logits/rejected": 0.9121615290641785, + "logps/chosen": -10.030183792114258, + "logps/rejected": -11.423551559448242, + "loss": 0.3705, + "rewards/accuracies": 0.84375, + "rewards/chosen": -10.030183792114258, + "rewards/margins": 1.393368124961853, + "rewards/rejected": -11.423551559448242, + "semantic_entropy": 0.0011341646313667297, + "step": 5255 + }, + { + "epoch": 2.8151864860344538, + "grad_norm": 36.390773136441254, + "learning_rate": 1.1437102814935872e-08, + "logits/chosen": 0.8048036694526672, + "logits/rejected": 0.8228060603141785, + "logps/chosen": -9.797411918640137, + "logps/rejected": -11.064781188964844, + "loss": 0.4645, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -9.797411918640137, + "rewards/margins": 1.267369270324707, + "rewards/rejected": -11.064781188964844, + "semantic_entropy": 0.0013338859425857663, + "step": 5260 + }, + { + "epoch": 2.8178625188158555, + "grad_norm": 19.439863848193177, + "learning_rate": 1.1108272800791018e-08, + "logits/chosen": 0.8065903782844543, + "logits/rejected": 0.8303533792495728, + "logps/chosen": -9.820572853088379, + "logps/rejected": -11.200105667114258, + "loss": 0.3785, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -9.820572853088379, + "rewards/margins": 1.3795334100723267, + "rewards/rejected": -11.200105667114258, + "semantic_entropy": 0.0013253279030323029, + "step": 5265 + }, + { + "epoch": 2.820538551597257, + "grad_norm": 24.995280474358317, + "learning_rate": 1.078418601632769e-08, + "logits/chosen": 0.8746574521064758, + "logits/rejected": 0.8969374895095825, + "logps/chosen": -9.832982063293457, + "logps/rejected": -11.270672798156738, + "loss": 0.3779, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -9.832982063293457, + "rewards/margins": 1.43769109249115, + "rewards/rejected": -11.270672798156738, + "semantic_entropy": 0.0013589839218184352, + "step": 5270 + }, + { + "epoch": 2.8232145843786585, + "grad_norm": 15.437216876674157, + "learning_rate": 1.0464845605837159e-08, + "logits/chosen": 0.7981137633323669, + "logits/rejected": 0.8469152450561523, + "logps/chosen": -9.720634460449219, + "logps/rejected": -11.106169700622559, + "loss": 0.3561, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -9.720634460449219, + "rewards/margins": 1.385535478591919, + "rewards/rejected": -11.106169700622559, + "semantic_entropy": 0.0014761090278625488, + "step": 5275 + }, + { + "epoch": 2.82589061716006, + "grad_norm": 15.998655549802471, + "learning_rate": 1.0150254667561642e-08, + "logits/chosen": 0.7983990907669067, + "logits/rejected": 0.8353071212768555, + "logps/chosen": -10.041610717773438, + "logps/rejected": -11.562406539916992, + "loss": 0.377, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -10.041610717773438, + "rewards/margins": 1.5207948684692383, + "rewards/rejected": -11.562406539916992, + "semantic_entropy": 0.0010572883766144514, + "step": 5280 + }, + { + "epoch": 2.828566649941462, + "grad_norm": 27.424504358400647, + "learning_rate": 9.840416253663719e-09, + "logits/chosen": 0.7955508232116699, + "logits/rejected": 0.8736175298690796, + "logps/chosen": -9.822931289672852, + "logps/rejected": -11.340441703796387, + "loss": 0.3779, + "rewards/accuracies": 0.84375, + "rewards/chosen": -9.822931289672852, + "rewards/margins": 1.5175096988677979, + "rewards/rejected": -11.340441703796387, + "semantic_entropy": 0.0011566228931769729, + "step": 5285 + }, + { + "epoch": 2.8312426827228636, + "grad_norm": 27.591040599780438, + "learning_rate": 9.535333370197074e-09, + "logits/chosen": 0.8231992721557617, + "logits/rejected": 0.8760835528373718, + "logps/chosen": -9.819466590881348, + "logps/rejected": -11.222938537597656, + "loss": 0.4014, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.819466590881348, + "rewards/margins": 1.4034711122512817, + "rewards/rejected": -11.222938537597656, + "semantic_entropy": 0.0014938964741304517, + "step": 5290 + }, + { + "epoch": 2.833918715504265, + "grad_norm": 18.470072827747238, + "learning_rate": 9.23500897707713e-09, + "logits/chosen": 0.8005205988883972, + "logits/rejected": 0.8584259748458862, + "logps/chosen": -9.931344985961914, + "logps/rejected": -11.320419311523438, + "loss": 0.4159, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.931344985961914, + "rewards/margins": 1.3890745639801025, + "rewards/rejected": -11.320419311523438, + "semantic_entropy": 0.0011897350195795298, + "step": 5295 + }, + { + "epoch": 2.8365947482856666, + "grad_norm": 23.14759681156242, + "learning_rate": 8.939445988052574e-09, + "logits/chosen": 0.7812570929527283, + "logits/rejected": 0.8415404558181763, + "logps/chosen": -9.733675003051758, + "logps/rejected": -11.219903945922852, + "loss": 0.3617, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -9.733675003051758, + "rewards/margins": 1.4862289428710938, + "rewards/rejected": -11.219903945922852, + "semantic_entropy": 0.001550258370116353, + "step": 5300 + }, + { + "epoch": 2.839270781067068, + "grad_norm": 28.37485360554531, + "learning_rate": 8.648647270676656e-09, + "logits/chosen": 0.8305708169937134, + "logits/rejected": 0.8369789123535156, + "logps/chosen": -9.811845779418945, + "logps/rejected": -11.167532920837402, + "loss": 0.4223, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -9.811845779418945, + "rewards/margins": 1.3556877374649048, + "rewards/rejected": -11.167532920837402, + "semantic_entropy": 0.0015380210243165493, + "step": 5305 + }, + { + "epoch": 2.8419468138484696, + "grad_norm": 16.333026482611224, + "learning_rate": 8.362615646279991e-09, + "logits/chosen": 0.8135038614273071, + "logits/rejected": 0.8544967770576477, + "logps/chosen": -9.836585998535156, + "logps/rejected": -11.551431655883789, + "loss": 0.379, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -9.836585998535156, + "rewards/margins": 1.7148460149765015, + "rewards/rejected": -11.551431655883789, + "semantic_entropy": 0.0012975989375263453, + "step": 5310 + }, + { + "epoch": 2.8446228466298713, + "grad_norm": 22.628935794259295, + "learning_rate": 8.081353889942466e-09, + "logits/chosen": 0.8904609680175781, + "logits/rejected": 0.9412251710891724, + "logps/chosen": -9.89416217803955, + "logps/rejected": -11.147021293640137, + "loss": 0.4024, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.89416217803955, + "rewards/margins": 1.2528594732284546, + "rewards/rejected": -11.147021293640137, + "semantic_entropy": 0.001227770815603435, + "step": 5315 + }, + { + "epoch": 2.847298879411273, + "grad_norm": 24.067990062164654, + "learning_rate": 7.804864730467042e-09, + "logits/chosen": 0.8739885091781616, + "logits/rejected": 0.9077037572860718, + "logps/chosen": -9.861922264099121, + "logps/rejected": -11.29807186126709, + "loss": 0.3558, + "rewards/accuracies": 0.84375, + "rewards/chosen": -9.861922264099121, + "rewards/margins": 1.4361498355865479, + "rewards/rejected": -11.29807186126709, + "semantic_entropy": 0.0011800903594121337, + "step": 5320 + }, + { + "epoch": 2.8499749121926743, + "grad_norm": 20.07201861954392, + "learning_rate": 7.533150850352665e-09, + "logits/chosen": 0.8092811703681946, + "logits/rejected": 0.8977264165878296, + "logps/chosen": -9.86597728729248, + "logps/rejected": -11.442625045776367, + "loss": 0.3673, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -9.86597728729248, + "rewards/margins": 1.5766479969024658, + "rewards/rejected": -11.442625045776367, + "semantic_entropy": 0.0013031138805672526, + "step": 5325 + }, + { + "epoch": 2.852650944974076, + "grad_norm": 28.074967041760427, + "learning_rate": 7.2662148857686175e-09, + "logits/chosen": 0.8255325555801392, + "logits/rejected": 0.8576027154922485, + "logps/chosen": -9.927534103393555, + "logps/rejected": -11.337235450744629, + "loss": 0.4482, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -9.927534103393555, + "rewards/margins": 1.4097009897232056, + "rewards/rejected": -11.337235450744629, + "semantic_entropy": 0.0012552501866593957, + "step": 5330 + }, + { + "epoch": 2.8553269777554773, + "grad_norm": 20.375284103879604, + "learning_rate": 7.0040594265287635e-09, + "logits/chosen": 0.820387065410614, + "logits/rejected": 0.8405435681343079, + "logps/chosen": -9.774995803833008, + "logps/rejected": -10.992179870605469, + "loss": 0.4593, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.774995803833008, + "rewards/margins": 1.2171828746795654, + "rewards/rejected": -10.992179870605469, + "semantic_entropy": 0.001311628962866962, + "step": 5335 + }, + { + "epoch": 2.858003010536879, + "grad_norm": 20.571805080727156, + "learning_rate": 6.746687016066566e-09, + "logits/chosen": 0.8561931848526001, + "logits/rejected": 0.9219743609428406, + "logps/chosen": -9.802877426147461, + "logps/rejected": -11.207548141479492, + "loss": 0.4, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.802877426147461, + "rewards/margins": 1.404672384262085, + "rewards/rejected": -11.207548141479492, + "semantic_entropy": 0.0014292590785771608, + "step": 5340 + }, + { + "epoch": 2.8606790433182807, + "grad_norm": 22.031680288537032, + "learning_rate": 6.494100151410276e-09, + "logits/chosen": 0.7743942141532898, + "logits/rejected": 0.8276403546333313, + "logps/chosen": -9.865171432495117, + "logps/rejected": -11.20588207244873, + "loss": 0.3737, + "rewards/accuracies": 0.84375, + "rewards/chosen": -9.865171432495117, + "rewards/margins": 1.3407100439071655, + "rewards/rejected": -11.20588207244873, + "semantic_entropy": 0.0012073902180418372, + "step": 5345 + }, + { + "epoch": 2.8633550760996824, + "grad_norm": 23.63055718220825, + "learning_rate": 6.246301283158728e-09, + "logits/chosen": 0.8746307492256165, + "logits/rejected": 0.895931601524353, + "logps/chosen": -9.761825561523438, + "logps/rejected": -11.010538101196289, + "loss": 0.473, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.761825561523438, + "rewards/margins": 1.2487126588821411, + "rewards/rejected": -11.010538101196289, + "semantic_entropy": 0.001414592145010829, + "step": 5350 + }, + { + "epoch": 2.8660311088810837, + "grad_norm": 20.296610881740666, + "learning_rate": 6.0032928154576944e-09, + "logits/chosen": 0.8312317132949829, + "logits/rejected": 0.8839332461357117, + "logps/chosen": -9.876100540161133, + "logps/rejected": -11.113783836364746, + "loss": 0.4206, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -9.876100540161133, + "rewards/margins": 1.2376841306686401, + "rewards/rejected": -11.113783836364746, + "semantic_entropy": 0.0015838369727134705, + "step": 5355 + }, + { + "epoch": 2.8687071416624854, + "grad_norm": 30.87457108658666, + "learning_rate": 5.76507710597629e-09, + "logits/chosen": 0.8078063726425171, + "logits/rejected": 0.8501046895980835, + "logps/chosen": -9.82047176361084, + "logps/rejected": -11.10120964050293, + "loss": 0.4325, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -9.82047176361084, + "rewards/margins": 1.2807366847991943, + "rewards/rejected": -11.10120964050293, + "semantic_entropy": 0.001436726888641715, + "step": 5360 + }, + { + "epoch": 2.8713831744438867, + "grad_norm": 20.020061107451614, + "learning_rate": 5.531656465884438e-09, + "logits/chosen": 0.7786573171615601, + "logits/rejected": 0.8030532598495483, + "logps/chosen": -9.810864448547363, + "logps/rejected": -11.275449752807617, + "loss": 0.402, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.810864448547363, + "rewards/margins": 1.4645856618881226, + "rewards/rejected": -11.275449752807617, + "semantic_entropy": 0.0011968390317633748, + "step": 5365 + }, + { + "epoch": 2.8740592072252884, + "grad_norm": 27.208547849442688, + "learning_rate": 5.303033159830217e-09, + "logits/chosen": 0.8697658777236938, + "logits/rejected": 0.8987666368484497, + "logps/chosen": -9.942388534545898, + "logps/rejected": -11.106074333190918, + "loss": 0.4543, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.942388534545898, + "rewards/margins": 1.1636863946914673, + "rewards/rejected": -11.106074333190918, + "semantic_entropy": 0.0013244937872514129, + "step": 5370 + }, + { + "epoch": 2.87673524000669, + "grad_norm": 23.175199847112445, + "learning_rate": 5.079209405917939e-09, + "logits/chosen": 0.7859164476394653, + "logits/rejected": 0.8312576413154602, + "logps/chosen": -9.579522132873535, + "logps/rejected": -11.244876861572266, + "loss": 0.3603, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -9.579522132873535, + "rewards/margins": 1.665353775024414, + "rewards/rejected": -11.244876861572266, + "semantic_entropy": 0.001594201079569757, + "step": 5375 + }, + { + "epoch": 2.879411272788092, + "grad_norm": 19.400301117431837, + "learning_rate": 4.860187375686664e-09, + "logits/chosen": 0.789514422416687, + "logits/rejected": 0.8606871366500854, + "logps/chosen": -9.77333927154541, + "logps/rejected": -11.256834983825684, + "loss": 0.3748, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -9.77333927154541, + "rewards/margins": 1.4834961891174316, + "rewards/rejected": -11.256834983825684, + "semantic_entropy": 0.001359016285277903, + "step": 5380 + }, + { + "epoch": 2.882087305569493, + "grad_norm": 16.661476052195837, + "learning_rate": 4.64596919408905e-09, + "logits/chosen": 0.8640682101249695, + "logits/rejected": 0.8913204073905945, + "logps/chosen": -9.612969398498535, + "logps/rejected": -11.022314071655273, + "loss": 0.402, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.612969398498535, + "rewards/margins": 1.4093445539474487, + "rewards/rejected": -11.022314071655273, + "semantic_entropy": 0.0015766730066388845, + "step": 5385 + }, + { + "epoch": 2.884763338350895, + "grad_norm": 23.611323349348925, + "learning_rate": 4.436556939470814e-09, + "logits/chosen": 0.7981586456298828, + "logits/rejected": 0.8657184839248657, + "logps/chosen": -10.07356071472168, + "logps/rejected": -11.186826705932617, + "loss": 0.4737, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.07356071472168, + "rewards/margins": 1.1132649183273315, + "rewards/rejected": -11.186826705932617, + "semantic_entropy": 0.001212230185046792, + "step": 5390 + }, + { + "epoch": 2.887439371132296, + "grad_norm": 24.769868059281617, + "learning_rate": 4.23195264355064e-09, + "logits/chosen": 0.688225269317627, + "logits/rejected": 0.7675420641899109, + "logps/chosen": -9.678533554077148, + "logps/rejected": -11.024267196655273, + "loss": 0.4225, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -9.678533554077148, + "rewards/margins": 1.3457330465316772, + "rewards/rejected": -11.024267196655273, + "semantic_entropy": 0.001531310030259192, + "step": 5395 + }, + { + "epoch": 2.890115403913698, + "grad_norm": 23.697079058959545, + "learning_rate": 4.032158291400245e-09, + "logits/chosen": 0.7804639339447021, + "logits/rejected": 0.8653789758682251, + "logps/chosen": -9.64900016784668, + "logps/rejected": -11.365049362182617, + "loss": 0.329, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -9.64900016784668, + "rewards/margins": 1.7160485982894897, + "rewards/rejected": -11.365049362182617, + "semantic_entropy": 0.0016292607178911567, + "step": 5400 + }, + { + "epoch": 2.8927914366950995, + "grad_norm": 20.18703093788473, + "learning_rate": 3.837175821425398e-09, + "logits/chosen": 0.8109928369522095, + "logits/rejected": 0.8580228686332703, + "logps/chosen": -9.768526077270508, + "logps/rejected": -11.133955001831055, + "loss": 0.4179, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.768526077270508, + "rewards/margins": 1.3654298782348633, + "rewards/rejected": -11.133955001831055, + "semantic_entropy": 0.0016449004178866744, + "step": 5405 + }, + { + "epoch": 2.8954674694765012, + "grad_norm": 14.944854500276783, + "learning_rate": 3.6470071253467683e-09, + "logits/chosen": 0.8247249722480774, + "logits/rejected": 0.8406414985656738, + "logps/chosen": -9.951112747192383, + "logps/rejected": -11.39813232421875, + "loss": 0.4228, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.951112747192383, + "rewards/margins": 1.4470199346542358, + "rewards/rejected": -11.39813232421875, + "semantic_entropy": 0.001148298499174416, + "step": 5410 + }, + { + "epoch": 2.8981435022579025, + "grad_norm": 17.13225055698541, + "learning_rate": 3.461654048181939e-09, + "logits/chosen": 0.810439944267273, + "logits/rejected": 0.904525637626648, + "logps/chosen": -10.016260147094727, + "logps/rejected": -11.255754470825195, + "loss": 0.4262, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -10.016260147094727, + "rewards/margins": 1.2394943237304688, + "rewards/rejected": -11.255754470825195, + "semantic_entropy": 0.0010481254430487752, + "step": 5415 + }, + { + "epoch": 2.9008195350393042, + "grad_norm": 21.19682630785255, + "learning_rate": 3.281118388227255e-09, + "logits/chosen": 0.8494071960449219, + "logits/rejected": 0.8823550343513489, + "logps/chosen": -9.834370613098145, + "logps/rejected": -11.027946472167969, + "loss": 0.4672, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -9.834370613098145, + "rewards/margins": 1.193576693534851, + "rewards/rejected": -11.027946472167969, + "semantic_entropy": 0.001247903099283576, + "step": 5420 + }, + { + "epoch": 2.903495567820706, + "grad_norm": 25.992370109808167, + "learning_rate": 3.1054018970405048e-09, + "logits/chosen": 0.8348292112350464, + "logits/rejected": 0.8578389286994934, + "logps/chosen": -9.816828727722168, + "logps/rejected": -11.25603199005127, + "loss": 0.4049, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.816828727722168, + "rewards/margins": 1.4392026662826538, + "rewards/rejected": -11.25603199005127, + "semantic_entropy": 0.0012755084317177534, + "step": 5425 + }, + { + "epoch": 2.906171600602107, + "grad_norm": 22.319154102335233, + "learning_rate": 2.9345062794238207e-09, + "logits/chosen": 0.8351479768753052, + "logits/rejected": 0.9166293144226074, + "logps/chosen": -9.844882011413574, + "logps/rejected": -11.379692077636719, + "loss": 0.35, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.844882011413574, + "rewards/margins": 1.5348093509674072, + "rewards/rejected": -11.379692077636719, + "semantic_entropy": 0.0016486002132296562, + "step": 5430 + }, + { + "epoch": 2.908847633383509, + "grad_norm": 19.717507438885043, + "learning_rate": 2.7684331934072492e-09, + "logits/chosen": 0.7874764204025269, + "logits/rejected": 0.8273738026618958, + "logps/chosen": -9.670753479003906, + "logps/rejected": -11.1673583984375, + "loss": 0.3777, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -9.670753479003906, + "rewards/margins": 1.4966033697128296, + "rewards/rejected": -11.1673583984375, + "semantic_entropy": 0.001804637722671032, + "step": 5435 + }, + { + "epoch": 2.9115236661649107, + "grad_norm": 20.47002458984704, + "learning_rate": 2.6071842502326526e-09, + "logits/chosen": 0.8280852437019348, + "logits/rejected": 0.8752776980400085, + "logps/chosen": -9.854182243347168, + "logps/rejected": -11.047213554382324, + "loss": 0.4253, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -9.854182243347168, + "rewards/margins": 1.1930307149887085, + "rewards/rejected": -11.047213554382324, + "semantic_entropy": 0.0011610215296968818, + "step": 5440 + }, + { + "epoch": 2.9141996989463124, + "grad_norm": 27.24590899012989, + "learning_rate": 2.450761014337888e-09, + "logits/chosen": 0.8899133801460266, + "logits/rejected": 0.9149841070175171, + "logps/chosen": -9.686140060424805, + "logps/rejected": -11.125383377075195, + "loss": 0.4581, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -9.686140060424805, + "rewards/margins": 1.4392426013946533, + "rewards/rejected": -11.125383377075195, + "semantic_entropy": 0.0013887417735531926, + "step": 5445 + }, + { + "epoch": 2.9168757317277136, + "grad_norm": 30.201566273141413, + "learning_rate": 2.299165003341985e-09, + "logits/chosen": 0.8797470331192017, + "logits/rejected": 0.9165847897529602, + "logps/chosen": -9.838947296142578, + "logps/rejected": -11.165335655212402, + "loss": 0.4477, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.838947296142578, + "rewards/margins": 1.3263883590698242, + "rewards/rejected": -11.165335655212402, + "semantic_entropy": 0.0014228606596589088, + "step": 5450 + }, + { + "epoch": 2.9195517645091154, + "grad_norm": 21.66371324186062, + "learning_rate": 2.1523976880299945e-09, + "logits/chosen": 0.7495226263999939, + "logits/rejected": 0.8447777032852173, + "logps/chosen": -9.877795219421387, + "logps/rejected": -10.990615844726562, + "loss": 0.4624, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -9.877795219421387, + "rewards/margins": 1.1128205060958862, + "rewards/rejected": -10.990615844726562, + "semantic_entropy": 0.001258770003914833, + "step": 5455 + }, + { + "epoch": 2.9222277972905166, + "grad_norm": 19.131164191986134, + "learning_rate": 2.010460492339161e-09, + "logits/chosen": 0.7976396083831787, + "logits/rejected": 0.8642821311950684, + "logps/chosen": -9.621539115905762, + "logps/rejected": -11.064565658569336, + "loss": 0.3859, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -9.621539115905762, + "rewards/margins": 1.4430257081985474, + "rewards/rejected": -11.064565658569336, + "semantic_entropy": 0.0015615615993738174, + "step": 5460 + }, + { + "epoch": 2.9249038300719183, + "grad_norm": 16.998983409706852, + "learning_rate": 1.8733547933446614e-09, + "logits/chosen": 0.8069744110107422, + "logits/rejected": 0.8920931816101074, + "logps/chosen": -9.918710708618164, + "logps/rejected": -11.107327461242676, + "loss": 0.4295, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.918710708618164, + "rewards/margins": 1.1886180639266968, + "rewards/rejected": -11.107327461242676, + "semantic_entropy": 0.001179686514660716, + "step": 5465 + }, + { + "epoch": 2.92757986285332, + "grad_norm": 34.20627907884556, + "learning_rate": 1.7410819212467231e-09, + "logits/chosen": 0.8224443197250366, + "logits/rejected": 0.8717595338821411, + "logps/chosen": -9.855988502502441, + "logps/rejected": -11.113961219787598, + "loss": 0.4348, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.855988502502441, + "rewards/margins": 1.2579724788665771, + "rewards/rejected": -11.113961219787598, + "semantic_entropy": 0.0013345398474484682, + "step": 5470 + }, + { + "epoch": 2.9302558956347218, + "grad_norm": 21.07119537810863, + "learning_rate": 1.613643159357192e-09, + "logits/chosen": 0.8732544183731079, + "logits/rejected": 0.8522858619689941, + "logps/chosen": -9.735010147094727, + "logps/rejected": -10.999938011169434, + "loss": 0.4045, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.735010147094727, + "rewards/margins": 1.2649286985397339, + "rewards/rejected": -10.999938011169434, + "semantic_entropy": 0.0016018247697502375, + "step": 5475 + }, + { + "epoch": 2.932931928416123, + "grad_norm": 22.6554353032534, + "learning_rate": 1.4910397440875967e-09, + "logits/chosen": 0.795121431350708, + "logits/rejected": 0.8548393249511719, + "logps/chosen": -9.826266288757324, + "logps/rejected": -11.189390182495117, + "loss": 0.4163, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -9.826266288757324, + "rewards/margins": 1.363124132156372, + "rewards/rejected": -11.189390182495117, + "semantic_entropy": 0.0013476324966177344, + "step": 5480 + }, + { + "epoch": 2.9356079611975248, + "grad_norm": 25.149551686278766, + "learning_rate": 1.3732728649368253e-09, + "logits/chosen": 0.8276500701904297, + "logits/rejected": 0.8823320269584656, + "logps/chosen": -9.684768676757812, + "logps/rejected": -10.858797073364258, + "loss": 0.4437, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -9.684768676757812, + "rewards/margins": 1.1740278005599976, + "rewards/rejected": -10.858797073364258, + "semantic_entropy": 0.0018609801772981882, + "step": 5485 + }, + { + "epoch": 2.938283993978926, + "grad_norm": 25.88498942606627, + "learning_rate": 1.260343664479524e-09, + "logits/chosen": 0.7547510862350464, + "logits/rejected": 0.7992917895317078, + "logps/chosen": -9.716946601867676, + "logps/rejected": -10.954937934875488, + "loss": 0.4331, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.716946601867676, + "rewards/margins": 1.2379915714263916, + "rewards/rejected": -10.954937934875488, + "semantic_entropy": 0.0013116684276610613, + "step": 5490 + }, + { + "epoch": 2.9409600267603278, + "grad_norm": 18.9946324561305, + "learning_rate": 1.1522532383554384e-09, + "logits/chosen": 0.8538810014724731, + "logits/rejected": 0.9081370234489441, + "logps/chosen": -9.737415313720703, + "logps/rejected": -11.309637069702148, + "loss": 0.3575, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -9.737415313720703, + "rewards/margins": 1.5722216367721558, + "rewards/rejected": -11.309637069702148, + "semantic_entropy": 0.0014735187869518995, + "step": 5495 + }, + { + "epoch": 2.9436360595417295, + "grad_norm": 18.123811698473638, + "learning_rate": 1.049002635258256e-09, + "logits/chosen": 0.8666974902153015, + "logits/rejected": 0.9071024656295776, + "logps/chosen": -9.86131763458252, + "logps/rejected": -11.140911102294922, + "loss": 0.4182, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.86131763458252, + "rewards/margins": 1.2795933485031128, + "rewards/rejected": -11.140911102294922, + "semantic_entropy": 0.0012588893296197057, + "step": 5500 + }, + { + "epoch": 2.946312092323131, + "grad_norm": 28.867054705404758, + "learning_rate": 9.505928569258358e-10, + "logits/chosen": 0.828734278678894, + "logits/rejected": 0.8480997085571289, + "logps/chosen": -9.789422988891602, + "logps/rejected": -11.044143676757812, + "loss": 0.4458, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.789422988891602, + "rewards/margins": 1.2547214031219482, + "rewards/rejected": -11.044143676757812, + "semantic_entropy": 0.0014515508664771914, + "step": 5505 + }, + { + "epoch": 2.9489881251045325, + "grad_norm": 20.361191522004823, + "learning_rate": 8.57024858130273e-10, + "logits/chosen": 0.8103793859481812, + "logits/rejected": 0.8867511749267578, + "logps/chosen": -9.793035507202148, + "logps/rejected": -11.505876541137695, + "loss": 0.3433, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -9.793035507202148, + "rewards/margins": 1.7128407955169678, + "rewards/rejected": -11.505876541137695, + "semantic_entropy": 0.0011786060640588403, + "step": 5510 + }, + { + "epoch": 2.951664157885934, + "grad_norm": 25.4932234461195, + "learning_rate": 7.682995466686826e-10, + "logits/chosen": 0.7820402383804321, + "logits/rejected": 0.8293735384941101, + "logps/chosen": -9.81843376159668, + "logps/rejected": -11.206514358520508, + "loss": 0.4035, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -9.81843376159668, + "rewards/margins": 1.3880798816680908, + "rewards/rejected": -11.206514358520508, + "semantic_entropy": 0.001364008872769773, + "step": 5515 + }, + { + "epoch": 2.9543401906673354, + "grad_norm": 22.786471299719448, + "learning_rate": 6.844177833543741e-10, + "logits/chosen": 0.8798080682754517, + "logits/rejected": 0.8902498483657837, + "logps/chosen": -9.707275390625, + "logps/rejected": -11.065264701843262, + "loss": 0.3739, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -9.707275390625, + "rewards/margins": 1.3579896688461304, + "rewards/rejected": -11.065264701843262, + "semantic_entropy": 0.0014018730726093054, + "step": 5520 + }, + { + "epoch": 2.957016223448737, + "grad_norm": 24.344487883516077, + "learning_rate": 6.053803820087467e-10, + "logits/chosen": 0.8397024273872375, + "logits/rejected": 0.9295798540115356, + "logps/chosen": -9.954577445983887, + "logps/rejected": -11.368110656738281, + "loss": 0.4164, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.954577445983887, + "rewards/margins": 1.4135328531265259, + "rewards/rejected": -11.368110656738281, + "semantic_entropy": 0.0010031659621745348, + "step": 5525 + }, + { + "epoch": 2.959692256230139, + "grad_norm": 22.61990267466979, + "learning_rate": 5.311881094528514e-10, + "logits/chosen": 0.8053072094917297, + "logits/rejected": 0.8756014108657837, + "logps/chosen": -10.003788948059082, + "logps/rejected": -11.16923713684082, + "loss": 0.4491, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.003788948059082, + "rewards/margins": 1.1654479503631592, + "rewards/rejected": -11.16923713684082, + "semantic_entropy": 0.001166566857136786, + "step": 5530 + }, + { + "epoch": 2.9623682890115406, + "grad_norm": 25.594286670666698, + "learning_rate": 4.6184168550050806e-10, + "logits/chosen": 0.8106497526168823, + "logits/rejected": 0.8661069869995117, + "logps/chosen": -9.88862419128418, + "logps/rejected": -11.237882614135742, + "loss": 0.4061, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -9.88862419128418, + "rewards/margins": 1.3492584228515625, + "rewards/rejected": -11.237882614135742, + "semantic_entropy": 0.0012052215170115232, + "step": 5535 + }, + { + "epoch": 2.965044321792942, + "grad_norm": 22.658593939455915, + "learning_rate": 3.973417829510328e-10, + "logits/chosen": 0.7906460762023926, + "logits/rejected": 0.8491800427436829, + "logps/chosen": -9.941095352172852, + "logps/rejected": -11.274066925048828, + "loss": 0.4179, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.941095352172852, + "rewards/margins": 1.332972526550293, + "rewards/rejected": -11.274066925048828, + "semantic_entropy": 0.0011057687224820256, + "step": 5540 + }, + { + "epoch": 2.9677203545743436, + "grad_norm": 22.045194562461553, + "learning_rate": 3.3768902758274377e-10, + "logits/chosen": 0.847141444683075, + "logits/rejected": 0.885520339012146, + "logps/chosen": -9.864678382873535, + "logps/rejected": -11.155853271484375, + "loss": 0.4141, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.864678382873535, + "rewards/margins": 1.2911745309829712, + "rewards/rejected": -11.155853271484375, + "semantic_entropy": 0.0010961454827338457, + "step": 5545 + }, + { + "epoch": 2.970396387355745, + "grad_norm": 16.922925357956778, + "learning_rate": 2.8288399814691e-10, + "logits/chosen": 0.8648706674575806, + "logits/rejected": 0.9044130444526672, + "logps/chosen": -9.688464164733887, + "logps/rejected": -10.932621955871582, + "loss": 0.4164, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.688464164733887, + "rewards/margins": 1.2441574335098267, + "rewards/rejected": -10.932621955871582, + "semantic_entropy": 0.0013283784501254559, + "step": 5550 + }, + { + "epoch": 2.9730724201371466, + "grad_norm": 25.775353395913132, + "learning_rate": 2.3292722636220066e-10, + "logits/chosen": 0.7877558469772339, + "logits/rejected": 0.8715565800666809, + "logps/chosen": -9.736814498901367, + "logps/rejected": -11.426549911499023, + "loss": 0.3466, + "rewards/accuracies": 0.84375, + "rewards/chosen": -9.736814498901367, + "rewards/margins": 1.6897351741790771, + "rewards/rejected": -11.426549911499023, + "semantic_entropy": 0.001374770887196064, + "step": 5555 + }, + { + "epoch": 2.9757484529185483, + "grad_norm": 24.401341154573725, + "learning_rate": 1.8781919690946668e-10, + "logits/chosen": 0.7978461980819702, + "logits/rejected": 0.8458053469657898, + "logps/chosen": -9.92179012298584, + "logps/rejected": -11.083666801452637, + "loss": 0.4448, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.92179012298584, + "rewards/margins": 1.1618760824203491, + "rewards/rejected": -11.083666801452637, + "semantic_entropy": 0.001205159118399024, + "step": 5560 + }, + { + "epoch": 2.97842448569995, + "grad_norm": 23.987542667982627, + "learning_rate": 1.4756034742696711e-10, + "logits/chosen": 0.8300431370735168, + "logits/rejected": 0.9039738774299622, + "logps/chosen": -9.867820739746094, + "logps/rejected": -11.240914344787598, + "loss": 0.4123, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.867820739746094, + "rewards/margins": 1.3730926513671875, + "rewards/rejected": -11.240914344787598, + "semantic_entropy": 0.0011399075156077743, + "step": 5565 + }, + { + "epoch": 2.9811005184813513, + "grad_norm": 20.018421831607807, + "learning_rate": 1.12151068506261e-10, + "logits/chosen": 0.8630874752998352, + "logits/rejected": 0.9077743291854858, + "logps/chosen": -9.715357780456543, + "logps/rejected": -11.356972694396973, + "loss": 0.3585, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -9.715357780456543, + "rewards/margins": 1.6416149139404297, + "rewards/rejected": -11.356972694396973, + "semantic_entropy": 0.0017618630081415176, + "step": 5570 + }, + { + "epoch": 2.983776551262753, + "grad_norm": 19.995404454240774, + "learning_rate": 8.159170368826629e-11, + "logits/chosen": 0.8361412882804871, + "logits/rejected": 0.8893247842788696, + "logps/chosen": -9.477738380432129, + "logps/rejected": -10.875368118286133, + "loss": 0.4263, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.477738380432129, + "rewards/margins": 1.3976287841796875, + "rewards/rejected": -10.875368118286133, + "semantic_entropy": 0.001689505996182561, + "step": 5575 + }, + { + "epoch": 2.9864525840441547, + "grad_norm": 25.119002068756046, + "learning_rate": 5.588254946015114e-11, + "logits/chosen": 0.8052582740783691, + "logits/rejected": 0.8892769813537598, + "logps/chosen": -9.728483200073242, + "logps/rejected": -11.229433059692383, + "loss": 0.3915, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -9.728483200073242, + "rewards/margins": 1.5009489059448242, + "rewards/rejected": -11.229433059692383, + "semantic_entropy": 0.0017851864686235785, + "step": 5580 + }, + { + "epoch": 2.989128616825556, + "grad_norm": 17.38170767201798, + "learning_rate": 3.502385525216978e-11, + "logits/chosen": 0.7621601819992065, + "logits/rejected": 0.8389317393302917, + "logps/chosen": -9.730030059814453, + "logps/rejected": -11.127847671508789, + "loss": 0.3831, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -9.730030059814453, + "rewards/margins": 1.3978168964385986, + "rewards/rejected": -11.127847671508789, + "semantic_entropy": 0.0015055348630994558, + "step": 5585 + }, + { + "epoch": 2.9918046496069577, + "grad_norm": 18.851089955324714, + "learning_rate": 1.901582343555308e-11, + "logits/chosen": 0.8362342119216919, + "logits/rejected": 0.894599437713623, + "logps/chosen": -9.942944526672363, + "logps/rejected": -11.178738594055176, + "loss": 0.4489, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -9.942944526672363, + "rewards/margins": 1.2357933521270752, + "rewards/rejected": -11.178738594055176, + "semantic_entropy": 0.001227195025421679, + "step": 5590 + }, + { + "epoch": 2.9944806823883594, + "grad_norm": 33.481853049291296, + "learning_rate": 7.858609320232634e-12, + "logits/chosen": 0.8312174677848816, + "logits/rejected": 0.907203197479248, + "logps/chosen": -9.739818572998047, + "logps/rejected": -11.097038269042969, + "loss": 0.435, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -9.739818572998047, + "rewards/margins": 1.3572200536727905, + "rewards/rejected": -11.097038269042969, + "semantic_entropy": 0.0013708441983908415, + "step": 5595 + }, + { + "epoch": 2.9971567151697607, + "grad_norm": 32.77365513835394, + "learning_rate": 1.5523211535639624e-12, + "logits/chosen": 0.8438766598701477, + "logits/rejected": 0.876266360282898, + "logps/chosen": -9.731843948364258, + "logps/rejected": -11.251733779907227, + "loss": 0.4023, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.731843948364258, + "rewards/margins": 1.5198904275894165, + "rewards/rejected": -11.251733779907227, + "semantic_entropy": 0.001664994633756578, + "step": 5600 + }, + { + "epoch": 2.9971567151697607, + "eval_logits/chosen": 0.9163224101066589, + "eval_logits/rejected": 0.9576993584632874, + "eval_logps/chosen": -9.975739479064941, + "eval_logps/rejected": -11.105352401733398, + "eval_loss": 0.52450031042099, + "eval_rewards/accuracies": 0.7240356206893921, + "eval_rewards/chosen": -9.975739479064941, + "eval_rewards/margins": 1.1296132802963257, + "eval_rewards/rejected": -11.105352401733398, + "eval_runtime": 35.057, + "eval_samples_per_second": 38.366, + "eval_semantic_entropy": 0.0012647128896787763, + "eval_steps_per_second": 9.613, + "step": 5600 + }, + { + "epoch": 2.999297541394882, + "step": 5604, + "total_flos": 0.0, + "train_loss": 0.5450739759491819, + "train_runtime": 29046.9509, + "train_samples_per_second": 6.175, + "train_steps_per_second": 0.193 + } + ], + "logging_steps": 5, + "max_steps": 5604, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}