{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9940933254577673, "eval_steps": 43, "global_step": 422, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004725339633786178, "grad_norm": 1.1176782705315427, "learning_rate": 3.846153846153846e-08, "logits/chosen": -1.5937305688858032, "logits/rejected": -1.7021960020065308, "logps/chosen": -247.54559326171875, "logps/rejected": -179.0218048095703, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.009450679267572357, "grad_norm": 1.2535773078918948, "learning_rate": 7.692307692307692e-08, "logits/chosen": -1.7181015014648438, "logits/rejected": -1.644026756286621, "logps/chosen": -259.1505432128906, "logps/rejected": -241.68020629882812, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.014176018901358535, "grad_norm": 0.995484839907434, "learning_rate": 1.1538461538461539e-07, "logits/chosen": -0.8613071441650391, "logits/rejected": -0.8891040682792664, "logps/chosen": -230.91070556640625, "logps/rejected": -219.62979125976562, "loss": 0.6934, "rewards/accuracies": 0.390625, "rewards/chosen": 6.782727723475546e-05, "rewards/margins": -0.0004302160523366183, "rewards/rejected": 0.0004980433732271194, "step": 3 }, { "epoch": 0.018901358535144713, "grad_norm": 1.1568688062326662, "learning_rate": 1.5384615384615385e-07, "logits/chosen": -1.1649622917175293, "logits/rejected": -1.131172776222229, "logps/chosen": -184.93499755859375, "logps/rejected": -184.127197265625, "loss": 0.693, "rewards/accuracies": 0.421875, "rewards/chosen": -0.0002185619086958468, "rewards/margins": -0.0006310059688985348, "rewards/rejected": 0.0004124442348256707, "step": 4 }, { "epoch": 0.02362669816893089, "grad_norm": 1.0155652752384032, "learning_rate": 1.9230769230769231e-07, "logits/chosen": -1.8650751113891602, "logits/rejected": -1.9386688470840454, "logps/chosen": -193.21636962890625, "logps/rejected": -175.6696014404297, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": 0.00019388733198866248, "rewards/margins": -0.00023763455101288855, "rewards/rejected": 0.0004315219703130424, "step": 5 }, { "epoch": 0.02835203780271707, "grad_norm": 1.2022303509332404, "learning_rate": 2.3076923076923078e-07, "logits/chosen": -1.8849067687988281, "logits/rejected": -1.8837637901306152, "logps/chosen": -234.3896484375, "logps/rejected": -218.09625244140625, "loss": 0.6934, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0004923291853629053, "rewards/margins": 0.0006284262635745108, "rewards/rejected": -0.00013609707821160555, "step": 6 }, { "epoch": 0.03307737743650325, "grad_norm": 1.136952564893261, "learning_rate": 2.692307692307692e-07, "logits/chosen": -1.816144585609436, "logits/rejected": -1.934072494506836, "logps/chosen": -246.14027404785156, "logps/rejected": -177.02993774414062, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": -2.0834413589909673e-05, "rewards/margins": 0.001102915033698082, "rewards/rejected": -0.0011237493017688394, "step": 7 }, { "epoch": 0.03780271707028943, "grad_norm": 1.1596582232433024, "learning_rate": 3.076923076923077e-07, "logits/chosen": -1.8363107442855835, "logits/rejected": -1.8167006969451904, "logps/chosen": -239.65370178222656, "logps/rejected": -221.20333862304688, "loss": 0.6932, "rewards/accuracies": 0.4375, "rewards/chosen": -0.00017343120998702943, "rewards/margins": -0.0007941695512272418, "rewards/rejected": 0.0006207384867593646, "step": 8 }, { "epoch": 0.042528056704075605, "grad_norm": 1.1810520949859216, "learning_rate": 3.461538461538461e-07, "logits/chosen": -1.6903538703918457, "logits/rejected": -1.7734307050704956, "logps/chosen": -239.26535034179688, "logps/rejected": -193.875244140625, "loss": 0.6929, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0003690614248625934, "rewards/margins": 0.00022412401449400932, "rewards/rejected": 0.00014493743947241455, "step": 9 }, { "epoch": 0.04725339633786178, "grad_norm": 1.253373850013781, "learning_rate": 3.8461538461538463e-07, "logits/chosen": -1.5372889041900635, "logits/rejected": -1.5536653995513916, "logps/chosen": -222.93399047851562, "logps/rejected": -223.1899871826172, "loss": 0.6929, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0005048069870099425, "rewards/margins": 0.000533790560439229, "rewards/rejected": -0.0010385976638644934, "step": 10 }, { "epoch": 0.05197873597164796, "grad_norm": 1.1609876885112298, "learning_rate": 4.2307692307692304e-07, "logits/chosen": -1.20901620388031, "logits/rejected": -1.2452011108398438, "logps/chosen": -274.1497497558594, "logps/rejected": -227.36790466308594, "loss": 0.6929, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0004496507463045418, "rewards/margins": -0.0006296815699897707, "rewards/rejected": 0.0001800309109967202, "step": 11 }, { "epoch": 0.05670407560543414, "grad_norm": 1.1166995258772006, "learning_rate": 4.6153846153846156e-07, "logits/chosen": -1.8701603412628174, "logits/rejected": -1.8107975721359253, "logps/chosen": -219.3397674560547, "logps/rejected": -232.28269958496094, "loss": 0.6927, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0005328843253664672, "rewards/margins": 0.000161867166752927, "rewards/rejected": -0.0006947515066713095, "step": 12 }, { "epoch": 0.06142941523922032, "grad_norm": 1.1010862122779934, "learning_rate": 5e-07, "logits/chosen": -1.3035281896591187, "logits/rejected": -1.3319075107574463, "logps/chosen": -200.9400634765625, "logps/rejected": -177.5894317626953, "loss": 0.6924, "rewards/accuracies": 0.5, "rewards/chosen": 0.00024598470190539956, "rewards/margins": 0.001570576336234808, "rewards/rejected": -0.0013245916925370693, "step": 13 }, { "epoch": 0.0661547548730065, "grad_norm": 1.109450926366569, "learning_rate": 4.999926250172797e-07, "logits/chosen": -1.2467422485351562, "logits/rejected": -1.2619496583938599, "logps/chosen": -247.1371612548828, "logps/rejected": -232.03895568847656, "loss": 0.6925, "rewards/accuracies": 0.640625, "rewards/chosen": 0.00045037176460027695, "rewards/margins": 0.0029086670838296413, "rewards/rejected": -0.0024582953192293644, "step": 14 }, { "epoch": 0.07088009450679268, "grad_norm": 1.0686336079566285, "learning_rate": 4.999705005042417e-07, "logits/chosen": -0.9053488969802856, "logits/rejected": -0.9105295538902283, "logps/chosen": -200.528076171875, "logps/rejected": -191.2373046875, "loss": 0.6924, "rewards/accuracies": 0.484375, "rewards/chosen": -0.0003694885817822069, "rewards/margins": 0.001977597363293171, "rewards/rejected": -0.00234708609059453, "step": 15 }, { "epoch": 0.07560543414057885, "grad_norm": 1.1198403704053106, "learning_rate": 4.999336277662292e-07, "logits/chosen": -1.382132887840271, "logits/rejected": -1.3727940320968628, "logps/chosen": -237.94508361816406, "logps/rejected": -251.71519470214844, "loss": 0.6924, "rewards/accuracies": 0.640625, "rewards/chosen": -3.6539247957989573e-05, "rewards/margins": 0.002266494557261467, "rewards/rejected": -0.002303033834323287, "step": 16 }, { "epoch": 0.08033077377436504, "grad_norm": 1.022581880493408, "learning_rate": 4.998820089787287e-07, "logits/chosen": -1.0172172784805298, "logits/rejected": -1.0724008083343506, "logps/chosen": -232.03070068359375, "logps/rejected": -214.65155029296875, "loss": 0.6919, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0009355681831948459, "rewards/margins": 0.002296539256349206, "rewards/rejected": -0.003232107497751713, "step": 17 }, { "epoch": 0.08505611340815121, "grad_norm": 1.127368600628108, "learning_rate": 4.998156471872415e-07, "logits/chosen": -1.6294444799423218, "logits/rejected": -1.6482771635055542, "logps/chosen": -226.63442993164062, "logps/rejected": -209.50799560546875, "loss": 0.6913, "rewards/accuracies": 0.671875, "rewards/chosen": 0.0007066840189509094, "rewards/margins": 0.005803423933684826, "rewards/rejected": -0.005096739623695612, "step": 18 }, { "epoch": 0.0897814530419374, "grad_norm": 1.1009230447057081, "learning_rate": 4.997345463071041e-07, "logits/chosen": -1.9955631494522095, "logits/rejected": -1.9177535772323608, "logps/chosen": -219.0171661376953, "logps/rejected": -204.72091674804688, "loss": 0.6906, "rewards/accuracies": 0.515625, "rewards/chosen": -0.001279333489947021, "rewards/margins": 0.003984270617365837, "rewards/rejected": -0.005263603758066893, "step": 19 }, { "epoch": 0.09450679267572357, "grad_norm": 1.1219432468145276, "learning_rate": 4.996387111232572e-07, "logits/chosen": -0.9923038482666016, "logits/rejected": -1.0317736864089966, "logps/chosen": -207.70364379882812, "logps/rejected": -210.37612915039062, "loss": 0.6903, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0004186414007563144, "rewards/margins": 0.007054868154227734, "rewards/rejected": -0.007473509293049574, "step": 20 }, { "epoch": 0.09923213230950975, "grad_norm": 1.1456255677406275, "learning_rate": 4.995281472899636e-07, "logits/chosen": -1.2455811500549316, "logits/rejected": -1.2986594438552856, "logps/chosen": -260.4434814453125, "logps/rejected": -239.2939453125, "loss": 0.6899, "rewards/accuracies": 0.609375, "rewards/chosen": -0.001413986086845398, "rewards/margins": 0.006085187196731567, "rewards/rejected": -0.007499172817915678, "step": 21 }, { "epoch": 0.10395747194329592, "grad_norm": 1.0796373972108642, "learning_rate": 4.99402861330474e-07, "logits/chosen": -1.6776717901229858, "logits/rejected": -1.6839845180511475, "logps/chosen": -243.64987182617188, "logps/rejected": -224.65371704101562, "loss": 0.6898, "rewards/accuracies": 0.640625, "rewards/chosen": -0.0021368232555687428, "rewards/margins": 0.007371032610535622, "rewards/rejected": -0.009507855400443077, "step": 22 }, { "epoch": 0.10868281157708211, "grad_norm": 1.067721897185461, "learning_rate": 4.992628606366425e-07, "logits/chosen": -1.6994775533676147, "logits/rejected": -1.6823248863220215, "logps/chosen": -184.07522583007812, "logps/rejected": -201.24017333984375, "loss": 0.6892, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0037174485623836517, "rewards/margins": 0.005765101406723261, "rewards/rejected": -0.009482549503445625, "step": 23 }, { "epoch": 0.11340815121086828, "grad_norm": 1.1222480052149575, "learning_rate": 4.991081534684911e-07, "logits/chosen": -1.3170721530914307, "logits/rejected": -1.3209936618804932, "logps/chosen": -173.28440856933594, "logps/rejected": -180.39111328125, "loss": 0.6882, "rewards/accuracies": 0.703125, "rewards/chosen": -0.0013278307160362601, "rewards/margins": 0.01227161381393671, "rewards/rejected": -0.013599444180727005, "step": 24 }, { "epoch": 0.11813349084465447, "grad_norm": 1.1336846530783633, "learning_rate": 4.98938748953721e-07, "logits/chosen": -1.2200762033462524, "logits/rejected": -1.2801724672317505, "logps/chosen": -228.53701782226562, "logps/rejected": -209.99066162109375, "loss": 0.6876, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0031590620055794716, "rewards/margins": 0.014617552980780602, "rewards/rejected": -0.017776615917682648, "step": 25 }, { "epoch": 0.12285883047844064, "grad_norm": 1.1070134845717687, "learning_rate": 4.987546570871754e-07, "logits/chosen": -1.7048778533935547, "logits/rejected": -1.6716248989105225, "logps/chosen": -237.16851806640625, "logps/rejected": -234.78970336914062, "loss": 0.6868, "rewards/accuracies": 0.65625, "rewards/chosen": -0.006663296837359667, "rewards/margins": 0.013624398037791252, "rewards/rejected": -0.02028769627213478, "step": 26 }, { "epoch": 0.1275841701122268, "grad_norm": 1.1180888707264458, "learning_rate": 4.985558887302488e-07, "logits/chosen": -1.692581057548523, "logits/rejected": -1.7662020921707153, "logps/chosen": -197.77049255371094, "logps/rejected": -182.52011108398438, "loss": 0.686, "rewards/accuracies": 0.5625, "rewards/chosen": -0.003295771311968565, "rewards/margins": 0.009790323674678802, "rewards/rejected": -0.01308609452098608, "step": 27 }, { "epoch": 0.132309509746013, "grad_norm": 1.1912060262420856, "learning_rate": 4.983424556102468e-07, "logits/chosen": -1.8331196308135986, "logits/rejected": -1.8659313917160034, "logps/chosen": -200.51959228515625, "logps/rejected": -181.59939575195312, "loss": 0.6841, "rewards/accuracies": 0.609375, "rewards/chosen": -0.004111767280846834, "rewards/margins": 0.017347747460007668, "rewards/rejected": -0.021459516137838364, "step": 28 }, { "epoch": 0.13703484937979918, "grad_norm": 1.1795634739349266, "learning_rate": 4.981143703196941e-07, "logits/chosen": -2.1984832286834717, "logits/rejected": -2.172135591506958, "logps/chosen": -193.10231018066406, "logps/rejected": -180.98953247070312, "loss": 0.6844, "rewards/accuracies": 0.640625, "rewards/chosen": -0.010889173485338688, "rewards/margins": 0.014179128222167492, "rewards/rejected": -0.02506830170750618, "step": 29 }, { "epoch": 0.14176018901358536, "grad_norm": 1.066586203576245, "learning_rate": 4.978716463155912e-07, "logits/chosen": -2.06459379196167, "logits/rejected": -2.0417637825012207, "logps/chosen": -159.03701782226562, "logps/rejected": -195.22523498535156, "loss": 0.6843, "rewards/accuracies": 0.703125, "rewards/chosen": -0.0062042661011219025, "rewards/margins": 0.016392884775996208, "rewards/rejected": -0.02259715273976326, "step": 30 }, { "epoch": 0.14648552864737152, "grad_norm": 1.2110413924457768, "learning_rate": 4.976142979186209e-07, "logits/chosen": -1.9228863716125488, "logits/rejected": -1.9126472473144531, "logps/chosen": -202.9162139892578, "logps/rejected": -175.83433532714844, "loss": 0.6821, "rewards/accuracies": 0.703125, "rewards/chosen": -0.007678491994738579, "rewards/margins": 0.02073112316429615, "rewards/rejected": -0.02840961515903473, "step": 31 }, { "epoch": 0.1512108682811577, "grad_norm": 1.2866090020461936, "learning_rate": 4.973423403123028e-07, "logits/chosen": -1.701865792274475, "logits/rejected": -1.7730942964553833, "logps/chosen": -235.43563842773438, "logps/rejected": -229.68411254882812, "loss": 0.6798, "rewards/accuracies": 0.703125, "rewards/chosen": -0.0038016163744032383, "rewards/margins": 0.025650067254900932, "rewards/rejected": -0.029451683163642883, "step": 32 }, { "epoch": 0.1559362079149439, "grad_norm": 1.1184970866462265, "learning_rate": 4.970557895420983e-07, "logits/chosen": -1.7996641397476196, "logits/rejected": -1.7775640487670898, "logps/chosen": -172.83163452148438, "logps/rejected": -201.3394317626953, "loss": 0.6818, "rewards/accuracies": 0.625, "rewards/chosen": -0.014111585915088654, "rewards/margins": 0.02376263216137886, "rewards/rejected": -0.03787422180175781, "step": 33 }, { "epoch": 0.16066154754873008, "grad_norm": 1.1028286808678158, "learning_rate": 4.967546625144633e-07, "logits/chosen": -1.1831226348876953, "logits/rejected": -1.1660494804382324, "logps/chosen": -177.04531860351562, "logps/rejected": -188.2403564453125, "loss": 0.6822, "rewards/accuracies": 0.609375, "rewards/chosen": -0.011519413441419601, "rewards/margins": 0.02097604051232338, "rewards/rejected": -0.03249545022845268, "step": 34 }, { "epoch": 0.16538688718251623, "grad_norm": 1.1486945876483394, "learning_rate": 4.964389769958506e-07, "logits/chosen": -1.382279634475708, "logits/rejected": -1.419837474822998, "logps/chosen": -156.91551208496094, "logps/rejected": -157.46437072753906, "loss": 0.6784, "rewards/accuracies": 0.703125, "rewards/chosen": -0.012027422897517681, "rewards/margins": 0.030697565525770187, "rewards/rejected": -0.04272499307990074, "step": 35 }, { "epoch": 0.17011222681630242, "grad_norm": 1.1027256033620638, "learning_rate": 4.961087516116621e-07, "logits/chosen": -1.1302804946899414, "logits/rejected": -1.1647142171859741, "logps/chosen": -263.8468322753906, "logps/rejected": -243.76544189453125, "loss": 0.6793, "rewards/accuracies": 0.59375, "rewards/chosen": -0.020269813016057014, "rewards/margins": 0.02116047963500023, "rewards/rejected": -0.041430290788412094, "step": 36 }, { "epoch": 0.1748375664500886, "grad_norm": 1.0823022111801957, "learning_rate": 4.957640058451501e-07, "logits/chosen": -1.5351812839508057, "logits/rejected": -1.5872442722320557, "logps/chosen": -204.82644653320312, "logps/rejected": -176.9189453125, "loss": 0.6799, "rewards/accuracies": 0.625, "rewards/chosen": -0.02254084125161171, "rewards/margins": 0.017112018540501595, "rewards/rejected": -0.039652857929468155, "step": 37 }, { "epoch": 0.1795629060838748, "grad_norm": 1.0721194904364362, "learning_rate": 4.954047600362669e-07, "logits/chosen": -1.8736214637756348, "logits/rejected": -1.819676160812378, "logps/chosen": -178.29270935058594, "logps/rejected": -191.61605834960938, "loss": 0.6803, "rewards/accuracies": 0.59375, "rewards/chosen": -0.02358250319957733, "rewards/margins": 0.031186437234282494, "rewards/rejected": -0.054768942296504974, "step": 38 }, { "epoch": 0.18428824571766095, "grad_norm": 1.0629161817956538, "learning_rate": 4.950310353804659e-07, "logits/chosen": -1.7329224348068237, "logits/rejected": -1.780181646347046, "logps/chosen": -188.88072204589844, "logps/rejected": -177.74427795410156, "loss": 0.6784, "rewards/accuracies": 0.703125, "rewards/chosen": -0.024656984955072403, "rewards/margins": 0.03351202234625816, "rewards/rejected": -0.058169007301330566, "step": 39 }, { "epoch": 0.18901358535144713, "grad_norm": 1.0963449505388767, "learning_rate": 4.946428539274497e-07, "logits/chosen": -1.920142650604248, "logits/rejected": -1.9512125253677368, "logps/chosen": -237.4847412109375, "logps/rejected": -210.24838256835938, "loss": 0.6759, "rewards/accuracies": 0.625, "rewards/chosen": -0.030252018943428993, "rewards/margins": 0.040137603878974915, "rewards/rejected": -0.07038962841033936, "step": 40 }, { "epoch": 0.19373892498523332, "grad_norm": 1.039447855381113, "learning_rate": 4.942402385798706e-07, "logits/chosen": -1.246740698814392, "logits/rejected": -1.281036138534546, "logps/chosen": -240.5216064453125, "logps/rejected": -188.30795288085938, "loss": 0.6762, "rewards/accuracies": 0.53125, "rewards/chosen": -0.032288651913404465, "rewards/margins": 0.03159747272729874, "rewards/rejected": -0.0638861209154129, "step": 41 }, { "epoch": 0.1984642646190195, "grad_norm": 1.0133081629791365, "learning_rate": 4.938232130919785e-07, "logits/chosen": -2.049900531768799, "logits/rejected": -2.0438833236694336, "logps/chosen": -241.96177673339844, "logps/rejected": -223.10511779785156, "loss": 0.6775, "rewards/accuracies": 0.671875, "rewards/chosen": -0.05470336228609085, "rewards/margins": 0.040740497410297394, "rewards/rejected": -0.09544385224580765, "step": 42 }, { "epoch": 0.20318960425280566, "grad_norm": 1.046962172171356, "learning_rate": 4.933918020682195e-07, "logits/chosen": -2.0764129161834717, "logits/rejected": -1.9940263032913208, "logps/chosen": -204.7505340576172, "logps/rejected": -206.99880981445312, "loss": 0.6727, "rewards/accuracies": 0.671875, "rewards/chosen": -0.03313834220170975, "rewards/margins": 0.06050185114145279, "rewards/rejected": -0.09364018589258194, "step": 43 }, { "epoch": 0.20318960425280566, "eval_logits/chosen": -2.2005555629730225, "eval_logits/rejected": -2.2166874408721924, "eval_logps/chosen": -216.52699279785156, "eval_logps/rejected": -209.94314575195312, "eval_loss": 0.6714360117912292, "eval_rewards/accuracies": 0.5871211886405945, "eval_rewards/chosen": -0.05296258255839348, "eval_rewards/margins": 0.046969976276159286, "eval_rewards/rejected": -0.09993256628513336, "eval_runtime": 225.63, "eval_samples_per_second": 16.204, "eval_steps_per_second": 0.293, "step": 43 }, { "epoch": 0.20791494388659185, "grad_norm": 1.1285157956991194, "learning_rate": 4.929460309617843e-07, "logits/chosen": -2.0923304557800293, "logits/rejected": -2.151911973953247, "logps/chosen": -252.43092346191406, "logps/rejected": -221.0844268798828, "loss": 0.6707, "rewards/accuracies": 0.640625, "rewards/chosen": -0.04450148344039917, "rewards/margins": 0.058394819498062134, "rewards/rejected": -0.10289628803730011, "step": 44 }, { "epoch": 0.21264028352037803, "grad_norm": 1.0651629243936434, "learning_rate": 4.924859260731066e-07, "logits/chosen": -2.0476608276367188, "logits/rejected": -2.174062490463257, "logps/chosen": -219.19517517089844, "logps/rejected": -189.25193786621094, "loss": 0.6718, "rewards/accuracies": 0.640625, "rewards/chosen": -0.06256880611181259, "rewards/margins": 0.04902214929461479, "rewards/rejected": -0.11159095913171768, "step": 45 }, { "epoch": 0.21736562315416422, "grad_norm": 1.0516408835233841, "learning_rate": 4.920115145483112e-07, "logits/chosen": -1.602857232093811, "logits/rejected": -1.6103214025497437, "logps/chosen": -252.79664611816406, "logps/rejected": -229.30770874023438, "loss": 0.6711, "rewards/accuracies": 0.609375, "rewards/chosen": -0.08149686455726624, "rewards/margins": 0.037921737879514694, "rewards/rejected": -0.11941860616207123, "step": 46 }, { "epoch": 0.22209096278795037, "grad_norm": 1.1789732456475222, "learning_rate": 4.915228243776124e-07, "logits/chosen": -1.1500588655471802, "logits/rejected": -1.1534898281097412, "logps/chosen": -253.38931274414062, "logps/rejected": -220.110595703125, "loss": 0.6634, "rewards/accuracies": 0.609375, "rewards/chosen": -0.06619597226381302, "rewards/margins": 0.07172124832868576, "rewards/rejected": -0.13791722059249878, "step": 47 }, { "epoch": 0.22681630242173656, "grad_norm": 1.056306792669834, "learning_rate": 4.91019884393663e-07, "logits/chosen": -0.8076485991477966, "logits/rejected": -0.8253241181373596, "logps/chosen": -211.2886962890625, "logps/rejected": -174.4979705810547, "loss": 0.6668, "rewards/accuracies": 0.703125, "rewards/chosen": -0.05709861218929291, "rewards/margins": 0.05306413769721985, "rewards/rejected": -0.11016274988651276, "step": 48 }, { "epoch": 0.23154164205552275, "grad_norm": 1.1483517794441522, "learning_rate": 4.905027242698521e-07, "logits/chosen": -1.5992224216461182, "logits/rejected": -1.7107007503509521, "logps/chosen": -258.3623962402344, "logps/rejected": -208.75918579101562, "loss": 0.6608, "rewards/accuracies": 0.609375, "rewards/chosen": -0.07419726252555847, "rewards/margins": 0.07502313703298569, "rewards/rejected": -0.14922040700912476, "step": 49 }, { "epoch": 0.23626698168930893, "grad_norm": 1.0512679200538415, "learning_rate": 4.89971374518556e-07, "logits/chosen": -2.0465784072875977, "logits/rejected": -2.033297300338745, "logps/chosen": -185.704345703125, "logps/rejected": -203.28013610839844, "loss": 0.6639, "rewards/accuracies": 0.578125, "rewards/chosen": -0.08691324293613434, "rewards/margins": 0.06635289639234543, "rewards/rejected": -0.15326614677906036, "step": 50 }, { "epoch": 0.2409923213230951, "grad_norm": 1.112120640811307, "learning_rate": 4.894258664893363e-07, "logits/chosen": -1.7012823820114136, "logits/rejected": -1.7330955266952515, "logps/chosen": -208.68597412109375, "logps/rejected": -218.7354278564453, "loss": 0.6594, "rewards/accuracies": 0.609375, "rewards/chosen": -0.07796823233366013, "rewards/margins": 0.038594380021095276, "rewards/rejected": -0.1165626123547554, "step": 51 }, { "epoch": 0.24571766095688127, "grad_norm": 1.1579361743422294, "learning_rate": 4.888662323670913e-07, "logits/chosen": -1.6541762351989746, "logits/rejected": -1.7235084772109985, "logps/chosen": -269.59210205078125, "logps/rejected": -243.711669921875, "loss": 0.6629, "rewards/accuracies": 0.71875, "rewards/chosen": -0.06561748683452606, "rewards/margins": 0.06448770314455032, "rewards/rejected": -0.1301051825284958, "step": 52 }, { "epoch": 0.25044300059066743, "grad_norm": 1.1521413307343888, "learning_rate": 4.882925051701568e-07, "logits/chosen": -1.853175163269043, "logits/rejected": -1.8978935480117798, "logps/chosen": -225.21856689453125, "logps/rejected": -228.90325927734375, "loss": 0.6578, "rewards/accuracies": 0.71875, "rewards/chosen": -0.09534727036952972, "rewards/margins": 0.077830970287323, "rewards/rejected": -0.17317824065685272, "step": 53 }, { "epoch": 0.2551683402244536, "grad_norm": 1.0619319741777762, "learning_rate": 4.877047187483582e-07, "logits/chosen": -1.6998298168182373, "logits/rejected": -1.7790082693099976, "logps/chosen": -212.0916748046875, "logps/rejected": -191.58206176757812, "loss": 0.6631, "rewards/accuracies": 0.546875, "rewards/chosen": -0.10848715156316757, "rewards/margins": 0.048885174095630646, "rewards/rejected": -0.1573723405599594, "step": 54 }, { "epoch": 0.2598936798582398, "grad_norm": 1.1160678416368461, "learning_rate": 4.871029077810132e-07, "logits/chosen": -1.675370216369629, "logits/rejected": -1.7553473711013794, "logps/chosen": -225.4173126220703, "logps/rejected": -198.70562744140625, "loss": 0.6558, "rewards/accuracies": 0.53125, "rewards/chosen": -0.10019448399543762, "rewards/margins": 0.08370313048362732, "rewards/rejected": -0.18389761447906494, "step": 55 }, { "epoch": 0.264619019492026, "grad_norm": 1.180700159151371, "learning_rate": 4.864871077748857e-07, "logits/chosen": -2.015566110610962, "logits/rejected": -2.080444812774658, "logps/chosen": -229.28094482421875, "logps/rejected": -215.07102966308594, "loss": 0.6587, "rewards/accuracies": 0.609375, "rewards/chosen": -0.1043662577867508, "rewards/margins": 0.07935845851898193, "rewards/rejected": -0.18372471630573273, "step": 56 }, { "epoch": 0.26934435912581217, "grad_norm": 1.2229942654627657, "learning_rate": 4.858573550620908e-07, "logits/chosen": -2.024144411087036, "logits/rejected": -2.0165395736694336, "logps/chosen": -266.3519287109375, "logps/rejected": -233.6390380859375, "loss": 0.6454, "rewards/accuracies": 0.734375, "rewards/chosen": -0.12464563548564911, "rewards/margins": 0.12967334687709808, "rewards/rejected": -0.2543190121650696, "step": 57 }, { "epoch": 0.27406969875959836, "grad_norm": 1.0980503837148203, "learning_rate": 4.852136867979515e-07, "logits/chosen": -2.3049449920654297, "logits/rejected": -2.286456346511841, "logps/chosen": -187.29776000976562, "logps/rejected": -195.98187255859375, "loss": 0.6559, "rewards/accuracies": 0.515625, "rewards/chosen": -0.13345083594322205, "rewards/margins": 0.05029616504907608, "rewards/rejected": -0.18374700844287872, "step": 58 }, { "epoch": 0.27879503839338454, "grad_norm": 1.222277907011155, "learning_rate": 4.845561409588065e-07, "logits/chosen": -2.3418726921081543, "logits/rejected": -2.2416603565216064, "logps/chosen": -184.75540161132812, "logps/rejected": -186.7329864501953, "loss": 0.6424, "rewards/accuracies": 0.65625, "rewards/chosen": -0.10166566073894501, "rewards/margins": 0.12095416337251663, "rewards/rejected": -0.22261981666088104, "step": 59 }, { "epoch": 0.2835203780271707, "grad_norm": 1.0214927771783529, "learning_rate": 4.838847563397693e-07, "logits/chosen": -1.5123400688171387, "logits/rejected": -1.6280531883239746, "logps/chosen": -237.2841796875, "logps/rejected": -212.3114013671875, "loss": 0.6583, "rewards/accuracies": 0.609375, "rewards/chosen": -0.1481347680091858, "rewards/margins": 0.09539347887039185, "rewards/rejected": -0.24352826178073883, "step": 60 }, { "epoch": 0.28824571766095686, "grad_norm": 1.1567243525055237, "learning_rate": 4.831995725524398e-07, "logits/chosen": -2.612375497817993, "logits/rejected": -2.534623861312866, "logps/chosen": -163.80526733398438, "logps/rejected": -201.24952697753906, "loss": 0.6481, "rewards/accuracies": 0.703125, "rewards/chosen": -0.14992724359035492, "rewards/margins": 0.10543593764305115, "rewards/rejected": -0.2553631663322449, "step": 61 }, { "epoch": 0.29297105729474304, "grad_norm": 1.2299920626415959, "learning_rate": 4.825006300225665e-07, "logits/chosen": -2.0585803985595703, "logits/rejected": -2.1235456466674805, "logps/chosen": -218.53924560546875, "logps/rejected": -219.55938720703125, "loss": 0.6439, "rewards/accuracies": 0.65625, "rewards/chosen": -0.15479950606822968, "rewards/margins": 0.11182530224323273, "rewards/rejected": -0.2666248381137848, "step": 62 }, { "epoch": 0.2976963969285292, "grad_norm": 1.203647663122135, "learning_rate": 4.817879699876622e-07, "logits/chosen": -1.9584053754806519, "logits/rejected": -2.047368288040161, "logps/chosen": -180.12673950195312, "logps/rejected": -169.58712768554688, "loss": 0.6365, "rewards/accuracies": 0.65625, "rewards/chosen": -0.15420033037662506, "rewards/margins": 0.07548267394304276, "rewards/rejected": -0.2296830117702484, "step": 63 }, { "epoch": 0.3024217365623154, "grad_norm": 1.1158018473166338, "learning_rate": 4.810616344945705e-07, "logits/chosen": -1.889503002166748, "logits/rejected": -1.8674815893173218, "logps/chosen": -205.70799255371094, "logps/rejected": -200.00503540039062, "loss": 0.6564, "rewards/accuracies": 0.46875, "rewards/chosen": -0.20557425916194916, "rewards/margins": 0.07413503527641296, "rewards/rejected": -0.2797092795372009, "step": 64 }, { "epoch": 0.3071470761961016, "grad_norm": 1.1145309518931334, "learning_rate": 4.803216663969849e-07, "logits/chosen": -2.5508382320404053, "logits/rejected": -2.58968186378479, "logps/chosen": -222.9154052734375, "logps/rejected": -205.91091918945312, "loss": 0.646, "rewards/accuracies": 0.703125, "rewards/chosen": -0.16130368411540985, "rewards/margins": 0.11249940097332001, "rewards/rejected": -0.27380311489105225, "step": 65 }, { "epoch": 0.3118724158298878, "grad_norm": 1.1907287315768882, "learning_rate": 4.795681093529209e-07, "logits/chosen": -1.9771151542663574, "logits/rejected": -1.9131364822387695, "logps/chosen": -181.78024291992188, "logps/rejected": -203.05455017089844, "loss": 0.6367, "rewards/accuracies": 0.5625, "rewards/chosen": -0.17563070356845856, "rewards/margins": 0.1282222419977188, "rewards/rejected": -0.30385297536849976, "step": 66 }, { "epoch": 0.31659775546367397, "grad_norm": 1.0642807389889697, "learning_rate": 4.7880100782214e-07, "logits/chosen": -2.139569044113159, "logits/rejected": -2.139991283416748, "logps/chosen": -200.4115447998047, "logps/rejected": -209.0381317138672, "loss": 0.6515, "rewards/accuracies": 0.5, "rewards/chosen": -0.25353577733039856, "rewards/margins": 0.025102369487285614, "rewards/rejected": -0.2786381244659424, "step": 67 }, { "epoch": 0.32132309509746015, "grad_norm": 1.2361659779944367, "learning_rate": 4.780204070635266e-07, "logits/chosen": -2.3622794151306152, "logits/rejected": -2.3103740215301514, "logps/chosen": -241.8636474609375, "logps/rejected": -253.21530151367188, "loss": 0.6309, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2274862378835678, "rewards/margins": 0.18495142459869385, "rewards/rejected": -0.41243770718574524, "step": 68 }, { "epoch": 0.3260484347312463, "grad_norm": 1.2216304493799137, "learning_rate": 4.772263531324172e-07, "logits/chosen": -2.275869369506836, "logits/rejected": -2.296611785888672, "logps/chosen": -255.72305297851562, "logps/rejected": -237.79794311523438, "loss": 0.631, "rewards/accuracies": 0.65625, "rewards/chosen": -0.24048469960689545, "rewards/margins": 0.11058272421360016, "rewards/rejected": -0.351067453622818, "step": 69 }, { "epoch": 0.33077377436503247, "grad_norm": 1.2564703689206955, "learning_rate": 4.764188928778843e-07, "logits/chosen": -1.9974974393844604, "logits/rejected": -2.011615514755249, "logps/chosen": -195.90382385253906, "logps/rejected": -214.2757110595703, "loss": 0.6336, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2238045632839203, "rewards/margins": 0.1463470458984375, "rewards/rejected": -0.3701516091823578, "step": 70 }, { "epoch": 0.33549911399881865, "grad_norm": 1.2749948068499388, "learning_rate": 4.755980739399711e-07, "logits/chosen": -2.3057668209075928, "logits/rejected": -2.2394864559173584, "logps/chosen": -193.60552978515625, "logps/rejected": -239.27389526367188, "loss": 0.6525, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2364463210105896, "rewards/margins": 0.1507556438446045, "rewards/rejected": -0.3872019648551941, "step": 71 }, { "epoch": 0.34022445363260484, "grad_norm": 1.3097724846280558, "learning_rate": 4.747639447468816e-07, "logits/chosen": -2.2665905952453613, "logits/rejected": -2.279989719390869, "logps/chosen": -285.88140869140625, "logps/rejected": -304.027587890625, "loss": 0.6232, "rewards/accuracies": 0.734375, "rewards/chosen": -0.23830385506153107, "rewards/margins": 0.22441713511943817, "rewards/rejected": -0.4627210199832916, "step": 72 }, { "epoch": 0.344949793266391, "grad_norm": 1.267134897055487, "learning_rate": 4.739165545121228e-07, "logits/chosen": -1.9879568815231323, "logits/rejected": -1.9682663679122925, "logps/chosen": -246.3922882080078, "logps/rejected": -247.4912109375, "loss": 0.6229, "rewards/accuracies": 0.640625, "rewards/chosen": -0.3345012366771698, "rewards/margins": 0.17793008685112, "rewards/rejected": -0.5124313235282898, "step": 73 }, { "epoch": 0.3496751329001772, "grad_norm": 1.2766840060333855, "learning_rate": 4.730559532316014e-07, "logits/chosen": -2.3778185844421387, "logits/rejected": -2.4101202487945557, "logps/chosen": -221.90524291992188, "logps/rejected": -241.57962036132812, "loss": 0.6395, "rewards/accuracies": 0.640625, "rewards/chosen": -0.32600241899490356, "rewards/margins": 0.14937394857406616, "rewards/rejected": -0.4753763973712921, "step": 74 }, { "epoch": 0.3544004725339634, "grad_norm": 1.3566530419485627, "learning_rate": 4.721821916806741e-07, "logits/chosen": -2.4602699279785156, "logits/rejected": -2.4688634872436523, "logps/chosen": -268.510498046875, "logps/rejected": -286.26165771484375, "loss": 0.6237, "rewards/accuracies": 0.703125, "rewards/chosen": -0.31091028451919556, "rewards/margins": 0.29035893082618713, "rewards/rejected": -0.6012692451477051, "step": 75 }, { "epoch": 0.3591258121677496, "grad_norm": 1.4205593895904036, "learning_rate": 4.7129532141115145e-07, "logits/chosen": -2.5466036796569824, "logits/rejected": -2.5945773124694824, "logps/chosen": -318.98370361328125, "logps/rejected": -309.7763366699219, "loss": 0.6112, "rewards/accuracies": 0.640625, "rewards/chosen": -0.3561764359474182, "rewards/margins": 0.18557003140449524, "rewards/rejected": -0.5417464375495911, "step": 76 }, { "epoch": 0.3638511518015357, "grad_norm": 1.528828829984918, "learning_rate": 4.7039539474825683e-07, "logits/chosen": -2.2787909507751465, "logits/rejected": -2.3680973052978516, "logps/chosen": -305.41925048828125, "logps/rejected": -288.8333435058594, "loss": 0.6006, "rewards/accuracies": 0.71875, "rewards/chosen": -0.39139410853385925, "rewards/margins": 0.2935311794281006, "rewards/rejected": -0.6849253177642822, "step": 77 }, { "epoch": 0.3685764914353219, "grad_norm": 1.6985476984319592, "learning_rate": 4.6948246478753903e-07, "logits/chosen": -2.261338472366333, "logits/rejected": -2.323387622833252, "logps/chosen": -247.09033203125, "logps/rejected": -257.6181640625, "loss": 0.6314, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5037412047386169, "rewards/margins": 0.1979576051235199, "rewards/rejected": -0.7016987800598145, "step": 78 }, { "epoch": 0.3733018310691081, "grad_norm": 1.729313425365047, "learning_rate": 4.6855658539173946e-07, "logits/chosen": -2.698389768600464, "logits/rejected": -2.646217107772827, "logps/chosen": -314.0598449707031, "logps/rejected": -283.12530517578125, "loss": 0.6206, "rewards/accuracies": 0.609375, "rewards/chosen": -0.5247927904129028, "rewards/margins": 0.22078382968902588, "rewards/rejected": -0.7455766201019287, "step": 79 }, { "epoch": 0.37802717070289427, "grad_norm": 1.66621636166947, "learning_rate": 4.6761781118761446e-07, "logits/chosen": -2.343153238296509, "logits/rejected": -2.3706839084625244, "logps/chosen": -296.689453125, "logps/rejected": -309.9665222167969, "loss": 0.6257, "rewards/accuracies": 0.65625, "rewards/chosen": -0.44052672386169434, "rewards/margins": 0.23874229192733765, "rewards/rejected": -0.6792689561843872, "step": 80 }, { "epoch": 0.38275251033668045, "grad_norm": 1.5082652270101988, "learning_rate": 4.666661975627123e-07, "logits/chosen": -2.4629459381103516, "logits/rejected": -2.486161470413208, "logps/chosen": -233.51983642578125, "logps/rejected": -262.89447021484375, "loss": 0.6154, "rewards/accuracies": 0.640625, "rewards/chosen": -0.4172506630420685, "rewards/margins": 0.3244992196559906, "rewards/rejected": -0.7417498826980591, "step": 81 }, { "epoch": 0.38747784997046664, "grad_norm": 1.5314338973522612, "learning_rate": 4.657018006621053e-07, "logits/chosen": -2.1910758018493652, "logits/rejected": -2.273641586303711, "logps/chosen": -225.42889404296875, "logps/rejected": -239.92913818359375, "loss": 0.6058, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4517180323600769, "rewards/margins": 0.22106432914733887, "rewards/rejected": -0.6727824211120605, "step": 82 }, { "epoch": 0.3922031896042528, "grad_norm": 1.6363344671283797, "learning_rate": 4.6472467738507724e-07, "logits/chosen": -2.4740734100341797, "logits/rejected": -2.593059539794922, "logps/chosen": -325.74298095703125, "logps/rejected": -297.570068359375, "loss": 0.6088, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6095322370529175, "rewards/margins": 0.1409136950969696, "rewards/rejected": -0.7504459619522095, "step": 83 }, { "epoch": 0.396928529238039, "grad_norm": 1.7032682560793109, "learning_rate": 4.6373488538176656e-07, "logits/chosen": -2.627995014190674, "logits/rejected": -2.585923433303833, "logps/chosen": -278.2349853515625, "logps/rejected": -319.6769104003906, "loss": 0.6177, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6383107900619507, "rewards/margins": 0.17859135568141937, "rewards/rejected": -0.8169021010398865, "step": 84 }, { "epoch": 0.40165386887182514, "grad_norm": 1.6927306950035537, "learning_rate": 4.627324830497645e-07, "logits/chosen": -2.3960776329040527, "logits/rejected": -2.4317002296447754, "logps/chosen": -217.64886474609375, "logps/rejected": -255.28555297851562, "loss": 0.6034, "rewards/accuracies": 0.59375, "rewards/chosen": -0.581438422203064, "rewards/margins": 0.2544565200805664, "rewards/rejected": -0.8358950018882751, "step": 85 }, { "epoch": 0.4063792085056113, "grad_norm": 1.517574514685826, "learning_rate": 4.617175295306701e-07, "logits/chosen": -2.342132329940796, "logits/rejected": -2.317411422729492, "logps/chosen": -249.52786254882812, "logps/rejected": -272.2576904296875, "loss": 0.6056, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4862441420555115, "rewards/margins": 0.2543086111545563, "rewards/rejected": -0.7405527830123901, "step": 86 }, { "epoch": 0.4063792085056113, "eval_logits/chosen": -3.0177347660064697, "eval_logits/rejected": -3.027693748474121, "eval_logps/chosen": -269.99395751953125, "eval_logps/rejected": -288.7347106933594, "eval_loss": 0.6040579080581665, "eval_rewards/accuracies": 0.6022727489471436, "eval_rewards/chosen": -0.5876324772834778, "eval_rewards/margins": 0.3002159297466278, "eval_rewards/rejected": -0.8878483772277832, "eval_runtime": 225.4053, "eval_samples_per_second": 16.22, "eval_steps_per_second": 0.293, "step": 86 }, { "epoch": 0.4111045481393975, "grad_norm": 1.828247945135766, "learning_rate": 4.6069008470660057e-07, "logits/chosen": -2.776036500930786, "logits/rejected": -2.8447940349578857, "logps/chosen": -286.0122985839844, "logps/rejected": -318.0850830078125, "loss": 0.6202, "rewards/accuracies": 0.625, "rewards/chosen": -0.6256659626960754, "rewards/margins": 0.2720867395401001, "rewards/rejected": -0.8977527022361755, "step": 87 }, { "epoch": 0.4158298877731837, "grad_norm": 1.7339677719729851, "learning_rate": 4.596502091966587e-07, "logits/chosen": -2.6904196739196777, "logits/rejected": -2.8044652938842773, "logps/chosen": -318.77557373046875, "logps/rejected": -326.3057556152344, "loss": 0.6133, "rewards/accuracies": 0.546875, "rewards/chosen": -0.6397165060043335, "rewards/margins": 0.23243850469589233, "rewards/rejected": -0.8721550107002258, "step": 88 }, { "epoch": 0.4205552274069699, "grad_norm": 1.5787512385428173, "learning_rate": 4.5859796435335575e-07, "logits/chosen": -2.5359115600585938, "logits/rejected": -2.5303304195404053, "logps/chosen": -256.5873718261719, "logps/rejected": -295.2117004394531, "loss": 0.6062, "rewards/accuracies": 0.609375, "rewards/chosen": -0.5649542212486267, "rewards/margins": 0.2900598347187042, "rewards/rejected": -0.8550140857696533, "step": 89 }, { "epoch": 0.42528056704075606, "grad_norm": 1.6858517491129223, "learning_rate": 4.5753341225899195e-07, "logits/chosen": -2.4564006328582764, "logits/rejected": -2.470282793045044, "logps/chosen": -327.185791015625, "logps/rejected": -319.4944152832031, "loss": 0.6017, "rewards/accuracies": 0.546875, "rewards/chosen": -0.6457672715187073, "rewards/margins": 0.2363281399011612, "rewards/rejected": -0.882095456123352, "step": 90 }, { "epoch": 0.43000590667454225, "grad_norm": 1.6776313748940133, "learning_rate": 4.564566157219938e-07, "logits/chosen": -2.467501163482666, "logits/rejected": -2.570307970046997, "logps/chosen": -318.20867919921875, "logps/rejected": -302.66949462890625, "loss": 0.6015, "rewards/accuracies": 0.578125, "rewards/chosen": -0.5671995878219604, "rewards/margins": 0.2143457680940628, "rewards/rejected": -0.7815454006195068, "step": 91 }, { "epoch": 0.43473124630832843, "grad_norm": 2.0614895261976747, "learning_rate": 4.5536763827320803e-07, "logits/chosen": -2.3631057739257812, "logits/rejected": -2.54693341255188, "logps/chosen": -273.6968078613281, "logps/rejected": -246.98641967773438, "loss": 0.5856, "rewards/accuracies": 0.609375, "rewards/chosen": -0.4555537700653076, "rewards/margins": 0.22295869886875153, "rewards/rejected": -0.6785125136375427, "step": 92 }, { "epoch": 0.43945658594211456, "grad_norm": 1.8016509044142324, "learning_rate": 4.5426654416215367e-07, "logits/chosen": -2.71864652633667, "logits/rejected": -2.7026920318603516, "logps/chosen": -288.259033203125, "logps/rejected": -330.29132080078125, "loss": 0.6038, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5275314450263977, "rewards/margins": 0.5003238916397095, "rewards/rejected": -1.027855396270752, "step": 93 }, { "epoch": 0.44418192557590075, "grad_norm": 2.243457029027687, "learning_rate": 4.5315339835323095e-07, "logits/chosen": -2.884897232055664, "logits/rejected": -2.8569068908691406, "logps/chosen": -269.7425537109375, "logps/rejected": -297.4085693359375, "loss": 0.5748, "rewards/accuracies": 0.640625, "rewards/chosen": -0.5586040019989014, "rewards/margins": 0.3271249234676361, "rewards/rejected": -0.8857288956642151, "step": 94 }, { "epoch": 0.44890726520968693, "grad_norm": 2.296742463720749, "learning_rate": 4.520282665218889e-07, "logits/chosen": -2.790522336959839, "logits/rejected": -2.84653377532959, "logps/chosen": -265.41339111328125, "logps/rejected": -315.9186096191406, "loss": 0.5625, "rewards/accuracies": 0.75, "rewards/chosen": -0.5728757977485657, "rewards/margins": 0.515011191368103, "rewards/rejected": -1.0878870487213135, "step": 95 }, { "epoch": 0.4536326048434731, "grad_norm": 1.922804376035695, "learning_rate": 4.5089121505074987e-07, "logits/chosen": -2.6556386947631836, "logits/rejected": -2.806910276412964, "logps/chosen": -244.31153869628906, "logps/rejected": -247.36322021484375, "loss": 0.5787, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5964667797088623, "rewards/margins": 0.29180973768234253, "rewards/rejected": -0.8882765173912048, "step": 96 }, { "epoch": 0.4583579444772593, "grad_norm": 1.6125061422080407, "learning_rate": 4.4974231102569355e-07, "logits/chosen": -2.7232208251953125, "logits/rejected": -2.8808493614196777, "logps/chosen": -272.30206298828125, "logps/rejected": -273.55487060546875, "loss": 0.5764, "rewards/accuracies": 0.640625, "rewards/chosen": -0.5672851800918579, "rewards/margins": 0.39647072553634644, "rewards/rejected": -0.9637559056282043, "step": 97 }, { "epoch": 0.4630832841110455, "grad_norm": 2.0273197043813598, "learning_rate": 4.4858162223189853e-07, "logits/chosen": -2.691676616668701, "logits/rejected": -2.7318079471588135, "logps/chosen": -331.65582275390625, "logps/rejected": -321.1370544433594, "loss": 0.6022, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7305989861488342, "rewards/margins": 0.20282992720603943, "rewards/rejected": -0.933428943157196, "step": 98 }, { "epoch": 0.4678086237448317, "grad_norm": 1.772177669462408, "learning_rate": 4.474092171498434e-07, "logits/chosen": -2.5423169136047363, "logits/rejected": -2.5960371494293213, "logps/chosen": -260.9735107421875, "logps/rejected": -277.1812744140625, "loss": 0.5888, "rewards/accuracies": 0.546875, "rewards/chosen": -0.6579495668411255, "rewards/margins": 0.32558560371398926, "rewards/rejected": -0.9835351705551147, "step": 99 }, { "epoch": 0.47253396337861786, "grad_norm": 2.031475270950697, "learning_rate": 4.462251649512656e-07, "logits/chosen": -2.805039167404175, "logits/rejected": -2.7704780101776123, "logps/chosen": -235.95053100585938, "logps/rejected": -292.6195373535156, "loss": 0.5634, "rewards/accuracies": 0.640625, "rewards/chosen": -0.612291157245636, "rewards/margins": 0.49476855993270874, "rewards/rejected": -1.1070597171783447, "step": 100 }, { "epoch": 0.477259303012404, "grad_norm": 2.8786657503544033, "learning_rate": 4.4502953549508135e-07, "logits/chosen": -2.829331398010254, "logits/rejected": -2.8344359397888184, "logps/chosen": -309.11285400390625, "logps/rejected": -331.1981201171875, "loss": 0.596, "rewards/accuracies": 0.75, "rewards/chosen": -0.7622178792953491, "rewards/margins": 0.48595699667930603, "rewards/rejected": -1.2481749057769775, "step": 101 }, { "epoch": 0.4819846426461902, "grad_norm": 2.6747243514730163, "learning_rate": 4.438223993232634e-07, "logits/chosen": -2.71714186668396, "logits/rejected": -2.7164933681488037, "logps/chosen": -319.4408264160156, "logps/rejected": -335.07025146484375, "loss": 0.5873, "rewards/accuracies": 0.625, "rewards/chosen": -0.7349227666854858, "rewards/margins": 0.5957677960395813, "rewards/rejected": -1.3306907415390015, "step": 102 }, { "epoch": 0.48670998227997636, "grad_norm": 2.184532115681571, "learning_rate": 4.426038276566787e-07, "logits/chosen": -2.595947027206421, "logits/rejected": -2.6409239768981934, "logps/chosen": -302.1788330078125, "logps/rejected": -300.60736083984375, "loss": 0.5929, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6437785625457764, "rewards/margins": 0.4648054838180542, "rewards/rejected": -1.1085840463638306, "step": 103 }, { "epoch": 0.49143532191376255, "grad_norm": 2.351448463153027, "learning_rate": 4.413738923908874e-07, "logits/chosen": -2.820120334625244, "logits/rejected": -2.881047248840332, "logps/chosen": -321.73577880859375, "logps/rejected": -336.362548828125, "loss": 0.6113, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6863315105438232, "rewards/margins": 0.3078695237636566, "rewards/rejected": -0.9942010641098022, "step": 104 }, { "epoch": 0.49616066154754873, "grad_norm": 2.0712718540984976, "learning_rate": 4.4013266609190016e-07, "logits/chosen": -2.726858139038086, "logits/rejected": -2.8224053382873535, "logps/chosen": -318.89129638671875, "logps/rejected": -321.7113037109375, "loss": 0.5567, "rewards/accuracies": 0.578125, "rewards/chosen": -0.7669743299484253, "rewards/margins": 0.5382488369941711, "rewards/rejected": -1.3052233457565308, "step": 105 }, { "epoch": 0.5008860011813349, "grad_norm": 2.7161021836994412, "learning_rate": 4.3888022199189684e-07, "logits/chosen": -2.5977838039398193, "logits/rejected": -2.568969488143921, "logps/chosen": -270.40380859375, "logps/rejected": -321.2980041503906, "loss": 0.6029, "rewards/accuracies": 0.671875, "rewards/chosen": -0.6543689966201782, "rewards/margins": 0.5025177001953125, "rewards/rejected": -1.1568866968154907, "step": 106 }, { "epoch": 0.505611340815121, "grad_norm": 2.133467086083558, "learning_rate": 4.3761663398490634e-07, "logits/chosen": -2.5719194412231445, "logits/rejected": -2.569828510284424, "logps/chosen": -278.3515930175781, "logps/rejected": -290.2523193359375, "loss": 0.5601, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6445180773735046, "rewards/margins": 0.44993308186531067, "rewards/rejected": -1.0944510698318481, "step": 107 }, { "epoch": 0.5103366804489072, "grad_norm": 2.581357357210363, "learning_rate": 4.363419766224464e-07, "logits/chosen": -2.5332443714141846, "logits/rejected": -2.5566651821136475, "logps/chosen": -255.7954559326172, "logps/rejected": -285.38604736328125, "loss": 0.5701, "rewards/accuracies": 0.625, "rewards/chosen": -0.7009232044219971, "rewards/margins": 0.5014970302581787, "rewards/rejected": -1.2024202346801758, "step": 108 }, { "epoch": 0.5150620200826934, "grad_norm": 2.0545799713212913, "learning_rate": 4.3505632510912515e-07, "logits/chosen": -2.5492563247680664, "logits/rejected": -2.6753411293029785, "logps/chosen": -252.5239715576172, "logps/rejected": -250.57691955566406, "loss": 0.5745, "rewards/accuracies": 0.625, "rewards/chosen": -0.580327033996582, "rewards/margins": 0.28667935729026794, "rewards/rejected": -0.8670063614845276, "step": 109 }, { "epoch": 0.5197873597164796, "grad_norm": 2.2034797994763458, "learning_rate": 4.3375975529820414e-07, "logits/chosen": -2.492084503173828, "logits/rejected": -2.432577610015869, "logps/chosen": -310.48046875, "logps/rejected": -362.7835388183594, "loss": 0.5392, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6999268531799316, "rewards/margins": 0.7504494190216064, "rewards/rejected": -1.450376272201538, "step": 110 }, { "epoch": 0.5245126993502658, "grad_norm": 1.9829084137694295, "learning_rate": 4.3245234368712304e-07, "logits/chosen": -2.7556090354919434, "logits/rejected": -2.754206418991089, "logps/chosen": -270.7232666015625, "logps/rejected": -324.1688232421875, "loss": 0.5701, "rewards/accuracies": 0.671875, "rewards/chosen": -0.6123411655426025, "rewards/margins": 0.4949289858341217, "rewards/rejected": -1.1072702407836914, "step": 111 }, { "epoch": 0.529238038984052, "grad_norm": 2.0360711261149596, "learning_rate": 4.3113416741298616e-07, "logits/chosen": -2.659914016723633, "logits/rejected": -2.673081874847412, "logps/chosen": -292.6357727050781, "logps/rejected": -286.8974914550781, "loss": 0.5785, "rewards/accuracies": 0.6875, "rewards/chosen": -0.645796537399292, "rewards/margins": 0.4479163587093353, "rewards/rejected": -1.0937130451202393, "step": 112 }, { "epoch": 0.5339633786178382, "grad_norm": 2.3215552370547945, "learning_rate": 4.298053042480114e-07, "logits/chosen": -2.6102089881896973, "logits/rejected": -2.666215419769287, "logps/chosen": -284.019775390625, "logps/rejected": -313.7834777832031, "loss": 0.5434, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6691496968269348, "rewards/margins": 0.5124155879020691, "rewards/rejected": -1.181565284729004, "step": 113 }, { "epoch": 0.5386887182516243, "grad_norm": 2.1290543043289434, "learning_rate": 4.2846583259494185e-07, "logits/chosen": -2.795818328857422, "logits/rejected": -2.9482498168945312, "logps/chosen": -291.4162292480469, "logps/rejected": -279.486083984375, "loss": 0.5576, "rewards/accuracies": 0.5625, "rewards/chosen": -0.791326105594635, "rewards/margins": 0.29795482754707336, "rewards/rejected": -1.0892809629440308, "step": 114 }, { "epoch": 0.5434140578854105, "grad_norm": 2.123022825050298, "learning_rate": 4.271158314824199e-07, "logits/chosen": -2.5966644287109375, "logits/rejected": -2.67663836479187, "logps/chosen": -286.2615661621094, "logps/rejected": -300.17669677734375, "loss": 0.5549, "rewards/accuracies": 0.609375, "rewards/chosen": -0.6490954160690308, "rewards/margins": 0.37902897596359253, "rewards/rejected": -1.028124451637268, "step": 115 }, { "epoch": 0.5481393975191967, "grad_norm": 2.3410590679680854, "learning_rate": 4.2575538056032446e-07, "logits/chosen": -2.3392884731292725, "logits/rejected": -2.3976926803588867, "logps/chosen": -309.89501953125, "logps/rejected": -347.65264892578125, "loss": 0.5519, "rewards/accuracies": 0.609375, "rewards/chosen": -0.8080885410308838, "rewards/margins": 0.5476577877998352, "rewards/rejected": -1.3557462692260742, "step": 116 }, { "epoch": 0.5528647371529829, "grad_norm": 2.1818324792915305, "learning_rate": 4.2438456009507195e-07, "logits/chosen": -2.751250743865967, "logits/rejected": -2.683605909347534, "logps/chosen": -304.8617248535156, "logps/rejected": -357.66583251953125, "loss": 0.5637, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7783306837081909, "rewards/margins": 0.6774348020553589, "rewards/rejected": -1.4557652473449707, "step": 117 }, { "epoch": 0.5575900767867691, "grad_norm": 2.2239562359532994, "learning_rate": 4.230034509648803e-07, "logits/chosen": -2.653618335723877, "logits/rejected": -2.6241607666015625, "logps/chosen": -311.1373596191406, "logps/rejected": -379.66473388671875, "loss": 0.5332, "rewards/accuracies": 0.734375, "rewards/chosen": -0.6985858678817749, "rewards/margins": 0.754062294960022, "rewards/rejected": -1.4526481628417969, "step": 118 }, { "epoch": 0.5623154164205553, "grad_norm": 2.2500972070282024, "learning_rate": 4.216121346549973e-07, "logits/chosen": -3.0888874530792236, "logits/rejected": -2.974677562713623, "logps/chosen": -266.66705322265625, "logps/rejected": -335.3798828125, "loss": 0.5606, "rewards/accuracies": 0.609375, "rewards/chosen": -0.6862295866012573, "rewards/margins": 0.5550753474235535, "rewards/rejected": -1.2413049936294556, "step": 119 }, { "epoch": 0.5670407560543415, "grad_norm": 2.18698230287775, "learning_rate": 4.202106932528928e-07, "logits/chosen": -2.6840288639068604, "logits/rejected": -2.7219927310943604, "logps/chosen": -349.6735534667969, "logps/rejected": -351.0025634765625, "loss": 0.5593, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9063121676445007, "rewards/margins": 0.5141624808311462, "rewards/rejected": -1.420474648475647, "step": 120 }, { "epoch": 0.5717660956881275, "grad_norm": 2.3004644364082734, "learning_rate": 4.1879920944341593e-07, "logits/chosen": -2.914316177368164, "logits/rejected": -2.9891955852508545, "logps/chosen": -279.8473205566406, "logps/rejected": -313.1243591308594, "loss": 0.544, "rewards/accuracies": 0.71875, "rewards/chosen": -0.754368245601654, "rewards/margins": 0.5700761079788208, "rewards/rejected": -1.3244441747665405, "step": 121 }, { "epoch": 0.5764914353219137, "grad_norm": 2.3085447219221504, "learning_rate": 4.1737776650391625e-07, "logits/chosen": -2.5704903602600098, "logits/rejected": -2.6333065032958984, "logps/chosen": -274.203369140625, "logps/rejected": -333.9256896972656, "loss": 0.5438, "rewards/accuracies": 0.765625, "rewards/chosen": -0.7395837903022766, "rewards/margins": 0.5535318851470947, "rewards/rejected": -1.2931156158447266, "step": 122 }, { "epoch": 0.5812167749556999, "grad_norm": 2.1555107869656234, "learning_rate": 4.1594644829933074e-07, "logits/chosen": -3.0889954566955566, "logits/rejected": -3.00903582572937, "logps/chosen": -287.2713623046875, "logps/rejected": -352.50244140625, "loss": 0.5432, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7744563221931458, "rewards/margins": 0.6194370985031128, "rewards/rejected": -1.3938933610916138, "step": 123 }, { "epoch": 0.5859421145894861, "grad_norm": 2.3124214479042147, "learning_rate": 4.1450533927723563e-07, "logits/chosen": -2.7649660110473633, "logits/rejected": -2.7645654678344727, "logps/chosen": -323.98663330078125, "logps/rejected": -377.0577697753906, "loss": 0.5562, "rewards/accuracies": 0.703125, "rewards/chosen": -0.9616256952285767, "rewards/margins": 0.6629120707511902, "rewards/rejected": -1.624537706375122, "step": 124 }, { "epoch": 0.5906674542232723, "grad_norm": 2.201123781604531, "learning_rate": 4.130545244628638e-07, "logits/chosen": -2.8170711994171143, "logits/rejected": -2.801412582397461, "logps/chosen": -284.6482849121094, "logps/rejected": -339.86328125, "loss": 0.5695, "rewards/accuracies": 0.625, "rewards/chosen": -0.9369969964027405, "rewards/margins": 0.6950640678405762, "rewards/rejected": -1.6320611238479614, "step": 125 }, { "epoch": 0.5953927938570585, "grad_norm": 2.4679791573291907, "learning_rate": 4.11594089454089e-07, "logits/chosen": -2.7288601398468018, "logits/rejected": -2.732513427734375, "logps/chosen": -331.1778259277344, "logps/rejected": -422.12139892578125, "loss": 0.566, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8497134447097778, "rewards/margins": 0.6027945280075073, "rewards/rejected": -1.4525080919265747, "step": 126 }, { "epoch": 0.6001181334908446, "grad_norm": 2.5170488926988073, "learning_rate": 4.101241204163748e-07, "logits/chosen": -2.66646671295166, "logits/rejected": -2.621904134750366, "logps/chosen": -301.6587829589844, "logps/rejected": -365.4261474609375, "loss": 0.5407, "rewards/accuracies": 0.625, "rewards/chosen": -0.9232099056243896, "rewards/margins": 0.4544805884361267, "rewards/rejected": -1.377690315246582, "step": 127 }, { "epoch": 0.6048434731246308, "grad_norm": 2.3673200269773718, "learning_rate": 4.086447040776911e-07, "logits/chosen": -3.108903169631958, "logits/rejected": -3.056070327758789, "logps/chosen": -276.7916259765625, "logps/rejected": -342.23876953125, "loss": 0.5449, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8404449820518494, "rewards/margins": 0.6569064855575562, "rewards/rejected": -1.4973516464233398, "step": 128 }, { "epoch": 0.609568812758417, "grad_norm": 3.2943893516301768, "learning_rate": 4.071559277233975e-07, "logits/chosen": -3.1619277000427246, "logits/rejected": -3.1500680446624756, "logps/chosen": -283.8644104003906, "logps/rejected": -360.0433044433594, "loss": 0.573, "rewards/accuracies": 0.609375, "rewards/chosen": -0.969115674495697, "rewards/margins": 0.5388011932373047, "rewards/rejected": -1.507916808128357, "step": 129 }, { "epoch": 0.609568812758417, "eval_logits/chosen": -2.9238109588623047, "eval_logits/rejected": -2.93009352684021, "eval_logps/chosen": -304.09130859375, "eval_logps/rejected": -360.09600830078125, "eval_loss": 0.5451335906982422, "eval_rewards/accuracies": 0.6174242496490479, "eval_rewards/chosen": -0.9286060333251953, "eval_rewards/margins": 0.6728550791740417, "eval_rewards/rejected": -1.6014612913131714, "eval_runtime": 226.5871, "eval_samples_per_second": 16.135, "eval_steps_per_second": 0.291, "step": 129 }, { "epoch": 0.6142941523922032, "grad_norm": 2.3717766371975086, "learning_rate": 4.05657879191093e-07, "logits/chosen": -2.86085844039917, "logits/rejected": -3.0779123306274414, "logps/chosen": -353.9771423339844, "logps/rejected": -372.76788330078125, "loss": 0.5611, "rewards/accuracies": 0.59375, "rewards/chosen": -0.9203246235847473, "rewards/margins": 0.6556491851806641, "rewards/rejected": -1.575973629951477, "step": 130 }, { "epoch": 0.6190194920259894, "grad_norm": 2.9558527636533025, "learning_rate": 4.04150646865434e-07, "logits/chosen": -2.6664750576019287, "logits/rejected": -2.6207780838012695, "logps/chosen": -283.3083801269531, "logps/rejected": -348.6871032714844, "loss": 0.5525, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8656440377235413, "rewards/margins": 0.7297619581222534, "rewards/rejected": -1.5954060554504395, "step": 131 }, { "epoch": 0.6237448316597756, "grad_norm": 2.7269741124418165, "learning_rate": 4.0263431967291934e-07, "logits/chosen": -2.8708412647247314, "logits/rejected": -2.836123466491699, "logps/chosen": -251.37966918945312, "logps/rejected": -302.320068359375, "loss": 0.5554, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8969355821609497, "rewards/margins": 0.4451577365398407, "rewards/rejected": -1.3420933485031128, "step": 132 }, { "epoch": 0.6284701712935618, "grad_norm": 2.3559260676043547, "learning_rate": 4.011089870766437e-07, "logits/chosen": -2.9185516834259033, "logits/rejected": -3.0862460136413574, "logps/chosen": -340.1504211425781, "logps/rejected": -376.01629638671875, "loss": 0.5223, "rewards/accuracies": 0.75, "rewards/chosen": -0.8409585952758789, "rewards/margins": 0.9160802364349365, "rewards/rejected": -1.7570387125015259, "step": 133 }, { "epoch": 0.6331955109273479, "grad_norm": 2.551925501157886, "learning_rate": 3.995747390710196e-07, "logits/chosen": -2.895480155944824, "logits/rejected": -2.939868211746216, "logps/chosen": -323.12164306640625, "logps/rejected": -397.951171875, "loss": 0.5385, "rewards/accuracies": 0.703125, "rewards/chosen": -1.1145464181900024, "rewards/margins": 1.0304715633392334, "rewards/rejected": -2.1450178623199463, "step": 134 }, { "epoch": 0.6379208505611341, "grad_norm": 3.0639374327823625, "learning_rate": 3.98031666176467e-07, "logits/chosen": -3.221116781234741, "logits/rejected": -3.125380516052246, "logps/chosen": -277.20684814453125, "logps/rejected": -368.4245300292969, "loss": 0.5453, "rewards/accuracies": 0.625, "rewards/chosen": -0.9265705943107605, "rewards/margins": 0.8099436163902283, "rewards/rejected": -1.7365142107009888, "step": 135 }, { "epoch": 0.6426461901949203, "grad_norm": 3.0229894732050537, "learning_rate": 3.9647985943407345e-07, "logits/chosen": -2.7229156494140625, "logits/rejected": -2.624408006668091, "logps/chosen": -296.76507568359375, "logps/rejected": -366.4635925292969, "loss": 0.5446, "rewards/accuracies": 0.671875, "rewards/chosen": -0.911621630191803, "rewards/margins": 0.3694719672203064, "rewards/rejected": -1.2810935974121094, "step": 136 }, { "epoch": 0.6473715298287065, "grad_norm": 2.777240365107515, "learning_rate": 3.949194104002224e-07, "logits/chosen": -3.008553981781006, "logits/rejected": -3.0245308876037598, "logps/chosen": -278.2191162109375, "logps/rejected": -372.57672119140625, "loss": 0.5333, "rewards/accuracies": 0.734375, "rewards/chosen": -0.9170963168144226, "rewards/margins": 0.91133052110672, "rewards/rejected": -1.8284270763397217, "step": 137 }, { "epoch": 0.6520968694624926, "grad_norm": 2.5983010643513222, "learning_rate": 3.93350411141191e-07, "logits/chosen": -2.984111785888672, "logits/rejected": -2.9940693378448486, "logps/chosen": -256.7328796386719, "logps/rejected": -304.14361572265625, "loss": 0.5481, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8156505227088928, "rewards/margins": 0.6031564474105835, "rewards/rejected": -1.418807029724121, "step": 138 }, { "epoch": 0.6568222090962788, "grad_norm": 2.758553605218817, "learning_rate": 3.917729542277187e-07, "logits/chosen": -2.739635944366455, "logits/rejected": -2.8373708724975586, "logps/chosen": -355.6564025878906, "logps/rejected": -444.02471923828125, "loss": 0.5188, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9943738579750061, "rewards/margins": 0.8635731339454651, "rewards/rejected": -1.8579471111297607, "step": 139 }, { "epoch": 0.6615475487300649, "grad_norm": 2.5137621293297605, "learning_rate": 3.901871327295453e-07, "logits/chosen": -2.6592538356781006, "logits/rejected": -2.8847031593322754, "logps/chosen": -330.49609375, "logps/rejected": -349.5611572265625, "loss": 0.5406, "rewards/accuracies": 0.640625, "rewards/chosen": -0.8463073968887329, "rewards/margins": 0.6520651578903198, "rewards/rejected": -1.4983725547790527, "step": 140 }, { "epoch": 0.6662728883638511, "grad_norm": 2.285013562717476, "learning_rate": 3.885930402099199e-07, "logits/chosen": -2.5882949829101562, "logits/rejected": -2.6650753021240234, "logps/chosen": -335.5899658203125, "logps/rejected": -360.06451416015625, "loss": 0.5074, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9886775016784668, "rewards/margins": 0.6825906038284302, "rewards/rejected": -1.6712682247161865, "step": 141 }, { "epoch": 0.6709982279976373, "grad_norm": 2.554720405760107, "learning_rate": 3.8699077072008085e-07, "logits/chosen": -2.8670525550842285, "logits/rejected": -2.866511344909668, "logps/chosen": -271.18243408203125, "logps/rejected": -343.53582763671875, "loss": 0.5163, "rewards/accuracies": 0.578125, "rewards/chosen": -1.0047590732574463, "rewards/margins": 0.5558298826217651, "rewards/rejected": -1.5605889558792114, "step": 142 }, { "epoch": 0.6757235676314235, "grad_norm": 2.830180877879511, "learning_rate": 3.8538041879370657e-07, "logits/chosen": -3.037707567214966, "logits/rejected": -3.063495397567749, "logps/chosen": -347.0140380859375, "logps/rejected": -392.6544494628906, "loss": 0.5505, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9989163875579834, "rewards/margins": 0.8129914402961731, "rewards/rejected": -1.8119077682495117, "step": 143 }, { "epoch": 0.6804489072652097, "grad_norm": 2.6000164530484224, "learning_rate": 3.8376207944133817e-07, "logits/chosen": -3.087387800216675, "logits/rejected": -3.079148530960083, "logps/chosen": -316.996337890625, "logps/rejected": -361.9954528808594, "loss": 0.5365, "rewards/accuracies": 0.671875, "rewards/chosen": -1.111214518547058, "rewards/margins": 0.5254876017570496, "rewards/rejected": -1.636702060699463, "step": 144 }, { "epoch": 0.6851742468989959, "grad_norm": 2.6115954536025408, "learning_rate": 3.8213584814477363e-07, "logits/chosen": -3.111316442489624, "logits/rejected": -3.184953451156616, "logps/chosen": -311.6522521972656, "logps/rejected": -325.0541076660156, "loss": 0.4982, "rewards/accuracies": 0.734375, "rewards/chosen": -0.9516675472259521, "rewards/margins": 0.6691429018974304, "rewards/rejected": -1.6208105087280273, "step": 145 }, { "epoch": 0.689899586532782, "grad_norm": 2.976191491134426, "learning_rate": 3.8050182085143464e-07, "logits/chosen": -2.9731078147888184, "logits/rejected": -3.072920083999634, "logps/chosen": -309.2756042480469, "logps/rejected": -352.2410888671875, "loss": 0.5572, "rewards/accuracies": 0.78125, "rewards/chosen": -1.080316185951233, "rewards/margins": 0.8104506134986877, "rewards/rejected": -1.8907668590545654, "step": 146 }, { "epoch": 0.6946249261665682, "grad_norm": 2.837971176112728, "learning_rate": 3.7886009396870564e-07, "logits/chosen": -2.8793129920959473, "logits/rejected": -2.8820691108703613, "logps/chosen": -288.26116943359375, "logps/rejected": -363.0924072265625, "loss": 0.5536, "rewards/accuracies": 0.640625, "rewards/chosen": -1.1091738939285278, "rewards/margins": 0.493495374917984, "rewards/rejected": -1.602669358253479, "step": 147 }, { "epoch": 0.6993502658003544, "grad_norm": 2.928245357032695, "learning_rate": 3.7721076435824585e-07, "logits/chosen": -2.7040960788726807, "logits/rejected": -2.8614137172698975, "logps/chosen": -415.0562438964844, "logps/rejected": -420.36297607421875, "loss": 0.5321, "rewards/accuracies": 0.671875, "rewards/chosen": -1.0266618728637695, "rewards/margins": 0.8255325555801392, "rewards/rejected": -1.8521945476531982, "step": 148 }, { "epoch": 0.7040756054341406, "grad_norm": 2.983452098073201, "learning_rate": 3.755539293302742e-07, "logits/chosen": -2.614259958267212, "logits/rejected": -2.6951889991760254, "logps/chosen": -352.3388366699219, "logps/rejected": -375.2248840332031, "loss": 0.5457, "rewards/accuracies": 0.625, "rewards/chosen": -1.1011898517608643, "rewards/margins": 0.613980233669281, "rewards/rejected": -1.715169906616211, "step": 149 }, { "epoch": 0.7088009450679268, "grad_norm": 2.6652814589714997, "learning_rate": 3.738896866378282e-07, "logits/chosen": -2.7506563663482666, "logits/rejected": -2.690138339996338, "logps/chosen": -322.7042236328125, "logps/rejected": -373.8511047363281, "loss": 0.516, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9171172380447388, "rewards/margins": 0.8513570427894592, "rewards/rejected": -1.7684742212295532, "step": 150 }, { "epoch": 0.713526284701713, "grad_norm": 2.5934419246508096, "learning_rate": 3.722181344709969e-07, "logits/chosen": -2.7295525074005127, "logits/rejected": -2.855721950531006, "logps/chosen": -332.5438232421875, "logps/rejected": -359.502197265625, "loss": 0.5121, "rewards/accuracies": 0.75, "rewards/chosen": -0.9578840136528015, "rewards/margins": 0.7636557221412659, "rewards/rejected": -1.7215397357940674, "step": 151 }, { "epoch": 0.7182516243354992, "grad_norm": 3.1206430500674975, "learning_rate": 3.705393714511268e-07, "logits/chosen": -2.845468282699585, "logits/rejected": -2.7837162017822266, "logps/chosen": -314.1913757324219, "logps/rejected": -419.7186584472656, "loss": 0.5321, "rewards/accuracies": 0.640625, "rewards/chosen": -0.9525049924850464, "rewards/margins": 0.6129012703895569, "rewards/rejected": -1.5654062032699585, "step": 152 }, { "epoch": 0.7229769639692853, "grad_norm": 3.3791075672510336, "learning_rate": 3.688534966250042e-07, "logits/chosen": -3.007288932800293, "logits/rejected": -2.9382283687591553, "logps/chosen": -297.81622314453125, "logps/rejected": -355.0559387207031, "loss": 0.54, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8055727481842041, "rewards/margins": 0.7948654890060425, "rewards/rejected": -1.6004382371902466, "step": 153 }, { "epoch": 0.7277023036030714, "grad_norm": 2.7374735457469384, "learning_rate": 3.671606094590108e-07, "logits/chosen": -2.7088348865509033, "logits/rejected": -2.7453291416168213, "logps/chosen": -337.4541931152344, "logps/rejected": -411.91485595703125, "loss": 0.5429, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0184537172317505, "rewards/margins": 0.8295416235923767, "rewards/rejected": -1.8479952812194824, "step": 154 }, { "epoch": 0.7324276432368576, "grad_norm": 2.977388710936637, "learning_rate": 3.6546080983325523e-07, "logits/chosen": -2.824364185333252, "logits/rejected": -2.911698579788208, "logps/chosen": -308.72161865234375, "logps/rejected": -312.67431640625, "loss": 0.5497, "rewards/accuracies": 0.5625, "rewards/chosen": -1.054483413696289, "rewards/margins": 0.345467209815979, "rewards/rejected": -1.399950623512268, "step": 155 }, { "epoch": 0.7371529828706438, "grad_norm": 3.297987493267062, "learning_rate": 3.6375419803568046e-07, "logits/chosen": -2.938750743865967, "logits/rejected": -3.12616229057312, "logps/chosen": -380.7288513183594, "logps/rejected": -393.94683837890625, "loss": 0.5366, "rewards/accuracies": 0.703125, "rewards/chosen": -1.0641953945159912, "rewards/margins": 0.899326741695404, "rewards/rejected": -1.96352219581604, "step": 156 }, { "epoch": 0.74187832250443, "grad_norm": 2.5345279316698983, "learning_rate": 3.6204087475614676e-07, "logits/chosen": -2.923267364501953, "logits/rejected": -2.8579440116882324, "logps/chosen": -302.9458923339844, "logps/rejected": -387.598388671875, "loss": 0.5246, "rewards/accuracies": 0.703125, "rewards/chosen": -1.0088130235671997, "rewards/margins": 0.8763782382011414, "rewards/rejected": -1.8851913213729858, "step": 157 }, { "epoch": 0.7466036621382162, "grad_norm": 3.13993186485945, "learning_rate": 3.603209410804906e-07, "logits/chosen": -2.7970800399780273, "logits/rejected": -2.77022123336792, "logps/chosen": -260.28485107421875, "logps/rejected": -377.06585693359375, "loss": 0.5166, "rewards/accuracies": 0.75, "rewards/chosen": -0.9031432867050171, "rewards/margins": 0.9541431665420532, "rewards/rejected": -1.8572864532470703, "step": 158 }, { "epoch": 0.7513290017720023, "grad_norm": 2.777966751571199, "learning_rate": 3.5859449848456123e-07, "logits/chosen": -2.83420991897583, "logits/rejected": -2.9197449684143066, "logps/chosen": -270.993896484375, "logps/rejected": -329.7164001464844, "loss": 0.5438, "rewards/accuracies": 0.609375, "rewards/chosen": -0.9526958465576172, "rewards/margins": 0.5992559194564819, "rewards/rejected": -1.5519516468048096, "step": 159 }, { "epoch": 0.7560543414057885, "grad_norm": 2.816810414145774, "learning_rate": 3.5686164882823313e-07, "logits/chosen": -2.4739251136779785, "logits/rejected": -2.5660862922668457, "logps/chosen": -325.5205383300781, "logps/rejected": -357.9018859863281, "loss": 0.5207, "rewards/accuracies": 0.734375, "rewards/chosen": -1.059773564338684, "rewards/margins": 0.855665922164917, "rewards/rejected": -1.9154393672943115, "step": 160 }, { "epoch": 0.7607796810395747, "grad_norm": 2.7264976697102217, "learning_rate": 3.5512249434939634e-07, "logits/chosen": -3.020364284515381, "logits/rejected": -3.1138038635253906, "logps/chosen": -292.44451904296875, "logps/rejected": -370.27581787109375, "loss": 0.5324, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0052980184555054, "rewards/margins": 0.916731059551239, "rewards/rejected": -1.9220290184020996, "step": 161 }, { "epoch": 0.7655050206733609, "grad_norm": 4.143376503088087, "learning_rate": 3.533771376579249e-07, "logits/chosen": -2.8414347171783447, "logits/rejected": -2.8189237117767334, "logps/chosen": -298.5760192871094, "logps/rejected": -407.37457275390625, "loss": 0.5111, "rewards/accuracies": 0.734375, "rewards/chosen": -0.9188227653503418, "rewards/margins": 1.0365185737609863, "rewards/rejected": -1.9553413391113281, "step": 162 }, { "epoch": 0.7702303603071471, "grad_norm": 2.851627509613439, "learning_rate": 3.5162568172962215e-07, "logits/chosen": -2.737412214279175, "logits/rejected": -2.8929431438446045, "logps/chosen": -325.90631103515625, "logps/rejected": -372.39410400390625, "loss": 0.5193, "rewards/accuracies": 0.625, "rewards/chosen": -1.134871244430542, "rewards/margins": 0.6576811075210571, "rewards/rejected": -1.7925523519515991, "step": 163 }, { "epoch": 0.7749556999409333, "grad_norm": 3.4081328480353985, "learning_rate": 3.498682299001459e-07, "logits/chosen": -2.622042179107666, "logits/rejected": -2.759326696395874, "logps/chosen": -354.53448486328125, "logps/rejected": -397.6759338378906, "loss": 0.5084, "rewards/accuracies": 0.640625, "rewards/chosen": -1.1062355041503906, "rewards/margins": 0.7745749354362488, "rewards/rejected": -1.8808104991912842, "step": 164 }, { "epoch": 0.7796810395747195, "grad_norm": 5.113522770893731, "learning_rate": 3.4810488585891103e-07, "logits/chosen": -2.7235350608825684, "logits/rejected": -2.8394298553466797, "logps/chosen": -339.3172912597656, "logps/rejected": -403.5739440917969, "loss": 0.5256, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0079370737075806, "rewards/margins": 0.9361110925674438, "rewards/rejected": -1.9440481662750244, "step": 165 }, { "epoch": 0.7844063792085056, "grad_norm": 2.845383053447366, "learning_rate": 3.4633575364297224e-07, "logits/chosen": -3.0904507637023926, "logits/rejected": -3.1447291374206543, "logps/chosen": -308.4082336425781, "logps/rejected": -403.6357421875, "loss": 0.5258, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1904851198196411, "rewards/margins": 1.1804500818252563, "rewards/rejected": -2.3709352016448975, "step": 166 }, { "epoch": 0.7891317188422918, "grad_norm": 3.6331311614486483, "learning_rate": 3.445609376308857e-07, "logits/chosen": -2.7289986610412598, "logits/rejected": -2.64235520362854, "logps/chosen": -351.40728759765625, "logps/rejected": -435.42010498046875, "loss": 0.4979, "rewards/accuracies": 0.734375, "rewards/chosen": -1.273624300956726, "rewards/margins": 1.0576740503311157, "rewards/rejected": -2.331298351287842, "step": 167 }, { "epoch": 0.793857058476078, "grad_norm": 3.6152830110317655, "learning_rate": 3.4278054253655086e-07, "logits/chosen": -2.8024775981903076, "logits/rejected": -2.853891611099243, "logps/chosen": -322.92181396484375, "logps/rejected": -396.2481689453125, "loss": 0.5094, "rewards/accuracies": 0.671875, "rewards/chosen": -1.0655136108398438, "rewards/margins": 0.8890769481658936, "rewards/rejected": -1.9545905590057373, "step": 168 }, { "epoch": 0.7985823981098642, "grad_norm": 3.119357172634371, "learning_rate": 3.4099467340303214e-07, "logits/chosen": -3.0272624492645264, "logits/rejected": -3.140334129333496, "logps/chosen": -307.60479736328125, "logps/rejected": -376.73590087890625, "loss": 0.5064, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1390501260757446, "rewards/margins": 0.9899504780769348, "rewards/rejected": -2.129000425338745, "step": 169 }, { "epoch": 0.8033077377436503, "grad_norm": 3.457360676304788, "learning_rate": 3.392034355963614e-07, "logits/chosen": -2.8242180347442627, "logits/rejected": -2.8203928470611572, "logps/chosen": -338.4170837402344, "logps/rejected": -375.84893798828125, "loss": 0.5257, "rewards/accuracies": 0.6875, "rewards/chosen": -1.182822346687317, "rewards/margins": 0.7907478213310242, "rewards/rejected": -1.9735702276229858, "step": 170 }, { "epoch": 0.8080330773774365, "grad_norm": 5.011470232701305, "learning_rate": 3.374069347993218e-07, "logits/chosen": -2.6921019554138184, "logits/rejected": -2.7788760662078857, "logps/chosen": -329.470703125, "logps/rejected": -467.9057922363281, "loss": 0.5396, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1266531944274902, "rewards/margins": 0.8941323757171631, "rewards/rejected": -2.0207855701446533, "step": 171 }, { "epoch": 0.8127584170112226, "grad_norm": 3.7253097168704294, "learning_rate": 3.356052770052119e-07, "logits/chosen": -2.716782331466675, "logits/rejected": -2.6556971073150635, "logps/chosen": -309.70672607421875, "logps/rejected": -426.3639221191406, "loss": 0.5239, "rewards/accuracies": 0.625, "rewards/chosen": -1.2285258769989014, "rewards/margins": 0.8051817417144775, "rewards/rejected": -2.033707618713379, "step": 172 }, { "epoch": 0.8127584170112226, "eval_logits/chosen": -2.9802942276000977, "eval_logits/rejected": -2.9883527755737305, "eval_logps/chosen": -339.8587646484375, "eval_logps/rejected": -423.5323791503906, "eval_loss": 0.51226407289505, "eval_rewards/accuracies": 0.6287878751754761, "eval_rewards/chosen": -1.2862800359725952, "eval_rewards/margins": 0.9495444893836975, "eval_rewards/rejected": -2.2358245849609375, "eval_runtime": 225.4382, "eval_samples_per_second": 16.217, "eval_steps_per_second": 0.293, "step": 172 }, { "epoch": 0.8174837566450088, "grad_norm": 4.039346342004204, "learning_rate": 3.337985685115926e-07, "logits/chosen": -2.9982471466064453, "logits/rejected": -2.934654712677002, "logps/chosen": -345.1435546875, "logps/rejected": -418.8468017578125, "loss": 0.5227, "rewards/accuracies": 0.703125, "rewards/chosen": -1.2590099573135376, "rewards/margins": 0.7078821659088135, "rewards/rejected": -1.9668920040130615, "step": 173 }, { "epoch": 0.822209096278795, "grad_norm": 3.355839857834348, "learning_rate": 3.319869159140152e-07, "logits/chosen": -2.412257432937622, "logits/rejected": -2.5528626441955566, "logps/chosen": -313.64422607421875, "logps/rejected": -387.2737121582031, "loss": 0.5154, "rewards/accuracies": 0.75, "rewards/chosen": -1.2666703462600708, "rewards/margins": 0.9918266534805298, "rewards/rejected": -2.2584969997406006, "step": 174 }, { "epoch": 0.8269344359125812, "grad_norm": 3.8883187343016528, "learning_rate": 3.301704260997325e-07, "logits/chosen": -2.835768222808838, "logits/rejected": -2.861588716506958, "logps/chosen": -310.4635314941406, "logps/rejected": -418.094970703125, "loss": 0.5263, "rewards/accuracies": 0.71875, "rewards/chosen": -1.149375557899475, "rewards/margins": 1.2631374597549438, "rewards/rejected": -2.41251277923584, "step": 175 }, { "epoch": 0.8316597755463674, "grad_norm": 3.943606898054145, "learning_rate": 3.283492062413925e-07, "logits/chosen": -2.8773105144500732, "logits/rejected": -2.919630527496338, "logps/chosen": -315.5, "logps/rejected": -414.37908935546875, "loss": 0.5139, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2905051708221436, "rewards/margins": 1.0659555196762085, "rewards/rejected": -2.3564605712890625, "step": 176 }, { "epoch": 0.8363851151801536, "grad_norm": 4.394795298842495, "learning_rate": 3.2652336379071506e-07, "logits/chosen": -2.7635695934295654, "logits/rejected": -2.8052563667297363, "logps/chosen": -371.2409973144531, "logps/rejected": -429.0415954589844, "loss": 0.4828, "rewards/accuracies": 0.625, "rewards/chosen": -1.2244281768798828, "rewards/margins": 1.045049786567688, "rewards/rejected": -2.2694778442382812, "step": 177 }, { "epoch": 0.8411104548139398, "grad_norm": 3.6672724994810593, "learning_rate": 3.246930064721523e-07, "logits/chosen": -2.7490479946136475, "logits/rejected": -2.6954996585845947, "logps/chosen": -265.0568542480469, "logps/rejected": -373.4683532714844, "loss": 0.5134, "rewards/accuracies": 0.734375, "rewards/chosen": -1.1422216892242432, "rewards/margins": 1.010473370552063, "rewards/rejected": -2.1526949405670166, "step": 178 }, { "epoch": 0.8458357944477259, "grad_norm": 3.438810171868027, "learning_rate": 3.228582422765331e-07, "logits/chosen": -2.661006212234497, "logits/rejected": -2.756809711456299, "logps/chosen": -363.7588806152344, "logps/rejected": -392.8025207519531, "loss": 0.5044, "rewards/accuracies": 0.609375, "rewards/chosen": -1.341508388519287, "rewards/margins": 0.7005224227905273, "rewards/rejected": -2.0420308113098145, "step": 179 }, { "epoch": 0.8505611340815121, "grad_norm": 4.113182426182398, "learning_rate": 3.2101917945469135e-07, "logits/chosen": -2.700942277908325, "logits/rejected": -2.82185435295105, "logps/chosen": -364.240234375, "logps/rejected": -429.81292724609375, "loss": 0.4816, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1254488229751587, "rewards/margins": 0.9122301340103149, "rewards/rejected": -2.0376789569854736, "step": 180 }, { "epoch": 0.8552864737152983, "grad_norm": 4.022329679746118, "learning_rate": 3.1917592651107927e-07, "logits/chosen": -2.8973255157470703, "logits/rejected": -2.794524908065796, "logps/chosen": -352.9599914550781, "logps/rejected": -422.71197509765625, "loss": 0.5153, "rewards/accuracies": 0.734375, "rewards/chosen": -1.2179690599441528, "rewards/margins": 0.7948740124702454, "rewards/rejected": -2.012843132019043, "step": 181 }, { "epoch": 0.8600118133490845, "grad_norm": 3.7009474178127353, "learning_rate": 3.173285921973657e-07, "logits/chosen": -2.793835401535034, "logits/rejected": -2.7703464031219482, "logps/chosen": -320.6808776855469, "logps/rejected": -454.6962890625, "loss": 0.512, "rewards/accuracies": 0.765625, "rewards/chosen": -1.1430678367614746, "rewards/margins": 1.6093158721923828, "rewards/rejected": -2.7523837089538574, "step": 182 }, { "epoch": 0.8647371529828707, "grad_norm": 3.2328530891675182, "learning_rate": 3.1547728550601983e-07, "logits/chosen": -2.6808881759643555, "logits/rejected": -2.676565170288086, "logps/chosen": -301.3103332519531, "logps/rejected": -402.32281494140625, "loss": 0.5161, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1162338256835938, "rewards/margins": 1.0167958736419678, "rewards/rejected": -2.1330299377441406, "step": 183 }, { "epoch": 0.8694624926166569, "grad_norm": 3.899851002215307, "learning_rate": 3.1362211566388057e-07, "logits/chosen": -2.8450677394866943, "logits/rejected": -2.836167097091675, "logps/chosen": -341.335693359375, "logps/rejected": -407.4953918457031, "loss": 0.5422, "rewards/accuracies": 0.671875, "rewards/chosen": -1.2810035943984985, "rewards/margins": 0.6536043882369995, "rewards/rejected": -1.934607982635498, "step": 184 }, { "epoch": 0.874187832250443, "grad_norm": 3.2712338921521518, "learning_rate": 3.1176319212571204e-07, "logits/chosen": -2.405541181564331, "logits/rejected": -2.4331328868865967, "logps/chosen": -288.65325927734375, "logps/rejected": -364.7303466796875, "loss": 0.5114, "rewards/accuracies": 0.625, "rewards/chosen": -1.2161575555801392, "rewards/margins": 0.9309273362159729, "rewards/rejected": -2.147084951400757, "step": 185 }, { "epoch": 0.8789131718842291, "grad_norm": 3.8818256203788852, "learning_rate": 3.099006245677461e-07, "logits/chosen": -2.4217336177825928, "logits/rejected": -2.360973358154297, "logps/chosen": -381.958740234375, "logps/rejected": -487.387451171875, "loss": 0.5393, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2569398880004883, "rewards/margins": 0.7359199523925781, "rewards/rejected": -1.9928598403930664, "step": 186 }, { "epoch": 0.8836385115180153, "grad_norm": 3.241850575925625, "learning_rate": 3.0803452288121113e-07, "logits/chosen": -2.6186816692352295, "logits/rejected": -2.5035552978515625, "logps/chosen": -356.3555603027344, "logps/rejected": -538.8609619140625, "loss": 0.4985, "rewards/accuracies": 0.78125, "rewards/chosen": -1.2594354152679443, "rewards/margins": 1.667135238647461, "rewards/rejected": -2.9265708923339844, "step": 187 }, { "epoch": 0.8883638511518015, "grad_norm": 2.994070703824197, "learning_rate": 3.0616499716584874e-07, "logits/chosen": -2.936795473098755, "logits/rejected": -2.855689525604248, "logps/chosen": -339.5811767578125, "logps/rejected": -430.01483154296875, "loss": 0.5116, "rewards/accuracies": 0.625, "rewards/chosen": -1.1258800029754639, "rewards/margins": 0.6621576547622681, "rewards/rejected": -1.7880375385284424, "step": 188 }, { "epoch": 0.8930891907855877, "grad_norm": 3.4626212317934146, "learning_rate": 3.042921577234177e-07, "logits/chosen": -2.6947526931762695, "logits/rejected": -2.8166098594665527, "logps/chosen": -328.1815185546875, "logps/rejected": -340.3358154296875, "loss": 0.5048, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0728776454925537, "rewards/margins": 0.6414874792098999, "rewards/rejected": -1.7143651247024536, "step": 189 }, { "epoch": 0.8978145304193739, "grad_norm": 3.1647602539221182, "learning_rate": 3.024161150511861e-07, "logits/chosen": -2.9085636138916016, "logits/rejected": -3.0228068828582764, "logps/chosen": -323.0364074707031, "logps/rejected": -358.6117248535156, "loss": 0.5248, "rewards/accuracies": 0.703125, "rewards/chosen": -1.1753543615341187, "rewards/margins": 0.9408324956893921, "rewards/rejected": -2.1161868572235107, "step": 190 }, { "epoch": 0.90253987005316, "grad_norm": 3.025256770279786, "learning_rate": 3.0053697983541247e-07, "logits/chosen": -2.545339345932007, "logits/rejected": -2.5873234272003174, "logps/chosen": -364.4934387207031, "logps/rejected": -404.4205322265625, "loss": 0.5114, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3082777261734009, "rewards/margins": 0.9447382688522339, "rewards/rejected": -2.2530159950256348, "step": 191 }, { "epoch": 0.9072652096869462, "grad_norm": 3.2107060614205682, "learning_rate": 2.986548629448146e-07, "logits/chosen": -2.5320749282836914, "logits/rejected": -2.6858551502227783, "logps/chosen": -357.6763916015625, "logps/rejected": -418.71173095703125, "loss": 0.5194, "rewards/accuracies": 0.765625, "rewards/chosen": -1.0752955675125122, "rewards/margins": 1.176898717880249, "rewards/rejected": -2.252194404602051, "step": 192 }, { "epoch": 0.9119905493207324, "grad_norm": 4.290813644018959, "learning_rate": 2.967698754240289e-07, "logits/chosen": -2.6695892810821533, "logits/rejected": -2.660761833190918, "logps/chosen": -347.8155517578125, "logps/rejected": -403.8181457519531, "loss": 0.5543, "rewards/accuracies": 0.625, "rewards/chosen": -1.207077145576477, "rewards/margins": 0.7051151394844055, "rewards/rejected": -1.9121922254562378, "step": 193 }, { "epoch": 0.9167158889545186, "grad_norm": 3.656365041410898, "learning_rate": 2.948821284870585e-07, "logits/chosen": -3.009221315383911, "logits/rejected": -2.9382755756378174, "logps/chosen": -338.1603698730469, "logps/rejected": -425.35931396484375, "loss": 0.5007, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3645328283309937, "rewards/margins": 0.9410390257835388, "rewards/rejected": -2.305572032928467, "step": 194 }, { "epoch": 0.9214412285883048, "grad_norm": 3.166267065915439, "learning_rate": 2.9299173351071176e-07, "logits/chosen": -2.544590950012207, "logits/rejected": -2.5580813884735107, "logps/chosen": -375.34112548828125, "logps/rejected": -437.5019836425781, "loss": 0.5057, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1270973682403564, "rewards/margins": 0.8198693990707397, "rewards/rejected": -1.9469666481018066, "step": 195 }, { "epoch": 0.926166568222091, "grad_norm": 2.9416082082853805, "learning_rate": 2.9109880202803097e-07, "logits/chosen": -2.5898144245147705, "logits/rejected": -2.590017795562744, "logps/chosen": -325.3600769042969, "logps/rejected": -417.5689392089844, "loss": 0.5251, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1922552585601807, "rewards/margins": 1.2383495569229126, "rewards/rejected": -2.4306044578552246, "step": 196 }, { "epoch": 0.9308919078558772, "grad_norm": 4.344454107439186, "learning_rate": 2.892034457217119e-07, "logits/chosen": -2.793138027191162, "logits/rejected": -2.6765213012695312, "logps/chosen": -393.07080078125, "logps/rejected": -508.9129638671875, "loss": 0.5302, "rewards/accuracies": 0.734375, "rewards/chosen": -1.202684998512268, "rewards/margins": 1.2748346328735352, "rewards/rejected": -2.4775197505950928, "step": 197 }, { "epoch": 0.9356172474896634, "grad_norm": 4.341995602773921, "learning_rate": 2.8730577641751474e-07, "logits/chosen": -2.5986645221710205, "logits/rejected": -2.7119102478027344, "logps/chosen": -301.242431640625, "logps/rejected": -423.28619384765625, "loss": 0.5145, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1418884992599487, "rewards/margins": 1.2161611318588257, "rewards/rejected": -2.3580498695373535, "step": 198 }, { "epoch": 0.9403425871234495, "grad_norm": 3.5092494052777536, "learning_rate": 2.854059060776659e-07, "logits/chosen": -2.4758505821228027, "logits/rejected": -2.5398991107940674, "logps/chosen": -311.9541015625, "logps/rejected": -413.2162780761719, "loss": 0.4867, "rewards/accuracies": 0.8125, "rewards/chosen": -1.084080457687378, "rewards/margins": 1.512494683265686, "rewards/rejected": -2.5965750217437744, "step": 199 }, { "epoch": 0.9450679267572357, "grad_norm": 3.5187855585130294, "learning_rate": 2.835039467942529e-07, "logits/chosen": -2.5920052528381348, "logits/rejected": -2.7105183601379395, "logps/chosen": -340.66351318359375, "logps/rejected": -416.5780944824219, "loss": 0.5222, "rewards/accuracies": 0.75, "rewards/chosen": -1.0971770286560059, "rewards/margins": 0.9351829290390015, "rewards/rejected": -2.032360315322876, "step": 200 }, { "epoch": 0.9497932663910219, "grad_norm": 3.793717608121411, "learning_rate": 2.8160001078261055e-07, "logits/chosen": -2.498663902282715, "logits/rejected": -2.4728269577026367, "logps/chosen": -301.9998474121094, "logps/rejected": -426.3241271972656, "loss": 0.5251, "rewards/accuracies": 0.6875, "rewards/chosen": -1.031512975692749, "rewards/margins": 1.083221197128296, "rewards/rejected": -2.114734172821045, "step": 201 }, { "epoch": 0.954518606024808, "grad_norm": 2.9737105768220244, "learning_rate": 2.7969421037470033e-07, "logits/chosen": -2.7715539932250977, "logits/rejected": -2.7849650382995605, "logps/chosen": -345.68963623046875, "logps/rejected": -444.9283752441406, "loss": 0.5044, "rewards/accuracies": 0.609375, "rewards/chosen": -1.262139081954956, "rewards/margins": 0.9420502185821533, "rewards/rejected": -2.2041893005371094, "step": 202 }, { "epoch": 0.9592439456585942, "grad_norm": 3.4484596414693893, "learning_rate": 2.777866580124829e-07, "logits/chosen": -2.491079330444336, "logits/rejected": -2.7043981552124023, "logps/chosen": -395.48211669921875, "logps/rejected": -463.1803894042969, "loss": 0.5037, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2879524230957031, "rewards/margins": 0.926118791103363, "rewards/rejected": -2.214071273803711, "step": 203 }, { "epoch": 0.9639692852923804, "grad_norm": 3.553922161828863, "learning_rate": 2.758774662412838e-07, "logits/chosen": -2.5517563819885254, "logits/rejected": -2.4807627201080322, "logps/chosen": -305.34637451171875, "logps/rejected": -486.2059631347656, "loss": 0.5027, "rewards/accuracies": 0.734375, "rewards/chosen": -1.346076250076294, "rewards/margins": 1.699803352355957, "rewards/rejected": -3.04587984085083, "step": 204 }, { "epoch": 0.9686946249261665, "grad_norm": 3.191519565656703, "learning_rate": 2.739667477031538e-07, "logits/chosen": -2.7191619873046875, "logits/rejected": -2.7111451625823975, "logps/chosen": -344.4493408203125, "logps/rejected": -480.70135498046875, "loss": 0.4982, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2324879169464111, "rewards/margins": 1.2888904809951782, "rewards/rejected": -2.521378517150879, "step": 205 }, { "epoch": 0.9734199645599527, "grad_norm": 4.672920315021906, "learning_rate": 2.7205461513022233e-07, "logits/chosen": -2.0949220657348633, "logits/rejected": -2.1367533206939697, "logps/chosen": -390.1219482421875, "logps/rejected": -415.38037109375, "loss": 0.4815, "rewards/accuracies": 0.765625, "rewards/chosen": -1.0024380683898926, "rewards/margins": 0.9189928770065308, "rewards/rejected": -1.9214308261871338, "step": 206 }, { "epoch": 0.9781453041937389, "grad_norm": 3.5306370227740262, "learning_rate": 2.70141181338047e-07, "logits/chosen": -2.406205177307129, "logits/rejected": -2.38879132270813, "logps/chosen": -356.2658386230469, "logps/rejected": -456.2667236328125, "loss": 0.4946, "rewards/accuracies": 0.71875, "rewards/chosen": -1.220351219177246, "rewards/margins": 1.0728812217712402, "rewards/rejected": -2.2932324409484863, "step": 207 }, { "epoch": 0.9828706438275251, "grad_norm": 3.479933152007338, "learning_rate": 2.6822655921895693e-07, "logits/chosen": -2.446387767791748, "logits/rejected": -2.346810817718506, "logps/chosen": -312.9122619628906, "logps/rejected": -459.10040283203125, "loss": 0.5015, "rewards/accuracies": 0.859375, "rewards/chosen": -1.142736792564392, "rewards/margins": 1.2611982822418213, "rewards/rejected": -2.403934955596924, "step": 208 }, { "epoch": 0.9875959834613113, "grad_norm": 3.8539520740205018, "learning_rate": 2.663108617353926e-07, "logits/chosen": -2.7008585929870605, "logits/rejected": -2.7585320472717285, "logps/chosen": -410.00634765625, "logps/rejected": -455.63812255859375, "loss": 0.5103, "rewards/accuracies": 0.609375, "rewards/chosen": -1.2332756519317627, "rewards/margins": 0.693227231502533, "rewards/rejected": -1.9265029430389404, "step": 209 }, { "epoch": 0.9923213230950975, "grad_norm": 3.845322423212567, "learning_rate": 2.6439420191324064e-07, "logits/chosen": -2.462545871734619, "logits/rejected": -2.5116636753082275, "logps/chosen": -305.0135803222656, "logps/rejected": -405.6048583984375, "loss": 0.4911, "rewards/accuracies": 0.6875, "rewards/chosen": -1.231694221496582, "rewards/margins": 0.9429072141647339, "rewards/rejected": -2.1746013164520264, "step": 210 }, { "epoch": 0.9970466627288836, "grad_norm": 5.6022942181300905, "learning_rate": 2.6247669283516556e-07, "logits/chosen": -2.544132947921753, "logits/rejected": -2.5758156776428223, "logps/chosen": -366.85009765625, "logps/rejected": -442.0340881347656, "loss": 0.5449, "rewards/accuracies": 0.671875, "rewards/chosen": -1.4171723127365112, "rewards/margins": 1.0115364789962769, "rewards/rejected": -2.428708791732788, "step": 211 }, { "epoch": 1.0017720023626697, "grad_norm": 3.186023679965871, "learning_rate": 2.60558447633938e-07, "logits/chosen": -2.244544506072998, "logits/rejected": -2.3220551013946533, "logps/chosen": -366.2366638183594, "logps/rejected": -452.18707275390625, "loss": 0.5058, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4039456844329834, "rewards/margins": 1.031445860862732, "rewards/rejected": -2.435391426086426, "step": 212 }, { "epoch": 1.006497341996456, "grad_norm": 3.6450488482030523, "learning_rate": 2.5863957948575963e-07, "logits/chosen": -2.3453848361968994, "logits/rejected": -2.468796968460083, "logps/chosen": -326.9305114746094, "logps/rejected": -368.9020690917969, "loss": 0.4954, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1048694849014282, "rewards/margins": 0.7470109462738037, "rewards/rejected": -1.8518803119659424, "step": 213 }, { "epoch": 1.011222681630242, "grad_norm": 4.137227043648897, "learning_rate": 2.567202016035859e-07, "logits/chosen": -2.5369303226470947, "logits/rejected": -2.535897731781006, "logps/chosen": -310.4713439941406, "logps/rejected": -399.39398193359375, "loss": 0.4987, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2701810598373413, "rewards/margins": 0.7798756957054138, "rewards/rejected": -2.0500564575195312, "step": 214 }, { "epoch": 1.0159480212640284, "grad_norm": 4.02450997349156, "learning_rate": 2.5480042723044653e-07, "logits/chosen": -2.567810535430908, "logits/rejected": -2.575981378555298, "logps/chosen": -350.37957763671875, "logps/rejected": -428.45745849609375, "loss": 0.4668, "rewards/accuracies": 0.671875, "rewards/chosen": -1.4323270320892334, "rewards/margins": 1.0882251262664795, "rewards/rejected": -2.520552396774292, "step": 215 }, { "epoch": 1.0159480212640284, "eval_logits/chosen": -2.584289789199829, "eval_logits/rejected": -2.5910158157348633, "eval_logps/chosen": -361.1751708984375, "eval_logps/rejected": -463.7195129394531, "eval_loss": 0.49447911977767944, "eval_rewards/accuracies": 0.6439393758773804, "eval_rewards/chosen": -1.499444603919983, "eval_rewards/margins": 1.138251781463623, "eval_rewards/rejected": -2.6376962661743164, "eval_runtime": 224.7997, "eval_samples_per_second": 16.263, "eval_steps_per_second": 0.294, "step": 215 }, { "epoch": 1.0206733608978145, "grad_norm": 4.359766282431931, "learning_rate": 2.5288036963276414e-07, "logits/chosen": -2.4916322231292725, "logits/rejected": -2.482870101928711, "logps/chosen": -297.8340759277344, "logps/rejected": -387.14703369140625, "loss": 0.4968, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1933519840240479, "rewards/margins": 1.0262653827667236, "rewards/rejected": -2.2196173667907715, "step": 216 }, { "epoch": 1.0253987005316008, "grad_norm": 3.593941578167617, "learning_rate": 2.509601420936717e-07, "logits/chosen": -2.5306057929992676, "logits/rejected": -2.440415382385254, "logps/chosen": -341.2530212402344, "logps/rejected": -464.18487548828125, "loss": 0.488, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4438025951385498, "rewards/margins": 1.340946078300476, "rewards/rejected": -2.7847485542297363, "step": 217 }, { "epoch": 1.0301240401653868, "grad_norm": 3.664745354346814, "learning_rate": 2.490398579063283e-07, "logits/chosen": -2.6102001667022705, "logits/rejected": -2.529940128326416, "logps/chosen": -345.69677734375, "logps/rejected": -484.93035888671875, "loss": 0.5001, "rewards/accuracies": 0.703125, "rewards/chosen": -1.5673726797103882, "rewards/margins": 1.314273476600647, "rewards/rejected": -2.8816463947296143, "step": 218 }, { "epoch": 1.0348493797991731, "grad_norm": 5.084728530421535, "learning_rate": 2.4711963036723583e-07, "logits/chosen": -2.3622119426727295, "logits/rejected": -2.365530490875244, "logps/chosen": -367.52984619140625, "logps/rejected": -406.59375, "loss": 0.4928, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4226937294006348, "rewards/margins": 0.49759745597839355, "rewards/rejected": -1.9202911853790283, "step": 219 }, { "epoch": 1.0395747194329592, "grad_norm": 3.6814125755312093, "learning_rate": 2.451995727695535e-07, "logits/chosen": -2.4771206378936768, "logits/rejected": -2.5038909912109375, "logps/chosen": -365.91815185546875, "logps/rejected": -477.00347900390625, "loss": 0.4837, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5666453838348389, "rewards/margins": 1.2746491432189941, "rewards/rejected": -2.841294527053833, "step": 220 }, { "epoch": 1.0443000590667455, "grad_norm": 4.348311431123137, "learning_rate": 2.432797983964141e-07, "logits/chosen": -2.514589786529541, "logits/rejected": -2.460106134414673, "logps/chosen": -358.013427734375, "logps/rejected": -444.3507080078125, "loss": 0.4922, "rewards/accuracies": 0.671875, "rewards/chosen": -1.4816346168518066, "rewards/margins": 0.9881500601768494, "rewards/rejected": -2.469784736633301, "step": 221 }, { "epoch": 1.0490253987005316, "grad_norm": 4.474335447495181, "learning_rate": 2.413604205142404e-07, "logits/chosen": -2.2786381244659424, "logits/rejected": -2.371419668197632, "logps/chosen": -377.5779113769531, "logps/rejected": -431.80316162109375, "loss": 0.5267, "rewards/accuracies": 0.59375, "rewards/chosen": -1.8127973079681396, "rewards/margins": 0.9888743162155151, "rewards/rejected": -2.8016717433929443, "step": 222 }, { "epoch": 1.0537507383343179, "grad_norm": 3.9171412475826557, "learning_rate": 2.3944155236606196e-07, "logits/chosen": -2.4430832862854004, "logits/rejected": -2.409916639328003, "logps/chosen": -344.40399169921875, "logps/rejected": -461.02349853515625, "loss": 0.4846, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4063448905944824, "rewards/margins": 1.1203367710113525, "rewards/rejected": -2.526681661605835, "step": 223 }, { "epoch": 1.058476077968104, "grad_norm": 5.279702970403167, "learning_rate": 2.3752330716483444e-07, "logits/chosen": -2.7835280895233154, "logits/rejected": -2.7787797451019287, "logps/chosen": -353.5722351074219, "logps/rejected": -415.2908935546875, "loss": 0.5057, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6296898126602173, "rewards/margins": 0.9214704632759094, "rewards/rejected": -2.5511598587036133, "step": 224 }, { "epoch": 1.0632014176018902, "grad_norm": 3.8586230520693157, "learning_rate": 2.356057980867594e-07, "logits/chosen": -2.547018527984619, "logits/rejected": -2.5479323863983154, "logps/chosen": -411.9010009765625, "logps/rejected": -543.1708984375, "loss": 0.4834, "rewards/accuracies": 0.734375, "rewards/chosen": -1.6013555526733398, "rewards/margins": 1.6427891254425049, "rewards/rejected": -3.244144916534424, "step": 225 }, { "epoch": 1.0679267572356763, "grad_norm": 3.836310963297125, "learning_rate": 2.3368913826460742e-07, "logits/chosen": -2.622213840484619, "logits/rejected": -2.699857473373413, "logps/chosen": -383.7267150878906, "logps/rejected": -406.25152587890625, "loss": 0.5085, "rewards/accuracies": 0.640625, "rewards/chosen": -1.4025287628173828, "rewards/margins": 0.56195068359375, "rewards/rejected": -1.9644795656204224, "step": 226 }, { "epoch": 1.0726520968694624, "grad_norm": 4.999193459222086, "learning_rate": 2.3177344078104305e-07, "logits/chosen": -2.6717772483825684, "logits/rejected": -2.672889232635498, "logps/chosen": -346.08544921875, "logps/rejected": -417.90545654296875, "loss": 0.4906, "rewards/accuracies": 0.640625, "rewards/chosen": -1.6554023027420044, "rewards/margins": 0.8562977313995361, "rewards/rejected": -2.51170015335083, "step": 227 }, { "epoch": 1.0773774365032487, "grad_norm": 4.6410271396992595, "learning_rate": 2.2985881866195304e-07, "logits/chosen": -2.646639347076416, "logits/rejected": -2.7165169715881348, "logps/chosen": -355.2235107421875, "logps/rejected": -395.57550048828125, "loss": 0.5089, "rewards/accuracies": 0.640625, "rewards/chosen": -1.3660098314285278, "rewards/margins": 0.7286862134933472, "rewards/rejected": -2.094696044921875, "step": 228 }, { "epoch": 1.0821027761370348, "grad_norm": 3.9004711748243412, "learning_rate": 2.2794538486977765e-07, "logits/chosen": -2.573826313018799, "logits/rejected": -2.6808300018310547, "logps/chosen": -355.0180969238281, "logps/rejected": -473.90252685546875, "loss": 0.4922, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4073047637939453, "rewards/margins": 1.1750006675720215, "rewards/rejected": -2.582305431365967, "step": 229 }, { "epoch": 1.086828115770821, "grad_norm": 3.6316010874552704, "learning_rate": 2.2603325229684628e-07, "logits/chosen": -2.4409735202789307, "logits/rejected": -2.4068901538848877, "logps/chosen": -306.1089172363281, "logps/rejected": -447.4771423339844, "loss": 0.4552, "rewards/accuracies": 0.609375, "rewards/chosen": -1.4156140089035034, "rewards/margins": 1.0425467491149902, "rewards/rejected": -2.458160877227783, "step": 230 }, { "epoch": 1.0915534554046071, "grad_norm": 4.420363258356965, "learning_rate": 2.2412253375871618e-07, "logits/chosen": -2.681562662124634, "logits/rejected": -2.5477468967437744, "logps/chosen": -338.50775146484375, "logps/rejected": -500.01568603515625, "loss": 0.4909, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4533276557922363, "rewards/margins": 1.5830377340316772, "rewards/rejected": -3.036365270614624, "step": 231 }, { "epoch": 1.0962787950383934, "grad_norm": 3.3398932376367116, "learning_rate": 2.2221334198751717e-07, "logits/chosen": -2.566534996032715, "logits/rejected": -2.663203716278076, "logps/chosen": -423.2144470214844, "logps/rejected": -525.0465087890625, "loss": 0.4902, "rewards/accuracies": 0.6875, "rewards/chosen": -1.552110195159912, "rewards/margins": 1.6297154426574707, "rewards/rejected": -3.181825876235962, "step": 232 }, { "epoch": 1.1010041346721795, "grad_norm": 3.258958216613353, "learning_rate": 2.2030578962529964e-07, "logits/chosen": -2.533161163330078, "logits/rejected": -2.598869800567627, "logps/chosen": -384.85443115234375, "logps/rejected": -436.78302001953125, "loss": 0.4943, "rewards/accuracies": 0.703125, "rewards/chosen": -1.1795471906661987, "rewards/margins": 0.7562814354896545, "rewards/rejected": -1.935828685760498, "step": 233 }, { "epoch": 1.1057294743059658, "grad_norm": 4.3977570998277695, "learning_rate": 2.1839998921738948e-07, "logits/chosen": -2.682744026184082, "logits/rejected": -2.8130013942718506, "logps/chosen": -375.19012451171875, "logps/rejected": -446.29486083984375, "loss": 0.4962, "rewards/accuracies": 0.71875, "rewards/chosen": -1.338588833808899, "rewards/margins": 0.9564355611801147, "rewards/rejected": -2.2950241565704346, "step": 234 }, { "epoch": 1.1104548139397519, "grad_norm": 3.711007822076339, "learning_rate": 2.1649605320574715e-07, "logits/chosen": -2.770697832107544, "logits/rejected": -2.7958450317382812, "logps/chosen": -263.7291259765625, "logps/rejected": -410.407470703125, "loss": 0.4984, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2530757188796997, "rewards/margins": 1.3193039894104004, "rewards/rejected": -2.5723795890808105, "step": 235 }, { "epoch": 1.1151801535735382, "grad_norm": 5.205159446883962, "learning_rate": 2.1459409392233414e-07, "logits/chosen": -2.509124279022217, "logits/rejected": -2.4804513454437256, "logps/chosen": -431.5054931640625, "logps/rejected": -569.149169921875, "loss": 0.4849, "rewards/accuracies": 0.75, "rewards/chosen": -1.6995575428009033, "rewards/margins": 1.6153841018676758, "rewards/rejected": -3.3149421215057373, "step": 236 }, { "epoch": 1.1199054932073242, "grad_norm": 3.6720237761578276, "learning_rate": 2.1269422358248534e-07, "logits/chosen": -2.2790334224700928, "logits/rejected": -2.5470128059387207, "logps/chosen": -368.87750244140625, "logps/rejected": -383.09478759765625, "loss": 0.4829, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3113298416137695, "rewards/margins": 0.9517258405685425, "rewards/rejected": -2.2630558013916016, "step": 237 }, { "epoch": 1.1246308328411105, "grad_norm": 4.38203569485804, "learning_rate": 2.1079655427828804e-07, "logits/chosen": -2.484546184539795, "logits/rejected": -2.513195753097534, "logps/chosen": -329.1628723144531, "logps/rejected": -448.17327880859375, "loss": 0.4783, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3690838813781738, "rewards/margins": 1.233577847480774, "rewards/rejected": -2.602661609649658, "step": 238 }, { "epoch": 1.1293561724748966, "grad_norm": 4.1915678394143665, "learning_rate": 2.0890119797196904e-07, "logits/chosen": -2.489327907562256, "logits/rejected": -2.619502544403076, "logps/chosen": -367.636474609375, "logps/rejected": -442.066650390625, "loss": 0.4619, "rewards/accuracies": 0.625, "rewards/chosen": -1.260221004486084, "rewards/margins": 1.034293293952942, "rewards/rejected": -2.2945144176483154, "step": 239 }, { "epoch": 1.1340815121086827, "grad_norm": 4.912500043289106, "learning_rate": 2.0700826648928827e-07, "logits/chosen": -2.445549964904785, "logits/rejected": -2.4925429821014404, "logps/chosen": -403.63580322265625, "logps/rejected": -492.6475830078125, "loss": 0.4837, "rewards/accuracies": 0.75, "rewards/chosen": -1.4245916604995728, "rewards/margins": 1.291830062866211, "rewards/rejected": -2.7164220809936523, "step": 240 }, { "epoch": 1.138806851742469, "grad_norm": 3.973403985509937, "learning_rate": 2.0511787151294153e-07, "logits/chosen": -2.6269099712371826, "logits/rejected": -2.6633949279785156, "logps/chosen": -362.7601318359375, "logps/rejected": -465.685791015625, "loss": 0.4772, "rewards/accuracies": 0.734375, "rewards/chosen": -1.3287506103515625, "rewards/margins": 1.2157796621322632, "rewards/rejected": -2.544530153274536, "step": 241 }, { "epoch": 1.1435321913762553, "grad_norm": 3.706128513697955, "learning_rate": 2.0323012457597113e-07, "logits/chosen": -2.6716468334198, "logits/rejected": -2.5537989139556885, "logps/chosen": -313.87188720703125, "logps/rejected": -456.5760498046875, "loss": 0.4663, "rewards/accuracies": 0.765625, "rewards/chosen": -1.3276413679122925, "rewards/margins": 1.286893367767334, "rewards/rejected": -2.614534854888916, "step": 242 }, { "epoch": 1.1482575310100414, "grad_norm": 4.436897843537744, "learning_rate": 2.0134513705518544e-07, "logits/chosen": -2.442168951034546, "logits/rejected": -2.475062608718872, "logps/chosen": -349.3672180175781, "logps/rejected": -437.71075439453125, "loss": 0.4738, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4281671047210693, "rewards/margins": 1.169339656829834, "rewards/rejected": -2.5975069999694824, "step": 243 }, { "epoch": 1.1529828706438274, "grad_norm": 3.739187250501708, "learning_rate": 1.9946302016458754e-07, "logits/chosen": -2.5069191455841064, "logits/rejected": -2.4427642822265625, "logps/chosen": -378.0785827636719, "logps/rejected": -515.2130126953125, "loss": 0.4537, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5397615432739258, "rewards/margins": 1.4216468334197998, "rewards/rejected": -2.9614078998565674, "step": 244 }, { "epoch": 1.1577082102776137, "grad_norm": 4.618667609463329, "learning_rate": 1.975838849488139e-07, "logits/chosen": -2.6840784549713135, "logits/rejected": -2.608031988143921, "logps/chosen": -353.92388916015625, "logps/rejected": -443.1690368652344, "loss": 0.4789, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5656286478042603, "rewards/margins": 0.7766789793968201, "rewards/rejected": -2.3423075675964355, "step": 245 }, { "epoch": 1.1624335499113998, "grad_norm": 3.85370799421685, "learning_rate": 1.957078422765823e-07, "logits/chosen": -2.512361764907837, "logits/rejected": -2.664332389831543, "logps/chosen": -423.3899230957031, "logps/rejected": -493.901123046875, "loss": 0.508, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5689921379089355, "rewards/margins": 1.2216757535934448, "rewards/rejected": -2.790667772293091, "step": 246 }, { "epoch": 1.167158889545186, "grad_norm": 4.071196958136995, "learning_rate": 1.9383500283415127e-07, "logits/chosen": -2.7708868980407715, "logits/rejected": -2.928670644760132, "logps/chosen": -413.68780517578125, "logps/rejected": -419.13287353515625, "loss": 0.4674, "rewards/accuracies": 0.59375, "rewards/chosen": -1.6208066940307617, "rewards/margins": 0.8534724116325378, "rewards/rejected": -2.4742789268493652, "step": 247 }, { "epoch": 1.1718842291789722, "grad_norm": 4.6057615724394445, "learning_rate": 1.9196547711878882e-07, "logits/chosen": -2.7272331714630127, "logits/rejected": -2.7651190757751465, "logps/chosen": -391.68218994140625, "logps/rejected": -536.7830810546875, "loss": 0.4898, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3025808334350586, "rewards/margins": 1.699794888496399, "rewards/rejected": -3.002375602722168, "step": 248 }, { "epoch": 1.1766095688127585, "grad_norm": 4.356687184481579, "learning_rate": 1.9009937543225393e-07, "logits/chosen": -2.6314167976379395, "logits/rejected": -2.5595688819885254, "logps/chosen": -311.2470703125, "logps/rejected": -458.4511413574219, "loss": 0.4912, "rewards/accuracies": 0.609375, "rewards/chosen": -1.5766427516937256, "rewards/margins": 1.1243988275527954, "rewards/rejected": -2.7010414600372314, "step": 249 }, { "epoch": 1.1813349084465445, "grad_norm": 3.8646816254377168, "learning_rate": 1.8823680787428804e-07, "logits/chosen": -2.628770589828491, "logits/rejected": -2.6632299423217773, "logps/chosen": -362.2929382324219, "logps/rejected": -451.97686767578125, "loss": 0.5168, "rewards/accuracies": 0.6875, "rewards/chosen": -1.444336175918579, "rewards/margins": 1.026198387145996, "rewards/rejected": -2.470534324645996, "step": 250 }, { "epoch": 1.1860602480803308, "grad_norm": 4.972160856261013, "learning_rate": 1.8637788433611946e-07, "logits/chosen": -2.6531898975372314, "logits/rejected": -2.6115176677703857, "logps/chosen": -374.62554931640625, "logps/rejected": -556.6736450195312, "loss": 0.4812, "rewards/accuracies": 0.75, "rewards/chosen": -1.495612382888794, "rewards/margins": 1.505082368850708, "rewards/rejected": -3.000694751739502, "step": 251 }, { "epoch": 1.190785587714117, "grad_norm": 5.493325018181506, "learning_rate": 1.8452271449398015e-07, "logits/chosen": -2.6560559272766113, "logits/rejected": -2.7018203735351562, "logps/chosen": -352.3895568847656, "logps/rejected": -441.25994873046875, "loss": 0.4814, "rewards/accuracies": 0.75, "rewards/chosen": -1.6333725452423096, "rewards/margins": 1.1176269054412842, "rewards/rejected": -2.7509994506835938, "step": 252 }, { "epoch": 1.1955109273479032, "grad_norm": 4.859694388307905, "learning_rate": 1.8267140780263424e-07, "logits/chosen": -2.634824514389038, "logits/rejected": -2.638603448867798, "logps/chosen": -336.9228515625, "logps/rejected": -454.23883056640625, "loss": 0.4996, "rewards/accuracies": 0.625, "rewards/chosen": -1.460708498954773, "rewards/margins": 1.1815128326416016, "rewards/rejected": -2.642221450805664, "step": 253 }, { "epoch": 1.2002362669816893, "grad_norm": 4.23344077907717, "learning_rate": 1.8082407348892076e-07, "logits/chosen": -2.581425666809082, "logits/rejected": -2.4411489963531494, "logps/chosen": -360.4097595214844, "logps/rejected": -511.65484619140625, "loss": 0.4754, "rewards/accuracies": 0.734375, "rewards/chosen": -1.5460442304611206, "rewards/margins": 1.2290416955947876, "rewards/rejected": -2.775085687637329, "step": 254 }, { "epoch": 1.2049616066154756, "grad_norm": 5.021641074779002, "learning_rate": 1.7898082054530868e-07, "logits/chosen": -2.5814576148986816, "logits/rejected": -2.562331199645996, "logps/chosen": -391.0801696777344, "logps/rejected": -486.2853698730469, "loss": 0.4886, "rewards/accuracies": 0.703125, "rewards/chosen": -1.5095934867858887, "rewards/margins": 0.9785588979721069, "rewards/rejected": -2.488152503967285, "step": 255 }, { "epoch": 1.2096869462492617, "grad_norm": 4.032966746831722, "learning_rate": 1.7714175772346683e-07, "logits/chosen": -2.73325514793396, "logits/rejected": -2.8232052326202393, "logps/chosen": -361.27423095703125, "logps/rejected": -461.78125, "loss": 0.5027, "rewards/accuracies": 0.734375, "rewards/chosen": -1.4946091175079346, "rewards/margins": 1.1646876335144043, "rewards/rejected": -2.659296751022339, "step": 256 }, { "epoch": 1.2144122858830477, "grad_norm": 4.348859426367674, "learning_rate": 1.753069935278477e-07, "logits/chosen": -2.5508053302764893, "logits/rejected": -2.6324799060821533, "logps/chosen": -349.6580810546875, "logps/rejected": -436.2727966308594, "loss": 0.4904, "rewards/accuracies": 0.671875, "rewards/chosen": -1.5449261665344238, "rewards/margins": 1.1617025136947632, "rewards/rejected": -2.7066287994384766, "step": 257 }, { "epoch": 1.219137625516834, "grad_norm": 3.7399337348111246, "learning_rate": 1.7347663620928494e-07, "logits/chosen": -2.6396665573120117, "logits/rejected": -2.686690330505371, "logps/chosen": -373.62762451171875, "logps/rejected": -456.91876220703125, "loss": 0.4607, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5388782024383545, "rewards/margins": 1.0700188875198364, "rewards/rejected": -2.6088972091674805, "step": 258 }, { "epoch": 1.219137625516834, "eval_logits/chosen": -2.7951161861419678, "eval_logits/rejected": -2.8025708198547363, "eval_logps/chosen": -369.3280029296875, "eval_logps/rejected": -488.8177490234375, "eval_loss": 0.48161956667900085, "eval_rewards/accuracies": 0.6401515007019043, "eval_rewards/chosen": -1.5809730291366577, "eval_rewards/margins": 1.307705044746399, "eval_rewards/rejected": -2.8886778354644775, "eval_runtime": 225.1752, "eval_samples_per_second": 16.236, "eval_steps_per_second": 0.293, "step": 258 }, { "epoch": 1.2238629651506203, "grad_norm": 4.565788447191084, "learning_rate": 1.7165079375860752e-07, "logits/chosen": -2.7769393920898438, "logits/rejected": -2.7542011737823486, "logps/chosen": -327.9093322753906, "logps/rejected": -441.0569763183594, "loss": 0.5122, "rewards/accuracies": 0.765625, "rewards/chosen": -1.4275953769683838, "rewards/margins": 1.174314022064209, "rewards/rejected": -2.6019093990325928, "step": 259 }, { "epoch": 1.2285883047844064, "grad_norm": 3.8161499063717206, "learning_rate": 1.6982957390026748e-07, "logits/chosen": -2.5881879329681396, "logits/rejected": -2.5274972915649414, "logps/chosen": -380.05670166015625, "logps/rejected": -548.7481079101562, "loss": 0.4618, "rewards/accuracies": 0.640625, "rewards/chosen": -1.7176233530044556, "rewards/margins": 1.5351811647415161, "rewards/rejected": -3.2528045177459717, "step": 260 }, { "epoch": 1.2333136444181925, "grad_norm": 4.44964808900954, "learning_rate": 1.680130840859848e-07, "logits/chosen": -2.6286840438842773, "logits/rejected": -2.606605291366577, "logps/chosen": -325.35833740234375, "logps/rejected": -435.7322082519531, "loss": 0.4912, "rewards/accuracies": 0.734375, "rewards/chosen": -1.2944529056549072, "rewards/margins": 1.2223491668701172, "rewards/rejected": -2.5168020725250244, "step": 261 }, { "epoch": 1.2380389840519788, "grad_norm": 4.264258297752704, "learning_rate": 1.662014314884074e-07, "logits/chosen": -2.6938886642456055, "logits/rejected": -2.6410956382751465, "logps/chosen": -341.89141845703125, "logps/rejected": -533.8803100585938, "loss": 0.4583, "rewards/accuracies": 0.671875, "rewards/chosen": -1.6077433824539185, "rewards/margins": 1.743915319442749, "rewards/rejected": -3.351658821105957, "step": 262 }, { "epoch": 1.2427643236857648, "grad_norm": 3.8551209282619374, "learning_rate": 1.64394722994788e-07, "logits/chosen": -2.644559383392334, "logits/rejected": -2.672788381576538, "logps/chosen": -332.5156555175781, "logps/rejected": -419.4230041503906, "loss": 0.4707, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4292283058166504, "rewards/margins": 0.7848268151283264, "rewards/rejected": -2.214055299758911, "step": 263 }, { "epoch": 1.2474896633195511, "grad_norm": 5.862600550730873, "learning_rate": 1.625930652006782e-07, "logits/chosen": -2.806763172149658, "logits/rejected": -2.818608283996582, "logps/chosen": -332.0267639160156, "logps/rejected": -421.3615417480469, "loss": 0.5, "rewards/accuracies": 0.671875, "rewards/chosen": -1.3579407930374146, "rewards/margins": 1.1036723852157593, "rewards/rejected": -2.461613178253174, "step": 264 }, { "epoch": 1.2522150029533372, "grad_norm": 4.952960168127134, "learning_rate": 1.607965644036386e-07, "logits/chosen": -2.397037982940674, "logits/rejected": -2.370586633682251, "logps/chosen": -380.63433837890625, "logps/rejected": -567.65478515625, "loss": 0.4619, "rewards/accuracies": 0.796875, "rewards/chosen": -1.3215548992156982, "rewards/margins": 1.9972357749938965, "rewards/rejected": -3.318790912628174, "step": 265 }, { "epoch": 1.2569403425871235, "grad_norm": 4.451231917032084, "learning_rate": 1.5900532659696786e-07, "logits/chosen": -2.5172245502471924, "logits/rejected": -2.549943208694458, "logps/chosen": -310.1299743652344, "logps/rejected": -405.5203857421875, "loss": 0.493, "rewards/accuracies": 0.703125, "rewards/chosen": -1.271753191947937, "rewards/margins": 0.9632126688957214, "rewards/rejected": -2.2349658012390137, "step": 266 }, { "epoch": 1.2616656822209096, "grad_norm": 3.9305163776837313, "learning_rate": 1.5721945746344914e-07, "logits/chosen": -2.5553438663482666, "logits/rejected": -2.6015634536743164, "logps/chosen": -336.09326171875, "logps/rejected": -444.8936462402344, "loss": 0.4871, "rewards/accuracies": 0.78125, "rewards/chosen": -1.417650580406189, "rewards/margins": 1.3679817914962769, "rewards/rejected": -2.785632371902466, "step": 267 }, { "epoch": 1.2663910218546959, "grad_norm": 3.688073971085107, "learning_rate": 1.5543906236911423e-07, "logits/chosen": -2.798358917236328, "logits/rejected": -2.803248167037964, "logps/chosen": -307.88250732421875, "logps/rejected": -455.02081298828125, "loss": 0.4952, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4341715574264526, "rewards/margins": 1.0945826768875122, "rewards/rejected": -2.528754234313965, "step": 268 }, { "epoch": 1.271116361488482, "grad_norm": 4.072630703078056, "learning_rate": 1.5366424635702773e-07, "logits/chosen": -2.4071907997131348, "logits/rejected": -2.469078540802002, "logps/chosen": -344.1185302734375, "logps/rejected": -481.708740234375, "loss": 0.4607, "rewards/accuracies": 0.703125, "rewards/chosen": -1.4194681644439697, "rewards/margins": 1.5102123022079468, "rewards/rejected": -2.929680109024048, "step": 269 }, { "epoch": 1.2758417011222682, "grad_norm": 4.509575176085059, "learning_rate": 1.5189511414108902e-07, "logits/chosen": -2.6352696418762207, "logits/rejected": -2.5769996643066406, "logps/chosen": -299.04486083984375, "logps/rejected": -428.2113952636719, "loss": 0.4548, "rewards/accuracies": 0.734375, "rewards/chosen": -1.261251449584961, "rewards/margins": 1.3529876470565796, "rewards/rejected": -2.61423921585083, "step": 270 }, { "epoch": 1.2805670407560543, "grad_norm": 4.337503774611465, "learning_rate": 1.5013177009985412e-07, "logits/chosen": -2.492708683013916, "logits/rejected": -2.5158631801605225, "logps/chosen": -369.5946960449219, "logps/rejected": -539.3471069335938, "loss": 0.4839, "rewards/accuracies": 0.765625, "rewards/chosen": -1.4575221538543701, "rewards/margins": 1.7974143028259277, "rewards/rejected": -3.2549362182617188, "step": 271 }, { "epoch": 1.2852923803898406, "grad_norm": 3.7985527417770797, "learning_rate": 1.4837431827037786e-07, "logits/chosen": -2.588874101638794, "logits/rejected": -2.6791810989379883, "logps/chosen": -399.30633544921875, "logps/rejected": -391.2076416015625, "loss": 0.4848, "rewards/accuracies": 0.671875, "rewards/chosen": -1.1733232736587524, "rewards/margins": 0.8876461982727051, "rewards/rejected": -2.060969591140747, "step": 272 }, { "epoch": 1.2900177200236267, "grad_norm": 4.177927201443006, "learning_rate": 1.466228623420751e-07, "logits/chosen": -2.498131036758423, "logits/rejected": -2.5749406814575195, "logps/chosen": -323.9309997558594, "logps/rejected": -450.7756042480469, "loss": 0.4591, "rewards/accuracies": 0.75, "rewards/chosen": -1.2552162408828735, "rewards/margins": 1.4370646476745605, "rewards/rejected": -2.6922807693481445, "step": 273 }, { "epoch": 1.2947430596574128, "grad_norm": 3.6814005701875523, "learning_rate": 1.448775056506036e-07, "logits/chosen": -2.469701051712036, "logits/rejected": -2.5528130531311035, "logps/chosen": -382.1181335449219, "logps/rejected": -487.5738525390625, "loss": 0.4747, "rewards/accuracies": 0.765625, "rewards/chosen": -1.516263723373413, "rewards/margins": 1.4078060388565063, "rewards/rejected": -2.92406964302063, "step": 274 }, { "epoch": 1.299468399291199, "grad_norm": 3.7200492832656766, "learning_rate": 1.4313835117176692e-07, "logits/chosen": -2.9147932529449463, "logits/rejected": -2.988351821899414, "logps/chosen": -376.6600341796875, "logps/rejected": -456.061279296875, "loss": 0.4718, "rewards/accuracies": 0.734375, "rewards/chosen": -1.3018884658813477, "rewards/margins": 1.2087831497192383, "rewards/rejected": -2.510671615600586, "step": 275 }, { "epoch": 1.3041937389249854, "grad_norm": 4.704523030577493, "learning_rate": 1.4140550151543872e-07, "logits/chosen": -2.5561208724975586, "logits/rejected": -2.669656276702881, "logps/chosen": -391.99273681640625, "logps/rejected": -470.0566711425781, "loss": 0.4779, "rewards/accuracies": 0.765625, "rewards/chosen": -1.384698510169983, "rewards/margins": 1.2832698822021484, "rewards/rejected": -2.667968511581421, "step": 276 }, { "epoch": 1.3089190785587714, "grad_norm": 3.890211089107488, "learning_rate": 1.3967905891950936e-07, "logits/chosen": -2.525979995727539, "logits/rejected": -2.5187900066375732, "logps/chosen": -329.0636291503906, "logps/rejected": -518.5650024414062, "loss": 0.4978, "rewards/accuracies": 0.765625, "rewards/chosen": -1.3041456937789917, "rewards/margins": 1.7594897747039795, "rewards/rejected": -3.0636353492736816, "step": 277 }, { "epoch": 1.3136444181925575, "grad_norm": 3.900954884610721, "learning_rate": 1.3795912524385322e-07, "logits/chosen": -2.6802122592926025, "logits/rejected": -2.7386527061462402, "logps/chosen": -394.46905517578125, "logps/rejected": -530.927978515625, "loss": 0.4819, "rewards/accuracies": 0.75, "rewards/chosen": -1.5945687294006348, "rewards/margins": 1.4686897993087769, "rewards/rejected": -3.063258647918701, "step": 278 }, { "epoch": 1.3183697578263438, "grad_norm": 5.012829563655198, "learning_rate": 1.3624580196431952e-07, "logits/chosen": -2.735568046569824, "logits/rejected": -2.7510178089141846, "logps/chosen": -375.1429748535156, "logps/rejected": -484.94390869140625, "loss": 0.4769, "rewards/accuracies": 0.625, "rewards/chosen": -1.396562099456787, "rewards/margins": 1.301504135131836, "rewards/rejected": -2.698065996170044, "step": 279 }, { "epoch": 1.3230950974601299, "grad_norm": 6.486088065803327, "learning_rate": 1.3453919016674483e-07, "logits/chosen": -2.5972790718078613, "logits/rejected": -2.6828713417053223, "logps/chosen": -317.81280517578125, "logps/rejected": -378.3294982910156, "loss": 0.4867, "rewards/accuracies": 0.734375, "rewards/chosen": -1.1421467065811157, "rewards/margins": 1.074657917022705, "rewards/rejected": -2.2168045043945312, "step": 280 }, { "epoch": 1.3278204370939162, "grad_norm": 4.08450025143174, "learning_rate": 1.328393905409892e-07, "logits/chosen": -2.4976108074188232, "logits/rejected": -2.5105791091918945, "logps/chosen": -394.51141357421875, "logps/rejected": -482.7172546386719, "loss": 0.4878, "rewards/accuracies": 0.78125, "rewards/chosen": -1.521199107170105, "rewards/margins": 1.0865689516067505, "rewards/rejected": -2.6077680587768555, "step": 281 }, { "epoch": 1.3325457767277022, "grad_norm": 4.246089354170224, "learning_rate": 1.3114650337499578e-07, "logits/chosen": -2.629361152648926, "logits/rejected": -2.595665693283081, "logps/chosen": -334.8639831542969, "logps/rejected": -423.8031921386719, "loss": 0.4994, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5956919193267822, "rewards/margins": 0.8941479921340942, "rewards/rejected": -2.489840030670166, "step": 282 }, { "epoch": 1.3372711163614885, "grad_norm": 3.6260185771038103, "learning_rate": 1.2946062854887314e-07, "logits/chosen": -2.430432081222534, "logits/rejected": -2.3944997787475586, "logps/chosen": -371.2943115234375, "logps/rejected": -522.4259033203125, "loss": 0.4836, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5247899293899536, "rewards/margins": 1.4179781675338745, "rewards/rejected": -2.942767858505249, "step": 283 }, { "epoch": 1.3419964559952746, "grad_norm": 4.374381091295728, "learning_rate": 1.2778186552900316e-07, "logits/chosen": -2.7117838859558105, "logits/rejected": -2.761711597442627, "logps/chosen": -400.6455078125, "logps/rejected": -503.24371337890625, "loss": 0.484, "rewards/accuracies": 0.78125, "rewards/chosen": -1.717864990234375, "rewards/margins": 1.5052025318145752, "rewards/rejected": -3.2230677604675293, "step": 284 }, { "epoch": 1.346721795629061, "grad_norm": 3.8541867456367847, "learning_rate": 1.261103133621718e-07, "logits/chosen": -2.594248056411743, "logits/rejected": -2.603362798690796, "logps/chosen": -360.4008483886719, "logps/rejected": -533.9642944335938, "loss": 0.4763, "rewards/accuracies": 0.546875, "rewards/chosen": -1.626147747039795, "rewards/margins": 1.3955036401748657, "rewards/rejected": -3.021651268005371, "step": 285 }, { "epoch": 1.351447135262847, "grad_norm": 3.8033607627593815, "learning_rate": 1.2444607066972583e-07, "logits/chosen": -2.385476589202881, "logits/rejected": -2.4700021743774414, "logps/chosen": -379.0484924316406, "logps/rejected": -446.36932373046875, "loss": 0.4707, "rewards/accuracies": 0.734375, "rewards/chosen": -1.5443971157073975, "rewards/margins": 1.0206142663955688, "rewards/rejected": -2.5650112628936768, "step": 286 }, { "epoch": 1.356172474896633, "grad_norm": 3.9322741636171017, "learning_rate": 1.227892356417542e-07, "logits/chosen": -2.8771088123321533, "logits/rejected": -2.838731050491333, "logps/chosen": -366.8982849121094, "logps/rejected": -519.6290893554688, "loss": 0.4657, "rewards/accuracies": 0.734375, "rewards/chosen": -1.7020255327224731, "rewards/margins": 1.5927929878234863, "rewards/rejected": -3.29481840133667, "step": 287 }, { "epoch": 1.3608978145304194, "grad_norm": 4.441774997254426, "learning_rate": 1.211399060312943e-07, "logits/chosen": -2.6161060333251953, "logits/rejected": -2.6837587356567383, "logps/chosen": -333.47747802734375, "logps/rejected": -396.8973388671875, "loss": 0.4805, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4345372915267944, "rewards/margins": 0.6499552726745605, "rewards/rejected": -2.0844926834106445, "step": 288 }, { "epoch": 1.3656231541642057, "grad_norm": 4.102742239919904, "learning_rate": 1.1949817914856539e-07, "logits/chosen": -2.6814827919006348, "logits/rejected": -2.6281232833862305, "logps/chosen": -309.97998046875, "logps/rejected": -483.1641540527344, "loss": 0.4876, "rewards/accuracies": 0.703125, "rewards/chosen": -1.347307801246643, "rewards/margins": 1.4000307321548462, "rewards/rejected": -2.7473385334014893, "step": 289 }, { "epoch": 1.3703484937979917, "grad_norm": 3.510044851118614, "learning_rate": 1.1786415185522644e-07, "logits/chosen": -2.4141433238983154, "logits/rejected": -2.440483331680298, "logps/chosen": -371.58050537109375, "logps/rejected": -485.5816345214844, "loss": 0.462, "rewards/accuracies": 0.671875, "rewards/chosen": -1.6337575912475586, "rewards/margins": 1.3851597309112549, "rewards/rejected": -3.0189173221588135, "step": 290 }, { "epoch": 1.3750738334317778, "grad_norm": 4.439828210645555, "learning_rate": 1.1623792055866182e-07, "logits/chosen": -2.9460198879241943, "logits/rejected": -2.792397975921631, "logps/chosen": -306.8702392578125, "logps/rejected": -499.4562072753906, "loss": 0.4817, "rewards/accuracies": 0.765625, "rewards/chosen": -1.5415101051330566, "rewards/margins": 1.6944992542266846, "rewards/rejected": -3.2360095977783203, "step": 291 }, { "epoch": 1.379799173065564, "grad_norm": 3.808563735975941, "learning_rate": 1.1461958120629345e-07, "logits/chosen": -2.601799488067627, "logits/rejected": -2.6055119037628174, "logps/chosen": -349.54119873046875, "logps/rejected": -453.0423278808594, "loss": 0.4919, "rewards/accuracies": 0.75, "rewards/chosen": -1.38273286819458, "rewards/margins": 1.1617132425308228, "rewards/rejected": -2.5444459915161133, "step": 292 }, { "epoch": 1.3845245126993504, "grad_norm": 3.614960145049863, "learning_rate": 1.1300922927991912e-07, "logits/chosen": -2.3846492767333984, "logits/rejected": -2.35745906829834, "logps/chosen": -377.6761169433594, "logps/rejected": -496.96405029296875, "loss": 0.4831, "rewards/accuracies": 0.703125, "rewards/chosen": -1.5498136281967163, "rewards/margins": 1.1908408403396606, "rewards/rejected": -2.740654468536377, "step": 293 }, { "epoch": 1.3892498523331365, "grad_norm": 5.27301677074656, "learning_rate": 1.1140695979008017e-07, "logits/chosen": -2.359983205795288, "logits/rejected": -2.346726894378662, "logps/chosen": -323.6959533691406, "logps/rejected": -452.9242248535156, "loss": 0.4725, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4340472221374512, "rewards/margins": 1.28817880153656, "rewards/rejected": -2.722226142883301, "step": 294 }, { "epoch": 1.3939751919669225, "grad_norm": 3.7242744476436704, "learning_rate": 1.0981286727045483e-07, "logits/chosen": -2.4720327854156494, "logits/rejected": -2.353053331375122, "logps/chosen": -352.6996765136719, "logps/rejected": -499.5018615722656, "loss": 0.476, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4100433588027954, "rewards/margins": 1.4266828298568726, "rewards/rejected": -2.836726188659668, "step": 295 }, { "epoch": 1.3987005316007088, "grad_norm": 4.955539561562781, "learning_rate": 1.0822704577228131e-07, "logits/chosen": -2.642940044403076, "logits/rejected": -2.639770746231079, "logps/chosen": -338.60595703125, "logps/rejected": -491.56805419921875, "loss": 0.4635, "rewards/accuracies": 0.765625, "rewards/chosen": -1.4772026538848877, "rewards/margins": 1.517985224723816, "rewards/rejected": -2.995187759399414, "step": 296 }, { "epoch": 1.403425871234495, "grad_norm": 4.549161233388332, "learning_rate": 1.0664958885880901e-07, "logits/chosen": -2.5309808254241943, "logits/rejected": -2.6360573768615723, "logps/chosen": -335.5814514160156, "logps/rejected": -437.315673828125, "loss": 0.4888, "rewards/accuracies": 0.640625, "rewards/chosen": -1.466670036315918, "rewards/margins": 1.2143834829330444, "rewards/rejected": -2.681053638458252, "step": 297 }, { "epoch": 1.4081512108682812, "grad_norm": 3.7991551857385533, "learning_rate": 1.0508058959977756e-07, "logits/chosen": -2.673210382461548, "logits/rejected": -2.5948569774627686, "logps/chosen": -333.4312744140625, "logps/rejected": -518.317626953125, "loss": 0.47, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2860051393508911, "rewards/margins": 1.857041358947754, "rewards/rejected": -3.1430463790893555, "step": 298 }, { "epoch": 1.4128765505020673, "grad_norm": 4.382508044742225, "learning_rate": 1.0352014056592653e-07, "logits/chosen": -2.746319055557251, "logits/rejected": -2.8318300247192383, "logps/chosen": -361.51220703125, "logps/rejected": -441.3756103515625, "loss": 0.492, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3820509910583496, "rewards/margins": 1.1350202560424805, "rewards/rejected": -2.517071485519409, "step": 299 }, { "epoch": 1.4176018901358536, "grad_norm": 4.674467616661016, "learning_rate": 1.0196833382353303e-07, "logits/chosen": -2.731412887573242, "logits/rejected": -2.6908507347106934, "logps/chosen": -322.404052734375, "logps/rejected": -462.2546081542969, "loss": 0.4942, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4098780155181885, "rewards/margins": 1.386682391166687, "rewards/rejected": -2.796560287475586, "step": 300 }, { "epoch": 1.4223272297696397, "grad_norm": 4.067329027842824, "learning_rate": 1.0042526092898049e-07, "logits/chosen": -2.8876852989196777, "logits/rejected": -2.7597179412841797, "logps/chosen": -328.04437255859375, "logps/rejected": -431.107666015625, "loss": 0.5068, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4123650789260864, "rewards/margins": 1.0988068580627441, "rewards/rejected": -2.511171817779541, "step": 301 }, { "epoch": 1.4223272297696397, "eval_logits/chosen": -2.751302719116211, "eval_logits/rejected": -2.7585830688476562, "eval_logps/chosen": -369.27899169921875, "eval_logps/rejected": -500.55902099609375, "eval_loss": 0.47641003131866455, "eval_rewards/accuracies": 0.6401515007019043, "eval_rewards/chosen": -1.5804827213287354, "eval_rewards/margins": 1.4256082773208618, "eval_rewards/rejected": -3.0060908794403076, "eval_runtime": 225.4536, "eval_samples_per_second": 16.216, "eval_steps_per_second": 0.293, "step": 301 }, { "epoch": 1.427052569403426, "grad_norm": 4.636151437447421, "learning_rate": 9.889101292335625e-08, "logits/chosen": -2.6484196186065674, "logits/rejected": -2.6770687103271484, "logps/chosen": -402.7945251464844, "logps/rejected": -453.43646240234375, "loss": 0.4782, "rewards/accuracies": 0.796875, "rewards/chosen": -1.1831938028335571, "rewards/margins": 1.1924808025360107, "rewards/rejected": -2.3756744861602783, "step": 302 }, { "epoch": 1.431777909037212, "grad_norm": 4.004369409074385, "learning_rate": 9.736568032708068e-08, "logits/chosen": -2.5602633953094482, "logits/rejected": -2.6748807430267334, "logps/chosen": -366.6900634765625, "logps/rejected": -485.0727844238281, "loss": 0.482, "rewards/accuracies": 0.578125, "rewards/chosen": -1.4285987615585327, "rewards/margins": 1.416813611984253, "rewards/rejected": -2.845412254333496, "step": 303 }, { "epoch": 1.436503248670998, "grad_norm": 4.878881366827008, "learning_rate": 9.584935313456596e-08, "logits/chosen": -2.231307029724121, "logits/rejected": -2.287929058074951, "logps/chosen": -352.46282958984375, "logps/rejected": -453.6793212890625, "loss": 0.4942, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5504112243652344, "rewards/margins": 1.3251781463623047, "rewards/rejected": -2.875589609146118, "step": 304 }, { "epoch": 1.4412285883047844, "grad_norm": 4.241730784056723, "learning_rate": 9.4342120808907e-08, "logits/chosen": -2.703420639038086, "logits/rejected": -2.6591320037841797, "logps/chosen": -383.8643493652344, "logps/rejected": -594.2821655273438, "loss": 0.4792, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5567654371261597, "rewards/margins": 2.0447824001312256, "rewards/rejected": -3.6015477180480957, "step": 305 }, { "epoch": 1.4459539279385707, "grad_norm": 3.8843212431707284, "learning_rate": 9.284407227660249e-08, "logits/chosen": -2.8023083209991455, "logits/rejected": -2.7946949005126953, "logps/chosen": -337.86090087890625, "logps/rejected": -434.48260498046875, "loss": 0.4894, "rewards/accuracies": 0.78125, "rewards/chosen": -1.2694463729858398, "rewards/margins": 1.1360807418823242, "rewards/rejected": -2.405527114868164, "step": 306 }, { "epoch": 1.4506792675723568, "grad_norm": 3.978483022316159, "learning_rate": 9.13552959223089e-08, "logits/chosen": -2.676576852798462, "logits/rejected": -2.5387959480285645, "logps/chosen": -313.46954345703125, "logps/rejected": -421.6120300292969, "loss": 0.4765, "rewards/accuracies": 0.625, "rewards/chosen": -1.3958847522735596, "rewards/margins": 0.8875546455383301, "rewards/rejected": -2.2834396362304688, "step": 307 }, { "epoch": 1.4554046072061428, "grad_norm": 4.395780053942796, "learning_rate": 8.987587958362516e-08, "logits/chosen": -2.856872320175171, "logits/rejected": -2.8188376426696777, "logps/chosen": -371.1302795410156, "logps/rejected": -465.9201354980469, "loss": 0.4681, "rewards/accuracies": 0.765625, "rewards/chosen": -1.4001535177230835, "rewards/margins": 1.085311770439148, "rewards/rejected": -2.4854652881622314, "step": 308 }, { "epoch": 1.4601299468399291, "grad_norm": 3.8656297909933772, "learning_rate": 8.840591054591096e-08, "logits/chosen": -2.5090444087982178, "logits/rejected": -2.6375505924224854, "logps/chosen": -410.7342529296875, "logps/rejected": -467.5154724121094, "loss": 0.4832, "rewards/accuracies": 0.703125, "rewards/chosen": -1.2629905939102173, "rewards/margins": 1.3948893547058105, "rewards/rejected": -2.6578800678253174, "step": 309 }, { "epoch": 1.4648552864737154, "grad_norm": 4.334534835752672, "learning_rate": 8.694547553713618e-08, "logits/chosen": -2.759681224822998, "logits/rejected": -2.73026180267334, "logps/chosen": -355.33624267578125, "logps/rejected": -529.0985107421875, "loss": 0.4896, "rewards/accuracies": 0.796875, "rewards/chosen": -1.521816372871399, "rewards/margins": 1.6343505382537842, "rewards/rejected": -3.1561670303344727, "step": 310 }, { "epoch": 1.4695806261075015, "grad_norm": 4.853059543604455, "learning_rate": 8.54946607227644e-08, "logits/chosen": -2.4591803550720215, "logits/rejected": -2.5936403274536133, "logps/chosen": -391.37298583984375, "logps/rejected": -434.3221435546875, "loss": 0.4857, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5846033096313477, "rewards/margins": 1.0046393871307373, "rewards/rejected": -2.589242696762085, "step": 311 }, { "epoch": 1.4743059657412876, "grad_norm": 5.028900388618828, "learning_rate": 8.405355170066925e-08, "logits/chosen": -2.7181220054626465, "logits/rejected": -2.590919017791748, "logps/chosen": -380.94818115234375, "logps/rejected": -548.143798828125, "loss": 0.4851, "rewards/accuracies": 0.734375, "rewards/chosen": -1.6279135942459106, "rewards/margins": 1.5961647033691406, "rewards/rejected": -3.2240781784057617, "step": 312 }, { "epoch": 1.4790313053750739, "grad_norm": 4.261294839829192, "learning_rate": 8.262223349608366e-08, "logits/chosen": -2.7533867359161377, "logits/rejected": -2.8102259635925293, "logps/chosen": -394.7510681152344, "logps/rejected": -499.4160461425781, "loss": 0.4965, "rewards/accuracies": 0.546875, "rewards/chosen": -1.5835039615631104, "rewards/margins": 1.1018283367156982, "rewards/rejected": -2.6853325366973877, "step": 313 }, { "epoch": 1.48375664500886, "grad_norm": 4.418632236406977, "learning_rate": 8.120079055658402e-08, "logits/chosen": -2.642446279525757, "logits/rejected": -2.6767466068267822, "logps/chosen": -323.5433349609375, "logps/rejected": -469.7967834472656, "loss": 0.475, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2628949880599976, "rewards/margins": 1.5510131120681763, "rewards/rejected": -2.813908100128174, "step": 314 }, { "epoch": 1.4884819846426462, "grad_norm": 4.312754452868253, "learning_rate": 7.978930674710719e-08, "logits/chosen": -2.4338035583496094, "logits/rejected": -2.444002151489258, "logps/chosen": -374.0926513671875, "logps/rejected": -496.3292236328125, "loss": 0.4875, "rewards/accuracies": 0.65625, "rewards/chosen": -1.539766788482666, "rewards/margins": 1.5390400886535645, "rewards/rejected": -3.0788071155548096, "step": 315 }, { "epoch": 1.4932073242764323, "grad_norm": 4.036827603807639, "learning_rate": 7.838786534500269e-08, "logits/chosen": -2.7101027965545654, "logits/rejected": -2.7315640449523926, "logps/chosen": -367.41107177734375, "logps/rejected": -477.13916015625, "loss": 0.4933, "rewards/accuracies": 0.671875, "rewards/chosen": -1.501757025718689, "rewards/margins": 1.17995023727417, "rewards/rejected": -2.6817073822021484, "step": 316 }, { "epoch": 1.4979326639102186, "grad_norm": 4.1034959013822645, "learning_rate": 7.699654903511971e-08, "logits/chosen": -2.4980247020721436, "logits/rejected": -2.569985866546631, "logps/chosen": -306.99346923828125, "logps/rejected": -443.2658996582031, "loss": 0.4761, "rewards/accuracies": 0.828125, "rewards/chosen": -1.0971927642822266, "rewards/margins": 1.3962390422821045, "rewards/rejected": -2.49343204498291, "step": 317 }, { "epoch": 1.5026580035440047, "grad_norm": 3.8908709553914083, "learning_rate": 7.561543990492803e-08, "logits/chosen": -2.5545809268951416, "logits/rejected": -2.7232065200805664, "logps/chosen": -382.45367431640625, "logps/rejected": -496.840087890625, "loss": 0.4696, "rewards/accuracies": 0.703125, "rewards/chosen": -1.5967504978179932, "rewards/margins": 1.5658732652664185, "rewards/rejected": -3.162623643875122, "step": 318 }, { "epoch": 1.507383343177791, "grad_norm": 4.1097836221394095, "learning_rate": 7.424461943967555e-08, "logits/chosen": -2.5873563289642334, "logits/rejected": -2.7303099632263184, "logps/chosen": -394.63916015625, "logps/rejected": -551.1517333984375, "loss": 0.4793, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5619065761566162, "rewards/margins": 1.4349513053894043, "rewards/rejected": -2.9968576431274414, "step": 319 }, { "epoch": 1.512108682811577, "grad_norm": 3.8818233399377373, "learning_rate": 7.288416851758016e-08, "logits/chosen": -2.638657808303833, "logits/rejected": -2.547767162322998, "logps/chosen": -392.5679626464844, "logps/rejected": -627.8018188476562, "loss": 0.4782, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5840284824371338, "rewards/margins": 2.276487112045288, "rewards/rejected": -3.860515594482422, "step": 320 }, { "epoch": 1.5168340224453631, "grad_norm": 4.298412046444391, "learning_rate": 7.153416740505814e-08, "logits/chosen": -2.473698854446411, "logits/rejected": -2.580441951751709, "logps/chosen": -402.406982421875, "logps/rejected": -486.8596496582031, "loss": 0.4701, "rewards/accuracies": 0.703125, "rewards/chosen": -1.5713368654251099, "rewards/margins": 1.1730860471725464, "rewards/rejected": -2.7444231510162354, "step": 321 }, { "epoch": 1.5215593620791494, "grad_norm": 3.844411445149988, "learning_rate": 7.01946957519886e-08, "logits/chosen": -2.414036273956299, "logits/rejected": -2.5014243125915527, "logps/chosen": -408.16455078125, "logps/rejected": -474.1005859375, "loss": 0.4766, "rewards/accuracies": 0.640625, "rewards/chosen": -1.5195822715759277, "rewards/margins": 1.162341594696045, "rewards/rejected": -2.6819238662719727, "step": 322 }, { "epoch": 1.5262847017129357, "grad_norm": 4.081324399203426, "learning_rate": 6.88658325870138e-08, "logits/chosen": -2.6832616329193115, "logits/rejected": -2.7357096672058105, "logps/chosen": -385.5028076171875, "logps/rejected": -469.4779052734375, "loss": 0.4665, "rewards/accuracies": 0.734375, "rewards/chosen": -1.399859070777893, "rewards/margins": 1.3238856792449951, "rewards/rejected": -2.7237446308135986, "step": 323 }, { "epoch": 1.5310100413467218, "grad_norm": 5.228883712742776, "learning_rate": 6.754765631287695e-08, "logits/chosen": -2.544619560241699, "logits/rejected": -2.655355453491211, "logps/chosen": -355.68328857421875, "logps/rejected": -468.37738037109375, "loss": 0.4799, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6572238206863403, "rewards/margins": 1.6221368312835693, "rewards/rejected": -3.27936053276062, "step": 324 }, { "epoch": 1.5357353809805079, "grad_norm": 3.85359128112212, "learning_rate": 6.62402447017959e-08, "logits/chosen": -2.255566358566284, "logits/rejected": -2.2861790657043457, "logps/chosen": -374.5622863769531, "logps/rejected": -525.822265625, "loss": 0.4804, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5297114849090576, "rewards/margins": 1.5574190616607666, "rewards/rejected": -3.087130546569824, "step": 325 }, { "epoch": 1.5404607206142942, "grad_norm": 3.9384759163947733, "learning_rate": 6.494367489087488e-08, "logits/chosen": -2.310734987258911, "logits/rejected": -2.331479072570801, "logps/chosen": -353.3757629394531, "logps/rejected": -456.4471740722656, "loss": 0.4973, "rewards/accuracies": 0.796875, "rewards/chosen": -1.2544013261795044, "rewards/margins": 0.9933812618255615, "rewards/rejected": -2.2477827072143555, "step": 326 }, { "epoch": 1.5451860602480805, "grad_norm": 4.707537372396568, "learning_rate": 6.365802337755364e-08, "logits/chosen": -2.5162229537963867, "logits/rejected": -2.5781514644622803, "logps/chosen": -359.474365234375, "logps/rejected": -459.92413330078125, "loss": 0.4737, "rewards/accuracies": 0.75, "rewards/chosen": -1.3576124906539917, "rewards/margins": 1.2539989948272705, "rewards/rejected": -2.6116113662719727, "step": 327 }, { "epoch": 1.5499113998818665, "grad_norm": 3.5347435278013024, "learning_rate": 6.238336601509364e-08, "logits/chosen": -2.4307329654693604, "logits/rejected": -2.413248300552368, "logps/chosen": -346.82537841796875, "logps/rejected": -487.83111572265625, "loss": 0.4487, "rewards/accuracies": 0.765625, "rewards/chosen": -1.5922647714614868, "rewards/margins": 1.7097985744476318, "rewards/rejected": -3.302063465118408, "step": 328 }, { "epoch": 1.5546367395156526, "grad_norm": 4.5192043219385925, "learning_rate": 6.111977800810316e-08, "logits/chosen": -2.4796946048736572, "logits/rejected": -2.3877992630004883, "logps/chosen": -332.8540344238281, "logps/rejected": -508.25360107421875, "loss": 0.4836, "rewards/accuracies": 0.734375, "rewards/chosen": -1.7624458074569702, "rewards/margins": 1.4423408508300781, "rewards/rejected": -3.204786777496338, "step": 329 }, { "epoch": 1.559362079149439, "grad_norm": 4.386475280759837, "learning_rate": 5.986733390809993e-08, "logits/chosen": -2.400326728820801, "logits/rejected": -2.2906858921051025, "logps/chosen": -386.3865966796875, "logps/rejected": -556.1158447265625, "loss": 0.4623, "rewards/accuracies": 0.703125, "rewards/chosen": -1.8006043434143066, "rewards/margins": 1.7274423837661743, "rewards/rejected": -3.5280466079711914, "step": 330 }, { "epoch": 1.564087418783225, "grad_norm": 4.101101033266814, "learning_rate": 5.862610760911257e-08, "logits/chosen": -2.4113216400146484, "logits/rejected": -2.3908936977386475, "logps/chosen": -364.8887939453125, "logps/rejected": -459.69854736328125, "loss": 0.4372, "rewards/accuracies": 0.625, "rewards/chosen": -1.4839426279067993, "rewards/margins": 1.3043051958084106, "rewards/rejected": -2.78824782371521, "step": 331 }, { "epoch": 1.5688127584170113, "grad_norm": 4.330222141211974, "learning_rate": 5.739617234332131e-08, "logits/chosen": -2.6859869956970215, "logits/rejected": -2.7122979164123535, "logps/chosen": -405.88018798828125, "logps/rejected": -452.80926513671875, "loss": 0.4648, "rewards/accuracies": 0.640625, "rewards/chosen": -1.5999912023544312, "rewards/margins": 0.9445231556892395, "rewards/rejected": -2.5445144176483154, "step": 332 }, { "epoch": 1.5735380980507974, "grad_norm": 5.485683540439284, "learning_rate": 5.6177600676736656e-08, "logits/chosen": -2.5637035369873047, "logits/rejected": -2.5589380264282227, "logps/chosen": -393.2574462890625, "logps/rejected": -529.729248046875, "loss": 0.5169, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7628885507583618, "rewards/margins": 1.4583940505981445, "rewards/rejected": -3.221282482147217, "step": 333 }, { "epoch": 1.5782634376845834, "grad_norm": 3.9160914653232983, "learning_rate": 5.4970464504918654e-08, "logits/chosen": -2.7670090198516846, "logits/rejected": -2.7049059867858887, "logps/chosen": -369.9678039550781, "logps/rejected": -494.07196044921875, "loss": 0.4565, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4675428867340088, "rewards/margins": 1.4338531494140625, "rewards/rejected": -2.901395797729492, "step": 334 }, { "epoch": 1.5829887773183697, "grad_norm": 3.637994981997343, "learning_rate": 5.37748350487344e-08, "logits/chosen": -2.6616098880767822, "logits/rejected": -2.6261179447174072, "logps/chosen": -340.9493408203125, "logps/rejected": -505.6307067871094, "loss": 0.4528, "rewards/accuracies": 0.703125, "rewards/chosen": -1.4446711540222168, "rewards/margins": 1.6602320671081543, "rewards/rejected": -3.10490345954895, "step": 335 }, { "epoch": 1.587714116952156, "grad_norm": 3.742063853340133, "learning_rate": 5.2590782850156667e-08, "logits/chosen": -2.755837917327881, "logits/rejected": -2.6713008880615234, "logps/chosen": -421.58154296875, "logps/rejected": -625.4422607421875, "loss": 0.4608, "rewards/accuracies": 0.734375, "rewards/chosen": -1.701414704322815, "rewards/margins": 1.9440244436264038, "rewards/rejected": -3.6454391479492188, "step": 336 }, { "epoch": 1.592439456585942, "grad_norm": 3.5742676774235154, "learning_rate": 5.14183777681014e-08, "logits/chosen": -2.466548442840576, "logits/rejected": -2.5580849647521973, "logps/chosen": -414.5927734375, "logps/rejected": -506.8092956542969, "loss": 0.5037, "rewards/accuracies": 0.6875, "rewards/chosen": -1.518671989440918, "rewards/margins": 1.1154457330703735, "rewards/rejected": -2.634117603302002, "step": 337 }, { "epoch": 1.5971647962197282, "grad_norm": 3.6653171076626205, "learning_rate": 5.0257688974306436e-08, "logits/chosen": -2.8738627433776855, "logits/rejected": -3.015981435775757, "logps/chosen": -381.5254211425781, "logps/rejected": -457.140869140625, "loss": 0.4883, "rewards/accuracies": 0.75, "rewards/chosen": -1.607730507850647, "rewards/margins": 1.3426533937454224, "rewards/rejected": -2.9503836631774902, "step": 338 }, { "epoch": 1.6018901358535145, "grad_norm": 3.837456752855929, "learning_rate": 4.910878494925008e-08, "logits/chosen": -2.6002864837646484, "logits/rejected": -2.6144702434539795, "logps/chosen": -402.1292724609375, "logps/rejected": -593.8958740234375, "loss": 0.4436, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5465564727783203, "rewards/margins": 1.739166498184204, "rewards/rejected": -3.2857229709625244, "step": 339 }, { "epoch": 1.6066154754873008, "grad_norm": 3.9821598438730565, "learning_rate": 4.7971733478111094e-08, "logits/chosen": -2.5669634342193604, "logits/rejected": -2.5983939170837402, "logps/chosen": -384.938720703125, "logps/rejected": -557.4461669921875, "loss": 0.471, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5989502668380737, "rewards/margins": 1.8718353509902954, "rewards/rejected": -3.4707858562469482, "step": 340 }, { "epoch": 1.6113408151210868, "grad_norm": 4.1505854551229735, "learning_rate": 4.684660164676896e-08, "logits/chosen": -2.4149627685546875, "logits/rejected": -2.3984737396240234, "logps/chosen": -367.2955322265625, "logps/rejected": -546.0213623046875, "loss": 0.4849, "rewards/accuracies": 0.75, "rewards/chosen": -1.5166963338851929, "rewards/margins": 1.747159719467163, "rewards/rejected": -3.2638559341430664, "step": 341 }, { "epoch": 1.616066154754873, "grad_norm": 4.195319716535671, "learning_rate": 4.5733455837846325e-08, "logits/chosen": -2.6863296031951904, "logits/rejected": -2.740741014480591, "logps/chosen": -388.5705261230469, "logps/rejected": -527.72265625, "loss": 0.4803, "rewards/accuracies": 0.65625, "rewards/chosen": -1.659536600112915, "rewards/margins": 1.501134991645813, "rewards/rejected": -3.1606712341308594, "step": 342 }, { "epoch": 1.6207914943886592, "grad_norm": 4.414273407573815, "learning_rate": 4.4632361726791914e-08, "logits/chosen": -2.6036033630371094, "logits/rejected": -2.6751937866210938, "logps/chosen": -401.4815979003906, "logps/rejected": -471.8222961425781, "loss": 0.5018, "rewards/accuracies": 0.71875, "rewards/chosen": -1.435232162475586, "rewards/margins": 1.3340187072753906, "rewards/rejected": -2.7692506313323975, "step": 343 }, { "epoch": 1.6255168340224455, "grad_norm": 5.041321789664628, "learning_rate": 4.354338427800619e-08, "logits/chosen": -2.6457765102386475, "logits/rejected": -2.5664923191070557, "logps/chosen": -325.753662109375, "logps/rejected": -515.0615844726562, "loss": 0.4724, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4971141815185547, "rewards/margins": 1.5831940174102783, "rewards/rejected": -3.080307960510254, "step": 344 }, { "epoch": 1.6255168340224455, "eval_logits/chosen": -2.623720645904541, "eval_logits/rejected": -2.6295571327209473, "eval_logps/chosen": -379.5492858886719, "eval_logps/rejected": -517.3630981445312, "eval_loss": 0.47301965951919556, "eval_rewards/accuracies": 0.6382575631141663, "eval_rewards/chosen": -1.683185338973999, "eval_rewards/margins": 1.4909465312957764, "eval_rewards/rejected": -3.1741318702697754, "eval_runtime": 225.158, "eval_samples_per_second": 16.237, "eval_steps_per_second": 0.293, "step": 344 }, { "epoch": 1.6302421736562316, "grad_norm": 3.9177181536561396, "learning_rate": 4.246658774100803e-08, "logits/chosen": -2.5313777923583984, "logits/rejected": -2.6277084350585938, "logps/chosen": -407.18621826171875, "logps/rejected": -496.5007629394531, "loss": 0.4807, "rewards/accuracies": 0.703125, "rewards/chosen": -1.6058672666549683, "rewards/margins": 1.1191009283065796, "rewards/rejected": -2.724968194961548, "step": 345 }, { "epoch": 1.6349675132900177, "grad_norm": 3.841031231021936, "learning_rate": 4.140203564664421e-08, "logits/chosen": -2.6209938526153564, "logits/rejected": -2.650219678878784, "logps/chosen": -361.2805480957031, "logps/rejected": -478.2210998535156, "loss": 0.4595, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4036047458648682, "rewards/margins": 1.4318450689315796, "rewards/rejected": -2.8354499340057373, "step": 346 }, { "epoch": 1.6396928529238037, "grad_norm": 3.522512280432432, "learning_rate": 4.0349790803341274e-08, "logits/chosen": -2.8207521438598633, "logits/rejected": -2.7012057304382324, "logps/chosen": -348.5565490722656, "logps/rejected": -503.2954406738281, "loss": 0.4711, "rewards/accuracies": 0.765625, "rewards/chosen": -1.5178475379943848, "rewards/margins": 1.5804749727249146, "rewards/rejected": -3.0983223915100098, "step": 347 }, { "epoch": 1.64441819255759, "grad_norm": 4.0111825885134245, "learning_rate": 3.930991529339936e-08, "logits/chosen": -2.629011631011963, "logits/rejected": -2.583953380584717, "logps/chosen": -376.8917236328125, "logps/rejected": -623.347412109375, "loss": 0.4686, "rewards/accuracies": 0.703125, "rewards/chosen": -1.57673978805542, "rewards/margins": 2.396878719329834, "rewards/rejected": -3.973618268966675, "step": 348 }, { "epoch": 1.6491435321913763, "grad_norm": 4.471413252688273, "learning_rate": 3.828247046932992e-08, "logits/chosen": -2.6523804664611816, "logits/rejected": -2.6967380046844482, "logps/chosen": -339.6617126464844, "logps/rejected": -441.6883239746094, "loss": 0.4632, "rewards/accuracies": 0.71875, "rewards/chosen": -1.344817876815796, "rewards/margins": 1.1860837936401367, "rewards/rejected": -2.5309014320373535, "step": 349 }, { "epoch": 1.6538688718251624, "grad_norm": 3.9015948159102902, "learning_rate": 3.7267516950235525e-08, "logits/chosen": -2.590344190597534, "logits/rejected": -2.6862494945526123, "logps/chosen": -347.7746887207031, "logps/rejected": -500.335693359375, "loss": 0.4407, "rewards/accuracies": 0.828125, "rewards/chosen": -1.4618185758590698, "rewards/margins": 1.6277704238891602, "rewards/rejected": -3.0895891189575195, "step": 350 }, { "epoch": 1.6585942114589485, "grad_norm": 4.642097450354513, "learning_rate": 3.62651146182334e-08, "logits/chosen": -2.656710624694824, "logits/rejected": -2.673964738845825, "logps/chosen": -364.386962890625, "logps/rejected": -458.63348388671875, "loss": 0.5201, "rewards/accuracies": 0.65625, "rewards/chosen": -1.533918023109436, "rewards/margins": 1.1320774555206299, "rewards/rejected": -2.6659955978393555, "step": 351 }, { "epoch": 1.6633195510927348, "grad_norm": 4.219548915754237, "learning_rate": 3.527532261492272e-08, "logits/chosen": -2.572221517562866, "logits/rejected": -2.526576042175293, "logps/chosen": -370.37396240234375, "logps/rejected": -487.1659240722656, "loss": 0.4737, "rewards/accuracies": 0.734375, "rewards/chosen": -1.4427790641784668, "rewards/margins": 1.223225712776184, "rewards/rejected": -2.6660046577453613, "step": 352 }, { "epoch": 1.668044890726521, "grad_norm": 4.167493075518676, "learning_rate": 3.4298199337894685e-08, "logits/chosen": -2.6304097175598145, "logits/rejected": -2.6287808418273926, "logps/chosen": -387.1392822265625, "logps/rejected": -581.6126708984375, "loss": 0.4896, "rewards/accuracies": 0.734375, "rewards/chosen": -1.8162624835968018, "rewards/margins": 2.150383234024048, "rewards/rejected": -3.9666457176208496, "step": 353 }, { "epoch": 1.6727702303603071, "grad_norm": 5.897960463157494, "learning_rate": 3.333380243728773e-08, "logits/chosen": -2.4372665882110596, "logits/rejected": -2.5481488704681396, "logps/chosen": -380.5074157714844, "logps/rejected": -474.3066711425781, "loss": 0.4799, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3929729461669922, "rewards/margins": 1.5269252061843872, "rewards/rejected": -2.919898271560669, "step": 354 }, { "epoch": 1.6774955699940932, "grad_norm": 4.6734891602402, "learning_rate": 3.238218881238558e-08, "logits/chosen": -2.68146014213562, "logits/rejected": -2.764531135559082, "logps/chosen": -385.89788818359375, "logps/rejected": -453.99371337890625, "loss": 0.4799, "rewards/accuracies": 0.734375, "rewards/chosen": -1.5357441902160645, "rewards/margins": 1.310141921043396, "rewards/rejected": -2.845885992050171, "step": 355 }, { "epoch": 1.6822209096278795, "grad_norm": 3.9989351330319503, "learning_rate": 3.1443414608260526e-08, "logits/chosen": -2.7942676544189453, "logits/rejected": -2.7728328704833984, "logps/chosen": -417.6398010253906, "logps/rejected": -534.0235595703125, "loss": 0.4725, "rewards/accuracies": 0.796875, "rewards/chosen": -1.648948073387146, "rewards/margins": 1.4638010263442993, "rewards/rejected": -3.1127490997314453, "step": 356 }, { "epoch": 1.6869462492616658, "grad_norm": 4.114134264165304, "learning_rate": 3.0517535212460946e-08, "logits/chosen": -2.6477482318878174, "logits/rejected": -2.7860238552093506, "logps/chosen": -470.87103271484375, "logps/rejected": -513.6318969726562, "loss": 0.469, "rewards/accuracies": 0.640625, "rewards/chosen": -1.568166732788086, "rewards/margins": 0.9764127731323242, "rewards/rejected": -2.5445797443389893, "step": 357 }, { "epoch": 1.6916715888954519, "grad_norm": 4.425834666691968, "learning_rate": 2.960460525174313e-08, "logits/chosen": -2.8916306495666504, "logits/rejected": -2.853088855743408, "logps/chosen": -339.81610107421875, "logps/rejected": -483.9287109375, "loss": 0.472, "rewards/accuracies": 0.671875, "rewards/chosen": -1.479756236076355, "rewards/margins": 1.5092005729675293, "rewards/rejected": -2.9889566898345947, "step": 358 }, { "epoch": 1.696396928529238, "grad_norm": 4.184257952905371, "learning_rate": 2.8704678588848535e-08, "logits/chosen": -2.52712345123291, "logits/rejected": -2.5050594806671143, "logps/chosen": -368.9576416015625, "logps/rejected": -528.0908813476562, "loss": 0.4573, "rewards/accuracies": 0.71875, "rewards/chosen": -1.523732304573059, "rewards/margins": 1.6147029399871826, "rewards/rejected": -3.1384353637695312, "step": 359 }, { "epoch": 1.7011222681630243, "grad_norm": 4.172727890906571, "learning_rate": 2.781780831932595e-08, "logits/chosen": -2.6111361980438232, "logits/rejected": -2.634065628051758, "logps/chosen": -437.898681640625, "logps/rejected": -492.3134460449219, "loss": 0.4917, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5102522373199463, "rewards/margins": 0.8861820697784424, "rewards/rejected": -2.3964343070983887, "step": 360 }, { "epoch": 1.7058476077968105, "grad_norm": 3.6954976996048376, "learning_rate": 2.6944046768398565e-08, "logits/chosen": -2.5292959213256836, "logits/rejected": -2.5336508750915527, "logps/chosen": -348.8612060546875, "logps/rejected": -474.258056640625, "loss": 0.4838, "rewards/accuracies": 0.671875, "rewards/chosen": -1.3767406940460205, "rewards/margins": 1.3499197959899902, "rewards/rejected": -2.7266602516174316, "step": 361 }, { "epoch": 1.7105729474305966, "grad_norm": 4.557110158231638, "learning_rate": 2.608344548787722e-08, "logits/chosen": -2.493603229522705, "logits/rejected": -2.6656622886657715, "logps/chosen": -443.8504638671875, "logps/rejected": -544.0814819335938, "loss": 0.4946, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7300480604171753, "rewards/margins": 1.5911731719970703, "rewards/rejected": -3.321221113204956, "step": 362 }, { "epoch": 1.7152982870643827, "grad_norm": 4.456300726365723, "learning_rate": 2.523605525311842e-08, "logits/chosen": -2.5634899139404297, "logits/rejected": -2.5102734565734863, "logps/chosen": -380.3689270019531, "logps/rejected": -480.8353271484375, "loss": 0.457, "rewards/accuracies": 0.703125, "rewards/chosen": -1.3825726509094238, "rewards/margins": 0.8929464221000671, "rewards/rejected": -2.2755191326141357, "step": 363 }, { "epoch": 1.7200236266981688, "grad_norm": 4.384534987558066, "learning_rate": 2.440192606002889e-08, "logits/chosen": -2.7241060733795166, "logits/rejected": -2.700328826904297, "logps/chosen": -400.6265869140625, "logps/rejected": -509.753173828125, "loss": 0.4544, "rewards/accuracies": 0.609375, "rewards/chosen": -1.817798137664795, "rewards/margins": 1.4839262962341309, "rewards/rejected": -3.3017241954803467, "step": 364 }, { "epoch": 1.724748966331955, "grad_norm": 4.665836153801188, "learning_rate": 2.3581107122115723e-08, "logits/chosen": -2.7754364013671875, "logits/rejected": -2.7966537475585938, "logps/chosen": -398.6808166503906, "logps/rejected": -486.95263671875, "loss": 0.4908, "rewards/accuracies": 0.671875, "rewards/chosen": -1.9782556295394897, "rewards/margins": 1.371333360671997, "rewards/rejected": -3.3495888710021973, "step": 365 }, { "epoch": 1.7294743059657414, "grad_norm": 4.147614147053133, "learning_rate": 2.2773646867582763e-08, "logits/chosen": -2.626425266265869, "logits/rejected": -2.57529354095459, "logps/chosen": -398.737548828125, "logps/rejected": -586.5382690429688, "loss": 0.488, "rewards/accuracies": 0.734375, "rewards/chosen": -1.5953752994537354, "rewards/margins": 1.6417709589004517, "rewards/rejected": -3.2371463775634766, "step": 366 }, { "epoch": 1.7341996455995274, "grad_norm": 4.0997514353208775, "learning_rate": 2.19795929364735e-08, "logits/chosen": -2.473259449005127, "logits/rejected": -2.6184892654418945, "logps/chosen": -344.3788757324219, "logps/rejected": -420.8124694824219, "loss": 0.4917, "rewards/accuracies": 0.625, "rewards/chosen": -1.3588237762451172, "rewards/margins": 1.1373924016952515, "rewards/rejected": -2.496216058731079, "step": 367 }, { "epoch": 1.7389249852333135, "grad_norm": 4.374774389574994, "learning_rate": 2.119899217785995e-08, "logits/chosen": -2.467965841293335, "logits/rejected": -2.5698554515838623, "logps/chosen": -387.9084777832031, "logps/rejected": -457.1981201171875, "loss": 0.4716, "rewards/accuracies": 0.8125, "rewards/chosen": -1.393808364868164, "rewards/margins": 1.2519505023956299, "rewards/rejected": -2.645759105682373, "step": 368 }, { "epoch": 1.7436503248670998, "grad_norm": 4.049649530225524, "learning_rate": 2.0431890647079093e-08, "logits/chosen": -2.407700538635254, "logits/rejected": -2.458270788192749, "logps/chosen": -414.94561767578125, "logps/rejected": -544.7018432617188, "loss": 0.4652, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8057122230529785, "rewards/margins": 1.333924412727356, "rewards/rejected": -3.139636754989624, "step": 369 }, { "epoch": 1.748375664500886, "grad_norm": 4.671069391094125, "learning_rate": 1.967833360301513e-08, "logits/chosen": -2.637674331665039, "logits/rejected": -2.722353458404541, "logps/chosen": -357.71142578125, "logps/rejected": -436.8525390625, "loss": 0.4661, "rewards/accuracies": 0.640625, "rewards/chosen": -1.5406451225280762, "rewards/margins": 1.1408922672271729, "rewards/rejected": -2.681537389755249, "step": 370 }, { "epoch": 1.7531010041346722, "grad_norm": 4.49257028234618, "learning_rate": 1.8938365505429544e-08, "logits/chosen": -2.710331678390503, "logits/rejected": -2.7880280017852783, "logps/chosen": -386.74945068359375, "logps/rejected": -489.5121154785156, "loss": 0.4942, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6727674007415771, "rewards/margins": 1.33446204662323, "rewards/rejected": -3.0072293281555176, "step": 371 }, { "epoch": 1.7578263437684583, "grad_norm": 3.5465325285307228, "learning_rate": 1.8212030012337704e-08, "logits/chosen": -2.757737874984741, "logits/rejected": -2.687244415283203, "logps/chosen": -360.2991943359375, "logps/rejected": -512.8646240234375, "loss": 0.4577, "rewards/accuracies": 0.703125, "rewards/chosen": -1.4896515607833862, "rewards/margins": 1.5841938257217407, "rewards/rejected": -3.0738449096679688, "step": 372 }, { "epoch": 1.7625516834022446, "grad_norm": 4.417035193749527, "learning_rate": 1.7499369977433453e-08, "logits/chosen": -2.6367974281311035, "logits/rejected": -2.677651882171631, "logps/chosen": -362.3944396972656, "logps/rejected": -460.9113464355469, "loss": 0.4586, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5859841108322144, "rewards/margins": 1.0494407415390015, "rewards/rejected": -2.6354243755340576, "step": 373 }, { "epoch": 1.7672770230360308, "grad_norm": 4.392962342030237, "learning_rate": 1.680042744756016e-08, "logits/chosen": -2.9928336143493652, "logits/rejected": -2.9165008068084717, "logps/chosen": -366.8034973144531, "logps/rejected": -550.0221557617188, "loss": 0.4769, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5658485889434814, "rewards/margins": 1.7793083190917969, "rewards/rejected": -3.3451569080352783, "step": 374 }, { "epoch": 1.772002362669817, "grad_norm": 4.856077218434121, "learning_rate": 1.611524366023062e-08, "logits/chosen": -2.722025156021118, "logits/rejected": -2.8585572242736816, "logps/chosen": -347.9723815917969, "logps/rejected": -446.63604736328125, "loss": 0.4742, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4827804565429688, "rewards/margins": 1.0786786079406738, "rewards/rejected": -2.5614588260650635, "step": 375 }, { "epoch": 1.776727702303603, "grad_norm": 3.752916160722341, "learning_rate": 1.544385904119344e-08, "logits/chosen": -2.913771867752075, "logits/rejected": -2.971179962158203, "logps/chosen": -360.3616638183594, "logps/rejected": -423.8080749511719, "loss": 0.4586, "rewards/accuracies": 0.75, "rewards/chosen": -1.482804298400879, "rewards/margins": 1.2081830501556396, "rewards/rejected": -2.6909875869750977, "step": 376 }, { "epoch": 1.7814530419373893, "grad_norm": 4.12751400895733, "learning_rate": 1.4786313202048456e-08, "logits/chosen": -2.6907248497009277, "logits/rejected": -2.816317319869995, "logps/chosen": -403.627685546875, "logps/rejected": -487.9366760253906, "loss": 0.4929, "rewards/accuracies": 0.671875, "rewards/chosen": -1.6239163875579834, "rewards/margins": 1.5215915441513062, "rewards/rejected": -3.1455078125, "step": 377 }, { "epoch": 1.7861783815711756, "grad_norm": 4.590826584263257, "learning_rate": 1.4142644937909203e-08, "logits/chosen": -2.6118569374084473, "logits/rejected": -2.6019818782806396, "logps/chosen": -341.8358154296875, "logps/rejected": -419.81829833984375, "loss": 0.484, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3214576244354248, "rewards/margins": 0.8901782631874084, "rewards/rejected": -2.2116360664367676, "step": 378 }, { "epoch": 1.7909037212049617, "grad_norm": 4.146292704965862, "learning_rate": 1.351289222511426e-08, "logits/chosen": -2.610994577407837, "logits/rejected": -2.78933048248291, "logps/chosen": -369.8974609375, "logps/rejected": -480.7877197265625, "loss": 0.4694, "rewards/accuracies": 0.75, "rewards/chosen": -1.5018396377563477, "rewards/margins": 1.5635069608688354, "rewards/rejected": -3.0653464794158936, "step": 379 }, { "epoch": 1.7956290608387477, "grad_norm": 5.1734179405150895, "learning_rate": 1.2897092218986716e-08, "logits/chosen": -2.523732900619507, "logits/rejected": -2.6424248218536377, "logps/chosen": -425.2537536621094, "logps/rejected": -560.3030395507812, "loss": 0.4854, "rewards/accuracies": 0.703125, "rewards/chosen": -1.5777177810668945, "rewards/margins": 1.5381261110305786, "rewards/rejected": -3.1158437728881836, "step": 380 }, { "epoch": 1.8003544004725338, "grad_norm": 3.865564251130276, "learning_rate": 1.2295281251641698e-08, "logits/chosen": -2.653510570526123, "logits/rejected": -2.6763200759887695, "logps/chosen": -392.48577880859375, "logps/rejected": -508.2447509765625, "loss": 0.4676, "rewards/accuracies": 0.671875, "rewards/chosen": -1.5738812685012817, "rewards/margins": 1.344365119934082, "rewards/rejected": -2.918246269226074, "step": 381 }, { "epoch": 1.80507974010632, "grad_norm": 3.9007660136930062, "learning_rate": 1.1707494829843207e-08, "logits/chosen": -2.4932150840759277, "logits/rejected": -2.525545835494995, "logps/chosen": -393.6082763671875, "logps/rejected": -494.412109375, "loss": 0.4637, "rewards/accuracies": 0.765625, "rewards/chosen": -1.6010535955429077, "rewards/margins": 1.5001673698425293, "rewards/rejected": -3.1012210845947266, "step": 382 }, { "epoch": 1.8098050797401064, "grad_norm": 3.789687566531843, "learning_rate": 1.1133767632908798e-08, "logits/chosen": -2.773787021636963, "logits/rejected": -2.7511181831359863, "logps/chosen": -374.2195739746094, "logps/rejected": -590.141357421875, "loss": 0.5006, "rewards/accuracies": 0.703125, "rewards/chosen": -1.784470558166504, "rewards/margins": 1.8250298500061035, "rewards/rejected": -3.6095001697540283, "step": 383 }, { "epoch": 1.8145304193738925, "grad_norm": 4.27781176287506, "learning_rate": 1.0574133510663747e-08, "logits/chosen": -2.4717490673065186, "logits/rejected": -2.580211639404297, "logps/chosen": -404.38250732421875, "logps/rejected": -501.7881774902344, "loss": 0.4858, "rewards/accuracies": 0.75, "rewards/chosen": -1.4478615522384644, "rewards/margins": 1.3333170413970947, "rewards/rejected": -2.7811787128448486, "step": 384 }, { "epoch": 1.8192557590076786, "grad_norm": 4.660154426544148, "learning_rate": 1.0028625481443981e-08, "logits/chosen": -2.6154394149780273, "logits/rejected": -2.6079351902008057, "logps/chosen": -332.8676452636719, "logps/rejected": -475.0517272949219, "loss": 0.4503, "rewards/accuracies": 0.703125, "rewards/chosen": -1.4882500171661377, "rewards/margins": 1.4799097776412964, "rewards/rejected": -2.9681599140167236, "step": 385 }, { "epoch": 1.8239810986414648, "grad_norm": 4.32107981908328, "learning_rate": 9.497275730147774e-09, "logits/chosen": -2.57356333732605, "logits/rejected": -2.566416025161743, "logps/chosen": -400.1445007324219, "logps/rejected": -566.9680786132812, "loss": 0.4524, "rewards/accuracies": 0.796875, "rewards/chosen": -1.724095344543457, "rewards/margins": 1.8485801219940186, "rewards/rejected": -3.5726757049560547, "step": 386 }, { "epoch": 1.8287064382752511, "grad_norm": 4.382882347588404, "learning_rate": 8.980115606337046e-09, "logits/chosen": -2.744180202484131, "logits/rejected": -2.6595263481140137, "logps/chosen": -318.4762268066406, "logps/rejected": -484.2186584472656, "loss": 0.4836, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4589486122131348, "rewards/margins": 1.453284502029419, "rewards/rejected": -2.912233352661133, "step": 387 }, { "epoch": 1.8287064382752511, "eval_logits/chosen": -2.637449264526367, "eval_logits/rejected": -2.6433615684509277, "eval_logps/chosen": -379.1832275390625, "eval_logps/rejected": -518.951416015625, "eval_loss": 0.4717705249786377, "eval_rewards/accuracies": 0.6420454382896423, "eval_rewards/chosen": -1.679525375366211, "eval_rewards/margins": 1.5104897022247314, "eval_rewards/rejected": -3.1900153160095215, "eval_runtime": 226.5578, "eval_samples_per_second": 16.137, "eval_steps_per_second": 0.291, "step": 387 }, { "epoch": 1.8334317779090372, "grad_norm": 4.767778942870907, "learning_rate": 8.47717562238756e-09, "logits/chosen": -2.504225254058838, "logits/rejected": -2.5545616149902344, "logps/chosen": -355.3597717285156, "logps/rejected": -519.2000732421875, "loss": 0.4663, "rewards/accuracies": 0.703125, "rewards/chosen": -1.5064764022827148, "rewards/margins": 1.655312418937683, "rewards/rejected": -3.1617889404296875, "step": 388 }, { "epoch": 1.8381571175428233, "grad_norm": 4.59007333084006, "learning_rate": 7.988485451688815e-09, "logits/chosen": -2.8325204849243164, "logits/rejected": -2.8123779296875, "logps/chosen": -341.4407043457031, "logps/rejected": -501.8046875, "loss": 0.4958, "rewards/accuracies": 0.609375, "rewards/chosen": -1.6452869176864624, "rewards/margins": 1.5238325595855713, "rewards/rejected": -3.1691195964813232, "step": 389 }, { "epoch": 1.8428824571766096, "grad_norm": 4.836517410103638, "learning_rate": 7.514073926893432e-09, "logits/chosen": -2.441648006439209, "logits/rejected": -2.528268814086914, "logps/chosen": -386.49493408203125, "logps/rejected": -454.25616455078125, "loss": 0.5151, "rewards/accuracies": 0.671875, "rewards/chosen": -1.5899585485458374, "rewards/margins": 0.9383600950241089, "rewards/rejected": -2.5283186435699463, "step": 390 }, { "epoch": 1.8476077968103959, "grad_norm": 4.204493754201481, "learning_rate": 7.053969038215674e-09, "logits/chosen": -2.7649574279785156, "logits/rejected": -2.6198995113372803, "logps/chosen": -405.90283203125, "logps/rejected": -581.5423583984375, "loss": 0.4766, "rewards/accuracies": 0.640625, "rewards/chosen": -1.7321686744689941, "rewards/margins": 1.7127902507781982, "rewards/rejected": -3.4449586868286133, "step": 391 }, { "epoch": 1.852333136444182, "grad_norm": 3.805689590990249, "learning_rate": 6.608197931780496e-09, "logits/chosen": -2.5429623126983643, "logits/rejected": -2.4480888843536377, "logps/chosen": -373.41375732421875, "logps/rejected": -588.4783935546875, "loss": 0.4787, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4610140323638916, "rewards/margins": 1.9005025625228882, "rewards/rejected": -3.3615164756774902, "step": 392 }, { "epoch": 1.857058476077968, "grad_norm": 4.640569841501701, "learning_rate": 6.176786908021453e-09, "logits/chosen": -2.661363363265991, "logits/rejected": -2.734570026397705, "logps/chosen": -411.58306884765625, "logps/rejected": -510.5803527832031, "loss": 0.475, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6014535427093506, "rewards/margins": 1.3125067949295044, "rewards/rejected": -2.9139604568481445, "step": 393 }, { "epoch": 1.8617838157117543, "grad_norm": 4.140728148044369, "learning_rate": 5.759761420129322e-09, "logits/chosen": -2.901158332824707, "logits/rejected": -2.942783832550049, "logps/chosen": -331.9306945800781, "logps/rejected": -466.704833984375, "loss": 0.4703, "rewards/accuracies": 0.75, "rewards/chosen": -1.4073690176010132, "rewards/margins": 1.4787429571151733, "rewards/rejected": -2.8861119747161865, "step": 394 }, { "epoch": 1.8665091553455406, "grad_norm": 3.736333802301315, "learning_rate": 5.357146072550278e-09, "logits/chosen": -2.4809322357177734, "logits/rejected": -2.499831438064575, "logps/chosen": -393.06109619140625, "logps/rejected": -467.2709655761719, "loss": 0.4632, "rewards/accuracies": 0.59375, "rewards/chosen": -1.567178726196289, "rewards/margins": 0.7932885885238647, "rewards/rejected": -2.3604674339294434, "step": 395 }, { "epoch": 1.8712344949793267, "grad_norm": 4.263089946197189, "learning_rate": 4.968964619534138e-09, "logits/chosen": -2.6070809364318848, "logits/rejected": -2.5771572589874268, "logps/chosen": -346.75042724609375, "logps/rejected": -519.9202880859375, "loss": 0.443, "rewards/accuracies": 0.78125, "rewards/chosen": -1.525521993637085, "rewards/margins": 1.564245581626892, "rewards/rejected": -3.0897674560546875, "step": 396 }, { "epoch": 1.8759598346131128, "grad_norm": 4.104385303806953, "learning_rate": 4.595239963733011e-09, "logits/chosen": -2.834834575653076, "logits/rejected": -2.8623130321502686, "logps/chosen": -387.9482727050781, "logps/rejected": -544.7717895507812, "loss": 0.427, "rewards/accuracies": 0.65625, "rewards/chosen": -1.8901216983795166, "rewards/margins": 1.8027396202087402, "rewards/rejected": -3.6928610801696777, "step": 397 }, { "epoch": 1.8806851742468988, "grad_norm": 3.9166519315043407, "learning_rate": 4.2359941548499035e-09, "logits/chosen": -2.438992977142334, "logits/rejected": -2.4675240516662598, "logps/chosen": -377.0653076171875, "logps/rejected": -579.7406616210938, "loss": 0.4621, "rewards/accuracies": 0.640625, "rewards/chosen": -1.7994550466537476, "rewards/margins": 1.7710639238357544, "rewards/rejected": -3.570518970489502, "step": 398 }, { "epoch": 1.8854105138806851, "grad_norm": 3.980027730791039, "learning_rate": 3.891248388337847e-09, "logits/chosen": -2.461378574371338, "logits/rejected": -2.3893392086029053, "logps/chosen": -391.1559753417969, "logps/rejected": -487.46112060546875, "loss": 0.4763, "rewards/accuracies": 0.75, "rewards/chosen": -1.5593876838684082, "rewards/margins": 1.065595030784607, "rewards/rejected": -2.6249828338623047, "step": 399 }, { "epoch": 1.8901358535144714, "grad_norm": 4.768587791597909, "learning_rate": 3.5610230041494828e-09, "logits/chosen": -2.3206558227539062, "logits/rejected": -2.3519835472106934, "logps/chosen": -392.8856506347656, "logps/rejected": -517.7825317382812, "loss": 0.4687, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5414066314697266, "rewards/margins": 1.4560898542404175, "rewards/rejected": -2.9974963665008545, "step": 400 }, { "epoch": 1.8948611931482575, "grad_norm": 4.155264459662159, "learning_rate": 3.2453374855367366e-09, "logits/chosen": -2.830385684967041, "logits/rejected": -2.893541097640991, "logps/chosen": -356.2238464355469, "logps/rejected": -443.7274169921875, "loss": 0.4612, "rewards/accuracies": 0.6875, "rewards/chosen": -1.463138461112976, "rewards/margins": 1.1921138763427734, "rewards/rejected": -2.65525221824646, "step": 401 }, { "epoch": 1.8995865327820436, "grad_norm": 4.0526022624158955, "learning_rate": 2.9442104579016356e-09, "logits/chosen": -2.313565731048584, "logits/rejected": -2.4396870136260986, "logps/chosen": -441.73736572265625, "logps/rejected": -458.2696533203125, "loss": 0.4724, "rewards/accuracies": 0.703125, "rewards/chosen": -1.3574057817459106, "rewards/margins": 0.8890299797058105, "rewards/rejected": -2.2464358806610107, "step": 402 }, { "epoch": 1.9043118724158299, "grad_norm": 3.860540669652651, "learning_rate": 2.657659687697156e-09, "logits/chosen": -2.6873679161071777, "logits/rejected": -2.549762725830078, "logps/chosen": -317.7322082519531, "logps/rejected": -429.63787841796875, "loss": 0.4784, "rewards/accuracies": 0.640625, "rewards/chosen": -1.5817471742630005, "rewards/margins": 1.0039292573928833, "rewards/rejected": -2.5856761932373047, "step": 403 }, { "epoch": 1.9090372120496162, "grad_norm": 4.53787076168368, "learning_rate": 2.385702081379143e-09, "logits/chosen": -2.3574860095977783, "logits/rejected": -2.427652359008789, "logps/chosen": -431.2988586425781, "logps/rejected": -567.0403442382812, "loss": 0.4604, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6895103454589844, "rewards/margins": 1.765432357788086, "rewards/rejected": -3.4549427032470703, "step": 404 }, { "epoch": 1.9137625516834023, "grad_norm": 3.735971197718604, "learning_rate": 2.1283536844087513e-09, "logits/chosen": -2.5645933151245117, "logits/rejected": -2.5676231384277344, "logps/chosen": -362.2431335449219, "logps/rejected": -531.4500122070312, "loss": 0.4731, "rewards/accuracies": 0.703125, "rewards/chosen": -1.5010064840316772, "rewards/margins": 1.6250488758087158, "rewards/rejected": -3.1260552406311035, "step": 405 }, { "epoch": 1.9184878913171883, "grad_norm": 3.613782865337248, "learning_rate": 1.885629680305867e-09, "logits/chosen": -2.569301128387451, "logits/rejected": -2.568390130996704, "logps/chosen": -384.6716613769531, "logps/rejected": -521.9403076171875, "loss": 0.4467, "rewards/accuracies": 0.703125, "rewards/chosen": -1.6642777919769287, "rewards/margins": 1.619908332824707, "rewards/rejected": -3.284186363220215, "step": 406 }, { "epoch": 1.9232132309509746, "grad_norm": 4.461343319483535, "learning_rate": 1.6575443897531294e-09, "logits/chosen": -2.5552725791931152, "logits/rejected": -2.436856985092163, "logps/chosen": -367.128662109375, "logps/rejected": -545.8114624023438, "loss": 0.4754, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6492211818695068, "rewards/margins": 1.592346429824829, "rewards/rejected": -3.241567373275757, "step": 407 }, { "epoch": 1.927938570584761, "grad_norm": 4.454271459686257, "learning_rate": 1.4441112697511638e-09, "logits/chosen": -2.6292550563812256, "logits/rejected": -2.6084656715393066, "logps/chosen": -421.9637756347656, "logps/rejected": -506.38897705078125, "loss": 0.484, "rewards/accuracies": 0.75, "rewards/chosen": -1.6983362436294556, "rewards/margins": 1.19191312789917, "rewards/rejected": -2.890249252319336, "step": 408 }, { "epoch": 1.932663910218547, "grad_norm": 4.002197751374577, "learning_rate": 1.2453429128245762e-09, "logits/chosen": -2.5047454833984375, "logits/rejected": -2.451660633087158, "logps/chosen": -431.7708740234375, "logps/rejected": -614.2373657226562, "loss": 0.471, "rewards/accuracies": 0.75, "rewards/chosen": -1.9678634405136108, "rewards/margins": 1.840644359588623, "rewards/rejected": -3.8085079193115234, "step": 409 }, { "epoch": 1.937389249852333, "grad_norm": 4.283657038062509, "learning_rate": 1.061251046278938e-09, "logits/chosen": -2.5947906970977783, "logits/rejected": -2.700439214706421, "logps/chosen": -392.77850341796875, "logps/rejected": -458.53570556640625, "loss": 0.4785, "rewards/accuracies": 0.703125, "rewards/chosen": -1.4087834358215332, "rewards/margins": 1.301395297050476, "rewards/rejected": -2.7101786136627197, "step": 410 }, { "epoch": 1.9421145894861194, "grad_norm": 4.320389688019286, "learning_rate": 8.918465315088941e-10, "logits/chosen": -2.529167413711548, "logits/rejected": -2.595038652420044, "logps/chosen": -436.0292663574219, "logps/rejected": -522.2467041015625, "loss": 0.4691, "rewards/accuracies": 0.65625, "rewards/chosen": -1.7954521179199219, "rewards/margins": 1.5213781595230103, "rewards/rejected": -3.3168301582336426, "step": 411 }, { "epoch": 1.9468399291199054, "grad_norm": 3.917488632599818, "learning_rate": 7.371393633574252e-10, "logits/chosen": -2.508873224258423, "logits/rejected": -2.5457823276519775, "logps/chosen": -384.64923095703125, "logps/rejected": -475.94134521484375, "loss": 0.469, "rewards/accuracies": 0.828125, "rewards/chosen": -1.4391266107559204, "rewards/margins": 1.211023211479187, "rewards/rejected": -2.6501498222351074, "step": 412 }, { "epoch": 1.9515652687536917, "grad_norm": 3.9351124104785584, "learning_rate": 5.971386695260705e-10, "logits/chosen": -2.465684652328491, "logits/rejected": -2.5066022872924805, "logps/chosen": -385.0665283203125, "logps/rejected": -479.13214111328125, "loss": 0.4643, "rewards/accuracies": 0.765625, "rewards/chosen": -1.4245293140411377, "rewards/margins": 1.249526858329773, "rewards/rejected": -2.6740562915802, "step": 413 }, { "epoch": 1.9562906083874778, "grad_norm": 3.704041812352823, "learning_rate": 4.718527100364134e-10, "logits/chosen": -2.6545798778533936, "logits/rejected": -2.6624419689178467, "logps/chosen": -366.58184814453125, "logps/rejected": -510.66436767578125, "loss": 0.4482, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5549781322479248, "rewards/margins": 1.5164021253585815, "rewards/rejected": -3.071380376815796, "step": 414 }, { "epoch": 1.9610159480212639, "grad_norm": 4.499954109459605, "learning_rate": 3.6128887674272133e-10, "logits/chosen": -2.5801494121551514, "logits/rejected": -2.551297187805176, "logps/chosen": -360.39935302734375, "logps/rejected": -539.8038330078125, "loss": 0.4529, "rewards/accuracies": 0.75, "rewards/chosen": -1.6478443145751953, "rewards/margins": 1.7761938571929932, "rewards/rejected": -3.4240384101867676, "step": 415 }, { "epoch": 1.9657412876550502, "grad_norm": 4.539840176257443, "learning_rate": 2.6545369289587836e-10, "logits/chosen": -2.506206750869751, "logits/rejected": -2.729609489440918, "logps/chosen": -423.645751953125, "logps/rejected": -465.3453674316406, "loss": 0.5018, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2937572002410889, "rewards/margins": 1.3675696849822998, "rewards/rejected": -2.6613268852233887, "step": 416 }, { "epoch": 1.9704666272888365, "grad_norm": 4.977216825161169, "learning_rate": 1.843528127584981e-10, "logits/chosen": -2.5777127742767334, "logits/rejected": -2.611386775970459, "logps/chosen": -396.95892333984375, "logps/rejected": -500.09375, "loss": 0.4951, "rewards/accuracies": 0.609375, "rewards/chosen": -1.7503533363342285, "rewards/margins": 1.2793786525726318, "rewards/rejected": -3.0297319889068604, "step": 417 }, { "epoch": 1.9751919669226226, "grad_norm": 4.452268262277202, "learning_rate": 1.17991021271302e-10, "logits/chosen": -2.4709465503692627, "logits/rejected": -2.399839401245117, "logps/chosen": -379.194091796875, "logps/rejected": -520.7940063476562, "loss": 0.4696, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5808072090148926, "rewards/margins": 1.5406217575073242, "rewards/rejected": -3.121428966522217, "step": 418 }, { "epoch": 1.9799173065564086, "grad_norm": 4.060313383389935, "learning_rate": 6.637223377078949e-11, "logits/chosen": -2.822049856185913, "logits/rejected": -2.800950765609741, "logps/chosen": -309.0566711425781, "logps/rejected": -446.6823425292969, "loss": 0.4561, "rewards/accuracies": 0.6875, "rewards/chosen": -1.39582097530365, "rewards/margins": 1.2124607563018799, "rewards/rejected": -2.6082818508148193, "step": 419 }, { "epoch": 1.984642646190195, "grad_norm": 4.291839752698122, "learning_rate": 2.949949575833943e-11, "logits/chosen": -2.625284433364868, "logits/rejected": -2.6156272888183594, "logps/chosen": -342.9141845703125, "logps/rejected": -463.14312744140625, "loss": 0.5061, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5005218982696533, "rewards/margins": 1.2724413871765137, "rewards/rejected": -2.772963047027588, "step": 420 }, { "epoch": 1.9893679858239812, "grad_norm": 4.75166895203396, "learning_rate": 7.374982720326217e-12, "logits/chosen": -2.7700271606445312, "logits/rejected": -2.7563564777374268, "logps/chosen": -397.4788818359375, "logps/rejected": -582.6298828125, "loss": 0.4718, "rewards/accuracies": 0.703125, "rewards/chosen": -1.6806347370147705, "rewards/margins": 1.8375290632247925, "rewards/rejected": -3.5181639194488525, "step": 421 }, { "epoch": 1.9940933254577673, "grad_norm": 4.42823795907204, "learning_rate": 0.0, "logits/chosen": -2.568351984024048, "logits/rejected": -2.728876829147339, "logps/chosen": -369.7898254394531, "logps/rejected": -406.9199523925781, "loss": 0.4865, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3815803527832031, "rewards/margins": 0.7950088977813721, "rewards/rejected": -2.176589250564575, "step": 422 }, { "epoch": 1.9940933254577673, "step": 422, "total_flos": 0.0, "train_loss": 0.5345145690638872, "train_runtime": 33183.7631, "train_samples_per_second": 5.711, "train_steps_per_second": 0.013 } ], "logging_steps": 1, "max_steps": 422, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 43, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }