{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998424948810837, "eval_steps": 100, "global_step": 3174, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.322265625, "learning_rate": 1.5723270440251572e-09, "logits/chosen": -1.3876760005950928, "logits/rejected": -1.4584133625030518, "logps/chosen": -148.11717224121094, "logps/rejected": -197.28189086914062, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.474609375, "learning_rate": 1.5723270440251573e-08, "logits/chosen": -1.2969517707824707, "logits/rejected": -1.0069656372070312, "logps/chosen": -190.4855499267578, "logps/rejected": -182.0135498046875, "loss": 0.6929, "rewards/accuracies": 0.5833333134651184, "rewards/chosen": 0.0011108842445537448, "rewards/margins": 0.001312906388193369, "rewards/margins_max": 0.0032973522320389748, "rewards/margins_min": -0.0006715393392369151, "rewards/margins_std": 0.0028064302168786526, "rewards/rejected": -0.00020202209998387843, "step": 10 }, { "epoch": 0.01, "grad_norm": 0.443359375, "learning_rate": 3.1446540880503146e-08, "logits/chosen": -1.36593496799469, "logits/rejected": -1.0528085231781006, "logps/chosen": -225.4935760498047, "logps/rejected": -200.0979766845703, "loss": 0.6933, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 7.484816160285845e-05, "rewards/margins": -0.0001347160286968574, "rewards/margins_max": 0.0016663169953972101, "rewards/margins_min": -0.0019357489654794335, "rewards/margins_std": 0.0025470454711467028, "rewards/rejected": 0.00020956425578333437, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.431640625, "learning_rate": 4.7169811320754715e-08, "logits/chosen": -1.26302170753479, "logits/rejected": -0.982827365398407, "logps/chosen": -180.48269653320312, "logps/rejected": -184.57960510253906, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": -0.0005473994533531368, "rewards/margins": 0.0005724715883843601, "rewards/margins_max": 0.002282569883391261, "rewards/margins_min": -0.0011376264737918973, "rewards/margins_std": 0.002418444026261568, "rewards/rejected": -0.0011198710417374969, "step": 30 }, { "epoch": 0.01, "grad_norm": 0.33984375, "learning_rate": 6.289308176100629e-08, "logits/chosen": -1.4589287042617798, "logits/rejected": -1.1574287414550781, "logps/chosen": -225.4607696533203, "logps/rejected": -276.73675537109375, "loss": 0.6935, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.00039744950481690466, "rewards/margins": -0.00030673606670461595, "rewards/margins_max": 0.0013146628625690937, "rewards/margins_min": -0.0019281348213553429, "rewards/margins_std": 0.0022930041886866093, "rewards/rejected": 0.0007041855715215206, "step": 40 }, { "epoch": 0.02, "grad_norm": 0.515625, "learning_rate": 7.861635220125786e-08, "logits/chosen": -1.3671009540557861, "logits/rejected": -0.8631851077079773, "logps/chosen": -331.6417236328125, "logps/rejected": -205.7646026611328, "loss": 0.6932, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.0002600564039312303, "rewards/margins": -0.0007569913868792355, "rewards/margins_max": 0.0010670910123735666, "rewards/margins_min": -0.002581073669716716, "rewards/margins_std": 0.002579641994088888, "rewards/rejected": 0.0010170477908104658, "step": 50 }, { "epoch": 0.02, "grad_norm": 0.451171875, "learning_rate": 9.433962264150943e-08, "logits/chosen": -1.1948202848434448, "logits/rejected": -1.0117332935333252, "logps/chosen": -203.6728515625, "logps/rejected": -264.63153076171875, "loss": 0.6933, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.000772724102716893, "rewards/margins": 1.4207902495400049e-05, "rewards/margins_max": 0.0016571771120652556, "rewards/margins_min": -0.001628761412575841, "rewards/margins_std": 0.0023235089611262083, "rewards/rejected": 0.0007585162529721856, "step": 60 }, { "epoch": 0.02, "grad_norm": 0.447265625, "learning_rate": 1.10062893081761e-07, "logits/chosen": -1.4154024124145508, "logits/rejected": -1.0937511920928955, "logps/chosen": -218.91259765625, "logps/rejected": -224.9219207763672, "loss": 0.693, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.00017823689267970622, "rewards/margins": -0.00020709517411887646, "rewards/margins_max": 0.0016052055871114135, "rewards/margins_min": -0.002019395586103201, "rewards/margins_std": 0.00256298016756773, "rewards/rejected": 0.00038533215411007404, "step": 70 }, { "epoch": 0.03, "grad_norm": 0.5078125, "learning_rate": 1.2578616352201258e-07, "logits/chosen": -1.2727240324020386, "logits/rejected": -0.9936261177062988, "logps/chosen": -285.10943603515625, "logps/rejected": -266.4510192871094, "loss": 0.6929, "rewards/accuracies": 0.625, "rewards/chosen": 0.0009560451726429164, "rewards/margins": 0.0006419096025638282, "rewards/margins_max": 0.0029980712570250034, "rewards/margins_min": -0.0017142522847279906, "rewards/margins_std": 0.003332116873934865, "rewards/rejected": 0.0003141355118714273, "step": 80 }, { "epoch": 0.03, "grad_norm": 0.59375, "learning_rate": 1.4150943396226414e-07, "logits/chosen": -1.4589568376541138, "logits/rejected": -1.1692708730697632, "logps/chosen": -212.2246551513672, "logps/rejected": -219.21646118164062, "loss": 0.6927, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0005518359830603004, "rewards/margins": 0.001025562291033566, "rewards/margins_max": 0.0023937453515827656, "rewards/margins_min": -0.0003426209441386163, "rewards/margins_std": 0.001934903091751039, "rewards/rejected": -0.00047372624976560473, "step": 90 }, { "epoch": 0.03, "grad_norm": 0.3359375, "learning_rate": 1.5723270440251572e-07, "logits/chosen": -1.405853033065796, "logits/rejected": -0.9023151397705078, "logps/chosen": -257.5167236328125, "logps/rejected": -205.4027862548828, "loss": 0.6925, "rewards/accuracies": 0.625, "rewards/chosen": 0.0005634050467051566, "rewards/margins": 0.001445975387468934, "rewards/margins_max": 0.0039651584811508656, "rewards/margins_min": -0.001073207939043641, "rewards/margins_std": 0.0035626632161438465, "rewards/rejected": -0.0008825702825561166, "step": 100 }, { "epoch": 0.03, "grad_norm": 0.392578125, "learning_rate": 1.7295597484276728e-07, "logits/chosen": -1.2503092288970947, "logits/rejected": -0.9771049618721008, "logps/chosen": -230.6888427734375, "logps/rejected": -189.9393310546875, "loss": 0.6924, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.000937645963858813, "rewards/margins": 0.001897513517178595, "rewards/margins_max": 0.0035337067674845457, "rewards/margins_min": 0.00026132012135349214, "rewards/margins_std": 0.0023139265831559896, "rewards/rejected": -0.0009598674369044602, "step": 110 }, { "epoch": 0.04, "grad_norm": 0.453125, "learning_rate": 1.8867924528301886e-07, "logits/chosen": -1.413317084312439, "logits/rejected": -1.0483345985412598, "logps/chosen": -195.40811157226562, "logps/rejected": -186.1103515625, "loss": 0.6928, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0013014640426263213, "rewards/margins": 0.0010003356728702784, "rewards/margins_max": 0.0033742673695087433, "rewards/margins_min": -0.0013735961401835084, "rewards/margins_std": 0.003357246518135071, "rewards/rejected": 0.0003011283988598734, "step": 120 }, { "epoch": 0.04, "grad_norm": 0.57421875, "learning_rate": 2.0440251572327044e-07, "logits/chosen": -1.1766637563705444, "logits/rejected": -0.9444602131843567, "logps/chosen": -219.5814666748047, "logps/rejected": -248.6021728515625, "loss": 0.6927, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0013613433111459017, "rewards/margins": 0.0008185977349057794, "rewards/margins_max": 0.0035369223915040493, "rewards/margins_min": -0.0018997270381078124, "rewards/margins_std": 0.0038442914374172688, "rewards/rejected": 0.0005427456344477832, "step": 130 }, { "epoch": 0.04, "grad_norm": 0.416015625, "learning_rate": 2.20125786163522e-07, "logits/chosen": -1.1843626499176025, "logits/rejected": -0.9615445137023926, "logps/chosen": -267.6846923828125, "logps/rejected": -216.41455078125, "loss": 0.6923, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0015651207650080323, "rewards/margins": 0.0013728371122851968, "rewards/margins_max": 0.003893634770065546, "rewards/margins_min": -0.0011479605454951525, "rewards/margins_std": 0.0035649463534355164, "rewards/rejected": 0.00019228360906708986, "step": 140 }, { "epoch": 0.05, "grad_norm": 0.314453125, "learning_rate": 2.3584905660377358e-07, "logits/chosen": -1.3410217761993408, "logits/rejected": -0.8768698573112488, "logps/chosen": -305.0233154296875, "logps/rejected": -234.83407592773438, "loss": 0.692, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0015008506597951055, "rewards/margins": 0.002513662213459611, "rewards/margins_max": 0.005906062666326761, "rewards/margins_min": -0.0008787383558228612, "rewards/margins_std": 0.004797579254955053, "rewards/rejected": -0.0010128116700798273, "step": 150 }, { "epoch": 0.05, "grad_norm": 0.34765625, "learning_rate": 2.5157232704402517e-07, "logits/chosen": -1.3618042469024658, "logits/rejected": -1.1711941957473755, "logps/chosen": -168.73251342773438, "logps/rejected": -229.12173461914062, "loss": 0.6921, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0013464975636452436, "rewards/margins": 0.0017303951317444444, "rewards/margins_max": 0.002993419300764799, "rewards/margins_min": 0.000467371050035581, "rewards/margins_std": 0.0017861860105767846, "rewards/rejected": -0.0003838978009298444, "step": 160 }, { "epoch": 0.05, "grad_norm": 0.326171875, "learning_rate": 2.672955974842767e-07, "logits/chosen": -1.2365471124649048, "logits/rejected": -0.9317380785942078, "logps/chosen": -220.8829345703125, "logps/rejected": -198.69509887695312, "loss": 0.6921, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.0018165509682148695, "rewards/margins": 0.0023886661510914564, "rewards/margins_max": 0.004315841477364302, "rewards/margins_min": 0.0004614906501956284, "rewards/margins_std": 0.0027254377491772175, "rewards/rejected": -0.0005721148918382823, "step": 170 }, { "epoch": 0.06, "grad_norm": 0.5625, "learning_rate": 2.830188679245283e-07, "logits/chosen": -1.3770387172698975, "logits/rejected": -1.0459026098251343, "logps/chosen": -213.62649536132812, "logps/rejected": -216.0526580810547, "loss": 0.6916, "rewards/accuracies": 0.75, "rewards/chosen": 0.0032491025049239397, "rewards/margins": 0.004016959108412266, "rewards/margins_max": 0.006775864399969578, "rewards/margins_min": 0.0012580546317622066, "rewards/margins_std": 0.0039016795344650745, "rewards/rejected": -0.0007678564870730042, "step": 180 }, { "epoch": 0.06, "grad_norm": 0.43359375, "learning_rate": 2.9874213836477983e-07, "logits/chosen": -1.280879259109497, "logits/rejected": -1.0743911266326904, "logps/chosen": -196.5890655517578, "logps/rejected": -215.959228515625, "loss": 0.6916, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.003082216251641512, "rewards/margins": 0.0024519709404557943, "rewards/margins_max": 0.004635250195860863, "rewards/margins_min": 0.0002686919760890305, "rewards/margins_std": 0.003087623044848442, "rewards/rejected": 0.0006302451947703958, "step": 190 }, { "epoch": 0.06, "grad_norm": 0.431640625, "learning_rate": 3.1446540880503144e-07, "logits/chosen": -1.312201738357544, "logits/rejected": -1.1042929887771606, "logps/chosen": -219.0549774169922, "logps/rejected": -207.017822265625, "loss": 0.6917, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.001382762915454805, "rewards/margins": 0.002051582094281912, "rewards/margins_max": 0.0041840835474431515, "rewards/margins_min": -8.09192206361331e-05, "rewards/margins_std": 0.0030158127192407846, "rewards/rejected": -0.0006688194698654115, "step": 200 }, { "epoch": 0.07, "grad_norm": 0.38671875, "learning_rate": 3.30188679245283e-07, "logits/chosen": -1.4287524223327637, "logits/rejected": -1.090522050857544, "logps/chosen": -237.6526336669922, "logps/rejected": -253.42056274414062, "loss": 0.6913, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.003422607434913516, "rewards/margins": 0.0037254388444125652, "rewards/margins_max": 0.007606147322803736, "rewards/margins_min": -0.00015526966308243573, "rewards/margins_std": 0.005488150753080845, "rewards/rejected": -0.00030283164232969284, "step": 210 }, { "epoch": 0.07, "grad_norm": 0.3359375, "learning_rate": 3.4591194968553456e-07, "logits/chosen": -1.3905646800994873, "logits/rejected": -1.1243839263916016, "logps/chosen": -275.39739990234375, "logps/rejected": -198.20419311523438, "loss": 0.6913, "rewards/accuracies": 0.875, "rewards/chosen": 0.0038289937656372786, "rewards/margins": 0.0037508513778448105, "rewards/margins_max": 0.00589752709493041, "rewards/margins_min": 0.0016041755443438888, "rewards/margins_std": 0.0030358582735061646, "rewards/rejected": 7.814211130607873e-05, "step": 220 }, { "epoch": 0.07, "grad_norm": 0.482421875, "learning_rate": 3.616352201257861e-07, "logits/chosen": -1.4099429845809937, "logits/rejected": -1.1149197816848755, "logps/chosen": -253.6432647705078, "logps/rejected": -201.7845458984375, "loss": 0.6906, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.004002997186034918, "rewards/margins": 0.004339634440839291, "rewards/margins_max": 0.0070360577665269375, "rewards/margins_min": 0.0016432113479822874, "rewards/margins_std": 0.003813318442553282, "rewards/rejected": -0.00033663742942735553, "step": 230 }, { "epoch": 0.08, "grad_norm": 0.482421875, "learning_rate": 3.773584905660377e-07, "logits/chosen": -1.3923314809799194, "logits/rejected": -1.2636398077011108, "logps/chosen": -176.70986938476562, "logps/rejected": -260.1700134277344, "loss": 0.6905, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.003964459989219904, "rewards/margins": 0.0052338032983243465, "rewards/margins_max": 0.008278938010334969, "rewards/margins_min": 0.0021886671893298626, "rewards/margins_std": 0.004306471906602383, "rewards/rejected": -0.0012693424941971898, "step": 240 }, { "epoch": 0.08, "grad_norm": 0.392578125, "learning_rate": 3.9308176100628933e-07, "logits/chosen": -1.4243371486663818, "logits/rejected": -1.1771718263626099, "logps/chosen": -265.36944580078125, "logps/rejected": -217.3080291748047, "loss": 0.6903, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.004339687060564756, "rewards/margins": 0.005566070321947336, "rewards/margins_max": 0.009654941037297249, "rewards/margins_min": 0.00147719937376678, "rewards/margins_std": 0.00578253623098135, "rewards/rejected": -0.0012263832613825798, "step": 250 }, { "epoch": 0.08, "grad_norm": 0.412109375, "learning_rate": 4.088050314465409e-07, "logits/chosen": -1.2650946378707886, "logits/rejected": -0.7829256057739258, "logps/chosen": -283.19415283203125, "logps/rejected": -258.4779052734375, "loss": 0.6898, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.006826590746641159, "rewards/margins": 0.0073587894439697266, "rewards/margins_max": 0.012493086978793144, "rewards/margins_min": 0.002224491210654378, "rewards/margins_std": 0.007260994054377079, "rewards/rejected": -0.0005321979406289756, "step": 260 }, { "epoch": 0.09, "grad_norm": 0.51953125, "learning_rate": 4.2452830188679244e-07, "logits/chosen": -1.379631757736206, "logits/rejected": -0.836907684803009, "logps/chosen": -260.239501953125, "logps/rejected": -241.7003631591797, "loss": 0.6892, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.006983810570091009, "rewards/margins": 0.009598185308277607, "rewards/margins_max": 0.013700554147362709, "rewards/margins_min": 0.00549581553786993, "rewards/margins_std": 0.005801626015454531, "rewards/rejected": -0.002614373806864023, "step": 270 }, { "epoch": 0.09, "grad_norm": 0.376953125, "learning_rate": 4.40251572327044e-07, "logits/chosen": -1.538783311843872, "logits/rejected": -1.2011783123016357, "logps/chosen": -195.48477172851562, "logps/rejected": -190.34756469726562, "loss": 0.6903, "rewards/accuracies": 0.875, "rewards/chosen": 0.004987453110516071, "rewards/margins": 0.005685538984835148, "rewards/margins_max": 0.008754138834774494, "rewards/margins_min": 0.002616937505081296, "rewards/margins_std": 0.004339656792581081, "rewards/rejected": -0.0006980849429965019, "step": 280 }, { "epoch": 0.09, "grad_norm": 0.30859375, "learning_rate": 4.559748427672956e-07, "logits/chosen": -1.453611135482788, "logits/rejected": -1.022805094718933, "logps/chosen": -242.57275390625, "logps/rejected": -207.9861602783203, "loss": 0.6885, "rewards/accuracies": 0.875, "rewards/chosen": 0.007862430065870285, "rewards/margins": 0.010466397739946842, "rewards/margins_max": 0.015998583287000656, "rewards/margins_min": 0.00493421358987689, "rewards/margins_std": 0.007823689840734005, "rewards/rejected": -0.0026039674412459135, "step": 290 }, { "epoch": 0.09, "grad_norm": 0.37109375, "learning_rate": 4.7169811320754717e-07, "logits/chosen": -1.6172186136245728, "logits/rejected": -1.1852935552597046, "logps/chosen": -227.5122528076172, "logps/rejected": -211.18136596679688, "loss": 0.6883, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.01041549351066351, "rewards/margins": 0.009889104403555393, "rewards/margins_max": 0.014782111160457134, "rewards/margins_min": 0.004996097646653652, "rewards/margins_std": 0.006919757463037968, "rewards/rejected": 0.0005263882922008634, "step": 300 }, { "epoch": 0.1, "grad_norm": 0.609375, "learning_rate": 4.874213836477988e-07, "logits/chosen": -1.3813108205795288, "logits/rejected": -1.1312620639801025, "logps/chosen": -199.9052276611328, "logps/rejected": -216.5789337158203, "loss": 0.6888, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.009382456541061401, "rewards/margins": 0.008607019670307636, "rewards/margins_max": 0.013246886432170868, "rewards/margins_min": 0.003967151511460543, "rewards/margins_std": 0.0065617635846138, "rewards/rejected": 0.0007754383259452879, "step": 310 }, { "epoch": 0.1, "grad_norm": 0.3984375, "learning_rate": 4.999993950030735e-07, "logits/chosen": -1.3889760971069336, "logits/rejected": -1.0410958528518677, "logps/chosen": -250.23452758789062, "logps/rejected": -237.1486053466797, "loss": 0.6888, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.007147905416786671, "rewards/margins": 0.008209905587136745, "rewards/margins_max": 0.013320088386535645, "rewards/margins_min": 0.003099723719060421, "rewards/margins_std": 0.007226888090372086, "rewards/rejected": -0.0010620001703500748, "step": 320 }, { "epoch": 0.1, "grad_norm": 0.416015625, "learning_rate": 4.999782204181026e-07, "logits/chosen": -1.4792320728302002, "logits/rejected": -0.9951168298721313, "logps/chosen": -240.000732421875, "logps/rejected": -256.5224304199219, "loss": 0.6873, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.011202135123312473, "rewards/margins": 0.014701342210173607, "rewards/margins_max": 0.022865889593958855, "rewards/margins_min": 0.006536795757710934, "rewards/margins_std": 0.011546412482857704, "rewards/rejected": -0.003499208018183708, "step": 330 }, { "epoch": 0.11, "grad_norm": 0.4453125, "learning_rate": 4.999267989149139e-07, "logits/chosen": -1.3123283386230469, "logits/rejected": -0.9737062454223633, "logps/chosen": -181.56187438964844, "logps/rejected": -181.15927124023438, "loss": 0.6872, "rewards/accuracies": 0.875, "rewards/chosen": 0.00947630312293768, "rewards/margins": 0.010779361240565777, "rewards/margins_max": 0.014272956177592278, "rewards/margins_min": 0.007285767234861851, "rewards/margins_std": 0.004940689541399479, "rewards/rejected": -0.0013030586997047067, "step": 340 }, { "epoch": 0.11, "grad_norm": 0.48828125, "learning_rate": 4.998451367154173e-07, "logits/chosen": -1.3401153087615967, "logits/rejected": -0.9146574139595032, "logps/chosen": -263.080810546875, "logps/rejected": -242.10769653320312, "loss": 0.6867, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.010717710480093956, "rewards/margins": 0.013646000996232033, "rewards/margins_max": 0.021306831389665604, "rewards/margins_min": 0.005985168274492025, "rewards/margins_std": 0.010834051296114922, "rewards/rejected": -0.0029282893519848585, "step": 350 }, { "epoch": 0.11, "grad_norm": 0.490234375, "learning_rate": 4.997332437005931e-07, "logits/chosen": -1.6230709552764893, "logits/rejected": -1.197361946105957, "logps/chosen": -276.94305419921875, "logps/rejected": -222.4744110107422, "loss": 0.6867, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.01071252953261137, "rewards/margins": 0.014663497917354107, "rewards/margins_max": 0.023539308458566666, "rewards/margins_min": 0.005787692964076996, "rewards/margins_std": 0.012552286498248577, "rewards/rejected": -0.003950969781726599, "step": 360 }, { "epoch": 0.12, "grad_norm": 0.404296875, "learning_rate": 4.995911334092962e-07, "logits/chosen": -1.4035460948944092, "logits/rejected": -1.0208442211151123, "logps/chosen": -231.21987915039062, "logps/rejected": -171.5296630859375, "loss": 0.6846, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.012600275687873363, "rewards/margins": 0.01731901802122593, "rewards/margins_max": 0.023710820823907852, "rewards/margins_min": 0.010927212424576283, "rewards/margins_std": 0.0090393777936697, "rewards/rejected": -0.0047187404707074165, "step": 370 }, { "epoch": 0.12, "grad_norm": 0.345703125, "learning_rate": 4.994188230366183e-07, "logits/chosen": -1.3101140260696411, "logits/rejected": -1.0723780393600464, "logps/chosen": -228.2586669921875, "logps/rejected": -181.29495239257812, "loss": 0.6858, "rewards/accuracies": 0.875, "rewards/chosen": 0.01025369018316269, "rewards/margins": 0.014498481526970863, "rewards/margins_max": 0.023106779903173447, "rewards/margins_min": 0.005890182219445705, "rewards/margins_std": 0.012173972092568874, "rewards/rejected": -0.004244790878146887, "step": 380 }, { "epoch": 0.12, "grad_norm": 0.380859375, "learning_rate": 4.992163334318065e-07, "logits/chosen": -1.295276165008545, "logits/rejected": -0.8806821703910828, "logps/chosen": -266.9888916015625, "logps/rejected": -213.50936889648438, "loss": 0.6845, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.012072061188519001, "rewards/margins": 0.016327695921063423, "rewards/margins_max": 0.02478429302573204, "rewards/margins_min": 0.007871100679039955, "rewards/margins_std": 0.011959430761635303, "rewards/rejected": -0.004255634266883135, "step": 390 }, { "epoch": 0.13, "grad_norm": 0.384765625, "learning_rate": 4.989836890957414e-07, "logits/chosen": -1.3160616159439087, "logits/rejected": -0.985907256603241, "logps/chosen": -209.94921875, "logps/rejected": -199.602294921875, "loss": 0.6861, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.011544553562998772, "rewards/margins": 0.013081875629723072, "rewards/margins_max": 0.020108871161937714, "rewards/margins_min": 0.00605488196015358, "rewards/margins_std": 0.009937671013176441, "rewards/rejected": -0.0015373228816315532, "step": 400 }, { "epoch": 0.13, "grad_norm": 0.55859375, "learning_rate": 4.987209181779722e-07, "logits/chosen": -1.4713616371154785, "logits/rejected": -1.2096041440963745, "logps/chosen": -192.724853515625, "logps/rejected": -176.57815551757812, "loss": 0.6856, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.012110630050301552, "rewards/margins": 0.013713735155761242, "rewards/margins_max": 0.019766617566347122, "rewards/margins_min": 0.007660853676497936, "rewards/margins_std": 0.008560067042708397, "rewards/rejected": -0.0016031056875362992, "step": 410 }, { "epoch": 0.13, "grad_norm": 0.390625, "learning_rate": 4.984280524733107e-07, "logits/chosen": -1.367755651473999, "logits/rejected": -0.9895979762077332, "logps/chosen": -256.28277587890625, "logps/rejected": -244.4967041015625, "loss": 0.6819, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.02168741635978222, "rewards/margins": 0.0215130727738142, "rewards/margins_max": 0.03101455047726631, "rewards/margins_min": 0.012011596001684666, "rewards/margins_std": 0.013437116518616676, "rewards/rejected": 0.00017434502660762519, "step": 420 }, { "epoch": 0.14, "grad_norm": 0.443359375, "learning_rate": 4.98105127417984e-07, "logits/chosen": -1.3296594619750977, "logits/rejected": -1.030011773109436, "logps/chosen": -259.4856872558594, "logps/rejected": -251.21728515625, "loss": 0.6827, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.019032226875424385, "rewards/margins": 0.019077284261584282, "rewards/margins_max": 0.032611675560474396, "rewards/margins_min": 0.005542895756661892, "rewards/margins_std": 0.019140515476465225, "rewards/rejected": -4.505945253185928e-05, "step": 430 }, { "epoch": 0.14, "grad_norm": 0.462890625, "learning_rate": 4.97752182085347e-07, "logits/chosen": -1.513671636581421, "logits/rejected": -0.9878429174423218, "logps/chosen": -207.31887817382812, "logps/rejected": -203.1181640625, "loss": 0.6832, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.02094658836722374, "rewards/margins": 0.020975876599550247, "rewards/margins_max": 0.0333896279335022, "rewards/margins_min": 0.008562122471630573, "rewards/margins_std": 0.01755569875240326, "rewards/rejected": -2.9285531127243303e-05, "step": 440 }, { "epoch": 0.14, "grad_norm": 0.490234375, "learning_rate": 4.973692591811548e-07, "logits/chosen": -1.3032740354537964, "logits/rejected": -1.1484416723251343, "logps/chosen": -212.3832550048828, "logps/rejected": -230.93881225585938, "loss": 0.684, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.01571028307080269, "rewards/margins": 0.018571963533759117, "rewards/margins_max": 0.028482910245656967, "rewards/margins_min": 0.008661014959216118, "rewards/margins_std": 0.014016198925673962, "rewards/rejected": -0.002861680928617716, "step": 450 }, { "epoch": 0.14, "grad_norm": 0.353515625, "learning_rate": 4.96956405038395e-07, "logits/chosen": -1.2598702907562256, "logits/rejected": -0.9527764320373535, "logps/chosen": -172.1599578857422, "logps/rejected": -211.24148559570312, "loss": 0.6821, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.015954652801156044, "rewards/margins": 0.020714178681373596, "rewards/margins_max": 0.03062686324119568, "rewards/margins_min": 0.010801494121551514, "rewards/margins_std": 0.014018652029335499, "rewards/rejected": -0.004759527277201414, "step": 460 }, { "epoch": 0.15, "grad_norm": 0.40234375, "learning_rate": 4.965136696116812e-07, "logits/chosen": -1.3497663736343384, "logits/rejected": -1.029840111732483, "logps/chosen": -213.888916015625, "logps/rejected": -260.24090576171875, "loss": 0.6804, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.020394446328282356, "rewards/margins": 0.026438722386956215, "rewards/margins_max": 0.037151582539081573, "rewards/margins_min": 0.015725860372185707, "rewards/margins_std": 0.015150276012718678, "rewards/rejected": -0.006044276989996433, "step": 470 }, { "epoch": 0.15, "grad_norm": 0.515625, "learning_rate": 4.960411064712094e-07, "logits/chosen": -1.3540217876434326, "logits/rejected": -1.0137008428573608, "logps/chosen": -183.4146270751953, "logps/rejected": -218.0957489013672, "loss": 0.6826, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0185568667948246, "rewards/margins": 0.021527493372559547, "rewards/margins_max": 0.031103383749723434, "rewards/margins_min": 0.011951602064073086, "rewards/margins_std": 0.013542355969548225, "rewards/rejected": -0.002970626810565591, "step": 480 }, { "epoch": 0.15, "grad_norm": 0.431640625, "learning_rate": 4.955387727962759e-07, "logits/chosen": -1.469268798828125, "logits/rejected": -1.1933975219726562, "logps/chosen": -175.39476013183594, "logps/rejected": -170.35171508789062, "loss": 0.6835, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.016227375715970993, "rewards/margins": 0.018915237858891487, "rewards/margins_max": 0.028273263946175575, "rewards/margins_min": 0.009557214565575123, "rewards/margins_std": 0.013234244659543037, "rewards/rejected": -0.0026878633070737123, "step": 490 }, { "epoch": 0.16, "grad_norm": 0.39453125, "learning_rate": 4.95006729368358e-07, "logits/chosen": -1.591016411781311, "logits/rejected": -1.1849809885025024, "logps/chosen": -215.30050659179688, "logps/rejected": -204.9720458984375, "loss": 0.6813, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.017296748235821724, "rewards/margins": 0.025146162137389183, "rewards/margins_max": 0.03591996058821678, "rewards/margins_min": 0.014372363686561584, "rewards/margins_std": 0.015236446633934975, "rewards/rejected": -0.007849409244954586, "step": 500 }, { "epoch": 0.16, "grad_norm": 0.36328125, "learning_rate": 4.944450405637601e-07, "logits/chosen": -1.3407318592071533, "logits/rejected": -1.0564701557159424, "logps/chosen": -208.2605438232422, "logps/rejected": -194.71420288085938, "loss": 0.681, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.014876808039844036, "rewards/margins": 0.02353464625775814, "rewards/margins_max": 0.03144057095050812, "rewards/margins_min": 0.015628723427653313, "rewards/margins_std": 0.011180664412677288, "rewards/rejected": -0.008657841011881828, "step": 510 }, { "epoch": 0.16, "grad_norm": 0.34375, "learning_rate": 4.938537743458248e-07, "logits/chosen": -1.3480737209320068, "logits/rejected": -1.028096318244934, "logps/chosen": -179.07174682617188, "logps/rejected": -181.2510223388672, "loss": 0.684, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.019465144723653793, "rewards/margins": 0.02149110659956932, "rewards/margins_max": 0.032975539565086365, "rewards/margins_min": 0.010006672702729702, "rewards/margins_std": 0.016241444274783134, "rewards/rejected": -0.0020259625744074583, "step": 520 }, { "epoch": 0.17, "grad_norm": 0.46484375, "learning_rate": 4.932330022567081e-07, "logits/chosen": -1.3110549449920654, "logits/rejected": -1.0896965265274048, "logps/chosen": -212.4080810546875, "logps/rejected": -204.01026916503906, "loss": 0.6808, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.018044626340270042, "rewards/margins": 0.027044925838708878, "rewards/margins_max": 0.03969361633062363, "rewards/margins_min": 0.01439622975885868, "rewards/margins_std": 0.017887955531477928, "rewards/rejected": -0.009000294841825962, "step": 530 }, { "epoch": 0.17, "grad_norm": 0.4453125, "learning_rate": 4.925827994087244e-07, "logits/chosen": -1.467350721359253, "logits/rejected": -0.9765011072158813, "logps/chosen": -196.60804748535156, "logps/rejected": -212.77114868164062, "loss": 0.6806, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.022185953333973885, "rewards/margins": 0.024724114686250687, "rewards/margins_max": 0.03783790022134781, "rewards/margins_min": 0.011610329151153564, "rewards/margins_std": 0.018545694649219513, "rewards/rejected": -0.00253815995529294, "step": 540 }, { "epoch": 0.17, "grad_norm": 0.41015625, "learning_rate": 4.91903244475257e-07, "logits/chosen": -1.4453445672988892, "logits/rejected": -1.2255735397338867, "logps/chosen": -233.5497589111328, "logps/rejected": -203.3679962158203, "loss": 0.6799, "rewards/accuracies": 0.875, "rewards/chosen": 0.022691726684570312, "rewards/margins": 0.027249213308095932, "rewards/margins_max": 0.042755015194416046, "rewards/margins_min": 0.011743416078388691, "rewards/margins_std": 0.021928513422608376, "rewards/rejected": -0.004557489417493343, "step": 550 }, { "epoch": 0.18, "grad_norm": 0.326171875, "learning_rate": 4.91194419681239e-07, "logits/chosen": -1.4020469188690186, "logits/rejected": -1.0889606475830078, "logps/chosen": -201.20901489257812, "logps/rejected": -204.1788330078125, "loss": 0.681, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.024152381345629692, "rewards/margins": 0.02786511741578579, "rewards/margins_max": 0.042585860937833786, "rewards/margins_min": 0.013144371099770069, "rewards/margins_std": 0.020818280056118965, "rewards/rejected": -0.00371273560449481, "step": 560 }, { "epoch": 0.18, "grad_norm": 0.462890625, "learning_rate": 4.904564107932048e-07, "logits/chosen": -1.2641432285308838, "logits/rejected": -0.897659182548523, "logps/chosen": -271.8118896484375, "logps/rejected": -239.61300659179688, "loss": 0.6815, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.017158251255750656, "rewards/margins": 0.024462290108203888, "rewards/margins_max": 0.03610853850841522, "rewards/margins_min": 0.012816043570637703, "rewards/margins_std": 0.016470283269882202, "rewards/rejected": -0.007304038852453232, "step": 570 }, { "epoch": 0.18, "grad_norm": 0.4296875, "learning_rate": 4.896893071089115e-07, "logits/chosen": -1.3425147533416748, "logits/rejected": -1.0659515857696533, "logps/chosen": -230.05111694335938, "logps/rejected": -245.98550415039062, "loss": 0.6748, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.02490960620343685, "rewards/margins": 0.0353575199842453, "rewards/margins_max": 0.05103808641433716, "rewards/margins_min": 0.019676949828863144, "rewards/margins_std": 0.022175675258040428, "rewards/rejected": -0.010447912849485874, "step": 580 }, { "epoch": 0.19, "grad_norm": 0.40625, "learning_rate": 4.888932014465352e-07, "logits/chosen": -1.313063383102417, "logits/rejected": -0.9944307208061218, "logps/chosen": -208.10879516601562, "logps/rejected": -223.5287322998047, "loss": 0.679, "rewards/accuracies": 1.0, "rewards/chosen": 0.022358160465955734, "rewards/margins": 0.03080761432647705, "rewards/margins_max": 0.04008474573493004, "rewards/margins_min": 0.021530481055378914, "rewards/margins_std": 0.013119848445057869, "rewards/rejected": -0.008449452929198742, "step": 590 }, { "epoch": 0.19, "grad_norm": 0.5, "learning_rate": 4.88068190133439e-07, "logits/chosen": -1.4195083379745483, "logits/rejected": -1.1984379291534424, "logps/chosen": -293.54132080078125, "logps/rejected": -259.38360595703125, "loss": 0.6771, "rewards/accuracies": 1.0, "rewards/chosen": 0.023516178131103516, "rewards/margins": 0.03416413068771362, "rewards/margins_max": 0.045261941850185394, "rewards/margins_min": 0.023066317662596703, "rewards/margins_std": 0.01569467782974243, "rewards/rejected": -0.010647954419255257, "step": 600 }, { "epoch": 0.19, "grad_norm": 0.40625, "learning_rate": 4.872143729945184e-07, "logits/chosen": -1.2229716777801514, "logits/rejected": -0.8150213360786438, "logps/chosen": -218.43276977539062, "logps/rejected": -191.75827026367188, "loss": 0.6784, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.022476380690932274, "rewards/margins": 0.03276657313108444, "rewards/margins_max": 0.046399351209402084, "rewards/margins_min": 0.019133802503347397, "rewards/margins_std": 0.019279656931757927, "rewards/rejected": -0.010290195234119892, "step": 610 }, { "epoch": 0.2, "grad_norm": 0.466796875, "learning_rate": 4.863318533401223e-07, "logits/chosen": -1.3115172386169434, "logits/rejected": -0.8752225041389465, "logps/chosen": -246.44140625, "logps/rejected": -270.7298278808594, "loss": 0.6758, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.02260987088084221, "rewards/margins": 0.034483883529901505, "rewards/margins_max": 0.053231727331876755, "rewards/margins_min": 0.015736039727926254, "rewards/margins_std": 0.02651345357298851, "rewards/rejected": -0.01187401358038187, "step": 620 }, { "epoch": 0.2, "grad_norm": 0.326171875, "learning_rate": 4.854207379535528e-07, "logits/chosen": -1.4319788217544556, "logits/rejected": -1.0323983430862427, "logps/chosen": -254.2920684814453, "logps/rejected": -237.34378051757812, "loss": 0.6786, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.017374712973833084, "rewards/margins": 0.029332011938095093, "rewards/margins_max": 0.04456937313079834, "rewards/margins_min": 0.014094656333327293, "rewards/margins_std": 0.021548878401517868, "rewards/rejected": -0.011957301758229733, "step": 630 }, { "epoch": 0.2, "grad_norm": 0.404296875, "learning_rate": 4.844811370781446e-07, "logits/chosen": -1.4312872886657715, "logits/rejected": -0.9997726678848267, "logps/chosen": -244.96224975585938, "logps/rejected": -225.2017059326172, "loss": 0.6778, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.030679643154144287, "rewards/margins": 0.03587724640965462, "rewards/margins_max": 0.05194888263940811, "rewards/margins_min": 0.01980561390519142, "rewards/margins_std": 0.022728722542524338, "rewards/rejected": -0.00519760325551033, "step": 640 }, { "epoch": 0.2, "grad_norm": 0.4453125, "learning_rate": 4.835131644039251e-07, "logits/chosen": -1.4758861064910889, "logits/rejected": -0.9546338319778442, "logps/chosen": -339.2093505859375, "logps/rejected": -224.3199920654297, "loss": 0.6737, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.02631019614636898, "rewards/margins": 0.04230981320142746, "rewards/margins_max": 0.05546834319829941, "rewards/margins_min": 0.02915129065513611, "rewards/margins_std": 0.01860896497964859, "rewards/rejected": -0.015999620780348778, "step": 650 }, { "epoch": 0.21, "grad_norm": 0.330078125, "learning_rate": 4.825169370538594e-07, "logits/chosen": -1.2813438177108765, "logits/rejected": -1.069059133529663, "logps/chosen": -231.5976104736328, "logps/rejected": -245.3614501953125, "loss": 0.6778, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.02429072931408882, "rewards/margins": 0.027610447257757187, "rewards/margins_max": 0.0455966591835022, "rewards/margins_min": 0.009624237194657326, "rewards/margins_std": 0.025436347350478172, "rewards/rejected": -0.003319723065942526, "step": 660 }, { "epoch": 0.21, "grad_norm": 0.46484375, "learning_rate": 4.814925755696778e-07, "logits/chosen": -1.4551244974136353, "logits/rejected": -0.9832841157913208, "logps/chosen": -288.0592346191406, "logps/rejected": -252.08364868164062, "loss": 0.6752, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.028888309374451637, "rewards/margins": 0.03665446117520332, "rewards/margins_max": 0.05256615951657295, "rewards/margins_min": 0.020742762833833694, "rewards/margins_std": 0.022502535954117775, "rewards/rejected": -0.007766152266412973, "step": 670 }, { "epoch": 0.21, "grad_norm": 0.546875, "learning_rate": 4.804402038972899e-07, "logits/chosen": -1.4220234155654907, "logits/rejected": -1.02151358127594, "logps/chosen": -271.8201599121094, "logps/rejected": -273.0591735839844, "loss": 0.6743, "rewards/accuracies": 0.875, "rewards/chosen": 0.028314124792814255, "rewards/margins": 0.03461919724941254, "rewards/margins_max": 0.0456906296312809, "rewards/margins_min": 0.02354777231812477, "rewards/margins_std": 0.015657365322113037, "rewards/rejected": -0.006305074784904718, "step": 680 }, { "epoch": 0.22, "grad_norm": 0.279296875, "learning_rate": 4.79359949371789e-07, "logits/chosen": -1.3343526124954224, "logits/rejected": -0.9362949132919312, "logps/chosen": -257.0128173828125, "logps/rejected": -225.06753540039062, "loss": 0.6765, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.025834929198026657, "rewards/margins": 0.035724394023418427, "rewards/margins_max": 0.04703225940465927, "rewards/margins_min": 0.024416524916887283, "rewards/margins_std": 0.015991736203432083, "rewards/rejected": -0.009889459237456322, "step": 690 }, { "epoch": 0.22, "grad_norm": 0.41796875, "learning_rate": 4.782519427020432e-07, "logits/chosen": -1.3143739700317383, "logits/rejected": -0.9841324687004089, "logps/chosen": -204.9932098388672, "logps/rejected": -218.54141235351562, "loss": 0.6756, "rewards/accuracies": 0.875, "rewards/chosen": 0.016944795846939087, "rewards/margins": 0.0319121815264225, "rewards/margins_max": 0.04973548650741577, "rewards/margins_min": 0.014088879339396954, "rewards/margins_std": 0.025205958634614944, "rewards/rejected": -0.014967384748160839, "step": 700 }, { "epoch": 0.22, "grad_norm": 0.33203125, "learning_rate": 4.771163179548808e-07, "logits/chosen": -1.3899494409561157, "logits/rejected": -0.9665300250053406, "logps/chosen": -333.5497131347656, "logps/rejected": -231.9584503173828, "loss": 0.6699, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.027130257338285446, "rewards/margins": 0.04676414281129837, "rewards/margins_max": 0.06533849239349365, "rewards/margins_min": 0.028189798817038536, "rewards/margins_std": 0.026268085464835167, "rewards/rejected": -0.019633881747722626, "step": 710 }, { "epoch": 0.23, "grad_norm": 0.43359375, "learning_rate": 4.75953212538868e-07, "logits/chosen": -1.2207629680633545, "logits/rejected": -0.8575620651245117, "logps/chosen": -263.4650573730469, "logps/rejected": -233.98886108398438, "loss": 0.6717, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03418269008398056, "rewards/margins": 0.05005268007516861, "rewards/margins_max": 0.0767994374036789, "rewards/margins_min": 0.02330590970814228, "rewards/margins_std": 0.03782564401626587, "rewards/rejected": -0.0158699844032526, "step": 720 }, { "epoch": 0.23, "grad_norm": 0.404296875, "learning_rate": 4.7476276718768284e-07, "logits/chosen": -1.434762716293335, "logits/rejected": -1.0808919668197632, "logps/chosen": -206.06533813476562, "logps/rejected": -218.6961669921875, "loss": 0.6751, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.026135995984077454, "rewards/margins": 0.03937443345785141, "rewards/margins_max": 0.05718846991658211, "rewards/margins_min": 0.021560396999120712, "rewards/margins_std": 0.025192851200699806, "rewards/rejected": -0.01323844026774168, "step": 730 }, { "epoch": 0.23, "grad_norm": 0.41015625, "learning_rate": 4.7354512594308654e-07, "logits/chosen": -1.3616220951080322, "logits/rejected": -1.1791460514068604, "logps/chosen": -193.25332641601562, "logps/rejected": -197.8474578857422, "loss": 0.6753, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.024115614593029022, "rewards/margins": 0.03462132811546326, "rewards/margins_max": 0.052414439618587494, "rewards/margins_min": 0.016828209161758423, "rewards/margins_std": 0.02516326680779457, "rewards/rejected": -0.010505708865821362, "step": 740 }, { "epoch": 0.24, "grad_norm": 0.431640625, "learning_rate": 4.7230043613749527e-07, "logits/chosen": -1.3196706771850586, "logits/rejected": -1.0803533792495728, "logps/chosen": -229.3977813720703, "logps/rejected": -198.8769989013672, "loss": 0.6758, "rewards/accuracies": 0.875, "rewards/chosen": 0.023436803370714188, "rewards/margins": 0.03306025639176369, "rewards/margins_max": 0.05252969264984131, "rewards/margins_min": 0.013590824790298939, "rewards/margins_std": 0.027533939108252525, "rewards/rejected": -0.009623454883694649, "step": 750 }, { "epoch": 0.24, "grad_norm": 0.373046875, "learning_rate": 4.710288483761524e-07, "logits/chosen": -1.1608425378799438, "logits/rejected": -0.8409261703491211, "logps/chosen": -238.56130981445312, "logps/rejected": -212.3688507080078, "loss": 0.677, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.01990620419383049, "rewards/margins": 0.038102246820926666, "rewards/margins_max": 0.053315240889787674, "rewards/margins_min": 0.022889258340001106, "rewards/margins_std": 0.021514419466257095, "rewards/rejected": -0.018196044489741325, "step": 760 }, { "epoch": 0.24, "grad_norm": 0.38671875, "learning_rate": 4.697305165189062e-07, "logits/chosen": -1.4269897937774658, "logits/rejected": -1.0499980449676514, "logps/chosen": -230.6156768798828, "logps/rejected": -229.01708984375, "loss": 0.6741, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.028978174552321434, "rewards/margins": 0.03824831172823906, "rewards/margins_max": 0.06103180721402168, "rewards/margins_min": 0.015464827418327332, "rewards/margins_std": 0.03222071751952171, "rewards/rejected": -0.009270140901207924, "step": 770 }, { "epoch": 0.25, "grad_norm": 0.546875, "learning_rate": 4.6840559766159235e-07, "logits/chosen": -1.3930243253707886, "logits/rejected": -0.9540492296218872, "logps/chosen": -225.96084594726562, "logps/rejected": -237.0554962158203, "loss": 0.6711, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03488198295235634, "rewards/margins": 0.05557037144899368, "rewards/margins_max": 0.07404030859470367, "rewards/margins_min": 0.0371004194021225, "rewards/margins_std": 0.026120448485016823, "rewards/rejected": -0.02068837732076645, "step": 780 }, { "epoch": 0.25, "grad_norm": 0.5078125, "learning_rate": 4.6705425211702656e-07, "logits/chosen": -1.4000756740570068, "logits/rejected": -1.1083465814590454, "logps/chosen": -172.87281799316406, "logps/rejected": -189.81704711914062, "loss": 0.6745, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.023823823779821396, "rewards/margins": 0.0378737710416317, "rewards/margins_max": 0.05512396618723869, "rewards/margins_min": 0.020623570308089256, "rewards/margins_std": 0.02439546398818493, "rewards/rejected": -0.014049944467842579, "step": 790 }, { "epoch": 0.25, "grad_norm": 0.431640625, "learning_rate": 4.656766433956062e-07, "logits/chosen": -1.3979090452194214, "logits/rejected": -0.8946587443351746, "logps/chosen": -248.97512817382812, "logps/rejected": -218.88919067382812, "loss": 0.6695, "rewards/accuracies": 1.0, "rewards/chosen": 0.03325175121426582, "rewards/margins": 0.05383073538541794, "rewards/margins_max": 0.07829690724611282, "rewards/margins_min": 0.02936457097530365, "rewards/margins_std": 0.034600384533405304, "rewards/rejected": -0.020578987896442413, "step": 800 }, { "epoch": 0.26, "grad_norm": 0.337890625, "learning_rate": 4.6427293818552613e-07, "logits/chosen": -1.4188311100006104, "logits/rejected": -0.9876410365104675, "logps/chosen": -234.7069854736328, "logps/rejected": -182.8696746826172, "loss": 0.6725, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03687068074941635, "rewards/margins": 0.04073936119675636, "rewards/margins_max": 0.05916820093989372, "rewards/margins_min": 0.022310517728328705, "rewards/margins_std": 0.026062315329909325, "rewards/rejected": -0.0038686811458319426, "step": 810 }, { "epoch": 0.26, "grad_norm": 0.498046875, "learning_rate": 4.6284330633260994e-07, "logits/chosen": -1.3178324699401855, "logits/rejected": -0.9743862152099609, "logps/chosen": -202.13705444335938, "logps/rejected": -199.08094787597656, "loss": 0.6723, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.0206548273563385, "rewards/margins": 0.038690946996212006, "rewards/margins_max": 0.057139646261930466, "rewards/margins_min": 0.020242247730493546, "rewards/margins_std": 0.026090402156114578, "rewards/rejected": -0.018036121502518654, "step": 820 }, { "epoch": 0.26, "grad_norm": 0.40625, "learning_rate": 4.6138792081975844e-07, "logits/chosen": -1.4049233198165894, "logits/rejected": -1.0411832332611084, "logps/chosen": -223.6266632080078, "logps/rejected": -188.9544219970703, "loss": 0.6723, "rewards/accuracies": 1.0, "rewards/chosen": 0.03218904137611389, "rewards/margins": 0.04215382784605026, "rewards/margins_max": 0.06759864091873169, "rewards/margins_min": 0.016709014773368835, "rewards/margins_std": 0.03598439693450928, "rewards/rejected": -0.009964784607291222, "step": 830 }, { "epoch": 0.26, "grad_norm": 0.41796875, "learning_rate": 4.599069577460194e-07, "logits/chosen": -1.4191118478775024, "logits/rejected": -1.1629408597946167, "logps/chosen": -280.8072814941406, "logps/rejected": -243.64645385742188, "loss": 0.6685, "rewards/accuracies": 1.0, "rewards/chosen": 0.04105687886476517, "rewards/margins": 0.05216727405786514, "rewards/margins_max": 0.07590137422084808, "rewards/margins_min": 0.028433170169591904, "rewards/margins_std": 0.033565085381269455, "rewards/rejected": -0.011110392399132252, "step": 840 }, { "epoch": 0.27, "grad_norm": 0.3515625, "learning_rate": 4.5840059630527985e-07, "logits/chosen": -1.505789875984192, "logits/rejected": -1.090831995010376, "logps/chosen": -203.38735961914062, "logps/rejected": -218.90292358398438, "loss": 0.6706, "rewards/accuracies": 1.0, "rewards/chosen": 0.035862646996974945, "rewards/margins": 0.048062682151794434, "rewards/margins_max": 0.06934330612421036, "rewards/margins_min": 0.02678206004202366, "rewards/margins_std": 0.030095338821411133, "rewards/rejected": -0.01220003329217434, "step": 850 }, { "epoch": 0.27, "grad_norm": 0.466796875, "learning_rate": 4.5686901876458384e-07, "logits/chosen": -1.4151548147201538, "logits/rejected": -1.0735548734664917, "logps/chosen": -211.01199340820312, "logps/rejected": -224.63619995117188, "loss": 0.6725, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.02967039868235588, "rewards/margins": 0.04329541698098183, "rewards/margins_max": 0.0652666911482811, "rewards/margins_min": 0.021324139088392258, "rewards/margins_std": 0.03107207641005516, "rewards/rejected": -0.013625016435980797, "step": 860 }, { "epoch": 0.27, "grad_norm": 0.46484375, "learning_rate": 4.553124104420784e-07, "logits/chosen": -1.3255832195281982, "logits/rejected": -1.1080420017242432, "logps/chosen": -205.3770751953125, "logps/rejected": -209.04660034179688, "loss": 0.6728, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.022777115926146507, "rewards/margins": 0.0444360189139843, "rewards/margins_max": 0.06135256215929985, "rewards/margins_min": 0.02751948870718479, "rewards/margins_std": 0.023923594504594803, "rewards/rejected": -0.02165890485048294, "step": 870 }, { "epoch": 0.28, "grad_norm": 0.353515625, "learning_rate": 4.537309596845905e-07, "logits/chosen": -1.4212206602096558, "logits/rejected": -1.1468111276626587, "logps/chosen": -203.2875213623047, "logps/rejected": -183.04867553710938, "loss": 0.6704, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03265067934989929, "rewards/margins": 0.04764062911272049, "rewards/margins_max": 0.06543318927288055, "rewards/margins_min": 0.029848068952560425, "rewards/margins_std": 0.025162484496831894, "rewards/rejected": -0.014989949762821198, "step": 880 }, { "epoch": 0.28, "grad_norm": 0.333984375, "learning_rate": 4.521248578448373e-07, "logits/chosen": -1.295290231704712, "logits/rejected": -1.2244700193405151, "logps/chosen": -167.2049560546875, "logps/rejected": -235.2522735595703, "loss": 0.6736, "rewards/accuracies": 0.875, "rewards/chosen": 0.021955791860818863, "rewards/margins": 0.0307827889919281, "rewards/margins_max": 0.04658069089055061, "rewards/margins_min": 0.01498488150537014, "rewards/margins_std": 0.022341612726449966, "rewards/rejected": -0.008826995268464088, "step": 890 }, { "epoch": 0.28, "grad_norm": 0.392578125, "learning_rate": 4.504942992582732e-07, "logits/chosen": -1.2876170873641968, "logits/rejected": -1.070996642112732, "logps/chosen": -201.41519165039062, "logps/rejected": -215.9574737548828, "loss": 0.6713, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.033751118928194046, "rewards/margins": 0.03825461119413376, "rewards/margins_max": 0.05966836214065552, "rewards/margins_min": 0.016840863972902298, "rewards/margins_std": 0.030283614993095398, "rewards/rejected": -0.004503494594246149, "step": 900 }, { "epoch": 0.29, "grad_norm": 0.3671875, "learning_rate": 4.4883948121957483e-07, "logits/chosen": -1.3818947076797485, "logits/rejected": -1.1178925037384033, "logps/chosen": -170.48712158203125, "logps/rejected": -221.67098999023438, "loss": 0.6719, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.028661269694566727, "rewards/margins": 0.04194828122854233, "rewards/margins_max": 0.06600390374660492, "rewards/margins_min": 0.017892662435770035, "rewards/margins_std": 0.03401978313922882, "rewards/rejected": -0.013287017121911049, "step": 910 }, { "epoch": 0.29, "grad_norm": 0.4609375, "learning_rate": 4.471606039587695e-07, "logits/chosen": -1.4353498220443726, "logits/rejected": -1.2498797178268433, "logps/chosen": -250.303466796875, "logps/rejected": -252.1241912841797, "loss": 0.6728, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.030318697914481163, "rewards/margins": 0.03714519739151001, "rewards/margins_max": 0.05723171681165695, "rewards/margins_min": 0.01705867424607277, "rewards/margins_std": 0.02840663120150566, "rewards/rejected": -0.006826499011367559, "step": 920 }, { "epoch": 0.29, "grad_norm": 0.298828125, "learning_rate": 4.4545787061700746e-07, "logits/chosen": -1.4596531391143799, "logits/rejected": -0.9841306805610657, "logps/chosen": -191.699462890625, "logps/rejected": -231.6093292236328, "loss": 0.6695, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.037762559950351715, "rewards/margins": 0.05371398851275444, "rewards/margins_max": 0.07777608931064606, "rewards/margins_min": 0.029651891440153122, "rewards/margins_std": 0.03402894735336304, "rewards/rejected": -0.015951428562402725, "step": 930 }, { "epoch": 0.3, "grad_norm": 0.478515625, "learning_rate": 4.4373148722198183e-07, "logits/chosen": -1.3031915426254272, "logits/rejected": -0.9831310510635376, "logps/chosen": -203.33865356445312, "logps/rejected": -233.1627197265625, "loss": 0.6694, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03300677612423897, "rewards/margins": 0.06004180386662483, "rewards/margins_max": 0.09207084774971008, "rewards/margins_min": 0.028012752532958984, "rewards/margins_std": 0.04529590904712677, "rewards/rejected": -0.027035022154450417, "step": 940 }, { "epoch": 0.3, "grad_norm": 0.498046875, "learning_rate": 4.4198166266300025e-07, "logits/chosen": -1.4863415956497192, "logits/rejected": -1.0757322311401367, "logps/chosen": -224.2287139892578, "logps/rejected": -239.1637725830078, "loss": 0.6657, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.034104812890291214, "rewards/margins": 0.05569925159215927, "rewards/margins_max": 0.08411959558725357, "rewards/margins_min": 0.027278924360871315, "rewards/margins_std": 0.04019241780042648, "rewards/rejected": -0.021594444289803505, "step": 950 }, { "epoch": 0.3, "grad_norm": 0.39453125, "learning_rate": 4.402086086657092e-07, "logits/chosen": -1.5037004947662354, "logits/rejected": -0.9914538264274597, "logps/chosen": -196.54397583007812, "logps/rejected": -211.8581085205078, "loss": 0.6703, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.030650585889816284, "rewards/margins": 0.04529775679111481, "rewards/margins_max": 0.06939564645290375, "rewards/margins_min": 0.021199876442551613, "rewards/margins_std": 0.034079547971487045, "rewards/rejected": -0.014647173695266247, "step": 960 }, { "epoch": 0.31, "grad_norm": 0.46875, "learning_rate": 4.3841253976647584e-07, "logits/chosen": -1.4153146743774414, "logits/rejected": -1.0589603185653687, "logps/chosen": -195.8428192138672, "logps/rejected": -189.7724609375, "loss": 0.6679, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03924962133169174, "rewards/margins": 0.05352933332324028, "rewards/margins_max": 0.08327177166938782, "rewards/margins_min": 0.023786883801221848, "rewards/margins_std": 0.042062170803546906, "rewards/rejected": -0.014279710128903389, "step": 970 }, { "epoch": 0.31, "grad_norm": 0.28125, "learning_rate": 4.3659367328642917e-07, "logits/chosen": -1.1924479007720947, "logits/rejected": -1.0621583461761475, "logps/chosen": -212.1740264892578, "logps/rejected": -255.7356414794922, "loss": 0.6707, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.026019830256700516, "rewards/margins": 0.039071694016456604, "rewards/margins_max": 0.05669945478439331, "rewards/margins_min": 0.021443922072649002, "rewards/margins_std": 0.024929430335760117, "rewards/rejected": -0.013051861897110939, "step": 980 }, { "epoch": 0.31, "grad_norm": 0.5, "learning_rate": 4.3475222930516473e-07, "logits/chosen": -1.3828264474868774, "logits/rejected": -1.12723708152771, "logps/chosen": -209.1006622314453, "logps/rejected": -211.49014282226562, "loss": 0.6707, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.024994900450110435, "rewards/margins": 0.04330515116453171, "rewards/margins_max": 0.0647624060511589, "rewards/margins_min": 0.021847892552614212, "rewards/margins_std": 0.03034515120089054, "rewards/rejected": -0.018310246989130974, "step": 990 }, { "epoch": 0.32, "grad_norm": 0.345703125, "learning_rate": 4.3288843063411573e-07, "logits/chosen": -1.588935136795044, "logits/rejected": -1.1631680727005005, "logps/chosen": -212.937255859375, "logps/rejected": -199.36758422851562, "loss": 0.6685, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03406776860356331, "rewards/margins": 0.04911542311310768, "rewards/margins_max": 0.07006208598613739, "rewards/margins_min": 0.02816876210272312, "rewards/margins_std": 0.029623055830597878, "rewards/rejected": -0.015047654509544373, "step": 1000 }, { "epoch": 0.32, "grad_norm": 0.4296875, "learning_rate": 4.310025027895925e-07, "logits/chosen": -1.4283082485198975, "logits/rejected": -1.1020632982254028, "logps/chosen": -215.6881103515625, "logps/rejected": -228.725341796875, "loss": 0.6686, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.035303063690662384, "rewards/margins": 0.047801949083805084, "rewards/margins_max": 0.06932314485311508, "rewards/margins_min": 0.026280760765075684, "rewards/margins_std": 0.03043556772172451, "rewards/rejected": -0.01249888725578785, "step": 1010 }, { "epoch": 0.32, "grad_norm": 0.447265625, "learning_rate": 4.290946739654962e-07, "logits/chosen": -1.3023067712783813, "logits/rejected": -0.9218745231628418, "logps/chosen": -245.7296905517578, "logps/rejected": -226.1380615234375, "loss": 0.6675, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.031115401536226273, "rewards/margins": 0.05050064995884895, "rewards/margins_max": 0.06927161663770676, "rewards/margins_min": 0.03172967582941055, "rewards/margins_std": 0.026546159759163857, "rewards/rejected": -0.019385244697332382, "step": 1020 }, { "epoch": 0.32, "grad_norm": 0.3046875, "learning_rate": 4.2716517500570704e-07, "logits/chosen": -1.3911397457122803, "logits/rejected": -1.181490182876587, "logps/chosen": -186.88909912109375, "logps/rejected": -218.47900390625, "loss": 0.6743, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.024538397789001465, "rewards/margins": 0.040439218282699585, "rewards/margins_max": 0.060990117490291595, "rewards/margins_min": 0.019888322800397873, "rewards/margins_std": 0.029063355177640915, "rewards/rejected": -0.01590082235634327, "step": 1030 }, { "epoch": 0.33, "grad_norm": 0.515625, "learning_rate": 4.252142393761533e-07, "logits/chosen": -1.4555580615997314, "logits/rejected": -1.0047805309295654, "logps/chosen": -251.005615234375, "logps/rejected": -284.2795715332031, "loss": 0.668, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03476887568831444, "rewards/margins": 0.054674047976732254, "rewards/margins_max": 0.07890333235263824, "rewards/margins_min": 0.030444765463471413, "rewards/margins_std": 0.03426538407802582, "rewards/rejected": -0.019905168563127518, "step": 1040 }, { "epoch": 0.33, "grad_norm": 0.359375, "learning_rate": 4.232421031365617e-07, "logits/chosen": -1.3305257558822632, "logits/rejected": -1.1427993774414062, "logps/chosen": -180.7315216064453, "logps/rejected": -214.3630828857422, "loss": 0.6704, "rewards/accuracies": 1.0, "rewards/chosen": 0.02955777570605278, "rewards/margins": 0.051715098321437836, "rewards/margins_max": 0.07381218671798706, "rewards/margins_min": 0.029618006199598312, "rewards/margins_std": 0.031250011175870895, "rewards/rejected": -0.022157320752739906, "step": 1050 }, { "epoch": 0.33, "grad_norm": 0.318359375, "learning_rate": 4.212490049118951e-07, "logits/chosen": -1.4470938444137573, "logits/rejected": -1.143046498298645, "logps/chosen": -198.9834442138672, "logps/rejected": -239.5135955810547, "loss": 0.6707, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.028033524751663208, "rewards/margins": 0.04688093811273575, "rewards/margins_max": 0.06461174786090851, "rewards/margins_min": 0.02915012836456299, "rewards/margins_std": 0.025075148791074753, "rewards/rejected": -0.018847409635782242, "step": 1060 }, { "epoch": 0.34, "grad_norm": 0.337890625, "learning_rate": 4.1923518586347914e-07, "logits/chosen": -1.4638912677764893, "logits/rejected": -1.0022966861724854, "logps/chosen": -209.8042449951172, "logps/rejected": -192.01329040527344, "loss": 0.6697, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03496958687901497, "rewards/margins": 0.05312635377049446, "rewards/margins_max": 0.07696934044361115, "rewards/margins_min": 0.029283368960022926, "rewards/margins_std": 0.03371907025575638, "rewards/rejected": -0.018156763166189194, "step": 1070 }, { "epoch": 0.34, "grad_norm": 0.455078125, "learning_rate": 4.172008896598221e-07, "logits/chosen": -1.3048521280288696, "logits/rejected": -1.0749359130859375, "logps/chosen": -201.3562469482422, "logps/rejected": -187.42080688476562, "loss": 0.6696, "rewards/accuracies": 1.0, "rewards/chosen": 0.03374785929918289, "rewards/margins": 0.055463533848524094, "rewards/margins_max": 0.07117541134357452, "rewards/margins_min": 0.03975165635347366, "rewards/margins_std": 0.022219957783818245, "rewards/rejected": -0.02171567641198635, "step": 1080 }, { "epoch": 0.34, "grad_norm": 0.40234375, "learning_rate": 4.151463624471313e-07, "logits/chosen": -1.32763671875, "logits/rejected": -0.8290489315986633, "logps/chosen": -311.2829895019531, "logps/rejected": -223.5568084716797, "loss": 0.6627, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04091322422027588, "rewards/margins": 0.07021255791187286, "rewards/margins_max": 0.10776461660861969, "rewards/margins_min": 0.03266051039099693, "rewards/margins_std": 0.05310662463307381, "rewards/rejected": -0.029299337416887283, "step": 1090 }, { "epoch": 0.35, "grad_norm": 0.37109375, "learning_rate": 4.130718528195303e-07, "logits/chosen": -1.4879382848739624, "logits/rejected": -1.0252676010131836, "logps/chosen": -229.93917846679688, "logps/rejected": -225.00491333007812, "loss": 0.666, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.041905976831912994, "rewards/margins": 0.06251799315214157, "rewards/margins_max": 0.08764694631099701, "rewards/margins_min": 0.03738904744386673, "rewards/margins_std": 0.035537708550691605, "rewards/rejected": -0.020612016320228577, "step": 1100 }, { "epoch": 0.35, "grad_norm": 0.328125, "learning_rate": 4.109776117889789e-07, "logits/chosen": -1.371626853942871, "logits/rejected": -0.9644553065299988, "logps/chosen": -256.90826416015625, "logps/rejected": -262.3277282714844, "loss": 0.665, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.04058977589011192, "rewards/margins": 0.058307357132434845, "rewards/margins_max": 0.07639677822589874, "rewards/margins_min": 0.04021793603897095, "rewards/margins_std": 0.025582294911146164, "rewards/rejected": -0.017717575654387474, "step": 1110 }, { "epoch": 0.35, "grad_norm": 0.3515625, "learning_rate": 4.088638927549016e-07, "logits/chosen": -1.4024112224578857, "logits/rejected": -1.0277204513549805, "logps/chosen": -257.0721740722656, "logps/rejected": -223.2074432373047, "loss": 0.6695, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.033399466425180435, "rewards/margins": 0.05190381407737732, "rewards/margins_max": 0.07284527271986008, "rewards/margins_min": 0.03096235729753971, "rewards/margins_std": 0.029615694656968117, "rewards/rejected": -0.018504345789551735, "step": 1120 }, { "epoch": 0.36, "grad_norm": 0.3359375, "learning_rate": 4.067309514735267e-07, "logits/chosen": -1.2835520505905151, "logits/rejected": -0.9591856002807617, "logps/chosen": -253.2421417236328, "logps/rejected": -214.3036651611328, "loss": 0.6642, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03112444281578064, "rewards/margins": 0.06105799227952957, "rewards/margins_max": 0.08269943296909332, "rewards/margins_min": 0.03941655158996582, "rewards/margins_std": 0.030605623498558998, "rewards/rejected": -0.02993355132639408, "step": 1130 }, { "epoch": 0.36, "grad_norm": 0.4296875, "learning_rate": 4.045790460269395e-07, "logits/chosen": -1.29916250705719, "logits/rejected": -0.9579310417175293, "logps/chosen": -222.2379608154297, "logps/rejected": -203.0634307861328, "loss": 0.667, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03449912741780281, "rewards/margins": 0.04771226644515991, "rewards/margins_max": 0.0739569216966629, "rewards/margins_min": 0.02146761119365692, "rewards/margins_std": 0.03711555153131485, "rewards/rejected": -0.0132131427526474, "step": 1140 }, { "epoch": 0.36, "grad_norm": 0.46484375, "learning_rate": 4.02408436791856e-07, "logits/chosen": -1.3718782663345337, "logits/rejected": -1.0133472681045532, "logps/chosen": -234.8833770751953, "logps/rejected": -237.4645233154297, "loss": 0.6669, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03908390551805496, "rewards/margins": 0.06387855857610703, "rewards/margins_max": 0.0912180095911026, "rewards/margins_min": 0.03653910756111145, "rewards/margins_std": 0.038663819432258606, "rewards/rejected": -0.02479465678334236, "step": 1150 }, { "epoch": 0.37, "grad_norm": 0.6796875, "learning_rate": 4.0021938640811717e-07, "logits/chosen": -1.3344662189483643, "logits/rejected": -0.9591034054756165, "logps/chosen": -221.8365478515625, "logps/rejected": -358.8748779296875, "loss": 0.6653, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03558691591024399, "rewards/margins": 0.06812174618244171, "rewards/margins_max": 0.09799469262361526, "rewards/margins_min": 0.03824879601597786, "rewards/margins_std": 0.042246729135513306, "rewards/rejected": -0.032534826546907425, "step": 1160 }, { "epoch": 0.37, "grad_norm": 0.275390625, "learning_rate": 3.980121597469095e-07, "logits/chosen": -1.4173529148101807, "logits/rejected": -1.046112298965454, "logps/chosen": -222.1094970703125, "logps/rejected": -195.96484375, "loss": 0.6705, "rewards/accuracies": 1.0, "rewards/chosen": 0.02532361075282097, "rewards/margins": 0.04526478797197342, "rewards/margins_max": 0.06393333524465561, "rewards/margins_min": 0.02659623883664608, "rewards/margins_std": 0.02640131488442421, "rewards/rejected": -0.0199411790817976, "step": 1170 }, { "epoch": 0.37, "grad_norm": 0.412109375, "learning_rate": 3.9578702387871735e-07, "logits/chosen": -1.4770991802215576, "logits/rejected": -1.0594831705093384, "logps/chosen": -200.45314025878906, "logps/rejected": -181.73829650878906, "loss": 0.67, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03991563245654106, "rewards/margins": 0.04643087834119797, "rewards/margins_max": 0.0705905631184578, "rewards/margins_min": 0.022271184250712395, "rewards/margins_std": 0.034166961908340454, "rewards/rejected": -0.00651524355635047, "step": 1180 }, { "epoch": 0.37, "grad_norm": 0.34375, "learning_rate": 3.9354424804100647e-07, "logits/chosen": -1.3302786350250244, "logits/rejected": -1.0419865846633911, "logps/chosen": -180.52911376953125, "logps/rejected": -229.23507690429688, "loss": 0.6661, "rewards/accuracies": 1.0, "rewards/chosen": 0.03989443928003311, "rewards/margins": 0.05144830420613289, "rewards/margins_max": 0.07513656467199326, "rewards/margins_min": 0.02776004932820797, "rewards/margins_std": 0.03350025415420532, "rewards/rejected": -0.0115538714453578, "step": 1190 }, { "epoch": 0.38, "grad_norm": 0.2734375, "learning_rate": 3.9128410360564793e-07, "logits/chosen": -1.4453057050704956, "logits/rejected": -0.843630313873291, "logps/chosen": -239.1511993408203, "logps/rejected": -228.0477294921875, "loss": 0.6689, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.0347132682800293, "rewards/margins": 0.05644859001040459, "rewards/margins_max": 0.07778388261795044, "rewards/margins_min": 0.035113297402858734, "rewards/margins_std": 0.03017266094684601, "rewards/rejected": -0.02173532173037529, "step": 1200 }, { "epoch": 0.38, "grad_norm": 0.337890625, "learning_rate": 3.8900686404608174e-07, "logits/chosen": -1.4021894931793213, "logits/rejected": -1.1501901149749756, "logps/chosen": -246.96676635742188, "logps/rejected": -243.79055786132812, "loss": 0.6648, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.02853170968592167, "rewards/margins": 0.05541776865720749, "rewards/margins_max": 0.0840243324637413, "rewards/margins_min": 0.026811202988028526, "rewards/margins_std": 0.04045579582452774, "rewards/rejected": -0.02688606083393097, "step": 1210 }, { "epoch": 0.38, "grad_norm": 0.328125, "learning_rate": 3.8671280490422753e-07, "logits/chosen": -1.4860647916793823, "logits/rejected": -1.2145134210586548, "logps/chosen": -173.87155151367188, "logps/rejected": -215.08114624023438, "loss": 0.6686, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03185255080461502, "rewards/margins": 0.0523541197180748, "rewards/margins_max": 0.07217199355363846, "rewards/margins_min": 0.032536253333091736, "rewards/margins_std": 0.028026703745126724, "rewards/rejected": -0.020501574501395226, "step": 1220 }, { "epoch": 0.39, "grad_norm": 0.3984375, "learning_rate": 3.8440220375714435e-07, "logits/chosen": -1.4330469369888306, "logits/rejected": -0.923498809337616, "logps/chosen": -194.24989318847656, "logps/rejected": -188.0842742919922, "loss": 0.6661, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.02217457816004753, "rewards/margins": 0.05012967437505722, "rewards/margins_max": 0.07560008764266968, "rewards/margins_min": 0.02465927042067051, "rewards/margins_std": 0.03602059185504913, "rewards/rejected": -0.02795509621500969, "step": 1230 }, { "epoch": 0.39, "grad_norm": 0.42578125, "learning_rate": 3.8207534018344434e-07, "logits/chosen": -1.4624649286270142, "logits/rejected": -1.2272025346755981, "logps/chosen": -224.9335479736328, "logps/rejected": -215.4106903076172, "loss": 0.6645, "rewards/accuracies": 1.0, "rewards/chosen": 0.02788296714425087, "rewards/margins": 0.05390559881925583, "rewards/margins_max": 0.07784163951873779, "rewards/margins_min": 0.02996954880654812, "rewards/margins_std": 0.033850688487291336, "rewards/rejected": -0.02602263353765011, "step": 1240 }, { "epoch": 0.39, "grad_norm": 0.376953125, "learning_rate": 3.797324957294643e-07, "logits/chosen": -1.4522289037704468, "logits/rejected": -1.0942248106002808, "logps/chosen": -197.66709899902344, "logps/rejected": -188.29644775390625, "loss": 0.6667, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.028640951961278915, "rewards/margins": 0.04304185137152672, "rewards/margins_max": 0.06764715909957886, "rewards/margins_min": 0.01843653805553913, "rewards/margins_std": 0.034797169268131256, "rewards/rejected": -0.014400901272892952, "step": 1250 }, { "epoch": 0.4, "grad_norm": 0.451171875, "learning_rate": 3.773739538751988e-07, "logits/chosen": -1.4544618129730225, "logits/rejected": -1.0294139385223389, "logps/chosen": -245.71435546875, "logps/rejected": -208.09115600585938, "loss": 0.6646, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.032376714050769806, "rewards/margins": 0.04516047239303589, "rewards/margins_max": 0.06522423774003983, "rewards/margins_min": 0.025096703320741653, "rewards/margins_std": 0.028374452143907547, "rewards/rejected": -0.012783756479620934, "step": 1260 }, { "epoch": 0.4, "grad_norm": 0.431640625, "learning_rate": 3.75e-07, "logits/chosen": -1.39158034324646, "logits/rejected": -0.9538629651069641, "logps/chosen": -339.5301513671875, "logps/rejected": -214.0384521484375, "loss": 0.6693, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03498070687055588, "rewards/margins": 0.05068878084421158, "rewards/margins_max": 0.07239842414855957, "rewards/margins_min": 0.028979141265153885, "rewards/margins_std": 0.030702069401741028, "rewards/rejected": -0.015708070248365402, "step": 1270 }, { "epoch": 0.4, "grad_norm": 0.388671875, "learning_rate": 3.7261092134804695e-07, "logits/chosen": -1.313458800315857, "logits/rejected": -0.9937132000923157, "logps/chosen": -205.0299835205078, "logps/rejected": -212.155029296875, "loss": 0.6684, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03651620075106621, "rewards/margins": 0.05705242604017258, "rewards/margins_max": 0.08746035397052765, "rewards/margins_min": 0.026644494384527206, "rewards/margins_std": 0.043003302067518234, "rewards/rejected": -0.020536217838525772, "step": 1280 }, { "epoch": 0.41, "grad_norm": 0.40234375, "learning_rate": 3.702070069935898e-07, "logits/chosen": -1.4626922607421875, "logits/rejected": -1.015981674194336, "logps/chosen": -227.63339233398438, "logps/rejected": -221.4516143798828, "loss": 0.6682, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03474265709519386, "rewards/margins": 0.051122285425662994, "rewards/margins_max": 0.06716804951429367, "rewards/margins_min": 0.03507651016116142, "rewards/margins_std": 0.02269214577972889, "rewards/rejected": -0.016379622742533684, "step": 1290 }, { "epoch": 0.41, "grad_norm": 0.6171875, "learning_rate": 3.6778854780597213e-07, "logits/chosen": -1.2919328212738037, "logits/rejected": -0.9956780672073364, "logps/chosen": -222.3484344482422, "logps/rejected": -182.62179565429688, "loss": 0.6681, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.030011823400855064, "rewards/margins": 0.0488753467798233, "rewards/margins_max": 0.06740256398916245, "rewards/margins_min": 0.030348125845193863, "rewards/margins_std": 0.02620144747197628, "rewards/rejected": -0.01886352151632309, "step": 1300 }, { "epoch": 0.41, "grad_norm": 0.466796875, "learning_rate": 3.653558364144363e-07, "logits/chosen": -1.4199802875518799, "logits/rejected": -1.1749187707901, "logps/chosen": -182.4161376953125, "logps/rejected": -217.2281951904297, "loss": 0.6651, "rewards/accuracies": 0.875, "rewards/chosen": 0.03831896930932999, "rewards/margins": 0.053058166056871414, "rewards/margins_max": 0.07967302948236465, "rewards/margins_min": 0.026443298906087875, "rewards/margins_std": 0.03763909637928009, "rewards/rejected": -0.014739197678864002, "step": 1310 }, { "epoch": 0.42, "grad_norm": 0.4296875, "learning_rate": 3.629091671727159e-07, "logits/chosen": -1.383264422416687, "logits/rejected": -0.935562252998352, "logps/chosen": -236.8832550048828, "logps/rejected": -232.43701171875, "loss": 0.6652, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03460818529129028, "rewards/margins": 0.05942409485578537, "rewards/margins_max": 0.08626364171504974, "rewards/margins_min": 0.0325845405459404, "rewards/margins_std": 0.037956852465867996, "rewards/rejected": -0.02481590211391449, "step": 1320 }, { "epoch": 0.42, "grad_norm": 0.515625, "learning_rate": 3.6044883612341957e-07, "logits/chosen": -1.4922215938568115, "logits/rejected": -1.210303544998169, "logps/chosen": -175.39468383789062, "logps/rejected": -175.10372924804688, "loss": 0.6688, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.0377926230430603, "rewards/margins": 0.044999849051237106, "rewards/margins_max": 0.06272000819444656, "rewards/margins_min": 0.027279695495963097, "rewards/margins_std": 0.025060083717107773, "rewards/rejected": -0.007207226939499378, "step": 1330 }, { "epoch": 0.42, "grad_norm": 0.416015625, "learning_rate": 3.5797514096221024e-07, "logits/chosen": -1.447775959968567, "logits/rejected": -1.1010136604309082, "logps/chosen": -233.7635040283203, "logps/rejected": -213.46658325195312, "loss": 0.6693, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.032312069088220596, "rewards/margins": 0.052999138832092285, "rewards/margins_max": 0.07665625959634781, "rewards/margins_min": 0.02934201993048191, "rewards/margins_std": 0.033456217497587204, "rewards/rejected": -0.02068706974387169, "step": 1340 }, { "epoch": 0.43, "grad_norm": 0.38671875, "learning_rate": 3.554883810017844e-07, "logits/chosen": -1.3156002759933472, "logits/rejected": -1.0745857954025269, "logps/chosen": -181.6421661376953, "logps/rejected": -183.23080444335938, "loss": 0.6698, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.030048031359910965, "rewards/margins": 0.04210250452160835, "rewards/margins_max": 0.07046877592802048, "rewards/margins_min": 0.013736230321228504, "rewards/margins_std": 0.04011595994234085, "rewards/rejected": -0.01205446757376194, "step": 1350 }, { "epoch": 0.43, "grad_norm": 0.298828125, "learning_rate": 3.529888571356561e-07, "logits/chosen": -1.2578437328338623, "logits/rejected": -1.0070809125900269, "logps/chosen": -250.3462677001953, "logps/rejected": -232.718994140625, "loss": 0.6706, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.026622626930475235, "rewards/margins": 0.04009575396776199, "rewards/margins_max": 0.06229530647397041, "rewards/margins_min": 0.017896197736263275, "rewards/margins_std": 0.03139491006731987, "rewards/rejected": -0.01347312517464161, "step": 1360 }, { "epoch": 0.43, "grad_norm": 0.4609375, "learning_rate": 3.50476871801749e-07, "logits/chosen": -1.374895453453064, "logits/rejected": -0.9742172956466675, "logps/chosen": -298.19110107421875, "logps/rejected": -209.47329711914062, "loss": 0.6677, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03527087718248367, "rewards/margins": 0.04785536974668503, "rewards/margins_max": 0.06894843280315399, "rewards/margins_min": 0.026762310415506363, "rewards/margins_std": 0.029830092564225197, "rewards/rejected": -0.012584498152136803, "step": 1370 }, { "epoch": 0.43, "grad_norm": 0.337890625, "learning_rate": 3.479527289458021e-07, "logits/chosen": -1.3711079359054565, "logits/rejected": -1.0774781703948975, "logps/chosen": -184.34344482421875, "logps/rejected": -224.1437225341797, "loss": 0.6642, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03007492795586586, "rewards/margins": 0.05908045917749405, "rewards/margins_max": 0.09251175820827484, "rewards/margins_min": 0.02564915083348751, "rewards/margins_std": 0.04727901145815849, "rewards/rejected": -0.029005536809563637, "step": 1380 }, { "epoch": 0.44, "grad_norm": 0.427734375, "learning_rate": 3.4541673398459315e-07, "logits/chosen": -1.293668508529663, "logits/rejected": -1.0986145734786987, "logps/chosen": -209.5894012451172, "logps/rejected": -229.1302490234375, "loss": 0.6686, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.0236887875944376, "rewards/margins": 0.04496780037879944, "rewards/margins_max": 0.06106124445796013, "rewards/margins_min": 0.028874356299638748, "rewards/margins_std": 0.0227595716714859, "rewards/rejected": -0.021279016509652138, "step": 1390 }, { "epoch": 0.44, "grad_norm": 0.48046875, "learning_rate": 3.4286919376898303e-07, "logits/chosen": -1.2458115816116333, "logits/rejected": -0.9769574403762817, "logps/chosen": -219.8367919921875, "logps/rejected": -227.66421508789062, "loss": 0.6661, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.028372962027788162, "rewards/margins": 0.04673437029123306, "rewards/margins_max": 0.06330729275941849, "rewards/margins_min": 0.030161460861563683, "rewards/margins_std": 0.02343764156103134, "rewards/rejected": -0.018361413851380348, "step": 1400 }, { "epoch": 0.44, "grad_norm": 0.51953125, "learning_rate": 3.403104165467883e-07, "logits/chosen": -1.3929589986801147, "logits/rejected": -1.1880546808242798, "logps/chosen": -276.8525085449219, "logps/rejected": -241.208740234375, "loss": 0.666, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03430411219596863, "rewards/margins": 0.04882946237921715, "rewards/margins_max": 0.08031658828258514, "rewards/margins_min": 0.017342329025268555, "rewards/margins_std": 0.04452953487634659, "rewards/rejected": -0.014525346457958221, "step": 1410 }, { "epoch": 0.45, "grad_norm": 0.486328125, "learning_rate": 3.377407119254826e-07, "logits/chosen": -1.307857632637024, "logits/rejected": -0.973365306854248, "logps/chosen": -262.1522521972656, "logps/rejected": -219.1666717529297, "loss": 0.6699, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03059108555316925, "rewards/margins": 0.05130365490913391, "rewards/margins_max": 0.07557855546474457, "rewards/margins_min": 0.027028745040297508, "rewards/margins_std": 0.03432989865541458, "rewards/rejected": -0.02071256935596466, "step": 1420 }, { "epoch": 0.45, "grad_norm": 0.578125, "learning_rate": 3.351603908347359e-07, "logits/chosen": -1.3961646556854248, "logits/rejected": -1.0634922981262207, "logps/chosen": -244.14907836914062, "logps/rejected": -209.52413940429688, "loss": 0.6664, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.02553880773484707, "rewards/margins": 0.039372727274894714, "rewards/margins_max": 0.05578699707984924, "rewards/margins_min": 0.022958464920520782, "rewards/margins_std": 0.023213278502225876, "rewards/rejected": -0.013833923265337944, "step": 1430 }, { "epoch": 0.45, "grad_norm": 0.443359375, "learning_rate": 3.325697654887918e-07, "logits/chosen": -1.457953929901123, "logits/rejected": -1.1763416528701782, "logps/chosen": -168.73855590820312, "logps/rejected": -200.1396026611328, "loss": 0.6663, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.039046648889780045, "rewards/margins": 0.061664480715990067, "rewards/margins_max": 0.08563290536403656, "rewards/margins_min": 0.03769605979323387, "rewards/margins_std": 0.03389647603034973, "rewards/rejected": -0.02261783741414547, "step": 1440 }, { "epoch": 0.46, "grad_norm": 0.435546875, "learning_rate": 3.2996914934869034e-07, "logits/chosen": -1.4136825799942017, "logits/rejected": -0.9438567161560059, "logps/chosen": -211.27880859375, "logps/rejected": -251.1080322265625, "loss": 0.6655, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.05148143321275711, "rewards/margins": 0.05926315858960152, "rewards/margins_max": 0.0925159901380539, "rewards/margins_min": 0.026010334491729736, "rewards/margins_std": 0.04702659696340561, "rewards/rejected": -0.00778172304853797, "step": 1450 }, { "epoch": 0.46, "grad_norm": 0.392578125, "learning_rate": 3.273588570843399e-07, "logits/chosen": -1.3561222553253174, "logits/rejected": -0.8794288635253906, "logps/chosen": -219.59188842773438, "logps/rejected": -204.20651245117188, "loss": 0.6653, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04018976539373398, "rewards/margins": 0.058758050203323364, "rewards/margins_max": 0.08319707214832306, "rewards/margins_min": 0.034319035708904266, "rewards/margins_std": 0.034561995416879654, "rewards/rejected": -0.018568288534879684, "step": 1460 }, { "epoch": 0.46, "grad_norm": 0.5078125, "learning_rate": 3.2473920453644254e-07, "logits/chosen": -1.364458680152893, "logits/rejected": -1.1189966201782227, "logps/chosen": -200.58279418945312, "logps/rejected": -247.4306182861328, "loss": 0.6625, "rewards/accuracies": 1.0, "rewards/chosen": 0.03194325789809227, "rewards/margins": 0.058095790445804596, "rewards/margins_max": 0.08179818838834763, "rewards/margins_min": 0.03439338877797127, "rewards/margins_std": 0.03352025896310806, "rewards/rejected": -0.02615252695977688, "step": 1470 }, { "epoch": 0.47, "grad_norm": 0.40234375, "learning_rate": 3.2211050867827805e-07, "logits/chosen": -1.4114757776260376, "logits/rejected": -1.0227770805358887, "logps/chosen": -217.49783325195312, "logps/rejected": -270.8158874511719, "loss": 0.6649, "rewards/accuracies": 1.0, "rewards/chosen": 0.03752985596656799, "rewards/margins": 0.06543248146772385, "rewards/margins_max": 0.08699898421764374, "rewards/margins_min": 0.043865982443094254, "rewards/margins_std": 0.030499637126922607, "rewards/rejected": -0.02790263295173645, "step": 1480 }, { "epoch": 0.47, "grad_norm": 0.34765625, "learning_rate": 3.194730875773504e-07, "logits/chosen": -1.3351142406463623, "logits/rejected": -1.0667884349822998, "logps/chosen": -226.33425903320312, "logps/rejected": -211.48983764648438, "loss": 0.6666, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03304114192724228, "rewards/margins": 0.047557245939970016, "rewards/margins_max": 0.06921641528606415, "rewards/margins_min": 0.02589806541800499, "rewards/margins_std": 0.030630702152848244, "rewards/rejected": -0.014516102150082588, "step": 1490 }, { "epoch": 0.47, "grad_norm": 0.42578125, "learning_rate": 3.168272603569025e-07, "logits/chosen": -1.4025719165802002, "logits/rejected": -0.8659202456474304, "logps/chosen": -255.092529296875, "logps/rejected": -191.5826416015625, "loss": 0.6657, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.037762049585580826, "rewards/margins": 0.05839651823043823, "rewards/margins_max": 0.08473102748394012, "rewards/margins_min": 0.03206200897693634, "rewards/margins_std": 0.03724262863397598, "rewards/rejected": -0.020634472370147705, "step": 1500 }, { "epoch": 0.48, "grad_norm": 0.41015625, "learning_rate": 3.1417334715730257e-07, "logits/chosen": -1.312922716140747, "logits/rejected": -0.9928410649299622, "logps/chosen": -274.3824768066406, "logps/rejected": -207.8001251220703, "loss": 0.6652, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03649697080254555, "rewards/margins": 0.04787913337349892, "rewards/margins_max": 0.07063382118940353, "rewards/margins_min": 0.025124436244368553, "rewards/margins_std": 0.032179996371269226, "rewards/rejected": -0.01138215884566307, "step": 1510 }, { "epoch": 0.48, "grad_norm": 0.3125, "learning_rate": 3.115116690973081e-07, "logits/chosen": -1.275967001914978, "logits/rejected": -1.0719497203826904, "logps/chosen": -170.84716796875, "logps/rejected": -187.09201049804688, "loss": 0.6697, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.030098671093583107, "rewards/margins": 0.04616239666938782, "rewards/margins_max": 0.06867832690477371, "rewards/margins_min": 0.023646462708711624, "rewards/margins_std": 0.03184233605861664, "rewards/rejected": -0.01606372371315956, "step": 1520 }, { "epoch": 0.48, "grad_norm": 0.375, "learning_rate": 3.088425482352106e-07, "logits/chosen": -1.3329031467437744, "logits/rejected": -0.9551903009414673, "logps/chosen": -178.49220275878906, "logps/rejected": -163.50289916992188, "loss": 0.6707, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.02671188674867153, "rewards/margins": 0.05042758584022522, "rewards/margins_max": 0.07536738365888596, "rewards/margins_min": 0.025487786158919334, "rewards/margins_std": 0.03527020663022995, "rewards/rejected": -0.023715700954198837, "step": 1530 }, { "epoch": 0.49, "grad_norm": 0.5625, "learning_rate": 3.061663075298675e-07, "logits/chosen": -1.5138485431671143, "logits/rejected": -1.1314074993133545, "logps/chosen": -250.61813354492188, "logps/rejected": -272.20379638671875, "loss": 0.6643, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03775627166032791, "rewards/margins": 0.05153984948992729, "rewards/margins_max": 0.07664564251899719, "rewards/margins_min": 0.026434045284986496, "rewards/margins_std": 0.035504959523677826, "rewards/rejected": -0.013783574104309082, "step": 1540 }, { "epoch": 0.49, "grad_norm": 0.408203125, "learning_rate": 3.034832708016243e-07, "logits/chosen": -1.5145914554595947, "logits/rejected": -1.0713765621185303, "logps/chosen": -261.14312744140625, "logps/rejected": -210.29232788085938, "loss": 0.6632, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03341571241617203, "rewards/margins": 0.048537809401750565, "rewards/margins_max": 0.07006336748600006, "rewards/margins_min": 0.027012262493371964, "rewards/margins_std": 0.030441725626587868, "rewards/rejected": -0.01512210350483656, "step": 1550 }, { "epoch": 0.49, "grad_norm": 0.37890625, "learning_rate": 3.0079376269313354e-07, "logits/chosen": -1.4111496210098267, "logits/rejected": -1.072613000869751, "logps/chosen": -207.9450225830078, "logps/rejected": -267.03912353515625, "loss": 0.6651, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.029351189732551575, "rewards/margins": 0.05201409012079239, "rewards/margins_max": 0.07417653501033783, "rewards/margins_min": 0.0298516396433115, "rewards/margins_std": 0.031342435628175735, "rewards/rejected": -0.022662896662950516, "step": 1560 }, { "epoch": 0.49, "grad_norm": 0.330078125, "learning_rate": 2.9809810863007284e-07, "logits/chosen": -1.4359506368637085, "logits/rejected": -1.0733433961868286, "logps/chosen": -200.97647094726562, "logps/rejected": -209.4395751953125, "loss": 0.6674, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.039248835295438766, "rewards/margins": 0.04961882531642914, "rewards/margins_max": 0.0718330442905426, "rewards/margins_min": 0.027404606342315674, "rewards/margins_std": 0.031415652483701706, "rewards/rejected": -0.010369991883635521, "step": 1570 }, { "epoch": 0.5, "grad_norm": 0.55859375, "learning_rate": 2.9539663478176946e-07, "logits/chosen": -1.2646214962005615, "logits/rejected": -1.1139628887176514, "logps/chosen": -206.5272674560547, "logps/rejected": -250.39108276367188, "loss": 0.6681, "rewards/accuracies": 1.0, "rewards/chosen": 0.02430318295955658, "rewards/margins": 0.051103752106428146, "rewards/margins_max": 0.0777682214975357, "rewards/margins_min": 0.024439293891191483, "rewards/margins_std": 0.03770923987030983, "rewards/rejected": -0.026800569146871567, "step": 1580 }, { "epoch": 0.5, "grad_norm": 0.4765625, "learning_rate": 2.9268966802173436e-07, "logits/chosen": -1.3860819339752197, "logits/rejected": -0.975805938243866, "logps/chosen": -270.6651611328125, "logps/rejected": -221.06259155273438, "loss": 0.6665, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03461884707212448, "rewards/margins": 0.05071113631129265, "rewards/margins_max": 0.0760193020105362, "rewards/margins_min": 0.025402987375855446, "rewards/margins_std": 0.03579113632440567, "rewards/rejected": -0.016092294827103615, "step": 1590 }, { "epoch": 0.5, "grad_norm": 0.443359375, "learning_rate": 2.89977535888111e-07, "logits/chosen": -1.3565785884857178, "logits/rejected": -0.9915903210639954, "logps/chosen": -177.0413055419922, "logps/rejected": -182.9870147705078, "loss": 0.6673, "rewards/accuracies": 1.0, "rewards/chosen": 0.03303280100226402, "rewards/margins": 0.06585012376308441, "rewards/margins_max": 0.08999715745449066, "rewards/margins_min": 0.04170309379696846, "rewards/margins_std": 0.034149058163166046, "rewards/rejected": -0.03281732648611069, "step": 1600 }, { "epoch": 0.51, "grad_norm": 0.388671875, "learning_rate": 2.872605665440436e-07, "logits/chosen": -1.3481905460357666, "logits/rejected": -1.1729605197906494, "logps/chosen": -169.9842529296875, "logps/rejected": -223.30044555664062, "loss": 0.6626, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.032422084361314774, "rewards/margins": 0.055260200053453445, "rewards/margins_max": 0.07780520617961884, "rewards/margins_min": 0.03271518647670746, "rewards/margins_std": 0.03188345581293106, "rewards/rejected": -0.02283811755478382, "step": 1610 }, { "epoch": 0.51, "grad_norm": 0.37890625, "learning_rate": 2.845390887379706e-07, "logits/chosen": -1.4345109462738037, "logits/rejected": -1.1150403022766113, "logps/chosen": -225.3082275390625, "logps/rejected": -199.63519287109375, "loss": 0.669, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.034576646983623505, "rewards/margins": 0.04932091385126114, "rewards/margins_max": 0.06937690079212189, "rewards/margins_min": 0.029264941811561584, "rewards/margins_std": 0.028363442048430443, "rewards/rejected": -0.014744272455573082, "step": 1620 }, { "epoch": 0.51, "grad_norm": 0.58203125, "learning_rate": 2.8181343176384585e-07, "logits/chosen": -1.2172272205352783, "logits/rejected": -1.0032122135162354, "logps/chosen": -194.42764282226562, "logps/rejected": -336.8403015136719, "loss": 0.662, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.02610202692449093, "rewards/margins": 0.06659840792417526, "rewards/margins_max": 0.09039248526096344, "rewards/margins_min": 0.04280433803796768, "rewards/margins_std": 0.03364989906549454, "rewards/rejected": -0.04049638658761978, "step": 1630 }, { "epoch": 0.52, "grad_norm": 0.408203125, "learning_rate": 2.7908392542129537e-07, "logits/chosen": -1.491234540939331, "logits/rejected": -1.1456706523895264, "logps/chosen": -226.4430694580078, "logps/rejected": -264.64874267578125, "loss": 0.6649, "rewards/accuracies": 1.0, "rewards/chosen": 0.036219272762537, "rewards/margins": 0.06396204233169556, "rewards/margins_max": 0.0869758352637291, "rewards/margins_min": 0.040948253124952316, "rewards/margins_std": 0.032546427100896835, "rewards/rejected": -0.02774277701973915, "step": 1640 }, { "epoch": 0.52, "grad_norm": 0.42578125, "learning_rate": 2.763508999757119e-07, "logits/chosen": -1.4049649238586426, "logits/rejected": -1.239553689956665, "logps/chosen": -215.4875030517578, "logps/rejected": -298.31365966796875, "loss": 0.6683, "rewards/accuracies": 1.0, "rewards/chosen": 0.03639969974756241, "rewards/margins": 0.053609687834978104, "rewards/margins_max": 0.07144194096326828, "rewards/margins_min": 0.035777442157268524, "rewards/margins_std": 0.025218605995178223, "rewards/rejected": -0.017209986224770546, "step": 1650 }, { "epoch": 0.52, "grad_norm": 0.4140625, "learning_rate": 2.7361468611829326e-07, "logits/chosen": -1.4899475574493408, "logits/rejected": -1.128447413444519, "logps/chosen": -200.3207550048828, "logps/rejected": -228.01718139648438, "loss": 0.6621, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03625740110874176, "rewards/margins": 0.0598982498049736, "rewards/margins_max": 0.09039153158664703, "rewards/margins_min": 0.029404977336525917, "rewards/margins_std": 0.043124008923769, "rewards/rejected": -0.02364085428416729, "step": 1660 }, { "epoch": 0.53, "grad_norm": 0.259765625, "learning_rate": 2.708756149260292e-07, "logits/chosen": -1.4126758575439453, "logits/rejected": -1.0123107433319092, "logps/chosen": -235.05734252929688, "logps/rejected": -203.85006713867188, "loss": 0.6662, "rewards/accuracies": 1.0, "rewards/chosen": 0.034583888947963715, "rewards/margins": 0.052448056638240814, "rewards/margins_max": 0.07766715437173843, "rewards/margins_min": 0.027228962630033493, "rewards/margins_std": 0.03566519170999527, "rewards/rejected": -0.0178641676902771, "step": 1670 }, { "epoch": 0.53, "grad_norm": 0.486328125, "learning_rate": 2.681340178216423e-07, "logits/chosen": -1.6247339248657227, "logits/rejected": -1.223256230354309, "logps/chosen": -237.5697784423828, "logps/rejected": -252.75521850585938, "loss": 0.6637, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03515109419822693, "rewards/margins": 0.06105039268732071, "rewards/margins_max": 0.08089162409305573, "rewards/margins_min": 0.041209153831005096, "rewards/margins_std": 0.028059745207428932, "rewards/rejected": -0.02589929662644863, "step": 1680 }, { "epoch": 0.53, "grad_norm": 0.5078125, "learning_rate": 2.6539022653348575e-07, "logits/chosen": -1.3141326904296875, "logits/rejected": -0.9784961938858032, "logps/chosen": -204.03591918945312, "logps/rejected": -265.62591552734375, "loss": 0.664, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03488199785351753, "rewards/margins": 0.06790916621685028, "rewards/margins_max": 0.09974372386932373, "rewards/margins_min": 0.03607460856437683, "rewards/margins_std": 0.04502086713910103, "rewards/rejected": -0.03302717208862305, "step": 1690 }, { "epoch": 0.54, "grad_norm": 0.361328125, "learning_rate": 2.62644573055405e-07, "logits/chosen": -1.527411699295044, "logits/rejected": -1.0853965282440186, "logps/chosen": -193.60665893554688, "logps/rejected": -200.410888671875, "loss": 0.6657, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.037782810628414154, "rewards/margins": 0.06309525668621063, "rewards/margins_max": 0.0949065238237381, "rewards/margins_min": 0.031283993273973465, "rewards/margins_std": 0.04498792067170143, "rewards/rejected": -0.02531243860721588, "step": 1700 }, { "epoch": 0.54, "grad_norm": 0.447265625, "learning_rate": 2.598973896065674e-07, "logits/chosen": -1.1190847158432007, "logits/rejected": -0.9498281478881836, "logps/chosen": -246.06240844726562, "logps/rejected": -278.57708740234375, "loss": 0.6647, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.029680589213967323, "rewards/margins": 0.06372956186532974, "rewards/margins_max": 0.08478715270757675, "rewards/margins_min": 0.04267194867134094, "rewards/margins_std": 0.029779959470033646, "rewards/rejected": -0.03404896706342697, "step": 1710 }, { "epoch": 0.54, "grad_norm": 0.322265625, "learning_rate": 2.571490085912638e-07, "logits/chosen": -1.294392704963684, "logits/rejected": -0.901209831237793, "logps/chosen": -222.6404571533203, "logps/rejected": -221.46646118164062, "loss": 0.666, "rewards/accuracies": 1.0, "rewards/chosen": 0.035451389849185944, "rewards/margins": 0.06020699813961983, "rewards/margins_max": 0.08351422101259232, "rewards/margins_min": 0.036899782717227936, "rewards/margins_std": 0.03296138346195221, "rewards/rejected": -0.024755608290433884, "step": 1720 }, { "epoch": 0.54, "grad_norm": 0.404296875, "learning_rate": 2.5439976255868846e-07, "logits/chosen": -1.3172805309295654, "logits/rejected": -0.9587199091911316, "logps/chosen": -201.81642150878906, "logps/rejected": -264.8630065917969, "loss": 0.6643, "rewards/accuracies": 1.0, "rewards/chosen": 0.02492239698767662, "rewards/margins": 0.06449567526578903, "rewards/margins_max": 0.10114102065563202, "rewards/margins_min": 0.02785031870007515, "rewards/margins_std": 0.051824361085891724, "rewards/rejected": -0.03957327455282211, "step": 1730 }, { "epoch": 0.55, "grad_norm": 0.458984375, "learning_rate": 2.5164998416270137e-07, "logits/chosen": -1.4752823114395142, "logits/rejected": -1.1924030780792236, "logps/chosen": -225.65927124023438, "logps/rejected": -236.69290161132812, "loss": 0.6656, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.025519024580717087, "rewards/margins": 0.05898071080446243, "rewards/margins_max": 0.09197360277175903, "rewards/margins_min": 0.025987815111875534, "rewards/margins_std": 0.04665899649262428, "rewards/rejected": -0.033461686223745346, "step": 1740 }, { "epoch": 0.55, "grad_norm": 0.34765625, "learning_rate": 2.489000061215775e-07, "logits/chosen": -1.3754206895828247, "logits/rejected": -1.0634129047393799, "logps/chosen": -212.5056915283203, "logps/rejected": -217.0105438232422, "loss": 0.6659, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03384281322360039, "rewards/margins": 0.05164814740419388, "rewards/margins_max": 0.0746842697262764, "rewards/margins_min": 0.02861202321946621, "rewards/margins_std": 0.0325779989361763, "rewards/rejected": -0.01780533231794834, "step": 1750 }, { "epoch": 0.55, "grad_norm": 0.365234375, "learning_rate": 2.461501611777483e-07, "logits/chosen": -1.3263044357299805, "logits/rejected": -1.0537205934524536, "logps/chosen": -197.9228973388672, "logps/rejected": -214.32839965820312, "loss": 0.671, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.030653411522507668, "rewards/margins": 0.04631539434194565, "rewards/margins_max": 0.06874962151050568, "rewards/margins_min": 0.023881174623966217, "rewards/margins_std": 0.03172678127884865, "rewards/rejected": -0.01566198468208313, "step": 1760 }, { "epoch": 0.56, "grad_norm": 0.345703125, "learning_rate": 2.4340078205754096e-07, "logits/chosen": -1.4674514532089233, "logits/rejected": -1.0580947399139404, "logps/chosen": -228.774169921875, "logps/rejected": -245.3206329345703, "loss": 0.6632, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03198526054620743, "rewards/margins": 0.060487449169158936, "rewards/margins_max": 0.09252621978521347, "rewards/margins_min": 0.028448667377233505, "rewards/margins_std": 0.045309677720069885, "rewards/rejected": -0.02850218489766121, "step": 1770 }, { "epoch": 0.56, "grad_norm": 0.390625, "learning_rate": 2.406522014309186e-07, "logits/chosen": -1.3413441181182861, "logits/rejected": -1.0260752439498901, "logps/chosen": -217.0348358154297, "logps/rejected": -218.7316436767578, "loss": 0.6673, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03381979838013649, "rewards/margins": 0.05491740256547928, "rewards/margins_max": 0.08126216381788254, "rewards/margins_min": 0.028572645038366318, "rewards/margins_std": 0.037257120013237, "rewards/rejected": -0.021097611635923386, "step": 1780 }, { "epoch": 0.56, "grad_norm": 0.3984375, "learning_rate": 2.3790475187122832e-07, "logits/chosen": -1.3534529209136963, "logits/rejected": -1.0642507076263428, "logps/chosen": -203.16989135742188, "logps/rejected": -185.0489044189453, "loss": 0.6664, "rewards/accuracies": 1.0, "rewards/chosen": 0.03119819238781929, "rewards/margins": 0.058540262281894684, "rewards/margins_max": 0.0830526053905487, "rewards/margins_min": 0.034027911722660065, "rewards/margins_std": 0.034665681421756744, "rewards/rejected": -0.027342066168785095, "step": 1790 }, { "epoch": 0.57, "grad_norm": 0.412109375, "learning_rate": 2.351587658149598e-07, "logits/chosen": -1.453975796699524, "logits/rejected": -0.9396857023239136, "logps/chosen": -307.119140625, "logps/rejected": -293.79193115234375, "loss": 0.6639, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04417193681001663, "rewards/margins": 0.06028149649500847, "rewards/margins_max": 0.08487708121538162, "rewards/margins_min": 0.03568592667579651, "rewards/margins_std": 0.03478339686989784, "rewards/rejected": -0.016109565272927284, "step": 1800 }, { "epoch": 0.57, "grad_norm": 0.39453125, "learning_rate": 2.3241457552152187e-07, "logits/chosen": -1.2886158227920532, "logits/rejected": -0.8535853624343872, "logps/chosen": -255.9151153564453, "logps/rejected": -190.72183227539062, "loss": 0.6651, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0356779471039772, "rewards/margins": 0.06389064341783524, "rewards/margins_max": 0.09344568848609924, "rewards/margins_min": 0.034335602074861526, "rewards/margins_std": 0.04179714247584343, "rewards/rejected": -0.028212696313858032, "step": 1810 }, { "epoch": 0.57, "grad_norm": 0.37890625, "learning_rate": 2.2967251303303876e-07, "logits/chosen": -1.2967920303344727, "logits/rejected": -1.069603443145752, "logps/chosen": -174.32562255859375, "logps/rejected": -198.73556518554688, "loss": 0.6673, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.02710595726966858, "rewards/margins": 0.04836275056004524, "rewards/margins_max": 0.07030778378248215, "rewards/margins_min": 0.02641770802438259, "rewards/margins_std": 0.03103497065603733, "rewards/rejected": -0.021256795153021812, "step": 1820 }, { "epoch": 0.58, "grad_norm": 0.275390625, "learning_rate": 2.2693291013417452e-07, "logits/chosen": -1.3830006122589111, "logits/rejected": -1.131734848022461, "logps/chosen": -196.27232360839844, "logps/rejected": -220.3488311767578, "loss": 0.6668, "rewards/accuracies": 0.875, "rewards/chosen": 0.04211854934692383, "rewards/margins": 0.054630208760499954, "rewards/margins_max": 0.08319230377674103, "rewards/margins_min": 0.026068110018968582, "rewards/margins_std": 0.0403929129242897, "rewards/rejected": -0.012511657550930977, "step": 1830 }, { "epoch": 0.58, "grad_norm": 0.458984375, "learning_rate": 2.2419609831198695e-07, "logits/chosen": -1.314412236213684, "logits/rejected": -1.0906130075454712, "logps/chosen": -202.8844451904297, "logps/rejected": -282.2475280761719, "loss": 0.6657, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.029452290385961533, "rewards/margins": 0.06307311356067657, "rewards/margins_max": 0.08669252693653107, "rewards/margins_min": 0.03945370018482208, "rewards/margins_std": 0.03340289741754532, "rewards/rejected": -0.03362082317471504, "step": 1840 }, { "epoch": 0.58, "grad_norm": 0.4765625, "learning_rate": 2.2146240871581875e-07, "logits/chosen": -1.4870127439498901, "logits/rejected": -1.10221529006958, "logps/chosen": -257.47381591796875, "logps/rejected": -300.7210388183594, "loss": 0.6677, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.048544105142354965, "rewards/margins": 0.06802462041378021, "rewards/margins_max": 0.0930628627538681, "rewards/margins_min": 0.04298638552427292, "rewards/margins_std": 0.035409413278102875, "rewards/rejected": -0.019480522722005844, "step": 1850 }, { "epoch": 0.59, "grad_norm": 0.404296875, "learning_rate": 2.187321721172288e-07, "logits/chosen": -1.2666473388671875, "logits/rejected": -0.9587362408638, "logps/chosen": -202.96151733398438, "logps/rejected": -188.11402893066406, "loss": 0.6625, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.035519860684871674, "rewards/margins": 0.06863918900489807, "rewards/margins_max": 0.10405266284942627, "rewards/margins_min": 0.033225707709789276, "rewards/margins_std": 0.05008222907781601, "rewards/rejected": -0.0331193283200264, "step": 1860 }, { "epoch": 0.59, "grad_norm": 0.306640625, "learning_rate": 2.1600571886996932e-07, "logits/chosen": -1.409246563911438, "logits/rejected": -0.9662661552429199, "logps/chosen": -255.17337036132812, "logps/rejected": -237.2165069580078, "loss": 0.664, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.034895267337560654, "rewards/margins": 0.060319460928440094, "rewards/margins_max": 0.08776156604290009, "rewards/margins_min": 0.032877348363399506, "rewards/margins_std": 0.038808997720479965, "rewards/rejected": -0.025424188002943993, "step": 1870 }, { "epoch": 0.59, "grad_norm": 0.34765625, "learning_rate": 2.1328337887001386e-07, "logits/chosen": -1.3689050674438477, "logits/rejected": -0.9174262881278992, "logps/chosen": -250.42257690429688, "logps/rejected": -213.65115356445312, "loss": 0.6637, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03616097569465637, "rewards/margins": 0.0674886554479599, "rewards/margins_max": 0.09155096858739853, "rewards/margins_min": 0.04342634230852127, "rewards/margins_std": 0.03402925282716751, "rewards/rejected": -0.03132767975330353, "step": 1880 }, { "epoch": 0.6, "grad_norm": 0.55859375, "learning_rate": 2.105654815156406e-07, "logits/chosen": -1.2773230075836182, "logits/rejected": -0.9415411949157715, "logps/chosen": -211.5564727783203, "logps/rejected": -241.5669403076172, "loss": 0.6639, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03792757913470268, "rewards/margins": 0.05873064324259758, "rewards/margins_max": 0.08351272344589233, "rewards/margins_min": 0.03394855558872223, "rewards/margins_std": 0.03504716232419014, "rewards/rejected": -0.0208030603826046, "step": 1890 }, { "epoch": 0.6, "grad_norm": 0.443359375, "learning_rate": 2.0785235566757517e-07, "logits/chosen": -1.5174918174743652, "logits/rejected": -1.0792747735977173, "logps/chosen": -274.3040466308594, "logps/rejected": -269.9195556640625, "loss": 0.6665, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.031049564480781555, "rewards/margins": 0.04908784478902817, "rewards/margins_max": 0.07297800481319427, "rewards/margins_min": 0.02519768849015236, "rewards/margins_std": 0.033785782754421234, "rewards/rejected": -0.01803828403353691, "step": 1900 }, { "epoch": 0.6, "grad_norm": 0.3984375, "learning_rate": 2.0514432960919976e-07, "logits/chosen": -1.3264081478118896, "logits/rejected": -0.8952063322067261, "logps/chosen": -275.90582275390625, "logps/rejected": -227.85183715820312, "loss": 0.6629, "rewards/accuracies": 0.875, "rewards/chosen": 0.035590268671512604, "rewards/margins": 0.05433149263262749, "rewards/margins_max": 0.08622786402702332, "rewards/margins_min": 0.022435134276747704, "rewards/margins_std": 0.04510827362537384, "rewards/rejected": -0.01874123141169548, "step": 1910 }, { "epoch": 0.6, "grad_norm": 0.384765625, "learning_rate": 2.024417310068309e-07, "logits/chosen": -1.3526580333709717, "logits/rejected": -1.0428838729858398, "logps/chosen": -242.9093475341797, "logps/rejected": -221.0670928955078, "loss": 0.6657, "rewards/accuracies": 1.0, "rewards/chosen": 0.03770860657095909, "rewards/margins": 0.06553932279348373, "rewards/margins_max": 0.09631849825382233, "rewards/margins_min": 0.03476015478372574, "rewards/margins_std": 0.04352831840515137, "rewards/rejected": -0.027830716222524643, "step": 1920 }, { "epoch": 0.61, "grad_norm": 0.392578125, "learning_rate": 1.9974488687007272e-07, "logits/chosen": -1.321537733078003, "logits/rejected": -0.9563083648681641, "logps/chosen": -189.53338623046875, "logps/rejected": -208.65695190429688, "loss": 0.6677, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.037261709570884705, "rewards/margins": 0.050910621881484985, "rewards/margins_max": 0.08092696219682693, "rewards/margins_min": 0.020894277840852737, "rewards/margins_std": 0.042449526488780975, "rewards/rejected": -0.01364891231060028, "step": 1930 }, { "epoch": 0.61, "grad_norm": 0.314453125, "learning_rate": 1.9705412351224935e-07, "logits/chosen": -1.341074824333191, "logits/rejected": -1.031362533569336, "logps/chosen": -262.0687561035156, "logps/rejected": -209.2541046142578, "loss": 0.6632, "rewards/accuracies": 1.0, "rewards/chosen": 0.04073809087276459, "rewards/margins": 0.06763813644647598, "rewards/margins_max": 0.10079771280288696, "rewards/margins_min": 0.034478556364774704, "rewards/margins_std": 0.04689472168684006, "rewards/rejected": -0.026900043711066246, "step": 1940 }, { "epoch": 0.61, "grad_norm": 0.482421875, "learning_rate": 1.9436976651092142e-07, "logits/chosen": -1.4449079036712646, "logits/rejected": -1.0441436767578125, "logps/chosen": -323.22515869140625, "logps/rejected": -259.187744140625, "loss": 0.6621, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04286254942417145, "rewards/margins": 0.06869898736476898, "rewards/margins_max": 0.08886998146772385, "rewards/margins_min": 0.048527974635362625, "rewards/margins_std": 0.028526106849312782, "rewards/rejected": -0.025836432352662086, "step": 1950 }, { "epoch": 0.62, "grad_norm": 0.380859375, "learning_rate": 1.9169214066849198e-07, "logits/chosen": -1.3310493230819702, "logits/rejected": -1.0039780139923096, "logps/chosen": -207.80368041992188, "logps/rejected": -217.77279663085938, "loss": 0.6671, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.029799357056617737, "rewards/margins": 0.05040057748556137, "rewards/margins_max": 0.07911469042301178, "rewards/margins_min": 0.021686479449272156, "rewards/margins_std": 0.04060788080096245, "rewards/rejected": -0.02060122787952423, "step": 1960 }, { "epoch": 0.62, "grad_norm": 0.37109375, "learning_rate": 1.890215699729057e-07, "logits/chosen": -1.3599677085876465, "logits/rejected": -0.952431321144104, "logps/chosen": -220.8314971923828, "logps/rejected": -218.5143280029297, "loss": 0.6636, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03455578535795212, "rewards/margins": 0.059498321264982224, "rewards/margins_max": 0.08264943957328796, "rewards/margins_min": 0.03634720668196678, "rewards/margins_std": 0.032740626484155655, "rewards/rejected": -0.024942539632320404, "step": 1970 }, { "epoch": 0.62, "grad_norm": 0.3828125, "learning_rate": 1.8635837755844736e-07, "logits/chosen": -1.5396320819854736, "logits/rejected": -1.1135740280151367, "logps/chosen": -192.1985321044922, "logps/rejected": -189.65496826171875, "loss": 0.6638, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04031743109226227, "rewards/margins": 0.06330820918083191, "rewards/margins_max": 0.09154955297708511, "rewards/margins_min": 0.035066869109869, "rewards/margins_std": 0.03993929177522659, "rewards/rejected": -0.02299078181385994, "step": 1980 }, { "epoch": 0.63, "grad_norm": 0.328125, "learning_rate": 1.837028856666426e-07, "logits/chosen": -1.396333932876587, "logits/rejected": -1.0482286214828491, "logps/chosen": -223.5980987548828, "logps/rejected": -197.462646484375, "loss": 0.6639, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03402381017804146, "rewards/margins": 0.05975471809506416, "rewards/margins_max": 0.09332195669412613, "rewards/margins_min": 0.026187485083937645, "rewards/margins_std": 0.04747123643755913, "rewards/rejected": -0.025730907917022705, "step": 1990 }, { "epoch": 0.63, "grad_norm": 0.37109375, "learning_rate": 1.8105541560726783e-07, "logits/chosen": -1.5116699934005737, "logits/rejected": -1.005076289176941, "logps/chosen": -216.2085418701172, "logps/rejected": -199.5402374267578, "loss": 0.6648, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.037188541144132614, "rewards/margins": 0.05942929536104202, "rewards/margins_max": 0.08686941862106323, "rewards/margins_min": 0.03198916092514992, "rewards/margins_std": 0.038806211203336716, "rewards/rejected": -0.02224075235426426, "step": 2000 }, { "epoch": 0.63, "grad_norm": 0.353515625, "learning_rate": 1.7841628771947186e-07, "logits/chosen": -1.4040260314941406, "logits/rejected": -0.965591549873352, "logps/chosen": -234.39431762695312, "logps/rejected": -202.01571655273438, "loss": 0.6651, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03427529335021973, "rewards/margins": 0.0503767728805542, "rewards/margins_max": 0.07114100456237793, "rewards/margins_min": 0.02961254119873047, "rewards/margins_std": 0.02936505898833275, "rewards/rejected": -0.016101477667689323, "step": 2010 }, { "epoch": 0.64, "grad_norm": 0.3515625, "learning_rate": 1.757858213330157e-07, "logits/chosen": -1.1877460479736328, "logits/rejected": -0.9582545161247253, "logps/chosen": -229.884033203125, "logps/rejected": -281.41351318359375, "loss": 0.6646, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03942031413316727, "rewards/margins": 0.06542594730854034, "rewards/margins_max": 0.09780795872211456, "rewards/margins_min": 0.03304394707083702, "rewards/margins_std": 0.04579506441950798, "rewards/rejected": -0.026005636900663376, "step": 2020 }, { "epoch": 0.64, "grad_norm": 0.462890625, "learning_rate": 1.7316433472963426e-07, "logits/chosen": -1.507406234741211, "logits/rejected": -1.1749341487884521, "logps/chosen": -281.5582580566406, "logps/rejected": -243.66110229492188, "loss": 0.6623, "rewards/accuracies": 1.0, "rewards/chosen": 0.03884587436914444, "rewards/margins": 0.059391576796770096, "rewards/margins_max": 0.08674292266368866, "rewards/margins_min": 0.032040227204561234, "rewards/margins_std": 0.03868064284324646, "rewards/rejected": -0.02054569497704506, "step": 2030 }, { "epoch": 0.64, "grad_norm": 0.357421875, "learning_rate": 1.7055214510452458e-07, "logits/chosen": -1.3578734397888184, "logits/rejected": -0.849805474281311, "logps/chosen": -331.993408203125, "logps/rejected": -279.07733154296875, "loss": 0.6626, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03726685792207718, "rewards/margins": 0.06775570660829544, "rewards/margins_max": 0.09739796817302704, "rewards/margins_min": 0.03811345621943474, "rewards/margins_std": 0.04192047566175461, "rewards/rejected": -0.03048885427415371, "step": 2040 }, { "epoch": 0.65, "grad_norm": 0.3828125, "learning_rate": 1.6794956852796616e-07, "logits/chosen": -1.421799659729004, "logits/rejected": -1.0734702348709106, "logps/chosen": -214.08364868164062, "logps/rejected": -222.42636108398438, "loss": 0.66, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.046803176403045654, "rewards/margins": 0.07683407515287399, "rewards/margins_max": 0.11319296061992645, "rewards/margins_min": 0.040475185960531235, "rewards/margins_std": 0.05141923576593399, "rewards/rejected": -0.03003089688718319, "step": 2050 }, { "epoch": 0.65, "grad_norm": 0.283203125, "learning_rate": 1.653569199070764e-07, "logits/chosen": -1.437723994255066, "logits/rejected": -1.0029988288879395, "logps/chosen": -206.7332000732422, "logps/rejected": -232.79580688476562, "loss": 0.661, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04119989275932312, "rewards/margins": 0.06901855766773224, "rewards/margins_max": 0.10958409309387207, "rewards/margins_min": 0.028453027829527855, "rewards/margins_std": 0.05736833065748215, "rewards/rejected": -0.027818670496344566, "step": 2060 }, { "epoch": 0.65, "grad_norm": 0.46484375, "learning_rate": 1.6277451294770832e-07, "logits/chosen": -1.427294135093689, "logits/rejected": -1.043678641319275, "logps/chosen": -173.60861206054688, "logps/rejected": -159.7396697998047, "loss": 0.663, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03490697965025902, "rewards/margins": 0.05194821208715439, "rewards/margins_max": 0.0755261555314064, "rewards/margins_min": 0.028370272368192673, "rewards/margins_std": 0.03334423899650574, "rewards/rejected": -0.01704123243689537, "step": 2070 }, { "epoch": 0.66, "grad_norm": 0.396484375, "learning_rate": 1.6020266011649176e-07, "logits/chosen": -1.3484151363372803, "logits/rejected": -0.9436542391777039, "logps/chosen": -246.00296020507812, "logps/rejected": -232.9607391357422, "loss": 0.6638, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03393206372857094, "rewards/margins": 0.06296433508396149, "rewards/margins_max": 0.09208185970783234, "rewards/margins_min": 0.03384682536125183, "rewards/margins_std": 0.0411783829331398, "rewards/rejected": -0.029032278805971146, "step": 2080 }, { "epoch": 0.66, "grad_norm": 0.34375, "learning_rate": 1.5764167260302608e-07, "logits/chosen": -1.269598364830017, "logits/rejected": -1.101138949394226, "logps/chosen": -212.4265594482422, "logps/rejected": -261.7176208496094, "loss": 0.6673, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.02871812880039215, "rewards/margins": 0.059116750955581665, "rewards/margins_max": 0.081370510160923, "rewards/margins_min": 0.03686298802495003, "rewards/margins_std": 0.03147156536579132, "rewards/rejected": -0.030398612841963768, "step": 2090 }, { "epoch": 0.66, "grad_norm": 0.369140625, "learning_rate": 1.5509186028222653e-07, "logits/chosen": -1.3609730005264282, "logits/rejected": -0.8888334035873413, "logps/chosen": -240.88809204101562, "logps/rejected": -205.7561492919922, "loss": 0.6634, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.036828793585300446, "rewards/margins": 0.06859615445137024, "rewards/margins_max": 0.09640248119831085, "rewards/margins_min": 0.040789827704429626, "rewards/margins_std": 0.0393240861594677, "rewards/rejected": -0.031767360866069794, "step": 2100 }, { "epoch": 0.66, "grad_norm": 0.390625, "learning_rate": 1.5255353167683017e-07, "logits/chosen": -1.4757276773452759, "logits/rejected": -1.0737035274505615, "logps/chosen": -197.3357696533203, "logps/rejected": -193.08956909179688, "loss": 0.6677, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03849588334560394, "rewards/margins": 0.049884069710969925, "rewards/margins_max": 0.07029401510953903, "rewards/margins_min": 0.02947412058711052, "rewards/margins_std": 0.028864026069641113, "rewards/rejected": -0.01138819195330143, "step": 2110 }, { "epoch": 0.67, "grad_norm": 0.44140625, "learning_rate": 1.500269939200648e-07, "logits/chosen": -1.4104186296463013, "logits/rejected": -1.1364113092422485, "logps/chosen": -180.714111328125, "logps/rejected": -193.08792114257812, "loss": 0.6653, "rewards/accuracies": 1.0, "rewards/chosen": 0.03277165815234184, "rewards/margins": 0.04854750260710716, "rewards/margins_max": 0.07237715274095535, "rewards/margins_min": 0.02471785433590412, "rewards/margins_std": 0.03370020538568497, "rewards/rejected": -0.01577584072947502, "step": 2120 }, { "epoch": 0.67, "grad_norm": 0.384765625, "learning_rate": 1.4751255271848661e-07, "logits/chosen": -1.3990291357040405, "logits/rejected": -1.111859917640686, "logps/chosen": -191.26333618164062, "logps/rejected": -209.4487762451172, "loss": 0.6633, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03463239595293999, "rewards/margins": 0.05561947077512741, "rewards/margins_max": 0.08030703663825989, "rewards/margins_min": 0.03093191422522068, "rewards/margins_std": 0.03491348773241043, "rewards/rejected": -0.02098708227276802, "step": 2130 }, { "epoch": 0.67, "grad_norm": 0.4296875, "learning_rate": 1.450105123149904e-07, "logits/chosen": -1.3517110347747803, "logits/rejected": -0.8976603746414185, "logps/chosen": -236.5410614013672, "logps/rejected": -285.66143798828125, "loss": 0.6594, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04675716906785965, "rewards/margins": 0.0757768452167511, "rewards/margins_max": 0.1105475053191185, "rewards/margins_min": 0.0410061851143837, "rewards/margins_std": 0.04917313903570175, "rewards/rejected": -0.029019678011536598, "step": 2140 }, { "epoch": 0.68, "grad_norm": 0.36328125, "learning_rate": 1.4252117545199638e-07, "logits/chosen": -1.2252193689346313, "logits/rejected": -1.2452119588851929, "logps/chosen": -129.21884155273438, "logps/rejected": -187.29981994628906, "loss": 0.6689, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.02311699464917183, "rewards/margins": 0.053245484828948975, "rewards/margins_max": 0.08089035749435425, "rewards/margins_min": 0.025600602850317955, "rewards/margins_std": 0.03909575939178467, "rewards/rejected": -0.030128484591841698, "step": 2150 }, { "epoch": 0.68, "grad_norm": 0.421875, "learning_rate": 1.400448433348191e-07, "logits/chosen": -1.3551205396652222, "logits/rejected": -1.0361279249191284, "logps/chosen": -181.05245971679688, "logps/rejected": -190.93905639648438, "loss": 0.6667, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.033756453543901443, "rewards/margins": 0.05258417874574661, "rewards/margins_max": 0.08322058618068695, "rewards/margins_min": 0.02194777876138687, "rewards/margins_std": 0.04332640767097473, "rewards/rejected": -0.01882772520184517, "step": 2160 }, { "epoch": 0.68, "grad_norm": 0.3828125, "learning_rate": 1.3758181559522219e-07, "logits/chosen": -1.3742306232452393, "logits/rejected": -1.1042159795761108, "logps/chosen": -195.7826690673828, "logps/rejected": -224.00357055664062, "loss": 0.6649, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.039305493235588074, "rewards/margins": 0.054510366171598434, "rewards/margins_max": 0.07698939740657806, "rewards/margins_min": 0.032031331211328506, "rewards/margins_std": 0.031790152192115784, "rewards/rejected": -0.015204873867332935, "step": 2170 }, { "epoch": 0.69, "grad_norm": 0.490234375, "learning_rate": 1.351323902551631e-07, "logits/chosen": -1.423339605331421, "logits/rejected": -1.0979268550872803, "logps/chosen": -188.20086669921875, "logps/rejected": -208.48483276367188, "loss": 0.6595, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.036755792796611786, "rewards/margins": 0.06653173267841339, "rewards/margins_max": 0.10358710587024689, "rewards/margins_min": 0.029476355761289597, "rewards/margins_std": 0.052404217422008514, "rewards/rejected": -0.029775941744446754, "step": 2180 }, { "epoch": 0.69, "grad_norm": 0.46484375, "learning_rate": 1.3269686369073347e-07, "logits/chosen": -1.4356403350830078, "logits/rejected": -0.9359350204467773, "logps/chosen": -255.5299530029297, "logps/rejected": -220.5718536376953, "loss": 0.663, "rewards/accuracies": 1.0, "rewards/chosen": 0.03550455719232559, "rewards/margins": 0.07294157147407532, "rewards/margins_max": 0.11041506379842758, "rewards/margins_min": 0.03546806797385216, "rewards/margins_std": 0.052995532751083374, "rewards/rejected": -0.037437014281749725, "step": 2190 }, { "epoch": 0.69, "grad_norm": 0.56640625, "learning_rate": 1.3027553059629776e-07, "logits/chosen": -1.270801305770874, "logits/rejected": -0.9209572076797485, "logps/chosen": -203.37147521972656, "logps/rejected": -237.0596160888672, "loss": 0.6625, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03659834340214729, "rewards/margins": 0.07047709822654724, "rewards/margins_max": 0.10837771743535995, "rewards/margins_min": 0.03257646784186363, "rewards/margins_std": 0.05359958857297897, "rewards/rejected": -0.03387875854969025, "step": 2200 }, { "epoch": 0.7, "grad_norm": 0.400390625, "learning_rate": 1.2786868394883615e-07, "logits/chosen": -1.3924726247787476, "logits/rejected": -0.9072662591934204, "logps/chosen": -237.67532348632812, "logps/rejected": -171.44007873535156, "loss": 0.6647, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.03645704686641693, "rewards/margins": 0.04906289279460907, "rewards/margins_max": 0.07811780273914337, "rewards/margins_min": 0.02000797912478447, "rewards/margins_std": 0.041089847683906555, "rewards/rejected": -0.012605843134224415, "step": 2210 }, { "epoch": 0.7, "grad_norm": 0.287109375, "learning_rate": 1.2547661497249423e-07, "logits/chosen": -1.505576491355896, "logits/rejected": -1.0931254625320435, "logps/chosen": -251.4204559326172, "logps/rejected": -184.33187866210938, "loss": 0.6618, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03499855846166611, "rewards/margins": 0.060498736798763275, "rewards/margins_max": 0.09210414439439774, "rewards/margins_min": 0.028893321752548218, "rewards/margins_std": 0.04469680041074753, "rewards/rejected": -0.025500169023871422, "step": 2220 }, { "epoch": 0.7, "grad_norm": 0.369140625, "learning_rate": 1.2309961310334608e-07, "logits/chosen": -1.381753921508789, "logits/rejected": -1.0234613418579102, "logps/chosen": -209.87673950195312, "logps/rejected": -193.29415893554688, "loss": 0.6657, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.036537621170282364, "rewards/margins": 0.05814922973513603, "rewards/margins_max": 0.0908384695649147, "rewards/margins_min": 0.025459999218583107, "rewards/margins_std": 0.04622955992817879, "rewards/rejected": -0.021611608564853668, "step": 2230 }, { "epoch": 0.71, "grad_norm": 0.47265625, "learning_rate": 1.207379659543726e-07, "logits/chosen": -1.5136375427246094, "logits/rejected": -1.0719817876815796, "logps/chosen": -235.4477081298828, "logps/rejected": -190.52899169921875, "loss": 0.6619, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04006263613700867, "rewards/margins": 0.05348850414156914, "rewards/margins_max": 0.07529211789369583, "rewards/margins_min": 0.03168489784002304, "rewards/margins_std": 0.03083496168255806, "rewards/rejected": -0.013425871729850769, "step": 2240 }, { "epoch": 0.71, "grad_norm": 0.48046875, "learning_rate": 1.1839195928066101e-07, "logits/chosen": -1.5472790002822876, "logits/rejected": -1.063508152961731, "logps/chosen": -237.460205078125, "logps/rejected": -203.92752075195312, "loss": 0.6676, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03670421242713928, "rewards/margins": 0.0651477798819542, "rewards/margins_max": 0.09266404807567596, "rewards/margins_min": 0.03763151913881302, "rewards/margins_std": 0.038913875818252563, "rewards/rejected": -0.02844356931746006, "step": 2250 }, { "epoch": 0.71, "grad_norm": 0.4140625, "learning_rate": 1.1606187694482895e-07, "logits/chosen": -1.3274108171463013, "logits/rejected": -1.0006046295166016, "logps/chosen": -341.37298583984375, "logps/rejected": -298.43218994140625, "loss": 0.6606, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03555456921458244, "rewards/margins": 0.0692644715309143, "rewards/margins_max": 0.09813406318426132, "rewards/margins_min": 0.04039488732814789, "rewards/margins_std": 0.040827758610248566, "rewards/rejected": -0.03370990604162216, "step": 2260 }, { "epoch": 0.72, "grad_norm": 0.453125, "learning_rate": 1.1374800088267766e-07, "logits/chosen": -1.3964722156524658, "logits/rejected": -0.8625639081001282, "logps/chosen": -256.6228332519531, "logps/rejected": -204.37188720703125, "loss": 0.6618, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03464067354798317, "rewards/margins": 0.06588082760572433, "rewards/margins_max": 0.09385097026824951, "rewards/margins_min": 0.03791068494319916, "rewards/margins_std": 0.03955575078725815, "rewards/rejected": -0.031240154057741165, "step": 2270 }, { "epoch": 0.72, "grad_norm": 0.392578125, "learning_rate": 1.1145061106907803e-07, "logits/chosen": -1.3579143285751343, "logits/rejected": -1.1530735492706299, "logps/chosen": -213.7913055419922, "logps/rejected": -274.8453674316406, "loss": 0.6626, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.039796892553567886, "rewards/margins": 0.07231783121824265, "rewards/margins_max": 0.1056450754404068, "rewards/margins_min": 0.03899059444665909, "rewards/margins_std": 0.04713182896375656, "rewards/rejected": -0.03252093866467476, "step": 2280 }, { "epoch": 0.72, "grad_norm": 0.34375, "learning_rate": 1.0916998548409447e-07, "logits/chosen": -1.2776060104370117, "logits/rejected": -1.0304553508758545, "logps/chosen": -208.4978790283203, "logps/rejected": -255.5095977783203, "loss": 0.663, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03622225672006607, "rewards/margins": 0.06858749687671661, "rewards/margins_max": 0.09841950237751007, "rewards/margins_min": 0.03875547647476196, "rewards/margins_std": 0.04218883812427521, "rewards/rejected": -0.032365236431360245, "step": 2290 }, { "epoch": 0.72, "grad_norm": 0.5859375, "learning_rate": 1.0690640007934978e-07, "logits/chosen": -1.365751028060913, "logits/rejected": -0.8165037035942078, "logps/chosen": -263.61102294921875, "logps/rejected": -221.7294158935547, "loss": 0.6703, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.036759573966264725, "rewards/margins": 0.057599522173404694, "rewards/margins_max": 0.08515635877847672, "rewards/margins_min": 0.030042681843042374, "rewards/margins_std": 0.03897125646471977, "rewards/rejected": -0.020839953795075417, "step": 2300 }, { "epoch": 0.73, "grad_norm": 0.451171875, "learning_rate": 1.0466012874463507e-07, "logits/chosen": -1.2811259031295776, "logits/rejected": -0.9887920618057251, "logps/chosen": -267.3749694824219, "logps/rejected": -244.70596313476562, "loss": 0.6677, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.038502246141433716, "rewards/margins": 0.060044266283512115, "rewards/margins_max": 0.08822239935398102, "rewards/margins_min": 0.03186614066362381, "rewards/margins_std": 0.039849892258644104, "rewards/rejected": -0.02154202200472355, "step": 2310 }, { "epoch": 0.73, "grad_norm": 0.498046875, "learning_rate": 1.0243144327477013e-07, "logits/chosen": -1.4756540060043335, "logits/rejected": -0.9919270277023315, "logps/chosen": -223.4065704345703, "logps/rejected": -209.97573852539062, "loss": 0.6588, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04816528037190437, "rewards/margins": 0.07530733942985535, "rewards/margins_max": 0.10730701684951782, "rewards/margins_min": 0.04330766201019287, "rewards/margins_std": 0.04525437951087952, "rewards/rejected": -0.027142059057950974, "step": 2320 }, { "epoch": 0.73, "grad_norm": 0.326171875, "learning_rate": 1.0022061333671647e-07, "logits/chosen": -1.3365637063980103, "logits/rejected": -0.9453974962234497, "logps/chosen": -221.6447296142578, "logps/rejected": -205.4340057373047, "loss": 0.6633, "rewards/accuracies": 1.0, "rewards/chosen": 0.043263550847768784, "rewards/margins": 0.07088526338338852, "rewards/margins_max": 0.09706144034862518, "rewards/margins_min": 0.044709086418151855, "rewards/margins_std": 0.037018708884716034, "rewards/rejected": -0.027621712535619736, "step": 2330 }, { "epoch": 0.74, "grad_norm": 0.490234375, "learning_rate": 9.802790643694817e-08, "logits/chosen": -1.3576759099960327, "logits/rejected": -1.1886638402938843, "logps/chosen": -196.93856811523438, "logps/rejected": -203.70106506347656, "loss": 0.6661, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03112906776368618, "rewards/margins": 0.05759000778198242, "rewards/margins_max": 0.08904091268777847, "rewards/margins_min": 0.02613910473883152, "rewards/margins_std": 0.044478293508291245, "rewards/rejected": -0.026460934430360794, "step": 2340 }, { "epoch": 0.74, "grad_norm": 0.4609375, "learning_rate": 9.585358788908393e-08, "logits/chosen": -1.386399745941162, "logits/rejected": -1.065953254699707, "logps/chosen": -228.66220092773438, "logps/rejected": -250.2444610595703, "loss": 0.6644, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.030235329642891884, "rewards/margins": 0.05980142205953598, "rewards/margins_max": 0.09089671075344086, "rewards/margins_min": 0.028706133365631104, "rewards/margins_std": 0.043975379317998886, "rewards/rejected": -0.029566094279289246, "step": 2350 }, { "epoch": 0.74, "grad_norm": 0.453125, "learning_rate": 9.36979207817849e-08, "logits/chosen": -1.5047038793563843, "logits/rejected": -1.2480775117874146, "logps/chosen": -239.8202667236328, "logps/rejected": -234.4571533203125, "loss": 0.6693, "rewards/accuracies": 0.875, "rewards/chosen": 0.032885629683732986, "rewards/margins": 0.05206092447042465, "rewards/margins_max": 0.07507045567035675, "rewards/margins_min": 0.029051411896944046, "rewards/margins_std": 0.032540373504161835, "rewards/rejected": -0.019175300374627113, "step": 2360 }, { "epoch": 0.75, "grad_norm": 0.5390625, "learning_rate": 9.156116594692096e-08, "logits/chosen": -1.4589383602142334, "logits/rejected": -0.9495819807052612, "logps/chosen": -231.2331085205078, "logps/rejected": -212.32382202148438, "loss": 0.66, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04424898326396942, "rewards/margins": 0.06708662211894989, "rewards/margins_max": 0.09897585213184357, "rewards/margins_min": 0.03519739955663681, "rewards/margins_std": 0.04509817436337471, "rewards/rejected": -0.02283763512969017, "step": 2370 }, { "epoch": 0.75, "grad_norm": 0.375, "learning_rate": 8.944358192801102e-08, "logits/chosen": -1.4549717903137207, "logits/rejected": -0.9532996416091919, "logps/chosen": -222.93148803710938, "logps/rejected": -191.50634765625, "loss": 0.6578, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.042102448642253876, "rewards/margins": 0.07653506100177765, "rewards/margins_max": 0.11154161393642426, "rewards/margins_min": 0.041528504341840744, "rewards/margins_std": 0.04950674995779991, "rewards/rejected": -0.034432608634233475, "step": 2380 }, { "epoch": 0.75, "grad_norm": 0.3984375, "learning_rate": 8.734542494893954e-08, "logits/chosen": -1.492494821548462, "logits/rejected": -1.2444711923599243, "logps/chosen": -219.95266723632812, "logps/rejected": -268.90484619140625, "loss": 0.6652, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.04024102911353111, "rewards/margins": 0.06013556569814682, "rewards/margins_max": 0.08759422600269318, "rewards/margins_min": 0.03267688676714897, "rewards/margins_std": 0.03883242979645729, "rewards/rejected": -0.01989452913403511, "step": 2390 }, { "epoch": 0.76, "grad_norm": 0.921875, "learning_rate": 8.526694888295355e-08, "logits/chosen": -1.3630679845809937, "logits/rejected": -1.0612514019012451, "logps/chosen": -223.59716796875, "logps/rejected": -237.7313690185547, "loss": 0.6612, "rewards/accuracies": 1.0, "rewards/chosen": 0.03609809651970863, "rewards/margins": 0.06568726152181625, "rewards/margins_max": 0.08733747154474258, "rewards/margins_min": 0.04403705149888992, "rewards/margins_std": 0.03061802126467228, "rewards/rejected": -0.02958916500210762, "step": 2400 }, { "epoch": 0.76, "grad_norm": 0.42578125, "learning_rate": 8.320840522194505e-08, "logits/chosen": -1.3517181873321533, "logits/rejected": -1.105916142463684, "logps/chosen": -233.48831176757812, "logps/rejected": -236.3004913330078, "loss": 0.6624, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03493565320968628, "rewards/margins": 0.061103563755750656, "rewards/margins_max": 0.09413080662488937, "rewards/margins_min": 0.02807632088661194, "rewards/margins_std": 0.046707578003406525, "rewards/rejected": -0.026167908683419228, "step": 2410 }, { "epoch": 0.76, "grad_norm": 0.400390625, "learning_rate": 8.117004304602052e-08, "logits/chosen": -1.4049303531646729, "logits/rejected": -0.988071620464325, "logps/chosen": -274.993896484375, "logps/rejected": -221.76278686523438, "loss": 0.6689, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.037282317876815796, "rewards/margins": 0.049982473254203796, "rewards/margins_max": 0.07475082576274872, "rewards/margins_min": 0.025214115157723427, "rewards/margins_std": 0.035027749836444855, "rewards/rejected": -0.01270015724003315, "step": 2420 }, { "epoch": 0.77, "grad_norm": 0.3359375, "learning_rate": 7.915210899336283e-08, "logits/chosen": -1.5335876941680908, "logits/rejected": -1.1939712762832642, "logps/chosen": -214.1549530029297, "logps/rejected": -259.2396545410156, "loss": 0.6641, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0516619011759758, "rewards/margins": 0.06498047709465027, "rewards/margins_max": 0.09797366708517075, "rewards/margins_min": 0.03198728710412979, "rewards/margins_std": 0.046659428626298904, "rewards/rejected": -0.013318580575287342, "step": 2430 }, { "epoch": 0.77, "grad_norm": 0.50390625, "learning_rate": 7.715484723038837e-08, "logits/chosen": -1.1930948495864868, "logits/rejected": -0.938764750957489, "logps/chosen": -220.21621704101562, "logps/rejected": -254.71420288085938, "loss": 0.6654, "rewards/accuracies": 1.0, "rewards/chosen": 0.027883481234312057, "rewards/margins": 0.060532040894031525, "rewards/margins_max": 0.0814305990934372, "rewards/margins_min": 0.03963347524404526, "rewards/margins_std": 0.029555032029747963, "rewards/rejected": -0.03264855593442917, "step": 2440 }, { "epoch": 0.77, "grad_norm": 0.412109375, "learning_rate": 7.517849942220348e-08, "logits/chosen": -1.288425087928772, "logits/rejected": -0.9016556739807129, "logps/chosen": -207.7607421875, "logps/rejected": -215.00808715820312, "loss": 0.6621, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.033341165632009506, "rewards/margins": 0.05306249111890793, "rewards/margins_max": 0.07986272126436234, "rewards/margins_min": 0.026262247934937477, "rewards/margins_std": 0.03790125995874405, "rewards/rejected": -0.019721319898962975, "step": 2450 }, { "epoch": 0.77, "grad_norm": 0.48828125, "learning_rate": 7.322330470336313e-08, "logits/chosen": -1.3114441633224487, "logits/rejected": -1.1315343379974365, "logps/chosen": -204.32998657226562, "logps/rejected": -197.68760681152344, "loss": 0.6669, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.023209819570183754, "rewards/margins": 0.04553220421075821, "rewards/margins_max": 0.06928315758705139, "rewards/margins_min": 0.021781256422400475, "rewards/margins_std": 0.033588919788599014, "rewards/rejected": -0.022322386503219604, "step": 2460 }, { "epoch": 0.78, "grad_norm": 0.390625, "learning_rate": 7.128949964893646e-08, "logits/chosen": -1.4030101299285889, "logits/rejected": -1.0203847885131836, "logps/chosen": -246.3531951904297, "logps/rejected": -231.5150604248047, "loss": 0.6645, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.041862308979034424, "rewards/margins": 0.06798645108938217, "rewards/margins_max": 0.09953634440898895, "rewards/margins_min": 0.036436546593904495, "rewards/margins_std": 0.04461830109357834, "rewards/rejected": -0.026124143972992897, "step": 2470 }, { "epoch": 0.78, "grad_norm": 0.2890625, "learning_rate": 6.937731824588141e-08, "logits/chosen": -1.3225687742233276, "logits/rejected": -1.2012965679168701, "logps/chosen": -161.27560424804688, "logps/rejected": -162.04849243164062, "loss": 0.6697, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.024264657869935036, "rewards/margins": 0.04363849759101868, "rewards/margins_max": 0.06468725949525833, "rewards/margins_min": 0.02258973941206932, "rewards/margins_std": 0.029767444357275963, "rewards/rejected": -0.01937383972108364, "step": 2480 }, { "epoch": 0.78, "grad_norm": 0.4140625, "learning_rate": 6.74869918647325e-08, "logits/chosen": -1.2273991107940674, "logits/rejected": -0.8869683146476746, "logps/chosen": -242.3751678466797, "logps/rejected": -222.62265014648438, "loss": 0.6686, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04028186947107315, "rewards/margins": 0.05239185690879822, "rewards/margins_max": 0.07009953260421753, "rewards/margins_min": 0.03468417376279831, "rewards/margins_std": 0.025042440742254257, "rewards/rejected": -0.012109987437725067, "step": 2490 }, { "epoch": 0.79, "grad_norm": 0.443359375, "learning_rate": 6.56187492316059e-08, "logits/chosen": -1.3965575695037842, "logits/rejected": -0.9450374841690063, "logps/chosen": -220.7981414794922, "logps/rejected": -155.75204467773438, "loss": 0.6613, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.02550414577126503, "rewards/margins": 0.06219423562288284, "rewards/margins_max": 0.08597894012928009, "rewards/margins_min": 0.0384095273911953, "rewards/margins_std": 0.033636655658483505, "rewards/rejected": -0.036690086126327515, "step": 2500 }, { "epoch": 0.79, "grad_norm": 0.447265625, "learning_rate": 6.377281640052357e-08, "logits/chosen": -1.5471882820129395, "logits/rejected": -1.1804416179656982, "logps/chosen": -192.26565551757812, "logps/rejected": -246.218994140625, "loss": 0.6628, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04850774258375168, "rewards/margins": 0.06347064673900604, "rewards/margins_max": 0.10149389505386353, "rewards/margins_min": 0.025447404012084007, "rewards/margins_std": 0.05377299338579178, "rewards/rejected": -0.014962906017899513, "step": 2510 }, { "epoch": 0.79, "grad_norm": 0.40625, "learning_rate": 6.19494167260613e-08, "logits/chosen": -1.425964117050171, "logits/rejected": -1.0960302352905273, "logps/chosen": -184.11727905273438, "logps/rejected": -191.51913452148438, "loss": 0.6597, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0385814905166626, "rewards/margins": 0.061642616987228394, "rewards/margins_max": 0.09265581518411636, "rewards/margins_min": 0.030629415065050125, "rewards/margins_std": 0.04385928437113762, "rewards/rejected": -0.023061122745275497, "step": 2520 }, { "epoch": 0.8, "grad_norm": 0.39453125, "learning_rate": 6.01487708363232e-08, "logits/chosen": -1.4187657833099365, "logits/rejected": -1.0462344884872437, "logps/chosen": -231.49960327148438, "logps/rejected": -250.50973510742188, "loss": 0.6599, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04294178634881973, "rewards/margins": 0.06284011900424957, "rewards/margins_max": 0.0833154022693634, "rewards/margins_min": 0.042364828288555145, "rewards/margins_std": 0.02895643189549446, "rewards/rejected": -0.01989833451807499, "step": 2530 }, { "epoch": 0.8, "grad_norm": 0.52734375, "learning_rate": 5.837109660624606e-08, "logits/chosen": -1.3851536512374878, "logits/rejected": -1.0157699584960938, "logps/chosen": -226.1177978515625, "logps/rejected": -238.81539916992188, "loss": 0.663, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03696604073047638, "rewards/margins": 0.06656143069267273, "rewards/margins_max": 0.0883278027176857, "rewards/margins_min": 0.04479505866765976, "rewards/margins_std": 0.030782291665673256, "rewards/rejected": -0.02959538996219635, "step": 2540 }, { "epoch": 0.8, "grad_norm": 0.3359375, "learning_rate": 5.6616609131236725e-08, "logits/chosen": -1.5234705209732056, "logits/rejected": -1.249939203262329, "logps/chosen": -209.16690063476562, "logps/rejected": -201.7328643798828, "loss": 0.666, "rewards/accuracies": 1.0, "rewards/chosen": 0.042094189673662186, "rewards/margins": 0.05819466710090637, "rewards/margins_max": 0.08667898923158646, "rewards/margins_min": 0.029710358008742332, "rewards/margins_std": 0.040282897651195526, "rewards/rejected": -0.016100479289889336, "step": 2550 }, { "epoch": 0.81, "grad_norm": 0.59765625, "learning_rate": 5.4885520701146324e-08, "logits/chosen": -1.27875816822052, "logits/rejected": -0.9493977427482605, "logps/chosen": -214.4361572265625, "logps/rejected": -233.2643280029297, "loss": 0.6636, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.034729085862636566, "rewards/margins": 0.06826482713222504, "rewards/margins_max": 0.11228573322296143, "rewards/margins_min": 0.024243932217359543, "rewards/margins_std": 0.06225494667887688, "rewards/rejected": -0.03353574126958847, "step": 2560 }, { "epoch": 0.81, "grad_norm": 0.46484375, "learning_rate": 5.3178040774583236e-08, "logits/chosen": -1.4629589319229126, "logits/rejected": -0.9861122965812683, "logps/chosen": -280.67486572265625, "logps/rejected": -271.3564147949219, "loss": 0.6655, "rewards/accuracies": 1.0, "rewards/chosen": 0.03766489028930664, "rewards/margins": 0.06308640539646149, "rewards/margins_max": 0.08921506255865097, "rewards/margins_min": 0.036957744508981705, "rewards/margins_std": 0.03695150464773178, "rewards/rejected": -0.025421511381864548, "step": 2570 }, { "epoch": 0.81, "grad_norm": 0.451171875, "learning_rate": 5.149437595356901e-08, "logits/chosen": -1.3392517566680908, "logits/rejected": -0.9539203643798828, "logps/chosen": -244.0900421142578, "logps/rejected": -216.6325225830078, "loss": 0.6665, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03929731249809265, "rewards/margins": 0.05567679926753044, "rewards/margins_max": 0.08472796529531479, "rewards/margins_min": 0.026625623926520348, "rewards/margins_std": 0.041084565222263336, "rewards/rejected": -0.01637948676943779, "step": 2580 }, { "epoch": 0.82, "grad_norm": 0.490234375, "learning_rate": 4.9834729958540016e-08, "logits/chosen": -1.3185430765151978, "logits/rejected": -0.9537866711616516, "logps/chosen": -255.76937866210938, "logps/rejected": -173.42251586914062, "loss": 0.6619, "rewards/accuracies": 1.0, "rewards/chosen": 0.03467049077153206, "rewards/margins": 0.05125656723976135, "rewards/margins_max": 0.07643640786409378, "rewards/margins_min": 0.026076724752783775, "rewards/margins_std": 0.03560966998338699, "rewards/rejected": -0.016586078330874443, "step": 2590 }, { "epoch": 0.82, "grad_norm": 0.26171875, "learning_rate": 4.8199303603697614e-08, "logits/chosen": -1.4323641061782837, "logits/rejected": -1.1901360750198364, "logps/chosen": -212.28759765625, "logps/rejected": -251.69052124023438, "loss": 0.6676, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03305204585194588, "rewards/margins": 0.04768746346235275, "rewards/margins_max": 0.0705387219786644, "rewards/margins_min": 0.024836191907525063, "rewards/margins_std": 0.03231657296419144, "rewards/rejected": -0.014635416679084301, "step": 2600 }, { "epoch": 0.82, "grad_norm": 0.56640625, "learning_rate": 4.658829477270995e-08, "logits/chosen": -1.4831786155700684, "logits/rejected": -1.0595829486846924, "logps/chosen": -205.73196411132812, "logps/rejected": -281.29119873046875, "loss": 0.6601, "rewards/accuracies": 1.0, "rewards/chosen": 0.04150627925992012, "rewards/margins": 0.06586649268865585, "rewards/margins_max": 0.08460468798875809, "rewards/margins_min": 0.047128308564424515, "rewards/margins_std": 0.026499798521399498, "rewards/rejected": -0.02436022460460663, "step": 2610 }, { "epoch": 0.83, "grad_norm": 0.37109375, "learning_rate": 4.5001898394768336e-08, "logits/chosen": -1.4085218906402588, "logits/rejected": -1.1751958131790161, "logps/chosen": -211.86831665039062, "logps/rejected": -212.71908569335938, "loss": 0.6666, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.02613511122763157, "rewards/margins": 0.05009372904896736, "rewards/margins_max": 0.07305373251438141, "rewards/margins_min": 0.027133729308843613, "rewards/margins_std": 0.03247034177184105, "rewards/rejected": -0.023958619683980942, "step": 2620 }, { "epoch": 0.83, "grad_norm": 0.50390625, "learning_rate": 4.3440306421001324e-08, "logits/chosen": -1.531702995300293, "logits/rejected": -1.2762770652770996, "logps/chosen": -264.6157531738281, "logps/rejected": -239.91134643554688, "loss": 0.6656, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03299673646688461, "rewards/margins": 0.05226144194602966, "rewards/margins_max": 0.08147990703582764, "rewards/margins_min": 0.023042969405651093, "rewards/margins_std": 0.041321154683828354, "rewards/rejected": -0.019264699891209602, "step": 2630 }, { "epoch": 0.83, "grad_norm": 0.3046875, "learning_rate": 4.190370780124863e-08, "logits/chosen": -1.2897651195526123, "logits/rejected": -1.0072309970855713, "logps/chosen": -186.4278564453125, "logps/rejected": -243.1654815673828, "loss": 0.669, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.032657403498888016, "rewards/margins": 0.049304358661174774, "rewards/margins_max": 0.07682739198207855, "rewards/margins_min": 0.021781327202916145, "rewards/margins_std": 0.038923438638448715, "rewards/rejected": -0.01664695516228676, "step": 2640 }, { "epoch": 0.83, "grad_norm": 0.345703125, "learning_rate": 4.0392288461199045e-08, "logits/chosen": -1.2460219860076904, "logits/rejected": -1.0387169122695923, "logps/chosen": -224.2480926513672, "logps/rejected": -217.1114959716797, "loss": 0.6665, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.033198267221450806, "rewards/margins": 0.061884719878435135, "rewards/margins_max": 0.09340154379606247, "rewards/margins_min": 0.0303678959608078, "rewards/margins_std": 0.04457152262330055, "rewards/rejected": -0.02868645451962948, "step": 2650 }, { "epoch": 0.84, "grad_norm": 0.37890625, "learning_rate": 3.8906231279893423e-08, "logits/chosen": -1.231979250907898, "logits/rejected": -1.0273730754852295, "logps/chosen": -233.99267578125, "logps/rejected": -187.70278930664062, "loss": 0.6644, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.02670123614370823, "rewards/margins": 0.05759245902299881, "rewards/margins_max": 0.09034743160009384, "rewards/margins_min": 0.024837475270032883, "rewards/margins_std": 0.0463225394487381, "rewards/rejected": -0.03089122101664543, "step": 2660 }, { "epoch": 0.84, "grad_norm": 0.369140625, "learning_rate": 3.74457160675965e-08, "logits/chosen": -1.3447935581207275, "logits/rejected": -1.003073811531067, "logps/chosen": -207.041015625, "logps/rejected": -198.29556274414062, "loss": 0.6667, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03199172765016556, "rewards/margins": 0.05442710965871811, "rewards/margins_max": 0.07237287610769272, "rewards/margins_min": 0.036481358110904694, "rewards/margins_std": 0.025379130616784096, "rewards/rejected": -0.02243538200855255, "step": 2670 }, { "epoch": 0.84, "grad_norm": 0.373046875, "learning_rate": 3.601091954404062e-08, "logits/chosen": -1.2016583681106567, "logits/rejected": -0.9326213002204895, "logps/chosen": -238.39126586914062, "logps/rejected": -243.9576416015625, "loss": 0.6645, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.026809915900230408, "rewards/margins": 0.04789675027132034, "rewards/margins_max": 0.07007952034473419, "rewards/margins_min": 0.025713974609971046, "rewards/margins_std": 0.03137117996811867, "rewards/rejected": -0.021086832508444786, "step": 2680 }, { "epoch": 0.85, "grad_norm": 0.44921875, "learning_rate": 3.460201531704263e-08, "logits/chosen": -1.3697774410247803, "logits/rejected": -0.8151613473892212, "logps/chosen": -393.69189453125, "logps/rejected": -246.65817260742188, "loss": 0.6595, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03834783285856247, "rewards/margins": 0.07543188333511353, "rewards/margins_max": 0.10231365263462067, "rewards/margins_min": 0.04855012148618698, "rewards/margins_std": 0.038016561418771744, "rewards/rejected": -0.037084050476551056, "step": 2690 }, { "epoch": 0.85, "grad_norm": 0.38671875, "learning_rate": 3.321917386149772e-08, "logits/chosen": -1.4533543586730957, "logits/rejected": -1.0557693243026733, "logps/chosen": -209.1657257080078, "logps/rejected": -214.769287109375, "loss": 0.6642, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.042614761739969254, "rewards/margins": 0.05421183258295059, "rewards/margins_max": 0.08117054402828217, "rewards/margins_min": 0.02725311741232872, "rewards/margins_std": 0.038125377148389816, "rewards/rejected": -0.011597072705626488, "step": 2700 }, { "epoch": 0.85, "grad_norm": 0.412109375, "learning_rate": 3.1862562498752354e-08, "logits/chosen": -1.4646778106689453, "logits/rejected": -1.1616142988204956, "logps/chosen": -192.743408203125, "logps/rejected": -208.6314697265625, "loss": 0.6701, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03919973596930504, "rewards/margins": 0.048980120569467545, "rewards/margins_max": 0.06728260964155197, "rewards/margins_min": 0.030677635222673416, "rewards/margins_std": 0.02588362991809845, "rewards/rejected": -0.009780386462807655, "step": 2710 }, { "epoch": 0.86, "grad_norm": 0.4453125, "learning_rate": 3.053234537635857e-08, "logits/chosen": -1.5152153968811035, "logits/rejected": -1.1075925827026367, "logps/chosen": -182.39224243164062, "logps/rejected": -248.19351196289062, "loss": 0.6603, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.038503944873809814, "rewards/margins": 0.06572575867176056, "rewards/margins_max": 0.09247289597988129, "rewards/margins_min": 0.03897860646247864, "rewards/margins_std": 0.03782618045806885, "rewards/rejected": -0.027221810072660446, "step": 2720 }, { "epoch": 0.86, "grad_norm": 0.50390625, "learning_rate": 2.922868344821236e-08, "logits/chosen": -1.3224998712539673, "logits/rejected": -0.881952166557312, "logps/chosen": -220.5806121826172, "logps/rejected": -189.16519165039062, "loss": 0.6628, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03971542418003082, "rewards/margins": 0.06064347177743912, "rewards/margins_max": 0.08912724256515503, "rewards/margins_min": 0.03215969726443291, "rewards/margins_std": 0.04028213769197464, "rewards/rejected": -0.020928047597408295, "step": 2730 }, { "epoch": 0.86, "grad_norm": 0.404296875, "learning_rate": 2.7951734455078786e-08, "logits/chosen": -1.4898918867111206, "logits/rejected": -0.9584072828292847, "logps/chosen": -253.1838836669922, "logps/rejected": -262.40740966796875, "loss": 0.6648, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.04760271683335304, "rewards/margins": 0.06493322551250458, "rewards/margins_max": 0.088630810379982, "rewards/margins_min": 0.041235629469156265, "rewards/margins_std": 0.03351346030831337, "rewards/rejected": -0.017330504953861237, "step": 2740 }, { "epoch": 0.87, "grad_norm": 0.41796875, "learning_rate": 2.670165290550544e-08, "logits/chosen": -1.386683464050293, "logits/rejected": -0.9467649459838867, "logps/chosen": -209.0247802734375, "logps/rejected": -213.07766723632812, "loss": 0.6661, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03524526581168175, "rewards/margins": 0.05571124702692032, "rewards/margins_max": 0.0755188837647438, "rewards/margins_min": 0.03590361401438713, "rewards/margins_std": 0.028012219816446304, "rewards/rejected": -0.02046598121523857, "step": 2750 }, { "epoch": 0.87, "grad_norm": 0.380859375, "learning_rate": 2.5478590057127268e-08, "logits/chosen": -1.4220774173736572, "logits/rejected": -1.0289338827133179, "logps/chosen": -211.04776000976562, "logps/rejected": -194.3659210205078, "loss": 0.6609, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03885051980614662, "rewards/margins": 0.06495725363492966, "rewards/margins_max": 0.09185833483934402, "rewards/margins_min": 0.038056183606386185, "rewards/margins_std": 0.03804386407136917, "rewards/rejected": -0.026106741279363632, "step": 2760 }, { "epoch": 0.87, "grad_norm": 0.38671875, "learning_rate": 2.4282693898364432e-08, "logits/chosen": -1.4226223230361938, "logits/rejected": -0.9696500897407532, "logps/chosen": -176.65994262695312, "logps/rejected": -180.49554443359375, "loss": 0.6622, "rewards/accuracies": 1.0, "rewards/chosen": 0.04054059833288193, "rewards/margins": 0.06953348219394684, "rewards/margins_max": 0.09578864276409149, "rewards/margins_min": 0.04327831417322159, "rewards/margins_std": 0.03713040426373482, "rewards/rejected": -0.02899288199841976, "step": 2770 }, { "epoch": 0.88, "grad_norm": 0.328125, "learning_rate": 2.3114109130516424e-08, "logits/chosen": -1.3210171461105347, "logits/rejected": -0.9485718607902527, "logps/chosen": -182.39852905273438, "logps/rejected": -210.02572631835938, "loss": 0.6639, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03208126500248909, "rewards/margins": 0.06662876158952713, "rewards/margins_max": 0.09643807262182236, "rewards/margins_min": 0.036819443106651306, "rewards/margins_std": 0.04215674102306366, "rewards/rejected": -0.03454749658703804, "step": 2780 }, { "epoch": 0.88, "grad_norm": 0.302734375, "learning_rate": 2.1972977150253064e-08, "logits/chosen": -1.5038772821426392, "logits/rejected": -0.935627818107605, "logps/chosen": -247.6013641357422, "logps/rejected": -288.5367431640625, "loss": 0.6649, "rewards/accuracies": 0.875, "rewards/chosen": 0.04114503413438797, "rewards/margins": 0.07020784169435501, "rewards/margins_max": 0.11474663019180298, "rewards/margins_min": 0.025669043883681297, "rewards/margins_std": 0.06298737227916718, "rewards/rejected": -0.02906280755996704, "step": 2790 }, { "epoch": 0.88, "grad_norm": 0.57421875, "learning_rate": 2.085943603250595e-08, "logits/chosen": -1.428411841392517, "logits/rejected": -1.1179401874542236, "logps/chosen": -189.0768585205078, "logps/rejected": -204.92483520507812, "loss": 0.6648, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.037042513489723206, "rewards/margins": 0.06584902107715607, "rewards/margins_max": 0.08738056570291519, "rewards/margins_min": 0.04431745409965515, "rewards/margins_std": 0.03045022115111351, "rewards/rejected": -0.028806498274207115, "step": 2800 }, { "epoch": 0.89, "grad_norm": 0.31640625, "learning_rate": 1.977362051376158e-08, "logits/chosen": -1.4192006587982178, "logits/rejected": -1.046197772026062, "logps/chosen": -207.2356414794922, "logps/rejected": -183.64010620117188, "loss": 0.6652, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.040088996291160583, "rewards/margins": 0.05597255378961563, "rewards/margins_max": 0.08285551518201828, "rewards/margins_min": 0.02908957563340664, "rewards/margins_std": 0.03801826387643814, "rewards/rejected": -0.01588355377316475, "step": 2810 }, { "epoch": 0.89, "grad_norm": 0.416015625, "learning_rate": 1.8715661975758524e-08, "logits/chosen": -1.2246617078781128, "logits/rejected": -1.0061366558074951, "logps/chosen": -167.17034912109375, "logps/rejected": -239.23721313476562, "loss": 0.6592, "rewards/accuracies": 1.0, "rewards/chosen": 0.03663500398397446, "rewards/margins": 0.065810427069664, "rewards/margins_max": 0.09848222881555557, "rewards/margins_min": 0.03313861042261124, "rewards/margins_std": 0.04620492085814476, "rewards/rejected": -0.029175419360399246, "step": 2820 }, { "epoch": 0.89, "grad_norm": 0.427734375, "learning_rate": 1.768568842959037e-08, "logits/chosen": -1.4292596578598022, "logits/rejected": -1.0080540180206299, "logps/chosen": -259.69537353515625, "logps/rejected": -236.2705078125, "loss": 0.6597, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03333950787782669, "rewards/margins": 0.06502407789230347, "rewards/margins_max": 0.08750364929437637, "rewards/margins_min": 0.04254449903964996, "rewards/margins_std": 0.03179091960191727, "rewards/rejected": -0.031684570014476776, "step": 2830 }, { "epoch": 0.89, "grad_norm": 0.41015625, "learning_rate": 1.668382450021666e-08, "logits/chosen": -1.3095591068267822, "logits/rejected": -1.0401207208633423, "logps/chosen": -206.1196746826172, "logps/rejected": -171.01002502441406, "loss": 0.6647, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03997686505317688, "rewards/margins": 0.05088004469871521, "rewards/margins_max": 0.07651884853839874, "rewards/margins_min": 0.025241252034902573, "rewards/margins_std": 0.03625873476266861, "rewards/rejected": -0.010903185233473778, "step": 2840 }, { "epoch": 0.9, "grad_norm": 0.458984375, "learning_rate": 1.571019141138366e-08, "logits/chosen": -1.3637133836746216, "logits/rejected": -1.0843619108200073, "logps/chosen": -171.24868774414062, "logps/rejected": -179.87950134277344, "loss": 0.6668, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.0334320105612278, "rewards/margins": 0.048344530165195465, "rewards/margins_max": 0.06994569301605225, "rewards/margins_min": 0.026743358001112938, "rewards/margins_std": 0.030548665672540665, "rewards/rejected": -0.014912518672645092, "step": 2850 }, { "epoch": 0.9, "grad_norm": 0.515625, "learning_rate": 1.4764906970956142e-08, "logits/chosen": -1.356999397277832, "logits/rejected": -1.0233064889907837, "logps/chosen": -193.38766479492188, "logps/rejected": -196.44386291503906, "loss": 0.6634, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03667105361819267, "rewards/margins": 0.05664552003145218, "rewards/margins_max": 0.07828361541032791, "rewards/margins_min": 0.03500741347670555, "rewards/margins_std": 0.03060089983046055, "rewards/rejected": -0.01997446082532406, "step": 2860 }, { "epoch": 0.9, "grad_norm": 0.455078125, "learning_rate": 1.3848085556663197e-08, "logits/chosen": -1.2966177463531494, "logits/rejected": -0.9208385348320007, "logps/chosen": -267.82086181640625, "logps/rejected": -202.51319885253906, "loss": 0.6643, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.034102655947208405, "rewards/margins": 0.05611242726445198, "rewards/margins_max": 0.08146383613348007, "rewards/margins_min": 0.030761009082198143, "rewards/margins_std": 0.03585231304168701, "rewards/rejected": -0.022009767591953278, "step": 2870 }, { "epoch": 0.91, "grad_norm": 0.396484375, "learning_rate": 1.2959838102258535e-08, "logits/chosen": -1.3745180368423462, "logits/rejected": -1.0097087621688843, "logps/chosen": -287.45062255859375, "logps/rejected": -254.785400390625, "loss": 0.6658, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03240882605314255, "rewards/margins": 0.05045477673411369, "rewards/margins_max": 0.07354002445936203, "rewards/margins_min": 0.027369529008865356, "rewards/margins_std": 0.032647471874952316, "rewards/rejected": -0.018045950680971146, "step": 2880 }, { "epoch": 0.91, "grad_norm": 0.322265625, "learning_rate": 1.2100272084097779e-08, "logits/chosen": -1.323025107383728, "logits/rejected": -1.0186015367507935, "logps/chosen": -183.8828582763672, "logps/rejected": -250.3448486328125, "loss": 0.6619, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.0408027246594429, "rewards/margins": 0.07389940321445465, "rewards/margins_max": 0.10190100967884064, "rewards/margins_min": 0.04589778929948807, "rewards/margins_std": 0.03960026055574417, "rewards/rejected": -0.03309667855501175, "step": 2890 }, { "epoch": 0.91, "grad_norm": 0.373046875, "learning_rate": 1.1269491508133944e-08, "logits/chosen": -1.5226811170578003, "logits/rejected": -0.9228025674819946, "logps/chosen": -312.5696105957031, "logps/rejected": -221.99136352539062, "loss": 0.6626, "rewards/accuracies": 1.0, "rewards/chosen": 0.0407402329146862, "rewards/margins": 0.06274916976690292, "rewards/margins_max": 0.08748480677604675, "rewards/margins_min": 0.038013529032468796, "rewards/margins_std": 0.03498147428035736, "rewards/rejected": -0.02200893685221672, "step": 2900 }, { "epoch": 0.92, "grad_norm": 0.447265625, "learning_rate": 1.0467596897333008e-08, "logits/chosen": -1.3627498149871826, "logits/rejected": -0.8954145312309265, "logps/chosen": -231.42977905273438, "logps/rejected": -222.2968292236328, "loss": 0.6601, "rewards/accuracies": 1.0, "rewards/chosen": 0.04414510354399681, "rewards/margins": 0.06752908229827881, "rewards/margins_max": 0.09981563687324524, "rewards/margins_min": 0.03524252399802208, "rewards/margins_std": 0.04566008597612381, "rewards/rejected": -0.02338396944105625, "step": 2910 }, { "epoch": 0.92, "grad_norm": 0.341796875, "learning_rate": 9.694685279510672e-09, "logits/chosen": -1.3423680067062378, "logits/rejected": -1.2014684677124023, "logps/chosen": -185.5139617919922, "logps/rejected": -232.6142120361328, "loss": 0.6689, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.022014331072568893, "rewards/margins": 0.049973584711551666, "rewards/margins_max": 0.0689433366060257, "rewards/margins_min": 0.031003836542367935, "rewards/margins_std": 0.026827272027730942, "rewards/rejected": -0.027959251776337624, "step": 2920 }, { "epoch": 0.92, "grad_norm": 0.375, "learning_rate": 8.950850175592328e-09, "logits/chosen": -1.4081456661224365, "logits/rejected": -1.0961415767669678, "logps/chosen": -232.86813354492188, "logps/rejected": -269.9138488769531, "loss": 0.6679, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.0401880256831646, "rewards/margins": 0.05734118074178696, "rewards/margins_max": 0.08120250701904297, "rewards/margins_min": 0.03347986191511154, "rewards/margins_std": 0.03374500200152397, "rewards/rejected": -0.01715315505862236, "step": 2930 }, { "epoch": 0.93, "grad_norm": 0.38671875, "learning_rate": 8.236181588297115e-09, "logits/chosen": -1.3293626308441162, "logits/rejected": -0.9906571507453918, "logps/chosen": -256.72100830078125, "logps/rejected": -313.48504638671875, "loss": 0.6643, "rewards/accuracies": 1.0, "rewards/chosen": 0.03118916228413582, "rewards/margins": 0.06911532580852509, "rewards/margins_max": 0.10093537718057632, "rewards/margins_min": 0.03729528561234474, "rewards/margins_std": 0.045000337064266205, "rewards/rejected": -0.037926167249679565, "step": 2940 }, { "epoch": 0.93, "grad_norm": 0.375, "learning_rate": 7.550765991247654e-09, "logits/chosen": -1.3571122884750366, "logits/rejected": -0.9799866676330566, "logps/chosen": -237.0681610107422, "logps/rejected": -217.8629913330078, "loss": 0.6643, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.039660923182964325, "rewards/margins": 0.058103930205106735, "rewards/margins_max": 0.08864767849445343, "rewards/margins_min": 0.02756018377840519, "rewards/margins_std": 0.04319537803530693, "rewards/rejected": -0.01844300702214241, "step": 2950 }, { "epoch": 0.93, "grad_norm": 0.27734375, "learning_rate": 6.894686318507064e-09, "logits/chosen": -1.3770530223846436, "logits/rejected": -1.0678465366363525, "logps/chosen": -207.478759765625, "logps/rejected": -254.6818389892578, "loss": 0.6665, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03722939267754555, "rewards/margins": 0.057834554463624954, "rewards/margins_max": 0.07993746548891068, "rewards/margins_min": 0.035731635987758636, "rewards/margins_std": 0.0312582366168499, "rewards/rejected": -0.020605161786079407, "step": 2960 }, { "epoch": 0.94, "grad_norm": 0.41015625, "learning_rate": 6.268021954544095e-09, "logits/chosen": -1.1451586484909058, "logits/rejected": -0.9856246709823608, "logps/chosen": -198.33804321289062, "logps/rejected": -290.0233459472656, "loss": 0.6659, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03236705809831619, "rewards/margins": 0.06491495668888092, "rewards/margins_max": 0.09603826701641083, "rewards/margins_min": 0.033791638910770416, "rewards/margins_std": 0.044015005230903625, "rewards/rejected": -0.03254788741469383, "step": 2970 }, { "epoch": 0.94, "grad_norm": 0.5078125, "learning_rate": 5.670848724627531e-09, "logits/chosen": -1.4588849544525146, "logits/rejected": -1.0730645656585693, "logps/chosen": -301.3870849609375, "logps/rejected": -199.2222442626953, "loss": 0.6658, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.026702869683504105, "rewards/margins": 0.04165857285261154, "rewards/margins_max": 0.06404221057891846, "rewards/margins_min": 0.01927492953836918, "rewards/margins_std": 0.03165525197982788, "rewards/rejected": -0.014955705031752586, "step": 2980 }, { "epoch": 0.94, "grad_norm": 0.412109375, "learning_rate": 5.103238885651617e-09, "logits/chosen": -1.4286754131317139, "logits/rejected": -0.9818390011787415, "logps/chosen": -238.0774688720703, "logps/rejected": -222.6654815673828, "loss": 0.6611, "rewards/accuracies": 1.0, "rewards/chosen": 0.03790941461920738, "rewards/margins": 0.06597913056612015, "rewards/margins_max": 0.09669280052185059, "rewards/margins_min": 0.03526546061038971, "rewards/margins_std": 0.04343568533658981, "rewards/rejected": -0.028069715946912766, "step": 2990 }, { "epoch": 0.95, "grad_norm": 0.484375, "learning_rate": 4.565261117393249e-09, "logits/chosen": -1.527706503868103, "logits/rejected": -1.1605089902877808, "logps/chosen": -238.7028045654297, "logps/rejected": -198.54071044921875, "loss": 0.6641, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.038233425468206406, "rewards/margins": 0.05112460255622864, "rewards/margins_max": 0.072813980281353, "rewards/margins_min": 0.029435228556394577, "rewards/margins_std": 0.03067341446876526, "rewards/rejected": -0.01289118267595768, "step": 3000 }, { "epoch": 0.95, "grad_norm": 0.392578125, "learning_rate": 4.056980514201447e-09, "logits/chosen": -1.3091288805007935, "logits/rejected": -0.9673709869384766, "logps/chosen": -203.36215209960938, "logps/rejected": -215.65908813476562, "loss": 0.662, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.038450248539447784, "rewards/margins": 0.06377027928829193, "rewards/margins_max": 0.09830651432275772, "rewards/margins_min": 0.0292340274900198, "rewards/margins_std": 0.048841629177331924, "rewards/rejected": -0.02532001957297325, "step": 3010 }, { "epoch": 0.95, "grad_norm": 0.357421875, "learning_rate": 3.5784585771215235e-09, "logits/chosen": -1.3335479497909546, "logits/rejected": -0.9828931093215942, "logps/chosen": -176.47000122070312, "logps/rejected": -178.46786499023438, "loss": 0.6696, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.031307101249694824, "rewards/margins": 0.04759521037340164, "rewards/margins_max": 0.0687338337302208, "rewards/margins_min": 0.02645658515393734, "rewards/margins_std": 0.02989453449845314, "rewards/rejected": -0.016288110986351967, "step": 3020 }, { "epoch": 0.95, "grad_norm": 0.390625, "learning_rate": 3.129753206453201e-09, "logits/chosen": -1.4696094989776611, "logits/rejected": -1.032707929611206, "logps/chosen": -234.9283447265625, "logps/rejected": -236.0854949951172, "loss": 0.6626, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0398876890540123, "rewards/margins": 0.0536864697933197, "rewards/margins_max": 0.08105526119470596, "rewards/margins_min": 0.026317689567804337, "rewards/margins_std": 0.03870530426502228, "rewards/rejected": -0.013798783533275127, "step": 3030 }, { "epoch": 0.96, "grad_norm": 0.33203125, "learning_rate": 2.7109186947449348e-09, "logits/chosen": -1.4651210308074951, "logits/rejected": -1.179198980331421, "logps/chosen": -185.0526123046875, "logps/rejected": -206.34677124023438, "loss": 0.6674, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.037171076983213425, "rewards/margins": 0.04990251734852791, "rewards/margins_max": 0.07099257409572601, "rewards/margins_min": 0.0288124717772007, "rewards/margins_std": 0.029825836420059204, "rewards/rejected": -0.012731445021927357, "step": 3040 }, { "epoch": 0.96, "grad_norm": 0.330078125, "learning_rate": 2.322005720224618e-09, "logits/chosen": -1.2301725149154663, "logits/rejected": -0.8613675236701965, "logps/chosen": -176.6241912841797, "logps/rejected": -234.5286407470703, "loss": 0.6633, "rewards/accuracies": 1.0, "rewards/chosen": 0.040943752974271774, "rewards/margins": 0.06620831787586212, "rewards/margins_max": 0.09054501354694366, "rewards/margins_min": 0.04187161475419998, "rewards/margins_std": 0.0344172939658165, "rewards/rejected": -0.02526455745100975, "step": 3050 }, { "epoch": 0.96, "grad_norm": 0.349609375, "learning_rate": 1.9630613406676764e-09, "logits/chosen": -1.3148514032363892, "logits/rejected": -1.1194158792495728, "logps/chosen": -204.06472778320312, "logps/rejected": -175.95155334472656, "loss": 0.6675, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.030322005972266197, "rewards/margins": 0.04166535660624504, "rewards/margins_max": 0.06148039177060127, "rewards/margins_min": 0.021850308403372765, "rewards/margins_std": 0.028022700920701027, "rewards/rejected": -0.011343345046043396, "step": 3060 }, { "epoch": 0.97, "grad_norm": 0.494140625, "learning_rate": 1.6341289877028486e-09, "logits/chosen": -1.2309526205062866, "logits/rejected": -0.9648950695991516, "logps/chosen": -221.1148223876953, "logps/rejected": -218.8831024169922, "loss": 0.6665, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03704274445772171, "rewards/margins": 0.06422804296016693, "rewards/margins_max": 0.09260173887014389, "rewards/margins_min": 0.03585432469844818, "rewards/margins_std": 0.040126487612724304, "rewards/rejected": -0.027185291051864624, "step": 3070 }, { "epoch": 0.97, "grad_norm": 0.41015625, "learning_rate": 1.33524846155747e-09, "logits/chosen": -1.5479004383087158, "logits/rejected": -1.124626874923706, "logps/chosen": -272.0228271484375, "logps/rejected": -232.5234832763672, "loss": 0.6602, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03827238082885742, "rewards/margins": 0.06339852511882782, "rewards/margins_max": 0.08984600752592087, "rewards/margins_min": 0.03695103898644447, "rewards/margins_std": 0.03740239515900612, "rewards/rejected": -0.0251261405646801, "step": 3080 }, { "epoch": 0.97, "grad_norm": 0.4296875, "learning_rate": 1.066455926241383e-09, "logits/chosen": -1.3203740119934082, "logits/rejected": -1.0223264694213867, "logps/chosen": -217.78921508789062, "logps/rejected": -185.77662658691406, "loss": 0.6662, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03560353443026543, "rewards/margins": 0.05814961716532707, "rewards/margins_max": 0.08662423491477966, "rewards/margins_min": 0.029674995690584183, "rewards/margins_std": 0.04026919603347778, "rewards/rejected": -0.022546080872416496, "step": 3090 }, { "epoch": 0.98, "grad_norm": 0.625, "learning_rate": 8.277839051712698e-10, "logits/chosen": -1.2869453430175781, "logits/rejected": -0.9400846362113953, "logps/chosen": -253.38711547851562, "logps/rejected": -252.90274047851562, "loss": 0.6638, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04124082997441292, "rewards/margins": 0.05371633172035217, "rewards/margins_max": 0.08102357387542725, "rewards/margins_min": 0.0264090858399868, "rewards/margins_std": 0.03861827403306961, "rewards/rejected": -0.012475499883294106, "step": 3100 }, { "epoch": 0.98, "grad_norm": 0.55078125, "learning_rate": 6.192612772354944e-10, "logits/chosen": -1.323472499847412, "logits/rejected": -0.9910783767700195, "logps/chosen": -250.5233917236328, "logps/rejected": -254.0393524169922, "loss": 0.6645, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.043138034641742706, "rewards/margins": 0.062495727092027664, "rewards/margins_max": 0.09117720276117325, "rewards/margins_min": 0.03381425514817238, "rewards/margins_std": 0.0405617319047451, "rewards/rejected": -0.019357692450284958, "step": 3110 }, { "epoch": 0.98, "grad_norm": 0.30859375, "learning_rate": 4.4091327329956465e-10, "logits/chosen": -1.3970682621002197, "logits/rejected": -1.0630197525024414, "logps/chosen": -187.95303344726562, "logps/rejected": -180.37051391601562, "loss": 0.6629, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04492129012942314, "rewards/margins": 0.0595441572368145, "rewards/margins_max": 0.08808682858943939, "rewards/margins_min": 0.03100150264799595, "rewards/margins_std": 0.04036542400717735, "rewards/rejected": -0.014622872695326805, "step": 3120 }, { "epoch": 0.99, "grad_norm": 0.44140625, "learning_rate": 2.927614731534356e-10, "logits/chosen": -1.3621008396148682, "logits/rejected": -1.0651832818984985, "logps/chosen": -214.0552520751953, "logps/rejected": -293.3019104003906, "loss": 0.6652, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03748806565999985, "rewards/margins": 0.06025733798742294, "rewards/margins_max": 0.08895647525787354, "rewards/margins_min": 0.031558211892843246, "rewards/margins_std": 0.04058670252561569, "rewards/rejected": -0.022769279778003693, "step": 3130 }, { "epoch": 0.99, "grad_norm": 0.33984375, "learning_rate": 1.7482380290034792e-10, "logits/chosen": -1.4978671073913574, "logits/rejected": -1.0491201877593994, "logps/chosen": -187.7884063720703, "logps/rejected": -193.33639526367188, "loss": 0.6632, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03827610984444618, "rewards/margins": 0.06368110328912735, "rewards/margins_max": 0.08939781039953232, "rewards/margins_min": 0.03796439617872238, "rewards/margins_std": 0.036368921399116516, "rewards/rejected": -0.025404995307326317, "step": 3140 }, { "epoch": 0.99, "grad_norm": 0.490234375, "learning_rate": 8.711453278778535e-11, "logits/chosen": -1.3394626379013062, "logits/rejected": -0.8948138356208801, "logps/chosen": -242.09231567382812, "logps/rejected": -217.08139038085938, "loss": 0.6631, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.035616446286439896, "rewards/margins": 0.06384526938199997, "rewards/margins_max": 0.09082364290952682, "rewards/margins_min": 0.03686688840389252, "rewards/margins_std": 0.03815319389104843, "rewards/rejected": -0.028228823095560074, "step": 3150 }, { "epoch": 1.0, "grad_norm": 0.376953125, "learning_rate": 2.9644275480772416e-11, "logits/chosen": -1.425526738166809, "logits/rejected": -1.098435640335083, "logps/chosen": -208.4182586669922, "logps/rejected": -194.59750366210938, "loss": 0.6706, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03818322345614433, "rewards/margins": 0.04908495396375656, "rewards/margins_max": 0.07002463191747665, "rewards/margins_min": 0.02814526855945587, "rewards/margins_std": 0.029613185673952103, "rewards/rejected": -0.010901734232902527, "step": 3160 }, { "epoch": 1.0, "grad_norm": 0.51171875, "learning_rate": 2.419984777790596e-12, "logits/chosen": -1.3360934257507324, "logits/rejected": -0.8945194482803345, "logps/chosen": -228.0156707763672, "logps/rejected": -237.915283203125, "loss": 0.6624, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.045156557112932205, "rewards/margins": 0.06454546749591827, "rewards/margins_max": 0.10190453380346298, "rewards/margins_min": 0.02718639373779297, "rewards/margins_std": 0.05283369496464729, "rewards/rejected": -0.01938890479505062, "step": 3170 }, { "epoch": 1.0, "eval_logits/chosen": -1.0169051885604858, "eval_logits/rejected": -0.8946173191070557, "eval_logps/chosen": -322.6468811035156, "eval_logps/rejected": -313.6658020019531, "eval_loss": 0.6918271780014038, "eval_rewards/accuracies": 0.5540000200271606, "eval_rewards/chosen": 0.02316886931657791, "eval_rewards/margins": 0.0031846188940107822, "eval_rewards/margins_max": 0.06275644898414612, "eval_rewards/margins_min": -0.059831298887729645, "eval_rewards/margins_std": 0.040721021592617035, "eval_rewards/rejected": 0.019984247162938118, "eval_runtime": 1444.6396, "eval_samples_per_second": 2.769, "eval_steps_per_second": 0.173, "step": 3174 }, { "epoch": 1.0, "step": 3174, "total_flos": 0.0, "train_loss": 0.6703614967006065, "train_runtime": 26793.455, "train_samples_per_second": 0.948, "train_steps_per_second": 0.118 } ], "logging_steps": 10, "max_steps": 3174, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }