diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5773 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9998424948810837, + "eval_steps": 100, + "global_step": 3174, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.322265625, + "learning_rate": 1.5723270440251572e-09, + "logits/chosen": -1.3876760005950928, + "logits/rejected": -1.4584133625030518, + "logps/chosen": -148.11717224121094, + "logps/rejected": -197.28189086914062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/margins_max": 0.0, + "rewards/margins_min": 0.0, + "rewards/margins_std": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 0.474609375, + "learning_rate": 1.5723270440251573e-08, + "logits/chosen": -1.2969517707824707, + "logits/rejected": -1.0069656372070312, + "logps/chosen": -190.4855499267578, + "logps/rejected": -182.0135498046875, + "loss": 0.6929, + "rewards/accuracies": 0.5833333134651184, + "rewards/chosen": 0.0011108842445537448, + "rewards/margins": 0.001312906388193369, + "rewards/margins_max": 0.0032973522320389748, + "rewards/margins_min": -0.0006715393392369151, + "rewards/margins_std": 0.0028064302168786526, + "rewards/rejected": -0.00020202209998387843, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 0.443359375, + "learning_rate": 3.1446540880503146e-08, + "logits/chosen": -1.36593496799469, + "logits/rejected": -1.0528085231781006, + "logps/chosen": -225.4935760498047, + "logps/rejected": -200.0979766845703, + "loss": 0.6933, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 7.484816160285845e-05, + "rewards/margins": -0.0001347160286968574, + "rewards/margins_max": 0.0016663169953972101, + "rewards/margins_min": -0.0019357489654794335, + "rewards/margins_std": 0.0025470454711467028, + "rewards/rejected": 0.00020956425578333437, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.431640625, + "learning_rate": 4.7169811320754715e-08, + "logits/chosen": -1.26302170753479, + "logits/rejected": -0.982827365398407, + "logps/chosen": -180.48269653320312, + "logps/rejected": -184.57960510253906, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0005473994533531368, + "rewards/margins": 0.0005724715883843601, + "rewards/margins_max": 0.002282569883391261, + "rewards/margins_min": -0.0011376264737918973, + "rewards/margins_std": 0.002418444026261568, + "rewards/rejected": -0.0011198710417374969, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 0.33984375, + "learning_rate": 6.289308176100629e-08, + "logits/chosen": -1.4589287042617798, + "logits/rejected": -1.1574287414550781, + "logps/chosen": -225.4607696533203, + "logps/rejected": -276.73675537109375, + "loss": 0.6935, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.00039744950481690466, + "rewards/margins": -0.00030673606670461595, + "rewards/margins_max": 0.0013146628625690937, + "rewards/margins_min": -0.0019281348213553429, + "rewards/margins_std": 0.0022930041886866093, + "rewards/rejected": 0.0007041855715215206, + "step": 40 + }, + { + "epoch": 0.02, + "grad_norm": 0.515625, + "learning_rate": 7.861635220125786e-08, + "logits/chosen": -1.3671009540557861, + "logits/rejected": -0.8631851077079773, + "logps/chosen": -331.6417236328125, + "logps/rejected": -205.7646026611328, + "loss": 0.6932, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.0002600564039312303, + "rewards/margins": -0.0007569913868792355, + "rewards/margins_max": 0.0010670910123735666, + "rewards/margins_min": -0.002581073669716716, + "rewards/margins_std": 0.002579641994088888, + "rewards/rejected": 0.0010170477908104658, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 0.451171875, + "learning_rate": 9.433962264150943e-08, + "logits/chosen": -1.1948202848434448, + "logits/rejected": -1.0117332935333252, + "logps/chosen": -203.6728515625, + "logps/rejected": -264.63153076171875, + "loss": 0.6933, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.000772724102716893, + "rewards/margins": 1.4207902495400049e-05, + "rewards/margins_max": 0.0016571771120652556, + "rewards/margins_min": -0.001628761412575841, + "rewards/margins_std": 0.0023235089611262083, + "rewards/rejected": 0.0007585162529721856, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 0.447265625, + "learning_rate": 1.10062893081761e-07, + "logits/chosen": -1.4154024124145508, + "logits/rejected": -1.0937511920928955, + "logps/chosen": -218.91259765625, + "logps/rejected": -224.9219207763672, + "loss": 0.693, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.00017823689267970622, + "rewards/margins": -0.00020709517411887646, + "rewards/margins_max": 0.0016052055871114135, + "rewards/margins_min": -0.002019395586103201, + "rewards/margins_std": 0.00256298016756773, + "rewards/rejected": 0.00038533215411007404, + "step": 70 + }, + { + "epoch": 0.03, + "grad_norm": 0.5078125, + "learning_rate": 1.2578616352201258e-07, + "logits/chosen": -1.2727240324020386, + "logits/rejected": -0.9936261177062988, + "logps/chosen": -285.10943603515625, + "logps/rejected": -266.4510192871094, + "loss": 0.6929, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0009560451726429164, + "rewards/margins": 0.0006419096025638282, + "rewards/margins_max": 0.0029980712570250034, + "rewards/margins_min": -0.0017142522847279906, + "rewards/margins_std": 0.003332116873934865, + "rewards/rejected": 0.0003141355118714273, + "step": 80 + }, + { + "epoch": 0.03, + "grad_norm": 0.59375, + "learning_rate": 1.4150943396226414e-07, + "logits/chosen": -1.4589568376541138, + "logits/rejected": -1.1692708730697632, + "logps/chosen": -212.2246551513672, + "logps/rejected": -219.21646118164062, + "loss": 0.6927, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0005518359830603004, + "rewards/margins": 0.001025562291033566, + "rewards/margins_max": 0.0023937453515827656, + "rewards/margins_min": -0.0003426209441386163, + "rewards/margins_std": 0.001934903091751039, + "rewards/rejected": -0.00047372624976560473, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 0.3359375, + "learning_rate": 1.5723270440251572e-07, + "logits/chosen": -1.405853033065796, + "logits/rejected": -0.9023151397705078, + "logps/chosen": -257.5167236328125, + "logps/rejected": -205.4027862548828, + "loss": 0.6925, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0005634050467051566, + "rewards/margins": 0.001445975387468934, + "rewards/margins_max": 0.0039651584811508656, + "rewards/margins_min": -0.001073207939043641, + "rewards/margins_std": 0.0035626632161438465, + "rewards/rejected": -0.0008825702825561166, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 0.392578125, + "learning_rate": 1.7295597484276728e-07, + "logits/chosen": -1.2503092288970947, + "logits/rejected": -0.9771049618721008, + "logps/chosen": -230.6888427734375, + "logps/rejected": -189.9393310546875, + "loss": 0.6924, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.000937645963858813, + "rewards/margins": 0.001897513517178595, + "rewards/margins_max": 0.0035337067674845457, + "rewards/margins_min": 0.00026132012135349214, + "rewards/margins_std": 0.0023139265831559896, + "rewards/rejected": -0.0009598674369044602, + "step": 110 + }, + { + "epoch": 0.04, + "grad_norm": 0.453125, + "learning_rate": 1.8867924528301886e-07, + "logits/chosen": -1.413317084312439, + "logits/rejected": -1.0483345985412598, + "logps/chosen": -195.40811157226562, + "logps/rejected": -186.1103515625, + "loss": 0.6928, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0013014640426263213, + "rewards/margins": 0.0010003356728702784, + "rewards/margins_max": 0.0033742673695087433, + "rewards/margins_min": -0.0013735961401835084, + "rewards/margins_std": 0.003357246518135071, + "rewards/rejected": 0.0003011283988598734, + "step": 120 + }, + { + "epoch": 0.04, + "grad_norm": 0.57421875, + "learning_rate": 2.0440251572327044e-07, + "logits/chosen": -1.1766637563705444, + "logits/rejected": -0.9444602131843567, + "logps/chosen": -219.5814666748047, + "logps/rejected": -248.6021728515625, + "loss": 0.6927, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0013613433111459017, + "rewards/margins": 0.0008185977349057794, + "rewards/margins_max": 0.0035369223915040493, + "rewards/margins_min": -0.0018997270381078124, + "rewards/margins_std": 0.0038442914374172688, + "rewards/rejected": 0.0005427456344477832, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 0.416015625, + "learning_rate": 2.20125786163522e-07, + "logits/chosen": -1.1843626499176025, + "logits/rejected": -0.9615445137023926, + "logps/chosen": -267.6846923828125, + "logps/rejected": -216.41455078125, + "loss": 0.6923, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0015651207650080323, + "rewards/margins": 0.0013728371122851968, + "rewards/margins_max": 0.003893634770065546, + "rewards/margins_min": -0.0011479605454951525, + "rewards/margins_std": 0.0035649463534355164, + "rewards/rejected": 0.00019228360906708986, + "step": 140 + }, + { + "epoch": 0.05, + "grad_norm": 0.314453125, + "learning_rate": 2.3584905660377358e-07, + "logits/chosen": -1.3410217761993408, + "logits/rejected": -0.8768698573112488, + "logps/chosen": -305.0233154296875, + "logps/rejected": -234.83407592773438, + "loss": 0.692, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0015008506597951055, + "rewards/margins": 0.002513662213459611, + "rewards/margins_max": 0.005906062666326761, + "rewards/margins_min": -0.0008787383558228612, + "rewards/margins_std": 0.004797579254955053, + "rewards/rejected": -0.0010128116700798273, + "step": 150 + }, + { + "epoch": 0.05, + "grad_norm": 0.34765625, + "learning_rate": 2.5157232704402517e-07, + "logits/chosen": -1.3618042469024658, + "logits/rejected": -1.1711941957473755, + "logps/chosen": -168.73251342773438, + "logps/rejected": -229.12173461914062, + "loss": 0.6921, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0013464975636452436, + "rewards/margins": 0.0017303951317444444, + "rewards/margins_max": 0.002993419300764799, + "rewards/margins_min": 0.000467371050035581, + "rewards/margins_std": 0.0017861860105767846, + "rewards/rejected": -0.0003838978009298444, + "step": 160 + }, + { + "epoch": 0.05, + "grad_norm": 0.326171875, + "learning_rate": 2.672955974842767e-07, + "logits/chosen": -1.2365471124649048, + "logits/rejected": -0.9317380785942078, + "logps/chosen": -220.8829345703125, + "logps/rejected": -198.69509887695312, + "loss": 0.6921, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.0018165509682148695, + "rewards/margins": 0.0023886661510914564, + "rewards/margins_max": 0.004315841477364302, + "rewards/margins_min": 0.0004614906501956284, + "rewards/margins_std": 0.0027254377491772175, + "rewards/rejected": -0.0005721148918382823, + "step": 170 + }, + { + "epoch": 0.06, + "grad_norm": 0.5625, + "learning_rate": 2.830188679245283e-07, + "logits/chosen": -1.3770387172698975, + "logits/rejected": -1.0459026098251343, + "logps/chosen": -213.62649536132812, + "logps/rejected": -216.0526580810547, + "loss": 0.6916, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0032491025049239397, + "rewards/margins": 0.004016959108412266, + "rewards/margins_max": 0.006775864399969578, + "rewards/margins_min": 0.0012580546317622066, + "rewards/margins_std": 0.0039016795344650745, + "rewards/rejected": -0.0007678564870730042, + "step": 180 + }, + { + "epoch": 0.06, + "grad_norm": 0.43359375, + "learning_rate": 2.9874213836477983e-07, + "logits/chosen": -1.280879259109497, + "logits/rejected": -1.0743911266326904, + "logps/chosen": -196.5890655517578, + "logps/rejected": -215.959228515625, + "loss": 0.6916, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.003082216251641512, + "rewards/margins": 0.0024519709404557943, + "rewards/margins_max": 0.004635250195860863, + "rewards/margins_min": 0.0002686919760890305, + "rewards/margins_std": 0.003087623044848442, + "rewards/rejected": 0.0006302451947703958, + "step": 190 + }, + { + "epoch": 0.06, + "grad_norm": 0.431640625, + "learning_rate": 3.1446540880503144e-07, + "logits/chosen": -1.312201738357544, + "logits/rejected": -1.1042929887771606, + "logps/chosen": -219.0549774169922, + "logps/rejected": -207.017822265625, + "loss": 0.6917, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.001382762915454805, + "rewards/margins": 0.002051582094281912, + "rewards/margins_max": 0.0041840835474431515, + "rewards/margins_min": -8.09192206361331e-05, + "rewards/margins_std": 0.0030158127192407846, + "rewards/rejected": -0.0006688194698654115, + "step": 200 + }, + { + "epoch": 0.07, + "grad_norm": 0.38671875, + "learning_rate": 3.30188679245283e-07, + "logits/chosen": -1.4287524223327637, + "logits/rejected": -1.090522050857544, + "logps/chosen": -237.6526336669922, + "logps/rejected": -253.42056274414062, + "loss": 0.6913, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.003422607434913516, + "rewards/margins": 0.0037254388444125652, + "rewards/margins_max": 0.007606147322803736, + "rewards/margins_min": -0.00015526966308243573, + "rewards/margins_std": 0.005488150753080845, + "rewards/rejected": -0.00030283164232969284, + "step": 210 + }, + { + "epoch": 0.07, + "grad_norm": 0.3359375, + "learning_rate": 3.4591194968553456e-07, + "logits/chosen": -1.3905646800994873, + "logits/rejected": -1.1243839263916016, + "logps/chosen": -275.39739990234375, + "logps/rejected": -198.20419311523438, + "loss": 0.6913, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0038289937656372786, + "rewards/margins": 0.0037508513778448105, + "rewards/margins_max": 0.00589752709493041, + "rewards/margins_min": 0.0016041755443438888, + "rewards/margins_std": 0.0030358582735061646, + "rewards/rejected": 7.814211130607873e-05, + "step": 220 + }, + { + "epoch": 0.07, + "grad_norm": 0.482421875, + "learning_rate": 3.616352201257861e-07, + "logits/chosen": -1.4099429845809937, + "logits/rejected": -1.1149197816848755, + "logps/chosen": -253.6432647705078, + "logps/rejected": -201.7845458984375, + "loss": 0.6906, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.004002997186034918, + "rewards/margins": 0.004339634440839291, + "rewards/margins_max": 0.0070360577665269375, + "rewards/margins_min": 0.0016432113479822874, + "rewards/margins_std": 0.003813318442553282, + "rewards/rejected": -0.00033663742942735553, + "step": 230 + }, + { + "epoch": 0.08, + "grad_norm": 0.482421875, + "learning_rate": 3.773584905660377e-07, + "logits/chosen": -1.3923314809799194, + "logits/rejected": -1.2636398077011108, + "logps/chosen": -176.70986938476562, + "logps/rejected": -260.1700134277344, + "loss": 0.6905, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.003964459989219904, + "rewards/margins": 0.0052338032983243465, + "rewards/margins_max": 0.008278938010334969, + "rewards/margins_min": 0.0021886671893298626, + "rewards/margins_std": 0.004306471906602383, + "rewards/rejected": -0.0012693424941971898, + "step": 240 + }, + { + "epoch": 0.08, + "grad_norm": 0.392578125, + "learning_rate": 3.9308176100628933e-07, + "logits/chosen": -1.4243371486663818, + "logits/rejected": -1.1771718263626099, + "logps/chosen": -265.36944580078125, + "logps/rejected": -217.3080291748047, + "loss": 0.6903, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.004339687060564756, + "rewards/margins": 0.005566070321947336, + "rewards/margins_max": 0.009654941037297249, + "rewards/margins_min": 0.00147719937376678, + "rewards/margins_std": 0.00578253623098135, + "rewards/rejected": -0.0012263832613825798, + "step": 250 + }, + { + "epoch": 0.08, + "grad_norm": 0.412109375, + "learning_rate": 4.088050314465409e-07, + "logits/chosen": -1.2650946378707886, + "logits/rejected": -0.7829256057739258, + "logps/chosen": -283.19415283203125, + "logps/rejected": -258.4779052734375, + "loss": 0.6898, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.006826590746641159, + "rewards/margins": 0.0073587894439697266, + "rewards/margins_max": 0.012493086978793144, + "rewards/margins_min": 0.002224491210654378, + "rewards/margins_std": 0.007260994054377079, + "rewards/rejected": -0.0005321979406289756, + "step": 260 + }, + { + "epoch": 0.09, + "grad_norm": 0.51953125, + "learning_rate": 4.2452830188679244e-07, + "logits/chosen": -1.379631757736206, + "logits/rejected": -0.836907684803009, + "logps/chosen": -260.239501953125, + "logps/rejected": -241.7003631591797, + "loss": 0.6892, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.006983810570091009, + "rewards/margins": 0.009598185308277607, + "rewards/margins_max": 0.013700554147362709, + "rewards/margins_min": 0.00549581553786993, + "rewards/margins_std": 0.005801626015454531, + "rewards/rejected": -0.002614373806864023, + "step": 270 + }, + { + "epoch": 0.09, + "grad_norm": 0.376953125, + "learning_rate": 4.40251572327044e-07, + "logits/chosen": -1.538783311843872, + "logits/rejected": -1.2011783123016357, + "logps/chosen": -195.48477172851562, + "logps/rejected": -190.34756469726562, + "loss": 0.6903, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.004987453110516071, + "rewards/margins": 0.005685538984835148, + "rewards/margins_max": 0.008754138834774494, + "rewards/margins_min": 0.002616937505081296, + "rewards/margins_std": 0.004339656792581081, + "rewards/rejected": -0.0006980849429965019, + "step": 280 + }, + { + "epoch": 0.09, + "grad_norm": 0.30859375, + "learning_rate": 4.559748427672956e-07, + "logits/chosen": -1.453611135482788, + "logits/rejected": -1.022805094718933, + "logps/chosen": -242.57275390625, + "logps/rejected": -207.9861602783203, + "loss": 0.6885, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.007862430065870285, + "rewards/margins": 0.010466397739946842, + "rewards/margins_max": 0.015998583287000656, + "rewards/margins_min": 0.00493421358987689, + "rewards/margins_std": 0.007823689840734005, + "rewards/rejected": -0.0026039674412459135, + "step": 290 + }, + { + "epoch": 0.09, + "grad_norm": 0.37109375, + "learning_rate": 4.7169811320754717e-07, + "logits/chosen": -1.6172186136245728, + "logits/rejected": -1.1852935552597046, + "logps/chosen": -227.5122528076172, + "logps/rejected": -211.18136596679688, + "loss": 0.6883, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.01041549351066351, + "rewards/margins": 0.009889104403555393, + "rewards/margins_max": 0.014782111160457134, + "rewards/margins_min": 0.004996097646653652, + "rewards/margins_std": 0.006919757463037968, + "rewards/rejected": 0.0005263882922008634, + "step": 300 + }, + { + "epoch": 0.1, + "grad_norm": 0.609375, + "learning_rate": 4.874213836477988e-07, + "logits/chosen": -1.3813108205795288, + "logits/rejected": -1.1312620639801025, + "logps/chosen": -199.9052276611328, + "logps/rejected": -216.5789337158203, + "loss": 0.6888, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.009382456541061401, + "rewards/margins": 0.008607019670307636, + "rewards/margins_max": 0.013246886432170868, + "rewards/margins_min": 0.003967151511460543, + "rewards/margins_std": 0.0065617635846138, + "rewards/rejected": 0.0007754383259452879, + "step": 310 + }, + { + "epoch": 0.1, + "grad_norm": 0.3984375, + "learning_rate": 4.999993950030735e-07, + "logits/chosen": -1.3889760971069336, + "logits/rejected": -1.0410958528518677, + "logps/chosen": -250.23452758789062, + "logps/rejected": -237.1486053466797, + "loss": 0.6888, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.007147905416786671, + "rewards/margins": 0.008209905587136745, + "rewards/margins_max": 0.013320088386535645, + "rewards/margins_min": 0.003099723719060421, + "rewards/margins_std": 0.007226888090372086, + "rewards/rejected": -0.0010620001703500748, + "step": 320 + }, + { + "epoch": 0.1, + "grad_norm": 0.416015625, + "learning_rate": 4.999782204181026e-07, + "logits/chosen": -1.4792320728302002, + "logits/rejected": -0.9951168298721313, + "logps/chosen": -240.000732421875, + "logps/rejected": -256.5224304199219, + "loss": 0.6873, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.011202135123312473, + "rewards/margins": 0.014701342210173607, + "rewards/margins_max": 0.022865889593958855, + "rewards/margins_min": 0.006536795757710934, + "rewards/margins_std": 0.011546412482857704, + "rewards/rejected": -0.003499208018183708, + "step": 330 + }, + { + "epoch": 0.11, + "grad_norm": 0.4453125, + "learning_rate": 4.999267989149139e-07, + "logits/chosen": -1.3123283386230469, + "logits/rejected": -0.9737062454223633, + "logps/chosen": -181.56187438964844, + "logps/rejected": -181.15927124023438, + "loss": 0.6872, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.00947630312293768, + "rewards/margins": 0.010779361240565777, + "rewards/margins_max": 0.014272956177592278, + "rewards/margins_min": 0.007285767234861851, + "rewards/margins_std": 0.004940689541399479, + "rewards/rejected": -0.0013030586997047067, + "step": 340 + }, + { + "epoch": 0.11, + "grad_norm": 0.48828125, + "learning_rate": 4.998451367154173e-07, + "logits/chosen": -1.3401153087615967, + "logits/rejected": -0.9146574139595032, + "logps/chosen": -263.080810546875, + "logps/rejected": -242.10769653320312, + "loss": 0.6867, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.010717710480093956, + "rewards/margins": 0.013646000996232033, + "rewards/margins_max": 0.021306831389665604, + "rewards/margins_min": 0.005985168274492025, + "rewards/margins_std": 0.010834051296114922, + "rewards/rejected": -0.0029282893519848585, + "step": 350 + }, + { + "epoch": 0.11, + "grad_norm": 0.490234375, + "learning_rate": 4.997332437005931e-07, + "logits/chosen": -1.6230709552764893, + "logits/rejected": -1.197361946105957, + "logps/chosen": -276.94305419921875, + "logps/rejected": -222.4744110107422, + "loss": 0.6867, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.01071252953261137, + "rewards/margins": 0.014663497917354107, + "rewards/margins_max": 0.023539308458566666, + "rewards/margins_min": 0.005787692964076996, + "rewards/margins_std": 0.012552286498248577, + "rewards/rejected": -0.003950969781726599, + "step": 360 + }, + { + "epoch": 0.12, + "grad_norm": 0.404296875, + "learning_rate": 4.995911334092962e-07, + "logits/chosen": -1.4035460948944092, + "logits/rejected": -1.0208442211151123, + "logps/chosen": -231.21987915039062, + "logps/rejected": -171.5296630859375, + "loss": 0.6846, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.012600275687873363, + "rewards/margins": 0.01731901802122593, + "rewards/margins_max": 0.023710820823907852, + "rewards/margins_min": 0.010927212424576283, + "rewards/margins_std": 0.0090393777936697, + "rewards/rejected": -0.0047187404707074165, + "step": 370 + }, + { + "epoch": 0.12, + "grad_norm": 0.345703125, + "learning_rate": 4.994188230366183e-07, + "logits/chosen": -1.3101140260696411, + "logits/rejected": -1.0723780393600464, + "logps/chosen": -228.2586669921875, + "logps/rejected": -181.29495239257812, + "loss": 0.6858, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.01025369018316269, + "rewards/margins": 0.014498481526970863, + "rewards/margins_max": 0.023106779903173447, + "rewards/margins_min": 0.005890182219445705, + "rewards/margins_std": 0.012173972092568874, + "rewards/rejected": -0.004244790878146887, + "step": 380 + }, + { + "epoch": 0.12, + "grad_norm": 0.380859375, + "learning_rate": 4.992163334318065e-07, + "logits/chosen": -1.295276165008545, + "logits/rejected": -0.8806821703910828, + "logps/chosen": -266.9888916015625, + "logps/rejected": -213.50936889648438, + "loss": 0.6845, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.012072061188519001, + "rewards/margins": 0.016327695921063423, + "rewards/margins_max": 0.02478429302573204, + "rewards/margins_min": 0.007871100679039955, + "rewards/margins_std": 0.011959430761635303, + "rewards/rejected": -0.004255634266883135, + "step": 390 + }, + { + "epoch": 0.13, + "grad_norm": 0.384765625, + "learning_rate": 4.989836890957414e-07, + "logits/chosen": -1.3160616159439087, + "logits/rejected": -0.985907256603241, + "logps/chosen": -209.94921875, + "logps/rejected": -199.602294921875, + "loss": 0.6861, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.011544553562998772, + "rewards/margins": 0.013081875629723072, + "rewards/margins_max": 0.020108871161937714, + "rewards/margins_min": 0.00605488196015358, + "rewards/margins_std": 0.009937671013176441, + "rewards/rejected": -0.0015373228816315532, + "step": 400 + }, + { + "epoch": 0.13, + "grad_norm": 0.55859375, + "learning_rate": 4.987209181779722e-07, + "logits/chosen": -1.4713616371154785, + "logits/rejected": -1.2096041440963745, + "logps/chosen": -192.724853515625, + "logps/rejected": -176.57815551757812, + "loss": 0.6856, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.012110630050301552, + "rewards/margins": 0.013713735155761242, + "rewards/margins_max": 0.019766617566347122, + "rewards/margins_min": 0.007660853676497936, + "rewards/margins_std": 0.008560067042708397, + "rewards/rejected": -0.0016031056875362992, + "step": 410 + }, + { + "epoch": 0.13, + "grad_norm": 0.390625, + "learning_rate": 4.984280524733107e-07, + "logits/chosen": -1.367755651473999, + "logits/rejected": -0.9895979762077332, + "logps/chosen": -256.28277587890625, + "logps/rejected": -244.4967041015625, + "loss": 0.6819, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.02168741635978222, + "rewards/margins": 0.0215130727738142, + "rewards/margins_max": 0.03101455047726631, + "rewards/margins_min": 0.012011596001684666, + "rewards/margins_std": 0.013437116518616676, + "rewards/rejected": 0.00017434502660762519, + "step": 420 + }, + { + "epoch": 0.14, + "grad_norm": 0.443359375, + "learning_rate": 4.98105127417984e-07, + "logits/chosen": -1.3296594619750977, + "logits/rejected": -1.030011773109436, + "logps/chosen": -259.4856872558594, + "logps/rejected": -251.21728515625, + "loss": 0.6827, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.019032226875424385, + "rewards/margins": 0.019077284261584282, + "rewards/margins_max": 0.032611675560474396, + "rewards/margins_min": 0.005542895756661892, + "rewards/margins_std": 0.019140515476465225, + "rewards/rejected": -4.505945253185928e-05, + "step": 430 + }, + { + "epoch": 0.14, + "grad_norm": 0.462890625, + "learning_rate": 4.97752182085347e-07, + "logits/chosen": -1.513671636581421, + "logits/rejected": -0.9878429174423218, + "logps/chosen": -207.31887817382812, + "logps/rejected": -203.1181640625, + "loss": 0.6832, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.02094658836722374, + "rewards/margins": 0.020975876599550247, + "rewards/margins_max": 0.0333896279335022, + "rewards/margins_min": 0.008562122471630573, + "rewards/margins_std": 0.01755569875240326, + "rewards/rejected": -2.9285531127243303e-05, + "step": 440 + }, + { + "epoch": 0.14, + "grad_norm": 0.490234375, + "learning_rate": 4.973692591811548e-07, + "logits/chosen": -1.3032740354537964, + "logits/rejected": -1.1484416723251343, + "logps/chosen": -212.3832550048828, + "logps/rejected": -230.93881225585938, + "loss": 0.684, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.01571028307080269, + "rewards/margins": 0.018571963533759117, + "rewards/margins_max": 0.028482910245656967, + "rewards/margins_min": 0.008661014959216118, + "rewards/margins_std": 0.014016198925673962, + "rewards/rejected": -0.002861680928617716, + "step": 450 + }, + { + "epoch": 0.14, + "grad_norm": 0.353515625, + "learning_rate": 4.96956405038395e-07, + "logits/chosen": -1.2598702907562256, + "logits/rejected": -0.9527764320373535, + "logps/chosen": -172.1599578857422, + "logps/rejected": -211.24148559570312, + "loss": 0.6821, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.015954652801156044, + "rewards/margins": 0.020714178681373596, + "rewards/margins_max": 0.03062686324119568, + "rewards/margins_min": 0.010801494121551514, + "rewards/margins_std": 0.014018652029335499, + "rewards/rejected": -0.004759527277201414, + "step": 460 + }, + { + "epoch": 0.15, + "grad_norm": 0.40234375, + "learning_rate": 4.965136696116812e-07, + "logits/chosen": -1.3497663736343384, + "logits/rejected": -1.029840111732483, + "logps/chosen": -213.888916015625, + "logps/rejected": -260.24090576171875, + "loss": 0.6804, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.020394446328282356, + "rewards/margins": 0.026438722386956215, + "rewards/margins_max": 0.037151582539081573, + "rewards/margins_min": 0.015725860372185707, + "rewards/margins_std": 0.015150276012718678, + "rewards/rejected": -0.006044276989996433, + "step": 470 + }, + { + "epoch": 0.15, + "grad_norm": 0.515625, + "learning_rate": 4.960411064712094e-07, + "logits/chosen": -1.3540217876434326, + "logits/rejected": -1.0137008428573608, + "logps/chosen": -183.4146270751953, + "logps/rejected": -218.0957489013672, + "loss": 0.6826, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0185568667948246, + "rewards/margins": 0.021527493372559547, + "rewards/margins_max": 0.031103383749723434, + "rewards/margins_min": 0.011951602064073086, + "rewards/margins_std": 0.013542355969548225, + "rewards/rejected": -0.002970626810565591, + "step": 480 + }, + { + "epoch": 0.15, + "grad_norm": 0.431640625, + "learning_rate": 4.955387727962759e-07, + "logits/chosen": -1.469268798828125, + "logits/rejected": -1.1933975219726562, + "logps/chosen": -175.39476013183594, + "logps/rejected": -170.35171508789062, + "loss": 0.6835, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.016227375715970993, + "rewards/margins": 0.018915237858891487, + "rewards/margins_max": 0.028273263946175575, + "rewards/margins_min": 0.009557214565575123, + "rewards/margins_std": 0.013234244659543037, + "rewards/rejected": -0.0026878633070737123, + "step": 490 + }, + { + "epoch": 0.16, + "grad_norm": 0.39453125, + "learning_rate": 4.95006729368358e-07, + "logits/chosen": -1.591016411781311, + "logits/rejected": -1.1849809885025024, + "logps/chosen": -215.30050659179688, + "logps/rejected": -204.9720458984375, + "loss": 0.6813, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.017296748235821724, + "rewards/margins": 0.025146162137389183, + "rewards/margins_max": 0.03591996058821678, + "rewards/margins_min": 0.014372363686561584, + "rewards/margins_std": 0.015236446633934975, + "rewards/rejected": -0.007849409244954586, + "step": 500 + }, + { + "epoch": 0.16, + "grad_norm": 0.36328125, + "learning_rate": 4.944450405637601e-07, + "logits/chosen": -1.3407318592071533, + "logits/rejected": -1.0564701557159424, + "logps/chosen": -208.2605438232422, + "logps/rejected": -194.71420288085938, + "loss": 0.681, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.014876808039844036, + "rewards/margins": 0.02353464625775814, + "rewards/margins_max": 0.03144057095050812, + "rewards/margins_min": 0.015628723427653313, + "rewards/margins_std": 0.011180664412677288, + "rewards/rejected": -0.008657841011881828, + "step": 510 + }, + { + "epoch": 0.16, + "grad_norm": 0.34375, + "learning_rate": 4.938537743458248e-07, + "logits/chosen": -1.3480737209320068, + "logits/rejected": -1.028096318244934, + "logps/chosen": -179.07174682617188, + "logps/rejected": -181.2510223388672, + "loss": 0.684, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.019465144723653793, + "rewards/margins": 0.02149110659956932, + "rewards/margins_max": 0.032975539565086365, + "rewards/margins_min": 0.010006672702729702, + "rewards/margins_std": 0.016241444274783134, + "rewards/rejected": -0.0020259625744074583, + "step": 520 + }, + { + "epoch": 0.17, + "grad_norm": 0.46484375, + "learning_rate": 4.932330022567081e-07, + "logits/chosen": -1.3110549449920654, + "logits/rejected": -1.0896965265274048, + "logps/chosen": -212.4080810546875, + "logps/rejected": -204.01026916503906, + "loss": 0.6808, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.018044626340270042, + "rewards/margins": 0.027044925838708878, + "rewards/margins_max": 0.03969361633062363, + "rewards/margins_min": 0.01439622975885868, + "rewards/margins_std": 0.017887955531477928, + "rewards/rejected": -0.009000294841825962, + "step": 530 + }, + { + "epoch": 0.17, + "grad_norm": 0.4453125, + "learning_rate": 4.925827994087244e-07, + "logits/chosen": -1.467350721359253, + "logits/rejected": -0.9765011072158813, + "logps/chosen": -196.60804748535156, + "logps/rejected": -212.77114868164062, + "loss": 0.6806, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.022185953333973885, + "rewards/margins": 0.024724114686250687, + "rewards/margins_max": 0.03783790022134781, + "rewards/margins_min": 0.011610329151153564, + "rewards/margins_std": 0.018545694649219513, + "rewards/rejected": -0.00253815995529294, + "step": 540 + }, + { + "epoch": 0.17, + "grad_norm": 0.41015625, + "learning_rate": 4.91903244475257e-07, + "logits/chosen": -1.4453445672988892, + "logits/rejected": -1.2255735397338867, + "logps/chosen": -233.5497589111328, + "logps/rejected": -203.3679962158203, + "loss": 0.6799, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.022691726684570312, + "rewards/margins": 0.027249213308095932, + "rewards/margins_max": 0.042755015194416046, + "rewards/margins_min": 0.011743416078388691, + "rewards/margins_std": 0.021928513422608376, + "rewards/rejected": -0.004557489417493343, + "step": 550 + }, + { + "epoch": 0.18, + "grad_norm": 0.326171875, + "learning_rate": 4.91194419681239e-07, + "logits/chosen": -1.4020469188690186, + "logits/rejected": -1.0889606475830078, + "logps/chosen": -201.20901489257812, + "logps/rejected": -204.1788330078125, + "loss": 0.681, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.024152381345629692, + "rewards/margins": 0.02786511741578579, + "rewards/margins_max": 0.042585860937833786, + "rewards/margins_min": 0.013144371099770069, + "rewards/margins_std": 0.020818280056118965, + "rewards/rejected": -0.00371273560449481, + "step": 560 + }, + { + "epoch": 0.18, + "grad_norm": 0.462890625, + "learning_rate": 4.904564107932048e-07, + "logits/chosen": -1.2641432285308838, + "logits/rejected": -0.897659182548523, + "logps/chosen": -271.8118896484375, + "logps/rejected": -239.61300659179688, + "loss": 0.6815, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.017158251255750656, + "rewards/margins": 0.024462290108203888, + "rewards/margins_max": 0.03610853850841522, + "rewards/margins_min": 0.012816043570637703, + "rewards/margins_std": 0.016470283269882202, + "rewards/rejected": -0.007304038852453232, + "step": 570 + }, + { + "epoch": 0.18, + "grad_norm": 0.4296875, + "learning_rate": 4.896893071089115e-07, + "logits/chosen": -1.3425147533416748, + "logits/rejected": -1.0659515857696533, + "logps/chosen": -230.05111694335938, + "logps/rejected": -245.98550415039062, + "loss": 0.6748, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.02490960620343685, + "rewards/margins": 0.0353575199842453, + "rewards/margins_max": 0.05103808641433716, + "rewards/margins_min": 0.019676949828863144, + "rewards/margins_std": 0.022175675258040428, + "rewards/rejected": -0.010447912849485874, + "step": 580 + }, + { + "epoch": 0.19, + "grad_norm": 0.40625, + "learning_rate": 4.888932014465352e-07, + "logits/chosen": -1.313063383102417, + "logits/rejected": -0.9944307208061218, + "logps/chosen": -208.10879516601562, + "logps/rejected": -223.5287322998047, + "loss": 0.679, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.022358160465955734, + "rewards/margins": 0.03080761432647705, + "rewards/margins_max": 0.04008474573493004, + "rewards/margins_min": 0.021530481055378914, + "rewards/margins_std": 0.013119848445057869, + "rewards/rejected": -0.008449452929198742, + "step": 590 + }, + { + "epoch": 0.19, + "grad_norm": 0.5, + "learning_rate": 4.88068190133439e-07, + "logits/chosen": -1.4195083379745483, + "logits/rejected": -1.1984379291534424, + "logps/chosen": -293.54132080078125, + "logps/rejected": -259.38360595703125, + "loss": 0.6771, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.023516178131103516, + "rewards/margins": 0.03416413068771362, + "rewards/margins_max": 0.045261941850185394, + "rewards/margins_min": 0.023066317662596703, + "rewards/margins_std": 0.01569467782974243, + "rewards/rejected": -0.010647954419255257, + "step": 600 + }, + { + "epoch": 0.19, + "grad_norm": 0.40625, + "learning_rate": 4.872143729945184e-07, + "logits/chosen": -1.2229716777801514, + "logits/rejected": -0.8150213360786438, + "logps/chosen": -218.43276977539062, + "logps/rejected": -191.75827026367188, + "loss": 0.6784, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.022476380690932274, + "rewards/margins": 0.03276657313108444, + "rewards/margins_max": 0.046399351209402084, + "rewards/margins_min": 0.019133802503347397, + "rewards/margins_std": 0.019279656931757927, + "rewards/rejected": -0.010290195234119892, + "step": 610 + }, + { + "epoch": 0.2, + "grad_norm": 0.466796875, + "learning_rate": 4.863318533401223e-07, + "logits/chosen": -1.3115172386169434, + "logits/rejected": -0.8752225041389465, + "logps/chosen": -246.44140625, + "logps/rejected": -270.7298278808594, + "loss": 0.6758, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.02260987088084221, + "rewards/margins": 0.034483883529901505, + "rewards/margins_max": 0.053231727331876755, + "rewards/margins_min": 0.015736039727926254, + "rewards/margins_std": 0.02651345357298851, + "rewards/rejected": -0.01187401358038187, + "step": 620 + }, + { + "epoch": 0.2, + "grad_norm": 0.326171875, + "learning_rate": 4.854207379535528e-07, + "logits/chosen": -1.4319788217544556, + "logits/rejected": -1.0323983430862427, + "logps/chosen": -254.2920684814453, + "logps/rejected": -237.34378051757812, + "loss": 0.6786, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.017374712973833084, + "rewards/margins": 0.029332011938095093, + "rewards/margins_max": 0.04456937313079834, + "rewards/margins_min": 0.014094656333327293, + "rewards/margins_std": 0.021548878401517868, + "rewards/rejected": -0.011957301758229733, + "step": 630 + }, + { + "epoch": 0.2, + "grad_norm": 0.404296875, + "learning_rate": 4.844811370781446e-07, + "logits/chosen": -1.4312872886657715, + "logits/rejected": -0.9997726678848267, + "logps/chosen": -244.96224975585938, + "logps/rejected": -225.2017059326172, + "loss": 0.6778, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.030679643154144287, + "rewards/margins": 0.03587724640965462, + "rewards/margins_max": 0.05194888263940811, + "rewards/margins_min": 0.01980561390519142, + "rewards/margins_std": 0.022728722542524338, + "rewards/rejected": -0.00519760325551033, + "step": 640 + }, + { + "epoch": 0.2, + "grad_norm": 0.4453125, + "learning_rate": 4.835131644039251e-07, + "logits/chosen": -1.4758861064910889, + "logits/rejected": -0.9546338319778442, + "logps/chosen": -339.2093505859375, + "logps/rejected": -224.3199920654297, + "loss": 0.6737, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.02631019614636898, + "rewards/margins": 0.04230981320142746, + "rewards/margins_max": 0.05546834319829941, + "rewards/margins_min": 0.02915129065513611, + "rewards/margins_std": 0.01860896497964859, + "rewards/rejected": -0.015999620780348778, + "step": 650 + }, + { + "epoch": 0.21, + "grad_norm": 0.330078125, + "learning_rate": 4.825169370538594e-07, + "logits/chosen": -1.2813438177108765, + "logits/rejected": -1.069059133529663, + "logps/chosen": -231.5976104736328, + "logps/rejected": -245.3614501953125, + "loss": 0.6778, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.02429072931408882, + "rewards/margins": 0.027610447257757187, + "rewards/margins_max": 0.0455966591835022, + "rewards/margins_min": 0.009624237194657326, + "rewards/margins_std": 0.025436347350478172, + "rewards/rejected": -0.003319723065942526, + "step": 660 + }, + { + "epoch": 0.21, + "grad_norm": 0.46484375, + "learning_rate": 4.814925755696778e-07, + "logits/chosen": -1.4551244974136353, + "logits/rejected": -0.9832841157913208, + "logps/chosen": -288.0592346191406, + "logps/rejected": -252.08364868164062, + "loss": 0.6752, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.028888309374451637, + "rewards/margins": 0.03665446117520332, + "rewards/margins_max": 0.05256615951657295, + "rewards/margins_min": 0.020742762833833694, + "rewards/margins_std": 0.022502535954117775, + "rewards/rejected": -0.007766152266412973, + "step": 670 + }, + { + "epoch": 0.21, + "grad_norm": 0.546875, + "learning_rate": 4.804402038972899e-07, + "logits/chosen": -1.4220234155654907, + "logits/rejected": -1.02151358127594, + "logps/chosen": -271.8201599121094, + "logps/rejected": -273.0591735839844, + "loss": 0.6743, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.028314124792814255, + "rewards/margins": 0.03461919724941254, + "rewards/margins_max": 0.0456906296312809, + "rewards/margins_min": 0.02354777231812477, + "rewards/margins_std": 0.015657365322113037, + "rewards/rejected": -0.006305074784904718, + "step": 680 + }, + { + "epoch": 0.22, + "grad_norm": 0.279296875, + "learning_rate": 4.79359949371789e-07, + "logits/chosen": -1.3343526124954224, + "logits/rejected": -0.9362949132919312, + "logps/chosen": -257.0128173828125, + "logps/rejected": -225.06753540039062, + "loss": 0.6765, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.025834929198026657, + "rewards/margins": 0.035724394023418427, + "rewards/margins_max": 0.04703225940465927, + "rewards/margins_min": 0.024416524916887283, + "rewards/margins_std": 0.015991736203432083, + "rewards/rejected": -0.009889459237456322, + "step": 690 + }, + { + "epoch": 0.22, + "grad_norm": 0.41796875, + "learning_rate": 4.782519427020432e-07, + "logits/chosen": -1.3143739700317383, + "logits/rejected": -0.9841324687004089, + "logps/chosen": -204.9932098388672, + "logps/rejected": -218.54141235351562, + "loss": 0.6756, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.016944795846939087, + "rewards/margins": 0.0319121815264225, + "rewards/margins_max": 0.04973548650741577, + "rewards/margins_min": 0.014088879339396954, + "rewards/margins_std": 0.025205958634614944, + "rewards/rejected": -0.014967384748160839, + "step": 700 + }, + { + "epoch": 0.22, + "grad_norm": 0.33203125, + "learning_rate": 4.771163179548808e-07, + "logits/chosen": -1.3899494409561157, + "logits/rejected": -0.9665300250053406, + "logps/chosen": -333.5497131347656, + "logps/rejected": -231.9584503173828, + "loss": 0.6699, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.027130257338285446, + "rewards/margins": 0.04676414281129837, + "rewards/margins_max": 0.06533849239349365, + "rewards/margins_min": 0.028189798817038536, + "rewards/margins_std": 0.026268085464835167, + "rewards/rejected": -0.019633881747722626, + "step": 710 + }, + { + "epoch": 0.23, + "grad_norm": 0.43359375, + "learning_rate": 4.75953212538868e-07, + "logits/chosen": -1.2207629680633545, + "logits/rejected": -0.8575620651245117, + "logps/chosen": -263.4650573730469, + "logps/rejected": -233.98886108398438, + "loss": 0.6717, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03418269008398056, + "rewards/margins": 0.05005268007516861, + "rewards/margins_max": 0.0767994374036789, + "rewards/margins_min": 0.02330590970814228, + "rewards/margins_std": 0.03782564401626587, + "rewards/rejected": -0.0158699844032526, + "step": 720 + }, + { + "epoch": 0.23, + "grad_norm": 0.404296875, + "learning_rate": 4.7476276718768284e-07, + "logits/chosen": -1.434762716293335, + "logits/rejected": -1.0808919668197632, + "logps/chosen": -206.06533813476562, + "logps/rejected": -218.6961669921875, + "loss": 0.6751, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.026135995984077454, + "rewards/margins": 0.03937443345785141, + "rewards/margins_max": 0.05718846991658211, + "rewards/margins_min": 0.021560396999120712, + "rewards/margins_std": 0.025192851200699806, + "rewards/rejected": -0.01323844026774168, + "step": 730 + }, + { + "epoch": 0.23, + "grad_norm": 0.41015625, + "learning_rate": 4.7354512594308654e-07, + "logits/chosen": -1.3616220951080322, + "logits/rejected": -1.1791460514068604, + "logps/chosen": -193.25332641601562, + "logps/rejected": -197.8474578857422, + "loss": 0.6753, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.024115614593029022, + "rewards/margins": 0.03462132811546326, + "rewards/margins_max": 0.052414439618587494, + "rewards/margins_min": 0.016828209161758423, + "rewards/margins_std": 0.02516326680779457, + "rewards/rejected": -0.010505708865821362, + "step": 740 + }, + { + "epoch": 0.24, + "grad_norm": 0.431640625, + "learning_rate": 4.7230043613749527e-07, + "logits/chosen": -1.3196706771850586, + "logits/rejected": -1.0803533792495728, + "logps/chosen": -229.3977813720703, + "logps/rejected": -198.8769989013672, + "loss": 0.6758, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.023436803370714188, + "rewards/margins": 0.03306025639176369, + "rewards/margins_max": 0.05252969264984131, + "rewards/margins_min": 0.013590824790298939, + "rewards/margins_std": 0.027533939108252525, + "rewards/rejected": -0.009623454883694649, + "step": 750 + }, + { + "epoch": 0.24, + "grad_norm": 0.373046875, + "learning_rate": 4.710288483761524e-07, + "logits/chosen": -1.1608425378799438, + "logits/rejected": -0.8409261703491211, + "logps/chosen": -238.56130981445312, + "logps/rejected": -212.3688507080078, + "loss": 0.677, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.01990620419383049, + "rewards/margins": 0.038102246820926666, + "rewards/margins_max": 0.053315240889787674, + "rewards/margins_min": 0.022889258340001106, + "rewards/margins_std": 0.021514419466257095, + "rewards/rejected": -0.018196044489741325, + "step": 760 + }, + { + "epoch": 0.24, + "grad_norm": 0.38671875, + "learning_rate": 4.697305165189062e-07, + "logits/chosen": -1.4269897937774658, + "logits/rejected": -1.0499980449676514, + "logps/chosen": -230.6156768798828, + "logps/rejected": -229.01708984375, + "loss": 0.6741, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.028978174552321434, + "rewards/margins": 0.03824831172823906, + "rewards/margins_max": 0.06103180721402168, + "rewards/margins_min": 0.015464827418327332, + "rewards/margins_std": 0.03222071751952171, + "rewards/rejected": -0.009270140901207924, + "step": 770 + }, + { + "epoch": 0.25, + "grad_norm": 0.546875, + "learning_rate": 4.6840559766159235e-07, + "logits/chosen": -1.3930243253707886, + "logits/rejected": -0.9540492296218872, + "logps/chosen": -225.96084594726562, + "logps/rejected": -237.0554962158203, + "loss": 0.6711, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03488198295235634, + "rewards/margins": 0.05557037144899368, + "rewards/margins_max": 0.07404030859470367, + "rewards/margins_min": 0.0371004194021225, + "rewards/margins_std": 0.026120448485016823, + "rewards/rejected": -0.02068837732076645, + "step": 780 + }, + { + "epoch": 0.25, + "grad_norm": 0.5078125, + "learning_rate": 4.6705425211702656e-07, + "logits/chosen": -1.4000756740570068, + "logits/rejected": -1.1083465814590454, + "logps/chosen": -172.87281799316406, + "logps/rejected": -189.81704711914062, + "loss": 0.6745, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.023823823779821396, + "rewards/margins": 0.0378737710416317, + "rewards/margins_max": 0.05512396618723869, + "rewards/margins_min": 0.020623570308089256, + "rewards/margins_std": 0.02439546398818493, + "rewards/rejected": -0.014049944467842579, + "step": 790 + }, + { + "epoch": 0.25, + "grad_norm": 0.431640625, + "learning_rate": 4.656766433956062e-07, + "logits/chosen": -1.3979090452194214, + "logits/rejected": -0.8946587443351746, + "logps/chosen": -248.97512817382812, + "logps/rejected": -218.88919067382812, + "loss": 0.6695, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03325175121426582, + "rewards/margins": 0.05383073538541794, + "rewards/margins_max": 0.07829690724611282, + "rewards/margins_min": 0.02936457097530365, + "rewards/margins_std": 0.034600384533405304, + "rewards/rejected": -0.020578987896442413, + "step": 800 + }, + { + "epoch": 0.26, + "grad_norm": 0.337890625, + "learning_rate": 4.6427293818552613e-07, + "logits/chosen": -1.4188311100006104, + "logits/rejected": -0.9876410365104675, + "logps/chosen": -234.7069854736328, + "logps/rejected": -182.8696746826172, + "loss": 0.6725, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03687068074941635, + "rewards/margins": 0.04073936119675636, + "rewards/margins_max": 0.05916820093989372, + "rewards/margins_min": 0.022310517728328705, + "rewards/margins_std": 0.026062315329909325, + "rewards/rejected": -0.0038686811458319426, + "step": 810 + }, + { + "epoch": 0.26, + "grad_norm": 0.498046875, + "learning_rate": 4.6284330633260994e-07, + "logits/chosen": -1.3178324699401855, + "logits/rejected": -0.9743862152099609, + "logps/chosen": -202.13705444335938, + "logps/rejected": -199.08094787597656, + "loss": 0.6723, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.0206548273563385, + "rewards/margins": 0.038690946996212006, + "rewards/margins_max": 0.057139646261930466, + "rewards/margins_min": 0.020242247730493546, + "rewards/margins_std": 0.026090402156114578, + "rewards/rejected": -0.018036121502518654, + "step": 820 + }, + { + "epoch": 0.26, + "grad_norm": 0.40625, + "learning_rate": 4.6138792081975844e-07, + "logits/chosen": -1.4049233198165894, + "logits/rejected": -1.0411832332611084, + "logps/chosen": -223.6266632080078, + "logps/rejected": -188.9544219970703, + "loss": 0.6723, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03218904137611389, + "rewards/margins": 0.04215382784605026, + "rewards/margins_max": 0.06759864091873169, + "rewards/margins_min": 0.016709014773368835, + "rewards/margins_std": 0.03598439693450928, + "rewards/rejected": -0.009964784607291222, + "step": 830 + }, + { + "epoch": 0.26, + "grad_norm": 0.41796875, + "learning_rate": 4.599069577460194e-07, + "logits/chosen": -1.4191118478775024, + "logits/rejected": -1.1629408597946167, + "logps/chosen": -280.8072814941406, + "logps/rejected": -243.64645385742188, + "loss": 0.6685, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04105687886476517, + "rewards/margins": 0.05216727405786514, + "rewards/margins_max": 0.07590137422084808, + "rewards/margins_min": 0.028433170169591904, + "rewards/margins_std": 0.033565085381269455, + "rewards/rejected": -0.011110392399132252, + "step": 840 + }, + { + "epoch": 0.27, + "grad_norm": 0.3515625, + "learning_rate": 4.5840059630527985e-07, + "logits/chosen": -1.505789875984192, + "logits/rejected": -1.090831995010376, + "logps/chosen": -203.38735961914062, + "logps/rejected": -218.90292358398438, + "loss": 0.6706, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.035862646996974945, + "rewards/margins": 0.048062682151794434, + "rewards/margins_max": 0.06934330612421036, + "rewards/margins_min": 0.02678206004202366, + "rewards/margins_std": 0.030095338821411133, + "rewards/rejected": -0.01220003329217434, + "step": 850 + }, + { + "epoch": 0.27, + "grad_norm": 0.466796875, + "learning_rate": 4.5686901876458384e-07, + "logits/chosen": -1.4151548147201538, + "logits/rejected": -1.0735548734664917, + "logps/chosen": -211.01199340820312, + "logps/rejected": -224.63619995117188, + "loss": 0.6725, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.02967039868235588, + "rewards/margins": 0.04329541698098183, + "rewards/margins_max": 0.0652666911482811, + "rewards/margins_min": 0.021324139088392258, + "rewards/margins_std": 0.03107207641005516, + "rewards/rejected": -0.013625016435980797, + "step": 860 + }, + { + "epoch": 0.27, + "grad_norm": 0.46484375, + "learning_rate": 4.553124104420784e-07, + "logits/chosen": -1.3255832195281982, + "logits/rejected": -1.1080420017242432, + "logps/chosen": -205.3770751953125, + "logps/rejected": -209.04660034179688, + "loss": 0.6728, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.022777115926146507, + "rewards/margins": 0.0444360189139843, + "rewards/margins_max": 0.06135256215929985, + "rewards/margins_min": 0.02751948870718479, + "rewards/margins_std": 0.023923594504594803, + "rewards/rejected": -0.02165890485048294, + "step": 870 + }, + { + "epoch": 0.28, + "grad_norm": 0.353515625, + "learning_rate": 4.537309596845905e-07, + "logits/chosen": -1.4212206602096558, + "logits/rejected": -1.1468111276626587, + "logps/chosen": -203.2875213623047, + "logps/rejected": -183.04867553710938, + "loss": 0.6704, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03265067934989929, + "rewards/margins": 0.04764062911272049, + "rewards/margins_max": 0.06543318927288055, + "rewards/margins_min": 0.029848068952560425, + "rewards/margins_std": 0.025162484496831894, + "rewards/rejected": -0.014989949762821198, + "step": 880 + }, + { + "epoch": 0.28, + "grad_norm": 0.333984375, + "learning_rate": 4.521248578448373e-07, + "logits/chosen": -1.295290231704712, + "logits/rejected": -1.2244700193405151, + "logps/chosen": -167.2049560546875, + "logps/rejected": -235.2522735595703, + "loss": 0.6736, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.021955791860818863, + "rewards/margins": 0.0307827889919281, + "rewards/margins_max": 0.04658069089055061, + "rewards/margins_min": 0.01498488150537014, + "rewards/margins_std": 0.022341612726449966, + "rewards/rejected": -0.008826995268464088, + "step": 890 + }, + { + "epoch": 0.28, + "grad_norm": 0.392578125, + "learning_rate": 4.504942992582732e-07, + "logits/chosen": -1.2876170873641968, + "logits/rejected": -1.070996642112732, + "logps/chosen": -201.41519165039062, + "logps/rejected": -215.9574737548828, + "loss": 0.6713, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.033751118928194046, + "rewards/margins": 0.03825461119413376, + "rewards/margins_max": 0.05966836214065552, + "rewards/margins_min": 0.016840863972902298, + "rewards/margins_std": 0.030283614993095398, + "rewards/rejected": -0.004503494594246149, + "step": 900 + }, + { + "epoch": 0.29, + "grad_norm": 0.3671875, + "learning_rate": 4.4883948121957483e-07, + "logits/chosen": -1.3818947076797485, + "logits/rejected": -1.1178925037384033, + "logps/chosen": -170.48712158203125, + "logps/rejected": -221.67098999023438, + "loss": 0.6719, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.028661269694566727, + "rewards/margins": 0.04194828122854233, + "rewards/margins_max": 0.06600390374660492, + "rewards/margins_min": 0.017892662435770035, + "rewards/margins_std": 0.03401978313922882, + "rewards/rejected": -0.013287017121911049, + "step": 910 + }, + { + "epoch": 0.29, + "grad_norm": 0.4609375, + "learning_rate": 4.471606039587695e-07, + "logits/chosen": -1.4353498220443726, + "logits/rejected": -1.2498797178268433, + "logps/chosen": -250.303466796875, + "logps/rejected": -252.1241912841797, + "loss": 0.6728, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.030318697914481163, + "rewards/margins": 0.03714519739151001, + "rewards/margins_max": 0.05723171681165695, + "rewards/margins_min": 0.01705867424607277, + "rewards/margins_std": 0.02840663120150566, + "rewards/rejected": -0.006826499011367559, + "step": 920 + }, + { + "epoch": 0.29, + "grad_norm": 0.298828125, + "learning_rate": 4.4545787061700746e-07, + "logits/chosen": -1.4596531391143799, + "logits/rejected": -0.9841306805610657, + "logps/chosen": -191.699462890625, + "logps/rejected": -231.6093292236328, + "loss": 0.6695, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.037762559950351715, + "rewards/margins": 0.05371398851275444, + "rewards/margins_max": 0.07777608931064606, + "rewards/margins_min": 0.029651891440153122, + "rewards/margins_std": 0.03402894735336304, + "rewards/rejected": -0.015951428562402725, + "step": 930 + }, + { + "epoch": 0.3, + "grad_norm": 0.478515625, + "learning_rate": 4.4373148722198183e-07, + "logits/chosen": -1.3031915426254272, + "logits/rejected": -0.9831310510635376, + "logps/chosen": -203.33865356445312, + "logps/rejected": -233.1627197265625, + "loss": 0.6694, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03300677612423897, + "rewards/margins": 0.06004180386662483, + "rewards/margins_max": 0.09207084774971008, + "rewards/margins_min": 0.028012752532958984, + "rewards/margins_std": 0.04529590904712677, + "rewards/rejected": -0.027035022154450417, + "step": 940 + }, + { + "epoch": 0.3, + "grad_norm": 0.498046875, + "learning_rate": 4.4198166266300025e-07, + "logits/chosen": -1.4863415956497192, + "logits/rejected": -1.0757322311401367, + "logps/chosen": -224.2287139892578, + "logps/rejected": -239.1637725830078, + "loss": 0.6657, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.034104812890291214, + "rewards/margins": 0.05569925159215927, + "rewards/margins_max": 0.08411959558725357, + "rewards/margins_min": 0.027278924360871315, + "rewards/margins_std": 0.04019241780042648, + "rewards/rejected": -0.021594444289803505, + "step": 950 + }, + { + "epoch": 0.3, + "grad_norm": 0.39453125, + "learning_rate": 4.402086086657092e-07, + "logits/chosen": -1.5037004947662354, + "logits/rejected": -0.9914538264274597, + "logps/chosen": -196.54397583007812, + "logps/rejected": -211.8581085205078, + "loss": 0.6703, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.030650585889816284, + "rewards/margins": 0.04529775679111481, + "rewards/margins_max": 0.06939564645290375, + "rewards/margins_min": 0.021199876442551613, + "rewards/margins_std": 0.034079547971487045, + "rewards/rejected": -0.014647173695266247, + "step": 960 + }, + { + "epoch": 0.31, + "grad_norm": 0.46875, + "learning_rate": 4.3841253976647584e-07, + "logits/chosen": -1.4153146743774414, + "logits/rejected": -1.0589603185653687, + "logps/chosen": -195.8428192138672, + "logps/rejected": -189.7724609375, + "loss": 0.6679, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03924962133169174, + "rewards/margins": 0.05352933332324028, + "rewards/margins_max": 0.08327177166938782, + "rewards/margins_min": 0.023786883801221848, + "rewards/margins_std": 0.042062170803546906, + "rewards/rejected": -0.014279710128903389, + "step": 970 + }, + { + "epoch": 0.31, + "grad_norm": 0.28125, + "learning_rate": 4.3659367328642917e-07, + "logits/chosen": -1.1924479007720947, + "logits/rejected": -1.0621583461761475, + "logps/chosen": -212.1740264892578, + "logps/rejected": -255.7356414794922, + "loss": 0.6707, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.026019830256700516, + "rewards/margins": 0.039071694016456604, + "rewards/margins_max": 0.05669945478439331, + "rewards/margins_min": 0.021443922072649002, + "rewards/margins_std": 0.024929430335760117, + "rewards/rejected": -0.013051861897110939, + "step": 980 + }, + { + "epoch": 0.31, + "grad_norm": 0.5, + "learning_rate": 4.3475222930516473e-07, + "logits/chosen": -1.3828264474868774, + "logits/rejected": -1.12723708152771, + "logps/chosen": -209.1006622314453, + "logps/rejected": -211.49014282226562, + "loss": 0.6707, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.024994900450110435, + "rewards/margins": 0.04330515116453171, + "rewards/margins_max": 0.0647624060511589, + "rewards/margins_min": 0.021847892552614212, + "rewards/margins_std": 0.03034515120089054, + "rewards/rejected": -0.018310246989130974, + "step": 990 + }, + { + "epoch": 0.32, + "grad_norm": 0.345703125, + "learning_rate": 4.3288843063411573e-07, + "logits/chosen": -1.588935136795044, + "logits/rejected": -1.1631680727005005, + "logps/chosen": -212.937255859375, + "logps/rejected": -199.36758422851562, + "loss": 0.6685, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03406776860356331, + "rewards/margins": 0.04911542311310768, + "rewards/margins_max": 0.07006208598613739, + "rewards/margins_min": 0.02816876210272312, + "rewards/margins_std": 0.029623055830597878, + "rewards/rejected": -0.015047654509544373, + "step": 1000 + }, + { + "epoch": 0.32, + "grad_norm": 0.4296875, + "learning_rate": 4.310025027895925e-07, + "logits/chosen": -1.4283082485198975, + "logits/rejected": -1.1020632982254028, + "logps/chosen": -215.6881103515625, + "logps/rejected": -228.725341796875, + "loss": 0.6686, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.035303063690662384, + "rewards/margins": 0.047801949083805084, + "rewards/margins_max": 0.06932314485311508, + "rewards/margins_min": 0.026280760765075684, + "rewards/margins_std": 0.03043556772172451, + "rewards/rejected": -0.01249888725578785, + "step": 1010 + }, + { + "epoch": 0.32, + "grad_norm": 0.447265625, + "learning_rate": 4.290946739654962e-07, + "logits/chosen": -1.3023067712783813, + "logits/rejected": -0.9218745231628418, + "logps/chosen": -245.7296905517578, + "logps/rejected": -226.1380615234375, + "loss": 0.6675, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.031115401536226273, + "rewards/margins": 0.05050064995884895, + "rewards/margins_max": 0.06927161663770676, + "rewards/margins_min": 0.03172967582941055, + "rewards/margins_std": 0.026546159759163857, + "rewards/rejected": -0.019385244697332382, + "step": 1020 + }, + { + "epoch": 0.32, + "grad_norm": 0.3046875, + "learning_rate": 4.2716517500570704e-07, + "logits/chosen": -1.3911397457122803, + "logits/rejected": -1.181490182876587, + "logps/chosen": -186.88909912109375, + "logps/rejected": -218.47900390625, + "loss": 0.6743, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.024538397789001465, + "rewards/margins": 0.040439218282699585, + "rewards/margins_max": 0.060990117490291595, + "rewards/margins_min": 0.019888322800397873, + "rewards/margins_std": 0.029063355177640915, + "rewards/rejected": -0.01590082235634327, + "step": 1030 + }, + { + "epoch": 0.33, + "grad_norm": 0.515625, + "learning_rate": 4.252142393761533e-07, + "logits/chosen": -1.4555580615997314, + "logits/rejected": -1.0047805309295654, + "logps/chosen": -251.005615234375, + "logps/rejected": -284.2795715332031, + "loss": 0.668, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03476887568831444, + "rewards/margins": 0.054674047976732254, + "rewards/margins_max": 0.07890333235263824, + "rewards/margins_min": 0.030444765463471413, + "rewards/margins_std": 0.03426538407802582, + "rewards/rejected": -0.019905168563127518, + "step": 1040 + }, + { + "epoch": 0.33, + "grad_norm": 0.359375, + "learning_rate": 4.232421031365617e-07, + "logits/chosen": -1.3305257558822632, + "logits/rejected": -1.1427993774414062, + "logps/chosen": -180.7315216064453, + "logps/rejected": -214.3630828857422, + "loss": 0.6704, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02955777570605278, + "rewards/margins": 0.051715098321437836, + "rewards/margins_max": 0.07381218671798706, + "rewards/margins_min": 0.029618006199598312, + "rewards/margins_std": 0.031250011175870895, + "rewards/rejected": -0.022157320752739906, + "step": 1050 + }, + { + "epoch": 0.33, + "grad_norm": 0.318359375, + "learning_rate": 4.212490049118951e-07, + "logits/chosen": -1.4470938444137573, + "logits/rejected": -1.143046498298645, + "logps/chosen": -198.9834442138672, + "logps/rejected": -239.5135955810547, + "loss": 0.6707, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.028033524751663208, + "rewards/margins": 0.04688093811273575, + "rewards/margins_max": 0.06461174786090851, + "rewards/margins_min": 0.02915012836456299, + "rewards/margins_std": 0.025075148791074753, + "rewards/rejected": -0.018847409635782242, + "step": 1060 + }, + { + "epoch": 0.34, + "grad_norm": 0.337890625, + "learning_rate": 4.1923518586347914e-07, + "logits/chosen": -1.4638912677764893, + "logits/rejected": -1.0022966861724854, + "logps/chosen": -209.8042449951172, + "logps/rejected": -192.01329040527344, + "loss": 0.6697, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03496958687901497, + "rewards/margins": 0.05312635377049446, + "rewards/margins_max": 0.07696934044361115, + "rewards/margins_min": 0.029283368960022926, + "rewards/margins_std": 0.03371907025575638, + "rewards/rejected": -0.018156763166189194, + "step": 1070 + }, + { + "epoch": 0.34, + "grad_norm": 0.455078125, + "learning_rate": 4.172008896598221e-07, + "logits/chosen": -1.3048521280288696, + "logits/rejected": -1.0749359130859375, + "logps/chosen": -201.3562469482422, + "logps/rejected": -187.42080688476562, + "loss": 0.6696, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03374785929918289, + "rewards/margins": 0.055463533848524094, + "rewards/margins_max": 0.07117541134357452, + "rewards/margins_min": 0.03975165635347366, + "rewards/margins_std": 0.022219957783818245, + "rewards/rejected": -0.02171567641198635, + "step": 1080 + }, + { + "epoch": 0.34, + "grad_norm": 0.40234375, + "learning_rate": 4.151463624471313e-07, + "logits/chosen": -1.32763671875, + "logits/rejected": -0.8290489315986633, + "logps/chosen": -311.2829895019531, + "logps/rejected": -223.5568084716797, + "loss": 0.6627, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04091322422027588, + "rewards/margins": 0.07021255791187286, + "rewards/margins_max": 0.10776461660861969, + "rewards/margins_min": 0.03266051039099693, + "rewards/margins_std": 0.05310662463307381, + "rewards/rejected": -0.029299337416887283, + "step": 1090 + }, + { + "epoch": 0.35, + "grad_norm": 0.37109375, + "learning_rate": 4.130718528195303e-07, + "logits/chosen": -1.4879382848739624, + "logits/rejected": -1.0252676010131836, + "logps/chosen": -229.93917846679688, + "logps/rejected": -225.00491333007812, + "loss": 0.666, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.041905976831912994, + "rewards/margins": 0.06251799315214157, + "rewards/margins_max": 0.08764694631099701, + "rewards/margins_min": 0.03738904744386673, + "rewards/margins_std": 0.035537708550691605, + "rewards/rejected": -0.020612016320228577, + "step": 1100 + }, + { + "epoch": 0.35, + "grad_norm": 0.328125, + "learning_rate": 4.109776117889789e-07, + "logits/chosen": -1.371626853942871, + "logits/rejected": -0.9644553065299988, + "logps/chosen": -256.90826416015625, + "logps/rejected": -262.3277282714844, + "loss": 0.665, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.04058977589011192, + "rewards/margins": 0.058307357132434845, + "rewards/margins_max": 0.07639677822589874, + "rewards/margins_min": 0.04021793603897095, + "rewards/margins_std": 0.025582294911146164, + "rewards/rejected": -0.017717575654387474, + "step": 1110 + }, + { + "epoch": 0.35, + "grad_norm": 0.3515625, + "learning_rate": 4.088638927549016e-07, + "logits/chosen": -1.4024112224578857, + "logits/rejected": -1.0277204513549805, + "logps/chosen": -257.0721740722656, + "logps/rejected": -223.2074432373047, + "loss": 0.6695, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.033399466425180435, + "rewards/margins": 0.05190381407737732, + "rewards/margins_max": 0.07284527271986008, + "rewards/margins_min": 0.03096235729753971, + "rewards/margins_std": 0.029615694656968117, + "rewards/rejected": -0.018504345789551735, + "step": 1120 + }, + { + "epoch": 0.36, + "grad_norm": 0.3359375, + "learning_rate": 4.067309514735267e-07, + "logits/chosen": -1.2835520505905151, + "logits/rejected": -0.9591856002807617, + "logps/chosen": -253.2421417236328, + "logps/rejected": -214.3036651611328, + "loss": 0.6642, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03112444281578064, + "rewards/margins": 0.06105799227952957, + "rewards/margins_max": 0.08269943296909332, + "rewards/margins_min": 0.03941655158996582, + "rewards/margins_std": 0.030605623498558998, + "rewards/rejected": -0.02993355132639408, + "step": 1130 + }, + { + "epoch": 0.36, + "grad_norm": 0.4296875, + "learning_rate": 4.045790460269395e-07, + "logits/chosen": -1.29916250705719, + "logits/rejected": -0.9579310417175293, + "logps/chosen": -222.2379608154297, + "logps/rejected": -203.0634307861328, + "loss": 0.667, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03449912741780281, + "rewards/margins": 0.04771226644515991, + "rewards/margins_max": 0.0739569216966629, + "rewards/margins_min": 0.02146761119365692, + "rewards/margins_std": 0.03711555153131485, + "rewards/rejected": -0.0132131427526474, + "step": 1140 + }, + { + "epoch": 0.36, + "grad_norm": 0.46484375, + "learning_rate": 4.02408436791856e-07, + "logits/chosen": -1.3718782663345337, + "logits/rejected": -1.0133472681045532, + "logps/chosen": -234.8833770751953, + "logps/rejected": -237.4645233154297, + "loss": 0.6669, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03908390551805496, + "rewards/margins": 0.06387855857610703, + "rewards/margins_max": 0.0912180095911026, + "rewards/margins_min": 0.03653910756111145, + "rewards/margins_std": 0.038663819432258606, + "rewards/rejected": -0.02479465678334236, + "step": 1150 + }, + { + "epoch": 0.37, + "grad_norm": 0.6796875, + "learning_rate": 4.0021938640811717e-07, + "logits/chosen": -1.3344662189483643, + "logits/rejected": -0.9591034054756165, + "logps/chosen": -221.8365478515625, + "logps/rejected": -358.8748779296875, + "loss": 0.6653, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03558691591024399, + "rewards/margins": 0.06812174618244171, + "rewards/margins_max": 0.09799469262361526, + "rewards/margins_min": 0.03824879601597786, + "rewards/margins_std": 0.042246729135513306, + "rewards/rejected": -0.032534826546907425, + "step": 1160 + }, + { + "epoch": 0.37, + "grad_norm": 0.275390625, + "learning_rate": 3.980121597469095e-07, + "logits/chosen": -1.4173529148101807, + "logits/rejected": -1.046112298965454, + "logps/chosen": -222.1094970703125, + "logps/rejected": -195.96484375, + "loss": 0.6705, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02532361075282097, + "rewards/margins": 0.04526478797197342, + "rewards/margins_max": 0.06393333524465561, + "rewards/margins_min": 0.02659623883664608, + "rewards/margins_std": 0.02640131488442421, + "rewards/rejected": -0.0199411790817976, + "step": 1170 + }, + { + "epoch": 0.37, + "grad_norm": 0.412109375, + "learning_rate": 3.9578702387871735e-07, + "logits/chosen": -1.4770991802215576, + "logits/rejected": -1.0594831705093384, + "logps/chosen": -200.45314025878906, + "logps/rejected": -181.73829650878906, + "loss": 0.67, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03991563245654106, + "rewards/margins": 0.04643087834119797, + "rewards/margins_max": 0.0705905631184578, + "rewards/margins_min": 0.022271184250712395, + "rewards/margins_std": 0.034166961908340454, + "rewards/rejected": -0.00651524355635047, + "step": 1180 + }, + { + "epoch": 0.37, + "grad_norm": 0.34375, + "learning_rate": 3.9354424804100647e-07, + "logits/chosen": -1.3302786350250244, + "logits/rejected": -1.0419865846633911, + "logps/chosen": -180.52911376953125, + "logps/rejected": -229.23507690429688, + "loss": 0.6661, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03989443928003311, + "rewards/margins": 0.05144830420613289, + "rewards/margins_max": 0.07513656467199326, + "rewards/margins_min": 0.02776004932820797, + "rewards/margins_std": 0.03350025415420532, + "rewards/rejected": -0.0115538714453578, + "step": 1190 + }, + { + "epoch": 0.38, + "grad_norm": 0.2734375, + "learning_rate": 3.9128410360564793e-07, + "logits/chosen": -1.4453057050704956, + "logits/rejected": -0.843630313873291, + "logps/chosen": -239.1511993408203, + "logps/rejected": -228.0477294921875, + "loss": 0.6689, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.0347132682800293, + "rewards/margins": 0.05644859001040459, + "rewards/margins_max": 0.07778388261795044, + "rewards/margins_min": 0.035113297402858734, + "rewards/margins_std": 0.03017266094684601, + "rewards/rejected": -0.02173532173037529, + "step": 1200 + }, + { + "epoch": 0.38, + "grad_norm": 0.337890625, + "learning_rate": 3.8900686404608174e-07, + "logits/chosen": -1.4021894931793213, + "logits/rejected": -1.1501901149749756, + "logps/chosen": -246.96676635742188, + "logps/rejected": -243.79055786132812, + "loss": 0.6648, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.02853170968592167, + "rewards/margins": 0.05541776865720749, + "rewards/margins_max": 0.0840243324637413, + "rewards/margins_min": 0.026811202988028526, + "rewards/margins_std": 0.04045579582452774, + "rewards/rejected": -0.02688606083393097, + "step": 1210 + }, + { + "epoch": 0.38, + "grad_norm": 0.328125, + "learning_rate": 3.8671280490422753e-07, + "logits/chosen": -1.4860647916793823, + "logits/rejected": -1.2145134210586548, + "logps/chosen": -173.87155151367188, + "logps/rejected": -215.08114624023438, + "loss": 0.6686, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03185255080461502, + "rewards/margins": 0.0523541197180748, + "rewards/margins_max": 0.07217199355363846, + "rewards/margins_min": 0.032536253333091736, + "rewards/margins_std": 0.028026703745126724, + "rewards/rejected": -0.020501574501395226, + "step": 1220 + }, + { + "epoch": 0.39, + "grad_norm": 0.3984375, + "learning_rate": 3.8440220375714435e-07, + "logits/chosen": -1.4330469369888306, + "logits/rejected": -0.923498809337616, + "logps/chosen": -194.24989318847656, + "logps/rejected": -188.0842742919922, + "loss": 0.6661, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.02217457816004753, + "rewards/margins": 0.05012967437505722, + "rewards/margins_max": 0.07560008764266968, + "rewards/margins_min": 0.02465927042067051, + "rewards/margins_std": 0.03602059185504913, + "rewards/rejected": -0.02795509621500969, + "step": 1230 + }, + { + "epoch": 0.39, + "grad_norm": 0.42578125, + "learning_rate": 3.8207534018344434e-07, + "logits/chosen": -1.4624649286270142, + "logits/rejected": -1.2272025346755981, + "logps/chosen": -224.9335479736328, + "logps/rejected": -215.4106903076172, + "loss": 0.6645, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02788296714425087, + "rewards/margins": 0.05390559881925583, + "rewards/margins_max": 0.07784163951873779, + "rewards/margins_min": 0.02996954880654812, + "rewards/margins_std": 0.033850688487291336, + "rewards/rejected": -0.02602263353765011, + "step": 1240 + }, + { + "epoch": 0.39, + "grad_norm": 0.376953125, + "learning_rate": 3.797324957294643e-07, + "logits/chosen": -1.4522289037704468, + "logits/rejected": -1.0942248106002808, + "logps/chosen": -197.66709899902344, + "logps/rejected": -188.29644775390625, + "loss": 0.6667, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.028640951961278915, + "rewards/margins": 0.04304185137152672, + "rewards/margins_max": 0.06764715909957886, + "rewards/margins_min": 0.01843653805553913, + "rewards/margins_std": 0.034797169268131256, + "rewards/rejected": -0.014400901272892952, + "step": 1250 + }, + { + "epoch": 0.4, + "grad_norm": 0.451171875, + "learning_rate": 3.773739538751988e-07, + "logits/chosen": -1.4544618129730225, + "logits/rejected": -1.0294139385223389, + "logps/chosen": -245.71435546875, + "logps/rejected": -208.09115600585938, + "loss": 0.6646, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.032376714050769806, + "rewards/margins": 0.04516047239303589, + "rewards/margins_max": 0.06522423774003983, + "rewards/margins_min": 0.025096703320741653, + "rewards/margins_std": 0.028374452143907547, + "rewards/rejected": -0.012783756479620934, + "step": 1260 + }, + { + "epoch": 0.4, + "grad_norm": 0.431640625, + "learning_rate": 3.75e-07, + "logits/chosen": -1.39158034324646, + "logits/rejected": -0.9538629651069641, + "logps/chosen": -339.5301513671875, + "logps/rejected": -214.0384521484375, + "loss": 0.6693, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03498070687055588, + "rewards/margins": 0.05068878084421158, + "rewards/margins_max": 0.07239842414855957, + "rewards/margins_min": 0.028979141265153885, + "rewards/margins_std": 0.030702069401741028, + "rewards/rejected": -0.015708070248365402, + "step": 1270 + }, + { + "epoch": 0.4, + "grad_norm": 0.388671875, + "learning_rate": 3.7261092134804695e-07, + "logits/chosen": -1.313458800315857, + "logits/rejected": -0.9937132000923157, + "logps/chosen": -205.0299835205078, + "logps/rejected": -212.155029296875, + "loss": 0.6684, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03651620075106621, + "rewards/margins": 0.05705242604017258, + "rewards/margins_max": 0.08746035397052765, + "rewards/margins_min": 0.026644494384527206, + "rewards/margins_std": 0.043003302067518234, + "rewards/rejected": -0.020536217838525772, + "step": 1280 + }, + { + "epoch": 0.41, + "grad_norm": 0.40234375, + "learning_rate": 3.702070069935898e-07, + "logits/chosen": -1.4626922607421875, + "logits/rejected": -1.015981674194336, + "logps/chosen": -227.63339233398438, + "logps/rejected": -221.4516143798828, + "loss": 0.6682, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03474265709519386, + "rewards/margins": 0.051122285425662994, + "rewards/margins_max": 0.06716804951429367, + "rewards/margins_min": 0.03507651016116142, + "rewards/margins_std": 0.02269214577972889, + "rewards/rejected": -0.016379622742533684, + "step": 1290 + }, + { + "epoch": 0.41, + "grad_norm": 0.6171875, + "learning_rate": 3.6778854780597213e-07, + "logits/chosen": -1.2919328212738037, + "logits/rejected": -0.9956780672073364, + "logps/chosen": -222.3484344482422, + "logps/rejected": -182.62179565429688, + "loss": 0.6681, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.030011823400855064, + "rewards/margins": 0.0488753467798233, + "rewards/margins_max": 0.06740256398916245, + "rewards/margins_min": 0.030348125845193863, + "rewards/margins_std": 0.02620144747197628, + "rewards/rejected": -0.01886352151632309, + "step": 1300 + }, + { + "epoch": 0.41, + "grad_norm": 0.466796875, + "learning_rate": 3.653558364144363e-07, + "logits/chosen": -1.4199802875518799, + "logits/rejected": -1.1749187707901, + "logps/chosen": -182.4161376953125, + "logps/rejected": -217.2281951904297, + "loss": 0.6651, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03831896930932999, + "rewards/margins": 0.053058166056871414, + "rewards/margins_max": 0.07967302948236465, + "rewards/margins_min": 0.026443298906087875, + "rewards/margins_std": 0.03763909637928009, + "rewards/rejected": -0.014739197678864002, + "step": 1310 + }, + { + "epoch": 0.42, + "grad_norm": 0.4296875, + "learning_rate": 3.629091671727159e-07, + "logits/chosen": -1.383264422416687, + "logits/rejected": -0.935562252998352, + "logps/chosen": -236.8832550048828, + "logps/rejected": -232.43701171875, + "loss": 0.6652, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03460818529129028, + "rewards/margins": 0.05942409485578537, + "rewards/margins_max": 0.08626364171504974, + "rewards/margins_min": 0.0325845405459404, + "rewards/margins_std": 0.037956852465867996, + "rewards/rejected": -0.02481590211391449, + "step": 1320 + }, + { + "epoch": 0.42, + "grad_norm": 0.515625, + "learning_rate": 3.6044883612341957e-07, + "logits/chosen": -1.4922215938568115, + "logits/rejected": -1.210303544998169, + "logps/chosen": -175.39468383789062, + "logps/rejected": -175.10372924804688, + "loss": 0.6688, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.0377926230430603, + "rewards/margins": 0.044999849051237106, + "rewards/margins_max": 0.06272000819444656, + "rewards/margins_min": 0.027279695495963097, + "rewards/margins_std": 0.025060083717107773, + "rewards/rejected": -0.007207226939499378, + "step": 1330 + }, + { + "epoch": 0.42, + "grad_norm": 0.416015625, + "learning_rate": 3.5797514096221024e-07, + "logits/chosen": -1.447775959968567, + "logits/rejected": -1.1010136604309082, + "logps/chosen": -233.7635040283203, + "logps/rejected": -213.46658325195312, + "loss": 0.6693, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.032312069088220596, + "rewards/margins": 0.052999138832092285, + "rewards/margins_max": 0.07665625959634781, + "rewards/margins_min": 0.02934201993048191, + "rewards/margins_std": 0.033456217497587204, + "rewards/rejected": -0.02068706974387169, + "step": 1340 + }, + { + "epoch": 0.43, + "grad_norm": 0.38671875, + "learning_rate": 3.554883810017844e-07, + "logits/chosen": -1.3156002759933472, + "logits/rejected": -1.0745857954025269, + "logps/chosen": -181.6421661376953, + "logps/rejected": -183.23080444335938, + "loss": 0.6698, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.030048031359910965, + "rewards/margins": 0.04210250452160835, + "rewards/margins_max": 0.07046877592802048, + "rewards/margins_min": 0.013736230321228504, + "rewards/margins_std": 0.04011595994234085, + "rewards/rejected": -0.01205446757376194, + "step": 1350 + }, + { + "epoch": 0.43, + "grad_norm": 0.298828125, + "learning_rate": 3.529888571356561e-07, + "logits/chosen": -1.2578437328338623, + "logits/rejected": -1.0070809125900269, + "logps/chosen": -250.3462677001953, + "logps/rejected": -232.718994140625, + "loss": 0.6706, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.026622626930475235, + "rewards/margins": 0.04009575396776199, + "rewards/margins_max": 0.06229530647397041, + "rewards/margins_min": 0.017896197736263275, + "rewards/margins_std": 0.03139491006731987, + "rewards/rejected": -0.01347312517464161, + "step": 1360 + }, + { + "epoch": 0.43, + "grad_norm": 0.4609375, + "learning_rate": 3.50476871801749e-07, + "logits/chosen": -1.374895453453064, + "logits/rejected": -0.9742172956466675, + "logps/chosen": -298.19110107421875, + "logps/rejected": -209.47329711914062, + "loss": 0.6677, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03527087718248367, + "rewards/margins": 0.04785536974668503, + "rewards/margins_max": 0.06894843280315399, + "rewards/margins_min": 0.026762310415506363, + "rewards/margins_std": 0.029830092564225197, + "rewards/rejected": -0.012584498152136803, + "step": 1370 + }, + { + "epoch": 0.43, + "grad_norm": 0.337890625, + "learning_rate": 3.479527289458021e-07, + "logits/chosen": -1.3711079359054565, + "logits/rejected": -1.0774781703948975, + "logps/chosen": -184.34344482421875, + "logps/rejected": -224.1437225341797, + "loss": 0.6642, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03007492795586586, + "rewards/margins": 0.05908045917749405, + "rewards/margins_max": 0.09251175820827484, + "rewards/margins_min": 0.02564915083348751, + "rewards/margins_std": 0.04727901145815849, + "rewards/rejected": -0.029005536809563637, + "step": 1380 + }, + { + "epoch": 0.44, + "grad_norm": 0.427734375, + "learning_rate": 3.4541673398459315e-07, + "logits/chosen": -1.293668508529663, + "logits/rejected": -1.0986145734786987, + "logps/chosen": -209.5894012451172, + "logps/rejected": -229.1302490234375, + "loss": 0.6686, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.0236887875944376, + "rewards/margins": 0.04496780037879944, + "rewards/margins_max": 0.06106124445796013, + "rewards/margins_min": 0.028874356299638748, + "rewards/margins_std": 0.0227595716714859, + "rewards/rejected": -0.021279016509652138, + "step": 1390 + }, + { + "epoch": 0.44, + "grad_norm": 0.48046875, + "learning_rate": 3.4286919376898303e-07, + "logits/chosen": -1.2458115816116333, + "logits/rejected": -0.9769574403762817, + "logps/chosen": -219.8367919921875, + "logps/rejected": -227.66421508789062, + "loss": 0.6661, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.028372962027788162, + "rewards/margins": 0.04673437029123306, + "rewards/margins_max": 0.06330729275941849, + "rewards/margins_min": 0.030161460861563683, + "rewards/margins_std": 0.02343764156103134, + "rewards/rejected": -0.018361413851380348, + "step": 1400 + }, + { + "epoch": 0.44, + "grad_norm": 0.51953125, + "learning_rate": 3.403104165467883e-07, + "logits/chosen": -1.3929589986801147, + "logits/rejected": -1.1880546808242798, + "logps/chosen": -276.8525085449219, + "logps/rejected": -241.208740234375, + "loss": 0.666, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03430411219596863, + "rewards/margins": 0.04882946237921715, + "rewards/margins_max": 0.08031658828258514, + "rewards/margins_min": 0.017342329025268555, + "rewards/margins_std": 0.04452953487634659, + "rewards/rejected": -0.014525346457958221, + "step": 1410 + }, + { + "epoch": 0.45, + "grad_norm": 0.486328125, + "learning_rate": 3.377407119254826e-07, + "logits/chosen": -1.307857632637024, + "logits/rejected": -0.973365306854248, + "logps/chosen": -262.1522521972656, + "logps/rejected": -219.1666717529297, + "loss": 0.6699, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03059108555316925, + "rewards/margins": 0.05130365490913391, + "rewards/margins_max": 0.07557855546474457, + "rewards/margins_min": 0.027028745040297508, + "rewards/margins_std": 0.03432989865541458, + "rewards/rejected": -0.02071256935596466, + "step": 1420 + }, + { + "epoch": 0.45, + "grad_norm": 0.578125, + "learning_rate": 3.351603908347359e-07, + "logits/chosen": -1.3961646556854248, + "logits/rejected": -1.0634922981262207, + "logps/chosen": -244.14907836914062, + "logps/rejected": -209.52413940429688, + "loss": 0.6664, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.02553880773484707, + "rewards/margins": 0.039372727274894714, + "rewards/margins_max": 0.05578699707984924, + "rewards/margins_min": 0.022958464920520782, + "rewards/margins_std": 0.023213278502225876, + "rewards/rejected": -0.013833923265337944, + "step": 1430 + }, + { + "epoch": 0.45, + "grad_norm": 0.443359375, + "learning_rate": 3.325697654887918e-07, + "logits/chosen": -1.457953929901123, + "logits/rejected": -1.1763416528701782, + "logps/chosen": -168.73855590820312, + "logps/rejected": -200.1396026611328, + "loss": 0.6663, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.039046648889780045, + "rewards/margins": 0.061664480715990067, + "rewards/margins_max": 0.08563290536403656, + "rewards/margins_min": 0.03769605979323387, + "rewards/margins_std": 0.03389647603034973, + "rewards/rejected": -0.02261783741414547, + "step": 1440 + }, + { + "epoch": 0.46, + "grad_norm": 0.435546875, + "learning_rate": 3.2996914934869034e-07, + "logits/chosen": -1.4136825799942017, + "logits/rejected": -0.9438567161560059, + "logps/chosen": -211.27880859375, + "logps/rejected": -251.1080322265625, + "loss": 0.6655, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.05148143321275711, + "rewards/margins": 0.05926315858960152, + "rewards/margins_max": 0.0925159901380539, + "rewards/margins_min": 0.026010334491729736, + "rewards/margins_std": 0.04702659696340561, + "rewards/rejected": -0.00778172304853797, + "step": 1450 + }, + { + "epoch": 0.46, + "grad_norm": 0.392578125, + "learning_rate": 3.273588570843399e-07, + "logits/chosen": -1.3561222553253174, + "logits/rejected": -0.8794288635253906, + "logps/chosen": -219.59188842773438, + "logps/rejected": -204.20651245117188, + "loss": 0.6653, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04018976539373398, + "rewards/margins": 0.058758050203323364, + "rewards/margins_max": 0.08319707214832306, + "rewards/margins_min": 0.034319035708904266, + "rewards/margins_std": 0.034561995416879654, + "rewards/rejected": -0.018568288534879684, + "step": 1460 + }, + { + "epoch": 0.46, + "grad_norm": 0.5078125, + "learning_rate": 3.2473920453644254e-07, + "logits/chosen": -1.364458680152893, + "logits/rejected": -1.1189966201782227, + "logps/chosen": -200.58279418945312, + "logps/rejected": -247.4306182861328, + "loss": 0.6625, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03194325789809227, + "rewards/margins": 0.058095790445804596, + "rewards/margins_max": 0.08179818838834763, + "rewards/margins_min": 0.03439338877797127, + "rewards/margins_std": 0.03352025896310806, + "rewards/rejected": -0.02615252695977688, + "step": 1470 + }, + { + "epoch": 0.47, + "grad_norm": 0.40234375, + "learning_rate": 3.2211050867827805e-07, + "logits/chosen": -1.4114757776260376, + "logits/rejected": -1.0227770805358887, + "logps/chosen": -217.49783325195312, + "logps/rejected": -270.8158874511719, + "loss": 0.6649, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03752985596656799, + "rewards/margins": 0.06543248146772385, + "rewards/margins_max": 0.08699898421764374, + "rewards/margins_min": 0.043865982443094254, + "rewards/margins_std": 0.030499637126922607, + "rewards/rejected": -0.02790263295173645, + "step": 1480 + }, + { + "epoch": 0.47, + "grad_norm": 0.34765625, + "learning_rate": 3.194730875773504e-07, + "logits/chosen": -1.3351142406463623, + "logits/rejected": -1.0667884349822998, + "logps/chosen": -226.33425903320312, + "logps/rejected": -211.48983764648438, + "loss": 0.6666, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03304114192724228, + "rewards/margins": 0.047557245939970016, + "rewards/margins_max": 0.06921641528606415, + "rewards/margins_min": 0.02589806541800499, + "rewards/margins_std": 0.030630702152848244, + "rewards/rejected": -0.014516102150082588, + "step": 1490 + }, + { + "epoch": 0.47, + "grad_norm": 0.42578125, + "learning_rate": 3.168272603569025e-07, + "logits/chosen": -1.4025719165802002, + "logits/rejected": -0.8659202456474304, + "logps/chosen": -255.092529296875, + "logps/rejected": -191.5826416015625, + "loss": 0.6657, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.037762049585580826, + "rewards/margins": 0.05839651823043823, + "rewards/margins_max": 0.08473102748394012, + "rewards/margins_min": 0.03206200897693634, + "rewards/margins_std": 0.03724262863397598, + "rewards/rejected": -0.020634472370147705, + "step": 1500 + }, + { + "epoch": 0.48, + "grad_norm": 0.41015625, + "learning_rate": 3.1417334715730257e-07, + "logits/chosen": -1.312922716140747, + "logits/rejected": -0.9928410649299622, + "logps/chosen": -274.3824768066406, + "logps/rejected": -207.8001251220703, + "loss": 0.6652, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03649697080254555, + "rewards/margins": 0.04787913337349892, + "rewards/margins_max": 0.07063382118940353, + "rewards/margins_min": 0.025124436244368553, + "rewards/margins_std": 0.032179996371269226, + "rewards/rejected": -0.01138215884566307, + "step": 1510 + }, + { + "epoch": 0.48, + "grad_norm": 0.3125, + "learning_rate": 3.115116690973081e-07, + "logits/chosen": -1.275967001914978, + "logits/rejected": -1.0719497203826904, + "logps/chosen": -170.84716796875, + "logps/rejected": -187.09201049804688, + "loss": 0.6697, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.030098671093583107, + "rewards/margins": 0.04616239666938782, + "rewards/margins_max": 0.06867832690477371, + "rewards/margins_min": 0.023646462708711624, + "rewards/margins_std": 0.03184233605861664, + "rewards/rejected": -0.01606372371315956, + "step": 1520 + }, + { + "epoch": 0.48, + "grad_norm": 0.375, + "learning_rate": 3.088425482352106e-07, + "logits/chosen": -1.3329031467437744, + "logits/rejected": -0.9551903009414673, + "logps/chosen": -178.49220275878906, + "logps/rejected": -163.50289916992188, + "loss": 0.6707, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.02671188674867153, + "rewards/margins": 0.05042758584022522, + "rewards/margins_max": 0.07536738365888596, + "rewards/margins_min": 0.025487786158919334, + "rewards/margins_std": 0.03527020663022995, + "rewards/rejected": -0.023715700954198837, + "step": 1530 + }, + { + "epoch": 0.49, + "grad_norm": 0.5625, + "learning_rate": 3.061663075298675e-07, + "logits/chosen": -1.5138485431671143, + "logits/rejected": -1.1314074993133545, + "logps/chosen": -250.61813354492188, + "logps/rejected": -272.20379638671875, + "loss": 0.6643, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03775627166032791, + "rewards/margins": 0.05153984948992729, + "rewards/margins_max": 0.07664564251899719, + "rewards/margins_min": 0.026434045284986496, + "rewards/margins_std": 0.035504959523677826, + "rewards/rejected": -0.013783574104309082, + "step": 1540 + }, + { + "epoch": 0.49, + "grad_norm": 0.408203125, + "learning_rate": 3.034832708016243e-07, + "logits/chosen": -1.5145914554595947, + "logits/rejected": -1.0713765621185303, + "logps/chosen": -261.14312744140625, + "logps/rejected": -210.29232788085938, + "loss": 0.6632, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03341571241617203, + "rewards/margins": 0.048537809401750565, + "rewards/margins_max": 0.07006336748600006, + "rewards/margins_min": 0.027012262493371964, + "rewards/margins_std": 0.030441725626587868, + "rewards/rejected": -0.01512210350483656, + "step": 1550 + }, + { + "epoch": 0.49, + "grad_norm": 0.37890625, + "learning_rate": 3.0079376269313354e-07, + "logits/chosen": -1.4111496210098267, + "logits/rejected": -1.072613000869751, + "logps/chosen": -207.9450225830078, + "logps/rejected": -267.03912353515625, + "loss": 0.6651, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.029351189732551575, + "rewards/margins": 0.05201409012079239, + "rewards/margins_max": 0.07417653501033783, + "rewards/margins_min": 0.0298516396433115, + "rewards/margins_std": 0.031342435628175735, + "rewards/rejected": -0.022662896662950516, + "step": 1560 + }, + { + "epoch": 0.49, + "grad_norm": 0.330078125, + "learning_rate": 2.9809810863007284e-07, + "logits/chosen": -1.4359506368637085, + "logits/rejected": -1.0733433961868286, + "logps/chosen": -200.97647094726562, + "logps/rejected": -209.4395751953125, + "loss": 0.6674, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.039248835295438766, + "rewards/margins": 0.04961882531642914, + "rewards/margins_max": 0.0718330442905426, + "rewards/margins_min": 0.027404606342315674, + "rewards/margins_std": 0.031415652483701706, + "rewards/rejected": -0.010369991883635521, + "step": 1570 + }, + { + "epoch": 0.5, + "grad_norm": 0.55859375, + "learning_rate": 2.9539663478176946e-07, + "logits/chosen": -1.2646214962005615, + "logits/rejected": -1.1139628887176514, + "logps/chosen": -206.5272674560547, + "logps/rejected": -250.39108276367188, + "loss": 0.6681, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02430318295955658, + "rewards/margins": 0.051103752106428146, + "rewards/margins_max": 0.0777682214975357, + "rewards/margins_min": 0.024439293891191483, + "rewards/margins_std": 0.03770923987030983, + "rewards/rejected": -0.026800569146871567, + "step": 1580 + }, + { + "epoch": 0.5, + "grad_norm": 0.4765625, + "learning_rate": 2.9268966802173436e-07, + "logits/chosen": -1.3860819339752197, + "logits/rejected": -0.975805938243866, + "logps/chosen": -270.6651611328125, + "logps/rejected": -221.06259155273438, + "loss": 0.6665, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03461884707212448, + "rewards/margins": 0.05071113631129265, + "rewards/margins_max": 0.0760193020105362, + "rewards/margins_min": 0.025402987375855446, + "rewards/margins_std": 0.03579113632440567, + "rewards/rejected": -0.016092294827103615, + "step": 1590 + }, + { + "epoch": 0.5, + "grad_norm": 0.443359375, + "learning_rate": 2.89977535888111e-07, + "logits/chosen": -1.3565785884857178, + "logits/rejected": -0.9915903210639954, + "logps/chosen": -177.0413055419922, + "logps/rejected": -182.9870147705078, + "loss": 0.6673, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03303280100226402, + "rewards/margins": 0.06585012376308441, + "rewards/margins_max": 0.08999715745449066, + "rewards/margins_min": 0.04170309379696846, + "rewards/margins_std": 0.034149058163166046, + "rewards/rejected": -0.03281732648611069, + "step": 1600 + }, + { + "epoch": 0.51, + "grad_norm": 0.388671875, + "learning_rate": 2.872605665440436e-07, + "logits/chosen": -1.3481905460357666, + "logits/rejected": -1.1729605197906494, + "logps/chosen": -169.9842529296875, + "logps/rejected": -223.30044555664062, + "loss": 0.6626, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.032422084361314774, + "rewards/margins": 0.055260200053453445, + "rewards/margins_max": 0.07780520617961884, + "rewards/margins_min": 0.03271518647670746, + "rewards/margins_std": 0.03188345581293106, + "rewards/rejected": -0.02283811755478382, + "step": 1610 + }, + { + "epoch": 0.51, + "grad_norm": 0.37890625, + "learning_rate": 2.845390887379706e-07, + "logits/chosen": -1.4345109462738037, + "logits/rejected": -1.1150403022766113, + "logps/chosen": -225.3082275390625, + "logps/rejected": -199.63519287109375, + "loss": 0.669, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.034576646983623505, + "rewards/margins": 0.04932091385126114, + "rewards/margins_max": 0.06937690079212189, + "rewards/margins_min": 0.029264941811561584, + "rewards/margins_std": 0.028363442048430443, + "rewards/rejected": -0.014744272455573082, + "step": 1620 + }, + { + "epoch": 0.51, + "grad_norm": 0.58203125, + "learning_rate": 2.8181343176384585e-07, + "logits/chosen": -1.2172272205352783, + "logits/rejected": -1.0032122135162354, + "logps/chosen": -194.42764282226562, + "logps/rejected": -336.8403015136719, + "loss": 0.662, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.02610202692449093, + "rewards/margins": 0.06659840792417526, + "rewards/margins_max": 0.09039248526096344, + "rewards/margins_min": 0.04280433803796768, + "rewards/margins_std": 0.03364989906549454, + "rewards/rejected": -0.04049638658761978, + "step": 1630 + }, + { + "epoch": 0.52, + "grad_norm": 0.408203125, + "learning_rate": 2.7908392542129537e-07, + "logits/chosen": -1.491234540939331, + "logits/rejected": -1.1456706523895264, + "logps/chosen": -226.4430694580078, + "logps/rejected": -264.64874267578125, + "loss": 0.6649, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.036219272762537, + "rewards/margins": 0.06396204233169556, + "rewards/margins_max": 0.0869758352637291, + "rewards/margins_min": 0.040948253124952316, + "rewards/margins_std": 0.032546427100896835, + "rewards/rejected": -0.02774277701973915, + "step": 1640 + }, + { + "epoch": 0.52, + "grad_norm": 0.42578125, + "learning_rate": 2.763508999757119e-07, + "logits/chosen": -1.4049649238586426, + "logits/rejected": -1.239553689956665, + "logps/chosen": -215.4875030517578, + "logps/rejected": -298.31365966796875, + "loss": 0.6683, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03639969974756241, + "rewards/margins": 0.053609687834978104, + "rewards/margins_max": 0.07144194096326828, + "rewards/margins_min": 0.035777442157268524, + "rewards/margins_std": 0.025218605995178223, + "rewards/rejected": -0.017209986224770546, + "step": 1650 + }, + { + "epoch": 0.52, + "grad_norm": 0.4140625, + "learning_rate": 2.7361468611829326e-07, + "logits/chosen": -1.4899475574493408, + "logits/rejected": -1.128447413444519, + "logps/chosen": -200.3207550048828, + "logps/rejected": -228.01718139648438, + "loss": 0.6621, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03625740110874176, + "rewards/margins": 0.0598982498049736, + "rewards/margins_max": 0.09039153158664703, + "rewards/margins_min": 0.029404977336525917, + "rewards/margins_std": 0.043124008923769, + "rewards/rejected": -0.02364085428416729, + "step": 1660 + }, + { + "epoch": 0.53, + "grad_norm": 0.259765625, + "learning_rate": 2.708756149260292e-07, + "logits/chosen": -1.4126758575439453, + "logits/rejected": -1.0123107433319092, + "logps/chosen": -235.05734252929688, + "logps/rejected": -203.85006713867188, + "loss": 0.6662, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.034583888947963715, + "rewards/margins": 0.052448056638240814, + "rewards/margins_max": 0.07766715437173843, + "rewards/margins_min": 0.027228962630033493, + "rewards/margins_std": 0.03566519170999527, + "rewards/rejected": -0.0178641676902771, + "step": 1670 + }, + { + "epoch": 0.53, + "grad_norm": 0.486328125, + "learning_rate": 2.681340178216423e-07, + "logits/chosen": -1.6247339248657227, + "logits/rejected": -1.223256230354309, + "logps/chosen": -237.5697784423828, + "logps/rejected": -252.75521850585938, + "loss": 0.6637, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03515109419822693, + "rewards/margins": 0.06105039268732071, + "rewards/margins_max": 0.08089162409305573, + "rewards/margins_min": 0.041209153831005096, + "rewards/margins_std": 0.028059745207428932, + "rewards/rejected": -0.02589929662644863, + "step": 1680 + }, + { + "epoch": 0.53, + "grad_norm": 0.5078125, + "learning_rate": 2.6539022653348575e-07, + "logits/chosen": -1.3141326904296875, + "logits/rejected": -0.9784961938858032, + "logps/chosen": -204.03591918945312, + "logps/rejected": -265.62591552734375, + "loss": 0.664, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03488199785351753, + "rewards/margins": 0.06790916621685028, + "rewards/margins_max": 0.09974372386932373, + "rewards/margins_min": 0.03607460856437683, + "rewards/margins_std": 0.04502086713910103, + "rewards/rejected": -0.03302717208862305, + "step": 1690 + }, + { + "epoch": 0.54, + "grad_norm": 0.361328125, + "learning_rate": 2.62644573055405e-07, + "logits/chosen": -1.527411699295044, + "logits/rejected": -1.0853965282440186, + "logps/chosen": -193.60665893554688, + "logps/rejected": -200.410888671875, + "loss": 0.6657, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.037782810628414154, + "rewards/margins": 0.06309525668621063, + "rewards/margins_max": 0.0949065238237381, + "rewards/margins_min": 0.031283993273973465, + "rewards/margins_std": 0.04498792067170143, + "rewards/rejected": -0.02531243860721588, + "step": 1700 + }, + { + "epoch": 0.54, + "grad_norm": 0.447265625, + "learning_rate": 2.598973896065674e-07, + "logits/chosen": -1.1190847158432007, + "logits/rejected": -0.9498281478881836, + "logps/chosen": -246.06240844726562, + "logps/rejected": -278.57708740234375, + "loss": 0.6647, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.029680589213967323, + "rewards/margins": 0.06372956186532974, + "rewards/margins_max": 0.08478715270757675, + "rewards/margins_min": 0.04267194867134094, + "rewards/margins_std": 0.029779959470033646, + "rewards/rejected": -0.03404896706342697, + "step": 1710 + }, + { + "epoch": 0.54, + "grad_norm": 0.322265625, + "learning_rate": 2.571490085912638e-07, + "logits/chosen": -1.294392704963684, + "logits/rejected": -0.901209831237793, + "logps/chosen": -222.6404571533203, + "logps/rejected": -221.46646118164062, + "loss": 0.666, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.035451389849185944, + "rewards/margins": 0.06020699813961983, + "rewards/margins_max": 0.08351422101259232, + "rewards/margins_min": 0.036899782717227936, + "rewards/margins_std": 0.03296138346195221, + "rewards/rejected": -0.024755608290433884, + "step": 1720 + }, + { + "epoch": 0.54, + "grad_norm": 0.404296875, + "learning_rate": 2.5439976255868846e-07, + "logits/chosen": -1.3172805309295654, + "logits/rejected": -0.9587199091911316, + "logps/chosen": -201.81642150878906, + "logps/rejected": -264.8630065917969, + "loss": 0.6643, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02492239698767662, + "rewards/margins": 0.06449567526578903, + "rewards/margins_max": 0.10114102065563202, + "rewards/margins_min": 0.02785031870007515, + "rewards/margins_std": 0.051824361085891724, + "rewards/rejected": -0.03957327455282211, + "step": 1730 + }, + { + "epoch": 0.55, + "grad_norm": 0.458984375, + "learning_rate": 2.5164998416270137e-07, + "logits/chosen": -1.4752823114395142, + "logits/rejected": -1.1924030780792236, + "logps/chosen": -225.65927124023438, + "logps/rejected": -236.69290161132812, + "loss": 0.6656, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.025519024580717087, + "rewards/margins": 0.05898071080446243, + "rewards/margins_max": 0.09197360277175903, + "rewards/margins_min": 0.025987815111875534, + "rewards/margins_std": 0.04665899649262428, + "rewards/rejected": -0.033461686223745346, + "step": 1740 + }, + { + "epoch": 0.55, + "grad_norm": 0.34765625, + "learning_rate": 2.489000061215775e-07, + "logits/chosen": -1.3754206895828247, + "logits/rejected": -1.0634129047393799, + "logps/chosen": -212.5056915283203, + "logps/rejected": -217.0105438232422, + "loss": 0.6659, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03384281322360039, + "rewards/margins": 0.05164814740419388, + "rewards/margins_max": 0.0746842697262764, + "rewards/margins_min": 0.02861202321946621, + "rewards/margins_std": 0.0325779989361763, + "rewards/rejected": -0.01780533231794834, + "step": 1750 + }, + { + "epoch": 0.55, + "grad_norm": 0.365234375, + "learning_rate": 2.461501611777483e-07, + "logits/chosen": -1.3263044357299805, + "logits/rejected": -1.0537205934524536, + "logps/chosen": -197.9228973388672, + "logps/rejected": -214.32839965820312, + "loss": 0.671, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.030653411522507668, + "rewards/margins": 0.04631539434194565, + "rewards/margins_max": 0.06874962151050568, + "rewards/margins_min": 0.023881174623966217, + "rewards/margins_std": 0.03172678127884865, + "rewards/rejected": -0.01566198468208313, + "step": 1760 + }, + { + "epoch": 0.56, + "grad_norm": 0.345703125, + "learning_rate": 2.4340078205754096e-07, + "logits/chosen": -1.4674514532089233, + "logits/rejected": -1.0580947399139404, + "logps/chosen": -228.774169921875, + "logps/rejected": -245.3206329345703, + "loss": 0.6632, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03198526054620743, + "rewards/margins": 0.060487449169158936, + "rewards/margins_max": 0.09252621978521347, + "rewards/margins_min": 0.028448667377233505, + "rewards/margins_std": 0.045309677720069885, + "rewards/rejected": -0.02850218489766121, + "step": 1770 + }, + { + "epoch": 0.56, + "grad_norm": 0.390625, + "learning_rate": 2.406522014309186e-07, + "logits/chosen": -1.3413441181182861, + "logits/rejected": -1.0260752439498901, + "logps/chosen": -217.0348358154297, + "logps/rejected": -218.7316436767578, + "loss": 0.6673, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03381979838013649, + "rewards/margins": 0.05491740256547928, + "rewards/margins_max": 0.08126216381788254, + "rewards/margins_min": 0.028572645038366318, + "rewards/margins_std": 0.037257120013237, + "rewards/rejected": -0.021097611635923386, + "step": 1780 + }, + { + "epoch": 0.56, + "grad_norm": 0.3984375, + "learning_rate": 2.3790475187122832e-07, + "logits/chosen": -1.3534529209136963, + "logits/rejected": -1.0642507076263428, + "logps/chosen": -203.16989135742188, + "logps/rejected": -185.0489044189453, + "loss": 0.6664, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03119819238781929, + "rewards/margins": 0.058540262281894684, + "rewards/margins_max": 0.0830526053905487, + "rewards/margins_min": 0.034027911722660065, + "rewards/margins_std": 0.034665681421756744, + "rewards/rejected": -0.027342066168785095, + "step": 1790 + }, + { + "epoch": 0.57, + "grad_norm": 0.412109375, + "learning_rate": 2.351587658149598e-07, + "logits/chosen": -1.453975796699524, + "logits/rejected": -0.9396857023239136, + "logps/chosen": -307.119140625, + "logps/rejected": -293.79193115234375, + "loss": 0.6639, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04417193681001663, + "rewards/margins": 0.06028149649500847, + "rewards/margins_max": 0.08487708121538162, + "rewards/margins_min": 0.03568592667579651, + "rewards/margins_std": 0.03478339686989784, + "rewards/rejected": -0.016109565272927284, + "step": 1800 + }, + { + "epoch": 0.57, + "grad_norm": 0.39453125, + "learning_rate": 2.3241457552152187e-07, + "logits/chosen": -1.2886158227920532, + "logits/rejected": -0.8535853624343872, + "logps/chosen": -255.9151153564453, + "logps/rejected": -190.72183227539062, + "loss": 0.6651, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0356779471039772, + "rewards/margins": 0.06389064341783524, + "rewards/margins_max": 0.09344568848609924, + "rewards/margins_min": 0.034335602074861526, + "rewards/margins_std": 0.04179714247584343, + "rewards/rejected": -0.028212696313858032, + "step": 1810 + }, + { + "epoch": 0.57, + "grad_norm": 0.37890625, + "learning_rate": 2.2967251303303876e-07, + "logits/chosen": -1.2967920303344727, + "logits/rejected": -1.069603443145752, + "logps/chosen": -174.32562255859375, + "logps/rejected": -198.73556518554688, + "loss": 0.6673, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.02710595726966858, + "rewards/margins": 0.04836275056004524, + "rewards/margins_max": 0.07030778378248215, + "rewards/margins_min": 0.02641770802438259, + "rewards/margins_std": 0.03103497065603733, + "rewards/rejected": -0.021256795153021812, + "step": 1820 + }, + { + "epoch": 0.58, + "grad_norm": 0.275390625, + "learning_rate": 2.2693291013417452e-07, + "logits/chosen": -1.3830006122589111, + "logits/rejected": -1.131734848022461, + "logps/chosen": -196.27232360839844, + "logps/rejected": -220.3488311767578, + "loss": 0.6668, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04211854934692383, + "rewards/margins": 0.054630208760499954, + "rewards/margins_max": 0.08319230377674103, + "rewards/margins_min": 0.026068110018968582, + "rewards/margins_std": 0.0403929129242897, + "rewards/rejected": -0.012511657550930977, + "step": 1830 + }, + { + "epoch": 0.58, + "grad_norm": 0.458984375, + "learning_rate": 2.2419609831198695e-07, + "logits/chosen": -1.314412236213684, + "logits/rejected": -1.0906130075454712, + "logps/chosen": -202.8844451904297, + "logps/rejected": -282.2475280761719, + "loss": 0.6657, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.029452290385961533, + "rewards/margins": 0.06307311356067657, + "rewards/margins_max": 0.08669252693653107, + "rewards/margins_min": 0.03945370018482208, + "rewards/margins_std": 0.03340289741754532, + "rewards/rejected": -0.03362082317471504, + "step": 1840 + }, + { + "epoch": 0.58, + "grad_norm": 0.4765625, + "learning_rate": 2.2146240871581875e-07, + "logits/chosen": -1.4870127439498901, + "logits/rejected": -1.10221529006958, + "logps/chosen": -257.47381591796875, + "logps/rejected": -300.7210388183594, + "loss": 0.6677, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.048544105142354965, + "rewards/margins": 0.06802462041378021, + "rewards/margins_max": 0.0930628627538681, + "rewards/margins_min": 0.04298638552427292, + "rewards/margins_std": 0.035409413278102875, + "rewards/rejected": -0.019480522722005844, + "step": 1850 + }, + { + "epoch": 0.59, + "grad_norm": 0.404296875, + "learning_rate": 2.187321721172288e-07, + "logits/chosen": -1.2666473388671875, + "logits/rejected": -0.9587362408638, + "logps/chosen": -202.96151733398438, + "logps/rejected": -188.11402893066406, + "loss": 0.6625, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.035519860684871674, + "rewards/margins": 0.06863918900489807, + "rewards/margins_max": 0.10405266284942627, + "rewards/margins_min": 0.033225707709789276, + "rewards/margins_std": 0.05008222907781601, + "rewards/rejected": -0.0331193283200264, + "step": 1860 + }, + { + "epoch": 0.59, + "grad_norm": 0.306640625, + "learning_rate": 2.1600571886996932e-07, + "logits/chosen": -1.409246563911438, + "logits/rejected": -0.9662661552429199, + "logps/chosen": -255.17337036132812, + "logps/rejected": -237.2165069580078, + "loss": 0.664, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.034895267337560654, + "rewards/margins": 0.060319460928440094, + "rewards/margins_max": 0.08776156604290009, + "rewards/margins_min": 0.032877348363399506, + "rewards/margins_std": 0.038808997720479965, + "rewards/rejected": -0.025424188002943993, + "step": 1870 + }, + { + "epoch": 0.59, + "grad_norm": 0.34765625, + "learning_rate": 2.1328337887001386e-07, + "logits/chosen": -1.3689050674438477, + "logits/rejected": -0.9174262881278992, + "logps/chosen": -250.42257690429688, + "logps/rejected": -213.65115356445312, + "loss": 0.6637, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03616097569465637, + "rewards/margins": 0.0674886554479599, + "rewards/margins_max": 0.09155096858739853, + "rewards/margins_min": 0.04342634230852127, + "rewards/margins_std": 0.03402925282716751, + "rewards/rejected": -0.03132767975330353, + "step": 1880 + }, + { + "epoch": 0.6, + "grad_norm": 0.55859375, + "learning_rate": 2.105654815156406e-07, + "logits/chosen": -1.2773230075836182, + "logits/rejected": -0.9415411949157715, + "logps/chosen": -211.5564727783203, + "logps/rejected": -241.5669403076172, + "loss": 0.6639, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03792757913470268, + "rewards/margins": 0.05873064324259758, + "rewards/margins_max": 0.08351272344589233, + "rewards/margins_min": 0.03394855558872223, + "rewards/margins_std": 0.03504716232419014, + "rewards/rejected": -0.0208030603826046, + "step": 1890 + }, + { + "epoch": 0.6, + "grad_norm": 0.443359375, + "learning_rate": 2.0785235566757517e-07, + "logits/chosen": -1.5174918174743652, + "logits/rejected": -1.0792747735977173, + "logps/chosen": -274.3040466308594, + "logps/rejected": -269.9195556640625, + "loss": 0.6665, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.031049564480781555, + "rewards/margins": 0.04908784478902817, + "rewards/margins_max": 0.07297800481319427, + "rewards/margins_min": 0.02519768849015236, + "rewards/margins_std": 0.033785782754421234, + "rewards/rejected": -0.01803828403353691, + "step": 1900 + }, + { + "epoch": 0.6, + "grad_norm": 0.3984375, + "learning_rate": 2.0514432960919976e-07, + "logits/chosen": -1.3264081478118896, + "logits/rejected": -0.8952063322067261, + "logps/chosen": -275.90582275390625, + "logps/rejected": -227.85183715820312, + "loss": 0.6629, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.035590268671512604, + "rewards/margins": 0.05433149263262749, + "rewards/margins_max": 0.08622786402702332, + "rewards/margins_min": 0.022435134276747704, + "rewards/margins_std": 0.04510827362537384, + "rewards/rejected": -0.01874123141169548, + "step": 1910 + }, + { + "epoch": 0.6, + "grad_norm": 0.384765625, + "learning_rate": 2.024417310068309e-07, + "logits/chosen": -1.3526580333709717, + "logits/rejected": -1.0428838729858398, + "logps/chosen": -242.9093475341797, + "logps/rejected": -221.0670928955078, + "loss": 0.6657, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03770860657095909, + "rewards/margins": 0.06553932279348373, + "rewards/margins_max": 0.09631849825382233, + "rewards/margins_min": 0.03476015478372574, + "rewards/margins_std": 0.04352831840515137, + "rewards/rejected": -0.027830716222524643, + "step": 1920 + }, + { + "epoch": 0.61, + "grad_norm": 0.392578125, + "learning_rate": 1.9974488687007272e-07, + "logits/chosen": -1.321537733078003, + "logits/rejected": -0.9563083648681641, + "logps/chosen": -189.53338623046875, + "logps/rejected": -208.65695190429688, + "loss": 0.6677, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.037261709570884705, + "rewards/margins": 0.050910621881484985, + "rewards/margins_max": 0.08092696219682693, + "rewards/margins_min": 0.020894277840852737, + "rewards/margins_std": 0.042449526488780975, + "rewards/rejected": -0.01364891231060028, + "step": 1930 + }, + { + "epoch": 0.61, + "grad_norm": 0.314453125, + "learning_rate": 1.9705412351224935e-07, + "logits/chosen": -1.341074824333191, + "logits/rejected": -1.031362533569336, + "logps/chosen": -262.0687561035156, + "logps/rejected": -209.2541046142578, + "loss": 0.6632, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04073809087276459, + "rewards/margins": 0.06763813644647598, + "rewards/margins_max": 0.10079771280288696, + "rewards/margins_min": 0.034478556364774704, + "rewards/margins_std": 0.04689472168684006, + "rewards/rejected": -0.026900043711066246, + "step": 1940 + }, + { + "epoch": 0.61, + "grad_norm": 0.482421875, + "learning_rate": 1.9436976651092142e-07, + "logits/chosen": -1.4449079036712646, + "logits/rejected": -1.0441436767578125, + "logps/chosen": -323.22515869140625, + "logps/rejected": -259.187744140625, + "loss": 0.6621, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04286254942417145, + "rewards/margins": 0.06869898736476898, + "rewards/margins_max": 0.08886998146772385, + "rewards/margins_min": 0.048527974635362625, + "rewards/margins_std": 0.028526106849312782, + "rewards/rejected": -0.025836432352662086, + "step": 1950 + }, + { + "epoch": 0.62, + "grad_norm": 0.380859375, + "learning_rate": 1.9169214066849198e-07, + "logits/chosen": -1.3310493230819702, + "logits/rejected": -1.0039780139923096, + "logps/chosen": -207.80368041992188, + "logps/rejected": -217.77279663085938, + "loss": 0.6671, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.029799357056617737, + "rewards/margins": 0.05040057748556137, + "rewards/margins_max": 0.07911469042301178, + "rewards/margins_min": 0.021686479449272156, + "rewards/margins_std": 0.04060788080096245, + "rewards/rejected": -0.02060122787952423, + "step": 1960 + }, + { + "epoch": 0.62, + "grad_norm": 0.37109375, + "learning_rate": 1.890215699729057e-07, + "logits/chosen": -1.3599677085876465, + "logits/rejected": -0.952431321144104, + "logps/chosen": -220.8314971923828, + "logps/rejected": -218.5143280029297, + "loss": 0.6636, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03455578535795212, + "rewards/margins": 0.059498321264982224, + "rewards/margins_max": 0.08264943957328796, + "rewards/margins_min": 0.03634720668196678, + "rewards/margins_std": 0.032740626484155655, + "rewards/rejected": -0.024942539632320404, + "step": 1970 + }, + { + "epoch": 0.62, + "grad_norm": 0.3828125, + "learning_rate": 1.8635837755844736e-07, + "logits/chosen": -1.5396320819854736, + "logits/rejected": -1.1135740280151367, + "logps/chosen": -192.1985321044922, + "logps/rejected": -189.65496826171875, + "loss": 0.6638, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04031743109226227, + "rewards/margins": 0.06330820918083191, + "rewards/margins_max": 0.09154955297708511, + "rewards/margins_min": 0.035066869109869, + "rewards/margins_std": 0.03993929177522659, + "rewards/rejected": -0.02299078181385994, + "step": 1980 + }, + { + "epoch": 0.63, + "grad_norm": 0.328125, + "learning_rate": 1.837028856666426e-07, + "logits/chosen": -1.396333932876587, + "logits/rejected": -1.0482286214828491, + "logps/chosen": -223.5980987548828, + "logps/rejected": -197.462646484375, + "loss": 0.6639, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03402381017804146, + "rewards/margins": 0.05975471809506416, + "rewards/margins_max": 0.09332195669412613, + "rewards/margins_min": 0.026187485083937645, + "rewards/margins_std": 0.04747123643755913, + "rewards/rejected": -0.025730907917022705, + "step": 1990 + }, + { + "epoch": 0.63, + "grad_norm": 0.37109375, + "learning_rate": 1.8105541560726783e-07, + "logits/chosen": -1.5116699934005737, + "logits/rejected": -1.005076289176941, + "logps/chosen": -216.2085418701172, + "logps/rejected": -199.5402374267578, + "loss": 0.6648, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.037188541144132614, + "rewards/margins": 0.05942929536104202, + "rewards/margins_max": 0.08686941862106323, + "rewards/margins_min": 0.03198916092514992, + "rewards/margins_std": 0.038806211203336716, + "rewards/rejected": -0.02224075235426426, + "step": 2000 + }, + { + "epoch": 0.63, + "grad_norm": 0.353515625, + "learning_rate": 1.7841628771947186e-07, + "logits/chosen": -1.4040260314941406, + "logits/rejected": -0.965591549873352, + "logps/chosen": -234.39431762695312, + "logps/rejected": -202.01571655273438, + "loss": 0.6651, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03427529335021973, + "rewards/margins": 0.0503767728805542, + "rewards/margins_max": 0.07114100456237793, + "rewards/margins_min": 0.02961254119873047, + "rewards/margins_std": 0.02936505898833275, + "rewards/rejected": -0.016101477667689323, + "step": 2010 + }, + { + "epoch": 0.64, + "grad_norm": 0.3515625, + "learning_rate": 1.757858213330157e-07, + "logits/chosen": -1.1877460479736328, + "logits/rejected": -0.9582545161247253, + "logps/chosen": -229.884033203125, + "logps/rejected": -281.41351318359375, + "loss": 0.6646, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03942031413316727, + "rewards/margins": 0.06542594730854034, + "rewards/margins_max": 0.09780795872211456, + "rewards/margins_min": 0.03304394707083702, + "rewards/margins_std": 0.04579506441950798, + "rewards/rejected": -0.026005636900663376, + "step": 2020 + }, + { + "epoch": 0.64, + "grad_norm": 0.462890625, + "learning_rate": 1.7316433472963426e-07, + "logits/chosen": -1.507406234741211, + "logits/rejected": -1.1749341487884521, + "logps/chosen": -281.5582580566406, + "logps/rejected": -243.66110229492188, + "loss": 0.6623, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03884587436914444, + "rewards/margins": 0.059391576796770096, + "rewards/margins_max": 0.08674292266368866, + "rewards/margins_min": 0.032040227204561234, + "rewards/margins_std": 0.03868064284324646, + "rewards/rejected": -0.02054569497704506, + "step": 2030 + }, + { + "epoch": 0.64, + "grad_norm": 0.357421875, + "learning_rate": 1.7055214510452458e-07, + "logits/chosen": -1.3578734397888184, + "logits/rejected": -0.849805474281311, + "logps/chosen": -331.993408203125, + "logps/rejected": -279.07733154296875, + "loss": 0.6626, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03726685792207718, + "rewards/margins": 0.06775570660829544, + "rewards/margins_max": 0.09739796817302704, + "rewards/margins_min": 0.03811345621943474, + "rewards/margins_std": 0.04192047566175461, + "rewards/rejected": -0.03048885427415371, + "step": 2040 + }, + { + "epoch": 0.65, + "grad_norm": 0.3828125, + "learning_rate": 1.6794956852796616e-07, + "logits/chosen": -1.421799659729004, + "logits/rejected": -1.0734702348709106, + "logps/chosen": -214.08364868164062, + "logps/rejected": -222.42636108398438, + "loss": 0.66, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.046803176403045654, + "rewards/margins": 0.07683407515287399, + "rewards/margins_max": 0.11319296061992645, + "rewards/margins_min": 0.040475185960531235, + "rewards/margins_std": 0.05141923576593399, + "rewards/rejected": -0.03003089688718319, + "step": 2050 + }, + { + "epoch": 0.65, + "grad_norm": 0.283203125, + "learning_rate": 1.653569199070764e-07, + "logits/chosen": -1.437723994255066, + "logits/rejected": -1.0029988288879395, + "logps/chosen": -206.7332000732422, + "logps/rejected": -232.79580688476562, + "loss": 0.661, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04119989275932312, + "rewards/margins": 0.06901855766773224, + "rewards/margins_max": 0.10958409309387207, + "rewards/margins_min": 0.028453027829527855, + "rewards/margins_std": 0.05736833065748215, + "rewards/rejected": -0.027818670496344566, + "step": 2060 + }, + { + "epoch": 0.65, + "grad_norm": 0.46484375, + "learning_rate": 1.6277451294770832e-07, + "logits/chosen": -1.427294135093689, + "logits/rejected": -1.043678641319275, + "logps/chosen": -173.60861206054688, + "logps/rejected": -159.7396697998047, + "loss": 0.663, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03490697965025902, + "rewards/margins": 0.05194821208715439, + "rewards/margins_max": 0.0755261555314064, + "rewards/margins_min": 0.028370272368192673, + "rewards/margins_std": 0.03334423899650574, + "rewards/rejected": -0.01704123243689537, + "step": 2070 + }, + { + "epoch": 0.66, + "grad_norm": 0.396484375, + "learning_rate": 1.6020266011649176e-07, + "logits/chosen": -1.3484151363372803, + "logits/rejected": -0.9436542391777039, + "logps/chosen": -246.00296020507812, + "logps/rejected": -232.9607391357422, + "loss": 0.6638, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03393206372857094, + "rewards/margins": 0.06296433508396149, + "rewards/margins_max": 0.09208185970783234, + "rewards/margins_min": 0.03384682536125183, + "rewards/margins_std": 0.0411783829331398, + "rewards/rejected": -0.029032278805971146, + "step": 2080 + }, + { + "epoch": 0.66, + "grad_norm": 0.34375, + "learning_rate": 1.5764167260302608e-07, + "logits/chosen": -1.269598364830017, + "logits/rejected": -1.101138949394226, + "logps/chosen": -212.4265594482422, + "logps/rejected": -261.7176208496094, + "loss": 0.6673, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.02871812880039215, + "rewards/margins": 0.059116750955581665, + "rewards/margins_max": 0.081370510160923, + "rewards/margins_min": 0.03686298802495003, + "rewards/margins_std": 0.03147156536579132, + "rewards/rejected": -0.030398612841963768, + "step": 2090 + }, + { + "epoch": 0.66, + "grad_norm": 0.369140625, + "learning_rate": 1.5509186028222653e-07, + "logits/chosen": -1.3609730005264282, + "logits/rejected": -0.8888334035873413, + "logps/chosen": -240.88809204101562, + "logps/rejected": -205.7561492919922, + "loss": 0.6634, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.036828793585300446, + "rewards/margins": 0.06859615445137024, + "rewards/margins_max": 0.09640248119831085, + "rewards/margins_min": 0.040789827704429626, + "rewards/margins_std": 0.0393240861594677, + "rewards/rejected": -0.031767360866069794, + "step": 2100 + }, + { + "epoch": 0.66, + "grad_norm": 0.390625, + "learning_rate": 1.5255353167683017e-07, + "logits/chosen": -1.4757276773452759, + "logits/rejected": -1.0737035274505615, + "logps/chosen": -197.3357696533203, + "logps/rejected": -193.08956909179688, + "loss": 0.6677, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03849588334560394, + "rewards/margins": 0.049884069710969925, + "rewards/margins_max": 0.07029401510953903, + "rewards/margins_min": 0.02947412058711052, + "rewards/margins_std": 0.028864026069641113, + "rewards/rejected": -0.01138819195330143, + "step": 2110 + }, + { + "epoch": 0.67, + "grad_norm": 0.44140625, + "learning_rate": 1.500269939200648e-07, + "logits/chosen": -1.4104186296463013, + "logits/rejected": -1.1364113092422485, + "logps/chosen": -180.714111328125, + "logps/rejected": -193.08792114257812, + "loss": 0.6653, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03277165815234184, + "rewards/margins": 0.04854750260710716, + "rewards/margins_max": 0.07237715274095535, + "rewards/margins_min": 0.02471785433590412, + "rewards/margins_std": 0.03370020538568497, + "rewards/rejected": -0.01577584072947502, + "step": 2120 + }, + { + "epoch": 0.67, + "grad_norm": 0.384765625, + "learning_rate": 1.4751255271848661e-07, + "logits/chosen": -1.3990291357040405, + "logits/rejected": -1.111859917640686, + "logps/chosen": -191.26333618164062, + "logps/rejected": -209.4487762451172, + "loss": 0.6633, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03463239595293999, + "rewards/margins": 0.05561947077512741, + "rewards/margins_max": 0.08030703663825989, + "rewards/margins_min": 0.03093191422522068, + "rewards/margins_std": 0.03491348773241043, + "rewards/rejected": -0.02098708227276802, + "step": 2130 + }, + { + "epoch": 0.67, + "grad_norm": 0.4296875, + "learning_rate": 1.450105123149904e-07, + "logits/chosen": -1.3517110347747803, + "logits/rejected": -0.8976603746414185, + "logps/chosen": -236.5410614013672, + "logps/rejected": -285.66143798828125, + "loss": 0.6594, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04675716906785965, + "rewards/margins": 0.0757768452167511, + "rewards/margins_max": 0.1105475053191185, + "rewards/margins_min": 0.0410061851143837, + "rewards/margins_std": 0.04917313903570175, + "rewards/rejected": -0.029019678011536598, + "step": 2140 + }, + { + "epoch": 0.68, + "grad_norm": 0.36328125, + "learning_rate": 1.4252117545199638e-07, + "logits/chosen": -1.2252193689346313, + "logits/rejected": -1.2452119588851929, + "logps/chosen": -129.21884155273438, + "logps/rejected": -187.29981994628906, + "loss": 0.6689, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.02311699464917183, + "rewards/margins": 0.053245484828948975, + "rewards/margins_max": 0.08089035749435425, + "rewards/margins_min": 0.025600602850317955, + "rewards/margins_std": 0.03909575939178467, + "rewards/rejected": -0.030128484591841698, + "step": 2150 + }, + { + "epoch": 0.68, + "grad_norm": 0.421875, + "learning_rate": 1.400448433348191e-07, + "logits/chosen": -1.3551205396652222, + "logits/rejected": -1.0361279249191284, + "logps/chosen": -181.05245971679688, + "logps/rejected": -190.93905639648438, + "loss": 0.6667, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.033756453543901443, + "rewards/margins": 0.05258417874574661, + "rewards/margins_max": 0.08322058618068695, + "rewards/margins_min": 0.02194777876138687, + "rewards/margins_std": 0.04332640767097473, + "rewards/rejected": -0.01882772520184517, + "step": 2160 + }, + { + "epoch": 0.68, + "grad_norm": 0.3828125, + "learning_rate": 1.3758181559522219e-07, + "logits/chosen": -1.3742306232452393, + "logits/rejected": -1.1042159795761108, + "logps/chosen": -195.7826690673828, + "logps/rejected": -224.00357055664062, + "loss": 0.6649, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.039305493235588074, + "rewards/margins": 0.054510366171598434, + "rewards/margins_max": 0.07698939740657806, + "rewards/margins_min": 0.032031331211328506, + "rewards/margins_std": 0.031790152192115784, + "rewards/rejected": -0.015204873867332935, + "step": 2170 + }, + { + "epoch": 0.69, + "grad_norm": 0.490234375, + "learning_rate": 1.351323902551631e-07, + "logits/chosen": -1.423339605331421, + "logits/rejected": -1.0979268550872803, + "logps/chosen": -188.20086669921875, + "logps/rejected": -208.48483276367188, + "loss": 0.6595, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.036755792796611786, + "rewards/margins": 0.06653173267841339, + "rewards/margins_max": 0.10358710587024689, + "rewards/margins_min": 0.029476355761289597, + "rewards/margins_std": 0.052404217422008514, + "rewards/rejected": -0.029775941744446754, + "step": 2180 + }, + { + "epoch": 0.69, + "grad_norm": 0.46484375, + "learning_rate": 1.3269686369073347e-07, + "logits/chosen": -1.4356403350830078, + "logits/rejected": -0.9359350204467773, + "logps/chosen": -255.5299530029297, + "logps/rejected": -220.5718536376953, + "loss": 0.663, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03550455719232559, + "rewards/margins": 0.07294157147407532, + "rewards/margins_max": 0.11041506379842758, + "rewards/margins_min": 0.03546806797385216, + "rewards/margins_std": 0.052995532751083374, + "rewards/rejected": -0.037437014281749725, + "step": 2190 + }, + { + "epoch": 0.69, + "grad_norm": 0.56640625, + "learning_rate": 1.3027553059629776e-07, + "logits/chosen": -1.270801305770874, + "logits/rejected": -0.9209572076797485, + "logps/chosen": -203.37147521972656, + "logps/rejected": -237.0596160888672, + "loss": 0.6625, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03659834340214729, + "rewards/margins": 0.07047709822654724, + "rewards/margins_max": 0.10837771743535995, + "rewards/margins_min": 0.03257646784186363, + "rewards/margins_std": 0.05359958857297897, + "rewards/rejected": -0.03387875854969025, + "step": 2200 + }, + { + "epoch": 0.7, + "grad_norm": 0.400390625, + "learning_rate": 1.2786868394883615e-07, + "logits/chosen": -1.3924726247787476, + "logits/rejected": -0.9072662591934204, + "logps/chosen": -237.67532348632812, + "logps/rejected": -171.44007873535156, + "loss": 0.6647, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.03645704686641693, + "rewards/margins": 0.04906289279460907, + "rewards/margins_max": 0.07811780273914337, + "rewards/margins_min": 0.02000797912478447, + "rewards/margins_std": 0.041089847683906555, + "rewards/rejected": -0.012605843134224415, + "step": 2210 + }, + { + "epoch": 0.7, + "grad_norm": 0.287109375, + "learning_rate": 1.2547661497249423e-07, + "logits/chosen": -1.505576491355896, + "logits/rejected": -1.0931254625320435, + "logps/chosen": -251.4204559326172, + "logps/rejected": -184.33187866210938, + "loss": 0.6618, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03499855846166611, + "rewards/margins": 0.060498736798763275, + "rewards/margins_max": 0.09210414439439774, + "rewards/margins_min": 0.028893321752548218, + "rewards/margins_std": 0.04469680041074753, + "rewards/rejected": -0.025500169023871422, + "step": 2220 + }, + { + "epoch": 0.7, + "grad_norm": 0.369140625, + "learning_rate": 1.2309961310334608e-07, + "logits/chosen": -1.381753921508789, + "logits/rejected": -1.0234613418579102, + "logps/chosen": -209.87673950195312, + "logps/rejected": -193.29415893554688, + "loss": 0.6657, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.036537621170282364, + "rewards/margins": 0.05814922973513603, + "rewards/margins_max": 0.0908384695649147, + "rewards/margins_min": 0.025459999218583107, + "rewards/margins_std": 0.04622955992817879, + "rewards/rejected": -0.021611608564853668, + "step": 2230 + }, + { + "epoch": 0.71, + "grad_norm": 0.47265625, + "learning_rate": 1.207379659543726e-07, + "logits/chosen": -1.5136375427246094, + "logits/rejected": -1.0719817876815796, + "logps/chosen": -235.4477081298828, + "logps/rejected": -190.52899169921875, + "loss": 0.6619, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04006263613700867, + "rewards/margins": 0.05348850414156914, + "rewards/margins_max": 0.07529211789369583, + "rewards/margins_min": 0.03168489784002304, + "rewards/margins_std": 0.03083496168255806, + "rewards/rejected": -0.013425871729850769, + "step": 2240 + }, + { + "epoch": 0.71, + "grad_norm": 0.48046875, + "learning_rate": 1.1839195928066101e-07, + "logits/chosen": -1.5472790002822876, + "logits/rejected": -1.063508152961731, + "logps/chosen": -237.460205078125, + "logps/rejected": -203.92752075195312, + "loss": 0.6676, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03670421242713928, + "rewards/margins": 0.0651477798819542, + "rewards/margins_max": 0.09266404807567596, + "rewards/margins_min": 0.03763151913881302, + "rewards/margins_std": 0.038913875818252563, + "rewards/rejected": -0.02844356931746006, + "step": 2250 + }, + { + "epoch": 0.71, + "grad_norm": 0.4140625, + "learning_rate": 1.1606187694482895e-07, + "logits/chosen": -1.3274108171463013, + "logits/rejected": -1.0006046295166016, + "logps/chosen": -341.37298583984375, + "logps/rejected": -298.43218994140625, + "loss": 0.6606, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03555456921458244, + "rewards/margins": 0.0692644715309143, + "rewards/margins_max": 0.09813406318426132, + "rewards/margins_min": 0.04039488732814789, + "rewards/margins_std": 0.040827758610248566, + "rewards/rejected": -0.03370990604162216, + "step": 2260 + }, + { + "epoch": 0.72, + "grad_norm": 0.453125, + "learning_rate": 1.1374800088267766e-07, + "logits/chosen": -1.3964722156524658, + "logits/rejected": -0.8625639081001282, + "logps/chosen": -256.6228332519531, + "logps/rejected": -204.37188720703125, + "loss": 0.6618, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03464067354798317, + "rewards/margins": 0.06588082760572433, + "rewards/margins_max": 0.09385097026824951, + "rewards/margins_min": 0.03791068494319916, + "rewards/margins_std": 0.03955575078725815, + "rewards/rejected": -0.031240154057741165, + "step": 2270 + }, + { + "epoch": 0.72, + "grad_norm": 0.392578125, + "learning_rate": 1.1145061106907803e-07, + "logits/chosen": -1.3579143285751343, + "logits/rejected": -1.1530735492706299, + "logps/chosen": -213.7913055419922, + "logps/rejected": -274.8453674316406, + "loss": 0.6626, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.039796892553567886, + "rewards/margins": 0.07231783121824265, + "rewards/margins_max": 0.1056450754404068, + "rewards/margins_min": 0.03899059444665909, + "rewards/margins_std": 0.04713182896375656, + "rewards/rejected": -0.03252093866467476, + "step": 2280 + }, + { + "epoch": 0.72, + "grad_norm": 0.34375, + "learning_rate": 1.0916998548409447e-07, + "logits/chosen": -1.2776060104370117, + "logits/rejected": -1.0304553508758545, + "logps/chosen": -208.4978790283203, + "logps/rejected": -255.5095977783203, + "loss": 0.663, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03622225672006607, + "rewards/margins": 0.06858749687671661, + "rewards/margins_max": 0.09841950237751007, + "rewards/margins_min": 0.03875547647476196, + "rewards/margins_std": 0.04218883812427521, + "rewards/rejected": -0.032365236431360245, + "step": 2290 + }, + { + "epoch": 0.72, + "grad_norm": 0.5859375, + "learning_rate": 1.0690640007934978e-07, + "logits/chosen": -1.365751028060913, + "logits/rejected": -0.8165037035942078, + "logps/chosen": -263.61102294921875, + "logps/rejected": -221.7294158935547, + "loss": 0.6703, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.036759573966264725, + "rewards/margins": 0.057599522173404694, + "rewards/margins_max": 0.08515635877847672, + "rewards/margins_min": 0.030042681843042374, + "rewards/margins_std": 0.03897125646471977, + "rewards/rejected": -0.020839953795075417, + "step": 2300 + }, + { + "epoch": 0.73, + "grad_norm": 0.451171875, + "learning_rate": 1.0466012874463507e-07, + "logits/chosen": -1.2811259031295776, + "logits/rejected": -0.9887920618057251, + "logps/chosen": -267.3749694824219, + "logps/rejected": -244.70596313476562, + "loss": 0.6677, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.038502246141433716, + "rewards/margins": 0.060044266283512115, + "rewards/margins_max": 0.08822239935398102, + "rewards/margins_min": 0.03186614066362381, + "rewards/margins_std": 0.039849892258644104, + "rewards/rejected": -0.02154202200472355, + "step": 2310 + }, + { + "epoch": 0.73, + "grad_norm": 0.498046875, + "learning_rate": 1.0243144327477013e-07, + "logits/chosen": -1.4756540060043335, + "logits/rejected": -0.9919270277023315, + "logps/chosen": -223.4065704345703, + "logps/rejected": -209.97573852539062, + "loss": 0.6588, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04816528037190437, + "rewards/margins": 0.07530733942985535, + "rewards/margins_max": 0.10730701684951782, + "rewards/margins_min": 0.04330766201019287, + "rewards/margins_std": 0.04525437951087952, + "rewards/rejected": -0.027142059057950974, + "step": 2320 + }, + { + "epoch": 0.73, + "grad_norm": 0.326171875, + "learning_rate": 1.0022061333671647e-07, + "logits/chosen": -1.3365637063980103, + "logits/rejected": -0.9453974962234497, + "logps/chosen": -221.6447296142578, + "logps/rejected": -205.4340057373047, + "loss": 0.6633, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.043263550847768784, + "rewards/margins": 0.07088526338338852, + "rewards/margins_max": 0.09706144034862518, + "rewards/margins_min": 0.044709086418151855, + "rewards/margins_std": 0.037018708884716034, + "rewards/rejected": -0.027621712535619736, + "step": 2330 + }, + { + "epoch": 0.74, + "grad_norm": 0.490234375, + "learning_rate": 9.802790643694817e-08, + "logits/chosen": -1.3576759099960327, + "logits/rejected": -1.1886638402938843, + "logps/chosen": -196.93856811523438, + "logps/rejected": -203.70106506347656, + "loss": 0.6661, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03112906776368618, + "rewards/margins": 0.05759000778198242, + "rewards/margins_max": 0.08904091268777847, + "rewards/margins_min": 0.02613910473883152, + "rewards/margins_std": 0.044478293508291245, + "rewards/rejected": -0.026460934430360794, + "step": 2340 + }, + { + "epoch": 0.74, + "grad_norm": 0.4609375, + "learning_rate": 9.585358788908393e-08, + "logits/chosen": -1.386399745941162, + "logits/rejected": -1.065953254699707, + "logps/chosen": -228.66220092773438, + "logps/rejected": -250.2444610595703, + "loss": 0.6644, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.030235329642891884, + "rewards/margins": 0.05980142205953598, + "rewards/margins_max": 0.09089671075344086, + "rewards/margins_min": 0.028706133365631104, + "rewards/margins_std": 0.043975379317998886, + "rewards/rejected": -0.029566094279289246, + "step": 2350 + }, + { + "epoch": 0.74, + "grad_norm": 0.453125, + "learning_rate": 9.36979207817849e-08, + "logits/chosen": -1.5047038793563843, + "logits/rejected": -1.2480775117874146, + "logps/chosen": -239.8202667236328, + "logps/rejected": -234.4571533203125, + "loss": 0.6693, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.032885629683732986, + "rewards/margins": 0.05206092447042465, + "rewards/margins_max": 0.07507045567035675, + "rewards/margins_min": 0.029051411896944046, + "rewards/margins_std": 0.032540373504161835, + "rewards/rejected": -0.019175300374627113, + "step": 2360 + }, + { + "epoch": 0.75, + "grad_norm": 0.5390625, + "learning_rate": 9.156116594692096e-08, + "logits/chosen": -1.4589383602142334, + "logits/rejected": -0.9495819807052612, + "logps/chosen": -231.2331085205078, + "logps/rejected": -212.32382202148438, + "loss": 0.66, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04424898326396942, + "rewards/margins": 0.06708662211894989, + "rewards/margins_max": 0.09897585213184357, + "rewards/margins_min": 0.03519739955663681, + "rewards/margins_std": 0.04509817436337471, + "rewards/rejected": -0.02283763512969017, + "step": 2370 + }, + { + "epoch": 0.75, + "grad_norm": 0.375, + "learning_rate": 8.944358192801102e-08, + "logits/chosen": -1.4549717903137207, + "logits/rejected": -0.9532996416091919, + "logps/chosen": -222.93148803710938, + "logps/rejected": -191.50634765625, + "loss": 0.6578, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.042102448642253876, + "rewards/margins": 0.07653506100177765, + "rewards/margins_max": 0.11154161393642426, + "rewards/margins_min": 0.041528504341840744, + "rewards/margins_std": 0.04950674995779991, + "rewards/rejected": -0.034432608634233475, + "step": 2380 + }, + { + "epoch": 0.75, + "grad_norm": 0.3984375, + "learning_rate": 8.734542494893954e-08, + "logits/chosen": -1.492494821548462, + "logits/rejected": -1.2444711923599243, + "logps/chosen": -219.95266723632812, + "logps/rejected": -268.90484619140625, + "loss": 0.6652, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.04024102911353111, + "rewards/margins": 0.06013556569814682, + "rewards/margins_max": 0.08759422600269318, + "rewards/margins_min": 0.03267688676714897, + "rewards/margins_std": 0.03883242979645729, + "rewards/rejected": -0.01989452913403511, + "step": 2390 + }, + { + "epoch": 0.76, + "grad_norm": 0.921875, + "learning_rate": 8.526694888295355e-08, + "logits/chosen": -1.3630679845809937, + "logits/rejected": -1.0612514019012451, + "logps/chosen": -223.59716796875, + "logps/rejected": -237.7313690185547, + "loss": 0.6612, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03609809651970863, + "rewards/margins": 0.06568726152181625, + "rewards/margins_max": 0.08733747154474258, + "rewards/margins_min": 0.04403705149888992, + "rewards/margins_std": 0.03061802126467228, + "rewards/rejected": -0.02958916500210762, + "step": 2400 + }, + { + "epoch": 0.76, + "grad_norm": 0.42578125, + "learning_rate": 8.320840522194505e-08, + "logits/chosen": -1.3517181873321533, + "logits/rejected": -1.105916142463684, + "logps/chosen": -233.48831176757812, + "logps/rejected": -236.3004913330078, + "loss": 0.6624, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03493565320968628, + "rewards/margins": 0.061103563755750656, + "rewards/margins_max": 0.09413080662488937, + "rewards/margins_min": 0.02807632088661194, + "rewards/margins_std": 0.046707578003406525, + "rewards/rejected": -0.026167908683419228, + "step": 2410 + }, + { + "epoch": 0.76, + "grad_norm": 0.400390625, + "learning_rate": 8.117004304602052e-08, + "logits/chosen": -1.4049303531646729, + "logits/rejected": -0.988071620464325, + "logps/chosen": -274.993896484375, + "logps/rejected": -221.76278686523438, + "loss": 0.6689, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.037282317876815796, + "rewards/margins": 0.049982473254203796, + "rewards/margins_max": 0.07475082576274872, + "rewards/margins_min": 0.025214115157723427, + "rewards/margins_std": 0.035027749836444855, + "rewards/rejected": -0.01270015724003315, + "step": 2420 + }, + { + "epoch": 0.77, + "grad_norm": 0.3359375, + "learning_rate": 7.915210899336283e-08, + "logits/chosen": -1.5335876941680908, + "logits/rejected": -1.1939712762832642, + "logps/chosen": -214.1549530029297, + "logps/rejected": -259.2396545410156, + "loss": 0.6641, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0516619011759758, + "rewards/margins": 0.06498047709465027, + "rewards/margins_max": 0.09797366708517075, + "rewards/margins_min": 0.03198728710412979, + "rewards/margins_std": 0.046659428626298904, + "rewards/rejected": -0.013318580575287342, + "step": 2430 + }, + { + "epoch": 0.77, + "grad_norm": 0.50390625, + "learning_rate": 7.715484723038837e-08, + "logits/chosen": -1.1930948495864868, + "logits/rejected": -0.938764750957489, + "logps/chosen": -220.21621704101562, + "logps/rejected": -254.71420288085938, + "loss": 0.6654, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.027883481234312057, + "rewards/margins": 0.060532040894031525, + "rewards/margins_max": 0.0814305990934372, + "rewards/margins_min": 0.03963347524404526, + "rewards/margins_std": 0.029555032029747963, + "rewards/rejected": -0.03264855593442917, + "step": 2440 + }, + { + "epoch": 0.77, + "grad_norm": 0.412109375, + "learning_rate": 7.517849942220348e-08, + "logits/chosen": -1.288425087928772, + "logits/rejected": -0.9016556739807129, + "logps/chosen": -207.7607421875, + "logps/rejected": -215.00808715820312, + "loss": 0.6621, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.033341165632009506, + "rewards/margins": 0.05306249111890793, + "rewards/margins_max": 0.07986272126436234, + "rewards/margins_min": 0.026262247934937477, + "rewards/margins_std": 0.03790125995874405, + "rewards/rejected": -0.019721319898962975, + "step": 2450 + }, + { + "epoch": 0.77, + "grad_norm": 0.48828125, + "learning_rate": 7.322330470336313e-08, + "logits/chosen": -1.3114441633224487, + "logits/rejected": -1.1315343379974365, + "logps/chosen": -204.32998657226562, + "logps/rejected": -197.68760681152344, + "loss": 0.6669, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.023209819570183754, + "rewards/margins": 0.04553220421075821, + "rewards/margins_max": 0.06928315758705139, + "rewards/margins_min": 0.021781256422400475, + "rewards/margins_std": 0.033588919788599014, + "rewards/rejected": -0.022322386503219604, + "step": 2460 + }, + { + "epoch": 0.78, + "grad_norm": 0.390625, + "learning_rate": 7.128949964893646e-08, + "logits/chosen": -1.4030101299285889, + "logits/rejected": -1.0203847885131836, + "logps/chosen": -246.3531951904297, + "logps/rejected": -231.5150604248047, + "loss": 0.6645, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.041862308979034424, + "rewards/margins": 0.06798645108938217, + "rewards/margins_max": 0.09953634440898895, + "rewards/margins_min": 0.036436546593904495, + "rewards/margins_std": 0.04461830109357834, + "rewards/rejected": -0.026124143972992897, + "step": 2470 + }, + { + "epoch": 0.78, + "grad_norm": 0.2890625, + "learning_rate": 6.937731824588141e-08, + "logits/chosen": -1.3225687742233276, + "logits/rejected": -1.2012965679168701, + "logps/chosen": -161.27560424804688, + "logps/rejected": -162.04849243164062, + "loss": 0.6697, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.024264657869935036, + "rewards/margins": 0.04363849759101868, + "rewards/margins_max": 0.06468725949525833, + "rewards/margins_min": 0.02258973941206932, + "rewards/margins_std": 0.029767444357275963, + "rewards/rejected": -0.01937383972108364, + "step": 2480 + }, + { + "epoch": 0.78, + "grad_norm": 0.4140625, + "learning_rate": 6.74869918647325e-08, + "logits/chosen": -1.2273991107940674, + "logits/rejected": -0.8869683146476746, + "logps/chosen": -242.3751678466797, + "logps/rejected": -222.62265014648438, + "loss": 0.6686, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04028186947107315, + "rewards/margins": 0.05239185690879822, + "rewards/margins_max": 0.07009953260421753, + "rewards/margins_min": 0.03468417376279831, + "rewards/margins_std": 0.025042440742254257, + "rewards/rejected": -0.012109987437725067, + "step": 2490 + }, + { + "epoch": 0.79, + "grad_norm": 0.443359375, + "learning_rate": 6.56187492316059e-08, + "logits/chosen": -1.3965575695037842, + "logits/rejected": -0.9450374841690063, + "logps/chosen": -220.7981414794922, + "logps/rejected": -155.75204467773438, + "loss": 0.6613, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.02550414577126503, + "rewards/margins": 0.06219423562288284, + "rewards/margins_max": 0.08597894012928009, + "rewards/margins_min": 0.0384095273911953, + "rewards/margins_std": 0.033636655658483505, + "rewards/rejected": -0.036690086126327515, + "step": 2500 + }, + { + "epoch": 0.79, + "grad_norm": 0.447265625, + "learning_rate": 6.377281640052357e-08, + "logits/chosen": -1.5471882820129395, + "logits/rejected": -1.1804416179656982, + "logps/chosen": -192.26565551757812, + "logps/rejected": -246.218994140625, + "loss": 0.6628, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04850774258375168, + "rewards/margins": 0.06347064673900604, + "rewards/margins_max": 0.10149389505386353, + "rewards/margins_min": 0.025447404012084007, + "rewards/margins_std": 0.05377299338579178, + "rewards/rejected": -0.014962906017899513, + "step": 2510 + }, + { + "epoch": 0.79, + "grad_norm": 0.40625, + "learning_rate": 6.19494167260613e-08, + "logits/chosen": -1.425964117050171, + "logits/rejected": -1.0960302352905273, + "logps/chosen": -184.11727905273438, + "logps/rejected": -191.51913452148438, + "loss": 0.6597, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0385814905166626, + "rewards/margins": 0.061642616987228394, + "rewards/margins_max": 0.09265581518411636, + "rewards/margins_min": 0.030629415065050125, + "rewards/margins_std": 0.04385928437113762, + "rewards/rejected": -0.023061122745275497, + "step": 2520 + }, + { + "epoch": 0.8, + "grad_norm": 0.39453125, + "learning_rate": 6.01487708363232e-08, + "logits/chosen": -1.4187657833099365, + "logits/rejected": -1.0462344884872437, + "logps/chosen": -231.49960327148438, + "logps/rejected": -250.50973510742188, + "loss": 0.6599, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04294178634881973, + "rewards/margins": 0.06284011900424957, + "rewards/margins_max": 0.0833154022693634, + "rewards/margins_min": 0.042364828288555145, + "rewards/margins_std": 0.02895643189549446, + "rewards/rejected": -0.01989833451807499, + "step": 2530 + }, + { + "epoch": 0.8, + "grad_norm": 0.52734375, + "learning_rate": 5.837109660624606e-08, + "logits/chosen": -1.3851536512374878, + "logits/rejected": -1.0157699584960938, + "logps/chosen": -226.1177978515625, + "logps/rejected": -238.81539916992188, + "loss": 0.663, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03696604073047638, + "rewards/margins": 0.06656143069267273, + "rewards/margins_max": 0.0883278027176857, + "rewards/margins_min": 0.04479505866765976, + "rewards/margins_std": 0.030782291665673256, + "rewards/rejected": -0.02959538996219635, + "step": 2540 + }, + { + "epoch": 0.8, + "grad_norm": 0.3359375, + "learning_rate": 5.6616609131236725e-08, + "logits/chosen": -1.5234705209732056, + "logits/rejected": -1.249939203262329, + "logps/chosen": -209.16690063476562, + "logps/rejected": -201.7328643798828, + "loss": 0.666, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.042094189673662186, + "rewards/margins": 0.05819466710090637, + "rewards/margins_max": 0.08667898923158646, + "rewards/margins_min": 0.029710358008742332, + "rewards/margins_std": 0.040282897651195526, + "rewards/rejected": -0.016100479289889336, + "step": 2550 + }, + { + "epoch": 0.81, + "grad_norm": 0.59765625, + "learning_rate": 5.4885520701146324e-08, + "logits/chosen": -1.27875816822052, + "logits/rejected": -0.9493977427482605, + "logps/chosen": -214.4361572265625, + "logps/rejected": -233.2643280029297, + "loss": 0.6636, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.034729085862636566, + "rewards/margins": 0.06826482713222504, + "rewards/margins_max": 0.11228573322296143, + "rewards/margins_min": 0.024243932217359543, + "rewards/margins_std": 0.06225494667887688, + "rewards/rejected": -0.03353574126958847, + "step": 2560 + }, + { + "epoch": 0.81, + "grad_norm": 0.46484375, + "learning_rate": 5.3178040774583236e-08, + "logits/chosen": -1.4629589319229126, + "logits/rejected": -0.9861122965812683, + "logps/chosen": -280.67486572265625, + "logps/rejected": -271.3564147949219, + "loss": 0.6655, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03766489028930664, + "rewards/margins": 0.06308640539646149, + "rewards/margins_max": 0.08921506255865097, + "rewards/margins_min": 0.036957744508981705, + "rewards/margins_std": 0.03695150464773178, + "rewards/rejected": -0.025421511381864548, + "step": 2570 + }, + { + "epoch": 0.81, + "grad_norm": 0.451171875, + "learning_rate": 5.149437595356901e-08, + "logits/chosen": -1.3392517566680908, + "logits/rejected": -0.9539203643798828, + "logps/chosen": -244.0900421142578, + "logps/rejected": -216.6325225830078, + "loss": 0.6665, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03929731249809265, + "rewards/margins": 0.05567679926753044, + "rewards/margins_max": 0.08472796529531479, + "rewards/margins_min": 0.026625623926520348, + "rewards/margins_std": 0.041084565222263336, + "rewards/rejected": -0.01637948676943779, + "step": 2580 + }, + { + "epoch": 0.82, + "grad_norm": 0.490234375, + "learning_rate": 4.9834729958540016e-08, + "logits/chosen": -1.3185430765151978, + "logits/rejected": -0.9537866711616516, + "logps/chosen": -255.76937866210938, + "logps/rejected": -173.42251586914062, + "loss": 0.6619, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03467049077153206, + "rewards/margins": 0.05125656723976135, + "rewards/margins_max": 0.07643640786409378, + "rewards/margins_min": 0.026076724752783775, + "rewards/margins_std": 0.03560966998338699, + "rewards/rejected": -0.016586078330874443, + "step": 2590 + }, + { + "epoch": 0.82, + "grad_norm": 0.26171875, + "learning_rate": 4.8199303603697614e-08, + "logits/chosen": -1.4323641061782837, + "logits/rejected": -1.1901360750198364, + "logps/chosen": -212.28759765625, + "logps/rejected": -251.69052124023438, + "loss": 0.6676, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03305204585194588, + "rewards/margins": 0.04768746346235275, + "rewards/margins_max": 0.0705387219786644, + "rewards/margins_min": 0.024836191907525063, + "rewards/margins_std": 0.03231657296419144, + "rewards/rejected": -0.014635416679084301, + "step": 2600 + }, + { + "epoch": 0.82, + "grad_norm": 0.56640625, + "learning_rate": 4.658829477270995e-08, + "logits/chosen": -1.4831786155700684, + "logits/rejected": -1.0595829486846924, + "logps/chosen": -205.73196411132812, + "logps/rejected": -281.29119873046875, + "loss": 0.6601, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04150627925992012, + "rewards/margins": 0.06586649268865585, + "rewards/margins_max": 0.08460468798875809, + "rewards/margins_min": 0.047128308564424515, + "rewards/margins_std": 0.026499798521399498, + "rewards/rejected": -0.02436022460460663, + "step": 2610 + }, + { + "epoch": 0.83, + "grad_norm": 0.37109375, + "learning_rate": 4.5001898394768336e-08, + "logits/chosen": -1.4085218906402588, + "logits/rejected": -1.1751958131790161, + "logps/chosen": -211.86831665039062, + "logps/rejected": -212.71908569335938, + "loss": 0.6666, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.02613511122763157, + "rewards/margins": 0.05009372904896736, + "rewards/margins_max": 0.07305373251438141, + "rewards/margins_min": 0.027133729308843613, + "rewards/margins_std": 0.03247034177184105, + "rewards/rejected": -0.023958619683980942, + "step": 2620 + }, + { + "epoch": 0.83, + "grad_norm": 0.50390625, + "learning_rate": 4.3440306421001324e-08, + "logits/chosen": -1.531702995300293, + "logits/rejected": -1.2762770652770996, + "logps/chosen": -264.6157531738281, + "logps/rejected": -239.91134643554688, + "loss": 0.6656, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03299673646688461, + "rewards/margins": 0.05226144194602966, + "rewards/margins_max": 0.08147990703582764, + "rewards/margins_min": 0.023042969405651093, + "rewards/margins_std": 0.041321154683828354, + "rewards/rejected": -0.019264699891209602, + "step": 2630 + }, + { + "epoch": 0.83, + "grad_norm": 0.3046875, + "learning_rate": 4.190370780124863e-08, + "logits/chosen": -1.2897651195526123, + "logits/rejected": -1.0072309970855713, + "logps/chosen": -186.4278564453125, + "logps/rejected": -243.1654815673828, + "loss": 0.669, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.032657403498888016, + "rewards/margins": 0.049304358661174774, + "rewards/margins_max": 0.07682739198207855, + "rewards/margins_min": 0.021781327202916145, + "rewards/margins_std": 0.038923438638448715, + "rewards/rejected": -0.01664695516228676, + "step": 2640 + }, + { + "epoch": 0.83, + "grad_norm": 0.345703125, + "learning_rate": 4.0392288461199045e-08, + "logits/chosen": -1.2460219860076904, + "logits/rejected": -1.0387169122695923, + "logps/chosen": -224.2480926513672, + "logps/rejected": -217.1114959716797, + "loss": 0.6665, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.033198267221450806, + "rewards/margins": 0.061884719878435135, + "rewards/margins_max": 0.09340154379606247, + "rewards/margins_min": 0.0303678959608078, + "rewards/margins_std": 0.04457152262330055, + "rewards/rejected": -0.02868645451962948, + "step": 2650 + }, + { + "epoch": 0.84, + "grad_norm": 0.37890625, + "learning_rate": 3.8906231279893423e-08, + "logits/chosen": -1.231979250907898, + "logits/rejected": -1.0273730754852295, + "logps/chosen": -233.99267578125, + "logps/rejected": -187.70278930664062, + "loss": 0.6644, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.02670123614370823, + "rewards/margins": 0.05759245902299881, + "rewards/margins_max": 0.09034743160009384, + "rewards/margins_min": 0.024837475270032883, + "rewards/margins_std": 0.0463225394487381, + "rewards/rejected": -0.03089122101664543, + "step": 2660 + }, + { + "epoch": 0.84, + "grad_norm": 0.369140625, + "learning_rate": 3.74457160675965e-08, + "logits/chosen": -1.3447935581207275, + "logits/rejected": -1.003073811531067, + "logps/chosen": -207.041015625, + "logps/rejected": -198.29556274414062, + "loss": 0.6667, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03199172765016556, + "rewards/margins": 0.05442710965871811, + "rewards/margins_max": 0.07237287610769272, + "rewards/margins_min": 0.036481358110904694, + "rewards/margins_std": 0.025379130616784096, + "rewards/rejected": -0.02243538200855255, + "step": 2670 + }, + { + "epoch": 0.84, + "grad_norm": 0.373046875, + "learning_rate": 3.601091954404062e-08, + "logits/chosen": -1.2016583681106567, + "logits/rejected": -0.9326213002204895, + "logps/chosen": -238.39126586914062, + "logps/rejected": -243.9576416015625, + "loss": 0.6645, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.026809915900230408, + "rewards/margins": 0.04789675027132034, + "rewards/margins_max": 0.07007952034473419, + "rewards/margins_min": 0.025713974609971046, + "rewards/margins_std": 0.03137117996811867, + "rewards/rejected": -0.021086832508444786, + "step": 2680 + }, + { + "epoch": 0.85, + "grad_norm": 0.44921875, + "learning_rate": 3.460201531704263e-08, + "logits/chosen": -1.3697774410247803, + "logits/rejected": -0.8151613473892212, + "logps/chosen": -393.69189453125, + "logps/rejected": -246.65817260742188, + "loss": 0.6595, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03834783285856247, + "rewards/margins": 0.07543188333511353, + "rewards/margins_max": 0.10231365263462067, + "rewards/margins_min": 0.04855012148618698, + "rewards/margins_std": 0.038016561418771744, + "rewards/rejected": -0.037084050476551056, + "step": 2690 + }, + { + "epoch": 0.85, + "grad_norm": 0.38671875, + "learning_rate": 3.321917386149772e-08, + "logits/chosen": -1.4533543586730957, + "logits/rejected": -1.0557693243026733, + "logps/chosen": -209.1657257080078, + "logps/rejected": -214.769287109375, + "loss": 0.6642, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.042614761739969254, + "rewards/margins": 0.05421183258295059, + "rewards/margins_max": 0.08117054402828217, + "rewards/margins_min": 0.02725311741232872, + "rewards/margins_std": 0.038125377148389816, + "rewards/rejected": -0.011597072705626488, + "step": 2700 + }, + { + "epoch": 0.85, + "grad_norm": 0.412109375, + "learning_rate": 3.1862562498752354e-08, + "logits/chosen": -1.4646778106689453, + "logits/rejected": -1.1616142988204956, + "logps/chosen": -192.743408203125, + "logps/rejected": -208.6314697265625, + "loss": 0.6701, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03919973596930504, + "rewards/margins": 0.048980120569467545, + "rewards/margins_max": 0.06728260964155197, + "rewards/margins_min": 0.030677635222673416, + "rewards/margins_std": 0.02588362991809845, + "rewards/rejected": -0.009780386462807655, + "step": 2710 + }, + { + "epoch": 0.86, + "grad_norm": 0.4453125, + "learning_rate": 3.053234537635857e-08, + "logits/chosen": -1.5152153968811035, + "logits/rejected": -1.1075925827026367, + "logps/chosen": -182.39224243164062, + "logps/rejected": -248.19351196289062, + "loss": 0.6603, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.038503944873809814, + "rewards/margins": 0.06572575867176056, + "rewards/margins_max": 0.09247289597988129, + "rewards/margins_min": 0.03897860646247864, + "rewards/margins_std": 0.03782618045806885, + "rewards/rejected": -0.027221810072660446, + "step": 2720 + }, + { + "epoch": 0.86, + "grad_norm": 0.50390625, + "learning_rate": 2.922868344821236e-08, + "logits/chosen": -1.3224998712539673, + "logits/rejected": -0.881952166557312, + "logps/chosen": -220.5806121826172, + "logps/rejected": -189.16519165039062, + "loss": 0.6628, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03971542418003082, + "rewards/margins": 0.06064347177743912, + "rewards/margins_max": 0.08912724256515503, + "rewards/margins_min": 0.03215969726443291, + "rewards/margins_std": 0.04028213769197464, + "rewards/rejected": -0.020928047597408295, + "step": 2730 + }, + { + "epoch": 0.86, + "grad_norm": 0.404296875, + "learning_rate": 2.7951734455078786e-08, + "logits/chosen": -1.4898918867111206, + "logits/rejected": -0.9584072828292847, + "logps/chosen": -253.1838836669922, + "logps/rejected": -262.40740966796875, + "loss": 0.6648, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.04760271683335304, + "rewards/margins": 0.06493322551250458, + "rewards/margins_max": 0.088630810379982, + "rewards/margins_min": 0.041235629469156265, + "rewards/margins_std": 0.03351346030831337, + "rewards/rejected": -0.017330504953861237, + "step": 2740 + }, + { + "epoch": 0.87, + "grad_norm": 0.41796875, + "learning_rate": 2.670165290550544e-08, + "logits/chosen": -1.386683464050293, + "logits/rejected": -0.9467649459838867, + "logps/chosen": -209.0247802734375, + "logps/rejected": -213.07766723632812, + "loss": 0.6661, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03524526581168175, + "rewards/margins": 0.05571124702692032, + "rewards/margins_max": 0.0755188837647438, + "rewards/margins_min": 0.03590361401438713, + "rewards/margins_std": 0.028012219816446304, + "rewards/rejected": -0.02046598121523857, + "step": 2750 + }, + { + "epoch": 0.87, + "grad_norm": 0.380859375, + "learning_rate": 2.5478590057127268e-08, + "logits/chosen": -1.4220774173736572, + "logits/rejected": -1.0289338827133179, + "logps/chosen": -211.04776000976562, + "logps/rejected": -194.3659210205078, + "loss": 0.6609, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03885051980614662, + "rewards/margins": 0.06495725363492966, + "rewards/margins_max": 0.09185833483934402, + "rewards/margins_min": 0.038056183606386185, + "rewards/margins_std": 0.03804386407136917, + "rewards/rejected": -0.026106741279363632, + "step": 2760 + }, + { + "epoch": 0.87, + "grad_norm": 0.38671875, + "learning_rate": 2.4282693898364432e-08, + "logits/chosen": -1.4226223230361938, + "logits/rejected": -0.9696500897407532, + "logps/chosen": -176.65994262695312, + "logps/rejected": -180.49554443359375, + "loss": 0.6622, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04054059833288193, + "rewards/margins": 0.06953348219394684, + "rewards/margins_max": 0.09578864276409149, + "rewards/margins_min": 0.04327831417322159, + "rewards/margins_std": 0.03713040426373482, + "rewards/rejected": -0.02899288199841976, + "step": 2770 + }, + { + "epoch": 0.88, + "grad_norm": 0.328125, + "learning_rate": 2.3114109130516424e-08, + "logits/chosen": -1.3210171461105347, + "logits/rejected": -0.9485718607902527, + "logps/chosen": -182.39852905273438, + "logps/rejected": -210.02572631835938, + "loss": 0.6639, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03208126500248909, + "rewards/margins": 0.06662876158952713, + "rewards/margins_max": 0.09643807262182236, + "rewards/margins_min": 0.036819443106651306, + "rewards/margins_std": 0.04215674102306366, + "rewards/rejected": -0.03454749658703804, + "step": 2780 + }, + { + "epoch": 0.88, + "grad_norm": 0.302734375, + "learning_rate": 2.1972977150253064e-08, + "logits/chosen": -1.5038772821426392, + "logits/rejected": -0.935627818107605, + "logps/chosen": -247.6013641357422, + "logps/rejected": -288.5367431640625, + "loss": 0.6649, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04114503413438797, + "rewards/margins": 0.07020784169435501, + "rewards/margins_max": 0.11474663019180298, + "rewards/margins_min": 0.025669043883681297, + "rewards/margins_std": 0.06298737227916718, + "rewards/rejected": -0.02906280755996704, + "step": 2790 + }, + { + "epoch": 0.88, + "grad_norm": 0.57421875, + "learning_rate": 2.085943603250595e-08, + "logits/chosen": -1.428411841392517, + "logits/rejected": -1.1179401874542236, + "logps/chosen": -189.0768585205078, + "logps/rejected": -204.92483520507812, + "loss": 0.6648, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.037042513489723206, + "rewards/margins": 0.06584902107715607, + "rewards/margins_max": 0.08738056570291519, + "rewards/margins_min": 0.04431745409965515, + "rewards/margins_std": 0.03045022115111351, + "rewards/rejected": -0.028806498274207115, + "step": 2800 + }, + { + "epoch": 0.89, + "grad_norm": 0.31640625, + "learning_rate": 1.977362051376158e-08, + "logits/chosen": -1.4192006587982178, + "logits/rejected": -1.046197772026062, + "logps/chosen": -207.2356414794922, + "logps/rejected": -183.64010620117188, + "loss": 0.6652, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.040088996291160583, + "rewards/margins": 0.05597255378961563, + "rewards/margins_max": 0.08285551518201828, + "rewards/margins_min": 0.02908957563340664, + "rewards/margins_std": 0.03801826387643814, + "rewards/rejected": -0.01588355377316475, + "step": 2810 + }, + { + "epoch": 0.89, + "grad_norm": 0.416015625, + "learning_rate": 1.8715661975758524e-08, + "logits/chosen": -1.2246617078781128, + "logits/rejected": -1.0061366558074951, + "logps/chosen": -167.17034912109375, + "logps/rejected": -239.23721313476562, + "loss": 0.6592, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03663500398397446, + "rewards/margins": 0.065810427069664, + "rewards/margins_max": 0.09848222881555557, + "rewards/margins_min": 0.03313861042261124, + "rewards/margins_std": 0.04620492085814476, + "rewards/rejected": -0.029175419360399246, + "step": 2820 + }, + { + "epoch": 0.89, + "grad_norm": 0.427734375, + "learning_rate": 1.768568842959037e-08, + "logits/chosen": -1.4292596578598022, + "logits/rejected": -1.0080540180206299, + "logps/chosen": -259.69537353515625, + "logps/rejected": -236.2705078125, + "loss": 0.6597, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03333950787782669, + "rewards/margins": 0.06502407789230347, + "rewards/margins_max": 0.08750364929437637, + "rewards/margins_min": 0.04254449903964996, + "rewards/margins_std": 0.03179091960191727, + "rewards/rejected": -0.031684570014476776, + "step": 2830 + }, + { + "epoch": 0.89, + "grad_norm": 0.41015625, + "learning_rate": 1.668382450021666e-08, + "logits/chosen": -1.3095591068267822, + "logits/rejected": -1.0401207208633423, + "logps/chosen": -206.1196746826172, + "logps/rejected": -171.01002502441406, + "loss": 0.6647, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03997686505317688, + "rewards/margins": 0.05088004469871521, + "rewards/margins_max": 0.07651884853839874, + "rewards/margins_min": 0.025241252034902573, + "rewards/margins_std": 0.03625873476266861, + "rewards/rejected": -0.010903185233473778, + "step": 2840 + }, + { + "epoch": 0.9, + "grad_norm": 0.458984375, + "learning_rate": 1.571019141138366e-08, + "logits/chosen": -1.3637133836746216, + "logits/rejected": -1.0843619108200073, + "logps/chosen": -171.24868774414062, + "logps/rejected": -179.87950134277344, + "loss": 0.6668, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.0334320105612278, + "rewards/margins": 0.048344530165195465, + "rewards/margins_max": 0.06994569301605225, + "rewards/margins_min": 0.026743358001112938, + "rewards/margins_std": 0.030548665672540665, + "rewards/rejected": -0.014912518672645092, + "step": 2850 + }, + { + "epoch": 0.9, + "grad_norm": 0.515625, + "learning_rate": 1.4764906970956142e-08, + "logits/chosen": -1.356999397277832, + "logits/rejected": -1.0233064889907837, + "logps/chosen": -193.38766479492188, + "logps/rejected": -196.44386291503906, + "loss": 0.6634, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03667105361819267, + "rewards/margins": 0.05664552003145218, + "rewards/margins_max": 0.07828361541032791, + "rewards/margins_min": 0.03500741347670555, + "rewards/margins_std": 0.03060089983046055, + "rewards/rejected": -0.01997446082532406, + "step": 2860 + }, + { + "epoch": 0.9, + "grad_norm": 0.455078125, + "learning_rate": 1.3848085556663197e-08, + "logits/chosen": -1.2966177463531494, + "logits/rejected": -0.9208385348320007, + "logps/chosen": -267.82086181640625, + "logps/rejected": -202.51319885253906, + "loss": 0.6643, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.034102655947208405, + "rewards/margins": 0.05611242726445198, + "rewards/margins_max": 0.08146383613348007, + "rewards/margins_min": 0.030761009082198143, + "rewards/margins_std": 0.03585231304168701, + "rewards/rejected": -0.022009767591953278, + "step": 2870 + }, + { + "epoch": 0.91, + "grad_norm": 0.396484375, + "learning_rate": 1.2959838102258535e-08, + "logits/chosen": -1.3745180368423462, + "logits/rejected": -1.0097087621688843, + "logps/chosen": -287.45062255859375, + "logps/rejected": -254.785400390625, + "loss": 0.6658, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03240882605314255, + "rewards/margins": 0.05045477673411369, + "rewards/margins_max": 0.07354002445936203, + "rewards/margins_min": 0.027369529008865356, + "rewards/margins_std": 0.032647471874952316, + "rewards/rejected": -0.018045950680971146, + "step": 2880 + }, + { + "epoch": 0.91, + "grad_norm": 0.322265625, + "learning_rate": 1.2100272084097779e-08, + "logits/chosen": -1.323025107383728, + "logits/rejected": -1.0186015367507935, + "logps/chosen": -183.8828582763672, + "logps/rejected": -250.3448486328125, + "loss": 0.6619, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.0408027246594429, + "rewards/margins": 0.07389940321445465, + "rewards/margins_max": 0.10190100967884064, + "rewards/margins_min": 0.04589778929948807, + "rewards/margins_std": 0.03960026055574417, + "rewards/rejected": -0.03309667855501175, + "step": 2890 + }, + { + "epoch": 0.91, + "grad_norm": 0.373046875, + "learning_rate": 1.1269491508133944e-08, + "logits/chosen": -1.5226811170578003, + "logits/rejected": -0.9228025674819946, + "logps/chosen": -312.5696105957031, + "logps/rejected": -221.99136352539062, + "loss": 0.6626, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0407402329146862, + "rewards/margins": 0.06274916976690292, + "rewards/margins_max": 0.08748480677604675, + "rewards/margins_min": 0.038013529032468796, + "rewards/margins_std": 0.03498147428035736, + "rewards/rejected": -0.02200893685221672, + "step": 2900 + }, + { + "epoch": 0.92, + "grad_norm": 0.447265625, + "learning_rate": 1.0467596897333008e-08, + "logits/chosen": -1.3627498149871826, + "logits/rejected": -0.8954145312309265, + "logps/chosen": -231.42977905273438, + "logps/rejected": -222.2968292236328, + "loss": 0.6601, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04414510354399681, + "rewards/margins": 0.06752908229827881, + "rewards/margins_max": 0.09981563687324524, + "rewards/margins_min": 0.03524252399802208, + "rewards/margins_std": 0.04566008597612381, + "rewards/rejected": -0.02338396944105625, + "step": 2910 + }, + { + "epoch": 0.92, + "grad_norm": 0.341796875, + "learning_rate": 9.694685279510672e-09, + "logits/chosen": -1.3423680067062378, + "logits/rejected": -1.2014684677124023, + "logps/chosen": -185.5139617919922, + "logps/rejected": -232.6142120361328, + "loss": 0.6689, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.022014331072568893, + "rewards/margins": 0.049973584711551666, + "rewards/margins_max": 0.0689433366060257, + "rewards/margins_min": 0.031003836542367935, + "rewards/margins_std": 0.026827272027730942, + "rewards/rejected": -0.027959251776337624, + "step": 2920 + }, + { + "epoch": 0.92, + "grad_norm": 0.375, + "learning_rate": 8.950850175592328e-09, + "logits/chosen": -1.4081456661224365, + "logits/rejected": -1.0961415767669678, + "logps/chosen": -232.86813354492188, + "logps/rejected": -269.9138488769531, + "loss": 0.6679, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.0401880256831646, + "rewards/margins": 0.05734118074178696, + "rewards/margins_max": 0.08120250701904297, + "rewards/margins_min": 0.03347986191511154, + "rewards/margins_std": 0.03374500200152397, + "rewards/rejected": -0.01715315505862236, + "step": 2930 + }, + { + "epoch": 0.93, + "grad_norm": 0.38671875, + "learning_rate": 8.236181588297115e-09, + "logits/chosen": -1.3293626308441162, + "logits/rejected": -0.9906571507453918, + "logps/chosen": -256.72100830078125, + "logps/rejected": -313.48504638671875, + "loss": 0.6643, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03118916228413582, + "rewards/margins": 0.06911532580852509, + "rewards/margins_max": 0.10093537718057632, + "rewards/margins_min": 0.03729528561234474, + "rewards/margins_std": 0.045000337064266205, + "rewards/rejected": -0.037926167249679565, + "step": 2940 + }, + { + "epoch": 0.93, + "grad_norm": 0.375, + "learning_rate": 7.550765991247654e-09, + "logits/chosen": -1.3571122884750366, + "logits/rejected": -0.9799866676330566, + "logps/chosen": -237.0681610107422, + "logps/rejected": -217.8629913330078, + "loss": 0.6643, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.039660923182964325, + "rewards/margins": 0.058103930205106735, + "rewards/margins_max": 0.08864767849445343, + "rewards/margins_min": 0.02756018377840519, + "rewards/margins_std": 0.04319537803530693, + "rewards/rejected": -0.01844300702214241, + "step": 2950 + }, + { + "epoch": 0.93, + "grad_norm": 0.27734375, + "learning_rate": 6.894686318507064e-09, + "logits/chosen": -1.3770530223846436, + "logits/rejected": -1.0678465366363525, + "logps/chosen": -207.478759765625, + "logps/rejected": -254.6818389892578, + "loss": 0.6665, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03722939267754555, + "rewards/margins": 0.057834554463624954, + "rewards/margins_max": 0.07993746548891068, + "rewards/margins_min": 0.035731635987758636, + "rewards/margins_std": 0.0312582366168499, + "rewards/rejected": -0.020605161786079407, + "step": 2960 + }, + { + "epoch": 0.94, + "grad_norm": 0.41015625, + "learning_rate": 6.268021954544095e-09, + "logits/chosen": -1.1451586484909058, + "logits/rejected": -0.9856246709823608, + "logps/chosen": -198.33804321289062, + "logps/rejected": -290.0233459472656, + "loss": 0.6659, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03236705809831619, + "rewards/margins": 0.06491495668888092, + "rewards/margins_max": 0.09603826701641083, + "rewards/margins_min": 0.033791638910770416, + "rewards/margins_std": 0.044015005230903625, + "rewards/rejected": -0.03254788741469383, + "step": 2970 + }, + { + "epoch": 0.94, + "grad_norm": 0.5078125, + "learning_rate": 5.670848724627531e-09, + "logits/chosen": -1.4588849544525146, + "logits/rejected": -1.0730645656585693, + "logps/chosen": -301.3870849609375, + "logps/rejected": -199.2222442626953, + "loss": 0.6658, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.026702869683504105, + "rewards/margins": 0.04165857285261154, + "rewards/margins_max": 0.06404221057891846, + "rewards/margins_min": 0.01927492953836918, + "rewards/margins_std": 0.03165525197982788, + "rewards/rejected": -0.014955705031752586, + "step": 2980 + }, + { + "epoch": 0.94, + "grad_norm": 0.412109375, + "learning_rate": 5.103238885651617e-09, + "logits/chosen": -1.4286754131317139, + "logits/rejected": -0.9818390011787415, + "logps/chosen": -238.0774688720703, + "logps/rejected": -222.6654815673828, + "loss": 0.6611, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03790941461920738, + "rewards/margins": 0.06597913056612015, + "rewards/margins_max": 0.09669280052185059, + "rewards/margins_min": 0.03526546061038971, + "rewards/margins_std": 0.04343568533658981, + "rewards/rejected": -0.028069715946912766, + "step": 2990 + }, + { + "epoch": 0.95, + "grad_norm": 0.484375, + "learning_rate": 4.565261117393249e-09, + "logits/chosen": -1.527706503868103, + "logits/rejected": -1.1605089902877808, + "logps/chosen": -238.7028045654297, + "logps/rejected": -198.54071044921875, + "loss": 0.6641, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.038233425468206406, + "rewards/margins": 0.05112460255622864, + "rewards/margins_max": 0.072813980281353, + "rewards/margins_min": 0.029435228556394577, + "rewards/margins_std": 0.03067341446876526, + "rewards/rejected": -0.01289118267595768, + "step": 3000 + }, + { + "epoch": 0.95, + "grad_norm": 0.392578125, + "learning_rate": 4.056980514201447e-09, + "logits/chosen": -1.3091288805007935, + "logits/rejected": -0.9673709869384766, + "logps/chosen": -203.36215209960938, + "logps/rejected": -215.65908813476562, + "loss": 0.662, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.038450248539447784, + "rewards/margins": 0.06377027928829193, + "rewards/margins_max": 0.09830651432275772, + "rewards/margins_min": 0.0292340274900198, + "rewards/margins_std": 0.048841629177331924, + "rewards/rejected": -0.02532001957297325, + "step": 3010 + }, + { + "epoch": 0.95, + "grad_norm": 0.357421875, + "learning_rate": 3.5784585771215235e-09, + "logits/chosen": -1.3335479497909546, + "logits/rejected": -0.9828931093215942, + "logps/chosen": -176.47000122070312, + "logps/rejected": -178.46786499023438, + "loss": 0.6696, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.031307101249694824, + "rewards/margins": 0.04759521037340164, + "rewards/margins_max": 0.0687338337302208, + "rewards/margins_min": 0.02645658515393734, + "rewards/margins_std": 0.02989453449845314, + "rewards/rejected": -0.016288110986351967, + "step": 3020 + }, + { + "epoch": 0.95, + "grad_norm": 0.390625, + "learning_rate": 3.129753206453201e-09, + "logits/chosen": -1.4696094989776611, + "logits/rejected": -1.032707929611206, + "logps/chosen": -234.9283447265625, + "logps/rejected": -236.0854949951172, + "loss": 0.6626, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0398876890540123, + "rewards/margins": 0.0536864697933197, + "rewards/margins_max": 0.08105526119470596, + "rewards/margins_min": 0.026317689567804337, + "rewards/margins_std": 0.03870530426502228, + "rewards/rejected": -0.013798783533275127, + "step": 3030 + }, + { + "epoch": 0.96, + "grad_norm": 0.33203125, + "learning_rate": 2.7109186947449348e-09, + "logits/chosen": -1.4651210308074951, + "logits/rejected": -1.179198980331421, + "logps/chosen": -185.0526123046875, + "logps/rejected": -206.34677124023438, + "loss": 0.6674, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.037171076983213425, + "rewards/margins": 0.04990251734852791, + "rewards/margins_max": 0.07099257409572601, + "rewards/margins_min": 0.0288124717772007, + "rewards/margins_std": 0.029825836420059204, + "rewards/rejected": -0.012731445021927357, + "step": 3040 + }, + { + "epoch": 0.96, + "grad_norm": 0.330078125, + "learning_rate": 2.322005720224618e-09, + "logits/chosen": -1.2301725149154663, + "logits/rejected": -0.8613675236701965, + "logps/chosen": -176.6241912841797, + "logps/rejected": -234.5286407470703, + "loss": 0.6633, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.040943752974271774, + "rewards/margins": 0.06620831787586212, + "rewards/margins_max": 0.09054501354694366, + "rewards/margins_min": 0.04187161475419998, + "rewards/margins_std": 0.0344172939658165, + "rewards/rejected": -0.02526455745100975, + "step": 3050 + }, + { + "epoch": 0.96, + "grad_norm": 0.349609375, + "learning_rate": 1.9630613406676764e-09, + "logits/chosen": -1.3148514032363892, + "logits/rejected": -1.1194158792495728, + "logps/chosen": -204.06472778320312, + "logps/rejected": -175.95155334472656, + "loss": 0.6675, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.030322005972266197, + "rewards/margins": 0.04166535660624504, + "rewards/margins_max": 0.06148039177060127, + "rewards/margins_min": 0.021850308403372765, + "rewards/margins_std": 0.028022700920701027, + "rewards/rejected": -0.011343345046043396, + "step": 3060 + }, + { + "epoch": 0.97, + "grad_norm": 0.494140625, + "learning_rate": 1.6341289877028486e-09, + "logits/chosen": -1.2309526205062866, + "logits/rejected": -0.9648950695991516, + "logps/chosen": -221.1148223876953, + "logps/rejected": -218.8831024169922, + "loss": 0.6665, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03704274445772171, + "rewards/margins": 0.06422804296016693, + "rewards/margins_max": 0.09260173887014389, + "rewards/margins_min": 0.03585432469844818, + "rewards/margins_std": 0.040126487612724304, + "rewards/rejected": -0.027185291051864624, + "step": 3070 + }, + { + "epoch": 0.97, + "grad_norm": 0.41015625, + "learning_rate": 1.33524846155747e-09, + "logits/chosen": -1.5479004383087158, + "logits/rejected": -1.124626874923706, + "logps/chosen": -272.0228271484375, + "logps/rejected": -232.5234832763672, + "loss": 0.6602, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03827238082885742, + "rewards/margins": 0.06339852511882782, + "rewards/margins_max": 0.08984600752592087, + "rewards/margins_min": 0.03695103898644447, + "rewards/margins_std": 0.03740239515900612, + "rewards/rejected": -0.0251261405646801, + "step": 3080 + }, + { + "epoch": 0.97, + "grad_norm": 0.4296875, + "learning_rate": 1.066455926241383e-09, + "logits/chosen": -1.3203740119934082, + "logits/rejected": -1.0223264694213867, + "logps/chosen": -217.78921508789062, + "logps/rejected": -185.77662658691406, + "loss": 0.6662, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03560353443026543, + "rewards/margins": 0.05814961716532707, + "rewards/margins_max": 0.08662423491477966, + "rewards/margins_min": 0.029674995690584183, + "rewards/margins_std": 0.04026919603347778, + "rewards/rejected": -0.022546080872416496, + "step": 3090 + }, + { + "epoch": 0.98, + "grad_norm": 0.625, + "learning_rate": 8.277839051712698e-10, + "logits/chosen": -1.2869453430175781, + "logits/rejected": -0.9400846362113953, + "logps/chosen": -253.38711547851562, + "logps/rejected": -252.90274047851562, + "loss": 0.6638, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04124082997441292, + "rewards/margins": 0.05371633172035217, + "rewards/margins_max": 0.08102357387542725, + "rewards/margins_min": 0.0264090858399868, + "rewards/margins_std": 0.03861827403306961, + "rewards/rejected": -0.012475499883294106, + "step": 3100 + }, + { + "epoch": 0.98, + "grad_norm": 0.55078125, + "learning_rate": 6.192612772354944e-10, + "logits/chosen": -1.323472499847412, + "logits/rejected": -0.9910783767700195, + "logps/chosen": -250.5233917236328, + "logps/rejected": -254.0393524169922, + "loss": 0.6645, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.043138034641742706, + "rewards/margins": 0.062495727092027664, + "rewards/margins_max": 0.09117720276117325, + "rewards/margins_min": 0.03381425514817238, + "rewards/margins_std": 0.0405617319047451, + "rewards/rejected": -0.019357692450284958, + "step": 3110 + }, + { + "epoch": 0.98, + "grad_norm": 0.30859375, + "learning_rate": 4.4091327329956465e-10, + "logits/chosen": -1.3970682621002197, + "logits/rejected": -1.0630197525024414, + "logps/chosen": -187.95303344726562, + "logps/rejected": -180.37051391601562, + "loss": 0.6629, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04492129012942314, + "rewards/margins": 0.0595441572368145, + "rewards/margins_max": 0.08808682858943939, + "rewards/margins_min": 0.03100150264799595, + "rewards/margins_std": 0.04036542400717735, + "rewards/rejected": -0.014622872695326805, + "step": 3120 + }, + { + "epoch": 0.99, + "grad_norm": 0.44140625, + "learning_rate": 2.927614731534356e-10, + "logits/chosen": -1.3621008396148682, + "logits/rejected": -1.0651832818984985, + "logps/chosen": -214.0552520751953, + "logps/rejected": -293.3019104003906, + "loss": 0.6652, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03748806565999985, + "rewards/margins": 0.06025733798742294, + "rewards/margins_max": 0.08895647525787354, + "rewards/margins_min": 0.031558211892843246, + "rewards/margins_std": 0.04058670252561569, + "rewards/rejected": -0.022769279778003693, + "step": 3130 + }, + { + "epoch": 0.99, + "grad_norm": 0.33984375, + "learning_rate": 1.7482380290034792e-10, + "logits/chosen": -1.4978671073913574, + "logits/rejected": -1.0491201877593994, + "logps/chosen": -187.7884063720703, + "logps/rejected": -193.33639526367188, + "loss": 0.6632, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03827610984444618, + "rewards/margins": 0.06368110328912735, + "rewards/margins_max": 0.08939781039953232, + "rewards/margins_min": 0.03796439617872238, + "rewards/margins_std": 0.036368921399116516, + "rewards/rejected": -0.025404995307326317, + "step": 3140 + }, + { + "epoch": 0.99, + "grad_norm": 0.490234375, + "learning_rate": 8.711453278778535e-11, + "logits/chosen": -1.3394626379013062, + "logits/rejected": -0.8948138356208801, + "logps/chosen": -242.09231567382812, + "logps/rejected": -217.08139038085938, + "loss": 0.6631, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.035616446286439896, + "rewards/margins": 0.06384526938199997, + "rewards/margins_max": 0.09082364290952682, + "rewards/margins_min": 0.03686688840389252, + "rewards/margins_std": 0.03815319389104843, + "rewards/rejected": -0.028228823095560074, + "step": 3150 + }, + { + "epoch": 1.0, + "grad_norm": 0.376953125, + "learning_rate": 2.9644275480772416e-11, + "logits/chosen": -1.425526738166809, + "logits/rejected": -1.098435640335083, + "logps/chosen": -208.4182586669922, + "logps/rejected": -194.59750366210938, + "loss": 0.6706, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03818322345614433, + "rewards/margins": 0.04908495396375656, + "rewards/margins_max": 0.07002463191747665, + "rewards/margins_min": 0.02814526855945587, + "rewards/margins_std": 0.029613185673952103, + "rewards/rejected": -0.010901734232902527, + "step": 3160 + }, + { + "epoch": 1.0, + "grad_norm": 0.51171875, + "learning_rate": 2.419984777790596e-12, + "logits/chosen": -1.3360934257507324, + "logits/rejected": -0.8945194482803345, + "logps/chosen": -228.0156707763672, + "logps/rejected": -237.915283203125, + "loss": 0.6624, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.045156557112932205, + "rewards/margins": 0.06454546749591827, + "rewards/margins_max": 0.10190453380346298, + "rewards/margins_min": 0.02718639373779297, + "rewards/margins_std": 0.05283369496464729, + "rewards/rejected": -0.01938890479505062, + "step": 3170 + }, + { + "epoch": 1.0, + "eval_logits/chosen": -1.0169051885604858, + "eval_logits/rejected": -0.8946173191070557, + "eval_logps/chosen": -322.6468811035156, + "eval_logps/rejected": -313.6658020019531, + "eval_loss": 0.6918271780014038, + "eval_rewards/accuracies": 0.5540000200271606, + "eval_rewards/chosen": 0.02316886931657791, + "eval_rewards/margins": 0.0031846188940107822, + "eval_rewards/margins_max": 0.06275644898414612, + "eval_rewards/margins_min": -0.059831298887729645, + "eval_rewards/margins_std": 0.040721021592617035, + "eval_rewards/rejected": 0.019984247162938118, + "eval_runtime": 1444.6396, + "eval_samples_per_second": 2.769, + "eval_steps_per_second": 0.173, + "step": 3174 + }, + { + "epoch": 1.0, + "step": 3174, + "total_flos": 0.0, + "train_loss": 0.6703614967006065, + "train_runtime": 26793.455, + "train_samples_per_second": 0.948, + "train_steps_per_second": 0.118 + } + ], + "logging_steps": 10, + "max_steps": 3174, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}