diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5294 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 3750, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 1.3333333333333334e-08, + "logits/chosen": -1.5177760124206543, + "logits/rejected": -1.1611042022705078, + "logps/chosen": -309.02911376953125, + "logps/rejected": -848.8409423828125, + "loss": 0.2593, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 1.3333333333333336e-07, + "logits/chosen": -1.5949721336364746, + "logits/rejected": -1.165027379989624, + "logps/chosen": -451.9952697753906, + "logps/rejected": -786.9351806640625, + "loss": 0.2269, + "rewards/accuracies": 0.2777777910232544, + "rewards/chosen": -0.0005774286109954119, + "rewards/margins": -0.000440336880274117, + "rewards/rejected": -0.00013709173072129488, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 2.666666666666667e-07, + "logits/chosen": -1.557755470275879, + "logits/rejected": -1.188490629196167, + "logps/chosen": -457.94342041015625, + "logps/rejected": -653.2659912109375, + "loss": 0.2143, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0007440468179993331, + "rewards/margins": 0.000692694156896323, + "rewards/rejected": 5.135267929290421e-05, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 4.0000000000000003e-07, + "logits/chosen": -1.8446645736694336, + "logits/rejected": -1.1848350763320923, + "logps/chosen": -542.8465576171875, + "logps/rejected": -840.0565185546875, + "loss": 0.1931, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.001833869144320488, + "rewards/margins": 0.002176427748054266, + "rewards/rejected": -0.000342558283591643, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 5.333333333333335e-07, + "logits/chosen": -1.3947551250457764, + "logits/rejected": -1.0338518619537354, + "logps/chosen": -513.7941284179688, + "logps/rejected": -851.7515869140625, + "loss": 0.1978, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0025823800824582577, + "rewards/margins": 0.007740010507404804, + "rewards/rejected": -0.005157629959285259, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 6.666666666666667e-07, + "logits/chosen": -1.544039249420166, + "logits/rejected": -1.1640592813491821, + "logps/chosen": -444.1375427246094, + "logps/rejected": -751.6806640625, + "loss": 0.1943, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.006201503798365593, + "rewards/margins": 0.012399530969560146, + "rewards/rejected": -0.006198027171194553, + "step": 50 + }, + { + "epoch": 0.02, + "learning_rate": 8.000000000000001e-07, + "logits/chosen": -1.7549737691879272, + "logits/rejected": -1.134901523590088, + "logps/chosen": -524.1077880859375, + "logps/rejected": -897.7693481445312, + "loss": 0.1816, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.010963965207338333, + "rewards/margins": 0.028727427124977112, + "rewards/rejected": -0.01776346191763878, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 9.333333333333334e-07, + "logits/chosen": -1.707073450088501, + "logits/rejected": -0.9422414898872375, + "logps/chosen": -418.4791564941406, + "logps/rejected": -887.1856689453125, + "loss": 0.166, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.011874416843056679, + "rewards/margins": 0.05441901832818985, + "rewards/rejected": -0.042544592171907425, + "step": 70 + }, + { + "epoch": 0.02, + "learning_rate": 1.066666666666667e-06, + "logits/chosen": -1.7464405298233032, + "logits/rejected": -1.3240439891815186, + "logps/chosen": -393.67059326171875, + "logps/rejected": -797.724609375, + "loss": 0.1765, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.014196820557117462, + "rewards/margins": 0.05819591134786606, + "rewards/rejected": -0.043999094516038895, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 1.2000000000000002e-06, + "logits/chosen": -1.47357976436615, + "logits/rejected": -0.9985636472702026, + "logps/chosen": -445.72979736328125, + "logps/rejected": -712.3211059570312, + "loss": 0.1603, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.003458250779658556, + "rewards/margins": 0.06728541851043701, + "rewards/rejected": -0.06382717192173004, + "step": 90 + }, + { + "epoch": 0.03, + "learning_rate": 1.3333333333333334e-06, + "logits/chosen": -1.5050207376480103, + "logits/rejected": -1.0137097835540771, + "logps/chosen": -409.77252197265625, + "logps/rejected": -882.01025390625, + "loss": 0.1755, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.011769925244152546, + "rewards/margins": 0.09516488015651703, + "rewards/rejected": -0.1069348007440567, + "step": 100 + }, + { + "epoch": 0.03, + "learning_rate": 1.4666666666666669e-06, + "logits/chosen": -1.508277177810669, + "logits/rejected": -0.9909588098526001, + "logps/chosen": -446.08514404296875, + "logps/rejected": -970.3818359375, + "loss": 0.1234, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.039314642548561096, + "rewards/margins": 0.12522205710411072, + "rewards/rejected": -0.16453669965267181, + "step": 110 + }, + { + "epoch": 0.03, + "learning_rate": 1.6000000000000001e-06, + "logits/chosen": -1.5001076459884644, + "logits/rejected": -1.1417639255523682, + "logps/chosen": -595.5568237304688, + "logps/rejected": -1075.8310546875, + "loss": 0.1178, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11702040582895279, + "rewards/margins": 0.1300191581249237, + "rewards/rejected": -0.2470395863056183, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 1.7333333333333336e-06, + "logits/chosen": -1.6354900598526, + "logits/rejected": -1.1173183917999268, + "logps/chosen": -596.342041015625, + "logps/rejected": -1051.9910888671875, + "loss": 0.122, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1303187757730484, + "rewards/margins": 0.14471343159675598, + "rewards/rejected": -0.2750321924686432, + "step": 130 + }, + { + "epoch": 0.04, + "learning_rate": 1.8666666666666669e-06, + "logits/chosen": -1.857448935508728, + "logits/rejected": -1.1995853185653687, + "logps/chosen": -488.5594177246094, + "logps/rejected": -1067.145263671875, + "loss": 0.1382, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.11596964299678802, + "rewards/margins": 0.1565937101840973, + "rewards/rejected": -0.2725633978843689, + "step": 140 + }, + { + "epoch": 0.04, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": -1.6261777877807617, + "logits/rejected": -1.1077944040298462, + "logps/chosen": -564.3380737304688, + "logps/rejected": -1317.8497314453125, + "loss": 0.0805, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.14445793628692627, + "rewards/margins": 0.2764219641685486, + "rewards/rejected": -0.42087993025779724, + "step": 150 + }, + { + "epoch": 0.04, + "learning_rate": 2.133333333333334e-06, + "logits/chosen": -1.4915621280670166, + "logits/rejected": -1.1237232685089111, + "logps/chosen": -487.80487060546875, + "logps/rejected": -1048.151611328125, + "loss": 0.0999, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.14958631992340088, + "rewards/margins": 0.19907937943935394, + "rewards/rejected": -0.3486657440662384, + "step": 160 + }, + { + "epoch": 0.05, + "learning_rate": 2.266666666666667e-06, + "logits/chosen": -1.462720274925232, + "logits/rejected": -1.1023520231246948, + "logps/chosen": -552.3968505859375, + "logps/rejected": -1198.3912353515625, + "loss": 0.0924, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1677219718694687, + "rewards/margins": 0.21208930015563965, + "rewards/rejected": -0.37981128692626953, + "step": 170 + }, + { + "epoch": 0.05, + "learning_rate": 2.4000000000000003e-06, + "logits/chosen": -1.6897376775741577, + "logits/rejected": -0.9965440630912781, + "logps/chosen": -626.109619140625, + "logps/rejected": -1155.2620849609375, + "loss": 0.0705, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.15819820761680603, + "rewards/margins": 0.23869426548480988, + "rewards/rejected": -0.3968924880027771, + "step": 180 + }, + { + "epoch": 0.05, + "learning_rate": 2.5333333333333338e-06, + "logits/chosen": -1.8232837915420532, + "logits/rejected": -0.823337197303772, + "logps/chosen": -675.5474853515625, + "logps/rejected": -1195.9923095703125, + "loss": 0.1007, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.12152433395385742, + "rewards/margins": 0.2081596404314041, + "rewards/rejected": -0.32968395948410034, + "step": 190 + }, + { + "epoch": 0.05, + "learning_rate": 2.666666666666667e-06, + "logits/chosen": -1.9476888179779053, + "logits/rejected": -1.0242823362350464, + "logps/chosen": -624.99951171875, + "logps/rejected": -1197.134521484375, + "loss": 0.0905, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.12548482418060303, + "rewards/margins": 0.23341400921344757, + "rewards/rejected": -0.3588988482952118, + "step": 200 + }, + { + "epoch": 0.06, + "learning_rate": 2.8000000000000003e-06, + "logits/chosen": -1.518812894821167, + "logits/rejected": -1.046945571899414, + "logps/chosen": -764.122314453125, + "logps/rejected": -1280.217041015625, + "loss": 0.1437, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.23460201919078827, + "rewards/margins": 0.19938690960407257, + "rewards/rejected": -0.4339889585971832, + "step": 210 + }, + { + "epoch": 0.06, + "learning_rate": 2.9333333333333338e-06, + "logits/chosen": -1.6284335851669312, + "logits/rejected": -1.1508177518844604, + "logps/chosen": -523.3719482421875, + "logps/rejected": -1222.5562744140625, + "loss": 0.0616, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14848622679710388, + "rewards/margins": 0.24298810958862305, + "rewards/rejected": -0.39147430658340454, + "step": 220 + }, + { + "epoch": 0.06, + "learning_rate": 3.066666666666667e-06, + "logits/chosen": -1.6719776391983032, + "logits/rejected": -0.9572904706001282, + "logps/chosen": -747.065185546875, + "logps/rejected": -1382.8427734375, + "loss": 0.0855, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23779627680778503, + "rewards/margins": 0.2256799042224884, + "rewards/rejected": -0.4634762406349182, + "step": 230 + }, + { + "epoch": 0.06, + "learning_rate": 3.2000000000000003e-06, + "logits/chosen": -1.840018630027771, + "logits/rejected": -1.1092523336410522, + "logps/chosen": -612.7644653320312, + "logps/rejected": -1217.589111328125, + "loss": 0.0733, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.16975006461143494, + "rewards/margins": 0.2756669223308563, + "rewards/rejected": -0.44541701674461365, + "step": 240 + }, + { + "epoch": 0.07, + "learning_rate": 3.3333333333333333e-06, + "logits/chosen": -1.574760913848877, + "logits/rejected": -1.1205968856811523, + "logps/chosen": -629.0726318359375, + "logps/rejected": -1156.187255859375, + "loss": 0.0998, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13886727392673492, + "rewards/margins": 0.202668234705925, + "rewards/rejected": -0.3415355086326599, + "step": 250 + }, + { + "epoch": 0.07, + "learning_rate": 3.4666666666666672e-06, + "logits/chosen": -1.7936065196990967, + "logits/rejected": -0.9479697942733765, + "logps/chosen": -538.048583984375, + "logps/rejected": -1271.905029296875, + "loss": 0.063, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.10844652354717255, + "rewards/margins": 0.2883257269859314, + "rewards/rejected": -0.39677220582962036, + "step": 260 + }, + { + "epoch": 0.07, + "learning_rate": 3.6000000000000003e-06, + "logits/chosen": -1.7110416889190674, + "logits/rejected": -1.0976063013076782, + "logps/chosen": -601.929443359375, + "logps/rejected": -1087.264404296875, + "loss": 0.074, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.13733436167240143, + "rewards/margins": 0.2208215445280075, + "rewards/rejected": -0.35815590620040894, + "step": 270 + }, + { + "epoch": 0.07, + "learning_rate": 3.7333333333333337e-06, + "logits/chosen": -1.684920310974121, + "logits/rejected": -1.0355182886123657, + "logps/chosen": -502.43438720703125, + "logps/rejected": -1141.5850830078125, + "loss": 0.0975, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10886510461568832, + "rewards/margins": 0.2565527558326721, + "rewards/rejected": -0.3654178977012634, + "step": 280 + }, + { + "epoch": 0.08, + "learning_rate": 3.866666666666667e-06, + "logits/chosen": -1.7269914150238037, + "logits/rejected": -0.9788557887077332, + "logps/chosen": -717.2500610351562, + "logps/rejected": -1336.142333984375, + "loss": 0.0838, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.23452234268188477, + "rewards/margins": 0.29266995191574097, + "rewards/rejected": -0.527192234992981, + "step": 290 + }, + { + "epoch": 0.08, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": -1.7060441970825195, + "logits/rejected": -1.1475598812103271, + "logps/chosen": -730.9443969726562, + "logps/rejected": -1244.9676513671875, + "loss": 0.1341, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.255375474691391, + "rewards/margins": 0.19869598746299744, + "rewards/rejected": -0.4540714621543884, + "step": 300 + }, + { + "epoch": 0.08, + "learning_rate": 4.133333333333333e-06, + "logits/chosen": -1.7061704397201538, + "logits/rejected": -1.1434520483016968, + "logps/chosen": -705.079833984375, + "logps/rejected": -1187.599365234375, + "loss": 0.1019, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13966450095176697, + "rewards/margins": 0.22368240356445312, + "rewards/rejected": -0.3633468747138977, + "step": 310 + }, + { + "epoch": 0.09, + "learning_rate": 4.266666666666668e-06, + "logits/chosen": -2.0864200592041016, + "logits/rejected": -1.0598593950271606, + "logps/chosen": -621.1461181640625, + "logps/rejected": -1218.3201904296875, + "loss": 0.076, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.12446784973144531, + "rewards/margins": 0.285147488117218, + "rewards/rejected": -0.40961527824401855, + "step": 320 + }, + { + "epoch": 0.09, + "learning_rate": 4.4e-06, + "logits/chosen": -1.633683204650879, + "logits/rejected": -1.259135127067566, + "logps/chosen": -594.0384521484375, + "logps/rejected": -1040.244873046875, + "loss": 0.1407, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13469278812408447, + "rewards/margins": 0.18073201179504395, + "rewards/rejected": -0.3154247999191284, + "step": 330 + }, + { + "epoch": 0.09, + "learning_rate": 4.533333333333334e-06, + "logits/chosen": -1.5713467597961426, + "logits/rejected": -1.1714986562728882, + "logps/chosen": -552.28515625, + "logps/rejected": -954.9786376953125, + "loss": 0.102, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.16208061575889587, + "rewards/margins": 0.17426642775535583, + "rewards/rejected": -0.3363470435142517, + "step": 340 + }, + { + "epoch": 0.09, + "learning_rate": 4.666666666666667e-06, + "logits/chosen": -1.383143424987793, + "logits/rejected": -1.1191000938415527, + "logps/chosen": -495.08245849609375, + "logps/rejected": -1157.8853759765625, + "loss": 0.0813, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1548539251089096, + "rewards/margins": 0.23339009284973145, + "rewards/rejected": -0.38824400305747986, + "step": 350 + }, + { + "epoch": 0.1, + "learning_rate": 4.800000000000001e-06, + "logits/chosen": -1.7334524393081665, + "logits/rejected": -1.072506070137024, + "logps/chosen": -742.59814453125, + "logps/rejected": -1228.1810302734375, + "loss": 0.128, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15262000262737274, + "rewards/margins": 0.23166091740131378, + "rewards/rejected": -0.3842809200286865, + "step": 360 + }, + { + "epoch": 0.1, + "learning_rate": 4.933333333333334e-06, + "logits/chosen": -1.8477518558502197, + "logits/rejected": -1.3124161958694458, + "logps/chosen": -677.6192626953125, + "logps/rejected": -1154.7989501953125, + "loss": 0.1155, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.12368879467248917, + "rewards/margins": 0.20320534706115723, + "rewards/rejected": -0.32689422369003296, + "step": 370 + }, + { + "epoch": 0.1, + "learning_rate": 4.999972922944898e-06, + "logits/chosen": -1.8957774639129639, + "logits/rejected": -1.3899834156036377, + "logps/chosen": -547.0092163085938, + "logps/rejected": -1174.024658203125, + "loss": 0.0877, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06287173926830292, + "rewards/margins": 0.2412436455488205, + "rewards/rejected": -0.304115355014801, + "step": 380 + }, + { + "epoch": 0.1, + "learning_rate": 4.999756310023261e-06, + "logits/chosen": -1.791933298110962, + "logits/rejected": -1.1157985925674438, + "logps/chosen": -516.9517211914062, + "logps/rejected": -1007.5950317382812, + "loss": 0.0993, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.06450649350881577, + "rewards/margins": 0.22859111428260803, + "rewards/rejected": -0.293097585439682, + "step": 390 + }, + { + "epoch": 0.11, + "learning_rate": 4.999323102948655e-06, + "logits/chosen": -1.6634111404418945, + "logits/rejected": -0.9339207410812378, + "logps/chosen": -541.6902465820312, + "logps/rejected": -1222.6890869140625, + "loss": 0.0786, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11366814374923706, + "rewards/margins": 0.3344195783138275, + "rewards/rejected": -0.44808775186538696, + "step": 400 + }, + { + "epoch": 0.11, + "learning_rate": 4.998673339256785e-06, + "logits/chosen": -1.7227275371551514, + "logits/rejected": -1.4248206615447998, + "logps/chosen": -612.4100341796875, + "logps/rejected": -1344.530517578125, + "loss": 0.0903, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.16698965430259705, + "rewards/margins": 0.24890287220478058, + "rewards/rejected": -0.41589251160621643, + "step": 410 + }, + { + "epoch": 0.11, + "learning_rate": 4.997807075247147e-06, + "logits/chosen": -1.7536413669586182, + "logits/rejected": -1.3619048595428467, + "logps/chosen": -464.01678466796875, + "logps/rejected": -1163.01953125, + "loss": 0.0692, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.021548744291067123, + "rewards/margins": 0.24685168266296387, + "rewards/rejected": -0.2684004306793213, + "step": 420 + }, + { + "epoch": 0.11, + "learning_rate": 4.996724385978142e-06, + "logits/chosen": -2.028895139694214, + "logits/rejected": -1.2840532064437866, + "logps/chosen": -478.65533447265625, + "logps/rejected": -1112.1922607421875, + "loss": 0.0923, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.00986658688634634, + "rewards/margins": 0.23762169480323792, + "rewards/rejected": -0.24748826026916504, + "step": 430 + }, + { + "epoch": 0.12, + "learning_rate": 4.995425365260585e-06, + "logits/chosen": -1.9320755004882812, + "logits/rejected": -1.3463362455368042, + "logps/chosen": -468.8946838378906, + "logps/rejected": -1079.24951171875, + "loss": 0.0844, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.05752667784690857, + "rewards/margins": 0.25381767749786377, + "rewards/rejected": -0.31134432554244995, + "step": 440 + }, + { + "epoch": 0.12, + "learning_rate": 4.993910125649561e-06, + "logits/chosen": -1.8947498798370361, + "logits/rejected": -1.4494032859802246, + "logps/chosen": -612.285400390625, + "logps/rejected": -1175.610595703125, + "loss": 0.0961, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.20282471179962158, + "rewards/margins": 0.24255582690238953, + "rewards/rejected": -0.4453805387020111, + "step": 450 + }, + { + "epoch": 0.12, + "learning_rate": 4.992178798434684e-06, + "logits/chosen": -1.8909610509872437, + "logits/rejected": -1.2547855377197266, + "logps/chosen": -703.7716064453125, + "logps/rejected": -1249.4072265625, + "loss": 0.0749, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.18841485679149628, + "rewards/margins": 0.24599456787109375, + "rewards/rejected": -0.4344094693660736, + "step": 460 + }, + { + "epoch": 0.13, + "learning_rate": 4.990231533628719e-06, + "logits/chosen": -1.9798517227172852, + "logits/rejected": -1.4687498807907104, + "logps/chosen": -451.41192626953125, + "logps/rejected": -1110.53466796875, + "loss": 0.0835, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.05847315117716789, + "rewards/margins": 0.2581016421318054, + "rewards/rejected": -0.3165748119354248, + "step": 470 + }, + { + "epoch": 0.13, + "learning_rate": 4.988068499954578e-06, + "logits/chosen": -1.9682140350341797, + "logits/rejected": -1.161273717880249, + "logps/chosen": -534.8321533203125, + "logps/rejected": -1069.1895751953125, + "loss": 0.0861, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.08595879375934601, + "rewards/margins": 0.2487233430147171, + "rewards/rejected": -0.3346821367740631, + "step": 480 + }, + { + "epoch": 0.13, + "learning_rate": 4.985689884830711e-06, + "logits/chosen": -1.800855040550232, + "logits/rejected": -1.0105341672897339, + "logps/chosen": -645.858154296875, + "logps/rejected": -1335.704833984375, + "loss": 0.06, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17263874411582947, + "rewards/margins": 0.28426748514175415, + "rewards/rejected": -0.4569062292575836, + "step": 490 + }, + { + "epoch": 0.13, + "learning_rate": 4.983095894354858e-06, + "logits/chosen": -1.845642328262329, + "logits/rejected": -1.124245285987854, + "logps/chosen": -684.5682373046875, + "logps/rejected": -1378.302490234375, + "loss": 0.0689, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19359903037548065, + "rewards/margins": 0.3165056109428406, + "rewards/rejected": -0.5101046562194824, + "step": 500 + }, + { + "epoch": 0.14, + "learning_rate": 4.980286753286196e-06, + "logits/chosen": -1.7543941736221313, + "logits/rejected": -1.139478087425232, + "logps/chosen": -591.703857421875, + "logps/rejected": -1196.9752197265625, + "loss": 0.0752, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.17276380956172943, + "rewards/margins": 0.274630606174469, + "rewards/rejected": -0.44739437103271484, + "step": 510 + }, + { + "epoch": 0.14, + "learning_rate": 4.97726270502586e-06, + "logits/chosen": -1.7981364727020264, + "logits/rejected": -1.2439225912094116, + "logps/chosen": -622.7135620117188, + "logps/rejected": -1289.652587890625, + "loss": 0.0642, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.16895990073680878, + "rewards/margins": 0.27986788749694824, + "rewards/rejected": -0.4488278329372406, + "step": 520 + }, + { + "epoch": 0.14, + "learning_rate": 4.974024011595864e-06, + "logits/chosen": -1.7756052017211914, + "logits/rejected": -1.273600697517395, + "logps/chosen": -779.8721923828125, + "logps/rejected": -1319.716064453125, + "loss": 0.0889, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.19681531190872192, + "rewards/margins": 0.24762988090515137, + "rewards/rejected": -0.4444451928138733, + "step": 530 + }, + { + "epoch": 0.14, + "learning_rate": 4.970570953616383e-06, + "logits/chosen": -1.7580140829086304, + "logits/rejected": -1.2909038066864014, + "logps/chosen": -611.9743041992188, + "logps/rejected": -1292.6343994140625, + "loss": 0.0655, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.14458617568016052, + "rewards/margins": 0.3040325939655304, + "rewards/rejected": -0.44861873984336853, + "step": 540 + }, + { + "epoch": 0.15, + "learning_rate": 4.966903830281449e-06, + "logits/chosen": -2.1057114601135254, + "logits/rejected": -1.096806287765503, + "logps/chosen": -525.5113525390625, + "logps/rejected": -1102.981689453125, + "loss": 0.0727, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09377844631671906, + "rewards/margins": 0.2827639877796173, + "rewards/rejected": -0.3765423893928528, + "step": 550 + }, + { + "epoch": 0.15, + "learning_rate": 4.9630229593330226e-06, + "logits/chosen": -1.8463417291641235, + "logits/rejected": -1.2035672664642334, + "logps/chosen": -658.9556884765625, + "logps/rejected": -1261.522705078125, + "loss": 0.0586, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1989162266254425, + "rewards/margins": 0.25805556774139404, + "rewards/rejected": -0.45697179436683655, + "step": 560 + }, + { + "epoch": 0.15, + "learning_rate": 4.958928677033465e-06, + "logits/chosen": -1.9065221548080444, + "logits/rejected": -1.3213990926742554, + "logps/chosen": -802.2817993164062, + "logps/rejected": -1280.823486328125, + "loss": 0.1215, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.2689030170440674, + "rewards/margins": 0.23530542850494385, + "rewards/rejected": -0.5042084455490112, + "step": 570 + }, + { + "epoch": 0.15, + "learning_rate": 4.954621338136399e-06, + "logits/chosen": -1.796940803527832, + "logits/rejected": -1.1448333263397217, + "logps/chosen": -800.59619140625, + "logps/rejected": -1366.584716796875, + "loss": 0.0668, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.21505789458751678, + "rewards/margins": 0.30636119842529297, + "rewards/rejected": -0.5214190483093262, + "step": 580 + }, + { + "epoch": 0.16, + "learning_rate": 4.95010131585597e-06, + "logits/chosen": -1.6272817850112915, + "logits/rejected": -0.8004865646362305, + "logps/chosen": -684.5340576171875, + "logps/rejected": -1158.3621826171875, + "loss": 0.1052, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.14372439682483673, + "rewards/margins": 0.24541731178760529, + "rewards/rejected": -0.389141708612442, + "step": 590 + }, + { + "epoch": 0.16, + "learning_rate": 4.9453690018345144e-06, + "logits/chosen": -1.791548490524292, + "logits/rejected": -1.2827670574188232, + "logps/chosen": -447.36956787109375, + "logps/rejected": -1077.491943359375, + "loss": 0.0668, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08956549316644669, + "rewards/margins": 0.26021477580070496, + "rewards/rejected": -0.34978026151657104, + "step": 600 + }, + { + "epoch": 0.16, + "learning_rate": 4.940424806108619e-06, + "logits/chosen": -1.989205002784729, + "logits/rejected": -1.3596338033676147, + "logps/chosen": -683.2862548828125, + "logps/rejected": -1159.7490234375, + "loss": 0.1118, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.177694171667099, + "rewards/margins": 0.19292449951171875, + "rewards/rejected": -0.37061864137649536, + "step": 610 + }, + { + "epoch": 0.17, + "learning_rate": 4.935269157073597e-06, + "logits/chosen": -1.8252586126327515, + "logits/rejected": -1.4199360609054565, + "logps/chosen": -499.2095642089844, + "logps/rejected": -1207.395263671875, + "loss": 0.0594, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.09857722371816635, + "rewards/margins": 0.3041486144065857, + "rewards/rejected": -0.40272584557533264, + "step": 620 + }, + { + "epoch": 0.17, + "learning_rate": 4.9299025014463665e-06, + "logits/chosen": -1.646740198135376, + "logits/rejected": -0.9128702878952026, + "logps/chosen": -546.8827514648438, + "logps/rejected": -1260.8980712890625, + "loss": 0.0639, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10494896024465561, + "rewards/margins": 0.3301486670970917, + "rewards/rejected": -0.4350976347923279, + "step": 630 + }, + { + "epoch": 0.17, + "learning_rate": 4.924325304226745e-06, + "logits/chosen": -1.8604828119277954, + "logits/rejected": -1.0761983394622803, + "logps/chosen": -693.2246704101562, + "logps/rejected": -1271.934326171875, + "loss": 0.0518, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19305351376533508, + "rewards/margins": 0.2916993200778961, + "rewards/rejected": -0.4847528040409088, + "step": 640 + }, + { + "epoch": 0.17, + "learning_rate": 4.91853804865716e-06, + "logits/chosen": -1.38680100440979, + "logits/rejected": -0.9548083543777466, + "logps/chosen": -604.3922729492188, + "logps/rejected": -1208.889404296875, + "loss": 0.09, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.18794074654579163, + "rewards/margins": 0.24451354146003723, + "rewards/rejected": -0.43245425820350647, + "step": 650 + }, + { + "epoch": 0.18, + "learning_rate": 4.912541236180779e-06, + "logits/chosen": -1.663114309310913, + "logits/rejected": -1.3176881074905396, + "logps/chosen": -644.1502685546875, + "logps/rejected": -1255.1209716796875, + "loss": 0.107, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.18079988658428192, + "rewards/margins": 0.23088447749614716, + "rewards/rejected": -0.4116843640804291, + "step": 660 + }, + { + "epoch": 0.18, + "learning_rate": 4.9063353863980565e-06, + "logits/chosen": -1.651368498802185, + "logits/rejected": -1.0333479642868042, + "logps/chosen": -622.9571533203125, + "logps/rejected": -1196.442138671875, + "loss": 0.1069, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.13679654896259308, + "rewards/margins": 0.2643422484397888, + "rewards/rejected": -0.4011387825012207, + "step": 670 + }, + { + "epoch": 0.18, + "learning_rate": 4.899921037021719e-06, + "logits/chosen": -1.8946233987808228, + "logits/rejected": -1.4026496410369873, + "logps/chosen": -608.56005859375, + "logps/rejected": -1102.85107421875, + "loss": 0.1145, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18631377816200256, + "rewards/margins": 0.20509126782417297, + "rewards/rejected": -0.39140504598617554, + "step": 680 + }, + { + "epoch": 0.18, + "learning_rate": 4.893298743830168e-06, + "logits/chosen": -1.664089560508728, + "logits/rejected": -1.1638383865356445, + "logps/chosen": -681.1607055664062, + "logps/rejected": -1325.0721435546875, + "loss": 0.0936, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.213485985994339, + "rewards/margins": 0.2793218493461609, + "rewards/rejected": -0.4928078055381775, + "step": 690 + }, + { + "epoch": 0.19, + "learning_rate": 4.88646908061933e-06, + "logits/chosen": -1.7145121097564697, + "logits/rejected": -1.0264990329742432, + "logps/chosen": -640.181396484375, + "logps/rejected": -1224.787841796875, + "loss": 0.0784, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.20638792216777802, + "rewards/margins": 0.2602555751800537, + "rewards/rejected": -0.46664348244667053, + "step": 700 + }, + { + "epoch": 0.19, + "learning_rate": 4.879432639152935e-06, + "logits/chosen": -1.5630238056182861, + "logits/rejected": -0.8563889265060425, + "logps/chosen": -688.0330810546875, + "logps/rejected": -1333.184326171875, + "loss": 0.087, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.20561043918132782, + "rewards/margins": 0.285735547542572, + "rewards/rejected": -0.4913460314273834, + "step": 710 + }, + { + "epoch": 0.19, + "learning_rate": 4.8721900291112415e-06, + "logits/chosen": -1.7675155401229858, + "logits/rejected": -1.3769733905792236, + "logps/chosen": -606.588134765625, + "logps/rejected": -1178.3841552734375, + "loss": 0.0998, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.10641028732061386, + "rewards/margins": 0.26885437965393066, + "rewards/rejected": -0.3752647042274475, + "step": 720 + }, + { + "epoch": 0.19, + "learning_rate": 4.864741878038218e-06, + "logits/chosen": -1.5133328437805176, + "logits/rejected": -1.1089714765548706, + "logps/chosen": -560.0144653320312, + "logps/rejected": -1235.3402099609375, + "loss": 0.085, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.12044046074151993, + "rewards/margins": 0.26784801483154297, + "rewards/rejected": -0.3882884979248047, + "step": 730 + }, + { + "epoch": 0.2, + "learning_rate": 4.857088831287158e-06, + "logits/chosen": -1.8833305835723877, + "logits/rejected": -1.1213592290878296, + "logps/chosen": -614.7459716796875, + "logps/rejected": -1231.1275634765625, + "loss": 0.0612, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.14084835350513458, + "rewards/margins": 0.25714007019996643, + "rewards/rejected": -0.3979884088039398, + "step": 740 + }, + { + "epoch": 0.2, + "learning_rate": 4.849231551964771e-06, + "logits/chosen": -1.6487013101577759, + "logits/rejected": -1.0574976205825806, + "logps/chosen": -638.1517333984375, + "logps/rejected": -1221.0208740234375, + "loss": 0.0773, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.15978939831256866, + "rewards/margins": 0.2666565477848053, + "rewards/rejected": -0.4264459013938904, + "step": 750 + }, + { + "epoch": 0.2, + "learning_rate": 4.841170720873723e-06, + "logits/chosen": -1.6006218194961548, + "logits/rejected": -1.0765124559402466, + "logps/chosen": -648.8009643554688, + "logps/rejected": -1193.5694580078125, + "loss": 0.0903, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.20380504429340363, + "rewards/margins": 0.2502138018608093, + "rewards/rejected": -0.45401889085769653, + "step": 760 + }, + { + "epoch": 0.21, + "learning_rate": 4.832907036453647e-06, + "logits/chosen": -1.6672321557998657, + "logits/rejected": -1.2181921005249023, + "logps/chosen": -757.1592407226562, + "logps/rejected": -1409.3095703125, + "loss": 0.0809, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.24005849659442902, + "rewards/margins": 0.2847335636615753, + "rewards/rejected": -0.5247920155525208, + "step": 770 + }, + { + "epoch": 0.21, + "learning_rate": 4.824441214720629e-06, + "logits/chosen": -1.3765209913253784, + "logits/rejected": -1.0029934644699097, + "logps/chosen": -498.0054626464844, + "logps/rejected": -1238.66943359375, + "loss": 0.0667, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15500691533088684, + "rewards/margins": 0.3080350160598755, + "rewards/rejected": -0.4630419611930847, + "step": 780 + }, + { + "epoch": 0.21, + "learning_rate": 4.815773989205165e-06, + "logits/chosen": -1.7891244888305664, + "logits/rejected": -1.2491934299468994, + "logps/chosen": -556.8863525390625, + "logps/rejected": -1332.5364990234375, + "loss": 0.0617, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.13201333582401276, + "rewards/margins": 0.3241254687309265, + "rewards/rejected": -0.4561387896537781, + "step": 790 + }, + { + "epoch": 0.21, + "learning_rate": 4.806906110888606e-06, + "logits/chosen": -1.6729551553726196, + "logits/rejected": -1.187744379043579, + "logps/chosen": -529.7684936523438, + "logps/rejected": -1193.1453857421875, + "loss": 0.0791, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1251310557126999, + "rewards/margins": 0.2839392423629761, + "rewards/rejected": -0.4090702533721924, + "step": 800 + }, + { + "epoch": 0.22, + "learning_rate": 4.7978383481380865e-06, + "logits/chosen": -1.6085048913955688, + "logits/rejected": -1.2299137115478516, + "logps/chosen": -579.7472534179688, + "logps/rejected": -1075.484130859375, + "loss": 0.0994, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.15328553318977356, + "rewards/margins": 0.2056526243686676, + "rewards/rejected": -0.3589381277561188, + "step": 810 + }, + { + "epoch": 0.22, + "learning_rate": 4.788571486639948e-06, + "logits/chosen": -1.4437693357467651, + "logits/rejected": -0.9531752467155457, + "logps/chosen": -721.705078125, + "logps/rejected": -1422.717529296875, + "loss": 0.0707, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.19354596734046936, + "rewards/margins": 0.2650890648365021, + "rewards/rejected": -0.45863503217697144, + "step": 820 + }, + { + "epoch": 0.22, + "learning_rate": 4.779106329331665e-06, + "logits/chosen": -1.7850446701049805, + "logits/rejected": -1.1975328922271729, + "logps/chosen": -639.4754028320312, + "logps/rejected": -1183.6280517578125, + "loss": 0.1136, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18970054388046265, + "rewards/margins": 0.22527900338172913, + "rewards/rejected": -0.4149795472621918, + "step": 830 + }, + { + "epoch": 0.22, + "learning_rate": 4.769443696332272e-06, + "logits/chosen": -1.6451295614242554, + "logits/rejected": -0.9056906700134277, + "logps/chosen": -704.8148193359375, + "logps/rejected": -1420.8524169921875, + "loss": 0.0473, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17515410482883453, + "rewards/margins": 0.3173523545265198, + "rewards/rejected": -0.4925064444541931, + "step": 840 + }, + { + "epoch": 0.23, + "learning_rate": 4.759584424871302e-06, + "logits/chosen": -1.5979700088500977, + "logits/rejected": -1.1091766357421875, + "logps/chosen": -686.29052734375, + "logps/rejected": -1212.19580078125, + "loss": 0.1128, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.15797309577465057, + "rewards/margins": 0.21523161232471466, + "rewards/rejected": -0.3732047379016876, + "step": 850 + }, + { + "epoch": 0.23, + "learning_rate": 4.749529369216246e-06, + "logits/chosen": -1.7736440896987915, + "logits/rejected": -1.172586441040039, + "logps/chosen": -660.5985107421875, + "logps/rejected": -1275.8603515625, + "loss": 0.0675, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.12020470947027206, + "rewards/margins": 0.2714688181877136, + "rewards/rejected": -0.39167362451553345, + "step": 860 + }, + { + "epoch": 0.23, + "learning_rate": 4.7392794005985324e-06, + "logits/chosen": -1.8436565399169922, + "logits/rejected": -1.348578929901123, + "logps/chosen": -471.02703857421875, + "logps/rejected": -1074.3665771484375, + "loss": 0.0923, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.061982929706573486, + "rewards/margins": 0.23231768608093262, + "rewards/rejected": -0.2943006157875061, + "step": 870 + }, + { + "epoch": 0.23, + "learning_rate": 4.7288354071380415e-06, + "logits/chosen": -1.6346375942230225, + "logits/rejected": -1.053733468055725, + "logps/chosen": -518.6777954101562, + "logps/rejected": -1128.2366943359375, + "loss": 0.0928, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.11276010423898697, + "rewards/margins": 0.24085621535778046, + "rewards/rejected": -0.35361629724502563, + "step": 880 + }, + { + "epoch": 0.24, + "learning_rate": 4.7181982937661485e-06, + "logits/chosen": -1.5796483755111694, + "logits/rejected": -0.9548759460449219, + "logps/chosen": -694.7703857421875, + "logps/rejected": -1379.165771484375, + "loss": 0.0694, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.21314816176891327, + "rewards/margins": 0.28819718956947327, + "rewards/rejected": -0.5013453364372253, + "step": 890 + }, + { + "epoch": 0.24, + "learning_rate": 4.707368982147318e-06, + "logits/chosen": -1.6242942810058594, + "logits/rejected": -1.100235104560852, + "logps/chosen": -656.1912841796875, + "logps/rejected": -1320.0205078125, + "loss": 0.0818, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20513398945331573, + "rewards/margins": 0.2904512286186218, + "rewards/rejected": -0.49558526277542114, + "step": 900 + }, + { + "epoch": 0.24, + "learning_rate": 4.696348410599244e-06, + "logits/chosen": -1.6266180276870728, + "logits/rejected": -1.112475872039795, + "logps/chosen": -615.5531005859375, + "logps/rejected": -1176.023681640625, + "loss": 0.1074, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.21413663029670715, + "rewards/margins": 0.24315333366394043, + "rewards/rejected": -0.45728999376296997, + "step": 910 + }, + { + "epoch": 0.25, + "learning_rate": 4.685137534011549e-06, + "logits/chosen": -1.8083276748657227, + "logits/rejected": -0.9349889755249023, + "logps/chosen": -659.9114990234375, + "logps/rejected": -1210.7021484375, + "loss": 0.0749, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1867416799068451, + "rewards/margins": 0.27049291133880615, + "rewards/rejected": -0.45723456144332886, + "step": 920 + }, + { + "epoch": 0.25, + "learning_rate": 4.673737323763048e-06, + "logits/chosen": -1.5211584568023682, + "logits/rejected": -0.8713824152946472, + "logps/chosen": -557.5753784179688, + "logps/rejected": -1095.0653076171875, + "loss": 0.1147, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1781322956085205, + "rewards/margins": 0.23243267834186554, + "rewards/rejected": -0.41056495904922485, + "step": 930 + }, + { + "epoch": 0.25, + "learning_rate": 4.662148767637578e-06, + "logits/chosen": -1.6907579898834229, + "logits/rejected": -0.92974454164505, + "logps/chosen": -672.3154907226562, + "logps/rejected": -1263.9849853515625, + "loss": 0.0513, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.14491698145866394, + "rewards/margins": 0.2988959848880768, + "rewards/rejected": -0.44381293654441833, + "step": 940 + }, + { + "epoch": 0.25, + "learning_rate": 4.650372869738415e-06, + "logits/chosen": -1.713449239730835, + "logits/rejected": -1.046890139579773, + "logps/chosen": -656.3117065429688, + "logps/rejected": -1259.781982421875, + "loss": 0.0727, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1550595909357071, + "rewards/margins": 0.27939510345458984, + "rewards/rejected": -0.43445467948913574, + "step": 950 + }, + { + "epoch": 0.26, + "learning_rate": 4.638410650401267e-06, + "logits/chosen": -1.6114223003387451, + "logits/rejected": -0.8841923475265503, + "logps/chosen": -619.8568725585938, + "logps/rejected": -1200.001953125, + "loss": 0.092, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.18295657634735107, + "rewards/margins": 0.27934443950653076, + "rewards/rejected": -0.46230101585388184, + "step": 960 + }, + { + "epoch": 0.26, + "learning_rate": 4.626263146105875e-06, + "logits/chosen": -1.8829456567764282, + "logits/rejected": -1.1874229907989502, + "logps/chosen": -612.0319213867188, + "logps/rejected": -1310.8118896484375, + "loss": 0.0626, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15665313601493835, + "rewards/margins": 0.32128262519836426, + "rewards/rejected": -0.4779357314109802, + "step": 970 + }, + { + "epoch": 0.26, + "learning_rate": 4.613931409386196e-06, + "logits/chosen": -1.7148971557617188, + "logits/rejected": -1.1706236600875854, + "logps/chosen": -651.0546875, + "logps/rejected": -1184.420654296875, + "loss": 0.0975, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.15911589562892914, + "rewards/margins": 0.26245635747909546, + "rewards/rejected": -0.4215722680091858, + "step": 980 + }, + { + "epoch": 0.26, + "learning_rate": 4.601416508739211e-06, + "logits/chosen": -1.7349941730499268, + "logits/rejected": -1.0320522785186768, + "logps/chosen": -641.8563842773438, + "logps/rejected": -1191.799072265625, + "loss": 0.0864, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12740136682987213, + "rewards/margins": 0.27434709668159485, + "rewards/rejected": -0.4017484784126282, + "step": 990 + }, + { + "epoch": 0.27, + "learning_rate": 4.588719528532342e-06, + "logits/chosen": -1.7186208963394165, + "logits/rejected": -1.139203429222107, + "logps/chosen": -608.3776245117188, + "logps/rejected": -1194.766845703125, + "loss": 0.0803, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.12015128135681152, + "rewards/margins": 0.28770390152931213, + "rewards/rejected": -0.4078551232814789, + "step": 1000 + }, + { + "epoch": 0.27, + "learning_rate": 4.575841568909494e-06, + "logits/chosen": -1.9281619787216187, + "logits/rejected": -1.021177887916565, + "logps/chosen": -606.2479248046875, + "logps/rejected": -1277.761474609375, + "loss": 0.0629, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12076227366924286, + "rewards/margins": 0.31744498014450073, + "rewards/rejected": -0.4382072389125824, + "step": 1010 + }, + { + "epoch": 0.27, + "learning_rate": 4.562783745695738e-06, + "logits/chosen": -1.772539734840393, + "logits/rejected": -1.1384470462799072, + "logps/chosen": -584.2857666015625, + "logps/rejected": -1120.206787109375, + "loss": 0.0924, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10691050440073013, + "rewards/margins": 0.25287091732025146, + "rewards/rejected": -0.359781414270401, + "step": 1020 + }, + { + "epoch": 0.27, + "learning_rate": 4.549547190300622e-06, + "logits/chosen": -1.6101019382476807, + "logits/rejected": -0.995293915271759, + "logps/chosen": -663.1490478515625, + "logps/rejected": -1269.482666015625, + "loss": 0.0903, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.19160196185112, + "rewards/margins": 0.2655636966228485, + "rewards/rejected": -0.4571656584739685, + "step": 1030 + }, + { + "epoch": 0.28, + "learning_rate": 4.536133049620143e-06, + "logits/chosen": -1.6123756170272827, + "logits/rejected": -0.8887365460395813, + "logps/chosen": -645.6787719726562, + "logps/rejected": -1266.5006103515625, + "loss": 0.0808, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13368944823741913, + "rewards/margins": 0.28590840101242065, + "rewards/rejected": -0.4195978045463562, + "step": 1040 + }, + { + "epoch": 0.28, + "learning_rate": 4.522542485937369e-06, + "logits/chosen": -1.7597821950912476, + "logits/rejected": -1.0739284753799438, + "logps/chosen": -670.746337890625, + "logps/rejected": -1303.915771484375, + "loss": 0.0671, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15525345504283905, + "rewards/margins": 0.28055456280708313, + "rewards/rejected": -0.435808002948761, + "step": 1050 + }, + { + "epoch": 0.28, + "learning_rate": 4.508776676821739e-06, + "logits/chosen": -1.6723902225494385, + "logits/rejected": -1.1365679502487183, + "logps/chosen": -659.1893310546875, + "logps/rejected": -1257.1162109375, + "loss": 0.0961, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18045826256275177, + "rewards/margins": 0.28194642066955566, + "rewards/rejected": -0.46240463852882385, + "step": 1060 + }, + { + "epoch": 0.29, + "learning_rate": 4.494836815027022e-06, + "logits/chosen": -1.674168348312378, + "logits/rejected": -0.8119763135910034, + "logps/chosen": -675.2777099609375, + "logps/rejected": -1269.997802734375, + "loss": 0.1022, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2055848389863968, + "rewards/margins": 0.2812694013118744, + "rewards/rejected": -0.48685422539711, + "step": 1070 + }, + { + "epoch": 0.29, + "learning_rate": 4.4807241083879774e-06, + "logits/chosen": -1.5376824140548706, + "logits/rejected": -0.9127294421195984, + "logps/chosen": -633.2178344726562, + "logps/rejected": -1178.2454833984375, + "loss": 0.0838, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.18428584933280945, + "rewards/margins": 0.24417026340961456, + "rewards/rejected": -0.4284561276435852, + "step": 1080 + }, + { + "epoch": 0.29, + "learning_rate": 4.466439779715696e-06, + "logits/chosen": -1.4871512651443481, + "logits/rejected": -0.8935056924819946, + "logps/chosen": -606.7117309570312, + "logps/rejected": -1183.592041015625, + "loss": 0.0976, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1557384431362152, + "rewards/margins": 0.2602311670780182, + "rewards/rejected": -0.4159695506095886, + "step": 1090 + }, + { + "epoch": 0.29, + "learning_rate": 4.451985066691649e-06, + "logits/chosen": -1.5563310384750366, + "logits/rejected": -1.0307669639587402, + "logps/chosen": -558.4649047851562, + "logps/rejected": -1085.895751953125, + "loss": 0.0918, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1063220277428627, + "rewards/margins": 0.26596084237098694, + "rewards/rejected": -0.37228289246559143, + "step": 1100 + }, + { + "epoch": 0.3, + "learning_rate": 4.437361221760449e-06, + "logits/chosen": -1.692957878112793, + "logits/rejected": -1.1781022548675537, + "logps/chosen": -514.2180786132812, + "logps/rejected": -1231.850341796875, + "loss": 0.0484, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.07757623493671417, + "rewards/margins": 0.30450260639190674, + "rewards/rejected": -0.3820788860321045, + "step": 1110 + }, + { + "epoch": 0.3, + "learning_rate": 4.422569512021332e-06, + "logits/chosen": -1.6798826456069946, + "logits/rejected": -1.1389153003692627, + "logps/chosen": -585.5572509765625, + "logps/rejected": -1191.422607421875, + "loss": 0.083, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.06024783104658127, + "rewards/margins": 0.2825511693954468, + "rewards/rejected": -0.34279894828796387, + "step": 1120 + }, + { + "epoch": 0.3, + "learning_rate": 4.407611219118363e-06, + "logits/chosen": -1.574668526649475, + "logits/rejected": -1.1745796203613281, + "logps/chosen": -573.3482666015625, + "logps/rejected": -1237.1324462890625, + "loss": 0.0851, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14945417642593384, + "rewards/margins": 0.26941633224487305, + "rewards/rejected": -0.4188705384731293, + "step": 1130 + }, + { + "epoch": 0.3, + "learning_rate": 4.3924876391293915e-06, + "logits/chosen": -1.5106923580169678, + "logits/rejected": -1.0384470224380493, + "logps/chosen": -653.4393920898438, + "logps/rejected": -1249.9561767578125, + "loss": 0.0827, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.21140387654304504, + "rewards/margins": 0.26517191529273987, + "rewards/rejected": -0.4765757620334625, + "step": 1140 + }, + { + "epoch": 0.31, + "learning_rate": 4.377200082453748e-06, + "logits/chosen": -1.553951621055603, + "logits/rejected": -1.1727259159088135, + "logps/chosen": -584.9124145507812, + "logps/rejected": -1274.189453125, + "loss": 0.0708, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.17560866475105286, + "rewards/margins": 0.3020893931388855, + "rewards/rejected": -0.47769802808761597, + "step": 1150 + }, + { + "epoch": 0.31, + "learning_rate": 4.361749873698707e-06, + "logits/chosen": -1.6106698513031006, + "logits/rejected": -0.9710136651992798, + "logps/chosen": -683.4093017578125, + "logps/rejected": -1351.5721435546875, + "loss": 0.0537, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1945325881242752, + "rewards/margins": 0.3134706914424896, + "rewards/rejected": -0.5080032348632812, + "step": 1160 + }, + { + "epoch": 0.31, + "learning_rate": 4.346138351564711e-06, + "logits/chosen": -1.4298365116119385, + "logits/rejected": -0.9186006784439087, + "logps/chosen": -629.0538940429688, + "logps/rejected": -1190.7537841796875, + "loss": 0.1065, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1838223785161972, + "rewards/margins": 0.25950607657432556, + "rewards/rejected": -0.44332846999168396, + "step": 1170 + }, + { + "epoch": 0.31, + "learning_rate": 4.330366868729376e-06, + "logits/chosen": -1.5331902503967285, + "logits/rejected": -0.8202164769172668, + "logps/chosen": -719.1092529296875, + "logps/rejected": -1225.5712890625, + "loss": 0.1239, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.23043549060821533, + "rewards/margins": 0.24460101127624512, + "rewards/rejected": -0.47503647208213806, + "step": 1180 + }, + { + "epoch": 0.32, + "learning_rate": 4.3144367917302964e-06, + "logits/chosen": -1.5329627990722656, + "logits/rejected": -1.0226593017578125, + "logps/chosen": -558.7479248046875, + "logps/rejected": -1248.3746337890625, + "loss": 0.0732, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.17867226898670197, + "rewards/margins": 0.27305805683135986, + "rewards/rejected": -0.451730340719223, + "step": 1190 + }, + { + "epoch": 0.32, + "learning_rate": 4.2983495008466285e-06, + "logits/chosen": -1.4446375370025635, + "logits/rejected": -1.2019789218902588, + "logps/chosen": -629.9727172851562, + "logps/rejected": -1310.1802978515625, + "loss": 0.0701, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.19203418493270874, + "rewards/margins": 0.26955386996269226, + "rewards/rejected": -0.461588054895401, + "step": 1200 + }, + { + "epoch": 0.32, + "learning_rate": 4.2821063899795015e-06, + "logits/chosen": -1.5882418155670166, + "logits/rejected": -0.9619570970535278, + "logps/chosen": -633.544921875, + "logps/rejected": -1342.188720703125, + "loss": 0.0702, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15442287921905518, + "rewards/margins": 0.3408200740814209, + "rewards/rejected": -0.49524298310279846, + "step": 1210 + }, + { + "epoch": 0.33, + "learning_rate": 4.265708866531238e-06, + "logits/chosen": -1.6361687183380127, + "logits/rejected": -1.160875916481018, + "logps/chosen": -604.972412109375, + "logps/rejected": -1180.0421142578125, + "loss": 0.0912, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15669772028923035, + "rewards/margins": 0.2758365571498871, + "rewards/rejected": -0.43253427743911743, + "step": 1220 + }, + { + "epoch": 0.33, + "learning_rate": 4.249158351283414e-06, + "logits/chosen": -1.4963274002075195, + "logits/rejected": -1.0508579015731812, + "logps/chosen": -670.7274169921875, + "logps/rejected": -1422.38037109375, + "loss": 0.0547, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.18853971362113953, + "rewards/margins": 0.3481212556362152, + "rewards/rejected": -0.5366610288619995, + "step": 1230 + }, + { + "epoch": 0.33, + "learning_rate": 4.232456278273743e-06, + "logits/chosen": -1.5074360370635986, + "logits/rejected": -1.1942028999328613, + "logps/chosen": -621.916015625, + "logps/rejected": -1097.22705078125, + "loss": 0.1119, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.20438826084136963, + "rewards/margins": 0.19910171627998352, + "rewards/rejected": -0.40348997712135315, + "step": 1240 + }, + { + "epoch": 0.33, + "learning_rate": 4.215604094671835e-06, + "logits/chosen": -1.7932571172714233, + "logits/rejected": -1.156964659690857, + "logps/chosen": -605.7149047851562, + "logps/rejected": -1170.5318603515625, + "loss": 0.0823, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.13441742956638336, + "rewards/margins": 0.26707369089126587, + "rewards/rejected": -0.4014911651611328, + "step": 1250 + }, + { + "epoch": 0.34, + "learning_rate": 4.198603260653792e-06, + "logits/chosen": -1.5639320611953735, + "logits/rejected": -1.011678695678711, + "logps/chosen": -570.6861572265625, + "logps/rejected": -1016.75390625, + "loss": 0.1338, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09839320182800293, + "rewards/margins": 0.16072291135787964, + "rewards/rejected": -0.25911611318588257, + "step": 1260 + }, + { + "epoch": 0.34, + "learning_rate": 4.181455249275701e-06, + "logits/chosen": -1.3490540981292725, + "logits/rejected": -0.8460888862609863, + "logps/chosen": -604.7997436523438, + "logps/rejected": -1211.717529296875, + "loss": 0.0948, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.16084125638008118, + "rewards/margins": 0.24433521926403046, + "rewards/rejected": -0.40517646074295044, + "step": 1270 + }, + { + "epoch": 0.34, + "learning_rate": 4.1641615463459926e-06, + "logits/chosen": -1.4548208713531494, + "logits/rejected": -0.9700274467468262, + "logps/chosen": -662.6041259765625, + "logps/rejected": -1517.4892578125, + "loss": 0.043, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.16144345700740814, + "rewards/margins": 0.34403008222579956, + "rewards/rejected": -0.5054734945297241, + "step": 1280 + }, + { + "epoch": 0.34, + "learning_rate": 4.146723650296701e-06, + "logits/chosen": -1.5642893314361572, + "logits/rejected": -1.012499213218689, + "logps/chosen": -522.3748168945312, + "logps/rejected": -1114.373779296875, + "loss": 0.0893, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.11404868215322495, + "rewards/margins": 0.27995288372039795, + "rewards/rejected": -0.3940015733242035, + "step": 1290 + }, + { + "epoch": 0.35, + "learning_rate": 4.129143072053639e-06, + "logits/chosen": -1.8208191394805908, + "logits/rejected": -1.0344860553741455, + "logps/chosen": -717.9315185546875, + "logps/rejected": -1288.3831787109375, + "loss": 0.0814, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1353410929441452, + "rewards/margins": 0.2894688546657562, + "rewards/rejected": -0.42480993270874023, + "step": 1300 + }, + { + "epoch": 0.35, + "learning_rate": 4.111421334905468e-06, + "logits/chosen": -1.67376708984375, + "logits/rejected": -0.7968643307685852, + "logps/chosen": -653.73193359375, + "logps/rejected": -1237.788330078125, + "loss": 0.0799, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10021106898784637, + "rewards/margins": 0.2965288758277893, + "rewards/rejected": -0.3967399299144745, + "step": 1310 + }, + { + "epoch": 0.35, + "learning_rate": 4.093559974371725e-06, + "logits/chosen": -1.5515210628509521, + "logits/rejected": -1.0218132734298706, + "logps/chosen": -771.3948974609375, + "logps/rejected": -1335.147705078125, + "loss": 0.0912, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.16722288727760315, + "rewards/margins": 0.2642812132835388, + "rewards/rejected": -0.4315040707588196, + "step": 1320 + }, + { + "epoch": 0.35, + "learning_rate": 4.075560538069767e-06, + "logits/chosen": -1.6814053058624268, + "logits/rejected": -1.175160527229309, + "logps/chosen": -561.907470703125, + "logps/rejected": -1060.85498046875, + "loss": 0.1004, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.06503216922283173, + "rewards/margins": 0.25370270013809204, + "rewards/rejected": -0.3187348544597626, + "step": 1330 + }, + { + "epoch": 0.36, + "learning_rate": 4.05742458558068e-06, + "logits/chosen": -1.8086637258529663, + "logits/rejected": -1.3191715478897095, + "logps/chosen": -529.0259399414062, + "logps/rejected": -1079.9012451171875, + "loss": 0.0936, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0950629860162735, + "rewards/margins": 0.24811288714408875, + "rewards/rejected": -0.34317582845687866, + "step": 1340 + }, + { + "epoch": 0.36, + "learning_rate": 4.039153688314146e-06, + "logits/chosen": -1.6986970901489258, + "logits/rejected": -1.07126784324646, + "logps/chosen": -537.1072387695312, + "logps/rejected": -1259.658935546875, + "loss": 0.0665, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07069256901741028, + "rewards/margins": 0.353458434343338, + "rewards/rejected": -0.4241510331630707, + "step": 1350 + }, + { + "epoch": 0.36, + "learning_rate": 4.020749429372286e-06, + "logits/chosen": -1.6649013757705688, + "logits/rejected": -1.1139470338821411, + "logps/chosen": -582.5765991210938, + "logps/rejected": -1291.3966064453125, + "loss": 0.0791, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.08450852334499359, + "rewards/margins": 0.30345186591148376, + "rewards/rejected": -0.38796037435531616, + "step": 1360 + }, + { + "epoch": 0.37, + "learning_rate": 4.002213403412492e-06, + "logits/chosen": -1.6457083225250244, + "logits/rejected": -1.1551355123519897, + "logps/chosen": -443.95770263671875, + "logps/rejected": -1125.097412109375, + "loss": 0.0942, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.04620728641748428, + "rewards/margins": 0.29895836114883423, + "rewards/rejected": -0.3451656699180603, + "step": 1370 + }, + { + "epoch": 0.37, + "learning_rate": 3.983547216509254e-06, + "logits/chosen": -1.8475234508514404, + "logits/rejected": -0.9538863897323608, + "logps/chosen": -532.26171875, + "logps/rejected": -1173.6441650390625, + "loss": 0.0721, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.07093639671802521, + "rewards/margins": 0.30083587765693665, + "rewards/rejected": -0.37177228927612305, + "step": 1380 + }, + { + "epoch": 0.37, + "learning_rate": 3.964752486015001e-06, + "logits/chosen": -1.702235460281372, + "logits/rejected": -1.0087345838546753, + "logps/chosen": -570.5701904296875, + "logps/rejected": -1237.5167236328125, + "loss": 0.0554, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.10062988102436066, + "rewards/margins": 0.29185742139816284, + "rewards/rejected": -0.3924873471260071, + "step": 1390 + }, + { + "epoch": 0.37, + "learning_rate": 3.945830840419966e-06, + "logits/chosen": -1.8062200546264648, + "logits/rejected": -1.2370173931121826, + "logps/chosen": -563.0610961914062, + "logps/rejected": -1202.4017333984375, + "loss": 0.1014, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08197510987520218, + "rewards/margins": 0.284410685300827, + "rewards/rejected": -0.3663857579231262, + "step": 1400 + }, + { + "epoch": 0.38, + "learning_rate": 3.92678391921108e-06, + "logits/chosen": -1.685450792312622, + "logits/rejected": -1.268028974533081, + "logps/chosen": -474.2938537597656, + "logps/rejected": -1090.5147705078125, + "loss": 0.0983, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0882890596985817, + "rewards/margins": 0.26188915967941284, + "rewards/rejected": -0.35017821192741394, + "step": 1410 + }, + { + "epoch": 0.38, + "learning_rate": 3.907613372729916e-06, + "logits/chosen": -1.5756769180297852, + "logits/rejected": -1.1073580980300903, + "logps/chosen": -508.64654541015625, + "logps/rejected": -1192.768798828125, + "loss": 0.0905, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.11147113889455795, + "rewards/margins": 0.2684895396232605, + "rewards/rejected": -0.37996068596839905, + "step": 1420 + }, + { + "epoch": 0.38, + "learning_rate": 3.888320862029699e-06, + "logits/chosen": -1.970336675643921, + "logits/rejected": -1.1683611869812012, + "logps/chosen": -608.2377319335938, + "logps/rejected": -1272.260498046875, + "loss": 0.091, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08651524782180786, + "rewards/margins": 0.3126547038555145, + "rewards/rejected": -0.399169921875, + "step": 1430 + }, + { + "epoch": 0.38, + "learning_rate": 3.868908058731376e-06, + "logits/chosen": -1.7017894983291626, + "logits/rejected": -0.9617505073547363, + "logps/chosen": -684.0576782226562, + "logps/rejected": -1245.509765625, + "loss": 0.0996, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.09601768106222153, + "rewards/margins": 0.29448553919792175, + "rewards/rejected": -0.3905032277107239, + "step": 1440 + }, + { + "epoch": 0.39, + "learning_rate": 3.849376644878783e-06, + "logits/chosen": -1.536409616470337, + "logits/rejected": -1.0880365371704102, + "logps/chosen": -533.0728149414062, + "logps/rejected": -1243.9488525390625, + "loss": 0.0928, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.07541215419769287, + "rewards/margins": 0.28547203540802, + "rewards/rejected": -0.3608841896057129, + "step": 1450 + }, + { + "epoch": 0.39, + "learning_rate": 3.829728312792895e-06, + "logits/chosen": -1.8492475748062134, + "logits/rejected": -1.256415605545044, + "logps/chosen": -434.9085388183594, + "logps/rejected": -1020.18798828125, + "loss": 0.0619, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.04833076149225235, + "rewards/margins": 0.2557259202003479, + "rewards/rejected": -0.30405664443969727, + "step": 1460 + }, + { + "epoch": 0.39, + "learning_rate": 3.8099647649251984e-06, + "logits/chosen": -1.5395265817642212, + "logits/rejected": -1.087846040725708, + "logps/chosen": -591.6812744140625, + "logps/rejected": -1221.6868896484375, + "loss": 0.0932, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.10671044886112213, + "rewards/margins": 0.2588750422000885, + "rewards/rejected": -0.36558544635772705, + "step": 1470 + }, + { + "epoch": 0.39, + "learning_rate": 3.790087713710179e-06, + "logits/chosen": -1.5241081714630127, + "logits/rejected": -0.8990601301193237, + "logps/chosen": -703.971923828125, + "logps/rejected": -1318.2982177734375, + "loss": 0.0708, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.16658341884613037, + "rewards/margins": 0.270012766122818, + "rewards/rejected": -0.436596155166626, + "step": 1480 + }, + { + "epoch": 0.4, + "learning_rate": 3.770098881416945e-06, + "logits/chosen": -1.489585041999817, + "logits/rejected": -1.3004378080368042, + "logps/chosen": -654.8056030273438, + "logps/rejected": -1286.8641357421875, + "loss": 0.0956, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.15169434249401093, + "rewards/margins": 0.25045520067214966, + "rewards/rejected": -0.40214958786964417, + "step": 1490 + }, + { + "epoch": 0.4, + "learning_rate": 3.7500000000000005e-06, + "logits/chosen": -1.6937404870986938, + "logits/rejected": -1.0904152393341064, + "logps/chosen": -521.1519775390625, + "logps/rejected": -1150.9736328125, + "loss": 0.0922, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.09178796410560608, + "rewards/margins": 0.2572135031223297, + "rewards/rejected": -0.3490014672279358, + "step": 1500 + }, + { + "epoch": 0.4, + "learning_rate": 3.7297928109491765e-06, + "logits/chosen": -1.5384725332260132, + "logits/rejected": -1.1151206493377686, + "logps/chosen": -500.30340576171875, + "logps/rejected": -1171.871337890625, + "loss": 0.0648, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.09594957530498505, + "rewards/margins": 0.31415387988090515, + "rewards/rejected": -0.4101034700870514, + "step": 1510 + }, + { + "epoch": 0.41, + "learning_rate": 3.7094790651387414e-06, + "logits/chosen": -1.750771164894104, + "logits/rejected": -1.0950592756271362, + "logps/chosen": -557.678466796875, + "logps/rejected": -1188.5867919921875, + "loss": 0.0778, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.14062079787254333, + "rewards/margins": 0.282297819852829, + "rewards/rejected": -0.4229187071323395, + "step": 1520 + }, + { + "epoch": 0.41, + "learning_rate": 3.689060522675689e-06, + "logits/chosen": -1.5061167478561401, + "logits/rejected": -1.0993683338165283, + "logps/chosen": -719.0147705078125, + "logps/rejected": -1315.07421875, + "loss": 0.0885, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.22822144627571106, + "rewards/margins": 0.24384041130542755, + "rewards/rejected": -0.47206181287765503, + "step": 1530 + }, + { + "epoch": 0.41, + "learning_rate": 3.668538952747236e-06, + "logits/chosen": -1.7315492630004883, + "logits/rejected": -1.098783254623413, + "logps/chosen": -702.6656494140625, + "logps/rejected": -1301.657958984375, + "loss": 0.0741, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.16757100820541382, + "rewards/margins": 0.2810649871826172, + "rewards/rejected": -0.4486359655857086, + "step": 1540 + }, + { + "epoch": 0.41, + "learning_rate": 3.6479161334675294e-06, + "logits/chosen": -1.6738449335098267, + "logits/rejected": -1.0924396514892578, + "logps/chosen": -584.9110107421875, + "logps/rejected": -1123.946533203125, + "loss": 0.0867, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.09195319563150406, + "rewards/margins": 0.2546834349632263, + "rewards/rejected": -0.3466365933418274, + "step": 1550 + }, + { + "epoch": 0.42, + "learning_rate": 3.627193851723577e-06, + "logits/chosen": -1.5191009044647217, + "logits/rejected": -1.2671594619750977, + "logps/chosen": -644.867431640625, + "logps/rejected": -1194.0931396484375, + "loss": 0.1019, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.133761465549469, + "rewards/margins": 0.24754054844379425, + "rewards/rejected": -0.38130199909210205, + "step": 1560 + }, + { + "epoch": 0.42, + "learning_rate": 3.6063739030204226e-06, + "logits/chosen": -1.7906509637832642, + "logits/rejected": -1.3401691913604736, + "logps/chosen": -475.7640075683594, + "logps/rejected": -1105.63623046875, + "loss": 0.0816, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06556513905525208, + "rewards/margins": 0.2934816777706146, + "rewards/rejected": -0.3590467870235443, + "step": 1570 + }, + { + "epoch": 0.42, + "learning_rate": 3.5854580913255706e-06, + "logits/chosen": -1.6345583200454712, + "logits/rejected": -0.9686171412467957, + "logps/chosen": -546.1632080078125, + "logps/rejected": -1137.38232421875, + "loss": 0.0774, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.09245268255472183, + "rewards/margins": 0.2836568355560303, + "rewards/rejected": -0.3761095106601715, + "step": 1580 + }, + { + "epoch": 0.42, + "learning_rate": 3.564448228912682e-06, + "logits/chosen": -1.774083137512207, + "logits/rejected": -1.0874212980270386, + "logps/chosen": -627.0970458984375, + "logps/rejected": -1292.3609619140625, + "loss": 0.0548, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.14228971302509308, + "rewards/margins": 0.29837566614151, + "rewards/rejected": -0.4406653940677643, + "step": 1590 + }, + { + "epoch": 0.43, + "learning_rate": 3.543346136204545e-06, + "logits/chosen": -1.492363691329956, + "logits/rejected": -0.9753513336181641, + "logps/chosen": -715.6846923828125, + "logps/rejected": -1311.7313232421875, + "loss": 0.1043, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.19200663268566132, + "rewards/margins": 0.2355610430240631, + "rewards/rejected": -0.427567720413208, + "step": 1600 + }, + { + "epoch": 0.43, + "learning_rate": 3.522153641615345e-06, + "logits/chosen": -1.6796743869781494, + "logits/rejected": -1.0669097900390625, + "logps/chosen": -651.3643798828125, + "logps/rejected": -1211.4693603515625, + "loss": 0.0636, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.12537550926208496, + "rewards/margins": 0.29044246673583984, + "rewards/rejected": -0.4158180356025696, + "step": 1610 + }, + { + "epoch": 0.43, + "learning_rate": 3.5008725813922383e-06, + "logits/chosen": -1.53739333152771, + "logits/rejected": -1.009767770767212, + "logps/chosen": -771.9571533203125, + "logps/rejected": -1199.3785400390625, + "loss": 0.0922, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.17814821004867554, + "rewards/margins": 0.24210956692695618, + "rewards/rejected": -0.4202577471733093, + "step": 1620 + }, + { + "epoch": 0.43, + "learning_rate": 3.4795047994562463e-06, + "logits/chosen": -1.4524638652801514, + "logits/rejected": -1.051928162574768, + "logps/chosen": -690.6948852539062, + "logps/rejected": -1226.161865234375, + "loss": 0.0876, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.18693287670612335, + "rewards/margins": 0.25937145948410034, + "rewards/rejected": -0.4463043212890625, + "step": 1630 + }, + { + "epoch": 0.44, + "learning_rate": 3.458052147242494e-06, + "logits/chosen": -1.8221120834350586, + "logits/rejected": -1.1258487701416016, + "logps/chosen": -630.0181274414062, + "logps/rejected": -1044.5679931640625, + "loss": 0.1046, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.09280923753976822, + "rewards/margins": 0.23150058090686798, + "rewards/rejected": -0.3243098556995392, + "step": 1640 + }, + { + "epoch": 0.44, + "learning_rate": 3.436516483539781e-06, + "logits/chosen": -1.8554388284683228, + "logits/rejected": -1.3803541660308838, + "logps/chosen": -633.9047241210938, + "logps/rejected": -1229.6116943359375, + "loss": 0.0778, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.13664793968200684, + "rewards/margins": 0.2661344110965729, + "rewards/rejected": -0.4027823805809021, + "step": 1650 + }, + { + "epoch": 0.44, + "learning_rate": 3.4148996743295305e-06, + "logits/chosen": -1.9048486948013306, + "logits/rejected": -1.058411955833435, + "logps/chosen": -720.1047973632812, + "logps/rejected": -1232.9039306640625, + "loss": 0.0732, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12227736413478851, + "rewards/margins": 0.2787812352180481, + "rewards/rejected": -0.4010585844516754, + "step": 1660 + }, + { + "epoch": 0.45, + "learning_rate": 3.3932035926241103e-06, + "logits/chosen": -1.6380399465560913, + "logits/rejected": -1.262209177017212, + "logps/chosen": -482.85662841796875, + "logps/rejected": -1295.7718505859375, + "loss": 0.0462, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.08021150529384613, + "rewards/margins": 0.3560211658477783, + "rewards/rejected": -0.43623265624046326, + "step": 1670 + }, + { + "epoch": 0.45, + "learning_rate": 3.3714301183045382e-06, + "logits/chosen": -1.595003366470337, + "logits/rejected": -1.2377384901046753, + "logps/chosen": -547.7657470703125, + "logps/rejected": -1363.28125, + "loss": 0.0617, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12761729955673218, + "rewards/margins": 0.29965031147003174, + "rewards/rejected": -0.4272676110267639, + "step": 1680 + }, + { + "epoch": 0.45, + "learning_rate": 3.349581137957604e-06, + "logits/chosen": -1.8540780544281006, + "logits/rejected": -1.0956978797912598, + "logps/chosen": -570.6678466796875, + "logps/rejected": -1214.4659423828125, + "loss": 0.0458, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.08135120570659637, + "rewards/margins": 0.32362592220306396, + "rewards/rejected": -0.40497714281082153, + "step": 1690 + }, + { + "epoch": 0.45, + "learning_rate": 3.3276585447123957e-06, + "logits/chosen": -1.4734654426574707, + "logits/rejected": -1.1689575910568237, + "logps/chosen": -471.4612731933594, + "logps/rejected": -1102.623046875, + "loss": 0.0775, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.06378956139087677, + "rewards/margins": 0.2605707049369812, + "rewards/rejected": -0.32436028122901917, + "step": 1700 + }, + { + "epoch": 0.46, + "learning_rate": 3.3056642380762783e-06, + "logits/chosen": -1.6254606246948242, + "logits/rejected": -0.9264505505561829, + "logps/chosen": -598.3116455078125, + "logps/rejected": -1273.1226806640625, + "loss": 0.0645, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.10875244438648224, + "rewards/margins": 0.3382043242454529, + "rewards/rejected": -0.4469567835330963, + "step": 1710 + }, + { + "epoch": 0.46, + "learning_rate": 3.2836001237702993e-06, + "logits/chosen": -1.7050098180770874, + "logits/rejected": -1.1736676692962646, + "logps/chosen": -605.455078125, + "logps/rejected": -1361.8411865234375, + "loss": 0.0707, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.09871726483106613, + "rewards/margins": 0.318486750125885, + "rewards/rejected": -0.41720399260520935, + "step": 1720 + }, + { + "epoch": 0.46, + "learning_rate": 3.2614681135640696e-06, + "logits/chosen": -1.666524887084961, + "logits/rejected": -1.0706026554107666, + "logps/chosen": -633.6300048828125, + "logps/rejected": -1287.6302490234375, + "loss": 0.0617, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.08346323668956757, + "rewards/margins": 0.29316291213035583, + "rewards/rejected": -0.376626193523407, + "step": 1730 + }, + { + "epoch": 0.46, + "learning_rate": 3.2392701251101172e-06, + "logits/chosen": -1.6395193338394165, + "logits/rejected": -1.1502461433410645, + "logps/chosen": -596.9259033203125, + "logps/rejected": -1209.7242431640625, + "loss": 0.0985, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13397231698036194, + "rewards/margins": 0.2410469949245453, + "rewards/rejected": -0.3750193417072296, + "step": 1740 + }, + { + "epoch": 0.47, + "learning_rate": 3.217008081777726e-06, + "logits/chosen": -1.6936429738998413, + "logits/rejected": -1.1541160345077515, + "logps/chosen": -687.9554443359375, + "logps/rejected": -1373.260986328125, + "loss": 0.0775, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.19113728404045105, + "rewards/margins": 0.3089086413383484, + "rewards/rejected": -0.500045895576477, + "step": 1750 + }, + { + "epoch": 0.47, + "learning_rate": 3.1946839124862873e-06, + "logits/chosen": -1.499732255935669, + "logits/rejected": -1.073246955871582, + "logps/chosen": -632.1316528320312, + "logps/rejected": -1340.867919921875, + "loss": 0.055, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.14432887732982635, + "rewards/margins": 0.3110666275024414, + "rewards/rejected": -0.45539551973342896, + "step": 1760 + }, + { + "epoch": 0.47, + "learning_rate": 3.1722995515381644e-06, + "logits/chosen": -1.6296335458755493, + "logits/rejected": -0.9428736567497253, + "logps/chosen": -613.3892822265625, + "logps/rejected": -1241.071533203125, + "loss": 0.0922, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1409432291984558, + "rewards/margins": 0.28491806983947754, + "rewards/rejected": -0.42586126923561096, + "step": 1770 + }, + { + "epoch": 0.47, + "learning_rate": 3.149856938451094e-06, + "logits/chosen": -1.9275051355361938, + "logits/rejected": -0.9073979258537292, + "logps/chosen": -685.4188842773438, + "logps/rejected": -1194.86767578125, + "loss": 0.0869, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15064950287342072, + "rewards/margins": 0.25674349069595337, + "rewards/rejected": -0.4073929786682129, + "step": 1780 + }, + { + "epoch": 0.48, + "learning_rate": 3.127358017790132e-06, + "logits/chosen": -1.5247657299041748, + "logits/rejected": -0.8251350522041321, + "logps/chosen": -658.7485961914062, + "logps/rejected": -1302.989013671875, + "loss": 0.0565, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.15536119043827057, + "rewards/margins": 0.306736022233963, + "rewards/rejected": -0.46209725737571716, + "step": 1790 + }, + { + "epoch": 0.48, + "learning_rate": 3.1048047389991693e-06, + "logits/chosen": -1.5092096328735352, + "logits/rejected": -0.8953585624694824, + "logps/chosen": -563.54638671875, + "logps/rejected": -1277.382080078125, + "loss": 0.0635, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.12220227718353271, + "rewards/margins": 0.32350245118141174, + "rewards/rejected": -0.44570469856262207, + "step": 1800 + }, + { + "epoch": 0.48, + "learning_rate": 3.082199056232015e-06, + "logits/chosen": -1.555259108543396, + "logits/rejected": -1.2513010501861572, + "logps/chosen": -576.5824584960938, + "logps/rejected": -1193.6871337890625, + "loss": 0.1014, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.14716719090938568, + "rewards/margins": 0.2701808214187622, + "rewards/rejected": -0.4173479974269867, + "step": 1810 + }, + { + "epoch": 0.49, + "learning_rate": 3.059542928183079e-06, + "logits/chosen": -1.2370166778564453, + "logits/rejected": -0.862767219543457, + "logps/chosen": -630.3663330078125, + "logps/rejected": -1336.5428466796875, + "loss": 0.0544, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1622113287448883, + "rewards/margins": 0.32105860114097595, + "rewards/rejected": -0.48326998949050903, + "step": 1820 + }, + { + "epoch": 0.49, + "learning_rate": 3.0368383179176584e-06, + "logits/chosen": -1.385969877243042, + "logits/rejected": -1.0176479816436768, + "logps/chosen": -719.9691772460938, + "logps/rejected": -1240.3895263671875, + "loss": 0.1078, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.19016364216804504, + "rewards/margins": 0.2406831681728363, + "rewards/rejected": -0.43084684014320374, + "step": 1830 + }, + { + "epoch": 0.49, + "learning_rate": 3.0140871927018466e-06, + "logits/chosen": -1.6181665658950806, + "logits/rejected": -1.2087422609329224, + "logps/chosen": -704.0010986328125, + "logps/rejected": -1382.859619140625, + "loss": 0.0552, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19687633216381073, + "rewards/margins": 0.28999894857406616, + "rewards/rejected": -0.4868752360343933, + "step": 1840 + }, + { + "epoch": 0.49, + "learning_rate": 2.9912915238320755e-06, + "logits/chosen": -1.4043359756469727, + "logits/rejected": -1.1318188905715942, + "logps/chosen": -555.2379760742188, + "logps/rejected": -1354.341796875, + "loss": 0.0577, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.16941991448402405, + "rewards/margins": 0.33636412024497986, + "rewards/rejected": -0.5057840347290039, + "step": 1850 + }, + { + "epoch": 0.5, + "learning_rate": 2.9684532864643123e-06, + "logits/chosen": -1.7377849817276, + "logits/rejected": -1.16031014919281, + "logps/chosen": -689.5167236328125, + "logps/rejected": -1164.914794921875, + "loss": 0.1023, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.21689483523368835, + "rewards/margins": 0.22367532551288605, + "rewards/rejected": -0.440570205450058, + "step": 1860 + }, + { + "epoch": 0.5, + "learning_rate": 2.945574459442917e-06, + "logits/chosen": -1.6468284130096436, + "logits/rejected": -1.190332055091858, + "logps/chosen": -639.2093505859375, + "logps/rejected": -1291.997802734375, + "loss": 0.0592, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1794385313987732, + "rewards/margins": 0.31004798412323, + "rewards/rejected": -0.4894865155220032, + "step": 1870 + }, + { + "epoch": 0.5, + "learning_rate": 2.922657025129185e-06, + "logits/chosen": -1.5556018352508545, + "logits/rejected": -1.0828845500946045, + "logps/chosen": -718.0322875976562, + "logps/rejected": -1318.2115478515625, + "loss": 0.0983, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.27457764744758606, + "rewards/margins": 0.23540663719177246, + "rewards/rejected": -0.5099843144416809, + "step": 1880 + }, + { + "epoch": 0.5, + "learning_rate": 2.8997029692295875e-06, + "logits/chosen": -1.66329026222229, + "logits/rejected": -1.0540707111358643, + "logps/chosen": -647.1304931640625, + "logps/rejected": -1394.345703125, + "loss": 0.0644, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16953504085540771, + "rewards/margins": 0.3324953019618988, + "rewards/rejected": -0.5020303726196289, + "step": 1890 + }, + { + "epoch": 0.51, + "learning_rate": 2.876714280623708e-06, + "logits/chosen": -1.3675628900527954, + "logits/rejected": -0.910226047039032, + "logps/chosen": -656.5228881835938, + "logps/rejected": -1325.244140625, + "loss": 0.0585, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.18668699264526367, + "rewards/margins": 0.3124103248119354, + "rewards/rejected": -0.4990972876548767, + "step": 1900 + }, + { + "epoch": 0.51, + "learning_rate": 2.8536929511919227e-06, + "logits/chosen": -1.5113346576690674, + "logits/rejected": -0.917544960975647, + "logps/chosen": -621.9835815429688, + "logps/rejected": -1159.2508544921875, + "loss": 0.0598, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.17902755737304688, + "rewards/margins": 0.25234436988830566, + "rewards/rejected": -0.43137192726135254, + "step": 1910 + }, + { + "epoch": 0.51, + "learning_rate": 2.8306409756428067e-06, + "logits/chosen": -1.56842041015625, + "logits/rejected": -0.9856363534927368, + "logps/chosen": -565.5784301757812, + "logps/rejected": -1276.6402587890625, + "loss": 0.0583, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.158840149641037, + "rewards/margins": 0.29064539074897766, + "rewards/rejected": -0.4494854807853699, + "step": 1920 + }, + { + "epoch": 0.51, + "learning_rate": 2.807560351340302e-06, + "logits/chosen": -1.5603129863739014, + "logits/rejected": -1.0442321300506592, + "logps/chosen": -594.1895751953125, + "logps/rejected": -1224.7371826171875, + "loss": 0.0977, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.17820130288600922, + "rewards/margins": 0.2835688889026642, + "rewards/rejected": -0.4617701470851898, + "step": 1930 + }, + { + "epoch": 0.52, + "learning_rate": 2.7844530781306544e-06, + "logits/chosen": -1.436647653579712, + "logits/rejected": -0.8475536108016968, + "logps/chosen": -703.9013061523438, + "logps/rejected": -1465.6171875, + "loss": 0.0612, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.22263050079345703, + "rewards/margins": 0.3488808274269104, + "rewards/rejected": -0.5715113282203674, + "step": 1940 + }, + { + "epoch": 0.52, + "learning_rate": 2.761321158169134e-06, + "logits/chosen": -1.6537196636199951, + "logits/rejected": -1.1116868257522583, + "logps/chosen": -717.2203369140625, + "logps/rejected": -1459.016357421875, + "loss": 0.0578, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.19066976010799408, + "rewards/margins": 0.35232046246528625, + "rewards/rejected": -0.5429901480674744, + "step": 1950 + }, + { + "epoch": 0.52, + "learning_rate": 2.738166595746554e-06, + "logits/chosen": -1.709814429283142, + "logits/rejected": -0.9134047627449036, + "logps/chosen": -705.0501708984375, + "logps/rejected": -1441.9495849609375, + "loss": 0.0426, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.172596737742424, + "rewards/margins": 0.34574776887893677, + "rewards/rejected": -0.518344521522522, + "step": 1960 + }, + { + "epoch": 0.53, + "learning_rate": 2.7149913971156105e-06, + "logits/chosen": -1.6743762493133545, + "logits/rejected": -1.134342908859253, + "logps/chosen": -682.0380859375, + "logps/rejected": -1288.8563232421875, + "loss": 0.0783, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.21863889694213867, + "rewards/margins": 0.30050721764564514, + "rewards/rejected": -0.5191460847854614, + "step": 1970 + }, + { + "epoch": 0.53, + "learning_rate": 2.6917975703170466e-06, + "logits/chosen": -1.384412407875061, + "logits/rejected": -0.9328360557556152, + "logps/chosen": -756.0421752929688, + "logps/rejected": -1321.971923828125, + "loss": 0.0844, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2494826316833496, + "rewards/margins": 0.2922648787498474, + "rewards/rejected": -0.541747510433197, + "step": 1980 + }, + { + "epoch": 0.53, + "learning_rate": 2.668587125005663e-06, + "logits/chosen": -1.573412299156189, + "logits/rejected": -0.9849382638931274, + "logps/chosen": -653.5593872070312, + "logps/rejected": -1219.503173828125, + "loss": 0.0674, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.22859041392803192, + "rewards/margins": 0.2729692757129669, + "rewards/rejected": -0.5015596747398376, + "step": 1990 + }, + { + "epoch": 0.53, + "learning_rate": 2.6453620722761897e-06, + "logits/chosen": -1.6217581033706665, + "logits/rejected": -0.829363226890564, + "logps/chosen": -687.14794921875, + "logps/rejected": -1367.03515625, + "loss": 0.0515, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.2579112648963928, + "rewards/margins": 0.3186108469963074, + "rewards/rejected": -0.5765220522880554, + "step": 2000 + }, + { + "epoch": 0.54, + "learning_rate": 2.6221244244890336e-06, + "logits/chosen": -1.4822559356689453, + "logits/rejected": -0.8854954838752747, + "logps/chosen": -803.3583984375, + "logps/rejected": -1373.9749755859375, + "loss": 0.0823, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2859230637550354, + "rewards/margins": 0.2797744870185852, + "rewards/rejected": -0.5656975507736206, + "step": 2010 + }, + { + "epoch": 0.54, + "learning_rate": 2.5988761950959133e-06, + "logits/chosen": -1.405379056930542, + "logits/rejected": -0.980889618396759, + "logps/chosen": -653.2147216796875, + "logps/rejected": -1412.9344482421875, + "loss": 0.0489, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19988027215003967, + "rewards/margins": 0.3394158184528351, + "rewards/rejected": -0.5392960906028748, + "step": 2020 + }, + { + "epoch": 0.54, + "learning_rate": 2.575619398465402e-06, + "logits/chosen": -1.5412838459014893, + "logits/rejected": -0.9163180589675903, + "logps/chosen": -539.6043090820312, + "logps/rejected": -1107.312255859375, + "loss": 0.0771, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1628035455942154, + "rewards/margins": 0.2565108835697174, + "rewards/rejected": -0.419314444065094, + "step": 2030 + }, + { + "epoch": 0.54, + "learning_rate": 2.5523560497083927e-06, + "logits/chosen": -1.482742428779602, + "logits/rejected": -1.0331344604492188, + "logps/chosen": -751.7644653320312, + "logps/rejected": -1281.9117431640625, + "loss": 0.0847, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.21723918616771698, + "rewards/margins": 0.26914697885513306, + "rewards/rejected": -0.48638615012168884, + "step": 2040 + }, + { + "epoch": 0.55, + "learning_rate": 2.5290881645034932e-06, + "logits/chosen": -1.6871871948242188, + "logits/rejected": -0.9969019889831543, + "logps/chosen": -712.149169921875, + "logps/rejected": -1383.6920166015625, + "loss": 0.0813, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2281813621520996, + "rewards/margins": 0.3213092088699341, + "rewards/rejected": -0.5494905710220337, + "step": 2050 + }, + { + "epoch": 0.55, + "learning_rate": 2.5058177589223766e-06, + "logits/chosen": -1.4727680683135986, + "logits/rejected": -1.048156976699829, + "logps/chosen": -657.0037231445312, + "logps/rejected": -1363.59814453125, + "loss": 0.0745, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2256421595811844, + "rewards/margins": 0.31263765692710876, + "rewards/rejected": -0.5382798314094543, + "step": 2060 + }, + { + "epoch": 0.55, + "learning_rate": 2.482546849255096e-06, + "logits/chosen": -1.5077242851257324, + "logits/rejected": -0.8984912633895874, + "logps/chosen": -722.7063598632812, + "logps/rejected": -1281.8148193359375, + "loss": 0.0741, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.23482950031757355, + "rewards/margins": 0.291878879070282, + "rewards/rejected": -0.5267083644866943, + "step": 2070 + }, + { + "epoch": 0.55, + "learning_rate": 2.4592774518353858e-06, + "logits/chosen": -1.6423757076263428, + "logits/rejected": -0.9732748866081238, + "logps/chosen": -625.0535888671875, + "logps/rejected": -1348.394287109375, + "loss": 0.0562, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19351795315742493, + "rewards/margins": 0.3293379247188568, + "rewards/rejected": -0.5228558778762817, + "step": 2080 + }, + { + "epoch": 0.56, + "learning_rate": 2.436011582865945e-06, + "logits/chosen": -1.7477773427963257, + "logits/rejected": -1.0844796895980835, + "logps/chosen": -786.804443359375, + "logps/rejected": -1510.250732421875, + "loss": 0.0565, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.24655666947364807, + "rewards/margins": 0.34261250495910645, + "rewards/rejected": -0.5891691446304321, + "step": 2090 + }, + { + "epoch": 0.56, + "learning_rate": 2.4127512582437486e-06, + "logits/chosen": -1.1730453968048096, + "logits/rejected": -1.0297441482543945, + "logps/chosen": -610.2826538085938, + "logps/rejected": -1334.4935302734375, + "loss": 0.075, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.21944193542003632, + "rewards/margins": 0.28680866956710815, + "rewards/rejected": -0.5062506198883057, + "step": 2100 + }, + { + "epoch": 0.56, + "learning_rate": 2.3894984933853734e-06, + "logits/chosen": -1.6909167766571045, + "logits/rejected": -1.2207863330841064, + "logps/chosen": -736.7841796875, + "logps/rejected": -1436.5059814453125, + "loss": 0.0715, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2114332616329193, + "rewards/margins": 0.30715304613113403, + "rewards/rejected": -0.518586277961731, + "step": 2110 + }, + { + "epoch": 0.57, + "learning_rate": 2.366255303052377e-06, + "logits/chosen": -1.7539796829223633, + "logits/rejected": -1.0083894729614258, + "logps/chosen": -702.0449829101562, + "logps/rejected": -1368.424072265625, + "loss": 0.0588, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.17488941550254822, + "rewards/margins": 0.3086225390434265, + "rewards/rejected": -0.4835119843482971, + "step": 2120 + }, + { + "epoch": 0.57, + "learning_rate": 2.3430237011767166e-06, + "logits/chosen": -1.783517837524414, + "logits/rejected": -0.9859122037887573, + "logps/chosen": -631.6397705078125, + "logps/rejected": -1418.53857421875, + "loss": 0.0386, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.13998395204544067, + "rewards/margins": 0.3510037064552307, + "rewards/rejected": -0.490987628698349, + "step": 2130 + }, + { + "epoch": 0.57, + "learning_rate": 2.319805700686257e-06, + "logits/chosen": -1.5874156951904297, + "logits/rejected": -1.0402486324310303, + "logps/chosen": -647.2991943359375, + "logps/rejected": -1264.6134033203125, + "loss": 0.0663, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15432874858379364, + "rewards/margins": 0.29131144285202026, + "rewards/rejected": -0.4456402361392975, + "step": 2140 + }, + { + "epoch": 0.57, + "learning_rate": 2.296603313330355e-06, + "logits/chosen": -1.6740115880966187, + "logits/rejected": -1.049090027809143, + "logps/chosen": -650.9720458984375, + "logps/rejected": -1069.4693603515625, + "loss": 0.1228, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1620035022497177, + "rewards/margins": 0.2206643521785736, + "rewards/rejected": -0.3826678991317749, + "step": 2150 + }, + { + "epoch": 0.58, + "learning_rate": 2.2734185495055503e-06, + "logits/chosen": -1.4526993036270142, + "logits/rejected": -1.0033073425292969, + "logps/chosen": -464.20257568359375, + "logps/rejected": -1249.659423828125, + "loss": 0.0672, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.11725373566150665, + "rewards/margins": 0.32836082577705383, + "rewards/rejected": -0.4456145763397217, + "step": 2160 + }, + { + "epoch": 0.58, + "learning_rate": 2.250253418081373e-06, + "logits/chosen": -1.6637146472930908, + "logits/rejected": -1.0717463493347168, + "logps/chosen": -662.2470703125, + "logps/rejected": -1223.6536865234375, + "loss": 0.0843, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1466962993144989, + "rewards/margins": 0.2738208770751953, + "rewards/rejected": -0.4205172061920166, + "step": 2170 + }, + { + "epoch": 0.58, + "learning_rate": 2.22710992622628e-06, + "logits/chosen": -1.7047134637832642, + "logits/rejected": -0.7803869843482971, + "logps/chosen": -681.6057739257812, + "logps/rejected": -1309.5865478515625, + "loss": 0.057, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15952114760875702, + "rewards/margins": 0.309850811958313, + "rewards/rejected": -0.4693719744682312, + "step": 2180 + }, + { + "epoch": 0.58, + "learning_rate": 2.2039900792337477e-06, + "logits/chosen": -1.5044130086898804, + "logits/rejected": -0.787899374961853, + "logps/chosen": -632.465576171875, + "logps/rejected": -1316.4517822265625, + "loss": 0.0515, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.18237130343914032, + "rewards/margins": 0.32328280806541443, + "rewards/rejected": -0.5056540369987488, + "step": 2190 + }, + { + "epoch": 0.59, + "learning_rate": 2.1808958803485134e-06, + "logits/chosen": -1.6702648401260376, + "logits/rejected": -0.990136444568634, + "logps/chosen": -455.564208984375, + "logps/rejected": -1168.5498046875, + "loss": 0.0599, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12168209254741669, + "rewards/margins": 0.3131280541419983, + "rewards/rejected": -0.434810072183609, + "step": 2200 + }, + { + "epoch": 0.59, + "learning_rate": 2.157829330593008e-06, + "logits/chosen": -1.6868867874145508, + "logits/rejected": -0.9523127675056458, + "logps/chosen": -733.1408081054688, + "logps/rejected": -1337.63818359375, + "loss": 0.0715, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.21179482340812683, + "rewards/margins": 0.30666384100914, + "rewards/rejected": -0.5184586644172668, + "step": 2210 + }, + { + "epoch": 0.59, + "learning_rate": 2.134792428593971e-06, + "logits/chosen": -1.6914653778076172, + "logits/rejected": -1.2282737493515015, + "logps/chosen": -614.1090087890625, + "logps/rejected": -1088.0286865234375, + "loss": 0.1059, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1738576740026474, + "rewards/margins": 0.21695072948932648, + "rewards/rejected": -0.3908084034919739, + "step": 2220 + }, + { + "epoch": 0.59, + "learning_rate": 2.1117871704092818e-06, + "logits/chosen": -1.6489229202270508, + "logits/rejected": -0.9123425483703613, + "logps/chosen": -679.4093017578125, + "logps/rejected": -1336.1690673828125, + "loss": 0.0813, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1528438925743103, + "rewards/margins": 0.3031119108200073, + "rewards/rejected": -0.4559558033943176, + "step": 2230 + }, + { + "epoch": 0.6, + "learning_rate": 2.0888155493550027e-06, + "logits/chosen": -1.658093810081482, + "logits/rejected": -1.2635515928268433, + "logps/chosen": -563.3775634765625, + "logps/rejected": -1217.8135986328125, + "loss": 0.0686, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.12988874316215515, + "rewards/margins": 0.31268182396888733, + "rewards/rejected": -0.44257062673568726, + "step": 2240 + }, + { + "epoch": 0.6, + "learning_rate": 2.0658795558326745e-06, + "logits/chosen": -1.5542079210281372, + "logits/rejected": -0.9705120921134949, + "logps/chosen": -641.7821655273438, + "logps/rejected": -1335.944091796875, + "loss": 0.0586, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1482844054698944, + "rewards/margins": 0.3350493013858795, + "rewards/rejected": -0.48333367705345154, + "step": 2250 + }, + { + "epoch": 0.6, + "learning_rate": 2.0429811771568468e-06, + "logits/chosen": -1.6625282764434814, + "logits/rejected": -0.9575098752975464, + "logps/chosen": -673.6990966796875, + "logps/rejected": -1288.328369140625, + "loss": 0.0408, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1419149488210678, + "rewards/margins": 0.2989000678062439, + "rewards/rejected": -0.4408150315284729, + "step": 2260 + }, + { + "epoch": 0.61, + "learning_rate": 2.0201223973828917e-06, + "logits/chosen": -1.7629448175430298, + "logits/rejected": -1.1014915704727173, + "logps/chosen": -609.2691650390625, + "logps/rejected": -1323.014404296875, + "loss": 0.0751, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.12834301590919495, + "rewards/margins": 0.31924059987068176, + "rewards/rejected": -0.4475835859775543, + "step": 2270 + }, + { + "epoch": 0.61, + "learning_rate": 1.997305197135089e-06, + "logits/chosen": -1.5687427520751953, + "logits/rejected": -0.9063301086425781, + "logps/chosen": -674.9013671875, + "logps/rejected": -1229.5550537109375, + "loss": 0.0873, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12829624116420746, + "rewards/margins": 0.24914589524269104, + "rewards/rejected": -0.3774421215057373, + "step": 2280 + }, + { + "epoch": 0.61, + "learning_rate": 1.9745315534350157e-06, + "logits/chosen": -1.6142327785491943, + "logits/rejected": -1.2041738033294678, + "logps/chosen": -527.4874877929688, + "logps/rejected": -1276.542724609375, + "loss": 0.0724, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.11953765153884888, + "rewards/margins": 0.3008989989757538, + "rewards/rejected": -0.42043668031692505, + "step": 2290 + }, + { + "epoch": 0.61, + "learning_rate": 1.9518034395302413e-06, + "logits/chosen": -1.584478497505188, + "logits/rejected": -0.8746352195739746, + "logps/chosen": -507.75128173828125, + "logps/rejected": -1205.164794921875, + "loss": 0.0654, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12713788449764252, + "rewards/margins": 0.3278200626373291, + "rewards/rejected": -0.4549580216407776, + "step": 2300 + }, + { + "epoch": 0.62, + "learning_rate": 1.9291228247233607e-06, + "logits/chosen": -1.452108383178711, + "logits/rejected": -0.9603986740112305, + "logps/chosen": -750.004638671875, + "logps/rejected": -1415.546875, + "loss": 0.0642, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.22353100776672363, + "rewards/margins": 0.3011419475078583, + "rewards/rejected": -0.5246729254722595, + "step": 2310 + }, + { + "epoch": 0.62, + "learning_rate": 1.9064916742013515e-06, + "logits/chosen": -1.5349475145339966, + "logits/rejected": -0.8534983396530151, + "logps/chosen": -691.6678466796875, + "logps/rejected": -1337.88671875, + "loss": 0.0564, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.19090059399604797, + "rewards/margins": 0.3062039911746979, + "rewards/rejected": -0.49710458517074585, + "step": 2320 + }, + { + "epoch": 0.62, + "learning_rate": 1.883911948865306e-06, + "logits/chosen": -1.6528412103652954, + "logits/rejected": -1.132430076599121, + "logps/chosen": -626.3675537109375, + "logps/rejected": -1352.5123291015625, + "loss": 0.0702, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1741064339876175, + "rewards/margins": 0.31939199566841125, + "rewards/rejected": -0.49349841475486755, + "step": 2330 + }, + { + "epoch": 0.62, + "learning_rate": 1.8613856051605242e-06, + "logits/chosen": -1.8713703155517578, + "logits/rejected": -1.2219023704528809, + "logps/chosen": -479.8798828125, + "logps/rejected": -1113.4814453125, + "loss": 0.0658, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10413724184036255, + "rewards/margins": 0.322685569524765, + "rewards/rejected": -0.42682284116744995, + "step": 2340 + }, + { + "epoch": 0.63, + "learning_rate": 1.8389145949069953e-06, + "logits/chosen": -1.5313884019851685, + "logits/rejected": -1.042482614517212, + "logps/chosen": -540.1700439453125, + "logps/rejected": -1220.364990234375, + "loss": 0.0499, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13704395294189453, + "rewards/margins": 0.29186248779296875, + "rewards/rejected": -0.4289064407348633, + "step": 2350 + }, + { + "epoch": 0.63, + "learning_rate": 1.816500865130279e-06, + "logits/chosen": -1.495516300201416, + "logits/rejected": -1.306896448135376, + "logps/chosen": -562.7477416992188, + "logps/rejected": -1185.534912109375, + "loss": 0.1051, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.14714586734771729, + "rewards/margins": 0.26410627365112305, + "rewards/rejected": -0.4112521708011627, + "step": 2360 + }, + { + "epoch": 0.63, + "learning_rate": 1.7941463578928088e-06, + "logits/chosen": -1.6695563793182373, + "logits/rejected": -0.9721006155014038, + "logps/chosen": -601.9846801757812, + "logps/rejected": -1367.9365234375, + "loss": 0.0603, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.1521584689617157, + "rewards/margins": 0.35412856936454773, + "rewards/rejected": -0.5062869787216187, + "step": 2370 + }, + { + "epoch": 0.63, + "learning_rate": 1.7718530101256115e-06, + "logits/chosen": -1.6784439086914062, + "logits/rejected": -1.0285909175872803, + "logps/chosen": -690.5152587890625, + "logps/rejected": -1287.234130859375, + "loss": 0.083, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.17937234044075012, + "rewards/margins": 0.27991387248039246, + "rewards/rejected": -0.4592861533164978, + "step": 2380 + }, + { + "epoch": 0.64, + "learning_rate": 1.7496227534604859e-06, + "logits/chosen": -1.7998254299163818, + "logits/rejected": -1.1072345972061157, + "logps/chosen": -648.5956420898438, + "logps/rejected": -1205.2398681640625, + "loss": 0.0657, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1724121868610382, + "rewards/margins": 0.29253289103507996, + "rewards/rejected": -0.46494507789611816, + "step": 2390 + }, + { + "epoch": 0.64, + "learning_rate": 1.7274575140626318e-06, + "logits/chosen": -1.6273527145385742, + "logits/rejected": -1.183037519454956, + "logps/chosen": -620.6856689453125, + "logps/rejected": -1215.266845703125, + "loss": 0.0901, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17143599689006805, + "rewards/margins": 0.2781030535697937, + "rewards/rejected": -0.44953903555870056, + "step": 2400 + }, + { + "epoch": 0.64, + "learning_rate": 1.7053592124637557e-06, + "logits/chosen": -1.8067405223846436, + "logits/rejected": -1.2858269214630127, + "logps/chosen": -659.517822265625, + "logps/rejected": -1250.8367919921875, + "loss": 0.0949, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.17256858944892883, + "rewards/margins": 0.2582642436027527, + "rewards/rejected": -0.43083280324935913, + "step": 2410 + }, + { + "epoch": 0.65, + "learning_rate": 1.6833297633956647e-06, + "logits/chosen": -1.443641185760498, + "logits/rejected": -1.0768052339553833, + "logps/chosen": -559.7144775390625, + "logps/rejected": -1292.55859375, + "loss": 0.0641, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.11608362197875977, + "rewards/margins": 0.34168118238449097, + "rewards/rejected": -0.45776480436325073, + "step": 2420 + }, + { + "epoch": 0.65, + "learning_rate": 1.661371075624363e-06, + "logits/chosen": -1.6469463109970093, + "logits/rejected": -1.0461533069610596, + "logps/chosen": -603.0146484375, + "logps/rejected": -1207.7100830078125, + "loss": 0.07, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.10989055782556534, + "rewards/margins": 0.2882770299911499, + "rewards/rejected": -0.39816758036613464, + "step": 2430 + }, + { + "epoch": 0.65, + "learning_rate": 1.6394850517846621e-06, + "logits/chosen": -1.6739981174468994, + "logits/rejected": -1.2458163499832153, + "logps/chosen": -687.8931884765625, + "logps/rejected": -1236.5269775390625, + "loss": 0.0977, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.148878276348114, + "rewards/margins": 0.2945893704891205, + "rewards/rejected": -0.4434676170349121, + "step": 2440 + }, + { + "epoch": 0.65, + "learning_rate": 1.6176735882153284e-06, + "logits/chosen": -1.6518714427947998, + "logits/rejected": -0.9769365191459656, + "logps/chosen": -496.71661376953125, + "logps/rejected": -1090.7801513671875, + "loss": 0.081, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.12074653804302216, + "rewards/margins": 0.26993122696876526, + "rewards/rejected": -0.3906777799129486, + "step": 2450 + }, + { + "epoch": 0.66, + "learning_rate": 1.5959385747947697e-06, + "logits/chosen": -1.5575945377349854, + "logits/rejected": -1.1600332260131836, + "logps/chosen": -541.9180908203125, + "logps/rejected": -1182.4320068359375, + "loss": 0.0797, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.16873383522033691, + "rewards/margins": 0.2602773606777191, + "rewards/rejected": -0.42901119589805603, + "step": 2460 + }, + { + "epoch": 0.66, + "learning_rate": 1.5742818947772875e-06, + "logits/chosen": -1.6753427982330322, + "logits/rejected": -0.9124045372009277, + "logps/chosen": -678.1805419921875, + "logps/rejected": -1311.1553955078125, + "loss": 0.0578, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.17256096005439758, + "rewards/margins": 0.3086041510105133, + "rewards/rejected": -0.4811651110649109, + "step": 2470 + }, + { + "epoch": 0.66, + "learning_rate": 1.552705424629898e-06, + "logits/chosen": -1.5884169340133667, + "logits/rejected": -0.9173520803451538, + "logps/chosen": -705.35009765625, + "logps/rejected": -1284.0096435546875, + "loss": 0.0683, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16716155409812927, + "rewards/margins": 0.28985482454299927, + "rewards/rejected": -0.4570164084434509, + "step": 2480 + }, + { + "epoch": 0.66, + "learning_rate": 1.5312110338697427e-06, + "logits/chosen": -1.533384084701538, + "logits/rejected": -1.1220409870147705, + "logps/chosen": -613.6298828125, + "logps/rejected": -1082.766357421875, + "loss": 0.1039, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17001187801361084, + "rewards/margins": 0.20055198669433594, + "rewards/rejected": -0.3705638647079468, + "step": 2490 + }, + { + "epoch": 0.67, + "learning_rate": 1.509800584902108e-06, + "logits/chosen": -1.4955171346664429, + "logits/rejected": -0.8390592336654663, + "logps/chosen": -611.5145263671875, + "logps/rejected": -1197.4530029296875, + "loss": 0.0597, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12041016668081284, + "rewards/margins": 0.2896363139152527, + "rewards/rejected": -0.4100464880466461, + "step": 2500 + }, + { + "epoch": 0.67, + "learning_rate": 1.4884759328590476e-06, + "logits/chosen": -1.914384126663208, + "logits/rejected": -1.2168171405792236, + "logps/chosen": -600.9666137695312, + "logps/rejected": -1124.942626953125, + "loss": 0.0888, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.11489248275756836, + "rewards/margins": 0.26221710443496704, + "rewards/rejected": -0.3771095871925354, + "step": 2510 + }, + { + "epoch": 0.67, + "learning_rate": 1.467238925438646e-06, + "logits/chosen": -1.738040566444397, + "logits/rejected": -0.8857895731925964, + "logps/chosen": -679.0386962890625, + "logps/rejected": -1410.933837890625, + "loss": 0.0558, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.16262464225292206, + "rewards/margins": 0.339643657207489, + "rewards/rejected": -0.5022683143615723, + "step": 2520 + }, + { + "epoch": 0.67, + "learning_rate": 1.446091402744923e-06, + "logits/chosen": -1.3981173038482666, + "logits/rejected": -0.8543848991394043, + "logps/chosen": -590.76513671875, + "logps/rejected": -1206.71240234375, + "loss": 0.0583, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15895266830921173, + "rewards/margins": 0.2688734829425812, + "rewards/rejected": -0.4278261661529541, + "step": 2530 + }, + { + "epoch": 0.68, + "learning_rate": 1.4250351971283937e-06, + "logits/chosen": -1.6803276538848877, + "logits/rejected": -1.0602971315383911, + "logps/chosen": -679.5074462890625, + "logps/rejected": -1229.011962890625, + "loss": 0.0871, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19001373648643494, + "rewards/margins": 0.26307451725006104, + "rewards/rejected": -0.45308828353881836, + "step": 2540 + }, + { + "epoch": 0.68, + "learning_rate": 1.4040721330273063e-06, + "logits/chosen": -1.5601686239242554, + "logits/rejected": -0.9407947659492493, + "logps/chosen": -616.7720336914062, + "logps/rejected": -1353.0308837890625, + "loss": 0.0501, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16183273494243622, + "rewards/margins": 0.3334823250770569, + "rewards/rejected": -0.4953150749206543, + "step": 2550 + }, + { + "epoch": 0.68, + "learning_rate": 1.3832040268095589e-06, + "logits/chosen": -1.52444589138031, + "logits/rejected": -0.7717920541763306, + "logps/chosen": -666.0655517578125, + "logps/rejected": -1330.985595703125, + "loss": 0.0645, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1790360063314438, + "rewards/margins": 0.3166579306125641, + "rewards/rejected": -0.49569398164749146, + "step": 2560 + }, + { + "epoch": 0.69, + "learning_rate": 1.362432686615316e-06, + "logits/chosen": -1.511685848236084, + "logits/rejected": -1.2630524635314941, + "logps/chosen": -488.4208068847656, + "logps/rejected": -1233.045166015625, + "loss": 0.0768, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12492994964122772, + "rewards/margins": 0.30061617493629456, + "rewards/rejected": -0.4255460798740387, + "step": 2570 + }, + { + "epoch": 0.69, + "learning_rate": 1.3417599122003464e-06, + "logits/chosen": -1.624969720840454, + "logits/rejected": -0.8495733141899109, + "logps/chosen": -633.3519897460938, + "logps/rejected": -1253.824951171875, + "loss": 0.0632, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18349340558052063, + "rewards/margins": 0.2837851643562317, + "rewards/rejected": -0.4672785699367523, + "step": 2580 + }, + { + "epoch": 0.69, + "learning_rate": 1.3211874947800747e-06, + "logits/chosen": -1.4943301677703857, + "logits/rejected": -1.0596059560775757, + "logps/chosen": -581.8253784179688, + "logps/rejected": -1226.0418701171875, + "loss": 0.061, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1564798802137375, + "rewards/margins": 0.30731362104415894, + "rewards/rejected": -0.4637935161590576, + "step": 2590 + }, + { + "epoch": 0.69, + "learning_rate": 1.3007172168743854e-06, + "logits/chosen": -1.4282827377319336, + "logits/rejected": -1.0970618724822998, + "logps/chosen": -622.9385375976562, + "logps/rejected": -1321.9234619140625, + "loss": 0.0718, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.18767477571964264, + "rewards/margins": 0.32170000672340393, + "rewards/rejected": -0.5093748569488525, + "step": 2600 + }, + { + "epoch": 0.7, + "learning_rate": 1.280350852153168e-06, + "logits/chosen": -1.4260807037353516, + "logits/rejected": -0.8716268539428711, + "logps/chosen": -704.5125732421875, + "logps/rejected": -1340.787353515625, + "loss": 0.0737, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.22893400490283966, + "rewards/margins": 0.28585508465766907, + "rewards/rejected": -0.5147891044616699, + "step": 2610 + }, + { + "epoch": 0.7, + "learning_rate": 1.260090165282645e-06, + "logits/chosen": -1.5481798648834229, + "logits/rejected": -1.048405647277832, + "logps/chosen": -643.6202392578125, + "logps/rejected": -1242.826416015625, + "loss": 0.0806, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2178986817598343, + "rewards/margins": 0.2719099521636963, + "rewards/rejected": -0.4898086488246918, + "step": 2620 + }, + { + "epoch": 0.7, + "learning_rate": 1.2399369117724582e-06, + "logits/chosen": -1.650472640991211, + "logits/rejected": -0.9782799482345581, + "logps/chosen": -707.5630493164062, + "logps/rejected": -1298.345458984375, + "loss": 0.0753, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16964152455329895, + "rewards/margins": 0.2897980809211731, + "rewards/rejected": -0.45943960547447205, + "step": 2630 + }, + { + "epoch": 0.7, + "learning_rate": 1.2198928378235717e-06, + "logits/chosen": -1.6851444244384766, + "logits/rejected": -0.92662113904953, + "logps/chosen": -619.037841796875, + "logps/rejected": -1287.4412841796875, + "loss": 0.0522, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13626527786254883, + "rewards/margins": 0.3253551125526428, + "rewards/rejected": -0.46162039041519165, + "step": 2640 + }, + { + "epoch": 0.71, + "learning_rate": 1.1999596801769617e-06, + "logits/chosen": -1.8352587223052979, + "logits/rejected": -1.171582818031311, + "logps/chosen": -616.7615966796875, + "logps/rejected": -1335.4404296875, + "loss": 0.049, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1257292926311493, + "rewards/margins": 0.32309678196907043, + "rewards/rejected": -0.4488261342048645, + "step": 2650 + }, + { + "epoch": 0.71, + "learning_rate": 1.1801391659631423e-06, + "logits/chosen": -1.6399614810943604, + "logits/rejected": -1.0923852920532227, + "logps/chosen": -635.6760864257812, + "logps/rejected": -1325.152099609375, + "loss": 0.0653, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.16147668659687042, + "rewards/margins": 0.3237590193748474, + "rewards/rejected": -0.48523569107055664, + "step": 2660 + }, + { + "epoch": 0.71, + "learning_rate": 1.160433012552508e-06, + "logits/chosen": -1.5931923389434814, + "logits/rejected": -1.2261526584625244, + "logps/chosen": -603.7941284179688, + "logps/rejected": -1107.156982421875, + "loss": 0.1027, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.14636698365211487, + "rewards/margins": 0.22241589426994324, + "rewards/rejected": -0.3687829077243805, + "step": 2670 + }, + { + "epoch": 0.71, + "learning_rate": 1.1408429274065418e-06, + "logits/chosen": -1.481740117073059, + "logits/rejected": -1.1304103136062622, + "logps/chosen": -606.6870727539062, + "logps/rejected": -1188.147216796875, + "loss": 0.0763, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1330951750278473, + "rewards/margins": 0.2732751667499542, + "rewards/rejected": -0.4063703119754791, + "step": 2680 + }, + { + "epoch": 0.72, + "learning_rate": 1.1213706079298566e-06, + "logits/chosen": -1.6204341650009155, + "logits/rejected": -1.0541805028915405, + "logps/chosen": -554.864990234375, + "logps/rejected": -1167.746826171875, + "loss": 0.0617, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.11585495620965958, + "rewards/margins": 0.30027204751968384, + "rewards/rejected": -0.4161270260810852, + "step": 2690 + }, + { + "epoch": 0.72, + "learning_rate": 1.1020177413231334e-06, + "logits/chosen": -1.6089589595794678, + "logits/rejected": -1.0122666358947754, + "logps/chosen": -685.2811889648438, + "logps/rejected": -1211.408935546875, + "loss": 0.0745, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.16823996603488922, + "rewards/margins": 0.2715032994747162, + "rewards/rejected": -0.4397433400154114, + "step": 2700 + }, + { + "epoch": 0.72, + "learning_rate": 1.0827860044369226e-06, + "logits/chosen": -1.7601515054702759, + "logits/rejected": -1.007900595664978, + "logps/chosen": -704.5645751953125, + "logps/rejected": -1274.189453125, + "loss": 0.0737, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.17606690526008606, + "rewards/margins": 0.30847105383872986, + "rewards/rejected": -0.48453792929649353, + "step": 2710 + }, + { + "epoch": 0.73, + "learning_rate": 1.06367706362636e-06, + "logits/chosen": -1.5721882581710815, + "logits/rejected": -1.0723769664764404, + "logps/chosen": -665.5977783203125, + "logps/rejected": -1278.2569580078125, + "loss": 0.0799, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.18703334033489227, + "rewards/margins": 0.2840210795402527, + "rewards/rejected": -0.47105446457862854, + "step": 2720 + }, + { + "epoch": 0.73, + "learning_rate": 1.0446925746067768e-06, + "logits/chosen": -1.6236509084701538, + "logits/rejected": -1.0335582494735718, + "logps/chosen": -735.1541137695312, + "logps/rejected": -1341.9544677734375, + "loss": 0.0686, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.1772700995206833, + "rewards/margins": 0.31488290429115295, + "rewards/rejected": -0.49215301871299744, + "step": 2730 + }, + { + "epoch": 0.73, + "learning_rate": 1.0258341823102418e-06, + "logits/chosen": -1.6834526062011719, + "logits/rejected": -1.0599212646484375, + "logps/chosen": -650.5093994140625, + "logps/rejected": -1288.187744140625, + "loss": 0.0623, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.14554272592067719, + "rewards/margins": 0.3072082996368408, + "rewards/rejected": -0.4527510106563568, + "step": 2740 + }, + { + "epoch": 0.73, + "learning_rate": 1.0071035207430352e-06, + "logits/chosen": -1.7902591228485107, + "logits/rejected": -1.289333462715149, + "logps/chosen": -654.4462890625, + "logps/rejected": -1320.8380126953125, + "loss": 0.0673, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.16027647256851196, + "rewards/margins": 0.28841906785964966, + "rewards/rejected": -0.44869551062583923, + "step": 2750 + }, + { + "epoch": 0.74, + "learning_rate": 9.88502212844063e-07, + "logits/chosen": -1.528241515159607, + "logits/rejected": -1.140417218208313, + "logps/chosen": -584.2600708007812, + "logps/rejected": -1204.6785888671875, + "loss": 0.0822, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.12145284563302994, + "rewards/margins": 0.2952510714530945, + "rewards/rejected": -0.4167039394378662, + "step": 2760 + }, + { + "epoch": 0.74, + "learning_rate": 9.700318703442437e-07, + "logits/chosen": -1.6165825128555298, + "logits/rejected": -1.2629040479660034, + "logps/chosen": -692.536865234375, + "logps/rejected": -1356.869384765625, + "loss": 0.0861, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16812773048877716, + "rewards/margins": 0.2822396159172058, + "rewards/rejected": -0.4503673017024994, + "step": 2770 + }, + { + "epoch": 0.74, + "learning_rate": 9.516940936268504e-07, + "logits/chosen": -1.4886250495910645, + "logits/rejected": -0.8492704629898071, + "logps/chosen": -567.9114379882812, + "logps/rejected": -1223.29931640625, + "loss": 0.0907, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15036030113697052, + "rewards/margins": 0.3028646409511566, + "rewards/rejected": -0.45322495698928833, + "step": 2780 + }, + { + "epoch": 0.74, + "learning_rate": 9.334904715888496e-07, + "logits/chosen": -1.6458749771118164, + "logits/rejected": -1.2635785341262817, + "logps/chosen": -567.9539794921875, + "logps/rejected": -1301.9921875, + "loss": 0.0872, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1548917442560196, + "rewards/margins": 0.31059738993644714, + "rewards/rejected": -0.46548905968666077, + "step": 2790 + }, + { + "epoch": 0.75, + "learning_rate": 9.154225815032242e-07, + "logits/chosen": -1.5953037738800049, + "logits/rejected": -1.0339276790618896, + "logps/chosen": -624.7017822265625, + "logps/rejected": -1236.400634765625, + "loss": 0.0716, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.17240996658802032, + "rewards/margins": 0.28697705268859863, + "rewards/rejected": -0.45938700437545776, + "step": 2800 + }, + { + "epoch": 0.75, + "learning_rate": 8.974919888823164e-07, + "logits/chosen": -1.4576761722564697, + "logits/rejected": -1.1351208686828613, + "logps/chosen": -624.9636840820312, + "logps/rejected": -1355.374755859375, + "loss": 0.0514, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.19172172248363495, + "rewards/margins": 0.3108896315097809, + "rewards/rejected": -0.5026113390922546, + "step": 2810 + }, + { + "epoch": 0.75, + "learning_rate": 8.797002473421729e-07, + "logits/chosen": -1.5604914426803589, + "logits/rejected": -0.9569327235221863, + "logps/chosen": -538.5881958007812, + "logps/rejected": -1219.4466552734375, + "loss": 0.0683, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.15165778994560242, + "rewards/margins": 0.28516024351119995, + "rewards/rejected": -0.4368179738521576, + "step": 2820 + }, + { + "epoch": 0.75, + "learning_rate": 8.620488984679378e-07, + "logits/chosen": -1.852301836013794, + "logits/rejected": -1.1759949922561646, + "logps/chosen": -659.7407836914062, + "logps/rejected": -1306.58203125, + "loss": 0.066, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16961313784122467, + "rewards/margins": 0.3043574392795563, + "rewards/rejected": -0.47397056221961975, + "step": 2830 + }, + { + "epoch": 0.76, + "learning_rate": 8.445394716802754e-07, + "logits/chosen": -1.6400655508041382, + "logits/rejected": -1.1373300552368164, + "logps/chosen": -735.5645751953125, + "logps/rejected": -1331.341064453125, + "loss": 0.0866, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1878480315208435, + "rewards/margins": 0.27262082695961, + "rewards/rejected": -0.4604688286781311, + "step": 2840 + }, + { + "epoch": 0.76, + "learning_rate": 8.271734841028553e-07, + "logits/chosen": -1.4932782649993896, + "logits/rejected": -0.9225826263427734, + "logps/chosen": -594.31982421875, + "logps/rejected": -1214.7799072265625, + "loss": 0.0686, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1466977298259735, + "rewards/margins": 0.30178767442703247, + "rewards/rejected": -0.4484853744506836, + "step": 2850 + }, + { + "epoch": 0.76, + "learning_rate": 8.099524404308948e-07, + "logits/chosen": -1.5658118724822998, + "logits/rejected": -0.9562853574752808, + "logps/chosen": -645.6381225585938, + "logps/rejected": -1404.0450439453125, + "loss": 0.0439, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.15565729141235352, + "rewards/margins": 0.3485789895057678, + "rewards/rejected": -0.5042362809181213, + "step": 2860 + }, + { + "epoch": 0.77, + "learning_rate": 7.928778328007918e-07, + "logits/chosen": -1.867717981338501, + "logits/rejected": -0.9512616991996765, + "logps/chosen": -674.2090454101562, + "logps/rejected": -1350.9642333984375, + "loss": 0.0573, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.13902226090431213, + "rewards/margins": 0.3449219763278961, + "rewards/rejected": -0.48394423723220825, + "step": 2870 + }, + { + "epoch": 0.77, + "learning_rate": 7.759511406608255e-07, + "logits/chosen": -1.5874018669128418, + "logits/rejected": -1.1192893981933594, + "logps/chosen": -586.0441284179688, + "logps/rejected": -1302.760986328125, + "loss": 0.0788, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.13455680012702942, + "rewards/margins": 0.2815057635307312, + "rewards/rejected": -0.416062593460083, + "step": 2880 + }, + { + "epoch": 0.77, + "learning_rate": 7.591738306429769e-07, + "logits/chosen": -1.5606791973114014, + "logits/rejected": -1.0470914840698242, + "logps/chosen": -744.95263671875, + "logps/rejected": -1285.8818359375, + "loss": 0.0792, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.18279020488262177, + "rewards/margins": 0.2564167082309723, + "rewards/rejected": -0.43920689821243286, + "step": 2890 + }, + { + "epoch": 0.77, + "learning_rate": 7.425473564358457e-07, + "logits/chosen": -1.6006968021392822, + "logits/rejected": -1.0099936723709106, + "logps/chosen": -720.3036499023438, + "logps/rejected": -1374.542236328125, + "loss": 0.0453, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.18247629702091217, + "rewards/margins": 0.3243308663368225, + "rewards/rejected": -0.5068072080612183, + "step": 2900 + }, + { + "epoch": 0.78, + "learning_rate": 7.260731586586983e-07, + "logits/chosen": -1.6800225973129272, + "logits/rejected": -0.9249428510665894, + "logps/chosen": -472.9820251464844, + "logps/rejected": -1168.8466796875, + "loss": 0.0482, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1155017763376236, + "rewards/margins": 0.33621880412101746, + "rewards/rejected": -0.45172062516212463, + "step": 2910 + }, + { + "epoch": 0.78, + "learning_rate": 7.097526647366379e-07, + "logits/chosen": -1.7830703258514404, + "logits/rejected": -1.3318331241607666, + "logps/chosen": -588.818359375, + "logps/rejected": -1168.9097900390625, + "loss": 0.0765, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1334531605243683, + "rewards/margins": 0.28530946373939514, + "rewards/rejected": -0.41876259446144104, + "step": 2920 + }, + { + "epoch": 0.78, + "learning_rate": 6.935872887769299e-07, + "logits/chosen": -1.593977928161621, + "logits/rejected": -1.0323081016540527, + "logps/chosen": -722.8409423828125, + "logps/rejected": -1217.73681640625, + "loss": 0.0888, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1773270219564438, + "rewards/margins": 0.2691357731819153, + "rewards/rejected": -0.4464627802371979, + "step": 2930 + }, + { + "epoch": 0.78, + "learning_rate": 6.775784314464717e-07, + "logits/chosen": -1.6172115802764893, + "logits/rejected": -1.0769383907318115, + "logps/chosen": -590.12939453125, + "logps/rejected": -1277.718505859375, + "loss": 0.0699, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1619381308555603, + "rewards/margins": 0.30093225836753845, + "rewards/rejected": -0.46287041902542114, + "step": 2940 + }, + { + "epoch": 0.79, + "learning_rate": 6.617274798504286e-07, + "logits/chosen": -1.748478651046753, + "logits/rejected": -0.9003183245658875, + "logps/chosen": -748.8468017578125, + "logps/rejected": -1387.6318359375, + "loss": 0.0683, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.17095907032489777, + "rewards/margins": 0.3251902461051941, + "rewards/rejected": -0.49614930152893066, + "step": 2950 + }, + { + "epoch": 0.79, + "learning_rate": 6.460358074120518e-07, + "logits/chosen": -1.5600354671478271, + "logits/rejected": -1.1617937088012695, + "logps/chosen": -635.3890380859375, + "logps/rejected": -1242.238525390625, + "loss": 0.0767, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16152323782444, + "rewards/margins": 0.26770225167274475, + "rewards/rejected": -0.42922544479370117, + "step": 2960 + }, + { + "epoch": 0.79, + "learning_rate": 6.305047737536707e-07, + "logits/chosen": -1.6008501052856445, + "logits/rejected": -1.1396461725234985, + "logps/chosen": -604.7008666992188, + "logps/rejected": -1217.7623291015625, + "loss": 0.0703, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1483077108860016, + "rewards/margins": 0.29934391379356384, + "rewards/rejected": -0.44765162467956543, + "step": 2970 + }, + { + "epoch": 0.79, + "learning_rate": 6.151357245788917e-07, + "logits/chosen": -1.6701332330703735, + "logits/rejected": -0.9330341219902039, + "logps/chosen": -628.5372924804688, + "logps/rejected": -1414.3253173828125, + "loss": 0.0577, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.154875710606575, + "rewards/margins": 0.35905200242996216, + "rewards/rejected": -0.513927698135376, + "step": 2980 + }, + { + "epoch": 0.8, + "learning_rate": 5.999299915559956e-07, + "logits/chosen": -1.4873206615447998, + "logits/rejected": -0.9139396548271179, + "logps/chosen": -534.3379516601562, + "logps/rejected": -1303.0247802734375, + "loss": 0.0602, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1485694944858551, + "rewards/margins": 0.3279011845588684, + "rewards/rejected": -0.4764706492424011, + "step": 2990 + }, + { + "epoch": 0.8, + "learning_rate": 5.848888922025553e-07, + "logits/chosen": -1.5426976680755615, + "logits/rejected": -1.0285556316375732, + "logps/chosen": -576.2730102539062, + "logps/rejected": -1104.698974609375, + "loss": 0.0913, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1463908702135086, + "rewards/margins": 0.2579698860645294, + "rewards/rejected": -0.40436071157455444, + "step": 3000 + }, + { + "epoch": 0.8, + "learning_rate": 5.700137297712749e-07, + "logits/chosen": -1.5507534742355347, + "logits/rejected": -1.1646429300308228, + "logps/chosen": -678.5272216796875, + "logps/rejected": -1307.9720458984375, + "loss": 0.0558, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.19729594886302948, + "rewards/margins": 0.2910037338733673, + "rewards/rejected": -0.488299697637558, + "step": 3010 + }, + { + "epoch": 0.81, + "learning_rate": 5.553057931370729e-07, + "logits/chosen": -1.7067492008209229, + "logits/rejected": -1.3142516613006592, + "logps/chosen": -630.6482543945312, + "logps/rejected": -1377.3018798828125, + "loss": 0.053, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1730308085680008, + "rewards/margins": 0.33525392413139343, + "rewards/rejected": -0.5082847476005554, + "step": 3020 + }, + { + "epoch": 0.81, + "learning_rate": 5.407663566854008e-07, + "logits/chosen": -1.5402835607528687, + "logits/rejected": -0.9547150731086731, + "logps/chosen": -717.29833984375, + "logps/rejected": -1315.943115234375, + "loss": 0.0595, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.18772569298744202, + "rewards/margins": 0.29532501101493835, + "rewards/rejected": -0.48305076360702515, + "step": 3030 + }, + { + "epoch": 0.81, + "learning_rate": 5.263966802018275e-07, + "logits/chosen": -1.8520797491073608, + "logits/rejected": -1.0401822328567505, + "logps/chosen": -717.2581176757812, + "logps/rejected": -1253.8289794921875, + "loss": 0.0639, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.16455858945846558, + "rewards/margins": 0.29715651273727417, + "rewards/rejected": -0.4617151618003845, + "step": 3040 + }, + { + "epoch": 0.81, + "learning_rate": 5.121980087628802e-07, + "logits/chosen": -1.7092409133911133, + "logits/rejected": -1.2182010412216187, + "logps/chosen": -586.7152099609375, + "logps/rejected": -1214.601318359375, + "loss": 0.051, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.137488454580307, + "rewards/margins": 0.3142798840999603, + "rewards/rejected": -0.45176833868026733, + "step": 3050 + }, + { + "epoch": 0.82, + "learning_rate": 4.981715726281666e-07, + "logits/chosen": -1.5101208686828613, + "logits/rejected": -0.8277866244316101, + "logps/chosen": -569.005615234375, + "logps/rejected": -1231.107177734375, + "loss": 0.0616, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13659389317035675, + "rewards/margins": 0.32126671075820923, + "rewards/rejected": -0.45786052942276, + "step": 3060 + }, + { + "epoch": 0.82, + "learning_rate": 4.843185871337722e-07, + "logits/chosen": -1.4731261730194092, + "logits/rejected": -1.047473669052124, + "logps/chosen": -693.8050537109375, + "logps/rejected": -1432.60009765625, + "loss": 0.0881, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19450101256370544, + "rewards/margins": 0.3186071217060089, + "rewards/rejected": -0.5131081342697144, + "step": 3070 + }, + { + "epoch": 0.82, + "learning_rate": 4.706402525869633e-07, + "logits/chosen": -1.3473070859909058, + "logits/rejected": -1.2000024318695068, + "logps/chosen": -589.5697021484375, + "logps/rejected": -1317.3638916015625, + "loss": 0.072, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16462837159633636, + "rewards/margins": 0.2750544846057892, + "rewards/rejected": -0.43968287110328674, + "step": 3080 + }, + { + "epoch": 0.82, + "learning_rate": 4.5713775416217884e-07, + "logits/chosen": -1.6565711498260498, + "logits/rejected": -1.1259280443191528, + "logps/chosen": -551.4010009765625, + "logps/rejected": -1178.7850341796875, + "loss": 0.0716, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.13680130243301392, + "rewards/margins": 0.30534160137176514, + "rewards/rejected": -0.44214290380477905, + "step": 3090 + }, + { + "epoch": 0.83, + "learning_rate": 4.438122617983442e-07, + "logits/chosen": -1.5101524591445923, + "logits/rejected": -0.8424856066703796, + "logps/chosen": -591.2506103515625, + "logps/rejected": -1273.046142578125, + "loss": 0.0566, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.13511498272418976, + "rewards/margins": 0.33536994457244873, + "rewards/rejected": -0.4704849123954773, + "step": 3100 + }, + { + "epoch": 0.83, + "learning_rate": 4.3066493009749853e-07, + "logits/chosen": -1.7471688985824585, + "logits/rejected": -1.1047070026397705, + "logps/chosen": -767.1864013671875, + "logps/rejected": -1380.8409423828125, + "loss": 0.053, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.20385906100273132, + "rewards/margins": 0.27676570415496826, + "rewards/rejected": -0.4806247651576996, + "step": 3110 + }, + { + "epoch": 0.83, + "learning_rate": 4.1769689822475147e-07, + "logits/chosen": -1.4887802600860596, + "logits/rejected": -1.02260422706604, + "logps/chosen": -691.14990234375, + "logps/rejected": -1373.417724609375, + "loss": 0.0965, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.20843443274497986, + "rewards/margins": 0.2790736258029938, + "rewards/rejected": -0.4875081181526184, + "step": 3120 + }, + { + "epoch": 0.83, + "learning_rate": 4.049092898095816e-07, + "logits/chosen": -1.5980995893478394, + "logits/rejected": -1.023189902305603, + "logps/chosen": -595.4110107421875, + "logps/rejected": -1312.7945556640625, + "loss": 0.0653, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16371554136276245, + "rewards/margins": 0.2928038537502289, + "rewards/rejected": -0.4565194249153137, + "step": 3130 + }, + { + "epoch": 0.84, + "learning_rate": 3.9230321284847856e-07, + "logits/chosen": -1.5363506078720093, + "logits/rejected": -1.0504465103149414, + "logps/chosen": -489.7978515625, + "logps/rejected": -1119.5257568359375, + "loss": 0.0633, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11015214771032333, + "rewards/margins": 0.2800517678260803, + "rewards/rejected": -0.39020389318466187, + "step": 3140 + }, + { + "epoch": 0.84, + "learning_rate": 3.798797596089351e-07, + "logits/chosen": -1.6714694499969482, + "logits/rejected": -1.125106692314148, + "logps/chosen": -604.4114379882812, + "logps/rejected": -1297.808349609375, + "loss": 0.0516, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.12939824163913727, + "rewards/margins": 0.35474586486816406, + "rewards/rejected": -0.4841441214084625, + "step": 3150 + }, + { + "epoch": 0.84, + "learning_rate": 3.6764000653481263e-07, + "logits/chosen": -1.7083511352539062, + "logits/rejected": -1.0139703750610352, + "logps/chosen": -604.1871337890625, + "logps/rejected": -1156.0162353515625, + "loss": 0.0919, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1465051770210266, + "rewards/margins": 0.27671653032302856, + "rewards/rejected": -0.4232216775417328, + "step": 3160 + }, + { + "epoch": 0.85, + "learning_rate": 3.555850141530659e-07, + "logits/chosen": -1.7717567682266235, + "logits/rejected": -1.1824371814727783, + "logps/chosen": -566.564453125, + "logps/rejected": -1101.685302734375, + "loss": 0.0733, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.14669661223888397, + "rewards/margins": 0.26966187357902527, + "rewards/rejected": -0.41635847091674805, + "step": 3170 + }, + { + "epoch": 0.85, + "learning_rate": 3.4371582698185636e-07, + "logits/chosen": -1.4069181680679321, + "logits/rejected": -1.0164799690246582, + "logps/chosen": -516.7618408203125, + "logps/rejected": -1247.140380859375, + "loss": 0.0714, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1548655778169632, + "rewards/margins": 0.2954062819480896, + "rewards/rejected": -0.4502718448638916, + "step": 3180 + }, + { + "epoch": 0.85, + "learning_rate": 3.3203347344004737e-07, + "logits/chosen": -1.332287073135376, + "logits/rejected": -1.1523934602737427, + "logps/chosen": -562.6694946289062, + "logps/rejected": -1197.0635986328125, + "loss": 0.0921, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.19367225468158722, + "rewards/margins": 0.24201972782611847, + "rewards/rejected": -0.4356919825077057, + "step": 3190 + }, + { + "epoch": 0.85, + "learning_rate": 3.2053896575809426e-07, + "logits/chosen": -1.4262222051620483, + "logits/rejected": -1.026829719543457, + "logps/chosen": -569.9625854492188, + "logps/rejected": -1329.806884765625, + "loss": 0.0553, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.15987573564052582, + "rewards/margins": 0.3234714865684509, + "rewards/rejected": -0.48334717750549316, + "step": 3200 + }, + { + "epoch": 0.86, + "learning_rate": 3.092332998903416e-07, + "logits/chosen": -1.403305172920227, + "logits/rejected": -1.228492021560669, + "logps/chosen": -531.6251220703125, + "logps/rejected": -1268.7862548828125, + "loss": 0.0586, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.14155253767967224, + "rewards/margins": 0.29640236496925354, + "rewards/rejected": -0.4379549026489258, + "step": 3210 + }, + { + "epoch": 0.86, + "learning_rate": 2.981174554287239e-07, + "logits/chosen": -1.4185243844985962, + "logits/rejected": -1.2006936073303223, + "logps/chosen": -635.041259765625, + "logps/rejected": -1473.925537109375, + "loss": 0.0452, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.16828233003616333, + "rewards/margins": 0.3489801287651062, + "rewards/rejected": -0.5172623991966248, + "step": 3220 + }, + { + "epoch": 0.86, + "learning_rate": 2.871923955178918e-07, + "logits/chosen": -1.4334993362426758, + "logits/rejected": -0.9838508367538452, + "logps/chosen": -607.728271484375, + "logps/rejected": -1310.9564208984375, + "loss": 0.0531, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.18386729061603546, + "rewards/margins": 0.3068189322948456, + "rewards/rejected": -0.49068623781204224, + "step": 3230 + }, + { + "epoch": 0.86, + "learning_rate": 2.764590667717562e-07, + "logits/chosen": -1.6019694805145264, + "logits/rejected": -1.1419106721878052, + "logps/chosen": -588.9683837890625, + "logps/rejected": -1284.370849609375, + "loss": 0.0586, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.15091760456562042, + "rewards/margins": 0.30733171105384827, + "rewards/rejected": -0.4582493305206299, + "step": 3240 + }, + { + "epoch": 0.87, + "learning_rate": 2.6591839919146963e-07, + "logits/chosen": -1.6173986196517944, + "logits/rejected": -1.1335291862487793, + "logps/chosen": -654.0939331054688, + "logps/rejected": -1284.885986328125, + "loss": 0.0572, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.1872612088918686, + "rewards/margins": 0.3091946244239807, + "rewards/rejected": -0.4964558482170105, + "step": 3250 + }, + { + "epoch": 0.87, + "learning_rate": 2.555713060848433e-07, + "logits/chosen": -1.4421133995056152, + "logits/rejected": -1.1747627258300781, + "logps/chosen": -585.655517578125, + "logps/rejected": -1284.943115234375, + "loss": 0.0449, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1650674194097519, + "rewards/margins": 0.31287819147109985, + "rewards/rejected": -0.47794562578201294, + "step": 3260 + }, + { + "epoch": 0.87, + "learning_rate": 2.454186839872158e-07, + "logits/chosen": -1.7081034183502197, + "logits/rejected": -0.94395911693573, + "logps/chosen": -646.4754028320312, + "logps/rejected": -1342.545166015625, + "loss": 0.0709, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.18191269040107727, + "rewards/margins": 0.326556921005249, + "rewards/rejected": -0.5084696412086487, + "step": 3270 + }, + { + "epoch": 0.87, + "learning_rate": 2.3546141258376786e-07, + "logits/chosen": -1.3628816604614258, + "logits/rejected": -0.9335969686508179, + "logps/chosen": -603.8056640625, + "logps/rejected": -1245.411376953125, + "loss": 0.0594, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1723943054676056, + "rewards/margins": 0.3125647008419037, + "rewards/rejected": -0.48495903611183167, + "step": 3280 + }, + { + "epoch": 0.88, + "learning_rate": 2.257003546333042e-07, + "logits/chosen": -1.6881519556045532, + "logits/rejected": -1.116393804550171, + "logps/chosen": -680.4428100585938, + "logps/rejected": -1417.369384765625, + "loss": 0.0578, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21765998005867004, + "rewards/margins": 0.3092319667339325, + "rewards/rejected": -0.5268920063972473, + "step": 3290 + }, + { + "epoch": 0.88, + "learning_rate": 2.1613635589349756e-07, + "logits/chosen": -1.7809503078460693, + "logits/rejected": -1.235079050064087, + "logps/chosen": -521.1368408203125, + "logps/rejected": -1202.243408203125, + "loss": 0.0625, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13693402707576752, + "rewards/margins": 0.2993922233581543, + "rewards/rejected": -0.43632620573043823, + "step": 3300 + }, + { + "epoch": 0.88, + "learning_rate": 2.0677024504760752e-07, + "logits/chosen": -1.5252535343170166, + "logits/rejected": -1.2412452697753906, + "logps/chosen": -526.3180541992188, + "logps/rejected": -1315.9249267578125, + "loss": 0.0633, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13880392909049988, + "rewards/margins": 0.35248517990112305, + "rewards/rejected": -0.4912891387939453, + "step": 3310 + }, + { + "epoch": 0.89, + "learning_rate": 1.9760283363267684e-07, + "logits/chosen": -1.778957724571228, + "logits/rejected": -1.0381158590316772, + "logps/chosen": -641.0303955078125, + "logps/rejected": -1229.374267578125, + "loss": 0.0791, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.13701531291007996, + "rewards/margins": 0.3019945025444031, + "rewards/rejected": -0.43900981545448303, + "step": 3320 + }, + { + "epoch": 0.89, + "learning_rate": 1.8863491596921745e-07, + "logits/chosen": -1.4902263879776, + "logits/rejected": -0.8488144874572754, + "logps/chosen": -537.8624267578125, + "logps/rejected": -1138.487060546875, + "loss": 0.0651, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.12021678686141968, + "rewards/margins": 0.3042137622833252, + "rewards/rejected": -0.4244305491447449, + "step": 3330 + }, + { + "epoch": 0.89, + "learning_rate": 1.798672690923828e-07, + "logits/chosen": -1.604090929031372, + "logits/rejected": -1.0566972494125366, + "logps/chosen": -546.621826171875, + "logps/rejected": -1305.0101318359375, + "loss": 0.048, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1309491991996765, + "rewards/margins": 0.3338525891304016, + "rewards/rejected": -0.4648017883300781, + "step": 3340 + }, + { + "epoch": 0.89, + "learning_rate": 1.713006526846439e-07, + "logits/chosen": -1.6890567541122437, + "logits/rejected": -1.0597703456878662, + "logps/chosen": -638.17626953125, + "logps/rejected": -1403.3868408203125, + "loss": 0.0422, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.14803606271743774, + "rewards/margins": 0.36922168731689453, + "rewards/rejected": -0.517257809638977, + "step": 3350 + }, + { + "epoch": 0.9, + "learning_rate": 1.629358090099639e-07, + "logits/chosen": -1.772783637046814, + "logits/rejected": -1.145141363143921, + "logps/chosen": -645.2138671875, + "logps/rejected": -1247.31005859375, + "loss": 0.0995, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.16153082251548767, + "rewards/margins": 0.27567344903945923, + "rewards/rejected": -0.4372042715549469, + "step": 3360 + }, + { + "epoch": 0.9, + "learning_rate": 1.5477346284948292e-07, + "logits/chosen": -1.6137062311172485, + "logits/rejected": -1.2323498725891113, + "logps/chosen": -652.45068359375, + "logps/rejected": -1297.9696044921875, + "loss": 0.1, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.18204265832901, + "rewards/margins": 0.2792537212371826, + "rewards/rejected": -0.4612963795661926, + "step": 3370 + }, + { + "epoch": 0.9, + "learning_rate": 1.4681432143872133e-07, + "logits/chosen": -1.5417966842651367, + "logits/rejected": -0.92780601978302, + "logps/chosen": -655.7986450195312, + "logps/rejected": -1352.8719482421875, + "loss": 0.0452, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.1652592271566391, + "rewards/margins": 0.32993632555007935, + "rewards/rejected": -0.49519556760787964, + "step": 3380 + }, + { + "epoch": 0.9, + "learning_rate": 1.3905907440629752e-07, + "logits/chosen": -1.7428719997406006, + "logits/rejected": -1.234071969985962, + "logps/chosen": -683.9173583984375, + "logps/rejected": -1301.064208984375, + "loss": 0.0749, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1897701770067215, + "rewards/margins": 0.26576271653175354, + "rewards/rejected": -0.45553287863731384, + "step": 3390 + }, + { + "epoch": 0.91, + "learning_rate": 1.31508393714177e-07, + "logits/chosen": -1.6525071859359741, + "logits/rejected": -1.269195795059204, + "logps/chosen": -573.1322631835938, + "logps/rejected": -1224.781005859375, + "loss": 0.0717, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1454993188381195, + "rewards/margins": 0.29666373133659363, + "rewards/rejected": -0.44216299057006836, + "step": 3400 + }, + { + "epoch": 0.91, + "learning_rate": 1.241629335994471e-07, + "logits/chosen": -1.4832892417907715, + "logits/rejected": -0.7595690488815308, + "logps/chosen": -546.4700927734375, + "logps/rejected": -1105.534912109375, + "loss": 0.0961, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.12580379843711853, + "rewards/margins": 0.2655871510505676, + "rewards/rejected": -0.39139097929000854, + "step": 3410 + }, + { + "epoch": 0.91, + "learning_rate": 1.1702333051763271e-07, + "logits/chosen": -1.729821801185608, + "logits/rejected": -1.004158854484558, + "logps/chosen": -648.3139038085938, + "logps/rejected": -1181.37939453125, + "loss": 0.0907, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.17357492446899414, + "rewards/margins": 0.2692447900772095, + "rewards/rejected": -0.4428196847438812, + "step": 3420 + }, + { + "epoch": 0.91, + "learning_rate": 1.1009020308754587e-07, + "logits/chosen": -1.6045188903808594, + "logits/rejected": -1.1483290195465088, + "logps/chosen": -791.281494140625, + "logps/rejected": -1432.56591796875, + "loss": 0.0588, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.20312626659870148, + "rewards/margins": 0.30023887753486633, + "rewards/rejected": -0.503365159034729, + "step": 3430 + }, + { + "epoch": 0.92, + "learning_rate": 1.0336415203768962e-07, + "logits/chosen": -1.4623007774353027, + "logits/rejected": -0.9743086695671082, + "logps/chosen": -684.1002807617188, + "logps/rejected": -1355.83935546875, + "loss": 0.0854, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15205714106559753, + "rewards/margins": 0.3153363764286041, + "rewards/rejected": -0.4673934876918793, + "step": 3440 + }, + { + "epoch": 0.92, + "learning_rate": 9.684576015420277e-08, + "logits/chosen": -1.5630353689193726, + "logits/rejected": -1.1269890069961548, + "logps/chosen": -500.76934814453125, + "logps/rejected": -1174.6031494140625, + "loss": 0.0748, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.13523253798484802, + "rewards/margins": 0.2928512990474701, + "rewards/rejected": -0.4280838370323181, + "step": 3450 + }, + { + "epoch": 0.92, + "learning_rate": 9.053559223036746e-08, + "logits/chosen": -1.4134055376052856, + "logits/rejected": -0.9375116229057312, + "logps/chosen": -684.4464111328125, + "logps/rejected": -1184.0179443359375, + "loss": 0.097, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1784881353378296, + "rewards/margins": 0.25100329518318176, + "rewards/rejected": -0.42949143052101135, + "step": 3460 + }, + { + "epoch": 0.93, + "learning_rate": 8.44341950176683e-08, + "logits/chosen": -1.7058902978897095, + "logits/rejected": -1.1464042663574219, + "logps/chosen": -518.6105346679688, + "logps/rejected": -1139.312744140625, + "loss": 0.0706, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.11180339008569717, + "rewards/margins": 0.3046559691429138, + "rewards/rejected": -0.4164593815803528, + "step": 3470 + }, + { + "epoch": 0.93, + "learning_rate": 7.854209717842231e-08, + "logits/chosen": -1.7066657543182373, + "logits/rejected": -1.1052082777023315, + "logps/chosen": -634.6749267578125, + "logps/rejected": -1268.662353515625, + "loss": 0.0549, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1490454524755478, + "rewards/margins": 0.32269707322120667, + "rewards/rejected": -0.4717424809932709, + "step": 3480 + }, + { + "epoch": 0.93, + "learning_rate": 7.285980923996989e-08, + "logits/chosen": -1.583821415901184, + "logits/rejected": -0.9543322324752808, + "logps/chosen": -620.9078979492188, + "logps/rejected": -1256.222900390625, + "loss": 0.0499, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14574182033538818, + "rewards/margins": 0.3196600079536438, + "rewards/rejected": -0.465401828289032, + "step": 3490 + }, + { + "epoch": 0.93, + "learning_rate": 6.738782355044048e-08, + "logits/chosen": -1.5047067403793335, + "logits/rejected": -1.0737859010696411, + "logps/chosen": -520.77294921875, + "logps/rejected": -1155.83740234375, + "loss": 0.0764, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1430043876171112, + "rewards/margins": 0.28466781973838806, + "rewards/rejected": -0.42767223715782166, + "step": 3500 + }, + { + "epoch": 0.94, + "learning_rate": 6.212661423609184e-08, + "logits/chosen": -1.3737636804580688, + "logits/rejected": -1.1867458820343018, + "logps/chosen": -623.9158935546875, + "logps/rejected": -1239.9605712890625, + "loss": 0.078, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1589566022157669, + "rewards/margins": 0.27907633781433105, + "rewards/rejected": -0.43803295493125916, + "step": 3510 + }, + { + "epoch": 0.94, + "learning_rate": 5.707663716023021e-08, + "logits/chosen": -1.5748332738876343, + "logits/rejected": -1.063622236251831, + "logps/chosen": -616.1275634765625, + "logps/rejected": -1155.8143310546875, + "loss": 0.0835, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.15737947821617126, + "rewards/margins": 0.2580162584781647, + "rewards/rejected": -0.4153957962989807, + "step": 3520 + }, + { + "epoch": 0.94, + "learning_rate": 5.22383298837098e-08, + "logits/chosen": -1.6702196598052979, + "logits/rejected": -0.9411935806274414, + "logps/chosen": -713.735107421875, + "logps/rejected": -1223.2811279296875, + "loss": 0.0824, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.17610540986061096, + "rewards/margins": 0.29836469888687134, + "rewards/rejected": -0.4744700491428375, + "step": 3530 + }, + { + "epoch": 0.94, + "learning_rate": 4.761211162702117e-08, + "logits/chosen": -1.500201940536499, + "logits/rejected": -1.2098219394683838, + "logps/chosen": -633.7230224609375, + "logps/rejected": -1120.678466796875, + "loss": 0.0984, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.17905206978321075, + "rewards/margins": 0.2320644110441208, + "rewards/rejected": -0.41111645102500916, + "step": 3540 + }, + { + "epoch": 0.95, + "learning_rate": 4.319838323396691e-08, + "logits/chosen": -1.6483008861541748, + "logits/rejected": -1.2190577983856201, + "logps/chosen": -614.4102783203125, + "logps/rejected": -1260.265625, + "loss": 0.0958, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16128340363502502, + "rewards/margins": 0.26488471031188965, + "rewards/rejected": -0.4261681139469147, + "step": 3550 + }, + { + "epoch": 0.95, + "learning_rate": 3.8997527136930004e-08, + "logits/chosen": -1.429495930671692, + "logits/rejected": -1.0348641872406006, + "logps/chosen": -632.0252685546875, + "logps/rejected": -1319.2584228515625, + "loss": 0.0798, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17518481612205505, + "rewards/margins": 0.2713126540184021, + "rewards/rejected": -0.44649749994277954, + "step": 3560 + }, + { + "epoch": 0.95, + "learning_rate": 3.5009907323737826e-08, + "logits/chosen": -1.551414132118225, + "logits/rejected": -1.2644479274749756, + "logps/chosen": -570.0910034179688, + "logps/rejected": -1210.5382080078125, + "loss": 0.0803, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1442852020263672, + "rewards/margins": 0.26128098368644714, + "rewards/rejected": -0.4055662155151367, + "step": 3570 + }, + { + "epoch": 0.95, + "learning_rate": 3.1235869306123766e-08, + "logits/chosen": -1.8781211376190186, + "logits/rejected": -1.116159439086914, + "logps/chosen": -654.27685546875, + "logps/rejected": -1360.783447265625, + "loss": 0.0586, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.15850645303726196, + "rewards/margins": 0.32171380519866943, + "rewards/rejected": -0.480220228433609, + "step": 3580 + }, + { + "epoch": 0.96, + "learning_rate": 2.767574008979007e-08, + "logits/chosen": -1.7552858591079712, + "logits/rejected": -1.1240081787109375, + "logps/chosen": -649.2583618164062, + "logps/rejected": -1285.60986328125, + "loss": 0.0746, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1459151804447174, + "rewards/margins": 0.29276043176651, + "rewards/rejected": -0.43867558240890503, + "step": 3590 + }, + { + "epoch": 0.96, + "learning_rate": 2.4329828146074096e-08, + "logits/chosen": -1.680267333984375, + "logits/rejected": -0.990991473197937, + "logps/chosen": -702.8885498046875, + "logps/rejected": -1430.7728271484375, + "loss": 0.0422, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.16736450791358948, + "rewards/margins": 0.3372777998447418, + "rewards/rejected": -0.5046423673629761, + "step": 3600 + }, + { + "epoch": 0.96, + "learning_rate": 2.1198423385220822e-08, + "logits/chosen": -1.5231233835220337, + "logits/rejected": -0.983518123626709, + "logps/chosen": -574.9891967773438, + "logps/rejected": -1192.521728515625, + "loss": 0.091, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.13129422068595886, + "rewards/margins": 0.2925337255001068, + "rewards/rejected": -0.42382797598838806, + "step": 3610 + }, + { + "epoch": 0.97, + "learning_rate": 1.82817971312621e-08, + "logits/chosen": -1.689100980758667, + "logits/rejected": -1.1573827266693115, + "logps/chosen": -602.2514038085938, + "logps/rejected": -1330.6248779296875, + "loss": 0.0539, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1489819586277008, + "rewards/margins": 0.32134801149368286, + "rewards/rejected": -0.47032999992370605, + "step": 3620 + }, + { + "epoch": 0.97, + "learning_rate": 1.5580202098509078e-08, + "logits/chosen": -1.7418603897094727, + "logits/rejected": -1.2287579774856567, + "logps/chosen": -539.5288696289062, + "logps/rejected": -1092.644775390625, + "loss": 0.084, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1297844648361206, + "rewards/margins": 0.25757455825805664, + "rewards/rejected": -0.38735905289649963, + "step": 3630 + }, + { + "epoch": 0.97, + "learning_rate": 1.3093872369654148e-08, + "logits/chosen": -1.6315152645111084, + "logits/rejected": -0.7913056015968323, + "logps/chosen": -566.9249267578125, + "logps/rejected": -1232.651611328125, + "loss": 0.0596, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15806104242801666, + "rewards/margins": 0.31945645809173584, + "rewards/rejected": -0.4775174558162689, + "step": 3640 + }, + { + "epoch": 0.97, + "learning_rate": 1.0823023375489128e-08, + "logits/chosen": -1.8176482915878296, + "logits/rejected": -1.307217001914978, + "logps/chosen": -596.6251220703125, + "logps/rejected": -1259.1907958984375, + "loss": 0.0741, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13308800756931305, + "rewards/margins": 0.29899168014526367, + "rewards/rejected": -0.4320797026157379, + "step": 3650 + }, + { + "epoch": 0.98, + "learning_rate": 8.767851876239075e-09, + "logits/chosen": -1.3138879537582397, + "logits/rejected": -1.0358647108078003, + "logps/chosen": -606.9632568359375, + "logps/rejected": -1166.726806640625, + "loss": 0.126, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17711606621742249, + "rewards/margins": 0.24026331305503845, + "rewards/rejected": -0.41737937927246094, + "step": 3660 + }, + { + "epoch": 0.98, + "learning_rate": 6.9285359445145366e-09, + "logits/chosen": -1.6356595754623413, + "logits/rejected": -1.0119235515594482, + "logps/chosen": -606.1302490234375, + "logps/rejected": -1273.985595703125, + "loss": 0.0883, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15847013890743256, + "rewards/margins": 0.29371243715286255, + "rewards/rejected": -0.4521825909614563, + "step": 3670 + }, + { + "epoch": 0.98, + "learning_rate": 5.305234949880001e-09, + "logits/chosen": -1.659597396850586, + "logits/rejected": -1.0647908449172974, + "logps/chosen": -593.581787109375, + "logps/rejected": -1240.505615234375, + "loss": 0.0669, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15421947836875916, + "rewards/margins": 0.2849588096141815, + "rewards/rejected": -0.4391782879829407, + "step": 3680 + }, + { + "epoch": 0.98, + "learning_rate": 3.8980895450474455e-09, + "logits/chosen": -1.4947640895843506, + "logits/rejected": -1.1128860712051392, + "logps/chosen": -551.9004516601562, + "logps/rejected": -1167.104736328125, + "loss": 0.053, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.13484862446784973, + "rewards/margins": 0.2813241481781006, + "rewards/rejected": -0.4161728024482727, + "step": 3690 + }, + { + "epoch": 0.99, + "learning_rate": 2.7072216536885855e-09, + "logits/chosen": -1.6360046863555908, + "logits/rejected": -1.196569800376892, + "logps/chosen": -659.7826538085938, + "logps/rejected": -1305.6195068359375, + "loss": 0.0578, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17655882239341736, + "rewards/margins": 0.3209526538848877, + "rewards/rejected": -0.49751147627830505, + "step": 3700 + }, + { + "epoch": 0.99, + "learning_rate": 1.7327344598702667e-09, + "logits/chosen": -1.4936860799789429, + "logits/rejected": -1.0485936403274536, + "logps/chosen": -584.678955078125, + "logps/rejected": -1314.91650390625, + "loss": 0.071, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.14625641703605652, + "rewards/margins": 0.3208056390285492, + "rewards/rejected": -0.4670620858669281, + "step": 3710 + }, + { + "epoch": 0.99, + "learning_rate": 9.747123991141193e-10, + "logits/chosen": -1.4201769828796387, + "logits/rejected": -1.045179009437561, + "logps/chosen": -620.8831787109375, + "logps/rejected": -1206.128662109375, + "loss": 0.0691, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15646150708198547, + "rewards/margins": 0.29528263211250305, + "rewards/rejected": -0.4517441391944885, + "step": 3720 + }, + { + "epoch": 0.99, + "learning_rate": 4.332211510807427e-10, + "logits/chosen": -1.530029296875, + "logits/rejected": -1.3592134714126587, + "logps/chosen": -523.2725219726562, + "logps/rejected": -1284.657958984375, + "loss": 0.0589, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.13241766393184662, + "rewards/margins": 0.30461427569389343, + "rewards/rejected": -0.43703192472457886, + "step": 3730 + }, + { + "epoch": 1.0, + "learning_rate": 1.0830763387897902e-10, + "logits/chosen": -1.5114389657974243, + "logits/rejected": -0.964741587638855, + "logps/chosen": -626.4848022460938, + "logps/rejected": -1381.5963134765625, + "loss": 0.0456, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.16040828824043274, + "rewards/margins": 0.3570694923400879, + "rewards/rejected": -0.517477810382843, + "step": 3740 + }, + { + "epoch": 1.0, + "learning_rate": 0.0, + "logits/chosen": -1.877239465713501, + "logits/rejected": -1.3499794006347656, + "logps/chosen": -648.46875, + "logps/rejected": -1231.8519287109375, + "loss": 0.0717, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.18231454491615295, + "rewards/margins": 0.2813524603843689, + "rewards/rejected": -0.46366700530052185, + "step": 3750 + }, + { + "epoch": 1.0, + "step": 3750, + "total_flos": 0.0, + "train_loss": 0.0800992045879364, + "train_runtime": 15706.9626, + "train_samples_per_second": 0.955, + "train_steps_per_second": 0.239 + } + ], + "logging_steps": 10, + "max_steps": 3750, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}