diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5584 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 300, + "global_step": 3821, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 1.3054830287206268e-08, + "logits/chosen": -2.7604618072509766, + "logits/rejected": -2.686812162399292, + "logps/chosen": -516.73779296875, + "logps/rejected": -458.60467529296875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 1.3054830287206266e-07, + "logits/chosen": -2.4880800247192383, + "logits/rejected": -2.4930832386016846, + "logps/chosen": -338.7858581542969, + "logps/rejected": -404.5611572265625, + "loss": 0.6929, + "rewards/accuracies": 0.4027777910232544, + "rewards/chosen": 0.00024087271594908088, + "rewards/margins": 0.0006852700607851148, + "rewards/rejected": -0.00044439738849177957, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 2.610966057441253e-07, + "logits/chosen": -2.475435733795166, + "logits/rejected": -2.4197583198547363, + "logps/chosen": -327.35919189453125, + "logps/rejected": -443.83868408203125, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.7859845178900287e-05, + "rewards/margins": 1.1227629329368938e-05, + "rewards/rejected": -4.9087520892499015e-05, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 3.9164490861618804e-07, + "logits/chosen": -2.5618598461151123, + "logits/rejected": -2.5939595699310303, + "logps/chosen": -348.56982421875, + "logps/rejected": -416.8001403808594, + "loss": 0.692, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.00023523520212620497, + "rewards/margins": 0.0015929860528558493, + "rewards/rejected": -0.0013577509671449661, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 5.221932114882506e-07, + "logits/chosen": -2.5246434211730957, + "logits/rejected": -2.4987733364105225, + "logps/chosen": -376.7454528808594, + "logps/rejected": -427.66729736328125, + "loss": 0.692, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0012155056465417147, + "rewards/margins": 0.0037051704712212086, + "rewards/rejected": -0.00492067588493228, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 6.527415143603135e-07, + "logits/chosen": -2.4959325790405273, + "logits/rejected": -2.4452037811279297, + "logps/chosen": -290.552001953125, + "logps/rejected": -383.3431701660156, + "loss": 0.69, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0027376641519367695, + "rewards/margins": 0.005442826543003321, + "rewards/rejected": -0.008180489763617516, + "step": 50 + }, + { + "epoch": 0.02, + "learning_rate": 7.832898172323761e-07, + "logits/chosen": -2.4295783042907715, + "logits/rejected": -2.3901355266571045, + "logps/chosen": -377.3544006347656, + "logps/rejected": -410.72991943359375, + "loss": 0.6909, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.006509931292384863, + "rewards/margins": 0.004867006093263626, + "rewards/rejected": -0.011376937851309776, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 9.138381201044387e-07, + "logits/chosen": -2.310166835784912, + "logits/rejected": -2.279524803161621, + "logps/chosen": -279.5904846191406, + "logps/rejected": -370.0677795410156, + "loss": 0.6889, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.007495372090488672, + "rewards/margins": 0.007953675463795662, + "rewards/rejected": -0.015449047088623047, + "step": 70 + }, + { + "epoch": 0.02, + "learning_rate": 1.0443864229765013e-06, + "logits/chosen": -2.36948561668396, + "logits/rejected": -2.3835222721099854, + "logps/chosen": -342.13653564453125, + "logps/rejected": -447.1036682128906, + "loss": 0.6878, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.010767060332000256, + "rewards/margins": 0.013605493120849133, + "rewards/rejected": -0.024372553452849388, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 1.1749347258485642e-06, + "logits/chosen": -2.472949266433716, + "logits/rejected": -2.3902525901794434, + "logps/chosen": -325.2154541015625, + "logps/rejected": -401.51751708984375, + "loss": 0.6844, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.012460511177778244, + "rewards/margins": 0.01636466197669506, + "rewards/rejected": -0.028825175017118454, + "step": 90 + }, + { + "epoch": 0.03, + "learning_rate": 1.305483028720627e-06, + "logits/chosen": -2.5286309719085693, + "logits/rejected": -2.5337207317352295, + "logps/chosen": -365.7882080078125, + "logps/rejected": -409.24261474609375, + "loss": 0.6836, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.017355522140860558, + "rewards/margins": 0.019160564988851547, + "rewards/rejected": -0.036516088992357254, + "step": 100 + }, + { + "epoch": 0.03, + "learning_rate": 1.4360313315926894e-06, + "logits/chosen": -2.485241651535034, + "logits/rejected": -2.473548412322998, + "logps/chosen": -337.1002197265625, + "logps/rejected": -444.09832763671875, + "loss": 0.6793, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.023945586755871773, + "rewards/margins": 0.02508346177637577, + "rewards/rejected": -0.049029044806957245, + "step": 110 + }, + { + "epoch": 0.03, + "learning_rate": 1.5665796344647521e-06, + "logits/chosen": -2.437514066696167, + "logits/rejected": -2.439671516418457, + "logps/chosen": -343.27777099609375, + "logps/rejected": -444.11639404296875, + "loss": 0.6747, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.028381770476698875, + "rewards/margins": 0.039804860949516296, + "rewards/rejected": -0.06818662583827972, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 1.6971279373368146e-06, + "logits/chosen": -2.4196202754974365, + "logits/rejected": -2.329251527786255, + "logps/chosen": -380.3607482910156, + "logps/rejected": -435.93896484375, + "loss": 0.6724, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.03702790290117264, + "rewards/margins": 0.049176327884197235, + "rewards/rejected": -0.08620421588420868, + "step": 130 + }, + { + "epoch": 0.04, + "learning_rate": 1.8276762402088774e-06, + "logits/chosen": -2.436796188354492, + "logits/rejected": -2.4008259773254395, + "logps/chosen": -364.23687744140625, + "logps/rejected": -456.6722106933594, + "loss": 0.6653, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.04911624267697334, + "rewards/margins": 0.06083670258522034, + "rewards/rejected": -0.10995294898748398, + "step": 140 + }, + { + "epoch": 0.04, + "learning_rate": 1.9582245430809403e-06, + "logits/chosen": -2.465663433074951, + "logits/rejected": -2.4756152629852295, + "logps/chosen": -344.59808349609375, + "logps/rejected": -450.5936584472656, + "loss": 0.6615, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.07244648039340973, + "rewards/margins": 0.07592395693063736, + "rewards/rejected": -0.1483704298734665, + "step": 150 + }, + { + "epoch": 0.04, + "learning_rate": 2.0887728459530026e-06, + "logits/chosen": -2.365227222442627, + "logits/rejected": -2.3483099937438965, + "logps/chosen": -365.65509033203125, + "logps/rejected": -459.9405822753906, + "loss": 0.6494, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.088950976729393, + "rewards/margins": 0.0978541225194931, + "rewards/rejected": -0.1868050992488861, + "step": 160 + }, + { + "epoch": 0.04, + "learning_rate": 2.2193211488250653e-06, + "logits/chosen": -2.5415358543395996, + "logits/rejected": -2.4844305515289307, + "logps/chosen": -417.45013427734375, + "logps/rejected": -502.04888916015625, + "loss": 0.6513, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.12002843618392944, + "rewards/margins": 0.09641442447900772, + "rewards/rejected": -0.21644285321235657, + "step": 170 + }, + { + "epoch": 0.05, + "learning_rate": 2.3498694516971284e-06, + "logits/chosen": -2.373419761657715, + "logits/rejected": -2.328564167022705, + "logps/chosen": -308.98297119140625, + "logps/rejected": -423.2518005371094, + "loss": 0.6334, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.10061755031347275, + "rewards/margins": 0.14423246681690216, + "rewards/rejected": -0.2448500096797943, + "step": 180 + }, + { + "epoch": 0.05, + "learning_rate": 2.4804177545691907e-06, + "logits/chosen": -2.4381210803985596, + "logits/rejected": -2.4739632606506348, + "logps/chosen": -366.0082092285156, + "logps/rejected": -444.33233642578125, + "loss": 0.6237, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.15544722974300385, + "rewards/margins": 0.15099851787090302, + "rewards/rejected": -0.30644577741622925, + "step": 190 + }, + { + "epoch": 0.05, + "learning_rate": 2.610966057441254e-06, + "logits/chosen": -2.3546931743621826, + "logits/rejected": -2.3614845275878906, + "logps/chosen": -413.4657287597656, + "logps/rejected": -487.9469299316406, + "loss": 0.6049, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.21883895993232727, + "rewards/margins": 0.22747401893138885, + "rewards/rejected": -0.44631296396255493, + "step": 200 + }, + { + "epoch": 0.05, + "learning_rate": 2.741514360313316e-06, + "logits/chosen": -2.3287081718444824, + "logits/rejected": -2.2180962562561035, + "logps/chosen": -397.4319763183594, + "logps/rejected": -475.10614013671875, + "loss": 0.6212, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2562285363674164, + "rewards/margins": 0.17265436053276062, + "rewards/rejected": -0.4288829267024994, + "step": 210 + }, + { + "epoch": 0.06, + "learning_rate": 2.872062663185379e-06, + "logits/chosen": -2.3937861919403076, + "logits/rejected": -2.323655366897583, + "logps/chosen": -393.7010498046875, + "logps/rejected": -540.32958984375, + "loss": 0.5708, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.31672877073287964, + "rewards/margins": 0.32215583324432373, + "rewards/rejected": -0.6388846039772034, + "step": 220 + }, + { + "epoch": 0.06, + "learning_rate": 3.0026109660574416e-06, + "logits/chosen": -2.2199885845184326, + "logits/rejected": -2.1488921642303467, + "logps/chosen": -362.84710693359375, + "logps/rejected": -462.92755126953125, + "loss": 0.57, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.4027708172798157, + "rewards/margins": 0.28824615478515625, + "rewards/rejected": -0.6910169124603271, + "step": 230 + }, + { + "epoch": 0.06, + "learning_rate": 3.1331592689295043e-06, + "logits/chosen": -2.1226742267608643, + "logits/rejected": -2.128727674484253, + "logps/chosen": -443.38226318359375, + "logps/rejected": -544.7005615234375, + "loss": 0.5704, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5918055176734924, + "rewards/margins": 0.33963826298713684, + "rewards/rejected": -0.9314438104629517, + "step": 240 + }, + { + "epoch": 0.07, + "learning_rate": 3.263707571801567e-06, + "logits/chosen": -2.2666714191436768, + "logits/rejected": -2.192073106765747, + "logps/chosen": -407.093505859375, + "logps/rejected": -539.2747802734375, + "loss": 0.53, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.6421019434928894, + "rewards/margins": 0.4279584288597107, + "rewards/rejected": -1.0700603723526, + "step": 250 + }, + { + "epoch": 0.07, + "learning_rate": 3.3942558746736293e-06, + "logits/chosen": -2.1499412059783936, + "logits/rejected": -2.0672426223754883, + "logps/chosen": -427.7845764160156, + "logps/rejected": -526.4454956054688, + "loss": 0.5201, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7411731481552124, + "rewards/margins": 0.33993035554885864, + "rewards/rejected": -1.0811034440994263, + "step": 260 + }, + { + "epoch": 0.07, + "learning_rate": 3.524804177545692e-06, + "logits/chosen": -2.0998997688293457, + "logits/rejected": -2.022254467010498, + "logps/chosen": -459.3646545410156, + "logps/rejected": -625.1860961914062, + "loss": 0.5132, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.9105283617973328, + "rewards/margins": 0.5623170137405396, + "rewards/rejected": -1.472845435142517, + "step": 270 + }, + { + "epoch": 0.07, + "learning_rate": 3.6553524804177547e-06, + "logits/chosen": -2.058182716369629, + "logits/rejected": -1.990330457687378, + "logps/chosen": -422.2322692871094, + "logps/rejected": -509.48504638671875, + "loss": 0.5447, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8607247471809387, + "rewards/margins": 0.5177969932556152, + "rewards/rejected": -1.3785216808319092, + "step": 280 + }, + { + "epoch": 0.08, + "learning_rate": 3.7859007832898174e-06, + "logits/chosen": -1.908735990524292, + "logits/rejected": -1.7987762689590454, + "logps/chosen": -532.5482788085938, + "logps/rejected": -675.1096801757812, + "loss": 0.4906, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1365267038345337, + "rewards/margins": 0.7340434789657593, + "rewards/rejected": -1.870570421218872, + "step": 290 + }, + { + "epoch": 0.08, + "learning_rate": 3.9164490861618806e-06, + "logits/chosen": -1.999489188194275, + "logits/rejected": -1.8826462030410767, + "logps/chosen": -510.0675354003906, + "logps/rejected": -654.4682006835938, + "loss": 0.4906, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1803276538848877, + "rewards/margins": 0.692645788192749, + "rewards/rejected": -1.8729734420776367, + "step": 300 + }, + { + "epoch": 0.08, + "eval_logits/chosen": -1.7535996437072754, + "eval_logits/rejected": -1.6233811378479004, + "eval_logps/chosen": -473.8013610839844, + "eval_logps/rejected": -603.2532958984375, + "eval_loss": 0.5339562892913818, + "eval_rewards/accuracies": 0.7310000061988831, + "eval_rewards/chosen": -1.181405782699585, + "eval_rewards/margins": 0.661102831363678, + "eval_rewards/rejected": -1.8425085544586182, + "eval_runtime": 1389.4464, + "eval_samples_per_second": 1.439, + "eval_steps_per_second": 0.36, + "step": 300 + }, + { + "epoch": 0.08, + "learning_rate": 4.046997389033943e-06, + "logits/chosen": -1.8269140720367432, + "logits/rejected": -1.690843939781189, + "logps/chosen": -498.0406799316406, + "logps/rejected": -633.5381469726562, + "loss": 0.5566, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2140557765960693, + "rewards/margins": 0.597601592540741, + "rewards/rejected": -1.811657190322876, + "step": 310 + }, + { + "epoch": 0.08, + "learning_rate": 4.177545691906005e-06, + "logits/chosen": -1.767525315284729, + "logits/rejected": -1.5722942352294922, + "logps/chosen": -499.41839599609375, + "logps/rejected": -658.9512329101562, + "loss": 0.5073, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2438334226608276, + "rewards/margins": 0.7738653421401978, + "rewards/rejected": -2.0176987648010254, + "step": 320 + }, + { + "epoch": 0.09, + "learning_rate": 4.308093994778068e-06, + "logits/chosen": -1.8579235076904297, + "logits/rejected": -1.7731454372406006, + "logps/chosen": -443.87689208984375, + "logps/rejected": -572.9539184570312, + "loss": 0.563, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1261751651763916, + "rewards/margins": 0.6167136430740356, + "rewards/rejected": -1.7428886890411377, + "step": 330 + }, + { + "epoch": 0.09, + "learning_rate": 4.4386422976501306e-06, + "logits/chosen": -1.7547829151153564, + "logits/rejected": -1.6789696216583252, + "logps/chosen": -377.6453552246094, + "logps/rejected": -584.998779296875, + "loss": 0.4792, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9642307162284851, + "rewards/margins": 0.8439178466796875, + "rewards/rejected": -1.8081486225128174, + "step": 340 + }, + { + "epoch": 0.09, + "learning_rate": 4.569190600522193e-06, + "logits/chosen": -1.6814041137695312, + "logits/rejected": -1.4768749475479126, + "logps/chosen": -506.594482421875, + "logps/rejected": -591.21142578125, + "loss": 0.5415, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2887766361236572, + "rewards/margins": 0.5483629107475281, + "rewards/rejected": -1.8371394872665405, + "step": 350 + }, + { + "epoch": 0.09, + "learning_rate": 4.699738903394257e-06, + "logits/chosen": -1.6767174005508423, + "logits/rejected": -1.5448577404022217, + "logps/chosen": -424.47357177734375, + "logps/rejected": -592.1474609375, + "loss": 0.4852, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.9117814302444458, + "rewards/margins": 0.8897954225540161, + "rewards/rejected": -1.801576852798462, + "step": 360 + }, + { + "epoch": 0.1, + "learning_rate": 4.8302872062663196e-06, + "logits/chosen": -1.7710018157958984, + "logits/rejected": -1.6090304851531982, + "logps/chosen": -496.84796142578125, + "logps/rejected": -633.0993041992188, + "loss": 0.5396, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.243511438369751, + "rewards/margins": 0.702240526676178, + "rewards/rejected": -1.9457519054412842, + "step": 370 + }, + { + "epoch": 0.1, + "learning_rate": 4.9608355091383814e-06, + "logits/chosen": -1.4778488874435425, + "logits/rejected": -1.3310272693634033, + "logps/chosen": -402.7732849121094, + "logps/rejected": -492.5380859375, + "loss": 0.5017, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.9786807298660278, + "rewards/margins": 0.623943567276001, + "rewards/rejected": -1.6026241779327393, + "step": 380 + }, + { + "epoch": 0.1, + "learning_rate": 4.9999488562447675e-06, + "logits/chosen": -1.40001380443573, + "logits/rejected": -1.2227522134780884, + "logps/chosen": -532.0943603515625, + "logps/rejected": -664.1284790039062, + "loss": 0.6002, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.533832311630249, + "rewards/margins": 0.5622987747192383, + "rewards/rejected": -2.0961310863494873, + "step": 390 + }, + { + "epoch": 0.1, + "learning_rate": 4.999698361256577e-06, + "logits/chosen": -1.461018443107605, + "logits/rejected": -1.1940746307373047, + "logps/chosen": -509.679443359375, + "logps/rejected": -636.8277587890625, + "loss": 0.5079, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1675481796264648, + "rewards/margins": 0.7109408974647522, + "rewards/rejected": -1.8784888982772827, + "step": 400 + }, + { + "epoch": 0.11, + "learning_rate": 4.999239142174581e-06, + "logits/chosen": -1.3779613971710205, + "logits/rejected": -1.1279867887496948, + "logps/chosen": -455.8907165527344, + "logps/rejected": -604.815185546875, + "loss": 0.4635, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2613023519515991, + "rewards/margins": 0.855317234992981, + "rewards/rejected": -2.11661958694458, + "step": 410 + }, + { + "epoch": 0.11, + "learning_rate": 4.99857123734344e-06, + "logits/chosen": -1.3608977794647217, + "logits/rejected": -0.901921272277832, + "logps/chosen": -503.4825134277344, + "logps/rejected": -629.4042358398438, + "loss": 0.4903, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.280937671661377, + "rewards/margins": 0.8498051762580872, + "rewards/rejected": -2.1307430267333984, + "step": 420 + }, + { + "epoch": 0.11, + "learning_rate": 4.997694702533016e-06, + "logits/chosen": -1.29677414894104, + "logits/rejected": -1.0692778825759888, + "logps/chosen": -548.71826171875, + "logps/rejected": -724.1248779296875, + "loss": 0.466, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5205175876617432, + "rewards/margins": 0.9235566854476929, + "rewards/rejected": -2.4440743923187256, + "step": 430 + }, + { + "epoch": 0.12, + "learning_rate": 4.996609610933713e-06, + "logits/chosen": -1.3901933431625366, + "logits/rejected": -1.1403733491897583, + "logps/chosen": -505.74139404296875, + "logps/rejected": -676.5526123046875, + "loss": 0.5209, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.319435715675354, + "rewards/margins": 0.8349205255508423, + "rewards/rejected": -2.1543564796447754, + "step": 440 + }, + { + "epoch": 0.12, + "learning_rate": 4.995316053150366e-06, + "logits/chosen": -1.3645076751708984, + "logits/rejected": -1.0972566604614258, + "logps/chosen": -509.98016357421875, + "logps/rejected": -682.1041259765625, + "loss": 0.5307, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.315499186515808, + "rewards/margins": 0.8478175401687622, + "rewards/rejected": -2.163316488265991, + "step": 450 + }, + { + "epoch": 0.12, + "learning_rate": 4.9938141371946815e-06, + "logits/chosen": -1.3621938228607178, + "logits/rejected": -0.9139550924301147, + "logps/chosen": -506.2652893066406, + "logps/rejected": -685.5975341796875, + "loss": 0.4647, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2151587009429932, + "rewards/margins": 0.93292635679245, + "rewards/rejected": -2.148085117340088, + "step": 460 + }, + { + "epoch": 0.12, + "learning_rate": 4.992103988476206e-06, + "logits/chosen": -1.175817847251892, + "logits/rejected": -0.6028262376785278, + "logps/chosen": -476.334716796875, + "logps/rejected": -651.3397216796875, + "loss": 0.4581, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2715951204299927, + "rewards/margins": 0.8561094999313354, + "rewards/rejected": -2.127704381942749, + "step": 470 + }, + { + "epoch": 0.13, + "learning_rate": 4.990185749791866e-06, + "logits/chosen": -0.8731945753097534, + "logits/rejected": -0.5909063220024109, + "logps/chosen": -551.8997802734375, + "logps/rejected": -691.8196411132812, + "loss": 0.4837, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4918056726455688, + "rewards/margins": 1.0931782722473145, + "rewards/rejected": -2.5849835872650146, + "step": 480 + }, + { + "epoch": 0.13, + "learning_rate": 4.9880595813140395e-06, + "logits/chosen": -0.8462217450141907, + "logits/rejected": -0.4468405246734619, + "logps/chosen": -490.5753479003906, + "logps/rejected": -612.1764526367188, + "loss": 0.4908, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.316733956336975, + "rewards/margins": 0.7920147180557251, + "rewards/rejected": -2.108748435974121, + "step": 490 + }, + { + "epoch": 0.13, + "learning_rate": 4.985725660577184e-06, + "logits/chosen": -0.6493757963180542, + "logits/rejected": -0.6190133690834045, + "logps/chosen": -496.7198181152344, + "logps/rejected": -738.5999145507812, + "loss": 0.4759, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4519720077514648, + "rewards/margins": 1.00557541847229, + "rewards/rejected": -2.457547664642334, + "step": 500 + }, + { + "epoch": 0.13, + "learning_rate": 4.983184182463009e-06, + "logits/chosen": -0.6732873916625977, + "logits/rejected": -0.4988276958465576, + "logps/chosen": -510.90106201171875, + "logps/rejected": -625.2664794921875, + "loss": 0.4795, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3072543144226074, + "rewards/margins": 0.8223368525505066, + "rewards/rejected": -2.1295909881591797, + "step": 510 + }, + { + "epoch": 0.14, + "learning_rate": 4.980435359184203e-06, + "logits/chosen": -1.0567286014556885, + "logits/rejected": -0.561193585395813, + "logps/chosen": -564.832275390625, + "logps/rejected": -718.8677368164062, + "loss": 0.5194, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5574848651885986, + "rewards/margins": 0.938727855682373, + "rewards/rejected": -2.496212959289551, + "step": 520 + }, + { + "epoch": 0.14, + "learning_rate": 4.9774794202667236e-06, + "logits/chosen": -0.9730321168899536, + "logits/rejected": -0.3585650324821472, + "logps/chosen": -440.9288024902344, + "logps/rejected": -633.592041015625, + "loss": 0.3516, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.1980558633804321, + "rewards/margins": 1.1983206272125244, + "rewards/rejected": -2.396376132965088, + "step": 530 + }, + { + "epoch": 0.14, + "learning_rate": 4.974316612530615e-06, + "logits/chosen": -0.5258805751800537, + "logits/rejected": -0.13965483009815216, + "logps/chosen": -500.12091064453125, + "logps/rejected": -707.7438354492188, + "loss": 0.4002, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.656359314918518, + "rewards/margins": 1.194873571395874, + "rewards/rejected": -2.8512330055236816, + "step": 540 + }, + { + "epoch": 0.14, + "learning_rate": 4.970947200069416e-06, + "logits/chosen": -0.8342685699462891, + "logits/rejected": 0.23774346709251404, + "logps/chosen": -462.1539611816406, + "logps/rejected": -631.550537109375, + "loss": 0.5032, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5009446144104004, + "rewards/margins": 1.0611143112182617, + "rewards/rejected": -2.562058925628662, + "step": 550 + }, + { + "epoch": 0.15, + "learning_rate": 4.967371464228096e-06, + "logits/chosen": -0.44802480936050415, + "logits/rejected": -0.0694938451051712, + "logps/chosen": -492.4813537597656, + "logps/rejected": -663.474365234375, + "loss": 0.4973, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5952883958816528, + "rewards/margins": 1.0112850666046143, + "rewards/rejected": -2.6065733432769775, + "step": 560 + }, + { + "epoch": 0.15, + "learning_rate": 4.963589703579569e-06, + "logits/chosen": -0.5677907466888428, + "logits/rejected": 0.08190581947565079, + "logps/chosen": -503.6414489746094, + "logps/rejected": -645.163330078125, + "loss": 0.4839, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4353834390640259, + "rewards/margins": 0.922812819480896, + "rewards/rejected": -2.358196496963501, + "step": 570 + }, + { + "epoch": 0.15, + "learning_rate": 4.9596022338997615e-06, + "logits/chosen": -0.5412989854812622, + "logits/rejected": 0.47167086601257324, + "logps/chosen": -497.56103515625, + "logps/rejected": -681.288818359375, + "loss": 0.4859, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6671890020370483, + "rewards/margins": 1.1972028017044067, + "rewards/rejected": -2.864391803741455, + "step": 580 + }, + { + "epoch": 0.15, + "learning_rate": 4.955409388141243e-06, + "logits/chosen": -0.8228802680969238, + "logits/rejected": 0.08221787214279175, + "logps/chosen": -544.5670776367188, + "logps/rejected": -733.0662231445312, + "loss": 0.5465, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6897475719451904, + "rewards/margins": 1.3086159229278564, + "rewards/rejected": -2.998363971710205, + "step": 590 + }, + { + "epoch": 0.16, + "learning_rate": 4.951011516405429e-06, + "logits/chosen": -0.3682901859283447, + "logits/rejected": 0.21380114555358887, + "logps/chosen": -472.1390686035156, + "logps/rejected": -650.9225463867188, + "loss": 0.4794, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3374059200286865, + "rewards/margins": 0.9791024923324585, + "rewards/rejected": -2.3165085315704346, + "step": 600 + }, + { + "epoch": 0.16, + "eval_logits/chosen": 0.4449966549873352, + "eval_logits/rejected": 1.2460291385650635, + "eval_logps/chosen": -494.4773254394531, + "eval_logps/rejected": -666.9945068359375, + "eval_loss": 0.4700670540332794, + "eval_rewards/accuracies": 0.7699999809265137, + "eval_rewards/chosen": -1.38816499710083, + "eval_rewards/margins": 1.0917549133300781, + "eval_rewards/rejected": -2.479919910430908, + "eval_runtime": 1372.1838, + "eval_samples_per_second": 1.458, + "eval_steps_per_second": 0.364, + "step": 600 + }, + { + "epoch": 0.16, + "learning_rate": 4.946408985913344e-06, + "logits/chosen": -1.1787288188934326, + "logits/rejected": 0.032404374331235886, + "logps/chosen": -540.3707275390625, + "logps/rejected": -690.0636596679688, + "loss": 0.4104, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2028136253356934, + "rewards/margins": 1.3079878091812134, + "rewards/rejected": -2.510801315307617, + "step": 610 + }, + { + "epoch": 0.16, + "learning_rate": 4.941602180974958e-06, + "logits/chosen": -0.5976434946060181, + "logits/rejected": 0.23231466114521027, + "logps/chosen": -513.5987548828125, + "logps/rejected": -683.162353515625, + "loss": 0.4517, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.503683090209961, + "rewards/margins": 1.2033047676086426, + "rewards/rejected": -2.7069880962371826, + "step": 620 + }, + { + "epoch": 0.16, + "learning_rate": 4.936591502957101e-06, + "logits/chosen": -0.6624077558517456, + "logits/rejected": 0.3608100712299347, + "logps/chosen": -447.55230712890625, + "logps/rejected": -660.681640625, + "loss": 0.3989, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.3714964389801025, + "rewards/margins": 1.2194106578826904, + "rewards/rejected": -2.590907335281372, + "step": 630 + }, + { + "epoch": 0.17, + "learning_rate": 4.931377370249946e-06, + "logits/chosen": -0.27748388051986694, + "logits/rejected": 0.2729637026786804, + "logps/chosen": -528.355224609375, + "logps/rejected": -733.2789916992188, + "loss": 0.4943, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8842767477035522, + "rewards/margins": 1.311034917831421, + "rewards/rejected": -3.1953113079071045, + "step": 640 + }, + { + "epoch": 0.17, + "learning_rate": 4.925960218232073e-06, + "logits/chosen": -0.4966405928134918, + "logits/rejected": 0.048351895064115524, + "logps/chosen": -491.5657653808594, + "logps/rejected": -743.1395874023438, + "loss": 0.4957, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6836191415786743, + "rewards/margins": 1.473381519317627, + "rewards/rejected": -3.157000780105591, + "step": 650 + }, + { + "epoch": 0.17, + "learning_rate": 4.920340499234116e-06, + "logits/chosen": -0.24024248123168945, + "logits/rejected": -0.06672336161136627, + "logps/chosen": -472.2447814941406, + "logps/rejected": -663.1290893554688, + "loss": 0.4382, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4003952741622925, + "rewards/margins": 1.0303130149841309, + "rewards/rejected": -2.430708408355713, + "step": 660 + }, + { + "epoch": 0.18, + "learning_rate": 4.914518682500995e-06, + "logits/chosen": -0.8933698534965515, + "logits/rejected": -0.014977499842643738, + "logps/chosen": -509.6695251464844, + "logps/rejected": -674.0376586914062, + "loss": 0.4931, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4237163066864014, + "rewards/margins": 1.024255394935608, + "rewards/rejected": -2.4479715824127197, + "step": 670 + }, + { + "epoch": 0.18, + "learning_rate": 4.9084952541527315e-06, + "logits/chosen": -0.5897430181503296, + "logits/rejected": 0.24839851260185242, + "logps/chosen": -533.2153930664062, + "logps/rejected": -633.3540649414062, + "loss": 0.4995, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5195014476776123, + "rewards/margins": 0.7836617827415466, + "rewards/rejected": -2.3031630516052246, + "step": 680 + }, + { + "epoch": 0.18, + "learning_rate": 4.902270717143858e-06, + "logits/chosen": -0.20460684597492218, + "logits/rejected": 0.12876734137535095, + "logps/chosen": -454.1747131347656, + "logps/rejected": -706.5838623046875, + "loss": 0.3716, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.4049168825149536, + "rewards/margins": 1.3669251203536987, + "rewards/rejected": -2.7718420028686523, + "step": 690 + }, + { + "epoch": 0.18, + "learning_rate": 4.895845591221427e-06, + "logits/chosen": -0.5709416270256042, + "logits/rejected": 0.21547503769397736, + "logps/chosen": -543.8369750976562, + "logps/rejected": -720.9224853515625, + "loss": 0.4566, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6587600708007812, + "rewards/margins": 1.1786607503890991, + "rewards/rejected": -2.83742094039917, + "step": 700 + }, + { + "epoch": 0.19, + "learning_rate": 4.8892204128816e-06, + "logits/chosen": -0.27632415294647217, + "logits/rejected": -0.12886568903923035, + "logps/chosen": -468.2186584472656, + "logps/rejected": -706.5438232421875, + "loss": 0.4874, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4443271160125732, + "rewards/margins": 1.2774336338043213, + "rewards/rejected": -2.7217605113983154, + "step": 710 + }, + { + "epoch": 0.19, + "learning_rate": 4.882395735324864e-06, + "logits/chosen": -0.5171593427658081, + "logits/rejected": 0.06492243707180023, + "logps/chosen": -399.1227111816406, + "logps/rejected": -646.8035888671875, + "loss": 0.4334, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.1794382333755493, + "rewards/margins": 1.383467435836792, + "rewards/rejected": -2.5629055500030518, + "step": 720 + }, + { + "epoch": 0.19, + "learning_rate": 4.87537212840983e-06, + "logits/chosen": -0.38013777136802673, + "logits/rejected": 0.17461785674095154, + "logps/chosen": -431.56939697265625, + "logps/rejected": -727.7451171875, + "loss": 0.403, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.2866822481155396, + "rewards/margins": 1.4361357688903809, + "rewards/rejected": -2.722817897796631, + "step": 730 + }, + { + "epoch": 0.19, + "learning_rate": 4.8681501786056545e-06, + "logits/chosen": -0.4910075068473816, + "logits/rejected": 0.1193656325340271, + "logps/chosen": -524.5296630859375, + "logps/rejected": -716.439697265625, + "loss": 0.5003, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4974342584609985, + "rewards/margins": 1.3101706504821777, + "rewards/rejected": -2.8076047897338867, + "step": 740 + }, + { + "epoch": 0.2, + "learning_rate": 4.860730488943068e-06, + "logits/chosen": -0.4912436902523041, + "logits/rejected": 0.18202224373817444, + "logps/chosen": -527.1237182617188, + "logps/rejected": -659.1048583984375, + "loss": 0.4865, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5281684398651123, + "rewards/margins": 0.9601262211799622, + "rewards/rejected": -2.4882943630218506, + "step": 750 + }, + { + "epoch": 0.2, + "learning_rate": 4.853113678964022e-06, + "logits/chosen": -0.8603304624557495, + "logits/rejected": -0.08993472903966904, + "logps/chosen": -432.42279052734375, + "logps/rejected": -642.409912109375, + "loss": 0.3753, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.2163296937942505, + "rewards/margins": 1.3121674060821533, + "rewards/rejected": -2.5284969806671143, + "step": 760 + }, + { + "epoch": 0.2, + "learning_rate": 4.845300384669958e-06, + "logits/chosen": 0.24725647270679474, + "logits/rejected": 0.0013871907722204924, + "logps/chosen": -527.3872680664062, + "logps/rejected": -742.9652099609375, + "loss": 0.4498, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8072938919067383, + "rewards/margins": 1.2479110956192017, + "rewards/rejected": -3.0552048683166504, + "step": 770 + }, + { + "epoch": 0.2, + "learning_rate": 4.837291258468701e-06, + "logits/chosen": -0.6004719734191895, + "logits/rejected": 0.5962368249893188, + "logps/chosen": -507.5689392089844, + "logps/rejected": -673.4093627929688, + "loss": 0.4537, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6435458660125732, + "rewards/margins": 1.2729610204696655, + "rewards/rejected": -2.9165070056915283, + "step": 780 + }, + { + "epoch": 0.21, + "learning_rate": 4.829086969119984e-06, + "logits/chosen": -0.4771009385585785, + "logits/rejected": 0.03888826444745064, + "logps/chosen": -551.6219482421875, + "logps/rejected": -797.9405517578125, + "loss": 0.439, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8441495895385742, + "rewards/margins": 1.5053937435150146, + "rewards/rejected": -3.349543333053589, + "step": 790 + }, + { + "epoch": 0.21, + "learning_rate": 4.820688201679605e-06, + "logits/chosen": -0.36556169390678406, + "logits/rejected": 0.3322374224662781, + "logps/chosen": -510.71661376953125, + "logps/rejected": -704.6385498046875, + "loss": 0.4447, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6285368204116821, + "rewards/margins": 1.4855858087539673, + "rewards/rejected": -3.1141226291656494, + "step": 800 + }, + { + "epoch": 0.21, + "learning_rate": 4.8120956574422315e-06, + "logits/chosen": -0.3884666860103607, + "logits/rejected": -0.13175992667675018, + "logps/chosen": -518.641357421875, + "logps/rejected": -642.6422119140625, + "loss": 0.5053, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5760186910629272, + "rewards/margins": 0.9459335207939148, + "rewards/rejected": -2.5219521522521973, + "step": 810 + }, + { + "epoch": 0.21, + "learning_rate": 4.803310053882831e-06, + "logits/chosen": -1.0650156736373901, + "logits/rejected": 0.38095536828041077, + "logps/chosen": -542.5806274414062, + "logps/rejected": -690.2403564453125, + "loss": 0.4374, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6415122747421265, + "rewards/margins": 1.045480728149414, + "rewards/rejected": -2.686992883682251, + "step": 820 + }, + { + "epoch": 0.22, + "learning_rate": 4.794332124596775e-06, + "logits/chosen": -0.6507046222686768, + "logits/rejected": 0.048465847969055176, + "logps/chosen": -479.3224182128906, + "logps/rejected": -674.3448486328125, + "loss": 0.401, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3757288455963135, + "rewards/margins": 1.2602720260620117, + "rewards/rejected": -2.636000633239746, + "step": 830 + }, + { + "epoch": 0.22, + "learning_rate": 4.785162619238575e-06, + "logits/chosen": -0.6413692235946655, + "logits/rejected": -0.042513225227594376, + "logps/chosen": -557.0155029296875, + "logps/rejected": -792.69970703125, + "loss": 0.4201, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6387875080108643, + "rewards/margins": 1.4760812520980835, + "rewards/rejected": -3.1148688793182373, + "step": 840 + }, + { + "epoch": 0.22, + "learning_rate": 4.775802303459288e-06, + "logits/chosen": -0.74284428358078, + "logits/rejected": -0.1755208522081375, + "logps/chosen": -588.0230712890625, + "logps/rejected": -802.0186767578125, + "loss": 0.4069, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7381775379180908, + "rewards/margins": 1.4629442691802979, + "rewards/rejected": -3.2011218070983887, + "step": 850 + }, + { + "epoch": 0.23, + "learning_rate": 4.766251958842589e-06, + "logits/chosen": -0.42949801683425903, + "logits/rejected": -0.4735488295555115, + "logps/chosen": -518.86572265625, + "logps/rejected": -755.33154296875, + "loss": 0.4045, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.6570861339569092, + "rewards/margins": 1.4493482112884521, + "rewards/rejected": -3.1064348220825195, + "step": 860 + }, + { + "epoch": 0.23, + "learning_rate": 4.7565123828395066e-06, + "logits/chosen": -0.7870966196060181, + "logits/rejected": 0.0325060598552227, + "logps/chosen": -515.7349243164062, + "logps/rejected": -726.00927734375, + "loss": 0.5102, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.6367769241333008, + "rewards/margins": 1.335331678390503, + "rewards/rejected": -2.9721086025238037, + "step": 870 + }, + { + "epoch": 0.23, + "learning_rate": 4.746584388701831e-06, + "logits/chosen": -0.7208787202835083, + "logits/rejected": -0.034571003168821335, + "logps/chosen": -488.56341552734375, + "logps/rejected": -712.2532958984375, + "loss": 0.5118, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7884547710418701, + "rewards/margins": 1.3315757513046265, + "rewards/rejected": -3.120030164718628, + "step": 880 + }, + { + "epoch": 0.23, + "learning_rate": 4.736468805414218e-06, + "logits/chosen": -0.7053539752960205, + "logits/rejected": -0.015096127986907959, + "logps/chosen": -474.1915588378906, + "logps/rejected": -634.8294677734375, + "loss": 0.4931, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4765193462371826, + "rewards/margins": 1.0172039270401, + "rewards/rejected": -2.4937233924865723, + "step": 890 + }, + { + "epoch": 0.24, + "learning_rate": 4.7261664776249595e-06, + "logits/chosen": -1.0484793186187744, + "logits/rejected": -0.35419899225234985, + "logps/chosen": -551.4525756835938, + "logps/rejected": -721.5409545898438, + "loss": 0.4519, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.49507737159729, + "rewards/margins": 1.2039979696273804, + "rewards/rejected": -2.699075222015381, + "step": 900 + }, + { + "epoch": 0.24, + "eval_logits/chosen": 0.19788537919521332, + "eval_logits/rejected": 1.0802806615829468, + "eval_logps/chosen": -498.0536804199219, + "eval_logps/rejected": -686.2431030273438, + "eval_loss": 0.456636905670166, + "eval_rewards/accuracies": 0.7730000019073486, + "eval_rewards/chosen": -1.42392897605896, + "eval_rewards/margins": 1.248477816581726, + "eval_rewards/rejected": -2.6724064350128174, + "eval_runtime": 1384.9339, + "eval_samples_per_second": 1.444, + "eval_steps_per_second": 0.361, + "step": 900 + }, + { + "epoch": 0.24, + "learning_rate": 4.715678265575463e-06, + "logits/chosen": -0.890539824962616, + "logits/rejected": 0.22412686049938202, + "logps/chosen": -490.0135803222656, + "logps/rejected": -712.4631958007812, + "loss": 0.3771, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.2612826824188232, + "rewards/margins": 1.602736234664917, + "rewards/rejected": -2.8640189170837402, + "step": 910 + }, + { + "epoch": 0.24, + "learning_rate": 4.705005045028415e-06, + "logits/chosen": -0.8860180974006653, + "logits/rejected": 0.03514351695775986, + "logps/chosen": -594.3389282226562, + "logps/rejected": -782.8358154296875, + "loss": 0.4195, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7765058279037476, + "rewards/margins": 1.2403103113174438, + "rewards/rejected": -3.0168161392211914, + "step": 920 + }, + { + "epoch": 0.24, + "learning_rate": 4.694147707194659e-06, + "logits/chosen": -0.5554194450378418, + "logits/rejected": 0.016318077221512794, + "logps/chosen": -585.2276000976562, + "logps/rejected": -805.9215087890625, + "loss": 0.3769, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8951377868652344, + "rewards/margins": 1.719668984413147, + "rewards/rejected": -3.61480712890625, + "step": 930 + }, + { + "epoch": 0.25, + "learning_rate": 4.683107158658782e-06, + "logits/chosen": -0.7583510875701904, + "logits/rejected": 0.25492575764656067, + "logps/chosen": -562.8099975585938, + "logps/rejected": -745.3627319335938, + "loss": 0.5012, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.83584725856781, + "rewards/margins": 1.2255539894104004, + "rewards/rejected": -3.061401128768921, + "step": 940 + }, + { + "epoch": 0.25, + "learning_rate": 4.671884321303407e-06, + "logits/chosen": -1.1498582363128662, + "logits/rejected": 0.7587814331054688, + "logps/chosen": -591.7152099609375, + "logps/rejected": -751.0265502929688, + "loss": 0.4275, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7667793035507202, + "rewards/margins": 1.4644314050674438, + "rewards/rejected": -3.231210231781006, + "step": 950 + }, + { + "epoch": 0.25, + "learning_rate": 4.660480132232224e-06, + "logits/chosen": -0.7957364320755005, + "logits/rejected": -0.17932990193367004, + "logps/chosen": -428.00958251953125, + "logps/rejected": -722.1585083007812, + "loss": 0.3945, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4679275751113892, + "rewards/margins": 1.536849021911621, + "rewards/rejected": -3.0047767162323, + "step": 960 + }, + { + "epoch": 0.25, + "learning_rate": 4.6488955436917414e-06, + "logits/chosen": -0.7032185196876526, + "logits/rejected": 0.244097039103508, + "logps/chosen": -511.335693359375, + "logps/rejected": -830.0217895507812, + "loss": 0.4831, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6884835958480835, + "rewards/margins": 1.69620680809021, + "rewards/rejected": -3.384690761566162, + "step": 970 + }, + { + "epoch": 0.26, + "learning_rate": 4.6371315229917644e-06, + "logits/chosen": -0.9565087556838989, + "logits/rejected": -0.6389614343643188, + "logps/chosen": -467.09814453125, + "logps/rejected": -740.5447998046875, + "loss": 0.4217, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.331072449684143, + "rewards/margins": 1.5709624290466309, + "rewards/rejected": -2.9020345211029053, + "step": 980 + }, + { + "epoch": 0.26, + "learning_rate": 4.625189052424638e-06, + "logits/chosen": -0.9426406621932983, + "logits/rejected": -0.6099969148635864, + "logps/chosen": -464.8932189941406, + "logps/rejected": -654.5234985351562, + "loss": 0.457, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.2798701524734497, + "rewards/margins": 1.0147043466567993, + "rewards/rejected": -2.29457426071167, + "step": 990 + }, + { + "epoch": 0.26, + "learning_rate": 4.613069129183218e-06, + "logits/chosen": -0.9859519004821777, + "logits/rejected": -0.3881329894065857, + "logps/chosen": -416.37750244140625, + "logps/rejected": -661.3399658203125, + "loss": 0.4259, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.1251696348190308, + "rewards/margins": 1.4806818962097168, + "rewards/rejected": -2.605851650238037, + "step": 1000 + }, + { + "epoch": 0.26, + "learning_rate": 4.600772765277607e-06, + "logits/chosen": -0.8805161714553833, + "logits/rejected": -0.5182097554206848, + "logps/chosen": -533.8609008789062, + "logps/rejected": -738.94921875, + "loss": 0.4221, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4609010219573975, + "rewards/margins": 1.319215178489685, + "rewards/rejected": -2.780116319656372, + "step": 1010 + }, + { + "epoch": 0.27, + "learning_rate": 4.588300987450652e-06, + "logits/chosen": -0.7461687922477722, + "logits/rejected": -0.11924894899129868, + "logps/chosen": -504.3556213378906, + "logps/rejected": -709.5269165039062, + "loss": 0.4198, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.595538854598999, + "rewards/margins": 1.395408272743225, + "rewards/rejected": -2.9909470081329346, + "step": 1020 + }, + { + "epoch": 0.27, + "learning_rate": 4.5756548370922136e-06, + "logits/chosen": -0.9196340441703796, + "logits/rejected": 0.15583333373069763, + "logps/chosen": -467.1847229003906, + "logps/rejected": -698.4019165039062, + "loss": 0.471, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.4349987506866455, + "rewards/margins": 1.6199867725372314, + "rewards/rejected": -3.054985523223877, + "step": 1030 + }, + { + "epoch": 0.27, + "learning_rate": 4.562835370152206e-06, + "logits/chosen": -0.9880784153938293, + "logits/rejected": -0.4833584427833557, + "logps/chosen": -474.13946533203125, + "logps/rejected": -677.9033813476562, + "loss": 0.4454, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.532492756843567, + "rewards/margins": 1.2476747035980225, + "rewards/rejected": -2.7801673412323, + "step": 1040 + }, + { + "epoch": 0.27, + "learning_rate": 4.54984365705243e-06, + "logits/chosen": -0.9044283628463745, + "logits/rejected": 0.2612631916999817, + "logps/chosen": -487.19573974609375, + "logps/rejected": -666.8839721679688, + "loss": 0.4112, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5281569957733154, + "rewards/margins": 1.2005985975265503, + "rewards/rejected": -2.728755474090576, + "step": 1050 + }, + { + "epoch": 0.28, + "learning_rate": 4.536680782597191e-06, + "logits/chosen": -1.1122404336929321, + "logits/rejected": 0.34001001715660095, + "logps/chosen": -424.924560546875, + "logps/rejected": -637.4059448242188, + "loss": 0.4054, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2176024913787842, + "rewards/margins": 1.382258653640747, + "rewards/rejected": -2.5998611450195312, + "step": 1060 + }, + { + "epoch": 0.28, + "learning_rate": 4.523347845882718e-06, + "logits/chosen": -1.2397373914718628, + "logits/rejected": -0.0026629925705492496, + "logps/chosen": -473.2481994628906, + "logps/rejected": -670.2132568359375, + "loss": 0.472, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.3779747486114502, + "rewards/margins": 1.2943679094314575, + "rewards/rejected": -2.672342300415039, + "step": 1070 + }, + { + "epoch": 0.28, + "learning_rate": 4.50984596020539e-06, + "logits/chosen": -1.0471652746200562, + "logits/rejected": -0.250629723072052, + "logps/chosen": -507.221435546875, + "logps/rejected": -730.1007080078125, + "loss": 0.4251, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5079420804977417, + "rewards/margins": 1.3903275728225708, + "rewards/rejected": -2.8982696533203125, + "step": 1080 + }, + { + "epoch": 0.29, + "learning_rate": 4.4961762529687745e-06, + "logits/chosen": -0.47195902466773987, + "logits/rejected": -0.18957489728927612, + "logps/chosen": -446.6513671875, + "logps/rejected": -749.2055053710938, + "loss": 0.3931, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.421588659286499, + "rewards/margins": 1.843101143836975, + "rewards/rejected": -3.2646899223327637, + "step": 1090 + }, + { + "epoch": 0.29, + "learning_rate": 4.482339865589492e-06, + "logits/chosen": -0.6734046936035156, + "logits/rejected": 0.2455734759569168, + "logps/chosen": -536.9996337890625, + "logps/rejected": -779.7747802734375, + "loss": 0.3976, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8160717487335205, + "rewards/margins": 1.6105674505233765, + "rewards/rejected": -3.4266390800476074, + "step": 1100 + }, + { + "epoch": 0.29, + "learning_rate": 4.468337953401909e-06, + "logits/chosen": -0.7846983075141907, + "logits/rejected": 0.3714667558670044, + "logps/chosen": -643.3924560546875, + "logps/rejected": -889.9669189453125, + "loss": 0.4624, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.112175703048706, + "rewards/margins": 1.6929798126220703, + "rewards/rejected": -3.8051555156707764, + "step": 1110 + }, + { + "epoch": 0.29, + "learning_rate": 4.45417168556166e-06, + "logits/chosen": -0.887243390083313, + "logits/rejected": 0.19038431346416473, + "logps/chosen": -501.67987060546875, + "logps/rejected": -794.1953735351562, + "loss": 0.4672, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6879377365112305, + "rewards/margins": 1.825568437576294, + "rewards/rejected": -3.5135064125061035, + "step": 1120 + }, + { + "epoch": 0.3, + "learning_rate": 4.439842244948036e-06, + "logits/chosen": -1.2177503108978271, + "logits/rejected": -0.813552975654602, + "logps/chosen": -524.879150390625, + "logps/rejected": -667.7706298828125, + "loss": 0.4243, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5180702209472656, + "rewards/margins": 1.0134727954864502, + "rewards/rejected": -2.531543016433716, + "step": 1130 + }, + { + "epoch": 0.3, + "learning_rate": 4.425350828065204e-06, + "logits/chosen": -1.0455009937286377, + "logits/rejected": 0.2684328556060791, + "logps/chosen": -534.6132202148438, + "logps/rejected": -721.2943115234375, + "loss": 0.3871, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7661949396133423, + "rewards/margins": 1.332626223564148, + "rewards/rejected": -3.0988211631774902, + "step": 1140 + }, + { + "epoch": 0.3, + "learning_rate": 4.410698644942303e-06, + "logits/chosen": -0.8786664009094238, + "logits/rejected": 0.14112402498722076, + "logps/chosen": -458.5977478027344, + "logps/rejected": -710.25927734375, + "loss": 0.3959, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.411984920501709, + "rewards/margins": 1.6877896785736084, + "rewards/rejected": -3.0997745990753174, + "step": 1150 + }, + { + "epoch": 0.3, + "learning_rate": 4.395886919032406e-06, + "logits/chosen": -0.8496414422988892, + "logits/rejected": -0.006237986497581005, + "logps/chosen": -508.9517517089844, + "logps/rejected": -735.8714599609375, + "loss": 0.4109, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.5958458185195923, + "rewards/margins": 1.4991271495819092, + "rewards/rejected": -3.094973087310791, + "step": 1160 + }, + { + "epoch": 0.31, + "learning_rate": 4.380916887110366e-06, + "logits/chosen": -0.7842418551445007, + "logits/rejected": -0.08200596272945404, + "logps/chosen": -496.2059020996094, + "logps/rejected": -729.1738891601562, + "loss": 0.3737, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.5399061441421509, + "rewards/margins": 1.5425506830215454, + "rewards/rejected": -3.0824568271636963, + "step": 1170 + }, + { + "epoch": 0.31, + "learning_rate": 4.365789799169539e-06, + "logits/chosen": -1.2651920318603516, + "logits/rejected": 0.12391755729913712, + "logps/chosen": -512.1771850585938, + "logps/rejected": -717.8344116210938, + "loss": 0.4228, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5166397094726562, + "rewards/margins": 1.4622437953948975, + "rewards/rejected": -2.9788835048675537, + "step": 1180 + }, + { + "epoch": 0.31, + "learning_rate": 4.350506918317416e-06, + "logits/chosen": -0.9895895719528198, + "logits/rejected": -0.03255582973361015, + "logps/chosen": -519.2945556640625, + "logps/rejected": -687.9844970703125, + "loss": 0.4106, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.6044280529022217, + "rewards/margins": 1.4460744857788086, + "rewards/rejected": -3.050502300262451, + "step": 1190 + }, + { + "epoch": 0.31, + "learning_rate": 4.335069520670149e-06, + "logits/chosen": -0.8492997884750366, + "logits/rejected": -0.5858234167098999, + "logps/chosen": -516.3770141601562, + "logps/rejected": -707.2117309570312, + "loss": 0.4034, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7651269435882568, + "rewards/margins": 1.3661835193634033, + "rewards/rejected": -3.1313109397888184, + "step": 1200 + }, + { + "epoch": 0.31, + "eval_logits/chosen": 0.7243556976318359, + "eval_logits/rejected": 1.7155570983886719, + "eval_logps/chosen": -545.945068359375, + "eval_logps/rejected": -770.7060546875, + "eval_loss": 0.44869744777679443, + "eval_rewards/accuracies": 0.7870000004768372, + "eval_rewards/chosen": -1.9028427600860596, + "eval_rewards/margins": 1.614193320274353, + "eval_rewards/rejected": -3.517036199569702, + "eval_runtime": 1383.7156, + "eval_samples_per_second": 1.445, + "eval_steps_per_second": 0.361, + "step": 1200 + }, + { + "epoch": 0.32, + "learning_rate": 4.319478895246e-06, + "logits/chosen": -0.6312126517295837, + "logits/rejected": 0.010389542207121849, + "logps/chosen": -485.5673828125, + "logps/rejected": -724.3965454101562, + "loss": 0.4596, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7340030670166016, + "rewards/margins": 1.5444698333740234, + "rewards/rejected": -3.278472900390625, + "step": 1210 + }, + { + "epoch": 0.32, + "learning_rate": 4.303736343857704e-06, + "logits/chosen": -0.5306238532066345, + "logits/rejected": 0.09534727036952972, + "logps/chosen": -501.3701171875, + "logps/rejected": -773.635498046875, + "loss": 0.4065, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6401008367538452, + "rewards/margins": 1.739175796508789, + "rewards/rejected": -3.379276752471924, + "step": 1220 + }, + { + "epoch": 0.32, + "learning_rate": 4.287843181003772e-06, + "logits/chosen": -0.7599018812179565, + "logits/rejected": 0.544152557849884, + "logps/chosen": -492.7386779785156, + "logps/rejected": -739.1934814453125, + "loss": 0.3467, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.591827392578125, + "rewards/margins": 1.6447445154190063, + "rewards/rejected": -3.236571788787842, + "step": 1230 + }, + { + "epoch": 0.32, + "learning_rate": 4.27180073375873e-06, + "logits/chosen": -0.7321812510490417, + "logits/rejected": -0.04632633179426193, + "logps/chosen": -503.90655517578125, + "logps/rejected": -751.9849243164062, + "loss": 0.4352, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7118949890136719, + "rewards/margins": 1.5225197076797485, + "rewards/rejected": -3.234414577484131, + "step": 1240 + }, + { + "epoch": 0.33, + "learning_rate": 4.255610341662304e-06, + "logits/chosen": -0.8730325698852539, + "logits/rejected": -0.1791534423828125, + "logps/chosen": -524.244140625, + "logps/rejected": -752.1405029296875, + "loss": 0.3827, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7566620111465454, + "rewards/margins": 1.7185337543487549, + "rewards/rejected": -3.4751956462860107, + "step": 1250 + }, + { + "epoch": 0.33, + "learning_rate": 4.2392733566075764e-06, + "logits/chosen": -0.8349526524543762, + "logits/rejected": 0.03989090770483017, + "logps/chosen": -555.2600708007812, + "logps/rejected": -799.1808471679688, + "loss": 0.4153, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8972892761230469, + "rewards/margins": 1.5951424837112427, + "rewards/rejected": -3.492431640625, + "step": 1260 + }, + { + "epoch": 0.33, + "learning_rate": 4.2227911427280975e-06, + "logits/chosen": -1.2052414417266846, + "logits/rejected": 0.40989890694618225, + "logps/chosen": -535.0908203125, + "logps/rejected": -766.23974609375, + "loss": 0.4851, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7357299327850342, + "rewards/margins": 1.7371448278427124, + "rewards/rejected": -3.472874879837036, + "step": 1270 + }, + { + "epoch": 0.33, + "learning_rate": 4.206165076283983e-06, + "logits/chosen": -1.3161356449127197, + "logits/rejected": -0.31805121898651123, + "logps/chosen": -562.9700927734375, + "logps/rejected": -699.4861450195312, + "loss": 0.4345, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6805912256240845, + "rewards/margins": 1.0278584957122803, + "rewards/rejected": -2.708449602127075, + "step": 1280 + }, + { + "epoch": 0.34, + "learning_rate": 4.189396545546995e-06, + "logits/chosen": -1.2732408046722412, + "logits/rejected": -0.36048611998558044, + "logps/chosen": -463.95849609375, + "logps/rejected": -677.6444091796875, + "loss": 0.4661, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.385292649269104, + "rewards/margins": 1.3424113988876343, + "rewards/rejected": -2.7277040481567383, + "step": 1290 + }, + { + "epoch": 0.34, + "learning_rate": 4.172486950684627e-06, + "logits/chosen": -1.3132286071777344, + "logits/rejected": 0.031231578439474106, + "logps/chosen": -535.4584350585938, + "logps/rejected": -707.3728637695312, + "loss": 0.439, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6812185049057007, + "rewards/margins": 1.3230525255203247, + "rewards/rejected": -3.0042712688446045, + "step": 1300 + }, + { + "epoch": 0.34, + "learning_rate": 4.155437703643182e-06, + "logits/chosen": -1.2260167598724365, + "logits/rejected": -0.5245649814605713, + "logps/chosen": -462.94403076171875, + "logps/rejected": -694.3932495117188, + "loss": 0.4018, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4110779762268066, + "rewards/margins": 1.4465053081512451, + "rewards/rejected": -2.8575832843780518, + "step": 1310 + }, + { + "epoch": 0.35, + "learning_rate": 4.138250228029882e-06, + "logits/chosen": -1.0029791593551636, + "logits/rejected": -0.45876234769821167, + "logps/chosen": -471.4588928222656, + "logps/rejected": -711.7996215820312, + "loss": 0.3972, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6702163219451904, + "rewards/margins": 1.6204955577850342, + "rewards/rejected": -3.2907118797302246, + "step": 1320 + }, + { + "epoch": 0.35, + "learning_rate": 4.120925958993994e-06, + "logits/chosen": -0.5831348299980164, + "logits/rejected": -0.2067866027355194, + "logps/chosen": -476.1014099121094, + "logps/rejected": -722.8709106445312, + "loss": 0.4493, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6658756732940674, + "rewards/margins": 1.6795238256454468, + "rewards/rejected": -3.3453993797302246, + "step": 1330 + }, + { + "epoch": 0.35, + "learning_rate": 4.103466343106999e-06, + "logits/chosen": -1.077728033065796, + "logits/rejected": 0.1974649727344513, + "logps/chosen": -636.1361083984375, + "logps/rejected": -829.7213134765625, + "loss": 0.5001, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.2163033485412598, + "rewards/margins": 1.5475972890853882, + "rewards/rejected": -3.7639007568359375, + "step": 1340 + }, + { + "epoch": 0.35, + "learning_rate": 4.085872838241797e-06, + "logits/chosen": -0.8451377749443054, + "logits/rejected": -0.4800887703895569, + "logps/chosen": -523.9595947265625, + "logps/rejected": -813.6463012695312, + "loss": 0.4193, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.937745451927185, + "rewards/margins": 1.8354793787002563, + "rewards/rejected": -3.7732245922088623, + "step": 1350 + }, + { + "epoch": 0.36, + "learning_rate": 4.06814691345098e-06, + "logits/chosen": -0.8073797225952148, + "logits/rejected": -0.67634516954422, + "logps/chosen": -499.55364990234375, + "logps/rejected": -725.6375732421875, + "loss": 0.5085, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8341518640518188, + "rewards/margins": 1.3492587804794312, + "rewards/rejected": -3.18341064453125, + "step": 1360 + }, + { + "epoch": 0.36, + "learning_rate": 4.050290048844171e-06, + "logits/chosen": -1.3967281579971313, + "logits/rejected": -0.1124715581536293, + "logps/chosen": -586.6700439453125, + "logps/rejected": -785.9407958984375, + "loss": 0.4425, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6672008037567139, + "rewards/margins": 1.4039936065673828, + "rewards/rejected": -3.0711944103240967, + "step": 1370 + }, + { + "epoch": 0.36, + "learning_rate": 4.032303735464422e-06, + "logits/chosen": -1.3250014781951904, + "logits/rejected": -0.28036853671073914, + "logps/chosen": -523.9191284179688, + "logps/rejected": -757.8228759765625, + "loss": 0.3506, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.4709548950195312, + "rewards/margins": 1.7138077020645142, + "rewards/rejected": -3.184762477874756, + "step": 1380 + }, + { + "epoch": 0.36, + "learning_rate": 4.014189475163727e-06, + "logits/chosen": -0.9671980142593384, + "logits/rejected": -0.292216956615448, + "logps/chosen": -559.5821533203125, + "logps/rejected": -784.157958984375, + "loss": 0.3887, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7415062189102173, + "rewards/margins": 1.629124641418457, + "rewards/rejected": -3.370630979537964, + "step": 1390 + }, + { + "epoch": 0.37, + "learning_rate": 3.995948780477605e-06, + "logits/chosen": -1.1137468814849854, + "logits/rejected": -0.3016485273838043, + "logps/chosen": -586.1868286132812, + "logps/rejected": -790.1680908203125, + "loss": 0.4715, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8678308725357056, + "rewards/margins": 1.3062608242034912, + "rewards/rejected": -3.1740918159484863, + "step": 1400 + }, + { + "epoch": 0.37, + "learning_rate": 3.977583174498816e-06, + "logits/chosen": -0.9699400067329407, + "logits/rejected": -0.7510320544242859, + "logps/chosen": -545.1305541992188, + "logps/rejected": -793.4750366210938, + "loss": 0.4399, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7863260507583618, + "rewards/margins": 1.481147050857544, + "rewards/rejected": -3.267472743988037, + "step": 1410 + }, + { + "epoch": 0.37, + "learning_rate": 3.959094190750172e-06, + "logits/chosen": -1.3356729745864868, + "logits/rejected": -0.6711053848266602, + "logps/chosen": -574.8829956054688, + "logps/rejected": -773.9825439453125, + "loss": 0.3984, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7231754064559937, + "rewards/margins": 1.5199847221374512, + "rewards/rejected": -3.2431602478027344, + "step": 1420 + }, + { + "epoch": 0.37, + "learning_rate": 3.9404833730564975e-06, + "logits/chosen": -1.2241183519363403, + "logits/rejected": -0.7589890956878662, + "logps/chosen": -524.2040405273438, + "logps/rejected": -684.8192138671875, + "loss": 0.4751, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.550222396850586, + "rewards/margins": 1.2541849613189697, + "rewards/rejected": -2.8044073581695557, + "step": 1430 + }, + { + "epoch": 0.38, + "learning_rate": 3.921752275415712e-06, + "logits/chosen": -1.2616220712661743, + "logits/rejected": -0.6698473691940308, + "logps/chosen": -494.5272521972656, + "logps/rejected": -719.5067749023438, + "loss": 0.407, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5580322742462158, + "rewards/margins": 1.4134986400604248, + "rewards/rejected": -2.9715309143066406, + "step": 1440 + }, + { + "epoch": 0.38, + "learning_rate": 3.902902461869079e-06, + "logits/chosen": -0.9971591234207153, + "logits/rejected": -0.4130152761936188, + "logps/chosen": -407.35736083984375, + "logps/rejected": -699.933837890625, + "loss": 0.3878, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3271167278289795, + "rewards/margins": 1.8487539291381836, + "rewards/rejected": -3.175870895385742, + "step": 1450 + }, + { + "epoch": 0.38, + "learning_rate": 3.883935506370605e-06, + "logits/chosen": -1.3185368776321411, + "logits/rejected": -0.5751763582229614, + "logps/chosen": -538.9137573242188, + "logps/rejected": -727.714599609375, + "loss": 0.3787, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7377328872680664, + "rewards/margins": 1.1966904401779175, + "rewards/rejected": -2.9344232082366943, + "step": 1460 + }, + { + "epoch": 0.38, + "learning_rate": 3.864852992655617e-06, + "logits/chosen": -1.2166504859924316, + "logits/rejected": -0.45289698243141174, + "logps/chosen": -516.9674682617188, + "logps/rejected": -733.4182739257812, + "loss": 0.3926, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6966655254364014, + "rewards/margins": 1.5359889268875122, + "rewards/rejected": -3.232654571533203, + "step": 1470 + }, + { + "epoch": 0.39, + "learning_rate": 3.845656514108516e-06, + "logits/chosen": -1.2518984079360962, + "logits/rejected": -0.23703515529632568, + "logps/chosen": -571.0328979492188, + "logps/rejected": -839.0745239257812, + "loss": 0.409, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9428062438964844, + "rewards/margins": 1.905398964881897, + "rewards/rejected": -3.84820556640625, + "step": 1480 + }, + { + "epoch": 0.39, + "learning_rate": 3.826347673629738e-06, + "logits/chosen": -1.241381049156189, + "logits/rejected": -0.424204021692276, + "logps/chosen": -517.6029052734375, + "logps/rejected": -811.5399169921875, + "loss": 0.4235, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9763988256454468, + "rewards/margins": 1.8420568704605103, + "rewards/rejected": -3.818455457687378, + "step": 1490 + }, + { + "epoch": 0.39, + "learning_rate": 3.8069280835019062e-06, + "logits/chosen": -1.3552948236465454, + "logits/rejected": -0.24583733081817627, + "logps/chosen": -539.5227661132812, + "logps/rejected": -833.4625244140625, + "loss": 0.4193, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.008434295654297, + "rewards/margins": 2.0153846740722656, + "rewards/rejected": -4.0238189697265625, + "step": 1500 + }, + { + "epoch": 0.39, + "eval_logits/chosen": 0.0019010701216757298, + "eval_logits/rejected": 0.9998253583908081, + "eval_logps/chosen": -544.3021240234375, + "eval_logps/rejected": -767.47119140625, + "eval_loss": 0.44199585914611816, + "eval_rewards/accuracies": 0.7839999794960022, + "eval_rewards/chosen": -1.8864127397537231, + "eval_rewards/margins": 1.5982747077941895, + "eval_rewards/rejected": -3.484687566757202, + "eval_runtime": 1381.698, + "eval_samples_per_second": 1.447, + "eval_steps_per_second": 0.362, + "step": 1500 + }, + { + "epoch": 0.4, + "learning_rate": 3.7873993652552077e-06, + "logits/chosen": -1.0648901462554932, + "logits/rejected": -0.4144531786441803, + "logps/chosen": -475.7496643066406, + "logps/rejected": -861.4631958007812, + "loss": 0.3316, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.5341570377349854, + "rewards/margins": 2.4849419593811035, + "rewards/rejected": -4.019099235534668, + "step": 1510 + }, + { + "epoch": 0.4, + "learning_rate": 3.7677631495319953e-06, + "logits/chosen": -1.0717235803604126, + "logits/rejected": -0.44138726592063904, + "logps/chosen": -545.8651733398438, + "logps/rejected": -788.2182006835938, + "loss": 0.4331, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6811964511871338, + "rewards/margins": 1.6335046291351318, + "rewards/rejected": -3.3147010803222656, + "step": 1520 + }, + { + "epoch": 0.4, + "learning_rate": 3.748021075950633e-06, + "logits/chosen": -1.231894850730896, + "logits/rejected": -0.649477481842041, + "logps/chosen": -472.5684509277344, + "logps/rejected": -648.9671630859375, + "loss": 0.4403, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6376155614852905, + "rewards/margins": 1.3435395956039429, + "rewards/rejected": -2.9811549186706543, + "step": 1530 + }, + { + "epoch": 0.4, + "learning_rate": 3.7281747929685824e-06, + "logits/chosen": -0.8811131715774536, + "logits/rejected": -0.19349880516529083, + "logps/chosen": -556.5198364257812, + "logps/rejected": -868.0969848632812, + "loss": 0.3711, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.9225609302520752, + "rewards/margins": 2.121070384979248, + "rewards/rejected": -4.043631076812744, + "step": 1540 + }, + { + "epoch": 0.41, + "learning_rate": 3.7082259577447604e-06, + "logits/chosen": -1.3660091161727905, + "logits/rejected": -0.37334832549095154, + "logps/chosen": -508.5003356933594, + "logps/rejected": -700.1402587890625, + "loss": 0.4505, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6233818531036377, + "rewards/margins": 1.5405502319335938, + "rewards/rejected": -3.1639320850372314, + "step": 1550 + }, + { + "epoch": 0.41, + "learning_rate": 3.6881762360011688e-06, + "logits/chosen": -0.9465176463127136, + "logits/rejected": -0.1583428531885147, + "logps/chosen": -568.060302734375, + "logps/rejected": -805.0638427734375, + "loss": 0.3998, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.06211256980896, + "rewards/margins": 1.7412744760513306, + "rewards/rejected": -3.80338716506958, + "step": 1560 + }, + { + "epoch": 0.41, + "learning_rate": 3.668027301883802e-06, + "logits/chosen": -0.6530407667160034, + "logits/rejected": 0.17987249791622162, + "logps/chosen": -569.39013671875, + "logps/rejected": -819.4601440429688, + "loss": 0.3991, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.138000726699829, + "rewards/margins": 1.7371118068695068, + "rewards/rejected": -3.8751120567321777, + "step": 1570 + }, + { + "epoch": 0.41, + "learning_rate": 3.64778083782286e-06, + "logits/chosen": -1.1196366548538208, + "logits/rejected": 0.07997065782546997, + "logps/chosen": -588.8242797851562, + "logps/rejected": -888.84619140625, + "loss": 0.3689, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.009155750274658, + "rewards/margins": 2.2112624645233154, + "rewards/rejected": -4.220418453216553, + "step": 1580 + }, + { + "epoch": 0.42, + "learning_rate": 3.627438534392268e-06, + "logits/chosen": -1.2351372241973877, + "logits/rejected": -0.21363726258277893, + "logps/chosen": -565.1137084960938, + "logps/rejected": -799.2964477539062, + "loss": 0.4302, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9956506490707397, + "rewards/margins": 1.7617241144180298, + "rewards/rejected": -3.7573745250701904, + "step": 1590 + }, + { + "epoch": 0.42, + "learning_rate": 3.607002090168506e-06, + "logits/chosen": -1.1077851057052612, + "logits/rejected": -0.3146376609802246, + "logps/chosen": -519.68505859375, + "logps/rejected": -786.7335815429688, + "loss": 0.3644, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6840426921844482, + "rewards/margins": 1.7921777963638306, + "rewards/rejected": -3.4762203693389893, + "step": 1600 + }, + { + "epoch": 0.42, + "learning_rate": 3.586473211588787e-06, + "logits/chosen": -1.0645841360092163, + "logits/rejected": 0.0885920599102974, + "logps/chosen": -530.0689697265625, + "logps/rejected": -746.6349487304688, + "loss": 0.4293, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7966455221176147, + "rewards/margins": 1.5383833646774292, + "rewards/rejected": -3.335028886795044, + "step": 1610 + }, + { + "epoch": 0.42, + "learning_rate": 3.5658536128085623e-06, + "logits/chosen": -0.8512083292007446, + "logits/rejected": -0.07845296710729599, + "logps/chosen": -522.5213623046875, + "logps/rejected": -783.8287353515625, + "loss": 0.4465, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6679248809814453, + "rewards/margins": 1.7967729568481445, + "rewards/rejected": -3.464698076248169, + "step": 1620 + }, + { + "epoch": 0.43, + "learning_rate": 3.545145015558399e-06, + "logits/chosen": -1.1375799179077148, + "logits/rejected": -0.7033450603485107, + "logps/chosen": -488.27825927734375, + "logps/rejected": -743.138427734375, + "loss": 0.4717, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5062588453292847, + "rewards/margins": 1.479614019393921, + "rewards/rejected": -2.985872983932495, + "step": 1630 + }, + { + "epoch": 0.43, + "learning_rate": 3.5243491490002056e-06, + "logits/chosen": -1.122290849685669, + "logits/rejected": -0.7316358685493469, + "logps/chosen": -507.69036865234375, + "logps/rejected": -690.9684448242188, + "loss": 0.4651, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5151287317276, + "rewards/margins": 1.2003190517425537, + "rewards/rejected": -2.7154476642608643, + "step": 1640 + }, + { + "epoch": 0.43, + "learning_rate": 3.503467749582857e-06, + "logits/chosen": -1.7979711294174194, + "logits/rejected": -0.23376531898975372, + "logps/chosen": -447.5152282714844, + "logps/rejected": -644.4968872070312, + "loss": 0.4207, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3395702838897705, + "rewards/margins": 1.2873914241790771, + "rewards/rejected": -2.6269614696502686, + "step": 1650 + }, + { + "epoch": 0.43, + "learning_rate": 3.4825025608971947e-06, + "logits/chosen": -1.1949323415756226, + "logits/rejected": -0.4591255784034729, + "logps/chosen": -531.0167846679688, + "logps/rejected": -708.4334716796875, + "loss": 0.4157, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6559035778045654, + "rewards/margins": 1.3914124965667725, + "rewards/rejected": -3.047316074371338, + "step": 1660 + }, + { + "epoch": 0.44, + "learning_rate": 3.4614553335304407e-06, + "logits/chosen": -0.9112270474433899, + "logits/rejected": -0.7079882025718689, + "logps/chosen": -477.55645751953125, + "logps/rejected": -665.2545166015625, + "loss": 0.4351, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.638381004333496, + "rewards/margins": 0.9713879823684692, + "rewards/rejected": -2.609769105911255, + "step": 1670 + }, + { + "epoch": 0.44, + "learning_rate": 3.4403278249200222e-06, + "logits/chosen": -1.1047728061676025, + "logits/rejected": -0.209273099899292, + "logps/chosen": -513.626708984375, + "logps/rejected": -751.2394409179688, + "loss": 0.3962, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.6597833633422852, + "rewards/margins": 1.5487313270568848, + "rewards/rejected": -3.20851469039917, + "step": 1680 + }, + { + "epoch": 0.44, + "learning_rate": 3.4191217992068293e-06, + "logits/chosen": -1.1829806566238403, + "logits/rejected": 0.04266662523150444, + "logps/chosen": -560.150634765625, + "logps/rejected": -809.93896484375, + "loss": 0.4279, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8385143280029297, + "rewards/margins": 1.672101378440857, + "rewards/rejected": -3.510615825653076, + "step": 1690 + }, + { + "epoch": 0.44, + "learning_rate": 3.3978390270879056e-06, + "logits/chosen": -1.1551451683044434, + "logits/rejected": -0.16915690898895264, + "logps/chosen": -546.1964111328125, + "logps/rejected": -733.3271484375, + "loss": 0.4386, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6938936710357666, + "rewards/margins": 1.5033698081970215, + "rewards/rejected": -3.197263240814209, + "step": 1700 + }, + { + "epoch": 0.45, + "learning_rate": 3.3764812856685995e-06, + "logits/chosen": -1.0993391275405884, + "logits/rejected": -0.748005211353302, + "logps/chosen": -503.6968688964844, + "logps/rejected": -733.4053955078125, + "loss": 0.4426, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.7742856740951538, + "rewards/margins": 1.4196711778640747, + "rewards/rejected": -3.1939570903778076, + "step": 1710 + }, + { + "epoch": 0.45, + "learning_rate": 3.3550503583141726e-06, + "logits/chosen": -1.1645628213882446, + "logits/rejected": -0.29861804842948914, + "logps/chosen": -516.9429931640625, + "logps/rejected": -731.75634765625, + "loss": 0.4486, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7132374048233032, + "rewards/margins": 1.4840338230133057, + "rewards/rejected": -3.1972713470458984, + "step": 1720 + }, + { + "epoch": 0.45, + "learning_rate": 3.3335480345008907e-06, + "logits/chosen": -1.0770137310028076, + "logits/rejected": -0.3164665400981903, + "logps/chosen": -475.98345947265625, + "logps/rejected": -717.9310302734375, + "loss": 0.3996, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.5537554025650024, + "rewards/margins": 1.6571037769317627, + "rewards/rejected": -3.2108588218688965, + "step": 1730 + }, + { + "epoch": 0.46, + "learning_rate": 3.3119761096666055e-06, + "logits/chosen": -1.5323493480682373, + "logits/rejected": -0.1781812459230423, + "logps/chosen": -540.9231567382812, + "logps/rejected": -732.0032958984375, + "loss": 0.3972, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7525060176849365, + "rewards/margins": 1.5016968250274658, + "rewards/rejected": -3.254202365875244, + "step": 1740 + }, + { + "epoch": 0.46, + "learning_rate": 3.290336385060832e-06, + "logits/chosen": -1.1251745223999023, + "logits/rejected": -0.11602558940649033, + "logps/chosen": -634.4666748046875, + "logps/rejected": -833.8592529296875, + "loss": 0.3915, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.158566951751709, + "rewards/margins": 1.6248620748519897, + "rewards/rejected": -3.783428907394409, + "step": 1750 + }, + { + "epoch": 0.46, + "learning_rate": 3.268630667594348e-06, + "logits/chosen": -0.6574426889419556, + "logits/rejected": -0.43358176946640015, + "logps/chosen": -525.9124755859375, + "logps/rejected": -769.3137817382812, + "loss": 0.4214, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.048187017440796, + "rewards/margins": 1.6054658889770508, + "rewards/rejected": -3.6536529064178467, + "step": 1760 + }, + { + "epoch": 0.46, + "learning_rate": 3.2468607696883147e-06, + "logits/chosen": -1.0191020965576172, + "logits/rejected": -0.8884264230728149, + "logps/chosen": -542.0821533203125, + "logps/rejected": -717.0391845703125, + "loss": 0.439, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9124071598052979, + "rewards/margins": 1.0934618711471558, + "rewards/rejected": -3.0058693885803223, + "step": 1770 + }, + { + "epoch": 0.47, + "learning_rate": 3.225028509122944e-06, + "logits/chosen": -0.6006935834884644, + "logits/rejected": -0.7735892534255981, + "logps/chosen": -506.019287109375, + "logps/rejected": -766.4857177734375, + "loss": 0.4162, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.817831039428711, + "rewards/margins": 1.61776602268219, + "rewards/rejected": -3.4355969429016113, + "step": 1780 + }, + { + "epoch": 0.47, + "learning_rate": 3.2031357088857083e-06, + "logits/chosen": -1.139946699142456, + "logits/rejected": -0.35632461309432983, + "logps/chosen": -478.1253967285156, + "logps/rejected": -795.1430053710938, + "loss": 0.3478, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6329262256622314, + "rewards/margins": 2.2241363525390625, + "rewards/rejected": -3.857062816619873, + "step": 1790 + }, + { + "epoch": 0.47, + "learning_rate": 3.181184197019127e-06, + "logits/chosen": -1.2290761470794678, + "logits/rejected": -0.23889155685901642, + "logps/chosen": -565.7088623046875, + "logps/rejected": -842.8005981445312, + "loss": 0.409, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.107991933822632, + "rewards/margins": 2.049304485321045, + "rewards/rejected": -4.157296180725098, + "step": 1800 + }, + { + "epoch": 0.47, + "eval_logits/chosen": 0.5340744853019714, + "eval_logits/rejected": 1.4875802993774414, + "eval_logps/chosen": -561.5723266601562, + "eval_logps/rejected": -791.2130126953125, + "eval_loss": 0.43651697039604187, + "eval_rewards/accuracies": 0.7919999957084656, + "eval_rewards/chosen": -2.059115409851074, + "eval_rewards/margins": 1.6629897356033325, + "eval_rewards/rejected": -3.7221052646636963, + "eval_runtime": 1381.695, + "eval_samples_per_second": 1.447, + "eval_steps_per_second": 0.362, + "step": 1800 + }, + { + "epoch": 0.47, + "learning_rate": 3.159175806468126e-06, + "logits/chosen": -1.0473666191101074, + "logits/rejected": -0.06902176141738892, + "logps/chosen": -595.341552734375, + "logps/rejected": -831.9974365234375, + "loss": 0.4152, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.9551061391830444, + "rewards/margins": 1.7293351888656616, + "rewards/rejected": -3.684441328048706, + "step": 1810 + }, + { + "epoch": 0.48, + "learning_rate": 3.1371123749269804e-06, + "logits/chosen": -1.026745319366455, + "logits/rejected": -0.4049917757511139, + "logps/chosen": -565.2854614257812, + "logps/rejected": -764.7696533203125, + "loss": 0.4013, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9236133098602295, + "rewards/margins": 1.5329084396362305, + "rewards/rejected": -3.456521511077881, + "step": 1820 + }, + { + "epoch": 0.48, + "learning_rate": 3.114995744685877e-06, + "logits/chosen": -0.9977224469184875, + "logits/rejected": -0.40141773223876953, + "logps/chosen": -466.9925842285156, + "logps/rejected": -759.6148071289062, + "loss": 0.4686, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7426468133926392, + "rewards/margins": 1.7171356678009033, + "rewards/rejected": -3.459782361984253, + "step": 1830 + }, + { + "epoch": 0.48, + "learning_rate": 3.0928277624770743e-06, + "logits/chosen": -1.512317180633545, + "logits/rejected": -0.05047481134533882, + "logps/chosen": -557.8958129882812, + "logps/rejected": -753.0090942382812, + "loss": 0.4698, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.98238205909729, + "rewards/margins": 1.4256618022918701, + "rewards/rejected": -3.408043622970581, + "step": 1840 + }, + { + "epoch": 0.48, + "learning_rate": 3.070610279320708e-06, + "logits/chosen": -1.1659696102142334, + "logits/rejected": -0.22102081775665283, + "logps/chosen": -540.2110595703125, + "logps/rejected": -764.9635620117188, + "loss": 0.3463, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.8047651052474976, + "rewards/margins": 1.6794044971466064, + "rewards/rejected": -3.4841697216033936, + "step": 1850 + }, + { + "epoch": 0.49, + "learning_rate": 3.0483451503702264e-06, + "logits/chosen": -0.9429510831832886, + "logits/rejected": -0.374999463558197, + "logps/chosen": -633.7833862304688, + "logps/rejected": -881.3572387695312, + "loss": 0.4, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.2336530685424805, + "rewards/margins": 1.6140598058700562, + "rewards/rejected": -3.847712993621826, + "step": 1860 + }, + { + "epoch": 0.49, + "learning_rate": 3.0260342347574916e-06, + "logits/chosen": -0.9639546275138855, + "logits/rejected": -0.8156697154045105, + "logps/chosen": -493.8570861816406, + "logps/rejected": -728.9312744140625, + "loss": 0.3857, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7304465770721436, + "rewards/margins": 1.2347511053085327, + "rewards/rejected": -2.965198040008545, + "step": 1870 + }, + { + "epoch": 0.49, + "learning_rate": 3.0036793954375358e-06, + "logits/chosen": -1.5436639785766602, + "logits/rejected": 0.08726786822080612, + "logps/chosen": -512.5186157226562, + "logps/rejected": -664.79296875, + "loss": 0.4216, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7313350439071655, + "rewards/margins": 1.4134327173233032, + "rewards/rejected": -3.144767999649048, + "step": 1880 + }, + { + "epoch": 0.49, + "learning_rate": 2.981282499033009e-06, + "logits/chosen": -0.6750370264053345, + "logits/rejected": -0.8198210597038269, + "logps/chosen": -542.04052734375, + "logps/rejected": -810.6235961914062, + "loss": 0.4386, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.0354888439178467, + "rewards/margins": 1.7842447757720947, + "rewards/rejected": -3.8197338581085205, + "step": 1890 + }, + { + "epoch": 0.5, + "learning_rate": 2.9588454156783163e-06, + "logits/chosen": -1.0820724964141846, + "logits/rejected": -0.7213941812515259, + "logps/chosen": -541.3480224609375, + "logps/rejected": -766.2906494140625, + "loss": 0.4257, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.084608554840088, + "rewards/margins": 1.3346532583236694, + "rewards/rejected": -3.4192614555358887, + "step": 1900 + }, + { + "epoch": 0.5, + "learning_rate": 2.9363700188634597e-06, + "logits/chosen": -1.2453614473342896, + "logits/rejected": -0.5085484385490417, + "logps/chosen": -540.5175170898438, + "logps/rejected": -765.7025756835938, + "loss": 0.4321, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.895167589187622, + "rewards/margins": 1.5056030750274658, + "rewards/rejected": -3.4007697105407715, + "step": 1910 + }, + { + "epoch": 0.5, + "learning_rate": 2.9138581852776053e-06, + "logits/chosen": -1.1196304559707642, + "logits/rejected": 0.026494156569242477, + "logps/chosen": -559.1360473632812, + "logps/rejected": -778.654296875, + "loss": 0.4461, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9397118091583252, + "rewards/margins": 1.5184619426727295, + "rewards/rejected": -3.4581737518310547, + "step": 1920 + }, + { + "epoch": 0.51, + "learning_rate": 2.8913117946523805e-06, + "logits/chosen": -1.2101647853851318, + "logits/rejected": 0.05119786784052849, + "logps/chosen": -497.8490295410156, + "logps/rejected": -711.6702270507812, + "loss": 0.4391, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.8607673645019531, + "rewards/margins": 1.6270701885223389, + "rewards/rejected": -3.487837314605713, + "step": 1930 + }, + { + "epoch": 0.51, + "learning_rate": 2.8687327296049126e-06, + "logits/chosen": -0.9110754132270813, + "logits/rejected": -0.3987257182598114, + "logps/chosen": -488.52056884765625, + "logps/rejected": -814.98388671875, + "loss": 0.385, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.5353851318359375, + "rewards/margins": 2.019871711730957, + "rewards/rejected": -3.5552570819854736, + "step": 1940 + }, + { + "epoch": 0.51, + "learning_rate": 2.8461228754806376e-06, + "logits/chosen": -1.2645906209945679, + "logits/rejected": 0.05890879034996033, + "logps/chosen": -571.50146484375, + "logps/rejected": -764.4017944335938, + "loss": 0.3804, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.847496747970581, + "rewards/margins": 1.488777995109558, + "rewards/rejected": -3.3362746238708496, + "step": 1950 + }, + { + "epoch": 0.51, + "learning_rate": 2.823484120195865e-06, + "logits/chosen": -1.035412311553955, + "logits/rejected": 0.14751215279102325, + "logps/chosen": -568.7293090820312, + "logps/rejected": -764.1280517578125, + "loss": 0.3716, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9270780086517334, + "rewards/margins": 1.58747398853302, + "rewards/rejected": -3.514551877975464, + "step": 1960 + }, + { + "epoch": 0.52, + "learning_rate": 2.8008183540801486e-06, + "logits/chosen": -1.152499794960022, + "logits/rejected": -0.26493799686431885, + "logps/chosen": -610.4190063476562, + "logps/rejected": -777.2071533203125, + "loss": 0.3893, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9002234935760498, + "rewards/margins": 1.6286661624908447, + "rewards/rejected": -3.5288894176483154, + "step": 1970 + }, + { + "epoch": 0.52, + "learning_rate": 2.7781274697184353e-06, + "logits/chosen": -1.2689648866653442, + "logits/rejected": 0.2965567111968994, + "logps/chosen": -519.8406982421875, + "logps/rejected": -776.2745971679688, + "loss": 0.451, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7539297342300415, + "rewards/margins": 1.7792565822601318, + "rewards/rejected": -3.533186435699463, + "step": 1980 + }, + { + "epoch": 0.52, + "learning_rate": 2.7554133617930397e-06, + "logits/chosen": -1.1502325534820557, + "logits/rejected": -0.16516944766044617, + "logps/chosen": -586.9942626953125, + "logps/rejected": -799.5856323242188, + "loss": 0.3986, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9755744934082031, + "rewards/margins": 1.5421812534332275, + "rewards/rejected": -3.5177555084228516, + "step": 1990 + }, + { + "epoch": 0.52, + "learning_rate": 2.7326779269254363e-06, + "logits/chosen": -1.2239593267440796, + "logits/rejected": -0.29785847663879395, + "logps/chosen": -577.9757080078125, + "logps/rejected": -780.0157470703125, + "loss": 0.4163, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8510382175445557, + "rewards/margins": 1.496047019958496, + "rewards/rejected": -3.3470852375030518, + "step": 2000 + }, + { + "epoch": 0.53, + "learning_rate": 2.7099230635178954e-06, + "logits/chosen": -0.8359963297843933, + "logits/rejected": -0.609653115272522, + "logps/chosen": -522.3953857421875, + "logps/rejected": -836.3997192382812, + "loss": 0.3754, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8302185535430908, + "rewards/margins": 1.9239788055419922, + "rewards/rejected": -3.754197359085083, + "step": 2010 + }, + { + "epoch": 0.53, + "learning_rate": 2.6871506715949608e-06, + "logits/chosen": -1.2713494300842285, + "logits/rejected": 0.4311772286891937, + "logps/chosen": -510.71942138671875, + "logps/rejected": -779.8484497070312, + "loss": 0.2961, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6959383487701416, + "rewards/margins": 1.9984363317489624, + "rewards/rejected": -3.6943740844726562, + "step": 2020 + }, + { + "epoch": 0.53, + "learning_rate": 2.6643626526448063e-06, + "logits/chosen": -0.745600700378418, + "logits/rejected": -0.04406242445111275, + "logps/chosen": -545.650390625, + "logps/rejected": -726.6231689453125, + "loss": 0.4021, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.117701768875122, + "rewards/margins": 1.459877848625183, + "rewards/rejected": -3.5775794982910156, + "step": 2030 + }, + { + "epoch": 0.53, + "learning_rate": 2.6415609094604562e-06, + "logits/chosen": -0.6423822641372681, + "logits/rejected": -0.5382918119430542, + "logps/chosen": -555.1259765625, + "logps/rejected": -854.6931762695312, + "loss": 0.4133, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.1485462188720703, + "rewards/margins": 1.8231878280639648, + "rewards/rejected": -3.9717342853546143, + "step": 2040 + }, + { + "epoch": 0.54, + "learning_rate": 2.618747345980904e-06, + "logits/chosen": -0.942598819732666, + "logits/rejected": -0.03942962735891342, + "logps/chosen": -508.3501892089844, + "logps/rejected": -809.349365234375, + "loss": 0.3591, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.922590970993042, + "rewards/margins": 2.0935001373291016, + "rewards/rejected": -4.016091346740723, + "step": 2050 + }, + { + "epoch": 0.54, + "learning_rate": 2.595923867132136e-06, + "logits/chosen": -0.9166833162307739, + "logits/rejected": 0.08796543627977371, + "logps/chosen": -561.3500366210938, + "logps/rejected": -813.5736083984375, + "loss": 0.3879, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1458635330200195, + "rewards/margins": 1.7413724660873413, + "rewards/rejected": -3.887235641479492, + "step": 2060 + }, + { + "epoch": 0.54, + "learning_rate": 2.5730923786680672e-06, + "logits/chosen": -0.8908794522285461, + "logits/rejected": 0.45012766122817993, + "logps/chosen": -528.106689453125, + "logps/rejected": -768.5990600585938, + "loss": 0.3902, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.0598998069763184, + "rewards/margins": 1.840790033340454, + "rewards/rejected": -3.9006900787353516, + "step": 2070 + }, + { + "epoch": 0.54, + "learning_rate": 2.5502547870114137e-06, + "logits/chosen": -0.44030576944351196, + "logits/rejected": 0.27115920186042786, + "logps/chosen": -541.02392578125, + "logps/rejected": -866.5523681640625, + "loss": 0.4173, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.1856839656829834, + "rewards/margins": 2.092510223388672, + "rewards/rejected": -4.278193473815918, + "step": 2080 + }, + { + "epoch": 0.55, + "learning_rate": 2.527412999094507e-06, + "logits/chosen": -0.960748553276062, + "logits/rejected": 0.4776690602302551, + "logps/chosen": -641.7643432617188, + "logps/rejected": -842.5614013671875, + "loss": 0.4317, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.455843448638916, + "rewards/margins": 1.5613322257995605, + "rewards/rejected": -4.017176151275635, + "step": 2090 + }, + { + "epoch": 0.55, + "learning_rate": 2.504568922200064e-06, + "logits/chosen": -1.032212495803833, + "logits/rejected": 0.38921135663986206, + "logps/chosen": -526.1879272460938, + "logps/rejected": -780.3800659179688, + "loss": 0.4037, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8752429485321045, + "rewards/margins": 2.1657333374023438, + "rewards/rejected": -4.040976524353027, + "step": 2100 + }, + { + "epoch": 0.55, + "eval_logits/chosen": 0.9488687515258789, + "eval_logits/rejected": 1.9484919309616089, + "eval_logps/chosen": -568.4110107421875, + "eval_logps/rejected": -807.3529052734375, + "eval_loss": 0.43342798948287964, + "eval_rewards/accuracies": 0.796999990940094, + "eval_rewards/chosen": -2.127501964569092, + "eval_rewards/margins": 1.7560021877288818, + "eval_rewards/rejected": -3.8835039138793945, + "eval_runtime": 1369.4586, + "eval_samples_per_second": 1.46, + "eval_steps_per_second": 0.365, + "step": 2100 + }, + { + "epoch": 0.55, + "learning_rate": 2.4817244638019333e-06, + "logits/chosen": -0.7853974103927612, + "logits/rejected": 0.26276087760925293, + "logps/chosen": -576.4677734375, + "logps/rejected": -882.1696166992188, + "loss": 0.4205, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.413973331451416, + "rewards/margins": 1.9889461994171143, + "rewards/rejected": -4.402919292449951, + "step": 2110 + }, + { + "epoch": 0.55, + "learning_rate": 2.4588815314058155e-06, + "logits/chosen": -0.7402883768081665, + "logits/rejected": 0.13213138282299042, + "logps/chosen": -515.807373046875, + "logps/rejected": -840.2550048828125, + "loss": 0.4364, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9831836223602295, + "rewards/margins": 2.3909099102020264, + "rewards/rejected": -4.374093532562256, + "step": 2120 + }, + { + "epoch": 0.56, + "learning_rate": 2.4360420323899922e-06, + "logits/chosen": -0.9780701398849487, + "logits/rejected": -0.47581759095191956, + "logps/chosen": -536.0255126953125, + "logps/rejected": -821.7635498046875, + "loss": 0.4016, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8594684600830078, + "rewards/margins": 1.7201998233795166, + "rewards/rejected": -3.5796680450439453, + "step": 2130 + }, + { + "epoch": 0.56, + "learning_rate": 2.4132078738460585e-06, + "logits/chosen": -0.7851123809814453, + "logits/rejected": -0.3619709610939026, + "logps/chosen": -536.1060791015625, + "logps/rejected": -812.8576049804688, + "loss": 0.3879, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.774129867553711, + "rewards/margins": 1.8565963506698608, + "rewards/rejected": -3.6307265758514404, + "step": 2140 + }, + { + "epoch": 0.56, + "learning_rate": 2.3903809624196826e-06, + "logits/chosen": -1.0001862049102783, + "logits/rejected": -0.3588159680366516, + "logps/chosen": -524.4048461914062, + "logps/rejected": -757.30224609375, + "loss": 0.4045, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6777470111846924, + "rewards/margins": 1.4243619441986084, + "rewards/rejected": -3.10210919380188, + "step": 2150 + }, + { + "epoch": 0.57, + "learning_rate": 2.3675632041513978e-06, + "logits/chosen": -0.8742032051086426, + "logits/rejected": 0.1250528246164322, + "logps/chosen": -591.0662841796875, + "logps/rejected": -873.0941162109375, + "loss": 0.3503, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.1168835163116455, + "rewards/margins": 1.8603336811065674, + "rewards/rejected": -3.9772167205810547, + "step": 2160 + }, + { + "epoch": 0.57, + "learning_rate": 2.3447565043174533e-06, + "logits/chosen": -0.9236310124397278, + "logits/rejected": -0.3211767077445984, + "logps/chosen": -604.6006469726562, + "logps/rejected": -848.4435424804688, + "loss": 0.3555, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.398642063140869, + "rewards/margins": 1.6593701839447021, + "rewards/rejected": -4.058012962341309, + "step": 2170 + }, + { + "epoch": 0.57, + "learning_rate": 2.321962767270724e-06, + "logits/chosen": -0.4545938968658447, + "logits/rejected": -0.04614262655377388, + "logps/chosen": -522.8587646484375, + "logps/rejected": -840.7001953125, + "loss": 0.424, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.117671012878418, + "rewards/margins": 2.160691499710083, + "rewards/rejected": -4.278363227844238, + "step": 2180 + }, + { + "epoch": 0.57, + "learning_rate": 2.299183896281692e-06, + "logits/chosen": -0.8900741338729858, + "logits/rejected": 0.11245179176330566, + "logps/chosen": -621.9942626953125, + "logps/rejected": -912.0670776367188, + "loss": 0.3948, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.5180599689483643, + "rewards/margins": 2.2509348392486572, + "rewards/rejected": -4.768994331359863, + "step": 2190 + }, + { + "epoch": 0.58, + "learning_rate": 2.2764217933795297e-06, + "logits/chosen": -1.2973425388336182, + "logits/rejected": 0.3502582907676697, + "logps/chosen": -648.1968994140625, + "logps/rejected": -849.2420043945312, + "loss": 0.4569, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5464279651641846, + "rewards/margins": 1.8135731220245361, + "rewards/rejected": -4.3600006103515625, + "step": 2200 + }, + { + "epoch": 0.58, + "learning_rate": 2.2536783591932786e-06, + "logits/chosen": -1.0717593431472778, + "logits/rejected": -0.04763598367571831, + "logps/chosen": -564.8670654296875, + "logps/rejected": -862.4423828125, + "loss": 0.383, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1421093940734863, + "rewards/margins": 2.115607976913452, + "rewards/rejected": -4.257718086242676, + "step": 2210 + }, + { + "epoch": 0.58, + "learning_rate": 2.230955492793149e-06, + "logits/chosen": -0.6327053308486938, + "logits/rejected": -0.22368088364601135, + "logps/chosen": -552.7775268554688, + "logps/rejected": -753.1697998046875, + "loss": 0.4299, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.939135193824768, + "rewards/margins": 1.3090981245040894, + "rewards/rejected": -3.2482333183288574, + "step": 2220 + }, + { + "epoch": 0.58, + "learning_rate": 2.208255091531947e-06, + "logits/chosen": -0.9401483535766602, + "logits/rejected": 0.13758346438407898, + "logps/chosen": -531.6614990234375, + "logps/rejected": -746.9278564453125, + "loss": 0.4358, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8913850784301758, + "rewards/margins": 1.5273140668869019, + "rewards/rejected": -3.418699264526367, + "step": 2230 + }, + { + "epoch": 0.59, + "learning_rate": 2.1855790508866435e-06, + "logits/chosen": -1.4784080982208252, + "logits/rejected": 0.4357272982597351, + "logps/chosen": -597.9056396484375, + "logps/rejected": -766.9682006835938, + "loss": 0.4395, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0671706199645996, + "rewards/margins": 1.5233346223831177, + "rewards/rejected": -3.5905051231384277, + "step": 2240 + }, + { + "epoch": 0.59, + "learning_rate": 2.162929264300107e-06, + "logits/chosen": -1.0196201801300049, + "logits/rejected": 0.6838423013687134, + "logps/chosen": -581.2928466796875, + "logps/rejected": -801.315185546875, + "loss": 0.4091, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.0828678607940674, + "rewards/margins": 1.6539685726165771, + "rewards/rejected": -3.7368361949920654, + "step": 2250 + }, + { + "epoch": 0.59, + "learning_rate": 2.1403076230230006e-06, + "logits/chosen": -0.6144061088562012, + "logits/rejected": -0.3462333083152771, + "logps/chosen": -555.2684326171875, + "logps/rejected": -745.2354736328125, + "loss": 0.4423, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8994195461273193, + "rewards/margins": 1.2702502012252808, + "rewards/rejected": -3.1696696281433105, + "step": 2260 + }, + { + "epoch": 0.59, + "learning_rate": 2.11771601595586e-06, + "logits/chosen": -1.3248759508132935, + "logits/rejected": -0.06512956321239471, + "logps/chosen": -526.8836669921875, + "logps/rejected": -727.4069213867188, + "loss": 0.4316, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.7024120092391968, + "rewards/margins": 1.4804304838180542, + "rewards/rejected": -3.182842493057251, + "step": 2270 + }, + { + "epoch": 0.6, + "learning_rate": 2.0951563294913737e-06, + "logits/chosen": -0.8413470387458801, + "logits/rejected": -0.06420852988958359, + "logps/chosen": -514.109375, + "logps/rejected": -770.6260986328125, + "loss": 0.3656, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8018481731414795, + "rewards/margins": 1.71941339969635, + "rewards/rejected": -3.5212619304656982, + "step": 2280 + }, + { + "epoch": 0.6, + "learning_rate": 2.0726304473568693e-06, + "logits/chosen": -1.5680863857269287, + "logits/rejected": 0.14398300647735596, + "logps/chosen": -554.5525512695312, + "logps/rejected": -771.2307739257812, + "loss": 0.4331, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7925901412963867, + "rewards/margins": 1.641396164894104, + "rewards/rejected": -3.433986186981201, + "step": 2290 + }, + { + "epoch": 0.6, + "learning_rate": 2.050140250457023e-06, + "logits/chosen": -1.147033452987671, + "logits/rejected": -0.169643372297287, + "logps/chosen": -544.5403442382812, + "logps/rejected": -771.8599243164062, + "loss": 0.411, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.702807068824768, + "rewards/margins": 1.5390836000442505, + "rewards/rejected": -3.2418906688690186, + "step": 2300 + }, + { + "epoch": 0.6, + "learning_rate": 2.0276876167168042e-06, + "logits/chosen": -1.0450040102005005, + "logits/rejected": -0.5411997437477112, + "logps/chosen": -464.33636474609375, + "logps/rejected": -644.3059692382812, + "loss": 0.4458, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.661592721939087, + "rewards/margins": 1.0601894855499268, + "rewards/rejected": -2.7217824459075928, + "step": 2310 + }, + { + "epoch": 0.61, + "learning_rate": 2.0052744209246682e-06, + "logits/chosen": -0.9377092123031616, + "logits/rejected": -0.40263956785202026, + "logps/chosen": -498.0686950683594, + "logps/rejected": -739.6976318359375, + "loss": 0.4114, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7086822986602783, + "rewards/margins": 1.4424405097961426, + "rewards/rejected": -3.151122570037842, + "step": 2320 + }, + { + "epoch": 0.61, + "learning_rate": 1.9829025345760127e-06, + "logits/chosen": -0.7755551338195801, + "logits/rejected": -0.34596508741378784, + "logps/chosen": -440.62872314453125, + "logps/rejected": -650.15478515625, + "loss": 0.3539, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.3974610567092896, + "rewards/margins": 1.319804310798645, + "rewards/rejected": -2.7172653675079346, + "step": 2330 + }, + { + "epoch": 0.61, + "learning_rate": 1.9605738257169115e-06, + "logits/chosen": -1.3644423484802246, + "logits/rejected": -0.3873172700405121, + "logps/chosen": -490.96466064453125, + "logps/rejected": -707.909423828125, + "loss": 0.3942, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6896295547485352, + "rewards/margins": 1.3346831798553467, + "rewards/rejected": -3.0243124961853027, + "step": 2340 + }, + { + "epoch": 0.62, + "learning_rate": 1.9382901587881275e-06, + "logits/chosen": -1.3390328884124756, + "logits/rejected": -0.3258362114429474, + "logps/chosen": -560.3353271484375, + "logps/rejected": -710.94091796875, + "loss": 0.4555, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.788905143737793, + "rewards/margins": 1.1810920238494873, + "rewards/rejected": -2.9699971675872803, + "step": 2350 + }, + { + "epoch": 0.62, + "learning_rate": 1.916053394469437e-06, + "logits/chosen": -1.2866407632827759, + "logits/rejected": -0.09522955119609833, + "logps/chosen": -534.5072021484375, + "logps/rejected": -711.5755004882812, + "loss": 0.3582, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.719448447227478, + "rewards/margins": 1.4744646549224854, + "rewards/rejected": -3.193912982940674, + "step": 2360 + }, + { + "epoch": 0.62, + "learning_rate": 1.8938653895242604e-06, + "logits/chosen": -1.3210828304290771, + "logits/rejected": 0.18320707976818085, + "logps/chosen": -570.3659057617188, + "logps/rejected": -748.9262084960938, + "loss": 0.38, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9527915716171265, + "rewards/margins": 1.4824120998382568, + "rewards/rejected": -3.4352035522460938, + "step": 2370 + }, + { + "epoch": 0.62, + "learning_rate": 1.8717279966446267e-06, + "logits/chosen": -0.6077791452407837, + "logits/rejected": -0.07136712223291397, + "logps/chosen": -561.3985595703125, + "logps/rejected": -833.6058349609375, + "loss": 0.4602, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8687776327133179, + "rewards/margins": 2.1088478565216064, + "rewards/rejected": -3.9776253700256348, + "step": 2380 + }, + { + "epoch": 0.63, + "learning_rate": 1.8496430642964698e-06, + "logits/chosen": -0.6436141133308411, + "logits/rejected": -0.7390815615653992, + "logps/chosen": -535.98291015625, + "logps/rejected": -785.1193237304688, + "loss": 0.4384, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8680446147918701, + "rewards/margins": 1.4708973169326782, + "rewards/rejected": -3.338942050933838, + "step": 2390 + }, + { + "epoch": 0.63, + "learning_rate": 1.827612436565286e-06, + "logits/chosen": -0.7472543716430664, + "logits/rejected": -0.552331805229187, + "logps/chosen": -512.9714965820312, + "logps/rejected": -776.1787109375, + "loss": 0.3829, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.701409101486206, + "rewards/margins": 1.6592018604278564, + "rewards/rejected": -3.3606104850769043, + "step": 2400 + }, + { + "epoch": 0.63, + "eval_logits/chosen": 0.5047381520271301, + "eval_logits/rejected": 1.5421233177185059, + "eval_logps/chosen": -543.5669555664062, + "eval_logps/rejected": -768.0193481445312, + "eval_loss": 0.4248420000076294, + "eval_rewards/accuracies": 0.8009999990463257, + "eval_rewards/chosen": -1.879061222076416, + "eval_rewards/margins": 1.611107587814331, + "eval_rewards/rejected": -3.490169048309326, + "eval_runtime": 1378.4799, + "eval_samples_per_second": 1.451, + "eval_steps_per_second": 0.363, + "step": 2400 + }, + { + "epoch": 0.63, + "learning_rate": 1.8056379530021492e-06, + "logits/chosen": -1.644568681716919, + "logits/rejected": 0.2553193271160126, + "logps/chosen": -535.88134765625, + "logps/rejected": -778.2227783203125, + "loss": 0.3769, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.729379415512085, + "rewards/margins": 1.8933357000350952, + "rewards/rejected": -3.6227145195007324, + "step": 2410 + }, + { + "epoch": 0.63, + "learning_rate": 1.7837214484701154e-06, + "logits/chosen": -1.0312998294830322, + "logits/rejected": -0.9269927144050598, + "logps/chosen": -492.1142578125, + "logps/rejected": -759.744140625, + "loss": 0.3937, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.6029274463653564, + "rewards/margins": 1.807284951210022, + "rewards/rejected": -3.410212278366089, + "step": 2420 + }, + { + "epoch": 0.64, + "learning_rate": 1.7618647529910043e-06, + "logits/chosen": -1.2176530361175537, + "logits/rejected": 0.223758265376091, + "logps/chosen": -529.0576171875, + "logps/rejected": -756.2348022460938, + "loss": 0.4664, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.851143479347229, + "rewards/margins": 1.5376180410385132, + "rewards/rejected": -3.388761520385742, + "step": 2430 + }, + { + "epoch": 0.64, + "learning_rate": 1.7400696915925996e-06, + "logits/chosen": -1.2333937883377075, + "logits/rejected": -0.22815477848052979, + "logps/chosen": -530.9912719726562, + "logps/rejected": -742.2661743164062, + "loss": 0.396, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.743680715560913, + "rewards/margins": 1.500349521636963, + "rewards/rejected": -3.244030475616455, + "step": 2440 + }, + { + "epoch": 0.64, + "learning_rate": 1.718338084156254e-06, + "logits/chosen": -1.519798994064331, + "logits/rejected": -0.11792447417974472, + "logps/chosen": -558.024658203125, + "logps/rejected": -786.2930908203125, + "loss": 0.4233, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6490389108657837, + "rewards/margins": 1.6921495199203491, + "rewards/rejected": -3.341188430786133, + "step": 2450 + }, + { + "epoch": 0.64, + "learning_rate": 1.6966717452649372e-06, + "logits/chosen": -1.2660577297210693, + "logits/rejected": -0.0037010847590863705, + "logps/chosen": -472.52471923828125, + "logps/rejected": -651.4722900390625, + "loss": 0.4358, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6838678121566772, + "rewards/margins": 1.2879998683929443, + "rewards/rejected": -2.971867561340332, + "step": 2460 + }, + { + "epoch": 0.65, + "learning_rate": 1.6750724840517103e-06, + "logits/chosen": -1.2322640419006348, + "logits/rejected": -0.4316504895687103, + "logps/chosen": -495.26544189453125, + "logps/rejected": -720.3032836914062, + "loss": 0.4283, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5014667510986328, + "rewards/margins": 1.5642062425613403, + "rewards/rejected": -3.0656726360321045, + "step": 2470 + }, + { + "epoch": 0.65, + "learning_rate": 1.6535421040486686e-06, + "logits/chosen": -0.9225546717643738, + "logits/rejected": 0.307370662689209, + "logps/chosen": -537.8445434570312, + "logps/rejected": -764.1954345703125, + "loss": 0.4295, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8402236700057983, + "rewards/margins": 1.5403947830200195, + "rewards/rejected": -3.3806185722351074, + "step": 2480 + }, + { + "epoch": 0.65, + "learning_rate": 1.6320824030363458e-06, + "logits/chosen": -0.8234320878982544, + "logits/rejected": -0.4902985095977783, + "logps/chosen": -521.9378662109375, + "logps/rejected": -800.2440185546875, + "loss": 0.3775, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6927446126937866, + "rewards/margins": 1.8545596599578857, + "rewards/rejected": -3.547304630279541, + "step": 2490 + }, + { + "epoch": 0.65, + "learning_rate": 1.6106951728936028e-06, + "logits/chosen": -0.9694339036941528, + "logits/rejected": -0.04356659576296806, + "logps/chosen": -550.1474609375, + "logps/rejected": -714.2723999023438, + "loss": 0.3934, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6208375692367554, + "rewards/margins": 1.5635387897491455, + "rewards/rejected": -3.1843764781951904, + "step": 2500 + }, + { + "epoch": 0.66, + "learning_rate": 1.5893821994479996e-06, + "logits/chosen": -1.3845258951187134, + "logits/rejected": 0.13586857914924622, + "logps/chosen": -514.47509765625, + "logps/rejected": -733.2557373046875, + "loss": 0.4431, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7695598602294922, + "rewards/margins": 1.6184390783309937, + "rewards/rejected": -3.3879990577697754, + "step": 2510 + }, + { + "epoch": 0.66, + "learning_rate": 1.5681452623266868e-06, + "logits/chosen": -0.6911166310310364, + "logits/rejected": -0.19654271006584167, + "logps/chosen": -473.919189453125, + "logps/rejected": -720.5162963867188, + "loss": 0.4099, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6180083751678467, + "rewards/margins": 1.4277281761169434, + "rewards/rejected": -3.04573655128479, + "step": 2520 + }, + { + "epoch": 0.66, + "learning_rate": 1.5469861348078014e-06, + "logits/chosen": -1.3716986179351807, + "logits/rejected": -0.06388586759567261, + "logps/chosen": -564.1226196289062, + "logps/rejected": -798.0968017578125, + "loss": 0.4469, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9020678997039795, + "rewards/margins": 1.5944623947143555, + "rewards/rejected": -3.496530532836914, + "step": 2530 + }, + { + "epoch": 0.66, + "learning_rate": 1.5259065836724035e-06, + "logits/chosen": -0.8546286821365356, + "logits/rejected": -0.3107864260673523, + "logps/chosen": -550.43017578125, + "logps/rejected": -754.4808349609375, + "loss": 0.3968, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.006619453430176, + "rewards/margins": 1.325626015663147, + "rewards/rejected": -3.3322455883026123, + "step": 2540 + }, + { + "epoch": 0.67, + "learning_rate": 1.5049083690569456e-06, + "logits/chosen": -1.1054461002349854, + "logits/rejected": -0.2836257517337799, + "logps/chosen": -498.7848205566406, + "logps/rejected": -744.7251586914062, + "loss": 0.4576, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7602741718292236, + "rewards/margins": 1.6243937015533447, + "rewards/rejected": -3.3846676349639893, + "step": 2550 + }, + { + "epoch": 0.67, + "learning_rate": 1.4839932443063057e-06, + "logits/chosen": -1.052145004272461, + "logits/rejected": -0.4817884564399719, + "logps/chosen": -500.67022705078125, + "logps/rejected": -772.7791748046875, + "loss": 0.4111, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7564786672592163, + "rewards/margins": 1.9239161014556885, + "rewards/rejected": -3.6803946495056152, + "step": 2560 + }, + { + "epoch": 0.67, + "learning_rate": 1.4631629558273803e-06, + "logits/chosen": -1.1972812414169312, + "logits/rejected": -0.44412803649902344, + "logps/chosen": -612.2957153320312, + "logps/rejected": -818.8905029296875, + "loss": 0.4121, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9536006450653076, + "rewards/margins": 1.6227566003799438, + "rewards/rejected": -3.576357364654541, + "step": 2570 + }, + { + "epoch": 0.68, + "learning_rate": 1.4424192429432657e-06, + "logits/chosen": -1.1940516233444214, + "logits/rejected": -0.6190989017486572, + "logps/chosen": -545.1439208984375, + "logps/rejected": -734.8392333984375, + "loss": 0.5085, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8257439136505127, + "rewards/margins": 1.1313966512680054, + "rewards/rejected": -2.9571404457092285, + "step": 2580 + }, + { + "epoch": 0.68, + "learning_rate": 1.421763837748016e-06, + "logits/chosen": -0.8670721054077148, + "logits/rejected": 0.19919352233409882, + "logps/chosen": -536.2199096679688, + "logps/rejected": -726.8831787109375, + "loss": 0.3427, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.5812265872955322, + "rewards/margins": 1.5793380737304688, + "rewards/rejected": -3.160564422607422, + "step": 2590 + }, + { + "epoch": 0.68, + "learning_rate": 1.401198464962021e-06, + "logits/chosen": -0.7592190504074097, + "logits/rejected": -0.6002682447433472, + "logps/chosen": -536.698974609375, + "logps/rejected": -750.5850219726562, + "loss": 0.4207, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7439777851104736, + "rewards/margins": 1.444570779800415, + "rewards/rejected": -3.1885488033294678, + "step": 2600 + }, + { + "epoch": 0.68, + "learning_rate": 1.3807248417879896e-06, + "logits/chosen": -0.9805696606636047, + "logits/rejected": -0.1842125803232193, + "logps/chosen": -496.516845703125, + "logps/rejected": -714.7666015625, + "loss": 0.4135, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.775261640548706, + "rewards/margins": 1.5725538730621338, + "rewards/rejected": -3.347815752029419, + "step": 2610 + }, + { + "epoch": 0.69, + "learning_rate": 1.3603446777675665e-06, + "logits/chosen": -1.047123670578003, + "logits/rejected": -0.30356377363204956, + "logps/chosen": -510.2223205566406, + "logps/rejected": -704.94091796875, + "loss": 0.4062, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7939850091934204, + "rewards/margins": 1.3114802837371826, + "rewards/rejected": -3.1054649353027344, + "step": 2620 + }, + { + "epoch": 0.69, + "learning_rate": 1.3400596746385817e-06, + "logits/chosen": -1.089388370513916, + "logits/rejected": -0.30947428941726685, + "logps/chosen": -532.441650390625, + "logps/rejected": -787.63720703125, + "loss": 0.3845, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7372934818267822, + "rewards/margins": 1.8088710308074951, + "rewards/rejected": -3.5461642742156982, + "step": 2630 + }, + { + "epoch": 0.69, + "learning_rate": 1.3198715261929587e-06, + "logits/chosen": -0.878930389881134, + "logits/rejected": -0.4860079884529114, + "logps/chosen": -504.37701416015625, + "logps/rejected": -719.3287963867188, + "loss": 0.4429, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7471263408660889, + "rewards/margins": 1.4954102039337158, + "rewards/rejected": -3.2425365447998047, + "step": 2640 + }, + { + "epoch": 0.69, + "learning_rate": 1.2997819181352823e-06, + "logits/chosen": -1.1908671855926514, + "logits/rejected": 0.06708762049674988, + "logps/chosen": -486.3038024902344, + "logps/rejected": -774.8641357421875, + "loss": 0.3094, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.6818081140518188, + "rewards/margins": 2.000152587890625, + "rewards/rejected": -3.6819605827331543, + "step": 2650 + }, + { + "epoch": 0.7, + "learning_rate": 1.2797925279420454e-06, + "logits/chosen": -0.9907468557357788, + "logits/rejected": -0.3420366942882538, + "logps/chosen": -546.3676147460938, + "logps/rejected": -735.4510498046875, + "loss": 0.3966, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9783893823623657, + "rewards/margins": 1.4355647563934326, + "rewards/rejected": -3.413954257965088, + "step": 2660 + }, + { + "epoch": 0.7, + "learning_rate": 1.2599050247215764e-06, + "logits/chosen": -1.4948866367340088, + "logits/rejected": 0.49219974875450134, + "logps/chosen": -573.19140625, + "logps/rejected": -743.2158203125, + "loss": 0.4396, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.086617946624756, + "rewards/margins": 1.3520699739456177, + "rewards/rejected": -3.438687801361084, + "step": 2670 + }, + { + "epoch": 0.7, + "learning_rate": 1.2401210690746705e-06, + "logits/chosen": -0.823261559009552, + "logits/rejected": -0.15638458728790283, + "logps/chosen": -535.7666625976562, + "logps/rejected": -808.8283081054688, + "loss": 0.4014, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9818382263183594, + "rewards/margins": 1.6475715637207031, + "rewards/rejected": -3.629409074783325, + "step": 2680 + }, + { + "epoch": 0.7, + "learning_rate": 1.2204423129559306e-06, + "logits/chosen": -0.9111806154251099, + "logits/rejected": -0.8399251699447632, + "logps/chosen": -543.661865234375, + "logps/rejected": -744.8538818359375, + "loss": 0.4439, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9010549783706665, + "rewards/margins": 1.2256118059158325, + "rewards/rejected": -3.126666784286499, + "step": 2690 + }, + { + "epoch": 0.71, + "learning_rate": 1.20087039953583e-06, + "logits/chosen": -1.051735281944275, + "logits/rejected": -0.22251495718955994, + "logps/chosen": -584.2539672851562, + "logps/rejected": -825.45703125, + "loss": 0.47, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9151395559310913, + "rewards/margins": 1.5970687866210938, + "rewards/rejected": -3.5122084617614746, + "step": 2700 + }, + { + "epoch": 0.71, + "eval_logits/chosen": 0.5343354940414429, + "eval_logits/rejected": 1.5151687860488892, + "eval_logps/chosen": -541.308837890625, + "eval_logps/rejected": -759.2698974609375, + "eval_loss": 0.4210600256919861, + "eval_rewards/accuracies": 0.8029999732971191, + "eval_rewards/chosen": -1.8564802408218384, + "eval_rewards/margins": 1.5461931228637695, + "eval_rewards/rejected": -3.4026734828948975, + "eval_runtime": 1376.5386, + "eval_samples_per_second": 1.453, + "eval_steps_per_second": 0.363, + "step": 2700 + }, + { + "epoch": 0.71, + "learning_rate": 1.181406963063507e-06, + "logits/chosen": -1.0936121940612793, + "logits/rejected": -0.3421854078769684, + "logps/chosen": -551.6010131835938, + "logps/rejected": -727.1361083984375, + "loss": 0.523, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9586423635482788, + "rewards/margins": 1.3370798826217651, + "rewards/rejected": -3.295722484588623, + "step": 2710 + }, + { + "epoch": 0.71, + "learning_rate": 1.1620536287303052e-06, + "logits/chosen": -1.2988653182983398, + "logits/rejected": -0.4047287404537201, + "logps/chosen": -515.3692626953125, + "logps/rejected": -714.9964599609375, + "loss": 0.4243, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.6312452554702759, + "rewards/margins": 1.321842074394226, + "rewards/rejected": -2.953087091445923, + "step": 2720 + }, + { + "epoch": 0.71, + "learning_rate": 1.1428120125340717e-06, + "logits/chosen": -1.1262309551239014, + "logits/rejected": 0.1220148354768753, + "logps/chosen": -560.1183471679688, + "logps/rejected": -755.31103515625, + "loss": 0.4397, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7473366260528564, + "rewards/margins": 1.4563024044036865, + "rewards/rejected": -3.203639268875122, + "step": 2730 + }, + { + "epoch": 0.72, + "learning_rate": 1.123683721144223e-06, + "logits/chosen": -1.1425328254699707, + "logits/rejected": -0.3113950788974762, + "logps/chosen": -507.19598388671875, + "logps/rejected": -742.7738037109375, + "loss": 0.3137, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.6102272272109985, + "rewards/margins": 1.5865800380706787, + "rewards/rejected": -3.196807384490967, + "step": 2740 + }, + { + "epoch": 0.72, + "learning_rate": 1.1046703517675848e-06, + "logits/chosen": -1.2373132705688477, + "logits/rejected": -0.09322497248649597, + "logps/chosen": -504.95501708984375, + "logps/rejected": -680.431396484375, + "loss": 0.4333, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.792452096939087, + "rewards/margins": 1.1824333667755127, + "rewards/rejected": -2.9748852252960205, + "step": 2750 + }, + { + "epoch": 0.72, + "learning_rate": 1.085773492015028e-06, + "logits/chosen": -0.8464914560317993, + "logits/rejected": -0.11784086376428604, + "logps/chosen": -549.912841796875, + "logps/rejected": -770.5538330078125, + "loss": 0.4082, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.764224648475647, + "rewards/margins": 1.4827383756637573, + "rewards/rejected": -3.2469630241394043, + "step": 2760 + }, + { + "epoch": 0.72, + "learning_rate": 1.0669947197689034e-06, + "logits/chosen": -0.7767351269721985, + "logits/rejected": 0.08518421649932861, + "logps/chosen": -487.90399169921875, + "logps/rejected": -754.19287109375, + "loss": 0.3626, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7886440753936768, + "rewards/margins": 1.6782163381576538, + "rewards/rejected": -3.466860294342041, + "step": 2770 + }, + { + "epoch": 0.73, + "learning_rate": 1.048335603051291e-06, + "logits/chosen": -0.9023948907852173, + "logits/rejected": 0.5675928592681885, + "logps/chosen": -573.4138793945312, + "logps/rejected": -760.1019897460938, + "loss": 0.3304, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9305530786514282, + "rewards/margins": 1.7111543416976929, + "rewards/rejected": -3.641707181930542, + "step": 2780 + }, + { + "epoch": 0.73, + "learning_rate": 1.0297976998930665e-06, + "logits/chosen": -1.0089476108551025, + "logits/rejected": 0.039520103484392166, + "logps/chosen": -544.458251953125, + "logps/rejected": -781.134521484375, + "loss": 0.4407, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9217383861541748, + "rewards/margins": 1.6432183980941772, + "rewards/rejected": -3.5649566650390625, + "step": 2790 + }, + { + "epoch": 0.73, + "learning_rate": 1.0113825582038078e-06, + "logits/chosen": -0.9111200571060181, + "logits/rejected": 0.09320324659347534, + "logps/chosen": -485.19464111328125, + "logps/rejected": -772.6851806640625, + "loss": 0.3662, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.6882908344268799, + "rewards/margins": 1.9764955043792725, + "rewards/rejected": -3.6647861003875732, + "step": 2800 + }, + { + "epoch": 0.74, + "learning_rate": 9.930917156425477e-07, + "logits/chosen": -1.3273289203643799, + "logits/rejected": 0.12840789556503296, + "logps/chosen": -539.8751220703125, + "logps/rejected": -770.5255126953125, + "loss": 0.4417, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8574135303497314, + "rewards/margins": 1.6678911447525024, + "rewards/rejected": -3.525304079055786, + "step": 2810 + }, + { + "epoch": 0.74, + "learning_rate": 9.749266994893756e-07, + "logits/chosen": -1.2743171453475952, + "logits/rejected": -0.29028937220573425, + "logps/chosen": -505.72515869140625, + "logps/rejected": -708.1859130859375, + "loss": 0.3927, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6994062662124634, + "rewards/margins": 1.5197365283966064, + "rewards/rejected": -3.2191429138183594, + "step": 2820 + }, + { + "epoch": 0.74, + "learning_rate": 9.56889026517913e-07, + "logits/chosen": -1.2260441780090332, + "logits/rejected": 0.2879168689250946, + "logps/chosen": -580.123291015625, + "logps/rejected": -747.0691528320312, + "loss": 0.3703, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0501606464385986, + "rewards/margins": 1.359299659729004, + "rewards/rejected": -3.4094605445861816, + "step": 2830 + }, + { + "epoch": 0.74, + "learning_rate": 9.389802028686617e-07, + "logits/chosen": -1.1472103595733643, + "logits/rejected": 0.685859203338623, + "logps/chosen": -564.8762817382812, + "logps/rejected": -833.5897216796875, + "loss": 0.3696, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9314100742340088, + "rewards/margins": 2.114525318145752, + "rewards/rejected": -4.04593563079834, + "step": 2840 + }, + { + "epoch": 0.75, + "learning_rate": 9.212017239232427e-07, + "logits/chosen": -1.1055911779403687, + "logits/rejected": 0.17128732800483704, + "logps/chosen": -461.57757568359375, + "logps/rejected": -762.3133544921875, + "loss": 0.3902, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.596985101699829, + "rewards/margins": 1.919716477394104, + "rewards/rejected": -3.5167019367218018, + "step": 2850 + }, + { + "epoch": 0.75, + "learning_rate": 9.03555074179533e-07, + "logits/chosen": -0.5496357679367065, + "logits/rejected": -0.5428146123886108, + "logps/chosen": -548.8982543945312, + "logps/rejected": -794.3575439453125, + "loss": 0.4303, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8840789794921875, + "rewards/margins": 1.6221778392791748, + "rewards/rejected": -3.506256580352783, + "step": 2860 + }, + { + "epoch": 0.75, + "learning_rate": 8.860417271277067e-07, + "logits/chosen": -1.2351821660995483, + "logits/rejected": -0.3069414794445038, + "logps/chosen": -512.6089477539062, + "logps/rejected": -765.6232299804688, + "loss": 0.3669, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8200585842132568, + "rewards/margins": 1.858338713645935, + "rewards/rejected": -3.6783974170684814, + "step": 2870 + }, + { + "epoch": 0.75, + "learning_rate": 8.686631451272029e-07, + "logits/chosen": -0.9176104664802551, + "logits/rejected": 0.11124134063720703, + "logps/chosen": -530.0169067382812, + "logps/rejected": -772.3027954101562, + "loss": 0.3666, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.805767297744751, + "rewards/margins": 1.8668705224990845, + "rewards/rejected": -3.672637462615967, + "step": 2880 + }, + { + "epoch": 0.76, + "learning_rate": 8.514207792846168e-07, + "logits/chosen": -0.8543869853019714, + "logits/rejected": -0.33468276262283325, + "logps/chosen": -563.6397705078125, + "logps/rejected": -726.9634399414062, + "loss": 0.3891, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.042240619659424, + "rewards/margins": 1.2924143075942993, + "rewards/rejected": -3.3346545696258545, + "step": 2890 + }, + { + "epoch": 0.76, + "learning_rate": 8.343160693325356e-07, + "logits/chosen": -0.5689171552658081, + "logits/rejected": -0.3701861500740051, + "logps/chosen": -522.55078125, + "logps/rejected": -799.8737182617188, + "loss": 0.3666, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.9046812057495117, + "rewards/margins": 1.847630262374878, + "rewards/rejected": -3.7523112297058105, + "step": 2900 + }, + { + "epoch": 0.76, + "learning_rate": 8.173504435093174e-07, + "logits/chosen": -1.1127324104309082, + "logits/rejected": 0.49559181928634644, + "logps/chosen": -580.339599609375, + "logps/rejected": -790.05419921875, + "loss": 0.4036, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.1893162727355957, + "rewards/margins": 1.816868543624878, + "rewards/rejected": -4.0061845779418945, + "step": 2910 + }, + { + "epoch": 0.76, + "learning_rate": 8.00525318439836e-07, + "logits/chosen": -1.1705108880996704, + "logits/rejected": 0.1877966821193695, + "logps/chosen": -577.6532592773438, + "logps/rejected": -775.518310546875, + "loss": 0.5292, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.123340129852295, + "rewards/margins": 1.601309061050415, + "rewards/rejected": -3.724648952484131, + "step": 2920 + }, + { + "epoch": 0.77, + "learning_rate": 7.838420990171927e-07, + "logits/chosen": -0.675153374671936, + "logits/rejected": -0.15449100732803345, + "logps/chosen": -529.3169555664062, + "logps/rejected": -849.1887817382812, + "loss": 0.3346, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.992500901222229, + "rewards/margins": 2.1089320182800293, + "rewards/rejected": -4.101432800292969, + "step": 2930 + }, + { + "epoch": 0.77, + "learning_rate": 7.673021782854084e-07, + "logits/chosen": -1.0276451110839844, + "logits/rejected": 0.12752141058444977, + "logps/chosen": -569.2623901367188, + "logps/rejected": -797.4970703125, + "loss": 0.4235, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9917293787002563, + "rewards/margins": 1.7630789279937744, + "rewards/rejected": -3.7548089027404785, + "step": 2940 + }, + { + "epoch": 0.77, + "learning_rate": 7.509069373231039e-07, + "logits/chosen": -0.532359778881073, + "logits/rejected": -0.37800487875938416, + "logps/chosen": -572.4589233398438, + "logps/rejected": -849.5089111328125, + "loss": 0.4203, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.19490385055542, + "rewards/margins": 1.7656660079956055, + "rewards/rejected": -3.9605698585510254, + "step": 2950 + }, + { + "epoch": 0.77, + "learning_rate": 7.346577451281822e-07, + "logits/chosen": -0.9249979853630066, + "logits/rejected": 0.5974918603897095, + "logps/chosen": -583.9189453125, + "logps/rejected": -770.7642822265625, + "loss": 0.4452, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.059628486633301, + "rewards/margins": 1.6830648183822632, + "rewards/rejected": -3.7426934242248535, + "step": 2960 + }, + { + "epoch": 0.78, + "learning_rate": 7.185559585035138e-07, + "logits/chosen": -0.7646081447601318, + "logits/rejected": -0.08741030842065811, + "logps/chosen": -494.5181579589844, + "logps/rejected": -819.3825073242188, + "loss": 0.4152, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.7815096378326416, + "rewards/margins": 2.074185848236084, + "rewards/rejected": -3.8556952476501465, + "step": 2970 + }, + { + "epoch": 0.78, + "learning_rate": 7.026029219436504e-07, + "logits/chosen": -1.0040438175201416, + "logits/rejected": -0.2370959222316742, + "logps/chosen": -547.5843505859375, + "logps/rejected": -775.7852783203125, + "loss": 0.3377, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8116861581802368, + "rewards/margins": 1.7394940853118896, + "rewards/rejected": -3.551180362701416, + "step": 2980 + }, + { + "epoch": 0.78, + "learning_rate": 6.867999675225523e-07, + "logits/chosen": -0.8310391306877136, + "logits/rejected": -0.02760641649365425, + "logps/chosen": -590.3953857421875, + "logps/rejected": -830.1414184570312, + "loss": 0.3744, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.0531251430511475, + "rewards/margins": 1.790724515914917, + "rewards/rejected": -3.843848705291748, + "step": 2990 + }, + { + "epoch": 0.79, + "learning_rate": 6.711484147823663e-07, + "logits/chosen": -1.0045768022537231, + "logits/rejected": -0.32932716608047485, + "logps/chosen": -519.6864013671875, + "logps/rejected": -840.6529541015625, + "loss": 0.3769, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.8191581964492798, + "rewards/margins": 1.9201765060424805, + "rewards/rejected": -3.7393341064453125, + "step": 3000 + }, + { + "epoch": 0.79, + "eval_logits/chosen": 0.532638430595398, + "eval_logits/rejected": 1.514159917831421, + "eval_logps/chosen": -547.6463012695312, + "eval_logps/rejected": -772.1762084960938, + "eval_loss": 0.4205494523048401, + "eval_rewards/accuracies": 0.8009999990463257, + "eval_rewards/chosen": -1.919854760169983, + "eval_rewards/margins": 1.6118818521499634, + "eval_rewards/rejected": -3.5317368507385254, + "eval_runtime": 1383.4868, + "eval_samples_per_second": 1.446, + "eval_steps_per_second": 0.361, + "step": 3000 + }, + { + "epoch": 0.79, + "learning_rate": 6.556495706232413e-07, + "logits/chosen": -1.1887633800506592, + "logits/rejected": 0.10280628502368927, + "logps/chosen": -592.3970947265625, + "logps/rejected": -802.2684326171875, + "loss": 0.4594, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.981632947921753, + "rewards/margins": 1.690386414527893, + "rewards/rejected": -3.6720194816589355, + "step": 3010 + }, + { + "epoch": 0.79, + "learning_rate": 6.403047291942057e-07, + "logits/chosen": -1.0731422901153564, + "logits/rejected": -0.11721036583185196, + "logps/chosen": -500.95733642578125, + "logps/rejected": -810.4306030273438, + "loss": 0.3202, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.6846420764923096, + "rewards/margins": 2.057126522064209, + "rewards/rejected": -3.7417690753936768, + "step": 3020 + }, + { + "epoch": 0.79, + "learning_rate": 6.251151717851023e-07, + "logits/chosen": -0.5943460464477539, + "logits/rejected": -0.3044959008693695, + "logps/chosen": -552.7174682617188, + "logps/rejected": -838.6080322265625, + "loss": 0.3992, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.064213275909424, + "rewards/margins": 1.780678153038025, + "rewards/rejected": -3.844891309738159, + "step": 3030 + }, + { + "epoch": 0.8, + "learning_rate": 6.100821667196041e-07, + "logits/chosen": -0.9400644302368164, + "logits/rejected": -0.20678548514842987, + "logps/chosen": -539.8009033203125, + "logps/rejected": -764.1534423828125, + "loss": 0.3418, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.653412103652954, + "rewards/margins": 1.8391025066375732, + "rewards/rejected": -3.4925143718719482, + "step": 3040 + }, + { + "epoch": 0.8, + "learning_rate": 5.952069692493062e-07, + "logits/chosen": -0.7938990592956543, + "logits/rejected": -0.40649691224098206, + "logps/chosen": -596.44873046875, + "logps/rejected": -842.193359375, + "loss": 0.4439, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0670700073242188, + "rewards/margins": 1.7952619791030884, + "rewards/rejected": -3.8623321056365967, + "step": 3050 + }, + { + "epoch": 0.8, + "learning_rate": 5.80490821448918e-07, + "logits/chosen": -0.9244669079780579, + "logits/rejected": -0.5335060358047485, + "logps/chosen": -596.2068481445312, + "logps/rejected": -874.0540161132812, + "loss": 0.391, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.2196078300476074, + "rewards/margins": 1.8115314245224, + "rewards/rejected": -4.031139373779297, + "step": 3060 + }, + { + "epoch": 0.8, + "learning_rate": 5.659349521125459e-07, + "logits/chosen": -0.6347008347511292, + "logits/rejected": -0.5196251273155212, + "logps/chosen": -535.7493286132812, + "logps/rejected": -856.0029296875, + "loss": 0.4562, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9471817016601562, + "rewards/margins": 1.7889111042022705, + "rewards/rejected": -3.7360928058624268, + "step": 3070 + }, + { + "epoch": 0.81, + "learning_rate": 5.5154057665109e-07, + "logits/chosen": -1.1485843658447266, + "logits/rejected": 0.004442277364432812, + "logps/chosen": -522.5572509765625, + "logps/rejected": -784.026123046875, + "loss": 0.3419, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.899583101272583, + "rewards/margins": 1.6547266244888306, + "rewards/rejected": -3.554309844970703, + "step": 3080 + }, + { + "epoch": 0.81, + "learning_rate": 5.373088969907586e-07, + "logits/chosen": -0.7392430901527405, + "logits/rejected": -0.4351174235343933, + "logps/chosen": -475.52294921875, + "logps/rejected": -764.8416748046875, + "loss": 0.3467, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.6972386837005615, + "rewards/margins": 1.8677946329116821, + "rewards/rejected": -3.565033435821533, + "step": 3090 + }, + { + "epoch": 0.81, + "learning_rate": 5.23241101472709e-07, + "logits/chosen": -0.841932475566864, + "logits/rejected": -0.11554646492004395, + "logps/chosen": -619.42724609375, + "logps/rejected": -848.4366455078125, + "loss": 0.3589, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.211543560028076, + "rewards/margins": 1.8801252841949463, + "rewards/rejected": -4.091668605804443, + "step": 3100 + }, + { + "epoch": 0.81, + "learning_rate": 5.09338364753818e-07, + "logits/chosen": -1.0390411615371704, + "logits/rejected": 0.6153230667114258, + "logps/chosen": -606.2908325195312, + "logps/rejected": -770.7852783203125, + "loss": 0.5322, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.2032546997070312, + "rewards/margins": 1.4495493173599243, + "rewards/rejected": -3.652804136276245, + "step": 3110 + }, + { + "epoch": 0.82, + "learning_rate": 4.956018477086005e-07, + "logits/chosen": -0.8461894989013672, + "logits/rejected": -0.3413206934928894, + "logps/chosen": -537.2196655273438, + "logps/rejected": -825.6803588867188, + "loss": 0.3784, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8306541442871094, + "rewards/margins": 1.9641211032867432, + "rewards/rejected": -3.7947754859924316, + "step": 3120 + }, + { + "epoch": 0.82, + "learning_rate": 4.820326973322764e-07, + "logits/chosen": -0.8142082095146179, + "logits/rejected": -0.10756425559520721, + "logps/chosen": -565.7686157226562, + "logps/rejected": -817.3389892578125, + "loss": 0.4628, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.146563768386841, + "rewards/margins": 1.795789122581482, + "rewards/rejected": -3.9423530101776123, + "step": 3130 + }, + { + "epoch": 0.82, + "learning_rate": 4.686320466449981e-07, + "logits/chosen": -0.9902013540267944, + "logits/rejected": -0.15123017132282257, + "logps/chosen": -511.6470642089844, + "logps/rejected": -815.7200317382812, + "loss": 0.3672, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.8265291452407837, + "rewards/margins": 2.014096975326538, + "rewards/rejected": -3.8406262397766113, + "step": 3140 + }, + { + "epoch": 0.82, + "learning_rate": 4.554010145972418e-07, + "logits/chosen": -0.8389061689376831, + "logits/rejected": -0.14444035291671753, + "logps/chosen": -551.7734375, + "logps/rejected": -798.772705078125, + "loss": 0.4095, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.063991069793701, + "rewards/margins": 1.5325976610183716, + "rewards/rejected": -3.596588611602783, + "step": 3150 + }, + { + "epoch": 0.83, + "learning_rate": 4.4234070597637455e-07, + "logits/chosen": -0.8169253468513489, + "logits/rejected": 0.03178207948803902, + "logps/chosen": -529.4456787109375, + "logps/rejected": -754.3330078125, + "loss": 0.3978, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.110259771347046, + "rewards/margins": 1.4216582775115967, + "rewards/rejected": -3.5319180488586426, + "step": 3160 + }, + { + "epoch": 0.83, + "learning_rate": 4.2945221131440783e-07, + "logits/chosen": -0.7767388224601746, + "logits/rejected": -0.2466239631175995, + "logps/chosen": -586.1234130859375, + "logps/rejected": -792.2913818359375, + "loss": 0.4286, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.040156602859497, + "rewards/margins": 1.661505937576294, + "rewards/rejected": -3.701662063598633, + "step": 3170 + }, + { + "epoch": 0.83, + "learning_rate": 4.167366067969381e-07, + "logits/chosen": -0.8122004270553589, + "logits/rejected": -0.4158555567264557, + "logps/chosen": -579.0358276367188, + "logps/rejected": -846.17626953125, + "loss": 0.3993, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.079592227935791, + "rewards/margins": 1.6793367862701416, + "rewards/rejected": -3.758929491043091, + "step": 3180 + }, + { + "epoch": 0.83, + "learning_rate": 4.041949541732826e-07, + "logits/chosen": -1.0485798120498657, + "logits/rejected": -0.36888834834098816, + "logps/chosen": -630.3906860351562, + "logps/rejected": -814.126953125, + "loss": 0.4027, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.191230535507202, + "rewards/margins": 1.4397324323654175, + "rewards/rejected": -3.63096284866333, + "step": 3190 + }, + { + "epoch": 0.84, + "learning_rate": 3.9182830066782614e-07, + "logits/chosen": -0.8572107553482056, + "logits/rejected": -0.33156412839889526, + "logps/chosen": -506.94781494140625, + "logps/rejected": -776.4071044921875, + "loss": 0.348, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.8441927433013916, + "rewards/margins": 1.8142344951629639, + "rewards/rejected": -3.6584274768829346, + "step": 3200 + }, + { + "epoch": 0.84, + "learning_rate": 3.796376788925771e-07, + "logits/chosen": -1.5727512836456299, + "logits/rejected": 0.38068026304244995, + "logps/chosen": -552.1102905273438, + "logps/rejected": -777.140869140625, + "loss": 0.4251, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8679554462432861, + "rewards/margins": 1.6639436483383179, + "rewards/rejected": -3.5318992137908936, + "step": 3210 + }, + { + "epoch": 0.84, + "learning_rate": 3.676241067609465e-07, + "logits/chosen": -0.8122023344039917, + "logits/rejected": 0.24542848765850067, + "logps/chosen": -533.0785522460938, + "logps/rejected": -790.8440551757812, + "loss": 0.3919, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8818871974945068, + "rewards/margins": 1.9045559167861938, + "rewards/rejected": -3.7864432334899902, + "step": 3220 + }, + { + "epoch": 0.85, + "learning_rate": 3.5578858740274976e-07, + "logits/chosen": -1.0608148574829102, + "logits/rejected": -0.1883460134267807, + "logps/chosen": -546.2855224609375, + "logps/rejected": -784.9697265625, + "loss": 0.425, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9488483667373657, + "rewards/margins": 1.45414400100708, + "rewards/rejected": -3.4029927253723145, + "step": 3230 + }, + { + "epoch": 0.85, + "learning_rate": 3.44132109080447e-07, + "logits/chosen": -1.0673831701278687, + "logits/rejected": -0.0653887614607811, + "logps/chosen": -512.1033935546875, + "logps/rejected": -733.269287109375, + "loss": 0.4068, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7813838720321655, + "rewards/margins": 1.5279889106750488, + "rewards/rejected": -3.309372663497925, + "step": 3240 + }, + { + "epoch": 0.85, + "learning_rate": 3.3265564510662344e-07, + "logits/chosen": -1.1314384937286377, + "logits/rejected": -0.5283291935920715, + "logps/chosen": -552.1278076171875, + "logps/rejected": -818.47509765625, + "loss": 0.4451, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8823707103729248, + "rewards/margins": 1.676279067993164, + "rewards/rejected": -3.558649778366089, + "step": 3250 + }, + { + "epoch": 0.85, + "learning_rate": 3.213601537627195e-07, + "logits/chosen": -0.9113371968269348, + "logits/rejected": -0.332157701253891, + "logps/chosen": -565.0186767578125, + "logps/rejected": -805.5143432617188, + "loss": 0.4066, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2068862915039062, + "rewards/margins": 1.6060466766357422, + "rewards/rejected": -3.8129334449768066, + "step": 3260 + }, + { + "epoch": 0.86, + "learning_rate": 3.1024657821901063e-07, + "logits/chosen": -0.7972087264060974, + "logits/rejected": -0.6192452311515808, + "logps/chosen": -460.00244140625, + "logps/rejected": -661.6898193359375, + "loss": 0.4204, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.722604513168335, + "rewards/margins": 1.273694396018982, + "rewards/rejected": -2.9962992668151855, + "step": 3270 + }, + { + "epoch": 0.86, + "learning_rate": 2.9931584645585654e-07, + "logits/chosen": -1.2535960674285889, + "logits/rejected": 0.02778279222548008, + "logps/chosen": -465.5445861816406, + "logps/rejected": -718.2966918945312, + "loss": 0.3941, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.620823860168457, + "rewards/margins": 1.7875875234603882, + "rewards/rejected": -3.4084110260009766, + "step": 3280 + }, + { + "epoch": 0.86, + "learning_rate": 2.885688711862136e-07, + "logits/chosen": -0.6595426201820374, + "logits/rejected": -0.6082831621170044, + "logps/chosen": -518.3923950195312, + "logps/rejected": -844.9713745117188, + "loss": 0.3628, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.020922899246216, + "rewards/margins": 2.008241653442383, + "rewards/rejected": -4.0291643142700195, + "step": 3290 + }, + { + "epoch": 0.86, + "learning_rate": 2.7800654977942486e-07, + "logits/chosen": -1.1551368236541748, + "logits/rejected": 0.04624384641647339, + "logps/chosen": -494.8173828125, + "logps/rejected": -759.0722045898438, + "loss": 0.3921, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8496835231781006, + "rewards/margins": 1.606042504310608, + "rewards/rejected": -3.455725908279419, + "step": 3300 + }, + { + "epoch": 0.86, + "eval_logits/chosen": 0.5531209707260132, + "eval_logits/rejected": 1.5286740064620972, + "eval_logps/chosen": -559.9616088867188, + "eval_logps/rejected": -791.3992309570312, + "eval_loss": 0.4215858280658722, + "eval_rewards/accuracies": 0.8050000071525574, + "eval_rewards/chosen": -2.043008327484131, + "eval_rewards/margins": 1.680959939956665, + "eval_rewards/rejected": -3.723968267440796, + "eval_runtime": 1375.3072, + "eval_samples_per_second": 1.454, + "eval_steps_per_second": 0.364, + "step": 3300 + }, + { + "epoch": 0.87, + "learning_rate": 2.6762976418628797e-07, + "logits/chosen": -0.6151852011680603, + "logits/rejected": -0.4355131983757019, + "logps/chosen": -569.3573608398438, + "logps/rejected": -845.0255737304688, + "loss": 0.3994, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9695743322372437, + "rewards/margins": 1.774019479751587, + "rewards/rejected": -3.743593692779541, + "step": 3310 + }, + { + "epoch": 0.87, + "learning_rate": 2.5743938086541354e-07, + "logits/chosen": -1.1306208372116089, + "logits/rejected": -0.369152694940567, + "logps/chosen": -528.9949951171875, + "logps/rejected": -786.5325927734375, + "loss": 0.4035, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8521554470062256, + "rewards/margins": 2.019233226776123, + "rewards/rejected": -3.8713886737823486, + "step": 3320 + }, + { + "epoch": 0.87, + "learning_rate": 2.4743625071087574e-07, + "logits/chosen": -0.9898750185966492, + "logits/rejected": -0.0026702166069298983, + "logps/chosen": -589.635498046875, + "logps/rejected": -785.4796752929688, + "loss": 0.481, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.26185941696167, + "rewards/margins": 1.3372209072113037, + "rewards/rejected": -3.5990803241729736, + "step": 3330 + }, + { + "epoch": 0.87, + "learning_rate": 2.3762120898116498e-07, + "logits/chosen": -1.0256075859069824, + "logits/rejected": -0.4684371054172516, + "logps/chosen": -513.4962158203125, + "logps/rejected": -773.9256591796875, + "loss": 0.489, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8556444644927979, + "rewards/margins": 1.662695288658142, + "rewards/rejected": -3.5183398723602295, + "step": 3340 + }, + { + "epoch": 0.88, + "learning_rate": 2.2799507522944048e-07, + "logits/chosen": -1.2172832489013672, + "logits/rejected": -0.7886725068092346, + "logps/chosen": -603.7682495117188, + "logps/rejected": -839.2596435546875, + "loss": 0.3825, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.133934259414673, + "rewards/margins": 1.56403386592865, + "rewards/rejected": -3.6979682445526123, + "step": 3350 + }, + { + "epoch": 0.88, + "learning_rate": 2.1855865323510056e-07, + "logits/chosen": -0.9375013113021851, + "logits/rejected": -0.4790850281715393, + "logps/chosen": -529.7852783203125, + "logps/rejected": -789.4864501953125, + "loss": 0.3798, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7933248281478882, + "rewards/margins": 1.7892353534698486, + "rewards/rejected": -3.5825603008270264, + "step": 3360 + }, + { + "epoch": 0.88, + "learning_rate": 2.0931273093666575e-07, + "logits/chosen": -1.0546633005142212, + "logits/rejected": 0.11733438819646835, + "logps/chosen": -566.7720947265625, + "logps/rejected": -769.4909057617188, + "loss": 0.4561, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1270699501037598, + "rewards/margins": 1.5025631189346313, + "rewards/rejected": -3.6296334266662598, + "step": 3370 + }, + { + "epoch": 0.88, + "learning_rate": 2.002580803659873e-07, + "logits/chosen": -1.0718940496444702, + "logits/rejected": -0.24651813507080078, + "logps/chosen": -563.5213012695312, + "logps/rejected": -746.7576904296875, + "loss": 0.4189, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.0106587409973145, + "rewards/margins": 1.5113112926483154, + "rewards/rejected": -3.52197003364563, + "step": 3380 + }, + { + "epoch": 0.89, + "learning_rate": 1.913954575837826e-07, + "logits/chosen": -1.211354374885559, + "logits/rejected": -0.7178353071212769, + "logps/chosen": -547.396728515625, + "logps/rejected": -785.0447998046875, + "loss": 0.3956, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.859985113143921, + "rewards/margins": 1.6019798517227173, + "rewards/rejected": -3.4619648456573486, + "step": 3390 + }, + { + "epoch": 0.89, + "learning_rate": 1.827256026165028e-07, + "logits/chosen": -1.3360965251922607, + "logits/rejected": -0.5832281112670898, + "logps/chosen": -572.5958251953125, + "logps/rejected": -827.7864379882812, + "loss": 0.4079, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8811748027801514, + "rewards/margins": 1.826063871383667, + "rewards/rejected": -3.7072386741638184, + "step": 3400 + }, + { + "epoch": 0.89, + "learning_rate": 1.7424923939454274e-07, + "logits/chosen": -1.0015101432800293, + "logits/rejected": -0.4501993656158447, + "logps/chosen": -516.7034912109375, + "logps/rejected": -737.7506713867188, + "loss": 0.3917, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.871683120727539, + "rewards/margins": 1.2368314266204834, + "rewards/rejected": -3.1085145473480225, + "step": 3410 + }, + { + "epoch": 0.9, + "learning_rate": 1.6596707569179304e-07, + "logits/chosen": -1.220655918121338, + "logits/rejected": -0.36421042680740356, + "logps/chosen": -664.6968994140625, + "logps/rejected": -811.2049560546875, + "loss": 0.5067, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.4624738693237305, + "rewards/margins": 1.2412056922912598, + "rewards/rejected": -3.7036795616149902, + "step": 3420 + }, + { + "epoch": 0.9, + "learning_rate": 1.578798030665385e-07, + "logits/chosen": -0.7742003202438354, + "logits/rejected": -0.1663471907377243, + "logps/chosen": -584.2454833984375, + "logps/rejected": -789.2788696289062, + "loss": 0.4325, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.159472703933716, + "rewards/margins": 1.4603602886199951, + "rewards/rejected": -3.619832992553711, + "step": 3430 + }, + { + "epoch": 0.9, + "learning_rate": 1.499880968037165e-07, + "logits/chosen": -0.9450448155403137, + "logits/rejected": -0.24974000453948975, + "logps/chosen": -543.8734130859375, + "logps/rejected": -776.0538330078125, + "loss": 0.3894, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0840260982513428, + "rewards/margins": 1.4753036499023438, + "rewards/rejected": -3.5593299865722656, + "step": 3440 + }, + { + "epoch": 0.9, + "learning_rate": 1.4229261585852805e-07, + "logits/chosen": -1.1183446645736694, + "logits/rejected": -0.8927680253982544, + "logps/chosen": -516.3631591796875, + "logps/rejected": -750.56884765625, + "loss": 0.4428, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7409288883209229, + "rewards/margins": 1.517566442489624, + "rewards/rejected": -3.258495330810547, + "step": 3450 + }, + { + "epoch": 0.91, + "learning_rate": 1.3479400280141886e-07, + "logits/chosen": -1.0318033695220947, + "logits/rejected": -0.8086441159248352, + "logps/chosen": -516.8532104492188, + "logps/rejected": -741.1300659179688, + "loss": 0.3884, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8979904651641846, + "rewards/margins": 1.477073311805725, + "rewards/rejected": -3.37506365776062, + "step": 3460 + }, + { + "epoch": 0.91, + "learning_rate": 1.2749288376442044e-07, + "logits/chosen": -0.9179407358169556, + "logits/rejected": -0.33625102043151855, + "logps/chosen": -536.0686645507812, + "logps/rejected": -847.4317626953125, + "loss": 0.3721, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.9626020193099976, + "rewards/margins": 1.956992745399475, + "rewards/rejected": -3.9195950031280518, + "step": 3470 + }, + { + "epoch": 0.91, + "learning_rate": 1.203898683888713e-07, + "logits/chosen": -1.2370408773422241, + "logits/rejected": -0.602063775062561, + "logps/chosen": -547.3104858398438, + "logps/rejected": -757.8153076171875, + "loss": 0.4135, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.9618017673492432, + "rewards/margins": 1.5197532176971436, + "rewards/rejected": -3.4815547466278076, + "step": 3480 + }, + { + "epoch": 0.91, + "learning_rate": 1.1348554977451132e-07, + "logits/chosen": -0.8971865773200989, + "logits/rejected": -0.45237869024276733, + "logps/chosen": -549.7821044921875, + "logps/rejected": -819.00439453125, + "loss": 0.422, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.9521732330322266, + "rewards/margins": 1.8261715173721313, + "rewards/rejected": -3.7783446311950684, + "step": 3490 + }, + { + "epoch": 0.92, + "learning_rate": 1.0678050442995802e-07, + "logits/chosen": -0.5732991099357605, + "logits/rejected": -0.1439836025238037, + "logps/chosen": -517.4790649414062, + "logps/rejected": -790.7830810546875, + "loss": 0.3695, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7863785028457642, + "rewards/margins": 1.7641347646713257, + "rewards/rejected": -3.550513505935669, + "step": 3500 + }, + { + "epoch": 0.92, + "learning_rate": 1.0027529222456755e-07, + "logits/chosen": -0.8079764246940613, + "logits/rejected": -0.43154460191726685, + "logps/chosen": -505.2015075683594, + "logps/rejected": -735.8001708984375, + "loss": 0.4543, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.861318588256836, + "rewards/margins": 1.397134780883789, + "rewards/rejected": -3.258453369140625, + "step": 3510 + }, + { + "epoch": 0.92, + "learning_rate": 9.397045634168766e-08, + "logits/chosen": -1.5205557346343994, + "logits/rejected": -0.21061238646507263, + "logps/chosen": -541.4683837890625, + "logps/rejected": -778.14453125, + "loss": 0.3821, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8572975397109985, + "rewards/margins": 1.6669906377792358, + "rewards/rejected": -3.5242881774902344, + "step": 3520 + }, + { + "epoch": 0.92, + "learning_rate": 8.78665232332998e-08, + "logits/chosen": -1.0609691143035889, + "logits/rejected": -0.4593663811683655, + "logps/chosen": -509.010986328125, + "logps/rejected": -796.3077392578125, + "loss": 0.363, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9296966791152954, + "rewards/margins": 1.6614776849746704, + "rewards/rejected": -3.591174364089966, + "step": 3530 + }, + { + "epoch": 0.93, + "learning_rate": 8.196400257606208e-08, + "logits/chosen": -0.8713966608047485, + "logits/rejected": -0.9145506024360657, + "logps/chosen": -527.7672729492188, + "logps/rejected": -777.5648193359375, + "loss": 0.4802, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.896409273147583, + "rewards/margins": 1.6549888849258423, + "rewards/rejected": -3.551398515701294, + "step": 3540 + }, + { + "epoch": 0.93, + "learning_rate": 7.626338722875076e-08, + "logits/chosen": -1.1116408109664917, + "logits/rejected": -0.13830144703388214, + "logps/chosen": -569.3795166015625, + "logps/rejected": -845.25830078125, + "loss": 0.3774, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9394264221191406, + "rewards/margins": 2.049816131591797, + "rewards/rejected": -3.9892425537109375, + "step": 3550 + }, + { + "epoch": 0.93, + "learning_rate": 7.076515319110688e-08, + "logits/chosen": -0.9675251841545105, + "logits/rejected": -0.38351327180862427, + "logps/chosen": -519.7999267578125, + "logps/rejected": -768.5065307617188, + "loss": 0.3933, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.9502222537994385, + "rewards/margins": 1.693610429763794, + "rewards/rejected": -3.6438324451446533, + "step": 3560 + }, + { + "epoch": 0.93, + "learning_rate": 6.54697595640899e-08, + "logits/chosen": -0.8377977609634399, + "logits/rejected": -0.5262192487716675, + "logps/chosen": -530.46728515625, + "logps/rejected": -769.6790771484375, + "loss": 0.4231, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.032832622528076, + "rewards/margins": 1.6006113290786743, + "rewards/rejected": -3.633444309234619, + "step": 3570 + }, + { + "epoch": 0.94, + "learning_rate": 6.037764851154426e-08, + "logits/chosen": -0.985633373260498, + "logits/rejected": 0.16631217300891876, + "logps/chosen": -595.2657470703125, + "logps/rejected": -853.1846923828125, + "loss": 0.3474, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.9297056198120117, + "rewards/margins": 1.8593957424163818, + "rewards/rejected": -3.7891018390655518, + "step": 3580 + }, + { + "epoch": 0.94, + "learning_rate": 5.548924522327748e-08, + "logits/chosen": -1.016361951828003, + "logits/rejected": -0.7056697607040405, + "logps/chosen": -516.3272705078125, + "logps/rejected": -732.7681274414062, + "loss": 0.4643, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8100614547729492, + "rewards/margins": 1.4326629638671875, + "rewards/rejected": -3.242724657058716, + "step": 3590 + }, + { + "epoch": 0.94, + "learning_rate": 5.0804957879556915e-08, + "logits/chosen": -1.4801714420318604, + "logits/rejected": -0.21010088920593262, + "logps/chosen": -538.2376708984375, + "logps/rejected": -716.570556640625, + "loss": 0.4249, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.8412452936172485, + "rewards/margins": 1.4520995616912842, + "rewards/rejected": -3.2933449745178223, + "step": 3600 + }, + { + "epoch": 0.94, + "eval_logits/chosen": 0.3916805386543274, + "eval_logits/rejected": 1.3532880544662476, + "eval_logps/chosen": -551.5703735351562, + "eval_logps/rejected": -777.8283081054688, + "eval_loss": 0.4203905165195465, + "eval_rewards/accuracies": 0.800000011920929, + "eval_rewards/chosen": -1.9590957164764404, + "eval_rewards/margins": 1.6291632652282715, + "eval_rewards/rejected": -3.588258981704712, + "eval_runtime": 1380.3242, + "eval_samples_per_second": 1.449, + "eval_steps_per_second": 0.362, + "step": 3600 + }, + { + "epoch": 0.94, + "learning_rate": 4.632517761702815e-08, + "logits/chosen": -1.3528892993927002, + "logits/rejected": 0.41183796525001526, + "logps/chosen": -596.6740112304688, + "logps/rejected": -815.2229614257812, + "loss": 0.3384, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.033761739730835, + "rewards/margins": 1.7981764078140259, + "rewards/rejected": -3.8319382667541504, + "step": 3610 + }, + { + "epoch": 0.95, + "learning_rate": 4.205027849605359e-08, + "logits/chosen": -0.6885486245155334, + "logits/rejected": -0.6011554002761841, + "logps/chosen": -501.90008544921875, + "logps/rejected": -728.01416015625, + "loss": 0.3797, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8948495388031006, + "rewards/margins": 1.430154800415039, + "rewards/rejected": -3.3250045776367188, + "step": 3620 + }, + { + "epoch": 0.95, + "learning_rate": 3.798061746947995e-08, + "logits/chosen": -1.1346218585968018, + "logits/rejected": -0.2788551449775696, + "logps/chosen": -572.7951049804688, + "logps/rejected": -818.1143798828125, + "loss": 0.3816, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.859471321105957, + "rewards/margins": 1.9316068887710571, + "rewards/rejected": -3.7910780906677246, + "step": 3630 + }, + { + "epoch": 0.95, + "learning_rate": 3.411653435283158e-08, + "logits/chosen": -1.2903286218643188, + "logits/rejected": -0.4007846713066101, + "logps/chosen": -612.4331665039062, + "logps/rejected": -790.9478759765625, + "loss": 0.4783, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.9661962985992432, + "rewards/margins": 1.4086124897003174, + "rewards/rejected": -3.3748087882995605, + "step": 3640 + }, + { + "epoch": 0.96, + "learning_rate": 3.04583517959367e-08, + "logits/chosen": -1.4373764991760254, + "logits/rejected": -0.18271857500076294, + "logps/chosen": -567.571533203125, + "logps/rejected": -814.9035034179688, + "loss": 0.372, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9404630661010742, + "rewards/margins": 1.8180534839630127, + "rewards/rejected": -3.758516788482666, + "step": 3650 + }, + { + "epoch": 0.96, + "learning_rate": 2.7006375255985984e-08, + "logits/chosen": -0.8399646878242493, + "logits/rejected": -0.41886812448501587, + "logps/chosen": -487.2755432128906, + "logps/rejected": -792.0075073242188, + "loss": 0.3617, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7799431085586548, + "rewards/margins": 2.1151885986328125, + "rewards/rejected": -3.895132064819336, + "step": 3660 + }, + { + "epoch": 0.96, + "learning_rate": 2.3760892972027328e-08, + "logits/chosen": -0.7552706003189087, + "logits/rejected": -0.37330105900764465, + "logps/chosen": -550.1450805664062, + "logps/rejected": -821.5558471679688, + "loss": 0.4513, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.0513758659362793, + "rewards/margins": 1.6616461277008057, + "rewards/rejected": -3.713021755218506, + "step": 3670 + }, + { + "epoch": 0.96, + "learning_rate": 2.072217594089765e-08, + "logits/chosen": -1.048200249671936, + "logits/rejected": -0.7420127987861633, + "logps/chosen": -548.27587890625, + "logps/rejected": -818.7357788085938, + "loss": 0.4605, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8455013036727905, + "rewards/margins": 1.6794688701629639, + "rewards/rejected": -3.524970293045044, + "step": 3680 + }, + { + "epoch": 0.97, + "learning_rate": 1.789047789459375e-08, + "logits/chosen": -0.8459684252738953, + "logits/rejected": -0.16287431120872498, + "logps/chosen": -508.4056091308594, + "logps/rejected": -766.1015625, + "loss": 0.3671, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6832281351089478, + "rewards/margins": 1.7482646703720093, + "rewards/rejected": -3.431492567062378, + "step": 3690 + }, + { + "epoch": 0.97, + "learning_rate": 1.5266035279088708e-08, + "logits/chosen": -0.6956599354743958, + "logits/rejected": -0.6482317447662354, + "logps/chosen": -574.1497802734375, + "logps/rejected": -854.5062255859375, + "loss": 0.3785, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.2222962379455566, + "rewards/margins": 1.6956876516342163, + "rewards/rejected": -3.9179844856262207, + "step": 3700 + }, + { + "epoch": 0.97, + "learning_rate": 1.2849067234584623e-08, + "logits/chosen": -0.6731444001197815, + "logits/rejected": -0.2936319410800934, + "logps/chosen": -508.5874938964844, + "logps/rejected": -779.5755004882812, + "loss": 0.3793, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.7908990383148193, + "rewards/margins": 1.8119395971298218, + "rewards/rejected": -3.6028385162353516, + "step": 3710 + }, + { + "epoch": 0.97, + "learning_rate": 1.0639775577218625e-08, + "logits/chosen": -1.1039248704910278, + "logits/rejected": -0.3895350396633148, + "logps/chosen": -532.2550048828125, + "logps/rejected": -799.4228515625, + "loss": 0.3993, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.914385437965393, + "rewards/margins": 1.754346489906311, + "rewards/rejected": -3.668731689453125, + "step": 3720 + }, + { + "epoch": 0.98, + "learning_rate": 8.638344782207486e-09, + "logits/chosen": -1.1005773544311523, + "logits/rejected": -0.48734521865844727, + "logps/chosen": -638.8570556640625, + "logps/rejected": -876.8406372070312, + "loss": 0.441, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.3087477684020996, + "rewards/margins": 1.5587693452835083, + "rewards/rejected": -3.8675169944763184, + "step": 3730 + }, + { + "epoch": 0.98, + "learning_rate": 6.84494196844715e-09, + "logits/chosen": -1.2625192403793335, + "logits/rejected": -0.4201609194278717, + "logps/chosen": -579.9595336914062, + "logps/rejected": -743.4432373046875, + "loss": 0.4261, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0454494953155518, + "rewards/margins": 1.329736351966858, + "rewards/rejected": -3.37518572807312, + "step": 3740 + }, + { + "epoch": 0.98, + "learning_rate": 5.259716884556121e-09, + "logits/chosen": -1.0971883535385132, + "logits/rejected": 0.13197267055511475, + "logps/chosen": -497.7351989746094, + "logps/rejected": -844.2490234375, + "loss": 0.3651, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.8625205755233765, + "rewards/margins": 2.320369243621826, + "rewards/rejected": -4.182889461517334, + "step": 3750 + }, + { + "epoch": 0.98, + "learning_rate": 3.882801896372967e-09, + "logits/chosen": -1.2590482234954834, + "logits/rejected": 0.06436818838119507, + "logps/chosen": -604.3903198242188, + "logps/rejected": -838.1032104492188, + "loss": 0.3897, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1124839782714844, + "rewards/margins": 1.6700093746185303, + "rewards/rejected": -3.7824935913085938, + "step": 3760 + }, + { + "epoch": 0.99, + "learning_rate": 2.7143119759026614e-09, + "logits/chosen": -0.8534577488899231, + "logits/rejected": -0.31645748019218445, + "logps/chosen": -536.9246215820312, + "logps/rejected": -888.1343994140625, + "loss": 0.2884, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8014726638793945, + "rewards/margins": 2.4648687839508057, + "rewards/rejected": -4.266341209411621, + "step": 3770 + }, + { + "epoch": 0.99, + "learning_rate": 1.754344691717591e-09, + "logits/chosen": -1.0792902708053589, + "logits/rejected": -0.5682160258293152, + "logps/chosen": -508.158447265625, + "logps/rejected": -773.3580932617188, + "loss": 0.3699, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8245254755020142, + "rewards/margins": 1.8749605417251587, + "rewards/rejected": -3.6994857788085938, + "step": 3780 + }, + { + "epoch": 0.99, + "learning_rate": 1.0029802008096335e-09, + "logits/chosen": -0.6716892719268799, + "logits/rejected": -0.40372976660728455, + "logps/chosen": -489.3829040527344, + "logps/rejected": -723.220703125, + "loss": 0.4356, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7527339458465576, + "rewards/margins": 1.6192394495010376, + "rewards/rejected": -3.371973752975464, + "step": 3790 + }, + { + "epoch": 0.99, + "learning_rate": 4.602812418974534e-10, + "logits/chosen": -0.8607047200202942, + "logits/rejected": -0.9630060195922852, + "logps/chosen": -502.1089782714844, + "logps/rejected": -731.4326171875, + "loss": 0.4389, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8323230743408203, + "rewards/margins": 1.3677552938461304, + "rewards/rejected": -3.200078248977661, + "step": 3800 + }, + { + "epoch": 1.0, + "learning_rate": 1.2629313018819312e-10, + "logits/chosen": -0.9664648175239563, + "logits/rejected": 0.03846656158566475, + "logps/chosen": -550.194091796875, + "logps/rejected": -774.918212890625, + "loss": 0.385, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.9849185943603516, + "rewards/margins": 1.5859264135360718, + "rewards/rejected": -3.5708446502685547, + "step": 3810 + }, + { + "epoch": 1.0, + "learning_rate": 1.0437535929996855e-12, + "logits/chosen": -1.1103519201278687, + "logits/rejected": -0.32411596179008484, + "logps/chosen": -598.4796752929688, + "logps/rejected": -901.5104370117188, + "loss": 0.4032, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.1410813331604004, + "rewards/margins": 1.9627602100372314, + "rewards/rejected": -4.103841304779053, + "step": 3820 + }, + { + "epoch": 1.0, + "step": 3821, + "total_flos": 0.0, + "train_loss": 0.43769783746998087, + "train_runtime": 91189.3347, + "train_samples_per_second": 0.67, + "train_steps_per_second": 0.042 + } + ], + "logging_steps": 10, + "max_steps": 3821, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 400, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}