diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7394 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 9.523809523809524e-09, + "logits/chosen": -1.4941036701202393, + "logits/rejected": -1.392427682876587, + "logps/chosen": -67.08292388916016, + "logps/rejected": -289.04925537109375, + "loss": 0.1606, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 9.523809523809525e-08, + "logits/chosen": -1.6196446418762207, + "logits/rejected": -1.2225258350372314, + "logps/chosen": -404.9694519042969, + "logps/rejected": -765.021240234375, + "loss": 0.2208, + "rewards/accuracies": 0.4166666567325592, + "rewards/chosen": 0.00017578975530341268, + "rewards/margins": -5.83395967623801e-06, + "rewards/rejected": 0.00018162367632612586, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 1.904761904761905e-07, + "logits/chosen": -1.7201000452041626, + "logits/rejected": -1.0938055515289307, + "logps/chosen": -470.4878845214844, + "logps/rejected": -853.2833251953125, + "loss": 0.2049, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0003256895288359374, + "rewards/margins": 0.00018404466391075402, + "rewards/rejected": 0.0001416448940290138, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 2.8571428571428575e-07, + "logits/chosen": -1.7836272716522217, + "logits/rejected": -1.3143703937530518, + "logps/chosen": -437.3982849121094, + "logps/rejected": -757.3729248046875, + "loss": 0.1837, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0006006127223372459, + "rewards/margins": 0.0018018081318587065, + "rewards/rejected": -0.0012011956423521042, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 3.80952380952381e-07, + "logits/chosen": -1.3586902618408203, + "logits/rejected": -1.1272189617156982, + "logps/chosen": -391.42633056640625, + "logps/rejected": -752.5025634765625, + "loss": 0.2255, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.0005751135759055614, + "rewards/margins": 0.00423981249332428, + "rewards/rejected": -0.003664699150249362, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 4.7619047619047623e-07, + "logits/chosen": -1.6611652374267578, + "logits/rejected": -1.1007435321807861, + "logps/chosen": -380.03289794921875, + "logps/rejected": -720.9581298828125, + "loss": 0.1682, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.00211103493347764, + "rewards/margins": 0.009362945333123207, + "rewards/rejected": -0.007251910865306854, + "step": 50 + }, + { + "epoch": 0.01, + "learning_rate": 5.714285714285715e-07, + "logits/chosen": -1.514490008354187, + "logits/rejected": -1.14967942237854, + "logps/chosen": -574.3751220703125, + "logps/rejected": -911.6978759765625, + "loss": 0.196, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.0030188350938260555, + "rewards/margins": 0.015978649258613586, + "rewards/rejected": -0.012959812767803669, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 6.666666666666667e-07, + "logits/chosen": -1.6599044799804688, + "logits/rejected": -0.966509222984314, + "logps/chosen": -598.3924560546875, + "logps/rejected": -950.7366333007812, + "loss": 0.1913, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.005272147245705128, + "rewards/margins": 0.03154373914003372, + "rewards/rejected": -0.02627158723771572, + "step": 70 + }, + { + "epoch": 0.02, + "learning_rate": 7.61904761904762e-07, + "logits/chosen": -1.6465399265289307, + "logits/rejected": -1.0205047130584717, + "logps/chosen": -457.08416748046875, + "logps/rejected": -827.5206909179688, + "loss": 0.1416, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.018026497215032578, + "rewards/margins": 0.047980599105358124, + "rewards/rejected": -0.029954101890325546, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 8.571428571428572e-07, + "logits/chosen": -1.8371727466583252, + "logits/rejected": -1.0108280181884766, + "logps/chosen": -530.680908203125, + "logps/rejected": -870.0900268554688, + "loss": 0.1554, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.024562101811170578, + "rewards/margins": 0.06634469330310822, + "rewards/rejected": -0.041782595217227936, + "step": 90 + }, + { + "epoch": 0.02, + "learning_rate": 9.523809523809525e-07, + "logits/chosen": -1.7072471380233765, + "logits/rejected": -1.1548950672149658, + "logps/chosen": -446.76953125, + "logps/rejected": -766.6468505859375, + "loss": 0.1848, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0029524858109652996, + "rewards/margins": 0.0440407320857048, + "rewards/rejected": -0.04108824580907822, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 1.0476190476190478e-06, + "logits/chosen": -1.5558496713638306, + "logits/rejected": -1.1346105337142944, + "logps/chosen": -404.34613037109375, + "logps/rejected": -873.2721557617188, + "loss": 0.0977, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.014201948419213295, + "rewards/margins": 0.09548144042491913, + "rewards/rejected": -0.08127949386835098, + "step": 110 + }, + { + "epoch": 0.02, + "learning_rate": 1.142857142857143e-06, + "logits/chosen": -1.51814866065979, + "logits/rejected": -0.92498379945755, + "logps/chosen": -569.2479248046875, + "logps/rejected": -906.2493896484375, + "loss": 0.1117, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.026814710348844528, + "rewards/margins": 0.08289747685194016, + "rewards/rejected": -0.10971218347549438, + "step": 120 + }, + { + "epoch": 0.02, + "learning_rate": 1.2380952380952382e-06, + "logits/chosen": -1.7106437683105469, + "logits/rejected": -0.9695409536361694, + "logps/chosen": -596.420166015625, + "logps/rejected": -1031.6600341796875, + "loss": 0.1178, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.031284015625715256, + "rewards/margins": 0.12073127180337906, + "rewards/rejected": -0.15201528370380402, + "step": 130 + }, + { + "epoch": 0.03, + "learning_rate": 1.3333333333333334e-06, + "logits/chosen": -1.6556059122085571, + "logits/rejected": -1.0907657146453857, + "logps/chosen": -574.3132934570312, + "logps/rejected": -996.0400390625, + "loss": 0.1512, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.05514935776591301, + "rewards/margins": 0.11033248901367188, + "rewards/rejected": -0.16548185050487518, + "step": 140 + }, + { + "epoch": 0.03, + "learning_rate": 1.4285714285714286e-06, + "logits/chosen": -1.5143836736679077, + "logits/rejected": -0.996626079082489, + "logps/chosen": -603.2044677734375, + "logps/rejected": -1043.1396484375, + "loss": 0.1036, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.08137336373329163, + "rewards/margins": 0.13189618289470673, + "rewards/rejected": -0.21326954662799835, + "step": 150 + }, + { + "epoch": 0.03, + "learning_rate": 1.523809523809524e-06, + "logits/chosen": -1.6874631643295288, + "logits/rejected": -1.0239840745925903, + "logps/chosen": -494.5599670410156, + "logps/rejected": -1013.8928833007812, + "loss": 0.1076, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0622636079788208, + "rewards/margins": 0.19322165846824646, + "rewards/rejected": -0.25548526644706726, + "step": 160 + }, + { + "epoch": 0.03, + "learning_rate": 1.6190476190476193e-06, + "logits/chosen": -1.5800025463104248, + "logits/rejected": -1.1513192653656006, + "logps/chosen": -510.38214111328125, + "logps/rejected": -1081.0006103515625, + "loss": 0.1125, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.07993271201848984, + "rewards/margins": 0.1820063441991806, + "rewards/rejected": -0.26193904876708984, + "step": 170 + }, + { + "epoch": 0.03, + "learning_rate": 1.7142857142857145e-06, + "logits/chosen": -1.6302579641342163, + "logits/rejected": -1.0718019008636475, + "logps/chosen": -573.6951904296875, + "logps/rejected": -1089.02978515625, + "loss": 0.1109, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12081176042556763, + "rewards/margins": 0.19152003526687622, + "rewards/rejected": -0.31233179569244385, + "step": 180 + }, + { + "epoch": 0.04, + "learning_rate": 1.8095238095238097e-06, + "logits/chosen": -1.6149966716766357, + "logits/rejected": -1.0598729848861694, + "logps/chosen": -598.2444458007812, + "logps/rejected": -1140.306884765625, + "loss": 0.0935, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13946162164211273, + "rewards/margins": 0.2145201712846756, + "rewards/rejected": -0.35398179292678833, + "step": 190 + }, + { + "epoch": 0.04, + "learning_rate": 1.904761904761905e-06, + "logits/chosen": -1.8164126873016357, + "logits/rejected": -1.0813723802566528, + "logps/chosen": -629.240478515625, + "logps/rejected": -1242.144775390625, + "loss": 0.0676, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.17410385608673096, + "rewards/margins": 0.25729167461395264, + "rewards/rejected": -0.4313955307006836, + "step": 200 + }, + { + "epoch": 0.04, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": -1.7437931299209595, + "logits/rejected": -1.0192813873291016, + "logps/chosen": -559.017822265625, + "logps/rejected": -1077.651611328125, + "loss": 0.0833, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06384438276290894, + "rewards/margins": 0.24092309176921844, + "rewards/rejected": -0.3047674596309662, + "step": 210 + }, + { + "epoch": 0.04, + "learning_rate": 2.0952380952380955e-06, + "logits/chosen": -1.4945495128631592, + "logits/rejected": -0.9317284822463989, + "logps/chosen": -613.0845947265625, + "logps/rejected": -1165.627197265625, + "loss": 0.0806, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.0914846882224083, + "rewards/margins": 0.25627589225769043, + "rewards/rejected": -0.3477606177330017, + "step": 220 + }, + { + "epoch": 0.04, + "learning_rate": 2.1904761904761908e-06, + "logits/chosen": -1.7308365106582642, + "logits/rejected": -1.0819891691207886, + "logps/chosen": -565.0457763671875, + "logps/rejected": -1136.459716796875, + "loss": 0.11, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14601822197437286, + "rewards/margins": 0.2253512144088745, + "rewards/rejected": -0.3713694214820862, + "step": 230 + }, + { + "epoch": 0.05, + "learning_rate": 2.285714285714286e-06, + "logits/chosen": -1.565441370010376, + "logits/rejected": -0.9686568975448608, + "logps/chosen": -580.4759521484375, + "logps/rejected": -1177.9888916015625, + "loss": 0.1125, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1845378577709198, + "rewards/margins": 0.1970101296901703, + "rewards/rejected": -0.3815479874610901, + "step": 240 + }, + { + "epoch": 0.05, + "learning_rate": 2.380952380952381e-06, + "logits/chosen": -1.6525847911834717, + "logits/rejected": -1.0854707956314087, + "logps/chosen": -660.649169921875, + "logps/rejected": -1140.14306640625, + "loss": 0.1494, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.15628325939178467, + "rewards/margins": 0.19183820486068726, + "rewards/rejected": -0.34812143445014954, + "step": 250 + }, + { + "epoch": 0.05, + "learning_rate": 2.4761904761904764e-06, + "logits/chosen": -1.6423368453979492, + "logits/rejected": -1.3389235734939575, + "logps/chosen": -474.299560546875, + "logps/rejected": -1021.30810546875, + "loss": 0.1047, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.07896876335144043, + "rewards/margins": 0.18711309134960175, + "rewards/rejected": -0.26608186960220337, + "step": 260 + }, + { + "epoch": 0.05, + "learning_rate": 2.571428571428571e-06, + "logits/chosen": -1.7679646015167236, + "logits/rejected": -1.022589087486267, + "logps/chosen": -573.9923706054688, + "logps/rejected": -1240.5599365234375, + "loss": 0.0823, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.144535630941391, + "rewards/margins": 0.25096073746681213, + "rewards/rejected": -0.3954963684082031, + "step": 270 + }, + { + "epoch": 0.05, + "learning_rate": 2.666666666666667e-06, + "logits/chosen": -1.5805046558380127, + "logits/rejected": -1.2209659814834595, + "logps/chosen": -689.5147705078125, + "logps/rejected": -1180.4815673828125, + "loss": 0.1223, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21097099781036377, + "rewards/margins": 0.2033153772354126, + "rewards/rejected": -0.41428643465042114, + "step": 280 + }, + { + "epoch": 0.06, + "learning_rate": 2.7619047619047625e-06, + "logits/chosen": -1.4988248348236084, + "logits/rejected": -1.3288917541503906, + "logps/chosen": -654.6685791015625, + "logps/rejected": -1351.736083984375, + "loss": 0.0885, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1968156397342682, + "rewards/margins": 0.26081275939941406, + "rewards/rejected": -0.4576283395290375, + "step": 290 + }, + { + "epoch": 0.06, + "learning_rate": 2.8571428571428573e-06, + "logits/chosen": -1.8760350942611694, + "logits/rejected": -1.1818573474884033, + "logps/chosen": -712.5704345703125, + "logps/rejected": -1270.919677734375, + "loss": 0.0943, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.20824570953845978, + "rewards/margins": 0.23740389943122864, + "rewards/rejected": -0.4456496238708496, + "step": 300 + }, + { + "epoch": 0.06, + "learning_rate": 2.9523809523809525e-06, + "logits/chosen": -1.7160227298736572, + "logits/rejected": -1.109726905822754, + "logps/chosen": -548.7153930664062, + "logps/rejected": -1191.7733154296875, + "loss": 0.0857, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.15231922268867493, + "rewards/margins": 0.2784896790981293, + "rewards/rejected": -0.4308088719844818, + "step": 310 + }, + { + "epoch": 0.06, + "learning_rate": 3.047619047619048e-06, + "logits/chosen": -1.7121608257293701, + "logits/rejected": -1.1865103244781494, + "logps/chosen": -561.9929809570312, + "logps/rejected": -1196.7623291015625, + "loss": 0.0665, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15775255858898163, + "rewards/margins": 0.2596059739589691, + "rewards/rejected": -0.41735848784446716, + "step": 320 + }, + { + "epoch": 0.06, + "learning_rate": 3.142857142857143e-06, + "logits/chosen": -1.7223520278930664, + "logits/rejected": -1.1820757389068604, + "logps/chosen": -541.658447265625, + "logps/rejected": -1158.8157958984375, + "loss": 0.0664, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12493345886468887, + "rewards/margins": 0.25119608640670776, + "rewards/rejected": -0.3761295676231384, + "step": 330 + }, + { + "epoch": 0.06, + "learning_rate": 3.2380952380952385e-06, + "logits/chosen": -1.5452698469161987, + "logits/rejected": -0.9967746734619141, + "logps/chosen": -682.1392822265625, + "logps/rejected": -1289.32421875, + "loss": 0.0913, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1961658000946045, + "rewards/margins": 0.248531773686409, + "rewards/rejected": -0.44469761848449707, + "step": 340 + }, + { + "epoch": 0.07, + "learning_rate": 3.3333333333333333e-06, + "logits/chosen": -1.4716678857803345, + "logits/rejected": -1.3033275604248047, + "logps/chosen": -558.1559448242188, + "logps/rejected": -1119.2779541015625, + "loss": 0.1114, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.11863617599010468, + "rewards/margins": 0.18471702933311462, + "rewards/rejected": -0.3033532202243805, + "step": 350 + }, + { + "epoch": 0.07, + "learning_rate": 3.428571428571429e-06, + "logits/chosen": -1.5965025424957275, + "logits/rejected": -1.1964014768600464, + "logps/chosen": -656.2760009765625, + "logps/rejected": -1180.3507080078125, + "loss": 0.0883, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.16832932829856873, + "rewards/margins": 0.21959540247917175, + "rewards/rejected": -0.3879247307777405, + "step": 360 + }, + { + "epoch": 0.07, + "learning_rate": 3.523809523809524e-06, + "logits/chosen": -1.6595999002456665, + "logits/rejected": -1.197110891342163, + "logps/chosen": -491.074462890625, + "logps/rejected": -1110.8392333984375, + "loss": 0.0918, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1459522545337677, + "rewards/margins": 0.24827821552753448, + "rewards/rejected": -0.3942304849624634, + "step": 370 + }, + { + "epoch": 0.07, + "learning_rate": 3.6190476190476194e-06, + "logits/chosen": -1.662347435951233, + "logits/rejected": -1.183836579322815, + "logps/chosen": -535.8385620117188, + "logps/rejected": -1255.84765625, + "loss": 0.0572, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.09289667755365372, + "rewards/margins": 0.2889617085456848, + "rewards/rejected": -0.38185834884643555, + "step": 380 + }, + { + "epoch": 0.07, + "learning_rate": 3.7142857142857146e-06, + "logits/chosen": -1.734784722328186, + "logits/rejected": -1.2055104970932007, + "logps/chosen": -504.5662536621094, + "logps/rejected": -1199.697021484375, + "loss": 0.0805, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.11345423758029938, + "rewards/margins": 0.2887258231639862, + "rewards/rejected": -0.4021800458431244, + "step": 390 + }, + { + "epoch": 0.08, + "learning_rate": 3.80952380952381e-06, + "logits/chosen": -1.8762212991714478, + "logits/rejected": -0.9481765031814575, + "logps/chosen": -750.0078125, + "logps/rejected": -1295.263671875, + "loss": 0.0862, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2424238920211792, + "rewards/margins": 0.22880926728248596, + "rewards/rejected": -0.47123318910598755, + "step": 400 + }, + { + "epoch": 0.08, + "learning_rate": 3.9047619047619055e-06, + "logits/chosen": -1.7143110036849976, + "logits/rejected": -1.2198104858398438, + "logps/chosen": -701.7123413085938, + "logps/rejected": -1305.44140625, + "loss": 0.0971, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2008996307849884, + "rewards/margins": 0.26313668489456177, + "rewards/rejected": -0.46403631567955017, + "step": 410 + }, + { + "epoch": 0.08, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": -1.71208918094635, + "logits/rejected": -1.0226839780807495, + "logps/chosen": -600.1834716796875, + "logps/rejected": -1139.7694091796875, + "loss": 0.0848, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12446846067905426, + "rewards/margins": 0.2344471663236618, + "rewards/rejected": -0.35891565680503845, + "step": 420 + }, + { + "epoch": 0.08, + "learning_rate": 4.095238095238096e-06, + "logits/chosen": -1.7601484060287476, + "logits/rejected": -1.099973201751709, + "logps/chosen": -627.597412109375, + "logps/rejected": -1246.977783203125, + "loss": 0.0753, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.11978325992822647, + "rewards/margins": 0.25477010011672974, + "rewards/rejected": -0.3745533227920532, + "step": 430 + }, + { + "epoch": 0.08, + "learning_rate": 4.190476190476191e-06, + "logits/chosen": -1.435619592666626, + "logits/rejected": -0.9720669984817505, + "logps/chosen": -753.2772827148438, + "logps/rejected": -1327.572265625, + "loss": 0.0828, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26884615421295166, + "rewards/margins": 0.23727090656757355, + "rewards/rejected": -0.5061171054840088, + "step": 440 + }, + { + "epoch": 0.09, + "learning_rate": 4.2857142857142855e-06, + "logits/chosen": -1.5603129863739014, + "logits/rejected": -1.0288165807724, + "logps/chosen": -777.147705078125, + "logps/rejected": -1493.787841796875, + "loss": 0.0811, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32021015882492065, + "rewards/margins": 0.3157784640789032, + "rewards/rejected": -0.6359886527061462, + "step": 450 + }, + { + "epoch": 0.09, + "learning_rate": 4.3809523809523815e-06, + "logits/chosen": -1.552169919013977, + "logits/rejected": -1.063795566558838, + "logps/chosen": -634.6558837890625, + "logps/rejected": -1188.177490234375, + "loss": 0.1036, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21501021087169647, + "rewards/margins": 0.208788201212883, + "rewards/rejected": -0.42379841208457947, + "step": 460 + }, + { + "epoch": 0.09, + "learning_rate": 4.476190476190477e-06, + "logits/chosen": -1.8907029628753662, + "logits/rejected": -1.0847430229187012, + "logps/chosen": -527.0374755859375, + "logps/rejected": -1173.001953125, + "loss": 0.0646, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.08367784321308136, + "rewards/margins": 0.2584603428840637, + "rewards/rejected": -0.3421381413936615, + "step": 470 + }, + { + "epoch": 0.09, + "learning_rate": 4.571428571428572e-06, + "logits/chosen": -1.7828342914581299, + "logits/rejected": -1.354624629020691, + "logps/chosen": -562.9432373046875, + "logps/rejected": -1167.8656005859375, + "loss": 0.0815, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1048797145485878, + "rewards/margins": 0.2481795847415924, + "rewards/rejected": -0.353059321641922, + "step": 480 + }, + { + "epoch": 0.09, + "learning_rate": 4.666666666666667e-06, + "logits/chosen": -2.050565004348755, + "logits/rejected": -1.307130217552185, + "logps/chosen": -652.1871337890625, + "logps/rejected": -1257.9901123046875, + "loss": 0.061, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.15454623103141785, + "rewards/margins": 0.2749207019805908, + "rewards/rejected": -0.4294669032096863, + "step": 490 + }, + { + "epoch": 0.1, + "learning_rate": 4.761904761904762e-06, + "logits/chosen": -1.7903814315795898, + "logits/rejected": -1.102508783340454, + "logps/chosen": -714.9098510742188, + "logps/rejected": -1366.354248046875, + "loss": 0.0709, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18693551421165466, + "rewards/margins": 0.3030626177787781, + "rewards/rejected": -0.4899981617927551, + "step": 500 + }, + { + "epoch": 0.1, + "learning_rate": 4.857142857142858e-06, + "logits/chosen": -1.5884782075881958, + "logits/rejected": -1.0900219678878784, + "logps/chosen": -889.3687744140625, + "logps/rejected": -1551.3133544921875, + "loss": 0.1114, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.31739160418510437, + "rewards/margins": 0.2541782259941101, + "rewards/rejected": -0.5715699195861816, + "step": 510 + }, + { + "epoch": 0.1, + "learning_rate": 4.952380952380953e-06, + "logits/chosen": -1.917160987854004, + "logits/rejected": -1.2584677934646606, + "logps/chosen": -599.8109130859375, + "logps/rejected": -1112.045166015625, + "loss": 0.1099, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1927962303161621, + "rewards/margins": 0.21344804763793945, + "rewards/rejected": -0.40624427795410156, + "step": 520 + }, + { + "epoch": 0.1, + "learning_rate": 4.999986185163754e-06, + "logits/chosen": -1.6736475229263306, + "logits/rejected": -1.1986761093139648, + "logps/chosen": -646.7384033203125, + "logps/rejected": -1181.8050537109375, + "loss": 0.0981, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1889500617980957, + "rewards/margins": 0.21725177764892578, + "rewards/rejected": -0.4062018394470215, + "step": 530 + }, + { + "epoch": 0.1, + "learning_rate": 4.999875667389858e-06, + "logits/chosen": -1.7721306085586548, + "logits/rejected": -1.1803306341171265, + "logps/chosen": -586.8339233398438, + "logps/rejected": -1139.8045654296875, + "loss": 0.1072, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14554569125175476, + "rewards/margins": 0.23414616286754608, + "rewards/rejected": -0.3796918988227844, + "step": 540 + }, + { + "epoch": 0.1, + "learning_rate": 4.999654636727765e-06, + "logits/chosen": -1.6037023067474365, + "logits/rejected": -1.2241770029067993, + "logps/chosen": -585.2470092773438, + "logps/rejected": -1193.3209228515625, + "loss": 0.1007, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13885599374771118, + "rewards/margins": 0.228451207280159, + "rewards/rejected": -0.36730721592903137, + "step": 550 + }, + { + "epoch": 0.11, + "learning_rate": 4.999323102948655e-06, + "logits/chosen": -1.4847551584243774, + "logits/rejected": -1.0340732336044312, + "logps/chosen": -661.06103515625, + "logps/rejected": -1281.92431640625, + "loss": 0.1014, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.20873567461967468, + "rewards/margins": 0.2781946659088135, + "rewards/rejected": -0.48693031072616577, + "step": 560 + }, + { + "epoch": 0.11, + "learning_rate": 4.998881080708759e-06, + "logits/chosen": -1.5017633438110352, + "logits/rejected": -1.003154993057251, + "logps/chosen": -746.7311401367188, + "logps/rejected": -1284.123779296875, + "loss": 0.1062, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2704651951789856, + "rewards/margins": 0.23320093750953674, + "rewards/rejected": -0.5036661028862, + "step": 570 + }, + { + "epoch": 0.11, + "learning_rate": 4.998328589548711e-06, + "logits/chosen": -1.5699012279510498, + "logits/rejected": -1.1782718896865845, + "logps/chosen": -710.3646850585938, + "logps/rejected": -1333.047119140625, + "loss": 0.0741, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.24391067028045654, + "rewards/margins": 0.2689984440803528, + "rewards/rejected": -0.5129091143608093, + "step": 580 + }, + { + "epoch": 0.11, + "learning_rate": 4.997665653892682e-06, + "logits/chosen": -1.7179704904556274, + "logits/rejected": -0.9554191827774048, + "logps/chosen": -799.9317626953125, + "logps/rejected": -1362.622802734375, + "loss": 0.0745, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3002550005912781, + "rewards/margins": 0.29512229561805725, + "rewards/rejected": -0.5953773260116577, + "step": 590 + }, + { + "epoch": 0.11, + "learning_rate": 4.996892303047306e-06, + "logits/chosen": -1.586641550064087, + "logits/rejected": -0.9480462074279785, + "logps/chosen": -888.384765625, + "logps/rejected": -1318.6873779296875, + "loss": 0.112, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3719322383403778, + "rewards/margins": 0.19167180359363556, + "rewards/rejected": -0.5636041164398193, + "step": 600 + }, + { + "epoch": 0.12, + "learning_rate": 4.996008571200375e-06, + "logits/chosen": -1.5255849361419678, + "logits/rejected": -1.0800695419311523, + "logps/chosen": -759.3140869140625, + "logps/rejected": -1329.3663330078125, + "loss": 0.117, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.269595205783844, + "rewards/margins": 0.23617248237133026, + "rewards/rejected": -0.5057677030563354, + "step": 610 + }, + { + "epoch": 0.12, + "learning_rate": 4.995014497419336e-06, + "logits/chosen": -1.473812222480774, + "logits/rejected": -1.122639775276184, + "logps/chosen": -585.0800170898438, + "logps/rejected": -1056.879638671875, + "loss": 0.1179, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.17191004753112793, + "rewards/margins": 0.15805818140506744, + "rewards/rejected": -0.3299682140350342, + "step": 620 + }, + { + "epoch": 0.12, + "learning_rate": 4.993910125649561e-06, + "logits/chosen": -1.4959099292755127, + "logits/rejected": -0.9445433616638184, + "logps/chosen": -615.6098022460938, + "logps/rejected": -1139.494873046875, + "loss": 0.0918, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1279982626438141, + "rewards/margins": 0.24162478744983673, + "rewards/rejected": -0.3696230351924896, + "step": 630 + }, + { + "epoch": 0.12, + "learning_rate": 4.992695504712402e-06, + "logits/chosen": -1.4863700866699219, + "logits/rejected": -1.1100517511367798, + "logps/chosen": -520.9189453125, + "logps/rejected": -1139.673095703125, + "loss": 0.1111, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.13522864878177643, + "rewards/margins": 0.2550002932548523, + "rewards/rejected": -0.39022889733314514, + "step": 640 + }, + { + "epoch": 0.12, + "learning_rate": 4.9913706883030385e-06, + "logits/chosen": -1.688804268836975, + "logits/rejected": -1.2719508409500122, + "logps/chosen": -706.3018798828125, + "logps/rejected": -1093.7733154296875, + "loss": 0.1174, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.23440298438072205, + "rewards/margins": 0.16613098978996277, + "rewards/rejected": -0.4005340039730072, + "step": 650 + }, + { + "epoch": 0.13, + "learning_rate": 4.989935734988098e-06, + "logits/chosen": -1.6558220386505127, + "logits/rejected": -1.0028841495513916, + "logps/chosen": -598.4909057617188, + "logps/rejected": -1052.578857421875, + "loss": 0.1242, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.18583187460899353, + "rewards/margins": 0.2180318385362625, + "rewards/rejected": -0.40386366844177246, + "step": 660 + }, + { + "epoch": 0.13, + "learning_rate": 4.988390708203068e-06, + "logits/chosen": -1.7862279415130615, + "logits/rejected": -1.118517518043518, + "logps/chosen": -668.9701538085938, + "logps/rejected": -1353.1531982421875, + "loss": 0.0559, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1963638961315155, + "rewards/margins": 0.30444028973579407, + "rewards/rejected": -0.5008042454719543, + "step": 670 + }, + { + "epoch": 0.13, + "learning_rate": 4.9867356762494955e-06, + "logits/chosen": -1.5889132022857666, + "logits/rejected": -1.1245874166488647, + "logps/chosen": -745.0137939453125, + "logps/rejected": -1370.260009765625, + "loss": 0.0932, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.19919827580451965, + "rewards/margins": 0.26395368576049805, + "rewards/rejected": -0.4631519317626953, + "step": 680 + }, + { + "epoch": 0.13, + "learning_rate": 4.984970712291963e-06, + "logits/chosen": -1.7236887216567993, + "logits/rejected": -0.9385706782341003, + "logps/chosen": -698.5767822265625, + "logps/rejected": -1253.93994140625, + "loss": 0.0922, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1614110916852951, + "rewards/margins": 0.2871715724468231, + "rewards/rejected": -0.44858264923095703, + "step": 690 + }, + { + "epoch": 0.13, + "learning_rate": 4.983095894354858e-06, + "logits/chosen": -1.7446352243423462, + "logits/rejected": -1.2324244976043701, + "logps/chosen": -680.3492431640625, + "logps/rejected": -1322.1500244140625, + "loss": 0.091, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.19917258620262146, + "rewards/margins": 0.26733073592185974, + "rewards/rejected": -0.46650344133377075, + "step": 700 + }, + { + "epoch": 0.14, + "learning_rate": 4.981111305318918e-06, + "logits/chosen": -1.4915651082992554, + "logits/rejected": -0.997687816619873, + "logps/chosen": -652.5929565429688, + "logps/rejected": -1232.224365234375, + "loss": 0.1035, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.21357397735118866, + "rewards/margins": 0.23794837296009064, + "rewards/rejected": -0.4515223503112793, + "step": 710 + }, + { + "epoch": 0.14, + "learning_rate": 4.979017032917576e-06, + "logits/chosen": -1.93303644657135, + "logits/rejected": -1.1569701433181763, + "logps/chosen": -580.7186279296875, + "logps/rejected": -1183.568115234375, + "loss": 0.0613, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1580665409564972, + "rewards/margins": 0.2544881999492645, + "rewards/rejected": -0.41255468130111694, + "step": 720 + }, + { + "epoch": 0.14, + "learning_rate": 4.97681316973307e-06, + "logits/chosen": -1.5416204929351807, + "logits/rejected": -1.1237828731536865, + "logps/chosen": -565.6384887695312, + "logps/rejected": -1154.13134765625, + "loss": 0.112, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.14230427145957947, + "rewards/margins": 0.2524174153804779, + "rewards/rejected": -0.394721657037735, + "step": 730 + }, + { + "epoch": 0.14, + "learning_rate": 4.9744998131923625e-06, + "logits/chosen": -1.6045395135879517, + "logits/rejected": -1.0634424686431885, + "logps/chosen": -734.4767456054688, + "logps/rejected": -1465.661376953125, + "loss": 0.0922, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.26009348034858704, + "rewards/margins": 0.2871132493019104, + "rewards/rejected": -0.5472067594528198, + "step": 740 + }, + { + "epoch": 0.14, + "learning_rate": 4.9720770655628216e-06, + "logits/chosen": -1.8270715475082397, + "logits/rejected": -1.260059118270874, + "logps/chosen": -715.5145263671875, + "logps/rejected": -1226.204833984375, + "loss": 0.1144, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18080273270606995, + "rewards/margins": 0.2042236328125, + "rewards/rejected": -0.38502639532089233, + "step": 750 + }, + { + "epoch": 0.14, + "learning_rate": 4.969545033947711e-06, + "logits/chosen": -1.673006296157837, + "logits/rejected": -1.180161952972412, + "logps/chosen": -606.9016723632812, + "logps/rejected": -1316.323486328125, + "loss": 0.0723, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1526075303554535, + "rewards/margins": 0.26150739192962646, + "rewards/rejected": -0.41411495208740234, + "step": 760 + }, + { + "epoch": 0.15, + "learning_rate": 4.966903830281449e-06, + "logits/chosen": -1.9988367557525635, + "logits/rejected": -1.309494972229004, + "logps/chosen": -557.9260864257812, + "logps/rejected": -1146.306640625, + "loss": 0.0867, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.09266151487827301, + "rewards/margins": 0.2640661597251892, + "rewards/rejected": -0.3567276895046234, + "step": 770 + }, + { + "epoch": 0.15, + "learning_rate": 4.964153571324658e-06, + "logits/chosen": -1.978520154953003, + "logits/rejected": -1.1246730089187622, + "logps/chosen": -589.2442626953125, + "logps/rejected": -954.0111083984375, + "loss": 0.1111, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1084524616599083, + "rewards/margins": 0.19350787997245789, + "rewards/rejected": -0.30196037888526917, + "step": 780 + }, + { + "epoch": 0.15, + "learning_rate": 4.96129437865901e-06, + "logits/chosen": -1.7288103103637695, + "logits/rejected": -1.1625417470932007, + "logps/chosen": -582.9478759765625, + "logps/rejected": -1196.01318359375, + "loss": 0.0591, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15656381845474243, + "rewards/margins": 0.2532889246940613, + "rewards/rejected": -0.4098528027534485, + "step": 790 + }, + { + "epoch": 0.15, + "learning_rate": 4.958326378681849e-06, + "logits/chosen": -1.7046226263046265, + "logits/rejected": -1.2026054859161377, + "logps/chosen": -526.1422119140625, + "logps/rejected": -1188.0306396484375, + "loss": 0.0666, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.12230102717876434, + "rewards/margins": 0.279751718044281, + "rewards/rejected": -0.40205278992652893, + "step": 800 + }, + { + "epoch": 0.15, + "learning_rate": 4.955249702600598e-06, + "logits/chosen": -1.6546719074249268, + "logits/rejected": -1.117095708847046, + "logps/chosen": -624.09375, + "logps/rejected": -1231.6494140625, + "loss": 0.0991, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15768446028232574, + "rewards/margins": 0.2528056502342224, + "rewards/rejected": -0.41049009561538696, + "step": 810 + }, + { + "epoch": 0.16, + "learning_rate": 4.952064486426965e-06, + "logits/chosen": -1.7625147104263306, + "logits/rejected": -1.1312098503112793, + "logps/chosen": -638.472412109375, + "logps/rejected": -1207.470458984375, + "loss": 0.0717, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1054113358259201, + "rewards/margins": 0.2620728611946106, + "rewards/rejected": -0.3674841821193695, + "step": 820 + }, + { + "epoch": 0.16, + "learning_rate": 4.948770870970929e-06, + "logits/chosen": -1.8074289560317993, + "logits/rejected": -1.2430999279022217, + "logps/chosen": -625.0852661132812, + "logps/rejected": -1263.1839599609375, + "loss": 0.0759, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.14085423946380615, + "rewards/margins": 0.27442166209220886, + "rewards/rejected": -0.415275901556015, + "step": 830 + }, + { + "epoch": 0.16, + "learning_rate": 4.9453690018345144e-06, + "logits/chosen": -1.782332181930542, + "logits/rejected": -1.2486510276794434, + "logps/chosen": -668.4853515625, + "logps/rejected": -1139.117919921875, + "loss": 0.1234, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1727108657360077, + "rewards/margins": 0.21505430340766907, + "rewards/rejected": -0.38776513934135437, + "step": 840 + }, + { + "epoch": 0.16, + "learning_rate": 4.941859029405354e-06, + "logits/chosen": -1.5351417064666748, + "logits/rejected": -1.2106958627700806, + "logps/chosen": -692.7496337890625, + "logps/rejected": -1367.590087890625, + "loss": 0.0951, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.2019486427307129, + "rewards/margins": 0.27323096990585327, + "rewards/rejected": -0.47517961263656616, + "step": 850 + }, + { + "epoch": 0.16, + "learning_rate": 4.938241108850039e-06, + "logits/chosen": -1.7847673892974854, + "logits/rejected": -0.9648265838623047, + "logps/chosen": -705.1673583984375, + "logps/rejected": -1294.5316162109375, + "loss": 0.0712, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.17169032990932465, + "rewards/margins": 0.2827732563018799, + "rewards/rejected": -0.45446357131004333, + "step": 860 + }, + { + "epoch": 0.17, + "learning_rate": 4.934515400107266e-06, + "logits/chosen": -1.717449426651001, + "logits/rejected": -1.2112057209014893, + "logps/chosen": -663.0195922851562, + "logps/rejected": -1365.6519775390625, + "loss": 0.0685, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.19511958956718445, + "rewards/margins": 0.2715843617916107, + "rewards/rejected": -0.4667038917541504, + "step": 870 + }, + { + "epoch": 0.17, + "learning_rate": 4.930682067880759e-06, + "logits/chosen": -1.6452863216400146, + "logits/rejected": -1.1141650676727295, + "logps/chosen": -567.5109252929688, + "logps/rejected": -1393.507568359375, + "loss": 0.0479, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.17566943168640137, + "rewards/margins": 0.3386470377445221, + "rewards/rejected": -0.5143164396286011, + "step": 880 + }, + { + "epoch": 0.17, + "learning_rate": 4.926741281631991e-06, + "logits/chosen": -1.6676323413848877, + "logits/rejected": -1.2808525562286377, + "logps/chosen": -630.0742797851562, + "logps/rejected": -1261.5531005859375, + "loss": 0.0675, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.18672814965248108, + "rewards/margins": 0.2657690644264221, + "rewards/rejected": -0.4524971842765808, + "step": 890 + }, + { + "epoch": 0.17, + "learning_rate": 4.922693215572695e-06, + "logits/chosen": -1.4996672868728638, + "logits/rejected": -1.0641984939575195, + "logps/chosen": -514.0299072265625, + "logps/rejected": -1189.6839599609375, + "loss": 0.0829, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.11306717246770859, + "rewards/margins": 0.29115527868270874, + "rewards/rejected": -0.40422242879867554, + "step": 900 + }, + { + "epoch": 0.17, + "learning_rate": 4.91853804865716e-06, + "logits/chosen": -1.6429615020751953, + "logits/rejected": -1.2641582489013672, + "logps/chosen": -637.0469360351562, + "logps/rejected": -1262.9354248046875, + "loss": 0.0909, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15128901600837708, + "rewards/margins": 0.2525308430194855, + "rewards/rejected": -0.40381985902786255, + "step": 910 + }, + { + "epoch": 0.18, + "learning_rate": 4.91427596457432e-06, + "logits/chosen": -1.745375633239746, + "logits/rejected": -1.1431455612182617, + "logps/chosen": -661.6976318359375, + "logps/rejected": -1298.279541015625, + "loss": 0.0742, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1605122685432434, + "rewards/margins": 0.2791559100151062, + "rewards/rejected": -0.4396681785583496, + "step": 920 + }, + { + "epoch": 0.18, + "learning_rate": 4.909907151739634e-06, + "logits/chosen": -1.863525152206421, + "logits/rejected": -1.5125272274017334, + "logps/chosen": -525.1600341796875, + "logps/rejected": -1222.451171875, + "loss": 0.0681, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.12098580598831177, + "rewards/margins": 0.295646607875824, + "rewards/rejected": -0.4166324734687805, + "step": 930 + }, + { + "epoch": 0.18, + "learning_rate": 4.905431803286756e-06, + "logits/chosen": -1.5892441272735596, + "logits/rejected": -1.2031795978546143, + "logps/chosen": -558.3675537109375, + "logps/rejected": -1227.478515625, + "loss": 0.0882, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13429304957389832, + "rewards/margins": 0.2827587127685547, + "rewards/rejected": -0.4170517325401306, + "step": 940 + }, + { + "epoch": 0.18, + "learning_rate": 4.900850117059e-06, + "logits/chosen": -1.621360182762146, + "logits/rejected": -1.333057165145874, + "logps/chosen": -596.5543823242188, + "logps/rejected": -1246.934326171875, + "loss": 0.0652, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.16683629155158997, + "rewards/margins": 0.29126232862472534, + "rewards/rejected": -0.4580985903739929, + "step": 950 + }, + { + "epoch": 0.18, + "learning_rate": 4.8961622956005895e-06, + "logits/chosen": -1.7013307809829712, + "logits/rejected": -1.2199418544769287, + "logps/chosen": -655.8325805664062, + "logps/rejected": -1257.751708984375, + "loss": 0.0768, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.17777304351329803, + "rewards/margins": 0.26354774832725525, + "rewards/rejected": -0.44132083654403687, + "step": 960 + }, + { + "epoch": 0.18, + "learning_rate": 4.891368546147707e-06, + "logits/chosen": -1.7467527389526367, + "logits/rejected": -1.349560022354126, + "logps/chosen": -574.1756591796875, + "logps/rejected": -1149.5211181640625, + "loss": 0.1057, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.14399586617946625, + "rewards/margins": 0.26451900601387024, + "rewards/rejected": -0.4085148870944977, + "step": 970 + }, + { + "epoch": 0.19, + "learning_rate": 4.88646908061933e-06, + "logits/chosen": -1.8687814474105835, + "logits/rejected": -1.1187694072723389, + "logps/chosen": -652.5084228515625, + "logps/rejected": -1175.923095703125, + "loss": 0.0937, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13137920200824738, + "rewards/margins": 0.2375623881816864, + "rewards/rejected": -0.36894160509109497, + "step": 980 + }, + { + "epoch": 0.19, + "learning_rate": 4.881464115607866e-06, + "logits/chosen": -1.7792987823486328, + "logits/rejected": -1.0951414108276367, + "logps/chosen": -598.0963134765625, + "logps/rejected": -1200.380615234375, + "loss": 0.0843, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.08524195849895477, + "rewards/margins": 0.26355254650115967, + "rewards/rejected": -0.34879451990127563, + "step": 990 + }, + { + "epoch": 0.19, + "learning_rate": 4.876353872369573e-06, + "logits/chosen": -1.8199819326400757, + "logits/rejected": -1.1733105182647705, + "logps/chosen": -558.3444213867188, + "logps/rejected": -1186.50439453125, + "loss": 0.0714, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.05597637966275215, + "rewards/margins": 0.25787559151649475, + "rewards/rejected": -0.3138519823551178, + "step": 1000 + }, + { + "epoch": 0.19, + "learning_rate": 4.871138576814782e-06, + "logits/chosen": -1.679121971130371, + "logits/rejected": -1.1686570644378662, + "logps/chosen": -554.617919921875, + "logps/rejected": -1218.363037109375, + "loss": 0.0873, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.09123705327510834, + "rewards/margins": 0.2742183804512024, + "rewards/rejected": -0.3654554486274719, + "step": 1010 + }, + { + "epoch": 0.19, + "learning_rate": 4.865818459497911e-06, + "logits/chosen": -1.646142601966858, + "logits/rejected": -1.087127923965454, + "logps/chosen": -595.8470458984375, + "logps/rejected": -1093.2322998046875, + "loss": 0.1123, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.16172102093696594, + "rewards/margins": 0.21603676676750183, + "rewards/rejected": -0.3777577877044678, + "step": 1020 + }, + { + "epoch": 0.2, + "learning_rate": 4.860393755607266e-06, + "logits/chosen": -1.522953748703003, + "logits/rejected": -1.0482865571975708, + "logps/chosen": -692.7568969726562, + "logps/rejected": -1367.2935791015625, + "loss": 0.1043, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.24981939792633057, + "rewards/margins": 0.28347522020339966, + "rewards/rejected": -0.5332946181297302, + "step": 1030 + }, + { + "epoch": 0.2, + "learning_rate": 4.854864704954654e-06, + "logits/chosen": -1.4114348888397217, + "logits/rejected": -1.1013247966766357, + "logps/chosen": -734.9617309570312, + "logps/rejected": -1330.0992431640625, + "loss": 0.077, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2645266354084015, + "rewards/margins": 0.23836931586265564, + "rewards/rejected": -0.5028959512710571, + "step": 1040 + }, + { + "epoch": 0.2, + "learning_rate": 4.849231551964771e-06, + "logits/chosen": -1.7310336828231812, + "logits/rejected": -0.9586830139160156, + "logps/chosen": -699.4583740234375, + "logps/rejected": -1246.3502197265625, + "loss": 0.0698, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.16729851067066193, + "rewards/margins": 0.28657031059265137, + "rewards/rejected": -0.4538688659667969, + "step": 1050 + }, + { + "epoch": 0.2, + "learning_rate": 4.843494545664407e-06, + "logits/chosen": -1.3404247760772705, + "logits/rejected": -0.7918633818626404, + "logps/chosen": -588.2305908203125, + "logps/rejected": -1199.187744140625, + "loss": 0.0801, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1429479569196701, + "rewards/margins": 0.30188634991645813, + "rewards/rejected": -0.44483429193496704, + "step": 1060 + }, + { + "epoch": 0.2, + "learning_rate": 4.837653939671427e-06, + "logits/chosen": -1.6853176355361938, + "logits/rejected": -0.9261191487312317, + "logps/chosen": -739.8435668945312, + "logps/rejected": -1302.793701171875, + "loss": 0.1033, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.2311195582151413, + "rewards/margins": 0.2652646601200104, + "rewards/rejected": -0.49638423323631287, + "step": 1070 + }, + { + "epoch": 0.21, + "learning_rate": 4.8317099921835695e-06, + "logits/chosen": -1.4624329805374146, + "logits/rejected": -1.0136330127716064, + "logps/chosen": -571.1947631835938, + "logps/rejected": -1336.320556640625, + "loss": 0.067, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1917448490858078, + "rewards/margins": 0.34521591663360596, + "rewards/rejected": -0.536960780620575, + "step": 1080 + }, + { + "epoch": 0.21, + "learning_rate": 4.825662965967023e-06, + "logits/chosen": -1.6408259868621826, + "logits/rejected": -0.9534978866577148, + "logps/chosen": -721.38134765625, + "logps/rejected": -1417.104736328125, + "loss": 0.0623, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.19875575602054596, + "rewards/margins": 0.3017219007015228, + "rewards/rejected": -0.50047767162323, + "step": 1090 + }, + { + "epoch": 0.21, + "learning_rate": 4.819513128344814e-06, + "logits/chosen": -1.853732705116272, + "logits/rejected": -0.9827998280525208, + "logps/chosen": -554.1961669921875, + "logps/rejected": -1199.3424072265625, + "loss": 0.0617, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.12518128752708435, + "rewards/margins": 0.2828952968120575, + "rewards/rejected": -0.40807658433914185, + "step": 1100 + }, + { + "epoch": 0.21, + "learning_rate": 4.813260751184992e-06, + "logits/chosen": -1.7140512466430664, + "logits/rejected": -1.5034291744232178, + "logps/chosen": -469.31536865234375, + "logps/rejected": -1079.330810546875, + "loss": 0.1014, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.11440832912921906, + "rewards/margins": 0.23391124606132507, + "rewards/rejected": -0.34831956028938293, + "step": 1110 + }, + { + "epoch": 0.21, + "learning_rate": 4.806906110888606e-06, + "logits/chosen": -1.8567192554473877, + "logits/rejected": -1.186836838722229, + "logps/chosen": -660.9390258789062, + "logps/rejected": -1327.2064208984375, + "loss": 0.0649, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.17495737969875336, + "rewards/margins": 0.3085317015647888, + "rewards/rejected": -0.48348909616470337, + "step": 1120 + }, + { + "epoch": 0.22, + "learning_rate": 4.8004494883774885e-06, + "logits/chosen": -1.629417061805725, + "logits/rejected": -1.1237983703613281, + "logps/chosen": -658.7716674804688, + "logps/rejected": -1356.76220703125, + "loss": 0.075, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21607021987438202, + "rewards/margins": 0.29712003469467163, + "rewards/rejected": -0.5131902694702148, + "step": 1130 + }, + { + "epoch": 0.22, + "learning_rate": 4.793891169081835e-06, + "logits/chosen": -1.5540090799331665, + "logits/rejected": -1.0890731811523438, + "logps/chosen": -597.20654296875, + "logps/rejected": -1241.4404296875, + "loss": 0.0756, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1598585844039917, + "rewards/margins": 0.2764621376991272, + "rewards/rejected": -0.4363207221031189, + "step": 1140 + }, + { + "epoch": 0.22, + "learning_rate": 4.787231442927587e-06, + "logits/chosen": -1.7837997674942017, + "logits/rejected": -1.0015078783035278, + "logps/chosen": -753.8917846679688, + "logps/rejected": -1246.1300048828125, + "loss": 0.0881, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18574762344360352, + "rewards/margins": 0.2450966089963913, + "rewards/rejected": -0.430844247341156, + "step": 1150 + }, + { + "epoch": 0.22, + "learning_rate": 4.780470604323616e-06, + "logits/chosen": -1.8322811126708984, + "logits/rejected": -1.0309690237045288, + "logps/chosen": -657.1239013671875, + "logps/rejected": -1257.453857421875, + "loss": 0.0762, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.09944574534893036, + "rewards/margins": 0.27715909481048584, + "rewards/rejected": -0.3766048550605774, + "step": 1160 + }, + { + "epoch": 0.22, + "learning_rate": 4.773608952148706e-06, + "logits/chosen": -1.8025999069213867, + "logits/rejected": -1.1409608125686646, + "logps/chosen": -575.5861206054688, + "logps/rejected": -1183.916259765625, + "loss": 0.075, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.09301136434078217, + "rewards/margins": 0.26918086409568787, + "rewards/rejected": -0.36219221353530884, + "step": 1170 + }, + { + "epoch": 0.22, + "learning_rate": 4.766646789738342e-06, + "logits/chosen": -1.7869154214859009, + "logits/rejected": -1.3070907592773438, + "logps/chosen": -539.0377197265625, + "logps/rejected": -1107.183837890625, + "loss": 0.0632, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.0965101346373558, + "rewards/margins": 0.27286410331726074, + "rewards/rejected": -0.36937421560287476, + "step": 1180 + }, + { + "epoch": 0.23, + "learning_rate": 4.759584424871302e-06, + "logits/chosen": -1.773685097694397, + "logits/rejected": -1.0828640460968018, + "logps/chosen": -565.7281494140625, + "logps/rejected": -1151.1832275390625, + "loss": 0.068, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11723196506500244, + "rewards/margins": 0.280317485332489, + "rewards/rejected": -0.39754948019981384, + "step": 1190 + }, + { + "epoch": 0.23, + "learning_rate": 4.752422169756048e-06, + "logits/chosen": -1.7946689128875732, + "logits/rejected": -1.1525065898895264, + "logps/chosen": -652.74169921875, + "logps/rejected": -1309.117919921875, + "loss": 0.0831, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.17318835854530334, + "rewards/margins": 0.3087579309940338, + "rewards/rejected": -0.48194631934165955, + "step": 1200 + }, + { + "epoch": 0.23, + "learning_rate": 4.745160341016927e-06, + "logits/chosen": -1.865501046180725, + "logits/rejected": -1.3471348285675049, + "logps/chosen": -731.3184814453125, + "logps/rejected": -1188.284423828125, + "loss": 0.0972, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.20365187525749207, + "rewards/margins": 0.21393127739429474, + "rewards/rejected": -0.417583167552948, + "step": 1210 + }, + { + "epoch": 0.23, + "learning_rate": 4.737799259680172e-06, + "logits/chosen": -1.538688063621521, + "logits/rejected": -1.0417388677597046, + "logps/chosen": -677.1107177734375, + "logps/rejected": -1417.5760498046875, + "loss": 0.0571, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.2299385517835617, + "rewards/margins": 0.3572830557823181, + "rewards/rejected": -0.5872215628623962, + "step": 1220 + }, + { + "epoch": 0.23, + "learning_rate": 4.730339251159709e-06, + "logits/chosen": -1.6450903415679932, + "logits/rejected": -1.0749512910842896, + "logps/chosen": -807.4049072265625, + "logps/rejected": -1535.035400390625, + "loss": 0.0757, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.25353971123695374, + "rewards/margins": 0.3195953071117401, + "rewards/rejected": -0.5731350183486938, + "step": 1230 + }, + { + "epoch": 0.24, + "learning_rate": 4.722780645242775e-06, + "logits/chosen": -1.4446080923080444, + "logits/rejected": -0.9962978363037109, + "logps/chosen": -559.8865966796875, + "logps/rejected": -1117.860107421875, + "loss": 0.0944, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1235518679022789, + "rewards/margins": 0.2574634253978729, + "rewards/rejected": -0.3810153305530548, + "step": 1240 + }, + { + "epoch": 0.24, + "learning_rate": 4.715123776075337e-06, + "logits/chosen": -1.5536324977874756, + "logits/rejected": -0.9707902669906616, + "logps/chosen": -623.35302734375, + "logps/rejected": -1270.5560302734375, + "loss": 0.0802, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.16853728890419006, + "rewards/margins": 0.2629413604736328, + "rewards/rejected": -0.43147867918014526, + "step": 1250 + }, + { + "epoch": 0.24, + "learning_rate": 4.707368982147318e-06, + "logits/chosen": -1.609861969947815, + "logits/rejected": -1.2645909786224365, + "logps/chosen": -731.3851318359375, + "logps/rejected": -1379.0576171875, + "loss": 0.0645, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2083616703748703, + "rewards/margins": 0.3054686188697815, + "rewards/rejected": -0.513830304145813, + "step": 1260 + }, + { + "epoch": 0.24, + "learning_rate": 4.699516606277638e-06, + "logits/chosen": -1.5661576986312866, + "logits/rejected": -1.0933661460876465, + "logps/chosen": -635.5277099609375, + "logps/rejected": -1325.19384765625, + "loss": 0.0897, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16789881885051727, + "rewards/margins": 0.31640955805778503, + "rewards/rejected": -0.4843083322048187, + "step": 1270 + }, + { + "epoch": 0.24, + "learning_rate": 4.691566995599056e-06, + "logits/chosen": -1.6782423257827759, + "logits/rejected": -1.106381893157959, + "logps/chosen": -651.0316772460938, + "logps/rejected": -1290.894775390625, + "loss": 0.0708, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1785011887550354, + "rewards/margins": 0.28199997544288635, + "rewards/rejected": -0.46050113439559937, + "step": 1280 + }, + { + "epoch": 0.25, + "learning_rate": 4.683520501542825e-06, + "logits/chosen": -1.6150267124176025, + "logits/rejected": -1.025458812713623, + "logps/chosen": -732.90185546875, + "logps/rejected": -1455.954833984375, + "loss": 0.0649, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.24051375687122345, + "rewards/margins": 0.326476514339447, + "rewards/rejected": -0.5669902563095093, + "step": 1290 + }, + { + "epoch": 0.25, + "learning_rate": 4.675377479823153e-06, + "logits/chosen": -1.629797339439392, + "logits/rejected": -1.152362585067749, + "logps/chosen": -604.6630859375, + "logps/rejected": -1252.499755859375, + "loss": 0.0777, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15616537630558014, + "rewards/margins": 0.296053022146225, + "rewards/rejected": -0.4522184431552887, + "step": 1300 + }, + { + "epoch": 0.25, + "learning_rate": 4.667138290421483e-06, + "logits/chosen": -1.421653151512146, + "logits/rejected": -0.8815647959709167, + "logps/chosen": -626.5021362304688, + "logps/rejected": -1467.975830078125, + "loss": 0.0501, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.18186897039413452, + "rewards/margins": 0.39172568917274475, + "rewards/rejected": -0.5735946893692017, + "step": 1310 + }, + { + "epoch": 0.25, + "learning_rate": 4.658803297570578e-06, + "logits/chosen": -1.4821852445602417, + "logits/rejected": -1.1698157787322998, + "logps/chosen": -730.8109741210938, + "logps/rejected": -1356.880615234375, + "loss": 0.0802, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.2391328066587448, + "rewards/margins": 0.26806876063346863, + "rewards/rejected": -0.507201611995697, + "step": 1320 + }, + { + "epoch": 0.25, + "learning_rate": 4.650372869738415e-06, + "logits/chosen": -1.674965262413025, + "logits/rejected": -1.0977269411087036, + "logps/chosen": -595.2286987304688, + "logps/rejected": -1290.0439453125, + "loss": 0.059, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.17417269945144653, + "rewards/margins": 0.31828632950782776, + "rewards/rejected": -0.4924590587615967, + "step": 1330 + }, + { + "epoch": 0.26, + "learning_rate": 4.641847379611898e-06, + "logits/chosen": -1.5804319381713867, + "logits/rejected": -0.8383885622024536, + "logps/chosen": -729.4918823242188, + "logps/rejected": -1354.4481201171875, + "loss": 0.0658, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24894657731056213, + "rewards/margins": 0.2908107042312622, + "rewards/rejected": -0.5397573709487915, + "step": 1340 + }, + { + "epoch": 0.26, + "learning_rate": 4.633227204080389e-06, + "logits/chosen": -1.4710257053375244, + "logits/rejected": -0.9187448620796204, + "logps/chosen": -717.806396484375, + "logps/rejected": -1383.8284912109375, + "loss": 0.0692, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.22807364165782928, + "rewards/margins": 0.3141343891620636, + "rewards/rejected": -0.5422079563140869, + "step": 1350 + }, + { + "epoch": 0.26, + "learning_rate": 4.624512724219038e-06, + "logits/chosen": -1.8117561340332031, + "logits/rejected": -1.174837589263916, + "logps/chosen": -577.433349609375, + "logps/rejected": -1249.8837890625, + "loss": 0.0789, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.16873693466186523, + "rewards/margins": 0.3111843168735504, + "rewards/rejected": -0.47992125153541565, + "step": 1360 + }, + { + "epoch": 0.26, + "learning_rate": 4.6157043252719374e-06, + "logits/chosen": -1.5913151502609253, + "logits/rejected": -1.1274433135986328, + "logps/chosen": -622.65380859375, + "logps/rejected": -1108.6986083984375, + "loss": 0.0986, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.20124728977680206, + "rewards/margins": 0.2101028710603714, + "rewards/rejected": -0.41135016083717346, + "step": 1370 + }, + { + "epoch": 0.26, + "learning_rate": 4.606802396635098e-06, + "logits/chosen": -1.5497840642929077, + "logits/rejected": -1.1788917779922485, + "logps/chosen": -500.24700927734375, + "logps/rejected": -1297.15966796875, + "loss": 0.0654, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1291418969631195, + "rewards/margins": 0.3215458393096924, + "rewards/rejected": -0.4506877362728119, + "step": 1380 + }, + { + "epoch": 0.26, + "learning_rate": 4.597807331839229e-06, + "logits/chosen": -1.5772576332092285, + "logits/rejected": -1.1091996431350708, + "logps/chosen": -627.5196533203125, + "logps/rejected": -1284.9049072265625, + "loss": 0.071, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1847463697195053, + "rewards/margins": 0.2685849070549011, + "rewards/rejected": -0.4533312916755676, + "step": 1390 + }, + { + "epoch": 0.27, + "learning_rate": 4.588719528532342e-06, + "logits/chosen": -1.6495682001113892, + "logits/rejected": -1.0364662408828735, + "logps/chosen": -684.3934326171875, + "logps/rejected": -1204.6859130859375, + "loss": 0.0848, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.20700335502624512, + "rewards/margins": 0.23912009596824646, + "rewards/rejected": -0.4461234509944916, + "step": 1400 + }, + { + "epoch": 0.27, + "learning_rate": 4.5795393884621735e-06, + "logits/chosen": -1.829358696937561, + "logits/rejected": -1.0703871250152588, + "logps/chosen": -587.6880493164062, + "logps/rejected": -1229.123291015625, + "loss": 0.0504, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.13737113773822784, + "rewards/margins": 0.3153493106365204, + "rewards/rejected": -0.4527204632759094, + "step": 1410 + }, + { + "epoch": 0.27, + "learning_rate": 4.5702673174584236e-06, + "logits/chosen": -1.5375653505325317, + "logits/rejected": -1.086010456085205, + "logps/chosen": -600.8570556640625, + "logps/rejected": -1225.86962890625, + "loss": 0.1055, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1756105124950409, + "rewards/margins": 0.2715519964694977, + "rewards/rejected": -0.44716253876686096, + "step": 1420 + }, + { + "epoch": 0.27, + "learning_rate": 4.560903725414816e-06, + "logits/chosen": -1.8195488452911377, + "logits/rejected": -1.128953456878662, + "logps/chosen": -594.7276611328125, + "logps/rejected": -1118.1014404296875, + "loss": 0.0733, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1236497312784195, + "rewards/margins": 0.26376843452453613, + "rewards/rejected": -0.3874182105064392, + "step": 1430 + }, + { + "epoch": 0.27, + "learning_rate": 4.551449026270979e-06, + "logits/chosen": -1.4914960861206055, + "logits/rejected": -0.9787286520004272, + "logps/chosen": -583.1992797851562, + "logps/rejected": -1058.70068359375, + "loss": 0.155, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.13674364984035492, + "rewards/margins": 0.1916511058807373, + "rewards/rejected": -0.3283947706222534, + "step": 1440 + }, + { + "epoch": 0.28, + "learning_rate": 4.541903637994142e-06, + "logits/chosen": -1.7652862071990967, + "logits/rejected": -1.3571741580963135, + "logps/chosen": -633.3842163085938, + "logps/rejected": -1270.315185546875, + "loss": 0.0713, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.22607281804084778, + "rewards/margins": 0.3079972565174103, + "rewards/rejected": -0.5340700745582581, + "step": 1450 + }, + { + "epoch": 0.28, + "learning_rate": 4.532267982560662e-06, + "logits/chosen": -1.750309705734253, + "logits/rejected": -1.1464341878890991, + "logps/chosen": -733.6136474609375, + "logps/rejected": -1293.258544921875, + "loss": 0.0863, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2598066031932831, + "rewards/margins": 0.2805883288383484, + "rewards/rejected": -0.5403949618339539, + "step": 1460 + }, + { + "epoch": 0.28, + "learning_rate": 4.522542485937369e-06, + "logits/chosen": -1.6304270029067993, + "logits/rejected": -1.1222264766693115, + "logps/chosen": -736.6213989257812, + "logps/rejected": -1387.48876953125, + "loss": 0.0862, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2621721923351288, + "rewards/margins": 0.28952568769454956, + "rewards/rejected": -0.5516979098320007, + "step": 1470 + }, + { + "epoch": 0.28, + "learning_rate": 4.512727578062733e-06, + "logits/chosen": -1.546983003616333, + "logits/rejected": -1.1147258281707764, + "logps/chosen": -720.029296875, + "logps/rejected": -1342.5504150390625, + "loss": 0.0939, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.22671222686767578, + "rewards/margins": 0.26174527406692505, + "rewards/rejected": -0.48845750093460083, + "step": 1480 + }, + { + "epoch": 0.28, + "learning_rate": 4.502823692827859e-06, + "logits/chosen": -1.6339771747589111, + "logits/rejected": -0.8447777628898621, + "logps/chosen": -591.8370361328125, + "logps/rejected": -1214.6041259765625, + "loss": 0.0648, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.13122068345546722, + "rewards/margins": 0.29385435581207275, + "rewards/rejected": -0.4250749945640564, + "step": 1490 + }, + { + "epoch": 0.29, + "learning_rate": 4.492831268057307e-06, + "logits/chosen": -1.5530567169189453, + "logits/rejected": -1.39444899559021, + "logps/chosen": -579.3165283203125, + "logps/rejected": -1225.831298828125, + "loss": 0.0804, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.17402389645576477, + "rewards/margins": 0.24063608050346375, + "rewards/rejected": -0.4146599769592285, + "step": 1500 + }, + { + "epoch": 0.29, + "learning_rate": 4.482750745489733e-06, + "logits/chosen": -1.5297390222549438, + "logits/rejected": -1.2879207134246826, + "logps/chosen": -587.7469482421875, + "logps/rejected": -1158.90283203125, + "loss": 0.0884, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1852291375398636, + "rewards/margins": 0.24651098251342773, + "rewards/rejected": -0.4317401051521301, + "step": 1510 + }, + { + "epoch": 0.29, + "learning_rate": 4.472582570758367e-06, + "logits/chosen": -1.6017887592315674, + "logits/rejected": -1.2658944129943848, + "logps/chosen": -623.4454956054688, + "logps/rejected": -1206.4208984375, + "loss": 0.1038, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1995050460100174, + "rewards/margins": 0.25444871187210083, + "rewards/rejected": -0.4539538025856018, + "step": 1520 + }, + { + "epoch": 0.29, + "learning_rate": 4.4623271933713065e-06, + "logits/chosen": -1.792682409286499, + "logits/rejected": -0.9171991348266602, + "logps/chosen": -787.0244750976562, + "logps/rejected": -1522.90185546875, + "loss": 0.055, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.226217120885849, + "rewards/margins": 0.340124249458313, + "rewards/rejected": -0.5663414001464844, + "step": 1530 + }, + { + "epoch": 0.29, + "learning_rate": 4.451985066691649e-06, + "logits/chosen": -1.338421106338501, + "logits/rejected": -0.9744964838027954, + "logps/chosen": -611.8841552734375, + "logps/rejected": -1370.3062744140625, + "loss": 0.0597, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1973779797554016, + "rewards/margins": 0.29491350054740906, + "rewards/rejected": -0.49229151010513306, + "step": 1540 + }, + { + "epoch": 0.3, + "learning_rate": 4.441556647917447e-06, + "logits/chosen": -1.5752965211868286, + "logits/rejected": -1.1147037744522095, + "logps/chosen": -590.4030151367188, + "logps/rejected": -1291.9884033203125, + "loss": 0.0729, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15929007530212402, + "rewards/margins": 0.3151244521141052, + "rewards/rejected": -0.47441449761390686, + "step": 1550 + }, + { + "epoch": 0.3, + "learning_rate": 4.431042398061499e-06, + "logits/chosen": -1.5449073314666748, + "logits/rejected": -0.9329763650894165, + "logps/chosen": -652.1185302734375, + "logps/rejected": -1286.4637451171875, + "loss": 0.0574, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20002980530261993, + "rewards/margins": 0.2901848256587982, + "rewards/rejected": -0.49021464586257935, + "step": 1560 + }, + { + "epoch": 0.3, + "learning_rate": 4.420442781930971e-06, + "logits/chosen": -1.851266622543335, + "logits/rejected": -1.2136085033416748, + "logps/chosen": -671.2997436523438, + "logps/rejected": -1213.92236328125, + "loss": 0.0909, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.226095512509346, + "rewards/margins": 0.27147597074508667, + "rewards/rejected": -0.4975714683532715, + "step": 1570 + }, + { + "epoch": 0.3, + "learning_rate": 4.409758268106842e-06, + "logits/chosen": -1.6012693643569946, + "logits/rejected": -1.1103239059448242, + "logps/chosen": -609.9935302734375, + "logps/rejected": -1086.2857666015625, + "loss": 0.1148, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.16556409001350403, + "rewards/margins": 0.2218482941389084, + "rewards/rejected": -0.3874123990535736, + "step": 1580 + }, + { + "epoch": 0.3, + "learning_rate": 4.398989328923196e-06, + "logits/chosen": -1.4557781219482422, + "logits/rejected": -1.044471025466919, + "logps/chosen": -617.2952880859375, + "logps/rejected": -1107.7027587890625, + "loss": 0.1086, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1871017962694168, + "rewards/margins": 0.2236325442790985, + "rewards/rejected": -0.4107343256473541, + "step": 1590 + }, + { + "epoch": 0.3, + "learning_rate": 4.388136440446338e-06, + "logits/chosen": -1.918044090270996, + "logits/rejected": -1.3620249032974243, + "logps/chosen": -643.0921630859375, + "logps/rejected": -1395.455810546875, + "loss": 0.0469, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15777665376663208, + "rewards/margins": 0.32988977432250977, + "rewards/rejected": -0.48766642808914185, + "step": 1600 + }, + { + "epoch": 0.31, + "learning_rate": 4.377200082453748e-06, + "logits/chosen": -1.5242416858673096, + "logits/rejected": -1.0205128192901611, + "logps/chosen": -644.7546997070312, + "logps/rejected": -1393.016357421875, + "loss": 0.0469, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.18372926115989685, + "rewards/margins": 0.308932900428772, + "rewards/rejected": -0.49266213178634644, + "step": 1610 + }, + { + "epoch": 0.31, + "learning_rate": 4.366180738412876e-06, + "logits/chosen": -1.5953199863433838, + "logits/rejected": -1.0117969512939453, + "logps/chosen": -865.4313354492188, + "logps/rejected": -1511.0732421875, + "loss": 0.0603, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.3143005073070526, + "rewards/margins": 0.29054003953933716, + "rewards/rejected": -0.6048405766487122, + "step": 1620 + }, + { + "epoch": 0.31, + "learning_rate": 4.355078895459761e-06, + "logits/chosen": -1.5991640090942383, + "logits/rejected": -1.0942062139511108, + "logps/chosen": -669.2398681640625, + "logps/rejected": -1328.014404296875, + "loss": 0.0759, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2610950469970703, + "rewards/margins": 0.30377668142318726, + "rewards/rejected": -0.5648717880249023, + "step": 1630 + }, + { + "epoch": 0.31, + "learning_rate": 4.343895044377504e-06, + "logits/chosen": -1.5109912157058716, + "logits/rejected": -1.0822455883026123, + "logps/chosen": -692.4320068359375, + "logps/rejected": -1503.792236328125, + "loss": 0.0686, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1996488720178604, + "rewards/margins": 0.35995012521743774, + "rewards/rejected": -0.5595989227294922, + "step": 1640 + }, + { + "epoch": 0.31, + "learning_rate": 4.332629679574566e-06, + "logits/chosen": -1.8324410915374756, + "logits/rejected": -1.1902332305908203, + "logps/chosen": -566.93603515625, + "logps/rejected": -1152.119384765625, + "loss": 0.0927, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08348747342824936, + "rewards/margins": 0.2730974853038788, + "rewards/rejected": -0.35658496618270874, + "step": 1650 + }, + { + "epoch": 0.32, + "learning_rate": 4.321283299062916e-06, + "logits/chosen": -1.6973434686660767, + "logits/rejected": -1.0856391191482544, + "logps/chosen": -560.3250122070312, + "logps/rejected": -1092.025146484375, + "loss": 0.0841, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.09070460498332977, + "rewards/margins": 0.25446048378944397, + "rewards/rejected": -0.34516507387161255, + "step": 1660 + }, + { + "epoch": 0.32, + "learning_rate": 4.309856404436013e-06, + "logits/chosen": -1.6770203113555908, + "logits/rejected": -1.1826202869415283, + "logps/chosen": -546.0051879882812, + "logps/rejected": -1200.4798583984375, + "loss": 0.0628, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.08990595489740372, + "rewards/margins": 0.2899653911590576, + "rewards/rejected": -0.3798713982105255, + "step": 1670 + }, + { + "epoch": 0.32, + "learning_rate": 4.2983495008466285e-06, + "logits/chosen": -1.7677803039550781, + "logits/rejected": -1.1159054040908813, + "logps/chosen": -704.0637817382812, + "logps/rejected": -1257.2755126953125, + "loss": 0.0765, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.12143462896347046, + "rewards/margins": 0.2974852919578552, + "rewards/rejected": -0.4189198911190033, + "step": 1680 + }, + { + "epoch": 0.32, + "learning_rate": 4.2867630969845235e-06, + "logits/chosen": -1.4960908889770508, + "logits/rejected": -1.0935460329055786, + "logps/chosen": -531.4020385742188, + "logps/rejected": -1232.95166015625, + "loss": 0.0676, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.07758744060993195, + "rewards/margins": 0.30141371488571167, + "rewards/rejected": -0.3790012001991272, + "step": 1690 + }, + { + "epoch": 0.32, + "learning_rate": 4.275097705053951e-06, + "logits/chosen": -1.3998308181762695, + "logits/rejected": -0.9511981010437012, + "logps/chosen": -518.6180419921875, + "logps/rejected": -1279.418701171875, + "loss": 0.09, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.11437869071960449, + "rewards/margins": 0.31818702816963196, + "rewards/rejected": -0.43256568908691406, + "step": 1700 + }, + { + "epoch": 0.33, + "learning_rate": 4.263353840751023e-06, + "logits/chosen": -1.6988664865493774, + "logits/rejected": -1.3357477188110352, + "logps/chosen": -619.7531127929688, + "logps/rejected": -1278.7855224609375, + "loss": 0.0881, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15841639041900635, + "rewards/margins": 0.2583143413066864, + "rewards/rejected": -0.41673073172569275, + "step": 1710 + }, + { + "epoch": 0.33, + "learning_rate": 4.251532023240901e-06, + "logits/chosen": -1.6375110149383545, + "logits/rejected": -1.2145359516143799, + "logps/chosen": -635.3146362304688, + "logps/rejected": -1267.1383056640625, + "loss": 0.094, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1674066036939621, + "rewards/margins": 0.26885929703712463, + "rewards/rejected": -0.43626588582992554, + "step": 1720 + }, + { + "epoch": 0.33, + "learning_rate": 4.239632775134857e-06, + "logits/chosen": -1.4710214138031006, + "logits/rejected": -0.9805693626403809, + "logps/chosen": -580.4301147460938, + "logps/rejected": -1319.4400634765625, + "loss": 0.0685, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1512683480978012, + "rewards/margins": 0.29545730352401733, + "rewards/rejected": -0.44672566652297974, + "step": 1730 + }, + { + "epoch": 0.33, + "learning_rate": 4.227656622467162e-06, + "logits/chosen": -1.745552659034729, + "logits/rejected": -1.2884304523468018, + "logps/chosen": -468.9326171875, + "logps/rejected": -1042.484619140625, + "loss": 0.0868, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07101878523826599, + "rewards/margins": 0.23783501982688904, + "rewards/rejected": -0.30885380506515503, + "step": 1740 + }, + { + "epoch": 0.33, + "learning_rate": 4.215604094671835e-06, + "logits/chosen": -1.6640018224716187, + "logits/rejected": -0.9551274180412292, + "logps/chosen": -606.9476318359375, + "logps/rejected": -1325.010986328125, + "loss": 0.0545, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.10354119539260864, + "rewards/margins": 0.33232301473617554, + "rewards/rejected": -0.4358642101287842, + "step": 1750 + }, + { + "epoch": 0.34, + "learning_rate": 4.203475724559235e-06, + "logits/chosen": -1.4627963304519653, + "logits/rejected": -1.1045763492584229, + "logps/chosen": -531.2335815429688, + "logps/rejected": -1144.1085205078125, + "loss": 0.0791, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.13256287574768066, + "rewards/margins": 0.2571134865283966, + "rewards/rejected": -0.3896763324737549, + "step": 1760 + }, + { + "epoch": 0.34, + "learning_rate": 4.191272048292514e-06, + "logits/chosen": -1.7131484746932983, + "logits/rejected": -1.199053406715393, + "logps/chosen": -495.76568603515625, + "logps/rejected": -1074.788818359375, + "loss": 0.0709, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.10537634044885635, + "rewards/margins": 0.2483271062374115, + "rewards/rejected": -0.35370343923568726, + "step": 1770 + }, + { + "epoch": 0.34, + "learning_rate": 4.178993605363904e-06, + "logits/chosen": -1.7240636348724365, + "logits/rejected": -0.839851975440979, + "logps/chosen": -684.6185302734375, + "logps/rejected": -1250.919189453125, + "loss": 0.0881, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1622859537601471, + "rewards/margins": 0.2836538851261139, + "rewards/rejected": -0.4459398686885834, + "step": 1780 + }, + { + "epoch": 0.34, + "learning_rate": 4.166640938570879e-06, + "logits/chosen": -1.6593945026397705, + "logits/rejected": -0.8714178204536438, + "logps/chosen": -645.1300048828125, + "logps/rejected": -1271.795166015625, + "loss": 0.0606, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.11708177626132965, + "rewards/margins": 0.3288155198097229, + "rewards/rejected": -0.44589725136756897, + "step": 1790 + }, + { + "epoch": 0.34, + "learning_rate": 4.154214593992149e-06, + "logits/chosen": -1.490355134010315, + "logits/rejected": -0.9209915399551392, + "logps/chosen": -571.5134887695312, + "logps/rejected": -1043.43798828125, + "loss": 0.0922, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11676846444606781, + "rewards/margins": 0.24484193325042725, + "rewards/rejected": -0.36161044239997864, + "step": 1800 + }, + { + "epoch": 0.34, + "learning_rate": 4.1417151209635265e-06, + "logits/chosen": -1.558685064315796, + "logits/rejected": -1.095737099647522, + "logps/chosen": -625.5676879882812, + "logps/rejected": -1183.7132568359375, + "loss": 0.0791, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1728230118751526, + "rewards/margins": 0.2501334547996521, + "rewards/rejected": -0.4229564070701599, + "step": 1810 + }, + { + "epoch": 0.35, + "learning_rate": 4.129143072053639e-06, + "logits/chosen": -1.5196239948272705, + "logits/rejected": -0.9427810907363892, + "logps/chosen": -717.9283447265625, + "logps/rejected": -1308.8529052734375, + "loss": 0.0855, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.19684462249279022, + "rewards/margins": 0.2644059956073761, + "rewards/rejected": -0.4612506330013275, + "step": 1820 + }, + { + "epoch": 0.35, + "learning_rate": 4.116499003039499e-06, + "logits/chosen": -1.5101161003112793, + "logits/rejected": -1.0998636484146118, + "logps/chosen": -608.6380615234375, + "logps/rejected": -1254.279541015625, + "loss": 0.0776, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.15637364983558655, + "rewards/margins": 0.2847002148628235, + "rewards/rejected": -0.44107383489608765, + "step": 1830 + }, + { + "epoch": 0.35, + "learning_rate": 4.103783472881942e-06, + "logits/chosen": -1.4874773025512695, + "logits/rejected": -0.9665767550468445, + "logps/chosen": -609.3054809570312, + "logps/rejected": -1297.626708984375, + "loss": 0.0808, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1461195945739746, + "rewards/margins": 0.28892236948013306, + "rewards/rejected": -0.4350419044494629, + "step": 1840 + }, + { + "epoch": 0.35, + "learning_rate": 4.0909970437009094e-06, + "logits/chosen": -1.8195278644561768, + "logits/rejected": -1.0763853788375854, + "logps/chosen": -610.7991333007812, + "logps/rejected": -1304.9736328125, + "loss": 0.0679, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.13620565831661224, + "rewards/margins": 0.3199845850467682, + "rewards/rejected": -0.4561902582645416, + "step": 1850 + }, + { + "epoch": 0.35, + "learning_rate": 4.078140280750598e-06, + "logits/chosen": -1.6017236709594727, + "logits/rejected": -1.0406373739242554, + "logps/chosen": -633.7638549804688, + "logps/rejected": -1135.013671875, + "loss": 0.131, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1918884664773941, + "rewards/margins": 0.23026582598686218, + "rewards/rejected": -0.42215433716773987, + "step": 1860 + }, + { + "epoch": 0.36, + "learning_rate": 4.065213752394478e-06, + "logits/chosen": -1.5482735633850098, + "logits/rejected": -0.8328466415405273, + "logps/chosen": -705.640625, + "logps/rejected": -1373.3658447265625, + "loss": 0.0794, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.21625776588916779, + "rewards/margins": 0.29595327377319336, + "rewards/rejected": -0.5122110247612, + "step": 1870 + }, + { + "epoch": 0.36, + "learning_rate": 4.052218030080162e-06, + "logits/chosen": -1.3692519664764404, + "logits/rejected": -0.972745418548584, + "logps/chosen": -606.39501953125, + "logps/rejected": -1162.306640625, + "loss": 0.0954, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.20994558930397034, + "rewards/margins": 0.2586881220340729, + "rewards/rejected": -0.46863365173339844, + "step": 1880 + }, + { + "epoch": 0.36, + "learning_rate": 4.039153688314146e-06, + "logits/chosen": -1.484716773033142, + "logits/rejected": -0.7318023443222046, + "logps/chosen": -576.1600341796875, + "logps/rejected": -1257.232666015625, + "loss": 0.0475, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.10188218206167221, + "rewards/margins": 0.3267883360385895, + "rewards/rejected": -0.4286704659461975, + "step": 1890 + }, + { + "epoch": 0.36, + "learning_rate": 4.026021304636408e-06, + "logits/chosen": -1.5514335632324219, + "logits/rejected": -0.8337615132331848, + "logps/chosen": -563.2936401367188, + "logps/rejected": -1291.733154296875, + "loss": 0.0387, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.1273859441280365, + "rewards/margins": 0.3672037720680237, + "rewards/rejected": -0.4945897161960602, + "step": 1900 + }, + { + "epoch": 0.36, + "learning_rate": 4.012821459594881e-06, + "logits/chosen": -1.5965911149978638, + "logits/rejected": -0.8250566720962524, + "logps/chosen": -533.0094604492188, + "logps/rejected": -1278.1763916015625, + "loss": 0.0492, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.12176971137523651, + "rewards/margins": 0.35596519708633423, + "rewards/rejected": -0.47773489356040955, + "step": 1910 + }, + { + "epoch": 0.37, + "learning_rate": 3.999554736719785e-06, + "logits/chosen": -1.5723917484283447, + "logits/rejected": -1.016540288925171, + "logps/chosen": -640.8189697265625, + "logps/rejected": -1321.976318359375, + "loss": 0.0647, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1868075430393219, + "rewards/margins": 0.2953697144985199, + "rewards/rejected": -0.4821773171424866, + "step": 1920 + }, + { + "epoch": 0.37, + "learning_rate": 3.986221722497832e-06, + "logits/chosen": -1.5826764106750488, + "logits/rejected": -0.9718710780143738, + "logps/chosen": -663.3128051757812, + "logps/rejected": -1344.0526123046875, + "loss": 0.0816, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.19100730121135712, + "rewards/margins": 0.3145214021205902, + "rewards/rejected": -0.5055286884307861, + "step": 1930 + }, + { + "epoch": 0.37, + "learning_rate": 3.9728230063463e-06, + "logits/chosen": -1.5128819942474365, + "logits/rejected": -1.130190372467041, + "logps/chosen": -516.5947265625, + "logps/rejected": -1127.3963623046875, + "loss": 0.0724, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.15435567498207092, + "rewards/margins": 0.2678048312664032, + "rewards/rejected": -0.42216047644615173, + "step": 1940 + }, + { + "epoch": 0.37, + "learning_rate": 3.9593591805869755e-06, + "logits/chosen": -1.5318098068237305, + "logits/rejected": -0.9274725914001465, + "logps/chosen": -549.1895751953125, + "logps/rejected": -1094.859130859375, + "loss": 0.0795, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1255248486995697, + "rewards/margins": 0.26419132947921753, + "rewards/rejected": -0.38971614837646484, + "step": 1950 + }, + { + "epoch": 0.37, + "learning_rate": 3.945830840419966e-06, + "logits/chosen": -1.7480872869491577, + "logits/rejected": -0.9923027753829956, + "logps/chosen": -624.6234130859375, + "logps/rejected": -1274.6939697265625, + "loss": 0.0874, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14473918080329895, + "rewards/margins": 0.3186899721622467, + "rewards/rejected": -0.46342915296554565, + "step": 1960 + }, + { + "epoch": 0.38, + "learning_rate": 3.932238583897395e-06, + "logits/chosen": -1.560272455215454, + "logits/rejected": -0.9629675149917603, + "logps/chosen": -711.4615478515625, + "logps/rejected": -1165.3927001953125, + "loss": 0.1287, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.20542404055595398, + "rewards/margins": 0.2213752716779709, + "rewards/rejected": -0.42679935693740845, + "step": 1970 + }, + { + "epoch": 0.38, + "learning_rate": 3.918583011896955e-06, + "logits/chosen": -1.5640978813171387, + "logits/rejected": -1.0945550203323364, + "logps/chosen": -573.6983032226562, + "logps/rejected": -1243.242431640625, + "loss": 0.0838, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.12780186533927917, + "rewards/margins": 0.28756803274154663, + "rewards/rejected": -0.4153698980808258, + "step": 1980 + }, + { + "epoch": 0.38, + "learning_rate": 3.904864728095349e-06, + "logits/chosen": -1.6314102411270142, + "logits/rejected": -1.1875035762786865, + "logps/chosen": -564.3800659179688, + "logps/rejected": -1137.12744140625, + "loss": 0.078, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.07501459121704102, + "rewards/margins": 0.2666712701320648, + "rewards/rejected": -0.34168586134910583, + "step": 1990 + }, + { + "epoch": 0.38, + "learning_rate": 3.891084338941603e-06, + "logits/chosen": -1.6454538106918335, + "logits/rejected": -0.9986695051193237, + "logps/chosen": -591.3613891601562, + "logps/rejected": -1145.727783203125, + "loss": 0.0738, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09405544400215149, + "rewards/margins": 0.28904372453689575, + "rewards/rejected": -0.38309913873672485, + "step": 2000 + }, + { + "epoch": 0.38, + "learning_rate": 3.8772424536302565e-06, + "logits/chosen": -1.5474951267242432, + "logits/rejected": -0.8243762850761414, + "logps/chosen": -535.3194580078125, + "logps/rejected": -1299.9228515625, + "loss": 0.065, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10869929939508438, + "rewards/margins": 0.3450026512145996, + "rewards/rejected": -0.453701913356781, + "step": 2010 + }, + { + "epoch": 0.38, + "learning_rate": 3.863339684074432e-06, + "logits/chosen": -1.6806402206420898, + "logits/rejected": -1.0863831043243408, + "logps/chosen": -759.7283935546875, + "logps/rejected": -1388.066162109375, + "loss": 0.0695, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2040364295244217, + "rewards/margins": 0.2842352092266083, + "rewards/rejected": -0.48827165365219116, + "step": 2020 + }, + { + "epoch": 0.39, + "learning_rate": 3.849376644878783e-06, + "logits/chosen": -1.3155741691589355, + "logits/rejected": -1.046635627746582, + "logps/chosen": -516.9121704101562, + "logps/rejected": -1351.8599853515625, + "loss": 0.0422, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.12792593240737915, + "rewards/margins": 0.3376065790653229, + "rewards/rejected": -0.4655325412750244, + "step": 2030 + }, + { + "epoch": 0.39, + "learning_rate": 3.835353953312322e-06, + "logits/chosen": -1.5681509971618652, + "logits/rejected": -1.044856071472168, + "logps/chosen": -618.40576171875, + "logps/rejected": -1278.9957275390625, + "loss": 0.0497, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.10632064193487167, + "rewards/margins": 0.31042590737342834, + "rewards/rejected": -0.4167465269565582, + "step": 2040 + }, + { + "epoch": 0.39, + "learning_rate": 3.821272229281139e-06, + "logits/chosen": -1.7507911920547485, + "logits/rejected": -1.2355482578277588, + "logps/chosen": -502.3592224121094, + "logps/rejected": -1111.670654296875, + "loss": 0.1022, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0735398381948471, + "rewards/margins": 0.29525768756866455, + "rewards/rejected": -0.36879754066467285, + "step": 2050 + }, + { + "epoch": 0.39, + "learning_rate": 3.8071320953009906e-06, + "logits/chosen": -1.7723318338394165, + "logits/rejected": -0.853528618812561, + "logps/chosen": -646.585205078125, + "logps/rejected": -1187.48291015625, + "loss": 0.1126, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.11552548408508301, + "rewards/margins": 0.2491040676832199, + "rewards/rejected": -0.3646295666694641, + "step": 2060 + }, + { + "epoch": 0.39, + "learning_rate": 3.792934176469782e-06, + "logits/chosen": -1.5255523920059204, + "logits/rejected": -1.0040549039840698, + "logps/chosen": -517.2431640625, + "logps/rejected": -1240.179931640625, + "loss": 0.0637, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.12137254327535629, + "rewards/margins": 0.30170968174934387, + "rewards/rejected": -0.42308226227760315, + "step": 2070 + }, + { + "epoch": 0.4, + "learning_rate": 3.7786791004399353e-06, + "logits/chosen": -1.5424957275390625, + "logits/rejected": -0.9886695742607117, + "logps/chosen": -652.4849243164062, + "logps/rejected": -1332.295654296875, + "loss": 0.0549, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16771355271339417, + "rewards/margins": 0.30935198068618774, + "rewards/rejected": -0.4770655035972595, + "step": 2080 + }, + { + "epoch": 0.4, + "learning_rate": 3.764367497390642e-06, + "logits/chosen": -1.5075163841247559, + "logits/rejected": -0.9398325681686401, + "logps/chosen": -762.2354125976562, + "logps/rejected": -1400.30126953125, + "loss": 0.0722, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.22768834233283997, + "rewards/margins": 0.2723497748374939, + "rewards/rejected": -0.5000380873680115, + "step": 2090 + }, + { + "epoch": 0.4, + "learning_rate": 3.7500000000000005e-06, + "logits/chosen": -1.7654527425765991, + "logits/rejected": -0.8684778213500977, + "logps/chosen": -616.1461181640625, + "logps/rejected": -1208.4818115234375, + "loss": 0.0745, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.15020321309566498, + "rewards/margins": 0.31690362095832825, + "rewards/rejected": -0.46710681915283203, + "step": 2100 + }, + { + "epoch": 0.4, + "learning_rate": 3.7355772434170523e-06, + "logits/chosen": -1.4072939157485962, + "logits/rejected": -0.8714143633842468, + "logps/chosen": -672.609375, + "logps/rejected": -1258.9796142578125, + "loss": 0.0808, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.19290420413017273, + "rewards/margins": 0.28400301933288574, + "rewards/rejected": -0.4769071936607361, + "step": 2110 + }, + { + "epoch": 0.4, + "learning_rate": 3.7210998652337016e-06, + "logits/chosen": -1.7671144008636475, + "logits/rejected": -1.125576376914978, + "logps/chosen": -663.3321533203125, + "logps/rejected": -1265.360595703125, + "loss": 0.1121, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1786895990371704, + "rewards/margins": 0.2822434902191162, + "rewards/rejected": -0.46093305945396423, + "step": 2120 + }, + { + "epoch": 0.41, + "learning_rate": 3.7065685054565277e-06, + "logits/chosen": -1.755200743675232, + "logits/rejected": -0.9497678875923157, + "logps/chosen": -624.0568237304688, + "logps/rejected": -1243.8646240234375, + "loss": 0.0712, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1317531168460846, + "rewards/margins": 0.28538697957992554, + "rewards/rejected": -0.41714009642601013, + "step": 2130 + }, + { + "epoch": 0.41, + "learning_rate": 3.691983806478494e-06, + "logits/chosen": -1.3938941955566406, + "logits/rejected": -0.9991312026977539, + "logps/chosen": -609.2355346679688, + "logps/rejected": -1163.9844970703125, + "loss": 0.1083, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1537613570690155, + "rewards/margins": 0.22227540612220764, + "rewards/rejected": -0.37603679299354553, + "step": 2140 + }, + { + "epoch": 0.41, + "learning_rate": 3.677346413050551e-06, + "logits/chosen": -1.7743680477142334, + "logits/rejected": -0.8094174265861511, + "logps/chosen": -645.5819091796875, + "logps/rejected": -1326.28369140625, + "loss": 0.0626, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.17391979694366455, + "rewards/margins": 0.29295578598976135, + "rewards/rejected": -0.4668755531311035, + "step": 2150 + }, + { + "epoch": 0.41, + "learning_rate": 3.6626569722531268e-06, + "logits/chosen": -1.8744360208511353, + "logits/rejected": -1.0872384309768677, + "logps/chosen": -745.5087890625, + "logps/rejected": -1093.416748046875, + "loss": 0.1225, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2028745710849762, + "rewards/margins": 0.18700726330280304, + "rewards/rejected": -0.38988178968429565, + "step": 2160 + }, + { + "epoch": 0.41, + "learning_rate": 3.6479161334675294e-06, + "logits/chosen": -1.374147653579712, + "logits/rejected": -0.8410941362380981, + "logps/chosen": -594.9989624023438, + "logps/rejected": -1234.977294921875, + "loss": 0.0598, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11717710644006729, + "rewards/margins": 0.306627482175827, + "rewards/rejected": -0.4238045811653137, + "step": 2170 + }, + { + "epoch": 0.42, + "learning_rate": 3.6331245483472353e-06, + "logits/chosen": -1.2791547775268555, + "logits/rejected": -1.0032517910003662, + "logps/chosen": -657.0687255859375, + "logps/rejected": -1368.2298583984375, + "loss": 0.0674, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.18623146414756775, + "rewards/margins": 0.30436015129089355, + "rewards/rejected": -0.4905916750431061, + "step": 2180 + }, + { + "epoch": 0.42, + "learning_rate": 3.6182828707890816e-06, + "logits/chosen": -1.5984619855880737, + "logits/rejected": -1.1988598108291626, + "logps/chosen": -685.6885375976562, + "logps/rejected": -1425.386474609375, + "loss": 0.0668, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.204415962100029, + "rewards/margins": 0.3183789551258087, + "rewards/rejected": -0.5227949023246765, + "step": 2190 + }, + { + "epoch": 0.42, + "learning_rate": 3.6033917569043604e-06, + "logits/chosen": -1.4690712690353394, + "logits/rejected": -1.0461918115615845, + "logps/chosen": -603.8146362304688, + "logps/rejected": -1344.2086181640625, + "loss": 0.066, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.21491065621376038, + "rewards/margins": 0.3031674027442932, + "rewards/rejected": -0.5180780291557312, + "step": 2200 + }, + { + "epoch": 0.42, + "learning_rate": 3.588451864989811e-06, + "logits/chosen": -1.5433639287948608, + "logits/rejected": -1.1572163105010986, + "logps/chosen": -604.7794189453125, + "logps/rejected": -1369.9986572265625, + "loss": 0.0576, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.17777907848358154, + "rewards/margins": 0.33150824904441833, + "rewards/rejected": -0.5092872977256775, + "step": 2210 + }, + { + "epoch": 0.42, + "learning_rate": 3.5734638554985234e-06, + "logits/chosen": -1.6556087732315063, + "logits/rejected": -0.890389084815979, + "logps/chosen": -683.4158935546875, + "logps/rejected": -1303.145263671875, + "loss": 0.0646, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.18300040066242218, + "rewards/margins": 0.3182368576526642, + "rewards/rejected": -0.5012372136116028, + "step": 2220 + }, + { + "epoch": 0.42, + "learning_rate": 3.5584283910107343e-06, + "logits/chosen": -1.6998045444488525, + "logits/rejected": -0.9639657139778137, + "logps/chosen": -666.9964599609375, + "logps/rejected": -1350.66357421875, + "loss": 0.0643, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1677488386631012, + "rewards/margins": 0.34021884202957153, + "rewards/rejected": -0.5079677104949951, + "step": 2230 + }, + { + "epoch": 0.43, + "learning_rate": 3.543346136204545e-06, + "logits/chosen": -1.533036231994629, + "logits/rejected": -1.0550358295440674, + "logps/chosen": -673.4791259765625, + "logps/rejected": -1482.1470947265625, + "loss": 0.0514, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.1976957768201828, + "rewards/margins": 0.3370071351528168, + "rewards/rejected": -0.5347028970718384, + "step": 2240 + }, + { + "epoch": 0.43, + "learning_rate": 3.5282177578265295e-06, + "logits/chosen": -1.7097774744033813, + "logits/rejected": -1.0682802200317383, + "logps/chosen": -583.5718994140625, + "logps/rejected": -1295.059326171875, + "loss": 0.0591, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.14968575537204742, + "rewards/margins": 0.3431374430656433, + "rewards/rejected": -0.49282321333885193, + "step": 2250 + }, + { + "epoch": 0.43, + "learning_rate": 3.5130439246622635e-06, + "logits/chosen": -1.3002750873565674, + "logits/rejected": -0.9815470576286316, + "logps/chosen": -614.1593017578125, + "logps/rejected": -1260.708740234375, + "loss": 0.0932, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1836967170238495, + "rewards/margins": 0.3018070459365845, + "rewards/rejected": -0.4855037331581116, + "step": 2260 + }, + { + "epoch": 0.43, + "learning_rate": 3.497825307506758e-06, + "logits/chosen": -1.7087440490722656, + "logits/rejected": -0.9878816604614258, + "logps/chosen": -652.6859741210938, + "logps/rejected": -1311.63134765625, + "loss": 0.0807, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.19891947507858276, + "rewards/margins": 0.2804645597934723, + "rewards/rejected": -0.47938403487205505, + "step": 2270 + }, + { + "epoch": 0.43, + "learning_rate": 3.4825625791348093e-06, + "logits/chosen": -1.4375241994857788, + "logits/rejected": -0.8479606509208679, + "logps/chosen": -644.3697509765625, + "logps/rejected": -1250.4058837890625, + "loss": 0.0716, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1917545348405838, + "rewards/margins": 0.27922800183296204, + "rewards/rejected": -0.47098255157470703, + "step": 2280 + }, + { + "epoch": 0.44, + "learning_rate": 3.467256414271249e-06, + "logits/chosen": -1.6169170141220093, + "logits/rejected": -0.7455465197563171, + "logps/chosen": -713.6448364257812, + "logps/rejected": -1382.803955078125, + "loss": 0.0291, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.18694797158241272, + "rewards/margins": 0.3667767643928528, + "rewards/rejected": -0.5537247061729431, + "step": 2290 + }, + { + "epoch": 0.44, + "learning_rate": 3.4519074895611245e-06, + "logits/chosen": -1.3121281862258911, + "logits/rejected": -0.8998733758926392, + "logps/chosen": -698.1446533203125, + "logps/rejected": -1322.88427734375, + "loss": 0.0839, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2285010814666748, + "rewards/margins": 0.3055169880390167, + "rewards/rejected": -0.5340181589126587, + "step": 2300 + }, + { + "epoch": 0.44, + "learning_rate": 3.436516483539781e-06, + "logits/chosen": -1.8377107381820679, + "logits/rejected": -1.1042895317077637, + "logps/chosen": -671.3965454101562, + "logps/rejected": -1484.2662353515625, + "loss": 0.0507, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.15971992909908295, + "rewards/margins": 0.37924593687057495, + "rewards/rejected": -0.5389658212661743, + "step": 2310 + }, + { + "epoch": 0.44, + "learning_rate": 3.421084076602867e-06, + "logits/chosen": -1.4560115337371826, + "logits/rejected": -1.019805669784546, + "logps/chosen": -437.4620666503906, + "logps/rejected": -1163.4324951171875, + "loss": 0.1034, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.11113061010837555, + "rewards/margins": 0.3120535910129547, + "rewards/rejected": -0.42318421602249146, + "step": 2320 + }, + { + "epoch": 0.44, + "learning_rate": 3.405610950976257e-06, + "logits/chosen": -1.5794904232025146, + "logits/rejected": -0.8965857625007629, + "logps/chosen": -640.6691284179688, + "logps/rejected": -1358.776611328125, + "loss": 0.0541, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14796991646289825, + "rewards/margins": 0.3194065988063812, + "rewards/rejected": -0.4673764705657959, + "step": 2330 + }, + { + "epoch": 0.45, + "learning_rate": 3.3900977906858923e-06, + "logits/chosen": -1.5456417798995972, + "logits/rejected": -0.9718359112739563, + "logps/chosen": -615.3480834960938, + "logps/rejected": -1091.6168212890625, + "loss": 0.1036, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.17209769785404205, + "rewards/margins": 0.21953482925891876, + "rewards/rejected": -0.3916325271129608, + "step": 2340 + }, + { + "epoch": 0.45, + "learning_rate": 3.3745452815275375e-06, + "logits/chosen": -1.7035932540893555, + "logits/rejected": -0.9176031947135925, + "logps/chosen": -767.6522827148438, + "logps/rejected": -1360.09619140625, + "loss": 0.076, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1860385239124298, + "rewards/margins": 0.3071725368499756, + "rewards/rejected": -0.49321097135543823, + "step": 2350 + }, + { + "epoch": 0.45, + "learning_rate": 3.3589541110364678e-06, + "logits/chosen": -1.5451462268829346, + "logits/rejected": -0.9727177619934082, + "logps/chosen": -720.4700927734375, + "logps/rejected": -1452.360107421875, + "loss": 0.0443, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.21592077612876892, + "rewards/margins": 0.33422279357910156, + "rewards/rejected": -0.5501435995101929, + "step": 2360 + }, + { + "epoch": 0.45, + "learning_rate": 3.3433249684570757e-06, + "logits/chosen": -1.5172083377838135, + "logits/rejected": -0.8823717832565308, + "logps/chosen": -560.3057861328125, + "logps/rejected": -1079.5458984375, + "loss": 0.0965, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.15911349654197693, + "rewards/margins": 0.2574231028556824, + "rewards/rejected": -0.4165366590023041, + "step": 2370 + }, + { + "epoch": 0.45, + "learning_rate": 3.3276585447123957e-06, + "logits/chosen": -1.3937236070632935, + "logits/rejected": -0.9429960250854492, + "logps/chosen": -602.1780395507812, + "logps/rejected": -1398.97998046875, + "loss": 0.0482, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.21115748584270477, + "rewards/margins": 0.33564311265945435, + "rewards/rejected": -0.5468006134033203, + "step": 2380 + }, + { + "epoch": 0.46, + "learning_rate": 3.3119555323735664e-06, + "logits/chosen": -1.4655485153198242, + "logits/rejected": -0.8040722608566284, + "logps/chosen": -668.6180419921875, + "logps/rejected": -1413.5433349609375, + "loss": 0.056, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.22061499953269958, + "rewards/margins": 0.3491475582122803, + "rewards/rejected": -0.5697625279426575, + "step": 2390 + }, + { + "epoch": 0.46, + "learning_rate": 3.2962166256292116e-06, + "logits/chosen": -1.4201056957244873, + "logits/rejected": -0.9808454513549805, + "logps/chosen": -639.6843872070312, + "logps/rejected": -1323.3402099609375, + "loss": 0.0663, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.18054111301898956, + "rewards/margins": 0.3211413323879242, + "rewards/rejected": -0.501682460308075, + "step": 2400 + }, + { + "epoch": 0.46, + "learning_rate": 3.2804425202547494e-06, + "logits/chosen": -1.8520081043243408, + "logits/rejected": -1.2786279916763306, + "logps/chosen": -660.5281982421875, + "logps/rejected": -1369.4189453125, + "loss": 0.0553, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.1543307602405548, + "rewards/margins": 0.35364800691604614, + "rewards/rejected": -0.5079787969589233, + "step": 2410 + }, + { + "epoch": 0.46, + "learning_rate": 3.2646339135816386e-06, + "logits/chosen": -1.6647218465805054, + "logits/rejected": -1.114368200302124, + "logps/chosen": -613.7562255859375, + "logps/rejected": -1222.032958984375, + "loss": 0.0533, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.13925352692604065, + "rewards/margins": 0.30394676327705383, + "rewards/rejected": -0.4432002902030945, + "step": 2420 + }, + { + "epoch": 0.46, + "learning_rate": 3.2487915044665485e-06, + "logits/chosen": -1.4685899019241333, + "logits/rejected": -1.0434587001800537, + "logps/chosen": -553.0774536132812, + "logps/rejected": -1291.67529296875, + "loss": 0.067, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1718519628047943, + "rewards/margins": 0.34229570627212524, + "rewards/rejected": -0.5141476392745972, + "step": 2430 + }, + { + "epoch": 0.46, + "learning_rate": 3.2329159932604638e-06, + "logits/chosen": -1.6335970163345337, + "logits/rejected": -1.0629937648773193, + "logps/chosen": -600.8878173828125, + "logps/rejected": -1243.2655029296875, + "loss": 0.0684, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.18541783094406128, + "rewards/margins": 0.29092368483543396, + "rewards/rejected": -0.4763415455818176, + "step": 2440 + }, + { + "epoch": 0.47, + "learning_rate": 3.217008081777726e-06, + "logits/chosen": -1.2788913249969482, + "logits/rejected": -0.8486631512641907, + "logps/chosen": -523.6038818359375, + "logps/rejected": -1205.5172119140625, + "loss": 0.0905, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.17843207716941833, + "rewards/margins": 0.2737273573875427, + "rewards/rejected": -0.45215946435928345, + "step": 2450 + }, + { + "epoch": 0.47, + "learning_rate": 3.201068473265007e-06, + "logits/chosen": -1.6170251369476318, + "logits/rejected": -0.9501678347587585, + "logps/chosen": -626.2254638671875, + "logps/rejected": -1324.7999267578125, + "loss": 0.0563, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.1649116575717926, + "rewards/margins": 0.324629008769989, + "rewards/rejected": -0.4895406663417816, + "step": 2460 + }, + { + "epoch": 0.47, + "learning_rate": 3.1850978723702213e-06, + "logits/chosen": -1.531653642654419, + "logits/rejected": -0.844316840171814, + "logps/chosen": -622.3458862304688, + "logps/rejected": -1382.773193359375, + "loss": 0.0393, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.15444083511829376, + "rewards/margins": 0.37400901317596436, + "rewards/rejected": -0.5284498929977417, + "step": 2470 + }, + { + "epoch": 0.47, + "learning_rate": 3.1690969851113724e-06, + "logits/chosen": -1.6974732875823975, + "logits/rejected": -1.165449619293213, + "logps/chosen": -709.1497802734375, + "logps/rejected": -1315.007568359375, + "loss": 0.0825, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.18818387389183044, + "rewards/margins": 0.3215584456920624, + "rewards/rejected": -0.5097423195838928, + "step": 2480 + }, + { + "epoch": 0.47, + "learning_rate": 3.1530665188453463e-06, + "logits/chosen": -1.3408528566360474, + "logits/rejected": -1.0513417720794678, + "logps/chosen": -562.9158325195312, + "logps/rejected": -1255.3634033203125, + "loss": 0.0658, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.16134487092494965, + "rewards/margins": 0.29531365633010864, + "rewards/rejected": -0.4566585123538971, + "step": 2490 + }, + { + "epoch": 0.48, + "learning_rate": 3.137007182236637e-06, + "logits/chosen": -1.8058862686157227, + "logits/rejected": -1.1288154125213623, + "logps/chosen": -730.669189453125, + "logps/rejected": -1342.0849609375, + "loss": 0.0575, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1855761706829071, + "rewards/margins": 0.3109329044818878, + "rewards/rejected": -0.4965090751647949, + "step": 2500 + }, + { + "epoch": 0.48, + "learning_rate": 3.1209196852260204e-06, + "logits/chosen": -1.762399673461914, + "logits/rejected": -1.2845382690429688, + "logps/chosen": -559.9722900390625, + "logps/rejected": -1243.078125, + "loss": 0.0805, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1693984866142273, + "rewards/margins": 0.28685134649276733, + "rewards/rejected": -0.456249862909317, + "step": 2510 + }, + { + "epoch": 0.48, + "learning_rate": 3.1048047389991693e-06, + "logits/chosen": -1.5080870389938354, + "logits/rejected": -1.1010701656341553, + "logps/chosen": -727.5679931640625, + "logps/rejected": -1342.488525390625, + "loss": 0.0848, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.2452806681394577, + "rewards/margins": 0.27895987033843994, + "rewards/rejected": -0.5242404937744141, + "step": 2520 + }, + { + "epoch": 0.48, + "learning_rate": 3.0886630559552144e-06, + "logits/chosen": -1.7736622095108032, + "logits/rejected": -0.8815032243728638, + "logps/chosen": -762.474609375, + "logps/rejected": -1368.76171875, + "loss": 0.0778, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.25478002429008484, + "rewards/margins": 0.3027140200138092, + "rewards/rejected": -0.557494044303894, + "step": 2530 + }, + { + "epoch": 0.48, + "learning_rate": 3.072495349675249e-06, + "logits/chosen": -1.6037559509277344, + "logits/rejected": -1.0513917207717896, + "logps/chosen": -613.7960205078125, + "logps/rejected": -1140.569580078125, + "loss": 0.0906, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.19967949390411377, + "rewards/margins": 0.2505190968513489, + "rewards/rejected": -0.45019856095314026, + "step": 2540 + }, + { + "epoch": 0.49, + "learning_rate": 3.056302334890786e-06, + "logits/chosen": -1.7416324615478516, + "logits/rejected": -1.020509123802185, + "logps/chosen": -886.8963012695312, + "logps/rejected": -1471.7713623046875, + "loss": 0.0564, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.2609700560569763, + "rewards/margins": 0.3177827000617981, + "rewards/rejected": -0.5787526965141296, + "step": 2550 + }, + { + "epoch": 0.49, + "learning_rate": 3.04008472745216e-06, + "logits/chosen": -1.6769838333129883, + "logits/rejected": -1.1075221300125122, + "logps/chosen": -549.7738037109375, + "logps/rejected": -1215.347900390625, + "loss": 0.0675, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17719343304634094, + "rewards/margins": 0.3189665973186493, + "rewards/rejected": -0.496160089969635, + "step": 2560 + }, + { + "epoch": 0.49, + "learning_rate": 3.0238432442968803e-06, + "logits/chosen": -1.4335027933120728, + "logits/rejected": -0.8432362675666809, + "logps/chosen": -626.7779541015625, + "logps/rejected": -1301.3828125, + "loss": 0.0702, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.18088267743587494, + "rewards/margins": 0.3284760117530823, + "rewards/rejected": -0.5093587040901184, + "step": 2570 + }, + { + "epoch": 0.49, + "learning_rate": 3.0075786034179407e-06, + "logits/chosen": -1.543150782585144, + "logits/rejected": -1.0602935552597046, + "logps/chosen": -750.2728271484375, + "logps/rejected": -1355.26220703125, + "loss": 0.083, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.22942061722278595, + "rewards/margins": 0.2801212668418884, + "rewards/rejected": -0.5095418691635132, + "step": 2580 + }, + { + "epoch": 0.49, + "learning_rate": 2.9912915238320755e-06, + "logits/chosen": -1.5501201152801514, + "logits/rejected": -1.0371865034103394, + "logps/chosen": -632.9366455078125, + "logps/rejected": -1157.901611328125, + "loss": 0.0942, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.18720534443855286, + "rewards/margins": 0.22448968887329102, + "rewards/rejected": -0.4116950035095215, + "step": 2590 + }, + { + "epoch": 0.5, + "learning_rate": 2.974982725547976e-06, + "logits/chosen": -1.6694200038909912, + "logits/rejected": -1.1160310506820679, + "logps/chosen": -546.0211181640625, + "logps/rejected": -1163.3369140625, + "loss": 0.078, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.130954772233963, + "rewards/margins": 0.27275922894477844, + "rewards/rejected": -0.40371403098106384, + "step": 2600 + }, + { + "epoch": 0.5, + "learning_rate": 2.958652929534456e-06, + "logits/chosen": -1.367626428604126, + "logits/rejected": -0.8243290185928345, + "logps/chosen": -595.7312622070312, + "logps/rejected": -1166.314208984375, + "loss": 0.0693, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.13953298330307007, + "rewards/margins": 0.27511560916900635, + "rewards/rejected": -0.4146485924720764, + "step": 2610 + }, + { + "epoch": 0.5, + "learning_rate": 2.9423028576885894e-06, + "logits/chosen": -1.763495683670044, + "logits/rejected": -0.7965998649597168, + "logps/chosen": -688.7870483398438, + "logps/rejected": -1430.501708984375, + "loss": 0.0466, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.20105794072151184, + "rewards/margins": 0.35635849833488464, + "rewards/rejected": -0.5574164390563965, + "step": 2620 + }, + { + "epoch": 0.5, + "learning_rate": 2.9259332328037852e-06, + "logits/chosen": -1.805696725845337, + "logits/rejected": -1.4092562198638916, + "logps/chosen": -604.6575927734375, + "logps/rejected": -1143.622802734375, + "loss": 0.1062, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16817395389080048, + "rewards/margins": 0.23777250945568085, + "rewards/rejected": -0.40594643354415894, + "step": 2630 + }, + { + "epoch": 0.5, + "learning_rate": 2.9095447785378446e-06, + "logits/chosen": -1.7092657089233398, + "logits/rejected": -1.1881816387176514, + "logps/chosen": -655.2656860351562, + "logps/rejected": -1103.6141357421875, + "loss": 0.1183, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.15867792069911957, + "rewards/margins": 0.23788101971149445, + "rewards/rejected": -0.396558940410614, + "step": 2640 + }, + { + "epoch": 0.5, + "learning_rate": 2.893138219380964e-06, + "logits/chosen": -1.615456223487854, + "logits/rejected": -1.0051472187042236, + "logps/chosen": -632.4168090820312, + "logps/rejected": -1268.873291015625, + "loss": 0.0784, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.179777592420578, + "rewards/margins": 0.28583192825317383, + "rewards/rejected": -0.46560949087142944, + "step": 2650 + }, + { + "epoch": 0.51, + "learning_rate": 2.876714280623708e-06, + "logits/chosen": -1.5862231254577637, + "logits/rejected": -1.2923696041107178, + "logps/chosen": -566.974365234375, + "logps/rejected": -1187.7386474609375, + "loss": 0.0914, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1301722228527069, + "rewards/margins": 0.2452360838651657, + "rewards/rejected": -0.3754083216190338, + "step": 2660 + }, + { + "epoch": 0.51, + "learning_rate": 2.8602736883249504e-06, + "logits/chosen": -1.5150556564331055, + "logits/rejected": -1.1209475994110107, + "logps/chosen": -525.2214965820312, + "logps/rejected": -1281.676025390625, + "loss": 0.0384, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.10434702783823013, + "rewards/margins": 0.3269978165626526, + "rewards/rejected": -0.4313448369503021, + "step": 2670 + }, + { + "epoch": 0.51, + "learning_rate": 2.843817169279772e-06, + "logits/chosen": -1.8272594213485718, + "logits/rejected": -1.1217495203018188, + "logps/chosen": -626.3189086914062, + "logps/rejected": -1287.4119873046875, + "loss": 0.0571, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.13381603360176086, + "rewards/margins": 0.3332836627960205, + "rewards/rejected": -0.467099666595459, + "step": 2680 + }, + { + "epoch": 0.51, + "learning_rate": 2.8273454509873333e-06, + "logits/chosen": -1.6995046138763428, + "logits/rejected": -0.9833188056945801, + "logps/chosen": -723.0067138671875, + "logps/rejected": -1212.660400390625, + "loss": 0.0937, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.18251793086528778, + "rewards/margins": 0.24979586899280548, + "rewards/rejected": -0.4323137700557709, + "step": 2690 + }, + { + "epoch": 0.51, + "learning_rate": 2.8108592616187135e-06, + "logits/chosen": -1.8046150207519531, + "logits/rejected": -0.830971896648407, + "logps/chosen": -650.2740478515625, + "logps/rejected": -1255.9827880859375, + "loss": 0.0641, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11181609332561493, + "rewards/margins": 0.33334630727767944, + "rewards/rejected": -0.44516244530677795, + "step": 2700 + }, + { + "epoch": 0.52, + "learning_rate": 2.7943593299847186e-06, + "logits/chosen": -1.8495075702667236, + "logits/rejected": -1.0969774723052979, + "logps/chosen": -656.57080078125, + "logps/rejected": -1230.971435546875, + "loss": 0.0643, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15315493941307068, + "rewards/margins": 0.26907438039779663, + "rewards/rejected": -0.4222293496131897, + "step": 2710 + }, + { + "epoch": 0.52, + "learning_rate": 2.7778463855036656e-06, + "logits/chosen": -1.736000418663025, + "logits/rejected": -0.9863492250442505, + "logps/chosen": -610.7926025390625, + "logps/rejected": -1202.56787109375, + "loss": 0.068, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.14737586677074432, + "rewards/margins": 0.28114116191864014, + "rewards/rejected": -0.42851710319519043, + "step": 2720 + }, + { + "epoch": 0.52, + "learning_rate": 2.761321158169134e-06, + "logits/chosen": -1.6639608144760132, + "logits/rejected": -1.19706130027771, + "logps/chosen": -753.8912963867188, + "logps/rejected": -1248.675537109375, + "loss": 0.0972, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.24665312469005585, + "rewards/margins": 0.2453705072402954, + "rewards/rejected": -0.49202364683151245, + "step": 2730 + }, + { + "epoch": 0.52, + "learning_rate": 2.7447843785176958e-06, + "logits/chosen": -1.2429653406143188, + "logits/rejected": -0.708914577960968, + "logps/chosen": -684.2418212890625, + "logps/rejected": -1234.620849609375, + "loss": 0.0989, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.22040219604969025, + "rewards/margins": 0.2539559006690979, + "rewards/rejected": -0.47435808181762695, + "step": 2740 + }, + { + "epoch": 0.52, + "learning_rate": 2.728236777596621e-06, + "logits/chosen": -1.6828727722167969, + "logits/rejected": -1.1341021060943604, + "logps/chosen": -748.3656616210938, + "logps/rejected": -1400.454345703125, + "loss": 0.0757, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.24052861332893372, + "rewards/margins": 0.3166273236274719, + "rewards/rejected": -0.5571559071540833, + "step": 2750 + }, + { + "epoch": 0.53, + "learning_rate": 2.7116790869315583e-06, + "logits/chosen": -1.5554356575012207, + "logits/rejected": -1.296067237854004, + "logps/chosen": -595.21337890625, + "logps/rejected": -1285.7225341796875, + "loss": 0.0717, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.18583238124847412, + "rewards/margins": 0.3038370609283447, + "rewards/rejected": -0.48966941237449646, + "step": 2760 + }, + { + "epoch": 0.53, + "learning_rate": 2.695112038494198e-06, + "logits/chosen": -1.653363823890686, + "logits/rejected": -0.8572921752929688, + "logps/chosen": -770.357666015625, + "logps/rejected": -1271.4033203125, + "loss": 0.0848, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1875109076499939, + "rewards/margins": 0.29938197135925293, + "rewards/rejected": -0.48689284920692444, + "step": 2770 + }, + { + "epoch": 0.53, + "learning_rate": 2.6785363646699125e-06, + "logits/chosen": -1.532932162284851, + "logits/rejected": -1.014031171798706, + "logps/chosen": -666.0637817382812, + "logps/rejected": -1335.8193359375, + "loss": 0.0647, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.19544604420661926, + "rewards/margins": 0.2952579855918884, + "rewards/rejected": -0.4907039999961853, + "step": 2780 + }, + { + "epoch": 0.53, + "learning_rate": 2.6619527982253796e-06, + "logits/chosen": -1.6043936014175415, + "logits/rejected": -1.032204031944275, + "logps/chosen": -643.2554321289062, + "logps/rejected": -1290.070068359375, + "loss": 0.0661, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.21375803649425507, + "rewards/margins": 0.2964997887611389, + "rewards/rejected": -0.5102577805519104, + "step": 2790 + }, + { + "epoch": 0.53, + "learning_rate": 2.6453620722761897e-06, + "logits/chosen": -1.4313132762908936, + "logits/rejected": -0.676213264465332, + "logps/chosen": -783.4808349609375, + "logps/rejected": -1314.5399169921875, + "loss": 0.0723, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.26105982065200806, + "rewards/margins": 0.27394813299179077, + "rewards/rejected": -0.5350079536437988, + "step": 2800 + }, + { + "epoch": 0.54, + "learning_rate": 2.628764920254435e-06, + "logits/chosen": -1.4004179239273071, + "logits/rejected": -1.0551695823669434, + "logps/chosen": -716.8417358398438, + "logps/rejected": -1216.4146728515625, + "loss": 0.1078, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2723211646080017, + "rewards/margins": 0.2596574127674103, + "rewards/rejected": -0.5319786071777344, + "step": 2810 + }, + { + "epoch": 0.54, + "learning_rate": 2.6121620758762877e-06, + "logits/chosen": -1.4957189559936523, + "logits/rejected": -1.1808147430419922, + "logps/chosen": -696.1345825195312, + "logps/rejected": -1231.830078125, + "loss": 0.0986, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.2595438063144684, + "rewards/margins": 0.25465917587280273, + "rewards/rejected": -0.5142029523849487, + "step": 2820 + }, + { + "epoch": 0.54, + "learning_rate": 2.595554273109564e-06, + "logits/chosen": -1.7410948276519775, + "logits/rejected": -1.0684701204299927, + "logps/chosen": -658.7049560546875, + "logps/rejected": -1376.8575439453125, + "loss": 0.0553, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.18883931636810303, + "rewards/margins": 0.3513220548629761, + "rewards/rejected": -0.5401612520217896, + "step": 2830 + }, + { + "epoch": 0.54, + "learning_rate": 2.5789422461412776e-06, + "logits/chosen": -1.5545628070831299, + "logits/rejected": -0.9545906782150269, + "logps/chosen": -719.914306640625, + "logps/rejected": -1349.597412109375, + "loss": 0.1006, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.17915014922618866, + "rewards/margins": 0.30565065145492554, + "rewards/rejected": -0.4848008155822754, + "step": 2840 + }, + { + "epoch": 0.54, + "learning_rate": 2.5623267293451827e-06, + "logits/chosen": -1.498730182647705, + "logits/rejected": -1.021172285079956, + "logps/chosen": -648.0553588867188, + "logps/rejected": -1348.339111328125, + "loss": 0.0607, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1289513111114502, + "rewards/margins": 0.3190954327583313, + "rewards/rejected": -0.4480466842651367, + "step": 2850 + }, + { + "epoch": 0.54, + "learning_rate": 2.5457084572493094e-06, + "logits/chosen": -1.4058417081832886, + "logits/rejected": -1.2573301792144775, + "logps/chosen": -549.1610717773438, + "logps/rejected": -1284.826904296875, + "loss": 0.0763, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1448233425617218, + "rewards/margins": 0.268394410610199, + "rewards/rejected": -0.41321778297424316, + "step": 2860 + }, + { + "epoch": 0.55, + "learning_rate": 2.5290881645034932e-06, + "logits/chosen": -1.7953014373779297, + "logits/rejected": -1.1695835590362549, + "logps/chosen": -652.8411254882812, + "logps/rejected": -1216.7479248046875, + "loss": 0.0644, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.19003698229789734, + "rewards/margins": 0.2580549716949463, + "rewards/rejected": -0.44809192419052124, + "step": 2870 + }, + { + "epoch": 0.55, + "learning_rate": 2.5124665858468956e-06, + "logits/chosen": -1.5785634517669678, + "logits/rejected": -1.002070665359497, + "logps/chosen": -760.0968627929688, + "logps/rejected": -1325.75, + "loss": 0.0795, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22499170899391174, + "rewards/margins": 0.2633865177631378, + "rewards/rejected": -0.48837822675704956, + "step": 2880 + }, + { + "epoch": 0.55, + "learning_rate": 2.4958444560755268e-06, + "logits/chosen": -1.6666443347930908, + "logits/rejected": -1.2207131385803223, + "logps/chosen": -703.9513549804688, + "logps/rejected": -1183.821533203125, + "loss": 0.0872, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.2172054499387741, + "rewards/margins": 0.22457167506217957, + "rewards/rejected": -0.44177716970443726, + "step": 2890 + }, + { + "epoch": 0.55, + "learning_rate": 2.479222510009758e-06, + "logits/chosen": -1.6804230213165283, + "logits/rejected": -1.270926833152771, + "logps/chosen": -664.9152221679688, + "logps/rejected": -1291.96240234375, + "loss": 0.0988, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1930864155292511, + "rewards/margins": 0.2714272141456604, + "rewards/rejected": -0.46451354026794434, + "step": 2900 + }, + { + "epoch": 0.55, + "learning_rate": 2.4626014824618418e-06, + "logits/chosen": -1.6413965225219727, + "logits/rejected": -1.0755680799484253, + "logps/chosen": -685.9994506835938, + "logps/rejected": -1280.0340576171875, + "loss": 0.0743, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1401532143354416, + "rewards/margins": 0.2932918667793274, + "rewards/rejected": -0.4334450364112854, + "step": 2910 + }, + { + "epoch": 0.56, + "learning_rate": 2.445982108203422e-06, + "logits/chosen": -1.56205415725708, + "logits/rejected": -1.39645516872406, + "logps/chosen": -594.72509765625, + "logps/rejected": -1270.767822265625, + "loss": 0.0888, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15568089485168457, + "rewards/margins": 0.2737763524055481, + "rewards/rejected": -0.42945727705955505, + "step": 2920 + }, + { + "epoch": 0.56, + "learning_rate": 2.4293651219330614e-06, + "logits/chosen": -1.7054529190063477, + "logits/rejected": -1.0471127033233643, + "logps/chosen": -643.4332275390625, + "logps/rejected": -1266.702880859375, + "loss": 0.079, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15120860934257507, + "rewards/margins": 0.2843385338783264, + "rewards/rejected": -0.4355471134185791, + "step": 2930 + }, + { + "epoch": 0.56, + "learning_rate": 2.4127512582437486e-06, + "logits/chosen": -1.6374223232269287, + "logits/rejected": -1.173452615737915, + "logps/chosen": -560.0069580078125, + "logps/rejected": -1163.2095947265625, + "loss": 0.0735, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.13395628333091736, + "rewards/margins": 0.2982594072818756, + "rewards/rejected": -0.43221569061279297, + "step": 2940 + }, + { + "epoch": 0.56, + "learning_rate": 2.3961412515904337e-06, + "logits/chosen": -1.756450891494751, + "logits/rejected": -0.9547022581100464, + "logps/chosen": -693.9270629882812, + "logps/rejected": -1390.892822265625, + "loss": 0.0401, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.15946152806282043, + "rewards/margins": 0.3631231188774109, + "rewards/rejected": -0.5225846767425537, + "step": 2950 + }, + { + "epoch": 0.56, + "learning_rate": 2.3795358362575618e-06, + "logits/chosen": -1.6137168407440186, + "logits/rejected": -1.2999922037124634, + "logps/chosen": -619.2698974609375, + "logps/rejected": -1225.082275390625, + "loss": 0.0917, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.22298340499401093, + "rewards/margins": 0.22996529936790466, + "rewards/rejected": -0.452948659658432, + "step": 2960 + }, + { + "epoch": 0.57, + "learning_rate": 2.3629357463266e-06, + "logits/chosen": -1.5306415557861328, + "logits/rejected": -0.989396870136261, + "logps/chosen": -595.4459228515625, + "logps/rejected": -1328.7762451171875, + "loss": 0.0522, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.16768696904182434, + "rewards/margins": 0.3273896276950836, + "rewards/rejected": -0.49507659673690796, + "step": 2970 + }, + { + "epoch": 0.57, + "learning_rate": 2.346341715643601e-06, + "logits/chosen": -1.6781136989593506, + "logits/rejected": -0.9793834686279297, + "logps/chosen": -635.3900756835938, + "logps/rejected": -1335.785888671875, + "loss": 0.0636, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.18477866053581238, + "rewards/margins": 0.3320106863975525, + "rewards/rejected": -0.5167893171310425, + "step": 2980 + }, + { + "epoch": 0.57, + "learning_rate": 2.32975447778675e-06, + "logits/chosen": -1.6208549737930298, + "logits/rejected": -1.226190447807312, + "logps/chosen": -601.8873291015625, + "logps/rejected": -1200.184326171875, + "loss": 0.0824, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17440524697303772, + "rewards/margins": 0.2568731904029846, + "rewards/rejected": -0.43127840757369995, + "step": 2990 + }, + { + "epoch": 0.57, + "learning_rate": 2.3131747660339396e-06, + "logits/chosen": -1.2349191904067993, + "logits/rejected": -0.8947417140007019, + "logps/chosen": -656.5274658203125, + "logps/rejected": -1464.7275390625, + "loss": 0.0467, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.21201591193675995, + "rewards/margins": 0.3637722134590149, + "rewards/rejected": -0.575788140296936, + "step": 3000 + }, + { + "epoch": 0.57, + "learning_rate": 2.296603313330355e-06, + "logits/chosen": -1.4812730550765991, + "logits/rejected": -1.0813909769058228, + "logps/chosen": -688.93115234375, + "logps/rejected": -1384.00439453125, + "loss": 0.0604, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2448856085538864, + "rewards/margins": 0.29885345697402954, + "rewards/rejected": -0.5437390208244324, + "step": 3010 + }, + { + "epoch": 0.58, + "learning_rate": 2.280040852256068e-06, + "logits/chosen": -1.5478495359420776, + "logits/rejected": -1.0680512189865112, + "logps/chosen": -661.108154296875, + "logps/rejected": -1224.5616455078125, + "loss": 0.0747, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.213933065533638, + "rewards/margins": 0.2910975515842438, + "rewards/rejected": -0.5050305724143982, + "step": 3020 + }, + { + "epoch": 0.58, + "learning_rate": 2.2634881149936576e-06, + "logits/chosen": -1.3864612579345703, + "logits/rejected": -0.871688187122345, + "logps/chosen": -667.6625366210938, + "logps/rejected": -1298.525390625, + "loss": 0.0563, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.21446280181407928, + "rewards/margins": 0.3170791268348694, + "rewards/rejected": -0.5315419435501099, + "step": 3030 + }, + { + "epoch": 0.58, + "learning_rate": 2.246945833295836e-06, + "logits/chosen": -1.7432661056518555, + "logits/rejected": -0.9063519239425659, + "logps/chosen": -715.7513427734375, + "logps/rejected": -1345.704833984375, + "loss": 0.0455, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.20654010772705078, + "rewards/margins": 0.3397827744483948, + "rewards/rejected": -0.5463228821754456, + "step": 3040 + }, + { + "epoch": 0.58, + "learning_rate": 2.230414738453104e-06, + "logits/chosen": -1.5082430839538574, + "logits/rejected": -0.8369771838188171, + "logps/chosen": -656.2059936523438, + "logps/rejected": -1308.8939208984375, + "loss": 0.0631, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.18457624316215515, + "rewards/margins": 0.3191223740577698, + "rewards/rejected": -0.5036987066268921, + "step": 3050 + }, + { + "epoch": 0.58, + "learning_rate": 2.2138955612614206e-06, + "logits/chosen": -1.4841004610061646, + "logits/rejected": -0.8255208730697632, + "logps/chosen": -734.7943725585938, + "logps/rejected": -1287.914794921875, + "loss": 0.0714, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.20491111278533936, + "rewards/margins": 0.31542059779167175, + "rewards/rejected": -0.5203317403793335, + "step": 3060 + }, + { + "epoch": 0.58, + "learning_rate": 2.1973890319898965e-06, + "logits/chosen": -1.4568408727645874, + "logits/rejected": -0.9991839528083801, + "logps/chosen": -693.023193359375, + "logps/rejected": -1258.9954833984375, + "loss": 0.0909, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.21499352157115936, + "rewards/margins": 0.2714117467403412, + "rewards/rejected": -0.48640528321266174, + "step": 3070 + }, + { + "epoch": 0.59, + "learning_rate": 2.1808958803485134e-06, + "logits/chosen": -1.5070273876190186, + "logits/rejected": -1.0528619289398193, + "logps/chosen": -634.9210815429688, + "logps/rejected": -1429.14892578125, + "loss": 0.0652, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.19342589378356934, + "rewards/margins": 0.32702723145484924, + "rewards/rejected": -0.5204530954360962, + "step": 3080 + }, + { + "epoch": 0.59, + "learning_rate": 2.1644168354558623e-06, + "logits/chosen": -1.6371214389801025, + "logits/rejected": -1.3708134889602661, + "logps/chosen": -541.59765625, + "logps/rejected": -1365.823486328125, + "loss": 0.0669, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16047021746635437, + "rewards/margins": 0.34046122431755066, + "rewards/rejected": -0.500931441783905, + "step": 3090 + }, + { + "epoch": 0.59, + "learning_rate": 2.1479526258069086e-06, + "logits/chosen": -1.6579856872558594, + "logits/rejected": -1.1512315273284912, + "logps/chosen": -600.4952392578125, + "logps/rejected": -1301.325439453125, + "loss": 0.074, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.16126033663749695, + "rewards/margins": 0.2973024845123291, + "rewards/rejected": -0.45856285095214844, + "step": 3100 + }, + { + "epoch": 0.59, + "learning_rate": 2.1315039792407975e-06, + "logits/chosen": -1.5018136501312256, + "logits/rejected": -1.0350725650787354, + "logps/chosen": -703.773193359375, + "logps/rejected": -1416.766357421875, + "loss": 0.0452, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.16547882556915283, + "rewards/margins": 0.33334147930145264, + "rewards/rejected": -0.49882030487060547, + "step": 3110 + }, + { + "epoch": 0.59, + "learning_rate": 2.115071622908666e-06, + "logits/chosen": -1.3968632221221924, + "logits/rejected": -0.7743858098983765, + "logps/chosen": -599.1119384765625, + "logps/rejected": -1345.932861328125, + "loss": 0.0504, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1530262678861618, + "rewards/margins": 0.3215201497077942, + "rewards/rejected": -0.4745463728904724, + "step": 3120 + }, + { + "epoch": 0.6, + "learning_rate": 2.0986562832415063e-06, + "logits/chosen": -1.4558215141296387, + "logits/rejected": -1.0328631401062012, + "logps/chosen": -573.017333984375, + "logps/rejected": -1261.2379150390625, + "loss": 0.0855, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.14629222452640533, + "rewards/margins": 0.30260735750198364, + "rewards/rejected": -0.4488995671272278, + "step": 3130 + }, + { + "epoch": 0.6, + "learning_rate": 2.082258685918047e-06, + "logits/chosen": -1.8028316497802734, + "logits/rejected": -1.0838000774383545, + "logps/chosen": -706.5945434570312, + "logps/rejected": -1400.1011962890625, + "loss": 0.0433, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.18354645371437073, + "rewards/margins": 0.3293781578540802, + "rewards/rejected": -0.5129246115684509, + "step": 3140 + }, + { + "epoch": 0.6, + "learning_rate": 2.0658795558326745e-06, + "logits/chosen": -1.8446155786514282, + "logits/rejected": -1.05807626247406, + "logps/chosen": -743.2035522460938, + "logps/rejected": -1389.4332275390625, + "loss": 0.0609, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.18256376683712006, + "rewards/margins": 0.33897337317466736, + "rewards/rejected": -0.5215371251106262, + "step": 3150 + }, + { + "epoch": 0.6, + "learning_rate": 2.049519617063389e-06, + "logits/chosen": -1.805032730102539, + "logits/rejected": -0.8840486407279968, + "logps/chosen": -731.5299072265625, + "logps/rejected": -1289.933837890625, + "loss": 0.0568, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.17051993310451508, + "rewards/margins": 0.3140178620815277, + "rewards/rejected": -0.4845377802848816, + "step": 3160 + }, + { + "epoch": 0.6, + "learning_rate": 2.033179592839792e-06, + "logits/chosen": -1.3933377265930176, + "logits/rejected": -1.25589919090271, + "logps/chosen": -505.409423828125, + "logps/rejected": -1189.5247802734375, + "loss": 0.0853, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.14680811762809753, + "rewards/margins": 0.26954832673072815, + "rewards/rejected": -0.41635647416114807, + "step": 3170 + }, + { + "epoch": 0.61, + "learning_rate": 2.0168602055111175e-06, + "logits/chosen": -1.468627691268921, + "logits/rejected": -1.2611991167068481, + "logps/chosen": -578.4532470703125, + "logps/rejected": -1260.8515625, + "loss": 0.0684, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17467227578163147, + "rewards/margins": 0.2742113769054413, + "rewards/rejected": -0.44888362288475037, + "step": 3180 + }, + { + "epoch": 0.61, + "learning_rate": 2.0005621765142942e-06, + "logits/chosen": -1.692848563194275, + "logits/rejected": -1.1767462491989136, + "logps/chosen": -616.2111206054688, + "logps/rejected": -1344.800048828125, + "loss": 0.0624, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.190839484333992, + "rewards/margins": 0.32105207443237305, + "rewards/rejected": -0.5118916034698486, + "step": 3190 + }, + { + "epoch": 0.61, + "learning_rate": 1.9842862263420565e-06, + "logits/chosen": -1.5659022331237793, + "logits/rejected": -0.8357075452804565, + "logps/chosen": -612.3272705078125, + "logps/rejected": -1096.1163330078125, + "loss": 0.1089, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.17919543385505676, + "rewards/margins": 0.2390226423740387, + "rewards/rejected": -0.41821813583374023, + "step": 3200 + }, + { + "epoch": 0.61, + "learning_rate": 1.9680330745110954e-06, + "logits/chosen": -1.4714428186416626, + "logits/rejected": -0.810672402381897, + "logps/chosen": -645.9213256835938, + "logps/rejected": -1339.7169189453125, + "loss": 0.0485, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.20404461026191711, + "rewards/margins": 0.3189607262611389, + "rewards/rejected": -0.5230053663253784, + "step": 3210 + }, + { + "epoch": 0.61, + "learning_rate": 1.9518034395302413e-06, + "logits/chosen": -1.4850611686706543, + "logits/rejected": -1.0046916007995605, + "logps/chosen": -649.9691772460938, + "logps/rejected": -1234.4267578125, + "loss": 0.0648, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.21667401492595673, + "rewards/margins": 0.28007322549819946, + "rewards/rejected": -0.4967471957206726, + "step": 3220 + }, + { + "epoch": 0.62, + "learning_rate": 1.9355980388687145e-06, + "logits/chosen": -1.521734595298767, + "logits/rejected": -0.9709982872009277, + "logps/chosen": -611.9412231445312, + "logps/rejected": -1343.318115234375, + "loss": 0.0673, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.19162626564502716, + "rewards/margins": 0.3469206690788269, + "rewards/rejected": -0.5385469198226929, + "step": 3230 + }, + { + "epoch": 0.62, + "learning_rate": 1.9194175889243942e-06, + "logits/chosen": -1.5944442749023438, + "logits/rejected": -0.9859330058097839, + "logps/chosen": -717.7816162109375, + "logps/rejected": -1519.260986328125, + "loss": 0.0467, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.22277486324310303, + "rewards/margins": 0.34965479373931885, + "rewards/rejected": -0.5724297165870667, + "step": 3240 + }, + { + "epoch": 0.62, + "learning_rate": 1.903262804992156e-06, + "logits/chosen": -1.6996046304702759, + "logits/rejected": -0.7255717515945435, + "logps/chosen": -795.25, + "logps/rejected": -1436.568603515625, + "loss": 0.0328, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22043243050575256, + "rewards/margins": 0.35897698998451233, + "rewards/rejected": -0.5794094204902649, + "step": 3250 + }, + { + "epoch": 0.62, + "learning_rate": 1.8871344012322504e-06, + "logits/chosen": -1.5481312274932861, + "logits/rejected": -0.7875708341598511, + "logps/chosen": -722.6868896484375, + "logps/rejected": -1411.4429931640625, + "loss": 0.0572, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.22615578770637512, + "rewards/margins": 0.3459186851978302, + "rewards/rejected": -0.5720744729042053, + "step": 3260 + }, + { + "epoch": 0.62, + "learning_rate": 1.8710330906387288e-06, + "logits/chosen": -1.4675686359405518, + "logits/rejected": -0.9357894659042358, + "logps/chosen": -694.9276123046875, + "logps/rejected": -1344.5860595703125, + "loss": 0.067, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21511349081993103, + "rewards/margins": 0.31824225187301636, + "rewards/rejected": -0.5333557724952698, + "step": 3270 + }, + { + "epoch": 0.62, + "learning_rate": 1.8549595850079272e-06, + "logits/chosen": -1.5443395376205444, + "logits/rejected": -0.9139049649238586, + "logps/chosen": -652.799560546875, + "logps/rejected": -1310.782470703125, + "loss": 0.0584, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.19704267382621765, + "rewards/margins": 0.312825083732605, + "rewards/rejected": -0.509867787361145, + "step": 3280 + }, + { + "epoch": 0.63, + "learning_rate": 1.8389145949069953e-06, + "logits/chosen": -1.6866626739501953, + "logits/rejected": -0.9042354822158813, + "logps/chosen": -759.4832763671875, + "logps/rejected": -1389.257568359375, + "loss": 0.0524, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20711025595664978, + "rewards/margins": 0.34287315607070923, + "rewards/rejected": -0.5499833822250366, + "step": 3290 + }, + { + "epoch": 0.63, + "learning_rate": 1.8228988296424877e-06, + "logits/chosen": -1.5796806812286377, + "logits/rejected": -0.8186532855033875, + "logps/chosen": -722.0853271484375, + "logps/rejected": -1205.07177734375, + "loss": 0.0932, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1870838701725006, + "rewards/margins": 0.267849862575531, + "rewards/rejected": -0.4549337327480316, + "step": 3300 + }, + { + "epoch": 0.63, + "learning_rate": 1.806912997229008e-06, + "logits/chosen": -1.510692834854126, + "logits/rejected": -0.9041181802749634, + "logps/chosen": -691.3726806640625, + "logps/rejected": -1267.9361572265625, + "loss": 0.0598, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.21052590012550354, + "rewards/margins": 0.30054470896720886, + "rewards/rejected": -0.5110706090927124, + "step": 3310 + }, + { + "epoch": 0.63, + "learning_rate": 1.7909578043579037e-06, + "logits/chosen": -1.4253116846084595, + "logits/rejected": -0.8056136965751648, + "logps/chosen": -646.0543823242188, + "logps/rejected": -1144.4342041015625, + "loss": 0.0866, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.21351024508476257, + "rewards/margins": 0.2579920291900635, + "rewards/rejected": -0.4715022146701813, + "step": 3320 + }, + { + "epoch": 0.63, + "learning_rate": 1.7750339563660346e-06, + "logits/chosen": -1.5773307085037231, + "logits/rejected": -0.722690761089325, + "logps/chosen": -700.0763549804688, + "logps/rejected": -1403.1580810546875, + "loss": 0.0617, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.21982452273368835, + "rewards/margins": 0.33400195837020874, + "rewards/rejected": -0.5538265109062195, + "step": 3330 + }, + { + "epoch": 0.64, + "learning_rate": 1.759142157204583e-06, + "logits/chosen": -1.1075875759124756, + "logits/rejected": -0.9842613935470581, + "logps/chosen": -617.412353515625, + "logps/rejected": -1335.633056640625, + "loss": 0.0695, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.21973755955696106, + "rewards/margins": 0.2990453541278839, + "rewards/rejected": -0.5187829732894897, + "step": 3340 + }, + { + "epoch": 0.64, + "learning_rate": 1.7432831094079357e-06, + "logits/chosen": -1.3947752714157104, + "logits/rejected": -0.9352839589118958, + "logps/chosen": -711.1446533203125, + "logps/rejected": -1314.2042236328125, + "loss": 0.0829, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2424613982439041, + "rewards/margins": 0.2729516923427582, + "rewards/rejected": -0.5154131650924683, + "step": 3350 + }, + { + "epoch": 0.64, + "learning_rate": 1.7274575140626318e-06, + "logits/chosen": -1.620173692703247, + "logits/rejected": -1.1578094959259033, + "logps/chosen": -678.2091064453125, + "logps/rejected": -1377.6026611328125, + "loss": 0.0666, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.21755430102348328, + "rewards/margins": 0.3121327757835388, + "rewards/rejected": -0.5296871066093445, + "step": 3360 + }, + { + "epoch": 0.64, + "learning_rate": 1.7116660707763637e-06, + "logits/chosen": -1.5007514953613281, + "logits/rejected": -0.9255658388137817, + "logps/chosen": -656.914794921875, + "logps/rejected": -1312.096435546875, + "loss": 0.0762, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.19928193092346191, + "rewards/margins": 0.29995498061180115, + "rewards/rejected": -0.49923691153526306, + "step": 3370 + }, + { + "epoch": 0.64, + "learning_rate": 1.695909477647054e-06, + "logits/chosen": -1.5323008298873901, + "logits/rejected": -0.8067652583122253, + "logps/chosen": -598.3023071289062, + "logps/rejected": -1327.044921875, + "loss": 0.0575, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16166332364082336, + "rewards/margins": 0.3267229497432709, + "rewards/rejected": -0.4883863031864166, + "step": 3380 + }, + { + "epoch": 0.65, + "learning_rate": 1.6801884312319893e-06, + "logits/chosen": -1.3339112997055054, + "logits/rejected": -0.9970804452896118, + "logps/chosen": -614.9922485351562, + "logps/rejected": -1234.631591796875, + "loss": 0.0734, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1723160296678543, + "rewards/margins": 0.2858824133872986, + "rewards/rejected": -0.4581984579563141, + "step": 3390 + }, + { + "epoch": 0.65, + "learning_rate": 1.6645036265170314e-06, + "logits/chosen": -1.3838064670562744, + "logits/rejected": -0.9229308366775513, + "logps/chosen": -748.576904296875, + "logps/rejected": -1306.68212890625, + "loss": 0.095, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.2202054262161255, + "rewards/margins": 0.27225932478904724, + "rewards/rejected": -0.49246472120285034, + "step": 3400 + }, + { + "epoch": 0.65, + "learning_rate": 1.648855756885893e-06, + "logits/chosen": -1.4950006008148193, + "logits/rejected": -1.013434648513794, + "logps/chosen": -718.4817504882812, + "logps/rejected": -1454.3203125, + "loss": 0.0622, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.22334542870521545, + "rewards/margins": 0.3394310474395752, + "rewards/rejected": -0.5627764463424683, + "step": 3410 + }, + { + "epoch": 0.65, + "learning_rate": 1.633245514089482e-06, + "logits/chosen": -1.6091363430023193, + "logits/rejected": -1.1225229501724243, + "logps/chosen": -668.1934204101562, + "logps/rejected": -1322.8450927734375, + "loss": 0.0616, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1633572280406952, + "rewards/margins": 0.3172319829463959, + "rewards/rejected": -0.48058921098709106, + "step": 3420 + }, + { + "epoch": 0.65, + "learning_rate": 1.6176735882153284e-06, + "logits/chosen": -1.6472145318984985, + "logits/rejected": -1.1272056102752686, + "logps/chosen": -654.9362182617188, + "logps/rejected": -1358.408447265625, + "loss": 0.0646, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19133299589157104, + "rewards/margins": 0.30245405435562134, + "rewards/rejected": -0.49378710985183716, + "step": 3430 + }, + { + "epoch": 0.66, + "learning_rate": 1.6021406676570667e-06, + "logits/chosen": -1.4853413105010986, + "logits/rejected": -0.9434601068496704, + "logps/chosen": -712.5463256835938, + "logps/rejected": -1411.4658203125, + "loss": 0.0628, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20891955494880676, + "rewards/margins": 0.3422708511352539, + "rewards/rejected": -0.5511903166770935, + "step": 3440 + }, + { + "epoch": 0.66, + "learning_rate": 1.5866474390840126e-06, + "logits/chosen": -1.6828727722167969, + "logits/rejected": -0.8747915029525757, + "logps/chosen": -718.73486328125, + "logps/rejected": -1527.1334228515625, + "loss": 0.0433, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.19726070761680603, + "rewards/margins": 0.3979165554046631, + "rewards/rejected": -0.5951772928237915, + "step": 3450 + }, + { + "epoch": 0.66, + "learning_rate": 1.5711945874108053e-06, + "logits/chosen": -1.3750511407852173, + "logits/rejected": -0.9035806655883789, + "logps/chosen": -599.4134521484375, + "logps/rejected": -1339.314208984375, + "loss": 0.0769, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17058123648166656, + "rewards/margins": 0.3418346345424652, + "rewards/rejected": -0.512415885925293, + "step": 3460 + }, + { + "epoch": 0.66, + "learning_rate": 1.5557827957671249e-06, + "logits/chosen": -1.5215259790420532, + "logits/rejected": -1.087141752243042, + "logps/chosen": -661.077880859375, + "logps/rejected": -1228.581787109375, + "loss": 0.0905, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.21272769570350647, + "rewards/margins": 0.26741451025009155, + "rewards/rejected": -0.480142205953598, + "step": 3470 + }, + { + "epoch": 0.66, + "learning_rate": 1.5404127454674994e-06, + "logits/chosen": -1.716884970664978, + "logits/rejected": -0.9167481660842896, + "logps/chosen": -776.3599243164062, + "logps/rejected": -1337.857666015625, + "loss": 0.0815, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2485647201538086, + "rewards/margins": 0.28456616401672363, + "rewards/rejected": -0.5331308841705322, + "step": 3480 + }, + { + "epoch": 0.66, + "learning_rate": 1.5250851159811809e-06, + "logits/chosen": -1.3097341060638428, + "logits/rejected": -1.1387548446655273, + "logps/chosen": -596.4869995117188, + "logps/rejected": -1371.023193359375, + "loss": 0.068, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1898602694272995, + "rewards/margins": 0.3254929780960083, + "rewards/rejected": -0.515353262424469, + "step": 3490 + }, + { + "epoch": 0.67, + "learning_rate": 1.509800584902108e-06, + "logits/chosen": -1.5502290725708008, + "logits/rejected": -0.9857848882675171, + "logps/chosen": -671.3793334960938, + "logps/rejected": -1286.765625, + "loss": 0.0897, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.21347641944885254, + "rewards/margins": 0.252995103597641, + "rewards/rejected": -0.4664715826511383, + "step": 3500 + }, + { + "epoch": 0.67, + "learning_rate": 1.4945598279189565e-06, + "logits/chosen": -1.3999189138412476, + "logits/rejected": -1.0364853143692017, + "logps/chosen": -541.5253295898438, + "logps/rejected": -1361.39306640625, + "loss": 0.0509, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.15634627640247345, + "rewards/margins": 0.3528437316417694, + "rewards/rejected": -0.5091899633407593, + "step": 3510 + }, + { + "epoch": 0.67, + "learning_rate": 1.4793635187852622e-06, + "logits/chosen": -1.800374984741211, + "logits/rejected": -1.2006888389587402, + "logps/chosen": -622.7084350585938, + "logps/rejected": -1129.7310791015625, + "loss": 0.078, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.17157098650932312, + "rewards/margins": 0.2593713402748108, + "rewards/rejected": -0.4309422969818115, + "step": 3520 + }, + { + "epoch": 0.67, + "learning_rate": 1.4642123292896406e-06, + "logits/chosen": -1.7451728582382202, + "logits/rejected": -0.9990085363388062, + "logps/chosen": -551.0404052734375, + "logps/rejected": -1164.4862060546875, + "loss": 0.0647, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1497395783662796, + "rewards/margins": 0.30090680718421936, + "rewards/rejected": -0.45064640045166016, + "step": 3530 + }, + { + "epoch": 0.67, + "learning_rate": 1.4491069292260867e-06, + "logits/chosen": -1.7180376052856445, + "logits/rejected": -1.0250861644744873, + "logps/chosen": -687.5936889648438, + "logps/rejected": -1251.387451171875, + "loss": 0.0717, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18078365921974182, + "rewards/margins": 0.2989780306816101, + "rewards/rejected": -0.47976168990135193, + "step": 3540 + }, + { + "epoch": 0.68, + "learning_rate": 1.4340479863643658e-06, + "logits/chosen": -1.4122458696365356, + "logits/rejected": -1.0276893377304077, + "logps/chosen": -746.4142456054688, + "logps/rejected": -1390.8551025390625, + "loss": 0.0702, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.19428804516792297, + "rewards/margins": 0.2959034740924835, + "rewards/rejected": -0.4901915192604065, + "step": 3550 + }, + { + "epoch": 0.68, + "learning_rate": 1.4190361664204936e-06, + "logits/chosen": -1.545143485069275, + "logits/rejected": -1.0826115608215332, + "logps/chosen": -602.2059936523438, + "logps/rejected": -1251.728759765625, + "loss": 0.0839, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.16832391917705536, + "rewards/margins": 0.2858714163303375, + "rewards/rejected": -0.4541953206062317, + "step": 3560 + }, + { + "epoch": 0.68, + "learning_rate": 1.4040721330273063e-06, + "logits/chosen": -1.661960244178772, + "logits/rejected": -1.3034007549285889, + "logps/chosen": -693.1444091796875, + "logps/rejected": -1337.0006103515625, + "loss": 0.0833, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.21670281887054443, + "rewards/margins": 0.26909923553466797, + "rewards/rejected": -0.4858019948005676, + "step": 3570 + }, + { + "epoch": 0.68, + "learning_rate": 1.3891565477051242e-06, + "logits/chosen": -1.6313354969024658, + "logits/rejected": -1.0461688041687012, + "logps/chosen": -574.7815551757812, + "logps/rejected": -1244.134033203125, + "loss": 0.0504, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1575133502483368, + "rewards/margins": 0.31078487634658813, + "rewards/rejected": -0.4682982563972473, + "step": 3580 + }, + { + "epoch": 0.68, + "learning_rate": 1.3742900698325034e-06, + "logits/chosen": -1.6138801574707031, + "logits/rejected": -0.969835638999939, + "logps/chosen": -736.7557983398438, + "logps/rejected": -1412.3299560546875, + "loss": 0.0537, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.2180413454771042, + "rewards/margins": 0.32191744446754456, + "rewards/rejected": -0.5399588346481323, + "step": 3590 + }, + { + "epoch": 0.69, + "learning_rate": 1.3594733566170925e-06, + "logits/chosen": -1.5131175518035889, + "logits/rejected": -0.9915425181388855, + "logps/chosen": -801.5447387695312, + "logps/rejected": -1574.994140625, + "loss": 0.055, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.2719317674636841, + "rewards/margins": 0.35427626967430115, + "rewards/rejected": -0.6262079477310181, + "step": 3600 + }, + { + "epoch": 0.69, + "learning_rate": 1.3447070630665771e-06, + "logits/chosen": -1.4314167499542236, + "logits/rejected": -0.9526249170303345, + "logps/chosen": -641.8742065429688, + "logps/rejected": -1273.2801513671875, + "loss": 0.0583, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.21152277290821075, + "rewards/margins": 0.302289217710495, + "rewards/rejected": -0.5138119459152222, + "step": 3610 + }, + { + "epoch": 0.69, + "learning_rate": 1.329991841959717e-06, + "logits/chosen": -1.6229140758514404, + "logits/rejected": -0.997315526008606, + "logps/chosen": -706.7448120117188, + "logps/rejected": -1259.075927734375, + "loss": 0.0891, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2167361080646515, + "rewards/margins": 0.2824261486530304, + "rewards/rejected": -0.4991622567176819, + "step": 3620 + }, + { + "epoch": 0.69, + "learning_rate": 1.3153283438175036e-06, + "logits/chosen": -1.5434691905975342, + "logits/rejected": -1.0687668323516846, + "logps/chosen": -678.1273193359375, + "logps/rejected": -1346.504150390625, + "loss": 0.0733, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22936749458312988, + "rewards/margins": 0.3146396279335022, + "rewards/rejected": -0.5440071225166321, + "step": 3630 + }, + { + "epoch": 0.69, + "learning_rate": 1.3007172168743854e-06, + "logits/chosen": -1.5151026248931885, + "logits/rejected": -1.2111207246780396, + "logps/chosen": -617.3922729492188, + "logps/rejected": -1312.60986328125, + "loss": 0.0612, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.21233785152435303, + "rewards/margins": 0.29888466000556946, + "rewards/rejected": -0.5112224817276001, + "step": 3640 + }, + { + "epoch": 0.7, + "learning_rate": 1.2861591070496193e-06, + "logits/chosen": -1.5453293323516846, + "logits/rejected": -0.8888516426086426, + "logps/chosen": -743.7242431640625, + "logps/rejected": -1490.795654296875, + "loss": 0.0782, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24507150053977966, + "rewards/margins": 0.3426440358161926, + "rewards/rejected": -0.5877156257629395, + "step": 3650 + }, + { + "epoch": 0.7, + "learning_rate": 1.271654657918722e-06, + "logits/chosen": -1.7394828796386719, + "logits/rejected": -1.1618573665618896, + "logps/chosen": -536.0321655273438, + "logps/rejected": -1236.074951171875, + "loss": 0.0636, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15857215225696564, + "rewards/margins": 0.3111765384674072, + "rewards/rejected": -0.46974867582321167, + "step": 3660 + }, + { + "epoch": 0.7, + "learning_rate": 1.2572045106850051e-06, + "logits/chosen": -1.6560094356536865, + "logits/rejected": -0.9025988578796387, + "logps/chosen": -661.8654174804688, + "logps/rejected": -1258.066650390625, + "loss": 0.0749, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.18976400792598724, + "rewards/margins": 0.3110305964946747, + "rewards/rejected": -0.5007945895195007, + "step": 3670 + }, + { + "epoch": 0.7, + "learning_rate": 1.2428093041512418e-06, + "logits/chosen": -1.7884883880615234, + "logits/rejected": -1.1757820844650269, + "logps/chosen": -727.03564453125, + "logps/rejected": -1301.129150390625, + "loss": 0.08, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1926560401916504, + "rewards/margins": 0.28592005372047424, + "rewards/rejected": -0.478576123714447, + "step": 3680 + }, + { + "epoch": 0.7, + "learning_rate": 1.2284696746914216e-06, + "logits/chosen": -1.497154951095581, + "logits/rejected": -1.116506814956665, + "logps/chosen": -680.4603271484375, + "logps/rejected": -1272.1434326171875, + "loss": 0.0745, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.20678162574768066, + "rewards/margins": 0.25609079003334045, + "rewards/rejected": -0.4628724455833435, + "step": 3690 + }, + { + "epoch": 0.7, + "learning_rate": 1.2141862562226164e-06, + "logits/chosen": -1.6773204803466797, + "logits/rejected": -0.9909335374832153, + "logps/chosen": -633.2793579101562, + "logps/rejected": -1360.771728515625, + "loss": 0.0668, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.1755729764699936, + "rewards/margins": 0.3312123715877533, + "rewards/rejected": -0.5067853927612305, + "step": 3700 + }, + { + "epoch": 0.71, + "learning_rate": 1.1999596801769617e-06, + "logits/chosen": -1.7998435497283936, + "logits/rejected": -0.9226717948913574, + "logps/chosen": -769.0552368164062, + "logps/rejected": -1420.398193359375, + "loss": 0.0709, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21222510933876038, + "rewards/margins": 0.3229183554649353, + "rewards/rejected": -0.5351434946060181, + "step": 3710 + }, + { + "epoch": 0.71, + "learning_rate": 1.185790575473738e-06, + "logits/chosen": -1.606642484664917, + "logits/rejected": -1.2084159851074219, + "logps/chosen": -753.2423706054688, + "logps/rejected": -1257.906982421875, + "loss": 0.0969, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.23726069927215576, + "rewards/margins": 0.23133531212806702, + "rewards/rejected": -0.4685959815979004, + "step": 3720 + }, + { + "epoch": 0.71, + "learning_rate": 1.1716795684915728e-06, + "logits/chosen": -1.6280252933502197, + "logits/rejected": -0.9002641439437866, + "logps/chosen": -669.9718627929688, + "logps/rejected": -1360.99658203125, + "loss": 0.0626, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16807326674461365, + "rewards/margins": 0.33500486612319946, + "rewards/rejected": -0.5030781626701355, + "step": 3730 + }, + { + "epoch": 0.71, + "learning_rate": 1.1576272830407418e-06, + "logits/chosen": -1.6571365594863892, + "logits/rejected": -1.1870951652526855, + "logps/chosen": -640.5313110351562, + "logps/rejected": -1359.805419921875, + "loss": 0.0782, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1662045419216156, + "rewards/margins": 0.3272945284843445, + "rewards/rejected": -0.49349913001060486, + "step": 3740 + }, + { + "epoch": 0.71, + "learning_rate": 1.1436343403356019e-06, + "logits/chosen": -1.714442253112793, + "logits/rejected": -0.9745224714279175, + "logps/chosen": -706.4690551757812, + "logps/rejected": -1396.648681640625, + "loss": 0.0649, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1968952864408493, + "rewards/margins": 0.32389289140701294, + "rewards/rejected": -0.5207881927490234, + "step": 3750 + }, + { + "epoch": 0.72, + "learning_rate": 1.129701358967123e-06, + "logits/chosen": -1.73860764503479, + "logits/rejected": -0.9712656736373901, + "logps/chosen": -629.3460083007812, + "logps/rejected": -1206.01953125, + "loss": 0.0828, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.14328864216804504, + "rewards/margins": 0.2993341088294983, + "rewards/rejected": -0.44262275099754333, + "step": 3760 + }, + { + "epoch": 0.72, + "learning_rate": 1.11582895487554e-06, + "logits/chosen": -1.5468307733535767, + "logits/rejected": -0.9999405741691589, + "logps/chosen": -550.5922241210938, + "logps/rejected": -1237.1678466796875, + "loss": 0.0858, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.16123315691947937, + "rewards/margins": 0.283268004655838, + "rewards/rejected": -0.4445011615753174, + "step": 3770 + }, + { + "epoch": 0.72, + "learning_rate": 1.1020177413231334e-06, + "logits/chosen": -1.824406385421753, + "logits/rejected": -1.0949231386184692, + "logps/chosen": -692.0006103515625, + "logps/rejected": -1357.0108642578125, + "loss": 0.0692, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.18804143369197845, + "rewards/margins": 0.304176390171051, + "rewards/rejected": -0.4922178387641907, + "step": 3780 + }, + { + "epoch": 0.72, + "learning_rate": 1.0882683288671041e-06, + "logits/chosen": -1.5519713163375854, + "logits/rejected": -1.2418644428253174, + "logps/chosen": -621.3736572265625, + "logps/rejected": -1320.0389404296875, + "loss": 0.0743, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15430279076099396, + "rewards/margins": 0.32646626234054565, + "rewards/rejected": -0.4807690978050232, + "step": 3790 + }, + { + "epoch": 0.72, + "learning_rate": 1.0745813253325957e-06, + "logits/chosen": -1.5042496919631958, + "logits/rejected": -1.1541450023651123, + "logps/chosen": -581.3851318359375, + "logps/rejected": -1167.1385498046875, + "loss": 0.0711, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.17610274255275726, + "rewards/margins": 0.2804592549800873, + "rewards/rejected": -0.45656198263168335, + "step": 3800 + }, + { + "epoch": 0.73, + "learning_rate": 1.0609573357858166e-06, + "logits/chosen": -1.8346351385116577, + "logits/rejected": -0.9841367602348328, + "logps/chosen": -615.1905517578125, + "logps/rejected": -1213.364501953125, + "loss": 0.1024, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.18721064925193787, + "rewards/margins": 0.26040130853652954, + "rewards/rejected": -0.4476119577884674, + "step": 3810 + }, + { + "epoch": 0.73, + "learning_rate": 1.0473969625072922e-06, + "logits/chosen": -1.4209061861038208, + "logits/rejected": -0.9002727270126343, + "logps/chosen": -687.7305908203125, + "logps/rejected": -1552.115478515625, + "loss": 0.0392, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.23270201683044434, + "rewards/margins": 0.35259899497032166, + "rewards/rejected": -0.5853010416030884, + "step": 3820 + }, + { + "epoch": 0.73, + "learning_rate": 1.0339008049652427e-06, + "logits/chosen": -1.407459020614624, + "logits/rejected": -0.9791488647460938, + "logps/chosen": -621.3890380859375, + "logps/rejected": -1234.590087890625, + "loss": 0.0653, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.17459668219089508, + "rewards/margins": 0.29255175590515137, + "rewards/rejected": -0.46714845299720764, + "step": 3830 + }, + { + "epoch": 0.73, + "learning_rate": 1.0204694597890814e-06, + "logits/chosen": -1.6580352783203125, + "logits/rejected": -1.1533594131469727, + "logps/chosen": -676.6832275390625, + "logps/rejected": -1325.892822265625, + "loss": 0.0564, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.20397302508354187, + "rewards/margins": 0.29500606656074524, + "rewards/rejected": -0.4989790916442871, + "step": 3840 + }, + { + "epoch": 0.73, + "learning_rate": 1.0071035207430352e-06, + "logits/chosen": -1.2952769994735718, + "logits/rejected": -0.9416033029556274, + "logps/chosen": -604.2355346679688, + "logps/rejected": -1212.9393310546875, + "loss": 0.0484, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.16119088232517242, + "rewards/margins": 0.2786763310432434, + "rewards/rejected": -0.43986719846725464, + "step": 3850 + }, + { + "epoch": 0.74, + "learning_rate": 9.938035786999018e-07, + "logits/chosen": -1.5834907293319702, + "logits/rejected": -1.082273244857788, + "logps/chosen": -560.4159545898438, + "logps/rejected": -1141.927490234375, + "loss": 0.1234, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.16034545004367828, + "rewards/margins": 0.24875816702842712, + "rewards/rejected": -0.4091036319732666, + "step": 3860 + }, + { + "epoch": 0.74, + "learning_rate": 9.805702216149252e-07, + "logits/chosen": -1.6294996738433838, + "logits/rejected": -0.9513868093490601, + "logps/chosen": -647.7396240234375, + "logps/rejected": -1253.830810546875, + "loss": 0.0869, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1819881945848465, + "rewards/margins": 0.3004476726055145, + "rewards/rejected": -0.48243585228919983, + "step": 3870 + }, + { + "epoch": 0.74, + "learning_rate": 9.674040344998056e-07, + "logits/chosen": -1.6295925378799438, + "logits/rejected": -1.107329249382019, + "logps/chosen": -690.513916015625, + "logps/rejected": -1182.1849365234375, + "loss": 0.1012, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23783211410045624, + "rewards/margins": 0.24144688248634338, + "rewards/rejected": -0.47927895188331604, + "step": 3880 + }, + { + "epoch": 0.74, + "learning_rate": 9.543055993968339e-07, + "logits/chosen": -1.8603365421295166, + "logits/rejected": -1.189885139465332, + "logps/chosen": -682.6376953125, + "logps/rejected": -1384.593994140625, + "loss": 0.0565, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2013491690158844, + "rewards/margins": 0.3509669601917267, + "rewards/rejected": -0.5523160696029663, + "step": 3890 + }, + { + "epoch": 0.74, + "learning_rate": 9.412754953531664e-07, + "logits/chosen": -1.4683700799942017, + "logits/rejected": -0.7776457071304321, + "logps/chosen": -579.8306274414062, + "logps/rejected": -1226.8836669921875, + "loss": 0.0879, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1834620088338852, + "rewards/margins": 0.3038709759712219, + "rewards/rejected": -0.4873329699039459, + "step": 3900 + }, + { + "epoch": 0.74, + "learning_rate": 9.283142983952231e-07, + "logits/chosen": -1.4519298076629639, + "logits/rejected": -0.9826549291610718, + "logps/chosen": -702.67578125, + "logps/rejected": -1460.3287353515625, + "loss": 0.0591, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.22127850353717804, + "rewards/margins": 0.35032838582992554, + "rewards/rejected": -0.5716068744659424, + "step": 3910 + }, + { + "epoch": 0.75, + "learning_rate": 9.154225815032242e-07, + "logits/chosen": -1.4828869104385376, + "logits/rejected": -1.0451923608779907, + "logps/chosen": -605.3231201171875, + "logps/rejected": -1357.6131591796875, + "loss": 0.0581, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20290470123291016, + "rewards/margins": 0.34218138456344604, + "rewards/rejected": -0.5450860857963562, + "step": 3920 + }, + { + "epoch": 0.75, + "learning_rate": 9.026009145858608e-07, + "logits/chosen": -1.6691789627075195, + "logits/rejected": -0.9547937512397766, + "logps/chosen": -698.2928466796875, + "logps/rejected": -1358.7264404296875, + "loss": 0.0853, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.19656509160995483, + "rewards/margins": 0.3234030604362488, + "rewards/rejected": -0.5199681520462036, + "step": 3930 + }, + { + "epoch": 0.75, + "learning_rate": 8.898498644550973e-07, + "logits/chosen": -1.5353500843048096, + "logits/rejected": -1.1531541347503662, + "logps/chosen": -608.4739990234375, + "logps/rejected": -1238.1771240234375, + "loss": 0.0929, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.20965221524238586, + "rewards/margins": 0.2650408148765564, + "rewards/rejected": -0.47469305992126465, + "step": 3940 + }, + { + "epoch": 0.75, + "learning_rate": 8.771699948011203e-07, + "logits/chosen": -1.4269169569015503, + "logits/rejected": -0.8833200335502625, + "logps/chosen": -800.2471923828125, + "logps/rejected": -1413.006591796875, + "loss": 0.0603, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2683252692222595, + "rewards/margins": 0.30723056197166443, + "rewards/rejected": -0.5755558013916016, + "step": 3950 + }, + { + "epoch": 0.75, + "learning_rate": 8.645618661674144e-07, + "logits/chosen": -1.309072494506836, + "logits/rejected": -1.012961983680725, + "logps/chosen": -652.0698852539062, + "logps/rejected": -1263.73876953125, + "loss": 0.0663, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.20567500591278076, + "rewards/margins": 0.27669185400009155, + "rewards/rejected": -0.4823668599128723, + "step": 3960 + }, + { + "epoch": 0.76, + "learning_rate": 8.520260359259822e-07, + "logits/chosen": -1.4296892881393433, + "logits/rejected": -0.9210003614425659, + "logps/chosen": -601.3060302734375, + "logps/rejected": -1292.217529296875, + "loss": 0.0465, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1769389510154724, + "rewards/margins": 0.3420455753803253, + "rewards/rejected": -0.5189844965934753, + "step": 3970 + }, + { + "epoch": 0.76, + "learning_rate": 8.395630582527075e-07, + "logits/chosen": -1.7763392925262451, + "logits/rejected": -0.804221510887146, + "logps/chosen": -707.9012451171875, + "logps/rejected": -1312.0638427734375, + "loss": 0.048, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.19189570844173431, + "rewards/margins": 0.3184513747692108, + "rewards/rejected": -0.5103470683097839, + "step": 3980 + }, + { + "epoch": 0.76, + "learning_rate": 8.271734841028553e-07, + "logits/chosen": -1.5501251220703125, + "logits/rejected": -1.0809847116470337, + "logps/chosen": -464.4537658691406, + "logps/rejected": -1207.356689453125, + "loss": 0.0676, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.14479857683181763, + "rewards/margins": 0.31345051527023315, + "rewards/rejected": -0.4582490921020508, + "step": 3990 + }, + { + "epoch": 0.76, + "learning_rate": 8.148578611867114e-07, + "logits/chosen": -1.724912405014038, + "logits/rejected": -1.0931422710418701, + "logps/chosen": -653.0133056640625, + "logps/rejected": -1423.009521484375, + "loss": 0.0479, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.19157932698726654, + "rewards/margins": 0.3370700478553772, + "rewards/rejected": -0.5286494493484497, + "step": 4000 + }, + { + "epoch": 0.76, + "learning_rate": 8.026167339453792e-07, + "logits/chosen": -1.731673002243042, + "logits/rejected": -1.0135266780853271, + "logps/chosen": -637.0040283203125, + "logps/rejected": -1375.421875, + "loss": 0.071, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19047974050045013, + "rewards/margins": 0.344374418258667, + "rewards/rejected": -0.5348542332649231, + "step": 4010 + }, + { + "epoch": 0.77, + "learning_rate": 7.904506435266998e-07, + "logits/chosen": -1.5759592056274414, + "logits/rejected": -1.023024320602417, + "logps/chosen": -563.4696044921875, + "logps/rejected": -1408.36767578125, + "loss": 0.0499, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1717696487903595, + "rewards/margins": 0.3687570095062256, + "rewards/rejected": -0.5405266880989075, + "step": 4020 + }, + { + "epoch": 0.77, + "learning_rate": 7.783601277613378e-07, + "logits/chosen": -1.4576669931411743, + "logits/rejected": -0.9521520733833313, + "logps/chosen": -670.4305419921875, + "logps/rejected": -1433.7462158203125, + "loss": 0.0331, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.18914173543453217, + "rewards/margins": 0.38103678822517395, + "rewards/rejected": -0.5701784491539001, + "step": 4030 + }, + { + "epoch": 0.77, + "learning_rate": 7.66345721139003e-07, + "logits/chosen": -1.6169652938842773, + "logits/rejected": -1.045633316040039, + "logps/chosen": -625.4329223632812, + "logps/rejected": -1342.4127197265625, + "loss": 0.0622, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.18822208046913147, + "rewards/margins": 0.3339303135871887, + "rewards/rejected": -0.5221523642539978, + "step": 4040 + }, + { + "epoch": 0.77, + "learning_rate": 7.544079547848183e-07, + "logits/chosen": -1.6720600128173828, + "logits/rejected": -0.9817830324172974, + "logps/chosen": -605.169189453125, + "logps/rejected": -1312.1810302734375, + "loss": 0.0644, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16692718863487244, + "rewards/margins": 0.32599127292633057, + "rewards/rejected": -0.4929184913635254, + "step": 4050 + }, + { + "epoch": 0.77, + "learning_rate": 7.425473564358457e-07, + "logits/chosen": -1.6741440296173096, + "logits/rejected": -1.0046089887619019, + "logps/chosen": -649.8138427734375, + "logps/rejected": -1462.4537353515625, + "loss": 0.0276, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.19399216771125793, + "rewards/margins": 0.3730100989341736, + "rewards/rejected": -0.5670022368431091, + "step": 4060 + }, + { + "epoch": 0.78, + "learning_rate": 7.307644504177539e-07, + "logits/chosen": -1.5330688953399658, + "logits/rejected": -0.9372521638870239, + "logps/chosen": -686.667236328125, + "logps/rejected": -1373.4873046875, + "loss": 0.0607, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.2182857096195221, + "rewards/margins": 0.31643131375312805, + "rewards/rejected": -0.5347169637680054, + "step": 4070 + }, + { + "epoch": 0.78, + "learning_rate": 7.190597576216385e-07, + "logits/chosen": -1.4250767230987549, + "logits/rejected": -0.8781077265739441, + "logps/chosen": -703.5140991210938, + "logps/rejected": -1407.7247314453125, + "loss": 0.061, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.242970272898674, + "rewards/margins": 0.3056332767009735, + "rewards/rejected": -0.5486035943031311, + "step": 4080 + }, + { + "epoch": 0.78, + "learning_rate": 7.074337954809945e-07, + "logits/chosen": -1.5299713611602783, + "logits/rejected": -1.0373504161834717, + "logps/chosen": -718.3067626953125, + "logps/rejected": -1378.4388427734375, + "loss": 0.0615, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2197229117155075, + "rewards/margins": 0.32516324520111084, + "rewards/rejected": -0.5448861718177795, + "step": 4090 + }, + { + "epoch": 0.78, + "learning_rate": 6.958870779488447e-07, + "logits/chosen": -1.651827096939087, + "logits/rejected": -0.9306151270866394, + "logps/chosen": -670.0147705078125, + "logps/rejected": -1367.4783935546875, + "loss": 0.0726, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2136562615633011, + "rewards/margins": 0.3230150640010834, + "rewards/rejected": -0.5366712808609009, + "step": 4100 + }, + { + "epoch": 0.78, + "learning_rate": 6.844201154750176e-07, + "logits/chosen": -1.382893443107605, + "logits/rejected": -0.9818013906478882, + "logps/chosen": -585.5609130859375, + "logps/rejected": -1219.7158203125, + "loss": 0.0758, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.20312824845314026, + "rewards/margins": 0.28206413984298706, + "rewards/rejected": -0.4851924479007721, + "step": 4110 + }, + { + "epoch": 0.78, + "learning_rate": 6.730334149835788e-07, + "logits/chosen": -1.7012789249420166, + "logits/rejected": -1.2583403587341309, + "logps/chosen": -638.0623779296875, + "logps/rejected": -1250.6907958984375, + "loss": 0.0732, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.19990402460098267, + "rewards/margins": 0.31234210729599, + "rewards/rejected": -0.5122461915016174, + "step": 4120 + }, + { + "epoch": 0.79, + "learning_rate": 6.617274798504286e-07, + "logits/chosen": -1.6098655462265015, + "logits/rejected": -1.2438991069793701, + "logps/chosen": -750.6119384765625, + "logps/rejected": -1468.4130859375, + "loss": 0.0818, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.239259272813797, + "rewards/margins": 0.3222685754299164, + "rewards/rejected": -0.5615277886390686, + "step": 4130 + }, + { + "epoch": 0.79, + "learning_rate": 6.505028098810407e-07, + "logits/chosen": -1.4979428052902222, + "logits/rejected": -0.9974969029426575, + "logps/chosen": -664.5301513671875, + "logps/rejected": -1350.095947265625, + "loss": 0.0471, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.21733012795448303, + "rewards/margins": 0.3036603033542633, + "rewards/rejected": -0.5209903717041016, + "step": 4140 + }, + { + "epoch": 0.79, + "learning_rate": 6.393599012883709e-07, + "logits/chosen": -1.5405172109603882, + "logits/rejected": -1.0617095232009888, + "logps/chosen": -608.335205078125, + "logps/rejected": -1415.77099609375, + "loss": 0.0558, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.19087664783000946, + "rewards/margins": 0.360538125038147, + "rewards/rejected": -0.5514147877693176, + "step": 4150 + }, + { + "epoch": 0.79, + "learning_rate": 6.282992466709247e-07, + "logits/chosen": -1.568500280380249, + "logits/rejected": -1.1401845216751099, + "logps/chosen": -619.8330078125, + "logps/rejected": -1272.417236328125, + "loss": 0.0681, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.18364539742469788, + "rewards/margins": 0.29479947686195374, + "rewards/rejected": -0.4784448742866516, + "step": 4160 + }, + { + "epoch": 0.79, + "learning_rate": 6.17321334990973e-07, + "logits/chosen": -1.444135069847107, + "logits/rejected": -0.8712388277053833, + "logps/chosen": -682.9646606445312, + "logps/rejected": -1218.102783203125, + "loss": 0.1074, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.22160005569458008, + "rewards/margins": 0.2552647292613983, + "rewards/rejected": -0.47686487436294556, + "step": 4170 + }, + { + "epoch": 0.8, + "learning_rate": 6.064266515529419e-07, + "logits/chosen": -1.6089346408843994, + "logits/rejected": -1.148726463317871, + "logps/chosen": -718.4061279296875, + "logps/rejected": -1356.4761962890625, + "loss": 0.0701, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23002943396568298, + "rewards/margins": 0.29873934388160706, + "rewards/rejected": -0.5287687182426453, + "step": 4180 + }, + { + "epoch": 0.8, + "learning_rate": 5.956156779819586e-07, + "logits/chosen": -1.4750916957855225, + "logits/rejected": -1.0166985988616943, + "logps/chosen": -726.7098388671875, + "logps/rejected": -1318.445556640625, + "loss": 0.0743, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1983887255191803, + "rewards/margins": 0.3115563988685608, + "rewards/rejected": -0.5099452137947083, + "step": 4190 + }, + { + "epoch": 0.8, + "learning_rate": 5.848888922025553e-07, + "logits/chosen": -1.6387817859649658, + "logits/rejected": -1.0083996057510376, + "logps/chosen": -603.5410766601562, + "logps/rejected": -1365.437744140625, + "loss": 0.0537, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.17825394868850708, + "rewards/margins": 0.33875900506973267, + "rewards/rejected": -0.5170130133628845, + "step": 4200 + }, + { + "epoch": 0.8, + "learning_rate": 5.742467684175473e-07, + "logits/chosen": -1.5205031633377075, + "logits/rejected": -0.9753937721252441, + "logps/chosen": -701.4137573242188, + "logps/rejected": -1442.636962890625, + "loss": 0.0652, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2099340856075287, + "rewards/margins": 0.3407210111618042, + "rewards/rejected": -0.5506550669670105, + "step": 4210 + }, + { + "epoch": 0.8, + "learning_rate": 5.636897770870667e-07, + "logits/chosen": -1.5778207778930664, + "logits/rejected": -0.8220159411430359, + "logps/chosen": -642.7035522460938, + "logps/rejected": -1299.4345703125, + "loss": 0.0646, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1969667673110962, + "rewards/margins": 0.3171537518501282, + "rewards/rejected": -0.5141205191612244, + "step": 4220 + }, + { + "epoch": 0.81, + "learning_rate": 5.532183849077651e-07, + "logits/chosen": -1.692636489868164, + "logits/rejected": -1.019176721572876, + "logps/chosen": -719.9124755859375, + "logps/rejected": -1373.59033203125, + "loss": 0.0411, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.2058321237564087, + "rewards/margins": 0.3241458237171173, + "rewards/rejected": -0.5299779176712036, + "step": 4230 + }, + { + "epoch": 0.81, + "learning_rate": 5.428330547921809e-07, + "logits/chosen": -1.5508053302764893, + "logits/rejected": -0.8036486506462097, + "logps/chosen": -709.2622680664062, + "logps/rejected": -1367.7880859375, + "loss": 0.0642, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.22711780667304993, + "rewards/margins": 0.30738967657089233, + "rewards/rejected": -0.5345073938369751, + "step": 4240 + }, + { + "epoch": 0.81, + "learning_rate": 5.32534245848278e-07, + "logits/chosen": -1.4726245403289795, + "logits/rejected": -0.9846108555793762, + "logps/chosen": -694.281005859375, + "logps/rejected": -1363.457275390625, + "loss": 0.0661, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2253766804933548, + "rewards/margins": 0.31212395429611206, + "rewards/rejected": -0.5375006794929504, + "step": 4250 + }, + { + "epoch": 0.81, + "learning_rate": 5.223224133591475e-07, + "logits/chosen": -1.591333031654358, + "logits/rejected": -0.8938309550285339, + "logps/chosen": -677.0285034179688, + "logps/rejected": -1240.8616943359375, + "loss": 0.0543, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.20838508009910583, + "rewards/margins": 0.2973417639732361, + "rewards/rejected": -0.5057269334793091, + "step": 4260 + }, + { + "epoch": 0.81, + "learning_rate": 5.121980087628802e-07, + "logits/chosen": -1.2787164449691772, + "logits/rejected": -0.9021083116531372, + "logps/chosen": -727.6304931640625, + "logps/rejected": -1371.044921875, + "loss": 0.1052, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2336258590221405, + "rewards/margins": 0.27272963523864746, + "rewards/rejected": -0.5063555240631104, + "step": 4270 + }, + { + "epoch": 0.82, + "learning_rate": 5.021614796326155e-07, + "logits/chosen": -1.711517333984375, + "logits/rejected": -1.1945267915725708, + "logps/chosen": -593.6410522460938, + "logps/rejected": -1357.221435546875, + "loss": 0.0416, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.16998571157455444, + "rewards/margins": 0.3332723081111908, + "rewards/rejected": -0.5032579898834229, + "step": 4280 + }, + { + "epoch": 0.82, + "learning_rate": 4.922132696567463e-07, + "logits/chosen": -1.4213650226593018, + "logits/rejected": -0.9432398080825806, + "logps/chosen": -738.092529296875, + "logps/rejected": -1342.125732421875, + "loss": 0.0948, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.21361108124256134, + "rewards/margins": 0.284687340259552, + "rewards/rejected": -0.4982984662055969, + "step": 4290 + }, + { + "epoch": 0.82, + "learning_rate": 4.823538186193097e-07, + "logits/chosen": -1.7533416748046875, + "logits/rejected": -1.3407680988311768, + "logps/chosen": -615.1173706054688, + "logps/rejected": -1254.2808837890625, + "loss": 0.0931, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1531359702348709, + "rewards/margins": 0.30277323722839355, + "rewards/rejected": -0.4559091627597809, + "step": 4300 + }, + { + "epoch": 0.82, + "learning_rate": 4.725835623805494e-07, + "logits/chosen": -1.6482717990875244, + "logits/rejected": -1.1470712423324585, + "logps/chosen": -706.8258056640625, + "logps/rejected": -1328.094482421875, + "loss": 0.1006, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.22576546669006348, + "rewards/margins": 0.2724280059337616, + "rewards/rejected": -0.49819356203079224, + "step": 4310 + }, + { + "epoch": 0.82, + "learning_rate": 4.6290293285763816e-07, + "logits/chosen": -1.7255032062530518, + "logits/rejected": -0.9146900177001953, + "logps/chosen": -600.8002319335938, + "logps/rejected": -1215.993408203125, + "loss": 0.0699, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.16509151458740234, + "rewards/margins": 0.32071444392204285, + "rewards/rejected": -0.4858059287071228, + "step": 4320 + }, + { + "epoch": 0.82, + "learning_rate": 4.533123580055909e-07, + "logits/chosen": -1.6732444763183594, + "logits/rejected": -1.1781762838363647, + "logps/chosen": -647.499267578125, + "logps/rejected": -1235.640380859375, + "loss": 0.0888, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.17901811003684998, + "rewards/margins": 0.27853089570999146, + "rewards/rejected": -0.45754894614219666, + "step": 4330 + }, + { + "epoch": 0.83, + "learning_rate": 4.438122617983442e-07, + "logits/chosen": -1.3904229402542114, + "logits/rejected": -0.8612979650497437, + "logps/chosen": -639.7564697265625, + "logps/rejected": -1298.111572265625, + "loss": 0.0449, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19331908226013184, + "rewards/margins": 0.31781238317489624, + "rewards/rejected": -0.5111314654350281, + "step": 4340 + }, + { + "epoch": 0.83, + "learning_rate": 4.344030642100133e-07, + "logits/chosen": -1.843650221824646, + "logits/rejected": -1.2812918424606323, + "logps/chosen": -640.6357421875, + "logps/rejected": -1231.58642578125, + "loss": 0.0726, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1844300776720047, + "rewards/margins": 0.2944430112838745, + "rewards/rejected": -0.478873074054718, + "step": 4350 + }, + { + "epoch": 0.83, + "learning_rate": 4.250851811963236e-07, + "logits/chosen": -1.2538540363311768, + "logits/rejected": -0.8775166273117065, + "logps/chosen": -620.1490478515625, + "logps/rejected": -1215.9146728515625, + "loss": 0.0662, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.20052723586559296, + "rewards/margins": 0.27596521377563477, + "rewards/rejected": -0.47649240493774414, + "step": 4360 + }, + { + "epoch": 0.83, + "learning_rate": 4.158590246762278e-07, + "logits/chosen": -1.5852545499801636, + "logits/rejected": -0.8562048077583313, + "logps/chosen": -759.5594482421875, + "logps/rejected": -1533.1597900390625, + "loss": 0.0545, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.2186473309993744, + "rewards/margins": 0.3518058657646179, + "rewards/rejected": -0.5704531669616699, + "step": 4370 + }, + { + "epoch": 0.83, + "learning_rate": 4.0672500251369204e-07, + "logits/chosen": -1.5913574695587158, + "logits/rejected": -0.9126864671707153, + "logps/chosen": -683.1362915039062, + "logps/rejected": -1301.7376708984375, + "loss": 0.0587, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2215549498796463, + "rewards/margins": 0.2973330020904541, + "rewards/rejected": -0.518887996673584, + "step": 4380 + }, + { + "epoch": 0.84, + "learning_rate": 3.976835184996644e-07, + "logits/chosen": -1.5635426044464111, + "logits/rejected": -0.9335036277770996, + "logps/chosen": -658.034423828125, + "logps/rejected": -1340.171630859375, + "loss": 0.0618, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.18267235159873962, + "rewards/margins": 0.3069957494735718, + "rewards/rejected": -0.4896681308746338, + "step": 4390 + }, + { + "epoch": 0.84, + "learning_rate": 3.887349723342304e-07, + "logits/chosen": -1.6750385761260986, + "logits/rejected": -1.050916314125061, + "logps/chosen": -796.87353515625, + "logps/rejected": -1338.190673828125, + "loss": 0.0873, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.21675412356853485, + "rewards/margins": 0.2645486891269684, + "rewards/rejected": -0.48130282759666443, + "step": 4400 + }, + { + "epoch": 0.84, + "learning_rate": 3.798797596089351e-07, + "logits/chosen": -1.6474109888076782, + "logits/rejected": -1.0717977285385132, + "logps/chosen": -796.557861328125, + "logps/rejected": -1308.2513427734375, + "loss": 0.1033, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.24403448402881622, + "rewards/margins": 0.25606316328048706, + "rewards/rejected": -0.5000976920127869, + "step": 4410 + }, + { + "epoch": 0.84, + "learning_rate": 3.711182717893011e-07, + "logits/chosen": -1.7235870361328125, + "logits/rejected": -1.0134425163269043, + "logps/chosen": -738.4308471679688, + "logps/rejected": -1354.4034423828125, + "loss": 0.1004, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.21404734253883362, + "rewards/margins": 0.2857828736305237, + "rewards/rejected": -0.4998301863670349, + "step": 4420 + }, + { + "epoch": 0.84, + "learning_rate": 3.624508961975215e-07, + "logits/chosen": -1.6678346395492554, + "logits/rejected": -1.052130937576294, + "logps/chosen": -560.8265380859375, + "logps/rejected": -1179.17626953125, + "loss": 0.0663, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1430796980857849, + "rewards/margins": 0.310772180557251, + "rewards/rejected": -0.4538518786430359, + "step": 4430 + }, + { + "epoch": 0.85, + "learning_rate": 3.538780159953348e-07, + "logits/chosen": -1.6310476064682007, + "logits/rejected": -0.8700817227363586, + "logps/chosen": -661.81494140625, + "logps/rejected": -1262.3851318359375, + "loss": 0.0626, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1743876039981842, + "rewards/margins": 0.33026957511901855, + "rewards/rejected": -0.5046571493148804, + "step": 4440 + }, + { + "epoch": 0.85, + "learning_rate": 3.454000101670901e-07, + "logits/chosen": -1.432461142539978, + "logits/rejected": -0.7634763717651367, + "logps/chosen": -572.1975708007812, + "logps/rejected": -1243.5718994140625, + "loss": 0.0669, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1487385481595993, + "rewards/margins": 0.3277491331100464, + "rewards/rejected": -0.4764877259731293, + "step": 4450 + }, + { + "epoch": 0.85, + "learning_rate": 3.3701725350299143e-07, + "logits/chosen": -1.6553170680999756, + "logits/rejected": -0.9621874094009399, + "logps/chosen": -617.3048706054688, + "logps/rejected": -1175.0587158203125, + "loss": 0.0827, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.18988697230815887, + "rewards/margins": 0.2635541558265686, + "rewards/rejected": -0.45344114303588867, + "step": 4460 + }, + { + "epoch": 0.85, + "learning_rate": 3.2873011658252796e-07, + "logits/chosen": -1.712868094444275, + "logits/rejected": -0.9782907366752625, + "logps/chosen": -598.8307495117188, + "logps/rejected": -1285.5618896484375, + "loss": 0.052, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1676691621541977, + "rewards/margins": 0.33848708868026733, + "rewards/rejected": -0.506156325340271, + "step": 4470 + }, + { + "epoch": 0.85, + "learning_rate": 3.2053896575809426e-07, + "logits/chosen": -1.3337204456329346, + "logits/rejected": -1.099057674407959, + "logps/chosen": -584.3330078125, + "logps/rejected": -1244.3043212890625, + "loss": 0.0771, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18744370341300964, + "rewards/margins": 0.28308480978012085, + "rewards/rejected": -0.4705285131931305, + "step": 4480 + }, + { + "epoch": 0.86, + "learning_rate": 3.124441631387931e-07, + "logits/chosen": -1.5639264583587646, + "logits/rejected": -0.8667289018630981, + "logps/chosen": -647.0604248046875, + "logps/rejected": -1290.8948974609375, + "loss": 0.0658, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1925622522830963, + "rewards/margins": 0.3208863437175751, + "rewards/rejected": -0.5134485960006714, + "step": 4490 + }, + { + "epoch": 0.86, + "learning_rate": 3.044460665744284e-07, + "logits/chosen": -1.630784034729004, + "logits/rejected": -0.9906666874885559, + "logps/chosen": -730.5087280273438, + "logps/rejected": -1308.167236328125, + "loss": 0.1002, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2203700840473175, + "rewards/margins": 0.29447072744369507, + "rewards/rejected": -0.5148407816886902, + "step": 4500 + }, + { + "epoch": 0.86, + "learning_rate": 2.9654502963968575e-07, + "logits/chosen": -1.586451530456543, + "logits/rejected": -0.7961150407791138, + "logps/chosen": -586.2816162109375, + "logps/rejected": -1271.3458251953125, + "loss": 0.0616, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1716344654560089, + "rewards/margins": 0.32398098707199097, + "rewards/rejected": -0.49561548233032227, + "step": 4510 + }, + { + "epoch": 0.86, + "learning_rate": 2.8874140161849915e-07, + "logits/chosen": -1.7378228902816772, + "logits/rejected": -1.12490975856781, + "logps/chosen": -689.92138671875, + "logps/rejected": -1360.4993896484375, + "loss": 0.0537, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.19938451051712036, + "rewards/margins": 0.3099740147590637, + "rewards/rejected": -0.5093585252761841, + "step": 4520 + }, + { + "epoch": 0.86, + "learning_rate": 2.810355274886148e-07, + "logits/chosen": -1.5160925388336182, + "logits/rejected": -0.8361374139785767, + "logps/chosen": -664.5655517578125, + "logps/rejected": -1321.7562255859375, + "loss": 0.0627, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1796727478504181, + "rewards/margins": 0.3328721225261688, + "rewards/rejected": -0.5125448703765869, + "step": 4530 + }, + { + "epoch": 0.86, + "learning_rate": 2.7342774790633686e-07, + "logits/chosen": -1.582262396812439, + "logits/rejected": -1.2324196100234985, + "logps/chosen": -573.2413330078125, + "logps/rejected": -1255.7186279296875, + "loss": 0.0777, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17455413937568665, + "rewards/margins": 0.30101698637008667, + "rewards/rejected": -0.4755710959434509, + "step": 4540 + }, + { + "epoch": 0.87, + "learning_rate": 2.6591839919146963e-07, + "logits/chosen": -1.719139814376831, + "logits/rejected": -0.8366669416427612, + "logps/chosen": -662.6421508789062, + "logps/rejected": -1291.188720703125, + "loss": 0.0676, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.19452355802059174, + "rewards/margins": 0.285775750875473, + "rewards/rejected": -0.4802993834018707, + "step": 4550 + }, + { + "epoch": 0.87, + "learning_rate": 2.58507813312448e-07, + "logits/chosen": -1.674212098121643, + "logits/rejected": -1.1627938747406006, + "logps/chosen": -719.9521484375, + "logps/rejected": -1362.1099853515625, + "loss": 0.0768, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22942790389060974, + "rewards/margins": 0.30024927854537964, + "rewards/rejected": -0.5296772122383118, + "step": 4560 + }, + { + "epoch": 0.87, + "learning_rate": 2.511963178716648e-07, + "logits/chosen": -1.5046024322509766, + "logits/rejected": -1.0431791543960571, + "logps/chosen": -589.3131713867188, + "logps/rejected": -1238.647705078125, + "loss": 0.073, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.17821267247200012, + "rewards/margins": 0.2867395877838135, + "rewards/rejected": -0.464952290058136, + "step": 4570 + }, + { + "epoch": 0.87, + "learning_rate": 2.439842360909864e-07, + "logits/chosen": -1.5564829111099243, + "logits/rejected": -1.1388423442840576, + "logps/chosen": -673.455810546875, + "logps/rejected": -1347.0482177734375, + "loss": 0.062, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.20791272819042206, + "rewards/margins": 0.3196045756340027, + "rewards/rejected": -0.5275173187255859, + "step": 4580 + }, + { + "epoch": 0.87, + "learning_rate": 2.3687188679746314e-07, + "logits/chosen": -1.5243313312530518, + "logits/rejected": -1.1454176902770996, + "logps/chosen": -675.7528686523438, + "logps/rejected": -1301.16064453125, + "loss": 0.0651, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.20887453854084015, + "rewards/margins": 0.28879880905151367, + "rewards/rejected": -0.497673362493515, + "step": 4590 + }, + { + "epoch": 0.88, + "learning_rate": 2.2985958440923772e-07, + "logits/chosen": -1.8085925579071045, + "logits/rejected": -1.1682274341583252, + "logps/chosen": -712.1884765625, + "logps/rejected": -1192.75732421875, + "loss": 0.0929, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19347521662712097, + "rewards/margins": 0.2465830147266388, + "rewards/rejected": -0.44005829095840454, + "step": 4600 + }, + { + "epoch": 0.88, + "learning_rate": 2.2294763892164284e-07, + "logits/chosen": -1.334749698638916, + "logits/rejected": -1.004903793334961, + "logps/chosen": -605.7439575195312, + "logps/rejected": -1201.1614990234375, + "loss": 0.0744, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.19247183203697205, + "rewards/margins": 0.2719135582447052, + "rewards/rejected": -0.46438542008399963, + "step": 4610 + }, + { + "epoch": 0.88, + "learning_rate": 2.1613635589349756e-07, + "logits/chosen": -1.6107902526855469, + "logits/rejected": -1.1624407768249512, + "logps/chosen": -651.1942138671875, + "logps/rejected": -1337.1292724609375, + "loss": 0.0723, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1878310590982437, + "rewards/margins": 0.32857757806777954, + "rewards/rejected": -0.5164086222648621, + "step": 4620 + }, + { + "epoch": 0.88, + "learning_rate": 2.094260364336026e-07, + "logits/chosen": -1.7380402088165283, + "logits/rejected": -1.0198732614517212, + "logps/chosen": -703.918701171875, + "logps/rejected": -1186.9219970703125, + "loss": 0.0744, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.20499920845031738, + "rewards/margins": 0.2569231390953064, + "rewards/rejected": -0.4619223475456238, + "step": 4630 + }, + { + "epoch": 0.88, + "learning_rate": 2.0281697718742333e-07, + "logits/chosen": -1.5836068391799927, + "logits/rejected": -1.1563191413879395, + "logps/chosen": -655.0853271484375, + "logps/rejected": -1434.5350341796875, + "loss": 0.06, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.19051814079284668, + "rewards/margins": 0.33394360542297363, + "rewards/rejected": -0.5244617462158203, + "step": 4640 + }, + { + "epoch": 0.89, + "learning_rate": 1.9630947032398068e-07, + "logits/chosen": -1.7695211172103882, + "logits/rejected": -1.111585021018982, + "logps/chosen": -688.1593017578125, + "logps/rejected": -1326.374267578125, + "loss": 0.0681, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.2087709903717041, + "rewards/margins": 0.29607993364334106, + "rewards/rejected": -0.5048509240150452, + "step": 4650 + }, + { + "epoch": 0.89, + "learning_rate": 1.899038035229342e-07, + "logits/chosen": -1.3365787267684937, + "logits/rejected": -0.951318621635437, + "logps/chosen": -495.21746826171875, + "logps/rejected": -1190.7960205078125, + "loss": 0.064, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15549428761005402, + "rewards/margins": 0.30532047152519226, + "rewards/rejected": -0.46081477403640747, + "step": 4660 + }, + { + "epoch": 0.89, + "learning_rate": 1.8360025996186138e-07, + "logits/chosen": -1.6793752908706665, + "logits/rejected": -1.2577649354934692, + "logps/chosen": -654.9149169921875, + "logps/rejected": -1316.4361572265625, + "loss": 0.0554, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.18013811111450195, + "rewards/margins": 0.31460148096084595, + "rewards/rejected": -0.4947396218776703, + "step": 4670 + }, + { + "epoch": 0.89, + "learning_rate": 1.7739911830374352e-07, + "logits/chosen": -1.44693922996521, + "logits/rejected": -0.7544930577278137, + "logps/chosen": -736.6383056640625, + "logps/rejected": -1203.6229248046875, + "loss": 0.0792, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2087833434343338, + "rewards/margins": 0.2550183832645416, + "rewards/rejected": -0.463801771402359, + "step": 4680 + }, + { + "epoch": 0.89, + "learning_rate": 1.713006526846439e-07, + "logits/chosen": -1.4717142581939697, + "logits/rejected": -0.8292557597160339, + "logps/chosen": -659.34033203125, + "logps/rejected": -1225.5867919921875, + "loss": 0.0778, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.18005402386188507, + "rewards/margins": 0.2781462073326111, + "rewards/rejected": -0.45820027589797974, + "step": 4690 + }, + { + "epoch": 0.9, + "learning_rate": 1.6530513270159116e-07, + "logits/chosen": -1.7716267108917236, + "logits/rejected": -1.1000728607177734, + "logps/chosen": -632.2623291015625, + "logps/rejected": -1293.8427734375, + "loss": 0.0712, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.17110465466976166, + "rewards/margins": 0.3263039290904999, + "rewards/rejected": -0.49740856885910034, + "step": 4700 + }, + { + "epoch": 0.9, + "learning_rate": 1.59412823400657e-07, + "logits/chosen": -1.740073800086975, + "logits/rejected": -0.9315148591995239, + "logps/chosen": -628.2022705078125, + "logps/rejected": -1271.313232421875, + "loss": 0.0656, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.18129108846187592, + "rewards/margins": 0.31177636981010437, + "rewards/rejected": -0.4930674433708191, + "step": 4710 + }, + { + "epoch": 0.9, + "learning_rate": 1.5362398526524463e-07, + "logits/chosen": -1.902126669883728, + "logits/rejected": -1.1279147863388062, + "logps/chosen": -647.0514526367188, + "logps/rejected": -1307.1373291015625, + "loss": 0.0724, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16059984266757965, + "rewards/margins": 0.32505813241004944, + "rewards/rejected": -0.4856579899787903, + "step": 4720 + }, + { + "epoch": 0.9, + "learning_rate": 1.4793887420457008e-07, + "logits/chosen": -1.327126383781433, + "logits/rejected": -0.998917281627655, + "logps/chosen": -664.4324951171875, + "logps/rejected": -1342.4854736328125, + "loss": 0.0733, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.23212707042694092, + "rewards/margins": 0.2786712944507599, + "rewards/rejected": -0.5107983350753784, + "step": 4730 + }, + { + "epoch": 0.9, + "learning_rate": 1.4235774154234855e-07, + "logits/chosen": -1.6532478332519531, + "logits/rejected": -0.874586284160614, + "logps/chosen": -730.2886962890625, + "logps/rejected": -1362.4957275390625, + "loss": 0.0468, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.21704640984535217, + "rewards/margins": 0.31448471546173096, + "rewards/rejected": -0.5315311551094055, + "step": 4740 + }, + { + "epoch": 0.9, + "learning_rate": 1.368808340056879e-07, + "logits/chosen": -1.6743333339691162, + "logits/rejected": -1.163001298904419, + "logps/chosen": -662.5440673828125, + "logps/rejected": -1255.9044189453125, + "loss": 0.0612, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21114924550056458, + "rewards/margins": 0.2891456186771393, + "rewards/rejected": -0.5002948641777039, + "step": 4750 + }, + { + "epoch": 0.91, + "learning_rate": 1.31508393714177e-07, + "logits/chosen": -1.724111557006836, + "logits/rejected": -1.1406222581863403, + "logps/chosen": -682.9302978515625, + "logps/rejected": -1390.936279296875, + "loss": 0.0674, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.18357078731060028, + "rewards/margins": 0.3388100266456604, + "rewards/rejected": -0.5223808288574219, + "step": 4760 + }, + { + "epoch": 0.91, + "learning_rate": 1.2624065816918414e-07, + "logits/chosen": -1.5875694751739502, + "logits/rejected": -0.9316909909248352, + "logps/chosen": -637.0687255859375, + "logps/rejected": -1340.5999755859375, + "loss": 0.0489, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.20680876076221466, + "rewards/margins": 0.33463913202285767, + "rewards/rejected": -0.5414477586746216, + "step": 4770 + }, + { + "epoch": 0.91, + "learning_rate": 1.210778602433596e-07, + "logits/chosen": -1.452817678451538, + "logits/rejected": -1.0641636848449707, + "logps/chosen": -696.5808715820312, + "logps/rejected": -1312.058837890625, + "loss": 0.0763, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.21340656280517578, + "rewards/margins": 0.2833073139190674, + "rewards/rejected": -0.49671393632888794, + "step": 4780 + }, + { + "epoch": 0.91, + "learning_rate": 1.1602022817033709e-07, + "logits/chosen": -1.572145700454712, + "logits/rejected": -1.0893280506134033, + "logps/chosen": -582.0816040039062, + "logps/rejected": -1350.13427734375, + "loss": 0.0584, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1654617041349411, + "rewards/margins": 0.3677004277706146, + "rewards/rejected": -0.5331621766090393, + "step": 4790 + }, + { + "epoch": 0.91, + "learning_rate": 1.1106798553464804e-07, + "logits/chosen": -1.7322986125946045, + "logits/rejected": -1.2693445682525635, + "logps/chosen": -668.9528198242188, + "logps/rejected": -1356.50634765625, + "loss": 0.0663, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.19448697566986084, + "rewards/margins": 0.31413862109184265, + "rewards/rejected": -0.5086256265640259, + "step": 4800 + }, + { + "epoch": 0.92, + "learning_rate": 1.0622135126183514e-07, + "logits/chosen": -1.541881799697876, + "logits/rejected": -1.1039928197860718, + "logps/chosen": -730.3561401367188, + "logps/rejected": -1410.5531005859375, + "loss": 0.0625, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2166539430618286, + "rewards/margins": 0.327588826417923, + "rewards/rejected": -0.5442427396774292, + "step": 4810 + }, + { + "epoch": 0.92, + "learning_rate": 1.0148053960877396e-07, + "logits/chosen": -1.627398133277893, + "logits/rejected": -1.1000397205352783, + "logps/chosen": -591.6277465820312, + "logps/rejected": -1181.8179931640625, + "loss": 0.085, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.18314941227436066, + "rewards/margins": 0.2747834324836731, + "rewards/rejected": -0.45793280005455017, + "step": 4820 + }, + { + "epoch": 0.92, + "learning_rate": 9.684576015420277e-08, + "logits/chosen": -1.6205629110336304, + "logits/rejected": -0.995998740196228, + "logps/chosen": -659.4131469726562, + "logps/rejected": -1200.470947265625, + "loss": 0.0717, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19112655520439148, + "rewards/margins": 0.2899617552757263, + "rewards/rejected": -0.4810883402824402, + "step": 4830 + }, + { + "epoch": 0.92, + "learning_rate": 9.23172177894574e-08, + "logits/chosen": -1.6110188961029053, + "logits/rejected": -1.0364059209823608, + "logps/chosen": -547.3153076171875, + "logps/rejected": -1314.8369140625, + "loss": 0.054, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1694483757019043, + "rewards/margins": 0.34166327118873596, + "rewards/rejected": -0.5111116170883179, + "step": 4840 + }, + { + "epoch": 0.92, + "learning_rate": 8.78951127094127e-08, + "logits/chosen": -1.5068880319595337, + "logits/rejected": -0.9471427798271179, + "logps/chosen": -521.8685302734375, + "logps/rejected": -1296.308349609375, + "loss": 0.0578, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.13810133934020996, + "rewards/margins": 0.3386577069759369, + "rewards/rejected": -0.47675904631614685, + "step": 4850 + }, + { + "epoch": 0.93, + "learning_rate": 8.357964040363209e-08, + "logits/chosen": -1.290801763534546, + "logits/rejected": -0.9146105647087097, + "logps/chosen": -596.8779296875, + "logps/rejected": -1347.836181640625, + "loss": 0.0449, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.20246358215808868, + "rewards/margins": 0.33046966791152954, + "rewards/rejected": -0.5329331755638123, + "step": 4860 + }, + { + "epoch": 0.93, + "learning_rate": 7.937099164772699e-08, + "logits/chosen": -1.7896692752838135, + "logits/rejected": -1.0212453603744507, + "logps/chosen": -690.1158447265625, + "logps/rejected": -1228.534912109375, + "loss": 0.0814, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1973530352115631, + "rewards/margins": 0.2856197655200958, + "rewards/rejected": -0.48297280073165894, + "step": 4870 + }, + { + "epoch": 0.93, + "learning_rate": 7.526935249492245e-08, + "logits/chosen": -1.679908037185669, + "logits/rejected": -1.176174283027649, + "logps/chosen": -579.732421875, + "logps/rejected": -1254.651611328125, + "loss": 0.0666, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1832021027803421, + "rewards/margins": 0.2981014847755432, + "rewards/rejected": -0.4813036024570465, + "step": 4880 + }, + { + "epoch": 0.93, + "learning_rate": 7.127490426783124e-08, + "logits/chosen": -1.7154324054718018, + "logits/rejected": -1.0123614072799683, + "logps/chosen": -632.9345703125, + "logps/rejected": -1300.0052490234375, + "loss": 0.0656, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.19814178347587585, + "rewards/margins": 0.32001256942749023, + "rewards/rejected": -0.5181543231010437, + "step": 4890 + }, + { + "epoch": 0.93, + "learning_rate": 6.738782355044048e-08, + "logits/chosen": -1.5465729236602783, + "logits/rejected": -0.9724661707878113, + "logps/chosen": -593.3762817382812, + "logps/rejected": -1277.333251953125, + "loss": 0.0448, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.17785096168518066, + "rewards/margins": 0.32004857063293457, + "rewards/rejected": -0.49789953231811523, + "step": 4900 + }, + { + "epoch": 0.94, + "learning_rate": 6.360828218030191e-08, + "logits/chosen": -1.4088236093521118, + "logits/rejected": -0.7327739000320435, + "logps/chosen": -690.17626953125, + "logps/rejected": -1354.6500244140625, + "loss": 0.0651, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.20846012234687805, + "rewards/margins": 0.302595853805542, + "rewards/rejected": -0.5110559463500977, + "step": 4910 + }, + { + "epoch": 0.94, + "learning_rate": 5.993644724093889e-08, + "logits/chosen": -1.6966909170150757, + "logits/rejected": -0.9541479349136353, + "logps/chosen": -614.1896362304688, + "logps/rejected": -1325.9324951171875, + "loss": 0.0657, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.183970108628273, + "rewards/margins": 0.33520275354385376, + "rewards/rejected": -0.5191728472709656, + "step": 4920 + }, + { + "epoch": 0.94, + "learning_rate": 5.637248105445775e-08, + "logits/chosen": -1.7550580501556396, + "logits/rejected": -0.9308683276176453, + "logps/chosen": -793.6554565429688, + "logps/rejected": -1485.291259765625, + "loss": 0.0541, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.23346392810344696, + "rewards/margins": 0.3398323655128479, + "rewards/rejected": -0.573296308517456, + "step": 4930 + }, + { + "epoch": 0.94, + "learning_rate": 5.291654117437262e-08, + "logits/chosen": -1.7816959619522095, + "logits/rejected": -1.0950982570648193, + "logps/chosen": -879.9280395507812, + "logps/rejected": -1422.867431640625, + "loss": 0.077, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21035346388816833, + "rewards/margins": 0.30386513471603394, + "rewards/rejected": -0.5142186284065247, + "step": 4940 + }, + { + "epoch": 0.94, + "learning_rate": 4.956878037864044e-08, + "logits/chosen": -1.4438133239746094, + "logits/rejected": -0.9569295048713684, + "logps/chosen": -665.3748168945312, + "logps/rejected": -1342.6187744140625, + "loss": 0.0789, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.19597147405147552, + "rewards/margins": 0.32199811935424805, + "rewards/rejected": -0.51796954870224, + "step": 4950 + }, + { + "epoch": 0.94, + "learning_rate": 4.632934666290778e-08, + "logits/chosen": -1.6642974615097046, + "logits/rejected": -0.914962649345398, + "logps/chosen": -649.2335815429688, + "logps/rejected": -1294.827392578125, + "loss": 0.0438, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1978123039007187, + "rewards/margins": 0.32232779264450073, + "rewards/rejected": -0.5201401114463806, + "step": 4960 + }, + { + "epoch": 0.95, + "learning_rate": 4.319838323396691e-08, + "logits/chosen": -2.037479877471924, + "logits/rejected": -0.9676597714424133, + "logps/chosen": -689.044921875, + "logps/rejected": -1347.722900390625, + "loss": 0.0454, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1525397002696991, + "rewards/margins": 0.3588055968284607, + "rewards/rejected": -0.5113453269004822, + "step": 4970 + }, + { + "epoch": 0.95, + "learning_rate": 4.017602850342584e-08, + "logits/chosen": -1.6456416845321655, + "logits/rejected": -1.1140178442001343, + "logps/chosen": -595.4800415039062, + "logps/rejected": -1429.136962890625, + "loss": 0.0368, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.1686953455209732, + "rewards/margins": 0.37380266189575195, + "rewards/rejected": -0.542497992515564, + "step": 4980 + }, + { + "epoch": 0.95, + "learning_rate": 3.7262416081589866e-08, + "logits/chosen": -1.7311760187149048, + "logits/rejected": -0.8864547610282898, + "logps/chosen": -650.812744140625, + "logps/rejected": -1239.8447265625, + "loss": 0.0841, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.17808124423027039, + "rewards/margins": 0.33711546659469604, + "rewards/rejected": -0.5151967406272888, + "step": 4990 + }, + { + "epoch": 0.95, + "learning_rate": 3.445767477155443e-08, + "logits/chosen": -1.6467472314834595, + "logits/rejected": -0.9199285507202148, + "logps/chosen": -666.3021240234375, + "logps/rejected": -1309.499755859375, + "loss": 0.0378, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.17945322394371033, + "rewards/margins": 0.33249643445014954, + "rewards/rejected": -0.5119495987892151, + "step": 5000 + }, + { + "epoch": 0.95, + "learning_rate": 3.1761928563510956e-08, + "logits/chosen": -1.5655813217163086, + "logits/rejected": -0.9106910824775696, + "logps/chosen": -629.9158935546875, + "logps/rejected": -1288.5303955078125, + "loss": 0.0589, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18516629934310913, + "rewards/margins": 0.32465219497680664, + "rewards/rejected": -0.5098185539245605, + "step": 5010 + }, + { + "epoch": 0.96, + "learning_rate": 2.917529662926549e-08, + "logits/chosen": -1.6372337341308594, + "logits/rejected": -1.0175507068634033, + "logps/chosen": -697.6114501953125, + "logps/rejected": -1387.153076171875, + "loss": 0.0622, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.22449293732643127, + "rewards/margins": 0.31668001413345337, + "rewards/rejected": -0.5411729216575623, + "step": 5020 + }, + { + "epoch": 0.96, + "learning_rate": 2.669789331697148e-08, + "logits/chosen": -1.5427504777908325, + "logits/rejected": -1.123302698135376, + "logps/chosen": -681.3251953125, + "logps/rejected": -1218.051513671875, + "loss": 0.0818, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1837557703256607, + "rewards/margins": 0.2799621820449829, + "rewards/rejected": -0.4637179970741272, + "step": 5030 + }, + { + "epoch": 0.96, + "learning_rate": 2.4329828146074096e-08, + "logits/chosen": -1.7077605724334717, + "logits/rejected": -0.9767044186592102, + "logps/chosen": -599.540771484375, + "logps/rejected": -1115.1014404296875, + "loss": 0.0819, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1757069081068039, + "rewards/margins": 0.2653740346431732, + "rewards/rejected": -0.44108089804649353, + "step": 5040 + }, + { + "epoch": 0.96, + "learning_rate": 2.20712058024683e-08, + "logits/chosen": -1.5907446146011353, + "logits/rejected": -1.2127859592437744, + "logps/chosen": -699.6365966796875, + "logps/rejected": -1363.490966796875, + "loss": 0.0845, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.21838286519050598, + "rewards/margins": 0.3182257413864136, + "rewards/rejected": -0.5366085767745972, + "step": 5050 + }, + { + "epoch": 0.96, + "learning_rate": 1.9922126133870568e-08, + "logits/chosen": -1.7688674926757812, + "logits/rejected": -1.2168480157852173, + "logps/chosen": -596.4293823242188, + "logps/rejected": -1142.8013916015625, + "loss": 0.1057, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1690121740102768, + "rewards/margins": 0.2577545940876007, + "rewards/rejected": -0.4267667829990387, + "step": 5060 + }, + { + "epoch": 0.97, + "learning_rate": 1.7882684145406616e-08, + "logits/chosen": -1.5492855310440063, + "logits/rejected": -0.8371337652206421, + "logps/chosen": -619.2363891601562, + "logps/rejected": -1319.695556640625, + "loss": 0.0486, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18321385979652405, + "rewards/margins": 0.3207260072231293, + "rewards/rejected": -0.5039398074150085, + "step": 5070 + }, + { + "epoch": 0.97, + "learning_rate": 1.595296999541057e-08, + "logits/chosen": -1.579831838607788, + "logits/rejected": -1.1370495557785034, + "logps/chosen": -599.0010986328125, + "logps/rejected": -1130.3555908203125, + "loss": 0.0703, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.17692309617996216, + "rewards/margins": 0.2634755074977875, + "rewards/rejected": -0.44039860367774963, + "step": 5080 + }, + { + "epoch": 0.97, + "learning_rate": 1.4133068991437903e-08, + "logits/chosen": -1.587927222251892, + "logits/rejected": -0.8456587791442871, + "logps/chosen": -758.2893676757812, + "logps/rejected": -1242.40869140625, + "loss": 0.0787, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.20422771573066711, + "rewards/margins": 0.28214433789253235, + "rewards/rejected": -0.4863719940185547, + "step": 5090 + }, + { + "epoch": 0.97, + "learning_rate": 1.2423061586496476e-08, + "logits/chosen": -1.3969306945800781, + "logits/rejected": -1.0442532300949097, + "logps/chosen": -612.6021728515625, + "logps/rejected": -1355.794677734375, + "loss": 0.0744, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.18990156054496765, + "rewards/margins": 0.3172561228275299, + "rewards/rejected": -0.5071576833724976, + "step": 5100 + }, + { + "epoch": 0.97, + "learning_rate": 1.0823023375489128e-08, + "logits/chosen": -1.7758409976959229, + "logits/rejected": -1.0913546085357666, + "logps/chosen": -643.1871948242188, + "logps/rejected": -1352.025634765625, + "loss": 0.0547, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1932803839445114, + "rewards/margins": 0.3292398452758789, + "rewards/rejected": -0.5225202441215515, + "step": 5110 + }, + { + "epoch": 0.98, + "learning_rate": 9.333025091870507e-09, + "logits/chosen": -1.6138235330581665, + "logits/rejected": -1.1149636507034302, + "logps/chosen": -720.6397705078125, + "logps/rejected": -1290.0531005859375, + "loss": 0.098, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.23546946048736572, + "rewards/margins": 0.2658223509788513, + "rewards/rejected": -0.5012918710708618, + "step": 5120 + }, + { + "epoch": 0.98, + "learning_rate": 7.95313260452263e-09, + "logits/chosen": -1.4443891048431396, + "logits/rejected": -0.9066373705863953, + "logps/chosen": -705.2442626953125, + "logps/rejected": -1347.6341552734375, + "loss": 0.0772, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.21808597445487976, + "rewards/margins": 0.3010413348674774, + "rewards/rejected": -0.5191273093223572, + "step": 5130 + }, + { + "epoch": 0.98, + "learning_rate": 6.683406914840818e-09, + "logits/chosen": -1.839503288269043, + "logits/rejected": -0.9399774670600891, + "logps/chosen": -724.562255859375, + "logps/rejected": -1332.5740966796875, + "loss": 0.0646, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20473381876945496, + "rewards/margins": 0.34322500228881836, + "rewards/rejected": -0.5479588508605957, + "step": 5140 + }, + { + "epoch": 0.98, + "learning_rate": 5.523904154037529e-09, + "logits/chosen": -1.5230966806411743, + "logits/rejected": -1.1400573253631592, + "logps/chosen": -695.667724609375, + "logps/rejected": -1319.90576171875, + "loss": 0.088, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.2105880081653595, + "rewards/margins": 0.27609017491340637, + "rewards/rejected": -0.4866781234741211, + "step": 5150 + }, + { + "epoch": 0.98, + "learning_rate": 4.474675580662113e-09, + "logits/chosen": -1.6967852115631104, + "logits/rejected": -0.9574136734008789, + "logps/chosen": -639.5457153320312, + "logps/rejected": -1343.309326171875, + "loss": 0.0461, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.17780372500419617, + "rewards/margins": 0.334329754114151, + "rewards/rejected": -0.5121334195137024, + "step": 5160 + }, + { + "epoch": 0.98, + "learning_rate": 3.5357675783331823e-09, + "logits/chosen": -1.5961401462554932, + "logits/rejected": -1.085395336151123, + "logps/chosen": -672.110595703125, + "logps/rejected": -1258.385986328125, + "loss": 0.0927, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2102072536945343, + "rewards/margins": 0.27410417795181274, + "rewards/rejected": -0.48431143164634705, + "step": 5170 + }, + { + "epoch": 0.99, + "learning_rate": 2.7072216536885855e-09, + "logits/chosen": -1.572046160697937, + "logits/rejected": -1.1685867309570312, + "logps/chosen": -627.5899658203125, + "logps/rejected": -1391.069091796875, + "loss": 0.0609, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.18392732739448547, + "rewards/margins": 0.3593160808086395, + "rewards/rejected": -0.543243408203125, + "step": 5180 + }, + { + "epoch": 0.99, + "learning_rate": 1.989074434551874e-09, + "logits/chosen": -1.3752251863479614, + "logits/rejected": -1.040238618850708, + "logps/chosen": -636.2149658203125, + "logps/rejected": -1396.5648193359375, + "loss": 0.0509, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.2024511843919754, + "rewards/margins": 0.3306366801261902, + "rewards/rejected": -0.5330878496170044, + "step": 5190 + }, + { + "epoch": 0.99, + "learning_rate": 1.3813576683111007e-09, + "logits/chosen": -1.7658580541610718, + "logits/rejected": -0.8461794853210449, + "logps/chosen": -736.367431640625, + "logps/rejected": -1369.6800537109375, + "loss": 0.0579, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.19013865292072296, + "rewards/margins": 0.3445832431316376, + "rewards/rejected": -0.5347219109535217, + "step": 5200 + }, + { + "epoch": 0.99, + "learning_rate": 8.840982205160498e-10, + "logits/chosen": -1.1934562921524048, + "logits/rejected": -0.7583077549934387, + "logps/chosen": -643.0595092773438, + "logps/rejected": -1491.562744140625, + "loss": 0.072, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2331688404083252, + "rewards/margins": 0.3358479142189026, + "rewards/rejected": -0.5690167546272278, + "step": 5210 + }, + { + "epoch": 0.99, + "learning_rate": 4.973180736911332e-10, + "logits/chosen": -1.7001054286956787, + "logits/rejected": -1.0497257709503174, + "logps/chosen": -734.0220336914062, + "logps/rejected": -1379.366943359375, + "loss": 0.0586, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.19413700699806213, + "rewards/margins": 0.33139941096305847, + "rewards/rejected": -0.5255364179611206, + "step": 5220 + }, + { + "epoch": 1.0, + "learning_rate": 2.2103432636366718e-10, + "logits/chosen": -1.510568380355835, + "logits/rejected": -1.0220980644226074, + "logps/chosen": -764.5279541015625, + "logps/rejected": -1201.794189453125, + "loss": 0.0778, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1854855716228485, + "rewards/margins": 0.24305124580860138, + "rewards/rejected": -0.4285368025302887, + "step": 5230 + }, + { + "epoch": 1.0, + "learning_rate": 5.525919230670029e-11, + "logits/chosen": -1.5603317022323608, + "logits/rejected": -1.0289227962493896, + "logps/chosen": -833.1355590820312, + "logps/rejected": -1442.788818359375, + "loss": 0.0496, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.24440082907676697, + "rewards/margins": 0.3070646822452545, + "rewards/rejected": -0.5514655113220215, + "step": 5240 + }, + { + "epoch": 1.0, + "learning_rate": 0.0, + "logits/chosen": -1.7747621536254883, + "logits/rejected": -1.0173823833465576, + "logps/chosen": -691.9779663085938, + "logps/rejected": -1244.77294921875, + "loss": 0.0642, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19418226182460785, + "rewards/margins": 0.31588393449783325, + "rewards/rejected": -0.5100662708282471, + "step": 5250 + }, + { + "epoch": 1.0, + "step": 5250, + "total_flos": 0.0, + "train_loss": 0.07741349755014693, + "train_runtime": 22357.7334, + "train_samples_per_second": 0.939, + "train_steps_per_second": 0.235 + } + ], + "logging_steps": 10, + "max_steps": 5250, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}