diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,17984 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 400, + "global_step": 12465, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 4.009623095429029e-10, + "logits/chosen": -3.029554605484009, + "logits/rejected": -2.958740711212158, + "logps/chosen": -239.6302947998047, + "logps/rejected": -134.69642639160156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 4.0096230954290295e-09, + "logits/chosen": -2.757606029510498, + "logits/rejected": -2.850358724594116, + "logps/chosen": -248.6219024658203, + "logps/rejected": -237.16183471679688, + "loss": 0.6953, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.009159507229924202, + "rewards/margins": 0.0038133251946419477, + "rewards/rejected": -0.012972831726074219, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 8.019246190858059e-09, + "logits/chosen": -2.8561160564422607, + "logits/rejected": -2.731553792953491, + "logps/chosen": -255.89724731445312, + "logps/rejected": -124.84513854980469, + "loss": 0.6937, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.001197890960611403, + "rewards/margins": 0.011117557995021343, + "rewards/rejected": -0.00991966761648655, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 1.2028869286287089e-08, + "logits/chosen": -2.9133946895599365, + "logits/rejected": -2.9325616359710693, + "logps/chosen": -334.1841735839844, + "logps/rejected": -296.2657775878906, + "loss": 0.6968, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.004535389598459005, + "rewards/margins": 0.010829145088791847, + "rewards/rejected": -0.006293755955994129, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 1.6038492381716118e-08, + "logits/chosen": -2.9462289810180664, + "logits/rejected": -2.871635913848877, + "logps/chosen": -257.1698303222656, + "logps/rejected": -245.908203125, + "loss": 0.6905, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0012525601778179407, + "rewards/margins": 0.00830057729035616, + "rewards/rejected": -0.009553136304020882, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 2.0048115477145146e-08, + "logits/chosen": -2.7116570472717285, + "logits/rejected": -2.7510933876037598, + "logps/chosen": -302.39923095703125, + "logps/rejected": -288.64300537109375, + "loss": 0.6959, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.009699945338070393, + "rewards/margins": 0.017064867541193962, + "rewards/rejected": -0.007364921271800995, + "step": 50 + }, + { + "epoch": 0.01, + "learning_rate": 2.4057738572574177e-08, + "logits/chosen": -2.752192974090576, + "logits/rejected": -2.6760923862457275, + "logps/chosen": -241.3775177001953, + "logps/rejected": -285.0107421875, + "loss": 0.6864, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.005488119088113308, + "rewards/margins": 0.01543651707470417, + "rewards/rejected": -0.009948397055268288, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 2.8067361668003205e-08, + "logits/chosen": -2.8234877586364746, + "logits/rejected": -2.7496979236602783, + "logps/chosen": -296.32574462890625, + "logps/rejected": -217.22366333007812, + "loss": 0.6948, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0016764644533395767, + "rewards/margins": 0.0072411722503602505, + "rewards/rejected": -0.008917637169361115, + "step": 70 + }, + { + "epoch": 0.02, + "learning_rate": 3.2076984763432236e-08, + "logits/chosen": -2.7752678394317627, + "logits/rejected": -2.7286124229431152, + "logps/chosen": -152.26710510253906, + "logps/rejected": -170.56275939941406, + "loss": 0.6873, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.008753329515457153, + "rewards/margins": 0.0059437938034534454, + "rewards/rejected": -0.014697122387588024, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 3.608660785886127e-08, + "logits/chosen": -2.857790231704712, + "logits/rejected": -2.786897897720337, + "logps/chosen": -207.5873565673828, + "logps/rejected": -227.80184936523438, + "loss": 0.6882, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.014288626611232758, + "rewards/margins": 0.009933690540492535, + "rewards/rejected": -0.024222319945693016, + "step": 90 + }, + { + "epoch": 0.02, + "learning_rate": 4.009623095429029e-08, + "logits/chosen": -2.7477333545684814, + "logits/rejected": -2.76132869720459, + "logps/chosen": -289.3299865722656, + "logps/rejected": -196.60256958007812, + "loss": 0.6831, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.009175291284918785, + "rewards/margins": 0.05031620338559151, + "rewards/rejected": -0.05949149280786514, + "step": 100 + }, + { + "epoch": 0.03, + "learning_rate": 4.410585404971932e-08, + "logits/chosen": -2.8593764305114746, + "logits/rejected": -2.8490402698516846, + "logps/chosen": -259.27545166015625, + "logps/rejected": -272.4792175292969, + "loss": 0.6865, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.010402527637779713, + "rewards/margins": 0.031852759420871735, + "rewards/rejected": -0.04225528985261917, + "step": 110 + }, + { + "epoch": 0.03, + "learning_rate": 4.8115477145148354e-08, + "logits/chosen": -2.828747510910034, + "logits/rejected": -2.7842514514923096, + "logps/chosen": -273.6574401855469, + "logps/rejected": -270.96533203125, + "loss": 0.6821, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.010771388188004494, + "rewards/margins": 0.05202381685376167, + "rewards/rejected": -0.06279521435499191, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 5.2125100240577385e-08, + "logits/chosen": -2.915553569793701, + "logits/rejected": -2.8389952182769775, + "logps/chosen": -262.3766784667969, + "logps/rejected": -256.4173583984375, + "loss": 0.6822, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.024069635197520256, + "rewards/margins": 0.03253183513879776, + "rewards/rejected": -0.056601472198963165, + "step": 130 + }, + { + "epoch": 0.03, + "learning_rate": 5.613472333600641e-08, + "logits/chosen": -2.9118704795837402, + "logits/rejected": -2.927285671234131, + "logps/chosen": -158.02059936523438, + "logps/rejected": -209.7635955810547, + "loss": 0.6717, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.04194178432226181, + "rewards/margins": 0.027061814442276955, + "rewards/rejected": -0.06900360435247421, + "step": 140 + }, + { + "epoch": 0.04, + "learning_rate": 6.014434643143545e-08, + "logits/chosen": -2.893004894256592, + "logits/rejected": -2.880948543548584, + "logps/chosen": -202.68736267089844, + "logps/rejected": -205.7903594970703, + "loss": 0.6815, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.023197593167424202, + "rewards/margins": 0.04789852350950241, + "rewards/rejected": -0.07109610736370087, + "step": 150 + }, + { + "epoch": 0.04, + "learning_rate": 6.415396952686447e-08, + "logits/chosen": -2.9180562496185303, + "logits/rejected": -2.7946887016296387, + "logps/chosen": -332.65960693359375, + "logps/rejected": -238.1708526611328, + "loss": 0.6603, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.01652655564248562, + "rewards/margins": 0.008368945680558681, + "rewards/rejected": -0.024895502254366875, + "step": 160 + }, + { + "epoch": 0.04, + "learning_rate": 6.81635926222935e-08, + "logits/chosen": -2.9096858501434326, + "logits/rejected": -2.9310355186462402, + "logps/chosen": -251.33975219726562, + "logps/rejected": -240.75460815429688, + "loss": 0.6601, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.013797881081700325, + "rewards/margins": 0.13765253126621246, + "rewards/rejected": -0.15145042538642883, + "step": 170 + }, + { + "epoch": 0.04, + "learning_rate": 7.217321571772253e-08, + "logits/chosen": -2.988015651702881, + "logits/rejected": -2.9767017364501953, + "logps/chosen": -212.15673828125, + "logps/rejected": -159.58204650878906, + "loss": 0.6581, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10063859075307846, + "rewards/margins": 0.07325638830661774, + "rewards/rejected": -0.1738949865102768, + "step": 180 + }, + { + "epoch": 0.05, + "learning_rate": 7.618283881315156e-08, + "logits/chosen": -2.933257818222046, + "logits/rejected": -2.860905170440674, + "logps/chosen": -324.5125427246094, + "logps/rejected": -359.4278564453125, + "loss": 0.6641, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.020703891292214394, + "rewards/margins": 0.08283446729183197, + "rewards/rejected": -0.10353837162256241, + "step": 190 + }, + { + "epoch": 0.05, + "learning_rate": 8.019246190858058e-08, + "logits/chosen": -2.900707721710205, + "logits/rejected": -2.8246498107910156, + "logps/chosen": -203.93099975585938, + "logps/rejected": -252.4442138671875, + "loss": 0.6745, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.07562927901744843, + "rewards/margins": 0.02675458788871765, + "rewards/rejected": -0.10238387435674667, + "step": 200 + }, + { + "epoch": 0.05, + "learning_rate": 8.420208500400962e-08, + "logits/chosen": -2.7394564151763916, + "logits/rejected": -2.7282943725585938, + "logps/chosen": -197.1373748779297, + "logps/rejected": -226.32382202148438, + "loss": 0.6562, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07417772710323334, + "rewards/margins": 0.05352227762341499, + "rewards/rejected": -0.12770001590251923, + "step": 210 + }, + { + "epoch": 0.05, + "learning_rate": 8.821170809943865e-08, + "logits/chosen": -2.844759941101074, + "logits/rejected": -2.826810359954834, + "logps/chosen": -191.80062866210938, + "logps/rejected": -221.77932739257812, + "loss": 0.6344, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08790848404169083, + "rewards/margins": 0.021957406774163246, + "rewards/rejected": -0.10986590385437012, + "step": 220 + }, + { + "epoch": 0.06, + "learning_rate": 9.222133119486767e-08, + "logits/chosen": -2.957456111907959, + "logits/rejected": -2.835681438446045, + "logps/chosen": -322.5172119140625, + "logps/rejected": -253.207763671875, + "loss": 0.6182, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.04191254824399948, + "rewards/margins": 0.16338881850242615, + "rewards/rejected": -0.12147627025842667, + "step": 230 + }, + { + "epoch": 0.06, + "learning_rate": 9.623095429029671e-08, + "logits/chosen": -2.9493839740753174, + "logits/rejected": -2.945842981338501, + "logps/chosen": -256.1016540527344, + "logps/rejected": -189.64895629882812, + "loss": 0.6458, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.042133476585149765, + "rewards/margins": 0.1592259705066681, + "rewards/rejected": -0.20135946571826935, + "step": 240 + }, + { + "epoch": 0.06, + "learning_rate": 1.0024057738572573e-07, + "logits/chosen": -2.8034961223602295, + "logits/rejected": -2.8224565982818604, + "logps/chosen": -252.3696746826172, + "logps/rejected": -198.51544189453125, + "loss": 0.6295, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.037950549274683, + "rewards/margins": 0.15652048587799072, + "rewards/rejected": -0.11856994777917862, + "step": 250 + }, + { + "epoch": 0.06, + "learning_rate": 1.0425020048115477e-07, + "logits/chosen": -2.8333163261413574, + "logits/rejected": -2.799535036087036, + "logps/chosen": -198.86106872558594, + "logps/rejected": -193.21127319335938, + "loss": 0.6362, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02170318178832531, + "rewards/margins": 0.11439894139766693, + "rewards/rejected": -0.1361021101474762, + "step": 260 + }, + { + "epoch": 0.06, + "learning_rate": 1.082598235765838e-07, + "logits/chosen": -2.939605712890625, + "logits/rejected": -2.8323261737823486, + "logps/chosen": -253.91775512695312, + "logps/rejected": -242.62594604492188, + "loss": 0.6529, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.12199263274669647, + "rewards/margins": 0.14367853105068207, + "rewards/rejected": -0.2656711935997009, + "step": 270 + }, + { + "epoch": 0.07, + "learning_rate": 1.1226944667201282e-07, + "logits/chosen": -2.705425977706909, + "logits/rejected": -2.727578639984131, + "logps/chosen": -156.19100952148438, + "logps/rejected": -240.2871856689453, + "loss": 0.6291, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.002952949609607458, + "rewards/margins": 0.07988782227039337, + "rewards/rejected": -0.08284077048301697, + "step": 280 + }, + { + "epoch": 0.07, + "learning_rate": 1.1627906976744186e-07, + "logits/chosen": -2.821107864379883, + "logits/rejected": -2.7760562896728516, + "logps/chosen": -275.28216552734375, + "logps/rejected": -237.4905548095703, + "loss": 0.6239, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10767307132482529, + "rewards/margins": 0.12761202454566956, + "rewards/rejected": -0.23528508841991425, + "step": 290 + }, + { + "epoch": 0.07, + "learning_rate": 1.202886928628709e-07, + "logits/chosen": -2.8041632175445557, + "logits/rejected": -2.76464581489563, + "logps/chosen": -301.4250183105469, + "logps/rejected": -406.40057373046875, + "loss": 0.6509, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0919116735458374, + "rewards/margins": 0.24763791263103485, + "rewards/rejected": -0.15572626888751984, + "step": 300 + }, + { + "epoch": 0.07, + "learning_rate": 1.242983159582999e-07, + "logits/chosen": -2.811683177947998, + "logits/rejected": -2.7772467136383057, + "logps/chosen": -222.76901245117188, + "logps/rejected": -203.6343536376953, + "loss": 0.6285, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10157088935375214, + "rewards/margins": 0.0654078871011734, + "rewards/rejected": -0.16697879135608673, + "step": 310 + }, + { + "epoch": 0.08, + "learning_rate": 1.2830793905372894e-07, + "logits/chosen": -2.947110414505005, + "logits/rejected": -2.8601386547088623, + "logps/chosen": -289.3692321777344, + "logps/rejected": -234.7985382080078, + "loss": 0.6254, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.15501949191093445, + "rewards/margins": 0.17323021590709686, + "rewards/rejected": -0.018210697919130325, + "step": 320 + }, + { + "epoch": 0.08, + "learning_rate": 1.3231756214915798e-07, + "logits/chosen": -2.9331088066101074, + "logits/rejected": -2.865115165710449, + "logps/chosen": -369.88671875, + "logps/rejected": -301.6773986816406, + "loss": 0.5713, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.22617539763450623, + "rewards/margins": 0.504052996635437, + "rewards/rejected": -0.2778776288032532, + "step": 330 + }, + { + "epoch": 0.08, + "learning_rate": 1.36327185244587e-07, + "logits/chosen": -2.645563840866089, + "logits/rejected": -2.539353609085083, + "logps/chosen": -209.6802978515625, + "logps/rejected": -169.13308715820312, + "loss": 0.5846, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.032310713082551956, + "rewards/margins": 0.2370806187391281, + "rewards/rejected": -0.26939135789871216, + "step": 340 + }, + { + "epoch": 0.08, + "learning_rate": 1.4033680834001603e-07, + "logits/chosen": -2.581049919128418, + "logits/rejected": -2.5333192348480225, + "logps/chosen": -225.43392944335938, + "logps/rejected": -159.21926879882812, + "loss": 0.5605, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19601905345916748, + "rewards/margins": 0.25073686242103577, + "rewards/rejected": -0.05471784994006157, + "step": 350 + }, + { + "epoch": 0.09, + "learning_rate": 1.4434643143544507e-07, + "logits/chosen": -2.9410252571105957, + "logits/rejected": -2.8339309692382812, + "logps/chosen": -280.1285705566406, + "logps/rejected": -271.2792663574219, + "loss": 0.5662, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17516747117042542, + "rewards/margins": 0.35865694284439087, + "rewards/rejected": -0.18348945677280426, + "step": 360 + }, + { + "epoch": 0.09, + "learning_rate": 1.483560545308741e-07, + "logits/chosen": -2.8755431175231934, + "logits/rejected": -2.8623225688934326, + "logps/chosen": -187.56100463867188, + "logps/rejected": -187.7587432861328, + "loss": 0.5817, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26764410734176636, + "rewards/margins": 0.26760828495025635, + "rewards/rejected": 3.5798548196908087e-05, + "step": 370 + }, + { + "epoch": 0.09, + "learning_rate": 1.5236567762630312e-07, + "logits/chosen": -2.897462844848633, + "logits/rejected": -2.8069605827331543, + "logps/chosen": -255.22653198242188, + "logps/rejected": -255.3626708984375, + "loss": 0.5824, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4705290198326111, + "rewards/margins": 0.3691382110118866, + "rewards/rejected": 0.10139081627130508, + "step": 380 + }, + { + "epoch": 0.09, + "learning_rate": 1.5637530072173216e-07, + "logits/chosen": -2.919511318206787, + "logits/rejected": -2.8042795658111572, + "logps/chosen": -325.6533508300781, + "logps/rejected": -187.78512573242188, + "loss": 0.5513, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4940560460090637, + "rewards/margins": 0.7009488344192505, + "rewards/rejected": -0.20689284801483154, + "step": 390 + }, + { + "epoch": 0.1, + "learning_rate": 1.6038492381716117e-07, + "logits/chosen": -2.9202237129211426, + "logits/rejected": -2.9232850074768066, + "logps/chosen": -247.07302856445312, + "logps/rejected": -275.75836181640625, + "loss": 0.5994, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.41024351119995117, + "rewards/margins": 0.20150327682495117, + "rewards/rejected": 0.2087402641773224, + "step": 400 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -2.648510456085205, + "eval_logits/rejected": -2.6271843910217285, + "eval_logps/chosen": -198.87437438964844, + "eval_logps/rejected": -189.90797424316406, + "eval_loss": 0.5895335674285889, + "eval_rewards/accuracies": 0.5950000286102295, + "eval_rewards/chosen": 0.3053191304206848, + "eval_rewards/margins": 0.34303680062294006, + "eval_rewards/rejected": -0.037717677652835846, + "eval_runtime": 132.8101, + "eval_samples_per_second": 23.763, + "eval_steps_per_second": 0.376, + "step": 400 + }, + { + "epoch": 0.1, + "learning_rate": 1.6439454691259023e-07, + "logits/chosen": -2.824357271194458, + "logits/rejected": -2.741654872894287, + "logps/chosen": -304.6470031738281, + "logps/rejected": -313.9639587402344, + "loss": 0.564, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6792846918106079, + "rewards/margins": 0.5655020475387573, + "rewards/rejected": 0.11378266662359238, + "step": 410 + }, + { + "epoch": 0.1, + "learning_rate": 1.6840417000801924e-07, + "logits/chosen": -2.782989978790283, + "logits/rejected": -2.753141403198242, + "logps/chosen": -279.14300537109375, + "logps/rejected": -236.7024383544922, + "loss": 0.6065, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.519902765750885, + "rewards/margins": 0.45831745862960815, + "rewards/rejected": 0.06158534437417984, + "step": 420 + }, + { + "epoch": 0.1, + "learning_rate": 1.7241379310344828e-07, + "logits/chosen": -2.579185724258423, + "logits/rejected": -2.7632734775543213, + "logps/chosen": -236.32113647460938, + "logps/rejected": -315.44256591796875, + "loss": 0.5608, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.2255411595106125, + "rewards/margins": 0.11108832061290741, + "rewards/rejected": 0.1144527941942215, + "step": 430 + }, + { + "epoch": 0.11, + "learning_rate": 1.764234161988773e-07, + "logits/chosen": -2.7754387855529785, + "logits/rejected": -2.6534359455108643, + "logps/chosen": -233.37680053710938, + "logps/rejected": -227.4702911376953, + "loss": 0.5559, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.4115857481956482, + "rewards/margins": 1.0299506187438965, + "rewards/rejected": -0.6183647513389587, + "step": 440 + }, + { + "epoch": 0.11, + "learning_rate": 1.8043303929430633e-07, + "logits/chosen": -2.7775425910949707, + "logits/rejected": -2.770744800567627, + "logps/chosen": -174.74822998046875, + "logps/rejected": -169.72979736328125, + "loss": 0.5884, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.07133658230304718, + "rewards/margins": 0.10153523832559586, + "rewards/rejected": -0.17287181317806244, + "step": 450 + }, + { + "epoch": 0.11, + "learning_rate": 1.8444266238973534e-07, + "logits/chosen": -2.8076884746551514, + "logits/rejected": -2.8743953704833984, + "logps/chosen": -254.01199340820312, + "logps/rejected": -266.43310546875, + "loss": 0.6159, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.005350641906261444, + "rewards/margins": 0.5136643648147583, + "rewards/rejected": -0.5083136558532715, + "step": 460 + }, + { + "epoch": 0.11, + "learning_rate": 1.884522854851644e-07, + "logits/chosen": -2.820117235183716, + "logits/rejected": -2.7825677394866943, + "logps/chosen": -243.0380401611328, + "logps/rejected": -180.35055541992188, + "loss": 0.6442, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.10030399262905121, + "rewards/margins": 0.535255491733551, + "rewards/rejected": -0.43495145440101624, + "step": 470 + }, + { + "epoch": 0.12, + "learning_rate": 1.9246190858059342e-07, + "logits/chosen": -2.9516303539276123, + "logits/rejected": -2.877495527267456, + "logps/chosen": -273.7452392578125, + "logps/rejected": -231.18374633789062, + "loss": 0.5541, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.4644753932952881, + "rewards/margins": 0.6943734884262085, + "rewards/rejected": -0.229898139834404, + "step": 480 + }, + { + "epoch": 0.12, + "learning_rate": 1.9647153167602245e-07, + "logits/chosen": -2.7269504070281982, + "logits/rejected": -2.774722099304199, + "logps/chosen": -157.05789184570312, + "logps/rejected": -270.0483093261719, + "loss": 0.5355, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.2008717954158783, + "rewards/margins": 0.375074177980423, + "rewards/rejected": -0.1742023378610611, + "step": 490 + }, + { + "epoch": 0.12, + "learning_rate": 2.0048115477145147e-07, + "logits/chosen": -2.659719944000244, + "logits/rejected": -2.689526319503784, + "logps/chosen": -288.8984375, + "logps/rejected": -269.92340087890625, + "loss": 0.5479, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.5158240795135498, + "rewards/margins": 0.6448057889938354, + "rewards/rejected": -0.12898170948028564, + "step": 500 + }, + { + "epoch": 0.12, + "learning_rate": 2.044907778668805e-07, + "logits/chosen": -2.827486515045166, + "logits/rejected": -2.677001714706421, + "logps/chosen": -295.91949462890625, + "logps/rejected": -216.48403930664062, + "loss": 0.6283, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.48119044303894043, + "rewards/margins": 0.4074464440345764, + "rewards/rejected": 0.07374398410320282, + "step": 510 + }, + { + "epoch": 0.13, + "learning_rate": 2.0850040096230954e-07, + "logits/chosen": -2.9389195442199707, + "logits/rejected": -2.760077714920044, + "logps/chosen": -323.9413146972656, + "logps/rejected": -243.3992156982422, + "loss": 0.5955, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.2887626886367798, + "rewards/margins": 0.7296485304832458, + "rewards/rejected": -0.44088587164878845, + "step": 520 + }, + { + "epoch": 0.13, + "learning_rate": 2.1251002405773858e-07, + "logits/chosen": -2.861633777618408, + "logits/rejected": -2.8447012901306152, + "logps/chosen": -327.47515869140625, + "logps/rejected": -267.309814453125, + "loss": 0.5627, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.7268004417419434, + "rewards/margins": 0.37961870431900024, + "rewards/rejected": 0.34718185663223267, + "step": 530 + }, + { + "epoch": 0.13, + "learning_rate": 2.165196471531676e-07, + "logits/chosen": -2.561352252960205, + "logits/rejected": -2.57576584815979, + "logps/chosen": -208.40731811523438, + "logps/rejected": -216.1819305419922, + "loss": 0.624, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.3119715750217438, + "rewards/margins": 0.571592390537262, + "rewards/rejected": -0.2596207857131958, + "step": 540 + }, + { + "epoch": 0.13, + "learning_rate": 2.2052927024859663e-07, + "logits/chosen": -2.7754769325256348, + "logits/rejected": -2.771836757659912, + "logps/chosen": -298.20635986328125, + "logps/rejected": -238.71176147460938, + "loss": 0.5883, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.4887056350708008, + "rewards/margins": 0.631751537322998, + "rewards/rejected": -0.14304590225219727, + "step": 550 + }, + { + "epoch": 0.13, + "learning_rate": 2.2453889334402564e-07, + "logits/chosen": -2.7937541007995605, + "logits/rejected": -2.68151593208313, + "logps/chosen": -196.40573120117188, + "logps/rejected": -118.2184066772461, + "loss": 0.504, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.5687293410301208, + "rewards/margins": 0.7040340304374695, + "rewards/rejected": -0.13530471920967102, + "step": 560 + }, + { + "epoch": 0.14, + "learning_rate": 2.285485164394547e-07, + "logits/chosen": -2.697080612182617, + "logits/rejected": -2.7340171337127686, + "logps/chosen": -198.97213745117188, + "logps/rejected": -264.6471252441406, + "loss": 0.5554, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.41746944189071655, + "rewards/margins": 0.843182384967804, + "rewards/rejected": -0.425712913274765, + "step": 570 + }, + { + "epoch": 0.14, + "learning_rate": 2.3255813953488372e-07, + "logits/chosen": -2.761768341064453, + "logits/rejected": -2.817594051361084, + "logps/chosen": -209.8280487060547, + "logps/rejected": -212.88150024414062, + "loss": 0.5672, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.36323896050453186, + "rewards/margins": 0.4819954037666321, + "rewards/rejected": -0.1187564879655838, + "step": 580 + }, + { + "epoch": 0.14, + "learning_rate": 2.3656776263031275e-07, + "logits/chosen": -2.87705397605896, + "logits/rejected": -2.7968459129333496, + "logps/chosen": -276.10699462890625, + "logps/rejected": -210.0780792236328, + "loss": 0.5556, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.1134200468659401, + "rewards/margins": 0.6706252694129944, + "rewards/rejected": -0.5572052001953125, + "step": 590 + }, + { + "epoch": 0.14, + "learning_rate": 2.405773857257418e-07, + "logits/chosen": -2.921949863433838, + "logits/rejected": -2.852156639099121, + "logps/chosen": -299.18609619140625, + "logps/rejected": -249.9677276611328, + "loss": 0.5832, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.048682499676942825, + "rewards/margins": 0.2762266993522644, + "rewards/rejected": -0.3249092102050781, + "step": 600 + }, + { + "epoch": 0.15, + "learning_rate": 2.445870088211708e-07, + "logits/chosen": -2.8194832801818848, + "logits/rejected": -2.804983615875244, + "logps/chosen": -288.4292907714844, + "logps/rejected": -270.6551208496094, + "loss": 0.597, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.05671919509768486, + "rewards/margins": 0.0665658563375473, + "rewards/rejected": -0.00984666682779789, + "step": 610 + }, + { + "epoch": 0.15, + "learning_rate": 2.485966319165998e-07, + "logits/chosen": -2.662090539932251, + "logits/rejected": -2.7223060131073, + "logps/chosen": -160.1973419189453, + "logps/rejected": -188.27352905273438, + "loss": 0.685, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.15639618039131165, + "rewards/margins": 0.5227680802345276, + "rewards/rejected": -0.36637189984321594, + "step": 620 + }, + { + "epoch": 0.15, + "learning_rate": 2.526062550120289e-07, + "logits/chosen": -2.924830913543701, + "logits/rejected": -2.8918211460113525, + "logps/chosen": -223.8957977294922, + "logps/rejected": -203.39224243164062, + "loss": 0.6228, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.24558000266551971, + "rewards/margins": 0.5432695150375366, + "rewards/rejected": -0.7888495326042175, + "step": 630 + }, + { + "epoch": 0.15, + "learning_rate": 2.566158781074579e-07, + "logits/chosen": -2.8393218517303467, + "logits/rejected": -2.8019471168518066, + "logps/chosen": -247.6842803955078, + "logps/rejected": -235.5037078857422, + "loss": 0.5741, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2942962050437927, + "rewards/margins": 0.29929882287979126, + "rewards/rejected": -0.593595027923584, + "step": 640 + }, + { + "epoch": 0.16, + "learning_rate": 2.606255012028869e-07, + "logits/chosen": -2.76501202583313, + "logits/rejected": -2.7004733085632324, + "logps/chosen": -301.2521057128906, + "logps/rejected": -254.12240600585938, + "loss": 0.5037, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.36596235632896423, + "rewards/margins": 0.8270140886306763, + "rewards/rejected": -1.1929763555526733, + "step": 650 + }, + { + "epoch": 0.16, + "learning_rate": 2.6463512429831596e-07, + "logits/chosen": -2.7481508255004883, + "logits/rejected": -2.6847853660583496, + "logps/chosen": -249.2167205810547, + "logps/rejected": -258.1322326660156, + "loss": 0.5659, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2779560089111328, + "rewards/margins": 0.5480610728263855, + "rewards/rejected": -0.8260170817375183, + "step": 660 + }, + { + "epoch": 0.16, + "learning_rate": 2.68644747393745e-07, + "logits/chosen": -2.7402031421661377, + "logits/rejected": -2.674595355987549, + "logps/chosen": -216.8399200439453, + "logps/rejected": -173.94277954101562, + "loss": 0.5758, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.18031704425811768, + "rewards/margins": 0.51617032289505, + "rewards/rejected": -0.6964873671531677, + "step": 670 + }, + { + "epoch": 0.16, + "learning_rate": 2.72654370489174e-07, + "logits/chosen": -2.7919907569885254, + "logits/rejected": -2.7736024856567383, + "logps/chosen": -214.3600311279297, + "logps/rejected": -296.0640563964844, + "loss": 0.5332, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3602636456489563, + "rewards/margins": 0.7699328660964966, + "rewards/rejected": -1.1301965713500977, + "step": 680 + }, + { + "epoch": 0.17, + "learning_rate": 2.76663993584603e-07, + "logits/chosen": -2.8806838989257812, + "logits/rejected": -2.8978848457336426, + "logps/chosen": -304.77935791015625, + "logps/rejected": -299.870849609375, + "loss": 0.6241, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06528373807668686, + "rewards/margins": 0.42759591341018677, + "rewards/rejected": -0.49287962913513184, + "step": 690 + }, + { + "epoch": 0.17, + "learning_rate": 2.8067361668003206e-07, + "logits/chosen": -2.5968194007873535, + "logits/rejected": -2.460662603378296, + "logps/chosen": -306.84686279296875, + "logps/rejected": -230.10562133789062, + "loss": 0.5487, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.11472682654857635, + "rewards/margins": 0.6224948167800903, + "rewards/rejected": -0.7372217178344727, + "step": 700 + }, + { + "epoch": 0.17, + "learning_rate": 2.8468323977546113e-07, + "logits/chosen": -2.8128252029418945, + "logits/rejected": -2.7990946769714355, + "logps/chosen": -328.0445251464844, + "logps/rejected": -307.45086669921875, + "loss": 0.5435, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.06992456316947937, + "rewards/margins": 0.7968131303787231, + "rewards/rejected": -0.7268885374069214, + "step": 710 + }, + { + "epoch": 0.17, + "learning_rate": 2.8869286287089014e-07, + "logits/chosen": -2.818443775177002, + "logits/rejected": -2.7941231727600098, + "logps/chosen": -310.6451721191406, + "logps/rejected": -250.27383422851562, + "loss": 0.5224, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2724413573741913, + "rewards/margins": 0.7151150107383728, + "rewards/rejected": -0.9875563383102417, + "step": 720 + }, + { + "epoch": 0.18, + "learning_rate": 2.9270248596631915e-07, + "logits/chosen": -2.6747488975524902, + "logits/rejected": -2.6234326362609863, + "logps/chosen": -254.17636108398438, + "logps/rejected": -259.8215026855469, + "loss": 0.6097, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1818128079175949, + "rewards/margins": 1.5986982583999634, + "rewards/rejected": -1.7805109024047852, + "step": 730 + }, + { + "epoch": 0.18, + "learning_rate": 2.967121090617482e-07, + "logits/chosen": -2.5844151973724365, + "logits/rejected": -2.509438991546631, + "logps/chosen": -196.0883026123047, + "logps/rejected": -192.82748413085938, + "loss": 0.551, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4821125864982605, + "rewards/margins": 0.5221725702285767, + "rewards/rejected": -1.0042850971221924, + "step": 740 + }, + { + "epoch": 0.18, + "learning_rate": 3.007217321571772e-07, + "logits/chosen": -2.9302337169647217, + "logits/rejected": -2.8687689304351807, + "logps/chosen": -376.0974426269531, + "logps/rejected": -341.4430236816406, + "loss": 0.5568, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.5697919130325317, + "rewards/margins": 0.8332484364509583, + "rewards/rejected": -1.4030402898788452, + "step": 750 + }, + { + "epoch": 0.18, + "learning_rate": 3.0473135525260624e-07, + "logits/chosen": -2.728005886077881, + "logits/rejected": -2.7057783603668213, + "logps/chosen": -255.732421875, + "logps/rejected": -273.4658203125, + "loss": 0.5402, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.38343286514282227, + "rewards/margins": 0.8520215153694153, + "rewards/rejected": -1.2354543209075928, + "step": 760 + }, + { + "epoch": 0.19, + "learning_rate": 3.0874097834803525e-07, + "logits/chosen": -2.6662607192993164, + "logits/rejected": -2.6617627143859863, + "logps/chosen": -367.4737854003906, + "logps/rejected": -286.6307373046875, + "loss": 0.5507, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.372802495956421, + "rewards/margins": 0.3644545376300812, + "rewards/rejected": -1.7372572422027588, + "step": 770 + }, + { + "epoch": 0.19, + "learning_rate": 3.127506014434643e-07, + "logits/chosen": -2.499572277069092, + "logits/rejected": -2.6057636737823486, + "logps/chosen": -290.12115478515625, + "logps/rejected": -296.11944580078125, + "loss": 0.5236, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3911897540092468, + "rewards/margins": 1.4719053506851196, + "rewards/rejected": -1.8630950450897217, + "step": 780 + }, + { + "epoch": 0.19, + "learning_rate": 3.167602245388933e-07, + "logits/chosen": -2.8040318489074707, + "logits/rejected": -2.7535839080810547, + "logps/chosen": -255.7650909423828, + "logps/rejected": -202.61978149414062, + "loss": 0.5597, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1364818662405014, + "rewards/margins": 0.7933239340782166, + "rewards/rejected": -0.9298057556152344, + "step": 790 + }, + { + "epoch": 0.19, + "learning_rate": 3.2076984763432233e-07, + "logits/chosen": -2.8974623680114746, + "logits/rejected": -2.84101939201355, + "logps/chosen": -307.80230712890625, + "logps/rejected": -288.81854248046875, + "loss": 0.5024, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.2426052987575531, + "rewards/margins": 0.36778146028518677, + "rewards/rejected": -0.12517614662647247, + "step": 800 + }, + { + "epoch": 0.19, + "eval_logits/chosen": -2.5328729152679443, + "eval_logits/rejected": -2.5092859268188477, + "eval_logps/chosen": -203.2058563232422, + "eval_logps/rejected": -199.95619201660156, + "eval_loss": 0.5111984610557556, + "eval_rewards/accuracies": 0.6675000190734863, + "eval_rewards/chosen": -0.12783148884773254, + "eval_rewards/margins": 0.9147088527679443, + "eval_rewards/rejected": -1.042540192604065, + "eval_runtime": 133.661, + "eval_samples_per_second": 23.612, + "eval_steps_per_second": 0.374, + "step": 800 + }, + { + "epoch": 0.19, + "learning_rate": 3.2477947072975135e-07, + "logits/chosen": -2.769944190979004, + "logits/rejected": -2.7940926551818848, + "logps/chosen": -251.9970245361328, + "logps/rejected": -233.0319061279297, + "loss": 0.602, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.244999960064888, + "rewards/margins": 0.5882779359817505, + "rewards/rejected": -0.8332778215408325, + "step": 810 + }, + { + "epoch": 0.2, + "learning_rate": 3.2878909382518046e-07, + "logits/chosen": -2.7508976459503174, + "logits/rejected": -2.6556122303009033, + "logps/chosen": -268.6809997558594, + "logps/rejected": -225.40792846679688, + "loss": 0.5085, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.32555705308914185, + "rewards/margins": 0.5282832384109497, + "rewards/rejected": -0.8538403511047363, + "step": 820 + }, + { + "epoch": 0.2, + "learning_rate": 3.327987169206095e-07, + "logits/chosen": -2.651878833770752, + "logits/rejected": -2.659212350845337, + "logps/chosen": -193.90365600585938, + "logps/rejected": -274.38616943359375, + "loss": 0.5765, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.4068203866481781, + "rewards/margins": 0.3031473159790039, + "rewards/rejected": -0.7099677324295044, + "step": 830 + }, + { + "epoch": 0.2, + "learning_rate": 3.368083400160385e-07, + "logits/chosen": -2.663372039794922, + "logits/rejected": -2.8095250129699707, + "logps/chosen": -200.6801300048828, + "logps/rejected": -225.0947265625, + "loss": 0.5662, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.04706698656082153, + "rewards/margins": 0.3949834406375885, + "rewards/rejected": -0.34791645407676697, + "step": 840 + }, + { + "epoch": 0.2, + "learning_rate": 3.408179631114675e-07, + "logits/chosen": -2.638129711151123, + "logits/rejected": -2.600186824798584, + "logps/chosen": -226.84945678710938, + "logps/rejected": -225.19082641601562, + "loss": 0.5461, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02068863809108734, + "rewards/margins": 0.9978263974189758, + "rewards/rejected": -0.9771377444267273, + "step": 850 + }, + { + "epoch": 0.21, + "learning_rate": 3.4482758620689656e-07, + "logits/chosen": -2.5684962272644043, + "logits/rejected": -2.7589926719665527, + "logps/chosen": -220.42724609375, + "logps/rejected": -334.71624755859375, + "loss": 0.5906, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.264083594083786, + "rewards/margins": 0.5368421673774719, + "rewards/rejected": -0.8009258508682251, + "step": 860 + }, + { + "epoch": 0.21, + "learning_rate": 3.4883720930232557e-07, + "logits/chosen": -2.830177068710327, + "logits/rejected": -2.667341709136963, + "logps/chosen": -304.4564514160156, + "logps/rejected": -249.4889678955078, + "loss": 0.553, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.17647650837898254, + "rewards/margins": 1.0650815963745117, + "rewards/rejected": -1.2415580749511719, + "step": 870 + }, + { + "epoch": 0.21, + "learning_rate": 3.528468323977546e-07, + "logits/chosen": -2.8149116039276123, + "logits/rejected": -2.7310032844543457, + "logps/chosen": -255.3647918701172, + "logps/rejected": -217.87197875976562, + "loss": 0.6495, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.30641308426856995, + "rewards/margins": 0.5809476971626282, + "rewards/rejected": -0.8873607516288757, + "step": 880 + }, + { + "epoch": 0.21, + "learning_rate": 3.568564554931836e-07, + "logits/chosen": -2.7607791423797607, + "logits/rejected": -2.804769515991211, + "logps/chosen": -281.76788330078125, + "logps/rejected": -262.23126220703125, + "loss": 0.5512, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5253441333770752, + "rewards/margins": 0.4526292383670807, + "rewards/rejected": -0.9779733419418335, + "step": 890 + }, + { + "epoch": 0.22, + "learning_rate": 3.6086607858861266e-07, + "logits/chosen": -2.9751179218292236, + "logits/rejected": -2.9251646995544434, + "logps/chosen": -310.1224365234375, + "logps/rejected": -311.144287109375, + "loss": 0.5392, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6419559717178345, + "rewards/margins": 0.9658881425857544, + "rewards/rejected": -1.6078441143035889, + "step": 900 + }, + { + "epoch": 0.22, + "learning_rate": 3.6487570168404167e-07, + "logits/chosen": -2.8095269203186035, + "logits/rejected": -2.7687015533447266, + "logps/chosen": -315.8305969238281, + "logps/rejected": -280.93994140625, + "loss": 0.5323, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05711637809872627, + "rewards/margins": 0.9887657165527344, + "rewards/rejected": -1.045882225036621, + "step": 910 + }, + { + "epoch": 0.22, + "learning_rate": 3.688853247794707e-07, + "logits/chosen": -2.6602730751037598, + "logits/rejected": -2.648829936981201, + "logps/chosen": -287.0523376464844, + "logps/rejected": -225.801513671875, + "loss": 0.7499, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11555546522140503, + "rewards/margins": 1.0678458213806152, + "rewards/rejected": -1.183401107788086, + "step": 920 + }, + { + "epoch": 0.22, + "learning_rate": 3.7289494787489975e-07, + "logits/chosen": -2.725248336791992, + "logits/rejected": -2.6758840084075928, + "logps/chosen": -277.663330078125, + "logps/rejected": -233.9459228515625, + "loss": 0.5284, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5111667513847351, + "rewards/margins": 1.2667210102081299, + "rewards/rejected": -0.7555543780326843, + "step": 930 + }, + { + "epoch": 0.23, + "learning_rate": 3.769045709703288e-07, + "logits/chosen": -2.7036404609680176, + "logits/rejected": -2.615960121154785, + "logps/chosen": -248.12332153320312, + "logps/rejected": -220.653564453125, + "loss": 0.5286, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.5193105936050415, + "rewards/margins": 0.9040799140930176, + "rewards/rejected": -1.423390507698059, + "step": 940 + }, + { + "epoch": 0.23, + "learning_rate": 3.809141940657578e-07, + "logits/chosen": -2.7405402660369873, + "logits/rejected": -2.5490360260009766, + "logps/chosen": -218.7405548095703, + "logps/rejected": -209.5735321044922, + "loss": 0.5431, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42782601714134216, + "rewards/margins": 1.1354576349258423, + "rewards/rejected": -1.5632835626602173, + "step": 950 + }, + { + "epoch": 0.23, + "learning_rate": 3.8492381716118683e-07, + "logits/chosen": -2.7462010383605957, + "logits/rejected": -2.7566380500793457, + "logps/chosen": -251.06198120117188, + "logps/rejected": -270.6859130859375, + "loss": 0.5315, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5047445297241211, + "rewards/margins": 0.46457457542419434, + "rewards/rejected": -0.9693191647529602, + "step": 960 + }, + { + "epoch": 0.23, + "learning_rate": 3.8893344025661585e-07, + "logits/chosen": -2.7401294708251953, + "logits/rejected": -2.7194225788116455, + "logps/chosen": -238.30801391601562, + "logps/rejected": -263.53265380859375, + "loss": 0.5761, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.07245366275310516, + "rewards/margins": 0.8650237917900085, + "rewards/rejected": -0.792570173740387, + "step": 970 + }, + { + "epoch": 0.24, + "learning_rate": 3.929430633520449e-07, + "logits/chosen": -2.621753454208374, + "logits/rejected": -2.5244412422180176, + "logps/chosen": -302.0888671875, + "logps/rejected": -295.4062805175781, + "loss": 0.5195, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.619012176990509, + "rewards/margins": 1.0741729736328125, + "rewards/rejected": -1.6931850910186768, + "step": 980 + }, + { + "epoch": 0.24, + "learning_rate": 3.969526864474739e-07, + "logits/chosen": -2.7257871627807617, + "logits/rejected": -2.7017054557800293, + "logps/chosen": -252.85275268554688, + "logps/rejected": -236.3311309814453, + "loss": 0.5415, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.2137785255908966, + "rewards/margins": 0.5660091042518616, + "rewards/rejected": -0.7797876000404358, + "step": 990 + }, + { + "epoch": 0.24, + "learning_rate": 4.0096230954290293e-07, + "logits/chosen": -2.5677154064178467, + "logits/rejected": -2.4841880798339844, + "logps/chosen": -311.47845458984375, + "logps/rejected": -255.2757110595703, + "loss": 0.4928, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5334405899047852, + "rewards/margins": 1.1248290538787842, + "rewards/rejected": -1.6582696437835693, + "step": 1000 + }, + { + "epoch": 0.24, + "learning_rate": 4.0497193263833194e-07, + "logits/chosen": -2.8594164848327637, + "logits/rejected": -2.7990007400512695, + "logps/chosen": -248.8976593017578, + "logps/rejected": -212.68789672851562, + "loss": 0.5403, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.7001456022262573, + "rewards/margins": 0.3277003765106201, + "rewards/rejected": -1.027845859527588, + "step": 1010 + }, + { + "epoch": 0.25, + "learning_rate": 4.08981555733761e-07, + "logits/chosen": -2.7849814891815186, + "logits/rejected": -2.821704149246216, + "logps/chosen": -351.97552490234375, + "logps/rejected": -288.44549560546875, + "loss": 0.6945, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03144335746765137, + "rewards/margins": 1.1996476650238037, + "rewards/rejected": -1.231091022491455, + "step": 1020 + }, + { + "epoch": 0.25, + "learning_rate": 4.1299117882919007e-07, + "logits/chosen": -2.619790554046631, + "logits/rejected": -2.7029523849487305, + "logps/chosen": -277.0049133300781, + "logps/rejected": -285.37152099609375, + "loss": 0.5943, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.15238934755325317, + "rewards/margins": 1.0189682245254517, + "rewards/rejected": -0.8665788769721985, + "step": 1030 + }, + { + "epoch": 0.25, + "learning_rate": 4.170008019246191e-07, + "logits/chosen": -2.7127845287323, + "logits/rejected": -2.739548444747925, + "logps/chosen": -241.86032104492188, + "logps/rejected": -307.84698486328125, + "loss": 0.6209, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2921208441257477, + "rewards/margins": 1.2693531513214111, + "rewards/rejected": -0.9772324562072754, + "step": 1040 + }, + { + "epoch": 0.25, + "learning_rate": 4.210104250200481e-07, + "logits/chosen": -2.957655906677246, + "logits/rejected": -2.835099458694458, + "logps/chosen": -259.3416748046875, + "logps/rejected": -221.20620727539062, + "loss": 0.7344, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.10282768309116364, + "rewards/margins": 0.7172152400016785, + "rewards/rejected": -0.614387571811676, + "step": 1050 + }, + { + "epoch": 0.26, + "learning_rate": 4.2502004811547716e-07, + "logits/chosen": -2.872497797012329, + "logits/rejected": -2.7167131900787354, + "logps/chosen": -209.87173461914062, + "logps/rejected": -172.0841522216797, + "loss": 0.586, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0810152068734169, + "rewards/margins": 1.754184365272522, + "rewards/rejected": -1.8351995944976807, + "step": 1060 + }, + { + "epoch": 0.26, + "learning_rate": 4.2902967121090617e-07, + "logits/chosen": -2.8070569038391113, + "logits/rejected": -2.742454767227173, + "logps/chosen": -148.70144653320312, + "logps/rejected": -207.06668090820312, + "loss": 0.5861, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.250434547662735, + "rewards/margins": 0.7638150453567505, + "rewards/rejected": -1.014249563217163, + "step": 1070 + }, + { + "epoch": 0.26, + "learning_rate": 4.330392943063352e-07, + "logits/chosen": -2.9118402004241943, + "logits/rejected": -2.755366325378418, + "logps/chosen": -277.2066345214844, + "logps/rejected": -326.04217529296875, + "loss": 0.6995, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.11643274873495102, + "rewards/margins": 1.0064040422439575, + "rewards/rejected": -0.8899710774421692, + "step": 1080 + }, + { + "epoch": 0.26, + "learning_rate": 4.370489174017642e-07, + "logits/chosen": -2.7893195152282715, + "logits/rejected": -2.785935878753662, + "logps/chosen": -145.8661651611328, + "logps/rejected": -232.83871459960938, + "loss": 0.7166, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47274595499038696, + "rewards/margins": 1.0937156677246094, + "rewards/rejected": -1.5664615631103516, + "step": 1090 + }, + { + "epoch": 0.26, + "learning_rate": 4.4105854049719326e-07, + "logits/chosen": -2.636298418045044, + "logits/rejected": -2.638779401779175, + "logps/chosen": -279.6634521484375, + "logps/rejected": -199.56356811523438, + "loss": 0.6493, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5719181299209595, + "rewards/margins": 0.5789961814880371, + "rewards/rejected": -1.1509143114089966, + "step": 1100 + }, + { + "epoch": 0.27, + "learning_rate": 4.4506816359262227e-07, + "logits/chosen": -2.9740004539489746, + "logits/rejected": -2.8613905906677246, + "logps/chosen": -353.7268981933594, + "logps/rejected": -265.4058532714844, + "loss": 0.4656, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.07540614902973175, + "rewards/margins": 1.0848721265792847, + "rewards/rejected": -1.1602783203125, + "step": 1110 + }, + { + "epoch": 0.27, + "learning_rate": 4.490777866880513e-07, + "logits/chosen": -3.0363729000091553, + "logits/rejected": -2.765496015548706, + "logps/chosen": -289.9078369140625, + "logps/rejected": -216.5182647705078, + "loss": 0.6634, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.1546178013086319, + "rewards/margins": 1.15028977394104, + "rewards/rejected": -0.9956720471382141, + "step": 1120 + }, + { + "epoch": 0.27, + "learning_rate": 4.530874097834803e-07, + "logits/chosen": -2.84161639213562, + "logits/rejected": -2.7532410621643066, + "logps/chosen": -203.11619567871094, + "logps/rejected": -198.95468139648438, + "loss": 0.5293, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.09384959191083908, + "rewards/margins": 1.0006788969039917, + "rewards/rejected": -0.9068293571472168, + "step": 1130 + }, + { + "epoch": 0.27, + "learning_rate": 4.570970328789094e-07, + "logits/chosen": -2.73176908493042, + "logits/rejected": -2.7268660068511963, + "logps/chosen": -284.72882080078125, + "logps/rejected": -385.8221435546875, + "loss": 0.6296, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.8947023153305054, + "rewards/margins": 2.0185418128967285, + "rewards/rejected": -2.9132442474365234, + "step": 1140 + }, + { + "epoch": 0.28, + "learning_rate": 4.611066559743384e-07, + "logits/chosen": -2.810541868209839, + "logits/rejected": -2.868605136871338, + "logps/chosen": -280.7793273925781, + "logps/rejected": -270.59173583984375, + "loss": 0.4983, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2828827202320099, + "rewards/margins": 0.7669601440429688, + "rewards/rejected": -1.0498428344726562, + "step": 1150 + }, + { + "epoch": 0.28, + "learning_rate": 4.6511627906976743e-07, + "logits/chosen": -2.673184871673584, + "logits/rejected": -2.787496328353882, + "logps/chosen": -229.1334228515625, + "logps/rejected": -247.783935546875, + "loss": 0.546, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19098523259162903, + "rewards/margins": 0.7509520649909973, + "rewards/rejected": -0.9419373273849487, + "step": 1160 + }, + { + "epoch": 0.28, + "learning_rate": 4.6912590216519644e-07, + "logits/chosen": -2.7520358562469482, + "logits/rejected": -2.7745821475982666, + "logps/chosen": -254.341796875, + "logps/rejected": -270.19219970703125, + "loss": 0.6116, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.44708842039108276, + "rewards/margins": 0.5681658983230591, + "rewards/rejected": -1.0152543783187866, + "step": 1170 + }, + { + "epoch": 0.28, + "learning_rate": 4.731355252606255e-07, + "logits/chosen": -2.7063660621643066, + "logits/rejected": -2.681487560272217, + "logps/chosen": -252.1110076904297, + "logps/rejected": -236.2973175048828, + "loss": 0.6036, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.22551283240318298, + "rewards/margins": 1.2284777164459229, + "rewards/rejected": -1.002964735031128, + "step": 1180 + }, + { + "epoch": 0.29, + "learning_rate": 4.771451483560545e-07, + "logits/chosen": -2.7218141555786133, + "logits/rejected": -2.686691999435425, + "logps/chosen": -270.92645263671875, + "logps/rejected": -258.6405029296875, + "loss": 0.4865, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.3326466977596283, + "rewards/margins": 1.375840425491333, + "rewards/rejected": -1.0431935787200928, + "step": 1190 + }, + { + "epoch": 0.29, + "learning_rate": 4.811547714514836e-07, + "logits/chosen": -2.9489059448242188, + "logits/rejected": -2.862583637237549, + "logps/chosen": -250.0316619873047, + "logps/rejected": -228.2823944091797, + "loss": 0.5728, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3286787271499634, + "rewards/margins": 0.5910458564758301, + "rewards/rejected": -0.9197246432304382, + "step": 1200 + }, + { + "epoch": 0.29, + "eval_logits/chosen": -2.505753993988037, + "eval_logits/rejected": -2.4770805835723877, + "eval_logps/chosen": -209.3626708984375, + "eval_logps/rejected": -207.41122436523438, + "eval_loss": 0.5323905944824219, + "eval_rewards/accuracies": 0.6424999833106995, + "eval_rewards/chosen": -0.7435135841369629, + "eval_rewards/margins": 1.0445295572280884, + "eval_rewards/rejected": -1.7880432605743408, + "eval_runtime": 131.8001, + "eval_samples_per_second": 23.945, + "eval_steps_per_second": 0.379, + "step": 1200 + }, + { + "epoch": 0.29, + "learning_rate": 4.851643945469126e-07, + "logits/chosen": -2.7622861862182617, + "logits/rejected": -2.752068281173706, + "logps/chosen": -209.67770385742188, + "logps/rejected": -191.7538604736328, + "loss": 0.5126, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.07353958487510681, + "rewards/margins": 1.6461225748062134, + "rewards/rejected": -1.719662070274353, + "step": 1210 + }, + { + "epoch": 0.29, + "learning_rate": 4.891740176423416e-07, + "logits/chosen": -2.607412815093994, + "logits/rejected": -2.6985273361206055, + "logps/chosen": -273.7203063964844, + "logps/rejected": -259.5631103515625, + "loss": 0.5619, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.542596697807312, + "rewards/margins": 1.319077491760254, + "rewards/rejected": -1.8616740703582764, + "step": 1220 + }, + { + "epoch": 0.3, + "learning_rate": 4.931836407377706e-07, + "logits/chosen": -2.8004584312438965, + "logits/rejected": -2.67830491065979, + "logps/chosen": -290.6675109863281, + "logps/rejected": -207.16372680664062, + "loss": 0.5771, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.5204219818115234, + "rewards/margins": 1.4301979541778564, + "rewards/rejected": -1.9506199359893799, + "step": 1230 + }, + { + "epoch": 0.3, + "learning_rate": 4.971932638331996e-07, + "logits/chosen": -2.8502564430236816, + "logits/rejected": -2.75368070602417, + "logps/chosen": -262.37274169921875, + "logps/rejected": -303.19281005859375, + "loss": 0.5791, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.9246293902397156, + "rewards/margins": 0.9244276285171509, + "rewards/rejected": -1.8490569591522217, + "step": 1240 + }, + { + "epoch": 0.3, + "learning_rate": 4.998662863255482e-07, + "logits/chosen": -2.893864154815674, + "logits/rejected": -2.782992362976074, + "logps/chosen": -311.4219055175781, + "logps/rejected": -214.0582275390625, + "loss": 0.7277, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8336235284805298, + "rewards/margins": 1.3086597919464111, + "rewards/rejected": -2.1422832012176514, + "step": 1250 + }, + { + "epoch": 0.3, + "learning_rate": 4.994205740773756e-07, + "logits/chosen": -2.7738232612609863, + "logits/rejected": -2.7849669456481934, + "logps/chosen": -237.98391723632812, + "logps/rejected": -249.3274383544922, + "loss": 0.5599, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3302724361419678, + "rewards/margins": 0.6252075433731079, + "rewards/rejected": -1.9554798603057861, + "step": 1260 + }, + { + "epoch": 0.31, + "learning_rate": 4.989748618292031e-07, + "logits/chosen": -2.7021219730377197, + "logits/rejected": -2.466907024383545, + "logps/chosen": -318.3983154296875, + "logps/rejected": -291.46417236328125, + "loss": 0.5715, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.451961636543274, + "rewards/margins": 1.726300835609436, + "rewards/rejected": -3.178262233734131, + "step": 1270 + }, + { + "epoch": 0.31, + "learning_rate": 4.985291495810304e-07, + "logits/chosen": -2.810925006866455, + "logits/rejected": -2.7793993949890137, + "logps/chosen": -438.08441162109375, + "logps/rejected": -412.2474060058594, + "loss": 0.4904, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.7998260855674744, + "rewards/margins": 1.8961460590362549, + "rewards/rejected": -2.695971965789795, + "step": 1280 + }, + { + "epoch": 0.31, + "learning_rate": 4.980834373328579e-07, + "logits/chosen": -2.717757225036621, + "logits/rejected": -2.6055784225463867, + "logps/chosen": -280.00042724609375, + "logps/rejected": -252.255859375, + "loss": 0.5581, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6964629888534546, + "rewards/margins": 1.0425831079483032, + "rewards/rejected": -1.7390460968017578, + "step": 1290 + }, + { + "epoch": 0.31, + "learning_rate": 4.976377250846854e-07, + "logits/chosen": -2.8040218353271484, + "logits/rejected": -2.7467641830444336, + "logps/chosen": -267.19720458984375, + "logps/rejected": -313.90496826171875, + "loss": 0.8625, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.043508004397153854, + "rewards/margins": 1.1382510662078857, + "rewards/rejected": -1.1817591190338135, + "step": 1300 + }, + { + "epoch": 0.32, + "learning_rate": 4.971920128365127e-07, + "logits/chosen": -2.809016466140747, + "logits/rejected": -2.7497928142547607, + "logps/chosen": -343.20196533203125, + "logps/rejected": -262.23370361328125, + "loss": 0.612, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.9264835119247437, + "rewards/margins": 1.130632758140564, + "rewards/rejected": -2.0571162700653076, + "step": 1310 + }, + { + "epoch": 0.32, + "learning_rate": 4.967463005883402e-07, + "logits/chosen": -2.731447696685791, + "logits/rejected": -2.6999869346618652, + "logps/chosen": -314.8204345703125, + "logps/rejected": -280.8206787109375, + "loss": 0.5644, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8434036374092102, + "rewards/margins": 0.6722986102104187, + "rewards/rejected": -1.515702247619629, + "step": 1320 + }, + { + "epoch": 0.32, + "learning_rate": 4.963005883401676e-07, + "logits/chosen": -2.5341479778289795, + "logits/rejected": -2.5332157611846924, + "logps/chosen": -202.1023406982422, + "logps/rejected": -190.7056427001953, + "loss": 0.6538, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0355693101882935, + "rewards/margins": 0.49540454149246216, + "rewards/rejected": -1.5309737920761108, + "step": 1330 + }, + { + "epoch": 0.32, + "learning_rate": 4.95854876091995e-07, + "logits/chosen": -2.5441746711730957, + "logits/rejected": -2.4266505241394043, + "logps/chosen": -311.6667785644531, + "logps/rejected": -295.31048583984375, + "loss": 0.6216, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.6037508845329285, + "rewards/margins": 1.1581532955169678, + "rewards/rejected": -1.7619041204452515, + "step": 1340 + }, + { + "epoch": 0.32, + "learning_rate": 4.954091638438224e-07, + "logits/chosen": -2.677034854888916, + "logits/rejected": -2.6806418895721436, + "logps/chosen": -245.2216796875, + "logps/rejected": -239.7243194580078, + "loss": 0.5543, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7556982040405273, + "rewards/margins": 1.2748024463653564, + "rewards/rejected": -3.030500888824463, + "step": 1350 + }, + { + "epoch": 0.33, + "learning_rate": 4.949634515956499e-07, + "logits/chosen": -2.53664493560791, + "logits/rejected": -2.3591296672821045, + "logps/chosen": -243.69992065429688, + "logps/rejected": -233.404296875, + "loss": 0.4621, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.5793843269348145, + "rewards/margins": 1.578896164894104, + "rewards/rejected": -4.158279895782471, + "step": 1360 + }, + { + "epoch": 0.33, + "learning_rate": 4.945177393474772e-07, + "logits/chosen": -2.6049957275390625, + "logits/rejected": -2.430155038833618, + "logps/chosen": -348.60540771484375, + "logps/rejected": -263.60772705078125, + "loss": 0.4201, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.364793300628662, + "rewards/margins": 1.9968388080596924, + "rewards/rejected": -3.3616321086883545, + "step": 1370 + }, + { + "epoch": 0.33, + "learning_rate": 4.940720270993047e-07, + "logits/chosen": -2.5204038619995117, + "logits/rejected": -2.5334765911102295, + "logps/chosen": -231.40945434570312, + "logps/rejected": -271.99517822265625, + "loss": 0.5314, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6743671894073486, + "rewards/margins": 0.9893819689750671, + "rewards/rejected": -2.6637492179870605, + "step": 1380 + }, + { + "epoch": 0.33, + "learning_rate": 4.936263148511321e-07, + "logits/chosen": -2.5044212341308594, + "logits/rejected": -2.675902843475342, + "logps/chosen": -305.47100830078125, + "logps/rejected": -262.70404052734375, + "loss": 0.6683, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.6784005165100098, + "rewards/margins": 0.4521772265434265, + "rewards/rejected": -3.130577802658081, + "step": 1390 + }, + { + "epoch": 0.34, + "learning_rate": 4.931806026029595e-07, + "logits/chosen": -2.485682964324951, + "logits/rejected": -2.54756498336792, + "logps/chosen": -285.7750244140625, + "logps/rejected": -288.01324462890625, + "loss": 0.665, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9433006048202515, + "rewards/margins": 1.605025053024292, + "rewards/rejected": -2.548325777053833, + "step": 1400 + }, + { + "epoch": 0.34, + "learning_rate": 4.927348903547869e-07, + "logits/chosen": -2.837623357772827, + "logits/rejected": -2.789177656173706, + "logps/chosen": -302.69189453125, + "logps/rejected": -281.81427001953125, + "loss": 0.5974, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -1.31369948387146, + "rewards/margins": 0.359649121761322, + "rewards/rejected": -1.6733486652374268, + "step": 1410 + }, + { + "epoch": 0.34, + "learning_rate": 4.922891781066144e-07, + "logits/chosen": -2.6415786743164062, + "logits/rejected": -2.67337703704834, + "logps/chosen": -215.93118286132812, + "logps/rejected": -224.37893676757812, + "loss": 0.5327, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8797400593757629, + "rewards/margins": 1.2374986410140991, + "rewards/rejected": -2.117238759994507, + "step": 1420 + }, + { + "epoch": 0.34, + "learning_rate": 4.918434658584418e-07, + "logits/chosen": -2.727658748626709, + "logits/rejected": -2.574336051940918, + "logps/chosen": -404.3485412597656, + "logps/rejected": -287.9634094238281, + "loss": 0.5279, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3983969688415527, + "rewards/margins": 1.1075738668441772, + "rewards/rejected": -2.5059709548950195, + "step": 1430 + }, + { + "epoch": 0.35, + "learning_rate": 4.913977536102692e-07, + "logits/chosen": -2.5600686073303223, + "logits/rejected": -2.516930103302002, + "logps/chosen": -232.35513305664062, + "logps/rejected": -251.1455535888672, + "loss": 0.5859, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8366307020187378, + "rewards/margins": 1.2499377727508545, + "rewards/rejected": -2.0865683555603027, + "step": 1440 + }, + { + "epoch": 0.35, + "learning_rate": 4.909520413620967e-07, + "logits/chosen": -2.480833053588867, + "logits/rejected": -2.467857599258423, + "logps/chosen": -271.05987548828125, + "logps/rejected": -275.52978515625, + "loss": 0.5631, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.09460250288248062, + "rewards/margins": 1.4983643293380737, + "rewards/rejected": -1.4037621021270752, + "step": 1450 + }, + { + "epoch": 0.35, + "learning_rate": 4.90506329113924e-07, + "logits/chosen": -2.670718193054199, + "logits/rejected": -2.590294361114502, + "logps/chosen": -323.92510986328125, + "logps/rejected": -307.1265869140625, + "loss": 0.5922, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.5208776593208313, + "rewards/margins": 1.4484494924545288, + "rewards/rejected": -1.9693269729614258, + "step": 1460 + }, + { + "epoch": 0.35, + "learning_rate": 4.900606168657515e-07, + "logits/chosen": -2.4288439750671387, + "logits/rejected": -2.445039987564087, + "logps/chosen": -283.4742431640625, + "logps/rejected": -322.4749755859375, + "loss": 0.6183, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4125239849090576, + "rewards/margins": 1.4633347988128662, + "rewards/rejected": -2.875858783721924, + "step": 1470 + }, + { + "epoch": 0.36, + "learning_rate": 4.896149046175789e-07, + "logits/chosen": -2.7901694774627686, + "logits/rejected": -2.541984796524048, + "logps/chosen": -308.7502136230469, + "logps/rejected": -289.9494934082031, + "loss": 0.6705, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.36407652497291565, + "rewards/margins": 1.3868237733840942, + "rewards/rejected": -1.7509002685546875, + "step": 1480 + }, + { + "epoch": 0.36, + "learning_rate": 4.891691923694063e-07, + "logits/chosen": -2.732682704925537, + "logits/rejected": -2.7537662982940674, + "logps/chosen": -336.2591857910156, + "logps/rejected": -362.84039306640625, + "loss": 0.5476, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7052796483039856, + "rewards/margins": 1.150768518447876, + "rewards/rejected": -1.8560482263565063, + "step": 1490 + }, + { + "epoch": 0.36, + "learning_rate": 4.887234801212337e-07, + "logits/chosen": -2.4859511852264404, + "logits/rejected": -2.400803565979004, + "logps/chosen": -212.07174682617188, + "logps/rejected": -222.74081420898438, + "loss": 0.7352, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.265228509902954, + "rewards/margins": 0.8272930383682251, + "rewards/rejected": -2.0925214290618896, + "step": 1500 + }, + { + "epoch": 0.36, + "learning_rate": 4.882777678730611e-07, + "logits/chosen": -2.7351162433624268, + "logits/rejected": -2.7322583198547363, + "logps/chosen": -275.1295471191406, + "logps/rejected": -260.4400634765625, + "loss": 0.6142, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14362294971942902, + "rewards/margins": 1.4526232481002808, + "rewards/rejected": -1.596246361732483, + "step": 1510 + }, + { + "epoch": 0.37, + "learning_rate": 4.878320556248885e-07, + "logits/chosen": -2.7501864433288574, + "logits/rejected": -2.702101230621338, + "logps/chosen": -233.5178985595703, + "logps/rejected": -226.6597137451172, + "loss": 0.5753, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.8510408401489258, + "rewards/margins": 0.6280291676521301, + "rewards/rejected": -1.4790699481964111, + "step": 1520 + }, + { + "epoch": 0.37, + "learning_rate": 4.87386343376716e-07, + "logits/chosen": -2.802588701248169, + "logits/rejected": -2.6868271827697754, + "logps/chosen": -242.33499145507812, + "logps/rejected": -192.27235412597656, + "loss": 0.4946, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.8795742988586426, + "rewards/margins": 0.8144742846488953, + "rewards/rejected": -1.694048523902893, + "step": 1530 + }, + { + "epoch": 0.37, + "learning_rate": 4.869406311285433e-07, + "logits/chosen": -2.6833994388580322, + "logits/rejected": -2.850263833999634, + "logps/chosen": -232.25537109375, + "logps/rejected": -290.7073974609375, + "loss": 0.5636, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9992902874946594, + "rewards/margins": 0.7843549847602844, + "rewards/rejected": -1.7836453914642334, + "step": 1540 + }, + { + "epoch": 0.37, + "learning_rate": 4.864949188803708e-07, + "logits/chosen": -2.731792688369751, + "logits/rejected": -2.669426441192627, + "logps/chosen": -272.5797424316406, + "logps/rejected": -257.44805908203125, + "loss": 0.5954, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.6422157883644104, + "rewards/margins": 1.3242002725601196, + "rewards/rejected": -1.9664160013198853, + "step": 1550 + }, + { + "epoch": 0.38, + "learning_rate": 4.860492066321983e-07, + "logits/chosen": -2.6287994384765625, + "logits/rejected": -2.6140027046203613, + "logps/chosen": -208.34054565429688, + "logps/rejected": -211.9888153076172, + "loss": 0.5959, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.674298882484436, + "rewards/margins": 0.8545886874198914, + "rewards/rejected": -2.5288877487182617, + "step": 1560 + }, + { + "epoch": 0.38, + "learning_rate": 4.856034943840256e-07, + "logits/chosen": -2.547799825668335, + "logits/rejected": -2.57979154586792, + "logps/chosen": -285.1658630371094, + "logps/rejected": -372.6661071777344, + "loss": 0.6426, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1689493656158447, + "rewards/margins": 0.5165742635726929, + "rewards/rejected": -1.6855236291885376, + "step": 1570 + }, + { + "epoch": 0.38, + "learning_rate": 4.851577821358531e-07, + "logits/chosen": -2.5380287170410156, + "logits/rejected": -2.440687894821167, + "logps/chosen": -205.4988555908203, + "logps/rejected": -277.82818603515625, + "loss": 0.5209, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0934346914291382, + "rewards/margins": 2.067070484161377, + "rewards/rejected": -3.1605048179626465, + "step": 1580 + }, + { + "epoch": 0.38, + "learning_rate": 4.847120698876805e-07, + "logits/chosen": -2.5879967212677, + "logits/rejected": -2.610872268676758, + "logps/chosen": -254.39285278320312, + "logps/rejected": -231.7747344970703, + "loss": 0.6221, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.5625192523002625, + "rewards/margins": 1.6473850011825562, + "rewards/rejected": -2.2099039554595947, + "step": 1590 + }, + { + "epoch": 0.39, + "learning_rate": 4.842663576395079e-07, + "logits/chosen": -2.5071847438812256, + "logits/rejected": -2.4455130100250244, + "logps/chosen": -236.2644805908203, + "logps/rejected": -242.8035125732422, + "loss": 0.7378, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.3004558086395264, + "rewards/margins": 1.0624239444732666, + "rewards/rejected": -2.362879514694214, + "step": 1600 + }, + { + "epoch": 0.39, + "eval_logits/chosen": -2.452479600906372, + "eval_logits/rejected": -2.423743486404419, + "eval_logps/chosen": -218.31736755371094, + "eval_logps/rejected": -218.83828735351562, + "eval_loss": 0.5212501883506775, + "eval_rewards/accuracies": 0.6650000214576721, + "eval_rewards/chosen": -1.6389820575714111, + "eval_rewards/margins": 1.2917686700820923, + "eval_rewards/rejected": -2.930750846862793, + "eval_runtime": 132.2425, + "eval_samples_per_second": 23.865, + "eval_steps_per_second": 0.378, + "step": 1600 + }, + { + "epoch": 0.39, + "learning_rate": 4.838206453913353e-07, + "logits/chosen": -2.909114122390747, + "logits/rejected": -2.7687737941741943, + "logps/chosen": -298.7687683105469, + "logps/rejected": -277.6521301269531, + "loss": 0.5541, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.6247934103012085, + "rewards/margins": 0.7123693227767944, + "rewards/rejected": -2.337162733078003, + "step": 1610 + }, + { + "epoch": 0.39, + "learning_rate": 4.833749331431628e-07, + "logits/chosen": -2.768028497695923, + "logits/rejected": -2.6386196613311768, + "logps/chosen": -237.07260131835938, + "logps/rejected": -174.32333374023438, + "loss": 0.5605, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0496273040771484, + "rewards/margins": 1.1658755540847778, + "rewards/rejected": -2.215503215789795, + "step": 1620 + }, + { + "epoch": 0.39, + "learning_rate": 4.829292208949901e-07, + "logits/chosen": -2.676975965499878, + "logits/rejected": -2.677523612976074, + "logps/chosen": -228.9412078857422, + "logps/rejected": -234.9801788330078, + "loss": 0.5557, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3257936239242554, + "rewards/margins": 0.8237881660461426, + "rewards/rejected": -2.1495819091796875, + "step": 1630 + }, + { + "epoch": 0.39, + "learning_rate": 4.824835086468176e-07, + "logits/chosen": -2.5803751945495605, + "logits/rejected": -2.5565459728240967, + "logps/chosen": -263.369384765625, + "logps/rejected": -232.2301025390625, + "loss": 0.5663, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6426874399185181, + "rewards/margins": 1.2953565120697021, + "rewards/rejected": -1.9380439519882202, + "step": 1640 + }, + { + "epoch": 0.4, + "learning_rate": 4.82037796398645e-07, + "logits/chosen": -2.593459367752075, + "logits/rejected": -2.6587584018707275, + "logps/chosen": -128.81832885742188, + "logps/rejected": -164.3067169189453, + "loss": 0.5158, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4609547555446625, + "rewards/margins": 1.3585450649261475, + "rewards/rejected": -1.8194997310638428, + "step": 1650 + }, + { + "epoch": 0.4, + "learning_rate": 4.815920841504724e-07, + "logits/chosen": -2.5619707107543945, + "logits/rejected": -2.4942357540130615, + "logps/chosen": -210.661865234375, + "logps/rejected": -316.95404052734375, + "loss": 0.6839, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6866597533226013, + "rewards/margins": 1.1276859045028687, + "rewards/rejected": -1.8143457174301147, + "step": 1660 + }, + { + "epoch": 0.4, + "learning_rate": 4.811463719022998e-07, + "logits/chosen": -2.5982155799865723, + "logits/rejected": -2.50154447555542, + "logps/chosen": -353.1432189941406, + "logps/rejected": -342.078125, + "loss": 0.6005, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9795411825180054, + "rewards/margins": 0.43017083406448364, + "rewards/rejected": -2.409712314605713, + "step": 1670 + }, + { + "epoch": 0.4, + "learning_rate": 4.807006596541273e-07, + "logits/chosen": -2.520991086959839, + "logits/rejected": -2.565416097640991, + "logps/chosen": -164.8997039794922, + "logps/rejected": -187.4937286376953, + "loss": 0.576, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5802173018455505, + "rewards/margins": 0.9242179989814758, + "rewards/rejected": -1.5044353008270264, + "step": 1680 + }, + { + "epoch": 0.41, + "learning_rate": 4.802549474059546e-07, + "logits/chosen": -2.5648231506347656, + "logits/rejected": -2.5870213508605957, + "logps/chosen": -176.57669067382812, + "logps/rejected": -212.6230010986328, + "loss": 0.6139, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2881603538990021, + "rewards/margins": 1.1466704607009888, + "rewards/rejected": -1.434830904006958, + "step": 1690 + }, + { + "epoch": 0.41, + "learning_rate": 4.798092351577821e-07, + "logits/chosen": -2.745094060897827, + "logits/rejected": -2.6509552001953125, + "logps/chosen": -312.0198669433594, + "logps/rejected": -291.9124755859375, + "loss": 0.5733, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5168651342391968, + "rewards/margins": 0.7483233213424683, + "rewards/rejected": -2.265188455581665, + "step": 1700 + }, + { + "epoch": 0.41, + "learning_rate": 4.793635229096096e-07, + "logits/chosen": -2.6192471981048584, + "logits/rejected": -2.6161856651306152, + "logps/chosen": -229.7698516845703, + "logps/rejected": -230.5714111328125, + "loss": 0.4965, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7330110669136047, + "rewards/margins": 0.8800470232963562, + "rewards/rejected": -1.613058090209961, + "step": 1710 + }, + { + "epoch": 0.41, + "learning_rate": 4.789178106614369e-07, + "logits/chosen": -2.5570099353790283, + "logits/rejected": -2.6017849445343018, + "logps/chosen": -223.7205047607422, + "logps/rejected": -221.4395294189453, + "loss": 0.4851, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1352211236953735, + "rewards/margins": 1.2103431224822998, + "rewards/rejected": -2.345564365386963, + "step": 1720 + }, + { + "epoch": 0.42, + "learning_rate": 4.784720984132644e-07, + "logits/chosen": -2.5730772018432617, + "logits/rejected": -2.583688735961914, + "logps/chosen": -199.51190185546875, + "logps/rejected": -205.45703125, + "loss": 0.5504, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.759188711643219, + "rewards/margins": 1.5073745250701904, + "rewards/rejected": -2.2665631771087646, + "step": 1730 + }, + { + "epoch": 0.42, + "learning_rate": 4.780263861650918e-07, + "logits/chosen": -2.744879722595215, + "logits/rejected": -2.656869888305664, + "logps/chosen": -243.7637481689453, + "logps/rejected": -241.1142578125, + "loss": 0.5155, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3867835998535156, + "rewards/margins": 0.6124585866928101, + "rewards/rejected": -1.9992421865463257, + "step": 1740 + }, + { + "epoch": 0.42, + "learning_rate": 4.775806739169192e-07, + "logits/chosen": -2.6300344467163086, + "logits/rejected": -2.5715746879577637, + "logps/chosen": -302.39447021484375, + "logps/rejected": -271.0872497558594, + "loss": 0.5327, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.7226763963699341, + "rewards/margins": 1.8533964157104492, + "rewards/rejected": -2.5760726928710938, + "step": 1750 + }, + { + "epoch": 0.42, + "learning_rate": 4.771349616687466e-07, + "logits/chosen": -2.661221981048584, + "logits/rejected": -2.6473538875579834, + "logps/chosen": -361.41229248046875, + "logps/rejected": -344.7325134277344, + "loss": 0.4577, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5991467237472534, + "rewards/margins": 1.644186019897461, + "rewards/rejected": -2.243332624435425, + "step": 1760 + }, + { + "epoch": 0.43, + "learning_rate": 4.7668924942057403e-07, + "logits/chosen": -2.6630051136016846, + "logits/rejected": -2.716240406036377, + "logps/chosen": -316.25604248046875, + "logps/rejected": -263.24285888671875, + "loss": 0.5225, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.3840251863002777, + "rewards/margins": 1.877362847328186, + "rewards/rejected": -2.261387825012207, + "step": 1770 + }, + { + "epoch": 0.43, + "learning_rate": 4.7624353717240143e-07, + "logits/chosen": -2.6680986881256104, + "logits/rejected": -2.696406364440918, + "logps/chosen": -168.38442993164062, + "logps/rejected": -248.2466583251953, + "loss": 0.5948, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7542056441307068, + "rewards/margins": 1.5609080791473389, + "rewards/rejected": -2.3151137828826904, + "step": 1780 + }, + { + "epoch": 0.43, + "learning_rate": 4.757978249242289e-07, + "logits/chosen": -2.7498373985290527, + "logits/rejected": -2.681201696395874, + "logps/chosen": -416.79718017578125, + "logps/rejected": -327.98785400390625, + "loss": 0.6477, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.382917582988739, + "rewards/margins": 0.13252462446689606, + "rewards/rejected": -0.5154422521591187, + "step": 1790 + }, + { + "epoch": 0.43, + "learning_rate": 4.753521126760563e-07, + "logits/chosen": -2.6633782386779785, + "logits/rejected": -2.591684103012085, + "logps/chosen": -246.0820770263672, + "logps/rejected": -177.46151733398438, + "loss": 0.5584, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04203291982412338, + "rewards/margins": 0.8560365438461304, + "rewards/rejected": -0.8140036463737488, + "step": 1800 + }, + { + "epoch": 0.44, + "learning_rate": 4.749064004278837e-07, + "logits/chosen": -2.725090980529785, + "logits/rejected": -2.6943840980529785, + "logps/chosen": -340.26446533203125, + "logps/rejected": -339.68243408203125, + "loss": 0.4855, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3830810487270355, + "rewards/margins": 1.02919602394104, + "rewards/rejected": -1.4122769832611084, + "step": 1810 + }, + { + "epoch": 0.44, + "learning_rate": 4.7446068817971115e-07, + "logits/chosen": -2.5046231746673584, + "logits/rejected": -2.5039780139923096, + "logps/chosen": -250.435546875, + "logps/rejected": -284.41912841796875, + "loss": 0.5042, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7816805243492126, + "rewards/margins": 0.39527979493141174, + "rewards/rejected": -1.1769602298736572, + "step": 1820 + }, + { + "epoch": 0.44, + "learning_rate": 4.7401497593153855e-07, + "logits/chosen": -2.597954273223877, + "logits/rejected": -2.5391862392425537, + "logps/chosen": -291.6734313964844, + "logps/rejected": -251.46337890625, + "loss": 0.6315, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6559056639671326, + "rewards/margins": 1.4835684299468994, + "rewards/rejected": -2.1394739151000977, + "step": 1830 + }, + { + "epoch": 0.44, + "learning_rate": 4.7356926368336596e-07, + "logits/chosen": -2.6696648597717285, + "logits/rejected": -2.668829917907715, + "logps/chosen": -183.03819274902344, + "logps/rejected": -201.14859008789062, + "loss": 0.62, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.9224878549575806, + "rewards/margins": 0.998814582824707, + "rewards/rejected": -1.9213024377822876, + "step": 1840 + }, + { + "epoch": 0.45, + "learning_rate": 4.731235514351934e-07, + "logits/chosen": -2.6900339126586914, + "logits/rejected": -2.670243263244629, + "logps/chosen": -224.8537139892578, + "logps/rejected": -237.53518676757812, + "loss": 0.6317, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1691210269927979, + "rewards/margins": 0.45907098054885864, + "rewards/rejected": -1.6281919479370117, + "step": 1850 + }, + { + "epoch": 0.45, + "learning_rate": 4.726778391870208e-07, + "logits/chosen": -2.5648159980773926, + "logits/rejected": -2.470302104949951, + "logps/chosen": -237.4529266357422, + "logps/rejected": -268.68359375, + "loss": 0.5522, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.4968430995941162, + "rewards/margins": 1.7916736602783203, + "rewards/rejected": -2.2885167598724365, + "step": 1860 + }, + { + "epoch": 0.45, + "learning_rate": 4.7223212693884827e-07, + "logits/chosen": -2.6383121013641357, + "logits/rejected": -2.6759159564971924, + "logps/chosen": -189.407958984375, + "logps/rejected": -209.6025390625, + "loss": 0.5415, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.683796763420105, + "rewards/margins": 1.0064215660095215, + "rewards/rejected": -1.6902183294296265, + "step": 1870 + }, + { + "epoch": 0.45, + "learning_rate": 4.7178641469067573e-07, + "logits/chosen": -2.578706979751587, + "logits/rejected": -2.5927414894104004, + "logps/chosen": -236.52951049804688, + "logps/rejected": -260.4093933105469, + "loss": 0.5788, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6235929727554321, + "rewards/margins": 0.8559904098510742, + "rewards/rejected": -1.479583501815796, + "step": 1880 + }, + { + "epoch": 0.45, + "learning_rate": 4.7134070244250313e-07, + "logits/chosen": -2.6419436931610107, + "logits/rejected": -2.5469255447387695, + "logps/chosen": -281.54901123046875, + "logps/rejected": -227.68453979492188, + "loss": 0.5113, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8387514352798462, + "rewards/margins": 1.2184150218963623, + "rewards/rejected": -2.057166337966919, + "step": 1890 + }, + { + "epoch": 0.46, + "learning_rate": 4.7089499019433053e-07, + "logits/chosen": -2.676638126373291, + "logits/rejected": -2.459538459777832, + "logps/chosen": -295.4486389160156, + "logps/rejected": -287.54119873046875, + "loss": 0.5771, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.9650393724441528, + "rewards/margins": 1.7956863641738892, + "rewards/rejected": -2.760725498199463, + "step": 1900 + }, + { + "epoch": 0.46, + "learning_rate": 4.70449277946158e-07, + "logits/chosen": -2.6179847717285156, + "logits/rejected": -2.6216704845428467, + "logps/chosen": -258.857177734375, + "logps/rejected": -261.48638916015625, + "loss": 0.5117, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7386892437934875, + "rewards/margins": 0.7860111594200134, + "rewards/rejected": -1.524700403213501, + "step": 1910 + }, + { + "epoch": 0.46, + "learning_rate": 4.700035656979854e-07, + "logits/chosen": -2.4984524250030518, + "logits/rejected": -2.5021138191223145, + "logps/chosen": -280.54449462890625, + "logps/rejected": -229.6431121826172, + "loss": 0.7432, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6804414391517639, + "rewards/margins": 0.9229552149772644, + "rewards/rejected": -1.6033966541290283, + "step": 1920 + }, + { + "epoch": 0.46, + "learning_rate": 4.695578534498128e-07, + "logits/chosen": -2.5745809078216553, + "logits/rejected": -2.5078747272491455, + "logps/chosen": -272.56683349609375, + "logps/rejected": -302.67041015625, + "loss": 0.5336, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08346471190452576, + "rewards/margins": 1.2559754848480225, + "rewards/rejected": -1.3394403457641602, + "step": 1930 + }, + { + "epoch": 0.47, + "learning_rate": 4.691121412016402e-07, + "logits/chosen": -2.747448205947876, + "logits/rejected": -2.6069045066833496, + "logps/chosen": -237.12060546875, + "logps/rejected": -253.00033569335938, + "loss": 0.4895, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3581007421016693, + "rewards/margins": 1.0566563606262207, + "rewards/rejected": -1.4147570133209229, + "step": 1940 + }, + { + "epoch": 0.47, + "learning_rate": 4.6866642895346765e-07, + "logits/chosen": -2.6678454875946045, + "logits/rejected": -2.5825304985046387, + "logps/chosen": -202.19448852539062, + "logps/rejected": -212.3162384033203, + "loss": 0.5431, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2410422563552856, + "rewards/margins": 1.332786202430725, + "rewards/rejected": -2.57382869720459, + "step": 1950 + }, + { + "epoch": 0.47, + "learning_rate": 4.6822071670529506e-07, + "logits/chosen": -2.569096803665161, + "logits/rejected": -2.4656262397766113, + "logps/chosen": -274.194580078125, + "logps/rejected": -203.11862182617188, + "loss": 0.5308, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0657583475112915, + "rewards/margins": 1.6224472522735596, + "rewards/rejected": -2.6882054805755615, + "step": 1960 + }, + { + "epoch": 0.47, + "learning_rate": 4.6777500445712246e-07, + "logits/chosen": -2.5886142253875732, + "logits/rejected": -2.4780044555664062, + "logps/chosen": -217.68875122070312, + "logps/rejected": -191.03097534179688, + "loss": 0.5237, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.506932020187378, + "rewards/margins": 0.9141137003898621, + "rewards/rejected": -2.4210457801818848, + "step": 1970 + }, + { + "epoch": 0.48, + "learning_rate": 4.673292922089499e-07, + "logits/chosen": -2.664307117462158, + "logits/rejected": -2.741649866104126, + "logps/chosen": -227.2781982421875, + "logps/rejected": -243.3974609375, + "loss": 0.735, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9387329816818237, + "rewards/margins": 1.4798619747161865, + "rewards/rejected": -2.4185948371887207, + "step": 1980 + }, + { + "epoch": 0.48, + "learning_rate": 4.668835799607773e-07, + "logits/chosen": -2.511784076690674, + "logits/rejected": -2.568660020828247, + "logps/chosen": -251.56588745117188, + "logps/rejected": -237.3352813720703, + "loss": 0.5213, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.559882402420044, + "rewards/margins": 1.6572726964950562, + "rewards/rejected": -3.2171554565429688, + "step": 1990 + }, + { + "epoch": 0.48, + "learning_rate": 4.664378677126047e-07, + "logits/chosen": -2.5120930671691895, + "logits/rejected": -2.4584739208221436, + "logps/chosen": -405.72674560546875, + "logps/rejected": -305.15484619140625, + "loss": 0.7467, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2938706874847412, + "rewards/margins": 1.316383957862854, + "rewards/rejected": -2.6102547645568848, + "step": 2000 + }, + { + "epoch": 0.48, + "eval_logits/chosen": -2.444129705429077, + "eval_logits/rejected": -2.410576820373535, + "eval_logps/chosen": -224.02638244628906, + "eval_logps/rejected": -223.77809143066406, + "eval_loss": 0.5787909626960754, + "eval_rewards/accuracies": 0.6575000286102295, + "eval_rewards/chosen": -2.209883213043213, + "eval_rewards/margins": 1.2148469686508179, + "eval_rewards/rejected": -3.4247303009033203, + "eval_runtime": 131.4945, + "eval_samples_per_second": 24.001, + "eval_steps_per_second": 0.38, + "step": 2000 + }, + { + "epoch": 0.48, + "learning_rate": 4.659921554644322e-07, + "logits/chosen": -2.6425423622131348, + "logits/rejected": -2.6530404090881348, + "logps/chosen": -226.0823211669922, + "logps/rejected": -147.59510803222656, + "loss": 0.5758, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9719376564025879, + "rewards/margins": 1.2038328647613525, + "rewards/rejected": -2.1757702827453613, + "step": 2010 + }, + { + "epoch": 0.49, + "learning_rate": 4.655464432162596e-07, + "logits/chosen": -2.753293991088867, + "logits/rejected": -2.5513646602630615, + "logps/chosen": -277.55706787109375, + "logps/rejected": -226.85574340820312, + "loss": 0.5395, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.108198642730713, + "rewards/margins": 1.5984714031219482, + "rewards/rejected": -2.706670045852661, + "step": 2020 + }, + { + "epoch": 0.49, + "learning_rate": 4.65100730968087e-07, + "logits/chosen": -2.6071271896362305, + "logits/rejected": -2.6852943897247314, + "logps/chosen": -255.02090454101562, + "logps/rejected": -245.03759765625, + "loss": 0.5993, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.428769588470459, + "rewards/margins": 1.5921337604522705, + "rewards/rejected": -3.0209031105041504, + "step": 2030 + }, + { + "epoch": 0.49, + "learning_rate": 4.6465501871991444e-07, + "logits/chosen": -2.7704215049743652, + "logits/rejected": -2.6319820880889893, + "logps/chosen": -280.4167175292969, + "logps/rejected": -228.3472900390625, + "loss": 0.646, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.04085111618042, + "rewards/margins": 1.1138665676116943, + "rewards/rejected": -2.1547179222106934, + "step": 2040 + }, + { + "epoch": 0.49, + "learning_rate": 4.6420930647174184e-07, + "logits/chosen": -2.737074375152588, + "logits/rejected": -2.7383949756622314, + "logps/chosen": -272.1175231933594, + "logps/rejected": -234.6374053955078, + "loss": 0.5736, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.098402500152588, + "rewards/margins": 0.6982945203781128, + "rewards/rejected": -1.7966970205307007, + "step": 2050 + }, + { + "epoch": 0.5, + "learning_rate": 4.6376359422356924e-07, + "logits/chosen": -2.6888365745544434, + "logits/rejected": -2.6673827171325684, + "logps/chosen": -280.63128662109375, + "logps/rejected": -310.49908447265625, + "loss": 0.5724, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3983144760131836, + "rewards/margins": 0.6582023501396179, + "rewards/rejected": -2.0565168857574463, + "step": 2060 + }, + { + "epoch": 0.5, + "learning_rate": 4.633178819753967e-07, + "logits/chosen": -2.8046553134918213, + "logits/rejected": -2.784102439880371, + "logps/chosen": -249.0012664794922, + "logps/rejected": -285.3121337890625, + "loss": 0.5372, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.148695468902588, + "rewards/margins": 1.1670068502426147, + "rewards/rejected": -2.3157026767730713, + "step": 2070 + }, + { + "epoch": 0.5, + "learning_rate": 4.628721697272241e-07, + "logits/chosen": -2.5930941104888916, + "logits/rejected": -2.52286958694458, + "logps/chosen": -249.0924835205078, + "logps/rejected": -229.8943328857422, + "loss": 0.8827, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8799583911895752, + "rewards/margins": 1.608770728111267, + "rewards/rejected": -3.488729476928711, + "step": 2080 + }, + { + "epoch": 0.5, + "learning_rate": 4.624264574790515e-07, + "logits/chosen": -2.737677574157715, + "logits/rejected": -2.589430332183838, + "logps/chosen": -261.4295654296875, + "logps/rejected": -243.833251953125, + "loss": 0.5408, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.307201862335205, + "rewards/margins": 1.4561229944229126, + "rewards/rejected": -2.7633252143859863, + "step": 2090 + }, + { + "epoch": 0.51, + "learning_rate": 4.619807452308789e-07, + "logits/chosen": -2.608551502227783, + "logits/rejected": -2.5399093627929688, + "logps/chosen": -303.45208740234375, + "logps/rejected": -263.0534973144531, + "loss": 0.5456, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5837156772613525, + "rewards/margins": 1.0279721021652222, + "rewards/rejected": -2.6116878986358643, + "step": 2100 + }, + { + "epoch": 0.51, + "learning_rate": 4.6153503298270636e-07, + "logits/chosen": -2.611738443374634, + "logits/rejected": -2.6551427841186523, + "logps/chosen": -306.83026123046875, + "logps/rejected": -350.64312744140625, + "loss": 1.1069, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8476687669754028, + "rewards/margins": 1.386994481086731, + "rewards/rejected": -2.2346630096435547, + "step": 2110 + }, + { + "epoch": 0.51, + "learning_rate": 4.6108932073453377e-07, + "logits/chosen": -2.672241687774658, + "logits/rejected": -2.5617220401763916, + "logps/chosen": -382.13458251953125, + "logps/rejected": -303.9514465332031, + "loss": 0.5562, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.8237565755844116, + "rewards/margins": 0.574665904045105, + "rewards/rejected": -2.3984227180480957, + "step": 2120 + }, + { + "epoch": 0.51, + "learning_rate": 4.6064360848636117e-07, + "logits/chosen": -2.686922550201416, + "logits/rejected": -2.598443031311035, + "logps/chosen": -272.69525146484375, + "logps/rejected": -339.1822204589844, + "loss": 0.613, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4758816957473755, + "rewards/margins": 0.8712828755378723, + "rewards/rejected": -2.3471646308898926, + "step": 2130 + }, + { + "epoch": 0.52, + "learning_rate": 4.601978962381886e-07, + "logits/chosen": -2.5901780128479004, + "logits/rejected": -2.5972065925598145, + "logps/chosen": -253.97372436523438, + "logps/rejected": -263.4020080566406, + "loss": 0.4924, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.192458152770996, + "rewards/margins": 0.78331458568573, + "rewards/rejected": -1.9757726192474365, + "step": 2140 + }, + { + "epoch": 0.52, + "learning_rate": 4.5975218399001603e-07, + "logits/chosen": -2.484832286834717, + "logits/rejected": -2.5052075386047363, + "logps/chosen": -261.0078430175781, + "logps/rejected": -297.71820068359375, + "loss": 0.6843, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.141125440597534, + "rewards/margins": 0.289185494184494, + "rewards/rejected": -2.4303107261657715, + "step": 2150 + }, + { + "epoch": 0.52, + "learning_rate": 4.5930647174184343e-07, + "logits/chosen": -2.613530397415161, + "logits/rejected": -2.641186475753784, + "logps/chosen": -268.1658935546875, + "logps/rejected": -272.3964538574219, + "loss": 0.4863, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.6121963262557983, + "rewards/margins": 1.899646520614624, + "rewards/rejected": -2.511842727661133, + "step": 2160 + }, + { + "epoch": 0.52, + "learning_rate": 4.588607594936709e-07, + "logits/chosen": -2.601377487182617, + "logits/rejected": -2.5596752166748047, + "logps/chosen": -237.232421875, + "logps/rejected": -274.5160827636719, + "loss": 0.6326, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6485188007354736, + "rewards/margins": 0.939397931098938, + "rewards/rejected": -2.587916612625122, + "step": 2170 + }, + { + "epoch": 0.52, + "learning_rate": 4.584150472454983e-07, + "logits/chosen": -2.469336986541748, + "logits/rejected": -2.4013397693634033, + "logps/chosen": -185.54568481445312, + "logps/rejected": -176.74929809570312, + "loss": 0.5309, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.1838127374649048, + "rewards/margins": 1.5106571912765503, + "rewards/rejected": -2.694469928741455, + "step": 2180 + }, + { + "epoch": 0.53, + "learning_rate": 4.579693349973257e-07, + "logits/chosen": -2.5632500648498535, + "logits/rejected": -2.574326515197754, + "logps/chosen": -185.436279296875, + "logps/rejected": -207.86325073242188, + "loss": 0.6808, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0079572200775146, + "rewards/margins": 1.917676329612732, + "rewards/rejected": -2.925633192062378, + "step": 2190 + }, + { + "epoch": 0.53, + "learning_rate": 4.5752362274915315e-07, + "logits/chosen": -2.6746857166290283, + "logits/rejected": -2.6083481311798096, + "logps/chosen": -223.94442749023438, + "logps/rejected": -205.3218536376953, + "loss": 0.4763, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.9791423678398132, + "rewards/margins": 1.406191110610962, + "rewards/rejected": -2.38533353805542, + "step": 2200 + }, + { + "epoch": 0.53, + "learning_rate": 4.5707791050098055e-07, + "logits/chosen": -2.6126599311828613, + "logits/rejected": -2.6058077812194824, + "logps/chosen": -301.7843322753906, + "logps/rejected": -257.8680725097656, + "loss": 0.6266, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1785779446363449, + "rewards/margins": 1.7274401187896729, + "rewards/rejected": -1.9060180187225342, + "step": 2210 + }, + { + "epoch": 0.53, + "learning_rate": 4.5663219825280795e-07, + "logits/chosen": -2.6819117069244385, + "logits/rejected": -2.62992525100708, + "logps/chosen": -245.77481079101562, + "logps/rejected": -219.6219482421875, + "loss": 0.7883, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7871536016464233, + "rewards/margins": 0.5845667123794556, + "rewards/rejected": -1.371720552444458, + "step": 2220 + }, + { + "epoch": 0.54, + "learning_rate": 4.561864860046354e-07, + "logits/chosen": -2.6489009857177734, + "logits/rejected": -2.623767852783203, + "logps/chosen": -212.5379180908203, + "logps/rejected": -164.32310485839844, + "loss": 0.6103, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6828866600990295, + "rewards/margins": 0.5492819547653198, + "rewards/rejected": -1.232168436050415, + "step": 2230 + }, + { + "epoch": 0.54, + "learning_rate": 4.557407737564628e-07, + "logits/chosen": -2.5661842823028564, + "logits/rejected": -2.5723376274108887, + "logps/chosen": -206.6239776611328, + "logps/rejected": -214.72195434570312, + "loss": 0.462, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9324319958686829, + "rewards/margins": 1.2257258892059326, + "rewards/rejected": -2.1581578254699707, + "step": 2240 + }, + { + "epoch": 0.54, + "learning_rate": 4.552950615082902e-07, + "logits/chosen": -2.4822638034820557, + "logits/rejected": -2.4895377159118652, + "logps/chosen": -242.5191650390625, + "logps/rejected": -286.22686767578125, + "loss": 0.6882, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7670876383781433, + "rewards/margins": 1.1064833402633667, + "rewards/rejected": -1.8735707998275757, + "step": 2250 + }, + { + "epoch": 0.54, + "learning_rate": 4.548493492601176e-07, + "logits/chosen": -2.8437228202819824, + "logits/rejected": -2.701927661895752, + "logps/chosen": -276.03564453125, + "logps/rejected": -298.08099365234375, + "loss": 0.5096, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.7756320238113403, + "rewards/margins": 2.3020224571228027, + "rewards/rejected": -3.0776543617248535, + "step": 2260 + }, + { + "epoch": 0.55, + "learning_rate": 4.544036370119451e-07, + "logits/chosen": -2.841212749481201, + "logits/rejected": -2.7204411029815674, + "logps/chosen": -428.544189453125, + "logps/rejected": -347.85723876953125, + "loss": 0.4426, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2579714059829712, + "rewards/margins": 1.2345914840698242, + "rewards/rejected": -2.492562770843506, + "step": 2270 + }, + { + "epoch": 0.55, + "learning_rate": 4.539579247637725e-07, + "logits/chosen": -2.5310349464416504, + "logits/rejected": -2.4994027614593506, + "logps/chosen": -274.86663818359375, + "logps/rejected": -238.88623046875, + "loss": 0.8814, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.258170485496521, + "rewards/margins": 1.6624419689178467, + "rewards/rejected": -1.9206125736236572, + "step": 2280 + }, + { + "epoch": 0.55, + "learning_rate": 4.535122125155999e-07, + "logits/chosen": -2.3999197483062744, + "logits/rejected": -2.255828380584717, + "logps/chosen": -351.51837158203125, + "logps/rejected": -316.54791259765625, + "loss": 0.9701, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1201560497283936, + "rewards/margins": 1.114150047302246, + "rewards/rejected": -2.2343058586120605, + "step": 2290 + }, + { + "epoch": 0.55, + "learning_rate": 4.5306650026742734e-07, + "logits/chosen": -2.7634806632995605, + "logits/rejected": -2.5712170600891113, + "logps/chosen": -252.85025024414062, + "logps/rejected": -197.05996704101562, + "loss": 0.4855, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.760390043258667, + "rewards/margins": 1.4791388511657715, + "rewards/rejected": -3.2395293712615967, + "step": 2300 + }, + { + "epoch": 0.56, + "learning_rate": 4.5262078801925474e-07, + "logits/chosen": -2.6070544719696045, + "logits/rejected": -2.566847562789917, + "logps/chosen": -208.426025390625, + "logps/rejected": -206.1570281982422, + "loss": 0.5186, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.680463194847107, + "rewards/margins": 1.101180911064148, + "rewards/rejected": -2.781644344329834, + "step": 2310 + }, + { + "epoch": 0.56, + "learning_rate": 4.5217507577108214e-07, + "logits/chosen": -2.776313066482544, + "logits/rejected": -2.7298378944396973, + "logps/chosen": -214.5133056640625, + "logps/rejected": -248.0509033203125, + "loss": 0.5608, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9462732076644897, + "rewards/margins": 1.8917815685272217, + "rewards/rejected": -2.83805513381958, + "step": 2320 + }, + { + "epoch": 0.56, + "learning_rate": 4.517293635229096e-07, + "logits/chosen": -2.7545807361602783, + "logits/rejected": -2.642467498779297, + "logps/chosen": -243.28292846679688, + "logps/rejected": -211.8591766357422, + "loss": 0.7106, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5913723111152649, + "rewards/margins": 1.7651437520980835, + "rewards/rejected": -2.3565163612365723, + "step": 2330 + }, + { + "epoch": 0.56, + "learning_rate": 4.51283651274737e-07, + "logits/chosen": -2.4426515102386475, + "logits/rejected": -2.401933193206787, + "logps/chosen": -229.60049438476562, + "logps/rejected": -246.3610382080078, + "loss": 0.6211, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2280629873275757, + "rewards/margins": 3.3658013343811035, + "rewards/rejected": -4.5938639640808105, + "step": 2340 + }, + { + "epoch": 0.57, + "learning_rate": 4.508379390265644e-07, + "logits/chosen": -2.621298313140869, + "logits/rejected": -2.545727491378784, + "logps/chosen": -201.02578735351562, + "logps/rejected": -177.73696899414062, + "loss": 0.5226, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.283278465270996, + "rewards/margins": 1.3395992517471313, + "rewards/rejected": -2.622877597808838, + "step": 2350 + }, + { + "epoch": 0.57, + "learning_rate": 4.5039222677839186e-07, + "logits/chosen": -2.681291341781616, + "logits/rejected": -2.6904656887054443, + "logps/chosen": -342.61358642578125, + "logps/rejected": -312.4060974121094, + "loss": 0.4658, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.7542943954467773, + "rewards/margins": 0.9285749197006226, + "rewards/rejected": -2.6828696727752686, + "step": 2360 + }, + { + "epoch": 0.57, + "learning_rate": 4.4994651453021926e-07, + "logits/chosen": -2.5337376594543457, + "logits/rejected": -2.5319621562957764, + "logps/chosen": -237.2696075439453, + "logps/rejected": -248.96102905273438, + "loss": 0.4649, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.1634007692337036, + "rewards/margins": 2.1607351303100586, + "rewards/rejected": -3.3241360187530518, + "step": 2370 + }, + { + "epoch": 0.57, + "learning_rate": 4.4950080228204666e-07, + "logits/chosen": -2.6067795753479004, + "logits/rejected": -2.628153085708618, + "logps/chosen": -334.35369873046875, + "logps/rejected": -355.157470703125, + "loss": 0.5153, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9229193925857544, + "rewards/margins": 2.240133047103882, + "rewards/rejected": -3.1630523204803467, + "step": 2380 + }, + { + "epoch": 0.58, + "learning_rate": 4.490550900338741e-07, + "logits/chosen": -2.74542498588562, + "logits/rejected": -2.5125811100006104, + "logps/chosen": -236.3998565673828, + "logps/rejected": -233.5565643310547, + "loss": 0.4774, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.290185570716858, + "rewards/margins": 1.9261367321014404, + "rewards/rejected": -3.216322422027588, + "step": 2390 + }, + { + "epoch": 0.58, + "learning_rate": 4.486093777857015e-07, + "logits/chosen": -2.5763707160949707, + "logits/rejected": -2.471097469329834, + "logps/chosen": -193.45892333984375, + "logps/rejected": -204.922607421875, + "loss": 0.4646, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.373281478881836, + "rewards/margins": 2.0415844917297363, + "rewards/rejected": -3.4148662090301514, + "step": 2400 + }, + { + "epoch": 0.58, + "eval_logits/chosen": -2.3994462490081787, + "eval_logits/rejected": -2.3682971000671387, + "eval_logps/chosen": -213.28709411621094, + "eval_logps/rejected": -216.52792358398438, + "eval_loss": 0.5308729410171509, + "eval_rewards/accuracies": 0.6499999761581421, + "eval_rewards/chosen": -1.1359529495239258, + "eval_rewards/margins": 1.5637621879577637, + "eval_rewards/rejected": -2.6997153759002686, + "eval_runtime": 133.6166, + "eval_samples_per_second": 23.62, + "eval_steps_per_second": 0.374, + "step": 2400 + }, + { + "epoch": 0.58, + "learning_rate": 4.481636655375289e-07, + "logits/chosen": -2.729184627532959, + "logits/rejected": -2.6641573905944824, + "logps/chosen": -295.86700439453125, + "logps/rejected": -331.05535888671875, + "loss": 0.6653, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6393815279006958, + "rewards/margins": 1.4067280292510986, + "rewards/rejected": -2.046109676361084, + "step": 2410 + }, + { + "epoch": 0.58, + "learning_rate": 4.4771795328935633e-07, + "logits/chosen": -2.7169833183288574, + "logits/rejected": -2.603271484375, + "logps/chosen": -278.91632080078125, + "logps/rejected": -227.692138671875, + "loss": 0.7691, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35997486114501953, + "rewards/margins": 1.3083598613739014, + "rewards/rejected": -1.6683346033096313, + "step": 2420 + }, + { + "epoch": 0.58, + "learning_rate": 4.472722410411838e-07, + "logits/chosen": -2.6175537109375, + "logits/rejected": -2.6558189392089844, + "logps/chosen": -192.9453125, + "logps/rejected": -223.7886199951172, + "loss": 0.4445, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.15358349680900574, + "rewards/margins": 2.531790256500244, + "rewards/rejected": -2.6853737831115723, + "step": 2430 + }, + { + "epoch": 0.59, + "learning_rate": 4.468265287930112e-07, + "logits/chosen": -2.7276058197021484, + "logits/rejected": -2.6745474338531494, + "logps/chosen": -276.5566711425781, + "logps/rejected": -318.369873046875, + "loss": 0.4895, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.425123929977417, + "rewards/margins": 0.6360111832618713, + "rewards/rejected": -2.0611350536346436, + "step": 2440 + }, + { + "epoch": 0.59, + "learning_rate": 4.463808165448386e-07, + "logits/chosen": -2.7597084045410156, + "logits/rejected": -2.6362688541412354, + "logps/chosen": -294.7146301269531, + "logps/rejected": -250.98165893554688, + "loss": 0.4817, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0659924745559692, + "rewards/margins": 1.2486473321914673, + "rewards/rejected": -2.3146398067474365, + "step": 2450 + }, + { + "epoch": 0.59, + "learning_rate": 4.4593510429666605e-07, + "logits/chosen": -2.6981377601623535, + "logits/rejected": -2.7366175651550293, + "logps/chosen": -265.9452819824219, + "logps/rejected": -277.0174865722656, + "loss": 0.548, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3343164920806885, + "rewards/margins": 1.3074525594711304, + "rewards/rejected": -2.6417689323425293, + "step": 2460 + }, + { + "epoch": 0.59, + "learning_rate": 4.4548939204849345e-07, + "logits/chosen": -2.667603015899658, + "logits/rejected": -2.713460683822632, + "logps/chosen": -264.29034423828125, + "logps/rejected": -252.31436157226562, + "loss": 0.5833, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.8343324661254883, + "rewards/margins": 0.6806478500366211, + "rewards/rejected": -2.5149803161621094, + "step": 2470 + }, + { + "epoch": 0.6, + "learning_rate": 4.4504367980032085e-07, + "logits/chosen": -2.7401413917541504, + "logits/rejected": -2.7046878337860107, + "logps/chosen": -321.14385986328125, + "logps/rejected": -301.818603515625, + "loss": 0.6218, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9978463053703308, + "rewards/margins": 1.4452444314956665, + "rewards/rejected": -2.4430909156799316, + "step": 2480 + }, + { + "epoch": 0.6, + "learning_rate": 4.445979675521483e-07, + "logits/chosen": -2.6353251934051514, + "logits/rejected": -2.392544984817505, + "logps/chosen": -313.06915283203125, + "logps/rejected": -195.7866668701172, + "loss": 0.5543, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3248649835586548, + "rewards/margins": 1.5439631938934326, + "rewards/rejected": -2.868828058242798, + "step": 2490 + }, + { + "epoch": 0.6, + "learning_rate": 4.441522553039757e-07, + "logits/chosen": -2.6359786987304688, + "logits/rejected": -2.4890024662017822, + "logps/chosen": -260.76361083984375, + "logps/rejected": -262.39825439453125, + "loss": 0.6148, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.1930229663848877, + "rewards/margins": 0.27814993262290955, + "rewards/rejected": -2.471173048019409, + "step": 2500 + }, + { + "epoch": 0.6, + "learning_rate": 4.437065430558031e-07, + "logits/chosen": -2.6569018363952637, + "logits/rejected": -2.6371681690216064, + "logps/chosen": -291.4471740722656, + "logps/rejected": -333.3192138671875, + "loss": 0.6698, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.289780855178833, + "rewards/margins": 1.4741872549057007, + "rewards/rejected": -2.763967990875244, + "step": 2510 + }, + { + "epoch": 0.61, + "learning_rate": 4.4326083080763057e-07, + "logits/chosen": -2.5294575691223145, + "logits/rejected": -2.5719335079193115, + "logps/chosen": -228.34793090820312, + "logps/rejected": -250.3420867919922, + "loss": 0.4181, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.4886128902435303, + "rewards/margins": 2.112856149673462, + "rewards/rejected": -3.6014695167541504, + "step": 2520 + }, + { + "epoch": 0.61, + "learning_rate": 4.4281511855945797e-07, + "logits/chosen": -2.682654857635498, + "logits/rejected": -2.4943184852600098, + "logps/chosen": -200.63919067382812, + "logps/rejected": -166.19149780273438, + "loss": 0.5266, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6488895416259766, + "rewards/margins": 1.4922336339950562, + "rewards/rejected": -3.141123056411743, + "step": 2530 + }, + { + "epoch": 0.61, + "learning_rate": 4.423694063112854e-07, + "logits/chosen": -2.7275164127349854, + "logits/rejected": -2.684523582458496, + "logps/chosen": -190.6834716796875, + "logps/rejected": -216.94180297851562, + "loss": 0.5218, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3709272146224976, + "rewards/margins": 1.007057547569275, + "rewards/rejected": -2.3779845237731934, + "step": 2540 + }, + { + "epoch": 0.61, + "learning_rate": 4.419236940631129e-07, + "logits/chosen": -2.5892624855041504, + "logits/rejected": -2.5918118953704834, + "logps/chosen": -211.045654296875, + "logps/rejected": -239.8479766845703, + "loss": 0.653, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7834230661392212, + "rewards/margins": 1.6225885152816772, + "rewards/rejected": -3.4060111045837402, + "step": 2550 + }, + { + "epoch": 0.62, + "learning_rate": 4.414779818149403e-07, + "logits/chosen": -2.734849452972412, + "logits/rejected": -2.667412519454956, + "logps/chosen": -257.10894775390625, + "logps/rejected": -244.238525390625, + "loss": 0.54, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6349313259124756, + "rewards/margins": 2.199925184249878, + "rewards/rejected": -3.8348567485809326, + "step": 2560 + }, + { + "epoch": 0.62, + "learning_rate": 4.410322695667677e-07, + "logits/chosen": -2.6233067512512207, + "logits/rejected": -2.5422592163085938, + "logps/chosen": -269.6959228515625, + "logps/rejected": -316.9816589355469, + "loss": 0.5798, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6098742485046387, + "rewards/margins": 1.4767448902130127, + "rewards/rejected": -4.0866193771362305, + "step": 2570 + }, + { + "epoch": 0.62, + "learning_rate": 4.4058655731859515e-07, + "logits/chosen": -2.4794352054595947, + "logits/rejected": -2.428112745285034, + "logps/chosen": -286.3292236328125, + "logps/rejected": -268.1867980957031, + "loss": 0.4741, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4831401109695435, + "rewards/margins": 0.8532403111457825, + "rewards/rejected": -2.3363804817199707, + "step": 2580 + }, + { + "epoch": 0.62, + "learning_rate": 4.4014084507042255e-07, + "logits/chosen": -2.5975468158721924, + "logits/rejected": -2.4987661838531494, + "logps/chosen": -272.4716491699219, + "logps/rejected": -291.02294921875, + "loss": 0.5666, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.058257818222046, + "rewards/margins": 1.1403725147247314, + "rewards/rejected": -3.1986308097839355, + "step": 2590 + }, + { + "epoch": 0.63, + "learning_rate": 4.3969513282224995e-07, + "logits/chosen": -2.677935838699341, + "logits/rejected": -2.5781166553497314, + "logps/chosen": -238.8215789794922, + "logps/rejected": -212.71969604492188, + "loss": 0.556, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4124119281768799, + "rewards/margins": 1.2017902135849, + "rewards/rejected": -2.6142020225524902, + "step": 2600 + }, + { + "epoch": 0.63, + "learning_rate": 4.3924942057407735e-07, + "logits/chosen": -2.680534839630127, + "logits/rejected": -2.5556912422180176, + "logps/chosen": -263.3976745605469, + "logps/rejected": -285.26348876953125, + "loss": 0.6082, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5407580137252808, + "rewards/margins": 1.0123125314712524, + "rewards/rejected": -2.553070545196533, + "step": 2610 + }, + { + "epoch": 0.63, + "learning_rate": 4.388037083259048e-07, + "logits/chosen": -2.614470958709717, + "logits/rejected": -2.510915517807007, + "logps/chosen": -390.08685302734375, + "logps/rejected": -334.0655822753906, + "loss": 0.5706, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8687372207641602, + "rewards/margins": 1.8866193294525146, + "rewards/rejected": -3.755356550216675, + "step": 2620 + }, + { + "epoch": 0.63, + "learning_rate": 4.383579960777322e-07, + "logits/chosen": -2.533573627471924, + "logits/rejected": -2.6349072456359863, + "logps/chosen": -263.72540283203125, + "logps/rejected": -261.92388916015625, + "loss": 0.5274, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.517315149307251, + "rewards/margins": 1.144661784172058, + "rewards/rejected": -3.6619770526885986, + "step": 2630 + }, + { + "epoch": 0.64, + "learning_rate": 4.379122838295596e-07, + "logits/chosen": -2.620600461959839, + "logits/rejected": -2.5007054805755615, + "logps/chosen": -364.69366455078125, + "logps/rejected": -349.27996826171875, + "loss": 0.5159, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1532578468322754, + "rewards/margins": 1.53290593624115, + "rewards/rejected": -3.686164140701294, + "step": 2640 + }, + { + "epoch": 0.64, + "learning_rate": 4.3746657158138707e-07, + "logits/chosen": -2.3059136867523193, + "logits/rejected": -2.1973063945770264, + "logps/chosen": -226.91806030273438, + "logps/rejected": -202.16848754882812, + "loss": 0.634, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.9682247638702393, + "rewards/margins": 0.32376235723495483, + "rewards/rejected": -3.291987180709839, + "step": 2650 + }, + { + "epoch": 0.64, + "learning_rate": 4.370208593332145e-07, + "logits/chosen": -2.5784239768981934, + "logits/rejected": -2.463409185409546, + "logps/chosen": -331.9124755859375, + "logps/rejected": -249.6974334716797, + "loss": 0.5939, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.851837158203125, + "rewards/margins": 1.636743187904358, + "rewards/rejected": -3.4885807037353516, + "step": 2660 + }, + { + "epoch": 0.64, + "learning_rate": 4.365751470850419e-07, + "logits/chosen": -2.7009811401367188, + "logits/rejected": -2.597710132598877, + "logps/chosen": -294.39959716796875, + "logps/rejected": -238.6754913330078, + "loss": 0.5219, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5454216003417969, + "rewards/margins": 1.1799639463424683, + "rewards/rejected": -2.7253854274749756, + "step": 2670 + }, + { + "epoch": 0.65, + "learning_rate": 4.3612943483686933e-07, + "logits/chosen": -2.5441927909851074, + "logits/rejected": -2.569514513015747, + "logps/chosen": -254.97341918945312, + "logps/rejected": -258.8553161621094, + "loss": 0.5772, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.0851924419403076, + "rewards/margins": 1.121181845664978, + "rewards/rejected": -3.206374406814575, + "step": 2680 + }, + { + "epoch": 0.65, + "learning_rate": 4.3568372258869674e-07, + "logits/chosen": -2.5438809394836426, + "logits/rejected": -2.5418128967285156, + "logps/chosen": -240.10855102539062, + "logps/rejected": -264.3448791503906, + "loss": 0.6832, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7431503534317017, + "rewards/margins": 1.5108040571212769, + "rewards/rejected": -3.2539544105529785, + "step": 2690 + }, + { + "epoch": 0.65, + "learning_rate": 4.3523801034052414e-07, + "logits/chosen": -2.629892110824585, + "logits/rejected": -2.612617015838623, + "logps/chosen": -258.5018005371094, + "logps/rejected": -254.3931427001953, + "loss": 0.5957, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.77963125705719, + "rewards/margins": 0.42840784788131714, + "rewards/rejected": -2.2080390453338623, + "step": 2700 + }, + { + "epoch": 0.65, + "learning_rate": 4.347922980923516e-07, + "logits/chosen": -2.6034722328186035, + "logits/rejected": -2.6657822132110596, + "logps/chosen": -238.00399780273438, + "logps/rejected": -297.07891845703125, + "loss": 0.5985, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.7608715295791626, + "rewards/margins": 1.8040387630462646, + "rewards/rejected": -3.5649101734161377, + "step": 2710 + }, + { + "epoch": 0.65, + "learning_rate": 4.34346585844179e-07, + "logits/chosen": -2.7711234092712402, + "logits/rejected": -2.6151115894317627, + "logps/chosen": -228.28500366210938, + "logps/rejected": -203.56143188476562, + "loss": 0.6575, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4607006311416626, + "rewards/margins": 1.8166983127593994, + "rewards/rejected": -3.2773985862731934, + "step": 2720 + }, + { + "epoch": 0.66, + "learning_rate": 4.339008735960064e-07, + "logits/chosen": -2.811764717102051, + "logits/rejected": -2.6760783195495605, + "logps/chosen": -369.1084289550781, + "logps/rejected": -295.4292907714844, + "loss": 0.569, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.6835203170776367, + "rewards/margins": 0.6058062314987183, + "rewards/rejected": -2.2893266677856445, + "step": 2730 + }, + { + "epoch": 0.66, + "learning_rate": 4.3345516134783386e-07, + "logits/chosen": -2.595508098602295, + "logits/rejected": -2.546417236328125, + "logps/chosen": -271.2452392578125, + "logps/rejected": -238.8561553955078, + "loss": 0.5094, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9008029699325562, + "rewards/margins": 1.1821850538253784, + "rewards/rejected": -3.0829882621765137, + "step": 2740 + }, + { + "epoch": 0.66, + "learning_rate": 4.3300944909966126e-07, + "logits/chosen": -2.4520745277404785, + "logits/rejected": -2.415116786956787, + "logps/chosen": -362.1225280761719, + "logps/rejected": -327.42095947265625, + "loss": 0.5453, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.5479540824890137, + "rewards/margins": 0.861763596534729, + "rewards/rejected": -3.409717559814453, + "step": 2750 + }, + { + "epoch": 0.66, + "learning_rate": 4.3256373685148866e-07, + "logits/chosen": -2.547978401184082, + "logits/rejected": -2.4545705318450928, + "logps/chosen": -283.8704528808594, + "logps/rejected": -259.1752014160156, + "loss": 0.4791, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.748185157775879, + "rewards/margins": 1.3087536096572876, + "rewards/rejected": -3.056938648223877, + "step": 2760 + }, + { + "epoch": 0.67, + "learning_rate": 4.3211802460331606e-07, + "logits/chosen": -2.5646090507507324, + "logits/rejected": -2.663165807723999, + "logps/chosen": -294.49261474609375, + "logps/rejected": -303.9794006347656, + "loss": 0.7898, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3019287586212158, + "rewards/margins": 1.9290939569473267, + "rewards/rejected": -3.231022596359253, + "step": 2770 + }, + { + "epoch": 0.67, + "learning_rate": 4.316723123551435e-07, + "logits/chosen": -2.7926788330078125, + "logits/rejected": -2.7688939571380615, + "logps/chosen": -325.41058349609375, + "logps/rejected": -332.62506103515625, + "loss": 0.6187, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.767809510231018, + "rewards/margins": 0.045775678008794785, + "rewards/rejected": -1.8135855197906494, + "step": 2780 + }, + { + "epoch": 0.67, + "learning_rate": 4.312266001069709e-07, + "logits/chosen": -2.803236484527588, + "logits/rejected": -2.774876117706299, + "logps/chosen": -264.9578857421875, + "logps/rejected": -234.6368408203125, + "loss": 0.6107, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1145288944244385, + "rewards/margins": 1.3622523546218872, + "rewards/rejected": -2.4767813682556152, + "step": 2790 + }, + { + "epoch": 0.67, + "learning_rate": 4.307808878587983e-07, + "logits/chosen": -2.6970162391662598, + "logits/rejected": -2.4754576683044434, + "logps/chosen": -282.1333923339844, + "logps/rejected": -283.2776794433594, + "loss": 0.7454, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6527990102767944, + "rewards/margins": 0.17735615372657776, + "rewards/rejected": -1.830155372619629, + "step": 2800 + }, + { + "epoch": 0.67, + "eval_logits/chosen": -2.4574854373931885, + "eval_logits/rejected": -2.4288878440856934, + "eval_logps/chosen": -221.9242401123047, + "eval_logps/rejected": -225.12472534179688, + "eval_loss": 0.5290461778640747, + "eval_rewards/accuracies": 0.6700000166893005, + "eval_rewards/chosen": -1.999670386314392, + "eval_rewards/margins": 1.5597243309020996, + "eval_rewards/rejected": -3.559394598007202, + "eval_runtime": 131.6609, + "eval_samples_per_second": 23.971, + "eval_steps_per_second": 0.38, + "step": 2800 + }, + { + "epoch": 0.68, + "learning_rate": 4.303351756106258e-07, + "logits/chosen": -2.6771788597106934, + "logits/rejected": -2.6447653770446777, + "logps/chosen": -224.6023406982422, + "logps/rejected": -263.2858581542969, + "loss": 0.5702, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6448767185211182, + "rewards/margins": 1.2766485214233398, + "rewards/rejected": -2.921525478363037, + "step": 2810 + }, + { + "epoch": 0.68, + "learning_rate": 4.298894633624532e-07, + "logits/chosen": -2.8259623050689697, + "logits/rejected": -2.7117819786071777, + "logps/chosen": -342.81005859375, + "logps/rejected": -322.091064453125, + "loss": 0.5574, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.029432773590088, + "rewards/margins": 2.133007526397705, + "rewards/rejected": -3.162440299987793, + "step": 2820 + }, + { + "epoch": 0.68, + "learning_rate": 4.294437511142806e-07, + "logits/chosen": -2.7422261238098145, + "logits/rejected": -2.6582720279693604, + "logps/chosen": -404.9603271484375, + "logps/rejected": -286.91876220703125, + "loss": 0.4297, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.7746641635894775, + "rewards/margins": 2.1579883098602295, + "rewards/rejected": -2.932652473449707, + "step": 2830 + }, + { + "epoch": 0.68, + "learning_rate": 4.2899803886610804e-07, + "logits/chosen": -2.8271028995513916, + "logits/rejected": -2.699387788772583, + "logps/chosen": -324.6036682128906, + "logps/rejected": -258.8714599609375, + "loss": 0.4797, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6421637535095215, + "rewards/margins": 1.310378074645996, + "rewards/rejected": -1.9525420665740967, + "step": 2840 + }, + { + "epoch": 0.69, + "learning_rate": 4.2855232661793545e-07, + "logits/chosen": -2.7117836475372314, + "logits/rejected": -2.7621607780456543, + "logps/chosen": -298.1368713378906, + "logps/rejected": -316.424560546875, + "loss": 0.5862, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.4727377891540527, + "rewards/margins": 0.6655889749526978, + "rewards/rejected": -3.138327121734619, + "step": 2850 + }, + { + "epoch": 0.69, + "learning_rate": 4.2810661436976285e-07, + "logits/chosen": -2.646487236022949, + "logits/rejected": -2.706075668334961, + "logps/chosen": -233.93753051757812, + "logps/rejected": -271.9765930175781, + "loss": 0.5744, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.41381773352622986, + "rewards/margins": 1.9162561893463135, + "rewards/rejected": -2.3300740718841553, + "step": 2860 + }, + { + "epoch": 0.69, + "learning_rate": 4.276609021215903e-07, + "logits/chosen": -2.6221437454223633, + "logits/rejected": -2.539348602294922, + "logps/chosen": -216.64389038085938, + "logps/rejected": -192.21458435058594, + "loss": 0.5586, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6827747821807861, + "rewards/margins": 1.3041735887527466, + "rewards/rejected": -2.9869484901428223, + "step": 2870 + }, + { + "epoch": 0.69, + "learning_rate": 4.272151898734177e-07, + "logits/chosen": -2.6150918006896973, + "logits/rejected": -2.653275966644287, + "logps/chosen": -368.13031005859375, + "logps/rejected": -327.22955322265625, + "loss": 0.6245, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.465667486190796, + "rewards/margins": 1.6559873819351196, + "rewards/rejected": -3.121654748916626, + "step": 2880 + }, + { + "epoch": 0.7, + "learning_rate": 4.267694776252451e-07, + "logits/chosen": -2.7571969032287598, + "logits/rejected": -2.6127772331237793, + "logps/chosen": -249.05978393554688, + "logps/rejected": -296.90496826171875, + "loss": 0.6755, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7854106426239014, + "rewards/margins": 1.760719656944275, + "rewards/rejected": -3.546130418777466, + "step": 2890 + }, + { + "epoch": 0.7, + "learning_rate": 4.2632376537707257e-07, + "logits/chosen": -2.6566245555877686, + "logits/rejected": -2.586174488067627, + "logps/chosen": -217.7415008544922, + "logps/rejected": -230.2128448486328, + "loss": 0.57, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9581342935562134, + "rewards/margins": 2.0794451236724854, + "rewards/rejected": -4.03757905960083, + "step": 2900 + }, + { + "epoch": 0.7, + "learning_rate": 4.2587805312889997e-07, + "logits/chosen": -2.661797285079956, + "logits/rejected": -2.78605318069458, + "logps/chosen": -210.2810516357422, + "logps/rejected": -258.68243408203125, + "loss": 0.5689, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7073214054107666, + "rewards/margins": 1.2735803127288818, + "rewards/rejected": -2.9809017181396484, + "step": 2910 + }, + { + "epoch": 0.7, + "learning_rate": 4.2543234088072737e-07, + "logits/chosen": -2.734321117401123, + "logits/rejected": -2.6187925338745117, + "logps/chosen": -275.55340576171875, + "logps/rejected": -231.14389038085938, + "loss": 0.6469, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.684971570968628, + "rewards/margins": 0.38038399815559387, + "rewards/rejected": -2.0653557777404785, + "step": 2920 + }, + { + "epoch": 0.71, + "learning_rate": 4.249866286325548e-07, + "logits/chosen": -2.809257984161377, + "logits/rejected": -2.789126396179199, + "logps/chosen": -249.81005859375, + "logps/rejected": -294.38739013671875, + "loss": 0.7303, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1528228521347046, + "rewards/margins": 0.6378811001777649, + "rewards/rejected": -1.7907040119171143, + "step": 2930 + }, + { + "epoch": 0.71, + "learning_rate": 4.2454091638438223e-07, + "logits/chosen": -2.6959519386291504, + "logits/rejected": -2.7930846214294434, + "logps/chosen": -261.52392578125, + "logps/rejected": -261.2728576660156, + "loss": 0.586, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.7664673328399658, + "rewards/margins": 1.2213587760925293, + "rewards/rejected": -2.987825870513916, + "step": 2940 + }, + { + "epoch": 0.71, + "learning_rate": 4.2409520413620963e-07, + "logits/chosen": -2.8703808784484863, + "logits/rejected": -2.840724229812622, + "logps/chosen": -292.93695068359375, + "logps/rejected": -257.15087890625, + "loss": 0.4875, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.9478158950805664, + "rewards/margins": 1.4594476222991943, + "rewards/rejected": -2.4072635173797607, + "step": 2950 + }, + { + "epoch": 0.71, + "learning_rate": 4.2364949188803704e-07, + "logits/chosen": -2.78837513923645, + "logits/rejected": -2.6849160194396973, + "logps/chosen": -189.768798828125, + "logps/rejected": -236.8516082763672, + "loss": 0.4793, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.369784951210022, + "rewards/margins": 1.6183780431747437, + "rewards/rejected": -2.9881629943847656, + "step": 2960 + }, + { + "epoch": 0.71, + "learning_rate": 4.232037796398645e-07, + "logits/chosen": -2.8257720470428467, + "logits/rejected": -2.808422565460205, + "logps/chosen": -217.53842163085938, + "logps/rejected": -213.1930694580078, + "loss": 0.4727, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0007766485214233, + "rewards/margins": 1.5476601123809814, + "rewards/rejected": -2.5484366416931152, + "step": 2970 + }, + { + "epoch": 0.72, + "learning_rate": 4.227580673916919e-07, + "logits/chosen": -2.770376205444336, + "logits/rejected": -2.692963123321533, + "logps/chosen": -344.5162048339844, + "logps/rejected": -301.2332763671875, + "loss": 0.5528, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.7139848470687866, + "rewards/margins": 2.1029775142669678, + "rewards/rejected": -2.816962242126465, + "step": 2980 + }, + { + "epoch": 0.72, + "learning_rate": 4.223123551435193e-07, + "logits/chosen": -2.8515336513519287, + "logits/rejected": -2.7625396251678467, + "logps/chosen": -228.93893432617188, + "logps/rejected": -220.40744018554688, + "loss": 0.5718, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.007663369178772, + "rewards/margins": 1.4957387447357178, + "rewards/rejected": -2.5034019947052, + "step": 2990 + }, + { + "epoch": 0.72, + "learning_rate": 4.2186664289534675e-07, + "logits/chosen": -2.7313835620880127, + "logits/rejected": -2.762516975402832, + "logps/chosen": -317.2395935058594, + "logps/rejected": -308.21319580078125, + "loss": 0.5126, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7839268445968628, + "rewards/margins": 1.3697669506072998, + "rewards/rejected": -2.153693675994873, + "step": 3000 + }, + { + "epoch": 0.72, + "learning_rate": 4.2142093064717416e-07, + "logits/chosen": -2.6275830268859863, + "logits/rejected": -2.659775972366333, + "logps/chosen": -177.7250213623047, + "logps/rejected": -220.8149871826172, + "loss": 0.48, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.5103175640106201, + "rewards/margins": 2.0506412982940674, + "rewards/rejected": -2.5609591007232666, + "step": 3010 + }, + { + "epoch": 0.73, + "learning_rate": 4.2097521839900156e-07, + "logits/chosen": -2.5938849449157715, + "logits/rejected": -2.56001353263855, + "logps/chosen": -314.12158203125, + "logps/rejected": -261.13641357421875, + "loss": 0.5094, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.2560573816299438, + "rewards/margins": 1.8153836727142334, + "rewards/rejected": -3.071441173553467, + "step": 3020 + }, + { + "epoch": 0.73, + "learning_rate": 4.20529506150829e-07, + "logits/chosen": -2.7524189949035645, + "logits/rejected": -2.6022703647613525, + "logps/chosen": -218.4010772705078, + "logps/rejected": -273.52423095703125, + "loss": 0.5383, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.0239105224609375, + "rewards/margins": 2.7949516773223877, + "rewards/rejected": -3.818862199783325, + "step": 3030 + }, + { + "epoch": 0.73, + "learning_rate": 4.200837939026564e-07, + "logits/chosen": -2.6513009071350098, + "logits/rejected": -2.6527931690216064, + "logps/chosen": -229.96286010742188, + "logps/rejected": -232.0447998046875, + "loss": 0.6208, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2891321182250977, + "rewards/margins": 1.7754385471343994, + "rewards/rejected": -3.064570665359497, + "step": 3040 + }, + { + "epoch": 0.73, + "learning_rate": 4.196380816544838e-07, + "logits/chosen": -2.593024730682373, + "logits/rejected": -2.6378841400146484, + "logps/chosen": -232.8154754638672, + "logps/rejected": -195.8571014404297, + "loss": 0.5604, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7388196587562561, + "rewards/margins": 1.4008862972259521, + "rewards/rejected": -2.1397061347961426, + "step": 3050 + }, + { + "epoch": 0.74, + "learning_rate": 4.191923694063113e-07, + "logits/chosen": -2.851116418838501, + "logits/rejected": -2.6879868507385254, + "logps/chosen": -299.67425537109375, + "logps/rejected": -255.0391082763672, + "loss": 0.7818, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.7442834377288818, + "rewards/margins": 0.992772102355957, + "rewards/rejected": -2.737055540084839, + "step": 3060 + }, + { + "epoch": 0.74, + "learning_rate": 4.187466571581387e-07, + "logits/chosen": -2.6634647846221924, + "logits/rejected": -2.553982973098755, + "logps/chosen": -279.292236328125, + "logps/rejected": -202.8257598876953, + "loss": 0.5556, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.6925932168960571, + "rewards/margins": 1.1422004699707031, + "rewards/rejected": -2.8347935676574707, + "step": 3070 + }, + { + "epoch": 0.74, + "learning_rate": 4.183009449099661e-07, + "logits/chosen": -2.6643576622009277, + "logits/rejected": -2.5717310905456543, + "logps/chosen": -342.335693359375, + "logps/rejected": -294.61810302734375, + "loss": 0.5782, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5681596994400024, + "rewards/margins": 1.7290674448013306, + "rewards/rejected": -3.297227144241333, + "step": 3080 + }, + { + "epoch": 0.74, + "learning_rate": 4.178552326617935e-07, + "logits/chosen": -2.5645246505737305, + "logits/rejected": -2.472097873687744, + "logps/chosen": -261.25677490234375, + "logps/rejected": -253.5023193359375, + "loss": 0.4876, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3006632328033447, + "rewards/margins": 2.4436044692993164, + "rewards/rejected": -3.7442679405212402, + "step": 3090 + }, + { + "epoch": 0.75, + "learning_rate": 4.1740952041362094e-07, + "logits/chosen": -2.579249143600464, + "logits/rejected": -2.4710917472839355, + "logps/chosen": -290.3495788574219, + "logps/rejected": -304.74017333984375, + "loss": 0.5786, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.2462234497070312, + "rewards/margins": 1.3442294597625732, + "rewards/rejected": -3.5904529094696045, + "step": 3100 + }, + { + "epoch": 0.75, + "learning_rate": 4.1696380816544834e-07, + "logits/chosen": -2.6869969367980957, + "logits/rejected": -2.743889570236206, + "logps/chosen": -307.068115234375, + "logps/rejected": -354.55279541015625, + "loss": 0.696, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6236753463745117, + "rewards/margins": 1.5922362804412842, + "rewards/rejected": -3.215911388397217, + "step": 3110 + }, + { + "epoch": 0.75, + "learning_rate": 4.1651809591727575e-07, + "logits/chosen": -2.67097806930542, + "logits/rejected": -2.6003124713897705, + "logps/chosen": -201.99729919433594, + "logps/rejected": -250.19094848632812, + "loss": 0.5922, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6611751317977905, + "rewards/margins": 1.9977973699569702, + "rewards/rejected": -3.6589725017547607, + "step": 3120 + }, + { + "epoch": 0.75, + "learning_rate": 4.160723836691032e-07, + "logits/chosen": -2.574385166168213, + "logits/rejected": -2.533609628677368, + "logps/chosen": -277.22808837890625, + "logps/rejected": -255.15725708007812, + "loss": 0.7537, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.093635082244873, + "rewards/margins": 1.2647625207901, + "rewards/rejected": -3.3583977222442627, + "step": 3130 + }, + { + "epoch": 0.76, + "learning_rate": 4.156266714209306e-07, + "logits/chosen": -2.8209471702575684, + "logits/rejected": -2.765892267227173, + "logps/chosen": -250.3872528076172, + "logps/rejected": -296.6456298828125, + "loss": 0.5281, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.0076568126678467, + "rewards/margins": 1.0997529029846191, + "rewards/rejected": -3.107409954071045, + "step": 3140 + }, + { + "epoch": 0.76, + "learning_rate": 4.15180959172758e-07, + "logits/chosen": -2.7392449378967285, + "logits/rejected": -2.685570478439331, + "logps/chosen": -244.25949096679688, + "logps/rejected": -202.15159606933594, + "loss": 0.6135, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -2.1826963424682617, + "rewards/margins": 0.40876954793930054, + "rewards/rejected": -2.591465950012207, + "step": 3150 + }, + { + "epoch": 0.76, + "learning_rate": 4.1473524692458546e-07, + "logits/chosen": -2.697981357574463, + "logits/rejected": -2.688508987426758, + "logps/chosen": -256.394287109375, + "logps/rejected": -239.4425048828125, + "loss": 0.4602, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6544971466064453, + "rewards/margins": 0.9632614254951477, + "rewards/rejected": -2.6177589893341064, + "step": 3160 + }, + { + "epoch": 0.76, + "learning_rate": 4.1428953467641287e-07, + "logits/chosen": -2.6453068256378174, + "logits/rejected": -2.637723445892334, + "logps/chosen": -207.82302856445312, + "logps/rejected": -190.76217651367188, + "loss": 0.5699, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8176633715629578, + "rewards/margins": 1.8111244440078735, + "rewards/rejected": -2.6287875175476074, + "step": 3170 + }, + { + "epoch": 0.77, + "learning_rate": 4.1384382242824027e-07, + "logits/chosen": -2.7648839950561523, + "logits/rejected": -2.561379909515381, + "logps/chosen": -219.7236785888672, + "logps/rejected": -157.59056091308594, + "loss": 0.6589, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.0911527872085571, + "rewards/margins": 0.5300930738449097, + "rewards/rejected": -1.6212456226348877, + "step": 3180 + }, + { + "epoch": 0.77, + "learning_rate": 4.133981101800677e-07, + "logits/chosen": -2.730799436569214, + "logits/rejected": -2.71309757232666, + "logps/chosen": -291.04742431640625, + "logps/rejected": -333.51019287109375, + "loss": 0.6767, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6952773332595825, + "rewards/margins": 0.9070941805839539, + "rewards/rejected": -1.6023715734481812, + "step": 3190 + }, + { + "epoch": 0.77, + "learning_rate": 4.1295239793189513e-07, + "logits/chosen": -2.7877984046936035, + "logits/rejected": -2.6307997703552246, + "logps/chosen": -241.40847778320312, + "logps/rejected": -211.07809448242188, + "loss": 0.6092, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.0838617086410522, + "rewards/margins": 0.9140681028366089, + "rewards/rejected": -1.9979298114776611, + "step": 3200 + }, + { + "epoch": 0.77, + "eval_logits/chosen": -2.5023696422576904, + "eval_logits/rejected": -2.4754655361175537, + "eval_logps/chosen": -218.04718017578125, + "eval_logps/rejected": -220.98233032226562, + "eval_loss": 0.5124280452728271, + "eval_rewards/accuracies": 0.6850000023841858, + "eval_rewards/chosen": -1.6119625568389893, + "eval_rewards/margins": 1.533191204071045, + "eval_rewards/rejected": -3.145153760910034, + "eval_runtime": 131.8008, + "eval_samples_per_second": 23.945, + "eval_steps_per_second": 0.379, + "step": 3200 + }, + { + "epoch": 0.77, + "learning_rate": 4.1250668568372253e-07, + "logits/chosen": -2.6086478233337402, + "logits/rejected": -2.5804197788238525, + "logps/chosen": -203.43679809570312, + "logps/rejected": -236.58547973632812, + "loss": 0.5141, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.9587739109992981, + "rewards/margins": 1.6325676441192627, + "rewards/rejected": -2.591341733932495, + "step": 3210 + }, + { + "epoch": 0.77, + "learning_rate": 4.1206097343555e-07, + "logits/chosen": -2.584949254989624, + "logits/rejected": -2.6324820518493652, + "logps/chosen": -210.24209594726562, + "logps/rejected": -212.9149169921875, + "loss": 0.5011, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7539044618606567, + "rewards/margins": 1.8157832622528076, + "rewards/rejected": -2.569687843322754, + "step": 3220 + }, + { + "epoch": 0.78, + "learning_rate": 4.116152611873774e-07, + "logits/chosen": -2.62009596824646, + "logits/rejected": -2.600926160812378, + "logps/chosen": -257.4697265625, + "logps/rejected": -274.40985107421875, + "loss": 0.5222, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9684908986091614, + "rewards/margins": 1.3399364948272705, + "rewards/rejected": -2.308427333831787, + "step": 3230 + }, + { + "epoch": 0.78, + "learning_rate": 4.1116954893920485e-07, + "logits/chosen": -2.8385872840881348, + "logits/rejected": -2.6282565593719482, + "logps/chosen": -323.53875732421875, + "logps/rejected": -255.8197784423828, + "loss": 0.6685, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5275509357452393, + "rewards/margins": 2.112471580505371, + "rewards/rejected": -3.6400222778320312, + "step": 3240 + }, + { + "epoch": 0.78, + "learning_rate": 4.107238366910323e-07, + "logits/chosen": -2.627898693084717, + "logits/rejected": -2.645310640335083, + "logps/chosen": -275.2945251464844, + "logps/rejected": -285.93389892578125, + "loss": 0.5802, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2948415279388428, + "rewards/margins": 1.8908132314682007, + "rewards/rejected": -3.185654878616333, + "step": 3250 + }, + { + "epoch": 0.78, + "learning_rate": 4.102781244428597e-07, + "logits/chosen": -2.7616629600524902, + "logits/rejected": -2.687288284301758, + "logps/chosen": -284.48553466796875, + "logps/rejected": -241.5064239501953, + "loss": 0.5391, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3835375308990479, + "rewards/margins": 1.3874008655548096, + "rewards/rejected": -2.7709383964538574, + "step": 3260 + }, + { + "epoch": 0.79, + "learning_rate": 4.098324121946871e-07, + "logits/chosen": -2.719691276550293, + "logits/rejected": -2.5804100036621094, + "logps/chosen": -254.3754119873047, + "logps/rejected": -256.03387451171875, + "loss": 0.5452, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.2589216232299805, + "rewards/margins": 2.365007162094116, + "rewards/rejected": -3.6239287853240967, + "step": 3270 + }, + { + "epoch": 0.79, + "learning_rate": 4.093866999465145e-07, + "logits/chosen": -2.795424699783325, + "logits/rejected": -2.647678852081299, + "logps/chosen": -304.1939392089844, + "logps/rejected": -306.00494384765625, + "loss": 0.5856, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8605068922042847, + "rewards/margins": 1.4221422672271729, + "rewards/rejected": -2.282649040222168, + "step": 3280 + }, + { + "epoch": 0.79, + "learning_rate": 4.0894098769834197e-07, + "logits/chosen": -2.6333696842193604, + "logits/rejected": -2.4974284172058105, + "logps/chosen": -313.07904052734375, + "logps/rejected": -293.76153564453125, + "loss": 0.5429, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3331620693206787, + "rewards/margins": 1.9567950963974, + "rewards/rejected": -3.289957046508789, + "step": 3290 + }, + { + "epoch": 0.79, + "learning_rate": 4.0849527545016937e-07, + "logits/chosen": -2.556406021118164, + "logits/rejected": -2.5137507915496826, + "logps/chosen": -340.04840087890625, + "logps/rejected": -318.72686767578125, + "loss": 0.4925, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1319781392812729, + "rewards/margins": 1.5771225690841675, + "rewards/rejected": -1.7091007232666016, + "step": 3300 + }, + { + "epoch": 0.8, + "learning_rate": 4.0804956320199677e-07, + "logits/chosen": -2.4623820781707764, + "logits/rejected": -2.445798635482788, + "logps/chosen": -205.68472290039062, + "logps/rejected": -215.0249786376953, + "loss": 0.5373, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.573722004890442, + "rewards/margins": 1.2487709522247314, + "rewards/rejected": -2.822493076324463, + "step": 3310 + }, + { + "epoch": 0.8, + "learning_rate": 4.0760385095382423e-07, + "logits/chosen": -2.6179795265197754, + "logits/rejected": -2.5789780616760254, + "logps/chosen": -344.1896667480469, + "logps/rejected": -260.4026794433594, + "loss": 0.5802, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7595813870429993, + "rewards/margins": 1.3831188678741455, + "rewards/rejected": -2.1427001953125, + "step": 3320 + }, + { + "epoch": 0.8, + "learning_rate": 4.0715813870565163e-07, + "logits/chosen": -2.6235435009002686, + "logits/rejected": -2.544433355331421, + "logps/chosen": -253.1512908935547, + "logps/rejected": -364.1452331542969, + "loss": 0.4449, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5454899072647095, + "rewards/margins": 2.4067556858062744, + "rewards/rejected": -3.9522452354431152, + "step": 3330 + }, + { + "epoch": 0.8, + "learning_rate": 4.0671242645747903e-07, + "logits/chosen": -2.55584979057312, + "logits/rejected": -2.533679485321045, + "logps/chosen": -328.15985107421875, + "logps/rejected": -349.55181884765625, + "loss": 0.5493, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.9840243458747864, + "rewards/margins": 0.47254204750061035, + "rewards/rejected": -1.456566572189331, + "step": 3340 + }, + { + "epoch": 0.81, + "learning_rate": 4.062667142093065e-07, + "logits/chosen": -2.5850658416748047, + "logits/rejected": -2.600969076156616, + "logps/chosen": -333.9216613769531, + "logps/rejected": -289.7823486328125, + "loss": 0.4948, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2890478372573853, + "rewards/margins": 0.7226174473762512, + "rewards/rejected": -2.011665105819702, + "step": 3350 + }, + { + "epoch": 0.81, + "learning_rate": 4.058210019611339e-07, + "logits/chosen": -2.55297589302063, + "logits/rejected": -2.4784064292907715, + "logps/chosen": -250.0596160888672, + "logps/rejected": -254.9423065185547, + "loss": 0.5404, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4082493782043457, + "rewards/margins": 1.2668209075927734, + "rewards/rejected": -2.675070285797119, + "step": 3360 + }, + { + "epoch": 0.81, + "learning_rate": 4.053752897129613e-07, + "logits/chosen": -2.54938006401062, + "logits/rejected": -2.424170970916748, + "logps/chosen": -283.8787841796875, + "logps/rejected": -223.3419952392578, + "loss": 0.6448, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8317139744758606, + "rewards/margins": 2.6544876098632812, + "rewards/rejected": -3.486201524734497, + "step": 3370 + }, + { + "epoch": 0.81, + "learning_rate": 4.0492957746478875e-07, + "logits/chosen": -2.602226972579956, + "logits/rejected": -2.543545961380005, + "logps/chosen": -337.1436462402344, + "logps/rejected": -331.75274658203125, + "loss": 0.6345, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4520931243896484, + "rewards/margins": 1.1032426357269287, + "rewards/rejected": -2.555335521697998, + "step": 3380 + }, + { + "epoch": 0.82, + "learning_rate": 4.0448386521661615e-07, + "logits/chosen": -2.709064483642578, + "logits/rejected": -2.609285593032837, + "logps/chosen": -232.5091552734375, + "logps/rejected": -223.9417724609375, + "loss": 0.5908, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2587926387786865, + "rewards/margins": 1.2197411060333252, + "rewards/rejected": -3.4785335063934326, + "step": 3390 + }, + { + "epoch": 0.82, + "learning_rate": 4.0403815296844356e-07, + "logits/chosen": -2.738271474838257, + "logits/rejected": -2.594999313354492, + "logps/chosen": -275.84295654296875, + "logps/rejected": -344.08428955078125, + "loss": 0.4963, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0369099378585815, + "rewards/margins": 1.9961076974868774, + "rewards/rejected": -3.03301739692688, + "step": 3400 + }, + { + "epoch": 0.82, + "learning_rate": 4.03592440720271e-07, + "logits/chosen": -2.6316440105438232, + "logits/rejected": -2.522408962249756, + "logps/chosen": -280.1983947753906, + "logps/rejected": -274.7087707519531, + "loss": 0.4378, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9649044275283813, + "rewards/margins": 1.9704071283340454, + "rewards/rejected": -3.935311794281006, + "step": 3410 + }, + { + "epoch": 0.82, + "learning_rate": 4.031467284720984e-07, + "logits/chosen": -2.6141161918640137, + "logits/rejected": -2.516469955444336, + "logps/chosen": -232.9725799560547, + "logps/rejected": -208.60269165039062, + "loss": 0.5014, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7316086292266846, + "rewards/margins": 1.9986438751220703, + "rewards/rejected": -3.730252504348755, + "step": 3420 + }, + { + "epoch": 0.83, + "learning_rate": 4.027010162239258e-07, + "logits/chosen": -2.6620287895202637, + "logits/rejected": -2.523186206817627, + "logps/chosen": -256.6522521972656, + "logps/rejected": -199.36477661132812, + "loss": 0.6619, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.9984378814697266, + "rewards/margins": 2.2110140323638916, + "rewards/rejected": -3.2094521522521973, + "step": 3430 + }, + { + "epoch": 0.83, + "learning_rate": 4.022553039757532e-07, + "logits/chosen": -2.6714510917663574, + "logits/rejected": -2.648057460784912, + "logps/chosen": -287.1764831542969, + "logps/rejected": -313.5572814941406, + "loss": 0.6233, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.1280102729797363, + "rewards/margins": 0.7432471513748169, + "rewards/rejected": -2.871257781982422, + "step": 3440 + }, + { + "epoch": 0.83, + "learning_rate": 4.018095917275807e-07, + "logits/chosen": -2.675344705581665, + "logits/rejected": -2.560187578201294, + "logps/chosen": -334.3106689453125, + "logps/rejected": -262.6590270996094, + "loss": 0.5801, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.4317984580993652, + "rewards/margins": 0.2597549557685852, + "rewards/rejected": -2.6915533542633057, + "step": 3450 + }, + { + "epoch": 0.83, + "learning_rate": 4.013638794794081e-07, + "logits/chosen": -2.728790760040283, + "logits/rejected": -2.531425952911377, + "logps/chosen": -299.5249938964844, + "logps/rejected": -282.14385986328125, + "loss": 0.4619, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.9811800122261047, + "rewards/margins": 1.9384164810180664, + "rewards/rejected": -2.919595956802368, + "step": 3460 + }, + { + "epoch": 0.84, + "learning_rate": 4.009181672312355e-07, + "logits/chosen": -2.5976831912994385, + "logits/rejected": -2.5936222076416016, + "logps/chosen": -251.3915557861328, + "logps/rejected": -240.14254760742188, + "loss": 0.5653, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.274533748626709, + "rewards/margins": 0.9345539808273315, + "rewards/rejected": -3.20908784866333, + "step": 3470 + }, + { + "epoch": 0.84, + "learning_rate": 4.0047245498306294e-07, + "logits/chosen": -2.681915283203125, + "logits/rejected": -2.4873757362365723, + "logps/chosen": -258.72454833984375, + "logps/rejected": -274.6829833984375, + "loss": 0.5518, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.1577260494232178, + "rewards/margins": 1.8516298532485962, + "rewards/rejected": -3.0093560218811035, + "step": 3480 + }, + { + "epoch": 0.84, + "learning_rate": 4.0002674273489034e-07, + "logits/chosen": -2.3440256118774414, + "logits/rejected": -2.4099361896514893, + "logps/chosen": -424.033935546875, + "logps/rejected": -313.701416015625, + "loss": 1.6005, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -11.069270133972168, + "rewards/margins": -4.384868621826172, + "rewards/rejected": -6.6844024658203125, + "step": 3490 + }, + { + "epoch": 0.84, + "learning_rate": 3.9958103048671774e-07, + "logits/chosen": -2.6543171405792236, + "logits/rejected": -2.5328967571258545, + "logps/chosen": -211.1274871826172, + "logps/rejected": -183.19216918945312, + "loss": 0.5616, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -1.5951848030090332, + "rewards/margins": 1.0990397930145264, + "rewards/rejected": -2.6942248344421387, + "step": 3500 + }, + { + "epoch": 0.84, + "learning_rate": 3.991353182385452e-07, + "logits/chosen": -2.6359877586364746, + "logits/rejected": -2.553628444671631, + "logps/chosen": -371.024658203125, + "logps/rejected": -292.0741271972656, + "loss": 0.608, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.197676181793213, + "rewards/margins": 1.814406156539917, + "rewards/rejected": -4.012082576751709, + "step": 3510 + }, + { + "epoch": 0.85, + "learning_rate": 3.986896059903726e-07, + "logits/chosen": -2.1818909645080566, + "logits/rejected": -2.1755576133728027, + "logps/chosen": -216.0900421142578, + "logps/rejected": -177.17047119140625, + "loss": 0.5922, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.4119014739990234, + "rewards/margins": 1.8648496866226196, + "rewards/rejected": -3.2767510414123535, + "step": 3520 + }, + { + "epoch": 0.85, + "learning_rate": 3.982438937422e-07, + "logits/chosen": -2.3716135025024414, + "logits/rejected": -2.330021381378174, + "logps/chosen": -228.31802368164062, + "logps/rejected": -238.8856658935547, + "loss": 0.5279, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9070593118667603, + "rewards/margins": 2.2716453075408936, + "rewards/rejected": -4.178704738616943, + "step": 3530 + }, + { + "epoch": 0.85, + "learning_rate": 3.9779818149402746e-07, + "logits/chosen": -2.4213452339172363, + "logits/rejected": -2.287424325942993, + "logps/chosen": -173.0181427001953, + "logps/rejected": -138.43270874023438, + "loss": 0.617, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.259591579437256, + "rewards/margins": 0.8109132647514343, + "rewards/rejected": -3.070504903793335, + "step": 3540 + }, + { + "epoch": 0.85, + "learning_rate": 3.9735246924585486e-07, + "logits/chosen": -2.5399069786071777, + "logits/rejected": -2.5299155712127686, + "logps/chosen": -205.474853515625, + "logps/rejected": -172.39971923828125, + "loss": 0.5583, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7592484951019287, + "rewards/margins": 1.6299797296524048, + "rewards/rejected": -3.389228105545044, + "step": 3550 + }, + { + "epoch": 0.86, + "learning_rate": 3.9690675699768227e-07, + "logits/chosen": -2.606238842010498, + "logits/rejected": -2.4996159076690674, + "logps/chosen": -229.6689453125, + "logps/rejected": -234.391845703125, + "loss": 0.5577, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3872594833374023, + "rewards/margins": 1.8904683589935303, + "rewards/rejected": -3.2777278423309326, + "step": 3560 + }, + { + "epoch": 0.86, + "learning_rate": 3.964610447495097e-07, + "logits/chosen": -2.5027756690979004, + "logits/rejected": -2.4127001762390137, + "logps/chosen": -268.26849365234375, + "logps/rejected": -248.8619842529297, + "loss": 0.541, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.935086965560913, + "rewards/margins": 1.4139950275421143, + "rewards/rejected": -4.349081993103027, + "step": 3570 + }, + { + "epoch": 0.86, + "learning_rate": 3.960153325013371e-07, + "logits/chosen": -2.600238800048828, + "logits/rejected": -2.705057382583618, + "logps/chosen": -332.97906494140625, + "logps/rejected": -393.6985168457031, + "loss": 0.6294, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9589897394180298, + "rewards/margins": 0.9357390403747559, + "rewards/rejected": -2.894728899002075, + "step": 3580 + }, + { + "epoch": 0.86, + "learning_rate": 3.9556962025316453e-07, + "logits/chosen": -2.4526476860046387, + "logits/rejected": -2.3240785598754883, + "logps/chosen": -334.75152587890625, + "logps/rejected": -304.6665344238281, + "loss": 0.6832, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.4315619468688965, + "rewards/margins": 1.0265862941741943, + "rewards/rejected": -3.458148241043091, + "step": 3590 + }, + { + "epoch": 0.87, + "learning_rate": 3.9512390800499193e-07, + "logits/chosen": -2.607935667037964, + "logits/rejected": -2.495234727859497, + "logps/chosen": -257.22442626953125, + "logps/rejected": -254.654296875, + "loss": 0.674, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.4075379371643066, + "rewards/margins": 1.691053032875061, + "rewards/rejected": -4.098590850830078, + "step": 3600 + }, + { + "epoch": 0.87, + "eval_logits/chosen": -2.288510799407959, + "eval_logits/rejected": -2.256424903869629, + "eval_logps/chosen": -231.8350372314453, + "eval_logps/rejected": -236.4845733642578, + "eval_loss": 0.5134379267692566, + "eval_rewards/accuracies": 0.675000011920929, + "eval_rewards/chosen": -2.9907476902008057, + "eval_rewards/margins": 1.7046312093734741, + "eval_rewards/rejected": -4.695379257202148, + "eval_runtime": 133.1197, + "eval_samples_per_second": 23.708, + "eval_steps_per_second": 0.376, + "step": 3600 + }, + { + "epoch": 0.87, + "learning_rate": 3.946781957568194e-07, + "logits/chosen": -2.609255313873291, + "logits/rejected": -2.559530735015869, + "logps/chosen": -273.26092529296875, + "logps/rejected": -311.75030517578125, + "loss": 0.556, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.2174479961395264, + "rewards/margins": 0.8377370834350586, + "rewards/rejected": -3.055185079574585, + "step": 3610 + }, + { + "epoch": 0.87, + "learning_rate": 3.942324835086468e-07, + "logits/chosen": -2.604952812194824, + "logits/rejected": -2.5987837314605713, + "logps/chosen": -303.27203369140625, + "logps/rejected": -262.020751953125, + "loss": 0.552, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.2536367177963257, + "rewards/margins": 2.3096842765808105, + "rewards/rejected": -3.5633208751678467, + "step": 3620 + }, + { + "epoch": 0.87, + "learning_rate": 3.937867712604742e-07, + "logits/chosen": -2.687490940093994, + "logits/rejected": -2.5505900382995605, + "logps/chosen": -407.6099548339844, + "logps/rejected": -314.99267578125, + "loss": 0.5187, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3432193994522095, + "rewards/margins": 1.757752776145935, + "rewards/rejected": -3.1009724140167236, + "step": 3630 + }, + { + "epoch": 0.88, + "learning_rate": 3.9334105901230165e-07, + "logits/chosen": -2.369417667388916, + "logits/rejected": -2.2396793365478516, + "logps/chosen": -338.2528381347656, + "logps/rejected": -341.28594970703125, + "loss": 0.4532, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8581418991088867, + "rewards/margins": 3.1549291610717773, + "rewards/rejected": -5.013071537017822, + "step": 3640 + }, + { + "epoch": 0.88, + "learning_rate": 3.9289534676412905e-07, + "logits/chosen": -2.4682111740112305, + "logits/rejected": -2.5353970527648926, + "logps/chosen": -228.75643920898438, + "logps/rejected": -242.5396270751953, + "loss": 0.6887, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.8795896768569946, + "rewards/margins": 0.8770751953125, + "rewards/rejected": -2.756664752960205, + "step": 3650 + }, + { + "epoch": 0.88, + "learning_rate": 3.9244963451595645e-07, + "logits/chosen": -2.5755834579467773, + "logits/rejected": -2.496194362640381, + "logps/chosen": -204.44625854492188, + "logps/rejected": -202.35910034179688, + "loss": 0.5614, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.572894811630249, + "rewards/margins": 1.5533229112625122, + "rewards/rejected": -3.1262173652648926, + "step": 3660 + }, + { + "epoch": 0.88, + "learning_rate": 3.920039222677839e-07, + "logits/chosen": -2.531261682510376, + "logits/rejected": -2.4382405281066895, + "logps/chosen": -219.8284912109375, + "logps/rejected": -268.8829650878906, + "loss": 0.5608, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3239612579345703, + "rewards/margins": 2.532979726791382, + "rewards/rejected": -3.8569416999816895, + "step": 3670 + }, + { + "epoch": 0.89, + "learning_rate": 3.915582100196113e-07, + "logits/chosen": -2.659780740737915, + "logits/rejected": -2.6414692401885986, + "logps/chosen": -187.46194458007812, + "logps/rejected": -271.33929443359375, + "loss": 0.6254, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.4945189952850342, + "rewards/margins": 1.7238363027572632, + "rewards/rejected": -3.218355655670166, + "step": 3680 + }, + { + "epoch": 0.89, + "learning_rate": 3.911124977714387e-07, + "logits/chosen": -2.525784730911255, + "logits/rejected": -2.605313301086426, + "logps/chosen": -138.00347900390625, + "logps/rejected": -212.57797241210938, + "loss": 0.5476, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4867968559265137, + "rewards/margins": 1.0520381927490234, + "rewards/rejected": -3.538835048675537, + "step": 3690 + }, + { + "epoch": 0.89, + "learning_rate": 3.9066678552326617e-07, + "logits/chosen": -2.747058391571045, + "logits/rejected": -2.5562584400177, + "logps/chosen": -366.90496826171875, + "logps/rejected": -337.36309814453125, + "loss": 0.5512, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.590149164199829, + "rewards/margins": 1.419371247291565, + "rewards/rejected": -3.0095202922821045, + "step": 3700 + }, + { + "epoch": 0.89, + "learning_rate": 3.902210732750936e-07, + "logits/chosen": -2.576045513153076, + "logits/rejected": -2.4777674674987793, + "logps/chosen": -254.02273559570312, + "logps/rejected": -332.081298828125, + "loss": 0.6372, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.1122708320617676, + "rewards/margins": 0.6883398294448853, + "rewards/rejected": -2.8006105422973633, + "step": 3710 + }, + { + "epoch": 0.9, + "learning_rate": 3.89775361026921e-07, + "logits/chosen": -2.511871576309204, + "logits/rejected": -2.6217472553253174, + "logps/chosen": -264.2774963378906, + "logps/rejected": -251.447509765625, + "loss": 0.4727, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.3318699598312378, + "rewards/margins": 2.0866668224334717, + "rewards/rejected": -3.41853666305542, + "step": 3720 + }, + { + "epoch": 0.9, + "learning_rate": 3.8932964877874843e-07, + "logits/chosen": -2.5318033695220947, + "logits/rejected": -2.500244617462158, + "logps/chosen": -229.66226196289062, + "logps/rejected": -264.37689208984375, + "loss": 0.5982, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.124739646911621, + "rewards/margins": 0.6877515316009521, + "rewards/rejected": -1.8124911785125732, + "step": 3730 + }, + { + "epoch": 0.9, + "learning_rate": 3.8888393653057584e-07, + "logits/chosen": -2.362482786178589, + "logits/rejected": -2.423068046569824, + "logps/chosen": -234.7594451904297, + "logps/rejected": -170.59483337402344, + "loss": 0.5885, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6956630945205688, + "rewards/margins": 1.458892583847046, + "rewards/rejected": -3.154555559158325, + "step": 3740 + }, + { + "epoch": 0.9, + "learning_rate": 3.8843822428240324e-07, + "logits/chosen": -2.4557576179504395, + "logits/rejected": -2.549530506134033, + "logps/chosen": -189.58731079101562, + "logps/rejected": -231.9694061279297, + "loss": 0.4583, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.744417428970337, + "rewards/margins": 0.9202855825424194, + "rewards/rejected": -2.664702892303467, + "step": 3750 + }, + { + "epoch": 0.9, + "learning_rate": 3.8799251203423064e-07, + "logits/chosen": -2.7234930992126465, + "logits/rejected": -2.657794952392578, + "logps/chosen": -336.3365173339844, + "logps/rejected": -351.0404052734375, + "loss": 0.4876, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1833642721176147, + "rewards/margins": 1.5095411539077759, + "rewards/rejected": -2.6929054260253906, + "step": 3760 + }, + { + "epoch": 0.91, + "learning_rate": 3.875467997860581e-07, + "logits/chosen": -2.586146354675293, + "logits/rejected": -2.4543778896331787, + "logps/chosen": -218.3040008544922, + "logps/rejected": -178.90457153320312, + "loss": 0.5329, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.7448152303695679, + "rewards/margins": 0.5987850427627563, + "rewards/rejected": -2.3436005115509033, + "step": 3770 + }, + { + "epoch": 0.91, + "learning_rate": 3.871010875378855e-07, + "logits/chosen": -2.396613597869873, + "logits/rejected": -2.467390298843384, + "logps/chosen": -278.26885986328125, + "logps/rejected": -285.6539611816406, + "loss": 0.5684, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.6755433678627014, + "rewards/margins": 2.223599672317505, + "rewards/rejected": -2.8991427421569824, + "step": 3780 + }, + { + "epoch": 0.91, + "learning_rate": 3.866553752897129e-07, + "logits/chosen": -2.567450523376465, + "logits/rejected": -2.653172492980957, + "logps/chosen": -252.27001953125, + "logps/rejected": -324.4461669921875, + "loss": 0.5105, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.1028369665145874, + "rewards/margins": 2.0712685585021973, + "rewards/rejected": -3.174105644226074, + "step": 3790 + }, + { + "epoch": 0.91, + "learning_rate": 3.8620966304154036e-07, + "logits/chosen": -2.5739176273345947, + "logits/rejected": -2.590895652770996, + "logps/chosen": -246.83261108398438, + "logps/rejected": -229.0023651123047, + "loss": 0.6285, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3254778385162354, + "rewards/margins": 1.1004021167755127, + "rewards/rejected": -2.425879955291748, + "step": 3800 + }, + { + "epoch": 0.92, + "learning_rate": 3.8576395079336776e-07, + "logits/chosen": -2.664707660675049, + "logits/rejected": -2.6448731422424316, + "logps/chosen": -291.40386962890625, + "logps/rejected": -375.8872985839844, + "loss": 0.5661, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9805440902709961, + "rewards/margins": 0.7267992496490479, + "rewards/rejected": -1.7073434591293335, + "step": 3810 + }, + { + "epoch": 0.92, + "learning_rate": 3.8531823854519516e-07, + "logits/chosen": -2.643444776535034, + "logits/rejected": -2.5820393562316895, + "logps/chosen": -255.2077178955078, + "logps/rejected": -280.51702880859375, + "loss": 0.514, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.719731092453003, + "rewards/margins": 1.4006415605545044, + "rewards/rejected": -3.120372772216797, + "step": 3820 + }, + { + "epoch": 0.92, + "learning_rate": 3.848725262970226e-07, + "logits/chosen": -2.6456382274627686, + "logits/rejected": -2.661034345626831, + "logps/chosen": -278.55535888671875, + "logps/rejected": -323.1336975097656, + "loss": 0.6591, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.5481061935424805, + "rewards/margins": 1.3556015491485596, + "rewards/rejected": -3.903707504272461, + "step": 3830 + }, + { + "epoch": 0.92, + "learning_rate": 3.8442681404885e-07, + "logits/chosen": -2.5775082111358643, + "logits/rejected": -2.610999584197998, + "logps/chosen": -232.3846893310547, + "logps/rejected": -248.490966796875, + "loss": 0.4868, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.610137701034546, + "rewards/margins": 1.7746471166610718, + "rewards/rejected": -3.384784698486328, + "step": 3840 + }, + { + "epoch": 0.93, + "learning_rate": 3.839811018006774e-07, + "logits/chosen": -2.613619565963745, + "logits/rejected": -2.5495545864105225, + "logps/chosen": -305.49139404296875, + "logps/rejected": -323.3760681152344, + "loss": 0.6557, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.350762128829956, + "rewards/margins": 1.6953766345977783, + "rewards/rejected": -4.046138763427734, + "step": 3850 + }, + { + "epoch": 0.93, + "learning_rate": 3.835353895525049e-07, + "logits/chosen": -2.573826313018799, + "logits/rejected": -2.4991908073425293, + "logps/chosen": -241.28286743164062, + "logps/rejected": -255.70767211914062, + "loss": 0.558, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.769360899925232, + "rewards/margins": 1.9593086242675781, + "rewards/rejected": -3.7286696434020996, + "step": 3860 + }, + { + "epoch": 0.93, + "learning_rate": 3.830896773043323e-07, + "logits/chosen": -2.3413383960723877, + "logits/rejected": -2.299808979034424, + "logps/chosen": -262.90850830078125, + "logps/rejected": -291.4503173828125, + "loss": 0.5558, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.659152626991272, + "rewards/margins": 1.9936367273330688, + "rewards/rejected": -3.652789354324341, + "step": 3870 + }, + { + "epoch": 0.93, + "learning_rate": 3.826439650561597e-07, + "logits/chosen": -2.7529985904693604, + "logits/rejected": -2.639225482940674, + "logps/chosen": -326.57635498046875, + "logps/rejected": -326.14080810546875, + "loss": 0.6663, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.5671730041503906, + "rewards/margins": 0.9525227546691895, + "rewards/rejected": -3.519695997238159, + "step": 3880 + }, + { + "epoch": 0.94, + "learning_rate": 3.8219825280798714e-07, + "logits/chosen": -2.4761345386505127, + "logits/rejected": -2.435760974884033, + "logps/chosen": -262.26092529296875, + "logps/rejected": -250.92971801757812, + "loss": 0.6382, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.9833894968032837, + "rewards/margins": 0.8338977098464966, + "rewards/rejected": -2.8172874450683594, + "step": 3890 + }, + { + "epoch": 0.94, + "learning_rate": 3.8175254055981455e-07, + "logits/chosen": -2.522275447845459, + "logits/rejected": -2.507660388946533, + "logps/chosen": -299.82891845703125, + "logps/rejected": -256.46240234375, + "loss": 0.4979, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8757234811782837, + "rewards/margins": 1.705521821975708, + "rewards/rejected": -3.5812454223632812, + "step": 3900 + }, + { + "epoch": 0.94, + "learning_rate": 3.8130682831164195e-07, + "logits/chosen": -2.5906052589416504, + "logits/rejected": -2.643200635910034, + "logps/chosen": -301.50811767578125, + "logps/rejected": -296.525634765625, + "loss": 0.4759, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.5264441967010498, + "rewards/margins": 1.784152626991272, + "rewards/rejected": -3.310596466064453, + "step": 3910 + }, + { + "epoch": 0.94, + "learning_rate": 3.8086111606346946e-07, + "logits/chosen": -2.602121353149414, + "logits/rejected": -2.4926421642303467, + "logps/chosen": -289.6598205566406, + "logps/rejected": -309.3394470214844, + "loss": 0.483, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2499332427978516, + "rewards/margins": 2.3378403186798096, + "rewards/rejected": -3.5877737998962402, + "step": 3920 + }, + { + "epoch": 0.95, + "learning_rate": 3.8041540381529686e-07, + "logits/chosen": -2.642411708831787, + "logits/rejected": -2.5664408206939697, + "logps/chosen": -297.56597900390625, + "logps/rejected": -281.9163513183594, + "loss": 0.5553, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1734910011291504, + "rewards/margins": 1.599700927734375, + "rewards/rejected": -3.7731919288635254, + "step": 3930 + }, + { + "epoch": 0.95, + "learning_rate": 3.7996969156712426e-07, + "logits/chosen": -2.7890563011169434, + "logits/rejected": -2.659301280975342, + "logps/chosen": -420.64630126953125, + "logps/rejected": -322.561279296875, + "loss": 0.5267, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7246720790863037, + "rewards/margins": 2.2789828777313232, + "rewards/rejected": -4.003654956817627, + "step": 3940 + }, + { + "epoch": 0.95, + "learning_rate": 3.7952397931895167e-07, + "logits/chosen": -2.694575309753418, + "logits/rejected": -2.6454367637634277, + "logps/chosen": -307.62640380859375, + "logps/rejected": -325.6059265136719, + "loss": 0.5594, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.480874538421631, + "rewards/margins": 2.4836106300354004, + "rewards/rejected": -4.964485168457031, + "step": 3950 + }, + { + "epoch": 0.95, + "learning_rate": 3.790782670707791e-07, + "logits/chosen": -2.720595359802246, + "logits/rejected": -2.8017539978027344, + "logps/chosen": -223.55422973632812, + "logps/rejected": -273.29730224609375, + "loss": 0.6146, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.248535633087158, + "rewards/margins": 0.7739115953445435, + "rewards/rejected": -3.022447109222412, + "step": 3960 + }, + { + "epoch": 0.96, + "learning_rate": 3.786325548226065e-07, + "logits/chosen": -2.695719003677368, + "logits/rejected": -2.737778902053833, + "logps/chosen": -242.3776092529297, + "logps/rejected": -295.9952697753906, + "loss": 0.6235, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.326387405395508, + "rewards/margins": 0.7281680107116699, + "rewards/rejected": -3.0545554161071777, + "step": 3970 + }, + { + "epoch": 0.96, + "learning_rate": 3.7818684257443393e-07, + "logits/chosen": -2.5355782508850098, + "logits/rejected": -2.4986259937286377, + "logps/chosen": -270.2203674316406, + "logps/rejected": -282.6047058105469, + "loss": 0.7433, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4541804790496826, + "rewards/margins": 2.1234707832336426, + "rewards/rejected": -3.577651262283325, + "step": 3980 + }, + { + "epoch": 0.96, + "learning_rate": 3.777411303262614e-07, + "logits/chosen": -2.602736711502075, + "logits/rejected": -2.6677088737487793, + "logps/chosen": -394.6764221191406, + "logps/rejected": -372.6412353515625, + "loss": 0.5133, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5492278337478638, + "rewards/margins": 2.038675308227539, + "rewards/rejected": -3.587902784347534, + "step": 3990 + }, + { + "epoch": 0.96, + "learning_rate": 3.772954180780888e-07, + "logits/chosen": -2.601288318634033, + "logits/rejected": -2.5311670303344727, + "logps/chosen": -211.21768188476562, + "logps/rejected": -192.4173583984375, + "loss": 0.5585, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0379996299743652, + "rewards/margins": 1.9101365804672241, + "rewards/rejected": -2.948136329650879, + "step": 4000 + }, + { + "epoch": 0.96, + "eval_logits/chosen": -2.4272851943969727, + "eval_logits/rejected": -2.3968420028686523, + "eval_logps/chosen": -227.1593780517578, + "eval_logps/rejected": -231.3815460205078, + "eval_loss": 0.5064984560012817, + "eval_rewards/accuracies": 0.6725000143051147, + "eval_rewards/chosen": -2.523179292678833, + "eval_rewards/margins": 1.6618961095809937, + "eval_rewards/rejected": -4.185075759887695, + "eval_runtime": 133.45, + "eval_samples_per_second": 23.649, + "eval_steps_per_second": 0.375, + "step": 4000 + }, + { + "epoch": 0.97, + "learning_rate": 3.768497058299162e-07, + "logits/chosen": -2.664705753326416, + "logits/rejected": -2.668072462081909, + "logps/chosen": -245.41207885742188, + "logps/rejected": -283.7060241699219, + "loss": 0.4825, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7716989517211914, + "rewards/margins": 1.5381790399551392, + "rewards/rejected": -3.3098785877227783, + "step": 4010 + }, + { + "epoch": 0.97, + "learning_rate": 3.7640399358174365e-07, + "logits/chosen": -2.6795592308044434, + "logits/rejected": -2.571094036102295, + "logps/chosen": -333.5656433105469, + "logps/rejected": -239.02841186523438, + "loss": 0.4686, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5428540706634521, + "rewards/margins": 1.264843225479126, + "rewards/rejected": -2.807697296142578, + "step": 4020 + }, + { + "epoch": 0.97, + "learning_rate": 3.7595828133357105e-07, + "logits/chosen": -2.746485948562622, + "logits/rejected": -2.6074657440185547, + "logps/chosen": -303.5380554199219, + "logps/rejected": -241.12039184570312, + "loss": 0.5837, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3950583934783936, + "rewards/margins": 1.0706193447113037, + "rewards/rejected": -2.465677499771118, + "step": 4030 + }, + { + "epoch": 0.97, + "learning_rate": 3.7551256908539845e-07, + "logits/chosen": -2.7529234886169434, + "logits/rejected": -2.6367545127868652, + "logps/chosen": -209.83529663085938, + "logps/rejected": -193.84812927246094, + "loss": 0.5237, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7427949905395508, + "rewards/margins": 2.1338396072387695, + "rewards/rejected": -3.8766345977783203, + "step": 4040 + }, + { + "epoch": 0.97, + "learning_rate": 3.750668568372259e-07, + "logits/chosen": -2.520416498184204, + "logits/rejected": -2.4481160640716553, + "logps/chosen": -293.22698974609375, + "logps/rejected": -373.7021789550781, + "loss": 0.7061, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.4291718006134033, + "rewards/margins": 6.727791786193848, + "rewards/rejected": -9.156964302062988, + "step": 4050 + }, + { + "epoch": 0.98, + "learning_rate": 3.746211445890533e-07, + "logits/chosen": -2.741536855697632, + "logits/rejected": -2.6621148586273193, + "logps/chosen": -249.78280639648438, + "logps/rejected": -240.9440155029297, + "loss": 0.5511, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.0934176445007324, + "rewards/margins": 1.720473289489746, + "rewards/rejected": -3.8138911724090576, + "step": 4060 + }, + { + "epoch": 0.98, + "learning_rate": 3.741754323408807e-07, + "logits/chosen": -2.7620818614959717, + "logits/rejected": -2.5791823863983154, + "logps/chosen": -349.82342529296875, + "logps/rejected": -297.4184875488281, + "loss": 0.6709, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5897421836853027, + "rewards/margins": 1.1426117420196533, + "rewards/rejected": -3.732353925704956, + "step": 4070 + }, + { + "epoch": 0.98, + "learning_rate": 3.7372972009270817e-07, + "logits/chosen": -2.6106772422790527, + "logits/rejected": -2.502243995666504, + "logps/chosen": -225.0702362060547, + "logps/rejected": -279.71307373046875, + "loss": 0.6106, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.020258903503418, + "rewards/margins": 1.8595082759857178, + "rewards/rejected": -3.8797671794891357, + "step": 4080 + }, + { + "epoch": 0.98, + "learning_rate": 3.7328400784453557e-07, + "logits/chosen": -2.7874178886413574, + "logits/rejected": -2.8164100646972656, + "logps/chosen": -296.76483154296875, + "logps/rejected": -267.3506164550781, + "loss": 0.5592, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.1279549598693848, + "rewards/margins": 1.4412357807159424, + "rewards/rejected": -3.569190263748169, + "step": 4090 + }, + { + "epoch": 0.99, + "learning_rate": 3.72838295596363e-07, + "logits/chosen": -2.780735731124878, + "logits/rejected": -2.60390567779541, + "logps/chosen": -322.66192626953125, + "logps/rejected": -267.9672546386719, + "loss": 0.5768, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.0631757974624634, + "rewards/margins": 1.4549946784973145, + "rewards/rejected": -2.5181705951690674, + "step": 4100 + }, + { + "epoch": 0.99, + "learning_rate": 3.723925833481904e-07, + "logits/chosen": -2.8341269493103027, + "logits/rejected": -2.5473570823669434, + "logps/chosen": -411.355712890625, + "logps/rejected": -237.3580780029297, + "loss": 0.5235, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3425524234771729, + "rewards/margins": 1.116189956665039, + "rewards/rejected": -2.458742380142212, + "step": 4110 + }, + { + "epoch": 0.99, + "learning_rate": 3.7194687110001783e-07, + "logits/chosen": -2.276176929473877, + "logits/rejected": -2.3545913696289062, + "logps/chosen": -225.91464233398438, + "logps/rejected": -227.13137817382812, + "loss": 0.5475, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6823476552963257, + "rewards/margins": 1.8060953617095947, + "rewards/rejected": -2.488442897796631, + "step": 4120 + }, + { + "epoch": 0.99, + "learning_rate": 3.7150115885184524e-07, + "logits/chosen": -2.8451974391937256, + "logits/rejected": -2.5692057609558105, + "logps/chosen": -217.84933471679688, + "logps/rejected": -214.6060333251953, + "loss": 0.4075, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3134591579437256, + "rewards/margins": 1.6907761096954346, + "rewards/rejected": -3.004235029220581, + "step": 4130 + }, + { + "epoch": 1.0, + "learning_rate": 3.7105544660367264e-07, + "logits/chosen": -2.6245837211608887, + "logits/rejected": -2.554424285888672, + "logps/chosen": -265.3359375, + "logps/rejected": -204.1483154296875, + "loss": 0.5261, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.2920002937316895, + "rewards/margins": 1.0050559043884277, + "rewards/rejected": -3.297056198120117, + "step": 4140 + }, + { + "epoch": 1.0, + "learning_rate": 3.706097343555001e-07, + "logits/chosen": -2.768834352493286, + "logits/rejected": -2.726285696029663, + "logps/chosen": -286.7914733886719, + "logps/rejected": -294.3249816894531, + "loss": 0.5969, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.3674845695495605, + "rewards/margins": 0.8646809458732605, + "rewards/rejected": -3.232165575027466, + "step": 4150 + }, + { + "epoch": 1.0, + "learning_rate": 3.701640221073275e-07, + "logits/chosen": -2.623534917831421, + "logits/rejected": -2.506242275238037, + "logps/chosen": -358.9901123046875, + "logps/rejected": -276.28985595703125, + "loss": 0.3072, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5487232804298401, + "rewards/margins": 4.627379417419434, + "rewards/rejected": -5.176103115081787, + "step": 4160 + }, + { + "epoch": 1.0, + "learning_rate": 3.697183098591549e-07, + "logits/chosen": -2.725881576538086, + "logits/rejected": -2.667299270629883, + "logps/chosen": -288.69171142578125, + "logps/rejected": -358.1936950683594, + "loss": 0.102, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29223746061325073, + "rewards/margins": 6.076581001281738, + "rewards/rejected": -5.784343242645264, + "step": 4170 + }, + { + "epoch": 1.01, + "learning_rate": 3.6927259761098236e-07, + "logits/chosen": -2.484480857849121, + "logits/rejected": -2.5674691200256348, + "logps/chosen": -254.5686798095703, + "logps/rejected": -359.6863708496094, + "loss": 0.0855, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.383306086063385, + "rewards/margins": 6.87512731552124, + "rewards/rejected": -7.2584333419799805, + "step": 4180 + }, + { + "epoch": 1.01, + "learning_rate": 3.6882688536280976e-07, + "logits/chosen": -2.5600600242614746, + "logits/rejected": -2.6257424354553223, + "logps/chosen": -221.41751098632812, + "logps/rejected": -294.37677001953125, + "loss": 0.1015, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.1488605737686157, + "rewards/margins": 5.566132068634033, + "rewards/rejected": -6.714992523193359, + "step": 4190 + }, + { + "epoch": 1.01, + "learning_rate": 3.6838117311463716e-07, + "logits/chosen": -2.4749844074249268, + "logits/rejected": -2.4374618530273438, + "logps/chosen": -254.75198364257812, + "logps/rejected": -318.059814453125, + "loss": 0.1062, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.2943580746650696, + "rewards/margins": 5.920179843902588, + "rewards/rejected": -6.214537620544434, + "step": 4200 + }, + { + "epoch": 1.01, + "learning_rate": 3.679354608664646e-07, + "logits/chosen": -2.69596791267395, + "logits/rejected": -2.647221803665161, + "logps/chosen": -252.98281860351562, + "logps/rejected": -320.4571228027344, + "loss": 0.0864, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.039807986468076706, + "rewards/margins": 6.2454118728637695, + "rewards/rejected": -6.20560359954834, + "step": 4210 + }, + { + "epoch": 1.02, + "learning_rate": 3.67489748618292e-07, + "logits/chosen": -2.592900037765503, + "logits/rejected": -2.448998212814331, + "logps/chosen": -200.04762268066406, + "logps/rejected": -247.912109375, + "loss": 0.1034, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.484206199645996, + "rewards/margins": 3.945133686065674, + "rewards/rejected": -5.429339408874512, + "step": 4220 + }, + { + "epoch": 1.02, + "learning_rate": 3.670440363701194e-07, + "logits/chosen": -2.6554009914398193, + "logits/rejected": -2.5679659843444824, + "logps/chosen": -279.2522888183594, + "logps/rejected": -315.81829833984375, + "loss": 0.1252, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.9038124084472656, + "rewards/margins": 6.856900691986084, + "rewards/rejected": -5.953088760375977, + "step": 4230 + }, + { + "epoch": 1.02, + "learning_rate": 3.665983241219469e-07, + "logits/chosen": -2.5731217861175537, + "logits/rejected": -2.467867136001587, + "logps/chosen": -237.4766082763672, + "logps/rejected": -258.3265075683594, + "loss": 0.0956, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.0812922939658165, + "rewards/margins": 6.334844589233398, + "rewards/rejected": -6.253551483154297, + "step": 4240 + }, + { + "epoch": 1.02, + "learning_rate": 3.661526118737743e-07, + "logits/chosen": -2.4351553916931152, + "logits/rejected": -2.418645143508911, + "logps/chosen": -370.56768798828125, + "logps/rejected": -375.59210205078125, + "loss": 0.1015, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.2138526737689972, + "rewards/margins": 6.076406002044678, + "rewards/rejected": -6.290258407592773, + "step": 4250 + }, + { + "epoch": 1.03, + "learning_rate": 3.657068996256017e-07, + "logits/chosen": -2.349137306213379, + "logits/rejected": -2.307180881500244, + "logps/chosen": -250.9742889404297, + "logps/rejected": -345.11639404296875, + "loss": 0.1001, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.6029349565505981, + "rewards/margins": 7.945761680603027, + "rewards/rejected": -7.342826843261719, + "step": 4260 + }, + { + "epoch": 1.03, + "learning_rate": 3.6526118737742914e-07, + "logits/chosen": -2.5820562839508057, + "logits/rejected": -2.560638189315796, + "logps/chosen": -230.8574676513672, + "logps/rejected": -302.25787353515625, + "loss": 0.0916, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.4544862508773804, + "rewards/margins": 6.655741214752197, + "rewards/rejected": -5.201254844665527, + "step": 4270 + }, + { + "epoch": 1.03, + "learning_rate": 3.6481547512925654e-07, + "logits/chosen": -2.2908504009246826, + "logits/rejected": -2.3787224292755127, + "logps/chosen": -178.44161987304688, + "logps/rejected": -253.1040496826172, + "loss": 0.097, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7253895998001099, + "rewards/margins": 6.9459428787231445, + "rewards/rejected": -6.220553398132324, + "step": 4280 + }, + { + "epoch": 1.03, + "learning_rate": 3.6436976288108395e-07, + "logits/chosen": -2.4098219871520996, + "logits/rejected": -2.453495979309082, + "logps/chosen": -290.34503173828125, + "logps/rejected": -380.69964599609375, + "loss": 0.1909, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.05411381646990776, + "rewards/margins": 6.767613887786865, + "rewards/rejected": -6.713500022888184, + "step": 4290 + }, + { + "epoch": 1.03, + "learning_rate": 3.6392405063291135e-07, + "logits/chosen": -2.693671703338623, + "logits/rejected": -2.5928988456726074, + "logps/chosen": -302.6961975097656, + "logps/rejected": -344.5617370605469, + "loss": 0.1111, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.14878758788108826, + "rewards/margins": 6.189462184906006, + "rewards/rejected": -6.040674209594727, + "step": 4300 + }, + { + "epoch": 1.04, + "learning_rate": 3.634783383847388e-07, + "logits/chosen": -2.6140859127044678, + "logits/rejected": -2.3915727138519287, + "logps/chosen": -358.9208984375, + "logps/rejected": -286.369384765625, + "loss": 0.0702, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.029497016221284866, + "rewards/margins": 6.153254508972168, + "rewards/rejected": -6.123757839202881, + "step": 4310 + }, + { + "epoch": 1.04, + "learning_rate": 3.630326261365662e-07, + "logits/chosen": -2.7043561935424805, + "logits/rejected": -2.4338130950927734, + "logps/chosen": -222.9595947265625, + "logps/rejected": -223.4053192138672, + "loss": 0.0501, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6035966277122498, + "rewards/margins": 5.239047050476074, + "rewards/rejected": -5.842643737792969, + "step": 4320 + }, + { + "epoch": 1.04, + "learning_rate": 3.625869138883936e-07, + "logits/chosen": -2.6249303817749023, + "logits/rejected": -2.669938087463379, + "logps/chosen": -222.0033721923828, + "logps/rejected": -270.62115478515625, + "loss": 0.1371, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.3372684121131897, + "rewards/margins": 7.880601406097412, + "rewards/rejected": -7.543332576751709, + "step": 4330 + }, + { + "epoch": 1.04, + "learning_rate": 3.6214120164022107e-07, + "logits/chosen": -2.5248541831970215, + "logits/rejected": -2.5408248901367188, + "logps/chosen": -186.41961669921875, + "logps/rejected": -285.8816223144531, + "loss": 0.0782, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9711707234382629, + "rewards/margins": 5.940455436706543, + "rewards/rejected": -6.911625862121582, + "step": 4340 + }, + { + "epoch": 1.05, + "learning_rate": 3.6169548939204847e-07, + "logits/chosen": -2.280282735824585, + "logits/rejected": -2.3826003074645996, + "logps/chosen": -203.95257568359375, + "logps/rejected": -264.19281005859375, + "loss": 0.0607, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7481842041015625, + "rewards/margins": 4.814732551574707, + "rewards/rejected": -6.562915802001953, + "step": 4350 + }, + { + "epoch": 1.05, + "learning_rate": 3.6124977714387587e-07, + "logits/chosen": -2.3525540828704834, + "logits/rejected": -2.377518892288208, + "logps/chosen": -234.76779174804688, + "logps/rejected": -278.06097412109375, + "loss": 0.2528, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.8500163555145264, + "rewards/margins": 4.516103267669678, + "rewards/rejected": -7.366120338439941, + "step": 4360 + }, + { + "epoch": 1.05, + "learning_rate": 3.6080406489570333e-07, + "logits/chosen": -2.4346866607666016, + "logits/rejected": -2.4900708198547363, + "logps/chosen": -243.305908203125, + "logps/rejected": -304.49737548828125, + "loss": 0.0877, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2673728466033936, + "rewards/margins": 6.260402202606201, + "rewards/rejected": -7.527773857116699, + "step": 4370 + }, + { + "epoch": 1.05, + "learning_rate": 3.6035835264753073e-07, + "logits/chosen": -2.5628085136413574, + "logits/rejected": -2.5256426334381104, + "logps/chosen": -207.5811004638672, + "logps/rejected": -283.73809814453125, + "loss": 0.132, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9077810049057007, + "rewards/margins": 8.054966926574707, + "rewards/rejected": -8.962748527526855, + "step": 4380 + }, + { + "epoch": 1.06, + "learning_rate": 3.5991264039935813e-07, + "logits/chosen": -2.597865581512451, + "logits/rejected": -2.5068411827087402, + "logps/chosen": -194.13424682617188, + "logps/rejected": -194.2491912841797, + "loss": 0.0857, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4135921597480774, + "rewards/margins": 5.746599197387695, + "rewards/rejected": -6.160191535949707, + "step": 4390 + }, + { + "epoch": 1.06, + "learning_rate": 3.594669281511856e-07, + "logits/chosen": -2.5832905769348145, + "logits/rejected": -2.4489328861236572, + "logps/chosen": -266.7347717285156, + "logps/rejected": -213.3500213623047, + "loss": 0.0829, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.097774863243103, + "rewards/margins": 6.298386096954346, + "rewards/rejected": -7.396161079406738, + "step": 4400 + }, + { + "epoch": 1.06, + "eval_logits/chosen": -2.2565360069274902, + "eval_logits/rejected": -2.21488356590271, + "eval_logps/chosen": -240.26016235351562, + "eval_logps/rejected": -250.58619689941406, + "eval_loss": 0.530575156211853, + "eval_rewards/accuracies": 0.6949999928474426, + "eval_rewards/chosen": -3.8332605361938477, + "eval_rewards/margins": 2.272279977798462, + "eval_rewards/rejected": -6.105540752410889, + "eval_runtime": 133.9179, + "eval_samples_per_second": 23.567, + "eval_steps_per_second": 0.373, + "step": 4400 + }, + { + "epoch": 1.06, + "learning_rate": 3.59021215903013e-07, + "logits/chosen": -2.3539633750915527, + "logits/rejected": -2.4350552558898926, + "logps/chosen": -210.99447631835938, + "logps/rejected": -279.7448425292969, + "loss": 0.0853, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7387447357177734, + "rewards/margins": 6.219385147094727, + "rewards/rejected": -7.9581298828125, + "step": 4410 + }, + { + "epoch": 1.06, + "learning_rate": 3.585755036548404e-07, + "logits/chosen": -2.6168372631073, + "logits/rejected": -2.454535961151123, + "logps/chosen": -310.8194274902344, + "logps/rejected": -315.9908752441406, + "loss": 0.0783, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3085572719573975, + "rewards/margins": 7.067923545837402, + "rewards/rejected": -8.376481056213379, + "step": 4420 + }, + { + "epoch": 1.07, + "learning_rate": 3.5812979140666785e-07, + "logits/chosen": -2.6452255249023438, + "logits/rejected": -2.5484931468963623, + "logps/chosen": -262.70172119140625, + "logps/rejected": -332.6692810058594, + "loss": 0.0696, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2865467667579651, + "rewards/margins": 8.886618614196777, + "rewards/rejected": -8.600071907043457, + "step": 4430 + }, + { + "epoch": 1.07, + "learning_rate": 3.5768407915849525e-07, + "logits/chosen": -2.5716865062713623, + "logits/rejected": -2.6295697689056396, + "logps/chosen": -219.3745880126953, + "logps/rejected": -295.82073974609375, + "loss": 0.1842, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.3263219594955444, + "rewards/margins": 6.227828025817871, + "rewards/rejected": -7.554150581359863, + "step": 4440 + }, + { + "epoch": 1.07, + "learning_rate": 3.5723836691032266e-07, + "logits/chosen": -2.721628189086914, + "logits/rejected": -2.573312520980835, + "logps/chosen": -282.0672912597656, + "logps/rejected": -300.0820617675781, + "loss": 0.1691, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.3201095759868622, + "rewards/margins": 7.496065616607666, + "rewards/rejected": -7.175955295562744, + "step": 4450 + }, + { + "epoch": 1.07, + "learning_rate": 3.5679265466215006e-07, + "logits/chosen": -2.6510682106018066, + "logits/rejected": -2.6375367641448975, + "logps/chosen": -280.3169250488281, + "logps/rejected": -317.9718322753906, + "loss": 0.0615, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.035267461091279984, + "rewards/margins": 6.4750657081604, + "rewards/rejected": -6.510333061218262, + "step": 4460 + }, + { + "epoch": 1.08, + "learning_rate": 3.563469424139775e-07, + "logits/chosen": -2.6726248264312744, + "logits/rejected": -2.5161356925964355, + "logps/chosen": -266.86773681640625, + "logps/rejected": -280.0702209472656, + "loss": 0.1067, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.0053305355831980705, + "rewards/margins": 7.370538234710693, + "rewards/rejected": -7.375868797302246, + "step": 4470 + }, + { + "epoch": 1.08, + "learning_rate": 3.559012301658049e-07, + "logits/chosen": -2.4348297119140625, + "logits/rejected": -2.3549609184265137, + "logps/chosen": -288.95452880859375, + "logps/rejected": -375.9358215332031, + "loss": 0.1212, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.1854562759399414, + "rewards/margins": 8.024209022521973, + "rewards/rejected": -7.838752746582031, + "step": 4480 + }, + { + "epoch": 1.08, + "learning_rate": 3.554555179176323e-07, + "logits/chosen": -2.579810619354248, + "logits/rejected": -2.4765095710754395, + "logps/chosen": -244.8577117919922, + "logps/rejected": -270.9769592285156, + "loss": 0.1491, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 2.057324171066284, + "rewards/margins": 9.266227722167969, + "rewards/rejected": -7.208902835845947, + "step": 4490 + }, + { + "epoch": 1.08, + "learning_rate": 3.550098056694598e-07, + "logits/chosen": -2.6141881942749023, + "logits/rejected": -2.6928577423095703, + "logps/chosen": -209.54342651367188, + "logps/rejected": -355.61163330078125, + "loss": 0.0718, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.4131593704223633, + "rewards/margins": 8.726862907409668, + "rewards/rejected": -8.313703536987305, + "step": 4500 + }, + { + "epoch": 1.09, + "learning_rate": 3.545640934212872e-07, + "logits/chosen": -2.6281442642211914, + "logits/rejected": -2.420114517211914, + "logps/chosen": -239.50320434570312, + "logps/rejected": -216.16220092773438, + "loss": 0.0776, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.6639260053634644, + "rewards/margins": 7.536489009857178, + "rewards/rejected": -6.872563362121582, + "step": 4510 + }, + { + "epoch": 1.09, + "learning_rate": 3.541183811731146e-07, + "logits/chosen": -2.6011791229248047, + "logits/rejected": -2.56583571434021, + "logps/chosen": -269.5140686035156, + "logps/rejected": -278.8650817871094, + "loss": 0.0671, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6813734769821167, + "rewards/margins": 8.699769973754883, + "rewards/rejected": -8.018396377563477, + "step": 4520 + }, + { + "epoch": 1.09, + "learning_rate": 3.5367266892494204e-07, + "logits/chosen": -2.620603561401367, + "logits/rejected": -2.5572497844696045, + "logps/chosen": -291.53448486328125, + "logps/rejected": -297.11328125, + "loss": 0.0778, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2895924150943756, + "rewards/margins": 6.378467082977295, + "rewards/rejected": -6.668059349060059, + "step": 4530 + }, + { + "epoch": 1.09, + "learning_rate": 3.5322695667676944e-07, + "logits/chosen": -2.5830204486846924, + "logits/rejected": -2.5361826419830322, + "logps/chosen": -200.8857421875, + "logps/rejected": -299.4195251464844, + "loss": 0.0902, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07186810672283173, + "rewards/margins": 7.2964348793029785, + "rewards/rejected": -7.224567413330078, + "step": 4540 + }, + { + "epoch": 1.1, + "learning_rate": 3.5278124442859684e-07, + "logits/chosen": -2.5962417125701904, + "logits/rejected": -2.556992292404175, + "logps/chosen": -241.4461212158203, + "logps/rejected": -348.4083557128906, + "loss": 0.0554, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9010728597640991, + "rewards/margins": 10.432828903198242, + "rewards/rejected": -9.531754493713379, + "step": 4550 + }, + { + "epoch": 1.1, + "learning_rate": 3.523355321804243e-07, + "logits/chosen": -2.623161554336548, + "logits/rejected": -2.672816276550293, + "logps/chosen": -306.97027587890625, + "logps/rejected": -397.1775207519531, + "loss": 0.1126, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.9301154017448425, + "rewards/margins": 8.396772384643555, + "rewards/rejected": -7.466658115386963, + "step": 4560 + }, + { + "epoch": 1.1, + "learning_rate": 3.518898199322517e-07, + "logits/chosen": -2.687253475189209, + "logits/rejected": -2.619741916656494, + "logps/chosen": -347.5944519042969, + "logps/rejected": -337.0247497558594, + "loss": 0.1122, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.29035288095474243, + "rewards/margins": 6.988702297210693, + "rewards/rejected": -6.6983489990234375, + "step": 4570 + }, + { + "epoch": 1.1, + "learning_rate": 3.514441076840791e-07, + "logits/chosen": -2.512484312057495, + "logits/rejected": -2.4654126167297363, + "logps/chosen": -213.0530242919922, + "logps/rejected": -314.42327880859375, + "loss": 0.0805, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.24507789313793182, + "rewards/margins": 7.889502048492432, + "rewards/rejected": -8.1345796585083, + "step": 4580 + }, + { + "epoch": 1.1, + "learning_rate": 3.5099839543590656e-07, + "logits/chosen": -2.444556713104248, + "logits/rejected": -2.4471569061279297, + "logps/chosen": -308.90423583984375, + "logps/rejected": -515.9949340820312, + "loss": 0.1229, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.2889782190322876, + "rewards/margins": 11.80101490020752, + "rewards/rejected": -10.51203727722168, + "step": 4590 + }, + { + "epoch": 1.11, + "learning_rate": 3.50552683187734e-07, + "logits/chosen": -2.689913034439087, + "logits/rejected": -2.5694491863250732, + "logps/chosen": -337.18414306640625, + "logps/rejected": -306.79388427734375, + "loss": 0.0925, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7814053297042847, + "rewards/margins": 6.829098701477051, + "rewards/rejected": -6.047692775726318, + "step": 4600 + }, + { + "epoch": 1.11, + "learning_rate": 3.501069709395614e-07, + "logits/chosen": -2.7009692192077637, + "logits/rejected": -2.654804229736328, + "logps/chosen": -286.5771484375, + "logps/rejected": -297.51361083984375, + "loss": 0.0954, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.4205569624900818, + "rewards/margins": 7.934011936187744, + "rewards/rejected": -7.513455390930176, + "step": 4610 + }, + { + "epoch": 1.11, + "learning_rate": 3.496612586913889e-07, + "logits/chosen": -2.5797932147979736, + "logits/rejected": -2.5282037258148193, + "logps/chosen": -354.33245849609375, + "logps/rejected": -367.6128845214844, + "loss": 0.1232, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.8205696940422058, + "rewards/margins": 8.912364959716797, + "rewards/rejected": -8.091795921325684, + "step": 4620 + }, + { + "epoch": 1.11, + "learning_rate": 3.492155464432163e-07, + "logits/chosen": -2.7749524116516113, + "logits/rejected": -2.616973400115967, + "logps/chosen": -389.33636474609375, + "logps/rejected": -307.29693603515625, + "loss": 0.0655, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2715777158737183, + "rewards/margins": 6.716928005218506, + "rewards/rejected": -5.445350170135498, + "step": 4630 + }, + { + "epoch": 1.12, + "learning_rate": 3.487698341950437e-07, + "logits/chosen": -2.6227383613586426, + "logits/rejected": -2.583625316619873, + "logps/chosen": -219.1520538330078, + "logps/rejected": -240.3316650390625, + "loss": 0.1236, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.37693899869918823, + "rewards/margins": 7.113305568695068, + "rewards/rejected": -7.490243434906006, + "step": 4640 + }, + { + "epoch": 1.12, + "learning_rate": 3.483241219468711e-07, + "logits/chosen": -2.567394733428955, + "logits/rejected": -2.6633286476135254, + "logps/chosen": -237.76931762695312, + "logps/rejected": -346.8174743652344, + "loss": 0.121, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.017298942431807518, + "rewards/margins": 7.123035430908203, + "rewards/rejected": -7.10573673248291, + "step": 4650 + }, + { + "epoch": 1.12, + "learning_rate": 3.4787840969869854e-07, + "logits/chosen": -2.5288941860198975, + "logits/rejected": -2.5487313270568848, + "logps/chosen": -273.1539306640625, + "logps/rejected": -327.77471923828125, + "loss": 0.0768, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.34701523184776306, + "rewards/margins": 8.51653003692627, + "rewards/rejected": -8.169514656066895, + "step": 4660 + }, + { + "epoch": 1.12, + "learning_rate": 3.4743269745052594e-07, + "logits/chosen": -2.729304313659668, + "logits/rejected": -2.6582436561584473, + "logps/chosen": -188.26229858398438, + "logps/rejected": -237.3467254638672, + "loss": 0.0703, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.063475251197815, + "rewards/margins": 6.28040075302124, + "rewards/rejected": -7.343875885009766, + "step": 4670 + }, + { + "epoch": 1.13, + "learning_rate": 3.4698698520235335e-07, + "logits/chosen": -2.615694999694824, + "logits/rejected": -2.5419116020202637, + "logps/chosen": -209.73489379882812, + "logps/rejected": -298.9879150390625, + "loss": 0.1043, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.41462770104408264, + "rewards/margins": 7.651031494140625, + "rewards/rejected": -8.065659523010254, + "step": 4680 + }, + { + "epoch": 1.13, + "learning_rate": 3.465412729541808e-07, + "logits/chosen": -2.561784267425537, + "logits/rejected": -2.501864194869995, + "logps/chosen": -298.063720703125, + "logps/rejected": -343.0885314941406, + "loss": 0.0979, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0329442024230957, + "rewards/margins": 7.5313615798950195, + "rewards/rejected": -8.564305305480957, + "step": 4690 + }, + { + "epoch": 1.13, + "learning_rate": 3.460955607060082e-07, + "logits/chosen": -2.762120485305786, + "logits/rejected": -2.6479175090789795, + "logps/chosen": -328.4841003417969, + "logps/rejected": -332.92572021484375, + "loss": 0.14, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.12023515999317169, + "rewards/margins": 6.435657501220703, + "rewards/rejected": -6.315423011779785, + "step": 4700 + }, + { + "epoch": 1.13, + "learning_rate": 3.456498484578356e-07, + "logits/chosen": -2.7214341163635254, + "logits/rejected": -2.6656060218811035, + "logps/chosen": -225.5690460205078, + "logps/rejected": -299.20703125, + "loss": 0.0973, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33568239212036133, + "rewards/margins": 6.949644565582275, + "rewards/rejected": -7.2853264808654785, + "step": 4710 + }, + { + "epoch": 1.14, + "learning_rate": 3.4520413620966306e-07, + "logits/chosen": -2.5441741943359375, + "logits/rejected": -2.5853934288024902, + "logps/chosen": -199.78700256347656, + "logps/rejected": -308.087646484375, + "loss": 0.1425, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.6698516607284546, + "rewards/margins": 7.509990692138672, + "rewards/rejected": -6.8401384353637695, + "step": 4720 + }, + { + "epoch": 1.14, + "learning_rate": 3.4475842396149047e-07, + "logits/chosen": -2.5010409355163574, + "logits/rejected": -2.500927448272705, + "logps/chosen": -233.6601104736328, + "logps/rejected": -353.69781494140625, + "loss": 0.0913, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.5150055289268494, + "rewards/margins": 8.125643730163574, + "rewards/rejected": -8.640649795532227, + "step": 4730 + }, + { + "epoch": 1.14, + "learning_rate": 3.4431271171331787e-07, + "logits/chosen": -2.6142001152038574, + "logits/rejected": -2.3814339637756348, + "logps/chosen": -292.4364013671875, + "logps/rejected": -319.9347229003906, + "loss": 0.1175, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1018899455666542, + "rewards/margins": 8.109670639038086, + "rewards/rejected": -8.211560249328613, + "step": 4740 + }, + { + "epoch": 1.14, + "learning_rate": 3.438669994651453e-07, + "logits/chosen": -2.5011606216430664, + "logits/rejected": -2.4382424354553223, + "logps/chosen": -208.596435546875, + "logps/rejected": -326.25323486328125, + "loss": 0.0888, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11111573874950409, + "rewards/margins": 8.364480972290039, + "rewards/rejected": -8.253364562988281, + "step": 4750 + }, + { + "epoch": 1.15, + "learning_rate": 3.4342128721697273e-07, + "logits/chosen": -2.535123109817505, + "logits/rejected": -2.3774056434631348, + "logps/chosen": -261.7142028808594, + "logps/rejected": -269.27703857421875, + "loss": 0.128, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3187916278839111, + "rewards/margins": 5.030508518218994, + "rewards/rejected": -6.349300384521484, + "step": 4760 + }, + { + "epoch": 1.15, + "learning_rate": 3.4297557496880013e-07, + "logits/chosen": -2.6771738529205322, + "logits/rejected": -2.4687163829803467, + "logps/chosen": -309.0341796875, + "logps/rejected": -324.45794677734375, + "loss": 0.0581, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0969277173280716, + "rewards/margins": 8.045573234558105, + "rewards/rejected": -8.142500877380371, + "step": 4770 + }, + { + "epoch": 1.15, + "learning_rate": 3.425298627206276e-07, + "logits/chosen": -2.4616754055023193, + "logits/rejected": -2.5115413665771484, + "logps/chosen": -228.31356811523438, + "logps/rejected": -312.6092834472656, + "loss": 0.1108, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.1317485272884369, + "rewards/margins": 6.608421325683594, + "rewards/rejected": -6.740170478820801, + "step": 4780 + }, + { + "epoch": 1.15, + "learning_rate": 3.42084150472455e-07, + "logits/chosen": -2.6824426651000977, + "logits/rejected": -2.4847335815429688, + "logps/chosen": -254.8378448486328, + "logps/rejected": -316.4959411621094, + "loss": 0.1683, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.210846185684204, + "rewards/margins": 7.360320091247559, + "rewards/rejected": -8.571165084838867, + "step": 4790 + }, + { + "epoch": 1.16, + "learning_rate": 3.416384382242824e-07, + "logits/chosen": -2.5573363304138184, + "logits/rejected": -2.6037890911102295, + "logps/chosen": -204.7184600830078, + "logps/rejected": -264.3968505859375, + "loss": 0.1383, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.4384899139404297, + "rewards/margins": 5.136371612548828, + "rewards/rejected": -5.574862003326416, + "step": 4800 + }, + { + "epoch": 1.16, + "eval_logits/chosen": -2.3642618656158447, + "eval_logits/rejected": -2.3301374912261963, + "eval_logps/chosen": -240.0742645263672, + "eval_logps/rejected": -246.86351013183594, + "eval_loss": 0.543235182762146, + "eval_rewards/accuracies": 0.6675000190734863, + "eval_rewards/chosen": -3.8146703243255615, + "eval_rewards/margins": 1.918602705001831, + "eval_rewards/rejected": -5.733273506164551, + "eval_runtime": 134.4375, + "eval_samples_per_second": 23.476, + "eval_steps_per_second": 0.372, + "step": 4800 + }, + { + "epoch": 1.16, + "learning_rate": 3.411927259761098e-07, + "logits/chosen": -2.652888774871826, + "logits/rejected": -2.55576229095459, + "logps/chosen": -223.55447387695312, + "logps/rejected": -250.8131866455078, + "loss": 0.1212, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.21370844542980194, + "rewards/margins": 6.394108772277832, + "rewards/rejected": -6.60781717300415, + "step": 4810 + }, + { + "epoch": 1.16, + "learning_rate": 3.4074701372793725e-07, + "logits/chosen": -2.676522731781006, + "logits/rejected": -2.4591970443725586, + "logps/chosen": -237.09603881835938, + "logps/rejected": -273.28900146484375, + "loss": 0.1163, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.6377540826797485, + "rewards/margins": 6.5416717529296875, + "rewards/rejected": -8.179426193237305, + "step": 4820 + }, + { + "epoch": 1.16, + "learning_rate": 3.4030130147976465e-07, + "logits/chosen": -2.5285964012145996, + "logits/rejected": -2.544323444366455, + "logps/chosen": -322.7830810546875, + "logps/rejected": -394.04583740234375, + "loss": 0.0904, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.2724666893482208, + "rewards/margins": 9.214590072631836, + "rewards/rejected": -9.48705768585205, + "step": 4830 + }, + { + "epoch": 1.16, + "learning_rate": 3.3985558923159206e-07, + "logits/chosen": -2.651007652282715, + "logits/rejected": -2.6063904762268066, + "logps/chosen": -284.8469543457031, + "logps/rejected": -305.2882385253906, + "loss": 0.0995, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8186386823654175, + "rewards/margins": 7.470262050628662, + "rewards/rejected": -8.288900375366211, + "step": 4840 + }, + { + "epoch": 1.17, + "learning_rate": 3.394098769834195e-07, + "logits/chosen": -2.7330214977264404, + "logits/rejected": -2.7236289978027344, + "logps/chosen": -313.14666748046875, + "logps/rejected": -385.87530517578125, + "loss": 0.114, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.0422546863555908, + "rewards/margins": 6.971061706542969, + "rewards/rejected": -8.01331615447998, + "step": 4850 + }, + { + "epoch": 1.17, + "learning_rate": 3.389641647352469e-07, + "logits/chosen": -2.62068247795105, + "logits/rejected": -2.59302020072937, + "logps/chosen": -181.2661590576172, + "logps/rejected": -293.93359375, + "loss": 0.0889, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.053655982017517, + "rewards/margins": 6.890317440032959, + "rewards/rejected": -7.943974494934082, + "step": 4860 + }, + { + "epoch": 1.17, + "learning_rate": 3.385184524870743e-07, + "logits/chosen": -2.7119078636169434, + "logits/rejected": -2.5695385932922363, + "logps/chosen": -351.404052734375, + "logps/rejected": -259.8926696777344, + "loss": 0.1015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2817792296409607, + "rewards/margins": 5.497501850128174, + "rewards/rejected": -5.779280662536621, + "step": 4870 + }, + { + "epoch": 1.17, + "learning_rate": 3.380727402389018e-07, + "logits/chosen": -2.640078544616699, + "logits/rejected": -2.616539478302002, + "logps/chosen": -202.4800567626953, + "logps/rejected": -362.8905334472656, + "loss": 0.1176, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.41821393370628357, + "rewards/margins": 9.335319519042969, + "rewards/rejected": -9.753533363342285, + "step": 4880 + }, + { + "epoch": 1.18, + "learning_rate": 3.376270279907292e-07, + "logits/chosen": -2.784754991531372, + "logits/rejected": -2.621223211288452, + "logps/chosen": -294.47381591796875, + "logps/rejected": -250.52694702148438, + "loss": 0.1217, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.5304609537124634, + "rewards/margins": 6.352158546447754, + "rewards/rejected": -5.821697235107422, + "step": 4890 + }, + { + "epoch": 1.18, + "learning_rate": 3.371813157425566e-07, + "logits/chosen": -2.585599899291992, + "logits/rejected": -2.4793825149536133, + "logps/chosen": -350.9255676269531, + "logps/rejected": -284.8587646484375, + "loss": 0.1114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4237033724784851, + "rewards/margins": 6.879862308502197, + "rewards/rejected": -7.303567409515381, + "step": 4900 + }, + { + "epoch": 1.18, + "learning_rate": 3.3673560349438404e-07, + "logits/chosen": -2.536411762237549, + "logits/rejected": -2.687934398651123, + "logps/chosen": -231.5205535888672, + "logps/rejected": -356.8689880371094, + "loss": 0.1045, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.6803652048110962, + "rewards/margins": 8.608926773071289, + "rewards/rejected": -9.289292335510254, + "step": 4910 + }, + { + "epoch": 1.18, + "learning_rate": 3.3628989124621144e-07, + "logits/chosen": -2.7165687084198, + "logits/rejected": -2.590156078338623, + "logps/chosen": -210.8171844482422, + "logps/rejected": -249.25381469726562, + "loss": 0.1056, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.7293668985366821, + "rewards/margins": 8.638975143432617, + "rewards/rejected": -7.909608364105225, + "step": 4920 + }, + { + "epoch": 1.19, + "learning_rate": 3.3584417899803884e-07, + "logits/chosen": -2.709575891494751, + "logits/rejected": -2.5665009021759033, + "logps/chosen": -278.49725341796875, + "logps/rejected": -300.79083251953125, + "loss": 0.089, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.9219233393669128, + "rewards/margins": 6.195326328277588, + "rewards/rejected": -7.117249488830566, + "step": 4930 + }, + { + "epoch": 1.19, + "learning_rate": 3.353984667498663e-07, + "logits/chosen": -2.4022912979125977, + "logits/rejected": -2.588256359100342, + "logps/chosen": -246.93045043945312, + "logps/rejected": -302.0685119628906, + "loss": 0.1222, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.2618193626403809, + "rewards/margins": 6.316469669342041, + "rewards/rejected": -7.578289985656738, + "step": 4940 + }, + { + "epoch": 1.19, + "learning_rate": 3.349527545016937e-07, + "logits/chosen": -2.6150996685028076, + "logits/rejected": -2.436365842819214, + "logps/chosen": -362.4632873535156, + "logps/rejected": -396.39971923828125, + "loss": 0.0859, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.31028467416763306, + "rewards/margins": 7.695733070373535, + "rewards/rejected": -7.3854475021362305, + "step": 4950 + }, + { + "epoch": 1.19, + "learning_rate": 3.345070422535211e-07, + "logits/chosen": -2.392132520675659, + "logits/rejected": -2.520671844482422, + "logps/chosen": -163.79122924804688, + "logps/rejected": -267.2300720214844, + "loss": 0.099, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.057686686515808, + "rewards/margins": 7.228782653808594, + "rewards/rejected": -8.286470413208008, + "step": 4960 + }, + { + "epoch": 1.2, + "learning_rate": 3.340613300053485e-07, + "logits/chosen": -2.809436559677124, + "logits/rejected": -2.632969617843628, + "logps/chosen": -301.5816955566406, + "logps/rejected": -422.3345642089844, + "loss": 0.102, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04304458945989609, + "rewards/margins": 9.245333671569824, + "rewards/rejected": -9.202289581298828, + "step": 4970 + }, + { + "epoch": 1.2, + "learning_rate": 3.3361561775717596e-07, + "logits/chosen": -2.3093483448028564, + "logits/rejected": -2.373852491378784, + "logps/chosen": -197.71905517578125, + "logps/rejected": -246.5425567626953, + "loss": 0.0629, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.19547121226787567, + "rewards/margins": 6.883888244628906, + "rewards/rejected": -6.688416957855225, + "step": 4980 + }, + { + "epoch": 1.2, + "learning_rate": 3.3316990550900336e-07, + "logits/chosen": -2.665581464767456, + "logits/rejected": -2.588740587234497, + "logps/chosen": -267.37750244140625, + "logps/rejected": -295.717041015625, + "loss": 0.1645, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5497006177902222, + "rewards/margins": 7.928070068359375, + "rewards/rejected": -8.477770805358887, + "step": 4990 + }, + { + "epoch": 1.2, + "learning_rate": 3.3272419326083077e-07, + "logits/chosen": -2.3988068103790283, + "logits/rejected": -2.463813066482544, + "logps/chosen": -258.91229248046875, + "logps/rejected": -241.25588989257812, + "loss": 0.1358, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.1083600521087646, + "rewards/margins": 5.589565753936768, + "rewards/rejected": -7.697924613952637, + "step": 5000 + }, + { + "epoch": 1.21, + "learning_rate": 3.322784810126582e-07, + "logits/chosen": -2.5505945682525635, + "logits/rejected": -2.598329544067383, + "logps/chosen": -228.31265258789062, + "logps/rejected": -260.6291198730469, + "loss": 0.2057, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.1303789615631104, + "rewards/margins": 5.149099826812744, + "rewards/rejected": -6.279478549957275, + "step": 5010 + }, + { + "epoch": 1.21, + "learning_rate": 3.318327687644856e-07, + "logits/chosen": -2.5362915992736816, + "logits/rejected": -2.499586582183838, + "logps/chosen": -332.15203857421875, + "logps/rejected": -346.24346923828125, + "loss": 0.0936, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.3879649043083191, + "rewards/margins": 7.188324928283691, + "rewards/rejected": -7.576289176940918, + "step": 5020 + }, + { + "epoch": 1.21, + "learning_rate": 3.3138705651631303e-07, + "logits/chosen": -2.474151849746704, + "logits/rejected": -2.479788303375244, + "logps/chosen": -290.87408447265625, + "logps/rejected": -277.9864501953125, + "loss": 0.1125, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.0720984935760498, + "rewards/margins": 5.177456378936768, + "rewards/rejected": -6.2495551109313965, + "step": 5030 + }, + { + "epoch": 1.21, + "learning_rate": 3.309413442681405e-07, + "logits/chosen": -2.63665771484375, + "logits/rejected": -2.5593645572662354, + "logps/chosen": -230.1723175048828, + "logps/rejected": -318.81451416015625, + "loss": 0.1069, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13830193877220154, + "rewards/margins": 6.594658851623535, + "rewards/rejected": -6.7329607009887695, + "step": 5040 + }, + { + "epoch": 1.22, + "learning_rate": 3.304956320199679e-07, + "logits/chosen": -2.2486722469329834, + "logits/rejected": -2.3361761569976807, + "logps/chosen": -215.96115112304688, + "logps/rejected": -291.5397033691406, + "loss": 0.0685, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0104453563690186, + "rewards/margins": 7.1230669021606445, + "rewards/rejected": -8.133512496948242, + "step": 5050 + }, + { + "epoch": 1.22, + "learning_rate": 3.300499197717953e-07, + "logits/chosen": -2.5913820266723633, + "logits/rejected": -2.387864589691162, + "logps/chosen": -318.3968505859375, + "logps/rejected": -324.6764221191406, + "loss": 0.0845, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11920301616191864, + "rewards/margins": 8.436397552490234, + "rewards/rejected": -8.317194938659668, + "step": 5060 + }, + { + "epoch": 1.22, + "learning_rate": 3.2960420752362275e-07, + "logits/chosen": -2.5653076171875, + "logits/rejected": -2.451930522918701, + "logps/chosen": -246.45553588867188, + "logps/rejected": -275.3322448730469, + "loss": 0.0879, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.5844142436981201, + "rewards/margins": 7.900046348571777, + "rewards/rejected": -9.484460830688477, + "step": 5070 + }, + { + "epoch": 1.22, + "learning_rate": 3.2915849527545015e-07, + "logits/chosen": -2.5098018646240234, + "logits/rejected": -2.5696258544921875, + "logps/chosen": -235.1271514892578, + "logps/rejected": -378.62884521484375, + "loss": 0.0676, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.0039904117584229, + "rewards/margins": 8.815313339233398, + "rewards/rejected": -9.819302558898926, + "step": 5080 + }, + { + "epoch": 1.23, + "learning_rate": 3.2871278302727755e-07, + "logits/chosen": -2.520859718322754, + "logits/rejected": -2.5557596683502197, + "logps/chosen": -157.81492614746094, + "logps/rejected": -281.9358215332031, + "loss": 0.0865, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4797874987125397, + "rewards/margins": 6.32138729095459, + "rewards/rejected": -6.801175117492676, + "step": 5090 + }, + { + "epoch": 1.23, + "learning_rate": 3.28267070779105e-07, + "logits/chosen": -2.573880434036255, + "logits/rejected": -2.395332098007202, + "logps/chosen": -217.60202026367188, + "logps/rejected": -219.1641387939453, + "loss": 0.0857, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7336370944976807, + "rewards/margins": 5.26425838470459, + "rewards/rejected": -6.99789571762085, + "step": 5100 + }, + { + "epoch": 1.23, + "learning_rate": 3.278213585309324e-07, + "logits/chosen": -2.617797374725342, + "logits/rejected": -2.512047290802002, + "logps/chosen": -257.33697509765625, + "logps/rejected": -271.3388671875, + "loss": 0.0955, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.8858249187469482, + "rewards/margins": 4.943978786468506, + "rewards/rejected": -7.829803466796875, + "step": 5110 + }, + { + "epoch": 1.23, + "learning_rate": 3.273756462827598e-07, + "logits/chosen": -2.3365323543548584, + "logits/rejected": -2.356816053390503, + "logps/chosen": -191.58914184570312, + "logps/rejected": -319.6527404785156, + "loss": 0.0734, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.186131238937378, + "rewards/margins": 8.288546562194824, + "rewards/rejected": -9.474678993225098, + "step": 5120 + }, + { + "epoch": 1.23, + "learning_rate": 3.269299340345872e-07, + "logits/chosen": -2.3473777770996094, + "logits/rejected": -2.274927854537964, + "logps/chosen": -283.12548828125, + "logps/rejected": -362.0394287109375, + "loss": 0.1288, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.0770468711853027, + "rewards/margins": 12.561004638671875, + "rewards/rejected": -11.48395824432373, + "step": 5130 + }, + { + "epoch": 1.24, + "learning_rate": 3.2648422178641467e-07, + "logits/chosen": -2.4011003971099854, + "logits/rejected": -2.3673596382141113, + "logps/chosen": -280.14373779296875, + "logps/rejected": -360.3028259277344, + "loss": 0.1372, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.6586049795150757, + "rewards/margins": 8.177619934082031, + "rewards/rejected": -8.836225509643555, + "step": 5140 + }, + { + "epoch": 1.24, + "learning_rate": 3.260385095382421e-07, + "logits/chosen": -2.405153751373291, + "logits/rejected": -2.5100693702697754, + "logps/chosen": -194.7483673095703, + "logps/rejected": -254.55648803710938, + "loss": 0.1007, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.961032509803772, + "rewards/margins": 5.3906989097595215, + "rewards/rejected": -6.351731300354004, + "step": 5150 + }, + { + "epoch": 1.24, + "learning_rate": 3.255927972900695e-07, + "logits/chosen": -2.440706729888916, + "logits/rejected": -2.38972544670105, + "logps/chosen": -340.3149108886719, + "logps/rejected": -404.1507263183594, + "loss": 0.0643, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6216408014297485, + "rewards/margins": 9.724592208862305, + "rewards/rejected": -9.102952003479004, + "step": 5160 + }, + { + "epoch": 1.24, + "learning_rate": 3.2514708504189693e-07, + "logits/chosen": -2.2282023429870605, + "logits/rejected": -2.132371187210083, + "logps/chosen": -308.7526550292969, + "logps/rejected": -340.6700134277344, + "loss": 0.0645, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.4051053524017334, + "rewards/margins": 6.600663185119629, + "rewards/rejected": -8.005767822265625, + "step": 5170 + }, + { + "epoch": 1.25, + "learning_rate": 3.2470137279372434e-07, + "logits/chosen": -2.456524133682251, + "logits/rejected": -2.4100513458251953, + "logps/chosen": -270.8525085449219, + "logps/rejected": -367.73382568359375, + "loss": 0.093, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.1903812289237976, + "rewards/margins": 8.29595947265625, + "rewards/rejected": -8.486339569091797, + "step": 5180 + }, + { + "epoch": 1.25, + "learning_rate": 3.2425566054555174e-07, + "logits/chosen": -2.426785707473755, + "logits/rejected": -2.4124486446380615, + "logps/chosen": -201.92190551757812, + "logps/rejected": -254.50369262695312, + "loss": 0.0998, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.218264579772949, + "rewards/margins": 6.062951564788818, + "rewards/rejected": -8.281216621398926, + "step": 5190 + }, + { + "epoch": 1.25, + "learning_rate": 3.238099482973792e-07, + "logits/chosen": -2.6022567749023438, + "logits/rejected": -2.4152626991271973, + "logps/chosen": -317.2975158691406, + "logps/rejected": -331.3099670410156, + "loss": 0.1425, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.007159471511841, + "rewards/margins": 6.119154930114746, + "rewards/rejected": -8.126314163208008, + "step": 5200 + }, + { + "epoch": 1.25, + "eval_logits/chosen": -2.2021408081054688, + "eval_logits/rejected": -2.1704671382904053, + "eval_logps/chosen": -249.65997314453125, + "eval_logps/rejected": -260.0906066894531, + "eval_loss": 0.5238316655158997, + "eval_rewards/accuracies": 0.6650000214576721, + "eval_rewards/chosen": -4.773243427276611, + "eval_rewards/margins": 2.282736301422119, + "eval_rewards/rejected": -7.0559797286987305, + "eval_runtime": 135.0398, + "eval_samples_per_second": 23.371, + "eval_steps_per_second": 0.37, + "step": 5200 + }, + { + "epoch": 1.25, + "learning_rate": 3.233642360492066e-07, + "logits/chosen": -2.610273599624634, + "logits/rejected": -2.397230863571167, + "logps/chosen": -259.42962646484375, + "logps/rejected": -370.8842468261719, + "loss": 0.0872, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.45601025223731995, + "rewards/margins": 8.975427627563477, + "rewards/rejected": -9.431438446044922, + "step": 5210 + }, + { + "epoch": 1.26, + "learning_rate": 3.22918523801034e-07, + "logits/chosen": -2.5248847007751465, + "logits/rejected": -2.4510183334350586, + "logps/chosen": -372.82757568359375, + "logps/rejected": -339.1612548828125, + "loss": 0.052, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.026867162436246872, + "rewards/margins": 8.06849193572998, + "rewards/rejected": -8.041624069213867, + "step": 5220 + }, + { + "epoch": 1.26, + "learning_rate": 3.2247281155286146e-07, + "logits/chosen": -2.375784158706665, + "logits/rejected": -2.28358793258667, + "logps/chosen": -335.15679931640625, + "logps/rejected": -404.1818542480469, + "loss": 0.1309, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.0233367681503296, + "rewards/margins": 7.956613063812256, + "rewards/rejected": -8.979949951171875, + "step": 5230 + }, + { + "epoch": 1.26, + "learning_rate": 3.2202709930468886e-07, + "logits/chosen": -2.3147315979003906, + "logits/rejected": -2.241579532623291, + "logps/chosen": -314.4580078125, + "logps/rejected": -595.355224609375, + "loss": 0.0975, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8091908693313599, + "rewards/margins": 26.27374839782715, + "rewards/rejected": -25.46455955505371, + "step": 5240 + }, + { + "epoch": 1.26, + "learning_rate": 3.2158138705651626e-07, + "logits/chosen": -2.4444868564605713, + "logits/rejected": -2.4202077388763428, + "logps/chosen": -353.08477783203125, + "logps/rejected": -466.76885986328125, + "loss": 0.073, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.3587248921394348, + "rewards/margins": 8.451542854309082, + "rewards/rejected": -8.810267448425293, + "step": 5250 + }, + { + "epoch": 1.27, + "learning_rate": 3.211356748083437e-07, + "logits/chosen": -2.544177532196045, + "logits/rejected": -2.4594388008117676, + "logps/chosen": -326.6631774902344, + "logps/rejected": -265.6733703613281, + "loss": 0.2467, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.5703349113464355, + "rewards/margins": 6.191134452819824, + "rewards/rejected": -7.76146936416626, + "step": 5260 + }, + { + "epoch": 1.27, + "learning_rate": 3.206899625601711e-07, + "logits/chosen": -2.418236255645752, + "logits/rejected": -2.3861804008483887, + "logps/chosen": -219.2949676513672, + "logps/rejected": -341.7588806152344, + "loss": 0.0767, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.4184186458587646, + "rewards/margins": 6.886469841003418, + "rewards/rejected": -9.304888725280762, + "step": 5270 + }, + { + "epoch": 1.27, + "learning_rate": 3.202442503119985e-07, + "logits/chosen": -2.49849271774292, + "logits/rejected": -2.432671070098877, + "logps/chosen": -228.3261260986328, + "logps/rejected": -281.47235107421875, + "loss": 0.0582, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3620457649230957, + "rewards/margins": 6.679247856140137, + "rewards/rejected": -9.041293144226074, + "step": 5280 + }, + { + "epoch": 1.27, + "learning_rate": 3.1979853806382603e-07, + "logits/chosen": -2.4951071739196777, + "logits/rejected": -2.4019224643707275, + "logps/chosen": -286.7401428222656, + "logps/rejected": -457.8524475097656, + "loss": 0.0843, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6505063772201538, + "rewards/margins": 8.511384963989258, + "rewards/rejected": -10.161893844604492, + "step": 5290 + }, + { + "epoch": 1.28, + "learning_rate": 3.1935282581565344e-07, + "logits/chosen": -2.551490068435669, + "logits/rejected": -2.500885486602783, + "logps/chosen": -276.5769348144531, + "logps/rejected": -277.8192443847656, + "loss": 0.0831, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6650933027267456, + "rewards/margins": 7.8217010498046875, + "rewards/rejected": -8.486794471740723, + "step": 5300 + }, + { + "epoch": 1.28, + "learning_rate": 3.1890711356748084e-07, + "logits/chosen": -2.4486889839172363, + "logits/rejected": -2.468679428100586, + "logps/chosen": -250.3290557861328, + "logps/rejected": -292.9578857421875, + "loss": 0.062, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37615785002708435, + "rewards/margins": 8.366262435913086, + "rewards/rejected": -8.74242115020752, + "step": 5310 + }, + { + "epoch": 1.28, + "learning_rate": 3.1846140131930824e-07, + "logits/chosen": -2.4542434215545654, + "logits/rejected": -2.4101929664611816, + "logps/chosen": -284.49713134765625, + "logps/rejected": -281.8470458984375, + "loss": 0.1189, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8050628900527954, + "rewards/margins": 6.364178657531738, + "rewards/rejected": -7.169241428375244, + "step": 5320 + }, + { + "epoch": 1.28, + "learning_rate": 3.180156890711357e-07, + "logits/chosen": -2.459995746612549, + "logits/rejected": -2.2036283016204834, + "logps/chosen": -274.40582275390625, + "logps/rejected": -258.71160888671875, + "loss": 0.0773, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7084132432937622, + "rewards/margins": 6.719203948974609, + "rewards/rejected": -7.42761754989624, + "step": 5330 + }, + { + "epoch": 1.29, + "learning_rate": 3.175699768229631e-07, + "logits/chosen": -2.396700382232666, + "logits/rejected": -2.4109833240509033, + "logps/chosen": -257.54498291015625, + "logps/rejected": -302.1127624511719, + "loss": 0.0777, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19791404902935028, + "rewards/margins": 7.5934038162231445, + "rewards/rejected": -7.395489692687988, + "step": 5340 + }, + { + "epoch": 1.29, + "learning_rate": 3.171242645747905e-07, + "logits/chosen": -2.379744052886963, + "logits/rejected": -2.3605384826660156, + "logps/chosen": -260.70428466796875, + "logps/rejected": -273.8489685058594, + "loss": 0.1391, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5692093968391418, + "rewards/margins": 8.12734603881836, + "rewards/rejected": -8.696555137634277, + "step": 5350 + }, + { + "epoch": 1.29, + "learning_rate": 3.1667855232661796e-07, + "logits/chosen": -2.66178297996521, + "logits/rejected": -2.6306025981903076, + "logps/chosen": -287.98992919921875, + "logps/rejected": -422.4910583496094, + "loss": 0.0707, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.1379169523715973, + "rewards/margins": 10.600385665893555, + "rewards/rejected": -10.462468147277832, + "step": 5360 + }, + { + "epoch": 1.29, + "learning_rate": 3.1623284007844536e-07, + "logits/chosen": -2.724097967147827, + "logits/rejected": -2.6357712745666504, + "logps/chosen": -248.39602661132812, + "logps/rejected": -262.27197265625, + "loss": 0.1339, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5081327557563782, + "rewards/margins": 5.618767261505127, + "rewards/rejected": -6.126899719238281, + "step": 5370 + }, + { + "epoch": 1.29, + "learning_rate": 3.1578712783027276e-07, + "logits/chosen": -2.5694947242736816, + "logits/rejected": -2.6334919929504395, + "logps/chosen": -240.76773071289062, + "logps/rejected": -364.18157958984375, + "loss": 0.1331, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0891406536102295, + "rewards/margins": 8.847578048706055, + "rewards/rejected": -9.936718940734863, + "step": 5380 + }, + { + "epoch": 1.3, + "learning_rate": 3.153414155821002e-07, + "logits/chosen": -2.7643802165985107, + "logits/rejected": -2.6889286041259766, + "logps/chosen": -269.68487548828125, + "logps/rejected": -359.32550048828125, + "loss": 0.1253, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.20100903511047363, + "rewards/margins": 7.6140313148498535, + "rewards/rejected": -7.815041542053223, + "step": 5390 + }, + { + "epoch": 1.3, + "learning_rate": 3.148957033339276e-07, + "logits/chosen": -2.576159954071045, + "logits/rejected": -2.6012067794799805, + "logps/chosen": -280.146240234375, + "logps/rejected": -369.99505615234375, + "loss": 0.0596, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.37197595834732056, + "rewards/margins": 9.538189888000488, + "rewards/rejected": -9.910165786743164, + "step": 5400 + }, + { + "epoch": 1.3, + "learning_rate": 3.14449991085755e-07, + "logits/chosen": -2.5083253383636475, + "logits/rejected": -2.4267446994781494, + "logps/chosen": -232.00759887695312, + "logps/rejected": -333.5928955078125, + "loss": 0.0747, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04621438309550285, + "rewards/margins": 11.238184928894043, + "rewards/rejected": -11.191969871520996, + "step": 5410 + }, + { + "epoch": 1.3, + "learning_rate": 3.140042788375825e-07, + "logits/chosen": -2.523236036300659, + "logits/rejected": -2.442023754119873, + "logps/chosen": -205.5675048828125, + "logps/rejected": -263.4608459472656, + "loss": 0.1162, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9463703632354736, + "rewards/margins": 7.0555853843688965, + "rewards/rejected": -9.001955032348633, + "step": 5420 + }, + { + "epoch": 1.31, + "learning_rate": 3.135585665894099e-07, + "logits/chosen": -2.658153533935547, + "logits/rejected": -2.6786551475524902, + "logps/chosen": -285.0563659667969, + "logps/rejected": -303.50701904296875, + "loss": 0.1408, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0504812002182007, + "rewards/margins": 5.920778751373291, + "rewards/rejected": -6.971259117126465, + "step": 5430 + }, + { + "epoch": 1.31, + "learning_rate": 3.131128543412373e-07, + "logits/chosen": -2.492877960205078, + "logits/rejected": -2.3323912620544434, + "logps/chosen": -281.31512451171875, + "logps/rejected": -279.0833740234375, + "loss": 0.1003, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3006997108459473, + "rewards/margins": 7.696803092956543, + "rewards/rejected": -8.997502326965332, + "step": 5440 + }, + { + "epoch": 1.31, + "learning_rate": 3.1266714209306474e-07, + "logits/chosen": -2.605281114578247, + "logits/rejected": -2.359964609146118, + "logps/chosen": -235.1231231689453, + "logps/rejected": -295.6809387207031, + "loss": 0.0503, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4890674352645874, + "rewards/margins": 6.981287956237793, + "rewards/rejected": -8.470356941223145, + "step": 5450 + }, + { + "epoch": 1.31, + "learning_rate": 3.1222142984489215e-07, + "logits/chosen": -2.5633320808410645, + "logits/rejected": -2.5102694034576416, + "logps/chosen": -299.23663330078125, + "logps/rejected": -306.9552307128906, + "loss": 0.118, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6820122003555298, + "rewards/margins": 6.708361625671387, + "rewards/rejected": -8.390375137329102, + "step": 5460 + }, + { + "epoch": 1.32, + "learning_rate": 3.1177571759671955e-07, + "logits/chosen": -2.7502570152282715, + "logits/rejected": -2.6397671699523926, + "logps/chosen": -380.3826904296875, + "logps/rejected": -339.49615478515625, + "loss": 0.0824, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.43768367171287537, + "rewards/margins": 8.252010345458984, + "rewards/rejected": -7.814326286315918, + "step": 5470 + }, + { + "epoch": 1.32, + "learning_rate": 3.1133000534854695e-07, + "logits/chosen": -2.530097484588623, + "logits/rejected": -2.455191135406494, + "logps/chosen": -317.61968994140625, + "logps/rejected": -382.91668701171875, + "loss": 0.1061, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5647618770599365, + "rewards/margins": 6.722103118896484, + "rewards/rejected": -8.286864280700684, + "step": 5480 + }, + { + "epoch": 1.32, + "learning_rate": 3.108842931003744e-07, + "logits/chosen": -2.5136406421661377, + "logits/rejected": -2.657163143157959, + "logps/chosen": -261.32818603515625, + "logps/rejected": -346.85260009765625, + "loss": 0.1256, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4253344535827637, + "rewards/margins": 5.57681941986084, + "rewards/rejected": -7.002154350280762, + "step": 5490 + }, + { + "epoch": 1.32, + "learning_rate": 3.104385808522018e-07, + "logits/chosen": -2.4770731925964355, + "logits/rejected": -2.3463892936706543, + "logps/chosen": -300.9383850097656, + "logps/rejected": -277.59539794921875, + "loss": 0.0992, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7142188549041748, + "rewards/margins": 5.560961723327637, + "rewards/rejected": -6.275180339813232, + "step": 5500 + }, + { + "epoch": 1.33, + "learning_rate": 3.099928686040292e-07, + "logits/chosen": -2.2352776527404785, + "logits/rejected": -2.2539238929748535, + "logps/chosen": -149.8365936279297, + "logps/rejected": -223.1758575439453, + "loss": 0.0864, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.1484190821647644, + "rewards/margins": 7.326336860656738, + "rewards/rejected": -7.474755764007568, + "step": 5510 + }, + { + "epoch": 1.33, + "learning_rate": 3.0954715635585667e-07, + "logits/chosen": -2.6775448322296143, + "logits/rejected": -2.567296266555786, + "logps/chosen": -289.3621520996094, + "logps/rejected": -284.3583984375, + "loss": 0.1873, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.6089405417442322, + "rewards/margins": 6.219395637512207, + "rewards/rejected": -6.828335762023926, + "step": 5520 + }, + { + "epoch": 1.33, + "learning_rate": 3.0910144410768407e-07, + "logits/chosen": -2.5233407020568848, + "logits/rejected": -2.577707290649414, + "logps/chosen": -276.72845458984375, + "logps/rejected": -373.488525390625, + "loss": 0.0691, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.833437204360962, + "rewards/margins": 6.862840175628662, + "rewards/rejected": -9.696276664733887, + "step": 5530 + }, + { + "epoch": 1.33, + "learning_rate": 3.086557318595115e-07, + "logits/chosen": -2.51139235496521, + "logits/rejected": -2.4481873512268066, + "logps/chosen": -339.71600341796875, + "logps/rejected": -294.01806640625, + "loss": 0.0977, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5098949670791626, + "rewards/margins": 6.523590087890625, + "rewards/rejected": -8.033485412597656, + "step": 5540 + }, + { + "epoch": 1.34, + "learning_rate": 3.0821001961133893e-07, + "logits/chosen": -2.7013440132141113, + "logits/rejected": -2.571755886077881, + "logps/chosen": -272.5346984863281, + "logps/rejected": -308.40118408203125, + "loss": 0.0972, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.3033177852630615, + "rewards/margins": 8.641069412231445, + "rewards/rejected": -8.337750434875488, + "step": 5550 + }, + { + "epoch": 1.34, + "learning_rate": 3.0776430736316633e-07, + "logits/chosen": -2.5146682262420654, + "logits/rejected": -2.4330825805664062, + "logps/chosen": -255.439208984375, + "logps/rejected": -346.15216064453125, + "loss": 0.0697, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.32078343629837036, + "rewards/margins": 8.167900085449219, + "rewards/rejected": -8.488683700561523, + "step": 5560 + }, + { + "epoch": 1.34, + "learning_rate": 3.0731859511499374e-07, + "logits/chosen": -2.6147828102111816, + "logits/rejected": -2.5036494731903076, + "logps/chosen": -280.19659423828125, + "logps/rejected": -364.2330017089844, + "loss": 0.069, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1797616183757782, + "rewards/margins": 8.10425853729248, + "rewards/rejected": -8.28402042388916, + "step": 5570 + }, + { + "epoch": 1.34, + "learning_rate": 3.068728828668212e-07, + "logits/chosen": -2.7004218101501465, + "logits/rejected": -2.564936637878418, + "logps/chosen": -290.7904968261719, + "logps/rejected": -324.06072998046875, + "loss": 0.1179, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9224249124526978, + "rewards/margins": 7.382409572601318, + "rewards/rejected": -8.304835319519043, + "step": 5580 + }, + { + "epoch": 1.35, + "learning_rate": 3.064271706186486e-07, + "logits/chosen": -2.7208399772644043, + "logits/rejected": -2.697457790374756, + "logps/chosen": -325.6331481933594, + "logps/rejected": -395.9976806640625, + "loss": 0.0723, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5056465268135071, + "rewards/margins": 7.022643089294434, + "rewards/rejected": -7.528289794921875, + "step": 5590 + }, + { + "epoch": 1.35, + "learning_rate": 3.05981458370476e-07, + "logits/chosen": -2.454925060272217, + "logits/rejected": -2.514404296875, + "logps/chosen": -292.55413818359375, + "logps/rejected": -297.22515869140625, + "loss": 0.1053, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.5471060276031494, + "rewards/margins": 6.992198944091797, + "rewards/rejected": -7.539304256439209, + "step": 5600 + }, + { + "epoch": 1.35, + "eval_logits/chosen": -2.297848701477051, + "eval_logits/rejected": -2.2597482204437256, + "eval_logps/chosen": -250.8496856689453, + "eval_logps/rejected": -264.89166259765625, + "eval_loss": 0.529845654964447, + "eval_rewards/accuracies": 0.6899999976158142, + "eval_rewards/chosen": -4.892212867736816, + "eval_rewards/margins": 2.6438732147216797, + "eval_rewards/rejected": -7.536085605621338, + "eval_runtime": 135.6664, + "eval_samples_per_second": 23.263, + "eval_steps_per_second": 0.369, + "step": 5600 + }, + { + "epoch": 1.35, + "learning_rate": 3.0553574612230345e-07, + "logits/chosen": -2.473297119140625, + "logits/rejected": -2.524893045425415, + "logps/chosen": -269.2584533691406, + "logps/rejected": -356.3623046875, + "loss": 0.148, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.2787388563156128, + "rewards/margins": 8.458620071411133, + "rewards/rejected": -9.737358093261719, + "step": 5610 + }, + { + "epoch": 1.35, + "learning_rate": 3.0509003387413086e-07, + "logits/chosen": -2.58944034576416, + "logits/rejected": -2.517916679382324, + "logps/chosen": -284.4920654296875, + "logps/rejected": -234.60812377929688, + "loss": 0.1635, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.48317551612854004, + "rewards/margins": 6.530011177062988, + "rewards/rejected": -7.013186454772949, + "step": 5620 + }, + { + "epoch": 1.35, + "learning_rate": 3.0464432162595826e-07, + "logits/chosen": -2.6766116619110107, + "logits/rejected": -2.5364537239074707, + "logps/chosen": -389.23895263671875, + "logps/rejected": -419.3583068847656, + "loss": 0.2322, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.5472573041915894, + "rewards/margins": 9.332606315612793, + "rewards/rejected": -8.785348892211914, + "step": 5630 + }, + { + "epoch": 1.36, + "learning_rate": 3.0419860937778566e-07, + "logits/chosen": -2.507610559463501, + "logits/rejected": -2.510158061981201, + "logps/chosen": -304.1316223144531, + "logps/rejected": -439.6920471191406, + "loss": 0.0783, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.4623780846595764, + "rewards/margins": 11.98354721069336, + "rewards/rejected": -11.52116870880127, + "step": 5640 + }, + { + "epoch": 1.36, + "learning_rate": 3.037528971296131e-07, + "logits/chosen": -2.271934986114502, + "logits/rejected": -2.4431185722351074, + "logps/chosen": -226.60751342773438, + "logps/rejected": -319.9784240722656, + "loss": 0.0787, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8316129446029663, + "rewards/margins": 8.930788040161133, + "rewards/rejected": -9.762401580810547, + "step": 5650 + }, + { + "epoch": 1.36, + "learning_rate": 3.033071848814405e-07, + "logits/chosen": -2.4854538440704346, + "logits/rejected": -2.3633275032043457, + "logps/chosen": -248.4595489501953, + "logps/rejected": -253.173583984375, + "loss": 0.1399, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.258836030960083, + "rewards/margins": 6.145152568817139, + "rewards/rejected": -7.403987884521484, + "step": 5660 + }, + { + "epoch": 1.36, + "learning_rate": 3.028614726332679e-07, + "logits/chosen": -2.17875599861145, + "logits/rejected": -2.291477918624878, + "logps/chosen": -224.3546905517578, + "logps/rejected": -389.11346435546875, + "loss": 0.1036, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.877074122428894, + "rewards/margins": 6.948840141296387, + "rewards/rejected": -7.8259148597717285, + "step": 5670 + }, + { + "epoch": 1.37, + "learning_rate": 3.024157603850954e-07, + "logits/chosen": -2.3191721439361572, + "logits/rejected": -2.257412910461426, + "logps/chosen": -317.46038818359375, + "logps/rejected": -424.9566345214844, + "loss": 0.069, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4260387420654297, + "rewards/margins": 6.946034908294678, + "rewards/rejected": -9.37207317352295, + "step": 5680 + }, + { + "epoch": 1.37, + "learning_rate": 3.019700481369228e-07, + "logits/chosen": -2.339301586151123, + "logits/rejected": -2.292539358139038, + "logps/chosen": -224.4660186767578, + "logps/rejected": -261.5100402832031, + "loss": 0.078, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.1883065700531006, + "rewards/margins": 7.293276309967041, + "rewards/rejected": -8.481582641601562, + "step": 5690 + }, + { + "epoch": 1.37, + "learning_rate": 3.015243358887502e-07, + "logits/chosen": -2.2622950077056885, + "logits/rejected": -2.1832115650177, + "logps/chosen": -314.8285217285156, + "logps/rejected": -275.82293701171875, + "loss": 0.1257, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.49560078978538513, + "rewards/margins": 8.324846267700195, + "rewards/rejected": -8.820446968078613, + "step": 5700 + }, + { + "epoch": 1.37, + "learning_rate": 3.0107862364057764e-07, + "logits/chosen": -2.185823440551758, + "logits/rejected": -2.0748419761657715, + "logps/chosen": -157.82144165039062, + "logps/rejected": -272.6867980957031, + "loss": 0.1587, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.2449867725372314, + "rewards/margins": 8.263470649719238, + "rewards/rejected": -9.50845718383789, + "step": 5710 + }, + { + "epoch": 1.38, + "learning_rate": 3.0063291139240504e-07, + "logits/chosen": -2.4911751747131348, + "logits/rejected": -2.346667766571045, + "logps/chosen": -274.0582275390625, + "logps/rejected": -328.19305419921875, + "loss": 0.1107, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.037358283996582, + "rewards/margins": 7.605017185211182, + "rewards/rejected": -9.642374992370605, + "step": 5720 + }, + { + "epoch": 1.38, + "learning_rate": 3.0018719914423245e-07, + "logits/chosen": -2.4801878929138184, + "logits/rejected": -2.505631446838379, + "logps/chosen": -272.7338562011719, + "logps/rejected": -355.825439453125, + "loss": 0.0679, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.3978612422943115, + "rewards/margins": 8.758639335632324, + "rewards/rejected": -11.156499862670898, + "step": 5730 + }, + { + "epoch": 1.38, + "learning_rate": 2.997414868960599e-07, + "logits/chosen": -2.3838882446289062, + "logits/rejected": -2.4003381729125977, + "logps/chosen": -213.65060424804688, + "logps/rejected": -358.7235412597656, + "loss": 0.1576, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.312072277069092, + "rewards/margins": 7.0126800537109375, + "rewards/rejected": -10.324752807617188, + "step": 5740 + }, + { + "epoch": 1.38, + "learning_rate": 2.992957746478873e-07, + "logits/chosen": -2.3882734775543213, + "logits/rejected": -2.3687222003936768, + "logps/chosen": -181.3330535888672, + "logps/rejected": -329.9533386230469, + "loss": 0.0921, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.8159842491149902, + "rewards/margins": 7.217014312744141, + "rewards/rejected": -10.032999038696289, + "step": 5750 + }, + { + "epoch": 1.39, + "learning_rate": 2.988500623997147e-07, + "logits/chosen": -2.4455018043518066, + "logits/rejected": -2.3834733963012695, + "logps/chosen": -220.39810180664062, + "logps/rejected": -276.8576965332031, + "loss": 0.1322, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6589540243148804, + "rewards/margins": 6.164792060852051, + "rewards/rejected": -7.823746681213379, + "step": 5760 + }, + { + "epoch": 1.39, + "learning_rate": 2.9840435015154216e-07, + "logits/chosen": -2.2847981452941895, + "logits/rejected": -2.1710095405578613, + "logps/chosen": -263.8554382324219, + "logps/rejected": -264.84405517578125, + "loss": 0.1504, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.1371574401855469, + "rewards/margins": 7.276597023010254, + "rewards/rejected": -8.4137544631958, + "step": 5770 + }, + { + "epoch": 1.39, + "learning_rate": 2.9795863790336957e-07, + "logits/chosen": -1.97348952293396, + "logits/rejected": -2.097926139831543, + "logps/chosen": -215.86856079101562, + "logps/rejected": -299.7110595703125, + "loss": 0.1016, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.6454339027404785, + "rewards/margins": 5.7139201164245605, + "rewards/rejected": -8.359354019165039, + "step": 5780 + }, + { + "epoch": 1.39, + "learning_rate": 2.9751292565519697e-07, + "logits/chosen": -2.5739948749542236, + "logits/rejected": -2.4434008598327637, + "logps/chosen": -332.3185119628906, + "logps/rejected": -269.8470153808594, + "loss": 0.1018, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.0568695068359375, + "rewards/margins": 5.8183512687683105, + "rewards/rejected": -7.87522029876709, + "step": 5790 + }, + { + "epoch": 1.4, + "learning_rate": 2.9706721340702437e-07, + "logits/chosen": -2.3245058059692383, + "logits/rejected": -2.3892173767089844, + "logps/chosen": -261.55718994140625, + "logps/rejected": -333.1568298339844, + "loss": 0.1085, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.437150239944458, + "rewards/margins": 8.149467468261719, + "rewards/rejected": -9.586616516113281, + "step": 5800 + }, + { + "epoch": 1.4, + "learning_rate": 2.9662150115885183e-07, + "logits/chosen": -2.138545274734497, + "logits/rejected": -2.2848308086395264, + "logps/chosen": -298.55218505859375, + "logps/rejected": -374.3736877441406, + "loss": 0.106, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.0430874340236187, + "rewards/margins": 9.399953842163086, + "rewards/rejected": -9.44304084777832, + "step": 5810 + }, + { + "epoch": 1.4, + "learning_rate": 2.9617578891067923e-07, + "logits/chosen": -2.4602222442626953, + "logits/rejected": -2.3467583656311035, + "logps/chosen": -264.9916687011719, + "logps/rejected": -325.85040283203125, + "loss": 0.0602, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7568753957748413, + "rewards/margins": 7.706831455230713, + "rewards/rejected": -9.463706970214844, + "step": 5820 + }, + { + "epoch": 1.4, + "learning_rate": 2.9573007666250663e-07, + "logits/chosen": -2.4260144233703613, + "logits/rejected": -2.1966934204101562, + "logps/chosen": -301.56878662109375, + "logps/rejected": -282.50006103515625, + "loss": 0.0655, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.379481554031372, + "rewards/margins": 6.654795169830322, + "rewards/rejected": -9.034276962280273, + "step": 5830 + }, + { + "epoch": 1.41, + "learning_rate": 2.952843644143341e-07, + "logits/chosen": -2.232840061187744, + "logits/rejected": -2.278740406036377, + "logps/chosen": -355.1175231933594, + "logps/rejected": -432.6170349121094, + "loss": 0.1483, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.7908014059066772, + "rewards/margins": 5.564481735229492, + "rewards/rejected": -7.355282783508301, + "step": 5840 + }, + { + "epoch": 1.41, + "learning_rate": 2.948386521661615e-07, + "logits/chosen": -2.609567403793335, + "logits/rejected": -2.523547649383545, + "logps/chosen": -227.25277709960938, + "logps/rejected": -272.94561767578125, + "loss": 0.1382, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8979425430297852, + "rewards/margins": 8.014738082885742, + "rewards/rejected": -9.912680625915527, + "step": 5850 + }, + { + "epoch": 1.41, + "learning_rate": 2.943929399179889e-07, + "logits/chosen": -2.6208488941192627, + "logits/rejected": -2.610959768295288, + "logps/chosen": -242.736083984375, + "logps/rejected": -297.24835205078125, + "loss": 0.15, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.2648603916168213, + "rewards/margins": 6.404683589935303, + "rewards/rejected": -8.669544219970703, + "step": 5860 + }, + { + "epoch": 1.41, + "learning_rate": 2.9394722766981635e-07, + "logits/chosen": -2.3997504711151123, + "logits/rejected": -2.243252992630005, + "logps/chosen": -321.68878173828125, + "logps/rejected": -326.05242919921875, + "loss": 0.1711, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2938514947891235, + "rewards/margins": 7.7465667724609375, + "rewards/rejected": -9.04041862487793, + "step": 5870 + }, + { + "epoch": 1.42, + "learning_rate": 2.9350151542164375e-07, + "logits/chosen": -2.440444231033325, + "logits/rejected": -2.283862590789795, + "logps/chosen": -273.25384521484375, + "logps/rejected": -276.9793701171875, + "loss": 0.0978, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.1437404900789261, + "rewards/margins": 8.990808486938477, + "rewards/rejected": -8.847066879272461, + "step": 5880 + }, + { + "epoch": 1.42, + "learning_rate": 2.9305580317347116e-07, + "logits/chosen": -2.2605152130126953, + "logits/rejected": -2.250079393386841, + "logps/chosen": -291.37786865234375, + "logps/rejected": -414.73602294921875, + "loss": 0.1424, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.4673226773738861, + "rewards/margins": 10.09349250793457, + "rewards/rejected": -10.560815811157227, + "step": 5890 + }, + { + "epoch": 1.42, + "learning_rate": 2.926100909252986e-07, + "logits/chosen": -2.527416229248047, + "logits/rejected": -2.365100145339966, + "logps/chosen": -224.0863494873047, + "logps/rejected": -231.84524536132812, + "loss": 0.0941, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3509066104888916, + "rewards/margins": 5.722054481506348, + "rewards/rejected": -7.07296085357666, + "step": 5900 + }, + { + "epoch": 1.42, + "learning_rate": 2.92164378677126e-07, + "logits/chosen": -2.506539821624756, + "logits/rejected": -2.341373920440674, + "logps/chosen": -362.89874267578125, + "logps/rejected": -298.7274169921875, + "loss": 0.0757, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.1734457015991211, + "rewards/margins": 7.619866371154785, + "rewards/rejected": -7.793312072753906, + "step": 5910 + }, + { + "epoch": 1.42, + "learning_rate": 2.917186664289534e-07, + "logits/chosen": -2.5742313861846924, + "logits/rejected": -2.57194185256958, + "logps/chosen": -299.4224853515625, + "logps/rejected": -357.34814453125, + "loss": 0.1328, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.2363429069519043, + "rewards/margins": 8.072032928466797, + "rewards/rejected": -8.308377265930176, + "step": 5920 + }, + { + "epoch": 1.43, + "learning_rate": 2.912729541807809e-07, + "logits/chosen": -2.4831509590148926, + "logits/rejected": -2.3615055084228516, + "logps/chosen": -299.07000732421875, + "logps/rejected": -282.1550598144531, + "loss": 0.128, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6664316654205322, + "rewards/margins": 6.462451934814453, + "rewards/rejected": -7.128883361816406, + "step": 5930 + }, + { + "epoch": 1.43, + "learning_rate": 2.908272419326083e-07, + "logits/chosen": -2.446892261505127, + "logits/rejected": -2.4011735916137695, + "logps/chosen": -290.58453369140625, + "logps/rejected": -342.33673095703125, + "loss": 0.13, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9740456342697144, + "rewards/margins": 6.794226169586182, + "rewards/rejected": -7.768272399902344, + "step": 5940 + }, + { + "epoch": 1.43, + "learning_rate": 2.903815296844357e-07, + "logits/chosen": -2.438033103942871, + "logits/rejected": -2.404005527496338, + "logps/chosen": -215.423095703125, + "logps/rejected": -287.54266357421875, + "loss": 0.0582, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6465083360671997, + "rewards/margins": 5.986216068267822, + "rewards/rejected": -6.632723808288574, + "step": 5950 + }, + { + "epoch": 1.43, + "learning_rate": 2.899358174362631e-07, + "logits/chosen": -2.4988837242126465, + "logits/rejected": -2.3473308086395264, + "logps/chosen": -323.3792419433594, + "logps/rejected": -318.721923828125, + "loss": 0.1268, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.178083896636963, + "rewards/margins": 5.7345428466796875, + "rewards/rejected": -7.91262674331665, + "step": 5960 + }, + { + "epoch": 1.44, + "learning_rate": 2.894901051880906e-07, + "logits/chosen": -2.4918477535247803, + "logits/rejected": -2.487396478652954, + "logps/chosen": -294.41302490234375, + "logps/rejected": -397.73846435546875, + "loss": 0.0924, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.4407653212547302, + "rewards/margins": 9.137907981872559, + "rewards/rejected": -8.697144508361816, + "step": 5970 + }, + { + "epoch": 1.44, + "learning_rate": 2.89044392939918e-07, + "logits/chosen": -2.275686740875244, + "logits/rejected": -2.2461109161376953, + "logps/chosen": -275.2303161621094, + "logps/rejected": -260.4136657714844, + "loss": 0.0834, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.20579476654529572, + "rewards/margins": 7.3556413650512695, + "rewards/rejected": -7.561434745788574, + "step": 5980 + }, + { + "epoch": 1.44, + "learning_rate": 2.885986806917454e-07, + "logits/chosen": -2.445239305496216, + "logits/rejected": -2.4250895977020264, + "logps/chosen": -253.6104278564453, + "logps/rejected": -262.63494873046875, + "loss": 0.1374, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.17287278175354, + "rewards/margins": 6.609041690826416, + "rewards/rejected": -7.781915187835693, + "step": 5990 + }, + { + "epoch": 1.44, + "learning_rate": 2.8815296844357285e-07, + "logits/chosen": -2.5300464630126953, + "logits/rejected": -2.318606376647949, + "logps/chosen": -294.62908935546875, + "logps/rejected": -317.3363952636719, + "loss": 0.1301, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.2451810836791992, + "rewards/margins": 7.596142768859863, + "rewards/rejected": -8.841323852539062, + "step": 6000 + }, + { + "epoch": 1.44, + "eval_logits/chosen": -2.1991782188415527, + "eval_logits/rejected": -2.160623073577881, + "eval_logps/chosen": -242.2802276611328, + "eval_logps/rejected": -255.31179809570312, + "eval_loss": 0.5189629197120667, + "eval_rewards/accuracies": 0.6850000023841858, + "eval_rewards/chosen": -4.035269737243652, + "eval_rewards/margins": 2.5428318977355957, + "eval_rewards/rejected": -6.578101634979248, + "eval_runtime": 133.762, + "eval_samples_per_second": 23.594, + "eval_steps_per_second": 0.374, + "step": 6000 + }, + { + "epoch": 1.45, + "learning_rate": 2.8770725619540026e-07, + "logits/chosen": -2.5709338188171387, + "logits/rejected": -2.4518074989318848, + "logps/chosen": -314.88885498046875, + "logps/rejected": -301.244140625, + "loss": 0.0742, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.20206031203269958, + "rewards/margins": 6.6858673095703125, + "rewards/rejected": -6.887927055358887, + "step": 6010 + }, + { + "epoch": 1.45, + "learning_rate": 2.8726154394722766e-07, + "logits/chosen": -2.309180498123169, + "logits/rejected": -2.3795578479766846, + "logps/chosen": -315.67608642578125, + "logps/rejected": -392.2315673828125, + "loss": 0.0897, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.9258975982666016, + "rewards/margins": 8.317319869995117, + "rewards/rejected": -10.243217468261719, + "step": 6020 + }, + { + "epoch": 1.45, + "learning_rate": 2.868158316990551e-07, + "logits/chosen": -2.2017533779144287, + "logits/rejected": -2.311129093170166, + "logps/chosen": -228.1017303466797, + "logps/rejected": -276.9014587402344, + "loss": 0.0993, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5432249903678894, + "rewards/margins": 7.823616027832031, + "rewards/rejected": -8.366842269897461, + "step": 6030 + }, + { + "epoch": 1.45, + "learning_rate": 2.863701194508825e-07, + "logits/chosen": -2.485600709915161, + "logits/rejected": -2.3876876831054688, + "logps/chosen": -299.70086669921875, + "logps/rejected": -301.390869140625, + "loss": 0.1048, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.8832274675369263, + "rewards/margins": 7.019740104675293, + "rewards/rejected": -7.902967929840088, + "step": 6040 + }, + { + "epoch": 1.46, + "learning_rate": 2.859244072027099e-07, + "logits/chosen": -2.3670849800109863, + "logits/rejected": -2.3215155601501465, + "logps/chosen": -349.9449157714844, + "logps/rejected": -337.61346435546875, + "loss": 0.0864, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.7926805019378662, + "rewards/margins": 7.919983863830566, + "rewards/rejected": -9.712663650512695, + "step": 6050 + }, + { + "epoch": 1.46, + "learning_rate": 2.854786949545374e-07, + "logits/chosen": -2.5219531059265137, + "logits/rejected": -2.479630947113037, + "logps/chosen": -402.0883483886719, + "logps/rejected": -387.61468505859375, + "loss": 0.0834, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2764887511730194, + "rewards/margins": 7.328072547912598, + "rewards/rejected": -7.604561805725098, + "step": 6060 + }, + { + "epoch": 1.46, + "learning_rate": 2.850329827063648e-07, + "logits/chosen": -2.4510700702667236, + "logits/rejected": -2.43742036819458, + "logps/chosen": -216.06005859375, + "logps/rejected": -308.4542541503906, + "loss": 0.0882, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.940998911857605, + "rewards/margins": 10.316885948181152, + "rewards/rejected": -11.257884979248047, + "step": 6070 + }, + { + "epoch": 1.46, + "learning_rate": 2.845872704581922e-07, + "logits/chosen": -2.4044597148895264, + "logits/rejected": -2.3395800590515137, + "logps/chosen": -265.0940856933594, + "logps/rejected": -279.3209228515625, + "loss": 0.0436, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5933882594108582, + "rewards/margins": 7.275848388671875, + "rewards/rejected": -7.869236946105957, + "step": 6080 + }, + { + "epoch": 1.47, + "learning_rate": 2.8414155821001964e-07, + "logits/chosen": -2.4883437156677246, + "logits/rejected": -2.4578349590301514, + "logps/chosen": -292.8638916015625, + "logps/rejected": -271.67974853515625, + "loss": 0.1118, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9473133087158203, + "rewards/margins": 4.8426995277404785, + "rewards/rejected": -6.790012359619141, + "step": 6090 + }, + { + "epoch": 1.47, + "learning_rate": 2.8369584596184704e-07, + "logits/chosen": -2.505039691925049, + "logits/rejected": -2.3995630741119385, + "logps/chosen": -295.21234130859375, + "logps/rejected": -328.1549987792969, + "loss": 0.1052, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.7760322093963623, + "rewards/margins": 12.22137451171875, + "rewards/rejected": -10.445342063903809, + "step": 6100 + }, + { + "epoch": 1.47, + "learning_rate": 2.8325013371367444e-07, + "logits/chosen": -2.303755521774292, + "logits/rejected": -2.3532848358154297, + "logps/chosen": -219.3915557861328, + "logps/rejected": -285.55853271484375, + "loss": 0.0968, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.1725101470947266, + "rewards/margins": 7.460213661193848, + "rewards/rejected": -8.63272476196289, + "step": 6110 + }, + { + "epoch": 1.47, + "learning_rate": 2.828044214655019e-07, + "logits/chosen": -2.404218912124634, + "logits/rejected": -2.408658504486084, + "logps/chosen": -233.19583129882812, + "logps/rejected": -346.0060729980469, + "loss": 0.1007, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.6594064831733704, + "rewards/margins": 8.938102722167969, + "rewards/rejected": -9.597509384155273, + "step": 6120 + }, + { + "epoch": 1.48, + "learning_rate": 2.823587092173293e-07, + "logits/chosen": -2.45180344581604, + "logits/rejected": -2.4573071002960205, + "logps/chosen": -240.5504150390625, + "logps/rejected": -350.02386474609375, + "loss": 0.0596, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06072967126965523, + "rewards/margins": 8.110355377197266, + "rewards/rejected": -8.049626350402832, + "step": 6130 + }, + { + "epoch": 1.48, + "learning_rate": 2.819129969691567e-07, + "logits/chosen": -2.5962586402893066, + "logits/rejected": -2.6155471801757812, + "logps/chosen": -405.8052673339844, + "logps/rejected": -486.88604736328125, + "loss": 0.0864, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47397932410240173, + "rewards/margins": 10.261617660522461, + "rewards/rejected": -9.787638664245605, + "step": 6140 + }, + { + "epoch": 1.48, + "learning_rate": 2.814672847209841e-07, + "logits/chosen": -2.578833818435669, + "logits/rejected": -2.530801773071289, + "logps/chosen": -260.2953186035156, + "logps/rejected": -258.282958984375, + "loss": 0.0889, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.3781076669692993, + "rewards/margins": 5.418423652648926, + "rewards/rejected": -6.796531677246094, + "step": 6150 + }, + { + "epoch": 1.48, + "learning_rate": 2.8102157247281156e-07, + "logits/chosen": -2.2971138954162598, + "logits/rejected": -2.390742063522339, + "logps/chosen": -151.07147216796875, + "logps/rejected": -317.8833923339844, + "loss": 0.114, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.399823397397995, + "rewards/margins": 11.081315994262695, + "rewards/rejected": -11.481138229370117, + "step": 6160 + }, + { + "epoch": 1.48, + "learning_rate": 2.8057586022463897e-07, + "logits/chosen": -2.4942171573638916, + "logits/rejected": -2.372739553451538, + "logps/chosen": -300.8179016113281, + "logps/rejected": -292.6261901855469, + "loss": 0.0753, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7665277719497681, + "rewards/margins": 6.451620578765869, + "rewards/rejected": -7.218148231506348, + "step": 6170 + }, + { + "epoch": 1.49, + "learning_rate": 2.8013014797646637e-07, + "logits/chosen": -2.4573559761047363, + "logits/rejected": -2.380481719970703, + "logps/chosen": -330.7941589355469, + "logps/rejected": -366.9463195800781, + "loss": 0.1366, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.0739825963973999, + "rewards/margins": 9.741181373596191, + "rewards/rejected": -9.667200088500977, + "step": 6180 + }, + { + "epoch": 1.49, + "learning_rate": 2.796844357282938e-07, + "logits/chosen": -2.4501938819885254, + "logits/rejected": -2.291347026824951, + "logps/chosen": -301.8091125488281, + "logps/rejected": -354.0787353515625, + "loss": 0.0845, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13122543692588806, + "rewards/margins": 10.396775245666504, + "rewards/rejected": -10.265549659729004, + "step": 6190 + }, + { + "epoch": 1.49, + "learning_rate": 2.7923872348012123e-07, + "logits/chosen": -2.2618536949157715, + "logits/rejected": -2.407378673553467, + "logps/chosen": -209.48526000976562, + "logps/rejected": -390.2980651855469, + "loss": 0.0843, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5589649677276611, + "rewards/margins": 8.035758972167969, + "rewards/rejected": -9.594724655151367, + "step": 6200 + }, + { + "epoch": 1.49, + "learning_rate": 2.7879301123194863e-07, + "logits/chosen": -2.3829917907714844, + "logits/rejected": -2.404918670654297, + "logps/chosen": -357.5557556152344, + "logps/rejected": -368.6526794433594, + "loss": 0.0953, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.1500093936920166, + "rewards/margins": 6.868212699890137, + "rewards/rejected": -8.018221855163574, + "step": 6210 + }, + { + "epoch": 1.5, + "learning_rate": 2.783472989837761e-07, + "logits/chosen": -2.3633790016174316, + "logits/rejected": -2.3713860511779785, + "logps/chosen": -240.88668823242188, + "logps/rejected": -341.467041015625, + "loss": 0.1382, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3870057165622711, + "rewards/margins": 8.991482734680176, + "rewards/rejected": -9.378487586975098, + "step": 6220 + }, + { + "epoch": 1.5, + "learning_rate": 2.779015867356035e-07, + "logits/chosen": -2.6395559310913086, + "logits/rejected": -2.4677324295043945, + "logps/chosen": -356.03375244140625, + "logps/rejected": -383.40850830078125, + "loss": 0.0956, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.233582615852356, + "rewards/margins": 8.020268440246582, + "rewards/rejected": -9.253849983215332, + "step": 6230 + }, + { + "epoch": 1.5, + "learning_rate": 2.774558744874309e-07, + "logits/chosen": -2.539665460586548, + "logits/rejected": -2.538662910461426, + "logps/chosen": -249.73861694335938, + "logps/rejected": -321.86932373046875, + "loss": 0.0861, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.562501311302185, + "rewards/margins": 7.141868591308594, + "rewards/rejected": -8.70436954498291, + "step": 6240 + }, + { + "epoch": 1.5, + "learning_rate": 2.7701016223925835e-07, + "logits/chosen": -2.4371912479400635, + "logits/rejected": -2.4786174297332764, + "logps/chosen": -247.0255126953125, + "logps/rejected": -413.2113342285156, + "loss": 0.0644, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.31962472200393677, + "rewards/margins": 10.500336647033691, + "rewards/rejected": -10.18071174621582, + "step": 6250 + }, + { + "epoch": 1.51, + "learning_rate": 2.7656444999108575e-07, + "logits/chosen": -2.4717249870300293, + "logits/rejected": -2.44594144821167, + "logps/chosen": -271.02960205078125, + "logps/rejected": -389.52252197265625, + "loss": 0.108, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7181816697120667, + "rewards/margins": 10.415331840515137, + "rewards/rejected": -9.697149276733398, + "step": 6260 + }, + { + "epoch": 1.51, + "learning_rate": 2.7611873774291315e-07, + "logits/chosen": -2.3463523387908936, + "logits/rejected": -2.2759034633636475, + "logps/chosen": -314.4310302734375, + "logps/rejected": -297.7897033691406, + "loss": 0.0727, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.059358298778533936, + "rewards/margins": 10.193583488464355, + "rewards/rejected": -10.134224891662598, + "step": 6270 + }, + { + "epoch": 1.51, + "learning_rate": 2.756730254947406e-07, + "logits/chosen": -2.372990608215332, + "logits/rejected": -2.3513102531433105, + "logps/chosen": -187.26589965820312, + "logps/rejected": -298.1798400878906, + "loss": 0.0952, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.383110761642456, + "rewards/margins": 7.358151435852051, + "rewards/rejected": -8.741262435913086, + "step": 6280 + }, + { + "epoch": 1.51, + "learning_rate": 2.75227313246568e-07, + "logits/chosen": -2.606323719024658, + "logits/rejected": -2.582251787185669, + "logps/chosen": -247.609130859375, + "logps/rejected": -230.36495971679688, + "loss": 0.1148, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.3474022150039673, + "rewards/margins": 5.080073356628418, + "rewards/rejected": -6.427475929260254, + "step": 6290 + }, + { + "epoch": 1.52, + "learning_rate": 2.747816009983954e-07, + "logits/chosen": -2.346926212310791, + "logits/rejected": -2.3074090480804443, + "logps/chosen": -197.5868682861328, + "logps/rejected": -285.4720458984375, + "loss": 0.1181, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.5006293058395386, + "rewards/margins": 5.138708114624023, + "rewards/rejected": -6.63933801651001, + "step": 6300 + }, + { + "epoch": 1.52, + "learning_rate": 2.743358887502228e-07, + "logits/chosen": -2.4426982402801514, + "logits/rejected": -2.5822272300720215, + "logps/chosen": -213.3493194580078, + "logps/rejected": -277.4560546875, + "loss": 0.1475, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.737618088722229, + "rewards/margins": 6.50905704498291, + "rewards/rejected": -8.246675491333008, + "step": 6310 + }, + { + "epoch": 1.52, + "learning_rate": 2.738901765020503e-07, + "logits/chosen": -2.6064677238464355, + "logits/rejected": -2.637498140335083, + "logps/chosen": -262.9438171386719, + "logps/rejected": -315.25433349609375, + "loss": 0.1514, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.899113655090332, + "rewards/margins": 6.5391130447387695, + "rewards/rejected": -8.438225746154785, + "step": 6320 + }, + { + "epoch": 1.52, + "learning_rate": 2.734444642538777e-07, + "logits/chosen": -2.5768630504608154, + "logits/rejected": -2.456533432006836, + "logps/chosen": -272.20147705078125, + "logps/rejected": -294.9117736816406, + "loss": 0.139, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5142834186553955, + "rewards/margins": 8.897344589233398, + "rewards/rejected": -7.383061408996582, + "step": 6330 + }, + { + "epoch": 1.53, + "learning_rate": 2.729987520057051e-07, + "logits/chosen": -2.643470287322998, + "logits/rejected": -2.7008514404296875, + "logps/chosen": -274.010498046875, + "logps/rejected": -338.3312072753906, + "loss": 0.1023, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.253525733947754, + "rewards/margins": 9.714042663574219, + "rewards/rejected": -8.460517883300781, + "step": 6340 + }, + { + "epoch": 1.53, + "learning_rate": 2.7255303975753254e-07, + "logits/chosen": -2.512911558151245, + "logits/rejected": -2.430452823638916, + "logps/chosen": -207.1924285888672, + "logps/rejected": -359.22052001953125, + "loss": 0.0731, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.1545320749282837, + "rewards/margins": 8.157306671142578, + "rewards/rejected": -9.311838150024414, + "step": 6350 + }, + { + "epoch": 1.53, + "learning_rate": 2.7210732750935994e-07, + "logits/chosen": -2.48407244682312, + "logits/rejected": -2.511460781097412, + "logps/chosen": -254.64480590820312, + "logps/rejected": -264.3612060546875, + "loss": 0.1433, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.2126295566558838, + "rewards/margins": 5.900941371917725, + "rewards/rejected": -7.1135711669921875, + "step": 6360 + }, + { + "epoch": 1.53, + "learning_rate": 2.7166161526118734e-07, + "logits/chosen": -2.504002094268799, + "logits/rejected": -2.3776869773864746, + "logps/chosen": -369.0848388671875, + "logps/rejected": -354.16552734375, + "loss": 0.0774, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13143929839134216, + "rewards/margins": 8.589181900024414, + "rewards/rejected": -8.457742691040039, + "step": 6370 + }, + { + "epoch": 1.54, + "learning_rate": 2.712159030130148e-07, + "logits/chosen": -2.540222406387329, + "logits/rejected": -2.3949813842773438, + "logps/chosen": -284.43585205078125, + "logps/rejected": -285.30389404296875, + "loss": 0.081, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.8853492736816406, + "rewards/margins": 4.8586015701293945, + "rewards/rejected": -7.743950843811035, + "step": 6380 + }, + { + "epoch": 1.54, + "learning_rate": 2.707701907648422e-07, + "logits/chosen": -2.4505486488342285, + "logits/rejected": -2.3909640312194824, + "logps/chosen": -290.6482849121094, + "logps/rejected": -317.31573486328125, + "loss": 0.1226, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6440604329109192, + "rewards/margins": 8.36026382446289, + "rewards/rejected": -9.004323959350586, + "step": 6390 + }, + { + "epoch": 1.54, + "learning_rate": 2.703244785166696e-07, + "logits/chosen": -2.58925461769104, + "logits/rejected": -2.511303424835205, + "logps/chosen": -195.77783203125, + "logps/rejected": -212.7223358154297, + "loss": 0.0789, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6220035552978516, + "rewards/margins": 6.110626220703125, + "rewards/rejected": -7.732629299163818, + "step": 6400 + }, + { + "epoch": 1.54, + "eval_logits/chosen": -2.2593064308166504, + "eval_logits/rejected": -2.221984624862671, + "eval_logps/chosen": -248.052734375, + "eval_logps/rejected": -263.1014709472656, + "eval_loss": 0.5184081196784973, + "eval_rewards/accuracies": 0.6775000095367432, + "eval_rewards/chosen": -4.6125168800354, + "eval_rewards/margins": 2.744553804397583, + "eval_rewards/rejected": -7.357071399688721, + "eval_runtime": 131.3004, + "eval_samples_per_second": 24.036, + "eval_steps_per_second": 0.381, + "step": 6400 + }, + { + "epoch": 1.54, + "learning_rate": 2.6987876626849706e-07, + "logits/chosen": -2.4880471229553223, + "logits/rejected": -2.4972970485687256, + "logps/chosen": -200.48007202148438, + "logps/rejected": -252.0703582763672, + "loss": 0.0698, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7859618663787842, + "rewards/margins": 5.987399578094482, + "rewards/rejected": -7.7733612060546875, + "step": 6410 + }, + { + "epoch": 1.55, + "learning_rate": 2.6943305402032446e-07, + "logits/chosen": -2.5299830436706543, + "logits/rejected": -2.304743528366089, + "logps/chosen": -363.29229736328125, + "logps/rejected": -313.2667541503906, + "loss": 0.091, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.4549384117126465, + "rewards/margins": 10.024370193481445, + "rewards/rejected": -9.569430351257324, + "step": 6420 + }, + { + "epoch": 1.55, + "learning_rate": 2.6898734177215186e-07, + "logits/chosen": -2.4846057891845703, + "logits/rejected": -2.4044010639190674, + "logps/chosen": -219.111083984375, + "logps/rejected": -255.8389892578125, + "loss": 0.1175, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.520255208015442, + "rewards/margins": 5.579611778259277, + "rewards/rejected": -7.099867343902588, + "step": 6430 + }, + { + "epoch": 1.55, + "learning_rate": 2.685416295239793e-07, + "logits/chosen": -2.5400466918945312, + "logits/rejected": -2.5766820907592773, + "logps/chosen": -295.1035461425781, + "logps/rejected": -357.53485107421875, + "loss": 0.0802, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.18065910041332245, + "rewards/margins": 8.971368789672852, + "rewards/rejected": -9.152027130126953, + "step": 6440 + }, + { + "epoch": 1.55, + "learning_rate": 2.680959172758067e-07, + "logits/chosen": -2.555649518966675, + "logits/rejected": -2.529740810394287, + "logps/chosen": -233.1663818359375, + "logps/rejected": -264.34197998046875, + "loss": 0.1138, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.1419332027435303, + "rewards/margins": 5.442976951599121, + "rewards/rejected": -6.584909915924072, + "step": 6450 + }, + { + "epoch": 1.55, + "learning_rate": 2.676502050276341e-07, + "logits/chosen": -2.569977283477783, + "logits/rejected": -2.450141429901123, + "logps/chosen": -269.0093994140625, + "logps/rejected": -326.5610046386719, + "loss": 0.0904, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8578832149505615, + "rewards/margins": 6.224945545196533, + "rewards/rejected": -8.0828275680542, + "step": 6460 + }, + { + "epoch": 1.56, + "learning_rate": 2.6720449277946153e-07, + "logits/chosen": -2.3317441940307617, + "logits/rejected": -2.5061981678009033, + "logps/chosen": -315.4051513671875, + "logps/rejected": -441.2271423339844, + "loss": 0.091, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.8155472874641418, + "rewards/margins": 10.180726051330566, + "rewards/rejected": -9.365178108215332, + "step": 6470 + }, + { + "epoch": 1.56, + "learning_rate": 2.66758780531289e-07, + "logits/chosen": -2.6900036334991455, + "logits/rejected": -2.5762057304382324, + "logps/chosen": -207.0538330078125, + "logps/rejected": -264.35064697265625, + "loss": 0.0906, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.15104790031909943, + "rewards/margins": 7.0734100341796875, + "rewards/rejected": -7.224459171295166, + "step": 6480 + }, + { + "epoch": 1.56, + "learning_rate": 2.663130682831164e-07, + "logits/chosen": -2.6154706478118896, + "logits/rejected": -2.4836790561676025, + "logps/chosen": -272.6229553222656, + "logps/rejected": -256.09967041015625, + "loss": 0.1349, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.21876728534698486, + "rewards/margins": 7.674716949462891, + "rewards/rejected": -7.455949306488037, + "step": 6490 + }, + { + "epoch": 1.56, + "learning_rate": 2.658673560349438e-07, + "logits/chosen": -2.4315361976623535, + "logits/rejected": -2.423959493637085, + "logps/chosen": -261.1012268066406, + "logps/rejected": -308.75390625, + "loss": 0.1466, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.2184690237045288, + "rewards/margins": 6.828681945800781, + "rewards/rejected": -8.047151565551758, + "step": 6500 + }, + { + "epoch": 1.57, + "learning_rate": 2.6542164378677125e-07, + "logits/chosen": -2.3758773803710938, + "logits/rejected": -2.428539276123047, + "logps/chosen": -224.01278686523438, + "logps/rejected": -287.84381103515625, + "loss": 0.2044, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1266942024230957, + "rewards/margins": 5.721441268920898, + "rewards/rejected": -7.848135471343994, + "step": 6510 + }, + { + "epoch": 1.57, + "learning_rate": 2.6497593153859865e-07, + "logits/chosen": -2.7203621864318848, + "logits/rejected": -2.6722311973571777, + "logps/chosen": -295.2500915527344, + "logps/rejected": -343.1181640625, + "loss": 0.0506, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.597442626953125, + "rewards/margins": 7.230844020843506, + "rewards/rejected": -7.828286647796631, + "step": 6520 + }, + { + "epoch": 1.57, + "learning_rate": 2.6453021929042605e-07, + "logits/chosen": -2.5043766498565674, + "logits/rejected": -2.419970750808716, + "logps/chosen": -243.61221313476562, + "logps/rejected": -362.5774230957031, + "loss": 0.0978, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27581435441970825, + "rewards/margins": 7.525368690490723, + "rewards/rejected": -7.801183223724365, + "step": 6530 + }, + { + "epoch": 1.57, + "learning_rate": 2.640845070422535e-07, + "logits/chosen": -2.360407590866089, + "logits/rejected": -2.2817015647888184, + "logps/chosen": -203.2313995361328, + "logps/rejected": -193.2275848388672, + "loss": 0.092, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.4021323919296265, + "rewards/margins": 5.137397289276123, + "rewards/rejected": -6.539530277252197, + "step": 6540 + }, + { + "epoch": 1.58, + "learning_rate": 2.636387947940809e-07, + "logits/chosen": -2.5603091716766357, + "logits/rejected": -2.5812692642211914, + "logps/chosen": -252.48324584960938, + "logps/rejected": -280.01495361328125, + "loss": 0.1424, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.5431323051452637, + "rewards/margins": 4.878392696380615, + "rewards/rejected": -7.421525001525879, + "step": 6550 + }, + { + "epoch": 1.58, + "learning_rate": 2.631930825459083e-07, + "logits/chosen": -2.4909424781799316, + "logits/rejected": -2.47674822807312, + "logps/chosen": -203.90139770507812, + "logps/rejected": -323.13922119140625, + "loss": 0.0653, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.3505728244781494, + "rewards/margins": 6.907691955566406, + "rewards/rejected": -9.258264541625977, + "step": 6560 + }, + { + "epoch": 1.58, + "learning_rate": 2.6274737029773577e-07, + "logits/chosen": -2.509265184402466, + "logits/rejected": -2.5538954734802246, + "logps/chosen": -204.9351348876953, + "logps/rejected": -340.92047119140625, + "loss": 0.0754, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5897654294967651, + "rewards/margins": 7.562819480895996, + "rewards/rejected": -8.152585983276367, + "step": 6570 + }, + { + "epoch": 1.58, + "learning_rate": 2.6230165804956317e-07, + "logits/chosen": -2.493110179901123, + "logits/rejected": -2.3449318408966064, + "logps/chosen": -273.14569091796875, + "logps/rejected": -325.2339782714844, + "loss": 0.0913, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.365891456604004, + "rewards/margins": 6.012226581573486, + "rewards/rejected": -7.37811803817749, + "step": 6580 + }, + { + "epoch": 1.59, + "learning_rate": 2.618559458013906e-07, + "logits/chosen": -2.7275311946868896, + "logits/rejected": -2.59036922454834, + "logps/chosen": -288.2721862792969, + "logps/rejected": -318.25701904296875, + "loss": 0.1091, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.019632434472441673, + "rewards/margins": 8.365331649780273, + "rewards/rejected": -8.345699310302734, + "step": 6590 + }, + { + "epoch": 1.59, + "learning_rate": 2.6141023355321803e-07, + "logits/chosen": -2.4824635982513428, + "logits/rejected": -2.421909809112549, + "logps/chosen": -401.34173583984375, + "logps/rejected": -400.68011474609375, + "loss": 0.1259, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.043386459350586, + "rewards/margins": 8.686726570129395, + "rewards/rejected": -9.730113983154297, + "step": 6600 + }, + { + "epoch": 1.59, + "learning_rate": 2.6096452130504543e-07, + "logits/chosen": -2.619050979614258, + "logits/rejected": -2.5701723098754883, + "logps/chosen": -281.85980224609375, + "logps/rejected": -437.05523681640625, + "loss": 0.0678, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.3362850546836853, + "rewards/margins": 9.662748336791992, + "rewards/rejected": -9.999032974243164, + "step": 6610 + }, + { + "epoch": 1.59, + "learning_rate": 2.6051880905687284e-07, + "logits/chosen": -2.6397461891174316, + "logits/rejected": -2.595975637435913, + "logps/chosen": -291.05242919921875, + "logps/rejected": -292.87384033203125, + "loss": 0.1049, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.23073410987854004, + "rewards/margins": 6.005520820617676, + "rewards/rejected": -6.236255168914795, + "step": 6620 + }, + { + "epoch": 1.6, + "learning_rate": 2.6007309680870024e-07, + "logits/chosen": -2.5201048851013184, + "logits/rejected": -2.3694489002227783, + "logps/chosen": -304.94183349609375, + "logps/rejected": -335.90234375, + "loss": 0.093, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.141577959060669, + "rewards/margins": 7.916557312011719, + "rewards/rejected": -9.058136940002441, + "step": 6630 + }, + { + "epoch": 1.6, + "learning_rate": 2.596273845605277e-07, + "logits/chosen": -2.6051645278930664, + "logits/rejected": -2.445042133331299, + "logps/chosen": -376.5633239746094, + "logps/rejected": -316.4339294433594, + "loss": 0.1029, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18511851131916046, + "rewards/margins": 7.910219669342041, + "rewards/rejected": -7.725100040435791, + "step": 6640 + }, + { + "epoch": 1.6, + "learning_rate": 2.591816723123551e-07, + "logits/chosen": -2.521838426589966, + "logits/rejected": -2.405672311782837, + "logps/chosen": -268.4342956542969, + "logps/rejected": -374.9623718261719, + "loss": 0.0886, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1511344462633133, + "rewards/margins": 11.82387924194336, + "rewards/rejected": -11.672745704650879, + "step": 6650 + }, + { + "epoch": 1.6, + "learning_rate": 2.5873596006418255e-07, + "logits/chosen": -2.351754665374756, + "logits/rejected": -2.4746947288513184, + "logps/chosen": -174.59805297851562, + "logps/rejected": -222.7696075439453, + "loss": 0.102, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.877276062965393, + "rewards/margins": 4.7677321434021, + "rewards/rejected": -6.645008087158203, + "step": 6660 + }, + { + "epoch": 1.61, + "learning_rate": 2.5829024781601e-07, + "logits/chosen": -2.329702138900757, + "logits/rejected": -2.3888792991638184, + "logps/chosen": -282.3338928222656, + "logps/rejected": -427.6305236816406, + "loss": 0.1305, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.5918933153152466, + "rewards/margins": 9.590441703796387, + "rewards/rejected": -10.18233585357666, + "step": 6670 + }, + { + "epoch": 1.61, + "learning_rate": 2.578445355678374e-07, + "logits/chosen": -2.5636491775512695, + "logits/rejected": -2.4580605030059814, + "logps/chosen": -180.61399841308594, + "logps/rejected": -246.103759765625, + "loss": 0.1198, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.5207059383392334, + "rewards/margins": 5.905128479003906, + "rewards/rejected": -8.425833702087402, + "step": 6680 + }, + { + "epoch": 1.61, + "learning_rate": 2.573988233196648e-07, + "logits/chosen": -2.4034101963043213, + "logits/rejected": -2.450488567352295, + "logps/chosen": -212.5645294189453, + "logps/rejected": -364.51800537109375, + "loss": 0.0828, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6967671513557434, + "rewards/margins": 9.882649421691895, + "rewards/rejected": -10.579416275024414, + "step": 6690 + }, + { + "epoch": 1.61, + "learning_rate": 2.5695311107149227e-07, + "logits/chosen": -2.623979091644287, + "logits/rejected": -2.510005474090576, + "logps/chosen": -323.07916259765625, + "logps/rejected": -392.9112548828125, + "loss": 0.0996, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.5257644653320312, + "rewards/margins": 10.530187606811523, + "rewards/rejected": -12.055953025817871, + "step": 6700 + }, + { + "epoch": 1.61, + "learning_rate": 2.565073988233197e-07, + "logits/chosen": -2.5732877254486084, + "logits/rejected": -2.4544434547424316, + "logps/chosen": -191.06942749023438, + "logps/rejected": -275.33612060546875, + "loss": 0.1, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.2350542545318604, + "rewards/margins": 6.511356353759766, + "rewards/rejected": -8.74640941619873, + "step": 6710 + }, + { + "epoch": 1.62, + "learning_rate": 2.560616865751471e-07, + "logits/chosen": -2.662848949432373, + "logits/rejected": -2.6330981254577637, + "logps/chosen": -295.5281677246094, + "logps/rejected": -355.705810546875, + "loss": 0.071, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.05338757112622261, + "rewards/margins": 9.251906394958496, + "rewards/rejected": -9.30529499053955, + "step": 6720 + }, + { + "epoch": 1.62, + "learning_rate": 2.5561597432697453e-07, + "logits/chosen": -2.643730640411377, + "logits/rejected": -2.5296835899353027, + "logps/chosen": -277.45465087890625, + "logps/rejected": -385.3217468261719, + "loss": 0.0975, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6557458639144897, + "rewards/margins": 9.16593074798584, + "rewards/rejected": -9.821678161621094, + "step": 6730 + }, + { + "epoch": 1.62, + "learning_rate": 2.5517026207880194e-07, + "logits/chosen": -2.7320146560668945, + "logits/rejected": -2.4216275215148926, + "logps/chosen": -343.2032165527344, + "logps/rejected": -280.29327392578125, + "loss": 0.1192, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42550569772720337, + "rewards/margins": 7.7524094581604, + "rewards/rejected": -8.177915573120117, + "step": 6740 + }, + { + "epoch": 1.62, + "learning_rate": 2.5472454983062934e-07, + "logits/chosen": -2.399095058441162, + "logits/rejected": -2.4269192218780518, + "logps/chosen": -265.23016357421875, + "logps/rejected": -318.9640197753906, + "loss": 0.1568, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.0254114866256714, + "rewards/margins": 6.011340141296387, + "rewards/rejected": -7.036751747131348, + "step": 6750 + }, + { + "epoch": 1.63, + "learning_rate": 2.542788375824568e-07, + "logits/chosen": -2.751743793487549, + "logits/rejected": -2.635657787322998, + "logps/chosen": -289.47552490234375, + "logps/rejected": -322.9488830566406, + "loss": 0.1744, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7109371423721313, + "rewards/margins": 5.925104141235352, + "rewards/rejected": -6.636041164398193, + "step": 6760 + }, + { + "epoch": 1.63, + "learning_rate": 2.538331253342842e-07, + "logits/chosen": -2.623124599456787, + "logits/rejected": -2.4960763454437256, + "logps/chosen": -261.3349609375, + "logps/rejected": -296.0354309082031, + "loss": 0.0865, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.05223352834582329, + "rewards/margins": 8.582305908203125, + "rewards/rejected": -8.634539604187012, + "step": 6770 + }, + { + "epoch": 1.63, + "learning_rate": 2.533874130861116e-07, + "logits/chosen": -2.603426218032837, + "logits/rejected": -2.652134418487549, + "logps/chosen": -262.34393310546875, + "logps/rejected": -321.20977783203125, + "loss": 0.1384, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.04393220692873001, + "rewards/margins": 6.213263034820557, + "rewards/rejected": -6.257195949554443, + "step": 6780 + }, + { + "epoch": 1.63, + "learning_rate": 2.5294170083793906e-07, + "logits/chosen": -2.830530881881714, + "logits/rejected": -2.8101234436035156, + "logps/chosen": -294.06719970703125, + "logps/rejected": -318.9193420410156, + "loss": 0.0913, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9768505096435547, + "rewards/margins": 6.7217116355896, + "rewards/rejected": -8.698562622070312, + "step": 6790 + }, + { + "epoch": 1.64, + "learning_rate": 2.5249598858976646e-07, + "logits/chosen": -2.626084804534912, + "logits/rejected": -2.5628437995910645, + "logps/chosen": -315.3648986816406, + "logps/rejected": -332.21868896484375, + "loss": 0.1274, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.9488713145256042, + "rewards/margins": 9.667269706726074, + "rewards/rejected": -8.718399047851562, + "step": 6800 + }, + { + "epoch": 1.64, + "eval_logits/chosen": -2.365267753601074, + "eval_logits/rejected": -2.323819160461426, + "eval_logps/chosen": -241.0087127685547, + "eval_logps/rejected": -254.7549285888672, + "eval_loss": 0.5138404369354248, + "eval_rewards/accuracies": 0.6650000214576721, + "eval_rewards/chosen": -3.9081151485443115, + "eval_rewards/margins": 2.614298105239868, + "eval_rewards/rejected": -6.522413730621338, + "eval_runtime": 132.324, + "eval_samples_per_second": 23.851, + "eval_steps_per_second": 0.378, + "step": 6800 + }, + { + "epoch": 1.64, + "learning_rate": 2.5205027634159386e-07, + "logits/chosen": -2.77650785446167, + "logits/rejected": -2.554332733154297, + "logps/chosen": -264.7838134765625, + "logps/rejected": -263.158447265625, + "loss": 0.1154, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.0337071418762207, + "rewards/margins": 5.510695457458496, + "rewards/rejected": -7.544403076171875, + "step": 6810 + }, + { + "epoch": 1.64, + "learning_rate": 2.5160456409342126e-07, + "logits/chosen": -2.7782604694366455, + "logits/rejected": -2.7263779640197754, + "logps/chosen": -331.07598876953125, + "logps/rejected": -320.32696533203125, + "loss": 0.094, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01526255626231432, + "rewards/margins": 7.255690097808838, + "rewards/rejected": -7.240427494049072, + "step": 6820 + }, + { + "epoch": 1.64, + "learning_rate": 2.511588518452487e-07, + "logits/chosen": -2.7435340881347656, + "logits/rejected": -2.6303529739379883, + "logps/chosen": -292.32000732421875, + "logps/rejected": -340.9613037109375, + "loss": 0.0871, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.869441032409668, + "rewards/margins": 6.238544464111328, + "rewards/rejected": -8.10798454284668, + "step": 6830 + }, + { + "epoch": 1.65, + "learning_rate": 2.507131395970761e-07, + "logits/chosen": -2.6717050075531006, + "logits/rejected": -2.720207691192627, + "logps/chosen": -284.22979736328125, + "logps/rejected": -394.85430908203125, + "loss": 0.0877, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0687626600265503, + "rewards/margins": 8.739702224731445, + "rewards/rejected": -9.808465957641602, + "step": 6840 + }, + { + "epoch": 1.65, + "learning_rate": 2.502674273489035e-07, + "logits/chosen": -2.703340530395508, + "logits/rejected": -2.5412425994873047, + "logps/chosen": -339.227294921875, + "logps/rejected": -345.09332275390625, + "loss": 0.0969, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5817199945449829, + "rewards/margins": 7.840517520904541, + "rewards/rejected": -8.422237396240234, + "step": 6850 + }, + { + "epoch": 1.65, + "learning_rate": 2.49821715100731e-07, + "logits/chosen": -2.7909388542175293, + "logits/rejected": -2.589139699935913, + "logps/chosen": -278.817626953125, + "logps/rejected": -346.43548583984375, + "loss": 0.0485, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.4508662819862366, + "rewards/margins": 9.999730110168457, + "rewards/rejected": -9.548861503601074, + "step": 6860 + }, + { + "epoch": 1.65, + "learning_rate": 2.493760028525584e-07, + "logits/chosen": -2.43666672706604, + "logits/rejected": -2.3847169876098633, + "logps/chosen": -212.90188598632812, + "logps/rejected": -268.1318359375, + "loss": 0.0691, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.2572885751724243, + "rewards/margins": 6.4423394203186035, + "rewards/rejected": -7.6996283531188965, + "step": 6870 + }, + { + "epoch": 1.66, + "learning_rate": 2.489302906043858e-07, + "logits/chosen": -2.7076172828674316, + "logits/rejected": -2.6247425079345703, + "logps/chosen": -284.4060974121094, + "logps/rejected": -417.0311584472656, + "loss": 0.1088, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.40026673674583435, + "rewards/margins": 9.445573806762695, + "rewards/rejected": -9.845841407775879, + "step": 6880 + }, + { + "epoch": 1.66, + "learning_rate": 2.4848457835621324e-07, + "logits/chosen": -2.670478343963623, + "logits/rejected": -2.541105031967163, + "logps/chosen": -234.4295654296875, + "logps/rejected": -348.7154846191406, + "loss": 0.1173, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9956146478652954, + "rewards/margins": 9.453681945800781, + "rewards/rejected": -10.449296951293945, + "step": 6890 + }, + { + "epoch": 1.66, + "learning_rate": 2.4803886610804065e-07, + "logits/chosen": -2.572587251663208, + "logits/rejected": -2.5934700965881348, + "logps/chosen": -194.14425659179688, + "logps/rejected": -309.1454772949219, + "loss": 0.1215, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.1737267971038818, + "rewards/margins": 7.2784600257873535, + "rewards/rejected": -8.45218563079834, + "step": 6900 + }, + { + "epoch": 1.66, + "learning_rate": 2.4759315385986805e-07, + "logits/chosen": -2.53065824508667, + "logits/rejected": -2.599759578704834, + "logps/chosen": -223.47616577148438, + "logps/rejected": -344.7876892089844, + "loss": 0.1513, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.4943052530288696, + "rewards/margins": 5.8011555671691895, + "rewards/rejected": -7.2954607009887695, + "step": 6910 + }, + { + "epoch": 1.67, + "learning_rate": 2.471474416116955e-07, + "logits/chosen": -2.3176236152648926, + "logits/rejected": -2.2783634662628174, + "logps/chosen": -253.1682891845703, + "logps/rejected": -328.687744140625, + "loss": 0.1323, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.5988147258758545, + "rewards/margins": 9.450356483459473, + "rewards/rejected": -11.04917049407959, + "step": 6920 + }, + { + "epoch": 1.67, + "learning_rate": 2.467017293635229e-07, + "logits/chosen": -2.734055995941162, + "logits/rejected": -2.7162024974823, + "logps/chosen": -247.93777465820312, + "logps/rejected": -356.2265930175781, + "loss": 0.0788, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5648818016052246, + "rewards/margins": 6.425793647766113, + "rewards/rejected": -6.9906744956970215, + "step": 6930 + }, + { + "epoch": 1.67, + "learning_rate": 2.462560171153503e-07, + "logits/chosen": -2.5415902137756348, + "logits/rejected": -2.540515422821045, + "logps/chosen": -297.2126770019531, + "logps/rejected": -345.50750732421875, + "loss": 0.133, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.012619865126907825, + "rewards/margins": 8.577271461486816, + "rewards/rejected": -8.58989143371582, + "step": 6940 + }, + { + "epoch": 1.67, + "learning_rate": 2.4581030486717777e-07, + "logits/chosen": -2.3323845863342285, + "logits/rejected": -2.2749600410461426, + "logps/chosen": -324.28778076171875, + "logps/rejected": -375.01202392578125, + "loss": 0.0685, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.43227776885032654, + "rewards/margins": 8.051684379577637, + "rewards/rejected": -7.619407653808594, + "step": 6950 + }, + { + "epoch": 1.68, + "learning_rate": 2.4536459261900517e-07, + "logits/chosen": -2.6356239318847656, + "logits/rejected": -2.7263784408569336, + "logps/chosen": -235.08642578125, + "logps/rejected": -319.6094665527344, + "loss": 0.0968, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5624276399612427, + "rewards/margins": 7.660147666931152, + "rewards/rejected": -8.222575187683105, + "step": 6960 + }, + { + "epoch": 1.68, + "learning_rate": 2.4491888037083257e-07, + "logits/chosen": -2.4503002166748047, + "logits/rejected": -2.4879584312438965, + "logps/chosen": -333.0428771972656, + "logps/rejected": -408.75396728515625, + "loss": 0.1068, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0898610353469849, + "rewards/margins": 9.749977111816406, + "rewards/rejected": -10.839839935302734, + "step": 6970 + }, + { + "epoch": 1.68, + "learning_rate": 2.4447316812266e-07, + "logits/chosen": -2.6729576587677, + "logits/rejected": -2.591397523880005, + "logps/chosen": -248.69216918945312, + "logps/rejected": -242.4110870361328, + "loss": 0.1096, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.16907243430614471, + "rewards/margins": 6.555850028991699, + "rewards/rejected": -6.724922180175781, + "step": 6980 + }, + { + "epoch": 1.68, + "learning_rate": 2.4402745587448743e-07, + "logits/chosen": -2.547302722930908, + "logits/rejected": -2.5183403491973877, + "logps/chosen": -277.13690185546875, + "logps/rejected": -335.0595397949219, + "loss": 0.076, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.159125804901123, + "rewards/margins": 5.9451985359191895, + "rewards/rejected": -7.104323387145996, + "step": 6990 + }, + { + "epoch": 1.68, + "learning_rate": 2.4358174362631483e-07, + "logits/chosen": -2.517106771469116, + "logits/rejected": -2.4098610877990723, + "logps/chosen": -243.20498657226562, + "logps/rejected": -336.9594421386719, + "loss": 0.0656, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.07006768882274628, + "rewards/margins": 8.30532169342041, + "rewards/rejected": -8.375389099121094, + "step": 7000 + }, + { + "epoch": 1.69, + "learning_rate": 2.4313603137814224e-07, + "logits/chosen": -2.7634055614471436, + "logits/rejected": -2.5951571464538574, + "logps/chosen": -297.3189697265625, + "logps/rejected": -261.8771057128906, + "loss": 0.1157, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.14124087989330292, + "rewards/margins": 7.4747467041015625, + "rewards/rejected": -7.6159868240356445, + "step": 7010 + }, + { + "epoch": 1.69, + "learning_rate": 2.426903191299697e-07, + "logits/chosen": -2.69368577003479, + "logits/rejected": -2.563884735107422, + "logps/chosen": -283.6263427734375, + "logps/rejected": -298.4433288574219, + "loss": 0.0947, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19579832255840302, + "rewards/margins": 6.992488861083984, + "rewards/rejected": -7.188286781311035, + "step": 7020 + }, + { + "epoch": 1.69, + "learning_rate": 2.422446068817971e-07, + "logits/chosen": -2.6995270252227783, + "logits/rejected": -2.5794615745544434, + "logps/chosen": -259.35845947265625, + "logps/rejected": -298.73101806640625, + "loss": 0.1126, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.14753352105617523, + "rewards/margins": 6.613396644592285, + "rewards/rejected": -6.760930061340332, + "step": 7030 + }, + { + "epoch": 1.69, + "learning_rate": 2.417988946336245e-07, + "logits/chosen": -2.691283941268921, + "logits/rejected": -2.634284496307373, + "logps/chosen": -251.0098419189453, + "logps/rejected": -373.2581481933594, + "loss": 0.0714, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.1844494342803955, + "rewards/margins": 10.074152946472168, + "rewards/rejected": -8.889702796936035, + "step": 7040 + }, + { + "epoch": 1.7, + "learning_rate": 2.4135318238545195e-07, + "logits/chosen": -2.7262206077575684, + "logits/rejected": -2.604710102081299, + "logps/chosen": -303.147705078125, + "logps/rejected": -384.01470947265625, + "loss": 0.0542, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17617926001548767, + "rewards/margins": 10.112824440002441, + "rewards/rejected": -10.289003372192383, + "step": 7050 + }, + { + "epoch": 1.7, + "learning_rate": 2.4090747013727936e-07, + "logits/chosen": -2.4994661808013916, + "logits/rejected": -2.5262341499328613, + "logps/chosen": -326.6519775390625, + "logps/rejected": -336.6358337402344, + "loss": 0.0872, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.24213656783103943, + "rewards/margins": 8.929125785827637, + "rewards/rejected": -9.17126178741455, + "step": 7060 + }, + { + "epoch": 1.7, + "learning_rate": 2.4046175788910676e-07, + "logits/chosen": -2.6846208572387695, + "logits/rejected": -2.664703845977783, + "logps/chosen": -236.06387329101562, + "logps/rejected": -355.14044189453125, + "loss": 0.1302, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8785532712936401, + "rewards/margins": 6.995070457458496, + "rewards/rejected": -7.873623847961426, + "step": 7070 + }, + { + "epoch": 1.7, + "learning_rate": 2.400160456409342e-07, + "logits/chosen": -2.6251165866851807, + "logits/rejected": -2.6238937377929688, + "logps/chosen": -199.33859252929688, + "logps/rejected": -321.19354248046875, + "loss": 0.1684, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0854692459106445, + "rewards/margins": 7.993255615234375, + "rewards/rejected": -9.078723907470703, + "step": 7080 + }, + { + "epoch": 1.71, + "learning_rate": 2.395703333927616e-07, + "logits/chosen": -2.5831985473632812, + "logits/rejected": -2.6258349418640137, + "logps/chosen": -200.64434814453125, + "logps/rejected": -322.10333251953125, + "loss": 0.1271, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.27437877655029297, + "rewards/margins": 9.108819961547852, + "rewards/rejected": -8.834441184997559, + "step": 7090 + }, + { + "epoch": 1.71, + "learning_rate": 2.39124621144589e-07, + "logits/chosen": -2.6364359855651855, + "logits/rejected": -2.613237142562866, + "logps/chosen": -227.36788940429688, + "logps/rejected": -310.9380187988281, + "loss": 0.1224, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7503639459609985, + "rewards/margins": 9.57800579071045, + "rewards/rejected": -8.827642440795898, + "step": 7100 + }, + { + "epoch": 1.71, + "learning_rate": 2.386789088964165e-07, + "logits/chosen": -2.6473875045776367, + "logits/rejected": -2.614420175552368, + "logps/chosen": -312.0506286621094, + "logps/rejected": -355.6326904296875, + "loss": 0.1323, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5325582027435303, + "rewards/margins": 7.201295375823975, + "rewards/rejected": -7.7338547706604, + "step": 7110 + }, + { + "epoch": 1.71, + "learning_rate": 2.3823319664824388e-07, + "logits/chosen": -2.4910478591918945, + "logits/rejected": -2.57076096534729, + "logps/chosen": -212.93893432617188, + "logps/rejected": -295.931396484375, + "loss": 0.078, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4131343364715576, + "rewards/margins": 6.202882289886475, + "rewards/rejected": -7.6160173416137695, + "step": 7120 + }, + { + "epoch": 1.72, + "learning_rate": 2.3778748440007128e-07, + "logits/chosen": -2.726280927658081, + "logits/rejected": -2.5627434253692627, + "logps/chosen": -317.63458251953125, + "logps/rejected": -292.7439880371094, + "loss": 0.1031, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6963149309158325, + "rewards/margins": 7.364465236663818, + "rewards/rejected": -8.06078052520752, + "step": 7130 + }, + { + "epoch": 1.72, + "learning_rate": 2.373417721518987e-07, + "logits/chosen": -2.5128579139709473, + "logits/rejected": -2.5653254985809326, + "logps/chosen": -226.4662628173828, + "logps/rejected": -327.03021240234375, + "loss": 0.0882, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8353655934333801, + "rewards/margins": 8.105393409729004, + "rewards/rejected": -8.94075870513916, + "step": 7140 + }, + { + "epoch": 1.72, + "learning_rate": 2.3689605990372614e-07, + "logits/chosen": -2.7434535026550293, + "logits/rejected": -2.675881862640381, + "logps/chosen": -264.567626953125, + "logps/rejected": -302.3722839355469, + "loss": 0.0793, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9892032742500305, + "rewards/margins": 7.415769100189209, + "rewards/rejected": -8.404972076416016, + "step": 7150 + }, + { + "epoch": 1.72, + "learning_rate": 2.3645034765555354e-07, + "logits/chosen": -2.623859405517578, + "logits/rejected": -2.623645305633545, + "logps/chosen": -193.98988342285156, + "logps/rejected": -296.64398193359375, + "loss": 0.0904, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.2686735391616821, + "rewards/margins": 8.979497909545898, + "rewards/rejected": -10.248170852661133, + "step": 7160 + }, + { + "epoch": 1.73, + "learning_rate": 2.36004635407381e-07, + "logits/chosen": -2.633744478225708, + "logits/rejected": -2.7162396907806396, + "logps/chosen": -256.90264892578125, + "logps/rejected": -354.1653747558594, + "loss": 0.1246, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4706287384033203, + "rewards/margins": 8.282906532287598, + "rewards/rejected": -8.753534317016602, + "step": 7170 + }, + { + "epoch": 1.73, + "learning_rate": 2.3555892315920843e-07, + "logits/chosen": -2.6927480697631836, + "logits/rejected": -2.629392623901367, + "logps/chosen": -257.0445251464844, + "logps/rejected": -257.18994140625, + "loss": 0.1222, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6668189764022827, + "rewards/margins": 5.208324909210205, + "rewards/rejected": -6.875143527984619, + "step": 7180 + }, + { + "epoch": 1.73, + "learning_rate": 2.3511321091103583e-07, + "logits/chosen": -2.730381488800049, + "logits/rejected": -2.6737563610076904, + "logps/chosen": -287.07574462890625, + "logps/rejected": -437.490966796875, + "loss": 0.1035, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7772750854492188, + "rewards/margins": 8.56248664855957, + "rewards/rejected": -10.339761734008789, + "step": 7190 + }, + { + "epoch": 1.73, + "learning_rate": 2.3466749866286326e-07, + "logits/chosen": -2.7791638374328613, + "logits/rejected": -2.6489243507385254, + "logps/chosen": -381.9056701660156, + "logps/rejected": -393.8326110839844, + "loss": 0.1095, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.6104615926742554, + "rewards/margins": 9.136541366577148, + "rewards/rejected": -9.747003555297852, + "step": 7200 + }, + { + "epoch": 1.73, + "eval_logits/chosen": -2.339570999145508, + "eval_logits/rejected": -2.298337459564209, + "eval_logps/chosen": -243.2823028564453, + "eval_logps/rejected": -259.2771911621094, + "eval_loss": 0.5153447985649109, + "eval_rewards/accuracies": 0.675000011920929, + "eval_rewards/chosen": -4.135477066040039, + "eval_rewards/margins": 2.839163303375244, + "eval_rewards/rejected": -6.974639892578125, + "eval_runtime": 131.5205, + "eval_samples_per_second": 23.996, + "eval_steps_per_second": 0.38, + "step": 7200 + }, + { + "epoch": 1.74, + "learning_rate": 2.3422178641469066e-07, + "logits/chosen": -2.5644402503967285, + "logits/rejected": -2.538790225982666, + "logps/chosen": -243.1437225341797, + "logps/rejected": -303.4709167480469, + "loss": 0.0984, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9559003710746765, + "rewards/margins": 7.72446346282959, + "rewards/rejected": -8.680364608764648, + "step": 7210 + }, + { + "epoch": 1.74, + "learning_rate": 2.337760741665181e-07, + "logits/chosen": -2.632382869720459, + "logits/rejected": -2.610016345977783, + "logps/chosen": -311.9110412597656, + "logps/rejected": -422.59027099609375, + "loss": 0.0519, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15952453017234802, + "rewards/margins": 9.718436241149902, + "rewards/rejected": -9.877962112426758, + "step": 7220 + }, + { + "epoch": 1.74, + "learning_rate": 2.3333036191834552e-07, + "logits/chosen": -2.565765142440796, + "logits/rejected": -2.528348445892334, + "logps/chosen": -199.43502807617188, + "logps/rejected": -240.23080444335938, + "loss": 0.0994, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3454731702804565, + "rewards/margins": 5.091964244842529, + "rewards/rejected": -6.437438011169434, + "step": 7230 + }, + { + "epoch": 1.74, + "learning_rate": 2.3288464967017293e-07, + "logits/chosen": -2.720362663269043, + "logits/rejected": -2.5842232704162598, + "logps/chosen": -306.49310302734375, + "logps/rejected": -400.3758544921875, + "loss": 0.1315, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.2581801414489746, + "rewards/margins": 6.43633508682251, + "rewards/rejected": -8.694514274597168, + "step": 7240 + }, + { + "epoch": 1.74, + "learning_rate": 2.3243893742200035e-07, + "logits/chosen": -2.5583643913269043, + "logits/rejected": -2.6254963874816895, + "logps/chosen": -300.7047119140625, + "logps/rejected": -331.8702392578125, + "loss": 0.0998, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.4756503105163574, + "rewards/margins": 4.999587059020996, + "rewards/rejected": -7.475237846374512, + "step": 7250 + }, + { + "epoch": 1.75, + "learning_rate": 2.3199322517382778e-07, + "logits/chosen": -2.6709704399108887, + "logits/rejected": -2.5990965366363525, + "logps/chosen": -279.72833251953125, + "logps/rejected": -345.9717712402344, + "loss": 0.1014, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.2836154699325562, + "rewards/margins": 6.974087715148926, + "rewards/rejected": -8.257701873779297, + "step": 7260 + }, + { + "epoch": 1.75, + "learning_rate": 2.315475129256552e-07, + "logits/chosen": -2.6193602085113525, + "logits/rejected": -2.67102313041687, + "logps/chosen": -227.65225219726562, + "logps/rejected": -347.7125549316406, + "loss": 0.1003, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.2079720497131348, + "rewards/margins": 6.406277656555176, + "rewards/rejected": -8.614249229431152, + "step": 7270 + }, + { + "epoch": 1.75, + "learning_rate": 2.3110180067748262e-07, + "logits/chosen": -2.512887954711914, + "logits/rejected": -2.5485825538635254, + "logps/chosen": -229.8495330810547, + "logps/rejected": -265.9822692871094, + "loss": 0.1176, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.186687469482422, + "rewards/margins": 5.730704307556152, + "rewards/rejected": -7.917391777038574, + "step": 7280 + }, + { + "epoch": 1.75, + "learning_rate": 2.3065608842931002e-07, + "logits/chosen": -2.5849320888519287, + "logits/rejected": -2.51088285446167, + "logps/chosen": -391.68157958984375, + "logps/rejected": -333.1160583496094, + "loss": 0.0574, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3726003170013428, + "rewards/margins": 6.668314456939697, + "rewards/rejected": -9.040914535522461, + "step": 7290 + }, + { + "epoch": 1.76, + "learning_rate": 2.3021037618113745e-07, + "logits/chosen": -2.289815664291382, + "logits/rejected": -2.3414039611816406, + "logps/chosen": -380.6048889160156, + "logps/rejected": -349.00054931640625, + "loss": 0.0835, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.1775258779525757, + "rewards/margins": 8.12243938446045, + "rewards/rejected": -9.299964904785156, + "step": 7300 + }, + { + "epoch": 1.76, + "learning_rate": 2.2976466393296488e-07, + "logits/chosen": -2.578829050064087, + "logits/rejected": -2.5659432411193848, + "logps/chosen": -239.71578979492188, + "logps/rejected": -303.98968505859375, + "loss": 0.1334, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.5287086963653564, + "rewards/margins": 8.155741691589355, + "rewards/rejected": -9.684450149536133, + "step": 7310 + }, + { + "epoch": 1.76, + "learning_rate": 2.2931895168479228e-07, + "logits/chosen": -2.390089511871338, + "logits/rejected": -2.44716215133667, + "logps/chosen": -283.4778747558594, + "logps/rejected": -364.2710876464844, + "loss": 0.1251, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.3863930702209473, + "rewards/margins": 8.288941383361816, + "rewards/rejected": -11.675333976745605, + "step": 7320 + }, + { + "epoch": 1.76, + "learning_rate": 2.288732394366197e-07, + "logits/chosen": -2.642254114151001, + "logits/rejected": -2.433981418609619, + "logps/chosen": -242.18017578125, + "logps/rejected": -267.94879150390625, + "loss": 0.0956, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.964556097984314, + "rewards/margins": 7.220114707946777, + "rewards/rejected": -9.184670448303223, + "step": 7330 + }, + { + "epoch": 1.77, + "learning_rate": 2.2842752718844714e-07, + "logits/chosen": -2.7937426567077637, + "logits/rejected": -2.764474868774414, + "logps/chosen": -302.13031005859375, + "logps/rejected": -317.9931945800781, + "loss": 0.1392, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.39774584770202637, + "rewards/margins": 8.073554039001465, + "rewards/rejected": -8.47130012512207, + "step": 7340 + }, + { + "epoch": 1.77, + "learning_rate": 2.2798181494027454e-07, + "logits/chosen": -2.374725341796875, + "logits/rejected": -2.4418551921844482, + "logps/chosen": -278.1357116699219, + "logps/rejected": -299.0234375, + "loss": 0.2138, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.0053553581237793, + "rewards/margins": 5.862728118896484, + "rewards/rejected": -7.868082523345947, + "step": 7350 + }, + { + "epoch": 1.77, + "learning_rate": 2.2753610269210197e-07, + "logits/chosen": -2.5299973487854004, + "logits/rejected": -2.495958089828491, + "logps/chosen": -232.66177368164062, + "logps/rejected": -313.27581787109375, + "loss": 0.0797, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.3753086030483246, + "rewards/margins": 8.323644638061523, + "rewards/rejected": -8.698953628540039, + "step": 7360 + }, + { + "epoch": 1.77, + "learning_rate": 2.2709039044392937e-07, + "logits/chosen": -2.585207223892212, + "logits/rejected": -2.3481059074401855, + "logps/chosen": -237.22738647460938, + "logps/rejected": -311.93524169921875, + "loss": 0.0797, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7202649116516113, + "rewards/margins": 8.996984481811523, + "rewards/rejected": -9.717249870300293, + "step": 7370 + }, + { + "epoch": 1.78, + "learning_rate": 2.266446781957568e-07, + "logits/chosen": -2.576906442642212, + "logits/rejected": -2.4248411655426025, + "logps/chosen": -263.6038513183594, + "logps/rejected": -263.1639709472656, + "loss": 0.1163, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6332124471664429, + "rewards/margins": 7.621026515960693, + "rewards/rejected": -8.254239082336426, + "step": 7380 + }, + { + "epoch": 1.78, + "learning_rate": 2.2619896594758423e-07, + "logits/chosen": -2.674010992050171, + "logits/rejected": -2.5529427528381348, + "logps/chosen": -367.4625549316406, + "logps/rejected": -397.8846740722656, + "loss": 0.0956, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2886005640029907, + "rewards/margins": 10.786130905151367, + "rewards/rejected": -9.497529983520508, + "step": 7390 + }, + { + "epoch": 1.78, + "learning_rate": 2.2575325369941164e-07, + "logits/chosen": -2.471353054046631, + "logits/rejected": -2.4338858127593994, + "logps/chosen": -263.5669860839844, + "logps/rejected": -297.22998046875, + "loss": 0.1119, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.2962851524353027, + "rewards/margins": 5.4361186027526855, + "rewards/rejected": -7.732403755187988, + "step": 7400 + }, + { + "epoch": 1.78, + "learning_rate": 2.2530754145123907e-07, + "logits/chosen": -2.433509349822998, + "logits/rejected": -2.3731446266174316, + "logps/chosen": -196.04039001464844, + "logps/rejected": -270.1302490234375, + "loss": 0.1082, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.5012108087539673, + "rewards/margins": 5.40764856338501, + "rewards/rejected": -6.9088592529296875, + "step": 7410 + }, + { + "epoch": 1.79, + "learning_rate": 2.248618292030665e-07, + "logits/chosen": -2.48002028465271, + "logits/rejected": -2.403534173965454, + "logps/chosen": -285.1266174316406, + "logps/rejected": -304.3287048339844, + "loss": 0.0949, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.2731415033340454, + "rewards/margins": 5.922009468078613, + "rewards/rejected": -7.195151329040527, + "step": 7420 + }, + { + "epoch": 1.79, + "learning_rate": 2.244161169548939e-07, + "logits/chosen": -2.6067662239074707, + "logits/rejected": -2.5508029460906982, + "logps/chosen": -210.6913299560547, + "logps/rejected": -247.1603240966797, + "loss": 0.0915, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37648916244506836, + "rewards/margins": 7.376882076263428, + "rewards/rejected": -7.7533721923828125, + "step": 7430 + }, + { + "epoch": 1.79, + "learning_rate": 2.2397040470672133e-07, + "logits/chosen": -2.5132429599761963, + "logits/rejected": -2.358344554901123, + "logps/chosen": -206.2704315185547, + "logps/rejected": -258.94976806640625, + "loss": 0.1005, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.4994385242462158, + "rewards/margins": 6.546236515045166, + "rewards/rejected": -8.045675277709961, + "step": 7440 + }, + { + "epoch": 1.79, + "learning_rate": 2.2352469245854873e-07, + "logits/chosen": -2.5345187187194824, + "logits/rejected": -2.507821559906006, + "logps/chosen": -227.01364135742188, + "logps/rejected": -280.1637268066406, + "loss": 0.1263, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0459386110305786, + "rewards/margins": 6.257880210876465, + "rewards/rejected": -7.303818702697754, + "step": 7450 + }, + { + "epoch": 1.8, + "learning_rate": 2.2307898021037616e-07, + "logits/chosen": -2.5831310749053955, + "logits/rejected": -2.413195848464966, + "logps/chosen": -259.00787353515625, + "logps/rejected": -315.9976501464844, + "loss": 0.1144, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.8815778493881226, + "rewards/margins": 8.863349914550781, + "rewards/rejected": -7.981772422790527, + "step": 7460 + }, + { + "epoch": 1.8, + "learning_rate": 2.226332679622036e-07, + "logits/chosen": -2.3800718784332275, + "logits/rejected": -2.3832552433013916, + "logps/chosen": -300.4321594238281, + "logps/rejected": -449.7889709472656, + "loss": 0.0689, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.7963374257087708, + "rewards/margins": 8.301736831665039, + "rewards/rejected": -9.098074913024902, + "step": 7470 + }, + { + "epoch": 1.8, + "learning_rate": 2.22187555714031e-07, + "logits/chosen": -2.6018245220184326, + "logits/rejected": -2.4382550716400146, + "logps/chosen": -271.9831848144531, + "logps/rejected": -292.10748291015625, + "loss": 0.0604, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8728191256523132, + "rewards/margins": 9.360101699829102, + "rewards/rejected": -8.487282752990723, + "step": 7480 + }, + { + "epoch": 1.8, + "learning_rate": 2.2174184346585842e-07, + "logits/chosen": -2.417644739151001, + "logits/rejected": -2.4956912994384766, + "logps/chosen": -223.18490600585938, + "logps/rejected": -338.0353088378906, + "loss": 0.0604, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7294325828552246, + "rewards/margins": 7.016575813293457, + "rewards/rejected": -7.746008396148682, + "step": 7490 + }, + { + "epoch": 1.81, + "learning_rate": 2.2129613121768585e-07, + "logits/chosen": -2.5275511741638184, + "logits/rejected": -2.478214979171753, + "logps/chosen": -226.3917999267578, + "logps/rejected": -317.48138427734375, + "loss": 0.0789, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.673362374305725, + "rewards/margins": 8.482213973999023, + "rewards/rejected": -10.1555757522583, + "step": 7500 + }, + { + "epoch": 1.81, + "learning_rate": 2.2085041896951328e-07, + "logits/chosen": -2.5859923362731934, + "logits/rejected": -2.473111629486084, + "logps/chosen": -231.0122528076172, + "logps/rejected": -317.8660583496094, + "loss": 0.1009, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7452360391616821, + "rewards/margins": 10.408421516418457, + "rewards/rejected": -9.663185119628906, + "step": 7510 + }, + { + "epoch": 1.81, + "learning_rate": 2.204047067213407e-07, + "logits/chosen": -2.47501540184021, + "logits/rejected": -2.3478808403015137, + "logps/chosen": -265.54730224609375, + "logps/rejected": -473.728271484375, + "loss": 0.0449, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27362900972366333, + "rewards/margins": 15.523233413696289, + "rewards/rejected": -15.249605178833008, + "step": 7520 + }, + { + "epoch": 1.81, + "learning_rate": 2.199589944731681e-07, + "logits/chosen": -2.5425233840942383, + "logits/rejected": -2.6027400493621826, + "logps/chosen": -201.32948303222656, + "logps/rejected": -348.94110107421875, + "loss": 0.1444, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.6131919622421265, + "rewards/margins": 7.723005771636963, + "rewards/rejected": -8.336198806762695, + "step": 7530 + }, + { + "epoch": 1.81, + "learning_rate": 2.1951328222499554e-07, + "logits/chosen": -2.5663869380950928, + "logits/rejected": -2.2703213691711426, + "logps/chosen": -276.5305480957031, + "logps/rejected": -347.3857421875, + "loss": 0.1505, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.16045208275318146, + "rewards/margins": 7.604345798492432, + "rewards/rejected": -7.4438934326171875, + "step": 7540 + }, + { + "epoch": 1.82, + "learning_rate": 2.1906756997682297e-07, + "logits/chosen": -2.595304489135742, + "logits/rejected": -2.607177257537842, + "logps/chosen": -234.88998413085938, + "logps/rejected": -311.6211853027344, + "loss": 0.1163, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.3873317241668701, + "rewards/margins": 7.996423244476318, + "rewards/rejected": -9.383755683898926, + "step": 7550 + }, + { + "epoch": 1.82, + "learning_rate": 2.1862185772865037e-07, + "logits/chosen": -2.367216110229492, + "logits/rejected": -2.351743221282959, + "logps/chosen": -224.83804321289062, + "logps/rejected": -285.729248046875, + "loss": 0.08, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.2346243858337402, + "rewards/margins": 6.1820759773254395, + "rewards/rejected": -8.41670036315918, + "step": 7560 + }, + { + "epoch": 1.82, + "learning_rate": 2.181761454804778e-07, + "logits/chosen": -2.6036853790283203, + "logits/rejected": -2.607869863510132, + "logps/chosen": -227.0845489501953, + "logps/rejected": -282.7878112792969, + "loss": 0.0808, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.383934736251831, + "rewards/margins": 6.9649834632873535, + "rewards/rejected": -8.348918914794922, + "step": 7570 + }, + { + "epoch": 1.82, + "learning_rate": 2.1773043323230523e-07, + "logits/chosen": -2.512070894241333, + "logits/rejected": -2.538114070892334, + "logps/chosen": -202.14749145507812, + "logps/rejected": -232.96035766601562, + "loss": 0.1065, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6353708505630493, + "rewards/margins": 5.990879535675049, + "rewards/rejected": -6.626250267028809, + "step": 7580 + }, + { + "epoch": 1.83, + "learning_rate": 2.1728472098413263e-07, + "logits/chosen": -2.6221089363098145, + "logits/rejected": -2.503613233566284, + "logps/chosen": -327.0585021972656, + "logps/rejected": -374.91986083984375, + "loss": 0.1566, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.7350888848304749, + "rewards/margins": 8.144794464111328, + "rewards/rejected": -7.40970516204834, + "step": 7590 + }, + { + "epoch": 1.83, + "learning_rate": 2.1683900873596006e-07, + "logits/chosen": -2.4637551307678223, + "logits/rejected": -2.480011463165283, + "logps/chosen": -165.88694763183594, + "logps/rejected": -297.799560546875, + "loss": 0.1515, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0971057415008545, + "rewards/margins": 6.92633581161499, + "rewards/rejected": -8.023443222045898, + "step": 7600 + }, + { + "epoch": 1.83, + "eval_logits/chosen": -2.289604425430298, + "eval_logits/rejected": -2.2513480186462402, + "eval_logps/chosen": -246.97955322265625, + "eval_logps/rejected": -263.99462890625, + "eval_loss": 0.5241956114768982, + "eval_rewards/accuracies": 0.6625000238418579, + "eval_rewards/chosen": -4.505197525024414, + "eval_rewards/margins": 2.9411861896514893, + "eval_rewards/rejected": -7.446383476257324, + "eval_runtime": 131.2772, + "eval_samples_per_second": 24.041, + "eval_steps_per_second": 0.381, + "step": 7600 + }, + { + "epoch": 1.83, + "learning_rate": 2.163932964877875e-07, + "logits/chosen": -2.4689695835113525, + "logits/rejected": -2.4437785148620605, + "logps/chosen": -217.9733123779297, + "logps/rejected": -274.4111022949219, + "loss": 0.0901, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.3212761878967285, + "rewards/margins": 7.479835510253906, + "rewards/rejected": -8.801111221313477, + "step": 7610 + }, + { + "epoch": 1.83, + "learning_rate": 2.159475842396149e-07, + "logits/chosen": -2.4899864196777344, + "logits/rejected": -2.4517159461975098, + "logps/chosen": -279.220458984375, + "logps/rejected": -336.27557373046875, + "loss": 0.131, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.2590699195861816, + "rewards/margins": 7.154644966125488, + "rewards/rejected": -9.413714408874512, + "step": 7620 + }, + { + "epoch": 1.84, + "learning_rate": 2.1550187199144233e-07, + "logits/chosen": -2.489950180053711, + "logits/rejected": -2.5148751735687256, + "logps/chosen": -272.50115966796875, + "logps/rejected": -339.0401611328125, + "loss": 0.0984, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8389506340026855, + "rewards/margins": 7.193517208099365, + "rewards/rejected": -8.032468795776367, + "step": 7630 + }, + { + "epoch": 1.84, + "learning_rate": 2.1505615974326973e-07, + "logits/chosen": -2.546099901199341, + "logits/rejected": -2.6154778003692627, + "logps/chosen": -234.4697723388672, + "logps/rejected": -371.752197265625, + "loss": 0.1064, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.3317568302154541, + "rewards/margins": 9.722246170043945, + "rewards/rejected": -9.390490531921387, + "step": 7640 + }, + { + "epoch": 1.84, + "learning_rate": 2.1461044749509716e-07, + "logits/chosen": -2.612776279449463, + "logits/rejected": -2.5559134483337402, + "logps/chosen": -285.86517333984375, + "logps/rejected": -401.9395446777344, + "loss": 0.0716, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22804859280586243, + "rewards/margins": 9.717254638671875, + "rewards/rejected": -9.945302963256836, + "step": 7650 + }, + { + "epoch": 1.84, + "learning_rate": 2.141647352469246e-07, + "logits/chosen": -2.542107343673706, + "logits/rejected": -2.551405429840088, + "logps/chosen": -289.79815673828125, + "logps/rejected": -415.26318359375, + "loss": 0.2417, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.5029503107070923, + "rewards/margins": 8.917051315307617, + "rewards/rejected": -8.414101600646973, + "step": 7660 + }, + { + "epoch": 1.85, + "learning_rate": 2.13719022998752e-07, + "logits/chosen": -2.834725856781006, + "logits/rejected": -2.566082715988159, + "logps/chosen": -389.8315734863281, + "logps/rejected": -338.5687561035156, + "loss": 0.1113, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2165495604276657, + "rewards/margins": 8.31263542175293, + "rewards/rejected": -8.529186248779297, + "step": 7670 + }, + { + "epoch": 1.85, + "learning_rate": 2.1327331075057942e-07, + "logits/chosen": -2.5547666549682617, + "logits/rejected": -2.485154628753662, + "logps/chosen": -315.56103515625, + "logps/rejected": -315.5398864746094, + "loss": 0.0542, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5800116658210754, + "rewards/margins": 7.743584632873535, + "rewards/rejected": -8.323596000671387, + "step": 7680 + }, + { + "epoch": 1.85, + "learning_rate": 2.1282759850240685e-07, + "logits/chosen": -2.613180160522461, + "logits/rejected": -2.571366548538208, + "logps/chosen": -224.80276489257812, + "logps/rejected": -281.1871032714844, + "loss": 0.1117, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1531640738248825, + "rewards/margins": 7.546097755432129, + "rewards/rejected": -7.699261665344238, + "step": 7690 + }, + { + "epoch": 1.85, + "learning_rate": 2.1238188625423425e-07, + "logits/chosen": -2.445774555206299, + "logits/rejected": -2.4282386302948, + "logps/chosen": -205.8231658935547, + "logps/rejected": -331.0657653808594, + "loss": 0.1229, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.3187758922576904, + "rewards/margins": 8.039289474487305, + "rewards/rejected": -10.358064651489258, + "step": 7700 + }, + { + "epoch": 1.86, + "learning_rate": 2.1193617400606168e-07, + "logits/chosen": -2.6719584465026855, + "logits/rejected": -2.55678653717041, + "logps/chosen": -261.1755676269531, + "logps/rejected": -225.02392578125, + "loss": 0.1014, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0180306434631348, + "rewards/margins": 4.9081950187683105, + "rewards/rejected": -6.9262261390686035, + "step": 7710 + }, + { + "epoch": 1.86, + "learning_rate": 2.1149046175788908e-07, + "logits/chosen": -2.4505276679992676, + "logits/rejected": -2.4282736778259277, + "logps/chosen": -238.9077911376953, + "logps/rejected": -333.31561279296875, + "loss": 0.0549, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1426715850830078, + "rewards/margins": 6.513665199279785, + "rewards/rejected": -7.656336784362793, + "step": 7720 + }, + { + "epoch": 1.86, + "learning_rate": 2.110447495097165e-07, + "logits/chosen": -2.54573392868042, + "logits/rejected": -2.5105884075164795, + "logps/chosen": -202.3138427734375, + "logps/rejected": -314.7386779785156, + "loss": 0.0793, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.504429578781128, + "rewards/margins": 8.053789138793945, + "rewards/rejected": -9.558218002319336, + "step": 7730 + }, + { + "epoch": 1.86, + "learning_rate": 2.1059903726154394e-07, + "logits/chosen": -2.7421913146972656, + "logits/rejected": -2.704651117324829, + "logps/chosen": -298.920654296875, + "logps/rejected": -291.2439270019531, + "loss": 0.1068, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1864571571350098, + "rewards/margins": 5.482812881469727, + "rewards/rejected": -7.6692705154418945, + "step": 7740 + }, + { + "epoch": 1.87, + "learning_rate": 2.1015332501337135e-07, + "logits/chosen": -2.760547399520874, + "logits/rejected": -2.7504758834838867, + "logps/chosen": -291.6126403808594, + "logps/rejected": -340.95819091796875, + "loss": 0.1171, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.1536940038204193, + "rewards/margins": 7.864290714263916, + "rewards/rejected": -8.017983436584473, + "step": 7750 + }, + { + "epoch": 1.87, + "learning_rate": 2.0970761276519877e-07, + "logits/chosen": -2.6722984313964844, + "logits/rejected": -2.6949880123138428, + "logps/chosen": -296.4515686035156, + "logps/rejected": -405.2401123046875, + "loss": 0.0943, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8006378412246704, + "rewards/margins": 8.26314926147461, + "rewards/rejected": -9.063787460327148, + "step": 7760 + }, + { + "epoch": 1.87, + "learning_rate": 2.092619005170262e-07, + "logits/chosen": -2.6990954875946045, + "logits/rejected": -2.689490556716919, + "logps/chosen": -374.5258483886719, + "logps/rejected": -390.0920104980469, + "loss": 0.0524, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.060518957674503326, + "rewards/margins": 9.205659866333008, + "rewards/rejected": -9.266180038452148, + "step": 7770 + }, + { + "epoch": 1.87, + "learning_rate": 2.088161882688536e-07, + "logits/chosen": -2.734041690826416, + "logits/rejected": -2.6701254844665527, + "logps/chosen": -272.4217224121094, + "logps/rejected": -321.0535583496094, + "loss": 0.0925, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9945024251937866, + "rewards/margins": 6.5279364585876465, + "rewards/rejected": -8.522439002990723, + "step": 7780 + }, + { + "epoch": 1.87, + "learning_rate": 2.0837047602068104e-07, + "logits/chosen": -2.759873628616333, + "logits/rejected": -2.594285488128662, + "logps/chosen": -309.46551513671875, + "logps/rejected": -313.20831298828125, + "loss": 0.0754, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3565568923950195, + "rewards/margins": 6.081301689147949, + "rewards/rejected": -7.437858581542969, + "step": 7790 + }, + { + "epoch": 1.88, + "learning_rate": 2.0792476377250844e-07, + "logits/chosen": -2.5614938735961914, + "logits/rejected": -2.6000454425811768, + "logps/chosen": -335.32269287109375, + "logps/rejected": -450.3589782714844, + "loss": 0.092, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.614913284778595, + "rewards/margins": 10.81617546081543, + "rewards/rejected": -11.431089401245117, + "step": 7800 + }, + { + "epoch": 1.88, + "learning_rate": 2.0747905152433587e-07, + "logits/chosen": -2.6038460731506348, + "logits/rejected": -2.6459014415740967, + "logps/chosen": -275.76348876953125, + "logps/rejected": -297.15264892578125, + "loss": 0.1195, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.7798149585723877, + "rewards/margins": 4.796724796295166, + "rewards/rejected": -8.576539039611816, + "step": 7810 + }, + { + "epoch": 1.88, + "learning_rate": 2.070333392761633e-07, + "logits/chosen": -2.5913760662078857, + "logits/rejected": -2.495664596557617, + "logps/chosen": -226.21505737304688, + "logps/rejected": -314.0357360839844, + "loss": 0.1211, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9873256683349609, + "rewards/margins": 8.476485252380371, + "rewards/rejected": -9.463809967041016, + "step": 7820 + }, + { + "epoch": 1.88, + "learning_rate": 2.065876270279907e-07, + "logits/chosen": -2.530057191848755, + "logits/rejected": -2.4276559352874756, + "logps/chosen": -285.4584655761719, + "logps/rejected": -325.4349365234375, + "loss": 0.0661, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.640655279159546, + "rewards/margins": 7.890588283538818, + "rewards/rejected": -10.531244277954102, + "step": 7830 + }, + { + "epoch": 1.89, + "learning_rate": 2.0614191477981813e-07, + "logits/chosen": -2.2655327320098877, + "logits/rejected": -2.3299734592437744, + "logps/chosen": -259.01690673828125, + "logps/rejected": -343.3759765625, + "loss": 0.0898, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0912749767303467, + "rewards/margins": 10.160661697387695, + "rewards/rejected": -11.251936912536621, + "step": 7840 + }, + { + "epoch": 1.89, + "learning_rate": 2.0569620253164559e-07, + "logits/chosen": -2.5364832878112793, + "logits/rejected": -2.5112829208374023, + "logps/chosen": -181.44210815429688, + "logps/rejected": -203.41616821289062, + "loss": 0.1258, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.22523832321167, + "rewards/margins": 5.641473293304443, + "rewards/rejected": -7.866711616516113, + "step": 7850 + }, + { + "epoch": 1.89, + "learning_rate": 2.05250490283473e-07, + "logits/chosen": -2.453578472137451, + "logits/rejected": -2.540310859680176, + "logps/chosen": -306.55560302734375, + "logps/rejected": -313.5611267089844, + "loss": 0.0823, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.584228515625, + "rewards/margins": 7.540985107421875, + "rewards/rejected": -9.125212669372559, + "step": 7860 + }, + { + "epoch": 1.89, + "learning_rate": 2.0480477803530042e-07, + "logits/chosen": -2.7184207439422607, + "logits/rejected": -2.6624531745910645, + "logps/chosen": -398.5353698730469, + "logps/rejected": -377.66168212890625, + "loss": 0.1111, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0141246318817139, + "rewards/margins": 7.387277126312256, + "rewards/rejected": -8.401402473449707, + "step": 7870 + }, + { + "epoch": 1.9, + "learning_rate": 2.0435906578712782e-07, + "logits/chosen": -2.646336078643799, + "logits/rejected": -2.4954798221588135, + "logps/chosen": -244.2439727783203, + "logps/rejected": -425.41766357421875, + "loss": 0.1331, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2392260581254959, + "rewards/margins": 12.841867446899414, + "rewards/rejected": -12.602640151977539, + "step": 7880 + }, + { + "epoch": 1.9, + "learning_rate": 2.0391335353895525e-07, + "logits/chosen": -2.645211696624756, + "logits/rejected": -2.5931031703948975, + "logps/chosen": -283.30535888671875, + "logps/rejected": -359.7178649902344, + "loss": 0.0628, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.39487886428833, + "rewards/margins": 7.769004821777344, + "rewards/rejected": -10.163884162902832, + "step": 7890 + }, + { + "epoch": 1.9, + "learning_rate": 2.0346764129078268e-07, + "logits/chosen": -2.857659101486206, + "logits/rejected": -2.7472076416015625, + "logps/chosen": -340.81488037109375, + "logps/rejected": -340.79730224609375, + "loss": 0.0905, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.2454087734222412, + "rewards/margins": 6.279747009277344, + "rewards/rejected": -7.525155544281006, + "step": 7900 + }, + { + "epoch": 1.9, + "learning_rate": 2.0302192904261008e-07, + "logits/chosen": -2.508478879928589, + "logits/rejected": -2.558784246444702, + "logps/chosen": -283.2003479003906, + "logps/rejected": -312.26715087890625, + "loss": 0.1272, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7378164529800415, + "rewards/margins": 8.038057327270508, + "rewards/rejected": -8.775873184204102, + "step": 7910 + }, + { + "epoch": 1.91, + "learning_rate": 2.025762167944375e-07, + "logits/chosen": -2.6211769580841064, + "logits/rejected": -2.499763011932373, + "logps/chosen": -341.9508361816406, + "logps/rejected": -305.7552795410156, + "loss": 0.0431, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.5438544750213623, + "rewards/margins": 8.497499465942383, + "rewards/rejected": -10.041353225708008, + "step": 7920 + }, + { + "epoch": 1.91, + "learning_rate": 2.0213050454626494e-07, + "logits/chosen": -2.4751362800598145, + "logits/rejected": -2.4390580654144287, + "logps/chosen": -300.8005065917969, + "logps/rejected": -291.78277587890625, + "loss": 0.0879, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7827902436256409, + "rewards/margins": 7.964527130126953, + "rewards/rejected": -8.747318267822266, + "step": 7930 + }, + { + "epoch": 1.91, + "learning_rate": 2.0168479229809234e-07, + "logits/chosen": -2.4628098011016846, + "logits/rejected": -2.4127678871154785, + "logps/chosen": -173.89230346679688, + "logps/rejected": -211.19784545898438, + "loss": 0.0951, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.949373483657837, + "rewards/margins": 6.480429172515869, + "rewards/rejected": -8.429800987243652, + "step": 7940 + }, + { + "epoch": 1.91, + "learning_rate": 2.0123908004991977e-07, + "logits/chosen": -2.6525394916534424, + "logits/rejected": -2.556516408920288, + "logps/chosen": -341.7573547363281, + "logps/rejected": -394.6011657714844, + "loss": 0.0849, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8995146751403809, + "rewards/margins": 9.777068138122559, + "rewards/rejected": -10.676582336425781, + "step": 7950 + }, + { + "epoch": 1.92, + "learning_rate": 2.0079336780174718e-07, + "logits/chosen": -2.611630916595459, + "logits/rejected": -2.531885862350464, + "logps/chosen": -248.10647583007812, + "logps/rejected": -362.81219482421875, + "loss": 0.1108, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.161255121231079, + "rewards/margins": 6.243734836578369, + "rewards/rejected": -8.404989242553711, + "step": 7960 + }, + { + "epoch": 1.92, + "learning_rate": 2.003476555535746e-07, + "logits/chosen": -2.427172899246216, + "logits/rejected": -2.468855381011963, + "logps/chosen": -229.8157958984375, + "logps/rejected": -236.3346405029297, + "loss": 0.1272, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.4341981410980225, + "rewards/margins": 6.164163112640381, + "rewards/rejected": -7.598361015319824, + "step": 7970 + }, + { + "epoch": 1.92, + "learning_rate": 1.9990194330540203e-07, + "logits/chosen": -2.526604413986206, + "logits/rejected": -2.4939637184143066, + "logps/chosen": -400.15972900390625, + "logps/rejected": -459.98602294921875, + "loss": 0.0882, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.4898955225944519, + "rewards/margins": 11.614582061767578, + "rewards/rejected": -11.124686241149902, + "step": 7980 + }, + { + "epoch": 1.92, + "learning_rate": 1.9945623105722944e-07, + "logits/chosen": -2.436354160308838, + "logits/rejected": -2.392300844192505, + "logps/chosen": -187.17396545410156, + "logps/rejected": -299.4290771484375, + "loss": 0.1234, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.2302552461624146, + "rewards/margins": 9.613070487976074, + "rewards/rejected": -10.843326568603516, + "step": 7990 + }, + { + "epoch": 1.93, + "learning_rate": 1.9901051880905687e-07, + "logits/chosen": -2.5178189277648926, + "logits/rejected": -2.3450796604156494, + "logps/chosen": -333.3482360839844, + "logps/rejected": -289.17437744140625, + "loss": 0.1152, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9208691120147705, + "rewards/margins": 7.058518886566162, + "rewards/rejected": -8.979388236999512, + "step": 8000 + }, + { + "epoch": 1.93, + "eval_logits/chosen": -2.3184802532196045, + "eval_logits/rejected": -2.2822105884552, + "eval_logps/chosen": -247.2084197998047, + "eval_logps/rejected": -265.16278076171875, + "eval_loss": 0.5280002951622009, + "eval_rewards/accuracies": 0.6825000047683716, + "eval_rewards/chosen": -4.528088569641113, + "eval_rewards/margins": 3.0351107120513916, + "eval_rewards/rejected": -7.563199996948242, + "eval_runtime": 131.0428, + "eval_samples_per_second": 24.084, + "eval_steps_per_second": 0.382, + "step": 8000 + }, + { + "epoch": 1.93, + "learning_rate": 1.985648065608843e-07, + "logits/chosen": -2.470599889755249, + "logits/rejected": -2.4762444496154785, + "logps/chosen": -343.68865966796875, + "logps/rejected": -401.91668701171875, + "loss": 0.1266, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3179447650909424, + "rewards/margins": 8.005379676818848, + "rewards/rejected": -9.323324203491211, + "step": 8010 + }, + { + "epoch": 1.93, + "learning_rate": 1.981190943127117e-07, + "logits/chosen": -2.6357977390289307, + "logits/rejected": -2.570206880569458, + "logps/chosen": -381.3616638183594, + "logps/rejected": -264.6903381347656, + "loss": 0.0495, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46273738145828247, + "rewards/margins": 6.4893598556518555, + "rewards/rejected": -6.952097415924072, + "step": 8020 + }, + { + "epoch": 1.93, + "learning_rate": 1.9767338206453913e-07, + "logits/chosen": -2.45924711227417, + "logits/rejected": -2.4058728218078613, + "logps/chosen": -238.2863006591797, + "logps/rejected": -353.55218505859375, + "loss": 0.1005, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.26092714071273804, + "rewards/margins": 10.053898811340332, + "rewards/rejected": -10.314826011657715, + "step": 8030 + }, + { + "epoch": 1.94, + "learning_rate": 1.9722766981636653e-07, + "logits/chosen": -2.3735110759735107, + "logits/rejected": -2.41943621635437, + "logps/chosen": -296.28955078125, + "logps/rejected": -335.2906188964844, + "loss": 0.1505, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.4181472063064575, + "rewards/margins": 6.552776336669922, + "rewards/rejected": -7.97092342376709, + "step": 8040 + }, + { + "epoch": 1.94, + "learning_rate": 1.9678195756819396e-07, + "logits/chosen": -2.6774322986602783, + "logits/rejected": -2.6747498512268066, + "logps/chosen": -339.8834533691406, + "logps/rejected": -366.2880554199219, + "loss": 0.1936, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.47035661339759827, + "rewards/margins": 8.600381851196289, + "rewards/rejected": -8.130025863647461, + "step": 8050 + }, + { + "epoch": 1.94, + "learning_rate": 1.963362453200214e-07, + "logits/chosen": -2.3787899017333984, + "logits/rejected": -2.351062774658203, + "logps/chosen": -216.5155792236328, + "logps/rejected": -299.8763732910156, + "loss": 0.1035, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.026233578100800514, + "rewards/margins": 8.369463920593262, + "rewards/rejected": -8.343230247497559, + "step": 8060 + }, + { + "epoch": 1.94, + "learning_rate": 1.958905330718488e-07, + "logits/chosen": -2.3866686820983887, + "logits/rejected": -2.2519049644470215, + "logps/chosen": -160.90786743164062, + "logps/rejected": -215.9598388671875, + "loss": 0.0546, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.393612265586853, + "rewards/margins": 5.775225639343262, + "rewards/rejected": -6.168837547302246, + "step": 8070 + }, + { + "epoch": 1.94, + "learning_rate": 1.9544482082367622e-07, + "logits/chosen": -2.560020923614502, + "logits/rejected": -2.4744629859924316, + "logps/chosen": -302.98248291015625, + "logps/rejected": -415.14093017578125, + "loss": 0.1147, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.344792127609253, + "rewards/margins": 7.479535102844238, + "rewards/rejected": -8.82432746887207, + "step": 8080 + }, + { + "epoch": 1.95, + "learning_rate": 1.9499910857550365e-07, + "logits/chosen": -2.408642292022705, + "logits/rejected": -2.4067068099975586, + "logps/chosen": -286.95855712890625, + "logps/rejected": -325.5104064941406, + "loss": 0.0595, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.48605671525001526, + "rewards/margins": 7.812626838684082, + "rewards/rejected": -7.3265700340271, + "step": 8090 + }, + { + "epoch": 1.95, + "learning_rate": 1.9455339632733105e-07, + "logits/chosen": -2.468109130859375, + "logits/rejected": -2.3929147720336914, + "logps/chosen": -226.5298309326172, + "logps/rejected": -315.2115173339844, + "loss": 0.0785, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.891699194908142, + "rewards/margins": 7.080783843994141, + "rewards/rejected": -8.97248363494873, + "step": 8100 + }, + { + "epoch": 1.95, + "learning_rate": 1.9410768407915848e-07, + "logits/chosen": -2.4556844234466553, + "logits/rejected": -2.4953293800354004, + "logps/chosen": -217.6414031982422, + "logps/rejected": -328.6346740722656, + "loss": 0.114, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.800833523273468, + "rewards/margins": 7.14700174331665, + "rewards/rejected": -7.9478349685668945, + "step": 8110 + }, + { + "epoch": 1.95, + "learning_rate": 1.9366197183098589e-07, + "logits/chosen": -2.6187987327575684, + "logits/rejected": -2.5955591201782227, + "logps/chosen": -189.52865600585938, + "logps/rejected": -362.27752685546875, + "loss": 0.09, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21108195185661316, + "rewards/margins": 11.469135284423828, + "rewards/rejected": -11.258051872253418, + "step": 8120 + }, + { + "epoch": 1.96, + "learning_rate": 1.9321625958281332e-07, + "logits/chosen": -2.695669651031494, + "logits/rejected": -2.5160300731658936, + "logps/chosen": -243.835205078125, + "logps/rejected": -310.9700012207031, + "loss": 0.1008, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0049825431779026985, + "rewards/margins": 9.520586967468262, + "rewards/rejected": -9.515605926513672, + "step": 8130 + }, + { + "epoch": 1.96, + "learning_rate": 1.9277054733464074e-07, + "logits/chosen": -2.502044200897217, + "logits/rejected": -2.4987478256225586, + "logps/chosen": -233.9998321533203, + "logps/rejected": -363.33001708984375, + "loss": 0.0885, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0708898305892944, + "rewards/margins": 10.336236000061035, + "rewards/rejected": -9.26534652709961, + "step": 8140 + }, + { + "epoch": 1.96, + "learning_rate": 1.9232483508646815e-07, + "logits/chosen": -2.521312952041626, + "logits/rejected": -2.4630274772644043, + "logps/chosen": -301.7711486816406, + "logps/rejected": -283.7261657714844, + "loss": 0.1093, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.0350699424743652, + "rewards/margins": 10.709259986877441, + "rewards/rejected": -8.674189567565918, + "step": 8150 + }, + { + "epoch": 1.96, + "learning_rate": 1.9187912283829558e-07, + "logits/chosen": -2.707197904586792, + "logits/rejected": -2.4446969032287598, + "logps/chosen": -221.1367950439453, + "logps/rejected": -221.2825927734375, + "loss": 0.1126, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.3409653306007385, + "rewards/margins": 5.505112648010254, + "rewards/rejected": -5.846077919006348, + "step": 8160 + }, + { + "epoch": 1.97, + "learning_rate": 1.91433410590123e-07, + "logits/chosen": -2.6052212715148926, + "logits/rejected": -2.6014931201934814, + "logps/chosen": -276.29949951171875, + "logps/rejected": -361.728759765625, + "loss": 0.1135, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.200992465019226, + "rewards/margins": 8.519786834716797, + "rewards/rejected": -7.318794250488281, + "step": 8170 + }, + { + "epoch": 1.97, + "learning_rate": 1.909876983419504e-07, + "logits/chosen": -2.450148105621338, + "logits/rejected": -2.456796169281006, + "logps/chosen": -244.544921875, + "logps/rejected": -320.2922058105469, + "loss": 0.1229, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8630601167678833, + "rewards/margins": 10.178804397583008, + "rewards/rejected": -9.315743446350098, + "step": 8180 + }, + { + "epoch": 1.97, + "learning_rate": 1.9054198609377787e-07, + "logits/chosen": -2.668520212173462, + "logits/rejected": -2.452944755554199, + "logps/chosen": -278.19049072265625, + "logps/rejected": -261.8446350097656, + "loss": 0.0965, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.7886543869972229, + "rewards/margins": 7.251919746398926, + "rewards/rejected": -8.040573120117188, + "step": 8190 + }, + { + "epoch": 1.97, + "learning_rate": 1.900962738456053e-07, + "logits/chosen": -2.539135217666626, + "logits/rejected": -2.600917100906372, + "logps/chosen": -190.87591552734375, + "logps/rejected": -362.54400634765625, + "loss": 0.081, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18095922470092773, + "rewards/margins": 11.730324745178223, + "rewards/rejected": -11.911282539367676, + "step": 8200 + }, + { + "epoch": 1.98, + "learning_rate": 1.896505615974327e-07, + "logits/chosen": -2.567045211791992, + "logits/rejected": -2.5732991695404053, + "logps/chosen": -203.4150390625, + "logps/rejected": -401.07196044921875, + "loss": 0.0967, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10424991697072983, + "rewards/margins": 10.450787544250488, + "rewards/rejected": -10.346536636352539, + "step": 8210 + }, + { + "epoch": 1.98, + "learning_rate": 1.8920484934926013e-07, + "logits/chosen": -2.520174980163574, + "logits/rejected": -2.498842716217041, + "logps/chosen": -187.52316284179688, + "logps/rejected": -244.5631866455078, + "loss": 0.0831, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6810253858566284, + "rewards/margins": 5.883547782897949, + "rewards/rejected": -6.564573764801025, + "step": 8220 + }, + { + "epoch": 1.98, + "learning_rate": 1.8875913710108753e-07, + "logits/chosen": -2.5760746002197266, + "logits/rejected": -2.4093546867370605, + "logps/chosen": -254.4566650390625, + "logps/rejected": -239.0888214111328, + "loss": 0.1016, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9175148010253906, + "rewards/margins": 6.227782249450684, + "rewards/rejected": -8.145296096801758, + "step": 8230 + }, + { + "epoch": 1.98, + "learning_rate": 1.8831342485291496e-07, + "logits/chosen": -2.740858793258667, + "logits/rejected": -2.6939857006073, + "logps/chosen": -241.9950714111328, + "logps/rejected": -368.31597900390625, + "loss": 0.0649, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09666530042886734, + "rewards/margins": 9.49067497253418, + "rewards/rejected": -9.39400863647461, + "step": 8240 + }, + { + "epoch": 1.99, + "learning_rate": 1.878677126047424e-07, + "logits/chosen": -2.4791433811187744, + "logits/rejected": -2.488866090774536, + "logps/chosen": -301.0170593261719, + "logps/rejected": -308.7914733886719, + "loss": 0.1834, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.1338343620300293, + "rewards/margins": 10.202335357666016, + "rewards/rejected": -10.336170196533203, + "step": 8250 + }, + { + "epoch": 1.99, + "learning_rate": 1.874220003565698e-07, + "logits/chosen": -2.692753314971924, + "logits/rejected": -2.6837029457092285, + "logps/chosen": -295.8060302734375, + "logps/rejected": -336.2331848144531, + "loss": 0.0899, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.5804893970489502, + "rewards/margins": 8.181416511535645, + "rewards/rejected": -9.761906623840332, + "step": 8260 + }, + { + "epoch": 1.99, + "learning_rate": 1.8697628810839722e-07, + "logits/chosen": -2.711230993270874, + "logits/rejected": -2.5777366161346436, + "logps/chosen": -431.20703125, + "logps/rejected": -328.12554931640625, + "loss": 0.0866, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.21979090571403503, + "rewards/margins": 8.251157760620117, + "rewards/rejected": -8.031366348266602, + "step": 8270 + }, + { + "epoch": 1.99, + "learning_rate": 1.8653057586022465e-07, + "logits/chosen": -2.6268134117126465, + "logits/rejected": -2.579261064529419, + "logps/chosen": -268.69842529296875, + "logps/rejected": -335.99249267578125, + "loss": 0.0869, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.1315809488296509, + "rewards/margins": 9.498881340026855, + "rewards/rejected": -8.367301940917969, + "step": 8280 + }, + { + "epoch": 2.0, + "learning_rate": 1.8608486361205205e-07, + "logits/chosen": -2.5360658168792725, + "logits/rejected": -2.4947776794433594, + "logps/chosen": -269.78399658203125, + "logps/rejected": -336.0529479980469, + "loss": 0.1276, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.6951628923416138, + "rewards/margins": 8.493033409118652, + "rewards/rejected": -9.188196182250977, + "step": 8290 + }, + { + "epoch": 2.0, + "learning_rate": 1.8563915136387948e-07, + "logits/chosen": -2.577726125717163, + "logits/rejected": -2.466395616531372, + "logps/chosen": -268.5062561035156, + "logps/rejected": -222.9132843017578, + "loss": 0.0845, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9850286245346069, + "rewards/margins": 5.597483158111572, + "rewards/rejected": -6.582511901855469, + "step": 8300 + }, + { + "epoch": 2.0, + "learning_rate": 1.8519343911570688e-07, + "logits/chosen": -2.3435440063476562, + "logits/rejected": -2.271286725997925, + "logps/chosen": -223.3821563720703, + "logps/rejected": -361.36309814453125, + "loss": 0.1232, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.5280535221099854, + "rewards/margins": 8.094234466552734, + "rewards/rejected": -9.62228775024414, + "step": 8310 + }, + { + "epoch": 2.0, + "learning_rate": 1.8474772686753431e-07, + "logits/chosen": -2.581578493118286, + "logits/rejected": -2.479897975921631, + "logps/chosen": -245.2767791748047, + "logps/rejected": -291.4203186035156, + "loss": 0.0497, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8183133006095886, + "rewards/margins": 8.31879711151123, + "rewards/rejected": -9.137109756469727, + "step": 8320 + }, + { + "epoch": 2.0, + "learning_rate": 1.8430201461936174e-07, + "logits/chosen": -2.479945182800293, + "logits/rejected": -2.5628583431243896, + "logps/chosen": -193.89761352539062, + "logps/rejected": -321.42535400390625, + "loss": 0.0361, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5563122630119324, + "rewards/margins": 8.874561309814453, + "rewards/rejected": -9.43087387084961, + "step": 8330 + }, + { + "epoch": 2.01, + "learning_rate": 1.8385630237118915e-07, + "logits/chosen": -2.727910041809082, + "logits/rejected": -2.6726880073547363, + "logps/chosen": -283.5306701660156, + "logps/rejected": -349.61822509765625, + "loss": 0.0322, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.885776698589325, + "rewards/margins": 9.441603660583496, + "rewards/rejected": -10.327380180358887, + "step": 8340 + }, + { + "epoch": 2.01, + "learning_rate": 1.8341059012301658e-07, + "logits/chosen": -2.425305128097534, + "logits/rejected": -2.476073980331421, + "logps/chosen": -293.70501708984375, + "logps/rejected": -441.30548095703125, + "loss": 0.0336, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.2522985935211182, + "rewards/margins": 9.488914489746094, + "rewards/rejected": -10.741212844848633, + "step": 8350 + }, + { + "epoch": 2.01, + "learning_rate": 1.82964877874844e-07, + "logits/chosen": -2.480833053588867, + "logits/rejected": -2.5968728065490723, + "logps/chosen": -216.1871337890625, + "logps/rejected": -294.4857482910156, + "loss": 0.0282, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.41751059889793396, + "rewards/margins": 7.914503574371338, + "rewards/rejected": -8.332013130187988, + "step": 8360 + }, + { + "epoch": 2.01, + "learning_rate": 1.825191656266714e-07, + "logits/chosen": -2.640190601348877, + "logits/rejected": -2.6222071647644043, + "logps/chosen": -299.1702880859375, + "logps/rejected": -288.4884033203125, + "loss": 0.0416, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.215740367770195, + "rewards/margins": 8.871757507324219, + "rewards/rejected": -9.087498664855957, + "step": 8370 + }, + { + "epoch": 2.02, + "learning_rate": 1.8207345337849884e-07, + "logits/chosen": -2.568044424057007, + "logits/rejected": -2.587066411972046, + "logps/chosen": -250.3285675048828, + "logps/rejected": -306.30609130859375, + "loss": 0.0412, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0288197994232178, + "rewards/margins": 8.337857246398926, + "rewards/rejected": -9.366676330566406, + "step": 8380 + }, + { + "epoch": 2.02, + "learning_rate": 1.8162774113032624e-07, + "logits/chosen": -2.292970657348633, + "logits/rejected": -2.2049508094787598, + "logps/chosen": -251.81686401367188, + "logps/rejected": -368.78204345703125, + "loss": 0.0376, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.1272547245025635, + "rewards/margins": 9.117366790771484, + "rewards/rejected": -10.244623184204102, + "step": 8390 + }, + { + "epoch": 2.02, + "learning_rate": 1.8118202888215367e-07, + "logits/chosen": -2.6245510578155518, + "logits/rejected": -2.570723295211792, + "logps/chosen": -314.4761047363281, + "logps/rejected": -315.69873046875, + "loss": 0.0385, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1873103380203247, + "rewards/margins": 7.634018898010254, + "rewards/rejected": -8.821329116821289, + "step": 8400 + }, + { + "epoch": 2.02, + "eval_logits/chosen": -2.321371555328369, + "eval_logits/rejected": -2.2849819660186768, + "eval_logps/chosen": -251.5195770263672, + "eval_logps/rejected": -271.35797119140625, + "eval_loss": 0.5478196740150452, + "eval_rewards/accuracies": 0.6800000071525574, + "eval_rewards/chosen": -4.959201335906982, + "eval_rewards/margins": 3.2235164642333984, + "eval_rewards/rejected": -8.182718276977539, + "eval_runtime": 133.6443, + "eval_samples_per_second": 23.615, + "eval_steps_per_second": 0.374, + "step": 8400 + }, + { + "epoch": 2.02, + "learning_rate": 1.807363166339811e-07, + "logits/chosen": -2.615668535232544, + "logits/rejected": -2.552216053009033, + "logps/chosen": -241.84475708007812, + "logps/rejected": -424.7079162597656, + "loss": 0.0216, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0549558401107788, + "rewards/margins": 10.097718238830566, + "rewards/rejected": -11.152674674987793, + "step": 8410 + }, + { + "epoch": 2.03, + "learning_rate": 1.802906043858085e-07, + "logits/chosen": -2.521404266357422, + "logits/rejected": -2.5072107315063477, + "logps/chosen": -238.27090454101562, + "logps/rejected": -330.8056640625, + "loss": 0.0551, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.7115432024002075, + "rewards/margins": 9.558634757995605, + "rewards/rejected": -10.270176887512207, + "step": 8420 + }, + { + "epoch": 2.03, + "learning_rate": 1.7984489213763593e-07, + "logits/chosen": -2.6884095668792725, + "logits/rejected": -2.680422306060791, + "logps/chosen": -277.35272216796875, + "logps/rejected": -377.1102600097656, + "loss": 0.0753, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.961971640586853, + "rewards/margins": 10.981986999511719, + "rewards/rejected": -11.943960189819336, + "step": 8430 + }, + { + "epoch": 2.03, + "learning_rate": 1.7939917988946336e-07, + "logits/chosen": -2.333134174346924, + "logits/rejected": -2.346411943435669, + "logps/chosen": -241.6659698486328, + "logps/rejected": -254.75167846679688, + "loss": 0.0297, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.6574668884277344, + "rewards/margins": 7.7597527503967285, + "rewards/rejected": -9.417219161987305, + "step": 8440 + }, + { + "epoch": 2.03, + "learning_rate": 1.7895346764129076e-07, + "logits/chosen": -2.623356342315674, + "logits/rejected": -2.467191696166992, + "logps/chosen": -314.97674560546875, + "logps/rejected": -365.6291809082031, + "loss": 0.0253, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.3737142086029053, + "rewards/margins": 11.685809135437012, + "rewards/rejected": -10.312093734741211, + "step": 8450 + }, + { + "epoch": 2.04, + "learning_rate": 1.785077553931182e-07, + "logits/chosen": -2.6209094524383545, + "logits/rejected": -2.5927417278289795, + "logps/chosen": -237.51516723632812, + "logps/rejected": -387.4162292480469, + "loss": 0.0216, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.13859105110168457, + "rewards/margins": 10.451630592346191, + "rewards/rejected": -10.590221405029297, + "step": 8460 + }, + { + "epoch": 2.04, + "learning_rate": 1.780620431449456e-07, + "logits/chosen": -2.4766571521759033, + "logits/rejected": -2.3246235847473145, + "logps/chosen": -221.49423217773438, + "logps/rejected": -314.0611267089844, + "loss": 0.0348, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.025004005059599876, + "rewards/margins": 11.23481559753418, + "rewards/rejected": -11.209811210632324, + "step": 8470 + }, + { + "epoch": 2.04, + "learning_rate": 1.7761633089677302e-07, + "logits/chosen": -2.6109936237335205, + "logits/rejected": -2.602966785430908, + "logps/chosen": -266.33184814453125, + "logps/rejected": -320.2018127441406, + "loss": 0.0314, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.632354736328125, + "rewards/margins": 8.30778980255127, + "rewards/rejected": -9.940142631530762, + "step": 8480 + }, + { + "epoch": 2.04, + "learning_rate": 1.7717061864860045e-07, + "logits/chosen": -2.709364891052246, + "logits/rejected": -2.57318377494812, + "logps/chosen": -276.8520202636719, + "logps/rejected": -289.3186950683594, + "loss": 0.0293, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3175122141838074, + "rewards/margins": 9.372842788696289, + "rewards/rejected": -9.055330276489258, + "step": 8490 + }, + { + "epoch": 2.05, + "learning_rate": 1.7672490640042786e-07, + "logits/chosen": -2.52427339553833, + "logits/rejected": -2.4743118286132812, + "logps/chosen": -314.8677673339844, + "logps/rejected": -344.07421875, + "loss": 0.0234, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.6964309215545654, + "rewards/margins": 10.262825012207031, + "rewards/rejected": -11.959256172180176, + "step": 8500 + }, + { + "epoch": 2.05, + "learning_rate": 1.7627919415225529e-07, + "logits/chosen": -2.570406913757324, + "logits/rejected": -2.551332950592041, + "logps/chosen": -209.91061401367188, + "logps/rejected": -333.5925598144531, + "loss": 0.0214, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007298028562217951, + "rewards/margins": 13.005849838256836, + "rewards/rejected": -13.013150215148926, + "step": 8510 + }, + { + "epoch": 2.05, + "learning_rate": 1.7583348190408272e-07, + "logits/chosen": -2.222724437713623, + "logits/rejected": -2.300708293914795, + "logps/chosen": -190.5357666015625, + "logps/rejected": -275.17681884765625, + "loss": 0.0335, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.722133219242096, + "rewards/margins": 11.837811470031738, + "rewards/rejected": -11.115676879882812, + "step": 8520 + }, + { + "epoch": 2.05, + "learning_rate": 1.7538776965591012e-07, + "logits/chosen": -2.618381977081299, + "logits/rejected": -2.480579137802124, + "logps/chosen": -226.8935546875, + "logps/rejected": -307.2865295410156, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6915401220321655, + "rewards/margins": 9.894481658935547, + "rewards/rejected": -11.586021423339844, + "step": 8530 + }, + { + "epoch": 2.06, + "learning_rate": 1.7494205740773757e-07, + "logits/chosen": -2.4632339477539062, + "logits/rejected": -2.4606950283050537, + "logps/chosen": -268.89569091796875, + "logps/rejected": -369.653564453125, + "loss": 0.0414, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.4387400150299072, + "rewards/margins": 9.543800354003906, + "rewards/rejected": -11.982542037963867, + "step": 8540 + }, + { + "epoch": 2.06, + "learning_rate": 1.7449634515956498e-07, + "logits/chosen": -2.717900276184082, + "logits/rejected": -2.6395182609558105, + "logps/chosen": -285.09735107421875, + "logps/rejected": -365.9479064941406, + "loss": 0.0309, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6911836862564087, + "rewards/margins": 10.700658798217773, + "rewards/rejected": -11.39184284210205, + "step": 8550 + }, + { + "epoch": 2.06, + "learning_rate": 1.740506329113924e-07, + "logits/chosen": -2.54192852973938, + "logits/rejected": -2.5281126499176025, + "logps/chosen": -247.7664794921875, + "logps/rejected": -295.73883056640625, + "loss": 0.0507, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8046247959136963, + "rewards/margins": 8.388303756713867, + "rewards/rejected": -10.192930221557617, + "step": 8560 + }, + { + "epoch": 2.06, + "learning_rate": 1.7360492066321984e-07, + "logits/chosen": -2.619554042816162, + "logits/rejected": -2.523376703262329, + "logps/chosen": -298.53448486328125, + "logps/rejected": -338.39825439453125, + "loss": 0.0318, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2122616469860077, + "rewards/margins": 10.543150901794434, + "rewards/rejected": -10.755414009094238, + "step": 8570 + }, + { + "epoch": 2.06, + "learning_rate": 1.7315920841504724e-07, + "logits/chosen": -2.56117582321167, + "logits/rejected": -2.4610273838043213, + "logps/chosen": -262.32806396484375, + "logps/rejected": -287.43170166015625, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9256149530410767, + "rewards/margins": 8.507223129272461, + "rewards/rejected": -9.432836532592773, + "step": 8580 + }, + { + "epoch": 2.07, + "learning_rate": 1.7271349616687467e-07, + "logits/chosen": -2.2090346813201904, + "logits/rejected": -2.18257737159729, + "logps/chosen": -241.55361938476562, + "logps/rejected": -426.50323486328125, + "loss": 0.0443, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20222394168376923, + "rewards/margins": 12.990568161010742, + "rewards/rejected": -12.788345336914062, + "step": 8590 + }, + { + "epoch": 2.07, + "learning_rate": 1.722677839187021e-07, + "logits/chosen": -2.5544230937957764, + "logits/rejected": -2.4997313022613525, + "logps/chosen": -254.62548828125, + "logps/rejected": -347.44830322265625, + "loss": 0.0162, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7101238965988159, + "rewards/margins": 11.792463302612305, + "rewards/rejected": -12.502588272094727, + "step": 8600 + }, + { + "epoch": 2.07, + "learning_rate": 1.718220716705295e-07, + "logits/chosen": -2.552060127258301, + "logits/rejected": -2.5133891105651855, + "logps/chosen": -241.55874633789062, + "logps/rejected": -329.78887939453125, + "loss": 0.0175, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0036016940139234066, + "rewards/margins": 11.071989059448242, + "rewards/rejected": -11.068387031555176, + "step": 8610 + }, + { + "epoch": 2.07, + "learning_rate": 1.7137635942235693e-07, + "logits/chosen": -2.548405170440674, + "logits/rejected": -2.525132656097412, + "logps/chosen": -299.3553771972656, + "logps/rejected": -380.62060546875, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.139615774154663, + "rewards/margins": 12.482218742370605, + "rewards/rejected": -13.621835708618164, + "step": 8620 + }, + { + "epoch": 2.08, + "learning_rate": 1.7093064717418433e-07, + "logits/chosen": -2.564290761947632, + "logits/rejected": -2.458705425262451, + "logps/chosen": -258.10638427734375, + "logps/rejected": -362.2889404296875, + "loss": 0.0259, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.686469554901123, + "rewards/margins": 12.353072166442871, + "rewards/rejected": -13.039543151855469, + "step": 8630 + }, + { + "epoch": 2.08, + "learning_rate": 1.7048493492601176e-07, + "logits/chosen": -2.5656790733337402, + "logits/rejected": -2.448558807373047, + "logps/chosen": -276.85089111328125, + "logps/rejected": -294.8529968261719, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8730595707893372, + "rewards/margins": 10.170073509216309, + "rewards/rejected": -11.043131828308105, + "step": 8640 + }, + { + "epoch": 2.08, + "learning_rate": 1.700392226778392e-07, + "logits/chosen": -2.4552650451660156, + "logits/rejected": -2.6275129318237305, + "logps/chosen": -256.6981506347656, + "logps/rejected": -329.18109130859375, + "loss": 0.0089, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.57915198802948, + "rewards/margins": 10.759608268737793, + "rewards/rejected": -12.338760375976562, + "step": 8650 + }, + { + "epoch": 2.08, + "learning_rate": 1.695935104296666e-07, + "logits/chosen": -2.71759033203125, + "logits/rejected": -2.5608346462249756, + "logps/chosen": -354.6686096191406, + "logps/rejected": -412.43792724609375, + "loss": 0.0285, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8514503240585327, + "rewards/margins": 11.856977462768555, + "rewards/rejected": -11.00552749633789, + "step": 8660 + }, + { + "epoch": 2.09, + "learning_rate": 1.6914779818149402e-07, + "logits/chosen": -2.5128674507141113, + "logits/rejected": -2.454211711883545, + "logps/chosen": -361.24237060546875, + "logps/rejected": -447.9710998535156, + "loss": 0.0392, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2542957663536072, + "rewards/margins": 11.777315139770508, + "rewards/rejected": -11.523018836975098, + "step": 8670 + }, + { + "epoch": 2.09, + "learning_rate": 1.6870208593332145e-07, + "logits/chosen": -2.7541584968566895, + "logits/rejected": -2.6328043937683105, + "logps/chosen": -364.6342468261719, + "logps/rejected": -389.0901794433594, + "loss": 0.0273, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.827517032623291, + "rewards/margins": 11.085371971130371, + "rewards/rejected": -11.91288948059082, + "step": 8680 + }, + { + "epoch": 2.09, + "learning_rate": 1.6825637368514886e-07, + "logits/chosen": -2.535529375076294, + "logits/rejected": -2.356951951980591, + "logps/chosen": -330.3153381347656, + "logps/rejected": -280.31146240234375, + "loss": 0.0289, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03817791864275932, + "rewards/margins": 10.486616134643555, + "rewards/rejected": -10.52479362487793, + "step": 8690 + }, + { + "epoch": 2.09, + "learning_rate": 1.6781066143697628e-07, + "logits/chosen": -2.3027021884918213, + "logits/rejected": -2.4584460258483887, + "logps/chosen": -214.76657104492188, + "logps/rejected": -349.4528503417969, + "loss": 0.0642, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9708747863769531, + "rewards/margins": 11.003325462341309, + "rewards/rejected": -12.974202156066895, + "step": 8700 + }, + { + "epoch": 2.1, + "learning_rate": 1.673649491888037e-07, + "logits/chosen": -2.6496386528015137, + "logits/rejected": -2.502171277999878, + "logps/chosen": -388.9115295410156, + "logps/rejected": -360.45062255859375, + "loss": 0.0278, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.633139431476593, + "rewards/margins": 10.832531929016113, + "rewards/rejected": -11.465669631958008, + "step": 8710 + }, + { + "epoch": 2.1, + "learning_rate": 1.6691923694063112e-07, + "logits/chosen": -2.3688206672668457, + "logits/rejected": -2.2172188758850098, + "logps/chosen": -245.2628631591797, + "logps/rejected": -304.0785217285156, + "loss": 0.02, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.1016969680786133, + "rewards/margins": 9.271982192993164, + "rewards/rejected": -11.373678207397461, + "step": 8720 + }, + { + "epoch": 2.1, + "learning_rate": 1.6647352469245855e-07, + "logits/chosen": -2.4992682933807373, + "logits/rejected": -2.3491616249084473, + "logps/chosen": -293.691650390625, + "logps/rejected": -324.7965393066406, + "loss": 0.0352, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.12693621218204498, + "rewards/margins": 9.295028686523438, + "rewards/rejected": -9.421964645385742, + "step": 8730 + }, + { + "epoch": 2.1, + "learning_rate": 1.6602781244428595e-07, + "logits/chosen": -2.470156192779541, + "logits/rejected": -2.3872666358947754, + "logps/chosen": -224.85061645507812, + "logps/rejected": -311.27691650390625, + "loss": 0.0443, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.5656036138534546, + "rewards/margins": 9.098166465759277, + "rewards/rejected": -8.532563209533691, + "step": 8740 + }, + { + "epoch": 2.11, + "learning_rate": 1.6558210019611338e-07, + "logits/chosen": -2.6150567531585693, + "logits/rejected": -2.597041130065918, + "logps/chosen": -333.84515380859375, + "logps/rejected": -379.3749084472656, + "loss": 0.0155, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0011082172859460115, + "rewards/margins": 11.748732566833496, + "rewards/rejected": -11.749841690063477, + "step": 8750 + }, + { + "epoch": 2.11, + "learning_rate": 1.651363879479408e-07, + "logits/chosen": -2.4506280422210693, + "logits/rejected": -2.3053932189941406, + "logps/chosen": -224.02517700195312, + "logps/rejected": -269.5003356933594, + "loss": 0.0515, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.8991334438323975, + "rewards/margins": 7.669316291809082, + "rewards/rejected": -9.568449974060059, + "step": 8760 + }, + { + "epoch": 2.11, + "learning_rate": 1.646906756997682e-07, + "logits/chosen": -2.623814344406128, + "logits/rejected": -2.5576224327087402, + "logps/chosen": -396.124267578125, + "logps/rejected": -358.0867919921875, + "loss": 0.0279, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.060526300221681595, + "rewards/margins": 8.95967960357666, + "rewards/rejected": -9.0202054977417, + "step": 8770 + }, + { + "epoch": 2.11, + "learning_rate": 1.6424496345159564e-07, + "logits/chosen": -2.505446672439575, + "logits/rejected": -2.4441895484924316, + "logps/chosen": -311.7078857421875, + "logps/rejected": -399.08648681640625, + "loss": 0.034, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4337737560272217, + "rewards/margins": 12.172263145446777, + "rewards/rejected": -15.606036186218262, + "step": 8780 + }, + { + "epoch": 2.12, + "learning_rate": 1.6379925120342304e-07, + "logits/chosen": -2.5672106742858887, + "logits/rejected": -2.4547407627105713, + "logps/chosen": -197.42898559570312, + "logps/rejected": -345.4898376464844, + "loss": 0.038, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4426276683807373, + "rewards/margins": 10.940542221069336, + "rewards/rejected": -13.383171081542969, + "step": 8790 + }, + { + "epoch": 2.12, + "learning_rate": 1.6335353895525047e-07, + "logits/chosen": -2.58722186088562, + "logits/rejected": -2.413419246673584, + "logps/chosen": -265.8213195800781, + "logps/rejected": -407.5599060058594, + "loss": 0.0401, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5724204778671265, + "rewards/margins": 12.546555519104004, + "rewards/rejected": -11.97413444519043, + "step": 8800 + }, + { + "epoch": 2.12, + "eval_logits/chosen": -2.2326250076293945, + "eval_logits/rejected": -2.192549228668213, + "eval_logps/chosen": -263.7903747558594, + "eval_logps/rejected": -290.16241455078125, + "eval_loss": 0.5999376177787781, + "eval_rewards/accuracies": 0.6800000071525574, + "eval_rewards/chosen": -6.186283588409424, + "eval_rewards/margins": 3.8768796920776367, + "eval_rewards/rejected": -10.063161849975586, + "eval_runtime": 133.9581, + "eval_samples_per_second": 23.56, + "eval_steps_per_second": 0.373, + "step": 8800 + }, + { + "epoch": 2.12, + "learning_rate": 1.629078267070779e-07, + "logits/chosen": -2.5198941230773926, + "logits/rejected": -2.3723132610321045, + "logps/chosen": -258.646240234375, + "logps/rejected": -290.8374938964844, + "loss": 0.0229, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.203172445297241, + "rewards/margins": 7.861981391906738, + "rewards/rejected": -10.065153121948242, + "step": 8810 + }, + { + "epoch": 2.12, + "learning_rate": 1.624621144589053e-07, + "logits/chosen": -2.5503334999084473, + "logits/rejected": -2.5767085552215576, + "logps/chosen": -234.769775390625, + "logps/rejected": -370.20281982421875, + "loss": 0.0431, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.087836742401123, + "rewards/margins": 10.581189155578613, + "rewards/rejected": -11.669024467468262, + "step": 8820 + }, + { + "epoch": 2.13, + "learning_rate": 1.6201640221073273e-07, + "logits/chosen": -2.6861696243286133, + "logits/rejected": -2.4844136238098145, + "logps/chosen": -285.65020751953125, + "logps/rejected": -351.6511535644531, + "loss": 0.0398, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32659345865249634, + "rewards/margins": 11.284768104553223, + "rewards/rejected": -10.958174705505371, + "step": 8830 + }, + { + "epoch": 2.13, + "learning_rate": 1.6157068996256016e-07, + "logits/chosen": -2.508152484893799, + "logits/rejected": -2.475700855255127, + "logps/chosen": -244.31350708007812, + "logps/rejected": -310.83624267578125, + "loss": 0.032, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22390878200531006, + "rewards/margins": 10.004617691040039, + "rewards/rejected": -9.780708312988281, + "step": 8840 + }, + { + "epoch": 2.13, + "learning_rate": 1.6112497771438757e-07, + "logits/chosen": -2.5587594509124756, + "logits/rejected": -2.4645655155181885, + "logps/chosen": -262.1879577636719, + "logps/rejected": -341.3424377441406, + "loss": 0.0359, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.997057318687439, + "rewards/margins": 8.987771034240723, + "rewards/rejected": -10.984827041625977, + "step": 8850 + }, + { + "epoch": 2.13, + "learning_rate": 1.60679265466215e-07, + "logits/chosen": -2.526792049407959, + "logits/rejected": -2.438915967941284, + "logps/chosen": -366.299560546875, + "logps/rejected": -384.32452392578125, + "loss": 0.0383, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2667930126190186, + "rewards/margins": 14.111457824707031, + "rewards/rejected": -12.84466552734375, + "step": 8860 + }, + { + "epoch": 2.13, + "learning_rate": 1.602335532180424e-07, + "logits/chosen": -2.499552011489868, + "logits/rejected": -2.5095345973968506, + "logps/chosen": -297.86346435546875, + "logps/rejected": -385.1635437011719, + "loss": 0.0153, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30837756395339966, + "rewards/margins": 11.472579002380371, + "rewards/rejected": -11.780957221984863, + "step": 8870 + }, + { + "epoch": 2.14, + "learning_rate": 1.5978784096986985e-07, + "logits/chosen": -2.3323168754577637, + "logits/rejected": -2.251751661300659, + "logps/chosen": -308.49676513671875, + "logps/rejected": -427.1435546875, + "loss": 0.0341, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.2195223569869995, + "rewards/margins": 14.615882873535156, + "rewards/rejected": -15.835405349731445, + "step": 8880 + }, + { + "epoch": 2.14, + "learning_rate": 1.5934212872169728e-07, + "logits/chosen": -2.4065375328063965, + "logits/rejected": -2.4848408699035645, + "logps/chosen": -292.5906982421875, + "logps/rejected": -367.9299011230469, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9609495401382446, + "rewards/margins": 10.981375694274902, + "rewards/rejected": -11.942326545715332, + "step": 8890 + }, + { + "epoch": 2.14, + "learning_rate": 1.5889641647352469e-07, + "logits/chosen": -2.580648183822632, + "logits/rejected": -2.5684657096862793, + "logps/chosen": -402.7144775390625, + "logps/rejected": -552.5977783203125, + "loss": 0.0228, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04434077814221382, + "rewards/margins": 14.231958389282227, + "rewards/rejected": -14.187617301940918, + "step": 8900 + }, + { + "epoch": 2.14, + "learning_rate": 1.5845070422535212e-07, + "logits/chosen": -2.428067922592163, + "logits/rejected": -2.332951068878174, + "logps/chosen": -314.9877014160156, + "logps/rejected": -423.8435974121094, + "loss": 0.042, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3378690481185913, + "rewards/margins": 12.019502639770508, + "rewards/rejected": -13.357370376586914, + "step": 8910 + }, + { + "epoch": 2.15, + "learning_rate": 1.5800499197717954e-07, + "logits/chosen": -2.5505106449127197, + "logits/rejected": -2.5178935527801514, + "logps/chosen": -361.66827392578125, + "logps/rejected": -529.9429931640625, + "loss": 0.0282, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1555678844451904, + "rewards/margins": 14.764144897460938, + "rewards/rejected": -16.91971206665039, + "step": 8920 + }, + { + "epoch": 2.15, + "learning_rate": 1.5755927972900695e-07, + "logits/chosen": -2.3408703804016113, + "logits/rejected": -2.298884153366089, + "logps/chosen": -280.03021240234375, + "logps/rejected": -350.9729919433594, + "loss": 0.0227, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.21215252578258514, + "rewards/margins": 13.304191589355469, + "rewards/rejected": -13.516342163085938, + "step": 8930 + }, + { + "epoch": 2.15, + "learning_rate": 1.5711356748083438e-07, + "logits/chosen": -2.6067442893981934, + "logits/rejected": -2.525693416595459, + "logps/chosen": -270.82916259765625, + "logps/rejected": -361.995849609375, + "loss": 0.0336, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.351936101913452, + "rewards/margins": 11.557195663452148, + "rewards/rejected": -13.90913200378418, + "step": 8940 + }, + { + "epoch": 2.15, + "learning_rate": 1.566678552326618e-07, + "logits/chosen": -2.5198090076446533, + "logits/rejected": -2.43400239944458, + "logps/chosen": -255.52359008789062, + "logps/rejected": -274.2308349609375, + "loss": 0.0428, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012747669592499733, + "rewards/margins": 11.146052360534668, + "rewards/rejected": -11.15880012512207, + "step": 8950 + }, + { + "epoch": 2.16, + "learning_rate": 1.562221429844892e-07, + "logits/chosen": -2.5320210456848145, + "logits/rejected": -2.183725118637085, + "logps/chosen": -272.8175048828125, + "logps/rejected": -276.9789733886719, + "loss": 0.0248, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9705953598022461, + "rewards/margins": 11.81946086883545, + "rewards/rejected": -12.790057182312012, + "step": 8960 + }, + { + "epoch": 2.16, + "learning_rate": 1.5577643073631664e-07, + "logits/chosen": -2.348020076751709, + "logits/rejected": -2.427987575531006, + "logps/chosen": -219.4339141845703, + "logps/rejected": -344.757568359375, + "loss": 0.0338, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.443242073059082, + "rewards/margins": 10.88129997253418, + "rewards/rejected": -12.324542045593262, + "step": 8970 + }, + { + "epoch": 2.16, + "learning_rate": 1.5533071848814404e-07, + "logits/chosen": -2.473099946975708, + "logits/rejected": -2.4175493717193604, + "logps/chosen": -263.54644775390625, + "logps/rejected": -292.1980895996094, + "loss": 0.032, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9177930355072021, + "rewards/margins": 9.586542129516602, + "rewards/rejected": -11.504335403442383, + "step": 8980 + }, + { + "epoch": 2.16, + "learning_rate": 1.5488500623997147e-07, + "logits/chosen": -2.281341314315796, + "logits/rejected": -2.402763843536377, + "logps/chosen": -240.7819366455078, + "logps/rejected": -395.9696960449219, + "loss": 0.0214, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6088535785675049, + "rewards/margins": 10.593988418579102, + "rewards/rejected": -12.202842712402344, + "step": 8990 + }, + { + "epoch": 2.17, + "learning_rate": 1.544392939917989e-07, + "logits/chosen": -2.353468179702759, + "logits/rejected": -2.2845921516418457, + "logps/chosen": -203.794921875, + "logps/rejected": -358.76873779296875, + "loss": 0.0207, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8850176334381104, + "rewards/margins": 11.59618854522705, + "rewards/rejected": -13.481205940246582, + "step": 9000 + }, + { + "epoch": 2.17, + "learning_rate": 1.539935817436263e-07, + "logits/chosen": -2.5114352703094482, + "logits/rejected": -2.4373786449432373, + "logps/chosen": -254.0144805908203, + "logps/rejected": -368.44915771484375, + "loss": 0.0518, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6289116144180298, + "rewards/margins": 11.79938793182373, + "rewards/rejected": -12.428298950195312, + "step": 9010 + }, + { + "epoch": 2.17, + "learning_rate": 1.5354786949545373e-07, + "logits/chosen": -2.257683753967285, + "logits/rejected": -2.271179676055908, + "logps/chosen": -265.44110107421875, + "logps/rejected": -345.773193359375, + "loss": 0.0373, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7293133735656738, + "rewards/margins": 9.583789825439453, + "rewards/rejected": -10.313103675842285, + "step": 9020 + }, + { + "epoch": 2.17, + "learning_rate": 1.5310215724728116e-07, + "logits/chosen": -2.4884238243103027, + "logits/rejected": -2.3828537464141846, + "logps/chosen": -342.45391845703125, + "logps/rejected": -368.4088134765625, + "loss": 0.0346, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9505392909049988, + "rewards/margins": 10.661983489990234, + "rewards/rejected": -11.612524032592773, + "step": 9030 + }, + { + "epoch": 2.18, + "learning_rate": 1.5265644499910856e-07, + "logits/chosen": -2.3377490043640137, + "logits/rejected": -2.173964023590088, + "logps/chosen": -238.179931640625, + "logps/rejected": -385.8675537109375, + "loss": 0.0367, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6198335886001587, + "rewards/margins": 11.473251342773438, + "rewards/rejected": -12.093084335327148, + "step": 9040 + }, + { + "epoch": 2.18, + "learning_rate": 1.52210732750936e-07, + "logits/chosen": -2.2793710231781006, + "logits/rejected": -2.3451266288757324, + "logps/chosen": -192.71934509277344, + "logps/rejected": -380.89825439453125, + "loss": 0.027, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.118816375732422, + "rewards/margins": 10.546943664550781, + "rewards/rejected": -12.66576099395752, + "step": 9050 + }, + { + "epoch": 2.18, + "learning_rate": 1.517650205027634e-07, + "logits/chosen": -2.259404420852661, + "logits/rejected": -2.238851308822632, + "logps/chosen": -297.0094299316406, + "logps/rejected": -463.2301330566406, + "loss": 0.0187, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.126497268676758, + "rewards/margins": 14.385139465332031, + "rewards/rejected": -16.511638641357422, + "step": 9060 + }, + { + "epoch": 2.18, + "learning_rate": 1.5131930825459083e-07, + "logits/chosen": -2.097804546356201, + "logits/rejected": -2.201899528503418, + "logps/chosen": -254.0164794921875, + "logps/rejected": -321.0382385253906, + "loss": 0.0269, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39512887597084045, + "rewards/margins": 11.358099937438965, + "rewards/rejected": -11.753230094909668, + "step": 9070 + }, + { + "epoch": 2.19, + "learning_rate": 1.5087359600641826e-07, + "logits/chosen": -2.2721550464630127, + "logits/rejected": -2.36177396774292, + "logps/chosen": -297.96405029296875, + "logps/rejected": -514.7808837890625, + "loss": 0.0302, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2175419330596924, + "rewards/margins": 18.318937301635742, + "rewards/rejected": -17.101394653320312, + "step": 9080 + }, + { + "epoch": 2.19, + "learning_rate": 1.5042788375824566e-07, + "logits/chosen": -2.545522928237915, + "logits/rejected": -2.2234930992126465, + "logps/chosen": -256.73394775390625, + "logps/rejected": -322.40142822265625, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07155473530292511, + "rewards/margins": 11.270914077758789, + "rewards/rejected": -11.199357986450195, + "step": 9090 + }, + { + "epoch": 2.19, + "learning_rate": 1.499821715100731e-07, + "logits/chosen": -2.3779549598693848, + "logits/rejected": -2.292271375656128, + "logps/chosen": -313.8078308105469, + "logps/rejected": -379.89447021484375, + "loss": 0.0331, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6782283782958984, + "rewards/margins": 11.749468803405762, + "rewards/rejected": -12.427698135375977, + "step": 9100 + }, + { + "epoch": 2.19, + "learning_rate": 1.4953645926190052e-07, + "logits/chosen": -1.9888923168182373, + "logits/rejected": -1.8414385318756104, + "logps/chosen": -267.9323425292969, + "logps/rejected": -376.7801208496094, + "loss": 0.0258, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.582380771636963, + "rewards/margins": 10.754520416259766, + "rewards/rejected": -13.33690071105957, + "step": 9110 + }, + { + "epoch": 2.19, + "learning_rate": 1.4909074701372792e-07, + "logits/chosen": -2.2754411697387695, + "logits/rejected": -2.129660129547119, + "logps/chosen": -286.35699462890625, + "logps/rejected": -302.1578063964844, + "loss": 0.0297, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5871866345405579, + "rewards/margins": 12.948625564575195, + "rewards/rejected": -13.535810470581055, + "step": 9120 + }, + { + "epoch": 2.2, + "learning_rate": 1.4864503476555535e-07, + "logits/chosen": -2.2403056621551514, + "logits/rejected": -2.3537936210632324, + "logps/chosen": -229.68936157226562, + "logps/rejected": -356.7602233886719, + "loss": 0.0394, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.8653552532196045, + "rewards/margins": 10.975387573242188, + "rewards/rejected": -12.840744018554688, + "step": 9130 + }, + { + "epoch": 2.2, + "learning_rate": 1.4819932251738275e-07, + "logits/chosen": -2.247201442718506, + "logits/rejected": -2.318659782409668, + "logps/chosen": -205.4511260986328, + "logps/rejected": -338.76007080078125, + "loss": 0.0324, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.45393362641334534, + "rewards/margins": 12.517606735229492, + "rewards/rejected": -12.971542358398438, + "step": 9140 + }, + { + "epoch": 2.2, + "learning_rate": 1.4775361026921018e-07, + "logits/chosen": -2.3011038303375244, + "logits/rejected": -2.273911952972412, + "logps/chosen": -243.8572235107422, + "logps/rejected": -332.92230224609375, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.181957244873047, + "rewards/margins": 11.115522384643555, + "rewards/rejected": -13.297480583190918, + "step": 9150 + }, + { + "epoch": 2.2, + "learning_rate": 1.473078980210376e-07, + "logits/chosen": -2.3728251457214355, + "logits/rejected": -2.3403146266937256, + "logps/chosen": -302.58087158203125, + "logps/rejected": -281.1512145996094, + "loss": 0.0316, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.0829882621765137, + "rewards/margins": 8.896230697631836, + "rewards/rejected": -10.979219436645508, + "step": 9160 + }, + { + "epoch": 2.21, + "learning_rate": 1.46862185772865e-07, + "logits/chosen": -2.5427348613739014, + "logits/rejected": -2.3668570518493652, + "logps/chosen": -266.931884765625, + "logps/rejected": -340.0614929199219, + "loss": 0.0579, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6020795106887817, + "rewards/margins": 11.8134183883667, + "rewards/rejected": -12.415498733520508, + "step": 9170 + }, + { + "epoch": 2.21, + "learning_rate": 1.4641647352469244e-07, + "logits/chosen": -2.518648386001587, + "logits/rejected": -2.428374767303467, + "logps/chosen": -240.01028442382812, + "logps/rejected": -456.9060974121094, + "loss": 0.0194, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8487393260002136, + "rewards/margins": 13.704450607299805, + "rewards/rejected": -12.855712890625, + "step": 9180 + }, + { + "epoch": 2.21, + "learning_rate": 1.4597076127651987e-07, + "logits/chosen": -2.5084991455078125, + "logits/rejected": -2.3670554161071777, + "logps/chosen": -328.0556335449219, + "logps/rejected": -345.2771911621094, + "loss": 0.0201, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7515302300453186, + "rewards/margins": 11.287810325622559, + "rewards/rejected": -10.536279678344727, + "step": 9190 + }, + { + "epoch": 2.21, + "learning_rate": 1.4552504902834727e-07, + "logits/chosen": -2.4443435668945312, + "logits/rejected": -2.267771005630493, + "logps/chosen": -335.3075256347656, + "logps/rejected": -304.65771484375, + "loss": 0.0327, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6059294939041138, + "rewards/margins": 10.79488468170166, + "rewards/rejected": -11.400813102722168, + "step": 9200 + }, + { + "epoch": 2.21, + "eval_logits/chosen": -2.174819231033325, + "eval_logits/rejected": -2.1368966102600098, + "eval_logps/chosen": -258.5181579589844, + "eval_logps/rejected": -283.9364929199219, + "eval_loss": 0.6189878582954407, + "eval_rewards/accuracies": 0.6924999952316284, + "eval_rewards/chosen": -5.659062385559082, + "eval_rewards/margins": 3.781506299972534, + "eval_rewards/rejected": -9.440567970275879, + "eval_runtime": 134.5581, + "eval_samples_per_second": 23.455, + "eval_steps_per_second": 0.372, + "step": 9200 + }, + { + "epoch": 2.22, + "learning_rate": 1.450793367801747e-07, + "logits/chosen": -2.3690028190612793, + "logits/rejected": -2.3312017917633057, + "logps/chosen": -312.51177978515625, + "logps/rejected": -377.66497802734375, + "loss": 0.0456, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.505537509918213, + "rewards/margins": 11.056896209716797, + "rewards/rejected": -13.562433242797852, + "step": 9210 + }, + { + "epoch": 2.22, + "learning_rate": 1.4463362453200213e-07, + "logits/chosen": -2.3457257747650146, + "logits/rejected": -2.2673184871673584, + "logps/chosen": -229.06680297851562, + "logps/rejected": -268.76226806640625, + "loss": 0.0524, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9529112577438354, + "rewards/margins": 11.679685592651367, + "rewards/rejected": -12.632596015930176, + "step": 9220 + }, + { + "epoch": 2.22, + "learning_rate": 1.4418791228382956e-07, + "logits/chosen": -2.6629931926727295, + "logits/rejected": -2.4545934200286865, + "logps/chosen": -278.45648193359375, + "logps/rejected": -302.12237548828125, + "loss": 0.0338, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.7727594971656799, + "rewards/margins": 9.006889343261719, + "rewards/rejected": -9.77964973449707, + "step": 9230 + }, + { + "epoch": 2.22, + "learning_rate": 1.43742200035657e-07, + "logits/chosen": -2.4864726066589355, + "logits/rejected": -2.534029245376587, + "logps/chosen": -261.3238830566406, + "logps/rejected": -354.3556213378906, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2573440074920654, + "rewards/margins": 11.578329086303711, + "rewards/rejected": -12.835672378540039, + "step": 9240 + }, + { + "epoch": 2.23, + "learning_rate": 1.432964877874844e-07, + "logits/chosen": -2.523289203643799, + "logits/rejected": -2.1942954063415527, + "logps/chosen": -292.62457275390625, + "logps/rejected": -314.6795959472656, + "loss": 0.0425, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.6800389289855957, + "rewards/margins": 10.989094734191895, + "rewards/rejected": -13.669133186340332, + "step": 9250 + }, + { + "epoch": 2.23, + "learning_rate": 1.4285077553931182e-07, + "logits/chosen": -2.3437511920928955, + "logits/rejected": -2.215064287185669, + "logps/chosen": -241.7512664794922, + "logps/rejected": -386.5013122558594, + "loss": 0.032, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.681990146636963, + "rewards/margins": 11.260754585266113, + "rewards/rejected": -13.942744255065918, + "step": 9260 + }, + { + "epoch": 2.23, + "learning_rate": 1.4240506329113925e-07, + "logits/chosen": -2.45466947555542, + "logits/rejected": -2.4046263694763184, + "logps/chosen": -256.34759521484375, + "logps/rejected": -310.4484558105469, + "loss": 0.0384, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5057967901229858, + "rewards/margins": 10.491201400756836, + "rewards/rejected": -10.99699878692627, + "step": 9270 + }, + { + "epoch": 2.23, + "learning_rate": 1.4195935104296666e-07, + "logits/chosen": -2.4198827743530273, + "logits/rejected": -2.445094585418701, + "logps/chosen": -197.35574340820312, + "logps/rejected": -301.61248779296875, + "loss": 0.0289, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.170269012451172, + "rewards/margins": 7.697892665863037, + "rewards/rejected": -10.868162155151367, + "step": 9280 + }, + { + "epoch": 2.24, + "learning_rate": 1.4151363879479409e-07, + "logits/chosen": -2.3239498138427734, + "logits/rejected": -2.255598783493042, + "logps/chosen": -312.5681457519531, + "logps/rejected": -319.4649963378906, + "loss": 0.0298, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.643101453781128, + "rewards/margins": 9.848406791687012, + "rewards/rejected": -12.491508483886719, + "step": 9290 + }, + { + "epoch": 2.24, + "learning_rate": 1.4106792654662152e-07, + "logits/chosen": -2.5454611778259277, + "logits/rejected": -2.4494545459747314, + "logps/chosen": -295.7924499511719, + "logps/rejected": -341.0743713378906, + "loss": 0.0287, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5382768511772156, + "rewards/margins": 12.340603828430176, + "rewards/rejected": -11.802328109741211, + "step": 9300 + }, + { + "epoch": 2.24, + "learning_rate": 1.4062221429844892e-07, + "logits/chosen": -2.313345193862915, + "logits/rejected": -2.2957451343536377, + "logps/chosen": -271.76678466796875, + "logps/rejected": -425.8828125, + "loss": 0.0482, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.048810005187988, + "rewards/margins": 9.32423210144043, + "rewards/rejected": -13.373041152954102, + "step": 9310 + }, + { + "epoch": 2.24, + "learning_rate": 1.4017650205027635e-07, + "logits/chosen": -2.4318947792053223, + "logits/rejected": -2.4367456436157227, + "logps/chosen": -286.40216064453125, + "logps/rejected": -350.0318298339844, + "loss": 0.1116, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.9922993183135986, + "rewards/margins": 9.691668510437012, + "rewards/rejected": -11.683968544006348, + "step": 9320 + }, + { + "epoch": 2.25, + "learning_rate": 1.3973078980210375e-07, + "logits/chosen": -2.4391160011291504, + "logits/rejected": -2.349377155303955, + "logps/chosen": -263.29931640625, + "logps/rejected": -358.6832580566406, + "loss": 0.0213, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.355463981628418, + "rewards/margins": 9.626498222351074, + "rewards/rejected": -13.981962203979492, + "step": 9330 + }, + { + "epoch": 2.25, + "learning_rate": 1.3928507755393118e-07, + "logits/chosen": -2.3661608695983887, + "logits/rejected": -2.504918336868286, + "logps/chosen": -315.29913330078125, + "logps/rejected": -415.14288330078125, + "loss": 0.0393, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.096774697303772, + "rewards/margins": 12.709527969360352, + "rewards/rejected": -13.806302070617676, + "step": 9340 + }, + { + "epoch": 2.25, + "learning_rate": 1.388393653057586e-07, + "logits/chosen": -2.566204309463501, + "logits/rejected": -2.4146459102630615, + "logps/chosen": -334.0547790527344, + "logps/rejected": -343.410888671875, + "loss": 0.0254, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7288995981216431, + "rewards/margins": 9.186012268066406, + "rewards/rejected": -9.914911270141602, + "step": 9350 + }, + { + "epoch": 2.25, + "learning_rate": 1.38393653057586e-07, + "logits/chosen": -2.3976540565490723, + "logits/rejected": -2.289386510848999, + "logps/chosen": -207.6011505126953, + "logps/rejected": -277.0570373535156, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43546366691589355, + "rewards/margins": 10.367945671081543, + "rewards/rejected": -10.8034086227417, + "step": 9360 + }, + { + "epoch": 2.26, + "learning_rate": 1.3794794080941344e-07, + "logits/chosen": -2.543123245239258, + "logits/rejected": -2.4102225303649902, + "logps/chosen": -307.92352294921875, + "logps/rejected": -356.69488525390625, + "loss": 0.0328, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.2872573137283325, + "rewards/margins": 11.00304126739502, + "rewards/rejected": -12.290298461914062, + "step": 9370 + }, + { + "epoch": 2.26, + "learning_rate": 1.3750222856124087e-07, + "logits/chosen": -2.5676543712615967, + "logits/rejected": -2.453909397125244, + "logps/chosen": -293.49224853515625, + "logps/rejected": -318.1600036621094, + "loss": 0.0499, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.7830958366394043, + "rewards/margins": 11.916948318481445, + "rewards/rejected": -12.700042724609375, + "step": 9380 + }, + { + "epoch": 2.26, + "learning_rate": 1.3705651631306827e-07, + "logits/chosen": -2.337442398071289, + "logits/rejected": -2.2764225006103516, + "logps/chosen": -198.6411895751953, + "logps/rejected": -290.4046936035156, + "loss": 0.0593, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.507361650466919, + "rewards/margins": 11.443794250488281, + "rewards/rejected": -12.951156616210938, + "step": 9390 + }, + { + "epoch": 2.26, + "learning_rate": 1.366108040648957e-07, + "logits/chosen": -2.4181835651397705, + "logits/rejected": -2.457887887954712, + "logps/chosen": -293.56829833984375, + "logps/rejected": -410.14312744140625, + "loss": 0.0276, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3593311309814453, + "rewards/margins": 12.338621139526367, + "rewards/rejected": -12.697952270507812, + "step": 9400 + }, + { + "epoch": 2.26, + "learning_rate": 1.361650918167231e-07, + "logits/chosen": -2.2344985008239746, + "logits/rejected": -2.2863659858703613, + "logps/chosen": -278.97662353515625, + "logps/rejected": -329.68560791015625, + "loss": 0.1062, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.71694278717041, + "rewards/margins": 8.286422729492188, + "rewards/rejected": -11.003364562988281, + "step": 9410 + }, + { + "epoch": 2.27, + "learning_rate": 1.3571937956855053e-07, + "logits/chosen": -2.4888503551483154, + "logits/rejected": -2.3015456199645996, + "logps/chosen": -303.28668212890625, + "logps/rejected": -330.32354736328125, + "loss": 0.0319, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7734349966049194, + "rewards/margins": 13.611085891723633, + "rewards/rejected": -14.384519577026367, + "step": 9420 + }, + { + "epoch": 2.27, + "learning_rate": 1.3527366732037796e-07, + "logits/chosen": -2.4066035747528076, + "logits/rejected": -2.43558931350708, + "logps/chosen": -346.6483459472656, + "logps/rejected": -445.6409606933594, + "loss": 0.0216, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.1004126071929932, + "rewards/margins": 10.706197738647461, + "rewards/rejected": -11.806611061096191, + "step": 9430 + }, + { + "epoch": 2.27, + "learning_rate": 1.3482795507220537e-07, + "logits/chosen": -2.3893866539001465, + "logits/rejected": -2.4582602977752686, + "logps/chosen": -265.438720703125, + "logps/rejected": -428.552490234375, + "loss": 0.0272, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.036462664604187, + "rewards/margins": 17.461559295654297, + "rewards/rejected": -16.425098419189453, + "step": 9440 + }, + { + "epoch": 2.27, + "learning_rate": 1.343822428240328e-07, + "logits/chosen": -2.4005985260009766, + "logits/rejected": -2.3538804054260254, + "logps/chosen": -315.60589599609375, + "logps/rejected": -392.16656494140625, + "loss": 0.02, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.5265145301818848, + "rewards/margins": 9.552511215209961, + "rewards/rejected": -13.079025268554688, + "step": 9450 + }, + { + "epoch": 2.28, + "learning_rate": 1.3393653057586023e-07, + "logits/chosen": -2.221489906311035, + "logits/rejected": -2.1872735023498535, + "logps/chosen": -247.34375, + "logps/rejected": -343.33453369140625, + "loss": 0.0234, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9642674922943115, + "rewards/margins": 9.500177383422852, + "rewards/rejected": -11.464445114135742, + "step": 9460 + }, + { + "epoch": 2.28, + "learning_rate": 1.3349081832768763e-07, + "logits/chosen": -2.3547425270080566, + "logits/rejected": -2.3052620887756348, + "logps/chosen": -292.5299377441406, + "logps/rejected": -315.9531555175781, + "loss": 0.0244, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2172069549560547, + "rewards/margins": 7.924900054931641, + "rewards/rejected": -10.142107009887695, + "step": 9470 + }, + { + "epoch": 2.28, + "learning_rate": 1.3304510607951506e-07, + "logits/chosen": -2.4919469356536865, + "logits/rejected": -2.4380953311920166, + "logps/chosen": -262.9488830566406, + "logps/rejected": -314.8113708496094, + "loss": 0.0172, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8102928996086121, + "rewards/margins": 11.47692584991455, + "rewards/rejected": -12.28721809387207, + "step": 9480 + }, + { + "epoch": 2.28, + "learning_rate": 1.3259939383134246e-07, + "logits/chosen": -2.3655810356140137, + "logits/rejected": -2.4084105491638184, + "logps/chosen": -271.9031677246094, + "logps/rejected": -357.02020263671875, + "loss": 0.0452, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.6531856060028076, + "rewards/margins": 11.6923246383667, + "rewards/rejected": -13.345510482788086, + "step": 9490 + }, + { + "epoch": 2.29, + "learning_rate": 1.321536815831699e-07, + "logits/chosen": -2.414926290512085, + "logits/rejected": -2.4577858448028564, + "logps/chosen": -236.73220825195312, + "logps/rejected": -415.4325256347656, + "loss": 0.037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3003663122653961, + "rewards/margins": 12.758208274841309, + "rewards/rejected": -13.058575630187988, + "step": 9500 + }, + { + "epoch": 2.29, + "learning_rate": 1.3170796933499732e-07, + "logits/chosen": -2.4245550632476807, + "logits/rejected": -2.4358413219451904, + "logps/chosen": -224.95639038085938, + "logps/rejected": -442.0921325683594, + "loss": 0.0293, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.066308856010437, + "rewards/margins": 12.269311904907227, + "rewards/rejected": -13.335619926452637, + "step": 9510 + }, + { + "epoch": 2.29, + "learning_rate": 1.3126225708682472e-07, + "logits/chosen": -2.546781539916992, + "logits/rejected": -2.5013813972473145, + "logps/chosen": -240.187744140625, + "logps/rejected": -337.92889404296875, + "loss": 0.0321, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.5588526725769043, + "rewards/margins": 10.870650291442871, + "rewards/rejected": -13.42950439453125, + "step": 9520 + }, + { + "epoch": 2.29, + "learning_rate": 1.3081654483865215e-07, + "logits/chosen": -2.231473684310913, + "logits/rejected": -2.2529168128967285, + "logps/chosen": -230.21041870117188, + "logps/rejected": -391.0644836425781, + "loss": 0.0509, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.672971487045288, + "rewards/margins": 11.440793991088867, + "rewards/rejected": -14.11376667022705, + "step": 9530 + }, + { + "epoch": 2.3, + "learning_rate": 1.3037083259047958e-07, + "logits/chosen": -2.488002061843872, + "logits/rejected": -2.398108959197998, + "logps/chosen": -334.773193359375, + "logps/rejected": -367.2915344238281, + "loss": 0.0267, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.903795063495636, + "rewards/margins": 11.955655097961426, + "rewards/rejected": -12.859451293945312, + "step": 9540 + }, + { + "epoch": 2.3, + "learning_rate": 1.2992512034230698e-07, + "logits/chosen": -2.5971035957336426, + "logits/rejected": -2.5601534843444824, + "logps/chosen": -293.69110107421875, + "logps/rejected": -454.81707763671875, + "loss": 0.0318, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5811969041824341, + "rewards/margins": 13.334210395812988, + "rewards/rejected": -13.915410041809082, + "step": 9550 + }, + { + "epoch": 2.3, + "learning_rate": 1.2947940809413444e-07, + "logits/chosen": -2.485360622406006, + "logits/rejected": -2.418557643890381, + "logps/chosen": -231.5789794921875, + "logps/rejected": -315.5708923339844, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3320872783660889, + "rewards/margins": 9.851500511169434, + "rewards/rejected": -11.183588981628418, + "step": 9560 + }, + { + "epoch": 2.3, + "learning_rate": 1.2903369584596184e-07, + "logits/chosen": -2.412652015686035, + "logits/rejected": -2.320526599884033, + "logps/chosen": -188.39639282226562, + "logps/rejected": -285.4320983886719, + "loss": 0.0247, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2676024436950684, + "rewards/margins": 10.311906814575195, + "rewards/rejected": -13.579508781433105, + "step": 9570 + }, + { + "epoch": 2.31, + "learning_rate": 1.2858798359778927e-07, + "logits/chosen": -2.516284704208374, + "logits/rejected": -2.543175458908081, + "logps/chosen": -307.69793701171875, + "logps/rejected": -373.61248779296875, + "loss": 0.0413, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6909498572349548, + "rewards/margins": 11.186471939086914, + "rewards/rejected": -11.877422332763672, + "step": 9580 + }, + { + "epoch": 2.31, + "learning_rate": 1.281422713496167e-07, + "logits/chosen": -2.5738749504089355, + "logits/rejected": -2.4848976135253906, + "logps/chosen": -272.22686767578125, + "logps/rejected": -343.75701904296875, + "loss": 0.0327, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9130065441131592, + "rewards/margins": 9.962735176086426, + "rewards/rejected": -11.875740051269531, + "step": 9590 + }, + { + "epoch": 2.31, + "learning_rate": 1.276965591014441e-07, + "logits/chosen": -2.421653985977173, + "logits/rejected": -2.423694133758545, + "logps/chosen": -233.5714569091797, + "logps/rejected": -395.04241943359375, + "loss": 0.0425, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.3771601617336273, + "rewards/margins": 12.321215629577637, + "rewards/rejected": -12.69837474822998, + "step": 9600 + }, + { + "epoch": 2.31, + "eval_logits/chosen": -2.1774609088897705, + "eval_logits/rejected": -2.1409823894500732, + "eval_logps/chosen": -275.62860107421875, + "eval_logps/rejected": -303.3002014160156, + "eval_loss": 0.6297720074653625, + "eval_rewards/accuracies": 0.6924999952316284, + "eval_rewards/chosen": -7.370106220245361, + "eval_rewards/margins": 4.006834983825684, + "eval_rewards/rejected": -11.376940727233887, + "eval_runtime": 134.5205, + "eval_samples_per_second": 23.461, + "eval_steps_per_second": 0.372, + "step": 9600 + }, + { + "epoch": 2.31, + "learning_rate": 1.2725084685327153e-07, + "logits/chosen": -2.389843225479126, + "logits/rejected": -2.4056458473205566, + "logps/chosen": -237.8639373779297, + "logps/rejected": -419.3231506347656, + "loss": 0.0324, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.295149326324463, + "rewards/margins": 13.496070861816406, + "rewards/rejected": -15.791223526000977, + "step": 9610 + }, + { + "epoch": 2.32, + "learning_rate": 1.2680513460509896e-07, + "logits/chosen": -2.48187518119812, + "logits/rejected": -2.4992806911468506, + "logps/chosen": -288.1156005859375, + "logps/rejected": -398.43634033203125, + "loss": 0.022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5226317048072815, + "rewards/margins": 12.12777042388916, + "rewards/rejected": -12.650402069091797, + "step": 9620 + }, + { + "epoch": 2.32, + "learning_rate": 1.2635942235692637e-07, + "logits/chosen": -2.2290313243865967, + "logits/rejected": -2.30084490776062, + "logps/chosen": -241.2779998779297, + "logps/rejected": -418.93658447265625, + "loss": 0.0201, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7349122166633606, + "rewards/margins": 16.390789031982422, + "rewards/rejected": -15.655878067016602, + "step": 9630 + }, + { + "epoch": 2.32, + "learning_rate": 1.259137101087538e-07, + "logits/chosen": -2.646573543548584, + "logits/rejected": -2.5851197242736816, + "logps/chosen": -294.6817932128906, + "logps/rejected": -338.83929443359375, + "loss": 0.0201, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8279914855957031, + "rewards/margins": 10.240830421447754, + "rewards/rejected": -12.068822860717773, + "step": 9640 + }, + { + "epoch": 2.32, + "learning_rate": 1.254679978605812e-07, + "logits/chosen": -2.524639844894409, + "logits/rejected": -2.334158182144165, + "logps/chosen": -328.20819091796875, + "logps/rejected": -369.3468322753906, + "loss": 0.0429, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4329562783241272, + "rewards/margins": 11.776590347290039, + "rewards/rejected": -12.20954704284668, + "step": 9650 + }, + { + "epoch": 2.32, + "learning_rate": 1.2502228561240863e-07, + "logits/chosen": -2.627685308456421, + "logits/rejected": -2.4850966930389404, + "logps/chosen": -336.99603271484375, + "logps/rejected": -349.14068603515625, + "loss": 0.0373, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.9046379327774048, + "rewards/margins": 9.74839973449707, + "rewards/rejected": -10.653037071228027, + "step": 9660 + }, + { + "epoch": 2.33, + "learning_rate": 1.2457657336423606e-07, + "logits/chosen": -2.630577802658081, + "logits/rejected": -2.5514895915985107, + "logps/chosen": -304.5028381347656, + "logps/rejected": -439.1168518066406, + "loss": 0.0378, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3046289682388306, + "rewards/margins": 14.980245590209961, + "rewards/rejected": -13.675616264343262, + "step": 9670 + }, + { + "epoch": 2.33, + "learning_rate": 1.2413086111606346e-07, + "logits/chosen": -2.4933369159698486, + "logits/rejected": -2.3936216831207275, + "logps/chosen": -251.20663452148438, + "logps/rejected": -359.5411376953125, + "loss": 0.041, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.0318059921264648, + "rewards/margins": 11.094675064086914, + "rewards/rejected": -12.126481056213379, + "step": 9680 + }, + { + "epoch": 2.33, + "learning_rate": 1.236851488678909e-07, + "logits/chosen": -2.588134288787842, + "logits/rejected": -2.4921040534973145, + "logps/chosen": -315.86358642578125, + "logps/rejected": -523.5921630859375, + "loss": 0.0199, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24267932772636414, + "rewards/margins": 13.36680793762207, + "rewards/rejected": -13.60948657989502, + "step": 9690 + }, + { + "epoch": 2.33, + "learning_rate": 1.2323943661971832e-07, + "logits/chosen": -2.494471311569214, + "logits/rejected": -2.561774730682373, + "logps/chosen": -238.9729766845703, + "logps/rejected": -367.56536865234375, + "loss": 0.0408, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7989002466201782, + "rewards/margins": 14.336524963378906, + "rewards/rejected": -13.537625312805176, + "step": 9700 + }, + { + "epoch": 2.34, + "learning_rate": 1.2279372437154572e-07, + "logits/chosen": -2.5178780555725098, + "logits/rejected": -2.3850350379943848, + "logps/chosen": -308.134521484375, + "logps/rejected": -334.61968994140625, + "loss": 0.0398, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12703149020671844, + "rewards/margins": 11.759805679321289, + "rewards/rejected": -11.632774353027344, + "step": 9710 + }, + { + "epoch": 2.34, + "learning_rate": 1.2234801212337315e-07, + "logits/chosen": -2.6157431602478027, + "logits/rejected": -2.5810580253601074, + "logps/chosen": -265.21673583984375, + "logps/rejected": -375.5794372558594, + "loss": 0.046, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.4621336460113525, + "rewards/margins": 10.865510940551758, + "rewards/rejected": -12.327642440795898, + "step": 9720 + }, + { + "epoch": 2.34, + "learning_rate": 1.2190229987520055e-07, + "logits/chosen": -2.553041696548462, + "logits/rejected": -2.589735507965088, + "logps/chosen": -329.040771484375, + "logps/rejected": -461.7129821777344, + "loss": 0.0288, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9191287159919739, + "rewards/margins": 15.172648429870605, + "rewards/rejected": -14.253519058227539, + "step": 9730 + }, + { + "epoch": 2.34, + "learning_rate": 1.2145658762702798e-07, + "logits/chosen": -2.552706241607666, + "logits/rejected": -2.5576248168945312, + "logps/chosen": -222.6065216064453, + "logps/rejected": -308.9010925292969, + "loss": 0.0289, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7514709830284119, + "rewards/margins": 10.980058670043945, + "rewards/rejected": -11.731529235839844, + "step": 9740 + }, + { + "epoch": 2.35, + "learning_rate": 1.210108753788554e-07, + "logits/chosen": -2.3641655445098877, + "logits/rejected": -2.234936237335205, + "logps/chosen": -336.6866760253906, + "logps/rejected": -354.4670715332031, + "loss": 0.0386, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40606531500816345, + "rewards/margins": 11.603143692016602, + "rewards/rejected": -12.009209632873535, + "step": 9750 + }, + { + "epoch": 2.35, + "learning_rate": 1.2056516313068281e-07, + "logits/chosen": -2.3288071155548096, + "logits/rejected": -2.1793508529663086, + "logps/chosen": -283.40777587890625, + "logps/rejected": -471.2051696777344, + "loss": 0.0336, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.462835669517517, + "rewards/margins": 15.888590812683105, + "rewards/rejected": -17.35142707824707, + "step": 9760 + }, + { + "epoch": 2.35, + "learning_rate": 1.2011945088251024e-07, + "logits/chosen": -2.5297608375549316, + "logits/rejected": -2.5485751628875732, + "logps/chosen": -210.74423217773438, + "logps/rejected": -325.4725036621094, + "loss": 0.0128, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6324639320373535, + "rewards/margins": 10.859331130981445, + "rewards/rejected": -11.49179458618164, + "step": 9770 + }, + { + "epoch": 2.35, + "learning_rate": 1.1967373863433767e-07, + "logits/chosen": -2.7175004482269287, + "logits/rejected": -2.4431304931640625, + "logps/chosen": -364.6636047363281, + "logps/rejected": -330.05438232421875, + "loss": 0.0285, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0363945960998535, + "rewards/margins": 10.648542404174805, + "rewards/rejected": -11.684937477111816, + "step": 9780 + }, + { + "epoch": 2.36, + "learning_rate": 1.1922802638616508e-07, + "logits/chosen": -2.5165154933929443, + "logits/rejected": -2.367253065109253, + "logps/chosen": -265.6626892089844, + "logps/rejected": -372.4963684082031, + "loss": 0.0234, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.449108600616455, + "rewards/margins": 10.355419158935547, + "rewards/rejected": -12.804529190063477, + "step": 9790 + }, + { + "epoch": 2.36, + "learning_rate": 1.187823141379925e-07, + "logits/chosen": -2.5441689491271973, + "logits/rejected": -2.5348575115203857, + "logps/chosen": -245.9356231689453, + "logps/rejected": -394.76190185546875, + "loss": 0.0364, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.8920745849609375, + "rewards/margins": 9.532918930053711, + "rewards/rejected": -11.424993515014648, + "step": 9800 + }, + { + "epoch": 2.36, + "learning_rate": 1.1833660188981992e-07, + "logits/chosen": -2.47633695602417, + "logits/rejected": -2.4435997009277344, + "logps/chosen": -290.0943298339844, + "logps/rejected": -384.2179870605469, + "loss": 0.0312, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.4887150526046753, + "rewards/margins": 10.321699142456055, + "rewards/rejected": -11.81041431427002, + "step": 9810 + }, + { + "epoch": 2.36, + "learning_rate": 1.1789088964164735e-07, + "logits/chosen": -2.4126784801483154, + "logits/rejected": -2.377180814743042, + "logps/chosen": -232.51077270507812, + "logps/rejected": -388.77349853515625, + "loss": 0.0522, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.0534749031066895, + "rewards/margins": 11.633764266967773, + "rewards/rejected": -13.687238693237305, + "step": 9820 + }, + { + "epoch": 2.37, + "learning_rate": 1.1744517739347477e-07, + "logits/chosen": -2.3851730823516846, + "logits/rejected": -2.3036623001098633, + "logps/chosen": -206.77310180664062, + "logps/rejected": -308.92852783203125, + "loss": 0.0318, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3479121923446655, + "rewards/margins": 12.278934478759766, + "rewards/rejected": -13.626846313476562, + "step": 9830 + }, + { + "epoch": 2.37, + "learning_rate": 1.169994651453022e-07, + "logits/chosen": -2.593183994293213, + "logits/rejected": -2.4439361095428467, + "logps/chosen": -338.0233459472656, + "logps/rejected": -355.5791320800781, + "loss": 0.0298, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.021317005157471, + "rewards/margins": 10.08646297454834, + "rewards/rejected": -14.107780456542969, + "step": 9840 + }, + { + "epoch": 2.37, + "learning_rate": 1.1655375289712961e-07, + "logits/chosen": -2.450833797454834, + "logits/rejected": -2.400195598602295, + "logps/chosen": -173.69932556152344, + "logps/rejected": -285.7006530761719, + "loss": 0.0377, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.102134943008423, + "rewards/margins": 9.48808479309082, + "rewards/rejected": -11.59022045135498, + "step": 9850 + }, + { + "epoch": 2.37, + "learning_rate": 1.1610804064895703e-07, + "logits/chosen": -2.417642116546631, + "logits/rejected": -2.4772324562072754, + "logps/chosen": -222.88394165039062, + "logps/rejected": -345.0111083984375, + "loss": 0.0173, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2814255952835083, + "rewards/margins": 13.139358520507812, + "rewards/rejected": -14.420782089233398, + "step": 9860 + }, + { + "epoch": 2.38, + "learning_rate": 1.1566232840078444e-07, + "logits/chosen": -2.580521821975708, + "logits/rejected": -2.4695916175842285, + "logps/chosen": -320.3534240722656, + "logps/rejected": -414.2294921875, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5010542869567871, + "rewards/margins": 11.957185745239258, + "rewards/rejected": -12.458239555358887, + "step": 9870 + }, + { + "epoch": 2.38, + "learning_rate": 1.1521661615261187e-07, + "logits/chosen": -2.3649206161499023, + "logits/rejected": -2.2867565155029297, + "logps/chosen": -215.96047973632812, + "logps/rejected": -383.0747985839844, + "loss": 0.0231, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.173766136169434, + "rewards/margins": 9.420974731445312, + "rewards/rejected": -13.594741821289062, + "step": 9880 + }, + { + "epoch": 2.38, + "learning_rate": 1.1477090390443929e-07, + "logits/chosen": -2.579761266708374, + "logits/rejected": -2.483165979385376, + "logps/chosen": -265.85418701171875, + "logps/rejected": -347.1213073730469, + "loss": 0.0242, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.4519851207733154, + "rewards/margins": 9.690813064575195, + "rewards/rejected": -12.14279842376709, + "step": 9890 + }, + { + "epoch": 2.38, + "learning_rate": 1.143251916562667e-07, + "logits/chosen": -2.343977451324463, + "logits/rejected": -2.2638773918151855, + "logps/chosen": -212.83773803710938, + "logps/rejected": -249.4177703857422, + "loss": 0.0298, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.4832682609558105, + "rewards/margins": 8.23595142364502, + "rewards/rejected": -10.719220161437988, + "step": 9900 + }, + { + "epoch": 2.39, + "learning_rate": 1.1387947940809412e-07, + "logits/chosen": -2.5048470497131348, + "logits/rejected": -2.4498581886291504, + "logps/chosen": -318.59173583984375, + "logps/rejected": -404.7265319824219, + "loss": 0.0192, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5952701568603516, + "rewards/margins": 10.584699630737305, + "rewards/rejected": -13.179969787597656, + "step": 9910 + }, + { + "epoch": 2.39, + "learning_rate": 1.1343376715992155e-07, + "logits/chosen": -2.5422251224517822, + "logits/rejected": -2.4049437046051025, + "logps/chosen": -292.94915771484375, + "logps/rejected": -344.04693603515625, + "loss": 0.0339, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.367684841156006, + "rewards/margins": 10.511276245117188, + "rewards/rejected": -14.878959655761719, + "step": 9920 + }, + { + "epoch": 2.39, + "learning_rate": 1.1298805491174897e-07, + "logits/chosen": -2.428804874420166, + "logits/rejected": -2.3704833984375, + "logps/chosen": -240.29928588867188, + "logps/rejected": -316.4564514160156, + "loss": 0.0342, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.077416181564331, + "rewards/margins": 11.797285079956055, + "rewards/rejected": -13.874700546264648, + "step": 9930 + }, + { + "epoch": 2.39, + "learning_rate": 1.1254234266357638e-07, + "logits/chosen": -2.7007205486297607, + "logits/rejected": -2.5549190044403076, + "logps/chosen": -358.87432861328125, + "logps/rejected": -416.8155822753906, + "loss": 0.0307, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.512645721435547, + "rewards/margins": 10.297310829162598, + "rewards/rejected": -12.809954643249512, + "step": 9940 + }, + { + "epoch": 2.39, + "learning_rate": 1.1209663041540381e-07, + "logits/chosen": -2.326416015625, + "logits/rejected": -2.425079345703125, + "logps/chosen": -279.13165283203125, + "logps/rejected": -431.99810791015625, + "loss": 0.0243, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2667337656021118, + "rewards/margins": 11.267843246459961, + "rewards/rejected": -12.534574508666992, + "step": 9950 + }, + { + "epoch": 2.4, + "learning_rate": 1.1165091816723123e-07, + "logits/chosen": -2.422116756439209, + "logits/rejected": -2.366100311279297, + "logps/chosen": -279.7535705566406, + "logps/rejected": -423.58154296875, + "loss": 0.0289, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5250468254089355, + "rewards/margins": 12.391502380371094, + "rewards/rejected": -13.916549682617188, + "step": 9960 + }, + { + "epoch": 2.4, + "learning_rate": 1.1120520591905864e-07, + "logits/chosen": -2.312122344970703, + "logits/rejected": -2.350348472595215, + "logps/chosen": -152.44276428222656, + "logps/rejected": -247.03585815429688, + "loss": 0.0325, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.245760202407837, + "rewards/margins": 10.911714553833008, + "rewards/rejected": -12.157475471496582, + "step": 9970 + }, + { + "epoch": 2.4, + "learning_rate": 1.1075949367088606e-07, + "logits/chosen": -2.565394878387451, + "logits/rejected": -2.5081286430358887, + "logps/chosen": -334.61175537109375, + "logps/rejected": -351.79779052734375, + "loss": 0.021, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7482221126556396, + "rewards/margins": 11.15458869934082, + "rewards/rejected": -11.902810096740723, + "step": 9980 + }, + { + "epoch": 2.4, + "learning_rate": 1.103137814227135e-07, + "logits/chosen": -2.3138725757598877, + "logits/rejected": -2.324903726577759, + "logps/chosen": -213.28121948242188, + "logps/rejected": -337.15155029296875, + "loss": 0.0215, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.5757930278778076, + "rewards/margins": 9.91811466217041, + "rewards/rejected": -13.49390697479248, + "step": 9990 + }, + { + "epoch": 2.41, + "learning_rate": 1.0986806917454092e-07, + "logits/chosen": -2.5492286682128906, + "logits/rejected": -2.4687278270721436, + "logps/chosen": -217.9174346923828, + "logps/rejected": -307.69195556640625, + "loss": 0.0387, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.6853030920028687, + "rewards/margins": 11.250697135925293, + "rewards/rejected": -12.936001777648926, + "step": 10000 + }, + { + "epoch": 2.41, + "eval_logits/chosen": -2.216874361038208, + "eval_logits/rejected": -2.1790802478790283, + "eval_logps/chosen": -275.18701171875, + "eval_logps/rejected": -304.8103942871094, + "eval_loss": 0.6269313097000122, + "eval_rewards/accuracies": 0.6974999904632568, + "eval_rewards/chosen": -7.325944900512695, + "eval_rewards/margins": 4.202017307281494, + "eval_rewards/rejected": -11.527961730957031, + "eval_runtime": 134.306, + "eval_samples_per_second": 23.499, + "eval_steps_per_second": 0.372, + "step": 10000 + }, + { + "epoch": 2.41, + "learning_rate": 1.0942235692636834e-07, + "logits/chosen": -2.2933192253112793, + "logits/rejected": -2.280583143234253, + "logps/chosen": -287.15576171875, + "logps/rejected": -405.9715270996094, + "loss": 0.0218, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.2334377765655518, + "rewards/margins": 10.364022254943848, + "rewards/rejected": -13.59745979309082, + "step": 10010 + }, + { + "epoch": 2.41, + "learning_rate": 1.0897664467819575e-07, + "logits/chosen": -2.4617159366607666, + "logits/rejected": -2.2975640296936035, + "logps/chosen": -238.12191772460938, + "logps/rejected": -386.0503234863281, + "loss": 0.021, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2631583213806152, + "rewards/margins": 12.070978164672852, + "rewards/rejected": -15.334136962890625, + "step": 10020 + }, + { + "epoch": 2.41, + "learning_rate": 1.0853093243002318e-07, + "logits/chosen": -2.505117893218994, + "logits/rejected": -2.462644100189209, + "logps/chosen": -388.018798828125, + "logps/rejected": -362.15228271484375, + "loss": 0.0225, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5008872747421265, + "rewards/margins": 13.235308647155762, + "rewards/rejected": -13.73619556427002, + "step": 10030 + }, + { + "epoch": 2.42, + "learning_rate": 1.080852201818506e-07, + "logits/chosen": -2.4457573890686035, + "logits/rejected": -2.3285858631134033, + "logps/chosen": -289.85003662109375, + "logps/rejected": -347.39788818359375, + "loss": 0.0192, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.542717933654785, + "rewards/margins": 10.714621543884277, + "rewards/rejected": -13.257339477539062, + "step": 10040 + }, + { + "epoch": 2.42, + "learning_rate": 1.0763950793367801e-07, + "logits/chosen": -2.528005599975586, + "logits/rejected": -2.470735549926758, + "logps/chosen": -259.02911376953125, + "logps/rejected": -393.9635009765625, + "loss": 0.0229, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5142813324928284, + "rewards/margins": 11.019901275634766, + "rewards/rejected": -11.534183502197266, + "step": 10050 + }, + { + "epoch": 2.42, + "learning_rate": 1.0719379568550543e-07, + "logits/chosen": -2.405470371246338, + "logits/rejected": -2.355475425720215, + "logps/chosen": -290.2284240722656, + "logps/rejected": -363.837890625, + "loss": 0.0221, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6940453052520752, + "rewards/margins": 10.649921417236328, + "rewards/rejected": -12.343965530395508, + "step": 10060 + }, + { + "epoch": 2.42, + "learning_rate": 1.0674808343733286e-07, + "logits/chosen": -2.5033342838287354, + "logits/rejected": -2.4000964164733887, + "logps/chosen": -280.3537292480469, + "logps/rejected": -360.11187744140625, + "loss": 0.039, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7043596506118774, + "rewards/margins": 12.54203987121582, + "rewards/rejected": -13.24639892578125, + "step": 10070 + }, + { + "epoch": 2.43, + "learning_rate": 1.0630237118916027e-07, + "logits/chosen": -2.4509224891662598, + "logits/rejected": -2.3927853107452393, + "logps/chosen": -286.2995300292969, + "logps/rejected": -384.1947021484375, + "loss": 0.0356, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18650178611278534, + "rewards/margins": 14.649968147277832, + "rewards/rejected": -14.463467597961426, + "step": 10080 + }, + { + "epoch": 2.43, + "learning_rate": 1.0585665894098769e-07, + "logits/chosen": -2.4938857555389404, + "logits/rejected": -2.4885623455047607, + "logps/chosen": -278.3260498046875, + "logps/rejected": -426.4647521972656, + "loss": 0.0368, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8486827611923218, + "rewards/margins": 13.495234489440918, + "rewards/rejected": -14.343917846679688, + "step": 10090 + }, + { + "epoch": 2.43, + "learning_rate": 1.0541094669281511e-07, + "logits/chosen": -2.4637417793273926, + "logits/rejected": -2.4643568992614746, + "logps/chosen": -325.77374267578125, + "logps/rejected": -400.1005554199219, + "loss": 0.041, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8435616493225098, + "rewards/margins": 15.150815963745117, + "rewards/rejected": -14.30725383758545, + "step": 10100 + }, + { + "epoch": 2.43, + "learning_rate": 1.0496523444464254e-07, + "logits/chosen": -2.7043046951293945, + "logits/rejected": -2.307955741882324, + "logps/chosen": -301.4166259765625, + "logps/rejected": -315.764404296875, + "loss": 0.0232, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18431058526039124, + "rewards/margins": 13.029945373535156, + "rewards/rejected": -12.845632553100586, + "step": 10110 + }, + { + "epoch": 2.44, + "learning_rate": 1.0451952219646995e-07, + "logits/chosen": -2.6169772148132324, + "logits/rejected": -2.601414680480957, + "logps/chosen": -270.9599609375, + "logps/rejected": -337.7686767578125, + "loss": 0.0564, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.4154558181762695, + "rewards/margins": 9.172788619995117, + "rewards/rejected": -13.588244438171387, + "step": 10120 + }, + { + "epoch": 2.44, + "learning_rate": 1.0407380994829737e-07, + "logits/chosen": -2.5294604301452637, + "logits/rejected": -2.43257999420166, + "logps/chosen": -332.90924072265625, + "logps/rejected": -346.5657653808594, + "loss": 0.0245, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.092760443687439, + "rewards/margins": 12.378458023071289, + "rewards/rejected": -13.471220016479492, + "step": 10130 + }, + { + "epoch": 2.44, + "learning_rate": 1.0362809770012478e-07, + "logits/chosen": -2.434919595718384, + "logits/rejected": -2.4524292945861816, + "logps/chosen": -225.8126220703125, + "logps/rejected": -380.1116638183594, + "loss": 0.0442, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25910013914108276, + "rewards/margins": 11.829145431518555, + "rewards/rejected": -12.088244438171387, + "step": 10140 + }, + { + "epoch": 2.44, + "learning_rate": 1.0318238545195221e-07, + "logits/chosen": -2.574103832244873, + "logits/rejected": -2.5287487506866455, + "logps/chosen": -272.11920166015625, + "logps/rejected": -312.4232482910156, + "loss": 0.0288, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0098073482513428, + "rewards/margins": 10.273530960083008, + "rewards/rejected": -11.283336639404297, + "step": 10150 + }, + { + "epoch": 2.45, + "learning_rate": 1.0273667320377964e-07, + "logits/chosen": -2.452679395675659, + "logits/rejected": -2.2785086631774902, + "logps/chosen": -238.67758178710938, + "logps/rejected": -286.9755859375, + "loss": 0.0364, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8719562292098999, + "rewards/margins": 10.701383590698242, + "rewards/rejected": -11.57334041595459, + "step": 10160 + }, + { + "epoch": 2.45, + "learning_rate": 1.0229096095560706e-07, + "logits/chosen": -2.5121803283691406, + "logits/rejected": -2.3672566413879395, + "logps/chosen": -350.19622802734375, + "logps/rejected": -336.66656494140625, + "loss": 0.0395, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.574341297149658, + "rewards/margins": 10.227441787719727, + "rewards/rejected": -12.801783561706543, + "step": 10170 + }, + { + "epoch": 2.45, + "learning_rate": 1.0184524870743448e-07, + "logits/chosen": -2.5346298217773438, + "logits/rejected": -2.4361207485198975, + "logps/chosen": -271.7481994628906, + "logps/rejected": -352.82403564453125, + "loss": 0.0449, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7823295593261719, + "rewards/margins": 10.890006065368652, + "rewards/rejected": -12.672337532043457, + "step": 10180 + }, + { + "epoch": 2.45, + "learning_rate": 1.013995364592619e-07, + "logits/chosen": -2.5501441955566406, + "logits/rejected": -2.5360169410705566, + "logps/chosen": -307.5736999511719, + "logps/rejected": -310.0274963378906, + "loss": 0.0408, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.210803508758545, + "rewards/margins": 9.402273178100586, + "rewards/rejected": -12.613077163696289, + "step": 10190 + }, + { + "epoch": 2.45, + "learning_rate": 1.0095382421108932e-07, + "logits/chosen": -2.447296142578125, + "logits/rejected": -2.3583357334136963, + "logps/chosen": -268.5548095703125, + "logps/rejected": -295.54937744140625, + "loss": 0.033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6434500813484192, + "rewards/margins": 10.180092811584473, + "rewards/rejected": -10.823541641235352, + "step": 10200 + }, + { + "epoch": 2.46, + "learning_rate": 1.0050811196291674e-07, + "logits/chosen": -2.5113818645477295, + "logits/rejected": -2.446319103240967, + "logps/chosen": -300.185791015625, + "logps/rejected": -336.2559509277344, + "loss": 0.0473, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6623687744140625, + "rewards/margins": 14.569671630859375, + "rewards/rejected": -13.907302856445312, + "step": 10210 + }, + { + "epoch": 2.46, + "learning_rate": 1.0006239971474415e-07, + "logits/chosen": -2.4419267177581787, + "logits/rejected": -2.31793212890625, + "logps/chosen": -250.54574584960938, + "logps/rejected": -264.55316162109375, + "loss": 0.0375, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.5165772438049316, + "rewards/margins": 8.454904556274414, + "rewards/rejected": -11.97148323059082, + "step": 10220 + }, + { + "epoch": 2.46, + "learning_rate": 9.961668746657158e-08, + "logits/chosen": -2.562485694885254, + "logits/rejected": -2.5261874198913574, + "logps/chosen": -320.7162170410156, + "logps/rejected": -355.2144470214844, + "loss": 0.024, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5536615252494812, + "rewards/margins": 12.61359691619873, + "rewards/rejected": -13.167257308959961, + "step": 10230 + }, + { + "epoch": 2.46, + "learning_rate": 9.9170975218399e-08, + "logits/chosen": -2.394862413406372, + "logits/rejected": -2.293825626373291, + "logps/chosen": -270.94769287109375, + "logps/rejected": -383.794189453125, + "loss": 0.0146, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8689486980438232, + "rewards/margins": 13.144645690917969, + "rewards/rejected": -15.013595581054688, + "step": 10240 + }, + { + "epoch": 2.47, + "learning_rate": 9.872526297022641e-08, + "logits/chosen": -2.740175247192383, + "logits/rejected": -2.5539283752441406, + "logps/chosen": -316.5394592285156, + "logps/rejected": -389.9117736816406, + "loss": 0.0238, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2171639204025269, + "rewards/margins": 10.731700897216797, + "rewards/rejected": -11.94886589050293, + "step": 10250 + }, + { + "epoch": 2.47, + "learning_rate": 9.827955072205383e-08, + "logits/chosen": -2.3584835529327393, + "logits/rejected": -2.5055510997772217, + "logps/chosen": -175.42633056640625, + "logps/rejected": -365.27197265625, + "loss": 0.0468, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.867166042327881, + "rewards/margins": 12.676493644714355, + "rewards/rejected": -15.543660163879395, + "step": 10260 + }, + { + "epoch": 2.47, + "learning_rate": 9.783383847388126e-08, + "logits/chosen": -2.3794188499450684, + "logits/rejected": -2.3353278636932373, + "logps/chosen": -258.2748107910156, + "logps/rejected": -432.15594482421875, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8665927648544312, + "rewards/margins": 14.133447647094727, + "rewards/rejected": -16.000041961669922, + "step": 10270 + }, + { + "epoch": 2.47, + "learning_rate": 9.738812622570868e-08, + "logits/chosen": -2.614894390106201, + "logits/rejected": -2.5153114795684814, + "logps/chosen": -283.90240478515625, + "logps/rejected": -327.84539794921875, + "loss": 0.0616, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.9870588779449463, + "rewards/margins": 9.350723266601562, + "rewards/rejected": -12.33778190612793, + "step": 10280 + }, + { + "epoch": 2.48, + "learning_rate": 9.694241397753609e-08, + "logits/chosen": -2.5317180156707764, + "logits/rejected": -2.4985158443450928, + "logps/chosen": -231.2266387939453, + "logps/rejected": -353.6574401855469, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8197168111801147, + "rewards/margins": 11.543524742126465, + "rewards/rejected": -12.363243103027344, + "step": 10290 + }, + { + "epoch": 2.48, + "learning_rate": 9.649670172936351e-08, + "logits/chosen": -2.6909403800964355, + "logits/rejected": -2.625136613845825, + "logps/chosen": -310.2020263671875, + "logps/rejected": -420.2967224121094, + "loss": 0.024, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.6183617115020752, + "rewards/margins": 12.573843002319336, + "rewards/rejected": -14.192204475402832, + "step": 10300 + }, + { + "epoch": 2.48, + "learning_rate": 9.605098948119094e-08, + "logits/chosen": -2.2787041664123535, + "logits/rejected": -2.1850972175598145, + "logps/chosen": -325.05218505859375, + "logps/rejected": -388.6729431152344, + "loss": 0.0297, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3511285781860352, + "rewards/margins": 11.135826110839844, + "rewards/rejected": -12.486954689025879, + "step": 10310 + }, + { + "epoch": 2.48, + "learning_rate": 9.560527723301835e-08, + "logits/chosen": -2.4997646808624268, + "logits/rejected": -2.3415632247924805, + "logps/chosen": -335.80474853515625, + "logps/rejected": -336.45587158203125, + "loss": 0.0453, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.522719383239746, + "rewards/margins": 9.912080764770508, + "rewards/rejected": -14.434802055358887, + "step": 10320 + }, + { + "epoch": 2.49, + "learning_rate": 9.515956498484578e-08, + "logits/chosen": -2.378953456878662, + "logits/rejected": -2.4727540016174316, + "logps/chosen": -284.6646423339844, + "logps/rejected": -384.29364013671875, + "loss": 0.0204, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6046100854873657, + "rewards/margins": 13.905375480651855, + "rewards/rejected": -14.509984970092773, + "step": 10330 + }, + { + "epoch": 2.49, + "learning_rate": 9.47138527366732e-08, + "logits/chosen": -2.497859239578247, + "logits/rejected": -2.4668257236480713, + "logps/chosen": -297.99224853515625, + "logps/rejected": -406.8845520019531, + "loss": 0.0394, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11674115806818008, + "rewards/margins": 15.078516006469727, + "rewards/rejected": -14.961773872375488, + "step": 10340 + }, + { + "epoch": 2.49, + "learning_rate": 9.426814048850063e-08, + "logits/chosen": -2.6369550228118896, + "logits/rejected": -2.558452606201172, + "logps/chosen": -214.35311889648438, + "logps/rejected": -282.9107971191406, + "loss": 0.0342, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0868337154388428, + "rewards/margins": 11.665234565734863, + "rewards/rejected": -12.752067565917969, + "step": 10350 + }, + { + "epoch": 2.49, + "learning_rate": 9.382242824032804e-08, + "logits/chosen": -2.5463333129882812, + "logits/rejected": -2.383127450942993, + "logps/chosen": -247.4314727783203, + "logps/rejected": -374.2950439453125, + "loss": 0.0276, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5774767398834229, + "rewards/margins": 14.915138244628906, + "rewards/rejected": -16.492618560791016, + "step": 10360 + }, + { + "epoch": 2.5, + "learning_rate": 9.337671599215546e-08, + "logits/chosen": -2.7446866035461426, + "logits/rejected": -2.57399582862854, + "logps/chosen": -344.39300537109375, + "logps/rejected": -409.7808532714844, + "loss": 0.0357, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.4495247602462769, + "rewards/margins": 11.880117416381836, + "rewards/rejected": -13.32964038848877, + "step": 10370 + }, + { + "epoch": 2.5, + "learning_rate": 9.293100374398288e-08, + "logits/chosen": -2.536357879638672, + "logits/rejected": -2.47178053855896, + "logps/chosen": -237.9107208251953, + "logps/rejected": -413.24957275390625, + "loss": 0.023, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1901845932006836, + "rewards/margins": 12.308076858520508, + "rewards/rejected": -14.498262405395508, + "step": 10380 + }, + { + "epoch": 2.5, + "learning_rate": 9.24852914958103e-08, + "logits/chosen": -2.4459424018859863, + "logits/rejected": -2.4301600456237793, + "logps/chosen": -276.6459045410156, + "logps/rejected": -333.7275390625, + "loss": 0.0445, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.7565501928329468, + "rewards/margins": 9.935433387756348, + "rewards/rejected": -11.691983222961426, + "step": 10390 + }, + { + "epoch": 2.5, + "learning_rate": 9.203957924763772e-08, + "logits/chosen": -2.4842679500579834, + "logits/rejected": -2.438427209854126, + "logps/chosen": -342.13653564453125, + "logps/rejected": -422.8092346191406, + "loss": 0.043, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9871963262557983, + "rewards/margins": 12.639472961425781, + "rewards/rejected": -14.626668930053711, + "step": 10400 + }, + { + "epoch": 2.5, + "eval_logits/chosen": -2.2662575244903564, + "eval_logits/rejected": -2.230071783065796, + "eval_logps/chosen": -274.166748046875, + "eval_logps/rejected": -305.313720703125, + "eval_loss": 0.6375707387924194, + "eval_rewards/accuracies": 0.6924999952316284, + "eval_rewards/chosen": -7.223921298980713, + "eval_rewards/margins": 4.354369640350342, + "eval_rewards/rejected": -11.578290939331055, + "eval_runtime": 133.4124, + "eval_samples_per_second": 23.656, + "eval_steps_per_second": 0.375, + "step": 10400 + }, + { + "epoch": 2.51, + "learning_rate": 9.159386699946514e-08, + "logits/chosen": -2.407238483428955, + "logits/rejected": -2.3920178413391113, + "logps/chosen": -291.1453857421875, + "logps/rejected": -341.76104736328125, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.034220896661281586, + "rewards/margins": 11.897378921508789, + "rewards/rejected": -11.863157272338867, + "step": 10410 + }, + { + "epoch": 2.51, + "learning_rate": 9.114815475129255e-08, + "logits/chosen": -2.4947426319122314, + "logits/rejected": -2.42100191116333, + "logps/chosen": -235.4882049560547, + "logps/rejected": -341.2593078613281, + "loss": 0.0247, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.277120590209961, + "rewards/margins": 11.250730514526367, + "rewards/rejected": -14.527850151062012, + "step": 10420 + }, + { + "epoch": 2.51, + "learning_rate": 9.070244250311998e-08, + "logits/chosen": -2.3231470584869385, + "logits/rejected": -2.484201431274414, + "logps/chosen": -327.2215881347656, + "logps/rejected": -483.61114501953125, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9986634254455566, + "rewards/margins": 13.383715629577637, + "rewards/rejected": -14.382379531860352, + "step": 10430 + }, + { + "epoch": 2.51, + "learning_rate": 9.02567302549474e-08, + "logits/chosen": -2.365403890609741, + "logits/rejected": -2.3459725379943848, + "logps/chosen": -302.86309814453125, + "logps/rejected": -335.1816101074219, + "loss": 0.0325, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.254432201385498, + "rewards/margins": 9.722726821899414, + "rewards/rejected": -11.977158546447754, + "step": 10440 + }, + { + "epoch": 2.52, + "learning_rate": 8.981101800677482e-08, + "logits/chosen": -2.540252208709717, + "logits/rejected": -2.496572494506836, + "logps/chosen": -291.1900939941406, + "logps/rejected": -422.4532165527344, + "loss": 0.0284, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.518183171749115, + "rewards/margins": 13.844454765319824, + "rewards/rejected": -14.362638473510742, + "step": 10450 + }, + { + "epoch": 2.52, + "learning_rate": 8.936530575860223e-08, + "logits/chosen": -2.5834574699401855, + "logits/rejected": -2.5244648456573486, + "logps/chosen": -265.43572998046875, + "logps/rejected": -409.3478088378906, + "loss": 0.0397, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29316291213035583, + "rewards/margins": 13.712129592895508, + "rewards/rejected": -13.418966293334961, + "step": 10460 + }, + { + "epoch": 2.52, + "learning_rate": 8.891959351042966e-08, + "logits/chosen": -2.4341185092926025, + "logits/rejected": -2.363210439682007, + "logps/chosen": -258.18609619140625, + "logps/rejected": -299.75970458984375, + "loss": 0.0286, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06365285068750381, + "rewards/margins": 10.271844863891602, + "rewards/rejected": -10.208192825317383, + "step": 10470 + }, + { + "epoch": 2.52, + "learning_rate": 8.847388126225708e-08, + "logits/chosen": -2.6811318397521973, + "logits/rejected": -2.5494422912597656, + "logps/chosen": -232.71731567382812, + "logps/rejected": -343.6226501464844, + "loss": 0.0504, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.2904490232467651, + "rewards/margins": 12.419363975524902, + "rewards/rejected": -13.709814071655273, + "step": 10480 + }, + { + "epoch": 2.52, + "learning_rate": 8.80281690140845e-08, + "logits/chosen": -2.5047972202301025, + "logits/rejected": -2.453812599182129, + "logps/chosen": -192.08062744140625, + "logps/rejected": -309.33734130859375, + "loss": 0.0493, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5243643522262573, + "rewards/margins": 10.752581596374512, + "rewards/rejected": -11.276945114135742, + "step": 10490 + }, + { + "epoch": 2.53, + "learning_rate": 8.758245676591194e-08, + "logits/chosen": -2.4655632972717285, + "logits/rejected": -2.5261590480804443, + "logps/chosen": -268.0155334472656, + "logps/rejected": -339.0793151855469, + "loss": 0.036, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.6569886207580566, + "rewards/margins": 10.690444946289062, + "rewards/rejected": -13.347434997558594, + "step": 10500 + }, + { + "epoch": 2.53, + "learning_rate": 8.713674451773935e-08, + "logits/chosen": -2.485379219055176, + "logits/rejected": -2.406667709350586, + "logps/chosen": -222.50927734375, + "logps/rejected": -289.3091735839844, + "loss": 0.0345, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4753628969192505, + "rewards/margins": 10.559330940246582, + "rewards/rejected": -12.034693717956543, + "step": 10510 + }, + { + "epoch": 2.53, + "learning_rate": 8.669103226956677e-08, + "logits/chosen": -2.5506882667541504, + "logits/rejected": -2.442164659500122, + "logps/chosen": -228.38003540039062, + "logps/rejected": -344.5788269042969, + "loss": 0.0413, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.098649501800537, + "rewards/margins": 11.076091766357422, + "rewards/rejected": -13.1747407913208, + "step": 10520 + }, + { + "epoch": 2.53, + "learning_rate": 8.624532002139418e-08, + "logits/chosen": -2.312070846557617, + "logits/rejected": -2.3365490436553955, + "logps/chosen": -238.8922576904297, + "logps/rejected": -301.13873291015625, + "loss": 0.0376, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.9768481254577637, + "rewards/margins": 8.800875663757324, + "rewards/rejected": -12.777724266052246, + "step": 10530 + }, + { + "epoch": 2.54, + "learning_rate": 8.579960777322161e-08, + "logits/chosen": -2.561098575592041, + "logits/rejected": -2.6573328971862793, + "logps/chosen": -239.29849243164062, + "logps/rejected": -347.3624267578125, + "loss": 0.0416, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.0099174976348877, + "rewards/margins": 9.177538871765137, + "rewards/rejected": -11.187456130981445, + "step": 10540 + }, + { + "epoch": 2.54, + "learning_rate": 8.535389552504903e-08, + "logits/chosen": -2.4423718452453613, + "logits/rejected": -2.1314046382904053, + "logps/chosen": -230.24740600585938, + "logps/rejected": -328.03314208984375, + "loss": 0.0298, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.904605865478516, + "rewards/margins": 10.746953964233398, + "rewards/rejected": -15.651559829711914, + "step": 10550 + }, + { + "epoch": 2.54, + "learning_rate": 8.490818327687645e-08, + "logits/chosen": -2.4472155570983887, + "logits/rejected": -2.2407352924346924, + "logps/chosen": -276.05633544921875, + "logps/rejected": -419.28509521484375, + "loss": 0.0306, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4732625484466553, + "rewards/margins": 15.026802062988281, + "rewards/rejected": -16.500064849853516, + "step": 10560 + }, + { + "epoch": 2.54, + "learning_rate": 8.446247102870386e-08, + "logits/chosen": -2.511277437210083, + "logits/rejected": -2.3505561351776123, + "logps/chosen": -234.39535522460938, + "logps/rejected": -350.11541748046875, + "loss": 0.0404, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.86519193649292, + "rewards/margins": 11.936873435974121, + "rewards/rejected": -14.8020658493042, + "step": 10570 + }, + { + "epoch": 2.55, + "learning_rate": 8.401675878053129e-08, + "logits/chosen": -2.5857067108154297, + "logits/rejected": -2.601583957672119, + "logps/chosen": -329.2457580566406, + "logps/rejected": -431.29754638671875, + "loss": 0.0371, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.9668831825256348, + "rewards/margins": 11.971174240112305, + "rewards/rejected": -15.938056945800781, + "step": 10580 + }, + { + "epoch": 2.55, + "learning_rate": 8.357104653235871e-08, + "logits/chosen": -2.5475380420684814, + "logits/rejected": -2.4051592350006104, + "logps/chosen": -291.6007080078125, + "logps/rejected": -381.60699462890625, + "loss": 0.0378, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.024180293083191, + "rewards/margins": 11.200544357299805, + "rewards/rejected": -12.224725723266602, + "step": 10590 + }, + { + "epoch": 2.55, + "learning_rate": 8.312533428418612e-08, + "logits/chosen": -2.477518320083618, + "logits/rejected": -2.4938597679138184, + "logps/chosen": -257.22735595703125, + "logps/rejected": -313.45501708984375, + "loss": 0.0416, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8546251058578491, + "rewards/margins": 9.755518913269043, + "rewards/rejected": -11.610143661499023, + "step": 10600 + }, + { + "epoch": 2.55, + "learning_rate": 8.267962203601354e-08, + "logits/chosen": -2.3990440368652344, + "logits/rejected": -2.452619791030884, + "logps/chosen": -266.38232421875, + "logps/rejected": -357.98199462890625, + "loss": 0.0321, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0757076740264893, + "rewards/margins": 11.956494331359863, + "rewards/rejected": -13.032203674316406, + "step": 10610 + }, + { + "epoch": 2.56, + "learning_rate": 8.223390978784097e-08, + "logits/chosen": -2.5130615234375, + "logits/rejected": -2.624833583831787, + "logps/chosen": -169.1503448486328, + "logps/rejected": -341.31689453125, + "loss": 0.03, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0461747646331787, + "rewards/margins": 11.008197784423828, + "rewards/rejected": -12.05437183380127, + "step": 10620 + }, + { + "epoch": 2.56, + "learning_rate": 8.178819753966839e-08, + "logits/chosen": -2.6103034019470215, + "logits/rejected": -2.4376797676086426, + "logps/chosen": -363.60845947265625, + "logps/rejected": -427.32806396484375, + "loss": 0.0331, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0050119878724217415, + "rewards/margins": 13.436140060424805, + "rewards/rejected": -13.431129455566406, + "step": 10630 + }, + { + "epoch": 2.56, + "learning_rate": 8.13424852914958e-08, + "logits/chosen": -2.5464351177215576, + "logits/rejected": -2.5220837593078613, + "logps/chosen": -256.637939453125, + "logps/rejected": -359.1075134277344, + "loss": 0.0189, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0264241695404053, + "rewards/margins": 12.260004043579102, + "rewards/rejected": -13.286428451538086, + "step": 10640 + }, + { + "epoch": 2.56, + "learning_rate": 8.089677304332322e-08, + "logits/chosen": -2.358281373977661, + "logits/rejected": -2.372849225997925, + "logps/chosen": -262.013671875, + "logps/rejected": -370.379638671875, + "loss": 0.0374, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.2720009088516235, + "rewards/margins": 14.581092834472656, + "rewards/rejected": -13.309089660644531, + "step": 10650 + }, + { + "epoch": 2.57, + "learning_rate": 8.045106079515065e-08, + "logits/chosen": -2.3271663188934326, + "logits/rejected": -2.308911085128784, + "logps/chosen": -289.9702453613281, + "logps/rejected": -416.28643798828125, + "loss": 0.0329, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.6120532751083374, + "rewards/margins": 13.261553764343262, + "rewards/rejected": -14.87360668182373, + "step": 10660 + }, + { + "epoch": 2.57, + "learning_rate": 8.000534854697808e-08, + "logits/chosen": -2.602839231491089, + "logits/rejected": -2.441680908203125, + "logps/chosen": -235.3236083984375, + "logps/rejected": -355.11566162109375, + "loss": 0.0448, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.6526877880096436, + "rewards/margins": 10.503377914428711, + "rewards/rejected": -13.15606689453125, + "step": 10670 + }, + { + "epoch": 2.57, + "learning_rate": 7.955963629880549e-08, + "logits/chosen": -2.670300245285034, + "logits/rejected": -2.5688750743865967, + "logps/chosen": -276.58660888671875, + "logps/rejected": -432.313232421875, + "loss": 0.0328, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6207407712936401, + "rewards/margins": 14.357336044311523, + "rewards/rejected": -14.978078842163086, + "step": 10680 + }, + { + "epoch": 2.57, + "learning_rate": 7.911392405063291e-08, + "logits/chosen": -2.5545012950897217, + "logits/rejected": -2.5492682456970215, + "logps/chosen": -211.1891326904297, + "logps/rejected": -282.841552734375, + "loss": 0.0367, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.9025828242301941, + "rewards/margins": 9.527538299560547, + "rewards/rejected": -10.430120468139648, + "step": 10690 + }, + { + "epoch": 2.58, + "learning_rate": 7.866821180246034e-08, + "logits/chosen": -2.7213170528411865, + "logits/rejected": -2.7161808013916016, + "logps/chosen": -258.9945068359375, + "logps/rejected": -443.43414306640625, + "loss": 0.029, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9200226664543152, + "rewards/margins": 13.025611877441406, + "rewards/rejected": -13.945635795593262, + "step": 10700 + }, + { + "epoch": 2.58, + "learning_rate": 7.822249955428775e-08, + "logits/chosen": -2.664929151535034, + "logits/rejected": -2.4846885204315186, + "logps/chosen": -241.22171020507812, + "logps/rejected": -303.971923828125, + "loss": 0.0359, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0947069302201271, + "rewards/margins": 11.653878211975098, + "rewards/rejected": -11.748584747314453, + "step": 10710 + }, + { + "epoch": 2.58, + "learning_rate": 7.777678730611517e-08, + "logits/chosen": -2.4323935508728027, + "logits/rejected": -2.452493906021118, + "logps/chosen": -386.83465576171875, + "logps/rejected": -371.39849853515625, + "loss": 0.0425, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.2871506214141846, + "rewards/margins": 11.200230598449707, + "rewards/rejected": -14.487380981445312, + "step": 10720 + }, + { + "epoch": 2.58, + "learning_rate": 7.733107505794259e-08, + "logits/chosen": -2.409231424331665, + "logits/rejected": -2.3314690589904785, + "logps/chosen": -220.2698211669922, + "logps/rejected": -360.07476806640625, + "loss": 0.0194, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3193631172180176, + "rewards/margins": 12.024897575378418, + "rewards/rejected": -14.344259262084961, + "step": 10730 + }, + { + "epoch": 2.58, + "learning_rate": 7.688536280977002e-08, + "logits/chosen": -2.380180835723877, + "logits/rejected": -2.447178602218628, + "logps/chosen": -223.9288787841797, + "logps/rejected": -345.01123046875, + "loss": 0.0337, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7370744943618774, + "rewards/margins": 11.357762336730957, + "rewards/rejected": -13.09483814239502, + "step": 10740 + }, + { + "epoch": 2.59, + "learning_rate": 7.643965056159743e-08, + "logits/chosen": -2.5712685585021973, + "logits/rejected": -2.560652256011963, + "logps/chosen": -239.720458984375, + "logps/rejected": -363.42645263671875, + "loss": 0.014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7917028665542603, + "rewards/margins": 10.572433471679688, + "rewards/rejected": -11.3641357421875, + "step": 10750 + }, + { + "epoch": 2.59, + "learning_rate": 7.599393831342485e-08, + "logits/chosen": -2.3512864112854004, + "logits/rejected": -2.2663590908050537, + "logps/chosen": -251.39840698242188, + "logps/rejected": -387.4175109863281, + "loss": 0.0259, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3710943460464478, + "rewards/margins": 15.084039688110352, + "rewards/rejected": -16.455135345458984, + "step": 10760 + }, + { + "epoch": 2.59, + "learning_rate": 7.554822606525226e-08, + "logits/chosen": -2.5970118045806885, + "logits/rejected": -2.5169131755828857, + "logps/chosen": -300.6658630371094, + "logps/rejected": -375.5133361816406, + "loss": 0.0293, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.221968173980713, + "rewards/margins": 9.254558563232422, + "rewards/rejected": -11.476526260375977, + "step": 10770 + }, + { + "epoch": 2.59, + "learning_rate": 7.510251381707969e-08, + "logits/chosen": -2.3693273067474365, + "logits/rejected": -2.297300338745117, + "logps/chosen": -257.8289794921875, + "logps/rejected": -330.83441162109375, + "loss": 0.0433, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.5106990337371826, + "rewards/margins": 9.142082214355469, + "rewards/rejected": -10.652780532836914, + "step": 10780 + }, + { + "epoch": 2.6, + "learning_rate": 7.465680156890711e-08, + "logits/chosen": -2.4781227111816406, + "logits/rejected": -2.488508701324463, + "logps/chosen": -253.02694702148438, + "logps/rejected": -379.565185546875, + "loss": 0.0388, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2573682367801666, + "rewards/margins": 13.718050956726074, + "rewards/rejected": -13.460683822631836, + "step": 10790 + }, + { + "epoch": 2.6, + "learning_rate": 7.421108932073453e-08, + "logits/chosen": -2.562633991241455, + "logits/rejected": -2.611072063446045, + "logps/chosen": -240.0339813232422, + "logps/rejected": -447.7969665527344, + "loss": 0.0577, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.4941372871398926, + "rewards/margins": 11.667159080505371, + "rewards/rejected": -14.161297798156738, + "step": 10800 + }, + { + "epoch": 2.6, + "eval_logits/chosen": -2.2341954708099365, + "eval_logits/rejected": -2.196829319000244, + "eval_logps/chosen": -278.65399169921875, + "eval_logps/rejected": -309.213623046875, + "eval_loss": 0.6290140748023987, + "eval_rewards/accuracies": 0.6924999952316284, + "eval_rewards/chosen": -7.67264461517334, + "eval_rewards/margins": 4.295636177062988, + "eval_rewards/rejected": -11.968280792236328, + "eval_runtime": 133.4667, + "eval_samples_per_second": 23.646, + "eval_steps_per_second": 0.375, + "step": 10800 + }, + { + "epoch": 2.6, + "learning_rate": 7.376537707256194e-08, + "logits/chosen": -2.5272409915924072, + "logits/rejected": -2.4531893730163574, + "logps/chosen": -248.57968139648438, + "logps/rejected": -338.3380432128906, + "loss": 0.0229, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2790850400924683, + "rewards/margins": 10.274438858032227, + "rewards/rejected": -11.553524017333984, + "step": 10810 + }, + { + "epoch": 2.6, + "learning_rate": 7.331966482438937e-08, + "logits/chosen": -2.4732887744903564, + "logits/rejected": -2.432586908340454, + "logps/chosen": -226.4634246826172, + "logps/rejected": -326.443603515625, + "loss": 0.0276, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.4939928948879242, + "rewards/margins": 12.521749496459961, + "rewards/rejected": -12.02775764465332, + "step": 10820 + }, + { + "epoch": 2.61, + "learning_rate": 7.287395257621679e-08, + "logits/chosen": -2.3963871002197266, + "logits/rejected": -2.37272572517395, + "logps/chosen": -323.8963928222656, + "logps/rejected": -337.9401550292969, + "loss": 0.0268, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.5154550075531006, + "rewards/margins": 10.958158493041992, + "rewards/rejected": -13.473612785339355, + "step": 10830 + }, + { + "epoch": 2.61, + "learning_rate": 7.24282403280442e-08, + "logits/chosen": -2.660106658935547, + "logits/rejected": -2.6419565677642822, + "logps/chosen": -323.8818359375, + "logps/rejected": -458.4462890625, + "loss": 0.0233, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8459736108779907, + "rewards/margins": 13.067835807800293, + "rewards/rejected": -13.913810729980469, + "step": 10840 + }, + { + "epoch": 2.61, + "learning_rate": 7.198252807987163e-08, + "logits/chosen": -2.5293502807617188, + "logits/rejected": -2.537257194519043, + "logps/chosen": -194.8105010986328, + "logps/rejected": -336.81829833984375, + "loss": 0.0415, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.9545141458511353, + "rewards/margins": 10.744138717651367, + "rewards/rejected": -12.698652267456055, + "step": 10850 + }, + { + "epoch": 2.61, + "learning_rate": 7.153681583169906e-08, + "logits/chosen": -2.5184950828552246, + "logits/rejected": -2.4280002117156982, + "logps/chosen": -196.51992797851562, + "logps/rejected": -296.865966796875, + "loss": 0.0319, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.675471842288971, + "rewards/margins": 12.040105819702148, + "rewards/rejected": -12.71557903289795, + "step": 10860 + }, + { + "epoch": 2.62, + "learning_rate": 7.109110358352648e-08, + "logits/chosen": -2.5834603309631348, + "logits/rejected": -2.5854382514953613, + "logps/chosen": -298.9248046875, + "logps/rejected": -413.6004943847656, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8956855535507202, + "rewards/margins": 14.540181159973145, + "rewards/rejected": -16.435867309570312, + "step": 10870 + }, + { + "epoch": 2.62, + "learning_rate": 7.06453913353539e-08, + "logits/chosen": -2.5333352088928223, + "logits/rejected": -2.5061662197113037, + "logps/chosen": -258.09515380859375, + "logps/rejected": -396.55780029296875, + "loss": 0.0182, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07149648666381836, + "rewards/margins": 14.085748672485352, + "rewards/rejected": -14.014251708984375, + "step": 10880 + }, + { + "epoch": 2.62, + "learning_rate": 7.019967908718131e-08, + "logits/chosen": -2.518632411956787, + "logits/rejected": -2.4010226726531982, + "logps/chosen": -271.84112548828125, + "logps/rejected": -346.02154541015625, + "loss": 0.0203, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4661548137664795, + "rewards/margins": 12.127985954284668, + "rewards/rejected": -13.594141006469727, + "step": 10890 + }, + { + "epoch": 2.62, + "learning_rate": 6.975396683900874e-08, + "logits/chosen": -2.5858657360076904, + "logits/rejected": -2.621828317642212, + "logps/chosen": -223.5819549560547, + "logps/rejected": -403.63055419921875, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2753188610076904, + "rewards/margins": 11.72504997253418, + "rewards/rejected": -14.00036907196045, + "step": 10900 + }, + { + "epoch": 2.63, + "learning_rate": 6.930825459083616e-08, + "logits/chosen": -2.711143970489502, + "logits/rejected": -2.5317392349243164, + "logps/chosen": -312.88897705078125, + "logps/rejected": -298.1280822753906, + "loss": 0.0295, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8430429697036743, + "rewards/margins": 10.042176246643066, + "rewards/rejected": -11.885217666625977, + "step": 10910 + }, + { + "epoch": 2.63, + "learning_rate": 6.886254234266357e-08, + "logits/chosen": -2.4132256507873535, + "logits/rejected": -2.299051284790039, + "logps/chosen": -294.41107177734375, + "logps/rejected": -299.05609130859375, + "loss": 0.0399, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5870959758758545, + "rewards/margins": 9.86145305633545, + "rewards/rejected": -11.448549270629883, + "step": 10920 + }, + { + "epoch": 2.63, + "learning_rate": 6.841683009449099e-08, + "logits/chosen": -2.2700419425964355, + "logits/rejected": -2.2279114723205566, + "logps/chosen": -242.2595977783203, + "logps/rejected": -358.4086608886719, + "loss": 0.0173, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.5513057708740234, + "rewards/margins": 10.892278671264648, + "rewards/rejected": -14.443583488464355, + "step": 10930 + }, + { + "epoch": 2.63, + "learning_rate": 6.797111784631842e-08, + "logits/chosen": -2.401989221572876, + "logits/rejected": -2.4412052631378174, + "logps/chosen": -284.970458984375, + "logps/rejected": -384.956787109375, + "loss": 0.0381, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6625072360038757, + "rewards/margins": 14.448089599609375, + "rewards/rejected": -15.110595703125, + "step": 10940 + }, + { + "epoch": 2.64, + "learning_rate": 6.752540559814583e-08, + "logits/chosen": -2.272688627243042, + "logits/rejected": -2.2353696823120117, + "logps/chosen": -220.97085571289062, + "logps/rejected": -279.1014404296875, + "loss": 0.0266, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.1295342445373535, + "rewards/margins": 10.96465015411377, + "rewards/rejected": -13.094184875488281, + "step": 10950 + }, + { + "epoch": 2.64, + "learning_rate": 6.707969334997325e-08, + "logits/chosen": -2.442444324493408, + "logits/rejected": -2.391512393951416, + "logps/chosen": -264.117919921875, + "logps/rejected": -352.73162841796875, + "loss": 0.0339, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.863391101360321, + "rewards/margins": 11.013391494750977, + "rewards/rejected": -11.876781463623047, + "step": 10960 + }, + { + "epoch": 2.64, + "learning_rate": 6.663398110180066e-08, + "logits/chosen": -2.403355836868286, + "logits/rejected": -2.4246935844421387, + "logps/chosen": -193.77670288085938, + "logps/rejected": -314.63702392578125, + "loss": 0.0207, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0767998695373535, + "rewards/margins": 11.021893501281738, + "rewards/rejected": -13.09869384765625, + "step": 10970 + }, + { + "epoch": 2.64, + "learning_rate": 6.61882688536281e-08, + "logits/chosen": -2.401864767074585, + "logits/rejected": -2.3092455863952637, + "logps/chosen": -332.11383056640625, + "logps/rejected": -365.3248596191406, + "loss": 0.036, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.2405102252960205, + "rewards/margins": 13.149075508117676, + "rewards/rejected": -14.3895845413208, + "step": 10980 + }, + { + "epoch": 2.65, + "learning_rate": 6.574255660545551e-08, + "logits/chosen": -2.717160701751709, + "logits/rejected": -2.5159428119659424, + "logps/chosen": -383.21368408203125, + "logps/rejected": -334.6822509765625, + "loss": 0.0354, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6736847162246704, + "rewards/margins": 9.387896537780762, + "rewards/rejected": -11.0615816116333, + "step": 10990 + }, + { + "epoch": 2.65, + "learning_rate": 6.529684435728293e-08, + "logits/chosen": -2.600236415863037, + "logits/rejected": -2.3790392875671387, + "logps/chosen": -285.241455078125, + "logps/rejected": -324.14349365234375, + "loss": 0.0212, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.9113547801971436, + "rewards/margins": 9.267632484436035, + "rewards/rejected": -13.178987503051758, + "step": 11000 + }, + { + "epoch": 2.65, + "learning_rate": 6.485113210911034e-08, + "logits/chosen": -2.3917620182037354, + "logits/rejected": -2.319744825363159, + "logps/chosen": -240.36331176757812, + "logps/rejected": -335.1694641113281, + "loss": 0.0236, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0478901863098145, + "rewards/margins": 11.181989669799805, + "rewards/rejected": -13.229879379272461, + "step": 11010 + }, + { + "epoch": 2.65, + "learning_rate": 6.440541986093779e-08, + "logits/chosen": -2.3351759910583496, + "logits/rejected": -2.4058279991149902, + "logps/chosen": -224.92684936523438, + "logps/rejected": -374.34259033203125, + "loss": 0.019, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.7083711624145508, + "rewards/margins": 13.359003067016602, + "rewards/rejected": -15.067373275756836, + "step": 11020 + }, + { + "epoch": 2.65, + "learning_rate": 6.39597076127652e-08, + "logits/chosen": -2.4472146034240723, + "logits/rejected": -2.3554482460021973, + "logps/chosen": -245.9796142578125, + "logps/rejected": -351.9076232910156, + "loss": 0.0351, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9210950136184692, + "rewards/margins": 13.461766242980957, + "rewards/rejected": -14.382861137390137, + "step": 11030 + }, + { + "epoch": 2.66, + "learning_rate": 6.351399536459262e-08, + "logits/chosen": -2.6213226318359375, + "logits/rejected": -2.6294121742248535, + "logps/chosen": -273.3938903808594, + "logps/rejected": -370.6047668457031, + "loss": 0.0187, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.065000295639038, + "rewards/margins": 11.00334358215332, + "rewards/rejected": -13.068344116210938, + "step": 11040 + }, + { + "epoch": 2.66, + "learning_rate": 6.306828311642005e-08, + "logits/chosen": -2.588347911834717, + "logits/rejected": -2.5161221027374268, + "logps/chosen": -238.6955108642578, + "logps/rejected": -331.37200927734375, + "loss": 0.0286, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2676331996917725, + "rewards/margins": 11.658307075500488, + "rewards/rejected": -12.925939559936523, + "step": 11050 + }, + { + "epoch": 2.66, + "learning_rate": 6.262257086824746e-08, + "logits/chosen": -2.4580318927764893, + "logits/rejected": -2.496865749359131, + "logps/chosen": -248.82595825195312, + "logps/rejected": -391.84716796875, + "loss": 0.0293, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.505751609802246, + "rewards/margins": 14.90924072265625, + "rewards/rejected": -16.41499137878418, + "step": 11060 + }, + { + "epoch": 2.66, + "learning_rate": 6.217685862007488e-08, + "logits/chosen": -2.487553834915161, + "logits/rejected": -2.5703344345092773, + "logps/chosen": -274.4038391113281, + "logps/rejected": -426.31304931640625, + "loss": 0.0294, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0016109943389893, + "rewards/margins": 12.330926895141602, + "rewards/rejected": -13.332539558410645, + "step": 11070 + }, + { + "epoch": 2.67, + "learning_rate": 6.17311463719023e-08, + "logits/chosen": -2.2672083377838135, + "logits/rejected": -2.265392541885376, + "logps/chosen": -226.40185546875, + "logps/rejected": -280.76251220703125, + "loss": 0.0337, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.9002182483673096, + "rewards/margins": 8.221508979797363, + "rewards/rejected": -12.121726036071777, + "step": 11080 + }, + { + "epoch": 2.67, + "learning_rate": 6.128543412372972e-08, + "logits/chosen": -2.476393938064575, + "logits/rejected": -2.365628957748413, + "logps/chosen": -314.09381103515625, + "logps/rejected": -401.48175048828125, + "loss": 0.0229, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.1945843696594238, + "rewards/margins": 10.73908519744873, + "rewards/rejected": -11.93366813659668, + "step": 11090 + }, + { + "epoch": 2.67, + "learning_rate": 6.083972187555714e-08, + "logits/chosen": -2.393922805786133, + "logits/rejected": -2.3239188194274902, + "logps/chosen": -202.66156005859375, + "logps/rejected": -266.5211486816406, + "loss": 0.0403, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.612478256225586, + "rewards/margins": 12.316023826599121, + "rewards/rejected": -13.928503036499023, + "step": 11100 + }, + { + "epoch": 2.67, + "learning_rate": 6.039400962738456e-08, + "logits/chosen": -2.3518242835998535, + "logits/rejected": -2.378859043121338, + "logps/chosen": -454.7975158691406, + "logps/rejected": -392.9176940917969, + "loss": 0.0281, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3876655101776123, + "rewards/margins": 12.167783737182617, + "rewards/rejected": -14.555447578430176, + "step": 11110 + }, + { + "epoch": 2.68, + "learning_rate": 5.994829737921197e-08, + "logits/chosen": -2.493408679962158, + "logits/rejected": -2.4440033435821533, + "logps/chosen": -247.04806518554688, + "logps/rejected": -359.9694519042969, + "loss": 0.0347, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.431443691253662, + "rewards/margins": 10.620159149169922, + "rewards/rejected": -14.051603317260742, + "step": 11120 + }, + { + "epoch": 2.68, + "learning_rate": 5.9502585131039395e-08, + "logits/chosen": -2.4963178634643555, + "logits/rejected": -2.298515558242798, + "logps/chosen": -268.14385986328125, + "logps/rejected": -344.11749267578125, + "loss": 0.0298, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.784672498703003, + "rewards/margins": 11.713040351867676, + "rewards/rejected": -14.497714042663574, + "step": 11130 + }, + { + "epoch": 2.68, + "learning_rate": 5.9056872882866825e-08, + "logits/chosen": -2.4330027103424072, + "logits/rejected": -2.4529852867126465, + "logps/chosen": -238.3878936767578, + "logps/rejected": -320.8956604003906, + "loss": 0.0492, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.8928139209747314, + "rewards/margins": 9.035645484924316, + "rewards/rejected": -12.928457260131836, + "step": 11140 + }, + { + "epoch": 2.68, + "learning_rate": 5.861116063469424e-08, + "logits/chosen": -2.3670287132263184, + "logits/rejected": -2.414227247238159, + "logps/chosen": -202.35137939453125, + "logps/rejected": -309.3493957519531, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23711788654327393, + "rewards/margins": 12.99688720703125, + "rewards/rejected": -13.234004020690918, + "step": 11150 + }, + { + "epoch": 2.69, + "learning_rate": 5.8165448386521663e-08, + "logits/chosen": -2.4794349670410156, + "logits/rejected": -2.357036590576172, + "logps/chosen": -299.997802734375, + "logps/rejected": -373.39080810546875, + "loss": 0.0994, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8509117960929871, + "rewards/margins": 15.069299697875977, + "rewards/rejected": -15.920211791992188, + "step": 11160 + }, + { + "epoch": 2.69, + "learning_rate": 5.771973613834908e-08, + "logits/chosen": -2.2667782306671143, + "logits/rejected": -2.3640787601470947, + "logps/chosen": -257.46453857421875, + "logps/rejected": -464.90216064453125, + "loss": 0.0448, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.3583171367645264, + "rewards/margins": 12.97047233581543, + "rewards/rejected": -15.328790664672852, + "step": 11170 + }, + { + "epoch": 2.69, + "learning_rate": 5.72740238901765e-08, + "logits/chosen": -2.4249043464660645, + "logits/rejected": -2.1784474849700928, + "logps/chosen": -285.65008544921875, + "logps/rejected": -348.6196594238281, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3535035848617554, + "rewards/margins": 12.062604904174805, + "rewards/rejected": -13.416107177734375, + "step": 11180 + }, + { + "epoch": 2.69, + "learning_rate": 5.682831164200392e-08, + "logits/chosen": -2.6384506225585938, + "logits/rejected": -2.4709858894348145, + "logps/chosen": -302.586669921875, + "logps/rejected": -415.0191345214844, + "loss": 0.0181, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9068059921264648, + "rewards/margins": 13.053243637084961, + "rewards/rejected": -13.960049629211426, + "step": 11190 + }, + { + "epoch": 2.7, + "learning_rate": 5.638259939383134e-08, + "logits/chosen": -2.524169921875, + "logits/rejected": -2.40922474861145, + "logps/chosen": -360.9150390625, + "logps/rejected": -444.46533203125, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.248106837272644, + "rewards/margins": 13.789751052856445, + "rewards/rejected": -15.037857055664062, + "step": 11200 + }, + { + "epoch": 2.7, + "eval_logits/chosen": -2.2006473541259766, + "eval_logits/rejected": -2.162346839904785, + "eval_logps/chosen": -274.2283935546875, + "eval_logps/rejected": -304.8287048339844, + "eval_loss": 0.6259632706642151, + "eval_rewards/accuracies": 0.6825000047683716, + "eval_rewards/chosen": -7.230083465576172, + "eval_rewards/margins": 4.299704074859619, + "eval_rewards/rejected": -11.52978801727295, + "eval_runtime": 134.2311, + "eval_samples_per_second": 23.512, + "eval_steps_per_second": 0.372, + "step": 11200 + }, + { + "epoch": 2.7, + "learning_rate": 5.593688714565876e-08, + "logits/chosen": -2.505527973175049, + "logits/rejected": -2.483219861984253, + "logps/chosen": -285.79278564453125, + "logps/rejected": -396.4827880859375, + "loss": 0.0474, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0627493858337402, + "rewards/margins": 10.589037895202637, + "rewards/rejected": -12.651786804199219, + "step": 11210 + }, + { + "epoch": 2.7, + "learning_rate": 5.549117489748618e-08, + "logits/chosen": -2.4316835403442383, + "logits/rejected": -2.2311272621154785, + "logps/chosen": -407.55078125, + "logps/rejected": -559.0494995117188, + "loss": 0.0299, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7516244649887085, + "rewards/margins": 19.262540817260742, + "rewards/rejected": -17.51091766357422, + "step": 11220 + }, + { + "epoch": 2.7, + "learning_rate": 5.50454626493136e-08, + "logits/chosen": -2.3240840435028076, + "logits/rejected": -2.3160452842712402, + "logps/chosen": -206.2075653076172, + "logps/rejected": -300.61151123046875, + "loss": 0.0296, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.806415855884552, + "rewards/margins": 12.24956226348877, + "rewards/rejected": -13.055978775024414, + "step": 11230 + }, + { + "epoch": 2.71, + "learning_rate": 5.4599750401141025e-08, + "logits/chosen": -2.434230089187622, + "logits/rejected": -2.3133277893066406, + "logps/chosen": -243.75863647460938, + "logps/rejected": -367.0157775878906, + "loss": 0.0245, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.664422035217285, + "rewards/margins": 10.782770156860352, + "rewards/rejected": -13.447192192077637, + "step": 11240 + }, + { + "epoch": 2.71, + "learning_rate": 5.415403815296844e-08, + "logits/chosen": -2.3377509117126465, + "logits/rejected": -2.351390838623047, + "logps/chosen": -274.16522216796875, + "logps/rejected": -353.56158447265625, + "loss": 0.0299, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5112389922142029, + "rewards/margins": 13.706448554992676, + "rewards/rejected": -14.217687606811523, + "step": 11250 + }, + { + "epoch": 2.71, + "learning_rate": 5.3708325904795864e-08, + "logits/chosen": -2.5061848163604736, + "logits/rejected": -2.394094944000244, + "logps/chosen": -280.7466735839844, + "logps/rejected": -361.4125061035156, + "loss": 0.0191, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9356305599212646, + "rewards/margins": 10.402562141418457, + "rewards/rejected": -12.3381929397583, + "step": 11260 + }, + { + "epoch": 2.71, + "learning_rate": 5.326261365662328e-08, + "logits/chosen": -2.435429096221924, + "logits/rejected": -2.3512542247772217, + "logps/chosen": -250.2201385498047, + "logps/rejected": -404.38128662109375, + "loss": 0.0343, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6449331641197205, + "rewards/margins": 14.451985359191895, + "rewards/rejected": -15.096919059753418, + "step": 11270 + }, + { + "epoch": 2.71, + "learning_rate": 5.28169014084507e-08, + "logits/chosen": -2.5936787128448486, + "logits/rejected": -2.437343120574951, + "logps/chosen": -452.4847106933594, + "logps/rejected": -542.2637939453125, + "loss": 0.0378, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.385621190071106, + "rewards/margins": 18.973867416381836, + "rewards/rejected": -17.588247299194336, + "step": 11280 + }, + { + "epoch": 2.72, + "learning_rate": 5.237118916027812e-08, + "logits/chosen": -2.3915462493896484, + "logits/rejected": -2.471947193145752, + "logps/chosen": -235.27487182617188, + "logps/rejected": -347.845458984375, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0457872152328491, + "rewards/margins": 12.638503074645996, + "rewards/rejected": -13.684290885925293, + "step": 11290 + }, + { + "epoch": 2.72, + "learning_rate": 5.192547691210554e-08, + "logits/chosen": -2.5435686111450195, + "logits/rejected": -2.5178191661834717, + "logps/chosen": -286.24530029296875, + "logps/rejected": -341.6558837890625, + "loss": 0.0325, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.60090970993042, + "rewards/margins": 9.717835426330566, + "rewards/rejected": -13.318742752075195, + "step": 11300 + }, + { + "epoch": 2.72, + "learning_rate": 5.147976466393296e-08, + "logits/chosen": -2.3367373943328857, + "logits/rejected": -2.254424810409546, + "logps/chosen": -204.71485900878906, + "logps/rejected": -354.184814453125, + "loss": 0.0259, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.430279731750488, + "rewards/margins": 10.637048721313477, + "rewards/rejected": -15.067327499389648, + "step": 11310 + }, + { + "epoch": 2.72, + "learning_rate": 5.103405241576039e-08, + "logits/chosen": -2.33821177482605, + "logits/rejected": -2.2306008338928223, + "logps/chosen": -188.8552703857422, + "logps/rejected": -264.550048828125, + "loss": 0.0239, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7913947105407715, + "rewards/margins": 10.940762519836426, + "rewards/rejected": -13.732156753540039, + "step": 11320 + }, + { + "epoch": 2.73, + "learning_rate": 5.05883401675878e-08, + "logits/chosen": -2.525364637374878, + "logits/rejected": -2.46457839012146, + "logps/chosen": -343.2807312011719, + "logps/rejected": -489.612060546875, + "loss": 0.02, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20165547728538513, + "rewards/margins": 17.081844329833984, + "rewards/rejected": -16.88018798828125, + "step": 11330 + }, + { + "epoch": 2.73, + "learning_rate": 5.0142627919415226e-08, + "logits/chosen": -2.560027599334717, + "logits/rejected": -2.4247658252716064, + "logps/chosen": -215.8699493408203, + "logps/rejected": -268.9886169433594, + "loss": 0.023, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.39333075284957886, + "rewards/margins": 11.755952835083008, + "rewards/rejected": -11.362622261047363, + "step": 11340 + }, + { + "epoch": 2.73, + "learning_rate": 4.969691567124264e-08, + "logits/chosen": -2.461838960647583, + "logits/rejected": -2.450381278991699, + "logps/chosen": -364.3572998046875, + "logps/rejected": -452.86669921875, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1634099930524826, + "rewards/margins": 12.84306526184082, + "rewards/rejected": -13.006475448608398, + "step": 11350 + }, + { + "epoch": 2.73, + "learning_rate": 4.9251203423070065e-08, + "logits/chosen": -2.484093189239502, + "logits/rejected": -2.3913586139678955, + "logps/chosen": -239.50033569335938, + "logps/rejected": -251.77700805664062, + "loss": 0.0173, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1334115266799927, + "rewards/margins": 10.335721015930176, + "rewards/rejected": -11.469133377075195, + "step": 11360 + }, + { + "epoch": 2.74, + "learning_rate": 4.880549117489748e-08, + "logits/chosen": -2.5914254188537598, + "logits/rejected": -2.37621808052063, + "logps/chosen": -341.57904052734375, + "logps/rejected": -336.15582275390625, + "loss": 0.027, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0589287281036377, + "rewards/margins": 11.350578308105469, + "rewards/rejected": -12.409507751464844, + "step": 11370 + }, + { + "epoch": 2.74, + "learning_rate": 4.8359778926724904e-08, + "logits/chosen": -2.3758625984191895, + "logits/rejected": -2.3536620140075684, + "logps/chosen": -271.1437683105469, + "logps/rejected": -358.9661560058594, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1041805744171143, + "rewards/margins": 12.512085914611816, + "rewards/rejected": -13.616266250610352, + "step": 11380 + }, + { + "epoch": 2.74, + "learning_rate": 4.791406667855232e-08, + "logits/chosen": -2.556595802307129, + "logits/rejected": -2.530003309249878, + "logps/chosen": -297.84808349609375, + "logps/rejected": -388.65423583984375, + "loss": 0.0312, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.1992433071136475, + "rewards/margins": 12.781665802001953, + "rewards/rejected": -13.98090934753418, + "step": 11390 + }, + { + "epoch": 2.74, + "learning_rate": 4.746835443037975e-08, + "logits/chosen": -2.4237866401672363, + "logits/rejected": -2.2597763538360596, + "logps/chosen": -250.51962280273438, + "logps/rejected": -363.7515869140625, + "loss": 0.0236, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5471956729888916, + "rewards/margins": 13.305665969848633, + "rewards/rejected": -14.852861404418945, + "step": 11400 + }, + { + "epoch": 2.75, + "learning_rate": 4.7022642182207165e-08, + "logits/chosen": -2.556197166442871, + "logits/rejected": -2.604642391204834, + "logps/chosen": -412.33660888671875, + "logps/rejected": -461.094482421875, + "loss": 0.0377, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17471656203269958, + "rewards/margins": 13.933385848999023, + "rewards/rejected": -14.108102798461914, + "step": 11410 + }, + { + "epoch": 2.75, + "learning_rate": 4.657692993403459e-08, + "logits/chosen": -2.2716872692108154, + "logits/rejected": -2.2458138465881348, + "logps/chosen": -219.63784790039062, + "logps/rejected": -359.21185302734375, + "loss": 0.0199, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.3408005237579346, + "rewards/margins": 11.46550178527832, + "rewards/rejected": -13.806303024291992, + "step": 11420 + }, + { + "epoch": 2.75, + "learning_rate": 4.6131217685862004e-08, + "logits/chosen": -2.3509669303894043, + "logits/rejected": -2.422544002532959, + "logps/chosen": -205.3550567626953, + "logps/rejected": -342.86895751953125, + "loss": 0.0244, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3986648321151733, + "rewards/margins": 13.367385864257812, + "rewards/rejected": -14.766050338745117, + "step": 11430 + }, + { + "epoch": 2.75, + "learning_rate": 4.5685505437689427e-08, + "logits/chosen": -2.606081962585449, + "logits/rejected": -2.4485602378845215, + "logps/chosen": -339.7007141113281, + "logps/rejected": -361.31219482421875, + "loss": 0.0168, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.799737572669983, + "rewards/margins": 10.338393211364746, + "rewards/rejected": -12.138131141662598, + "step": 11440 + }, + { + "epoch": 2.76, + "learning_rate": 4.523979318951684e-08, + "logits/chosen": -2.4293484687805176, + "logits/rejected": -2.3089652061462402, + "logps/chosen": -288.80133056640625, + "logps/rejected": -384.42108154296875, + "loss": 0.0339, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2991480827331543, + "rewards/margins": 11.283964157104492, + "rewards/rejected": -13.583112716674805, + "step": 11450 + }, + { + "epoch": 2.76, + "learning_rate": 4.4794080941344265e-08, + "logits/chosen": -2.436283588409424, + "logits/rejected": -2.402207851409912, + "logps/chosen": -303.0555114746094, + "logps/rejected": -428.24371337890625, + "loss": 0.0271, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.7075694799423218, + "rewards/margins": 13.223734855651855, + "rewards/rejected": -14.931304931640625, + "step": 11460 + }, + { + "epoch": 2.76, + "learning_rate": 4.434836869317168e-08, + "logits/chosen": -2.375192165374756, + "logits/rejected": -2.3349125385284424, + "logps/chosen": -361.15667724609375, + "logps/rejected": -416.8773498535156, + "loss": 0.0327, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.413932800292969, + "rewards/margins": 10.136090278625488, + "rewards/rejected": -14.550024032592773, + "step": 11470 + }, + { + "epoch": 2.76, + "learning_rate": 4.3902656444999104e-08, + "logits/chosen": -2.4863932132720947, + "logits/rejected": -2.433408498764038, + "logps/chosen": -290.609375, + "logps/rejected": -377.8664245605469, + "loss": 0.0243, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6029821634292603, + "rewards/margins": 13.398452758789062, + "rewards/rejected": -14.001434326171875, + "step": 11480 + }, + { + "epoch": 2.77, + "learning_rate": 4.345694419682653e-08, + "logits/chosen": -2.5686421394348145, + "logits/rejected": -2.42051100730896, + "logps/chosen": -258.9626770019531, + "logps/rejected": -362.4114074707031, + "loss": 0.0309, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0931127071380615, + "rewards/margins": 11.589851379394531, + "rewards/rejected": -12.682964324951172, + "step": 11490 + }, + { + "epoch": 2.77, + "learning_rate": 4.301123194865395e-08, + "logits/chosen": -2.4282636642456055, + "logits/rejected": -2.4071927070617676, + "logps/chosen": -294.930419921875, + "logps/rejected": -374.0691223144531, + "loss": 0.0275, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04943211004137993, + "rewards/margins": 12.023344039916992, + "rewards/rejected": -11.973912239074707, + "step": 11500 + }, + { + "epoch": 2.77, + "learning_rate": 4.2565519700481366e-08, + "logits/chosen": -2.457975387573242, + "logits/rejected": -2.432142734527588, + "logps/chosen": -220.9159698486328, + "logps/rejected": -428.310546875, + "loss": 0.0738, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.2498180866241455, + "rewards/margins": 14.47473430633545, + "rewards/rejected": -14.724553108215332, + "step": 11510 + }, + { + "epoch": 2.77, + "learning_rate": 4.211980745230879e-08, + "logits/chosen": -2.371051788330078, + "logits/rejected": -2.4538283348083496, + "logps/chosen": -263.51220703125, + "logps/rejected": -334.24102783203125, + "loss": 0.033, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.7677242755889893, + "rewards/margins": 11.65418815612793, + "rewards/rejected": -13.421911239624023, + "step": 11520 + }, + { + "epoch": 2.77, + "learning_rate": 4.1674095204136205e-08, + "logits/chosen": -2.494645595550537, + "logits/rejected": -2.414339303970337, + "logps/chosen": -265.6726989746094, + "logps/rejected": -371.0367736816406, + "loss": 0.0582, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7944694757461548, + "rewards/margins": 10.575716972351074, + "rewards/rejected": -11.370186805725098, + "step": 11530 + }, + { + "epoch": 2.78, + "learning_rate": 4.122838295596363e-08, + "logits/chosen": -2.43843412399292, + "logits/rejected": -2.4583516120910645, + "logps/chosen": -208.3157958984375, + "logps/rejected": -390.5905456542969, + "loss": 0.0331, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8241792917251587, + "rewards/margins": 14.587809562683105, + "rewards/rejected": -16.4119873046875, + "step": 11540 + }, + { + "epoch": 2.78, + "learning_rate": 4.0782670707791043e-08, + "logits/chosen": -2.3243870735168457, + "logits/rejected": -2.2328686714172363, + "logps/chosen": -242.69369506835938, + "logps/rejected": -397.82794189453125, + "loss": 0.037, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1840770244598389, + "rewards/margins": 14.214498519897461, + "rewards/rejected": -15.398576736450195, + "step": 11550 + }, + { + "epoch": 2.78, + "learning_rate": 4.0336958459618466e-08, + "logits/chosen": -2.587477922439575, + "logits/rejected": -2.4498391151428223, + "logps/chosen": -353.0970764160156, + "logps/rejected": -425.1826171875, + "loss": 0.0613, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.048505425453186, + "rewards/margins": 13.219686508178711, + "rewards/rejected": -14.26819133758545, + "step": 11560 + }, + { + "epoch": 2.78, + "learning_rate": 3.989124621144589e-08, + "logits/chosen": -2.463712692260742, + "logits/rejected": -2.4487709999084473, + "logps/chosen": -251.2293701171875, + "logps/rejected": -348.6514892578125, + "loss": 0.0485, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0674989223480225, + "rewards/margins": 12.788418769836426, + "rewards/rejected": -13.855916976928711, + "step": 11570 + }, + { + "epoch": 2.79, + "learning_rate": 3.944553396327331e-08, + "logits/chosen": -2.357205867767334, + "logits/rejected": -2.3697304725646973, + "logps/chosen": -231.0236358642578, + "logps/rejected": -308.3990478515625, + "loss": 0.0381, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.9180500507354736, + "rewards/margins": 9.993398666381836, + "rewards/rejected": -13.91144847869873, + "step": 11580 + }, + { + "epoch": 2.79, + "learning_rate": 3.899982171510073e-08, + "logits/chosen": -2.4156291484832764, + "logits/rejected": -2.42352557182312, + "logps/chosen": -244.0848388671875, + "logps/rejected": -332.1519470214844, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.404517650604248, + "rewards/margins": 10.540016174316406, + "rewards/rejected": -13.944534301757812, + "step": 11590 + }, + { + "epoch": 2.79, + "learning_rate": 3.855410946692815e-08, + "logits/chosen": -2.407479763031006, + "logits/rejected": -2.460360288619995, + "logps/chosen": -388.8629455566406, + "logps/rejected": -623.1227416992188, + "loss": 0.0328, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.769385576248169, + "rewards/margins": 13.049238204956055, + "rewards/rejected": -15.818624496459961, + "step": 11600 + }, + { + "epoch": 2.79, + "eval_logits/chosen": -2.176734447479248, + "eval_logits/rejected": -2.13875675201416, + "eval_logps/chosen": -278.0234069824219, + "eval_logps/rejected": -309.6459655761719, + "eval_loss": 0.6324948668479919, + "eval_rewards/accuracies": 0.6949999928474426, + "eval_rewards/chosen": -7.609586715698242, + "eval_rewards/margins": 4.401934623718262, + "eval_rewards/rejected": -12.011521339416504, + "eval_runtime": 132.5011, + "eval_samples_per_second": 23.819, + "eval_steps_per_second": 0.377, + "step": 11600 + }, + { + "epoch": 2.79, + "learning_rate": 3.8108397218755566e-08, + "logits/chosen": -2.354337453842163, + "logits/rejected": -2.3427820205688477, + "logps/chosen": -253.9640655517578, + "logps/rejected": -356.81402587890625, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.540785312652588, + "rewards/margins": 9.694784164428711, + "rewards/rejected": -11.235569953918457, + "step": 11610 + }, + { + "epoch": 2.8, + "learning_rate": 3.766268497058299e-08, + "logits/chosen": -2.416257381439209, + "logits/rejected": -2.4568800926208496, + "logps/chosen": -358.51763916015625, + "logps/rejected": -404.95635986328125, + "loss": 0.0282, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8332737684249878, + "rewards/margins": 12.491031646728516, + "rewards/rejected": -13.324304580688477, + "step": 11620 + }, + { + "epoch": 2.8, + "learning_rate": 3.721697272241041e-08, + "logits/chosen": -2.5190837383270264, + "logits/rejected": -2.3154964447021484, + "logps/chosen": -274.7784423828125, + "logps/rejected": -369.0568542480469, + "loss": 0.0358, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48575448989868164, + "rewards/margins": 15.805773735046387, + "rewards/rejected": -15.320019721984863, + "step": 11630 + }, + { + "epoch": 2.8, + "learning_rate": 3.677126047423783e-08, + "logits/chosen": -2.546103000640869, + "logits/rejected": -2.5743775367736816, + "logps/chosen": -263.50146484375, + "logps/rejected": -404.35931396484375, + "loss": 0.0317, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9340923428535461, + "rewards/margins": 11.29251480102539, + "rewards/rejected": -12.226606369018555, + "step": 11640 + }, + { + "epoch": 2.8, + "learning_rate": 3.632554822606525e-08, + "logits/chosen": -2.4657437801361084, + "logits/rejected": -2.487313747406006, + "logps/chosen": -268.81475830078125, + "logps/rejected": -417.0496520996094, + "loss": 0.0417, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.2531711459159851, + "rewards/margins": 14.597402572631836, + "rewards/rejected": -14.850573539733887, + "step": 11650 + }, + { + "epoch": 2.81, + "learning_rate": 3.5879835977892673e-08, + "logits/chosen": -2.512868881225586, + "logits/rejected": -2.487403392791748, + "logps/chosen": -297.1785888671875, + "logps/rejected": -486.1136169433594, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2635729312896729, + "rewards/margins": 12.216447830200195, + "rewards/rejected": -13.480020523071289, + "step": 11660 + }, + { + "epoch": 2.81, + "learning_rate": 3.5434123729720096e-08, + "logits/chosen": -2.3326098918914795, + "logits/rejected": -2.2055530548095703, + "logps/chosen": -284.63873291015625, + "logps/rejected": -447.083251953125, + "loss": 0.0352, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.730940043926239, + "rewards/margins": 12.714864730834961, + "rewards/rejected": -13.445805549621582, + "step": 11670 + }, + { + "epoch": 2.81, + "learning_rate": 3.498841148154751e-08, + "logits/chosen": -2.3177332878112793, + "logits/rejected": -2.3130042552948, + "logps/chosen": -243.593017578125, + "logps/rejected": -440.8395080566406, + "loss": 0.0525, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.358832359313965, + "rewards/margins": 10.459243774414062, + "rewards/rejected": -14.818075180053711, + "step": 11680 + }, + { + "epoch": 2.81, + "learning_rate": 3.4542699233374935e-08, + "logits/chosen": -2.4748282432556152, + "logits/rejected": -2.444901943206787, + "logps/chosen": -351.2947082519531, + "logps/rejected": -421.42169189453125, + "loss": 0.0338, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3172955513000488, + "rewards/margins": 11.108989715576172, + "rewards/rejected": -12.426286697387695, + "step": 11690 + }, + { + "epoch": 2.82, + "learning_rate": 3.409698698520235e-08, + "logits/chosen": -2.603511095046997, + "logits/rejected": -2.533597707748413, + "logps/chosen": -396.93988037109375, + "logps/rejected": -450.39996337890625, + "loss": 0.0273, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6498171091079712, + "rewards/margins": 13.239534378051758, + "rewards/rejected": -13.889350891113281, + "step": 11700 + }, + { + "epoch": 2.82, + "learning_rate": 3.3651274737029774e-08, + "logits/chosen": -2.388713836669922, + "logits/rejected": -2.4331793785095215, + "logps/chosen": -243.74618530273438, + "logps/rejected": -333.30755615234375, + "loss": 0.0306, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.776449203491211, + "rewards/margins": 9.73577880859375, + "rewards/rejected": -12.512228012084961, + "step": 11710 + }, + { + "epoch": 2.82, + "learning_rate": 3.320556248885719e-08, + "logits/chosen": -2.6002614498138428, + "logits/rejected": -2.538332223892212, + "logps/chosen": -363.6770935058594, + "logps/rejected": -447.22161865234375, + "loss": 0.0267, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1041080951690674, + "rewards/margins": 12.499316215515137, + "rewards/rejected": -13.603424072265625, + "step": 11720 + }, + { + "epoch": 2.82, + "learning_rate": 3.275985024068461e-08, + "logits/chosen": -2.549431324005127, + "logits/rejected": -2.378579616546631, + "logps/chosen": -354.511962890625, + "logps/rejected": -424.26287841796875, + "loss": 0.0209, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.980708122253418, + "rewards/margins": 12.861539840698242, + "rewards/rejected": -13.842247009277344, + "step": 11730 + }, + { + "epoch": 2.83, + "learning_rate": 3.2314137992512035e-08, + "logits/chosen": -2.317164659500122, + "logits/rejected": -2.1640431880950928, + "logps/chosen": -331.0914306640625, + "logps/rejected": -359.4324645996094, + "loss": 0.0249, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.1258039474487305, + "rewards/margins": 10.919458389282227, + "rewards/rejected": -15.045262336730957, + "step": 11740 + }, + { + "epoch": 2.83, + "learning_rate": 3.186842574433946e-08, + "logits/chosen": -2.5726194381713867, + "logits/rejected": -2.4622859954833984, + "logps/chosen": -256.5213928222656, + "logps/rejected": -366.3427734375, + "loss": 0.033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2834794521331787, + "rewards/margins": 13.488180160522461, + "rewards/rejected": -13.771661758422852, + "step": 11750 + }, + { + "epoch": 2.83, + "learning_rate": 3.1422713496166874e-08, + "logits/chosen": -2.4840989112854004, + "logits/rejected": -2.4598278999328613, + "logps/chosen": -219.7997589111328, + "logps/rejected": -393.89788818359375, + "loss": 0.0463, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23123475909233093, + "rewards/margins": 14.300189018249512, + "rewards/rejected": -14.531425476074219, + "step": 11760 + }, + { + "epoch": 2.83, + "learning_rate": 3.09770012479943e-08, + "logits/chosen": -2.6010186672210693, + "logits/rejected": -2.457659959793091, + "logps/chosen": -313.2583923339844, + "logps/rejected": -553.55322265625, + "loss": 0.0217, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0449435710906982, + "rewards/margins": 21.27800178527832, + "rewards/rejected": -20.23305892944336, + "step": 11770 + }, + { + "epoch": 2.84, + "learning_rate": 3.053128899982171e-08, + "logits/chosen": -2.424867868423462, + "logits/rejected": -2.2959485054016113, + "logps/chosen": -297.36724853515625, + "logps/rejected": -291.015380859375, + "loss": 0.0428, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1360621452331543, + "rewards/margins": 10.758634567260742, + "rewards/rejected": -12.894696235656738, + "step": 11780 + }, + { + "epoch": 2.84, + "learning_rate": 3.0085576751649136e-08, + "logits/chosen": -2.5902180671691895, + "logits/rejected": -2.3549537658691406, + "logps/chosen": -416.59326171875, + "logps/rejected": -386.328857421875, + "loss": 0.0158, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.3209166526794434, + "rewards/margins": 11.371603012084961, + "rewards/rejected": -14.692520141601562, + "step": 11790 + }, + { + "epoch": 2.84, + "learning_rate": 2.963986450347655e-08, + "logits/chosen": -2.6646437644958496, + "logits/rejected": -2.477449893951416, + "logps/chosen": -300.90216064453125, + "logps/rejected": -407.06573486328125, + "loss": 0.0362, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.187294602394104, + "rewards/margins": 13.52137279510498, + "rewards/rejected": -14.708666801452637, + "step": 11800 + }, + { + "epoch": 2.84, + "learning_rate": 2.9194152255303974e-08, + "logits/chosen": -2.601910352706909, + "logits/rejected": -2.3476791381835938, + "logps/chosen": -271.9495849609375, + "logps/rejected": -325.62652587890625, + "loss": 0.0396, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8956215381622314, + "rewards/margins": 10.339010238647461, + "rewards/rejected": -12.234630584716797, + "step": 11810 + }, + { + "epoch": 2.84, + "learning_rate": 2.8748440007131394e-08, + "logits/chosen": -2.474547863006592, + "logits/rejected": -2.5126729011535645, + "logps/chosen": -239.34182739257812, + "logps/rejected": -356.5572814941406, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9234554767608643, + "rewards/margins": 10.614490509033203, + "rewards/rejected": -12.537944793701172, + "step": 11820 + }, + { + "epoch": 2.85, + "learning_rate": 2.8302727758958813e-08, + "logits/chosen": -2.447876453399658, + "logits/rejected": -2.273216962814331, + "logps/chosen": -272.24359130859375, + "logps/rejected": -399.24481201171875, + "loss": 0.0285, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6911055445671082, + "rewards/margins": 13.995025634765625, + "rewards/rejected": -14.686132431030273, + "step": 11830 + }, + { + "epoch": 2.85, + "learning_rate": 2.7857015510786233e-08, + "logits/chosen": -2.432492971420288, + "logits/rejected": -2.529512405395508, + "logps/chosen": -275.92083740234375, + "logps/rejected": -437.6058654785156, + "loss": 0.0331, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2148009538650513, + "rewards/margins": 12.627706527709961, + "rewards/rejected": -13.842506408691406, + "step": 11840 + }, + { + "epoch": 2.85, + "learning_rate": 2.7411303262613655e-08, + "logits/chosen": -2.1875171661376953, + "logits/rejected": -2.139634847640991, + "logps/chosen": -189.89190673828125, + "logps/rejected": -250.09097290039062, + "loss": 0.0405, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.3723418712615967, + "rewards/margins": 8.872549057006836, + "rewards/rejected": -12.244890213012695, + "step": 11850 + }, + { + "epoch": 2.85, + "learning_rate": 2.6965591014441075e-08, + "logits/chosen": -2.489630699157715, + "logits/rejected": -2.4272329807281494, + "logps/chosen": -273.12139892578125, + "logps/rejected": -408.4613342285156, + "loss": 0.025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5734224319458008, + "rewards/margins": 11.171777725219727, + "rewards/rejected": -11.745201110839844, + "step": 11860 + }, + { + "epoch": 2.86, + "learning_rate": 2.6519878766268494e-08, + "logits/chosen": -2.526616334915161, + "logits/rejected": -2.511665105819702, + "logps/chosen": -364.6877136230469, + "logps/rejected": -448.8736267089844, + "loss": 0.0297, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9400394558906555, + "rewards/margins": 14.034416198730469, + "rewards/rejected": -14.974454879760742, + "step": 11870 + }, + { + "epoch": 2.86, + "learning_rate": 2.6074166518095914e-08, + "logits/chosen": -2.396233558654785, + "logits/rejected": -2.23172926902771, + "logps/chosen": -292.914794921875, + "logps/rejected": -443.27764892578125, + "loss": 0.0318, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.0942294597625732, + "rewards/margins": 14.994674682617188, + "rewards/rejected": -17.088903427124023, + "step": 11880 + }, + { + "epoch": 2.86, + "learning_rate": 2.562845426992334e-08, + "logits/chosen": -2.5167908668518066, + "logits/rejected": -2.5856142044067383, + "logps/chosen": -236.70388793945312, + "logps/rejected": -390.9494934082031, + "loss": 0.0558, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9860879778862, + "rewards/margins": 10.780597686767578, + "rewards/rejected": -11.766684532165527, + "step": 11890 + }, + { + "epoch": 2.86, + "learning_rate": 2.518274202175076e-08, + "logits/chosen": -2.3103814125061035, + "logits/rejected": -2.3382656574249268, + "logps/chosen": -254.10647583007812, + "logps/rejected": -315.13250732421875, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0061086416244507, + "rewards/margins": 11.909418106079102, + "rewards/rejected": -12.91552734375, + "step": 11900 + }, + { + "epoch": 2.87, + "learning_rate": 2.4737029773578178e-08, + "logits/chosen": -2.516348123550415, + "logits/rejected": -2.570709228515625, + "logps/chosen": -282.7447509765625, + "logps/rejected": -372.11151123046875, + "loss": 0.0349, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.984648585319519, + "rewards/margins": 11.936151504516602, + "rewards/rejected": -12.920801162719727, + "step": 11910 + }, + { + "epoch": 2.87, + "learning_rate": 2.4291317525405598e-08, + "logits/chosen": -2.423058271408081, + "logits/rejected": -2.390669584274292, + "logps/chosen": -286.0511169433594, + "logps/rejected": -361.89581298828125, + "loss": 0.0184, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.80135178565979, + "rewards/margins": 11.103158950805664, + "rewards/rejected": -12.904510498046875, + "step": 11920 + }, + { + "epoch": 2.87, + "learning_rate": 2.3845605277233017e-08, + "logits/chosen": -2.5798749923706055, + "logits/rejected": -2.433136463165283, + "logps/chosen": -254.32369995117188, + "logps/rejected": -274.3641357421875, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7883821725845337, + "rewards/margins": 9.948869705200195, + "rewards/rejected": -11.737250328063965, + "step": 11930 + }, + { + "epoch": 2.87, + "learning_rate": 2.339989302906044e-08, + "logits/chosen": -2.6218628883361816, + "logits/rejected": -2.4903016090393066, + "logps/chosen": -273.2744140625, + "logps/rejected": -320.1408996582031, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44978299736976624, + "rewards/margins": 10.955463409423828, + "rewards/rejected": -11.40524673461914, + "step": 11940 + }, + { + "epoch": 2.88, + "learning_rate": 2.295418078088786e-08, + "logits/chosen": -2.639375925064087, + "logits/rejected": -2.454336404800415, + "logps/chosen": -351.46722412109375, + "logps/rejected": -401.80218505859375, + "loss": 0.0272, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7721977233886719, + "rewards/margins": 12.326979637145996, + "rewards/rejected": -14.099177360534668, + "step": 11950 + }, + { + "epoch": 2.88, + "learning_rate": 2.250846853271528e-08, + "logits/chosen": -2.226454257965088, + "logits/rejected": -2.206085205078125, + "logps/chosen": -327.9383239746094, + "logps/rejected": -373.9483642578125, + "loss": 0.0713, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020443439483642578, + "rewards/margins": 13.046069145202637, + "rewards/rejected": -13.066513061523438, + "step": 11960 + }, + { + "epoch": 2.88, + "learning_rate": 2.2062756284542698e-08, + "logits/chosen": -2.186215877532959, + "logits/rejected": -2.3007748126983643, + "logps/chosen": -390.6415710449219, + "logps/rejected": -473.8223571777344, + "loss": 0.036, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.41581654548645, + "rewards/margins": 13.948402404785156, + "rewards/rejected": -16.364215850830078, + "step": 11970 + }, + { + "epoch": 2.88, + "learning_rate": 2.161704403637012e-08, + "logits/chosen": -2.66757869720459, + "logits/rejected": -2.659167528152466, + "logps/chosen": -331.8939514160156, + "logps/rejected": -423.4248046875, + "loss": 0.0296, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1421377658843994, + "rewards/margins": 15.744915962219238, + "rewards/rejected": -14.602778434753418, + "step": 11980 + }, + { + "epoch": 2.89, + "learning_rate": 2.117133178819754e-08, + "logits/chosen": -2.4948318004608154, + "logits/rejected": -2.486616611480713, + "logps/chosen": -279.82806396484375, + "logps/rejected": -391.32684326171875, + "loss": 0.0325, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.112886905670166, + "rewards/margins": 11.679197311401367, + "rewards/rejected": -15.792081832885742, + "step": 11990 + }, + { + "epoch": 2.89, + "learning_rate": 2.072561954002496e-08, + "logits/chosen": -2.5811285972595215, + "logits/rejected": -2.468341112136841, + "logps/chosen": -347.5221252441406, + "logps/rejected": -434.04791259765625, + "loss": 0.036, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9449437260627747, + "rewards/margins": 14.833131790161133, + "rewards/rejected": -15.778076171875, + "step": 12000 + }, + { + "epoch": 2.89, + "eval_logits/chosen": -2.2010746002197266, + "eval_logits/rejected": -2.1640734672546387, + "eval_logps/chosen": -280.1643371582031, + "eval_logps/rejected": -312.15899658203125, + "eval_loss": 0.6311665773391724, + "eval_rewards/accuracies": 0.6899999976158142, + "eval_rewards/chosen": -7.823678970336914, + "eval_rewards/margins": 4.439140796661377, + "eval_rewards/rejected": -12.262818336486816, + "eval_runtime": 132.594, + "eval_samples_per_second": 23.802, + "eval_steps_per_second": 0.377, + "step": 12000 + }, + { + "epoch": 2.89, + "learning_rate": 2.027990729185238e-08, + "logits/chosen": -2.3970046043395996, + "logits/rejected": -2.4178857803344727, + "logps/chosen": -324.0255432128906, + "logps/rejected": -445.8971252441406, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6764753460884094, + "rewards/margins": 11.698667526245117, + "rewards/rejected": -12.375143051147461, + "step": 12010 + }, + { + "epoch": 2.89, + "learning_rate": 1.9834195043679802e-08, + "logits/chosen": -2.5303125381469727, + "logits/rejected": -2.3805575370788574, + "logps/chosen": -244.6219940185547, + "logps/rejected": -384.60552978515625, + "loss": 0.0426, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.833562135696411, + "rewards/margins": 9.956976890563965, + "rewards/rejected": -13.790539741516113, + "step": 12020 + }, + { + "epoch": 2.9, + "learning_rate": 1.938848279550722e-08, + "logits/chosen": -2.2676398754119873, + "logits/rejected": -2.273580312728882, + "logps/chosen": -226.77523803710938, + "logps/rejected": -294.951416015625, + "loss": 0.0195, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0574922561645508, + "rewards/margins": 11.238329887390137, + "rewards/rejected": -12.29582405090332, + "step": 12030 + }, + { + "epoch": 2.9, + "learning_rate": 1.894277054733464e-08, + "logits/chosen": -2.22725248336792, + "logits/rejected": -2.2449066638946533, + "logps/chosen": -380.6275329589844, + "logps/rejected": -304.9622497558594, + "loss": 0.0238, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.221129894256592, + "rewards/margins": 9.654314041137695, + "rewards/rejected": -13.875444412231445, + "step": 12040 + }, + { + "epoch": 2.9, + "learning_rate": 1.849705829916206e-08, + "logits/chosen": -2.4066781997680664, + "logits/rejected": -2.3611502647399902, + "logps/chosen": -287.4486999511719, + "logps/rejected": -345.9519348144531, + "loss": 0.0143, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8492765426635742, + "rewards/margins": 10.583861351013184, + "rewards/rejected": -12.433137893676758, + "step": 12050 + }, + { + "epoch": 2.9, + "learning_rate": 1.8051346050989483e-08, + "logits/chosen": -2.475659132003784, + "logits/rejected": -2.431633472442627, + "logps/chosen": -297.91424560546875, + "logps/rejected": -415.7301330566406, + "loss": 0.0152, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.32944917678833, + "rewards/margins": 11.287969589233398, + "rewards/rejected": -14.61741828918457, + "step": 12060 + }, + { + "epoch": 2.9, + "learning_rate": 1.7605633802816902e-08, + "logits/chosen": -2.388906955718994, + "logits/rejected": -2.247445821762085, + "logps/chosen": -255.1102752685547, + "logps/rejected": -369.0557861328125, + "loss": 0.1142, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6246918439865112, + "rewards/margins": 12.72299575805664, + "rewards/rejected": -13.347684860229492, + "step": 12070 + }, + { + "epoch": 2.91, + "learning_rate": 1.715992155464432e-08, + "logits/chosen": -2.5152182579040527, + "logits/rejected": -2.463408946990967, + "logps/chosen": -324.1038513183594, + "logps/rejected": -415.5223693847656, + "loss": 0.0387, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.5152909755706787, + "rewards/margins": 11.51399040222168, + "rewards/rejected": -14.029281616210938, + "step": 12080 + }, + { + "epoch": 2.91, + "learning_rate": 1.671420930647174e-08, + "logits/chosen": -2.61296010017395, + "logits/rejected": -2.494764566421509, + "logps/chosen": -306.69537353515625, + "logps/rejected": -344.83258056640625, + "loss": 0.0651, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.15839901566505432, + "rewards/margins": 12.144659996032715, + "rewards/rejected": -12.303059577941895, + "step": 12090 + }, + { + "epoch": 2.91, + "learning_rate": 1.626849705829916e-08, + "logits/chosen": -2.5070996284484863, + "logits/rejected": -2.388580799102783, + "logps/chosen": -234.34188842773438, + "logps/rejected": -412.37200927734375, + "loss": 0.0235, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9789184331893921, + "rewards/margins": 14.761987686157227, + "rewards/rejected": -15.74090576171875, + "step": 12100 + }, + { + "epoch": 2.91, + "learning_rate": 1.5822784810126583e-08, + "logits/chosen": -2.5977272987365723, + "logits/rejected": -2.455437421798706, + "logps/chosen": -315.7597351074219, + "logps/rejected": -445.12872314453125, + "loss": 0.0336, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.5849735736846924, + "rewards/margins": 11.080507278442383, + "rewards/rejected": -13.665481567382812, + "step": 12110 + }, + { + "epoch": 2.92, + "learning_rate": 1.5377072561954002e-08, + "logits/chosen": -2.4510676860809326, + "logits/rejected": -2.355278491973877, + "logps/chosen": -229.1404266357422, + "logps/rejected": -328.08831787109375, + "loss": 0.0153, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.985194444656372, + "rewards/margins": 11.137495040893555, + "rewards/rejected": -13.122690200805664, + "step": 12120 + }, + { + "epoch": 2.92, + "learning_rate": 1.4931360313781422e-08, + "logits/chosen": -2.541381359100342, + "logits/rejected": -2.4581973552703857, + "logps/chosen": -246.9438934326172, + "logps/rejected": -331.85430908203125, + "loss": 0.0445, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5580185651779175, + "rewards/margins": 10.875816345214844, + "rewards/rejected": -11.433834075927734, + "step": 12130 + }, + { + "epoch": 2.92, + "learning_rate": 1.4485648065608843e-08, + "logits/chosen": -2.435041904449463, + "logits/rejected": -2.3537468910217285, + "logps/chosen": -284.8828125, + "logps/rejected": -479.46588134765625, + "loss": 0.0472, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.4715332984924316, + "rewards/margins": 14.933647155761719, + "rewards/rejected": -17.405181884765625, + "step": 12140 + }, + { + "epoch": 2.92, + "learning_rate": 1.4039935817436262e-08, + "logits/chosen": -2.563504695892334, + "logits/rejected": -2.54644513130188, + "logps/chosen": -255.3700408935547, + "logps/rejected": -397.90435791015625, + "loss": 0.0356, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8312803506851196, + "rewards/margins": 12.085309028625488, + "rewards/rejected": -13.916587829589844, + "step": 12150 + }, + { + "epoch": 2.93, + "learning_rate": 1.3594223569263683e-08, + "logits/chosen": -2.327991485595703, + "logits/rejected": -2.257784366607666, + "logps/chosen": -234.7561492919922, + "logps/rejected": -345.07720947265625, + "loss": 0.0439, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.1010355949401855, + "rewards/margins": 9.344538688659668, + "rewards/rejected": -12.445573806762695, + "step": 12160 + }, + { + "epoch": 2.93, + "learning_rate": 1.3148511321091103e-08, + "logits/chosen": -2.4475510120391846, + "logits/rejected": -2.3634095191955566, + "logps/chosen": -286.64276123046875, + "logps/rejected": -417.7353515625, + "loss": 0.0305, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1268631219863892, + "rewards/margins": 12.717265129089355, + "rewards/rejected": -13.844128608703613, + "step": 12170 + }, + { + "epoch": 2.93, + "learning_rate": 1.2702799072918524e-08, + "logits/chosen": -2.515963077545166, + "logits/rejected": -2.4292666912078857, + "logps/chosen": -294.73486328125, + "logps/rejected": -405.29302978515625, + "loss": 0.05, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17532333731651306, + "rewards/margins": 13.44401741027832, + "rewards/rejected": -13.619341850280762, + "step": 12180 + }, + { + "epoch": 2.93, + "learning_rate": 1.2257086824745943e-08, + "logits/chosen": -2.5807528495788574, + "logits/rejected": -2.4983749389648438, + "logps/chosen": -289.40106201171875, + "logps/rejected": -375.0059509277344, + "loss": 0.0257, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.073075771331787, + "rewards/margins": 10.209634780883789, + "rewards/rejected": -13.282710075378418, + "step": 12190 + }, + { + "epoch": 2.94, + "learning_rate": 1.1811374576573364e-08, + "logits/chosen": -2.569288730621338, + "logits/rejected": -2.4954841136932373, + "logps/chosen": -303.577880859375, + "logps/rejected": -333.6105041503906, + "loss": 0.0305, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.453270673751831, + "rewards/margins": 9.452234268188477, + "rewards/rejected": -10.905505180358887, + "step": 12200 + }, + { + "epoch": 2.94, + "learning_rate": 1.1365662328400784e-08, + "logits/chosen": -2.393186569213867, + "logits/rejected": -2.2872087955474854, + "logps/chosen": -394.0965270996094, + "logps/rejected": -376.77166748046875, + "loss": 0.0155, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8482491374015808, + "rewards/margins": 12.98796272277832, + "rewards/rejected": -13.836212158203125, + "step": 12210 + }, + { + "epoch": 2.94, + "learning_rate": 1.0919950080228205e-08, + "logits/chosen": -2.589224338531494, + "logits/rejected": -2.537210702896118, + "logps/chosen": -434.2206115722656, + "logps/rejected": -433.95751953125, + "loss": 0.0333, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0010029077529907, + "rewards/margins": 11.84622859954834, + "rewards/rejected": -12.8472318649292, + "step": 12220 + }, + { + "epoch": 2.94, + "learning_rate": 1.0474237832055624e-08, + "logits/chosen": -2.519874095916748, + "logits/rejected": -2.4072184562683105, + "logps/chosen": -359.74542236328125, + "logps/rejected": -430.4481506347656, + "loss": 0.0288, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.849611520767212, + "rewards/margins": 10.379087448120117, + "rewards/rejected": -12.228699684143066, + "step": 12230 + }, + { + "epoch": 2.95, + "learning_rate": 1.0028525583883044e-08, + "logits/chosen": -2.4503467082977295, + "logits/rejected": -2.3041646480560303, + "logps/chosen": -316.7175598144531, + "logps/rejected": -411.546142578125, + "loss": 0.0372, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.5837459564208984, + "rewards/margins": 10.363752365112305, + "rewards/rejected": -11.94749927520752, + "step": 12240 + }, + { + "epoch": 2.95, + "learning_rate": 9.582813335710465e-09, + "logits/chosen": -2.604128360748291, + "logits/rejected": -2.4129440784454346, + "logps/chosen": -303.106689453125, + "logps/rejected": -401.07403564453125, + "loss": 0.0283, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.1794488430023193, + "rewards/margins": 10.987741470336914, + "rewards/rejected": -13.167190551757812, + "step": 12250 + }, + { + "epoch": 2.95, + "learning_rate": 9.137101087537884e-09, + "logits/chosen": -2.621835708618164, + "logits/rejected": -2.611644744873047, + "logps/chosen": -312.47283935546875, + "logps/rejected": -420.8551330566406, + "loss": 0.0331, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31393319368362427, + "rewards/margins": 12.102607727050781, + "rewards/rejected": -12.41654109954834, + "step": 12260 + }, + { + "epoch": 2.95, + "learning_rate": 8.691388839365305e-09, + "logits/chosen": -2.4669082164764404, + "logits/rejected": -2.4551076889038086, + "logps/chosen": -227.9279022216797, + "logps/rejected": -394.44915771484375, + "loss": 0.039, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.9205996990203857, + "rewards/margins": 16.079343795776367, + "rewards/rejected": -18.999942779541016, + "step": 12270 + }, + { + "epoch": 2.96, + "learning_rate": 8.245676591192724e-09, + "logits/chosen": -2.5060513019561768, + "logits/rejected": -2.425774097442627, + "logps/chosen": -337.9098205566406, + "logps/rejected": -466.40045166015625, + "loss": 0.0337, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.6899657249450684, + "rewards/margins": 12.175222396850586, + "rewards/rejected": -14.865188598632812, + "step": 12280 + }, + { + "epoch": 2.96, + "learning_rate": 7.799964343020146e-09, + "logits/chosen": -2.4625678062438965, + "logits/rejected": -2.3441214561462402, + "logps/chosen": -154.75999450683594, + "logps/rejected": -238.6104736328125, + "loss": 0.0324, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8786206245422363, + "rewards/margins": 9.321663856506348, + "rewards/rejected": -10.200284004211426, + "step": 12290 + }, + { + "epoch": 2.96, + "learning_rate": 7.3542520948475666e-09, + "logits/chosen": -2.6390433311462402, + "logits/rejected": -2.4912033081054688, + "logps/chosen": -364.30194091796875, + "logps/rejected": -403.852783203125, + "loss": 0.0498, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.983339309692383, + "rewards/margins": 11.095958709716797, + "rewards/rejected": -14.079297065734863, + "step": 12300 + }, + { + "epoch": 2.96, + "learning_rate": 6.908539846674986e-09, + "logits/chosen": -2.519421100616455, + "logits/rejected": -2.306983709335327, + "logps/chosen": -333.4102478027344, + "logps/rejected": -354.1184997558594, + "loss": 0.0216, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.2175986766815186, + "rewards/margins": 10.80386734008789, + "rewards/rejected": -14.021466255187988, + "step": 12310 + }, + { + "epoch": 2.97, + "learning_rate": 6.462827598502406e-09, + "logits/chosen": -2.42765474319458, + "logits/rejected": -2.3466317653656006, + "logps/chosen": -230.0916290283203, + "logps/rejected": -323.4930419921875, + "loss": 0.0225, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2475663423538208, + "rewards/margins": 11.202420234680176, + "rewards/rejected": -12.449986457824707, + "step": 12320 + }, + { + "epoch": 2.97, + "learning_rate": 6.0171153503298264e-09, + "logits/chosen": -2.4995667934417725, + "logits/rejected": -2.4612293243408203, + "logps/chosen": -304.5595703125, + "logps/rejected": -420.3321838378906, + "loss": 0.0387, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5263036489486694, + "rewards/margins": 12.25594711303711, + "rewards/rejected": -12.782249450683594, + "step": 12330 + }, + { + "epoch": 2.97, + "learning_rate": 5.571403102157247e-09, + "logits/chosen": -2.5020134449005127, + "logits/rejected": -2.490213632583618, + "logps/chosen": -318.671630859375, + "logps/rejected": -407.4111022949219, + "loss": 0.0323, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3810112476348877, + "rewards/margins": 12.061712265014648, + "rewards/rejected": -13.442724227905273, + "step": 12340 + }, + { + "epoch": 2.97, + "learning_rate": 5.125690853984667e-09, + "logits/chosen": -2.4014594554901123, + "logits/rejected": -2.4098572731018066, + "logps/chosen": -284.7099304199219, + "logps/rejected": -533.87646484375, + "loss": 0.0366, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.165526032447815, + "rewards/margins": 17.97944450378418, + "rewards/rejected": -16.813919067382812, + "step": 12350 + }, + { + "epoch": 2.97, + "learning_rate": 4.679978605812087e-09, + "logits/chosen": -2.448012351989746, + "logits/rejected": -2.5330610275268555, + "logps/chosen": -197.67715454101562, + "logps/rejected": -392.29766845703125, + "loss": 0.0268, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.932754635810852, + "rewards/margins": 11.501636505126953, + "rewards/rejected": -12.434389114379883, + "step": 12360 + }, + { + "epoch": 2.98, + "learning_rate": 4.234266357639507e-09, + "logits/chosen": -2.4347705841064453, + "logits/rejected": -2.4334542751312256, + "logps/chosen": -273.04345703125, + "logps/rejected": -339.4583740234375, + "loss": 0.0277, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.6104110479354858, + "rewards/margins": 12.005154609680176, + "rewards/rejected": -13.615565299987793, + "step": 12370 + }, + { + "epoch": 2.98, + "learning_rate": 3.788554109466928e-09, + "logits/chosen": -2.3450114727020264, + "logits/rejected": -2.1836085319519043, + "logps/chosen": -335.5284423828125, + "logps/rejected": -349.3228759765625, + "loss": 0.0253, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5563626885414124, + "rewards/margins": 13.879631042480469, + "rewards/rejected": -14.435995101928711, + "step": 12380 + }, + { + "epoch": 2.98, + "learning_rate": 3.3428418612943483e-09, + "logits/chosen": -2.418477773666382, + "logits/rejected": -2.4164748191833496, + "logps/chosen": -214.49325561523438, + "logps/rejected": -500.2579650878906, + "loss": 0.0303, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.8474563360214233, + "rewards/margins": 18.810083389282227, + "rewards/rejected": -16.962627410888672, + "step": 12390 + }, + { + "epoch": 2.98, + "learning_rate": 2.8971296131217685e-09, + "logits/chosen": -2.357849597930908, + "logits/rejected": -2.2878222465515137, + "logps/chosen": -339.95703125, + "logps/rejected": -390.0880432128906, + "loss": 0.0216, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0557844638824463, + "rewards/margins": 10.755159378051758, + "rewards/rejected": -12.810943603515625, + "step": 12400 + }, + { + "epoch": 2.98, + "eval_logits/chosen": -2.198592185974121, + "eval_logits/rejected": -2.1612653732299805, + "eval_logps/chosen": -278.6061096191406, + "eval_logps/rejected": -310.4496154785156, + "eval_loss": 0.6282873749732971, + "eval_rewards/accuracies": 0.6899999976158142, + "eval_rewards/chosen": -7.6678571701049805, + "eval_rewards/margins": 4.4240264892578125, + "eval_rewards/rejected": -12.091883659362793, + "eval_runtime": 132.1622, + "eval_samples_per_second": 23.88, + "eval_steps_per_second": 0.378, + "step": 12400 + }, + { + "epoch": 2.99, + "learning_rate": 2.4514173649491887e-09, + "logits/chosen": -2.455528736114502, + "logits/rejected": -2.414548635482788, + "logps/chosen": -347.48919677734375, + "logps/rejected": -353.279296875, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.921791434288025, + "rewards/margins": 10.95647144317627, + "rewards/rejected": -12.878263473510742, + "step": 12410 + }, + { + "epoch": 2.99, + "learning_rate": 2.005705116776609e-09, + "logits/chosen": -2.5131871700286865, + "logits/rejected": -2.3196463584899902, + "logps/chosen": -286.4059143066406, + "logps/rejected": -333.23028564453125, + "loss": 0.0322, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35814499855041504, + "rewards/margins": 11.833600044250488, + "rewards/rejected": -12.191744804382324, + "step": 12420 + }, + { + "epoch": 2.99, + "learning_rate": 1.5599928686040292e-09, + "logits/chosen": -2.3323538303375244, + "logits/rejected": -2.097768783569336, + "logps/chosen": -366.6454772949219, + "logps/rejected": -362.915771484375, + "loss": 0.0224, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.7946733236312866, + "rewards/margins": 11.0028657913208, + "rewards/rejected": -12.797537803649902, + "step": 12430 + }, + { + "epoch": 2.99, + "learning_rate": 1.1142806204314494e-09, + "logits/chosen": -2.3995718955993652, + "logits/rejected": -2.456651210784912, + "logps/chosen": -319.78326416015625, + "logps/rejected": -416.267822265625, + "loss": 0.0376, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8602653741836548, + "rewards/margins": 9.876348495483398, + "rewards/rejected": -11.736612319946289, + "step": 12440 + }, + { + "epoch": 3.0, + "learning_rate": 6.685683722588697e-10, + "logits/chosen": -2.534796953201294, + "logits/rejected": -2.4103546142578125, + "logps/chosen": -353.41168212890625, + "logps/rejected": -391.51812744140625, + "loss": 0.0244, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9421192407608032, + "rewards/margins": 14.015222549438477, + "rewards/rejected": -14.957341194152832, + "step": 12450 + }, + { + "epoch": 3.0, + "learning_rate": 2.2285612408628988e-10, + "logits/chosen": -2.50495982170105, + "logits/rejected": -2.418968915939331, + "logps/chosen": -335.51995849609375, + "logps/rejected": -341.50604248046875, + "loss": 0.0187, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3293344974517822, + "rewards/margins": 12.255937576293945, + "rewards/rejected": -13.585271835327148, + "step": 12460 + }, + { + "epoch": 3.0, + "step": 12465, + "total_flos": 0.0, + "train_loss": 0.24049862180692672, + "train_runtime": 20948.1804, + "train_samples_per_second": 9.519, + "train_steps_per_second": 0.595 + } + ], + "logging_steps": 10, + "max_steps": 12465, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1247, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}