diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,17 +1,17 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 3.0, + "epoch": 1.0, "eval_steps": 100, - "global_step": 1434, + "global_step": 478, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, - "grad_norm": 46.495066480188626, - "learning_rate": 3.4722222222222217e-09, + "grad_norm": 49.891043665102934, + "learning_rate": 1.0416666666666666e-08, "logits/chosen": -2.7660439014434814, "logits/rejected": -2.717564582824707, "logps/chosen": -269.8568420410156, @@ -25,2387 +25,787 @@ }, { "epoch": 0.02, - "grad_norm": 45.48250079754631, - "learning_rate": 3.472222222222222e-08, - "logits/chosen": -2.592761516571045, - "logits/rejected": -2.5630030632019043, - "logps/chosen": -264.7732238769531, - "logps/rejected": -251.50889587402344, - "loss": 0.6929, - "rewards/accuracies": 0.3958333432674408, - "rewards/chosen": 0.0012306140270084143, - "rewards/margins": -0.0009704786934889853, - "rewards/rejected": 0.0022010933607816696, + "grad_norm": 46.946091297352105, + "learning_rate": 1.0416666666666667e-07, + "logits/chosen": -2.592543125152588, + "logits/rejected": -2.56319522857666, + "logps/chosen": -264.7040100097656, + "logps/rejected": -251.515625, + "loss": 0.6933, + "rewards/accuracies": 0.4791666567325592, + "rewards/chosen": 0.004693002440035343, + "rewards/margins": 0.0028277651872485876, + "rewards/rejected": 0.0018652371363714337, "step": 10 }, { "epoch": 0.04, - "grad_norm": 41.93789081750966, - "learning_rate": 6.944444444444444e-08, - "logits/chosen": -2.6552841663360596, - "logits/rejected": -2.6074695587158203, - "logps/chosen": -281.4732360839844, - "logps/rejected": -296.79010009765625, - "loss": 0.6925, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.004006093833595514, - "rewards/margins": 0.003165980102494359, - "rewards/rejected": 0.0008401140803471208, + "grad_norm": 41.817724108185395, + "learning_rate": 2.0833333333333333e-07, + "logits/chosen": -2.65449595451355, + "logits/rejected": -2.6068952083587646, + "logps/chosen": -280.5221252441406, + "logps/rejected": -295.92376708984375, + "loss": 0.689, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.05156273767352104, + "rewards/margins": 0.00740828737616539, + "rewards/rejected": 0.04415445029735565, "step": 20 }, { "epoch": 0.06, - "grad_norm": 41.15398691411152, - "learning_rate": 1.0416666666666667e-07, - "logits/chosen": -2.67468523979187, - "logits/rejected": -2.6028592586517334, - "logps/chosen": -300.5101013183594, - "logps/rejected": -263.88922119140625, - "loss": 0.6904, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": 0.008085623383522034, - "rewards/margins": 0.00503805186599493, - "rewards/rejected": 0.003047570353373885, + "grad_norm": 39.81553425430633, + "learning_rate": 3.1249999999999997e-07, + "logits/chosen": -2.6671488285064697, + "logits/rejected": -2.5955922603607178, + "logps/chosen": -296.41644287109375, + "logps/rejected": -260.6401672363281, + "loss": 0.6733, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.2127685844898224, + "rewards/margins": 0.04726782441139221, + "rewards/rejected": 0.16550076007843018, "step": 30 }, { "epoch": 0.08, - "grad_norm": 39.57195743724939, - "learning_rate": 1.3888888888888888e-07, - "logits/chosen": -2.5917325019836426, - "logits/rejected": -2.559333562850952, - "logps/chosen": -266.6319274902344, - "logps/rejected": -244.5043487548828, - "loss": 0.6826, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": 0.024673160165548325, - "rewards/margins": 0.030291978269815445, - "rewards/rejected": -0.005618819035589695, + "grad_norm": 38.58454153774096, + "learning_rate": 4.1666666666666667e-07, + "logits/chosen": -2.5658886432647705, + "logits/rejected": -2.5324325561523438, + "logps/chosen": -259.78594970703125, + "logps/rejected": -241.00991821289062, + "loss": 0.6399, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.3669721484184265, + "rewards/margins": 0.19786901772022247, + "rewards/rejected": 0.16910310089588165, "step": 40 }, { "epoch": 0.1, - "grad_norm": 38.787652763321745, - "learning_rate": 1.736111111111111e-07, - "logits/chosen": -2.5633091926574707, - "logits/rejected": -2.527255058288574, - "logps/chosen": -278.7782287597656, - "logps/rejected": -292.60064697265625, - "loss": 0.6686, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.0476832240819931, - "rewards/margins": 0.03851194307208061, - "rewards/rejected": 0.009171287529170513, + "grad_norm": 37.351752662935816, + "learning_rate": 4.999733114418725e-07, + "logits/chosen": -2.5195257663726807, + "logits/rejected": -2.4827651977539062, + "logps/chosen": -273.65081787109375, + "logps/rejected": -290.78680419921875, + "loss": 0.6094, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.304054319858551, + "rewards/margins": 0.2041884958744049, + "rewards/rejected": 0.09986577928066254, "step": 50 }, { "epoch": 0.13, - "grad_norm": 42.23287032668302, - "learning_rate": 2.0833333333333333e-07, - "logits/chosen": -2.6264617443084717, - "logits/rejected": -2.5678651332855225, - "logps/chosen": -268.39715576171875, - "logps/rejected": -296.73492431640625, - "loss": 0.6517, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.1604687124490738, - "rewards/margins": 0.09314907342195511, - "rewards/rejected": 0.06731964647769928, + "grad_norm": 39.61129660699584, + "learning_rate": 4.990398100856366e-07, + "logits/chosen": -2.567991018295288, + "logits/rejected": -2.5036864280700684, + "logps/chosen": -260.38055419921875, + "logps/rejected": -294.011474609375, + "loss": 0.6013, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.5612996220588684, + "rewards/margins": 0.3578048348426819, + "rewards/rejected": 0.20349478721618652, "step": 60 }, { "epoch": 0.15, - "grad_norm": 37.074057156615645, - "learning_rate": 2.4305555555555555e-07, - "logits/chosen": -2.529860734939575, - "logits/rejected": -2.5335025787353516, - "logps/chosen": -261.59197998046875, - "logps/rejected": -255.90951538085938, - "loss": 0.6244, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.20122408866882324, - "rewards/margins": 0.160838782787323, - "rewards/rejected": 0.04038532078266144, + "grad_norm": 41.460696281749556, + "learning_rate": 4.967775735898179e-07, + "logits/chosen": -2.460195302963257, + "logits/rejected": -2.46120023727417, + "logps/chosen": -253.1399383544922, + "logps/rejected": -253.4242706298828, + "loss": 0.5693, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.6238263845443726, + "rewards/margins": 0.4591788649559021, + "rewards/rejected": 0.16464750468730927, "step": 70 }, { "epoch": 0.17, - "grad_norm": 53.60948399217281, - "learning_rate": 2.7777777777777776e-07, - "logits/chosen": -2.6630194187164307, - "logits/rejected": -2.5986924171447754, - "logps/chosen": -319.0962829589844, - "logps/rejected": -265.9037170410156, - "loss": 0.6056, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.1860823631286621, - "rewards/margins": 0.2783365547657013, - "rewards/rejected": -0.09225417673587799, + "grad_norm": 61.37030849441711, + "learning_rate": 4.931986719649298e-07, + "logits/chosen": -2.615948438644409, + "logits/rejected": -2.5394978523254395, + "logps/chosen": -311.7240295410156, + "logps/rejected": -263.1805725097656, + "loss": 0.5671, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.5546952486038208, + "rewards/margins": 0.5107932686805725, + "rewards/rejected": 0.04390193149447441, "step": 80 }, { "epoch": 0.19, - "grad_norm": 39.28479543071564, - "learning_rate": 3.1249999999999997e-07, - "logits/chosen": -2.5524282455444336, - "logits/rejected": -2.505441188812256, - "logps/chosen": -258.28790283203125, - "logps/rejected": -260.45709228515625, - "loss": 0.5784, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.04117143899202347, - "rewards/margins": 0.4441215991973877, - "rewards/rejected": -0.40295013785362244, + "grad_norm": 39.59727717104598, + "learning_rate": 4.883222001996351e-07, + "logits/chosen": -2.5085294246673584, + "logits/rejected": -2.4543616771698, + "logps/chosen": -251.203369140625, + "logps/rejected": -259.8647766113281, + "loss": 0.5646, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": 0.3953971564769745, + "rewards/margins": 0.7687323689460754, + "rewards/rejected": -0.37333518266677856, "step": 90 }, { "epoch": 0.21, - "grad_norm": 37.33085523160704, - "learning_rate": 3.472222222222222e-07, - "logits/chosen": -2.5317625999450684, - "logits/rejected": -2.5069174766540527, - "logps/chosen": -255.98025512695312, - "logps/rejected": -263.0767517089844, - "loss": 0.582, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.21891236305236816, - "rewards/margins": 0.5213271975517273, - "rewards/rejected": -0.3024148643016815, + "grad_norm": 36.57841721590594, + "learning_rate": 4.821741763807186e-07, + "logits/chosen": -2.499514102935791, + "logits/rejected": -2.4649369716644287, + "logps/chosen": -248.44363403320312, + "logps/rejected": -257.64776611328125, + "loss": 0.565, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.5957446694374084, + "rewards/margins": 0.6267115473747253, + "rewards/rejected": -0.03096688725054264, "step": 100 }, { "epoch": 0.21, - "eval_logits/chosen": -2.581188917160034, - "eval_logits/rejected": -2.5431487560272217, - "eval_logps/chosen": -254.23861694335938, - "eval_logps/rejected": -263.28759765625, - "eval_loss": 0.5878116488456726, - "eval_rewards/accuracies": 0.71875, - "eval_rewards/chosen": 0.41774246096611023, - "eval_rewards/margins": 0.44876235723495483, - "eval_rewards/rejected": -0.031019899994134903, - "eval_runtime": 97.2965, - "eval_samples_per_second": 20.556, - "eval_steps_per_second": 0.329, + "eval_logits/chosen": -2.550398111343384, + "eval_logits/rejected": -2.5104503631591797, + "eval_logps/chosen": -250.69297790527344, + "eval_logps/rejected": -262.7791748046875, + "eval_loss": 0.5717624425888062, + "eval_rewards/accuracies": 0.73828125, + "eval_rewards/chosen": 0.5950239300727844, + "eval_rewards/margins": 0.6006231904029846, + "eval_rewards/rejected": -0.005599223077297211, + "eval_runtime": 96.9486, + "eval_samples_per_second": 20.629, + "eval_steps_per_second": 0.33, "step": 100 }, { "epoch": 0.23, - "grad_norm": 38.91387415080278, - "learning_rate": 3.819444444444444e-07, - "logits/chosen": -2.580745220184326, - "logits/rejected": -2.495126724243164, - "logps/chosen": -298.351806640625, - "logps/rejected": -262.37762451171875, - "loss": 0.5835, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": 0.31141430139541626, - "rewards/margins": 0.5385011434555054, - "rewards/rejected": -0.2270868569612503, + "grad_norm": 51.91494841998397, + "learning_rate": 4.747874028753375e-07, + "logits/chosen": -2.55851149559021, + "logits/rejected": -2.4656014442443848, + "logps/chosen": -292.62615966796875, + "logps/rejected": -258.59661865234375, + "loss": 0.5713, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5976964831352234, + "rewards/margins": 0.6357330083847046, + "rewards/rejected": -0.0380365327000618, "step": 110 }, { "epoch": 0.25, - "grad_norm": 38.5197424217738, - "learning_rate": 4.1666666666666667e-07, - "logits/chosen": -2.473219633102417, - "logits/rejected": -2.4464592933654785, - "logps/chosen": -286.4003601074219, - "logps/rejected": -275.6720886230469, - "loss": 0.5583, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.2616604268550873, - "rewards/margins": 0.5291147828102112, - "rewards/rejected": -0.7907751798629761, + "grad_norm": 70.69069363258822, + "learning_rate": 4.662012913161997e-07, + "logits/chosen": -2.4600424766540527, + "logits/rejected": -2.4324684143066406, + "logps/chosen": -270.7308349609375, + "logps/rejected": -260.5433349609375, + "loss": 0.5497, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.5218156576156616, + "rewards/margins": 0.5561539530754089, + "rewards/rejected": -0.03433822840452194, "step": 120 }, { "epoch": 0.27, - "grad_norm": 37.759243136528056, - "learning_rate": 4.513888888888889e-07, - "logits/chosen": -2.463078498840332, - "logits/rejected": -2.4377052783966064, - "logps/chosen": -277.6775817871094, - "logps/rejected": -277.7251281738281, - "loss": 0.53, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.2162761688232422, - "rewards/margins": 0.6807397603988647, - "rewards/rejected": -0.46446362137794495, + "grad_norm": 42.312370253489476, + "learning_rate": 4.5646165232345103e-07, + "logits/chosen": -2.464791774749756, + "logits/rejected": -2.439894676208496, + "logps/chosen": -268.9382019042969, + "logps/rejected": -269.9627685546875, + "loss": 0.5423, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.6532469987869263, + "rewards/margins": 0.7295945882797241, + "rewards/rejected": -0.07634757459163666, "step": 130 }, { "epoch": 0.29, - "grad_norm": 43.5485570360565, - "learning_rate": 4.861111111111111e-07, - "logits/chosen": -2.488098621368408, - "logits/rejected": -2.4522647857666016, - "logps/chosen": -313.66253662109375, - "logps/rejected": -313.8337707519531, - "loss": 0.5305, + "grad_norm": 40.45859260542855, + "learning_rate": 4.456204510851956e-07, + "logits/chosen": -2.5265681743621826, + "logits/rejected": -2.485774517059326, + "logps/chosen": -303.1440124511719, + "logps/rejected": -301.68914794921875, + "loss": 0.5376, "rewards/accuracies": 0.768750011920929, - "rewards/chosen": 0.1151454821228981, - "rewards/margins": 0.7729904651641846, - "rewards/rejected": -0.6578450202941895, + "rewards/chosen": 0.6410696506500244, + "rewards/margins": 0.6916864514350891, + "rewards/rejected": -0.0506168007850647, "step": 140 }, { "epoch": 0.31, - "grad_norm": 42.70051446601528, - "learning_rate": 4.999733114418725e-07, - "logits/chosen": -2.4509975910186768, - "logits/rejected": -2.384775161743164, - "logps/chosen": -286.3575744628906, - "logps/rejected": -292.4871520996094, - "loss": 0.5401, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.15666823089122772, - "rewards/margins": 0.6572391390800476, - "rewards/rejected": -0.8139073252677917, + "grad_norm": 41.1747855806655, + "learning_rate": 4.337355301007335e-07, + "logits/chosen": -2.5189616680145264, + "logits/rejected": -2.4531705379486084, + "logps/chosen": -272.0736999511719, + "logps/rejected": -276.2969055175781, + "loss": 0.5442, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.5575242042541504, + "rewards/margins": 0.5619192719459534, + "rewards/rejected": -0.004395070485770702, "step": 150 }, { "epoch": 0.33, - "grad_norm": 53.334691841946395, - "learning_rate": 4.998102353328799e-07, - "logits/chosen": -2.4482672214508057, - "logits/rejected": -2.358886957168579, - "logps/chosen": -269.8271789550781, - "logps/rejected": -270.34710693359375, - "loss": 0.57, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.16204610466957092, - "rewards/margins": 0.808205246925354, - "rewards/rejected": -0.6461590528488159, + "grad_norm": 48.726180323544725, + "learning_rate": 4.2087030056579986e-07, + "logits/chosen": -2.5174994468688965, + "logits/rejected": -2.43558406829834, + "logps/chosen": -260.0892028808594, + "logps/rejected": -260.7149658203125, + "loss": 0.5652, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.6489425897598267, + "rewards/margins": 0.8134964108467102, + "rewards/rejected": -0.16455380618572235, "step": 160 }, { "epoch": 0.36, - "grad_norm": 44.14727072303912, - "learning_rate": 4.994990066883491e-07, - "logits/chosen": -2.3647940158843994, - "logits/rejected": -2.2960338592529297, - "logps/chosen": -259.77276611328125, - "logps/rejected": -255.1217498779297, - "loss": 0.5382, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.5218764543533325, - "rewards/margins": 0.6779727339744568, - "rewards/rejected": -1.1998491287231445, + "grad_norm": 49.53825953706789, + "learning_rate": 4.070934040463998e-07, + "logits/chosen": -2.4509148597717285, + "logits/rejected": -2.3897039890289307, + "logps/chosen": -239.52261352539062, + "logps/rejected": -233.6277618408203, + "loss": 0.5489, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.49063143134117126, + "rewards/margins": 0.6157802939414978, + "rewards/rejected": -0.12514881789684296, "step": 170 }, { "epoch": 0.38, - "grad_norm": 86.93750039599435, - "learning_rate": 4.990398100856366e-07, - "logits/chosen": -2.234858989715576, - "logits/rejected": -2.1829886436462402, - "logps/chosen": -262.09600830078125, - "logps/rejected": -243.8761444091797, - "loss": 0.529, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.2868492603302002, - "rewards/margins": 0.5284022092819214, - "rewards/rejected": -0.8152514696121216, + "grad_norm": 51.08561061111303, + "learning_rate": 3.9247834624635404e-07, + "logits/chosen": -2.3483898639678955, + "logits/rejected": -2.306784152984619, + "logps/chosen": -247.6396026611328, + "logps/rejected": -231.8523406982422, + "loss": 0.5181, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.43596941232681274, + "rewards/margins": 0.6500319242477417, + "rewards/rejected": -0.21406252682209015, "step": 180 }, { "epoch": 0.4, - "grad_norm": 42.637015477998524, - "learning_rate": 4.984329178560219e-07, - "logits/chosen": -2.275543689727783, - "logits/rejected": -2.2290995121002197, - "logps/chosen": -273.13580322265625, - "logps/rejected": -277.8387756347656, - "loss": 0.5193, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.09464643895626068, - "rewards/margins": 0.8095524907112122, - "rewards/rejected": -0.9041990041732788, + "grad_norm": 42.31027201995276, + "learning_rate": 3.7710310482256523e-07, + "logits/chosen": -2.41634464263916, + "logits/rejected": -2.378105878829956, + "logps/chosen": -260.20306396484375, + "logps/rejected": -261.46502685546875, + "loss": 0.5392, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.5519876480102539, + "rewards/margins": 0.6375012993812561, + "rewards/rejected": -0.08551368862390518, "step": 190 }, { "epoch": 0.42, - "grad_norm": 224.49804573493958, - "learning_rate": 4.976786899231985e-07, - "logits/chosen": -2.2373015880584717, - "logits/rejected": -2.2193970680236816, - "logps/chosen": -264.38330078125, - "logps/rejected": -271.14654541015625, - "loss": 0.558, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.22433169186115265, - "rewards/margins": 0.7305465936660767, - "rewards/rejected": -0.9548781514167786, + "grad_norm": 102.86207924802177, + "learning_rate": 3.610497133404795e-07, + "logits/chosen": -2.392763614654541, + "logits/rejected": -2.381993532180786, + "logps/chosen": -249.912109375, + "logps/rejected": -256.75439453125, + "loss": 0.5467, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.49922746419906616, + "rewards/margins": 0.7344967126846313, + "rewards/rejected": -0.235269233584404, "step": 200 }, { "epoch": 0.42, - "eval_logits/chosen": -2.38926362991333, - "eval_logits/rejected": -2.3397982120513916, - "eval_logps/chosen": -261.4733581542969, - "eval_logps/rejected": -280.4190673828125, - "eval_loss": 0.5195851922035217, - "eval_rewards/accuracies": 0.77734375, - "eval_rewards/chosen": 0.05600578337907791, - "eval_rewards/margins": 0.9436004757881165, - "eval_rewards/rejected": -0.8875946998596191, - "eval_runtime": 96.7112, - "eval_samples_per_second": 20.68, - "eval_steps_per_second": 0.331, + "eval_logits/chosen": -2.517864942550659, + "eval_logits/rejected": -2.4783387184143066, + "eval_logps/chosen": -249.6370849609375, + "eval_logps/rejected": -264.89788818359375, + "eval_loss": 0.5432960391044617, + "eval_rewards/accuracies": 0.74609375, + "eval_rewards/chosen": 0.6478186845779419, + "eval_rewards/margins": 0.759353518486023, + "eval_rewards/rejected": -0.11153475195169449, + "eval_runtime": 96.4207, + "eval_samples_per_second": 20.742, + "eval_steps_per_second": 0.332, "step": 200 }, { "epoch": 0.44, - "grad_norm": 57.978840386902434, - "learning_rate": 4.967775735898179e-07, - "logits/chosen": -2.3195013999938965, - "logits/rejected": -2.2525343894958496, - "logps/chosen": -293.5995178222656, - "logps/rejected": -273.7899475097656, - "loss": 0.5321, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.1953679621219635, - "rewards/margins": 0.9317470788955688, - "rewards/rejected": -1.12711501121521, + "grad_norm": 45.308290366409736, + "learning_rate": 3.4440382358952115e-07, + "logits/chosen": -2.4460113048553467, + "logits/rejected": -2.391810894012451, + "logps/chosen": -278.56781005859375, + "logps/rejected": -257.2254943847656, + "loss": 0.5436, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.5562152862548828, + "rewards/margins": 0.8551079034805298, + "rewards/rejected": -0.29889267683029175, "step": 210 }, { "epoch": 0.46, - "grad_norm": 45.042317188060295, - "learning_rate": 4.957301032722118e-07, - "logits/chosen": -2.320981502532959, - "logits/rejected": -2.2780816555023193, - "logps/chosen": -275.7085266113281, - "logps/rejected": -272.6601257324219, - "loss": 0.5272, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.5165706872940063, - "rewards/margins": 0.8332468271255493, - "rewards/rejected": -1.3498175144195557, + "grad_norm": 50.1182470431882, + "learning_rate": 3.272542485937368e-07, + "logits/chosen": -2.4605488777160645, + "logits/rejected": -2.42708683013916, + "logps/chosen": -257.90826416015625, + "logps/rejected": -253.3182830810547, + "loss": 0.54, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": 0.3734419643878937, + "rewards/margins": 0.7561658024787903, + "rewards/rejected": -0.382723867893219, "step": 220 }, { "epoch": 0.48, - "grad_norm": 42.64386380595179, - "learning_rate": 4.945369001834514e-07, - "logits/chosen": -2.335122585296631, - "logits/rejected": -2.2802178859710693, - "logps/chosen": -251.1619415283203, - "logps/rejected": -268.8333740234375, - "loss": 0.5242, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -0.1148248165845871, - "rewards/margins": 0.939952552318573, - "rewards/rejected": -1.0547773838043213, + "grad_norm": 43.71024962971359, + "learning_rate": 3.096924887558854e-07, + "logits/chosen": -2.490509510040283, + "logits/rejected": -2.4491913318634033, + "logps/chosen": -237.17898559570312, + "logps/rejected": -251.81686401367188, + "loss": 0.5441, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.5843235850334167, + "rewards/margins": 0.7882751226425171, + "rewards/rejected": -0.20395155251026154, "step": 230 }, { "epoch": 0.5, - "grad_norm": 43.7138056070312, - "learning_rate": 4.931986719649298e-07, - "logits/chosen": -2.372040271759033, - "logits/rejected": -2.3250112533569336, - "logps/chosen": -267.7872619628906, - "logps/rejected": -270.08612060546875, - "loss": 0.4886, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.20435211062431335, - "rewards/margins": 0.8465341329574585, - "rewards/rejected": -1.0508863925933838, + "grad_norm": 44.93616969967234, + "learning_rate": 2.9181224366319943e-07, + "logits/chosen": -2.533695697784424, + "logits/rejected": -2.500807285308838, + "logps/chosen": -253.635498046875, + "logps/rejected": -253.0944061279297, + "loss": 0.5142, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.5032340884208679, + "rewards/margins": 0.7045356035232544, + "rewards/rejected": -0.2013014256954193, "step": 240 }, { "epoch": 0.52, - "grad_norm": 46.84966459893474, - "learning_rate": 4.91716212266689e-07, - "logits/chosen": -2.2988550662994385, - "logits/rejected": -2.2003023624420166, - "logps/chosen": -298.1326904296875, - "logps/rejected": -279.72857666015625, - "loss": 0.4983, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.0028904906939715147, - "rewards/margins": 1.0131902694702148, - "rewards/rejected": -1.0102999210357666, + "grad_norm": 42.68904256130122, + "learning_rate": 2.7370891215954565e-07, + "logits/chosen": -2.483025074005127, + "logits/rejected": -2.4015185832977295, + "logps/chosen": -285.0963439941406, + "logps/rejected": -263.43560791015625, + "loss": 0.5198, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.6547069549560547, + "rewards/margins": 0.850358784198761, + "rewards/rejected": -0.19565197825431824, "step": 250 }, { "epoch": 0.54, - "grad_norm": 45.60631693718858, - "learning_rate": 4.900904002767367e-07, - "logits/chosen": -2.359170913696289, - "logits/rejected": -2.298555850982666, - "logps/chosen": -298.26898193359375, - "logps/rejected": -277.553955078125, - "loss": 0.4979, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.1978708803653717, - "rewards/margins": 0.9808057546615601, - "rewards/rejected": -1.178676724433899, + "grad_norm": 45.43502171857602, + "learning_rate": 2.55479083351317e-07, + "logits/chosen": -2.516913890838623, + "logits/rejected": -2.478473424911499, + "logps/chosen": -282.80230712890625, + "logps/rejected": -258.77288818359375, + "loss": 0.5235, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": 0.5754625797271729, + "rewards/margins": 0.8150871396064758, + "rewards/rejected": -0.23962458968162537, "step": 260 }, { "epoch": 0.56, - "grad_norm": 43.06547616205831, - "learning_rate": 4.883222001996351e-07, - "logits/chosen": -2.3876845836639404, - "logits/rejected": -2.324982166290283, - "logps/chosen": -285.72674560546875, - "logps/rejected": -283.54534912109375, - "loss": 0.529, + "grad_norm": 41.73526734917468, + "learning_rate": 2.3722002126275822e-07, + "logits/chosen": -2.5381340980529785, + "logits/rejected": -2.4941086769104004, + "logps/chosen": -267.4333190917969, + "logps/rejected": -260.50677490234375, + "loss": 0.5406, "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.43653813004493713, - "rewards/margins": 0.8585146069526672, - "rewards/rejected": -1.2950527667999268, + "rewards/chosen": 0.4781308174133301, + "rewards/margins": 0.6212563514709473, + "rewards/rejected": -0.14312560856342316, "step": 270 }, { "epoch": 0.59, - "grad_norm": 47.726430451771456, - "learning_rate": 4.864126606846696e-07, - "logits/chosen": -2.3547611236572266, - "logits/rejected": -2.261406898498535, - "logps/chosen": -271.0657043457031, - "logps/rejected": -269.5400390625, - "loss": 0.5339, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.4694308340549469, - "rewards/margins": 0.8982539176940918, - "rewards/rejected": -1.3676847219467163, + "grad_norm": 48.561323508433155, + "learning_rate": 2.19029145890313e-07, + "logits/chosen": -2.510133743286133, + "logits/rejected": -2.4422435760498047, + "logps/chosen": -250.73855590820312, + "logps/rejected": -247.487060546875, + "loss": 0.5599, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.5469261407852173, + "rewards/margins": 0.8119627833366394, + "rewards/rejected": -0.2650366425514221, "step": 280 }, { "epoch": 0.61, - "grad_norm": 43.598738742809324, - "learning_rate": 4.843629142039366e-07, - "logits/chosen": -2.398907423019409, - "logits/rejected": -2.3398590087890625, - "logps/chosen": -260.32208251953125, - "logps/rejected": -267.5470886230469, - "loss": 0.5116, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.4617377817630768, - "rewards/margins": 0.8178228139877319, - "rewards/rejected": -1.2795606851577759, + "grad_norm": 44.504093632124075, + "learning_rate": 2.0100351342479216e-07, + "logits/chosen": -2.5589568614959717, + "logits/rejected": -2.5217483043670654, + "logps/chosen": -240.7520751953125, + "logps/rejected": -244.8422088623047, + "loss": 0.5354, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.5167636871337891, + "rewards/margins": 0.661081075668335, + "rewards/rejected": -0.14431743323802948, "step": 290 }, { "epoch": 0.63, - "grad_norm": 53.461838586248135, - "learning_rate": 4.821741763807186e-07, - "logits/chosen": -2.358552932739258, - "logits/rejected": -2.2632975578308105, - "logps/chosen": -306.07708740234375, - "logps/rejected": -284.6916198730469, - "loss": 0.4914, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.254052072763443, - "rewards/margins": 0.9752548336982727, - "rewards/rejected": -1.2293068170547485, + "grad_norm": 52.52022669231452, + "learning_rate": 1.8323929841460178e-07, + "logits/chosen": -2.5429511070251465, + "logits/rejected": -2.472679376602173, + "logps/chosen": -292.2240905761719, + "logps/rejected": -266.68658447265625, + "loss": 0.517, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.4385985732078552, + "rewards/margins": 0.7676541209220886, + "rewards/rejected": -0.329055517911911, "step": 300 }, { "epoch": 0.63, - "eval_logits/chosen": -2.3652639389038086, - "eval_logits/rejected": -2.3039121627807617, - "eval_logps/chosen": -264.29364013671875, - "eval_logps/rejected": -286.6200866699219, - "eval_loss": 0.5109529495239258, - "eval_rewards/accuracies": 0.765625, - "eval_rewards/chosen": -0.08500880002975464, - "eval_rewards/margins": 1.1126374006271362, - "eval_rewards/rejected": -1.197646141052246, - "eval_runtime": 96.7466, - "eval_samples_per_second": 20.673, - "eval_steps_per_second": 0.331, + "eval_logits/chosen": -2.5622596740722656, + "eval_logits/rejected": -2.520256280899048, + "eval_logps/chosen": -251.2219696044922, + "eval_logps/rejected": -268.04449462890625, + "eval_loss": 0.53697669506073, + "eval_rewards/accuracies": 0.76953125, + "eval_rewards/chosen": 0.5685745477676392, + "eval_rewards/margins": 0.8374388217926025, + "eval_rewards/rejected": -0.2688642740249634, + "eval_runtime": 96.3678, + "eval_samples_per_second": 20.754, + "eval_steps_per_second": 0.332, "step": 300 }, { "epoch": 0.65, - "grad_norm": 39.546683435658494, - "learning_rate": 4.798477452685468e-07, - "logits/chosen": -2.268113613128662, - "logits/rejected": -2.2365353107452393, - "logps/chosen": -298.82098388671875, - "logps/rejected": -280.2306823730469, - "loss": 0.4878, + "grad_norm": 43.866661437938184, + "learning_rate": 1.6583128063291573e-07, + "logits/chosen": -2.4593474864959717, + "logits/rejected": -2.443233013153076, + "logps/chosen": -285.5498046875, + "logps/rejected": -263.8379821777344, + "loss": 0.5077, "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -0.17425382137298584, - "rewards/margins": 0.9114526510238647, - "rewards/rejected": -1.0857064723968506, + "rewards/chosen": 0.4893050193786621, + "rewards/margins": 0.7553777098655701, + "rewards/rejected": -0.26607269048690796, "step": 310 }, { "epoch": 0.67, - "grad_norm": 46.66091222824393, - "learning_rate": 4.773850005813776e-07, - "logits/chosen": -2.2866523265838623, - "logits/rejected": -2.221771240234375, - "logps/chosen": -301.2940368652344, - "logps/rejected": -271.4169921875, - "loss": 0.4739, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.3811998963356018, - "rewards/margins": 0.9484872817993164, - "rewards/rejected": -1.329687237739563, + "grad_norm": 43.407860217947494, + "learning_rate": 1.488723393865766e-07, + "logits/chosen": -2.4746253490448, + "logits/rejected": -2.4388270378112793, + "logps/chosen": -283.4583740234375, + "logps/rejected": -250.38204956054688, + "loss": 0.504, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.5105848908424377, + "rewards/margins": 0.788524329662323, + "rewards/rejected": -0.2779393792152405, "step": 320 }, { "epoch": 0.69, - "grad_norm": 38.86653009667017, - "learning_rate": 4.747874028753375e-07, - "logits/chosen": -2.278787136077881, - "logits/rejected": -2.1933655738830566, - "logps/chosen": -265.38922119140625, - "logps/rejected": -282.16143798828125, - "loss": 0.4954, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.1589515209197998, - "rewards/margins": 0.9741891026496887, - "rewards/rejected": -1.1331405639648438, + "grad_norm": 40.302692173545196, + "learning_rate": 1.3245295796480788e-07, + "logits/chosen": -2.4712371826171875, + "logits/rejected": -2.4099698066711426, + "logps/chosen": -252.349853515625, + "logps/rejected": -264.03912353515625, + "loss": 0.5242, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.4930170178413391, + "rewards/margins": 0.7200408577919006, + "rewards/rejected": -0.2270239144563675, "step": 330 }, { "epoch": 0.71, - "grad_norm": 52.563625371137256, - "learning_rate": 4.720564926825267e-07, - "logits/chosen": -2.261157751083374, - "logits/rejected": -2.1779565811157227, - "logps/chosen": -300.0699462890625, - "logps/rejected": -313.32012939453125, - "loss": 0.4961, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.5037744641304016, - "rewards/margins": 1.1601417064666748, - "rewards/rejected": -1.663915991783142, + "grad_norm": 50.168955016672676, + "learning_rate": 1.1666074087171627e-07, + "logits/chosen": -2.467729091644287, + "logits/rejected": -2.4046943187713623, + "logps/chosen": -278.697509765625, + "logps/rejected": -285.4507141113281, + "loss": 0.524, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.5648467540740967, + "rewards/margins": 0.8352931141853333, + "rewards/rejected": -0.2704463601112366, "step": 340 }, { "epoch": 0.73, - "grad_norm": 44.99037304721281, - "learning_rate": 4.6919388959739e-07, - "logits/chosen": -2.2558467388153076, - "logits/rejected": -2.163471221923828, - "logps/chosen": -268.26397705078125, - "logps/rejected": -272.03729248046875, - "loss": 0.465, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -0.5026377439498901, - "rewards/margins": 1.063787579536438, - "rewards/rejected": -1.566425085067749, + "grad_norm": 46.15971070553052, + "learning_rate": 1.0157994641835734e-07, + "logits/chosen": -2.445666790008545, + "logits/rejected": -2.377004384994507, + "logps/chosen": -248.63241577148438, + "logps/rejected": -248.23904418945312, + "loss": 0.4924, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.47894006967544556, + "rewards/margins": 0.8554509878158569, + "rewards/rejected": -0.37651100754737854, "step": 350 }, { "epoch": 0.75, - "grad_norm": 62.16268030811922, - "learning_rate": 4.662012913161997e-07, - "logits/chosen": -2.2976787090301514, - "logits/rejected": -2.1752614974975586, - "logps/chosen": -306.94708251953125, - "logps/rejected": -288.148193359375, - "loss": 0.52, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.212753027677536, - "rewards/margins": 1.1294101476669312, - "rewards/rejected": -1.3421632051467896, + "grad_norm": 54.17198760484943, + "learning_rate": 8.729103716819111e-08, + "logits/chosen": -2.4745380878448486, + "logits/rejected": -2.376185178756714, + "logps/chosen": -292.89483642578125, + "logps/rejected": -269.1952209472656, + "loss": 0.5388, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": 0.4898607134819031, + "rewards/margins": 0.8843740224838257, + "rewards/rejected": -0.3945133090019226, "step": 360 }, { "epoch": 0.77, - "grad_norm": 48.68068839220641, - "learning_rate": 4.6308047263021925e-07, - "logits/chosen": -2.221646785736084, - "logits/rejected": -2.1589419841766357, - "logps/chosen": -286.72344970703125, - "logps/rejected": -267.8009033203125, - "loss": 0.5138, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.5695220232009888, - "rewards/margins": 0.9088203310966492, - "rewards/rejected": -1.4783422946929932, + "grad_norm": 44.15468601338237, + "learning_rate": 7.387025063449081e-08, + "logits/chosen": -2.409170150756836, + "logits/rejected": -2.367518186569214, + "logps/chosen": -266.35430908203125, + "logps/rejected": -242.5480194091797, + "loss": 0.5384, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.44893550872802734, + "rewards/margins": 0.6646324992179871, + "rewards/rejected": -0.21569697558879852, "step": 370 }, { "epoch": 0.79, - "grad_norm": 40.16847273705918, - "learning_rate": 4.5983328437314523e-07, - "logits/chosen": -2.1364545822143555, - "logits/rejected": -2.1142001152038574, - "logps/chosen": -257.01513671875, - "logps/rejected": -296.47222900390625, - "loss": 0.4949, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.8765251040458679, - "rewards/margins": 1.0112650394439697, - "rewards/rejected": -1.8877900838851929, + "grad_norm": 39.47383320898196, + "learning_rate": 6.138919252022435e-08, + "logits/chosen": -2.3523006439208984, + "logits/rejected": -2.3420968055725098, + "logps/chosen": -230.9795379638672, + "logps/rejected": -267.8912658691406, + "loss": 0.5181, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.4252557158470154, + "rewards/margins": 0.8839966058731079, + "rewards/rejected": -0.45874080061912537, "step": 380 }, { "epoch": 0.82, - "grad_norm": 51.78049342719275, - "learning_rate": 4.5646165232345103e-07, - "logits/chosen": -2.223437547683716, - "logits/rejected": -2.1807045936584473, - "logps/chosen": -312.63714599609375, - "logps/rejected": -308.8570861816406, - "loss": 0.4921, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.3500111699104309, - "rewards/margins": 0.8943204879760742, - "rewards/rejected": -1.2443315982818604, + "grad_norm": 48.64961363299689, + "learning_rate": 4.991445467064689e-08, + "logits/chosen": -2.4286305904388428, + "logits/rejected": -2.394604206085205, + "logps/chosen": -293.20440673828125, + "logps/rejected": -287.0997009277344, + "loss": 0.5149, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.6216251850128174, + "rewards/margins": 0.7780872583389282, + "rewards/rejected": -0.15646204352378845, "step": 390 }, { "epoch": 0.84, - "grad_norm": 42.79568140865266, - "learning_rate": 4.529675760622843e-07, - "logits/chosen": -2.2350916862487793, - "logits/rejected": -2.148932695388794, - "logps/chosen": -276.3843078613281, - "logps/rejected": -295.5137939453125, - "loss": 0.4922, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.29245343804359436, - "rewards/margins": 1.1326130628585815, - "rewards/rejected": -1.425066590309143, + "grad_norm": 44.38080817079193, + "learning_rate": 3.9507259776993954e-08, + "logits/chosen": -2.4102301597595215, + "logits/rejected": -2.3357295989990234, + "logps/chosen": -259.7147521972656, + "logps/rejected": -273.10699462890625, + "loss": 0.518, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.541024386882782, + "rewards/margins": 0.8457515835762024, + "rewards/rejected": -0.30472710728645325, "step": 400 }, { "epoch": 0.84, - "eval_logits/chosen": -2.3144989013671875, - "eval_logits/rejected": -2.2569639682769775, - "eval_logps/chosen": -263.2853698730469, - "eval_logps/rejected": -285.4248352050781, - "eval_loss": 0.5095034837722778, - "eval_rewards/accuracies": 0.78515625, - "eval_rewards/chosen": -0.0345957949757576, - "eval_rewards/margins": 1.1032863855361938, - "eval_rewards/rejected": -1.137882113456726, - "eval_runtime": 96.7428, - "eval_samples_per_second": 20.673, - "eval_steps_per_second": 0.331, + "eval_logits/chosen": -2.4731171131134033, + "eval_logits/rejected": -2.4323782920837402, + "eval_logps/chosen": -250.02178955078125, + "eval_logps/rejected": -267.0915222167969, + "eval_loss": 0.5348160862922668, + "eval_rewards/accuracies": 0.75390625, + "eval_rewards/chosen": 0.6285843849182129, + "eval_rewards/margins": 0.8498014211654663, + "eval_rewards/rejected": -0.22121697664260864, + "eval_runtime": 96.4764, + "eval_samples_per_second": 20.73, + "eval_steps_per_second": 0.332, "step": 400 }, { "epoch": 0.86, - "grad_norm": 41.17913551343322, - "learning_rate": 4.493531277875948e-07, - "logits/chosen": -2.300753116607666, - "logits/rejected": -2.2138209342956543, - "logps/chosen": -300.85986328125, - "logps/rejected": -293.10089111328125, - "loss": 0.4939, + "grad_norm": 48.15981350653104, + "learning_rate": 3.022313472693447e-08, + "logits/chosen": -2.444577932357788, + "logits/rejected": -2.3699073791503906, + "logps/chosen": -286.5138854980469, + "logps/rejected": -274.3666687011719, + "loss": 0.5226, "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -0.09711817651987076, - "rewards/margins": 1.1170685291290283, - "rewards/rejected": -1.214186668395996, + "rewards/chosen": 0.6201778650283813, + "rewards/margins": 0.8976529240608215, + "rewards/rejected": -0.2774750590324402, "step": 410 }, { "epoch": 0.88, - "grad_norm": 44.7817002923167, - "learning_rate": 4.456204510851956e-07, - "logits/chosen": -2.296912670135498, - "logits/rejected": -2.2389960289001465, - "logps/chosen": -304.0657043457031, - "logps/rejected": -303.19512939453125, - "loss": 0.4853, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.34959596395492554, - "rewards/margins": 1.0870481729507446, - "rewards/rejected": -1.4366440773010254, + "grad_norm": 48.573506313099124, + "learning_rate": 2.2111614344599684e-08, + "logits/chosen": -2.429912805557251, + "logits/rejected": -2.3931796550750732, + "logps/chosen": -287.13067626953125, + "logps/rejected": -279.46844482421875, + "loss": 0.5212, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.4971562325954437, + "rewards/margins": 0.7474662065505981, + "rewards/rejected": -0.25030994415283203, "step": 420 }, { "epoch": 0.9, - "grad_norm": 42.837722286044084, - "learning_rate": 4.4177175965748804e-07, - "logits/chosen": -2.190886974334717, - "logits/rejected": -2.1311051845550537, - "logps/chosen": -289.19403076171875, - "logps/rejected": -291.84967041015625, - "loss": 0.4704, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.5077947378158569, - "rewards/margins": 1.1956888437271118, - "rewards/rejected": -1.7034835815429688, + "grad_norm": 41.98038749926915, + "learning_rate": 1.521597710086439e-08, + "logits/chosen": -2.3573684692382812, + "logits/rejected": -2.3092567920684814, + "logps/chosen": -269.9436950683594, + "logps/rejected": -265.4564514160156, + "loss": 0.501, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.45472264289855957, + "rewards/margins": 0.838543713092804, + "rewards/rejected": -0.38382115960121155, "step": 430 }, { "epoch": 0.92, - "grad_norm": 50.46876260755674, - "learning_rate": 4.378093360106022e-07, - "logits/chosen": -2.275932788848877, - "logits/rejected": -2.206188678741455, - "logps/chosen": -291.9374694824219, - "logps/rejected": -296.0220947265625, - "loss": 0.4947, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.3800189793109894, - "rewards/margins": 0.9654985666275024, - "rewards/rejected": -1.3455175161361694, + "grad_norm": 44.22650678163462, + "learning_rate": 9.57301420397924e-09, + "logits/chosen": -2.4332785606384277, + "logits/rejected": -2.3776473999023438, + "logps/chosen": -272.65960693359375, + "logps/rejected": -271.44329833984375, + "loss": 0.5213, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5838757157325745, + "rewards/margins": 0.700454831123352, + "rewards/rejected": -0.1165790781378746, "step": 440 }, { "epoch": 0.94, - "grad_norm": 49.16582597289157, - "learning_rate": 4.337355301007335e-07, - "logits/chosen": -2.2473514080047607, - "logits/rejected": -2.212566375732422, - "logps/chosen": -304.7669372558594, - "logps/rejected": -337.439453125, - "loss": 0.4809, + "grad_norm": 43.00589727019739, + "learning_rate": 5.212833302556258e-09, + "logits/chosen": -2.3836779594421387, + "logits/rejected": -2.360665798187256, + "logps/chosen": -284.2134704589844, + "logps/rejected": -312.9830627441406, + "loss": 0.5099, "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.5541377663612366, - "rewards/margins": 0.9641104936599731, - "rewards/rejected": -1.5182483196258545, + "rewards/chosen": 0.4735330641269684, + "rewards/margins": 0.7689631581306458, + "rewards/rejected": -0.29543009400367737, "step": 450 }, { "epoch": 0.96, - "grad_norm": 45.063096434401295, - "learning_rate": 4.2955275794047627e-07, - "logits/chosen": -2.255167007446289, - "logits/rejected": -2.2048747539520264, - "logps/chosen": -277.2271423339844, - "logps/rejected": -268.5289001464844, - "loss": 0.4941, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.26313039660453796, - "rewards/margins": 1.0289232730865479, - "rewards/rejected": -1.2920535802841187, + "grad_norm": 46.86754726240038, + "learning_rate": 2.158697848236607e-09, + "logits/chosen": -2.417273998260498, + "logits/rejected": -2.377349376678467, + "logps/chosen": -262.1804504394531, + "logps/rejected": -247.7431182861328, + "loss": 0.5264, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.48920711874961853, + "rewards/margins": 0.7419728636741638, + "rewards/rejected": -0.2527657151222229, "step": 460 }, { "epoch": 0.98, - "grad_norm": 44.06520192137465, - "learning_rate": 4.252635001659837e-07, - "logits/chosen": -2.2488787174224854, - "logits/rejected": -2.1796398162841797, - "logps/chosen": -284.37713623046875, - "logps/rejected": -309.53546142578125, - "loss": 0.4733, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.28392332792282104, - "rewards/margins": 1.0752943754196167, - "rewards/rejected": -1.3592177629470825, + "grad_norm": 45.055740606082026, + "learning_rate": 4.269029751107489e-10, + "logits/chosen": -2.4338390827178955, + "logits/rejected": -2.3758208751678467, + "logps/chosen": -268.4836730957031, + "logps/rejected": -289.60205078125, + "loss": 0.4974, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.5107508897781372, + "rewards/margins": 0.8732994794845581, + "rewards/rejected": -0.3625485301017761, "step": 470 }, { "epoch": 1.0, - "grad_norm": 20.757199114043438, - "learning_rate": 4.2087030056579986e-07, - "logits/chosen": -2.2137668132781982, - "logits/rejected": -2.1583967208862305, - "logps/chosen": -291.67376708984375, - "logps/rejected": -288.5511169433594, - "loss": 0.417, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -0.14164048433303833, - "rewards/margins": 1.205784559249878, - "rewards/rejected": -1.347425103187561, - "step": 480 - }, - { - "epoch": 1.03, - "grad_norm": 30.758787856873905, - "learning_rate": 4.163757645722403e-07, - "logits/chosen": -2.273852825164795, - "logits/rejected": -2.1564927101135254, - "logps/chosen": -292.22467041015625, - "logps/rejected": -292.77349853515625, - "loss": 0.1895, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": 0.2319210022687912, - "rewards/margins": 2.500307559967041, - "rewards/rejected": -2.2683863639831543, - "step": 490 - }, - { - "epoch": 1.05, - "grad_norm": 30.542382675770316, - "learning_rate": 4.117825577162134e-07, - "logits/chosen": -2.2761318683624268, - "logits/rejected": -2.1567749977111816, - "logps/chosen": -279.54486083984375, - "logps/rejected": -309.9883728027344, - "loss": 0.1908, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.061248529702425, - "rewards/margins": 2.6951680183410645, - "rewards/rejected": -2.6339192390441895, - "step": 500 - }, - { - "epoch": 1.05, - "eval_logits/chosen": -2.2441868782043457, - "eval_logits/rejected": -2.165961980819702, - "eval_logps/chosen": -269.8425598144531, - "eval_logps/rejected": -300.6473693847656, - "eval_loss": 0.5178976058959961, - "eval_rewards/accuracies": 0.78515625, - "eval_rewards/chosen": -0.36245518922805786, - "eval_rewards/margins": 1.5365545749664307, - "eval_rewards/rejected": -1.8990097045898438, - "eval_runtime": 96.6241, - "eval_samples_per_second": 20.699, - "eval_steps_per_second": 0.331, - "step": 500 - }, - { - "epoch": 1.07, - "grad_norm": 18.232913631558603, - "learning_rate": 4.070934040463998e-07, - "logits/chosen": -2.1585049629211426, - "logits/rejected": -2.0768818855285645, - "logps/chosen": -279.84527587890625, - "logps/rejected": -321.41375732421875, - "loss": 0.1751, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.1159655898809433, - "rewards/margins": 2.8791768550872803, - "rewards/rejected": -2.763211250305176, - "step": 510 - }, - { - "epoch": 1.09, - "grad_norm": 28.960170029042352, - "learning_rate": 4.023110845137273e-07, - "logits/chosen": -2.215406894683838, - "logits/rejected": -2.0539145469665527, - "logps/chosen": -276.207763671875, - "logps/rejected": -310.82513427734375, - "loss": 0.1755, - "rewards/accuracies": 0.90625, - "rewards/chosen": -0.07207924127578735, - "rewards/margins": 2.9146130084991455, - "rewards/rejected": -2.986691951751709, - "step": 520 - }, - { - "epoch": 1.11, - "grad_norm": 35.158803563669196, - "learning_rate": 3.974384353220992e-07, - "logits/chosen": -2.187908172607422, - "logits/rejected": -2.049764394760132, - "logps/chosen": -256.0184326171875, - "logps/rejected": -290.10821533203125, - "loss": 0.167, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": -0.13029596209526062, - "rewards/margins": 2.844452381134033, - "rewards/rejected": -2.974748134613037, - "step": 530 - }, - { - "epoch": 1.13, - "grad_norm": 38.75397385029711, - "learning_rate": 3.9247834624635404e-07, - "logits/chosen": -2.1720824241638184, - "logits/rejected": -2.047842502593994, - "logps/chosen": -267.35888671875, - "logps/rejected": -331.7341003417969, - "loss": 0.1939, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": -0.3224119246006012, - "rewards/margins": 3.103816509246826, - "rewards/rejected": -3.4262282848358154, - "step": 540 - }, - { - "epoch": 1.15, - "grad_norm": 21.6571458368692, - "learning_rate": 3.8743375891845556e-07, - "logits/chosen": -2.2194952964782715, - "logits/rejected": -2.088972806930542, - "logps/chosen": -301.56414794921875, - "logps/rejected": -329.8552551269531, - "loss": 0.1541, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": -0.44850125908851624, - "rewards/margins": 2.9901504516601562, - "rewards/rejected": -3.4386515617370605, - "step": 550 - }, - { - "epoch": 1.17, - "grad_norm": 41.5671178301598, - "learning_rate": 3.823076650829267e-07, - "logits/chosen": -2.198901891708374, - "logits/rejected": -2.0278093814849854, - "logps/chosen": -261.13189697265625, - "logps/rejected": -295.24127197265625, - "loss": 0.1609, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.0586366169154644, - "rewards/margins": 2.883596897125244, - "rewards/rejected": -2.9422335624694824, - "step": 560 - }, - { - "epoch": 1.19, - "grad_norm": 22.403810834900185, - "learning_rate": 3.7710310482256523e-07, - "logits/chosen": -2.10243558883667, - "logits/rejected": -1.9939591884613037, - "logps/chosen": -281.2176208496094, - "logps/rejected": -338.7304382324219, - "loss": 0.1581, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": -0.1340499371290207, - "rewards/margins": 3.009289503097534, - "rewards/rejected": -3.1433396339416504, - "step": 570 - }, - { - "epoch": 1.21, - "grad_norm": 41.02190574631486, - "learning_rate": 3.718231647554911e-07, - "logits/chosen": -2.1406311988830566, - "logits/rejected": -1.9555755853652954, - "logps/chosen": -292.01739501953125, - "logps/rejected": -329.15045166015625, - "loss": 0.1561, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": -0.6274134516716003, - "rewards/margins": 3.1701507568359375, - "rewards/rejected": -3.7975640296936035, - "step": 580 - }, - { - "epoch": 1.23, - "grad_norm": 24.855938354465067, - "learning_rate": 3.664709762045961e-07, - "logits/chosen": -2.1577706336975098, - "logits/rejected": -2.070456027984619, - "logps/chosen": -284.9651794433594, - "logps/rejected": -321.62994384765625, - "loss": 0.1622, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": -0.7565070390701294, - "rewards/margins": 2.949742078781128, - "rewards/rejected": -3.706249237060547, - "step": 590 - }, - { - "epoch": 1.26, - "grad_norm": 34.33138656502216, - "learning_rate": 3.610497133404795e-07, - "logits/chosen": -2.268091917037964, - "logits/rejected": -2.1013598442077637, - "logps/chosen": -285.04327392578125, - "logps/rejected": -315.2426452636719, - "loss": 0.1675, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": -0.31151503324508667, - "rewards/margins": 3.0706770420074463, - "rewards/rejected": -3.3821918964385986, - "step": 600 - }, - { - "epoch": 1.26, - "eval_logits/chosen": -2.2219574451446533, - "eval_logits/rejected": -2.1248531341552734, - "eval_logps/chosen": -287.2300109863281, - "eval_logps/rejected": -324.08123779296875, - "eval_loss": 0.5377076268196106, - "eval_rewards/accuracies": 0.80078125, - "eval_rewards/chosen": -1.2318270206451416, - "eval_rewards/margins": 1.8388763666152954, - "eval_rewards/rejected": -3.0707035064697266, - "eval_runtime": 96.6082, - "eval_samples_per_second": 20.702, - "eval_steps_per_second": 0.331, - "step": 600 - }, - { - "epoch": 1.28, - "grad_norm": 24.42926394663865, - "learning_rate": 3.555625912989747e-07, - "logits/chosen": -2.2549402713775635, - "logits/rejected": -2.0435824394226074, - "logps/chosen": -345.69683837890625, - "logps/rejected": -355.43841552734375, - "loss": 0.1641, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": -0.57981938123703, - "rewards/margins": 3.4938979148864746, - "rewards/rejected": -4.0737175941467285, - "step": 610 - }, - { - "epoch": 1.3, - "grad_norm": 40.76003061063018, - "learning_rate": 3.500128642743793e-07, - "logits/chosen": -1.9793628454208374, - "logits/rejected": -1.8766018152236938, - "logps/chosen": -275.6058654785156, - "logps/rejected": -325.64532470703125, - "loss": 0.1676, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -0.4727633595466614, - "rewards/margins": 3.388253688812256, - "rewards/rejected": -3.8610172271728516, - "step": 620 - }, - { - "epoch": 1.32, - "grad_norm": 27.005984360116628, - "learning_rate": 3.4440382358952115e-07, - "logits/chosen": -2.0091536045074463, - "logits/rejected": -1.8107984066009521, - "logps/chosen": -286.50762939453125, - "logps/rejected": -327.90350341796875, - "loss": 0.1604, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -0.3183102309703827, - "rewards/margins": 3.1906726360321045, - "rewards/rejected": -3.5089828968048096, - "step": 630 - }, - { - "epoch": 1.34, - "grad_norm": 23.606941374377882, - "learning_rate": 3.387387957438061e-07, - "logits/chosen": -1.9466636180877686, - "logits/rejected": -1.7844781875610352, - "logps/chosen": -283.4999084472656, - "logps/rejected": -322.8622131347656, - "loss": 0.1551, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -0.5999014973640442, - "rewards/margins": 3.3343536853790283, - "rewards/rejected": -3.9342548847198486, - "step": 640 - }, - { - "epoch": 1.36, - "grad_norm": 35.62460431561092, - "learning_rate": 3.33021140440403e-07, - "logits/chosen": -2.041600465774536, - "logits/rejected": -1.8977054357528687, - "logps/chosen": -282.59033203125, - "logps/rejected": -359.6671447753906, - "loss": 0.1662, - "rewards/accuracies": 0.96875, - "rewards/chosen": -0.652147650718689, - "rewards/margins": 3.4888508319854736, - "rewards/rejected": -4.140998363494873, - "step": 650 - }, - { - "epoch": 1.38, - "grad_norm": 40.45521004861816, - "learning_rate": 3.272542485937368e-07, - "logits/chosen": -2.2044432163238525, - "logits/rejected": -1.995490312576294, - "logps/chosen": -281.81591796875, - "logps/rejected": -305.563720703125, - "loss": 0.1562, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": -0.9047712087631226, - "rewards/margins": 3.0545411109924316, - "rewards/rejected": -3.9593119621276855, - "step": 660 - }, - { - "epoch": 1.4, - "grad_norm": 25.83353768539722, - "learning_rate": 3.214415403184725e-07, - "logits/chosen": -2.136054515838623, - "logits/rejected": -2.0022921562194824, - "logps/chosen": -287.70721435546875, - "logps/rejected": -347.98626708984375, - "loss": 0.1527, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": -0.8447133898735046, - "rewards/margins": 3.195286512374878, - "rewards/rejected": -4.039999961853027, - "step": 670 - }, - { - "epoch": 1.42, - "grad_norm": 33.98889011519535, - "learning_rate": 3.155864629011798e-07, - "logits/chosen": -2.006204605102539, - "logits/rejected": -1.8692758083343506, - "logps/chosen": -266.548583984375, - "logps/rejected": -349.1086730957031, - "loss": 0.1618, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -0.7886885404586792, - "rewards/margins": 3.2961840629577637, - "rewards/rejected": -4.084872722625732, - "step": 680 - }, - { - "epoch": 1.44, - "grad_norm": 23.86841822029929, - "learning_rate": 3.096924887558854e-07, - "logits/chosen": -2.0170860290527344, - "logits/rejected": -1.8624064922332764, - "logps/chosen": -289.9138488769531, - "logps/rejected": -340.1861877441406, - "loss": 0.1705, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.879202663898468, - "rewards/margins": 3.349952220916748, - "rewards/rejected": -4.22915506362915, - "step": 690 - }, - { - "epoch": 1.46, - "grad_norm": 30.325436164549153, - "learning_rate": 3.0376311336472157e-07, - "logits/chosen": -1.9222033023834229, - "logits/rejected": -1.7393171787261963, - "logps/chosen": -308.6827087402344, - "logps/rejected": -357.66937255859375, - "loss": 0.1567, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -1.2114474773406982, - "rewards/margins": 3.2933521270751953, - "rewards/rejected": -4.504799842834473, - "step": 700 - }, - { - "epoch": 1.46, - "eval_logits/chosen": -2.045348882675171, - "eval_logits/rejected": -1.9284720420837402, - "eval_logps/chosen": -298.781982421875, - "eval_logps/rejected": -333.33538818359375, - "eval_loss": 0.5347517132759094, - "eval_rewards/accuracies": 0.7890625, - "eval_rewards/chosen": -1.8094260692596436, - "eval_rewards/margins": 1.7239831686019897, - "eval_rewards/rejected": -3.5334088802337646, - "eval_runtime": 96.8349, - "eval_samples_per_second": 20.654, - "eval_steps_per_second": 0.33, - "step": 700 - }, - { - "epoch": 1.49, - "grad_norm": 44.30207392759187, - "learning_rate": 2.9780185320489397e-07, - "logits/chosen": -1.9471759796142578, - "logits/rejected": -1.777573585510254, - "logps/chosen": -307.9242858886719, - "logps/rejected": -352.77618408203125, - "loss": 0.1417, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -0.9882961511611938, - "rewards/margins": 3.599376678466797, - "rewards/rejected": -4.587672233581543, - "step": 710 - }, - { - "epoch": 1.51, - "grad_norm": 37.177307769645225, - "learning_rate": 2.9181224366319943e-07, - "logits/chosen": -1.9124629497528076, - "logits/rejected": -1.7028827667236328, - "logps/chosen": -315.4266662597656, - "logps/rejected": -355.34857177734375, - "loss": 0.166, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.7210714817047119, - "rewards/margins": 3.6800003051757812, - "rewards/rejected": -4.401071071624756, - "step": 720 - }, - { - "epoch": 1.53, - "grad_norm": 39.29672592174553, - "learning_rate": 2.857978369393279e-07, - "logits/chosen": -2.243919849395752, - "logits/rejected": -2.1087582111358643, - "logps/chosen": -308.04412841796875, - "logps/rejected": -345.15594482421875, - "loss": 0.152, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.5387176871299744, - "rewards/margins": 3.3322176933288574, - "rewards/rejected": -3.8709354400634766, - "step": 730 - }, - { - "epoch": 1.55, - "grad_norm": 24.15001856800991, - "learning_rate": 2.797621999391938e-07, - "logits/chosen": -2.3955087661743164, - "logits/rejected": -2.282515048980713, - "logps/chosen": -296.63604736328125, - "logps/rejected": -331.5033264160156, - "loss": 0.1584, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -1.0399105548858643, - "rewards/margins": 3.2611820697784424, - "rewards/rejected": -4.301093101501465, - "step": 740 - }, - { - "epoch": 1.57, - "grad_norm": 28.800655620715126, - "learning_rate": 2.7370891215954565e-07, - "logits/chosen": -2.291198253631592, - "logits/rejected": -2.1874325275421143, - "logps/chosen": -294.86126708984375, - "logps/rejected": -342.1693420410156, - "loss": 0.1327, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -0.5225303769111633, - "rewards/margins": 3.4790916442871094, - "rewards/rejected": -4.001622200012207, - "step": 750 - }, - { - "epoch": 1.59, - "grad_norm": 36.52736639617503, - "learning_rate": 2.676415635651091e-07, - "logits/chosen": -2.2511188983917236, - "logits/rejected": -2.1171295642852783, - "logps/chosen": -314.3142395019531, - "logps/rejected": -401.13018798828125, - "loss": 0.1373, - "rewards/accuracies": 0.96875, - "rewards/chosen": -0.4879869818687439, - "rewards/margins": 4.032652854919434, - "rewards/rejected": -4.520639896392822, - "step": 760 - }, - { - "epoch": 1.61, - "grad_norm": 22.81336311327227, - "learning_rate": 2.615637524595207e-07, - "logits/chosen": -2.2899999618530273, - "logits/rejected": -2.117185115814209, - "logps/chosen": -290.29815673828125, - "logps/rejected": -304.9264831542969, - "loss": 0.137, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.9585368037223816, - "rewards/margins": 3.114755153656006, - "rewards/rejected": -4.0732927322387695, - "step": 770 - }, - { - "epoch": 1.63, - "grad_norm": 25.499517087179957, - "learning_rate": 2.55479083351317e-07, - "logits/chosen": -2.2676925659179688, - "logits/rejected": -2.083047866821289, - "logps/chosen": -301.6076965332031, - "logps/rejected": -364.6853942871094, - "loss": 0.1536, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -0.8435381054878235, - "rewards/margins": 3.4508233070373535, - "rewards/rejected": -4.294361114501953, - "step": 780 - }, - { - "epoch": 1.65, - "grad_norm": 27.418147190397534, - "learning_rate": 2.4939116481624407e-07, - "logits/chosen": -2.220470666885376, - "logits/rejected": -1.959284782409668, - "logps/chosen": -299.3353576660156, - "logps/rejected": -319.30914306640625, - "loss": 0.1568, - "rewards/accuracies": 0.96875, - "rewards/chosen": -0.8999794125556946, - "rewards/margins": 3.3365378379821777, - "rewards/rejected": -4.236516952514648, - "step": 790 - }, - { - "epoch": 1.67, - "grad_norm": 22.42449451210148, - "learning_rate": 2.4330360735715374e-07, - "logits/chosen": -2.2539217472076416, - "logits/rejected": -2.0446996688842773, - "logps/chosen": -333.44366455078125, - "logps/rejected": -357.18841552734375, - "loss": 0.1475, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.156060814857483, - "rewards/margins": 3.485567092895508, - "rewards/rejected": -4.641627311706543, - "step": 800 - }, - { - "epoch": 1.67, - "eval_logits/chosen": -2.2409238815307617, - "eval_logits/rejected": -2.1201870441436768, - "eval_logps/chosen": -296.3533020019531, - "eval_logps/rejected": -332.4950866699219, - "eval_loss": 0.538158118724823, - "eval_rewards/accuracies": 0.80078125, - "eval_rewards/chosen": -1.6879926919937134, - "eval_rewards/margins": 1.8034026622772217, - "eval_rewards/rejected": -3.4913952350616455, - "eval_runtime": 96.753, - "eval_samples_per_second": 20.671, - "eval_steps_per_second": 0.331, - "step": 800 - }, - { - "epoch": 1.69, - "grad_norm": 27.648844978471217, - "learning_rate": 2.3722002126275822e-07, - "logits/chosen": -2.118046998977661, - "logits/rejected": -1.9478477239608765, - "logps/chosen": -295.06268310546875, - "logps/rejected": -350.56829833984375, - "loss": 0.1534, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.974108099937439, - "rewards/margins": 3.4369990825653076, - "rewards/rejected": -4.411107540130615, - "step": 810 - }, - { - "epoch": 1.72, - "grad_norm": 33.288825778542474, - "learning_rate": 2.311440144665108e-07, - "logits/chosen": -2.166576623916626, - "logits/rejected": -2.005044460296631, - "logps/chosen": -285.5037841796875, - "logps/rejected": -354.74578857421875, - "loss": 0.1616, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": -0.7247167229652405, - "rewards/margins": 3.4520015716552734, - "rewards/rejected": -4.176717758178711, - "step": 820 - }, - { - "epoch": 1.74, - "grad_norm": 28.54134777409646, - "learning_rate": 2.2507919040688398e-07, - "logits/chosen": -2.035219669342041, - "logits/rejected": -1.9823194742202759, - "logps/chosen": -302.2669677734375, - "logps/rejected": -337.7466125488281, - "loss": 0.1361, - "rewards/accuracies": 0.96875, - "rewards/chosen": -0.7065524458885193, - "rewards/margins": 3.6438660621643066, - "rewards/rejected": -4.350418567657471, - "step": 830 - }, - { - "epoch": 1.76, - "grad_norm": 29.525737237536532, - "learning_rate": 2.19029145890313e-07, - "logits/chosen": -2.0496773719787598, - "logits/rejected": -1.8104021549224854, - "logps/chosen": -295.2398376464844, - "logps/rejected": -348.8295593261719, - "loss": 0.137, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": -1.180415391921997, - "rewards/margins": 4.056873798370361, - "rewards/rejected": -5.237288951873779, - "step": 840 - }, - { - "epoch": 1.78, - "grad_norm": 29.418652242465264, - "learning_rate": 2.1299746895807268e-07, - "logits/chosen": -2.0320029258728027, - "logits/rejected": -1.8159306049346924, - "logps/chosen": -298.8084716796875, - "logps/rejected": -351.4793395996094, - "loss": 0.1426, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -0.8422020077705383, - "rewards/margins": 3.6892521381378174, - "rewards/rejected": -4.531454563140869, - "step": 850 - }, - { - "epoch": 1.8, - "grad_norm": 28.75219392191004, - "learning_rate": 2.0698773675835246e-07, - "logits/chosen": -1.9433095455169678, - "logits/rejected": -1.7687761783599854, - "logps/chosen": -307.7235107421875, - "logps/rejected": -342.0555114746094, - "loss": 0.1486, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.4170432090759277, - "rewards/margins": 3.423736095428467, - "rewards/rejected": -4.8407793045043945, - "step": 860 - }, - { - "epoch": 1.82, - "grad_norm": 36.42272712771708, - "learning_rate": 2.0100351342479216e-07, - "logits/chosen": -1.9862619638442993, - "logits/rejected": -1.750312089920044, - "logps/chosen": -312.4382019042969, - "logps/rejected": -355.03045654296875, - "loss": 0.1468, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -1.4696812629699707, - "rewards/margins": 3.662865400314331, - "rewards/rejected": -5.132546901702881, - "step": 870 - }, - { - "epoch": 1.84, - "grad_norm": 29.128706338046875, - "learning_rate": 1.9504834796273545e-07, - "logits/chosen": -2.0059654712677, - "logits/rejected": -1.7494417428970337, - "logps/chosen": -284.04669189453125, - "logps/rejected": -307.86334228515625, - "loss": 0.1514, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.4539439678192139, - "rewards/margins": 3.2319369316101074, - "rewards/rejected": -4.6858811378479, - "step": 880 - }, - { - "epoch": 1.86, - "grad_norm": 54.834861913718775, - "learning_rate": 1.8912577214445558e-07, - "logits/chosen": -2.125431537628174, - "logits/rejected": -1.975856065750122, - "logps/chosen": -302.9638671875, - "logps/rejected": -345.03851318359375, - "loss": 0.1219, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": -0.8938841819763184, - "rewards/margins": 3.579861879348755, - "rewards/rejected": -4.473746299743652, - "step": 890 - }, - { - "epoch": 1.88, - "grad_norm": 36.4163607482829, - "learning_rate": 1.8323929841460178e-07, - "logits/chosen": -2.0952582359313965, - "logits/rejected": -1.9091037511825562, - "logps/chosen": -300.1233215332031, - "logps/rejected": -345.5321960449219, - "loss": 0.1422, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": -1.1558645963668823, - "rewards/margins": 3.8015289306640625, - "rewards/rejected": -4.957393169403076, - "step": 900 - }, - { - "epoch": 1.88, - "eval_logits/chosen": -2.198045492172241, - "eval_logits/rejected": -2.0630321502685547, - "eval_logps/chosen": -296.03240966796875, - "eval_logps/rejected": -335.6015625, - "eval_loss": 0.5517675280570984, - "eval_rewards/accuracies": 0.78515625, - "eval_rewards/chosen": -1.67194664478302, - "eval_rewards/margins": 1.9747719764709473, - "eval_rewards/rejected": -3.6467185020446777, - "eval_runtime": 96.8149, - "eval_samples_per_second": 20.658, - "eval_steps_per_second": 0.331, - "step": 900 - }, - { - "epoch": 1.9, - "grad_norm": 32.93263141944983, - "learning_rate": 1.7739241780710745e-07, - "logits/chosen": -2.067739963531494, - "logits/rejected": -1.879922866821289, - "logps/chosen": -285.4407958984375, - "logps/rejected": -354.6173400878906, - "loss": 0.1572, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": -1.3456696271896362, - "rewards/margins": 3.62982439994812, - "rewards/rejected": -4.975494384765625, - "step": 910 - }, - { - "epoch": 1.92, - "grad_norm": 30.163608722340616, - "learning_rate": 1.7158859787479653e-07, - "logits/chosen": -1.945054054260254, - "logits/rejected": -1.7525079250335693, - "logps/chosen": -271.43670654296875, - "logps/rejected": -355.9076232910156, - "loss": 0.1457, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -1.121407151222229, - "rewards/margins": 3.6710968017578125, - "rewards/rejected": -4.79250431060791, - "step": 920 - }, - { - "epoch": 1.95, - "grad_norm": 38.64417902485811, - "learning_rate": 1.6583128063291573e-07, - "logits/chosen": -2.06476092338562, - "logits/rejected": -1.8927568197250366, - "logps/chosen": -288.0918884277344, - "logps/rejected": -331.8973388671875, - "loss": 0.1544, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -1.375701904296875, - "rewards/margins": 3.178281784057617, - "rewards/rejected": -4.55398416519165, - "step": 930 - }, - { - "epoch": 1.97, - "grad_norm": 33.76607136138911, - "learning_rate": 1.6012388051781152e-07, - "logits/chosen": -2.157871961593628, - "logits/rejected": -1.855956792831421, - "logps/chosen": -337.51568603515625, - "logps/rejected": -342.9998779296875, - "loss": 0.1424, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -1.327646255493164, - "rewards/margins": 3.696226119995117, - "rewards/rejected": -5.0238728523254395, - "step": 940 - }, - { - "epoch": 1.99, - "grad_norm": 34.755846610400575, - "learning_rate": 1.54469782361964e-07, - "logits/chosen": -1.9773037433624268, - "logits/rejected": -1.8121259212493896, - "logps/chosen": -310.9110107421875, - "logps/rejected": -364.62359619140625, - "loss": 0.145, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.399800181388855, - "rewards/margins": 3.679616928100586, - "rewards/rejected": -5.0794172286987305, - "step": 950 - }, - { - "epoch": 2.01, - "grad_norm": 9.487017101643959, - "learning_rate": 1.488723393865766e-07, - "logits/chosen": -1.9744060039520264, - "logits/rejected": -1.662217140197754, - "logps/chosen": -328.89556884765625, - "logps/rejected": -356.9366149902344, - "loss": 0.0972, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.0852992534637451, - "rewards/margins": 4.1752800941467285, - "rewards/rejected": -5.260579586029053, - "step": 960 - }, - { - "epoch": 2.03, - "grad_norm": 6.928449852103405, - "learning_rate": 1.4333487121291395e-07, - "logits/chosen": -1.9290939569473267, - "logits/rejected": -1.646998643875122, - "logps/chosen": -290.2763977050781, - "logps/rejected": -359.6099548339844, - "loss": 0.0377, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -0.9613211750984192, - "rewards/margins": 4.470429420471191, - "rewards/rejected": -5.431751251220703, - "step": 970 - }, - { - "epoch": 2.05, - "grad_norm": 12.357238308252128, - "learning_rate": 1.3786066189356627e-07, - "logits/chosen": -1.840659499168396, - "logits/rejected": -1.3801103830337524, - "logps/chosen": -323.639892578125, - "logps/rejected": -382.70281982421875, - "loss": 0.0412, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.1784292459487915, - "rewards/margins": 4.754664897918701, - "rewards/rejected": -5.933094501495361, - "step": 980 - }, - { - "epoch": 2.07, - "grad_norm": 12.765885065958322, - "learning_rate": 1.3245295796480788e-07, - "logits/chosen": -1.6344573497772217, - "logits/rejected": -1.3037782907485962, - "logps/chosen": -280.02520751953125, - "logps/rejected": -373.9437561035156, - "loss": 0.0483, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -1.5978093147277832, - "rewards/margins": 4.701867580413818, - "rewards/rejected": -6.299676895141602, - "step": 990 - }, - { - "epoch": 2.09, - "grad_norm": 10.639657237631194, - "learning_rate": 1.2711496652120578e-07, - "logits/chosen": -1.6058915853500366, - "logits/rejected": -1.2118239402770996, - "logps/chosen": -295.133056640625, - "logps/rejected": -380.3929748535156, - "loss": 0.044, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -1.539745807647705, - "rewards/margins": 5.079216957092285, - "rewards/rejected": -6.61896276473999, - "step": 1000 - }, - { - "epoch": 2.09, - "eval_logits/chosen": -1.7406296730041504, - "eval_logits/rejected": -1.4629344940185547, - "eval_logps/chosen": -316.4520263671875, - "eval_logps/rejected": -365.49591064453125, - "eval_loss": 0.6058282852172852, - "eval_rewards/accuracies": 0.7890625, - "eval_rewards/chosen": -2.6929280757904053, - "eval_rewards/margins": 2.4485080242156982, - "eval_rewards/rejected": -5.1414361000061035, - "eval_runtime": 96.8107, - "eval_samples_per_second": 20.659, - "eval_steps_per_second": 0.331, - "step": 1000 - }, - { - "epoch": 2.11, - "grad_norm": 8.532531818207984, - "learning_rate": 1.2184985331361878e-07, - "logits/chosen": -1.5615358352661133, - "logits/rejected": -1.3172276020050049, - "logps/chosen": -293.0592041015625, - "logps/rejected": -389.7725830078125, - "loss": 0.0365, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -1.6128288507461548, - "rewards/margins": 4.956469535827637, - "rewards/rejected": -6.56929874420166, - "step": 1010 - }, - { - "epoch": 2.13, - "grad_norm": 8.404017113839789, - "learning_rate": 1.1666074087171627e-07, - "logits/chosen": -1.585607886314392, - "logits/rejected": -1.2554726600646973, - "logps/chosen": -287.69989013671875, - "logps/rejected": -414.93267822265625, - "loss": 0.0337, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -1.7983949184417725, - "rewards/margins": 5.210457801818848, - "rewards/rejected": -7.008852481842041, - "step": 1020 - }, - { - "epoch": 2.15, - "grad_norm": 10.846070292764287, - "learning_rate": 1.115507066521304e-07, - "logits/chosen": -1.5471771955490112, - "logits/rejected": -1.1083250045776367, - "logps/chosen": -304.71533203125, - "logps/rejected": -398.4665222167969, - "loss": 0.0418, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -1.608544111251831, - "rewards/margins": 5.360156536102295, - "rewards/rejected": -6.968701362609863, - "step": 1030 - }, - { - "epoch": 2.18, - "grad_norm": 11.175167320601789, - "learning_rate": 1.065227812133381e-07, - "logits/chosen": -1.481093168258667, - "logits/rejected": -0.8541752696037292, - "logps/chosen": -340.7627868652344, - "logps/rejected": -378.1071472167969, - "loss": 0.0369, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -2.031552791595459, - "rewards/margins": 5.121342658996582, - "rewards/rejected": -7.152895927429199, - "step": 1040 - }, - { - "epoch": 2.2, - "grad_norm": 9.191567382424724, - "learning_rate": 1.0157994641835734e-07, - "logits/chosen": -1.3451311588287354, - "logits/rejected": -0.851954460144043, - "logps/chosen": -312.46246337890625, - "logps/rejected": -443.61480712890625, - "loss": 0.0366, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.4045233726501465, - "rewards/margins": 5.692896842956543, - "rewards/rejected": -8.097419738769531, - "step": 1050 - }, - { - "epoch": 2.22, - "grad_norm": 10.368281313443552, - "learning_rate": 9.672513366632259e-08, - "logits/chosen": -1.4387218952178955, - "logits/rejected": -0.9104587435722351, - "logps/chosen": -313.5397644042969, - "logps/rejected": -392.3639221191406, - "loss": 0.0335, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -2.1655592918395996, - "rewards/margins": 5.339682579040527, - "rewards/rejected": -7.505242347717285, - "step": 1060 - }, - { - "epoch": 2.24, - "grad_norm": 9.399192097048465, - "learning_rate": 9.196122215398824e-08, - "logits/chosen": -1.5347858667373657, - "logits/rejected": -1.1239128112792969, - "logps/chosen": -345.493408203125, - "logps/rejected": -442.33880615234375, - "loss": 0.0335, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -2.0253329277038574, - "rewards/margins": 5.696555137634277, - "rewards/rejected": -7.721887111663818, - "step": 1070 - }, - { - "epoch": 2.26, - "grad_norm": 13.854211150002678, - "learning_rate": 8.729103716819111e-08, - "logits/chosen": -1.4636061191558838, - "logits/rejected": -0.9430959820747375, - "logps/chosen": -348.165771484375, - "logps/rejected": -437.73150634765625, - "loss": 0.0324, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -1.851531982421875, - "rewards/margins": 5.859683990478516, - "rewards/rejected": -7.711215972900391, - "step": 1080 - }, - { - "epoch": 2.28, - "grad_norm": 13.012519621486515, - "learning_rate": 8.271734841028552e-08, - "logits/chosen": -1.4426562786102295, - "logits/rejected": -0.9428352117538452, - "logps/chosen": -357.9177551269531, - "logps/rejected": -440.7193298339844, - "loss": 0.0276, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -2.550102710723877, - "rewards/margins": 5.568441390991211, - "rewards/rejected": -8.11854362487793, - "step": 1090 - }, - { - "epoch": 2.3, - "grad_norm": 7.343017253233073, - "learning_rate": 7.824286835354262e-08, - "logits/chosen": -1.2438175678253174, - "logits/rejected": -0.7905367612838745, - "logps/chosen": -294.59564208984375, - "logps/rejected": -394.0030517578125, - "loss": 0.0307, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -1.902488350868225, - "rewards/margins": 5.68535041809082, - "rewards/rejected": -7.587839603424072, - "step": 1100 - }, - { - "epoch": 2.3, - "eval_logits/chosen": -1.330996036529541, - "eval_logits/rejected": -0.9162449240684509, - "eval_logps/chosen": -337.038330078125, - "eval_logps/rejected": -397.1616516113281, - "eval_loss": 0.6699962019920349, - "eval_rewards/accuracies": 0.76953125, - "eval_rewards/chosen": -3.722243070602417, - "eval_rewards/margins": 3.002480983734131, - "eval_rewards/rejected": -6.724723815917969, - "eval_runtime": 96.8573, - "eval_samples_per_second": 20.649, - "eval_steps_per_second": 0.33, - "step": 1100 - }, - { - "epoch": 2.32, - "grad_norm": 22.14619972377515, - "learning_rate": 7.387025063449081e-08, - "logits/chosen": -1.2601680755615234, - "logits/rejected": -0.4739012122154236, - "logps/chosen": -334.8611755371094, - "logps/rejected": -403.3932800292969, - "loss": 0.0294, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -2.9858272075653076, - "rewards/margins": 5.638393878936768, - "rewards/rejected": -8.624221801757812, - "step": 1110 - }, - { - "epoch": 2.34, - "grad_norm": 22.797319211696394, - "learning_rate": 6.960208847914884e-08, - "logits/chosen": -1.1370799541473389, - "logits/rejected": -0.7852309942245483, - "logps/chosen": -302.19561767578125, - "logps/rejected": -427.47283935546875, - "loss": 0.0337, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -2.330061912536621, - "rewards/margins": 6.187905311584473, - "rewards/rejected": -8.517967224121094, - "step": 1120 - }, - { - "epoch": 2.36, - "grad_norm": 12.447087206832133, - "learning_rate": 6.544091316508646e-08, - "logits/chosen": -1.1922690868377686, - "logits/rejected": -0.7016154527664185, - "logps/chosen": -337.29254150390625, - "logps/rejected": -419.9600524902344, - "loss": 0.0309, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -2.260216474533081, - "rewards/margins": 6.044236660003662, - "rewards/rejected": -8.30445384979248, - "step": 1130 - }, - { - "epoch": 2.38, - "grad_norm": 6.670071848884272, - "learning_rate": 6.138919252022435e-08, - "logits/chosen": -1.2661590576171875, - "logits/rejected": -0.726656973361969, - "logps/chosen": -337.79095458984375, - "logps/rejected": -444.1019592285156, - "loss": 0.0321, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.347249746322632, - "rewards/margins": 5.921750068664551, - "rewards/rejected": -8.268999099731445, - "step": 1140 - }, - { - "epoch": 2.41, - "grad_norm": 20.202187916611614, - "learning_rate": 5.7449329459262895e-08, - "logits/chosen": -1.3327900171279907, - "logits/rejected": -0.7205911874771118, - "logps/chosen": -327.96160888671875, - "logps/rejected": -431.84246826171875, - "loss": 0.0315, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -2.5763912200927734, - "rewards/margins": 6.172744274139404, - "rewards/rejected": -8.749135971069336, - "step": 1150 - }, - { - "epoch": 2.43, - "grad_norm": 12.705067042675866, - "learning_rate": 5.362366055860934e-08, - "logits/chosen": -1.3245675563812256, - "logits/rejected": -0.8294090032577515, - "logps/chosen": -339.1673278808594, - "logps/rejected": -427.9764709472656, - "loss": 0.0264, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.199721336364746, - "rewards/margins": 5.974953651428223, - "rewards/rejected": -8.174674034118652, - "step": 1160 - }, - { - "epoch": 2.45, - "grad_norm": 10.500886681852085, - "learning_rate": 4.991445467064689e-08, - "logits/chosen": -1.205335259437561, - "logits/rejected": -0.8114490509033203, - "logps/chosen": -309.63421630859375, - "logps/rejected": -438.3241271972656, - "loss": 0.0317, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -2.618736982345581, - "rewards/margins": 6.0979084968566895, - "rewards/rejected": -8.716645240783691, - "step": 1170 - }, - { - "epoch": 2.47, - "grad_norm": 12.519852374223428, - "learning_rate": 4.6323911578168146e-08, - "logits/chosen": -1.1557085514068604, - "logits/rejected": -0.6752947568893433, - "logps/chosen": -306.9227294921875, - "logps/rejected": -395.4352722167969, - "loss": 0.0323, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -2.859976053237915, - "rewards/margins": 5.740398406982422, - "rewards/rejected": -8.600374221801758, - "step": 1180 - }, - { - "epoch": 2.49, - "grad_norm": 14.10405859024082, - "learning_rate": 4.285416068977166e-08, - "logits/chosen": -1.2870628833770752, - "logits/rejected": -0.8281751871109009, - "logps/chosen": -295.5076904296875, - "logps/rejected": -401.1368713378906, - "loss": 0.0387, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -2.580979585647583, - "rewards/margins": 6.097764015197754, - "rewards/rejected": -8.678743362426758, - "step": 1190 - }, - { - "epoch": 2.51, - "grad_norm": 10.124969755651362, - "learning_rate": 3.9507259776993954e-08, - "logits/chosen": -1.2816931009292603, - "logits/rejected": -0.8490222096443176, - "logps/chosen": -333.55950927734375, - "logps/rejected": -454.38433837890625, - "loss": 0.0317, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -2.4796204566955566, - "rewards/margins": 6.314810752868652, - "rewards/rejected": -8.794431686401367, - "step": 1200 - }, - { - "epoch": 2.51, - "eval_logits/chosen": -1.2927337884902954, - "eval_logits/rejected": -0.9226770401000977, - "eval_logps/chosen": -341.82611083984375, - "eval_logps/rejected": -401.94482421875, - "eval_loss": 0.6710954308509827, - "eval_rewards/accuracies": 0.77734375, - "eval_rewards/chosen": -3.9616329669952393, - "eval_rewards/margins": 3.002251148223877, - "eval_rewards/rejected": -6.963884353637695, - "eval_runtime": 96.6177, - "eval_samples_per_second": 20.7, - "eval_steps_per_second": 0.331, - "step": 1200 - }, - { - "epoch": 2.53, - "grad_norm": 9.343788604433174, - "learning_rate": 3.6285193753926995e-08, - "logits/chosen": -1.2478582859039307, - "logits/rejected": -0.6749597787857056, - "logps/chosen": -335.22381591796875, - "logps/rejected": -448.06427001953125, - "loss": 0.0372, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -3.044740915298462, - "rewards/margins": 5.88831090927124, - "rewards/rejected": -8.933051109313965, - "step": 1210 - }, - { - "epoch": 2.55, - "grad_norm": 15.370939596872683, - "learning_rate": 3.3189873500044376e-08, - "logits/chosen": -1.2345765829086304, - "logits/rejected": -0.7880679368972778, - "logps/chosen": -316.099853515625, - "logps/rejected": -424.1470642089844, - "loss": 0.0304, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -2.545562267303467, - "rewards/margins": 6.050265312194824, - "rewards/rejected": -8.595827102661133, - "step": 1220 - }, - { - "epoch": 2.57, - "grad_norm": 9.256523996293426, - "learning_rate": 3.022313472693447e-08, - "logits/chosen": -1.2926331758499146, - "logits/rejected": -0.7623311877250671, - "logps/chosen": -323.86981201171875, - "logps/rejected": -432.7613220214844, - "loss": 0.0326, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -3.008089065551758, - "rewards/margins": 5.656081676483154, - "rewards/rejected": -8.66417121887207, - "step": 1230 - }, - { - "epoch": 2.59, - "grad_norm": 13.108228798737402, - "learning_rate": 2.738673688961296e-08, - "logits/chosen": -1.4012349843978882, - "logits/rejected": -0.9218941926956177, - "logps/chosen": -351.8048400878906, - "logps/rejected": -434.9781799316406, - "loss": 0.0339, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -2.5394856929779053, - "rewards/margins": 5.643149375915527, - "rewards/rejected": -8.182635307312012, - "step": 1240 - }, - { - "epoch": 2.62, - "grad_norm": 7.525159999057864, - "learning_rate": 2.4682362143059797e-08, - "logits/chosen": -1.2464749813079834, - "logits/rejected": -0.6382617354393005, - "logps/chosen": -326.1867370605469, - "logps/rejected": -427.4964294433594, - "loss": 0.0245, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -2.6778645515441895, - "rewards/margins": 6.1839213371276855, - "rewards/rejected": -8.861784934997559, - "step": 1250 - }, - { - "epoch": 2.64, - "grad_norm": 20.76488422355617, - "learning_rate": 2.2111614344599684e-08, - "logits/chosen": -1.1929394006729126, - "logits/rejected": -0.6367761492729187, - "logps/chosen": -346.1959533691406, - "logps/rejected": -437.5431213378906, - "loss": 0.0344, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -2.5477092266082764, - "rewards/margins": 5.813665390014648, - "rewards/rejected": -8.361373901367188, - "step": 1260 - }, - { - "epoch": 2.66, - "grad_norm": 8.669045699213816, - "learning_rate": 1.9676018102718213e-08, - "logits/chosen": -1.247870922088623, - "logits/rejected": -0.6409175395965576, - "logps/chosen": -365.83392333984375, - "logps/rejected": -399.61962890625, - "loss": 0.0338, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -2.6575913429260254, - "rewards/margins": 5.685526371002197, - "rewards/rejected": -8.343117713928223, - "step": 1270 - }, - { - "epoch": 2.68, - "grad_norm": 22.62219866138702, - "learning_rate": 1.7377017872876987e-08, - "logits/chosen": -1.208968162536621, - "logits/rejected": -0.6456762552261353, - "logps/chosen": -317.1292419433594, - "logps/rejected": -430.59613037109375, - "loss": 0.0336, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.8459768295288086, - "rewards/margins": 5.9941020011901855, - "rewards/rejected": -8.840079307556152, - "step": 1280 - }, - { - "epoch": 2.7, - "grad_norm": 20.33027332534465, - "learning_rate": 1.521597710086439e-08, - "logits/chosen": -1.1466628313064575, - "logits/rejected": -0.7244657278060913, - "logps/chosen": -356.29986572265625, - "logps/rejected": -458.36279296875, - "loss": 0.0348, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -2.7150323390960693, - "rewards/margins": 6.071440696716309, - "rewards/rejected": -8.786473274230957, - "step": 1290 - }, - { - "epoch": 2.72, - "grad_norm": 11.138423191199344, - "learning_rate": 1.3194177414189905e-08, - "logits/chosen": -1.0380289554595947, - "logits/rejected": -0.5761706233024597, - "logps/chosen": -299.1542663574219, - "logps/rejected": -413.3262634277344, - "loss": 0.0264, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -2.816028118133545, - "rewards/margins": 5.865799903869629, - "rewards/rejected": -8.681828498840332, - "step": 1300 - }, - { - "epoch": 2.72, - "eval_logits/chosen": -1.2189825773239136, - "eval_logits/rejected": -0.8369507789611816, - "eval_logps/chosen": -347.2215576171875, - "eval_logps/rejected": -407.835205078125, - "eval_loss": 0.6777693033218384, - "eval_rewards/accuracies": 0.77734375, - "eval_rewards/chosen": -4.231404781341553, - "eval_rewards/margins": 3.0269954204559326, - "eval_rewards/rejected": -7.258399486541748, - "eval_runtime": 96.5907, - "eval_samples_per_second": 20.706, - "eval_steps_per_second": 0.331, - "step": 1300 - }, - { - "epoch": 2.74, - "grad_norm": 18.646420478229594, - "learning_rate": 1.1312817862001945e-08, - "logits/chosen": -1.174865961074829, - "logits/rejected": -0.5988924503326416, - "logps/chosen": -335.7987060546875, - "logps/rejected": -438.8302307128906, - "loss": 0.0323, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.0069799423217773, - "rewards/margins": 6.027297496795654, - "rewards/rejected": -9.034276962280273, - "step": 1310 - }, - { - "epoch": 2.76, - "grad_norm": 9.37653587990746, - "learning_rate": 9.57301420397924e-09, - "logits/chosen": -1.2077335119247437, - "logits/rejected": -0.6788758039474487, - "logps/chosen": -344.1173400878906, - "logps/rejected": -449.73822021484375, - "loss": 0.0296, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -2.7568278312683105, - "rewards/margins": 6.043575763702393, - "rewards/rejected": -8.800403594970703, - "step": 1320 - }, - { - "epoch": 2.78, - "grad_norm": 19.319012047791922, - "learning_rate": 7.975798248618076e-09, - "logits/chosen": -1.2139251232147217, - "logits/rejected": -0.5430259108543396, - "logps/chosen": -304.11981201171875, - "logps/rejected": -399.5338134765625, - "loss": 0.0403, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -2.8824665546417236, - "rewards/margins": 5.712244510650635, - "rewards/rejected": -8.594710350036621, - "step": 1330 - }, - { - "epoch": 2.8, - "grad_norm": 15.988430571781114, - "learning_rate": 6.522117241307606e-09, - "logits/chosen": -1.257893681526184, - "logits/rejected": -0.7202562093734741, - "logps/chosen": -358.0582275390625, - "logps/rejected": -453.0985412597656, - "loss": 0.0249, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -2.752913475036621, - "rewards/margins": 5.904249668121338, - "rewards/rejected": -8.6571626663208, - "step": 1340 - }, - { - "epoch": 2.82, - "grad_norm": 18.522259659368, - "learning_rate": 5.212833302556258e-09, - "logits/chosen": -1.0940848588943481, - "logits/rejected": -0.642852783203125, - "logps/chosen": -330.330810546875, - "logps/rejected": -439.7862243652344, - "loss": 0.0316, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -3.221640110015869, - "rewards/margins": 5.774725914001465, - "rewards/rejected": -8.996365547180176, - "step": 1350 - }, - { - "epoch": 2.85, - "grad_norm": 14.304064916507468, - "learning_rate": 4.048722916702302e-09, - "logits/chosen": -1.2064802646636963, - "logits/rejected": -0.7113040685653687, - "logps/chosen": -350.12359619140625, - "logps/rejected": -433.10784912109375, - "loss": 0.0292, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -3.004544734954834, - "rewards/margins": 5.731839179992676, - "rewards/rejected": -8.736384391784668, - "step": 1360 - }, - { - "epoch": 2.87, - "grad_norm": 9.460405793660744, - "learning_rate": 3.030476471411664e-09, - "logits/chosen": -1.2229253053665161, - "logits/rejected": -0.5610889196395874, - "logps/chosen": -358.481689453125, - "logps/rejected": -432.617919921875, - "loss": 0.0255, - "rewards/accuracies": 0.9937499761581421, - "rewards/chosen": -3.1794846057891846, - "rewards/margins": 6.04774284362793, - "rewards/rejected": -9.227226257324219, - "step": 1370 - }, - { - "epoch": 2.89, - "grad_norm": 12.222248359736108, - "learning_rate": 2.158697848236607e-09, - "logits/chosen": -1.3239306211471558, - "logits/rejected": -0.696345865726471, - "logps/chosen": -345.43890380859375, - "logps/rejected": -420.57415771484375, - "loss": 0.0324, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -2.8451218605041504, - "rewards/margins": 5.87528657913208, - "rewards/rejected": -8.72040843963623, - "step": 1380 - }, - { - "epoch": 2.91, - "grad_norm": 12.712161033692837, - "learning_rate": 1.4339040644774092e-09, - "logits/chosen": -1.1367696523666382, - "logits/rejected": -0.6753785014152527, - "logps/chosen": -300.333984375, - "logps/rejected": -410.33966064453125, - "loss": 0.0346, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -2.8657116889953613, - "rewards/margins": 5.644627571105957, - "rewards/rejected": -8.510337829589844, - "step": 1390 - }, - { - "epoch": 2.93, - "grad_norm": 7.479511958985555, - "learning_rate": 8.56524966559885e-10, - "logits/chosen": -1.297228455543518, - "logits/rejected": -0.9393932223320007, - "logps/chosen": -337.4761962890625, - "logps/rejected": -469.80499267578125, - "loss": 0.0343, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -2.661848306655884, - "rewards/margins": 5.969611644744873, - "rewards/rejected": -8.631460189819336, - "step": 1400 - }, - { - "epoch": 2.93, - "eval_logits/chosen": -1.2134709358215332, - "eval_logits/rejected": -0.8310315608978271, - "eval_logps/chosen": -347.1476135253906, - "eval_logps/rejected": -408.3960876464844, - "eval_loss": 0.6824045181274414, - "eval_rewards/accuracies": 0.77734375, - "eval_rewards/chosen": -4.227706432342529, - "eval_rewards/margins": 3.058739423751831, - "eval_rewards/rejected": -7.286445617675781, - "eval_runtime": 96.8438, - "eval_samples_per_second": 20.652, - "eval_steps_per_second": 0.33, - "step": 1400 - }, - { - "epoch": 2.95, - "grad_norm": 25.645964088265643, - "learning_rate": 4.269029751107489e-10, - "logits/chosen": -1.169325590133667, - "logits/rejected": -0.646837055683136, - "logps/chosen": -327.53558349609375, - "logps/rejected": -438.29022216796875, - "loss": 0.0315, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -2.6820549964904785, - "rewards/margins": 6.348910331726074, - "rewards/rejected": -9.030964851379395, - "step": 1410 - }, - { - "epoch": 2.97, - "grad_norm": 10.010864148743238, - "learning_rate": 1.4529288188125377e-10, - "logits/chosen": -1.1855485439300537, - "logits/rejected": -0.6920149326324463, - "logps/chosen": -337.4898376464844, - "logps/rejected": -440.05987548828125, - "loss": 0.0257, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -2.923854351043701, - "rewards/margins": 6.196907997131348, - "rewards/rejected": -9.120763778686523, - "step": 1420 - }, - { - "epoch": 2.99, - "grad_norm": 37.578079095213596, - "learning_rate": 1.1861698640563966e-11, - "logits/chosen": -1.1965336799621582, - "logits/rejected": -0.6326996684074402, - "logps/chosen": -348.0246276855469, - "logps/rejected": -445.17431640625, - "loss": 0.0318, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -2.8414111137390137, - "rewards/margins": 6.086348533630371, - "rewards/rejected": -8.927759170532227, - "step": 1430 - }, - { - "epoch": 3.0, - "step": 1434, + "step": 478, "total_flos": 0.0, - "train_loss": 0.007373018379906397, - "train_runtime": 5373.2533, - "train_samples_per_second": 34.132, - "train_steps_per_second": 0.267 + "train_loss": 0.5478911828795238, + "train_runtime": 7553.9268, + "train_samples_per_second": 8.093, + "train_steps_per_second": 0.063 } ], "logging_steps": 10, - "max_steps": 1434, + "max_steps": 478, "num_input_tokens_seen": 0, - "num_train_epochs": 3, + "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8,