diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,20 +1,20 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.0, + "epoch": 0.9994767137624281, "eval_steps": 100, - "global_step": 1911, + "global_step": 955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, - "learning_rate": 2.6041666666666667e-08, - "logits/chosen": -2.7462317943573, - "logits/rejected": -2.425077199935913, - "logps/chosen": -250.54042053222656, - "logps/rejected": -177.74742126464844, + "learning_rate": 5.208333333333333e-08, + "logits/chosen": -2.4445395469665527, + "logits/rejected": -2.388306140899658, + "logps/chosen": -221.89984130859375, + "logps/rejected": -197.11672973632812, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, @@ -22,2996 +22,1492 @@ "rewards/rejected": 0.0, "step": 1 }, - { - "epoch": 0.01, - "learning_rate": 2.604166666666667e-07, - "logits/chosen": -2.343383312225342, - "logits/rejected": -2.2455639839172363, - "logps/chosen": -311.4956970214844, - "logps/rejected": -254.3400115966797, - "loss": 0.6932, - "rewards/accuracies": 0.3611111044883728, - "rewards/chosen": 0.0004503716481849551, - "rewards/margins": 0.0002595583500806242, - "rewards/rejected": 0.00019081326900050044, - "step": 10 - }, { "epoch": 0.01, "learning_rate": 5.208333333333334e-07, - "logits/chosen": -2.3990731239318848, - "logits/rejected": -2.242825984954834, - "logps/chosen": -307.36834716796875, - "logps/rejected": -239.515625, + "logits/chosen": -2.385066509246826, + "logits/rejected": -2.237809181213379, + "logps/chosen": -315.8031005859375, + "logps/rejected": -248.2730255126953, "loss": 0.6931, - "rewards/accuracies": 0.375, - "rewards/chosen": 0.00018046621698886156, - "rewards/margins": -0.0010750549845397472, - "rewards/rejected": 0.0012555213179439306, - "step": 20 + "rewards/accuracies": 0.4444444477558136, + "rewards/chosen": 8.269523277704138e-06, + "rewards/margins": -0.00012648198753595352, + "rewards/rejected": 0.00013475156447384506, + "step": 10 }, { "epoch": 0.02, - "learning_rate": 7.8125e-07, - "logits/chosen": -2.156728506088257, - "logits/rejected": -2.1305534839630127, - "logps/chosen": -258.18896484375, - "logps/rejected": -222.45346069335938, - "loss": 0.6931, + "learning_rate": 1.0416666666666667e-06, + "logits/chosen": -2.166614294052124, + "logits/rejected": -2.138037919998169, + "logps/chosen": -230.2421112060547, + "logps/rejected": -215.0667724609375, + "loss": 0.6932, "rewards/accuracies": 0.5, - "rewards/chosen": -0.000468552578240633, - "rewards/margins": -0.0007666439050808549, - "rewards/rejected": 0.0002980913850478828, + "rewards/chosen": -0.00011211812670808285, + "rewards/margins": -0.00017659642617218196, + "rewards/rejected": 6.44782921881415e-05, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 1.5625e-06, + "logits/chosen": -2.359757900238037, + "logits/rejected": -2.239750623703003, + "logps/chosen": -287.16546630859375, + "logps/rejected": -250.483154296875, + "loss": 0.6932, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": 0.0006519377348013222, + "rewards/margins": -0.000464016105979681, + "rewards/rejected": 0.0011159538989886642, "step": 30 }, { - "epoch": 0.02, - "learning_rate": 1.0416666666666667e-06, - "logits/chosen": -2.175902843475342, - "logits/rejected": -2.1458325386047363, - "logps/chosen": -202.2835693359375, - "logps/rejected": -207.637939453125, - "loss": 0.6931, - "rewards/accuracies": 0.4749999940395355, - "rewards/chosen": 0.00036091814399696887, - "rewards/margins": 0.00010843136260518804, - "rewards/rejected": 0.0002524866722524166, + "epoch": 0.04, + "learning_rate": 2.0833333333333334e-06, + "logits/chosen": -2.332831859588623, + "logits/rejected": -2.2222561836242676, + "logps/chosen": -268.5550231933594, + "logps/rejected": -245.136474609375, + "loss": 0.6928, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0009799805702641606, + "rewards/margins": 0.00043143361108377576, + "rewards/rejected": 0.0005485471338033676, "step": 40 }, { - "epoch": 0.03, - "learning_rate": 1.3020833333333335e-06, - "logits/chosen": -2.3326008319854736, - "logits/rejected": -2.227440118789673, - "logps/chosen": -309.99859619140625, - "logps/rejected": -249.27401733398438, - "loss": 0.693, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.001949362107552588, - "rewards/margins": 0.0011079810792580247, - "rewards/rejected": 0.0008413810282945633, + "epoch": 0.05, + "learning_rate": 2.604166666666667e-06, + "logits/chosen": -2.3112823963165283, + "logits/rejected": -2.315413236618042, + "logps/chosen": -252.92160034179688, + "logps/rejected": -255.1580352783203, + "loss": 0.6927, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0017378001939505339, + "rewards/margins": 0.000829376163892448, + "rewards/rejected": 0.0009084242628887296, "step": 50 }, { - "epoch": 0.03, - "learning_rate": 1.5625e-06, - "logits/chosen": -2.3864035606384277, - "logits/rejected": -2.252150774002075, - "logps/chosen": -264.1552429199219, - "logps/rejected": -251.7004852294922, - "loss": 0.693, - "rewards/accuracies": 0.3499999940395355, - "rewards/chosen": 0.0011251050746068358, - "rewards/margins": -0.0001833140559028834, - "rewards/rejected": 0.0013084192760288715, + "epoch": 0.06, + "learning_rate": 3.125e-06, + "logits/chosen": -2.2834067344665527, + "logits/rejected": -2.2284858226776123, + "logps/chosen": -261.4151306152344, + "logps/rejected": -252.23080444335938, + "loss": 0.6921, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0034083526115864515, + "rewards/margins": 0.0015303167747333646, + "rewards/rejected": 0.0018780359532684088, "step": 60 }, { - "epoch": 0.04, - "learning_rate": 1.8229166666666666e-06, - "logits/chosen": -2.3157763481140137, - "logits/rejected": -2.1782267093658447, - "logps/chosen": -279.8276062011719, - "logps/rejected": -251.36514282226562, - "loss": 0.6927, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.0017941547557711601, - "rewards/margins": 0.0011580453719943762, - "rewards/rejected": 0.0006361090345308185, + "epoch": 0.07, + "learning_rate": 3.6458333333333333e-06, + "logits/chosen": -2.319131374359131, + "logits/rejected": -2.1937270164489746, + "logps/chosen": -305.9224548339844, + "logps/rejected": -275.6637268066406, + "loss": 0.6905, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.010511571541428566, + "rewards/margins": 0.006470891647040844, + "rewards/rejected": 0.004040678963065147, "step": 70 }, { - "epoch": 0.04, - "learning_rate": 2.0833333333333334e-06, - "logits/chosen": -2.3494324684143066, - "logits/rejected": -2.265619993209839, - "logps/chosen": -257.0757141113281, - "logps/rejected": -238.935791015625, - "loss": 0.6922, + "epoch": 0.08, + "learning_rate": 4.166666666666667e-06, + "logits/chosen": -2.155569553375244, + "logits/rejected": -1.9963390827178955, + "logps/chosen": -284.39007568359375, + "logps/rejected": -246.76455688476562, + "loss": 0.6876, "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.0022330707870423794, - "rewards/margins": 0.002052165335044265, - "rewards/rejected": 0.00018090553930960596, + "rewards/chosen": 0.014133408665657043, + "rewards/margins": 0.012079447507858276, + "rewards/rejected": 0.002053960459306836, "step": 80 }, { - "epoch": 0.05, - "learning_rate": 2.3437500000000002e-06, - "logits/chosen": -2.3054585456848145, - "logits/rejected": -2.407442569732666, - "logps/chosen": -230.22653198242188, - "logps/rejected": -270.58624267578125, - "loss": 0.6925, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.0011901370016857982, - "rewards/margins": 1.3230135664343834e-05, - "rewards/rejected": 0.0011769067496061325, + "epoch": 0.09, + "learning_rate": 4.6875000000000004e-06, + "logits/chosen": -2.2905330657958984, + "logits/rejected": -2.254070520401001, + "logps/chosen": -289.9971008300781, + "logps/rejected": -273.9203796386719, + "loss": 0.6847, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.032837510108947754, + "rewards/margins": 0.019895007833838463, + "rewards/rejected": 0.012942495755851269, "step": 90 }, { - "epoch": 0.05, - "learning_rate": 2.604166666666667e-06, - "logits/chosen": -2.317207098007202, - "logits/rejected": -2.224195957183838, - "logps/chosen": -275.38079833984375, - "logps/rejected": -239.62570190429688, - "loss": 0.6916, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.0046446667984128, - "rewards/margins": 0.0029630656354129314, - "rewards/rejected": 0.001681601395830512, + "epoch": 0.1, + "learning_rate": 4.9997324926814375e-06, + "logits/chosen": -2.243922710418701, + "logits/rejected": -2.2560582160949707, + "logps/chosen": -273.7684326171875, + "logps/rejected": -287.9248962402344, + "loss": 0.6812, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.03909907117486, + "rewards/margins": 0.024433141574263573, + "rewards/rejected": 0.014665926806628704, "step": 100 }, { - "epoch": 0.05, - "eval_logits/chosen": -2.262146234512329, - "eval_logits/rejected": -2.2040350437164307, - "eval_logps/chosen": -249.55955505371094, - "eval_logps/rejected": -251.6075439453125, - "eval_loss": 0.6912463307380676, - "eval_rewards/accuracies": 0.6484375, - "eval_rewards/chosen": 0.0059408266097307205, - "eval_rewards/margins": 0.004077494610100985, - "eval_rewards/rejected": 0.0018633321160450578, - "eval_runtime": 100.3497, - "eval_samples_per_second": 19.93, + "epoch": 0.1, + "eval_logits/chosen": -2.251671075820923, + "eval_logits/rejected": -2.194241762161255, + "eval_logps/chosen": -245.63218688964844, + "eval_logps/rejected": -250.59286499023438, + "eval_loss": 0.6787430644035339, + "eval_rewards/accuracies": 0.69921875, + "eval_rewards/chosen": 0.04521488770842552, + "eval_rewards/margins": 0.033204685896635056, + "eval_rewards/rejected": 0.012010199017822742, + "eval_runtime": 100.4664, + "eval_samples_per_second": 19.907, "eval_steps_per_second": 0.319, "step": 100 }, { - "epoch": 0.06, - "learning_rate": 2.8645833333333334e-06, - "logits/chosen": -2.23917818069458, - "logits/rejected": -2.2198050022125244, - "logps/chosen": -259.9859924316406, - "logps/rejected": -238.02651977539062, - "loss": 0.6911, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.005757027771323919, - "rewards/margins": 0.003131809178739786, - "rewards/rejected": 0.002625218825414777, + "epoch": 0.12, + "learning_rate": 4.996723692767927e-06, + "logits/chosen": -2.2775979042053223, + "logits/rejected": -2.168391704559326, + "logps/chosen": -244.2776641845703, + "logps/rejected": -219.0230712890625, + "loss": 0.6751, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.03759222850203514, + "rewards/margins": 0.03267201781272888, + "rewards/rejected": 0.004920212086290121, "step": 110 }, { - "epoch": 0.06, - "learning_rate": 3.125e-06, - "logits/chosen": -2.327700138092041, - "logits/rejected": -2.237442970275879, - "logps/chosen": -261.9295349121094, - "logps/rejected": -265.9600524902344, - "loss": 0.6896, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.010207553394138813, - "rewards/margins": 0.004325833171606064, - "rewards/rejected": 0.005881720222532749, + "epoch": 0.13, + "learning_rate": 4.9903757462135984e-06, + "logits/chosen": -2.20393705368042, + "logits/rejected": -2.0801148414611816, + "logps/chosen": -254.13204956054688, + "logps/rejected": -222.89602661132812, + "loss": 0.6697, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03248171508312225, + "rewards/margins": 0.04965170845389366, + "rewards/rejected": -0.017169995233416557, "step": 120 }, { - "epoch": 0.07, - "learning_rate": 3.385416666666667e-06, - "logits/chosen": -2.362593173980713, - "logits/rejected": -2.199610471725464, - "logps/chosen": -341.5224304199219, - "logps/rejected": -274.25823974609375, - "loss": 0.6874, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.020573467016220093, - "rewards/margins": 0.017041604965925217, - "rewards/rejected": 0.003531861351802945, + "epoch": 0.14, + "learning_rate": 4.980697142834315e-06, + "logits/chosen": -2.3278605937957764, + "logits/rejected": -2.1921324729919434, + "logps/chosen": -306.6998596191406, + "logps/rejected": -257.21942138671875, + "loss": 0.661, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0004948576679453254, + "rewards/margins": 0.06153837591409683, + "rewards/rejected": -0.062033236026763916, "step": 130 }, { - "epoch": 0.07, - "learning_rate": 3.6458333333333333e-06, - "logits/chosen": -2.2749550342559814, - "logits/rejected": -2.189138650894165, - "logps/chosen": -267.3360900878906, - "logps/rejected": -275.9014587402344, - "loss": 0.6851, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.03031373955309391, - "rewards/margins": 0.014086397364735603, - "rewards/rejected": 0.016227342188358307, + "epoch": 0.15, + "learning_rate": 4.967700826904229e-06, + "logits/chosen": -2.1966867446899414, + "logits/rejected": -2.082227945327759, + "logps/chosen": -234.10623168945312, + "logps/rejected": -225.2598114013672, + "loss": 0.649, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.027638202533125877, + "rewards/margins": 0.09491725265979767, + "rewards/rejected": -0.1225554570555687, "step": 140 }, { - "epoch": 0.08, - "learning_rate": 3.90625e-06, - "logits/chosen": -2.2647595405578613, - "logits/rejected": -2.031757354736328, - "logps/chosen": -302.0655212402344, - "logps/rejected": -256.95208740234375, - "loss": 0.6793, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.02602318860590458, - "rewards/margins": 0.022284485399723053, - "rewards/rejected": 0.003738699946552515, + "epoch": 0.16, + "learning_rate": 4.951404179843963e-06, + "logits/chosen": -2.0880074501037598, + "logits/rejected": -2.0883960723876953, + "logps/chosen": -279.95965576171875, + "logps/rejected": -290.29327392578125, + "loss": 0.6478, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.09183233976364136, + "rewards/margins": 0.10491160303354263, + "rewards/rejected": -0.1967439353466034, "step": 150 }, { - "epoch": 0.08, - "learning_rate": 4.166666666666667e-06, - "logits/chosen": -2.0317959785461426, - "logits/rejected": -1.9486854076385498, - "logps/chosen": -263.51666259765625, - "logps/rejected": -237.7399139404297, - "loss": 0.6759, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.034223657101392746, - "rewards/margins": 0.04548361897468567, - "rewards/rejected": -0.011259960941970348, + "epoch": 0.17, + "learning_rate": 4.931828996974498e-06, + "logits/chosen": -2.1437039375305176, + "logits/rejected": -2.035672187805176, + "logps/chosen": -293.6642761230469, + "logps/rejected": -308.59246826171875, + "loss": 0.6161, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.10774645954370499, + "rewards/margins": 0.21509592235088348, + "rewards/rejected": -0.32284238934516907, "step": 160 }, { - "epoch": 0.09, - "learning_rate": 4.427083333333334e-06, - "logits/chosen": -2.257275342941284, - "logits/rejected": -2.2866082191467285, - "logps/chosen": -302.7249450683594, - "logps/rejected": -286.78643798828125, - "loss": 0.6743, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.03409689664840698, - "rewards/margins": 0.028191696852445602, - "rewards/rejected": 0.005905195139348507, + "epoch": 0.18, + "learning_rate": 4.909001458367867e-06, + "logits/chosen": -2.1584107875823975, + "logits/rejected": -1.9964697360992432, + "logps/chosen": -278.0213928222656, + "logps/rejected": -249.1472930908203, + "loss": 0.6207, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.1264086812734604, + "rewards/margins": 0.18005268275737762, + "rewards/rejected": -0.306461364030838, "step": 170 }, { - "epoch": 0.09, - "learning_rate": 4.6875000000000004e-06, - "logits/chosen": -2.2668297290802, - "logits/rejected": -2.165213108062744, - "logps/chosen": -275.1216735839844, - "logps/rejected": -266.6285705566406, - "loss": 0.6638, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.05305374786257744, - "rewards/margins": 0.08881576359272003, - "rewards/rejected": -0.0357620045542717, + "epoch": 0.19, + "learning_rate": 4.882952093833628e-06, + "logits/chosen": -2.049844741821289, + "logits/rejected": -2.0112996101379395, + "logps/chosen": -238.4535369873047, + "logps/rejected": -298.28375244140625, + "loss": 0.6092, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26498228311538696, + "rewards/margins": 0.2150886058807373, + "rewards/rejected": -0.4800708293914795, "step": 180 }, { - "epoch": 0.1, - "learning_rate": 4.947916666666667e-06, - "logits/chosen": -2.131401777267456, - "logits/rejected": -2.138788938522339, - "logps/chosen": -251.2366943359375, - "logps/rejected": -256.0220031738281, - "loss": 0.6622, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.0315600261092186, - "rewards/margins": 0.050549279898405075, - "rewards/rejected": -0.08210931718349457, + "epoch": 0.2, + "learning_rate": 4.853715742087947e-06, + "logits/chosen": -2.2220816612243652, + "logits/rejected": -2.0675864219665527, + "logps/chosen": -294.275390625, + "logps/rejected": -279.9860534667969, + "loss": 0.6071, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.271901398897171, + "rewards/margins": 0.23620745539665222, + "rewards/rejected": -0.5081088542938232, "step": 190 }, { - "epoch": 0.1, - "learning_rate": 4.9997328038213385e-06, - "logits/chosen": -2.2219460010528564, - "logits/rejected": -2.232409954071045, - "logps/chosen": -310.5309143066406, - "logps/rejected": -345.9176330566406, - "loss": 0.655, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.03254920616745949, - "rewards/margins": 0.11690758168697357, - "rewards/rejected": -0.14945678412914276, + "epoch": 0.21, + "learning_rate": 4.821331504159906e-06, + "logits/chosen": -2.120783567428589, + "logits/rejected": -2.0360219478607178, + "logps/chosen": -284.9825134277344, + "logps/rejected": -325.1084899902344, + "loss": 0.6066, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.18728381395339966, + "rewards/margins": 0.21014633774757385, + "rewards/rejected": -0.3974301517009735, "step": 200 }, { - "epoch": 0.1, - "eval_logits/chosen": -2.161393165588379, - "eval_logits/rejected": -2.101147174835205, - "eval_logps/chosen": -255.74208068847656, - "eval_logps/rejected": -269.41058349609375, - "eval_loss": 0.6498140692710876, - "eval_rewards/accuracies": 0.70703125, - "eval_rewards/chosen": -0.055884458124637604, - "eval_rewards/margins": 0.1202826276421547, - "eval_rewards/rejected": -0.1761670857667923, - "eval_runtime": 100.2506, - "eval_samples_per_second": 19.95, - "eval_steps_per_second": 0.319, + "epoch": 0.21, + "eval_logits/chosen": -2.0610172748565674, + "eval_logits/rejected": -1.9906072616577148, + "eval_logps/chosen": -273.18548583984375, + "eval_logps/rejected": -301.99749755859375, + "eval_loss": 0.6150946021080017, + "eval_rewards/accuracies": 0.69921875, + "eval_rewards/chosen": -0.23031830787658691, + "eval_rewards/margins": 0.2717178463935852, + "eval_rewards/rejected": -0.5020360946655273, + "eval_runtime": 100.7938, + "eval_samples_per_second": 19.842, + "eval_steps_per_second": 0.317, "step": 200 }, { - "epoch": 0.11, - "learning_rate": 4.998647417232375e-06, - "logits/chosen": -2.235889434814453, - "logits/rejected": -2.119898796081543, - "logps/chosen": -243.06777954101562, - "logps/rejected": -230.5707244873047, - "loss": 0.6378, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.10976459830999374, - "rewards/margins": 0.07079415768384933, - "rewards/rejected": -0.18055877089500427, + "epoch": 0.22, + "learning_rate": 4.7858426910973435e-06, + "logits/chosen": -2.1503536701202393, + "logits/rejected": -1.9159533977508545, + "logps/chosen": -337.6061706542969, + "logps/rejected": -290.7955322265625, + "loss": 0.6126, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.21691349148750305, + "rewards/margins": 0.3816342353820801, + "rewards/rejected": -0.5985476970672607, "step": 210 }, { - "epoch": 0.12, - "learning_rate": 4.996727502703358e-06, - "logits/chosen": -2.109752893447876, - "logits/rejected": -1.9883226156234741, - "logps/chosen": -282.5257263183594, - "logps/rejected": -262.3667907714844, - "loss": 0.6403, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.185433030128479, - "rewards/margins": 0.17308147251605988, - "rewards/rejected": -0.3585144877433777, + "epoch": 0.23, + "learning_rate": 4.747296766042161e-06, + "logits/chosen": -2.0444750785827637, + "logits/rejected": -1.9728294610977173, + "logps/chosen": -281.8607177734375, + "logps/rejected": -295.66534423828125, + "loss": 0.6085, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.19859151542186737, + "rewards/margins": 0.28593915700912476, + "rewards/rejected": -0.4845306873321533, "step": 220 }, { - "epoch": 0.12, - "learning_rate": 4.993973701470142e-06, - "logits/chosen": -2.0258140563964844, - "logits/rejected": -1.9273954629898071, - "logps/chosen": -300.9564208984375, - "logps/rejected": -278.96063232421875, - "loss": 0.6426, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.10224632173776627, - "rewards/margins": 0.19058939814567566, - "rewards/rejected": -0.29283568263053894, + "epoch": 0.24, + "learning_rate": 4.705745280752586e-06, + "logits/chosen": -2.0718815326690674, + "logits/rejected": -1.9338264465332031, + "logps/chosen": -306.795166015625, + "logps/rejected": -283.50030517578125, + "loss": 0.6077, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2848009765148163, + "rewards/margins": 0.24246558547019958, + "rewards/rejected": -0.5272665619850159, "step": 230 }, { - "epoch": 0.13, - "learning_rate": 4.990386933279973e-06, - "logits/chosen": -2.1792609691619873, - "logits/rejected": -2.0059762001037598, - "logps/chosen": -245.21896362304688, - "logps/rejected": -228.4906463623047, - "loss": 0.6284, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.21190258860588074, - "rewards/margins": 0.14619460701942444, - "rewards/rejected": -0.3580971658229828, + "epoch": 0.25, + "learning_rate": 4.661243806657256e-06, + "logits/chosen": -1.994541883468628, + "logits/rejected": -1.8617355823516846, + "logps/chosen": -278.3886413574219, + "logps/rejected": -256.62603759765625, + "loss": 0.6076, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.27843061089515686, + "rewards/margins": 0.25517329573631287, + "rewards/rejected": -0.5336039066314697, "step": 240 }, { - "epoch": 0.13, - "learning_rate": 4.985968396084284e-06, - "logits/chosen": -2.1072471141815186, - "logits/rejected": -2.025906801223755, - "logps/chosen": -308.6363220214844, - "logps/rejected": -286.33856201171875, - "loss": 0.6333, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.31365102529525757, - "rewards/margins": 0.10651323944330215, - "rewards/rejected": -0.4201642870903015, + "epoch": 0.26, + "learning_rate": 4.613851860533367e-06, + "logits/chosen": -1.980902910232544, + "logits/rejected": -1.8700807094573975, + "logps/chosen": -309.05877685546875, + "logps/rejected": -351.293212890625, + "loss": 0.6002, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37613645195961, + "rewards/margins": 0.3849337697029114, + "rewards/rejected": -0.761070191860199, "step": 250 }, { - "epoch": 0.14, - "learning_rate": 4.980719565638611e-06, - "logits/chosen": -2.294565439224243, - "logits/rejected": -2.056588649749756, - "logps/chosen": -355.87286376953125, - "logps/rejected": -298.9410095214844, - "loss": 0.6238, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.19843360781669617, - "rewards/margins": 0.2138761281967163, - "rewards/rejected": -0.41230979561805725, + "epoch": 0.27, + "learning_rate": 4.563632824908252e-06, + "logits/chosen": -1.9446996450424194, + "logits/rejected": -1.862592339515686, + "logps/chosen": -309.58544921875, + "logps/rejected": -325.975341796875, + "loss": 0.5955, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4704383909702301, + "rewards/margins": 0.2729704976081848, + "rewards/rejected": -0.7434087991714478, "step": 260 }, { - "epoch": 0.14, - "learning_rate": 4.974642195009681e-06, - "logits/chosen": -2.1215481758117676, - "logits/rejected": -2.0347511768341064, - "logps/chosen": -221.60385131835938, - "logps/rejected": -250.5578155517578, - "loss": 0.5957, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.19110050797462463, - "rewards/margins": 0.27878326177597046, - "rewards/rejected": -0.4698837697505951, + "epoch": 0.28, + "learning_rate": 4.510653863290871e-06, + "logits/chosen": -2.056185483932495, + "logits/rejected": -1.9320430755615234, + "logps/chosen": -311.8960876464844, + "logps/rejected": -307.3726501464844, + "loss": 0.5843, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.35895800590515137, + "rewards/margins": 0.3666152358055115, + "rewards/rejected": -0.7255732417106628, "step": 270 }, { - "epoch": 0.15, - "learning_rate": 4.967738313989918e-06, - "logits/chosen": -1.9943937063217163, - "logits/rejected": -1.8106372356414795, - "logps/chosen": -285.83843994140625, - "logps/rejected": -252.4984893798828, - "loss": 0.631, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.25647464394569397, - "rewards/margins": 0.044119417667388916, - "rewards/rejected": -0.3005940318107605, + "epoch": 0.29, + "learning_rate": 4.454985830346574e-06, + "logits/chosen": -2.163935422897339, + "logits/rejected": -2.0811800956726074, + "logps/chosen": -313.5486145019531, + "logps/rejected": -329.7764587402344, + "loss": 0.595, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3059764504432678, + "rewards/margins": 0.3349177837371826, + "rewards/rejected": -0.6408942937850952, "step": 280 }, { - "epoch": 0.15, - "learning_rate": 4.960010228419499e-06, - "logits/chosen": -1.949554204940796, - "logits/rejected": -2.05960750579834, - "logps/chosen": -259.6241760253906, - "logps/rejected": -297.25799560546875, - "loss": 0.6349, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.2845107913017273, - "rewards/margins": 0.24759821593761444, - "rewards/rejected": -0.5321090221405029, + "epoch": 0.3, + "learning_rate": 4.396703177135262e-06, + "logits/chosen": -2.090761423110962, + "logits/rejected": -2.0192935466766357, + "logps/chosen": -313.79547119140625, + "logps/rejected": -292.3644104003906, + "loss": 0.5862, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.42916345596313477, + "rewards/margins": 0.19854620099067688, + "rewards/rejected": -0.6277095675468445, "step": 290 }, { - "epoch": 0.16, - "learning_rate": 4.951460519416228e-06, - "logits/chosen": -1.9813953638076782, - "logits/rejected": -1.8782306909561157, - "logps/chosen": -343.69256591796875, - "logps/rejected": -342.38494873046875, - "loss": 0.6342, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.33312922716140747, - "rewards/margins": 0.11881290376186371, - "rewards/rejected": -0.45194211602211, + "epoch": 0.31, + "learning_rate": 4.335883851539693e-06, + "logits/chosen": -2.0020194053649902, + "logits/rejected": -1.8026950359344482, + "logps/chosen": -264.9991455078125, + "logps/rejected": -290.67193603515625, + "loss": 0.5711, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.41096872091293335, + "rewards/margins": 0.36055171489715576, + "rewards/rejected": -0.7715204358100891, "step": 300 }, { - "epoch": 0.16, - "eval_logits/chosen": -1.9793298244476318, - "eval_logits/rejected": -1.903726577758789, - "eval_logps/chosen": -284.22235107421875, - "eval_logps/rejected": -314.48388671875, - "eval_loss": 0.6145588159561157, - "eval_rewards/accuracies": 0.703125, - "eval_rewards/chosen": -0.3406871259212494, - "eval_rewards/margins": 0.2862129211425781, - "eval_rewards/rejected": -0.6268999576568604, - "eval_runtime": 100.8595, - "eval_samples_per_second": 19.83, - "eval_steps_per_second": 0.317, + "epoch": 0.31, + "eval_logits/chosen": -2.0222675800323486, + "eval_logits/rejected": -1.9416760206222534, + "eval_logps/chosen": -294.5666198730469, + "eval_logps/rejected": -336.92279052734375, + "eval_loss": 0.5926566123962402, + "eval_rewards/accuracies": 0.71875, + "eval_rewards/chosen": -0.4441298246383667, + "eval_rewards/margins": 0.40715909004211426, + "eval_rewards/rejected": -0.851288914680481, + "eval_runtime": 100.4732, + "eval_samples_per_second": 19.906, + "eval_steps_per_second": 0.318, "step": 300 }, { - "epoch": 0.16, - "learning_rate": 4.942092042513459e-06, - "logits/chosen": -2.022141933441162, - "logits/rejected": -1.8856008052825928, - "logps/chosen": -315.6492919921875, - "logps/rejected": -310.83612060546875, - "loss": 0.5955, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.2830706834793091, - "rewards/margins": 0.34754693508148193, - "rewards/rejected": -0.630617618560791, + "epoch": 0.32, + "learning_rate": 4.2726091940171055e-06, + "logits/chosen": -1.9914287328720093, + "logits/rejected": -1.8461036682128906, + "logps/chosen": -303.94287109375, + "logps/rejected": -359.288330078125, + "loss": 0.5712, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4550764560699463, + "rewards/margins": 0.5030550956726074, + "rewards/rejected": -0.9581316113471985, "step": 310 }, { - "epoch": 0.17, - "learning_rate": 4.931907926706374e-06, - "logits/chosen": -2.045828342437744, - "logits/rejected": -1.9249250888824463, - "logps/chosen": -309.50921630859375, - "logps/rejected": -360.74273681640625, - "loss": 0.5891, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.31072211265563965, - "rewards/margins": 0.24828410148620605, - "rewards/rejected": -0.5590062141418457, + "epoch": 0.33, + "learning_rate": 4.206963828813555e-06, + "logits/chosen": -1.9920743703842163, + "logits/rejected": -1.8297895193099976, + "logps/chosen": -330.64007568359375, + "logps/rejected": -369.685302734375, + "loss": 0.5752, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5053039789199829, + "rewards/margins": 0.44878053665161133, + "rewards/rejected": -0.9540846943855286, "step": 320 }, { - "epoch": 0.17, - "learning_rate": 4.920911573406925e-06, - "logits/chosen": -2.1581883430480957, - "logits/rejected": -2.04437255859375, - "logps/chosen": -274.237548828125, - "logps/rejected": -290.13446044921875, - "loss": 0.604, + "epoch": 0.35, + "learning_rate": 4.139035550786495e-06, + "logits/chosen": -1.977081298828125, + "logits/rejected": -1.7652740478515625, + "logps/chosen": -318.32421875, + "logps/rejected": -294.5540466308594, + "loss": 0.6065, "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.25217825174331665, - "rewards/margins": 0.2510914206504822, - "rewards/rejected": -0.5032696723937988, + "rewards/chosen": -0.5532656311988831, + "rewards/margins": 0.37579816579818726, + "rewards/rejected": -0.9290636777877808, "step": 330 }, { - "epoch": 0.18, - "learning_rate": 4.9091066553077875e-06, - "logits/chosen": -1.9913349151611328, - "logits/rejected": -1.752171277999878, - "logps/chosen": -317.76812744140625, - "logps/rejected": -265.75146484375, - "loss": 0.6041, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.3602680563926697, - "rewards/margins": 0.32529839873313904, - "rewards/rejected": -0.6855664253234863, + "epoch": 0.36, + "learning_rate": 4.068915207986931e-06, + "logits/chosen": -1.8924455642700195, + "logits/rejected": -1.7873607873916626, + "logps/chosen": -267.74505615234375, + "logps/rejected": -279.3656921386719, + "loss": 0.6171, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5295882225036621, + "rewards/margins": 0.28365662693977356, + "rewards/rejected": -0.8132449388504028, "step": 340 }, { - "epoch": 0.18, - "learning_rate": 4.8964971151557095e-06, - "logits/chosen": -2.0480399131774902, - "logits/rejected": -2.013430118560791, - "logps/chosen": -273.3092346191406, - "logps/rejected": -343.83270263671875, - "loss": 0.5753, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.5287691354751587, - "rewards/margins": 0.2307828664779663, - "rewards/rejected": -0.7595520615577698, + "epoch": 0.37, + "learning_rate": 3.996696580158211e-06, + "logits/chosen": -1.8571460247039795, + "logits/rejected": -1.9016132354736328, + "logps/chosen": -282.2213134765625, + "logps/rejected": -360.13616943359375, + "loss": 0.5854, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5124384164810181, + "rewards/margins": 0.5215515494346619, + "rewards/rejected": -1.0339899063110352, "step": 350 }, { - "epoch": 0.19, - "learning_rate": 4.883087164434672e-06, - "logits/chosen": -1.888958215713501, - "logits/rejected": -1.835721731185913, - "logps/chosen": -251.35986328125, - "logps/rejected": -312.13140869140625, - "loss": 0.6004, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.4788152277469635, - "rewards/margins": 0.3157396912574768, - "rewards/rejected": -0.7945548295974731, + "epoch": 0.38, + "learning_rate": 3.922476253313921e-06, + "logits/chosen": -2.048137664794922, + "logits/rejected": -1.8757222890853882, + "logps/chosen": -334.61102294921875, + "logps/rejected": -348.38677978515625, + "loss": 0.577, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5598105192184448, + "rewards/margins": 0.36239486932754517, + "rewards/rejected": -0.9222054481506348, "step": 360 }, { - "epoch": 0.19, - "learning_rate": 4.868881281959282e-06, - "logits/chosen": -2.2465357780456543, - "logits/rejected": -2.0700020790100098, - "logps/chosen": -337.7215881347656, - "logps/rejected": -335.09942626953125, - "loss": 0.5924, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.2437172830104828, - "rewards/margins": 0.3737484812736511, - "rewards/rejected": -0.6174657940864563, + "epoch": 0.39, + "learning_rate": 3.846353490562664e-06, + "logits/chosen": -1.8339828252792358, + "logits/rejected": -1.7505193948745728, + "logps/chosen": -286.8439025878906, + "logps/rejected": -329.70751953125, + "loss": 0.5692, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.46078333258628845, + "rewards/margins": 0.2924962639808655, + "rewards/rejected": -0.7532795667648315, "step": 370 }, { - "epoch": 0.2, - "learning_rate": 4.853884212378889e-06, - "logits/chosen": -2.1131327152252197, - "logits/rejected": -1.9588050842285156, - "logps/chosen": -246.9884033203125, - "logps/rejected": -242.58206176757812, - "loss": 0.5758, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.261677622795105, - "rewards/margins": 0.31416845321655273, - "rewards/rejected": -0.5758460760116577, + "epoch": 0.4, + "learning_rate": 3.768430099352445e-06, + "logits/chosen": -1.9885179996490479, + "logits/rejected": -1.9461723566055298, + "logps/chosen": -340.37432861328125, + "logps/rejected": -348.7174377441406, + "loss": 0.5628, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5099226236343384, + "rewards/margins": 0.3366420865058899, + "rewards/rejected": -0.846564769744873, "step": 380 }, { - "epoch": 0.2, - "learning_rate": 4.8381009645929044e-06, - "logits/chosen": -2.05610990524292, - "logits/rejected": -1.873307466506958, - "logps/chosen": -315.9834289550781, - "logps/rejected": -398.42181396484375, - "loss": 0.5601, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.45816677808761597, - "rewards/margins": 0.4635772705078125, - "rewards/rejected": -0.9217440485954285, + "epoch": 0.41, + "learning_rate": 3.6888102953122307e-06, + "logits/chosen": -1.7645511627197266, + "logits/rejected": -1.5222413539886475, + "logps/chosen": -293.2809753417969, + "logps/rejected": -302.76483154296875, + "loss": 0.586, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.5093228220939636, + "rewards/margins": 0.5271816253662109, + "rewards/rejected": -1.0365045070648193, "step": 390 }, { - "epoch": 0.21, - "learning_rate": 4.821536810077878e-06, - "logits/chosen": -1.9345449209213257, - "logits/rejected": -1.9163599014282227, - "logps/chosen": -311.0472106933594, - "logps/rejected": -330.16021728515625, - "loss": 0.6121, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -0.48705682158470154, - "rewards/margins": 0.1697099208831787, - "rewards/rejected": -0.6567667722702026, + "epoch": 0.42, + "learning_rate": 3.607600562872785e-06, + "logits/chosen": -2.1015868186950684, + "logits/rejected": -1.896120309829712, + "logps/chosen": -386.51507568359375, + "logps/rejected": -355.5737609863281, + "loss": 0.557, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6686216592788696, + "rewards/margins": 0.3796016275882721, + "rewards/rejected": -1.0482232570648193, "step": 400 }, { - "epoch": 0.21, - "eval_logits/chosen": -1.9542992115020752, - "eval_logits/rejected": -1.8716992139816284, - "eval_logps/chosen": -296.7203063964844, - "eval_logps/rejected": -340.9551086425781, - "eval_loss": 0.5946135520935059, - "eval_rewards/accuracies": 0.703125, - "eval_rewards/chosen": -0.4656665325164795, - "eval_rewards/margins": 0.42594534158706665, - "eval_rewards/rejected": -0.8916119337081909, - "eval_runtime": 100.2068, - "eval_samples_per_second": 19.959, + "epoch": 0.42, + "eval_logits/chosen": -1.8364356756210327, + "eval_logits/rejected": -1.743373155593872, + "eval_logps/chosen": -309.73779296875, + "eval_logps/rejected": -359.1117248535156, + "eval_loss": 0.5816810727119446, + "eval_rewards/accuracies": 0.72265625, + "eval_rewards/chosen": -0.5958413481712341, + "eval_rewards/margins": 0.47733715176582336, + "eval_rewards/rejected": -1.0731785297393799, + "eval_runtime": 100.2581, + "eval_samples_per_second": 19.949, "eval_steps_per_second": 0.319, "step": 400 }, { - "epoch": 0.21, - "learning_rate": 4.804197281126862e-06, - "logits/chosen": -2.094160795211792, - "logits/rejected": -1.9404535293579102, - "logps/chosen": -356.2447814941406, - "logps/rejected": -329.25543212890625, - "loss": 0.6201, + "epoch": 0.43, + "learning_rate": 3.5249095128531863e-06, + "logits/chosen": -2.0195209980010986, + "logits/rejected": -1.954064130783081, + "logps/chosen": -382.4500732421875, + "logps/rejected": -396.9842834472656, + "loss": 0.5825, "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.4695979058742523, - "rewards/margins": 0.3254204988479614, - "rewards/rejected": -0.7950183153152466, + "rewards/chosen": -0.507409393787384, + "rewards/margins": 0.45680397748947144, + "rewards/rejected": -0.964213490486145, "step": 410 }, { - "epoch": 0.22, - "learning_rate": 4.786088169001671e-06, - "logits/chosen": -2.0321784019470215, - "logits/rejected": -1.6601848602294922, - "logps/chosen": -354.99945068359375, - "logps/rejected": -307.051025390625, - "loss": 0.5817, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.3245474398136139, - "rewards/margins": 0.6246824860572815, - "rewards/rejected": -0.949229896068573, + "epoch": 0.44, + "learning_rate": 3.4408477372034743e-06, + "logits/chosen": -1.6744167804718018, + "logits/rejected": -1.5238146781921387, + "logps/chosen": -277.4457702636719, + "logps/rejected": -303.87213134765625, + "loss": 0.5778, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5154739022254944, + "rewards/margins": 0.3824000358581543, + "rewards/rejected": -0.8978738784790039, "step": 420 }, { - "epoch": 0.23, - "learning_rate": 4.767215521998649e-06, - "logits/chosen": -1.9951956272125244, - "logits/rejected": -1.843869924545288, - "logps/chosen": -316.816162109375, - "logps/rejected": -329.95159912109375, - "loss": 0.5718, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.3314690589904785, - "rewards/margins": 0.5325202345848083, - "rewards/rejected": -0.8639893531799316, + "epoch": 0.45, + "learning_rate": 3.355527661097728e-06, + "logits/chosen": -1.8267688751220703, + "logits/rejected": -1.8169567584991455, + "logps/chosen": -271.2919616699219, + "logps/rejected": -293.12933349609375, + "loss": 0.5999, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4637935757637024, + "rewards/margins": 0.3348856568336487, + "rewards/rejected": -0.7986792325973511, "step": 430 }, { - "epoch": 0.23, - "learning_rate": 4.747585643428586e-06, - "logits/chosen": -1.8855489492416382, - "logits/rejected": -1.8727896213531494, - "logps/chosen": -270.5180969238281, - "logps/rejected": -311.8236999511719, - "loss": 0.6055, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.30184242129325867, - "rewards/margins": 0.3076760172843933, - "rewards/rejected": -0.6095184087753296, + "epoch": 0.46, + "learning_rate": 3.269063392575352e-06, + "logits/chosen": -1.7343038320541382, + "logits/rejected": -1.7039591073989868, + "logps/chosen": -298.5570983886719, + "logps/rejected": -324.7281799316406, + "loss": 0.5865, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.56922447681427, + "rewards/margins": 0.3157515227794647, + "rewards/rejected": -0.8849760293960571, "step": 440 }, { - "epoch": 0.24, - "learning_rate": 4.727205089511466e-06, - "logits/chosen": -1.9391940832138062, - "logits/rejected": -1.7669153213500977, - "logps/chosen": -310.2550048828125, - "logps/rejected": -332.0259094238281, - "loss": 0.6037, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.5411502718925476, - "rewards/margins": 0.3224232792854309, - "rewards/rejected": -0.8635735511779785, + "epoch": 0.47, + "learning_rate": 3.181570569931697e-06, + "logits/chosen": -1.3839043378829956, + "logits/rejected": -1.3726271390914917, + "logps/chosen": -274.1092529296875, + "logps/rejected": -318.86492919921875, + "loss": 0.578, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5215469598770142, + "rewards/margins": 0.42288708686828613, + "rewards/rejected": -0.9444340467453003, "step": 450 }, { - "epoch": 0.24, - "learning_rate": 4.7060806671867386e-06, - "logits/chosen": -1.8398698568344116, - "logits/rejected": -1.672258973121643, - "logps/chosen": -349.76824951171875, - "logps/rejected": -309.33258056640625, - "loss": 0.5768, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.4927810728549957, - "rewards/margins": 0.44175705313682556, - "rewards/rejected": -0.9345381855964661, + "epoch": 0.48, + "learning_rate": 3.09316620706208e-06, + "logits/chosen": -1.7264198064804077, + "logits/rejected": -1.616563081741333, + "logps/chosen": -324.797119140625, + "logps/rejected": -361.42303466796875, + "loss": 0.585, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5468829274177551, + "rewards/margins": 0.3461124002933502, + "rewards/rejected": -0.8929953575134277, "step": 460 }, { - "epoch": 0.25, - "learning_rate": 4.68421943183986e-06, - "logits/chosen": -1.790315866470337, - "logits/rejected": -1.7293519973754883, - "logps/chosen": -277.6053771972656, - "logps/rejected": -304.4560852050781, - "loss": 0.5733, + "epoch": 0.49, + "learning_rate": 3.0039685369660785e-06, + "logits/chosen": -1.6359647512435913, + "logits/rejected": -1.3307011127471924, + "logps/chosen": -346.19561767578125, + "logps/rejected": -339.08978271484375, + "loss": 0.5644, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.48922044038772583, - "rewards/margins": 0.4518005847930908, - "rewards/rejected": -0.9410210847854614, + "rewards/chosen": -0.6865732669830322, + "rewards/margins": 0.44049325585365295, + "rewards/rejected": -1.1270664930343628, "step": 470 }, { - "epoch": 0.25, - "learning_rate": 4.661628684945851e-06, - "logits/chosen": -1.8333717584609985, - "logits/rejected": -1.5805283784866333, - "logps/chosen": -326.5656433105469, - "logps/rejected": -273.71636962890625, - "loss": 0.5815, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.5415780544281006, - "rewards/margins": 0.23381216824054718, - "rewards/rejected": -0.7753901481628418, + "epoch": 0.5, + "learning_rate": 2.91409685362137e-06, + "logits/chosen": -1.1181867122650146, + "logits/rejected": -0.9776461720466614, + "logps/chosen": -293.79425048828125, + "logps/rejected": -316.5558166503906, + "loss": 0.6066, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.7594456076622009, + "rewards/margins": 0.25457096099853516, + "rewards/rejected": -1.0140166282653809, "step": 480 }, { - "epoch": 0.26, - "learning_rate": 4.638315971630662e-06, - "logits/chosen": -1.9222495555877686, - "logits/rejected": -1.776019811630249, - "logps/chosen": -284.45068359375, - "logps/rejected": -410.24859619140625, - "loss": 0.5644, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.4614255428314209, - "rewards/margins": 0.6256455779075623, - "rewards/rejected": -1.0870710611343384, + "epoch": 0.51, + "learning_rate": 2.8236713524386085e-06, + "logits/chosen": -1.3658815622329712, + "logits/rejected": -1.1311851739883423, + "logps/chosen": -310.53594970703125, + "logps/rejected": -355.07257080078125, + "loss": 0.5688, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6269619464874268, + "rewards/margins": 0.4374374449253082, + "rewards/rejected": -1.0643993616104126, "step": 490 }, { - "epoch": 0.26, - "learning_rate": 4.614289078151164e-06, - "logits/chosen": -1.8076536655426025, - "logits/rejected": -1.6980488300323486, - "logps/chosen": -335.1206970214844, - "logps/rejected": -314.8514099121094, - "loss": 0.5973, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.3053855001926422, - "rewards/margins": 0.3548199236392975, - "rewards/rejected": -0.6602053642272949, + "epoch": 0.52, + "learning_rate": 2.7328129695107205e-06, + "logits/chosen": -1.2549781799316406, + "logits/rejected": -0.9512443542480469, + "logps/chosen": -343.2566833496094, + "logps/rejected": -383.65936279296875, + "loss": 0.5703, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.7175710201263428, + "rewards/margins": 0.569162905216217, + "rewards/rejected": -1.286733865737915, "step": 500 }, { - "epoch": 0.26, - "eval_logits/chosen": -1.9282360076904297, - "eval_logits/rejected": -1.843964695930481, - "eval_logps/chosen": -286.9665832519531, - "eval_logps/rejected": -329.4521789550781, - "eval_loss": 0.593827486038208, - "eval_rewards/accuracies": 0.73046875, - "eval_rewards/chosen": -0.36812952160835266, - "eval_rewards/margins": 0.40845340490341187, - "eval_rewards/rejected": -0.7765828967094421, - "eval_runtime": 99.738, - "eval_samples_per_second": 20.053, - "eval_steps_per_second": 0.321, + "epoch": 0.52, + "eval_logits/chosen": -0.9966566562652588, + "eval_logits/rejected": -0.8466635942459106, + "eval_logps/chosen": -322.3067932128906, + "eval_logps/rejected": -375.8402099609375, + "eval_loss": 0.5679102540016174, + "eval_rewards/accuracies": 0.7265625, + "eval_rewards/chosen": -0.7215311527252197, + "eval_rewards/margins": 0.5189324617385864, + "eval_rewards/rejected": -1.2404634952545166, + "eval_runtime": 100.2462, + "eval_samples_per_second": 19.951, + "eval_steps_per_second": 0.319, "step": 500 }, { - "epoch": 0.27, - "learning_rate": 4.5895560292946e-06, - "logits/chosen": -1.807668685913086, - "logits/rejected": -1.859532356262207, - "logps/chosen": -280.341552734375, - "logps/rejected": -335.6524963378906, - "loss": 0.5652, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.39417344331741333, - "rewards/margins": 0.301259309053421, - "rewards/rejected": -0.695432722568512, + "epoch": 0.53, + "learning_rate": 2.641643219871597e-06, + "logits/chosen": -1.133755087852478, + "logits/rejected": -0.6763060092926025, + "logps/chosen": -343.71722412109375, + "logps/rejected": -342.73590087890625, + "loss": 0.5469, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.820611834526062, + "rewards/margins": 0.41005903482437134, + "rewards/rejected": -1.2306709289550781, "step": 510 }, { - "epoch": 0.27, - "learning_rate": 4.564125085698375e-06, - "logits/chosen": -1.920212984085083, - "logits/rejected": -1.680850625038147, - "logps/chosen": -333.3331604003906, - "logps/rejected": -321.528076171875, - "loss": 0.6101, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.49174195528030396, - "rewards/margins": 0.35194143652915955, - "rewards/rejected": -0.8436833620071411, + "epoch": 0.54, + "learning_rate": 2.5502840349805074e-06, + "logits/chosen": -1.0033422708511353, + "logits/rejected": -0.8227261304855347, + "logps/chosen": -342.0159606933594, + "logps/rejected": -367.8998718261719, + "loss": 0.5467, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.6553717851638794, + "rewards/margins": 0.48232603073120117, + "rewards/rejected": -1.1376978158950806, "step": 520 }, { - "epoch": 0.28, - "learning_rate": 4.538004741091066e-06, - "logits/chosen": -1.8313194513320923, - "logits/rejected": -1.844010591506958, - "logps/chosen": -266.2292175292969, - "logps/rejected": -343.5267639160156, - "loss": 0.546, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.45850616693496704, - "rewards/margins": 0.576072633266449, - "rewards/rejected": -1.0345789194107056, + "epoch": 0.55, + "learning_rate": 2.4588575996495797e-06, + "logits/chosen": -1.015627145767212, + "logits/rejected": -0.7538983225822449, + "logps/chosen": -377.20703125, + "logps/rejected": -394.6551513671875, + "loss": 0.5437, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8781946301460266, + "rewards/margins": 0.4648224413394928, + "rewards/rejected": -1.3430172204971313, "step": 530 }, { - "epoch": 0.28, - "learning_rate": 4.511203719455588e-06, - "logits/chosen": -1.9197680950164795, - "logits/rejected": -1.5895801782608032, - "logps/chosen": -410.54620361328125, - "logps/rejected": -347.0699157714844, - "loss": 0.5822, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.7892423868179321, - "rewards/margins": 0.38583868741989136, - "rewards/rejected": -1.1750810146331787, + "epoch": 0.57, + "learning_rate": 2.367486188632446e-06, + "logits/chosen": -1.0752605199813843, + "logits/rejected": -0.7782305479049683, + "logps/chosen": -355.3316955566406, + "logps/rejected": -417.9510803222656, + "loss": 0.5496, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.738643229007721, + "rewards/margins": 0.645391583442688, + "rewards/rejected": -1.3840347528457642, "step": 540 }, { - "epoch": 0.29, - "learning_rate": 4.483730972115454e-06, - "logits/chosen": -1.8053057193756104, - "logits/rejected": -1.6272900104522705, - "logps/chosen": -340.8621520996094, - "logps/rejected": -371.2051696777344, - "loss": 0.5906, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.9878808856010437, - "rewards/margins": 0.36620211601257324, - "rewards/rejected": -1.3540828227996826, + "epoch": 0.58, + "learning_rate": 2.276292003092593e-06, + "logits/chosen": -0.9742870330810547, + "logits/rejected": -0.5593458414077759, + "logps/chosen": -347.0143127441406, + "logps/rejected": -346.98065185546875, + "loss": 0.5747, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7598128914833069, + "rewards/margins": 0.5269237756729126, + "rewards/rejected": -1.2867367267608643, "step": 550 }, { - "epoch": 0.29, - "learning_rate": 4.455595674745107e-06, - "logits/chosen": -2.011946201324463, - "logits/rejected": -1.9660618305206299, - "logps/chosen": -399.44403076171875, - "logps/rejected": -416.68109130859375, - "loss": 0.5921, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.7561608552932739, - "rewards/margins": 0.454878032207489, - "rewards/rejected": -1.2110389471054077, + "epoch": 0.59, + "learning_rate": 2.1853970071701415e-06, + "logits/chosen": -0.7387961149215698, + "logits/rejected": -0.4743451476097107, + "logps/chosen": -338.78662109375, + "logps/rejected": -341.47320556640625, + "loss": 0.554, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.8780034780502319, + "rewards/margins": 0.41334277391433716, + "rewards/rejected": -1.2913460731506348, "step": 560 }, { - "epoch": 0.3, - "learning_rate": 4.426807224305315e-06, - "logits/chosen": -1.841407060623169, - "logits/rejected": -1.7657171487808228, - "logps/chosen": -358.94525146484375, - "logps/rejected": -366.9837341308594, - "loss": 0.5917, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.8852140307426453, - "rewards/margins": 0.26677149534225464, - "rewards/rejected": -1.1519855260849, + "epoch": 0.6, + "learning_rate": 2.0949227648656194e-06, + "logits/chosen": -0.6523474454879761, + "logits/rejected": -0.389020174741745, + "logps/chosen": -314.07220458984375, + "logps/rejected": -362.8746643066406, + "loss": 0.5822, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8458138704299927, + "rewards/margins": 0.4794779419898987, + "rewards/rejected": -1.3252918720245361, "step": 570 }, { - "epoch": 0.3, - "learning_rate": 4.39737523590467e-06, - "logits/chosen": -1.8554290533065796, - "logits/rejected": -1.747036337852478, - "logps/chosen": -346.8446350097656, - "logps/rejected": -310.76336669921875, - "loss": 0.5537, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.7551018595695496, - "rewards/margins": 0.27851471304893494, - "rewards/rejected": -1.033616542816162, + "epoch": 0.61, + "learning_rate": 2.00499027745888e-06, + "logits/chosen": -0.516838788986206, + "logits/rejected": -0.14539854228496552, + "logps/chosen": -334.552001953125, + "logps/rejected": -365.7865295410156, + "loss": 0.5619, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7920180559158325, + "rewards/margins": 0.6189525723457336, + "rewards/rejected": -1.410970687866211, "step": 580 }, { - "epoch": 0.31, - "learning_rate": 4.367309539588208e-06, - "logits/chosen": -1.7877289056777954, - "logits/rejected": -1.508073091506958, - "logps/chosen": -303.0578308105469, - "logps/rejected": -299.3207702636719, - "loss": 0.5666, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.6448020935058594, - "rewards/margins": 0.3490978181362152, - "rewards/rejected": -0.993899941444397, + "epoch": 0.62, + "learning_rate": 1.915719821680624e-06, + "logits/chosen": -0.4588368833065033, + "logits/rejected": -0.17827937006950378, + "logps/chosen": -302.94464111328125, + "logps/rejected": -334.8900146484375, + "loss": 0.5691, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7475640773773193, + "rewards/margins": 0.3807729184627533, + "rewards/rejected": -1.1283371448516846, "step": 590 }, { - "epoch": 0.31, - "learning_rate": 4.336620177054269e-06, - "logits/chosen": -1.7982432842254639, - "logits/rejected": -1.6228100061416626, - "logps/chosen": -265.236572265625, - "logps/rejected": -344.7894287109375, - "loss": 0.5473, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.5600973963737488, - "rewards/margins": 0.6167066693305969, - "rewards/rejected": -1.1768039464950562, + "epoch": 0.63, + "learning_rate": 1.8272307888529276e-06, + "logits/chosen": -0.7035941481590271, + "logits/rejected": -0.5019673109054565, + "logps/chosen": -361.04827880859375, + "logps/rejected": -436.74560546875, + "loss": 0.5498, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.5611416101455688, + "rewards/margins": 0.6350690126419067, + "rewards/rejected": -1.1962106227874756, "step": 600 }, { - "epoch": 0.31, - "eval_logits/chosen": -1.7725939750671387, - "eval_logits/rejected": -1.6815264225006104, - "eval_logps/chosen": -319.0811767578125, - "eval_logps/rejected": -374.43408203125, - "eval_loss": 0.5774475932121277, - "eval_rewards/accuracies": 0.734375, - "eval_rewards/chosen": -0.6892752647399902, - "eval_rewards/margins": 0.5371266603469849, - "eval_rewards/rejected": -1.226401925086975, - "eval_runtime": 100.1017, - "eval_samples_per_second": 19.98, + "epoch": 0.63, + "eval_logits/chosen": -0.44625866413116455, + "eval_logits/rejected": -0.25102561712265015, + "eval_logps/chosen": -320.17938232421875, + "eval_logps/rejected": -380.2698974609375, + "eval_loss": 0.5582141280174255, + "eval_rewards/accuracies": 0.7578125, + "eval_rewards/chosen": -0.7002571821212769, + "eval_rewards/margins": 0.5845023393630981, + "eval_rewards/rejected": -1.284759521484375, + "eval_runtime": 100.0013, + "eval_samples_per_second": 20.0, "eval_steps_per_second": 0.32, "step": 600 }, { - "epoch": 0.32, - "learning_rate": 4.30531739830064e-06, - "logits/chosen": -1.600541353225708, - "logits/rejected": -1.671954870223999, - "logps/chosen": -301.27410888671875, - "logps/rejected": -428.30255126953125, - "loss": 0.563, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.8286434412002563, - "rewards/margins": 0.6544786095619202, - "rewards/rejected": -1.4831221103668213, + "epoch": 0.64, + "learning_rate": 1.739641525213929e-06, + "logits/chosen": -0.5911251306533813, + "logits/rejected": -0.18869177997112274, + "logps/chosen": -325.91033935546875, + "logps/rejected": -326.54095458984375, + "loss": 0.5425, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7186974287033081, + "rewards/margins": 0.4664137363433838, + "rewards/rejected": -1.185111165046692, "step": 610 }, { - "epoch": 0.32, - "learning_rate": 4.273411658201141e-06, - "logits/chosen": -1.7578967809677124, - "logits/rejected": -1.3492388725280762, - "logps/chosen": -380.08013916015625, - "logps/rejected": -386.08056640625, - "loss": 0.5446, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.8161942362785339, - "rewards/margins": 0.5750104188919067, - "rewards/rejected": -1.391204595565796, + "epoch": 0.65, + "learning_rate": 1.6530691736402317e-06, + "logits/chosen": -0.8488653898239136, + "logits/rejected": -0.5028330087661743, + "logps/chosen": -356.1060485839844, + "logps/rejected": -355.0657958984375, + "loss": 0.5542, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8343976140022278, + "rewards/margins": 0.4692300260066986, + "rewards/rejected": -1.303627610206604, "step": 620 }, - { - "epoch": 0.33, - "learning_rate": 4.240913613013785e-06, - "logits/chosen": -1.5625998973846436, - "logits/rejected": -1.537353754043579, - "logps/chosen": -315.82696533203125, - "logps/rejected": -375.628173828125, - "loss": 0.5564, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.8845146298408508, - "rewards/margins": 0.37904268503189087, - "rewards/rejected": -1.2635573148727417, - "step": 630 - }, - { - "epoch": 0.33, - "learning_rate": 4.207834116821673e-06, - "logits/chosen": -1.7625129222869873, - "logits/rejected": -1.4166629314422607, - "logps/chosen": -409.44677734375, - "logps/rejected": -451.19329833984375, - "loss": 0.5548, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.7660303115844727, - "rewards/margins": 0.753090500831604, - "rewards/rejected": -1.5191209316253662, - "step": 640 - }, - { - "epoch": 0.34, - "learning_rate": 4.174184217907818e-06, - "logits/chosen": -1.6399860382080078, - "logits/rejected": -1.3593322038650513, - "logps/chosen": -379.9228515625, - "logps/rejected": -356.4607849121094, - "loss": 0.6129, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.91173255443573, - "rewards/margins": 0.4682820737361908, - "rewards/rejected": -1.3800146579742432, - "step": 650 - }, - { - "epoch": 0.35, - "learning_rate": 4.139975155065109e-06, - "logits/chosen": -1.6585413217544556, - "logits/rejected": -1.4335598945617676, - "logps/chosen": -307.7510681152344, - "logps/rejected": -313.7562255859375, - "loss": 0.5703, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.705053448677063, - "rewards/margins": 0.5841482877731323, - "rewards/rejected": -1.2892017364501953, - "step": 660 - }, - { - "epoch": 0.35, - "learning_rate": 4.105218353842643e-06, - "logits/chosen": -1.6327035427093506, - "logits/rejected": -1.3608360290527344, - "logps/chosen": -330.2358093261719, - "logps/rejected": -306.1988220214844, - "loss": 0.6016, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.8032588958740234, - "rewards/margins": 0.30609965324401855, - "rewards/rejected": -1.1093586683273315, - "step": 670 - }, - { - "epoch": 0.36, - "learning_rate": 4.069925422729689e-06, - "logits/chosen": -1.4646321535110474, - "logits/rejected": -1.5063197612762451, - "logps/chosen": -266.6871337890625, - "logps/rejected": -328.95428466796875, - "loss": 0.6057, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.8702457547187805, - "rewards/margins": 0.41110268235206604, - "rewards/rejected": -1.281348466873169, - "step": 680 - }, - { - "epoch": 0.36, - "learning_rate": 4.034108149278544e-06, - "logits/chosen": -1.8260358572006226, - "logits/rejected": -1.7864515781402588, - "logps/chosen": -367.08892822265625, - "logps/rejected": -439.1949157714844, - "loss": 0.5596, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.8357939720153809, - "rewards/margins": 0.6055955290794373, - "rewards/rejected": -1.441389560699463, - "step": 690 - }, - { - "epoch": 0.37, - "learning_rate": 3.997778496167584e-06, - "logits/chosen": -1.2935597896575928, - "logits/rejected": -1.4118752479553223, - "logps/chosen": -245.8689422607422, - "logps/rejected": -354.35906982421875, - "loss": 0.5792, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.6742351055145264, - "rewards/margins": 0.6851714849472046, - "rewards/rejected": -1.3594067096710205, - "step": 700 - }, - { - "epoch": 0.37, - "eval_logits/chosen": -1.5775055885314941, - "eval_logits/rejected": -1.4782546758651733, - "eval_logps/chosen": -316.5071716308594, - "eval_logps/rejected": -372.79888916015625, - "eval_loss": 0.570942759513855, - "eval_rewards/accuracies": 0.7578125, - "eval_rewards/chosen": -0.6635350584983826, - "eval_rewards/margins": 0.5465149879455566, - "eval_rewards/rejected": -1.2100499868392944, - "eval_runtime": 99.8188, - "eval_samples_per_second": 20.036, - "eval_steps_per_second": 0.321, - "step": 700 - }, - { - "epoch": 0.37, - "learning_rate": 3.96094859720583e-06, - "logits/chosen": -1.645870566368103, - "logits/rejected": -1.5308467149734497, - "logps/chosen": -324.1588439941406, - "logps/rejected": -402.2452697753906, - "loss": 0.5502, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.775097131729126, - "rewards/margins": 0.43130356073379517, - "rewards/rejected": -1.2064006328582764, - "step": 710 - }, - { - "epoch": 0.38, - "learning_rate": 3.923630753280358e-06, - "logits/chosen": -1.6445233821868896, - "logits/rejected": -1.3163498640060425, - "logps/chosen": -396.59014892578125, - "logps/rejected": -368.18463134765625, - "loss": 0.5609, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.8597933650016785, - "rewards/margins": 0.5147794485092163, - "rewards/rejected": -1.37457275390625, - "step": 720 - }, - { - "epoch": 0.38, - "learning_rate": 3.88583742824789e-06, - "logits/chosen": -1.1861711740493774, - "logits/rejected": -1.015647530555725, - "logps/chosen": -354.11944580078125, - "logps/rejected": -418.89886474609375, - "loss": 0.5447, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.8276666402816772, - "rewards/margins": 0.6120607256889343, - "rewards/rejected": -1.4397274255752563, - "step": 730 - }, - { - "epoch": 0.39, - "learning_rate": 3.847581244771983e-06, - "logits/chosen": -1.0957353115081787, - "logits/rejected": -1.0387994050979614, - "logps/chosen": -300.3524169921875, - "logps/rejected": -353.12310791015625, - "loss": 0.5477, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.9017397165298462, - "rewards/margins": 0.2911619544029236, - "rewards/rejected": -1.1929017305374146, - "step": 740 - }, - { - "epoch": 0.39, - "learning_rate": 3.80887498010715e-06, - "logits/chosen": -1.4777683019638062, - "logits/rejected": -1.2283952236175537, - "logps/chosen": -393.16693115234375, - "logps/rejected": -372.19317626953125, - "loss": 0.5624, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.9122506976127625, - "rewards/margins": 0.29486000537872314, - "rewards/rejected": -1.2071107625961304, - "step": 750 - }, - { - "epoch": 0.4, - "learning_rate": 3.769731561831365e-06, - "logits/chosen": -0.9933805465698242, - "logits/rejected": -1.0841389894485474, - "logps/chosen": -360.70294189453125, - "logps/rejected": -431.0218200683594, - "loss": 0.5184, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.8388063311576843, - "rewards/margins": 0.7050133943557739, - "rewards/rejected": -1.5438196659088135, - "step": 760 - }, - { - "epoch": 0.4, - "learning_rate": 3.730164063528359e-06, - "logits/chosen": -0.7796175479888916, - "logits/rejected": -0.5043349266052246, - "logps/chosen": -303.9163513183594, - "logps/rejected": -400.3309020996094, - "loss": 0.5436, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.8382329940795898, - "rewards/margins": 0.8575286865234375, - "rewards/rejected": -1.6957619190216064, - "step": 770 - }, - { - "epoch": 0.41, - "learning_rate": 3.690185700421145e-06, - "logits/chosen": -0.8145714998245239, - "logits/rejected": -0.3118314743041992, - "logps/chosen": -349.3692932128906, - "logps/rejected": -309.53875732421875, - "loss": 0.5767, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.8476495742797852, - "rewards/margins": 0.572998046875, - "rewards/rejected": -1.420647382736206, - "step": 780 - }, - { - "epoch": 0.41, - "learning_rate": 3.649809824958245e-06, - "logits/chosen": -1.1090258359909058, - "logits/rejected": -0.552297055721283, - "logps/chosen": -440.61883544921875, - "logps/rejected": -392.18914794921875, - "loss": 0.5308, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.0068585872650146, - "rewards/margins": 0.43958479166030884, - "rewards/rejected": -1.4464435577392578, - "step": 790 - }, - { - "epoch": 0.42, - "learning_rate": 3.609049922354076e-06, - "logits/chosen": -1.044739842414856, - "logits/rejected": -0.796644389629364, - "logps/chosen": -399.64404296875, - "logps/rejected": -408.13934326171875, - "loss": 0.5194, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.0027108192443848, - "rewards/margins": 0.539101779460907, - "rewards/rejected": -1.541812777519226, - "step": 800 - }, - { - "epoch": 0.42, - "eval_logits/chosen": -0.5486251711845398, - "eval_logits/rejected": -0.37914562225341797, - "eval_logps/chosen": -352.2356872558594, - "eval_logps/rejected": -416.3268737792969, - "eval_loss": 0.559037983417511, - "eval_rewards/accuracies": 0.74609375, - "eval_rewards/chosen": -1.0208208560943604, - "eval_rewards/margins": 0.6245089769363403, - "eval_rewards/rejected": -1.6453297138214111, - "eval_runtime": 99.895, - "eval_samples_per_second": 20.021, - "eval_steps_per_second": 0.32, - "step": 800 - }, - { - "epoch": 0.42, - "learning_rate": 3.567919606085004e-06, - "logits/chosen": -0.8196202516555786, - "logits/rejected": -0.5285523533821106, - "logps/chosen": -433.35369873046875, - "logps/rejected": -463.96240234375, - "loss": 0.5493, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.1318707466125488, - "rewards/margins": 0.6623066663742065, - "rewards/rejected": -1.7941772937774658, - "step": 810 - }, - { - "epoch": 0.43, - "learning_rate": 3.5264326133425467e-06, - "logits/chosen": -1.0270545482635498, - "logits/rejected": -1.041868805885315, - "logps/chosen": -427.3218688964844, - "logps/rejected": -445.3553771972656, - "loss": 0.581, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.8407033085823059, - "rewards/margins": 0.44703882932662964, - "rewards/rejected": -1.2877418994903564, - "step": 820 - }, - { - "epoch": 0.43, - "learning_rate": 3.4846028004452696e-06, - "logits/chosen": -0.4200538992881775, - "logits/rejected": -0.20024017989635468, - "logps/chosen": -317.58538818359375, - "logps/rejected": -362.42376708984375, - "loss": 0.5375, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.0300816297531128, - "rewards/margins": 0.4815269112586975, - "rewards/rejected": -1.5116084814071655, - "step": 830 - }, - { - "epoch": 0.44, - "learning_rate": 3.442444138210883e-06, - "logits/chosen": -0.5636943578720093, - "logits/rejected": -0.2242840826511383, - "logps/chosen": -356.8027648925781, - "logps/rejected": -383.00006103515625, - "loss": 0.5785, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.1958321332931519, - "rewards/margins": 0.46510282158851624, - "rewards/rejected": -1.6609351634979248, - "step": 840 - }, - { - "epoch": 0.44, - "learning_rate": 3.399970707290105e-06, - "logits/chosen": -0.6056006550788879, - "logits/rejected": -0.5171287655830383, - "logps/chosen": -286.99462890625, - "logps/rejected": -349.02764892578125, - "loss": 0.5842, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.9791563749313354, - "rewards/margins": 0.45784440636634827, - "rewards/rejected": -1.4370006322860718, - "step": 850 - }, - { - "epoch": 0.45, - "learning_rate": 3.3571966934638378e-06, - "logits/chosen": -0.5877082943916321, - "logits/rejected": -0.630211591720581, - "logps/chosen": -363.6180725097656, - "logps/rejected": -374.6347961425781, - "loss": 0.5743, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.0287179946899414, - "rewards/margins": 0.5056768655776978, - "rewards/rejected": -1.5343949794769287, - "step": 860 - }, - { - "epoch": 0.46, - "learning_rate": 3.314136382905234e-06, - "logits/chosen": -0.5021811723709106, - "logits/rejected": -0.5260487794876099, - "logps/chosen": -368.32452392578125, - "logps/rejected": -409.72064208984375, - "loss": 0.5672, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.0683351755142212, - "rewards/margins": 0.44290056824684143, - "rewards/rejected": -1.5112355947494507, - "step": 870 - }, - { - "epoch": 0.46, - "learning_rate": 3.2708041574082257e-06, - "logits/chosen": -0.44730886816978455, - "logits/rejected": -0.31664231419563293, - "logps/chosen": -322.88885498046875, - "logps/rejected": -367.3641662597656, - "loss": 0.5537, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.011106014251709, - "rewards/margins": 0.5238951444625854, - "rewards/rejected": -1.535001277923584, - "step": 880 - }, - { - "epoch": 0.47, - "learning_rate": 3.2272144895841285e-06, - "logits/chosen": -0.01936722919344902, - "logits/rejected": 0.09176506102085114, - "logps/chosen": -342.02069091796875, - "logps/rejected": -354.09027099609375, - "loss": 0.565, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.2459288835525513, - "rewards/margins": 0.2770703434944153, - "rewards/rejected": -1.5229991674423218, - "step": 890 - }, - { - "epoch": 0.47, - "learning_rate": 3.1833819380279028e-06, - "logits/chosen": -0.14645493030548096, - "logits/rejected": -0.09734384715557098, - "logps/chosen": -320.1854553222656, - "logps/rejected": -436.3653259277344, - "loss": 0.5367, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.9370415806770325, - "rewards/margins": 0.9560844302177429, - "rewards/rejected": -1.893126130104065, - "step": 900 - }, - { - "epoch": 0.47, - "eval_logits/chosen": -0.28988412022590637, - "eval_logits/rejected": -0.09080193191766739, - "eval_logps/chosen": -364.9275817871094, - "eval_logps/rejected": -437.00396728515625, - "eval_loss": 0.5491830110549927, - "eval_rewards/accuracies": 0.7265625, - "eval_rewards/chosen": -1.1477394104003906, - "eval_rewards/margins": 0.7043614387512207, - "eval_rewards/rejected": -1.8521009683609009, - "eval_runtime": 99.6659, - "eval_samples_per_second": 20.067, - "eval_steps_per_second": 0.321, - "step": 900 - }, - { - "epoch": 0.48, - "learning_rate": 3.1393211424557037e-06, - "logits/chosen": -0.49690476059913635, - "logits/rejected": -0.2139798104763031, - "logps/chosen": -381.86834716796875, - "logps/rejected": -435.0732421875, - "loss": 0.5364, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.218109369277954, - "rewards/margins": 0.5727220177650452, - "rewards/rejected": -1.7908313274383545, - "step": 910 - }, - { - "epoch": 0.48, - "learning_rate": 3.095046818815331e-06, - "logits/chosen": -0.5411738157272339, - "logits/rejected": -0.3583293855190277, - "logps/chosen": -398.50579833984375, - "logps/rejected": -460.13031005859375, - "loss": 0.5766, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.183455467224121, - "rewards/margins": 0.5352784395217896, - "rewards/rejected": -1.7187340259552002, - "step": 920 - }, - { - "epoch": 0.49, - "learning_rate": 3.050573754371228e-06, - "logits/chosen": -0.6063122153282166, - "logits/rejected": -0.1856008768081665, - "logps/chosen": -353.6801452636719, - "logps/rejected": -411.11004638671875, - "loss": 0.5375, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.1633942127227783, - "rewards/margins": 0.6123510599136353, - "rewards/rejected": -1.775745153427124, - "step": 930 - }, - { - "epoch": 0.49, - "learning_rate": 3.0059168027656475e-06, - "logits/chosen": -0.4713048040866852, - "logits/rejected": 0.16548386216163635, - "logps/chosen": -429.6309509277344, - "logps/rejected": -403.17755126953125, - "loss": 0.5238, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.1189515590667725, - "rewards/margins": 0.7205164432525635, - "rewards/rejected": -1.8394676446914673, - "step": 940 - }, - { - "epoch": 0.5, - "learning_rate": 2.9610908790576664e-06, - "logits/chosen": -0.19411665201187134, - "logits/rejected": 0.067843496799469, - "logps/chosen": -363.87176513671875, - "logps/rejected": -404.2000427246094, - "loss": 0.5985, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.3660383224487305, - "rewards/margins": 0.4443749487400055, - "rewards/rejected": -1.810413122177124, - "step": 950 - }, - { - "epoch": 0.5, - "learning_rate": 2.916110954741667e-06, - "logits/chosen": 0.5834629535675049, - "logits/rejected": 0.7395158410072327, - "logps/chosen": -338.3902282714844, - "logps/rejected": -374.72113037109375, - "loss": 0.5698, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.2995877265930176, - "rewards/margins": 0.37612777948379517, - "rewards/rejected": -1.675715684890747, - "step": 960 - }, - { - "epoch": 0.51, - "learning_rate": 2.8709920527469836e-06, - "logits/chosen": -0.29962724447250366, - "logits/rejected": -0.045618630945682526, - "logps/chosen": -361.26055908203125, - "logps/rejected": -428.3038024902344, - "loss": 0.5458, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.0471857786178589, - "rewards/margins": 0.619658887386322, - "rewards/rejected": -1.6668447256088257, - "step": 970 - }, - { - "epoch": 0.51, - "learning_rate": 2.8257492424203685e-06, - "logits/chosen": -0.21448250114917755, - "logits/rejected": 0.2300967425107956, - "logps/chosen": -361.10284423828125, - "logps/rejected": -420.21875, - "loss": 0.5348, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2196528911590576, - "rewards/margins": 0.626075267791748, - "rewards/rejected": -1.8457282781600952, - "step": 980 - }, - { - "epoch": 0.52, - "learning_rate": 2.7803976344929497e-06, - "logits/chosen": 0.016325589269399643, - "logits/rejected": 0.5622560977935791, - "logps/chosen": -393.4544982910156, - "logps/rejected": -495.8486328125, - "loss": 0.537, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.3306463956832886, - "rewards/margins": 0.8722470998764038, - "rewards/rejected": -2.2028937339782715, - "step": 990 - }, - { - "epoch": 0.52, - "learning_rate": 2.734952376033368e-06, - "logits/chosen": -0.3442438244819641, - "logits/rejected": -0.028740787878632545, - "logps/chosen": -397.14544677734375, - "logps/rejected": -438.4039001464844, - "loss": 0.5575, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.145361065864563, - "rewards/margins": 0.8945510983467102, - "rewards/rejected": -2.039912462234497, - "step": 1000 - }, - { - "epoch": 0.52, - "eval_logits/chosen": 0.049825601279735565, - "eval_logits/rejected": 0.27614572644233704, - "eval_logps/chosen": -367.19635009765625, - "eval_logps/rejected": -442.27545166015625, - "eval_loss": 0.5450169444084167, - "eval_rewards/accuracies": 0.734375, - "eval_rewards/chosen": -1.1704269647598267, - "eval_rewards/margins": 0.7343888282775879, - "eval_rewards/rejected": -1.904815673828125, - "eval_runtime": 100.5701, - "eval_samples_per_second": 19.887, - "eval_steps_per_second": 0.318, - "step": 1000 - }, - { - "epoch": 0.53, - "learning_rate": 2.689428645388783e-06, - "logits/chosen": 0.0017841160297393799, - "logits/rejected": 0.5289415717124939, - "logps/chosen": -351.3257751464844, - "logps/rejected": -382.60260009765625, - "loss": 0.4955, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.1659106016159058, - "rewards/margins": 0.8750098347663879, - "rewards/rejected": -2.0409202575683594, - "step": 1010 - }, - { - "epoch": 0.53, - "learning_rate": 2.6438416471154277e-06, - "logits/chosen": -0.42786794900894165, - "logits/rejected": 0.2951677441596985, - "logps/chosen": -416.7127990722656, - "logps/rejected": -424.31915283203125, - "loss": 0.54, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.281354546546936, - "rewards/margins": 0.35356560349464417, - "rewards/rejected": -1.6349201202392578, - "step": 1020 - }, - { - "epoch": 0.54, - "learning_rate": 2.598206606900406e-06, - "logits/chosen": -0.01613428071141243, - "logits/rejected": -0.0008750840788707137, - "logps/chosen": -384.22772216796875, - "logps/rejected": -421.3248596191406, - "loss": 0.5156, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.9642124176025391, - "rewards/margins": 0.5817215442657471, - "rewards/rejected": -1.5459339618682861, - "step": 1030 - }, - { - "epoch": 0.54, - "learning_rate": 2.5525387664764433e-06, - "logits/chosen": 0.11487326771020889, - "logits/rejected": 0.6310429573059082, - "logps/chosen": -388.2038269042969, - "logps/rejected": -442.6739807128906, - "loss": 0.5192, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.2305266857147217, - "rewards/margins": 0.7809251546859741, - "rewards/rejected": -2.0114519596099854, - "step": 1040 - }, - { - "epoch": 0.55, - "learning_rate": 2.5068533785312673e-06, - "logits/chosen": 0.09778528660535812, - "logits/rejected": 0.4881245195865631, - "logps/chosen": -411.2027893066406, - "logps/rejected": -440.90020751953125, - "loss": 0.522, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.5704927444458008, - "rewards/margins": 0.5099351406097412, - "rewards/rejected": -2.080427646636963, - "step": 1050 - }, - { - "epoch": 0.55, - "learning_rate": 2.4611657016133334e-06, - "logits/chosen": -0.07164221256971359, - "logits/rejected": 0.257876992225647, - "logps/chosen": -467.0708923339844, - "logps/rejected": -494.01055908203125, - "loss": 0.5114, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.4244937896728516, - "rewards/margins": 0.6371169686317444, - "rewards/rejected": -2.061610698699951, - "step": 1060 - }, - { - "epoch": 0.56, - "learning_rate": 2.4154909950355966e-06, - "logits/chosen": -0.08780699223279953, - "logits/rejected": 0.5813466906547546, - "logps/chosen": -457.4397888183594, - "logps/rejected": -534.4034423828125, - "loss": 0.5268, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.403752326965332, - "rewards/margins": 0.8715044260025024, - "rewards/rejected": -2.275256633758545, - "step": 1070 - }, - { - "epoch": 0.57, - "learning_rate": 2.369844513779026e-06, - "logits/chosen": -0.05808568745851517, - "logits/rejected": 0.19921842217445374, - "logps/chosen": -365.045654296875, - "logps/rejected": -452.080810546875, - "loss": 0.5272, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.1917533874511719, - "rewards/margins": 0.8068798184394836, - "rewards/rejected": -1.9986331462860107, - "step": 1080 - }, - { - "epoch": 0.57, - "learning_rate": 2.3242415033975575e-06, - "logits/chosen": 0.1933274269104004, - "logits/rejected": 0.7321079969406128, - "logps/chosen": -426.7176818847656, - "logps/rejected": -412.4091796875, - "loss": 0.5474, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.3316107988357544, - "rewards/margins": 0.5840722322463989, - "rewards/rejected": -1.9156830310821533, - "step": 1090 - }, - { - "epoch": 0.58, - "learning_rate": 2.2786971949262137e-06, - "logits/chosen": -0.10808311402797699, - "logits/rejected": 0.449955552816391, - "logps/chosen": -346.48138427734375, - "logps/rejected": -401.5810241699219, - "loss": 0.5507, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.9797190427780151, - "rewards/margins": 0.8783604502677917, - "rewards/rejected": -1.8580795526504517, - "step": 1100 - }, - { - "epoch": 0.58, - "eval_logits/chosen": 0.2876562774181366, - "eval_logits/rejected": 0.533876895904541, - "eval_logps/chosen": -360.5550842285156, - "eval_logps/rejected": -438.5025939941406, - "eval_loss": 0.542937159538269, - "eval_rewards/accuracies": 0.7421875, - "eval_rewards/chosen": -1.1040146350860596, - "eval_rewards/margins": 0.7630726099014282, - "eval_rewards/rejected": -1.8670872449874878, - "eval_runtime": 99.3891, - "eval_samples_per_second": 20.123, - "eval_steps_per_second": 0.322, - "step": 1100 - }, - { - "epoch": 0.58, - "learning_rate": 2.2332267997940514e-06, - "logits/chosen": 0.33531707525253296, - "logits/rejected": 0.6124697923660278, - "logps/chosen": -344.9618835449219, - "logps/rejected": -386.95452880859375, - "loss": 0.5489, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.142901062965393, - "rewards/margins": 0.5753956437110901, - "rewards/rejected": -1.7182966470718384, - "step": 1110 - }, - { - "epoch": 0.59, - "learning_rate": 2.1878455047436754e-06, - "logits/chosen": 0.24967947602272034, - "logits/rejected": 0.7071189880371094, - "logps/chosen": -394.3273620605469, - "logps/rejected": -389.794677734375, - "loss": 0.5209, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2302659749984741, - "rewards/margins": 0.572158694267273, - "rewards/rejected": -1.802424669265747, - "step": 1120 - }, - { - "epoch": 0.59, - "learning_rate": 2.1425684667589853e-06, - "logits/chosen": 0.5281914472579956, - "logits/rejected": 0.8983514904975891, - "logps/chosen": -355.4177551269531, - "logps/rejected": -408.59014892578125, - "loss": 0.5686, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.3293302059173584, - "rewards/margins": 0.5205702185630798, - "rewards/rejected": -1.849900245666504, - "step": 1130 - }, - { - "epoch": 0.6, - "learning_rate": 2.097410808002869e-06, - "logits/chosen": 0.4173739552497864, - "logits/rejected": 0.831757664680481, - "logps/chosen": -352.9256286621094, - "logps/rejected": -429.7771911621094, - "loss": 0.5486, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.1642874479293823, - "rewards/margins": 0.7625770568847656, - "rewards/rejected": -1.9268646240234375, - "step": 1140 - }, - { - "epoch": 0.6, - "learning_rate": 2.0523876107665197e-06, - "logits/chosen": 0.33401042222976685, - "logits/rejected": 1.1779800653457642, - "logps/chosen": -382.9471130371094, - "logps/rejected": -419.718994140625, - "loss": 0.5325, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.0509642362594604, - "rewards/margins": 0.9581982493400574, - "rewards/rejected": -2.009162425994873, - "step": 1150 - }, - { - "epoch": 0.61, - "learning_rate": 2.007513912432079e-06, - "logits/chosen": 0.5797901153564453, - "logits/rejected": 0.6977930665016174, - "logps/chosen": -363.8152160644531, - "logps/rejected": -425.59124755859375, - "loss": 0.5414, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3096548318862915, - "rewards/margins": 0.6404959559440613, - "rewards/rejected": -1.950150728225708, - "step": 1160 - }, - { - "epoch": 0.61, - "learning_rate": 1.962804700450265e-06, - "logits/chosen": 0.45753908157348633, - "logits/rejected": 0.6594685912132263, - "logps/chosen": -306.3339538574219, - "logps/rejected": -329.291015625, - "loss": 0.5564, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.1404427289962769, - "rewards/margins": 0.4516426622867584, - "rewards/rejected": -1.592085599899292, - "step": 1170 - }, - { - "epoch": 0.62, - "learning_rate": 1.9182749073346945e-06, - "logits/chosen": 0.4079570174217224, - "logits/rejected": 0.8342088460922241, - "logps/chosen": -372.5623474121094, - "logps/rejected": -444.697509765625, - "loss": 0.5387, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.0847569704055786, - "rewards/margins": 0.6219168901443481, - "rewards/rejected": -1.7066739797592163, - "step": 1180 - }, - { - "epoch": 0.62, - "learning_rate": 1.8739394056745375e-06, - "logits/chosen": -0.03631135821342468, - "logits/rejected": 0.29353755712509155, - "logps/chosen": -428.0572204589844, - "logps/rejected": -495.27301025390625, - "loss": 0.5187, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.8947092294692993, - "rewards/margins": 0.8324411511421204, - "rewards/rejected": -1.727150321006775, - "step": 1190 - }, - { - "epoch": 0.63, - "learning_rate": 1.8298130031671974e-06, - "logits/chosen": 0.3293336033821106, - "logits/rejected": 0.4875457286834717, - "logps/chosen": -369.41693115234375, - "logps/rejected": -485.47552490234375, - "loss": 0.5305, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.9813504219055176, - "rewards/margins": 0.7564947009086609, - "rewards/rejected": -1.7378450632095337, - "step": 1200 - }, - { - "epoch": 0.63, - "eval_logits/chosen": 0.4754948914051056, - "eval_logits/rejected": 0.7349805235862732, - "eval_logps/chosen": -365.72406005859375, - "eval_logps/rejected": -444.2216796875, - "eval_loss": 0.5365983843803406, - "eval_rewards/accuracies": 0.7578125, - "eval_rewards/chosen": -1.155704379081726, - "eval_rewards/margins": 0.7685737609863281, - "eval_rewards/rejected": -1.9242780208587646, - "eval_runtime": 99.8758, - "eval_samples_per_second": 20.025, - "eval_steps_per_second": 0.32, - "step": 1200 - }, - { - "epoch": 0.63, - "learning_rate": 1.7859104376726583e-06, - "logits/chosen": 0.31323331594467163, - "logits/rejected": 0.8091317415237427, - "logps/chosen": -356.2727966308594, - "logps/rejected": -382.131103515625, - "loss": 0.5003, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.319533348083496, - "rewards/margins": 0.5745511651039124, - "rewards/rejected": -1.8940845727920532, - "step": 1210 - }, - { - "epoch": 0.64, - "learning_rate": 1.7422463722911626e-06, - "logits/chosen": 0.28560835123062134, - "logits/rejected": 0.8169926404953003, - "logps/chosen": -404.62518310546875, - "logps/rejected": -409.4010314941406, - "loss": 0.5401, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.2086342573165894, - "rewards/margins": 0.652005672454834, - "rewards/rejected": -1.8606399297714233, - "step": 1220 - }, - { - "epoch": 0.64, - "learning_rate": 1.6988353904658495e-06, - "logits/chosen": 0.0763484388589859, - "logits/rejected": 0.5871134400367737, - "logps/chosen": -396.5611572265625, - "logps/rejected": -415.1891174316406, - "loss": 0.5357, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.6239595413208008, - "rewards/margins": 0.532719075679779, - "rewards/rejected": -2.1566786766052246, - "step": 1230 - }, - { - "epoch": 0.65, - "learning_rate": 1.6556919911120084e-06, - "logits/chosen": -0.16248223185539246, - "logits/rejected": 0.2298637330532074, - "logps/chosen": -440.5934143066406, - "logps/rejected": -452.263916015625, - "loss": 0.5228, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.2942602634429932, - "rewards/margins": 0.729531466960907, - "rewards/rejected": -2.023791551589966, - "step": 1240 - }, - { - "epoch": 0.65, - "learning_rate": 1.6128305837745548e-06, - "logits/chosen": 0.4567365050315857, - "logits/rejected": 0.9410026669502258, - "logps/chosen": -421.6625061035156, - "logps/rejected": -500.62939453125, - "loss": 0.4833, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.259197473526001, - "rewards/margins": 1.098459005355835, - "rewards/rejected": -2.357656478881836, - "step": 1250 - }, - { - "epoch": 0.66, - "learning_rate": 1.5702654838153641e-06, - "logits/chosen": 0.8424333333969116, - "logits/rejected": 1.3221367597579956, - "logps/chosen": -376.1363830566406, - "logps/rejected": -475.8714904785156, - "loss": 0.5373, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.4600459337234497, - "rewards/margins": 0.8872382044792175, - "rewards/rejected": -2.3472840785980225, - "step": 1260 - }, { "epoch": 0.66, - "learning_rate": 1.528010907632051e-06, - "logits/chosen": 0.41781607270240784, - "logits/rejected": 0.819769561290741, - "logps/chosen": -369.557861328125, - "logps/rejected": -404.1021423339844, - "loss": 0.5333, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.2966288328170776, - "rewards/margins": 0.6270810961723328, - "rewards/rejected": -1.9237098693847656, - "step": 1270 + "learning_rate": 1.5676295169786864e-06, + "logits/chosen": -0.2444450557231903, + "logits/rejected": 0.13049665093421936, + "logps/chosen": -350.13702392578125, + "logps/rejected": -413.64398193359375, + "loss": 0.5379, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.8719974756240845, + "rewards/margins": 0.7344092130661011, + "rewards/rejected": -1.606406807899475, + "step": 630 }, { "epoch": 0.67, - "learning_rate": 1.486080967909816e-06, - "logits/chosen": 0.532579779624939, - "logits/rejected": 1.2962762117385864, - "logps/chosen": -375.81109619140625, - "logps/rejected": -468.69091796875, - "loss": 0.526, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.2724688053131104, - "rewards/margins": 1.1656290292739868, - "rewards/rejected": -2.4380979537963867, - "step": 1280 - }, - { - "epoch": 0.68, - "learning_rate": 1.4444896689079142e-06, - "logits/chosen": 0.10695119202136993, - "logits/rejected": 0.6434232592582703, - "logps/chosen": -473.42364501953125, - "logps/rejected": -503.74432373046875, - "loss": 0.5469, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.7253440618515015, - "rewards/margins": 0.48180991411209106, - "rewards/rejected": -2.207153797149658, - "step": 1290 - }, - { - "epoch": 0.68, - "learning_rate": 1.403250901782354e-06, - "logits/chosen": -0.03585321083664894, - "logits/rejected": 0.7208360433578491, - "logps/chosen": -407.0667419433594, - "logps/rejected": -425.78289794921875, - "loss": 0.5171, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.2429778575897217, - "rewards/margins": 0.7077491283416748, - "rewards/rejected": -1.950727105140686, - "step": 1300 + "learning_rate": 1.4834368231970922e-06, + "logits/chosen": -0.38939914107322693, + "logits/rejected": 0.05538179352879524, + "logps/chosen": -322.5318298339844, + "logps/rejected": -368.46343994140625, + "loss": 0.5401, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.7830222845077515, + "rewards/margins": 0.7185503244400024, + "rewards/rejected": -1.501572847366333, + "step": 640 }, { "epoch": 0.68, - "eval_logits/chosen": 0.5029312372207642, - "eval_logits/rejected": 0.7685657143592834, - "eval_logps/chosen": -387.5680847167969, - "eval_logps/rejected": -468.57354736328125, - "eval_loss": 0.5304480195045471, - "eval_rewards/accuracies": 0.765625, - "eval_rewards/chosen": -1.3741439580917358, - "eval_rewards/margins": 0.7936526536941528, - "eval_rewards/rejected": -2.1677966117858887, - "eval_runtime": 99.5127, - "eval_samples_per_second": 20.098, - "eval_steps_per_second": 0.322, - "step": 1300 - }, - { - "epoch": 0.69, - "learning_rate": 1.3623784399463585e-06, - "logits/chosen": 0.167133167386055, - "logits/rejected": 0.6624370813369751, - "logps/chosen": -463.7676696777344, - "logps/rejected": -429.35931396484375, - "loss": 0.5453, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3896446228027344, - "rewards/margins": 0.5783199667930603, - "rewards/rejected": -1.96796452999115, - "step": 1310 + "learning_rate": 1.4006036925609245e-06, + "logits/chosen": -0.8077508807182312, + "logits/rejected": -0.2668865919113159, + "logps/chosen": -380.40252685546875, + "logps/rejected": -398.7056579589844, + "loss": 0.5383, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.8857347369194031, + "rewards/margins": 0.5326262712478638, + "rewards/rejected": -1.4183608293533325, + "step": 650 }, { "epoch": 0.69, - "learning_rate": 1.3218859344701634e-06, - "logits/chosen": 0.32800671458244324, - "logits/rejected": 0.7880526185035706, - "logps/chosen": -425.216552734375, - "logps/rejected": -439.70611572265625, - "loss": 0.5281, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.4577715396881104, - "rewards/margins": 0.7258911728858948, - "rewards/rejected": -2.1836624145507812, - "step": 1320 - }, - { - "epoch": 0.7, - "learning_rate": 1.2817869095216626e-06, - "logits/chosen": 0.4725889265537262, - "logits/rejected": 0.6833093762397766, - "logps/chosen": -400.4358215332031, - "logps/rejected": -443.46063232421875, - "loss": 0.5021, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.2056840658187866, - "rewards/margins": 0.6257123947143555, - "rewards/rejected": -1.831396460533142, - "step": 1330 + "learning_rate": 1.3192409070404582e-06, + "logits/chosen": -0.5546111464500427, + "logits/rejected": -0.17088347673416138, + "logps/chosen": -398.8465881347656, + "logps/rejected": -376.5204162597656, + "loss": 0.554, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9672527313232422, + "rewards/margins": 0.5284374952316284, + "rewards/rejected": -1.4956903457641602, + "step": 660 }, { "epoch": 0.7, - "learning_rate": 1.2420947578494524e-06, - "logits/chosen": 0.28971534967422485, - "logits/rejected": 0.6644312739372253, - "logps/chosen": -418.182861328125, - "logps/rejected": -483.86260986328125, - "loss": 0.5371, + "learning_rate": 1.2394572821496953e-06, + "logits/chosen": -0.39746540784835815, + "logits/rejected": -0.14205250144004822, + "logps/chosen": -365.32281494140625, + "logps/rejected": -412.72650146484375, + "loss": 0.5385, "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.4614604711532593, - "rewards/margins": 0.6954753994941711, - "rewards/rejected": -2.156935930252075, - "step": 1340 - }, - { - "epoch": 0.71, - "learning_rate": 1.2028227363097583e-06, - "logits/chosen": 0.6982226371765137, - "logits/rejected": 0.4474209249019623, - "logps/chosen": -356.05047607421875, - "logps/rejected": -364.0442810058594, - "loss": 0.5642, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.2198214530944824, - "rewards/margins": 0.17703430354595184, - "rewards/rejected": -1.3968555927276611, - "step": 1350 + "rewards/chosen": -0.8937069177627563, + "rewards/margins": 0.5911084413528442, + "rewards/rejected": -1.4848153591156006, + "step": 670 }, { "epoch": 0.71, - "learning_rate": 1.1639839614387575e-06, - "logits/chosen": 0.36424458026885986, - "logits/rejected": 0.7081610560417175, - "logps/chosen": -347.169677734375, - "logps/rejected": -375.45782470703125, - "loss": 0.5351, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.1354833841323853, - "rewards/margins": 0.48198217153549194, - "rewards/rejected": -1.6174657344818115, - "step": 1360 - }, - { - "epoch": 0.72, - "learning_rate": 1.1255914050717553e-06, - "logits/chosen": 0.20586355030536652, - "logits/rejected": 0.5243974328041077, - "logps/chosen": -338.34124755859375, - "logps/rejected": -414.631103515625, - "loss": 0.4757, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.0596474409103394, - "rewards/margins": 0.7905664443969727, - "rewards/rejected": -1.8502140045166016, - "step": 1370 + "learning_rate": 1.1613595214152713e-06, + "logits/chosen": -0.23002564907073975, + "logits/rejected": -0.18054810166358948, + "logps/chosen": -314.38275146484375, + "logps/rejected": -324.4413146972656, + "loss": 0.5657, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.8053790330886841, + "rewards/margins": 0.24868443608283997, + "rewards/rejected": -1.054063320159912, + "step": 680 }, { "epoch": 0.72, - "learning_rate": 1.0876578900107053e-06, - "logits/chosen": 0.5624727010726929, - "logits/rejected": 1.3232195377349854, - "logps/chosen": -299.00689697265625, - "logps/rejected": -350.9127197265625, - "loss": 0.5566, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.0775959491729736, - "rewards/margins": 0.9055012464523315, - "rewards/rejected": -1.9830970764160156, - "step": 1380 - }, - { - "epoch": 0.73, - "learning_rate": 1.0501960857414912e-06, - "logits/chosen": 0.11433680355548859, - "logits/rejected": 1.1149139404296875, - "logps/chosen": -396.9518127441406, - "logps/rejected": -397.4958190917969, - "loss": 0.5236, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3632252216339111, - "rewards/margins": 0.8269487619400024, - "rewards/rejected": -2.1901743412017822, - "step": 1390 + "learning_rate": 1.0850520736699362e-06, + "logits/chosen": -0.35535928606987, + "logits/rejected": 0.08797712624073029, + "logps/chosen": -279.0433349609375, + "logps/rejected": -323.66754150390625, + "loss": 0.5362, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.672313928604126, + "rewards/margins": 0.6532977223396301, + "rewards/rejected": -1.3256117105484009, + "step": 690 }, { "epoch": 0.73, - "learning_rate": 1.0132185042024249e-06, - "logits/chosen": 0.43161505460739136, - "logits/rejected": 0.736209511756897, - "logps/chosen": -331.12628173828125, - "logps/rejected": -450.26202392578125, - "loss": 0.4875, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.2222540378570557, - "rewards/margins": 0.826834499835968, - "rewards/rejected": -2.049088716506958, - "step": 1400 + "learning_rate": 1.0106369933615043e-06, + "logits/chosen": -0.46180295944213867, + "logits/rejected": 0.07596547156572342, + "logps/chosen": -314.94793701171875, + "logps/rejected": -355.2444152832031, + "loss": 0.5279, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.8018286824226379, + "rewards/margins": 0.6314573884010315, + "rewards/rejected": -1.4332860708236694, + "step": 700 }, { "epoch": 0.73, - "eval_logits/chosen": 0.5925981998443604, - "eval_logits/rejected": 0.8565999269485474, - "eval_logps/chosen": -382.43292236328125, - "eval_logps/rejected": -466.92669677734375, - "eval_loss": 0.5320979356765747, - "eval_rewards/accuracies": 0.7578125, - "eval_rewards/chosen": -1.3227922916412354, - "eval_rewards/margins": 0.8285354375839233, - "eval_rewards/rejected": -2.1513278484344482, - "eval_runtime": 99.9611, + "eval_logits/chosen": -0.198812335729599, + "eval_logits/rejected": 0.014529339037835598, + "eval_logps/chosen": -334.1552734375, + "eval_logps/rejected": -400.80816650390625, + "eval_loss": 0.5489765405654907, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": -0.8400164246559143, + "eval_rewards/margins": 0.650126576423645, + "eval_rewards/rejected": -1.4901429414749146, + "eval_runtime": 99.961, "eval_samples_per_second": 20.008, "eval_steps_per_second": 0.32, - "step": 1400 - }, - { - "epoch": 0.74, - "learning_rate": 9.767374956053584e-07, - "logits/chosen": 0.6243610978126526, - "logits/rejected": 1.1261684894561768, - "logps/chosen": -380.84222412109375, - "logps/rejected": -383.2228698730469, - "loss": 0.5409, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.4898316860198975, - "rewards/margins": 0.39106887578964233, - "rewards/rejected": -1.8809006214141846, - "step": 1410 + "step": 700 }, { "epoch": 0.74, - "learning_rate": 9.407652443108192e-07, - "logits/chosen": 0.4377509653568268, - "logits/rejected": 0.9771862030029297, - "logps/chosen": -422.27508544921875, - "logps/rejected": -454.2247009277344, - "loss": 0.5151, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.375937581062317, - "rewards/margins": 0.815414547920227, - "rewards/rejected": -2.191352367401123, - "step": 1420 - }, - { - "epoch": 0.75, - "learning_rate": 9.053137647585231e-07, - "logits/chosen": 0.24253082275390625, - "logits/rejected": 0.9329066276550293, - "logps/chosen": -436.6142578125, - "logps/rejected": -496.86151123046875, - "loss": 0.5556, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.5039043426513672, - "rewards/margins": 0.9265894889831543, - "rewards/rejected": -2.4304938316345215, - "step": 1430 - }, - { - "epoch": 0.75, - "learning_rate": 8.703948974546592e-07, - "logits/chosen": 0.5952311158180237, - "logits/rejected": 0.8244959712028503, - "logps/chosen": -419.00579833984375, - "logps/rejected": -488.2655334472656, - "loss": 0.5196, + "learning_rate": 9.382138040640714e-07, + "logits/chosen": -0.22463122010231018, + "logits/rejected": 0.20351561903953552, + "logps/chosen": -352.7989196777344, + "logps/rejected": -359.4533386230469, + "loss": 0.546, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.2945992946624756, - "rewards/margins": 0.7285401225090027, - "rewards/rejected": -2.023139476776123, - "step": 1440 + "rewards/chosen": -0.9452872276306152, + "rewards/margins": 0.49813446402549744, + "rewards/rejected": -1.443421721458435, + "step": 710 }, { - "epoch": 0.76, - "learning_rate": 8.360203050172489e-07, - "logits/chosen": 0.06418957561254501, - "logits/rejected": 0.8408614993095398, - "logps/chosen": -460.03668212890625, - "logps/rejected": -426.32220458984375, - "loss": 0.5277, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.3083784580230713, - "rewards/margins": 0.6134061217308044, - "rewards/rejected": -1.9217846393585205, - "step": 1450 + "epoch": 0.75, + "learning_rate": 8.678793653740633e-07, + "logits/chosen": -0.256120890378952, + "logits/rejected": 0.14627498388290405, + "logps/chosen": -387.221435546875, + "logps/rejected": -435.5755310058594, + "loss": 0.5552, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9933661222457886, + "rewards/margins": 0.663570761680603, + "rewards/rejected": -1.6569368839263916, + "step": 720 }, { "epoch": 0.76, - "learning_rate": 8.022014682809306e-07, - "logits/chosen": 0.3110508322715759, - "logits/rejected": 0.700973629951477, - "logps/chosen": -397.7454528808594, - "logps/rejected": -399.8217468261719, - "loss": 0.5672, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.165363073348999, - "rewards/margins": 0.5710027813911438, - "rewards/rejected": -1.7363656759262085, - "step": 1460 - }, - { - "epoch": 0.77, - "learning_rate": 7.689496824624526e-07, - "logits/chosen": -0.01150925736874342, - "logits/rejected": 1.1124351024627686, - "logps/chosen": -495.40008544921875, - "logps/rejected": -471.8081970214844, - "loss": 0.5214, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.3093578815460205, - "rewards/margins": 0.9808523058891296, - "rewards/rejected": -2.290210247039795, - "step": 1470 + "learning_rate": 7.997277433690984e-07, + "logits/chosen": -0.48140692710876465, + "logits/rejected": 0.015109086409211159, + "logps/chosen": -395.16815185546875, + "logps/rejected": -368.5047302246094, + "loss": 0.5629, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8996418714523315, + "rewards/margins": 0.48376113176345825, + "rewards/rejected": -1.3834030628204346, + "step": 730 }, { "epoch": 0.77, - "learning_rate": 7.362760533881649e-07, - "logits/chosen": 0.5060569047927856, - "logits/rejected": 0.566477358341217, - "logps/chosen": -409.0811767578125, - "logps/rejected": -493.974609375, - "loss": 0.5545, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.3020764589309692, - "rewards/margins": 0.6977756023406982, - "rewards/rejected": -1.999851942062378, - "step": 1480 - }, - { - "epoch": 0.78, - "learning_rate": 7.041914937847586e-07, - "logits/chosen": -0.16839629411697388, - "logits/rejected": 0.4934666156768799, - "logps/chosen": -408.4761962890625, - "logps/rejected": -467.8258361816406, - "loss": 0.5166, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2998807430267334, - "rewards/margins": 0.7749007940292358, - "rewards/rejected": -2.0747811794281006, - "step": 1490 - }, - { - "epoch": 0.78, - "learning_rate": 6.7270671963451e-07, - "logits/chosen": 0.30948877334594727, - "logits/rejected": 0.4193127155303955, - "logps/chosen": -359.7439270019531, - "logps/rejected": -472.97705078125, - "loss": 0.5216, + "learning_rate": 7.338500848029603e-07, + "logits/chosen": -0.3346256911754608, + "logits/rejected": 0.1683504283428192, + "logps/chosen": -410.95672607421875, + "logps/rejected": -427.506591796875, + "loss": 0.5523, "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.078048825263977, - "rewards/margins": 0.7132478952407837, - "rewards/rejected": -1.791296362876892, - "step": 1500 + "rewards/chosen": -0.8928782343864441, + "rewards/margins": 0.6983044147491455, + "rewards/rejected": -1.5911829471588135, + "step": 740 }, { "epoch": 0.78, - "eval_logits/chosen": 0.4629691541194916, - "eval_logits/rejected": 0.7188760638237, - "eval_logps/chosen": -370.21026611328125, - "eval_logps/rejected": -452.12982177734375, - "eval_loss": 0.5326071977615356, - "eval_rewards/accuracies": 0.76171875, - "eval_rewards/chosen": -1.2005664110183716, - "eval_rewards/margins": 0.8027929663658142, - "eval_rewards/rejected": -2.003359317779541, - "eval_runtime": 99.5373, - "eval_samples_per_second": 20.093, - "eval_steps_per_second": 0.321, - "step": 1500 - }, - { - "epoch": 0.79, - "learning_rate": 6.418322465962234e-07, - "logits/chosen": 0.22714094817638397, - "logits/rejected": 0.8290818929672241, - "logps/chosen": -376.62322998046875, - "logps/rejected": -393.87847900390625, - "loss": 0.4974, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.1898367404937744, - "rewards/margins": 0.6498056650161743, - "rewards/rejected": -1.8396425247192383, - "step": 1510 - }, - { - "epoch": 0.8, - "learning_rate": 6.115783864930904e-07, - "logits/chosen": 0.2600030303001404, - "logits/rejected": -0.018900180235505104, - "logps/chosen": -407.98211669921875, - "logps/rejected": -459.68212890625, - "loss": 0.5115, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.1808936595916748, - "rewards/margins": 0.5539044141769409, - "rewards/rejected": -1.7347980737686157, - "step": 1520 + "learning_rate": 6.70334495204884e-07, + "logits/chosen": -0.5112616419792175, + "logits/rejected": -0.16424962878227234, + "logps/chosen": -346.962890625, + "logps/rejected": -423.4779357910156, + "loss": 0.5376, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8174928426742554, + "rewards/margins": 0.6463109850883484, + "rewards/rejected": -1.4638036489486694, + "step": 750 }, { "epoch": 0.8, - "learning_rate": 5.819552438686238e-07, - "logits/chosen": 0.11013329029083252, - "logits/rejected": 0.7540661096572876, - "logps/chosen": -472.6468811035156, - "logps/rejected": -463.4947204589844, - "loss": 0.5537, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.388946771621704, - "rewards/margins": 0.4617336690425873, - "rewards/rejected": -1.8506805896759033, - "step": 1530 - }, - { - "epoch": 0.81, - "learning_rate": 5.529727126118229e-07, - "logits/chosen": 0.35062912106513977, - "logits/rejected": 0.3931676149368286, - "logps/chosen": -419.42193603515625, - "logps/rejected": -476.36865234375, - "loss": 0.5286, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.9165588617324829, - "rewards/margins": 0.5505474209785461, - "rewards/rejected": -1.4671061038970947, - "step": 1540 + "learning_rate": 6.092659210462232e-07, + "logits/chosen": -0.33304744958877563, + "logits/rejected": -0.1934703141450882, + "logps/chosen": -360.96124267578125, + "logps/rejected": -388.1688537597656, + "loss": 0.5219, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8719509840011597, + "rewards/margins": 0.5291545987129211, + "rewards/rejected": -1.4011056423187256, + "step": 760 }, { "epoch": 0.81, - "learning_rate": 5.246404726526918e-07, - "logits/chosen": 0.8175240755081177, - "logits/rejected": 0.8358044624328613, - "logps/chosen": -309.7247009277344, - "logps/rejected": -335.03045654296875, - "loss": 0.514, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.1784462928771973, - "rewards/margins": 0.5345664620399475, - "rewards/rejected": -1.713012456893921, - "step": 1550 - }, - { - "epoch": 0.82, - "learning_rate": 4.969679867292276e-07, - "logits/chosen": 0.5007575750350952, - "logits/rejected": 0.8389018774032593, - "logps/chosen": -331.74761962890625, - "logps/rejected": -406.36187744140625, - "loss": 0.5403, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.0899267196655273, - "rewards/margins": 0.5384773015975952, - "rewards/rejected": -1.6284040212631226, - "step": 1560 + "learning_rate": 5.507260361320738e-07, + "logits/chosen": -0.3076072931289673, + "logits/rejected": -0.03598857671022415, + "logps/chosen": -417.52679443359375, + "logps/rejected": -437.1398010253906, + "loss": 0.5557, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.8676769137382507, + "rewards/margins": 0.46329760551452637, + "rewards/rejected": -1.3309745788574219, + "step": 770 }, { "epoch": 0.82, - "learning_rate": 4.699644972269332e-07, - "logits/chosen": 0.562352180480957, - "logits/rejected": 0.4318207800388336, - "logps/chosen": -398.3438720703125, - "logps/rejected": -474.5340270996094, - "loss": 0.5186, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.354994535446167, - "rewards/margins": 0.6516760587692261, - "rewards/rejected": -2.0066704750061035, - "step": 1570 - }, - { - "epoch": 0.83, - "learning_rate": 4.4363902309194656e-07, - "logits/chosen": 0.23987403512001038, - "logits/rejected": 0.34921976923942566, - "logps/chosen": -400.57708740234375, - "logps/rejected": -437.4691467285156, - "loss": 0.5214, + "learning_rate": 4.947931323697983e-07, + "logits/chosen": 0.10995174944400787, + "logits/rejected": 0.25233790278434753, + "logps/chosen": -298.9948425292969, + "logps/rejected": -337.920166015625, + "loss": 0.5422, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.2433634996414185, - "rewards/margins": 0.4714573323726654, - "rewards/rejected": -1.7148208618164062, - "step": 1580 + "rewards/chosen": -0.9167734980583191, + "rewards/margins": 0.42617493867874146, + "rewards/rejected": -1.3429481983184814, + "step": 780 }, { "epoch": 0.83, - "learning_rate": 4.1800035681877765e-07, - "logits/chosen": 0.3362785577774048, - "logits/rejected": 1.0109045505523682, - "logps/chosen": -453.203369140625, - "logps/rejected": -483.91009521484375, - "loss": 0.5224, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.5157214403152466, - "rewards/margins": 0.7080753445625305, - "rewards/rejected": -2.223797082901001, - "step": 1590 + "learning_rate": 4.4154201506053985e-07, + "logits/chosen": -0.15819688141345978, + "logits/rejected": -0.1625043898820877, + "logps/chosen": -366.3085632324219, + "logps/rejected": -413.33343505859375, + "loss": 0.5391, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.967660129070282, + "rewards/margins": 0.4664042592048645, + "rewards/rejected": -1.434064507484436, + "step": 790 }, { "epoch": 0.84, - "learning_rate": 3.9305706151369195e-07, - "logits/chosen": 0.26526278257369995, - "logits/rejected": 0.6012780070304871, - "logps/chosen": -449.33795166015625, - "logps/rejected": -431.3976135253906, - "loss": 0.4894, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.0703104734420776, - "rewards/margins": 0.8223675489425659, - "rewards/rejected": -1.8926780223846436, - "step": 1600 + "learning_rate": 3.910439028537638e-07, + "logits/chosen": -0.2914949953556061, + "logits/rejected": 0.14831864833831787, + "logps/chosen": -412.54827880859375, + "logps/rejected": -403.07647705078125, + "loss": 0.5264, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9057921171188354, + "rewards/margins": 0.6066713333129883, + "rewards/rejected": -1.5124633312225342, + "step": 800 }, { "epoch": 0.84, - "eval_logits/chosen": 0.482819139957428, - "eval_logits/rejected": 0.7405462861061096, - "eval_logps/chosen": -373.15850830078125, - "eval_logps/rejected": -457.3565368652344, - "eval_loss": 0.5327410101890564, - "eval_rewards/accuracies": 0.765625, - "eval_rewards/chosen": -1.2300488948822021, - "eval_rewards/margins": 0.8255774974822998, - "eval_rewards/rejected": -2.055626630783081, - "eval_runtime": 99.941, - "eval_samples_per_second": 20.012, + "eval_logits/chosen": -0.15486477315425873, + "eval_logits/rejected": 0.06035802513360977, + "eval_logps/chosen": -336.28326416015625, + "eval_logps/rejected": -404.0750732421875, + "eval_loss": 0.5475199818611145, + "eval_rewards/accuracies": 0.74609375, + "eval_rewards/chosen": -0.8612961769104004, + "eval_rewards/margins": 0.6615157127380371, + "eval_rewards/rejected": -1.5228118896484375, + "eval_runtime": 100.0077, + "eval_samples_per_second": 19.998, "eval_steps_per_second": 0.32, - "step": 1600 - }, - { - "epoch": 0.84, - "learning_rate": 3.688174680346976e-07, - "logits/chosen": 0.1651560515165329, - "logits/rejected": 1.0159032344818115, - "logps/chosen": -415.13720703125, - "logps/rejected": -381.99053955078125, - "loss": 0.5122, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.3641674518585205, - "rewards/margins": 0.5815728902816772, - "rewards/rejected": -1.9457403421401978, - "step": 1610 - }, - { - "epoch": 0.85, - "learning_rate": 3.4528967220911287e-07, - "logits/chosen": 0.38268962502479553, - "logits/rejected": 0.7260398268699646, - "logps/chosen": -401.5191650390625, - "logps/rejected": -462.7931213378906, - "loss": 0.5414, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.452815055847168, - "rewards/margins": 0.45942768454551697, - "rewards/rejected": -1.9122428894042969, - "step": 1620 + "step": 800 }, { "epoch": 0.85, - "learning_rate": 3.224815321296168e-07, - "logits/chosen": 0.12002329528331757, - "logits/rejected": 0.9523947834968567, - "logps/chosen": -418.4691467285156, - "logps/rejected": -426.87677001953125, - "loss": 0.5471, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.2490535974502563, - "rewards/margins": 0.8716403245925903, - "rewards/rejected": -2.1206939220428467, - "step": 1630 - }, - { - "epoch": 0.86, - "learning_rate": 3.004006655297209e-07, - "logits/chosen": 0.5747352242469788, - "logits/rejected": 0.7612552642822266, - "logps/chosen": -385.31268310546875, - "logps/rejected": -435.27960205078125, - "loss": 0.5407, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.1729180812835693, - "rewards/margins": 0.7992533445358276, - "rewards/rejected": -1.9721715450286865, - "step": 1640 + "learning_rate": 3.4336633249862084e-07, + "logits/chosen": -0.3513889014720917, + "logits/rejected": 0.16097551584243774, + "logps/chosen": -367.8076477050781, + "logps/rejected": -368.91473388671875, + "loss": 0.5401, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0032860040664673, + "rewards/margins": 0.39093494415283203, + "rewards/rejected": -1.3942210674285889, + "step": 810 }, { "epoch": 0.86, - "learning_rate": 2.7905444723949765e-07, - "logits/chosen": 0.6584094762802124, - "logits/rejected": 0.37713831663131714, - "logps/chosen": -351.07904052734375, - "logps/rejected": -470.117919921875, - "loss": 0.4971, + "learning_rate": 2.98573068519539e-07, + "logits/chosen": -0.27493131160736084, + "logits/rejected": 0.1548646241426468, + "logps/chosen": -365.89007568359375, + "logps/rejected": -377.8901672363281, + "loss": 0.5636, "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.173592209815979, - "rewards/margins": 0.6863223314285278, - "rewards/rejected": -1.8599144220352173, - "step": 1650 - }, - { - "epoch": 0.87, - "learning_rate": 2.5845000672245575e-07, - "logits/chosen": 0.3972054123878479, - "logits/rejected": 0.2828903794288635, - "logps/chosen": -398.77264404296875, - "logps/rejected": -473.8125, - "loss": 0.4996, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.435257911682129, - "rewards/margins": 0.6242495179176331, - "rewards/rejected": -2.059507369995117, - "step": 1660 + "rewards/chosen": -0.8509780764579773, + "rewards/margins": 0.6635745167732239, + "rewards/rejected": -1.5145527124404907, + "step": 820 }, { "epoch": 0.87, - "learning_rate": 2.385942256943499e-07, - "logits/chosen": 0.32778117060661316, - "logits/rejected": 0.6275647282600403, - "logps/chosen": -419.7220153808594, - "logps/rejected": -462.55078125, - "loss": 0.5608, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.428417444229126, - "rewards/margins": 0.5705910325050354, - "rewards/rejected": -1.9990084171295166, - "step": 1670 - }, - { - "epoch": 0.88, - "learning_rate": 2.1949373582475065e-07, - "logits/chosen": 0.4387190341949463, - "logits/rejected": 0.7346469759941101, - "logps/chosen": -376.1452941894531, - "logps/rejected": -415.2806701660156, - "loss": 0.5528, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.3217471837997437, - "rewards/margins": 0.45661622285842896, - "rewards/rejected": -1.778363585472107, - "step": 1680 + "learning_rate": 2.5672401793681854e-07, + "logits/chosen": -0.1434798389673233, + "logits/rejected": -0.3032146990299225, + "logps/chosen": -335.52349853515625, + "logps/rejected": -418.47979736328125, + "loss": 0.5174, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.9104019999504089, + "rewards/margins": 0.5144548416137695, + "rewards/rejected": -1.4248569011688232, + "step": 830 }, { "epoch": 0.88, - "learning_rate": 2.0115491652211271e-07, - "logits/chosen": 0.17830494046211243, - "logits/rejected": 0.9514158368110657, - "logps/chosen": -427.56378173828125, - "logps/rejected": -450.45306396484375, - "loss": 0.5492, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.1855056285858154, - "rewards/margins": 1.0056660175323486, - "rewards/rejected": -2.191171884536743, - "step": 1690 - }, - { - "epoch": 0.89, - "learning_rate": 1.8358389280311306e-07, - "logits/chosen": 0.22087359428405762, - "logits/rejected": 0.33987805247306824, - "logps/chosen": -383.9516906738281, - "logps/rejected": -411.02911376953125, - "loss": 0.5179, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.246467113494873, - "rewards/margins": 0.6251389384269714, - "rewards/rejected": -1.8716061115264893, - "step": 1700 - }, - { - "epoch": 0.89, - "eval_logits/chosen": 0.5011742115020752, - "eval_logits/rejected": 0.760403573513031, - "eval_logps/chosen": -373.2859802246094, - "eval_logps/rejected": -457.37200927734375, - "eval_loss": 0.5325719118118286, - "eval_rewards/accuracies": 0.765625, - "eval_rewards/chosen": -1.2313237190246582, - "eval_rewards/margins": 0.824457585811615, - "eval_rewards/rejected": -2.055781364440918, - "eval_runtime": 99.393, - "eval_samples_per_second": 20.122, - "eval_steps_per_second": 0.322, - "step": 1700 + "learning_rate": 2.178751501463036e-07, + "logits/chosen": -0.28891128301620483, + "logits/rejected": -0.015888934955000877, + "logps/chosen": -357.6486511230469, + "logps/rejected": -387.8254089355469, + "loss": 0.5675, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9722326993942261, + "rewards/margins": 0.40555062890052795, + "rewards/rejected": -1.3777834177017212, + "step": 840 }, { "epoch": 0.89, - "learning_rate": 1.667865332469379e-07, - "logits/chosen": 0.4138943552970886, - "logits/rejected": 0.8224202394485474, - "logps/chosen": -369.52764892578125, - "logps/rejected": -423.6180725097656, - "loss": 0.538, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.311859369277954, - "rewards/margins": 0.8248014450073242, - "rewards/rejected": -2.1366608142852783, - "step": 1710 + "learning_rate": 1.820784220652766e-07, + "logits/chosen": -0.457050621509552, + "logits/rejected": -0.08179013431072235, + "logps/chosen": -365.02935791015625, + "logps/rejected": -377.90484619140625, + "loss": 0.5464, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.8087032437324524, + "rewards/margins": 0.6943231821060181, + "rewards/rejected": -1.5030266046524048, + "step": 850 }, { "epoch": 0.9, - "learning_rate": 1.507684480352292e-07, - "logits/chosen": 0.2538720965385437, - "logits/rejected": 0.32807546854019165, - "logps/chosen": -413.42755126953125, - "logps/rejected": -484.99957275390625, - "loss": 0.5225, + "learning_rate": 1.4938170864468636e-07, + "logits/chosen": -0.2849724292755127, + "logits/rejected": -0.0881032794713974, + "logps/chosen": -353.9926452636719, + "logps/rejected": -402.39886474609375, + "loss": 0.551, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.2163430452346802, - "rewards/margins": 0.8377116322517395, - "rewards/rejected": -2.0540547370910645, - "step": 1720 - }, - { - "epoch": 0.91, - "learning_rate": 1.3553498707832763e-07, - "logits/chosen": 0.40428122878074646, - "logits/rejected": 0.49123507738113403, - "logps/chosen": -334.6162109375, - "logps/rejected": -424.0606384277344, - "loss": 0.5422, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.1268707513809204, - "rewards/margins": 0.7507694959640503, - "rewards/rejected": -1.8776404857635498, - "step": 1730 + "rewards/chosen": -0.8892512321472168, + "rewards/margins": 0.6870072484016418, + "rewards/rejected": -1.5762584209442139, + "step": 860 }, { "epoch": 0.91, - "learning_rate": 1.2109123822844653e-07, - "logits/chosen": 0.3417002558708191, - "logits/rejected": 0.7004715800285339, - "logps/chosen": -364.5450439453125, - "logps/rejected": -460.31304931640625, - "loss": 0.5338, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.8657018542289734, - "rewards/margins": 1.1912907361984253, - "rewards/rejected": -2.056992769241333, - "step": 1740 + "learning_rate": 1.1982873884064466e-07, + "logits/chosen": -0.2418334186077118, + "logits/rejected": -0.05195746570825577, + "logps/chosen": -314.6041564941406, + "logps/rejected": -390.5324401855469, + "loss": 0.5528, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6465216279029846, + "rewards/margins": 0.8042505383491516, + "rewards/rejected": -1.4507721662521362, + "step": 870 }, { "epoch": 0.92, - "learning_rate": 1.0744202558037014e-07, - "logits/chosen": -0.01776253618299961, - "logits/rejected": 0.4788368344306946, - "logps/chosen": -474.26715087890625, - "logps/rejected": -514.1068725585938, - "loss": 0.4864, + "learning_rate": 9.345903713082305e-08, + "logits/chosen": -0.3975564241409302, + "logits/rejected": -0.0711708664894104, + "logps/chosen": -384.4748229980469, + "logps/rejected": -408.9742126464844, + "loss": 0.5344, "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.3969725370407104, - "rewards/margins": 0.6193428635597229, - "rewards/rejected": -2.016315460205078, - "step": 1750 - }, - { - "epoch": 0.92, - "learning_rate": 9.459190786024696e-08, - "logits/chosen": 0.476068913936615, - "logits/rejected": 0.7961768507957458, - "logps/chosen": -385.49945068359375, - "logps/rejected": -416.2054748535156, - "loss": 0.5388, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.323490023612976, - "rewards/margins": 0.5129523277282715, - "rewards/rejected": -1.836442232131958, - "step": 1760 - }, - { - "epoch": 0.93, - "learning_rate": 8.254517690300946e-08, - "logits/chosen": 0.15927091240882874, - "logits/rejected": 0.30330100655555725, - "logps/chosen": -370.7587585449219, - "logps/rejected": -503.95654296875, - "loss": 0.5648, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.0612715482711792, - "rewards/margins": 1.1672163009643555, - "rewards/rejected": -2.228487730026245, - "step": 1770 + "rewards/chosen": -0.9061468243598938, + "rewards/margins": 0.4584124684333801, + "rewards/rejected": -1.3645591735839844, + "step": 880 }, { "epoch": 0.93, - "learning_rate": 7.13058562189381e-08, - "logits/chosen": 0.4144035875797272, - "logits/rejected": 0.5552290081977844, - "logps/chosen": -342.7335510253906, - "logps/rejected": -409.74969482421875, - "loss": 0.5131, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.2526248693466187, - "rewards/margins": 0.5200544595718384, - "rewards/rejected": -1.772679090499878, - "step": 1780 - }, - { - "epoch": 0.94, - "learning_rate": 6.087769964984058e-08, - "logits/chosen": 0.705097496509552, - "logits/rejected": 1.3561115264892578, - "logps/chosen": -428.72918701171875, - "logps/rejected": -445.7135314941406, - "loss": 0.5319, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.4447662830352783, - "rewards/margins": 0.6180745959281921, - "rewards/rejected": -2.0628409385681152, - "step": 1790 + "learning_rate": 7.030787065396866e-08, + "logits/chosen": -0.34069642424583435, + "logits/rejected": -0.19395461678504944, + "logps/chosen": -320.58380126953125, + "logps/rejected": -402.9022216796875, + "loss": 0.5508, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.795324981212616, + "rewards/margins": 0.6657498478889465, + "rewards/rejected": -1.461074948310852, + "step": 890 }, { "epoch": 0.94, - "learning_rate": 5.126419011529993e-08, - "logits/chosen": 0.38589420914649963, - "logits/rejected": 0.7930299043655396, - "logps/chosen": -413.413818359375, - "logps/rejected": -465.1924743652344, - "loss": 0.5534, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2359144687652588, - "rewards/margins": 0.5394344329833984, - "rewards/rejected": -1.7753490209579468, - "step": 1800 + "learning_rate": 5.0406202043228604e-08, + "logits/chosen": -0.0745534598827362, + "logits/rejected": 0.35236138105392456, + "logps/chosen": -382.5845642089844, + "logps/rejected": -400.34735107421875, + "loss": 0.5639, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9554711580276489, + "rewards/margins": 0.41256728768348694, + "rewards/rejected": -1.3680384159088135, + "step": 900 }, { "epoch": 0.94, - "eval_logits/chosen": 0.4956907331943512, - "eval_logits/rejected": 0.7550484538078308, - "eval_logps/chosen": -373.24365234375, - "eval_logps/rejected": -457.37786865234375, - "eval_loss": 0.5324857234954834, - "eval_rewards/accuracies": 0.765625, - "eval_rewards/chosen": -1.2308998107910156, - "eval_rewards/margins": 0.8249397277832031, - "eval_rewards/rejected": -2.0558395385742188, - "eval_runtime": 99.9672, - "eval_samples_per_second": 20.007, + "eval_logits/chosen": -0.14664390683174133, + "eval_logits/rejected": 0.07037673145532608, + "eval_logps/chosen": -336.4347839355469, + "eval_logps/rejected": -404.4688415527344, + "eval_loss": 0.5475045442581177, + "eval_rewards/accuracies": 0.7421875, + "eval_rewards/chosen": -0.8628111481666565, + "eval_rewards/margins": 0.6639385223388672, + "eval_rewards/rejected": -1.5267497301101685, + "eval_runtime": 99.9101, + "eval_samples_per_second": 20.018, "eval_steps_per_second": 0.32, - "step": 1800 - }, - { - "epoch": 0.95, - "learning_rate": 4.246853844940724e-08, - "logits/chosen": 0.599514365196228, - "logits/rejected": 0.8959542512893677, - "logps/chosen": -335.34039306640625, - "logps/rejected": -418.07080078125, - "loss": 0.5122, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.3220350742340088, - "rewards/margins": 0.5795026421546936, - "rewards/rejected": -1.9015376567840576, - "step": 1810 + "step": 900 }, { "epoch": 0.95, - "learning_rate": 3.4493682328368696e-08, - "logits/chosen": 0.3638584315776825, - "logits/rejected": 0.6814897656440735, - "logps/chosen": -426.3352966308594, - "logps/rejected": -463.9798278808594, - "loss": 0.5181, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.2269675731658936, - "rewards/margins": 0.786018967628479, - "rewards/rejected": -2.012986421585083, - "step": 1820 - }, - { - "epoch": 0.96, - "learning_rate": 2.734228528934679e-08, - "logits/chosen": 0.6217674016952515, - "logits/rejected": 1.3591673374176025, - "logps/chosen": -390.75262451171875, - "logps/rejected": -452.92840576171875, - "loss": 0.5142, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.369799017906189, - "rewards/margins": 0.7118135690689087, - "rewards/rejected": -2.0816123485565186, - "step": 1830 + "learning_rate": 3.378064801637687e-08, + "logits/chosen": -0.18566574156284332, + "logits/rejected": 0.08027581125497818, + "logps/chosen": -338.36224365234375, + "logps/rejected": -387.63946533203125, + "loss": 0.5317, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.8497447967529297, + "rewards/margins": 0.5736583471298218, + "rewards/rejected": -1.423403263092041, + "step": 910 }, { "epoch": 0.96, - "learning_rate": 2.1016735840859447e-08, - "logits/chosen": 0.09014968574047089, - "logits/rejected": 0.6385570168495178, - "logps/chosen": -421.1087951660156, - "logps/rejected": -426.6112365722656, - "loss": 0.528, + "learning_rate": 2.0453443778310766e-08, + "logits/chosen": -0.29515573382377625, + "logits/rejected": 0.24348752200603485, + "logps/chosen": -364.8608703613281, + "logps/rejected": -390.37200927734375, + "loss": 0.5321, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.2675201892852783, - "rewards/margins": 0.579815149307251, - "rewards/rejected": -1.8473352193832397, - "step": 1840 - }, - { - "epoch": 0.97, - "learning_rate": 1.551914666503812e-08, - "logits/chosen": 0.30124786496162415, - "logits/rejected": 0.8672188520431519, - "logps/chosen": -329.217529296875, - "logps/rejected": -450.61468505859375, - "loss": 0.4952, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.0372583866119385, - "rewards/margins": 1.0680160522460938, - "rewards/rejected": -2.1052744388580322, - "step": 1850 + "rewards/chosen": -0.9079607725143433, + "rewards/margins": 0.5625349283218384, + "rewards/rejected": -1.470495581626892, + "step": 920 }, { "epoch": 0.97, - "learning_rate": 1.0851353912008644e-08, - "logits/chosen": 0.28355082869529724, - "logits/rejected": 0.5023699998855591, - "logps/chosen": -317.22869873046875, - "logps/rejected": -411.1966857910156, - "loss": 0.4757, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.067916750907898, - "rewards/margins": 1.0416524410247803, - "rewards/rejected": -2.1095690727233887, - "step": 1860 - }, - { - "epoch": 0.98, - "learning_rate": 7.014916586632336e-09, - "logits/chosen": 0.004716170020401478, - "logits/rejected": 0.6306796073913574, - "logps/chosen": -464.43048095703125, - "logps/rejected": -509.4078063964844, - "loss": 0.5422, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.4036610126495361, - "rewards/margins": 0.6888295412063599, - "rewards/rejected": -2.0924906730651855, - "step": 1870 + "learning_rate": 1.0442413283435759e-08, + "logits/chosen": -0.34043288230895996, + "logits/rejected": -0.010392585769295692, + "logps/chosen": -288.91424560546875, + "logps/rejected": -375.74945068359375, + "loss": 0.51, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.7094988226890564, + "rewards/margins": 0.8463606834411621, + "rewards/rejected": -1.5558595657348633, + "step": 930 }, { "epoch": 0.98, - "learning_rate": 4.011116027811956e-09, - "logits/chosen": 0.32270532846450806, - "logits/rejected": 0.7638577818870544, - "logps/chosen": -407.8437194824219, - "logps/rejected": -471.29180908203125, - "loss": 0.4853, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3231006860733032, - "rewards/margins": 0.6853188276290894, - "rewards/rejected": -2.0084195137023926, - "step": 1880 - }, - { - "epoch": 0.99, - "learning_rate": 1.8409554805329243e-09, - "logits/chosen": 0.35667330026626587, - "logits/rejected": 0.8502413630485535, - "logps/chosen": -365.97418212890625, - "logps/rejected": -510.9142150878906, - "loss": 0.5214, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.2438390254974365, - "rewards/margins": 1.2221014499664307, - "rewards/rejected": -2.465939998626709, - "step": 1890 + "learning_rate": 3.760945397705828e-09, + "logits/chosen": -0.4917779862880707, + "logits/rejected": -0.014173048548400402, + "logps/chosen": -391.59149169921875, + "logps/rejected": -432.360107421875, + "loss": 0.5395, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9179250597953796, + "rewards/margins": 0.5526332855224609, + "rewards/rejected": -1.4705584049224854, + "step": 940 }, { "epoch": 0.99, - "learning_rate": 5.051597607894088e-10, - "logits/chosen": 0.5023200511932373, - "logits/rejected": 0.7432835698127747, - "logps/chosen": -382.82867431640625, - "logps/rejected": -409.9466857910156, - "loss": 0.5539, + "learning_rate": 4.1797599220405605e-10, + "logits/chosen": -0.1975761204957962, + "logits/rejected": 0.10586023330688477, + "logps/chosen": -334.07745361328125, + "logps/rejected": -401.03680419921875, + "loss": 0.5484, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.4910869598388672, - "rewards/margins": 0.6681244373321533, - "rewards/rejected": -2.1592116355895996, - "step": 1900 - }, - { - "epoch": 0.99, - "eval_logits/chosen": 0.5000983476638794, - "eval_logits/rejected": 0.7595670819282532, - "eval_logps/chosen": -373.40216064453125, - "eval_logps/rejected": -457.4398498535156, - "eval_loss": 0.5325239300727844, - "eval_rewards/accuracies": 0.765625, - "eval_rewards/chosen": -1.2324851751327515, - "eval_rewards/margins": 0.8239741921424866, - "eval_rewards/rejected": -2.056459426879883, - "eval_runtime": 99.4948, - "eval_samples_per_second": 20.102, - "eval_steps_per_second": 0.322, - "step": 1900 - }, - { - "epoch": 1.0, - "learning_rate": 4.175013500196112e-12, - "logits/chosen": 0.29777097702026367, - "logits/rejected": 0.5786265134811401, - "logps/chosen": -399.68646240234375, - "logps/rejected": -434.2972106933594, - "loss": 0.5322, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.2507613897323608, - "rewards/margins": 0.5890396237373352, - "rewards/rejected": -1.8398010730743408, - "step": 1910 + "rewards/chosen": -0.9642227292060852, + "rewards/margins": 0.754417359828949, + "rewards/rejected": -1.7186399698257446, + "step": 950 }, { "epoch": 1.0, - "step": 1911, + "step": 955, "total_flos": 0.0, - "train_loss": 0.5648497628454511, - "train_runtime": 7610.489, - "train_samples_per_second": 8.033, - "train_steps_per_second": 0.251 + "train_loss": 0.583915277301329, + "train_runtime": 6210.8046, + "train_samples_per_second": 9.843, + "train_steps_per_second": 0.154 } ], "logging_steps": 10, - "max_steps": 1911, + "max_steps": 955, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100,