{ "best_metric": 0.9069767594337463, "best_model_checkpoint": "./llama3/27-06-24-Weni-ZeroShot-Agents-Llama3-4.0.37-DPO_Experiment with DPO and Llama3 8B, zeroshot 4.0.37-2_max_steps-570_batch_16_2024-06-27_ppid_9/checkpoint-80", "epoch": 3.34640522875817, "eval_steps": 20, "global_step": 320, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.10457516339869281, "grad_norm": 3.655749559402466, "learning_rate": 3.3333333333333337e-06, "logits/chosen": -0.22948360443115234, "logits/rejected": -0.22978875041007996, "logps/chosen": -39.710899353027344, "logps/rejected": -39.52346420288086, "loss": 0.662, "rewards/accuracies": 0.625, "rewards/chosen": 0.11001075804233551, "rewards/margins": 0.06790003925561905, "rewards/rejected": 0.04211071878671646, "step": 10 }, { "epoch": 0.20915032679738563, "grad_norm": 2.4934024810791016, "learning_rate": 5.978260869565218e-06, "logits/chosen": -0.22906163334846497, "logits/rejected": -0.22876068949699402, "logps/chosen": -30.569751739501953, "logps/rejected": -33.86491012573242, "loss": 0.5549, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.9981435537338257, "rewards/margins": 0.38557058572769165, "rewards/rejected": 0.6125729084014893, "step": 20 }, { "epoch": 0.20915032679738563, "eval_logits/chosen": -0.19793638586997986, "eval_logits/rejected": -0.197507843375206, "eval_logps/chosen": -26.848573684692383, "eval_logps/rejected": -35.13609313964844, "eval_loss": 0.43225446343421936, "eval_rewards/accuracies": 0.8081395626068115, "eval_rewards/chosen": 1.355695366859436, "eval_rewards/margins": 0.861485481262207, "eval_rewards/rejected": 0.4942099153995514, "eval_runtime": 76.6712, "eval_samples_per_second": 2.23, "eval_steps_per_second": 1.122, "step": 20 }, { "epoch": 0.3137254901960784, "grad_norm": 1.9067373275756836, "learning_rate": 5.869565217391305e-06, "logits/chosen": -0.23883526027202606, "logits/rejected": -0.2374078780412674, "logps/chosen": -30.109268188476562, "logps/rejected": -43.55142593383789, "loss": 0.3732, "rewards/accuracies": 0.84375, "rewards/chosen": 1.0676757097244263, "rewards/margins": 1.4297826290130615, "rewards/rejected": -0.3621070086956024, "step": 30 }, { "epoch": 0.41830065359477125, "grad_norm": 3.629997730255127, "learning_rate": 5.760869565217392e-06, "logits/chosen": -0.2035980522632599, "logits/rejected": -0.20044592022895813, "logps/chosen": -28.863727569580078, "logps/rejected": -50.26195526123047, "loss": 0.3276, "rewards/accuracies": 0.84375, "rewards/chosen": 1.200588345527649, "rewards/margins": 2.2083239555358887, "rewards/rejected": -1.0077354907989502, "step": 40 }, { "epoch": 0.41830065359477125, "eval_logits/chosen": -0.16349415481090546, "eval_logits/rejected": -0.15954892337322235, "eval_logps/chosen": -24.61951446533203, "eval_logps/rejected": -48.91069412231445, "eval_loss": 0.34378084540367126, "eval_rewards/accuracies": 0.8895348906517029, "eval_rewards/chosen": 1.5786010026931763, "eval_rewards/margins": 2.4618515968322754, "eval_rewards/rejected": -0.8832504749298096, "eval_runtime": 76.7395, "eval_samples_per_second": 2.228, "eval_steps_per_second": 1.121, "step": 40 }, { "epoch": 0.5228758169934641, "grad_norm": 2.5461719036102295, "learning_rate": 5.652173913043479e-06, "logits/chosen": -0.16493651270866394, "logits/rejected": -0.16144290566444397, "logps/chosen": -24.76712989807129, "logps/rejected": -46.993003845214844, "loss": 0.3648, "rewards/accuracies": 0.875, "rewards/chosen": 1.5913469791412354, 
"rewards/margins": 2.25089955329895, "rewards/rejected": -0.6595526933670044, "step": 50 }, { "epoch": 0.6274509803921569, "grad_norm": 2.0358729362487793, "learning_rate": 5.543478260869566e-06, "logits/chosen": -0.2202371060848236, "logits/rejected": -0.21734721958637238, "logps/chosen": -26.78680419921875, "logps/rejected": -47.03938674926758, "loss": 0.3448, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 1.3999578952789307, "rewards/margins": 2.0680160522460938, "rewards/rejected": -0.6680583357810974, "step": 60 }, { "epoch": 0.6274509803921569, "eval_logits/chosen": -0.17944073677062988, "eval_logits/rejected": -0.1763658970594406, "eval_logps/chosen": -27.3469295501709, "eval_logps/rejected": -48.937984466552734, "eval_loss": 0.2700035870075226, "eval_rewards/accuracies": 0.9069767594337463, "eval_rewards/chosen": 1.3058594465255737, "eval_rewards/margins": 2.191838026046753, "eval_rewards/rejected": -0.8859787583351135, "eval_runtime": 76.7029, "eval_samples_per_second": 2.229, "eval_steps_per_second": 1.121, "step": 60 }, { "epoch": 0.7320261437908496, "grad_norm": 1.7749762535095215, "learning_rate": 5.4347826086956525e-06, "logits/chosen": -0.1932402402162552, "logits/rejected": -0.19045117497444153, "logps/chosen": -26.827350616455078, "logps/rejected": -46.88459014892578, "loss": 0.2909, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 1.3978151082992554, "rewards/margins": 2.0628795623779297, "rewards/rejected": -0.6650643348693848, "step": 70 }, { "epoch": 0.8366013071895425, "grad_norm": 2.9730594158172607, "learning_rate": 5.326086956521739e-06, "logits/chosen": -0.15139932930469513, "logits/rejected": -0.14830470085144043, "logps/chosen": -26.229084014892578, "logps/rejected": -47.40288162231445, "loss": 0.2844, "rewards/accuracies": 0.875, "rewards/chosen": 1.493841290473938, "rewards/margins": 2.2263193130493164, "rewards/rejected": -0.7324780225753784, "step": 80 }, { "epoch": 0.8366013071895425, "eval_logits/chosen": -0.15539461374282837, "eval_logits/rejected": -0.15173271298408508, "eval_logps/chosen": -30.73025894165039, "eval_logps/rejected": -54.970027923583984, "eval_loss": 0.262494832277298, "eval_rewards/accuracies": 0.9069767594337463, "eval_rewards/chosen": 0.967526912689209, "eval_rewards/margins": 2.4567105770111084, "eval_rewards/rejected": -1.4891836643218994, "eval_runtime": 76.69, "eval_samples_per_second": 2.23, "eval_steps_per_second": 1.121, "step": 80 }, { "epoch": 0.9411764705882353, "grad_norm": 2.7646617889404297, "learning_rate": 5.2173913043478265e-06, "logits/chosen": -0.17744764685630798, "logits/rejected": -0.1731792390346527, "logps/chosen": -25.179697036743164, "logps/rejected": -51.22761154174805, "loss": 0.2203, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.5833141803741455, "rewards/margins": 2.6879732608795166, "rewards/rejected": -1.104659080505371, "step": 90 }, { "epoch": 1.0457516339869282, "grad_norm": 1.8633959293365479, "learning_rate": 5.1086956521739134e-06, "logits/chosen": -0.1642988920211792, "logits/rejected": -0.1597273051738739, "logps/chosen": -23.413448333740234, "logps/rejected": -53.029579162597656, "loss": 0.3134, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.7295477390289307, "rewards/margins": 3.0015645027160645, "rewards/rejected": -1.2720168828964233, "step": 100 }, { "epoch": 1.0457516339869282, "eval_logits/chosen": -0.14374618232250214, "eval_logits/rejected": -0.13886626064777374, "eval_logps/chosen": -25.8454532623291, "eval_logps/rejected": 
-56.691768646240234, "eval_loss": 0.23542174696922302, "eval_rewards/accuracies": 0.9244186282157898, "eval_rewards/chosen": 1.456007480621338, "eval_rewards/margins": 3.1173653602600098, "eval_rewards/rejected": -1.6613577604293823, "eval_runtime": 76.7594, "eval_samples_per_second": 2.228, "eval_steps_per_second": 1.12, "step": 100 }, { "epoch": 1.1503267973856208, "grad_norm": 3.7231855392456055, "learning_rate": 5e-06, "logits/chosen": -0.17482638359069824, "logits/rejected": -0.1706036627292633, "logps/chosen": -27.381885528564453, "logps/rejected": -53.334877014160156, "loss": 0.3177, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 1.3857864141464233, "rewards/margins": 2.708588123321533, "rewards/rejected": -1.3228017091751099, "step": 110 }, { "epoch": 1.2549019607843137, "grad_norm": 5.588420391082764, "learning_rate": 4.8913043478260865e-06, "logits/chosen": -0.19949769973754883, "logits/rejected": -0.19602252542972565, "logps/chosen": -21.503002166748047, "logps/rejected": -45.17253494262695, "loss": 0.2497, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 1.9555631875991821, "rewards/margins": 2.432082414627075, "rewards/rejected": -0.47651925683021545, "step": 120 }, { "epoch": 1.2549019607843137, "eval_logits/chosen": -0.14820654690265656, "eval_logits/rejected": -0.14320875704288483, "eval_logps/chosen": -24.672155380249023, "eval_logps/rejected": -54.095088958740234, "eval_loss": 0.21676376461982727, "eval_rewards/accuracies": 0.9186046719551086, "eval_rewards/chosen": 1.5733370780944824, "eval_rewards/margins": 2.9750266075134277, "eval_rewards/rejected": -1.4016892910003662, "eval_runtime": 76.7524, "eval_samples_per_second": 2.228, "eval_steps_per_second": 1.12, "step": 120 }, { "epoch": 1.3594771241830066, "grad_norm": 4.653593063354492, "learning_rate": 4.782608695652174e-06, "logits/chosen": -0.16941645741462708, "logits/rejected": -0.16412410140037537, "logps/chosen": -28.397680282592773, "logps/rejected": -59.14925003051758, "loss": 0.2098, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.2265132665634155, "rewards/margins": 3.106191873550415, "rewards/rejected": -1.879678726196289, "step": 130 }, { "epoch": 1.4640522875816995, "grad_norm": 4.618969440460205, "learning_rate": 4.673913043478261e-06, "logits/chosen": -0.1314123570919037, "logits/rejected": -0.12528486549854279, "logps/chosen": -18.617746353149414, "logps/rejected": -51.451507568359375, "loss": 0.2442, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.209608554840088, "rewards/margins": 3.3631699085235596, "rewards/rejected": -1.1535612344741821, "step": 140 }, { "epoch": 1.4640522875816995, "eval_logits/chosen": -0.13051746785640717, "eval_logits/rejected": -0.1261546015739441, "eval_logps/chosen": -14.140735626220703, "eval_logps/rejected": -38.07098388671875, "eval_loss": 0.2653515338897705, "eval_rewards/accuracies": 0.9069767594337463, "eval_rewards/chosen": 2.626479148864746, "eval_rewards/margins": 2.425758123397827, "eval_rewards/rejected": 0.20072098076343536, "eval_runtime": 76.7224, "eval_samples_per_second": 2.229, "eval_steps_per_second": 1.121, "step": 140 }, { "epoch": 1.5686274509803921, "grad_norm": 2.419158697128296, "learning_rate": 4.565217391304348e-06, "logits/chosen": -0.14353547990322113, "logits/rejected": -0.13822032511234283, "logps/chosen": -17.764816284179688, "logps/rejected": -48.77958679199219, "loss": 0.1411, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 2.28568172454834, "rewards/margins": 3.1483988761901855, 
"rewards/rejected": -0.8627172708511353, "step": 150 }, { "epoch": 1.673202614379085, "grad_norm": 2.6137094497680664, "learning_rate": 4.456521739130434e-06, "logits/chosen": -0.0922718346118927, "logits/rejected": -0.08614876121282578, "logps/chosen": -30.0429630279541, "logps/rejected": -62.348907470703125, "loss": 0.2677, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.0795209407806396, "rewards/margins": 3.2884597778320312, "rewards/rejected": -2.2089390754699707, "step": 160 }, { "epoch": 1.673202614379085, "eval_logits/chosen": -0.10249081254005432, "eval_logits/rejected": -0.09574974328279495, "eval_logps/chosen": -22.81549835205078, "eval_logps/rejected": -60.084381103515625, "eval_loss": 0.19992607831954956, "eval_rewards/accuracies": 0.9244186282157898, "eval_rewards/chosen": 1.7590028047561646, "eval_rewards/margins": 3.75962233543396, "eval_rewards/rejected": -2.000619411468506, "eval_runtime": 76.7585, "eval_samples_per_second": 2.228, "eval_steps_per_second": 1.12, "step": 160 }, { "epoch": 1.7777777777777777, "grad_norm": 3.627882480621338, "learning_rate": 4.347826086956522e-06, "logits/chosen": -0.1151178628206253, "logits/rejected": -0.10893462598323822, "logps/chosen": -21.336753845214844, "logps/rejected": -56.53479766845703, "loss": 0.1838, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.9686607122421265, "rewards/margins": 3.594453811645508, "rewards/rejected": -1.6257928609848022, "step": 170 }, { "epoch": 1.8823529411764706, "grad_norm": 2.876845121383667, "learning_rate": 4.239130434782609e-06, "logits/chosen": -0.08697254955768585, "logits/rejected": -0.08123140037059784, "logps/chosen": -20.807594299316406, "logps/rejected": -49.710174560546875, "loss": 0.2598, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.9989045858383179, "rewards/margins": 2.9567558765411377, "rewards/rejected": -0.957851231098175, "step": 180 }, { "epoch": 1.8823529411764706, "eval_logits/chosen": -0.11206170916557312, "eval_logits/rejected": -0.1060233786702156, "eval_logps/chosen": -18.79454231262207, "eval_logps/rejected": -51.605831146240234, "eval_loss": 0.20344915986061096, "eval_rewards/accuracies": 0.930232584476471, "eval_rewards/chosen": 2.1610984802246094, "eval_rewards/margins": 3.3138630390167236, "eval_rewards/rejected": -1.152764081954956, "eval_runtime": 76.6954, "eval_samples_per_second": 2.23, "eval_steps_per_second": 1.121, "step": 180 }, { "epoch": 1.9869281045751634, "grad_norm": 4.388978004455566, "learning_rate": 4.130434782608695e-06, "logits/chosen": -0.12727566063404083, "logits/rejected": -0.12193255126476288, "logps/chosen": -20.929378509521484, "logps/rejected": -50.021419525146484, "loss": 0.3231, "rewards/accuracies": 0.875, "rewards/chosen": 1.967181921005249, "rewards/margins": 2.968064785003662, "rewards/rejected": -1.000882863998413, "step": 190 }, { "epoch": 2.0915032679738563, "grad_norm": 3.108682870864868, "learning_rate": 4.021739130434782e-06, "logits/chosen": -0.15400271117687225, "logits/rejected": -0.14889715611934662, "logps/chosen": -20.020977020263672, "logps/rejected": -49.0496940612793, "loss": 0.165, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 2.0844578742980957, "rewards/margins": 2.970615863800049, "rewards/rejected": -0.8861583471298218, "step": 200 }, { "epoch": 2.0915032679738563, "eval_logits/chosen": -0.10228858143091202, "eval_logits/rejected": -0.09614047408103943, "eval_logps/chosen": -25.265357971191406, "eval_logps/rejected": -58.75029373168945, "eval_loss": 
0.17103791236877441, "eval_rewards/accuracies": 0.9593023061752319, "eval_rewards/chosen": 1.5140167474746704, "eval_rewards/margins": 3.3812272548675537, "eval_rewards/rejected": -1.8672102689743042, "eval_runtime": 76.7398, "eval_samples_per_second": 2.228, "eval_steps_per_second": 1.121, "step": 200 }, { "epoch": 2.196078431372549, "grad_norm": 1.1581649780273438, "learning_rate": 3.91304347826087e-06, "logits/chosen": -0.10600709915161133, "logits/rejected": -0.09994350373744965, "logps/chosen": -28.068603515625, "logps/rejected": -62.65039825439453, "loss": 0.1286, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.2790197134017944, "rewards/margins": 3.5068142414093018, "rewards/rejected": -2.2277944087982178, "step": 210 }, { "epoch": 2.3006535947712417, "grad_norm": 7.228103160858154, "learning_rate": 3.804347826086957e-06, "logits/chosen": -0.1612926423549652, "logits/rejected": -0.1542719006538391, "logps/chosen": -17.81163215637207, "logps/rejected": -55.7554817199707, "loss": 0.266, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.290220260620117, "rewards/margins": 3.835573673248291, "rewards/rejected": -1.5453532934188843, "step": 220 }, { "epoch": 2.3006535947712417, "eval_logits/chosen": -0.08572749048471451, "eval_logits/rejected": -0.07834314554929733, "eval_logps/chosen": -16.123777389526367, "eval_logps/rejected": -52.568172454833984, "eval_loss": 0.2138950228691101, "eval_rewards/accuracies": 0.930232584476471, "eval_rewards/chosen": 2.4281749725341797, "eval_rewards/margins": 3.677172899246216, "eval_rewards/rejected": -1.2489980459213257, "eval_runtime": 76.6691, "eval_samples_per_second": 2.23, "eval_steps_per_second": 1.122, "step": 220 }, { "epoch": 2.4052287581699345, "grad_norm": 2.1655898094177246, "learning_rate": 3.695652173913043e-06, "logits/chosen": -0.1236579641699791, "logits/rejected": -0.11640377342700958, "logps/chosen": -18.28099822998047, "logps/rejected": -55.26251983642578, "loss": 0.1805, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 2.2774600982666016, "rewards/margins": 3.7921690940856934, "rewards/rejected": -1.5147093534469604, "step": 230 }, { "epoch": 2.5098039215686274, "grad_norm": 3.522686243057251, "learning_rate": 3.5869565217391305e-06, "logits/chosen": -0.12028801441192627, "logits/rejected": -0.11475691944360733, "logps/chosen": -20.220806121826172, "logps/rejected": -50.0861701965332, "loss": 0.2234, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.0429701805114746, "rewards/margins": 3.0199804306030273, "rewards/rejected": -0.9770105481147766, "step": 240 }, { "epoch": 2.5098039215686274, "eval_logits/chosen": -0.08792821317911148, "eval_logits/rejected": -0.08158135414123535, "eval_logps/chosen": -18.402149200439453, "eval_logps/rejected": -51.57028579711914, "eval_loss": 0.18542896211147308, "eval_rewards/accuracies": 0.9418604373931885, "eval_rewards/chosen": 2.2003378868103027, "eval_rewards/margins": 3.3495473861694336, "eval_rewards/rejected": -1.1492092609405518, "eval_runtime": 76.7548, "eval_samples_per_second": 2.228, "eval_steps_per_second": 1.12, "step": 240 }, { "epoch": 2.6143790849673203, "grad_norm": 2.4707725048065186, "learning_rate": 3.4782608695652175e-06, "logits/chosen": -0.09261623024940491, "logits/rejected": -0.08608702570199966, "logps/chosen": -20.839956283569336, "logps/rejected": -55.0688362121582, "loss": 0.152, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 1.9972784519195557, "rewards/margins": 3.4800357818603516, "rewards/rejected": 
-1.482757568359375, "step": 250 }, { "epoch": 2.718954248366013, "grad_norm": 3.7463388442993164, "learning_rate": 3.369565217391305e-06, "logits/chosen": -0.10273708403110504, "logits/rejected": -0.09585189074277878, "logps/chosen": -27.3509521484375, "logps/rejected": -63.99445724487305, "loss": 0.1878, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 1.3538360595703125, "rewards/margins": 3.7591099739074707, "rewards/rejected": -2.405273914337158, "step": 260 }, { "epoch": 2.718954248366013, "eval_logits/chosen": -0.06562437862157822, "eval_logits/rejected": -0.058443356305360794, "eval_logps/chosen": -29.113916397094727, "eval_logps/rejected": -68.89017486572266, "eval_loss": 0.15422259271144867, "eval_rewards/accuracies": 0.9534883499145508, "eval_rewards/chosen": 1.12916100025177, "eval_rewards/margins": 4.010359287261963, "eval_rewards/rejected": -2.881197929382324, "eval_runtime": 76.7159, "eval_samples_per_second": 2.229, "eval_steps_per_second": 1.121, "step": 260 }, { "epoch": 2.8235294117647056, "grad_norm": 2.4779317378997803, "learning_rate": 3.260869565217391e-06, "logits/chosen": -0.08126804977655411, "logits/rejected": -0.07377848774194717, "logps/chosen": -29.53684425354004, "logps/rejected": -69.46771240234375, "loss": 0.1214, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.1481727361679077, "rewards/margins": 4.097565650939941, "rewards/rejected": -2.9493932723999023, "step": 270 }, { "epoch": 2.928104575163399, "grad_norm": 5.711748123168945, "learning_rate": 3.1521739130434784e-06, "logits/chosen": -0.055128227919340134, "logits/rejected": -0.047185707837343216, "logps/chosen": -16.9158935546875, "logps/rejected": -55.01350784301758, "loss": 0.1515, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 2.360804796218872, "rewards/margins": 3.8613457679748535, "rewards/rejected": -1.5005409717559814, "step": 280 }, { "epoch": 2.928104575163399, "eval_logits/chosen": -0.06331682205200195, "eval_logits/rejected": -0.05519242212176323, "eval_logps/chosen": -14.443292617797852, "eval_logps/rejected": -52.58645248413086, "eval_loss": 0.20675985515117645, "eval_rewards/accuracies": 0.9418604373931885, "eval_rewards/chosen": 2.5962235927581787, "eval_rewards/margins": 3.8470497131347656, "eval_rewards/rejected": -1.2508265972137451, "eval_runtime": 76.6773, "eval_samples_per_second": 2.23, "eval_steps_per_second": 1.122, "step": 280 }, { "epoch": 3.0326797385620914, "grad_norm": NaN, "learning_rate": 3.054347826086957e-06, "logits/chosen": -0.07114674150943756, "logits/rejected": -0.0620611310005188, "logps/chosen": -16.190364837646484, "logps/rejected": -58.654823303222656, "loss": 0.1621, "rewards/accuracies": 0.9375, "rewards/chosen": 2.476926565170288, "rewards/margins": 4.345733165740967, "rewards/rejected": -1.86880624294281, "step": 290 }, { "epoch": 3.1372549019607843, "grad_norm": 3.2887370586395264, "learning_rate": 2.9456521739130436e-06, "logits/chosen": -0.06162800267338753, "logits/rejected": -0.05298132449388504, "logps/chosen": -17.659170150756836, "logps/rejected": -59.8099250793457, "loss": 0.1259, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 2.3060736656188965, "rewards/margins": 4.271978855133057, "rewards/rejected": -1.965904951095581, "step": 300 }, { "epoch": 3.1372549019607843, "eval_logits/chosen": -0.05268235132098198, "eval_logits/rejected": -0.04376488924026489, "eval_logps/chosen": -14.083008766174316, "eval_logps/rejected": -54.79096984863281, "eval_loss": 0.15972751379013062, "eval_rewards/accuracies": 
0.9476743936538696, "eval_rewards/chosen": 2.632251739501953, "eval_rewards/margins": 4.10352897644043, "eval_rewards/rejected": -1.4712772369384766, "eval_runtime": 76.6816, "eval_samples_per_second": 2.23, "eval_steps_per_second": 1.122, "step": 300 }, { "epoch": 3.241830065359477, "grad_norm": 1.3481501340866089, "learning_rate": 2.8369565217391305e-06, "logits/chosen": -0.09273257106542587, "logits/rejected": -0.08304957300424576, "logps/chosen": -14.50407886505127, "logps/rejected": -60.208778381347656, "loss": 0.0975, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.651323080062866, "rewards/margins": 4.6572089195251465, "rewards/rejected": -2.005885601043701, "step": 310 }, { "epoch": 3.34640522875817, "grad_norm": 1.853318691253662, "learning_rate": 2.7282608695652175e-06, "logits/chosen": -0.10112150758504868, "logits/rejected": -0.09216316789388657, "logps/chosen": -17.982646942138672, "logps/rejected": -64.50444793701172, "loss": 0.1342, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 2.268524408340454, "rewards/margins": 4.688179969787598, "rewards/rejected": -2.4196553230285645, "step": 320 }, { "epoch": 3.34640522875817, "eval_logits/chosen": -0.04029928892850876, "eval_logits/rejected": -0.030180998146533966, "eval_logps/chosen": -16.03810691833496, "eval_logps/rejected": -63.949459075927734, "eval_loss": 0.15685829520225525, "eval_rewards/accuracies": 0.9476743936538696, "eval_rewards/chosen": 2.436742067337036, "eval_rewards/margins": 4.8238677978515625, "eval_rewards/rejected": -2.3871262073516846, "eval_runtime": 76.6972, "eval_samples_per_second": 2.23, "eval_steps_per_second": 1.121, "step": 320 } ], "logging_steps": 10, "max_steps": 570, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 80, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }