diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,21 +1,21 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.9996190476190476, + "epoch": 1.996400719856029, "eval_steps": 500, - "global_step": 1312, + "global_step": 832, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, - "learning_rate": 3.787878787878788e-08, - "logits/chosen": 0.17224127054214478, - "logits/rejected": 0.18124699592590332, - "logps/chosen": -379.32623291015625, - "logps/rejected": -349.5926208496094, - "loss": 0.2902, + "learning_rate": 5.952380952380953e-08, + "logits/chosen": 0.11703574657440186, + "logits/rejected": 0.3661181330680847, + "logps/chosen": -218.64993286132812, + "logps/rejected": -191.34808349609375, + "loss": 0.3408, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, @@ -23,1853 +23,1181 @@ "step": 1 }, { - "epoch": 0.01, - "learning_rate": 3.787878787878788e-07, - "logits/chosen": 0.07856506109237671, - "logits/rejected": 0.2510358691215515, - "logps/chosen": -334.9958190917969, - "logps/rejected": -283.54034423828125, - "loss": 0.3745, - "rewards/accuracies": 0.4375, - "rewards/chosen": 1.0113296411873307e-05, - "rewards/margins": 5.3244151786202565e-05, - "rewards/rejected": -4.313084718887694e-05, + "epoch": 0.02, + "learning_rate": 5.952380952380953e-07, + "logits/chosen": 0.10404814779758453, + "logits/rejected": 0.23778128623962402, + "logps/chosen": -401.4896240234375, + "logps/rejected": -345.9862976074219, + "loss": 0.3642, + "rewards/accuracies": 0.4791666567325592, + "rewards/chosen": 0.0004916194593533874, + "rewards/margins": 0.0005594216636382043, + "rewards/rejected": -6.780229159630835e-05, "step": 10 }, { - "epoch": 0.02, - "learning_rate": 7.575757575757576e-07, - "logits/chosen": 0.07835443317890167, - "logits/rejected": 0.23690445721149445, - "logps/chosen": -343.5411682128906, - "logps/rejected": -300.49774169921875, - "loss": 0.345, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": 9.991687193178223e-07, - "rewards/margins": 5.578011041507125e-05, - "rewards/rejected": -5.478093953570351e-05, + "epoch": 0.05, + "learning_rate": 1.1904761904761906e-06, + "logits/chosen": 0.13218173384666443, + "logits/rejected": 0.20688870549201965, + "logps/chosen": -336.506591796875, + "logps/rejected": -319.3189392089844, + "loss": 0.3689, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": 0.00020826223772019148, + "rewards/margins": -0.000311180017888546, + "rewards/rejected": 0.0005194421974010766, "step": 20 }, { - "epoch": 0.02, - "learning_rate": 1.1363636363636364e-06, - "logits/chosen": 0.09362699836492538, - "logits/rejected": 0.24969105422496796, - "logps/chosen": -384.11199951171875, - "logps/rejected": -290.873779296875, - "loss": 0.3667, - "rewards/accuracies": 0.53125, - "rewards/chosen": 0.00015662855003029108, - "rewards/margins": 9.987165867642034e-06, - "rewards/rejected": 0.0001466413668822497, + "epoch": 0.07, + "learning_rate": 1.7857142857142859e-06, + "logits/chosen": 0.11459924280643463, + "logits/rejected": 0.1922653764486313, + "logps/chosen": -342.02569580078125, + "logps/rejected": -324.1275939941406, + "loss": 0.3786, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.0006439354037865996, + "rewards/margins": 0.0004738263669423759, + "rewards/rejected": -0.0011177618289366364, "step": 30 }, { - "epoch": 0.03, - "learning_rate": 1.5151515151515152e-06, - "logits/chosen": 0.08275317400693893, - "logits/rejected": 0.20892572402954102, - "logps/chosen": -361.24462890625, - "logps/rejected": -296.53094482421875, - "loss": 0.3012, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": 0.00019404105842113495, - "rewards/margins": 0.000132537359604612, - "rewards/rejected": 6.150371336843818e-05, + "epoch": 0.1, + "learning_rate": 2.380952380952381e-06, + "logits/chosen": 0.13577614724636078, + "logits/rejected": 0.17847472429275513, + "logps/chosen": -298.6214294433594, + "logps/rejected": -289.40850830078125, + "loss": 0.3689, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.0008146329782903194, + "rewards/margins": 0.0024004268925637007, + "rewards/rejected": -0.001585794030688703, "step": 40 }, { - "epoch": 0.04, - "learning_rate": 1.8939393939393941e-06, - "logits/chosen": 0.14996236562728882, - "logits/rejected": 0.2148081511259079, - "logps/chosen": -339.32916259765625, - "logps/rejected": -292.900146484375, - "loss": 0.3391, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": 0.0002820778754539788, - "rewards/margins": 0.00035016084439121187, - "rewards/rejected": -6.80829762131907e-05, + "epoch": 0.12, + "learning_rate": 2.9761904761904763e-06, + "logits/chosen": 0.10261678695678711, + "logits/rejected": 0.20306341350078583, + "logps/chosen": -351.93572998046875, + "logps/rejected": -362.153564453125, + "loss": 0.3692, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0009010158246383071, + "rewards/margins": 0.004100508522242308, + "rewards/rejected": -0.003199493046849966, "step": 50 }, { - "epoch": 0.05, - "learning_rate": 2.2727272727272728e-06, - "logits/chosen": 0.09292325377464294, - "logits/rejected": 0.25342074036598206, - "logps/chosen": -357.79498291015625, - "logps/rejected": -279.56475830078125, - "loss": 0.3523, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": 0.0005591081571765244, - "rewards/margins": 0.0007476316532120109, - "rewards/rejected": -0.00018852358334697783, + "epoch": 0.14, + "learning_rate": 3.5714285714285718e-06, + "logits/chosen": 0.13770776987075806, + "logits/rejected": 0.2188442498445511, + "logps/chosen": -349.51690673828125, + "logps/rejected": -351.1549377441406, + "loss": 0.3655, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.003258631331846118, + "rewards/margins": 0.007584023289382458, + "rewards/rejected": -0.004325392190366983, "step": 60 }, { - "epoch": 0.05, - "learning_rate": 2.6515151515151514e-06, - "logits/chosen": 0.1520170122385025, - "logits/rejected": 0.23838527500629425, - "logps/chosen": -344.97381591796875, - "logps/rejected": -272.6285400390625, - "loss": 0.333, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.000808493874501437, - "rewards/margins": 0.0009522804175503552, - "rewards/rejected": -0.0001437865139450878, + "epoch": 0.17, + "learning_rate": 4.166666666666667e-06, + "logits/chosen": 0.1271902620792389, + "logits/rejected": 0.23070549964904785, + "logps/chosen": -378.33843994140625, + "logps/rejected": -350.60662841796875, + "loss": 0.3586, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.006277147680521011, + "rewards/margins": 0.015207210555672646, + "rewards/rejected": -0.00893006194382906, "step": 70 }, { - "epoch": 0.06, - "learning_rate": 3.0303030303030305e-06, - "logits/chosen": 0.10541319847106934, - "logits/rejected": 0.2519112229347229, - "logps/chosen": -337.99932861328125, - "logps/rejected": -285.3942565917969, - "loss": 0.3276, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": 0.0017709163948893547, - "rewards/margins": 0.0019623935222625732, - "rewards/rejected": -0.00019147712737321854, + "epoch": 0.19, + "learning_rate": 4.761904761904762e-06, + "logits/chosen": 0.08625562489032745, + "logits/rejected": 0.12316304445266724, + "logps/chosen": -307.9439697265625, + "logps/rejected": -335.3281555175781, + "loss": 0.3489, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.013669237494468689, + "rewards/margins": 0.02226843498647213, + "rewards/rejected": -0.008599198423326015, "step": 80 }, { - "epoch": 0.07, - "learning_rate": 3.409090909090909e-06, - "logits/chosen": 0.08055099099874496, - "logits/rejected": 0.23055055737495422, - "logps/chosen": -358.501220703125, - "logps/rejected": -301.07373046875, - "loss": 0.325, - "rewards/accuracies": 0.71875, - "rewards/chosen": 0.0031805038452148438, - "rewards/margins": 0.0035764030180871487, - "rewards/rejected": -0.0003958995803259313, + "epoch": 0.22, + "learning_rate": 4.9992062457191005e-06, + "logits/chosen": 0.137899249792099, + "logits/rejected": 0.2165641039609909, + "logps/chosen": -355.6449890136719, + "logps/rejected": -338.1387634277344, + "loss": 0.3229, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.013719858601689339, + "rewards/margins": 0.042457275092601776, + "rewards/rejected": -0.028737416490912437, "step": 90 }, { - "epoch": 0.08, - "learning_rate": 3.7878787878787882e-06, - "logits/chosen": 0.1381727159023285, - "logits/rejected": 0.36128586530685425, - "logps/chosen": -408.2177734375, - "logps/rejected": -295.70391845703125, - "loss": 0.3551, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.005324442870914936, - "rewards/margins": 0.006567128002643585, - "rewards/rejected": -0.00124268583022058, + "epoch": 0.24, + "learning_rate": 4.994357350311441e-06, + "logits/chosen": 0.14011432230472565, + "logits/rejected": 0.21795734763145447, + "logps/chosen": -360.2173156738281, + "logps/rejected": -358.1722717285156, + "loss": 0.3043, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.001885895850136876, + "rewards/margins": 0.06035756319761276, + "rewards/rejected": -0.06224345415830612, "step": 100 }, { - "epoch": 0.08, - "learning_rate": 4.166666666666667e-06, - "logits/chosen": 0.11063258349895477, - "logits/rejected": 0.3397473692893982, - "logps/chosen": -365.5294494628906, - "logps/rejected": -296.68212890625, - "loss": 0.3176, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.0065561323426663876, - "rewards/margins": 0.009804257191717625, - "rewards/rejected": -0.0032481239177286625, + "epoch": 0.26, + "learning_rate": 4.98510907587894e-06, + "logits/chosen": 0.13077042996883392, + "logits/rejected": 0.21840377151966095, + "logps/chosen": -356.6605224609375, + "logps/rejected": -348.19476318359375, + "loss": 0.3169, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.008259604685008526, + "rewards/margins": 0.08336080610752106, + "rewards/rejected": -0.09162042289972305, "step": 110 }, { - "epoch": 0.09, - "learning_rate": 4.5454545454545455e-06, - "logits/chosen": 0.10760724544525146, - "logits/rejected": 0.3315739035606384, - "logps/chosen": -369.8674011230469, - "logps/rejected": -290.553955078125, - "loss": 0.2862, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": 0.009247648529708385, - "rewards/margins": 0.017721759155392647, - "rewards/rejected": -0.008474110625684261, + "epoch": 0.29, + "learning_rate": 4.97147773390341e-06, + "logits/chosen": 0.14791826903820038, + "logits/rejected": 0.1786331683397293, + "logps/chosen": -320.29608154296875, + "logps/rejected": -337.16864013671875, + "loss": 0.2861, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.008191597647964954, + "rewards/margins": 0.09656454622745514, + "rewards/rejected": -0.08837294578552246, "step": 120 }, { - "epoch": 0.1, - "learning_rate": 4.924242424242425e-06, - "logits/chosen": 0.19570864737033844, - "logits/rejected": 0.29995661973953247, - "logps/chosen": -357.7752685546875, - "logps/rejected": -298.80865478515625, - "loss": 0.3091, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": 0.0016540288925170898, - "rewards/margins": 0.030770784243941307, - "rewards/rejected": -0.029116755351424217, + "epoch": 0.31, + "learning_rate": 4.953487366425163e-06, + "logits/chosen": 0.12249626964330673, + "logits/rejected": 0.16907112300395966, + "logps/chosen": -342.0648498535156, + "logps/rejected": -363.51031494140625, + "loss": 0.3175, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.0017940097022801638, + "rewards/margins": 0.07947574555873871, + "rewards/rejected": -0.07768173515796661, "step": 130 }, { - "epoch": 0.11, - "learning_rate": 4.999432965739786e-06, - "logits/chosen": 0.18279646337032318, - "logits/rejected": 0.24193449318408966, - "logps/chosen": -326.20623779296875, - "logps/rejected": -340.66204833984375, - "loss": 0.3146, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.02705274522304535, - "rewards/margins": 0.038378529250621796, - "rewards/rejected": -0.06543128192424774, + "epoch": 0.34, + "learning_rate": 4.931169703639282e-06, + "logits/chosen": 0.0919104740023613, + "logits/rejected": 0.18652714788913727, + "logps/chosen": -337.65374755859375, + "logps/rejected": -364.11199951171875, + "loss": 0.2828, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": 0.03137553483247757, + "rewards/margins": 0.12489553540945053, + "rewards/rejected": -0.09352000057697296, "step": 140 }, { - "epoch": 0.11, - "learning_rate": 4.997129829895409e-06, - "logits/chosen": 0.14213022589683533, - "logits/rejected": 0.2512076199054718, - "logps/chosen": -408.73541259765625, - "logps/rejected": -432.90771484375, - "loss": 0.2979, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.0731552243232727, - "rewards/margins": 0.08651129901409149, - "rewards/rejected": -0.159666508436203, + "epoch": 0.36, + "learning_rate": 4.904564107932048e-06, + "logits/chosen": 0.13001379370689392, + "logits/rejected": 0.20237913727760315, + "logps/chosen": -351.857421875, + "logps/rejected": -336.6232604980469, + "loss": 0.2899, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.0018621661001816392, + "rewards/margins": 0.10416339337825775, + "rewards/rejected": -0.10602555423974991, "step": 150 }, { - "epoch": 0.12, - "learning_rate": 4.9930567839810125e-06, - "logits/chosen": 0.17261534929275513, - "logits/rejected": 0.27892106771469116, - "logps/chosen": -483.6229553222656, - "logps/rejected": -527.606201171875, - "loss": 0.2727, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.121568962931633, - "rewards/margins": 0.10641299188137054, - "rewards/rejected": -0.22798196971416473, + "epoch": 0.38, + "learning_rate": 4.873717504456219e-06, + "logits/chosen": 0.06932858377695084, + "logits/rejected": 0.15127311646938324, + "logps/chosen": -345.0473937988281, + "logps/rejected": -363.4601745605469, + "loss": 0.2889, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.01222093403339386, + "rewards/margins": 0.11404307186603546, + "rewards/rejected": -0.12626400589942932, "step": 160 }, { - "epoch": 0.13, - "learning_rate": 4.987216714880929e-06, - "logits/chosen": 0.19240622222423553, - "logits/rejected": 0.22908082604408264, - "logps/chosen": -516.4906005859375, - "logps/rejected": -531.3421020507812, - "loss": 0.278, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.17584487795829773, - "rewards/margins": 0.10210248082876205, - "rewards/rejected": -0.27794739603996277, + "epoch": 0.41, + "learning_rate": 4.838684298367616e-06, + "logits/chosen": 0.16357803344726562, + "logits/rejected": 0.23174886405467987, + "logps/chosen": -357.15289306640625, + "logps/rejected": -358.61065673828125, + "loss": 0.2884, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.017432499676942825, + "rewards/margins": 0.11707814782857895, + "rewards/rejected": -0.09964564442634583, "step": 170 }, { - "epoch": 0.14, - "learning_rate": 4.979613761906212e-06, - "logits/chosen": 0.12470928579568863, - "logits/rejected": 0.2596343755722046, - "logps/chosen": -551.5357666015625, - "logps/rejected": -654.6400756835938, - "loss": 0.2664, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.21089038252830505, - "rewards/margins": 0.15304332971572876, - "rewards/rejected": -0.3639337420463562, + "epoch": 0.43, + "learning_rate": 4.7995262788689865e-06, + "logits/chosen": 0.16258656978607178, + "logits/rejected": 0.2536885738372803, + "logps/chosen": -337.7535705566406, + "logps/rejected": -346.13470458984375, + "loss": 0.2789, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.02853301540017128, + "rewards/margins": 0.1016291156411171, + "rewards/rejected": -0.07309609651565552, "step": 180 }, { - "epoch": 0.14, - "learning_rate": 4.970253313860788e-06, - "logits/chosen": 0.1833106130361557, - "logits/rejected": 0.2698153257369995, - "logps/chosen": -562.19677734375, - "logps/rejected": -634.8347778320312, - "loss": 0.2832, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.2445872575044632, - "rewards/margins": 0.11998845636844635, - "rewards/rejected": -0.36457571387290955, + "epoch": 0.46, + "learning_rate": 4.756312510230377e-06, + "logits/chosen": 0.14243337512016296, + "logits/rejected": 0.24410876631736755, + "logps/chosen": -376.64599609375, + "logps/rejected": -363.4615478515625, + "loss": 0.2828, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.03516390174627304, + "rewards/margins": 0.12421919405460358, + "rewards/rejected": -0.08905528485774994, "step": 190 }, { - "epoch": 0.15, - "learning_rate": 4.959142005221991e-06, - "logits/chosen": 0.13080090284347534, - "logits/rejected": 0.21319513022899628, - "logps/chosen": -603.390380859375, - "logps/rejected": -718.3425903320312, - "loss": 0.2595, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -0.25654521584510803, - "rewards/margins": 0.16353540122509003, - "rewards/rejected": -0.42008060216903687, + "epoch": 0.48, + "learning_rate": 4.709119209978242e-06, + "logits/chosen": 0.17320121824741364, + "logits/rejected": 0.2264091521501541, + "logps/chosen": -362.0121765136719, + "logps/rejected": -352.7041931152344, + "loss": 0.283, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.039128489792346954, + "rewards/margins": 0.11708054691553116, + "rewards/rejected": -0.07795204222202301, "step": 200 }, { - "epoch": 0.16, - "learning_rate": 4.94628771143819e-06, - "logits/chosen": 0.17947080731391907, - "logits/rejected": 0.2654314935207367, - "logps/chosen": -653.5524291992188, - "logps/rejected": -726.80126953125, - "loss": 0.3026, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.2890639305114746, - "rewards/margins": 0.12797169387340546, - "rewards/rejected": -0.41703566908836365, + "epoch": 0.5, + "learning_rate": 4.6580296144681155e-06, + "logits/chosen": 0.1604190617799759, + "logits/rejected": 0.17792078852653503, + "logps/chosen": -315.1614074707031, + "logps/rejected": -340.53619384765625, + "loss": 0.2754, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": 0.05531097203493118, + "rewards/margins": 0.15012916922569275, + "rewards/rejected": -0.09481821954250336, "step": 210 }, { - "epoch": 0.17, - "learning_rate": 4.931699543346854e-06, - "logits/chosen": 0.1152615174651146, - "logits/rejected": 0.25124651193618774, - "logps/chosen": -588.23046875, - "logps/rejected": -704.0632934570312, - "loss": 0.2673, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.2535329759120941, - "rewards/margins": 0.17576026916503906, - "rewards/rejected": -0.4292932152748108, + "epoch": 0.53, + "learning_rate": 4.603133832077953e-06, + "logits/chosen": 0.11915634572505951, + "logits/rejected": 0.15653367340564728, + "logps/chosen": -351.16986083984375, + "logps/rejected": -354.53607177734375, + "loss": 0.2738, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": 0.06388933956623077, + "rewards/margins": 0.1507207453250885, + "rewards/rejected": -0.08683140575885773, "step": 220 }, { - "epoch": 0.18, - "learning_rate": 4.9153878407169815e-06, - "logits/chosen": 0.13138779997825623, - "logits/rejected": 0.1749398410320282, - "logps/chosen": -536.669921875, - "logps/rejected": -619.0616455078125, - "loss": 0.2609, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.2183808982372284, - "rewards/margins": 0.12902414798736572, - "rewards/rejected": -0.3474050462245941, + "epoch": 0.55, + "learning_rate": 4.544528684281056e-06, + "logits/chosen": 0.09443524479866028, + "logits/rejected": 0.1415812075138092, + "logps/chosen": -355.2025451660156, + "logps/rejected": -349.1300354003906, + "loss": 0.276, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.021877283230423927, + "rewards/margins": 0.1259470283985138, + "rewards/rejected": -0.10406973212957382, "step": 230 }, { - "epoch": 0.18, - "learning_rate": 4.897364164920515e-06, - "logits/chosen": 0.14722837507724762, - "logits/rejected": 0.2979207932949066, - "logps/chosen": -644.4813232421875, - "logps/rejected": -710.7215576171875, - "loss": 0.2364, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.2495686262845993, - "rewards/margins": 0.14611390233039856, - "rewards/rejected": -0.39568251371383667, + "epoch": 0.58, + "learning_rate": 4.482317534878901e-06, + "logits/chosen": 0.08314280211925507, + "logits/rejected": 0.11439633369445801, + "logps/chosen": -333.59295654296875, + "logps/rejected": -341.5171203613281, + "loss": 0.2668, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.016557829454541206, + "rewards/margins": 0.11629464477300644, + "rewards/rejected": -0.09973680973052979, "step": 240 }, { - "epoch": 0.19, - "learning_rate": 4.8776412907378845e-06, - "logits/chosen": 0.14406076073646545, - "logits/rejected": 0.24922068417072296, - "logps/chosen": -602.2944946289062, - "logps/rejected": -689.9060668945312, - "loss": 0.2803, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.25432199239730835, - "rewards/margins": 0.14366620779037476, - "rewards/rejected": -0.3979881703853607, + "epoch": 0.6, + "learning_rate": 4.416610107695043e-06, + "logits/chosen": 0.11690554767847061, + "logits/rejected": 0.06475332379341125, + "logps/chosen": -331.7200012207031, + "logps/rejected": -341.45245361328125, + "loss": 0.2819, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.025893816724419594, + "rewards/margins": 0.13103850185871124, + "rewards/rejected": -0.15693232417106628, "step": 250 }, { - "epoch": 0.2, - "learning_rate": 4.8562331973035396e-06, - "logits/chosen": 0.14575393497943878, - "logits/rejected": 0.28581660985946655, - "logps/chosen": -567.1799926757812, - "logps/rejected": -649.4906005859375, - "loss": 0.2541, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.21804265677928925, - "rewards/margins": 0.13621987402439117, - "rewards/rejected": -0.3542625308036804, + "epoch": 0.62, + "learning_rate": 4.3475222930516484e-06, + "logits/chosen": 0.08940346539020538, + "logits/rejected": 0.12766343355178833, + "logps/chosen": -333.33343505859375, + "logps/rejected": -372.55755615234375, + "loss": 0.2833, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.026656050235033035, + "rewards/margins": 0.16600963473320007, + "rewards/rejected": -0.19266566634178162, "step": 260 }, { - "epoch": 0.21, - "learning_rate": 4.833155058197842e-06, - "logits/chosen": 0.20403075218200684, - "logits/rejected": 0.32390326261520386, - "logps/chosen": -611.0584106445312, - "logps/rejected": -643.7525634765625, - "loss": 0.2827, + "epoch": 0.65, + "learning_rate": 4.2751759433699745e-06, + "logits/chosen": 0.04847298935055733, + "logits/rejected": 0.11083607375621796, + "logps/chosen": -342.9352722167969, + "logps/rejected": -357.6617736816406, + "loss": 0.274, "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.23078171908855438, - "rewards/margins": 0.12669074535369873, - "rewards/rejected": -0.3574724495410919, + "rewards/chosen": -0.021627375856041908, + "rewards/margins": 0.12919363379478455, + "rewards/rejected": -0.1508210003376007, "step": 270 }, { - "epoch": 0.21, - "learning_rate": 4.808423230692374e-06, - "logits/chosen": 0.16620513796806335, - "logits/rejected": 0.3157016634941101, - "logps/chosen": -557.9627075195312, - "logps/rejected": -638.0758666992188, - "loss": 0.2304, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -0.1864185631275177, - "rewards/margins": 0.15307244658470154, - "rewards/rejected": -0.33949097990989685, + "epoch": 0.67, + "learning_rate": 4.199698658255298e-06, + "logits/chosen": 0.056878913193941116, + "logits/rejected": 0.14858202636241913, + "logps/chosen": -370.22637939453125, + "logps/rejected": -398.57159423828125, + "loss": 0.2715, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.021515587344765663, + "rewards/margins": 0.1492767035961151, + "rewards/rejected": -0.17079228162765503, "step": 280 }, { - "epoch": 0.22, - "learning_rate": 4.7820552441562625e-06, - "logits/chosen": 0.18919572234153748, - "logits/rejected": 0.24673417210578918, - "logps/chosen": -532.9969482421875, - "logps/rejected": -591.529296875, - "loss": 0.2764, + "epoch": 0.7, + "learning_rate": 4.121223559445343e-06, + "logits/chosen": 0.03415738046169281, + "logits/rejected": 0.12577436864376068, + "logps/chosen": -352.68072509765625, + "logps/rejected": -383.16204833984375, + "loss": 0.264, "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.19002854824066162, - "rewards/margins": 0.12101318687200546, - "rewards/rejected": -0.3110417425632477, + "rewards/chosen": -0.03958406671881676, + "rewards/margins": 0.1690487265586853, + "rewards/rejected": -0.20863279700279236, "step": 290 }, { - "epoch": 0.23, - "learning_rate": 4.754069787631761e-06, - "logits/chosen": 0.15965518355369568, - "logits/rejected": 0.3048693537712097, - "logps/chosen": -550.864990234375, - "logps/rejected": -679.41357421875, - "loss": 0.25, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.21647436916828156, - "rewards/margins": 0.16393651068210602, - "rewards/rejected": -0.3804108500480652, + "epoch": 0.72, + "learning_rate": 4.039889056019159e-06, + "logits/chosen": 0.02515377476811409, + "logits/rejected": 0.10390216112136841, + "logps/chosen": -353.2736511230469, + "logps/rejected": -353.888671875, + "loss": 0.2461, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.031048249453306198, + "rewards/margins": 0.1348181664943695, + "rewards/rejected": -0.1658664047718048, "step": 300 }, { - "epoch": 0.24, - "learning_rate": 4.724486696587862e-06, - "logits/chosen": 0.17140202224254608, - "logits/rejected": 0.21409063041210175, - "logps/chosen": -635.85791015625, - "logps/rejected": -747.188232421875, - "loss": 0.258, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.29053109884262085, - "rewards/margins": 0.15401865541934967, - "rewards/rejected": -0.4445497393608093, + "epoch": 0.74, + "learning_rate": 3.955838600280535e-06, + "logits/chosen": 0.025213222950696945, + "logits/rejected": 0.1410323679447174, + "logps/chosen": -387.21856689453125, + "logps/rejected": -373.70355224609375, + "loss": 0.2703, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.010617800056934357, + "rewards/margins": 0.19538867473602295, + "rewards/rejected": -0.184770867228508, "step": 310 }, { - "epoch": 0.24, - "learning_rate": 4.693326938861367e-06, - "logits/chosen": 0.19796046614646912, - "logits/rejected": 0.25672799348831177, - "logps/chosen": -643.25537109375, - "logps/rejected": -801.0643310546875, - "loss": 0.2427, + "epoch": 0.77, + "learning_rate": 3.869220434746509e-06, + "logits/chosen": 0.06151404231786728, + "logits/rejected": 0.1290605366230011, + "logps/chosen": -345.41571044921875, + "logps/rejected": -370.25592041015625, + "loss": 0.2703, "rewards/accuracies": 0.75, - "rewards/chosen": -0.31196296215057373, - "rewards/margins": 0.19956240057945251, - "rewards/rejected": -0.5115253925323486, + "rewards/chosen": -0.019938651472330093, + "rewards/margins": 0.16865777969360352, + "rewards/rejected": -0.1885964572429657, "step": 320 }, { - "epoch": 0.25, - "learning_rate": 4.660612599795343e-06, - "logits/chosen": 0.11408114433288574, - "logits/rejected": 0.22174029052257538, - "logps/chosen": -647.49658203125, - "logps/rejected": -756.5814208984375, - "loss": 0.2472, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.29143720865249634, - "rewards/margins": 0.1790235936641693, - "rewards/rejected": -0.47046083211898804, + "epoch": 0.79, + "learning_rate": 3.7801873306872315e-06, + "logits/chosen": 0.06525089591741562, + "logits/rejected": 0.12144273519515991, + "logps/chosen": -340.03277587890625, + "logps/rejected": -371.6439514160156, + "loss": 0.2577, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": 0.02320241369307041, + "rewards/margins": 0.17125853896141052, + "rewards/rejected": -0.14805614948272705, "step": 330 }, { - "epoch": 0.26, - "learning_rate": 4.626366866585528e-06, - "logits/chosen": 0.08651003241539001, - "logits/rejected": 0.29140934348106384, - "logps/chosen": -647.8692626953125, - "logps/rejected": -736.3836669921875, - "loss": 0.2449, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.28671354055404663, - "rewards/margins": 0.16735221445560455, - "rewards/rejected": -0.45406574010849, + "epoch": 0.82, + "learning_rate": 3.688896318678322e-06, + "logits/chosen": 0.055392809212207794, + "logits/rejected": 0.12697988748550415, + "logps/chosen": -349.14556884765625, + "logps/rejected": -333.9625549316406, + "loss": 0.2748, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.016882654279470444, + "rewards/margins": 0.16859912872314453, + "rewards/rejected": -0.1517164707183838, "step": 340 }, { - "epoch": 0.27, - "learning_rate": 4.590614011845758e-06, - "logits/chosen": 0.14404548704624176, - "logits/rejected": 0.26181453466415405, - "logps/chosen": -638.6263427734375, - "logps/rejected": -730.7149658203125, - "loss": 0.2148, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.2571524977684021, - "rewards/margins": 0.1780269742012024, - "rewards/rejected": -0.4351794719696045, + "epoch": 0.84, + "learning_rate": 3.5955084116409382e-06, + "logits/chosen": 0.08919240534305573, + "logits/rejected": 0.1610582321882248, + "logps/chosen": -367.30621337890625, + "logps/rejected": -346.13873291015625, + "loss": 0.2664, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.04106982424855232, + "rewards/margins": 0.14082172513008118, + "rewards/rejected": -0.1818915605545044, "step": 350 }, { - "epoch": 0.27, - "learning_rate": 4.553379376404085e-06, - "logits/chosen": 0.17719906568527222, - "logits/rejected": 0.17130860686302185, - "logps/chosen": -565.2915649414062, - "logps/rejected": -659.3453369140625, - "loss": 0.252, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.23078814148902893, - "rewards/margins": 0.16517826914787292, - "rewards/rejected": -0.39596638083457947, + "epoch": 0.86, + "learning_rate": 3.5001883208580668e-06, + "logits/chosen": 0.056862883269786835, + "logits/rejected": 0.14601710438728333, + "logps/chosen": -383.3697204589844, + "logps/rejected": -388.45147705078125, + "loss": 0.2359, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.005547699984163046, + "rewards/margins": 0.20355132222175598, + "rewards/rejected": -0.20909900963306427, "step": 360 }, { - "epoch": 0.28, - "learning_rate": 4.514689351341751e-06, - "logits/chosen": 0.18946215510368347, - "logits/rejected": 0.2520686089992523, - "logps/chosen": -707.220458984375, - "logps/rejected": -799.5474853515625, - "loss": 0.2296, + "epoch": 0.89, + "learning_rate": 3.403104165467883e-06, + "logits/chosen": 0.047759585082530975, + "logits/rejected": 0.1289873570203781, + "logps/chosen": -363.989990234375, + "logps/rejected": -361.4288330078125, + "loss": 0.2491, "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.29390206933021545, - "rewards/margins": 0.1939956247806549, - "rewards/rejected": -0.48789769411087036, + "rewards/chosen": -0.030249441042542458, + "rewards/margins": 0.1802445650100708, + "rewards/rejected": -0.2104939967393875, "step": 370 }, { - "epoch": 0.29, - "learning_rate": 4.474571359287791e-06, - "logits/chosen": 0.1794353723526001, - "logits/rejected": 0.20889122784137726, - "logps/chosen": -659.88720703125, - "logps/rejected": -792.5078125, - "loss": 0.2767, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.32972732186317444, - "rewards/margins": 0.16797736287117004, - "rewards/rejected": -0.4977046847343445, + "epoch": 0.91, + "learning_rate": 3.30442717594657e-06, + "logits/chosen": 0.06461011618375778, + "logits/rejected": 0.14733566343784332, + "logps/chosen": -350.331298828125, + "logps/rejected": -334.6890563964844, + "loss": 0.2754, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.04477550461888313, + "rewards/margins": 0.12478353828191757, + "rewards/rejected": -0.1695590317249298, "step": 380 }, { - "epoch": 0.3, - "learning_rate": 4.4330538349824684e-06, - "logits/chosen": 0.15636876225471497, - "logits/rejected": 0.2390608787536621, - "logps/chosen": -675.450927734375, - "logps/rejected": -797.5559692382812, - "loss": 0.2478, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.3252645432949066, - "rewards/margins": 0.17893439531326294, - "rewards/rejected": -0.5041989684104919, + "epoch": 0.94, + "learning_rate": 3.2043313921035747e-06, + "logits/chosen": 0.07650026679039001, + "logits/rejected": 0.10351625829935074, + "logps/chosen": -319.55328369140625, + "logps/rejected": -328.97625732421875, + "loss": 0.2601, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.017551960423588753, + "rewards/margins": 0.1492632031440735, + "rewards/rejected": -0.1668151617050171, "step": 390 }, { - "epoch": 0.3, - "learning_rate": 4.3901662051233755e-06, - "logits/chosen": 0.1320812702178955, - "logits/rejected": 0.25596413016319275, - "logps/chosen": -722.6912841796875, - "logps/rejected": -806.4666748046875, - "loss": 0.2426, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.33014723658561707, - "rewards/margins": 0.17146170139312744, - "rewards/rejected": -0.5016089677810669, + "epoch": 0.96, + "learning_rate": 3.102993356121938e-06, + "logits/chosen": 0.045068711042404175, + "logits/rejected": 0.133053719997406, + "logps/chosen": -376.1606750488281, + "logps/rejected": -360.3962097167969, + "loss": 0.2547, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.012314733117818832, + "rewards/margins": 0.18502004444599152, + "rewards/rejected": -0.19733479619026184, "step": 400 }, { - "epoch": 0.31, - "learning_rate": 4.345938867508439e-06, - "logits/chosen": 0.15700757503509521, - "logits/rejected": 0.2542612552642822, - "logps/chosen": -715.8280029296875, - "logps/rejected": -812.6696166992188, - "loss": 0.2483, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.34785327315330505, - "rewards/margins": 0.17259207367897034, - "rewards/rejected": -0.5204453468322754, + "epoch": 0.98, + "learning_rate": 3.0005918011851245e-06, + "logits/chosen": 0.03985997289419174, + "logits/rejected": 0.1656588464975357, + "logps/chosen": -379.48199462890625, + "logps/rejected": -362.08380126953125, + "loss": 0.273, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.00483871391043067, + "rewards/margins": 0.1729108989238739, + "rewards/rejected": -0.16807220876216888, "step": 410 }, { - "epoch": 0.32, - "learning_rate": 4.30040316949064e-06, - "logits/chosen": 0.12117477506399155, - "logits/rejected": 0.21865728497505188, - "logps/chosen": -640.8743896484375, - "logps/rejected": -720.25439453125, - "loss": 0.2724, + "epoch": 1.01, + "learning_rate": 2.8973073362395e-06, + "logits/chosen": 0.06932957470417023, + "logits/rejected": 0.11695323139429092, + "logps/chosen": -350.8485107421875, + "logps/rejected": -359.5559387207031, + "loss": 0.2562, "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.30353325605392456, - "rewards/margins": 0.136855810880661, - "rewards/rejected": -0.44038906693458557, + "rewards/chosen": -0.015226135030388832, + "rewards/margins": 0.13259340822696686, + "rewards/rejected": -0.14781954884529114, "step": 420 }, { - "epoch": 0.33, - "learning_rate": 4.253591385759705e-06, - "logits/chosen": 0.1048569455742836, - "logits/rejected": 0.2262450009584427, - "logps/chosen": -654.50634765625, - "logps/rejected": -762.971435546875, - "loss": 0.2437, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.30736270546913147, - "rewards/margins": 0.18053661286830902, - "rewards/rejected": -0.48789939284324646, + "epoch": 1.03, + "learning_rate": 2.7933221274484725e-06, + "logits/chosen": 0.022776301950216293, + "logits/rejected": 0.1463911086320877, + "logps/chosen": -344.72900390625, + "logps/rejected": -374.57110595703125, + "loss": 0.2546, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.01165957935154438, + "rewards/margins": 0.17338308691978455, + "rewards/rejected": -0.1617235392332077, "step": 430 }, { - "epoch": 0.34, - "learning_rate": 4.205536695466524e-06, - "logits/chosen": 0.1525915265083313, - "logits/rejected": 0.27372902631759644, - "logps/chosen": -654.9791259765625, - "logps/rejected": -811.6696166992188, - "loss": 0.2447, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.319768488407135, - "rewards/margins": 0.19004443287849426, - "rewards/rejected": -0.5098129510879517, + "epoch": 1.06, + "learning_rate": 2.6888195769001147e-06, + "logits/chosen": 0.011232647113502026, + "logits/rejected": 0.08440439403057098, + "logps/chosen": -315.56158447265625, + "logps/rejected": -370.6732177734375, + "loss": 0.2635, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.0027101226150989532, + "rewards/margins": 0.18474070727825165, + "rewards/rejected": -0.1874508261680603, "step": 440 }, { - "epoch": 0.34, - "learning_rate": 4.15627315870651e-06, - "logits/chosen": 0.136866956949234, - "logits/rejected": 0.21996262669563293, - "logps/chosen": -676.5054931640625, - "logps/rejected": -751.3809204101562, - "loss": 0.2517, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.3240048289299011, - "rewards/margins": 0.1569635421037674, - "rewards/rejected": -0.48096832633018494, + "epoch": 1.08, + "learning_rate": 2.583983999134951e-06, + "logits/chosen": 0.033940933644771576, + "logits/rejected": 0.12383987754583359, + "logps/chosen": -353.528076171875, + "logps/rejected": -358.25433349609375, + "loss": 0.2647, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.01790205016732216, + "rewards/margins": 0.16596254706382751, + "rewards/rejected": -0.18386459350585938, "step": 450 }, { - "epoch": 0.35, - "learning_rate": 4.105835692378557e-06, - "logits/chosen": 0.17371432483196259, - "logits/rejected": 0.2416466474533081, - "logps/chosen": -675.8497314453125, - "logps/rejected": -758.9337158203125, - "loss": 0.2008, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.28624922037124634, - "rewards/margins": 0.17619088292121887, - "rewards/rejected": -0.4624401032924652, + "epoch": 1.1, + "learning_rate": 2.479000296064417e-06, + "logits/chosen": 0.03699932247400284, + "logits/rejected": 0.13089559972286224, + "logps/chosen": -375.724609375, + "logps/rejected": -400.3955383300781, + "loss": 0.2481, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.041518934071063995, + "rewards/margins": 0.1739250123500824, + "rewards/rejected": -0.21544396877288818, "step": 460 }, { - "epoch": 0.36, - "learning_rate": 4.05426004543672e-06, - "logits/chosen": 0.12746404111385345, - "logits/rejected": 0.2692243754863739, - "logps/chosen": -697.2105712890625, - "logps/rejected": -788.6505126953125, - "loss": 0.2271, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -0.3282613158226013, - "rewards/margins": 0.18242862820625305, - "rewards/rejected": -0.510689914226532, + "epoch": 1.13, + "learning_rate": 2.374053630853358e-06, + "logits/chosen": 0.07867871224880219, + "logits/rejected": 0.0793570876121521, + "logps/chosen": -392.0462646484375, + "logps/rejected": -398.4570617675781, + "loss": 0.2589, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.03614164516329765, + "rewards/margins": 0.18701379001140594, + "rewards/rejected": -0.2231554538011551, "step": 470 }, { - "epoch": 0.37, - "learning_rate": 4.001582773552153e-06, - "logits/chosen": 0.19310589134693146, - "logits/rejected": 0.24649909138679504, - "logps/chosen": -676.9974365234375, - "logps/rejected": -746.8987426757812, - "loss": 0.2236, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.31384724378585815, - "rewards/margins": 0.16469937562942505, - "rewards/rejected": -0.478546679019928, + "epoch": 1.15, + "learning_rate": 2.269329101341745e-06, + "logits/chosen": 0.04767027124762535, + "logits/rejected": 0.10338594764471054, + "logps/chosen": -311.9175109863281, + "logps/rejected": -353.84375, + "loss": 0.253, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.022265803068876266, + "rewards/margins": 0.21186105906963348, + "rewards/rejected": -0.18959525227546692, "step": 480 }, { - "epoch": 0.37, - "learning_rate": 3.947841213203262e-06, - "logits/chosen": 0.15690350532531738, - "logits/rejected": 0.300692081451416, - "logps/chosen": -735.6814575195312, - "logps/rejected": -848.1085815429688, - "loss": 0.2125, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.34843936562538147, - "rewards/margins": 0.20286861062049866, - "rewards/rejected": -0.5513080358505249, + "epoch": 1.18, + "learning_rate": 2.1650114135816052e-06, + "logits/chosen": 0.04343586042523384, + "logits/rejected": 0.14493630826473236, + "logps/chosen": -368.74066162109375, + "logps/rejected": -401.21746826171875, + "loss": 0.254, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0024279176723212004, + "rewards/margins": 0.1874578297138214, + "rewards/rejected": -0.18988573551177979, "step": 490 }, { - "epoch": 0.38, - "learning_rate": 3.893073455212438e-06, - "logits/chosen": 0.18581806123256683, - "logits/rejected": 0.28662142157554626, - "logps/chosen": -699.9567260742188, - "logps/rejected": -875.0812377929688, - "loss": 0.2029, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.35302621126174927, - "rewards/margins": 0.2367512434720993, - "rewards/rejected": -0.589777410030365, + "epoch": 1.2, + "learning_rate": 2.06128455606496e-06, + "logits/chosen": 0.04143913835287094, + "logits/rejected": 0.06632859259843826, + "logps/chosen": -320.82281494140625, + "logps/rejected": -348.89923095703125, + "loss": 0.2438, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0026562472339719534, + "rewards/margins": 0.18748678267002106, + "rewards/rejected": -0.19014303386211395, "step": 500 }, { - "epoch": 0.39, - "learning_rate": 3.837318317748134e-06, - "logits/chosen": 0.2250034064054489, - "logits/rejected": 0.25657814741134644, - "logps/chosen": -629.2000732421875, - "logps/rejected": -729.8438720703125, - "loss": 0.2881, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.2759498357772827, - "rewards/margins": 0.16293799877166748, - "rewards/rejected": -0.43888789415359497, + "epoch": 1.22, + "learning_rate": 1.958331475217357e-06, + "logits/chosen": 0.03532598540186882, + "logits/rejected": 0.07111676037311554, + "logps/chosen": -345.3083801269531, + "logps/rejected": -391.5373840332031, + "loss": 0.2428, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.011091398075222969, + "rewards/margins": 0.18126052618026733, + "rewards/rejected": -0.19235190749168396, "step": 510 }, { - "epoch": 0.4, - "learning_rate": 3.7806153188114027e-06, - "logits/chosen": 0.15463793277740479, - "logits/rejected": 0.26033270359039307, - "logps/chosen": -536.32861328125, - "logps/rejected": -633.5609130859375, - "loss": 0.2673, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.219068244099617, - "rewards/margins": 0.13708043098449707, - "rewards/rejected": -0.3561486601829529, + "epoch": 1.25, + "learning_rate": 1.856333752729311e-06, + "logits/chosen": 0.06463773548603058, + "logits/rejected": 0.07833746820688248, + "logps/chosen": -303.89508056640625, + "logps/rejected": -328.54095458984375, + "loss": 0.2549, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.030626490712165833, + "rewards/margins": 0.14131976664066315, + "rewards/rejected": -0.17194625735282898, "step": 520 }, { - "epoch": 0.4, - "learning_rate": 3.7230046482264256e-06, - "logits/chosen": 0.1772613823413849, - "logits/rejected": 0.3070564270019531, - "logps/chosen": -638.0252685546875, - "logps/rejected": -699.7626953125, - "loss": 0.2509, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.24965958297252655, - "rewards/margins": 0.15243235230445862, - "rewards/rejected": -0.402091920375824, + "epoch": 1.27, + "learning_rate": 1.7554712852947915e-06, + "logits/chosen": 0.017867419868707657, + "logits/rejected": 0.13077208399772644, + "logps/chosen": -354.83990478515625, + "logps/rejected": -369.40447998046875, + "loss": 0.2688, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.022668231278657913, + "rewards/margins": 0.164995938539505, + "rewards/rejected": -0.1876641809940338, "step": 530 }, { - "epoch": 0.41, - "learning_rate": 3.6645271391548542e-06, - "logits/chosen": 0.1249006986618042, - "logits/rejected": 0.20080497860908508, - "logps/chosen": -620.0784912109375, - "logps/rejected": -716.3678588867188, - "loss": 0.2308, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.2955653667449951, - "rewards/margins": 0.15287812054157257, - "rewards/rejected": -0.4484435021877289, + "epoch": 1.3, + "learning_rate": 1.6559219673215784e-06, + "logits/chosen": 0.07014649361371994, + "logits/rejected": 0.11957643926143646, + "logps/chosen": -341.1030578613281, + "logps/rejected": -360.0315246582031, + "loss": 0.2559, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.0031127408146858215, + "rewards/margins": 0.17289015650749207, + "rewards/rejected": -0.16977740824222565, "step": 540 }, { - "epoch": 0.42, - "learning_rate": 3.6052242391541746e-06, - "logits/chosen": 0.1353389322757721, - "logits/rejected": 0.22389094531536102, - "logps/chosen": -614.9147338867188, - "logps/rejected": -771.8558349609375, - "loss": 0.2102, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.2764993906021118, - "rewards/margins": 0.18298561871051788, - "rewards/rejected": -0.4594849944114685, + "epoch": 1.32, + "learning_rate": 1.5578613771731214e-06, + "logits/chosen": 0.044239241629838943, + "logits/rejected": 0.11994221061468124, + "logps/chosen": -347.32757568359375, + "logps/rejected": -388.6127624511719, + "loss": 0.244, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.0042419894598424435, + "rewards/margins": 0.21681733429431915, + "rewards/rejected": -0.22105932235717773, "step": 550 }, { - "epoch": 0.43, - "learning_rate": 3.5451379808006014e-06, - "logits/chosen": 0.17011868953704834, - "logits/rejected": 0.27556750178337097, - "logps/chosen": -658.0120849609375, - "logps/rejected": -755.5858764648438, - "loss": 0.2345, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.29901498556137085, - "rewards/margins": 0.18377789855003357, - "rewards/rejected": -0.48279285430908203, + "epoch": 1.34, + "learning_rate": 1.4614624674952843e-06, + "logits/chosen": 0.07131338119506836, + "logits/rejected": 0.14118310809135437, + "logps/chosen": -381.21112060546875, + "logps/rejected": -375.3702087402344, + "loss": 0.2594, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.01365007646381855, + "rewards/margins": 0.16313722729682922, + "rewards/rejected": -0.17678730189800262, "step": 560 }, { - "epoch": 0.43, - "learning_rate": 3.484310951897323e-06, - "logits/chosen": 0.1761491745710373, - "logits/rejected": 0.30945947766304016, - "logps/chosen": -717.3760986328125, - "logps/rejected": -787.5486450195312, - "loss": 0.2483, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.32107868790626526, - "rewards/margins": 0.1693280041217804, - "rewards/rejected": -0.49040669202804565, + "epoch": 1.37, + "learning_rate": 1.3668952601741442e-06, + "logits/chosen": 0.019948173314332962, + "logits/rejected": 0.14301837980747223, + "logps/chosen": -359.31829833984375, + "logps/rejected": -386.3388366699219, + "loss": 0.2421, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.003145938040688634, + "rewards/margins": 0.17547301948070526, + "rewards/rejected": -0.17861898243427277, "step": 570 }, { - "epoch": 0.44, - "learning_rate": 3.4227862652892106e-06, - "logits/chosen": 0.20576810836791992, - "logits/rejected": 0.26615187525749207, - "logps/chosen": -668.9432983398438, - "logps/rejected": -773.5794067382812, - "loss": 0.2502, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.31081390380859375, - "rewards/margins": 0.15438678860664368, - "rewards/rejected": -0.46520066261291504, + "epoch": 1.39, + "learning_rate": 1.2743265464628787e-06, + "logits/chosen": 0.04147445410490036, + "logits/rejected": 0.07641445100307465, + "logps/chosen": -358.9191589355469, + "logps/rejected": -354.82989501953125, + "loss": 0.2574, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.03237663954496384, + "rewards/margins": 0.14051951467990875, + "rewards/rejected": -0.17289616167545319, "step": 580 }, { - "epoch": 0.45, - "learning_rate": 3.3606075283054005e-06, - "logits/chosen": 0.18598072230815887, - "logits/rejected": 0.2837832570075989, - "logps/chosen": -603.8995361328125, - "logps/rejected": -735.7412719726562, - "loss": 0.208, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.2771075367927551, - "rewards/margins": 0.1824244260787964, - "rewards/rejected": -0.4595320224761963, + "epoch": 1.42, + "learning_rate": 1.1839195928066101e-06, + "logits/chosen": 0.010291008278727531, + "logits/rejected": 0.08601720631122589, + "logps/chosen": -338.0829162597656, + "logps/rejected": -349.2616882324219, + "loss": 0.2504, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.012054244987666607, + "rewards/margins": 0.18035855889320374, + "rewards/rejected": -0.19241279363632202, "step": 590 }, { - "epoch": 0.46, - "learning_rate": 3.2978188118513814e-06, - "logits/chosen": 0.1718396097421646, - "logits/rejected": 0.3636724054813385, - "logps/chosen": -653.9473266601562, - "logps/rejected": -759.1709594726562, - "loss": 0.2263, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.28921762108802795, - "rewards/margins": 0.19078542292118073, - "rewards/rejected": -0.4800030589103699, + "epoch": 1.44, + "learning_rate": 1.0958338528840893e-06, + "logits/chosen": 0.07830692082643509, + "logits/rejected": 0.1112513542175293, + "logps/chosen": -318.32928466796875, + "logps/rejected": -351.01531982421875, + "loss": 0.2642, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.012662211433053017, + "rewards/margins": 0.15172497928142548, + "rewards/rejected": -0.16438719630241394, "step": 600 }, { - "epoch": 0.46, - "learning_rate": 3.234464619172522e-06, - "logits/chosen": 0.1497826725244522, - "logits/rejected": 0.2777649164199829, - "logps/chosen": -638.6043090820312, - "logps/rejected": -779.666748046875, - "loss": 0.223, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.28172770142555237, - "rewards/margins": 0.22779683768749237, - "rewards/rejected": -0.5095245242118835, + "epoch": 1.46, + "learning_rate": 1.0102246863740498e-06, + "logits/chosen": 0.013798505067825317, + "logits/rejected": 0.13072696328163147, + "logps/chosen": -326.76336669921875, + "logps/rejected": -380.63458251953125, + "loss": 0.2398, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.0045418571680784225, + "rewards/margins": 0.19731177389621735, + "rewards/rejected": -0.20185360312461853, "step": 610 }, { - "epoch": 0.47, - "learning_rate": 3.1705898543111576e-06, - "logits/chosen": 0.13684847950935364, - "logits/rejected": 0.24817728996276855, - "logps/chosen": -634.4373779296875, - "logps/rejected": -796.2052612304688, - "loss": 0.2257, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.2933855652809143, - "rewards/margins": 0.20933008193969727, - "rewards/rejected": -0.5027156472206116, + "epoch": 1.49, + "learning_rate": 9.272430849423175e-07, + "logits/chosen": 0.041550200432538986, + "logits/rejected": 0.12003109604120255, + "logps/chosen": -350.9006652832031, + "logps/rejected": -404.7802734375, + "loss": 0.2245, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.015362398698925972, + "rewards/margins": 0.22952251136302948, + "rewards/rejected": -0.21416012942790985, "step": 620 }, { - "epoch": 0.48, - "learning_rate": 3.106239790279606e-06, - "logits/chosen": 0.16658630967140198, - "logits/rejected": 0.2954631447792053, - "logps/chosen": -654.8861083984375, - "logps/rejected": -780.149169921875, - "loss": 0.2097, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.3087119460105896, - "rewards/margins": 0.1822930872440338, - "rewards/rejected": -0.4910050928592682, + "epoch": 1.51, + "learning_rate": 8.470354059328919e-07, + "logits/chosen": 0.104413703083992, + "logits/rejected": 0.11118074506521225, + "logps/chosen": -336.5838928222656, + "logps/rejected": -373.56085205078125, + "loss": 0.2452, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": 0.010279458947479725, + "rewards/margins": 0.2295042723417282, + "rewards/rejected": -0.21922484040260315, "step": 630 }, { - "epoch": 0.49, - "learning_rate": 3.041460036971664e-06, - "logits/chosen": 0.13499195873737335, - "logits/rejected": 0.27665549516677856, - "logps/chosen": -655.3276977539062, - "logps/rejected": -844.3450927734375, - "loss": 0.2251, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.3263702094554901, - "rewards/margins": 0.22413411736488342, - "rewards/rejected": -0.5505043268203735, + "epoch": 1.54, + "learning_rate": 7.697431142327633e-07, + "logits/chosen": 0.07976067811250687, + "logits/rejected": 0.12730778753757477, + "logps/chosen": -348.73443603515625, + "logps/rejected": -358.34088134765625, + "loss": 0.2338, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.030282145366072655, + "rewards/margins": 0.16269627213478088, + "rewards/rejected": -0.1929783970117569, "step": 640 }, { - "epoch": 0.5, - "learning_rate": 2.976296508835326e-06, - "logits/chosen": 0.16133855283260345, - "logits/rejected": 0.256599485874176, - "logps/chosen": -660.0172119140625, - "logps/rejected": -769.0188598632812, - "loss": 0.2228, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.3122148811817169, - "rewards/margins": 0.18627096712589264, - "rewards/rejected": -0.49848586320877075, + "epoch": 1.56, + "learning_rate": 6.955025327656839e-07, + "logits/chosen": 0.04196876287460327, + "logits/rejected": 0.11756552755832672, + "logps/chosen": -327.8496398925781, + "logps/rejected": -355.4369201660156, + "loss": 0.2558, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.0020761913619935513, + "rewards/margins": 0.17507974803447723, + "rewards/rejected": -0.17300358414649963, "step": 650 }, { - "epoch": 0.5, - "learning_rate": 2.910795392329649e-06, - "logits/chosen": 0.1639477163553238, - "logits/rejected": 0.24110262095928192, - "logps/chosen": -739.8809204101562, - "logps/rejected": -895.7755126953125, - "loss": 0.2189, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.33884376287460327, - "rewards/margins": 0.23572292923927307, - "rewards/rejected": -0.5745667219161987, + "epoch": 1.58, + "learning_rate": 6.244446020550182e-07, + "logits/chosen": 0.05316174030303955, + "logits/rejected": 0.10895484685897827, + "logps/chosen": -354.5049133300781, + "logps/rejected": -411.59765625, + "loss": 0.2319, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.0010157767683267593, + "rewards/margins": 0.21365651488304138, + "rewards/rejected": -0.2146722972393036, "step": 660 }, { - "epoch": 0.51, - "learning_rate": 2.8450031131888147e-06, - "logits/chosen": 0.13471753895282745, - "logits/rejected": 0.27453264594078064, - "logps/chosen": -660.6944580078125, - "logps/rejected": -760.9178466796875, - "loss": 0.2316, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.32461482286453247, - "rewards/margins": 0.18133333325386047, - "rewards/rejected": -0.5059481859207153, + "epoch": 1.61, + "learning_rate": 5.566946492796766e-07, + "logits/chosen": 0.07230822741985321, + "logits/rejected": 0.09754084050655365, + "logps/chosen": -368.22802734375, + "logps/rejected": -368.54974365234375, + "loss": 0.2451, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.02271811105310917, + "rewards/margins": 0.14353466033935547, + "rewards/rejected": -0.16625277698040009, "step": 670 }, { - "epoch": 0.52, - "learning_rate": 2.7789663035166035e-06, - "logits/chosen": 0.07190994918346405, - "logits/rejected": 0.22703304886817932, - "logps/chosen": -670.5394287109375, - "logps/rejected": -823.9978637695312, - "loss": 0.2036, + "epoch": 1.63, + "learning_rate": 4.923721672305148e-07, + "logits/chosen": 0.04747115820646286, + "logits/rejected": 0.10951533168554306, + "logps/chosen": -373.25653076171875, + "logps/rejected": -403.66619873046875, + "loss": 0.262, "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.3125961720943451, - "rewards/margins": 0.2316323220729828, - "rewards/rejected": -0.5442285537719727, + "rewards/chosen": -3.2638385164318606e-05, + "rewards/margins": 0.20511355996131897, + "rewards/rejected": -0.2051461637020111, "step": 680 }, { - "epoch": 0.53, - "learning_rate": 2.7127317687345973e-06, - "logits/chosen": 0.07092462480068207, - "logits/rejected": 0.26120471954345703, - "logps/chosen": -671.69189453125, - "logps/rejected": -811.6243286132812, - "loss": 0.2018, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.30282872915267944, - "rewards/margins": 0.21179743111133575, - "rewards/rejected": -0.514626145362854, + "epoch": 1.66, + "learning_rate": 4.3159060355700943e-07, + "logits/chosen": 0.007146243005990982, + "logits/rejected": 0.15595687925815582, + "logps/chosen": -360.5429382324219, + "logps/rejected": -360.84271240234375, + "loss": 0.2528, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.026043469086289406, + "rewards/margins": 0.19069012999534607, + "rewards/rejected": -0.21673360466957092, "step": 690 }, { - "epoch": 0.53, - "learning_rate": 2.6463464544075344e-06, - "logits/chosen": 0.14573340117931366, - "logits/rejected": 0.23916473984718323, - "logps/chosen": -684.8336181640625, - "logps/rejected": -777.0270385742188, - "loss": 0.1987, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.3061198592185974, - "rewards/margins": 0.19952335953712463, - "rewards/rejected": -0.5056431889533997, + "epoch": 1.68, + "learning_rate": 3.7445716067596506e-07, + "logits/chosen": -0.016133427619934082, + "logits/rejected": 0.06616418063640594, + "logps/chosen": -315.7747497558594, + "logps/rejected": -344.2303771972656, + "loss": 0.242, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.012198897078633308, + "rewards/margins": 0.2178380936384201, + "rewards/rejected": -0.20563916862010956, "step": 700 }, { - "epoch": 0.54, - "learning_rate": 2.579857412969345e-06, - "logits/chosen": 0.11962984502315521, - "logits/rejected": 0.23462708294391632, - "logps/chosen": -730.8359375, - "logps/rejected": -856.9847412109375, - "loss": 0.237, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.37815654277801514, - "rewards/margins": 0.18362250924110413, - "rewards/rejected": -0.5617790818214417, + "epoch": 1.7, + "learning_rate": 3.2107260669512334e-07, + "logits/chosen": 0.06611919403076172, + "logits/rejected": 0.08203423768281937, + "logps/chosen": -342.01263427734375, + "logps/rejected": -353.5125427246094, + "loss": 0.2461, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.01212338637560606, + "rewards/margins": 0.17198148369789124, + "rewards/rejected": -0.18410487473011017, "step": 710 }, { - "epoch": 0.55, - "learning_rate": 2.513311770373421e-06, - "logits/chosen": 0.11839403957128525, - "logits/rejected": 0.16228660941123962, - "logps/chosen": -644.6649780273438, - "logps/rejected": -824.513671875, - "loss": 0.2276, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.32936322689056396, - "rewards/margins": 0.2137618511915207, - "rewards/rejected": -0.5431250929832458, + "epoch": 1.73, + "learning_rate": 2.7153109768518926e-07, + "logits/chosen": 0.05342602729797363, + "logits/rejected": 0.11405602842569351, + "logps/chosen": -393.02593994140625, + "logps/rejected": -416.9335021972656, + "loss": 0.244, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.015018805861473083, + "rewards/margins": 0.2232932150363922, + "rewards/rejected": -0.2383120059967041, "step": 720 }, { - "epoch": 0.56, - "learning_rate": 2.446756692690804e-06, - "logits/chosen": 0.13511911034584045, - "logits/rejected": 0.2399536371231079, - "logps/chosen": -717.3846435546875, - "logps/rejected": -858.6282958984375, - "loss": 0.2365, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.3688200116157532, - "rewards/margins": 0.2047378122806549, - "rewards/rejected": -0.5735577940940857, + "epoch": 1.75, + "learning_rate": 2.2592001161370392e-07, + "logits/chosen": 0.059743158519268036, + "logits/rejected": 0.08855228126049042, + "logps/chosen": -365.6115417480469, + "logps/rejected": -373.24310302734375, + "loss": 0.2413, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.007994825020432472, + "rewards/margins": 0.19029465317726135, + "rewards/rejected": -0.19828948378562927, "step": 730 }, { - "epoch": 0.56, - "learning_rate": 2.380239352679908e-06, - "logits/chosen": 0.14800789952278137, - "logits/rejected": 0.2702687382698059, - "logps/chosen": -674.1021728515625, - "logps/rejected": -830.0198974609375, - "loss": 0.2284, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.31319013237953186, - "rewards/margins": 0.2306814193725586, - "rewards/rejected": -0.5438715219497681, + "epoch": 1.78, + "learning_rate": 1.8431979423369607e-07, + "logits/chosen": 0.01501550804823637, + "logits/rejected": 0.09877587854862213, + "logps/chosen": -335.7201232910156, + "logps/rejected": -356.1680603027344, + "loss": 0.2601, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.013049180619418621, + "rewards/margins": 0.1567631959915161, + "rewards/rejected": -0.16981235146522522, "step": 740 }, { - "epoch": 0.57, - "learning_rate": 2.313806896351529e-06, - "logits/chosen": 0.07657043635845184, - "logits/rejected": 0.2402833253145218, - "logps/chosen": -611.2683715820312, - "logps/rejected": -762.6300048828125, - "loss": 0.2543, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.3043960928916931, - "rewards/margins": 0.188668355345726, - "rewards/rejected": -0.49306440353393555, + "epoch": 1.8, + "learning_rate": 1.468038171988881e-07, + "logits/chosen": -0.008327131159603596, + "logits/rejected": 0.04639572650194168, + "logps/chosen": -354.1353759765625, + "logps/rejected": -387.98297119140625, + "loss": 0.2595, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.02448558434844017, + "rewards/margins": 0.1901397705078125, + "rewards/rejected": -0.21462532877922058, "step": 750 }, { - "epoch": 0.58, - "learning_rate": 2.247506409552795e-06, - "logits/chosen": 0.11987291276454926, - "logits/rejected": 0.1521257907152176, - "logps/chosen": -654.2996826171875, - "logps/rejected": -776.6150512695312, - "loss": 0.248, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.2802131772041321, - "rewards/margins": 0.1828833967447281, - "rewards/rejected": -0.463096559047699, + "epoch": 1.82, + "learning_rate": 1.1343824865573422e-07, + "logits/chosen": 0.01856027916073799, + "logits/rejected": 0.07309429347515106, + "logps/chosen": -321.44903564453125, + "logps/rejected": -341.5816955566406, + "loss": 0.2495, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.022448932752013206, + "rewards/margins": 0.17198805510997772, + "rewards/rejected": -0.19443701207637787, "step": 760 }, { - "epoch": 0.59, - "learning_rate": 2.1813848845937695e-06, - "logits/chosen": 0.07216247916221619, - "logits/rejected": 0.16098852455615997, - "logps/chosen": -612.78759765625, - "logps/rejected": -761.1871948242188, - "loss": 0.2486, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.31463655829429626, - "rewards/margins": 0.16312028467655182, - "rewards/rejected": -0.4777568280696869, + "epoch": 1.85, + "learning_rate": 8.428193654051036e-08, + "logits/chosen": 0.04589134082198143, + "logits/rejected": 0.10319966077804565, + "logps/chosen": -388.9933776855469, + "logps/rejected": -376.8731994628906, + "loss": 0.2475, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": 0.008324380032718182, + "rewards/margins": 0.20527882874011993, + "rewards/rejected": -0.19695445895195007, "step": 770 }, { - "epoch": 0.59, - "learning_rate": 2.1154891869403436e-06, - "logits/chosen": 0.056468479335308075, - "logits/rejected": 0.21281781792640686, - "logps/chosen": -676.2210693359375, - "logps/rejected": -826.8810424804688, - "loss": 0.202, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -0.31608521938323975, - "rewards/margins": 0.22052684426307678, - "rewards/rejected": -0.5366120934486389, + "epoch": 1.87, + "learning_rate": 5.9386304787299175e-08, + "logits/chosen": 0.03318192437291145, + "logits/rejected": 0.1395682990550995, + "logps/chosen": -377.56622314453125, + "logps/rejected": -377.5900573730469, + "loss": 0.2477, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.0049881902523338795, + "rewards/margins": 0.2095176726579666, + "rewards/rejected": -0.2145058661699295, "step": 780 }, { - "epoch": 0.6, - "learning_rate": 2.0498660219970395e-06, - "logits/chosen": 0.14304831624031067, - "logits/rejected": 0.2865908741950989, - "logps/chosen": -720.0322875976562, - "logps/rejected": -834.4865112304688, - "loss": 0.2143, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.3310641646385193, - "rewards/margins": 0.21627302467823029, - "rewards/rejected": -0.5473372340202332, + "epoch": 1.9, + "learning_rate": 3.8795262629929e-08, + "logits/chosen": 0.03711915761232376, + "logits/rejected": 0.07861719280481339, + "logps/chosen": -311.10015869140625, + "logps/rejected": -340.22918701171875, + "loss": 0.2288, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.007546453736722469, + "rewards/margins": 0.215033620595932, + "rewards/rejected": -0.20748718082904816, "step": 790 }, { - "epoch": 0.61, - "learning_rate": 1.9845619020032552e-06, - "logits/chosen": 0.0913710817694664, - "logits/rejected": 0.2621229588985443, - "logps/chosen": -681.4385986328125, - "logps/rejected": -793.8607788085938, - "loss": 0.2232, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.33381250500679016, - "rewards/margins": 0.18326430022716522, - "rewards/rejected": -0.5170767903327942, + "epoch": 1.92, + "learning_rate": 2.2545127157831416e-08, + "logits/chosen": 0.06011080741882324, + "logits/rejected": 0.08075010776519775, + "logps/chosen": -342.993408203125, + "logps/rejected": -338.7896728515625, + "loss": 0.252, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.03219890594482422, + "rewards/margins": 0.15845921635627747, + "rewards/rejected": -0.1906580924987793, "step": 800 }, { - "epoch": 0.62, - "learning_rate": 1.9196231130664282e-06, - "logits/chosen": 0.12347328662872314, - "logits/rejected": 0.22142863273620605, - "logps/chosen": -656.6223754882812, - "logps/rejected": -849.30908203125, - "loss": 0.2207, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.32625263929367065, - "rewards/margins": 0.2373245507478714, - "rewards/rejected": -0.5635771751403809, + "epoch": 1.94, + "learning_rate": 1.0664559262413831e-08, + "logits/chosen": 0.06324592232704163, + "logits/rejected": 0.15417756140232086, + "logps/chosen": -383.63238525390625, + "logps/rejected": -373.19720458984375, + "loss": 0.2445, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.013102272525429726, + "rewards/margins": 0.21051840484142303, + "rewards/rejected": -0.2236206978559494, "step": 810 }, { - "epoch": 0.62, - "learning_rate": 1.8550956823554708e-06, - "logits/chosen": 0.12958547472953796, - "logits/rejected": 0.20247094333171844, - "logps/chosen": -676.3146362304688, - "logps/rejected": -852.1516723632812, - "loss": 0.2231, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -0.3258567452430725, - "rewards/margins": 0.22926822304725647, - "rewards/rejected": -0.5551249384880066, + "epoch": 1.97, + "learning_rate": 3.1745130869123564e-09, + "logits/chosen": 0.02718031406402588, + "logits/rejected": 0.09324290603399277, + "logps/chosen": -342.188232421875, + "logps/rejected": -382.42657470703125, + "loss": 0.2445, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.02895962819457054, + "rewards/margins": 0.1746593415737152, + "rewards/rejected": -0.20361897349357605, "step": 820 }, { - "epoch": 0.63, - "learning_rate": 1.7910253454777346e-06, - "logits/chosen": 0.1100487932562828, - "logits/rejected": 0.178089901804924, - "logps/chosen": -628.2322387695312, - "logps/rejected": -760.2506103515625, - "loss": 0.2364, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.3000151813030243, - "rewards/margins": 0.20657257735729218, - "rewards/rejected": -0.5065878033638, - "step": 830 - }, - { - "epoch": 0.64, - "learning_rate": 1.7274575140626318e-06, - "logits/chosen": 0.06159307807683945, - "logits/rejected": 0.21589604020118713, - "logps/chosen": -658.613525390625, - "logps/rejected": -839.2703247070312, - "loss": 0.2066, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.3004804253578186, - "rewards/margins": 0.23831859230995178, - "rewards/rejected": -0.5387989282608032, - "step": 840 - }, - { - "epoch": 0.65, - "learning_rate": 1.6644372435748823e-06, - "logits/chosen": 0.13025906682014465, - "logits/rejected": 0.25792089104652405, - "logps/chosen": -657.7048950195312, - "logps/rejected": -741.805908203125, - "loss": 0.2056, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.3026004731655121, - "rewards/margins": 0.18632087111473083, - "rewards/rejected": -0.4889214038848877, - "step": 850 - }, - { - "epoch": 0.66, - "learning_rate": 1.6020092013802002e-06, - "logits/chosen": 0.13131192326545715, - "logits/rejected": 0.2267313450574875, - "logps/chosen": -609.1980590820312, - "logps/rejected": -762.2578735351562, - "loss": 0.2173, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.2911318838596344, - "rewards/margins": 0.19050107896327972, - "rewards/rejected": -0.48163294792175293, - "step": 860 - }, - { - "epoch": 0.66, - "learning_rate": 1.5402176350860653e-06, - "logits/chosen": 0.1238790899515152, - "logits/rejected": 0.215298131108284, - "logps/chosen": -654.3424072265625, - "logps/rejected": -766.5152587890625, - "loss": 0.2276, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.311137855052948, - "rewards/margins": 0.18526899814605713, - "rewards/rejected": -0.4964068531990051, - "step": 870 - }, - { - "epoch": 0.67, - "learning_rate": 1.4791063411799938e-06, - "logits/chosen": 0.07573570311069489, - "logits/rejected": 0.16694195568561554, - "logps/chosen": -611.1826782226562, - "logps/rejected": -747.8428955078125, - "loss": 0.225, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.28111016750335693, - "rewards/margins": 0.19318345189094543, - "rewards/rejected": -0.47429361939430237, - "step": 880 - }, - { - "epoch": 0.68, - "learning_rate": 1.4187186339875697e-06, - "logits/chosen": 0.14150698482990265, - "logits/rejected": 0.19218984246253967, - "logps/chosen": -653.740478515625, - "logps/rejected": -790.3258666992188, - "loss": 0.1959, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -0.2978840172290802, - "rewards/margins": 0.19249173998832703, - "rewards/rejected": -0.4903757572174072, - "step": 890 - }, - { - "epoch": 0.69, - "learning_rate": 1.3590973149722103e-06, - "logits/chosen": 0.07968850433826447, - "logits/rejected": 0.24394066631793976, - "logps/chosen": -660.4170532226562, - "logps/rejected": -787.5786743164062, - "loss": 0.2189, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.3150777518749237, - "rewards/margins": 0.19490757584571838, - "rewards/rejected": -0.5099853277206421, - "step": 900 - }, - { - "epoch": 0.69, - "learning_rate": 1.300284642398445e-06, - "logits/chosen": 0.1339327096939087, - "logits/rejected": 0.19529958069324493, - "logps/chosen": -637.8862915039062, - "logps/rejected": -804.20263671875, - "loss": 0.2179, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.30193617939949036, - "rewards/margins": 0.2071986198425293, - "rewards/rejected": -0.5091347694396973, - "step": 910 - }, - { - "epoch": 0.7, - "learning_rate": 1.2423223013801946e-06, - "logits/chosen": 0.09140697866678238, - "logits/rejected": 0.23906917870044708, - "logps/chosen": -615.2650146484375, - "logps/rejected": -794.8267211914062, - "loss": 0.2345, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.290291965007782, - "rewards/margins": 0.21678297221660614, - "rewards/rejected": -0.5070749521255493, - "step": 920 - }, - { - "epoch": 0.71, - "learning_rate": 1.1852513743352886e-06, - "logits/chosen": 0.09063401818275452, - "logits/rejected": 0.18860659003257751, - "logps/chosen": -603.1500854492188, - "logps/rejected": -759.2166748046875, - "loss": 0.1973, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.26861757040023804, - "rewards/margins": 0.2037418633699417, - "rewards/rejected": -0.47235947847366333, - "step": 930 - }, - { - "epoch": 0.72, - "learning_rate": 1.1291123118671665e-06, - "logits/chosen": 0.082930788397789, - "logits/rejected": 0.177327498793602, - "logps/chosen": -685.0097045898438, - "logps/rejected": -753.899658203125, - "loss": 0.2698, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.31101205945014954, - "rewards/margins": 0.16297343373298645, - "rewards/rejected": -0.473985493183136, - "step": 940 - }, - { - "epoch": 0.72, - "learning_rate": 1.073944904094385e-06, - "logits/chosen": 0.12176795303821564, - "logits/rejected": 0.24055452644824982, - "logps/chosen": -727.9547729492188, - "logps/rejected": -807.3270874023438, - "loss": 0.2284, + "epoch": 1.99, + "learning_rate": 8.819906889168117e-11, + "logits/chosen": 0.07415173202753067, + "logits/rejected": 0.12375295162200928, + "logps/chosen": -362.17572021484375, + "logps/rejected": -372.21044921875, + "loss": 0.2579, "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.3418019115924835, - "rewards/margins": 0.1853446215391159, - "rewards/rejected": -0.5271465182304382, - "step": 950 - }, - { - "epoch": 0.73, - "learning_rate": 1.019788252448267e-06, - "logits/chosen": 0.1278046816587448, - "logits/rejected": 0.18657834827899933, - "logps/chosen": -629.8650512695312, - "logps/rejected": -775.0892333984375, - "loss": 0.2073, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.31090590357780457, - "rewards/margins": 0.19522252678871155, - "rewards/rejected": -0.5061284899711609, - "step": 960 - }, - { - "epoch": 0.74, - "learning_rate": 9.66680741958685e-07, - "logits/chosen": 0.06174594908952713, - "logits/rejected": 0.21714496612548828, - "logps/chosen": -706.654052734375, - "logps/rejected": -845.1712646484375, - "loss": 0.2177, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.33396342396736145, - "rewards/margins": 0.21265073120594025, - "rewards/rejected": -0.5466141700744629, - "step": 970 - }, - { - "epoch": 0.75, - "learning_rate": 9.146600140475945e-07, - "logits/chosen": 0.09592024236917496, - "logits/rejected": 0.1995583027601242, - "logps/chosen": -628.862548828125, - "logps/rejected": -779.5205688476562, - "loss": 0.2233, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.3146194815635681, - "rewards/margins": 0.17806106805801392, - "rewards/rejected": -0.49268054962158203, - "step": 980 - }, - { - "epoch": 0.75, - "learning_rate": 8.637629398496378e-07, - "logits/chosen": 0.06787069141864777, - "logits/rejected": 0.19926394522190094, - "logps/chosen": -675.4822387695312, - "logps/rejected": -792.4046630859375, - "loss": 0.2406, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.31674814224243164, - "rewards/margins": 0.19142326712608337, - "rewards/rejected": -0.5081714391708374, - "step": 990 - }, - { - "epoch": 0.76, - "learning_rate": 8.140255940787059e-07, - "logits/chosen": 0.14530155062675476, - "logits/rejected": 0.2198048084974289, - "logps/chosen": -646.3609008789062, - "logps/rejected": -774.7366333007812, - "loss": 0.2234, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.30126315355300903, - "rewards/margins": 0.1865108758211136, - "rewards/rejected": -0.4877740442752838, - "step": 1000 - }, - { - "epoch": 0.77, - "learning_rate": 7.654832294589776e-07, - "logits/chosen": 0.14776812493801117, - "logits/rejected": 0.18824756145477295, - "logps/chosen": -676.3685302734375, - "logps/rejected": -820.2717895507812, - "loss": 0.242, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.3261755406856537, - "rewards/margins": 0.19091561436653137, - "rewards/rejected": -0.5170911550521851, - "step": 1010 - }, - { - "epoch": 0.78, - "learning_rate": 7.181702517385789e-07, - "logits/chosen": 0.15526942908763885, - "logits/rejected": 0.20596864819526672, - "logps/chosen": -720.7452392578125, - "logps/rejected": -820.1512451171875, - "loss": 0.2423, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.33318132162094116, - "rewards/margins": 0.173573836684227, - "rewards/rejected": -0.5067551732063293, - "step": 1020 - }, - { - "epoch": 0.78, - "learning_rate": 6.721201953035511e-07, - "logits/chosen": 0.09918368607759476, - "logits/rejected": 0.23762516677379608, - "logps/chosen": -690.7906494140625, - "logps/rejected": -799.6129150390625, - "loss": 0.222, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.31717661023139954, - "rewards/margins": 0.19311536848545074, - "rewards/rejected": -0.5102919340133667, - "step": 1030 - }, - { - "epoch": 0.79, - "learning_rate": 6.273656994094232e-07, - "logits/chosen": 0.08452818542718887, - "logits/rejected": 0.15428626537322998, - "logps/chosen": -674.3331298828125, - "logps/rejected": -852.9098510742188, - "loss": 0.212, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -0.3417903482913971, - "rewards/margins": 0.20718708634376526, - "rewards/rejected": -0.5489774942398071, - "step": 1040 - }, - { - "epoch": 0.8, - "learning_rate": 5.839384850472359e-07, - "logits/chosen": 0.11137109994888306, - "logits/rejected": 0.23986658453941345, - "logps/chosen": -681.7805786132812, - "logps/rejected": -840.8401489257812, - "loss": 0.2013, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.33553558588027954, - "rewards/margins": 0.22832973301410675, - "rewards/rejected": -0.5638653039932251, - "step": 1050 - }, - { - "epoch": 0.81, - "learning_rate": 5.418693324604082e-07, - "logits/chosen": 0.054320525377988815, - "logits/rejected": 0.2136838734149933, - "logps/chosen": -713.266357421875, - "logps/rejected": -872.5158081054688, - "loss": 0.1902, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.3621819019317627, - "rewards/margins": 0.21344491839408875, - "rewards/rejected": -0.5756268501281738, - "step": 1060 - }, - { - "epoch": 0.82, - "learning_rate": 5.01188059328386e-07, - "logits/chosen": 0.10655899345874786, - "logits/rejected": 0.20958073437213898, - "logps/chosen": -671.4863891601562, - "logps/rejected": -796.0863647460938, - "loss": 0.1935, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.31292790174484253, - "rewards/margins": 0.21162652969360352, - "rewards/rejected": -0.5245543718338013, - "step": 1070 - }, - { - "epoch": 0.82, - "learning_rate": 4.619234996325314e-07, - "logits/chosen": 0.1319715678691864, - "logits/rejected": 0.21460673213005066, - "logps/chosen": -709.2652587890625, - "logps/rejected": -860.7151489257812, - "loss": 0.2324, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.3620796799659729, - "rewards/margins": 0.1911153942346573, - "rewards/rejected": -0.5531951189041138, - "step": 1080 - }, - { - "epoch": 0.83, - "learning_rate": 4.241034832192434e-07, - "logits/chosen": 0.10131983458995819, - "logits/rejected": 0.1885637789964676, - "logps/chosen": -677.9981079101562, - "logps/rejected": -864.3063354492188, - "loss": 0.2159, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -0.3420425057411194, - "rewards/margins": 0.23228974640369415, - "rewards/rejected": -0.5743322372436523, - "step": 1090 - }, - { - "epoch": 0.84, - "learning_rate": 3.877548160747768e-07, - "logits/chosen": 0.11466535180807114, - "logits/rejected": 0.24565303325653076, - "logps/chosen": -680.638671875, - "logps/rejected": -798.11376953125, - "loss": 0.1963, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.30832600593566895, - "rewards/margins": 0.23151281476020813, - "rewards/rejected": -0.5398387908935547, - "step": 1100 - }, - { - "epoch": 0.85, - "learning_rate": 3.529032613257574e-07, - "logits/chosen": 0.10837472975254059, - "logits/rejected": 0.17577466368675232, - "logps/chosen": -682.72216796875, - "logps/rejected": -828.9464111328125, - "loss": 0.2376, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.3374750316143036, - "rewards/margins": 0.20901791751384735, - "rewards/rejected": -0.5464929342269897, - "step": 1110 - }, - { - "epoch": 0.85, - "learning_rate": 3.195735209788528e-07, - "logits/chosen": 0.1015692800283432, - "logits/rejected": 0.1785646677017212, - "logps/chosen": -639.4000244140625, - "logps/rejected": -781.9521484375, - "loss": 0.2455, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.33476072549819946, - "rewards/margins": 0.18203067779541016, - "rewards/rejected": -0.5167914032936096, - "step": 1120 - }, - { - "epoch": 0.86, - "learning_rate": 2.8778921841253774e-07, - "logits/chosen": 0.06415721774101257, - "logits/rejected": 0.2145998477935791, - "logps/chosen": -690.4867553710938, - "logps/rejected": -865.8243408203125, - "loss": 0.1726, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.3243727684020996, - "rewards/margins": 0.251103937625885, - "rewards/rejected": -0.5754767656326294, - "step": 1130 - }, - { - "epoch": 0.87, - "learning_rate": 2.5757288163336806e-07, - "logits/chosen": 0.08277393132448196, - "logits/rejected": 0.20556513965129852, - "logps/chosen": -701.6412353515625, - "logps/rejected": -884.2093505859375, - "loss": 0.2016, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -0.34318509697914124, - "rewards/margins": 0.23703384399414062, - "rewards/rejected": -0.5802189707756042, - "step": 1140 - }, - { - "epoch": 0.88, - "learning_rate": 2.2894592730863336e-07, - "logits/chosen": 0.08898656070232391, - "logits/rejected": 0.19494621455669403, - "logps/chosen": -663.3425903320312, - "logps/rejected": -838.2100830078125, - "loss": 0.2081, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.3233157992362976, - "rewards/margins": 0.22612051665782928, - "rewards/rejected": -0.5494363903999329, - "step": 1150 - }, - { - "epoch": 0.88, - "learning_rate": 2.019286455866981e-07, - "logits/chosen": 0.09930244833230972, - "logits/rejected": 0.25030988454818726, - "logps/chosen": -665.492919921875, - "logps/rejected": -791.5203857421875, - "loss": 0.2154, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.3297829031944275, - "rewards/margins": 0.19049373269081116, - "rewards/rejected": -0.520276665687561, - "step": 1160 - }, - { - "epoch": 0.89, - "learning_rate": 1.7654018571579557e-07, - "logits/chosen": 0.07714973390102386, - "logits/rejected": 0.1515274941921234, - "logps/chosen": -710.2567749023438, - "logps/rejected": -828.8133544921875, - "loss": 0.2217, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.3351573944091797, - "rewards/margins": 0.20143434405326843, - "rewards/rejected": -0.5365917086601257, - "step": 1170 - }, - { - "epoch": 0.9, - "learning_rate": 1.5279854247146703e-07, - "logits/chosen": 0.08655952662229538, - "logits/rejected": 0.18316319584846497, - "logps/chosen": -664.911376953125, - "logps/rejected": -804.0281982421875, - "loss": 0.2363, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.3494827151298523, - "rewards/margins": 0.19858424365520477, - "rewards/rejected": -0.5480669140815735, - "step": 1180 - }, - { - "epoch": 0.91, - "learning_rate": 1.307205434022671e-07, - "logits/chosen": 0.0854581817984581, - "logits/rejected": 0.21030446887016296, - "logps/chosen": -680.3010864257812, - "logps/rejected": -877.068359375, - "loss": 0.1977, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -0.33833056688308716, - "rewards/margins": 0.23077313601970673, - "rewards/rejected": -0.5691036581993103, - "step": 1190 - }, - { - "epoch": 0.91, - "learning_rate": 1.1032183690276754e-07, - "logits/chosen": 0.07660888135433197, - "logits/rejected": 0.1921808272600174, - "logps/chosen": -673.0133056640625, - "logps/rejected": -855.6375732421875, - "loss": 0.2076, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.32487696409225464, - "rewards/margins": 0.2425091564655304, - "rewards/rejected": -0.5673861503601074, - "step": 1200 - }, - { - "epoch": 0.92, - "learning_rate": 9.161688112232836e-08, - "logits/chosen": 0.10119873285293579, - "logits/rejected": 0.22447247803211212, - "logps/chosen": -711.1212158203125, - "logps/rejected": -857.6790771484375, - "loss": 0.2041, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.35159099102020264, - "rewards/margins": 0.22090163826942444, - "rewards/rejected": -0.5724925994873047, - "step": 1210 - }, - { - "epoch": 0.93, - "learning_rate": 7.46189337174788e-08, - "logits/chosen": 0.0487164705991745, - "logits/rejected": 0.1770685911178589, - "logps/chosen": -655.6456298828125, - "logps/rejected": -825.86572265625, - "loss": 0.1868, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.3402259647846222, - "rewards/margins": 0.2170572578907013, - "rewards/rejected": -0.5572832822799683, - "step": 1220 - }, - { - "epoch": 0.94, - "learning_rate": 5.934004245518793e-08, - "logits/chosen": 0.09085109829902649, - "logits/rejected": 0.22920957207679749, - "logps/chosen": -656.9745483398438, - "logps/rejected": -807.7351684570312, - "loss": 0.2086, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.3091570734977722, - "rewards/margins": 0.2053556740283966, - "rewards/rejected": -0.5145127177238464, - "step": 1230 - }, - { - "epoch": 0.94, - "learning_rate": 4.579103667367385e-08, - "logits/chosen": 0.10511846840381622, - "logits/rejected": 0.22097325325012207, - "logps/chosen": -677.5391235351562, - "logps/rejected": -815.5825805664062, - "loss": 0.2188, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.3394906222820282, - "rewards/margins": 0.2081030309200287, - "rewards/rejected": -0.5475937128067017, - "step": 1240 - }, - { - "epoch": 0.95, - "learning_rate": 3.398151960681162e-08, - "logits/chosen": 0.04612383991479874, - "logits/rejected": 0.16620075702667236, - "logps/chosen": -679.2454833984375, - "logps/rejected": -800.4119262695312, - "loss": 0.2539, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.35038450360298157, - "rewards/margins": 0.1713865101337433, - "rewards/rejected": -0.5217710733413696, - "step": 1250 - }, - { - "epoch": 0.96, - "learning_rate": 2.3919861577572924e-08, - "logits/chosen": 0.11022261530160904, - "logits/rejected": 0.166605606675148, - "logps/chosen": -690.1370239257812, - "logps/rejected": -797.578369140625, - "loss": 0.2181, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -0.32749998569488525, - "rewards/margins": 0.20777150988578796, - "rewards/rejected": -0.5352715253829956, - "step": 1260 - }, - { - "epoch": 0.97, - "learning_rate": 1.5613194065327854e-08, - "logits/chosen": 0.052184127271175385, - "logits/rejected": 0.22973528504371643, - "logps/chosen": -629.1040649414062, - "logps/rejected": -766.4088134765625, - "loss": 0.1872, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.2912130355834961, - "rewards/margins": 0.22654838860034943, - "rewards/rejected": -0.5177614092826843, - "step": 1270 - }, - { - "epoch": 0.98, - "learning_rate": 9.067404651211808e-09, - "logits/chosen": 0.11986621469259262, - "logits/rejected": 0.21088480949401855, - "logps/chosen": -687.1105346679688, - "logps/rejected": -877.3173828125, - "loss": 0.2028, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.3289037048816681, - "rewards/margins": 0.23036351799964905, - "rewards/rejected": -0.5592672228813171, - "step": 1280 - }, - { - "epoch": 0.98, - "learning_rate": 4.287132845137709e-09, - "logits/chosen": 0.08242613077163696, - "logits/rejected": 0.20786967873573303, - "logps/chosen": -682.5579223632812, - "logps/rejected": -803.6607666015625, - "loss": 0.2108, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.32676413655281067, - "rewards/margins": 0.19439134001731873, - "rewards/rejected": -0.5211554765701294, - "step": 1290 - }, - { - "epoch": 0.99, - "learning_rate": 1.2757667974155896e-09, - "logits/chosen": 0.03548216074705124, - "logits/rejected": 0.12074669450521469, - "logps/chosen": -665.2953491210938, - "logps/rejected": -846.1324462890625, - "loss": 0.2049, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.3495555818080902, - "rewards/margins": 0.1981724500656128, - "rewards/rejected": -0.5477280020713806, - "step": 1300 - }, - { - "epoch": 1.0, - "learning_rate": 3.544089730633804e-11, - "logits/chosen": 0.10499806702136993, - "logits/rejected": 0.2223556488752365, - "logps/chosen": -656.1048583984375, - "logps/rejected": -808.8086547851562, - "loss": 0.1796, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.3139723837375641, - "rewards/margins": 0.22497034072875977, - "rewards/rejected": -0.5389427542686462, - "step": 1310 + "rewards/chosen": -0.023642729967832565, + "rewards/margins": 0.1827639937400818, + "rewards/rejected": -0.20640675723552704, + "step": 830 }, { - "epoch": 1.0, - "step": 1312, + "epoch": 2.0, + "step": 832, "total_flos": 0.0, - "train_loss": 0.24134081795175627, - "train_runtime": 12108.13, - "train_samples_per_second": 1.734, - "train_steps_per_second": 0.108 + "train_loss": 0.27172684411589915, + "train_runtime": 11567.6763, + "train_samples_per_second": 3.458, + "train_steps_per_second": 0.072 } ], "logging_steps": 10, - "max_steps": 1312, + "max_steps": 832, "num_input_tokens_seen": 0, - "num_train_epochs": 1, + "num_train_epochs": 2, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4,