{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996190476190476, "eval_steps": 500, "global_step": 656, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 7.575757575757576e-08, "logits/chosen": 0.04974596947431564, "logits/rejected": 0.2963363230228424, "logps/chosen": -446.36370849609375, "logps/rejected": -275.23162841796875, "loss": 0.3488, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "learning_rate": 7.575757575757576e-07, "logits/chosen": 0.07917162775993347, "logits/rejected": 0.2545989155769348, "logps/chosen": -351.4217529296875, "logps/rejected": -305.8712463378906, "loss": 0.3403, "rewards/accuracies": 0.3472222089767456, "rewards/chosen": -1.3770493296760833e-06, "rewards/margins": -5.5950724345166236e-05, "rewards/rejected": 5.4573672969127074e-05, "step": 10 }, { "epoch": 0.03, "learning_rate": 1.5151515151515152e-06, "logits/chosen": 0.08743862062692642, "logits/rejected": 0.2663661539554596, "logps/chosen": -356.5379333496094, "logps/rejected": -272.34771728515625, "loss": 0.3613, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 8.02798240329139e-05, "rewards/margins": 0.0002525355666875839, "rewards/rejected": -0.00017225573537871242, "step": 20 }, { "epoch": 0.05, "learning_rate": 2.2727272727272728e-06, "logits/chosen": 0.10032109916210175, "logits/rejected": 0.2204272300004959, "logps/chosen": -320.36749267578125, "logps/rejected": -265.65338134765625, "loss": 0.3542, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.00015481823356822133, "rewards/margins": 7.609631575178355e-05, "rewards/rejected": 7.872191054048017e-05, "step": 30 }, { "epoch": 0.06, "learning_rate": 3.0303030303030305e-06, "logits/chosen": 0.1070769876241684, "logits/rejected": 0.2916509211063385, "logps/chosen": -365.18646240234375, "logps/rejected": -253.8933868408203, "loss": 0.3552, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0006457852432504296, "rewards/margins": 0.0006201790529303253, "rewards/rejected": 2.560619941505138e-05, "step": 40 }, { "epoch": 0.08, "learning_rate": 3.7878787878787882e-06, "logits/chosen": 0.08771614730358124, "logits/rejected": 0.28780585527420044, "logps/chosen": -370.36212158203125, "logps/rejected": -294.9586181640625, "loss": 0.3338, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0016288382466882467, "rewards/margins": 0.0014908717712387443, "rewards/rejected": 0.000137966446345672, "step": 50 }, { "epoch": 0.09, "learning_rate": 4.5454545454545455e-06, "logits/chosen": 0.06850675493478775, "logits/rejected": 0.276347815990448, "logps/chosen": -375.48883056640625, "logps/rejected": -298.6650695800781, "loss": 0.3422, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.004066063556820154, "rewards/margins": 0.004035579971969128, "rewards/rejected": 3.0483581213047728e-05, "step": 60 }, { "epoch": 0.11, "learning_rate": 4.999432965739786e-06, "logits/chosen": 0.1320158988237381, "logits/rejected": 0.3291288912296295, "logps/chosen": -330.6249084472656, "logps/rejected": -303.25128173828125, "loss": 0.3197, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.007968532852828503, "rewards/margins": 0.007708103861659765, "rewards/rejected": 0.0002604298642836511, "step": 70 }, { "epoch": 0.12, "learning_rate": 4.9930567839810125e-06, "logits/chosen": 0.09887860715389252, "logits/rejected": 0.31491416692733765, "logps/chosen": -353.16217041015625, "logps/rejected": -297.11651611328125, "loss": 0.3248, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.01291077770292759, "rewards/margins": 0.018067900091409683, "rewards/rejected": -0.005157124251127243, "step": 80 }, { "epoch": 0.14, "learning_rate": 4.979613761906212e-06, "logits/chosen": 0.17029765248298645, "logits/rejected": 0.260185182094574, "logps/chosen": -352.32379150390625, "logps/rejected": -288.6719665527344, "loss": 0.2939, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.007807808928191662, "rewards/margins": 0.023743372410535812, "rewards/rejected": -0.015935566276311874, "step": 90 }, { "epoch": 0.15, "learning_rate": 4.959142005221991e-06, "logits/chosen": 0.1353963315486908, "logits/rejected": 0.2721942365169525, "logps/chosen": -363.1653137207031, "logps/rejected": -354.89312744140625, "loss": 0.2981, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.01541908085346222, "rewards/margins": 0.04708936810493469, "rewards/rejected": -0.06250844895839691, "step": 100 }, { "epoch": 0.17, "learning_rate": 4.931699543346854e-06, "logits/chosen": 0.20096346735954285, "logits/rejected": 0.3208036422729492, "logps/chosen": -412.08306884765625, "logps/rejected": -410.16162109375, "loss": 0.2759, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.06705033034086227, "rewards/margins": 0.07482309639453888, "rewards/rejected": -0.14187341928482056, "step": 110 }, { "epoch": 0.18, "learning_rate": 4.897364164920515e-06, "logits/chosen": 0.15477022528648376, "logits/rejected": 0.24951288104057312, "logps/chosen": -500.6244201660156, "logps/rejected": -552.2005004882812, "loss": 0.284, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.15765079855918884, "rewards/margins": 0.10079488903284073, "rewards/rejected": -0.258445680141449, "step": 120 }, { "epoch": 0.2, "learning_rate": 4.8562331973035396e-06, "logits/chosen": 0.15251179039478302, "logits/rejected": 0.2451750487089157, "logps/chosen": -500.58416748046875, "logps/rejected": -580.9321899414062, "loss": 0.2784, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.15634217858314514, "rewards/margins": 0.12097392231225967, "rewards/rejected": -0.27731606364250183, "step": 130 }, { "epoch": 0.21, "learning_rate": 4.808423230692374e-06, "logits/chosen": 0.14251364767551422, "logits/rejected": 0.3138013184070587, "logps/chosen": -508.4844665527344, "logps/rejected": -568.8987426757812, "loss": 0.2638, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.15639568865299225, "rewards/margins": 0.1208159551024437, "rewards/rejected": -0.27721160650253296, "step": 140 }, { "epoch": 0.23, "learning_rate": 4.754069787631761e-06, "logits/chosen": 0.2015986144542694, "logits/rejected": 0.28302785754203796, "logps/chosen": -529.3695068359375, "logps/rejected": -604.30859375, "loss": 0.251, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.18331554532051086, "rewards/margins": 0.1465958058834076, "rewards/rejected": -0.32991132140159607, "step": 150 }, { "epoch": 0.24, "learning_rate": 4.693326938861367e-06, "logits/chosen": 0.19963108003139496, "logits/rejected": 0.2948302626609802, "logps/chosen": -566.9232788085938, "logps/rejected": -621.8565673828125, "loss": 0.2514, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2004656344652176, "rewards/margins": 0.12107620388269424, "rewards/rejected": -0.32154184579849243, "step": 160 }, { "epoch": 0.26, "learning_rate": 4.626366866585528e-06, "logits/chosen": 0.20774176716804504, "logits/rejected": 0.31210097670555115, "logps/chosen": -525.9526977539062, "logps/rejected": -598.7525634765625, "loss": 0.2523, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2060777246952057, "rewards/margins": 0.1383783519268036, "rewards/rejected": -0.3444560766220093, "step": 170 }, { "epoch": 0.27, "learning_rate": 4.553379376404085e-06, "logits/chosen": 0.1882271021604538, "logits/rejected": 0.2884698808193207, "logps/chosen": -622.5642700195312, "logps/rejected": -703.97998046875, "loss": 0.2686, "rewards/accuracies": 0.75, "rewards/chosen": -0.2503646910190582, "rewards/margins": 0.14028559625148773, "rewards/rejected": -0.39065021276474, "step": 180 }, { "epoch": 0.29, "learning_rate": 4.474571359287791e-06, "logits/chosen": 0.1948501169681549, "logits/rejected": 0.3517269492149353, "logps/chosen": -605.8263549804688, "logps/rejected": -685.9619750976562, "loss": 0.2324, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.21189594268798828, "rewards/margins": 0.19469039142131805, "rewards/rejected": -0.40658634901046753, "step": 190 }, { "epoch": 0.3, "learning_rate": 4.3901662051233755e-06, "logits/chosen": 0.1501813530921936, "logits/rejected": 0.29967957735061646, "logps/chosen": -588.287841796875, "logps/rejected": -692.4317626953125, "loss": 0.232, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.21489660441875458, "rewards/margins": 0.20667514204978943, "rewards/rejected": -0.4215717315673828, "step": 200 }, { "epoch": 0.32, "learning_rate": 4.30040316949064e-06, "logits/chosen": 0.1845458298921585, "logits/rejected": 0.25728127360343933, "logps/chosen": -593.4507446289062, "logps/rejected": -647.9249267578125, "loss": 0.268, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2259581983089447, "rewards/margins": 0.1573750227689743, "rewards/rejected": -0.3833332657814026, "step": 210 }, { "epoch": 0.34, "learning_rate": 4.205536695466524e-06, "logits/chosen": 0.19619445502758026, "logits/rejected": 0.289485901594162, "logps/chosen": -596.2908325195312, "logps/rejected": -673.9340209960938, "loss": 0.2279, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.20733025670051575, "rewards/margins": 0.1769087016582489, "rewards/rejected": -0.38423892855644226, "step": 220 }, { "epoch": 0.35, "learning_rate": 4.105835692378557e-06, "logits/chosen": 0.18007412552833557, "logits/rejected": 0.23454514145851135, "logps/chosen": -605.221923828125, "logps/rejected": -682.0065307617188, "loss": 0.2477, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2464592009782791, "rewards/margins": 0.1562257707118988, "rewards/rejected": -0.4026849865913391, "step": 230 }, { "epoch": 0.37, "learning_rate": 4.001582773552153e-06, "logits/chosen": 0.17503593862056732, "logits/rejected": 0.26662883162498474, "logps/chosen": -545.9107666015625, "logps/rejected": -640.1486206054688, "loss": 0.2225, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.23691585659980774, "rewards/margins": 0.16106419265270233, "rewards/rejected": -0.39798006415367126, "step": 240 }, { "epoch": 0.38, "learning_rate": 3.893073455212438e-06, "logits/chosen": 0.200395867228508, "logits/rejected": 0.3584834933280945, "logps/chosen": -608.1827392578125, "logps/rejected": -740.0325317382812, "loss": 0.2478, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.26550978422164917, "rewards/margins": 0.1945229470729828, "rewards/rejected": -0.46003276109695435, "step": 250 }, { "epoch": 0.4, "learning_rate": 3.7806153188114027e-06, "logits/chosen": 0.1786949336528778, "logits/rejected": 0.2547721266746521, "logps/chosen": -599.2467041015625, "logps/rejected": -727.934326171875, "loss": 0.2601, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.22916042804718018, "rewards/margins": 0.17522968351840973, "rewards/rejected": -0.40439003705978394, "step": 260 }, { "epoch": 0.41, "learning_rate": 3.6645271391548542e-06, "logits/chosen": 0.18216630816459656, "logits/rejected": 0.24754111468791962, "logps/chosen": -618.3128051757812, "logps/rejected": -640.1773681640625, "loss": 0.2761, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.22649279236793518, "rewards/margins": 0.14542898535728455, "rewards/rejected": -0.37192174792289734, "step": 270 }, { "epoch": 0.43, "learning_rate": 3.5451379808006014e-06, "logits/chosen": 0.19754137098789215, "logits/rejected": 0.27186593413352966, "logps/chosen": -573.2451171875, "logps/rejected": -707.2796020507812, "loss": 0.2209, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.2302563190460205, "rewards/margins": 0.19838322699069977, "rewards/rejected": -0.42863956093788147, "step": 280 }, { "epoch": 0.44, "learning_rate": 3.4227862652892106e-06, "logits/chosen": 0.16008315980434418, "logits/rejected": 0.27827057242393494, "logps/chosen": -662.4371948242188, "logps/rejected": -759.597900390625, "loss": 0.2408, "rewards/accuracies": 0.75, "rewards/chosen": -0.2687516212463379, "rewards/margins": 0.17680145800113678, "rewards/rejected": -0.4455530643463135, "step": 290 }, { "epoch": 0.46, "learning_rate": 3.2978188118513814e-06, "logits/chosen": 0.17593641579151154, "logits/rejected": 0.2789141535758972, "logps/chosen": -642.6029052734375, "logps/rejected": -753.4479370117188, "loss": 0.2304, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.27796030044555664, "rewards/margins": 0.1859740912914276, "rewards/rejected": -0.46393436193466187, "step": 300 }, { "epoch": 0.47, "learning_rate": 3.1705898543111576e-06, "logits/chosen": 0.13969933986663818, "logits/rejected": 0.3079048693180084, "logps/chosen": -630.0859375, "logps/rejected": -765.4630737304688, "loss": 0.2384, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.2862771153450012, "rewards/margins": 0.19564750790596008, "rewards/rejected": -0.4819245934486389, "step": 310 }, { "epoch": 0.49, "learning_rate": 3.041460036971664e-06, "logits/chosen": 0.14904941618442535, "logits/rejected": 0.27918586134910583, "logps/chosen": -619.8759765625, "logps/rejected": -759.2853393554688, "loss": 0.2347, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.27361705899238586, "rewards/margins": 0.19538012146949768, "rewards/rejected": -0.4689972400665283, "step": 320 }, { "epoch": 0.5, "learning_rate": 2.910795392329649e-06, "logits/chosen": 0.1377524435520172, "logits/rejected": 0.2504701614379883, "logps/chosen": -647.4981689453125, "logps/rejected": -789.5301513671875, "loss": 0.2352, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.2706158757209778, "rewards/margins": 0.18975581228733063, "rewards/rejected": -0.460371732711792, "step": 330 }, { "epoch": 0.52, "learning_rate": 2.7789663035166035e-06, "logits/chosen": 0.18703623116016388, "logits/rejected": 0.3294333815574646, "logps/chosen": -656.440673828125, "logps/rejected": -793.1935424804688, "loss": 0.2072, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2833554744720459, "rewards/margins": 0.21432232856750488, "rewards/rejected": -0.4976778030395508, "step": 340 }, { "epoch": 0.53, "learning_rate": 2.6463464544075344e-06, "logits/chosen": 0.15668293833732605, "logits/rejected": 0.2470695674419403, "logps/chosen": -603.0676879882812, "logps/rejected": -736.9429931640625, "loss": 0.227, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.27748292684555054, "rewards/margins": 0.1968688815832138, "rewards/rejected": -0.4743518829345703, "step": 350 }, { "epoch": 0.55, "learning_rate": 2.513311770373421e-06, "logits/chosen": 0.09050649404525757, "logits/rejected": 0.27548736333847046, "logps/chosen": -645.054443359375, "logps/rejected": -780.7376708984375, "loss": 0.2277, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.27279263734817505, "rewards/margins": 0.2085564136505127, "rewards/rejected": -0.48134905099868774, "step": 360 }, { "epoch": 0.56, "learning_rate": 2.380239352679908e-06, "logits/chosen": 0.11263048648834229, "logits/rejected": 0.23795035481452942, "logps/chosen": -635.0528564453125, "logps/rejected": -752.4202270507812, "loss": 0.2295, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.28419169783592224, "rewards/margins": 0.18686431646347046, "rewards/rejected": -0.4710559844970703, "step": 370 }, { "epoch": 0.58, "learning_rate": 2.247506409552795e-06, "logits/chosen": 0.18179914355278015, "logits/rejected": 0.2870942950248718, "logps/chosen": -632.6707153320312, "logps/rejected": -745.6337890625, "loss": 0.21, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.2822064459323883, "rewards/margins": 0.18581680953502655, "rewards/rejected": -0.4680232107639313, "step": 380 }, { "epoch": 0.59, "learning_rate": 2.1154891869403436e-06, "logits/chosen": 0.1858624517917633, "logits/rejected": 0.24371235072612762, "logps/chosen": -612.2597045898438, "logps/rejected": -773.7772216796875, "loss": 0.24, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.2878335416316986, "rewards/margins": 0.1961011290550232, "rewards/rejected": -0.4839346408843994, "step": 390 }, { "epoch": 0.61, "learning_rate": 1.9845619020032552e-06, "logits/chosen": 0.11868009716272354, "logits/rejected": 0.2442179173231125, "logps/chosen": -628.7079467773438, "logps/rejected": -782.9298095703125, "loss": 0.2128, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.2861320376396179, "rewards/margins": 0.2013685703277588, "rewards/rejected": -0.4875006079673767, "step": 400 }, { "epoch": 0.62, "learning_rate": 1.8550956823554708e-06, "logits/chosen": 0.15941022336483002, "logits/rejected": 0.26665717363357544, "logps/chosen": -611.2957763671875, "logps/rejected": -745.6875610351562, "loss": 0.2419, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.26627546548843384, "rewards/margins": 0.20409245789051056, "rewards/rejected": -0.4703678488731384, "step": 410 }, { "epoch": 0.64, "learning_rate": 1.7274575140626318e-06, "logits/chosen": 0.12057618796825409, "logits/rejected": 0.2439500391483307, "logps/chosen": -563.8010864257812, "logps/rejected": -753.3555908203125, "loss": 0.2241, "rewards/accuracies": 0.75, "rewards/chosen": -0.22670452296733856, "rewards/margins": 0.22902043163776398, "rewards/rejected": -0.45572495460510254, "step": 420 }, { "epoch": 0.66, "learning_rate": 1.6020092013802002e-06, "logits/chosen": 0.1292915642261505, "logits/rejected": 0.27418118715286255, "logps/chosen": -623.4757080078125, "logps/rejected": -712.2957153320312, "loss": 0.2196, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.24320180714130402, "rewards/margins": 0.19605949521064758, "rewards/rejected": -0.4392613470554352, "step": 430 }, { "epoch": 0.67, "learning_rate": 1.4791063411799938e-06, "logits/chosen": 0.10819476842880249, "logits/rejected": 0.2511535882949829, "logps/chosen": -570.8446044921875, "logps/rejected": -685.5501708984375, "loss": 0.2124, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.22663471102714539, "rewards/margins": 0.20201578736305237, "rewards/rejected": -0.42865046858787537, "step": 440 }, { "epoch": 0.69, "learning_rate": 1.3590973149722103e-06, "logits/chosen": 0.15888772904872894, "logits/rejected": 0.2368732988834381, "logps/chosen": -589.4254150390625, "logps/rejected": -735.7716674804688, "loss": 0.2038, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.26341313123703003, "rewards/margins": 0.1875002086162567, "rewards/rejected": -0.45091336965560913, "step": 450 }, { "epoch": 0.7, "learning_rate": 1.2423223013801946e-06, "logits/chosen": 0.14813843369483948, "logits/rejected": 0.21993982791900635, "logps/chosen": -589.8951416015625, "logps/rejected": -729.8585205078125, "loss": 0.2504, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2774823307991028, "rewards/margins": 0.1887308657169342, "rewards/rejected": -0.4662131667137146, "step": 460 }, { "epoch": 0.72, "learning_rate": 1.1291123118671665e-06, "logits/chosen": 0.09890522062778473, "logits/rejected": 0.2421140968799591, "logps/chosen": -639.2291870117188, "logps/rejected": -729.2130126953125, "loss": 0.1871, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.2574080526828766, "rewards/margins": 0.20436222851276398, "rewards/rejected": -0.46177029609680176, "step": 470 }, { "epoch": 0.73, "learning_rate": 1.019788252448267e-06, "logits/chosen": 0.1626007854938507, "logits/rejected": 0.2534940242767334, "logps/chosen": -614.7600708007812, "logps/rejected": -789.6011352539062, "loss": 0.202, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.27494579553604126, "rewards/margins": 0.21227261424064636, "rewards/rejected": -0.48721843957901, "step": 480 }, { "epoch": 0.75, "learning_rate": 9.146600140475945e-07, "logits/chosen": 0.18194663524627686, "logits/rejected": 0.2571627199649811, "logps/chosen": -695.4674682617188, "logps/rejected": -824.3109130859375, "loss": 0.2242, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.31008249521255493, "rewards/margins": 0.20844268798828125, "rewards/rejected": -0.518525242805481, "step": 490 }, { "epoch": 0.76, "learning_rate": 8.140255940787059e-07, "logits/chosen": 0.1600276529788971, "logits/rejected": 0.24592892825603485, "logps/chosen": -621.3486328125, "logps/rejected": -787.8779296875, "loss": 0.2379, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.2887078523635864, "rewards/margins": 0.21262690424919128, "rewards/rejected": -0.5013347864151001, "step": 500 }, { "epoch": 0.78, "learning_rate": 7.181702517385789e-07, "logits/chosen": 0.11802725493907928, "logits/rejected": 0.24163658916950226, "logps/chosen": -626.5403442382812, "logps/rejected": -774.00830078125, "loss": 0.219, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.27267104387283325, "rewards/margins": 0.21761374175548553, "rewards/rejected": -0.4902847409248352, "step": 510 }, { "epoch": 0.79, "learning_rate": 6.273656994094232e-07, "logits/chosen": 0.1450013965368271, "logits/rejected": 0.23478934168815613, "logps/chosen": -608.7362060546875, "logps/rejected": -739.3218994140625, "loss": 0.2294, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.2396283596754074, "rewards/margins": 0.21917715668678284, "rewards/rejected": -0.45880550146102905, "step": 520 }, { "epoch": 0.81, "learning_rate": 5.418693324604082e-07, "logits/chosen": 0.16014720499515533, "logits/rejected": 0.25679388642311096, "logps/chosen": -598.508056640625, "logps/rejected": -752.1389770507812, "loss": 0.2315, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.26550406217575073, "rewards/margins": 0.20078308880329132, "rewards/rejected": -0.46628713607788086, "step": 530 }, { "epoch": 0.82, "learning_rate": 4.619234996325314e-07, "logits/chosen": 0.10893194377422333, "logits/rejected": 0.18960845470428467, "logps/chosen": -589.2452392578125, "logps/rejected": -712.2008666992188, "loss": 0.2139, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.2257637232542038, "rewards/margins": 0.20952418446540833, "rewards/rejected": -0.4352878928184509, "step": 540 }, { "epoch": 0.84, "learning_rate": 3.877548160747768e-07, "logits/chosen": 0.1536407321691513, "logits/rejected": 0.25428491830825806, "logps/chosen": -667.2605590820312, "logps/rejected": -712.8155517578125, "loss": 0.2543, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.26264914870262146, "rewards/margins": 0.1710439920425415, "rewards/rejected": -0.4336931109428406, "step": 550 }, { "epoch": 0.85, "learning_rate": 3.195735209788528e-07, "logits/chosen": 0.1331530511379242, "logits/rejected": 0.25019171833992004, "logps/chosen": -614.9050903320312, "logps/rejected": -703.8809814453125, "loss": 0.2492, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.27054914832115173, "rewards/margins": 0.14634455740451813, "rewards/rejected": -0.4168936610221863, "step": 560 }, { "epoch": 0.87, "learning_rate": 2.5757288163336806e-07, "logits/chosen": 0.15042927861213684, "logits/rejected": 0.2752520442008972, "logps/chosen": -605.6216430664062, "logps/rejected": -749.6766357421875, "loss": 0.2349, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.2648725211620331, "rewards/margins": 0.18051207065582275, "rewards/rejected": -0.44538459181785583, "step": 570 }, { "epoch": 0.88, "learning_rate": 2.019286455866981e-07, "logits/chosen": 0.09744317829608917, "logits/rejected": 0.30391809344291687, "logps/chosen": -656.5165405273438, "logps/rejected": -782.3225708007812, "loss": 0.2347, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.26627975702285767, "rewards/margins": 0.2162231206893921, "rewards/rejected": -0.48250293731689453, "step": 580 }, { "epoch": 0.9, "learning_rate": 1.5279854247146703e-07, "logits/chosen": 0.10827809572219849, "logits/rejected": 0.23128509521484375, "logps/chosen": -597.4005126953125, "logps/rejected": -727.5601196289062, "loss": 0.2356, "rewards/accuracies": 0.75, "rewards/chosen": -0.2508590817451477, "rewards/margins": 0.19355928897857666, "rewards/rejected": -0.44441837072372437, "step": 590 }, { "epoch": 0.91, "learning_rate": 1.1032183690276754e-07, "logits/chosen": 0.1284581571817398, "logits/rejected": 0.22490856051445007, "logps/chosen": -608.4317016601562, "logps/rejected": -756.5374755859375, "loss": 0.2282, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.27711886167526245, "rewards/margins": 0.19119976460933685, "rewards/rejected": -0.4683186411857605, "step": 600 }, { "epoch": 0.93, "learning_rate": 7.46189337174788e-08, "logits/chosen": 0.1757223904132843, "logits/rejected": 0.26904186606407166, "logps/chosen": -603.6378173828125, "logps/rejected": -660.9766235351562, "loss": 0.2443, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.24105675518512726, "rewards/margins": 0.1722828447818756, "rewards/rejected": -0.4133395552635193, "step": 610 }, { "epoch": 0.94, "learning_rate": 4.579103667367385e-08, "logits/chosen": 0.10849970579147339, "logits/rejected": 0.19708193838596344, "logps/chosen": -565.9713134765625, "logps/rejected": -679.5427856445312, "loss": 0.2211, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2512280344963074, "rewards/margins": 0.17560149729251862, "rewards/rejected": -0.4268294870853424, "step": 620 }, { "epoch": 0.96, "learning_rate": 2.3919861577572924e-08, "logits/chosen": 0.13952800631523132, "logits/rejected": 0.20136961340904236, "logps/chosen": -621.4708862304688, "logps/rejected": -748.71240234375, "loss": 0.2234, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.2641890048980713, "rewards/margins": 0.19575798511505127, "rewards/rejected": -0.45994701981544495, "step": 630 }, { "epoch": 0.98, "learning_rate": 9.067404651211808e-09, "logits/chosen": 0.18144121766090393, "logits/rejected": 0.23935365676879883, "logps/chosen": -601.0186767578125, "logps/rejected": -719.6766357421875, "loss": 0.2458, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.26298612356185913, "rewards/margins": 0.16590112447738647, "rewards/rejected": -0.428887277841568, "step": 640 }, { "epoch": 0.99, "learning_rate": 1.2757667974155896e-09, "logits/chosen": 0.13537321984767914, "logits/rejected": 0.2640915811061859, "logps/chosen": -576.1346435546875, "logps/rejected": -705.7242431640625, "loss": 0.2243, "rewards/accuracies": 0.75, "rewards/chosen": -0.22819241881370544, "rewards/margins": 0.1957322061061859, "rewards/rejected": -0.42392459511756897, "step": 650 }, { "epoch": 1.0, "step": 656, "total_flos": 0.0, "train_loss": 0.19985946376876132, "train_runtime": 6634.0696, "train_samples_per_second": 3.165, "train_steps_per_second": 0.099 } ], "logging_steps": 10, "max_steps": 656, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }