{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.996400719856029, "eval_steps": 500, "global_step": 832, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.952380952380953e-08, "logits/chosen": 0.11703574657440186, "logits/rejected": 0.3661181330680847, "logps/chosen": -218.64993286132812, "logps/rejected": -191.34808349609375, "loss": 0.3408, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "learning_rate": 5.952380952380953e-07, "logits/chosen": 0.10404814779758453, "logits/rejected": 0.23778128623962402, "logps/chosen": -401.4896240234375, "logps/rejected": -345.9862976074219, "loss": 0.3642, "rewards/accuracies": 0.4791666567325592, "rewards/chosen": 0.0004916194593533874, "rewards/margins": 0.0005594216636382043, "rewards/rejected": -6.780229159630835e-05, "step": 10 }, { "epoch": 0.05, "learning_rate": 1.1904761904761906e-06, "logits/chosen": 0.13218173384666443, "logits/rejected": 0.20688870549201965, "logps/chosen": -336.506591796875, "logps/rejected": -319.3189392089844, "loss": 0.3689, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": 0.00020826223772019148, "rewards/margins": -0.000311180017888546, "rewards/rejected": 0.0005194421974010766, "step": 20 }, { "epoch": 0.07, "learning_rate": 1.7857142857142859e-06, "logits/chosen": 0.11459924280643463, "logits/rejected": 0.1922653764486313, "logps/chosen": -342.02569580078125, "logps/rejected": -324.1275939941406, "loss": 0.3786, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0006439354037865996, "rewards/margins": 0.0004738263669423759, "rewards/rejected": -0.0011177618289366364, "step": 30 }, { "epoch": 0.1, "learning_rate": 2.380952380952381e-06, "logits/chosen": 0.13577614724636078, "logits/rejected": 0.17847472429275513, "logps/chosen": -298.6214294433594, "logps/rejected": -289.40850830078125, "loss": 0.3689, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0008146329782903194, "rewards/margins": 0.0024004268925637007, "rewards/rejected": -0.001585794030688703, "step": 40 }, { "epoch": 0.12, "learning_rate": 2.9761904761904763e-06, "logits/chosen": 0.10261678695678711, "logits/rejected": 0.20306341350078583, "logps/chosen": -351.93572998046875, "logps/rejected": -362.153564453125, "loss": 0.3692, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0009010158246383071, "rewards/margins": 0.004100508522242308, "rewards/rejected": -0.003199493046849966, "step": 50 }, { "epoch": 0.14, "learning_rate": 3.5714285714285718e-06, "logits/chosen": 0.13770776987075806, "logits/rejected": 0.2188442498445511, "logps/chosen": -349.51690673828125, "logps/rejected": -351.1549377441406, "loss": 0.3655, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.003258631331846118, "rewards/margins": 0.007584023289382458, "rewards/rejected": -0.004325392190366983, "step": 60 }, { "epoch": 0.17, "learning_rate": 4.166666666666667e-06, "logits/chosen": 0.1271902620792389, "logits/rejected": 0.23070549964904785, "logps/chosen": -378.33843994140625, "logps/rejected": -350.60662841796875, "loss": 0.3586, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.006277147680521011, "rewards/margins": 0.015207210555672646, "rewards/rejected": -0.00893006194382906, "step": 70 }, { "epoch": 0.19, "learning_rate": 4.761904761904762e-06, "logits/chosen": 0.08625562489032745, "logits/rejected": 0.12316304445266724, "logps/chosen": -307.9439697265625, "logps/rejected": -335.3281555175781, "loss": 0.3489, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.013669237494468689, "rewards/margins": 0.02226843498647213, "rewards/rejected": -0.008599198423326015, "step": 80 }, { "epoch": 0.22, "learning_rate": 4.9992062457191005e-06, "logits/chosen": 0.137899249792099, "logits/rejected": 0.2165641039609909, "logps/chosen": -355.6449890136719, "logps/rejected": -338.1387634277344, "loss": 0.3229, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.013719858601689339, "rewards/margins": 0.042457275092601776, "rewards/rejected": -0.028737416490912437, "step": 90 }, { "epoch": 0.24, "learning_rate": 4.994357350311441e-06, "logits/chosen": 0.14011432230472565, "logits/rejected": 0.21795734763145447, "logps/chosen": -360.2173156738281, "logps/rejected": -358.1722717285156, "loss": 0.3043, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.001885895850136876, "rewards/margins": 0.06035756319761276, "rewards/rejected": -0.06224345415830612, "step": 100 }, { "epoch": 0.26, "learning_rate": 4.98510907587894e-06, "logits/chosen": 0.13077042996883392, "logits/rejected": 0.21840377151966095, "logps/chosen": -356.6605224609375, "logps/rejected": -348.19476318359375, "loss": 0.3169, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.008259604685008526, "rewards/margins": 0.08336080610752106, "rewards/rejected": -0.09162042289972305, "step": 110 }, { "epoch": 0.29, "learning_rate": 4.97147773390341e-06, "logits/chosen": 0.14791826903820038, "logits/rejected": 0.1786331683397293, "logps/chosen": -320.29608154296875, "logps/rejected": -337.16864013671875, "loss": 0.2861, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.008191597647964954, "rewards/margins": 0.09656454622745514, "rewards/rejected": -0.08837294578552246, "step": 120 }, { "epoch": 0.31, "learning_rate": 4.953487366425163e-06, "logits/chosen": 0.12249626964330673, "logits/rejected": 0.16907112300395966, "logps/chosen": -342.0648498535156, "logps/rejected": -363.51031494140625, "loss": 0.3175, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0017940097022801638, "rewards/margins": 0.07947574555873871, "rewards/rejected": -0.07768173515796661, "step": 130 }, { "epoch": 0.34, "learning_rate": 4.931169703639282e-06, "logits/chosen": 0.0919104740023613, "logits/rejected": 0.18652714788913727, "logps/chosen": -337.65374755859375, "logps/rejected": -364.11199951171875, "loss": 0.2828, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.03137553483247757, "rewards/margins": 0.12489553540945053, "rewards/rejected": -0.09352000057697296, "step": 140 }, { "epoch": 0.36, "learning_rate": 4.904564107932048e-06, "logits/chosen": 0.13001379370689392, "logits/rejected": 0.20237913727760315, "logps/chosen": -351.857421875, "logps/rejected": -336.6232604980469, "loss": 0.2899, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.0018621661001816392, "rewards/margins": 0.10416339337825775, "rewards/rejected": -0.10602555423974991, "step": 150 }, { "epoch": 0.38, "learning_rate": 4.873717504456219e-06, "logits/chosen": 0.06932858377695084, "logits/rejected": 0.15127311646938324, "logps/chosen": -345.0473937988281, "logps/rejected": -363.4601745605469, "loss": 0.2889, "rewards/accuracies": 0.65625, "rewards/chosen": -0.01222093403339386, "rewards/margins": 0.11404307186603546, "rewards/rejected": -0.12626400589942932, "step": 160 }, { "epoch": 0.41, "learning_rate": 4.838684298367616e-06, "logits/chosen": 0.16357803344726562, "logits/rejected": 0.23174886405467987, "logps/chosen": -357.15289306640625, "logps/rejected": -358.61065673828125, "loss": 0.2884, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.017432499676942825, "rewards/margins": 0.11707814782857895, "rewards/rejected": -0.09964564442634583, "step": 170 }, { "epoch": 0.43, "learning_rate": 4.7995262788689865e-06, "logits/chosen": 0.16258656978607178, "logits/rejected": 0.2536885738372803, "logps/chosen": -337.7535705566406, "logps/rejected": -346.13470458984375, "loss": 0.2789, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.02853301540017128, "rewards/margins": 0.1016291156411171, "rewards/rejected": -0.07309609651565552, "step": 180 }, { "epoch": 0.46, "learning_rate": 4.756312510230377e-06, "logits/chosen": 0.14243337512016296, "logits/rejected": 0.24410876631736755, "logps/chosen": -376.64599609375, "logps/rejected": -363.4615478515625, "loss": 0.2828, "rewards/accuracies": 0.71875, "rewards/chosen": 0.03516390174627304, "rewards/margins": 0.12421919405460358, "rewards/rejected": -0.08905528485774994, "step": 190 }, { "epoch": 0.48, "learning_rate": 4.709119209978242e-06, "logits/chosen": 0.17320121824741364, "logits/rejected": 0.2264091521501541, "logps/chosen": -362.0121765136719, "logps/rejected": -352.7041931152344, "loss": 0.283, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.039128489792346954, "rewards/margins": 0.11708054691553116, "rewards/rejected": -0.07795204222202301, "step": 200 }, { "epoch": 0.5, "learning_rate": 4.6580296144681155e-06, "logits/chosen": 0.1604190617799759, "logits/rejected": 0.17792078852653503, "logps/chosen": -315.1614074707031, "logps/rejected": -340.53619384765625, "loss": 0.2754, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.05531097203493118, "rewards/margins": 0.15012916922569275, "rewards/rejected": -0.09481821954250336, "step": 210 }, { "epoch": 0.53, "learning_rate": 4.603133832077953e-06, "logits/chosen": 0.11915634572505951, "logits/rejected": 0.15653367340564728, "logps/chosen": -351.16986083984375, "logps/rejected": -354.53607177734375, "loss": 0.2738, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.06388933956623077, "rewards/margins": 0.1507207453250885, "rewards/rejected": -0.08683140575885773, "step": 220 }, { "epoch": 0.55, "learning_rate": 4.544528684281056e-06, "logits/chosen": 0.09443524479866028, "logits/rejected": 0.1415812075138092, "logps/chosen": -355.2025451660156, "logps/rejected": -349.1300354003906, "loss": 0.276, "rewards/accuracies": 0.625, "rewards/chosen": 0.021877283230423927, "rewards/margins": 0.1259470283985138, "rewards/rejected": -0.10406973212957382, "step": 230 }, { "epoch": 0.58, "learning_rate": 4.482317534878901e-06, "logits/chosen": 0.08314280211925507, "logits/rejected": 0.11439633369445801, "logps/chosen": -333.59295654296875, "logps/rejected": -341.5171203613281, "loss": 0.2668, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.016557829454541206, "rewards/margins": 0.11629464477300644, "rewards/rejected": -0.09973680973052979, "step": 240 }, { "epoch": 0.6, "learning_rate": 4.416610107695043e-06, "logits/chosen": 0.11690554767847061, "logits/rejected": 0.06475332379341125, "logps/chosen": -331.7200012207031, "logps/rejected": -341.45245361328125, "loss": 0.2819, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.025893816724419594, "rewards/margins": 0.13103850185871124, "rewards/rejected": -0.15693232417106628, "step": 250 }, { "epoch": 0.62, "learning_rate": 4.3475222930516484e-06, "logits/chosen": 0.08940346539020538, "logits/rejected": 0.12766343355178833, "logps/chosen": -333.33343505859375, "logps/rejected": -372.55755615234375, "loss": 0.2833, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.026656050235033035, "rewards/margins": 0.16600963473320007, "rewards/rejected": -0.19266566634178162, "step": 260 }, { "epoch": 0.65, "learning_rate": 4.2751759433699745e-06, "logits/chosen": 0.04847298935055733, "logits/rejected": 0.11083607375621796, "logps/chosen": -342.9352722167969, "logps/rejected": -357.6617736816406, "loss": 0.274, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.021627375856041908, "rewards/margins": 0.12919363379478455, "rewards/rejected": -0.1508210003376007, "step": 270 }, { "epoch": 0.67, "learning_rate": 4.199698658255298e-06, "logits/chosen": 0.056878913193941116, "logits/rejected": 0.14858202636241913, "logps/chosen": -370.22637939453125, "logps/rejected": -398.57159423828125, "loss": 0.2715, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.021515587344765663, "rewards/margins": 0.1492767035961151, "rewards/rejected": -0.17079228162765503, "step": 280 }, { "epoch": 0.7, "learning_rate": 4.121223559445343e-06, "logits/chosen": 0.03415738046169281, "logits/rejected": 0.12577436864376068, "logps/chosen": -352.68072509765625, "logps/rejected": -383.16204833984375, "loss": 0.264, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.03958406671881676, "rewards/margins": 0.1690487265586853, "rewards/rejected": -0.20863279700279236, "step": 290 }, { "epoch": 0.72, "learning_rate": 4.039889056019159e-06, "logits/chosen": 0.02515377476811409, "logits/rejected": 0.10390216112136841, "logps/chosen": -353.2736511230469, "logps/rejected": -353.888671875, "loss": 0.2461, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.031048249453306198, "rewards/margins": 0.1348181664943695, "rewards/rejected": -0.1658664047718048, "step": 300 }, { "epoch": 0.74, "learning_rate": 3.955838600280535e-06, "logits/chosen": 0.025213222950696945, "logits/rejected": 0.1410323679447174, "logps/chosen": -387.21856689453125, "logps/rejected": -373.70355224609375, "loss": 0.2703, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.010617800056934357, "rewards/margins": 0.19538867473602295, "rewards/rejected": -0.184770867228508, "step": 310 }, { "epoch": 0.77, "learning_rate": 3.869220434746509e-06, "logits/chosen": 0.06151404231786728, "logits/rejected": 0.1290605366230011, "logps/chosen": -345.41571044921875, "logps/rejected": -370.25592041015625, "loss": 0.2703, "rewards/accuracies": 0.75, "rewards/chosen": -0.019938651472330093, "rewards/margins": 0.16865777969360352, "rewards/rejected": -0.1885964572429657, "step": 320 }, { "epoch": 0.79, "learning_rate": 3.7801873306872315e-06, "logits/chosen": 0.06525089591741562, "logits/rejected": 0.12144273519515991, "logps/chosen": -340.03277587890625, "logps/rejected": -371.6439514160156, "loss": 0.2577, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.02320241369307041, "rewards/margins": 0.17125853896141052, "rewards/rejected": -0.14805614948272705, "step": 330 }, { "epoch": 0.82, "learning_rate": 3.688896318678322e-06, "logits/chosen": 0.055392809212207794, "logits/rejected": 0.12697988748550415, "logps/chosen": -349.14556884765625, "logps/rejected": -333.9625549316406, "loss": 0.2748, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.016882654279470444, "rewards/margins": 0.16859912872314453, "rewards/rejected": -0.1517164707183838, "step": 340 }, { "epoch": 0.84, "learning_rate": 3.5955084116409382e-06, "logits/chosen": 0.08919240534305573, "logits/rejected": 0.1610582321882248, "logps/chosen": -367.30621337890625, "logps/rejected": -346.13873291015625, "loss": 0.2664, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.04106982424855232, "rewards/margins": 0.14082172513008118, "rewards/rejected": -0.1818915605545044, "step": 350 }, { "epoch": 0.86, "learning_rate": 3.5001883208580668e-06, "logits/chosen": 0.056862883269786835, "logits/rejected": 0.14601710438728333, "logps/chosen": -383.3697204589844, "logps/rejected": -388.45147705078125, "loss": 0.2359, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.005547699984163046, "rewards/margins": 0.20355132222175598, "rewards/rejected": -0.20909900963306427, "step": 360 }, { "epoch": 0.89, "learning_rate": 3.403104165467883e-06, "logits/chosen": 0.047759585082530975, "logits/rejected": 0.1289873570203781, "logps/chosen": -363.989990234375, "logps/rejected": -361.4288330078125, "loss": 0.2491, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.030249441042542458, "rewards/margins": 0.1802445650100708, "rewards/rejected": -0.2104939967393875, "step": 370 }, { "epoch": 0.91, "learning_rate": 3.30442717594657e-06, "logits/chosen": 0.06461011618375778, "logits/rejected": 0.14733566343784332, "logps/chosen": -350.331298828125, "logps/rejected": -334.6890563964844, "loss": 0.2754, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.04477550461888313, "rewards/margins": 0.12478353828191757, "rewards/rejected": -0.1695590317249298, "step": 380 }, { "epoch": 0.94, "learning_rate": 3.2043313921035747e-06, "logits/chosen": 0.07650026679039001, "logits/rejected": 0.10351625829935074, "logps/chosen": -319.55328369140625, "logps/rejected": -328.97625732421875, "loss": 0.2601, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.017551960423588753, "rewards/margins": 0.1492632031440735, "rewards/rejected": -0.1668151617050171, "step": 390 }, { "epoch": 0.96, "learning_rate": 3.102993356121938e-06, "logits/chosen": 0.045068711042404175, "logits/rejected": 0.133053719997406, "logps/chosen": -376.1606750488281, "logps/rejected": -360.3962097167969, "loss": 0.2547, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.012314733117818832, "rewards/margins": 0.18502004444599152, "rewards/rejected": -0.19733479619026184, "step": 400 }, { "epoch": 0.98, "learning_rate": 3.0005918011851245e-06, "logits/chosen": 0.03985997289419174, "logits/rejected": 0.1656588464975357, "logps/chosen": -379.48199462890625, "logps/rejected": -362.08380126953125, "loss": 0.273, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.00483871391043067, "rewards/margins": 0.1729108989238739, "rewards/rejected": -0.16807220876216888, "step": 410 }, { "epoch": 1.01, "learning_rate": 2.8973073362395e-06, "logits/chosen": 0.06932957470417023, "logits/rejected": 0.11695323139429092, "logps/chosen": -350.8485107421875, "logps/rejected": -359.5559387207031, "loss": 0.2562, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.015226135030388832, "rewards/margins": 0.13259340822696686, "rewards/rejected": -0.14781954884529114, "step": 420 }, { "epoch": 1.03, "learning_rate": 2.7933221274484725e-06, "logits/chosen": 0.022776301950216293, "logits/rejected": 0.1463911086320877, "logps/chosen": -344.72900390625, "logps/rejected": -374.57110595703125, "loss": 0.2546, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.01165957935154438, "rewards/margins": 0.17338308691978455, "rewards/rejected": -0.1617235392332077, "step": 430 }, { "epoch": 1.06, "learning_rate": 2.6888195769001147e-06, "logits/chosen": 0.011232647113502026, "logits/rejected": 0.08440439403057098, "logps/chosen": -315.56158447265625, "logps/rejected": -370.6732177734375, "loss": 0.2635, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.0027101226150989532, "rewards/margins": 0.18474070727825165, "rewards/rejected": -0.1874508261680603, "step": 440 }, { "epoch": 1.08, "learning_rate": 2.583983999134951e-06, "logits/chosen": 0.033940933644771576, "logits/rejected": 0.12383987754583359, "logps/chosen": -353.528076171875, "logps/rejected": -358.25433349609375, "loss": 0.2647, "rewards/accuracies": 0.71875, "rewards/chosen": -0.01790205016732216, "rewards/margins": 0.16596254706382751, "rewards/rejected": -0.18386459350585938, "step": 450 }, { "epoch": 1.1, "learning_rate": 2.479000296064417e-06, "logits/chosen": 0.03699932247400284, "logits/rejected": 0.13089559972286224, "logps/chosen": -375.724609375, "logps/rejected": -400.3955383300781, "loss": 0.2481, "rewards/accuracies": 0.71875, "rewards/chosen": -0.041518934071063995, "rewards/margins": 0.1739250123500824, "rewards/rejected": -0.21544396877288818, "step": 460 }, { "epoch": 1.13, "learning_rate": 2.374053630853358e-06, "logits/chosen": 0.07867871224880219, "logits/rejected": 0.0793570876121521, "logps/chosen": -392.0462646484375, "logps/rejected": -398.4570617675781, "loss": 0.2589, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.03614164516329765, "rewards/margins": 0.18701379001140594, "rewards/rejected": -0.2231554538011551, "step": 470 }, { "epoch": 1.15, "learning_rate": 2.269329101341745e-06, "logits/chosen": 0.04767027124762535, "logits/rejected": 0.10338594764471054, "logps/chosen": -311.9175109863281, "logps/rejected": -353.84375, "loss": 0.253, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.022265803068876266, "rewards/margins": 0.21186105906963348, "rewards/rejected": -0.18959525227546692, "step": 480 }, { "epoch": 1.18, "learning_rate": 2.1650114135816052e-06, "logits/chosen": 0.04343586042523384, "logits/rejected": 0.14493630826473236, "logps/chosen": -368.74066162109375, "logps/rejected": -401.21746826171875, "loss": 0.254, "rewards/accuracies": 0.75, "rewards/chosen": -0.0024279176723212004, "rewards/margins": 0.1874578297138214, "rewards/rejected": -0.18988573551177979, "step": 490 }, { "epoch": 1.2, "learning_rate": 2.06128455606496e-06, "logits/chosen": 0.04143913835287094, "logits/rejected": 0.06632859259843826, "logps/chosen": -320.82281494140625, "logps/rejected": -348.89923095703125, "loss": 0.2438, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0026562472339719534, "rewards/margins": 0.18748678267002106, "rewards/rejected": -0.19014303386211395, "step": 500 }, { "epoch": 1.22, "learning_rate": 1.958331475217357e-06, "logits/chosen": 0.03532598540186882, "logits/rejected": 0.07111676037311554, "logps/chosen": -345.3083801269531, "logps/rejected": -391.5373840332031, "loss": 0.2428, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.011091398075222969, "rewards/margins": 0.18126052618026733, "rewards/rejected": -0.19235190749168396, "step": 510 }, { "epoch": 1.25, "learning_rate": 1.856333752729311e-06, "logits/chosen": 0.06463773548603058, "logits/rejected": 0.07833746820688248, "logps/chosen": -303.89508056640625, "logps/rejected": -328.54095458984375, "loss": 0.2549, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.030626490712165833, "rewards/margins": 0.14131976664066315, "rewards/rejected": -0.17194625735282898, "step": 520 }, { "epoch": 1.27, "learning_rate": 1.7554712852947915e-06, "logits/chosen": 0.017867419868707657, "logits/rejected": 0.13077208399772644, "logps/chosen": -354.83990478515625, "logps/rejected": -369.40447998046875, "loss": 0.2688, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.022668231278657913, "rewards/margins": 0.164995938539505, "rewards/rejected": -0.1876641809940338, "step": 530 }, { "epoch": 1.3, "learning_rate": 1.6559219673215784e-06, "logits/chosen": 0.07014649361371994, "logits/rejected": 0.11957643926143646, "logps/chosen": -341.1030578613281, "logps/rejected": -360.0315246582031, "loss": 0.2559, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.0031127408146858215, "rewards/margins": 0.17289015650749207, "rewards/rejected": -0.16977740824222565, "step": 540 }, { "epoch": 1.32, "learning_rate": 1.5578613771731214e-06, "logits/chosen": 0.044239241629838943, "logits/rejected": 0.11994221061468124, "logps/chosen": -347.32757568359375, "logps/rejected": -388.6127624511719, "loss": 0.244, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.0042419894598424435, "rewards/margins": 0.21681733429431915, "rewards/rejected": -0.22105932235717773, "step": 550 }, { "epoch": 1.34, "learning_rate": 1.4614624674952843e-06, "logits/chosen": 0.07131338119506836, "logits/rejected": 0.14118310809135437, "logps/chosen": -381.21112060546875, "logps/rejected": -375.3702087402344, "loss": 0.2594, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.01365007646381855, "rewards/margins": 0.16313722729682922, "rewards/rejected": -0.17678730189800262, "step": 560 }, { "epoch": 1.37, "learning_rate": 1.3668952601741442e-06, "logits/chosen": 0.019948173314332962, "logits/rejected": 0.14301837980747223, "logps/chosen": -359.31829833984375, "logps/rejected": -386.3388366699219, "loss": 0.2421, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.003145938040688634, "rewards/margins": 0.17547301948070526, "rewards/rejected": -0.17861898243427277, "step": 570 }, { "epoch": 1.39, "learning_rate": 1.2743265464628787e-06, "logits/chosen": 0.04147445410490036, "logits/rejected": 0.07641445100307465, "logps/chosen": -358.9191589355469, "logps/rejected": -354.82989501953125, "loss": 0.2574, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.03237663954496384, "rewards/margins": 0.14051951467990875, "rewards/rejected": -0.17289616167545319, "step": 580 }, { "epoch": 1.42, "learning_rate": 1.1839195928066101e-06, "logits/chosen": 0.010291008278727531, "logits/rejected": 0.08601720631122589, "logps/chosen": -338.0829162597656, "logps/rejected": -349.2616882324219, "loss": 0.2504, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.012054244987666607, "rewards/margins": 0.18035855889320374, "rewards/rejected": -0.19241279363632202, "step": 590 }, { "epoch": 1.44, "learning_rate": 1.0958338528840893e-06, "logits/chosen": 0.07830692082643509, "logits/rejected": 0.1112513542175293, "logps/chosen": -318.32928466796875, "logps/rejected": -351.01531982421875, "loss": 0.2642, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.012662211433053017, "rewards/margins": 0.15172497928142548, "rewards/rejected": -0.16438719630241394, "step": 600 }, { "epoch": 1.46, "learning_rate": 1.0102246863740498e-06, "logits/chosen": 0.013798505067825317, "logits/rejected": 0.13072696328163147, "logps/chosen": -326.76336669921875, "logps/rejected": -380.63458251953125, "loss": 0.2398, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.0045418571680784225, "rewards/margins": 0.19731177389621735, "rewards/rejected": -0.20185360312461853, "step": 610 }, { "epoch": 1.49, "learning_rate": 9.272430849423175e-07, "logits/chosen": 0.041550200432538986, "logits/rejected": 0.12003109604120255, "logps/chosen": -350.9006652832031, "logps/rejected": -404.7802734375, "loss": 0.2245, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.015362398698925972, "rewards/margins": 0.22952251136302948, "rewards/rejected": -0.21416012942790985, "step": 620 }, { "epoch": 1.51, "learning_rate": 8.470354059328919e-07, "logits/chosen": 0.104413703083992, "logits/rejected": 0.11118074506521225, "logps/chosen": -336.5838928222656, "logps/rejected": -373.56085205078125, "loss": 0.2452, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.010279458947479725, "rewards/margins": 0.2295042723417282, "rewards/rejected": -0.21922484040260315, "step": 630 }, { "epoch": 1.54, "learning_rate": 7.697431142327633e-07, "logits/chosen": 0.07976067811250687, "logits/rejected": 0.12730778753757477, "logps/chosen": -348.73443603515625, "logps/rejected": -358.34088134765625, "loss": 0.2338, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.030282145366072655, "rewards/margins": 0.16269627213478088, "rewards/rejected": -0.1929783970117569, "step": 640 }, { "epoch": 1.56, "learning_rate": 6.955025327656839e-07, "logits/chosen": 0.04196876287460327, "logits/rejected": 0.11756552755832672, "logps/chosen": -327.8496398925781, "logps/rejected": -355.4369201660156, "loss": 0.2558, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.0020761913619935513, "rewards/margins": 0.17507974803447723, "rewards/rejected": -0.17300358414649963, "step": 650 }, { "epoch": 1.58, "learning_rate": 6.244446020550182e-07, "logits/chosen": 0.05316174030303955, "logits/rejected": 0.10895484685897827, "logps/chosen": -354.5049133300781, "logps/rejected": -411.59765625, "loss": 0.2319, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.0010157767683267593, "rewards/margins": 0.21365651488304138, "rewards/rejected": -0.2146722972393036, "step": 660 }, { "epoch": 1.61, "learning_rate": 5.566946492796766e-07, "logits/chosen": 0.07230822741985321, "logits/rejected": 0.09754084050655365, "logps/chosen": -368.22802734375, "logps/rejected": -368.54974365234375, "loss": 0.2451, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.02271811105310917, "rewards/margins": 0.14353466033935547, "rewards/rejected": -0.16625277698040009, "step": 670 }, { "epoch": 1.63, "learning_rate": 4.923721672305148e-07, "logits/chosen": 0.04747115820646286, "logits/rejected": 0.10951533168554306, "logps/chosen": -373.25653076171875, "logps/rejected": -403.66619873046875, "loss": 0.262, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.2638385164318606e-05, "rewards/margins": 0.20511355996131897, "rewards/rejected": -0.2051461637020111, "step": 680 }, { "epoch": 1.66, "learning_rate": 4.3159060355700943e-07, "logits/chosen": 0.007146243005990982, "logits/rejected": 0.15595687925815582, "logps/chosen": -360.5429382324219, "logps/rejected": -360.84271240234375, "loss": 0.2528, "rewards/accuracies": 0.75, "rewards/chosen": -0.026043469086289406, "rewards/margins": 0.19069012999534607, "rewards/rejected": -0.21673360466957092, "step": 690 }, { "epoch": 1.68, "learning_rate": 3.7445716067596506e-07, "logits/chosen": -0.016133427619934082, "logits/rejected": 0.06616418063640594, "logps/chosen": -315.7747497558594, "logps/rejected": -344.2303771972656, "loss": 0.242, "rewards/accuracies": 0.75, "rewards/chosen": 0.012198897078633308, "rewards/margins": 0.2178380936384201, "rewards/rejected": -0.20563916862010956, "step": 700 }, { "epoch": 1.7, "learning_rate": 3.2107260669512334e-07, "logits/chosen": 0.06611919403076172, "logits/rejected": 0.08203423768281937, "logps/chosen": -342.01263427734375, "logps/rejected": -353.5125427246094, "loss": 0.2461, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.01212338637560606, "rewards/margins": 0.17198148369789124, "rewards/rejected": -0.18410487473011017, "step": 710 }, { "epoch": 1.73, "learning_rate": 2.7153109768518926e-07, "logits/chosen": 0.05342602729797363, "logits/rejected": 0.11405602842569351, "logps/chosen": -393.02593994140625, "logps/rejected": -416.9335021972656, "loss": 0.244, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.015018805861473083, "rewards/margins": 0.2232932150363922, "rewards/rejected": -0.2383120059967041, "step": 720 }, { "epoch": 1.75, "learning_rate": 2.2592001161370392e-07, "logits/chosen": 0.059743158519268036, "logits/rejected": 0.08855228126049042, "logps/chosen": -365.6115417480469, "logps/rejected": -373.24310302734375, "loss": 0.2413, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.007994825020432472, "rewards/margins": 0.19029465317726135, "rewards/rejected": -0.19828948378562927, "step": 730 }, { "epoch": 1.78, "learning_rate": 1.8431979423369607e-07, "logits/chosen": 0.01501550804823637, "logits/rejected": 0.09877587854862213, "logps/chosen": -335.7201232910156, "logps/rejected": -356.1680603027344, "loss": 0.2601, "rewards/accuracies": 0.65625, "rewards/chosen": -0.013049180619418621, "rewards/margins": 0.1567631959915161, "rewards/rejected": -0.16981235146522522, "step": 740 }, { "epoch": 1.8, "learning_rate": 1.468038171988881e-07, "logits/chosen": -0.008327131159603596, "logits/rejected": 0.04639572650194168, "logps/chosen": -354.1353759765625, "logps/rejected": -387.98297119140625, "loss": 0.2595, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.02448558434844017, "rewards/margins": 0.1901397705078125, "rewards/rejected": -0.21462532877922058, "step": 750 }, { "epoch": 1.82, "learning_rate": 1.1343824865573422e-07, "logits/chosen": 0.01856027916073799, "logits/rejected": 0.07309429347515106, "logps/chosen": -321.44903564453125, "logps/rejected": -341.5816955566406, "loss": 0.2495, "rewards/accuracies": 0.6875, "rewards/chosen": -0.022448932752013206, "rewards/margins": 0.17198805510997772, "rewards/rejected": -0.19443701207637787, "step": 760 }, { "epoch": 1.85, "learning_rate": 8.428193654051036e-08, "logits/chosen": 0.04589134082198143, "logits/rejected": 0.10319966077804565, "logps/chosen": -388.9933776855469, "logps/rejected": -376.8731994628906, "loss": 0.2475, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.008324380032718182, "rewards/margins": 0.20527882874011993, "rewards/rejected": -0.19695445895195007, "step": 770 }, { "epoch": 1.87, "learning_rate": 5.9386304787299175e-08, "logits/chosen": 0.03318192437291145, "logits/rejected": 0.1395682990550995, "logps/chosen": -377.56622314453125, "logps/rejected": -377.5900573730469, "loss": 0.2477, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.0049881902523338795, "rewards/margins": 0.2095176726579666, "rewards/rejected": -0.2145058661699295, "step": 780 }, { "epoch": 1.9, "learning_rate": 3.8795262629929e-08, "logits/chosen": 0.03711915761232376, "logits/rejected": 0.07861719280481339, "logps/chosen": -311.10015869140625, "logps/rejected": -340.22918701171875, "loss": 0.2288, "rewards/accuracies": 0.78125, "rewards/chosen": 0.007546453736722469, "rewards/margins": 0.215033620595932, "rewards/rejected": -0.20748718082904816, "step": 790 }, { "epoch": 1.92, "learning_rate": 2.2545127157831416e-08, "logits/chosen": 0.06011080741882324, "logits/rejected": 0.08075010776519775, "logps/chosen": -342.993408203125, "logps/rejected": -338.7896728515625, "loss": 0.252, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.03219890594482422, "rewards/margins": 0.15845921635627747, "rewards/rejected": -0.1906580924987793, "step": 800 }, { "epoch": 1.94, "learning_rate": 1.0664559262413831e-08, "logits/chosen": 0.06324592232704163, "logits/rejected": 0.15417756140232086, "logps/chosen": -383.63238525390625, "logps/rejected": -373.19720458984375, "loss": 0.2445, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.013102272525429726, "rewards/margins": 0.21051840484142303, "rewards/rejected": -0.2236206978559494, "step": 810 }, { "epoch": 1.97, "learning_rate": 3.1745130869123564e-09, "logits/chosen": 0.02718031406402588, "logits/rejected": 0.09324290603399277, "logps/chosen": -342.188232421875, "logps/rejected": -382.42657470703125, "loss": 0.2445, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.02895962819457054, "rewards/margins": 0.1746593415737152, "rewards/rejected": -0.20361897349357605, "step": 820 }, { "epoch": 1.99, "learning_rate": 8.819906889168117e-11, "logits/chosen": 0.07415173202753067, "logits/rejected": 0.12375295162200928, "logps/chosen": -362.17572021484375, "logps/rejected": -372.21044921875, "loss": 0.2579, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.023642729967832565, "rewards/margins": 0.1827639937400818, "rewards/rejected": -0.20640675723552704, "step": 830 }, { "epoch": 2.0, "step": 832, "total_flos": 0.0, "train_loss": 0.27172684411589915, "train_runtime": 11567.6763, "train_samples_per_second": 3.458, "train_steps_per_second": 0.072 } ], "logging_steps": 10, "max_steps": 832, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }