{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 329, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "dpo_losses": 0.6931471824645996, "epoch": 0.0, "grad_norm": 2.0956511335509704, "learning_rate": 1.5151515151515152e-08, "logits/chosen": -2.6820077896118164, "logits/rejected": -2.6930205821990967, "logps/chosen": -281.2528381347656, "logps/rejected": -258.0622253417969, "loss": 0.6931, "positive_losses": 0.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "dpo_losses": 0.6928904056549072, "epoch": 0.03, "grad_norm": 25.14467901589718, "learning_rate": 1.5151515151515152e-07, "logits/chosen": -2.769256353378296, "logits/rejected": -2.7548859119415283, "logps/chosen": -284.5684814453125, "logps/rejected": -249.87168884277344, "loss": 0.6979, "positive_losses": 0.05147833377122879, "rewards/accuracies": 0.4583333432674408, "rewards/chosen": 0.0003567033272702247, "rewards/margins": 0.000515056774020195, "rewards/margins_max": 0.0027906966861337423, "rewards/margins_min": -0.0017034021439030766, "rewards/margins_std": 0.0019198498921468854, "rewards/rejected": -0.00015835335943847895, "step": 10 }, { "dpo_losses": 0.6927427053451538, "epoch": 0.06, "grad_norm": 15.081930583547756, "learning_rate": 3.0303030303030305e-07, "logits/chosen": -2.8346035480499268, "logits/rejected": -2.7817800045013428, "logps/chosen": -291.3860168457031, "logps/rejected": -270.3315124511719, "loss": 0.6968, "positive_losses": 0.02039351500570774, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.001457269536331296, "rewards/margins": 0.0008110570488497615, "rewards/margins_max": 0.0036527395714074373, "rewards/margins_min": -0.0018750065937638283, "rewards/margins_std": 0.0025213216431438923, "rewards/rejected": 0.0006462126038968563, "step": 20 }, { "dpo_losses": 0.6933337450027466, "epoch": 0.09, "grad_norm": 9.990963095062185, "learning_rate": 4.545454545454545e-07, "logits/chosen": -2.8629353046417236, "logits/rejected": -2.8151936531066895, "logps/chosen": -258.76837158203125, "logps/rejected": -226.8423614501953, "loss": 0.6951, "positive_losses": 0.014158916659653187, "rewards/accuracies": 0.4375, "rewards/chosen": 0.004769365303218365, "rewards/margins": -0.00037118454929441214, "rewards/margins_max": 0.002330085728317499, "rewards/margins_min": -0.0032659140415489674, "rewards/margins_std": 0.0024877875111997128, "rewards/rejected": 0.005140549503266811, "step": 30 }, { "dpo_losses": 0.6926981806755066, "epoch": 0.12, "grad_norm": 1.604193261987099, "learning_rate": 4.993103596812268e-07, "logits/chosen": -2.829625129699707, "logits/rejected": -2.764366626739502, "logps/chosen": -316.5284729003906, "logps/rejected": -223.76123046875, "loss": 0.693, "positive_losses": 0.007878494448959827, "rewards/accuracies": 0.625, "rewards/chosen": 0.010137422941625118, "rewards/margins": 0.0009021027944982052, "rewards/margins_max": 0.005337115842849016, "rewards/margins_min": -0.0031714539509266615, "rewards/margins_std": 0.0037328810431063175, "rewards/rejected": 0.0092353206127882, "step": 40 }, { "dpo_losses": 0.6929700374603271, "epoch": 0.15, "grad_norm": 7.523440542250164, "learning_rate": 4.959416858332709e-07, "logits/chosen": -2.790869951248169, "logits/rejected": -2.8046393394470215, "logps/chosen": -241.8709259033203, "logps/rejected": -278.8560791015625, "loss": 0.6937, "positive_losses": 0.0050521851517260075, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.010639366693794727, "rewards/margins": 0.0003602326032705605, "rewards/margins_max": 0.005497736390680075, "rewards/margins_min": -0.004975494462996721, "rewards/margins_std": 0.0046127657406032085, "rewards/rejected": 0.010279135778546333, "step": 50 }, { "dpo_losses": 0.6924153566360474, "epoch": 0.18, "grad_norm": 8.317758374112413, "learning_rate": 4.898051734555674e-07, "logits/chosen": -2.834188938140869, "logits/rejected": -2.8444936275482178, "logps/chosen": -320.539794921875, "logps/rejected": -281.9419860839844, "loss": 0.6928, "positive_losses": 0.0005203246837481856, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.013813069090247154, "rewards/margins": 0.0014724148204550147, "rewards/margins_max": 0.008079244755208492, "rewards/margins_min": -0.003974889405071735, "rewards/margins_std": 0.005347874015569687, "rewards/rejected": 0.012340654619038105, "step": 60 }, { "dpo_losses": 0.693148672580719, "epoch": 0.21, "grad_norm": 7.343753116688168, "learning_rate": 4.809698831278217e-07, "logits/chosen": -2.7492079734802246, "logits/rejected": -2.735715389251709, "logps/chosen": -265.208251953125, "logps/rejected": -245.1059112548828, "loss": 0.6937, "positive_losses": 0.01350479107350111, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.012691942974925041, "rewards/margins": 6.041547749191523e-06, "rewards/margins_max": 0.0061768521554768085, "rewards/margins_min": -0.0060454607009887695, "rewards/margins_std": 0.00548047199845314, "rewards/rejected": 0.012685902416706085, "step": 70 }, { "dpo_losses": 0.6920666694641113, "epoch": 0.24, "grad_norm": 2.0944283368710837, "learning_rate": 4.6953524759527053e-07, "logits/chosen": -2.8422322273254395, "logits/rejected": -2.815117597579956, "logps/chosen": -280.7608947753906, "logps/rejected": -273.6212158203125, "loss": 0.6925, "positive_losses": 0.00601539621129632, "rewards/accuracies": 0.625, "rewards/chosen": 0.01512569934129715, "rewards/margins": 0.0021733222529292107, "rewards/margins_max": 0.008934634737670422, "rewards/margins_min": -0.004246932454407215, "rewards/margins_std": 0.006057220045477152, "rewards/rejected": 0.012952374294400215, "step": 80 }, { "dpo_losses": 0.6931439638137817, "epoch": 0.27, "grad_norm": 7.341115665723385, "learning_rate": 4.5562995274820283e-07, "logits/chosen": -2.799257278442383, "logits/rejected": -2.7461183071136475, "logps/chosen": -294.0331726074219, "logps/rejected": -290.0491638183594, "loss": 0.6927, "positive_losses": 0.0, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0151880057528615, "rewards/margins": 1.8423608707962558e-05, "rewards/margins_max": 0.006872598081827164, "rewards/margins_min": -0.007915332913398743, "rewards/margins_std": 0.006555147469043732, "rewards/rejected": 0.015169581398367882, "step": 90 }, { "dpo_losses": 0.6915194988250732, "epoch": 0.3, "grad_norm": 1.7021064862560185, "learning_rate": 4.394104893853007e-07, "logits/chosen": -2.8969879150390625, "logits/rejected": -2.8577048778533936, "logps/chosen": -271.9718017578125, "logps/rejected": -255.94015502929688, "loss": 0.6925, "positive_losses": 0.0063987732864916325, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.01490433793514967, "rewards/margins": 0.0032706786878407, "rewards/margins_max": 0.01071108691394329, "rewards/margins_min": -0.004061442334204912, "rewards/margins_std": 0.006633488927036524, "rewards/rejected": 0.011633659712970257, "step": 100 }, { "epoch": 0.3, "eval_dpo_losses": 0.6918572783470154, "eval_logits/chosen": -2.8058130741119385, "eval_logits/rejected": -2.7668540477752686, "eval_logps/chosen": -282.8757629394531, "eval_logps/rejected": -257.1208801269531, "eval_loss": 0.6926050782203674, "eval_positive_losses": 0.002994521986693144, "eval_rewards/accuracies": 0.6330000162124634, "eval_rewards/chosen": 0.017176620662212372, "eval_rewards/margins": 0.0025962977670133114, "eval_rewards/margins_max": 0.014017846435308456, "eval_rewards/margins_min": -0.007906544953584671, "eval_rewards/margins_std": 0.00730311032384634, "eval_rewards/rejected": 0.014580323360860348, "eval_runtime": 429.0754, "eval_samples_per_second": 4.661, "eval_steps_per_second": 0.291, "step": 100 }, { "dpo_losses": 0.6914526224136353, "epoch": 0.33, "grad_norm": 2.1403573418786492, "learning_rate": 4.2105939205932005e-07, "logits/chosen": -2.764038324356079, "logits/rejected": -2.7474615573883057, "logps/chosen": -310.06982421875, "logps/rejected": -234.0756072998047, "loss": 0.6917, "positive_losses": 0.0032207488548010588, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.016864223405718803, "rewards/margins": 0.0034028415102511644, "rewards/margins_max": 0.010497726500034332, "rewards/margins_min": -0.0030488395132124424, "rewards/margins_std": 0.006098336540162563, "rewards/rejected": 0.013461383990943432, "step": 110 }, { "dpo_losses": 0.691475510597229, "epoch": 0.36, "grad_norm": 2.00482561166422, "learning_rate": 4.0078318482522114e-07, "logits/chosen": -2.752159833908081, "logits/rejected": -2.750898838043213, "logps/chosen": -321.77813720703125, "logps/rejected": -272.95037841796875, "loss": 0.6922, "positive_losses": 0.011810302734375, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.017863672226667404, "rewards/margins": 0.003372038947418332, "rewards/margins_max": 0.014931520447134972, "rewards/margins_min": -0.006266787648200989, "rewards/margins_std": 0.009545858018100262, "rewards/rejected": 0.01449163444340229, "step": 120 }, { "dpo_losses": 0.692450225353241, "epoch": 0.4, "grad_norm": 1.6365937615424984, "learning_rate": 3.7881005700938627e-07, "logits/chosen": -2.8213768005371094, "logits/rejected": -2.831343650817871, "logps/chosen": -264.4930114746094, "logps/rejected": -232.359130859375, "loss": 0.6926, "positive_losses": 0.008326339535415173, "rewards/accuracies": 0.5, "rewards/chosen": 0.01900574564933777, "rewards/margins": 0.001420500222593546, "rewards/margins_max": 0.011069941334426403, "rewards/margins_min": -0.009186895564198494, "rewards/margins_std": 0.008960539475083351, "rewards/rejected": 0.01758524402976036, "step": 130 }, { "dpo_losses": 0.690600574016571, "epoch": 0.43, "grad_norm": 1.9064212159356546, "learning_rate": 3.5538729515692354e-07, "logits/chosen": -2.7800724506378174, "logits/rejected": -2.763850450515747, "logps/chosen": -292.2286682128906, "logps/rejected": -268.72857666015625, "loss": 0.6912, "positive_losses": 0.013485336676239967, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.02164594829082489, "rewards/margins": 0.005139264743775129, "rewards/margins_max": 0.017013436183333397, "rewards/margins_min": -0.0060675484128296375, "rewards/margins_std": 0.010294691659510136, "rewards/rejected": 0.016506681218743324, "step": 140 }, { "dpo_losses": 0.6907236576080322, "epoch": 0.46, "grad_norm": 1.4378221399671829, "learning_rate": 3.3077850005803125e-07, "logits/chosen": -2.842320203781128, "logits/rejected": -2.8206865787506104, "logps/chosen": -268.4656066894531, "logps/rejected": -243.43276977539062, "loss": 0.6925, "positive_losses": 0.0013946533435955644, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.021758494898676872, "rewards/margins": 0.004882398992776871, "rewards/margins_max": 0.017139647156000137, "rewards/margins_min": -0.0063543543219566345, "rewards/margins_std": 0.010589024983346462, "rewards/rejected": 0.01687609776854515, "step": 150 }, { "dpo_losses": 0.6922906041145325, "epoch": 0.49, "grad_norm": 2.1178244445563013, "learning_rate": 3.0526062017313247e-07, "logits/chosen": -2.7998666763305664, "logits/rejected": -2.7823493480682373, "logps/chosen": -253.3588104248047, "logps/rejected": -238.626708984375, "loss": 0.6933, "positive_losses": 0.0011047363514080644, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.021453257650136948, "rewards/margins": 0.001750586787238717, "rewards/margins_max": 0.014365032315254211, "rewards/margins_min": -0.009759698994457722, "rewards/margins_std": 0.010657011531293392, "rewards/rejected": 0.019702669233083725, "step": 160 }, { "dpo_losses": 0.6903497576713562, "epoch": 0.52, "grad_norm": 1.5792742029110545, "learning_rate": 2.791208348427426e-07, "logits/chosen": -2.817678451538086, "logits/rejected": -2.7354776859283447, "logps/chosen": -301.37139892578125, "logps/rejected": -271.1383972167969, "loss": 0.6907, "positive_losses": 0.0042434693314135075, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.02347148209810257, "rewards/margins": 0.0056489938870072365, "rewards/margins_max": 0.017104174941778183, "rewards/margins_min": -0.0068167769350111485, "rewards/margins_std": 0.010564757511019707, "rewards/rejected": 0.01782248727977276, "step": 170 }, { "dpo_losses": 0.6911171674728394, "epoch": 0.55, "grad_norm": 11.74545654760677, "learning_rate": 2.526533223585641e-07, "logits/chosen": -2.8419086933135986, "logits/rejected": -2.7769436836242676, "logps/chosen": -253.8654022216797, "logps/rejected": -227.02719116210938, "loss": 0.6917, "positive_losses": 0.0033275603782385588, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.02268325164914131, "rewards/margins": 0.0040952288545668125, "rewards/margins_max": 0.016258299350738525, "rewards/margins_min": -0.004967542830854654, "rewards/margins_std": 0.00928699504584074, "rewards/rejected": 0.018588021397590637, "step": 180 }, { "dpo_losses": 0.6901928186416626, "epoch": 0.58, "grad_norm": 9.4811716718965, "learning_rate": 2.261559492680755e-07, "logits/chosen": -2.784456729888916, "logits/rejected": -2.766946792602539, "logps/chosen": -298.0020446777344, "logps/rejected": -268.62164306640625, "loss": 0.6923, "positive_losses": 0.0034267425071448088, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.025788750499486923, "rewards/margins": 0.005970729514956474, "rewards/margins_max": 0.02267562597990036, "rewards/margins_min": -0.008256749249994755, "rewards/margins_std": 0.013814790174365044, "rewards/rejected": 0.01981802098453045, "step": 190 }, { "dpo_losses": 0.6899710893630981, "epoch": 0.61, "grad_norm": 2.1557416454862444, "learning_rate": 1.9992691817133024e-07, "logits/chosen": -2.787891387939453, "logits/rejected": -2.756988048553467, "logps/chosen": -279.0212097167969, "logps/rejected": -286.3407287597656, "loss": 0.6928, "positive_losses": 0.02213897742331028, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.025793900713324547, "rewards/margins": 0.006411595735698938, "rewards/margins_max": 0.022542249411344528, "rewards/margins_min": -0.00852292776107788, "rewards/margins_std": 0.013604698702692986, "rewards/rejected": 0.019382307305932045, "step": 200 }, { "epoch": 0.61, "eval_dpo_losses": 0.6903740763664246, "eval_logits/chosen": -2.8064374923706055, "eval_logits/rejected": -2.7678143978118896, "eval_logps/chosen": -282.0021667480469, "eval_logps/rejected": -256.5478515625, "eval_loss": 0.6918678879737854, "eval_positive_losses": 0.008644149638712406, "eval_rewards/accuracies": 0.6470000147819519, "eval_rewards/chosen": 0.025912432000041008, "eval_rewards/margins": 0.005601577460765839, "eval_rewards/margins_max": 0.02654200792312622, "eval_rewards/margins_min": -0.01304223295301199, "eval_rewards/margins_std": 0.013095145113766193, "eval_rewards/rejected": 0.02031085453927517, "eval_runtime": 428.4364, "eval_samples_per_second": 4.668, "eval_steps_per_second": 0.292, "step": 200 }, { "dpo_losses": 0.6895982623100281, "epoch": 0.64, "grad_norm": 9.2776467878139, "learning_rate": 1.742614117358029e-07, "logits/chosen": -2.8029887676239014, "logits/rejected": -2.7589237689971924, "logps/chosen": -302.7414855957031, "logps/rejected": -286.43438720703125, "loss": 0.6912, "positive_losses": 0.013084793463349342, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.02575690671801567, "rewards/margins": 0.007164796348661184, "rewards/margins_max": 0.023669250309467316, "rewards/margins_min": -0.007106063421815634, "rewards/margins_std": 0.014227842912077904, "rewards/rejected": 0.0185921099036932, "step": 210 }, { "dpo_losses": 0.6903053522109985, "epoch": 0.67, "grad_norm": 1.955123489265158, "learning_rate": 1.4944827069769122e-07, "logits/chosen": -2.8536009788513184, "logits/rejected": -2.8277313709259033, "logps/chosen": -310.047607421875, "logps/rejected": -264.05352783203125, "loss": 0.6902, "positive_losses": 0.002166366670280695, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.02925548516213894, "rewards/margins": 0.005735241807997227, "rewards/margins_max": 0.02009030058979988, "rewards/margins_min": -0.007059932686388493, "rewards/margins_std": 0.01263638585805893, "rewards/rejected": 0.023520244285464287, "step": 220 }, { "dpo_losses": 0.6920837163925171, "epoch": 0.7, "grad_norm": 8.688484410559708, "learning_rate": 1.2576674323558928e-07, "logits/chosen": -2.822300910949707, "logits/rejected": -2.8432376384735107, "logps/chosen": -286.09466552734375, "logps/rejected": -260.4286804199219, "loss": 0.6922, "positive_losses": 0.0013641357654705644, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.025508206337690353, "rewards/margins": 0.002203943906351924, "rewards/margins_max": 0.020830942317843437, "rewards/margins_min": -0.015516755171120167, "rewards/margins_std": 0.01637136936187744, "rewards/rejected": 0.02330426312983036, "step": 230 }, { "dpo_losses": 0.690080463886261, "epoch": 0.73, "grad_norm": 2.0416140906382805, "learning_rate": 1.0348334229922676e-07, "logits/chosen": -2.8787901401519775, "logits/rejected": -2.831031322479248, "logps/chosen": -288.3709411621094, "logps/rejected": -273.1520080566406, "loss": 0.6918, "positive_losses": 0.01643219031393528, "rewards/accuracies": 0.6875, "rewards/chosen": 0.02646791562438011, "rewards/margins": 0.00618840241804719, "rewards/margins_max": 0.022397857159376144, "rewards/margins_min": -0.006807761732488871, "rewards/margins_std": 0.013137358240783215, "rewards/rejected": 0.02027951553463936, "step": 240 }, { "dpo_losses": 0.6899579763412476, "epoch": 0.76, "grad_norm": 1.9842983441302535, "learning_rate": 8.284884626103164e-08, "logits/chosen": -2.8189456462860107, "logits/rejected": -2.787537097930908, "logps/chosen": -298.2633056640625, "logps/rejected": -302.3707580566406, "loss": 0.6937, "positive_losses": 0.0013935088645666838, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.02826567366719246, "rewards/margins": 0.006455945782363415, "rewards/margins_max": 0.022190529853105545, "rewards/margins_min": -0.010183097794651985, "rewards/margins_std": 0.014948679134249687, "rewards/rejected": 0.02180972695350647, "step": 250 }, { "dpo_losses": 0.6888389587402344, "epoch": 0.79, "grad_norm": 2.1130940747529685, "learning_rate": 6.409547664531733e-08, "logits/chosen": -2.8461780548095703, "logits/rejected": -2.8126139640808105, "logps/chosen": -330.7644958496094, "logps/rejected": -310.1560974121094, "loss": 0.6907, "positive_losses": 0.0, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.03217402845621109, "rewards/margins": 0.008677511475980282, "rewards/margins_max": 0.021433433517813683, "rewards/margins_min": -0.0022177654318511486, "rewards/margins_std": 0.010701724328100681, "rewards/rejected": 0.023496516048908234, "step": 260 }, { "dpo_losses": 0.6895651817321777, "epoch": 0.82, "grad_norm": 1.9914333935784974, "learning_rate": 4.743428469705335e-08, "logits/chosen": -2.7977194786071777, "logits/rejected": -2.792222499847412, "logps/chosen": -301.0746154785156, "logps/rejected": -305.9708557128906, "loss": 0.6919, "positive_losses": 0.020720671862363815, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.027332518249750137, "rewards/margins": 0.007225664798170328, "rewards/margins_max": 0.021714068949222565, "rewards/margins_min": -0.0065118856728076935, "rewards/margins_std": 0.012981800362467766, "rewards/rejected": 0.020106855779886246, "step": 270 }, { "dpo_losses": 0.6888093948364258, "epoch": 0.85, "grad_norm": 2.1081832344794846, "learning_rate": 3.305277620188826e-08, "logits/chosen": -2.8479342460632324, "logits/rejected": -2.829345226287842, "logps/chosen": -322.39654541015625, "logps/rejected": -267.4809875488281, "loss": 0.6907, "positive_losses": 0.00835342425853014, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.03169160336256027, "rewards/margins": 0.008761906065046787, "rewards/margins_max": 0.025315579026937485, "rewards/margins_min": -0.00724324444308877, "rewards/margins_std": 0.014560697600245476, "rewards/rejected": 0.02292969450354576, "step": 280 }, { "dpo_losses": 0.690091609954834, "epoch": 0.88, "grad_norm": 1.7109015474069318, "learning_rate": 2.1112801287806375e-08, "logits/chosen": -2.7887511253356934, "logits/rejected": -2.753249168395996, "logps/chosen": -271.52618408203125, "logps/rejected": -243.4734344482422, "loss": 0.691, "positive_losses": 0.0007474899175576866, "rewards/accuracies": 0.6875, "rewards/chosen": 0.02663729526102543, "rewards/margins": 0.0061707692220807076, "rewards/margins_max": 0.020485369488596916, "rewards/margins_min": -0.006647027097642422, "rewards/margins_std": 0.012255651876330376, "rewards/rejected": 0.020466525107622147, "step": 290 }, { "dpo_losses": 0.6898671984672546, "epoch": 0.91, "grad_norm": 6.914969485002492, "learning_rate": 1.1748732956682023e-08, "logits/chosen": -2.8814024925231934, "logits/rejected": -2.8128414154052734, "logps/chosen": -321.03570556640625, "logps/rejected": -283.5750732421875, "loss": 0.6916, "positive_losses": 0.020281601697206497, "rewards/accuracies": 0.625, "rewards/chosen": 0.02677486464381218, "rewards/margins": 0.006634272634983063, "rewards/margins_max": 0.02378956601023674, "rewards/margins_min": -0.009901603683829308, "rewards/margins_std": 0.015127221122384071, "rewards/rejected": 0.020140592008829117, "step": 300 }, { "epoch": 0.91, "eval_dpo_losses": 0.6899807453155518, "eval_logits/chosen": -2.8024394512176514, "eval_logits/rejected": -2.763456344604492, "eval_logps/chosen": -281.7664489746094, "eval_logps/rejected": -256.39239501953125, "eval_loss": 0.6914138197898865, "eval_positive_losses": 0.006967452820390463, "eval_rewards/accuracies": 0.6629999876022339, "eval_rewards/chosen": 0.028269650414586067, "eval_rewards/margins": 0.0064042662270367146, "eval_rewards/margins_max": 0.030468182638287544, "eval_rewards/margins_min": -0.014824692159891129, "eval_rewards/margins_std": 0.014939413405954838, "eval_rewards/rejected": 0.02186538279056549, "eval_runtime": 428.1048, "eval_samples_per_second": 4.672, "eval_steps_per_second": 0.292, "step": 300 }, { "dpo_losses": 0.690555214881897, "epoch": 0.94, "grad_norm": 1.5197312877950733, "learning_rate": 5.065954844616721e-09, "logits/chosen": -2.827882766723633, "logits/rejected": -2.781325340270996, "logps/chosen": -274.24688720703125, "logps/rejected": -278.71307373046875, "loss": 0.6911, "positive_losses": 0.0, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.028748828917741776, "rewards/margins": 0.005246045999228954, "rewards/margins_max": 0.022478515282273293, "rewards/margins_min": -0.010068441741168499, "rewards/margins_std": 0.014408007264137268, "rewards/rejected": 0.023502781987190247, "step": 310 }, { "dpo_losses": 0.6897249817848206, "epoch": 0.97, "grad_norm": 1.9024323554426086, "learning_rate": 1.1396752298723499e-09, "logits/chosen": -2.869424819946289, "logits/rejected": -2.8175909519195557, "logps/chosen": -246.2608642578125, "logps/rejected": -255.5459442138672, "loss": 0.6898, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.026843314990401268, "rewards/margins": 0.006898718420416117, "rewards/margins_max": 0.020682070404291153, "rewards/margins_min": -0.0074739838019013405, "rewards/margins_std": 0.012545767240226269, "rewards/rejected": 0.01994459703564644, "step": 320 }, { "epoch": 1.0, "step": 329, "total_flos": 0.0, "train_loss": 0.6924228537771114, "train_runtime": 3896.0637, "train_samples_per_second": 1.351, "train_steps_per_second": 0.084 } ], "logging_steps": 10, "max_steps": 329, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }