{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 625, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 7.936507936507937e-08, "logits/chosen": 0.09552346915006638, "logits/rejected": 0.17362232506275177, "logps/chosen": -255.44039916992188, "logps/rejected": -210.80226135253906, "loss": 0.3612, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "learning_rate": 7.936507936507937e-07, "logits/chosen": 0.11144200712442398, "logits/rejected": 0.20884405076503754, "logps/chosen": -359.81549072265625, "logps/rejected": -336.5404968261719, "loss": 0.3743, "rewards/accuracies": 0.4166666567325592, "rewards/chosen": 7.773819379508495e-05, "rewards/margins": 3.821194331976585e-05, "rewards/rejected": 3.9526246837340295e-05, "step": 10 }, { "epoch": 0.03, "learning_rate": 1.5873015873015873e-06, "logits/chosen": 0.12432358413934708, "logits/rejected": 0.2087073028087616, "logps/chosen": -346.46246337890625, "logps/rejected": -331.729248046875, "loss": 0.369, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.00011548679322004318, "rewards/margins": 0.00013375042180996388, "rewards/rejected": -1.82636285899207e-05, "step": 20 }, { "epoch": 0.05, "learning_rate": 2.380952380952381e-06, "logits/chosen": 0.1460120528936386, "logits/rejected": 0.20255199074745178, "logps/chosen": -362.12652587890625, "logps/rejected": -310.8555908203125, "loss": 0.3676, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.00011238453589612618, "rewards/margins": 0.00017934020434040576, "rewards/rejected": -6.695566116832197e-05, "step": 30 }, { "epoch": 0.06, "learning_rate": 3.1746031746031746e-06, "logits/chosen": 0.13341203331947327, "logits/rejected": 0.2324620932340622, "logps/chosen": -350.6195983886719, "logps/rejected": -316.5322265625, "loss": 0.3806, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.00011245281348237768, "rewards/margins": 0.00028168410062789917, "rewards/rejected": -0.0001692312944214791, "step": 40 }, { "epoch": 0.08, "learning_rate": 3.968253968253968e-06, "logits/chosen": 0.0949329286813736, "logits/rejected": 0.1779319941997528, "logps/chosen": -320.3857727050781, "logps/rejected": -307.16876220703125, "loss": 0.3807, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 8.781128417467698e-05, "rewards/margins": 0.0011059035314247012, "rewards/rejected": -0.001018092269077897, "step": 50 }, { "epoch": 0.1, "learning_rate": 4.761904761904762e-06, "logits/chosen": 0.09137465059757233, "logits/rejected": 0.20532509684562683, "logps/chosen": -352.49993896484375, "logps/rejected": -326.77874755859375, "loss": 0.3651, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00026215435354970396, "rewards/margins": 0.001010752865113318, "rewards/rejected": -0.0007485984242521226, "step": 60 }, { "epoch": 0.11, "learning_rate": 4.998086282661188e-06, "logits/chosen": 0.045028798282146454, "logits/rejected": 0.17268504202365875, "logps/chosen": -356.53717041015625, "logps/rejected": -328.4739074707031, "loss": 0.3691, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0009223738452419639, "rewards/margins": 0.0022405553609132767, "rewards/rejected": -0.0013181815156713128, "step": 70 }, { "epoch": 0.13, "learning_rate": 4.988720025682995e-06, "logits/chosen": 0.1980675309896469, "logits/rejected": 0.18946149945259094, "logps/chosen": -318.8916015625, "logps/rejected": -316.4183044433594, "loss": 0.3524, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.0015520222950726748, "rewards/margins": 0.006260824855417013, "rewards/rejected": -0.004708803258836269, "step": 80 }, { "epoch": 0.14, "learning_rate": 4.9715789537359126e-06, "logits/chosen": 0.1147293671965599, "logits/rejected": 0.2110525667667389, "logps/chosen": -335.58905029296875, "logps/rejected": -333.1260681152344, "loss": 0.3675, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.002089377725496888, "rewards/margins": 0.0062202513217926025, "rewards/rejected": -0.008309627883136272, "step": 90 }, { "epoch": 0.16, "learning_rate": 4.946716615897932e-06, "logits/chosen": 0.09352072328329086, "logits/rejected": 0.131501242518425, "logps/chosen": -339.08734130859375, "logps/rejected": -325.3585205078125, "loss": 0.3558, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.008228459395468235, "rewards/margins": 0.010374611243605614, "rewards/rejected": -0.018603071570396423, "step": 100 }, { "epoch": 0.18, "learning_rate": 4.9142106826480114e-06, "logits/chosen": -0.004745665937662125, "logits/rejected": 0.03175293654203415, "logps/chosen": -348.41973876953125, "logps/rejected": -358.1392517089844, "loss": 0.3559, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.028692331165075302, "rewards/margins": 0.02663610503077507, "rewards/rejected": -0.05532843619585037, "step": 110 }, { "epoch": 0.19, "learning_rate": 4.874162703221823e-06, "logits/chosen": 0.09396891295909882, "logits/rejected": 0.14528635144233704, "logps/chosen": -388.5705261230469, "logps/rejected": -387.454833984375, "loss": 0.3569, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.04682096466422081, "rewards/margins": 0.02794179879128933, "rewards/rejected": -0.0747627541422844, "step": 120 }, { "epoch": 0.21, "learning_rate": 4.826697788369752e-06, "logits/chosen": 0.06088032200932503, "logits/rejected": 0.09309231489896774, "logps/chosen": -444.643798828125, "logps/rejected": -433.63409423828125, "loss": 0.3579, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.07657051086425781, "rewards/margins": 0.03327987343072891, "rewards/rejected": -0.10985038429498672, "step": 130 }, { "epoch": 0.22, "learning_rate": 4.7719642195082224e-06, "logits/chosen": -0.0028548731934279203, "logits/rejected": 0.0651206225156784, "logps/chosen": -398.1200256347656, "logps/rejected": -429.90447998046875, "loss": 0.3439, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07342123985290527, "rewards/margins": 0.04399186372756958, "rewards/rejected": -0.11741310358047485, "step": 140 }, { "epoch": 0.24, "learning_rate": 4.710132985485355e-06, "logits/chosen": 0.0589798204600811, "logits/rejected": 0.0974341481924057, "logps/chosen": -432.10809326171875, "logps/rejected": -462.78778076171875, "loss": 0.3586, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.09363020956516266, "rewards/margins": 0.04327515512704849, "rewards/rejected": -0.13690535724163055, "step": 150 }, { "epoch": 0.26, "learning_rate": 4.641397248408122e-06, "logits/chosen": 0.06426632404327393, "logits/rejected": 0.1016223207116127, "logps/chosen": -401.81256103515625, "logps/rejected": -423.08984375, "loss": 0.3355, "rewards/accuracies": 0.625, "rewards/chosen": -0.074070505797863, "rewards/margins": 0.038354430347681046, "rewards/rejected": -0.11242493242025375, "step": 160 }, { "epoch": 0.27, "learning_rate": 4.5659717401997655e-06, "logits/chosen": 0.06136215850710869, "logits/rejected": 0.07516863942146301, "logps/chosen": -438.4203186035156, "logps/rejected": -483.72845458984375, "loss": 0.3374, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.09795868396759033, "rewards/margins": 0.06916960328817368, "rewards/rejected": -0.1671282798051834, "step": 170 }, { "epoch": 0.29, "learning_rate": 4.4840920917726425e-06, "logits/chosen": 0.07146959006786346, "logits/rejected": 0.13864199817180634, "logps/chosen": -459.9623107910156, "logps/rejected": -529.416015625, "loss": 0.321, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.1282103806734085, "rewards/margins": 0.07925314456224442, "rewards/rejected": -0.20746353268623352, "step": 180 }, { "epoch": 0.3, "learning_rate": 4.396014096912182e-06, "logits/chosen": 0.041384804993867874, "logits/rejected": 0.0659438818693161, "logps/chosen": -481.56793212890625, "logps/rejected": -541.1165771484375, "loss": 0.3344, "rewards/accuracies": 0.625, "rewards/chosen": -0.14985546469688416, "rewards/margins": 0.07496772706508636, "rewards/rejected": -0.22482319176197052, "step": 190 }, { "epoch": 0.32, "learning_rate": 4.302012913171584e-06, "logits/chosen": 0.04430466145277023, "logits/rejected": 0.11885523796081543, "logps/chosen": -511.34857177734375, "logps/rejected": -522.5247192382812, "loss": 0.3249, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.11126357316970825, "rewards/margins": 0.06654877960681915, "rewards/rejected": -0.1778123527765274, "step": 200 }, { "epoch": 0.34, "learning_rate": 4.202382202273702e-06, "logits/chosen": 0.05428556352853775, "logits/rejected": 0.15545043349266052, "logps/chosen": -450.3092346191406, "logps/rejected": -418.51678466796875, "loss": 0.3455, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.1011916771531105, "rewards/margins": 0.02224273420870304, "rewards/rejected": -0.1234344020485878, "step": 210 }, { "epoch": 0.35, "learning_rate": 4.097433212705492e-06, "logits/chosen": 0.10550371557474136, "logits/rejected": 0.1396268755197525, "logps/chosen": -428.6062927246094, "logps/rejected": -454.2294921875, "loss": 0.3386, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.08421845734119415, "rewards/margins": 0.04463706165552139, "rewards/rejected": -0.12885551154613495, "step": 220 }, { "epoch": 0.37, "learning_rate": 3.987493807371033e-06, "logits/chosen": 0.1371748000383377, "logits/rejected": 0.1429559886455536, "logps/chosen": -413.91241455078125, "logps/rejected": -443.8023376464844, "loss": 0.3146, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.08825449645519257, "rewards/margins": 0.05310072749853134, "rewards/rejected": -0.1413552314043045, "step": 230 }, { "epoch": 0.38, "learning_rate": 3.872907439340758e-06, "logits/chosen": 0.13840351998806, "logits/rejected": 0.1320694237947464, "logps/chosen": -445.927490234375, "logps/rejected": -498.40325927734375, "loss": 0.3388, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.10737401247024536, "rewards/margins": 0.05018274113535881, "rewards/rejected": -0.15755674242973328, "step": 240 }, { "epoch": 0.4, "learning_rate": 3.75403207889666e-06, "logits/chosen": 0.0929916724562645, "logits/rejected": 0.1281127631664276, "logps/chosen": -415.629150390625, "logps/rejected": -462.8294372558594, "loss": 0.3343, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.10600423812866211, "rewards/margins": 0.05615564063191414, "rewards/rejected": -0.16215987503528595, "step": 250 }, { "epoch": 0.42, "learning_rate": 3.631239095225417e-06, "logits/chosen": 0.07575414329767227, "logits/rejected": 0.18414750695228577, "logps/chosen": -428.9398498535156, "logps/rejected": -469.5541076660156, "loss": 0.322, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.11417442560195923, "rewards/margins": 0.0532112643122673, "rewards/rejected": -0.16738571226596832, "step": 260 }, { "epoch": 0.43, "learning_rate": 3.5049120962530608e-06, "logits/chosen": 0.08079143613576889, "logits/rejected": 0.14295849204063416, "logps/chosen": -512.96533203125, "logps/rejected": -558.752685546875, "loss": 0.3222, "rewards/accuracies": 0.59375, "rewards/chosen": -0.16307619214057922, "rewards/margins": 0.07023187726736069, "rewards/rejected": -0.2333080768585205, "step": 270 }, { "epoch": 0.45, "learning_rate": 3.3754457302455464e-06, "logits/chosen": 0.07979480177164078, "logits/rejected": 0.17251968383789062, "logps/chosen": -526.539794921875, "logps/rejected": -628.0927124023438, "loss": 0.3058, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.1753678023815155, "rewards/margins": 0.09301600605249405, "rewards/rejected": -0.26838380098342896, "step": 280 }, { "epoch": 0.46, "learning_rate": 3.2432444529190714e-06, "logits/chosen": 0.09937838464975357, "logits/rejected": 0.16915322840213776, "logps/chosen": -547.1159057617188, "logps/rejected": -621.634765625, "loss": 0.316, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.17411155998706818, "rewards/margins": 0.09796580672264099, "rewards/rejected": -0.27207738161087036, "step": 290 }, { "epoch": 0.48, "learning_rate": 3.1087212639117057e-06, "logits/chosen": 0.09575396776199341, "logits/rejected": 0.11415497213602066, "logps/chosen": -475.705078125, "logps/rejected": -556.6187744140625, "loss": 0.323, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.15034952759742737, "rewards/margins": 0.0816831961274147, "rewards/rejected": -0.23203274607658386, "step": 300 }, { "epoch": 0.5, "learning_rate": 2.9722964165636263e-06, "logits/chosen": 0.1228911504149437, "logits/rejected": 0.18562354147434235, "logps/chosen": -498.4169921875, "logps/rejected": -489.9468688964844, "loss": 0.34, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.1489831507205963, "rewards/margins": 0.03880687803030014, "rewards/rejected": -0.18779003620147705, "step": 310 }, { "epoch": 0.51, "learning_rate": 2.8343961050366275e-06, "logits/chosen": 0.09400780498981476, "logits/rejected": 0.1238022893667221, "logps/chosen": -470.9700622558594, "logps/rejected": -553.8331909179688, "loss": 0.3093, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.12979908287525177, "rewards/margins": 0.09255190938711166, "rewards/rejected": -0.22235099971294403, "step": 320 }, { "epoch": 0.53, "learning_rate": 2.695451132874385e-06, "logits/chosen": 0.12480980157852173, "logits/rejected": 0.17045611143112183, "logps/chosen": -504.2953186035156, "logps/rejected": -529.3466796875, "loss": 0.3287, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1506374180316925, "rewards/margins": 0.05277082324028015, "rewards/rejected": -0.20340824127197266, "step": 330 }, { "epoch": 0.54, "learning_rate": 2.5558955671628964e-06, "logits/chosen": 0.0985703319311142, "logits/rejected": 0.2024538218975067, "logps/chosen": -503.4974670410156, "logps/rejected": -588.9210815429688, "loss": 0.3256, "rewards/accuracies": 0.6875, "rewards/chosen": -0.16819895803928375, "rewards/margins": 0.09579765796661377, "rewards/rejected": -0.2639966309070587, "step": 340 }, { "epoch": 0.56, "learning_rate": 2.4161653824955654e-06, "logits/chosen": 0.11853840202093124, "logits/rejected": 0.14462777972221375, "logps/chosen": -469.1441955566406, "logps/rejected": -537.0656127929688, "loss": 0.3294, "rewards/accuracies": 0.65625, "rewards/chosen": -0.15223057568073273, "rewards/margins": 0.08282224088907242, "rewards/rejected": -0.23505279421806335, "step": 350 }, { "epoch": 0.58, "learning_rate": 2.2766970989791697e-06, "logits/chosen": 0.153191938996315, "logits/rejected": 0.22593048214912415, "logps/chosen": -507.7642517089844, "logps/rejected": -573.9854736328125, "loss": 0.2987, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.15160496532917023, "rewards/margins": 0.10073844343423843, "rewards/rejected": -0.25234344601631165, "step": 360 }, { "epoch": 0.59, "learning_rate": 2.1379264185356545e-06, "logits/chosen": 0.1527024805545807, "logits/rejected": 0.127252459526062, "logps/chosen": -481.59722900390625, "logps/rejected": -552.0794067382812, "loss": 0.3347, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.17246612906455994, "rewards/margins": 0.06559783220291138, "rewards/rejected": -0.23806393146514893, "step": 370 }, { "epoch": 0.61, "learning_rate": 2.000286863759934e-06, "logits/chosen": 0.14587077498435974, "logits/rejected": 0.14887812733650208, "logps/chosen": -488.64801025390625, "logps/rejected": -543.491455078125, "loss": 0.3183, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.16818055510520935, "rewards/margins": 0.08099476993083954, "rewards/rejected": -0.24917533993721008, "step": 380 }, { "epoch": 0.62, "learning_rate": 1.8642084235859764e-06, "logits/chosen": 0.12905414402484894, "logits/rejected": 0.17015649378299713, "logps/chosen": -498.19451904296875, "logps/rejected": -579.4054565429688, "loss": 0.366, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.18893127143383026, "rewards/margins": 0.05997669696807861, "rewards/rejected": -0.24890796840190887, "step": 390 }, { "epoch": 0.64, "learning_rate": 1.7301162099921013e-06, "logits/chosen": 0.15795116126537323, "logits/rejected": 0.23071245849132538, "logps/chosen": -514.3444213867188, "logps/rejected": -514.5433349609375, "loss": 0.3127, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.12764492630958557, "rewards/margins": 0.07431678473949432, "rewards/rejected": -0.2019617259502411, "step": 400 }, { "epoch": 0.66, "learning_rate": 1.5984291299420117e-06, "logits/chosen": 0.17297211289405823, "logits/rejected": 0.19146080315113068, "logps/chosen": -451.94317626953125, "logps/rejected": -519.0740966796875, "loss": 0.3096, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.12079250812530518, "rewards/margins": 0.07710902392864227, "rewards/rejected": -0.19790153205394745, "step": 410 }, { "epoch": 0.67, "learning_rate": 1.4695585767104092e-06, "logits/chosen": 0.1951906979084015, "logits/rejected": 0.24034972488880157, "logps/chosen": -488.6905212402344, "logps/rejected": -560.0809936523438, "loss": 0.3168, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.1418723165988922, "rewards/margins": 0.07965030521154404, "rewards/rejected": -0.22152259945869446, "step": 420 }, { "epoch": 0.69, "learning_rate": 1.3439071446815452e-06, "logits/chosen": 0.16284561157226562, "logits/rejected": 0.18018727004528046, "logps/chosen": -507.633056640625, "logps/rejected": -534.8673095703125, "loss": 0.3152, "rewards/accuracies": 0.625, "rewards/chosen": -0.1576879769563675, "rewards/margins": 0.06594133377075195, "rewards/rejected": -0.22362928092479706, "step": 430 }, { "epoch": 0.7, "learning_rate": 1.2218673716356919e-06, "logits/chosen": 0.1776401400566101, "logits/rejected": 0.2645563781261444, "logps/chosen": -505.73052978515625, "logps/rejected": -560.3125, "loss": 0.327, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.16492930054664612, "rewards/margins": 0.060159534215927124, "rewards/rejected": -0.22508880496025085, "step": 440 }, { "epoch": 0.72, "learning_rate": 1.103820512452661e-06, "logits/chosen": 0.1745072603225708, "logits/rejected": 0.22326946258544922, "logps/chosen": -491.3525390625, "logps/rejected": -580.8945922851562, "loss": 0.3292, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1665906310081482, "rewards/margins": 0.07879535853862762, "rewards/rejected": -0.24538597464561462, "step": 450 }, { "epoch": 0.74, "learning_rate": 9.901353480633468e-07, "logits/chosen": 0.2245282232761383, "logits/rejected": 0.21928434073925018, "logps/chosen": -502.446044921875, "logps/rejected": -564.1083374023438, "loss": 0.3117, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.15784719586372375, "rewards/margins": 0.06698840856552124, "rewards/rejected": -0.224835604429245, "step": 460 }, { "epoch": 0.75, "learning_rate": 8.811670333701544e-07, "logits/chosen": 0.17931757867336273, "logits/rejected": 0.19498419761657715, "logps/chosen": -488.80633544921875, "logps/rejected": -543.345458984375, "loss": 0.306, "rewards/accuracies": 0.625, "rewards/chosen": -0.15909579396247864, "rewards/margins": 0.06999148428440094, "rewards/rejected": -0.22908727824687958, "step": 470 }, { "epoch": 0.77, "learning_rate": 7.772559877354341e-07, "logits/chosen": 0.18104666471481323, "logits/rejected": 0.2650128901004791, "logps/chosen": -486.8435974121094, "logps/rejected": -529.2096557617188, "loss": 0.3362, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.15709495544433594, "rewards/margins": 0.06306286156177521, "rewards/rejected": -0.22015781700611115, "step": 480 }, { "epoch": 0.78, "learning_rate": 6.787268315040604e-07, "logits/chosen": 0.17338337004184723, "logits/rejected": 0.21364276111125946, "logps/chosen": -494.8131408691406, "logps/rejected": -559.9661865234375, "loss": 0.3009, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.16898290812969208, "rewards/margins": 0.07606662809848785, "rewards/rejected": -0.24504955112934113, "step": 490 }, { "epoch": 0.8, "learning_rate": 5.858873718824829e-07, "logits/chosen": 0.18185105919837952, "logits/rejected": 0.24735283851623535, "logps/chosen": -555.6572875976562, "logps/rejected": -560.2110595703125, "loss": 0.3324, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.17251154780387878, "rewards/margins": 0.05710861086845398, "rewards/rejected": -0.22962014377117157, "step": 500 }, { "epoch": 0.82, "learning_rate": 4.990276413423817e-07, "logits/chosen": 0.17972733080387115, "logits/rejected": 0.22244539856910706, "logps/chosen": -524.2530517578125, "logps/rejected": -599.024658203125, "loss": 0.3117, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.161757230758667, "rewards/margins": 0.09672929346561432, "rewards/rejected": -0.2584865093231201, "step": 510 }, { "epoch": 0.83, "learning_rate": 4.184189915529796e-07, "logits/chosen": 0.10895649343729019, "logits/rejected": 0.20955803990364075, "logps/chosen": -491.8091735839844, "logps/rejected": -556.9285278320312, "loss": 0.3011, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.15785539150238037, "rewards/margins": 0.08289924263954163, "rewards/rejected": -0.240754634141922, "step": 520 }, { "epoch": 0.85, "learning_rate": 3.4431324567258176e-07, "logits/chosen": 0.19427216053009033, "logits/rejected": 0.2759999632835388, "logps/chosen": -519.3431396484375, "logps/rejected": -586.2860107421875, "loss": 0.2963, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.15744614601135254, "rewards/margins": 0.1040191501379013, "rewards/rejected": -0.26146528124809265, "step": 530 }, { "epoch": 0.86, "learning_rate": 2.769419116476052e-07, "logits/chosen": 0.1790754646062851, "logits/rejected": 0.24030852317810059, "logps/chosen": -473.218017578125, "logps/rejected": -557.5300903320312, "loss": 0.3077, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.15667091310024261, "rewards/margins": 0.09858374297618866, "rewards/rejected": -0.2552546262741089, "step": 540 }, { "epoch": 0.88, "learning_rate": 2.1651545897676512e-07, "logits/chosen": 0.2089765965938568, "logits/rejected": 0.25732511281967163, "logps/chosen": -567.176025390625, "logps/rejected": -613.021484375, "loss": 0.3129, "rewards/accuracies": 0.71875, "rewards/chosen": -0.17653754353523254, "rewards/margins": 0.09360690414905548, "rewards/rejected": -0.2701444625854492, "step": 550 }, { "epoch": 0.9, "learning_rate": 1.6322266119983222e-07, "logits/chosen": 0.16576240956783295, "logits/rejected": 0.24667489528656006, "logps/chosen": -502.44158935546875, "logps/rejected": -564.0174560546875, "loss": 0.3299, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.1819026917219162, "rewards/margins": 0.09441694617271423, "rewards/rejected": -0.2763196527957916, "step": 560 }, { "epoch": 0.91, "learning_rate": 1.1723000616502167e-07, "logits/chosen": 0.16597183048725128, "logits/rejected": 0.13934046030044556, "logps/chosen": -527.2802734375, "logps/rejected": -672.5037841796875, "loss": 0.2996, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.18256615102291107, "rewards/margins": 0.12837597727775574, "rewards/rejected": -0.310942143201828, "step": 570 }, { "epoch": 0.93, "learning_rate": 7.868117591737585e-08, "logits/chosen": 0.20055902004241943, "logits/rejected": 0.20812377333641052, "logps/chosen": -483.41082763671875, "logps/rejected": -559.0455932617188, "loss": 0.3085, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.16648918390274048, "rewards/margins": 0.0913398414850235, "rewards/rejected": -0.25782904028892517, "step": 580 }, { "epoch": 0.94, "learning_rate": 4.769659783295383e-08, "logits/chosen": 0.17583322525024414, "logits/rejected": 0.1814696043729782, "logps/chosen": -517.0927734375, "logps/rejected": -562.7242431640625, "loss": 0.3134, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.1781904697418213, "rewards/margins": 0.08398625254631042, "rewards/rejected": -0.2621766924858093, "step": 590 }, { "epoch": 0.96, "learning_rate": 2.4373068401120358e-08, "logits/chosen": 0.16853031516075134, "logits/rejected": 0.19935330748558044, "logps/chosen": -528.5987548828125, "logps/rejected": -601.7249755859375, "loss": 0.323, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.17777393758296967, "rewards/margins": 0.08920314162969589, "rewards/rejected": -0.26697710156440735, "step": 600 }, { "epoch": 0.98, "learning_rate": 8.78345083022425e-09, "logits/chosen": 0.16420051455497742, "logits/rejected": 0.18029369413852692, "logps/chosen": -543.4736938476562, "logps/rejected": -574.1061401367188, "loss": 0.3197, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.1843399703502655, "rewards/margins": 0.060389935970306396, "rewards/rejected": -0.2447299212217331, "step": 610 }, { "epoch": 0.99, "learning_rate": 9.764474213677654e-10, "logits/chosen": 0.19845640659332275, "logits/rejected": 0.22712858021259308, "logps/chosen": -513.5285034179688, "logps/rejected": -559.9097290039062, "loss": 0.2998, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.16102448105812073, "rewards/margins": 0.07719925045967102, "rewards/rejected": -0.23822371661663055, "step": 620 }, { "epoch": 1.0, "step": 625, "total_flos": 0.0, "train_loss": 0.3311275300979614, "train_runtime": 7525.4167, "train_samples_per_second": 2.658, "train_steps_per_second": 0.083 } ], "logging_steps": 10, "max_steps": 625, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }