{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.7316017316017316, "eval_steps": 500, "global_step": 700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012368583797155226, "grad_norm": 2.8862552642822266, "learning_rate": 4.9998656243398664e-05, "logits/chosen": 0.07544252276420593, "logits/rejected": 0.045758649706840515, "logps/chosen": -1.6279420852661133, "logps/rejected": -1.1392183303833008, "loss": 1.7652, "odds_ratio_loss": 1.3730543851852417, "rewards/accuracies": 0.26249998807907104, "rewards/chosen": -0.16279420256614685, "rewards/margins": -0.048872362822294235, "rewards/rejected": -0.11392184346914291, "sft_loss": 1.6279420852661133, "step": 5 }, { "epoch": 0.024737167594310452, "grad_norm": 2.154451847076416, "learning_rate": 4.99931974797807e-05, "logits/chosen": -0.2569425106048584, "logits/rejected": -0.26398971676826477, "logps/chosen": -1.2734109163284302, "logps/rejected": -0.7473217248916626, "loss": 1.4286, "odds_ratio_loss": 1.5515320301055908, "rewards/accuracies": 0.1875, "rewards/chosen": -0.12734109163284302, "rewards/margins": -0.0526089146733284, "rewards/rejected": -0.07473217695951462, "sft_loss": 1.2734109163284302, "step": 10 }, { "epoch": 0.03710575139146568, "grad_norm": 2.028292179107666, "learning_rate": 4.998354064055474e-05, "logits/chosen": -0.40017586946487427, "logits/rejected": -0.4366907477378845, "logps/chosen": -1.1801488399505615, "logps/rejected": -0.9349920153617859, "loss": 1.2777, "odds_ratio_loss": 0.9751937985420227, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.1180148720741272, "rewards/margins": -0.024515679106116295, "rewards/rejected": -0.09349919855594635, "sft_loss": 1.1801488399505615, "step": 15 }, { "epoch": 0.049474335188620905, "grad_norm": 1.9954569339752197, "learning_rate": 4.99696873477678e-05, "logits/chosen": -0.40699416399002075, "logits/rejected": -0.44454145431518555, "logps/chosen": -1.1877975463867188, "logps/rejected": -1.0904505252838135, "loss": 1.2665, "odds_ratio_loss": 0.7868929505348206, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": -0.1187797412276268, "rewards/margins": -0.009734684601426125, "rewards/rejected": -0.10904505103826523, "sft_loss": 1.1877975463867188, "step": 20 }, { "epoch": 0.06184291898577613, "grad_norm": 1.9759254455566406, "learning_rate": 4.995163992833986e-05, "logits/chosen": -0.23489859700202942, "logits/rejected": -0.25920313596725464, "logps/chosen": -1.1750543117523193, "logps/rejected": -1.2002836465835571, "loss": 1.2448, "odds_ratio_loss": 0.697222888469696, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.11750541627407074, "rewards/margins": 0.0025229300372302532, "rewards/rejected": -0.12002835422754288, "sft_loss": 1.1750543117523193, "step": 25 }, { "epoch": 0.07421150278293136, "grad_norm": 1.87021005153656, "learning_rate": 4.992940141367302e-05, "logits/chosen": -0.08740436285734177, "logits/rejected": -0.12482025474309921, "logps/chosen": -1.1921831369400024, "logps/rejected": -1.2114026546478271, "loss": 1.2626, "odds_ratio_loss": 0.704161524772644, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.11921832710504532, "rewards/margins": 0.0019219372188672423, "rewards/rejected": -0.1211402639746666, "sft_loss": 1.1921831369400024, "step": 30 }, { "epoch": 0.08658008658008658, "grad_norm": 1.764119267463684, "learning_rate": 4.9902975539142324e-05, "logits/chosen": -0.0032516077626496553, "logits/rejected": -0.03004361316561699, "logps/chosen": -1.191348671913147, "logps/rejected": -1.2961044311523438, "loss": 1.2578, "odds_ratio_loss": 0.6647518873214722, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11913485825061798, "rewards/margins": 0.010475579649209976, "rewards/rejected": -0.12961044907569885, "sft_loss": 1.191348671913147, "step": 35 }, { "epoch": 0.09894867037724181, "grad_norm": 1.8680297136306763, "learning_rate": 4.98723667434683e-05, "logits/chosen": 0.009043162688612938, "logits/rejected": -0.013671426102519035, "logps/chosen": -1.1334426403045654, "logps/rejected": -1.4070128202438354, "loss": 1.1904, "odds_ratio_loss": 0.5695692300796509, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.11334426701068878, "rewards/margins": 0.027357015758752823, "rewards/rejected": -0.1407012790441513, "sft_loss": 1.1334426403045654, "step": 40 }, { "epoch": 0.11131725417439703, "grad_norm": 1.5694094896316528, "learning_rate": 4.9837580167971476e-05, "logits/chosen": 0.11076197773218155, "logits/rejected": 0.06242935732007027, "logps/chosen": -1.119387149810791, "logps/rejected": -1.5096395015716553, "loss": 1.1728, "odds_ratio_loss": 0.5343354940414429, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1119387149810791, "rewards/margins": 0.03902522847056389, "rewards/rejected": -0.1509639471769333, "sft_loss": 1.119387149810791, "step": 45 }, { "epoch": 0.12368583797155226, "grad_norm": 1.656472086906433, "learning_rate": 4.9798621655708695e-05, "logits/chosen": 0.1408427357673645, "logits/rejected": 0.12525486946105957, "logps/chosen": -1.0644111633300781, "logps/rejected": -1.6023738384246826, "loss": 1.1126, "odds_ratio_loss": 0.48157334327697754, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.10644111782312393, "rewards/margins": 0.05379629135131836, "rewards/rejected": -0.1602374017238617, "sft_loss": 1.0644111633300781, "step": 50 }, { "epoch": 0.1360544217687075, "grad_norm": 1.5853365659713745, "learning_rate": 4.9755497750491744e-05, "logits/chosen": 0.17553505301475525, "logits/rejected": 0.16454991698265076, "logps/chosen": -1.1122915744781494, "logps/rejected": -2.071721315383911, "loss": 1.1531, "odds_ratio_loss": 0.4079020619392395, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.11122916638851166, "rewards/margins": 0.09594295173883438, "rewards/rejected": -0.20717212557792664, "sft_loss": 1.1122915744781494, "step": 55 }, { "epoch": 0.14842300556586271, "grad_norm": 1.7390332221984863, "learning_rate": 4.9708215695788154e-05, "logits/chosen": 0.17281469702720642, "logits/rejected": 0.15900108218193054, "logps/chosen": -1.0756841897964478, "logps/rejected": -2.4330568313598633, "loss": 1.1118, "odds_ratio_loss": 0.361270010471344, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1075684204697609, "rewards/margins": 0.13573727011680603, "rewards/rejected": -0.24330571293830872, "sft_loss": 1.0756841897964478, "step": 60 }, { "epoch": 0.16079158936301793, "grad_norm": 1.753174901008606, "learning_rate": 4.965678343350455e-05, "logits/chosen": 0.19705705344676971, "logits/rejected": 0.18248306214809418, "logps/chosen": -1.0352728366851807, "logps/rejected": -2.357375144958496, "loss": 1.0779, "odds_ratio_loss": 0.42638930678367615, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.10352728515863419, "rewards/margins": 0.13221022486686707, "rewards/rejected": -0.23573748767375946, "sft_loss": 1.0352728366851807, "step": 65 }, { "epoch": 0.17316017316017315, "grad_norm": 1.6897480487823486, "learning_rate": 4.9601209602652646e-05, "logits/chosen": 0.17476847767829895, "logits/rejected": 0.14479169249534607, "logps/chosen": -1.0864959955215454, "logps/rejected": -2.68327260017395, "loss": 1.1266, "odds_ratio_loss": 0.4005963206291199, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1086495965719223, "rewards/margins": 0.1596776396036148, "rewards/rejected": -0.2683272659778595, "sft_loss": 1.0864959955215454, "step": 70 }, { "epoch": 0.18552875695732837, "grad_norm": 1.58122718334198, "learning_rate": 4.954150353789816e-05, "logits/chosen": 0.1830178052186966, "logits/rejected": 0.16554853320121765, "logps/chosen": -1.0824503898620605, "logps/rejected": -2.600160598754883, "loss": 1.1288, "odds_ratio_loss": 0.46338266134262085, "rewards/accuracies": 0.6875, "rewards/chosen": -0.10824503749608994, "rewards/margins": 0.15177100896835327, "rewards/rejected": -0.2600160539150238, "sft_loss": 1.0824503898620605, "step": 75 }, { "epoch": 0.19789734075448362, "grad_norm": 1.6095364093780518, "learning_rate": 4.9477675267992884e-05, "logits/chosen": 0.1943037360906601, "logits/rejected": 0.1735829859972, "logps/chosen": -1.0325891971588135, "logps/rejected": -2.8387868404388428, "loss": 1.0692, "odds_ratio_loss": 0.36599820852279663, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.10325892269611359, "rewards/margins": 0.1806197613477707, "rewards/rejected": -0.28387871384620667, "sft_loss": 1.0325891971588135, "step": 80 }, { "epoch": 0.21026592455163884, "grad_norm": 1.8630558252334595, "learning_rate": 4.940973551409018e-05, "logits/chosen": 0.2234114110469818, "logits/rejected": 0.20650608837604523, "logps/chosen": -1.043575406074524, "logps/rejected": -2.9924750328063965, "loss": 1.0774, "odds_ratio_loss": 0.3379738926887512, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10435754060745239, "rewards/margins": 0.19488994777202606, "rewards/rejected": -0.29924747347831726, "sft_loss": 1.043575406074524, "step": 85 }, { "epoch": 0.22263450834879406, "grad_norm": 1.789881706237793, "learning_rate": 4.9337695687944134e-05, "logits/chosen": 0.201242595911026, "logits/rejected": 0.17671005427837372, "logps/chosen": -1.1162173748016357, "logps/rejected": -2.903432846069336, "loss": 1.1563, "odds_ratio_loss": 0.40065255761146545, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.11162175983190536, "rewards/margins": 0.17872151732444763, "rewards/rejected": -0.2903432548046112, "sft_loss": 1.1162173748016357, "step": 90 }, { "epoch": 0.23500309214594928, "grad_norm": 1.7106293439865112, "learning_rate": 4.926156788999277e-05, "logits/chosen": 0.19472253322601318, "logits/rejected": 0.17345264554023743, "logps/chosen": -1.0581421852111816, "logps/rejected": -2.624668836593628, "loss": 1.1003, "odds_ratio_loss": 0.42190617322921753, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.10581421852111816, "rewards/margins": 0.15665265917778015, "rewards/rejected": -0.2624668478965759, "sft_loss": 1.0581421852111816, "step": 95 }, { "epoch": 0.24737167594310452, "grad_norm": 1.706239104270935, "learning_rate": 4.9181364907325536e-05, "logits/chosen": 0.2012307196855545, "logits/rejected": 0.1834832727909088, "logps/chosen": -1.0977017879486084, "logps/rejected": -2.878708839416504, "loss": 1.1355, "odds_ratio_loss": 0.378123939037323, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.10977016389369965, "rewards/margins": 0.17810070514678955, "rewards/rejected": -0.2878708839416504, "sft_loss": 1.0977017879486084, "step": 100 }, { "epoch": 0.2597402597402597, "grad_norm": 1.6044131517410278, "learning_rate": 4.9097100211535455e-05, "logits/chosen": 0.18777336180210114, "logits/rejected": 0.17369869351387024, "logps/chosen": -1.0719921588897705, "logps/rejected": -2.8622806072235107, "loss": 1.1102, "odds_ratio_loss": 0.38193395733833313, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.10719920694828033, "rewards/margins": 0.17902883887290955, "rewards/rejected": -0.2862280607223511, "sft_loss": 1.0719921588897705, "step": 105 }, { "epoch": 0.272108843537415, "grad_norm": 1.608841896057129, "learning_rate": 4.900878795645637e-05, "logits/chosen": 0.21402505040168762, "logits/rejected": 0.1990688443183899, "logps/chosen": -1.0759098529815674, "logps/rejected": -3.1743438243865967, "loss": 1.1144, "odds_ratio_loss": 0.3845849633216858, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.10759098827838898, "rewards/margins": 0.2098434418439865, "rewards/rejected": -0.3174344003200531, "sft_loss": 1.0759098529815674, "step": 110 }, { "epoch": 0.2844774273345702, "grad_norm": 1.6189693212509155, "learning_rate": 4.891644297578549e-05, "logits/chosen": 0.27493441104888916, "logits/rejected": 0.2731192111968994, "logps/chosen": -1.0565831661224365, "logps/rejected": -3.354783296585083, "loss": 1.0924, "odds_ratio_loss": 0.35769230127334595, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.10565831512212753, "rewards/margins": 0.22981998324394226, "rewards/rejected": -0.3354783058166504, "sft_loss": 1.0565831661224365, "step": 115 }, { "epoch": 0.29684601113172543, "grad_norm": 1.7463057041168213, "learning_rate": 4.882008078059184e-05, "logits/chosen": 0.3636896312236786, "logits/rejected": 0.36413443088531494, "logps/chosen": -1.0972557067871094, "logps/rejected": -3.1429848670959473, "loss": 1.1346, "odds_ratio_loss": 0.3731198012828827, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.10972557961940765, "rewards/margins": 0.2045729160308838, "rewards/rejected": -0.31429848074913025, "sft_loss": 1.0972557067871094, "step": 120 }, { "epoch": 0.30921459492888065, "grad_norm": 1.642956018447876, "learning_rate": 4.871971755671084e-05, "logits/chosen": 0.2689715027809143, "logits/rejected": 0.27179154753685, "logps/chosen": -1.0361791849136353, "logps/rejected": -3.423287868499756, "loss": 1.0684, "odds_ratio_loss": 0.3219633400440216, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.10361792147159576, "rewards/margins": 0.23871088027954102, "rewards/rejected": -0.3423287868499756, "sft_loss": 1.0361791849136353, "step": 125 }, { "epoch": 0.32158317872603587, "grad_norm": 1.781355857849121, "learning_rate": 4.8615370162025605e-05, "logits/chosen": 0.33175164461135864, "logits/rejected": 0.3275807499885559, "logps/chosen": -1.0404772758483887, "logps/rejected": -3.328956127166748, "loss": 1.0771, "odds_ratio_loss": 0.36617815494537354, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.1040477305650711, "rewards/margins": 0.2288479059934616, "rewards/rejected": -0.3328956663608551, "sft_loss": 1.0404772758483887, "step": 130 }, { "epoch": 0.3339517625231911, "grad_norm": 1.6253814697265625, "learning_rate": 4.8507056123635383e-05, "logits/chosen": 0.2824321389198303, "logits/rejected": 0.2776624262332916, "logps/chosen": -1.0500437021255493, "logps/rejected": -3.0011651515960693, "loss": 1.0932, "odds_ratio_loss": 0.43134117126464844, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.10500438511371613, "rewards/margins": 0.19511213898658752, "rewards/rejected": -0.30011647939682007, "sft_loss": 1.0500437021255493, "step": 135 }, { "epoch": 0.3463203463203463, "grad_norm": 1.6987438201904297, "learning_rate": 4.83947936349115e-05, "logits/chosen": 0.28672313690185547, "logits/rejected": 0.28451842069625854, "logps/chosen": -1.094419240951538, "logps/rejected": -3.2499892711639404, "loss": 1.133, "odds_ratio_loss": 0.38557836413383484, "rewards/accuracies": 0.75, "rewards/chosen": -0.10944193601608276, "rewards/margins": 0.2155570238828659, "rewards/rejected": -0.3249989449977875, "sft_loss": 1.094419240951538, "step": 140 }, { "epoch": 0.3586889301175015, "grad_norm": 1.6374881267547607, "learning_rate": 4.827860155244149e-05, "logits/chosen": 0.3909720778465271, "logits/rejected": 0.3937540054321289, "logps/chosen": -1.0620362758636475, "logps/rejected": -3.3431522846221924, "loss": 1.0996, "odds_ratio_loss": 0.37589576840400696, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.10620363801717758, "rewards/margins": 0.22811158001422882, "rewards/rejected": -0.3343152403831482, "sft_loss": 1.0620362758636475, "step": 145 }, { "epoch": 0.37105751391465674, "grad_norm": 1.8385009765625, "learning_rate": 4.815849939286171e-05, "logits/chosen": 0.3534628748893738, "logits/rejected": 0.3815380930900574, "logps/chosen": -1.0393407344818115, "logps/rejected": -3.4164493083953857, "loss": 1.0732, "odds_ratio_loss": 0.33901771903038025, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.10393409430980682, "rewards/margins": 0.2377108782529831, "rewards/rejected": -0.34164494276046753, "sft_loss": 1.0393407344818115, "step": 150 }, { "epoch": 0.383426097711812, "grad_norm": 1.6490304470062256, "learning_rate": 4.803450732957924e-05, "logits/chosen": 0.3761630058288574, "logits/rejected": 0.3709793984889984, "logps/chosen": -1.043731927871704, "logps/rejected": -3.3615729808807373, "loss": 1.0772, "odds_ratio_loss": 0.33452075719833374, "rewards/accuracies": 0.8125, "rewards/chosen": -0.10437319427728653, "rewards/margins": 0.23178410530090332, "rewards/rejected": -0.33615732192993164, "sft_loss": 1.043731927871704, "step": 155 }, { "epoch": 0.39579468150896724, "grad_norm": 1.6318919658660889, "learning_rate": 4.790664618938332e-05, "logits/chosen": 0.3841768801212311, "logits/rejected": 0.39820531010627747, "logps/chosen": -1.0293159484863281, "logps/rejected": -3.458782911300659, "loss": 1.0665, "odds_ratio_loss": 0.37178558111190796, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.10293158143758774, "rewards/margins": 0.24294674396514893, "rewards/rejected": -0.3458783030509949, "sft_loss": 1.0293159484863281, "step": 160 }, { "epoch": 0.40816326530612246, "grad_norm": 1.6792525053024292, "learning_rate": 4.7774937448947124e-05, "logits/chosen": 0.37755170464515686, "logits/rejected": 0.3805047869682312, "logps/chosen": -1.0624099969863892, "logps/rejected": -3.163449287414551, "loss": 1.1036, "odds_ratio_loss": 0.41204801201820374, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.10624101012945175, "rewards/margins": 0.21010391414165497, "rewards/rejected": -0.3163449168205261, "sft_loss": 1.0624099969863892, "step": 165 }, { "epoch": 0.4205318491032777, "grad_norm": 1.6043797731399536, "learning_rate": 4.7639403231220355e-05, "logits/chosen": 0.309886634349823, "logits/rejected": 0.3307574689388275, "logps/chosen": -1.038699746131897, "logps/rejected": -3.5207486152648926, "loss": 1.0711, "odds_ratio_loss": 0.3243294060230255, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1038699746131897, "rewards/margins": 0.24820487201213837, "rewards/rejected": -0.35207483172416687, "sft_loss": 1.038699746131897, "step": 170 }, { "epoch": 0.4329004329004329, "grad_norm": 1.7234686613082886, "learning_rate": 4.7500066301713254e-05, "logits/chosen": 0.26021939516067505, "logits/rejected": 0.29408133029937744, "logps/chosen": -1.0488821268081665, "logps/rejected": -3.5299739837646484, "loss": 1.0861, "odds_ratio_loss": 0.37170323729515076, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.10488821566104889, "rewards/margins": 0.24810917675495148, "rewards/rejected": -0.35299739241600037, "sft_loss": 1.0488821268081665, "step": 175 }, { "epoch": 0.4452690166975881, "grad_norm": 1.5969773530960083, "learning_rate": 4.7356950064672745e-05, "logits/chosen": 0.22508367896080017, "logits/rejected": 0.23831474781036377, "logps/chosen": -1.0288432836532593, "logps/rejected": -2.8129589557647705, "loss": 1.0738, "odds_ratio_loss": 0.4491986334323883, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.10288432985544205, "rewards/margins": 0.1784115731716156, "rewards/rejected": -0.28129592537879944, "sft_loss": 1.0288432836532593, "step": 180 }, { "epoch": 0.45763760049474334, "grad_norm": 1.8097518682479858, "learning_rate": 4.7210078559151226e-05, "logits/chosen": 0.22750480473041534, "logits/rejected": 0.2586122155189514, "logps/chosen": -1.0465128421783447, "logps/rejected": -3.433408260345459, "loss": 1.0863, "odds_ratio_loss": 0.39764371514320374, "rewards/accuracies": 0.6875, "rewards/chosen": -0.10465127229690552, "rewards/margins": 0.2386895716190338, "rewards/rejected": -0.34334081411361694, "sft_loss": 1.0465128421783447, "step": 185 }, { "epoch": 0.47000618429189855, "grad_norm": 1.7439857721328735, "learning_rate": 4.705947645496877e-05, "logits/chosen": 0.28627103567123413, "logits/rejected": 0.31621652841567993, "logps/chosen": -1.0685290098190308, "logps/rejected": -3.57133412361145, "loss": 1.1033, "odds_ratio_loss": 0.34804287552833557, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.10685290396213531, "rewards/margins": 0.2502805292606354, "rewards/rejected": -0.3571334481239319, "sft_loss": 1.0685290098190308, "step": 190 }, { "epoch": 0.48237476808905383, "grad_norm": 1.6414059400558472, "learning_rate": 4.6905169048569365e-05, "logits/chosen": 0.26304224133491516, "logits/rejected": 0.3106682300567627, "logps/chosen": -1.0814480781555176, "logps/rejected": -3.8510982990264893, "loss": 1.1143, "odds_ratio_loss": 0.32805657386779785, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10814480483531952, "rewards/margins": 0.27696502208709717, "rewards/rejected": -0.3851098120212555, "sft_loss": 1.0814480781555176, "step": 195 }, { "epoch": 0.49474335188620905, "grad_norm": 1.6593708992004395, "learning_rate": 4.674718225877189e-05, "logits/chosen": 0.27768149971961975, "logits/rejected": 0.32530421018600464, "logps/chosen": -1.0179694890975952, "logps/rejected": -3.902677536010742, "loss": 1.0504, "odds_ratio_loss": 0.3247123956680298, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.1017969623208046, "rewards/margins": 0.2884708046913147, "rewards/rejected": -0.3902677595615387, "sft_loss": 1.0179694890975952, "step": 200 }, { "epoch": 0.5071119356833642, "grad_norm": 1.660892367362976, "learning_rate": 4.658554262241659e-05, "logits/chosen": 0.20089022815227509, "logits/rejected": 0.25533175468444824, "logps/chosen": -1.0165807008743286, "logps/rejected": -3.7854084968566895, "loss": 1.0503, "odds_ratio_loss": 0.3373025059700012, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.10165806859731674, "rewards/margins": 0.27688276767730713, "rewards/rejected": -0.37854084372520447, "sft_loss": 1.0165807008743286, "step": 205 }, { "epoch": 0.5194805194805194, "grad_norm": 1.8275647163391113, "learning_rate": 4.64202772899077e-05, "logits/chosen": 0.2793352007865906, "logits/rejected": 0.31450778245925903, "logps/chosen": -1.0388555526733398, "logps/rejected": -3.41314697265625, "loss": 1.076, "odds_ratio_loss": 0.37191542983055115, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.10388555377721786, "rewards/margins": 0.23742914199829102, "rewards/rejected": -0.34131473302841187, "sft_loss": 1.0388555526733398, "step": 210 }, { "epoch": 0.5318491032776747, "grad_norm": 1.6608279943466187, "learning_rate": 4.6251414020653036e-05, "logits/chosen": 0.2836456894874573, "logits/rejected": 0.3255106210708618, "logps/chosen": -0.9951368570327759, "logps/rejected": -3.482539653778076, "loss": 1.0346, "odds_ratio_loss": 0.39438098669052124, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.0995136946439743, "rewards/margins": 0.2487403154373169, "rewards/rejected": -0.3482540249824524, "sft_loss": 0.9951368570327759, "step": 215 }, { "epoch": 0.54421768707483, "grad_norm": 1.6944583654403687, "learning_rate": 4.607898117840126e-05, "logits/chosen": 0.3252732455730438, "logits/rejected": 0.32358455657958984, "logps/chosen": -1.067556381225586, "logps/rejected": -3.43945574760437, "loss": 1.1091, "odds_ratio_loss": 0.41562288999557495, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.10675563663244247, "rewards/margins": 0.23718991875648499, "rewards/rejected": -0.34394556283950806, "sft_loss": 1.067556381225586, "step": 220 }, { "epoch": 0.5565862708719852, "grad_norm": 1.7083643674850464, "learning_rate": 4.590300772647768e-05, "logits/chosen": 0.35806629061698914, "logits/rejected": 0.40527772903442383, "logps/chosen": -0.9661971926689148, "logps/rejected": -3.7241454124450684, "loss": 0.9985, "odds_ratio_loss": 0.32330435514450073, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09661972522735596, "rewards/margins": 0.2757948637008667, "rewards/rejected": -0.37241455912590027, "sft_loss": 0.9661971926689148, "step": 225 }, { "epoch": 0.5689548546691404, "grad_norm": 1.8688366413116455, "learning_rate": 4.57235232229193e-05, "logits/chosen": 0.37934738397598267, "logits/rejected": 0.40611180663108826, "logps/chosen": -1.0094630718231201, "logps/rejected": -3.4392428398132324, "loss": 1.0535, "odds_ratio_loss": 0.4402863383293152, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.10094629228115082, "rewards/margins": 0.24297800660133362, "rewards/rejected": -0.343924343585968, "sft_loss": 1.0094630718231201, "step": 230 }, { "epoch": 0.5813234384662956, "grad_norm": 1.77333402633667, "learning_rate": 4.554055781551002e-05, "logits/chosen": 0.38877761363983154, "logits/rejected": 0.4154542088508606, "logps/chosen": -0.97468101978302, "logps/rejected": -3.6887001991271973, "loss": 1.0076, "odds_ratio_loss": 0.3288685083389282, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.09746809303760529, "rewards/margins": 0.27140194177627563, "rewards/rejected": -0.3688700199127197, "sft_loss": 0.97468101978302, "step": 235 }, { "epoch": 0.5936920222634509, "grad_norm": 1.711516261100769, "learning_rate": 4.535414223671672e-05, "logits/chosen": 0.3428501486778259, "logits/rejected": 0.3981688916683197, "logps/chosen": -1.0287367105484009, "logps/rejected": -4.2329864501953125, "loss": 1.0566, "odds_ratio_loss": 0.2782394289970398, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.10287366062402725, "rewards/margins": 0.32042500376701355, "rewards/rejected": -0.4232986867427826, "sft_loss": 1.0287367105484009, "step": 240 }, { "epoch": 0.6060606060606061, "grad_norm": 1.654464602470398, "learning_rate": 4.516430779852721e-05, "logits/chosen": 0.3597055673599243, "logits/rejected": 0.38934797048568726, "logps/chosen": -1.0615828037261963, "logps/rejected": -3.88560152053833, "loss": 1.0993, "odds_ratio_loss": 0.37732991576194763, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.10615827888250351, "rewards/margins": 0.2824018597602844, "rewards/rejected": -0.38856011629104614, "sft_loss": 1.0615828037261963, "step": 245 }, { "epoch": 0.6184291898577613, "grad_norm": 1.7029634714126587, "learning_rate": 4.497108638719072e-05, "logits/chosen": 0.391928106546402, "logits/rejected": 0.4485929608345032, "logps/chosen": -1.002569317817688, "logps/rejected": -4.275712490081787, "loss": 1.0319, "odds_ratio_loss": 0.2937774062156677, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.10025691986083984, "rewards/margins": 0.3273143172264099, "rewards/rejected": -0.42757129669189453, "sft_loss": 1.002569317817688, "step": 250 }, { "epoch": 0.6307977736549165, "grad_norm": 1.7871607542037964, "learning_rate": 4.47745104578621e-05, "logits/chosen": 0.4129851460456848, "logits/rejected": 0.4460812509059906, "logps/chosen": -1.0623544454574585, "logps/rejected": -3.2420449256896973, "loss": 1.1072, "odds_ratio_loss": 0.4487124979496002, "rewards/accuracies": 0.6875, "rewards/chosen": -0.10623542964458466, "rewards/margins": 0.21796905994415283, "rewards/rejected": -0.3242045044898987, "sft_loss": 1.0623544454574585, "step": 255 }, { "epoch": 0.6431663574520717, "grad_norm": 1.7240341901779175, "learning_rate": 4.4574613029150274e-05, "logits/chosen": 0.431743323802948, "logits/rejected": 0.45771631598472595, "logps/chosen": -0.9919835925102234, "logps/rejected": -3.5913169384002686, "loss": 1.0262, "odds_ratio_loss": 0.3426302373409271, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0991983562707901, "rewards/margins": 0.25993332266807556, "rewards/rejected": -0.35913175344467163, "sft_loss": 0.9919835925102234, "step": 260 }, { "epoch": 0.655534941249227, "grad_norm": 1.7802373170852661, "learning_rate": 4.437142767757225e-05, "logits/chosen": 0.461709588766098, "logits/rejected": 0.4749070107936859, "logps/chosen": -1.0436687469482422, "logps/rejected": -3.7742443084716797, "loss": 1.0786, "odds_ratio_loss": 0.3492099642753601, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10436688363552094, "rewards/margins": 0.2730575501918793, "rewards/rejected": -0.3774244487285614, "sft_loss": 1.0436687469482422, "step": 265 }, { "epoch": 0.6679035250463822, "grad_norm": 1.5245323181152344, "learning_rate": 4.416498853191321e-05, "logits/chosen": 0.46483153104782104, "logits/rejected": 0.5216314196586609, "logps/chosen": -0.9942825436592102, "logps/rejected": -3.7933108806610107, "loss": 1.0283, "odds_ratio_loss": 0.3399539589881897, "rewards/accuracies": 0.75, "rewards/chosen": -0.09942825138568878, "rewards/margins": 0.2799028158187866, "rewards/rejected": -0.3793310821056366, "sft_loss": 0.9942825436592102, "step": 270 }, { "epoch": 0.6802721088435374, "grad_norm": 1.7103205919265747, "learning_rate": 4.395533026749404e-05, "logits/chosen": 0.40999603271484375, "logits/rejected": 0.46490398049354553, "logps/chosen": -1.0043413639068604, "logps/rejected": -3.855510711669922, "loss": 1.0371, "odds_ratio_loss": 0.32757169008255005, "rewards/accuracies": 0.8125, "rewards/chosen": -0.10043412446975708, "rewards/margins": 0.28511694073677063, "rewards/rejected": -0.3855510652065277, "sft_loss": 1.0043413639068604, "step": 275 }, { "epoch": 0.6926406926406926, "grad_norm": 1.7623897790908813, "learning_rate": 4.374248810034686e-05, "logits/chosen": 0.44833698868751526, "logits/rejected": 0.48242998123168945, "logps/chosen": -1.0079069137573242, "logps/rejected": -3.7589282989501953, "loss": 1.0446, "odds_ratio_loss": 0.36661431193351746, "rewards/accuracies": 0.6875, "rewards/chosen": -0.10079067945480347, "rewards/margins": 0.2751021385192871, "rewards/rejected": -0.37589284777641296, "sft_loss": 1.0079069137573242, "step": 280 }, { "epoch": 0.7050092764378478, "grad_norm": 1.5534154176712036, "learning_rate": 4.352649778129993e-05, "logits/chosen": 0.38957467675209045, "logits/rejected": 0.4419080317020416, "logps/chosen": -0.9628938436508179, "logps/rejected": -3.7131409645080566, "loss": 0.997, "odds_ratio_loss": 0.34095168113708496, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.09628938138484955, "rewards/margins": 0.2750246822834015, "rewards/rejected": -0.37131404876708984, "sft_loss": 0.9628938436508179, "step": 285 }, { "epoch": 0.717377860235003, "grad_norm": 1.6475731134414673, "learning_rate": 4.330739558997254e-05, "logits/chosen": 0.37073180079460144, "logits/rejected": 0.41065892577171326, "logps/chosen": -1.0283139944076538, "logps/rejected": -3.7884624004364014, "loss": 1.0641, "odds_ratio_loss": 0.35768234729766846, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.1028314009308815, "rewards/margins": 0.2760148346424103, "rewards/rejected": -0.37884625792503357, "sft_loss": 1.0283139944076538, "step": 290 }, { "epoch": 0.7297464440321583, "grad_norm": 1.6990858316421509, "learning_rate": 4.308521832868124e-05, "logits/chosen": 0.41751202940940857, "logits/rejected": 0.46295857429504395, "logps/chosen": -1.0004329681396484, "logps/rejected": -3.6680397987365723, "loss": 1.0301, "odds_ratio_loss": 0.29643428325653076, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.10004329681396484, "rewards/margins": 0.2667606472969055, "rewards/rejected": -0.36680394411087036, "sft_loss": 1.0004329681396484, "step": 295 }, { "epoch": 0.7421150278293135, "grad_norm": 1.7011147737503052, "learning_rate": 4.2860003316258183e-05, "logits/chosen": 0.43445339798927307, "logits/rejected": 0.42533907294273376, "logps/chosen": -1.0412126779556274, "logps/rejected": -2.8987720012664795, "loss": 1.0824, "odds_ratio_loss": 0.4119531512260437, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.10412125289440155, "rewards/margins": 0.18575593829154968, "rewards/rejected": -0.2898772358894348, "sft_loss": 1.0412126779556274, "step": 300 }, { "epoch": 0.7544836116264688, "grad_norm": 1.8538843393325806, "learning_rate": 4.263178838178269e-05, "logits/chosen": 0.35644233226776123, "logits/rejected": 0.416471391916275, "logps/chosen": -1.0157910585403442, "logps/rejected": -3.5898735523223877, "loss": 1.0489, "odds_ratio_loss": 0.3313884139060974, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.10157910734415054, "rewards/margins": 0.2574082612991333, "rewards/rejected": -0.35898739099502563, "sft_loss": 1.0157910585403442, "step": 305 }, { "epoch": 0.766852195423624, "grad_norm": 1.72649347782135, "learning_rate": 4.240061185822717e-05, "logits/chosen": 0.4288834035396576, "logits/rejected": 0.4952777028083801, "logps/chosen": -0.9988624453544617, "logps/rejected": -4.193109035491943, "loss": 1.028, "odds_ratio_loss": 0.2918681204319, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09988625347614288, "rewards/margins": 0.31942468881607056, "rewards/rejected": -0.41931095719337463, "sft_loss": 0.9988624453544617, "step": 310 }, { "epoch": 0.7792207792207793, "grad_norm": 1.8174793720245361, "learning_rate": 4.216651257601842e-05, "logits/chosen": 0.45476657152175903, "logits/rejected": 0.5256853699684143, "logps/chosen": -0.9794076681137085, "logps/rejected": -4.4214091300964355, "loss": 1.0069, "odds_ratio_loss": 0.2752961218357086, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09794078022241592, "rewards/margins": 0.3442001938819885, "rewards/rejected": -0.4421409070491791, "sft_loss": 0.9794076681137085, "step": 315 }, { "epoch": 0.7915893630179345, "grad_norm": 1.7876821756362915, "learning_rate": 4.192952985651527e-05, "logits/chosen": 0.40811610221862793, "logits/rejected": 0.4600760340690613, "logps/chosen": -1.0282108783721924, "logps/rejected": -4.184118270874023, "loss": 1.0602, "odds_ratio_loss": 0.3197360932826996, "rewards/accuracies": 0.8125, "rewards/chosen": -0.10282108932733536, "rewards/margins": 0.3155907392501831, "rewards/rejected": -0.41841182112693787, "sft_loss": 1.0282108783721924, "step": 320 }, { "epoch": 0.8039579468150897, "grad_norm": 1.7779359817504883, "learning_rate": 4.168970350540384e-05, "logits/chosen": 0.40264803171157837, "logits/rejected": 0.47585710883140564, "logps/chosen": -1.055912733078003, "logps/rejected": -4.692052364349365, "loss": 1.0868, "odds_ratio_loss": 0.30881667137145996, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.10559127479791641, "rewards/margins": 0.363614022731781, "rewards/rejected": -0.46920523047447205, "sft_loss": 1.055912733078003, "step": 325 }, { "epoch": 0.8163265306122449, "grad_norm": 1.7316912412643433, "learning_rate": 4.144707380601146e-05, "logits/chosen": 0.38019031286239624, "logits/rejected": 0.460233211517334, "logps/chosen": -1.024593710899353, "logps/rejected": -4.758363723754883, "loss": 1.0536, "odds_ratio_loss": 0.2901023030281067, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1024593859910965, "rewards/margins": 0.3733770251274109, "rewards/rejected": -0.4758363664150238, "sft_loss": 1.024593710899353, "step": 330 }, { "epoch": 0.8286951144094001, "grad_norm": 1.6575307846069336, "learning_rate": 4.120168151254028e-05, "logits/chosen": 0.4096407890319824, "logits/rejected": 0.4633429944515228, "logps/chosen": -0.9848533868789673, "logps/rejected": -4.400225639343262, "loss": 1.0147, "odds_ratio_loss": 0.29888057708740234, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09848534315824509, "rewards/margins": 0.3415372669696808, "rewards/rejected": -0.44002261757850647, "sft_loss": 0.9848533868789673, "step": 335 }, { "epoch": 0.8410636982065554, "grad_norm": 1.7004340887069702, "learning_rate": 4.0953567843221885e-05, "logits/chosen": 0.4503534734249115, "logits/rejected": 0.5456077456474304, "logps/chosen": -0.9916806221008301, "logps/rejected": -5.201076984405518, "loss": 1.0198, "odds_ratio_loss": 0.28106489777565, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.0991680696606636, "rewards/margins": 0.4209396243095398, "rewards/rejected": -0.520107626914978, "sft_loss": 0.9916806221008301, "step": 340 }, { "epoch": 0.8534322820037106, "grad_norm": 1.8203457593917847, "learning_rate": 4.07027744733939e-05, "logits/chosen": 0.4509235918521881, "logits/rejected": 0.5145568251609802, "logps/chosen": -1.029637098312378, "logps/rejected": -4.811137676239014, "loss": 1.0595, "odds_ratio_loss": 0.29896241426467896, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10296370089054108, "rewards/margins": 0.3781500458717346, "rewards/rejected": -0.4811137616634369, "sft_loss": 1.029637098312378, "step": 345 }, { "epoch": 0.8658008658008658, "grad_norm": 1.8329627513885498, "learning_rate": 4.0449343528499814e-05, "logits/chosen": 0.4359091818332672, "logits/rejected": 0.5287674069404602, "logps/chosen": -1.0532002449035645, "logps/rejected": -5.109292984008789, "loss": 1.0868, "odds_ratio_loss": 0.3357505202293396, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.1053200215101242, "rewards/margins": 0.40560922026634216, "rewards/rejected": -0.51092928647995, "sft_loss": 1.0532002449035645, "step": 350 }, { "epoch": 0.878169449598021, "grad_norm": 1.733416199684143, "learning_rate": 4.01933175770133e-05, "logits/chosen": 0.47367730736732483, "logits/rejected": 0.5819244384765625, "logps/chosen": -1.036699652671814, "logps/rejected": -5.688261985778809, "loss": 1.0609, "odds_ratio_loss": 0.24238571524620056, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.10366997867822647, "rewards/margins": 0.4651561677455902, "rewards/rejected": -0.5688261985778809, "sft_loss": 1.036699652671814, "step": 355 }, { "epoch": 0.8905380333951762, "grad_norm": 2.341310739517212, "learning_rate": 3.993473962328792e-05, "logits/chosen": 0.5067422389984131, "logits/rejected": 0.6225411891937256, "logps/chosen": -0.9514067769050598, "logps/rejected": -6.085948467254639, "loss": 0.9743, "odds_ratio_loss": 0.22865231335163116, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.09514066576957703, "rewards/margins": 0.5134541988372803, "rewards/rejected": -0.6085948348045349, "sft_loss": 0.9514067769050598, "step": 360 }, { "epoch": 0.9029066171923315, "grad_norm": 1.8303320407867432, "learning_rate": 3.967365310033385e-05, "logits/chosen": 0.5122908353805542, "logits/rejected": 0.5836796760559082, "logps/chosen": -0.9814733266830444, "logps/rejected": -5.346036434173584, "loss": 1.0144, "odds_ratio_loss": 0.32878556847572327, "rewards/accuracies": 0.75, "rewards/chosen": -0.09814734756946564, "rewards/margins": 0.4364562928676605, "rewards/rejected": -0.5346036553382874, "sft_loss": 0.9814733266830444, "step": 365 }, { "epoch": 0.9152752009894867, "grad_norm": 1.832222580909729, "learning_rate": 3.941010186252247e-05, "logits/chosen": 0.5147444605827332, "logits/rejected": 0.6010658144950867, "logps/chosen": -0.9541529417037964, "logps/rejected": -5.437819480895996, "loss": 0.9826, "odds_ratio_loss": 0.28464651107788086, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09541530907154083, "rewards/margins": 0.448366641998291, "rewards/rejected": -0.5437820553779602, "sft_loss": 0.9541529417037964, "step": 370 }, { "epoch": 0.9276437847866419, "grad_norm": 1.8213523626327515, "learning_rate": 3.914413017822012e-05, "logits/chosen": 0.5199439525604248, "logits/rejected": 0.5949954986572266, "logps/chosen": -1.042752981185913, "logps/rejected": -5.3517913818359375, "loss": 1.0748, "odds_ratio_loss": 0.3207927346229553, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.10427530109882355, "rewards/margins": 0.4309038519859314, "rewards/rejected": -0.5351791381835938, "sft_loss": 1.042752981185913, "step": 375 }, { "epoch": 0.9400123685837971, "grad_norm": 1.7328555583953857, "learning_rate": 3.8875782722352525e-05, "logits/chosen": 0.508941113948822, "logits/rejected": 0.6161313056945801, "logps/chosen": -1.0072705745697021, "logps/rejected": -6.502432823181152, "loss": 1.0313, "odds_ratio_loss": 0.24036996066570282, "rewards/accuracies": 0.8125, "rewards/chosen": -0.10072706639766693, "rewards/margins": 0.5495162606239319, "rewards/rejected": -0.65024334192276, "sft_loss": 1.0072705745697021, "step": 380 }, { "epoch": 0.9523809523809523, "grad_norm": 1.8125081062316895, "learning_rate": 3.8605104568900685e-05, "logits/chosen": 0.4614337086677551, "logits/rejected": 0.5842275023460388, "logps/chosen": -0.9928499460220337, "logps/rejected": -5.991772651672363, "loss": 1.0213, "odds_ratio_loss": 0.2844465374946594, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09928499162197113, "rewards/margins": 0.49989232420921326, "rewards/rejected": -0.5991773009300232, "sft_loss": 0.9928499460220337, "step": 385 }, { "epoch": 0.9647495361781077, "grad_norm": 1.7526085376739502, "learning_rate": 3.8332141183329945e-05, "logits/chosen": 0.5468868613243103, "logits/rejected": 0.659899890422821, "logps/chosen": -0.9818607568740845, "logps/rejected": -6.227389812469482, "loss": 1.0088, "odds_ratio_loss": 0.2697482109069824, "rewards/accuracies": 0.8125, "rewards/chosen": -0.09818606078624725, "rewards/margins": 0.5245529413223267, "rewards/rejected": -0.6227389574050903, "sft_loss": 0.9818607568740845, "step": 390 }, { "epoch": 0.9771181199752629, "grad_norm": 1.7234915494918823, "learning_rate": 3.805693841495318e-05, "logits/chosen": 0.5352888703346252, "logits/rejected": 0.600743293762207, "logps/chosen": -0.9930388331413269, "logps/rejected": -5.411107540130615, "loss": 1.0217, "odds_ratio_loss": 0.28665605187416077, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.09930390119552612, "rewards/margins": 0.4418068826198578, "rewards/rejected": -0.5411108136177063, "sft_loss": 0.9930388331413269, "step": 395 }, { "epoch": 0.9894867037724181, "grad_norm": 1.7299367189407349, "learning_rate": 3.777954248922952e-05, "logits/chosen": 0.5518743395805359, "logits/rejected": 0.6515039801597595, "logps/chosen": -0.9427730441093445, "logps/rejected": -5.97468376159668, "loss": 0.9653, "odds_ratio_loss": 0.22559209167957306, "rewards/accuracies": 0.875, "rewards/chosen": -0.09427729994058609, "rewards/margins": 0.5031911134719849, "rewards/rejected": -0.597468376159668, "sft_loss": 0.9427730441093445, "step": 400 }, { "epoch": 1.0018552875695732, "grad_norm": 1.6203914880752563, "learning_rate": 3.7500000000000003e-05, "logits/chosen": 0.5115292072296143, "logits/rejected": 0.6834827661514282, "logps/chosen": -0.9711102247238159, "logps/rejected": -6.900592803955078, "loss": 0.9925, "odds_ratio_loss": 0.21393737196922302, "rewards/accuracies": 0.875, "rewards/chosen": -0.09711103141307831, "rewards/margins": 0.592948317527771, "rewards/rejected": -0.6900593042373657, "sft_loss": 0.9711102247238159, "step": 405 }, { "epoch": 1.0142238713667284, "grad_norm": 1.819032907485962, "learning_rate": 3.721835790166115e-05, "logits/chosen": 0.5814910531044006, "logits/rejected": 0.7283499836921692, "logps/chosen": -0.8956617116928101, "logps/rejected": -6.743956565856934, "loss": 0.9167, "odds_ratio_loss": 0.2101050317287445, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.0895661860704422, "rewards/margins": 0.5848295092582703, "rewards/rejected": -0.6743956804275513, "sft_loss": 0.8956617116928101, "step": 410 }, { "epoch": 1.0265924551638836, "grad_norm": 1.7348142862319946, "learning_rate": 3.693466350127818e-05, "logits/chosen": 0.7273398637771606, "logits/rejected": 0.8442907333374023, "logps/chosen": -0.9200183749198914, "logps/rejected": -6.194988250732422, "loss": 0.9432, "odds_ratio_loss": 0.23148088157176971, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.09200183302164078, "rewards/margins": 0.5274969935417175, "rewards/rejected": -0.6194988489151001, "sft_loss": 0.9200183749198914, "step": 415 }, { "epoch": 1.0389610389610389, "grad_norm": 1.8714258670806885, "learning_rate": 3.664896445063889e-05, "logits/chosen": 0.7601157426834106, "logits/rejected": 0.9092599749565125, "logps/chosen": -0.8914841413497925, "logps/rejected": -6.917119026184082, "loss": 0.911, "odds_ratio_loss": 0.1953209936618805, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.08914841711521149, "rewards/margins": 0.6025635004043579, "rewards/rejected": -0.6917119026184082, "sft_loss": 0.8914841413497925, "step": 420 }, { "epoch": 1.051329622758194, "grad_norm": 1.9854021072387695, "learning_rate": 3.6361308738249606e-05, "logits/chosen": 0.6464577913284302, "logits/rejected": 0.7664445638656616, "logps/chosen": -0.9081498384475708, "logps/rejected": -6.5246262550354, "loss": 0.9341, "odds_ratio_loss": 0.2599160373210907, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.0908149853348732, "rewards/margins": 0.5616477131843567, "rewards/rejected": -0.6524627804756165, "sft_loss": 0.9081498384475708, "step": 425 }, { "epoch": 1.0636982065553493, "grad_norm": 2.3502695560455322, "learning_rate": 3.6071744681274674e-05, "logits/chosen": 0.6742197275161743, "logits/rejected": 0.7928298115730286, "logps/chosen": -0.866266131401062, "logps/rejected": -6.284573554992676, "loss": 0.8933, "odds_ratio_loss": 0.27044612169265747, "rewards/accuracies": 0.8125, "rewards/chosen": -0.08662661910057068, "rewards/margins": 0.5418307781219482, "rewards/rejected": -0.6284574270248413, "sft_loss": 0.866266131401062, "step": 430 }, { "epoch": 1.0760667903525047, "grad_norm": 1.9892750978469849, "learning_rate": 3.578032091742061e-05, "logits/chosen": 0.6859541535377502, "logits/rejected": 0.7902997136116028, "logps/chosen": -0.9058157205581665, "logps/rejected": -6.380780220031738, "loss": 0.9336, "odds_ratio_loss": 0.2780543863773346, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.09058157354593277, "rewards/margins": 0.5474964380264282, "rewards/rejected": -0.6380780339241028, "sft_loss": 0.9058157205581665, "step": 435 }, { "epoch": 1.08843537414966, "grad_norm": 1.894260287284851, "learning_rate": 3.548708639676658e-05, "logits/chosen": 0.6156673431396484, "logits/rejected": 0.756851315498352, "logps/chosen": -0.9166086912155151, "logps/rejected": -7.627225399017334, "loss": 0.9344, "odds_ratio_loss": 0.1778070032596588, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.09166087210178375, "rewards/margins": 0.6710616946220398, "rewards/rejected": -0.7627225518226624, "sft_loss": 0.9166086912155151, "step": 440 }, { "epoch": 1.1008039579468152, "grad_norm": 2.0216593742370605, "learning_rate": 3.519209037354222e-05, "logits/chosen": 0.6823121309280396, "logits/rejected": 0.7936086654663086, "logps/chosen": -0.8874192237854004, "logps/rejected": -6.673317909240723, "loss": 0.9125, "odds_ratio_loss": 0.25117507576942444, "rewards/accuracies": 0.8125, "rewards/chosen": -0.08874191343784332, "rewards/margins": 0.5785898566246033, "rewards/rejected": -0.6673317551612854, "sft_loss": 0.8874192237854004, "step": 445 }, { "epoch": 1.1131725417439704, "grad_norm": 2.181389331817627, "learning_rate": 3.489538239785456e-05, "logits/chosen": 0.6259741187095642, "logits/rejected": 0.7684758901596069, "logps/chosen": -0.8544806241989136, "logps/rejected": -7.466407775878906, "loss": 0.8722, "odds_ratio_loss": 0.17718282341957092, "rewards/accuracies": 0.875, "rewards/chosen": -0.08544807136058807, "rewards/margins": 0.661192774772644, "rewards/rejected": -0.7466408610343933, "sft_loss": 0.8544806241989136, "step": 450 }, { "epoch": 1.1255411255411256, "grad_norm": 2.0814061164855957, "learning_rate": 3.459701230736507e-05, "logits/chosen": 0.6075058579444885, "logits/rejected": 0.7201958298683167, "logps/chosen": -0.8551079630851746, "logps/rejected": -6.701899528503418, "loss": 0.879, "odds_ratio_loss": 0.2389557808637619, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.08551079779863358, "rewards/margins": 0.5846792459487915, "rewards/rejected": -0.6701899766921997, "sft_loss": 0.8551079630851746, "step": 455 }, { "epoch": 1.1379097093382808, "grad_norm": 2.2949137687683105, "learning_rate": 3.4297030218918534e-05, "logits/chosen": 0.5604380369186401, "logits/rejected": 0.7003478407859802, "logps/chosen": -0.8521453142166138, "logps/rejected": -6.850298881530762, "loss": 0.8764, "odds_ratio_loss": 0.24294734001159668, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.0852145329117775, "rewards/margins": 0.5998154282569885, "rewards/rejected": -0.6850299835205078, "sft_loss": 0.8521453142166138, "step": 460 }, { "epoch": 1.150278293135436, "grad_norm": 2.3366565704345703, "learning_rate": 3.3995486520125025e-05, "logits/chosen": 0.6330605149269104, "logits/rejected": 0.7762165069580078, "logps/chosen": -0.9221884608268738, "logps/rejected": -7.140595436096191, "loss": 0.9452, "odds_ratio_loss": 0.23045647144317627, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09221886098384857, "rewards/margins": 0.6218405961990356, "rewards/rejected": -0.7140594720840454, "sft_loss": 0.9221884608268738, "step": 465 }, { "epoch": 1.1626468769325913, "grad_norm": 2.145811080932617, "learning_rate": 3.369243186089627e-05, "logits/chosen": 0.7138184309005737, "logits/rejected": 0.8505274653434753, "logps/chosen": -0.8754245042800903, "logps/rejected": -7.041708946228027, "loss": 0.9009, "odds_ratio_loss": 0.25467249751091003, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.08754245936870575, "rewards/margins": 0.6166284680366516, "rewards/rejected": -0.7041709423065186, "sft_loss": 0.8754245042800903, "step": 470 }, { "epoch": 1.1750154607297465, "grad_norm": 2.238600492477417, "learning_rate": 3.3387917144938124e-05, "logits/chosen": 0.6559349894523621, "logits/rejected": 0.823254406452179, "logps/chosen": -0.9376600384712219, "logps/rejected": -7.695790767669678, "loss": 0.9594, "odds_ratio_loss": 0.21776218712329865, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.0937659963965416, "rewards/margins": 0.6758131384849548, "rewards/rejected": -0.7695791721343994, "sft_loss": 0.9376600384712219, "step": 475 }, { "epoch": 1.1873840445269017, "grad_norm": 2.1797354221343994, "learning_rate": 3.308199352120032e-05, "logits/chosen": 0.6254745721817017, "logits/rejected": 0.7800902724266052, "logps/chosen": -0.8817620277404785, "logps/rejected": -7.588588714599609, "loss": 0.904, "odds_ratio_loss": 0.22229580581188202, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.08817620575428009, "rewards/margins": 0.6706826090812683, "rewards/rejected": -0.758858859539032, "sft_loss": 0.8817620277404785, "step": 480 }, { "epoch": 1.199752628324057, "grad_norm": 2.082780122756958, "learning_rate": 3.277471237528502e-05, "logits/chosen": 0.6344858407974243, "logits/rejected": 0.8199793696403503, "logps/chosen": -0.8981841206550598, "logps/rejected": -8.465741157531738, "loss": 0.9165, "odds_ratio_loss": 0.18266572058200836, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.08981841802597046, "rewards/margins": 0.7567557096481323, "rewards/rejected": -0.8465741276741028, "sft_loss": 0.8981841206550598, "step": 485 }, { "epoch": 1.2121212121212122, "grad_norm": 2.1837258338928223, "learning_rate": 3.2466125320815666e-05, "logits/chosen": 0.6689679622650146, "logits/rejected": 0.793313205242157, "logps/chosen": -0.833775520324707, "logps/rejected": -7.581145286560059, "loss": 0.8554, "odds_ratio_loss": 0.21656641364097595, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.08337755501270294, "rewards/margins": 0.6747370362281799, "rewards/rejected": -0.7581145763397217, "sft_loss": 0.833775520324707, "step": 490 }, { "epoch": 1.2244897959183674, "grad_norm": 4.293242931365967, "learning_rate": 3.215628419076752e-05, "logits/chosen": 0.7015396356582642, "logits/rejected": 0.8585187792778015, "logps/chosen": -0.8916020393371582, "logps/rejected": -7.45449686050415, "loss": 0.9161, "odds_ratio_loss": 0.24465863406658173, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.08916020393371582, "rewards/margins": 0.6562893986701965, "rewards/rejected": -0.7454496622085571, "sft_loss": 0.8916020393371582, "step": 495 }, { "epoch": 1.2368583797155226, "grad_norm": 2.5086278915405273, "learning_rate": 3.1845241028761305e-05, "logits/chosen": 0.6596111059188843, "logits/rejected": 0.8187766075134277, "logps/chosen": -0.8951088190078735, "logps/rejected": -7.086970329284668, "loss": 0.9215, "odds_ratio_loss": 0.26387444138526917, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.08951088786125183, "rewards/margins": 0.619186282157898, "rewards/rejected": -0.7086970806121826, "sft_loss": 0.8951088190078735, "step": 500 }, { "epoch": 1.2492269635126778, "grad_norm": 2.287677049636841, "learning_rate": 3.153304808032152e-05, "logits/chosen": 0.6520213484764099, "logits/rejected": 0.7862821817398071, "logps/chosen": -0.8919089436531067, "logps/rejected": -6.897116184234619, "loss": 0.9147, "odds_ratio_loss": 0.22821149230003357, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.08919088542461395, "rewards/margins": 0.600520670413971, "rewards/rejected": -0.6897115111351013, "sft_loss": 0.8919089436531067, "step": 505 }, { "epoch": 1.261595547309833, "grad_norm": 2.418534755706787, "learning_rate": 3.121975778410084e-05, "logits/chosen": 0.6674580574035645, "logits/rejected": 0.8278300166130066, "logps/chosen": -0.8698962926864624, "logps/rejected": -7.887909889221191, "loss": 0.8858, "odds_ratio_loss": 0.15918026864528656, "rewards/accuracies": 0.875, "rewards/chosen": -0.08698964864015579, "rewards/margins": 0.7018013000488281, "rewards/rejected": -0.7887909412384033, "sft_loss": 0.8698962926864624, "step": 510 }, { "epoch": 1.2739641311069883, "grad_norm": 2.298325777053833, "learning_rate": 3.0905422763072064e-05, "logits/chosen": 0.6976498961448669, "logits/rejected": 0.8682432174682617, "logps/chosen": -0.8866313695907593, "logps/rejected": -7.854434967041016, "loss": 0.9015, "odds_ratio_loss": 0.14857998490333557, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.08866313844919205, "rewards/margins": 0.6967803239822388, "rewards/rejected": -0.7854434251785278, "sft_loss": 0.8866313695907593, "step": 515 }, { "epoch": 1.2863327149041435, "grad_norm": 2.402522563934326, "learning_rate": 3.0590095815689066e-05, "logits/chosen": 0.6237742304801941, "logits/rejected": 0.7805956602096558, "logps/chosen": -0.9254236221313477, "logps/rejected": -7.266918182373047, "loss": 0.9498, "odds_ratio_loss": 0.2437683790922165, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.092542365193367, "rewards/margins": 0.634149432182312, "rewards/rejected": -0.7266918420791626, "sft_loss": 0.9254236221313477, "step": 520 }, { "epoch": 1.2987012987012987, "grad_norm": 2.30253267288208, "learning_rate": 3.027382990701833e-05, "logits/chosen": 0.6360823512077332, "logits/rejected": 0.8293731808662415, "logps/chosen": -0.8651562929153442, "logps/rejected": -8.601655960083008, "loss": 0.8808, "odds_ratio_loss": 0.15670491755008698, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.0865156278014183, "rewards/margins": 0.7736499905586243, "rewards/rejected": -0.8601655960083008, "sft_loss": 0.8651562929153442, "step": 525 }, { "epoch": 1.311069882498454, "grad_norm": 2.342336893081665, "learning_rate": 2.9956678159842487e-05, "logits/chosen": 0.7011769413948059, "logits/rejected": 0.8781753778457642, "logps/chosen": -0.8656901121139526, "logps/rejected": -8.424004554748535, "loss": 0.8801, "odds_ratio_loss": 0.143966943025589, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.08656902611255646, "rewards/margins": 0.7558314204216003, "rewards/rejected": -0.8424005508422852, "sft_loss": 0.8656901121139526, "step": 530 }, { "epoch": 1.3234384662956091, "grad_norm": 2.6890807151794434, "learning_rate": 2.970235475210599e-05, "logits/chosen": 0.7069934010505676, "logits/rejected": 0.8731686472892761, "logps/chosen": -0.9422451853752136, "logps/rejected": -7.681105136871338, "loss": 0.9641, "odds_ratio_loss": 0.21833665668964386, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.09422452747821808, "rewards/margins": 0.6738860607147217, "rewards/rejected": -0.768110454082489, "sft_loss": 0.9422451853752136, "step": 535 }, { "epoch": 1.3358070500927643, "grad_norm": 2.2927892208099365, "learning_rate": 2.9383742834269197e-05, "logits/chosen": 0.6168115139007568, "logits/rejected": 0.8480877876281738, "logps/chosen": -0.875246524810791, "logps/rejected": -9.011473655700684, "loss": 0.8931, "odds_ratio_loss": 0.17843112349510193, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.0875246524810791, "rewards/margins": 0.8136228322982788, "rewards/rejected": -0.9011474847793579, "sft_loss": 0.875246524810791, "step": 540 }, { "epoch": 1.3481756338899196, "grad_norm": 2.1677441596984863, "learning_rate": 2.9064394584721378e-05, "logits/chosen": 0.6378396153450012, "logits/rejected": 0.8511786460876465, "logps/chosen": -0.8902000188827515, "logps/rejected": -8.046695709228516, "loss": 0.9132, "odds_ratio_loss": 0.23030586540699005, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0890200138092041, "rewards/margins": 0.7156495451927185, "rewards/rejected": -0.8046695590019226, "sft_loss": 0.8902000188827515, "step": 545 }, { "epoch": 1.3605442176870748, "grad_norm": 2.2807095050811768, "learning_rate": 2.874436364398204e-05, "logits/chosen": 0.6105271577835083, "logits/rejected": 0.7862295508384705, "logps/chosen": -0.8917051553726196, "logps/rejected": -6.716872215270996, "loss": 0.9177, "odds_ratio_loss": 0.25969168543815613, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.08917051553726196, "rewards/margins": 0.5825167298316956, "rewards/rejected": -0.671687126159668, "sft_loss": 0.8917051553726196, "step": 550 }, { "epoch": 1.37291280148423, "grad_norm": 5.808011054992676, "learning_rate": 2.8423703767241456e-05, "logits/chosen": 0.558686375617981, "logits/rejected": 0.7322561144828796, "logps/chosen": -0.8883673548698425, "logps/rejected": -7.184727668762207, "loss": 0.9119, "odds_ratio_loss": 0.23538272082805634, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.08883674442768097, "rewards/margins": 0.6296359896659851, "rewards/rejected": -0.7184727787971497, "sft_loss": 0.8883673548698425, "step": 555 }, { "epoch": 1.3852813852813852, "grad_norm": 2.176337718963623, "learning_rate": 2.8102468815331485e-05, "logits/chosen": 0.6053371429443359, "logits/rejected": 0.7913026809692383, "logps/chosen": -0.8491285443305969, "logps/rejected": -7.447757720947266, "loss": 0.87, "odds_ratio_loss": 0.20908208191394806, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.08491285145282745, "rewards/margins": 0.6598628759384155, "rewards/rejected": -0.7447757720947266, "sft_loss": 0.8491285443305969, "step": 560 }, { "epoch": 1.3976499690785404, "grad_norm": 2.307225465774536, "learning_rate": 2.7780712745678627e-05, "logits/chosen": 0.5974937081336975, "logits/rejected": 0.7808493375778198, "logps/chosen": -0.8643081784248352, "logps/rejected": -9.006669998168945, "loss": 0.8749, "odds_ratio_loss": 0.10602563619613647, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.08643081039190292, "rewards/margins": 0.8142361640930176, "rewards/rejected": -0.9006668925285339, "sft_loss": 0.8643081784248352, "step": 565 }, { "epoch": 1.4100185528756957, "grad_norm": 2.2039754390716553, "learning_rate": 2.745848960324087e-05, "logits/chosen": 0.6687822341918945, "logits/rejected": 0.8781832456588745, "logps/chosen": -0.8760672807693481, "logps/rejected": -8.706890106201172, "loss": 0.8889, "odds_ratio_loss": 0.12820127606391907, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.08760674297809601, "rewards/margins": 0.7830823063850403, "rewards/rejected": -0.8706890344619751, "sft_loss": 0.8760672807693481, "step": 570 }, { "epoch": 1.4223871366728509, "grad_norm": 2.283249855041504, "learning_rate": 2.713585351142982e-05, "logits/chosen": 0.660967230796814, "logits/rejected": 0.8457085490226746, "logps/chosen": -0.8937605023384094, "logps/rejected": -7.70906925201416, "loss": 0.9173, "odds_ratio_loss": 0.2353508025407791, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.0893760472536087, "rewards/margins": 0.6815308332443237, "rewards/rejected": -0.7709068655967712, "sft_loss": 0.8937605023384094, "step": 575 }, { "epoch": 1.434755720470006, "grad_norm": 2.2526354789733887, "learning_rate": 2.6812858663019668e-05, "logits/chosen": 0.6724656224250793, "logits/rejected": 0.9433199167251587, "logps/chosen": -0.8368024826049805, "logps/rejected": -9.38489818572998, "loss": 0.8483, "odds_ratio_loss": 0.1150098666548729, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.08368024975061417, "rewards/margins": 0.8548096418380737, "rewards/rejected": -0.9384899139404297, "sft_loss": 0.8368024826049805, "step": 580 }, { "epoch": 1.4471243042671613, "grad_norm": 2.239112615585327, "learning_rate": 2.6489559311044487e-05, "logits/chosen": 0.6206086874008179, "logits/rejected": 0.7730298042297363, "logps/chosen": -0.8205019235610962, "logps/rejected": -7.061163425445557, "loss": 0.8424, "odds_ratio_loss": 0.2186129093170166, "rewards/accuracies": 0.875, "rewards/chosen": -0.08205018937587738, "rewards/margins": 0.624066174030304, "rewards/rejected": -0.7061163783073425, "sft_loss": 0.8205019235610962, "step": 585 }, { "epoch": 1.4594928880643168, "grad_norm": 2.379271984100342, "learning_rate": 2.616600975968544e-05, "logits/chosen": 0.5922850966453552, "logits/rejected": 0.7625475525856018, "logps/chosen": -0.9065483808517456, "logps/rejected": -7.275846004486084, "loss": 0.9304, "odds_ratio_loss": 0.23856253921985626, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.09065484255552292, "rewards/margins": 0.6369297504425049, "rewards/rejected": -0.7275846600532532, "sft_loss": 0.9065483808517456, "step": 590 }, { "epoch": 1.4718614718614718, "grad_norm": 5.9870452880859375, "learning_rate": 2.5842264355149354e-05, "logits/chosen": 0.5618978142738342, "logits/rejected": 0.6911571621894836, "logps/chosen": -0.8704769015312195, "logps/rejected": -7.3395256996154785, "loss": 0.8923, "odds_ratio_loss": 0.21864548325538635, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.08704768866300583, "rewards/margins": 0.6469048857688904, "rewards/rejected": -0.7339526414871216, "sft_loss": 0.8704769015312195, "step": 595 }, { "epoch": 1.4842300556586272, "grad_norm": 2.622652769088745, "learning_rate": 2.551837747654027e-05, "logits/chosen": 0.59326171875, "logits/rejected": 0.6885969638824463, "logps/chosen": -0.8917046785354614, "logps/rejected": -6.431692600250244, "loss": 0.9143, "odds_ratio_loss": 0.22567525506019592, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.08917047083377838, "rewards/margins": 0.5539987683296204, "rewards/rejected": -0.6431692838668823, "sft_loss": 0.8917046785354614, "step": 600 }, { "epoch": 1.4965986394557822, "grad_norm": 2.3183135986328125, "learning_rate": 2.519440352672543e-05, "logits/chosen": 0.5889140963554382, "logits/rejected": 0.7388980984687805, "logps/chosen": -0.8425744771957397, "logps/rejected": -7.886984825134277, "loss": 0.8615, "odds_ratio_loss": 0.18958750367164612, "rewards/accuracies": 0.875, "rewards/chosen": -0.08425744622945786, "rewards/margins": 0.7044410705566406, "rewards/rejected": -0.7886985540390015, "sft_loss": 0.8425744771957397, "step": 605 }, { "epoch": 1.5089672232529376, "grad_norm": 2.4213876724243164, "learning_rate": 2.4870396923197348e-05, "logits/chosen": 0.5814652442932129, "logits/rejected": 0.7500850558280945, "logps/chosen": -0.882533073425293, "logps/rejected": -7.137560844421387, "loss": 0.905, "odds_ratio_loss": 0.22469818592071533, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.08825330436229706, "rewards/margins": 0.6255027651786804, "rewards/rejected": -0.7137560844421387, "sft_loss": 0.882533073425293, "step": 610 }, { "epoch": 1.5213358070500926, "grad_norm": 2.504164934158325, "learning_rate": 2.4546412088933308e-05, "logits/chosen": 0.6600581407546997, "logits/rejected": 0.817254364490509, "logps/chosen": -0.8887017965316772, "logps/rejected": -8.124031066894531, "loss": 0.904, "odds_ratio_loss": 0.15254178643226624, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.08887019008398056, "rewards/margins": 0.723533034324646, "rewards/rejected": -0.8124032020568848, "sft_loss": 0.8887017965316772, "step": 615 }, { "epoch": 1.533704390847248, "grad_norm": 2.5267105102539062, "learning_rate": 2.422250344325408e-05, "logits/chosen": 0.6468321681022644, "logits/rejected": 0.832545280456543, "logps/chosen": -0.8747802972793579, "logps/rejected": -8.400070190429688, "loss": 0.888, "odds_ratio_loss": 0.13202178478240967, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.08747803419828415, "rewards/margins": 0.752528965473175, "rewards/rejected": -0.840006947517395, "sft_loss": 0.8747802972793579, "step": 620 }, { "epoch": 1.546072974644403, "grad_norm": 2.4034476280212402, "learning_rate": 2.389872539268309e-05, "logits/chosen": 0.6943817138671875, "logits/rejected": 0.8369506597518921, "logps/chosen": -0.9127315282821655, "logps/rejected": -7.5701003074646, "loss": 0.9324, "odds_ratio_loss": 0.196578711271286, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09127315878868103, "rewards/margins": 0.6657368540763855, "rewards/rejected": -0.7570099830627441, "sft_loss": 0.9127315282821655, "step": 625 }, { "epoch": 1.5584415584415585, "grad_norm": 2.367093801498413, "learning_rate": 2.3575132321807883e-05, "logits/chosen": 0.6322135925292969, "logits/rejected": 0.7700641751289368, "logps/chosen": -0.9354011416435242, "logps/rejected": -7.08158016204834, "loss": 0.9591, "odds_ratio_loss": 0.23710572719573975, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.09354011714458466, "rewards/margins": 0.6146179437637329, "rewards/rejected": -0.708158016204834, "sft_loss": 0.9354011416435242, "step": 630 }, { "epoch": 1.5708101422387135, "grad_norm": 2.523430109024048, "learning_rate": 2.3251778584145216e-05, "logits/chosen": 0.5951679944992065, "logits/rejected": 0.747269868850708, "logps/chosen": -0.8453165888786316, "logps/rejected": -7.124324798583984, "loss": 0.8653, "odds_ratio_loss": 0.19997093081474304, "rewards/accuracies": 0.875, "rewards/chosen": -0.08453166484832764, "rewards/margins": 0.6279007792472839, "rewards/rejected": -0.7124325037002563, "sft_loss": 0.8453165888786316, "step": 635 }, { "epoch": 1.583178726035869, "grad_norm": 2.2429511547088623, "learning_rate": 2.2928718493011348e-05, "logits/chosen": 0.6156531572341919, "logits/rejected": 0.7733734846115112, "logps/chosen": -0.864497184753418, "logps/rejected": -7.778620719909668, "loss": 0.8815, "odds_ratio_loss": 0.16996565461158752, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.08644971251487732, "rewards/margins": 0.6914123296737671, "rewards/rejected": -0.7778620719909668, "sft_loss": 0.864497184753418, "step": 640 }, { "epoch": 1.595547309833024, "grad_norm": 2.4388530254364014, "learning_rate": 2.2606006312399076e-05, "logits/chosen": 0.6500285863876343, "logits/rejected": 0.8076976537704468, "logps/chosen": -0.8578665852546692, "logps/rejected": -7.3319807052612305, "loss": 0.8769, "odds_ratio_loss": 0.19020141661167145, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.08578665554523468, "rewards/margins": 0.6474114060401917, "rewards/rejected": -0.7331980466842651, "sft_loss": 0.8578665852546692, "step": 645 }, { "epoch": 1.6079158936301794, "grad_norm": 2.372934579849243, "learning_rate": 2.2283696247863135e-05, "logits/chosen": 0.6975225210189819, "logits/rejected": 0.8702699542045593, "logps/chosen": -0.8257290124893188, "logps/rejected": -8.018144607543945, "loss": 0.8416, "odds_ratio_loss": 0.1590322107076645, "rewards/accuracies": 0.9375, "rewards/chosen": -0.08257289975881577, "rewards/margins": 0.7192414999008179, "rewards/rejected": -0.8018143773078918, "sft_loss": 0.8257290124893188, "step": 650 }, { "epoch": 1.6202844774273346, "grad_norm": 2.378767251968384, "learning_rate": 2.1961842437415338e-05, "logits/chosen": 0.6672301292419434, "logits/rejected": 0.8452484011650085, "logps/chosen": -0.8483393788337708, "logps/rejected": -7.967811584472656, "loss": 0.868, "odds_ratio_loss": 0.19619759917259216, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.08483394235372543, "rewards/margins": 0.7119472026824951, "rewards/rejected": -0.7967811822891235, "sft_loss": 0.8483393788337708, "step": 655 }, { "epoch": 1.6326530612244898, "grad_norm": 2.3375089168548584, "learning_rate": 2.1640498942431058e-05, "logits/chosen": 0.7114741206169128, "logits/rejected": 0.8869878649711609, "logps/chosen": -0.8728822469711304, "logps/rejected": -7.964688301086426, "loss": 0.8921, "odds_ratio_loss": 0.1922103762626648, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.08728822320699692, "rewards/margins": 0.7091807126998901, "rewards/rejected": -0.79646897315979, "sft_loss": 0.8728822469711304, "step": 660 }, { "epoch": 1.645021645021645, "grad_norm": 2.191344738006592, "learning_rate": 2.1319719738568634e-05, "logits/chosen": 0.7013149261474609, "logits/rejected": 0.8321846127510071, "logps/chosen": -0.8306602239608765, "logps/rejected": -6.830416679382324, "loss": 0.8531, "odds_ratio_loss": 0.22391729056835175, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.08306603133678436, "rewards/margins": 0.5999757051467896, "rewards/rejected": -0.6830417513847351, "sft_loss": 0.8306602239608765, "step": 665 }, { "epoch": 1.6573902288188003, "grad_norm": 2.2089643478393555, "learning_rate": 2.0999558706703156e-05, "logits/chosen": 0.6321022510528564, "logits/rejected": 0.7834582328796387, "logps/chosen": -0.8191835284233093, "logps/rejected": -7.642106056213379, "loss": 0.8374, "odds_ratio_loss": 0.181939959526062, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.08191835135221481, "rewards/margins": 0.6822922229766846, "rewards/rejected": -0.76421058177948, "sft_loss": 0.8191835284233093, "step": 670 }, { "epoch": 1.6697588126159555, "grad_norm": 2.470571517944336, "learning_rate": 2.068006962387614e-05, "logits/chosen": 0.6458392143249512, "logits/rejected": 0.8032709956169128, "logps/chosen": -0.8306704759597778, "logps/rejected": -7.713449954986572, "loss": 0.8496, "odds_ratio_loss": 0.1897248476743698, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.08306704461574554, "rewards/margins": 0.6882779002189636, "rewards/rejected": -0.7713449597358704, "sft_loss": 0.8306704759597778, "step": 675 }, { "epoch": 1.6821273964131107, "grad_norm": 2.199373483657837, "learning_rate": 2.0361306154262668e-05, "logits/chosen": 0.7063317894935608, "logits/rejected": 0.8808048367500305, "logps/chosen": -0.8415877223014832, "logps/rejected": -8.196334838867188, "loss": 0.8592, "odds_ratio_loss": 0.17573602497577667, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.0841587707400322, "rewards/margins": 0.7354747653007507, "rewards/rejected": -0.8196334838867188, "sft_loss": 0.8415877223014832, "step": 680 }, { "epoch": 1.694495980210266, "grad_norm": 2.506664991378784, "learning_rate": 2.0043321840157516e-05, "logits/chosen": 0.6998068690299988, "logits/rejected": 0.8434202075004578, "logps/chosen": -0.8498493432998657, "logps/rejected": -7.3200178146362305, "loss": 0.8705, "odds_ratio_loss": 0.2065567970275879, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.0849849209189415, "rewards/margins": 0.6470169425010681, "rewards/rejected": -0.7320019006729126, "sft_loss": 0.8498493432998657, "step": 685 }, { "epoch": 1.7068645640074211, "grad_norm": 2.324230194091797, "learning_rate": 1.9726170092981675e-05, "logits/chosen": 0.6588650941848755, "logits/rejected": 0.8341137170791626, "logps/chosen": -0.8925081491470337, "logps/rejected": -8.823976516723633, "loss": 0.9104, "odds_ratio_loss": 0.1792716234922409, "rewards/accuracies": 0.875, "rewards/chosen": -0.08925081789493561, "rewards/margins": 0.7931469678878784, "rewards/rejected": -0.8823978304862976, "sft_loss": 0.8925081491470337, "step": 690 }, { "epoch": 1.7192331478045764, "grad_norm": 2.4986751079559326, "learning_rate": 1.9409904184310947e-05, "logits/chosen": 0.6268351674079895, "logits/rejected": 0.803906261920929, "logps/chosen": -0.8428052067756653, "logps/rejected": -8.264775276184082, "loss": 0.8614, "odds_ratio_loss": 0.18644191324710846, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.08428052067756653, "rewards/margins": 0.7421970367431641, "rewards/rejected": -0.8264775276184082, "sft_loss": 0.8428052067756653, "step": 695 }, { "epoch": 1.7316017316017316, "grad_norm": 2.3888235092163086, "learning_rate": 1.9094577236927938e-05, "logits/chosen": 0.7057437896728516, "logits/rejected": 0.8605132102966309, "logps/chosen": -0.865081787109375, "logps/rejected": -7.3914475440979, "loss": 0.8898, "odds_ratio_loss": 0.2468404322862625, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.08650818467140198, "rewards/margins": 0.6526366472244263, "rewards/rejected": -0.7391448616981506, "sft_loss": 0.865081787109375, "step": 700 } ], "logging_steps": 5, "max_steps": 1212, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.550794106047037e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }