{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 478, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 11.649026188613895, "learning_rate": 1.0416666666666666e-08, "logits/chosen": -2.408252239227295, "logits/rejected": -2.408294677734375, "logps/chosen": -208.4792022705078, "logps/rejected": -178.0951690673828, "loss": 0.693, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "grad_norm": 10.900219473552317, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.5447659492492676, "logits/rejected": -2.538891315460205, "logps/chosen": -261.517333984375, "logps/rejected": -166.39056396484375, "loss": 0.693, "rewards/accuracies": 0.4027777910232544, "rewards/chosen": 0.00011150226055178791, "rewards/margins": 5.223603511694819e-05, "rewards/rejected": 5.9266225434839725e-05, "step": 10 }, { "epoch": 0.04, "grad_norm": 11.586256170716794, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -2.5193417072296143, "logits/rejected": -2.526468276977539, "logps/chosen": -252.56442260742188, "logps/rejected": -178.0738525390625, "loss": 0.69, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.0030069700442254543, "rewards/margins": 0.006722055375576019, "rewards/rejected": -0.003715085331350565, "step": 20 }, { "epoch": 0.06, "grad_norm": 37.65056149520632, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.4743459224700928, "logits/rejected": -2.4606471061706543, "logps/chosen": -240.6038818359375, "logps/rejected": -181.88919067382812, "loss": 0.6755, "rewards/accuracies": 0.84375, "rewards/chosen": 0.008357289247214794, "rewards/margins": 0.03687674552202225, "rewards/rejected": -0.028519460931420326, "step": 30 }, { "epoch": 0.08, "grad_norm": 11.450052464046935, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.3846631050109863, "logits/rejected": -2.3543992042541504, "logps/chosen": -280.9198913574219, "logps/rejected": -211.95913696289062, "loss": 0.6346, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.06864231824874878, "rewards/margins": 0.12311089038848877, "rewards/rejected": -0.05446857959032059, "step": 40 }, { "epoch": 0.1, "grad_norm": 13.049787778502237, "learning_rate": 4.999733114418725e-07, "logits/chosen": -2.256699323654175, "logits/rejected": -2.269990921020508, "logps/chosen": -250.5127716064453, "logps/rejected": -210.73342895507812, "loss": 0.5816, "rewards/accuracies": 0.78125, "rewards/chosen": 0.005404283292591572, "rewards/margins": 0.28600800037384033, "rewards/rejected": -0.2806037366390228, "step": 50 }, { "epoch": 0.13, "grad_norm": 23.40921657971245, "learning_rate": 4.990398100856366e-07, "logits/chosen": -2.1533896923065186, "logits/rejected": -2.1007308959960938, "logps/chosen": -282.63739013671875, "logps/rejected": -243.5396270751953, "loss": 0.5257, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.08040512353181839, "rewards/margins": 0.459780216217041, "rewards/rejected": -0.540185272693634, "step": 60 }, { "epoch": 0.15, "grad_norm": 18.316826712401, "learning_rate": 4.967775735898179e-07, "logits/chosen": -2.008669137954712, "logits/rejected": -2.016932249069214, "logps/chosen": -298.5509338378906, "logps/rejected": -281.0208435058594, "loss": 0.4813, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.18154878914356232, "rewards/margins": 0.7048229575157166, "rewards/rejected": -0.8863717317581177, "step": 70 }, { "epoch": 0.17, "grad_norm": 17.640568045589177, "learning_rate": 4.931986719649298e-07, "logits/chosen": -1.7213737964630127, "logits/rejected": -1.639634370803833, "logps/chosen": -321.4610900878906, "logps/rejected": -327.48675537109375, "loss": 0.4345, "rewards/accuracies": 0.875, "rewards/chosen": -0.5062848925590515, "rewards/margins": 0.9567643404006958, "rewards/rejected": -1.4630491733551025, "step": 80 }, { "epoch": 0.19, "grad_norm": 29.13663512751633, "learning_rate": 4.883222001996351e-07, "logits/chosen": -1.2982016801834106, "logits/rejected": -1.1025583744049072, "logps/chosen": -372.08209228515625, "logps/rejected": -424.2044372558594, "loss": 0.4001, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.9675124287605286, "rewards/margins": 1.4019938707351685, "rewards/rejected": -2.369506359100342, "step": 90 }, { "epoch": 0.21, "grad_norm": 20.753196262748457, "learning_rate": 4.821741763807186e-07, "logits/chosen": -1.386314034461975, "logits/rejected": -1.0822856426239014, "logps/chosen": -367.71942138671875, "logps/rejected": -453.9052734375, "loss": 0.3617, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.3193985223770142, "rewards/margins": 1.4559285640716553, "rewards/rejected": -2.77532696723938, "step": 100 }, { "epoch": 0.21, "eval_logits/chosen": -1.5665340423583984, "eval_logits/rejected": -1.3192166090011597, "eval_logps/chosen": -371.2482604980469, "eval_logps/rejected": -475.1705017089844, "eval_loss": 0.34448927640914917, "eval_rewards/accuracies": 0.86328125, "eval_rewards/chosen": -1.138149380683899, "eval_rewards/margins": 1.8140134811401367, "eval_rewards/rejected": -2.952162981033325, "eval_runtime": 97.3413, "eval_samples_per_second": 20.546, "eval_steps_per_second": 0.329, "step": 100 }, { "epoch": 0.23, "grad_norm": 18.97759709132058, "learning_rate": 4.747874028753375e-07, "logits/chosen": -1.1811072826385498, "logits/rejected": -0.979759693145752, "logps/chosen": -376.93353271484375, "logps/rejected": -487.2027282714844, "loss": 0.3614, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.3289892673492432, "rewards/margins": 1.6757100820541382, "rewards/rejected": -3.004699230194092, "step": 110 }, { "epoch": 0.25, "grad_norm": 15.72680069184703, "learning_rate": 4.662012913161997e-07, "logits/chosen": -0.8997787237167358, "logits/rejected": -0.6056855916976929, "logps/chosen": -377.9947509765625, "logps/rejected": -490.7288513183594, "loss": 0.3357, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.3285906314849854, "rewards/margins": 1.7274820804595947, "rewards/rejected": -3.056072473526001, "step": 120 }, { "epoch": 0.27, "grad_norm": 22.07740307319267, "learning_rate": 4.5646165232345103e-07, "logits/chosen": -0.7402850389480591, "logits/rejected": -0.2074509561061859, "logps/chosen": -404.84442138671875, "logps/rejected": -538.0674438476562, "loss": 0.3035, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.3628482818603516, "rewards/margins": 2.222651720046997, "rewards/rejected": -3.5854995250701904, "step": 130 }, { "epoch": 0.29, "grad_norm": 23.838607690547235, "learning_rate": 4.456204510851956e-07, "logits/chosen": -0.677836537361145, "logits/rejected": -0.11348800361156464, "logps/chosen": -422.9839782714844, "logps/rejected": -556.4301147460938, "loss": 0.3117, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.539533257484436, "rewards/margins": 2.1120970249176025, "rewards/rejected": -3.651630401611328, "step": 140 }, { "epoch": 0.31, "grad_norm": 27.05325199994427, "learning_rate": 4.337355301007335e-07, "logits/chosen": -0.3853974938392639, "logits/rejected": 0.20319394767284393, "logps/chosen": -417.6180725097656, "logps/rejected": -582.4807739257812, "loss": 0.3038, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7072023153305054, "rewards/margins": 2.395305871963501, "rewards/rejected": -4.102508068084717, "step": 150 }, { "epoch": 0.33, "grad_norm": 17.832180625180946, "learning_rate": 4.2087030056579986e-07, "logits/chosen": -0.3905831575393677, "logits/rejected": 0.32560938596725464, "logps/chosen": -401.91839599609375, "logps/rejected": -541.798583984375, "loss": 0.291, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.3554723262786865, "rewards/margins": 2.1516823768615723, "rewards/rejected": -3.507154941558838, "step": 160 }, { "epoch": 0.36, "grad_norm": 33.740386423043056, "learning_rate": 4.070934040463998e-07, "logits/chosen": -0.7972911596298218, "logits/rejected": 0.013139176182448864, "logps/chosen": -431.9474182128906, "logps/rejected": -614.335205078125, "loss": 0.2818, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.7542915344238281, "rewards/margins": 2.622352361679077, "rewards/rejected": -4.376644134521484, "step": 170 }, { "epoch": 0.38, "grad_norm": 36.32706683293822, "learning_rate": 3.9247834624635404e-07, "logits/chosen": -0.6585810780525208, "logits/rejected": 0.35586631298065186, "logps/chosen": -425.5533752441406, "logps/rejected": -627.4122924804688, "loss": 0.2821, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.5755112171173096, "rewards/margins": 2.9501824378967285, "rewards/rejected": -4.525693416595459, "step": 180 }, { "epoch": 0.4, "grad_norm": 17.303305712606004, "learning_rate": 3.7710310482256523e-07, "logits/chosen": 0.05398033186793327, "logits/rejected": 0.8622593879699707, "logps/chosen": -430.6288146972656, "logps/rejected": -650.1162109375, "loss": 0.2857, "rewards/accuracies": 0.84375, "rewards/chosen": -1.835845947265625, "rewards/margins": 2.724783420562744, "rewards/rejected": -4.560629844665527, "step": 190 }, { "epoch": 0.42, "grad_norm": 25.750253047861463, "learning_rate": 3.610497133404795e-07, "logits/chosen": -0.71821129322052, "logits/rejected": 0.2892279028892517, "logps/chosen": -424.61773681640625, "logps/rejected": -588.4847412109375, "loss": 0.2941, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.6481313705444336, "rewards/margins": 2.587430477142334, "rewards/rejected": -4.235561847686768, "step": 200 }, { "epoch": 0.42, "eval_logits/chosen": -1.0254615545272827, "eval_logits/rejected": -0.29090192914009094, "eval_logps/chosen": -410.46307373046875, "eval_logps/rejected": -639.6044921875, "eval_loss": 0.2594895660877228, "eval_rewards/accuracies": 0.87109375, "eval_rewards/chosen": -1.5302979946136475, "eval_rewards/margins": 3.0662055015563965, "eval_rewards/rejected": -4.596503734588623, "eval_runtime": 97.4272, "eval_samples_per_second": 20.528, "eval_steps_per_second": 0.328, "step": 200 }, { "epoch": 0.44, "grad_norm": 22.833636886609728, "learning_rate": 3.4440382358952115e-07, "logits/chosen": -0.23435378074645996, "logits/rejected": 0.7262285351753235, "logps/chosen": -413.39495849609375, "logps/rejected": -610.934326171875, "loss": 0.2652, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7757046222686768, "rewards/margins": 2.6218485832214355, "rewards/rejected": -4.397553443908691, "step": 210 }, { "epoch": 0.46, "grad_norm": 19.35747892667393, "learning_rate": 3.272542485937368e-07, "logits/chosen": -0.03855214640498161, "logits/rejected": 1.1163934469223022, "logps/chosen": -422.90655517578125, "logps/rejected": -638.5581665039062, "loss": 0.2649, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.665327787399292, "rewards/margins": 2.9846928119659424, "rewards/rejected": -4.650020122528076, "step": 220 }, { "epoch": 0.48, "grad_norm": 21.48535535733013, "learning_rate": 3.096924887558854e-07, "logits/chosen": 0.25134754180908203, "logits/rejected": 1.690708875656128, "logps/chosen": -465.64324951171875, "logps/rejected": -718.0191650390625, "loss": 0.2576, "rewards/accuracies": 0.90625, "rewards/chosen": -1.8850457668304443, "rewards/margins": 3.3224689960479736, "rewards/rejected": -5.20751428604126, "step": 230 }, { "epoch": 0.5, "grad_norm": 21.422183287654935, "learning_rate": 2.9181224366319943e-07, "logits/chosen": 0.8468208312988281, "logits/rejected": 2.1126351356506348, "logps/chosen": -494.220703125, "logps/rejected": -765.5153198242188, "loss": 0.2597, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.599130153656006, "rewards/margins": 3.3152058124542236, "rewards/rejected": -5.914336204528809, "step": 240 }, { "epoch": 0.52, "grad_norm": 20.426996979966383, "learning_rate": 2.7370891215954565e-07, "logits/chosen": -0.12486964464187622, "logits/rejected": 1.0274794101715088, "logps/chosen": -427.1630859375, "logps/rejected": -634.4889526367188, "loss": 0.2648, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.721609115600586, "rewards/margins": 2.7955689430236816, "rewards/rejected": -4.517177581787109, "step": 250 }, { "epoch": 0.54, "grad_norm": 25.70991400557029, "learning_rate": 2.55479083351317e-07, "logits/chosen": -0.1112385243177414, "logits/rejected": 1.0832737684249878, "logps/chosen": -449.45709228515625, "logps/rejected": -671.4852294921875, "loss": 0.2494, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8507035970687866, "rewards/margins": 3.2304184436798096, "rewards/rejected": -5.081121921539307, "step": 260 }, { "epoch": 0.56, "grad_norm": 26.743751356983804, "learning_rate": 2.3722002126275822e-07, "logits/chosen": -0.3570239245891571, "logits/rejected": 0.8737660646438599, "logps/chosen": -427.79931640625, "logps/rejected": -680.0591430664062, "loss": 0.2351, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.8343093395233154, "rewards/margins": 3.1806130409240723, "rewards/rejected": -5.014922142028809, "step": 270 }, { "epoch": 0.59, "grad_norm": 20.437919430365636, "learning_rate": 2.19029145890313e-07, "logits/chosen": -0.14857754111289978, "logits/rejected": 0.7723418474197388, "logps/chosen": -476.52099609375, "logps/rejected": -727.02734375, "loss": 0.2542, "rewards/accuracies": 0.84375, "rewards/chosen": -2.1171650886535645, "rewards/margins": 3.235008716583252, "rewards/rejected": -5.352174282073975, "step": 280 }, { "epoch": 0.61, "grad_norm": 18.45732619902188, "learning_rate": 2.0100351342479216e-07, "logits/chosen": -0.04501671344041824, "logits/rejected": 0.9899295568466187, "logps/chosen": -446.81451416015625, "logps/rejected": -742.0169677734375, "loss": 0.2265, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.055858850479126, "rewards/margins": 3.6457011699676514, "rewards/rejected": -5.701560020446777, "step": 290 }, { "epoch": 0.63, "grad_norm": 18.065200464485685, "learning_rate": 1.8323929841460178e-07, "logits/chosen": -0.25825151801109314, "logits/rejected": 1.0551276206970215, "logps/chosen": -482.5679626464844, "logps/rejected": -777.6588134765625, "loss": 0.259, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.0962753295898438, "rewards/margins": 3.666637897491455, "rewards/rejected": -5.762913227081299, "step": 300 }, { "epoch": 0.63, "eval_logits/chosen": -0.08032596111297607, "eval_logits/rejected": 1.257252812385559, "eval_logps/chosen": -480.0015563964844, "eval_logps/rejected": -791.1058959960938, "eval_loss": 0.21872717142105103, "eval_rewards/accuracies": 0.89453125, "eval_rewards/chosen": -2.2256827354431152, "eval_rewards/margins": 3.8858344554901123, "eval_rewards/rejected": -6.111516952514648, "eval_runtime": 97.4838, "eval_samples_per_second": 20.516, "eval_steps_per_second": 0.328, "step": 300 }, { "epoch": 0.65, "grad_norm": 52.672419540257096, "learning_rate": 1.6583128063291573e-07, "logits/chosen": -0.3141949772834778, "logits/rejected": 0.9004285931587219, "logps/chosen": -475.15936279296875, "logps/rejected": -737.10009765625, "loss": 0.2596, "rewards/accuracies": 0.90625, "rewards/chosen": -2.111830949783325, "rewards/margins": 3.3903605937957764, "rewards/rejected": -5.502191543579102, "step": 310 }, { "epoch": 0.67, "grad_norm": 22.65476006635999, "learning_rate": 1.488723393865766e-07, "logits/chosen": -0.16484542191028595, "logits/rejected": 1.090689778327942, "logps/chosen": -455.7721252441406, "logps/rejected": -757.0924072265625, "loss": 0.2429, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.08683180809021, "rewards/margins": 3.6439883708953857, "rewards/rejected": -5.730820178985596, "step": 320 }, { "epoch": 0.69, "grad_norm": 19.80833079454062, "learning_rate": 1.3245295796480788e-07, "logits/chosen": -0.27468693256378174, "logits/rejected": 1.061683177947998, "logps/chosen": -481.45501708984375, "logps/rejected": -751.7589721679688, "loss": 0.2279, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.233105421066284, "rewards/margins": 3.570781707763672, "rewards/rejected": -5.803887367248535, "step": 330 }, { "epoch": 0.71, "grad_norm": 24.881750772146, "learning_rate": 1.1666074087171627e-07, "logits/chosen": -0.03138185292482376, "logits/rejected": 1.0509425401687622, "logps/chosen": -470.1802673339844, "logps/rejected": -690.557373046875, "loss": 0.237, "rewards/accuracies": 0.875, "rewards/chosen": -2.2647526264190674, "rewards/margins": 3.035360097885132, "rewards/rejected": -5.300112724304199, "step": 340 }, { "epoch": 0.73, "grad_norm": 19.587851322719366, "learning_rate": 1.0157994641835734e-07, "logits/chosen": 0.006643450353294611, "logits/rejected": 1.0511295795440674, "logps/chosen": -478.51055908203125, "logps/rejected": -806.5072631835938, "loss": 0.2256, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.348606586456299, "rewards/margins": 3.6702816486358643, "rewards/rejected": -6.018888473510742, "step": 350 }, { "epoch": 0.75, "grad_norm": 22.679114876876472, "learning_rate": 8.729103716819111e-08, "logits/chosen": -0.024983350187540054, "logits/rejected": 1.071063756942749, "logps/chosen": -459.17669677734375, "logps/rejected": -731.5552368164062, "loss": 0.2495, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.0910515785217285, "rewards/margins": 3.42362642288208, "rewards/rejected": -5.514677047729492, "step": 360 }, { "epoch": 0.77, "grad_norm": 24.62831310204124, "learning_rate": 7.387025063449081e-08, "logits/chosen": -0.07435999810695648, "logits/rejected": 1.1522341966629028, "logps/chosen": -457.25128173828125, "logps/rejected": -728.47412109375, "loss": 0.234, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.266648292541504, "rewards/margins": 3.339157819747925, "rewards/rejected": -5.60580587387085, "step": 370 }, { "epoch": 0.79, "grad_norm": 18.56178014954704, "learning_rate": 6.138919252022435e-08, "logits/chosen": -0.3609544634819031, "logits/rejected": 0.9311397671699524, "logps/chosen": -469.6280212402344, "logps/rejected": -802.5978393554688, "loss": 0.2329, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.1230244636535645, "rewards/margins": 4.024425506591797, "rewards/rejected": -6.147449016571045, "step": 380 }, { "epoch": 0.82, "grad_norm": 32.926290807333665, "learning_rate": 4.991445467064689e-08, "logits/chosen": -0.012720714323222637, "logits/rejected": 1.3005478382110596, "logps/chosen": -489.65802001953125, "logps/rejected": -779.3419799804688, "loss": 0.2265, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.43365740776062, "rewards/margins": 3.560826539993286, "rewards/rejected": -5.994483947753906, "step": 390 }, { "epoch": 0.84, "grad_norm": 30.149198169987162, "learning_rate": 3.9507259776993954e-08, "logits/chosen": 0.26025086641311646, "logits/rejected": 1.3578455448150635, "logps/chosen": -470.298095703125, "logps/rejected": -760.2213134765625, "loss": 0.2268, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.2305097579956055, "rewards/margins": 3.6479294300079346, "rewards/rejected": -5.878438472747803, "step": 400 }, { "epoch": 0.84, "eval_logits/chosen": -0.19649244844913483, "eval_logits/rejected": 1.227736234664917, "eval_logps/chosen": -477.7560729980469, "eval_logps/rejected": -812.5330810546875, "eval_loss": 0.2144031673669815, "eval_rewards/accuracies": 0.8984375, "eval_rewards/chosen": -2.2032277584075928, "eval_rewards/margins": 4.122560977935791, "eval_rewards/rejected": -6.325788974761963, "eval_runtime": 97.384, "eval_samples_per_second": 20.537, "eval_steps_per_second": 0.329, "step": 400 }, { "epoch": 0.86, "grad_norm": 15.686531185394536, "learning_rate": 3.022313472693447e-08, "logits/chosen": -0.13663654029369354, "logits/rejected": 1.1233164072036743, "logps/chosen": -492.34588623046875, "logps/rejected": -787.8416137695312, "loss": 0.2073, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.1595890522003174, "rewards/margins": 3.8674533367156982, "rewards/rejected": -6.027042388916016, "step": 410 }, { "epoch": 0.88, "grad_norm": 25.36773289746514, "learning_rate": 2.2111614344599684e-08, "logits/chosen": 0.07713554799556732, "logits/rejected": 1.1611943244934082, "logps/chosen": -468.55523681640625, "logps/rejected": -737.3845825195312, "loss": 0.2294, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.246133804321289, "rewards/margins": 3.3159337043762207, "rewards/rejected": -5.562067985534668, "step": 420 }, { "epoch": 0.9, "grad_norm": 23.64735863277161, "learning_rate": 1.521597710086439e-08, "logits/chosen": -0.11199776083230972, "logits/rejected": 1.2544041872024536, "logps/chosen": -487.00787353515625, "logps/rejected": -748.0345458984375, "loss": 0.2283, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.158412218093872, "rewards/margins": 3.493891954421997, "rewards/rejected": -5.652304649353027, "step": 430 }, { "epoch": 0.92, "grad_norm": 32.07046590554996, "learning_rate": 9.57301420397924e-09, "logits/chosen": -0.2779064476490021, "logits/rejected": 0.990594744682312, "logps/chosen": -476.5415954589844, "logps/rejected": -816.4949340820312, "loss": 0.2335, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.1302719116210938, "rewards/margins": 4.236142158508301, "rewards/rejected": -6.366414546966553, "step": 440 }, { "epoch": 0.94, "grad_norm": 29.966664104398923, "learning_rate": 5.212833302556258e-09, "logits/chosen": -0.2724097967147827, "logits/rejected": 1.02344810962677, "logps/chosen": -477.5227966308594, "logps/rejected": -801.6556396484375, "loss": 0.235, "rewards/accuracies": 0.84375, "rewards/chosen": -2.263077735900879, "rewards/margins": 3.993912935256958, "rewards/rejected": -6.256990432739258, "step": 450 }, { "epoch": 0.96, "grad_norm": 54.30402743121173, "learning_rate": 2.158697848236607e-09, "logits/chosen": -0.18335244059562683, "logits/rejected": 0.9830573201179504, "logps/chosen": -479.820556640625, "logps/rejected": -772.7644653320312, "loss": 0.2201, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.1699745655059814, "rewards/margins": 3.6132774353027344, "rewards/rejected": -5.783252239227295, "step": 460 }, { "epoch": 0.98, "grad_norm": 24.80298516842009, "learning_rate": 4.269029751107489e-10, "logits/chosen": -0.0621672198176384, "logits/rejected": 1.1333558559417725, "logps/chosen": -449.02386474609375, "logps/rejected": -767.277099609375, "loss": 0.2269, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.1144473552703857, "rewards/margins": 3.668858289718628, "rewards/rejected": -5.783305644989014, "step": 470 }, { "epoch": 1.0, "step": 478, "total_flos": 0.0, "train_loss": 0.3183088973476298, "train_runtime": 7617.3811, "train_samples_per_second": 8.026, "train_steps_per_second": 0.063 } ], "logging_steps": 10, "max_steps": 478, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }