{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994242947610823, "eval_steps": 100, "global_step": 868, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 23.51828299790517, "learning_rate": 5.747126436781609e-09, "logits/chosen": -1.865264654159546, "logits/rejected": -1.587956428527832, "logps/chosen": -204.58331298828125, "logps/rejected": -154.1517333984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "grad_norm": 24.149515970375678, "learning_rate": 5.747126436781609e-08, "logits/chosen": -1.90481698513031, "logits/rejected": -1.8536584377288818, "logps/chosen": -213.41416931152344, "logps/rejected": -191.33694458007812, "loss": 0.6932, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": -1.9929786503780633e-05, "rewards/margins": 0.00017105697770603, "rewards/rejected": -0.00019098672783002257, "step": 10 }, { "epoch": 0.02, "grad_norm": 23.563731768256098, "learning_rate": 1.1494252873563217e-07, "logits/chosen": -1.9680726528167725, "logits/rejected": -1.798654317855835, "logps/chosen": -255.55111694335938, "logps/rejected": -189.6189727783203, "loss": 0.6921, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.0016116431215777993, "rewards/margins": 0.002336590550839901, "rewards/rejected": -0.0007249473710544407, "step": 20 }, { "epoch": 0.03, "grad_norm": 23.038450073297746, "learning_rate": 1.7241379310344828e-07, "logits/chosen": -1.8938862085342407, "logits/rejected": -1.8228662014007568, "logps/chosen": -212.65322875976562, "logps/rejected": -194.4668426513672, "loss": 0.6878, "rewards/accuracies": 0.75, "rewards/chosen": 0.011502735316753387, "rewards/margins": 0.014704583212733269, "rewards/rejected": -0.003201847430318594, "step": 30 }, { "epoch": 0.05, "grad_norm": 22.339093495440075, "learning_rate": 2.2988505747126435e-07, "logits/chosen": -1.8691730499267578, "logits/rejected": -1.810280442237854, "logps/chosen": -212.04031372070312, "logps/rejected": -189.72427368164062, "loss": 0.6773, "rewards/accuracies": 0.75, "rewards/chosen": 0.027534600347280502, "rewards/margins": 0.037894655019044876, "rewards/rejected": -0.010360054671764374, "step": 40 }, { "epoch": 0.06, "grad_norm": 21.83120331543706, "learning_rate": 2.873563218390804e-07, "logits/chosen": -1.9792773723602295, "logits/rejected": -1.8856391906738281, "logps/chosen": -199.00392150878906, "logps/rejected": -184.42074584960938, "loss": 0.6637, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.02775971218943596, "rewards/margins": 0.08295276015996933, "rewards/rejected": -0.05519305542111397, "step": 50 }, { "epoch": 0.07, "grad_norm": 21.94313336281609, "learning_rate": 3.4482758620689656e-07, "logits/chosen": -1.978032112121582, "logits/rejected": -1.8626216650009155, "logps/chosen": -263.13702392578125, "logps/rejected": -227.51931762695312, "loss": 0.6365, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.024905700236558914, "rewards/margins": 0.1394185870885849, "rewards/rejected": -0.1643243134021759, "step": 60 }, { "epoch": 0.08, "grad_norm": 21.93834951114425, "learning_rate": 4.0229885057471266e-07, "logits/chosen": -1.923208236694336, "logits/rejected": -1.9092395305633545, "logps/chosen": -211.4084930419922, "logps/rejected": -216.09439086914062, "loss": 0.6127, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.16204482316970825, "rewards/margins": 0.21120235323905945, "rewards/rejected": -0.3732471466064453, "step": 70 }, { "epoch": 0.09, "grad_norm": 26.27963832031748, "learning_rate": 4.597701149425287e-07, "logits/chosen": -1.7020299434661865, "logits/rejected": -1.635000467300415, "logps/chosen": -229.10562133789062, "logps/rejected": -228.198486328125, "loss": 0.5888, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3274237811565399, "rewards/margins": 0.26525241136550903, "rewards/rejected": -0.5926762819290161, "step": 80 }, { "epoch": 0.1, "grad_norm": 35.47456739543052, "learning_rate": 4.999817969178237e-07, "logits/chosen": -1.768843412399292, "logits/rejected": -1.73134446144104, "logps/chosen": -271.71563720703125, "logps/rejected": -283.0465393066406, "loss": 0.5313, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.538571298122406, "rewards/margins": 0.47389060258865356, "rewards/rejected": -1.0124619007110596, "step": 90 }, { "epoch": 0.12, "grad_norm": 38.67050237438448, "learning_rate": 4.996582603056428e-07, "logits/chosen": -1.7260372638702393, "logits/rejected": -1.6588356494903564, "logps/chosen": -285.2041320800781, "logps/rejected": -323.65692138671875, "loss": 0.5405, "rewards/accuracies": 0.75, "rewards/chosen": -0.6892239451408386, "rewards/margins": 0.5662633180618286, "rewards/rejected": -1.2554872035980225, "step": 100 }, { "epoch": 0.12, "eval_logits/chosen": -1.746153473854065, "eval_logits/rejected": -1.6546903848648071, "eval_logps/chosen": -421.5047912597656, "eval_logps/rejected": -451.7755432128906, "eval_loss": 0.6086099743843079, "eval_rewards/accuracies": 0.6953125, "eval_rewards/chosen": -0.8599321246147156, "eval_rewards/margins": 0.3267643451690674, "eval_rewards/rejected": -1.1866965293884277, "eval_runtime": 98.2501, "eval_samples_per_second": 20.356, "eval_steps_per_second": 0.326, "step": 100 }, { "epoch": 0.13, "grad_norm": 56.77623681367674, "learning_rate": 4.989308132738126e-07, "logits/chosen": -1.8324391841888428, "logits/rejected": -1.7346527576446533, "logps/chosen": -289.9622802734375, "logps/rejected": -307.9504699707031, "loss": 0.5032, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7402961850166321, "rewards/margins": 0.6292544007301331, "rewards/rejected": -1.3695508241653442, "step": 110 }, { "epoch": 0.14, "grad_norm": 54.65739090602792, "learning_rate": 4.978006327248536e-07, "logits/chosen": -1.91842520236969, "logits/rejected": -1.849988579750061, "logps/chosen": -323.345703125, "logps/rejected": -366.32415771484375, "loss": 0.4966, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.107177972793579, "rewards/margins": 0.7300722599029541, "rewards/rejected": -1.8372503519058228, "step": 120 }, { "epoch": 0.15, "grad_norm": 40.66462467188264, "learning_rate": 4.962695471250032e-07, "logits/chosen": -1.7266982793807983, "logits/rejected": -1.6543283462524414, "logps/chosen": -320.31195068359375, "logps/rejected": -359.983154296875, "loss": 0.4886, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.0283275842666626, "rewards/margins": 0.7512324452400208, "rewards/rejected": -1.7795600891113281, "step": 130 }, { "epoch": 0.16, "grad_norm": 45.88018498600559, "learning_rate": 4.94340033546025e-07, "logits/chosen": -1.4110041856765747, "logits/rejected": -1.3973127603530884, "logps/chosen": -312.18145751953125, "logps/rejected": -390.5517578125, "loss": 0.4739, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.310011863708496, "rewards/margins": 0.8049423098564148, "rewards/rejected": -2.1149544715881348, "step": 140 }, { "epoch": 0.17, "grad_norm": 79.78754356153908, "learning_rate": 4.920152136576705e-07, "logits/chosen": -1.2265546321868896, "logits/rejected": -1.1716219186782837, "logps/chosen": -357.737060546875, "logps/rejected": -431.76806640625, "loss": 0.4655, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4461175203323364, "rewards/margins": 0.9848885536193848, "rewards/rejected": -2.4310059547424316, "step": 150 }, { "epoch": 0.18, "grad_norm": 40.08268655919122, "learning_rate": 4.892988486772756e-07, "logits/chosen": -1.2588635683059692, "logits/rejected": -1.1425318717956543, "logps/chosen": -354.57867431640625, "logps/rejected": -432.987060546875, "loss": 0.4787, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4544165134429932, "rewards/margins": 0.9601584672927856, "rewards/rejected": -2.4145748615264893, "step": 160 }, { "epoch": 0.2, "grad_norm": 37.08844280081501, "learning_rate": 4.861953332846629e-07, "logits/chosen": -1.0948612689971924, "logits/rejected": -0.9797511100769043, "logps/chosen": -370.5609436035156, "logps/rejected": -417.10418701171875, "loss": 0.4741, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4464932680130005, "rewards/margins": 0.8114526867866516, "rewards/rejected": -2.257946014404297, "step": 170 }, { "epoch": 0.21, "grad_norm": 51.03369267010431, "learning_rate": 4.827096885121953e-07, "logits/chosen": -0.9882611036300659, "logits/rejected": -0.786241888999939, "logps/chosen": -403.01361083984375, "logps/rejected": -465.450439453125, "loss": 0.4518, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6873054504394531, "rewards/margins": 0.8884965181350708, "rewards/rejected": -2.5758020877838135, "step": 180 }, { "epoch": 0.22, "grad_norm": 40.75117386512369, "learning_rate": 4.788475536214821e-07, "logits/chosen": -0.6994659900665283, "logits/rejected": -0.57302325963974, "logps/chosen": -345.23858642578125, "logps/rejected": -434.90069580078125, "loss": 0.4305, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.5364990234375, "rewards/margins": 1.0722037553787231, "rewards/rejected": -2.6087028980255127, "step": 190 }, { "epoch": 0.23, "grad_norm": 50.385160508667006, "learning_rate": 4.746151769798818e-07, "logits/chosen": -0.46505388617515564, "logits/rejected": -0.32105451822280884, "logps/chosen": -395.0636901855469, "logps/rejected": -491.369873046875, "loss": 0.4371, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5998367071151733, "rewards/margins": 1.3637341260910034, "rewards/rejected": -2.9635708332061768, "step": 200 }, { "epoch": 0.23, "eval_logits/chosen": -0.8866692185401917, "eval_logits/rejected": -0.715141236782074, "eval_logps/chosen": -537.5919799804688, "eval_logps/rejected": -591.529052734375, "eval_loss": 0.5454351305961609, "eval_rewards/accuracies": 0.7421875, "eval_rewards/chosen": -2.0208044052124023, "eval_rewards/margins": 0.5634276270866394, "eval_rewards/rejected": -2.5842318534851074, "eval_runtime": 98.1521, "eval_samples_per_second": 20.377, "eval_steps_per_second": 0.326, "step": 200 }, { "epoch": 0.24, "grad_norm": 44.17462139523744, "learning_rate": 4.7001940595156055e-07, "logits/chosen": -0.5879951119422913, "logits/rejected": -0.31766843795776367, "logps/chosen": -347.45184326171875, "logps/rejected": -442.23291015625, "loss": 0.466, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.621807336807251, "rewards/margins": 1.1228187084197998, "rewards/rejected": -2.7446258068084717, "step": 210 }, { "epoch": 0.25, "grad_norm": 46.80720748583798, "learning_rate": 4.650676758194623e-07, "logits/chosen": -0.5494168996810913, "logits/rejected": -0.3329974114894867, "logps/chosen": -386.22528076171875, "logps/rejected": -472.072998046875, "loss": 0.419, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6599994897842407, "rewards/margins": 1.2505383491516113, "rewards/rejected": -2.9105377197265625, "step": 220 }, { "epoch": 0.26, "grad_norm": 43.28959440159286, "learning_rate": 4.5976799775611215e-07, "logits/chosen": -0.6910772919654846, "logits/rejected": -0.4287993013858795, "logps/chosen": -385.10784912109375, "logps/rejected": -484.22314453125, "loss": 0.43, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7417128086090088, "rewards/margins": 1.4360835552215576, "rewards/rejected": -3.1777961254119873, "step": 230 }, { "epoch": 0.28, "grad_norm": 48.21494711877692, "learning_rate": 4.5412894586271543e-07, "logits/chosen": -0.3966357111930847, "logits/rejected": -0.13579869270324707, "logps/chosen": -405.3009338378906, "logps/rejected": -484.6737365722656, "loss": 0.4083, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8974357843399048, "rewards/margins": 1.3567252159118652, "rewards/rejected": -3.2541611194610596, "step": 240 }, { "epoch": 0.29, "grad_norm": 42.352515667816355, "learning_rate": 4.481596432975201e-07, "logits/chosen": -0.6702763438224792, "logits/rejected": -0.49778255820274353, "logps/chosen": -340.3480224609375, "logps/rejected": -434.61376953125, "loss": 0.425, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6621681451797485, "rewards/margins": 1.0998741388320923, "rewards/rejected": -2.762042284011841, "step": 250 }, { "epoch": 0.3, "grad_norm": 51.54256095538614, "learning_rate": 4.41869747515886e-07, "logits/chosen": -0.6597603559494019, "logits/rejected": -0.5498248338699341, "logps/chosen": -365.7995910644531, "logps/rejected": -490.1622009277344, "loss": 0.4244, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.4162827730178833, "rewards/margins": 1.2882452011108398, "rewards/rejected": -2.7045278549194336, "step": 260 }, { "epoch": 0.31, "grad_norm": 48.71803198385668, "learning_rate": 4.352694346459396e-07, "logits/chosen": 0.04401933029294014, "logits/rejected": 0.16322588920593262, "logps/chosen": -363.21539306640625, "logps/rejected": -463.6495056152344, "loss": 0.4206, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.5739765167236328, "rewards/margins": 1.1849424839019775, "rewards/rejected": -2.7589190006256104, "step": 270 }, { "epoch": 0.32, "grad_norm": 38.68223370724194, "learning_rate": 4.2836938302509256e-07, "logits/chosen": -0.13973233103752136, "logits/rejected": 0.19283699989318848, "logps/chosen": -328.5007019042969, "logps/rejected": -440.18365478515625, "loss": 0.4456, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.279756784439087, "rewards/margins": 1.4430491924285889, "rewards/rejected": -2.7228057384490967, "step": 280 }, { "epoch": 0.33, "grad_norm": 45.704934038680605, "learning_rate": 4.2118075592405874e-07, "logits/chosen": 0.20580144226551056, "logits/rejected": 0.34621715545654297, "logps/chosen": -407.57373046875, "logps/rejected": -517.0430908203125, "loss": 0.4242, "rewards/accuracies": 0.78125, "rewards/chosen": -1.8687858581542969, "rewards/margins": 1.2867904901504517, "rewards/rejected": -3.155576229095459, "step": 290 }, { "epoch": 0.35, "grad_norm": 48.006993514366904, "learning_rate": 4.137151834863213e-07, "logits/chosen": 0.6578917503356934, "logits/rejected": 0.7554408311843872, "logps/chosen": -349.4103088378906, "logps/rejected": -480.834228515625, "loss": 0.4348, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.828386664390564, "rewards/margins": 1.3594980239868164, "rewards/rejected": -3.18788480758667, "step": 300 }, { "epoch": 0.35, "eval_logits/chosen": -0.5939264297485352, "eval_logits/rejected": -0.34991100430488586, "eval_logps/chosen": -545.4883422851562, "eval_logps/rejected": -617.2100830078125, "eval_loss": 0.5011798739433289, "eval_rewards/accuracies": 0.7734375, "eval_rewards/chosen": -2.0997684001922607, "eval_rewards/margins": 0.7412738800048828, "eval_rewards/rejected": -2.8410420417785645, "eval_runtime": 98.127, "eval_samples_per_second": 20.382, "eval_steps_per_second": 0.326, "step": 300 }, { "epoch": 0.36, "grad_norm": 43.38987414729455, "learning_rate": 4.059847439122671e-07, "logits/chosen": 0.5874438285827637, "logits/rejected": 0.8824877738952637, "logps/chosen": -419.9178771972656, "logps/rejected": -517.2019653320312, "loss": 0.4149, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.0750081539154053, "rewards/margins": 1.2572228908538818, "rewards/rejected": -3.332231044769287, "step": 310 }, { "epoch": 0.37, "grad_norm": 56.605050092804255, "learning_rate": 3.98001943918432e-07, "logits/chosen": 0.6735237836837769, "logits/rejected": 1.019078254699707, "logps/chosen": -373.03009033203125, "logps/rejected": -483.0083923339844, "loss": 0.4049, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.7667083740234375, "rewards/margins": 1.1942052841186523, "rewards/rejected": -2.960913896560669, "step": 320 }, { "epoch": 0.38, "grad_norm": 57.81664075376147, "learning_rate": 3.8977969850346866e-07, "logits/chosen": 0.4839138090610504, "logits/rejected": 0.8274878263473511, "logps/chosen": -387.33673095703125, "logps/rejected": -499.78094482421875, "loss": 0.4004, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7052650451660156, "rewards/margins": 1.477137565612793, "rewards/rejected": -3.1824028491973877, "step": 330 }, { "epoch": 0.39, "grad_norm": 50.66567087546677, "learning_rate": 3.8133131005357465e-07, "logits/chosen": 0.23904335498809814, "logits/rejected": 0.6436888575553894, "logps/chosen": -374.50750732421875, "logps/rejected": -534.21435546875, "loss": 0.3943, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.705120325088501, "rewards/margins": 1.7923282384872437, "rewards/rejected": -3.497448444366455, "step": 340 }, { "epoch": 0.4, "grad_norm": 41.43510772615216, "learning_rate": 3.7267044682118435e-07, "logits/chosen": 0.3483354449272156, "logits/rejected": 0.6899020075798035, "logps/chosen": -369.47418212890625, "logps/rejected": -496.38262939453125, "loss": 0.3884, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8410135507583618, "rewards/margins": 1.4833061695098877, "rewards/rejected": -3.324319362640381, "step": 350 }, { "epoch": 0.41, "grad_norm": 46.89248795203356, "learning_rate": 3.638111208117425e-07, "logits/chosen": 0.22267869114875793, "logits/rejected": 0.4508979916572571, "logps/chosen": -409.98974609375, "logps/rejected": -508.88055419921875, "loss": 0.4111, "rewards/accuracies": 0.75, "rewards/chosen": -2.0787599086761475, "rewards/margins": 1.0934727191925049, "rewards/rejected": -3.1722328662872314, "step": 360 }, { "epoch": 0.43, "grad_norm": 43.02323311612351, "learning_rate": 3.5476766511433605e-07, "logits/chosen": 0.1800430715084076, "logits/rejected": 0.6425480842590332, "logps/chosen": -431.10736083984375, "logps/rejected": -516.4458618164062, "loss": 0.4194, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9750921726226807, "rewards/margins": 1.3207170963287354, "rewards/rejected": -3.295809268951416, "step": 370 }, { "epoch": 0.44, "grad_norm": 43.154999607698095, "learning_rate": 3.455547107128602e-07, "logits/chosen": 0.3740110993385315, "logits/rejected": 0.8220480680465698, "logps/chosen": -410.6556701660156, "logps/rejected": -515.9549560546875, "loss": 0.3767, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.596968412399292, "rewards/margins": 1.6267616748809814, "rewards/rejected": -3.2237300872802734, "step": 380 }, { "epoch": 0.45, "grad_norm": 56.90068596534485, "learning_rate": 3.361871628152338e-07, "logits/chosen": 0.6576219797134399, "logits/rejected": 1.0373657941818237, "logps/chosen": -398.47906494140625, "logps/rejected": -556.7415771484375, "loss": 0.4239, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.9783694744110107, "rewards/margins": 1.5746887922286987, "rewards/rejected": -3.55305814743042, "step": 390 }, { "epoch": 0.46, "grad_norm": 41.49097538770333, "learning_rate": 3.2668017673896077e-07, "logits/chosen": 0.6066378355026245, "logits/rejected": 1.0441324710845947, "logps/chosen": -376.2064514160156, "logps/rejected": -497.462890625, "loss": 0.3733, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7407310009002686, "rewards/margins": 1.581956148147583, "rewards/rejected": -3.3226871490478516, "step": 400 }, { "epoch": 0.46, "eval_logits/chosen": -0.5456388592720032, "eval_logits/rejected": -0.2280205935239792, "eval_logps/chosen": -550.5716552734375, "eval_logps/rejected": -626.190185546875, "eval_loss": 0.47210657596588135, "eval_rewards/accuracies": 0.77734375, "eval_rewards/chosen": -2.1506011486053467, "eval_rewards/margins": 0.7802413105964661, "eval_rewards/rejected": -2.930842399597168, "eval_runtime": 98.1161, "eval_samples_per_second": 20.384, "eval_steps_per_second": 0.326, "step": 400 }, { "epoch": 0.47, "grad_norm": 47.55353494901972, "learning_rate": 3.1704913339205103e-07, "logits/chosen": 0.5084329843521118, "logits/rejected": 0.796318531036377, "logps/chosen": -409.43585205078125, "logps/rejected": -561.5556030273438, "loss": 0.3928, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.992550253868103, "rewards/margins": 1.6422802209854126, "rewards/rejected": -3.6348299980163574, "step": 410 }, { "epoch": 0.48, "grad_norm": 41.646877730648264, "learning_rate": 3.0730961438896885e-07, "logits/chosen": 0.4776241183280945, "logits/rejected": 0.7627217769622803, "logps/chosen": -482.1835021972656, "logps/rejected": -587.5792236328125, "loss": 0.3881, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.093543767929077, "rewards/margins": 1.4904192686080933, "rewards/rejected": -3.583962917327881, "step": 420 }, { "epoch": 0.5, "grad_norm": 68.32669660083764, "learning_rate": 2.9747737684186795e-07, "logits/chosen": 0.7197389602661133, "logits/rejected": 0.8317638635635376, "logps/chosen": -388.28656005859375, "logps/rejected": -509.2151794433594, "loss": 0.3841, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7809364795684814, "rewards/margins": 1.5095723867416382, "rewards/rejected": -3.290508985519409, "step": 430 }, { "epoch": 0.51, "grad_norm": 46.78192200543751, "learning_rate": 2.8756832786789663e-07, "logits/chosen": 0.3376988172531128, "logits/rejected": 0.8295138478279114, "logps/chosen": -403.0928649902344, "logps/rejected": -518.611083984375, "loss": 0.4029, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.744091272354126, "rewards/margins": 1.5630067586898804, "rewards/rejected": -3.307097911834717, "step": 440 }, { "epoch": 0.52, "grad_norm": 49.72034219777285, "learning_rate": 2.7759849885381747e-07, "logits/chosen": 0.3917238414287567, "logits/rejected": 0.9007431268692017, "logps/chosen": -451.806884765625, "logps/rejected": -584.4218139648438, "loss": 0.3785, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.0996882915496826, "rewards/margins": 1.9295704364776611, "rewards/rejected": -4.029258728027344, "step": 450 }, { "epoch": 0.53, "grad_norm": 38.3046078852496, "learning_rate": 2.675840195195762e-07, "logits/chosen": 0.1938302218914032, "logits/rejected": 0.7046247720718384, "logps/chosen": -375.27606201171875, "logps/rejected": -523.9801025390625, "loss": 0.3934, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7863012552261353, "rewards/margins": 1.6471843719482422, "rewards/rejected": -3.433485507965088, "step": 460 }, { "epoch": 0.54, "grad_norm": 39.056692194028, "learning_rate": 2.575410918227829e-07, "logits/chosen": 0.09105312079191208, "logits/rejected": 0.5196784138679504, "logps/chosen": -413.9867248535156, "logps/rejected": -532.4803466796875, "loss": 0.3755, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.720029592514038, "rewards/margins": 1.528271198272705, "rewards/rejected": -3.2483010292053223, "step": 470 }, { "epoch": 0.55, "grad_norm": 46.6868254294557, "learning_rate": 2.474859637463226e-07, "logits/chosen": 0.21693472564220428, "logits/rejected": 0.8155421018600464, "logps/chosen": -418.37652587890625, "logps/rejected": -540.866455078125, "loss": 0.3846, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9778916835784912, "rewards/margins": 1.7564996480941772, "rewards/rejected": -3.734391450881958, "step": 480 }, { "epoch": 0.56, "grad_norm": 45.512117273870444, "learning_rate": 2.3743490301150355e-07, "logits/chosen": 0.2570355236530304, "logits/rejected": 0.8997817039489746, "logps/chosen": -381.27801513671875, "logps/rejected": -525.5377807617188, "loss": 0.4012, "rewards/accuracies": 0.84375, "rewards/chosen": -1.6076080799102783, "rewards/margins": 1.8304884433746338, "rewards/rejected": -3.438096523284912, "step": 490 }, { "epoch": 0.58, "grad_norm": 46.09704078060399, "learning_rate": 2.274041707592724e-07, "logits/chosen": 0.7786660194396973, "logits/rejected": 1.2057403326034546, "logps/chosen": -416.14068603515625, "logps/rejected": -602.9859008789062, "loss": 0.3689, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.2852025032043457, "rewards/margins": 1.9095999002456665, "rewards/rejected": -4.194802284240723, "step": 500 }, { "epoch": 0.58, "eval_logits/chosen": -0.4774431586265564, "eval_logits/rejected": -0.1090613454580307, "eval_logps/chosen": -540.1826171875, "eval_logps/rejected": -627.9595336914062, "eval_loss": 0.448412150144577, "eval_rewards/accuracies": 0.796875, "eval_rewards/chosen": -2.046710968017578, "eval_rewards/margins": 0.9018256068229675, "eval_rewards/rejected": -2.9485368728637695, "eval_runtime": 98.1848, "eval_samples_per_second": 20.37, "eval_steps_per_second": 0.326, "step": 500 }, { "epoch": 0.59, "grad_norm": 42.744213876119844, "learning_rate": 2.17409995242075e-07, "logits/chosen": 0.6994825005531311, "logits/rejected": 1.289393663406372, "logps/chosen": -405.2342224121094, "logps/rejected": -555.2643432617188, "loss": 0.3921, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.062455415725708, "rewards/margins": 1.8831449747085571, "rewards/rejected": -3.9456000328063965, "step": 510 }, { "epoch": 0.6, "grad_norm": 44.25862131066792, "learning_rate": 2.0746854556892544e-07, "logits/chosen": 0.7421714067459106, "logits/rejected": 0.9166728258132935, "logps/chosen": -363.72222900390625, "logps/rejected": -499.4908752441406, "loss": 0.4102, "rewards/accuracies": 0.78125, "rewards/chosen": -1.795539140701294, "rewards/margins": 1.4331713914871216, "rewards/rejected": -3.228710889816284, "step": 520 }, { "epoch": 0.61, "grad_norm": 40.42456029676201, "learning_rate": 1.9759590554616173e-07, "logits/chosen": 0.2788628935813904, "logits/rejected": 0.5978427529335022, "logps/chosen": -387.8989562988281, "logps/rejected": -499.9576110839844, "loss": 0.4053, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6805702447891235, "rewards/margins": 1.3731516599655151, "rewards/rejected": -3.0537219047546387, "step": 530 }, { "epoch": 0.62, "grad_norm": 43.79592437572997, "learning_rate": 1.8780804765620746e-07, "logits/chosen": 0.37570881843566895, "logits/rejected": 0.5200439691543579, "logps/chosen": -394.23284912109375, "logps/rejected": -548.2333374023438, "loss": 0.384, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5941615104675293, "rewards/margins": 1.527552843093872, "rewards/rejected": -3.1217141151428223, "step": 540 }, { "epoch": 0.63, "grad_norm": 44.94669101797897, "learning_rate": 1.7812080721643973e-07, "logits/chosen": 0.6379637122154236, "logits/rejected": 1.1335102319717407, "logps/chosen": -422.62200927734375, "logps/rejected": -535.2354736328125, "loss": 0.3932, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.0112552642822266, "rewards/margins": 1.6570736169815063, "rewards/rejected": -3.6683287620544434, "step": 550 }, { "epoch": 0.64, "grad_norm": 48.51576878403802, "learning_rate": 1.6854985675997063e-07, "logits/chosen": 0.5151522755622864, "logits/rejected": 0.9227844476699829, "logps/chosen": -410.75244140625, "logps/rejected": -543.8304443359375, "loss": 0.3729, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.9362386465072632, "rewards/margins": 1.548099398612976, "rewards/rejected": -3.4843380451202393, "step": 560 }, { "epoch": 0.66, "grad_norm": 42.77055197730572, "learning_rate": 1.5911068067978818e-07, "logits/chosen": 0.7765737771987915, "logits/rejected": 0.9592781066894531, "logps/chosen": -391.6842041015625, "logps/rejected": -575.3435668945312, "loss": 0.3642, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0532357692718506, "rewards/margins": 1.811832070350647, "rewards/rejected": -3.865067720413208, "step": 570 }, { "epoch": 0.67, "grad_norm": 51.09604434640814, "learning_rate": 1.4981855017728197e-07, "logits/chosen": 0.596177875995636, "logits/rejected": 0.7803729772567749, "logps/chosen": -459.51422119140625, "logps/rejected": -612.7260131835938, "loss": 0.388, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.421908140182495, "rewards/margins": 1.5485522747039795, "rewards/rejected": -3.9704601764678955, "step": 580 }, { "epoch": 0.68, "grad_norm": 51.69715596466598, "learning_rate": 1.406884985556804e-07, "logits/chosen": 0.6335197687149048, "logits/rejected": 1.1092630624771118, "logps/chosen": -429.76690673828125, "logps/rejected": -580.2468872070312, "loss": 0.3807, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.2301127910614014, "rewards/margins": 1.8223087787628174, "rewards/rejected": -4.052420616149902, "step": 590 }, { "epoch": 0.69, "grad_norm": 48.435911535292384, "learning_rate": 1.3173529689837354e-07, "logits/chosen": 0.5912660956382751, "logits/rejected": 1.1899088621139526, "logps/chosen": -393.476318359375, "logps/rejected": -521.782958984375, "loss": 0.3829, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8528053760528564, "rewards/margins": 1.6730989217758179, "rewards/rejected": -3.5259041786193848, "step": 600 }, { "epoch": 0.69, "eval_logits/chosen": -0.509851336479187, "eval_logits/rejected": -0.14121857285499573, "eval_logps/chosen": -538.1624145507812, "eval_logps/rejected": -623.8541259765625, "eval_loss": 0.44193577766418457, "eval_rewards/accuracies": 0.80859375, "eval_rewards/chosen": -2.0265088081359863, "eval_rewards/margins": 0.8809735774993896, "eval_rewards/rejected": -2.907482147216797, "eval_runtime": 98.167, "eval_samples_per_second": 20.373, "eval_steps_per_second": 0.326, "step": 600 }, { "epoch": 0.7, "grad_norm": 48.985755457205066, "learning_rate": 1.2297343017146726e-07, "logits/chosen": 0.7694305181503296, "logits/rejected": 1.232879877090454, "logps/chosen": -402.1836853027344, "logps/rejected": -533.408447265625, "loss": 0.3929, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.905542016029358, "rewards/margins": 1.6221548318862915, "rewards/rejected": -3.5276970863342285, "step": 610 }, { "epoch": 0.71, "grad_norm": 45.28513242475784, "learning_rate": 1.1441707378923474e-07, "logits/chosen": 0.5253760814666748, "logits/rejected": 1.0413273572921753, "logps/chosen": -359.5643615722656, "logps/rejected": -514.2081909179688, "loss": 0.3806, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.579487919807434, "rewards/margins": 1.767327070236206, "rewards/rejected": -3.3468146324157715, "step": 620 }, { "epoch": 0.73, "grad_norm": 47.72652227607087, "learning_rate": 1.06080070680377e-07, "logits/chosen": 0.4920094907283783, "logits/rejected": 1.009433627128601, "logps/chosen": -399.2576599121094, "logps/rejected": -537.9578247070312, "loss": 0.3821, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.7137682437896729, "rewards/margins": 1.7276941537857056, "rewards/rejected": -3.441462755203247, "step": 630 }, { "epoch": 0.74, "grad_norm": 42.1168430015071, "learning_rate": 9.797590889219587e-08, "logits/chosen": 0.3111940026283264, "logits/rejected": 0.8665814399719238, "logps/chosen": -396.842529296875, "logps/rejected": -543.9876098632812, "loss": 0.3843, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.7231413125991821, "rewards/margins": 1.8444896936416626, "rewards/rejected": -3.567631244659424, "step": 640 }, { "epoch": 0.75, "grad_norm": 47.41933670532933, "learning_rate": 9.011769976891367e-08, "logits/chosen": 0.4944031834602356, "logits/rejected": 0.8744715452194214, "logps/chosen": -398.05615234375, "logps/rejected": -543.6096801757812, "loss": 0.3763, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8718990087509155, "rewards/margins": 1.6193087100982666, "rewards/rejected": -3.4912078380584717, "step": 650 }, { "epoch": 0.76, "grad_norm": 52.75260796298546, "learning_rate": 8.251815673944218e-08, "logits/chosen": 0.5813334584236145, "logits/rejected": 0.9786221385002136, "logps/chosen": -443.66070556640625, "logps/rejected": -576.3490600585938, "loss": 0.3822, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.220869779586792, "rewards/margins": 1.742889404296875, "rewards/rejected": -3.963758945465088, "step": 660 }, { "epoch": 0.77, "grad_norm": 46.64520061062158, "learning_rate": 7.518957474892148e-08, "logits/chosen": 0.6128578186035156, "logits/rejected": 1.1231881380081177, "logps/chosen": -427.1106872558594, "logps/rejected": -589.3102416992188, "loss": 0.3662, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.0986740589141846, "rewards/margins": 1.9974746704101562, "rewards/rejected": -4.096148490905762, "step": 670 }, { "epoch": 0.78, "grad_norm": 44.32719204523107, "learning_rate": 6.814381036730274e-08, "logits/chosen": 0.44363918900489807, "logits/rejected": 0.8115978240966797, "logps/chosen": -397.6707763671875, "logps/rejected": -538.56591796875, "loss": 0.3962, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9131567478179932, "rewards/margins": 1.6610631942749023, "rewards/rejected": -3.5742194652557373, "step": 680 }, { "epoch": 0.79, "grad_norm": 43.744460103075866, "learning_rate": 6.139226260715872e-08, "logits/chosen": 0.34574732184410095, "logits/rejected": 0.7309020161628723, "logps/chosen": -390.32464599609375, "logps/rejected": -550.9197998046875, "loss": 0.3747, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7989364862442017, "rewards/margins": 1.8234875202178955, "rewards/rejected": -3.622424364089966, "step": 690 }, { "epoch": 0.81, "grad_norm": 48.21671181557863, "learning_rate": 5.4945854481754734e-08, "logits/chosen": 0.4160235822200775, "logits/rejected": 1.0240848064422607, "logps/chosen": -393.590576171875, "logps/rejected": -540.9241333007812, "loss": 0.3725, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.9948371648788452, "rewards/margins": 1.689173936843872, "rewards/rejected": -3.6840109825134277, "step": 700 }, { "epoch": 0.81, "eval_logits/chosen": -0.6920372247695923, "eval_logits/rejected": -0.3223564326763153, "eval_logps/chosen": -527.349609375, "eval_logps/rejected": -613.8932495117188, "eval_loss": 0.43294557929039, "eval_rewards/accuracies": 0.82421875, "eval_rewards/chosen": -1.9183804988861084, "eval_rewards/margins": 0.8894931077957153, "eval_rewards/rejected": -2.8078737258911133, "eval_runtime": 98.1374, "eval_samples_per_second": 20.38, "eval_steps_per_second": 0.326, "step": 700 }, { "epoch": 0.82, "grad_norm": 42.53084626680963, "learning_rate": 4.881501533321605e-08, "logits/chosen": 0.6980074048042297, "logits/rejected": 1.0298550128936768, "logps/chosen": -367.0564880371094, "logps/rejected": -539.99560546875, "loss": 0.3547, "rewards/accuracies": 0.84375, "rewards/chosen": -1.8503217697143555, "rewards/margins": 1.9031312465667725, "rewards/rejected": -3.753452777862549, "step": 710 }, { "epoch": 0.83, "grad_norm": 43.590506229310456, "learning_rate": 4.300966395938377e-08, "logits/chosen": 0.35197392106056213, "logits/rejected": 0.8350766897201538, "logps/chosen": -427.9037170410156, "logps/rejected": -580.8751831054688, "loss": 0.3788, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.0454230308532715, "rewards/margins": 1.8579833507537842, "rewards/rejected": -3.9034061431884766, "step": 720 }, { "epoch": 0.84, "grad_norm": 47.15415328548373, "learning_rate": 3.7539192566655246e-08, "logits/chosen": 0.3688026964664459, "logits/rejected": 0.7924972772598267, "logps/chosen": -387.2108459472656, "logps/rejected": -532.4842529296875, "loss": 0.3762, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.6555856466293335, "rewards/margins": 1.8027565479278564, "rewards/rejected": -3.4583423137664795, "step": 730 }, { "epoch": 0.85, "grad_norm": 41.72651096064494, "learning_rate": 3.24124515747731e-08, "logits/chosen": 0.4526204466819763, "logits/rejected": 0.7684503793716431, "logps/chosen": -406.00042724609375, "logps/rejected": -571.0294189453125, "loss": 0.3881, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.0768213272094727, "rewards/margins": 1.7967207431793213, "rewards/rejected": -3.8735415935516357, "step": 740 }, { "epoch": 0.86, "grad_norm": 47.004010938683734, "learning_rate": 2.763773529814506e-08, "logits/chosen": 0.24592173099517822, "logits/rejected": 0.5948923826217651, "logps/chosen": -437.3650817871094, "logps/rejected": -581.8604125976562, "loss": 0.3772, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.974538803100586, "rewards/margins": 1.9196981191635132, "rewards/rejected": -3.8942363262176514, "step": 750 }, { "epoch": 0.88, "grad_norm": 56.33205281532714, "learning_rate": 2.3222768526860698e-08, "logits/chosen": 0.2990577220916748, "logits/rejected": 0.7854124903678894, "logps/chosen": -404.5032653808594, "logps/rejected": -561.688720703125, "loss": 0.3938, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.9624149799346924, "rewards/margins": 1.855332374572754, "rewards/rejected": -3.8177475929260254, "step": 760 }, { "epoch": 0.89, "grad_norm": 43.51724396608159, "learning_rate": 1.9174694029115146e-08, "logits/chosen": 0.18542930483818054, "logits/rejected": 0.5257433652877808, "logps/chosen": -424.1546325683594, "logps/rejected": -532.9678344726562, "loss": 0.3879, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.8116706609725952, "rewards/margins": 1.6489944458007812, "rewards/rejected": -3.460665225982666, "step": 770 }, { "epoch": 0.9, "grad_norm": 51.298202533295, "learning_rate": 1.5500060995258134e-08, "logits/chosen": 0.3892073333263397, "logits/rejected": 0.8499504327774048, "logps/chosen": -402.9557189941406, "logps/rejected": -541.4577026367188, "loss": 0.349, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.8981506824493408, "rewards/margins": 1.7807424068450928, "rewards/rejected": -3.6788933277130127, "step": 780 }, { "epoch": 0.91, "grad_norm": 56.6017962844276, "learning_rate": 1.2204814442165812e-08, "logits/chosen": 0.3551040589809418, "logits/rejected": 0.8326929807662964, "logps/chosen": -402.6451416015625, "logps/rejected": -552.5445556640625, "loss": 0.386, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.9424476623535156, "rewards/margins": 1.9533637762069702, "rewards/rejected": -3.8958117961883545, "step": 790 }, { "epoch": 0.92, "grad_norm": 49.36333315496645, "learning_rate": 9.294285595075669e-09, "logits/chosen": 0.06378497928380966, "logits/rejected": 0.5464959144592285, "logps/chosen": -430.5462951660156, "logps/rejected": -562.2453002929688, "loss": 0.4052, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9655787944793701, "rewards/margins": 1.7598493099212646, "rewards/rejected": -3.7254281044006348, "step": 800 }, { "epoch": 0.92, "eval_logits/chosen": -0.8024855852127075, "eval_logits/rejected": -0.4436371624469757, "eval_logps/chosen": -524.2042236328125, "eval_logps/rejected": -612.2493286132812, "eval_loss": 0.42916327714920044, "eval_rewards/accuracies": 0.82421875, "eval_rewards/chosen": -1.8869271278381348, "eval_rewards/margins": 0.9045072793960571, "eval_rewards/rejected": -2.7914342880249023, "eval_runtime": 98.1154, "eval_samples_per_second": 20.384, "eval_steps_per_second": 0.326, "step": 800 }, { "epoch": 0.93, "grad_norm": 48.45659164140374, "learning_rate": 6.773183262446914e-09, "logits/chosen": 0.2793930172920227, "logits/rejected": 0.8751212954521179, "logps/chosen": -400.6767883300781, "logps/rejected": -544.5294799804688, "loss": 0.3882, "rewards/accuracies": 0.78125, "rewards/chosen": -1.9480419158935547, "rewards/margins": 1.711806297302246, "rewards/rejected": -3.65984845161438, "step": 810 }, { "epoch": 0.94, "grad_norm": 48.21463789648397, "learning_rate": 4.645586217799452e-09, "logits/chosen": 0.24326184391975403, "logits/rejected": 0.6566700339317322, "logps/chosen": -410.050537109375, "logps/rejected": -576.2342529296875, "loss": 0.4036, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.893699049949646, "rewards/margins": 1.8373210430145264, "rewards/rejected": -3.731020450592041, "step": 820 }, { "epoch": 0.96, "grad_norm": 52.40196558130504, "learning_rate": 2.9149366008568987e-09, "logits/chosen": 0.2516610622406006, "logits/rejected": 0.6028949022293091, "logps/chosen": -397.42755126953125, "logps/rejected": -558.4515380859375, "loss": 0.3856, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.790833830833435, "rewards/margins": 1.8916391134262085, "rewards/rejected": -3.6824734210968018, "step": 830 }, { "epoch": 0.97, "grad_norm": 45.18885600860689, "learning_rate": 1.5840343486700215e-09, "logits/chosen": 0.011555513367056847, "logits/rejected": 0.5860650539398193, "logps/chosen": -406.7879638671875, "logps/rejected": -555.0967407226562, "loss": 0.3728, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6412866115570068, "rewards/margins": 1.9584299325942993, "rewards/rejected": -3.5997166633605957, "step": 840 }, { "epoch": 0.98, "grad_norm": 45.90265978309936, "learning_rate": 6.550326657293881e-10, "logits/chosen": 0.08577422052621841, "logits/rejected": 0.549113929271698, "logps/chosen": -403.1221618652344, "logps/rejected": -571.7515869140625, "loss": 0.3525, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8727197647094727, "rewards/margins": 2.0556139945983887, "rewards/rejected": -3.9283337593078613, "step": 850 }, { "epoch": 0.99, "grad_norm": 48.04876217861222, "learning_rate": 1.2943454039654467e-10, "logits/chosen": 0.5522348284721375, "logits/rejected": 0.82818204164505, "logps/chosen": -399.8492126464844, "logps/rejected": -529.6903076171875, "loss": 0.3623, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.0089919567108154, "rewards/margins": 1.499420404434204, "rewards/rejected": -3.5084125995635986, "step": 860 }, { "epoch": 1.0, "step": 868, "total_flos": 0.0, "train_loss": 0.42912535238925215, "train_runtime": 13911.1927, "train_samples_per_second": 7.989, "train_steps_per_second": 0.062 } ], "logging_steps": 10, "max_steps": 868, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }