{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9987908101571947, "eval_steps": 10000000, "global_step": 413, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0024183796856106408, "grad_norm": 1308.2534984366307, "learning_rate": 9.523809523809522e-09, "logits/chosen": -2.7005977630615234, "logits/rejected": -2.6288318634033203, "logps/chosen": -1.1158788204193115, "logps/rejected": -1.1333446502685547, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02418379685610641, "grad_norm": 1018.5726644948129, "learning_rate": 9.523809523809523e-08, "logits/chosen": -2.762432098388672, "logits/rejected": -2.697216033935547, "logps/chosen": -0.83743816614151, "logps/rejected": -0.8184080123901367, "loss": 0.7126, "rewards/accuracies": 0.4652777910232544, "rewards/chosen": 0.018050068989396095, "rewards/margins": 0.10968472808599472, "rewards/rejected": -0.09163466095924377, "step": 10 }, { "epoch": 0.04836759371221282, "grad_norm": 1180.2159938574362, "learning_rate": 1.9047619047619045e-07, "logits/chosen": -2.6905813217163086, "logits/rejected": -2.6509311199188232, "logps/chosen": -0.9959298968315125, "logps/rejected": -1.0417280197143555, "loss": 0.6914, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.41868314146995544, "rewards/margins": 0.11889855563640594, "rewards/rejected": -0.5375816822052002, "step": 20 }, { "epoch": 0.07255139056831923, "grad_norm": 2875.4669192965785, "learning_rate": 2.857142857142857e-07, "logits/chosen": -2.7360830307006836, "logits/rejected": -2.68190336227417, "logps/chosen": -0.9741474986076355, "logps/rejected": -0.9867057800292969, "loss": 0.7598, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7634207606315613, "rewards/margins": 0.5467337965965271, "rewards/rejected": -1.3101545572280884, "step": 30 }, { "epoch": 0.09673518742442563, "grad_norm": 2637.428065513591, "learning_rate": 3.809523809523809e-07, "logits/chosen": -2.686359405517578, "logits/rejected": -2.641474485397339, "logps/chosen": -0.9992626309394836, "logps/rejected": -0.9355602264404297, "loss": 0.9431, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.0757026672363281, "rewards/margins": 1.4628146886825562, "rewards/rejected": -2.538517475128174, "step": 40 }, { "epoch": 0.12091898428053205, "grad_norm": 975.4162480488375, "learning_rate": 3.995412608484087e-07, "logits/chosen": -2.759364366531372, "logits/rejected": -2.7036304473876953, "logps/chosen": -0.9589918255805969, "logps/rejected": -0.9145007133483887, "loss": 0.9551, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.7774861454963684, "rewards/margins": 1.538771629333496, "rewards/rejected": -0.7612855434417725, "step": 50 }, { "epoch": 0.14510278113663846, "grad_norm": 876.8403861103245, "learning_rate": 3.976812391485896e-07, "logits/chosen": -2.762204647064209, "logits/rejected": -2.693788528442383, "logps/chosen": -0.9397481083869934, "logps/rejected": -0.9560295939445496, "loss": 1.0176, "rewards/accuracies": 0.8125, "rewards/chosen": -2.3399124145507812, "rewards/margins": 2.62353777885437, "rewards/rejected": -4.9634504318237305, "step": 60 }, { "epoch": 0.16928657799274485, "grad_norm": 851.542671730339, "learning_rate": 3.9440458281608213e-07, "logits/chosen": -2.753873109817505, "logits/rejected": -2.727172613143921, "logps/chosen": -0.9090434312820435, "logps/rejected": -0.8764857053756714, "loss": 0.9609, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 1.77558171749115, "rewards/margins": 3.0382332801818848, "rewards/rejected": -1.2626516819000244, "step": 70 }, { "epoch": 0.19347037484885127, "grad_norm": 1072.4102961713095, "learning_rate": 3.897347732134074e-07, "logits/chosen": -2.6825203895568848, "logits/rejected": -2.6251988410949707, "logps/chosen": -0.8969934582710266, "logps/rejected": -1.0139261484146118, "loss": 1.0729, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.25338679552078247, "rewards/margins": 2.662139415740967, "rewards/rejected": -2.9155266284942627, "step": 80 }, { "epoch": 0.21765417170495768, "grad_norm": 1045.2736360081997, "learning_rate": 3.8370527539794614e-07, "logits/chosen": -2.6782150268554688, "logits/rejected": -2.6257784366607666, "logps/chosen": -1.0229610204696655, "logps/rejected": -1.0610243082046509, "loss": 1.0661, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9386919736862183, "rewards/margins": 3.1748452186584473, "rewards/rejected": -4.113537311553955, "step": 90 }, { "epoch": 0.2418379685610641, "grad_norm": 717.8046840728214, "learning_rate": 3.763592983027255e-07, "logits/chosen": -2.7070720195770264, "logits/rejected": -2.657769203186035, "logps/chosen": -0.8991826772689819, "logps/rejected": -0.9345502853393555, "loss": 1.1057, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.7811681032180786, "rewards/margins": 2.7421443462371826, "rewards/rejected": -3.52331280708313, "step": 100 }, { "epoch": 0.2660217654171705, "grad_norm": 573.0671194598642, "learning_rate": 3.6774948509008527e-07, "logits/chosen": -2.715163469314575, "logits/rejected": -2.6663975715637207, "logps/chosen": -0.9820922613143921, "logps/rejected": -0.9612447023391724, "loss": 0.9831, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.3871256709098816, "rewards/margins": 2.8536558151245117, "rewards/rejected": -3.240781784057617, "step": 110 }, { "epoch": 0.29020556227327693, "grad_norm": 813.0340299259154, "learning_rate": 3.579375358972288e-07, "logits/chosen": -2.667712688446045, "logits/rejected": -2.6149513721466064, "logps/chosen": -0.9040060043334961, "logps/rejected": -1.0196150541305542, "loss": 0.8569, "rewards/accuracies": 0.8125, "rewards/chosen": -2.3184046745300293, "rewards/margins": 3.717514753341675, "rewards/rejected": -6.035919189453125, "step": 120 }, { "epoch": 0.3143893591293833, "grad_norm": 803.5906659383959, "learning_rate": 3.4699376567716156e-07, "logits/chosen": -2.7040085792541504, "logits/rejected": -2.662682056427002, "logps/chosen": -0.8633000254631042, "logps/rejected": -0.8938501477241516, "loss": 1.0665, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.6341629028320312, "rewards/margins": 4.641495704650879, "rewards/rejected": -4.007332801818848, "step": 130 }, { "epoch": 0.3385731559854897, "grad_norm": 894.1357604383927, "learning_rate": 3.349966003036421e-07, "logits/chosen": -2.678401470184326, "logits/rejected": -2.6329421997070312, "logps/chosen": -0.9393678903579712, "logps/rejected": -0.9559372663497925, "loss": 0.9368, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.3545489311218262, "rewards/margins": 2.7926135063171387, "rewards/rejected": -4.147162437438965, "step": 140 }, { "epoch": 0.36275695284159615, "grad_norm": 1218.2734701044658, "learning_rate": 3.220320145511884e-07, "logits/chosen": -2.691074848175049, "logits/rejected": -2.6277267932891846, "logps/chosen": -0.9556415677070618, "logps/rejected": -1.0070703029632568, "loss": 1.0655, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.038965918123722076, "rewards/margins": 3.301687717437744, "rewards/rejected": -3.262721538543701, "step": 150 }, { "epoch": 0.38694074969770254, "grad_norm": 933.333059653217, "learning_rate": 3.0819291597771795e-07, "logits/chosen": -2.6850905418395996, "logits/rejected": -2.641737461090088, "logps/chosen": -0.9150887727737427, "logps/rejected": -0.9567450284957886, "loss": 1.1619, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.4916467070579529, "rewards/margins": 3.2452170848846436, "rewards/rejected": -3.7368640899658203, "step": 160 }, { "epoch": 0.4111245465538089, "grad_norm": 1195.530681047137, "learning_rate": 2.9357847912507786e-07, "logits/chosen": -2.664726734161377, "logits/rejected": -2.5912623405456543, "logps/chosen": -0.908007800579071, "logps/rejected": -0.9083712697029114, "loss": 1.0815, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.6012508869171143, "rewards/margins": 3.351865768432617, "rewards/rejected": -4.9531168937683105, "step": 170 }, { "epoch": 0.43530834340991537, "grad_norm": 1688.331550337793, "learning_rate": 2.7829343480875617e-07, "logits/chosen": -2.662369966506958, "logits/rejected": -2.5877506732940674, "logps/chosen": -0.9355185627937317, "logps/rejected": -0.9674497842788696, "loss": 1.0193, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7115722894668579, "rewards/margins": 3.03961181640625, "rewards/rejected": -2.3280396461486816, "step": 180 }, { "epoch": 0.45949214026602175, "grad_norm": 1025.3772351835823, "learning_rate": 2.624473195899052e-07, "logits/chosen": -2.7287068367004395, "logits/rejected": -2.696254253387451, "logps/chosen": -0.9740760922431946, "logps/rejected": -1.0570310354232788, "loss": 1.0517, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7886015176773071, "rewards/margins": 3.46142578125, "rewards/rejected": -5.250028133392334, "step": 190 }, { "epoch": 0.4836759371221282, "grad_norm": 781.90565238033, "learning_rate": 2.4615369080815547e-07, "logits/chosen": -2.6937649250030518, "logits/rejected": -2.651794195175171, "logps/chosen": -0.8561639785766602, "logps/rejected": -0.9404775500297546, "loss": 1.0482, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 0.10315074771642685, "rewards/margins": 2.2021610736846924, "rewards/rejected": -2.0990102291107178, "step": 200 }, { "epoch": 0.5078597339782346, "grad_norm": 718.5486788904052, "learning_rate": 2.2952931280049625e-07, "logits/chosen": -2.7320737838745117, "logits/rejected": -2.666625499725342, "logps/chosen": -1.0195786952972412, "logps/rejected": -0.985085129737854, "loss": 1.2414, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.24961701035499573, "rewards/margins": 4.099853038787842, "rewards/rejected": -3.850236415863037, "step": 210 }, { "epoch": 0.532043530834341, "grad_norm": 896.7965591252768, "learning_rate": 2.1269332013798747e-07, "logits/chosen": -2.7412266731262207, "logits/rejected": -2.719102144241333, "logps/chosen": -0.887231171131134, "logps/rejected": -0.8848485946655273, "loss": 1.1268, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.5452920198440552, "rewards/margins": 3.039072036743164, "rewards/rejected": -2.4937801361083984, "step": 220 }, { "epoch": 0.5562273276904474, "grad_norm": 606.0417693874598, "learning_rate": 1.9576636387676436e-07, "logits/chosen": -2.687635898590088, "logits/rejected": -2.6457619667053223, "logps/chosen": -0.9592903852462769, "logps/rejected": -0.967265784740448, "loss": 0.8874, "rewards/accuracies": 0.75, "rewards/chosen": -1.5358507633209229, "rewards/margins": 3.3348402976989746, "rewards/rejected": -4.870690822601318, "step": 230 }, { "epoch": 0.5804111245465539, "grad_norm": 794.9088518264721, "learning_rate": 1.7886974694151976e-07, "logits/chosen": -2.701521158218384, "logits/rejected": -2.6726996898651123, "logps/chosen": -0.9976873397827148, "logps/rejected": -1.01997709274292, "loss": 1.0599, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.8589717745780945, "rewards/margins": 3.750645875930786, "rewards/rejected": -4.609617710113525, "step": 240 }, { "epoch": 0.6045949214026602, "grad_norm": 706.1920920085257, "learning_rate": 1.6212455483752895e-07, "logits/chosen": -2.7483222484588623, "logits/rejected": -2.66450834274292, "logps/chosen": -0.8861182928085327, "logps/rejected": -0.9307917356491089, "loss": 1.0161, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0754988193511963, "rewards/margins": 3.2043755054473877, "rewards/rejected": -2.1288766860961914, "step": 250 }, { "epoch": 0.6287787182587666, "grad_norm": 1160.850163008235, "learning_rate": 1.4565078792075733e-07, "logits/chosen": -2.699370861053467, "logits/rejected": -2.6291871070861816, "logps/chosen": -1.0056778192520142, "logps/rejected": -1.0176211595535278, "loss": 1.0258, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.27300015091896057, "rewards/margins": 4.846710681915283, "rewards/rejected": -4.5737104415893555, "step": 260 }, { "epoch": 0.652962515114873, "grad_norm": 882.9319448662282, "learning_rate": 1.295665014444281e-07, "logits/chosen": -2.726219654083252, "logits/rejected": -2.6420578956604004, "logps/chosen": -0.9637600183486938, "logps/rejected": -0.975050151348114, "loss": 1.1212, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.4779157638549805, "rewards/margins": 3.851034641265869, "rewards/rejected": -5.328949928283691, "step": 270 }, { "epoch": 0.6771463119709794, "grad_norm": 914.3545207404874, "learning_rate": 1.1398695954469597e-07, "logits/chosen": -2.673750162124634, "logits/rejected": -2.611456871032715, "logps/chosen": -0.9046699404716492, "logps/rejected": -0.9061228036880493, "loss": 0.9744, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9529012441635132, "rewards/margins": 2.8092963695526123, "rewards/rejected": -1.8563950061798096, "step": 280 }, { "epoch": 0.7013301088270859, "grad_norm": 1001.1455181062619, "learning_rate": 9.902380922818425e-08, "logits/chosen": -2.7217631340026855, "logits/rejected": -2.676079273223877, "logps/chosen": -0.986601710319519, "logps/rejected": -0.992447555065155, "loss": 0.7804, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 1.5409294366836548, "rewards/margins": 3.72483491897583, "rewards/rejected": -2.1839053630828857, "step": 290 }, { "epoch": 0.7255139056831923, "grad_norm": 929.8360933885888, "learning_rate": 8.478428028080398e-08, "logits/chosen": -2.7178378105163574, "logits/rejected": -2.6601595878601074, "logps/chosen": -0.9026691317558289, "logps/rejected": -0.9635857343673706, "loss": 0.963, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.20143906772136688, "rewards/margins": 3.291618824005127, "rewards/rejected": -3.4930579662323, "step": 300 }, { "epoch": 0.7496977025392987, "grad_norm": 918.556437892916, "learning_rate": 7.137041683151202e-08, "logits/chosen": -2.7046775817871094, "logits/rejected": -2.6347708702087402, "logps/chosen": -1.0870015621185303, "logps/rejected": -1.049997091293335, "loss": 0.7071, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.3678367137908936, "rewards/margins": 3.927046298980713, "rewards/rejected": -5.294882774353027, "step": 310 }, { "epoch": 0.7738814993954051, "grad_norm": 1444.2173436767062, "learning_rate": 5.8878346077822135e-08, "logits/chosen": -2.7104134559631348, "logits/rejected": -2.6237189769744873, "logps/chosen": -0.8994497060775757, "logps/rejected": -0.9579949378967285, "loss": 0.7542, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5055674910545349, "rewards/margins": 4.5611371994018555, "rewards/rejected": -4.055570602416992, "step": 320 }, { "epoch": 0.7980652962515115, "grad_norm": 758.7885560560964, "learning_rate": 4.73975894135696e-08, "logits/chosen": -2.6587905883789062, "logits/rejected": -2.584770917892456, "logps/chosen": -0.9313557744026184, "logps/rejected": -0.9829689860343933, "loss": 0.7798, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.45623573660850525, "rewards/margins": 4.346068859100342, "rewards/rejected": -4.802304744720459, "step": 330 }, { "epoch": 0.8222490931076178, "grad_norm": 818.3776763189691, "learning_rate": 3.701042089556483e-08, "logits/chosen": -2.7390825748443604, "logits/rejected": -2.6632418632507324, "logps/chosen": -0.9014676809310913, "logps/rejected": -0.9521511197090149, "loss": 0.8929, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1090164184570312, "rewards/margins": 2.5331737995147705, "rewards/rejected": -3.642190456390381, "step": 340 }, { "epoch": 0.8464328899637243, "grad_norm": 633.4468460505269, "learning_rate": 2.779127764652889e-08, "logits/chosen": -2.671653985977173, "logits/rejected": -2.6091012954711914, "logps/chosen": -0.9844328761100769, "logps/rejected": -0.9899514317512512, "loss": 0.8681, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1147739887237549, "rewards/margins": 3.46244478225708, "rewards/rejected": -4.577218532562256, "step": 350 }, { "epoch": 0.8706166868198307, "grad_norm": 790.3321831454036, "learning_rate": 1.9806226419516193e-08, "logits/chosen": -2.682281494140625, "logits/rejected": -2.6280040740966797, "logps/chosen": -0.9713428616523743, "logps/rejected": -1.028427004814148, "loss": 0.8939, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.3930521309375763, "rewards/margins": 3.496274948120117, "rewards/rejected": -3.889326810836792, "step": 360 }, { "epoch": 0.8948004836759371, "grad_norm": 758.0022509460326, "learning_rate": 1.3112490146559552e-08, "logits/chosen": -2.7291171550750732, "logits/rejected": -2.6654868125915527, "logps/chosen": -0.8997148275375366, "logps/rejected": -0.9173600077629089, "loss": 0.6798, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.3203060030937195, "rewards/margins": 3.888981580734253, "rewards/rejected": -4.209287166595459, "step": 370 }, { "epoch": 0.9189842805320435, "grad_norm": 580.6775247669598, "learning_rate": 7.758037864413247e-09, "logits/chosen": -2.6995849609375, "logits/rejected": -2.6686558723449707, "logps/chosen": -0.906518280506134, "logps/rejected": -0.9883183240890503, "loss": 0.7423, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.12804336845874786, "rewards/margins": 3.583407163619995, "rewards/rejected": -3.7114500999450684, "step": 380 }, { "epoch": 0.9431680773881499, "grad_norm": 740.0766659575787, "learning_rate": 3.78124095609087e-09, "logits/chosen": -2.677032947540283, "logits/rejected": -2.63029408454895, "logps/chosen": -0.9278643727302551, "logps/rejected": -1.0108085870742798, "loss": 0.7435, "rewards/accuracies": 0.84375, "rewards/chosen": 0.13243435323238373, "rewards/margins": 3.7296395301818848, "rewards/rejected": -3.597205400466919, "step": 390 }, { "epoch": 0.9673518742442564, "grad_norm": 664.330995388725, "learning_rate": 1.2105981716597603e-09, "logits/chosen": -2.711874485015869, "logits/rejected": -2.614854574203491, "logps/chosen": -0.9732038378715515, "logps/rejected": -0.9536064863204956, "loss": 0.955, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.32332465052604675, "rewards/margins": 3.985724925994873, "rewards/rejected": -4.309049129486084, "step": 400 }, { "epoch": 0.9915356711003628, "grad_norm": 1022.9589946634649, "learning_rate": 6.453139886395398e-11, "logits/chosen": -2.710538864135742, "logits/rejected": -2.6645543575286865, "logps/chosen": -0.9343970417976379, "logps/rejected": -0.9787089228630066, "loss": 0.8541, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.5227594375610352, "rewards/margins": 4.429939270019531, "rewards/rejected": -4.952698707580566, "step": 410 }, { "epoch": 0.9987908101571947, "step": 413, "total_flos": 0.0, "train_loss": 0.9438655665076674, "train_runtime": 7020.7263, "train_samples_per_second": 7.538, "train_steps_per_second": 0.059 } ], "logging_steps": 10, "max_steps": 413, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }