{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 165, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 4.652278347218786, "learning_rate": 2.941176470588235e-08, "logits/chosen": -0.8284896612167358, "logits/rejected": -0.9010236263275146, "logps/chosen": -1066.3585205078125, "logps/rejected": -1448.19970703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06, "grad_norm": 4.746771519966634, "learning_rate": 2.941176470588235e-07, "logits/chosen": -0.8115520477294922, "logits/rejected": -0.8255029320716858, "logps/chosen": -1131.291259765625, "logps/rejected": -1369.7412109375, "loss": 0.6932, "rewards/accuracies": 0.4652777910232544, "rewards/chosen": 0.0002041943371295929, "rewards/margins": -4.850090044783428e-05, "rewards/rejected": 0.0002526953467167914, "step": 10 }, { "epoch": 0.12, "grad_norm": 4.451745733301935, "learning_rate": 4.994932636402031e-07, "logits/chosen": -0.7243806719779968, "logits/rejected": -0.8158847093582153, "logps/chosen": -1020.7599487304688, "logps/rejected": -1355.944091796875, "loss": 0.6921, "rewards/accuracies": 0.59375, "rewards/chosen": 0.001858971663750708, "rewards/margins": 0.0021505323238670826, "rewards/rejected": -0.00029156063101254404, "step": 20 }, { "epoch": 0.18, "grad_norm": 4.532594184835871, "learning_rate": 4.905416503522123e-07, "logits/chosen": -0.7353666424751282, "logits/rejected": -0.8100309371948242, "logps/chosen": -1033.032470703125, "logps/rejected": -1331.6929931640625, "loss": 0.688, "rewards/accuracies": 0.78125, "rewards/chosen": 0.008143061771988869, "rewards/margins": 0.010795501992106438, "rewards/rejected": -0.002652441617101431, "step": 30 }, { "epoch": 0.24, "grad_norm": 4.452955387273571, "learning_rate": 4.707922373336523e-07, "logits/chosen": -0.7547545433044434, "logits/rejected": -0.7800291776657104, "logps/chosen": -1057.7445068359375, "logps/rejected": -1296.575439453125, "loss": 0.6825, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.008273174986243248, "rewards/margins": 0.016675911843776703, "rewards/rejected": -0.008402736857533455, "step": 40 }, { "epoch": 0.3, "grad_norm": 4.931805222376995, "learning_rate": 4.4113156629677313e-07, "logits/chosen": -0.7371411919593811, "logits/rejected": -0.6845098733901978, "logps/chosen": -1045.1011962890625, "logps/rejected": -1151.344970703125, "loss": 0.6718, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.017855554819107056, "rewards/margins": 0.04448147863149643, "rewards/rejected": -0.026625927537679672, "step": 50 }, { "epoch": 0.36, "grad_norm": 4.689422676957636, "learning_rate": 4.0289109058972283e-07, "logits/chosen": -0.7692807912826538, "logits/rejected": -0.7662399411201477, "logps/chosen": -999.9730224609375, "logps/rejected": -1286.947509765625, "loss": 0.6595, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.027734506875276566, "rewards/margins": 0.07751207053661346, "rewards/rejected": -0.0497775673866272, "step": 60 }, { "epoch": 0.42, "grad_norm": 4.956697872711078, "learning_rate": 3.577874068920446e-07, "logits/chosen": -0.7923519611358643, "logits/rejected": -0.8132171630859375, "logps/chosen": -1077.121337890625, "logps/rejected": -1317.41845703125, "loss": 0.6474, "rewards/accuracies": 0.78125, "rewards/chosen": 0.009537232108414173, "rewards/margins": 0.09025295078754425, "rewards/rejected": -0.08071572333574295, "step": 70 }, { "epoch": 0.48, "grad_norm": 6.011926472486656, "learning_rate": 3.078451980100854e-07, "logits/chosen": -0.7588658928871155, "logits/rejected": -0.8289008140563965, "logps/chosen": -1011.5177612304688, "logps/rejected": -1298.1904296875, "loss": 0.6262, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.023491863161325455, "rewards/margins": 0.1584763377904892, "rewards/rejected": -0.13498449325561523, "step": 80 }, { "epoch": 0.55, "grad_norm": 5.367792800931542, "learning_rate": 2.553063458334059e-07, "logits/chosen": -0.7922073006629944, "logits/rejected": -0.8237783312797546, "logps/chosen": -1067.434326171875, "logps/rejected": -1301.37109375, "loss": 0.5865, "rewards/accuracies": 0.8125, "rewards/chosen": -0.015568578615784645, "rewards/margins": 0.31723320484161377, "rewards/rejected": -0.3328017592430115, "step": 90 }, { "epoch": 0.61, "grad_norm": 5.346586894544636, "learning_rate": 2.0252929432814287e-07, "logits/chosen": -0.779016375541687, "logits/rejected": -0.9120697975158691, "logps/chosen": -1015.7185668945312, "logps/rejected": -1394.1680908203125, "loss": 0.5615, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.05275765806436539, "rewards/margins": 0.5331910848617554, "rewards/rejected": -0.5859487056732178, "step": 100 }, { "epoch": 0.61, "eval_logits/chosen": -0.6675596237182617, "eval_logits/rejected": -0.8939424753189087, "eval_logps/chosen": -826.0933837890625, "eval_logps/rejected": -1433.1563720703125, "eval_loss": 0.6218963861465454, "eval_rewards/accuracies": 0.7459239363670349, "eval_rewards/chosen": -0.044515106827020645, "eval_rewards/margins": 0.19978085160255432, "eval_rewards/rejected": -0.24429598450660706, "eval_runtime": 353.1381, "eval_samples_per_second": 8.289, "eval_steps_per_second": 0.261, "step": 100 }, { "epoch": 0.67, "grad_norm": 5.41704752041782, "learning_rate": 1.5188318011445906e-07, "logits/chosen": -0.7974969744682312, "logits/rejected": -0.8456804156303406, "logps/chosen": -1040.8197021484375, "logps/rejected": -1285.4654541015625, "loss": 0.5406, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.08253589272499084, "rewards/margins": 0.788347601890564, "rewards/rejected": -0.870883584022522, "step": 110 }, { "epoch": 0.73, "grad_norm": 6.860188078136068, "learning_rate": 1.0564148305586295e-07, "logits/chosen": -0.7957097291946411, "logits/rejected": -0.8646506071090698, "logps/chosen": -979.1201171875, "logps/rejected": -1402.442138671875, "loss": 0.5115, "rewards/accuracies": 0.84375, "rewards/chosen": -0.07362432777881622, "rewards/margins": 0.8031437993049622, "rewards/rejected": -0.8767681121826172, "step": 120 }, { "epoch": 0.79, "grad_norm": 6.241835191835041, "learning_rate": 6.587997083462196e-08, "logits/chosen": -0.828117847442627, "logits/rejected": -0.8768518567085266, "logps/chosen": -1065.3460693359375, "logps/rejected": -1385.936767578125, "loss": 0.5146, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.0992397740483284, "rewards/margins": 1.1248447895050049, "rewards/rejected": -1.2240846157073975, "step": 130 }, { "epoch": 0.85, "grad_norm": 6.81375264804862, "learning_rate": 3.438351873250492e-08, "logits/chosen": -0.8043051958084106, "logits/rejected": -0.8954287767410278, "logps/chosen": -1059.5267333984375, "logps/rejected": -1414.9984130859375, "loss": 0.5123, "rewards/accuracies": 0.84375, "rewards/chosen": -0.17837993800640106, "rewards/margins": 0.9624137878417969, "rewards/rejected": -1.1407936811447144, "step": 140 }, { "epoch": 0.91, "grad_norm": 7.068585992863548, "learning_rate": 1.256598743236703e-08, "logits/chosen": -0.8356617093086243, "logits/rejected": -0.8929821252822876, "logps/chosen": -1066.185302734375, "logps/rejected": -1415.41064453125, "loss": 0.5079, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.16673186421394348, "rewards/margins": 0.8981729745864868, "rewards/rejected": -1.0649049282073975, "step": 150 }, { "epoch": 0.97, "grad_norm": 6.546330593670502, "learning_rate": 1.406755487774386e-09, "logits/chosen": -0.816728949546814, "logits/rejected": -0.8898676633834839, "logps/chosen": -1021.1590576171875, "logps/rejected": -1444.29296875, "loss": 0.4992, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.196590393781662, "rewards/margins": 0.9931901097297668, "rewards/rejected": -1.189780592918396, "step": 160 }, { "epoch": 1.0, "step": 165, "total_flos": 0.0, "train_loss": 0.20238888480446554, "train_runtime": 919.6394, "train_samples_per_second": 11.461, "train_steps_per_second": 0.179 } ], "logging_steps": 10, "max_steps": 165, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }