{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9969788519637462, "eval_steps": 100, "global_step": 165, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 3.774768176591989, "learning_rate": 2.941176470588235e-08, "logits/chosen": 0.48741579055786133, "logits/rejected": -0.8717803955078125, "logps/chosen": -311.44610595703125, "logps/rejected": -1042.2933349609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06, "grad_norm": 4.065933729000048, "learning_rate": 2.941176470588235e-07, "logits/chosen": 0.3187962770462036, "logits/rejected": -0.46175992488861084, "logps/chosen": -526.5966796875, "logps/rejected": -899.632568359375, "loss": 0.6931, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0003381037386134267, "rewards/margins": 0.00014021807874087244, "rewards/rejected": 0.00019788570352829993, "step": 10 }, { "epoch": 0.12, "grad_norm": 4.133159908424447, "learning_rate": 4.994932636402031e-07, "logits/chosen": 0.22923466563224792, "logits/rejected": -0.6458711624145508, "logps/chosen": -566.1712646484375, "logps/rejected": -926.1541137695312, "loss": 0.6919, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0022346877958625555, "rewards/margins": 0.0030761375091969967, "rewards/rejected": -0.0008414499461650848, "step": 20 }, { "epoch": 0.18, "grad_norm": 3.759041431537677, "learning_rate": 4.905416503522123e-07, "logits/chosen": 0.2407102882862091, "logits/rejected": -0.7926596999168396, "logps/chosen": -523.1210327148438, "logps/rejected": -1028.3199462890625, "loss": 0.6855, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.010289192199707031, "rewards/margins": 0.013627497479319572, "rewards/rejected": -0.003338304813951254, "step": 30 }, { "epoch": 0.24, "grad_norm": 3.918700608724971, "learning_rate": 4.707922373336523e-07, "logits/chosen": 0.14743538200855255, "logits/rejected": -0.7249930500984192, "logps/chosen": -524.011474609375, "logps/rejected": -989.4501953125, "loss": 0.675, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 0.016446446999907494, "rewards/margins": 0.051999401301145554, "rewards/rejected": -0.03555295616388321, "step": 40 }, { "epoch": 0.3, "grad_norm": 3.6017852179026626, "learning_rate": 4.4113156629677313e-07, "logits/chosen": 0.23459818959236145, "logits/rejected": -0.6225197911262512, "logps/chosen": -481.66455078125, "logps/rejected": -867.3211059570312, "loss": 0.6639, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.03439263254404068, "rewards/margins": 0.06260526925325394, "rewards/rejected": -0.02821262739598751, "step": 50 }, { "epoch": 0.36, "grad_norm": 3.679527248386035, "learning_rate": 4.0289109058972283e-07, "logits/chosen": 0.26775047183036804, "logits/rejected": -0.49902766942977905, "logps/chosen": -516.3983154296875, "logps/rejected": -819.7734375, "loss": 0.6398, "rewards/accuracies": 0.84375, "rewards/chosen": 0.03639604151248932, "rewards/margins": 0.1593528836965561, "rewards/rejected": -0.12295685708522797, "step": 60 }, { "epoch": 0.42, "grad_norm": 3.945875727521845, "learning_rate": 3.577874068920446e-07, "logits/chosen": 0.26115402579307556, "logits/rejected": -0.6307616233825684, "logps/chosen": -534.6641845703125, "logps/rejected": -911.5435791015625, "loss": 0.6322, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 0.048932626843452454, "rewards/margins": 0.24001319706439972, "rewards/rejected": -0.19108060002326965, "step": 70 }, { "epoch": 0.48, "grad_norm": 3.7126637404674536, "learning_rate": 3.078451980100854e-07, "logits/chosen": 0.20563717186450958, "logits/rejected": -0.688762903213501, "logps/chosen": -493.32684326171875, "logps/rejected": -957.6318359375, "loss": 0.6237, "rewards/accuracies": 0.84375, "rewards/chosen": 0.060481660068035126, "rewards/margins": 0.21118538081645966, "rewards/rejected": -0.15070374310016632, "step": 80 }, { "epoch": 0.54, "grad_norm": 4.119949298182235, "learning_rate": 2.553063458334059e-07, "logits/chosen": 0.3919462263584137, "logits/rejected": -0.5500736832618713, "logps/chosen": -510.05712890625, "logps/rejected": -912.9411010742188, "loss": 0.6164, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.05354578420519829, "rewards/margins": 0.2815362215042114, "rewards/rejected": -0.22799046337604523, "step": 90 }, { "epoch": 0.6, "grad_norm": 4.539444195047728, "learning_rate": 2.0252929432814287e-07, "logits/chosen": 0.23407666385173798, "logits/rejected": -0.6277016401290894, "logps/chosen": -514.2950439453125, "logps/rejected": -985.7261962890625, "loss": 0.6065, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 0.046576742082834244, "rewards/margins": 0.26194584369659424, "rewards/rejected": -0.2153691053390503, "step": 100 }, { "epoch": 0.6, "eval_logits/chosen": -0.1363597810268402, "eval_logits/rejected": -0.3805391788482666, "eval_logps/chosen": -523.0221557617188, "eval_logps/rejected": -812.6375732421875, "eval_loss": 0.6296960115432739, "eval_rewards/accuracies": 0.7678571343421936, "eval_rewards/chosen": 0.07959667593240738, "eval_rewards/margins": 0.17506957054138184, "eval_rewards/rejected": -0.09547291696071625, "eval_runtime": 22.7695, "eval_samples_per_second": 9.135, "eval_steps_per_second": 0.307, "step": 100 }, { "epoch": 0.66, "grad_norm": 4.166464578639834, "learning_rate": 1.5188318011445906e-07, "logits/chosen": 0.09842907637357712, "logits/rejected": -0.7154465913772583, "logps/chosen": -633.3096923828125, "logps/rejected": -972.07861328125, "loss": 0.5933, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.04117094725370407, "rewards/margins": 0.2970955967903137, "rewards/rejected": -0.25592464208602905, "step": 110 }, { "epoch": 0.73, "grad_norm": 4.767777281679362, "learning_rate": 1.0564148305586295e-07, "logits/chosen": 0.2290249764919281, "logits/rejected": -0.5675751566886902, "logps/chosen": -553.788330078125, "logps/rejected": -963.9578247070312, "loss": 0.5795, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 0.04246982932090759, "rewards/margins": 0.5371382832527161, "rewards/rejected": -0.4946684241294861, "step": 120 }, { "epoch": 0.79, "grad_norm": 4.959401739670467, "learning_rate": 6.587997083462196e-08, "logits/chosen": 0.1415528953075409, "logits/rejected": -0.6273466348648071, "logps/chosen": -579.4324951171875, "logps/rejected": -927.8792114257812, "loss": 0.5587, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.02018633857369423, "rewards/margins": 0.7045117616653442, "rewards/rejected": -0.6843255162239075, "step": 130 }, { "epoch": 0.85, "grad_norm": 4.589724744119317, "learning_rate": 3.438351873250492e-08, "logits/chosen": 0.2175011932849884, "logits/rejected": -0.5643750429153442, "logps/chosen": -543.2364501953125, "logps/rejected": -1040.180908203125, "loss": 0.5653, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 0.03240719065070152, "rewards/margins": 0.5543904900550842, "rewards/rejected": -0.52198326587677, "step": 140 }, { "epoch": 0.91, "grad_norm": 5.293978243611277, "learning_rate": 1.256598743236703e-08, "logits/chosen": 0.2741110026836395, "logits/rejected": -0.6036696434020996, "logps/chosen": -437.42901611328125, "logps/rejected": -982.3721923828125, "loss": 0.5555, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 0.033161625266075134, "rewards/margins": 0.5011934638023376, "rewards/rejected": -0.46803179383277893, "step": 150 }, { "epoch": 0.97, "grad_norm": 4.856259961602652, "learning_rate": 1.406755487774386e-09, "logits/chosen": 0.14368140697479248, "logits/rejected": -0.6074076294898987, "logps/chosen": -525.0721435546875, "logps/rejected": -952.7180786132812, "loss": 0.5519, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 0.04387623816728592, "rewards/margins": 0.42462554574012756, "rewards/rejected": -0.38074928522109985, "step": 160 }, { "epoch": 1.0, "step": 165, "total_flos": 0.0, "train_loss": 0.6189163742643414, "train_runtime": 2381.9724, "train_samples_per_second": 4.446, "train_steps_per_second": 0.069 } ], "logging_steps": 10, "max_steps": 165, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }