{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9942857142857143, "eval_steps": 100, "global_step": 87, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 19.437784440370766, "learning_rate": 5.555555555555555e-08, "logits/chosen": -2.6343841552734375, "logits/rejected": -2.6980783939361572, "logps/chosen": -581.0560302734375, "logps/rejected": -816.3157958984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.11, "grad_norm": 24.044765874363524, "learning_rate": 4.997972495428924e-07, "logits/chosen": -2.592595100402832, "logits/rejected": -2.6620125770568848, "logps/chosen": -575.6597900390625, "logps/rejected": -842.704345703125, "loss": 0.6913, "rewards/accuracies": 0.5277777910232544, "rewards/chosen": 0.00319318613037467, "rewards/margins": 0.0027422416023910046, "rewards/rejected": 0.00045094432425685227, "step": 10 }, { "epoch": 0.23, "grad_norm": 21.402086416348496, "learning_rate": 4.7586260865259554e-07, "logits/chosen": -2.666720151901245, "logits/rejected": -2.636192798614502, "logps/chosen": -614.2251586914062, "logps/rejected": -898.0855712890625, "loss": 0.6594, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.053952403366565704, "rewards/margins": 0.0747995600104332, "rewards/rejected": -0.020847156643867493, "step": 20 }, { "epoch": 0.34, "grad_norm": 30.86499685610669, "learning_rate": 4.157806645601988e-07, "logits/chosen": -2.8280863761901855, "logits/rejected": -2.7879080772399902, "logps/chosen": -507.06854248046875, "logps/rejected": -962.9939575195312, "loss": 0.5746, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.026104014366865158, "rewards/margins": 0.35351747274398804, "rewards/rejected": -0.3796215355396271, "step": 30 }, { "epoch": 0.46, "grad_norm": 47.705913880090655, "learning_rate": 3.2916699845036815e-07, "logits/chosen": -3.0843756198883057, "logits/rejected": -3.0327470302581787, "logps/chosen": -613.1222534179688, "logps/rejected": -1105.1820068359375, "loss": 0.4397, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.6070794463157654, "rewards/margins": 1.289810299873352, "rewards/rejected": -1.8968894481658936, "step": 40 }, { "epoch": 0.57, "grad_norm": 30.02360571000533, "learning_rate": 2.2988335782081851e-07, "logits/chosen": -2.9371981620788574, "logits/rejected": -3.0881218910217285, "logps/chosen": -528.0693359375, "logps/rejected": -1141.9866943359375, "loss": 0.378, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.27318429946899414, "rewards/margins": 1.5688350200653076, "rewards/rejected": -1.8420193195343018, "step": 50 }, { "epoch": 0.69, "grad_norm": 32.099661091341254, "learning_rate": 1.3381920698905784e-07, "logits/chosen": -2.9197936058044434, "logits/rejected": -3.0912833213806152, "logps/chosen": -550.3436279296875, "logps/rejected": -1202.0462646484375, "loss": 0.3308, "rewards/accuracies": 0.84375, "rewards/chosen": -0.4591130316257477, "rewards/margins": 2.357220411300659, "rewards/rejected": -2.816333532333374, "step": 60 }, { "epoch": 0.8, "grad_norm": 32.99216577780543, "learning_rate": 5.6348759543086374e-08, "logits/chosen": -2.9525704383850098, "logits/rejected": -3.2371833324432373, "logps/chosen": -646.6183471679688, "logps/rejected": -1339.00146484375, "loss": 0.3031, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.535283625125885, "rewards/margins": 2.8484416007995605, "rewards/rejected": -3.38372540473938, "step": 70 }, { "epoch": 0.91, "grad_norm": 36.54372644480357, "learning_rate": 9.87047209215694e-09, "logits/chosen": -2.80966854095459, "logits/rejected": -3.226341962814331, "logps/chosen": -611.0046997070312, "logps/rejected": -1294.22900390625, "loss": 0.3159, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.7080464363098145, "rewards/margins": 2.6834611892700195, "rewards/rejected": -3.391507625579834, "step": 80 }, { "epoch": 0.99, "step": 87, "total_flos": 0.0, "train_loss": 0.44661769921752226, "train_runtime": 1188.3148, "train_samples_per_second": 4.689, "train_steps_per_second": 0.073 } ], "logging_steps": 10, "max_steps": 87, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }