{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.971563981042654, "eval_steps": 100, "global_step": 104, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018957345971563982, "grad_norm": 15.786988646394411, "learning_rate": 4.545454545454545e-08, "logits/chosen": -13.905267715454102, "logits/rejected": -14.118387222290039, "logps/chosen": -350.8895263671875, "logps/rejected": -446.6286926269531, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.1895734597156398, "grad_norm": 15.908099576913655, "learning_rate": 4.545454545454545e-07, "logits/chosen": -14.040081024169922, "logits/rejected": -14.157392501831055, "logps/chosen": -416.2701416015625, "logps/rejected": -449.4697265625, "loss": 0.693, "rewards/accuracies": 0.5138888955116272, "rewards/chosen": 0.0006088384543545544, "rewards/margins": 0.008209776133298874, "rewards/rejected": -0.007600938435643911, "step": 10 }, { "epoch": 0.3791469194312796, "grad_norm": 14.229474825008781, "learning_rate": 4.885348141000122e-07, "logits/chosen": -13.39338207244873, "logits/rejected": -13.542058944702148, "logps/chosen": -392.9753723144531, "logps/rejected": -427.68096923828125, "loss": 0.6892, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.010071685537695885, "rewards/margins": 0.003802267834544182, "rewards/rejected": 0.006269416771829128, "step": 20 }, { "epoch": 0.5687203791469194, "grad_norm": 15.853985724357454, "learning_rate": 4.5025027361734613e-07, "logits/chosen": -14.269427299499512, "logits/rejected": -13.808093070983887, "logps/chosen": -412.9443359375, "logps/rejected": -428.38494873046875, "loss": 0.674, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.04771440848708153, "rewards/margins": 0.035354893654584885, "rewards/rejected": 0.012359511107206345, "step": 30 }, { "epoch": 0.7582938388625592, "grad_norm": 14.687978809678542, "learning_rate": 3.893311157806091e-07, "logits/chosen": -13.886492729187012, "logits/rejected": -13.28197956085205, "logps/chosen": -374.98211669921875, "logps/rejected": -366.5968322753906, "loss": 0.657, "rewards/accuracies": 0.65625, "rewards/chosen": 0.13442906737327576, "rewards/margins": 0.07902240008115768, "rewards/rejected": 0.05540664866566658, "step": 40 }, { "epoch": 0.9478672985781991, "grad_norm": 15.872142673244408, "learning_rate": 3.126631330646801e-07, "logits/chosen": -14.917936325073242, "logits/rejected": -14.90648078918457, "logps/chosen": -429.6836853027344, "logps/rejected": -480.3504943847656, "loss": 0.6344, "rewards/accuracies": 0.6875, "rewards/chosen": 0.24091288447380066, "rewards/margins": 0.1330389827489853, "rewards/rejected": 0.10787389427423477, "step": 50 }, { "epoch": 1.1374407582938388, "grad_norm": 14.061428605486398, "learning_rate": 2.2891223348923882e-07, "logits/chosen": -14.622962951660156, "logits/rejected": -14.403157234191895, "logps/chosen": -415.7464904785156, "logps/rejected": -441.731201171875, "loss": 0.6063, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.3395090103149414, "rewards/margins": 0.22218124568462372, "rewards/rejected": 0.11732780933380127, "step": 60 }, { "epoch": 1.3270142180094786, "grad_norm": 12.963152293888875, "learning_rate": 1.4754491880085317e-07, "logits/chosen": -14.022384643554688, "logits/rejected": -13.828951835632324, "logps/chosen": -382.23468017578125, "logps/rejected": -418.2818908691406, "loss": 0.6011, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.3396778702735901, "rewards/margins": 0.22157195210456848, "rewards/rejected": 0.118105947971344, "step": 70 }, { "epoch": 1.5165876777251186, "grad_norm": 12.394681314131397, "learning_rate": 7.775827023107834e-08, "logits/chosen": -13.705121040344238, "logits/rejected": -14.205709457397461, "logps/chosen": -367.263427734375, "logps/rejected": -423.30841064453125, "loss": 0.5788, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.36119210720062256, "rewards/margins": 0.3365553319454193, "rewards/rejected": 0.024636749178171158, "step": 80 }, { "epoch": 1.7061611374407581, "grad_norm": 14.456589635016153, "learning_rate": 2.7440387297912122e-08, "logits/chosen": -13.98394775390625, "logits/rejected": -14.161648750305176, "logps/chosen": -399.45458984375, "logps/rejected": -447.48828125, "loss": 0.5766, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3995341658592224, "rewards/margins": 0.34082064032554626, "rewards/rejected": 0.05871356278657913, "step": 90 }, { "epoch": 1.8957345971563981, "grad_norm": 13.44211674398592, "learning_rate": 2.27878296044029e-09, "logits/chosen": -14.160197257995605, "logits/rejected": -14.141824722290039, "logps/chosen": -392.3072509765625, "logps/rejected": -421.604248046875, "loss": 0.5732, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.4029002785682678, "rewards/margins": 0.27652695775032043, "rewards/rejected": 0.1263733208179474, "step": 100 }, { "epoch": 1.8957345971563981, "eval_logits/chosen": -13.292621612548828, "eval_logits/rejected": -12.66539478302002, "eval_logps/chosen": -372.0066833496094, "eval_logps/rejected": -373.4093933105469, "eval_loss": 0.5971602201461792, "eval_rewards/accuracies": 0.6770833134651184, "eval_rewards/chosen": 0.3533553183078766, "eval_rewards/margins": 0.24372106790542603, "eval_rewards/rejected": 0.10963428020477295, "eval_runtime": 20.0916, "eval_samples_per_second": 37.329, "eval_steps_per_second": 1.195, "step": 100 }, { "epoch": 1.971563981042654, "step": 104, "total_flos": 0.0, "train_loss": 0.627926590350958, "train_runtime": 756.4701, "train_samples_per_second": 17.846, "train_steps_per_second": 0.137 } ], "logging_steps": 10, "max_steps": 104, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }