{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 122, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 36.349710133839, "learning_rate": 3.846153846153846e-08, "logits/chosen": -3.5315005779266357, "logits/rejected": -3.440955638885498, "logps/chosen": -912.1570434570312, "logps/rejected": -1378.036376953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.08, "grad_norm": 41.12013014177843, "learning_rate": 3.8461538461538463e-07, "logits/chosen": -3.55020809173584, "logits/rejected": -3.4724764823913574, "logps/chosen": -894.586181640625, "logps/rejected": -1449.21484375, "loss": 0.6888, "rewards/accuracies": 0.5069444179534912, "rewards/chosen": 0.0002599477011244744, "rewards/margins": 0.008236742578446865, "rewards/rejected": -0.007976794615387917, "step": 10 }, { "epoch": 0.16, "grad_norm": 29.704748896674143, "learning_rate": 4.949291683053768e-07, "logits/chosen": -3.5908989906311035, "logits/rejected": -3.5577595233917236, "logps/chosen": -912.923828125, "logps/rejected": -1338.394775390625, "loss": 0.5791, "rewards/accuracies": 0.90625, "rewards/chosen": 0.04868435114622116, "rewards/margins": 0.28125035762786865, "rewards/rejected": -0.2325659692287445, "step": 20 }, { "epoch": 0.25, "grad_norm": 26.273281316637295, "learning_rate": 4.70586371748506e-07, "logits/chosen": -3.810521364212036, "logits/rejected": -3.7334792613983154, "logps/chosen": -955.4530029296875, "logps/rejected": -1488.5167236328125, "loss": 0.366, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.09059515595436096, "rewards/margins": 1.322347640991211, "rewards/rejected": -1.412942886352539, "step": 30 }, { "epoch": 0.33, "grad_norm": 43.496160834466956, "learning_rate": 4.280458575653296e-07, "logits/chosen": -3.996204376220703, "logits/rejected": -3.956129789352417, "logps/chosen": -989.1363525390625, "logps/rejected": -1658.587158203125, "loss": 0.3256, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.5707166194915771, "rewards/margins": 2.6343164443969727, "rewards/rejected": -3.20503306388855, "step": 40 }, { "epoch": 0.41, "grad_norm": 44.2304132089698, "learning_rate": 3.7081709127108767e-07, "logits/chosen": -4.008645057678223, "logits/rejected": -3.9912617206573486, "logps/chosen": -1022.1027221679688, "logps/rejected": -1825.446533203125, "loss": 0.1878, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7217316031455994, "rewards/margins": 3.8372483253479004, "rewards/rejected": -4.5589799880981445, "step": 50 }, { "epoch": 0.49, "grad_norm": 17.524621217634877, "learning_rate": 3.0362127536287636e-07, "logits/chosen": -4.027331352233887, "logits/rejected": -4.025083065032959, "logps/chosen": -1039.8206787109375, "logps/rejected": -1963.096435546875, "loss": 0.1549, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.7357276082038879, "rewards/margins": 4.72170877456665, "rewards/rejected": -5.457436561584473, "step": 60 }, { "epoch": 0.57, "grad_norm": 14.62238967073387, "learning_rate": 2.3200186419770823e-07, "logits/chosen": -3.992643356323242, "logits/rejected": -3.9795494079589844, "logps/chosen": -954.447265625, "logps/rejected": -1917.7783203125, "loss": 0.153, "rewards/accuracies": 0.96875, "rewards/chosen": -0.6473932266235352, "rewards/margins": 4.5014448165893555, "rewards/rejected": -5.148838043212891, "step": 70 }, { "epoch": 0.66, "grad_norm": 26.239628326203597, "learning_rate": 1.6186724554503237e-07, "logits/chosen": -3.955888032913208, "logits/rejected": -3.937206745147705, "logps/chosen": -976.7513427734375, "logps/rejected": -1973.913818359375, "loss": 0.1099, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.7037805318832397, "rewards/margins": 5.219768524169922, "rewards/rejected": -5.923549175262451, "step": 80 }, { "epoch": 0.74, "grad_norm": 11.522776918137943, "learning_rate": 9.900331622138063e-08, "logits/chosen": -3.967766523361206, "logits/rejected": -3.948270797729492, "logps/chosen": -1030.423583984375, "logps/rejected": -2060.10205078125, "loss": 0.1077, "rewards/accuracies": 0.96875, "rewards/chosen": -0.9134801626205444, "rewards/margins": 5.445040225982666, "rewards/rejected": -6.358519554138184, "step": 90 }, { "epoch": 0.82, "grad_norm": 17.331411142935814, "learning_rate": 4.859616286322094e-08, "logits/chosen": -3.9533779621124268, "logits/rejected": -3.9539833068847656, "logps/chosen": -1035.493896484375, "logps/rejected": -1998.699951171875, "loss": 0.1251, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.9830253720283508, "rewards/margins": 5.410158634185791, "rewards/rejected": -6.393183708190918, "step": 100 }, { "epoch": 0.82, "eval_logits/chosen": -4.367298126220703, "eval_logits/rejected": -3.9096977710723877, "eval_logps/chosen": -250.05014038085938, "eval_logps/rejected": -632.3324584960938, "eval_loss": 0.46643248200416565, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": -0.30725225806236267, "eval_rewards/margins": 0.5298991203308105, "eval_rewards/rejected": -0.8371513485908508, "eval_runtime": 3.458, "eval_samples_per_second": 3.47, "eval_steps_per_second": 0.289, "step": 100 }, { "epoch": 0.9, "grad_norm": 16.867635583735968, "learning_rate": 1.4804225250339281e-08, "logits/chosen": -3.932652235031128, "logits/rejected": -3.946476459503174, "logps/chosen": -941.6383056640625, "logps/rejected": -2050.407470703125, "loss": 0.1244, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.7304863333702087, "rewards/margins": 6.1875152587890625, "rewards/rejected": -6.918001651763916, "step": 110 }, { "epoch": 0.98, "grad_norm": 33.91302977437583, "learning_rate": 4.152374292708538e-10, "logits/chosen": -3.9486804008483887, "logits/rejected": -3.9186534881591797, "logps/chosen": -952.8955078125, "logps/rejected": -1983.6126708984375, "loss": 0.1104, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.8376309275627136, "rewards/margins": 5.359804153442383, "rewards/rejected": -6.197434902191162, "step": 120 }, { "epoch": 1.0, "step": 122, "total_flos": 0.0, "train_loss": 0.2501322243545876, "train_runtime": 1891.0061, "train_samples_per_second": 4.125, "train_steps_per_second": 0.065 } ], "logging_steps": 10, "max_steps": 122, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }