{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.971563981042654, "eval_steps": 128, "global_step": 104, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018957345971563982, "grad_norm": 191.735526981635, "learning_rate": 4.545454545454545e-08, "logits/chosen": 117.53560638427734, "logits/rejected": 126.8960952758789, "logps/chosen": -335.40118408203125, "logps/rejected": -439.16552734375, "loss": 0.8466, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.1895734597156398, "grad_norm": 215.27789653097236, "learning_rate": 4.545454545454545e-07, "logits/chosen": 135.01699829101562, "logits/rejected": 138.35781860351562, "logps/chosen": -395.863525390625, "logps/rejected": -439.87109375, "loss": 0.7497, "rewards/accuracies": 0.4791666567325592, "rewards/chosen": 0.0066477167420089245, "rewards/margins": 0.033824387937784195, "rewards/rejected": -0.02717665769159794, "step": 10 }, { "epoch": 0.3791469194312796, "grad_norm": 151.6525661192009, "learning_rate": 4.885348141000122e-07, "logits/chosen": 121.6771240234375, "logits/rejected": 125.45533752441406, "logps/chosen": -372.70947265625, "logps/rejected": -423.9148864746094, "loss": 0.66, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.01487281359732151, "rewards/margins": 0.18130120635032654, "rewards/rejected": -0.1961739957332611, "step": 20 }, { "epoch": 0.5687203791469194, "grad_norm": 127.760794507037, "learning_rate": 4.5025027361734613e-07, "logits/chosen": 140.84371948242188, "logits/rejected": 134.668701171875, "logps/chosen": -408.40216064453125, "logps/rejected": -448.96710205078125, "loss": 0.6211, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7968470454216003, "rewards/margins": 0.6513954997062683, "rewards/rejected": -1.4482426643371582, "step": 30 }, { "epoch": 0.7582938388625592, "grad_norm": 117.3200007673402, "learning_rate": 3.893311157806091e-07, "logits/chosen": 123.24369049072266, "logits/rejected": 112.3370361328125, "logps/chosen": -381.4453125, "logps/rejected": -402.53936767578125, "loss": 0.6035, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3625555038452148, "rewards/margins": 0.8706506490707397, "rewards/rejected": -2.233206033706665, "step": 40 }, { "epoch": 0.9478672985781991, "grad_norm": 205.0988018258791, "learning_rate": 3.126631330646801e-07, "logits/chosen": 140.59500122070312, "logits/rejected": 144.99911499023438, "logps/chosen": -465.456298828125, "logps/rejected": -541.2003173828125, "loss": 0.5469, "rewards/accuracies": 0.75, "rewards/chosen": -2.3156235218048096, "rewards/margins": 0.9280710220336914, "rewards/rejected": -3.243694305419922, "step": 50 }, { "epoch": 1.1374407582938388, "grad_norm": 103.63573519510389, "learning_rate": 2.2891223348923882e-07, "logits/chosen": 132.75270080566406, "logits/rejected": 136.41439819335938, "logps/chosen": -441.16741943359375, "logps/rejected": -510.4600524902344, "loss": 0.463, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.1580872535705566, "rewards/margins": 1.546640157699585, "rewards/rejected": -3.7047271728515625, "step": 60 }, { "epoch": 1.3270142180094786, "grad_norm": 88.9555531672405, "learning_rate": 1.4754491880085317e-07, "logits/chosen": 127.8622055053711, "logits/rejected": 129.76100158691406, "logps/chosen": -398.2987365722656, "logps/rejected": -478.58526611328125, "loss": 0.4087, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.5577425956726074, "rewards/margins": 1.6958301067352295, "rewards/rejected": -3.253572940826416, "step": 70 }, { "epoch": 1.5165876777251186, "grad_norm": 91.77009057587127, "learning_rate": 7.775827023107834e-08, "logits/chosen": 115.83001708984375, "logits/rejected": 132.09730529785156, "logps/chosen": -392.9648742675781, "logps/rejected": -491.1802673339844, "loss": 0.3483, "rewards/accuracies": 0.875, "rewards/chosen": -1.9656927585601807, "rewards/margins": 1.931434988975525, "rewards/rejected": -3.897127628326416, "step": 80 }, { "epoch": 1.7061611374407581, "grad_norm": 83.21809324441205, "learning_rate": 2.7440387297912122e-08, "logits/chosen": 115.92982482910156, "logits/rejected": 129.11094665527344, "logps/chosen": -424.06640625, "logps/rejected": -514.6354370117188, "loss": 0.3396, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.8817684650421143, "rewards/margins": 1.9659159183502197, "rewards/rejected": -3.847684383392334, "step": 90 }, { "epoch": 1.8957345971563981, "grad_norm": 71.96649220747696, "learning_rate": 2.27878296044029e-09, "logits/chosen": 122.7137680053711, "logits/rejected": 122.6277084350586, "logps/chosen": -415.48455810546875, "logps/rejected": -488.2542419433594, "loss": 0.3434, "rewards/accuracies": 0.875, "rewards/chosen": -1.6754734516143799, "rewards/margins": 1.8990375995635986, "rewards/rejected": -3.5745112895965576, "step": 100 }, { "epoch": 1.971563981042654, "step": 104, "total_flos": 0.0, "train_loss": 0.5053261174605443, "train_runtime": 2173.3813, "train_samples_per_second": 6.212, "train_steps_per_second": 0.048 } ], "logging_steps": 10, "max_steps": 104, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }