{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 185, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 171.98892218238854, "learning_rate": 2.6315789473684208e-08, "logits/chosen": -0.1266070306301117, "logits/rejected": 0.7204304933547974, "logps/chosen": -319.01666259765625, "logps/rejected": -252.47039794921875, "loss": 0.6916, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.05, "grad_norm": 158.2614639136714, "learning_rate": 2.631578947368421e-07, "logits/chosen": -0.3861861824989319, "logits/rejected": 0.33749374747276306, "logps/chosen": -266.4891052246094, "logps/rejected": -224.11000061035156, "loss": 0.6758, "rewards/accuracies": 0.5520833134651184, "rewards/chosen": -0.03102089650928974, "rewards/margins": 0.034922875463962555, "rewards/rejected": -0.06594377011060715, "step": 10 }, { "epoch": 0.11, "grad_norm": 104.41587535161224, "learning_rate": 4.999552306674344e-07, "logits/chosen": -0.24374540150165558, "logits/rejected": 0.8117060661315918, "logps/chosen": -289.02911376953125, "logps/rejected": -250.653564453125, "loss": 0.478, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.4349571764469147, "rewards/margins": 1.263426661491394, "rewards/rejected": -1.6983836889266968, "step": 20 }, { "epoch": 0.16, "grad_norm": 98.86884631406178, "learning_rate": 4.946022852363932e-07, "logits/chosen": -0.2871348261833191, "logits/rejected": 0.6740838289260864, "logps/chosen": -281.1429748535156, "logps/rejected": -271.7496032714844, "loss": 0.4067, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6509501934051514, "rewards/margins": 2.906687021255493, "rewards/rejected": -4.5576372146606445, "step": 30 }, { "epoch": 0.22, "grad_norm": 95.68866413164287, "learning_rate": 4.805146507594034e-07, "logits/chosen": -0.5090769529342651, "logits/rejected": 0.5341213345527649, "logps/chosen": -283.4405517578125, "logps/rejected": -268.97686767578125, "loss": 0.371, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.4618725776672363, "rewards/margins": 3.340365171432495, "rewards/rejected": -5.802238464355469, "step": 40 }, { "epoch": 0.27, "grad_norm": 103.23233983894589, "learning_rate": 4.581953932909403e-07, "logits/chosen": -0.4626421332359314, "logits/rejected": 0.5320831537246704, "logps/chosen": -313.1284484863281, "logps/rejected": -299.7115173339844, "loss": 0.335, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.947516441345215, "rewards/margins": 3.4062907695770264, "rewards/rejected": -6.353806495666504, "step": 50 }, { "epoch": 0.32, "grad_norm": 91.31338474042249, "learning_rate": 4.284415281717847e-07, "logits/chosen": -0.3830726444721222, "logits/rejected": 0.7034914493560791, "logps/chosen": -302.44549560546875, "logps/rejected": -295.2908020019531, "loss": 0.2941, "rewards/accuracies": 0.890625, "rewards/chosen": -2.427272081375122, "rewards/margins": 3.3501389026641846, "rewards/rejected": -5.777410507202148, "step": 60 }, { "epoch": 0.38, "grad_norm": 108.21771645007362, "learning_rate": 3.923155588020165e-07, "logits/chosen": -0.050761766731739044, "logits/rejected": 1.1738256216049194, "logps/chosen": -279.0822448730469, "logps/rejected": -271.3674011230469, "loss": 0.3118, "rewards/accuracies": 0.859375, "rewards/chosen": -2.9542946815490723, "rewards/margins": 3.31215238571167, "rewards/rejected": -6.266446590423584, "step": 70 }, { "epoch": 0.43, "grad_norm": 110.48344039822193, "learning_rate": 3.511075348989692e-07, "logits/chosen": -0.02379416488111019, "logits/rejected": 0.9985305666923523, "logps/chosen": -291.3994140625, "logps/rejected": -278.81207275390625, "loss": 0.3145, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.9188703298568726, "rewards/margins": 3.239673614501953, "rewards/rejected": -5.158544063568115, "step": 80 }, { "epoch": 0.49, "grad_norm": 94.44668679211257, "learning_rate": 3.062889851306735e-07, "logits/chosen": 0.15241345763206482, "logits/rejected": 1.204730749130249, "logps/chosen": -285.8970947265625, "logps/rejected": -274.2763671875, "loss": 0.3256, "rewards/accuracies": 0.859375, "rewards/chosen": -2.3829667568206787, "rewards/margins": 3.3176727294921875, "rewards/rejected": -5.700639724731445, "step": 90 }, { "epoch": 0.54, "grad_norm": 72.25131318661623, "learning_rate": 2.594603691794176e-07, "logits/chosen": 0.017316246405243874, "logits/rejected": 1.112657070159912, "logps/chosen": -291.90631103515625, "logps/rejected": -277.5140686035156, "loss": 0.2907, "rewards/accuracies": 0.846875011920929, "rewards/chosen": -2.7051641941070557, "rewards/margins": 3.0991756916046143, "rewards/rejected": -5.804339408874512, "step": 100 }, { "epoch": 0.59, "grad_norm": 106.17479973453032, "learning_rate": 2.1229392570965654e-07, "logits/chosen": 0.5481065511703491, "logits/rejected": 1.4057379961013794, "logps/chosen": -290.0019226074219, "logps/rejected": -288.4178161621094, "loss": 0.2795, "rewards/accuracies": 0.859375, "rewards/chosen": -3.1080322265625, "rewards/margins": 3.114968776702881, "rewards/rejected": -6.223001003265381, "step": 110 }, { "epoch": 0.65, "grad_norm": 88.25415485320248, "learning_rate": 1.6647395712565254e-07, "logits/chosen": 0.10530638694763184, "logits/rejected": 1.3136330842971802, "logps/chosen": -303.7025451660156, "logps/rejected": -291.4312438964844, "loss": 0.3024, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.87852144241333, "rewards/margins": 3.439791440963745, "rewards/rejected": -6.318312644958496, "step": 120 }, { "epoch": 0.7, "grad_norm": 81.41509800140894, "learning_rate": 1.2363668353585485e-07, "logits/chosen": 0.025721266865730286, "logits/rejected": 1.1706856489181519, "logps/chosen": -291.2774963378906, "logps/rejected": -280.7757873535156, "loss": 0.2712, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -2.8241302967071533, "rewards/margins": 3.6137948036193848, "rewards/rejected": -6.437924385070801, "step": 130 }, { "epoch": 0.76, "grad_norm": 76.56961564493653, "learning_rate": 8.53118137245516e-08, "logits/chosen": 0.24798288941383362, "logits/rejected": 1.3128881454467773, "logps/chosen": -298.71783447265625, "logps/rejected": -297.16790771484375, "loss": 0.2607, "rewards/accuracies": 0.8656250238418579, "rewards/chosen": -2.95615553855896, "rewards/margins": 3.7294158935546875, "rewards/rejected": -6.685571193695068, "step": 140 }, { "epoch": 0.81, "grad_norm": 101.60579173655283, "learning_rate": 5.2867919617408553e-08, "logits/chosen": 0.16610342264175415, "logits/rejected": 1.297738790512085, "logps/chosen": -296.17230224609375, "logps/rejected": -285.56707763671875, "loss": 0.2777, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -2.7571194171905518, "rewards/margins": 3.536668062210083, "rewards/rejected": -6.293786525726318, "step": 150 }, { "epoch": 0.86, "grad_norm": 92.310593955402, "learning_rate": 2.7463564905650853e-08, "logits/chosen": 0.06046704202890396, "logits/rejected": 1.0854153633117676, "logps/chosen": -297.1445007324219, "logps/rejected": -291.33868408203125, "loss": 0.2684, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.6816515922546387, "rewards/margins": 3.552661418914795, "rewards/rejected": -6.234313011169434, "step": 160 }, { "epoch": 0.92, "grad_norm": 83.71834684366553, "learning_rate": 1.0005933014019307e-08, "logits/chosen": 0.15604642033576965, "logits/rejected": 1.338841199874878, "logps/chosen": -298.0588684082031, "logps/rejected": -293.54638671875, "loss": 0.2745, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -3.0692405700683594, "rewards/margins": 3.527927875518799, "rewards/rejected": -6.59716796875, "step": 170 }, { "epoch": 0.97, "grad_norm": 78.86616344216218, "learning_rate": 1.1184317978602808e-09, "logits/chosen": -0.07575028389692307, "logits/rejected": 1.0216057300567627, "logps/chosen": -288.5888366699219, "logps/rejected": -287.2474670410156, "loss": 0.3031, "rewards/accuracies": 0.90625, "rewards/chosen": -2.831172466278076, "rewards/margins": 3.852785587310791, "rewards/rejected": -6.683958530426025, "step": 180 }, { "epoch": 1.0, "step": 185, "total_flos": 0.0, "train_loss": 0.33391942269093283, "train_runtime": 5319.9853, "train_samples_per_second": 8.891, "train_steps_per_second": 0.035 } ], "logging_steps": 10, "max_steps": 185, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }