{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 122, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 43.23719376156736, "learning_rate": 3.846153846153846e-08, "logits/chosen": -3.6897170543670654, "logits/rejected": -3.519662618637085, "logps/chosen": -584.1221923828125, "logps/rejected": -1429.938720703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.08, "grad_norm": 36.75962682478825, "learning_rate": 3.8461538461538463e-07, "logits/chosen": -3.6686697006225586, "logits/rejected": -3.5728933811187744, "logps/chosen": -948.7052001953125, "logps/rejected": -1359.1160888671875, "loss": 0.6873, "rewards/accuracies": 0.5902777910232544, "rewards/chosen": 0.0016961859073489904, "rewards/margins": 0.011211401782929897, "rewards/rejected": -0.009515216574072838, "step": 10 }, { "epoch": 0.16, "grad_norm": 26.68557894354543, "learning_rate": 4.949291683053768e-07, "logits/chosen": -3.7271945476531982, "logits/rejected": -3.6335723400115967, "logps/chosen": -889.2982177734375, "logps/rejected": -1387.8297119140625, "loss": 0.5822, "rewards/accuracies": 0.90625, "rewards/chosen": 0.022394303232431412, "rewards/margins": 0.24209070205688477, "rewards/rejected": -0.21969637274742126, "step": 20 }, { "epoch": 0.25, "grad_norm": 24.2663500160121, "learning_rate": 4.70586371748506e-07, "logits/chosen": -3.849905490875244, "logits/rejected": -3.791762590408325, "logps/chosen": -940.0631103515625, "logps/rejected": -1561.598876953125, "loss": 0.3698, "rewards/accuracies": 0.9375, "rewards/chosen": -0.10165198147296906, "rewards/margins": 1.3781466484069824, "rewards/rejected": -1.4797985553741455, "step": 30 }, { "epoch": 0.33, "grad_norm": 20.782644086151006, "learning_rate": 4.280458575653296e-07, "logits/chosen": -4.025510311126709, "logits/rejected": -3.9784233570098877, "logps/chosen": -968.7717895507812, "logps/rejected": -1640.0823974609375, "loss": 0.2677, "rewards/accuracies": 0.875, "rewards/chosen": -0.5273348093032837, "rewards/margins": 2.6067256927490234, "rewards/rejected": -3.1340603828430176, "step": 40 }, { "epoch": 0.41, "grad_norm": 18.40887250285513, "learning_rate": 3.7081709127108767e-07, "logits/chosen": -4.091545104980469, "logits/rejected": -4.069024085998535, "logps/chosen": -968.97900390625, "logps/rejected": -1849.377197265625, "loss": 0.1851, "rewards/accuracies": 0.96875, "rewards/chosen": -0.7245203256607056, "rewards/margins": 3.8392529487609863, "rewards/rejected": -4.5637736320495605, "step": 50 }, { "epoch": 0.49, "grad_norm": 30.82289433036107, "learning_rate": 3.0362127536287636e-07, "logits/chosen": -4.057796001434326, "logits/rejected": -4.068426132202148, "logps/chosen": -992.41796875, "logps/rejected": -1906.330810546875, "loss": 0.1831, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.9383566975593567, "rewards/margins": 4.514256477355957, "rewards/rejected": -5.452613353729248, "step": 60 }, { "epoch": 0.57, "grad_norm": 14.214509503993076, "learning_rate": 2.3200186419770823e-07, "logits/chosen": -4.036534309387207, "logits/rejected": -4.083151817321777, "logps/chosen": -1086.2425537109375, "logps/rejected": -1876.317138671875, "loss": 0.1335, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.1720283031463623, "rewards/margins": 4.419107437133789, "rewards/rejected": -5.591135501861572, "step": 70 }, { "epoch": 0.66, "grad_norm": 16.47757248085571, "learning_rate": 1.6186724554503237e-07, "logits/chosen": -4.064330101013184, "logits/rejected": -4.05181360244751, "logps/chosen": -1022.8089599609375, "logps/rejected": -2001.3753662109375, "loss": 0.1218, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9989339113235474, "rewards/margins": 5.62686014175415, "rewards/rejected": -6.625794410705566, "step": 80 }, { "epoch": 0.74, "grad_norm": 18.42221667739246, "learning_rate": 9.900331622138063e-08, "logits/chosen": -4.04154109954834, "logits/rejected": -4.0591230392456055, "logps/chosen": -946.9400634765625, "logps/rejected": -2018.881591796875, "loss": 0.1367, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.8327393531799316, "rewards/margins": 5.7661213874816895, "rewards/rejected": -6.598860263824463, "step": 90 }, { "epoch": 0.82, "grad_norm": 27.496315989869185, "learning_rate": 4.859616286322094e-08, "logits/chosen": -4.036691188812256, "logits/rejected": -4.036020755767822, "logps/chosen": -1030.328369140625, "logps/rejected": -2058.629638671875, "loss": 0.1183, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.1028516292572021, "rewards/margins": 5.824145317077637, "rewards/rejected": -6.92699670791626, "step": 100 }, { "epoch": 0.82, "eval_logits/chosen": -5.204982280731201, "eval_logits/rejected": -4.0475640296936035, "eval_logps/chosen": -89.41363525390625, "eval_logps/rejected": -486.17486572265625, "eval_loss": 0.4731297492980957, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": -0.1248578280210495, "eval_rewards/margins": 0.583656907081604, "eval_rewards/rejected": -0.7085147500038147, "eval_runtime": 5.5516, "eval_samples_per_second": 0.721, "eval_steps_per_second": 0.18, "step": 100 }, { "epoch": 0.9, "grad_norm": 16.740929534156027, "learning_rate": 1.4804225250339281e-08, "logits/chosen": -4.010983467102051, "logits/rejected": -4.0438127517700195, "logps/chosen": -967.0818481445312, "logps/rejected": -1875.3775634765625, "loss": 0.1294, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.9154699444770813, "rewards/margins": 4.917794704437256, "rewards/rejected": -5.8332648277282715, "step": 110 }, { "epoch": 0.98, "grad_norm": 17.270638468872217, "learning_rate": 4.152374292708538e-10, "logits/chosen": -4.0482587814331055, "logits/rejected": -4.022861480712891, "logps/chosen": -1003.4119262695312, "logps/rejected": -2065.379638671875, "loss": 0.1025, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.9699057340621948, "rewards/margins": 5.419854164123535, "rewards/rejected": -6.3897600173950195, "step": 120 }, { "epoch": 1.0, "step": 122, "total_flos": 0.0, "train_loss": 0.24888706256131657, "train_runtime": 2705.2939, "train_samples_per_second": 2.886, "train_steps_per_second": 0.045 } ], "logging_steps": 10, "max_steps": 122, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }