{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 500, "global_step": 156, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0064, "grad_norm": 1341.8773394764246, "learning_rate": 3.125e-09, "logits/chosen": -3.9499800205230713, "logits/rejected": -4.237819194793701, "logps/chosen": -300.693115234375, "logps/rejected": -249.96307373046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.064, "grad_norm": 1342.2810836893796, "learning_rate": 3.125e-08, "logits/chosen": -4.129705905914307, "logits/rejected": -4.352028846740723, "logps/chosen": -351.5079650878906, "logps/rejected": -308.8138427734375, "loss": 0.7326, "rewards/accuracies": 0.3680555522441864, "rewards/chosen": -0.04078766331076622, "rewards/margins": -0.11378024518489838, "rewards/rejected": 0.07299260050058365, "step": 10 }, { "epoch": 0.128, "grad_norm": 1252.3965895279962, "learning_rate": 4.9899357349880975e-08, "logits/chosen": -4.194980144500732, "logits/rejected": -4.382790565490723, "logps/chosen": -334.9039001464844, "logps/rejected": -293.8416748046875, "loss": 0.683, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.22410114109516144, "rewards/margins": 0.11712154000997543, "rewards/rejected": 0.10697959363460541, "step": 20 }, { "epoch": 0.192, "grad_norm": 904.3776918610464, "learning_rate": 4.877641290737884e-08, "logits/chosen": -4.230466365814209, "logits/rejected": -4.363996505737305, "logps/chosen": -327.71453857421875, "logps/rejected": -295.3287658691406, "loss": 0.5498, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.9708820581436157, "rewards/margins": 0.5084127187728882, "rewards/rejected": 0.46246927976608276, "step": 30 }, { "epoch": 0.256, "grad_norm": 894.6327423356746, "learning_rate": 4.646121984004665e-08, "logits/chosen": -4.1493096351623535, "logits/rejected": -4.351648807525635, "logps/chosen": -330.09368896484375, "logps/rejected": -288.2974853515625, "loss": 0.4125, "rewards/accuracies": 0.8218749761581421, "rewards/chosen": 1.9414455890655518, "rewards/margins": 1.1434320211410522, "rewards/rejected": 0.7980135083198547, "step": 40 }, { "epoch": 0.32, "grad_norm": 706.4309708182283, "learning_rate": 4.3069871595684784e-08, "logits/chosen": -4.244365215301514, "logits/rejected": -4.423664093017578, "logps/chosen": -329.6412353515625, "logps/rejected": -291.22528076171875, "loss": 0.3694, "rewards/accuracies": 0.840624988079071, "rewards/chosen": 2.6057987213134766, "rewards/margins": 1.537340521812439, "rewards/rejected": 1.068458080291748, "step": 50 }, { "epoch": 0.384, "grad_norm": 679.6447682422123, "learning_rate": 3.8772424536302564e-08, "logits/chosen": -4.262530326843262, "logits/rejected": -4.4340620040893555, "logps/chosen": -320.7197570800781, "logps/rejected": -291.15264892578125, "loss": 0.3459, "rewards/accuracies": 0.8343750238418579, "rewards/chosen": 3.022132158279419, "rewards/margins": 1.8344866037368774, "rewards/rejected": 1.187645673751831, "step": 60 }, { "epoch": 0.448, "grad_norm": 600.9568341116722, "learning_rate": 3.378437060203357e-08, "logits/chosen": -4.188047885894775, "logits/rejected": -4.377224445343018, "logps/chosen": -320.23345947265625, "logps/rejected": -288.5027770996094, "loss": 0.3189, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 3.3037331104278564, "rewards/margins": 2.1254096031188965, "rewards/rejected": 1.1783230304718018, "step": 70 }, { "epoch": 0.512, "grad_norm": 654.7049863576665, "learning_rate": 2.8355831645441387e-08, "logits/chosen": -4.0522565841674805, "logits/rejected": -4.341280937194824, "logps/chosen": -345.8344421386719, "logps/rejected": -307.4328918457031, "loss": 0.3105, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7246456146240234, "rewards/margins": 2.5337729454040527, "rewards/rejected": 1.1908724308013916, "step": 80 }, { "epoch": 0.576, "grad_norm": 638.1282144295093, "learning_rate": 2.2759017277414164e-08, "logits/chosen": -4.180428504943848, "logits/rejected": -4.390549659729004, "logps/chosen": -332.82275390625, "logps/rejected": -295.1810607910156, "loss": 0.3099, "rewards/accuracies": 0.875, "rewards/chosen": 3.2552542686462402, "rewards/margins": 2.3172354698181152, "rewards/rejected": 0.9380186796188354, "step": 90 }, { "epoch": 0.64, "grad_norm": 680.3285346474286, "learning_rate": 1.7274575140626317e-08, "logits/chosen": -4.167009353637695, "logits/rejected": -4.386021614074707, "logps/chosen": -330.049560546875, "logps/rejected": -285.8011169433594, "loss": 0.3123, "rewards/accuracies": 0.8843749761581421, "rewards/chosen": 3.6218514442443848, "rewards/margins": 2.723836898803711, "rewards/rejected": 0.8980148434638977, "step": 100 }, { "epoch": 0.704, "grad_norm": 616.2712616857408, "learning_rate": 1.217751806485235e-08, "logits/chosen": -4.145500183105469, "logits/rejected": -4.386542320251465, "logps/chosen": -311.7583923339844, "logps/rejected": -276.3233947753906, "loss": 0.3022, "rewards/accuracies": 0.8843749761581421, "rewards/chosen": 3.584909439086914, "rewards/margins": 2.6118006706237793, "rewards/rejected": 0.9731090664863586, "step": 110 }, { "epoch": 0.768, "grad_norm": 649.1888991009114, "learning_rate": 7.723433775328384e-09, "logits/chosen": -4.141805171966553, "logits/rejected": -4.35054874420166, "logps/chosen": -325.5559997558594, "logps/rejected": -280.5980529785156, "loss": 0.3033, "rewards/accuracies": 0.8656250238418579, "rewards/chosen": 3.6838138103485107, "rewards/margins": 2.6417319774627686, "rewards/rejected": 1.0420820713043213, "step": 120 }, { "epoch": 0.832, "grad_norm": 747.4298760038148, "learning_rate": 4.135668656967433e-09, "logits/chosen": -4.228358268737793, "logits/rejected": -4.38976526260376, "logps/chosen": -331.02642822265625, "logps/rejected": -286.7439880371094, "loss": 0.3064, "rewards/accuracies": 0.875, "rewards/chosen": 3.7264277935028076, "rewards/margins": 2.6530587673187256, "rewards/rejected": 1.073369026184082, "step": 130 }, { "epoch": 0.896, "grad_norm": 697.5841535989922, "learning_rate": 1.5941282340065698e-09, "logits/chosen": -4.18213415145874, "logits/rejected": -4.3970947265625, "logps/chosen": -332.56500244140625, "logps/rejected": -303.63543701171875, "loss": 0.3069, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 3.5617058277130127, "rewards/margins": 2.6050186157226562, "rewards/rejected": 0.9566874504089355, "step": 140 }, { "epoch": 0.96, "grad_norm": 567.1610784183449, "learning_rate": 2.262559558016325e-10, "logits/chosen": -4.118973731994629, "logits/rejected": -4.348026752471924, "logps/chosen": -339.0107116699219, "logps/rejected": -295.09564208984375, "loss": 0.3078, "rewards/accuracies": 0.8656250238418579, "rewards/chosen": 3.7477049827575684, "rewards/margins": 2.61022686958313, "rewards/rejected": 1.1374781131744385, "step": 150 }, { "epoch": 0.9984, "step": 156, "total_flos": 0.0, "train_loss": 0.3884877807054764, "train_runtime": 4677.6403, "train_samples_per_second": 8.539, "train_steps_per_second": 0.033 } ], "logging_steps": 10, "max_steps": 156, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }