{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9905213270142181, "eval_steps": 100, "global_step": 210, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009478672985781991, "grad_norm": 31.999091462105085, "learning_rate": 2.3809523809523807e-08, "logits/chosen": -1.3901093006134033, "logits/rejected": -1.3982200622558594, "logps/chosen": -439.7777099609375, "logps/rejected": -517.9480590820312, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0947867298578199, "grad_norm": 31.526233424514775, "learning_rate": 2.3809523809523806e-07, "logits/chosen": -1.1840474605560303, "logits/rejected": -1.2023670673370361, "logps/chosen": -318.02642822265625, "logps/rejected": -345.5296325683594, "loss": 0.6944, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": 0.0005787869449704885, "rewards/margins": 0.002175838453695178, "rewards/rejected": -0.0015970510430634022, "step": 10 }, { "epoch": 0.1895734597156398, "grad_norm": 29.460026918462425, "learning_rate": 4.761904761904761e-07, "logits/chosen": -1.2405064105987549, "logits/rejected": -1.2777436971664429, "logps/chosen": -325.66754150390625, "logps/rejected": -444.10162353515625, "loss": 0.6877, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0043294974602758884, "rewards/margins": 0.01897953823208809, "rewards/rejected": -0.014650041237473488, "step": 20 }, { "epoch": 0.2843601895734597, "grad_norm": 29.858845825573614, "learning_rate": 4.972077065562821e-07, "logits/chosen": -1.2667722702026367, "logits/rejected": -1.2541126012802124, "logps/chosen": -379.63861083984375, "logps/rejected": -386.56842041015625, "loss": 0.6743, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.015792345628142357, "rewards/margins": 0.05691219121217728, "rewards/rejected": -0.04111984372138977, "step": 30 }, { "epoch": 0.3791469194312796, "grad_norm": 28.04723285384618, "learning_rate": 4.876353872369572e-07, "logits/chosen": -1.2606487274169922, "logits/rejected": -1.2776422500610352, "logps/chosen": -330.6627197265625, "logps/rejected": -432.9537658691406, "loss": 0.6487, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.060253970324993134, "rewards/margins": 0.0897565707564354, "rewards/rejected": -0.029502594843506813, "step": 40 }, { "epoch": 0.47393364928909953, "grad_norm": 29.04625829417335, "learning_rate": 4.715123776075336e-07, "logits/chosen": -1.2534068822860718, "logits/rejected": -1.1865966320037842, "logps/chosen": -307.3436279296875, "logps/rejected": -284.7574462890625, "loss": 0.6288, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.020858120173215866, "rewards/margins": 0.09515110403299332, "rewards/rejected": -0.07429297268390656, "step": 50 }, { "epoch": 0.5687203791469194, "grad_norm": 28.336384421687487, "learning_rate": 4.492831268057306e-07, "logits/chosen": -1.2906352281570435, "logits/rejected": -1.3070456981658936, "logps/chosen": -324.20489501953125, "logps/rejected": -374.00274658203125, "loss": 0.5761, "rewards/accuracies": 0.75, "rewards/chosen": 0.08636633306741714, "rewards/margins": 0.28788790106773376, "rewards/rejected": -0.20152156054973602, "step": 60 }, { "epoch": 0.6635071090047393, "grad_norm": 24.885818315067937, "learning_rate": 4.2156040946718343e-07, "logits/chosen": -1.330127477645874, "logits/rejected": -1.276735782623291, "logps/chosen": -344.3301086425781, "logps/rejected": -326.61151123046875, "loss": 0.5866, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.132747620344162, "rewards/margins": 0.28044360876083374, "rewards/rejected": -0.14769601821899414, "step": 70 }, { "epoch": 0.7582938388625592, "grad_norm": 25.726269521197576, "learning_rate": 3.891084338941603e-07, "logits/chosen": -1.2656139135360718, "logits/rejected": -1.2805755138397217, "logps/chosen": -338.24822998046875, "logps/rejected": -349.0716857910156, "loss": 0.5703, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.06792763620615005, "rewards/margins": 0.5581067204475403, "rewards/rejected": -0.4901791214942932, "step": 80 }, { "epoch": 0.8530805687203792, "grad_norm": 23.99960856337552, "learning_rate": 3.528217757826529e-07, "logits/chosen": -1.303812026977539, "logits/rejected": -1.2814157009124756, "logps/chosen": -288.6119689941406, "logps/rejected": -334.3503723144531, "loss": 0.5343, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.0316891223192215, "rewards/margins": 0.408654123544693, "rewards/rejected": -0.37696507573127747, "step": 90 }, { "epoch": 0.9478672985781991, "grad_norm": 24.512409938030444, "learning_rate": 3.137007182236637e-07, "logits/chosen": -1.3553069829940796, "logits/rejected": -1.3930418491363525, "logps/chosen": -394.2057800292969, "logps/rejected": -524.2303466796875, "loss": 0.528, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.024631375446915627, "rewards/margins": 0.491068035364151, "rewards/rejected": -0.5156994462013245, "step": 100 }, { "epoch": 0.9478672985781991, "eval_logits/chosen": -1.3159226179122925, "eval_logits/rejected": -1.3169375658035278, "eval_logps/chosen": -320.6200866699219, "eval_logps/rejected": -330.54302978515625, "eval_loss": 0.5266835689544678, "eval_rewards/accuracies": 0.7604166865348816, "eval_rewards/chosen": -0.025471201166510582, "eval_rewards/margins": 0.5800454020500183, "eval_rewards/rejected": -0.6055166125297546, "eval_runtime": 37.7626, "eval_samples_per_second": 19.861, "eval_steps_per_second": 0.636, "step": 100 }, { "epoch": 1.042654028436019, "grad_norm": 18.302035950206864, "learning_rate": 2.728236777596621e-07, "logits/chosen": -1.3288469314575195, "logits/rejected": -1.3007264137268066, "logps/chosen": -330.83392333984375, "logps/rejected": -358.9930419921875, "loss": 0.4808, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.03551309555768967, "rewards/margins": 0.604155421257019, "rewards/rejected": -0.5686423778533936, "step": 110 }, { "epoch": 1.1374407582938388, "grad_norm": 21.060796057076118, "learning_rate": 2.3131747660339394e-07, "logits/chosen": -1.3152925968170166, "logits/rejected": -1.2728514671325684, "logps/chosen": -354.1836242675781, "logps/rejected": -403.5113525390625, "loss": 0.4193, "rewards/accuracies": 0.8125, "rewards/chosen": 0.02837497368454933, "rewards/margins": 0.9122093915939331, "rewards/rejected": -0.8838345408439636, "step": 120 }, { "epoch": 1.2322274881516588, "grad_norm": 20.045154065240116, "learning_rate": 1.9032628049921556e-07, "logits/chosen": -1.2807761430740356, "logits/rejected": -1.3415155410766602, "logps/chosen": -319.76373291015625, "logps/rejected": -381.88702392578125, "loss": 0.4052, "rewards/accuracies": 0.9375, "rewards/chosen": 0.052242428064346313, "rewards/margins": 1.2852985858917236, "rewards/rejected": -1.2330560684204102, "step": 130 }, { "epoch": 1.3270142180094786, "grad_norm": 20.731255319297293, "learning_rate": 1.5098005849021078e-07, "logits/chosen": -1.3120365142822266, "logits/rejected": -1.2953948974609375, "logps/chosen": -295.6787109375, "logps/rejected": -355.78753662109375, "loss": 0.4188, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07798402011394501, "rewards/margins": 0.8329852819442749, "rewards/rejected": -0.9109692573547363, "step": 140 }, { "epoch": 1.4218009478672986, "grad_norm": 22.334784640837082, "learning_rate": 1.1436343403356016e-07, "logits/chosen": -1.3462207317352295, "logits/rejected": -1.3286292552947998, "logps/chosen": -368.55035400390625, "logps/rejected": -399.04498291015625, "loss": 0.3699, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.05135868862271309, "rewards/margins": 1.201449990272522, "rewards/rejected": -1.2528085708618164, "step": 150 }, { "epoch": 1.5165876777251186, "grad_norm": 20.701829718895063, "learning_rate": 8.148578611867113e-08, "logits/chosen": -1.3034837245941162, "logits/rejected": -1.3216516971588135, "logps/chosen": -353.6603088378906, "logps/rejected": -452.823486328125, "loss": 0.3858, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.15693092346191406, "rewards/margins": 1.356492519378662, "rewards/rejected": -1.5134233236312866, "step": 160 }, { "epoch": 1.6113744075829384, "grad_norm": 19.451153050692426, "learning_rate": 5.325342458482779e-08, "logits/chosen": -1.2586638927459717, "logits/rejected": -1.292633295059204, "logps/chosen": -292.8143310546875, "logps/rejected": -365.19879150390625, "loss": 0.3721, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.14380544424057007, "rewards/margins": 1.1733076572418213, "rewards/rejected": -1.317112922668457, "step": 170 }, { "epoch": 1.7061611374407581, "grad_norm": 21.916474338608154, "learning_rate": 3.044460665744283e-08, "logits/chosen": -1.3701001405715942, "logits/rejected": -1.3554545640945435, "logps/chosen": -404.88909912109375, "logps/rejected": -457.29754638671875, "loss": 0.3622, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.04532099887728691, "rewards/margins": 1.295754075050354, "rewards/rejected": -1.3410749435424805, "step": 180 }, { "epoch": 1.8009478672985781, "grad_norm": 18.57991978572858, "learning_rate": 1.368808340056879e-08, "logits/chosen": -1.3218441009521484, "logits/rejected": -1.3573790788650513, "logps/chosen": -320.82342529296875, "logps/rejected": -399.82952880859375, "loss": 0.3755, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.12398891150951385, "rewards/margins": 1.0439832210540771, "rewards/rejected": -1.1679723262786865, "step": 190 }, { "epoch": 1.8957345971563981, "grad_norm": 19.884917777757575, "learning_rate": 3.4457674771554422e-09, "logits/chosen": -1.2637640237808228, "logits/rejected": -1.3061563968658447, "logps/chosen": -331.66986083984375, "logps/rejected": -383.78948974609375, "loss": 0.3731, "rewards/accuracies": 0.875, "rewards/chosen": -0.10066553205251694, "rewards/margins": 1.0765092372894287, "rewards/rejected": -1.177174687385559, "step": 200 }, { "epoch": 1.8957345971563981, "eval_logits/chosen": -1.3081656694412231, "eval_logits/rejected": -1.3098862171173096, "eval_logps/chosen": -325.0733337402344, "eval_logps/rejected": -340.2323303222656, "eval_loss": 0.4820757508277893, "eval_rewards/accuracies": 0.7604166865348816, "eval_rewards/chosen": -0.2481323480606079, "eval_rewards/margins": 0.8418500423431396, "eval_rewards/rejected": -1.0899823904037476, "eval_runtime": 37.8271, "eval_samples_per_second": 19.827, "eval_steps_per_second": 0.634, "step": 200 }, { "epoch": 1.9905213270142181, "grad_norm": 21.350222142767173, "learning_rate": 0.0, "logits/chosen": -1.3175251483917236, "logits/rejected": -1.3090837001800537, "logps/chosen": -369.90289306640625, "logps/rejected": -375.2828674316406, "loss": 0.4175, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.085756316781044, "rewards/margins": 0.9539656639099121, "rewards/rejected": -1.039721965789795, "step": 210 }, { "epoch": 1.9905213270142181, "step": 210, "total_flos": 0.0, "train_loss": 0.5004417010716029, "train_runtime": 1390.6464, "train_samples_per_second": 9.708, "train_steps_per_second": 0.151 } ], "logging_steps": 10, "max_steps": 210, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }