{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.968, "eval_steps": 100, "global_step": 248, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 2.0000000000000002e-07, "logits/chosen": 0.7459792494773865, "logits/rejected": 0.8918710947036743, "logps/chosen": -76.09617614746094, "logps/rejected": -62.01979064941406, "loss": 0.0011, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.16, "learning_rate": 2.0000000000000003e-06, "logits/chosen": 0.6576791405677795, "logits/rejected": 0.7277867197990417, "logps/chosen": -80.96837615966797, "logps/rejected": -67.04137420654297, "loss": 0.0011, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": 0.0003823735751211643, "rewards/margins": 0.000583351356908679, "rewards/rejected": -0.00020097770902793854, "step": 10 }, { "epoch": 0.32, "learning_rate": 4.000000000000001e-06, "logits/chosen": 0.6884250044822693, "logits/rejected": 0.7408124804496765, "logps/chosen": -90.96418762207031, "logps/rejected": -76.06710815429688, "loss": 0.0011, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -5.052227152191335e-06, "rewards/margins": -0.0004625328874681145, "rewards/rejected": 0.0004574806080199778, "step": 20 }, { "epoch": 0.48, "learning_rate": 4.993800445762451e-06, "logits/chosen": 0.753118634223938, "logits/rejected": 0.815566897392273, "logps/chosen": -75.56269836425781, "logps/rejected": -62.7692756652832, "loss": 0.0011, "rewards/accuracies": 0.5, "rewards/chosen": 0.0003265980340074748, "rewards/margins": 0.00027744597173295915, "rewards/rejected": 4.915207318845205e-05, "step": 30 }, { "epoch": 0.64, "learning_rate": 4.944388344834205e-06, "logits/chosen": 0.6737275719642639, "logits/rejected": 0.7539814710617065, "logps/chosen": -99.6846694946289, "logps/rejected": -84.5612564086914, "loss": 0.0011, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0005740473279729486, "rewards/margins": 0.00023269152734428644, "rewards/rejected": 0.00034135582973249257, "step": 40 }, { "epoch": 0.8, "learning_rate": 4.8465431931347904e-06, "logits/chosen": 0.6654059290885925, "logits/rejected": 0.7606115341186523, "logps/chosen": -92.47711181640625, "logps/rejected": -79.58221435546875, "loss": 0.0011, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0007205186993815005, "rewards/margins": 0.0004779071605298668, "rewards/rejected": 0.00024261146609205753, "step": 50 }, { "epoch": 0.96, "learning_rate": 4.702203692102539e-06, "logits/chosen": 0.5616599321365356, "logits/rejected": 0.6836065649986267, "logps/chosen": -78.09901428222656, "logps/rejected": -59.116851806640625, "loss": 0.0011, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.0014349878765642643, "rewards/margins": 0.0005719334003515542, "rewards/rejected": 0.000863054592628032, "step": 60 }, { "epoch": 1.12, "learning_rate": 4.514229781074239e-06, "logits/chosen": 0.6593050360679626, "logits/rejected": 0.7536520957946777, "logps/chosen": -76.82511901855469, "logps/rejected": -66.14640808105469, "loss": 0.0011, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": 0.0013633746420964599, "rewards/margins": 1.1641532356165829e-11, "rewards/rejected": 0.001363374525681138, "step": 70 }, { "epoch": 1.28, "learning_rate": 4.286345970517195e-06, "logits/chosen": 0.7199975252151489, "logits/rejected": 0.7695997953414917, "logps/chosen": -95.65029907226562, "logps/rejected": -79.29694366455078, "loss": 0.0011, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.002262389287352562, "rewards/margins": 0.00043650128645822406, "rewards/rejected": 0.0018258880591019988, "step": 80 }, { "epoch": 1.44, "learning_rate": 4.023067544670082e-06, "logits/chosen": 0.6014922261238098, "logits/rejected": 0.7544792890548706, "logps/chosen": -85.02667999267578, "logps/rejected": -71.08529663085938, "loss": 0.0011, "rewards/accuracies": 0.5, "rewards/chosen": 0.0019673267379403114, "rewards/margins": 0.0005049472092650831, "rewards/rejected": 0.0014623795868828893, "step": 90 }, { "epoch": 1.6, "learning_rate": 3.7296110958116845e-06, "logits/chosen": 0.6225594282150269, "logits/rejected": 0.6867057085037231, "logps/chosen": -84.804931640625, "logps/rejected": -70.72709655761719, "loss": 0.0011, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": 0.001966786338016391, "rewards/margins": 0.00019253513892181218, "rewards/rejected": 0.0017742514610290527, "step": 100 }, { "epoch": 1.6, "eval_logits/chosen": 0.8210488557815552, "eval_logits/rejected": 0.8785973787307739, "eval_logps/chosen": -256.8260803222656, "eval_logps/rejected": -233.76206970214844, "eval_loss": 0.0011570560745894909, "eval_rewards/accuracies": 0.5049999952316284, "eval_rewards/chosen": -0.00211901543661952, "eval_rewards/margins": 0.00011873205221490934, "eval_rewards/rejected": -0.0022377476561814547, "eval_runtime": 840.5697, "eval_samples_per_second": 2.379, "eval_steps_per_second": 0.595, "step": 100 }, { "epoch": 1.76, "learning_rate": 3.4117911628292944e-06, "logits/chosen": 0.6016367673873901, "logits/rejected": 0.6993056535720825, "logps/chosen": -87.06333923339844, "logps/rejected": -73.28208923339844, "loss": 0.0011, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0032501216046512127, "rewards/margins": 0.0010317874839529395, "rewards/rejected": 0.0022183340042829514, "step": 110 }, { "epoch": 1.92, "learning_rate": 3.075905022087675e-06, "logits/chosen": 0.6844900250434875, "logits/rejected": 0.7173089981079102, "logps/chosen": -87.12005615234375, "logps/rejected": -70.78279876708984, "loss": 0.001, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.003416379215195775, "rewards/margins": 0.0013540134532377124, "rewards/rejected": 0.0020623658783733845, "step": 120 }, { "epoch": 2.08, "learning_rate": 2.728607913349464e-06, "logits/chosen": 0.6243213415145874, "logits/rejected": 0.6759988069534302, "logps/chosen": -84.78315734863281, "logps/rejected": -69.31483459472656, "loss": 0.001, "rewards/accuracies": 0.5625, "rewards/chosen": 0.003974142484366894, "rewards/margins": 0.00143245083745569, "rewards/rejected": 0.0025416917633265257, "step": 130 }, { "epoch": 2.24, "learning_rate": 2.376781173017589e-06, "logits/chosen": 0.6302226185798645, "logits/rejected": 0.6924747824668884, "logps/chosen": -71.3794174194336, "logps/rejected": -64.13245391845703, "loss": 0.001, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.003757910802960396, "rewards/margins": 0.001318571506999433, "rewards/rejected": 0.0024393394123762846, "step": 140 }, { "epoch": 2.4, "learning_rate": 2.0273958875043877e-06, "logits/chosen": 0.6476806402206421, "logits/rejected": 0.7262701988220215, "logps/chosen": -93.0534896850586, "logps/rejected": -75.62432861328125, "loss": 0.001, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.004165604244917631, "rewards/margins": 0.0014166636392474174, "rewards/rejected": 0.0027489408385008574, "step": 150 }, { "epoch": 2.56, "learning_rate": 1.6873747682962393e-06, "logits/chosen": 0.6091146469116211, "logits/rejected": 0.6964036226272583, "logps/chosen": -101.43008422851562, "logps/rejected": -83.43622589111328, "loss": 0.001, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.005559985991567373, "rewards/margins": 0.001756802899762988, "rewards/rejected": 0.0038031828589737415, "step": 160 }, { "epoch": 2.72, "learning_rate": 1.363454985517803e-06, "logits/chosen": 0.6119644045829773, "logits/rejected": 0.6556802988052368, "logps/chosen": -70.76570129394531, "logps/rejected": -60.847434997558594, "loss": 0.0011, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.004071240313351154, "rewards/margins": 0.0009927384089678526, "rewards/rejected": 0.0030785012058913708, "step": 170 }, { "epoch": 2.88, "learning_rate": 1.062054677808238e-06, "logits/chosen": 0.6605072021484375, "logits/rejected": 0.7174011468887329, "logps/chosen": -83.5826416015625, "logps/rejected": -65.7984619140625, "loss": 0.0011, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0050515844486653805, "rewards/margins": 0.0010326830670237541, "rewards/rejected": 0.004018902312964201, "step": 180 }, { "epoch": 3.04, "learning_rate": 7.891457834794711e-07, "logits/chosen": 0.6303926706314087, "logits/rejected": 0.7238384485244751, "logps/chosen": -81.99962615966797, "logps/rejected": -67.91422271728516, "loss": 0.001, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.005765286739915609, "rewards/margins": 0.001600590767338872, "rewards/rejected": 0.004164696671068668, "step": 190 }, { "epoch": 3.2, "learning_rate": 5.501357126768117e-07, "logits/chosen": 0.6904739141464233, "logits/rejected": 0.7569997310638428, "logps/chosen": -91.30878448486328, "logps/rejected": -78.43734741210938, "loss": 0.001, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.006007182411849499, "rewards/margins": 0.0020701445173472166, "rewards/rejected": 0.0039370376616716385, "step": 200 }, { "epoch": 3.2, "eval_logits/chosen": 0.8083356618881226, "eval_logits/rejected": 0.8659967184066772, "eval_logps/chosen": -257.021484375, "eval_logps/rejected": -233.9424285888672, "eval_loss": 0.0011878832010552287, "eval_rewards/accuracies": 0.5009999871253967, "eval_rewards/chosen": -0.004072962328791618, "eval_rewards/margins": -3.138141255476512e-05, "eval_rewards/rejected": -0.004041580483317375, "eval_runtime": 749.8692, "eval_samples_per_second": 2.667, "eval_steps_per_second": 0.667, "step": 200 }, { "epoch": 3.36, "learning_rate": 3.4976020508682345e-07, "logits/chosen": 0.5687087178230286, "logits/rejected": 0.6581428647041321, "logps/chosen": -82.01387023925781, "logps/rejected": -71.76741790771484, "loss": 0.001, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.005611591041088104, "rewards/margins": 0.001832630136050284, "rewards/rejected": 0.003778961021453142, "step": 210 }, { "epoch": 3.52, "learning_rate": 1.9198949610721273e-07, "logits/chosen": 0.6064807176589966, "logits/rejected": 0.6868988275527954, "logps/chosen": -88.08094787597656, "logps/rejected": -67.92098236083984, "loss": 0.001, "rewards/accuracies": 0.59375, "rewards/chosen": 0.005776416976004839, "rewards/margins": 0.0024658578913658857, "rewards/rejected": 0.003310559317469597, "step": 220 }, { "epoch": 3.68, "learning_rate": 7.994965069994143e-08, "logits/chosen": 0.6825326681137085, "logits/rejected": 0.7606233954429626, "logps/chosen": -90.36261749267578, "logps/rejected": -74.85285949707031, "loss": 0.001, "rewards/accuracies": 0.53125, "rewards/chosen": 0.005841919686645269, "rewards/margins": 0.0014079047832638025, "rewards/rejected": 0.004434015601873398, "step": 230 }, { "epoch": 3.84, "learning_rate": 1.5860623616664183e-08, "logits/chosen": 0.6195131540298462, "logits/rejected": 0.6464000940322876, "logps/chosen": -86.05440521240234, "logps/rejected": -73.87252044677734, "loss": 0.001, "rewards/accuracies": 0.625, "rewards/chosen": 0.006215591914951801, "rewards/margins": 0.0018872864311560988, "rewards/rejected": 0.004328305833041668, "step": 240 }, { "epoch": 3.97, "step": 248, "total_flos": 0.0, "train_loss": 0.0010566485528421077, "train_runtime": 4635.3882, "train_samples_per_second": 0.863, "train_steps_per_second": 0.054 } ], "logging_steps": 10, "max_steps": 248, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }