{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9987908101571947, "eval_steps": 10000000, "global_step": 413, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 6401.270603874373, "learning_rate": 9.523809523809522e-09, "logits/chosen": -2.7005977630615234, "logits/rejected": -2.6288318634033203, "logps/chosen": -1.1158788204193115, "logps/rejected": -1.1333446502685547, "loss": 0.7544, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "grad_norm": 9369.590990783972, "learning_rate": 9.523809523809523e-08, "logits/chosen": -2.76228666305542, "logits/rejected": -2.6970374584198, "logps/chosen": -0.837486743927002, "logps/rejected": -0.8182350993156433, "loss": 0.9695, "rewards/accuracies": 0.4305555522441864, "rewards/chosen": 0.06597563624382019, "rewards/margins": 0.437710702419281, "rewards/rejected": -0.3717350959777832, "step": 10 }, { "epoch": 0.05, "grad_norm": 5966.657402243146, "learning_rate": 1.9047619047619045e-07, "logits/chosen": -2.6901049613952637, "logits/rejected": -2.6502909660339355, "logps/chosen": -0.9933319091796875, "logps/rejected": -1.0394352674484253, "loss": 1.0318, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.794396698474884, "rewards/margins": 0.7471516728401184, "rewards/rejected": -1.5415483713150024, "step": 20 }, { "epoch": 0.07, "grad_norm": 8820.198504372876, "learning_rate": 2.857142857142857e-07, "logits/chosen": -2.7333264350891113, "logits/rejected": -2.6793360710144043, "logps/chosen": -0.9710652232170105, "logps/rejected": -0.9799602627754211, "loss": 1.3198, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.275942325592041, "rewards/margins": 0.9020620584487915, "rewards/rejected": -3.178004264831543, "step": 30 }, { "epoch": 0.1, "grad_norm": 8453.783513094899, "learning_rate": 3.809523809523809e-07, "logits/chosen": -2.6771621704101562, "logits/rejected": -2.6321842670440674, "logps/chosen": -0.989823043346405, "logps/rejected": -0.9216930270195007, "loss": 2.0555, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.6586966514587402, "rewards/margins": 5.100310325622559, "rewards/rejected": -5.759006500244141, "step": 40 }, { "epoch": 0.12, "grad_norm": 4192.139232222726, "learning_rate": 3.995412608484087e-07, "logits/chosen": -2.743403911590576, "logits/rejected": -2.6878693103790283, "logps/chosen": -0.9671042561531067, "logps/rejected": -0.917597770690918, "loss": 2.6495, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.16885781288146973, "rewards/margins": 5.186079978942871, "rewards/rejected": -5.35493803024292, "step": 50 }, { "epoch": 0.15, "grad_norm": 5643.860863524967, "learning_rate": 3.976812391485896e-07, "logits/chosen": -2.7438769340515137, "logits/rejected": -2.676765203475952, "logps/chosen": -0.911353588104248, "logps/rejected": -0.9122518301010132, "loss": 3.8047, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 2.4976494312286377, "rewards/margins": 5.426072120666504, "rewards/rejected": -2.928422212600708, "step": 60 }, { "epoch": 0.17, "grad_norm": 4497.230754903385, "learning_rate": 3.9440458281608213e-07, "logits/chosen": -2.740940570831299, "logits/rejected": -2.7162723541259766, "logps/chosen": -0.9154363870620728, "logps/rejected": -0.868497371673584, "loss": 3.6432, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 5.681364059448242, "rewards/margins": 8.000432968139648, "rewards/rejected": -2.319068431854248, "step": 70 }, { "epoch": 0.19, "grad_norm": 9843.974275847575, "learning_rate": 3.897347732134074e-07, "logits/chosen": -2.679215908050537, "logits/rejected": -2.625516891479492, "logps/chosen": -0.9146322011947632, "logps/rejected": -1.0181081295013428, "loss": 5.767, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -10.08639907836914, "rewards/margins": 6.582289695739746, "rewards/rejected": -16.668689727783203, "step": 80 }, { "epoch": 0.22, "grad_norm": 4773.013380320505, "learning_rate": 3.8370527539794614e-07, "logits/chosen": -2.6771388053894043, "logits/rejected": -2.6291418075561523, "logps/chosen": -1.003847360610962, "logps/rejected": -1.0297266244888306, "loss": 4.6354, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 4.863407611846924, "rewards/margins": 9.78220272064209, "rewards/rejected": -4.918795585632324, "step": 90 }, { "epoch": 0.24, "grad_norm": 3074.8663144850243, "learning_rate": 3.763592983027255e-07, "logits/chosen": -2.705735683441162, "logits/rejected": -2.6605448722839355, "logps/chosen": -0.9163268804550171, "logps/rejected": -0.9396775960922241, "loss": 5.8585, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -12.477940559387207, "rewards/margins": 7.702305793762207, "rewards/rejected": -20.180248260498047, "step": 100 }, { "epoch": 0.27, "grad_norm": 4099.610429119441, "learning_rate": 3.6774948509008527e-07, "logits/chosen": -2.714970111846924, "logits/rejected": -2.6705470085144043, "logps/chosen": -0.9598251581192017, "logps/rejected": -0.9319995641708374, "loss": 5.1529, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 9.19798755645752, "rewards/margins": 10.779365539550781, "rewards/rejected": -1.5813770294189453, "step": 110 }, { "epoch": 0.29, "grad_norm": 4599.711217449366, "learning_rate": 3.579375358972288e-07, "logits/chosen": -2.678779125213623, "logits/rejected": -2.6315762996673584, "logps/chosen": -0.9081487655639648, "logps/rejected": -1.0060938596725464, "loss": 4.0915, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -13.663342475891113, "rewards/margins": 9.755656242370605, "rewards/rejected": -23.418994903564453, "step": 120 }, { "epoch": 0.31, "grad_norm": 4010.334966061441, "learning_rate": 3.4699376567716156e-07, "logits/chosen": -2.7230353355407715, "logits/rejected": -2.684389591217041, "logps/chosen": -0.8652521967887878, "logps/rejected": -0.8799147605895996, "loss": 4.4027, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 2.19469952583313, "rewards/margins": 15.263641357421875, "rewards/rejected": -13.068939208984375, "step": 130 }, { "epoch": 0.34, "grad_norm": 5239.11146834966, "learning_rate": 3.349966003036421e-07, "logits/chosen": -2.689558506011963, "logits/rejected": -2.649766445159912, "logps/chosen": -0.9352903366088867, "logps/rejected": -0.9416161775588989, "loss": 4.7953, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.734063148498535, "rewards/margins": 8.841203689575195, "rewards/rejected": -13.575268745422363, "step": 140 }, { "epoch": 0.36, "grad_norm": 5394.35498681908, "learning_rate": 3.220320145511884e-07, "logits/chosen": -2.7070841789245605, "logits/rejected": -2.647737979888916, "logps/chosen": -0.9441506266593933, "logps/rejected": -0.9885166883468628, "loss": 4.2219, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 5.9402689933776855, "rewards/margins": 12.97706413269043, "rewards/rejected": -7.036795139312744, "step": 150 }, { "epoch": 0.39, "grad_norm": 5022.189692479379, "learning_rate": 3.0819291597771795e-07, "logits/chosen": -2.7051825523376465, "logits/rejected": -2.667494297027588, "logps/chosen": -0.911395251750946, "logps/rejected": -0.939487099647522, "loss": 4.7963, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.6114660501480103, "rewards/margins": 9.443866729736328, "rewards/rejected": -10.055331230163574, "step": 160 }, { "epoch": 0.41, "grad_norm": 5428.944545727042, "learning_rate": 2.9357847912507786e-07, "logits/chosen": -2.6787288188934326, "logits/rejected": -2.609421968460083, "logps/chosen": -0.8976411819458008, "logps/rejected": -0.8857674598693848, "loss": 4.6262, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.82297945022583, "rewards/margins": 10.640687942504883, "rewards/rejected": -13.463666915893555, "step": 170 }, { "epoch": 0.44, "grad_norm": 7317.882582449178, "learning_rate": 2.7829343480875617e-07, "logits/chosen": -2.6716930866241455, "logits/rejected": -2.6018152236938477, "logps/chosen": -0.9342878460884094, "logps/rejected": -0.9536906480789185, "loss": 4.5209, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.173262596130371, "rewards/margins": 8.933877944946289, "rewards/rejected": -4.760615348815918, "step": 180 }, { "epoch": 0.46, "grad_norm": 5046.2946182405685, "learning_rate": 2.624473195899052e-07, "logits/chosen": -2.737992763519287, "logits/rejected": -2.7089955806732178, "logps/chosen": -0.9629039764404297, "logps/rejected": -1.039236307144165, "loss": 4.5521, "rewards/accuracies": 0.78125, "rewards/chosen": -3.3569388389587402, "rewards/margins": 13.995905876159668, "rewards/rejected": -17.352848052978516, "step": 190 }, { "epoch": 0.48, "grad_norm": 4268.8163809344915, "learning_rate": 2.4615369080815547e-07, "logits/chosen": -2.6982626914978027, "logits/rejected": -2.6629488468170166, "logps/chosen": -0.8523995280265808, "logps/rejected": -0.9246847033500671, "loss": 3.8184, "rewards/accuracies": 0.78125, "rewards/chosen": 2.3979854583740234, "rewards/margins": 4.996596336364746, "rewards/rejected": -2.5986106395721436, "step": 200 }, { "epoch": 0.51, "grad_norm": 4211.242306423206, "learning_rate": 2.2952931280049625e-07, "logits/chosen": -2.7346115112304688, "logits/rejected": -2.6734609603881836, "logps/chosen": -1.0063531398773193, "logps/rejected": -0.9570119976997375, "loss": 4.9954, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 7.86081600189209, "rewards/margins": 13.075413703918457, "rewards/rejected": -5.214597225189209, "step": 210 }, { "epoch": 0.53, "grad_norm": 4906.2448320907815, "learning_rate": 2.1269332013798747e-07, "logits/chosen": -2.7431142330169678, "logits/rejected": -2.7241249084472656, "logps/chosen": -0.8835189938545227, "logps/rejected": -0.8670462369918823, "loss": 4.3795, "rewards/accuracies": 0.78125, "rewards/chosen": 4.582579135894775, "rewards/margins": 8.150335311889648, "rewards/rejected": -3.567755937576294, "step": 220 }, { "epoch": 0.56, "grad_norm": 2995.5119741253625, "learning_rate": 1.9576636387676436e-07, "logits/chosen": -2.690732955932617, "logits/rejected": -2.653067111968994, "logps/chosen": -0.960831344127655, "logps/rejected": -0.9556485414505005, "loss": 4.0487, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -8.449748039245605, "rewards/margins": 10.095115661621094, "rewards/rejected": -18.544864654541016, "step": 230 }, { "epoch": 0.58, "grad_norm": 3504.414329050279, "learning_rate": 1.7886974694151976e-07, "logits/chosen": -2.7119805812835693, "logits/rejected": -2.6879172325134277, "logps/chosen": -0.990290641784668, "logps/rejected": -0.9934972524642944, "loss": 4.3644, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.5965616106987, "rewards/margins": 9.211602210998535, "rewards/rejected": -9.808164596557617, "step": 240 }, { "epoch": 0.6, "grad_norm": 3400.5848210057216, "learning_rate": 1.6212455483752895e-07, "logits/chosen": -2.756906270980835, "logits/rejected": -2.6796135902404785, "logps/chosen": -0.8838168978691101, "logps/rejected": -0.9137406349182129, "loss": 4.5034, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 6.5281982421875, "rewards/margins": 8.64702033996582, "rewards/rejected": -2.1188230514526367, "step": 250 }, { "epoch": 0.63, "grad_norm": 6194.117841583386, "learning_rate": 1.4565078792075733e-07, "logits/chosen": -2.7132773399353027, "logits/rejected": -2.6494650840759277, "logps/chosen": -1.002362847328186, "logps/rejected": -0.9982520341873169, "loss": 4.8134, "rewards/accuracies": 0.84375, "rewards/chosen": 3.0224878787994385, "rewards/margins": 16.206506729125977, "rewards/rejected": -13.1840181350708, "step": 260 }, { "epoch": 0.65, "grad_norm": 4565.495892627232, "learning_rate": 1.295665014444281e-07, "logits/chosen": -2.7381529808044434, "logits/rejected": -2.6608738899230957, "logps/chosen": -0.9501218795776367, "logps/rejected": -0.9476363062858582, "loss": 5.3754, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.570526123046875, "rewards/margins": 12.367398262023926, "rewards/rejected": -12.9379243850708, "step": 270 }, { "epoch": 0.68, "grad_norm": 5337.153187944306, "learning_rate": 1.1398695954469597e-07, "logits/chosen": -2.6872425079345703, "logits/rejected": -2.630267381668091, "logps/chosen": -0.9056104421615601, "logps/rejected": -0.8939152956008911, "loss": 4.1053, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 4.294297218322754, "rewards/margins": 7.472552299499512, "rewards/rejected": -3.1782548427581787, "step": 280 }, { "epoch": 0.7, "grad_norm": 3582.07962645892, "learning_rate": 9.902380922818425e-08, "logits/chosen": -2.7334370613098145, "logits/rejected": -2.6919913291931152, "logps/chosen": -0.9840775728225708, "logps/rejected": -0.9756690263748169, "loss": 3.2759, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 8.966680526733398, "rewards/margins": 11.496904373168945, "rewards/rejected": -2.5302233695983887, "step": 290 }, { "epoch": 0.73, "grad_norm": 4767.591882910886, "learning_rate": 8.478428028080398e-08, "logits/chosen": -2.7305169105529785, "logits/rejected": -2.6773815155029297, "logps/chosen": -0.8988749384880066, "logps/rejected": -0.9437707662582397, "loss": 4.3175, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.8898951411247253, "rewards/margins": 8.447718620300293, "rewards/rejected": -7.55782413482666, "step": 300 }, { "epoch": 0.75, "grad_norm": 4819.380329592898, "learning_rate": 7.137041683151202e-08, "logits/chosen": -2.7228643894195557, "logits/rejected": -2.6581058502197266, "logps/chosen": -1.0781683921813965, "logps/rejected": -1.028840184211731, "loss": 2.9744, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.4226202964782715, "rewards/margins": 13.473236083984375, "rewards/rejected": -15.895855903625488, "step": 310 }, { "epoch": 0.77, "grad_norm": 7840.551721640683, "learning_rate": 5.8878346077822135e-08, "logits/chosen": -2.7280871868133545, "logits/rejected": -2.649958848953247, "logps/chosen": -0.9020591974258423, "logps/rejected": -0.9361578822135925, "loss": 2.7082, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 1.223115086555481, "rewards/margins": 10.582406997680664, "rewards/rejected": -9.35929012298584, "step": 320 }, { "epoch": 0.8, "grad_norm": 4662.77535052248, "learning_rate": 4.73975894135696e-08, "logits/chosen": -2.6770853996276855, "logits/rejected": -2.6099040508270264, "logps/chosen": -0.9263202548027039, "logps/rejected": -0.9608638882637024, "loss": 3.1985, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2365754395723343, "rewards/margins": 13.195585250854492, "rewards/rejected": -12.959010124206543, "step": 330 }, { "epoch": 0.82, "grad_norm": 4550.588002339864, "learning_rate": 3.701042089556483e-08, "logits/chosen": -2.756493330001831, "logits/rejected": -2.687851667404175, "logps/chosen": -0.8901381492614746, "logps/rejected": -0.9301478266716003, "loss": 3.841, "rewards/accuracies": 0.78125, "rewards/chosen": 0.11963929980993271, "rewards/margins": 7.3289618492126465, "rewards/rejected": -7.209322929382324, "step": 340 }, { "epoch": 0.85, "grad_norm": 5464.471487236709, "learning_rate": 2.779127764652889e-08, "logits/chosen": -2.689107656478882, "logits/rejected": -2.6330015659332275, "logps/chosen": -0.9756801724433899, "logps/rejected": -0.9646003842353821, "loss": 3.6421, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.1974527835845947, "rewards/margins": 9.013090133666992, "rewards/rejected": -10.210542678833008, "step": 350 }, { "epoch": 0.87, "grad_norm": 5949.708940984834, "learning_rate": 1.9806226419516193e-08, "logits/chosen": -2.704460620880127, "logits/rejected": -2.656071186065674, "logps/chosen": -0.9623576402664185, "logps/rejected": -1.0082406997680664, "loss": 3.5231, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 2.5273587703704834, "rewards/margins": 11.88086223602295, "rewards/rejected": -9.35350227355957, "step": 360 }, { "epoch": 0.89, "grad_norm": 4320.933402478669, "learning_rate": 1.3112490146559552e-08, "logits/chosen": -2.7451281547546387, "logits/rejected": -2.686728000640869, "logps/chosen": -0.8951610326766968, "logps/rejected": -0.89850914478302, "loss": 3.0053, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 0.6753175258636475, "rewards/margins": 12.29626750946045, "rewards/rejected": -11.620949745178223, "step": 370 }, { "epoch": 0.92, "grad_norm": 2514.940389992379, "learning_rate": 7.758037864413247e-09, "logits/chosen": -2.7158432006835938, "logits/rejected": -2.6906635761260986, "logps/chosen": -0.9033122062683105, "logps/rejected": -0.9709407091140747, "loss": 2.8751, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 0.9628832936286926, "rewards/margins": 10.83133316040039, "rewards/rejected": -9.868449211120605, "step": 380 }, { "epoch": 0.94, "grad_norm": 3504.225752431698, "learning_rate": 3.78124095609087e-09, "logits/chosen": -2.6947999000549316, "logits/rejected": -2.6553878784179688, "logps/chosen": -0.9263744354248047, "logps/rejected": -0.9935058355331421, "loss": 3.019, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 1.4072116613388062, "rewards/margins": 10.741894721984863, "rewards/rejected": -9.334683418273926, "step": 390 }, { "epoch": 0.97, "grad_norm": 4987.634749508018, "learning_rate": 1.2105981716597603e-09, "logits/chosen": -2.7300946712493896, "logits/rejected": -2.6389007568359375, "logps/chosen": -0.9686774015426636, "logps/rejected": -0.9328421354293823, "loss": 3.7864, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 0.6465551853179932, "rewards/margins": 11.809611320495605, "rewards/rejected": -11.163057327270508, "step": 400 }, { "epoch": 0.99, "grad_norm": 5473.226219590305, "learning_rate": 6.453139886395398e-11, "logits/chosen": -2.7284317016601562, "logits/rejected": -2.6886637210845947, "logps/chosen": -0.9334842562675476, "logps/rejected": -0.9600637555122375, "loss": 3.6391, "rewards/accuracies": 0.875, "rewards/chosen": -2.157397747039795, "rewards/margins": 13.2835054397583, "rewards/rejected": -15.440902709960938, "step": 410 }, { "epoch": 1.0, "step": 413, "total_flos": 0.0, "train_loss": 3.8421780889894426, "train_runtime": 6381.4933, "train_samples_per_second": 8.293, "train_steps_per_second": 0.065 } ], "logging_steps": 10, "max_steps": 413, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }