{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9990319457889641, "eval_steps": 10000000, "global_step": 516, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001936108422071636, "grad_norm": 4289.771574316242, "learning_rate": 1.9230769230769234e-11, "logits/chosen": -1.8683955669403076, "logits/rejected": -1.7658718824386597, "logps/chosen": -1.0707917213439941, "logps/rejected": -1.2424218654632568, "loss": 1.1711, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01936108422071636, "grad_norm": 4190.035270648744, "learning_rate": 1.9230769230769234e-10, "logits/chosen": -1.6616647243499756, "logits/rejected": -1.6193790435791016, "logps/chosen": -0.9486603140830994, "logps/rejected": -0.9298955202102661, "loss": 1.2029, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.0510055311024189, "rewards/margins": -0.1265178769826889, "rewards/rejected": 0.0755123570561409, "step": 10 }, { "epoch": 0.03872216844143272, "grad_norm": 3083.054373722184, "learning_rate": 3.8461538461538467e-10, "logits/chosen": -1.5834848880767822, "logits/rejected": -1.5355803966522217, "logps/chosen": -1.0245015621185303, "logps/rejected": -0.9704240560531616, "loss": 1.2623, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.0036691308487206697, "rewards/margins": 0.06183544546365738, "rewards/rejected": -0.05816630274057388, "step": 20 }, { "epoch": 0.05808325266214908, "grad_norm": 4908.628774557542, "learning_rate": 5.769230769230769e-10, "logits/chosen": -1.5294702053070068, "logits/rejected": -1.4707310199737549, "logps/chosen": -1.0035126209259033, "logps/rejected": -0.9810468554496765, "loss": 1.2911, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.10282345861196518, "rewards/margins": -0.00801654439419508, "rewards/rejected": -0.09480690211057663, "step": 30 }, { "epoch": 0.07744433688286544, "grad_norm": 3634.399853197314, "learning_rate": 7.692307692307693e-10, "logits/chosen": -1.5545880794525146, "logits/rejected": -1.4971392154693604, "logps/chosen": -1.0002901554107666, "logps/rejected": -0.9374505281448364, "loss": 1.2544, "rewards/accuracies": 0.4375, "rewards/chosen": -0.056221622973680496, "rewards/margins": -0.12876693904399872, "rewards/rejected": 0.07254532724618912, "step": 40 }, { "epoch": 0.0968054211035818, "grad_norm": 3743.017714256134, "learning_rate": 9.615384615384616e-10, "logits/chosen": -1.6496717929840088, "logits/rejected": -1.6047435998916626, "logps/chosen": -0.990554928779602, "logps/rejected": -0.968321681022644, "loss": 1.2784, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.044189296662807465, "rewards/margins": -0.09925868362188339, "rewards/rejected": 0.05506938695907593, "step": 50 }, { "epoch": 0.11616650532429816, "grad_norm": 3989.4789916346926, "learning_rate": 9.99266706925562e-10, "logits/chosen": -1.6014108657836914, "logits/rejected": -1.5485652685165405, "logps/chosen": -0.9952412843704224, "logps/rejected": -0.9312202334403992, "loss": 1.2834, "rewards/accuracies": 0.53125, "rewards/chosen": 0.05288747698068619, "rewards/margins": 0.01693376898765564, "rewards/rejected": 0.03595370799303055, "step": 60 }, { "epoch": 0.1355275895450145, "grad_norm": 4801.740697750488, "learning_rate": 9.96291389741603e-10, "logits/chosen": -1.5917268991470337, "logits/rejected": -1.505897045135498, "logps/chosen": -0.9961442947387695, "logps/rejected": -0.946877658367157, "loss": 1.2574, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.024737417697906494, "rewards/margins": 0.13343551754951477, "rewards/rejected": -0.15817293524742126, "step": 70 }, { "epoch": 0.15488867376573087, "grad_norm": 4317.0356676312595, "learning_rate": 9.91041841371078e-10, "logits/chosen": -1.5460880994796753, "logits/rejected": -1.5285828113555908, "logps/chosen": -1.0357868671417236, "logps/rejected": -0.985648512840271, "loss": 1.2938, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.03525885194540024, "rewards/margins": 0.09618574380874634, "rewards/rejected": -0.0609268844127655, "step": 80 }, { "epoch": 0.17424975798644723, "grad_norm": 4079.0129090665787, "learning_rate": 9.835421176144035e-10, "logits/chosen": -1.678056001663208, "logits/rejected": -1.6193243265151978, "logps/chosen": -1.0110713243484497, "logps/rejected": -0.9249661564826965, "loss": 1.2586, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.03245333582162857, "rewards/margins": 0.027137070894241333, "rewards/rejected": 0.005316261202096939, "step": 90 }, { "epoch": 0.1936108422071636, "grad_norm": 4056.2821430675544, "learning_rate": 9.738265855914014e-10, "logits/chosen": -1.6319992542266846, "logits/rejected": -1.5686506032943726, "logps/chosen": -0.978125274181366, "logps/rejected": -0.9383082389831543, "loss": 1.2723, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.002950614783912897, "rewards/margins": -0.10372404754161835, "rewards/rejected": 0.10667465627193451, "step": 100 }, { "epoch": 0.21297192642787996, "grad_norm": 5228.540901226267, "learning_rate": 9.619397662556434e-10, "logits/chosen": -1.6645421981811523, "logits/rejected": -1.588181495666504, "logps/chosen": -0.8982473611831665, "logps/rejected": -0.8726890683174133, "loss": 1.2749, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.06572394073009491, "rewards/margins": -0.014699941501021385, "rewards/rejected": 0.08042389899492264, "step": 110 }, { "epoch": 0.23233301064859632, "grad_norm": 3837.5381027526573, "learning_rate": 9.47936130379344e-10, "logits/chosen": -1.5425523519515991, "logits/rejected": -1.5234339237213135, "logps/chosen": -0.9705616235733032, "logps/rejected": -0.9401592016220093, "loss": 1.2288, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0972176343202591, "rewards/margins": 0.09945462644100189, "rewards/rejected": -0.0022369951475411654, "step": 120 }, { "epoch": 0.25169409486931266, "grad_norm": 3814.5542415994746, "learning_rate": 9.318798489436919e-10, "logits/chosen": -1.579369306564331, "logits/rejected": -1.4889588356018066, "logps/chosen": -0.9654415845870972, "logps/rejected": -0.9286600351333618, "loss": 1.2977, "rewards/accuracies": 0.53125, "rewards/chosen": 0.14246916770935059, "rewards/margins": 0.08274303376674652, "rewards/rejected": 0.05972614139318466, "step": 130 }, { "epoch": 0.271055179090029, "grad_norm": 3751.478373618925, "learning_rate": 9.138444990784454e-10, "logits/chosen": -1.5760021209716797, "logits/rejected": -1.5278505086898804, "logps/chosen": -0.9930068850517273, "logps/rejected": -0.9939811825752258, "loss": 1.2583, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.029570287093520164, "rewards/margins": -0.044873449951410294, "rewards/rejected": 0.015303166583180428, "step": 140 }, { "epoch": 0.2904162633107454, "grad_norm": 3907.5968331867316, "learning_rate": 8.939127268983109e-10, "logits/chosen": -1.5609729290008545, "logits/rejected": -1.5375111103057861, "logps/chosen": -1.0732953548431396, "logps/rejected": -0.9959889650344849, "loss": 1.2631, "rewards/accuracies": 0.53125, "rewards/chosen": 0.12664008140563965, "rewards/margins": 0.0977608785033226, "rewards/rejected": 0.02887919172644615, "step": 150 }, { "epoch": 0.30977734753146174, "grad_norm": 4306.071797965924, "learning_rate": 8.721758687811352e-10, "logits/chosen": -1.6766704320907593, "logits/rejected": -1.6015819311141968, "logps/chosen": -0.9719392657279968, "logps/rejected": -0.9425565600395203, "loss": 1.3024, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.02740299701690674, "rewards/margins": -0.19174733757972717, "rewards/rejected": 0.16434435546398163, "step": 160 }, { "epoch": 0.3291384317521781, "grad_norm": 4085.902013165705, "learning_rate": 8.487335328233912e-10, "logits/chosen": -1.5358489751815796, "logits/rejected": -1.4411264657974243, "logps/chosen": -0.9960495829582214, "logps/rejected": -0.9736671447753906, "loss": 1.2834, "rewards/accuracies": 0.5, "rewards/chosen": -0.07462203502655029, "rewards/margins": -0.08827298879623413, "rewards/rejected": 0.013650953769683838, "step": 170 }, { "epoch": 0.34849951597289447, "grad_norm": 4830.939949744352, "learning_rate": 8.236931423909139e-10, "logits/chosen": -1.6721004247665405, "logits/rejected": -1.5736545324325562, "logps/chosen": -0.9753511548042297, "logps/rejected": -0.9517275094985962, "loss": 1.2611, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.11362428963184357, "rewards/margins": 0.17257475852966309, "rewards/rejected": -0.05895046144723892, "step": 180 }, { "epoch": 0.36786060019361083, "grad_norm": 4117.362535725707, "learning_rate": 7.971694438565449e-10, "logits/chosen": -1.6258817911148071, "logits/rejected": -1.563320517539978, "logps/chosen": -0.9729933738708496, "logps/rejected": -0.9748104810714722, "loss": 1.2509, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.025180306285619736, "rewards/margins": 0.146215558052063, "rewards/rejected": -0.12103524059057236, "step": 190 }, { "epoch": 0.3872216844143272, "grad_norm": 4321.444634590991, "learning_rate": 7.692839807804521e-10, "logits/chosen": -1.6311115026474, "logits/rejected": -1.5964945554733276, "logps/chosen": -0.9733870625495911, "logps/rejected": -0.9268864393234253, "loss": 1.2991, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.10145823657512665, "rewards/margins": -0.11137993633747101, "rewards/rejected": 0.009921704418957233, "step": 200 }, { "epoch": 0.40658276863504356, "grad_norm": 4028.0205254352395, "learning_rate": 7.401645369426697e-10, "logits/chosen": -1.6063648462295532, "logits/rejected": -1.5398415327072144, "logps/chosen": -0.9638331532478333, "logps/rejected": -0.9147375822067261, "loss": 1.2499, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.012107854709029198, "rewards/margins": -0.00303336838260293, "rewards/rejected": -0.009074489586055279, "step": 210 }, { "epoch": 0.4259438528557599, "grad_norm": 4531.775441431939, "learning_rate": 7.099445507801324e-10, "logits/chosen": -1.6380093097686768, "logits/rejected": -1.589648962020874, "logps/chosen": -0.999239444732666, "logps/rejected": -0.9409993886947632, "loss": 1.2661, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.08351825177669525, "rewards/margins": 0.06452666223049164, "rewards/rejected": 0.018991602584719658, "step": 220 }, { "epoch": 0.4453049370764763, "grad_norm": 4190.546129288723, "learning_rate": 6.7876250391152e-10, "logits/chosen": -1.5896263122558594, "logits/rejected": -1.5483357906341553, "logps/chosen": -0.9513187408447266, "logps/rejected": -0.9711192846298218, "loss": 1.2465, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.011254754848778248, "rewards/margins": 0.014234659262001514, "rewards/rejected": -0.002979907440021634, "step": 230 }, { "epoch": 0.46466602129719264, "grad_norm": 4241.882144515242, "learning_rate": 6.467612865519674e-10, "logits/chosen": -1.6262489557266235, "logits/rejected": -1.6008937358856201, "logps/chosen": -0.9875959157943726, "logps/rejected": -0.9101887941360474, "loss": 1.2863, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.02070781961083412, "rewards/margins": -0.0031043351627886295, "rewards/rejected": 0.023812144994735718, "step": 240 }, { "epoch": 0.484027105517909, "grad_norm": 3918.096198448725, "learning_rate": 6.14087542725593e-10, "logits/chosen": -1.6483112573623657, "logits/rejected": -1.6147336959838867, "logps/chosen": -1.020185112953186, "logps/rejected": -0.9349339604377747, "loss": 1.2945, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.05541396886110306, "rewards/margins": -0.13103903830051422, "rewards/rejected": 0.07562507688999176, "step": 250 }, { "epoch": 0.5033881897386253, "grad_norm": 3626.964808344265, "learning_rate": 5.808909982763825e-10, "logits/chosen": -1.6457252502441406, "logits/rejected": -1.5552377700805664, "logps/chosen": -0.9940276145935059, "logps/rejected": -0.9484678506851196, "loss": 1.274, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.017910266295075417, "rewards/margins": -0.03398443013429642, "rewards/rejected": 0.016074160113930702, "step": 260 }, { "epoch": 0.5227492739593417, "grad_norm": 3993.5474645065597, "learning_rate": 5.473237747567806e-10, "logits/chosen": -1.6354029178619385, "logits/rejected": -1.5715049505233765, "logps/chosen": -0.9696807861328125, "logps/rejected": -0.9575467109680176, "loss": 1.2403, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.052422285079956055, "rewards/margins": -0.0962902158498764, "rewards/rejected": 0.04386794939637184, "step": 270 }, { "epoch": 0.542110358180058, "grad_norm": 4125.713570850562, "learning_rate": 5.135396923380673e-10, "logits/chosen": -1.5687922239303589, "logits/rejected": -1.495790719985962, "logps/chosen": -0.9840999841690063, "logps/rejected": -0.9467118978500366, "loss": 1.274, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.07109468430280685, "rewards/margins": 0.008125528693199158, "rewards/rejected": 0.0629691407084465, "step": 280 }, { "epoch": 0.5614714424007744, "grad_norm": 4172.164367513521, "learning_rate": 4.796935649368935e-10, "logits/chosen": -1.5752254724502563, "logits/rejected": -1.4961917400360107, "logps/chosen": -1.047147512435913, "logps/rejected": -0.9933904409408569, "loss": 1.2562, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0501960925757885, "rewards/margins": 0.15902431309223175, "rewards/rejected": -0.10882820934057236, "step": 290 }, { "epoch": 0.5808325266214908, "grad_norm": 4075.4312369744443, "learning_rate": 4.4594049078802925e-10, "logits/chosen": -1.5983613729476929, "logits/rejected": -1.501022458076477, "logps/chosen": -0.9565431475639343, "logps/rejected": -0.9199585914611816, "loss": 1.251, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.11743637174367905, "rewards/margins": 0.11259187757968903, "rewards/rejected": 0.004844509996473789, "step": 300 }, { "epoch": 0.6001936108422071, "grad_norm": 4415.771404967329, "learning_rate": 4.1243514171423466e-10, "logits/chosen": -1.5818376541137695, "logits/rejected": -1.5412604808807373, "logps/chosen": -0.9797090291976929, "logps/rejected": -0.9499006271362305, "loss": 1.2733, "rewards/accuracies": 0.46875, "rewards/chosen": 0.04855315759778023, "rewards/margins": -0.004827280528843403, "rewards/rejected": 0.053380437195301056, "step": 310 }, { "epoch": 0.6195546950629235, "grad_norm": 4020.4622713375675, "learning_rate": 3.793310543501473e-10, "logits/chosen": -1.661425232887268, "logits/rejected": -1.589224100112915, "logps/chosen": -0.9785275459289551, "logps/rejected": -0.9554710388183594, "loss": 1.281, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.017457595095038414, "rewards/margins": -0.04888144135475159, "rewards/rejected": 0.06633903831243515, "step": 320 }, { "epoch": 0.6389157792836399, "grad_norm": 4011.0955669502655, "learning_rate": 3.4677992656811053e-10, "logits/chosen": -1.6328061819076538, "logits/rejected": -1.6010538339614868, "logps/chosen": -1.0184727907180786, "logps/rejected": -0.9700371623039246, "loss": 1.2624, "rewards/accuracies": 0.46875, "rewards/chosen": -0.015060502104461193, "rewards/margins": -0.0113009512424469, "rewards/rejected": -0.0037595562171190977, "step": 330 }, { "epoch": 0.6582768635043562, "grad_norm": 4234.390545263189, "learning_rate": 3.149309223300428e-10, "logits/chosen": -1.532965064048767, "logits/rejected": -1.5039539337158203, "logps/chosen": -1.0615012645721436, "logps/rejected": -0.975587010383606, "loss": 1.2536, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.23547323048114777, "rewards/margins": 0.2764241397380829, "rewards/rejected": -0.04095090553164482, "step": 340 }, { "epoch": 0.6776379477250726, "grad_norm": 4310.225886680266, "learning_rate": 2.8392998815082717e-10, "logits/chosen": -1.6574161052703857, "logits/rejected": -1.5642915964126587, "logps/chosen": -1.0545518398284912, "logps/rejected": -1.055479645729065, "loss": 1.2634, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.025425389409065247, "rewards/margins": 0.06826835870742798, "rewards/rejected": -0.04284298047423363, "step": 350 }, { "epoch": 0.6969990319457889, "grad_norm": 3627.3654619519734, "learning_rate": 2.5391918430549634e-10, "logits/chosen": -1.6917625665664673, "logits/rejected": -1.6304069757461548, "logps/chosen": -1.0242336988449097, "logps/rejected": -0.9425589442253113, "loss": 1.2358, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0516384020447731, "rewards/margins": 0.07235859334468842, "rewards/rejected": -0.020720209926366806, "step": 360 }, { "epoch": 0.7163601161665053, "grad_norm": 3729.350909018597, "learning_rate": 2.250360338449226e-10, "logits/chosen": -1.7181346416473389, "logits/rejected": -1.7049201726913452, "logps/chosen": -0.9749780893325806, "logps/rejected": -0.9269036054611206, "loss": 1.2295, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.060578178614377975, "rewards/margins": 0.004380314145237207, "rewards/rejected": -0.06495849788188934, "step": 370 }, { "epoch": 0.7357212003872217, "grad_norm": 3635.364845851746, "learning_rate": 1.9741289240311756e-10, "logits/chosen": -1.6250957250595093, "logits/rejected": -1.5766202211380005, "logps/chosen": -0.9923038482666016, "logps/rejected": -0.95228511095047, "loss": 1.279, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.03353533893823624, "rewards/margins": -0.006679633166640997, "rewards/rejected": 0.04021497443318367, "step": 380 }, { "epoch": 0.755082284607938, "grad_norm": 3489.267034208794, "learning_rate": 1.7117634168396773e-10, "logits/chosen": -1.6205106973648071, "logits/rejected": -1.5565264225006104, "logps/chosen": -1.0025845766067505, "logps/rejected": -0.9802389144897461, "loss": 1.2588, "rewards/accuracies": 0.53125, "rewards/chosen": 0.1880512535572052, "rewards/margins": 0.2229468822479248, "rewards/rejected": -0.034895628690719604, "step": 390 }, { "epoch": 0.7744433688286544, "grad_norm": 3578.1741607281892, "learning_rate": 1.4644660940672628e-10, "logits/chosen": -1.6632875204086304, "logits/rejected": -1.5778895616531372, "logps/chosen": -0.9968886375427246, "logps/rejected": -0.9527746438980103, "loss": 1.2605, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.01907891035079956, "rewards/margins": -0.09163277596235275, "rewards/rejected": 0.11071167141199112, "step": 400 }, { "epoch": 0.7938044530493708, "grad_norm": 4697.104208557154, "learning_rate": 1.2333701836832813e-10, "logits/chosen": -1.6258773803710938, "logits/rejected": -1.5613044500350952, "logps/chosen": -0.9721547365188599, "logps/rejected": -0.9434949159622192, "loss": 1.2668, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.04442809149622917, "rewards/margins": 0.06200051307678223, "rewards/rejected": -0.01757242903113365, "step": 410 }, { "epoch": 0.8131655372700871, "grad_norm": 4633.7276813164935, "learning_rate": 1.0195346714717813e-10, "logits/chosen": -1.537536859512329, "logits/rejected": -1.5266730785369873, "logps/chosen": -0.965029239654541, "logps/rejected": -0.9463868141174316, "loss": 1.2845, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.0605890154838562, "rewards/margins": -0.1169745922088623, "rewards/rejected": 0.05638556554913521, "step": 420 }, { "epoch": 0.8325266214908035, "grad_norm": 4035.95185171689, "learning_rate": 8.239394482805996e-11, "logits/chosen": -1.5938284397125244, "logits/rejected": -1.5492713451385498, "logps/chosen": -1.0192432403564453, "logps/rejected": -0.9626362919807434, "loss": 1.2589, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.001451274729333818, "rewards/margins": 0.09064897149801254, "rewards/rejected": -0.08919770270586014, "step": 430 }, { "epoch": 0.8518877057115198, "grad_norm": 4317.7906947065, "learning_rate": 6.474808197191401e-11, "logits/chosen": -1.631870985031128, "logits/rejected": -1.5819487571716309, "logps/chosen": -1.0460567474365234, "logps/rejected": -0.9752202033996582, "loss": 1.2476, "rewards/accuracies": 0.53125, "rewards/chosen": 0.15858839452266693, "rewards/margins": 0.21004195511341095, "rewards/rejected": -0.051453519612550735, "step": 440 }, { "epoch": 0.8712487899322362, "grad_norm": 4211.087790819232, "learning_rate": 4.9096739888146e-11, "logits/chosen": -1.6219565868377686, "logits/rejected": -1.5289008617401123, "logps/chosen": -1.013887643814087, "logps/rejected": -0.9734300374984741, "loss": 1.2654, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.10689739137887955, "rewards/margins": 0.05857623741030693, "rewards/rejected": 0.04832116514444351, "step": 450 }, { "epoch": 0.8906098741529526, "grad_norm": 3769.26467222043, "learning_rate": 3.5511640091604293e-11, "logits/chosen": -1.5646103620529175, "logits/rejected": -1.5271342992782593, "logps/chosen": -1.0559688806533813, "logps/rejected": -0.9385896921157837, "loss": 1.2187, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.07790811359882355, "rewards/margins": 0.21173524856567383, "rewards/rejected": -0.13382713496685028, "step": 460 }, { "epoch": 0.9099709583736689, "grad_norm": 4757.142899979977, "learning_rate": 2.4055035642222225e-11, "logits/chosen": -1.6382324695587158, "logits/rejected": -1.5700418949127197, "logps/chosen": -0.9839082956314087, "logps/rejected": -0.9138771295547485, "loss": 1.2532, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.08895201981067657, "rewards/margins": 0.04646927863359451, "rewards/rejected": 0.04248274117708206, "step": 470 }, { "epoch": 0.9293320425943853, "grad_norm": 3836.5764604419264, "learning_rate": 1.477942587339426e-11, "logits/chosen": -1.6090694665908813, "logits/rejected": -1.5662223100662231, "logps/chosen": -0.9489457011222839, "logps/rejected": -0.90757817029953, "loss": 1.2642, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.05137147754430771, "rewards/margins": -0.01941109262406826, "rewards/rejected": -0.0319603867828846, "step": 480 }, { "epoch": 0.9486931268151017, "grad_norm": 4250.965954777766, "learning_rate": 7.727315816331515e-12, "logits/chosen": -1.6554877758026123, "logits/rejected": -1.6055545806884766, "logps/chosen": -1.0777404308319092, "logps/rejected": -1.0004560947418213, "loss": 1.2376, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.07162754982709885, "rewards/margins": 0.12462921440601349, "rewards/rejected": -0.05300166457891464, "step": 490 }, { "epoch": 0.968054211035818, "grad_norm": 3914.5437511702235, "learning_rate": 2.9310214228202016e-12, "logits/chosen": -1.6229422092437744, "logits/rejected": -1.5482664108276367, "logps/chosen": -0.9587628245353699, "logps/rejected": -0.9172463417053223, "loss": 1.2265, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.032034747302532196, "rewards/margins": -0.1518467366695404, "rewards/rejected": 0.1198119968175888, "step": 500 }, { "epoch": 0.9874152952565344, "grad_norm": 4236.531801010516, "learning_rate": 4.125214789427734e-13, "logits/chosen": -1.5640289783477783, "logits/rejected": -1.5025882720947266, "logps/chosen": -0.9930515289306641, "logps/rejected": -0.9385469555854797, "loss": 1.254, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.09590280055999756, "rewards/margins": 0.0512530580163002, "rewards/rejected": 0.04464975371956825, "step": 510 }, { "epoch": 0.9990319457889641, "step": 516, "total_flos": 0.0, "train_loss": 1.2622329578843228, "train_runtime": 8815.3772, "train_samples_per_second": 7.496, "train_steps_per_second": 0.059 } ], "logging_steps": 10, "max_steps": 516, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }