{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 4424, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.32421875, "learning_rate": 1.128668171557562e-09, "logits/chosen": -1.089872121810913, "logits/rejected": -1.1662957668304443, "logps/chosen": -88.48556518554688, "logps/rejected": -128.17625427246094, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.376953125, "learning_rate": 1.128668171557562e-08, "logits/chosen": -1.362803339958191, "logits/rejected": -1.0890824794769287, "logps/chosen": -253.95407104492188, "logps/rejected": -205.00909423828125, "loss": 0.693, "rewards/accuracies": 0.3055555522441864, "rewards/chosen": -0.0007014232687652111, "rewards/margins": 4.238979454385117e-05, "rewards/margins_max": 0.0018888049526140094, "rewards/margins_min": -0.0018040253780782223, "rewards/margins_std": 0.002611225238069892, "rewards/rejected": -0.0007438129978254437, "step": 10 }, { "epoch": 0.0, "grad_norm": 0.416015625, "learning_rate": 2.257336343115124e-08, "logits/chosen": -1.3700335025787354, "logits/rejected": -1.0615637302398682, "logps/chosen": -324.6263122558594, "logps/rejected": -199.73663330078125, "loss": 0.6929, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.002209640573710203, "rewards/margins": 0.0004743327444884926, "rewards/margins_max": 0.002534114755690098, "rewards/margins_min": -0.0015854493249207735, "rewards/margins_std": 0.002912971656769514, "rewards/rejected": 0.0017353076254948974, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.361328125, "learning_rate": 3.3860045146726863e-08, "logits/chosen": -1.2922523021697998, "logits/rejected": -1.0249769687652588, "logps/chosen": -235.274658203125, "logps/rejected": -220.1529541015625, "loss": 0.6927, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0010927047114819288, "rewards/margins": 0.0009888228960335255, "rewards/margins_max": 0.0037243079859763384, "rewards/margins_min": -0.0017466619610786438, "rewards/margins_std": 0.003868559841066599, "rewards/rejected": 0.0001038818372762762, "step": 30 }, { "epoch": 0.01, "grad_norm": 0.306640625, "learning_rate": 4.514672686230248e-08, "logits/chosen": -1.395784616470337, "logits/rejected": -1.054032564163208, "logps/chosen": -255.254150390625, "logps/rejected": -252.3551483154297, "loss": 0.693, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0007894478039816022, "rewards/margins": 0.0006032294477336109, "rewards/margins_max": 0.0026328391395509243, "rewards/margins_min": -0.0014263801276683807, "rewards/margins_std": 0.0028703012503683567, "rewards/rejected": 0.00018621828348841518, "step": 40 }, { "epoch": 0.01, "grad_norm": 0.431640625, "learning_rate": 5.64334085778781e-08, "logits/chosen": -1.3574692010879517, "logits/rejected": -1.0375534296035767, "logps/chosen": -231.46517944335938, "logps/rejected": -224.56625366210938, "loss": 0.6925, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0005603065947070718, "rewards/margins": 0.001050409278832376, "rewards/margins_max": 0.0026080321986228228, "rewards/margins_min": -0.0005072135827504098, "rewards/margins_std": 0.0022028114181011915, "rewards/rejected": -0.0004901026841253042, "step": 50 }, { "epoch": 0.01, "grad_norm": 0.5, "learning_rate": 6.772009029345373e-08, "logits/chosen": -1.3851702213287354, "logits/rejected": -1.0226211547851562, "logps/chosen": -213.94430541992188, "logps/rejected": -209.12704467773438, "loss": 0.6925, "rewards/accuracies": 0.625, "rewards/chosen": 0.001433684374205768, "rewards/margins": 0.0013705453602597117, "rewards/margins_max": 0.0033714137971401215, "rewards/margins_min": -0.0006303234258666635, "rewards/margins_std": 0.0028296555392444134, "rewards/rejected": 6.313894118648022e-05, "step": 60 }, { "epoch": 0.02, "grad_norm": 0.58984375, "learning_rate": 7.900677200902935e-08, "logits/chosen": -1.3688232898712158, "logits/rejected": -0.9564453363418579, "logps/chosen": -246.9102325439453, "logps/rejected": -241.5129852294922, "loss": 0.693, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.00036182976327836514, "rewards/margins": 0.0004978332435712218, "rewards/margins_max": 0.0033143579494208097, "rewards/margins_min": -0.0023186912294477224, "rewards/margins_std": 0.003983167465776205, "rewards/rejected": -0.0001360034802928567, "step": 70 }, { "epoch": 0.02, "grad_norm": 0.49609375, "learning_rate": 9.029345372460496e-08, "logits/chosen": -1.3354227542877197, "logits/rejected": -1.1622496843338013, "logps/chosen": -238.8887481689453, "logps/rejected": -231.7976837158203, "loss": 0.6928, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0011917247902601957, "rewards/margins": 0.000608589150942862, "rewards/margins_max": 0.003103874158114195, "rewards/margins_min": -0.0018866958562284708, "rewards/margins_std": 0.0035288657527416945, "rewards/rejected": 0.0005831356393173337, "step": 80 }, { "epoch": 0.02, "grad_norm": 0.4453125, "learning_rate": 1.0158013544018059e-07, "logits/chosen": -1.2897207736968994, "logits/rejected": -1.0346763134002686, "logps/chosen": -199.93191528320312, "logps/rejected": -223.67257690429688, "loss": 0.6932, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.00017602155276108533, "rewards/margins": -9.21573955565691e-05, "rewards/margins_max": 0.0014471550239250064, "rewards/margins_min": -0.0016314696986228228, "rewards/margins_std": 0.002176916692405939, "rewards/rejected": 0.00026817896286956966, "step": 90 }, { "epoch": 0.02, "grad_norm": 0.34765625, "learning_rate": 1.128668171557562e-07, "logits/chosen": -1.3299012184143066, "logits/rejected": -1.134303092956543, "logps/chosen": -196.38111877441406, "logps/rejected": -186.2175750732422, "loss": 0.6929, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -6.178906005516183e-06, "rewards/margins": 0.0005631408421322703, "rewards/margins_max": 0.0025123213417828083, "rewards/margins_min": -0.0013860397739335895, "rewards/margins_std": 0.0027565578930079937, "rewards/rejected": -0.0005693196435458958, "step": 100 }, { "epoch": 0.02, "grad_norm": 0.423828125, "learning_rate": 1.2415349887133183e-07, "logits/chosen": -1.2904781103134155, "logits/rejected": -1.0873281955718994, "logps/chosen": -240.749755859375, "logps/rejected": -271.0155334472656, "loss": 0.6926, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.000635267177131027, "rewards/margins": 0.0005631255335174501, "rewards/margins_max": 0.003234363393858075, "rewards/margins_min": -0.002108112210407853, "rewards/margins_std": 0.0037777007091790438, "rewards/rejected": 7.214168726932257e-05, "step": 110 }, { "epoch": 0.03, "grad_norm": 0.357421875, "learning_rate": 1.3544018058690745e-07, "logits/chosen": -1.4081692695617676, "logits/rejected": -1.034285306930542, "logps/chosen": -194.2665252685547, "logps/rejected": -175.28851318359375, "loss": 0.6928, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0005045271245762706, "rewards/margins": 0.0010185183491557837, "rewards/margins_max": 0.0025255356449633837, "rewards/margins_min": -0.0004884987138211727, "rewards/margins_std": 0.0021312441676855087, "rewards/rejected": -0.000513991282787174, "step": 120 }, { "epoch": 0.03, "grad_norm": 0.361328125, "learning_rate": 1.4672686230248308e-07, "logits/chosen": -1.3172610998153687, "logits/rejected": -1.070671796798706, "logps/chosen": -248.647216796875, "logps/rejected": -243.6386260986328, "loss": 0.6926, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0010833472479134798, "rewards/margins": 0.001399085856974125, "rewards/margins_max": 0.0033245470840483904, "rewards/margins_min": -0.0005263749626465142, "rewards/margins_std": 0.0027230128180235624, "rewards/rejected": -0.000315738667268306, "step": 130 }, { "epoch": 0.03, "grad_norm": 0.42578125, "learning_rate": 1.580135440180587e-07, "logits/chosen": -1.3338706493377686, "logits/rejected": -1.0173786878585815, "logps/chosen": -239.69186401367188, "logps/rejected": -195.95968627929688, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": 0.0005986442556604743, "rewards/margins": 0.00034580417559482157, "rewards/margins_max": 0.0022851484827697277, "rewards/margins_min": -0.0015935400733724236, "rewards/margins_std": 0.0027426474262028933, "rewards/rejected": 0.00025284005096182227, "step": 140 }, { "epoch": 0.03, "grad_norm": 0.41796875, "learning_rate": 1.693002257336343e-07, "logits/chosen": -1.4671242237091064, "logits/rejected": -1.1492918729782104, "logps/chosen": -244.544921875, "logps/rejected": -223.718505859375, "loss": 0.6928, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0007274551317095757, "rewards/margins": 0.0009307805448770523, "rewards/margins_max": 0.003343376098200679, "rewards/margins_min": -0.0014818150084465742, "rewards/margins_std": 0.003411925630643964, "rewards/rejected": -0.00020332522399257869, "step": 150 }, { "epoch": 0.04, "grad_norm": 0.337890625, "learning_rate": 1.8058690744920993e-07, "logits/chosen": -1.5086791515350342, "logits/rejected": -1.1972728967666626, "logps/chosen": -222.0345916748047, "logps/rejected": -223.88845825195312, "loss": 0.6927, "rewards/accuracies": 0.625, "rewards/chosen": 0.0009902592282742262, "rewards/margins": 0.0011595649411901832, "rewards/margins_max": 0.0028987047262489796, "rewards/margins_min": -0.0005795744946226478, "rewards/margins_std": 0.0024595148861408234, "rewards/rejected": -0.0001693058293312788, "step": 160 }, { "epoch": 0.04, "grad_norm": 0.412109375, "learning_rate": 1.9187358916478555e-07, "logits/chosen": -1.331568717956543, "logits/rejected": -1.0612637996673584, "logps/chosen": -276.21881103515625, "logps/rejected": -231.5882110595703, "loss": 0.6925, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0013450165279209614, "rewards/margins": 0.0012487067142501473, "rewards/margins_max": 0.002802295843139291, "rewards/margins_min": -0.0003048820362892002, "rewards/margins_std": 0.0021971066016703844, "rewards/rejected": 9.630967542761937e-05, "step": 170 }, { "epoch": 0.04, "grad_norm": 0.3671875, "learning_rate": 2.0316027088036118e-07, "logits/chosen": -1.2156211137771606, "logits/rejected": -1.1811649799346924, "logps/chosen": -175.79335021972656, "logps/rejected": -235.2672576904297, "loss": 0.6924, "rewards/accuracies": 0.625, "rewards/chosen": 0.0019137079361826181, "rewards/margins": 0.0013536510523408651, "rewards/margins_max": 0.0036528133787214756, "rewards/margins_min": -0.0009455106919631362, "rewards/margins_std": 0.0032515060156583786, "rewards/rejected": 0.0005600567674264312, "step": 180 }, { "epoch": 0.04, "grad_norm": 0.5234375, "learning_rate": 2.1444695259593678e-07, "logits/chosen": -1.391552209854126, "logits/rejected": -1.1357841491699219, "logps/chosen": -217.08740234375, "logps/rejected": -228.02847290039062, "loss": 0.6922, "rewards/accuracies": 0.75, "rewards/chosen": 0.0020487727597355843, "rewards/margins": 0.002171220723539591, "rewards/margins_max": 0.003962562419474125, "rewards/margins_min": 0.00037987896939739585, "rewards/margins_std": 0.0025333398953080177, "rewards/rejected": -0.00012244793470017612, "step": 190 }, { "epoch": 0.05, "grad_norm": 0.365234375, "learning_rate": 2.257336343115124e-07, "logits/chosen": -1.2622658014297485, "logits/rejected": -1.0098450183868408, "logps/chosen": -292.15155029296875, "logps/rejected": -233.5673065185547, "loss": 0.6925, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.001023811404593289, "rewards/margins": 0.0009746513096615672, "rewards/margins_max": 0.0035980145912617445, "rewards/margins_min": -0.0016487122047692537, "rewards/margins_std": 0.003709996584802866, "rewards/rejected": 4.915996760246344e-05, "step": 200 }, { "epoch": 0.05, "grad_norm": 0.462890625, "learning_rate": 2.3702031602708803e-07, "logits/chosen": -1.399113416671753, "logits/rejected": -1.2889292240142822, "logps/chosen": -243.4051971435547, "logps/rejected": -231.0285186767578, "loss": 0.6925, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0008327178657054901, "rewards/margins": 0.0006204862147569656, "rewards/margins_max": 0.003349609673023224, "rewards/margins_min": -0.0021086367778480053, "rewards/margins_std": 0.003859562799334526, "rewards/rejected": 0.00021223162184469402, "step": 210 }, { "epoch": 0.05, "grad_norm": 0.390625, "learning_rate": 2.4830699774266366e-07, "logits/chosen": -1.2222211360931396, "logits/rejected": -0.9842857122421265, "logps/chosen": -229.5582733154297, "logps/rejected": -194.75865173339844, "loss": 0.6921, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.001050114631652832, "rewards/margins": 0.001585352816618979, "rewards/margins_max": 0.00386607157997787, "rewards/margins_min": -0.000695365946739912, "rewards/margins_std": 0.00322542292997241, "rewards/rejected": -0.0005352382431738079, "step": 220 }, { "epoch": 0.05, "grad_norm": 0.50390625, "learning_rate": 2.595936794582393e-07, "logits/chosen": -1.2810051441192627, "logits/rejected": -1.065126657485962, "logps/chosen": -206.84463500976562, "logps/rejected": -192.6746368408203, "loss": 0.6922, "rewards/accuracies": 0.625, "rewards/chosen": 0.001871080370619893, "rewards/margins": 0.0016663589049130678, "rewards/margins_max": 0.0038144756108522415, "rewards/margins_min": -0.0004817581211682409, "rewards/margins_std": 0.003037896240130067, "rewards/rejected": 0.00020472146570682526, "step": 230 }, { "epoch": 0.05, "grad_norm": 0.36328125, "learning_rate": 2.708803611738149e-07, "logits/chosen": -1.2675464153289795, "logits/rejected": -0.9945747256278992, "logps/chosen": -231.3415069580078, "logps/rejected": -205.28945922851562, "loss": 0.6921, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.0021059871651232243, "rewards/margins": 0.001979165943339467, "rewards/margins_max": 0.004107628017663956, "rewards/margins_min": -0.0001492957817390561, "rewards/margins_std": 0.0030100992880761623, "rewards/rejected": 0.000126821527373977, "step": 240 }, { "epoch": 0.06, "grad_norm": 0.330078125, "learning_rate": 2.8216704288939053e-07, "logits/chosen": -1.492440104484558, "logits/rejected": -1.2370370626449585, "logps/chosen": -222.0819549560547, "logps/rejected": -246.7217559814453, "loss": 0.692, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0020398388151079416, "rewards/margins": 0.0020460395608097315, "rewards/margins_max": 0.0041878316551446915, "rewards/margins_min": -9.57526353886351e-05, "rewards/margins_std": 0.0030289513524621725, "rewards/rejected": -6.200841653480893e-06, "step": 250 }, { "epoch": 0.06, "grad_norm": 0.54296875, "learning_rate": 2.9345372460496616e-07, "logits/chosen": -1.3066132068634033, "logits/rejected": -0.9856610298156738, "logps/chosen": -248.71292114257812, "logps/rejected": -238.1698455810547, "loss": 0.6919, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0023428858257830143, "rewards/margins": 0.0027398201636970043, "rewards/margins_max": 0.005885337945073843, "rewards/margins_min": -0.0004056969773955643, "rewards/margins_std": 0.004448432940989733, "rewards/rejected": -0.00039693451253697276, "step": 260 }, { "epoch": 0.06, "grad_norm": 0.32421875, "learning_rate": 3.047404063205418e-07, "logits/chosen": -1.2814630270004272, "logits/rejected": -1.139473557472229, "logps/chosen": -173.00698852539062, "logps/rejected": -206.34634399414062, "loss": 0.6915, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.0025371015071868896, "rewards/margins": 0.0033828583545982838, "rewards/margins_max": 0.006202323827892542, "rewards/margins_min": 0.0005633925902657211, "rewards/margins_std": 0.003987326752394438, "rewards/rejected": -0.0008457564981654286, "step": 270 }, { "epoch": 0.06, "grad_norm": 0.396484375, "learning_rate": 3.160270880361174e-07, "logits/chosen": -1.3292253017425537, "logits/rejected": -1.0852205753326416, "logps/chosen": -287.0424499511719, "logps/rejected": -187.3642578125, "loss": 0.6917, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0009970188839361072, "rewards/margins": 0.002404967090114951, "rewards/margins_max": 0.005034612491726875, "rewards/margins_min": -0.00022467815142590553, "rewards/margins_std": 0.003718879772350192, "rewards/rejected": -0.0014079485554248095, "step": 280 }, { "epoch": 0.07, "grad_norm": 0.36328125, "learning_rate": 3.27313769751693e-07, "logits/chosen": -1.4166836738586426, "logits/rejected": -1.1044247150421143, "logps/chosen": -228.96609497070312, "logps/rejected": -210.9814910888672, "loss": 0.6911, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.003559564473107457, "rewards/margins": 0.0047764526680111885, "rewards/margins_max": 0.007028171326965094, "rewards/margins_min": 0.0025247344747185707, "rewards/margins_std": 0.0031844109762459993, "rewards/rejected": -0.0012168881949037313, "step": 290 }, { "epoch": 0.07, "grad_norm": 0.3984375, "learning_rate": 3.386004514672686e-07, "logits/chosen": -1.3330157995224, "logits/rejected": -1.1392042636871338, "logps/chosen": -188.25454711914062, "logps/rejected": -213.46505737304688, "loss": 0.6912, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.0028670013416558504, "rewards/margins": 0.004401736427098513, "rewards/margins_max": 0.007208372000604868, "rewards/margins_min": 0.0015951006207615137, "rewards/margins_std": 0.003969182260334492, "rewards/rejected": -0.0015347347361966968, "step": 300 }, { "epoch": 0.07, "grad_norm": 0.302734375, "learning_rate": 3.4988713318284423e-07, "logits/chosen": -1.503370761871338, "logits/rejected": -1.1487318277359009, "logps/chosen": -274.6285705566406, "logps/rejected": -273.916259765625, "loss": 0.6904, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.004510351922363043, "rewards/margins": 0.005128798075020313, "rewards/margins_max": 0.008591363206505775, "rewards/margins_min": 0.0016662331763654947, "rewards/margins_std": 0.004896806553006172, "rewards/rejected": -0.0006184463272802532, "step": 310 }, { "epoch": 0.07, "grad_norm": 0.416015625, "learning_rate": 3.6117381489841986e-07, "logits/chosen": -1.273756504058838, "logits/rejected": -1.0820574760437012, "logps/chosen": -151.35482788085938, "logps/rejected": -185.79380798339844, "loss": 0.691, "rewards/accuracies": 0.75, "rewards/chosen": 0.0040741064585745335, "rewards/margins": 0.004994765855371952, "rewards/margins_max": 0.008913101628422737, "rewards/margins_min": 0.0010764312464743853, "rewards/margins_std": 0.0055413623340427876, "rewards/rejected": -0.0009206599788740277, "step": 320 }, { "epoch": 0.07, "grad_norm": 0.5859375, "learning_rate": 3.724604966139955e-07, "logits/chosen": -1.4037578105926514, "logits/rejected": -1.166176438331604, "logps/chosen": -241.0926055908203, "logps/rejected": -225.5806427001953, "loss": 0.6909, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.004344758577644825, "rewards/margins": 0.0043768687173724174, "rewards/margins_max": 0.007904845289885998, "rewards/margins_min": 0.000848892261274159, "rewards/margins_std": 0.004989312961697578, "rewards/rejected": -3.211015064152889e-05, "step": 330 }, { "epoch": 0.08, "grad_norm": 0.3828125, "learning_rate": 3.837471783295711e-07, "logits/chosen": -1.2480767965316772, "logits/rejected": -1.085233211517334, "logps/chosen": -203.14663696289062, "logps/rejected": -192.7289581298828, "loss": 0.691, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.004463316407054663, "rewards/margins": 0.005001317244023085, "rewards/margins_max": 0.007284390274435282, "rewards/margins_min": 0.002718244446441531, "rewards/margins_std": 0.0032287519425153732, "rewards/rejected": -0.0005380000802688301, "step": 340 }, { "epoch": 0.08, "grad_norm": 0.447265625, "learning_rate": 3.9503386004514673e-07, "logits/chosen": -1.2298004627227783, "logits/rejected": -1.0297021865844727, "logps/chosen": -223.0439453125, "logps/rejected": -187.34703063964844, "loss": 0.6903, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.002816079882904887, "rewards/margins": 0.004824903793632984, "rewards/margins_max": 0.007285586558282375, "rewards/margins_min": 0.002364219631999731, "rewards/margins_std": 0.0034799326676875353, "rewards/rejected": -0.002008823212236166, "step": 350 }, { "epoch": 0.08, "grad_norm": 0.427734375, "learning_rate": 4.0632054176072236e-07, "logits/chosen": -1.3313958644866943, "logits/rejected": -1.11452317237854, "logps/chosen": -213.21676635742188, "logps/rejected": -171.91574096679688, "loss": 0.6904, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.003997773863375187, "rewards/margins": 0.005147859454154968, "rewards/margins_max": 0.008886572904884815, "rewards/margins_min": 0.0014091453049331903, "rewards/margins_std": 0.005287340376526117, "rewards/rejected": -0.0011500853579491377, "step": 360 }, { "epoch": 0.08, "grad_norm": 0.44921875, "learning_rate": 4.1760722347629793e-07, "logits/chosen": -1.4377549886703491, "logits/rejected": -1.2096660137176514, "logps/chosen": -189.43063354492188, "logps/rejected": -193.5634307861328, "loss": 0.6896, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.006203767843544483, "rewards/margins": 0.0066702342592179775, "rewards/margins_max": 0.010389590635895729, "rewards/margins_min": 0.0029508781153708696, "rewards/margins_std": 0.005259964149445295, "rewards/rejected": -0.00046646693954244256, "step": 370 }, { "epoch": 0.09, "grad_norm": 0.47265625, "learning_rate": 4.2889390519187356e-07, "logits/chosen": -1.4636600017547607, "logits/rejected": -1.0369333028793335, "logps/chosen": -206.52481079101562, "logps/rejected": -206.0133514404297, "loss": 0.6894, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.006550403777509928, "rewards/margins": 0.008564477786421776, "rewards/margins_max": 0.012663841247558594, "rewards/margins_min": 0.0044651152566075325, "rewards/margins_std": 0.005797374993562698, "rewards/rejected": -0.0020140744745731354, "step": 380 }, { "epoch": 0.09, "grad_norm": 0.361328125, "learning_rate": 4.401805869074492e-07, "logits/chosen": -1.336717128753662, "logits/rejected": -0.9665637016296387, "logps/chosen": -212.57272338867188, "logps/rejected": -176.93348693847656, "loss": 0.6896, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.005571245681494474, "rewards/margins": 0.007382750511169434, "rewards/margins_max": 0.0123984944075346, "rewards/margins_min": 0.0023670082446187735, "rewards/margins_std": 0.0070933327078819275, "rewards/rejected": -0.0018115064594894648, "step": 390 }, { "epoch": 0.09, "grad_norm": 0.22265625, "learning_rate": 4.514672686230248e-07, "logits/chosen": -1.4351770877838135, "logits/rejected": -1.120939016342163, "logps/chosen": -203.64779663085938, "logps/rejected": -206.8728485107422, "loss": 0.6896, "rewards/accuracies": 0.875, "rewards/chosen": 0.008293208666145802, "rewards/margins": 0.009503757581114769, "rewards/margins_max": 0.015738772228360176, "rewards/margins_min": 0.0032687417697161436, "rewards/margins_std": 0.008817643858492374, "rewards/rejected": -0.0012105483328923583, "step": 400 }, { "epoch": 0.09, "grad_norm": 0.349609375, "learning_rate": 4.6275395033860043e-07, "logits/chosen": -1.2792364358901978, "logits/rejected": -1.0543757677078247, "logps/chosen": -227.12451171875, "logps/rejected": -197.13278198242188, "loss": 0.6902, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.005497048608958721, "rewards/margins": 0.006281238980591297, "rewards/margins_max": 0.01041505765169859, "rewards/margins_min": 0.0021474191453307867, "rewards/margins_std": 0.005846103187650442, "rewards/rejected": -0.0007841892656870186, "step": 410 }, { "epoch": 0.09, "grad_norm": 0.330078125, "learning_rate": 4.7404063205417606e-07, "logits/chosen": -1.4206167459487915, "logits/rejected": -1.1501795053482056, "logps/chosen": -351.7145690917969, "logps/rejected": -253.0180206298828, "loss": 0.6895, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.007716922555118799, "rewards/margins": 0.0065300120040774345, "rewards/margins_max": 0.011199641041457653, "rewards/margins_min": 0.0018603820353746414, "rewards/margins_std": 0.006603854242712259, "rewards/rejected": 0.0011869106674566865, "step": 420 }, { "epoch": 0.1, "grad_norm": 0.416015625, "learning_rate": 4.853273137697517e-07, "logits/chosen": -1.460006833076477, "logits/rejected": -1.2835047245025635, "logps/chosen": -222.09024047851562, "logps/rejected": -220.6402587890625, "loss": 0.6896, "rewards/accuracies": 0.875, "rewards/chosen": 0.006278887391090393, "rewards/margins": 0.006416992750018835, "rewards/margins_max": 0.011116179637610912, "rewards/margins_min": 0.0017178058624267578, "rewards/margins_std": 0.0066456543281674385, "rewards/rejected": -0.000138105358928442, "step": 430 }, { "epoch": 0.1, "grad_norm": 0.443359375, "learning_rate": 4.966139954853273e-07, "logits/chosen": -1.4451887607574463, "logits/rejected": -1.039292573928833, "logps/chosen": -272.3094177246094, "logps/rejected": -215.6998291015625, "loss": 0.6887, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.008401526138186455, "rewards/margins": 0.0101736756041646, "rewards/margins_max": 0.014983911998569965, "rewards/margins_min": 0.005363441072404385, "rewards/margins_std": 0.006802698131650686, "rewards/rejected": -0.0017721500480547547, "step": 440 }, { "epoch": 0.1, "grad_norm": 0.455078125, "learning_rate": 4.999961856514226e-07, "logits/chosen": -1.3807470798492432, "logits/rejected": -1.0996475219726562, "logps/chosen": -220.24142456054688, "logps/rejected": -211.8799591064453, "loss": 0.6883, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.007313003297895193, "rewards/margins": 0.011259237304329872, "rewards/margins_max": 0.01768874004483223, "rewards/margins_min": 0.004829735029488802, "rewards/margins_std": 0.009092690423130989, "rewards/rejected": -0.003946233075112104, "step": 450 }, { "epoch": 0.1, "grad_norm": 0.416015625, "learning_rate": 4.999775034079764e-07, "logits/chosen": -1.3176883459091187, "logits/rejected": -0.9752163887023926, "logps/chosen": -195.54110717773438, "logps/rejected": -183.15243530273438, "loss": 0.6883, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.008682606741786003, "rewards/margins": 0.009235577657818794, "rewards/margins_max": 0.01494432520121336, "rewards/margins_min": 0.003526832442730665, "rewards/margins_std": 0.008073386736214161, "rewards/rejected": -0.0005529728368856013, "step": 460 }, { "epoch": 0.11, "grad_norm": 0.439453125, "learning_rate": 4.999432538370056e-07, "logits/chosen": -1.3628463745117188, "logits/rejected": -1.2302682399749756, "logps/chosen": -167.83572387695312, "logps/rejected": -175.12362670898438, "loss": 0.6874, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.007012033369392157, "rewards/margins": 0.01034090481698513, "rewards/margins_max": 0.01631959341466427, "rewards/margins_min": 0.004362213425338268, "rewards/margins_std": 0.008455146104097366, "rewards/rejected": -0.003328870516270399, "step": 470 }, { "epoch": 0.11, "grad_norm": 0.33984375, "learning_rate": 4.998934390713993e-07, "logits/chosen": -1.4936786890029907, "logits/rejected": -1.2519557476043701, "logps/chosen": -191.031982421875, "logps/rejected": -189.51622009277344, "loss": 0.6875, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.005951897706836462, "rewards/margins": 0.007916957139968872, "rewards/margins_max": 0.013470885343849659, "rewards/margins_min": 0.0023630294017493725, "rewards/margins_std": 0.007854441180825233, "rewards/rejected": -0.0019650589674711227, "step": 480 }, { "epoch": 0.11, "grad_norm": 0.296875, "learning_rate": 4.998280622133677e-07, "logits/chosen": -1.2512634992599487, "logits/rejected": -1.0366919040679932, "logps/chosen": -243.0624237060547, "logps/rejected": -188.01596069335938, "loss": 0.6877, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.005484632216393948, "rewards/margins": 0.010485135950148106, "rewards/margins_max": 0.015126067213714123, "rewards/margins_min": 0.005844203755259514, "rewards/margins_std": 0.006563269533216953, "rewards/rejected": -0.005000503268092871, "step": 490 }, { "epoch": 0.11, "grad_norm": 0.3515625, "learning_rate": 4.99747127334249e-07, "logits/chosen": -1.501162052154541, "logits/rejected": -1.1960660219192505, "logps/chosen": -232.68899536132812, "logps/rejected": -200.26589965820312, "loss": 0.6877, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.007800704799592495, "rewards/margins": 0.009690572507679462, "rewards/margins_max": 0.017231535166502, "rewards/margins_min": 0.002149612409994006, "rewards/margins_std": 0.010664528235793114, "rewards/rejected": -0.0018898677080869675, "step": 500 }, { "epoch": 0.12, "grad_norm": 0.390625, "learning_rate": 4.996506394742558e-07, "logits/chosen": -1.3358343839645386, "logits/rejected": -1.0514311790466309, "logps/chosen": -169.50668334960938, "logps/rejected": -182.19361877441406, "loss": 0.6869, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.00925387255847454, "rewards/margins": 0.013080105185508728, "rewards/margins_max": 0.020481396466493607, "rewards/margins_min": 0.005678813438862562, "rewards/margins_std": 0.010467005893588066, "rewards/rejected": -0.0038262330926954746, "step": 510 }, { "epoch": 0.12, "grad_norm": 0.322265625, "learning_rate": 4.995386046421613e-07, "logits/chosen": -1.347860336303711, "logits/rejected": -1.2029502391815186, "logps/chosen": -208.1087646484375, "logps/rejected": -204.68594360351562, "loss": 0.6874, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.007269317749887705, "rewards/margins": 0.012022108770906925, "rewards/margins_max": 0.018901333212852478, "rewards/margins_min": 0.005142883397638798, "rewards/margins_std": 0.009728692471981049, "rewards/rejected": -0.004752790089696646, "step": 520 }, { "epoch": 0.12, "grad_norm": 0.400390625, "learning_rate": 4.994110298149252e-07, "logits/chosen": -1.4562270641326904, "logits/rejected": -1.1467808485031128, "logps/chosen": -229.633544921875, "logps/rejected": -241.7759552001953, "loss": 0.6845, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.013689137995243073, "rewards/margins": 0.022165587171912193, "rewards/margins_max": 0.031851477921009064, "rewards/margins_min": 0.01247970201075077, "rewards/margins_std": 0.013697914779186249, "rewards/rejected": -0.008476451970636845, "step": 530 }, { "epoch": 0.12, "grad_norm": 0.404296875, "learning_rate": 4.992679229372587e-07, "logits/chosen": -1.4816101789474487, "logits/rejected": -1.114330530166626, "logps/chosen": -219.7097625732422, "logps/rejected": -195.24655151367188, "loss": 0.6857, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.014646338298916817, "rewards/margins": 0.01716429367661476, "rewards/margins_max": 0.02907278575003147, "rewards/margins_min": 0.0052558062598109245, "rewards/margins_std": 0.01684114709496498, "rewards/rejected": -0.002517957706004381, "step": 540 }, { "epoch": 0.12, "grad_norm": 0.3515625, "learning_rate": 4.991092929211304e-07, "logits/chosen": -1.310151219367981, "logits/rejected": -0.915892481803894, "logps/chosen": -223.8758087158203, "logps/rejected": -195.31063842773438, "loss": 0.6852, "rewards/accuracies": 0.875, "rewards/chosen": 0.009512702003121376, "rewards/margins": 0.013930651359260082, "rewards/margins_max": 0.022518616169691086, "rewards/margins_min": 0.005342685617506504, "rewards/margins_std": 0.012145215645432472, "rewards/rejected": -0.004417949356138706, "step": 550 }, { "epoch": 0.13, "grad_norm": 0.322265625, "learning_rate": 4.989351496452109e-07, "logits/chosen": -1.4654967784881592, "logits/rejected": -1.1996474266052246, "logps/chosen": -185.1067352294922, "logps/rejected": -197.99349975585938, "loss": 0.6862, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.010758506134152412, "rewards/margins": 0.012723572552204132, "rewards/margins_max": 0.02212095446884632, "rewards/margins_min": 0.0033261929638683796, "rewards/margins_std": 0.013289901427924633, "rewards/rejected": -0.0019650678150355816, "step": 560 }, { "epoch": 0.13, "grad_norm": 0.33984375, "learning_rate": 4.987455039542576e-07, "logits/chosen": -1.4222947359085083, "logits/rejected": -1.238435983657837, "logps/chosen": -155.63284301757812, "logps/rejected": -185.66848754882812, "loss": 0.6852, "rewards/accuracies": 0.875, "rewards/chosen": 0.010133227333426476, "rewards/margins": 0.013187420554459095, "rewards/margins_max": 0.020901057869195938, "rewards/margins_min": 0.005473785102367401, "rewards/margins_std": 0.010908729396760464, "rewards/rejected": -0.003054193453863263, "step": 570 }, { "epoch": 0.13, "grad_norm": 0.2578125, "learning_rate": 4.985403676584397e-07, "logits/chosen": -1.5004786252975464, "logits/rejected": -1.2731705904006958, "logps/chosen": -139.58822631835938, "logps/rejected": -161.0989990234375, "loss": 0.686, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.00958799198269844, "rewards/margins": 0.012686249800026417, "rewards/margins_max": 0.020806532353162766, "rewards/margins_min": 0.004565965384244919, "rewards/margins_std": 0.011483816429972649, "rewards/rejected": -0.00309825805015862, "step": 580 }, { "epoch": 0.13, "grad_norm": 0.396484375, "learning_rate": 4.983197535326024e-07, "logits/chosen": -1.3106216192245483, "logits/rejected": -0.8944109678268433, "logps/chosen": -242.289794921875, "logps/rejected": -325.63177490234375, "loss": 0.6837, "rewards/accuracies": 0.875, "rewards/chosen": 0.015353771857917309, "rewards/margins": 0.020929958671331406, "rewards/margins_max": 0.03252523019909859, "rewards/margins_min": 0.00933468621224165, "rewards/margins_std": 0.016398191452026367, "rewards/rejected": -0.005576184950768948, "step": 590 }, { "epoch": 0.14, "grad_norm": 0.314453125, "learning_rate": 4.980836753154714e-07, "logits/chosen": -1.361307144165039, "logits/rejected": -1.1225764751434326, "logps/chosen": -209.185546875, "logps/rejected": -244.9947509765625, "loss": 0.6836, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.014351347461342812, "rewards/margins": 0.021498367190361023, "rewards/margins_max": 0.03384038060903549, "rewards/margins_min": 0.009156355634331703, "rewards/margins_std": 0.01745423674583435, "rewards/rejected": -0.007147020194679499, "step": 600 }, { "epoch": 0.14, "grad_norm": 0.388671875, "learning_rate": 4.978321477087972e-07, "logits/chosen": -1.3299026489257812, "logits/rejected": -1.0124680995941162, "logps/chosen": -255.3600616455078, "logps/rejected": -245.94985961914062, "loss": 0.6834, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.018793154507875443, "rewards/margins": 0.022329501807689667, "rewards/margins_max": 0.03670556843280792, "rewards/margins_min": 0.007953429594635963, "rewards/margins_std": 0.02033083513379097, "rewards/rejected": -0.003536344738677144, "step": 610 }, { "epoch": 0.14, "grad_norm": 0.333984375, "learning_rate": 4.975651863764402e-07, "logits/chosen": -1.3493794202804565, "logits/rejected": -1.0976240634918213, "logps/chosen": -266.123291015625, "logps/rejected": -219.5732421875, "loss": 0.6863, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.01338467001914978, "rewards/margins": 0.015727603808045387, "rewards/margins_max": 0.028147747740149498, "rewards/margins_min": 0.00330745754763484, "rewards/margins_std": 0.017564736306667328, "rewards/rejected": -0.002342933090403676, "step": 620 }, { "epoch": 0.14, "grad_norm": 0.423828125, "learning_rate": 4.972828079433943e-07, "logits/chosen": -1.48280930519104, "logits/rejected": -1.12116277217865, "logps/chosen": -280.49884033203125, "logps/rejected": -244.3009796142578, "loss": 0.6836, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.01864319108426571, "rewards/margins": 0.017971428111195564, "rewards/margins_max": 0.028208374977111816, "rewards/margins_min": 0.007734485901892185, "rewards/margins_std": 0.014477225951850414, "rewards/rejected": 0.0006717622163705528, "step": 630 }, { "epoch": 0.14, "grad_norm": 0.41015625, "learning_rate": 4.969850299947519e-07, "logits/chosen": -1.4176355600357056, "logits/rejected": -1.071342945098877, "logps/chosen": -280.03997802734375, "logps/rejected": -226.7299346923828, "loss": 0.6822, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.01520386803895235, "rewards/margins": 0.023597296327352524, "rewards/margins_max": 0.0348820798099041, "rewards/margins_min": 0.012312507256865501, "rewards/margins_std": 0.01595909893512726, "rewards/rejected": -0.008393426425755024, "step": 640 }, { "epoch": 0.15, "grad_norm": 0.314453125, "learning_rate": 4.966718710746093e-07, "logits/chosen": -1.4015623331069946, "logits/rejected": -1.1037607192993164, "logps/chosen": -204.43484497070312, "logps/rejected": -258.1009216308594, "loss": 0.6816, "rewards/accuracies": 0.875, "rewards/chosen": 0.024477144703269005, "rewards/margins": 0.023006316274404526, "rewards/margins_max": 0.03518133610486984, "rewards/margins_min": 0.010831299237906933, "rewards/margins_std": 0.017218075692653656, "rewards/rejected": 0.0014708290109410882, "step": 650 }, { "epoch": 0.15, "grad_norm": 0.4375, "learning_rate": 4.963433506849114e-07, "logits/chosen": -1.4727814197540283, "logits/rejected": -1.2456432580947876, "logps/chosen": -241.7753143310547, "logps/rejected": -239.16921997070312, "loss": 0.6839, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.014989370480179787, "rewards/margins": 0.02068949118256569, "rewards/margins_max": 0.03409608453512192, "rewards/margins_min": 0.007282900158315897, "rewards/margins_std": 0.018959784880280495, "rewards/rejected": -0.005700122099369764, "step": 660 }, { "epoch": 0.15, "grad_norm": 0.3671875, "learning_rate": 4.959994892842371e-07, "logits/chosen": -1.1937696933746338, "logits/rejected": -1.015245795249939, "logps/chosen": -267.62725830078125, "logps/rejected": -282.30084228515625, "loss": 0.6841, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.014117559418082237, "rewards/margins": 0.01710415445268154, "rewards/margins_max": 0.028150636702775955, "rewards/margins_min": 0.006057672202587128, "rewards/margins_std": 0.015622084960341454, "rewards/rejected": -0.002986595267429948, "step": 670 }, { "epoch": 0.15, "grad_norm": 0.3359375, "learning_rate": 4.956403082865256e-07, "logits/chosen": -1.2967605590820312, "logits/rejected": -1.0511850118637085, "logps/chosen": -222.7646484375, "logps/rejected": -171.53314208984375, "loss": 0.6826, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.01969096250832081, "rewards/margins": 0.024944283068180084, "rewards/margins_max": 0.03900546580553055, "rewards/margins_min": 0.01088310219347477, "rewards/margins_std": 0.019885513931512833, "rewards/rejected": -0.005253321956843138, "step": 680 }, { "epoch": 0.16, "grad_norm": 0.30078125, "learning_rate": 4.952658300597427e-07, "logits/chosen": -1.3829014301300049, "logits/rejected": -1.1351196765899658, "logps/chosen": -203.21536254882812, "logps/rejected": -208.79837036132812, "loss": 0.6818, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.019899826496839523, "rewards/margins": 0.024015935137867928, "rewards/margins_max": 0.034795574843883514, "rewards/margins_min": 0.013236296363174915, "rewards/margins_std": 0.015244710259139538, "rewards/rejected": -0.00411610770970583, "step": 690 }, { "epoch": 0.16, "grad_norm": 0.412109375, "learning_rate": 4.948760779244875e-07, "logits/chosen": -1.4555695056915283, "logits/rejected": -1.1856902837753296, "logps/chosen": -177.55966186523438, "logps/rejected": -267.4813232421875, "loss": 0.6831, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.0156883355230093, "rewards/margins": 0.020723089575767517, "rewards/margins_max": 0.02882271073758602, "rewards/margins_min": 0.012623466551303864, "rewards/margins_std": 0.011454595252871513, "rewards/rejected": -0.005034754052758217, "step": 700 }, { "epoch": 0.16, "grad_norm": 0.388671875, "learning_rate": 4.94471076152541e-07, "logits/chosen": -1.365276575088501, "logits/rejected": -1.0673878192901611, "logps/chosen": -189.3357696533203, "logps/rejected": -177.35968017578125, "loss": 0.6822, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.014079471118748188, "rewards/margins": 0.02116897702217102, "rewards/margins_max": 0.03162091225385666, "rewards/margins_min": 0.010717044584453106, "rewards/margins_std": 0.014781268313527107, "rewards/rejected": -0.007089508231729269, "step": 710 }, { "epoch": 0.16, "grad_norm": 0.2451171875, "learning_rate": 4.940508499653537e-07, "logits/chosen": -1.3948003053665161, "logits/rejected": -1.1609618663787842, "logps/chosen": -237.48367309570312, "logps/rejected": -217.39926147460938, "loss": 0.6821, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.01721605286002159, "rewards/margins": 0.02224144898355007, "rewards/margins_max": 0.03620148077607155, "rewards/margins_min": 0.008281421847641468, "rewards/margins_std": 0.019742459058761597, "rewards/rejected": -0.005025396589189768, "step": 720 }, { "epoch": 0.17, "grad_norm": 0.4921875, "learning_rate": 4.936154255324751e-07, "logits/chosen": -1.3077278137207031, "logits/rejected": -1.0122146606445312, "logps/chosen": -231.55557250976562, "logps/rejected": -208.5164337158203, "loss": 0.6804, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.015759726986289024, "rewards/margins": 0.02600025199353695, "rewards/margins_max": 0.04006841406226158, "rewards/margins_min": 0.011932085268199444, "rewards/margins_std": 0.019895387813448906, "rewards/rejected": -0.010240525007247925, "step": 730 }, { "epoch": 0.17, "grad_norm": 0.333984375, "learning_rate": 4.931648299699244e-07, "logits/chosen": -1.2586504220962524, "logits/rejected": -1.0412757396697998, "logps/chosen": -170.1586456298828, "logps/rejected": -164.20999145507812, "loss": 0.6825, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.010317305102944374, "rewards/margins": 0.015293523669242859, "rewards/margins_max": 0.02413867600262165, "rewards/margins_min": 0.006448371801525354, "rewards/margins_std": 0.01250893622636795, "rewards/rejected": -0.004976219031959772, "step": 740 }, { "epoch": 0.17, "grad_norm": 0.25390625, "learning_rate": 4.926990913385014e-07, "logits/chosen": -1.486669898033142, "logits/rejected": -1.190189242362976, "logps/chosen": -227.45913696289062, "logps/rejected": -229.8372039794922, "loss": 0.6821, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.025707051157951355, "rewards/margins": 0.025935638695955276, "rewards/margins_max": 0.04128013923764229, "rewards/margins_min": 0.01059113722294569, "rewards/margins_std": 0.021700400859117508, "rewards/rejected": -0.00022858443844597787, "step": 750 }, { "epoch": 0.17, "grad_norm": 0.330078125, "learning_rate": 4.922182386420394e-07, "logits/chosen": -1.4005054235458374, "logits/rejected": -1.1406983137130737, "logps/chosen": -199.43179321289062, "logps/rejected": -217.20321655273438, "loss": 0.6808, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.014500722289085388, "rewards/margins": 0.027240172028541565, "rewards/margins_max": 0.03960045427083969, "rewards/margins_min": 0.014879885129630566, "rewards/margins_std": 0.01748008280992508, "rewards/rejected": -0.012739451602101326, "step": 760 }, { "epoch": 0.17, "grad_norm": 0.443359375, "learning_rate": 4.917223018255988e-07, "logits/chosen": -1.3443114757537842, "logits/rejected": -1.1962391138076782, "logps/chosen": -216.9431915283203, "logps/rejected": -217.5789031982422, "loss": 0.6778, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.0227685384452343, "rewards/margins": 0.02805909514427185, "rewards/margins_max": 0.04799731448292732, "rewards/margins_min": 0.008120874874293804, "rewards/margins_std": 0.028196901082992554, "rewards/rejected": -0.0052905576303601265, "step": 770 }, { "epoch": 0.18, "grad_norm": 0.26171875, "learning_rate": 4.912113117736021e-07, "logits/chosen": -1.3593733310699463, "logits/rejected": -1.0461851358413696, "logps/chosen": -215.6609649658203, "logps/rejected": -276.1067810058594, "loss": 0.6778, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.02205914631485939, "rewards/margins": 0.03457336500287056, "rewards/margins_max": 0.05253841355443001, "rewards/margins_min": 0.016608327627182007, "rewards/margins_std": 0.025406410917639732, "rewards/rejected": -0.012514224275946617, "step": 780 }, { "epoch": 0.18, "grad_norm": 0.375, "learning_rate": 4.906853003079108e-07, "logits/chosen": -1.3251748085021973, "logits/rejected": -0.9611700177192688, "logps/chosen": -249.4725799560547, "logps/rejected": -171.49986267089844, "loss": 0.6824, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.014942338690161705, "rewards/margins": 0.022041751071810722, "rewards/margins_max": 0.03632887080311775, "rewards/margins_min": 0.007754630409181118, "rewards/margins_std": 0.020205039530992508, "rewards/rejected": -0.007099410984665155, "step": 790 }, { "epoch": 0.18, "grad_norm": 0.439453125, "learning_rate": 4.901443001858437e-07, "logits/chosen": -1.3746627569198608, "logits/rejected": -1.0284743309020996, "logps/chosen": -199.03555297851562, "logps/rejected": -218.10317993164062, "loss": 0.6763, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.021685587242245674, "rewards/margins": 0.030330291017889977, "rewards/margins_max": 0.04316481575369835, "rewards/margins_min": 0.017495770007371902, "rewards/margins_std": 0.018150756135582924, "rewards/rejected": -0.008644704706966877, "step": 800 }, { "epoch": 0.18, "grad_norm": 0.33203125, "learning_rate": 4.895883450981369e-07, "logits/chosen": -1.2774895429611206, "logits/rejected": -1.1306382417678833, "logps/chosen": -199.5352783203125, "logps/rejected": -192.86988830566406, "loss": 0.6816, "rewards/accuracies": 0.875, "rewards/chosen": 0.013707217760384083, "rewards/margins": 0.026056578382849693, "rewards/margins_max": 0.03815007209777832, "rewards/margins_min": 0.013963082805275917, "rewards/margins_std": 0.017102785408496857, "rewards/rejected": -0.012349361553788185, "step": 810 }, { "epoch": 0.19, "grad_norm": 0.337890625, "learning_rate": 4.890174696668458e-07, "logits/chosen": -1.343576192855835, "logits/rejected": -1.146897554397583, "logps/chosen": -248.36767578125, "logps/rejected": -211.9911346435547, "loss": 0.6818, "rewards/accuracies": 0.875, "rewards/chosen": 0.020799916237592697, "rewards/margins": 0.02256534807384014, "rewards/margins_max": 0.03596457839012146, "rewards/margins_min": 0.009166114963591099, "rewards/margins_std": 0.01894937828183174, "rewards/rejected": -0.0017654303228482604, "step": 820 }, { "epoch": 0.19, "grad_norm": 0.388671875, "learning_rate": 4.884317094431885e-07, "logits/chosen": -1.4735023975372314, "logits/rejected": -1.2554306983947754, "logps/chosen": -195.99563598632812, "logps/rejected": -241.04043579101562, "loss": 0.6798, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.017905376851558685, "rewards/margins": 0.020723778754472733, "rewards/margins_max": 0.03622407838702202, "rewards/margins_min": 0.005223480518907309, "rewards/margins_std": 0.021920733153820038, "rewards/rejected": -0.0028184009715914726, "step": 830 }, { "epoch": 0.19, "grad_norm": 0.279296875, "learning_rate": 4.878311009053327e-07, "logits/chosen": -1.453464150428772, "logits/rejected": -1.2428507804870605, "logps/chosen": -166.3644561767578, "logps/rejected": -181.51950073242188, "loss": 0.6806, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.016648324206471443, "rewards/margins": 0.02127132937312126, "rewards/margins_max": 0.03203215450048447, "rewards/margins_min": 0.010510509833693504, "rewards/margins_std": 0.01521809957921505, "rewards/rejected": -0.004623007960617542, "step": 840 }, { "epoch": 0.19, "grad_norm": 0.349609375, "learning_rate": 4.872156814561235e-07, "logits/chosen": -1.372947335243225, "logits/rejected": -1.055177927017212, "logps/chosen": -226.04931640625, "logps/rejected": -227.8234405517578, "loss": 0.6802, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.016364138573408127, "rewards/margins": 0.02827121689915657, "rewards/margins_max": 0.04151327162981033, "rewards/margins_min": 0.01502915658056736, "rewards/margins_std": 0.018727101385593414, "rewards/rejected": -0.011907076463103294, "step": 850 }, { "epoch": 0.19, "grad_norm": 0.4765625, "learning_rate": 4.865854894207541e-07, "logits/chosen": -1.3590134382247925, "logits/rejected": -1.0492280721664429, "logps/chosen": -294.7002868652344, "logps/rejected": -269.7496032714844, "loss": 0.679, "rewards/accuracies": 0.875, "rewards/chosen": 0.01846398413181305, "rewards/margins": 0.02630179561674595, "rewards/margins_max": 0.04034816473722458, "rewards/margins_min": 0.012255420908331871, "rewards/margins_std": 0.01986457034945488, "rewards/rejected": -0.007837808690965176, "step": 860 }, { "epoch": 0.2, "grad_norm": 0.48828125, "learning_rate": 4.859405640443793e-07, "logits/chosen": -1.463749647140503, "logits/rejected": -1.1530177593231201, "logps/chosen": -241.88876342773438, "logps/rejected": -212.88107299804688, "loss": 0.6775, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.0185990110039711, "rewards/margins": 0.027092251926660538, "rewards/margins_max": 0.04592274874448776, "rewards/margins_min": 0.00826175231486559, "rewards/margins_std": 0.026630345731973648, "rewards/rejected": -0.008493239060044289, "step": 870 }, { "epoch": 0.2, "grad_norm": 0.4140625, "learning_rate": 4.852809454896714e-07, "logits/chosen": -1.3249889612197876, "logits/rejected": -0.9983965158462524, "logps/chosen": -231.47012329101562, "logps/rejected": -232.07177734375, "loss": 0.6766, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.020357009023427963, "rewards/margins": 0.03504853695631027, "rewards/margins_max": 0.04718896746635437, "rewards/margins_min": 0.022908110171556473, "rewards/margins_std": 0.017169155180454254, "rewards/rejected": -0.014691528864204884, "step": 880 }, { "epoch": 0.2, "grad_norm": 0.365234375, "learning_rate": 4.846066748343192e-07, "logits/chosen": -1.4718701839447021, "logits/rejected": -1.3086658716201782, "logps/chosen": -217.58248901367188, "logps/rejected": -218.3019256591797, "loss": 0.6807, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.022301137447357178, "rewards/margins": 0.023601362481713295, "rewards/margins_max": 0.04132290184497833, "rewards/margins_min": 0.005879817996174097, "rewards/margins_std": 0.02506204880774021, "rewards/rejected": -0.0013002243358641863, "step": 890 }, { "epoch": 0.2, "grad_norm": 0.4453125, "learning_rate": 4.839177940684699e-07, "logits/chosen": -1.291446328163147, "logits/rejected": -1.1270654201507568, "logps/chosen": -202.37107849121094, "logps/rejected": -224.5233917236328, "loss": 0.6798, "rewards/accuracies": 0.875, "rewards/chosen": 0.018942687660455704, "rewards/margins": 0.026033837348222733, "rewards/margins_max": 0.04407670348882675, "rewards/margins_min": 0.007990965619683266, "rewards/margins_std": 0.025516469031572342, "rewards/rejected": -0.007091146893799305, "step": 900 }, { "epoch": 0.21, "grad_norm": 0.294921875, "learning_rate": 4.832143460921137e-07, "logits/chosen": -1.5121201276779175, "logits/rejected": -1.2432599067687988, "logps/chosen": -206.224365234375, "logps/rejected": -197.02406311035156, "loss": 0.6795, "rewards/accuracies": 0.875, "rewards/chosen": 0.019623275846242905, "rewards/margins": 0.025538703426718712, "rewards/margins_max": 0.04129987955093384, "rewards/margins_min": 0.009777536615729332, "rewards/margins_std": 0.022289659827947617, "rewards/rejected": -0.005915429908782244, "step": 910 }, { "epoch": 0.21, "grad_norm": 0.474609375, "learning_rate": 4.824963747124131e-07, "logits/chosen": -1.4324082136154175, "logits/rejected": -1.0861629247665405, "logps/chosen": -193.4169921875, "logps/rejected": -184.81045532226562, "loss": 0.6762, "rewards/accuracies": 1.0, "rewards/chosen": 0.021899651736021042, "rewards/margins": 0.03429209813475609, "rewards/margins_max": 0.04998604208230972, "rewards/margins_min": 0.018598156049847603, "rewards/margins_std": 0.02219458669424057, "rewards/rejected": -0.012392444536089897, "step": 920 }, { "epoch": 0.21, "grad_norm": 0.353515625, "learning_rate": 4.817639246409737e-07, "logits/chosen": -1.2531248331069946, "logits/rejected": -1.0572696924209595, "logps/chosen": -174.64602661132812, "logps/rejected": -178.2739715576172, "loss": 0.6808, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.019065044820308685, "rewards/margins": 0.02028615027666092, "rewards/margins_max": 0.030723493546247482, "rewards/margins_min": 0.009848803281784058, "rewards/margins_std": 0.014760637655854225, "rewards/rejected": -0.0012211051071062684, "step": 930 }, { "epoch": 0.21, "grad_norm": 0.51953125, "learning_rate": 4.81017041491061e-07, "logits/chosen": -1.3658699989318848, "logits/rejected": -1.1312452554702759, "logps/chosen": -328.56488037109375, "logps/rejected": -290.95050048828125, "loss": 0.6795, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.022527078166604042, "rewards/margins": 0.029167424887418747, "rewards/margins_max": 0.04566577076911926, "rewards/margins_min": 0.012669073417782784, "rewards/margins_std": 0.02333218976855278, "rewards/rejected": -0.006640346255153418, "step": 940 }, { "epoch": 0.21, "grad_norm": 0.34375, "learning_rate": 4.802557717747587e-07, "logits/chosen": -1.4883503913879395, "logits/rejected": -1.2266268730163574, "logps/chosen": -201.34490966796875, "logps/rejected": -194.04742431640625, "loss": 0.6779, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.02358989045023918, "rewards/margins": 0.0302322618663311, "rewards/margins_max": 0.04986436665058136, "rewards/margins_min": 0.01060016080737114, "rewards/margins_std": 0.027763986960053444, "rewards/rejected": -0.006642372813075781, "step": 950 }, { "epoch": 0.22, "grad_norm": 0.46875, "learning_rate": 4.79480162900073e-07, "logits/chosen": -1.3080474138259888, "logits/rejected": -1.0905568599700928, "logps/chosen": -230.7699737548828, "logps/rejected": -253.7514190673828, "loss": 0.6773, "rewards/accuracies": 0.875, "rewards/chosen": 0.020071830600500107, "rewards/margins": 0.03410849720239639, "rewards/margins_max": 0.054665457457304, "rewards/margins_min": 0.013551535084843636, "rewards/margins_std": 0.029071932658553123, "rewards/rejected": -0.014036668464541435, "step": 960 }, { "epoch": 0.22, "grad_norm": 0.345703125, "learning_rate": 4.7869026316798e-07, "logits/chosen": -1.3408102989196777, "logits/rejected": -1.0580967664718628, "logps/chosen": -239.954833984375, "logps/rejected": -267.51068115234375, "loss": 0.6729, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.02294796332716942, "rewards/margins": 0.04744488745927811, "rewards/margins_max": 0.0710405632853508, "rewards/margins_min": 0.023849209770560265, "rewards/margins_std": 0.03336932882666588, "rewards/rejected": -0.02449692226946354, "step": 970 }, { "epoch": 0.22, "grad_norm": 0.369140625, "learning_rate": 4.778861217694174e-07, "logits/chosen": -1.266989827156067, "logits/rejected": -1.0530259609222412, "logps/chosen": -171.4249725341797, "logps/rejected": -190.92129516601562, "loss": 0.6759, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.02101651392877102, "rewards/margins": 0.029414648190140724, "rewards/margins_max": 0.04411066323518753, "rewards/margins_min": 0.01471862755715847, "rewards/margins_std": 0.020783307030797005, "rewards/rejected": -0.008398131467401981, "step": 980 }, { "epoch": 0.22, "grad_norm": 0.279296875, "learning_rate": 4.770677887822217e-07, "logits/chosen": -1.3193893432617188, "logits/rejected": -1.0859930515289307, "logps/chosen": -200.52943420410156, "logps/rejected": -215.36544799804688, "loss": 0.6751, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.024466924369335175, "rewards/margins": 0.034502509981393814, "rewards/margins_max": 0.05769481137394905, "rewards/margins_min": 0.011310202069580555, "rewards/margins_std": 0.032798875123262405, "rewards/rejected": -0.01003558561205864, "step": 990 }, { "epoch": 0.23, "grad_norm": 0.546875, "learning_rate": 4.7623531516800907e-07, "logits/chosen": -1.2966734170913696, "logits/rejected": -1.0474090576171875, "logps/chosen": -237.970947265625, "logps/rejected": -179.91912841796875, "loss": 0.6775, "rewards/accuracies": 0.875, "rewards/chosen": 0.024659987539052963, "rewards/margins": 0.02979280985891819, "rewards/margins_max": 0.044087667018175125, "rewards/margins_min": 0.015497950837016106, "rewards/margins_std": 0.02021598257124424, "rewards/rejected": -0.0051328218542039394, "step": 1000 }, { "epoch": 0.23, "grad_norm": 0.390625, "learning_rate": 4.753887527690026e-07, "logits/chosen": -1.3687721490859985, "logits/rejected": -1.0742595195770264, "logps/chosen": -222.78317260742188, "logps/rejected": -212.2716827392578, "loss": 0.6775, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.022007988765835762, "rewards/margins": 0.03305414319038391, "rewards/margins_max": 0.04547839239239693, "rewards/margins_min": 0.020629890263080597, "rewards/margins_std": 0.01757054589688778, "rewards/rejected": -0.011046156287193298, "step": 1010 }, { "epoch": 0.23, "grad_norm": 0.373046875, "learning_rate": 4.745281543048027e-07, "logits/chosen": -1.4337875843048096, "logits/rejected": -1.1076844930648804, "logps/chosen": -229.92776489257812, "logps/rejected": -248.87930297851562, "loss": 0.6752, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.02592829242348671, "rewards/margins": 0.04496189206838608, "rewards/margins_max": 0.062208838760852814, "rewards/margins_min": 0.02771494910120964, "rewards/margins_std": 0.02439085766673088, "rewards/rejected": -0.01903359964489937, "step": 1020 }, { "epoch": 0.23, "grad_norm": 0.3984375, "learning_rate": 4.736535733691047e-07, "logits/chosen": -1.4199892282485962, "logits/rejected": -1.1356946229934692, "logps/chosen": -207.2522430419922, "logps/rejected": -198.83889770507812, "loss": 0.6761, "rewards/accuracies": 0.875, "rewards/chosen": 0.021651072427630424, "rewards/margins": 0.029218804091215134, "rewards/margins_max": 0.04629982262849808, "rewards/margins_min": 0.012137781828641891, "rewards/margins_std": 0.02415620908141136, "rewards/rejected": -0.0075677321292459965, "step": 1030 }, { "epoch": 0.24, "grad_norm": 0.396484375, "learning_rate": 4.7276506442636124e-07, "logits/chosen": -1.4461133480072021, "logits/rejected": -1.1284468173980713, "logps/chosen": -249.48037719726562, "logps/rejected": -189.9345245361328, "loss": 0.677, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.024505896493792534, "rewards/margins": 0.033815208822488785, "rewards/margins_max": 0.04826827347278595, "rewards/margins_min": 0.019362136721611023, "rewards/margins_std": 0.020439723506569862, "rewards/rejected": -0.009309305809438229, "step": 1040 }, { "epoch": 0.24, "grad_norm": 0.38671875, "learning_rate": 4.718626828083901e-07, "logits/chosen": -1.3048738241195679, "logits/rejected": -1.0318455696105957, "logps/chosen": -246.67788696289062, "logps/rejected": -189.42269897460938, "loss": 0.6739, "rewards/accuracies": 0.875, "rewards/chosen": 0.020917896181344986, "rewards/margins": 0.03381497040390968, "rewards/margins_max": 0.0529189296066761, "rewards/margins_min": 0.014711007475852966, "rewards/margins_std": 0.027017081156373024, "rewards/rejected": -0.012897074222564697, "step": 1050 }, { "epoch": 0.24, "grad_norm": 0.3984375, "learning_rate": 4.709464847109291e-07, "logits/chosen": -1.3040274381637573, "logits/rejected": -1.0351606607437134, "logps/chosen": -190.476318359375, "logps/rejected": -199.35769653320312, "loss": 0.6794, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.02514139749109745, "rewards/margins": 0.03171490877866745, "rewards/margins_max": 0.048779942095279694, "rewards/margins_min": 0.014649872668087482, "rewards/margins_std": 0.024133604019880295, "rewards/rejected": -0.0065735094249248505, "step": 1060 }, { "epoch": 0.24, "grad_norm": 0.357421875, "learning_rate": 4.7001652719013605e-07, "logits/chosen": -1.4631807804107666, "logits/rejected": -1.2612019777297974, "logps/chosen": -195.13742065429688, "logps/rejected": -227.14114379882812, "loss": 0.6751, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.017182352021336555, "rewards/margins": 0.03211602568626404, "rewards/margins_max": 0.05497432500123978, "rewards/margins_min": 0.00925772450864315, "rewards/margins_std": 0.03232651576399803, "rewards/rejected": -0.014933672733604908, "step": 1070 }, { "epoch": 0.24, "grad_norm": 0.5234375, "learning_rate": 4.6907286815903534e-07, "logits/chosen": -1.359162449836731, "logits/rejected": -1.0865987539291382, "logps/chosen": -206.14501953125, "logps/rejected": -195.1103515625, "loss": 0.6764, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.017108319327235222, "rewards/margins": 0.03591880574822426, "rewards/margins_max": 0.05570870637893677, "rewards/margins_min": 0.016128908842802048, "rewards/margins_std": 0.02798713743686676, "rewards/rejected": -0.018810484558343887, "step": 1080 }, { "epoch": 0.25, "grad_norm": 0.42578125, "learning_rate": 4.681155663839121e-07, "logits/chosen": -1.3436006307601929, "logits/rejected": -1.09580659866333, "logps/chosen": -205.31448364257812, "logps/rejected": -212.5472412109375, "loss": 0.6741, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.02455434389412403, "rewards/margins": 0.04684922844171524, "rewards/margins_max": 0.06827957183122635, "rewards/margins_min": 0.025418881326913834, "rewards/margins_std": 0.030307084321975708, "rewards/rejected": -0.02229488268494606, "step": 1090 }, { "epoch": 0.25, "grad_norm": 0.361328125, "learning_rate": 4.671446814806521e-07, "logits/chosen": -1.4599530696868896, "logits/rejected": -1.3884165287017822, "logps/chosen": -210.5607147216797, "logps/rejected": -242.82540893554688, "loss": 0.6714, "rewards/accuracies": 0.875, "rewards/chosen": 0.02126205340027809, "rewards/margins": 0.040613122284412384, "rewards/margins_max": 0.06381477415561676, "rewards/margins_min": 0.017411479726433754, "rewards/margins_std": 0.03281208127737045, "rewards/rejected": -0.019351070746779442, "step": 1100 }, { "epoch": 0.25, "grad_norm": 0.361328125, "learning_rate": 4.66160273911029e-07, "logits/chosen": -1.3703720569610596, "logits/rejected": -1.1701464653015137, "logps/chosen": -183.18475341796875, "logps/rejected": -199.93508911132812, "loss": 0.6737, "rewards/accuracies": 1.0, "rewards/chosen": 0.03071439266204834, "rewards/margins": 0.0536029115319252, "rewards/margins_max": 0.07884959131479263, "rewards/margins_min": 0.028356235474348068, "rewards/margins_std": 0.03570418804883957, "rewards/rejected": -0.022888517007231712, "step": 1110 }, { "epoch": 0.25, "grad_norm": 0.408203125, "learning_rate": 4.651624049789397e-07, "logits/chosen": -1.5386936664581299, "logits/rejected": -1.3589353561401367, "logps/chosen": -189.0791778564453, "logps/rejected": -204.80850219726562, "loss": 0.6778, "rewards/accuracies": 0.875, "rewards/chosen": 0.02617635391652584, "rewards/margins": 0.03845269978046417, "rewards/margins_max": 0.0595969632267952, "rewards/margins_min": 0.017308443784713745, "rewards/margins_std": 0.029902497306466103, "rewards/rejected": -0.012276348657906055, "step": 1120 }, { "epoch": 0.26, "grad_norm": 0.435546875, "learning_rate": 4.64151136826586e-07, "logits/chosen": -1.4097120761871338, "logits/rejected": -1.1443564891815186, "logps/chosen": -221.24960327148438, "logps/rejected": -220.4223175048828, "loss": 0.6741, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.026502933353185654, "rewards/margins": 0.040553100407123566, "rewards/margins_max": 0.055532462894916534, "rewards/margins_min": 0.025573736056685448, "rewards/margins_std": 0.02118402160704136, "rewards/rejected": -0.014050167985260487, "step": 1130 }, { "epoch": 0.26, "grad_norm": 0.34375, "learning_rate": 4.631265324306053e-07, "logits/chosen": -1.3383985757827759, "logits/rejected": -1.179694652557373, "logps/chosen": -178.04922485351562, "logps/rejected": -221.2380828857422, "loss": 0.6741, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.02292972058057785, "rewards/margins": 0.03944787755608559, "rewards/margins_max": 0.06199193745851517, "rewards/margins_min": 0.01690381020307541, "rewards/margins_std": 0.03188212215900421, "rewards/rejected": -0.01651815138757229, "step": 1140 }, { "epoch": 0.26, "grad_norm": 0.408203125, "learning_rate": 4.6208865559814795e-07, "logits/chosen": -1.224730134010315, "logits/rejected": -1.0031412839889526, "logps/chosen": -225.22817993164062, "logps/rejected": -246.8232421875, "loss": 0.6759, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.022863784804940224, "rewards/margins": 0.033729564398527145, "rewards/margins_max": 0.05401785299181938, "rewards/margins_min": 0.013441281393170357, "rewards/margins_std": 0.028691967949271202, "rewards/rejected": -0.01086578331887722, "step": 1150 }, { "epoch": 0.26, "grad_norm": 0.47265625, "learning_rate": 4.610375709629047e-07, "logits/chosen": -1.3221080303192139, "logits/rejected": -1.0841983556747437, "logps/chosen": -199.57630920410156, "logps/rejected": -244.9528350830078, "loss": 0.672, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.027617940679192543, "rewards/margins": 0.04445469751954079, "rewards/margins_max": 0.0699603408575058, "rewards/margins_min": 0.018949061632156372, "rewards/margins_std": 0.03607042506337166, "rewards/rejected": -0.01683676615357399, "step": 1160 }, { "epoch": 0.26, "grad_norm": 0.39453125, "learning_rate": 4.5997334398108064e-07, "logits/chosen": -1.380094289779663, "logits/rejected": -1.0472290515899658, "logps/chosen": -229.5024871826172, "logps/rejected": -222.79287719726562, "loss": 0.6742, "rewards/accuracies": 0.875, "rewards/chosen": 0.027185887098312378, "rewards/margins": 0.036655206233263016, "rewards/margins_max": 0.0568247064948082, "rewards/margins_min": 0.016485709697008133, "rewards/margins_std": 0.02852398157119751, "rewards/rejected": -0.009469323791563511, "step": 1170 }, { "epoch": 0.27, "grad_norm": 0.2578125, "learning_rate": 4.5889604092731954e-07, "logits/chosen": -1.5819199085235596, "logits/rejected": -1.360912561416626, "logps/chosen": -153.7646026611328, "logps/rejected": -182.9700164794922, "loss": 0.6755, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.02463090792298317, "rewards/margins": 0.03991164639592171, "rewards/margins_max": 0.06291045248508453, "rewards/margins_min": 0.016912829130887985, "rewards/margins_std": 0.032525233924388885, "rewards/rejected": -0.015280733816325665, "step": 1180 }, { "epoch": 0.27, "grad_norm": 0.48046875, "learning_rate": 4.578057288905765e-07, "logits/chosen": -1.406576156616211, "logits/rejected": -1.0120534896850586, "logps/chosen": -206.16256713867188, "logps/rejected": -236.18276977539062, "loss": 0.6736, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.023824552074074745, "rewards/margins": 0.03779350593686104, "rewards/margins_max": 0.05464478209614754, "rewards/margins_min": 0.02094222977757454, "rewards/margins_std": 0.02383130043745041, "rewards/rejected": -0.013968953862786293, "step": 1190 }, { "epoch": 0.27, "grad_norm": 0.447265625, "learning_rate": 4.567024757699398e-07, "logits/chosen": -1.3428590297698975, "logits/rejected": -1.1314809322357178, "logps/chosen": -171.41761779785156, "logps/rejected": -193.2316131591797, "loss": 0.6775, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.02352306619286537, "rewards/margins": 0.03855372592806816, "rewards/margins_max": 0.05891672894358635, "rewards/margins_min": 0.018190719187259674, "rewards/margins_std": 0.028797641396522522, "rewards/rejected": -0.015030661597847939, "step": 1200 }, { "epoch": 0.27, "grad_norm": 0.416015625, "learning_rate": 4.555863502704026e-07, "logits/chosen": -1.3257324695587158, "logits/rejected": -1.017549753189087, "logps/chosen": -277.9920349121094, "logps/rejected": -204.20068359375, "loss": 0.6725, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.029865305870771408, "rewards/margins": 0.04546823725104332, "rewards/margins_max": 0.067599818110466, "rewards/margins_min": 0.023336660116910934, "rewards/margins_std": 0.03129877895116806, "rewards/rejected": -0.015602931380271912, "step": 1210 }, { "epoch": 0.28, "grad_norm": 0.458984375, "learning_rate": 4.544574218985844e-07, "logits/chosen": -1.3070051670074463, "logits/rejected": -1.027001142501831, "logps/chosen": -238.04275512695312, "logps/rejected": -211.5153045654297, "loss": 0.6731, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.031848080456256866, "rewards/margins": 0.03921237960457802, "rewards/margins_max": 0.0675017461180687, "rewards/margins_min": 0.010923011228442192, "rewards/margins_std": 0.04000721126794815, "rewards/rejected": -0.007364300079643726, "step": 1220 }, { "epoch": 0.28, "grad_norm": 0.330078125, "learning_rate": 4.533157609584025e-07, "logits/chosen": -1.5297473669052124, "logits/rejected": -1.2360965013504028, "logps/chosen": -245.1328582763672, "logps/rejected": -284.0274353027344, "loss": 0.6707, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.02966652438044548, "rewards/margins": 0.03678882494568825, "rewards/margins_max": 0.05802678316831589, "rewards/margins_min": 0.015550869517028332, "rewards/margins_std": 0.030035007745027542, "rewards/rejected": -0.007122299168258905, "step": 1230 }, { "epoch": 0.28, "grad_norm": 0.287109375, "learning_rate": 4.521614385466938e-07, "logits/chosen": -1.2434746026992798, "logits/rejected": -0.9775724411010742, "logps/chosen": -167.60195922851562, "logps/rejected": -160.4337158203125, "loss": 0.6751, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.027229581028223038, "rewards/margins": 0.04429193213582039, "rewards/margins_max": 0.06861467659473419, "rewards/margins_min": 0.019969182088971138, "rewards/margins_std": 0.034397564828395844, "rewards/rejected": -0.017062349244952202, "step": 1240 }, { "epoch": 0.28, "grad_norm": 0.35546875, "learning_rate": 4.50994526548787e-07, "logits/chosen": -1.4568400382995605, "logits/rejected": -1.1636699438095093, "logps/chosen": -277.090576171875, "logps/rejected": -241.61618041992188, "loss": 0.6747, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.02653464674949646, "rewards/margins": 0.042523398995399475, "rewards/margins_max": 0.06366567313671112, "rewards/margins_min": 0.021381134167313576, "rewards/margins_std": 0.029899677261710167, "rewards/rejected": -0.015988752245903015, "step": 1250 }, { "epoch": 0.28, "grad_norm": 0.47265625, "learning_rate": 4.498150976340266e-07, "logits/chosen": -1.289681077003479, "logits/rejected": -1.1193758249282837, "logps/chosen": -190.8822021484375, "logps/rejected": -197.9546356201172, "loss": 0.6754, "rewards/accuracies": 0.875, "rewards/chosen": 0.019678102806210518, "rewards/margins": 0.040147751569747925, "rewards/margins_max": 0.06217692047357559, "rewards/margins_min": 0.018118582665920258, "rewards/margins_std": 0.031153947114944458, "rewards/rejected": -0.020469646900892258, "step": 1260 }, { "epoch": 0.29, "grad_norm": 0.3359375, "learning_rate": 4.4862322525124676e-07, "logits/chosen": -1.4055033922195435, "logits/rejected": -1.1639750003814697, "logps/chosen": -182.3616943359375, "logps/rejected": -207.5341033935547, "loss": 0.6735, "rewards/accuracies": 0.875, "rewards/chosen": 0.030965592712163925, "rewards/margins": 0.0414421483874321, "rewards/margins_max": 0.06650801748037338, "rewards/margins_min": 0.016376283019781113, "rewards/margins_std": 0.03544849157333374, "rewards/rejected": -0.010476559400558472, "step": 1270 }, { "epoch": 0.29, "grad_norm": 0.458984375, "learning_rate": 4.474189836241976e-07, "logits/chosen": -1.3990777730941772, "logits/rejected": -1.0768229961395264, "logps/chosen": -271.35107421875, "logps/rejected": -200.7308349609375, "loss": 0.6718, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.025321820750832558, "rewards/margins": 0.05165884643793106, "rewards/margins_max": 0.07472828775644302, "rewards/margins_min": 0.028589408844709396, "rewards/margins_std": 0.032625116407871246, "rewards/rejected": -0.026337021961808205, "step": 1280 }, { "epoch": 0.29, "grad_norm": 0.384765625, "learning_rate": 4.4620244774692296e-07, "logits/chosen": -1.4728714227676392, "logits/rejected": -1.2694370746612549, "logps/chosen": -195.16342163085938, "logps/rejected": -178.6471405029297, "loss": 0.6743, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.024754667654633522, "rewards/margins": 0.03827555477619171, "rewards/margins_max": 0.05812322348356247, "rewards/margins_min": 0.018427889794111252, "rewards/margins_std": 0.02806883677840233, "rewards/rejected": -0.013520888984203339, "step": 1290 }, { "epoch": 0.29, "grad_norm": 0.3125, "learning_rate": 4.4497369337908986e-07, "logits/chosen": -1.3485519886016846, "logits/rejected": -1.1609489917755127, "logps/chosen": -242.7335968017578, "logps/rejected": -244.931640625, "loss": 0.6742, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.027875151485204697, "rewards/margins": 0.03989388421177864, "rewards/margins_max": 0.06232045218348503, "rewards/margins_min": 0.017467325553297997, "rewards/margins_std": 0.03171594813466072, "rewards/rejected": -0.012018732726573944, "step": 1300 }, { "epoch": 0.3, "grad_norm": 0.2890625, "learning_rate": 4.437327970412709e-07, "logits/chosen": -1.4367246627807617, "logits/rejected": -1.09050714969635, "logps/chosen": -230.6901397705078, "logps/rejected": -198.34022521972656, "loss": 0.673, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.027688348665833473, "rewards/margins": 0.04100477322936058, "rewards/margins_max": 0.06766901910305023, "rewards/margins_min": 0.014340527355670929, "rewards/margins_std": 0.037708934396505356, "rewards/rejected": -0.013316420838236809, "step": 1310 }, { "epoch": 0.3, "grad_norm": 0.51953125, "learning_rate": 4.424798360101788e-07, "logits/chosen": -1.3804186582565308, "logits/rejected": -1.271663784980774, "logps/chosen": -185.00445556640625, "logps/rejected": -192.37820434570312, "loss": 0.6714, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.02708057127892971, "rewards/margins": 0.04213610291481018, "rewards/margins_max": 0.059732962399721146, "rewards/margins_min": 0.024539247155189514, "rewards/margins_std": 0.024885715916752815, "rewards/rejected": -0.01505553163588047, "step": 1320 }, { "epoch": 0.3, "grad_norm": 0.361328125, "learning_rate": 4.41214888313854e-07, "logits/chosen": -1.3674253225326538, "logits/rejected": -1.2365589141845703, "logps/chosen": -196.21676635742188, "logps/rejected": -256.8332214355469, "loss": 0.6732, "rewards/accuracies": 0.875, "rewards/chosen": 0.02884775400161743, "rewards/margins": 0.03929910063743591, "rewards/margins_max": 0.06681930273771286, "rewards/margins_min": 0.011778893880546093, "rewards/margins_std": 0.038919445127248764, "rewards/rejected": -0.010451346635818481, "step": 1330 }, { "epoch": 0.3, "grad_norm": 0.50390625, "learning_rate": 4.3993803272680553e-07, "logits/chosen": -1.363981008529663, "logits/rejected": -1.0240668058395386, "logps/chosen": -252.509033203125, "logps/rejected": -234.2544403076172, "loss": 0.6714, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.031312428414821625, "rewards/margins": 0.05178850144147873, "rewards/margins_max": 0.08234371989965439, "rewards/margins_min": 0.02123328484594822, "rewards/margins_std": 0.043211597949266434, "rewards/rejected": -0.020476069301366806, "step": 1340 }, { "epoch": 0.31, "grad_norm": 0.396484375, "learning_rate": 4.386493487651051e-07, "logits/chosen": -1.4066898822784424, "logits/rejected": -1.2005523443222046, "logps/chosen": -204.56298828125, "logps/rejected": -221.59033203125, "loss": 0.6722, "rewards/accuracies": 0.875, "rewards/chosen": 0.026341985911130905, "rewards/margins": 0.03979109227657318, "rewards/margins_max": 0.0626225396990776, "rewards/margins_min": 0.016959641128778458, "rewards/margins_std": 0.03228854760527611, "rewards/rejected": -0.013449104502797127, "step": 1350 }, { "epoch": 0.31, "grad_norm": 0.306640625, "learning_rate": 4.373489166814358e-07, "logits/chosen": -1.417218804359436, "logits/rejected": -1.0997976064682007, "logps/chosen": -241.2794189453125, "logps/rejected": -208.86245727539062, "loss": 0.6752, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.023142900317907333, "rewards/margins": 0.0397111177444458, "rewards/margins_max": 0.059913743287324905, "rewards/margins_min": 0.019508492201566696, "rewards/margins_std": 0.02857082709670067, "rewards/rejected": -0.01656820997595787, "step": 1360 }, { "epoch": 0.31, "grad_norm": 0.4453125, "learning_rate": 4.360368174600937e-07, "logits/chosen": -1.39288330078125, "logits/rejected": -1.104473352432251, "logps/chosen": -232.40640258789062, "logps/rejected": -160.45449829101562, "loss": 0.6746, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.025461440905928612, "rewards/margins": 0.036584287881851196, "rewards/margins_max": 0.05490097403526306, "rewards/margins_min": 0.018267596140503883, "rewards/margins_std": 0.02590371109545231, "rewards/rejected": -0.01112284604460001, "step": 1370 }, { "epoch": 0.31, "grad_norm": 0.431640625, "learning_rate": 4.34713132811945e-07, "logits/chosen": -1.3218541145324707, "logits/rejected": -1.1150844097137451, "logps/chosen": -207.88314819335938, "logps/rejected": -225.9438018798828, "loss": 0.6722, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.029938017949461937, "rewards/margins": 0.04521246999502182, "rewards/margins_max": 0.06895864754915237, "rewards/margins_min": 0.021466294303536415, "rewards/margins_std": 0.033582162111997604, "rewards/rejected": -0.015274452045559883, "step": 1380 }, { "epoch": 0.31, "grad_norm": 0.4296875, "learning_rate": 4.333779451693372e-07, "logits/chosen": -1.3209631443023682, "logits/rejected": -1.0849366188049316, "logps/chosen": -196.41134643554688, "logps/rejected": -191.35043334960938, "loss": 0.6748, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.023151496425271034, "rewards/margins": 0.039687685668468475, "rewards/margins_max": 0.060737740248441696, "rewards/margins_min": 0.018637629225850105, "rewards/margins_std": 0.029769275337457657, "rewards/rejected": -0.01653619296848774, "step": 1390 }, { "epoch": 0.32, "grad_norm": 0.365234375, "learning_rate": 4.32031337680966e-07, "logits/chosen": -1.3408453464508057, "logits/rejected": -1.014647126197815, "logps/chosen": -177.572998046875, "logps/rejected": -179.5858612060547, "loss": 0.6726, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.021111497655510902, "rewards/margins": 0.04325110465288162, "rewards/margins_max": 0.06425291299819946, "rewards/margins_min": 0.022249290719628334, "rewards/margins_std": 0.029701050370931625, "rewards/rejected": -0.02213960886001587, "step": 1400 }, { "epoch": 0.32, "grad_norm": 0.322265625, "learning_rate": 4.306733942066969e-07, "logits/chosen": -1.2504961490631104, "logits/rejected": -1.0722445249557495, "logps/chosen": -207.9407196044922, "logps/rejected": -184.21514892578125, "loss": 0.6754, "rewards/accuracies": 0.875, "rewards/chosen": 0.020327100530266762, "rewards/margins": 0.03269373998045921, "rewards/margins_max": 0.05034772679209709, "rewards/margins_min": 0.01503975223749876, "rewards/margins_std": 0.024966508150100708, "rewards/rejected": -0.012366642244160175, "step": 1410 }, { "epoch": 0.32, "grad_norm": 0.37890625, "learning_rate": 4.29304199312343e-07, "logits/chosen": -1.3490780591964722, "logits/rejected": -1.0001791715621948, "logps/chosen": -253.0603790283203, "logps/rejected": -220.7419891357422, "loss": 0.6723, "rewards/accuracies": 0.875, "rewards/chosen": 0.023180747404694557, "rewards/margins": 0.034993939101696014, "rewards/margins_max": 0.05462303012609482, "rewards/margins_min": 0.015364840626716614, "rewards/margins_std": 0.02775973081588745, "rewards/rejected": -0.011813190765678883, "step": 1420 }, { "epoch": 0.32, "grad_norm": 0.41796875, "learning_rate": 4.279238382643984e-07, "logits/chosen": -1.3278534412384033, "logits/rejected": -1.0635864734649658, "logps/chosen": -212.005615234375, "logps/rejected": -251.8597869873047, "loss": 0.6719, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.025677820667624474, "rewards/margins": 0.04707244411110878, "rewards/margins_max": 0.07336616516113281, "rewards/margins_min": 0.020778721198439598, "rewards/margins_std": 0.037184938788414, "rewards/rejected": -0.021394621580839157, "step": 1430 }, { "epoch": 0.33, "grad_norm": 0.5078125, "learning_rate": 4.26532397024729e-07, "logits/chosen": -1.331188440322876, "logits/rejected": -1.1710078716278076, "logps/chosen": -202.5389404296875, "logps/rejected": -222.7393798828125, "loss": 0.6734, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.036635275930166245, "rewards/margins": 0.03808824345469475, "rewards/margins_max": 0.05488138273358345, "rewards/margins_min": 0.021295102313160896, "rewards/margins_std": 0.023749085143208504, "rewards/rejected": -0.001452968455851078, "step": 1440 }, { "epoch": 0.33, "grad_norm": 0.359375, "learning_rate": 4.251299622452179e-07, "logits/chosen": -1.3149991035461426, "logits/rejected": -1.2217159271240234, "logps/chosen": -144.6352996826172, "logps/rejected": -194.59603881835938, "loss": 0.6721, "rewards/accuracies": 0.875, "rewards/chosen": 0.02758004143834114, "rewards/margins": 0.03688063472509384, "rewards/margins_max": 0.05518157035112381, "rewards/margins_min": 0.018579700961709023, "rewards/margins_std": 0.025881433859467506, "rewards/rejected": -0.009300598874688148, "step": 1450 }, { "epoch": 0.33, "grad_norm": 0.318359375, "learning_rate": 4.2371662126237074e-07, "logits/chosen": -1.2196018695831299, "logits/rejected": -1.113595724105835, "logps/chosen": -187.85397338867188, "logps/rejected": -217.3521728515625, "loss": 0.6728, "rewards/accuracies": 0.875, "rewards/chosen": 0.020274577662348747, "rewards/margins": 0.04040878638625145, "rewards/margins_max": 0.06457223743200302, "rewards/margins_min": 0.016245335340499878, "rewards/margins_std": 0.03417228162288666, "rewards/rejected": -0.02013421058654785, "step": 1460 }, { "epoch": 0.33, "grad_norm": 0.453125, "learning_rate": 4.222924620918755e-07, "logits/chosen": -1.317690134048462, "logits/rejected": -1.0494765043258667, "logps/chosen": -182.76686096191406, "logps/rejected": -199.16482543945312, "loss": 0.673, "rewards/accuracies": 0.875, "rewards/chosen": 0.032194025814533234, "rewards/margins": 0.035994671285152435, "rewards/margins_max": 0.05865035578608513, "rewards/margins_min": 0.013338984921574593, "rewards/margins_std": 0.03203997761011124, "rewards/rejected": -0.0038006496615707874, "step": 1470 }, { "epoch": 0.33, "grad_norm": 0.50390625, "learning_rate": 4.2085757342312203e-07, "logits/chosen": -1.37278151512146, "logits/rejected": -1.064212679862976, "logps/chosen": -313.87835693359375, "logps/rejected": -252.13623046875, "loss": 0.6708, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.031817976385354996, "rewards/margins": 0.048167884349823, "rewards/margins_max": 0.07686305046081543, "rewards/margins_min": 0.019472714513540268, "rewards/margins_std": 0.04058109596371651, "rewards/rejected": -0.016349902376532555, "step": 1480 }, { "epoch": 0.34, "grad_norm": 0.40234375, "learning_rate": 4.1941204461367873e-07, "logits/chosen": -1.3766664266586304, "logits/rejected": -1.1429855823516846, "logps/chosen": -245.5808563232422, "logps/rejected": -208.631591796875, "loss": 0.6725, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.02773849107325077, "rewards/margins": 0.03844064846634865, "rewards/margins_max": 0.06190527603030205, "rewards/margins_min": 0.0149760153144598, "rewards/margins_std": 0.0331839993596077, "rewards/rejected": -0.010702153667807579, "step": 1490 }, { "epoch": 0.34, "grad_norm": 0.376953125, "learning_rate": 4.1795596568372795e-07, "logits/chosen": -1.3350478410720825, "logits/rejected": -1.0876274108886719, "logps/chosen": -270.3304138183594, "logps/rejected": -191.8568115234375, "loss": 0.671, "rewards/accuracies": 0.875, "rewards/chosen": 0.027609974145889282, "rewards/margins": 0.04803454130887985, "rewards/margins_max": 0.07594247162342072, "rewards/margins_min": 0.02012660540640354, "rewards/margins_std": 0.03946777805685997, "rewards/rejected": -0.02042456530034542, "step": 1500 }, { "epoch": 0.34, "grad_norm": 0.37890625, "learning_rate": 4.1648942731045984e-07, "logits/chosen": -1.494276762008667, "logits/rejected": -1.0351579189300537, "logps/chosen": -225.27645874023438, "logps/rejected": -193.90029907226562, "loss": 0.67, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.03104591928422451, "rewards/margins": 0.04760845750570297, "rewards/margins_max": 0.07213185727596283, "rewards/margins_min": 0.023085057735443115, "rewards/margins_std": 0.034681327641010284, "rewards/rejected": -0.016562536358833313, "step": 1510 }, { "epoch": 0.34, "grad_norm": 0.31640625, "learning_rate": 4.1501252082242536e-07, "logits/chosen": -1.4287065267562866, "logits/rejected": -1.3087886571884155, "logps/chosen": -167.2257843017578, "logps/rejected": -184.9642333984375, "loss": 0.6733, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.02862406335771084, "rewards/margins": 0.03685583546757698, "rewards/margins_max": 0.05723712965846062, "rewards/margins_min": 0.016474535688757896, "rewards/margins_std": 0.028823506087064743, "rewards/rejected": -0.008231772109866142, "step": 1520 }, { "epoch": 0.35, "grad_norm": 0.34765625, "learning_rate": 4.1352533819384916e-07, "logits/chosen": -1.3988134860992432, "logits/rejected": -1.0942933559417725, "logps/chosen": -203.42141723632812, "logps/rejected": -211.5314178466797, "loss": 0.6701, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.023281631991267204, "rewards/margins": 0.043478261679410934, "rewards/margins_max": 0.0694703757762909, "rewards/margins_min": 0.017486149445176125, "rewards/margins_std": 0.03675839677453041, "rewards/rejected": -0.02019662782549858, "step": 1530 }, { "epoch": 0.35, "grad_norm": 0.419921875, "learning_rate": 4.120279720389014e-07, "logits/chosen": -1.2943384647369385, "logits/rejected": -1.0621731281280518, "logps/chosen": -198.1253204345703, "logps/rejected": -172.11181640625, "loss": 0.6728, "rewards/accuracies": 0.875, "rewards/chosen": 0.03131099045276642, "rewards/margins": 0.034897904843091965, "rewards/margins_max": 0.059048790484666824, "rewards/margins_min": 0.01074702013283968, "rewards/margins_std": 0.0341545045375824, "rewards/rejected": -0.003586915787309408, "step": 1540 }, { "epoch": 0.35, "grad_norm": 0.380859375, "learning_rate": 4.1052051560593065e-07, "logits/chosen": -1.3193204402923584, "logits/rejected": -1.0119553804397583, "logps/chosen": -211.03005981445312, "logps/rejected": -193.88609313964844, "loss": 0.671, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.023582275956869125, "rewards/margins": 0.044007860124111176, "rewards/margins_max": 0.06155252456665039, "rewards/margins_min": 0.026463191956281662, "rewards/margins_std": 0.024811910465359688, "rewards/rejected": -0.0204255860298872, "step": 1550 }, { "epoch": 0.35, "grad_norm": 0.515625, "learning_rate": 4.0900306277165666e-07, "logits/chosen": -1.2885864973068237, "logits/rejected": -1.061097502708435, "logps/chosen": -202.5950164794922, "logps/rejected": -194.2788848876953, "loss": 0.6711, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.0246761254966259, "rewards/margins": 0.03755421191453934, "rewards/margins_max": 0.05342765524983406, "rewards/margins_min": 0.021680768579244614, "rewards/margins_std": 0.02244843915104866, "rewards/rejected": -0.012878087349236012, "step": 1560 }, { "epoch": 0.35, "grad_norm": 0.462890625, "learning_rate": 4.0747570803532407e-07, "logits/chosen": -1.2883301973342896, "logits/rejected": -1.0276634693145752, "logps/chosen": -267.29791259765625, "logps/rejected": -222.98080444335938, "loss": 0.6727, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.021301481872797012, "rewards/margins": 0.040718015283346176, "rewards/margins_max": 0.059512365609407425, "rewards/margins_min": 0.021923670545220375, "rewards/margins_std": 0.026579225435853004, "rewards/rejected": -0.019416535273194313, "step": 1570 }, { "epoch": 0.36, "grad_norm": 0.36328125, "learning_rate": 4.059385465128178e-07, "logits/chosen": -1.2792766094207764, "logits/rejected": -1.0782058238983154, "logps/chosen": -183.18736267089844, "logps/rejected": -193.03024291992188, "loss": 0.6694, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.02231001853942871, "rewards/margins": 0.035427968949079514, "rewards/margins_max": 0.057831812649965286, "rewards/margins_min": 0.013024131767451763, "rewards/margins_std": 0.03168381005525589, "rewards/rejected": -0.013117952272295952, "step": 1580 }, { "epoch": 0.36, "grad_norm": 0.36328125, "learning_rate": 4.043916739307394e-07, "logits/chosen": -1.3757706880569458, "logits/rejected": -1.0660990476608276, "logps/chosen": -192.93234252929688, "logps/rejected": -202.1962890625, "loss": 0.6675, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03153764829039574, "rewards/margins": 0.055783580988645554, "rewards/margins_max": 0.0908677726984024, "rewards/margins_min": 0.020699385553598404, "rewards/margins_std": 0.04961654543876648, "rewards/rejected": -0.024245930835604668, "step": 1590 }, { "epoch": 0.36, "grad_norm": 0.46484375, "learning_rate": 4.0283518662044595e-07, "logits/chosen": -1.4345725774765015, "logits/rejected": -1.1135655641555786, "logps/chosen": -259.4965515136719, "logps/rejected": -224.1306915283203, "loss": 0.6715, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.02794257365167141, "rewards/margins": 0.03813885152339935, "rewards/margins_max": 0.05748515576124191, "rewards/margins_min": 0.01879255101084709, "rewards/margins_std": 0.02735980786383152, "rewards/rejected": -0.010196278803050518, "step": 1600 }, { "epoch": 0.36, "grad_norm": 0.42578125, "learning_rate": 4.012691815120508e-07, "logits/chosen": -1.5209296941757202, "logits/rejected": -1.0353261232376099, "logps/chosen": -216.0679931640625, "logps/rejected": -189.17874145507812, "loss": 0.6696, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.043269019573926926, "rewards/margins": 0.05212603881955147, "rewards/margins_max": 0.07273893058300018, "rewards/margins_min": 0.03151315450668335, "rewards/margins_std": 0.02915102243423462, "rewards/rejected": -0.00885702483355999, "step": 1610 }, { "epoch": 0.37, "grad_norm": 0.51953125, "learning_rate": 3.996937561283873e-07, "logits/chosen": -1.568519115447998, "logits/rejected": -0.9833946228027344, "logps/chosen": -265.3883056640625, "logps/rejected": -200.9881134033203, "loss": 0.6709, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.025037512183189392, "rewards/margins": 0.04677094891667366, "rewards/margins_max": 0.08109462261199951, "rewards/margins_min": 0.01244727335870266, "rewards/margins_std": 0.04854099825024605, "rewards/rejected": -0.021733436733484268, "step": 1620 }, { "epoch": 0.37, "grad_norm": 0.451171875, "learning_rate": 3.981090085789358e-07, "logits/chosen": -1.216138243675232, "logits/rejected": -1.0891082286834717, "logps/chosen": -274.11761474609375, "logps/rejected": -265.92626953125, "loss": 0.6756, "rewards/accuracies": 0.875, "rewards/chosen": 0.025747573003172874, "rewards/margins": 0.03598998114466667, "rewards/margins_max": 0.060326360166072845, "rewards/margins_min": 0.011653609573841095, "rewards/margins_std": 0.03441683202981949, "rewards/rejected": -0.010242411866784096, "step": 1630 }, { "epoch": 0.37, "grad_norm": 0.53125, "learning_rate": 3.965150375537137e-07, "logits/chosen": -1.4957590103149414, "logits/rejected": -1.1819343566894531, "logps/chosen": -210.2582244873047, "logps/rejected": -211.7596893310547, "loss": 0.6687, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.029509777203202248, "rewards/margins": 0.04957672208547592, "rewards/margins_max": 0.06985175609588623, "rewards/margins_min": 0.02930169366300106, "rewards/margins_std": 0.028673222288489342, "rewards/rejected": -0.020066948607563972, "step": 1640 }, { "epoch": 0.37, "grad_norm": 0.3984375, "learning_rate": 3.949119423171294e-07, "logits/chosen": -1.420458197593689, "logits/rejected": -1.2591960430145264, "logps/chosen": -172.73870849609375, "logps/rejected": -178.8004608154297, "loss": 0.6683, "rewards/accuracies": 0.875, "rewards/chosen": 0.029484177008271217, "rewards/margins": 0.04394937679171562, "rewards/margins_max": 0.06709001213312149, "rewards/margins_min": 0.020808745175600052, "rewards/margins_std": 0.032725803554058075, "rewards/rejected": -0.014465202577412128, "step": 1650 }, { "epoch": 0.38, "grad_norm": 0.35546875, "learning_rate": 3.9329982270180083e-07, "logits/chosen": -1.2149207592010498, "logits/rejected": -1.0226820707321167, "logps/chosen": -196.73216247558594, "logps/rejected": -183.21646118164062, "loss": 0.6685, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.01719057932496071, "rewards/margins": 0.044677041471004486, "rewards/margins_max": 0.0677308589220047, "rewards/margins_min": 0.02162322774529457, "rewards/margins_std": 0.03260301426053047, "rewards/rejected": -0.027486462146043777, "step": 1660 }, { "epoch": 0.38, "grad_norm": 0.40625, "learning_rate": 3.916787791023386e-07, "logits/chosen": -1.4802358150482178, "logits/rejected": -1.118082880973816, "logps/chosen": -208.3599853515625, "logps/rejected": -180.79049682617188, "loss": 0.6719, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.033904947340488434, "rewards/margins": 0.037811677902936935, "rewards/margins_max": 0.06144796684384346, "rewards/margins_min": 0.014175387099385262, "rewards/margins_std": 0.033426761627197266, "rewards/rejected": -0.00390672916546464, "step": 1670 }, { "epoch": 0.38, "grad_norm": 0.33203125, "learning_rate": 3.900489124690932e-07, "logits/chosen": -1.4334280490875244, "logits/rejected": -1.1188874244689941, "logps/chosen": -201.74215698242188, "logps/rejected": -270.0927429199219, "loss": 0.672, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.031066913157701492, "rewards/margins": 0.04503883793950081, "rewards/margins_max": 0.06810437142848969, "rewards/margins_min": 0.021973304450511932, "rewards/margins_std": 0.03261958807706833, "rewards/rejected": -0.013971921987831593, "step": 1680 }, { "epoch": 0.38, "grad_norm": 0.484375, "learning_rate": 3.884103243018693e-07, "logits/chosen": -1.2650120258331299, "logits/rejected": -1.0881750583648682, "logps/chosen": -220.5270538330078, "logps/rejected": -243.7740478515625, "loss": 0.6711, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.026399720460176468, "rewards/margins": 0.038081325590610504, "rewards/margins_max": 0.05310334637761116, "rewards/margins_min": 0.0230593029409647, "rewards/margins_std": 0.02124434895813465, "rewards/rejected": -0.011681604199111462, "step": 1690 }, { "epoch": 0.38, "grad_norm": 0.48828125, "learning_rate": 3.867631166436037e-07, "logits/chosen": -1.2718234062194824, "logits/rejected": -1.0437819957733154, "logps/chosen": -221.5988006591797, "logps/rejected": -192.76193237304688, "loss": 0.6703, "rewards/accuracies": 0.875, "rewards/chosen": 0.025970160961151123, "rewards/margins": 0.041418932378292084, "rewards/margins_max": 0.07021013647317886, "rewards/margins_min": 0.012627726420760155, "rewards/margins_std": 0.04071691259741783, "rewards/rejected": -0.015448769554495811, "step": 1700 }, { "epoch": 0.39, "grad_norm": 0.4375, "learning_rate": 3.85107392074012e-07, "logits/chosen": -1.441463589668274, "logits/rejected": -1.232027292251587, "logps/chosen": -226.7698211669922, "logps/rejected": -177.8507537841797, "loss": 0.6722, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.021718554198741913, "rewards/margins": 0.04068540036678314, "rewards/margins_max": 0.060767918825149536, "rewards/margins_min": 0.020602887496352196, "rewards/margins_std": 0.02840096689760685, "rewards/rejected": -0.01896684803068638, "step": 1710 }, { "epoch": 0.39, "grad_norm": 0.29296875, "learning_rate": 3.834432537031991e-07, "logits/chosen": -1.3713477849960327, "logits/rejected": -0.9427247047424316, "logps/chosen": -288.84698486328125, "logps/rejected": -220.6392364501953, "loss": 0.6723, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.02653350867331028, "rewards/margins": 0.04643214866518974, "rewards/margins_max": 0.06892819702625275, "rewards/margins_min": 0.02393609844148159, "rewards/margins_std": 0.031814225018024445, "rewards/rejected": -0.019898641854524612, "step": 1720 }, { "epoch": 0.39, "grad_norm": 0.345703125, "learning_rate": 3.817708051652392e-07, "logits/chosen": -1.3596107959747314, "logits/rejected": -1.081020712852478, "logps/chosen": -213.62637329101562, "logps/rejected": -196.31497192382812, "loss": 0.6724, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.02571287751197815, "rewards/margins": 0.03983969986438751, "rewards/margins_max": 0.06263546645641327, "rewards/margins_min": 0.017043929547071457, "rewards/margins_std": 0.032238081097602844, "rewards/rejected": -0.014126819558441639, "step": 1730 }, { "epoch": 0.39, "grad_norm": 0.453125, "learning_rate": 3.800901506117209e-07, "logits/chosen": -1.2425081729888916, "logits/rejected": -1.0295426845550537, "logps/chosen": -245.2377471923828, "logps/rejected": -214.0257568359375, "loss": 0.6683, "rewards/accuracies": 0.875, "rewards/chosen": 0.03704250231385231, "rewards/margins": 0.05573665350675583, "rewards/margins_max": 0.08607505261898041, "rewards/margins_min": 0.025398259982466698, "rewards/margins_std": 0.04290497303009033, "rewards/rejected": -0.018694154918193817, "step": 1740 }, { "epoch": 0.4, "grad_norm": 0.322265625, "learning_rate": 3.784013947052621e-07, "logits/chosen": -1.402919888496399, "logits/rejected": -1.1005027294158936, "logps/chosen": -266.9942321777344, "logps/rejected": -205.5738525390625, "loss": 0.6721, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.03442750871181488, "rewards/margins": 0.04329963028430939, "rewards/margins_max": 0.07404103130102158, "rewards/margins_min": 0.012558224610984325, "rewards/margins_std": 0.04347491264343262, "rewards/rejected": -0.008872120641171932, "step": 1750 }, { "epoch": 0.4, "grad_norm": 0.431640625, "learning_rate": 3.7670464261299164e-07, "logits/chosen": -1.2231419086456299, "logits/rejected": -1.0529447793960571, "logps/chosen": -203.1646728515625, "logps/rejected": -224.58779907226562, "loss": 0.6683, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.030151110142469406, "rewards/margins": 0.0533854141831398, "rewards/margins_max": 0.08413845300674438, "rewards/margins_min": 0.022632379084825516, "rewards/margins_std": 0.043491363525390625, "rewards/rejected": -0.023234302178025246, "step": 1760 }, { "epoch": 0.4, "grad_norm": 0.330078125, "learning_rate": 3.75e-07, "logits/chosen": -1.5182336568832397, "logits/rejected": -1.184272289276123, "logps/chosen": -195.92198181152344, "logps/rejected": -205.4150390625, "loss": 0.6739, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.01970260962843895, "rewards/margins": 0.04444552958011627, "rewards/margins_max": 0.0759601816534996, "rewards/margins_min": 0.01293087750673294, "rewards/margins_std": 0.04456844925880432, "rewards/rejected": -0.02474292181432247, "step": 1770 }, { "epoch": 0.4, "grad_norm": 0.376953125, "learning_rate": 3.732875730227594e-07, "logits/chosen": -1.6119180917739868, "logits/rejected": -1.338212013244629, "logps/chosen": -184.89749145507812, "logps/rejected": -195.16012573242188, "loss": 0.665, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03413277491927147, "rewards/margins": 0.04780154302716255, "rewards/margins_max": 0.06999184191226959, "rewards/margins_min": 0.02561124786734581, "rewards/margins_std": 0.031381815671920776, "rewards/rejected": -0.013668762519955635, "step": 1780 }, { "epoch": 0.4, "grad_norm": 0.43359375, "learning_rate": 3.715674683225126e-07, "logits/chosen": -1.4385994672775269, "logits/rejected": -1.1190850734710693, "logps/chosen": -252.67294311523438, "logps/rejected": -200.71493530273438, "loss": 0.6692, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.0406574010848999, "rewards/margins": 0.05437915399670601, "rewards/margins_max": 0.08271868526935577, "rewards/margins_min": 0.02603963017463684, "rewards/margins_std": 0.040078144520521164, "rewards/rejected": -0.013721758499741554, "step": 1790 }, { "epoch": 0.41, "grad_norm": 0.4453125, "learning_rate": 3.698397930186318e-07, "logits/chosen": -1.420417070388794, "logits/rejected": -1.1576616764068604, "logps/chosen": -205.3006591796875, "logps/rejected": -199.37765502929688, "loss": 0.6698, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.02717829868197441, "rewards/margins": 0.04901193082332611, "rewards/margins_max": 0.0719413310289383, "rewards/margins_min": 0.026082530617713928, "rewards/margins_std": 0.032427072525024414, "rewards/rejected": -0.0218336321413517, "step": 1800 }, { "epoch": 0.41, "grad_norm": 0.47265625, "learning_rate": 3.681046547019479e-07, "logits/chosen": -1.4996373653411865, "logits/rejected": -1.2397348880767822, "logps/chosen": -214.6435546875, "logps/rejected": -191.0612030029297, "loss": 0.6728, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.027710596099495888, "rewards/margins": 0.041693609207868576, "rewards/margins_max": 0.06465200334787369, "rewards/margins_min": 0.01873522251844406, "rewards/margins_std": 0.03246805816888809, "rewards/rejected": -0.01398300938308239, "step": 1810 }, { "epoch": 0.41, "grad_norm": 0.375, "learning_rate": 3.6636216142805044e-07, "logits/chosen": -1.3628098964691162, "logits/rejected": -1.1438556909561157, "logps/chosen": -196.2094268798828, "logps/rejected": -203.31918334960938, "loss": 0.6685, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.0251650158315897, "rewards/margins": 0.04904549568891525, "rewards/margins_max": 0.07514000684022903, "rewards/margins_min": 0.022950977087020874, "rewards/margins_std": 0.03690321743488312, "rewards/rejected": -0.023880477994680405, "step": 1820 }, { "epoch": 0.41, "grad_norm": 0.34375, "learning_rate": 3.646124217105582e-07, "logits/chosen": -1.4042203426361084, "logits/rejected": -1.1199661493301392, "logps/chosen": -212.60061645507812, "logps/rejected": -173.20904541015625, "loss": 0.671, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.026237869635224342, "rewards/margins": 0.04421783238649368, "rewards/margins_max": 0.06703042984008789, "rewards/margins_min": 0.021405242383480072, "rewards/margins_std": 0.03226187080144882, "rewards/rejected": -0.01797996461391449, "step": 1830 }, { "epoch": 0.42, "grad_norm": 0.388671875, "learning_rate": 3.6285554451436144e-07, "logits/chosen": -1.2761818170547485, "logits/rejected": -1.074405550956726, "logps/chosen": -213.8831024169922, "logps/rejected": -231.5514678955078, "loss": 0.671, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03510306030511856, "rewards/margins": 0.04712874814867973, "rewards/margins_max": 0.07571456581354141, "rewards/margins_min": 0.018542934209108353, "rewards/margins_std": 0.040426451712846756, "rewards/rejected": -0.012025688774883747, "step": 1840 }, { "epoch": 0.42, "grad_norm": 0.36328125, "learning_rate": 3.610916392488366e-07, "logits/chosen": -1.3223119974136353, "logits/rejected": -1.0760774612426758, "logps/chosen": -207.56497192382812, "logps/rejected": -201.53770446777344, "loss": 0.6719, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.02942877635359764, "rewards/margins": 0.042011525481939316, "rewards/margins_max": 0.07225628942251205, "rewards/margins_min": 0.011766768991947174, "rewards/margins_std": 0.04277254641056061, "rewards/rejected": -0.012582749128341675, "step": 1850 }, { "epoch": 0.42, "grad_norm": 0.412109375, "learning_rate": 3.593208157610323e-07, "logits/chosen": -1.4547159671783447, "logits/rejected": -1.2126966714859009, "logps/chosen": -201.00279235839844, "logps/rejected": -195.8961181640625, "loss": 0.6702, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.024298720061779022, "rewards/margins": 0.04018304497003555, "rewards/margins_max": 0.061934977769851685, "rewards/margins_min": 0.018431108444929123, "rewards/margins_std": 0.030761878937482834, "rewards/rejected": -0.01588432490825653, "step": 1860 }, { "epoch": 0.42, "grad_norm": 0.5390625, "learning_rate": 3.57543184328829e-07, "logits/chosen": -1.2944796085357666, "logits/rejected": -1.0369160175323486, "logps/chosen": -185.6531982421875, "logps/rejected": -211.05422973632812, "loss": 0.6703, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.030656468123197556, "rewards/margins": 0.05110560730099678, "rewards/margins_max": 0.08564882725477219, "rewards/margins_min": 0.016562385484576225, "rewards/margins_std": 0.04885149374604225, "rewards/rejected": -0.020449137315154076, "step": 1870 }, { "epoch": 0.42, "grad_norm": 0.451171875, "learning_rate": 3.5575885565407115e-07, "logits/chosen": -1.3851807117462158, "logits/rejected": -1.1306685209274292, "logps/chosen": -180.79251098632812, "logps/rejected": -173.75868225097656, "loss": 0.6717, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.02727353572845459, "rewards/margins": 0.044656019657850266, "rewards/margins_max": 0.07077351957559586, "rewards/margins_min": 0.018538516014814377, "rewards/margins_std": 0.036935724318027496, "rewards/rejected": -0.017382482066750526, "step": 1880 }, { "epoch": 0.43, "grad_norm": 0.474609375, "learning_rate": 3.5396794085567367e-07, "logits/chosen": -1.45248544216156, "logits/rejected": -1.0879206657409668, "logps/chosen": -253.71142578125, "logps/rejected": -229.73867797851562, "loss": 0.6693, "rewards/accuracies": 0.875, "rewards/chosen": 0.035827480256557465, "rewards/margins": 0.052608538419008255, "rewards/margins_max": 0.08579106628894806, "rewards/margins_min": 0.019426017999649048, "rewards/margins_std": 0.04692717269062996, "rewards/rejected": -0.01678105816245079, "step": 1890 }, { "epoch": 0.43, "grad_norm": 0.37109375, "learning_rate": 3.5217055146270143e-07, "logits/chosen": -1.329620599746704, "logits/rejected": -1.0890090465545654, "logps/chosen": -255.60830688476562, "logps/rejected": -210.82730102539062, "loss": 0.6719, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.025220388546586037, "rewards/margins": 0.03942624479532242, "rewards/margins_max": 0.07290971279144287, "rewards/margins_min": 0.005942771676927805, "rewards/margins_std": 0.047352783381938934, "rewards/rejected": -0.01420585811138153, "step": 1900 }, { "epoch": 0.43, "grad_norm": 0.392578125, "learning_rate": 3.5036679940742435e-07, "logits/chosen": -1.3408721685409546, "logits/rejected": -0.9262669682502747, "logps/chosen": -193.88815307617188, "logps/rejected": -189.97848510742188, "loss": 0.6682, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.030053604394197464, "rewards/margins": 0.05334904044866562, "rewards/margins_max": 0.07732351869344711, "rewards/margins_min": 0.029374558478593826, "rewards/margins_std": 0.033905040472745895, "rewards/rejected": -0.023295434191823006, "step": 1910 }, { "epoch": 0.43, "grad_norm": 0.341796875, "learning_rate": 3.4855679701834654e-07, "logits/chosen": -1.4233258962631226, "logits/rejected": -1.088521957397461, "logps/chosen": -248.71029663085938, "logps/rejected": -218.1599578857422, "loss": 0.6699, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03422839939594269, "rewards/margins": 0.05857861042022705, "rewards/margins_max": 0.0850900262594223, "rewards/margins_min": 0.032067202031612396, "rewards/margins_std": 0.037492796778678894, "rewards/rejected": -0.024350211024284363, "step": 1920 }, { "epoch": 0.44, "grad_norm": 0.392578125, "learning_rate": 3.4674065701321117e-07, "logits/chosen": -1.2159172296524048, "logits/rejected": -0.8571739196777344, "logps/chosen": -239.7020263671875, "logps/rejected": -207.25259399414062, "loss": 0.6702, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.024594713002443314, "rewards/margins": 0.055344462394714355, "rewards/margins_max": 0.09462883323431015, "rewards/margins_min": 0.01606009155511856, "rewards/margins_std": 0.05555649474263191, "rewards/rejected": -0.03074975311756134, "step": 1930 }, { "epoch": 0.44, "grad_norm": 0.35546875, "learning_rate": 3.449184924919807e-07, "logits/chosen": -1.4591569900512695, "logits/rejected": -1.1621897220611572, "logps/chosen": -176.26205444335938, "logps/rejected": -183.91500854492188, "loss": 0.6679, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.033842552453279495, "rewards/margins": 0.055226556956768036, "rewards/margins_max": 0.08132033050060272, "rewards/margins_min": 0.029132787138223648, "rewards/margins_std": 0.03690216317772865, "rewards/rejected": -0.02138400450348854, "step": 1940 }, { "epoch": 0.44, "grad_norm": 0.41015625, "learning_rate": 3.4309041692979406e-07, "logits/chosen": -1.3285919427871704, "logits/rejected": -1.0929481983184814, "logps/chosen": -185.72769165039062, "logps/rejected": -201.81222534179688, "loss": 0.6688, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03052700124680996, "rewards/margins": 0.05209646373987198, "rewards/margins_max": 0.08264229446649551, "rewards/margins_min": 0.021550629287958145, "rewards/margins_std": 0.043198324739933014, "rewards/rejected": -0.02156945690512657, "step": 1950 }, { "epoch": 0.44, "grad_norm": 0.310546875, "learning_rate": 3.412565441698997e-07, "logits/chosen": -1.6500459909439087, "logits/rejected": -1.299719214439392, "logps/chosen": -199.05360412597656, "logps/rejected": -210.8216552734375, "loss": 0.6732, "rewards/accuracies": 0.875, "rewards/chosen": 0.026170510798692703, "rewards/margins": 0.043105438351631165, "rewards/margins_max": 0.05894836038351059, "rewards/margins_min": 0.02726251445710659, "rewards/margins_std": 0.022405285388231277, "rewards/rejected": -0.01693493127822876, "step": 1960 }, { "epoch": 0.45, "grad_norm": 0.3671875, "learning_rate": 3.394169884165659e-07, "logits/chosen": -1.2527066469192505, "logits/rejected": -1.006007432937622, "logps/chosen": -227.0983123779297, "logps/rejected": -237.21005249023438, "loss": 0.6668, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03402932733297348, "rewards/margins": 0.0556698814034462, "rewards/margins_max": 0.08698919415473938, "rewards/margins_min": 0.024350563064217567, "rewards/margins_std": 0.044292204082012177, "rewards/rejected": -0.021640557795763016, "step": 1970 }, { "epoch": 0.45, "grad_norm": 0.5703125, "learning_rate": 3.3757186422796913e-07, "logits/chosen": -1.5391623973846436, "logits/rejected": -1.2645601034164429, "logps/chosen": -223.9857177734375, "logps/rejected": -290.79302978515625, "loss": 0.6677, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.026502113789319992, "rewards/margins": 0.048506900668144226, "rewards/margins_max": 0.07111908495426178, "rewards/margins_min": 0.025894710794091225, "rewards/margins_std": 0.03197846934199333, "rewards/rejected": -0.022004786878824234, "step": 1980 }, { "epoch": 0.45, "grad_norm": 0.30078125, "learning_rate": 3.357212865090594e-07, "logits/chosen": -1.4560575485229492, "logits/rejected": -1.1469371318817139, "logps/chosen": -217.7390594482422, "logps/rejected": -191.54116821289062, "loss": 0.6712, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03316511958837509, "rewards/margins": 0.040575869381427765, "rewards/margins_max": 0.059667717665433884, "rewards/margins_min": 0.02148401364684105, "rewards/margins_std": 0.026999955996870995, "rewards/rejected": -0.007410746067762375, "step": 1990 }, { "epoch": 0.45, "grad_norm": 0.38671875, "learning_rate": 3.3386537050440505e-07, "logits/chosen": -1.230985164642334, "logits/rejected": -1.074690341949463, "logps/chosen": -153.95193481445312, "logps/rejected": -248.7484893798828, "loss": 0.672, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03268904983997345, "rewards/margins": 0.04694094508886337, "rewards/margins_max": 0.07272408902645111, "rewards/margins_min": 0.021157797425985336, "rewards/margins_std": 0.03646288067102432, "rewards/rejected": -0.014251895248889923, "step": 2000 }, { "epoch": 0.45, "grad_norm": 0.337890625, "learning_rate": 3.3200423179101564e-07, "logits/chosen": -1.4215552806854248, "logits/rejected": -1.1187208890914917, "logps/chosen": -242.44088745117188, "logps/rejected": -258.2455749511719, "loss": 0.6729, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.02623608708381653, "rewards/margins": 0.04028294235467911, "rewards/margins_max": 0.059119291603565216, "rewards/margins_min": 0.0214465893805027, "rewards/margins_std": 0.02663862146437168, "rewards/rejected": -0.014046849682927132, "step": 2010 }, { "epoch": 0.46, "grad_norm": 0.408203125, "learning_rate": 3.3013798627114453e-07, "logits/chosen": -1.4693307876586914, "logits/rejected": -1.2081705331802368, "logps/chosen": -199.54930114746094, "logps/rejected": -206.67330932617188, "loss": 0.6706, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.029323678463697433, "rewards/margins": 0.04873298853635788, "rewards/margins_max": 0.07738453149795532, "rewards/margins_min": 0.020081443712115288, "rewards/margins_std": 0.04051940143108368, "rewards/rejected": -0.019409308210015297, "step": 2020 }, { "epoch": 0.46, "grad_norm": 0.44921875, "learning_rate": 3.2826675016507087e-07, "logits/chosen": -1.324035406112671, "logits/rejected": -1.1425608396530151, "logps/chosen": -170.9380340576172, "logps/rejected": -183.2605743408203, "loss": 0.6703, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.026225930079817772, "rewards/margins": 0.04071018844842911, "rewards/margins_max": 0.06031109765172005, "rewards/margins_min": 0.02110927924513817, "rewards/margins_std": 0.027719873934984207, "rewards/rejected": -0.014484262093901634, "step": 2030 }, { "epoch": 0.46, "grad_norm": 0.400390625, "learning_rate": 3.263906400038623e-07, "logits/chosen": -1.4063230752944946, "logits/rejected": -1.1503441333770752, "logps/chosen": -225.02719116210938, "logps/rejected": -250.1376190185547, "loss": 0.6707, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.031333766877651215, "rewards/margins": 0.04533548653125763, "rewards/margins_max": 0.06751718372106552, "rewards/margins_min": 0.023153791204094887, "rewards/margins_std": 0.0313696563243866, "rewards/rejected": -0.014001714065670967, "step": 2040 }, { "epoch": 0.46, "grad_norm": 0.546875, "learning_rate": 3.2450977262211765e-07, "logits/chosen": -1.3487093448638916, "logits/rejected": -1.1558421850204468, "logps/chosen": -273.28955078125, "logps/rejected": -219.2425079345703, "loss": 0.6724, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.02075299061834812, "rewards/margins": 0.04373856633901596, "rewards/margins_max": 0.06467003375291824, "rewards/margins_min": 0.022807098925113678, "rewards/margins_std": 0.02960156463086605, "rewards/rejected": -0.02298557385802269, "step": 2050 }, { "epoch": 0.47, "grad_norm": 0.4609375, "learning_rate": 3.226242651506914e-07, "logits/chosen": -1.4015865325927734, "logits/rejected": -1.2727991342544556, "logps/chosen": -220.09597778320312, "logps/rejected": -235.02456665039062, "loss": 0.6735, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.03195624798536301, "rewards/margins": 0.037409860640764236, "rewards/margins_max": 0.06553162634372711, "rewards/margins_min": 0.009288092143833637, "rewards/margins_std": 0.03977018594741821, "rewards/rejected": -0.0054536135867238045, "step": 2060 }, { "epoch": 0.47, "grad_norm": 0.43359375, "learning_rate": 3.207342350093992e-07, "logits/chosen": -1.4042125940322876, "logits/rejected": -1.0990450382232666, "logps/chosen": -261.49554443359375, "logps/rejected": -226.1964874267578, "loss": 0.6676, "rewards/accuracies": 0.875, "rewards/chosen": 0.03966151177883148, "rewards/margins": 0.051730163395404816, "rewards/margins_max": 0.07370400428771973, "rewards/margins_min": 0.029756318777799606, "rewards/margins_std": 0.03107570670545101, "rewards/rejected": -0.012068650685250759, "step": 2070 }, { "epoch": 0.47, "grad_norm": 0.40234375, "learning_rate": 3.1883979989970556e-07, "logits/chosen": -1.3634693622589111, "logits/rejected": -1.1507164239883423, "logps/chosen": -200.532470703125, "logps/rejected": -223.18521118164062, "loss": 0.6746, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03149800002574921, "rewards/margins": 0.04740763083100319, "rewards/margins_max": 0.07129405438899994, "rewards/margins_min": 0.023521197959780693, "rewards/margins_std": 0.03378051519393921, "rewards/rejected": -0.015909628942608833, "step": 2080 }, { "epoch": 0.47, "grad_norm": 0.36328125, "learning_rate": 3.1694107779739387e-07, "logits/chosen": -1.3686000108718872, "logits/rejected": -1.1741145849227905, "logps/chosen": -177.2476348876953, "logps/rejected": -201.48651123046875, "loss": 0.6722, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.022783290594816208, "rewards/margins": 0.036474697291851044, "rewards/margins_max": 0.0596294105052948, "rewards/margins_min": 0.013319991528987885, "rewards/margins_std": 0.03274570032954216, "rewards/rejected": -0.01369140762835741, "step": 2090 }, { "epoch": 0.47, "grad_norm": 0.26953125, "learning_rate": 3.1503818694521987e-07, "logits/chosen": -1.2748286724090576, "logits/rejected": -1.0838059186935425, "logps/chosen": -167.02737426757812, "logps/rejected": -171.05117797851562, "loss": 0.6706, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.026711028069257736, "rewards/margins": 0.04484615474939346, "rewards/margins_max": 0.06936424225568771, "rewards/margins_min": 0.02032807096838951, "rewards/margins_std": 0.03467380255460739, "rewards/rejected": -0.018135128542780876, "step": 2100 }, { "epoch": 0.48, "grad_norm": 0.369140625, "learning_rate": 3.131312458455477e-07, "logits/chosen": -1.319719910621643, "logits/rejected": -1.1327495574951172, "logps/chosen": -190.13870239257812, "logps/rejected": -202.6477508544922, "loss": 0.6689, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.030982110649347305, "rewards/margins": 0.04906720668077469, "rewards/margins_max": 0.0723622590303421, "rewards/margins_min": 0.02577214315533638, "rewards/margins_std": 0.03294419124722481, "rewards/rejected": -0.018085090443491936, "step": 2110 }, { "epoch": 0.48, "grad_norm": 0.443359375, "learning_rate": 3.1122037325297023e-07, "logits/chosen": -1.4756828546524048, "logits/rejected": -1.0510185956954956, "logps/chosen": -244.9542694091797, "logps/rejected": -187.69895935058594, "loss": 0.671, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.03502636030316353, "rewards/margins": 0.04812877997756004, "rewards/margins_max": 0.07826215028762817, "rewards/margins_min": 0.017995405942201614, "rewards/margins_std": 0.04261502996087074, "rewards/rejected": -0.013102421537041664, "step": 2120 }, { "epoch": 0.48, "grad_norm": 0.388671875, "learning_rate": 3.0930568816691386e-07, "logits/chosen": -1.2691621780395508, "logits/rejected": -1.0920307636260986, "logps/chosen": -204.21749877929688, "logps/rejected": -163.44583129882812, "loss": 0.6686, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.02684844098985195, "rewards/margins": 0.04703948646783829, "rewards/margins_max": 0.0725255236029625, "rewards/margins_min": 0.02155345305800438, "rewards/margins_std": 0.036042697727680206, "rewards/rejected": -0.020191045477986336, "step": 2130 }, { "epoch": 0.48, "grad_norm": 0.43359375, "learning_rate": 3.073873098242278e-07, "logits/chosen": -1.375516653060913, "logits/rejected": -1.0434143543243408, "logps/chosen": -211.1145477294922, "logps/rejected": -183.14756774902344, "loss": 0.6643, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.026955414563417435, "rewards/margins": 0.05509645864367485, "rewards/margins_max": 0.08361810445785522, "rewards/margins_min": 0.026574820280075073, "rewards/margins_std": 0.04033569246530533, "rewards/rejected": -0.028141042217612267, "step": 2140 }, { "epoch": 0.49, "grad_norm": 0.51171875, "learning_rate": 3.054653576917581e-07, "logits/chosen": -1.3683044910430908, "logits/rejected": -1.1811391115188599, "logps/chosen": -209.4580841064453, "logps/rejected": -221.3155975341797, "loss": 0.6672, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.038317613303661346, "rewards/margins": 0.05631747841835022, "rewards/margins_max": 0.08396416157484055, "rewards/margins_min": 0.028670784085989, "rewards/margins_std": 0.03909832984209061, "rewards/rejected": -0.017999857664108276, "step": 2150 }, { "epoch": 0.49, "grad_norm": 0.357421875, "learning_rate": 3.0353995145890864e-07, "logits/chosen": -1.4687901735305786, "logits/rejected": -1.128990888595581, "logps/chosen": -220.0728759765625, "logps/rejected": -215.1951904296875, "loss": 0.6684, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.023461733013391495, "rewards/margins": 0.05785750225186348, "rewards/margins_max": 0.0840335339307785, "rewards/margins_min": 0.03168146312236786, "rewards/margins_std": 0.03701850771903992, "rewards/rejected": -0.034395769238471985, "step": 2160 }, { "epoch": 0.49, "grad_norm": 0.59765625, "learning_rate": 3.01611211030187e-07, "logits/chosen": -1.3273531198501587, "logits/rejected": -1.0275139808654785, "logps/chosen": -344.6279296875, "logps/rejected": -221.2284698486328, "loss": 0.6679, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.026945358142256737, "rewards/margins": 0.04964347928762436, "rewards/margins_max": 0.07278752326965332, "rewards/margins_min": 0.026499425992369652, "rewards/margins_std": 0.03273063153028488, "rewards/rejected": -0.022698121145367622, "step": 2170 }, { "epoch": 0.49, "grad_norm": 0.328125, "learning_rate": 2.996792565177374e-07, "logits/chosen": -1.4312021732330322, "logits/rejected": -1.1072601079940796, "logps/chosen": -197.82289123535156, "logps/rejected": -207.2589111328125, "loss": 0.6685, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.0390249639749527, "rewards/margins": 0.060548871755599976, "rewards/margins_max": 0.09206276386976242, "rewards/margins_min": 0.029034990817308426, "rewards/margins_std": 0.04456736519932747, "rewards/rejected": -0.021523915231227875, "step": 2180 }, { "epoch": 0.5, "grad_norm": 0.365234375, "learning_rate": 2.9774420823386096e-07, "logits/chosen": -1.4531341791152954, "logits/rejected": -1.223080039024353, "logps/chosen": -199.8817596435547, "logps/rejected": -177.36080932617188, "loss": 0.6722, "rewards/accuracies": 0.875, "rewards/chosen": 0.027678947895765305, "rewards/margins": 0.03983256220817566, "rewards/margins_max": 0.06585127860307693, "rewards/margins_min": 0.013813835568726063, "rewards/margins_std": 0.03679602965712547, "rewards/rejected": -0.012153607793152332, "step": 2190 }, { "epoch": 0.5, "grad_norm": 0.4375, "learning_rate": 2.9580618668352317e-07, "logits/chosen": -1.4132052659988403, "logits/rejected": -1.0988004207611084, "logps/chosen": -238.1175994873047, "logps/rejected": -208.7352752685547, "loss": 0.6721, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.026243817061185837, "rewards/margins": 0.0455835685133934, "rewards/margins_max": 0.06686623394489288, "rewards/margins_min": 0.024300891906023026, "rewards/margins_std": 0.03009824827313423, "rewards/rejected": -0.019339745864272118, "step": 2200 }, { "epoch": 0.5, "grad_norm": 0.30078125, "learning_rate": 2.9386531255684937e-07, "logits/chosen": -1.3268234729766846, "logits/rejected": -1.0986872911453247, "logps/chosen": -239.9281463623047, "logps/rejected": -202.4623260498047, "loss": 0.6714, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.0318286269903183, "rewards/margins": 0.046062078326940536, "rewards/margins_max": 0.07249831408262253, "rewards/margins_min": 0.01962583139538765, "rewards/margins_std": 0.037386488169431686, "rewards/rejected": -0.01423344761133194, "step": 2210 }, { "epoch": 0.5, "grad_norm": 0.4765625, "learning_rate": 2.919217067216089e-07, "logits/chosen": -1.2868921756744385, "logits/rejected": -1.0992939472198486, "logps/chosen": -194.62716674804688, "logps/rejected": -202.38162231445312, "loss": 0.6688, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.027968263253569603, "rewards/margins": 0.05166172236204147, "rewards/margins_max": 0.07927562296390533, "rewards/margins_min": 0.024047832936048508, "rewards/margins_std": 0.03905193880200386, "rewards/rejected": -0.02369346097111702, "step": 2220 }, { "epoch": 0.5, "grad_norm": 0.462890625, "learning_rate": 2.899754902156879e-07, "logits/chosen": -1.348962664604187, "logits/rejected": -0.9912853240966797, "logps/chosen": -262.5445861816406, "logps/rejected": -196.90609741210938, "loss": 0.6686, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.028756489977240562, "rewards/margins": 0.04377773404121399, "rewards/margins_max": 0.06630166620016098, "rewards/margins_min": 0.0212537981569767, "rewards/margins_std": 0.031853653490543365, "rewards/rejected": -0.015021244063973427, "step": 2230 }, { "epoch": 0.51, "grad_norm": 0.408203125, "learning_rate": 2.88026784239552e-07, "logits/chosen": -1.3572931289672852, "logits/rejected": -1.089862585067749, "logps/chosen": -205.95458984375, "logps/rejected": -220.0574188232422, "loss": 0.6669, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03338629752397537, "rewards/margins": 0.05913955718278885, "rewards/margins_max": 0.08510196954011917, "rewards/margins_min": 0.033177152276039124, "rewards/margins_std": 0.036716386675834656, "rewards/rejected": -0.025753263384103775, "step": 2240 }, { "epoch": 0.51, "grad_norm": 0.4765625, "learning_rate": 2.8607571014869815e-07, "logits/chosen": -1.481650948524475, "logits/rejected": -1.1228121519088745, "logps/chosen": -222.73886108398438, "logps/rejected": -214.49484252929688, "loss": 0.6652, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.031130477786064148, "rewards/margins": 0.06208733841776848, "rewards/margins_max": 0.08940434455871582, "rewards/margins_min": 0.03477033972740173, "rewards/margins_std": 0.038632072508335114, "rewards/rejected": -0.03095685876905918, "step": 2250 }, { "epoch": 0.51, "grad_norm": 0.384765625, "learning_rate": 2.8412238944609754e-07, "logits/chosen": -1.3825039863586426, "logits/rejected": -1.1998974084854126, "logps/chosen": -158.28053283691406, "logps/rejected": -162.88006591796875, "loss": 0.6736, "rewards/accuracies": 0.75, "rewards/chosen": 0.03045378252863884, "rewards/margins": 0.03599992021918297, "rewards/margins_max": 0.05915500596165657, "rewards/margins_min": 0.012844832614064217, "rewards/margins_std": 0.03274623677134514, "rewards/rejected": -0.005546136759221554, "step": 2260 }, { "epoch": 0.51, "grad_norm": 0.392578125, "learning_rate": 2.8216694377462907e-07, "logits/chosen": -1.314422845840454, "logits/rejected": -1.034608244895935, "logps/chosen": -220.6864776611328, "logps/rejected": -234.85214233398438, "loss": 0.6671, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03803492337465286, "rewards/margins": 0.06186642125248909, "rewards/margins_max": 0.09449129551649094, "rewards/margins_min": 0.029241541400551796, "rewards/margins_std": 0.046138547360897064, "rewards/rejected": -0.023831497877836227, "step": 2270 }, { "epoch": 0.52, "grad_norm": 0.193359375, "learning_rate": 2.8020949490950365e-07, "logits/chosen": -1.6204173564910889, "logits/rejected": -1.1893694400787354, "logps/chosen": -224.90567016601562, "logps/rejected": -204.3470001220703, "loss": 0.6702, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.033450596034526825, "rewards/margins": 0.05014806240797043, "rewards/margins_max": 0.07468613237142563, "rewards/margins_min": 0.02560998871922493, "rewards/margins_std": 0.03470207750797272, "rewards/rejected": -0.016697466373443604, "step": 2280 }, { "epoch": 0.52, "grad_norm": 0.310546875, "learning_rate": 2.78250164750681e-07, "logits/chosen": -1.5214847326278687, "logits/rejected": -1.2135298252105713, "logps/chosen": -225.0383758544922, "logps/rejected": -222.4280548095703, "loss": 0.6741, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.028520096093416214, "rewards/margins": 0.04392777383327484, "rewards/margins_max": 0.072211354970932, "rewards/margins_min": 0.015644187107682228, "rewards/margins_std": 0.03999902680516243, "rewards/rejected": -0.015407675877213478, "step": 2290 }, { "epoch": 0.52, "grad_norm": 0.359375, "learning_rate": 2.7628907531527813e-07, "logits/chosen": -1.4052813053131104, "logits/rejected": -1.084937572479248, "logps/chosen": -287.57550048828125, "logps/rejected": -199.4448699951172, "loss": 0.6684, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.02502388320863247, "rewards/margins": 0.05385827273130417, "rewards/margins_max": 0.08281184732913971, "rewards/margins_min": 0.024904707446694374, "rewards/margins_std": 0.040946535766124725, "rewards/rejected": -0.028834396973252296, "step": 2300 }, { "epoch": 0.52, "grad_norm": 0.5234375, "learning_rate": 2.743263487299712e-07, "logits/chosen": -1.329421043395996, "logits/rejected": -1.2132747173309326, "logps/chosen": -282.1737365722656, "logps/rejected": -261.6746520996094, "loss": 0.6704, "rewards/accuracies": 0.875, "rewards/chosen": 0.02811703085899353, "rewards/margins": 0.04853527620434761, "rewards/margins_max": 0.07612602412700653, "rewards/margins_min": 0.02094453200697899, "rewards/margins_std": 0.039019204676151276, "rewards/rejected": -0.02041824534535408, "step": 2310 }, { "epoch": 0.52, "grad_norm": 0.333984375, "learning_rate": 2.7236210722338933e-07, "logits/chosen": -1.4650366306304932, "logits/rejected": -1.1293063163757324, "logps/chosen": -246.368896484375, "logps/rejected": -186.78863525390625, "loss": 0.6726, "rewards/accuracies": 0.875, "rewards/chosen": 0.03457409888505936, "rewards/margins": 0.04936753585934639, "rewards/margins_max": 0.07968120276927948, "rewards/margins_min": 0.019053865224123, "rewards/margins_std": 0.04287000000476837, "rewards/rejected": -0.014793431386351585, "step": 2320 }, { "epoch": 0.53, "grad_norm": 0.33984375, "learning_rate": 2.7039647311850346e-07, "logits/chosen": -1.2866630554199219, "logits/rejected": -1.0842173099517822, "logps/chosen": -177.29855346679688, "logps/rejected": -178.6018524169922, "loss": 0.6687, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.02924739383161068, "rewards/margins": 0.04846047982573509, "rewards/margins_max": 0.07607054710388184, "rewards/margins_min": 0.02085040882229805, "rewards/margins_std": 0.03904653713107109, "rewards/rejected": -0.01921308971941471, "step": 2330 }, { "epoch": 0.53, "grad_norm": 0.328125, "learning_rate": 2.684295688250084e-07, "logits/chosen": -1.4828417301177979, "logits/rejected": -1.1574809551239014, "logps/chosen": -229.223876953125, "logps/rejected": -193.02645874023438, "loss": 0.6644, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.039402466267347336, "rewards/margins": 0.052919901907444, "rewards/margins_max": 0.07841168344020844, "rewards/margins_min": 0.027428116649389267, "rewards/margins_std": 0.03605083003640175, "rewards/rejected": -0.013517431914806366, "step": 2340 }, { "epoch": 0.53, "grad_norm": 0.388671875, "learning_rate": 2.664615168316998e-07, "logits/chosen": -1.4797755479812622, "logits/rejected": -1.3098758459091187, "logps/chosen": -191.74508666992188, "logps/rejected": -216.4213104248047, "loss": 0.666, "rewards/accuracies": 0.875, "rewards/chosen": 0.036059409379959106, "rewards/margins": 0.050756268203258514, "rewards/margins_max": 0.08269494771957397, "rewards/margins_min": 0.0188176017254591, "rewards/margins_std": 0.04516809806227684, "rewards/rejected": -0.014696864411234856, "step": 2350 }, { "epoch": 0.53, "grad_norm": 0.390625, "learning_rate": 2.6449243969884645e-07, "logits/chosen": -1.4017361402511597, "logits/rejected": -1.0677566528320312, "logps/chosen": -209.80126953125, "logps/rejected": -181.45455932617188, "loss": 0.6739, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.02429645135998726, "rewards/margins": 0.043695103377103806, "rewards/margins_max": 0.07192997634410858, "rewards/margins_min": 0.01546022854745388, "rewards/margins_std": 0.03993014246225357, "rewards/rejected": -0.019398652017116547, "step": 2360 }, { "epoch": 0.54, "grad_norm": 0.298828125, "learning_rate": 2.625224600505572e-07, "logits/chosen": -1.3834593296051025, "logits/rejected": -1.1724491119384766, "logps/chosen": -220.4295654296875, "logps/rejected": -285.34014892578125, "loss": 0.6681, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.024225343018770218, "rewards/margins": 0.052785150706768036, "rewards/margins_max": 0.08505556732416153, "rewards/margins_min": 0.020514745265245438, "rewards/margins_std": 0.04563724994659424, "rewards/rejected": -0.028559807687997818, "step": 2370 }, { "epoch": 0.54, "grad_norm": 0.50390625, "learning_rate": 2.605517005671454e-07, "logits/chosen": -1.508434534072876, "logits/rejected": -1.1241999864578247, "logps/chosen": -270.1075439453125, "logps/rejected": -254.4306182861328, "loss": 0.6659, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.037416476756334305, "rewards/margins": 0.051527239382267, "rewards/margins_max": 0.08236847817897797, "rewards/margins_min": 0.02068600058555603, "rewards/margins_std": 0.043616097420454025, "rewards/rejected": -0.014110761694610119, "step": 2380 }, { "epoch": 0.54, "grad_norm": 0.302734375, "learning_rate": 2.5858028397748825e-07, "logits/chosen": -1.4443773031234741, "logits/rejected": -0.9758247137069702, "logps/chosen": -222.76760864257812, "logps/rejected": -232.0113067626953, "loss": 0.6728, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03209487348794937, "rewards/margins": 0.0541413240134716, "rewards/margins_max": 0.08297590911388397, "rewards/margins_min": 0.02530675008893013, "rewards/margins_std": 0.04077824950218201, "rewards/rejected": -0.02204645611345768, "step": 2390 }, { "epoch": 0.54, "grad_norm": 0.55859375, "learning_rate": 2.5660833305138447e-07, "logits/chosen": -1.482155680656433, "logits/rejected": -1.2326542139053345, "logps/chosen": -293.49993896484375, "logps/rejected": -266.3676452636719, "loss": 0.6652, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.030677342787384987, "rewards/margins": 0.0634465366601944, "rewards/margins_max": 0.10515228658914566, "rewards/margins_min": 0.021740790456533432, "rewards/margins_std": 0.05898084118962288, "rewards/rejected": -0.03276919946074486, "step": 2400 }, { "epoch": 0.54, "grad_norm": 0.380859375, "learning_rate": 2.5463597059190827e-07, "logits/chosen": -1.3583731651306152, "logits/rejected": -1.2022713422775269, "logps/chosen": -170.06192016601562, "logps/rejected": -219.817138671875, "loss": 0.6714, "rewards/accuracies": 0.875, "rewards/chosen": 0.023664075881242752, "rewards/margins": 0.034540314227342606, "rewards/margins_max": 0.054481375962495804, "rewards/margins_min": 0.014599250629544258, "rewards/margins_std": 0.028200918808579445, "rewards/rejected": -0.010876237414777279, "step": 2410 }, { "epoch": 0.55, "grad_norm": 0.369140625, "learning_rate": 2.5266331942776213e-07, "logits/chosen": -1.5255191326141357, "logits/rejected": -1.2606998682022095, "logps/chosen": -210.038330078125, "logps/rejected": -205.2218475341797, "loss": 0.6724, "rewards/accuracies": 0.875, "rewards/chosen": 0.03931133449077606, "rewards/margins": 0.043173693120479584, "rewards/margins_max": 0.06728474795818329, "rewards/margins_min": 0.019062651321291924, "rewards/margins_std": 0.03409816697239876, "rewards/rejected": -0.0038623593281954527, "step": 2420 }, { "epoch": 0.55, "grad_norm": 0.427734375, "learning_rate": 2.5069050240562777e-07, "logits/chosen": -1.4253253936767578, "logits/rejected": -1.0836570262908936, "logps/chosen": -193.62042236328125, "logps/rejected": -189.53955078125, "loss": 0.6664, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.029120203107595444, "rewards/margins": 0.05882256105542183, "rewards/margins_max": 0.07818777859210968, "rewards/margins_min": 0.039457354694604874, "rewards/margins_std": 0.027386540547013283, "rewards/rejected": -0.029702359810471535, "step": 2430 }, { "epoch": 0.55, "grad_norm": 0.3203125, "learning_rate": 2.4871764238251546e-07, "logits/chosen": -1.377029299736023, "logits/rejected": -1.1650168895721436, "logps/chosen": -260.649169921875, "logps/rejected": -293.96490478515625, "loss": 0.6703, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03016340360045433, "rewards/margins": 0.048960715532302856, "rewards/margins_max": 0.07401823252439499, "rewards/margins_min": 0.023903196677565575, "rewards/margins_std": 0.035436682403087616, "rewards/rejected": -0.018797313794493675, "step": 2440 }, { "epoch": 0.55, "grad_norm": 0.4609375, "learning_rate": 2.467448622181134e-07, "logits/chosen": -1.1938097476959229, "logits/rejected": -0.9804821014404297, "logps/chosen": -210.96774291992188, "logps/rejected": -213.63916015625, "loss": 0.6678, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.026612062007188797, "rewards/margins": 0.051428746432065964, "rewards/margins_max": 0.08274015039205551, "rewards/margins_min": 0.020117351785302162, "rewards/margins_std": 0.0442809984087944, "rewards/rejected": -0.024816682562232018, "step": 2450 }, { "epoch": 0.56, "grad_norm": 0.453125, "learning_rate": 2.447722847671369e-07, "logits/chosen": -1.3869951963424683, "logits/rejected": -1.1585485935211182, "logps/chosen": -216.66476440429688, "logps/rejected": -222.16177368164062, "loss": 0.6754, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.026378905400633812, "rewards/margins": 0.03656502440571785, "rewards/margins_max": 0.059453725814819336, "rewards/margins_min": 0.013676322996616364, "rewards/margins_std": 0.03236951306462288, "rewards/rejected": -0.010186120867729187, "step": 2460 }, { "epoch": 0.56, "grad_norm": 0.40625, "learning_rate": 2.428000328716768e-07, "logits/chosen": -1.2768441438674927, "logits/rejected": -1.0203564167022705, "logps/chosen": -327.2904052734375, "logps/rejected": -186.34286499023438, "loss": 0.6671, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.025987038388848305, "rewards/margins": 0.044793128967285156, "rewards/margins_max": 0.06938231736421585, "rewards/margins_min": 0.02020394243299961, "rewards/margins_std": 0.03477436676621437, "rewards/rejected": -0.01880609430372715, "step": 2470 }, { "epoch": 0.56, "grad_norm": 0.333984375, "learning_rate": 2.4082822935355034e-07, "logits/chosen": -1.3781462907791138, "logits/rejected": -1.1217296123504639, "logps/chosen": -230.134521484375, "logps/rejected": -178.98233032226562, "loss": 0.667, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.025606945157051086, "rewards/margins": 0.04344099014997482, "rewards/margins_max": 0.06334863603115082, "rewards/margins_min": 0.02353333681821823, "rewards/margins_std": 0.0281536765396595, "rewards/rejected": -0.017834046855568886, "step": 2480 }, { "epoch": 0.56, "grad_norm": 0.494140625, "learning_rate": 2.3885699700665214e-07, "logits/chosen": -1.3934246301651, "logits/rejected": -0.9389753341674805, "logps/chosen": -302.7892150878906, "logps/rejected": -338.69769287109375, "loss": 0.6737, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03493428975343704, "rewards/margins": 0.05114731192588806, "rewards/margins_max": 0.0730891153216362, "rewards/margins_min": 0.029205525293946266, "rewards/margins_std": 0.031030382961034775, "rewards/rejected": -0.016213025897741318, "step": 2490 }, { "epoch": 0.57, "grad_norm": 0.3515625, "learning_rate": 2.3688645858930683e-07, "logits/chosen": -1.3707977533340454, "logits/rejected": -1.0166159868240356, "logps/chosen": -271.46063232421875, "logps/rejected": -236.61270141601562, "loss": 0.6669, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04207485169172287, "rewards/margins": 0.05323861911892891, "rewards/margins_max": 0.08189557492733002, "rewards/margins_min": 0.024581637233495712, "rewards/margins_std": 0.04052707925438881, "rewards/rejected": -0.011163758113980293, "step": 2500 }, { "epoch": 0.57, "grad_norm": 0.330078125, "learning_rate": 2.3491673681662508e-07, "logits/chosen": -1.343019723892212, "logits/rejected": -1.1173169612884521, "logps/chosen": -209.8493194580078, "logps/rejected": -248.96041870117188, "loss": 0.6723, "rewards/accuracies": 0.875, "rewards/chosen": 0.02808157540857792, "rewards/margins": 0.03859950974583626, "rewards/margins_max": 0.06530123949050903, "rewards/margins_min": 0.011897771619260311, "rewards/margins_std": 0.037761956453323364, "rewards/rejected": -0.01051793061196804, "step": 2510 }, { "epoch": 0.57, "grad_norm": 0.345703125, "learning_rate": 2.329479543528607e-07, "logits/chosen": -1.370318055152893, "logits/rejected": -1.0287044048309326, "logps/chosen": -212.48788452148438, "logps/rejected": -178.92825317382812, "loss": 0.6693, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.030343199148774147, "rewards/margins": 0.05134710669517517, "rewards/margins_max": 0.07893334329128265, "rewards/margins_min": 0.02376086637377739, "rewards/margins_std": 0.039012834429740906, "rewards/rejected": -0.021003911271691322, "step": 2520 }, { "epoch": 0.57, "grad_norm": 0.5390625, "learning_rate": 2.3098023380377253e-07, "logits/chosen": -1.2607336044311523, "logits/rejected": -1.1116814613342285, "logps/chosen": -273.98931884765625, "logps/rejected": -227.2015380859375, "loss": 0.6716, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.014126409776508808, "rewards/margins": 0.04472264647483826, "rewards/margins_max": 0.06790605932474136, "rewards/margins_min": 0.021539241075515747, "rewards/margins_std": 0.032786283642053604, "rewards/rejected": -0.030596237629652023, "step": 2530 }, { "epoch": 0.57, "grad_norm": 0.376953125, "learning_rate": 2.2901369770898826e-07, "logits/chosen": -1.423923373222351, "logits/rejected": -1.1069891452789307, "logps/chosen": -245.7572479248047, "logps/rejected": -187.391845703125, "loss": 0.6657, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.028265494853258133, "rewards/margins": 0.05082274600863457, "rewards/margins_max": 0.06974449753761292, "rewards/margins_min": 0.03190099820494652, "rewards/margins_std": 0.026759391650557518, "rewards/rejected": -0.022557254880666733, "step": 2540 }, { "epoch": 0.58, "grad_norm": 0.39453125, "learning_rate": 2.270484685343742e-07, "logits/chosen": -1.2900946140289307, "logits/rejected": -1.0569812059402466, "logps/chosen": -212.2299346923828, "logps/rejected": -204.08480834960938, "loss": 0.6736, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.023340780287981033, "rewards/margins": 0.039305586367845535, "rewards/margins_max": 0.06847918778657913, "rewards/margins_min": 0.010131985880434513, "rewards/margins_std": 0.041257698088884354, "rewards/rejected": -0.015964802354574203, "step": 2550 }, { "epoch": 0.58, "grad_norm": 0.4140625, "learning_rate": 2.2508466866440822e-07, "logits/chosen": -1.3407169580459595, "logits/rejected": -0.9573219418525696, "logps/chosen": -244.609375, "logps/rejected": -233.3170623779297, "loss": 0.6677, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03367815539240837, "rewards/margins": 0.056748904287815094, "rewards/margins_max": 0.08551572263240814, "rewards/margins_min": 0.0279820766299963, "rewards/margins_std": 0.04068244248628616, "rewards/rejected": -0.02307075262069702, "step": 2560 }, { "epoch": 0.58, "grad_norm": 0.50390625, "learning_rate": 2.2312242039455813e-07, "logits/chosen": -1.4510078430175781, "logits/rejected": -1.0880759954452515, "logps/chosen": -217.3108673095703, "logps/rejected": -215.2972869873047, "loss": 0.6683, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.039409976452589035, "rewards/margins": 0.05283751338720322, "rewards/margins_max": 0.08771199733018875, "rewards/margins_min": 0.01796303130686283, "rewards/margins_std": 0.04931997135281563, "rewards/rejected": -0.013427533209323883, "step": 2570 }, { "epoch": 0.58, "grad_norm": 0.484375, "learning_rate": 2.2116184592366637e-07, "logits/chosen": -1.2980551719665527, "logits/rejected": -1.0735098123550415, "logps/chosen": -243.39370727539062, "logps/rejected": -239.2067413330078, "loss": 0.6684, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03561193123459816, "rewards/margins": 0.05166007950901985, "rewards/margins_max": 0.07473595440387726, "rewards/margins_min": 0.028584185987710953, "rewards/margins_std": 0.03263423591852188, "rewards/rejected": -0.016048144549131393, "step": 2580 }, { "epoch": 0.59, "grad_norm": 0.3125, "learning_rate": 2.1920306734633932e-07, "logits/chosen": -1.3777854442596436, "logits/rejected": -1.1635510921478271, "logps/chosen": -178.19998168945312, "logps/rejected": -221.8589324951172, "loss": 0.6684, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.039459072053432465, "rewards/margins": 0.051184237003326416, "rewards/margins_max": 0.07255267351865768, "rewards/margins_min": 0.029815804213285446, "rewards/margins_std": 0.030219530686736107, "rewards/rejected": -0.011725172400474548, "step": 2590 }, { "epoch": 0.59, "grad_norm": 0.4765625, "learning_rate": 2.1724620664534452e-07, "logits/chosen": -1.4241256713867188, "logits/rejected": -0.995019793510437, "logps/chosen": -236.72262573242188, "logps/rejected": -216.3303680419922, "loss": 0.6686, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.02747528627514839, "rewards/margins": 0.04677470773458481, "rewards/margins_max": 0.07165158540010452, "rewards/margins_min": 0.021897820755839348, "rewards/margins_std": 0.03518122434616089, "rewards/rejected": -0.019299419596791267, "step": 2600 }, { "epoch": 0.59, "grad_norm": 0.515625, "learning_rate": 2.1529138568401374e-07, "logits/chosen": -1.4380706548690796, "logits/rejected": -1.1371575593948364, "logps/chosen": -268.78253173828125, "logps/rejected": -245.05960083007812, "loss": 0.6705, "rewards/accuracies": 0.875, "rewards/chosen": 0.025660425424575806, "rewards/margins": 0.04348149523139, "rewards/margins_max": 0.0655093640089035, "rewards/margins_min": 0.021453622728586197, "rewards/margins_std": 0.031152114272117615, "rewards/rejected": -0.017821069806814194, "step": 2610 }, { "epoch": 0.59, "grad_norm": 0.5546875, "learning_rate": 2.1333872619865436e-07, "logits/chosen": -1.3819271326065063, "logits/rejected": -1.2658441066741943, "logps/chosen": -146.02621459960938, "logps/rejected": -222.25827026367188, "loss": 0.6726, "rewards/accuracies": 0.875, "rewards/chosen": 0.02677101269364357, "rewards/margins": 0.04473014920949936, "rewards/margins_max": 0.06981190294027328, "rewards/margins_min": 0.01964840292930603, "rewards/margins_std": 0.03547095134854317, "rewards/rejected": -0.017959142103791237, "step": 2620 }, { "epoch": 0.59, "grad_norm": 0.373046875, "learning_rate": 2.1138834979096777e-07, "logits/chosen": -1.4313969612121582, "logits/rejected": -1.0846507549285889, "logps/chosen": -319.28924560546875, "logps/rejected": -198.7446746826172, "loss": 0.6743, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.02570546232163906, "rewards/margins": 0.03866446763277054, "rewards/margins_max": 0.052897948771715164, "rewards/margins_min": 0.024430977180600166, "rewards/margins_std": 0.020129187032580376, "rewards/rejected": -0.01295899786055088, "step": 2630 }, { "epoch": 0.6, "grad_norm": 0.3203125, "learning_rate": 2.0944037792047694e-07, "logits/chosen": -1.4719856977462769, "logits/rejected": -1.1970919370651245, "logps/chosen": -248.2223358154297, "logps/rejected": -195.37448120117188, "loss": 0.671, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03520715609192848, "rewards/margins": 0.04729854315519333, "rewards/margins_max": 0.06652859598398209, "rewards/margins_min": 0.02806849777698517, "rewards/margins_std": 0.02719539776444435, "rewards/rejected": -0.012091396376490593, "step": 2640 }, { "epoch": 0.6, "grad_norm": 0.4765625, "learning_rate": 2.0749493189696277e-07, "logits/chosen": -1.4909460544586182, "logits/rejected": -1.3218861818313599, "logps/chosen": -185.89071655273438, "logps/rejected": -220.0978240966797, "loss": 0.6702, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.030022714287042618, "rewards/margins": 0.04576057940721512, "rewards/margins_max": 0.08507208526134491, "rewards/margins_min": 0.0064490786753594875, "rewards/margins_std": 0.055594854056835175, "rewards/rejected": -0.01573786698281765, "step": 2650 }, { "epoch": 0.6, "grad_norm": 0.275390625, "learning_rate": 2.0555213287290884e-07, "logits/chosen": -1.4318983554840088, "logits/rejected": -1.1789401769638062, "logps/chosen": -219.03793334960938, "logps/rejected": -216.1660919189453, "loss": 0.6678, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.02756131812930107, "rewards/margins": 0.04779034107923508, "rewards/margins_max": 0.07405062019824982, "rewards/margins_min": 0.02153005078434944, "rewards/margins_std": 0.037137649953365326, "rewards/rejected": -0.020229021087288857, "step": 2660 }, { "epoch": 0.6, "grad_norm": 0.5546875, "learning_rate": 2.036121018359574e-07, "logits/chosen": -1.4215052127838135, "logits/rejected": -1.1148322820663452, "logps/chosen": -214.1374969482422, "logps/rejected": -228.98159790039062, "loss": 0.6669, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03607341647148132, "rewards/margins": 0.06266181170940399, "rewards/margins_max": 0.09267817437648773, "rewards/margins_min": 0.03264545649290085, "rewards/margins_std": 0.04244953393936157, "rewards/rejected": -0.026588398963212967, "step": 2670 }, { "epoch": 0.61, "grad_norm": 0.349609375, "learning_rate": 2.0167495960137438e-07, "logits/chosen": -1.5387341976165771, "logits/rejected": -1.1654388904571533, "logps/chosen": -194.2707977294922, "logps/rejected": -257.0436706542969, "loss": 0.6711, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04224932938814163, "rewards/margins": 0.05386502668261528, "rewards/margins_max": 0.0798775777220726, "rewards/margins_min": 0.027852484956383705, "rewards/margins_std": 0.03678729385137558, "rewards/rejected": -0.01161570381373167, "step": 2680 }, { "epoch": 0.61, "grad_norm": 0.4296875, "learning_rate": 1.997408268045259e-07, "logits/chosen": -1.3454197645187378, "logits/rejected": -1.0782673358917236, "logps/chosen": -234.7984161376953, "logps/rejected": -211.3849334716797, "loss": 0.6703, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.03135889023542404, "rewards/margins": 0.04911700636148453, "rewards/margins_max": 0.07688502967357635, "rewards/margins_min": 0.021348986774683, "rewards/margins_std": 0.039269909262657166, "rewards/rejected": -0.017758117988705635, "step": 2690 }, { "epoch": 0.61, "grad_norm": 0.37890625, "learning_rate": 1.9780982389336537e-07, "logits/chosen": -1.5304574966430664, "logits/rejected": -1.174623727798462, "logps/chosen": -223.42050170898438, "logps/rejected": -214.8582763671875, "loss": 0.6725, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.02357478067278862, "rewards/margins": 0.04619375243782997, "rewards/margins_max": 0.07109406590461731, "rewards/margins_min": 0.021293427795171738, "rewards/margins_std": 0.0352143719792366, "rewards/rejected": -0.022618968039751053, "step": 2700 }, { "epoch": 0.61, "grad_norm": 0.361328125, "learning_rate": 1.9588207112093322e-07, "logits/chosen": -1.3365089893341064, "logits/rejected": -1.0793324708938599, "logps/chosen": -255.2864990234375, "logps/rejected": -225.516357421875, "loss": 0.6701, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.028706055134534836, "rewards/margins": 0.0458790548145771, "rewards/margins_max": 0.07555496692657471, "rewards/margins_min": 0.016203144565224648, "rewards/margins_std": 0.041968077421188354, "rewards/rejected": -0.017172997817397118, "step": 2710 }, { "epoch": 0.61, "grad_norm": 0.498046875, "learning_rate": 1.9395768853786738e-07, "logits/chosen": -1.4965957403182983, "logits/rejected": -0.957140326499939, "logps/chosen": -261.61871337890625, "logps/rejected": -232.3061981201172, "loss": 0.6626, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03507986664772034, "rewards/margins": 0.062941774725914, "rewards/margins_max": 0.09290553629398346, "rewards/margins_min": 0.03297800570726395, "rewards/margins_std": 0.04237515479326248, "rewards/rejected": -0.02786189876496792, "step": 2720 }, { "epoch": 0.62, "grad_norm": 0.4453125, "learning_rate": 1.9203679598492767e-07, "logits/chosen": -1.5386086702346802, "logits/rejected": -1.2974907159805298, "logps/chosen": -173.94613647460938, "logps/rejected": -201.42996215820312, "loss": 0.6683, "rewards/accuracies": 0.875, "rewards/chosen": 0.03600393980741501, "rewards/margins": 0.05009385943412781, "rewards/margins_max": 0.07953473180532455, "rewards/margins_min": 0.020652998238801956, "rewards/margins_std": 0.0416356697678566, "rewards/rejected": -0.014089921489357948, "step": 2730 }, { "epoch": 0.62, "grad_norm": 0.361328125, "learning_rate": 1.9011951308553282e-07, "logits/chosen": -1.167707085609436, "logits/rejected": -0.9583696126937866, "logps/chosen": -253.3549346923828, "logps/rejected": -247.03994750976562, "loss": 0.6643, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03269846737384796, "rewards/margins": 0.05938447639346123, "rewards/margins_max": 0.0881558284163475, "rewards/margins_min": 0.03061310388147831, "rewards/margins_std": 0.040688853710889816, "rewards/rejected": -0.02668600343167782, "step": 2740 }, { "epoch": 0.62, "grad_norm": 0.42578125, "learning_rate": 1.8820595923831023e-07, "logits/chosen": -1.4228308200836182, "logits/rejected": -1.1265003681182861, "logps/chosen": -237.9507598876953, "logps/rejected": -199.07684326171875, "loss": 0.668, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.035271745175123215, "rewards/margins": 0.04950911924242973, "rewards/margins_max": 0.07321339845657349, "rewards/margins_min": 0.02580484375357628, "rewards/margins_std": 0.03352290391921997, "rewards/rejected": -0.014237369410693645, "step": 2750 }, { "epoch": 0.62, "grad_norm": 0.359375, "learning_rate": 1.8629625360966134e-07, "logits/chosen": -1.4357101917266846, "logits/rejected": -1.2019593715667725, "logps/chosen": -222.2527313232422, "logps/rejected": -240.33810424804688, "loss": 0.6651, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.040762145072221756, "rewards/margins": 0.06205862760543823, "rewards/margins_max": 0.09742200374603271, "rewards/margins_min": 0.026695240288972855, "rewards/margins_std": 0.05001138523221016, "rewards/rejected": -0.02129647508263588, "step": 2760 }, { "epoch": 0.63, "grad_norm": 0.369140625, "learning_rate": 1.8439051512633982e-07, "logits/chosen": -1.448634386062622, "logits/rejected": -1.0578219890594482, "logps/chosen": -298.6661682128906, "logps/rejected": -253.38241577148438, "loss": 0.6737, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03544292971491814, "rewards/margins": 0.03846219182014465, "rewards/margins_max": 0.06004079431295395, "rewards/margins_min": 0.016883578151464462, "rewards/margins_std": 0.030516769737005234, "rewards/rejected": -0.003019258612766862, "step": 2770 }, { "epoch": 0.63, "grad_norm": 0.283203125, "learning_rate": 1.8248886246804596e-07, "logits/chosen": -1.3351027965545654, "logits/rejected": -1.178442120552063, "logps/chosen": -208.878662109375, "logps/rejected": -172.73736572265625, "loss": 0.6697, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.027096301317214966, "rewards/margins": 0.04242512583732605, "rewards/margins_max": 0.06505424529314041, "rewards/margins_min": 0.019795997068285942, "rewards/margins_std": 0.032002415508031845, "rewards/rejected": -0.01532882172614336, "step": 2780 }, { "epoch": 0.63, "grad_norm": 0.328125, "learning_rate": 1.805914140600353e-07, "logits/chosen": -1.3552095890045166, "logits/rejected": -1.1506478786468506, "logps/chosen": -165.49307250976562, "logps/rejected": -199.17178344726562, "loss": 0.6676, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03187553584575653, "rewards/margins": 0.04981083795428276, "rewards/margins_max": 0.0677100196480751, "rewards/margins_min": 0.03191165626049042, "rewards/margins_std": 0.02531326375901699, "rewards/rejected": -0.01793530210852623, "step": 2790 }, { "epoch": 0.63, "grad_norm": 0.41015625, "learning_rate": 1.7869828806574438e-07, "logits/chosen": -1.3208125829696655, "logits/rejected": -1.1334774494171143, "logps/chosen": -243.2615203857422, "logps/rejected": -207.776611328125, "loss": 0.6728, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.016092196106910706, "rewards/margins": 0.03982243314385414, "rewards/margins_max": 0.0666196197271347, "rewards/margins_min": 0.013025254011154175, "rewards/margins_std": 0.03789693862199783, "rewards/rejected": -0.023730238899588585, "step": 2800 }, { "epoch": 0.64, "grad_norm": 0.40625, "learning_rate": 1.768096023794317e-07, "logits/chosen": -1.4114409685134888, "logits/rejected": -1.1142067909240723, "logps/chosen": -213.9055938720703, "logps/rejected": -193.6376953125, "loss": 0.6697, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03414800763130188, "rewards/margins": 0.04588519036769867, "rewards/margins_max": 0.06450579315423965, "rewards/margins_min": 0.027264589443802834, "rewards/margins_std": 0.026333507150411606, "rewards/rejected": -0.011737184599041939, "step": 2810 }, { "epoch": 0.64, "grad_norm": 0.287109375, "learning_rate": 1.7492547461883577e-07, "logits/chosen": -1.378385305404663, "logits/rejected": -1.0380463600158691, "logps/chosen": -234.6033477783203, "logps/rejected": -204.3202362060547, "loss": 0.6674, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.030243387445807457, "rewards/margins": 0.04808034002780914, "rewards/margins_max": 0.07598671317100525, "rewards/margins_min": 0.020173965021967888, "rewards/margins_std": 0.03946557641029358, "rewards/rejected": -0.017836952582001686, "step": 2820 }, { "epoch": 0.64, "grad_norm": 0.4375, "learning_rate": 1.7304602211785103e-07, "logits/chosen": -1.4631866216659546, "logits/rejected": -1.1849663257598877, "logps/chosen": -210.96768188476562, "logps/rejected": -216.9154052734375, "loss": 0.6696, "rewards/accuracies": 0.875, "rewards/chosen": 0.03199288621544838, "rewards/margins": 0.054790280759334564, "rewards/margins_max": 0.08474183082580566, "rewards/margins_min": 0.024838734418153763, "rewards/margins_std": 0.042357880622148514, "rewards/rejected": -0.022797394543886185, "step": 2830 }, { "epoch": 0.64, "grad_norm": 0.310546875, "learning_rate": 1.711713619192201e-07, "logits/chosen": -1.3077460527420044, "logits/rejected": -1.139337420463562, "logps/chosen": -185.03842163085938, "logps/rejected": -171.12295532226562, "loss": 0.6667, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.030919110402464867, "rewards/margins": 0.051524568349123, "rewards/margins_max": 0.08040101826190948, "rewards/margins_min": 0.022648107260465622, "rewards/margins_std": 0.04083748161792755, "rewards/rejected": -0.020605452358722687, "step": 2840 }, { "epoch": 0.64, "grad_norm": 0.37890625, "learning_rate": 1.6930161076724584e-07, "logits/chosen": -1.4011225700378418, "logits/rejected": -1.0451468229293823, "logps/chosen": -198.51084899902344, "logps/rejected": -216.575927734375, "loss": 0.668, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03980085626244545, "rewards/margins": 0.05166858434677124, "rewards/margins_max": 0.08060939610004425, "rewards/margins_min": 0.022727767005562782, "rewards/margins_std": 0.040928494185209274, "rewards/rejected": -0.011867721565067768, "step": 2850 }, { "epoch": 0.65, "grad_norm": 0.298828125, "learning_rate": 1.6743688510052023e-07, "logits/chosen": -1.445908546447754, "logits/rejected": -1.2983238697052002, "logps/chosen": -184.75465393066406, "logps/rejected": -176.6446533203125, "loss": 0.6719, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.021395951509475708, "rewards/margins": 0.03993183746933937, "rewards/margins_max": 0.05802379921078682, "rewards/margins_min": 0.021839866414666176, "rewards/margins_std": 0.025585904717445374, "rewards/rejected": -0.018535882234573364, "step": 2860 }, { "epoch": 0.65, "grad_norm": 0.443359375, "learning_rate": 1.6557730104467403e-07, "logits/chosen": -1.5235058069229126, "logits/rejected": -1.1063475608825684, "logps/chosen": -209.2215576171875, "logps/rejected": -193.5798797607422, "loss": 0.6641, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.036139652132987976, "rewards/margins": 0.04945778846740723, "rewards/margins_max": 0.0719948261976242, "rewards/margins_min": 0.026920750737190247, "rewards/margins_std": 0.03187217935919762, "rewards/rejected": -0.01331813633441925, "step": 2870 }, { "epoch": 0.65, "grad_norm": 0.419921875, "learning_rate": 1.6372297440514415e-07, "logits/chosen": -1.3985137939453125, "logits/rejected": -1.1464028358459473, "logps/chosen": -201.1994171142578, "logps/rejected": -201.71766662597656, "loss": 0.6667, "rewards/accuracies": 0.875, "rewards/chosen": 0.037888336926698685, "rewards/margins": 0.051082391291856766, "rewards/margins_max": 0.07492348551750183, "rewards/margins_min": 0.027241300791502, "rewards/margins_std": 0.03371639922261238, "rewards/rejected": -0.013194059021770954, "step": 2880 }, { "epoch": 0.65, "grad_norm": 0.388671875, "learning_rate": 1.6187402065996263e-07, "logits/chosen": -1.499720811843872, "logits/rejected": -1.1451431512832642, "logps/chosen": -205.8267822265625, "logps/rejected": -189.5011444091797, "loss": 0.6677, "rewards/accuracies": 0.875, "rewards/chosen": 0.040955521166324615, "rewards/margins": 0.05267069488763809, "rewards/margins_max": 0.0834018737077713, "rewards/margins_min": 0.021939512342214584, "rewards/margins_std": 0.04346044734120369, "rewards/rejected": -0.011715171858668327, "step": 2890 }, { "epoch": 0.66, "grad_norm": 0.376953125, "learning_rate": 1.6003055495256506e-07, "logits/chosen": -1.3605386018753052, "logits/rejected": -1.178406000137329, "logps/chosen": -190.85678100585938, "logps/rejected": -172.95631408691406, "loss": 0.6729, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.02527732215821743, "rewards/margins": 0.03166641667485237, "rewards/margins_max": 0.04430197551846504, "rewards/margins_min": 0.01903085596859455, "rewards/margins_std": 0.017869381234049797, "rewards/rejected": -0.00638909637928009, "step": 2900 }, { "epoch": 0.66, "grad_norm": 0.349609375, "learning_rate": 1.581926920846196e-07, "logits/chosen": -1.3835715055465698, "logits/rejected": -1.0120103359222412, "logps/chosen": -226.8507843017578, "logps/rejected": -230.98605346679688, "loss": 0.6683, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.04309115558862686, "rewards/margins": 0.054244499653577805, "rewards/margins_max": 0.08164414763450623, "rewards/margins_min": 0.02684485912322998, "rewards/margins_std": 0.03874894976615906, "rewards/rejected": -0.011153348721563816, "step": 2910 }, { "epoch": 0.66, "grad_norm": 0.47265625, "learning_rate": 1.5636054650887847e-07, "logits/chosen": -1.367553949356079, "logits/rejected": -1.1866862773895264, "logps/chosen": -218.7686309814453, "logps/rejected": -213.06802368164062, "loss": 0.6681, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03090577758848667, "rewards/margins": 0.058771491050720215, "rewards/margins_max": 0.09552975744009018, "rewards/margins_min": 0.022013235837221146, "rewards/margins_std": 0.0519840307533741, "rewards/rejected": -0.02786571905016899, "step": 2920 }, { "epoch": 0.66, "grad_norm": 0.47265625, "learning_rate": 1.5453423232204965e-07, "logits/chosen": -1.5142792463302612, "logits/rejected": -1.318814754486084, "logps/chosen": -169.22059631347656, "logps/rejected": -188.27011108398438, "loss": 0.6646, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.0322001650929451, "rewards/margins": 0.05324719101190567, "rewards/margins_max": 0.08077719062566757, "rewards/margins_min": 0.025717195123434067, "rewards/margins_std": 0.038933295756578445, "rewards/rejected": -0.02104702964425087, "step": 2930 }, { "epoch": 0.66, "grad_norm": 0.447265625, "learning_rate": 1.5271386325769226e-07, "logits/chosen": -1.4925755262374878, "logits/rejected": -1.129923701286316, "logps/chosen": -187.85366821289062, "logps/rejected": -205.51449584960938, "loss": 0.6653, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.0343579538166523, "rewards/margins": 0.06020587682723999, "rewards/margins_max": 0.09044940769672394, "rewards/margins_min": 0.029962360858917236, "rewards/margins_std": 0.04277079552412033, "rewards/rejected": -0.02584792673587799, "step": 2940 }, { "epoch": 0.67, "grad_norm": 0.412109375, "learning_rate": 1.5089955267913302e-07, "logits/chosen": -1.2845211029052734, "logits/rejected": -0.9166741371154785, "logps/chosen": -300.81634521484375, "logps/rejected": -228.6632843017578, "loss": 0.6699, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03172794356942177, "rewards/margins": 0.044202424585819244, "rewards/margins_max": 0.06601408123970032, "rewards/margins_min": 0.02239074558019638, "rewards/margins_std": 0.030846362933516502, "rewards/rejected": -0.012474477291107178, "step": 2950 }, { "epoch": 0.67, "grad_norm": 0.421875, "learning_rate": 1.490914135724073e-07, "logits/chosen": -1.4572781324386597, "logits/rejected": -1.0811411142349243, "logps/chosen": -282.17138671875, "logps/rejected": -236.5408172607422, "loss": 0.6692, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.0302413459867239, "rewards/margins": 0.0462752990424633, "rewards/margins_max": 0.07116423547267914, "rewards/margins_min": 0.021386370062828064, "rewards/margins_std": 0.03519826382398605, "rewards/rejected": -0.0160339567810297, "step": 2960 }, { "epoch": 0.67, "grad_norm": 0.35546875, "learning_rate": 1.4728955853922237e-07, "logits/chosen": -1.456148386001587, "logits/rejected": -1.277165174484253, "logps/chosen": -204.0210418701172, "logps/rejected": -172.01878356933594, "loss": 0.6698, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.01876869983971119, "rewards/margins": 0.04803822189569473, "rewards/margins_max": 0.07267947494983673, "rewards/margins_min": 0.023396968841552734, "rewards/margins_std": 0.03484799712896347, "rewards/rejected": -0.029269522055983543, "step": 2970 }, { "epoch": 0.67, "grad_norm": 0.42578125, "learning_rate": 1.4549409978994542e-07, "logits/chosen": -1.3960371017456055, "logits/rejected": -1.0863512754440308, "logps/chosen": -288.02093505859375, "logps/rejected": -261.132080078125, "loss": 0.6708, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03317864611744881, "rewards/margins": 0.049657586961984634, "rewards/margins_max": 0.07338405400514603, "rewards/margins_min": 0.025931116193532944, "rewards/margins_std": 0.03355429321527481, "rewards/rejected": -0.016478940844535828, "step": 2980 }, { "epoch": 0.68, "grad_norm": 0.2890625, "learning_rate": 1.4370514913661573e-07, "logits/chosen": -1.269504189491272, "logits/rejected": -0.9573532938957214, "logps/chosen": -253.5911102294922, "logps/rejected": -173.530029296875, "loss": 0.6701, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03387542814016342, "rewards/margins": 0.05059955641627312, "rewards/margins_max": 0.07369405776262283, "rewards/margins_min": 0.027505064383149147, "rewards/margins_std": 0.032660551369190216, "rewards/rejected": -0.016724130138754845, "step": 2990 }, { "epoch": 0.68, "grad_norm": 0.390625, "learning_rate": 1.4192281798598133e-07, "logits/chosen": -1.4107468128204346, "logits/rejected": -1.2959485054016113, "logps/chosen": -170.9486083984375, "logps/rejected": -204.7427520751953, "loss": 0.6693, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03265909105539322, "rewards/margins": 0.052322544157505035, "rewards/margins_max": 0.08253692090511322, "rewards/margins_min": 0.022108152508735657, "rewards/margins_std": 0.04272959753870964, "rewards/rejected": -0.01966344751417637, "step": 3000 }, { "epoch": 0.68, "grad_norm": 0.4375, "learning_rate": 1.4014721733256135e-07, "logits/chosen": -1.5541356801986694, "logits/rejected": -1.1410366296768188, "logps/chosen": -249.2203369140625, "logps/rejected": -211.2033233642578, "loss": 0.6667, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.039340030401945114, "rewards/margins": 0.06089523434638977, "rewards/margins_max": 0.09748632460832596, "rewards/margins_min": 0.024304138496518135, "rewards/margins_std": 0.05174762010574341, "rewards/rejected": -0.02155519835650921, "step": 3010 }, { "epoch": 0.68, "grad_norm": 0.4609375, "learning_rate": 1.3837845775173373e-07, "logits/chosen": -1.369077444076538, "logits/rejected": -1.1096550226211548, "logps/chosen": -197.16085815429688, "logps/rejected": -163.0858612060547, "loss": 0.6699, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.0283735990524292, "rewards/margins": 0.04257078841328621, "rewards/margins_max": 0.06825915724039078, "rewards/margins_min": 0.01688242517411709, "rewards/margins_std": 0.036328837275505066, "rewards/rejected": -0.014197193086147308, "step": 3020 }, { "epoch": 0.68, "grad_norm": 0.35546875, "learning_rate": 1.3661664939284928e-07, "logits/chosen": -1.3072458505630493, "logits/rejected": -1.0293340682983398, "logps/chosen": -256.4021301269531, "logps/rejected": -215.84683227539062, "loss": 0.6654, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.036074064671993256, "rewards/margins": 0.05670546740293503, "rewards/margins_max": 0.08541737496852875, "rewards/margins_min": 0.027993574738502502, "rewards/margins_std": 0.040604762732982635, "rewards/rejected": -0.02063140645623207, "step": 3030 }, { "epoch": 0.69, "grad_norm": 0.3359375, "learning_rate": 1.3486190197237187e-07, "logits/chosen": -1.5194108486175537, "logits/rejected": -1.1841778755187988, "logps/chosen": -253.387939453125, "logps/rejected": -258.55517578125, "loss": 0.6671, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.04065632447600365, "rewards/margins": 0.06677254289388657, "rewards/margins_max": 0.0925917774438858, "rewards/margins_min": 0.04095330461859703, "rewards/margins_std": 0.03651391342282295, "rewards/rejected": -0.02611621282994747, "step": 3040 }, { "epoch": 0.69, "grad_norm": 0.380859375, "learning_rate": 1.3311432476704653e-07, "logits/chosen": -1.3569104671478271, "logits/rejected": -1.2485584020614624, "logps/chosen": -186.0038299560547, "logps/rejected": -232.54959106445312, "loss": 0.6726, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.026211729273200035, "rewards/margins": 0.047675721347332, "rewards/margins_max": 0.07000197470188141, "rewards/margins_min": 0.02534947171807289, "rewards/margins_std": 0.03157408535480499, "rewards/rejected": -0.021463993936777115, "step": 3050 }, { "epoch": 0.69, "grad_norm": 0.369140625, "learning_rate": 1.3137402660709311e-07, "logits/chosen": -1.2760577201843262, "logits/rejected": -1.2659679651260376, "logps/chosen": -175.79026794433594, "logps/rejected": -195.94700622558594, "loss": 0.675, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.026767786592245102, "rewards/margins": 0.03098815120756626, "rewards/margins_max": 0.04722337797284126, "rewards/margins_min": 0.01475292257964611, "rewards/margins_std": 0.022960076108574867, "rewards/rejected": -0.004220363683998585, "step": 3060 }, { "epoch": 0.69, "grad_norm": 0.48828125, "learning_rate": 1.2964111586942994e-07, "logits/chosen": -1.3129364252090454, "logits/rejected": -0.9301867485046387, "logps/chosen": -274.6208190917969, "logps/rejected": -219.16415405273438, "loss": 0.6687, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.027635782957077026, "rewards/margins": 0.05113440752029419, "rewards/margins_max": 0.08066648244857788, "rewards/margins_min": 0.021602336317300797, "rewards/margins_std": 0.04176466166973114, "rewards/rejected": -0.02349862828850746, "step": 3070 }, { "epoch": 0.7, "grad_norm": 0.310546875, "learning_rate": 1.2791570047092413e-07, "logits/chosen": -1.4004180431365967, "logits/rejected": -1.1162279844284058, "logps/chosen": -272.8247985839844, "logps/rejected": -223.28164672851562, "loss": 0.6668, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.020728105679154396, "rewards/margins": 0.05209376662969589, "rewards/margins_max": 0.07638858258724213, "rewards/margins_min": 0.027798956260085106, "rewards/margins_std": 0.034358054399490356, "rewards/rejected": -0.031365666538476944, "step": 3080 }, { "epoch": 0.7, "grad_norm": 0.265625, "learning_rate": 1.2619788786167112e-07, "logits/chosen": -1.3903374671936035, "logits/rejected": -1.3379945755004883, "logps/chosen": -199.85646057128906, "logps/rejected": -250.6023712158203, "loss": 0.6701, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.030954908579587936, "rewards/margins": 0.040415409952402115, "rewards/margins_max": 0.057001303881406784, "rewards/margins_min": 0.023829510435461998, "rewards/margins_std": 0.02345600351691246, "rewards/rejected": -0.009460503235459328, "step": 3090 }, { "epoch": 0.7, "grad_norm": 0.416015625, "learning_rate": 1.2448778501830378e-07, "logits/chosen": -1.4009287357330322, "logits/rejected": -1.1963551044464111, "logps/chosen": -194.9075927734375, "logps/rejected": -232.9965057373047, "loss": 0.6678, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03073223866522312, "rewards/margins": 0.046397607773542404, "rewards/margins_max": 0.0713280513882637, "rewards/margins_min": 0.021467158570885658, "rewards/margins_std": 0.03525697812438011, "rewards/rejected": -0.015665370970964432, "step": 3100 }, { "epoch": 0.7, "grad_norm": 0.3671875, "learning_rate": 1.2278549843732912e-07, "logits/chosen": -1.4153436422348022, "logits/rejected": -1.214393973350525, "logps/chosen": -203.56967163085938, "logps/rejected": -203.48190307617188, "loss": 0.6739, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.02419831044971943, "rewards/margins": 0.04225160926580429, "rewards/margins_max": 0.0629769042134285, "rewards/margins_min": 0.021526312455534935, "rewards/margins_std": 0.029309988021850586, "rewards/rejected": -0.018053295090794563, "step": 3110 }, { "epoch": 0.71, "grad_norm": 0.380859375, "learning_rate": 1.210911341284979e-07, "logits/chosen": -1.2526661157608032, "logits/rejected": -1.0572154521942139, "logps/chosen": -161.1710662841797, "logps/rejected": -198.1621856689453, "loss": 0.6739, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.02532758191227913, "rewards/margins": 0.029836729168891907, "rewards/margins_max": 0.047510914504528046, "rewards/margins_min": 0.012162544764578342, "rewards/margins_std": 0.024995077401399612, "rewards/rejected": -0.00450914865359664, "step": 3120 }, { "epoch": 0.71, "grad_norm": 0.31640625, "learning_rate": 1.1940479760820175e-07, "logits/chosen": -1.366984486579895, "logits/rejected": -1.0967432260513306, "logps/chosen": -201.30935668945312, "logps/rejected": -235.0970916748047, "loss": 0.6658, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0381210520863533, "rewards/margins": 0.06330465525388718, "rewards/margins_max": 0.10113723576068878, "rewards/margins_min": 0.02547208033502102, "rewards/margins_std": 0.05350334197282791, "rewards/rejected": -0.025183597579598427, "step": 3130 }, { "epoch": 0.71, "grad_norm": 0.56640625, "learning_rate": 1.1772659389290207e-07, "logits/chosen": -1.4767048358917236, "logits/rejected": -1.0964723825454712, "logps/chosen": -260.4814453125, "logps/rejected": -240.03036499023438, "loss": 0.6711, "rewards/accuracies": 0.875, "rewards/chosen": 0.040913086384534836, "rewards/margins": 0.04774344712495804, "rewards/margins_max": 0.07724296301603317, "rewards/margins_min": 0.018243929371237755, "rewards/margins_std": 0.04171861708164215, "rewards/rejected": -0.006830359809100628, "step": 3140 }, { "epoch": 0.71, "grad_norm": 0.384765625, "learning_rate": 1.160566274925912e-07, "logits/chosen": -1.4270305633544922, "logits/rejected": -1.0017156600952148, "logps/chosen": -218.6697235107422, "logps/rejected": -166.73095703125, "loss": 0.6642, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.034726254642009735, "rewards/margins": 0.055394403636455536, "rewards/margins_max": 0.08493579924106598, "rewards/margins_min": 0.02585301361978054, "rewards/margins_std": 0.0417778417468071, "rewards/rejected": -0.0206681527197361, "step": 3150 }, { "epoch": 0.71, "grad_norm": 0.36328125, "learning_rate": 1.1439500240428301e-07, "logits/chosen": -1.41231369972229, "logits/rejected": -1.115092158317566, "logps/chosen": -134.1064910888672, "logps/rejected": -153.6800079345703, "loss": 0.6696, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.024014055728912354, "rewards/margins": 0.03822372108697891, "rewards/margins_max": 0.05592702701687813, "rewards/margins_min": 0.020520424470305443, "rewards/margins_std": 0.025036249309778214, "rewards/rejected": -0.014209670014679432, "step": 3160 }, { "epoch": 0.72, "grad_norm": 0.49609375, "learning_rate": 1.1274182210553698e-07, "logits/chosen": -1.2921323776245117, "logits/rejected": -1.0127956867218018, "logps/chosen": -283.88250732421875, "logps/rejected": -273.15899658203125, "loss": 0.6674, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03838573768734932, "rewards/margins": 0.05664980411529541, "rewards/margins_max": 0.09717298299074173, "rewards/margins_min": 0.016126640141010284, "rewards/margins_std": 0.057308416813611984, "rewards/rejected": -0.01826407015323639, "step": 3170 }, { "epoch": 0.72, "grad_norm": 0.421875, "learning_rate": 1.1109718954801397e-07, "logits/chosen": -1.273221492767334, "logits/rejected": -1.1316196918487549, "logps/chosen": -238.09048461914062, "logps/rejected": -217.0200958251953, "loss": 0.6674, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.027058366686105728, "rewards/margins": 0.05345930904150009, "rewards/margins_max": 0.08340740203857422, "rewards/margins_min": 0.02351122722029686, "rewards/margins_std": 0.04235298931598663, "rewards/rejected": -0.02640094980597496, "step": 3180 }, { "epoch": 0.72, "grad_norm": 0.451171875, "learning_rate": 1.0946120715106511e-07, "logits/chosen": -1.3681175708770752, "logits/rejected": -1.0219160318374634, "logps/chosen": -260.44451904296875, "logps/rejected": -198.5891876220703, "loss": 0.6619, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.02242262475192547, "rewards/margins": 0.051530640572309494, "rewards/margins_max": 0.07956963032484055, "rewards/margins_min": 0.023491645231842995, "rewards/margins_std": 0.03965312987565994, "rewards/rejected": -0.029108017683029175, "step": 3190 }, { "epoch": 0.72, "grad_norm": 0.3359375, "learning_rate": 1.0783397679535342e-07, "logits/chosen": -1.4784858226776123, "logits/rejected": -1.2056283950805664, "logps/chosen": -255.98971557617188, "logps/rejected": -197.01263427734375, "loss": 0.6738, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.0324072502553463, "rewards/margins": 0.03942598029971123, "rewards/margins_max": 0.058343105018138885, "rewards/margins_min": 0.020508846268057823, "rewards/margins_std": 0.026752863079309464, "rewards/rejected": -0.007018730044364929, "step": 3200 }, { "epoch": 0.73, "grad_norm": 0.43359375, "learning_rate": 1.0621559981650938e-07, "logits/chosen": -1.4148783683776855, "logits/rejected": -1.1107372045516968, "logps/chosen": -188.5048828125, "logps/rejected": -159.65768432617188, "loss": 0.6722, "rewards/accuracies": 0.875, "rewards/chosen": 0.027665521949529648, "rewards/margins": 0.04687776416540146, "rewards/margins_max": 0.07250744104385376, "rewards/margins_min": 0.02124808356165886, "rewards/margins_std": 0.036245837807655334, "rewards/rejected": -0.019212238490581512, "step": 3210 }, { "epoch": 0.73, "grad_norm": 0.408203125, "learning_rate": 1.0460617699882011e-07, "logits/chosen": -1.4457701444625854, "logits/rejected": -1.1618094444274902, "logps/chosen": -177.31356811523438, "logps/rejected": -189.93397521972656, "loss": 0.67, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.026849735528230667, "rewards/margins": 0.04811318963766098, "rewards/margins_max": 0.07635174691677094, "rewards/margins_min": 0.019874632358551025, "rewards/margins_std": 0.03993535786867142, "rewards/rejected": -0.02126346156001091, "step": 3220 }, { "epoch": 0.73, "grad_norm": 0.515625, "learning_rate": 1.0300580856895319e-07, "logits/chosen": -1.3305940628051758, "logits/rejected": -1.2187252044677734, "logps/chosen": -232.2171630859375, "logps/rejected": -211.6332550048828, "loss": 0.6692, "rewards/accuracies": 0.875, "rewards/chosen": 0.02285153605043888, "rewards/margins": 0.04603450745344162, "rewards/margins_max": 0.07678231596946716, "rewards/margins_min": 0.01528670359402895, "rewards/margins_std": 0.04348396137356758, "rewards/rejected": -0.023182973265647888, "step": 3230 }, { "epoch": 0.73, "grad_norm": 0.52734375, "learning_rate": 1.0141459418971495e-07, "logits/chosen": -1.3796216249465942, "logits/rejected": -1.17972731590271, "logps/chosen": -209.474853515625, "logps/rejected": -201.80372619628906, "loss": 0.669, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03370387852191925, "rewards/margins": 0.04449697583913803, "rewards/margins_max": 0.07795051485300064, "rewards/margins_min": 0.011043445207178593, "rewards/margins_std": 0.047310441732406616, "rewards/rejected": -0.010793101973831654, "step": 3240 }, { "epoch": 0.73, "grad_norm": 0.5546875, "learning_rate": 9.983263295384389e-08, "logits/chosen": -1.5797169208526611, "logits/rejected": -1.3436121940612793, "logps/chosen": -205.55026245117188, "logps/rejected": -228.02291870117188, "loss": 0.6747, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.041708528995513916, "rewards/margins": 0.040844693779945374, "rewards/margins_max": 0.06195106357336044, "rewards/margins_min": 0.01973830722272396, "rewards/margins_std": 0.029848933219909668, "rewards/rejected": 0.0008638384751975536, "step": 3250 }, { "epoch": 0.74, "grad_norm": 0.404296875, "learning_rate": 9.826002337784017e-08, "logits/chosen": -1.5048518180847168, "logits/rejected": -1.15309476852417, "logps/chosen": -297.9679260253906, "logps/rejected": -218.27078247070312, "loss": 0.6722, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.027714397758245468, "rewards/margins": 0.04562808573246002, "rewards/margins_max": 0.07222025096416473, "rewards/margins_min": 0.019035929813981056, "rewards/margins_std": 0.037606991827487946, "rewards/rejected": -0.017913687974214554, "step": 3260 }, { "epoch": 0.74, "grad_norm": 0.326171875, "learning_rate": 9.669686339582958e-08, "logits/chosen": -1.4064114093780518, "logits/rejected": -1.196653127670288, "logps/chosen": -191.02151489257812, "logps/rejected": -213.56784057617188, "loss": 0.6618, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03829699009656906, "rewards/margins": 0.05901988595724106, "rewards/margins_max": 0.08912698924541473, "rewards/margins_min": 0.02891278825700283, "rewards/margins_std": 0.04257786646485329, "rewards/rejected": -0.020722895860671997, "step": 3270 }, { "epoch": 0.74, "grad_norm": 0.4296875, "learning_rate": 9.514325035346576e-08, "logits/chosen": -1.313220739364624, "logits/rejected": -1.0566840171813965, "logps/chosen": -216.5675048828125, "logps/rejected": -237.94631958007812, "loss": 0.6664, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03073856234550476, "rewards/margins": 0.05257689952850342, "rewards/margins_max": 0.08035125583410263, "rewards/margins_min": 0.02480255253612995, "rewards/margins_std": 0.03927886113524437, "rewards/rejected": -0.021838339045643806, "step": 3280 }, { "epoch": 0.74, "grad_norm": 0.380859375, "learning_rate": 9.359928100186723e-08, "logits/chosen": -1.4472345113754272, "logits/rejected": -1.2550368309020996, "logps/chosen": -246.9860076904297, "logps/rejected": -249.814697265625, "loss": 0.67, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.026788320392370224, "rewards/margins": 0.04281577467918396, "rewards/margins_max": 0.07860226929187775, "rewards/margins_min": 0.007029279135167599, "rewards/margins_std": 0.05060974508523941, "rewards/rejected": -0.016027452424168587, "step": 3290 }, { "epoch": 0.75, "grad_norm": 0.40234375, "learning_rate": 9.206505149159258e-08, "logits/chosen": -1.3831764459609985, "logits/rejected": -1.0240981578826904, "logps/chosen": -265.1828308105469, "logps/rejected": -269.2750549316406, "loss": 0.6666, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.04783134162425995, "rewards/margins": 0.05456867069005966, "rewards/margins_max": 0.08805376291275024, "rewards/margins_min": 0.021083565428853035, "rewards/margins_std": 0.047355085611343384, "rewards/rejected": -0.006737329065799713, "step": 3300 }, { "epoch": 0.75, "grad_norm": 0.5234375, "learning_rate": 9.054065736665267e-08, "logits/chosen": -1.4093387126922607, "logits/rejected": -1.242791771888733, "logps/chosen": -176.42868041992188, "logps/rejected": -244.3980712890625, "loss": 0.668, "rewards/accuracies": 0.875, "rewards/chosen": 0.02831677719950676, "rewards/margins": 0.0456489659845829, "rewards/margins_max": 0.06995344907045364, "rewards/margins_min": 0.02134447731077671, "rewards/margins_std": 0.03437173739075661, "rewards/rejected": -0.017332185059785843, "step": 3310 }, { "epoch": 0.75, "grad_norm": 0.390625, "learning_rate": 8.90261935585603e-08, "logits/chosen": -1.288233757019043, "logits/rejected": -1.1420161724090576, "logps/chosen": -177.0896759033203, "logps/rejected": -164.56689453125, "loss": 0.6718, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.026926511898636818, "rewards/margins": 0.04227718710899353, "rewards/margins_max": 0.06220870465040207, "rewards/margins_min": 0.022345667704939842, "rewards/margins_std": 0.028187427669763565, "rewards/rejected": -0.015350677073001862, "step": 3320 }, { "epoch": 0.75, "grad_norm": 0.416015625, "learning_rate": 8.752175438041906e-08, "logits/chosen": -1.3834733963012695, "logits/rejected": -1.1185014247894287, "logps/chosen": -218.1145477294922, "logps/rejected": -277.34393310546875, "loss": 0.6656, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.02830580435693264, "rewards/margins": 0.051726728677749634, "rewards/margins_max": 0.08054015040397644, "rewards/margins_min": 0.022913306951522827, "rewards/margins_std": 0.04074833169579506, "rewards/rejected": -0.023420918732881546, "step": 3330 }, { "epoch": 0.75, "grad_norm": 0.439453125, "learning_rate": 8.602743352104936e-08, "logits/chosen": -1.3321176767349243, "logits/rejected": -1.1113364696502686, "logps/chosen": -216.62460327148438, "logps/rejected": -269.898193359375, "loss": 0.6677, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.025886043906211853, "rewards/margins": 0.047580696642398834, "rewards/margins_max": 0.07348641008138657, "rewards/margins_min": 0.02167496643960476, "rewards/margins_std": 0.03663622587919235, "rewards/rejected": -0.02169465273618698, "step": 3340 }, { "epoch": 0.76, "grad_norm": 0.458984375, "learning_rate": 8.454332403915415e-08, "logits/chosen": -1.3632304668426514, "logits/rejected": -1.2139122486114502, "logps/chosen": -168.23995971679688, "logps/rejected": -220.98562622070312, "loss": 0.668, "rewards/accuracies": 0.875, "rewards/chosen": 0.02623501978814602, "rewards/margins": 0.041043318808078766, "rewards/margins_max": 0.06607674062252045, "rewards/margins_min": 0.016009902581572533, "rewards/margins_std": 0.035402603447437286, "rewards/rejected": -0.014808299951255322, "step": 3350 }, { "epoch": 0.76, "grad_norm": 0.30859375, "learning_rate": 8.306951835752377e-08, "logits/chosen": -1.5067164897918701, "logits/rejected": -1.2972952127456665, "logps/chosen": -212.9749755859375, "logps/rejected": -215.76840209960938, "loss": 0.6737, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.02593279257416725, "rewards/margins": 0.036307260394096375, "rewards/margins_max": 0.0532100610435009, "rewards/margins_min": 0.0194044578820467, "rewards/margins_std": 0.02390417270362377, "rewards/rejected": -0.010374465957283974, "step": 3360 }, { "epoch": 0.76, "grad_norm": 0.345703125, "learning_rate": 8.160610825728029e-08, "logits/chosen": -1.3713653087615967, "logits/rejected": -1.1629103422164917, "logps/chosen": -179.63717651367188, "logps/rejected": -169.6957550048828, "loss": 0.6703, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.034700967371463776, "rewards/margins": 0.0471280999481678, "rewards/margins_max": 0.07532576471567154, "rewards/margins_min": 0.018930435180664062, "rewards/margins_std": 0.0398775152862072, "rewards/rejected": -0.012427128851413727, "step": 3370 }, { "epoch": 0.76, "grad_norm": 0.400390625, "learning_rate": 8.015318487216183e-08, "logits/chosen": -1.413757085800171, "logits/rejected": -1.0182082653045654, "logps/chosen": -262.51885986328125, "logps/rejected": -228.93295288085938, "loss": 0.6683, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.038941092789173126, "rewards/margins": 0.05965230613946915, "rewards/margins_max": 0.08189304918050766, "rewards/margins_min": 0.03741155192255974, "rewards/margins_std": 0.031453169882297516, "rewards/rejected": -0.020711207762360573, "step": 3380 }, { "epoch": 0.77, "grad_norm": 0.3828125, "learning_rate": 7.871083868284725e-08, "logits/chosen": -1.2200084924697876, "logits/rejected": -0.9986445307731628, "logps/chosen": -191.07162475585938, "logps/rejected": -171.0322723388672, "loss": 0.6723, "rewards/accuracies": 0.875, "rewards/chosen": 0.023246588185429573, "rewards/margins": 0.03981078788638115, "rewards/margins_max": 0.06385910511016846, "rewards/margins_min": 0.01576247252523899, "rewards/margins_std": 0.034009456634521484, "rewards/rejected": -0.016564201563596725, "step": 3390 }, { "epoch": 0.77, "grad_norm": 0.3984375, "learning_rate": 7.727915951132144e-08, "logits/chosen": -1.4563645124435425, "logits/rejected": -1.103590488433838, "logps/chosen": -294.8932189941406, "logps/rejected": -303.7432556152344, "loss": 0.6668, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.029066193848848343, "rewards/margins": 0.050104547291994095, "rewards/margins_max": 0.07443893700838089, "rewards/margins_min": 0.025770163163542747, "rewards/margins_std": 0.03441401198506355, "rewards/rejected": -0.02103835716843605, "step": 3400 }, { "epoch": 0.77, "grad_norm": 0.51171875, "learning_rate": 7.585823651528156e-08, "logits/chosen": -1.3338501453399658, "logits/rejected": -1.1717281341552734, "logps/chosen": -220.5869140625, "logps/rejected": -257.8363037109375, "loss": 0.6662, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.02696543000638485, "rewards/margins": 0.04337473213672638, "rewards/margins_max": 0.07208983600139618, "rewards/margins_min": 0.014659630134701729, "rewards/margins_std": 0.04060928896069527, "rewards/rejected": -0.01640930399298668, "step": 3410 }, { "epoch": 0.77, "grad_norm": 0.50390625, "learning_rate": 7.444815818258527e-08, "logits/chosen": -1.3896772861480713, "logits/rejected": -1.2830214500427246, "logps/chosen": -203.7684783935547, "logps/rejected": -247.3974609375, "loss": 0.6671, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03583263233304024, "rewards/margins": 0.052742041647434235, "rewards/margins_max": 0.08703459054231644, "rewards/margins_min": 0.018449490889906883, "rewards/margins_std": 0.04849698767066002, "rewards/rejected": -0.016909409314393997, "step": 3420 }, { "epoch": 0.78, "grad_norm": 0.30859375, "learning_rate": 7.304901232573906e-08, "logits/chosen": -1.5021190643310547, "logits/rejected": -1.2359145879745483, "logps/chosen": -163.47137451171875, "logps/rejected": -196.88661193847656, "loss": 0.6678, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.04131701588630676, "rewards/margins": 0.05625222995877266, "rewards/margins_max": 0.08685590326786041, "rewards/margins_min": 0.02564854361116886, "rewards/margins_std": 0.043280139565467834, "rewards/rejected": -0.014935208484530449, "step": 3430 }, { "epoch": 0.78, "grad_norm": 0.369140625, "learning_rate": 7.166088607643123e-08, "logits/chosen": -1.322670817375183, "logits/rejected": -1.0070643424987793, "logps/chosen": -214.02304077148438, "logps/rejected": -234.9347381591797, "loss": 0.6688, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.026626717299222946, "rewards/margins": 0.05677711218595505, "rewards/margins_max": 0.08463828265666962, "rewards/margins_min": 0.028915945440530777, "rewards/margins_std": 0.039401642978191376, "rewards/rejected": -0.0301503948867321, "step": 3440 }, { "epoch": 0.78, "grad_norm": 0.53125, "learning_rate": 7.02838658801042e-08, "logits/chosen": -1.4200587272644043, "logits/rejected": -1.102832555770874, "logps/chosen": -280.0638732910156, "logps/rejected": -235.215576171875, "loss": 0.6686, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03843649849295616, "rewards/margins": 0.058247823268175125, "rewards/margins_max": 0.08686522394418716, "rewards/margins_min": 0.029630441218614578, "rewards/margins_std": 0.04047109931707382, "rewards/rejected": -0.01981133408844471, "step": 3450 }, { "epoch": 0.78, "grad_norm": 0.404296875, "learning_rate": 6.891803749057254e-08, "logits/chosen": -1.5478912591934204, "logits/rejected": -1.177132248878479, "logps/chosen": -257.420654296875, "logps/rejected": -240.5295867919922, "loss": 0.6698, "rewards/accuracies": 0.875, "rewards/chosen": 0.037167832255363464, "rewards/margins": 0.04988235980272293, "rewards/margins_max": 0.08287341892719269, "rewards/margins_min": 0.016891302540898323, "rewards/margins_std": 0.046656396239995956, "rewards/rejected": -0.012714529410004616, "step": 3460 }, { "epoch": 0.78, "grad_norm": 0.3671875, "learning_rate": 6.756348596468167e-08, "logits/chosen": -1.4443600177764893, "logits/rejected": -1.1571996212005615, "logps/chosen": -217.07919311523438, "logps/rejected": -204.60067749023438, "loss": 0.6689, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.034954361617565155, "rewards/margins": 0.04938163235783577, "rewards/margins_max": 0.0719556212425232, "rewards/margins_min": 0.026807645335793495, "rewards/margins_std": 0.03192444145679474, "rewards/rejected": -0.01442726980894804, "step": 3470 }, { "epoch": 0.79, "grad_norm": 0.32421875, "learning_rate": 6.622029565701118e-08, "logits/chosen": -1.420827865600586, "logits/rejected": -1.2825648784637451, "logps/chosen": -182.84640502929688, "logps/rejected": -178.5884246826172, "loss": 0.6713, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.028441939502954483, "rewards/margins": 0.04441531002521515, "rewards/margins_max": 0.07214485108852386, "rewards/margins_min": 0.01668578013777733, "rewards/margins_std": 0.03921548277139664, "rewards/rejected": -0.015973379835486412, "step": 3480 }, { "epoch": 0.79, "grad_norm": 0.4921875, "learning_rate": 6.488855021462216e-08, "logits/chosen": -1.2294635772705078, "logits/rejected": -0.9490046501159668, "logps/chosen": -220.43417358398438, "logps/rejected": -197.18508911132812, "loss": 0.6711, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.033244356513023376, "rewards/margins": 0.047328390181064606, "rewards/margins_max": 0.0760948583483696, "rewards/margins_min": 0.01856192573904991, "rewards/margins_std": 0.04068192094564438, "rewards/rejected": -0.014084036462008953, "step": 3490 }, { "epoch": 0.79, "grad_norm": 0.251953125, "learning_rate": 6.356833257184746e-08, "logits/chosen": -1.2230401039123535, "logits/rejected": -1.1300714015960693, "logps/chosen": -174.61322021484375, "logps/rejected": -209.37661743164062, "loss": 0.6667, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.02614758536219597, "rewards/margins": 0.05365358665585518, "rewards/margins_max": 0.07639142125844955, "rewards/margins_min": 0.030915748327970505, "rewards/margins_std": 0.03215615823864937, "rewards/rejected": -0.02750600315630436, "step": 3500 }, { "epoch": 0.79, "grad_norm": 0.419921875, "learning_rate": 6.225972494512718e-08, "logits/chosen": -1.4761110544204712, "logits/rejected": -1.244800329208374, "logps/chosen": -222.0691375732422, "logps/rejected": -243.110107421875, "loss": 0.6726, "rewards/accuracies": 0.875, "rewards/chosen": 0.018352871760725975, "rewards/margins": 0.04645923897624016, "rewards/margins_max": 0.07281027734279633, "rewards/margins_min": 0.020108195021748543, "rewards/margins_std": 0.03726600855588913, "rewards/rejected": -0.02810637094080448, "step": 3510 }, { "epoch": 0.8, "grad_norm": 0.333984375, "learning_rate": 6.096280882788874e-08, "logits/chosen": -1.3621561527252197, "logits/rejected": -1.188506007194519, "logps/chosen": -170.3199005126953, "logps/rejected": -183.8995361328125, "loss": 0.6728, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.030621353536844254, "rewards/margins": 0.03551565110683441, "rewards/margins_max": 0.06205816939473152, "rewards/margins_min": 0.008973127231001854, "rewards/margins_std": 0.03753679618239403, "rewards/rejected": -0.004894299898296595, "step": 3520 }, { "epoch": 0.8, "grad_norm": 0.484375, "learning_rate": 5.96776649854718e-08, "logits/chosen": -1.4086308479309082, "logits/rejected": -1.1658676862716675, "logps/chosen": -210.52297973632812, "logps/rejected": -202.68783569335938, "loss": 0.6701, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.028438914567232132, "rewards/margins": 0.04048081114888191, "rewards/margins_max": 0.059843193739652634, "rewards/margins_min": 0.021118421107530594, "rewards/margins_std": 0.02738254889845848, "rewards/rejected": -0.012041894719004631, "step": 3530 }, { "epoch": 0.8, "grad_norm": 0.44921875, "learning_rate": 5.840437345009858e-08, "logits/chosen": -1.2647746801376343, "logits/rejected": -1.0508357286453247, "logps/chosen": -225.2239532470703, "logps/rejected": -225.44296264648438, "loss": 0.6713, "rewards/accuracies": 0.875, "rewards/chosen": 0.030933257192373276, "rewards/margins": 0.04926546663045883, "rewards/margins_max": 0.07699280232191086, "rewards/margins_min": 0.02153814025223255, "rewards/margins_std": 0.03921236842870712, "rewards/rejected": -0.018332213163375854, "step": 3540 }, { "epoch": 0.8, "grad_norm": 0.2890625, "learning_rate": 5.7143013515890074e-08, "logits/chosen": -1.4881861209869385, "logits/rejected": -1.179818868637085, "logps/chosen": -196.1507110595703, "logps/rejected": -187.4951629638672, "loss": 0.6673, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03177567571401596, "rewards/margins": 0.048626724630594254, "rewards/margins_max": 0.08106863498687744, "rewards/margins_min": 0.016184817999601364, "rewards/margins_std": 0.0458797886967659, "rewards/rejected": -0.016851048916578293, "step": 3550 }, { "epoch": 0.8, "grad_norm": 0.40234375, "learning_rate": 5.589366373392754e-08, "logits/chosen": -1.2353525161743164, "logits/rejected": -1.0010229349136353, "logps/chosen": -240.0959014892578, "logps/rejected": -198.23251342773438, "loss": 0.6655, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.04709520563483238, "rewards/margins": 0.060860536992549896, "rewards/margins_max": 0.08691618591547012, "rewards/margins_min": 0.034804895520210266, "rewards/margins_std": 0.03684823960065842, "rewards/rejected": -0.013765333220362663, "step": 3560 }, { "epoch": 0.81, "grad_norm": 0.478515625, "learning_rate": 5.465640190736123e-08, "logits/chosen": -1.3841360807418823, "logits/rejected": -1.2201907634735107, "logps/chosen": -203.7855987548828, "logps/rejected": -190.33120727539062, "loss": 0.6728, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.017639171332120895, "rewards/margins": 0.028413286432623863, "rewards/margins_max": 0.04533935338258743, "rewards/margins_min": 0.011487223207950592, "rewards/margins_std": 0.02393706701695919, "rewards/rejected": -0.010774116031825542, "step": 3570 }, { "epoch": 0.81, "grad_norm": 0.345703125, "learning_rate": 5.343130508656501e-08, "logits/chosen": -1.4076919555664062, "logits/rejected": -1.1919476985931396, "logps/chosen": -218.1761016845703, "logps/rejected": -223.3293914794922, "loss": 0.6708, "rewards/accuracies": 0.875, "rewards/chosen": 0.024606024846434593, "rewards/margins": 0.03831148147583008, "rewards/margins_max": 0.05184120684862137, "rewards/margins_min": 0.024781761690974236, "rewards/margins_std": 0.0191339161247015, "rewards/rejected": -0.013705459423363209, "step": 3580 }, { "epoch": 0.81, "grad_norm": 0.380859375, "learning_rate": 5.221844956433794e-08, "logits/chosen": -1.3877627849578857, "logits/rejected": -1.050703525543213, "logps/chosen": -204.9930419921875, "logps/rejected": -199.54371643066406, "loss": 0.6685, "rewards/accuracies": 0.875, "rewards/chosen": 0.0310288667678833, "rewards/margins": 0.05146826431155205, "rewards/margins_max": 0.079628124833107, "rewards/margins_min": 0.023308411240577698, "rewards/margins_std": 0.039824046194553375, "rewards/rejected": -0.020439397543668747, "step": 3590 }, { "epoch": 0.81, "grad_norm": 0.294921875, "learning_rate": 5.101791087115353e-08, "logits/chosen": -1.3680554628372192, "logits/rejected": -1.0979640483856201, "logps/chosen": -195.17733764648438, "logps/rejected": -175.19503784179688, "loss": 0.6659, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03740929067134857, "rewards/margins": 0.052658699452877045, "rewards/margins_max": 0.08210619539022446, "rewards/margins_min": 0.02321120724081993, "rewards/margins_std": 0.04164504259824753, "rewards/rejected": -0.0152494041249156, "step": 3600 }, { "epoch": 0.82, "grad_norm": 0.478515625, "learning_rate": 4.982976377045545e-08, "logits/chosen": -1.3179078102111816, "logits/rejected": -0.9852703809738159, "logps/chosen": -208.92697143554688, "logps/rejected": -201.54153442382812, "loss": 0.6651, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.029262781143188477, "rewards/margins": 0.057259321212768555, "rewards/margins_max": 0.08372489362955093, "rewards/margins_min": 0.030793756246566772, "rewards/margins_std": 0.03742796555161476, "rewards/rejected": -0.027996540069580078, "step": 3610 }, { "epoch": 0.82, "grad_norm": 0.4140625, "learning_rate": 4.865408225400233e-08, "logits/chosen": -1.4115145206451416, "logits/rejected": -1.1585103273391724, "logps/chosen": -178.40237426757812, "logps/rejected": -179.32286071777344, "loss": 0.6714, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.02827485278248787, "rewards/margins": 0.03694600984454155, "rewards/margins_max": 0.06516902148723602, "rewards/margins_min": 0.008723007515072823, "rewards/margins_std": 0.039913360029459, "rewards/rejected": -0.00867115892469883, "step": 3620 }, { "epoch": 0.82, "grad_norm": 0.365234375, "learning_rate": 4.749093953725952e-08, "logits/chosen": -1.4016355276107788, "logits/rejected": -1.1189079284667969, "logps/chosen": -238.0830841064453, "logps/rejected": -210.646728515625, "loss": 0.6685, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.025943368673324585, "rewards/margins": 0.04778265208005905, "rewards/margins_max": 0.06782630831003189, "rewards/margins_min": 0.02773899957537651, "rewards/margins_std": 0.02834600768983364, "rewards/rejected": -0.021839281544089317, "step": 3630 }, { "epoch": 0.82, "grad_norm": 0.390625, "learning_rate": 4.634040805483946e-08, "logits/chosen": -1.287687063217163, "logits/rejected": -0.9820324778556824, "logps/chosen": -209.3041229248047, "logps/rejected": -242.19949340820312, "loss": 0.6652, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03499635308980942, "rewards/margins": 0.04892340302467346, "rewards/margins_max": 0.07500474154949188, "rewards/margins_min": 0.022842060774564743, "rewards/margins_std": 0.03688458353281021, "rewards/rejected": -0.013927051797509193, "step": 3640 }, { "epoch": 0.83, "grad_norm": 0.38671875, "learning_rate": 4.5202559455991465e-08, "logits/chosen": -1.3673442602157593, "logits/rejected": -1.201022982597351, "logps/chosen": -193.3282470703125, "logps/rejected": -199.09353637695312, "loss": 0.6715, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.029926681891083717, "rewards/margins": 0.04664327949285507, "rewards/margins_max": 0.07682793587446213, "rewards/margins_min": 0.016458621248602867, "rewards/margins_std": 0.0426875576376915, "rewards/rejected": -0.016716599464416504, "step": 3650 }, { "epoch": 0.83, "grad_norm": 0.345703125, "learning_rate": 4.4077464600139116e-08, "logits/chosen": -1.451099157333374, "logits/rejected": -1.2489079236984253, "logps/chosen": -190.05506896972656, "logps/rejected": -183.50390625, "loss": 0.6707, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.022996725514531136, "rewards/margins": 0.04504052549600601, "rewards/margins_max": 0.06947029381990433, "rewards/margins_min": 0.020610753446817398, "rewards/margins_std": 0.034548915922641754, "rewards/rejected": -0.022043799981474876, "step": 3660 }, { "epoch": 0.83, "grad_norm": 0.345703125, "learning_rate": 4.296519355246775e-08, "logits/chosen": -1.4090425968170166, "logits/rejected": -0.9449017643928528, "logps/chosen": -281.3605041503906, "logps/rejected": -215.10693359375, "loss": 0.6673, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03155981004238129, "rewards/margins": 0.06298719346523285, "rewards/margins_max": 0.10227738320827484, "rewards/margins_min": 0.023697001859545708, "rewards/margins_std": 0.055564723908901215, "rewards/rejected": -0.03142738342285156, "step": 3670 }, { "epoch": 0.83, "grad_norm": 0.322265625, "learning_rate": 4.1865815579561234e-08, "logits/chosen": -1.3743921518325806, "logits/rejected": -1.051452398300171, "logps/chosen": -222.72189331054688, "logps/rejected": -199.6683349609375, "loss": 0.6653, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03226775676012039, "rewards/margins": 0.05695630982518196, "rewards/margins_max": 0.0883602723479271, "rewards/margins_min": 0.025552351027727127, "rewards/margins_std": 0.044411905109882355, "rewards/rejected": -0.02468855120241642, "step": 3680 }, { "epoch": 0.83, "grad_norm": 0.51171875, "learning_rate": 4.0779399145088236e-08, "logits/chosen": -1.42634117603302, "logits/rejected": -1.1020008325576782, "logps/chosen": -196.1156463623047, "logps/rejected": -203.8043670654297, "loss": 0.6641, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03701802343130112, "rewards/margins": 0.058898139744997025, "rewards/margins_max": 0.0883612185716629, "rewards/margins_min": 0.029435062780976295, "rewards/margins_std": 0.04166708141565323, "rewards/rejected": -0.02188010886311531, "step": 3690 }, { "epoch": 0.84, "grad_norm": 0.291015625, "learning_rate": 3.970601190553882e-08, "logits/chosen": -1.5326263904571533, "logits/rejected": -1.2081806659698486, "logps/chosen": -218.63272094726562, "logps/rejected": -206.61660766601562, "loss": 0.6718, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.030933424830436707, "rewards/margins": 0.04485446959733963, "rewards/margins_max": 0.07049910724163055, "rewards/margins_min": 0.01920982263982296, "rewards/margins_std": 0.036267004907131195, "rewards/rejected": -0.013921047560870647, "step": 3700 }, { "epoch": 0.84, "grad_norm": 0.41796875, "learning_rate": 3.864572070601099e-08, "logits/chosen": -1.3997539281845093, "logits/rejected": -1.076831579208374, "logps/chosen": -187.23780822753906, "logps/rejected": -215.34939575195312, "loss": 0.6692, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.04385649040341377, "rewards/margins": 0.056016553193330765, "rewards/margins_max": 0.08031867444515228, "rewards/margins_min": 0.03171443194150925, "rewards/margins_std": 0.03436839208006859, "rewards/rejected": -0.012160064652562141, "step": 3710 }, { "epoch": 0.84, "grad_norm": 0.3359375, "learning_rate": 3.7598591576048e-08, "logits/chosen": -1.5493652820587158, "logits/rejected": -1.246185541152954, "logps/chosen": -195.29684448242188, "logps/rejected": -215.9580078125, "loss": 0.6632, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.04298774152994156, "rewards/margins": 0.05915018916130066, "rewards/margins_max": 0.08389954268932343, "rewards/margins_min": 0.034400831907987595, "rewards/margins_std": 0.03500087186694145, "rewards/rejected": -0.016162443906068802, "step": 3720 }, { "epoch": 0.84, "grad_norm": 0.349609375, "learning_rate": 3.656468972552637e-08, "logits/chosen": -1.6022872924804688, "logits/rejected": -1.0952297449111938, "logps/chosen": -291.47442626953125, "logps/rejected": -227.7351531982422, "loss": 0.6704, "rewards/accuracies": 0.875, "rewards/chosen": 0.032902318984270096, "rewards/margins": 0.05154528096318245, "rewards/margins_max": 0.08279638737440109, "rewards/margins_min": 0.020294170826673508, "rewards/margins_std": 0.0441957451403141, "rewards/rejected": -0.018642958253622055, "step": 3730 }, { "epoch": 0.85, "grad_norm": 0.42578125, "learning_rate": 3.554407954059488e-08, "logits/chosen": -1.4918447732925415, "logits/rejected": -1.046549677848816, "logps/chosen": -270.91680908203125, "logps/rejected": -204.57913208007812, "loss": 0.6668, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03338420391082764, "rewards/margins": 0.05518590286374092, "rewards/margins_max": 0.09111050516366959, "rewards/margins_min": 0.019261294975876808, "rewards/margins_std": 0.050805073231458664, "rewards/rejected": -0.021801700815558434, "step": 3740 }, { "epoch": 0.85, "grad_norm": 0.30078125, "learning_rate": 3.4536824579665e-08, "logits/chosen": -1.3506847620010376, "logits/rejected": -1.2051067352294922, "logps/chosen": -166.91786193847656, "logps/rejected": -162.62588500976562, "loss": 0.6727, "rewards/accuracies": 0.875, "rewards/chosen": 0.0248137004673481, "rewards/margins": 0.04015351086854935, "rewards/margins_max": 0.060021065175533295, "rewards/margins_min": 0.020285960286855698, "rewards/margins_std": 0.028096962720155716, "rewards/rejected": -0.015339814126491547, "step": 3750 }, { "epoch": 0.85, "grad_norm": 0.43359375, "learning_rate": 3.354298756945292e-08, "logits/chosen": -1.447142481803894, "logits/rejected": -1.2159852981567383, "logps/chosen": -184.87144470214844, "logps/rejected": -246.08230590820312, "loss": 0.6675, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04362271726131439, "rewards/margins": 0.05569925904273987, "rewards/margins_max": 0.07866283506155014, "rewards/margins_min": 0.032735686749219894, "rewards/margins_std": 0.032475393265485764, "rewards/rejected": -0.012076543644070625, "step": 3760 }, { "epoch": 0.85, "grad_norm": 0.53515625, "learning_rate": 3.2562630401072793e-08, "logits/chosen": -1.4080158472061157, "logits/rejected": -1.023107647895813, "logps/chosen": -247.05734252929688, "logps/rejected": -232.3527069091797, "loss": 0.6699, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.037775538861751556, "rewards/margins": 0.05671767145395279, "rewards/margins_max": 0.08435182273387909, "rewards/margins_min": 0.029083510860800743, "rewards/margins_std": 0.03908059746026993, "rewards/rejected": -0.018942128866910934, "step": 3770 }, { "epoch": 0.85, "grad_norm": 0.392578125, "learning_rate": 3.159581412618309e-08, "logits/chosen": -1.4490994215011597, "logits/rejected": -1.2454156875610352, "logps/chosen": -218.85708618164062, "logps/rejected": -199.06924438476562, "loss": 0.6676, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.027776191011071205, "rewards/margins": 0.0423356369137764, "rewards/margins_max": 0.06463000178337097, "rewards/margins_min": 0.020041272044181824, "rewards/margins_std": 0.03152899444103241, "rewards/rejected": -0.014559444971382618, "step": 3780 }, { "epoch": 0.86, "grad_norm": 0.41015625, "learning_rate": 3.0642598953184164e-08, "logits/chosen": -1.4658455848693848, "logits/rejected": -0.8866893649101257, "logps/chosen": -239.5939483642578, "logps/rejected": -212.2051544189453, "loss": 0.6684, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.030214086174964905, "rewards/margins": 0.06262092292308807, "rewards/margins_max": 0.10303632915019989, "rewards/margins_min": 0.022205516695976257, "rewards/margins_std": 0.057156018912792206, "rewards/rejected": -0.03240683674812317, "step": 3790 }, { "epoch": 0.86, "grad_norm": 0.50390625, "learning_rate": 2.9703044243468866e-08, "logits/chosen": -1.3730664253234863, "logits/rejected": -1.1947462558746338, "logps/chosen": -193.0137176513672, "logps/rejected": -225.66714477539062, "loss": 0.6659, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.027939695864915848, "rewards/margins": 0.05449366569519043, "rewards/margins_max": 0.08065618574619293, "rewards/margins_min": 0.02833114191889763, "rewards/margins_std": 0.03699938952922821, "rewards/rejected": -0.026553967967629433, "step": 3800 }, { "epoch": 0.86, "grad_norm": 0.3046875, "learning_rate": 2.8777208507726054e-08, "logits/chosen": -1.3284589052200317, "logits/rejected": -1.029971718788147, "logps/chosen": -172.42007446289062, "logps/rejected": -186.41592407226562, "loss": 0.6658, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03503911942243576, "rewards/margins": 0.06267812103033066, "rewards/margins_max": 0.08370877057313919, "rewards/margins_min": 0.04164748266339302, "rewards/margins_std": 0.029741818085312843, "rewards/rejected": -0.027639007195830345, "step": 3810 }, { "epoch": 0.86, "grad_norm": 0.42578125, "learning_rate": 2.786514940229634e-08, "logits/chosen": -1.3927130699157715, "logits/rejected": -1.131127119064331, "logps/chosen": -211.1796112060547, "logps/rejected": -181.09854125976562, "loss": 0.6691, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.04030333831906319, "rewards/margins": 0.04070250317454338, "rewards/margins_max": 0.07060278952121735, "rewards/margins_min": 0.010802226141095161, "rewards/margins_std": 0.04228537529706955, "rewards/rejected": -0.0003991674748249352, "step": 3820 }, { "epoch": 0.87, "grad_norm": 0.4140625, "learning_rate": 2.6966923725582234e-08, "logits/chosen": -1.4242016077041626, "logits/rejected": -1.145247220993042, "logps/chosen": -230.03311157226562, "logps/rejected": -180.59939575195312, "loss": 0.6724, "rewards/accuracies": 0.875, "rewards/chosen": 0.026708319783210754, "rewards/margins": 0.034293271601200104, "rewards/margins_max": 0.0604860857129097, "rewards/margins_min": 0.008100450038909912, "rewards/margins_std": 0.03704223781824112, "rewards/rejected": -0.007584949489682913, "step": 3830 }, { "epoch": 0.87, "grad_norm": 0.376953125, "learning_rate": 2.6082587414510442e-08, "logits/chosen": -1.2004412412643433, "logits/rejected": -1.1661103963851929, "logps/chosen": -218.96182250976562, "logps/rejected": -263.70001220703125, "loss": 0.6686, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.020230960100889206, "rewards/margins": 0.04539378732442856, "rewards/margins_max": 0.06975904852151871, "rewards/margins_min": 0.021028511226177216, "rewards/margins_std": 0.034457698464393616, "rewards/rejected": -0.025162819772958755, "step": 3840 }, { "epoch": 0.87, "grad_norm": 0.3984375, "learning_rate": 2.5212195541048698e-08, "logits/chosen": -1.3625192642211914, "logits/rejected": -1.177557349205017, "logps/chosen": -297.97906494140625, "logps/rejected": -289.76824951171875, "loss": 0.6671, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.03726391866803169, "rewards/margins": 0.05677323415875435, "rewards/margins_max": 0.07760507613420486, "rewards/margins_min": 0.035941388458013535, "rewards/margins_std": 0.02946067787706852, "rewards/rejected": -0.019509317353367805, "step": 3850 }, { "epoch": 0.87, "grad_norm": 0.60546875, "learning_rate": 2.435580230877607e-08, "logits/chosen": -1.294594168663025, "logits/rejected": -1.0570218563079834, "logps/chosen": -188.47618103027344, "logps/rejected": -227.89230346679688, "loss": 0.6712, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.036184389144182205, "rewards/margins": 0.04435017332434654, "rewards/margins_max": 0.07742326706647873, "rewards/margins_min": 0.011277077719569206, "rewards/margins_std": 0.046772416681051254, "rewards/rejected": -0.008165782317519188, "step": 3860 }, { "epoch": 0.87, "grad_norm": 0.328125, "learning_rate": 2.3513461049507383e-08, "logits/chosen": -1.2781083583831787, "logits/rejected": -1.0610449314117432, "logps/chosen": -178.37767028808594, "logps/rejected": -186.87088012695312, "loss": 0.6707, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.035525936633348465, "rewards/margins": 0.04440684616565704, "rewards/margins_max": 0.0715843215584755, "rewards/margins_min": 0.017229357734322548, "rewards/margins_std": 0.03843476623296738, "rewards/rejected": -0.008880906738340855, "step": 3870 }, { "epoch": 0.88, "grad_norm": 0.384765625, "learning_rate": 2.2685224219972183e-08, "logits/chosen": -1.451965570449829, "logits/rejected": -1.1790364980697632, "logps/chosen": -215.76779174804688, "logps/rejected": -293.38446044921875, "loss": 0.6697, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03224249556660652, "rewards/margins": 0.06852399557828903, "rewards/margins_max": 0.10147424042224884, "rewards/margins_min": 0.03557376563549042, "rewards/margins_std": 0.04659866914153099, "rewards/rejected": -0.03628150746226311, "step": 3880 }, { "epoch": 0.88, "grad_norm": 0.326171875, "learning_rate": 2.1871143398547733e-08, "logits/chosen": -1.3184213638305664, "logits/rejected": -1.0465301275253296, "logps/chosen": -235.9217071533203, "logps/rejected": -187.95343017578125, "loss": 0.6703, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.029069799929857254, "rewards/margins": 0.052199989557266235, "rewards/margins_max": 0.07874207943677902, "rewards/margins_min": 0.025657888501882553, "rewards/margins_std": 0.03753619268536568, "rewards/rejected": -0.023130184039473534, "step": 3890 }, { "epoch": 0.88, "grad_norm": 0.44140625, "learning_rate": 2.1071269282047195e-08, "logits/chosen": -1.3340555429458618, "logits/rejected": -0.9785248041152954, "logps/chosen": -216.3225555419922, "logps/rejected": -205.25595092773438, "loss": 0.6661, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03301094099879265, "rewards/margins": 0.06074405834078789, "rewards/margins_max": 0.09566470980644226, "rewards/margins_min": 0.02582341432571411, "rewards/margins_std": 0.04938525706529617, "rewards/rejected": -0.027733122929930687, "step": 3900 }, { "epoch": 0.88, "grad_norm": 0.451171875, "learning_rate": 2.0285651682562355e-08, "logits/chosen": -1.4139344692230225, "logits/rejected": -1.09742271900177, "logps/chosen": -234.76651000976562, "logps/rejected": -265.72662353515625, "loss": 0.6633, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.034630510956048965, "rewards/margins": 0.06832818686962128, "rewards/margins_max": 0.11305254697799683, "rewards/margins_min": 0.023603813722729683, "rewards/margins_std": 0.06324980407953262, "rewards/rejected": -0.03369767218828201, "step": 3910 }, { "epoch": 0.89, "grad_norm": 0.48046875, "learning_rate": 1.951433952436174e-08, "logits/chosen": -1.4427460432052612, "logits/rejected": -1.0282113552093506, "logps/chosen": -253.4451904296875, "logps/rejected": -203.51739501953125, "loss": 0.662, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03823239356279373, "rewards/margins": 0.058333247900009155, "rewards/margins_max": 0.09260444343090057, "rewards/margins_min": 0.024062050506472588, "rewards/margins_std": 0.048466794192790985, "rewards/rejected": -0.020100858062505722, "step": 3920 }, { "epoch": 0.89, "grad_norm": 0.45703125, "learning_rate": 1.8757380840843524e-08, "logits/chosen": -1.5329930782318115, "logits/rejected": -1.2544704675674438, "logps/chosen": -225.38186645507812, "logps/rejected": -259.9751892089844, "loss": 0.6713, "rewards/accuracies": 0.875, "rewards/chosen": 0.03190339356660843, "rewards/margins": 0.041507575660943985, "rewards/margins_max": 0.06313765048980713, "rewards/margins_min": 0.01987750083208084, "rewards/margins_std": 0.030589541420340538, "rewards/rejected": -0.009604182094335556, "step": 3930 }, { "epoch": 0.89, "grad_norm": 0.439453125, "learning_rate": 1.8014822771544784e-08, "logits/chosen": -1.4925791025161743, "logits/rejected": -1.267610788345337, "logps/chosen": -192.39517211914062, "logps/rejected": -225.89212036132812, "loss": 0.6688, "rewards/accuracies": 0.875, "rewards/chosen": 0.022831646725535393, "rewards/margins": 0.041496653109788895, "rewards/margins_max": 0.06753791868686676, "rewards/margins_min": 0.01545538567006588, "rewards/margins_std": 0.03682791069149971, "rewards/rejected": -0.018665006384253502, "step": 3940 }, { "epoch": 0.89, "grad_norm": 0.37890625, "learning_rate": 1.7286711559205247e-08, "logits/chosen": -1.1196014881134033, "logits/rejected": -0.8338809013366699, "logps/chosen": -268.010009765625, "logps/rejected": -212.37698364257812, "loss": 0.6678, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.028760245069861412, "rewards/margins": 0.04862253740429878, "rewards/margins_max": 0.06828413903713226, "rewards/margins_min": 0.02896093763411045, "rewards/margins_std": 0.027805697172880173, "rewards/rejected": -0.019862286746501923, "step": 3950 }, { "epoch": 0.9, "grad_norm": 0.46875, "learning_rate": 1.6573092546888128e-08, "logits/chosen": -1.385881781578064, "logits/rejected": -0.7845109105110168, "logps/chosen": -295.77288818359375, "logps/rejected": -195.02999877929688, "loss": 0.6651, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.03401690721511841, "rewards/margins": 0.05478140711784363, "rewards/margins_max": 0.08118084818124771, "rewards/margins_min": 0.028381969779729843, "rewards/margins_std": 0.03733444958925247, "rewards/rejected": -0.02076449990272522, "step": 3960 }, { "epoch": 0.9, "grad_norm": 0.6171875, "learning_rate": 1.5874010175156104e-08, "logits/chosen": -1.3183201551437378, "logits/rejected": -1.0514836311340332, "logps/chosen": -294.5506591796875, "logps/rejected": -218.8050079345703, "loss": 0.6703, "rewards/accuracies": 0.875, "rewards/chosen": 0.030712831765413284, "rewards/margins": 0.05188845843076706, "rewards/margins_max": 0.08256353437900543, "rewards/margins_min": 0.021213386207818985, "rewards/margins_std": 0.04338110610842705, "rewards/rejected": -0.021175626665353775, "step": 3970 }, { "epoch": 0.9, "grad_norm": 0.41796875, "learning_rate": 1.518950797930357e-08, "logits/chosen": -1.3133951425552368, "logits/rejected": -1.1905639171600342, "logps/chosen": -189.7636260986328, "logps/rejected": -214.4491424560547, "loss": 0.673, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.02525566890835762, "rewards/margins": 0.04084246605634689, "rewards/margins_max": 0.06745803356170654, "rewards/margins_min": 0.014226903207600117, "rewards/margins_std": 0.03764009103178978, "rewards/rejected": -0.015586796216666698, "step": 3980 }, { "epoch": 0.9, "grad_norm": 0.46875, "learning_rate": 1.4519628586646072e-08, "logits/chosen": -1.4533544778823853, "logits/rejected": -1.3012058734893799, "logps/chosen": -151.31124877929688, "logps/rejected": -185.54855346679688, "loss": 0.6711, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.031321872025728226, "rewards/margins": 0.0467819944024086, "rewards/margins_max": 0.07836450636386871, "rewards/margins_min": 0.01519948523491621, "rewards/margins_std": 0.0446644201874733, "rewards/rejected": -0.015460127964615822, "step": 3990 }, { "epoch": 0.9, "grad_norm": 0.380859375, "learning_rate": 1.3864413713865098e-08, "logits/chosen": -1.423572301864624, "logits/rejected": -1.1824599504470825, "logps/chosen": -184.4468231201172, "logps/rejected": -183.77056884765625, "loss": 0.6714, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.028317485004663467, "rewards/margins": 0.05067143961787224, "rewards/margins_max": 0.08228854835033417, "rewards/margins_min": 0.019054336473345757, "rewards/margins_std": 0.0447133406996727, "rewards/rejected": -0.022353962063789368, "step": 4000 }, { "epoch": 0.91, "grad_norm": 0.28125, "learning_rate": 1.3223904164410494e-08, "logits/chosen": -1.342595100402832, "logits/rejected": -1.0777925252914429, "logps/chosen": -201.78912353515625, "logps/rejected": -189.086181640625, "loss": 0.6711, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.03158831596374512, "rewards/margins": 0.051473308354616165, "rewards/margins_max": 0.08596741408109665, "rewards/margins_min": 0.01697920449078083, "rewards/margins_std": 0.04878203570842743, "rewards/rejected": -0.019884996116161346, "step": 4010 }, { "epoch": 0.91, "grad_norm": 0.52734375, "learning_rate": 1.2598139825959392e-08, "logits/chosen": -1.4462448358535767, "logits/rejected": -1.0174031257629395, "logps/chosen": -246.3612060546875, "logps/rejected": -235.6538543701172, "loss": 0.6644, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03416098281741142, "rewards/margins": 0.05894618108868599, "rewards/margins_max": 0.08806253969669342, "rewards/margins_min": 0.02982981503009796, "rewards/margins_std": 0.04117675870656967, "rewards/rejected": -0.024785198271274567, "step": 4020 }, { "epoch": 0.91, "grad_norm": 0.369140625, "learning_rate": 1.1987159667932123e-08, "logits/chosen": -1.6274089813232422, "logits/rejected": -1.4427483081817627, "logps/chosen": -209.2533721923828, "logps/rejected": -223.5661163330078, "loss": 0.6682, "rewards/accuracies": 0.875, "rewards/chosen": 0.035410456359386444, "rewards/margins": 0.054214704781770706, "rewards/margins_max": 0.08917896449565887, "rewards/margins_min": 0.019250452518463135, "rewards/margins_std": 0.049446918070316315, "rewards/rejected": -0.01880425028502941, "step": 4030 }, { "epoch": 0.91, "grad_norm": 0.498046875, "learning_rate": 1.139100173906543e-08, "logits/chosen": -1.3406426906585693, "logits/rejected": -1.109688639640808, "logps/chosen": -150.0030059814453, "logps/rejected": -161.7610321044922, "loss": 0.6714, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.023004988208413124, "rewards/margins": 0.04010429233312607, "rewards/margins_max": 0.0670587494969368, "rewards/margins_min": 0.013149833306670189, "rewards/margins_std": 0.0381193533539772, "rewards/rejected": -0.017099300399422646, "step": 4040 }, { "epoch": 0.92, "grad_norm": 0.5, "learning_rate": 1.0809703165043205e-08, "logits/chosen": -1.3344614505767822, "logits/rejected": -0.9872671961784363, "logps/chosen": -264.97149658203125, "logps/rejected": -213.99990844726562, "loss": 0.6679, "rewards/accuracies": 0.875, "rewards/chosen": 0.04212633892893791, "rewards/margins": 0.053166113793849945, "rewards/margins_max": 0.08254306018352509, "rewards/margins_min": 0.023789182305336, "rewards/margins_std": 0.04154526814818382, "rewards/rejected": -0.011039778590202332, "step": 4050 }, { "epoch": 0.92, "grad_norm": 0.34375, "learning_rate": 1.0243300146184047e-08, "logits/chosen": -1.3708654642105103, "logits/rejected": -1.1119438409805298, "logps/chosen": -167.91726684570312, "logps/rejected": -184.01956176757812, "loss": 0.6694, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04270718991756439, "rewards/margins": 0.052512459456920624, "rewards/margins_max": 0.07811418175697327, "rewards/margins_min": 0.026910746470093727, "rewards/margins_std": 0.03620629757642746, "rewards/rejected": -0.009805269539356232, "step": 4060 }, { "epoch": 0.92, "grad_norm": 0.271484375, "learning_rate": 9.69182795518722e-09, "logits/chosen": -1.3817869424819946, "logits/rejected": -1.1750977039337158, "logps/chosen": -214.696533203125, "logps/rejected": -229.936767578125, "loss": 0.6685, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.02906607650220394, "rewards/margins": 0.04736794903874397, "rewards/margins_max": 0.07081864774227142, "rewards/margins_min": 0.02391725964844227, "rewards/margins_std": 0.033164288848638535, "rewards/rejected": -0.018301870673894882, "step": 4070 }, { "epoch": 0.92, "grad_norm": 0.4921875, "learning_rate": 9.155320934936039e-09, "logits/chosen": -1.429626226425171, "logits/rejected": -1.0969443321228027, "logps/chosen": -262.22833251953125, "logps/rejected": -243.362060546875, "loss": 0.6685, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.035449132323265076, "rewards/margins": 0.052136652171611786, "rewards/margins_max": 0.07579027116298676, "rewards/margins_min": 0.028483033180236816, "rewards/margins_std": 0.03345127031207085, "rewards/rejected": -0.01668752171099186, "step": 4080 }, { "epoch": 0.92, "grad_norm": 0.447265625, "learning_rate": 8.633812496358972e-09, "logits/chosen": -1.5154110193252563, "logits/rejected": -1.2433773279190063, "logps/chosen": -260.2387390136719, "logps/rejected": -204.50048828125, "loss": 0.6694, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.030448496341705322, "rewards/margins": 0.044683560729026794, "rewards/margins_max": 0.06551718711853027, "rewards/margins_min": 0.023849938064813614, "rewards/margins_std": 0.02946319617331028, "rewards/rejected": -0.014235064387321472, "step": 4090 }, { "epoch": 0.93, "grad_norm": 0.328125, "learning_rate": 8.127335116349304e-09, "logits/chosen": -1.5176746845245361, "logits/rejected": -1.318629264831543, "logps/chosen": -198.71572875976562, "logps/rejected": -189.38839721679688, "loss": 0.6772, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.025044983252882957, "rewards/margins": 0.03115997649729252, "rewards/margins_max": 0.05419089272618294, "rewards/margins_min": 0.00812905840575695, "rewards/margins_std": 0.03257063776254654, "rewards/rejected": -0.0061149937100708485, "step": 4100 }, { "epoch": 0.93, "grad_norm": 0.419921875, "learning_rate": 7.635920335742202e-09, "logits/chosen": -1.4340060949325562, "logits/rejected": -1.1694340705871582, "logps/chosen": -227.19729614257812, "logps/rejected": -253.91653442382812, "loss": 0.6733, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.02697773650288582, "rewards/margins": 0.04178674519062042, "rewards/margins_max": 0.062144309282302856, "rewards/margins_min": 0.02142917737364769, "rewards/margins_std": 0.02878994680941105, "rewards/rejected": -0.014809004962444305, "step": 4110 }, { "epoch": 0.93, "grad_norm": 0.25390625, "learning_rate": 7.159598757350921e-09, "logits/chosen": -1.2499946355819702, "logits/rejected": -1.0438182353973389, "logps/chosen": -258.05029296875, "logps/rejected": -260.5018310546875, "loss": 0.6671, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.025008153170347214, "rewards/margins": 0.04763338714838028, "rewards/margins_max": 0.0701470822095871, "rewards/margins_min": 0.025119686499238014, "rewards/margins_std": 0.03183918446302414, "rewards/rejected": -0.022625230252742767, "step": 4120 }, { "epoch": 0.93, "grad_norm": 0.322265625, "learning_rate": 6.698400044060775e-09, "logits/chosen": -1.3737030029296875, "logits/rejected": -1.017594575881958, "logps/chosen": -238.93344116210938, "logps/rejected": -224.15365600585938, "loss": 0.6709, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.028364086523652077, "rewards/margins": 0.04671841114759445, "rewards/margins_max": 0.07550819218158722, "rewards/margins_min": 0.017928630113601685, "rewards/margins_std": 0.040714897215366364, "rewards/rejected": -0.018354322761297226, "step": 4130 }, { "epoch": 0.94, "grad_norm": 0.279296875, "learning_rate": 6.252352916981923e-09, "logits/chosen": -1.4670217037200928, "logits/rejected": -1.2123662233352661, "logps/chosen": -169.3472442626953, "logps/rejected": -166.15847778320312, "loss": 0.6698, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.02530873380601406, "rewards/margins": 0.046012792736291885, "rewards/margins_max": 0.07258578389883041, "rewards/margins_min": 0.01943979784846306, "rewards/margins_std": 0.03757988661527634, "rewards/rejected": -0.020704057067632675, "step": 4140 }, { "epoch": 0.94, "grad_norm": 0.4609375, "learning_rate": 5.821485153660932e-09, "logits/chosen": -1.2992416620254517, "logits/rejected": -0.86224764585495, "logps/chosen": -262.5690612792969, "logps/rejected": -302.8780517578125, "loss": 0.6661, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03237038478255272, "rewards/margins": 0.06790334731340408, "rewards/margins_max": 0.09890072047710419, "rewards/margins_min": 0.036905981600284576, "rewards/margins_std": 0.04383689910173416, "rewards/rejected": -0.035532962530851364, "step": 4150 }, { "epoch": 0.94, "grad_norm": 0.333984375, "learning_rate": 5.4058235863506106e-09, "logits/chosen": -1.3638312816619873, "logits/rejected": -1.1424791812896729, "logps/chosen": -224.7232666015625, "logps/rejected": -203.07078552246094, "loss": 0.6687, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.032146066427230835, "rewards/margins": 0.05244447663426399, "rewards/margins_max": 0.07333363592624664, "rewards/margins_min": 0.03155531361699104, "rewards/margins_std": 0.029541734606027603, "rewards/rejected": -0.02029840461909771, "step": 4160 }, { "epoch": 0.94, "grad_norm": 0.330078125, "learning_rate": 5.005394100339371e-09, "logits/chosen": -1.3726253509521484, "logits/rejected": -1.253758192062378, "logps/chosen": -233.70156860351562, "logps/rejected": -231.6342010498047, "loss": 0.6678, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.026106491684913635, "rewards/margins": 0.03978271037340164, "rewards/margins_max": 0.05634213238954544, "rewards/margins_min": 0.02322329208254814, "rewards/margins_std": 0.023418551310896873, "rewards/rejected": -0.013676215894520283, "step": 4170 }, { "epoch": 0.94, "grad_norm": 0.455078125, "learning_rate": 4.620221632338994e-09, "logits/chosen": -1.2156215906143188, "logits/rejected": -1.0154184103012085, "logps/chosen": -177.11041259765625, "logps/rejected": -221.79861450195312, "loss": 0.6626, "rewards/accuracies": 0.875, "rewards/chosen": 0.03687124326825142, "rewards/margins": 0.05761794000864029, "rewards/margins_max": 0.08540613949298859, "rewards/margins_min": 0.029829740524291992, "rewards/margins_std": 0.03929844871163368, "rewards/rejected": -0.02074669674038887, "step": 4180 }, { "epoch": 0.95, "grad_norm": 0.4921875, "learning_rate": 4.2503301689318094e-09, "logits/chosen": -1.510063886642456, "logits/rejected": -1.1845905780792236, "logps/chosen": -285.0707092285156, "logps/rejected": -222.8107452392578, "loss": 0.671, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.027932295575737953, "rewards/margins": 0.04651053622364998, "rewards/margins_max": 0.07659061253070831, "rewards/margins_min": 0.016430456191301346, "rewards/margins_std": 0.042539652436971664, "rewards/rejected": -0.018578244373202324, "step": 4190 }, { "epoch": 0.95, "grad_norm": 0.376953125, "learning_rate": 3.895742745076869e-09, "logits/chosen": -1.3405685424804688, "logits/rejected": -1.0025266408920288, "logps/chosen": -218.3401641845703, "logps/rejected": -191.70420837402344, "loss": 0.6684, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03851836919784546, "rewards/margins": 0.06199236959218979, "rewards/margins_max": 0.10131315886974335, "rewards/margins_min": 0.022671589627861977, "rewards/margins_std": 0.05560798570513725, "rewards/rejected": -0.023474005982279778, "step": 4200 }, { "epoch": 0.95, "grad_norm": 0.423828125, "learning_rate": 3.5564814426755073e-09, "logits/chosen": -1.3935787677764893, "logits/rejected": -1.158966302871704, "logps/chosen": -202.38221740722656, "logps/rejected": -199.8167724609375, "loss": 0.669, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.026319408789277077, "rewards/margins": 0.043112870305776596, "rewards/margins_max": 0.0695946216583252, "rewards/margins_min": 0.016631122678518295, "rewards/margins_std": 0.03745085373520851, "rewards/rejected": -0.01679346337914467, "step": 4210 }, { "epoch": 0.95, "grad_norm": 0.328125, "learning_rate": 3.232567389196139e-09, "logits/chosen": -1.4851112365722656, "logits/rejected": -1.1288012266159058, "logps/chosen": -220.33633422851562, "logps/rejected": -192.44102478027344, "loss": 0.6661, "rewards/accuracies": 0.875, "rewards/chosen": 0.031008679419755936, "rewards/margins": 0.04939933493733406, "rewards/margins_max": 0.07498259842395782, "rewards/margins_min": 0.0238160602748394, "rewards/margins_std": 0.03618020936846733, "rewards/rejected": -0.018390655517578125, "step": 4220 }, { "epoch": 0.96, "grad_norm": 0.36328125, "learning_rate": 2.9240207563586137e-09, "logits/chosen": -1.367784857749939, "logits/rejected": -1.1241731643676758, "logps/chosen": -206.1232452392578, "logps/rejected": -191.2694091796875, "loss": 0.6716, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03186849504709244, "rewards/margins": 0.054981641471385956, "rewards/margins_max": 0.0860222578048706, "rewards/margins_min": 0.023941034451127052, "rewards/margins_std": 0.04389805719256401, "rewards/rejected": -0.023113155737519264, "step": 4230 }, { "epoch": 0.96, "grad_norm": 0.41015625, "learning_rate": 2.6308607588779173e-09, "logits/chosen": -1.3376381397247314, "logits/rejected": -1.0197269916534424, "logps/chosen": -244.1377716064453, "logps/rejected": -232.1446533203125, "loss": 0.6691, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.032451361417770386, "rewards/margins": 0.05189868062734604, "rewards/margins_max": 0.06959724426269531, "rewards/margins_min": 0.03420013189315796, "rewards/margins_std": 0.025029540061950684, "rewards/rejected": -0.01944732666015625, "step": 4240 }, { "epoch": 0.96, "grad_norm": 0.33984375, "learning_rate": 2.353105653267712e-09, "logits/chosen": -1.4159284830093384, "logits/rejected": -0.9598219990730286, "logps/chosen": -205.52749633789062, "logps/rejected": -196.06875610351562, "loss": 0.6673, "rewards/accuracies": 0.875, "rewards/chosen": 0.030434027314186096, "rewards/margins": 0.05207052826881409, "rewards/margins_max": 0.0799938291311264, "rewards/margins_min": 0.024147219955921173, "rewards/margins_std": 0.039489515125751495, "rewards/rejected": -0.02163649909198284, "step": 4250 }, { "epoch": 0.96, "grad_norm": 0.421875, "learning_rate": 2.0907727367033002e-09, "logits/chosen": -1.4469044208526611, "logits/rejected": -1.0019524097442627, "logps/chosen": -358.66864013671875, "logps/rejected": -218.69802856445312, "loss": 0.6737, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.03114650584757328, "rewards/margins": 0.03943406790494919, "rewards/margins_max": 0.0645378977060318, "rewards/margins_min": 0.01433024276047945, "rewards/margins_std": 0.035502173006534576, "rewards/rejected": -0.008287565782666206, "step": 4260 }, { "epoch": 0.97, "grad_norm": 0.3828125, "learning_rate": 1.8438783459444608e-09, "logits/chosen": -1.4788901805877686, "logits/rejected": -1.2845779657363892, "logps/chosen": -227.7974090576172, "logps/rejected": -188.720947265625, "loss": 0.6728, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.031458381563425064, "rewards/margins": 0.02985537424683571, "rewards/margins_max": 0.04987839609384537, "rewards/margins_min": 0.009832354262471199, "rewards/margins_std": 0.02831682562828064, "rewards/rejected": 0.0016030061524361372, "step": 4270 }, { "epoch": 0.97, "grad_norm": 0.302734375, "learning_rate": 1.612437856318205e-09, "logits/chosen": -1.1608082056045532, "logits/rejected": -0.9398641586303711, "logps/chosen": -178.90191650390625, "logps/rejected": -193.39801025390625, "loss": 0.673, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.028339451178908348, "rewards/margins": 0.0333985760807991, "rewards/margins_max": 0.053435277193784714, "rewards/margins_min": 0.013361875899136066, "rewards/margins_std": 0.0283361729234457, "rewards/rejected": -0.005059124436229467, "step": 4280 }, { "epoch": 0.97, "grad_norm": 0.37109375, "learning_rate": 1.396465680761072e-09, "logits/chosen": -1.4545872211456299, "logits/rejected": -1.1205055713653564, "logps/chosen": -231.1746826171875, "logps/rejected": -196.8207244873047, "loss": 0.6705, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03223685547709465, "rewards/margins": 0.05452718213200569, "rewards/margins_max": 0.08367858827114105, "rewards/margins_min": 0.025375764816999435, "rewards/margins_std": 0.04122632369399071, "rewards/rejected": -0.022290324792265892, "step": 4290 }, { "epoch": 0.97, "grad_norm": 0.486328125, "learning_rate": 1.195975268921734e-09, "logits/chosen": -1.5477195978164673, "logits/rejected": -1.216968297958374, "logps/chosen": -286.03790283203125, "logps/rejected": -206.74252319335938, "loss": 0.6714, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.03005763329565525, "rewards/margins": 0.037625450640916824, "rewards/margins_max": 0.06239453703165054, "rewards/margins_min": 0.012856366112828255, "rewards/margins_std": 0.03502877429127693, "rewards/rejected": -0.007567819207906723, "step": 4300 }, { "epoch": 0.97, "grad_norm": 0.333984375, "learning_rate": 1.0109791063233897e-09, "logits/chosen": -1.5190095901489258, "logits/rejected": -1.2735944986343384, "logps/chosen": -171.55831909179688, "logps/rejected": -189.24229431152344, "loss": 0.669, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03234173357486725, "rewards/margins": 0.05007977411150932, "rewards/margins_max": 0.07897655665874481, "rewards/margins_min": 0.02118297666311264, "rewards/margins_std": 0.04086623713374138, "rewards/rejected": -0.017738038673996925, "step": 4310 }, { "epoch": 0.98, "grad_norm": 0.373046875, "learning_rate": 8.414887135860526e-10, "logits/chosen": -1.2920136451721191, "logits/rejected": -1.0507843494415283, "logps/chosen": -242.9005889892578, "logps/rejected": -246.7140655517578, "loss": 0.6705, "rewards/accuracies": 0.875, "rewards/chosen": 0.03298826888203621, "rewards/margins": 0.059736646711826324, "rewards/margins_max": 0.09354208409786224, "rewards/margins_min": 0.0259312242269516, "rewards/margins_std": 0.04780808836221695, "rewards/rejected": -0.026748377829790115, "step": 4320 }, { "epoch": 0.98, "grad_norm": 0.34765625, "learning_rate": 6.875146457094583e-10, "logits/chosen": -1.4780220985412598, "logits/rejected": -1.0817339420318604, "logps/chosen": -269.909912109375, "logps/rejected": -272.59796142578125, "loss": 0.667, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.03321417048573494, "rewards/margins": 0.05451669171452522, "rewards/margins_max": 0.08729592710733414, "rewards/margins_min": 0.02173745259642601, "rewards/margins_std": 0.04635683819651604, "rewards/rejected": -0.021302521228790283, "step": 4330 }, { "epoch": 0.98, "grad_norm": 0.326171875, "learning_rate": 5.490664914153676e-10, "logits/chosen": -1.2779922485351562, "logits/rejected": -1.056840419769287, "logps/chosen": -176.34756469726562, "logps/rejected": -225.94140625, "loss": 0.6698, "rewards/accuracies": 0.875, "rewards/chosen": 0.035486023873090744, "rewards/margins": 0.05061611533164978, "rewards/margins_max": 0.07715443521738052, "rewards/margins_min": 0.024077793583273888, "rewards/margins_std": 0.03753085806965828, "rewards/rejected": -0.015130092389881611, "step": 4340 }, { "epoch": 0.98, "grad_norm": 0.392578125, "learning_rate": 4.261528725507113e-10, "logits/chosen": -1.4777419567108154, "logits/rejected": -1.3885948657989502, "logps/chosen": -203.3699493408203, "logps/rejected": -191.15402221679688, "loss": 0.6713, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.0292215533554554, "rewards/margins": 0.04114232212305069, "rewards/margins_max": 0.06767222285270691, "rewards/margins_min": 0.014612428843975067, "rewards/margins_std": 0.0375189371407032, "rewards/rejected": -0.01192077063024044, "step": 4350 }, { "epoch": 0.99, "grad_norm": 0.326171875, "learning_rate": 3.187814435505198e-10, "logits/chosen": -1.5060135126113892, "logits/rejected": -1.3325016498565674, "logps/chosen": -190.29415893554688, "logps/rejected": -219.4818878173828, "loss": 0.6666, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.028621181845664978, "rewards/margins": 0.04612215235829353, "rewards/margins_max": 0.06957665830850601, "rewards/margins_min": 0.022667638957500458, "rewards/margins_std": 0.033169690519571304, "rewards/rejected": -0.017500972375273705, "step": 4360 }, { "epoch": 0.99, "grad_norm": 0.400390625, "learning_rate": 2.269588909613318e-10, "logits/chosen": -1.409186601638794, "logits/rejected": -1.2228713035583496, "logps/chosen": -173.80068969726562, "logps/rejected": -194.56361389160156, "loss": 0.6723, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.023956812918186188, "rewards/margins": 0.03811182081699371, "rewards/margins_max": 0.05768042802810669, "rewards/margins_min": 0.018543217331171036, "rewards/margins_std": 0.027674183249473572, "rewards/rejected": -0.014155007898807526, "step": 4370 }, { "epoch": 0.99, "grad_norm": 0.365234375, "learning_rate": 1.5069093302469415e-10, "logits/chosen": -1.190978765487671, "logits/rejected": -1.0352693796157837, "logps/chosen": -224.6339874267578, "logps/rejected": -217.9106903076172, "loss": 0.6682, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.030144354328513145, "rewards/margins": 0.05521010607481003, "rewards/margins_max": 0.08944573253393173, "rewards/margins_min": 0.020974475890398026, "rewards/margins_std": 0.048416487872600555, "rewards/rejected": -0.02506575547158718, "step": 4380 }, { "epoch": 0.99, "grad_norm": 0.466796875, "learning_rate": 8.99823193210858e-11, "logits/chosen": -1.1929359436035156, "logits/rejected": -0.9103671908378601, "logps/chosen": -210.3509521484375, "logps/rejected": -232.4520721435547, "loss": 0.6695, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.022757183760404587, "rewards/margins": 0.05178738385438919, "rewards/margins_max": 0.07935634255409241, "rewards/margins_min": 0.024218428879976273, "rewards/margins_std": 0.0389883928000927, "rewards/rejected": -0.029030198231339455, "step": 4390 }, { "epoch": 0.99, "grad_norm": 0.34765625, "learning_rate": 4.483683047426523e-11, "logits/chosen": -1.3004522323608398, "logits/rejected": -1.0778028964996338, "logps/chosen": -252.458251953125, "logps/rejected": -205.4815216064453, "loss": 0.6689, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.019098959863185883, "rewards/margins": 0.04386705532670021, "rewards/margins_max": 0.0736868605017662, "rewards/margins_min": 0.01404724083840847, "rewards/margins_std": 0.042171578854322433, "rewards/rejected": -0.024768095463514328, "step": 4400 }, { "epoch": 1.0, "grad_norm": 0.396484375, "learning_rate": 1.5257277915653456e-11, "logits/chosen": -1.5603951215744019, "logits/rejected": -1.2852790355682373, "logps/chosen": -157.50111389160156, "logps/rejected": -165.79519653320312, "loss": 0.6705, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.03554811701178551, "rewards/margins": 0.04409513995051384, "rewards/margins_max": 0.07175298035144806, "rewards/margins_min": 0.01643729954957962, "rewards/margins_std": 0.0391140915453434, "rewards/rejected": -0.008547024801373482, "step": 4410 }, { "epoch": 1.0, "grad_norm": 0.640625, "learning_rate": 1.2455037093073161e-12, "logits/chosen": -1.4276247024536133, "logits/rejected": -1.0764439105987549, "logps/chosen": -213.16421508789062, "logps/rejected": -192.83340454101562, "loss": 0.6659, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.028442760929465294, "rewards/margins": 0.054314613342285156, "rewards/margins_max": 0.08231306821107864, "rewards/margins_min": 0.02631615474820137, "rewards/margins_std": 0.039595797657966614, "rewards/rejected": -0.025871848687529564, "step": 4420 }, { "epoch": 1.0, "eval_logits/chosen": -1.0172001123428345, "eval_logits/rejected": -0.8949137330055237, "eval_logps/chosen": -322.9526672363281, "eval_logps/rejected": -314.051513671875, "eval_loss": 0.6914807558059692, "eval_rewards/accuracies": 0.5509999990463257, "eval_rewards/chosen": 0.020110901445150375, "eval_rewards/margins": 0.003983891103416681, "eval_rewards/margins_max": 0.06447038054466248, "eval_rewards/margins_min": -0.059528909623622894, "eval_rewards/margins_std": 0.04112740606069565, "eval_rewards/rejected": 0.01612701080739498, "eval_runtime": 1445.3459, "eval_samples_per_second": 2.768, "eval_steps_per_second": 0.173, "step": 4424 }, { "epoch": 1.0, "step": 4424, "total_flos": 0.0, "train_loss": 0.6737086928666823, "train_runtime": 34589.1987, "train_samples_per_second": 1.023, "train_steps_per_second": 0.128 } ], "logging_steps": 10, "max_steps": 4424, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }