diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8023 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 4424, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.32421875, + "learning_rate": 1.128668171557562e-09, + "logits/chosen": -1.089872121810913, + "logits/rejected": -1.1662957668304443, + "logps/chosen": -88.48556518554688, + "logps/rejected": -128.17625427246094, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/margins_max": 0.0, + "rewards/margins_min": 0.0, + "rewards/margins_std": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 0.376953125, + "learning_rate": 1.128668171557562e-08, + "logits/chosen": -1.362803339958191, + "logits/rejected": -1.0890824794769287, + "logps/chosen": -253.95407104492188, + "logps/rejected": -205.00909423828125, + "loss": 0.693, + "rewards/accuracies": 0.3055555522441864, + "rewards/chosen": -0.0007014232687652111, + "rewards/margins": 4.238979454385117e-05, + "rewards/margins_max": 0.0018888049526140094, + "rewards/margins_min": -0.0018040253780782223, + "rewards/margins_std": 0.002611225238069892, + "rewards/rejected": -0.0007438129978254437, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.416015625, + "learning_rate": 2.257336343115124e-08, + "logits/chosen": -1.3700335025787354, + "logits/rejected": -1.0615637302398682, + "logps/chosen": -324.6263122558594, + "logps/rejected": -199.73663330078125, + "loss": 0.6929, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.002209640573710203, + "rewards/margins": 0.0004743327444884926, + "rewards/margins_max": 0.002534114755690098, + "rewards/margins_min": -0.0015854493249207735, + "rewards/margins_std": 0.002912971656769514, + "rewards/rejected": 0.0017353076254948974, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.361328125, + "learning_rate": 3.3860045146726863e-08, + "logits/chosen": -1.2922523021697998, + "logits/rejected": -1.0249769687652588, + "logps/chosen": -235.274658203125, + "logps/rejected": -220.1529541015625, + "loss": 0.6927, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0010927047114819288, + "rewards/margins": 0.0009888228960335255, + "rewards/margins_max": 0.0037243079859763384, + "rewards/margins_min": -0.0017466619610786438, + "rewards/margins_std": 0.003868559841066599, + "rewards/rejected": 0.0001038818372762762, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 0.306640625, + "learning_rate": 4.514672686230248e-08, + "logits/chosen": -1.395784616470337, + "logits/rejected": -1.054032564163208, + "logps/chosen": -255.254150390625, + "logps/rejected": -252.3551483154297, + "loss": 0.693, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0007894478039816022, + "rewards/margins": 0.0006032294477336109, + "rewards/margins_max": 0.0026328391395509243, + "rewards/margins_min": -0.0014263801276683807, + "rewards/margins_std": 0.0028703012503683567, + "rewards/rejected": 0.00018621828348841518, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.431640625, + "learning_rate": 5.64334085778781e-08, + "logits/chosen": -1.3574692010879517, + "logits/rejected": -1.0375534296035767, + "logps/chosen": -231.46517944335938, + "logps/rejected": -224.56625366210938, + "loss": 0.6925, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0005603065947070718, + "rewards/margins": 0.001050409278832376, + "rewards/margins_max": 0.0026080321986228228, + "rewards/margins_min": -0.0005072135827504098, + "rewards/margins_std": 0.0022028114181011915, + "rewards/rejected": -0.0004901026841253042, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 0.5, + "learning_rate": 6.772009029345373e-08, + "logits/chosen": -1.3851702213287354, + "logits/rejected": -1.0226211547851562, + "logps/chosen": -213.94430541992188, + "logps/rejected": -209.12704467773438, + "loss": 0.6925, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.001433684374205768, + "rewards/margins": 0.0013705453602597117, + "rewards/margins_max": 0.0033714137971401215, + "rewards/margins_min": -0.0006303234258666635, + "rewards/margins_std": 0.0028296555392444134, + "rewards/rejected": 6.313894118648022e-05, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 0.58984375, + "learning_rate": 7.900677200902935e-08, + "logits/chosen": -1.3688232898712158, + "logits/rejected": -0.9564453363418579, + "logps/chosen": -246.9102325439453, + "logps/rejected": -241.5129852294922, + "loss": 0.693, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.00036182976327836514, + "rewards/margins": 0.0004978332435712218, + "rewards/margins_max": 0.0033143579494208097, + "rewards/margins_min": -0.0023186912294477224, + "rewards/margins_std": 0.003983167465776205, + "rewards/rejected": -0.0001360034802928567, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 0.49609375, + "learning_rate": 9.029345372460496e-08, + "logits/chosen": -1.3354227542877197, + "logits/rejected": -1.1622496843338013, + "logps/chosen": -238.8887481689453, + "logps/rejected": -231.7976837158203, + "loss": 0.6928, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0011917247902601957, + "rewards/margins": 0.000608589150942862, + "rewards/margins_max": 0.003103874158114195, + "rewards/margins_min": -0.0018866958562284708, + "rewards/margins_std": 0.0035288657527416945, + "rewards/rejected": 0.0005831356393173337, + "step": 80 + }, + { + "epoch": 0.02, + "grad_norm": 0.4453125, + "learning_rate": 1.0158013544018059e-07, + "logits/chosen": -1.2897207736968994, + "logits/rejected": -1.0346763134002686, + "logps/chosen": -199.93191528320312, + "logps/rejected": -223.67257690429688, + "loss": 0.6932, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.00017602155276108533, + "rewards/margins": -9.21573955565691e-05, + "rewards/margins_max": 0.0014471550239250064, + "rewards/margins_min": -0.0016314696986228228, + "rewards/margins_std": 0.002176916692405939, + "rewards/rejected": 0.00026817896286956966, + "step": 90 + }, + { + "epoch": 0.02, + "grad_norm": 0.34765625, + "learning_rate": 1.128668171557562e-07, + "logits/chosen": -1.3299012184143066, + "logits/rejected": -1.134303092956543, + "logps/chosen": -196.38111877441406, + "logps/rejected": -186.2175750732422, + "loss": 0.6929, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -6.178906005516183e-06, + "rewards/margins": 0.0005631408421322703, + "rewards/margins_max": 0.0025123213417828083, + "rewards/margins_min": -0.0013860397739335895, + "rewards/margins_std": 0.0027565578930079937, + "rewards/rejected": -0.0005693196435458958, + "step": 100 + }, + { + "epoch": 0.02, + "grad_norm": 0.423828125, + "learning_rate": 1.2415349887133183e-07, + "logits/chosen": -1.2904781103134155, + "logits/rejected": -1.0873281955718994, + "logps/chosen": -240.749755859375, + "logps/rejected": -271.0155334472656, + "loss": 0.6926, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.000635267177131027, + "rewards/margins": 0.0005631255335174501, + "rewards/margins_max": 0.003234363393858075, + "rewards/margins_min": -0.002108112210407853, + "rewards/margins_std": 0.0037777007091790438, + "rewards/rejected": 7.214168726932257e-05, + "step": 110 + }, + { + "epoch": 0.03, + "grad_norm": 0.357421875, + "learning_rate": 1.3544018058690745e-07, + "logits/chosen": -1.4081692695617676, + "logits/rejected": -1.034285306930542, + "logps/chosen": -194.2665252685547, + "logps/rejected": -175.28851318359375, + "loss": 0.6928, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0005045271245762706, + "rewards/margins": 0.0010185183491557837, + "rewards/margins_max": 0.0025255356449633837, + "rewards/margins_min": -0.0004884987138211727, + "rewards/margins_std": 0.0021312441676855087, + "rewards/rejected": -0.000513991282787174, + "step": 120 + }, + { + "epoch": 0.03, + "grad_norm": 0.361328125, + "learning_rate": 1.4672686230248308e-07, + "logits/chosen": -1.3172610998153687, + "logits/rejected": -1.070671796798706, + "logps/chosen": -248.647216796875, + "logps/rejected": -243.6386260986328, + "loss": 0.6926, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0010833472479134798, + "rewards/margins": 0.001399085856974125, + "rewards/margins_max": 0.0033245470840483904, + "rewards/margins_min": -0.0005263749626465142, + "rewards/margins_std": 0.0027230128180235624, + "rewards/rejected": -0.000315738667268306, + "step": 130 + }, + { + "epoch": 0.03, + "grad_norm": 0.42578125, + "learning_rate": 1.580135440180587e-07, + "logits/chosen": -1.3338706493377686, + "logits/rejected": -1.0173786878585815, + "logps/chosen": -239.69186401367188, + "logps/rejected": -195.95968627929688, + "loss": 0.6928, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0005986442556604743, + "rewards/margins": 0.00034580417559482157, + "rewards/margins_max": 0.0022851484827697277, + "rewards/margins_min": -0.0015935400733724236, + "rewards/margins_std": 0.0027426474262028933, + "rewards/rejected": 0.00025284005096182227, + "step": 140 + }, + { + "epoch": 0.03, + "grad_norm": 0.41796875, + "learning_rate": 1.693002257336343e-07, + "logits/chosen": -1.4671242237091064, + "logits/rejected": -1.1492918729782104, + "logps/chosen": -244.544921875, + "logps/rejected": -223.718505859375, + "loss": 0.6928, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0007274551317095757, + "rewards/margins": 0.0009307805448770523, + "rewards/margins_max": 0.003343376098200679, + "rewards/margins_min": -0.0014818150084465742, + "rewards/margins_std": 0.003411925630643964, + "rewards/rejected": -0.00020332522399257869, + "step": 150 + }, + { + "epoch": 0.04, + "grad_norm": 0.337890625, + "learning_rate": 1.8058690744920993e-07, + "logits/chosen": -1.5086791515350342, + "logits/rejected": -1.1972728967666626, + "logps/chosen": -222.0345916748047, + "logps/rejected": -223.88845825195312, + "loss": 0.6927, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0009902592282742262, + "rewards/margins": 0.0011595649411901832, + "rewards/margins_max": 0.0028987047262489796, + "rewards/margins_min": -0.0005795744946226478, + "rewards/margins_std": 0.0024595148861408234, + "rewards/rejected": -0.0001693058293312788, + "step": 160 + }, + { + "epoch": 0.04, + "grad_norm": 0.412109375, + "learning_rate": 1.9187358916478555e-07, + "logits/chosen": -1.331568717956543, + "logits/rejected": -1.0612637996673584, + "logps/chosen": -276.21881103515625, + "logps/rejected": -231.5882110595703, + "loss": 0.6925, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0013450165279209614, + "rewards/margins": 0.0012487067142501473, + "rewards/margins_max": 0.002802295843139291, + "rewards/margins_min": -0.0003048820362892002, + "rewards/margins_std": 0.0021971066016703844, + "rewards/rejected": 9.630967542761937e-05, + "step": 170 + }, + { + "epoch": 0.04, + "grad_norm": 0.3671875, + "learning_rate": 2.0316027088036118e-07, + "logits/chosen": -1.2156211137771606, + "logits/rejected": -1.1811649799346924, + "logps/chosen": -175.79335021972656, + "logps/rejected": -235.2672576904297, + "loss": 0.6924, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0019137079361826181, + "rewards/margins": 0.0013536510523408651, + "rewards/margins_max": 0.0036528133787214756, + "rewards/margins_min": -0.0009455106919631362, + "rewards/margins_std": 0.0032515060156583786, + "rewards/rejected": 0.0005600567674264312, + "step": 180 + }, + { + "epoch": 0.04, + "grad_norm": 0.5234375, + "learning_rate": 2.1444695259593678e-07, + "logits/chosen": -1.391552209854126, + "logits/rejected": -1.1357841491699219, + "logps/chosen": -217.08740234375, + "logps/rejected": -228.02847290039062, + "loss": 0.6922, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0020487727597355843, + "rewards/margins": 0.002171220723539591, + "rewards/margins_max": 0.003962562419474125, + "rewards/margins_min": 0.00037987896939739585, + "rewards/margins_std": 0.0025333398953080177, + "rewards/rejected": -0.00012244793470017612, + "step": 190 + }, + { + "epoch": 0.05, + "grad_norm": 0.365234375, + "learning_rate": 2.257336343115124e-07, + "logits/chosen": -1.2622658014297485, + "logits/rejected": -1.0098450183868408, + "logps/chosen": -292.15155029296875, + "logps/rejected": -233.5673065185547, + "loss": 0.6925, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.001023811404593289, + "rewards/margins": 0.0009746513096615672, + "rewards/margins_max": 0.0035980145912617445, + "rewards/margins_min": -0.0016487122047692537, + "rewards/margins_std": 0.003709996584802866, + "rewards/rejected": 4.915996760246344e-05, + "step": 200 + }, + { + "epoch": 0.05, + "grad_norm": 0.462890625, + "learning_rate": 2.3702031602708803e-07, + "logits/chosen": -1.399113416671753, + "logits/rejected": -1.2889292240142822, + "logps/chosen": -243.4051971435547, + "logps/rejected": -231.0285186767578, + "loss": 0.6925, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0008327178657054901, + "rewards/margins": 0.0006204862147569656, + "rewards/margins_max": 0.003349609673023224, + "rewards/margins_min": -0.0021086367778480053, + "rewards/margins_std": 0.003859562799334526, + "rewards/rejected": 0.00021223162184469402, + "step": 210 + }, + { + "epoch": 0.05, + "grad_norm": 0.390625, + "learning_rate": 2.4830699774266366e-07, + "logits/chosen": -1.2222211360931396, + "logits/rejected": -0.9842857122421265, + "logps/chosen": -229.5582733154297, + "logps/rejected": -194.75865173339844, + "loss": 0.6921, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.001050114631652832, + "rewards/margins": 0.001585352816618979, + "rewards/margins_max": 0.00386607157997787, + "rewards/margins_min": -0.000695365946739912, + "rewards/margins_std": 0.00322542292997241, + "rewards/rejected": -0.0005352382431738079, + "step": 220 + }, + { + "epoch": 0.05, + "grad_norm": 0.50390625, + "learning_rate": 2.595936794582393e-07, + "logits/chosen": -1.2810051441192627, + "logits/rejected": -1.065126657485962, + "logps/chosen": -206.84463500976562, + "logps/rejected": -192.6746368408203, + "loss": 0.6922, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.001871080370619893, + "rewards/margins": 0.0016663589049130678, + "rewards/margins_max": 0.0038144756108522415, + "rewards/margins_min": -0.0004817581211682409, + "rewards/margins_std": 0.003037896240130067, + "rewards/rejected": 0.00020472146570682526, + "step": 230 + }, + { + "epoch": 0.05, + "grad_norm": 0.36328125, + "learning_rate": 2.708803611738149e-07, + "logits/chosen": -1.2675464153289795, + "logits/rejected": -0.9945747256278992, + "logps/chosen": -231.3415069580078, + "logps/rejected": -205.28945922851562, + "loss": 0.6921, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.0021059871651232243, + "rewards/margins": 0.001979165943339467, + "rewards/margins_max": 0.004107628017663956, + "rewards/margins_min": -0.0001492957817390561, + "rewards/margins_std": 0.0030100992880761623, + "rewards/rejected": 0.000126821527373977, + "step": 240 + }, + { + "epoch": 0.06, + "grad_norm": 0.330078125, + "learning_rate": 2.8216704288939053e-07, + "logits/chosen": -1.492440104484558, + "logits/rejected": -1.2370370626449585, + "logps/chosen": -222.0819549560547, + "logps/rejected": -246.7217559814453, + "loss": 0.692, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0020398388151079416, + "rewards/margins": 0.0020460395608097315, + "rewards/margins_max": 0.0041878316551446915, + "rewards/margins_min": -9.57526353886351e-05, + "rewards/margins_std": 0.0030289513524621725, + "rewards/rejected": -6.200841653480893e-06, + "step": 250 + }, + { + "epoch": 0.06, + "grad_norm": 0.54296875, + "learning_rate": 2.9345372460496616e-07, + "logits/chosen": -1.3066132068634033, + "logits/rejected": -0.9856610298156738, + "logps/chosen": -248.71292114257812, + "logps/rejected": -238.1698455810547, + "loss": 0.6919, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0023428858257830143, + "rewards/margins": 0.0027398201636970043, + "rewards/margins_max": 0.005885337945073843, + "rewards/margins_min": -0.0004056969773955643, + "rewards/margins_std": 0.004448432940989733, + "rewards/rejected": -0.00039693451253697276, + "step": 260 + }, + { + "epoch": 0.06, + "grad_norm": 0.32421875, + "learning_rate": 3.047404063205418e-07, + "logits/chosen": -1.2814630270004272, + "logits/rejected": -1.139473557472229, + "logps/chosen": -173.00698852539062, + "logps/rejected": -206.34634399414062, + "loss": 0.6915, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.0025371015071868896, + "rewards/margins": 0.0033828583545982838, + "rewards/margins_max": 0.006202323827892542, + "rewards/margins_min": 0.0005633925902657211, + "rewards/margins_std": 0.003987326752394438, + "rewards/rejected": -0.0008457564981654286, + "step": 270 + }, + { + "epoch": 0.06, + "grad_norm": 0.396484375, + "learning_rate": 3.160270880361174e-07, + "logits/chosen": -1.3292253017425537, + "logits/rejected": -1.0852205753326416, + "logps/chosen": -287.0424499511719, + "logps/rejected": -187.3642578125, + "loss": 0.6917, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0009970188839361072, + "rewards/margins": 0.002404967090114951, + "rewards/margins_max": 0.005034612491726875, + "rewards/margins_min": -0.00022467815142590553, + "rewards/margins_std": 0.003718879772350192, + "rewards/rejected": -0.0014079485554248095, + "step": 280 + }, + { + "epoch": 0.07, + "grad_norm": 0.36328125, + "learning_rate": 3.27313769751693e-07, + "logits/chosen": -1.4166836738586426, + "logits/rejected": -1.1044247150421143, + "logps/chosen": -228.96609497070312, + "logps/rejected": -210.9814910888672, + "loss": 0.6911, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.003559564473107457, + "rewards/margins": 0.0047764526680111885, + "rewards/margins_max": 0.007028171326965094, + "rewards/margins_min": 0.0025247344747185707, + "rewards/margins_std": 0.0031844109762459993, + "rewards/rejected": -0.0012168881949037313, + "step": 290 + }, + { + "epoch": 0.07, + "grad_norm": 0.3984375, + "learning_rate": 3.386004514672686e-07, + "logits/chosen": -1.3330157995224, + "logits/rejected": -1.1392042636871338, + "logps/chosen": -188.25454711914062, + "logps/rejected": -213.46505737304688, + "loss": 0.6912, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.0028670013416558504, + "rewards/margins": 0.004401736427098513, + "rewards/margins_max": 0.007208372000604868, + "rewards/margins_min": 0.0015951006207615137, + "rewards/margins_std": 0.003969182260334492, + "rewards/rejected": -0.0015347347361966968, + "step": 300 + }, + { + "epoch": 0.07, + "grad_norm": 0.302734375, + "learning_rate": 3.4988713318284423e-07, + "logits/chosen": -1.503370761871338, + "logits/rejected": -1.1487318277359009, + "logps/chosen": -274.6285705566406, + "logps/rejected": -273.916259765625, + "loss": 0.6904, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.004510351922363043, + "rewards/margins": 0.005128798075020313, + "rewards/margins_max": 0.008591363206505775, + "rewards/margins_min": 0.0016662331763654947, + "rewards/margins_std": 0.004896806553006172, + "rewards/rejected": -0.0006184463272802532, + "step": 310 + }, + { + "epoch": 0.07, + "grad_norm": 0.416015625, + "learning_rate": 3.6117381489841986e-07, + "logits/chosen": -1.273756504058838, + "logits/rejected": -1.0820574760437012, + "logps/chosen": -151.35482788085938, + "logps/rejected": -185.79380798339844, + "loss": 0.691, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0040741064585745335, + "rewards/margins": 0.004994765855371952, + "rewards/margins_max": 0.008913101628422737, + "rewards/margins_min": 0.0010764312464743853, + "rewards/margins_std": 0.0055413623340427876, + "rewards/rejected": -0.0009206599788740277, + "step": 320 + }, + { + "epoch": 0.07, + "grad_norm": 0.5859375, + "learning_rate": 3.724604966139955e-07, + "logits/chosen": -1.4037578105926514, + "logits/rejected": -1.166176438331604, + "logps/chosen": -241.0926055908203, + "logps/rejected": -225.5806427001953, + "loss": 0.6909, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.004344758577644825, + "rewards/margins": 0.0043768687173724174, + "rewards/margins_max": 0.007904845289885998, + "rewards/margins_min": 0.000848892261274159, + "rewards/margins_std": 0.004989312961697578, + "rewards/rejected": -3.211015064152889e-05, + "step": 330 + }, + { + "epoch": 0.08, + "grad_norm": 0.3828125, + "learning_rate": 3.837471783295711e-07, + "logits/chosen": -1.2480767965316772, + "logits/rejected": -1.085233211517334, + "logps/chosen": -203.14663696289062, + "logps/rejected": -192.7289581298828, + "loss": 0.691, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.004463316407054663, + "rewards/margins": 0.005001317244023085, + "rewards/margins_max": 0.007284390274435282, + "rewards/margins_min": 0.002718244446441531, + "rewards/margins_std": 0.0032287519425153732, + "rewards/rejected": -0.0005380000802688301, + "step": 340 + }, + { + "epoch": 0.08, + "grad_norm": 0.447265625, + "learning_rate": 3.9503386004514673e-07, + "logits/chosen": -1.2298004627227783, + "logits/rejected": -1.0297021865844727, + "logps/chosen": -223.0439453125, + "logps/rejected": -187.34703063964844, + "loss": 0.6903, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.002816079882904887, + "rewards/margins": 0.004824903793632984, + "rewards/margins_max": 0.007285586558282375, + "rewards/margins_min": 0.002364219631999731, + "rewards/margins_std": 0.0034799326676875353, + "rewards/rejected": -0.002008823212236166, + "step": 350 + }, + { + "epoch": 0.08, + "grad_norm": 0.427734375, + "learning_rate": 4.0632054176072236e-07, + "logits/chosen": -1.3313958644866943, + "logits/rejected": -1.11452317237854, + "logps/chosen": -213.21676635742188, + "logps/rejected": -171.91574096679688, + "loss": 0.6904, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.003997773863375187, + "rewards/margins": 0.005147859454154968, + "rewards/margins_max": 0.008886572904884815, + "rewards/margins_min": 0.0014091453049331903, + "rewards/margins_std": 0.005287340376526117, + "rewards/rejected": -0.0011500853579491377, + "step": 360 + }, + { + "epoch": 0.08, + "grad_norm": 0.44921875, + "learning_rate": 4.1760722347629793e-07, + "logits/chosen": -1.4377549886703491, + "logits/rejected": -1.2096660137176514, + "logps/chosen": -189.43063354492188, + "logps/rejected": -193.5634307861328, + "loss": 0.6896, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.006203767843544483, + "rewards/margins": 0.0066702342592179775, + "rewards/margins_max": 0.010389590635895729, + "rewards/margins_min": 0.0029508781153708696, + "rewards/margins_std": 0.005259964149445295, + "rewards/rejected": -0.00046646693954244256, + "step": 370 + }, + { + "epoch": 0.09, + "grad_norm": 0.47265625, + "learning_rate": 4.2889390519187356e-07, + "logits/chosen": -1.4636600017547607, + "logits/rejected": -1.0369333028793335, + "logps/chosen": -206.52481079101562, + "logps/rejected": -206.0133514404297, + "loss": 0.6894, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.006550403777509928, + "rewards/margins": 0.008564477786421776, + "rewards/margins_max": 0.012663841247558594, + "rewards/margins_min": 0.0044651152566075325, + "rewards/margins_std": 0.005797374993562698, + "rewards/rejected": -0.0020140744745731354, + "step": 380 + }, + { + "epoch": 0.09, + "grad_norm": 0.361328125, + "learning_rate": 4.401805869074492e-07, + "logits/chosen": -1.336717128753662, + "logits/rejected": -0.9665637016296387, + "logps/chosen": -212.57272338867188, + "logps/rejected": -176.93348693847656, + "loss": 0.6896, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.005571245681494474, + "rewards/margins": 0.007382750511169434, + "rewards/margins_max": 0.0123984944075346, + "rewards/margins_min": 0.0023670082446187735, + "rewards/margins_std": 0.0070933327078819275, + "rewards/rejected": -0.0018115064594894648, + "step": 390 + }, + { + "epoch": 0.09, + "grad_norm": 0.22265625, + "learning_rate": 4.514672686230248e-07, + "logits/chosen": -1.4351770877838135, + "logits/rejected": -1.120939016342163, + "logps/chosen": -203.64779663085938, + "logps/rejected": -206.8728485107422, + "loss": 0.6896, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.008293208666145802, + "rewards/margins": 0.009503757581114769, + "rewards/margins_max": 0.015738772228360176, + "rewards/margins_min": 0.0032687417697161436, + "rewards/margins_std": 0.008817643858492374, + "rewards/rejected": -0.0012105483328923583, + "step": 400 + }, + { + "epoch": 0.09, + "grad_norm": 0.349609375, + "learning_rate": 4.6275395033860043e-07, + "logits/chosen": -1.2792364358901978, + "logits/rejected": -1.0543757677078247, + "logps/chosen": -227.12451171875, + "logps/rejected": -197.13278198242188, + "loss": 0.6902, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.005497048608958721, + "rewards/margins": 0.006281238980591297, + "rewards/margins_max": 0.01041505765169859, + "rewards/margins_min": 0.0021474191453307867, + "rewards/margins_std": 0.005846103187650442, + "rewards/rejected": -0.0007841892656870186, + "step": 410 + }, + { + "epoch": 0.09, + "grad_norm": 0.330078125, + "learning_rate": 4.7404063205417606e-07, + "logits/chosen": -1.4206167459487915, + "logits/rejected": -1.1501795053482056, + "logps/chosen": -351.7145690917969, + "logps/rejected": -253.0180206298828, + "loss": 0.6895, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.007716922555118799, + "rewards/margins": 0.0065300120040774345, + "rewards/margins_max": 0.011199641041457653, + "rewards/margins_min": 0.0018603820353746414, + "rewards/margins_std": 0.006603854242712259, + "rewards/rejected": 0.0011869106674566865, + "step": 420 + }, + { + "epoch": 0.1, + "grad_norm": 0.416015625, + "learning_rate": 4.853273137697517e-07, + "logits/chosen": -1.460006833076477, + "logits/rejected": -1.2835047245025635, + "logps/chosen": -222.09024047851562, + "logps/rejected": -220.6402587890625, + "loss": 0.6896, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.006278887391090393, + "rewards/margins": 0.006416992750018835, + "rewards/margins_max": 0.011116179637610912, + "rewards/margins_min": 0.0017178058624267578, + "rewards/margins_std": 0.0066456543281674385, + "rewards/rejected": -0.000138105358928442, + "step": 430 + }, + { + "epoch": 0.1, + "grad_norm": 0.443359375, + "learning_rate": 4.966139954853273e-07, + "logits/chosen": -1.4451887607574463, + "logits/rejected": -1.039292573928833, + "logps/chosen": -272.3094177246094, + "logps/rejected": -215.6998291015625, + "loss": 0.6887, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.008401526138186455, + "rewards/margins": 0.0101736756041646, + "rewards/margins_max": 0.014983911998569965, + "rewards/margins_min": 0.005363441072404385, + "rewards/margins_std": 0.006802698131650686, + "rewards/rejected": -0.0017721500480547547, + "step": 440 + }, + { + "epoch": 0.1, + "grad_norm": 0.455078125, + "learning_rate": 4.999961856514226e-07, + "logits/chosen": -1.3807470798492432, + "logits/rejected": -1.0996475219726562, + "logps/chosen": -220.24142456054688, + "logps/rejected": -211.8799591064453, + "loss": 0.6883, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.007313003297895193, + "rewards/margins": 0.011259237304329872, + "rewards/margins_max": 0.01768874004483223, + "rewards/margins_min": 0.004829735029488802, + "rewards/margins_std": 0.009092690423130989, + "rewards/rejected": -0.003946233075112104, + "step": 450 + }, + { + "epoch": 0.1, + "grad_norm": 0.416015625, + "learning_rate": 4.999775034079764e-07, + "logits/chosen": -1.3176883459091187, + "logits/rejected": -0.9752163887023926, + "logps/chosen": -195.54110717773438, + "logps/rejected": -183.15243530273438, + "loss": 0.6883, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.008682606741786003, + "rewards/margins": 0.009235577657818794, + "rewards/margins_max": 0.01494432520121336, + "rewards/margins_min": 0.003526832442730665, + "rewards/margins_std": 0.008073386736214161, + "rewards/rejected": -0.0005529728368856013, + "step": 460 + }, + { + "epoch": 0.11, + "grad_norm": 0.439453125, + "learning_rate": 4.999432538370056e-07, + "logits/chosen": -1.3628463745117188, + "logits/rejected": -1.2302682399749756, + "logps/chosen": -167.83572387695312, + "logps/rejected": -175.12362670898438, + "loss": 0.6874, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.007012033369392157, + "rewards/margins": 0.01034090481698513, + "rewards/margins_max": 0.01631959341466427, + "rewards/margins_min": 0.004362213425338268, + "rewards/margins_std": 0.008455146104097366, + "rewards/rejected": -0.003328870516270399, + "step": 470 + }, + { + "epoch": 0.11, + "grad_norm": 0.33984375, + "learning_rate": 4.998934390713993e-07, + "logits/chosen": -1.4936786890029907, + "logits/rejected": -1.2519557476043701, + "logps/chosen": -191.031982421875, + "logps/rejected": -189.51622009277344, + "loss": 0.6875, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.005951897706836462, + "rewards/margins": 0.007916957139968872, + "rewards/margins_max": 0.013470885343849659, + "rewards/margins_min": 0.0023630294017493725, + "rewards/margins_std": 0.007854441180825233, + "rewards/rejected": -0.0019650589674711227, + "step": 480 + }, + { + "epoch": 0.11, + "grad_norm": 0.296875, + "learning_rate": 4.998280622133677e-07, + "logits/chosen": -1.2512634992599487, + "logits/rejected": -1.0366919040679932, + "logps/chosen": -243.0624237060547, + "logps/rejected": -188.01596069335938, + "loss": 0.6877, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.005484632216393948, + "rewards/margins": 0.010485135950148106, + "rewards/margins_max": 0.015126067213714123, + "rewards/margins_min": 0.005844203755259514, + "rewards/margins_std": 0.006563269533216953, + "rewards/rejected": -0.005000503268092871, + "step": 490 + }, + { + "epoch": 0.11, + "grad_norm": 0.3515625, + "learning_rate": 4.99747127334249e-07, + "logits/chosen": -1.501162052154541, + "logits/rejected": -1.1960660219192505, + "logps/chosen": -232.68899536132812, + "logps/rejected": -200.26589965820312, + "loss": 0.6877, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.007800704799592495, + "rewards/margins": 0.009690572507679462, + "rewards/margins_max": 0.017231535166502, + "rewards/margins_min": 0.002149612409994006, + "rewards/margins_std": 0.010664528235793114, + "rewards/rejected": -0.0018898677080869675, + "step": 500 + }, + { + "epoch": 0.12, + "grad_norm": 0.390625, + "learning_rate": 4.996506394742558e-07, + "logits/chosen": -1.3358343839645386, + "logits/rejected": -1.0514311790466309, + "logps/chosen": -169.50668334960938, + "logps/rejected": -182.19361877441406, + "loss": 0.6869, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.00925387255847454, + "rewards/margins": 0.013080105185508728, + "rewards/margins_max": 0.020481396466493607, + "rewards/margins_min": 0.005678813438862562, + "rewards/margins_std": 0.010467005893588066, + "rewards/rejected": -0.0038262330926954746, + "step": 510 + }, + { + "epoch": 0.12, + "grad_norm": 0.322265625, + "learning_rate": 4.995386046421613e-07, + "logits/chosen": -1.347860336303711, + "logits/rejected": -1.2029502391815186, + "logps/chosen": -208.1087646484375, + "logps/rejected": -204.68594360351562, + "loss": 0.6874, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.007269317749887705, + "rewards/margins": 0.012022108770906925, + "rewards/margins_max": 0.018901333212852478, + "rewards/margins_min": 0.005142883397638798, + "rewards/margins_std": 0.009728692471981049, + "rewards/rejected": -0.004752790089696646, + "step": 520 + }, + { + "epoch": 0.12, + "grad_norm": 0.400390625, + "learning_rate": 4.994110298149252e-07, + "logits/chosen": -1.4562270641326904, + "logits/rejected": -1.1467808485031128, + "logps/chosen": -229.633544921875, + "logps/rejected": -241.7759552001953, + "loss": 0.6845, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.013689137995243073, + "rewards/margins": 0.022165587171912193, + "rewards/margins_max": 0.031851477921009064, + "rewards/margins_min": 0.01247970201075077, + "rewards/margins_std": 0.013697914779186249, + "rewards/rejected": -0.008476451970636845, + "step": 530 + }, + { + "epoch": 0.12, + "grad_norm": 0.404296875, + "learning_rate": 4.992679229372587e-07, + "logits/chosen": -1.4816101789474487, + "logits/rejected": -1.114330530166626, + "logps/chosen": -219.7097625732422, + "logps/rejected": -195.24655151367188, + "loss": 0.6857, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.014646338298916817, + "rewards/margins": 0.01716429367661476, + "rewards/margins_max": 0.02907278575003147, + "rewards/margins_min": 0.0052558062598109245, + "rewards/margins_std": 0.01684114709496498, + "rewards/rejected": -0.002517957706004381, + "step": 540 + }, + { + "epoch": 0.12, + "grad_norm": 0.3515625, + "learning_rate": 4.991092929211304e-07, + "logits/chosen": -1.310151219367981, + "logits/rejected": -0.915892481803894, + "logps/chosen": -223.8758087158203, + "logps/rejected": -195.31063842773438, + "loss": 0.6852, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.009512702003121376, + "rewards/margins": 0.013930651359260082, + "rewards/margins_max": 0.022518616169691086, + "rewards/margins_min": 0.005342685617506504, + "rewards/margins_std": 0.012145215645432472, + "rewards/rejected": -0.004417949356138706, + "step": 550 + }, + { + "epoch": 0.13, + "grad_norm": 0.322265625, + "learning_rate": 4.989351496452109e-07, + "logits/chosen": -1.4654967784881592, + "logits/rejected": -1.1996474266052246, + "logps/chosen": -185.1067352294922, + "logps/rejected": -197.99349975585938, + "loss": 0.6862, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.010758506134152412, + "rewards/margins": 0.012723572552204132, + "rewards/margins_max": 0.02212095446884632, + "rewards/margins_min": 0.0033261929638683796, + "rewards/margins_std": 0.013289901427924633, + "rewards/rejected": -0.0019650678150355816, + "step": 560 + }, + { + "epoch": 0.13, + "grad_norm": 0.33984375, + "learning_rate": 4.987455039542576e-07, + "logits/chosen": -1.4222947359085083, + "logits/rejected": -1.238435983657837, + "logps/chosen": -155.63284301757812, + "logps/rejected": -185.66848754882812, + "loss": 0.6852, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.010133227333426476, + "rewards/margins": 0.013187420554459095, + "rewards/margins_max": 0.020901057869195938, + "rewards/margins_min": 0.005473785102367401, + "rewards/margins_std": 0.010908729396760464, + "rewards/rejected": -0.003054193453863263, + "step": 570 + }, + { + "epoch": 0.13, + "grad_norm": 0.2578125, + "learning_rate": 4.985403676584397e-07, + "logits/chosen": -1.5004786252975464, + "logits/rejected": -1.2731705904006958, + "logps/chosen": -139.58822631835938, + "logps/rejected": -161.0989990234375, + "loss": 0.686, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.00958799198269844, + "rewards/margins": 0.012686249800026417, + "rewards/margins_max": 0.020806532353162766, + "rewards/margins_min": 0.004565965384244919, + "rewards/margins_std": 0.011483816429972649, + "rewards/rejected": -0.00309825805015862, + "step": 580 + }, + { + "epoch": 0.13, + "grad_norm": 0.396484375, + "learning_rate": 4.983197535326024e-07, + "logits/chosen": -1.3106216192245483, + "logits/rejected": -0.8944109678268433, + "logps/chosen": -242.289794921875, + "logps/rejected": -325.63177490234375, + "loss": 0.6837, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.015353771857917309, + "rewards/margins": 0.020929958671331406, + "rewards/margins_max": 0.03252523019909859, + "rewards/margins_min": 0.00933468621224165, + "rewards/margins_std": 0.016398191452026367, + "rewards/rejected": -0.005576184950768948, + "step": 590 + }, + { + "epoch": 0.14, + "grad_norm": 0.314453125, + "learning_rate": 4.980836753154714e-07, + "logits/chosen": -1.361307144165039, + "logits/rejected": -1.1225764751434326, + "logps/chosen": -209.185546875, + "logps/rejected": -244.9947509765625, + "loss": 0.6836, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.014351347461342812, + "rewards/margins": 0.021498367190361023, + "rewards/margins_max": 0.03384038060903549, + "rewards/margins_min": 0.009156355634331703, + "rewards/margins_std": 0.01745423674583435, + "rewards/rejected": -0.007147020194679499, + "step": 600 + }, + { + "epoch": 0.14, + "grad_norm": 0.388671875, + "learning_rate": 4.978321477087972e-07, + "logits/chosen": -1.3299026489257812, + "logits/rejected": -1.0124680995941162, + "logps/chosen": -255.3600616455078, + "logps/rejected": -245.94985961914062, + "loss": 0.6834, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.018793154507875443, + "rewards/margins": 0.022329501807689667, + "rewards/margins_max": 0.03670556843280792, + "rewards/margins_min": 0.007953429594635963, + "rewards/margins_std": 0.02033083513379097, + "rewards/rejected": -0.003536344738677144, + "step": 610 + }, + { + "epoch": 0.14, + "grad_norm": 0.333984375, + "learning_rate": 4.975651863764402e-07, + "logits/chosen": -1.3493794202804565, + "logits/rejected": -1.0976240634918213, + "logps/chosen": -266.123291015625, + "logps/rejected": -219.5732421875, + "loss": 0.6863, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.01338467001914978, + "rewards/margins": 0.015727603808045387, + "rewards/margins_max": 0.028147747740149498, + "rewards/margins_min": 0.00330745754763484, + "rewards/margins_std": 0.017564736306667328, + "rewards/rejected": -0.002342933090403676, + "step": 620 + }, + { + "epoch": 0.14, + "grad_norm": 0.423828125, + "learning_rate": 4.972828079433943e-07, + "logits/chosen": -1.48280930519104, + "logits/rejected": -1.12116277217865, + "logps/chosen": -280.49884033203125, + "logps/rejected": -244.3009796142578, + "loss": 0.6836, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.01864319108426571, + "rewards/margins": 0.017971428111195564, + "rewards/margins_max": 0.028208374977111816, + "rewards/margins_min": 0.007734485901892185, + "rewards/margins_std": 0.014477225951850414, + "rewards/rejected": 0.0006717622163705528, + "step": 630 + }, + { + "epoch": 0.14, + "grad_norm": 0.41015625, + "learning_rate": 4.969850299947519e-07, + "logits/chosen": -1.4176355600357056, + "logits/rejected": -1.071342945098877, + "logps/chosen": -280.03997802734375, + "logps/rejected": -226.7299346923828, + "loss": 0.6822, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.01520386803895235, + "rewards/margins": 0.023597296327352524, + "rewards/margins_max": 0.0348820798099041, + "rewards/margins_min": 0.012312507256865501, + "rewards/margins_std": 0.01595909893512726, + "rewards/rejected": -0.008393426425755024, + "step": 640 + }, + { + "epoch": 0.15, + "grad_norm": 0.314453125, + "learning_rate": 4.966718710746093e-07, + "logits/chosen": -1.4015623331069946, + "logits/rejected": -1.1037607192993164, + "logps/chosen": -204.43484497070312, + "logps/rejected": -258.1009216308594, + "loss": 0.6816, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.024477144703269005, + "rewards/margins": 0.023006316274404526, + "rewards/margins_max": 0.03518133610486984, + "rewards/margins_min": 0.010831299237906933, + "rewards/margins_std": 0.017218075692653656, + "rewards/rejected": 0.0014708290109410882, + "step": 650 + }, + { + "epoch": 0.15, + "grad_norm": 0.4375, + "learning_rate": 4.963433506849114e-07, + "logits/chosen": -1.4727814197540283, + "logits/rejected": -1.2456432580947876, + "logps/chosen": -241.7753143310547, + "logps/rejected": -239.16921997070312, + "loss": 0.6839, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.014989370480179787, + "rewards/margins": 0.02068949118256569, + "rewards/margins_max": 0.03409608453512192, + "rewards/margins_min": 0.007282900158315897, + "rewards/margins_std": 0.018959784880280495, + "rewards/rejected": -0.005700122099369764, + "step": 660 + }, + { + "epoch": 0.15, + "grad_norm": 0.3671875, + "learning_rate": 4.959994892842371e-07, + "logits/chosen": -1.1937696933746338, + "logits/rejected": -1.015245795249939, + "logps/chosen": -267.62725830078125, + "logps/rejected": -282.30084228515625, + "loss": 0.6841, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.014117559418082237, + "rewards/margins": 0.01710415445268154, + "rewards/margins_max": 0.028150636702775955, + "rewards/margins_min": 0.006057672202587128, + "rewards/margins_std": 0.015622084960341454, + "rewards/rejected": -0.002986595267429948, + "step": 670 + }, + { + "epoch": 0.15, + "grad_norm": 0.3359375, + "learning_rate": 4.956403082865256e-07, + "logits/chosen": -1.2967605590820312, + "logits/rejected": -1.0511850118637085, + "logps/chosen": -222.7646484375, + "logps/rejected": -171.53314208984375, + "loss": 0.6826, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.01969096250832081, + "rewards/margins": 0.024944283068180084, + "rewards/margins_max": 0.03900546580553055, + "rewards/margins_min": 0.01088310219347477, + "rewards/margins_std": 0.019885513931512833, + "rewards/rejected": -0.005253321956843138, + "step": 680 + }, + { + "epoch": 0.16, + "grad_norm": 0.30078125, + "learning_rate": 4.952658300597427e-07, + "logits/chosen": -1.3829014301300049, + "logits/rejected": -1.1351196765899658, + "logps/chosen": -203.21536254882812, + "logps/rejected": -208.79837036132812, + "loss": 0.6818, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.019899826496839523, + "rewards/margins": 0.024015935137867928, + "rewards/margins_max": 0.034795574843883514, + "rewards/margins_min": 0.013236296363174915, + "rewards/margins_std": 0.015244710259139538, + "rewards/rejected": -0.00411610770970583, + "step": 690 + }, + { + "epoch": 0.16, + "grad_norm": 0.412109375, + "learning_rate": 4.948760779244875e-07, + "logits/chosen": -1.4555695056915283, + "logits/rejected": -1.1856902837753296, + "logps/chosen": -177.55966186523438, + "logps/rejected": -267.4813232421875, + "loss": 0.6831, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.0156883355230093, + "rewards/margins": 0.020723089575767517, + "rewards/margins_max": 0.02882271073758602, + "rewards/margins_min": 0.012623466551303864, + "rewards/margins_std": 0.011454595252871513, + "rewards/rejected": -0.005034754052758217, + "step": 700 + }, + { + "epoch": 0.16, + "grad_norm": 0.388671875, + "learning_rate": 4.94471076152541e-07, + "logits/chosen": -1.365276575088501, + "logits/rejected": -1.0673878192901611, + "logps/chosen": -189.3357696533203, + "logps/rejected": -177.35968017578125, + "loss": 0.6822, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.014079471118748188, + "rewards/margins": 0.02116897702217102, + "rewards/margins_max": 0.03162091225385666, + "rewards/margins_min": 0.010717044584453106, + "rewards/margins_std": 0.014781268313527107, + "rewards/rejected": -0.007089508231729269, + "step": 710 + }, + { + "epoch": 0.16, + "grad_norm": 0.2451171875, + "learning_rate": 4.940508499653537e-07, + "logits/chosen": -1.3948003053665161, + "logits/rejected": -1.1609618663787842, + "logps/chosen": -237.48367309570312, + "logps/rejected": -217.39926147460938, + "loss": 0.6821, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.01721605286002159, + "rewards/margins": 0.02224144898355007, + "rewards/margins_max": 0.03620148077607155, + "rewards/margins_min": 0.008281421847641468, + "rewards/margins_std": 0.019742459058761597, + "rewards/rejected": -0.005025396589189768, + "step": 720 + }, + { + "epoch": 0.17, + "grad_norm": 0.4921875, + "learning_rate": 4.936154255324751e-07, + "logits/chosen": -1.3077278137207031, + "logits/rejected": -1.0122146606445312, + "logps/chosen": -231.55557250976562, + "logps/rejected": -208.5164337158203, + "loss": 0.6804, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.015759726986289024, + "rewards/margins": 0.02600025199353695, + "rewards/margins_max": 0.04006841406226158, + "rewards/margins_min": 0.011932085268199444, + "rewards/margins_std": 0.019895387813448906, + "rewards/rejected": -0.010240525007247925, + "step": 730 + }, + { + "epoch": 0.17, + "grad_norm": 0.333984375, + "learning_rate": 4.931648299699244e-07, + "logits/chosen": -1.2586504220962524, + "logits/rejected": -1.0412757396697998, + "logps/chosen": -170.1586456298828, + "logps/rejected": -164.20999145507812, + "loss": 0.6825, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.010317305102944374, + "rewards/margins": 0.015293523669242859, + "rewards/margins_max": 0.02413867600262165, + "rewards/margins_min": 0.006448371801525354, + "rewards/margins_std": 0.01250893622636795, + "rewards/rejected": -0.004976219031959772, + "step": 740 + }, + { + "epoch": 0.17, + "grad_norm": 0.25390625, + "learning_rate": 4.926990913385014e-07, + "logits/chosen": -1.486669898033142, + "logits/rejected": -1.190189242362976, + "logps/chosen": -227.45913696289062, + "logps/rejected": -229.8372039794922, + "loss": 0.6821, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.025707051157951355, + "rewards/margins": 0.025935638695955276, + "rewards/margins_max": 0.04128013923764229, + "rewards/margins_min": 0.01059113722294569, + "rewards/margins_std": 0.021700400859117508, + "rewards/rejected": -0.00022858443844597787, + "step": 750 + }, + { + "epoch": 0.17, + "grad_norm": 0.330078125, + "learning_rate": 4.922182386420394e-07, + "logits/chosen": -1.4005054235458374, + "logits/rejected": -1.1406983137130737, + "logps/chosen": -199.43179321289062, + "logps/rejected": -217.20321655273438, + "loss": 0.6808, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.014500722289085388, + "rewards/margins": 0.027240172028541565, + "rewards/margins_max": 0.03960045427083969, + "rewards/margins_min": 0.014879885129630566, + "rewards/margins_std": 0.01748008280992508, + "rewards/rejected": -0.012739451602101326, + "step": 760 + }, + { + "epoch": 0.17, + "grad_norm": 0.443359375, + "learning_rate": 4.917223018255988e-07, + "logits/chosen": -1.3443114757537842, + "logits/rejected": -1.1962391138076782, + "logps/chosen": -216.9431915283203, + "logps/rejected": -217.5789031982422, + "loss": 0.6778, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.0227685384452343, + "rewards/margins": 0.02805909514427185, + "rewards/margins_max": 0.04799731448292732, + "rewards/margins_min": 0.008120874874293804, + "rewards/margins_std": 0.028196901082992554, + "rewards/rejected": -0.0052905576303601265, + "step": 770 + }, + { + "epoch": 0.18, + "grad_norm": 0.26171875, + "learning_rate": 4.912113117736021e-07, + "logits/chosen": -1.3593733310699463, + "logits/rejected": -1.0461851358413696, + "logps/chosen": -215.6609649658203, + "logps/rejected": -276.1067810058594, + "loss": 0.6778, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.02205914631485939, + "rewards/margins": 0.03457336500287056, + "rewards/margins_max": 0.05253841355443001, + "rewards/margins_min": 0.016608327627182007, + "rewards/margins_std": 0.025406410917639732, + "rewards/rejected": -0.012514224275946617, + "step": 780 + }, + { + "epoch": 0.18, + "grad_norm": 0.375, + "learning_rate": 4.906853003079108e-07, + "logits/chosen": -1.3251748085021973, + "logits/rejected": -0.9611700177192688, + "logps/chosen": -249.4725799560547, + "logps/rejected": -171.49986267089844, + "loss": 0.6824, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.014942338690161705, + "rewards/margins": 0.022041751071810722, + "rewards/margins_max": 0.03632887080311775, + "rewards/margins_min": 0.007754630409181118, + "rewards/margins_std": 0.020205039530992508, + "rewards/rejected": -0.007099410984665155, + "step": 790 + }, + { + "epoch": 0.18, + "grad_norm": 0.439453125, + "learning_rate": 4.901443001858437e-07, + "logits/chosen": -1.3746627569198608, + "logits/rejected": -1.0284743309020996, + "logps/chosen": -199.03555297851562, + "logps/rejected": -218.10317993164062, + "loss": 0.6763, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.021685587242245674, + "rewards/margins": 0.030330291017889977, + "rewards/margins_max": 0.04316481575369835, + "rewards/margins_min": 0.017495770007371902, + "rewards/margins_std": 0.018150756135582924, + "rewards/rejected": -0.008644704706966877, + "step": 800 + }, + { + "epoch": 0.18, + "grad_norm": 0.33203125, + "learning_rate": 4.895883450981369e-07, + "logits/chosen": -1.2774895429611206, + "logits/rejected": -1.1306382417678833, + "logps/chosen": -199.5352783203125, + "logps/rejected": -192.86988830566406, + "loss": 0.6816, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.013707217760384083, + "rewards/margins": 0.026056578382849693, + "rewards/margins_max": 0.03815007209777832, + "rewards/margins_min": 0.013963082805275917, + "rewards/margins_std": 0.017102785408496857, + "rewards/rejected": -0.012349361553788185, + "step": 810 + }, + { + "epoch": 0.19, + "grad_norm": 0.337890625, + "learning_rate": 4.890174696668458e-07, + "logits/chosen": -1.343576192855835, + "logits/rejected": -1.146897554397583, + "logps/chosen": -248.36767578125, + "logps/rejected": -211.9911346435547, + "loss": 0.6818, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.020799916237592697, + "rewards/margins": 0.02256534807384014, + "rewards/margins_max": 0.03596457839012146, + "rewards/margins_min": 0.009166114963591099, + "rewards/margins_std": 0.01894937828183174, + "rewards/rejected": -0.0017654303228482604, + "step": 820 + }, + { + "epoch": 0.19, + "grad_norm": 0.388671875, + "learning_rate": 4.884317094431885e-07, + "logits/chosen": -1.4735023975372314, + "logits/rejected": -1.2554306983947754, + "logps/chosen": -195.99563598632812, + "logps/rejected": -241.04043579101562, + "loss": 0.6798, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.017905376851558685, + "rewards/margins": 0.020723778754472733, + "rewards/margins_max": 0.03622407838702202, + "rewards/margins_min": 0.005223480518907309, + "rewards/margins_std": 0.021920733153820038, + "rewards/rejected": -0.0028184009715914726, + "step": 830 + }, + { + "epoch": 0.19, + "grad_norm": 0.279296875, + "learning_rate": 4.878311009053327e-07, + "logits/chosen": -1.453464150428772, + "logits/rejected": -1.2428507804870605, + "logps/chosen": -166.3644561767578, + "logps/rejected": -181.51950073242188, + "loss": 0.6806, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.016648324206471443, + "rewards/margins": 0.02127132937312126, + "rewards/margins_max": 0.03203215450048447, + "rewards/margins_min": 0.010510509833693504, + "rewards/margins_std": 0.01521809957921505, + "rewards/rejected": -0.004623007960617542, + "step": 840 + }, + { + "epoch": 0.19, + "grad_norm": 0.349609375, + "learning_rate": 4.872156814561235e-07, + "logits/chosen": -1.372947335243225, + "logits/rejected": -1.055177927017212, + "logps/chosen": -226.04931640625, + "logps/rejected": -227.8234405517578, + "loss": 0.6802, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.016364138573408127, + "rewards/margins": 0.02827121689915657, + "rewards/margins_max": 0.04151327162981033, + "rewards/margins_min": 0.01502915658056736, + "rewards/margins_std": 0.018727101385593414, + "rewards/rejected": -0.011907076463103294, + "step": 850 + }, + { + "epoch": 0.19, + "grad_norm": 0.4765625, + "learning_rate": 4.865854894207541e-07, + "logits/chosen": -1.3590134382247925, + "logits/rejected": -1.0492280721664429, + "logps/chosen": -294.7002868652344, + "logps/rejected": -269.7496032714844, + "loss": 0.679, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.01846398413181305, + "rewards/margins": 0.02630179561674595, + "rewards/margins_max": 0.04034816473722458, + "rewards/margins_min": 0.012255420908331871, + "rewards/margins_std": 0.01986457034945488, + "rewards/rejected": -0.007837808690965176, + "step": 860 + }, + { + "epoch": 0.2, + "grad_norm": 0.48828125, + "learning_rate": 4.859405640443793e-07, + "logits/chosen": -1.463749647140503, + "logits/rejected": -1.1530177593231201, + "logps/chosen": -241.88876342773438, + "logps/rejected": -212.88107299804688, + "loss": 0.6775, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.0185990110039711, + "rewards/margins": 0.027092251926660538, + "rewards/margins_max": 0.04592274874448776, + "rewards/margins_min": 0.00826175231486559, + "rewards/margins_std": 0.026630345731973648, + "rewards/rejected": -0.008493239060044289, + "step": 870 + }, + { + "epoch": 0.2, + "grad_norm": 0.4140625, + "learning_rate": 4.852809454896714e-07, + "logits/chosen": -1.3249889612197876, + "logits/rejected": -0.9983965158462524, + "logps/chosen": -231.47012329101562, + "logps/rejected": -232.07177734375, + "loss": 0.6766, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.020357009023427963, + "rewards/margins": 0.03504853695631027, + "rewards/margins_max": 0.04718896746635437, + "rewards/margins_min": 0.022908110171556473, + "rewards/margins_std": 0.017169155180454254, + "rewards/rejected": -0.014691528864204884, + "step": 880 + }, + { + "epoch": 0.2, + "grad_norm": 0.365234375, + "learning_rate": 4.846066748343192e-07, + "logits/chosen": -1.4718701839447021, + "logits/rejected": -1.3086658716201782, + "logps/chosen": -217.58248901367188, + "logps/rejected": -218.3019256591797, + "loss": 0.6807, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.022301137447357178, + "rewards/margins": 0.023601362481713295, + "rewards/margins_max": 0.04132290184497833, + "rewards/margins_min": 0.005879817996174097, + "rewards/margins_std": 0.02506204880774021, + "rewards/rejected": -0.0013002243358641863, + "step": 890 + }, + { + "epoch": 0.2, + "grad_norm": 0.4453125, + "learning_rate": 4.839177940684699e-07, + "logits/chosen": -1.291446328163147, + "logits/rejected": -1.1270654201507568, + "logps/chosen": -202.37107849121094, + "logps/rejected": -224.5233917236328, + "loss": 0.6798, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.018942687660455704, + "rewards/margins": 0.026033837348222733, + "rewards/margins_max": 0.04407670348882675, + "rewards/margins_min": 0.007990965619683266, + "rewards/margins_std": 0.025516469031572342, + "rewards/rejected": -0.007091146893799305, + "step": 900 + }, + { + "epoch": 0.21, + "grad_norm": 0.294921875, + "learning_rate": 4.832143460921137e-07, + "logits/chosen": -1.5121201276779175, + "logits/rejected": -1.2432599067687988, + "logps/chosen": -206.224365234375, + "logps/rejected": -197.02406311035156, + "loss": 0.6795, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.019623275846242905, + "rewards/margins": 0.025538703426718712, + "rewards/margins_max": 0.04129987955093384, + "rewards/margins_min": 0.009777536615729332, + "rewards/margins_std": 0.022289659827947617, + "rewards/rejected": -0.005915429908782244, + "step": 910 + }, + { + "epoch": 0.21, + "grad_norm": 0.474609375, + "learning_rate": 4.824963747124131e-07, + "logits/chosen": -1.4324082136154175, + "logits/rejected": -1.0861629247665405, + "logps/chosen": -193.4169921875, + "logps/rejected": -184.81045532226562, + "loss": 0.6762, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.021899651736021042, + "rewards/margins": 0.03429209813475609, + "rewards/margins_max": 0.04998604208230972, + "rewards/margins_min": 0.018598156049847603, + "rewards/margins_std": 0.02219458669424057, + "rewards/rejected": -0.012392444536089897, + "step": 920 + }, + { + "epoch": 0.21, + "grad_norm": 0.353515625, + "learning_rate": 4.817639246409737e-07, + "logits/chosen": -1.2531248331069946, + "logits/rejected": -1.0572696924209595, + "logps/chosen": -174.64602661132812, + "logps/rejected": -178.2739715576172, + "loss": 0.6808, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.019065044820308685, + "rewards/margins": 0.02028615027666092, + "rewards/margins_max": 0.030723493546247482, + "rewards/margins_min": 0.009848803281784058, + "rewards/margins_std": 0.014760637655854225, + "rewards/rejected": -0.0012211051071062684, + "step": 930 + }, + { + "epoch": 0.21, + "grad_norm": 0.51953125, + "learning_rate": 4.81017041491061e-07, + "logits/chosen": -1.3658699989318848, + "logits/rejected": -1.1312452554702759, + "logps/chosen": -328.56488037109375, + "logps/rejected": -290.95050048828125, + "loss": 0.6795, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.022527078166604042, + "rewards/margins": 0.029167424887418747, + "rewards/margins_max": 0.04566577076911926, + "rewards/margins_min": 0.012669073417782784, + "rewards/margins_std": 0.02333218976855278, + "rewards/rejected": -0.006640346255153418, + "step": 940 + }, + { + "epoch": 0.21, + "grad_norm": 0.34375, + "learning_rate": 4.802557717747587e-07, + "logits/chosen": -1.4883503913879395, + "logits/rejected": -1.2266268730163574, + "logps/chosen": -201.34490966796875, + "logps/rejected": -194.04742431640625, + "loss": 0.6779, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.02358989045023918, + "rewards/margins": 0.0302322618663311, + "rewards/margins_max": 0.04986436665058136, + "rewards/margins_min": 0.01060016080737114, + "rewards/margins_std": 0.027763986960053444, + "rewards/rejected": -0.006642372813075781, + "step": 950 + }, + { + "epoch": 0.22, + "grad_norm": 0.46875, + "learning_rate": 4.79480162900073e-07, + "logits/chosen": -1.3080474138259888, + "logits/rejected": -1.0905568599700928, + "logps/chosen": -230.7699737548828, + "logps/rejected": -253.7514190673828, + "loss": 0.6773, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.020071830600500107, + "rewards/margins": 0.03410849720239639, + "rewards/margins_max": 0.054665457457304, + "rewards/margins_min": 0.013551535084843636, + "rewards/margins_std": 0.029071932658553123, + "rewards/rejected": -0.014036668464541435, + "step": 960 + }, + { + "epoch": 0.22, + "grad_norm": 0.345703125, + "learning_rate": 4.7869026316798e-07, + "logits/chosen": -1.3408102989196777, + "logits/rejected": -1.0580967664718628, + "logps/chosen": -239.954833984375, + "logps/rejected": -267.51068115234375, + "loss": 0.6729, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.02294796332716942, + "rewards/margins": 0.04744488745927811, + "rewards/margins_max": 0.0710405632853508, + "rewards/margins_min": 0.023849209770560265, + "rewards/margins_std": 0.03336932882666588, + "rewards/rejected": -0.02449692226946354, + "step": 970 + }, + { + "epoch": 0.22, + "grad_norm": 0.369140625, + "learning_rate": 4.778861217694174e-07, + "logits/chosen": -1.266989827156067, + "logits/rejected": -1.0530259609222412, + "logps/chosen": -171.4249725341797, + "logps/rejected": -190.92129516601562, + "loss": 0.6759, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.02101651392877102, + "rewards/margins": 0.029414648190140724, + "rewards/margins_max": 0.04411066323518753, + "rewards/margins_min": 0.01471862755715847, + "rewards/margins_std": 0.020783307030797005, + "rewards/rejected": -0.008398131467401981, + "step": 980 + }, + { + "epoch": 0.22, + "grad_norm": 0.279296875, + "learning_rate": 4.770677887822217e-07, + "logits/chosen": -1.3193893432617188, + "logits/rejected": -1.0859930515289307, + "logps/chosen": -200.52943420410156, + "logps/rejected": -215.36544799804688, + "loss": 0.6751, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.024466924369335175, + "rewards/margins": 0.034502509981393814, + "rewards/margins_max": 0.05769481137394905, + "rewards/margins_min": 0.011310202069580555, + "rewards/margins_std": 0.032798875123262405, + "rewards/rejected": -0.01003558561205864, + "step": 990 + }, + { + "epoch": 0.23, + "grad_norm": 0.546875, + "learning_rate": 4.7623531516800907e-07, + "logits/chosen": -1.2966734170913696, + "logits/rejected": -1.0474090576171875, + "logps/chosen": -237.970947265625, + "logps/rejected": -179.91912841796875, + "loss": 0.6775, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.024659987539052963, + "rewards/margins": 0.02979280985891819, + "rewards/margins_max": 0.044087667018175125, + "rewards/margins_min": 0.015497950837016106, + "rewards/margins_std": 0.02021598257124424, + "rewards/rejected": -0.0051328218542039394, + "step": 1000 + }, + { + "epoch": 0.23, + "grad_norm": 0.390625, + "learning_rate": 4.753887527690026e-07, + "logits/chosen": -1.3687721490859985, + "logits/rejected": -1.0742595195770264, + "logps/chosen": -222.78317260742188, + "logps/rejected": -212.2716827392578, + "loss": 0.6775, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.022007988765835762, + "rewards/margins": 0.03305414319038391, + "rewards/margins_max": 0.04547839239239693, + "rewards/margins_min": 0.020629890263080597, + "rewards/margins_std": 0.01757054589688778, + "rewards/rejected": -0.011046156287193298, + "step": 1010 + }, + { + "epoch": 0.23, + "grad_norm": 0.373046875, + "learning_rate": 4.745281543048027e-07, + "logits/chosen": -1.4337875843048096, + "logits/rejected": -1.1076844930648804, + "logps/chosen": -229.92776489257812, + "logps/rejected": -248.87930297851562, + "loss": 0.6752, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.02592829242348671, + "rewards/margins": 0.04496189206838608, + "rewards/margins_max": 0.062208838760852814, + "rewards/margins_min": 0.02771494910120964, + "rewards/margins_std": 0.02439085766673088, + "rewards/rejected": -0.01903359964489937, + "step": 1020 + }, + { + "epoch": 0.23, + "grad_norm": 0.3984375, + "learning_rate": 4.736535733691047e-07, + "logits/chosen": -1.4199892282485962, + "logits/rejected": -1.1356946229934692, + "logps/chosen": -207.2522430419922, + "logps/rejected": -198.83889770507812, + "loss": 0.6761, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.021651072427630424, + "rewards/margins": 0.029218804091215134, + "rewards/margins_max": 0.04629982262849808, + "rewards/margins_min": 0.012137781828641891, + "rewards/margins_std": 0.02415620908141136, + "rewards/rejected": -0.0075677321292459965, + "step": 1030 + }, + { + "epoch": 0.24, + "grad_norm": 0.396484375, + "learning_rate": 4.7276506442636124e-07, + "logits/chosen": -1.4461133480072021, + "logits/rejected": -1.1284468173980713, + "logps/chosen": -249.48037719726562, + "logps/rejected": -189.9345245361328, + "loss": 0.677, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.024505896493792534, + "rewards/margins": 0.033815208822488785, + "rewards/margins_max": 0.04826827347278595, + "rewards/margins_min": 0.019362136721611023, + "rewards/margins_std": 0.020439723506569862, + "rewards/rejected": -0.009309305809438229, + "step": 1040 + }, + { + "epoch": 0.24, + "grad_norm": 0.38671875, + "learning_rate": 4.718626828083901e-07, + "logits/chosen": -1.3048738241195679, + "logits/rejected": -1.0318455696105957, + "logps/chosen": -246.67788696289062, + "logps/rejected": -189.42269897460938, + "loss": 0.6739, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.020917896181344986, + "rewards/margins": 0.03381497040390968, + "rewards/margins_max": 0.0529189296066761, + "rewards/margins_min": 0.014711007475852966, + "rewards/margins_std": 0.027017081156373024, + "rewards/rejected": -0.012897074222564697, + "step": 1050 + }, + { + "epoch": 0.24, + "grad_norm": 0.3984375, + "learning_rate": 4.709464847109291e-07, + "logits/chosen": -1.3040274381637573, + "logits/rejected": -1.0351606607437134, + "logps/chosen": -190.476318359375, + "logps/rejected": -199.35769653320312, + "loss": 0.6794, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.02514139749109745, + "rewards/margins": 0.03171490877866745, + "rewards/margins_max": 0.048779942095279694, + "rewards/margins_min": 0.014649872668087482, + "rewards/margins_std": 0.024133604019880295, + "rewards/rejected": -0.0065735094249248505, + "step": 1060 + }, + { + "epoch": 0.24, + "grad_norm": 0.357421875, + "learning_rate": 4.7001652719013605e-07, + "logits/chosen": -1.4631807804107666, + "logits/rejected": -1.2612019777297974, + "logps/chosen": -195.13742065429688, + "logps/rejected": -227.14114379882812, + "loss": 0.6751, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.017182352021336555, + "rewards/margins": 0.03211602568626404, + "rewards/margins_max": 0.05497432500123978, + "rewards/margins_min": 0.00925772450864315, + "rewards/margins_std": 0.03232651576399803, + "rewards/rejected": -0.014933672733604908, + "step": 1070 + }, + { + "epoch": 0.24, + "grad_norm": 0.5234375, + "learning_rate": 4.6907286815903534e-07, + "logits/chosen": -1.359162449836731, + "logits/rejected": -1.0865987539291382, + "logps/chosen": -206.14501953125, + "logps/rejected": -195.1103515625, + "loss": 0.6764, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.017108319327235222, + "rewards/margins": 0.03591880574822426, + "rewards/margins_max": 0.05570870637893677, + "rewards/margins_min": 0.016128908842802048, + "rewards/margins_std": 0.02798713743686676, + "rewards/rejected": -0.018810484558343887, + "step": 1080 + }, + { + "epoch": 0.25, + "grad_norm": 0.42578125, + "learning_rate": 4.681155663839121e-07, + "logits/chosen": -1.3436006307601929, + "logits/rejected": -1.09580659866333, + "logps/chosen": -205.31448364257812, + "logps/rejected": -212.5472412109375, + "loss": 0.6741, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.02455434389412403, + "rewards/margins": 0.04684922844171524, + "rewards/margins_max": 0.06827957183122635, + "rewards/margins_min": 0.025418881326913834, + "rewards/margins_std": 0.030307084321975708, + "rewards/rejected": -0.02229488268494606, + "step": 1090 + }, + { + "epoch": 0.25, + "grad_norm": 0.361328125, + "learning_rate": 4.671446814806521e-07, + "logits/chosen": -1.4599530696868896, + "logits/rejected": -1.3884165287017822, + "logps/chosen": -210.5607147216797, + "logps/rejected": -242.82540893554688, + "loss": 0.6714, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.02126205340027809, + "rewards/margins": 0.040613122284412384, + "rewards/margins_max": 0.06381477415561676, + "rewards/margins_min": 0.017411479726433754, + "rewards/margins_std": 0.03281208127737045, + "rewards/rejected": -0.019351070746779442, + "step": 1100 + }, + { + "epoch": 0.25, + "grad_norm": 0.361328125, + "learning_rate": 4.66160273911029e-07, + "logits/chosen": -1.3703720569610596, + "logits/rejected": -1.1701464653015137, + "logps/chosen": -183.18475341796875, + "logps/rejected": -199.93508911132812, + "loss": 0.6737, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03071439266204834, + "rewards/margins": 0.0536029115319252, + "rewards/margins_max": 0.07884959131479263, + "rewards/margins_min": 0.028356235474348068, + "rewards/margins_std": 0.03570418804883957, + "rewards/rejected": -0.022888517007231712, + "step": 1110 + }, + { + "epoch": 0.25, + "grad_norm": 0.408203125, + "learning_rate": 4.651624049789397e-07, + "logits/chosen": -1.5386936664581299, + "logits/rejected": -1.3589353561401367, + "logps/chosen": -189.0791778564453, + "logps/rejected": -204.80850219726562, + "loss": 0.6778, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.02617635391652584, + "rewards/margins": 0.03845269978046417, + "rewards/margins_max": 0.0595969632267952, + "rewards/margins_min": 0.017308443784713745, + "rewards/margins_std": 0.029902497306466103, + "rewards/rejected": -0.012276348657906055, + "step": 1120 + }, + { + "epoch": 0.26, + "grad_norm": 0.435546875, + "learning_rate": 4.64151136826586e-07, + "logits/chosen": -1.4097120761871338, + "logits/rejected": -1.1443564891815186, + "logps/chosen": -221.24960327148438, + "logps/rejected": -220.4223175048828, + "loss": 0.6741, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.026502933353185654, + "rewards/margins": 0.040553100407123566, + "rewards/margins_max": 0.055532462894916534, + "rewards/margins_min": 0.025573736056685448, + "rewards/margins_std": 0.02118402160704136, + "rewards/rejected": -0.014050167985260487, + "step": 1130 + }, + { + "epoch": 0.26, + "grad_norm": 0.34375, + "learning_rate": 4.631265324306053e-07, + "logits/chosen": -1.3383985757827759, + "logits/rejected": -1.179694652557373, + "logps/chosen": -178.04922485351562, + "logps/rejected": -221.2380828857422, + "loss": 0.6741, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.02292972058057785, + "rewards/margins": 0.03944787755608559, + "rewards/margins_max": 0.06199193745851517, + "rewards/margins_min": 0.01690381020307541, + "rewards/margins_std": 0.03188212215900421, + "rewards/rejected": -0.01651815138757229, + "step": 1140 + }, + { + "epoch": 0.26, + "grad_norm": 0.408203125, + "learning_rate": 4.6208865559814795e-07, + "logits/chosen": -1.224730134010315, + "logits/rejected": -1.0031412839889526, + "logps/chosen": -225.22817993164062, + "logps/rejected": -246.8232421875, + "loss": 0.6759, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.022863784804940224, + "rewards/margins": 0.033729564398527145, + "rewards/margins_max": 0.05401785299181938, + "rewards/margins_min": 0.013441281393170357, + "rewards/margins_std": 0.028691967949271202, + "rewards/rejected": -0.01086578331887722, + "step": 1150 + }, + { + "epoch": 0.26, + "grad_norm": 0.47265625, + "learning_rate": 4.610375709629047e-07, + "logits/chosen": -1.3221080303192139, + "logits/rejected": -1.0841983556747437, + "logps/chosen": -199.57630920410156, + "logps/rejected": -244.9528350830078, + "loss": 0.672, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.027617940679192543, + "rewards/margins": 0.04445469751954079, + "rewards/margins_max": 0.0699603408575058, + "rewards/margins_min": 0.018949061632156372, + "rewards/margins_std": 0.03607042506337166, + "rewards/rejected": -0.01683676615357399, + "step": 1160 + }, + { + "epoch": 0.26, + "grad_norm": 0.39453125, + "learning_rate": 4.5997334398108064e-07, + "logits/chosen": -1.380094289779663, + "logits/rejected": -1.0472290515899658, + "logps/chosen": -229.5024871826172, + "logps/rejected": -222.79287719726562, + "loss": 0.6742, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.027185887098312378, + "rewards/margins": 0.036655206233263016, + "rewards/margins_max": 0.0568247064948082, + "rewards/margins_min": 0.016485709697008133, + "rewards/margins_std": 0.02852398157119751, + "rewards/rejected": -0.009469323791563511, + "step": 1170 + }, + { + "epoch": 0.27, + "grad_norm": 0.2578125, + "learning_rate": 4.5889604092731954e-07, + "logits/chosen": -1.5819199085235596, + "logits/rejected": -1.360912561416626, + "logps/chosen": -153.7646026611328, + "logps/rejected": -182.9700164794922, + "loss": 0.6755, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.02463090792298317, + "rewards/margins": 0.03991164639592171, + "rewards/margins_max": 0.06291045248508453, + "rewards/margins_min": 0.016912829130887985, + "rewards/margins_std": 0.032525233924388885, + "rewards/rejected": -0.015280733816325665, + "step": 1180 + }, + { + "epoch": 0.27, + "grad_norm": 0.48046875, + "learning_rate": 4.578057288905765e-07, + "logits/chosen": -1.406576156616211, + "logits/rejected": -1.0120534896850586, + "logps/chosen": -206.16256713867188, + "logps/rejected": -236.18276977539062, + "loss": 0.6736, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.023824552074074745, + "rewards/margins": 0.03779350593686104, + "rewards/margins_max": 0.05464478209614754, + "rewards/margins_min": 0.02094222977757454, + "rewards/margins_std": 0.02383130043745041, + "rewards/rejected": -0.013968953862786293, + "step": 1190 + }, + { + "epoch": 0.27, + "grad_norm": 0.447265625, + "learning_rate": 4.567024757699398e-07, + "logits/chosen": -1.3428590297698975, + "logits/rejected": -1.1314809322357178, + "logps/chosen": -171.41761779785156, + "logps/rejected": -193.2316131591797, + "loss": 0.6775, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.02352306619286537, + "rewards/margins": 0.03855372592806816, + "rewards/margins_max": 0.05891672894358635, + "rewards/margins_min": 0.018190719187259674, + "rewards/margins_std": 0.028797641396522522, + "rewards/rejected": -0.015030661597847939, + "step": 1200 + }, + { + "epoch": 0.27, + "grad_norm": 0.416015625, + "learning_rate": 4.555863502704026e-07, + "logits/chosen": -1.3257324695587158, + "logits/rejected": -1.017549753189087, + "logps/chosen": -277.9920349121094, + "logps/rejected": -204.20068359375, + "loss": 0.6725, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.029865305870771408, + "rewards/margins": 0.04546823725104332, + "rewards/margins_max": 0.067599818110466, + "rewards/margins_min": 0.023336660116910934, + "rewards/margins_std": 0.03129877895116806, + "rewards/rejected": -0.015602931380271912, + "step": 1210 + }, + { + "epoch": 0.28, + "grad_norm": 0.458984375, + "learning_rate": 4.544574218985844e-07, + "logits/chosen": -1.3070051670074463, + "logits/rejected": -1.027001142501831, + "logps/chosen": -238.04275512695312, + "logps/rejected": -211.5153045654297, + "loss": 0.6731, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.031848080456256866, + "rewards/margins": 0.03921237960457802, + "rewards/margins_max": 0.0675017461180687, + "rewards/margins_min": 0.010923011228442192, + "rewards/margins_std": 0.04000721126794815, + "rewards/rejected": -0.007364300079643726, + "step": 1220 + }, + { + "epoch": 0.28, + "grad_norm": 0.330078125, + "learning_rate": 4.533157609584025e-07, + "logits/chosen": -1.5297473669052124, + "logits/rejected": -1.2360965013504028, + "logps/chosen": -245.1328582763672, + "logps/rejected": -284.0274353027344, + "loss": 0.6707, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.02966652438044548, + "rewards/margins": 0.03678882494568825, + "rewards/margins_max": 0.05802678316831589, + "rewards/margins_min": 0.015550869517028332, + "rewards/margins_std": 0.030035007745027542, + "rewards/rejected": -0.007122299168258905, + "step": 1230 + }, + { + "epoch": 0.28, + "grad_norm": 0.287109375, + "learning_rate": 4.521614385466938e-07, + "logits/chosen": -1.2434746026992798, + "logits/rejected": -0.9775724411010742, + "logps/chosen": -167.60195922851562, + "logps/rejected": -160.4337158203125, + "loss": 0.6751, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.027229581028223038, + "rewards/margins": 0.04429193213582039, + "rewards/margins_max": 0.06861467659473419, + "rewards/margins_min": 0.019969182088971138, + "rewards/margins_std": 0.034397564828395844, + "rewards/rejected": -0.017062349244952202, + "step": 1240 + }, + { + "epoch": 0.28, + "grad_norm": 0.35546875, + "learning_rate": 4.50994526548787e-07, + "logits/chosen": -1.4568400382995605, + "logits/rejected": -1.1636699438095093, + "logps/chosen": -277.090576171875, + "logps/rejected": -241.61618041992188, + "loss": 0.6747, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.02653464674949646, + "rewards/margins": 0.042523398995399475, + "rewards/margins_max": 0.06366567313671112, + "rewards/margins_min": 0.021381134167313576, + "rewards/margins_std": 0.029899677261710167, + "rewards/rejected": -0.015988752245903015, + "step": 1250 + }, + { + "epoch": 0.28, + "grad_norm": 0.47265625, + "learning_rate": 4.498150976340266e-07, + "logits/chosen": -1.289681077003479, + "logits/rejected": -1.1193758249282837, + "logps/chosen": -190.8822021484375, + "logps/rejected": -197.9546356201172, + "loss": 0.6754, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.019678102806210518, + "rewards/margins": 0.040147751569747925, + "rewards/margins_max": 0.06217692047357559, + "rewards/margins_min": 0.018118582665920258, + "rewards/margins_std": 0.031153947114944458, + "rewards/rejected": -0.020469646900892258, + "step": 1260 + }, + { + "epoch": 0.29, + "grad_norm": 0.3359375, + "learning_rate": 4.4862322525124676e-07, + "logits/chosen": -1.4055033922195435, + "logits/rejected": -1.1639750003814697, + "logps/chosen": -182.3616943359375, + "logps/rejected": -207.5341033935547, + "loss": 0.6735, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.030965592712163925, + "rewards/margins": 0.0414421483874321, + "rewards/margins_max": 0.06650801748037338, + "rewards/margins_min": 0.016376283019781113, + "rewards/margins_std": 0.03544849157333374, + "rewards/rejected": -0.010476559400558472, + "step": 1270 + }, + { + "epoch": 0.29, + "grad_norm": 0.458984375, + "learning_rate": 4.474189836241976e-07, + "logits/chosen": -1.3990777730941772, + "logits/rejected": -1.0768229961395264, + "logps/chosen": -271.35107421875, + "logps/rejected": -200.7308349609375, + "loss": 0.6718, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.025321820750832558, + "rewards/margins": 0.05165884643793106, + "rewards/margins_max": 0.07472828775644302, + "rewards/margins_min": 0.028589408844709396, + "rewards/margins_std": 0.032625116407871246, + "rewards/rejected": -0.026337021961808205, + "step": 1280 + }, + { + "epoch": 0.29, + "grad_norm": 0.384765625, + "learning_rate": 4.4620244774692296e-07, + "logits/chosen": -1.4728714227676392, + "logits/rejected": -1.2694370746612549, + "logps/chosen": -195.16342163085938, + "logps/rejected": -178.6471405029297, + "loss": 0.6743, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.024754667654633522, + "rewards/margins": 0.03827555477619171, + "rewards/margins_max": 0.05812322348356247, + "rewards/margins_min": 0.018427889794111252, + "rewards/margins_std": 0.02806883677840233, + "rewards/rejected": -0.013520888984203339, + "step": 1290 + }, + { + "epoch": 0.29, + "grad_norm": 0.3125, + "learning_rate": 4.4497369337908986e-07, + "logits/chosen": -1.3485519886016846, + "logits/rejected": -1.1609489917755127, + "logps/chosen": -242.7335968017578, + "logps/rejected": -244.931640625, + "loss": 0.6742, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.027875151485204697, + "rewards/margins": 0.03989388421177864, + "rewards/margins_max": 0.06232045218348503, + "rewards/margins_min": 0.017467325553297997, + "rewards/margins_std": 0.03171594813466072, + "rewards/rejected": -0.012018732726573944, + "step": 1300 + }, + { + "epoch": 0.3, + "grad_norm": 0.2890625, + "learning_rate": 4.437327970412709e-07, + "logits/chosen": -1.4367246627807617, + "logits/rejected": -1.09050714969635, + "logps/chosen": -230.6901397705078, + "logps/rejected": -198.34022521972656, + "loss": 0.673, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.027688348665833473, + "rewards/margins": 0.04100477322936058, + "rewards/margins_max": 0.06766901910305023, + "rewards/margins_min": 0.014340527355670929, + "rewards/margins_std": 0.037708934396505356, + "rewards/rejected": -0.013316420838236809, + "step": 1310 + }, + { + "epoch": 0.3, + "grad_norm": 0.51953125, + "learning_rate": 4.424798360101788e-07, + "logits/chosen": -1.3804186582565308, + "logits/rejected": -1.271663784980774, + "logps/chosen": -185.00445556640625, + "logps/rejected": -192.37820434570312, + "loss": 0.6714, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.02708057127892971, + "rewards/margins": 0.04213610291481018, + "rewards/margins_max": 0.059732962399721146, + "rewards/margins_min": 0.024539247155189514, + "rewards/margins_std": 0.024885715916752815, + "rewards/rejected": -0.01505553163588047, + "step": 1320 + }, + { + "epoch": 0.3, + "grad_norm": 0.361328125, + "learning_rate": 4.41214888313854e-07, + "logits/chosen": -1.3674253225326538, + "logits/rejected": -1.2365589141845703, + "logps/chosen": -196.21676635742188, + "logps/rejected": -256.8332214355469, + "loss": 0.6732, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.02884775400161743, + "rewards/margins": 0.03929910063743591, + "rewards/margins_max": 0.06681930273771286, + "rewards/margins_min": 0.011778893880546093, + "rewards/margins_std": 0.038919445127248764, + "rewards/rejected": -0.010451346635818481, + "step": 1330 + }, + { + "epoch": 0.3, + "grad_norm": 0.50390625, + "learning_rate": 4.3993803272680553e-07, + "logits/chosen": -1.363981008529663, + "logits/rejected": -1.0240668058395386, + "logps/chosen": -252.509033203125, + "logps/rejected": -234.2544403076172, + "loss": 0.6714, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.031312428414821625, + "rewards/margins": 0.05178850144147873, + "rewards/margins_max": 0.08234371989965439, + "rewards/margins_min": 0.02123328484594822, + "rewards/margins_std": 0.043211597949266434, + "rewards/rejected": -0.020476069301366806, + "step": 1340 + }, + { + "epoch": 0.31, + "grad_norm": 0.396484375, + "learning_rate": 4.386493487651051e-07, + "logits/chosen": -1.4066898822784424, + "logits/rejected": -1.2005523443222046, + "logps/chosen": -204.56298828125, + "logps/rejected": -221.59033203125, + "loss": 0.6722, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.026341985911130905, + "rewards/margins": 0.03979109227657318, + "rewards/margins_max": 0.0626225396990776, + "rewards/margins_min": 0.016959641128778458, + "rewards/margins_std": 0.03228854760527611, + "rewards/rejected": -0.013449104502797127, + "step": 1350 + }, + { + "epoch": 0.31, + "grad_norm": 0.306640625, + "learning_rate": 4.373489166814358e-07, + "logits/chosen": -1.417218804359436, + "logits/rejected": -1.0997976064682007, + "logps/chosen": -241.2794189453125, + "logps/rejected": -208.86245727539062, + "loss": 0.6752, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.023142900317907333, + "rewards/margins": 0.0397111177444458, + "rewards/margins_max": 0.059913743287324905, + "rewards/margins_min": 0.019508492201566696, + "rewards/margins_std": 0.02857082709670067, + "rewards/rejected": -0.01656820997595787, + "step": 1360 + }, + { + "epoch": 0.31, + "grad_norm": 0.4453125, + "learning_rate": 4.360368174600937e-07, + "logits/chosen": -1.39288330078125, + "logits/rejected": -1.104473352432251, + "logps/chosen": -232.40640258789062, + "logps/rejected": -160.45449829101562, + "loss": 0.6746, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.025461440905928612, + "rewards/margins": 0.036584287881851196, + "rewards/margins_max": 0.05490097403526306, + "rewards/margins_min": 0.018267596140503883, + "rewards/margins_std": 0.02590371109545231, + "rewards/rejected": -0.01112284604460001, + "step": 1370 + }, + { + "epoch": 0.31, + "grad_norm": 0.431640625, + "learning_rate": 4.34713132811945e-07, + "logits/chosen": -1.3218541145324707, + "logits/rejected": -1.1150844097137451, + "logps/chosen": -207.88314819335938, + "logps/rejected": -225.9438018798828, + "loss": 0.6722, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.029938017949461937, + "rewards/margins": 0.04521246999502182, + "rewards/margins_max": 0.06895864754915237, + "rewards/margins_min": 0.021466294303536415, + "rewards/margins_std": 0.033582162111997604, + "rewards/rejected": -0.015274452045559883, + "step": 1380 + }, + { + "epoch": 0.31, + "grad_norm": 0.4296875, + "learning_rate": 4.333779451693372e-07, + "logits/chosen": -1.3209631443023682, + "logits/rejected": -1.0849366188049316, + "logps/chosen": -196.41134643554688, + "logps/rejected": -191.35043334960938, + "loss": 0.6748, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.023151496425271034, + "rewards/margins": 0.039687685668468475, + "rewards/margins_max": 0.060737740248441696, + "rewards/margins_min": 0.018637629225850105, + "rewards/margins_std": 0.029769275337457657, + "rewards/rejected": -0.01653619296848774, + "step": 1390 + }, + { + "epoch": 0.32, + "grad_norm": 0.365234375, + "learning_rate": 4.32031337680966e-07, + "logits/chosen": -1.3408453464508057, + "logits/rejected": -1.014647126197815, + "logps/chosen": -177.572998046875, + "logps/rejected": -179.5858612060547, + "loss": 0.6726, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.021111497655510902, + "rewards/margins": 0.04325110465288162, + "rewards/margins_max": 0.06425291299819946, + "rewards/margins_min": 0.022249290719628334, + "rewards/margins_std": 0.029701050370931625, + "rewards/rejected": -0.02213960886001587, + "step": 1400 + }, + { + "epoch": 0.32, + "grad_norm": 0.322265625, + "learning_rate": 4.306733942066969e-07, + "logits/chosen": -1.2504961490631104, + "logits/rejected": -1.0722445249557495, + "logps/chosen": -207.9407196044922, + "logps/rejected": -184.21514892578125, + "loss": 0.6754, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.020327100530266762, + "rewards/margins": 0.03269373998045921, + "rewards/margins_max": 0.05034772679209709, + "rewards/margins_min": 0.01503975223749876, + "rewards/margins_std": 0.024966508150100708, + "rewards/rejected": -0.012366642244160175, + "step": 1410 + }, + { + "epoch": 0.32, + "grad_norm": 0.37890625, + "learning_rate": 4.29304199312343e-07, + "logits/chosen": -1.3490780591964722, + "logits/rejected": -1.0001791715621948, + "logps/chosen": -253.0603790283203, + "logps/rejected": -220.7419891357422, + "loss": 0.6723, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.023180747404694557, + "rewards/margins": 0.034993939101696014, + "rewards/margins_max": 0.05462303012609482, + "rewards/margins_min": 0.015364840626716614, + "rewards/margins_std": 0.02775973081588745, + "rewards/rejected": -0.011813190765678883, + "step": 1420 + }, + { + "epoch": 0.32, + "grad_norm": 0.41796875, + "learning_rate": 4.279238382643984e-07, + "logits/chosen": -1.3278534412384033, + "logits/rejected": -1.0635864734649658, + "logps/chosen": -212.005615234375, + "logps/rejected": -251.8597869873047, + "loss": 0.6719, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.025677820667624474, + "rewards/margins": 0.04707244411110878, + "rewards/margins_max": 0.07336616516113281, + "rewards/margins_min": 0.020778721198439598, + "rewards/margins_std": 0.037184938788414, + "rewards/rejected": -0.021394621580839157, + "step": 1430 + }, + { + "epoch": 0.33, + "grad_norm": 0.5078125, + "learning_rate": 4.26532397024729e-07, + "logits/chosen": -1.331188440322876, + "logits/rejected": -1.1710078716278076, + "logps/chosen": -202.5389404296875, + "logps/rejected": -222.7393798828125, + "loss": 0.6734, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.036635275930166245, + "rewards/margins": 0.03808824345469475, + "rewards/margins_max": 0.05488138273358345, + "rewards/margins_min": 0.021295102313160896, + "rewards/margins_std": 0.023749085143208504, + "rewards/rejected": -0.001452968455851078, + "step": 1440 + }, + { + "epoch": 0.33, + "grad_norm": 0.359375, + "learning_rate": 4.251299622452179e-07, + "logits/chosen": -1.3149991035461426, + "logits/rejected": -1.2217159271240234, + "logps/chosen": -144.6352996826172, + "logps/rejected": -194.59603881835938, + "loss": 0.6721, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.02758004143834114, + "rewards/margins": 0.03688063472509384, + "rewards/margins_max": 0.05518157035112381, + "rewards/margins_min": 0.018579700961709023, + "rewards/margins_std": 0.025881433859467506, + "rewards/rejected": -0.009300598874688148, + "step": 1450 + }, + { + "epoch": 0.33, + "grad_norm": 0.318359375, + "learning_rate": 4.2371662126237074e-07, + "logits/chosen": -1.2196018695831299, + "logits/rejected": -1.113595724105835, + "logps/chosen": -187.85397338867188, + "logps/rejected": -217.3521728515625, + "loss": 0.6728, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.020274577662348747, + "rewards/margins": 0.04040878638625145, + "rewards/margins_max": 0.06457223743200302, + "rewards/margins_min": 0.016245335340499878, + "rewards/margins_std": 0.03417228162288666, + "rewards/rejected": -0.02013421058654785, + "step": 1460 + }, + { + "epoch": 0.33, + "grad_norm": 0.453125, + "learning_rate": 4.222924620918755e-07, + "logits/chosen": -1.317690134048462, + "logits/rejected": -1.0494765043258667, + "logps/chosen": -182.76686096191406, + "logps/rejected": -199.16482543945312, + "loss": 0.673, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.032194025814533234, + "rewards/margins": 0.035994671285152435, + "rewards/margins_max": 0.05865035578608513, + "rewards/margins_min": 0.013338984921574593, + "rewards/margins_std": 0.03203997761011124, + "rewards/rejected": -0.0038006496615707874, + "step": 1470 + }, + { + "epoch": 0.33, + "grad_norm": 0.50390625, + "learning_rate": 4.2085757342312203e-07, + "logits/chosen": -1.37278151512146, + "logits/rejected": -1.064212679862976, + "logps/chosen": -313.87835693359375, + "logps/rejected": -252.13623046875, + "loss": 0.6708, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.031817976385354996, + "rewards/margins": 0.048167884349823, + "rewards/margins_max": 0.07686305046081543, + "rewards/margins_min": 0.019472714513540268, + "rewards/margins_std": 0.04058109596371651, + "rewards/rejected": -0.016349902376532555, + "step": 1480 + }, + { + "epoch": 0.34, + "grad_norm": 0.40234375, + "learning_rate": 4.1941204461367873e-07, + "logits/chosen": -1.3766664266586304, + "logits/rejected": -1.1429855823516846, + "logps/chosen": -245.5808563232422, + "logps/rejected": -208.631591796875, + "loss": 0.6725, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.02773849107325077, + "rewards/margins": 0.03844064846634865, + "rewards/margins_max": 0.06190527603030205, + "rewards/margins_min": 0.0149760153144598, + "rewards/margins_std": 0.0331839993596077, + "rewards/rejected": -0.010702153667807579, + "step": 1490 + }, + { + "epoch": 0.34, + "grad_norm": 0.376953125, + "learning_rate": 4.1795596568372795e-07, + "logits/chosen": -1.3350478410720825, + "logits/rejected": -1.0876274108886719, + "logps/chosen": -270.3304138183594, + "logps/rejected": -191.8568115234375, + "loss": 0.671, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.027609974145889282, + "rewards/margins": 0.04803454130887985, + "rewards/margins_max": 0.07594247162342072, + "rewards/margins_min": 0.02012660540640354, + "rewards/margins_std": 0.03946777805685997, + "rewards/rejected": -0.02042456530034542, + "step": 1500 + }, + { + "epoch": 0.34, + "grad_norm": 0.37890625, + "learning_rate": 4.1648942731045984e-07, + "logits/chosen": -1.494276762008667, + "logits/rejected": -1.0351579189300537, + "logps/chosen": -225.27645874023438, + "logps/rejected": -193.90029907226562, + "loss": 0.67, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.03104591928422451, + "rewards/margins": 0.04760845750570297, + "rewards/margins_max": 0.07213185727596283, + "rewards/margins_min": 0.023085057735443115, + "rewards/margins_std": 0.034681327641010284, + "rewards/rejected": -0.016562536358833313, + "step": 1510 + }, + { + "epoch": 0.34, + "grad_norm": 0.31640625, + "learning_rate": 4.1501252082242536e-07, + "logits/chosen": -1.4287065267562866, + "logits/rejected": -1.3087886571884155, + "logps/chosen": -167.2257843017578, + "logps/rejected": -184.9642333984375, + "loss": 0.6733, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.02862406335771084, + "rewards/margins": 0.03685583546757698, + "rewards/margins_max": 0.05723712965846062, + "rewards/margins_min": 0.016474535688757896, + "rewards/margins_std": 0.028823506087064743, + "rewards/rejected": -0.008231772109866142, + "step": 1520 + }, + { + "epoch": 0.35, + "grad_norm": 0.34765625, + "learning_rate": 4.1352533819384916e-07, + "logits/chosen": -1.3988134860992432, + "logits/rejected": -1.0942933559417725, + "logps/chosen": -203.42141723632812, + "logps/rejected": -211.5314178466797, + "loss": 0.6701, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.023281631991267204, + "rewards/margins": 0.043478261679410934, + "rewards/margins_max": 0.0694703757762909, + "rewards/margins_min": 0.017486149445176125, + "rewards/margins_std": 0.03675839677453041, + "rewards/rejected": -0.02019662782549858, + "step": 1530 + }, + { + "epoch": 0.35, + "grad_norm": 0.419921875, + "learning_rate": 4.120279720389014e-07, + "logits/chosen": -1.2943384647369385, + "logits/rejected": -1.0621731281280518, + "logps/chosen": -198.1253204345703, + "logps/rejected": -172.11181640625, + "loss": 0.6728, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03131099045276642, + "rewards/margins": 0.034897904843091965, + "rewards/margins_max": 0.059048790484666824, + "rewards/margins_min": 0.01074702013283968, + "rewards/margins_std": 0.0341545045375824, + "rewards/rejected": -0.003586915787309408, + "step": 1540 + }, + { + "epoch": 0.35, + "grad_norm": 0.380859375, + "learning_rate": 4.1052051560593065e-07, + "logits/chosen": -1.3193204402923584, + "logits/rejected": -1.0119553804397583, + "logps/chosen": -211.03005981445312, + "logps/rejected": -193.88609313964844, + "loss": 0.671, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.023582275956869125, + "rewards/margins": 0.044007860124111176, + "rewards/margins_max": 0.06155252456665039, + "rewards/margins_min": 0.026463191956281662, + "rewards/margins_std": 0.024811910465359688, + "rewards/rejected": -0.0204255860298872, + "step": 1550 + }, + { + "epoch": 0.35, + "grad_norm": 0.515625, + "learning_rate": 4.0900306277165666e-07, + "logits/chosen": -1.2885864973068237, + "logits/rejected": -1.061097502708435, + "logps/chosen": -202.5950164794922, + "logps/rejected": -194.2788848876953, + "loss": 0.6711, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.0246761254966259, + "rewards/margins": 0.03755421191453934, + "rewards/margins_max": 0.05342765524983406, + "rewards/margins_min": 0.021680768579244614, + "rewards/margins_std": 0.02244843915104866, + "rewards/rejected": -0.012878087349236012, + "step": 1560 + }, + { + "epoch": 0.35, + "grad_norm": 0.462890625, + "learning_rate": 4.0747570803532407e-07, + "logits/chosen": -1.2883301973342896, + "logits/rejected": -1.0276634693145752, + "logps/chosen": -267.29791259765625, + "logps/rejected": -222.98080444335938, + "loss": 0.6727, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.021301481872797012, + "rewards/margins": 0.040718015283346176, + "rewards/margins_max": 0.059512365609407425, + "rewards/margins_min": 0.021923670545220375, + "rewards/margins_std": 0.026579225435853004, + "rewards/rejected": -0.019416535273194313, + "step": 1570 + }, + { + "epoch": 0.36, + "grad_norm": 0.36328125, + "learning_rate": 4.059385465128178e-07, + "logits/chosen": -1.2792766094207764, + "logits/rejected": -1.0782058238983154, + "logps/chosen": -183.18736267089844, + "logps/rejected": -193.03024291992188, + "loss": 0.6694, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.02231001853942871, + "rewards/margins": 0.035427968949079514, + "rewards/margins_max": 0.057831812649965286, + "rewards/margins_min": 0.013024131767451763, + "rewards/margins_std": 0.03168381005525589, + "rewards/rejected": -0.013117952272295952, + "step": 1580 + }, + { + "epoch": 0.36, + "grad_norm": 0.36328125, + "learning_rate": 4.043916739307394e-07, + "logits/chosen": -1.3757706880569458, + "logits/rejected": -1.0660990476608276, + "logps/chosen": -192.93234252929688, + "logps/rejected": -202.1962890625, + "loss": 0.6675, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03153764829039574, + "rewards/margins": 0.055783580988645554, + "rewards/margins_max": 0.0908677726984024, + "rewards/margins_min": 0.020699385553598404, + "rewards/margins_std": 0.04961654543876648, + "rewards/rejected": -0.024245930835604668, + "step": 1590 + }, + { + "epoch": 0.36, + "grad_norm": 0.46484375, + "learning_rate": 4.0283518662044595e-07, + "logits/chosen": -1.4345725774765015, + "logits/rejected": -1.1135655641555786, + "logps/chosen": -259.4965515136719, + "logps/rejected": -224.1306915283203, + "loss": 0.6715, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.02794257365167141, + "rewards/margins": 0.03813885152339935, + "rewards/margins_max": 0.05748515576124191, + "rewards/margins_min": 0.01879255101084709, + "rewards/margins_std": 0.02735980786383152, + "rewards/rejected": -0.010196278803050518, + "step": 1600 + }, + { + "epoch": 0.36, + "grad_norm": 0.42578125, + "learning_rate": 4.012691815120508e-07, + "logits/chosen": -1.5209296941757202, + "logits/rejected": -1.0353261232376099, + "logps/chosen": -216.0679931640625, + "logps/rejected": -189.17874145507812, + "loss": 0.6696, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.043269019573926926, + "rewards/margins": 0.05212603881955147, + "rewards/margins_max": 0.07273893058300018, + "rewards/margins_min": 0.03151315450668335, + "rewards/margins_std": 0.02915102243423462, + "rewards/rejected": -0.00885702483355999, + "step": 1610 + }, + { + "epoch": 0.37, + "grad_norm": 0.51953125, + "learning_rate": 3.996937561283873e-07, + "logits/chosen": -1.568519115447998, + "logits/rejected": -0.9833946228027344, + "logps/chosen": -265.3883056640625, + "logps/rejected": -200.9881134033203, + "loss": 0.6709, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.025037512183189392, + "rewards/margins": 0.04677094891667366, + "rewards/margins_max": 0.08109462261199951, + "rewards/margins_min": 0.01244727335870266, + "rewards/margins_std": 0.04854099825024605, + "rewards/rejected": -0.021733436733484268, + "step": 1620 + }, + { + "epoch": 0.37, + "grad_norm": 0.451171875, + "learning_rate": 3.981090085789358e-07, + "logits/chosen": -1.216138243675232, + "logits/rejected": -1.0891082286834717, + "logps/chosen": -274.11761474609375, + "logps/rejected": -265.92626953125, + "loss": 0.6756, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.025747573003172874, + "rewards/margins": 0.03598998114466667, + "rewards/margins_max": 0.060326360166072845, + "rewards/margins_min": 0.011653609573841095, + "rewards/margins_std": 0.03441683202981949, + "rewards/rejected": -0.010242411866784096, + "step": 1630 + }, + { + "epoch": 0.37, + "grad_norm": 0.53125, + "learning_rate": 3.965150375537137e-07, + "logits/chosen": -1.4957590103149414, + "logits/rejected": -1.1819343566894531, + "logps/chosen": -210.2582244873047, + "logps/rejected": -211.7596893310547, + "loss": 0.6687, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.029509777203202248, + "rewards/margins": 0.04957672208547592, + "rewards/margins_max": 0.06985175609588623, + "rewards/margins_min": 0.02930169366300106, + "rewards/margins_std": 0.028673222288489342, + "rewards/rejected": -0.020066948607563972, + "step": 1640 + }, + { + "epoch": 0.37, + "grad_norm": 0.3984375, + "learning_rate": 3.949119423171294e-07, + "logits/chosen": -1.420458197593689, + "logits/rejected": -1.2591960430145264, + "logps/chosen": -172.73870849609375, + "logps/rejected": -178.8004608154297, + "loss": 0.6683, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.029484177008271217, + "rewards/margins": 0.04394937679171562, + "rewards/margins_max": 0.06709001213312149, + "rewards/margins_min": 0.020808745175600052, + "rewards/margins_std": 0.032725803554058075, + "rewards/rejected": -0.014465202577412128, + "step": 1650 + }, + { + "epoch": 0.38, + "grad_norm": 0.35546875, + "learning_rate": 3.9329982270180083e-07, + "logits/chosen": -1.2149207592010498, + "logits/rejected": -1.0226820707321167, + "logps/chosen": -196.73216247558594, + "logps/rejected": -183.21646118164062, + "loss": 0.6685, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.01719057932496071, + "rewards/margins": 0.044677041471004486, + "rewards/margins_max": 0.0677308589220047, + "rewards/margins_min": 0.02162322774529457, + "rewards/margins_std": 0.03260301426053047, + "rewards/rejected": -0.027486462146043777, + "step": 1660 + }, + { + "epoch": 0.38, + "grad_norm": 0.40625, + "learning_rate": 3.916787791023386e-07, + "logits/chosen": -1.4802358150482178, + "logits/rejected": -1.118082880973816, + "logps/chosen": -208.3599853515625, + "logps/rejected": -180.79049682617188, + "loss": 0.6719, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.033904947340488434, + "rewards/margins": 0.037811677902936935, + "rewards/margins_max": 0.06144796684384346, + "rewards/margins_min": 0.014175387099385262, + "rewards/margins_std": 0.033426761627197266, + "rewards/rejected": -0.00390672916546464, + "step": 1670 + }, + { + "epoch": 0.38, + "grad_norm": 0.33203125, + "learning_rate": 3.900489124690932e-07, + "logits/chosen": -1.4334280490875244, + "logits/rejected": -1.1188874244689941, + "logps/chosen": -201.74215698242188, + "logps/rejected": -270.0927429199219, + "loss": 0.672, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.031066913157701492, + "rewards/margins": 0.04503883793950081, + "rewards/margins_max": 0.06810437142848969, + "rewards/margins_min": 0.021973304450511932, + "rewards/margins_std": 0.03261958807706833, + "rewards/rejected": -0.013971921987831593, + "step": 1680 + }, + { + "epoch": 0.38, + "grad_norm": 0.484375, + "learning_rate": 3.884103243018693e-07, + "logits/chosen": -1.2650120258331299, + "logits/rejected": -1.0881750583648682, + "logps/chosen": -220.5270538330078, + "logps/rejected": -243.7740478515625, + "loss": 0.6711, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.026399720460176468, + "rewards/margins": 0.038081325590610504, + "rewards/margins_max": 0.05310334637761116, + "rewards/margins_min": 0.0230593029409647, + "rewards/margins_std": 0.02124434895813465, + "rewards/rejected": -0.011681604199111462, + "step": 1690 + }, + { + "epoch": 0.38, + "grad_norm": 0.48828125, + "learning_rate": 3.867631166436037e-07, + "logits/chosen": -1.2718234062194824, + "logits/rejected": -1.0437819957733154, + "logps/chosen": -221.5988006591797, + "logps/rejected": -192.76193237304688, + "loss": 0.6703, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.025970160961151123, + "rewards/margins": 0.041418932378292084, + "rewards/margins_max": 0.07021013647317886, + "rewards/margins_min": 0.012627726420760155, + "rewards/margins_std": 0.04071691259741783, + "rewards/rejected": -0.015448769554495811, + "step": 1700 + }, + { + "epoch": 0.39, + "grad_norm": 0.4375, + "learning_rate": 3.85107392074012e-07, + "logits/chosen": -1.441463589668274, + "logits/rejected": -1.232027292251587, + "logps/chosen": -226.7698211669922, + "logps/rejected": -177.8507537841797, + "loss": 0.6722, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.021718554198741913, + "rewards/margins": 0.04068540036678314, + "rewards/margins_max": 0.060767918825149536, + "rewards/margins_min": 0.020602887496352196, + "rewards/margins_std": 0.02840096689760685, + "rewards/rejected": -0.01896684803068638, + "step": 1710 + }, + { + "epoch": 0.39, + "grad_norm": 0.29296875, + "learning_rate": 3.834432537031991e-07, + "logits/chosen": -1.3713477849960327, + "logits/rejected": -0.9427247047424316, + "logps/chosen": -288.84698486328125, + "logps/rejected": -220.6392364501953, + "loss": 0.6723, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.02653350867331028, + "rewards/margins": 0.04643214866518974, + "rewards/margins_max": 0.06892819702625275, + "rewards/margins_min": 0.02393609844148159, + "rewards/margins_std": 0.031814225018024445, + "rewards/rejected": -0.019898641854524612, + "step": 1720 + }, + { + "epoch": 0.39, + "grad_norm": 0.345703125, + "learning_rate": 3.817708051652392e-07, + "logits/chosen": -1.3596107959747314, + "logits/rejected": -1.081020712852478, + "logps/chosen": -213.62637329101562, + "logps/rejected": -196.31497192382812, + "loss": 0.6724, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.02571287751197815, + "rewards/margins": 0.03983969986438751, + "rewards/margins_max": 0.06263546645641327, + "rewards/margins_min": 0.017043929547071457, + "rewards/margins_std": 0.032238081097602844, + "rewards/rejected": -0.014126819558441639, + "step": 1730 + }, + { + "epoch": 0.39, + "grad_norm": 0.453125, + "learning_rate": 3.800901506117209e-07, + "logits/chosen": -1.2425081729888916, + "logits/rejected": -1.0295426845550537, + "logps/chosen": -245.2377471923828, + "logps/rejected": -214.0257568359375, + "loss": 0.6683, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03704250231385231, + "rewards/margins": 0.05573665350675583, + "rewards/margins_max": 0.08607505261898041, + "rewards/margins_min": 0.025398259982466698, + "rewards/margins_std": 0.04290497303009033, + "rewards/rejected": -0.018694154918193817, + "step": 1740 + }, + { + "epoch": 0.4, + "grad_norm": 0.322265625, + "learning_rate": 3.784013947052621e-07, + "logits/chosen": -1.402919888496399, + "logits/rejected": -1.1005027294158936, + "logps/chosen": -266.9942321777344, + "logps/rejected": -205.5738525390625, + "loss": 0.6721, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.03442750871181488, + "rewards/margins": 0.04329963028430939, + "rewards/margins_max": 0.07404103130102158, + "rewards/margins_min": 0.012558224610984325, + "rewards/margins_std": 0.04347491264343262, + "rewards/rejected": -0.008872120641171932, + "step": 1750 + }, + { + "epoch": 0.4, + "grad_norm": 0.431640625, + "learning_rate": 3.7670464261299164e-07, + "logits/chosen": -1.2231419086456299, + "logits/rejected": -1.0529447793960571, + "logps/chosen": -203.1646728515625, + "logps/rejected": -224.58779907226562, + "loss": 0.6683, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.030151110142469406, + "rewards/margins": 0.0533854141831398, + "rewards/margins_max": 0.08413845300674438, + "rewards/margins_min": 0.022632379084825516, + "rewards/margins_std": 0.043491363525390625, + "rewards/rejected": -0.023234302178025246, + "step": 1760 + }, + { + "epoch": 0.4, + "grad_norm": 0.330078125, + "learning_rate": 3.75e-07, + "logits/chosen": -1.5182336568832397, + "logits/rejected": -1.184272289276123, + "logps/chosen": -195.92198181152344, + "logps/rejected": -205.4150390625, + "loss": 0.6739, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.01970260962843895, + "rewards/margins": 0.04444552958011627, + "rewards/margins_max": 0.0759601816534996, + "rewards/margins_min": 0.01293087750673294, + "rewards/margins_std": 0.04456844925880432, + "rewards/rejected": -0.02474292181432247, + "step": 1770 + }, + { + "epoch": 0.4, + "grad_norm": 0.376953125, + "learning_rate": 3.732875730227594e-07, + "logits/chosen": -1.6119180917739868, + "logits/rejected": -1.338212013244629, + "logps/chosen": -184.89749145507812, + "logps/rejected": -195.16012573242188, + "loss": 0.665, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03413277491927147, + "rewards/margins": 0.04780154302716255, + "rewards/margins_max": 0.06999184191226959, + "rewards/margins_min": 0.02561124786734581, + "rewards/margins_std": 0.031381815671920776, + "rewards/rejected": -0.013668762519955635, + "step": 1780 + }, + { + "epoch": 0.4, + "grad_norm": 0.43359375, + "learning_rate": 3.715674683225126e-07, + "logits/chosen": -1.4385994672775269, + "logits/rejected": -1.1190850734710693, + "logps/chosen": -252.67294311523438, + "logps/rejected": -200.71493530273438, + "loss": 0.6692, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.0406574010848999, + "rewards/margins": 0.05437915399670601, + "rewards/margins_max": 0.08271868526935577, + "rewards/margins_min": 0.02603963017463684, + "rewards/margins_std": 0.040078144520521164, + "rewards/rejected": -0.013721758499741554, + "step": 1790 + }, + { + "epoch": 0.41, + "grad_norm": 0.4453125, + "learning_rate": 3.698397930186318e-07, + "logits/chosen": -1.420417070388794, + "logits/rejected": -1.1576616764068604, + "logps/chosen": -205.3006591796875, + "logps/rejected": -199.37765502929688, + "loss": 0.6698, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.02717829868197441, + "rewards/margins": 0.04901193082332611, + "rewards/margins_max": 0.0719413310289383, + "rewards/margins_min": 0.026082530617713928, + "rewards/margins_std": 0.032427072525024414, + "rewards/rejected": -0.0218336321413517, + "step": 1800 + }, + { + "epoch": 0.41, + "grad_norm": 0.47265625, + "learning_rate": 3.681046547019479e-07, + "logits/chosen": -1.4996373653411865, + "logits/rejected": -1.2397348880767822, + "logps/chosen": -214.6435546875, + "logps/rejected": -191.0612030029297, + "loss": 0.6728, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.027710596099495888, + "rewards/margins": 0.041693609207868576, + "rewards/margins_max": 0.06465200334787369, + "rewards/margins_min": 0.01873522251844406, + "rewards/margins_std": 0.03246805816888809, + "rewards/rejected": -0.01398300938308239, + "step": 1810 + }, + { + "epoch": 0.41, + "grad_norm": 0.375, + "learning_rate": 3.6636216142805044e-07, + "logits/chosen": -1.3628098964691162, + "logits/rejected": -1.1438556909561157, + "logps/chosen": -196.2094268798828, + "logps/rejected": -203.31918334960938, + "loss": 0.6685, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.0251650158315897, + "rewards/margins": 0.04904549568891525, + "rewards/margins_max": 0.07514000684022903, + "rewards/margins_min": 0.022950977087020874, + "rewards/margins_std": 0.03690321743488312, + "rewards/rejected": -0.023880477994680405, + "step": 1820 + }, + { + "epoch": 0.41, + "grad_norm": 0.34375, + "learning_rate": 3.646124217105582e-07, + "logits/chosen": -1.4042203426361084, + "logits/rejected": -1.1199661493301392, + "logps/chosen": -212.60061645507812, + "logps/rejected": -173.20904541015625, + "loss": 0.671, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.026237869635224342, + "rewards/margins": 0.04421783238649368, + "rewards/margins_max": 0.06703042984008789, + "rewards/margins_min": 0.021405242383480072, + "rewards/margins_std": 0.03226187080144882, + "rewards/rejected": -0.01797996461391449, + "step": 1830 + }, + { + "epoch": 0.42, + "grad_norm": 0.388671875, + "learning_rate": 3.6285554451436144e-07, + "logits/chosen": -1.2761818170547485, + "logits/rejected": -1.074405550956726, + "logps/chosen": -213.8831024169922, + "logps/rejected": -231.5514678955078, + "loss": 0.671, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03510306030511856, + "rewards/margins": 0.04712874814867973, + "rewards/margins_max": 0.07571456581354141, + "rewards/margins_min": 0.018542934209108353, + "rewards/margins_std": 0.040426451712846756, + "rewards/rejected": -0.012025688774883747, + "step": 1840 + }, + { + "epoch": 0.42, + "grad_norm": 0.36328125, + "learning_rate": 3.610916392488366e-07, + "logits/chosen": -1.3223119974136353, + "logits/rejected": -1.0760774612426758, + "logps/chosen": -207.56497192382812, + "logps/rejected": -201.53770446777344, + "loss": 0.6719, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.02942877635359764, + "rewards/margins": 0.042011525481939316, + "rewards/margins_max": 0.07225628942251205, + "rewards/margins_min": 0.011766768991947174, + "rewards/margins_std": 0.04277254641056061, + "rewards/rejected": -0.012582749128341675, + "step": 1850 + }, + { + "epoch": 0.42, + "grad_norm": 0.412109375, + "learning_rate": 3.593208157610323e-07, + "logits/chosen": -1.4547159671783447, + "logits/rejected": -1.2126966714859009, + "logps/chosen": -201.00279235839844, + "logps/rejected": -195.8961181640625, + "loss": 0.6702, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.024298720061779022, + "rewards/margins": 0.04018304497003555, + "rewards/margins_max": 0.061934977769851685, + "rewards/margins_min": 0.018431108444929123, + "rewards/margins_std": 0.030761878937482834, + "rewards/rejected": -0.01588432490825653, + "step": 1860 + }, + { + "epoch": 0.42, + "grad_norm": 0.5390625, + "learning_rate": 3.57543184328829e-07, + "logits/chosen": -1.2944796085357666, + "logits/rejected": -1.0369160175323486, + "logps/chosen": -185.6531982421875, + "logps/rejected": -211.05422973632812, + "loss": 0.6703, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.030656468123197556, + "rewards/margins": 0.05110560730099678, + "rewards/margins_max": 0.08564882725477219, + "rewards/margins_min": 0.016562385484576225, + "rewards/margins_std": 0.04885149374604225, + "rewards/rejected": -0.020449137315154076, + "step": 1870 + }, + { + "epoch": 0.42, + "grad_norm": 0.451171875, + "learning_rate": 3.5575885565407115e-07, + "logits/chosen": -1.3851807117462158, + "logits/rejected": -1.1306685209274292, + "logps/chosen": -180.79251098632812, + "logps/rejected": -173.75868225097656, + "loss": 0.6717, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.02727353572845459, + "rewards/margins": 0.044656019657850266, + "rewards/margins_max": 0.07077351957559586, + "rewards/margins_min": 0.018538516014814377, + "rewards/margins_std": 0.036935724318027496, + "rewards/rejected": -0.017382482066750526, + "step": 1880 + }, + { + "epoch": 0.43, + "grad_norm": 0.474609375, + "learning_rate": 3.5396794085567367e-07, + "logits/chosen": -1.45248544216156, + "logits/rejected": -1.0879206657409668, + "logps/chosen": -253.71142578125, + "logps/rejected": -229.73867797851562, + "loss": 0.6693, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.035827480256557465, + "rewards/margins": 0.052608538419008255, + "rewards/margins_max": 0.08579106628894806, + "rewards/margins_min": 0.019426017999649048, + "rewards/margins_std": 0.04692717269062996, + "rewards/rejected": -0.01678105816245079, + "step": 1890 + }, + { + "epoch": 0.43, + "grad_norm": 0.37109375, + "learning_rate": 3.5217055146270143e-07, + "logits/chosen": -1.329620599746704, + "logits/rejected": -1.0890090465545654, + "logps/chosen": -255.60830688476562, + "logps/rejected": -210.82730102539062, + "loss": 0.6719, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.025220388546586037, + "rewards/margins": 0.03942624479532242, + "rewards/margins_max": 0.07290971279144287, + "rewards/margins_min": 0.005942771676927805, + "rewards/margins_std": 0.047352783381938934, + "rewards/rejected": -0.01420585811138153, + "step": 1900 + }, + { + "epoch": 0.43, + "grad_norm": 0.392578125, + "learning_rate": 3.5036679940742435e-07, + "logits/chosen": -1.3408721685409546, + "logits/rejected": -0.9262669682502747, + "logps/chosen": -193.88815307617188, + "logps/rejected": -189.97848510742188, + "loss": 0.6682, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.030053604394197464, + "rewards/margins": 0.05334904044866562, + "rewards/margins_max": 0.07732351869344711, + "rewards/margins_min": 0.029374558478593826, + "rewards/margins_std": 0.033905040472745895, + "rewards/rejected": -0.023295434191823006, + "step": 1910 + }, + { + "epoch": 0.43, + "grad_norm": 0.341796875, + "learning_rate": 3.4855679701834654e-07, + "logits/chosen": -1.4233258962631226, + "logits/rejected": -1.088521957397461, + "logps/chosen": -248.71029663085938, + "logps/rejected": -218.1599578857422, + "loss": 0.6699, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03422839939594269, + "rewards/margins": 0.05857861042022705, + "rewards/margins_max": 0.0850900262594223, + "rewards/margins_min": 0.032067202031612396, + "rewards/margins_std": 0.037492796778678894, + "rewards/rejected": -0.024350211024284363, + "step": 1920 + }, + { + "epoch": 0.44, + "grad_norm": 0.392578125, + "learning_rate": 3.4674065701321117e-07, + "logits/chosen": -1.2159172296524048, + "logits/rejected": -0.8571739196777344, + "logps/chosen": -239.7020263671875, + "logps/rejected": -207.25259399414062, + "loss": 0.6702, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.024594713002443314, + "rewards/margins": 0.055344462394714355, + "rewards/margins_max": 0.09462883323431015, + "rewards/margins_min": 0.01606009155511856, + "rewards/margins_std": 0.05555649474263191, + "rewards/rejected": -0.03074975311756134, + "step": 1930 + }, + { + "epoch": 0.44, + "grad_norm": 0.35546875, + "learning_rate": 3.449184924919807e-07, + "logits/chosen": -1.4591569900512695, + "logits/rejected": -1.1621897220611572, + "logps/chosen": -176.26205444335938, + "logps/rejected": -183.91500854492188, + "loss": 0.6679, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.033842552453279495, + "rewards/margins": 0.055226556956768036, + "rewards/margins_max": 0.08132033050060272, + "rewards/margins_min": 0.029132787138223648, + "rewards/margins_std": 0.03690216317772865, + "rewards/rejected": -0.02138400450348854, + "step": 1940 + }, + { + "epoch": 0.44, + "grad_norm": 0.41015625, + "learning_rate": 3.4309041692979406e-07, + "logits/chosen": -1.3285919427871704, + "logits/rejected": -1.0929481983184814, + "logps/chosen": -185.72769165039062, + "logps/rejected": -201.81222534179688, + "loss": 0.6688, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03052700124680996, + "rewards/margins": 0.05209646373987198, + "rewards/margins_max": 0.08264229446649551, + "rewards/margins_min": 0.021550629287958145, + "rewards/margins_std": 0.043198324739933014, + "rewards/rejected": -0.02156945690512657, + "step": 1950 + }, + { + "epoch": 0.44, + "grad_norm": 0.310546875, + "learning_rate": 3.412565441698997e-07, + "logits/chosen": -1.6500459909439087, + "logits/rejected": -1.299719214439392, + "logps/chosen": -199.05360412597656, + "logps/rejected": -210.8216552734375, + "loss": 0.6732, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.026170510798692703, + "rewards/margins": 0.043105438351631165, + "rewards/margins_max": 0.05894836038351059, + "rewards/margins_min": 0.02726251445710659, + "rewards/margins_std": 0.022405285388231277, + "rewards/rejected": -0.01693493127822876, + "step": 1960 + }, + { + "epoch": 0.45, + "grad_norm": 0.3671875, + "learning_rate": 3.394169884165659e-07, + "logits/chosen": -1.2527066469192505, + "logits/rejected": -1.006007432937622, + "logps/chosen": -227.0983123779297, + "logps/rejected": -237.21005249023438, + "loss": 0.6668, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03402932733297348, + "rewards/margins": 0.0556698814034462, + "rewards/margins_max": 0.08698919415473938, + "rewards/margins_min": 0.024350563064217567, + "rewards/margins_std": 0.044292204082012177, + "rewards/rejected": -0.021640557795763016, + "step": 1970 + }, + { + "epoch": 0.45, + "grad_norm": 0.5703125, + "learning_rate": 3.3757186422796913e-07, + "logits/chosen": -1.5391623973846436, + "logits/rejected": -1.2645601034164429, + "logps/chosen": -223.9857177734375, + "logps/rejected": -290.79302978515625, + "loss": 0.6677, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.026502113789319992, + "rewards/margins": 0.048506900668144226, + "rewards/margins_max": 0.07111908495426178, + "rewards/margins_min": 0.025894710794091225, + "rewards/margins_std": 0.03197846934199333, + "rewards/rejected": -0.022004786878824234, + "step": 1980 + }, + { + "epoch": 0.45, + "grad_norm": 0.30078125, + "learning_rate": 3.357212865090594e-07, + "logits/chosen": -1.4560575485229492, + "logits/rejected": -1.1469371318817139, + "logps/chosen": -217.7390594482422, + "logps/rejected": -191.54116821289062, + "loss": 0.6712, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03316511958837509, + "rewards/margins": 0.040575869381427765, + "rewards/margins_max": 0.059667717665433884, + "rewards/margins_min": 0.02148401364684105, + "rewards/margins_std": 0.026999955996870995, + "rewards/rejected": -0.007410746067762375, + "step": 1990 + }, + { + "epoch": 0.45, + "grad_norm": 0.38671875, + "learning_rate": 3.3386537050440505e-07, + "logits/chosen": -1.230985164642334, + "logits/rejected": -1.074690341949463, + "logps/chosen": -153.95193481445312, + "logps/rejected": -248.7484893798828, + "loss": 0.672, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03268904983997345, + "rewards/margins": 0.04694094508886337, + "rewards/margins_max": 0.07272408902645111, + "rewards/margins_min": 0.021157797425985336, + "rewards/margins_std": 0.03646288067102432, + "rewards/rejected": -0.014251895248889923, + "step": 2000 + }, + { + "epoch": 0.45, + "grad_norm": 0.337890625, + "learning_rate": 3.3200423179101564e-07, + "logits/chosen": -1.4215552806854248, + "logits/rejected": -1.1187208890914917, + "logps/chosen": -242.44088745117188, + "logps/rejected": -258.2455749511719, + "loss": 0.6729, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.02623608708381653, + "rewards/margins": 0.04028294235467911, + "rewards/margins_max": 0.059119291603565216, + "rewards/margins_min": 0.0214465893805027, + "rewards/margins_std": 0.02663862146437168, + "rewards/rejected": -0.014046849682927132, + "step": 2010 + }, + { + "epoch": 0.46, + "grad_norm": 0.408203125, + "learning_rate": 3.3013798627114453e-07, + "logits/chosen": -1.4693307876586914, + "logits/rejected": -1.2081705331802368, + "logps/chosen": -199.54930114746094, + "logps/rejected": -206.67330932617188, + "loss": 0.6706, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.029323678463697433, + "rewards/margins": 0.04873298853635788, + "rewards/margins_max": 0.07738453149795532, + "rewards/margins_min": 0.020081443712115288, + "rewards/margins_std": 0.04051940143108368, + "rewards/rejected": -0.019409308210015297, + "step": 2020 + }, + { + "epoch": 0.46, + "grad_norm": 0.44921875, + "learning_rate": 3.2826675016507087e-07, + "logits/chosen": -1.324035406112671, + "logits/rejected": -1.1425608396530151, + "logps/chosen": -170.9380340576172, + "logps/rejected": -183.2605743408203, + "loss": 0.6703, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.026225930079817772, + "rewards/margins": 0.04071018844842911, + "rewards/margins_max": 0.06031109765172005, + "rewards/margins_min": 0.02110927924513817, + "rewards/margins_std": 0.027719873934984207, + "rewards/rejected": -0.014484262093901634, + "step": 2030 + }, + { + "epoch": 0.46, + "grad_norm": 0.400390625, + "learning_rate": 3.263906400038623e-07, + "logits/chosen": -1.4063230752944946, + "logits/rejected": -1.1503441333770752, + "logps/chosen": -225.02719116210938, + "logps/rejected": -250.1376190185547, + "loss": 0.6707, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.031333766877651215, + "rewards/margins": 0.04533548653125763, + "rewards/margins_max": 0.06751718372106552, + "rewards/margins_min": 0.023153791204094887, + "rewards/margins_std": 0.0313696563243866, + "rewards/rejected": -0.014001714065670967, + "step": 2040 + }, + { + "epoch": 0.46, + "grad_norm": 0.546875, + "learning_rate": 3.2450977262211765e-07, + "logits/chosen": -1.3487093448638916, + "logits/rejected": -1.1558421850204468, + "logps/chosen": -273.28955078125, + "logps/rejected": -219.2425079345703, + "loss": 0.6724, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.02075299061834812, + "rewards/margins": 0.04373856633901596, + "rewards/margins_max": 0.06467003375291824, + "rewards/margins_min": 0.022807098925113678, + "rewards/margins_std": 0.02960156463086605, + "rewards/rejected": -0.02298557385802269, + "step": 2050 + }, + { + "epoch": 0.47, + "grad_norm": 0.4609375, + "learning_rate": 3.226242651506914e-07, + "logits/chosen": -1.4015865325927734, + "logits/rejected": -1.2727991342544556, + "logps/chosen": -220.09597778320312, + "logps/rejected": -235.02456665039062, + "loss": 0.6735, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.03195624798536301, + "rewards/margins": 0.037409860640764236, + "rewards/margins_max": 0.06553162634372711, + "rewards/margins_min": 0.009288092143833637, + "rewards/margins_std": 0.03977018594741821, + "rewards/rejected": -0.0054536135867238045, + "step": 2060 + }, + { + "epoch": 0.47, + "grad_norm": 0.43359375, + "learning_rate": 3.207342350093992e-07, + "logits/chosen": -1.4042125940322876, + "logits/rejected": -1.0990450382232666, + "logps/chosen": -261.49554443359375, + "logps/rejected": -226.1964874267578, + "loss": 0.6676, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03966151177883148, + "rewards/margins": 0.051730163395404816, + "rewards/margins_max": 0.07370400428771973, + "rewards/margins_min": 0.029756318777799606, + "rewards/margins_std": 0.03107570670545101, + "rewards/rejected": -0.012068650685250759, + "step": 2070 + }, + { + "epoch": 0.47, + "grad_norm": 0.40234375, + "learning_rate": 3.1883979989970556e-07, + "logits/chosen": -1.3634693622589111, + "logits/rejected": -1.1507164239883423, + "logps/chosen": -200.532470703125, + "logps/rejected": -223.18521118164062, + "loss": 0.6746, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03149800002574921, + "rewards/margins": 0.04740763083100319, + "rewards/margins_max": 0.07129405438899994, + "rewards/margins_min": 0.023521197959780693, + "rewards/margins_std": 0.03378051519393921, + "rewards/rejected": -0.015909628942608833, + "step": 2080 + }, + { + "epoch": 0.47, + "grad_norm": 0.36328125, + "learning_rate": 3.1694107779739387e-07, + "logits/chosen": -1.3686000108718872, + "logits/rejected": -1.1741145849227905, + "logps/chosen": -177.2476348876953, + "logps/rejected": -201.48651123046875, + "loss": 0.6722, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.022783290594816208, + "rewards/margins": 0.036474697291851044, + "rewards/margins_max": 0.0596294105052948, + "rewards/margins_min": 0.013319991528987885, + "rewards/margins_std": 0.03274570032954216, + "rewards/rejected": -0.01369140762835741, + "step": 2090 + }, + { + "epoch": 0.47, + "grad_norm": 0.26953125, + "learning_rate": 3.1503818694521987e-07, + "logits/chosen": -1.2748286724090576, + "logits/rejected": -1.0838059186935425, + "logps/chosen": -167.02737426757812, + "logps/rejected": -171.05117797851562, + "loss": 0.6706, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.026711028069257736, + "rewards/margins": 0.04484615474939346, + "rewards/margins_max": 0.06936424225568771, + "rewards/margins_min": 0.02032807096838951, + "rewards/margins_std": 0.03467380255460739, + "rewards/rejected": -0.018135128542780876, + "step": 2100 + }, + { + "epoch": 0.48, + "grad_norm": 0.369140625, + "learning_rate": 3.131312458455477e-07, + "logits/chosen": -1.319719910621643, + "logits/rejected": -1.1327495574951172, + "logps/chosen": -190.13870239257812, + "logps/rejected": -202.6477508544922, + "loss": 0.6689, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.030982110649347305, + "rewards/margins": 0.04906720668077469, + "rewards/margins_max": 0.0723622590303421, + "rewards/margins_min": 0.02577214315533638, + "rewards/margins_std": 0.03294419124722481, + "rewards/rejected": -0.018085090443491936, + "step": 2110 + }, + { + "epoch": 0.48, + "grad_norm": 0.443359375, + "learning_rate": 3.1122037325297023e-07, + "logits/chosen": -1.4756828546524048, + "logits/rejected": -1.0510185956954956, + "logps/chosen": -244.9542694091797, + "logps/rejected": -187.69895935058594, + "loss": 0.671, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.03502636030316353, + "rewards/margins": 0.04812877997756004, + "rewards/margins_max": 0.07826215028762817, + "rewards/margins_min": 0.017995405942201614, + "rewards/margins_std": 0.04261502996087074, + "rewards/rejected": -0.013102421537041664, + "step": 2120 + }, + { + "epoch": 0.48, + "grad_norm": 0.388671875, + "learning_rate": 3.0930568816691386e-07, + "logits/chosen": -1.2691621780395508, + "logits/rejected": -1.0920307636260986, + "logps/chosen": -204.21749877929688, + "logps/rejected": -163.44583129882812, + "loss": 0.6686, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.02684844098985195, + "rewards/margins": 0.04703948646783829, + "rewards/margins_max": 0.0725255236029625, + "rewards/margins_min": 0.02155345305800438, + "rewards/margins_std": 0.036042697727680206, + "rewards/rejected": -0.020191045477986336, + "step": 2130 + }, + { + "epoch": 0.48, + "grad_norm": 0.43359375, + "learning_rate": 3.073873098242278e-07, + "logits/chosen": -1.375516653060913, + "logits/rejected": -1.0434143543243408, + "logps/chosen": -211.1145477294922, + "logps/rejected": -183.14756774902344, + "loss": 0.6643, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.026955414563417435, + "rewards/margins": 0.05509645864367485, + "rewards/margins_max": 0.08361810445785522, + "rewards/margins_min": 0.026574820280075073, + "rewards/margins_std": 0.04033569246530533, + "rewards/rejected": -0.028141042217612267, + "step": 2140 + }, + { + "epoch": 0.49, + "grad_norm": 0.51171875, + "learning_rate": 3.054653576917581e-07, + "logits/chosen": -1.3683044910430908, + "logits/rejected": -1.1811391115188599, + "logps/chosen": -209.4580841064453, + "logps/rejected": -221.3155975341797, + "loss": 0.6672, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.038317613303661346, + "rewards/margins": 0.05631747841835022, + "rewards/margins_max": 0.08396416157484055, + "rewards/margins_min": 0.028670784085989, + "rewards/margins_std": 0.03909832984209061, + "rewards/rejected": -0.017999857664108276, + "step": 2150 + }, + { + "epoch": 0.49, + "grad_norm": 0.357421875, + "learning_rate": 3.0353995145890864e-07, + "logits/chosen": -1.4687901735305786, + "logits/rejected": -1.128990888595581, + "logps/chosen": -220.0728759765625, + "logps/rejected": -215.1951904296875, + "loss": 0.6684, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.023461733013391495, + "rewards/margins": 0.05785750225186348, + "rewards/margins_max": 0.0840335339307785, + "rewards/margins_min": 0.03168146312236786, + "rewards/margins_std": 0.03701850771903992, + "rewards/rejected": -0.034395769238471985, + "step": 2160 + }, + { + "epoch": 0.49, + "grad_norm": 0.59765625, + "learning_rate": 3.01611211030187e-07, + "logits/chosen": -1.3273531198501587, + "logits/rejected": -1.0275139808654785, + "logps/chosen": -344.6279296875, + "logps/rejected": -221.2284698486328, + "loss": 0.6679, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.026945358142256737, + "rewards/margins": 0.04964347928762436, + "rewards/margins_max": 0.07278752326965332, + "rewards/margins_min": 0.026499425992369652, + "rewards/margins_std": 0.03273063153028488, + "rewards/rejected": -0.022698121145367622, + "step": 2170 + }, + { + "epoch": 0.49, + "grad_norm": 0.328125, + "learning_rate": 2.996792565177374e-07, + "logits/chosen": -1.4312021732330322, + "logits/rejected": -1.1072601079940796, + "logps/chosen": -197.82289123535156, + "logps/rejected": -207.2589111328125, + "loss": 0.6685, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.0390249639749527, + "rewards/margins": 0.060548871755599976, + "rewards/margins_max": 0.09206276386976242, + "rewards/margins_min": 0.029034990817308426, + "rewards/margins_std": 0.04456736519932747, + "rewards/rejected": -0.021523915231227875, + "step": 2180 + }, + { + "epoch": 0.5, + "grad_norm": 0.365234375, + "learning_rate": 2.9774420823386096e-07, + "logits/chosen": -1.4531341791152954, + "logits/rejected": -1.223080039024353, + "logps/chosen": -199.8817596435547, + "logps/rejected": -177.36080932617188, + "loss": 0.6722, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.027678947895765305, + "rewards/margins": 0.03983256220817566, + "rewards/margins_max": 0.06585127860307693, + "rewards/margins_min": 0.013813835568726063, + "rewards/margins_std": 0.03679602965712547, + "rewards/rejected": -0.012153607793152332, + "step": 2190 + }, + { + "epoch": 0.5, + "grad_norm": 0.4375, + "learning_rate": 2.9580618668352317e-07, + "logits/chosen": -1.4132052659988403, + "logits/rejected": -1.0988004207611084, + "logps/chosen": -238.1175994873047, + "logps/rejected": -208.7352752685547, + "loss": 0.6721, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.026243817061185837, + "rewards/margins": 0.0455835685133934, + "rewards/margins_max": 0.06686623394489288, + "rewards/margins_min": 0.024300891906023026, + "rewards/margins_std": 0.03009824827313423, + "rewards/rejected": -0.019339745864272118, + "step": 2200 + }, + { + "epoch": 0.5, + "grad_norm": 0.30078125, + "learning_rate": 2.9386531255684937e-07, + "logits/chosen": -1.3268234729766846, + "logits/rejected": -1.0986872911453247, + "logps/chosen": -239.9281463623047, + "logps/rejected": -202.4623260498047, + "loss": 0.6714, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.0318286269903183, + "rewards/margins": 0.046062078326940536, + "rewards/margins_max": 0.07249831408262253, + "rewards/margins_min": 0.01962583139538765, + "rewards/margins_std": 0.037386488169431686, + "rewards/rejected": -0.01423344761133194, + "step": 2210 + }, + { + "epoch": 0.5, + "grad_norm": 0.4765625, + "learning_rate": 2.919217067216089e-07, + "logits/chosen": -1.2868921756744385, + "logits/rejected": -1.0992939472198486, + "logps/chosen": -194.62716674804688, + "logps/rejected": -202.38162231445312, + "loss": 0.6688, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.027968263253569603, + "rewards/margins": 0.05166172236204147, + "rewards/margins_max": 0.07927562296390533, + "rewards/margins_min": 0.024047832936048508, + "rewards/margins_std": 0.03905193880200386, + "rewards/rejected": -0.02369346097111702, + "step": 2220 + }, + { + "epoch": 0.5, + "grad_norm": 0.462890625, + "learning_rate": 2.899754902156879e-07, + "logits/chosen": -1.348962664604187, + "logits/rejected": -0.9912853240966797, + "logps/chosen": -262.5445861816406, + "logps/rejected": -196.90609741210938, + "loss": 0.6686, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.028756489977240562, + "rewards/margins": 0.04377773404121399, + "rewards/margins_max": 0.06630166620016098, + "rewards/margins_min": 0.0212537981569767, + "rewards/margins_std": 0.031853653490543365, + "rewards/rejected": -0.015021244063973427, + "step": 2230 + }, + { + "epoch": 0.51, + "grad_norm": 0.408203125, + "learning_rate": 2.88026784239552e-07, + "logits/chosen": -1.3572931289672852, + "logits/rejected": -1.089862585067749, + "logps/chosen": -205.95458984375, + "logps/rejected": -220.0574188232422, + "loss": 0.6669, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03338629752397537, + "rewards/margins": 0.05913955718278885, + "rewards/margins_max": 0.08510196954011917, + "rewards/margins_min": 0.033177152276039124, + "rewards/margins_std": 0.036716386675834656, + "rewards/rejected": -0.025753263384103775, + "step": 2240 + }, + { + "epoch": 0.51, + "grad_norm": 0.4765625, + "learning_rate": 2.8607571014869815e-07, + "logits/chosen": -1.481650948524475, + "logits/rejected": -1.1228121519088745, + "logps/chosen": -222.73886108398438, + "logps/rejected": -214.49484252929688, + "loss": 0.6652, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.031130477786064148, + "rewards/margins": 0.06208733841776848, + "rewards/margins_max": 0.08940434455871582, + "rewards/margins_min": 0.03477033972740173, + "rewards/margins_std": 0.038632072508335114, + "rewards/rejected": -0.03095685876905918, + "step": 2250 + }, + { + "epoch": 0.51, + "grad_norm": 0.384765625, + "learning_rate": 2.8412238944609754e-07, + "logits/chosen": -1.3825039863586426, + "logits/rejected": -1.1998974084854126, + "logps/chosen": -158.28053283691406, + "logps/rejected": -162.88006591796875, + "loss": 0.6736, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03045378252863884, + "rewards/margins": 0.03599992021918297, + "rewards/margins_max": 0.05915500596165657, + "rewards/margins_min": 0.012844832614064217, + "rewards/margins_std": 0.03274623677134514, + "rewards/rejected": -0.005546136759221554, + "step": 2260 + }, + { + "epoch": 0.51, + "grad_norm": 0.392578125, + "learning_rate": 2.8216694377462907e-07, + "logits/chosen": -1.314422845840454, + "logits/rejected": -1.034608244895935, + "logps/chosen": -220.6864776611328, + "logps/rejected": -234.85214233398438, + "loss": 0.6671, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03803492337465286, + "rewards/margins": 0.06186642125248909, + "rewards/margins_max": 0.09449129551649094, + "rewards/margins_min": 0.029241541400551796, + "rewards/margins_std": 0.046138547360897064, + "rewards/rejected": -0.023831497877836227, + "step": 2270 + }, + { + "epoch": 0.52, + "grad_norm": 0.193359375, + "learning_rate": 2.8020949490950365e-07, + "logits/chosen": -1.6204173564910889, + "logits/rejected": -1.1893694400787354, + "logps/chosen": -224.90567016601562, + "logps/rejected": -204.3470001220703, + "loss": 0.6702, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.033450596034526825, + "rewards/margins": 0.05014806240797043, + "rewards/margins_max": 0.07468613237142563, + "rewards/margins_min": 0.02560998871922493, + "rewards/margins_std": 0.03470207750797272, + "rewards/rejected": -0.016697466373443604, + "step": 2280 + }, + { + "epoch": 0.52, + "grad_norm": 0.310546875, + "learning_rate": 2.78250164750681e-07, + "logits/chosen": -1.5214847326278687, + "logits/rejected": -1.2135298252105713, + "logps/chosen": -225.0383758544922, + "logps/rejected": -222.4280548095703, + "loss": 0.6741, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.028520096093416214, + "rewards/margins": 0.04392777383327484, + "rewards/margins_max": 0.072211354970932, + "rewards/margins_min": 0.015644187107682228, + "rewards/margins_std": 0.03999902680516243, + "rewards/rejected": -0.015407675877213478, + "step": 2290 + }, + { + "epoch": 0.52, + "grad_norm": 0.359375, + "learning_rate": 2.7628907531527813e-07, + "logits/chosen": -1.4052813053131104, + "logits/rejected": -1.084937572479248, + "logps/chosen": -287.57550048828125, + "logps/rejected": -199.4448699951172, + "loss": 0.6684, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.02502388320863247, + "rewards/margins": 0.05385827273130417, + "rewards/margins_max": 0.08281184732913971, + "rewards/margins_min": 0.024904707446694374, + "rewards/margins_std": 0.040946535766124725, + "rewards/rejected": -0.028834396973252296, + "step": 2300 + }, + { + "epoch": 0.52, + "grad_norm": 0.5234375, + "learning_rate": 2.743263487299712e-07, + "logits/chosen": -1.329421043395996, + "logits/rejected": -1.2132747173309326, + "logps/chosen": -282.1737365722656, + "logps/rejected": -261.6746520996094, + "loss": 0.6704, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.02811703085899353, + "rewards/margins": 0.04853527620434761, + "rewards/margins_max": 0.07612602412700653, + "rewards/margins_min": 0.02094453200697899, + "rewards/margins_std": 0.039019204676151276, + "rewards/rejected": -0.02041824534535408, + "step": 2310 + }, + { + "epoch": 0.52, + "grad_norm": 0.333984375, + "learning_rate": 2.7236210722338933e-07, + "logits/chosen": -1.4650366306304932, + "logits/rejected": -1.1293063163757324, + "logps/chosen": -246.368896484375, + "logps/rejected": -186.78863525390625, + "loss": 0.6726, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03457409888505936, + "rewards/margins": 0.04936753585934639, + "rewards/margins_max": 0.07968120276927948, + "rewards/margins_min": 0.019053865224123, + "rewards/margins_std": 0.04287000000476837, + "rewards/rejected": -0.014793431386351585, + "step": 2320 + }, + { + "epoch": 0.53, + "grad_norm": 0.33984375, + "learning_rate": 2.7039647311850346e-07, + "logits/chosen": -1.2866630554199219, + "logits/rejected": -1.0842173099517822, + "logps/chosen": -177.29855346679688, + "logps/rejected": -178.6018524169922, + "loss": 0.6687, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.02924739383161068, + "rewards/margins": 0.04846047982573509, + "rewards/margins_max": 0.07607054710388184, + "rewards/margins_min": 0.02085040882229805, + "rewards/margins_std": 0.03904653713107109, + "rewards/rejected": -0.01921308971941471, + "step": 2330 + }, + { + "epoch": 0.53, + "grad_norm": 0.328125, + "learning_rate": 2.684295688250084e-07, + "logits/chosen": -1.4828417301177979, + "logits/rejected": -1.1574809551239014, + "logps/chosen": -229.223876953125, + "logps/rejected": -193.02645874023438, + "loss": 0.6644, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.039402466267347336, + "rewards/margins": 0.052919901907444, + "rewards/margins_max": 0.07841168344020844, + "rewards/margins_min": 0.027428116649389267, + "rewards/margins_std": 0.03605083003640175, + "rewards/rejected": -0.013517431914806366, + "step": 2340 + }, + { + "epoch": 0.53, + "grad_norm": 0.388671875, + "learning_rate": 2.664615168316998e-07, + "logits/chosen": -1.4797755479812622, + "logits/rejected": -1.3098758459091187, + "logps/chosen": -191.74508666992188, + "logps/rejected": -216.4213104248047, + "loss": 0.666, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.036059409379959106, + "rewards/margins": 0.050756268203258514, + "rewards/margins_max": 0.08269494771957397, + "rewards/margins_min": 0.0188176017254591, + "rewards/margins_std": 0.04516809806227684, + "rewards/rejected": -0.014696864411234856, + "step": 2350 + }, + { + "epoch": 0.53, + "grad_norm": 0.390625, + "learning_rate": 2.6449243969884645e-07, + "logits/chosen": -1.4017361402511597, + "logits/rejected": -1.0677566528320312, + "logps/chosen": -209.80126953125, + "logps/rejected": -181.45455932617188, + "loss": 0.6739, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.02429645135998726, + "rewards/margins": 0.043695103377103806, + "rewards/margins_max": 0.07192997634410858, + "rewards/margins_min": 0.01546022854745388, + "rewards/margins_std": 0.03993014246225357, + "rewards/rejected": -0.019398652017116547, + "step": 2360 + }, + { + "epoch": 0.54, + "grad_norm": 0.298828125, + "learning_rate": 2.625224600505572e-07, + "logits/chosen": -1.3834593296051025, + "logits/rejected": -1.1724491119384766, + "logps/chosen": -220.4295654296875, + "logps/rejected": -285.34014892578125, + "loss": 0.6681, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.024225343018770218, + "rewards/margins": 0.052785150706768036, + "rewards/margins_max": 0.08505556732416153, + "rewards/margins_min": 0.020514745265245438, + "rewards/margins_std": 0.04563724994659424, + "rewards/rejected": -0.028559807687997818, + "step": 2370 + }, + { + "epoch": 0.54, + "grad_norm": 0.50390625, + "learning_rate": 2.605517005671454e-07, + "logits/chosen": -1.508434534072876, + "logits/rejected": -1.1241999864578247, + "logps/chosen": -270.1075439453125, + "logps/rejected": -254.4306182861328, + "loss": 0.6659, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.037416476756334305, + "rewards/margins": 0.051527239382267, + "rewards/margins_max": 0.08236847817897797, + "rewards/margins_min": 0.02068600058555603, + "rewards/margins_std": 0.043616097420454025, + "rewards/rejected": -0.014110761694610119, + "step": 2380 + }, + { + "epoch": 0.54, + "grad_norm": 0.302734375, + "learning_rate": 2.5858028397748825e-07, + "logits/chosen": -1.4443773031234741, + "logits/rejected": -0.9758247137069702, + "logps/chosen": -222.76760864257812, + "logps/rejected": -232.0113067626953, + "loss": 0.6728, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03209487348794937, + "rewards/margins": 0.0541413240134716, + "rewards/margins_max": 0.08297590911388397, + "rewards/margins_min": 0.02530675008893013, + "rewards/margins_std": 0.04077824950218201, + "rewards/rejected": -0.02204645611345768, + "step": 2390 + }, + { + "epoch": 0.54, + "grad_norm": 0.55859375, + "learning_rate": 2.5660833305138447e-07, + "logits/chosen": -1.482155680656433, + "logits/rejected": -1.2326542139053345, + "logps/chosen": -293.49993896484375, + "logps/rejected": -266.3676452636719, + "loss": 0.6652, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.030677342787384987, + "rewards/margins": 0.0634465366601944, + "rewards/margins_max": 0.10515228658914566, + "rewards/margins_min": 0.021740790456533432, + "rewards/margins_std": 0.05898084118962288, + "rewards/rejected": -0.03276919946074486, + "step": 2400 + }, + { + "epoch": 0.54, + "grad_norm": 0.380859375, + "learning_rate": 2.5463597059190827e-07, + "logits/chosen": -1.3583731651306152, + "logits/rejected": -1.2022713422775269, + "logps/chosen": -170.06192016601562, + "logps/rejected": -219.817138671875, + "loss": 0.6714, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.023664075881242752, + "rewards/margins": 0.034540314227342606, + "rewards/margins_max": 0.054481375962495804, + "rewards/margins_min": 0.014599250629544258, + "rewards/margins_std": 0.028200918808579445, + "rewards/rejected": -0.010876237414777279, + "step": 2410 + }, + { + "epoch": 0.55, + "grad_norm": 0.369140625, + "learning_rate": 2.5266331942776213e-07, + "logits/chosen": -1.5255191326141357, + "logits/rejected": -1.2606998682022095, + "logps/chosen": -210.038330078125, + "logps/rejected": -205.2218475341797, + "loss": 0.6724, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03931133449077606, + "rewards/margins": 0.043173693120479584, + "rewards/margins_max": 0.06728474795818329, + "rewards/margins_min": 0.019062651321291924, + "rewards/margins_std": 0.03409816697239876, + "rewards/rejected": -0.0038623593281954527, + "step": 2420 + }, + { + "epoch": 0.55, + "grad_norm": 0.427734375, + "learning_rate": 2.5069050240562777e-07, + "logits/chosen": -1.4253253936767578, + "logits/rejected": -1.0836570262908936, + "logps/chosen": -193.62042236328125, + "logps/rejected": -189.53955078125, + "loss": 0.6664, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.029120203107595444, + "rewards/margins": 0.05882256105542183, + "rewards/margins_max": 0.07818777859210968, + "rewards/margins_min": 0.039457354694604874, + "rewards/margins_std": 0.027386540547013283, + "rewards/rejected": -0.029702359810471535, + "step": 2430 + }, + { + "epoch": 0.55, + "grad_norm": 0.3203125, + "learning_rate": 2.4871764238251546e-07, + "logits/chosen": -1.377029299736023, + "logits/rejected": -1.1650168895721436, + "logps/chosen": -260.649169921875, + "logps/rejected": -293.96490478515625, + "loss": 0.6703, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03016340360045433, + "rewards/margins": 0.048960715532302856, + "rewards/margins_max": 0.07401823252439499, + "rewards/margins_min": 0.023903196677565575, + "rewards/margins_std": 0.035436682403087616, + "rewards/rejected": -0.018797313794493675, + "step": 2440 + }, + { + "epoch": 0.55, + "grad_norm": 0.4609375, + "learning_rate": 2.467448622181134e-07, + "logits/chosen": -1.1938097476959229, + "logits/rejected": -0.9804821014404297, + "logps/chosen": -210.96774291992188, + "logps/rejected": -213.63916015625, + "loss": 0.6678, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.026612062007188797, + "rewards/margins": 0.051428746432065964, + "rewards/margins_max": 0.08274015039205551, + "rewards/margins_min": 0.020117351785302162, + "rewards/margins_std": 0.0442809984087944, + "rewards/rejected": -0.024816682562232018, + "step": 2450 + }, + { + "epoch": 0.56, + "grad_norm": 0.453125, + "learning_rate": 2.447722847671369e-07, + "logits/chosen": -1.3869951963424683, + "logits/rejected": -1.1585485935211182, + "logps/chosen": -216.66476440429688, + "logps/rejected": -222.16177368164062, + "loss": 0.6754, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.026378905400633812, + "rewards/margins": 0.03656502440571785, + "rewards/margins_max": 0.059453725814819336, + "rewards/margins_min": 0.013676322996616364, + "rewards/margins_std": 0.03236951306462288, + "rewards/rejected": -0.010186120867729187, + "step": 2460 + }, + { + "epoch": 0.56, + "grad_norm": 0.40625, + "learning_rate": 2.428000328716768e-07, + "logits/chosen": -1.2768441438674927, + "logits/rejected": -1.0203564167022705, + "logps/chosen": -327.2904052734375, + "logps/rejected": -186.34286499023438, + "loss": 0.6671, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.025987038388848305, + "rewards/margins": 0.044793128967285156, + "rewards/margins_max": 0.06938231736421585, + "rewards/margins_min": 0.02020394243299961, + "rewards/margins_std": 0.03477436676621437, + "rewards/rejected": -0.01880609430372715, + "step": 2470 + }, + { + "epoch": 0.56, + "grad_norm": 0.333984375, + "learning_rate": 2.4082822935355034e-07, + "logits/chosen": -1.3781462907791138, + "logits/rejected": -1.1217296123504639, + "logps/chosen": -230.134521484375, + "logps/rejected": -178.98233032226562, + "loss": 0.667, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.025606945157051086, + "rewards/margins": 0.04344099014997482, + "rewards/margins_max": 0.06334863603115082, + "rewards/margins_min": 0.02353333681821823, + "rewards/margins_std": 0.0281536765396595, + "rewards/rejected": -0.017834046855568886, + "step": 2480 + }, + { + "epoch": 0.56, + "grad_norm": 0.494140625, + "learning_rate": 2.3885699700665214e-07, + "logits/chosen": -1.3934246301651, + "logits/rejected": -0.9389753341674805, + "logps/chosen": -302.7892150878906, + "logps/rejected": -338.69769287109375, + "loss": 0.6737, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03493428975343704, + "rewards/margins": 0.05114731192588806, + "rewards/margins_max": 0.0730891153216362, + "rewards/margins_min": 0.029205525293946266, + "rewards/margins_std": 0.031030382961034775, + "rewards/rejected": -0.016213025897741318, + "step": 2490 + }, + { + "epoch": 0.57, + "grad_norm": 0.3515625, + "learning_rate": 2.3688645858930683e-07, + "logits/chosen": -1.3707977533340454, + "logits/rejected": -1.0166159868240356, + "logps/chosen": -271.46063232421875, + "logps/rejected": -236.61270141601562, + "loss": 0.6669, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04207485169172287, + "rewards/margins": 0.05323861911892891, + "rewards/margins_max": 0.08189557492733002, + "rewards/margins_min": 0.024581637233495712, + "rewards/margins_std": 0.04052707925438881, + "rewards/rejected": -0.011163758113980293, + "step": 2500 + }, + { + "epoch": 0.57, + "grad_norm": 0.330078125, + "learning_rate": 2.3491673681662508e-07, + "logits/chosen": -1.343019723892212, + "logits/rejected": -1.1173169612884521, + "logps/chosen": -209.8493194580078, + "logps/rejected": -248.96041870117188, + "loss": 0.6723, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.02808157540857792, + "rewards/margins": 0.03859950974583626, + "rewards/margins_max": 0.06530123949050903, + "rewards/margins_min": 0.011897771619260311, + "rewards/margins_std": 0.037761956453323364, + "rewards/rejected": -0.01051793061196804, + "step": 2510 + }, + { + "epoch": 0.57, + "grad_norm": 0.345703125, + "learning_rate": 2.329479543528607e-07, + "logits/chosen": -1.370318055152893, + "logits/rejected": -1.0287044048309326, + "logps/chosen": -212.48788452148438, + "logps/rejected": -178.92825317382812, + "loss": 0.6693, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.030343199148774147, + "rewards/margins": 0.05134710669517517, + "rewards/margins_max": 0.07893334329128265, + "rewards/margins_min": 0.02376086637377739, + "rewards/margins_std": 0.039012834429740906, + "rewards/rejected": -0.021003911271691322, + "step": 2520 + }, + { + "epoch": 0.57, + "grad_norm": 0.5390625, + "learning_rate": 2.3098023380377253e-07, + "logits/chosen": -1.2607336044311523, + "logits/rejected": -1.1116814613342285, + "logps/chosen": -273.98931884765625, + "logps/rejected": -227.2015380859375, + "loss": 0.6716, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.014126409776508808, + "rewards/margins": 0.04472264647483826, + "rewards/margins_max": 0.06790605932474136, + "rewards/margins_min": 0.021539241075515747, + "rewards/margins_std": 0.032786283642053604, + "rewards/rejected": -0.030596237629652023, + "step": 2530 + }, + { + "epoch": 0.57, + "grad_norm": 0.376953125, + "learning_rate": 2.2901369770898826e-07, + "logits/chosen": -1.423923373222351, + "logits/rejected": -1.1069891452789307, + "logps/chosen": -245.7572479248047, + "logps/rejected": -187.391845703125, + "loss": 0.6657, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.028265494853258133, + "rewards/margins": 0.05082274600863457, + "rewards/margins_max": 0.06974449753761292, + "rewards/margins_min": 0.03190099820494652, + "rewards/margins_std": 0.026759391650557518, + "rewards/rejected": -0.022557254880666733, + "step": 2540 + }, + { + "epoch": 0.58, + "grad_norm": 0.39453125, + "learning_rate": 2.270484685343742e-07, + "logits/chosen": -1.2900946140289307, + "logits/rejected": -1.0569812059402466, + "logps/chosen": -212.2299346923828, + "logps/rejected": -204.08480834960938, + "loss": 0.6736, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.023340780287981033, + "rewards/margins": 0.039305586367845535, + "rewards/margins_max": 0.06847918778657913, + "rewards/margins_min": 0.010131985880434513, + "rewards/margins_std": 0.041257698088884354, + "rewards/rejected": -0.015964802354574203, + "step": 2550 + }, + { + "epoch": 0.58, + "grad_norm": 0.4140625, + "learning_rate": 2.2508466866440822e-07, + "logits/chosen": -1.3407169580459595, + "logits/rejected": -0.9573219418525696, + "logps/chosen": -244.609375, + "logps/rejected": -233.3170623779297, + "loss": 0.6677, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03367815539240837, + "rewards/margins": 0.056748904287815094, + "rewards/margins_max": 0.08551572263240814, + "rewards/margins_min": 0.0279820766299963, + "rewards/margins_std": 0.04068244248628616, + "rewards/rejected": -0.02307075262069702, + "step": 2560 + }, + { + "epoch": 0.58, + "grad_norm": 0.50390625, + "learning_rate": 2.2312242039455813e-07, + "logits/chosen": -1.4510078430175781, + "logits/rejected": -1.0880759954452515, + "logps/chosen": -217.3108673095703, + "logps/rejected": -215.2972869873047, + "loss": 0.6683, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.039409976452589035, + "rewards/margins": 0.05283751338720322, + "rewards/margins_max": 0.08771199733018875, + "rewards/margins_min": 0.01796303130686283, + "rewards/margins_std": 0.04931997135281563, + "rewards/rejected": -0.013427533209323883, + "step": 2570 + }, + { + "epoch": 0.58, + "grad_norm": 0.484375, + "learning_rate": 2.2116184592366637e-07, + "logits/chosen": -1.2980551719665527, + "logits/rejected": -1.0735098123550415, + "logps/chosen": -243.39370727539062, + "logps/rejected": -239.2067413330078, + "loss": 0.6684, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03561193123459816, + "rewards/margins": 0.05166007950901985, + "rewards/margins_max": 0.07473595440387726, + "rewards/margins_min": 0.028584185987710953, + "rewards/margins_std": 0.03263423591852188, + "rewards/rejected": -0.016048144549131393, + "step": 2580 + }, + { + "epoch": 0.59, + "grad_norm": 0.3125, + "learning_rate": 2.1920306734633932e-07, + "logits/chosen": -1.3777854442596436, + "logits/rejected": -1.1635510921478271, + "logps/chosen": -178.19998168945312, + "logps/rejected": -221.8589324951172, + "loss": 0.6684, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.039459072053432465, + "rewards/margins": 0.051184237003326416, + "rewards/margins_max": 0.07255267351865768, + "rewards/margins_min": 0.029815804213285446, + "rewards/margins_std": 0.030219530686736107, + "rewards/rejected": -0.011725172400474548, + "step": 2590 + }, + { + "epoch": 0.59, + "grad_norm": 0.4765625, + "learning_rate": 2.1724620664534452e-07, + "logits/chosen": -1.4241256713867188, + "logits/rejected": -0.995019793510437, + "logps/chosen": -236.72262573242188, + "logps/rejected": -216.3303680419922, + "loss": 0.6686, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.02747528627514839, + "rewards/margins": 0.04677470773458481, + "rewards/margins_max": 0.07165158540010452, + "rewards/margins_min": 0.021897820755839348, + "rewards/margins_std": 0.03518122434616089, + "rewards/rejected": -0.019299419596791267, + "step": 2600 + }, + { + "epoch": 0.59, + "grad_norm": 0.515625, + "learning_rate": 2.1529138568401374e-07, + "logits/chosen": -1.4380706548690796, + "logits/rejected": -1.1371575593948364, + "logps/chosen": -268.78253173828125, + "logps/rejected": -245.05960083007812, + "loss": 0.6705, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.025660425424575806, + "rewards/margins": 0.04348149523139, + "rewards/margins_max": 0.0655093640089035, + "rewards/margins_min": 0.021453622728586197, + "rewards/margins_std": 0.031152114272117615, + "rewards/rejected": -0.017821069806814194, + "step": 2610 + }, + { + "epoch": 0.59, + "grad_norm": 0.5546875, + "learning_rate": 2.1333872619865436e-07, + "logits/chosen": -1.3819271326065063, + "logits/rejected": -1.2658441066741943, + "logps/chosen": -146.02621459960938, + "logps/rejected": -222.25827026367188, + "loss": 0.6726, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.02677101269364357, + "rewards/margins": 0.04473014920949936, + "rewards/margins_max": 0.06981190294027328, + "rewards/margins_min": 0.01964840292930603, + "rewards/margins_std": 0.03547095134854317, + "rewards/rejected": -0.017959142103791237, + "step": 2620 + }, + { + "epoch": 0.59, + "grad_norm": 0.373046875, + "learning_rate": 2.1138834979096777e-07, + "logits/chosen": -1.4313969612121582, + "logits/rejected": -1.0846507549285889, + "logps/chosen": -319.28924560546875, + "logps/rejected": -198.7446746826172, + "loss": 0.6743, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.02570546232163906, + "rewards/margins": 0.03866446763277054, + "rewards/margins_max": 0.052897948771715164, + "rewards/margins_min": 0.024430977180600166, + "rewards/margins_std": 0.020129187032580376, + "rewards/rejected": -0.01295899786055088, + "step": 2630 + }, + { + "epoch": 0.6, + "grad_norm": 0.3203125, + "learning_rate": 2.0944037792047694e-07, + "logits/chosen": -1.4719856977462769, + "logits/rejected": -1.1970919370651245, + "logps/chosen": -248.2223358154297, + "logps/rejected": -195.37448120117188, + "loss": 0.671, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03520715609192848, + "rewards/margins": 0.04729854315519333, + "rewards/margins_max": 0.06652859598398209, + "rewards/margins_min": 0.02806849777698517, + "rewards/margins_std": 0.02719539776444435, + "rewards/rejected": -0.012091396376490593, + "step": 2640 + }, + { + "epoch": 0.6, + "grad_norm": 0.4765625, + "learning_rate": 2.0749493189696277e-07, + "logits/chosen": -1.4909460544586182, + "logits/rejected": -1.3218861818313599, + "logps/chosen": -185.89071655273438, + "logps/rejected": -220.0978240966797, + "loss": 0.6702, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.030022714287042618, + "rewards/margins": 0.04576057940721512, + "rewards/margins_max": 0.08507208526134491, + "rewards/margins_min": 0.0064490786753594875, + "rewards/margins_std": 0.055594854056835175, + "rewards/rejected": -0.01573786698281765, + "step": 2650 + }, + { + "epoch": 0.6, + "grad_norm": 0.275390625, + "learning_rate": 2.0555213287290884e-07, + "logits/chosen": -1.4318983554840088, + "logits/rejected": -1.1789401769638062, + "logps/chosen": -219.03793334960938, + "logps/rejected": -216.1660919189453, + "loss": 0.6678, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.02756131812930107, + "rewards/margins": 0.04779034107923508, + "rewards/margins_max": 0.07405062019824982, + "rewards/margins_min": 0.02153005078434944, + "rewards/margins_std": 0.037137649953365326, + "rewards/rejected": -0.020229021087288857, + "step": 2660 + }, + { + "epoch": 0.6, + "grad_norm": 0.5546875, + "learning_rate": 2.036121018359574e-07, + "logits/chosen": -1.4215052127838135, + "logits/rejected": -1.1148322820663452, + "logps/chosen": -214.1374969482422, + "logps/rejected": -228.98159790039062, + "loss": 0.6669, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03607341647148132, + "rewards/margins": 0.06266181170940399, + "rewards/margins_max": 0.09267817437648773, + "rewards/margins_min": 0.03264545649290085, + "rewards/margins_std": 0.04244953393936157, + "rewards/rejected": -0.026588398963212967, + "step": 2670 + }, + { + "epoch": 0.61, + "grad_norm": 0.349609375, + "learning_rate": 2.0167495960137438e-07, + "logits/chosen": -1.5387341976165771, + "logits/rejected": -1.1654388904571533, + "logps/chosen": -194.2707977294922, + "logps/rejected": -257.0436706542969, + "loss": 0.6711, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04224932938814163, + "rewards/margins": 0.05386502668261528, + "rewards/margins_max": 0.0798775777220726, + "rewards/margins_min": 0.027852484956383705, + "rewards/margins_std": 0.03678729385137558, + "rewards/rejected": -0.01161570381373167, + "step": 2680 + }, + { + "epoch": 0.61, + "grad_norm": 0.4296875, + "learning_rate": 1.997408268045259e-07, + "logits/chosen": -1.3454197645187378, + "logits/rejected": -1.0782673358917236, + "logps/chosen": -234.7984161376953, + "logps/rejected": -211.3849334716797, + "loss": 0.6703, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.03135889023542404, + "rewards/margins": 0.04911700636148453, + "rewards/margins_max": 0.07688502967357635, + "rewards/margins_min": 0.021348986774683, + "rewards/margins_std": 0.039269909262657166, + "rewards/rejected": -0.017758117988705635, + "step": 2690 + }, + { + "epoch": 0.61, + "grad_norm": 0.37890625, + "learning_rate": 1.9780982389336537e-07, + "logits/chosen": -1.5304574966430664, + "logits/rejected": -1.174623727798462, + "logps/chosen": -223.42050170898438, + "logps/rejected": -214.8582763671875, + "loss": 0.6725, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.02357478067278862, + "rewards/margins": 0.04619375243782997, + "rewards/margins_max": 0.07109406590461731, + "rewards/margins_min": 0.021293427795171738, + "rewards/margins_std": 0.0352143719792366, + "rewards/rejected": -0.022618968039751053, + "step": 2700 + }, + { + "epoch": 0.61, + "grad_norm": 0.361328125, + "learning_rate": 1.9588207112093322e-07, + "logits/chosen": -1.3365089893341064, + "logits/rejected": -1.0793324708938599, + "logps/chosen": -255.2864990234375, + "logps/rejected": -225.516357421875, + "loss": 0.6701, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.028706055134534836, + "rewards/margins": 0.0458790548145771, + "rewards/margins_max": 0.07555496692657471, + "rewards/margins_min": 0.016203144565224648, + "rewards/margins_std": 0.041968077421188354, + "rewards/rejected": -0.017172997817397118, + "step": 2710 + }, + { + "epoch": 0.61, + "grad_norm": 0.498046875, + "learning_rate": 1.9395768853786738e-07, + "logits/chosen": -1.4965957403182983, + "logits/rejected": -0.957140326499939, + "logps/chosen": -261.61871337890625, + "logps/rejected": -232.3061981201172, + "loss": 0.6626, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03507986664772034, + "rewards/margins": 0.062941774725914, + "rewards/margins_max": 0.09290553629398346, + "rewards/margins_min": 0.03297800570726395, + "rewards/margins_std": 0.04237515479326248, + "rewards/rejected": -0.02786189876496792, + "step": 2720 + }, + { + "epoch": 0.62, + "grad_norm": 0.4453125, + "learning_rate": 1.9203679598492767e-07, + "logits/chosen": -1.5386086702346802, + "logits/rejected": -1.2974907159805298, + "logps/chosen": -173.94613647460938, + "logps/rejected": -201.42996215820312, + "loss": 0.6683, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03600393980741501, + "rewards/margins": 0.05009385943412781, + "rewards/margins_max": 0.07953473180532455, + "rewards/margins_min": 0.020652998238801956, + "rewards/margins_std": 0.0416356697678566, + "rewards/rejected": -0.014089921489357948, + "step": 2730 + }, + { + "epoch": 0.62, + "grad_norm": 0.361328125, + "learning_rate": 1.9011951308553282e-07, + "logits/chosen": -1.167707085609436, + "logits/rejected": -0.9583696126937866, + "logps/chosen": -253.3549346923828, + "logps/rejected": -247.03994750976562, + "loss": 0.6643, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03269846737384796, + "rewards/margins": 0.05938447639346123, + "rewards/margins_max": 0.0881558284163475, + "rewards/margins_min": 0.03061310388147831, + "rewards/margins_std": 0.040688853710889816, + "rewards/rejected": -0.02668600343167782, + "step": 2740 + }, + { + "epoch": 0.62, + "grad_norm": 0.42578125, + "learning_rate": 1.8820595923831023e-07, + "logits/chosen": -1.4228308200836182, + "logits/rejected": -1.1265003681182861, + "logps/chosen": -237.9507598876953, + "logps/rejected": -199.07684326171875, + "loss": 0.668, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.035271745175123215, + "rewards/margins": 0.04950911924242973, + "rewards/margins_max": 0.07321339845657349, + "rewards/margins_min": 0.02580484375357628, + "rewards/margins_std": 0.03352290391921997, + "rewards/rejected": -0.014237369410693645, + "step": 2750 + }, + { + "epoch": 0.62, + "grad_norm": 0.359375, + "learning_rate": 1.8629625360966134e-07, + "logits/chosen": -1.4357101917266846, + "logits/rejected": -1.2019593715667725, + "logps/chosen": -222.2527313232422, + "logps/rejected": -240.33810424804688, + "loss": 0.6651, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.040762145072221756, + "rewards/margins": 0.06205862760543823, + "rewards/margins_max": 0.09742200374603271, + "rewards/margins_min": 0.026695240288972855, + "rewards/margins_std": 0.05001138523221016, + "rewards/rejected": -0.02129647508263588, + "step": 2760 + }, + { + "epoch": 0.63, + "grad_norm": 0.369140625, + "learning_rate": 1.8439051512633982e-07, + "logits/chosen": -1.448634386062622, + "logits/rejected": -1.0578219890594482, + "logps/chosen": -298.6661682128906, + "logps/rejected": -253.38241577148438, + "loss": 0.6737, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03544292971491814, + "rewards/margins": 0.03846219182014465, + "rewards/margins_max": 0.06004079431295395, + "rewards/margins_min": 0.016883578151464462, + "rewards/margins_std": 0.030516769737005234, + "rewards/rejected": -0.003019258612766862, + "step": 2770 + }, + { + "epoch": 0.63, + "grad_norm": 0.283203125, + "learning_rate": 1.8248886246804596e-07, + "logits/chosen": -1.3351027965545654, + "logits/rejected": -1.178442120552063, + "logps/chosen": -208.878662109375, + "logps/rejected": -172.73736572265625, + "loss": 0.6697, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.027096301317214966, + "rewards/margins": 0.04242512583732605, + "rewards/margins_max": 0.06505424529314041, + "rewards/margins_min": 0.019795997068285942, + "rewards/margins_std": 0.032002415508031845, + "rewards/rejected": -0.01532882172614336, + "step": 2780 + }, + { + "epoch": 0.63, + "grad_norm": 0.328125, + "learning_rate": 1.805914140600353e-07, + "logits/chosen": -1.3552095890045166, + "logits/rejected": -1.1506478786468506, + "logps/chosen": -165.49307250976562, + "logps/rejected": -199.17178344726562, + "loss": 0.6676, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03187553584575653, + "rewards/margins": 0.04981083795428276, + "rewards/margins_max": 0.0677100196480751, + "rewards/margins_min": 0.03191165626049042, + "rewards/margins_std": 0.02531326375901699, + "rewards/rejected": -0.01793530210852623, + "step": 2790 + }, + { + "epoch": 0.63, + "grad_norm": 0.41015625, + "learning_rate": 1.7869828806574438e-07, + "logits/chosen": -1.3208125829696655, + "logits/rejected": -1.1334774494171143, + "logps/chosen": -243.2615203857422, + "logps/rejected": -207.776611328125, + "loss": 0.6728, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.016092196106910706, + "rewards/margins": 0.03982243314385414, + "rewards/margins_max": 0.0666196197271347, + "rewards/margins_min": 0.013025254011154175, + "rewards/margins_std": 0.03789693862199783, + "rewards/rejected": -0.023730238899588585, + "step": 2800 + }, + { + "epoch": 0.64, + "grad_norm": 0.40625, + "learning_rate": 1.768096023794317e-07, + "logits/chosen": -1.4114409685134888, + "logits/rejected": -1.1142067909240723, + "logps/chosen": -213.9055938720703, + "logps/rejected": -193.6376953125, + "loss": 0.6697, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03414800763130188, + "rewards/margins": 0.04588519036769867, + "rewards/margins_max": 0.06450579315423965, + "rewards/margins_min": 0.027264589443802834, + "rewards/margins_std": 0.026333507150411606, + "rewards/rejected": -0.011737184599041939, + "step": 2810 + }, + { + "epoch": 0.64, + "grad_norm": 0.287109375, + "learning_rate": 1.7492547461883577e-07, + "logits/chosen": -1.378385305404663, + "logits/rejected": -1.0380463600158691, + "logps/chosen": -234.6033477783203, + "logps/rejected": -204.3202362060547, + "loss": 0.6674, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.030243387445807457, + "rewards/margins": 0.04808034002780914, + "rewards/margins_max": 0.07598671317100525, + "rewards/margins_min": 0.020173965021967888, + "rewards/margins_std": 0.03946557641029358, + "rewards/rejected": -0.017836952582001686, + "step": 2820 + }, + { + "epoch": 0.64, + "grad_norm": 0.4375, + "learning_rate": 1.7304602211785103e-07, + "logits/chosen": -1.4631866216659546, + "logits/rejected": -1.1849663257598877, + "logps/chosen": -210.96768188476562, + "logps/rejected": -216.9154052734375, + "loss": 0.6696, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03199288621544838, + "rewards/margins": 0.054790280759334564, + "rewards/margins_max": 0.08474183082580566, + "rewards/margins_min": 0.024838734418153763, + "rewards/margins_std": 0.042357880622148514, + "rewards/rejected": -0.022797394543886185, + "step": 2830 + }, + { + "epoch": 0.64, + "grad_norm": 0.310546875, + "learning_rate": 1.711713619192201e-07, + "logits/chosen": -1.3077460527420044, + "logits/rejected": -1.139337420463562, + "logps/chosen": -185.03842163085938, + "logps/rejected": -171.12295532226562, + "loss": 0.6667, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.030919110402464867, + "rewards/margins": 0.051524568349123, + "rewards/margins_max": 0.08040101826190948, + "rewards/margins_min": 0.022648107260465622, + "rewards/margins_std": 0.04083748161792755, + "rewards/rejected": -0.020605452358722687, + "step": 2840 + }, + { + "epoch": 0.64, + "grad_norm": 0.37890625, + "learning_rate": 1.6930161076724584e-07, + "logits/chosen": -1.4011225700378418, + "logits/rejected": -1.0451468229293823, + "logps/chosen": -198.51084899902344, + "logps/rejected": -216.575927734375, + "loss": 0.668, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03980085626244545, + "rewards/margins": 0.05166858434677124, + "rewards/margins_max": 0.08060939610004425, + "rewards/margins_min": 0.022727767005562782, + "rewards/margins_std": 0.040928494185209274, + "rewards/rejected": -0.011867721565067768, + "step": 2850 + }, + { + "epoch": 0.65, + "grad_norm": 0.298828125, + "learning_rate": 1.6743688510052023e-07, + "logits/chosen": -1.445908546447754, + "logits/rejected": -1.2983238697052002, + "logps/chosen": -184.75465393066406, + "logps/rejected": -176.6446533203125, + "loss": 0.6719, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.021395951509475708, + "rewards/margins": 0.03993183746933937, + "rewards/margins_max": 0.05802379921078682, + "rewards/margins_min": 0.021839866414666176, + "rewards/margins_std": 0.025585904717445374, + "rewards/rejected": -0.018535882234573364, + "step": 2860 + }, + { + "epoch": 0.65, + "grad_norm": 0.443359375, + "learning_rate": 1.6557730104467403e-07, + "logits/chosen": -1.5235058069229126, + "logits/rejected": -1.1063475608825684, + "logps/chosen": -209.2215576171875, + "logps/rejected": -193.5798797607422, + "loss": 0.6641, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.036139652132987976, + "rewards/margins": 0.04945778846740723, + "rewards/margins_max": 0.0719948261976242, + "rewards/margins_min": 0.026920750737190247, + "rewards/margins_std": 0.03187217935919762, + "rewards/rejected": -0.01331813633441925, + "step": 2870 + }, + { + "epoch": 0.65, + "grad_norm": 0.419921875, + "learning_rate": 1.6372297440514415e-07, + "logits/chosen": -1.3985137939453125, + "logits/rejected": -1.1464028358459473, + "logps/chosen": -201.1994171142578, + "logps/rejected": -201.71766662597656, + "loss": 0.6667, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.037888336926698685, + "rewards/margins": 0.051082391291856766, + "rewards/margins_max": 0.07492348551750183, + "rewards/margins_min": 0.027241300791502, + "rewards/margins_std": 0.03371639922261238, + "rewards/rejected": -0.013194059021770954, + "step": 2880 + }, + { + "epoch": 0.65, + "grad_norm": 0.388671875, + "learning_rate": 1.6187402065996263e-07, + "logits/chosen": -1.499720811843872, + "logits/rejected": -1.1451431512832642, + "logps/chosen": -205.8267822265625, + "logps/rejected": -189.5011444091797, + "loss": 0.6677, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.040955521166324615, + "rewards/margins": 0.05267069488763809, + "rewards/margins_max": 0.0834018737077713, + "rewards/margins_min": 0.021939512342214584, + "rewards/margins_std": 0.04346044734120369, + "rewards/rejected": -0.011715171858668327, + "step": 2890 + }, + { + "epoch": 0.66, + "grad_norm": 0.376953125, + "learning_rate": 1.6003055495256506e-07, + "logits/chosen": -1.3605386018753052, + "logits/rejected": -1.178406000137329, + "logps/chosen": -190.85678100585938, + "logps/rejected": -172.95631408691406, + "loss": 0.6729, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.02527732215821743, + "rewards/margins": 0.03166641667485237, + "rewards/margins_max": 0.04430197551846504, + "rewards/margins_min": 0.01903085596859455, + "rewards/margins_std": 0.017869381234049797, + "rewards/rejected": -0.00638909637928009, + "step": 2900 + }, + { + "epoch": 0.66, + "grad_norm": 0.349609375, + "learning_rate": 1.581926920846196e-07, + "logits/chosen": -1.3835715055465698, + "logits/rejected": -1.0120103359222412, + "logps/chosen": -226.8507843017578, + "logps/rejected": -230.98605346679688, + "loss": 0.6683, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.04309115558862686, + "rewards/margins": 0.054244499653577805, + "rewards/margins_max": 0.08164414763450623, + "rewards/margins_min": 0.02684485912322998, + "rewards/margins_std": 0.03874894976615906, + "rewards/rejected": -0.011153348721563816, + "step": 2910 + }, + { + "epoch": 0.66, + "grad_norm": 0.47265625, + "learning_rate": 1.5636054650887847e-07, + "logits/chosen": -1.367553949356079, + "logits/rejected": -1.1866862773895264, + "logps/chosen": -218.7686309814453, + "logps/rejected": -213.06802368164062, + "loss": 0.6681, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03090577758848667, + "rewards/margins": 0.058771491050720215, + "rewards/margins_max": 0.09552975744009018, + "rewards/margins_min": 0.022013235837221146, + "rewards/margins_std": 0.0519840307533741, + "rewards/rejected": -0.02786571905016899, + "step": 2920 + }, + { + "epoch": 0.66, + "grad_norm": 0.47265625, + "learning_rate": 1.5453423232204965e-07, + "logits/chosen": -1.5142792463302612, + "logits/rejected": -1.318814754486084, + "logps/chosen": -169.22059631347656, + "logps/rejected": -188.27011108398438, + "loss": 0.6646, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.0322001650929451, + "rewards/margins": 0.05324719101190567, + "rewards/margins_max": 0.08077719062566757, + "rewards/margins_min": 0.025717195123434067, + "rewards/margins_std": 0.038933295756578445, + "rewards/rejected": -0.02104702964425087, + "step": 2930 + }, + { + "epoch": 0.66, + "grad_norm": 0.447265625, + "learning_rate": 1.5271386325769226e-07, + "logits/chosen": -1.4925755262374878, + "logits/rejected": -1.129923701286316, + "logps/chosen": -187.85366821289062, + "logps/rejected": -205.51449584960938, + "loss": 0.6653, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.0343579538166523, + "rewards/margins": 0.06020587682723999, + "rewards/margins_max": 0.09044940769672394, + "rewards/margins_min": 0.029962360858917236, + "rewards/margins_std": 0.04277079552412033, + "rewards/rejected": -0.02584792673587799, + "step": 2940 + }, + { + "epoch": 0.67, + "grad_norm": 0.412109375, + "learning_rate": 1.5089955267913302e-07, + "logits/chosen": -1.2845211029052734, + "logits/rejected": -0.9166741371154785, + "logps/chosen": -300.81634521484375, + "logps/rejected": -228.6632843017578, + "loss": 0.6699, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03172794356942177, + "rewards/margins": 0.044202424585819244, + "rewards/margins_max": 0.06601408123970032, + "rewards/margins_min": 0.02239074558019638, + "rewards/margins_std": 0.030846362933516502, + "rewards/rejected": -0.012474477291107178, + "step": 2950 + }, + { + "epoch": 0.67, + "grad_norm": 0.421875, + "learning_rate": 1.490914135724073e-07, + "logits/chosen": -1.4572781324386597, + "logits/rejected": -1.0811411142349243, + "logps/chosen": -282.17138671875, + "logps/rejected": -236.5408172607422, + "loss": 0.6692, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.0302413459867239, + "rewards/margins": 0.0462752990424633, + "rewards/margins_max": 0.07116423547267914, + "rewards/margins_min": 0.021386370062828064, + "rewards/margins_std": 0.03519826382398605, + "rewards/rejected": -0.0160339567810297, + "step": 2960 + }, + { + "epoch": 0.67, + "grad_norm": 0.35546875, + "learning_rate": 1.4728955853922237e-07, + "logits/chosen": -1.456148386001587, + "logits/rejected": -1.277165174484253, + "logps/chosen": -204.0210418701172, + "logps/rejected": -172.01878356933594, + "loss": 0.6698, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.01876869983971119, + "rewards/margins": 0.04803822189569473, + "rewards/margins_max": 0.07267947494983673, + "rewards/margins_min": 0.023396968841552734, + "rewards/margins_std": 0.03484799712896347, + "rewards/rejected": -0.029269522055983543, + "step": 2970 + }, + { + "epoch": 0.67, + "grad_norm": 0.42578125, + "learning_rate": 1.4549409978994542e-07, + "logits/chosen": -1.3960371017456055, + "logits/rejected": -1.0863512754440308, + "logps/chosen": -288.02093505859375, + "logps/rejected": -261.132080078125, + "loss": 0.6708, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03317864611744881, + "rewards/margins": 0.049657586961984634, + "rewards/margins_max": 0.07338405400514603, + "rewards/margins_min": 0.025931116193532944, + "rewards/margins_std": 0.03355429321527481, + "rewards/rejected": -0.016478940844535828, + "step": 2980 + }, + { + "epoch": 0.68, + "grad_norm": 0.2890625, + "learning_rate": 1.4370514913661573e-07, + "logits/chosen": -1.269504189491272, + "logits/rejected": -0.9573532938957214, + "logps/chosen": -253.5911102294922, + "logps/rejected": -173.530029296875, + "loss": 0.6701, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03387542814016342, + "rewards/margins": 0.05059955641627312, + "rewards/margins_max": 0.07369405776262283, + "rewards/margins_min": 0.027505064383149147, + "rewards/margins_std": 0.032660551369190216, + "rewards/rejected": -0.016724130138754845, + "step": 2990 + }, + { + "epoch": 0.68, + "grad_norm": 0.390625, + "learning_rate": 1.4192281798598133e-07, + "logits/chosen": -1.4107468128204346, + "logits/rejected": -1.2959485054016113, + "logps/chosen": -170.9486083984375, + "logps/rejected": -204.7427520751953, + "loss": 0.6693, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03265909105539322, + "rewards/margins": 0.052322544157505035, + "rewards/margins_max": 0.08253692090511322, + "rewards/margins_min": 0.022108152508735657, + "rewards/margins_std": 0.04272959753870964, + "rewards/rejected": -0.01966344751417637, + "step": 3000 + }, + { + "epoch": 0.68, + "grad_norm": 0.4375, + "learning_rate": 1.4014721733256135e-07, + "logits/chosen": -1.5541356801986694, + "logits/rejected": -1.1410366296768188, + "logps/chosen": -249.2203369140625, + "logps/rejected": -211.2033233642578, + "loss": 0.6667, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.039340030401945114, + "rewards/margins": 0.06089523434638977, + "rewards/margins_max": 0.09748632460832596, + "rewards/margins_min": 0.024304138496518135, + "rewards/margins_std": 0.05174762010574341, + "rewards/rejected": -0.02155519835650921, + "step": 3010 + }, + { + "epoch": 0.68, + "grad_norm": 0.4609375, + "learning_rate": 1.3837845775173373e-07, + "logits/chosen": -1.369077444076538, + "logits/rejected": -1.1096550226211548, + "logps/chosen": -197.16085815429688, + "logps/rejected": -163.0858612060547, + "loss": 0.6699, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.0283735990524292, + "rewards/margins": 0.04257078841328621, + "rewards/margins_max": 0.06825915724039078, + "rewards/margins_min": 0.01688242517411709, + "rewards/margins_std": 0.036328837275505066, + "rewards/rejected": -0.014197193086147308, + "step": 3020 + }, + { + "epoch": 0.68, + "grad_norm": 0.35546875, + "learning_rate": 1.3661664939284928e-07, + "logits/chosen": -1.3072458505630493, + "logits/rejected": -1.0293340682983398, + "logps/chosen": -256.4021301269531, + "logps/rejected": -215.84683227539062, + "loss": 0.6654, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.036074064671993256, + "rewards/margins": 0.05670546740293503, + "rewards/margins_max": 0.08541737496852875, + "rewards/margins_min": 0.027993574738502502, + "rewards/margins_std": 0.040604762732982635, + "rewards/rejected": -0.02063140645623207, + "step": 3030 + }, + { + "epoch": 0.69, + "grad_norm": 0.3359375, + "learning_rate": 1.3486190197237187e-07, + "logits/chosen": -1.5194108486175537, + "logits/rejected": -1.1841778755187988, + "logps/chosen": -253.387939453125, + "logps/rejected": -258.55517578125, + "loss": 0.6671, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.04065632447600365, + "rewards/margins": 0.06677254289388657, + "rewards/margins_max": 0.0925917774438858, + "rewards/margins_min": 0.04095330461859703, + "rewards/margins_std": 0.03651391342282295, + "rewards/rejected": -0.02611621282994747, + "step": 3040 + }, + { + "epoch": 0.69, + "grad_norm": 0.380859375, + "learning_rate": 1.3311432476704653e-07, + "logits/chosen": -1.3569104671478271, + "logits/rejected": -1.2485584020614624, + "logps/chosen": -186.0038299560547, + "logps/rejected": -232.54959106445312, + "loss": 0.6726, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.026211729273200035, + "rewards/margins": 0.047675721347332, + "rewards/margins_max": 0.07000197470188141, + "rewards/margins_min": 0.02534947171807289, + "rewards/margins_std": 0.03157408535480499, + "rewards/rejected": -0.021463993936777115, + "step": 3050 + }, + { + "epoch": 0.69, + "grad_norm": 0.369140625, + "learning_rate": 1.3137402660709311e-07, + "logits/chosen": -1.2760577201843262, + "logits/rejected": -1.2659679651260376, + "logps/chosen": -175.79026794433594, + "logps/rejected": -195.94700622558594, + "loss": 0.675, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.026767786592245102, + "rewards/margins": 0.03098815120756626, + "rewards/margins_max": 0.04722337797284126, + "rewards/margins_min": 0.01475292257964611, + "rewards/margins_std": 0.022960076108574867, + "rewards/rejected": -0.004220363683998585, + "step": 3060 + }, + { + "epoch": 0.69, + "grad_norm": 0.48828125, + "learning_rate": 1.2964111586942994e-07, + "logits/chosen": -1.3129364252090454, + "logits/rejected": -0.9301867485046387, + "logps/chosen": -274.6208190917969, + "logps/rejected": -219.16415405273438, + "loss": 0.6687, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.027635782957077026, + "rewards/margins": 0.05113440752029419, + "rewards/margins_max": 0.08066648244857788, + "rewards/margins_min": 0.021602336317300797, + "rewards/margins_std": 0.04176466166973114, + "rewards/rejected": -0.02349862828850746, + "step": 3070 + }, + { + "epoch": 0.7, + "grad_norm": 0.310546875, + "learning_rate": 1.2791570047092413e-07, + "logits/chosen": -1.4004180431365967, + "logits/rejected": -1.1162279844284058, + "logps/chosen": -272.8247985839844, + "logps/rejected": -223.28164672851562, + "loss": 0.6668, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.020728105679154396, + "rewards/margins": 0.05209376662969589, + "rewards/margins_max": 0.07638858258724213, + "rewards/margins_min": 0.027798956260085106, + "rewards/margins_std": 0.034358054399490356, + "rewards/rejected": -0.031365666538476944, + "step": 3080 + }, + { + "epoch": 0.7, + "grad_norm": 0.265625, + "learning_rate": 1.2619788786167112e-07, + "logits/chosen": -1.3903374671936035, + "logits/rejected": -1.3379945755004883, + "logps/chosen": -199.85646057128906, + "logps/rejected": -250.6023712158203, + "loss": 0.6701, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.030954908579587936, + "rewards/margins": 0.040415409952402115, + "rewards/margins_max": 0.057001303881406784, + "rewards/margins_min": 0.023829510435461998, + "rewards/margins_std": 0.02345600351691246, + "rewards/rejected": -0.009460503235459328, + "step": 3090 + }, + { + "epoch": 0.7, + "grad_norm": 0.416015625, + "learning_rate": 1.2448778501830378e-07, + "logits/chosen": -1.4009287357330322, + "logits/rejected": -1.1963551044464111, + "logps/chosen": -194.9075927734375, + "logps/rejected": -232.9965057373047, + "loss": 0.6678, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03073223866522312, + "rewards/margins": 0.046397607773542404, + "rewards/margins_max": 0.0713280513882637, + "rewards/margins_min": 0.021467158570885658, + "rewards/margins_std": 0.03525697812438011, + "rewards/rejected": -0.015665370970964432, + "step": 3100 + }, + { + "epoch": 0.7, + "grad_norm": 0.3671875, + "learning_rate": 1.2278549843732912e-07, + "logits/chosen": -1.4153436422348022, + "logits/rejected": -1.214393973350525, + "logps/chosen": -203.56967163085938, + "logps/rejected": -203.48190307617188, + "loss": 0.6739, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.02419831044971943, + "rewards/margins": 0.04225160926580429, + "rewards/margins_max": 0.0629769042134285, + "rewards/margins_min": 0.021526312455534935, + "rewards/margins_std": 0.029309988021850586, + "rewards/rejected": -0.018053295090794563, + "step": 3110 + }, + { + "epoch": 0.71, + "grad_norm": 0.380859375, + "learning_rate": 1.210911341284979e-07, + "logits/chosen": -1.2526661157608032, + "logits/rejected": -1.0572154521942139, + "logps/chosen": -161.1710662841797, + "logps/rejected": -198.1621856689453, + "loss": 0.6739, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.02532758191227913, + "rewards/margins": 0.029836729168891907, + "rewards/margins_max": 0.047510914504528046, + "rewards/margins_min": 0.012162544764578342, + "rewards/margins_std": 0.024995077401399612, + "rewards/rejected": -0.00450914865359664, + "step": 3120 + }, + { + "epoch": 0.71, + "grad_norm": 0.31640625, + "learning_rate": 1.1940479760820175e-07, + "logits/chosen": -1.366984486579895, + "logits/rejected": -1.0967432260513306, + "logps/chosen": -201.30935668945312, + "logps/rejected": -235.0970916748047, + "loss": 0.6658, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0381210520863533, + "rewards/margins": 0.06330465525388718, + "rewards/margins_max": 0.10113723576068878, + "rewards/margins_min": 0.02547208033502102, + "rewards/margins_std": 0.05350334197282791, + "rewards/rejected": -0.025183597579598427, + "step": 3130 + }, + { + "epoch": 0.71, + "grad_norm": 0.56640625, + "learning_rate": 1.1772659389290207e-07, + "logits/chosen": -1.4767048358917236, + "logits/rejected": -1.0964723825454712, + "logps/chosen": -260.4814453125, + "logps/rejected": -240.03036499023438, + "loss": 0.6711, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.040913086384534836, + "rewards/margins": 0.04774344712495804, + "rewards/margins_max": 0.07724296301603317, + "rewards/margins_min": 0.018243929371237755, + "rewards/margins_std": 0.04171861708164215, + "rewards/rejected": -0.006830359809100628, + "step": 3140 + }, + { + "epoch": 0.71, + "grad_norm": 0.384765625, + "learning_rate": 1.160566274925912e-07, + "logits/chosen": -1.4270305633544922, + "logits/rejected": -1.0017156600952148, + "logps/chosen": -218.6697235107422, + "logps/rejected": -166.73095703125, + "loss": 0.6642, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.034726254642009735, + "rewards/margins": 0.055394403636455536, + "rewards/margins_max": 0.08493579924106598, + "rewards/margins_min": 0.02585301361978054, + "rewards/margins_std": 0.0417778417468071, + "rewards/rejected": -0.0206681527197361, + "step": 3150 + }, + { + "epoch": 0.71, + "grad_norm": 0.36328125, + "learning_rate": 1.1439500240428301e-07, + "logits/chosen": -1.41231369972229, + "logits/rejected": -1.115092158317566, + "logps/chosen": -134.1064910888672, + "logps/rejected": -153.6800079345703, + "loss": 0.6696, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.024014055728912354, + "rewards/margins": 0.03822372108697891, + "rewards/margins_max": 0.05592702701687813, + "rewards/margins_min": 0.020520424470305443, + "rewards/margins_std": 0.025036249309778214, + "rewards/rejected": -0.014209670014679432, + "step": 3160 + }, + { + "epoch": 0.72, + "grad_norm": 0.49609375, + "learning_rate": 1.1274182210553698e-07, + "logits/chosen": -1.2921323776245117, + "logits/rejected": -1.0127956867218018, + "logps/chosen": -283.88250732421875, + "logps/rejected": -273.15899658203125, + "loss": 0.6674, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03838573768734932, + "rewards/margins": 0.05664980411529541, + "rewards/margins_max": 0.09717298299074173, + "rewards/margins_min": 0.016126640141010284, + "rewards/margins_std": 0.057308416813611984, + "rewards/rejected": -0.01826407015323639, + "step": 3170 + }, + { + "epoch": 0.72, + "grad_norm": 0.421875, + "learning_rate": 1.1109718954801397e-07, + "logits/chosen": -1.273221492767334, + "logits/rejected": -1.1316196918487549, + "logps/chosen": -238.09048461914062, + "logps/rejected": -217.0200958251953, + "loss": 0.6674, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.027058366686105728, + "rewards/margins": 0.05345930904150009, + "rewards/margins_max": 0.08340740203857422, + "rewards/margins_min": 0.02351122722029686, + "rewards/margins_std": 0.04235298931598663, + "rewards/rejected": -0.02640094980597496, + "step": 3180 + }, + { + "epoch": 0.72, + "grad_norm": 0.451171875, + "learning_rate": 1.0946120715106511e-07, + "logits/chosen": -1.3681175708770752, + "logits/rejected": -1.0219160318374634, + "logps/chosen": -260.44451904296875, + "logps/rejected": -198.5891876220703, + "loss": 0.6619, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.02242262475192547, + "rewards/margins": 0.051530640572309494, + "rewards/margins_max": 0.07956963032484055, + "rewards/margins_min": 0.023491645231842995, + "rewards/margins_std": 0.03965312987565994, + "rewards/rejected": -0.029108017683029175, + "step": 3190 + }, + { + "epoch": 0.72, + "grad_norm": 0.3359375, + "learning_rate": 1.0783397679535342e-07, + "logits/chosen": -1.4784858226776123, + "logits/rejected": -1.2056283950805664, + "logps/chosen": -255.98971557617188, + "logps/rejected": -197.01263427734375, + "loss": 0.6738, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.0324072502553463, + "rewards/margins": 0.03942598029971123, + "rewards/margins_max": 0.058343105018138885, + "rewards/margins_min": 0.020508846268057823, + "rewards/margins_std": 0.026752863079309464, + "rewards/rejected": -0.007018730044364929, + "step": 3200 + }, + { + "epoch": 0.73, + "grad_norm": 0.43359375, + "learning_rate": 1.0621559981650938e-07, + "logits/chosen": -1.4148783683776855, + "logits/rejected": -1.1107372045516968, + "logps/chosen": -188.5048828125, + "logps/rejected": -159.65768432617188, + "loss": 0.6722, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.027665521949529648, + "rewards/margins": 0.04687776416540146, + "rewards/margins_max": 0.07250744104385376, + "rewards/margins_min": 0.02124808356165886, + "rewards/margins_std": 0.036245837807655334, + "rewards/rejected": -0.019212238490581512, + "step": 3210 + }, + { + "epoch": 0.73, + "grad_norm": 0.408203125, + "learning_rate": 1.0460617699882011e-07, + "logits/chosen": -1.4457701444625854, + "logits/rejected": -1.1618094444274902, + "logps/chosen": -177.31356811523438, + "logps/rejected": -189.93397521972656, + "loss": 0.67, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.026849735528230667, + "rewards/margins": 0.04811318963766098, + "rewards/margins_max": 0.07635174691677094, + "rewards/margins_min": 0.019874632358551025, + "rewards/margins_std": 0.03993535786867142, + "rewards/rejected": -0.02126346156001091, + "step": 3220 + }, + { + "epoch": 0.73, + "grad_norm": 0.515625, + "learning_rate": 1.0300580856895319e-07, + "logits/chosen": -1.3305940628051758, + "logits/rejected": -1.2187252044677734, + "logps/chosen": -232.2171630859375, + "logps/rejected": -211.6332550048828, + "loss": 0.6692, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.02285153605043888, + "rewards/margins": 0.04603450745344162, + "rewards/margins_max": 0.07678231596946716, + "rewards/margins_min": 0.01528670359402895, + "rewards/margins_std": 0.04348396137356758, + "rewards/rejected": -0.023182973265647888, + "step": 3230 + }, + { + "epoch": 0.73, + "grad_norm": 0.52734375, + "learning_rate": 1.0141459418971495e-07, + "logits/chosen": -1.3796216249465942, + "logits/rejected": -1.17972731590271, + "logps/chosen": -209.474853515625, + "logps/rejected": -201.80372619628906, + "loss": 0.669, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03370387852191925, + "rewards/margins": 0.04449697583913803, + "rewards/margins_max": 0.07795051485300064, + "rewards/margins_min": 0.011043445207178593, + "rewards/margins_std": 0.047310441732406616, + "rewards/rejected": -0.010793101973831654, + "step": 3240 + }, + { + "epoch": 0.73, + "grad_norm": 0.5546875, + "learning_rate": 9.983263295384389e-08, + "logits/chosen": -1.5797169208526611, + "logits/rejected": -1.3436121940612793, + "logps/chosen": -205.55026245117188, + "logps/rejected": -228.02291870117188, + "loss": 0.6747, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.041708528995513916, + "rewards/margins": 0.040844693779945374, + "rewards/margins_max": 0.06195106357336044, + "rewards/margins_min": 0.01973830722272396, + "rewards/margins_std": 0.029848933219909668, + "rewards/rejected": 0.0008638384751975536, + "step": 3250 + }, + { + "epoch": 0.74, + "grad_norm": 0.404296875, + "learning_rate": 9.826002337784017e-08, + "logits/chosen": -1.5048518180847168, + "logits/rejected": -1.15309476852417, + "logps/chosen": -297.9679260253906, + "logps/rejected": -218.27078247070312, + "loss": 0.6722, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.027714397758245468, + "rewards/margins": 0.04562808573246002, + "rewards/margins_max": 0.07222025096416473, + "rewards/margins_min": 0.019035929813981056, + "rewards/margins_std": 0.037606991827487946, + "rewards/rejected": -0.017913687974214554, + "step": 3260 + }, + { + "epoch": 0.74, + "grad_norm": 0.326171875, + "learning_rate": 9.669686339582958e-08, + "logits/chosen": -1.4064114093780518, + "logits/rejected": -1.196653127670288, + "logps/chosen": -191.02151489257812, + "logps/rejected": -213.56784057617188, + "loss": 0.6618, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03829699009656906, + "rewards/margins": 0.05901988595724106, + "rewards/margins_max": 0.08912698924541473, + "rewards/margins_min": 0.02891278825700283, + "rewards/margins_std": 0.04257786646485329, + "rewards/rejected": -0.020722895860671997, + "step": 3270 + }, + { + "epoch": 0.74, + "grad_norm": 0.4296875, + "learning_rate": 9.514325035346576e-08, + "logits/chosen": -1.313220739364624, + "logits/rejected": -1.0566840171813965, + "logps/chosen": -216.5675048828125, + "logps/rejected": -237.94631958007812, + "loss": 0.6664, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03073856234550476, + "rewards/margins": 0.05257689952850342, + "rewards/margins_max": 0.08035125583410263, + "rewards/margins_min": 0.02480255253612995, + "rewards/margins_std": 0.03927886113524437, + "rewards/rejected": -0.021838339045643806, + "step": 3280 + }, + { + "epoch": 0.74, + "grad_norm": 0.380859375, + "learning_rate": 9.359928100186723e-08, + "logits/chosen": -1.4472345113754272, + "logits/rejected": -1.2550368309020996, + "logps/chosen": -246.9860076904297, + "logps/rejected": -249.814697265625, + "loss": 0.67, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.026788320392370224, + "rewards/margins": 0.04281577467918396, + "rewards/margins_max": 0.07860226929187775, + "rewards/margins_min": 0.007029279135167599, + "rewards/margins_std": 0.05060974508523941, + "rewards/rejected": -0.016027452424168587, + "step": 3290 + }, + { + "epoch": 0.75, + "grad_norm": 0.40234375, + "learning_rate": 9.206505149159258e-08, + "logits/chosen": -1.3831764459609985, + "logits/rejected": -1.0240981578826904, + "logps/chosen": -265.1828308105469, + "logps/rejected": -269.2750549316406, + "loss": 0.6666, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.04783134162425995, + "rewards/margins": 0.05456867069005966, + "rewards/margins_max": 0.08805376291275024, + "rewards/margins_min": 0.021083565428853035, + "rewards/margins_std": 0.047355085611343384, + "rewards/rejected": -0.006737329065799713, + "step": 3300 + }, + { + "epoch": 0.75, + "grad_norm": 0.5234375, + "learning_rate": 9.054065736665267e-08, + "logits/chosen": -1.4093387126922607, + "logits/rejected": -1.242791771888733, + "logps/chosen": -176.42868041992188, + "logps/rejected": -244.3980712890625, + "loss": 0.668, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.02831677719950676, + "rewards/margins": 0.0456489659845829, + "rewards/margins_max": 0.06995344907045364, + "rewards/margins_min": 0.02134447731077671, + "rewards/margins_std": 0.03437173739075661, + "rewards/rejected": -0.017332185059785843, + "step": 3310 + }, + { + "epoch": 0.75, + "grad_norm": 0.390625, + "learning_rate": 8.90261935585603e-08, + "logits/chosen": -1.288233757019043, + "logits/rejected": -1.1420161724090576, + "logps/chosen": -177.0896759033203, + "logps/rejected": -164.56689453125, + "loss": 0.6718, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.026926511898636818, + "rewards/margins": 0.04227718710899353, + "rewards/margins_max": 0.06220870465040207, + "rewards/margins_min": 0.022345667704939842, + "rewards/margins_std": 0.028187427669763565, + "rewards/rejected": -0.015350677073001862, + "step": 3320 + }, + { + "epoch": 0.75, + "grad_norm": 0.416015625, + "learning_rate": 8.752175438041906e-08, + "logits/chosen": -1.3834733963012695, + "logits/rejected": -1.1185014247894287, + "logps/chosen": -218.1145477294922, + "logps/rejected": -277.34393310546875, + "loss": 0.6656, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.02830580435693264, + "rewards/margins": 0.051726728677749634, + "rewards/margins_max": 0.08054015040397644, + "rewards/margins_min": 0.022913306951522827, + "rewards/margins_std": 0.04074833169579506, + "rewards/rejected": -0.023420918732881546, + "step": 3330 + }, + { + "epoch": 0.75, + "grad_norm": 0.439453125, + "learning_rate": 8.602743352104936e-08, + "logits/chosen": -1.3321176767349243, + "logits/rejected": -1.1113364696502686, + "logps/chosen": -216.62460327148438, + "logps/rejected": -269.898193359375, + "loss": 0.6677, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.025886043906211853, + "rewards/margins": 0.047580696642398834, + "rewards/margins_max": 0.07348641008138657, + "rewards/margins_min": 0.02167496643960476, + "rewards/margins_std": 0.03663622587919235, + "rewards/rejected": -0.02169465273618698, + "step": 3340 + }, + { + "epoch": 0.76, + "grad_norm": 0.458984375, + "learning_rate": 8.454332403915415e-08, + "logits/chosen": -1.3632304668426514, + "logits/rejected": -1.2139122486114502, + "logps/chosen": -168.23995971679688, + "logps/rejected": -220.98562622070312, + "loss": 0.668, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.02623501978814602, + "rewards/margins": 0.041043318808078766, + "rewards/margins_max": 0.06607674062252045, + "rewards/margins_min": 0.016009902581572533, + "rewards/margins_std": 0.035402603447437286, + "rewards/rejected": -0.014808299951255322, + "step": 3350 + }, + { + "epoch": 0.76, + "grad_norm": 0.30859375, + "learning_rate": 8.306951835752377e-08, + "logits/chosen": -1.5067164897918701, + "logits/rejected": -1.2972952127456665, + "logps/chosen": -212.9749755859375, + "logps/rejected": -215.76840209960938, + "loss": 0.6737, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.02593279257416725, + "rewards/margins": 0.036307260394096375, + "rewards/margins_max": 0.0532100610435009, + "rewards/margins_min": 0.0194044578820467, + "rewards/margins_std": 0.02390417270362377, + "rewards/rejected": -0.010374465957283974, + "step": 3360 + }, + { + "epoch": 0.76, + "grad_norm": 0.345703125, + "learning_rate": 8.160610825728029e-08, + "logits/chosen": -1.3713653087615967, + "logits/rejected": -1.1629103422164917, + "logps/chosen": -179.63717651367188, + "logps/rejected": -169.6957550048828, + "loss": 0.6703, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.034700967371463776, + "rewards/margins": 0.0471280999481678, + "rewards/margins_max": 0.07532576471567154, + "rewards/margins_min": 0.018930435180664062, + "rewards/margins_std": 0.0398775152862072, + "rewards/rejected": -0.012427128851413727, + "step": 3370 + }, + { + "epoch": 0.76, + "grad_norm": 0.400390625, + "learning_rate": 8.015318487216183e-08, + "logits/chosen": -1.413757085800171, + "logits/rejected": -1.0182082653045654, + "logps/chosen": -262.51885986328125, + "logps/rejected": -228.93295288085938, + "loss": 0.6683, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.038941092789173126, + "rewards/margins": 0.05965230613946915, + "rewards/margins_max": 0.08189304918050766, + "rewards/margins_min": 0.03741155192255974, + "rewards/margins_std": 0.031453169882297516, + "rewards/rejected": -0.020711207762360573, + "step": 3380 + }, + { + "epoch": 0.77, + "grad_norm": 0.3828125, + "learning_rate": 7.871083868284725e-08, + "logits/chosen": -1.2200084924697876, + "logits/rejected": -0.9986445307731628, + "logps/chosen": -191.07162475585938, + "logps/rejected": -171.0322723388672, + "loss": 0.6723, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.023246588185429573, + "rewards/margins": 0.03981078788638115, + "rewards/margins_max": 0.06385910511016846, + "rewards/margins_min": 0.01576247252523899, + "rewards/margins_std": 0.034009456634521484, + "rewards/rejected": -0.016564201563596725, + "step": 3390 + }, + { + "epoch": 0.77, + "grad_norm": 0.3984375, + "learning_rate": 7.727915951132144e-08, + "logits/chosen": -1.4563645124435425, + "logits/rejected": -1.103590488433838, + "logps/chosen": -294.8932189941406, + "logps/rejected": -303.7432556152344, + "loss": 0.6668, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.029066193848848343, + "rewards/margins": 0.050104547291994095, + "rewards/margins_max": 0.07443893700838089, + "rewards/margins_min": 0.025770163163542747, + "rewards/margins_std": 0.03441401198506355, + "rewards/rejected": -0.02103835716843605, + "step": 3400 + }, + { + "epoch": 0.77, + "grad_norm": 0.51171875, + "learning_rate": 7.585823651528156e-08, + "logits/chosen": -1.3338501453399658, + "logits/rejected": -1.1717281341552734, + "logps/chosen": -220.5869140625, + "logps/rejected": -257.8363037109375, + "loss": 0.6662, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.02696543000638485, + "rewards/margins": 0.04337473213672638, + "rewards/margins_max": 0.07208983600139618, + "rewards/margins_min": 0.014659630134701729, + "rewards/margins_std": 0.04060928896069527, + "rewards/rejected": -0.01640930399298668, + "step": 3410 + }, + { + "epoch": 0.77, + "grad_norm": 0.50390625, + "learning_rate": 7.444815818258527e-08, + "logits/chosen": -1.3896772861480713, + "logits/rejected": -1.2830214500427246, + "logps/chosen": -203.7684783935547, + "logps/rejected": -247.3974609375, + "loss": 0.6671, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03583263233304024, + "rewards/margins": 0.052742041647434235, + "rewards/margins_max": 0.08703459054231644, + "rewards/margins_min": 0.018449490889906883, + "rewards/margins_std": 0.04849698767066002, + "rewards/rejected": -0.016909409314393997, + "step": 3420 + }, + { + "epoch": 0.78, + "grad_norm": 0.30859375, + "learning_rate": 7.304901232573906e-08, + "logits/chosen": -1.5021190643310547, + "logits/rejected": -1.2359145879745483, + "logps/chosen": -163.47137451171875, + "logps/rejected": -196.88661193847656, + "loss": 0.6678, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.04131701588630676, + "rewards/margins": 0.05625222995877266, + "rewards/margins_max": 0.08685590326786041, + "rewards/margins_min": 0.02564854361116886, + "rewards/margins_std": 0.043280139565467834, + "rewards/rejected": -0.014935208484530449, + "step": 3430 + }, + { + "epoch": 0.78, + "grad_norm": 0.369140625, + "learning_rate": 7.166088607643123e-08, + "logits/chosen": -1.322670817375183, + "logits/rejected": -1.0070643424987793, + "logps/chosen": -214.02304077148438, + "logps/rejected": -234.9347381591797, + "loss": 0.6688, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.026626717299222946, + "rewards/margins": 0.05677711218595505, + "rewards/margins_max": 0.08463828265666962, + "rewards/margins_min": 0.028915945440530777, + "rewards/margins_std": 0.039401642978191376, + "rewards/rejected": -0.0301503948867321, + "step": 3440 + }, + { + "epoch": 0.78, + "grad_norm": 0.53125, + "learning_rate": 7.02838658801042e-08, + "logits/chosen": -1.4200587272644043, + "logits/rejected": -1.102832555770874, + "logps/chosen": -280.0638732910156, + "logps/rejected": -235.215576171875, + "loss": 0.6686, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03843649849295616, + "rewards/margins": 0.058247823268175125, + "rewards/margins_max": 0.08686522394418716, + "rewards/margins_min": 0.029630441218614578, + "rewards/margins_std": 0.04047109931707382, + "rewards/rejected": -0.01981133408844471, + "step": 3450 + }, + { + "epoch": 0.78, + "grad_norm": 0.404296875, + "learning_rate": 6.891803749057254e-08, + "logits/chosen": -1.5478912591934204, + "logits/rejected": -1.177132248878479, + "logps/chosen": -257.420654296875, + "logps/rejected": -240.5295867919922, + "loss": 0.6698, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.037167832255363464, + "rewards/margins": 0.04988235980272293, + "rewards/margins_max": 0.08287341892719269, + "rewards/margins_min": 0.016891302540898323, + "rewards/margins_std": 0.046656396239995956, + "rewards/rejected": -0.012714529410004616, + "step": 3460 + }, + { + "epoch": 0.78, + "grad_norm": 0.3671875, + "learning_rate": 6.756348596468167e-08, + "logits/chosen": -1.4443600177764893, + "logits/rejected": -1.1571996212005615, + "logps/chosen": -217.07919311523438, + "logps/rejected": -204.60067749023438, + "loss": 0.6689, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.034954361617565155, + "rewards/margins": 0.04938163235783577, + "rewards/margins_max": 0.0719556212425232, + "rewards/margins_min": 0.026807645335793495, + "rewards/margins_std": 0.03192444145679474, + "rewards/rejected": -0.01442726980894804, + "step": 3470 + }, + { + "epoch": 0.79, + "grad_norm": 0.32421875, + "learning_rate": 6.622029565701118e-08, + "logits/chosen": -1.420827865600586, + "logits/rejected": -1.2825648784637451, + "logps/chosen": -182.84640502929688, + "logps/rejected": -178.5884246826172, + "loss": 0.6713, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.028441939502954483, + "rewards/margins": 0.04441531002521515, + "rewards/margins_max": 0.07214485108852386, + "rewards/margins_min": 0.01668578013777733, + "rewards/margins_std": 0.03921548277139664, + "rewards/rejected": -0.015973379835486412, + "step": 3480 + }, + { + "epoch": 0.79, + "grad_norm": 0.4921875, + "learning_rate": 6.488855021462216e-08, + "logits/chosen": -1.2294635772705078, + "logits/rejected": -0.9490046501159668, + "logps/chosen": -220.43417358398438, + "logps/rejected": -197.18508911132812, + "loss": 0.6711, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.033244356513023376, + "rewards/margins": 0.047328390181064606, + "rewards/margins_max": 0.0760948583483696, + "rewards/margins_min": 0.01856192573904991, + "rewards/margins_std": 0.04068192094564438, + "rewards/rejected": -0.014084036462008953, + "step": 3490 + }, + { + "epoch": 0.79, + "grad_norm": 0.251953125, + "learning_rate": 6.356833257184746e-08, + "logits/chosen": -1.2230401039123535, + "logits/rejected": -1.1300714015960693, + "logps/chosen": -174.61322021484375, + "logps/rejected": -209.37661743164062, + "loss": 0.6667, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.02614758536219597, + "rewards/margins": 0.05365358665585518, + "rewards/margins_max": 0.07639142125844955, + "rewards/margins_min": 0.030915748327970505, + "rewards/margins_std": 0.03215615823864937, + "rewards/rejected": -0.02750600315630436, + "step": 3500 + }, + { + "epoch": 0.79, + "grad_norm": 0.419921875, + "learning_rate": 6.225972494512718e-08, + "logits/chosen": -1.4761110544204712, + "logits/rejected": -1.244800329208374, + "logps/chosen": -222.0691375732422, + "logps/rejected": -243.110107421875, + "loss": 0.6726, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.018352871760725975, + "rewards/margins": 0.04645923897624016, + "rewards/margins_max": 0.07281027734279633, + "rewards/margins_min": 0.020108195021748543, + "rewards/margins_std": 0.03726600855588913, + "rewards/rejected": -0.02810637094080448, + "step": 3510 + }, + { + "epoch": 0.8, + "grad_norm": 0.333984375, + "learning_rate": 6.096280882788874e-08, + "logits/chosen": -1.3621561527252197, + "logits/rejected": -1.188506007194519, + "logps/chosen": -170.3199005126953, + "logps/rejected": -183.8995361328125, + "loss": 0.6728, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.030621353536844254, + "rewards/margins": 0.03551565110683441, + "rewards/margins_max": 0.06205816939473152, + "rewards/margins_min": 0.008973127231001854, + "rewards/margins_std": 0.03753679618239403, + "rewards/rejected": -0.004894299898296595, + "step": 3520 + }, + { + "epoch": 0.8, + "grad_norm": 0.484375, + "learning_rate": 5.96776649854718e-08, + "logits/chosen": -1.4086308479309082, + "logits/rejected": -1.1658676862716675, + "logps/chosen": -210.52297973632812, + "logps/rejected": -202.68783569335938, + "loss": 0.6701, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.028438914567232132, + "rewards/margins": 0.04048081114888191, + "rewards/margins_max": 0.059843193739652634, + "rewards/margins_min": 0.021118421107530594, + "rewards/margins_std": 0.02738254889845848, + "rewards/rejected": -0.012041894719004631, + "step": 3530 + }, + { + "epoch": 0.8, + "grad_norm": 0.44921875, + "learning_rate": 5.840437345009858e-08, + "logits/chosen": -1.2647746801376343, + "logits/rejected": -1.0508357286453247, + "logps/chosen": -225.2239532470703, + "logps/rejected": -225.44296264648438, + "loss": 0.6713, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.030933257192373276, + "rewards/margins": 0.04926546663045883, + "rewards/margins_max": 0.07699280232191086, + "rewards/margins_min": 0.02153814025223255, + "rewards/margins_std": 0.03921236842870712, + "rewards/rejected": -0.018332213163375854, + "step": 3540 + }, + { + "epoch": 0.8, + "grad_norm": 0.2890625, + "learning_rate": 5.7143013515890074e-08, + "logits/chosen": -1.4881861209869385, + "logits/rejected": -1.179818868637085, + "logps/chosen": -196.1507110595703, + "logps/rejected": -187.4951629638672, + "loss": 0.6673, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03177567571401596, + "rewards/margins": 0.048626724630594254, + "rewards/margins_max": 0.08106863498687744, + "rewards/margins_min": 0.016184817999601364, + "rewards/margins_std": 0.0458797886967659, + "rewards/rejected": -0.016851048916578293, + "step": 3550 + }, + { + "epoch": 0.8, + "grad_norm": 0.40234375, + "learning_rate": 5.589366373392754e-08, + "logits/chosen": -1.2353525161743164, + "logits/rejected": -1.0010229349136353, + "logps/chosen": -240.0959014892578, + "logps/rejected": -198.23251342773438, + "loss": 0.6655, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.04709520563483238, + "rewards/margins": 0.060860536992549896, + "rewards/margins_max": 0.08691618591547012, + "rewards/margins_min": 0.034804895520210266, + "rewards/margins_std": 0.03684823960065842, + "rewards/rejected": -0.013765333220362663, + "step": 3560 + }, + { + "epoch": 0.81, + "grad_norm": 0.478515625, + "learning_rate": 5.465640190736123e-08, + "logits/chosen": -1.3841360807418823, + "logits/rejected": -1.2201907634735107, + "logps/chosen": -203.7855987548828, + "logps/rejected": -190.33120727539062, + "loss": 0.6728, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.017639171332120895, + "rewards/margins": 0.028413286432623863, + "rewards/margins_max": 0.04533935338258743, + "rewards/margins_min": 0.011487223207950592, + "rewards/margins_std": 0.02393706701695919, + "rewards/rejected": -0.010774116031825542, + "step": 3570 + }, + { + "epoch": 0.81, + "grad_norm": 0.345703125, + "learning_rate": 5.343130508656501e-08, + "logits/chosen": -1.4076919555664062, + "logits/rejected": -1.1919476985931396, + "logps/chosen": -218.1761016845703, + "logps/rejected": -223.3293914794922, + "loss": 0.6708, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.024606024846434593, + "rewards/margins": 0.03831148147583008, + "rewards/margins_max": 0.05184120684862137, + "rewards/margins_min": 0.024781761690974236, + "rewards/margins_std": 0.0191339161247015, + "rewards/rejected": -0.013705459423363209, + "step": 3580 + }, + { + "epoch": 0.81, + "grad_norm": 0.380859375, + "learning_rate": 5.221844956433794e-08, + "logits/chosen": -1.3877627849578857, + "logits/rejected": -1.050703525543213, + "logps/chosen": -204.9930419921875, + "logps/rejected": -199.54371643066406, + "loss": 0.6685, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0310288667678833, + "rewards/margins": 0.05146826431155205, + "rewards/margins_max": 0.079628124833107, + "rewards/margins_min": 0.023308411240577698, + "rewards/margins_std": 0.039824046194553375, + "rewards/rejected": -0.020439397543668747, + "step": 3590 + }, + { + "epoch": 0.81, + "grad_norm": 0.294921875, + "learning_rate": 5.101791087115353e-08, + "logits/chosen": -1.3680554628372192, + "logits/rejected": -1.0979640483856201, + "logps/chosen": -195.17733764648438, + "logps/rejected": -175.19503784179688, + "loss": 0.6659, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03740929067134857, + "rewards/margins": 0.052658699452877045, + "rewards/margins_max": 0.08210619539022446, + "rewards/margins_min": 0.02321120724081993, + "rewards/margins_std": 0.04164504259824753, + "rewards/rejected": -0.0152494041249156, + "step": 3600 + }, + { + "epoch": 0.82, + "grad_norm": 0.478515625, + "learning_rate": 4.982976377045545e-08, + "logits/chosen": -1.3179078102111816, + "logits/rejected": -0.9852703809738159, + "logps/chosen": -208.92697143554688, + "logps/rejected": -201.54153442382812, + "loss": 0.6651, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.029262781143188477, + "rewards/margins": 0.057259321212768555, + "rewards/margins_max": 0.08372489362955093, + "rewards/margins_min": 0.030793756246566772, + "rewards/margins_std": 0.03742796555161476, + "rewards/rejected": -0.027996540069580078, + "step": 3610 + }, + { + "epoch": 0.82, + "grad_norm": 0.4140625, + "learning_rate": 4.865408225400233e-08, + "logits/chosen": -1.4115145206451416, + "logits/rejected": -1.1585103273391724, + "logps/chosen": -178.40237426757812, + "logps/rejected": -179.32286071777344, + "loss": 0.6714, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.02827485278248787, + "rewards/margins": 0.03694600984454155, + "rewards/margins_max": 0.06516902148723602, + "rewards/margins_min": 0.008723007515072823, + "rewards/margins_std": 0.039913360029459, + "rewards/rejected": -0.00867115892469883, + "step": 3620 + }, + { + "epoch": 0.82, + "grad_norm": 0.365234375, + "learning_rate": 4.749093953725952e-08, + "logits/chosen": -1.4016355276107788, + "logits/rejected": -1.1189079284667969, + "logps/chosen": -238.0830841064453, + "logps/rejected": -210.646728515625, + "loss": 0.6685, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.025943368673324585, + "rewards/margins": 0.04778265208005905, + "rewards/margins_max": 0.06782630831003189, + "rewards/margins_min": 0.02773899957537651, + "rewards/margins_std": 0.02834600768983364, + "rewards/rejected": -0.021839281544089317, + "step": 3630 + }, + { + "epoch": 0.82, + "grad_norm": 0.390625, + "learning_rate": 4.634040805483946e-08, + "logits/chosen": -1.287687063217163, + "logits/rejected": -0.9820324778556824, + "logps/chosen": -209.3041229248047, + "logps/rejected": -242.19949340820312, + "loss": 0.6652, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03499635308980942, + "rewards/margins": 0.04892340302467346, + "rewards/margins_max": 0.07500474154949188, + "rewards/margins_min": 0.022842060774564743, + "rewards/margins_std": 0.03688458353281021, + "rewards/rejected": -0.013927051797509193, + "step": 3640 + }, + { + "epoch": 0.83, + "grad_norm": 0.38671875, + "learning_rate": 4.5202559455991465e-08, + "logits/chosen": -1.3673442602157593, + "logits/rejected": -1.201022982597351, + "logps/chosen": -193.3282470703125, + "logps/rejected": -199.09353637695312, + "loss": 0.6715, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.029926681891083717, + "rewards/margins": 0.04664327949285507, + "rewards/margins_max": 0.07682793587446213, + "rewards/margins_min": 0.016458621248602867, + "rewards/margins_std": 0.0426875576376915, + "rewards/rejected": -0.016716599464416504, + "step": 3650 + }, + { + "epoch": 0.83, + "grad_norm": 0.345703125, + "learning_rate": 4.4077464600139116e-08, + "logits/chosen": -1.451099157333374, + "logits/rejected": -1.2489079236984253, + "logps/chosen": -190.05506896972656, + "logps/rejected": -183.50390625, + "loss": 0.6707, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.022996725514531136, + "rewards/margins": 0.04504052549600601, + "rewards/margins_max": 0.06947029381990433, + "rewards/margins_min": 0.020610753446817398, + "rewards/margins_std": 0.034548915922641754, + "rewards/rejected": -0.022043799981474876, + "step": 3660 + }, + { + "epoch": 0.83, + "grad_norm": 0.345703125, + "learning_rate": 4.296519355246775e-08, + "logits/chosen": -1.4090425968170166, + "logits/rejected": -0.9449017643928528, + "logps/chosen": -281.3605041503906, + "logps/rejected": -215.10693359375, + "loss": 0.6673, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03155981004238129, + "rewards/margins": 0.06298719346523285, + "rewards/margins_max": 0.10227738320827484, + "rewards/margins_min": 0.023697001859545708, + "rewards/margins_std": 0.055564723908901215, + "rewards/rejected": -0.03142738342285156, + "step": 3670 + }, + { + "epoch": 0.83, + "grad_norm": 0.322265625, + "learning_rate": 4.1865815579561234e-08, + "logits/chosen": -1.3743921518325806, + "logits/rejected": -1.051452398300171, + "logps/chosen": -222.72189331054688, + "logps/rejected": -199.6683349609375, + "loss": 0.6653, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03226775676012039, + "rewards/margins": 0.05695630982518196, + "rewards/margins_max": 0.0883602723479271, + "rewards/margins_min": 0.025552351027727127, + "rewards/margins_std": 0.044411905109882355, + "rewards/rejected": -0.02468855120241642, + "step": 3680 + }, + { + "epoch": 0.83, + "grad_norm": 0.51171875, + "learning_rate": 4.0779399145088236e-08, + "logits/chosen": -1.42634117603302, + "logits/rejected": -1.1020008325576782, + "logps/chosen": -196.1156463623047, + "logps/rejected": -203.8043670654297, + "loss": 0.6641, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03701802343130112, + "rewards/margins": 0.058898139744997025, + "rewards/margins_max": 0.0883612185716629, + "rewards/margins_min": 0.029435062780976295, + "rewards/margins_std": 0.04166708141565323, + "rewards/rejected": -0.02188010886311531, + "step": 3690 + }, + { + "epoch": 0.84, + "grad_norm": 0.291015625, + "learning_rate": 3.970601190553882e-08, + "logits/chosen": -1.5326263904571533, + "logits/rejected": -1.2081806659698486, + "logps/chosen": -218.63272094726562, + "logps/rejected": -206.61660766601562, + "loss": 0.6718, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.030933424830436707, + "rewards/margins": 0.04485446959733963, + "rewards/margins_max": 0.07049910724163055, + "rewards/margins_min": 0.01920982263982296, + "rewards/margins_std": 0.036267004907131195, + "rewards/rejected": -0.013921047560870647, + "step": 3700 + }, + { + "epoch": 0.84, + "grad_norm": 0.41796875, + "learning_rate": 3.864572070601099e-08, + "logits/chosen": -1.3997539281845093, + "logits/rejected": -1.076831579208374, + "logps/chosen": -187.23780822753906, + "logps/rejected": -215.34939575195312, + "loss": 0.6692, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.04385649040341377, + "rewards/margins": 0.056016553193330765, + "rewards/margins_max": 0.08031867444515228, + "rewards/margins_min": 0.03171443194150925, + "rewards/margins_std": 0.03436839208006859, + "rewards/rejected": -0.012160064652562141, + "step": 3710 + }, + { + "epoch": 0.84, + "grad_norm": 0.3359375, + "learning_rate": 3.7598591576048e-08, + "logits/chosen": -1.5493652820587158, + "logits/rejected": -1.246185541152954, + "logps/chosen": -195.29684448242188, + "logps/rejected": -215.9580078125, + "loss": 0.6632, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.04298774152994156, + "rewards/margins": 0.05915018916130066, + "rewards/margins_max": 0.08389954268932343, + "rewards/margins_min": 0.034400831907987595, + "rewards/margins_std": 0.03500087186694145, + "rewards/rejected": -0.016162443906068802, + "step": 3720 + }, + { + "epoch": 0.84, + "grad_norm": 0.349609375, + "learning_rate": 3.656468972552637e-08, + "logits/chosen": -1.6022872924804688, + "logits/rejected": -1.0952297449111938, + "logps/chosen": -291.47442626953125, + "logps/rejected": -227.7351531982422, + "loss": 0.6704, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.032902318984270096, + "rewards/margins": 0.05154528096318245, + "rewards/margins_max": 0.08279638737440109, + "rewards/margins_min": 0.020294170826673508, + "rewards/margins_std": 0.0441957451403141, + "rewards/rejected": -0.018642958253622055, + "step": 3730 + }, + { + "epoch": 0.85, + "grad_norm": 0.42578125, + "learning_rate": 3.554407954059488e-08, + "logits/chosen": -1.4918447732925415, + "logits/rejected": -1.046549677848816, + "logps/chosen": -270.91680908203125, + "logps/rejected": -204.57913208007812, + "loss": 0.6668, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03338420391082764, + "rewards/margins": 0.05518590286374092, + "rewards/margins_max": 0.09111050516366959, + "rewards/margins_min": 0.019261294975876808, + "rewards/margins_std": 0.050805073231458664, + "rewards/rejected": -0.021801700815558434, + "step": 3740 + }, + { + "epoch": 0.85, + "grad_norm": 0.30078125, + "learning_rate": 3.4536824579665e-08, + "logits/chosen": -1.3506847620010376, + "logits/rejected": -1.2051067352294922, + "logps/chosen": -166.91786193847656, + "logps/rejected": -162.62588500976562, + "loss": 0.6727, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0248137004673481, + "rewards/margins": 0.04015351086854935, + "rewards/margins_max": 0.060021065175533295, + "rewards/margins_min": 0.020285960286855698, + "rewards/margins_std": 0.028096962720155716, + "rewards/rejected": -0.015339814126491547, + "step": 3750 + }, + { + "epoch": 0.85, + "grad_norm": 0.43359375, + "learning_rate": 3.354298756945292e-08, + "logits/chosen": -1.447142481803894, + "logits/rejected": -1.2159852981567383, + "logps/chosen": -184.87144470214844, + "logps/rejected": -246.08230590820312, + "loss": 0.6675, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04362271726131439, + "rewards/margins": 0.05569925904273987, + "rewards/margins_max": 0.07866283506155014, + "rewards/margins_min": 0.032735686749219894, + "rewards/margins_std": 0.032475393265485764, + "rewards/rejected": -0.012076543644070625, + "step": 3760 + }, + { + "epoch": 0.85, + "grad_norm": 0.53515625, + "learning_rate": 3.2562630401072793e-08, + "logits/chosen": -1.4080158472061157, + "logits/rejected": -1.023107647895813, + "logps/chosen": -247.05734252929688, + "logps/rejected": -232.3527069091797, + "loss": 0.6699, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.037775538861751556, + "rewards/margins": 0.05671767145395279, + "rewards/margins_max": 0.08435182273387909, + "rewards/margins_min": 0.029083510860800743, + "rewards/margins_std": 0.03908059746026993, + "rewards/rejected": -0.018942128866910934, + "step": 3770 + }, + { + "epoch": 0.85, + "grad_norm": 0.392578125, + "learning_rate": 3.159581412618309e-08, + "logits/chosen": -1.4490994215011597, + "logits/rejected": -1.2454156875610352, + "logps/chosen": -218.85708618164062, + "logps/rejected": -199.06924438476562, + "loss": 0.6676, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.027776191011071205, + "rewards/margins": 0.0423356369137764, + "rewards/margins_max": 0.06463000178337097, + "rewards/margins_min": 0.020041272044181824, + "rewards/margins_std": 0.03152899444103241, + "rewards/rejected": -0.014559444971382618, + "step": 3780 + }, + { + "epoch": 0.86, + "grad_norm": 0.41015625, + "learning_rate": 3.0642598953184164e-08, + "logits/chosen": -1.4658455848693848, + "logits/rejected": -0.8866893649101257, + "logps/chosen": -239.5939483642578, + "logps/rejected": -212.2051544189453, + "loss": 0.6684, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.030214086174964905, + "rewards/margins": 0.06262092292308807, + "rewards/margins_max": 0.10303632915019989, + "rewards/margins_min": 0.022205516695976257, + "rewards/margins_std": 0.057156018912792206, + "rewards/rejected": -0.03240683674812317, + "step": 3790 + }, + { + "epoch": 0.86, + "grad_norm": 0.50390625, + "learning_rate": 2.9703044243468866e-08, + "logits/chosen": -1.3730664253234863, + "logits/rejected": -1.1947462558746338, + "logps/chosen": -193.0137176513672, + "logps/rejected": -225.66714477539062, + "loss": 0.6659, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.027939695864915848, + "rewards/margins": 0.05449366569519043, + "rewards/margins_max": 0.08065618574619293, + "rewards/margins_min": 0.02833114191889763, + "rewards/margins_std": 0.03699938952922821, + "rewards/rejected": -0.026553967967629433, + "step": 3800 + }, + { + "epoch": 0.86, + "grad_norm": 0.3046875, + "learning_rate": 2.8777208507726054e-08, + "logits/chosen": -1.3284589052200317, + "logits/rejected": -1.029971718788147, + "logps/chosen": -172.42007446289062, + "logps/rejected": -186.41592407226562, + "loss": 0.6658, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03503911942243576, + "rewards/margins": 0.06267812103033066, + "rewards/margins_max": 0.08370877057313919, + "rewards/margins_min": 0.04164748266339302, + "rewards/margins_std": 0.029741818085312843, + "rewards/rejected": -0.027639007195830345, + "step": 3810 + }, + { + "epoch": 0.86, + "grad_norm": 0.42578125, + "learning_rate": 2.786514940229634e-08, + "logits/chosen": -1.3927130699157715, + "logits/rejected": -1.131127119064331, + "logps/chosen": -211.1796112060547, + "logps/rejected": -181.09854125976562, + "loss": 0.6691, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.04030333831906319, + "rewards/margins": 0.04070250317454338, + "rewards/margins_max": 0.07060278952121735, + "rewards/margins_min": 0.010802226141095161, + "rewards/margins_std": 0.04228537529706955, + "rewards/rejected": -0.0003991674748249352, + "step": 3820 + }, + { + "epoch": 0.87, + "grad_norm": 0.4140625, + "learning_rate": 2.6966923725582234e-08, + "logits/chosen": -1.4242016077041626, + "logits/rejected": -1.145247220993042, + "logps/chosen": -230.03311157226562, + "logps/rejected": -180.59939575195312, + "loss": 0.6724, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.026708319783210754, + "rewards/margins": 0.034293271601200104, + "rewards/margins_max": 0.0604860857129097, + "rewards/margins_min": 0.008100450038909912, + "rewards/margins_std": 0.03704223781824112, + "rewards/rejected": -0.007584949489682913, + "step": 3830 + }, + { + "epoch": 0.87, + "grad_norm": 0.376953125, + "learning_rate": 2.6082587414510442e-08, + "logits/chosen": -1.2004412412643433, + "logits/rejected": -1.1661103963851929, + "logps/chosen": -218.96182250976562, + "logps/rejected": -263.70001220703125, + "loss": 0.6686, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.020230960100889206, + "rewards/margins": 0.04539378732442856, + "rewards/margins_max": 0.06975904852151871, + "rewards/margins_min": 0.021028511226177216, + "rewards/margins_std": 0.034457698464393616, + "rewards/rejected": -0.025162819772958755, + "step": 3840 + }, + { + "epoch": 0.87, + "grad_norm": 0.3984375, + "learning_rate": 2.5212195541048698e-08, + "logits/chosen": -1.3625192642211914, + "logits/rejected": -1.177557349205017, + "logps/chosen": -297.97906494140625, + "logps/rejected": -289.76824951171875, + "loss": 0.6671, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.03726391866803169, + "rewards/margins": 0.05677323415875435, + "rewards/margins_max": 0.07760507613420486, + "rewards/margins_min": 0.035941388458013535, + "rewards/margins_std": 0.02946067787706852, + "rewards/rejected": -0.019509317353367805, + "step": 3850 + }, + { + "epoch": 0.87, + "grad_norm": 0.60546875, + "learning_rate": 2.435580230877607e-08, + "logits/chosen": -1.294594168663025, + "logits/rejected": -1.0570218563079834, + "logps/chosen": -188.47618103027344, + "logps/rejected": -227.89230346679688, + "loss": 0.6712, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.036184389144182205, + "rewards/margins": 0.04435017332434654, + "rewards/margins_max": 0.07742326706647873, + "rewards/margins_min": 0.011277077719569206, + "rewards/margins_std": 0.046772416681051254, + "rewards/rejected": -0.008165782317519188, + "step": 3860 + }, + { + "epoch": 0.87, + "grad_norm": 0.328125, + "learning_rate": 2.3513461049507383e-08, + "logits/chosen": -1.2781083583831787, + "logits/rejected": -1.0610449314117432, + "logps/chosen": -178.37767028808594, + "logps/rejected": -186.87088012695312, + "loss": 0.6707, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.035525936633348465, + "rewards/margins": 0.04440684616565704, + "rewards/margins_max": 0.0715843215584755, + "rewards/margins_min": 0.017229357734322548, + "rewards/margins_std": 0.03843476623296738, + "rewards/rejected": -0.008880906738340855, + "step": 3870 + }, + { + "epoch": 0.88, + "grad_norm": 0.384765625, + "learning_rate": 2.2685224219972183e-08, + "logits/chosen": -1.451965570449829, + "logits/rejected": -1.1790364980697632, + "logps/chosen": -215.76779174804688, + "logps/rejected": -293.38446044921875, + "loss": 0.6697, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03224249556660652, + "rewards/margins": 0.06852399557828903, + "rewards/margins_max": 0.10147424042224884, + "rewards/margins_min": 0.03557376563549042, + "rewards/margins_std": 0.04659866914153099, + "rewards/rejected": -0.03628150746226311, + "step": 3880 + }, + { + "epoch": 0.88, + "grad_norm": 0.326171875, + "learning_rate": 2.1871143398547733e-08, + "logits/chosen": -1.3184213638305664, + "logits/rejected": -1.0465301275253296, + "logps/chosen": -235.9217071533203, + "logps/rejected": -187.95343017578125, + "loss": 0.6703, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.029069799929857254, + "rewards/margins": 0.052199989557266235, + "rewards/margins_max": 0.07874207943677902, + "rewards/margins_min": 0.025657888501882553, + "rewards/margins_std": 0.03753619268536568, + "rewards/rejected": -0.023130184039473534, + "step": 3890 + }, + { + "epoch": 0.88, + "grad_norm": 0.44140625, + "learning_rate": 2.1071269282047195e-08, + "logits/chosen": -1.3340555429458618, + "logits/rejected": -0.9785248041152954, + "logps/chosen": -216.3225555419922, + "logps/rejected": -205.25595092773438, + "loss": 0.6661, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03301094099879265, + "rewards/margins": 0.06074405834078789, + "rewards/margins_max": 0.09566470980644226, + "rewards/margins_min": 0.02582341432571411, + "rewards/margins_std": 0.04938525706529617, + "rewards/rejected": -0.027733122929930687, + "step": 3900 + }, + { + "epoch": 0.88, + "grad_norm": 0.451171875, + "learning_rate": 2.0285651682562355e-08, + "logits/chosen": -1.4139344692230225, + "logits/rejected": -1.09742271900177, + "logps/chosen": -234.76651000976562, + "logps/rejected": -265.72662353515625, + "loss": 0.6633, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.034630510956048965, + "rewards/margins": 0.06832818686962128, + "rewards/margins_max": 0.11305254697799683, + "rewards/margins_min": 0.023603813722729683, + "rewards/margins_std": 0.06324980407953262, + "rewards/rejected": -0.03369767218828201, + "step": 3910 + }, + { + "epoch": 0.89, + "grad_norm": 0.48046875, + "learning_rate": 1.951433952436174e-08, + "logits/chosen": -1.4427460432052612, + "logits/rejected": -1.0282113552093506, + "logps/chosen": -253.4451904296875, + "logps/rejected": -203.51739501953125, + "loss": 0.662, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03823239356279373, + "rewards/margins": 0.058333247900009155, + "rewards/margins_max": 0.09260444343090057, + "rewards/margins_min": 0.024062050506472588, + "rewards/margins_std": 0.048466794192790985, + "rewards/rejected": -0.020100858062505722, + "step": 3920 + }, + { + "epoch": 0.89, + "grad_norm": 0.45703125, + "learning_rate": 1.8757380840843524e-08, + "logits/chosen": -1.5329930782318115, + "logits/rejected": -1.2544704675674438, + "logps/chosen": -225.38186645507812, + "logps/rejected": -259.9751892089844, + "loss": 0.6713, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03190339356660843, + "rewards/margins": 0.041507575660943985, + "rewards/margins_max": 0.06313765048980713, + "rewards/margins_min": 0.01987750083208084, + "rewards/margins_std": 0.030589541420340538, + "rewards/rejected": -0.009604182094335556, + "step": 3930 + }, + { + "epoch": 0.89, + "grad_norm": 0.439453125, + "learning_rate": 1.8014822771544784e-08, + "logits/chosen": -1.4925791025161743, + "logits/rejected": -1.267610788345337, + "logps/chosen": -192.39517211914062, + "logps/rejected": -225.89212036132812, + "loss": 0.6688, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.022831646725535393, + "rewards/margins": 0.041496653109788895, + "rewards/margins_max": 0.06753791868686676, + "rewards/margins_min": 0.01545538567006588, + "rewards/margins_std": 0.03682791069149971, + "rewards/rejected": -0.018665006384253502, + "step": 3940 + }, + { + "epoch": 0.89, + "grad_norm": 0.37890625, + "learning_rate": 1.7286711559205247e-08, + "logits/chosen": -1.1196014881134033, + "logits/rejected": -0.8338809013366699, + "logps/chosen": -268.010009765625, + "logps/rejected": -212.37698364257812, + "loss": 0.6678, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.028760245069861412, + "rewards/margins": 0.04862253740429878, + "rewards/margins_max": 0.06828413903713226, + "rewards/margins_min": 0.02896093763411045, + "rewards/margins_std": 0.027805697172880173, + "rewards/rejected": -0.019862286746501923, + "step": 3950 + }, + { + "epoch": 0.9, + "grad_norm": 0.46875, + "learning_rate": 1.6573092546888128e-08, + "logits/chosen": -1.385881781578064, + "logits/rejected": -0.7845109105110168, + "logps/chosen": -295.77288818359375, + "logps/rejected": -195.02999877929688, + "loss": 0.6651, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.03401690721511841, + "rewards/margins": 0.05478140711784363, + "rewards/margins_max": 0.08118084818124771, + "rewards/margins_min": 0.028381969779729843, + "rewards/margins_std": 0.03733444958925247, + "rewards/rejected": -0.02076449990272522, + "step": 3960 + }, + { + "epoch": 0.9, + "grad_norm": 0.6171875, + "learning_rate": 1.5874010175156104e-08, + "logits/chosen": -1.3183201551437378, + "logits/rejected": -1.0514836311340332, + "logps/chosen": -294.5506591796875, + "logps/rejected": -218.8050079345703, + "loss": 0.6703, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.030712831765413284, + "rewards/margins": 0.05188845843076706, + "rewards/margins_max": 0.08256353437900543, + "rewards/margins_min": 0.021213386207818985, + "rewards/margins_std": 0.04338110610842705, + "rewards/rejected": -0.021175626665353775, + "step": 3970 + }, + { + "epoch": 0.9, + "grad_norm": 0.41796875, + "learning_rate": 1.518950797930357e-08, + "logits/chosen": -1.3133951425552368, + "logits/rejected": -1.1905639171600342, + "logps/chosen": -189.7636260986328, + "logps/rejected": -214.4491424560547, + "loss": 0.673, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.02525566890835762, + "rewards/margins": 0.04084246605634689, + "rewards/margins_max": 0.06745803356170654, + "rewards/margins_min": 0.014226903207600117, + "rewards/margins_std": 0.03764009103178978, + "rewards/rejected": -0.015586796216666698, + "step": 3980 + }, + { + "epoch": 0.9, + "grad_norm": 0.46875, + "learning_rate": 1.4519628586646072e-08, + "logits/chosen": -1.4533544778823853, + "logits/rejected": -1.3012058734893799, + "logps/chosen": -151.31124877929688, + "logps/rejected": -185.54855346679688, + "loss": 0.6711, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.031321872025728226, + "rewards/margins": 0.0467819944024086, + "rewards/margins_max": 0.07836450636386871, + "rewards/margins_min": 0.01519948523491621, + "rewards/margins_std": 0.0446644201874733, + "rewards/rejected": -0.015460127964615822, + "step": 3990 + }, + { + "epoch": 0.9, + "grad_norm": 0.380859375, + "learning_rate": 1.3864413713865098e-08, + "logits/chosen": -1.423572301864624, + "logits/rejected": -1.1824599504470825, + "logps/chosen": -184.4468231201172, + "logps/rejected": -183.77056884765625, + "loss": 0.6714, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.028317485004663467, + "rewards/margins": 0.05067143961787224, + "rewards/margins_max": 0.08228854835033417, + "rewards/margins_min": 0.019054336473345757, + "rewards/margins_std": 0.0447133406996727, + "rewards/rejected": -0.022353962063789368, + "step": 4000 + }, + { + "epoch": 0.91, + "grad_norm": 0.28125, + "learning_rate": 1.3223904164410494e-08, + "logits/chosen": -1.342595100402832, + "logits/rejected": -1.0777925252914429, + "logps/chosen": -201.78912353515625, + "logps/rejected": -189.086181640625, + "loss": 0.6711, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.03158831596374512, + "rewards/margins": 0.051473308354616165, + "rewards/margins_max": 0.08596741408109665, + "rewards/margins_min": 0.01697920449078083, + "rewards/margins_std": 0.04878203570842743, + "rewards/rejected": -0.019884996116161346, + "step": 4010 + }, + { + "epoch": 0.91, + "grad_norm": 0.52734375, + "learning_rate": 1.2598139825959392e-08, + "logits/chosen": -1.4462448358535767, + "logits/rejected": -1.0174031257629395, + "logps/chosen": -246.3612060546875, + "logps/rejected": -235.6538543701172, + "loss": 0.6644, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03416098281741142, + "rewards/margins": 0.05894618108868599, + "rewards/margins_max": 0.08806253969669342, + "rewards/margins_min": 0.02982981503009796, + "rewards/margins_std": 0.04117675870656967, + "rewards/rejected": -0.024785198271274567, + "step": 4020 + }, + { + "epoch": 0.91, + "grad_norm": 0.369140625, + "learning_rate": 1.1987159667932123e-08, + "logits/chosen": -1.6274089813232422, + "logits/rejected": -1.4427483081817627, + "logps/chosen": -209.2533721923828, + "logps/rejected": -223.5661163330078, + "loss": 0.6682, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.035410456359386444, + "rewards/margins": 0.054214704781770706, + "rewards/margins_max": 0.08917896449565887, + "rewards/margins_min": 0.019250452518463135, + "rewards/margins_std": 0.049446918070316315, + "rewards/rejected": -0.01880425028502941, + "step": 4030 + }, + { + "epoch": 0.91, + "grad_norm": 0.498046875, + "learning_rate": 1.139100173906543e-08, + "logits/chosen": -1.3406426906585693, + "logits/rejected": -1.109688639640808, + "logps/chosen": -150.0030059814453, + "logps/rejected": -161.7610321044922, + "loss": 0.6714, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.023004988208413124, + "rewards/margins": 0.04010429233312607, + "rewards/margins_max": 0.0670587494969368, + "rewards/margins_min": 0.013149833306670189, + "rewards/margins_std": 0.0381193533539772, + "rewards/rejected": -0.017099300399422646, + "step": 4040 + }, + { + "epoch": 0.92, + "grad_norm": 0.5, + "learning_rate": 1.0809703165043205e-08, + "logits/chosen": -1.3344614505767822, + "logits/rejected": -0.9872671961784363, + "logps/chosen": -264.97149658203125, + "logps/rejected": -213.99990844726562, + "loss": 0.6679, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04212633892893791, + "rewards/margins": 0.053166113793849945, + "rewards/margins_max": 0.08254306018352509, + "rewards/margins_min": 0.023789182305336, + "rewards/margins_std": 0.04154526814818382, + "rewards/rejected": -0.011039778590202332, + "step": 4050 + }, + { + "epoch": 0.92, + "grad_norm": 0.34375, + "learning_rate": 1.0243300146184047e-08, + "logits/chosen": -1.3708654642105103, + "logits/rejected": -1.1119438409805298, + "logps/chosen": -167.91726684570312, + "logps/rejected": -184.01956176757812, + "loss": 0.6694, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.04270718991756439, + "rewards/margins": 0.052512459456920624, + "rewards/margins_max": 0.07811418175697327, + "rewards/margins_min": 0.026910746470093727, + "rewards/margins_std": 0.03620629757642746, + "rewards/rejected": -0.009805269539356232, + "step": 4060 + }, + { + "epoch": 0.92, + "grad_norm": 0.271484375, + "learning_rate": 9.69182795518722e-09, + "logits/chosen": -1.3817869424819946, + "logits/rejected": -1.1750977039337158, + "logps/chosen": -214.696533203125, + "logps/rejected": -229.936767578125, + "loss": 0.6685, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.02906607650220394, + "rewards/margins": 0.04736794903874397, + "rewards/margins_max": 0.07081864774227142, + "rewards/margins_min": 0.02391725964844227, + "rewards/margins_std": 0.033164288848638535, + "rewards/rejected": -0.018301870673894882, + "step": 4070 + }, + { + "epoch": 0.92, + "grad_norm": 0.4921875, + "learning_rate": 9.155320934936039e-09, + "logits/chosen": -1.429626226425171, + "logits/rejected": -1.0969443321228027, + "logps/chosen": -262.22833251953125, + "logps/rejected": -243.362060546875, + "loss": 0.6685, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.035449132323265076, + "rewards/margins": 0.052136652171611786, + "rewards/margins_max": 0.07579027116298676, + "rewards/margins_min": 0.028483033180236816, + "rewards/margins_std": 0.03345127031207085, + "rewards/rejected": -0.01668752171099186, + "step": 4080 + }, + { + "epoch": 0.92, + "grad_norm": 0.447265625, + "learning_rate": 8.633812496358972e-09, + "logits/chosen": -1.5154110193252563, + "logits/rejected": -1.2433773279190063, + "logps/chosen": -260.2387390136719, + "logps/rejected": -204.50048828125, + "loss": 0.6694, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.030448496341705322, + "rewards/margins": 0.044683560729026794, + "rewards/margins_max": 0.06551718711853027, + "rewards/margins_min": 0.023849938064813614, + "rewards/margins_std": 0.02946319617331028, + "rewards/rejected": -0.014235064387321472, + "step": 4090 + }, + { + "epoch": 0.93, + "grad_norm": 0.328125, + "learning_rate": 8.127335116349304e-09, + "logits/chosen": -1.5176746845245361, + "logits/rejected": -1.318629264831543, + "logps/chosen": -198.71572875976562, + "logps/rejected": -189.38839721679688, + "loss": 0.6772, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.025044983252882957, + "rewards/margins": 0.03115997649729252, + "rewards/margins_max": 0.05419089272618294, + "rewards/margins_min": 0.00812905840575695, + "rewards/margins_std": 0.03257063776254654, + "rewards/rejected": -0.0061149937100708485, + "step": 4100 + }, + { + "epoch": 0.93, + "grad_norm": 0.419921875, + "learning_rate": 7.635920335742202e-09, + "logits/chosen": -1.4340060949325562, + "logits/rejected": -1.1694340705871582, + "logps/chosen": -227.19729614257812, + "logps/rejected": -253.91653442382812, + "loss": 0.6733, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.02697773650288582, + "rewards/margins": 0.04178674519062042, + "rewards/margins_max": 0.062144309282302856, + "rewards/margins_min": 0.02142917737364769, + "rewards/margins_std": 0.02878994680941105, + "rewards/rejected": -0.014809004962444305, + "step": 4110 + }, + { + "epoch": 0.93, + "grad_norm": 0.25390625, + "learning_rate": 7.159598757350921e-09, + "logits/chosen": -1.2499946355819702, + "logits/rejected": -1.0438182353973389, + "logps/chosen": -258.05029296875, + "logps/rejected": -260.5018310546875, + "loss": 0.6671, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.025008153170347214, + "rewards/margins": 0.04763338714838028, + "rewards/margins_max": 0.0701470822095871, + "rewards/margins_min": 0.025119686499238014, + "rewards/margins_std": 0.03183918446302414, + "rewards/rejected": -0.022625230252742767, + "step": 4120 + }, + { + "epoch": 0.93, + "grad_norm": 0.322265625, + "learning_rate": 6.698400044060775e-09, + "logits/chosen": -1.3737030029296875, + "logits/rejected": -1.017594575881958, + "logps/chosen": -238.93344116210938, + "logps/rejected": -224.15365600585938, + "loss": 0.6709, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.028364086523652077, + "rewards/margins": 0.04671841114759445, + "rewards/margins_max": 0.07550819218158722, + "rewards/margins_min": 0.017928630113601685, + "rewards/margins_std": 0.040714897215366364, + "rewards/rejected": -0.018354322761297226, + "step": 4130 + }, + { + "epoch": 0.94, + "grad_norm": 0.279296875, + "learning_rate": 6.252352916981923e-09, + "logits/chosen": -1.4670217037200928, + "logits/rejected": -1.2123662233352661, + "logps/chosen": -169.3472442626953, + "logps/rejected": -166.15847778320312, + "loss": 0.6698, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.02530873380601406, + "rewards/margins": 0.046012792736291885, + "rewards/margins_max": 0.07258578389883041, + "rewards/margins_min": 0.01943979784846306, + "rewards/margins_std": 0.03757988661527634, + "rewards/rejected": -0.020704057067632675, + "step": 4140 + }, + { + "epoch": 0.94, + "grad_norm": 0.4609375, + "learning_rate": 5.821485153660932e-09, + "logits/chosen": -1.2992416620254517, + "logits/rejected": -0.86224764585495, + "logps/chosen": -262.5690612792969, + "logps/rejected": -302.8780517578125, + "loss": 0.6661, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03237038478255272, + "rewards/margins": 0.06790334731340408, + "rewards/margins_max": 0.09890072047710419, + "rewards/margins_min": 0.036905981600284576, + "rewards/margins_std": 0.04383689910173416, + "rewards/rejected": -0.035532962530851364, + "step": 4150 + }, + { + "epoch": 0.94, + "grad_norm": 0.333984375, + "learning_rate": 5.4058235863506106e-09, + "logits/chosen": -1.3638312816619873, + "logits/rejected": -1.1424791812896729, + "logps/chosen": -224.7232666015625, + "logps/rejected": -203.07078552246094, + "loss": 0.6687, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.032146066427230835, + "rewards/margins": 0.05244447663426399, + "rewards/margins_max": 0.07333363592624664, + "rewards/margins_min": 0.03155531361699104, + "rewards/margins_std": 0.029541734606027603, + "rewards/rejected": -0.02029840461909771, + "step": 4160 + }, + { + "epoch": 0.94, + "grad_norm": 0.330078125, + "learning_rate": 5.005394100339371e-09, + "logits/chosen": -1.3726253509521484, + "logits/rejected": -1.253758192062378, + "logps/chosen": -233.70156860351562, + "logps/rejected": -231.6342010498047, + "loss": 0.6678, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.026106491684913635, + "rewards/margins": 0.03978271037340164, + "rewards/margins_max": 0.05634213238954544, + "rewards/margins_min": 0.02322329208254814, + "rewards/margins_std": 0.023418551310896873, + "rewards/rejected": -0.013676215894520283, + "step": 4170 + }, + { + "epoch": 0.94, + "grad_norm": 0.455078125, + "learning_rate": 4.620221632338994e-09, + "logits/chosen": -1.2156215906143188, + "logits/rejected": -1.0154184103012085, + "logps/chosen": -177.11041259765625, + "logps/rejected": -221.79861450195312, + "loss": 0.6626, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03687124326825142, + "rewards/margins": 0.05761794000864029, + "rewards/margins_max": 0.08540613949298859, + "rewards/margins_min": 0.029829740524291992, + "rewards/margins_std": 0.03929844871163368, + "rewards/rejected": -0.02074669674038887, + "step": 4180 + }, + { + "epoch": 0.95, + "grad_norm": 0.4921875, + "learning_rate": 4.2503301689318094e-09, + "logits/chosen": -1.510063886642456, + "logits/rejected": -1.1845905780792236, + "logps/chosen": -285.0707092285156, + "logps/rejected": -222.8107452392578, + "loss": 0.671, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.027932295575737953, + "rewards/margins": 0.04651053622364998, + "rewards/margins_max": 0.07659061253070831, + "rewards/margins_min": 0.016430456191301346, + "rewards/margins_std": 0.042539652436971664, + "rewards/rejected": -0.018578244373202324, + "step": 4190 + }, + { + "epoch": 0.95, + "grad_norm": 0.376953125, + "learning_rate": 3.895742745076869e-09, + "logits/chosen": -1.3405685424804688, + "logits/rejected": -1.0025266408920288, + "logps/chosen": -218.3401641845703, + "logps/rejected": -191.70420837402344, + "loss": 0.6684, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03851836919784546, + "rewards/margins": 0.06199236959218979, + "rewards/margins_max": 0.10131315886974335, + "rewards/margins_min": 0.022671589627861977, + "rewards/margins_std": 0.05560798570513725, + "rewards/rejected": -0.023474005982279778, + "step": 4200 + }, + { + "epoch": 0.95, + "grad_norm": 0.423828125, + "learning_rate": 3.5564814426755073e-09, + "logits/chosen": -1.3935787677764893, + "logits/rejected": -1.158966302871704, + "logps/chosen": -202.38221740722656, + "logps/rejected": -199.8167724609375, + "loss": 0.669, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.026319408789277077, + "rewards/margins": 0.043112870305776596, + "rewards/margins_max": 0.0695946216583252, + "rewards/margins_min": 0.016631122678518295, + "rewards/margins_std": 0.03745085373520851, + "rewards/rejected": -0.01679346337914467, + "step": 4210 + }, + { + "epoch": 0.95, + "grad_norm": 0.328125, + "learning_rate": 3.232567389196139e-09, + "logits/chosen": -1.4851112365722656, + "logits/rejected": -1.1288012266159058, + "logps/chosen": -220.33633422851562, + "logps/rejected": -192.44102478027344, + "loss": 0.6661, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.031008679419755936, + "rewards/margins": 0.04939933493733406, + "rewards/margins_max": 0.07498259842395782, + "rewards/margins_min": 0.0238160602748394, + "rewards/margins_std": 0.03618020936846733, + "rewards/rejected": -0.018390655517578125, + "step": 4220 + }, + { + "epoch": 0.96, + "grad_norm": 0.36328125, + "learning_rate": 2.9240207563586137e-09, + "logits/chosen": -1.367784857749939, + "logits/rejected": -1.1241731643676758, + "logps/chosen": -206.1232452392578, + "logps/rejected": -191.2694091796875, + "loss": 0.6716, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03186849504709244, + "rewards/margins": 0.054981641471385956, + "rewards/margins_max": 0.0860222578048706, + "rewards/margins_min": 0.023941034451127052, + "rewards/margins_std": 0.04389805719256401, + "rewards/rejected": -0.023113155737519264, + "step": 4230 + }, + { + "epoch": 0.96, + "grad_norm": 0.41015625, + "learning_rate": 2.6308607588779173e-09, + "logits/chosen": -1.3376381397247314, + "logits/rejected": -1.0197269916534424, + "logps/chosen": -244.1377716064453, + "logps/rejected": -232.1446533203125, + "loss": 0.6691, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.032451361417770386, + "rewards/margins": 0.05189868062734604, + "rewards/margins_max": 0.06959724426269531, + "rewards/margins_min": 0.03420013189315796, + "rewards/margins_std": 0.025029540061950684, + "rewards/rejected": -0.01944732666015625, + "step": 4240 + }, + { + "epoch": 0.96, + "grad_norm": 0.33984375, + "learning_rate": 2.353105653267712e-09, + "logits/chosen": -1.4159284830093384, + "logits/rejected": -0.9598219990730286, + "logps/chosen": -205.52749633789062, + "logps/rejected": -196.06875610351562, + "loss": 0.6673, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.030434027314186096, + "rewards/margins": 0.05207052826881409, + "rewards/margins_max": 0.0799938291311264, + "rewards/margins_min": 0.024147219955921173, + "rewards/margins_std": 0.039489515125751495, + "rewards/rejected": -0.02163649909198284, + "step": 4250 + }, + { + "epoch": 0.96, + "grad_norm": 0.421875, + "learning_rate": 2.0907727367033002e-09, + "logits/chosen": -1.4469044208526611, + "logits/rejected": -1.0019524097442627, + "logps/chosen": -358.66864013671875, + "logps/rejected": -218.69802856445312, + "loss": 0.6737, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.03114650584757328, + "rewards/margins": 0.03943406790494919, + "rewards/margins_max": 0.0645378977060318, + "rewards/margins_min": 0.01433024276047945, + "rewards/margins_std": 0.035502173006534576, + "rewards/rejected": -0.008287565782666206, + "step": 4260 + }, + { + "epoch": 0.97, + "grad_norm": 0.3828125, + "learning_rate": 1.8438783459444608e-09, + "logits/chosen": -1.4788901805877686, + "logits/rejected": -1.2845779657363892, + "logps/chosen": -227.7974090576172, + "logps/rejected": -188.720947265625, + "loss": 0.6728, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.031458381563425064, + "rewards/margins": 0.02985537424683571, + "rewards/margins_max": 0.04987839609384537, + "rewards/margins_min": 0.009832354262471199, + "rewards/margins_std": 0.02831682562828064, + "rewards/rejected": 0.0016030061524361372, + "step": 4270 + }, + { + "epoch": 0.97, + "grad_norm": 0.302734375, + "learning_rate": 1.612437856318205e-09, + "logits/chosen": -1.1608082056045532, + "logits/rejected": -0.9398641586303711, + "logps/chosen": -178.90191650390625, + "logps/rejected": -193.39801025390625, + "loss": 0.673, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.028339451178908348, + "rewards/margins": 0.0333985760807991, + "rewards/margins_max": 0.053435277193784714, + "rewards/margins_min": 0.013361875899136066, + "rewards/margins_std": 0.0283361729234457, + "rewards/rejected": -0.005059124436229467, + "step": 4280 + }, + { + "epoch": 0.97, + "grad_norm": 0.37109375, + "learning_rate": 1.396465680761072e-09, + "logits/chosen": -1.4545872211456299, + "logits/rejected": -1.1205055713653564, + "logps/chosen": -231.1746826171875, + "logps/rejected": -196.8207244873047, + "loss": 0.6705, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03223685547709465, + "rewards/margins": 0.05452718213200569, + "rewards/margins_max": 0.08367858827114105, + "rewards/margins_min": 0.025375764816999435, + "rewards/margins_std": 0.04122632369399071, + "rewards/rejected": -0.022290324792265892, + "step": 4290 + }, + { + "epoch": 0.97, + "grad_norm": 0.486328125, + "learning_rate": 1.195975268921734e-09, + "logits/chosen": -1.5477195978164673, + "logits/rejected": -1.216968297958374, + "logps/chosen": -286.03790283203125, + "logps/rejected": -206.74252319335938, + "loss": 0.6714, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.03005763329565525, + "rewards/margins": 0.037625450640916824, + "rewards/margins_max": 0.06239453703165054, + "rewards/margins_min": 0.012856366112828255, + "rewards/margins_std": 0.03502877429127693, + "rewards/rejected": -0.007567819207906723, + "step": 4300 + }, + { + "epoch": 0.97, + "grad_norm": 0.333984375, + "learning_rate": 1.0109791063233897e-09, + "logits/chosen": -1.5190095901489258, + "logits/rejected": -1.2735944986343384, + "logps/chosen": -171.55831909179688, + "logps/rejected": -189.24229431152344, + "loss": 0.669, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03234173357486725, + "rewards/margins": 0.05007977411150932, + "rewards/margins_max": 0.07897655665874481, + "rewards/margins_min": 0.02118297666311264, + "rewards/margins_std": 0.04086623713374138, + "rewards/rejected": -0.017738038673996925, + "step": 4310 + }, + { + "epoch": 0.98, + "grad_norm": 0.373046875, + "learning_rate": 8.414887135860526e-10, + "logits/chosen": -1.2920136451721191, + "logits/rejected": -1.0507843494415283, + "logps/chosen": -242.9005889892578, + "logps/rejected": -246.7140655517578, + "loss": 0.6705, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03298826888203621, + "rewards/margins": 0.059736646711826324, + "rewards/margins_max": 0.09354208409786224, + "rewards/margins_min": 0.0259312242269516, + "rewards/margins_std": 0.04780808836221695, + "rewards/rejected": -0.026748377829790115, + "step": 4320 + }, + { + "epoch": 0.98, + "grad_norm": 0.34765625, + "learning_rate": 6.875146457094583e-10, + "logits/chosen": -1.4780220985412598, + "logits/rejected": -1.0817339420318604, + "logps/chosen": -269.909912109375, + "logps/rejected": -272.59796142578125, + "loss": 0.667, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.03321417048573494, + "rewards/margins": 0.05451669171452522, + "rewards/margins_max": 0.08729592710733414, + "rewards/margins_min": 0.02173745259642601, + "rewards/margins_std": 0.04635683819651604, + "rewards/rejected": -0.021302521228790283, + "step": 4330 + }, + { + "epoch": 0.98, + "grad_norm": 0.326171875, + "learning_rate": 5.490664914153676e-10, + "logits/chosen": -1.2779922485351562, + "logits/rejected": -1.056840419769287, + "logps/chosen": -176.34756469726562, + "logps/rejected": -225.94140625, + "loss": 0.6698, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.035486023873090744, + "rewards/margins": 0.05061611533164978, + "rewards/margins_max": 0.07715443521738052, + "rewards/margins_min": 0.024077793583273888, + "rewards/margins_std": 0.03753085806965828, + "rewards/rejected": -0.015130092389881611, + "step": 4340 + }, + { + "epoch": 0.98, + "grad_norm": 0.392578125, + "learning_rate": 4.261528725507113e-10, + "logits/chosen": -1.4777419567108154, + "logits/rejected": -1.3885948657989502, + "logps/chosen": -203.3699493408203, + "logps/rejected": -191.15402221679688, + "loss": 0.6713, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.0292215533554554, + "rewards/margins": 0.04114232212305069, + "rewards/margins_max": 0.06767222285270691, + "rewards/margins_min": 0.014612428843975067, + "rewards/margins_std": 0.0375189371407032, + "rewards/rejected": -0.01192077063024044, + "step": 4350 + }, + { + "epoch": 0.99, + "grad_norm": 0.326171875, + "learning_rate": 3.187814435505198e-10, + "logits/chosen": -1.5060135126113892, + "logits/rejected": -1.3325016498565674, + "logps/chosen": -190.29415893554688, + "logps/rejected": -219.4818878173828, + "loss": 0.6666, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.028621181845664978, + "rewards/margins": 0.04612215235829353, + "rewards/margins_max": 0.06957665830850601, + "rewards/margins_min": 0.022667638957500458, + "rewards/margins_std": 0.033169690519571304, + "rewards/rejected": -0.017500972375273705, + "step": 4360 + }, + { + "epoch": 0.99, + "grad_norm": 0.400390625, + "learning_rate": 2.269588909613318e-10, + "logits/chosen": -1.409186601638794, + "logits/rejected": -1.2228713035583496, + "logps/chosen": -173.80068969726562, + "logps/rejected": -194.56361389160156, + "loss": 0.6723, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.023956812918186188, + "rewards/margins": 0.03811182081699371, + "rewards/margins_max": 0.05768042802810669, + "rewards/margins_min": 0.018543217331171036, + "rewards/margins_std": 0.027674183249473572, + "rewards/rejected": -0.014155007898807526, + "step": 4370 + }, + { + "epoch": 0.99, + "grad_norm": 0.365234375, + "learning_rate": 1.5069093302469415e-10, + "logits/chosen": -1.190978765487671, + "logits/rejected": -1.0352693796157837, + "logps/chosen": -224.6339874267578, + "logps/rejected": -217.9106903076172, + "loss": 0.6682, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.030144354328513145, + "rewards/margins": 0.05521010607481003, + "rewards/margins_max": 0.08944573253393173, + "rewards/margins_min": 0.020974475890398026, + "rewards/margins_std": 0.048416487872600555, + "rewards/rejected": -0.02506575547158718, + "step": 4380 + }, + { + "epoch": 0.99, + "grad_norm": 0.466796875, + "learning_rate": 8.99823193210858e-11, + "logits/chosen": -1.1929359436035156, + "logits/rejected": -0.9103671908378601, + "logps/chosen": -210.3509521484375, + "logps/rejected": -232.4520721435547, + "loss": 0.6695, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.022757183760404587, + "rewards/margins": 0.05178738385438919, + "rewards/margins_max": 0.07935634255409241, + "rewards/margins_min": 0.024218428879976273, + "rewards/margins_std": 0.0389883928000927, + "rewards/rejected": -0.029030198231339455, + "step": 4390 + }, + { + "epoch": 0.99, + "grad_norm": 0.34765625, + "learning_rate": 4.483683047426523e-11, + "logits/chosen": -1.3004522323608398, + "logits/rejected": -1.0778028964996338, + "logps/chosen": -252.458251953125, + "logps/rejected": -205.4815216064453, + "loss": 0.6689, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.019098959863185883, + "rewards/margins": 0.04386705532670021, + "rewards/margins_max": 0.0736868605017662, + "rewards/margins_min": 0.01404724083840847, + "rewards/margins_std": 0.042171578854322433, + "rewards/rejected": -0.024768095463514328, + "step": 4400 + }, + { + "epoch": 1.0, + "grad_norm": 0.396484375, + "learning_rate": 1.5257277915653456e-11, + "logits/chosen": -1.5603951215744019, + "logits/rejected": -1.2852790355682373, + "logps/chosen": -157.50111389160156, + "logps/rejected": -165.79519653320312, + "loss": 0.6705, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.03554811701178551, + "rewards/margins": 0.04409513995051384, + "rewards/margins_max": 0.07175298035144806, + "rewards/margins_min": 0.01643729954957962, + "rewards/margins_std": 0.0391140915453434, + "rewards/rejected": -0.008547024801373482, + "step": 4410 + }, + { + "epoch": 1.0, + "grad_norm": 0.640625, + "learning_rate": 1.2455037093073161e-12, + "logits/chosen": -1.4276247024536133, + "logits/rejected": -1.0764439105987549, + "logps/chosen": -213.16421508789062, + "logps/rejected": -192.83340454101562, + "loss": 0.6659, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.028442760929465294, + "rewards/margins": 0.054314613342285156, + "rewards/margins_max": 0.08231306821107864, + "rewards/margins_min": 0.02631615474820137, + "rewards/margins_std": 0.039595797657966614, + "rewards/rejected": -0.025871848687529564, + "step": 4420 + }, + { + "epoch": 1.0, + "eval_logits/chosen": -1.0172001123428345, + "eval_logits/rejected": -0.8949137330055237, + "eval_logps/chosen": -322.9526672363281, + "eval_logps/rejected": -314.051513671875, + "eval_loss": 0.6914807558059692, + "eval_rewards/accuracies": 0.5509999990463257, + "eval_rewards/chosen": 0.020110901445150375, + "eval_rewards/margins": 0.003983891103416681, + "eval_rewards/margins_max": 0.06447038054466248, + "eval_rewards/margins_min": -0.059528909623622894, + "eval_rewards/margins_std": 0.04112740606069565, + "eval_rewards/rejected": 0.01612701080739498, + "eval_runtime": 1445.3459, + "eval_samples_per_second": 2.768, + "eval_steps_per_second": 0.173, + "step": 4424 + }, + { + "epoch": 1.0, + "step": 4424, + "total_flos": 0.0, + "train_loss": 0.6737086928666823, + "train_runtime": 34589.1987, + "train_samples_per_second": 1.023, + "train_steps_per_second": 0.128 + } + ], + "logging_steps": 10, + "max_steps": 4424, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}