{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 800000000,
  "global_step": 835,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "grad_norm": 8.9375,
      "learning_rate": 5.952380952380953e-08,
      "logits/chosen": -3.4845848083496094,
      "logits/rejected": -3.85036301612854,
      "logps/chosen": -306.50885009765625,
      "logps/rejected": -197.74395751953125,
      "loss": 0.6931,
      "rewards/accuracies": 0.0,
      "rewards/chosen": 0.0,
      "rewards/diff": -0.625,
      "rewards/diff_abs": 0.625,
      "rewards/rejected": 0.0,
      "rewards/student_margin": 0.0,
      "rewards/teacher_margin": 0.625,
      "step": 1
    },
    {
      "epoch": 0.01,
      "grad_norm": 8.8125,
      "learning_rate": 5.952380952380953e-07,
      "logits/chosen": -3.454127788543701,
      "logits/rejected": -3.5237815380096436,
      "logps/chosen": -201.42767333984375,
      "logps/rejected": -183.9016571044922,
      "loss": 0.7039,
      "rewards/accuracies": 0.4444444477558136,
      "rewards/chosen": 0.005284797865897417,
      "rewards/diff": -0.6895497441291809,
      "rewards/diff_abs": 0.7025125026702881,
      "rewards/rejected": 0.01312162820249796,
      "rewards/student_margin": -0.007836826145648956,
      "rewards/teacher_margin": 0.6817129254341125,
      "step": 10
    },
    {
      "epoch": 0.02,
      "grad_norm": 8.5,
      "learning_rate": 1.1904761904761906e-06,
      "logits/chosen": -3.5940723419189453,
      "logits/rejected": -3.5770275592803955,
      "logps/chosen": -218.02499389648438,
      "logps/rejected": -209.6902313232422,
      "loss": 0.7101,
      "rewards/accuracies": 0.4333333373069763,
      "rewards/chosen": -0.025884132832288742,
      "rewards/diff": -0.9974073171615601,
      "rewards/diff_abs": 1.025899887084961,
      "rewards/rejected": -0.01753927394747734,
      "rewards/student_margin": -0.00834486074745655,
      "rewards/teacher_margin": 0.9890626072883606,
      "step": 20
    },
    {
      "epoch": 0.04,
      "grad_norm": 7.9375,
      "learning_rate": 1.7857142857142859e-06,
      "logits/chosen": -3.4890503883361816,
      "logits/rejected": -3.6032581329345703,
      "logps/chosen": -259.52838134765625,
      "logps/rejected": -200.54518127441406,
      "loss": 0.6961,
      "rewards/accuracies": 0.5666666626930237,
      "rewards/chosen": 0.11190159618854523,
      "rewards/diff": -0.870490550994873,
      "rewards/diff_abs": 0.910025417804718,
      "rewards/rejected": 0.07041291892528534,
      "rewards/student_margin": 0.04148866608738899,
      "rewards/teacher_margin": 0.911979079246521,
      "step": 30
    },
    {
      "epoch": 0.05,
      "grad_norm": 7.8125,
      "learning_rate": 2.380952380952381e-06,
      "logits/chosen": -3.4495646953582764,
      "logits/rejected": -3.5306625366210938,
      "logps/chosen": -296.196044921875,
      "logps/rejected": -205.72494506835938,
      "loss": 0.6763,
      "rewards/accuracies": 0.5333333611488342,
      "rewards/chosen": 0.27481913566589355,
      "rewards/diff": -0.928841233253479,
      "rewards/diff_abs": 0.9340232610702515,
      "rewards/rejected": 0.1948060244321823,
      "rewards/student_margin": 0.08001308888196945,
      "rewards/teacher_margin": 1.0088541507720947,
      "step": 40
    },
    {
      "epoch": 0.06,
      "grad_norm": 7.46875,
      "learning_rate": 2.9761904761904763e-06,
      "logits/chosen": -3.6242897510528564,
      "logits/rejected": -3.6223366260528564,
      "logps/chosen": -232.1892852783203,
      "logps/rejected": -218.8447265625,
      "loss": 0.6524,
      "rewards/accuracies": 0.6333333253860474,
      "rewards/chosen": 0.5324563980102539,
      "rewards/diff": -1.090423583984375,
      "rewards/diff_abs": 1.1463050842285156,
      "rewards/rejected": 0.3765257000923157,
      "rewards/student_margin": 0.15593069791793823,
      "rewards/teacher_margin": 1.2463542222976685,
      "step": 50
    },
    {
      "epoch": 0.07,
      "grad_norm": 7.5,
      "learning_rate": 3.5714285714285718e-06,
      "logits/chosen": -3.52375864982605,
      "logits/rejected": -3.5178802013397217,
      "logps/chosen": -278.49578857421875,
      "logps/rejected": -227.9744110107422,
      "loss": 0.6501,
      "rewards/accuracies": 0.6999999284744263,
      "rewards/chosen": 0.9247525930404663,
      "rewards/diff": -0.6484954953193665,
      "rewards/diff_abs": 0.8814946413040161,
      "rewards/rejected": 0.6779355406761169,
      "rewards/student_margin": 0.24681702256202698,
      "rewards/teacher_margin": 0.895312488079071,
      "step": 60
    },
    {
      "epoch": 0.08,
      "grad_norm": 7.28125,
      "learning_rate": 4.166666666666667e-06,
      "logits/chosen": -3.57179594039917,
      "logits/rejected": -3.573483943939209,
      "logps/chosen": -299.92742919921875,
      "logps/rejected": -295.48846435546875,
      "loss": 0.6298,
      "rewards/accuracies": 0.6666666269302368,
      "rewards/chosen": 1.0992387533187866,
      "rewards/diff": -0.5216845273971558,
      "rewards/diff_abs": 0.7238657474517822,
      "rewards/rejected": 0.8657148480415344,
      "rewards/student_margin": 0.23352384567260742,
      "rewards/teacher_margin": 0.7552083730697632,
      "step": 70
    },
    {
      "epoch": 0.1,
      "grad_norm": 7.15625,
      "learning_rate": 4.761904761904762e-06,
      "logits/chosen": -3.3535995483398438,
      "logits/rejected": -3.4229187965393066,
      "logps/chosen": -306.759521484375,
      "logps/rejected": -193.92160034179688,
      "loss": 0.6005,
      "rewards/accuracies": 0.8333333730697632,
      "rewards/chosen": 1.2742681503295898,
      "rewards/diff": -0.2789258360862732,
      "rewards/diff_abs": 0.8034403920173645,
      "rewards/rejected": 0.5938189625740051,
      "rewards/student_margin": 0.6804491281509399,
      "rewards/teacher_margin": 0.9593750238418579,
      "step": 80
    },
    {
      "epoch": 0.11,
      "grad_norm": 6.3125,
      "learning_rate": 4.9992125742993825e-06,
      "logits/chosen": -3.5169739723205566,
      "logits/rejected": -3.478895664215088,
      "logps/chosen": -305.4494323730469,
      "logps/rejected": -259.5570373535156,
      "loss": 0.5942,
      "rewards/accuracies": 0.7333333492279053,
      "rewards/chosen": 1.40164053440094,
      "rewards/diff": -0.47005853056907654,
      "rewards/diff_abs": 0.7208673357963562,
      "rewards/rejected": 1.1060739755630493,
      "rewards/student_margin": 0.29556649923324585,
      "rewards/teacher_margin": 0.765625,
      "step": 90
    },
    {
      "epoch": 0.12,
      "grad_norm": 7.0625,
      "learning_rate": 4.994402324561469e-06,
      "logits/chosen": -3.475271701812744,
      "logits/rejected": -3.4696757793426514,
      "logps/chosen": -290.073974609375,
      "logps/rejected": -212.7600860595703,
      "loss": 0.6009,
      "rewards/accuracies": 0.800000011920929,
      "rewards/chosen": 1.396482229232788,
      "rewards/diff": -0.12815351784229279,
      "rewards/diff_abs": 0.5424238443374634,
      "rewards/rejected": 0.6892191171646118,
      "rewards/student_margin": 0.7072631120681763,
      "rewards/teacher_margin": 0.8354166746139526,
      "step": 100
    },
    {
      "epoch": 0.13,
      "grad_norm": 7.03125,
      "learning_rate": 4.985227689958313e-06,
      "logits/chosen": -3.4492225646972656,
      "logits/rejected": -3.490285873413086,
      "logps/chosen": -309.30743408203125,
      "logps/rejected": -202.38356018066406,
      "loss": 0.5763,
      "rewards/accuracies": 0.6666666269302368,
      "rewards/chosen": 1.2701516151428223,
      "rewards/diff": -0.4000614583492279,
      "rewards/diff_abs": 0.8188215494155884,
      "rewards/rejected": 0.8035463094711304,
      "rewards/student_margin": 0.46660518646240234,
      "rewards/teacher_margin": 0.8666666746139526,
      "step": 110
    },
    {
      "epoch": 0.14,
      "grad_norm": 6.625,
      "learning_rate": 4.97170472308737e-06,
      "logits/chosen": -3.537369966506958,
      "logits/rejected": -3.5341758728027344,
      "logps/chosen": -238.89035034179688,
      "logps/rejected": -219.8264617919922,
      "loss": 0.5923,
      "rewards/accuracies": 0.6333333253860474,
      "rewards/chosen": 1.1539257764816284,
      "rewards/diff": -0.407745361328125,
      "rewards/diff_abs": 0.8213578462600708,
      "rewards/rejected": 0.6814627051353455,
      "rewards/student_margin": 0.47246304154396057,
      "rewards/teacher_margin": 0.8802083134651184,
      "step": 120
    },
    {
      "epoch": 0.16,
      "grad_norm": 6.3125,
      "learning_rate": 4.953857084699501e-06,
      "logits/chosen": -3.3898227214813232,
      "logits/rejected": -3.445030689239502,
      "logps/chosen": -237.6462860107422,
      "logps/rejected": -189.9452667236328,
      "loss": 0.5986,
      "rewards/accuracies": 0.7666666507720947,
      "rewards/chosen": 1.2968804836273193,
      "rewards/diff": -0.1380801498889923,
      "rewards/diff_abs": 0.6938132643699646,
      "rewards/rejected": 0.6688148379325867,
      "rewards/student_margin": 0.6280657649040222,
      "rewards/teacher_margin": 0.7661458253860474,
      "step": 130
    },
    {
      "epoch": 0.17,
      "grad_norm": 6.9375,
      "learning_rate": 4.931716002300424e-06,
      "logits/chosen": -3.4307568073272705,
      "logits/rejected": -3.428516387939453,
      "logps/chosen": -303.8690490722656,
      "logps/rejected": -268.0577087402344,
      "loss": 0.5674,
      "rewards/accuracies": 0.76666659116745,
      "rewards/chosen": 1.502533197402954,
      "rewards/diff": -0.11084864288568497,
      "rewards/diff_abs": 0.7653725743293762,
      "rewards/rejected": 0.7842152714729309,
      "rewards/student_margin": 0.7183180451393127,
      "rewards/teacher_margin": 0.8291667103767395,
      "step": 140
    },
    {
      "epoch": 0.18,
      "grad_norm": 5.90625,
      "learning_rate": 4.905320215512843e-06,
      "logits/chosen": -3.3582215309143066,
      "logits/rejected": -3.445798397064209,
      "logps/chosen": -272.33465576171875,
      "logps/rejected": -241.3258819580078,
      "loss": 0.5839,
      "rewards/accuracies": 0.6666666865348816,
      "rewards/chosen": 0.9148601293563843,
      "rewards/diff": -0.40072593092918396,
      "rewards/diff_abs": 0.7023404240608215,
      "rewards/rejected": 0.5593360662460327,
      "rewards/student_margin": 0.35552406311035156,
      "rewards/teacher_margin": 0.7562500238418579,
      "step": 150
    },
    {
      "epoch": 0.19,
      "grad_norm": 6.5625,
      "learning_rate": 4.874715908294827e-06,
      "logits/chosen": -3.4379913806915283,
      "logits/rejected": -3.411599636077881,
      "logps/chosen": -235.85824584960938,
      "logps/rejected": -200.01751708984375,
      "loss": 0.5673,
      "rewards/accuracies": 0.7333332896232605,
      "rewards/chosen": 0.91960209608078,
      "rewards/diff": -0.3414815068244934,
      "rewards/diff_abs": 0.8401057124137878,
      "rewards/rejected": 0.38087528944015503,
      "rewards/student_margin": 0.5387269258499146,
      "rewards/teacher_margin": 0.8802083730697632,
      "step": 160
    },
    {
      "epoch": 0.2,
      "grad_norm": 6.875,
      "learning_rate": 4.839956628133049e-06,
      "logits/chosen": -3.3944404125213623,
      "logits/rejected": -3.4562854766845703,
      "logps/chosen": -236.4658203125,
      "logps/rejected": -207.5730438232422,
      "loss": 0.5312,
      "rewards/accuracies": 0.7333333492279053,
      "rewards/chosen": 1.1229525804519653,
      "rewards/diff": -0.3498944342136383,
      "rewards/diff_abs": 0.8671269416809082,
      "rewards/rejected": 0.43118032813072205,
      "rewards/student_margin": 0.6917722821235657,
      "rewards/teacher_margin": 1.0416667461395264,
      "step": 170
    },
    {
      "epoch": 0.22,
      "grad_norm": 6.8125,
      "learning_rate": 4.801103192352272e-06,
      "logits/chosen": -3.5573208332061768,
      "logits/rejected": -3.619119167327881,
      "logps/chosen": -342.3301086425781,
      "logps/rejected": -242.18148803710938,
      "loss": 0.5428,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": 1.7506067752838135,
      "rewards/diff": -0.05490243434906006,
      "rewards/diff_abs": 1.2669219970703125,
      "rewards/rejected": 0.852384090423584,
      "rewards/student_margin": 0.8982225656509399,
      "rewards/teacher_margin": 0.9531251192092896,
      "step": 180
    },
    {
      "epoch": 0.23,
      "grad_norm": 6.75,
      "learning_rate": 4.758223581705006e-06,
      "logits/chosen": -3.493630886077881,
      "logits/rejected": -3.531799793243408,
      "logps/chosen": -242.2713623046875,
      "logps/rejected": -195.7411346435547,
      "loss": 0.5605,
      "rewards/accuracies": 0.6333333253860474,
      "rewards/chosen": 1.3145856857299805,
      "rewards/diff": -0.18605072796344757,
      "rewards/diff_abs": 0.9721413850784302,
      "rewards/rejected": 0.7516780495643616,
      "rewards/student_margin": 0.5629075765609741,
      "rewards/teacher_margin": 0.7489583492279053,
      "step": 190
    },
    {
      "epoch": 0.24,
      "grad_norm": 7.375,
      "learning_rate": 4.711392821427515e-06,
      "logits/chosen": -3.5924346446990967,
      "logits/rejected": -3.6110892295837402,
      "logps/chosen": -231.8784942626953,
      "logps/rejected": -160.18458557128906,
      "loss": 0.5528,
      "rewards/accuracies": 0.76666659116745,
      "rewards/chosen": 1.1364099979400635,
      "rewards/diff": -0.20100148022174835,
      "rewards/diff_abs": 0.8547786474227905,
      "rewards/rejected": 0.13949476182460785,
      "rewards/student_margin": 0.9969152212142944,
      "rewards/teacher_margin": 1.1979167461395264,
      "step": 200
    },
    {
      "epoch": 0.25,
      "grad_norm": 6.09375,
      "learning_rate": 4.6606928499702905e-06,
      "logits/chosen": -3.583310604095459,
      "logits/rejected": -3.646390914916992,
      "logps/chosen": -236.2954864501953,
      "logps/rejected": -226.30050659179688,
      "loss": 0.5455,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": 1.1413193941116333,
      "rewards/diff": -0.6167432069778442,
      "rewards/diff_abs": 0.9270604252815247,
      "rewards/rejected": 0.8591042757034302,
      "rewards/student_margin": 0.2822151482105255,
      "rewards/teacher_margin": 0.8989583849906921,
      "step": 210
    },
    {
      "epoch": 0.26,
      "grad_norm": 6.0625,
      "learning_rate": 4.606212375632682e-06,
      "logits/chosen": -3.3313984870910645,
      "logits/rejected": -3.4001998901367188,
      "logps/chosen": -241.50430297851562,
      "logps/rejected": -185.7382049560547,
      "loss": 0.5455,
      "rewards/accuracies": 0.76666659116745,
      "rewards/chosen": 1.245307207107544,
      "rewards/diff": -0.19214758276939392,
      "rewards/diff_abs": 0.9847003817558289,
      "rewards/rejected": 0.4134964942932129,
      "rewards/student_margin": 0.8318107724189758,
      "rewards/teacher_margin": 1.023958444595337,
      "step": 220
    },
    {
      "epoch": 0.28,
      "grad_norm": 7.59375,
      "learning_rate": 4.5480467213524935e-06,
      "logits/chosen": -3.4316277503967285,
      "logits/rejected": -3.4833552837371826,
      "logps/chosen": -258.82781982421875,
      "logps/rejected": -248.5332489013672,
      "loss": 0.5478,
      "rewards/accuracies": 0.7333332896232605,
      "rewards/chosen": 1.38356614112854,
      "rewards/diff": -0.1715225726366043,
      "rewards/diff_abs": 0.7107259631156921,
      "rewards/rejected": 0.7259219288825989,
      "rewards/student_margin": 0.6576440930366516,
      "rewards/teacher_margin": 0.8291667699813843,
      "step": 230
    },
    {
      "epoch": 0.29,
      "grad_norm": 6.40625,
      "learning_rate": 4.4862976579221605e-06,
      "logits/chosen": -3.3932158946990967,
      "logits/rejected": -3.4250049591064453,
      "logps/chosen": -303.9991149902344,
      "logps/rejected": -221.3593292236328,
      "loss": 0.5413,
      "rewards/accuracies": 0.7666666507720947,
      "rewards/chosen": 1.6610606908798218,
      "rewards/diff": -0.16523823142051697,
      "rewards/diff_abs": 0.8983039855957031,
      "rewards/rejected": 0.6669239401817322,
      "rewards/student_margin": 0.9941369295120239,
      "rewards/teacher_margin": 1.1593749523162842,
      "step": 240
    },
    {
      "epoch": 0.3,
      "grad_norm": 6.125,
      "learning_rate": 4.421073225923276e-06,
      "logits/chosen": -3.4080328941345215,
      "logits/rejected": -3.545672655105591,
      "logps/chosen": -302.5151062011719,
      "logps/rejected": -223.77474975585938,
      "loss": 0.5379,
      "rewards/accuracies": 0.6666666269302368,
      "rewards/chosen": 1.6596599817276,
      "rewards/diff": -0.011551931500434875,
      "rewards/diff_abs": 0.9808802604675293,
      "rewards/rejected": 0.7274617552757263,
      "rewards/student_margin": 0.9321980476379395,
      "rewards/teacher_margin": 0.9437500238418579,
      "step": 250
    },
    {
      "epoch": 0.31,
      "grad_norm": 7.28125,
      "learning_rate": 4.3524875466910634e-06,
      "logits/chosen": -3.377882719039917,
      "logits/rejected": -3.380521059036255,
      "logps/chosen": -247.70703125,
      "logps/rejected": -240.69363403320312,
      "loss": 0.5479,
      "rewards/accuracies": 0.5666667222976685,
      "rewards/chosen": 0.9718164205551147,
      "rewards/diff": -0.17575177550315857,
      "rewards/diff_abs": 0.8040043711662292,
      "rewards/rejected": 0.5954850316047668,
      "rewards/student_margin": 0.3763315677642822,
      "rewards/teacher_margin": 0.5520833730697632,
      "step": 260
    },
    {
      "epoch": 0.32,
      "grad_norm": 6.4375,
      "learning_rate": 4.280660622639513e-06,
      "logits/chosen": -3.5067367553710938,
      "logits/rejected": -3.5205013751983643,
      "logps/chosen": -237.2410125732422,
      "logps/rejected": -190.3438720703125,
      "loss": 0.5352,
      "rewards/accuracies": 0.7333333492279053,
      "rewards/chosen": 1.3214976787567139,
      "rewards/diff": -0.030149614438414574,
      "rewards/diff_abs": 0.9027697443962097,
      "rewards/rejected": 0.4870639443397522,
      "rewards/student_margin": 0.8344337344169617,
      "rewards/teacher_margin": 0.8645833730697632,
      "step": 270
    },
    {
      "epoch": 0.34,
      "grad_norm": 6.0625,
      "learning_rate": 4.205718127296574e-06,
      "logits/chosen": -3.5430946350097656,
      "logits/rejected": -3.5217278003692627,
      "logps/chosen": -239.96188354492188,
      "logps/rejected": -210.2650604248047,
      "loss": 0.5277,
      "rewards/accuracies": 0.7333332896232605,
      "rewards/chosen": 1.366132140159607,
      "rewards/diff": -0.2834340035915375,
      "rewards/diff_abs": 1.1607670783996582,
      "rewards/rejected": 0.7917537689208984,
      "rewards/student_margin": 0.5743785500526428,
      "rewards/teacher_margin": 0.8578125238418579,
      "step": 280
    },
    {
      "epoch": 0.35,
      "grad_norm": 6.90625,
      "learning_rate": 4.127791185416747e-06,
      "logits/chosen": -3.410996675491333,
      "logits/rejected": -3.428239345550537,
      "logps/chosen": -218.684326171875,
      "logps/rejected": -173.12939453125,
      "loss": 0.5492,
      "rewards/accuracies": 0.7000000476837158,
      "rewards/chosen": 1.1556932926177979,
      "rewards/diff": -0.2741561830043793,
      "rewards/diff_abs": 0.9800466299057007,
      "rewards/rejected": 0.501724362373352,
      "rewards/student_margin": 0.6539688110351562,
      "rewards/teacher_margin": 0.9281250834465027,
      "step": 290
    },
    {
      "epoch": 0.36,
      "grad_norm": 5.6875,
      "learning_rate": 4.047016143555834e-06,
      "logits/chosen": -3.4146499633789062,
      "logits/rejected": -3.4334769248962402,
      "logps/chosen": -246.2406005859375,
      "logps/rejected": -207.5984344482422,
      "loss": 0.5396,
      "rewards/accuracies": 0.7333332896232605,
      "rewards/chosen": 1.4659597873687744,
      "rewards/diff": -0.0349099263548851,
      "rewards/diff_abs": 0.7459529042243958,
      "rewards/rejected": 0.5878490209579468,
      "rewards/student_margin": 0.8781110048294067,
      "rewards/teacher_margin": 0.91302090883255,
      "step": 300
    },
    {
      "epoch": 0.37,
      "grad_norm": 6.75,
      "learning_rate": 3.9635343315092374e-06,
      "logits/chosen": -3.3409626483917236,
      "logits/rejected": -3.4818501472473145,
      "logps/chosen": -242.3018341064453,
      "logps/rejected": -209.86740112304688,
      "loss": 0.5499,
      "rewards/accuracies": 0.800000011920929,
      "rewards/chosen": 1.246741533279419,
      "rewards/diff": -0.13802729547023773,
      "rewards/diff_abs": 0.9692344665527344,
      "rewards/rejected": 0.40664371848106384,
      "rewards/student_margin": 0.8400977253913879,
      "rewards/teacher_margin": 0.9781249165534973,
      "step": 310
    },
    {
      "epoch": 0.38,
      "grad_norm": 6.78125,
      "learning_rate": 3.877491815031241e-06,
      "logits/chosen": -3.50838041305542,
      "logits/rejected": -3.6322741508483887,
      "logps/chosen": -257.099609375,
      "logps/rejected": -179.91046142578125,
      "loss": 0.526,
      "rewards/accuracies": 0.9666666984558105,
      "rewards/chosen": 1.3539568185806274,
      "rewards/diff": 0.1423492729663849,
      "rewards/diff_abs": 0.7491869926452637,
      "rewards/rejected": 0.354315847158432,
      "rewards/student_margin": 0.999640941619873,
      "rewards/teacher_margin": 0.8572916984558105,
      "step": 320
    },
    {
      "epoch": 0.4,
      "grad_norm": 6.4375,
      "learning_rate": 3.789039140267903e-06,
      "logits/chosen": -3.6152091026306152,
      "logits/rejected": -3.6335906982421875,
      "logps/chosen": -238.04483032226562,
      "logps/rejected": -203.68545532226562,
      "loss": 0.5211,
      "rewards/accuracies": 0.7666667103767395,
      "rewards/chosen": 1.1535335779190063,
      "rewards/diff": -0.22724106907844543,
      "rewards/diff_abs": 0.8418729901313782,
      "rewards/rejected": 0.3599412739276886,
      "rewards/student_margin": 0.7935922741889954,
      "rewards/teacher_margin": 1.0208333730697632,
      "step": 330
    },
    {
      "epoch": 0.41,
      "grad_norm": 6.25,
      "learning_rate": 3.6983310703507475e-06,
      "logits/chosen": -3.474027633666992,
      "logits/rejected": -3.618129253387451,
      "logps/chosen": -314.650390625,
      "logps/rejected": -292.22796630859375,
      "loss": 0.5083,
      "rewards/accuracies": 0.7333333492279053,
      "rewards/chosen": 1.6438385248184204,
      "rewards/diff": -0.008717024698853493,
      "rewards/diff_abs": 0.9126062393188477,
      "rewards/rejected": 0.937972366809845,
      "rewards/student_margin": 0.7058663368225098,
      "rewards/teacher_margin": 0.7145833969116211,
      "step": 340
    },
    {
      "epoch": 0.42,
      "grad_norm": 6.84375,
      "learning_rate": 3.6055263146121062e-06,
      "logits/chosen": -3.4695258140563965,
      "logits/rejected": -3.544586658477783,
      "logps/chosen": -241.87686157226562,
      "logps/rejected": -190.71157836914062,
      "loss": 0.5241,
      "rewards/accuracies": 0.7333333492279053,
      "rewards/chosen": 1.3957545757293701,
      "rewards/diff": -0.020256221294403076,
      "rewards/diff_abs": 1.0595004558563232,
      "rewards/rejected": 0.6003857851028442,
      "rewards/student_margin": 0.7953688502311707,
      "rewards/teacher_margin": 0.815625011920929,
      "step": 350
    },
    {
      "epoch": 0.43,
      "grad_norm": 6.9375,
      "learning_rate": 3.5107872508959144e-06,
      "logits/chosen": -3.5332858562469482,
      "logits/rejected": -3.658419370651245,
      "logps/chosen": -301.9664001464844,
      "logps/rejected": -229.6895294189453,
      "loss": 0.5268,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": 1.3080447912216187,
      "rewards/diff": 0.03206339478492737,
      "rewards/diff_abs": 1.0055023431777954,
      "rewards/rejected": 0.4827522337436676,
      "rewards/student_margin": 0.8252925872802734,
      "rewards/teacher_margin": 0.7932292222976685,
      "step": 360
    },
    {
      "epoch": 0.44,
      "grad_norm": 6.5625,
      "learning_rate": 3.414279641449809e-06,
      "logits/chosen": -3.4194533824920654,
      "logits/rejected": -3.459688901901245,
      "logps/chosen": -293.80865478515625,
      "logps/rejected": -236.77560424804688,
      "loss": 0.512,
      "rewards/accuracies": 0.7666666507720947,
      "rewards/chosen": 1.4415782690048218,
      "rewards/diff": -0.22951290011405945,
      "rewards/diff_abs": 0.9373496174812317,
      "rewards/rejected": 0.7585911750793457,
      "rewards/student_margin": 0.6829870939254761,
      "rewards/teacher_margin": 0.9125000238418579,
      "step": 370
    },
    {
      "epoch": 0.46,
      "grad_norm": 6.125,
      "learning_rate": 3.3161723428956356e-06,
      "logits/chosen": -3.329397201538086,
      "logits/rejected": -3.4820456504821777,
      "logps/chosen": -303.4757385253906,
      "logps/rejected": -242.2552032470703,
      "loss": 0.5127,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": 1.638362169265747,
      "rewards/diff": -0.17444480955600739,
      "rewards/diff_abs": 0.9962782859802246,
      "rewards/rejected": 0.7367652654647827,
      "rewards/student_margin": 0.9015968441963196,
      "rewards/teacher_margin": 1.0760416984558105,
      "step": 380
    },
    {
      "epoch": 0.47,
      "grad_norm": 6.21875,
      "learning_rate": 3.216637010785813e-06,
      "logits/chosen": -3.547212600708008,
      "logits/rejected": -3.542712688446045,
      "logps/chosen": -321.4063720703125,
      "logps/rejected": -284.3674621582031,
      "loss": 0.5144,
      "rewards/accuracies": 0.7666666507720947,
      "rewards/chosen": 1.7464603185653687,
      "rewards/diff": 0.06983740627765656,
      "rewards/diff_abs": 0.938123881816864,
      "rewards/rejected": 0.7797478437423706,
      "rewards/student_margin": 0.9667123556137085,
      "rewards/teacher_margin": 0.8968750238418579,
      "step": 390
    },
    {
      "epoch": 0.48,
      "grad_norm": 7.40625,
      "learning_rate": 3.115847799262494e-06,
      "logits/chosen": -3.4556503295898438,
      "logits/rejected": -3.5828518867492676,
      "logps/chosen": -256.46868896484375,
      "logps/rejected": -220.39010620117188,
      "loss": 0.5092,
      "rewards/accuracies": 0.8333333730697632,
      "rewards/chosen": 1.3820217847824097,
      "rewards/diff": 0.07980125397443771,
      "rewards/diff_abs": 0.8547611236572266,
      "rewards/rejected": 0.42253294587135315,
      "rewards/student_margin": 0.9594887495040894,
      "rewards/teacher_margin": 0.879687488079071,
      "step": 400
    },
    {
      "epoch": 0.49,
      "grad_norm": 6.03125,
      "learning_rate": 3.0139810563450094e-06,
      "logits/chosen": -3.592397689819336,
      "logits/rejected": -3.6688952445983887,
      "logps/chosen": -292.0425720214844,
      "logps/rejected": -234.98208618164062,
      "loss": 0.5161,
      "rewards/accuracies": 0.7999999523162842,
      "rewards/chosen": 1.5078362226486206,
      "rewards/diff": 0.07032543420791626,
      "rewards/diff_abs": 0.7481400370597839,
      "rewards/rejected": 0.6864690184593201,
      "rewards/student_margin": 0.821367084980011,
      "rewards/teacher_margin": 0.7510417103767395,
      "step": 410
    },
    {
      "epoch": 0.5,
      "grad_norm": 5.65625,
      "learning_rate": 2.911215015378752e-06,
      "logits/chosen": -3.552057981491089,
      "logits/rejected": -3.6183040142059326,
      "logps/chosen": -224.40554809570312,
      "logps/rejected": -186.09158325195312,
      "loss": 0.5053,
      "rewards/accuracies": 0.7333332896232605,
      "rewards/chosen": 1.171048641204834,
      "rewards/diff": 0.06066560745239258,
      "rewards/diff_abs": 1.058672547340393,
      "rewards/rejected": 0.3551747798919678,
      "rewards/student_margin": 0.8158739805221558,
      "rewards/teacher_margin": 0.7552083730697632,
      "step": 420
    },
    {
      "epoch": 0.51,
      "grad_norm": 6.1875,
      "learning_rate": 2.8077294831853547e-06,
      "logits/chosen": -3.4315121173858643,
      "logits/rejected": -3.4911365509033203,
      "logps/chosen": -285.84918212890625,
      "logps/rejected": -214.66140747070312,
      "loss": 0.5183,
      "rewards/accuracies": 0.7333332896232605,
      "rewards/chosen": 1.376267671585083,
      "rewards/diff": -0.3253711462020874,
      "rewards/diff_abs": 0.9324856996536255,
      "rewards/rejected": 0.7354929447174072,
      "rewards/student_margin": 0.6407747268676758,
      "rewards/teacher_margin": 0.9661458134651184,
      "step": 430
    },
    {
      "epoch": 0.53,
      "grad_norm": 6.40625,
      "learning_rate": 2.703705525459806e-06,
      "logits/chosen": -3.5061888694763184,
      "logits/rejected": -3.5336086750030518,
      "logps/chosen": -219.6090087890625,
      "logps/rejected": -203.97415161132812,
      "loss": 0.5235,
      "rewards/accuracies": 0.76666659116745,
      "rewards/chosen": 1.4297001361846924,
      "rewards/diff": 0.08995727449655533,
      "rewards/diff_abs": 0.5762092471122742,
      "rewards/rejected": 0.5720344185829163,
      "rewards/student_margin": 0.8576656579971313,
      "rewards/teacher_margin": 0.767708420753479,
      "step": 440
    },
    {
      "epoch": 0.54,
      "grad_norm": 6.78125,
      "learning_rate": 2.599325149964946e-06,
      "logits/chosen": -3.4120395183563232,
      "logits/rejected": -3.5835208892822266,
      "logps/chosen": -336.3391418457031,
      "logps/rejected": -304.1842956542969,
      "loss": 0.5215,
      "rewards/accuracies": 0.7999999523162842,
      "rewards/chosen": 1.8122284412384033,
      "rewards/diff": -0.12476543337106705,
      "rewards/diff_abs": 0.7393094897270203,
      "rewards/rejected": 1.2078273296356201,
      "rewards/student_margin": 0.604401171207428,
      "rewards/teacher_margin": 0.7291667461395264,
      "step": 450
    },
    {
      "epoch": 0.55,
      "grad_norm": 6.53125,
      "learning_rate": 2.4947709880776607e-06,
      "logits/chosen": -3.4514999389648438,
      "logits/rejected": -3.581846237182617,
      "logps/chosen": -248.5549774169922,
      "logps/rejected": -214.66116333007812,
      "loss": 0.5098,
      "rewards/accuracies": 0.7000000476837158,
      "rewards/chosen": 1.2313129901885986,
      "rewards/diff": 0.08004424721002579,
      "rewards/diff_abs": 1.226075530052185,
      "rewards/rejected": 0.3127269446849823,
      "rewards/student_margin": 0.9185859560966492,
      "rewards/teacher_margin": 0.8385416865348816,
      "step": 460
    },
    {
      "epoch": 0.56,
      "grad_norm": 6.125,
      "learning_rate": 2.3902259752439462e-06,
      "logits/chosen": -3.492166042327881,
      "logits/rejected": -3.5663814544677734,
      "logps/chosen": -278.4722595214844,
      "logps/rejected": -242.76022338867188,
      "loss": 0.5051,
      "rewards/accuracies": 0.7999999523162842,
      "rewards/chosen": 1.3407200574874878,
      "rewards/diff": -0.000378596771042794,
      "rewards/diff_abs": 1.141404390335083,
      "rewards/rejected": 0.5020361542701721,
      "rewards/student_margin": 0.8386839628219604,
      "rewards/teacher_margin": 0.839062511920929,
      "step": 470
    },
    {
      "epoch": 0.57,
      "grad_norm": 6.25,
      "learning_rate": 2.2858730309019594e-06,
      "logits/chosen": -3.388932704925537,
      "logits/rejected": -3.441415309906006,
      "logps/chosen": -331.4084777832031,
      "logps/rejected": -241.6312713623047,
      "loss": 0.5096,
      "rewards/accuracies": 0.800000011920929,
      "rewards/chosen": 1.8032405376434326,
      "rewards/diff": 0.10180602222681046,
      "rewards/diff_abs": 1.0163486003875732,
      "rewards/rejected": 0.8545595407485962,
      "rewards/student_margin": 0.9486810564994812,
      "rewards/teacher_margin": 0.846875011920929,
      "step": 480
    },
    {
      "epoch": 0.59,
      "grad_norm": 7.28125,
      "learning_rate": 2.181894738433076e-06,
      "logits/chosen": -3.532305955886841,
      "logits/rejected": -3.5801339149475098,
      "logps/chosen": -246.8173065185547,
      "logps/rejected": -220.6660614013672,
      "loss": 0.5419,
      "rewards/accuracies": 0.7666666507720947,
      "rewards/chosen": 1.4681113958358765,
      "rewards/diff": -0.13037186861038208,
      "rewards/diff_abs": 0.9364659190177917,
      "rewards/rejected": 0.711243748664856,
      "rewards/student_margin": 0.7568677067756653,
      "rewards/teacher_margin": 0.8872395753860474,
      "step": 490
    },
    {
      "epoch": 0.6,
      "grad_norm": 6.3125,
      "learning_rate": 2.078473025700937e-06,
      "logits/chosen": -3.536620616912842,
      "logits/rejected": -3.610663652420044,
      "logps/chosen": -196.84896850585938,
      "logps/rejected": -168.04319763183594,
      "loss": 0.5438,
      "rewards/accuracies": 0.6333333253860474,
      "rewards/chosen": 0.8065615892410278,
      "rewards/diff": -0.20639605820178986,
      "rewards/diff_abs": 1.1502354145050049,
      "rewards/rejected": 0.2853534519672394,
      "rewards/student_margin": 0.5212081670761108,
      "rewards/teacher_margin": 0.7276042103767395,
      "step": 500
    },
    {
      "epoch": 0.61,
      "grad_norm": 6.625,
      "learning_rate": 1.975788846737431e-06,
      "logits/chosen": -3.4818530082702637,
      "logits/rejected": -3.5163490772247314,
      "logps/chosen": -223.56863403320312,
      "logps/rejected": -223.87515258789062,
      "loss": 0.5176,
      "rewards/accuracies": 0.6666666269302368,
      "rewards/chosen": 1.066042423248291,
      "rewards/diff": -0.1881529539823532,
      "rewards/diff_abs": 0.9577949643135071,
      "rewards/rejected": 0.4776328504085541,
      "rewards/student_margin": 0.5884095430374146,
      "rewards/teacher_margin": 0.7765625715255737,
      "step": 510
    },
    {
      "epoch": 0.62,
      "grad_norm": 6.03125,
      "learning_rate": 1.8740218651325714e-06,
      "logits/chosen": -3.465400218963623,
      "logits/rejected": -3.4614810943603516,
      "logps/chosen": -256.56890869140625,
      "logps/rejected": -236.2727813720703,
      "loss": 0.5154,
      "rewards/accuracies": 0.7999999523162842,
      "rewards/chosen": 1.6230605840682983,
      "rewards/diff": 0.13750340044498444,
      "rewards/diff_abs": 0.9272276163101196,
      "rewards/rejected": 0.6319113969802856,
      "rewards/student_margin": 0.9911492466926575,
      "rewards/teacher_margin": 0.853645920753479,
      "step": 520
    },
    {
      "epoch": 0.63,
      "grad_norm": 6.9375,
      "learning_rate": 1.7733501396822178e-06,
      "logits/chosen": -3.588365077972412,
      "logits/rejected": -3.5591952800750732,
      "logps/chosen": -199.75267028808594,
      "logps/rejected": -181.21066284179688,
      "loss": 0.5305,
      "rewards/accuracies": 0.6666666269302368,
      "rewards/chosen": 1.041146993637085,
      "rewards/diff": -0.35296258330345154,
      "rewards/diff_abs": 1.0002224445343018,
      "rewards/rejected": 0.4019221365451813,
      "rewards/student_margin": 0.6392248868942261,
      "rewards/teacher_margin": 0.9921875,
      "step": 530
    },
    {
      "epoch": 0.65,
      "grad_norm": 6.375,
      "learning_rate": 1.6739498128436563e-06,
      "logits/chosen": -3.5126869678497314,
      "logits/rejected": -3.5697379112243652,
      "logps/chosen": -275.712158203125,
      "logps/rejected": -249.92800903320312,
      "loss": 0.5093,
      "rewards/accuracies": 0.7666666507720947,
      "rewards/chosen": 1.5757232904434204,
      "rewards/diff": 0.19753125309944153,
      "rewards/diff_abs": 1.0118718147277832,
      "rewards/rejected": 0.4016294479370117,
      "rewards/student_margin": 1.1740937232971191,
      "rewards/teacher_margin": 0.9765625,
      "step": 540
    },
    {
      "epoch": 0.66,
      "grad_norm": 6.0,
      "learning_rate": 1.5759948025441535e-06,
      "logits/chosen": -3.370077610015869,
      "logits/rejected": -3.4373347759246826,
      "logps/chosen": -266.87689208984375,
      "logps/rejected": -229.03158569335938,
      "loss": 0.5216,
      "rewards/accuracies": 0.7666666507720947,
      "rewards/chosen": 1.2909305095672607,
      "rewards/diff": -0.09488488733768463,
      "rewards/diff_abs": 1.032594084739685,
      "rewards/rejected": 0.4243570864200592,
      "rewards/student_margin": 0.8665734529495239,
      "rewards/teacher_margin": 0.9614583849906921,
      "step": 550
    },
    {
      "epoch": 0.67,
      "grad_norm": 5.75,
      "learning_rate": 1.479656497881698e-06,
      "logits/chosen": -3.55267071723938,
      "logits/rejected": -3.6114087104797363,
      "logps/chosen": -230.6641082763672,
      "logps/rejected": -188.7877655029297,
      "loss": 0.4984,
      "rewards/accuracies": 0.6000000238418579,
      "rewards/chosen": 1.1030203104019165,
      "rewards/diff": -0.43049484491348267,
      "rewards/diff_abs": 1.0176421403884888,
      "rewards/rejected": 0.7288275957107544,
      "rewards/student_margin": 0.37419265508651733,
      "rewards/teacher_margin": 0.8046875,
      "step": 560
    },
    {
      "epoch": 0.68,
      "grad_norm": 6.28125,
      "learning_rate": 1.3851034592503648e-06,
      "logits/chosen": -3.3889052867889404,
      "logits/rejected": -3.5159294605255127,
      "logps/chosen": -272.55511474609375,
      "logps/rejected": -199.54537963867188,
      "loss": 0.5254,
      "rewards/accuracies": 0.7999999523162842,
      "rewards/chosen": 1.3238239288330078,
      "rewards/diff": 0.10217878967523575,
      "rewards/diff_abs": 0.8275870084762573,
      "rewards/rejected": 0.3945617377758026,
      "rewards/student_margin": 0.9292620420455933,
      "rewards/teacher_margin": 0.82708340883255,
      "step": 570
    },
    {
      "epoch": 0.69,
      "grad_norm": 6.78125,
      "learning_rate": 1.2925011234149859e-06,
      "logits/chosen": -3.478515148162842,
      "logits/rejected": -3.606118679046631,
      "logps/chosen": -204.07174682617188,
      "logps/rejected": -156.8729705810547,
      "loss": 0.5088,
      "rewards/accuracies": 0.6666666269302368,
      "rewards/chosen": 1.1958485841751099,
      "rewards/diff": 0.014614415355026722,
      "rewards/diff_abs": 1.1147606372833252,
      "rewards/rejected": 0.2895674705505371,
      "rewards/student_margin": 0.9062811136245728,
      "rewards/teacher_margin": 0.8916667699813843,
      "step": 580
    },
    {
      "epoch": 0.71,
      "grad_norm": 6.21875,
      "learning_rate": 1.2020115140511436e-06,
      "logits/chosen": -3.372546434402466,
      "logits/rejected": -3.3879222869873047,
      "logps/chosen": -285.7796936035156,
      "logps/rejected": -257.11016845703125,
      "loss": 0.5148,
      "rewards/accuracies": 0.7333332896232605,
      "rewards/chosen": 1.2352849245071411,
      "rewards/diff": -0.10239236056804657,
      "rewards/diff_abs": 0.7884107828140259,
      "rewards/rejected": 0.5449690818786621,
      "rewards/student_margin": 0.6903160214424133,
      "rewards/teacher_margin": 0.7927082777023315,
      "step": 590
    },
    {
      "epoch": 0.72,
      "grad_norm": 6.25,
      "learning_rate": 1.11379295825695e-06,
      "logits/chosen": -3.4046216011047363,
      "logits/rejected": -3.449857711791992,
      "logps/chosen": -274.1463317871094,
      "logps/rejected": -247.4075927734375,
      "loss": 0.5252,
      "rewards/accuracies": 0.6333333253860474,
      "rewards/chosen": 1.4221832752227783,
      "rewards/diff": -0.02768692374229431,
      "rewards/diff_abs": 0.8126093745231628,
      "rewards/rejected": 0.7457036972045898,
      "rewards/student_margin": 0.676479697227478,
      "rewards/teacher_margin": 0.7041667699813843,
      "step": 600
    },
    {
      "epoch": 0.73,
      "grad_norm": 6.625,
      "learning_rate": 1.0279998095326188e-06,
      "logits/chosen": -3.5202414989471436,
      "logits/rejected": -3.6290194988250732,
      "logps/chosen": -281.3245849609375,
      "logps/rejected": -231.3522186279297,
      "loss": 0.5181,
      "rewards/accuracies": 0.7333332896232605,
      "rewards/chosen": 1.2789822816848755,
      "rewards/diff": -0.12169651687145233,
      "rewards/diff_abs": 0.7308156490325928,
      "rewards/rejected": 0.6079703569412231,
      "rewards/student_margin": 0.6710118055343628,
      "rewards/teacher_margin": 0.7927082777023315,
      "step": 610
    },
    {
      "epoch": 0.74,
      "grad_norm": 6.15625,
      "learning_rate": 9.447821777125376e-07,
      "logits/chosen": -3.484200954437256,
      "logits/rejected": -3.4762959480285645,
      "logps/chosen": -234.85791015625,
      "logps/rejected": -222.82534790039062,
      "loss": 0.5155,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": 0.9543665647506714,
      "rewards/diff": -0.3350816071033478,
      "rewards/diff_abs": 1.0735111236572266,
      "rewards/rejected": 0.3592398166656494,
      "rewards/student_margin": 0.595126748085022,
      "rewards/teacher_margin": 0.9302083849906921,
      "step": 620
    },
    {
      "epoch": 0.75,
      "grad_norm": 7.375,
      "learning_rate": 8.642856663223537e-07,
      "logits/chosen": -3.6152985095977783,
      "logits/rejected": -3.6913936138153076,
      "logps/chosen": -278.0227355957031,
      "logps/rejected": -192.86460876464844,
      "loss": 0.5314,
      "rewards/accuracies": 0.7999999523162842,
      "rewards/chosen": 1.4040509462356567,
      "rewards/diff": -0.1374289095401764,
      "rewards/diff_abs": 0.7850462198257446,
      "rewards/rejected": 0.5149174928665161,
      "rewards/student_margin": 0.889133632183075,
      "rewards/teacher_margin": 1.0265624523162842,
      "step": 630
    },
    {
      "epoch": 0.77,
      "grad_norm": 5.71875,
      "learning_rate": 7.866511178206202e-07,
      "logits/chosen": -3.5455310344696045,
      "logits/rejected": -3.4960360527038574,
      "logps/chosen": -288.90374755859375,
      "logps/rejected": -259.2998046875,
      "loss": 0.5068,
      "rewards/accuracies": 0.7000000476837158,
      "rewards/chosen": 1.5496443510055542,
      "rewards/diff": -0.2831823229789734,
      "rewards/diff_abs": 1.0701242685317993,
      "rewards/rejected": 0.9109517335891724,
      "rewards/student_margin": 0.6386927366256714,
      "rewards/teacher_margin": 0.921875,
      "step": 640
    },
    {
      "epoch": 0.78,
      "grad_norm": 6.5625,
      "learning_rate": 7.120143671707535e-07,
      "logits/chosen": -3.624680995941162,
      "logits/rejected": -3.571241855621338,
      "logps/chosen": -238.3637237548828,
      "logps/rejected": -190.933349609375,
      "loss": 0.5136,
      "rewards/accuracies": 0.6999999284744263,
      "rewards/chosen": 1.272955060005188,
      "rewards/diff": -0.05736231803894043,
      "rewards/diff_abs": 0.828034520149231,
      "rewards/rejected": 0.6131298542022705,
      "rewards/student_margin": 0.6598252058029175,
      "rewards/teacher_margin": 0.7171874642372131,
      "step": 650
    },
    {
      "epoch": 0.79,
      "grad_norm": 5.9375,
      "learning_rate": 6.405060041744557e-07,
      "logits/chosen": -3.3889694213867188,
      "logits/rejected": -3.4272830486297607,
      "logps/chosen": -314.39337158203125,
      "logps/rejected": -279.32037353515625,
      "loss": 0.5242,
      "rewards/accuracies": 0.6333333253860474,
      "rewards/chosen": 1.6729543209075928,
      "rewards/diff": -0.12356214225292206,
      "rewards/diff_abs": 1.1017476320266724,
      "rewards/rejected": 0.8991208076477051,
      "rewards/student_margin": 0.7738337516784668,
      "rewards/teacher_margin": 0.8973957896232605,
      "step": 660
    },
    {
      "epoch": 0.8,
      "grad_norm": 7.0,
      "learning_rate": 5.72251144982447e-07,
      "logits/chosen": -3.5143237113952637,
      "logits/rejected": -3.4414215087890625,
      "logps/chosen": -255.2972869873047,
      "logps/rejected": -279.54595947265625,
      "loss": 0.4898,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": 1.4696900844573975,
      "rewards/diff": 0.25774964690208435,
      "rewards/diff_abs": 1.2428072690963745,
      "rewards/rejected": 0.48954445123672485,
      "rewards/student_margin": 0.9801454544067383,
      "rewards/teacher_margin": 0.7223958969116211,
      "step": 670
    },
    {
      "epoch": 0.81,
      "grad_norm": 6.4375,
      "learning_rate": 5.07369213182295e-07,
      "logits/chosen": -3.437652111053467,
      "logits/rejected": -3.513336181640625,
      "logps/chosen": -256.0563659667969,
      "logps/rejected": -192.80081176757812,
      "loss": 0.5181,
      "rewards/accuracies": 0.7333332896232605,
      "rewards/chosen": 0.9916725158691406,
      "rewards/diff": -0.04445856809616089,
      "rewards/diff_abs": 1.0562130212783813,
      "rewards/rejected": 0.035089436918497086,
      "rewards/student_margin": 0.9565832018852234,
      "rewards/teacher_margin": 1.0010416507720947,
      "step": 680
    },
    {
      "epoch": 0.83,
      "grad_norm": 5.75,
      "learning_rate": 4.4597373084635717e-07,
      "logits/chosen": -3.4108052253723145,
      "logits/rejected": -3.403064727783203,
      "logps/chosen": -295.178955078125,
      "logps/rejected": -241.88851928710938,
      "loss": 0.5054,
      "rewards/accuracies": 0.5666666626930237,
      "rewards/chosen": 1.1539413928985596,
      "rewards/diff": -0.40385159850120544,
      "rewards/diff_abs": 1.0642129182815552,
      "rewards/rejected": 0.7130011320114136,
      "rewards/student_margin": 0.44094014167785645,
      "rewards/teacher_margin": 0.8447917103767395,
      "step": 690
    },
    {
      "epoch": 0.84,
      "grad_norm": 6.4375,
      "learning_rate": 3.88172119905435e-07,
      "logits/chosen": -3.562473773956299,
      "logits/rejected": -3.469137668609619,
      "logps/chosen": -264.6433410644531,
      "logps/rejected": -231.51602172851562,
      "loss": 0.5061,
      "rewards/accuracies": 0.7666667103767395,
      "rewards/chosen": 1.1783157587051392,
      "rewards/diff": -0.00968353170901537,
      "rewards/diff_abs": 0.7437331080436707,
      "rewards/rejected": 0.3218533396720886,
      "rewards/student_margin": 0.856462299823761,
      "rewards/teacher_margin": 0.86614590883255,
      "step": 700
    },
    {
      "epoch": 0.85,
      "grad_norm": 5.25,
      "learning_rate": 3.3406551419567584e-07,
      "logits/chosen": -3.484909772872925,
      "logits/rejected": -3.444756269454956,
      "logps/chosen": -285.2689208984375,
      "logps/rejected": -289.5472106933594,
      "loss": 0.4931,
      "rewards/accuracies": 0.6666666269302368,
      "rewards/chosen": 1.4713407754898071,
      "rewards/diff": 0.38614755868911743,
      "rewards/diff_abs": 1.1154059171676636,
      "rewards/rejected": 0.4737350344657898,
      "rewards/student_margin": 0.9976059198379517,
      "rewards/teacher_margin": 0.6114583611488342,
      "step": 710
    },
    {
      "epoch": 0.86,
      "grad_norm": 6.15625,
      "learning_rate": 2.837485825075728e-07,
      "logits/chosen": -3.577286958694458,
      "logits/rejected": -3.652881145477295,
      "logps/chosen": -301.7745361328125,
      "logps/rejected": -229.53173828125,
      "loss": 0.5191,
      "rewards/accuracies": 0.6666666865348816,
      "rewards/chosen": 1.0710813999176025,
      "rewards/diff": -0.43389981985092163,
      "rewards/diff_abs": 1.1230199337005615,
      "rewards/rejected": 0.5716478228569031,
      "rewards/student_margin": 0.4994335174560547,
      "rewards/teacher_margin": 0.9333332777023315,
      "step": 720
    },
    {
      "epoch": 0.87,
      "grad_norm": 5.90625,
      "learning_rate": 2.37309362946673e-07,
      "logits/chosen": -3.4588115215301514,
      "logits/rejected": -3.5218307971954346,
      "logps/chosen": -200.84402465820312,
      "logps/rejected": -166.37826538085938,
      "loss": 0.513,
      "rewards/accuracies": 0.8999999761581421,
      "rewards/chosen": 0.8319117426872253,
      "rewards/diff": -0.07107441127300262,
      "rewards/diff_abs": 0.7676871418952942,
      "rewards/rejected": 0.08527780324220657,
      "rewards/student_margin": 0.7466338872909546,
      "rewards/teacher_margin": 0.8177083134651184,
      "step": 730
    },
    {
      "epoch": 0.89,
      "grad_norm": 5.375,
      "learning_rate": 1.948291088958032e-07,
      "logits/chosen": -3.379662275314331,
      "logits/rejected": -3.4146881103515625,
      "logps/chosen": -259.4773864746094,
      "logps/rejected": -210.60165405273438,
      "loss": 0.5071,
      "rewards/accuracies": 0.6000000238418579,
      "rewards/chosen": 0.9233649373054504,
      "rewards/diff": -0.2959494888782501,
      "rewards/diff_abs": 1.0808264017105103,
      "rewards/rejected": 0.5380643010139465,
      "rewards/student_margin": 0.38530051708221436,
      "rewards/teacher_margin": 0.6812499761581421,
      "step": 740
    },
    {
      "epoch": 0.9,
      "grad_norm": 7.15625,
      "learning_rate": 1.5638214684833923e-07,
      "logits/chosen": -3.3812708854675293,
      "logits/rejected": -3.489490032196045,
      "logps/chosen": -282.34906005859375,
      "logps/rejected": -206.3470458984375,
      "loss": 0.5175,
      "rewards/accuracies": 0.6666666865348816,
      "rewards/chosen": 1.5291283130645752,
      "rewards/diff": -0.04967302083969116,
      "rewards/diff_abs": 1.0276174545288086,
      "rewards/rejected": 0.6121346354484558,
      "rewards/student_margin": 0.9169937372207642,
      "rewards/teacher_margin": 0.9666666984558105,
      "step": 750
    },
    {
      "epoch": 0.91,
      "grad_norm": 7.0625,
      "learning_rate": 1.220357463612501e-07,
      "logits/chosen": -3.5278987884521484,
      "logits/rejected": -3.4870636463165283,
      "logps/chosen": -262.7372131347656,
      "logps/rejected": -204.89157104492188,
      "loss": 0.5368,
      "rewards/accuracies": 0.800000011920929,
      "rewards/chosen": 1.4189398288726807,
      "rewards/diff": 0.041193410754203796,
      "rewards/diff_abs": 0.6765682101249695,
      "rewards/rejected": 0.6954547166824341,
      "rewards/student_margin": 0.7234851121902466,
      "rewards/teacher_margin": 0.6822917461395264,
      "step": 760
    },
    {
      "epoch": 0.92,
      "grad_norm": 6.34375,
      "learning_rate": 9.185000235546443e-08,
      "logits/chosen": -3.531663417816162,
      "logits/rejected": -3.5207691192626953,
      "logps/chosen": -221.78579711914062,
      "logps/rejected": -199.05947875976562,
      "loss": 0.5111,
      "rewards/accuracies": 0.6999999284744263,
      "rewards/chosen": 1.0741617679595947,
      "rewards/diff": -0.33916252851486206,
      "rewards/diff_abs": 0.7832191586494446,
      "rewards/rejected": 0.7206159830093384,
      "rewards/student_margin": 0.35354581475257874,
      "rewards/teacher_margin": 0.6927083730697632,
      "step": 770
    },
    {
      "epoch": 0.93,
      "grad_norm": 6.78125,
      "learning_rate": 6.587772996949876e-08,
      "logits/chosen": -3.4602973461151123,
      "logits/rejected": -3.5840487480163574,
      "logps/chosen": -273.8219299316406,
      "logps/rejected": -187.69760131835938,
      "loss": 0.5151,
      "rewards/accuracies": 0.7666666507720947,
      "rewards/chosen": 1.2822003364562988,
      "rewards/diff": 0.012847900390625,
      "rewards/diff_abs": 0.7760865688323975,
      "rewards/rejected": 0.34331077337265015,
      "rewards/student_margin": 0.9388895034790039,
      "rewards/teacher_margin": 0.9260417819023132,
      "step": 780
    },
    {
      "epoch": 0.95,
      "grad_norm": 6.5625,
      "learning_rate": 4.416437215030628e-08,
      "logits/chosen": -3.357858657836914,
      "logits/rejected": -3.428370237350464,
      "logps/chosen": -231.9785614013672,
      "logps/rejected": -208.6261749267578,
      "loss": 0.5225,
      "rewards/accuracies": 0.7666666507720947,
      "rewards/chosen": 1.1795800924301147,
      "rewards/diff": -0.32492926716804504,
      "rewards/diff_abs": 1.1657856702804565,
      "rewards/rejected": 0.5086759328842163,
      "rewards/student_margin": 0.6709040403366089,
      "rewards/teacher_margin": 0.9958333969116211,
      "step": 790
    },
    {
      "epoch": 0.96,
      "grad_norm": 6.84375,
      "learning_rate": 2.6747920143047056e-08,
      "logits/chosen": -3.574307680130005,
      "logits/rejected": -3.662809371948242,
      "logps/chosen": -242.3976593017578,
      "logps/rejected": -184.373291015625,
      "loss": 0.4997,
      "rewards/accuracies": 0.8333333134651184,
      "rewards/chosen": 1.215039610862732,
      "rewards/diff": 0.025632739067077637,
      "rewards/diff_abs": 0.862860381603241,
      "rewards/rejected": 0.06649022549390793,
      "rewards/student_margin": 1.1485494375228882,
      "rewards/teacher_margin": 1.1229166984558105,
      "step": 800
    },
    {
      "epoch": 0.97,
      "grad_norm": 7.0,
      "learning_rate": 1.3658847018884758e-08,
      "logits/chosen": -3.3837954998016357,
      "logits/rejected": -3.477294445037842,
      "logps/chosen": -303.0797119140625,
      "logps/rejected": -258.445068359375,
      "loss": 0.5211,
      "rewards/accuracies": 0.6666666269302368,
      "rewards/chosen": 1.3037405014038086,
      "rewards/diff": -0.3252946734428406,
      "rewards/diff_abs": 1.0890836715698242,
      "rewards/rejected": 0.933201789855957,
      "rewards/student_margin": 0.3705386519432068,
      "rewards/teacher_margin": 0.6958333849906921,
      "step": 810
    },
    {
      "epoch": 0.98,
      "grad_norm": 5.96875,
      "learning_rate": 4.920054357119841e-09,
      "logits/chosen": -3.4326694011688232,
      "logits/rejected": -3.4905331134796143,
      "logps/chosen": -251.43948364257812,
      "logps/rejected": -198.3247833251953,
      "loss": 0.512,
      "rewards/accuracies": 0.8666666746139526,
      "rewards/chosen": 1.450547695159912,
      "rewards/diff": -0.012740576639771461,
      "rewards/diff_abs": 0.7532329559326172,
      "rewards/rejected": 0.5716216564178467,
      "rewards/student_margin": 0.878926157951355,
      "rewards/teacher_margin": 0.8916667699813843,
      "step": 820
    },
    {
      "epoch": 0.99,
      "grad_norm": 5.96875,
      "learning_rate": 5.468321749468875e-10,
      "logits/chosen": -3.446951389312744,
      "logits/rejected": -3.5641331672668457,
      "logps/chosen": -233.23678588867188,
      "logps/rejected": -200.827880859375,
      "loss": 0.5075,
      "rewards/accuracies": 0.6666666269302368,
      "rewards/chosen": 0.8117042779922485,
      "rewards/diff": -0.23592355847358704,
      "rewards/diff_abs": 0.7656908631324768,
      "rewards/rejected": 0.2184610813856125,
      "rewards/student_margin": 0.5932431817054749,
      "rewards/teacher_margin": 0.8291667103767395,
      "step": 830
    },
    {
      "epoch": 1.0,
      "step": 835,
      "total_flos": 0.0,
      "train_loss": 0.5412804069633256,
      "train_runtime": 5959.7316,
      "train_samples_per_second": 26.891,
      "train_steps_per_second": 0.14
    }
  ],
  "logging_steps": 10,
  "max_steps": 835,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100000000000000000000000000000000,
  "total_flos": 0.0,
  "train_batch_size": 3,
  "trial_name": null,
  "trial_params": null
}