|
{ |
|
"best_metric": 0.6133951445650848, |
|
"best_model_checkpoint": "./step_test_microsoft_dit/checkpoint-2500", |
|
"epoch": 0.6942691239585963, |
|
"eval_steps": 50, |
|
"global_step": 2750, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0025246149962130774, |
|
"grad_norm": 1.0554239749908447, |
|
"learning_rate": 2.997e-05, |
|
"loss": 0.3197, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.005049229992426155, |
|
"grad_norm": 1.5600422620773315, |
|
"learning_rate": 2.994e-05, |
|
"loss": 0.2047, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.007573844988639233, |
|
"grad_norm": 2.1541621685028076, |
|
"learning_rate": 2.991e-05, |
|
"loss": 0.1528, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.01009845998485231, |
|
"grad_norm": 1.805535078048706, |
|
"learning_rate": 2.9880000000000002e-05, |
|
"loss": 0.1252, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.012623074981065387, |
|
"grad_norm": 1.1236392259597778, |
|
"learning_rate": 2.985e-05, |
|
"loss": 0.1165, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.012623074981065387, |
|
"eval_f1": 0.4177168854259269, |
|
"eval_loss": 0.06423712521791458, |
|
"eval_runtime": 1142.2038, |
|
"eval_samples_per_second": 180.582, |
|
"eval_steps_per_second": 2.822, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.015147689977278465, |
|
"grad_norm": 1.1924934387207031, |
|
"learning_rate": 2.982e-05, |
|
"loss": 0.1029, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.017672304973491544, |
|
"grad_norm": 1.225701928138733, |
|
"learning_rate": 2.979e-05, |
|
"loss": 0.117, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.02019691996970462, |
|
"grad_norm": 2.702486515045166, |
|
"learning_rate": 2.976e-05, |
|
"loss": 0.1103, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.022721534965917698, |
|
"grad_norm": 2.0278918743133545, |
|
"learning_rate": 2.973e-05, |
|
"loss": 0.1054, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.025246149962130773, |
|
"grad_norm": 1.9288796186447144, |
|
"learning_rate": 2.97e-05, |
|
"loss": 0.0942, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.025246149962130773, |
|
"eval_f1": 0.4771817453963171, |
|
"eval_loss": 0.048530641943216324, |
|
"eval_runtime": 1007.4651, |
|
"eval_samples_per_second": 204.734, |
|
"eval_steps_per_second": 3.199, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.027770764958343852, |
|
"grad_norm": 1.4688999652862549, |
|
"learning_rate": 2.967e-05, |
|
"loss": 0.0992, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.03029537995455693, |
|
"grad_norm": 1.1097605228424072, |
|
"learning_rate": 2.964e-05, |
|
"loss": 0.1126, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.03281999495077001, |
|
"grad_norm": 1.0353784561157227, |
|
"learning_rate": 2.961e-05, |
|
"loss": 0.0949, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.03534460994698309, |
|
"grad_norm": 1.7303999662399292, |
|
"learning_rate": 2.958e-05, |
|
"loss": 0.1025, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.03786922494319616, |
|
"grad_norm": 1.1177138090133667, |
|
"learning_rate": 2.955e-05, |
|
"loss": 0.1076, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.03786922494319616, |
|
"eval_f1": 0.46432628333423487, |
|
"eval_loss": 0.05836363136768341, |
|
"eval_runtime": 980.1012, |
|
"eval_samples_per_second": 210.45, |
|
"eval_steps_per_second": 3.288, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.04039383993940924, |
|
"grad_norm": 1.1965147256851196, |
|
"learning_rate": 2.9520000000000002e-05, |
|
"loss": 0.0961, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.04291845493562232, |
|
"grad_norm": 1.0545780658721924, |
|
"learning_rate": 2.949e-05, |
|
"loss": 0.104, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.045443069931835396, |
|
"grad_norm": 1.8348199129104614, |
|
"learning_rate": 2.946e-05, |
|
"loss": 0.0932, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.047967684928048475, |
|
"grad_norm": 1.8478541374206543, |
|
"learning_rate": 2.943e-05, |
|
"loss": 0.1069, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.05049229992426155, |
|
"grad_norm": 0.9377999305725098, |
|
"learning_rate": 2.94e-05, |
|
"loss": 0.1103, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.05049229992426155, |
|
"eval_f1": NaN, |
|
"eval_loss": 0.044557176530361176, |
|
"eval_runtime": 978.7525, |
|
"eval_samples_per_second": 210.74, |
|
"eval_steps_per_second": 3.293, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.053016914920474625, |
|
"grad_norm": 1.6204830408096313, |
|
"learning_rate": 2.9370000000000002e-05, |
|
"loss": 0.1019, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.055541529916687704, |
|
"grad_norm": 1.1411000490188599, |
|
"learning_rate": 2.934e-05, |
|
"loss": 0.0969, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.05806614491290078, |
|
"grad_norm": 1.1179866790771484, |
|
"learning_rate": 2.931e-05, |
|
"loss": 0.1031, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.06059075990911386, |
|
"grad_norm": 1.2155176401138306, |
|
"learning_rate": 2.928e-05, |
|
"loss": 0.0851, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.06311537490532694, |
|
"grad_norm": 1.4578701257705688, |
|
"learning_rate": 2.925e-05, |
|
"loss": 0.0873, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.06311537490532694, |
|
"eval_f1": 0.5313367950730588, |
|
"eval_loss": 0.05184657499194145, |
|
"eval_runtime": 982.8389, |
|
"eval_samples_per_second": 209.863, |
|
"eval_steps_per_second": 3.279, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.06563998990154002, |
|
"grad_norm": 1.2894303798675537, |
|
"learning_rate": 2.922e-05, |
|
"loss": 0.0876, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.0681646048977531, |
|
"grad_norm": 0.8404099941253662, |
|
"learning_rate": 2.919e-05, |
|
"loss": 0.0904, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.07068921989396618, |
|
"grad_norm": 2.0062506198883057, |
|
"learning_rate": 2.916e-05, |
|
"loss": 0.1009, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.07321383489017924, |
|
"grad_norm": 0.8900242447853088, |
|
"learning_rate": 2.913e-05, |
|
"loss": 0.0925, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.07573844988639232, |
|
"grad_norm": 1.051013708114624, |
|
"learning_rate": 2.91e-05, |
|
"loss": 0.1053, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.07573844988639232, |
|
"eval_f1": 0.532925682031985, |
|
"eval_loss": 0.07359323650598526, |
|
"eval_runtime": 980.9407, |
|
"eval_samples_per_second": 210.27, |
|
"eval_steps_per_second": 3.286, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.0782630648826054, |
|
"grad_norm": 0.7765111327171326, |
|
"learning_rate": 2.907e-05, |
|
"loss": 0.0848, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.08078767987881848, |
|
"grad_norm": 0.9605777859687805, |
|
"learning_rate": 2.904e-05, |
|
"loss": 0.0746, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.08331229487503156, |
|
"grad_norm": 1.9086962938308716, |
|
"learning_rate": 2.901e-05, |
|
"loss": 0.1023, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.08583690987124463, |
|
"grad_norm": 1.5782345533370972, |
|
"learning_rate": 2.898e-05, |
|
"loss": 0.0751, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.08836152486745771, |
|
"grad_norm": 1.2298818826675415, |
|
"learning_rate": 2.895e-05, |
|
"loss": 0.0797, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.08836152486745771, |
|
"eval_f1": 0.5325518588749066, |
|
"eval_loss": 0.07257544994354248, |
|
"eval_runtime": 979.2135, |
|
"eval_samples_per_second": 210.64, |
|
"eval_steps_per_second": 3.291, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.09088613986367079, |
|
"grad_norm": 1.1932893991470337, |
|
"learning_rate": 2.892e-05, |
|
"loss": 0.0803, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.09341075485988387, |
|
"grad_norm": 0.896007776260376, |
|
"learning_rate": 2.889e-05, |
|
"loss": 0.088, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.09593536985609695, |
|
"grad_norm": 2.385890483856201, |
|
"learning_rate": 2.8859999999999998e-05, |
|
"loss": 0.0886, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.09845998485231003, |
|
"grad_norm": 0.966077446937561, |
|
"learning_rate": 2.883e-05, |
|
"loss": 0.1038, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.1009845998485231, |
|
"grad_norm": 0.969159722328186, |
|
"learning_rate": 2.88e-05, |
|
"loss": 0.0857, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.1009845998485231, |
|
"eval_f1": 0.5497736226259776, |
|
"eval_loss": 0.06929118931293488, |
|
"eval_runtime": 978.0405, |
|
"eval_samples_per_second": 210.893, |
|
"eval_steps_per_second": 3.295, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.10350921484473617, |
|
"grad_norm": 0.8633397817611694, |
|
"learning_rate": 2.877e-05, |
|
"loss": 0.0895, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.10603382984094925, |
|
"grad_norm": 1.163271188735962, |
|
"learning_rate": 2.874e-05, |
|
"loss": 0.0861, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.10855844483716233, |
|
"grad_norm": 1.102964997291565, |
|
"learning_rate": 2.871e-05, |
|
"loss": 0.0962, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.11108305983337541, |
|
"grad_norm": 1.520044207572937, |
|
"learning_rate": 2.868e-05, |
|
"loss": 0.0981, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.11360767482958849, |
|
"grad_norm": 1.8637338876724243, |
|
"learning_rate": 2.865e-05, |
|
"loss": 0.0885, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.11360767482958849, |
|
"eval_f1": NaN, |
|
"eval_loss": 0.09174469113349915, |
|
"eval_runtime": 1032.8967, |
|
"eval_samples_per_second": 199.693, |
|
"eval_steps_per_second": 3.12, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.11613228982580157, |
|
"grad_norm": 1.1974824666976929, |
|
"learning_rate": 2.862e-05, |
|
"loss": 0.0784, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.11865690482201464, |
|
"grad_norm": 1.6933320760726929, |
|
"learning_rate": 2.859e-05, |
|
"loss": 0.078, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.12118151981822772, |
|
"grad_norm": 1.7774609327316284, |
|
"learning_rate": 2.856e-05, |
|
"loss": 0.0715, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.1237061348144408, |
|
"grad_norm": 0.7675666213035583, |
|
"learning_rate": 2.853e-05, |
|
"loss": 0.0817, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.12623074981065388, |
|
"grad_norm": 1.169325590133667, |
|
"learning_rate": 2.8499999999999998e-05, |
|
"loss": 0.102, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.12623074981065388, |
|
"eval_f1": 0.5648781658864481, |
|
"eval_loss": 0.057994671165943146, |
|
"eval_runtime": 967.2924, |
|
"eval_samples_per_second": 213.236, |
|
"eval_steps_per_second": 3.332, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.12875536480686695, |
|
"grad_norm": 0.9567933678627014, |
|
"learning_rate": 2.847e-05, |
|
"loss": 0.0762, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.13127997980308004, |
|
"grad_norm": 0.7539889216423035, |
|
"learning_rate": 2.844e-05, |
|
"loss": 0.0655, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.1338045947992931, |
|
"grad_norm": 1.873833179473877, |
|
"learning_rate": 2.841e-05, |
|
"loss": 0.0747, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.1363292097955062, |
|
"grad_norm": 0.7834559082984924, |
|
"learning_rate": 2.838e-05, |
|
"loss": 0.0923, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.13885382479171926, |
|
"grad_norm": 0.6193771362304688, |
|
"learning_rate": 2.8349999999999998e-05, |
|
"loss": 0.0716, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.13885382479171926, |
|
"eval_f1": 0.538135593220339, |
|
"eval_loss": 0.07973095029592514, |
|
"eval_runtime": 974.1593, |
|
"eval_samples_per_second": 211.733, |
|
"eval_steps_per_second": 3.308, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.14137843978793235, |
|
"grad_norm": 1.1256766319274902, |
|
"learning_rate": 2.832e-05, |
|
"loss": 0.0798, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.14390305478414542, |
|
"grad_norm": 1.0669515132904053, |
|
"learning_rate": 2.829e-05, |
|
"loss": 0.0795, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.14642766978035848, |
|
"grad_norm": 1.018234133720398, |
|
"learning_rate": 2.826e-05, |
|
"loss": 0.073, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.14895228477657158, |
|
"grad_norm": 1.2367616891860962, |
|
"learning_rate": 2.823e-05, |
|
"loss": 0.0879, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.15147689977278464, |
|
"grad_norm": 1.5840317010879517, |
|
"learning_rate": 2.8199999999999998e-05, |
|
"loss": 0.0854, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.15147689977278464, |
|
"eval_f1": 0.571752762018513, |
|
"eval_loss": 0.07439474016427994, |
|
"eval_runtime": 970.5653, |
|
"eval_samples_per_second": 212.517, |
|
"eval_steps_per_second": 3.321, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.15400151476899773, |
|
"grad_norm": 0.5361483097076416, |
|
"learning_rate": 2.817e-05, |
|
"loss": 0.0854, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.1565261297652108, |
|
"grad_norm": 0.9658698439598083, |
|
"learning_rate": 2.8139999999999998e-05, |
|
"loss": 0.095, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.1590507447614239, |
|
"grad_norm": 0.820649266242981, |
|
"learning_rate": 2.8110000000000004e-05, |
|
"loss": 0.0921, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.16157535975763695, |
|
"grad_norm": 1.1583890914916992, |
|
"learning_rate": 2.8080000000000002e-05, |
|
"loss": 0.077, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.16409997475385005, |
|
"grad_norm": 0.8755506277084351, |
|
"learning_rate": 2.805e-05, |
|
"loss": 0.089, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.16409997475385005, |
|
"eval_f1": 0.5789600675594161, |
|
"eval_loss": 0.0503680482506752, |
|
"eval_runtime": 976.7796, |
|
"eval_samples_per_second": 211.165, |
|
"eval_steps_per_second": 3.3, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.1666245897500631, |
|
"grad_norm": 0.5073147416114807, |
|
"learning_rate": 2.8020000000000003e-05, |
|
"loss": 0.0784, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.1691492047462762, |
|
"grad_norm": 1.0332393646240234, |
|
"learning_rate": 2.799e-05, |
|
"loss": 0.0906, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.17167381974248927, |
|
"grad_norm": 1.1538151502609253, |
|
"learning_rate": 2.7960000000000003e-05, |
|
"loss": 0.0799, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.17419843473870233, |
|
"grad_norm": 1.2075843811035156, |
|
"learning_rate": 2.7930000000000002e-05, |
|
"loss": 0.0795, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.17672304973491543, |
|
"grad_norm": 2.1169683933258057, |
|
"learning_rate": 2.79e-05, |
|
"loss": 0.0721, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.17672304973491543, |
|
"eval_f1": 0.5727175590644663, |
|
"eval_loss": 0.0618172287940979, |
|
"eval_runtime": 975.6558, |
|
"eval_samples_per_second": 211.409, |
|
"eval_steps_per_second": 3.303, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.1792476647311285, |
|
"grad_norm": 1.3094089031219482, |
|
"learning_rate": 2.7870000000000003e-05, |
|
"loss": 0.0723, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.18177227972734158, |
|
"grad_norm": 0.9937088489532471, |
|
"learning_rate": 2.784e-05, |
|
"loss": 0.0704, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.18429689472355465, |
|
"grad_norm": 0.6464220881462097, |
|
"learning_rate": 2.7810000000000003e-05, |
|
"loss": 0.0731, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.18682150971976774, |
|
"grad_norm": 0.5544419288635254, |
|
"learning_rate": 2.778e-05, |
|
"loss": 0.0894, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.1893461247159808, |
|
"grad_norm": 0.6369556188583374, |
|
"learning_rate": 2.7750000000000004e-05, |
|
"loss": 0.0721, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.1893461247159808, |
|
"eval_f1": 0.5904197411394702, |
|
"eval_loss": 0.07033708691596985, |
|
"eval_runtime": 967.5811, |
|
"eval_samples_per_second": 213.173, |
|
"eval_steps_per_second": 3.331, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.1918707397121939, |
|
"grad_norm": 2.0700013637542725, |
|
"learning_rate": 2.7720000000000002e-05, |
|
"loss": 0.0831, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.19439535470840696, |
|
"grad_norm": 0.765533983707428, |
|
"learning_rate": 2.769e-05, |
|
"loss": 0.0707, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.19691996970462006, |
|
"grad_norm": 1.6104159355163574, |
|
"learning_rate": 2.7660000000000003e-05, |
|
"loss": 0.073, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.19944458470083312, |
|
"grad_norm": 1.1069729328155518, |
|
"learning_rate": 2.763e-05, |
|
"loss": 0.0702, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.2019691996970462, |
|
"grad_norm": 1.6577630043029785, |
|
"learning_rate": 2.7600000000000003e-05, |
|
"loss": 0.0865, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.2019691996970462, |
|
"eval_f1": 0.5952780441035476, |
|
"eval_loss": 0.058820515871047974, |
|
"eval_runtime": 917.4267, |
|
"eval_samples_per_second": 224.827, |
|
"eval_steps_per_second": 3.513, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.20449381469325928, |
|
"grad_norm": 1.5197840929031372, |
|
"learning_rate": 2.7570000000000002e-05, |
|
"loss": 0.0846, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.20701842968947234, |
|
"grad_norm": 1.1758556365966797, |
|
"learning_rate": 2.754e-05, |
|
"loss": 0.0813, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.20954304468568544, |
|
"grad_norm": 0.5016022324562073, |
|
"learning_rate": 2.7510000000000003e-05, |
|
"loss": 0.0718, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.2120676596818985, |
|
"grad_norm": 1.3600627183914185, |
|
"learning_rate": 2.748e-05, |
|
"loss": 0.0942, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.2145922746781116, |
|
"grad_norm": 0.6990534067153931, |
|
"learning_rate": 2.7450000000000003e-05, |
|
"loss": 0.0767, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.2145922746781116, |
|
"eval_f1": 0.5918155918155918, |
|
"eval_loss": 0.04372716695070267, |
|
"eval_runtime": 913.4291, |
|
"eval_samples_per_second": 225.811, |
|
"eval_steps_per_second": 3.528, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.21711688967432466, |
|
"grad_norm": 1.0468288660049438, |
|
"learning_rate": 2.7420000000000002e-05, |
|
"loss": 0.0805, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.21964150467053775, |
|
"grad_norm": 1.2046771049499512, |
|
"learning_rate": 2.739e-05, |
|
"loss": 0.0879, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.22216611966675082, |
|
"grad_norm": 0.9044977426528931, |
|
"learning_rate": 2.7360000000000002e-05, |
|
"loss": 0.0597, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.2246907346629639, |
|
"grad_norm": 1.145572304725647, |
|
"learning_rate": 2.733e-05, |
|
"loss": 0.1007, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.22721534965917697, |
|
"grad_norm": 1.058166742324829, |
|
"learning_rate": 2.7300000000000003e-05, |
|
"loss": 0.0773, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.22721534965917697, |
|
"eval_f1": 0.5956852791878172, |
|
"eval_loss": 0.05675825849175453, |
|
"eval_runtime": 923.1927, |
|
"eval_samples_per_second": 223.422, |
|
"eval_steps_per_second": 3.491, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.22973996465539007, |
|
"grad_norm": 0.7665570974349976, |
|
"learning_rate": 2.727e-05, |
|
"loss": 0.084, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.23226457965160313, |
|
"grad_norm": 0.8884145021438599, |
|
"learning_rate": 2.724e-05, |
|
"loss": 0.0748, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.2347891946478162, |
|
"grad_norm": 0.7132917046546936, |
|
"learning_rate": 2.7210000000000002e-05, |
|
"loss": 0.0861, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.2373138096440293, |
|
"grad_norm": 1.3353750705718994, |
|
"learning_rate": 2.718e-05, |
|
"loss": 0.091, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.23983842464024235, |
|
"grad_norm": 1.216691255569458, |
|
"learning_rate": 2.7150000000000003e-05, |
|
"loss": 0.0748, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.23983842464024235, |
|
"eval_f1": 0.5942299042601041, |
|
"eval_loss": 0.04645048826932907, |
|
"eval_runtime": 919.478, |
|
"eval_samples_per_second": 224.325, |
|
"eval_steps_per_second": 3.505, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.24236303963645545, |
|
"grad_norm": 1.0420501232147217, |
|
"learning_rate": 2.712e-05, |
|
"loss": 0.0953, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.2448876546326685, |
|
"grad_norm": 1.1488158702850342, |
|
"learning_rate": 2.709e-05, |
|
"loss": 0.0796, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.2474122696288816, |
|
"grad_norm": 0.7872379422187805, |
|
"learning_rate": 2.7060000000000002e-05, |
|
"loss": 0.0844, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.24993688462509467, |
|
"grad_norm": 0.9102885127067566, |
|
"learning_rate": 2.703e-05, |
|
"loss": 0.0792, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.25246149962130776, |
|
"grad_norm": 1.040650486946106, |
|
"learning_rate": 2.7000000000000002e-05, |
|
"loss": 0.0761, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.25246149962130776, |
|
"eval_f1": NaN, |
|
"eval_loss": 0.06595388799905777, |
|
"eval_runtime": 948.4123, |
|
"eval_samples_per_second": 217.481, |
|
"eval_steps_per_second": 3.398, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.25498611461752085, |
|
"grad_norm": 1.0717836618423462, |
|
"learning_rate": 2.697e-05, |
|
"loss": 0.0569, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.2575107296137339, |
|
"grad_norm": 0.7504699230194092, |
|
"learning_rate": 2.6940000000000003e-05, |
|
"loss": 0.072, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.260035344609947, |
|
"grad_norm": 0.9767778515815735, |
|
"learning_rate": 2.691e-05, |
|
"loss": 0.0658, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.2625599596061601, |
|
"grad_norm": 0.5905674695968628, |
|
"learning_rate": 2.688e-05, |
|
"loss": 0.0775, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.2650845746023731, |
|
"grad_norm": 1.6352293491363525, |
|
"learning_rate": 2.6850000000000002e-05, |
|
"loss": 0.0855, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.2650845746023731, |
|
"eval_f1": 0.5963938973647711, |
|
"eval_loss": 0.04910014942288399, |
|
"eval_runtime": 986.8376, |
|
"eval_samples_per_second": 209.013, |
|
"eval_steps_per_second": 3.266, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.2676091895985862, |
|
"grad_norm": 0.6634190082550049, |
|
"learning_rate": 2.682e-05, |
|
"loss": 0.0741, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.2701338045947993, |
|
"grad_norm": 0.5896914601325989, |
|
"learning_rate": 2.6790000000000003e-05, |
|
"loss": 0.0713, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.2726584195910124, |
|
"grad_norm": 1.3768564462661743, |
|
"learning_rate": 2.676e-05, |
|
"loss": 0.0684, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.27518303458722543, |
|
"grad_norm": 0.7323074340820312, |
|
"learning_rate": 2.673e-05, |
|
"loss": 0.084, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.2777076495834385, |
|
"grad_norm": 0.6660707592964172, |
|
"learning_rate": 2.6700000000000002e-05, |
|
"loss": 0.0832, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.2777076495834385, |
|
"eval_f1": 0.6048397002825205, |
|
"eval_loss": 0.049847185611724854, |
|
"eval_runtime": 967.6797, |
|
"eval_samples_per_second": 213.151, |
|
"eval_steps_per_second": 3.331, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.2802322645796516, |
|
"grad_norm": 1.425309419631958, |
|
"learning_rate": 2.667e-05, |
|
"loss": 0.0793, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.2827568795758647, |
|
"grad_norm": 1.3583918809890747, |
|
"learning_rate": 2.6640000000000002e-05, |
|
"loss": 0.0808, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.28528149457207774, |
|
"grad_norm": 1.1851533651351929, |
|
"learning_rate": 2.661e-05, |
|
"loss": 0.0738, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.28780610956829084, |
|
"grad_norm": 1.4497005939483643, |
|
"learning_rate": 2.658e-05, |
|
"loss": 0.078, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.29033072456450393, |
|
"grad_norm": 1.4407027959823608, |
|
"learning_rate": 2.655e-05, |
|
"loss": 0.0821, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.29033072456450393, |
|
"eval_f1": 0.6031633616619453, |
|
"eval_loss": 0.059650588780641556, |
|
"eval_runtime": 962.0892, |
|
"eval_samples_per_second": 214.39, |
|
"eval_steps_per_second": 3.35, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.29285533956071697, |
|
"grad_norm": 1.0721668004989624, |
|
"learning_rate": 2.652e-05, |
|
"loss": 0.0706, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.29537995455693006, |
|
"grad_norm": 1.1033729314804077, |
|
"learning_rate": 2.6490000000000002e-05, |
|
"loss": 0.0737, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.29790456955314315, |
|
"grad_norm": 0.9764577746391296, |
|
"learning_rate": 2.646e-05, |
|
"loss": 0.0743, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.30042918454935624, |
|
"grad_norm": 1.2160297632217407, |
|
"learning_rate": 2.643e-05, |
|
"loss": 0.0768, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.3029537995455693, |
|
"grad_norm": 0.8387085795402527, |
|
"learning_rate": 2.64e-05, |
|
"loss": 0.0715, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.3029537995455693, |
|
"eval_f1": NaN, |
|
"eval_loss": 0.06428094953298569, |
|
"eval_runtime": 961.1037, |
|
"eval_samples_per_second": 214.61, |
|
"eval_steps_per_second": 3.353, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.3054784145417824, |
|
"grad_norm": 1.061087727546692, |
|
"learning_rate": 2.637e-05, |
|
"loss": 0.0672, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.30800302953799547, |
|
"grad_norm": 0.6768150925636292, |
|
"learning_rate": 2.6340000000000002e-05, |
|
"loss": 0.0762, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.31052764453420856, |
|
"grad_norm": 0.7020296454429626, |
|
"learning_rate": 2.631e-05, |
|
"loss": 0.0838, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.3130522595304216, |
|
"grad_norm": 0.9264736175537109, |
|
"learning_rate": 2.628e-05, |
|
"loss": 0.0769, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.3155768745266347, |
|
"grad_norm": 0.657778799533844, |
|
"learning_rate": 2.625e-05, |
|
"loss": 0.085, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.3155768745266347, |
|
"eval_f1": 0.6054250016184373, |
|
"eval_loss": 0.06593530625104904, |
|
"eval_runtime": 970.0262, |
|
"eval_samples_per_second": 212.635, |
|
"eval_steps_per_second": 3.323, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.3181014895228478, |
|
"grad_norm": 0.6904731392860413, |
|
"learning_rate": 2.622e-05, |
|
"loss": 0.0736, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.3206261045190608, |
|
"grad_norm": 1.4745820760726929, |
|
"learning_rate": 2.619e-05, |
|
"loss": 0.0832, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.3231507195152739, |
|
"grad_norm": 1.0614553689956665, |
|
"learning_rate": 2.616e-05, |
|
"loss": 0.0781, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.325675334511487, |
|
"grad_norm": 1.2228913307189941, |
|
"learning_rate": 2.6130000000000002e-05, |
|
"loss": 0.0872, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.3281999495077001, |
|
"grad_norm": 0.9905760288238525, |
|
"learning_rate": 2.61e-05, |
|
"loss": 0.0826, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.3281999495077001, |
|
"eval_f1": 0.6011740745177908, |
|
"eval_loss": 0.05560224503278732, |
|
"eval_runtime": 964.1962, |
|
"eval_samples_per_second": 213.921, |
|
"eval_steps_per_second": 3.343, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.33072456450391313, |
|
"grad_norm": 1.1195616722106934, |
|
"learning_rate": 2.607e-05, |
|
"loss": 0.0751, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.3332491795001262, |
|
"grad_norm": 0.9830445647239685, |
|
"learning_rate": 2.604e-05, |
|
"loss": 0.0694, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.3357737944963393, |
|
"grad_norm": 1.7140698432922363, |
|
"learning_rate": 2.601e-05, |
|
"loss": 0.0694, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.3382984094925524, |
|
"grad_norm": 0.9545607566833496, |
|
"learning_rate": 2.5980000000000002e-05, |
|
"loss": 0.0626, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.34082302448876545, |
|
"grad_norm": 0.8236456513404846, |
|
"learning_rate": 2.595e-05, |
|
"loss": 0.064, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.34082302448876545, |
|
"eval_f1": NaN, |
|
"eval_loss": 0.0564185306429863, |
|
"eval_runtime": 1030.8018, |
|
"eval_samples_per_second": 200.099, |
|
"eval_steps_per_second": 3.127, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.34334763948497854, |
|
"grad_norm": 1.0344712734222412, |
|
"learning_rate": 2.592e-05, |
|
"loss": 0.074, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.34587225448119163, |
|
"grad_norm": 1.647894024848938, |
|
"learning_rate": 2.589e-05, |
|
"loss": 0.0756, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.34839686947740467, |
|
"grad_norm": 1.0268642902374268, |
|
"learning_rate": 2.586e-05, |
|
"loss": 0.064, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.35092148447361776, |
|
"grad_norm": 0.6588199734687805, |
|
"learning_rate": 2.5830000000000002e-05, |
|
"loss": 0.0685, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.35344609946983085, |
|
"grad_norm": 0.8278918862342834, |
|
"learning_rate": 2.58e-05, |
|
"loss": 0.0854, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.35344609946983085, |
|
"eval_f1": NaN, |
|
"eval_loss": 0.05516933649778366, |
|
"eval_runtime": 1032.3177, |
|
"eval_samples_per_second": 199.805, |
|
"eval_steps_per_second": 3.122, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.35597071446604395, |
|
"grad_norm": 0.4216013550758362, |
|
"learning_rate": 2.577e-05, |
|
"loss": 0.0785, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.358495329462257, |
|
"grad_norm": 0.9567118287086487, |
|
"learning_rate": 2.574e-05, |
|
"loss": 0.089, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.3610199444584701, |
|
"grad_norm": 1.3202637434005737, |
|
"learning_rate": 2.571e-05, |
|
"loss": 0.0884, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.36354455945468317, |
|
"grad_norm": 1.3245704174041748, |
|
"learning_rate": 2.568e-05, |
|
"loss": 0.0739, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.36606917445089626, |
|
"grad_norm": 0.6416196823120117, |
|
"learning_rate": 2.565e-05, |
|
"loss": 0.0702, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.36606917445089626, |
|
"eval_f1": 0.6061020319393525, |
|
"eval_loss": 0.06748606264591217, |
|
"eval_runtime": 999.3826, |
|
"eval_samples_per_second": 206.389, |
|
"eval_steps_per_second": 3.225, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.3685937894471093, |
|
"grad_norm": 0.9312785267829895, |
|
"learning_rate": 2.562e-05, |
|
"loss": 0.0674, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.3711184044433224, |
|
"grad_norm": 0.9092572927474976, |
|
"learning_rate": 2.559e-05, |
|
"loss": 0.0676, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.3736430194395355, |
|
"grad_norm": 1.4935100078582764, |
|
"learning_rate": 2.556e-05, |
|
"loss": 0.0712, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.3761676344357485, |
|
"grad_norm": 0.9569060802459717, |
|
"learning_rate": 2.553e-05, |
|
"loss": 0.0747, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.3786922494319616, |
|
"grad_norm": 0.947384774684906, |
|
"learning_rate": 2.55e-05, |
|
"loss": 0.0771, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.3786922494319616, |
|
"eval_f1": NaN, |
|
"eval_loss": 0.057753585278987885, |
|
"eval_runtime": 1000.0105, |
|
"eval_samples_per_second": 206.26, |
|
"eval_steps_per_second": 3.223, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.3812168644281747, |
|
"grad_norm": 0.6996080875396729, |
|
"learning_rate": 2.547e-05, |
|
"loss": 0.0696, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.3837414794243878, |
|
"grad_norm": 0.5415595173835754, |
|
"learning_rate": 2.544e-05, |
|
"loss": 0.0757, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.38626609442060084, |
|
"grad_norm": 0.5137012600898743, |
|
"learning_rate": 2.541e-05, |
|
"loss": 0.0621, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.38879070941681393, |
|
"grad_norm": 0.9606865048408508, |
|
"learning_rate": 2.538e-05, |
|
"loss": 0.073, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.391315324413027, |
|
"grad_norm": 1.1751604080200195, |
|
"learning_rate": 2.535e-05, |
|
"loss": 0.08, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.391315324413027, |
|
"eval_f1": NaN, |
|
"eval_loss": 0.0491572804749012, |
|
"eval_runtime": 963.3777, |
|
"eval_samples_per_second": 214.103, |
|
"eval_steps_per_second": 3.346, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.3938399394092401, |
|
"grad_norm": 0.935338020324707, |
|
"learning_rate": 2.5319999999999998e-05, |
|
"loss": 0.0729, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.39636455440545315, |
|
"grad_norm": 0.7157814502716064, |
|
"learning_rate": 2.529e-05, |
|
"loss": 0.0719, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.39888916940166624, |
|
"grad_norm": 0.6739543676376343, |
|
"learning_rate": 2.526e-05, |
|
"loss": 0.0631, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.40141378439787934, |
|
"grad_norm": 0.4896785318851471, |
|
"learning_rate": 2.523e-05, |
|
"loss": 0.0746, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.4039383993940924, |
|
"grad_norm": 0.7619987726211548, |
|
"learning_rate": 2.52e-05, |
|
"loss": 0.0804, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.4039383993940924, |
|
"eval_f1": 0.6111605289687482, |
|
"eval_loss": 0.05378127843141556, |
|
"eval_runtime": 958.466, |
|
"eval_samples_per_second": 215.2, |
|
"eval_steps_per_second": 3.363, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.40646301439030547, |
|
"grad_norm": 0.7464210987091064, |
|
"learning_rate": 2.517e-05, |
|
"loss": 0.0707, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.40898762938651856, |
|
"grad_norm": 0.6707102656364441, |
|
"learning_rate": 2.514e-05, |
|
"loss": 0.0671, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.41151224438273165, |
|
"grad_norm": 1.246846079826355, |
|
"learning_rate": 2.511e-05, |
|
"loss": 0.0627, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.4140368593789447, |
|
"grad_norm": 0.9796457886695862, |
|
"learning_rate": 2.508e-05, |
|
"loss": 0.0677, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.4165614743751578, |
|
"grad_norm": 0.9717236161231995, |
|
"learning_rate": 2.505e-05, |
|
"loss": 0.083, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.4165614743751578, |
|
"eval_f1": 0.6047686163965234, |
|
"eval_loss": 0.057900335639715195, |
|
"eval_runtime": 962.7843, |
|
"eval_samples_per_second": 214.235, |
|
"eval_steps_per_second": 3.348, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.4190860893713709, |
|
"grad_norm": 1.1706446409225464, |
|
"learning_rate": 2.502e-05, |
|
"loss": 0.0764, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.42161070436758397, |
|
"grad_norm": 0.45280393958091736, |
|
"learning_rate": 2.499e-05, |
|
"loss": 0.0682, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.424135319363797, |
|
"grad_norm": 1.0100760459899902, |
|
"learning_rate": 2.4959999999999998e-05, |
|
"loss": 0.0892, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.4266599343600101, |
|
"grad_norm": 1.0506736040115356, |
|
"learning_rate": 2.493e-05, |
|
"loss": 0.0666, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.4291845493562232, |
|
"grad_norm": 0.7978639006614685, |
|
"learning_rate": 2.49e-05, |
|
"loss": 0.0701, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.4291845493562232, |
|
"eval_f1": 0.6044656147662996, |
|
"eval_loss": 0.06738731265068054, |
|
"eval_runtime": 1102.7864, |
|
"eval_samples_per_second": 187.037, |
|
"eval_steps_per_second": 2.923, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.4317091643524363, |
|
"grad_norm": 1.121317982673645, |
|
"learning_rate": 2.487e-05, |
|
"loss": 0.0771, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.4342337793486493, |
|
"grad_norm": 1.0836131572723389, |
|
"learning_rate": 2.484e-05, |
|
"loss": 0.0719, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.4367583943448624, |
|
"grad_norm": 0.61658775806427, |
|
"learning_rate": 2.4809999999999998e-05, |
|
"loss": 0.0681, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.4392830093410755, |
|
"grad_norm": 0.647393524646759, |
|
"learning_rate": 2.478e-05, |
|
"loss": 0.0668, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.44180762433728854, |
|
"grad_norm": 0.782483696937561, |
|
"learning_rate": 2.475e-05, |
|
"loss": 0.0721, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.44180762433728854, |
|
"eval_f1": 0.5979155238617663, |
|
"eval_loss": 0.04912808537483215, |
|
"eval_runtime": 1171.9033, |
|
"eval_samples_per_second": 176.006, |
|
"eval_steps_per_second": 2.75, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.44433223933350163, |
|
"grad_norm": 0.4538789987564087, |
|
"learning_rate": 2.472e-05, |
|
"loss": 0.0641, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.4468568543297147, |
|
"grad_norm": 0.7954159379005432, |
|
"learning_rate": 2.469e-05, |
|
"loss": 0.079, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.4493814693259278, |
|
"grad_norm": 0.4370203912258148, |
|
"learning_rate": 2.4659999999999998e-05, |
|
"loss": 0.0769, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.45190608432214086, |
|
"grad_norm": 1.2641068696975708, |
|
"learning_rate": 2.463e-05, |
|
"loss": 0.0649, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.45443069931835395, |
|
"grad_norm": 1.262468695640564, |
|
"learning_rate": 2.4599999999999998e-05, |
|
"loss": 0.0765, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.45443069931835395, |
|
"eval_f1": null, |
|
"eval_loss": 0.04386861249804497, |
|
"eval_runtime": 1166.6982, |
|
"eval_samples_per_second": 176.791, |
|
"eval_steps_per_second": 2.762, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.45695531431456704, |
|
"grad_norm": 1.0922938585281372, |
|
"learning_rate": 2.457e-05, |
|
"loss": 0.074, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.45947992931078013, |
|
"grad_norm": 0.8298421502113342, |
|
"learning_rate": 2.454e-05, |
|
"loss": 0.0778, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.46200454430699317, |
|
"grad_norm": 1.182712435722351, |
|
"learning_rate": 2.4509999999999997e-05, |
|
"loss": 0.0793, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.46452915930320626, |
|
"grad_norm": 0.7366443276405334, |
|
"learning_rate": 2.448e-05, |
|
"loss": 0.0655, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.46705377429941936, |
|
"grad_norm": 0.9185643792152405, |
|
"learning_rate": 2.4449999999999998e-05, |
|
"loss": 0.0692, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.46705377429941936, |
|
"eval_f1": 0.6057632592224568, |
|
"eval_loss": 0.04681675508618355, |
|
"eval_runtime": 1170.8075, |
|
"eval_samples_per_second": 176.171, |
|
"eval_steps_per_second": 2.753, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.4695783892956324, |
|
"grad_norm": 0.99897301197052, |
|
"learning_rate": 2.442e-05, |
|
"loss": 0.0685, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.4721030042918455, |
|
"grad_norm": 1.0028034448623657, |
|
"learning_rate": 2.439e-05, |
|
"loss": 0.0748, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.4746276192880586, |
|
"grad_norm": 2.5226945877075195, |
|
"learning_rate": 2.4360000000000004e-05, |
|
"loss": 0.0715, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.47715223428427167, |
|
"grad_norm": 0.903256893157959, |
|
"learning_rate": 2.4330000000000003e-05, |
|
"loss": 0.0709, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.4796768492804847, |
|
"grad_norm": 0.9269793629646301, |
|
"learning_rate": 2.43e-05, |
|
"loss": 0.0761, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.4796768492804847, |
|
"eval_f1": 0.6124984470120511, |
|
"eval_loss": 0.05741230770945549, |
|
"eval_runtime": 1159.3714, |
|
"eval_samples_per_second": 177.908, |
|
"eval_steps_per_second": 2.78, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.4822014642766978, |
|
"grad_norm": 1.0651170015335083, |
|
"learning_rate": 2.4270000000000003e-05, |
|
"loss": 0.0751, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.4847260792729109, |
|
"grad_norm": 1.2628437280654907, |
|
"learning_rate": 2.4240000000000002e-05, |
|
"loss": 0.0852, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.487250694269124, |
|
"grad_norm": 1.3889621496200562, |
|
"learning_rate": 2.4210000000000004e-05, |
|
"loss": 0.073, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.489775309265337, |
|
"grad_norm": 1.028456687927246, |
|
"learning_rate": 2.4180000000000002e-05, |
|
"loss": 0.0644, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.4922999242615501, |
|
"grad_norm": 0.6997565627098083, |
|
"learning_rate": 2.415e-05, |
|
"loss": 0.0757, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.4922999242615501, |
|
"eval_f1": 0.6126181795711549, |
|
"eval_loss": 0.05692484602332115, |
|
"eval_runtime": 1161.9825, |
|
"eval_samples_per_second": 177.509, |
|
"eval_steps_per_second": 2.774, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.4948245392577632, |
|
"grad_norm": 1.384186863899231, |
|
"learning_rate": 2.4120000000000003e-05, |
|
"loss": 0.0697, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.49734915425397624, |
|
"grad_norm": 0.8674394488334656, |
|
"learning_rate": 2.409e-05, |
|
"loss": 0.0739, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.49987376925018934, |
|
"grad_norm": 1.826121211051941, |
|
"learning_rate": 2.4060000000000003e-05, |
|
"loss": 0.0739, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.5023983842464024, |
|
"grad_norm": 0.6903666257858276, |
|
"learning_rate": 2.4030000000000002e-05, |
|
"loss": 0.0661, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.5049229992426155, |
|
"grad_norm": 0.7339742183685303, |
|
"learning_rate": 2.4e-05, |
|
"loss": 0.0654, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.5049229992426155, |
|
"eval_f1": 0.6095153739086423, |
|
"eval_loss": 0.05489746853709221, |
|
"eval_runtime": 1168.0449, |
|
"eval_samples_per_second": 176.587, |
|
"eval_steps_per_second": 2.759, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.5074476142388286, |
|
"grad_norm": 0.7863900065422058, |
|
"learning_rate": 2.3970000000000003e-05, |
|
"loss": 0.061, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.5099722292350417, |
|
"grad_norm": 1.0800750255584717, |
|
"learning_rate": 2.394e-05, |
|
"loss": 0.0781, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.5124968442312547, |
|
"grad_norm": 1.0992929935455322, |
|
"learning_rate": 2.3910000000000003e-05, |
|
"loss": 0.0694, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.5150214592274678, |
|
"grad_norm": 0.703554093837738, |
|
"learning_rate": 2.3880000000000002e-05, |
|
"loss": 0.0881, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.5175460742236809, |
|
"grad_norm": 1.214089274406433, |
|
"learning_rate": 2.385e-05, |
|
"loss": 0.0736, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.5175460742236809, |
|
"eval_f1": 0.612187690432663, |
|
"eval_loss": 0.05384594947099686, |
|
"eval_runtime": 1155.7771, |
|
"eval_samples_per_second": 178.462, |
|
"eval_steps_per_second": 2.789, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.520070689219894, |
|
"grad_norm": 0.8359307050704956, |
|
"learning_rate": 2.3820000000000002e-05, |
|
"loss": 0.0759, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.522595304216107, |
|
"grad_norm": 1.6299511194229126, |
|
"learning_rate": 2.379e-05, |
|
"loss": 0.076, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.5251199192123202, |
|
"grad_norm": 0.6880617737770081, |
|
"learning_rate": 2.3760000000000003e-05, |
|
"loss": 0.0745, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.5276445342085332, |
|
"grad_norm": 0.7822777032852173, |
|
"learning_rate": 2.373e-05, |
|
"loss": 0.0697, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.5301691492047462, |
|
"grad_norm": 0.7941886782646179, |
|
"learning_rate": 2.37e-05, |
|
"loss": 0.0685, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.5301691492047462, |
|
"eval_f1": 0.6104315862855695, |
|
"eval_loss": 0.04854836314916611, |
|
"eval_runtime": 1154.0649, |
|
"eval_samples_per_second": 178.727, |
|
"eval_steps_per_second": 2.793, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.5326937642009594, |
|
"grad_norm": 0.948130190372467, |
|
"learning_rate": 2.3670000000000002e-05, |
|
"loss": 0.0706, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.5352183791971724, |
|
"grad_norm": 0.959032416343689, |
|
"learning_rate": 2.364e-05, |
|
"loss": 0.0684, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.5377429941933856, |
|
"grad_norm": 1.1859666109085083, |
|
"learning_rate": 2.3610000000000003e-05, |
|
"loss": 0.0757, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.5402676091895986, |
|
"grad_norm": 0.9001142978668213, |
|
"learning_rate": 2.358e-05, |
|
"loss": 0.079, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.5427922241858116, |
|
"grad_norm": 0.47399717569351196, |
|
"learning_rate": 2.3550000000000003e-05, |
|
"loss": 0.0726, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.5427922241858116, |
|
"eval_f1": 0.611992731677771, |
|
"eval_loss": 0.05662121623754501, |
|
"eval_runtime": 1151.3771, |
|
"eval_samples_per_second": 179.144, |
|
"eval_steps_per_second": 2.799, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.5453168391820248, |
|
"grad_norm": 0.6292353272438049, |
|
"learning_rate": 2.3520000000000002e-05, |
|
"loss": 0.0677, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.5478414541782378, |
|
"grad_norm": 0.7090362906455994, |
|
"learning_rate": 2.349e-05, |
|
"loss": 0.0703, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.5503660691744509, |
|
"grad_norm": 0.6082953810691833, |
|
"learning_rate": 2.3460000000000002e-05, |
|
"loss": 0.0672, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.552890684170664, |
|
"grad_norm": 0.5937643051147461, |
|
"learning_rate": 2.343e-05, |
|
"loss": 0.0686, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.555415299166877, |
|
"grad_norm": 0.7394770979881287, |
|
"learning_rate": 2.3400000000000003e-05, |
|
"loss": 0.0731, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.555415299166877, |
|
"eval_f1": 0.6111780293905084, |
|
"eval_loss": 0.05852247402071953, |
|
"eval_runtime": 1153.1003, |
|
"eval_samples_per_second": 178.876, |
|
"eval_steps_per_second": 2.795, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.5579399141630901, |
|
"grad_norm": 0.7641323804855347, |
|
"learning_rate": 2.337e-05, |
|
"loss": 0.0732, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.5604645291593032, |
|
"grad_norm": 0.8567935824394226, |
|
"learning_rate": 2.334e-05, |
|
"loss": 0.0599, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.5629891441555163, |
|
"grad_norm": 0.9106941819190979, |
|
"learning_rate": 2.3310000000000002e-05, |
|
"loss": 0.0593, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.5655137591517294, |
|
"grad_norm": 1.5944632291793823, |
|
"learning_rate": 2.328e-05, |
|
"loss": 0.0669, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.5680383741479424, |
|
"grad_norm": 0.9120457768440247, |
|
"learning_rate": 2.3250000000000003e-05, |
|
"loss": 0.0722, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.5680383741479424, |
|
"eval_f1": 0.6139676730710583, |
|
"eval_loss": 0.05887339636683464, |
|
"eval_runtime": 1155.9087, |
|
"eval_samples_per_second": 178.441, |
|
"eval_steps_per_second": 2.788, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.5705629891441555, |
|
"grad_norm": 0.8505953550338745, |
|
"learning_rate": 2.322e-05, |
|
"loss": 0.0863, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.5730876041403686, |
|
"grad_norm": 0.9573137164115906, |
|
"learning_rate": 2.319e-05, |
|
"loss": 0.0712, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.5756122191365817, |
|
"grad_norm": 1.230735182762146, |
|
"learning_rate": 2.3160000000000002e-05, |
|
"loss": 0.0677, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.5781368341327947, |
|
"grad_norm": 1.203621745109558, |
|
"learning_rate": 2.313e-05, |
|
"loss": 0.0634, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.5806614491290079, |
|
"grad_norm": 1.3590195178985596, |
|
"learning_rate": 2.3100000000000002e-05, |
|
"loss": 0.0819, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.5806614491290079, |
|
"eval_f1": 0.6121980676328502, |
|
"eval_loss": 0.050494007766246796, |
|
"eval_runtime": 1153.6589, |
|
"eval_samples_per_second": 178.789, |
|
"eval_steps_per_second": 2.794, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.5831860641252209, |
|
"grad_norm": 0.8538402318954468, |
|
"learning_rate": 2.307e-05, |
|
"loss": 0.0674, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.5857106791214339, |
|
"grad_norm": 1.1863012313842773, |
|
"learning_rate": 2.304e-05, |
|
"loss": 0.0665, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.5882352941176471, |
|
"grad_norm": 1.0120714902877808, |
|
"learning_rate": 2.301e-05, |
|
"loss": 0.0675, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.5907599091138601, |
|
"grad_norm": 0.8394482135772705, |
|
"learning_rate": 2.298e-05, |
|
"loss": 0.0812, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.5932845241100733, |
|
"grad_norm": 0.8855767250061035, |
|
"learning_rate": 2.2950000000000002e-05, |
|
"loss": 0.0694, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.5932845241100733, |
|
"eval_f1": 0.6101251634597422, |
|
"eval_loss": 0.053731031715869904, |
|
"eval_runtime": 1147.8424, |
|
"eval_samples_per_second": 179.695, |
|
"eval_steps_per_second": 2.808, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.5958091391062863, |
|
"grad_norm": 1.241045594215393, |
|
"learning_rate": 2.292e-05, |
|
"loss": 0.0646, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.5983337541024993, |
|
"grad_norm": 2.065401315689087, |
|
"learning_rate": 2.289e-05, |
|
"loss": 0.0792, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.6008583690987125, |
|
"grad_norm": 1.0024877786636353, |
|
"learning_rate": 2.286e-05, |
|
"loss": 0.0751, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.6033829840949255, |
|
"grad_norm": 0.4943256080150604, |
|
"learning_rate": 2.283e-05, |
|
"loss": 0.076, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.6059075990911386, |
|
"grad_norm": 1.0907814502716064, |
|
"learning_rate": 2.2800000000000002e-05, |
|
"loss": 0.0705, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.6059075990911386, |
|
"eval_f1": 0.6130196664177247, |
|
"eval_loss": 0.06461644172668457, |
|
"eval_runtime": 1149.8253, |
|
"eval_samples_per_second": 179.386, |
|
"eval_steps_per_second": 2.803, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.6084322140873517, |
|
"grad_norm": 1.1304162740707397, |
|
"learning_rate": 2.277e-05, |
|
"loss": 0.0548, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.6109568290835647, |
|
"grad_norm": 1.3394097089767456, |
|
"learning_rate": 2.274e-05, |
|
"loss": 0.0607, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.6134814440797778, |
|
"grad_norm": 0.5467960834503174, |
|
"learning_rate": 2.271e-05, |
|
"loss": 0.0701, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.6160060590759909, |
|
"grad_norm": 0.5510517954826355, |
|
"learning_rate": 2.268e-05, |
|
"loss": 0.0725, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.618530674072204, |
|
"grad_norm": 0.7682734131813049, |
|
"learning_rate": 2.265e-05, |
|
"loss": 0.0702, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.618530674072204, |
|
"eval_f1": 0.6124447065762312, |
|
"eval_loss": 0.046234920620918274, |
|
"eval_runtime": 1146.4615, |
|
"eval_samples_per_second": 179.912, |
|
"eval_steps_per_second": 2.811, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.6210552890684171, |
|
"grad_norm": 0.7578818798065186, |
|
"learning_rate": 2.262e-05, |
|
"loss": 0.0703, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.6235799040646302, |
|
"grad_norm": 0.7244108319282532, |
|
"learning_rate": 2.2590000000000002e-05, |
|
"loss": 0.0635, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.6261045190608432, |
|
"grad_norm": 1.1047908067703247, |
|
"learning_rate": 2.256e-05, |
|
"loss": 0.0614, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.6286291340570563, |
|
"grad_norm": 1.0824987888336182, |
|
"learning_rate": 2.253e-05, |
|
"loss": 0.081, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.6311537490532694, |
|
"grad_norm": 1.9344598054885864, |
|
"learning_rate": 2.25e-05, |
|
"loss": 0.0709, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.6311537490532694, |
|
"eval_f1": 0.6133951445650848, |
|
"eval_loss": 0.04044894501566887, |
|
"eval_runtime": 1148.0724, |
|
"eval_samples_per_second": 179.659, |
|
"eval_steps_per_second": 2.807, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.6336783640494824, |
|
"grad_norm": 1.2797091007232666, |
|
"learning_rate": 2.247e-05, |
|
"loss": 0.072, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.6362029790456956, |
|
"grad_norm": 0.7228933572769165, |
|
"learning_rate": 2.2440000000000002e-05, |
|
"loss": 0.071, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.6387275940419086, |
|
"grad_norm": 0.9655591249465942, |
|
"learning_rate": 2.241e-05, |
|
"loss": 0.0611, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.6412522090381216, |
|
"grad_norm": 0.9924450516700745, |
|
"learning_rate": 2.238e-05, |
|
"loss": 0.0676, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.6437768240343348, |
|
"grad_norm": 1.12591552734375, |
|
"learning_rate": 2.235e-05, |
|
"loss": 0.0804, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.6437768240343348, |
|
"eval_f1": 0.612305676335696, |
|
"eval_loss": 0.04778852313756943, |
|
"eval_runtime": 1160.4576, |
|
"eval_samples_per_second": 177.742, |
|
"eval_steps_per_second": 2.777, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.6463014390305478, |
|
"grad_norm": 0.7478006482124329, |
|
"learning_rate": 2.232e-05, |
|
"loss": 0.0638, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.648826054026761, |
|
"grad_norm": 0.7661213874816895, |
|
"learning_rate": 2.2290000000000002e-05, |
|
"loss": 0.0632, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.651350669022974, |
|
"grad_norm": 0.9824168086051941, |
|
"learning_rate": 2.226e-05, |
|
"loss": 0.0602, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.653875284019187, |
|
"grad_norm": 1.1700901985168457, |
|
"learning_rate": 2.223e-05, |
|
"loss": 0.0714, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.6563998990154002, |
|
"grad_norm": 0.8846214413642883, |
|
"learning_rate": 2.22e-05, |
|
"loss": 0.0666, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.6563998990154002, |
|
"eval_f1": 0.6104417670682731, |
|
"eval_loss": 0.04546576738357544, |
|
"eval_runtime": 1160.1326, |
|
"eval_samples_per_second": 177.792, |
|
"eval_steps_per_second": 2.778, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.6589245140116132, |
|
"grad_norm": 0.7641239166259766, |
|
"learning_rate": 2.217e-05, |
|
"loss": 0.058, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.6614491290078263, |
|
"grad_norm": 0.5828648209571838, |
|
"learning_rate": 2.214e-05, |
|
"loss": 0.0686, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.6639737440040394, |
|
"grad_norm": 0.6906914710998535, |
|
"learning_rate": 2.211e-05, |
|
"loss": 0.0764, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.6664983590002524, |
|
"grad_norm": 1.3137489557266235, |
|
"learning_rate": 2.208e-05, |
|
"loss": 0.0768, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.6690229739964655, |
|
"grad_norm": 0.863865077495575, |
|
"learning_rate": 2.205e-05, |
|
"loss": 0.0749, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.6690229739964655, |
|
"eval_f1": 0.6131900703964431, |
|
"eval_loss": 0.04790908098220825, |
|
"eval_runtime": 1162.4462, |
|
"eval_samples_per_second": 177.438, |
|
"eval_steps_per_second": 2.773, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.6715475889926786, |
|
"grad_norm": 0.9182652235031128, |
|
"learning_rate": 2.202e-05, |
|
"loss": 0.0625, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.6740722039888917, |
|
"grad_norm": 1.4961283206939697, |
|
"learning_rate": 2.199e-05, |
|
"loss": 0.0726, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.6765968189851048, |
|
"grad_norm": 0.7803681492805481, |
|
"learning_rate": 2.196e-05, |
|
"loss": 0.0669, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.6791214339813179, |
|
"grad_norm": 1.0371824502944946, |
|
"learning_rate": 2.193e-05, |
|
"loss": 0.0566, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.6816460489775309, |
|
"grad_norm": 1.1832714080810547, |
|
"learning_rate": 2.19e-05, |
|
"loss": 0.067, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.6816460489775309, |
|
"eval_f1": 0.6132461161079312, |
|
"eval_loss": 0.055793602019548416, |
|
"eval_runtime": 1161.8914, |
|
"eval_samples_per_second": 177.523, |
|
"eval_steps_per_second": 2.774, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.684170663973744, |
|
"grad_norm": 0.7899573445320129, |
|
"learning_rate": 2.187e-05, |
|
"loss": 0.0763, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.6866952789699571, |
|
"grad_norm": 1.4638808965682983, |
|
"learning_rate": 2.184e-05, |
|
"loss": 0.0768, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.6892198939661701, |
|
"grad_norm": 0.7547538876533508, |
|
"learning_rate": 2.181e-05, |
|
"loss": 0.0761, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.6917445089623833, |
|
"grad_norm": 0.5143932700157166, |
|
"learning_rate": 2.178e-05, |
|
"loss": 0.0808, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.6942691239585963, |
|
"grad_norm": 1.011730432510376, |
|
"learning_rate": 2.175e-05, |
|
"loss": 0.068, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.6942691239585963, |
|
"eval_f1": 0.6108202443280978, |
|
"eval_loss": 0.053855251520872116, |
|
"eval_runtime": 1160.3338, |
|
"eval_samples_per_second": 177.761, |
|
"eval_steps_per_second": 2.778, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.6942691239585963, |
|
"step": 2750, |
|
"total_flos": 1.3639932886745088e+19, |
|
"train_loss": 0.019194319985129618, |
|
"train_runtime": 18605.0451, |
|
"train_samples_per_second": 34.399, |
|
"train_steps_per_second": 0.537 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 10000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.3639932886745088e+19, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|