SLM_vs_LLM_experiments
/
max_seq_length_128_experiments
/LoRA
/google
/gemma_7b_LoRA_coastalcph
/lex_glue_ledgar
/trainer_state.json
{ | |
"best_metric": 0.5041437745094299, | |
"best_model_checkpoint": "../experiments_checkpoints/LoRA/google/gemma_7b_LoRA_coastalcph/lex_glue_ledgar/checkpoint-3700", | |
"epoch": 3.0, | |
"eval_steps": 100, | |
"global_step": 5625, | |
"is_hyper_param_search": false, | |
"is_local_process_zero": true, | |
"is_world_process_zero": true, | |
"log_history": [ | |
{ | |
"epoch": 0.01, | |
"grad_norm": 227.8190460205078, | |
"learning_rate": 4.991111111111111e-05, | |
"loss": 12.0078, | |
"step": 10 | |
}, | |
{ | |
"epoch": 0.01, | |
"grad_norm": 106.68264770507812, | |
"learning_rate": 4.982222222222222e-05, | |
"loss": 5.4906, | |
"step": 20 | |
}, | |
{ | |
"epoch": 0.02, | |
"grad_norm": 67.2668685913086, | |
"learning_rate": 4.973333333333334e-05, | |
"loss": 3.8859, | |
"step": 30 | |
}, | |
{ | |
"epoch": 0.02, | |
"grad_norm": 50.982364654541016, | |
"learning_rate": 4.964444444444445e-05, | |
"loss": 2.541, | |
"step": 40 | |
}, | |
{ | |
"epoch": 0.03, | |
"grad_norm": 60.637367248535156, | |
"learning_rate": 4.955555555555556e-05, | |
"loss": 2.2738, | |
"step": 50 | |
}, | |
{ | |
"epoch": 0.03, | |
"grad_norm": 52.631439208984375, | |
"learning_rate": 4.9466666666666665e-05, | |
"loss": 1.6229, | |
"step": 60 | |
}, | |
{ | |
"epoch": 0.04, | |
"grad_norm": 39.47542953491211, | |
"learning_rate": 4.9377777777777776e-05, | |
"loss": 1.7955, | |
"step": 70 | |
}, | |
{ | |
"epoch": 0.04, | |
"grad_norm": 53.9116096496582, | |
"learning_rate": 4.928888888888889e-05, | |
"loss": 1.263, | |
"step": 80 | |
}, | |
{ | |
"epoch": 0.05, | |
"grad_norm": 52.852882385253906, | |
"learning_rate": 4.92e-05, | |
"loss": 1.2939, | |
"step": 90 | |
}, | |
{ | |
"epoch": 0.05, | |
"grad_norm": 41.01764678955078, | |
"learning_rate": 4.9111111111111114e-05, | |
"loss": 1.3725, | |
"step": 100 | |
}, | |
{ | |
"epoch": 0.05, | |
"eval_accuracy": 0.6864, | |
"eval_f1_macro": 0.5157367163795201, | |
"eval_f1_micro": 0.6864, | |
"eval_loss": 1.3878281116485596, | |
"eval_runtime": 150.4086, | |
"eval_samples_per_second": 66.486, | |
"eval_steps_per_second": 2.081, | |
"step": 100 | |
}, | |
{ | |
"epoch": 0.06, | |
"grad_norm": 24.78645896911621, | |
"learning_rate": 4.9022222222222224e-05, | |
"loss": 1.4604, | |
"step": 110 | |
}, | |
{ | |
"epoch": 0.06, | |
"grad_norm": 33.56829071044922, | |
"learning_rate": 4.8933333333333335e-05, | |
"loss": 1.2117, | |
"step": 120 | |
}, | |
{ | |
"epoch": 0.07, | |
"grad_norm": 44.52119827270508, | |
"learning_rate": 4.8844444444444445e-05, | |
"loss": 1.1012, | |
"step": 130 | |
}, | |
{ | |
"epoch": 0.07, | |
"grad_norm": 38.966400146484375, | |
"learning_rate": 4.875555555555556e-05, | |
"loss": 1.3106, | |
"step": 140 | |
}, | |
{ | |
"epoch": 0.08, | |
"grad_norm": 44.13960647583008, | |
"learning_rate": 4.866666666666667e-05, | |
"loss": 1.1914, | |
"step": 150 | |
}, | |
{ | |
"epoch": 0.09, | |
"grad_norm": 38.23433303833008, | |
"learning_rate": 4.8577777777777776e-05, | |
"loss": 1.2592, | |
"step": 160 | |
}, | |
{ | |
"epoch": 0.09, | |
"grad_norm": 36.56868362426758, | |
"learning_rate": 4.848888888888889e-05, | |
"loss": 1.0251, | |
"step": 170 | |
}, | |
{ | |
"epoch": 0.1, | |
"grad_norm": 33.46715545654297, | |
"learning_rate": 4.8400000000000004e-05, | |
"loss": 1.0881, | |
"step": 180 | |
}, | |
{ | |
"epoch": 0.1, | |
"grad_norm": 33.97274398803711, | |
"learning_rate": 4.8311111111111115e-05, | |
"loss": 1.0591, | |
"step": 190 | |
}, | |
{ | |
"epoch": 0.11, | |
"grad_norm": 42.04470443725586, | |
"learning_rate": 4.8222222222222225e-05, | |
"loss": 1.3256, | |
"step": 200 | |
}, | |
{ | |
"epoch": 0.11, | |
"eval_accuracy": 0.7615, | |
"eval_f1_macro": 0.6078459646070684, | |
"eval_f1_micro": 0.7615, | |
"eval_loss": 1.0876250267028809, | |
"eval_runtime": 150.0387, | |
"eval_samples_per_second": 66.649, | |
"eval_steps_per_second": 2.086, | |
"step": 200 | |
}, | |
{ | |
"epoch": 0.11, | |
"grad_norm": 43.14711380004883, | |
"learning_rate": 4.8133333333333336e-05, | |
"loss": 1.1843, | |
"step": 210 | |
}, | |
{ | |
"epoch": 0.12, | |
"grad_norm": 39.68747329711914, | |
"learning_rate": 4.8044444444444446e-05, | |
"loss": 1.208, | |
"step": 220 | |
}, | |
{ | |
"epoch": 0.12, | |
"grad_norm": 34.513919830322266, | |
"learning_rate": 4.7955555555555556e-05, | |
"loss": 1.0581, | |
"step": 230 | |
}, | |
{ | |
"epoch": 0.13, | |
"grad_norm": 37.228477478027344, | |
"learning_rate": 4.7866666666666674e-05, | |
"loss": 1.0649, | |
"step": 240 | |
}, | |
{ | |
"epoch": 0.13, | |
"grad_norm": 52.42377471923828, | |
"learning_rate": 4.7777777777777784e-05, | |
"loss": 1.1228, | |
"step": 250 | |
}, | |
{ | |
"epoch": 0.14, | |
"grad_norm": 40.35560607910156, | |
"learning_rate": 4.768888888888889e-05, | |
"loss": 1.049, | |
"step": 260 | |
}, | |
{ | |
"epoch": 0.14, | |
"grad_norm": 41.43683624267578, | |
"learning_rate": 4.76e-05, | |
"loss": 1.0151, | |
"step": 270 | |
}, | |
{ | |
"epoch": 0.15, | |
"grad_norm": 23.949188232421875, | |
"learning_rate": 4.751111111111111e-05, | |
"loss": 1.0386, | |
"step": 280 | |
}, | |
{ | |
"epoch": 0.15, | |
"grad_norm": 43.994598388671875, | |
"learning_rate": 4.7422222222222226e-05, | |
"loss": 1.0314, | |
"step": 290 | |
}, | |
{ | |
"epoch": 0.16, | |
"grad_norm": 39.607051849365234, | |
"learning_rate": 4.7333333333333336e-05, | |
"loss": 0.9681, | |
"step": 300 | |
}, | |
{ | |
"epoch": 0.16, | |
"eval_accuracy": 0.7699, | |
"eval_f1_macro": 0.6452214349790191, | |
"eval_f1_micro": 0.7699, | |
"eval_loss": 0.9516304731369019, | |
"eval_runtime": 150.0454, | |
"eval_samples_per_second": 66.646, | |
"eval_steps_per_second": 2.086, | |
"step": 300 | |
}, | |
{ | |
"epoch": 0.17, | |
"grad_norm": 33.548194885253906, | |
"learning_rate": 4.724444444444445e-05, | |
"loss": 0.9143, | |
"step": 310 | |
}, | |
{ | |
"epoch": 0.17, | |
"grad_norm": 28.681020736694336, | |
"learning_rate": 4.715555555555556e-05, | |
"loss": 0.8288, | |
"step": 320 | |
}, | |
{ | |
"epoch": 0.18, | |
"grad_norm": 39.852054595947266, | |
"learning_rate": 4.706666666666667e-05, | |
"loss": 1.1122, | |
"step": 330 | |
}, | |
{ | |
"epoch": 0.18, | |
"grad_norm": 33.49698257446289, | |
"learning_rate": 4.6977777777777785e-05, | |
"loss": 0.9923, | |
"step": 340 | |
}, | |
{ | |
"epoch": 0.19, | |
"grad_norm": 47.75785446166992, | |
"learning_rate": 4.6888888888888895e-05, | |
"loss": 0.9374, | |
"step": 350 | |
}, | |
{ | |
"epoch": 0.19, | |
"grad_norm": 39.51451873779297, | |
"learning_rate": 4.6800000000000006e-05, | |
"loss": 1.0203, | |
"step": 360 | |
}, | |
{ | |
"epoch": 0.2, | |
"grad_norm": 35.28321075439453, | |
"learning_rate": 4.671111111111111e-05, | |
"loss": 0.8589, | |
"step": 370 | |
}, | |
{ | |
"epoch": 0.2, | |
"grad_norm": 43.45527267456055, | |
"learning_rate": 4.662222222222222e-05, | |
"loss": 0.9698, | |
"step": 380 | |
}, | |
{ | |
"epoch": 0.21, | |
"grad_norm": 34.52985763549805, | |
"learning_rate": 4.653333333333334e-05, | |
"loss": 0.8177, | |
"step": 390 | |
}, | |
{ | |
"epoch": 0.21, | |
"grad_norm": 28.500659942626953, | |
"learning_rate": 4.644444444444445e-05, | |
"loss": 0.9094, | |
"step": 400 | |
}, | |
{ | |
"epoch": 0.21, | |
"eval_accuracy": 0.7893, | |
"eval_f1_macro": 0.6627964242367417, | |
"eval_f1_micro": 0.7893, | |
"eval_loss": 0.9403331875801086, | |
"eval_runtime": 149.9006, | |
"eval_samples_per_second": 66.711, | |
"eval_steps_per_second": 2.088, | |
"step": 400 | |
}, | |
{ | |
"epoch": 0.22, | |
"grad_norm": 40.63841247558594, | |
"learning_rate": 4.635555555555556e-05, | |
"loss": 0.973, | |
"step": 410 | |
}, | |
{ | |
"epoch": 0.22, | |
"grad_norm": 32.8942985534668, | |
"learning_rate": 4.626666666666667e-05, | |
"loss": 0.9819, | |
"step": 420 | |
}, | |
{ | |
"epoch": 0.23, | |
"grad_norm": 28.508481979370117, | |
"learning_rate": 4.617777777777778e-05, | |
"loss": 0.833, | |
"step": 430 | |
}, | |
{ | |
"epoch": 0.23, | |
"grad_norm": 33.688846588134766, | |
"learning_rate": 4.608888888888889e-05, | |
"loss": 0.6626, | |
"step": 440 | |
}, | |
{ | |
"epoch": 0.24, | |
"grad_norm": 42.95698928833008, | |
"learning_rate": 4.600000000000001e-05, | |
"loss": 0.8224, | |
"step": 450 | |
}, | |
{ | |
"epoch": 0.25, | |
"grad_norm": 26.850563049316406, | |
"learning_rate": 4.591111111111112e-05, | |
"loss": 0.9378, | |
"step": 460 | |
}, | |
{ | |
"epoch": 0.25, | |
"grad_norm": 29.557905197143555, | |
"learning_rate": 4.582222222222222e-05, | |
"loss": 1.0295, | |
"step": 470 | |
}, | |
{ | |
"epoch": 0.26, | |
"grad_norm": 20.479171752929688, | |
"learning_rate": 4.573333333333333e-05, | |
"loss": 0.885, | |
"step": 480 | |
}, | |
{ | |
"epoch": 0.26, | |
"grad_norm": 31.502513885498047, | |
"learning_rate": 4.564444444444444e-05, | |
"loss": 0.9045, | |
"step": 490 | |
}, | |
{ | |
"epoch": 0.27, | |
"grad_norm": 32.363094329833984, | |
"learning_rate": 4.555555555555556e-05, | |
"loss": 0.7715, | |
"step": 500 | |
}, | |
{ | |
"epoch": 0.27, | |
"eval_accuracy": 0.7896, | |
"eval_f1_macro": 0.668700376378601, | |
"eval_f1_micro": 0.7896, | |
"eval_loss": 0.8592824339866638, | |
"eval_runtime": 149.9317, | |
"eval_samples_per_second": 66.697, | |
"eval_steps_per_second": 2.088, | |
"step": 500 | |
}, | |
{ | |
"epoch": 0.27, | |
"grad_norm": 34.05838394165039, | |
"learning_rate": 4.546666666666667e-05, | |
"loss": 0.8179, | |
"step": 510 | |
}, | |
{ | |
"epoch": 0.28, | |
"grad_norm": 48.86190414428711, | |
"learning_rate": 4.537777777777778e-05, | |
"loss": 0.9052, | |
"step": 520 | |
}, | |
{ | |
"epoch": 0.28, | |
"grad_norm": 32.10374069213867, | |
"learning_rate": 4.528888888888889e-05, | |
"loss": 1.0262, | |
"step": 530 | |
}, | |
{ | |
"epoch": 0.29, | |
"grad_norm": 27.27006721496582, | |
"learning_rate": 4.52e-05, | |
"loss": 0.8747, | |
"step": 540 | |
}, | |
{ | |
"epoch": 0.29, | |
"grad_norm": 34.86284637451172, | |
"learning_rate": 4.511111111111112e-05, | |
"loss": 0.8681, | |
"step": 550 | |
}, | |
{ | |
"epoch": 0.3, | |
"grad_norm": 27.64435386657715, | |
"learning_rate": 4.502222222222223e-05, | |
"loss": 0.8249, | |
"step": 560 | |
}, | |
{ | |
"epoch": 0.3, | |
"grad_norm": 34.09676742553711, | |
"learning_rate": 4.493333333333333e-05, | |
"loss": 1.0702, | |
"step": 570 | |
}, | |
{ | |
"epoch": 0.31, | |
"grad_norm": 26.377086639404297, | |
"learning_rate": 4.484444444444444e-05, | |
"loss": 0.7343, | |
"step": 580 | |
}, | |
{ | |
"epoch": 0.31, | |
"grad_norm": 32.587154388427734, | |
"learning_rate": 4.475555555555555e-05, | |
"loss": 0.8207, | |
"step": 590 | |
}, | |
{ | |
"epoch": 0.32, | |
"grad_norm": 28.099170684814453, | |
"learning_rate": 4.466666666666667e-05, | |
"loss": 0.7244, | |
"step": 600 | |
}, | |
{ | |
"epoch": 0.32, | |
"eval_accuracy": 0.8061, | |
"eval_f1_macro": 0.694893382279633, | |
"eval_f1_micro": 0.8061, | |
"eval_loss": 0.7621132731437683, | |
"eval_runtime": 150.0447, | |
"eval_samples_per_second": 66.647, | |
"eval_steps_per_second": 2.086, | |
"step": 600 | |
}, | |
{ | |
"epoch": 0.33, | |
"grad_norm": 27.44232177734375, | |
"learning_rate": 4.457777777777778e-05, | |
"loss": 0.8029, | |
"step": 610 | |
}, | |
{ | |
"epoch": 0.33, | |
"grad_norm": 29.95503807067871, | |
"learning_rate": 4.448888888888889e-05, | |
"loss": 0.7812, | |
"step": 620 | |
}, | |
{ | |
"epoch": 0.34, | |
"grad_norm": 32.26255416870117, | |
"learning_rate": 4.44e-05, | |
"loss": 0.6953, | |
"step": 630 | |
}, | |
{ | |
"epoch": 0.34, | |
"grad_norm": 26.627235412597656, | |
"learning_rate": 4.431111111111111e-05, | |
"loss": 0.8677, | |
"step": 640 | |
}, | |
{ | |
"epoch": 0.35, | |
"grad_norm": 19.550811767578125, | |
"learning_rate": 4.422222222222222e-05, | |
"loss": 0.8226, | |
"step": 650 | |
}, | |
{ | |
"epoch": 0.35, | |
"grad_norm": 19.106870651245117, | |
"learning_rate": 4.413333333333334e-05, | |
"loss": 0.845, | |
"step": 660 | |
}, | |
{ | |
"epoch": 0.36, | |
"grad_norm": 31.620084762573242, | |
"learning_rate": 4.404444444444445e-05, | |
"loss": 0.7984, | |
"step": 670 | |
}, | |
{ | |
"epoch": 0.36, | |
"grad_norm": 32.98550796508789, | |
"learning_rate": 4.3955555555555554e-05, | |
"loss": 0.7198, | |
"step": 680 | |
}, | |
{ | |
"epoch": 0.37, | |
"grad_norm": 32.72222137451172, | |
"learning_rate": 4.3866666666666665e-05, | |
"loss": 0.9314, | |
"step": 690 | |
}, | |
{ | |
"epoch": 0.37, | |
"grad_norm": 36.1794319152832, | |
"learning_rate": 4.377777777777778e-05, | |
"loss": 0.7719, | |
"step": 700 | |
}, | |
{ | |
"epoch": 0.37, | |
"eval_accuracy": 0.7884, | |
"eval_f1_macro": 0.6863716720883178, | |
"eval_f1_micro": 0.7884, | |
"eval_loss": 0.8355345726013184, | |
"eval_runtime": 150.0583, | |
"eval_samples_per_second": 66.641, | |
"eval_steps_per_second": 2.086, | |
"step": 700 | |
}, | |
{ | |
"epoch": 0.38, | |
"grad_norm": 19.62013053894043, | |
"learning_rate": 4.368888888888889e-05, | |
"loss": 0.5494, | |
"step": 710 | |
}, | |
{ | |
"epoch": 0.38, | |
"grad_norm": 26.641956329345703, | |
"learning_rate": 4.36e-05, | |
"loss": 0.7919, | |
"step": 720 | |
}, | |
{ | |
"epoch": 0.39, | |
"grad_norm": 24.791227340698242, | |
"learning_rate": 4.351111111111111e-05, | |
"loss": 0.7154, | |
"step": 730 | |
}, | |
{ | |
"epoch": 0.39, | |
"grad_norm": 39.39951705932617, | |
"learning_rate": 4.3422222222222224e-05, | |
"loss": 0.6423, | |
"step": 740 | |
}, | |
{ | |
"epoch": 0.4, | |
"grad_norm": 35.64988327026367, | |
"learning_rate": 4.3333333333333334e-05, | |
"loss": 0.7939, | |
"step": 750 | |
}, | |
{ | |
"epoch": 0.41, | |
"grad_norm": 28.65671157836914, | |
"learning_rate": 4.324444444444445e-05, | |
"loss": 0.8428, | |
"step": 760 | |
}, | |
{ | |
"epoch": 0.41, | |
"grad_norm": 28.99781608581543, | |
"learning_rate": 4.315555555555556e-05, | |
"loss": 0.749, | |
"step": 770 | |
}, | |
{ | |
"epoch": 0.42, | |
"grad_norm": 29.291336059570312, | |
"learning_rate": 4.3066666666666665e-05, | |
"loss": 0.7318, | |
"step": 780 | |
}, | |
{ | |
"epoch": 0.42, | |
"grad_norm": 27.441404342651367, | |
"learning_rate": 4.2977777777777776e-05, | |
"loss": 0.7554, | |
"step": 790 | |
}, | |
{ | |
"epoch": 0.43, | |
"grad_norm": 10.821943283081055, | |
"learning_rate": 4.2888888888888886e-05, | |
"loss": 0.6305, | |
"step": 800 | |
}, | |
{ | |
"epoch": 0.43, | |
"eval_accuracy": 0.7897, | |
"eval_f1_macro": 0.6806730525287331, | |
"eval_f1_micro": 0.7897, | |
"eval_loss": 0.8542162179946899, | |
"eval_runtime": 149.8489, | |
"eval_samples_per_second": 66.734, | |
"eval_steps_per_second": 2.089, | |
"step": 800 | |
}, | |
{ | |
"epoch": 0.43, | |
"grad_norm": 32.4620246887207, | |
"learning_rate": 4.2800000000000004e-05, | |
"loss": 0.7627, | |
"step": 810 | |
}, | |
{ | |
"epoch": 0.44, | |
"grad_norm": 32.67604446411133, | |
"learning_rate": 4.2711111111111114e-05, | |
"loss": 0.8879, | |
"step": 820 | |
}, | |
{ | |
"epoch": 0.44, | |
"grad_norm": 22.831031799316406, | |
"learning_rate": 4.2622222222222224e-05, | |
"loss": 0.8407, | |
"step": 830 | |
}, | |
{ | |
"epoch": 0.45, | |
"grad_norm": 36.6854362487793, | |
"learning_rate": 4.2533333333333335e-05, | |
"loss": 0.7145, | |
"step": 840 | |
}, | |
{ | |
"epoch": 0.45, | |
"grad_norm": 36.49830627441406, | |
"learning_rate": 4.2444444444444445e-05, | |
"loss": 0.7768, | |
"step": 850 | |
}, | |
{ | |
"epoch": 0.46, | |
"grad_norm": 35.065948486328125, | |
"learning_rate": 4.235555555555556e-05, | |
"loss": 1.0117, | |
"step": 860 | |
}, | |
{ | |
"epoch": 0.46, | |
"grad_norm": 37.74482727050781, | |
"learning_rate": 4.226666666666667e-05, | |
"loss": 0.8462, | |
"step": 870 | |
}, | |
{ | |
"epoch": 0.47, | |
"grad_norm": 26.77570152282715, | |
"learning_rate": 4.217777777777778e-05, | |
"loss": 0.8096, | |
"step": 880 | |
}, | |
{ | |
"epoch": 0.47, | |
"grad_norm": 26.55797004699707, | |
"learning_rate": 4.208888888888889e-05, | |
"loss": 0.8612, | |
"step": 890 | |
}, | |
{ | |
"epoch": 0.48, | |
"grad_norm": 34.126625061035156, | |
"learning_rate": 4.2e-05, | |
"loss": 0.8793, | |
"step": 900 | |
}, | |
{ | |
"epoch": 0.48, | |
"eval_accuracy": 0.7935, | |
"eval_f1_macro": 0.6821808056398841, | |
"eval_f1_micro": 0.7935, | |
"eval_loss": 0.8042706847190857, | |
"eval_runtime": 150.0061, | |
"eval_samples_per_second": 66.664, | |
"eval_steps_per_second": 2.087, | |
"step": 900 | |
}, | |
{ | |
"epoch": 0.49, | |
"grad_norm": 28.812938690185547, | |
"learning_rate": 4.1911111111111115e-05, | |
"loss": 0.8599, | |
"step": 910 | |
}, | |
{ | |
"epoch": 0.49, | |
"grad_norm": 25.103418350219727, | |
"learning_rate": 4.1822222222222225e-05, | |
"loss": 0.6823, | |
"step": 920 | |
}, | |
{ | |
"epoch": 0.5, | |
"grad_norm": 22.762414932250977, | |
"learning_rate": 4.1733333333333336e-05, | |
"loss": 0.6981, | |
"step": 930 | |
}, | |
{ | |
"epoch": 0.5, | |
"grad_norm": 27.674386978149414, | |
"learning_rate": 4.1644444444444446e-05, | |
"loss": 0.6647, | |
"step": 940 | |
}, | |
{ | |
"epoch": 0.51, | |
"grad_norm": 30.835783004760742, | |
"learning_rate": 4.155555555555556e-05, | |
"loss": 0.8359, | |
"step": 950 | |
}, | |
{ | |
"epoch": 0.51, | |
"grad_norm": 27.273395538330078, | |
"learning_rate": 4.146666666666667e-05, | |
"loss": 0.8044, | |
"step": 960 | |
}, | |
{ | |
"epoch": 0.52, | |
"grad_norm": 28.3951416015625, | |
"learning_rate": 4.1377777777777784e-05, | |
"loss": 0.8517, | |
"step": 970 | |
}, | |
{ | |
"epoch": 0.52, | |
"grad_norm": 29.438312530517578, | |
"learning_rate": 4.1288888888888895e-05, | |
"loss": 0.8037, | |
"step": 980 | |
}, | |
{ | |
"epoch": 0.53, | |
"grad_norm": 34.72230529785156, | |
"learning_rate": 4.12e-05, | |
"loss": 0.8013, | |
"step": 990 | |
}, | |
{ | |
"epoch": 0.53, | |
"grad_norm": 32.5698127746582, | |
"learning_rate": 4.111111111111111e-05, | |
"loss": 0.7411, | |
"step": 1000 | |
}, | |
{ | |
"epoch": 0.53, | |
"eval_accuracy": 0.8072, | |
"eval_f1_macro": 0.6939988529805743, | |
"eval_f1_micro": 0.8072, | |
"eval_loss": 0.7256324291229248, | |
"eval_runtime": 150.0248, | |
"eval_samples_per_second": 66.656, | |
"eval_steps_per_second": 2.086, | |
"step": 1000 | |
}, | |
{ | |
"epoch": 0.54, | |
"grad_norm": 28.614532470703125, | |
"learning_rate": 4.1022222222222226e-05, | |
"loss": 0.733, | |
"step": 1010 | |
}, | |
{ | |
"epoch": 0.54, | |
"grad_norm": 30.533872604370117, | |
"learning_rate": 4.093333333333334e-05, | |
"loss": 0.7474, | |
"step": 1020 | |
}, | |
{ | |
"epoch": 0.55, | |
"grad_norm": 29.524789810180664, | |
"learning_rate": 4.084444444444445e-05, | |
"loss": 0.8004, | |
"step": 1030 | |
}, | |
{ | |
"epoch": 0.55, | |
"grad_norm": 52.84124755859375, | |
"learning_rate": 4.075555555555556e-05, | |
"loss": 0.8063, | |
"step": 1040 | |
}, | |
{ | |
"epoch": 0.56, | |
"grad_norm": 31.382856369018555, | |
"learning_rate": 4.066666666666667e-05, | |
"loss": 0.7358, | |
"step": 1050 | |
}, | |
{ | |
"epoch": 0.57, | |
"grad_norm": 28.268238067626953, | |
"learning_rate": 4.057777777777778e-05, | |
"loss": 0.6383, | |
"step": 1060 | |
}, | |
{ | |
"epoch": 0.57, | |
"grad_norm": 28.795692443847656, | |
"learning_rate": 4.0488888888888896e-05, | |
"loss": 0.7642, | |
"step": 1070 | |
}, | |
{ | |
"epoch": 0.58, | |
"grad_norm": 24.153024673461914, | |
"learning_rate": 4.0400000000000006e-05, | |
"loss": 0.6057, | |
"step": 1080 | |
}, | |
{ | |
"epoch": 0.58, | |
"grad_norm": 32.658329010009766, | |
"learning_rate": 4.031111111111111e-05, | |
"loss": 0.8125, | |
"step": 1090 | |
}, | |
{ | |
"epoch": 0.59, | |
"grad_norm": 14.572766304016113, | |
"learning_rate": 4.022222222222222e-05, | |
"loss": 0.6403, | |
"step": 1100 | |
}, | |
{ | |
"epoch": 0.59, | |
"eval_accuracy": 0.819, | |
"eval_f1_macro": 0.7216680126270462, | |
"eval_f1_micro": 0.819, | |
"eval_loss": 0.7033218741416931, | |
"eval_runtime": 149.9473, | |
"eval_samples_per_second": 66.69, | |
"eval_steps_per_second": 2.087, | |
"step": 1100 | |
}, | |
{ | |
"epoch": 0.59, | |
"grad_norm": 19.96935272216797, | |
"learning_rate": 4.013333333333333e-05, | |
"loss": 0.5791, | |
"step": 1110 | |
}, | |
{ | |
"epoch": 0.6, | |
"grad_norm": 30.314918518066406, | |
"learning_rate": 4.004444444444445e-05, | |
"loss": 0.6803, | |
"step": 1120 | |
}, | |
{ | |
"epoch": 0.6, | |
"grad_norm": 36.90558624267578, | |
"learning_rate": 3.995555555555556e-05, | |
"loss": 0.5809, | |
"step": 1130 | |
}, | |
{ | |
"epoch": 0.61, | |
"grad_norm": 38.08405303955078, | |
"learning_rate": 3.986666666666667e-05, | |
"loss": 0.7575, | |
"step": 1140 | |
}, | |
{ | |
"epoch": 0.61, | |
"grad_norm": 25.463375091552734, | |
"learning_rate": 3.977777777777778e-05, | |
"loss": 0.6668, | |
"step": 1150 | |
}, | |
{ | |
"epoch": 0.62, | |
"grad_norm": 30.448307037353516, | |
"learning_rate": 3.968888888888889e-05, | |
"loss": 0.6106, | |
"step": 1160 | |
}, | |
{ | |
"epoch": 0.62, | |
"grad_norm": 27.176774978637695, | |
"learning_rate": 3.960000000000001e-05, | |
"loss": 0.7375, | |
"step": 1170 | |
}, | |
{ | |
"epoch": 0.63, | |
"grad_norm": 29.381431579589844, | |
"learning_rate": 3.951111111111112e-05, | |
"loss": 0.7824, | |
"step": 1180 | |
}, | |
{ | |
"epoch": 0.63, | |
"grad_norm": 30.908754348754883, | |
"learning_rate": 3.942222222222222e-05, | |
"loss": 0.6832, | |
"step": 1190 | |
}, | |
{ | |
"epoch": 0.64, | |
"grad_norm": 34.62039566040039, | |
"learning_rate": 3.933333333333333e-05, | |
"loss": 0.6971, | |
"step": 1200 | |
}, | |
{ | |
"epoch": 0.64, | |
"eval_accuracy": 0.8159, | |
"eval_f1_macro": 0.7334649863076415, | |
"eval_f1_micro": 0.8159, | |
"eval_loss": 0.7008675932884216, | |
"eval_runtime": 150.0312, | |
"eval_samples_per_second": 66.653, | |
"eval_steps_per_second": 2.086, | |
"step": 1200 | |
}, | |
{ | |
"epoch": 0.65, | |
"grad_norm": 33.131656646728516, | |
"learning_rate": 3.924444444444444e-05, | |
"loss": 0.6262, | |
"step": 1210 | |
}, | |
{ | |
"epoch": 0.65, | |
"grad_norm": 22.17938804626465, | |
"learning_rate": 3.915555555555556e-05, | |
"loss": 0.7002, | |
"step": 1220 | |
}, | |
{ | |
"epoch": 0.66, | |
"grad_norm": 38.93252182006836, | |
"learning_rate": 3.906666666666667e-05, | |
"loss": 0.7796, | |
"step": 1230 | |
}, | |
{ | |
"epoch": 0.66, | |
"grad_norm": 32.47177505493164, | |
"learning_rate": 3.897777777777778e-05, | |
"loss": 0.579, | |
"step": 1240 | |
}, | |
{ | |
"epoch": 0.67, | |
"grad_norm": 28.713239669799805, | |
"learning_rate": 3.888888888888889e-05, | |
"loss": 0.7154, | |
"step": 1250 | |
}, | |
{ | |
"epoch": 0.67, | |
"grad_norm": 26.406848907470703, | |
"learning_rate": 3.88e-05, | |
"loss": 0.7948, | |
"step": 1260 | |
}, | |
{ | |
"epoch": 0.68, | |
"grad_norm": 34.53367233276367, | |
"learning_rate": 3.871111111111111e-05, | |
"loss": 0.5775, | |
"step": 1270 | |
}, | |
{ | |
"epoch": 0.68, | |
"grad_norm": 27.183706283569336, | |
"learning_rate": 3.862222222222223e-05, | |
"loss": 0.7497, | |
"step": 1280 | |
}, | |
{ | |
"epoch": 0.69, | |
"grad_norm": 34.97861099243164, | |
"learning_rate": 3.853333333333334e-05, | |
"loss": 0.5953, | |
"step": 1290 | |
}, | |
{ | |
"epoch": 0.69, | |
"grad_norm": 28.780019760131836, | |
"learning_rate": 3.844444444444444e-05, | |
"loss": 0.7053, | |
"step": 1300 | |
}, | |
{ | |
"epoch": 0.69, | |
"eval_accuracy": 0.8291, | |
"eval_f1_macro": 0.7205458318365534, | |
"eval_f1_micro": 0.8291, | |
"eval_loss": 0.6921299695968628, | |
"eval_runtime": 150.1334, | |
"eval_samples_per_second": 66.607, | |
"eval_steps_per_second": 2.085, | |
"step": 1300 | |
}, | |
{ | |
"epoch": 0.7, | |
"grad_norm": 15.419646263122559, | |
"learning_rate": 3.8355555555555553e-05, | |
"loss": 0.5795, | |
"step": 1310 | |
}, | |
{ | |
"epoch": 0.7, | |
"grad_norm": 32.80424880981445, | |
"learning_rate": 3.8266666666666664e-05, | |
"loss": 0.8922, | |
"step": 1320 | |
}, | |
{ | |
"epoch": 0.71, | |
"grad_norm": 17.997528076171875, | |
"learning_rate": 3.817777777777778e-05, | |
"loss": 0.6339, | |
"step": 1330 | |
}, | |
{ | |
"epoch": 0.71, | |
"grad_norm": 28.21483612060547, | |
"learning_rate": 3.808888888888889e-05, | |
"loss": 0.825, | |
"step": 1340 | |
}, | |
{ | |
"epoch": 0.72, | |
"grad_norm": 30.419662475585938, | |
"learning_rate": 3.8e-05, | |
"loss": 0.6544, | |
"step": 1350 | |
}, | |
{ | |
"epoch": 0.73, | |
"grad_norm": 22.456350326538086, | |
"learning_rate": 3.791111111111111e-05, | |
"loss": 0.5786, | |
"step": 1360 | |
}, | |
{ | |
"epoch": 0.73, | |
"grad_norm": 47.38570785522461, | |
"learning_rate": 3.782222222222222e-05, | |
"loss": 0.7092, | |
"step": 1370 | |
}, | |
{ | |
"epoch": 0.74, | |
"grad_norm": 25.527708053588867, | |
"learning_rate": 3.773333333333334e-05, | |
"loss": 0.7458, | |
"step": 1380 | |
}, | |
{ | |
"epoch": 0.74, | |
"grad_norm": 19.75039291381836, | |
"learning_rate": 3.764444444444445e-05, | |
"loss": 0.6571, | |
"step": 1390 | |
}, | |
{ | |
"epoch": 0.75, | |
"grad_norm": 19.110685348510742, | |
"learning_rate": 3.7555555555555554e-05, | |
"loss": 0.6413, | |
"step": 1400 | |
}, | |
{ | |
"epoch": 0.75, | |
"eval_accuracy": 0.8301, | |
"eval_f1_macro": 0.729154652044555, | |
"eval_f1_micro": 0.8301, | |
"eval_loss": 0.6514862179756165, | |
"eval_runtime": 150.1615, | |
"eval_samples_per_second": 66.595, | |
"eval_steps_per_second": 2.084, | |
"step": 1400 | |
}, | |
{ | |
"epoch": 0.75, | |
"grad_norm": 29.326581954956055, | |
"learning_rate": 3.7466666666666665e-05, | |
"loss": 0.6138, | |
"step": 1410 | |
}, | |
{ | |
"epoch": 0.76, | |
"grad_norm": 41.61116409301758, | |
"learning_rate": 3.7377777777777775e-05, | |
"loss": 0.5805, | |
"step": 1420 | |
}, | |
{ | |
"epoch": 0.76, | |
"grad_norm": 33.782711029052734, | |
"learning_rate": 3.728888888888889e-05, | |
"loss": 0.6785, | |
"step": 1430 | |
}, | |
{ | |
"epoch": 0.77, | |
"grad_norm": 33.50584030151367, | |
"learning_rate": 3.72e-05, | |
"loss": 0.6461, | |
"step": 1440 | |
}, | |
{ | |
"epoch": 0.77, | |
"grad_norm": 33.119720458984375, | |
"learning_rate": 3.7111111111111113e-05, | |
"loss": 0.682, | |
"step": 1450 | |
}, | |
{ | |
"epoch": 0.78, | |
"grad_norm": 28.767709732055664, | |
"learning_rate": 3.7022222222222224e-05, | |
"loss": 0.6927, | |
"step": 1460 | |
}, | |
{ | |
"epoch": 0.78, | |
"grad_norm": 30.515918731689453, | |
"learning_rate": 3.6933333333333334e-05, | |
"loss": 0.6882, | |
"step": 1470 | |
}, | |
{ | |
"epoch": 0.79, | |
"grad_norm": 28.996906280517578, | |
"learning_rate": 3.6844444444444445e-05, | |
"loss": 0.7307, | |
"step": 1480 | |
}, | |
{ | |
"epoch": 0.79, | |
"grad_norm": 28.326091766357422, | |
"learning_rate": 3.675555555555556e-05, | |
"loss": 0.8148, | |
"step": 1490 | |
}, | |
{ | |
"epoch": 0.8, | |
"grad_norm": 28.253231048583984, | |
"learning_rate": 3.6666666666666666e-05, | |
"loss": 0.6656, | |
"step": 1500 | |
}, | |
{ | |
"epoch": 0.8, | |
"eval_accuracy": 0.8241, | |
"eval_f1_macro": 0.7160978666563796, | |
"eval_f1_micro": 0.8241, | |
"eval_loss": 0.6684937477111816, | |
"eval_runtime": 150.0761, | |
"eval_samples_per_second": 66.633, | |
"eval_steps_per_second": 2.086, | |
"step": 1500 | |
}, | |
{ | |
"epoch": 0.81, | |
"grad_norm": 39.65568542480469, | |
"learning_rate": 3.6577777777777776e-05, | |
"loss": 0.6676, | |
"step": 1510 | |
}, | |
{ | |
"epoch": 0.81, | |
"grad_norm": 37.01750946044922, | |
"learning_rate": 3.648888888888889e-05, | |
"loss": 0.7308, | |
"step": 1520 | |
}, | |
{ | |
"epoch": 0.82, | |
"grad_norm": 15.336851119995117, | |
"learning_rate": 3.6400000000000004e-05, | |
"loss": 0.5771, | |
"step": 1530 | |
}, | |
{ | |
"epoch": 0.82, | |
"grad_norm": 20.613094329833984, | |
"learning_rate": 3.6311111111111114e-05, | |
"loss": 0.5068, | |
"step": 1540 | |
}, | |
{ | |
"epoch": 0.83, | |
"grad_norm": 18.742759704589844, | |
"learning_rate": 3.6222222222222225e-05, | |
"loss": 0.7777, | |
"step": 1550 | |
}, | |
{ | |
"epoch": 0.83, | |
"grad_norm": 33.8448600769043, | |
"learning_rate": 3.6133333333333335e-05, | |
"loss": 0.8493, | |
"step": 1560 | |
}, | |
{ | |
"epoch": 0.84, | |
"grad_norm": 33.76942825317383, | |
"learning_rate": 3.6044444444444446e-05, | |
"loss": 0.7544, | |
"step": 1570 | |
}, | |
{ | |
"epoch": 0.84, | |
"grad_norm": 25.16337013244629, | |
"learning_rate": 3.5955555555555556e-05, | |
"loss": 0.6645, | |
"step": 1580 | |
}, | |
{ | |
"epoch": 0.85, | |
"grad_norm": 23.87677764892578, | |
"learning_rate": 3.586666666666667e-05, | |
"loss": 0.7335, | |
"step": 1590 | |
}, | |
{ | |
"epoch": 0.85, | |
"grad_norm": 13.271788597106934, | |
"learning_rate": 3.577777777777778e-05, | |
"loss": 0.6114, | |
"step": 1600 | |
}, | |
{ | |
"epoch": 0.85, | |
"eval_accuracy": 0.8246, | |
"eval_f1_macro": 0.7269472752858881, | |
"eval_f1_micro": 0.8246, | |
"eval_loss": 0.6453167796134949, | |
"eval_runtime": 150.2624, | |
"eval_samples_per_second": 66.55, | |
"eval_steps_per_second": 2.083, | |
"step": 1600 | |
}, | |
{ | |
"epoch": 0.86, | |
"grad_norm": 22.58757972717285, | |
"learning_rate": 3.568888888888889e-05, | |
"loss": 0.602, | |
"step": 1610 | |
}, | |
{ | |
"epoch": 0.86, | |
"grad_norm": 25.419322967529297, | |
"learning_rate": 3.56e-05, | |
"loss": 0.5252, | |
"step": 1620 | |
}, | |
{ | |
"epoch": 0.87, | |
"grad_norm": 26.960481643676758, | |
"learning_rate": 3.551111111111111e-05, | |
"loss": 0.6874, | |
"step": 1630 | |
}, | |
{ | |
"epoch": 0.87, | |
"grad_norm": 27.8248233795166, | |
"learning_rate": 3.5422222222222226e-05, | |
"loss": 0.7663, | |
"step": 1640 | |
}, | |
{ | |
"epoch": 0.88, | |
"grad_norm": 32.19744873046875, | |
"learning_rate": 3.5333333333333336e-05, | |
"loss": 0.7413, | |
"step": 1650 | |
}, | |
{ | |
"epoch": 0.89, | |
"grad_norm": 32.132179260253906, | |
"learning_rate": 3.5244444444444447e-05, | |
"loss": 0.5752, | |
"step": 1660 | |
}, | |
{ | |
"epoch": 0.89, | |
"grad_norm": 32.55865478515625, | |
"learning_rate": 3.515555555555556e-05, | |
"loss": 0.5818, | |
"step": 1670 | |
}, | |
{ | |
"epoch": 0.9, | |
"grad_norm": 32.21278381347656, | |
"learning_rate": 3.506666666666667e-05, | |
"loss": 0.5803, | |
"step": 1680 | |
}, | |
{ | |
"epoch": 0.9, | |
"grad_norm": 25.46314239501953, | |
"learning_rate": 3.4977777777777785e-05, | |
"loss": 0.623, | |
"step": 1690 | |
}, | |
{ | |
"epoch": 0.91, | |
"grad_norm": 26.091236114501953, | |
"learning_rate": 3.4888888888888895e-05, | |
"loss": 0.5616, | |
"step": 1700 | |
}, | |
{ | |
"epoch": 0.91, | |
"eval_accuracy": 0.8275, | |
"eval_f1_macro": 0.7289680784160217, | |
"eval_f1_micro": 0.8275, | |
"eval_loss": 0.6631607413291931, | |
"eval_runtime": 150.375, | |
"eval_samples_per_second": 66.5, | |
"eval_steps_per_second": 2.081, | |
"step": 1700 | |
}, | |
{ | |
"epoch": 0.91, | |
"grad_norm": 25.63671875, | |
"learning_rate": 3.48e-05, | |
"loss": 0.7312, | |
"step": 1710 | |
}, | |
{ | |
"epoch": 0.92, | |
"grad_norm": 32.645164489746094, | |
"learning_rate": 3.471111111111111e-05, | |
"loss": 0.6261, | |
"step": 1720 | |
}, | |
{ | |
"epoch": 0.92, | |
"grad_norm": 31.84140396118164, | |
"learning_rate": 3.462222222222222e-05, | |
"loss": 0.6618, | |
"step": 1730 | |
}, | |
{ | |
"epoch": 0.93, | |
"grad_norm": 23.48900604248047, | |
"learning_rate": 3.453333333333334e-05, | |
"loss": 0.5569, | |
"step": 1740 | |
}, | |
{ | |
"epoch": 0.93, | |
"grad_norm": 21.029348373413086, | |
"learning_rate": 3.444444444444445e-05, | |
"loss": 0.5333, | |
"step": 1750 | |
}, | |
{ | |
"epoch": 0.94, | |
"grad_norm": 26.658044815063477, | |
"learning_rate": 3.435555555555556e-05, | |
"loss": 0.516, | |
"step": 1760 | |
}, | |
{ | |
"epoch": 0.94, | |
"grad_norm": 25.32404899597168, | |
"learning_rate": 3.426666666666667e-05, | |
"loss": 0.5171, | |
"step": 1770 | |
}, | |
{ | |
"epoch": 0.95, | |
"grad_norm": 33.53643798828125, | |
"learning_rate": 3.417777777777778e-05, | |
"loss": 0.6361, | |
"step": 1780 | |
}, | |
{ | |
"epoch": 0.95, | |
"grad_norm": 23.664636611938477, | |
"learning_rate": 3.408888888888889e-05, | |
"loss": 0.5254, | |
"step": 1790 | |
}, | |
{ | |
"epoch": 0.96, | |
"grad_norm": 26.88168716430664, | |
"learning_rate": 3.4000000000000007e-05, | |
"loss": 0.6985, | |
"step": 1800 | |
}, | |
{ | |
"epoch": 0.96, | |
"eval_accuracy": 0.8329, | |
"eval_f1_macro": 0.7395314297204796, | |
"eval_f1_micro": 0.8329, | |
"eval_loss": 0.6022256016731262, | |
"eval_runtime": 150.3869, | |
"eval_samples_per_second": 66.495, | |
"eval_steps_per_second": 2.081, | |
"step": 1800 | |
}, | |
{ | |
"epoch": 0.97, | |
"grad_norm": 26.552513122558594, | |
"learning_rate": 3.391111111111111e-05, | |
"loss": 0.7002, | |
"step": 1810 | |
}, | |
{ | |
"epoch": 0.97, | |
"grad_norm": 19.498023986816406, | |
"learning_rate": 3.382222222222222e-05, | |
"loss": 0.74, | |
"step": 1820 | |
}, | |
{ | |
"epoch": 0.98, | |
"grad_norm": 19.793920516967773, | |
"learning_rate": 3.373333333333333e-05, | |
"loss": 0.6457, | |
"step": 1830 | |
}, | |
{ | |
"epoch": 0.98, | |
"grad_norm": 21.879690170288086, | |
"learning_rate": 3.364444444444445e-05, | |
"loss": 0.5794, | |
"step": 1840 | |
}, | |
{ | |
"epoch": 0.99, | |
"grad_norm": 28.4526309967041, | |
"learning_rate": 3.355555555555556e-05, | |
"loss": 0.5144, | |
"step": 1850 | |
}, | |
{ | |
"epoch": 0.99, | |
"grad_norm": 29.433683395385742, | |
"learning_rate": 3.346666666666667e-05, | |
"loss": 0.4621, | |
"step": 1860 | |
}, | |
{ | |
"epoch": 1.0, | |
"grad_norm": 30.06548309326172, | |
"learning_rate": 3.337777777777778e-05, | |
"loss": 0.6983, | |
"step": 1870 | |
}, | |
{ | |
"epoch": 1.0, | |
"grad_norm": 16.910295486450195, | |
"learning_rate": 3.328888888888889e-05, | |
"loss": 0.4668, | |
"step": 1880 | |
}, | |
{ | |
"epoch": 1.01, | |
"grad_norm": 18.447338104248047, | |
"learning_rate": 3.32e-05, | |
"loss": 0.4046, | |
"step": 1890 | |
}, | |
{ | |
"epoch": 1.01, | |
"grad_norm": 18.394548416137695, | |
"learning_rate": 3.311111111111112e-05, | |
"loss": 0.387, | |
"step": 1900 | |
}, | |
{ | |
"epoch": 1.01, | |
"eval_accuracy": 0.8475, | |
"eval_f1_macro": 0.768978436326819, | |
"eval_f1_micro": 0.8475, | |
"eval_loss": 0.5910280346870422, | |
"eval_runtime": 150.3955, | |
"eval_samples_per_second": 66.491, | |
"eval_steps_per_second": 2.081, | |
"step": 1900 | |
}, | |
{ | |
"epoch": 1.02, | |
"grad_norm": 15.962233543395996, | |
"learning_rate": 3.302222222222222e-05, | |
"loss": 0.3534, | |
"step": 1910 | |
}, | |
{ | |
"epoch": 1.02, | |
"grad_norm": 25.97977638244629, | |
"learning_rate": 3.293333333333333e-05, | |
"loss": 0.3755, | |
"step": 1920 | |
}, | |
{ | |
"epoch": 1.03, | |
"grad_norm": 16.386619567871094, | |
"learning_rate": 3.284444444444444e-05, | |
"loss": 0.3695, | |
"step": 1930 | |
}, | |
{ | |
"epoch": 1.03, | |
"grad_norm": 29.158287048339844, | |
"learning_rate": 3.275555555555555e-05, | |
"loss": 0.3642, | |
"step": 1940 | |
}, | |
{ | |
"epoch": 1.04, | |
"grad_norm": 21.561437606811523, | |
"learning_rate": 3.266666666666667e-05, | |
"loss": 0.3619, | |
"step": 1950 | |
}, | |
{ | |
"epoch": 1.05, | |
"grad_norm": 20.957096099853516, | |
"learning_rate": 3.257777777777778e-05, | |
"loss": 0.3565, | |
"step": 1960 | |
}, | |
{ | |
"epoch": 1.05, | |
"grad_norm": 23.491188049316406, | |
"learning_rate": 3.248888888888889e-05, | |
"loss": 0.3597, | |
"step": 1970 | |
}, | |
{ | |
"epoch": 1.06, | |
"grad_norm": 20.992183685302734, | |
"learning_rate": 3.24e-05, | |
"loss": 0.4166, | |
"step": 1980 | |
}, | |
{ | |
"epoch": 1.06, | |
"grad_norm": 10.696439743041992, | |
"learning_rate": 3.231111111111111e-05, | |
"loss": 0.3333, | |
"step": 1990 | |
}, | |
{ | |
"epoch": 1.07, | |
"grad_norm": 20.321285247802734, | |
"learning_rate": 3.222222222222223e-05, | |
"loss": 0.2391, | |
"step": 2000 | |
}, | |
{ | |
"epoch": 1.07, | |
"eval_accuracy": 0.8475, | |
"eval_f1_macro": 0.756420860990717, | |
"eval_f1_micro": 0.8475, | |
"eval_loss": 0.6234980225563049, | |
"eval_runtime": 150.4342, | |
"eval_samples_per_second": 66.474, | |
"eval_steps_per_second": 2.081, | |
"step": 2000 | |
}, | |
{ | |
"epoch": 1.07, | |
"grad_norm": 28.013612747192383, | |
"learning_rate": 3.213333333333334e-05, | |
"loss": 0.4804, | |
"step": 2010 | |
}, | |
{ | |
"epoch": 1.08, | |
"grad_norm": 20.689050674438477, | |
"learning_rate": 3.204444444444444e-05, | |
"loss": 0.2839, | |
"step": 2020 | |
}, | |
{ | |
"epoch": 1.08, | |
"grad_norm": 25.118309020996094, | |
"learning_rate": 3.1955555555555554e-05, | |
"loss": 0.418, | |
"step": 2030 | |
}, | |
{ | |
"epoch": 1.09, | |
"grad_norm": 9.888715744018555, | |
"learning_rate": 3.1866666666666664e-05, | |
"loss": 0.2898, | |
"step": 2040 | |
}, | |
{ | |
"epoch": 1.09, | |
"grad_norm": 25.61309051513672, | |
"learning_rate": 3.177777777777778e-05, | |
"loss": 0.3652, | |
"step": 2050 | |
}, | |
{ | |
"epoch": 1.1, | |
"grad_norm": 29.823627471923828, | |
"learning_rate": 3.168888888888889e-05, | |
"loss": 0.4499, | |
"step": 2060 | |
}, | |
{ | |
"epoch": 1.1, | |
"grad_norm": 25.4545955657959, | |
"learning_rate": 3.16e-05, | |
"loss": 0.3955, | |
"step": 2070 | |
}, | |
{ | |
"epoch": 1.11, | |
"grad_norm": 20.512975692749023, | |
"learning_rate": 3.151111111111111e-05, | |
"loss": 0.3354, | |
"step": 2080 | |
}, | |
{ | |
"epoch": 1.11, | |
"grad_norm": 24.814722061157227, | |
"learning_rate": 3.142222222222222e-05, | |
"loss": 0.406, | |
"step": 2090 | |
}, | |
{ | |
"epoch": 1.12, | |
"grad_norm": 16.441551208496094, | |
"learning_rate": 3.1333333333333334e-05, | |
"loss": 0.4414, | |
"step": 2100 | |
}, | |
{ | |
"epoch": 1.12, | |
"eval_accuracy": 0.8421, | |
"eval_f1_macro": 0.7650720616593817, | |
"eval_f1_micro": 0.8421, | |
"eval_loss": 0.6027012467384338, | |
"eval_runtime": 150.493, | |
"eval_samples_per_second": 66.448, | |
"eval_steps_per_second": 2.08, | |
"step": 2100 | |
}, | |
{ | |
"epoch": 1.13, | |
"grad_norm": 23.043563842773438, | |
"learning_rate": 3.124444444444445e-05, | |
"loss": 0.4198, | |
"step": 2110 | |
}, | |
{ | |
"epoch": 1.13, | |
"grad_norm": 30.09490203857422, | |
"learning_rate": 3.1155555555555555e-05, | |
"loss": 0.3899, | |
"step": 2120 | |
}, | |
{ | |
"epoch": 1.14, | |
"grad_norm": 26.25542640686035, | |
"learning_rate": 3.1066666666666665e-05, | |
"loss": 0.3292, | |
"step": 2130 | |
}, | |
{ | |
"epoch": 1.14, | |
"grad_norm": 21.587125778198242, | |
"learning_rate": 3.0977777777777776e-05, | |
"loss": 0.434, | |
"step": 2140 | |
}, | |
{ | |
"epoch": 1.15, | |
"grad_norm": 32.34952163696289, | |
"learning_rate": 3.088888888888889e-05, | |
"loss": 0.3563, | |
"step": 2150 | |
}, | |
{ | |
"epoch": 1.15, | |
"grad_norm": 37.00065994262695, | |
"learning_rate": 3.08e-05, | |
"loss": 0.4363, | |
"step": 2160 | |
}, | |
{ | |
"epoch": 1.16, | |
"grad_norm": 18.810853958129883, | |
"learning_rate": 3.0711111111111114e-05, | |
"loss": 0.3945, | |
"step": 2170 | |
}, | |
{ | |
"epoch": 1.16, | |
"grad_norm": 20.760358810424805, | |
"learning_rate": 3.0622222222222224e-05, | |
"loss": 0.358, | |
"step": 2180 | |
}, | |
{ | |
"epoch": 1.17, | |
"grad_norm": 25.902507781982422, | |
"learning_rate": 3.0533333333333335e-05, | |
"loss": 0.431, | |
"step": 2190 | |
}, | |
{ | |
"epoch": 1.17, | |
"grad_norm": 20.889230728149414, | |
"learning_rate": 3.044444444444445e-05, | |
"loss": 0.3869, | |
"step": 2200 | |
}, | |
{ | |
"epoch": 1.17, | |
"eval_accuracy": 0.8437, | |
"eval_f1_macro": 0.7592276312605151, | |
"eval_f1_micro": 0.8437, | |
"eval_loss": 0.6028015613555908, | |
"eval_runtime": 150.4185, | |
"eval_samples_per_second": 66.481, | |
"eval_steps_per_second": 2.081, | |
"step": 2200 | |
}, | |
{ | |
"epoch": 1.18, | |
"grad_norm": 18.823284149169922, | |
"learning_rate": 3.035555555555556e-05, | |
"loss": 0.3956, | |
"step": 2210 | |
}, | |
{ | |
"epoch": 1.18, | |
"grad_norm": 22.283672332763672, | |
"learning_rate": 3.0266666666666666e-05, | |
"loss": 0.4863, | |
"step": 2220 | |
}, | |
{ | |
"epoch": 1.19, | |
"grad_norm": 16.33639144897461, | |
"learning_rate": 3.0177777777777776e-05, | |
"loss": 0.392, | |
"step": 2230 | |
}, | |
{ | |
"epoch": 1.19, | |
"grad_norm": 23.827781677246094, | |
"learning_rate": 3.008888888888889e-05, | |
"loss": 0.3198, | |
"step": 2240 | |
}, | |
{ | |
"epoch": 1.2, | |
"grad_norm": 26.199676513671875, | |
"learning_rate": 3e-05, | |
"loss": 0.3314, | |
"step": 2250 | |
}, | |
{ | |
"epoch": 1.21, | |
"grad_norm": 20.12962532043457, | |
"learning_rate": 2.991111111111111e-05, | |
"loss": 0.4698, | |
"step": 2260 | |
}, | |
{ | |
"epoch": 1.21, | |
"grad_norm": 27.956256866455078, | |
"learning_rate": 2.9822222222222225e-05, | |
"loss": 0.4283, | |
"step": 2270 | |
}, | |
{ | |
"epoch": 1.22, | |
"grad_norm": 24.309349060058594, | |
"learning_rate": 2.9733333333333336e-05, | |
"loss": 0.3119, | |
"step": 2280 | |
}, | |
{ | |
"epoch": 1.22, | |
"grad_norm": 21.136127471923828, | |
"learning_rate": 2.9644444444444446e-05, | |
"loss": 0.2309, | |
"step": 2290 | |
}, | |
{ | |
"epoch": 1.23, | |
"grad_norm": 14.561148643493652, | |
"learning_rate": 2.955555555555556e-05, | |
"loss": 0.2387, | |
"step": 2300 | |
}, | |
{ | |
"epoch": 1.23, | |
"eval_accuracy": 0.845, | |
"eval_f1_macro": 0.7634532685547798, | |
"eval_f1_micro": 0.845, | |
"eval_loss": 0.6645835638046265, | |
"eval_runtime": 150.7976, | |
"eval_samples_per_second": 66.314, | |
"eval_steps_per_second": 2.076, | |
"step": 2300 | |
}, | |
{ | |
"epoch": 1.23, | |
"grad_norm": 17.950855255126953, | |
"learning_rate": 2.946666666666667e-05, | |
"loss": 0.442, | |
"step": 2310 | |
}, | |
{ | |
"epoch": 1.24, | |
"grad_norm": 25.867813110351562, | |
"learning_rate": 2.937777777777778e-05, | |
"loss": 0.44, | |
"step": 2320 | |
}, | |
{ | |
"epoch": 1.24, | |
"grad_norm": 17.729812622070312, | |
"learning_rate": 2.9288888888888888e-05, | |
"loss": 0.358, | |
"step": 2330 | |
}, | |
{ | |
"epoch": 1.25, | |
"grad_norm": 19.638261795043945, | |
"learning_rate": 2.9199999999999998e-05, | |
"loss": 0.2645, | |
"step": 2340 | |
}, | |
{ | |
"epoch": 1.25, | |
"grad_norm": 25.970163345336914, | |
"learning_rate": 2.9111111111111112e-05, | |
"loss": 0.3405, | |
"step": 2350 | |
}, | |
{ | |
"epoch": 1.26, | |
"grad_norm": 11.836894989013672, | |
"learning_rate": 2.9022222222222223e-05, | |
"loss": 0.3557, | |
"step": 2360 | |
}, | |
{ | |
"epoch": 1.26, | |
"grad_norm": 20.230266571044922, | |
"learning_rate": 2.8933333333333333e-05, | |
"loss": 0.4081, | |
"step": 2370 | |
}, | |
{ | |
"epoch": 1.27, | |
"grad_norm": 29.962060928344727, | |
"learning_rate": 2.8844444444444447e-05, | |
"loss": 0.373, | |
"step": 2380 | |
}, | |
{ | |
"epoch": 1.27, | |
"grad_norm": 33.542320251464844, | |
"learning_rate": 2.8755555555555557e-05, | |
"loss": 0.3617, | |
"step": 2390 | |
}, | |
{ | |
"epoch": 1.28, | |
"grad_norm": 25.599098205566406, | |
"learning_rate": 2.8666666666666668e-05, | |
"loss": 0.3556, | |
"step": 2400 | |
}, | |
{ | |
"epoch": 1.28, | |
"eval_accuracy": 0.8487, | |
"eval_f1_macro": 0.7724260808875206, | |
"eval_f1_micro": 0.8487, | |
"eval_loss": 0.6032431125640869, | |
"eval_runtime": 150.7819, | |
"eval_samples_per_second": 66.321, | |
"eval_steps_per_second": 2.076, | |
"step": 2400 | |
}, | |
{ | |
"epoch": 1.29, | |
"grad_norm": 23.8782958984375, | |
"learning_rate": 2.857777777777778e-05, | |
"loss": 0.4046, | |
"step": 2410 | |
}, | |
{ | |
"epoch": 1.29, | |
"grad_norm": 23.979324340820312, | |
"learning_rate": 2.8488888888888892e-05, | |
"loss": 0.3942, | |
"step": 2420 | |
}, | |
{ | |
"epoch": 1.3, | |
"grad_norm": 10.684112548828125, | |
"learning_rate": 2.84e-05, | |
"loss": 0.2238, | |
"step": 2430 | |
}, | |
{ | |
"epoch": 1.3, | |
"grad_norm": 18.40957260131836, | |
"learning_rate": 2.831111111111111e-05, | |
"loss": 0.3078, | |
"step": 2440 | |
}, | |
{ | |
"epoch": 1.31, | |
"grad_norm": 30.96697998046875, | |
"learning_rate": 2.8222222222222223e-05, | |
"loss": 0.3622, | |
"step": 2450 | |
}, | |
{ | |
"epoch": 1.31, | |
"grad_norm": 24.614702224731445, | |
"learning_rate": 2.8133333333333334e-05, | |
"loss": 0.3691, | |
"step": 2460 | |
}, | |
{ | |
"epoch": 1.32, | |
"grad_norm": 23.404987335205078, | |
"learning_rate": 2.8044444444444444e-05, | |
"loss": 0.3551, | |
"step": 2470 | |
}, | |
{ | |
"epoch": 1.32, | |
"grad_norm": 30.258798599243164, | |
"learning_rate": 2.7955555555555558e-05, | |
"loss": 0.3761, | |
"step": 2480 | |
}, | |
{ | |
"epoch": 1.33, | |
"grad_norm": 4.7594780921936035, | |
"learning_rate": 2.786666666666667e-05, | |
"loss": 0.328, | |
"step": 2490 | |
}, | |
{ | |
"epoch": 1.33, | |
"grad_norm": 14.827425956726074, | |
"learning_rate": 2.777777777777778e-05, | |
"loss": 0.4439, | |
"step": 2500 | |
}, | |
{ | |
"epoch": 1.33, | |
"eval_accuracy": 0.8589, | |
"eval_f1_macro": 0.7789734556100073, | |
"eval_f1_micro": 0.8589, | |
"eval_loss": 0.5772649049758911, | |
"eval_runtime": 150.8158, | |
"eval_samples_per_second": 66.306, | |
"eval_steps_per_second": 2.075, | |
"step": 2500 | |
}, | |
{ | |
"epoch": 1.34, | |
"grad_norm": 15.111785888671875, | |
"learning_rate": 2.7688888888888893e-05, | |
"loss": 0.2569, | |
"step": 2510 | |
}, | |
{ | |
"epoch": 1.34, | |
"grad_norm": 28.836196899414062, | |
"learning_rate": 2.7600000000000003e-05, | |
"loss": 0.4003, | |
"step": 2520 | |
}, | |
{ | |
"epoch": 1.35, | |
"grad_norm": 36.57837677001953, | |
"learning_rate": 2.751111111111111e-05, | |
"loss": 0.3891, | |
"step": 2530 | |
}, | |
{ | |
"epoch": 1.35, | |
"grad_norm": 20.131092071533203, | |
"learning_rate": 2.742222222222222e-05, | |
"loss": 0.3853, | |
"step": 2540 | |
}, | |
{ | |
"epoch": 1.36, | |
"grad_norm": 28.32253074645996, | |
"learning_rate": 2.733333333333333e-05, | |
"loss": 0.4096, | |
"step": 2550 | |
}, | |
{ | |
"epoch": 1.37, | |
"grad_norm": 26.20575523376465, | |
"learning_rate": 2.7244444444444445e-05, | |
"loss": 0.3505, | |
"step": 2560 | |
}, | |
{ | |
"epoch": 1.37, | |
"grad_norm": 29.0845947265625, | |
"learning_rate": 2.7155555555555556e-05, | |
"loss": 0.3897, | |
"step": 2570 | |
}, | |
{ | |
"epoch": 1.38, | |
"grad_norm": 15.182287216186523, | |
"learning_rate": 2.706666666666667e-05, | |
"loss": 0.3748, | |
"step": 2580 | |
}, | |
{ | |
"epoch": 1.38, | |
"grad_norm": 14.50926399230957, | |
"learning_rate": 2.697777777777778e-05, | |
"loss": 0.3501, | |
"step": 2590 | |
}, | |
{ | |
"epoch": 1.39, | |
"grad_norm": 23.248886108398438, | |
"learning_rate": 2.688888888888889e-05, | |
"loss": 0.4171, | |
"step": 2600 | |
}, | |
{ | |
"epoch": 1.39, | |
"eval_accuracy": 0.8551, | |
"eval_f1_macro": 0.7759771387428208, | |
"eval_f1_micro": 0.8551, | |
"eval_loss": 0.5601994395256042, | |
"eval_runtime": 150.7993, | |
"eval_samples_per_second": 66.313, | |
"eval_steps_per_second": 2.076, | |
"step": 2600 | |
}, | |
{ | |
"epoch": 1.39, | |
"grad_norm": 20.562131881713867, | |
"learning_rate": 2.6800000000000004e-05, | |
"loss": 0.3811, | |
"step": 2610 | |
}, | |
{ | |
"epoch": 1.4, | |
"grad_norm": 20.327190399169922, | |
"learning_rate": 2.6711111111111115e-05, | |
"loss": 0.2697, | |
"step": 2620 | |
}, | |
{ | |
"epoch": 1.4, | |
"grad_norm": 19.044452667236328, | |
"learning_rate": 2.6622222222222225e-05, | |
"loss": 0.3059, | |
"step": 2630 | |
}, | |
{ | |
"epoch": 1.41, | |
"grad_norm": 24.917388916015625, | |
"learning_rate": 2.6533333333333332e-05, | |
"loss": 0.4426, | |
"step": 2640 | |
}, | |
{ | |
"epoch": 1.41, | |
"grad_norm": 25.066818237304688, | |
"learning_rate": 2.6444444444444443e-05, | |
"loss": 0.4759, | |
"step": 2650 | |
}, | |
{ | |
"epoch": 1.42, | |
"grad_norm": 27.263545989990234, | |
"learning_rate": 2.6355555555555557e-05, | |
"loss": 0.3387, | |
"step": 2660 | |
}, | |
{ | |
"epoch": 1.42, | |
"grad_norm": 18.15851402282715, | |
"learning_rate": 2.6266666666666667e-05, | |
"loss": 0.3474, | |
"step": 2670 | |
}, | |
{ | |
"epoch": 1.43, | |
"grad_norm": 21.79593276977539, | |
"learning_rate": 2.6177777777777777e-05, | |
"loss": 0.3775, | |
"step": 2680 | |
}, | |
{ | |
"epoch": 1.43, | |
"grad_norm": 17.30176544189453, | |
"learning_rate": 2.608888888888889e-05, | |
"loss": 0.3177, | |
"step": 2690 | |
}, | |
{ | |
"epoch": 1.44, | |
"grad_norm": 30.904870986938477, | |
"learning_rate": 2.6000000000000002e-05, | |
"loss": 0.3984, | |
"step": 2700 | |
}, | |
{ | |
"epoch": 1.44, | |
"eval_accuracy": 0.8514, | |
"eval_f1_macro": 0.7708173208271037, | |
"eval_f1_micro": 0.8514, | |
"eval_loss": 0.5800321102142334, | |
"eval_runtime": 150.9969, | |
"eval_samples_per_second": 66.227, | |
"eval_steps_per_second": 2.073, | |
"step": 2700 | |
}, | |
{ | |
"epoch": 1.45, | |
"grad_norm": 22.358997344970703, | |
"learning_rate": 2.5911111111111112e-05, | |
"loss": 0.3168, | |
"step": 2710 | |
}, | |
{ | |
"epoch": 1.45, | |
"grad_norm": 28.393596649169922, | |
"learning_rate": 2.5822222222222226e-05, | |
"loss": 0.3254, | |
"step": 2720 | |
}, | |
{ | |
"epoch": 1.46, | |
"grad_norm": 24.635414123535156, | |
"learning_rate": 2.5733333333333337e-05, | |
"loss": 0.2818, | |
"step": 2730 | |
}, | |
{ | |
"epoch": 1.46, | |
"grad_norm": 18.663604736328125, | |
"learning_rate": 2.5644444444444444e-05, | |
"loss": 0.3943, | |
"step": 2740 | |
}, | |
{ | |
"epoch": 1.47, | |
"grad_norm": 25.46748161315918, | |
"learning_rate": 2.5555555555555554e-05, | |
"loss": 0.4195, | |
"step": 2750 | |
}, | |
{ | |
"epoch": 1.47, | |
"grad_norm": 16.54319190979004, | |
"learning_rate": 2.5466666666666668e-05, | |
"loss": 0.2946, | |
"step": 2760 | |
}, | |
{ | |
"epoch": 1.48, | |
"grad_norm": 15.662579536437988, | |
"learning_rate": 2.537777777777778e-05, | |
"loss": 0.3247, | |
"step": 2770 | |
}, | |
{ | |
"epoch": 1.48, | |
"grad_norm": 33.76002883911133, | |
"learning_rate": 2.528888888888889e-05, | |
"loss": 0.3986, | |
"step": 2780 | |
}, | |
{ | |
"epoch": 1.49, | |
"grad_norm": 17.078815460205078, | |
"learning_rate": 2.5200000000000003e-05, | |
"loss": 0.2907, | |
"step": 2790 | |
}, | |
{ | |
"epoch": 1.49, | |
"grad_norm": 19.065820693969727, | |
"learning_rate": 2.5111111111111113e-05, | |
"loss": 0.2491, | |
"step": 2800 | |
}, | |
{ | |
"epoch": 1.49, | |
"eval_accuracy": 0.8463, | |
"eval_f1_macro": 0.7774119411824801, | |
"eval_f1_micro": 0.8463, | |
"eval_loss": 0.5934433341026306, | |
"eval_runtime": 150.8383, | |
"eval_samples_per_second": 66.296, | |
"eval_steps_per_second": 2.075, | |
"step": 2800 | |
}, | |
{ | |
"epoch": 1.5, | |
"grad_norm": 20.654638290405273, | |
"learning_rate": 2.5022222222222224e-05, | |
"loss": 0.2698, | |
"step": 2810 | |
}, | |
{ | |
"epoch": 1.5, | |
"grad_norm": 23.666898727416992, | |
"learning_rate": 2.4933333333333334e-05, | |
"loss": 0.387, | |
"step": 2820 | |
}, | |
{ | |
"epoch": 1.51, | |
"grad_norm": 24.191789627075195, | |
"learning_rate": 2.4844444444444444e-05, | |
"loss": 0.2838, | |
"step": 2830 | |
}, | |
{ | |
"epoch": 1.51, | |
"grad_norm": 21.81308937072754, | |
"learning_rate": 2.475555555555556e-05, | |
"loss": 0.3263, | |
"step": 2840 | |
}, | |
{ | |
"epoch": 1.52, | |
"grad_norm": 21.30182456970215, | |
"learning_rate": 2.466666666666667e-05, | |
"loss": 0.3126, | |
"step": 2850 | |
}, | |
{ | |
"epoch": 1.53, | |
"grad_norm": 20.381277084350586, | |
"learning_rate": 2.457777777777778e-05, | |
"loss": 0.322, | |
"step": 2860 | |
}, | |
{ | |
"epoch": 1.53, | |
"grad_norm": 22.04474639892578, | |
"learning_rate": 2.448888888888889e-05, | |
"loss": 0.3995, | |
"step": 2870 | |
}, | |
{ | |
"epoch": 1.54, | |
"grad_norm": 17.000167846679688, | |
"learning_rate": 2.44e-05, | |
"loss": 0.3385, | |
"step": 2880 | |
}, | |
{ | |
"epoch": 1.54, | |
"grad_norm": 19.123960494995117, | |
"learning_rate": 2.431111111111111e-05, | |
"loss": 0.3365, | |
"step": 2890 | |
}, | |
{ | |
"epoch": 1.55, | |
"grad_norm": 24.588180541992188, | |
"learning_rate": 2.4222222222222224e-05, | |
"loss": 0.2975, | |
"step": 2900 | |
}, | |
{ | |
"epoch": 1.55, | |
"eval_accuracy": 0.8548, | |
"eval_f1_macro": 0.7775962578615729, | |
"eval_f1_micro": 0.8548, | |
"eval_loss": 0.5837641954421997, | |
"eval_runtime": 151.0441, | |
"eval_samples_per_second": 66.206, | |
"eval_steps_per_second": 2.072, | |
"step": 2900 | |
}, | |
{ | |
"epoch": 1.55, | |
"grad_norm": 19.566835403442383, | |
"learning_rate": 2.4133333333333335e-05, | |
"loss": 0.482, | |
"step": 2910 | |
}, | |
{ | |
"epoch": 1.56, | |
"grad_norm": 20.381959915161133, | |
"learning_rate": 2.4044444444444445e-05, | |
"loss": 0.415, | |
"step": 2920 | |
}, | |
{ | |
"epoch": 1.56, | |
"grad_norm": 26.44783592224121, | |
"learning_rate": 2.3955555555555556e-05, | |
"loss": 0.4592, | |
"step": 2930 | |
}, | |
{ | |
"epoch": 1.57, | |
"grad_norm": 24.821016311645508, | |
"learning_rate": 2.3866666666666666e-05, | |
"loss": 0.2879, | |
"step": 2940 | |
}, | |
{ | |
"epoch": 1.57, | |
"grad_norm": 17.898052215576172, | |
"learning_rate": 2.377777777777778e-05, | |
"loss": 0.3406, | |
"step": 2950 | |
}, | |
{ | |
"epoch": 1.58, | |
"grad_norm": 19.308439254760742, | |
"learning_rate": 2.368888888888889e-05, | |
"loss": 0.425, | |
"step": 2960 | |
}, | |
{ | |
"epoch": 1.58, | |
"grad_norm": 20.031681060791016, | |
"learning_rate": 2.36e-05, | |
"loss": 0.284, | |
"step": 2970 | |
}, | |
{ | |
"epoch": 1.59, | |
"grad_norm": 21.92924690246582, | |
"learning_rate": 2.351111111111111e-05, | |
"loss": 0.4024, | |
"step": 2980 | |
}, | |
{ | |
"epoch": 1.59, | |
"grad_norm": 19.973752975463867, | |
"learning_rate": 2.3422222222222222e-05, | |
"loss": 0.3045, | |
"step": 2990 | |
}, | |
{ | |
"epoch": 1.6, | |
"grad_norm": 18.280500411987305, | |
"learning_rate": 2.3333333333333336e-05, | |
"loss": 0.4375, | |
"step": 3000 | |
}, | |
{ | |
"epoch": 1.6, | |
"eval_accuracy": 0.8497, | |
"eval_f1_macro": 0.7757695298118251, | |
"eval_f1_micro": 0.8497, | |
"eval_loss": 0.5583605170249939, | |
"eval_runtime": 150.8424, | |
"eval_samples_per_second": 66.294, | |
"eval_steps_per_second": 2.075, | |
"step": 3000 | |
}, | |
{ | |
"epoch": 1.61, | |
"grad_norm": 31.200424194335938, | |
"learning_rate": 2.3244444444444446e-05, | |
"loss": 0.3961, | |
"step": 3010 | |
}, | |
{ | |
"epoch": 1.61, | |
"grad_norm": 28.22431755065918, | |
"learning_rate": 2.3155555555555557e-05, | |
"loss": 0.3697, | |
"step": 3020 | |
}, | |
{ | |
"epoch": 1.62, | |
"grad_norm": 25.528642654418945, | |
"learning_rate": 2.3066666666666667e-05, | |
"loss": 0.4165, | |
"step": 3030 | |
}, | |
{ | |
"epoch": 1.62, | |
"grad_norm": 21.460134506225586, | |
"learning_rate": 2.2977777777777778e-05, | |
"loss": 0.337, | |
"step": 3040 | |
}, | |
{ | |
"epoch": 1.63, | |
"grad_norm": 18.815004348754883, | |
"learning_rate": 2.288888888888889e-05, | |
"loss": 0.4345, | |
"step": 3050 | |
}, | |
{ | |
"epoch": 1.63, | |
"grad_norm": 17.853065490722656, | |
"learning_rate": 2.2800000000000002e-05, | |
"loss": 0.3287, | |
"step": 3060 | |
}, | |
{ | |
"epoch": 1.64, | |
"grad_norm": 11.9459228515625, | |
"learning_rate": 2.2711111111111112e-05, | |
"loss": 0.5879, | |
"step": 3070 | |
}, | |
{ | |
"epoch": 1.64, | |
"grad_norm": 25.860185623168945, | |
"learning_rate": 2.2622222222222223e-05, | |
"loss": 0.2999, | |
"step": 3080 | |
}, | |
{ | |
"epoch": 1.65, | |
"grad_norm": 13.486348152160645, | |
"learning_rate": 2.2533333333333333e-05, | |
"loss": 0.4131, | |
"step": 3090 | |
}, | |
{ | |
"epoch": 1.65, | |
"grad_norm": 26.329408645629883, | |
"learning_rate": 2.2444444444444447e-05, | |
"loss": 0.3108, | |
"step": 3100 | |
}, | |
{ | |
"epoch": 1.65, | |
"eval_accuracy": 0.8624, | |
"eval_f1_macro": 0.7863744372305322, | |
"eval_f1_micro": 0.8624, | |
"eval_loss": 0.5624867677688599, | |
"eval_runtime": 151.1981, | |
"eval_samples_per_second": 66.138, | |
"eval_steps_per_second": 2.07, | |
"step": 3100 | |
}, | |
{ | |
"epoch": 1.66, | |
"grad_norm": 13.064452171325684, | |
"learning_rate": 2.2355555555555558e-05, | |
"loss": 0.3844, | |
"step": 3110 | |
}, | |
{ | |
"epoch": 1.66, | |
"grad_norm": 19.07467269897461, | |
"learning_rate": 2.2266666666666668e-05, | |
"loss": 0.4157, | |
"step": 3120 | |
}, | |
{ | |
"epoch": 1.67, | |
"grad_norm": 13.42187213897705, | |
"learning_rate": 2.217777777777778e-05, | |
"loss": 0.3717, | |
"step": 3130 | |
}, | |
{ | |
"epoch": 1.67, | |
"grad_norm": 17.826555252075195, | |
"learning_rate": 2.208888888888889e-05, | |
"loss": 0.3105, | |
"step": 3140 | |
}, | |
{ | |
"epoch": 1.68, | |
"grad_norm": 16.670066833496094, | |
"learning_rate": 2.2000000000000003e-05, | |
"loss": 0.3521, | |
"step": 3150 | |
}, | |
{ | |
"epoch": 1.69, | |
"grad_norm": 23.210683822631836, | |
"learning_rate": 2.1911111111111113e-05, | |
"loss": 0.2766, | |
"step": 3160 | |
}, | |
{ | |
"epoch": 1.69, | |
"grad_norm": 19.09944725036621, | |
"learning_rate": 2.1822222222222224e-05, | |
"loss": 0.331, | |
"step": 3170 | |
}, | |
{ | |
"epoch": 1.7, | |
"grad_norm": 13.545781135559082, | |
"learning_rate": 2.1733333333333334e-05, | |
"loss": 0.3638, | |
"step": 3180 | |
}, | |
{ | |
"epoch": 1.7, | |
"grad_norm": 12.350102424621582, | |
"learning_rate": 2.1644444444444445e-05, | |
"loss": 0.2956, | |
"step": 3190 | |
}, | |
{ | |
"epoch": 1.71, | |
"grad_norm": 26.11676025390625, | |
"learning_rate": 2.1555555555555555e-05, | |
"loss": 0.3546, | |
"step": 3200 | |
}, | |
{ | |
"epoch": 1.71, | |
"eval_accuracy": 0.8586, | |
"eval_f1_macro": 0.7813783110286097, | |
"eval_f1_micro": 0.8586, | |
"eval_loss": 0.5264253616333008, | |
"eval_runtime": 151.3337, | |
"eval_samples_per_second": 66.079, | |
"eval_steps_per_second": 2.068, | |
"step": 3200 | |
}, | |
{ | |
"epoch": 1.71, | |
"grad_norm": 25.43873405456543, | |
"learning_rate": 2.146666666666667e-05, | |
"loss": 0.2851, | |
"step": 3210 | |
}, | |
{ | |
"epoch": 1.72, | |
"grad_norm": 31.734495162963867, | |
"learning_rate": 2.137777777777778e-05, | |
"loss": 0.2799, | |
"step": 3220 | |
}, | |
{ | |
"epoch": 1.72, | |
"grad_norm": 18.14277458190918, | |
"learning_rate": 2.128888888888889e-05, | |
"loss": 0.2679, | |
"step": 3230 | |
}, | |
{ | |
"epoch": 1.73, | |
"grad_norm": 18.775859832763672, | |
"learning_rate": 2.12e-05, | |
"loss": 0.2851, | |
"step": 3240 | |
}, | |
{ | |
"epoch": 1.73, | |
"grad_norm": 24.40532684326172, | |
"learning_rate": 2.111111111111111e-05, | |
"loss": 0.4009, | |
"step": 3250 | |
}, | |
{ | |
"epoch": 1.74, | |
"grad_norm": 17.99397087097168, | |
"learning_rate": 2.1022222222222225e-05, | |
"loss": 0.3728, | |
"step": 3260 | |
}, | |
{ | |
"epoch": 1.74, | |
"grad_norm": 29.886987686157227, | |
"learning_rate": 2.0933333333333335e-05, | |
"loss": 0.2719, | |
"step": 3270 | |
}, | |
{ | |
"epoch": 1.75, | |
"grad_norm": 20.097272872924805, | |
"learning_rate": 2.0844444444444446e-05, | |
"loss": 0.3968, | |
"step": 3280 | |
}, | |
{ | |
"epoch": 1.75, | |
"grad_norm": 12.6510591506958, | |
"learning_rate": 2.0755555555555556e-05, | |
"loss": 0.3883, | |
"step": 3290 | |
}, | |
{ | |
"epoch": 1.76, | |
"grad_norm": 18.41010284423828, | |
"learning_rate": 2.0666666666666666e-05, | |
"loss": 0.4125, | |
"step": 3300 | |
}, | |
{ | |
"epoch": 1.76, | |
"eval_accuracy": 0.8509, | |
"eval_f1_macro": 0.7787784202092634, | |
"eval_f1_micro": 0.8509, | |
"eval_loss": 0.5483813285827637, | |
"eval_runtime": 151.1106, | |
"eval_samples_per_second": 66.177, | |
"eval_steps_per_second": 2.071, | |
"step": 3300 | |
}, | |
{ | |
"epoch": 1.77, | |
"grad_norm": 19.636871337890625, | |
"learning_rate": 2.057777777777778e-05, | |
"loss": 0.349, | |
"step": 3310 | |
}, | |
{ | |
"epoch": 1.77, | |
"grad_norm": 27.96908187866211, | |
"learning_rate": 2.048888888888889e-05, | |
"loss": 0.2733, | |
"step": 3320 | |
}, | |
{ | |
"epoch": 1.78, | |
"grad_norm": 14.402112007141113, | |
"learning_rate": 2.04e-05, | |
"loss": 0.253, | |
"step": 3330 | |
}, | |
{ | |
"epoch": 1.78, | |
"grad_norm": 20.20763397216797, | |
"learning_rate": 2.031111111111111e-05, | |
"loss": 0.4342, | |
"step": 3340 | |
}, | |
{ | |
"epoch": 1.79, | |
"grad_norm": 12.293317794799805, | |
"learning_rate": 2.0222222222222222e-05, | |
"loss": 0.3254, | |
"step": 3350 | |
}, | |
{ | |
"epoch": 1.79, | |
"grad_norm": 21.796401977539062, | |
"learning_rate": 2.0133333333333336e-05, | |
"loss": 0.3047, | |
"step": 3360 | |
}, | |
{ | |
"epoch": 1.8, | |
"grad_norm": 16.06928062438965, | |
"learning_rate": 2.0044444444444446e-05, | |
"loss": 0.3411, | |
"step": 3370 | |
}, | |
{ | |
"epoch": 1.8, | |
"grad_norm": 21.114471435546875, | |
"learning_rate": 1.9955555555555557e-05, | |
"loss": 0.2933, | |
"step": 3380 | |
}, | |
{ | |
"epoch": 1.81, | |
"grad_norm": 14.466670036315918, | |
"learning_rate": 1.9866666666666667e-05, | |
"loss": 0.2794, | |
"step": 3390 | |
}, | |
{ | |
"epoch": 1.81, | |
"grad_norm": 20.040361404418945, | |
"learning_rate": 1.9777777777777778e-05, | |
"loss": 0.2206, | |
"step": 3400 | |
}, | |
{ | |
"epoch": 1.81, | |
"eval_accuracy": 0.8563, | |
"eval_f1_macro": 0.7800383469520217, | |
"eval_f1_micro": 0.8563, | |
"eval_loss": 0.5634235739707947, | |
"eval_runtime": 151.4012, | |
"eval_samples_per_second": 66.05, | |
"eval_steps_per_second": 2.067, | |
"step": 3400 | |
}, | |
{ | |
"epoch": 1.82, | |
"grad_norm": 31.591552734375, | |
"learning_rate": 1.968888888888889e-05, | |
"loss": 0.437, | |
"step": 3410 | |
}, | |
{ | |
"epoch": 1.82, | |
"grad_norm": 18.39565658569336, | |
"learning_rate": 1.9600000000000002e-05, | |
"loss": 0.3858, | |
"step": 3420 | |
}, | |
{ | |
"epoch": 1.83, | |
"grad_norm": 18.943843841552734, | |
"learning_rate": 1.9511111111111113e-05, | |
"loss": 0.277, | |
"step": 3430 | |
}, | |
{ | |
"epoch": 1.83, | |
"grad_norm": 28.337656021118164, | |
"learning_rate": 1.9422222222222223e-05, | |
"loss": 0.3453, | |
"step": 3440 | |
}, | |
{ | |
"epoch": 1.84, | |
"grad_norm": 20.132535934448242, | |
"learning_rate": 1.9333333333333333e-05, | |
"loss": 0.2824, | |
"step": 3450 | |
}, | |
{ | |
"epoch": 1.85, | |
"grad_norm": 18.038774490356445, | |
"learning_rate": 1.9244444444444444e-05, | |
"loss": 0.3169, | |
"step": 3460 | |
}, | |
{ | |
"epoch": 1.85, | |
"grad_norm": 24.97245216369629, | |
"learning_rate": 1.9155555555555558e-05, | |
"loss": 0.2868, | |
"step": 3470 | |
}, | |
{ | |
"epoch": 1.86, | |
"grad_norm": 24.715192794799805, | |
"learning_rate": 1.9066666666666668e-05, | |
"loss": 0.3365, | |
"step": 3480 | |
}, | |
{ | |
"epoch": 1.86, | |
"grad_norm": 26.29402732849121, | |
"learning_rate": 1.897777777777778e-05, | |
"loss": 0.3746, | |
"step": 3490 | |
}, | |
{ | |
"epoch": 1.87, | |
"grad_norm": 17.953739166259766, | |
"learning_rate": 1.888888888888889e-05, | |
"loss": 0.3348, | |
"step": 3500 | |
}, | |
{ | |
"epoch": 1.87, | |
"eval_accuracy": 0.8644, | |
"eval_f1_macro": 0.7889602469263715, | |
"eval_f1_micro": 0.8644, | |
"eval_loss": 0.5153625011444092, | |
"eval_runtime": 151.5332, | |
"eval_samples_per_second": 65.992, | |
"eval_steps_per_second": 2.066, | |
"step": 3500 | |
}, | |
{ | |
"epoch": 1.87, | |
"grad_norm": 14.394041061401367, | |
"learning_rate": 1.88e-05, | |
"loss": 0.2773, | |
"step": 3510 | |
}, | |
{ | |
"epoch": 1.88, | |
"grad_norm": 19.886205673217773, | |
"learning_rate": 1.8711111111111113e-05, | |
"loss": 0.3233, | |
"step": 3520 | |
}, | |
{ | |
"epoch": 1.88, | |
"grad_norm": 16.647079467773438, | |
"learning_rate": 1.8622222222222224e-05, | |
"loss": 0.2804, | |
"step": 3530 | |
}, | |
{ | |
"epoch": 1.89, | |
"grad_norm": 5.996079921722412, | |
"learning_rate": 1.8533333333333334e-05, | |
"loss": 0.2722, | |
"step": 3540 | |
}, | |
{ | |
"epoch": 1.89, | |
"grad_norm": 1.7200807332992554, | |
"learning_rate": 1.8444444444444445e-05, | |
"loss": 0.2704, | |
"step": 3550 | |
}, | |
{ | |
"epoch": 1.9, | |
"grad_norm": 11.222332000732422, | |
"learning_rate": 1.8355555555555555e-05, | |
"loss": 0.3672, | |
"step": 3560 | |
}, | |
{ | |
"epoch": 1.9, | |
"grad_norm": 27.67245864868164, | |
"learning_rate": 1.826666666666667e-05, | |
"loss": 0.3744, | |
"step": 3570 | |
}, | |
{ | |
"epoch": 1.91, | |
"grad_norm": 21.44915008544922, | |
"learning_rate": 1.817777777777778e-05, | |
"loss": 0.2362, | |
"step": 3580 | |
}, | |
{ | |
"epoch": 1.91, | |
"grad_norm": 34.626834869384766, | |
"learning_rate": 1.808888888888889e-05, | |
"loss": 0.2912, | |
"step": 3590 | |
}, | |
{ | |
"epoch": 1.92, | |
"grad_norm": 24.425785064697266, | |
"learning_rate": 1.8e-05, | |
"loss": 0.3451, | |
"step": 3600 | |
}, | |
{ | |
"epoch": 1.92, | |
"eval_accuracy": 0.8667, | |
"eval_f1_macro": 0.7857935048958724, | |
"eval_f1_micro": 0.8667, | |
"eval_loss": 0.5220906138420105, | |
"eval_runtime": 151.6123, | |
"eval_samples_per_second": 65.958, | |
"eval_steps_per_second": 2.064, | |
"step": 3600 | |
}, | |
{ | |
"epoch": 1.93, | |
"grad_norm": 19.154823303222656, | |
"learning_rate": 1.791111111111111e-05, | |
"loss": 0.3439, | |
"step": 3610 | |
}, | |
{ | |
"epoch": 1.93, | |
"grad_norm": 22.16014862060547, | |
"learning_rate": 1.7822222222222225e-05, | |
"loss": 0.2799, | |
"step": 3620 | |
}, | |
{ | |
"epoch": 1.94, | |
"grad_norm": 13.806198120117188, | |
"learning_rate": 1.7733333333333335e-05, | |
"loss": 0.2911, | |
"step": 3630 | |
}, | |
{ | |
"epoch": 1.94, | |
"grad_norm": 19.70717430114746, | |
"learning_rate": 1.7644444444444446e-05, | |
"loss": 0.2232, | |
"step": 3640 | |
}, | |
{ | |
"epoch": 1.95, | |
"grad_norm": 24.279129028320312, | |
"learning_rate": 1.7555555555555556e-05, | |
"loss": 0.3711, | |
"step": 3650 | |
}, | |
{ | |
"epoch": 1.95, | |
"grad_norm": 13.949311256408691, | |
"learning_rate": 1.7466666666666667e-05, | |
"loss": 0.3905, | |
"step": 3660 | |
}, | |
{ | |
"epoch": 1.96, | |
"grad_norm": 22.523921966552734, | |
"learning_rate": 1.737777777777778e-05, | |
"loss": 0.3294, | |
"step": 3670 | |
}, | |
{ | |
"epoch": 1.96, | |
"grad_norm": 27.370868682861328, | |
"learning_rate": 1.728888888888889e-05, | |
"loss": 0.3046, | |
"step": 3680 | |
}, | |
{ | |
"epoch": 1.97, | |
"grad_norm": 21.498058319091797, | |
"learning_rate": 1.7199999999999998e-05, | |
"loss": 0.4282, | |
"step": 3690 | |
}, | |
{ | |
"epoch": 1.97, | |
"grad_norm": 18.41233253479004, | |
"learning_rate": 1.7111111111111112e-05, | |
"loss": 0.3077, | |
"step": 3700 | |
}, | |
{ | |
"epoch": 1.97, | |
"eval_accuracy": 0.8662, | |
"eval_f1_macro": 0.7935499844173846, | |
"eval_f1_micro": 0.8662, | |
"eval_loss": 0.5041437745094299, | |
"eval_runtime": 151.0113, | |
"eval_samples_per_second": 66.22, | |
"eval_steps_per_second": 2.073, | |
"step": 3700 | |
}, | |
{ | |
"epoch": 1.98, | |
"grad_norm": 16.763723373413086, | |
"learning_rate": 1.7022222222222222e-05, | |
"loss": 0.225, | |
"step": 3710 | |
}, | |
{ | |
"epoch": 1.98, | |
"grad_norm": 11.353950500488281, | |
"learning_rate": 1.6933333333333333e-05, | |
"loss": 0.2651, | |
"step": 3720 | |
}, | |
{ | |
"epoch": 1.99, | |
"grad_norm": 17.139925003051758, | |
"learning_rate": 1.6844444444444447e-05, | |
"loss": 0.3122, | |
"step": 3730 | |
}, | |
{ | |
"epoch": 1.99, | |
"grad_norm": 16.368776321411133, | |
"learning_rate": 1.6755555555555557e-05, | |
"loss": 0.3058, | |
"step": 3740 | |
}, | |
{ | |
"epoch": 2.0, | |
"grad_norm": 20.346548080444336, | |
"learning_rate": 1.6666666666666667e-05, | |
"loss": 0.3383, | |
"step": 3750 | |
}, | |
{ | |
"epoch": 2.01, | |
"grad_norm": 9.95333480834961, | |
"learning_rate": 1.6577777777777778e-05, | |
"loss": 0.0964, | |
"step": 3760 | |
}, | |
{ | |
"epoch": 2.01, | |
"grad_norm": 5.907923221588135, | |
"learning_rate": 1.648888888888889e-05, | |
"loss": 0.1152, | |
"step": 3770 | |
}, | |
{ | |
"epoch": 2.02, | |
"grad_norm": 8.541373252868652, | |
"learning_rate": 1.6400000000000002e-05, | |
"loss": 0.131, | |
"step": 3780 | |
}, | |
{ | |
"epoch": 2.02, | |
"grad_norm": 14.729881286621094, | |
"learning_rate": 1.6311111111111113e-05, | |
"loss": 0.1403, | |
"step": 3790 | |
}, | |
{ | |
"epoch": 2.03, | |
"grad_norm": 9.04763412475586, | |
"learning_rate": 1.6222222222222223e-05, | |
"loss": 0.1352, | |
"step": 3800 | |
}, | |
{ | |
"epoch": 2.03, | |
"eval_accuracy": 0.8668, | |
"eval_f1_macro": 0.7919091255565409, | |
"eval_f1_micro": 0.8668, | |
"eval_loss": 0.5686902403831482, | |
"eval_runtime": 151.4998, | |
"eval_samples_per_second": 66.007, | |
"eval_steps_per_second": 2.066, | |
"step": 3800 | |
}, | |
{ | |
"epoch": 2.03, | |
"grad_norm": 6.057829856872559, | |
"learning_rate": 1.6133333333333334e-05, | |
"loss": 0.089, | |
"step": 3810 | |
}, | |
{ | |
"epoch": 2.04, | |
"grad_norm": 16.216115951538086, | |
"learning_rate": 1.6044444444444444e-05, | |
"loss": 0.1422, | |
"step": 3820 | |
}, | |
{ | |
"epoch": 2.04, | |
"grad_norm": 8.361082077026367, | |
"learning_rate": 1.5955555555555558e-05, | |
"loss": 0.0598, | |
"step": 3830 | |
}, | |
{ | |
"epoch": 2.05, | |
"grad_norm": 3.5720770359039307, | |
"learning_rate": 1.586666666666667e-05, | |
"loss": 0.0862, | |
"step": 3840 | |
}, | |
{ | |
"epoch": 2.05, | |
"grad_norm": 11.39939022064209, | |
"learning_rate": 1.577777777777778e-05, | |
"loss": 0.1106, | |
"step": 3850 | |
}, | |
{ | |
"epoch": 2.06, | |
"grad_norm": 15.31943416595459, | |
"learning_rate": 1.568888888888889e-05, | |
"loss": 0.0744, | |
"step": 3860 | |
}, | |
{ | |
"epoch": 2.06, | |
"grad_norm": 10.522315979003906, | |
"learning_rate": 1.56e-05, | |
"loss": 0.0734, | |
"step": 3870 | |
}, | |
{ | |
"epoch": 2.07, | |
"grad_norm": 12.624775886535645, | |
"learning_rate": 1.5511111111111114e-05, | |
"loss": 0.076, | |
"step": 3880 | |
}, | |
{ | |
"epoch": 2.07, | |
"grad_norm": 29.244462966918945, | |
"learning_rate": 1.5422222222222224e-05, | |
"loss": 0.127, | |
"step": 3890 | |
}, | |
{ | |
"epoch": 2.08, | |
"grad_norm": 6.5718607902526855, | |
"learning_rate": 1.5333333333333334e-05, | |
"loss": 0.1012, | |
"step": 3900 | |
}, | |
{ | |
"epoch": 2.08, | |
"eval_accuracy": 0.8651, | |
"eval_f1_macro": 0.7887873056662138, | |
"eval_f1_micro": 0.8651, | |
"eval_loss": 0.575444757938385, | |
"eval_runtime": 151.4945, | |
"eval_samples_per_second": 66.009, | |
"eval_steps_per_second": 2.066, | |
"step": 3900 | |
}, | |
{ | |
"epoch": 2.09, | |
"grad_norm": 7.431846618652344, | |
"learning_rate": 1.5244444444444445e-05, | |
"loss": 0.1201, | |
"step": 3910 | |
}, | |
{ | |
"epoch": 2.09, | |
"grad_norm": 11.772350311279297, | |
"learning_rate": 1.5155555555555555e-05, | |
"loss": 0.0808, | |
"step": 3920 | |
}, | |
{ | |
"epoch": 2.1, | |
"grad_norm": 6.622474193572998, | |
"learning_rate": 1.5066666666666668e-05, | |
"loss": 0.0708, | |
"step": 3930 | |
}, | |
{ | |
"epoch": 2.1, | |
"grad_norm": 0.8099313378334045, | |
"learning_rate": 1.497777777777778e-05, | |
"loss": 0.1169, | |
"step": 3940 | |
}, | |
{ | |
"epoch": 2.11, | |
"grad_norm": 11.817524909973145, | |
"learning_rate": 1.4888888888888888e-05, | |
"loss": 0.0881, | |
"step": 3950 | |
}, | |
{ | |
"epoch": 2.11, | |
"grad_norm": 17.54085350036621, | |
"learning_rate": 1.48e-05, | |
"loss": 0.0854, | |
"step": 3960 | |
}, | |
{ | |
"epoch": 2.12, | |
"grad_norm": 6.687868118286133, | |
"learning_rate": 1.4711111111111111e-05, | |
"loss": 0.0392, | |
"step": 3970 | |
}, | |
{ | |
"epoch": 2.12, | |
"grad_norm": 20.661422729492188, | |
"learning_rate": 1.4622222222222223e-05, | |
"loss": 0.1295, | |
"step": 3980 | |
}, | |
{ | |
"epoch": 2.13, | |
"grad_norm": 12.012689590454102, | |
"learning_rate": 1.4533333333333335e-05, | |
"loss": 0.0725, | |
"step": 3990 | |
}, | |
{ | |
"epoch": 2.13, | |
"grad_norm": 14.44579029083252, | |
"learning_rate": 1.4444444444444444e-05, | |
"loss": 0.1006, | |
"step": 4000 | |
}, | |
{ | |
"epoch": 2.13, | |
"eval_accuracy": 0.872, | |
"eval_f1_macro": 0.7958912080196285, | |
"eval_f1_micro": 0.872, | |
"eval_loss": 0.5928996801376343, | |
"eval_runtime": 151.575, | |
"eval_samples_per_second": 65.974, | |
"eval_steps_per_second": 2.065, | |
"step": 4000 | |
}, | |
{ | |
"epoch": 2.14, | |
"grad_norm": 12.146600723266602, | |
"learning_rate": 1.4355555555555556e-05, | |
"loss": 0.0758, | |
"step": 4010 | |
}, | |
{ | |
"epoch": 2.14, | |
"grad_norm": 6.771599769592285, | |
"learning_rate": 1.4266666666666667e-05, | |
"loss": 0.1324, | |
"step": 4020 | |
}, | |
{ | |
"epoch": 2.15, | |
"grad_norm": 6.837072372436523, | |
"learning_rate": 1.4177777777777779e-05, | |
"loss": 0.1205, | |
"step": 4030 | |
}, | |
{ | |
"epoch": 2.15, | |
"grad_norm": 11.44955825805664, | |
"learning_rate": 1.4088888888888891e-05, | |
"loss": 0.0877, | |
"step": 4040 | |
}, | |
{ | |
"epoch": 2.16, | |
"grad_norm": 19.20203399658203, | |
"learning_rate": 1.4000000000000001e-05, | |
"loss": 0.1092, | |
"step": 4050 | |
}, | |
{ | |
"epoch": 2.17, | |
"grad_norm": 17.78152084350586, | |
"learning_rate": 1.391111111111111e-05, | |
"loss": 0.1364, | |
"step": 4060 | |
}, | |
{ | |
"epoch": 2.17, | |
"grad_norm": 3.235673666000366, | |
"learning_rate": 1.3822222222222222e-05, | |
"loss": 0.0909, | |
"step": 4070 | |
}, | |
{ | |
"epoch": 2.18, | |
"grad_norm": 5.298550128936768, | |
"learning_rate": 1.3733333333333335e-05, | |
"loss": 0.0663, | |
"step": 4080 | |
}, | |
{ | |
"epoch": 2.18, | |
"grad_norm": 4.339886665344238, | |
"learning_rate": 1.3644444444444445e-05, | |
"loss": 0.0641, | |
"step": 4090 | |
}, | |
{ | |
"epoch": 2.19, | |
"grad_norm": 9.271440505981445, | |
"learning_rate": 1.3555555555555557e-05, | |
"loss": 0.0536, | |
"step": 4100 | |
}, | |
{ | |
"epoch": 2.19, | |
"eval_accuracy": 0.8739, | |
"eval_f1_macro": 0.7991612130458756, | |
"eval_f1_micro": 0.8739, | |
"eval_loss": 0.5760007500648499, | |
"eval_runtime": 151.2539, | |
"eval_samples_per_second": 66.114, | |
"eval_steps_per_second": 2.069, | |
"step": 4100 | |
}, | |
{ | |
"epoch": 2.19, | |
"grad_norm": 17.186298370361328, | |
"learning_rate": 1.3466666666666666e-05, | |
"loss": 0.0742, | |
"step": 4110 | |
}, | |
{ | |
"epoch": 2.2, | |
"grad_norm": 0.7883173227310181, | |
"learning_rate": 1.3377777777777778e-05, | |
"loss": 0.0655, | |
"step": 4120 | |
}, | |
{ | |
"epoch": 2.2, | |
"grad_norm": 9.044299125671387, | |
"learning_rate": 1.328888888888889e-05, | |
"loss": 0.0756, | |
"step": 4130 | |
}, | |
{ | |
"epoch": 2.21, | |
"grad_norm": 16.307199478149414, | |
"learning_rate": 1.32e-05, | |
"loss": 0.0511, | |
"step": 4140 | |
}, | |
{ | |
"epoch": 2.21, | |
"grad_norm": 7.927936553955078, | |
"learning_rate": 1.3111111111111113e-05, | |
"loss": 0.0507, | |
"step": 4150 | |
}, | |
{ | |
"epoch": 2.22, | |
"grad_norm": 3.2003839015960693, | |
"learning_rate": 1.3022222222222222e-05, | |
"loss": 0.089, | |
"step": 4160 | |
}, | |
{ | |
"epoch": 2.22, | |
"grad_norm": 1.9253557920455933, | |
"learning_rate": 1.2933333333333334e-05, | |
"loss": 0.0479, | |
"step": 4170 | |
}, | |
{ | |
"epoch": 2.23, | |
"grad_norm": 12.930800437927246, | |
"learning_rate": 1.2844444444444446e-05, | |
"loss": 0.0889, | |
"step": 4180 | |
}, | |
{ | |
"epoch": 2.23, | |
"grad_norm": 19.236953735351562, | |
"learning_rate": 1.2755555555555556e-05, | |
"loss": 0.1065, | |
"step": 4190 | |
}, | |
{ | |
"epoch": 2.24, | |
"grad_norm": 2.595717191696167, | |
"learning_rate": 1.2666666666666668e-05, | |
"loss": 0.0401, | |
"step": 4200 | |
}, | |
{ | |
"epoch": 2.24, | |
"eval_accuracy": 0.87, | |
"eval_f1_macro": 0.7935489170050238, | |
"eval_f1_micro": 0.87, | |
"eval_loss": 0.6250885128974915, | |
"eval_runtime": 151.4344, | |
"eval_samples_per_second": 66.035, | |
"eval_steps_per_second": 2.067, | |
"step": 4200 | |
}, | |
{ | |
"epoch": 2.25, | |
"grad_norm": 7.041379928588867, | |
"learning_rate": 1.2577777777777777e-05, | |
"loss": 0.0463, | |
"step": 4210 | |
}, | |
{ | |
"epoch": 2.25, | |
"grad_norm": 25.49098014831543, | |
"learning_rate": 1.248888888888889e-05, | |
"loss": 0.1407, | |
"step": 4220 | |
}, | |
{ | |
"epoch": 2.26, | |
"grad_norm": 2.6813275814056396, | |
"learning_rate": 1.24e-05, | |
"loss": 0.0726, | |
"step": 4230 | |
}, | |
{ | |
"epoch": 2.26, | |
"grad_norm": 3.9798381328582764, | |
"learning_rate": 1.2311111111111112e-05, | |
"loss": 0.0948, | |
"step": 4240 | |
}, | |
{ | |
"epoch": 2.27, | |
"grad_norm": 12.667770385742188, | |
"learning_rate": 1.2222222222222222e-05, | |
"loss": 0.0802, | |
"step": 4250 | |
}, | |
{ | |
"epoch": 2.27, | |
"grad_norm": 16.500022888183594, | |
"learning_rate": 1.2133333333333335e-05, | |
"loss": 0.0976, | |
"step": 4260 | |
}, | |
{ | |
"epoch": 2.28, | |
"grad_norm": 6.598761558532715, | |
"learning_rate": 1.2044444444444445e-05, | |
"loss": 0.0522, | |
"step": 4270 | |
}, | |
{ | |
"epoch": 2.28, | |
"grad_norm": 8.615388870239258, | |
"learning_rate": 1.1955555555555556e-05, | |
"loss": 0.0968, | |
"step": 4280 | |
}, | |
{ | |
"epoch": 2.29, | |
"grad_norm": 2.7019007205963135, | |
"learning_rate": 1.1866666666666668e-05, | |
"loss": 0.0882, | |
"step": 4290 | |
}, | |
{ | |
"epoch": 2.29, | |
"grad_norm": 2.509350538253784, | |
"learning_rate": 1.1777777777777778e-05, | |
"loss": 0.0756, | |
"step": 4300 | |
}, | |
{ | |
"epoch": 2.29, | |
"eval_accuracy": 0.8709, | |
"eval_f1_macro": 0.8027359908802645, | |
"eval_f1_micro": 0.8709, | |
"eval_loss": 0.5894597172737122, | |
"eval_runtime": 151.3844, | |
"eval_samples_per_second": 66.057, | |
"eval_steps_per_second": 2.068, | |
"step": 4300 | |
}, | |
{ | |
"epoch": 2.3, | |
"grad_norm": 8.408510208129883, | |
"learning_rate": 1.168888888888889e-05, | |
"loss": 0.1088, | |
"step": 4310 | |
}, | |
{ | |
"epoch": 2.3, | |
"grad_norm": 17.875926971435547, | |
"learning_rate": 1.16e-05, | |
"loss": 0.0945, | |
"step": 4320 | |
}, | |
{ | |
"epoch": 2.31, | |
"grad_norm": 11.225419998168945, | |
"learning_rate": 1.1511111111111111e-05, | |
"loss": 0.1098, | |
"step": 4330 | |
}, | |
{ | |
"epoch": 2.31, | |
"grad_norm": 14.77673053741455, | |
"learning_rate": 1.1422222222222223e-05, | |
"loss": 0.1175, | |
"step": 4340 | |
}, | |
{ | |
"epoch": 2.32, | |
"grad_norm": 17.81058692932129, | |
"learning_rate": 1.1333333333333334e-05, | |
"loss": 0.1149, | |
"step": 4350 | |
}, | |
{ | |
"epoch": 2.33, | |
"grad_norm": 1.26676607131958, | |
"learning_rate": 1.1244444444444444e-05, | |
"loss": 0.0419, | |
"step": 4360 | |
}, | |
{ | |
"epoch": 2.33, | |
"grad_norm": 12.364920616149902, | |
"learning_rate": 1.1155555555555556e-05, | |
"loss": 0.1475, | |
"step": 4370 | |
}, | |
{ | |
"epoch": 2.34, | |
"grad_norm": 7.148777484893799, | |
"learning_rate": 1.1066666666666667e-05, | |
"loss": 0.1141, | |
"step": 4380 | |
}, | |
{ | |
"epoch": 2.34, | |
"grad_norm": 10.524274826049805, | |
"learning_rate": 1.0977777777777779e-05, | |
"loss": 0.1338, | |
"step": 4390 | |
}, | |
{ | |
"epoch": 2.35, | |
"grad_norm": 7.37857723236084, | |
"learning_rate": 1.088888888888889e-05, | |
"loss": 0.0501, | |
"step": 4400 | |
}, | |
{ | |
"epoch": 2.35, | |
"eval_accuracy": 0.8707, | |
"eval_f1_macro": 0.7961653209110884, | |
"eval_f1_micro": 0.8707, | |
"eval_loss": 0.5434012413024902, | |
"eval_runtime": 151.1876, | |
"eval_samples_per_second": 66.143, | |
"eval_steps_per_second": 2.07, | |
"step": 4400 | |
}, | |
{ | |
"epoch": 2.35, | |
"grad_norm": 16.111724853515625, | |
"learning_rate": 1.08e-05, | |
"loss": 0.1003, | |
"step": 4410 | |
}, | |
{ | |
"epoch": 2.36, | |
"grad_norm": 6.396437168121338, | |
"learning_rate": 1.0711111111111112e-05, | |
"loss": 0.088, | |
"step": 4420 | |
}, | |
{ | |
"epoch": 2.36, | |
"grad_norm": 19.40189552307129, | |
"learning_rate": 1.0622222222222223e-05, | |
"loss": 0.1436, | |
"step": 4430 | |
}, | |
{ | |
"epoch": 2.37, | |
"grad_norm": 24.316781997680664, | |
"learning_rate": 1.0533333333333335e-05, | |
"loss": 0.0873, | |
"step": 4440 | |
}, | |
{ | |
"epoch": 2.37, | |
"grad_norm": 4.808385372161865, | |
"learning_rate": 1.0444444444444445e-05, | |
"loss": 0.0696, | |
"step": 4450 | |
}, | |
{ | |
"epoch": 2.38, | |
"grad_norm": 1.6188451051712036, | |
"learning_rate": 1.0355555555555556e-05, | |
"loss": 0.1395, | |
"step": 4460 | |
}, | |
{ | |
"epoch": 2.38, | |
"grad_norm": 17.673221588134766, | |
"learning_rate": 1.0266666666666668e-05, | |
"loss": 0.0832, | |
"step": 4470 | |
}, | |
{ | |
"epoch": 2.39, | |
"grad_norm": 11.216964721679688, | |
"learning_rate": 1.0177777777777778e-05, | |
"loss": 0.0707, | |
"step": 4480 | |
}, | |
{ | |
"epoch": 2.39, | |
"grad_norm": 10.152276992797852, | |
"learning_rate": 1.0088888888888889e-05, | |
"loss": 0.0895, | |
"step": 4490 | |
}, | |
{ | |
"epoch": 2.4, | |
"grad_norm": 11.688224792480469, | |
"learning_rate": 1e-05, | |
"loss": 0.0611, | |
"step": 4500 | |
}, | |
{ | |
"epoch": 2.4, | |
"eval_accuracy": 0.8759, | |
"eval_f1_macro": 0.8042158552214435, | |
"eval_f1_micro": 0.8759, | |
"eval_loss": 0.594874918460846, | |
"eval_runtime": 151.3642, | |
"eval_samples_per_second": 66.066, | |
"eval_steps_per_second": 2.068, | |
"step": 4500 | |
}, | |
{ | |
"epoch": 2.41, | |
"grad_norm": 6.995627403259277, | |
"learning_rate": 9.911111111111111e-06, | |
"loss": 0.0986, | |
"step": 4510 | |
}, | |
{ | |
"epoch": 2.41, | |
"grad_norm": 5.848443031311035, | |
"learning_rate": 9.822222222222223e-06, | |
"loss": 0.0719, | |
"step": 4520 | |
}, | |
{ | |
"epoch": 2.42, | |
"grad_norm": 12.806991577148438, | |
"learning_rate": 9.733333333333334e-06, | |
"loss": 0.1492, | |
"step": 4530 | |
}, | |
{ | |
"epoch": 2.42, | |
"grad_norm": 17.90582847595215, | |
"learning_rate": 9.644444444444444e-06, | |
"loss": 0.1033, | |
"step": 4540 | |
}, | |
{ | |
"epoch": 2.43, | |
"grad_norm": 14.297840118408203, | |
"learning_rate": 9.555555555555556e-06, | |
"loss": 0.0654, | |
"step": 4550 | |
}, | |
{ | |
"epoch": 2.43, | |
"grad_norm": 27.18831443786621, | |
"learning_rate": 9.466666666666667e-06, | |
"loss": 0.1117, | |
"step": 4560 | |
}, | |
{ | |
"epoch": 2.44, | |
"grad_norm": 4.1261420249938965, | |
"learning_rate": 9.377777777777779e-06, | |
"loss": 0.0619, | |
"step": 4570 | |
}, | |
{ | |
"epoch": 2.44, | |
"grad_norm": 3.1558353900909424, | |
"learning_rate": 9.288888888888888e-06, | |
"loss": 0.0841, | |
"step": 4580 | |
}, | |
{ | |
"epoch": 2.45, | |
"grad_norm": 16.345598220825195, | |
"learning_rate": 9.2e-06, | |
"loss": 0.0423, | |
"step": 4590 | |
}, | |
{ | |
"epoch": 2.45, | |
"grad_norm": 14.02190113067627, | |
"learning_rate": 9.111111111111112e-06, | |
"loss": 0.081, | |
"step": 4600 | |
}, | |
{ | |
"epoch": 2.45, | |
"eval_accuracy": 0.8787, | |
"eval_f1_macro": 0.8122250468168198, | |
"eval_f1_micro": 0.8787, | |
"eval_loss": 0.6089494228363037, | |
"eval_runtime": 151.2228, | |
"eval_samples_per_second": 66.128, | |
"eval_steps_per_second": 2.07, | |
"step": 4600 | |
}, | |
{ | |
"epoch": 2.46, | |
"grad_norm": 14.47243881225586, | |
"learning_rate": 9.022222222222223e-06, | |
"loss": 0.0982, | |
"step": 4610 | |
}, | |
{ | |
"epoch": 2.46, | |
"grad_norm": 19.576120376586914, | |
"learning_rate": 8.933333333333333e-06, | |
"loss": 0.1043, | |
"step": 4620 | |
}, | |
{ | |
"epoch": 2.47, | |
"grad_norm": 11.853041648864746, | |
"learning_rate": 8.844444444444445e-06, | |
"loss": 0.1342, | |
"step": 4630 | |
}, | |
{ | |
"epoch": 2.47, | |
"grad_norm": 16.51972198486328, | |
"learning_rate": 8.755555555555556e-06, | |
"loss": 0.0949, | |
"step": 4640 | |
}, | |
{ | |
"epoch": 2.48, | |
"grad_norm": 10.615642547607422, | |
"learning_rate": 8.666666666666668e-06, | |
"loss": 0.0783, | |
"step": 4650 | |
}, | |
{ | |
"epoch": 2.49, | |
"grad_norm": 14.698683738708496, | |
"learning_rate": 8.577777777777778e-06, | |
"loss": 0.0649, | |
"step": 4660 | |
}, | |
{ | |
"epoch": 2.49, | |
"grad_norm": 11.355724334716797, | |
"learning_rate": 8.488888888888889e-06, | |
"loss": 0.096, | |
"step": 4670 | |
}, | |
{ | |
"epoch": 2.5, | |
"grad_norm": 0.6558987498283386, | |
"learning_rate": 8.400000000000001e-06, | |
"loss": 0.0983, | |
"step": 4680 | |
}, | |
{ | |
"epoch": 2.5, | |
"grad_norm": 23.330352783203125, | |
"learning_rate": 8.311111111111111e-06, | |
"loss": 0.1113, | |
"step": 4690 | |
}, | |
{ | |
"epoch": 2.51, | |
"grad_norm": 25.149642944335938, | |
"learning_rate": 8.222222222222223e-06, | |
"loss": 0.1033, | |
"step": 4700 | |
}, | |
{ | |
"epoch": 2.51, | |
"eval_accuracy": 0.8752, | |
"eval_f1_macro": 0.8106859211414766, | |
"eval_f1_micro": 0.8752, | |
"eval_loss": 0.5790488123893738, | |
"eval_runtime": 151.0, | |
"eval_samples_per_second": 66.225, | |
"eval_steps_per_second": 2.073, | |
"step": 4700 | |
}, | |
{ | |
"epoch": 2.51, | |
"grad_norm": 9.525333404541016, | |
"learning_rate": 8.133333333333332e-06, | |
"loss": 0.0938, | |
"step": 4710 | |
}, | |
{ | |
"epoch": 2.52, | |
"grad_norm": 11.757851600646973, | |
"learning_rate": 8.044444444444444e-06, | |
"loss": 0.0564, | |
"step": 4720 | |
}, | |
{ | |
"epoch": 2.52, | |
"grad_norm": 8.882828712463379, | |
"learning_rate": 7.955555555555557e-06, | |
"loss": 0.0632, | |
"step": 4730 | |
}, | |
{ | |
"epoch": 2.53, | |
"grad_norm": 4.400601863861084, | |
"learning_rate": 7.866666666666667e-06, | |
"loss": 0.0778, | |
"step": 4740 | |
}, | |
{ | |
"epoch": 2.53, | |
"grad_norm": 2.988673210144043, | |
"learning_rate": 7.777777777777777e-06, | |
"loss": 0.0504, | |
"step": 4750 | |
}, | |
{ | |
"epoch": 2.54, | |
"grad_norm": 12.651586532592773, | |
"learning_rate": 7.68888888888889e-06, | |
"loss": 0.0626, | |
"step": 4760 | |
}, | |
{ | |
"epoch": 2.54, | |
"grad_norm": 5.021940231323242, | |
"learning_rate": 7.6e-06, | |
"loss": 0.079, | |
"step": 4770 | |
}, | |
{ | |
"epoch": 2.55, | |
"grad_norm": 10.836983680725098, | |
"learning_rate": 7.511111111111112e-06, | |
"loss": 0.0636, | |
"step": 4780 | |
}, | |
{ | |
"epoch": 2.55, | |
"grad_norm": 14.971363067626953, | |
"learning_rate": 7.422222222222222e-06, | |
"loss": 0.0602, | |
"step": 4790 | |
}, | |
{ | |
"epoch": 2.56, | |
"grad_norm": 12.086758613586426, | |
"learning_rate": 7.333333333333334e-06, | |
"loss": 0.1131, | |
"step": 4800 | |
}, | |
{ | |
"epoch": 2.56, | |
"eval_accuracy": 0.8747, | |
"eval_f1_macro": 0.8036013701000208, | |
"eval_f1_micro": 0.8747, | |
"eval_loss": 0.5827542543411255, | |
"eval_runtime": 151.0845, | |
"eval_samples_per_second": 66.188, | |
"eval_steps_per_second": 2.072, | |
"step": 4800 | |
}, | |
{ | |
"epoch": 2.57, | |
"grad_norm": 15.703210830688477, | |
"learning_rate": 7.244444444444445e-06, | |
"loss": 0.0659, | |
"step": 4810 | |
}, | |
{ | |
"epoch": 2.57, | |
"grad_norm": 24.10576629638672, | |
"learning_rate": 7.155555555555556e-06, | |
"loss": 0.0621, | |
"step": 4820 | |
}, | |
{ | |
"epoch": 2.58, | |
"grad_norm": 17.84850311279297, | |
"learning_rate": 7.066666666666667e-06, | |
"loss": 0.06, | |
"step": 4830 | |
}, | |
{ | |
"epoch": 2.58, | |
"grad_norm": 14.490592956542969, | |
"learning_rate": 6.9777777777777775e-06, | |
"loss": 0.1885, | |
"step": 4840 | |
}, | |
{ | |
"epoch": 2.59, | |
"grad_norm": 15.106103897094727, | |
"learning_rate": 6.888888888888889e-06, | |
"loss": 0.0664, | |
"step": 4850 | |
}, | |
{ | |
"epoch": 2.59, | |
"grad_norm": 11.113916397094727, | |
"learning_rate": 6.800000000000001e-06, | |
"loss": 0.093, | |
"step": 4860 | |
}, | |
{ | |
"epoch": 2.6, | |
"grad_norm": 15.545525550842285, | |
"learning_rate": 6.711111111111111e-06, | |
"loss": 0.0892, | |
"step": 4870 | |
}, | |
{ | |
"epoch": 2.6, | |
"grad_norm": 14.384480476379395, | |
"learning_rate": 6.622222222222223e-06, | |
"loss": 0.0765, | |
"step": 4880 | |
}, | |
{ | |
"epoch": 2.61, | |
"grad_norm": 4.170871734619141, | |
"learning_rate": 6.533333333333333e-06, | |
"loss": 0.0553, | |
"step": 4890 | |
}, | |
{ | |
"epoch": 2.61, | |
"grad_norm": 10.818324089050293, | |
"learning_rate": 6.4444444444444445e-06, | |
"loss": 0.094, | |
"step": 4900 | |
}, | |
{ | |
"epoch": 2.61, | |
"eval_accuracy": 0.878, | |
"eval_f1_macro": 0.8106900956656236, | |
"eval_f1_micro": 0.878, | |
"eval_loss": 0.5612391829490662, | |
"eval_runtime": 151.0215, | |
"eval_samples_per_second": 66.216, | |
"eval_steps_per_second": 2.073, | |
"step": 4900 | |
}, | |
{ | |
"epoch": 2.62, | |
"grad_norm": 9.160780906677246, | |
"learning_rate": 6.355555555555557e-06, | |
"loss": 0.0859, | |
"step": 4910 | |
}, | |
{ | |
"epoch": 2.62, | |
"grad_norm": 13.956222534179688, | |
"learning_rate": 6.266666666666666e-06, | |
"loss": 0.0756, | |
"step": 4920 | |
}, | |
{ | |
"epoch": 2.63, | |
"grad_norm": 1.7093982696533203, | |
"learning_rate": 6.177777777777778e-06, | |
"loss": 0.0385, | |
"step": 4930 | |
}, | |
{ | |
"epoch": 2.63, | |
"grad_norm": 18.47127342224121, | |
"learning_rate": 6.088888888888889e-06, | |
"loss": 0.0643, | |
"step": 4940 | |
}, | |
{ | |
"epoch": 2.64, | |
"grad_norm": 5.761905670166016, | |
"learning_rate": 6e-06, | |
"loss": 0.0387, | |
"step": 4950 | |
}, | |
{ | |
"epoch": 2.65, | |
"grad_norm": 11.518508911132812, | |
"learning_rate": 5.9111111111111115e-06, | |
"loss": 0.0857, | |
"step": 4960 | |
}, | |
{ | |
"epoch": 2.65, | |
"grad_norm": 11.523582458496094, | |
"learning_rate": 5.822222222222223e-06, | |
"loss": 0.0607, | |
"step": 4970 | |
}, | |
{ | |
"epoch": 2.66, | |
"grad_norm": 3.5160629749298096, | |
"learning_rate": 5.733333333333333e-06, | |
"loss": 0.0965, | |
"step": 4980 | |
}, | |
{ | |
"epoch": 2.66, | |
"grad_norm": 16.30809783935547, | |
"learning_rate": 5.6444444444444445e-06, | |
"loss": 0.0392, | |
"step": 4990 | |
}, | |
{ | |
"epoch": 2.67, | |
"grad_norm": 12.250411987304688, | |
"learning_rate": 5.555555555555556e-06, | |
"loss": 0.0853, | |
"step": 5000 | |
}, | |
{ | |
"epoch": 2.67, | |
"eval_accuracy": 0.8784, | |
"eval_f1_macro": 0.8123002988678545, | |
"eval_f1_micro": 0.8784, | |
"eval_loss": 0.5772224068641663, | |
"eval_runtime": 151.0336, | |
"eval_samples_per_second": 66.21, | |
"eval_steps_per_second": 2.072, | |
"step": 5000 | |
}, | |
{ | |
"epoch": 2.67, | |
"grad_norm": 18.300569534301758, | |
"learning_rate": 5.466666666666667e-06, | |
"loss": 0.0925, | |
"step": 5010 | |
}, | |
{ | |
"epoch": 2.68, | |
"grad_norm": 4.053598403930664, | |
"learning_rate": 5.3777777777777784e-06, | |
"loss": 0.0906, | |
"step": 5020 | |
}, | |
{ | |
"epoch": 2.68, | |
"grad_norm": 3.2842700481414795, | |
"learning_rate": 5.288888888888889e-06, | |
"loss": 0.0562, | |
"step": 5030 | |
}, | |
{ | |
"epoch": 2.69, | |
"grad_norm": 1.8992033004760742, | |
"learning_rate": 5.2e-06, | |
"loss": 0.0421, | |
"step": 5040 | |
}, | |
{ | |
"epoch": 2.69, | |
"grad_norm": 4.324485778808594, | |
"learning_rate": 5.1111111111111115e-06, | |
"loss": 0.0704, | |
"step": 5050 | |
}, | |
{ | |
"epoch": 2.7, | |
"grad_norm": 10.114728927612305, | |
"learning_rate": 5.022222222222223e-06, | |
"loss": 0.0568, | |
"step": 5060 | |
}, | |
{ | |
"epoch": 2.7, | |
"grad_norm": 3.556407928466797, | |
"learning_rate": 4.933333333333333e-06, | |
"loss": 0.0487, | |
"step": 5070 | |
}, | |
{ | |
"epoch": 2.71, | |
"grad_norm": 11.58324146270752, | |
"learning_rate": 4.8444444444444446e-06, | |
"loss": 0.1051, | |
"step": 5080 | |
}, | |
{ | |
"epoch": 2.71, | |
"grad_norm": 12.190340042114258, | |
"learning_rate": 4.755555555555556e-06, | |
"loss": 0.0609, | |
"step": 5090 | |
}, | |
{ | |
"epoch": 2.72, | |
"grad_norm": 10.30691909790039, | |
"learning_rate": 4.666666666666667e-06, | |
"loss": 0.0917, | |
"step": 5100 | |
}, | |
{ | |
"epoch": 2.72, | |
"eval_accuracy": 0.8805, | |
"eval_f1_macro": 0.8123461705531486, | |
"eval_f1_micro": 0.8805, | |
"eval_loss": 0.559500515460968, | |
"eval_runtime": 150.9375, | |
"eval_samples_per_second": 66.253, | |
"eval_steps_per_second": 2.074, | |
"step": 5100 | |
}, | |
{ | |
"epoch": 2.73, | |
"grad_norm": 15.113903045654297, | |
"learning_rate": 4.5777777777777785e-06, | |
"loss": 0.0634, | |
"step": 5110 | |
}, | |
{ | |
"epoch": 2.73, | |
"grad_norm": 3.779010534286499, | |
"learning_rate": 4.488888888888889e-06, | |
"loss": 0.0507, | |
"step": 5120 | |
}, | |
{ | |
"epoch": 2.74, | |
"grad_norm": 5.111919403076172, | |
"learning_rate": 4.4e-06, | |
"loss": 0.0214, | |
"step": 5130 | |
}, | |
{ | |
"epoch": 2.74, | |
"grad_norm": 0.48539718985557556, | |
"learning_rate": 4.3111111111111115e-06, | |
"loss": 0.0986, | |
"step": 5140 | |
}, | |
{ | |
"epoch": 2.75, | |
"grad_norm": 7.374268531799316, | |
"learning_rate": 4.222222222222223e-06, | |
"loss": 0.047, | |
"step": 5150 | |
}, | |
{ | |
"epoch": 2.75, | |
"grad_norm": 12.559208869934082, | |
"learning_rate": 4.133333333333333e-06, | |
"loss": 0.0851, | |
"step": 5160 | |
}, | |
{ | |
"epoch": 2.76, | |
"grad_norm": 5.660250663757324, | |
"learning_rate": 4.044444444444445e-06, | |
"loss": 0.0488, | |
"step": 5170 | |
}, | |
{ | |
"epoch": 2.76, | |
"grad_norm": 4.466784477233887, | |
"learning_rate": 3.955555555555555e-06, | |
"loss": 0.1213, | |
"step": 5180 | |
}, | |
{ | |
"epoch": 2.77, | |
"grad_norm": 6.510303020477295, | |
"learning_rate": 3.866666666666667e-06, | |
"loss": 0.0179, | |
"step": 5190 | |
}, | |
{ | |
"epoch": 2.77, | |
"grad_norm": 6.136260032653809, | |
"learning_rate": 3.777777777777778e-06, | |
"loss": 0.0542, | |
"step": 5200 | |
}, | |
{ | |
"epoch": 2.77, | |
"eval_accuracy": 0.8814, | |
"eval_f1_macro": 0.8146631928155758, | |
"eval_f1_micro": 0.8814, | |
"eval_loss": 0.5781938433647156, | |
"eval_runtime": 151.022, | |
"eval_samples_per_second": 66.216, | |
"eval_steps_per_second": 2.073, | |
"step": 5200 | |
}, | |
{ | |
"epoch": 2.78, | |
"grad_norm": 4.440271854400635, | |
"learning_rate": 3.688888888888889e-06, | |
"loss": 0.0422, | |
"step": 5210 | |
}, | |
{ | |
"epoch": 2.78, | |
"grad_norm": 2.652446746826172, | |
"learning_rate": 3.6e-06, | |
"loss": 0.056, | |
"step": 5220 | |
}, | |
{ | |
"epoch": 2.79, | |
"grad_norm": 10.915148735046387, | |
"learning_rate": 3.5111111111111116e-06, | |
"loss": 0.0615, | |
"step": 5230 | |
}, | |
{ | |
"epoch": 2.79, | |
"grad_norm": 11.108790397644043, | |
"learning_rate": 3.4222222222222224e-06, | |
"loss": 0.0639, | |
"step": 5240 | |
}, | |
{ | |
"epoch": 2.8, | |
"grad_norm": 16.01081085205078, | |
"learning_rate": 3.3333333333333333e-06, | |
"loss": 0.0517, | |
"step": 5250 | |
}, | |
{ | |
"epoch": 2.81, | |
"grad_norm": 9.080989837646484, | |
"learning_rate": 3.244444444444444e-06, | |
"loss": 0.0594, | |
"step": 5260 | |
}, | |
{ | |
"epoch": 2.81, | |
"grad_norm": 15.85318660736084, | |
"learning_rate": 3.155555555555556e-06, | |
"loss": 0.0511, | |
"step": 5270 | |
}, | |
{ | |
"epoch": 2.82, | |
"grad_norm": 13.549962043762207, | |
"learning_rate": 3.066666666666667e-06, | |
"loss": 0.0996, | |
"step": 5280 | |
}, | |
{ | |
"epoch": 2.82, | |
"grad_norm": 2.0129482746124268, | |
"learning_rate": 2.977777777777778e-06, | |
"loss": 0.0704, | |
"step": 5290 | |
}, | |
{ | |
"epoch": 2.83, | |
"grad_norm": 0.5195946097373962, | |
"learning_rate": 2.888888888888889e-06, | |
"loss": 0.0754, | |
"step": 5300 | |
}, | |
{ | |
"epoch": 2.83, | |
"eval_accuracy": 0.8821, | |
"eval_f1_macro": 0.8171083228509614, | |
"eval_f1_micro": 0.8821, | |
"eval_loss": 0.5936337113380432, | |
"eval_runtime": 151.1685, | |
"eval_samples_per_second": 66.151, | |
"eval_steps_per_second": 2.071, | |
"step": 5300 | |
}, | |
{ | |
"epoch": 2.83, | |
"grad_norm": 17.216556549072266, | |
"learning_rate": 2.8000000000000003e-06, | |
"loss": 0.0802, | |
"step": 5310 | |
}, | |
{ | |
"epoch": 2.84, | |
"grad_norm": 15.105179786682129, | |
"learning_rate": 2.711111111111111e-06, | |
"loss": 0.0701, | |
"step": 5320 | |
}, | |
{ | |
"epoch": 2.84, | |
"grad_norm": 3.2868239879608154, | |
"learning_rate": 2.6222222222222225e-06, | |
"loss": 0.0523, | |
"step": 5330 | |
}, | |
{ | |
"epoch": 2.85, | |
"grad_norm": 17.145092010498047, | |
"learning_rate": 2.5333333333333334e-06, | |
"loss": 0.1424, | |
"step": 5340 | |
}, | |
{ | |
"epoch": 2.85, | |
"grad_norm": 9.835216522216797, | |
"learning_rate": 2.4444444444444447e-06, | |
"loss": 0.0504, | |
"step": 5350 | |
}, | |
{ | |
"epoch": 2.86, | |
"grad_norm": 9.431855201721191, | |
"learning_rate": 2.3555555555555555e-06, | |
"loss": 0.076, | |
"step": 5360 | |
}, | |
{ | |
"epoch": 2.86, | |
"grad_norm": 0.8578177690505981, | |
"learning_rate": 2.266666666666667e-06, | |
"loss": 0.045, | |
"step": 5370 | |
}, | |
{ | |
"epoch": 2.87, | |
"grad_norm": 3.6194913387298584, | |
"learning_rate": 2.1777777777777777e-06, | |
"loss": 0.0414, | |
"step": 5380 | |
}, | |
{ | |
"epoch": 2.87, | |
"grad_norm": 10.720399856567383, | |
"learning_rate": 2.088888888888889e-06, | |
"loss": 0.0757, | |
"step": 5390 | |
}, | |
{ | |
"epoch": 2.88, | |
"grad_norm": 11.850172996520996, | |
"learning_rate": 2.0000000000000003e-06, | |
"loss": 0.1001, | |
"step": 5400 | |
}, | |
{ | |
"epoch": 2.88, | |
"eval_accuracy": 0.8827, | |
"eval_f1_macro": 0.8156503347773745, | |
"eval_f1_micro": 0.8827, | |
"eval_loss": 0.5625789761543274, | |
"eval_runtime": 151.0652, | |
"eval_samples_per_second": 66.197, | |
"eval_steps_per_second": 2.072, | |
"step": 5400 | |
}, | |
{ | |
"epoch": 2.89, | |
"grad_norm": 3.0367789268493652, | |
"learning_rate": 1.9111111111111112e-06, | |
"loss": 0.0614, | |
"step": 5410 | |
}, | |
{ | |
"epoch": 2.89, | |
"grad_norm": 0.6901952028274536, | |
"learning_rate": 1.8222222222222223e-06, | |
"loss": 0.0359, | |
"step": 5420 | |
}, | |
{ | |
"epoch": 2.9, | |
"grad_norm": 6.812145709991455, | |
"learning_rate": 1.7333333333333334e-06, | |
"loss": 0.0217, | |
"step": 5430 | |
}, | |
{ | |
"epoch": 2.9, | |
"grad_norm": 9.04457950592041, | |
"learning_rate": 1.6444444444444447e-06, | |
"loss": 0.0469, | |
"step": 5440 | |
}, | |
{ | |
"epoch": 2.91, | |
"grad_norm": 21.277233123779297, | |
"learning_rate": 1.5555555555555556e-06, | |
"loss": 0.0893, | |
"step": 5450 | |
}, | |
{ | |
"epoch": 2.91, | |
"grad_norm": 9.495359420776367, | |
"learning_rate": 1.4666666666666667e-06, | |
"loss": 0.0607, | |
"step": 5460 | |
}, | |
{ | |
"epoch": 2.92, | |
"grad_norm": 9.839810371398926, | |
"learning_rate": 1.3777777777777778e-06, | |
"loss": 0.066, | |
"step": 5470 | |
}, | |
{ | |
"epoch": 2.92, | |
"grad_norm": 17.838457107543945, | |
"learning_rate": 1.2888888888888889e-06, | |
"loss": 0.0549, | |
"step": 5480 | |
}, | |
{ | |
"epoch": 2.93, | |
"grad_norm": 5.896942138671875, | |
"learning_rate": 1.2000000000000002e-06, | |
"loss": 0.0468, | |
"step": 5490 | |
}, | |
{ | |
"epoch": 2.93, | |
"grad_norm": 0.5075032711029053, | |
"learning_rate": 1.1111111111111112e-06, | |
"loss": 0.0311, | |
"step": 5500 | |
}, | |
{ | |
"epoch": 2.93, | |
"eval_accuracy": 0.8818, | |
"eval_f1_macro": 0.8151866330745442, | |
"eval_f1_micro": 0.8818, | |
"eval_loss": 0.5690019726753235, | |
"eval_runtime": 151.3749, | |
"eval_samples_per_second": 66.061, | |
"eval_steps_per_second": 2.068, | |
"step": 5500 | |
}, | |
{ | |
"epoch": 2.94, | |
"grad_norm": 15.075337409973145, | |
"learning_rate": 1.0222222222222223e-06, | |
"loss": 0.0463, | |
"step": 5510 | |
}, | |
{ | |
"epoch": 2.94, | |
"grad_norm": 7.638895511627197, | |
"learning_rate": 9.333333333333334e-07, | |
"loss": 0.0505, | |
"step": 5520 | |
}, | |
{ | |
"epoch": 2.95, | |
"grad_norm": 21.062416076660156, | |
"learning_rate": 8.444444444444444e-07, | |
"loss": 0.0888, | |
"step": 5530 | |
}, | |
{ | |
"epoch": 2.95, | |
"grad_norm": 11.800668716430664, | |
"learning_rate": 7.555555555555556e-07, | |
"loss": 0.0678, | |
"step": 5540 | |
}, | |
{ | |
"epoch": 2.96, | |
"grad_norm": 3.7984507083892822, | |
"learning_rate": 6.666666666666667e-07, | |
"loss": 0.0496, | |
"step": 5550 | |
}, | |
{ | |
"epoch": 2.97, | |
"grad_norm": 13.194000244140625, | |
"learning_rate": 5.777777777777778e-07, | |
"loss": 0.0778, | |
"step": 5560 | |
}, | |
{ | |
"epoch": 2.97, | |
"grad_norm": 4.690216064453125, | |
"learning_rate": 4.888888888888889e-07, | |
"loss": 0.0592, | |
"step": 5570 | |
}, | |
{ | |
"epoch": 2.98, | |
"grad_norm": 0.8871315121650696, | |
"learning_rate": 4.0000000000000003e-07, | |
"loss": 0.0297, | |
"step": 5580 | |
}, | |
{ | |
"epoch": 2.98, | |
"grad_norm": 9.130186080932617, | |
"learning_rate": 3.111111111111111e-07, | |
"loss": 0.0527, | |
"step": 5590 | |
}, | |
{ | |
"epoch": 2.99, | |
"grad_norm": 1.8399721384048462, | |
"learning_rate": 2.2222222222222224e-07, | |
"loss": 0.03, | |
"step": 5600 | |
}, | |
{ | |
"epoch": 2.99, | |
"eval_accuracy": 0.8831, | |
"eval_f1_macro": 0.8171086031295777, | |
"eval_f1_micro": 0.8831, | |
"eval_loss": 0.5688170790672302, | |
"eval_runtime": 151.3507, | |
"eval_samples_per_second": 66.072, | |
"eval_steps_per_second": 2.068, | |
"step": 5600 | |
}, | |
{ | |
"epoch": 2.99, | |
"grad_norm": 6.7230143547058105, | |
"learning_rate": 1.3333333333333334e-07, | |
"loss": 0.078, | |
"step": 5610 | |
}, | |
{ | |
"epoch": 3.0, | |
"grad_norm": 5.4988532066345215, | |
"learning_rate": 4.444444444444445e-08, | |
"loss": 0.0337, | |
"step": 5620 | |
}, | |
{ | |
"epoch": 3.0, | |
"step": 5625, | |
"total_flos": 1.126917998641152e+18, | |
"train_loss": 0.4508466554853651, | |
"train_runtime": 15613.1858, | |
"train_samples_per_second": 11.529, | |
"train_steps_per_second": 0.36 | |
} | |
], | |
"logging_steps": 10, | |
"max_steps": 5625, | |
"num_input_tokens_seen": 0, | |
"num_train_epochs": 3, | |
"save_steps": 100, | |
"total_flos": 1.126917998641152e+18, | |
"train_batch_size": 16, | |
"trial_name": null, | |
"trial_params": null | |
} | |