{ "best_metric": 0.5041437745094299, "best_model_checkpoint": "../experiments_checkpoints/LoRA/google/gemma_7b_LoRA_coastalcph/lex_glue_ledgar/checkpoint-3700", "epoch": 3.0, "eval_steps": 100, "global_step": 5625, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 227.8190460205078, "learning_rate": 4.991111111111111e-05, "loss": 12.0078, "step": 10 }, { "epoch": 0.01, "grad_norm": 106.68264770507812, "learning_rate": 4.982222222222222e-05, "loss": 5.4906, "step": 20 }, { "epoch": 0.02, "grad_norm": 67.2668685913086, "learning_rate": 4.973333333333334e-05, "loss": 3.8859, "step": 30 }, { "epoch": 0.02, "grad_norm": 50.982364654541016, "learning_rate": 4.964444444444445e-05, "loss": 2.541, "step": 40 }, { "epoch": 0.03, "grad_norm": 60.637367248535156, "learning_rate": 4.955555555555556e-05, "loss": 2.2738, "step": 50 }, { "epoch": 0.03, "grad_norm": 52.631439208984375, "learning_rate": 4.9466666666666665e-05, "loss": 1.6229, "step": 60 }, { "epoch": 0.04, "grad_norm": 39.47542953491211, "learning_rate": 4.9377777777777776e-05, "loss": 1.7955, "step": 70 }, { "epoch": 0.04, "grad_norm": 53.9116096496582, "learning_rate": 4.928888888888889e-05, "loss": 1.263, "step": 80 }, { "epoch": 0.05, "grad_norm": 52.852882385253906, "learning_rate": 4.92e-05, "loss": 1.2939, "step": 90 }, { "epoch": 0.05, "grad_norm": 41.01764678955078, "learning_rate": 4.9111111111111114e-05, "loss": 1.3725, "step": 100 }, { "epoch": 0.05, "eval_accuracy": 0.6864, "eval_f1_macro": 0.5157367163795201, "eval_f1_micro": 0.6864, "eval_loss": 1.3878281116485596, "eval_runtime": 150.4086, "eval_samples_per_second": 66.486, "eval_steps_per_second": 2.081, "step": 100 }, { "epoch": 0.06, "grad_norm": 24.78645896911621, "learning_rate": 4.9022222222222224e-05, "loss": 1.4604, "step": 110 }, { "epoch": 0.06, "grad_norm": 33.56829071044922, "learning_rate": 4.8933333333333335e-05, "loss": 1.2117, "step": 120 }, { "epoch": 0.07, "grad_norm": 44.52119827270508, "learning_rate": 4.8844444444444445e-05, "loss": 1.1012, "step": 130 }, { "epoch": 0.07, "grad_norm": 38.966400146484375, "learning_rate": 4.875555555555556e-05, "loss": 1.3106, "step": 140 }, { "epoch": 0.08, "grad_norm": 44.13960647583008, "learning_rate": 4.866666666666667e-05, "loss": 1.1914, "step": 150 }, { "epoch": 0.09, "grad_norm": 38.23433303833008, "learning_rate": 4.8577777777777776e-05, "loss": 1.2592, "step": 160 }, { "epoch": 0.09, "grad_norm": 36.56868362426758, "learning_rate": 4.848888888888889e-05, "loss": 1.0251, "step": 170 }, { "epoch": 0.1, "grad_norm": 33.46715545654297, "learning_rate": 4.8400000000000004e-05, "loss": 1.0881, "step": 180 }, { "epoch": 0.1, "grad_norm": 33.97274398803711, "learning_rate": 4.8311111111111115e-05, "loss": 1.0591, "step": 190 }, { "epoch": 0.11, "grad_norm": 42.04470443725586, "learning_rate": 4.8222222222222225e-05, "loss": 1.3256, "step": 200 }, { "epoch": 0.11, "eval_accuracy": 0.7615, "eval_f1_macro": 0.6078459646070684, "eval_f1_micro": 0.7615, "eval_loss": 1.0876250267028809, "eval_runtime": 150.0387, "eval_samples_per_second": 66.649, "eval_steps_per_second": 2.086, "step": 200 }, { "epoch": 0.11, "grad_norm": 43.14711380004883, "learning_rate": 4.8133333333333336e-05, "loss": 1.1843, "step": 210 }, { "epoch": 0.12, "grad_norm": 39.68747329711914, "learning_rate": 4.8044444444444446e-05, "loss": 1.208, "step": 220 }, { "epoch": 0.12, "grad_norm": 34.513919830322266, "learning_rate": 4.7955555555555556e-05, "loss": 1.0581, "step": 230 }, { "epoch": 0.13, "grad_norm": 37.228477478027344, "learning_rate": 4.7866666666666674e-05, "loss": 1.0649, "step": 240 }, { "epoch": 0.13, "grad_norm": 52.42377471923828, "learning_rate": 4.7777777777777784e-05, "loss": 1.1228, "step": 250 }, { "epoch": 0.14, "grad_norm": 40.35560607910156, "learning_rate": 4.768888888888889e-05, "loss": 1.049, "step": 260 }, { "epoch": 0.14, "grad_norm": 41.43683624267578, "learning_rate": 4.76e-05, "loss": 1.0151, "step": 270 }, { "epoch": 0.15, "grad_norm": 23.949188232421875, "learning_rate": 4.751111111111111e-05, "loss": 1.0386, "step": 280 }, { "epoch": 0.15, "grad_norm": 43.994598388671875, "learning_rate": 4.7422222222222226e-05, "loss": 1.0314, "step": 290 }, { "epoch": 0.16, "grad_norm": 39.607051849365234, "learning_rate": 4.7333333333333336e-05, "loss": 0.9681, "step": 300 }, { "epoch": 0.16, "eval_accuracy": 0.7699, "eval_f1_macro": 0.6452214349790191, "eval_f1_micro": 0.7699, "eval_loss": 0.9516304731369019, "eval_runtime": 150.0454, "eval_samples_per_second": 66.646, "eval_steps_per_second": 2.086, "step": 300 }, { "epoch": 0.17, "grad_norm": 33.548194885253906, "learning_rate": 4.724444444444445e-05, "loss": 0.9143, "step": 310 }, { "epoch": 0.17, "grad_norm": 28.681020736694336, "learning_rate": 4.715555555555556e-05, "loss": 0.8288, "step": 320 }, { "epoch": 0.18, "grad_norm": 39.852054595947266, "learning_rate": 4.706666666666667e-05, "loss": 1.1122, "step": 330 }, { "epoch": 0.18, "grad_norm": 33.49698257446289, "learning_rate": 4.6977777777777785e-05, "loss": 0.9923, "step": 340 }, { "epoch": 0.19, "grad_norm": 47.75785446166992, "learning_rate": 4.6888888888888895e-05, "loss": 0.9374, "step": 350 }, { "epoch": 0.19, "grad_norm": 39.51451873779297, "learning_rate": 4.6800000000000006e-05, "loss": 1.0203, "step": 360 }, { "epoch": 0.2, "grad_norm": 35.28321075439453, "learning_rate": 4.671111111111111e-05, "loss": 0.8589, "step": 370 }, { "epoch": 0.2, "grad_norm": 43.45527267456055, "learning_rate": 4.662222222222222e-05, "loss": 0.9698, "step": 380 }, { "epoch": 0.21, "grad_norm": 34.52985763549805, "learning_rate": 4.653333333333334e-05, "loss": 0.8177, "step": 390 }, { "epoch": 0.21, "grad_norm": 28.500659942626953, "learning_rate": 4.644444444444445e-05, "loss": 0.9094, "step": 400 }, { "epoch": 0.21, "eval_accuracy": 0.7893, "eval_f1_macro": 0.6627964242367417, "eval_f1_micro": 0.7893, "eval_loss": 0.9403331875801086, "eval_runtime": 149.9006, "eval_samples_per_second": 66.711, "eval_steps_per_second": 2.088, "step": 400 }, { "epoch": 0.22, "grad_norm": 40.63841247558594, "learning_rate": 4.635555555555556e-05, "loss": 0.973, "step": 410 }, { "epoch": 0.22, "grad_norm": 32.8942985534668, "learning_rate": 4.626666666666667e-05, "loss": 0.9819, "step": 420 }, { "epoch": 0.23, "grad_norm": 28.508481979370117, "learning_rate": 4.617777777777778e-05, "loss": 0.833, "step": 430 }, { "epoch": 0.23, "grad_norm": 33.688846588134766, "learning_rate": 4.608888888888889e-05, "loss": 0.6626, "step": 440 }, { "epoch": 0.24, "grad_norm": 42.95698928833008, "learning_rate": 4.600000000000001e-05, "loss": 0.8224, "step": 450 }, { "epoch": 0.25, "grad_norm": 26.850563049316406, "learning_rate": 4.591111111111112e-05, "loss": 0.9378, "step": 460 }, { "epoch": 0.25, "grad_norm": 29.557905197143555, "learning_rate": 4.582222222222222e-05, "loss": 1.0295, "step": 470 }, { "epoch": 0.26, "grad_norm": 20.479171752929688, "learning_rate": 4.573333333333333e-05, "loss": 0.885, "step": 480 }, { "epoch": 0.26, "grad_norm": 31.502513885498047, "learning_rate": 4.564444444444444e-05, "loss": 0.9045, "step": 490 }, { "epoch": 0.27, "grad_norm": 32.363094329833984, "learning_rate": 4.555555555555556e-05, "loss": 0.7715, "step": 500 }, { "epoch": 0.27, "eval_accuracy": 0.7896, "eval_f1_macro": 0.668700376378601, "eval_f1_micro": 0.7896, "eval_loss": 0.8592824339866638, "eval_runtime": 149.9317, "eval_samples_per_second": 66.697, "eval_steps_per_second": 2.088, "step": 500 }, { "epoch": 0.27, "grad_norm": 34.05838394165039, "learning_rate": 4.546666666666667e-05, "loss": 0.8179, "step": 510 }, { "epoch": 0.28, "grad_norm": 48.86190414428711, "learning_rate": 4.537777777777778e-05, "loss": 0.9052, "step": 520 }, { "epoch": 0.28, "grad_norm": 32.10374069213867, "learning_rate": 4.528888888888889e-05, "loss": 1.0262, "step": 530 }, { "epoch": 0.29, "grad_norm": 27.27006721496582, "learning_rate": 4.52e-05, "loss": 0.8747, "step": 540 }, { "epoch": 0.29, "grad_norm": 34.86284637451172, "learning_rate": 4.511111111111112e-05, "loss": 0.8681, "step": 550 }, { "epoch": 0.3, "grad_norm": 27.64435386657715, "learning_rate": 4.502222222222223e-05, "loss": 0.8249, "step": 560 }, { "epoch": 0.3, "grad_norm": 34.09676742553711, "learning_rate": 4.493333333333333e-05, "loss": 1.0702, "step": 570 }, { "epoch": 0.31, "grad_norm": 26.377086639404297, "learning_rate": 4.484444444444444e-05, "loss": 0.7343, "step": 580 }, { "epoch": 0.31, "grad_norm": 32.587154388427734, "learning_rate": 4.475555555555555e-05, "loss": 0.8207, "step": 590 }, { "epoch": 0.32, "grad_norm": 28.099170684814453, "learning_rate": 4.466666666666667e-05, "loss": 0.7244, "step": 600 }, { "epoch": 0.32, "eval_accuracy": 0.8061, "eval_f1_macro": 0.694893382279633, "eval_f1_micro": 0.8061, "eval_loss": 0.7621132731437683, "eval_runtime": 150.0447, "eval_samples_per_second": 66.647, "eval_steps_per_second": 2.086, "step": 600 }, { "epoch": 0.33, "grad_norm": 27.44232177734375, "learning_rate": 4.457777777777778e-05, "loss": 0.8029, "step": 610 }, { "epoch": 0.33, "grad_norm": 29.95503807067871, "learning_rate": 4.448888888888889e-05, "loss": 0.7812, "step": 620 }, { "epoch": 0.34, "grad_norm": 32.26255416870117, "learning_rate": 4.44e-05, "loss": 0.6953, "step": 630 }, { "epoch": 0.34, "grad_norm": 26.627235412597656, "learning_rate": 4.431111111111111e-05, "loss": 0.8677, "step": 640 }, { "epoch": 0.35, "grad_norm": 19.550811767578125, "learning_rate": 4.422222222222222e-05, "loss": 0.8226, "step": 650 }, { "epoch": 0.35, "grad_norm": 19.106870651245117, "learning_rate": 4.413333333333334e-05, "loss": 0.845, "step": 660 }, { "epoch": 0.36, "grad_norm": 31.620084762573242, "learning_rate": 4.404444444444445e-05, "loss": 0.7984, "step": 670 }, { "epoch": 0.36, "grad_norm": 32.98550796508789, "learning_rate": 4.3955555555555554e-05, "loss": 0.7198, "step": 680 }, { "epoch": 0.37, "grad_norm": 32.72222137451172, "learning_rate": 4.3866666666666665e-05, "loss": 0.9314, "step": 690 }, { "epoch": 0.37, "grad_norm": 36.1794319152832, "learning_rate": 4.377777777777778e-05, "loss": 0.7719, "step": 700 }, { "epoch": 0.37, "eval_accuracy": 0.7884, "eval_f1_macro": 0.6863716720883178, "eval_f1_micro": 0.7884, "eval_loss": 0.8355345726013184, "eval_runtime": 150.0583, "eval_samples_per_second": 66.641, "eval_steps_per_second": 2.086, "step": 700 }, { "epoch": 0.38, "grad_norm": 19.62013053894043, "learning_rate": 4.368888888888889e-05, "loss": 0.5494, "step": 710 }, { "epoch": 0.38, "grad_norm": 26.641956329345703, "learning_rate": 4.36e-05, "loss": 0.7919, "step": 720 }, { "epoch": 0.39, "grad_norm": 24.791227340698242, "learning_rate": 4.351111111111111e-05, "loss": 0.7154, "step": 730 }, { "epoch": 0.39, "grad_norm": 39.39951705932617, "learning_rate": 4.3422222222222224e-05, "loss": 0.6423, "step": 740 }, { "epoch": 0.4, "grad_norm": 35.64988327026367, "learning_rate": 4.3333333333333334e-05, "loss": 0.7939, "step": 750 }, { "epoch": 0.41, "grad_norm": 28.65671157836914, "learning_rate": 4.324444444444445e-05, "loss": 0.8428, "step": 760 }, { "epoch": 0.41, "grad_norm": 28.99781608581543, "learning_rate": 4.315555555555556e-05, "loss": 0.749, "step": 770 }, { "epoch": 0.42, "grad_norm": 29.291336059570312, "learning_rate": 4.3066666666666665e-05, "loss": 0.7318, "step": 780 }, { "epoch": 0.42, "grad_norm": 27.441404342651367, "learning_rate": 4.2977777777777776e-05, "loss": 0.7554, "step": 790 }, { "epoch": 0.43, "grad_norm": 10.821943283081055, "learning_rate": 4.2888888888888886e-05, "loss": 0.6305, "step": 800 }, { "epoch": 0.43, "eval_accuracy": 0.7897, "eval_f1_macro": 0.6806730525287331, "eval_f1_micro": 0.7897, "eval_loss": 0.8542162179946899, "eval_runtime": 149.8489, "eval_samples_per_second": 66.734, "eval_steps_per_second": 2.089, "step": 800 }, { "epoch": 0.43, "grad_norm": 32.4620246887207, "learning_rate": 4.2800000000000004e-05, "loss": 0.7627, "step": 810 }, { "epoch": 0.44, "grad_norm": 32.67604446411133, "learning_rate": 4.2711111111111114e-05, "loss": 0.8879, "step": 820 }, { "epoch": 0.44, "grad_norm": 22.831031799316406, "learning_rate": 4.2622222222222224e-05, "loss": 0.8407, "step": 830 }, { "epoch": 0.45, "grad_norm": 36.6854362487793, "learning_rate": 4.2533333333333335e-05, "loss": 0.7145, "step": 840 }, { "epoch": 0.45, "grad_norm": 36.49830627441406, "learning_rate": 4.2444444444444445e-05, "loss": 0.7768, "step": 850 }, { "epoch": 0.46, "grad_norm": 35.065948486328125, "learning_rate": 4.235555555555556e-05, "loss": 1.0117, "step": 860 }, { "epoch": 0.46, "grad_norm": 37.74482727050781, "learning_rate": 4.226666666666667e-05, "loss": 0.8462, "step": 870 }, { "epoch": 0.47, "grad_norm": 26.77570152282715, "learning_rate": 4.217777777777778e-05, "loss": 0.8096, "step": 880 }, { "epoch": 0.47, "grad_norm": 26.55797004699707, "learning_rate": 4.208888888888889e-05, "loss": 0.8612, "step": 890 }, { "epoch": 0.48, "grad_norm": 34.126625061035156, "learning_rate": 4.2e-05, "loss": 0.8793, "step": 900 }, { "epoch": 0.48, "eval_accuracy": 0.7935, "eval_f1_macro": 0.6821808056398841, "eval_f1_micro": 0.7935, "eval_loss": 0.8042706847190857, "eval_runtime": 150.0061, "eval_samples_per_second": 66.664, "eval_steps_per_second": 2.087, "step": 900 }, { "epoch": 0.49, "grad_norm": 28.812938690185547, "learning_rate": 4.1911111111111115e-05, "loss": 0.8599, "step": 910 }, { "epoch": 0.49, "grad_norm": 25.103418350219727, "learning_rate": 4.1822222222222225e-05, "loss": 0.6823, "step": 920 }, { "epoch": 0.5, "grad_norm": 22.762414932250977, "learning_rate": 4.1733333333333336e-05, "loss": 0.6981, "step": 930 }, { "epoch": 0.5, "grad_norm": 27.674386978149414, "learning_rate": 4.1644444444444446e-05, "loss": 0.6647, "step": 940 }, { "epoch": 0.51, "grad_norm": 30.835783004760742, "learning_rate": 4.155555555555556e-05, "loss": 0.8359, "step": 950 }, { "epoch": 0.51, "grad_norm": 27.273395538330078, "learning_rate": 4.146666666666667e-05, "loss": 0.8044, "step": 960 }, { "epoch": 0.52, "grad_norm": 28.3951416015625, "learning_rate": 4.1377777777777784e-05, "loss": 0.8517, "step": 970 }, { "epoch": 0.52, "grad_norm": 29.438312530517578, "learning_rate": 4.1288888888888895e-05, "loss": 0.8037, "step": 980 }, { "epoch": 0.53, "grad_norm": 34.72230529785156, "learning_rate": 4.12e-05, "loss": 0.8013, "step": 990 }, { "epoch": 0.53, "grad_norm": 32.5698127746582, "learning_rate": 4.111111111111111e-05, "loss": 0.7411, "step": 1000 }, { "epoch": 0.53, "eval_accuracy": 0.8072, "eval_f1_macro": 0.6939988529805743, "eval_f1_micro": 0.8072, "eval_loss": 0.7256324291229248, "eval_runtime": 150.0248, "eval_samples_per_second": 66.656, "eval_steps_per_second": 2.086, "step": 1000 }, { "epoch": 0.54, "grad_norm": 28.614532470703125, "learning_rate": 4.1022222222222226e-05, "loss": 0.733, "step": 1010 }, { "epoch": 0.54, "grad_norm": 30.533872604370117, "learning_rate": 4.093333333333334e-05, "loss": 0.7474, "step": 1020 }, { "epoch": 0.55, "grad_norm": 29.524789810180664, "learning_rate": 4.084444444444445e-05, "loss": 0.8004, "step": 1030 }, { "epoch": 0.55, "grad_norm": 52.84124755859375, "learning_rate": 4.075555555555556e-05, "loss": 0.8063, "step": 1040 }, { "epoch": 0.56, "grad_norm": 31.382856369018555, "learning_rate": 4.066666666666667e-05, "loss": 0.7358, "step": 1050 }, { "epoch": 0.57, "grad_norm": 28.268238067626953, "learning_rate": 4.057777777777778e-05, "loss": 0.6383, "step": 1060 }, { "epoch": 0.57, "grad_norm": 28.795692443847656, "learning_rate": 4.0488888888888896e-05, "loss": 0.7642, "step": 1070 }, { "epoch": 0.58, "grad_norm": 24.153024673461914, "learning_rate": 4.0400000000000006e-05, "loss": 0.6057, "step": 1080 }, { "epoch": 0.58, "grad_norm": 32.658329010009766, "learning_rate": 4.031111111111111e-05, "loss": 0.8125, "step": 1090 }, { "epoch": 0.59, "grad_norm": 14.572766304016113, "learning_rate": 4.022222222222222e-05, "loss": 0.6403, "step": 1100 }, { "epoch": 0.59, "eval_accuracy": 0.819, "eval_f1_macro": 0.7216680126270462, "eval_f1_micro": 0.819, "eval_loss": 0.7033218741416931, "eval_runtime": 149.9473, "eval_samples_per_second": 66.69, "eval_steps_per_second": 2.087, "step": 1100 }, { "epoch": 0.59, "grad_norm": 19.96935272216797, "learning_rate": 4.013333333333333e-05, "loss": 0.5791, "step": 1110 }, { "epoch": 0.6, "grad_norm": 30.314918518066406, "learning_rate": 4.004444444444445e-05, "loss": 0.6803, "step": 1120 }, { "epoch": 0.6, "grad_norm": 36.90558624267578, "learning_rate": 3.995555555555556e-05, "loss": 0.5809, "step": 1130 }, { "epoch": 0.61, "grad_norm": 38.08405303955078, "learning_rate": 3.986666666666667e-05, "loss": 0.7575, "step": 1140 }, { "epoch": 0.61, "grad_norm": 25.463375091552734, "learning_rate": 3.977777777777778e-05, "loss": 0.6668, "step": 1150 }, { "epoch": 0.62, "grad_norm": 30.448307037353516, "learning_rate": 3.968888888888889e-05, "loss": 0.6106, "step": 1160 }, { "epoch": 0.62, "grad_norm": 27.176774978637695, "learning_rate": 3.960000000000001e-05, "loss": 0.7375, "step": 1170 }, { "epoch": 0.63, "grad_norm": 29.381431579589844, "learning_rate": 3.951111111111112e-05, "loss": 0.7824, "step": 1180 }, { "epoch": 0.63, "grad_norm": 30.908754348754883, "learning_rate": 3.942222222222222e-05, "loss": 0.6832, "step": 1190 }, { "epoch": 0.64, "grad_norm": 34.62039566040039, "learning_rate": 3.933333333333333e-05, "loss": 0.6971, "step": 1200 }, { "epoch": 0.64, "eval_accuracy": 0.8159, "eval_f1_macro": 0.7334649863076415, "eval_f1_micro": 0.8159, "eval_loss": 0.7008675932884216, "eval_runtime": 150.0312, "eval_samples_per_second": 66.653, "eval_steps_per_second": 2.086, "step": 1200 }, { "epoch": 0.65, "grad_norm": 33.131656646728516, "learning_rate": 3.924444444444444e-05, "loss": 0.6262, "step": 1210 }, { "epoch": 0.65, "grad_norm": 22.17938804626465, "learning_rate": 3.915555555555556e-05, "loss": 0.7002, "step": 1220 }, { "epoch": 0.66, "grad_norm": 38.93252182006836, "learning_rate": 3.906666666666667e-05, "loss": 0.7796, "step": 1230 }, { "epoch": 0.66, "grad_norm": 32.47177505493164, "learning_rate": 3.897777777777778e-05, "loss": 0.579, "step": 1240 }, { "epoch": 0.67, "grad_norm": 28.713239669799805, "learning_rate": 3.888888888888889e-05, "loss": 0.7154, "step": 1250 }, { "epoch": 0.67, "grad_norm": 26.406848907470703, "learning_rate": 3.88e-05, "loss": 0.7948, "step": 1260 }, { "epoch": 0.68, "grad_norm": 34.53367233276367, "learning_rate": 3.871111111111111e-05, "loss": 0.5775, "step": 1270 }, { "epoch": 0.68, "grad_norm": 27.183706283569336, "learning_rate": 3.862222222222223e-05, "loss": 0.7497, "step": 1280 }, { "epoch": 0.69, "grad_norm": 34.97861099243164, "learning_rate": 3.853333333333334e-05, "loss": 0.5953, "step": 1290 }, { "epoch": 0.69, "grad_norm": 28.780019760131836, "learning_rate": 3.844444444444444e-05, "loss": 0.7053, "step": 1300 }, { "epoch": 0.69, "eval_accuracy": 0.8291, "eval_f1_macro": 0.7205458318365534, "eval_f1_micro": 0.8291, "eval_loss": 0.6921299695968628, "eval_runtime": 150.1334, "eval_samples_per_second": 66.607, "eval_steps_per_second": 2.085, "step": 1300 }, { "epoch": 0.7, "grad_norm": 15.419646263122559, "learning_rate": 3.8355555555555553e-05, "loss": 0.5795, "step": 1310 }, { "epoch": 0.7, "grad_norm": 32.80424880981445, "learning_rate": 3.8266666666666664e-05, "loss": 0.8922, "step": 1320 }, { "epoch": 0.71, "grad_norm": 17.997528076171875, "learning_rate": 3.817777777777778e-05, "loss": 0.6339, "step": 1330 }, { "epoch": 0.71, "grad_norm": 28.21483612060547, "learning_rate": 3.808888888888889e-05, "loss": 0.825, "step": 1340 }, { "epoch": 0.72, "grad_norm": 30.419662475585938, "learning_rate": 3.8e-05, "loss": 0.6544, "step": 1350 }, { "epoch": 0.73, "grad_norm": 22.456350326538086, "learning_rate": 3.791111111111111e-05, "loss": 0.5786, "step": 1360 }, { "epoch": 0.73, "grad_norm": 47.38570785522461, "learning_rate": 3.782222222222222e-05, "loss": 0.7092, "step": 1370 }, { "epoch": 0.74, "grad_norm": 25.527708053588867, "learning_rate": 3.773333333333334e-05, "loss": 0.7458, "step": 1380 }, { "epoch": 0.74, "grad_norm": 19.75039291381836, "learning_rate": 3.764444444444445e-05, "loss": 0.6571, "step": 1390 }, { "epoch": 0.75, "grad_norm": 19.110685348510742, "learning_rate": 3.7555555555555554e-05, "loss": 0.6413, "step": 1400 }, { "epoch": 0.75, "eval_accuracy": 0.8301, "eval_f1_macro": 0.729154652044555, "eval_f1_micro": 0.8301, "eval_loss": 0.6514862179756165, "eval_runtime": 150.1615, "eval_samples_per_second": 66.595, "eval_steps_per_second": 2.084, "step": 1400 }, { "epoch": 0.75, "grad_norm": 29.326581954956055, "learning_rate": 3.7466666666666665e-05, "loss": 0.6138, "step": 1410 }, { "epoch": 0.76, "grad_norm": 41.61116409301758, "learning_rate": 3.7377777777777775e-05, "loss": 0.5805, "step": 1420 }, { "epoch": 0.76, "grad_norm": 33.782711029052734, "learning_rate": 3.728888888888889e-05, "loss": 0.6785, "step": 1430 }, { "epoch": 0.77, "grad_norm": 33.50584030151367, "learning_rate": 3.72e-05, "loss": 0.6461, "step": 1440 }, { "epoch": 0.77, "grad_norm": 33.119720458984375, "learning_rate": 3.7111111111111113e-05, "loss": 0.682, "step": 1450 }, { "epoch": 0.78, "grad_norm": 28.767709732055664, "learning_rate": 3.7022222222222224e-05, "loss": 0.6927, "step": 1460 }, { "epoch": 0.78, "grad_norm": 30.515918731689453, "learning_rate": 3.6933333333333334e-05, "loss": 0.6882, "step": 1470 }, { "epoch": 0.79, "grad_norm": 28.996906280517578, "learning_rate": 3.6844444444444445e-05, "loss": 0.7307, "step": 1480 }, { "epoch": 0.79, "grad_norm": 28.326091766357422, "learning_rate": 3.675555555555556e-05, "loss": 0.8148, "step": 1490 }, { "epoch": 0.8, "grad_norm": 28.253231048583984, "learning_rate": 3.6666666666666666e-05, "loss": 0.6656, "step": 1500 }, { "epoch": 0.8, "eval_accuracy": 0.8241, "eval_f1_macro": 0.7160978666563796, "eval_f1_micro": 0.8241, "eval_loss": 0.6684937477111816, "eval_runtime": 150.0761, "eval_samples_per_second": 66.633, "eval_steps_per_second": 2.086, "step": 1500 }, { "epoch": 0.81, "grad_norm": 39.65568542480469, "learning_rate": 3.6577777777777776e-05, "loss": 0.6676, "step": 1510 }, { "epoch": 0.81, "grad_norm": 37.01750946044922, "learning_rate": 3.648888888888889e-05, "loss": 0.7308, "step": 1520 }, { "epoch": 0.82, "grad_norm": 15.336851119995117, "learning_rate": 3.6400000000000004e-05, "loss": 0.5771, "step": 1530 }, { "epoch": 0.82, "grad_norm": 20.613094329833984, "learning_rate": 3.6311111111111114e-05, "loss": 0.5068, "step": 1540 }, { "epoch": 0.83, "grad_norm": 18.742759704589844, "learning_rate": 3.6222222222222225e-05, "loss": 0.7777, "step": 1550 }, { "epoch": 0.83, "grad_norm": 33.8448600769043, "learning_rate": 3.6133333333333335e-05, "loss": 0.8493, "step": 1560 }, { "epoch": 0.84, "grad_norm": 33.76942825317383, "learning_rate": 3.6044444444444446e-05, "loss": 0.7544, "step": 1570 }, { "epoch": 0.84, "grad_norm": 25.16337013244629, "learning_rate": 3.5955555555555556e-05, "loss": 0.6645, "step": 1580 }, { "epoch": 0.85, "grad_norm": 23.87677764892578, "learning_rate": 3.586666666666667e-05, "loss": 0.7335, "step": 1590 }, { "epoch": 0.85, "grad_norm": 13.271788597106934, "learning_rate": 3.577777777777778e-05, "loss": 0.6114, "step": 1600 }, { "epoch": 0.85, "eval_accuracy": 0.8246, "eval_f1_macro": 0.7269472752858881, "eval_f1_micro": 0.8246, "eval_loss": 0.6453167796134949, "eval_runtime": 150.2624, "eval_samples_per_second": 66.55, "eval_steps_per_second": 2.083, "step": 1600 }, { "epoch": 0.86, "grad_norm": 22.58757972717285, "learning_rate": 3.568888888888889e-05, "loss": 0.602, "step": 1610 }, { "epoch": 0.86, "grad_norm": 25.419322967529297, "learning_rate": 3.56e-05, "loss": 0.5252, "step": 1620 }, { "epoch": 0.87, "grad_norm": 26.960481643676758, "learning_rate": 3.551111111111111e-05, "loss": 0.6874, "step": 1630 }, { "epoch": 0.87, "grad_norm": 27.8248233795166, "learning_rate": 3.5422222222222226e-05, "loss": 0.7663, "step": 1640 }, { "epoch": 0.88, "grad_norm": 32.19744873046875, "learning_rate": 3.5333333333333336e-05, "loss": 0.7413, "step": 1650 }, { "epoch": 0.89, "grad_norm": 32.132179260253906, "learning_rate": 3.5244444444444447e-05, "loss": 0.5752, "step": 1660 }, { "epoch": 0.89, "grad_norm": 32.55865478515625, "learning_rate": 3.515555555555556e-05, "loss": 0.5818, "step": 1670 }, { "epoch": 0.9, "grad_norm": 32.21278381347656, "learning_rate": 3.506666666666667e-05, "loss": 0.5803, "step": 1680 }, { "epoch": 0.9, "grad_norm": 25.46314239501953, "learning_rate": 3.4977777777777785e-05, "loss": 0.623, "step": 1690 }, { "epoch": 0.91, "grad_norm": 26.091236114501953, "learning_rate": 3.4888888888888895e-05, "loss": 0.5616, "step": 1700 }, { "epoch": 0.91, "eval_accuracy": 0.8275, "eval_f1_macro": 0.7289680784160217, "eval_f1_micro": 0.8275, "eval_loss": 0.6631607413291931, "eval_runtime": 150.375, "eval_samples_per_second": 66.5, "eval_steps_per_second": 2.081, "step": 1700 }, { "epoch": 0.91, "grad_norm": 25.63671875, "learning_rate": 3.48e-05, "loss": 0.7312, "step": 1710 }, { "epoch": 0.92, "grad_norm": 32.645164489746094, "learning_rate": 3.471111111111111e-05, "loss": 0.6261, "step": 1720 }, { "epoch": 0.92, "grad_norm": 31.84140396118164, "learning_rate": 3.462222222222222e-05, "loss": 0.6618, "step": 1730 }, { "epoch": 0.93, "grad_norm": 23.48900604248047, "learning_rate": 3.453333333333334e-05, "loss": 0.5569, "step": 1740 }, { "epoch": 0.93, "grad_norm": 21.029348373413086, "learning_rate": 3.444444444444445e-05, "loss": 0.5333, "step": 1750 }, { "epoch": 0.94, "grad_norm": 26.658044815063477, "learning_rate": 3.435555555555556e-05, "loss": 0.516, "step": 1760 }, { "epoch": 0.94, "grad_norm": 25.32404899597168, "learning_rate": 3.426666666666667e-05, "loss": 0.5171, "step": 1770 }, { "epoch": 0.95, "grad_norm": 33.53643798828125, "learning_rate": 3.417777777777778e-05, "loss": 0.6361, "step": 1780 }, { "epoch": 0.95, "grad_norm": 23.664636611938477, "learning_rate": 3.408888888888889e-05, "loss": 0.5254, "step": 1790 }, { "epoch": 0.96, "grad_norm": 26.88168716430664, "learning_rate": 3.4000000000000007e-05, "loss": 0.6985, "step": 1800 }, { "epoch": 0.96, "eval_accuracy": 0.8329, "eval_f1_macro": 0.7395314297204796, "eval_f1_micro": 0.8329, "eval_loss": 0.6022256016731262, "eval_runtime": 150.3869, "eval_samples_per_second": 66.495, "eval_steps_per_second": 2.081, "step": 1800 }, { "epoch": 0.97, "grad_norm": 26.552513122558594, "learning_rate": 3.391111111111111e-05, "loss": 0.7002, "step": 1810 }, { "epoch": 0.97, "grad_norm": 19.498023986816406, "learning_rate": 3.382222222222222e-05, "loss": 0.74, "step": 1820 }, { "epoch": 0.98, "grad_norm": 19.793920516967773, "learning_rate": 3.373333333333333e-05, "loss": 0.6457, "step": 1830 }, { "epoch": 0.98, "grad_norm": 21.879690170288086, "learning_rate": 3.364444444444445e-05, "loss": 0.5794, "step": 1840 }, { "epoch": 0.99, "grad_norm": 28.4526309967041, "learning_rate": 3.355555555555556e-05, "loss": 0.5144, "step": 1850 }, { "epoch": 0.99, "grad_norm": 29.433683395385742, "learning_rate": 3.346666666666667e-05, "loss": 0.4621, "step": 1860 }, { "epoch": 1.0, "grad_norm": 30.06548309326172, "learning_rate": 3.337777777777778e-05, "loss": 0.6983, "step": 1870 }, { "epoch": 1.0, "grad_norm": 16.910295486450195, "learning_rate": 3.328888888888889e-05, "loss": 0.4668, "step": 1880 }, { "epoch": 1.01, "grad_norm": 18.447338104248047, "learning_rate": 3.32e-05, "loss": 0.4046, "step": 1890 }, { "epoch": 1.01, "grad_norm": 18.394548416137695, "learning_rate": 3.311111111111112e-05, "loss": 0.387, "step": 1900 }, { "epoch": 1.01, "eval_accuracy": 0.8475, "eval_f1_macro": 0.768978436326819, "eval_f1_micro": 0.8475, "eval_loss": 0.5910280346870422, "eval_runtime": 150.3955, "eval_samples_per_second": 66.491, "eval_steps_per_second": 2.081, "step": 1900 }, { "epoch": 1.02, "grad_norm": 15.962233543395996, "learning_rate": 3.302222222222222e-05, "loss": 0.3534, "step": 1910 }, { "epoch": 1.02, "grad_norm": 25.97977638244629, "learning_rate": 3.293333333333333e-05, "loss": 0.3755, "step": 1920 }, { "epoch": 1.03, "grad_norm": 16.386619567871094, "learning_rate": 3.284444444444444e-05, "loss": 0.3695, "step": 1930 }, { "epoch": 1.03, "grad_norm": 29.158287048339844, "learning_rate": 3.275555555555555e-05, "loss": 0.3642, "step": 1940 }, { "epoch": 1.04, "grad_norm": 21.561437606811523, "learning_rate": 3.266666666666667e-05, "loss": 0.3619, "step": 1950 }, { "epoch": 1.05, "grad_norm": 20.957096099853516, "learning_rate": 3.257777777777778e-05, "loss": 0.3565, "step": 1960 }, { "epoch": 1.05, "grad_norm": 23.491188049316406, "learning_rate": 3.248888888888889e-05, "loss": 0.3597, "step": 1970 }, { "epoch": 1.06, "grad_norm": 20.992183685302734, "learning_rate": 3.24e-05, "loss": 0.4166, "step": 1980 }, { "epoch": 1.06, "grad_norm": 10.696439743041992, "learning_rate": 3.231111111111111e-05, "loss": 0.3333, "step": 1990 }, { "epoch": 1.07, "grad_norm": 20.321285247802734, "learning_rate": 3.222222222222223e-05, "loss": 0.2391, "step": 2000 }, { "epoch": 1.07, "eval_accuracy": 0.8475, "eval_f1_macro": 0.756420860990717, "eval_f1_micro": 0.8475, "eval_loss": 0.6234980225563049, "eval_runtime": 150.4342, "eval_samples_per_second": 66.474, "eval_steps_per_second": 2.081, "step": 2000 }, { "epoch": 1.07, "grad_norm": 28.013612747192383, "learning_rate": 3.213333333333334e-05, "loss": 0.4804, "step": 2010 }, { "epoch": 1.08, "grad_norm": 20.689050674438477, "learning_rate": 3.204444444444444e-05, "loss": 0.2839, "step": 2020 }, { "epoch": 1.08, "grad_norm": 25.118309020996094, "learning_rate": 3.1955555555555554e-05, "loss": 0.418, "step": 2030 }, { "epoch": 1.09, "grad_norm": 9.888715744018555, "learning_rate": 3.1866666666666664e-05, "loss": 0.2898, "step": 2040 }, { "epoch": 1.09, "grad_norm": 25.61309051513672, "learning_rate": 3.177777777777778e-05, "loss": 0.3652, "step": 2050 }, { "epoch": 1.1, "grad_norm": 29.823627471923828, "learning_rate": 3.168888888888889e-05, "loss": 0.4499, "step": 2060 }, { "epoch": 1.1, "grad_norm": 25.4545955657959, "learning_rate": 3.16e-05, "loss": 0.3955, "step": 2070 }, { "epoch": 1.11, "grad_norm": 20.512975692749023, "learning_rate": 3.151111111111111e-05, "loss": 0.3354, "step": 2080 }, { "epoch": 1.11, "grad_norm": 24.814722061157227, "learning_rate": 3.142222222222222e-05, "loss": 0.406, "step": 2090 }, { "epoch": 1.12, "grad_norm": 16.441551208496094, "learning_rate": 3.1333333333333334e-05, "loss": 0.4414, "step": 2100 }, { "epoch": 1.12, "eval_accuracy": 0.8421, "eval_f1_macro": 0.7650720616593817, "eval_f1_micro": 0.8421, "eval_loss": 0.6027012467384338, "eval_runtime": 150.493, "eval_samples_per_second": 66.448, "eval_steps_per_second": 2.08, "step": 2100 }, { "epoch": 1.13, "grad_norm": 23.043563842773438, "learning_rate": 3.124444444444445e-05, "loss": 0.4198, "step": 2110 }, { "epoch": 1.13, "grad_norm": 30.09490203857422, "learning_rate": 3.1155555555555555e-05, "loss": 0.3899, "step": 2120 }, { "epoch": 1.14, "grad_norm": 26.25542640686035, "learning_rate": 3.1066666666666665e-05, "loss": 0.3292, "step": 2130 }, { "epoch": 1.14, "grad_norm": 21.587125778198242, "learning_rate": 3.0977777777777776e-05, "loss": 0.434, "step": 2140 }, { "epoch": 1.15, "grad_norm": 32.34952163696289, "learning_rate": 3.088888888888889e-05, "loss": 0.3563, "step": 2150 }, { "epoch": 1.15, "grad_norm": 37.00065994262695, "learning_rate": 3.08e-05, "loss": 0.4363, "step": 2160 }, { "epoch": 1.16, "grad_norm": 18.810853958129883, "learning_rate": 3.0711111111111114e-05, "loss": 0.3945, "step": 2170 }, { "epoch": 1.16, "grad_norm": 20.760358810424805, "learning_rate": 3.0622222222222224e-05, "loss": 0.358, "step": 2180 }, { "epoch": 1.17, "grad_norm": 25.902507781982422, "learning_rate": 3.0533333333333335e-05, "loss": 0.431, "step": 2190 }, { "epoch": 1.17, "grad_norm": 20.889230728149414, "learning_rate": 3.044444444444445e-05, "loss": 0.3869, "step": 2200 }, { "epoch": 1.17, "eval_accuracy": 0.8437, "eval_f1_macro": 0.7592276312605151, "eval_f1_micro": 0.8437, "eval_loss": 0.6028015613555908, "eval_runtime": 150.4185, "eval_samples_per_second": 66.481, "eval_steps_per_second": 2.081, "step": 2200 }, { "epoch": 1.18, "grad_norm": 18.823284149169922, "learning_rate": 3.035555555555556e-05, "loss": 0.3956, "step": 2210 }, { "epoch": 1.18, "grad_norm": 22.283672332763672, "learning_rate": 3.0266666666666666e-05, "loss": 0.4863, "step": 2220 }, { "epoch": 1.19, "grad_norm": 16.33639144897461, "learning_rate": 3.0177777777777776e-05, "loss": 0.392, "step": 2230 }, { "epoch": 1.19, "grad_norm": 23.827781677246094, "learning_rate": 3.008888888888889e-05, "loss": 0.3198, "step": 2240 }, { "epoch": 1.2, "grad_norm": 26.199676513671875, "learning_rate": 3e-05, "loss": 0.3314, "step": 2250 }, { "epoch": 1.21, "grad_norm": 20.12962532043457, "learning_rate": 2.991111111111111e-05, "loss": 0.4698, "step": 2260 }, { "epoch": 1.21, "grad_norm": 27.956256866455078, "learning_rate": 2.9822222222222225e-05, "loss": 0.4283, "step": 2270 }, { "epoch": 1.22, "grad_norm": 24.309349060058594, "learning_rate": 2.9733333333333336e-05, "loss": 0.3119, "step": 2280 }, { "epoch": 1.22, "grad_norm": 21.136127471923828, "learning_rate": 2.9644444444444446e-05, "loss": 0.2309, "step": 2290 }, { "epoch": 1.23, "grad_norm": 14.561148643493652, "learning_rate": 2.955555555555556e-05, "loss": 0.2387, "step": 2300 }, { "epoch": 1.23, "eval_accuracy": 0.845, "eval_f1_macro": 0.7634532685547798, "eval_f1_micro": 0.845, "eval_loss": 0.6645835638046265, "eval_runtime": 150.7976, "eval_samples_per_second": 66.314, "eval_steps_per_second": 2.076, "step": 2300 }, { "epoch": 1.23, "grad_norm": 17.950855255126953, "learning_rate": 2.946666666666667e-05, "loss": 0.442, "step": 2310 }, { "epoch": 1.24, "grad_norm": 25.867813110351562, "learning_rate": 2.937777777777778e-05, "loss": 0.44, "step": 2320 }, { "epoch": 1.24, "grad_norm": 17.729812622070312, "learning_rate": 2.9288888888888888e-05, "loss": 0.358, "step": 2330 }, { "epoch": 1.25, "grad_norm": 19.638261795043945, "learning_rate": 2.9199999999999998e-05, "loss": 0.2645, "step": 2340 }, { "epoch": 1.25, "grad_norm": 25.970163345336914, "learning_rate": 2.9111111111111112e-05, "loss": 0.3405, "step": 2350 }, { "epoch": 1.26, "grad_norm": 11.836894989013672, "learning_rate": 2.9022222222222223e-05, "loss": 0.3557, "step": 2360 }, { "epoch": 1.26, "grad_norm": 20.230266571044922, "learning_rate": 2.8933333333333333e-05, "loss": 0.4081, "step": 2370 }, { "epoch": 1.27, "grad_norm": 29.962060928344727, "learning_rate": 2.8844444444444447e-05, "loss": 0.373, "step": 2380 }, { "epoch": 1.27, "grad_norm": 33.542320251464844, "learning_rate": 2.8755555555555557e-05, "loss": 0.3617, "step": 2390 }, { "epoch": 1.28, "grad_norm": 25.599098205566406, "learning_rate": 2.8666666666666668e-05, "loss": 0.3556, "step": 2400 }, { "epoch": 1.28, "eval_accuracy": 0.8487, "eval_f1_macro": 0.7724260808875206, "eval_f1_micro": 0.8487, "eval_loss": 0.6032431125640869, "eval_runtime": 150.7819, "eval_samples_per_second": 66.321, "eval_steps_per_second": 2.076, "step": 2400 }, { "epoch": 1.29, "grad_norm": 23.8782958984375, "learning_rate": 2.857777777777778e-05, "loss": 0.4046, "step": 2410 }, { "epoch": 1.29, "grad_norm": 23.979324340820312, "learning_rate": 2.8488888888888892e-05, "loss": 0.3942, "step": 2420 }, { "epoch": 1.3, "grad_norm": 10.684112548828125, "learning_rate": 2.84e-05, "loss": 0.2238, "step": 2430 }, { "epoch": 1.3, "grad_norm": 18.40957260131836, "learning_rate": 2.831111111111111e-05, "loss": 0.3078, "step": 2440 }, { "epoch": 1.31, "grad_norm": 30.96697998046875, "learning_rate": 2.8222222222222223e-05, "loss": 0.3622, "step": 2450 }, { "epoch": 1.31, "grad_norm": 24.614702224731445, "learning_rate": 2.8133333333333334e-05, "loss": 0.3691, "step": 2460 }, { "epoch": 1.32, "grad_norm": 23.404987335205078, "learning_rate": 2.8044444444444444e-05, "loss": 0.3551, "step": 2470 }, { "epoch": 1.32, "grad_norm": 30.258798599243164, "learning_rate": 2.7955555555555558e-05, "loss": 0.3761, "step": 2480 }, { "epoch": 1.33, "grad_norm": 4.7594780921936035, "learning_rate": 2.786666666666667e-05, "loss": 0.328, "step": 2490 }, { "epoch": 1.33, "grad_norm": 14.827425956726074, "learning_rate": 2.777777777777778e-05, "loss": 0.4439, "step": 2500 }, { "epoch": 1.33, "eval_accuracy": 0.8589, "eval_f1_macro": 0.7789734556100073, "eval_f1_micro": 0.8589, "eval_loss": 0.5772649049758911, "eval_runtime": 150.8158, "eval_samples_per_second": 66.306, "eval_steps_per_second": 2.075, "step": 2500 }, { "epoch": 1.34, "grad_norm": 15.111785888671875, "learning_rate": 2.7688888888888893e-05, "loss": 0.2569, "step": 2510 }, { "epoch": 1.34, "grad_norm": 28.836196899414062, "learning_rate": 2.7600000000000003e-05, "loss": 0.4003, "step": 2520 }, { "epoch": 1.35, "grad_norm": 36.57837677001953, "learning_rate": 2.751111111111111e-05, "loss": 0.3891, "step": 2530 }, { "epoch": 1.35, "grad_norm": 20.131092071533203, "learning_rate": 2.742222222222222e-05, "loss": 0.3853, "step": 2540 }, { "epoch": 1.36, "grad_norm": 28.32253074645996, "learning_rate": 2.733333333333333e-05, "loss": 0.4096, "step": 2550 }, { "epoch": 1.37, "grad_norm": 26.20575523376465, "learning_rate": 2.7244444444444445e-05, "loss": 0.3505, "step": 2560 }, { "epoch": 1.37, "grad_norm": 29.0845947265625, "learning_rate": 2.7155555555555556e-05, "loss": 0.3897, "step": 2570 }, { "epoch": 1.38, "grad_norm": 15.182287216186523, "learning_rate": 2.706666666666667e-05, "loss": 0.3748, "step": 2580 }, { "epoch": 1.38, "grad_norm": 14.50926399230957, "learning_rate": 2.697777777777778e-05, "loss": 0.3501, "step": 2590 }, { "epoch": 1.39, "grad_norm": 23.248886108398438, "learning_rate": 2.688888888888889e-05, "loss": 0.4171, "step": 2600 }, { "epoch": 1.39, "eval_accuracy": 0.8551, "eval_f1_macro": 0.7759771387428208, "eval_f1_micro": 0.8551, "eval_loss": 0.5601994395256042, "eval_runtime": 150.7993, "eval_samples_per_second": 66.313, "eval_steps_per_second": 2.076, "step": 2600 }, { "epoch": 1.39, "grad_norm": 20.562131881713867, "learning_rate": 2.6800000000000004e-05, "loss": 0.3811, "step": 2610 }, { "epoch": 1.4, "grad_norm": 20.327190399169922, "learning_rate": 2.6711111111111115e-05, "loss": 0.2697, "step": 2620 }, { "epoch": 1.4, "grad_norm": 19.044452667236328, "learning_rate": 2.6622222222222225e-05, "loss": 0.3059, "step": 2630 }, { "epoch": 1.41, "grad_norm": 24.917388916015625, "learning_rate": 2.6533333333333332e-05, "loss": 0.4426, "step": 2640 }, { "epoch": 1.41, "grad_norm": 25.066818237304688, "learning_rate": 2.6444444444444443e-05, "loss": 0.4759, "step": 2650 }, { "epoch": 1.42, "grad_norm": 27.263545989990234, "learning_rate": 2.6355555555555557e-05, "loss": 0.3387, "step": 2660 }, { "epoch": 1.42, "grad_norm": 18.15851402282715, "learning_rate": 2.6266666666666667e-05, "loss": 0.3474, "step": 2670 }, { "epoch": 1.43, "grad_norm": 21.79593276977539, "learning_rate": 2.6177777777777777e-05, "loss": 0.3775, "step": 2680 }, { "epoch": 1.43, "grad_norm": 17.30176544189453, "learning_rate": 2.608888888888889e-05, "loss": 0.3177, "step": 2690 }, { "epoch": 1.44, "grad_norm": 30.904870986938477, "learning_rate": 2.6000000000000002e-05, "loss": 0.3984, "step": 2700 }, { "epoch": 1.44, "eval_accuracy": 0.8514, "eval_f1_macro": 0.7708173208271037, "eval_f1_micro": 0.8514, "eval_loss": 0.5800321102142334, "eval_runtime": 150.9969, "eval_samples_per_second": 66.227, "eval_steps_per_second": 2.073, "step": 2700 }, { "epoch": 1.45, "grad_norm": 22.358997344970703, "learning_rate": 2.5911111111111112e-05, "loss": 0.3168, "step": 2710 }, { "epoch": 1.45, "grad_norm": 28.393596649169922, "learning_rate": 2.5822222222222226e-05, "loss": 0.3254, "step": 2720 }, { "epoch": 1.46, "grad_norm": 24.635414123535156, "learning_rate": 2.5733333333333337e-05, "loss": 0.2818, "step": 2730 }, { "epoch": 1.46, "grad_norm": 18.663604736328125, "learning_rate": 2.5644444444444444e-05, "loss": 0.3943, "step": 2740 }, { "epoch": 1.47, "grad_norm": 25.46748161315918, "learning_rate": 2.5555555555555554e-05, "loss": 0.4195, "step": 2750 }, { "epoch": 1.47, "grad_norm": 16.54319190979004, "learning_rate": 2.5466666666666668e-05, "loss": 0.2946, "step": 2760 }, { "epoch": 1.48, "grad_norm": 15.662579536437988, "learning_rate": 2.537777777777778e-05, "loss": 0.3247, "step": 2770 }, { "epoch": 1.48, "grad_norm": 33.76002883911133, "learning_rate": 2.528888888888889e-05, "loss": 0.3986, "step": 2780 }, { "epoch": 1.49, "grad_norm": 17.078815460205078, "learning_rate": 2.5200000000000003e-05, "loss": 0.2907, "step": 2790 }, { "epoch": 1.49, "grad_norm": 19.065820693969727, "learning_rate": 2.5111111111111113e-05, "loss": 0.2491, "step": 2800 }, { "epoch": 1.49, "eval_accuracy": 0.8463, "eval_f1_macro": 0.7774119411824801, "eval_f1_micro": 0.8463, "eval_loss": 0.5934433341026306, "eval_runtime": 150.8383, "eval_samples_per_second": 66.296, "eval_steps_per_second": 2.075, "step": 2800 }, { "epoch": 1.5, "grad_norm": 20.654638290405273, "learning_rate": 2.5022222222222224e-05, "loss": 0.2698, "step": 2810 }, { "epoch": 1.5, "grad_norm": 23.666898727416992, "learning_rate": 2.4933333333333334e-05, "loss": 0.387, "step": 2820 }, { "epoch": 1.51, "grad_norm": 24.191789627075195, "learning_rate": 2.4844444444444444e-05, "loss": 0.2838, "step": 2830 }, { "epoch": 1.51, "grad_norm": 21.81308937072754, "learning_rate": 2.475555555555556e-05, "loss": 0.3263, "step": 2840 }, { "epoch": 1.52, "grad_norm": 21.30182456970215, "learning_rate": 2.466666666666667e-05, "loss": 0.3126, "step": 2850 }, { "epoch": 1.53, "grad_norm": 20.381277084350586, "learning_rate": 2.457777777777778e-05, "loss": 0.322, "step": 2860 }, { "epoch": 1.53, "grad_norm": 22.04474639892578, "learning_rate": 2.448888888888889e-05, "loss": 0.3995, "step": 2870 }, { "epoch": 1.54, "grad_norm": 17.000167846679688, "learning_rate": 2.44e-05, "loss": 0.3385, "step": 2880 }, { "epoch": 1.54, "grad_norm": 19.123960494995117, "learning_rate": 2.431111111111111e-05, "loss": 0.3365, "step": 2890 }, { "epoch": 1.55, "grad_norm": 24.588180541992188, "learning_rate": 2.4222222222222224e-05, "loss": 0.2975, "step": 2900 }, { "epoch": 1.55, "eval_accuracy": 0.8548, "eval_f1_macro": 0.7775962578615729, "eval_f1_micro": 0.8548, "eval_loss": 0.5837641954421997, "eval_runtime": 151.0441, "eval_samples_per_second": 66.206, "eval_steps_per_second": 2.072, "step": 2900 }, { "epoch": 1.55, "grad_norm": 19.566835403442383, "learning_rate": 2.4133333333333335e-05, "loss": 0.482, "step": 2910 }, { "epoch": 1.56, "grad_norm": 20.381959915161133, "learning_rate": 2.4044444444444445e-05, "loss": 0.415, "step": 2920 }, { "epoch": 1.56, "grad_norm": 26.44783592224121, "learning_rate": 2.3955555555555556e-05, "loss": 0.4592, "step": 2930 }, { "epoch": 1.57, "grad_norm": 24.821016311645508, "learning_rate": 2.3866666666666666e-05, "loss": 0.2879, "step": 2940 }, { "epoch": 1.57, "grad_norm": 17.898052215576172, "learning_rate": 2.377777777777778e-05, "loss": 0.3406, "step": 2950 }, { "epoch": 1.58, "grad_norm": 19.308439254760742, "learning_rate": 2.368888888888889e-05, "loss": 0.425, "step": 2960 }, { "epoch": 1.58, "grad_norm": 20.031681060791016, "learning_rate": 2.36e-05, "loss": 0.284, "step": 2970 }, { "epoch": 1.59, "grad_norm": 21.92924690246582, "learning_rate": 2.351111111111111e-05, "loss": 0.4024, "step": 2980 }, { "epoch": 1.59, "grad_norm": 19.973752975463867, "learning_rate": 2.3422222222222222e-05, "loss": 0.3045, "step": 2990 }, { "epoch": 1.6, "grad_norm": 18.280500411987305, "learning_rate": 2.3333333333333336e-05, "loss": 0.4375, "step": 3000 }, { "epoch": 1.6, "eval_accuracy": 0.8497, "eval_f1_macro": 0.7757695298118251, "eval_f1_micro": 0.8497, "eval_loss": 0.5583605170249939, "eval_runtime": 150.8424, "eval_samples_per_second": 66.294, "eval_steps_per_second": 2.075, "step": 3000 }, { "epoch": 1.61, "grad_norm": 31.200424194335938, "learning_rate": 2.3244444444444446e-05, "loss": 0.3961, "step": 3010 }, { "epoch": 1.61, "grad_norm": 28.22431755065918, "learning_rate": 2.3155555555555557e-05, "loss": 0.3697, "step": 3020 }, { "epoch": 1.62, "grad_norm": 25.528642654418945, "learning_rate": 2.3066666666666667e-05, "loss": 0.4165, "step": 3030 }, { "epoch": 1.62, "grad_norm": 21.460134506225586, "learning_rate": 2.2977777777777778e-05, "loss": 0.337, "step": 3040 }, { "epoch": 1.63, "grad_norm": 18.815004348754883, "learning_rate": 2.288888888888889e-05, "loss": 0.4345, "step": 3050 }, { "epoch": 1.63, "grad_norm": 17.853065490722656, "learning_rate": 2.2800000000000002e-05, "loss": 0.3287, "step": 3060 }, { "epoch": 1.64, "grad_norm": 11.9459228515625, "learning_rate": 2.2711111111111112e-05, "loss": 0.5879, "step": 3070 }, { "epoch": 1.64, "grad_norm": 25.860185623168945, "learning_rate": 2.2622222222222223e-05, "loss": 0.2999, "step": 3080 }, { "epoch": 1.65, "grad_norm": 13.486348152160645, "learning_rate": 2.2533333333333333e-05, "loss": 0.4131, "step": 3090 }, { "epoch": 1.65, "grad_norm": 26.329408645629883, "learning_rate": 2.2444444444444447e-05, "loss": 0.3108, "step": 3100 }, { "epoch": 1.65, "eval_accuracy": 0.8624, "eval_f1_macro": 0.7863744372305322, "eval_f1_micro": 0.8624, "eval_loss": 0.5624867677688599, "eval_runtime": 151.1981, "eval_samples_per_second": 66.138, "eval_steps_per_second": 2.07, "step": 3100 }, { "epoch": 1.66, "grad_norm": 13.064452171325684, "learning_rate": 2.2355555555555558e-05, "loss": 0.3844, "step": 3110 }, { "epoch": 1.66, "grad_norm": 19.07467269897461, "learning_rate": 2.2266666666666668e-05, "loss": 0.4157, "step": 3120 }, { "epoch": 1.67, "grad_norm": 13.42187213897705, "learning_rate": 2.217777777777778e-05, "loss": 0.3717, "step": 3130 }, { "epoch": 1.67, "grad_norm": 17.826555252075195, "learning_rate": 2.208888888888889e-05, "loss": 0.3105, "step": 3140 }, { "epoch": 1.68, "grad_norm": 16.670066833496094, "learning_rate": 2.2000000000000003e-05, "loss": 0.3521, "step": 3150 }, { "epoch": 1.69, "grad_norm": 23.210683822631836, "learning_rate": 2.1911111111111113e-05, "loss": 0.2766, "step": 3160 }, { "epoch": 1.69, "grad_norm": 19.09944725036621, "learning_rate": 2.1822222222222224e-05, "loss": 0.331, "step": 3170 }, { "epoch": 1.7, "grad_norm": 13.545781135559082, "learning_rate": 2.1733333333333334e-05, "loss": 0.3638, "step": 3180 }, { "epoch": 1.7, "grad_norm": 12.350102424621582, "learning_rate": 2.1644444444444445e-05, "loss": 0.2956, "step": 3190 }, { "epoch": 1.71, "grad_norm": 26.11676025390625, "learning_rate": 2.1555555555555555e-05, "loss": 0.3546, "step": 3200 }, { "epoch": 1.71, "eval_accuracy": 0.8586, "eval_f1_macro": 0.7813783110286097, "eval_f1_micro": 0.8586, "eval_loss": 0.5264253616333008, "eval_runtime": 151.3337, "eval_samples_per_second": 66.079, "eval_steps_per_second": 2.068, "step": 3200 }, { "epoch": 1.71, "grad_norm": 25.43873405456543, "learning_rate": 2.146666666666667e-05, "loss": 0.2851, "step": 3210 }, { "epoch": 1.72, "grad_norm": 31.734495162963867, "learning_rate": 2.137777777777778e-05, "loss": 0.2799, "step": 3220 }, { "epoch": 1.72, "grad_norm": 18.14277458190918, "learning_rate": 2.128888888888889e-05, "loss": 0.2679, "step": 3230 }, { "epoch": 1.73, "grad_norm": 18.775859832763672, "learning_rate": 2.12e-05, "loss": 0.2851, "step": 3240 }, { "epoch": 1.73, "grad_norm": 24.40532684326172, "learning_rate": 2.111111111111111e-05, "loss": 0.4009, "step": 3250 }, { "epoch": 1.74, "grad_norm": 17.99397087097168, "learning_rate": 2.1022222222222225e-05, "loss": 0.3728, "step": 3260 }, { "epoch": 1.74, "grad_norm": 29.886987686157227, "learning_rate": 2.0933333333333335e-05, "loss": 0.2719, "step": 3270 }, { "epoch": 1.75, "grad_norm": 20.097272872924805, "learning_rate": 2.0844444444444446e-05, "loss": 0.3968, "step": 3280 }, { "epoch": 1.75, "grad_norm": 12.6510591506958, "learning_rate": 2.0755555555555556e-05, "loss": 0.3883, "step": 3290 }, { "epoch": 1.76, "grad_norm": 18.41010284423828, "learning_rate": 2.0666666666666666e-05, "loss": 0.4125, "step": 3300 }, { "epoch": 1.76, "eval_accuracy": 0.8509, "eval_f1_macro": 0.7787784202092634, "eval_f1_micro": 0.8509, "eval_loss": 0.5483813285827637, "eval_runtime": 151.1106, "eval_samples_per_second": 66.177, "eval_steps_per_second": 2.071, "step": 3300 }, { "epoch": 1.77, "grad_norm": 19.636871337890625, "learning_rate": 2.057777777777778e-05, "loss": 0.349, "step": 3310 }, { "epoch": 1.77, "grad_norm": 27.96908187866211, "learning_rate": 2.048888888888889e-05, "loss": 0.2733, "step": 3320 }, { "epoch": 1.78, "grad_norm": 14.402112007141113, "learning_rate": 2.04e-05, "loss": 0.253, "step": 3330 }, { "epoch": 1.78, "grad_norm": 20.20763397216797, "learning_rate": 2.031111111111111e-05, "loss": 0.4342, "step": 3340 }, { "epoch": 1.79, "grad_norm": 12.293317794799805, "learning_rate": 2.0222222222222222e-05, "loss": 0.3254, "step": 3350 }, { "epoch": 1.79, "grad_norm": 21.796401977539062, "learning_rate": 2.0133333333333336e-05, "loss": 0.3047, "step": 3360 }, { "epoch": 1.8, "grad_norm": 16.06928062438965, "learning_rate": 2.0044444444444446e-05, "loss": 0.3411, "step": 3370 }, { "epoch": 1.8, "grad_norm": 21.114471435546875, "learning_rate": 1.9955555555555557e-05, "loss": 0.2933, "step": 3380 }, { "epoch": 1.81, "grad_norm": 14.466670036315918, "learning_rate": 1.9866666666666667e-05, "loss": 0.2794, "step": 3390 }, { "epoch": 1.81, "grad_norm": 20.040361404418945, "learning_rate": 1.9777777777777778e-05, "loss": 0.2206, "step": 3400 }, { "epoch": 1.81, "eval_accuracy": 0.8563, "eval_f1_macro": 0.7800383469520217, "eval_f1_micro": 0.8563, "eval_loss": 0.5634235739707947, "eval_runtime": 151.4012, "eval_samples_per_second": 66.05, "eval_steps_per_second": 2.067, "step": 3400 }, { "epoch": 1.82, "grad_norm": 31.591552734375, "learning_rate": 1.968888888888889e-05, "loss": 0.437, "step": 3410 }, { "epoch": 1.82, "grad_norm": 18.39565658569336, "learning_rate": 1.9600000000000002e-05, "loss": 0.3858, "step": 3420 }, { "epoch": 1.83, "grad_norm": 18.943843841552734, "learning_rate": 1.9511111111111113e-05, "loss": 0.277, "step": 3430 }, { "epoch": 1.83, "grad_norm": 28.337656021118164, "learning_rate": 1.9422222222222223e-05, "loss": 0.3453, "step": 3440 }, { "epoch": 1.84, "grad_norm": 20.132535934448242, "learning_rate": 1.9333333333333333e-05, "loss": 0.2824, "step": 3450 }, { "epoch": 1.85, "grad_norm": 18.038774490356445, "learning_rate": 1.9244444444444444e-05, "loss": 0.3169, "step": 3460 }, { "epoch": 1.85, "grad_norm": 24.97245216369629, "learning_rate": 1.9155555555555558e-05, "loss": 0.2868, "step": 3470 }, { "epoch": 1.86, "grad_norm": 24.715192794799805, "learning_rate": 1.9066666666666668e-05, "loss": 0.3365, "step": 3480 }, { "epoch": 1.86, "grad_norm": 26.29402732849121, "learning_rate": 1.897777777777778e-05, "loss": 0.3746, "step": 3490 }, { "epoch": 1.87, "grad_norm": 17.953739166259766, "learning_rate": 1.888888888888889e-05, "loss": 0.3348, "step": 3500 }, { "epoch": 1.87, "eval_accuracy": 0.8644, "eval_f1_macro": 0.7889602469263715, "eval_f1_micro": 0.8644, "eval_loss": 0.5153625011444092, "eval_runtime": 151.5332, "eval_samples_per_second": 65.992, "eval_steps_per_second": 2.066, "step": 3500 }, { "epoch": 1.87, "grad_norm": 14.394041061401367, "learning_rate": 1.88e-05, "loss": 0.2773, "step": 3510 }, { "epoch": 1.88, "grad_norm": 19.886205673217773, "learning_rate": 1.8711111111111113e-05, "loss": 0.3233, "step": 3520 }, { "epoch": 1.88, "grad_norm": 16.647079467773438, "learning_rate": 1.8622222222222224e-05, "loss": 0.2804, "step": 3530 }, { "epoch": 1.89, "grad_norm": 5.996079921722412, "learning_rate": 1.8533333333333334e-05, "loss": 0.2722, "step": 3540 }, { "epoch": 1.89, "grad_norm": 1.7200807332992554, "learning_rate": 1.8444444444444445e-05, "loss": 0.2704, "step": 3550 }, { "epoch": 1.9, "grad_norm": 11.222332000732422, "learning_rate": 1.8355555555555555e-05, "loss": 0.3672, "step": 3560 }, { "epoch": 1.9, "grad_norm": 27.67245864868164, "learning_rate": 1.826666666666667e-05, "loss": 0.3744, "step": 3570 }, { "epoch": 1.91, "grad_norm": 21.44915008544922, "learning_rate": 1.817777777777778e-05, "loss": 0.2362, "step": 3580 }, { "epoch": 1.91, "grad_norm": 34.626834869384766, "learning_rate": 1.808888888888889e-05, "loss": 0.2912, "step": 3590 }, { "epoch": 1.92, "grad_norm": 24.425785064697266, "learning_rate": 1.8e-05, "loss": 0.3451, "step": 3600 }, { "epoch": 1.92, "eval_accuracy": 0.8667, "eval_f1_macro": 0.7857935048958724, "eval_f1_micro": 0.8667, "eval_loss": 0.5220906138420105, "eval_runtime": 151.6123, "eval_samples_per_second": 65.958, "eval_steps_per_second": 2.064, "step": 3600 }, { "epoch": 1.93, "grad_norm": 19.154823303222656, "learning_rate": 1.791111111111111e-05, "loss": 0.3439, "step": 3610 }, { "epoch": 1.93, "grad_norm": 22.16014862060547, "learning_rate": 1.7822222222222225e-05, "loss": 0.2799, "step": 3620 }, { "epoch": 1.94, "grad_norm": 13.806198120117188, "learning_rate": 1.7733333333333335e-05, "loss": 0.2911, "step": 3630 }, { "epoch": 1.94, "grad_norm": 19.70717430114746, "learning_rate": 1.7644444444444446e-05, "loss": 0.2232, "step": 3640 }, { "epoch": 1.95, "grad_norm": 24.279129028320312, "learning_rate": 1.7555555555555556e-05, "loss": 0.3711, "step": 3650 }, { "epoch": 1.95, "grad_norm": 13.949311256408691, "learning_rate": 1.7466666666666667e-05, "loss": 0.3905, "step": 3660 }, { "epoch": 1.96, "grad_norm": 22.523921966552734, "learning_rate": 1.737777777777778e-05, "loss": 0.3294, "step": 3670 }, { "epoch": 1.96, "grad_norm": 27.370868682861328, "learning_rate": 1.728888888888889e-05, "loss": 0.3046, "step": 3680 }, { "epoch": 1.97, "grad_norm": 21.498058319091797, "learning_rate": 1.7199999999999998e-05, "loss": 0.4282, "step": 3690 }, { "epoch": 1.97, "grad_norm": 18.41233253479004, "learning_rate": 1.7111111111111112e-05, "loss": 0.3077, "step": 3700 }, { "epoch": 1.97, "eval_accuracy": 0.8662, "eval_f1_macro": 0.7935499844173846, "eval_f1_micro": 0.8662, "eval_loss": 0.5041437745094299, "eval_runtime": 151.0113, "eval_samples_per_second": 66.22, "eval_steps_per_second": 2.073, "step": 3700 }, { "epoch": 1.98, "grad_norm": 16.763723373413086, "learning_rate": 1.7022222222222222e-05, "loss": 0.225, "step": 3710 }, { "epoch": 1.98, "grad_norm": 11.353950500488281, "learning_rate": 1.6933333333333333e-05, "loss": 0.2651, "step": 3720 }, { "epoch": 1.99, "grad_norm": 17.139925003051758, "learning_rate": 1.6844444444444447e-05, "loss": 0.3122, "step": 3730 }, { "epoch": 1.99, "grad_norm": 16.368776321411133, "learning_rate": 1.6755555555555557e-05, "loss": 0.3058, "step": 3740 }, { "epoch": 2.0, "grad_norm": 20.346548080444336, "learning_rate": 1.6666666666666667e-05, "loss": 0.3383, "step": 3750 }, { "epoch": 2.01, "grad_norm": 9.95333480834961, "learning_rate": 1.6577777777777778e-05, "loss": 0.0964, "step": 3760 }, { "epoch": 2.01, "grad_norm": 5.907923221588135, "learning_rate": 1.648888888888889e-05, "loss": 0.1152, "step": 3770 }, { "epoch": 2.02, "grad_norm": 8.541373252868652, "learning_rate": 1.6400000000000002e-05, "loss": 0.131, "step": 3780 }, { "epoch": 2.02, "grad_norm": 14.729881286621094, "learning_rate": 1.6311111111111113e-05, "loss": 0.1403, "step": 3790 }, { "epoch": 2.03, "grad_norm": 9.04763412475586, "learning_rate": 1.6222222222222223e-05, "loss": 0.1352, "step": 3800 }, { "epoch": 2.03, "eval_accuracy": 0.8668, "eval_f1_macro": 0.7919091255565409, "eval_f1_micro": 0.8668, "eval_loss": 0.5686902403831482, "eval_runtime": 151.4998, "eval_samples_per_second": 66.007, "eval_steps_per_second": 2.066, "step": 3800 }, { "epoch": 2.03, "grad_norm": 6.057829856872559, "learning_rate": 1.6133333333333334e-05, "loss": 0.089, "step": 3810 }, { "epoch": 2.04, "grad_norm": 16.216115951538086, "learning_rate": 1.6044444444444444e-05, "loss": 0.1422, "step": 3820 }, { "epoch": 2.04, "grad_norm": 8.361082077026367, "learning_rate": 1.5955555555555558e-05, "loss": 0.0598, "step": 3830 }, { "epoch": 2.05, "grad_norm": 3.5720770359039307, "learning_rate": 1.586666666666667e-05, "loss": 0.0862, "step": 3840 }, { "epoch": 2.05, "grad_norm": 11.39939022064209, "learning_rate": 1.577777777777778e-05, "loss": 0.1106, "step": 3850 }, { "epoch": 2.06, "grad_norm": 15.31943416595459, "learning_rate": 1.568888888888889e-05, "loss": 0.0744, "step": 3860 }, { "epoch": 2.06, "grad_norm": 10.522315979003906, "learning_rate": 1.56e-05, "loss": 0.0734, "step": 3870 }, { "epoch": 2.07, "grad_norm": 12.624775886535645, "learning_rate": 1.5511111111111114e-05, "loss": 0.076, "step": 3880 }, { "epoch": 2.07, "grad_norm": 29.244462966918945, "learning_rate": 1.5422222222222224e-05, "loss": 0.127, "step": 3890 }, { "epoch": 2.08, "grad_norm": 6.5718607902526855, "learning_rate": 1.5333333333333334e-05, "loss": 0.1012, "step": 3900 }, { "epoch": 2.08, "eval_accuracy": 0.8651, "eval_f1_macro": 0.7887873056662138, "eval_f1_micro": 0.8651, "eval_loss": 0.575444757938385, "eval_runtime": 151.4945, "eval_samples_per_second": 66.009, "eval_steps_per_second": 2.066, "step": 3900 }, { "epoch": 2.09, "grad_norm": 7.431846618652344, "learning_rate": 1.5244444444444445e-05, "loss": 0.1201, "step": 3910 }, { "epoch": 2.09, "grad_norm": 11.772350311279297, "learning_rate": 1.5155555555555555e-05, "loss": 0.0808, "step": 3920 }, { "epoch": 2.1, "grad_norm": 6.622474193572998, "learning_rate": 1.5066666666666668e-05, "loss": 0.0708, "step": 3930 }, { "epoch": 2.1, "grad_norm": 0.8099313378334045, "learning_rate": 1.497777777777778e-05, "loss": 0.1169, "step": 3940 }, { "epoch": 2.11, "grad_norm": 11.817524909973145, "learning_rate": 1.4888888888888888e-05, "loss": 0.0881, "step": 3950 }, { "epoch": 2.11, "grad_norm": 17.54085350036621, "learning_rate": 1.48e-05, "loss": 0.0854, "step": 3960 }, { "epoch": 2.12, "grad_norm": 6.687868118286133, "learning_rate": 1.4711111111111111e-05, "loss": 0.0392, "step": 3970 }, { "epoch": 2.12, "grad_norm": 20.661422729492188, "learning_rate": 1.4622222222222223e-05, "loss": 0.1295, "step": 3980 }, { "epoch": 2.13, "grad_norm": 12.012689590454102, "learning_rate": 1.4533333333333335e-05, "loss": 0.0725, "step": 3990 }, { "epoch": 2.13, "grad_norm": 14.44579029083252, "learning_rate": 1.4444444444444444e-05, "loss": 0.1006, "step": 4000 }, { "epoch": 2.13, "eval_accuracy": 0.872, "eval_f1_macro": 0.7958912080196285, "eval_f1_micro": 0.872, "eval_loss": 0.5928996801376343, "eval_runtime": 151.575, "eval_samples_per_second": 65.974, "eval_steps_per_second": 2.065, "step": 4000 }, { "epoch": 2.14, "grad_norm": 12.146600723266602, "learning_rate": 1.4355555555555556e-05, "loss": 0.0758, "step": 4010 }, { "epoch": 2.14, "grad_norm": 6.771599769592285, "learning_rate": 1.4266666666666667e-05, "loss": 0.1324, "step": 4020 }, { "epoch": 2.15, "grad_norm": 6.837072372436523, "learning_rate": 1.4177777777777779e-05, "loss": 0.1205, "step": 4030 }, { "epoch": 2.15, "grad_norm": 11.44955825805664, "learning_rate": 1.4088888888888891e-05, "loss": 0.0877, "step": 4040 }, { "epoch": 2.16, "grad_norm": 19.20203399658203, "learning_rate": 1.4000000000000001e-05, "loss": 0.1092, "step": 4050 }, { "epoch": 2.17, "grad_norm": 17.78152084350586, "learning_rate": 1.391111111111111e-05, "loss": 0.1364, "step": 4060 }, { "epoch": 2.17, "grad_norm": 3.235673666000366, "learning_rate": 1.3822222222222222e-05, "loss": 0.0909, "step": 4070 }, { "epoch": 2.18, "grad_norm": 5.298550128936768, "learning_rate": 1.3733333333333335e-05, "loss": 0.0663, "step": 4080 }, { "epoch": 2.18, "grad_norm": 4.339886665344238, "learning_rate": 1.3644444444444445e-05, "loss": 0.0641, "step": 4090 }, { "epoch": 2.19, "grad_norm": 9.271440505981445, "learning_rate": 1.3555555555555557e-05, "loss": 0.0536, "step": 4100 }, { "epoch": 2.19, "eval_accuracy": 0.8739, "eval_f1_macro": 0.7991612130458756, "eval_f1_micro": 0.8739, "eval_loss": 0.5760007500648499, "eval_runtime": 151.2539, "eval_samples_per_second": 66.114, "eval_steps_per_second": 2.069, "step": 4100 }, { "epoch": 2.19, "grad_norm": 17.186298370361328, "learning_rate": 1.3466666666666666e-05, "loss": 0.0742, "step": 4110 }, { "epoch": 2.2, "grad_norm": 0.7883173227310181, "learning_rate": 1.3377777777777778e-05, "loss": 0.0655, "step": 4120 }, { "epoch": 2.2, "grad_norm": 9.044299125671387, "learning_rate": 1.328888888888889e-05, "loss": 0.0756, "step": 4130 }, { "epoch": 2.21, "grad_norm": 16.307199478149414, "learning_rate": 1.32e-05, "loss": 0.0511, "step": 4140 }, { "epoch": 2.21, "grad_norm": 7.927936553955078, "learning_rate": 1.3111111111111113e-05, "loss": 0.0507, "step": 4150 }, { "epoch": 2.22, "grad_norm": 3.2003839015960693, "learning_rate": 1.3022222222222222e-05, "loss": 0.089, "step": 4160 }, { "epoch": 2.22, "grad_norm": 1.9253557920455933, "learning_rate": 1.2933333333333334e-05, "loss": 0.0479, "step": 4170 }, { "epoch": 2.23, "grad_norm": 12.930800437927246, "learning_rate": 1.2844444444444446e-05, "loss": 0.0889, "step": 4180 }, { "epoch": 2.23, "grad_norm": 19.236953735351562, "learning_rate": 1.2755555555555556e-05, "loss": 0.1065, "step": 4190 }, { "epoch": 2.24, "grad_norm": 2.595717191696167, "learning_rate": 1.2666666666666668e-05, "loss": 0.0401, "step": 4200 }, { "epoch": 2.24, "eval_accuracy": 0.87, "eval_f1_macro": 0.7935489170050238, "eval_f1_micro": 0.87, "eval_loss": 0.6250885128974915, "eval_runtime": 151.4344, "eval_samples_per_second": 66.035, "eval_steps_per_second": 2.067, "step": 4200 }, { "epoch": 2.25, "grad_norm": 7.041379928588867, "learning_rate": 1.2577777777777777e-05, "loss": 0.0463, "step": 4210 }, { "epoch": 2.25, "grad_norm": 25.49098014831543, "learning_rate": 1.248888888888889e-05, "loss": 0.1407, "step": 4220 }, { "epoch": 2.26, "grad_norm": 2.6813275814056396, "learning_rate": 1.24e-05, "loss": 0.0726, "step": 4230 }, { "epoch": 2.26, "grad_norm": 3.9798381328582764, "learning_rate": 1.2311111111111112e-05, "loss": 0.0948, "step": 4240 }, { "epoch": 2.27, "grad_norm": 12.667770385742188, "learning_rate": 1.2222222222222222e-05, "loss": 0.0802, "step": 4250 }, { "epoch": 2.27, "grad_norm": 16.500022888183594, "learning_rate": 1.2133333333333335e-05, "loss": 0.0976, "step": 4260 }, { "epoch": 2.28, "grad_norm": 6.598761558532715, "learning_rate": 1.2044444444444445e-05, "loss": 0.0522, "step": 4270 }, { "epoch": 2.28, "grad_norm": 8.615388870239258, "learning_rate": 1.1955555555555556e-05, "loss": 0.0968, "step": 4280 }, { "epoch": 2.29, "grad_norm": 2.7019007205963135, "learning_rate": 1.1866666666666668e-05, "loss": 0.0882, "step": 4290 }, { "epoch": 2.29, "grad_norm": 2.509350538253784, "learning_rate": 1.1777777777777778e-05, "loss": 0.0756, "step": 4300 }, { "epoch": 2.29, "eval_accuracy": 0.8709, "eval_f1_macro": 0.8027359908802645, "eval_f1_micro": 0.8709, "eval_loss": 0.5894597172737122, "eval_runtime": 151.3844, "eval_samples_per_second": 66.057, "eval_steps_per_second": 2.068, "step": 4300 }, { "epoch": 2.3, "grad_norm": 8.408510208129883, "learning_rate": 1.168888888888889e-05, "loss": 0.1088, "step": 4310 }, { "epoch": 2.3, "grad_norm": 17.875926971435547, "learning_rate": 1.16e-05, "loss": 0.0945, "step": 4320 }, { "epoch": 2.31, "grad_norm": 11.225419998168945, "learning_rate": 1.1511111111111111e-05, "loss": 0.1098, "step": 4330 }, { "epoch": 2.31, "grad_norm": 14.77673053741455, "learning_rate": 1.1422222222222223e-05, "loss": 0.1175, "step": 4340 }, { "epoch": 2.32, "grad_norm": 17.81058692932129, "learning_rate": 1.1333333333333334e-05, "loss": 0.1149, "step": 4350 }, { "epoch": 2.33, "grad_norm": 1.26676607131958, "learning_rate": 1.1244444444444444e-05, "loss": 0.0419, "step": 4360 }, { "epoch": 2.33, "grad_norm": 12.364920616149902, "learning_rate": 1.1155555555555556e-05, "loss": 0.1475, "step": 4370 }, { "epoch": 2.34, "grad_norm": 7.148777484893799, "learning_rate": 1.1066666666666667e-05, "loss": 0.1141, "step": 4380 }, { "epoch": 2.34, "grad_norm": 10.524274826049805, "learning_rate": 1.0977777777777779e-05, "loss": 0.1338, "step": 4390 }, { "epoch": 2.35, "grad_norm": 7.37857723236084, "learning_rate": 1.088888888888889e-05, "loss": 0.0501, "step": 4400 }, { "epoch": 2.35, "eval_accuracy": 0.8707, "eval_f1_macro": 0.7961653209110884, "eval_f1_micro": 0.8707, "eval_loss": 0.5434012413024902, "eval_runtime": 151.1876, "eval_samples_per_second": 66.143, "eval_steps_per_second": 2.07, "step": 4400 }, { "epoch": 2.35, "grad_norm": 16.111724853515625, "learning_rate": 1.08e-05, "loss": 0.1003, "step": 4410 }, { "epoch": 2.36, "grad_norm": 6.396437168121338, "learning_rate": 1.0711111111111112e-05, "loss": 0.088, "step": 4420 }, { "epoch": 2.36, "grad_norm": 19.40189552307129, "learning_rate": 1.0622222222222223e-05, "loss": 0.1436, "step": 4430 }, { "epoch": 2.37, "grad_norm": 24.316781997680664, "learning_rate": 1.0533333333333335e-05, "loss": 0.0873, "step": 4440 }, { "epoch": 2.37, "grad_norm": 4.808385372161865, "learning_rate": 1.0444444444444445e-05, "loss": 0.0696, "step": 4450 }, { "epoch": 2.38, "grad_norm": 1.6188451051712036, "learning_rate": 1.0355555555555556e-05, "loss": 0.1395, "step": 4460 }, { "epoch": 2.38, "grad_norm": 17.673221588134766, "learning_rate": 1.0266666666666668e-05, "loss": 0.0832, "step": 4470 }, { "epoch": 2.39, "grad_norm": 11.216964721679688, "learning_rate": 1.0177777777777778e-05, "loss": 0.0707, "step": 4480 }, { "epoch": 2.39, "grad_norm": 10.152276992797852, "learning_rate": 1.0088888888888889e-05, "loss": 0.0895, "step": 4490 }, { "epoch": 2.4, "grad_norm": 11.688224792480469, "learning_rate": 1e-05, "loss": 0.0611, "step": 4500 }, { "epoch": 2.4, "eval_accuracy": 0.8759, "eval_f1_macro": 0.8042158552214435, "eval_f1_micro": 0.8759, "eval_loss": 0.594874918460846, "eval_runtime": 151.3642, "eval_samples_per_second": 66.066, "eval_steps_per_second": 2.068, "step": 4500 }, { "epoch": 2.41, "grad_norm": 6.995627403259277, "learning_rate": 9.911111111111111e-06, "loss": 0.0986, "step": 4510 }, { "epoch": 2.41, "grad_norm": 5.848443031311035, "learning_rate": 9.822222222222223e-06, "loss": 0.0719, "step": 4520 }, { "epoch": 2.42, "grad_norm": 12.806991577148438, "learning_rate": 9.733333333333334e-06, "loss": 0.1492, "step": 4530 }, { "epoch": 2.42, "grad_norm": 17.90582847595215, "learning_rate": 9.644444444444444e-06, "loss": 0.1033, "step": 4540 }, { "epoch": 2.43, "grad_norm": 14.297840118408203, "learning_rate": 9.555555555555556e-06, "loss": 0.0654, "step": 4550 }, { "epoch": 2.43, "grad_norm": 27.18831443786621, "learning_rate": 9.466666666666667e-06, "loss": 0.1117, "step": 4560 }, { "epoch": 2.44, "grad_norm": 4.1261420249938965, "learning_rate": 9.377777777777779e-06, "loss": 0.0619, "step": 4570 }, { "epoch": 2.44, "grad_norm": 3.1558353900909424, "learning_rate": 9.288888888888888e-06, "loss": 0.0841, "step": 4580 }, { "epoch": 2.45, "grad_norm": 16.345598220825195, "learning_rate": 9.2e-06, "loss": 0.0423, "step": 4590 }, { "epoch": 2.45, "grad_norm": 14.02190113067627, "learning_rate": 9.111111111111112e-06, "loss": 0.081, "step": 4600 }, { "epoch": 2.45, "eval_accuracy": 0.8787, "eval_f1_macro": 0.8122250468168198, "eval_f1_micro": 0.8787, "eval_loss": 0.6089494228363037, "eval_runtime": 151.2228, "eval_samples_per_second": 66.128, "eval_steps_per_second": 2.07, "step": 4600 }, { "epoch": 2.46, "grad_norm": 14.47243881225586, "learning_rate": 9.022222222222223e-06, "loss": 0.0982, "step": 4610 }, { "epoch": 2.46, "grad_norm": 19.576120376586914, "learning_rate": 8.933333333333333e-06, "loss": 0.1043, "step": 4620 }, { "epoch": 2.47, "grad_norm": 11.853041648864746, "learning_rate": 8.844444444444445e-06, "loss": 0.1342, "step": 4630 }, { "epoch": 2.47, "grad_norm": 16.51972198486328, "learning_rate": 8.755555555555556e-06, "loss": 0.0949, "step": 4640 }, { "epoch": 2.48, "grad_norm": 10.615642547607422, "learning_rate": 8.666666666666668e-06, "loss": 0.0783, "step": 4650 }, { "epoch": 2.49, "grad_norm": 14.698683738708496, "learning_rate": 8.577777777777778e-06, "loss": 0.0649, "step": 4660 }, { "epoch": 2.49, "grad_norm": 11.355724334716797, "learning_rate": 8.488888888888889e-06, "loss": 0.096, "step": 4670 }, { "epoch": 2.5, "grad_norm": 0.6558987498283386, "learning_rate": 8.400000000000001e-06, "loss": 0.0983, "step": 4680 }, { "epoch": 2.5, "grad_norm": 23.330352783203125, "learning_rate": 8.311111111111111e-06, "loss": 0.1113, "step": 4690 }, { "epoch": 2.51, "grad_norm": 25.149642944335938, "learning_rate": 8.222222222222223e-06, "loss": 0.1033, "step": 4700 }, { "epoch": 2.51, "eval_accuracy": 0.8752, "eval_f1_macro": 0.8106859211414766, "eval_f1_micro": 0.8752, "eval_loss": 0.5790488123893738, "eval_runtime": 151.0, "eval_samples_per_second": 66.225, "eval_steps_per_second": 2.073, "step": 4700 }, { "epoch": 2.51, "grad_norm": 9.525333404541016, "learning_rate": 8.133333333333332e-06, "loss": 0.0938, "step": 4710 }, { "epoch": 2.52, "grad_norm": 11.757851600646973, "learning_rate": 8.044444444444444e-06, "loss": 0.0564, "step": 4720 }, { "epoch": 2.52, "grad_norm": 8.882828712463379, "learning_rate": 7.955555555555557e-06, "loss": 0.0632, "step": 4730 }, { "epoch": 2.53, "grad_norm": 4.400601863861084, "learning_rate": 7.866666666666667e-06, "loss": 0.0778, "step": 4740 }, { "epoch": 2.53, "grad_norm": 2.988673210144043, "learning_rate": 7.777777777777777e-06, "loss": 0.0504, "step": 4750 }, { "epoch": 2.54, "grad_norm": 12.651586532592773, "learning_rate": 7.68888888888889e-06, "loss": 0.0626, "step": 4760 }, { "epoch": 2.54, "grad_norm": 5.021940231323242, "learning_rate": 7.6e-06, "loss": 0.079, "step": 4770 }, { "epoch": 2.55, "grad_norm": 10.836983680725098, "learning_rate": 7.511111111111112e-06, "loss": 0.0636, "step": 4780 }, { "epoch": 2.55, "grad_norm": 14.971363067626953, "learning_rate": 7.422222222222222e-06, "loss": 0.0602, "step": 4790 }, { "epoch": 2.56, "grad_norm": 12.086758613586426, "learning_rate": 7.333333333333334e-06, "loss": 0.1131, "step": 4800 }, { "epoch": 2.56, "eval_accuracy": 0.8747, "eval_f1_macro": 0.8036013701000208, "eval_f1_micro": 0.8747, "eval_loss": 0.5827542543411255, "eval_runtime": 151.0845, "eval_samples_per_second": 66.188, "eval_steps_per_second": 2.072, "step": 4800 }, { "epoch": 2.57, "grad_norm": 15.703210830688477, "learning_rate": 7.244444444444445e-06, "loss": 0.0659, "step": 4810 }, { "epoch": 2.57, "grad_norm": 24.10576629638672, "learning_rate": 7.155555555555556e-06, "loss": 0.0621, "step": 4820 }, { "epoch": 2.58, "grad_norm": 17.84850311279297, "learning_rate": 7.066666666666667e-06, "loss": 0.06, "step": 4830 }, { "epoch": 2.58, "grad_norm": 14.490592956542969, "learning_rate": 6.9777777777777775e-06, "loss": 0.1885, "step": 4840 }, { "epoch": 2.59, "grad_norm": 15.106103897094727, "learning_rate": 6.888888888888889e-06, "loss": 0.0664, "step": 4850 }, { "epoch": 2.59, "grad_norm": 11.113916397094727, "learning_rate": 6.800000000000001e-06, "loss": 0.093, "step": 4860 }, { "epoch": 2.6, "grad_norm": 15.545525550842285, "learning_rate": 6.711111111111111e-06, "loss": 0.0892, "step": 4870 }, { "epoch": 2.6, "grad_norm": 14.384480476379395, "learning_rate": 6.622222222222223e-06, "loss": 0.0765, "step": 4880 }, { "epoch": 2.61, "grad_norm": 4.170871734619141, "learning_rate": 6.533333333333333e-06, "loss": 0.0553, "step": 4890 }, { "epoch": 2.61, "grad_norm": 10.818324089050293, "learning_rate": 6.4444444444444445e-06, "loss": 0.094, "step": 4900 }, { "epoch": 2.61, "eval_accuracy": 0.878, "eval_f1_macro": 0.8106900956656236, "eval_f1_micro": 0.878, "eval_loss": 0.5612391829490662, "eval_runtime": 151.0215, "eval_samples_per_second": 66.216, "eval_steps_per_second": 2.073, "step": 4900 }, { "epoch": 2.62, "grad_norm": 9.160780906677246, "learning_rate": 6.355555555555557e-06, "loss": 0.0859, "step": 4910 }, { "epoch": 2.62, "grad_norm": 13.956222534179688, "learning_rate": 6.266666666666666e-06, "loss": 0.0756, "step": 4920 }, { "epoch": 2.63, "grad_norm": 1.7093982696533203, "learning_rate": 6.177777777777778e-06, "loss": 0.0385, "step": 4930 }, { "epoch": 2.63, "grad_norm": 18.47127342224121, "learning_rate": 6.088888888888889e-06, "loss": 0.0643, "step": 4940 }, { "epoch": 2.64, "grad_norm": 5.761905670166016, "learning_rate": 6e-06, "loss": 0.0387, "step": 4950 }, { "epoch": 2.65, "grad_norm": 11.518508911132812, "learning_rate": 5.9111111111111115e-06, "loss": 0.0857, "step": 4960 }, { "epoch": 2.65, "grad_norm": 11.523582458496094, "learning_rate": 5.822222222222223e-06, "loss": 0.0607, "step": 4970 }, { "epoch": 2.66, "grad_norm": 3.5160629749298096, "learning_rate": 5.733333333333333e-06, "loss": 0.0965, "step": 4980 }, { "epoch": 2.66, "grad_norm": 16.30809783935547, "learning_rate": 5.6444444444444445e-06, "loss": 0.0392, "step": 4990 }, { "epoch": 2.67, "grad_norm": 12.250411987304688, "learning_rate": 5.555555555555556e-06, "loss": 0.0853, "step": 5000 }, { "epoch": 2.67, "eval_accuracy": 0.8784, "eval_f1_macro": 0.8123002988678545, "eval_f1_micro": 0.8784, "eval_loss": 0.5772224068641663, "eval_runtime": 151.0336, "eval_samples_per_second": 66.21, "eval_steps_per_second": 2.072, "step": 5000 }, { "epoch": 2.67, "grad_norm": 18.300569534301758, "learning_rate": 5.466666666666667e-06, "loss": 0.0925, "step": 5010 }, { "epoch": 2.68, "grad_norm": 4.053598403930664, "learning_rate": 5.3777777777777784e-06, "loss": 0.0906, "step": 5020 }, { "epoch": 2.68, "grad_norm": 3.2842700481414795, "learning_rate": 5.288888888888889e-06, "loss": 0.0562, "step": 5030 }, { "epoch": 2.69, "grad_norm": 1.8992033004760742, "learning_rate": 5.2e-06, "loss": 0.0421, "step": 5040 }, { "epoch": 2.69, "grad_norm": 4.324485778808594, "learning_rate": 5.1111111111111115e-06, "loss": 0.0704, "step": 5050 }, { "epoch": 2.7, "grad_norm": 10.114728927612305, "learning_rate": 5.022222222222223e-06, "loss": 0.0568, "step": 5060 }, { "epoch": 2.7, "grad_norm": 3.556407928466797, "learning_rate": 4.933333333333333e-06, "loss": 0.0487, "step": 5070 }, { "epoch": 2.71, "grad_norm": 11.58324146270752, "learning_rate": 4.8444444444444446e-06, "loss": 0.1051, "step": 5080 }, { "epoch": 2.71, "grad_norm": 12.190340042114258, "learning_rate": 4.755555555555556e-06, "loss": 0.0609, "step": 5090 }, { "epoch": 2.72, "grad_norm": 10.30691909790039, "learning_rate": 4.666666666666667e-06, "loss": 0.0917, "step": 5100 }, { "epoch": 2.72, "eval_accuracy": 0.8805, "eval_f1_macro": 0.8123461705531486, "eval_f1_micro": 0.8805, "eval_loss": 0.559500515460968, "eval_runtime": 150.9375, "eval_samples_per_second": 66.253, "eval_steps_per_second": 2.074, "step": 5100 }, { "epoch": 2.73, "grad_norm": 15.113903045654297, "learning_rate": 4.5777777777777785e-06, "loss": 0.0634, "step": 5110 }, { "epoch": 2.73, "grad_norm": 3.779010534286499, "learning_rate": 4.488888888888889e-06, "loss": 0.0507, "step": 5120 }, { "epoch": 2.74, "grad_norm": 5.111919403076172, "learning_rate": 4.4e-06, "loss": 0.0214, "step": 5130 }, { "epoch": 2.74, "grad_norm": 0.48539718985557556, "learning_rate": 4.3111111111111115e-06, "loss": 0.0986, "step": 5140 }, { "epoch": 2.75, "grad_norm": 7.374268531799316, "learning_rate": 4.222222222222223e-06, "loss": 0.047, "step": 5150 }, { "epoch": 2.75, "grad_norm": 12.559208869934082, "learning_rate": 4.133333333333333e-06, "loss": 0.0851, "step": 5160 }, { "epoch": 2.76, "grad_norm": 5.660250663757324, "learning_rate": 4.044444444444445e-06, "loss": 0.0488, "step": 5170 }, { "epoch": 2.76, "grad_norm": 4.466784477233887, "learning_rate": 3.955555555555555e-06, "loss": 0.1213, "step": 5180 }, { "epoch": 2.77, "grad_norm": 6.510303020477295, "learning_rate": 3.866666666666667e-06, "loss": 0.0179, "step": 5190 }, { "epoch": 2.77, "grad_norm": 6.136260032653809, "learning_rate": 3.777777777777778e-06, "loss": 0.0542, "step": 5200 }, { "epoch": 2.77, "eval_accuracy": 0.8814, "eval_f1_macro": 0.8146631928155758, "eval_f1_micro": 0.8814, "eval_loss": 0.5781938433647156, "eval_runtime": 151.022, "eval_samples_per_second": 66.216, "eval_steps_per_second": 2.073, "step": 5200 }, { "epoch": 2.78, "grad_norm": 4.440271854400635, "learning_rate": 3.688888888888889e-06, "loss": 0.0422, "step": 5210 }, { "epoch": 2.78, "grad_norm": 2.652446746826172, "learning_rate": 3.6e-06, "loss": 0.056, "step": 5220 }, { "epoch": 2.79, "grad_norm": 10.915148735046387, "learning_rate": 3.5111111111111116e-06, "loss": 0.0615, "step": 5230 }, { "epoch": 2.79, "grad_norm": 11.108790397644043, "learning_rate": 3.4222222222222224e-06, "loss": 0.0639, "step": 5240 }, { "epoch": 2.8, "grad_norm": 16.01081085205078, "learning_rate": 3.3333333333333333e-06, "loss": 0.0517, "step": 5250 }, { "epoch": 2.81, "grad_norm": 9.080989837646484, "learning_rate": 3.244444444444444e-06, "loss": 0.0594, "step": 5260 }, { "epoch": 2.81, "grad_norm": 15.85318660736084, "learning_rate": 3.155555555555556e-06, "loss": 0.0511, "step": 5270 }, { "epoch": 2.82, "grad_norm": 13.549962043762207, "learning_rate": 3.066666666666667e-06, "loss": 0.0996, "step": 5280 }, { "epoch": 2.82, "grad_norm": 2.0129482746124268, "learning_rate": 2.977777777777778e-06, "loss": 0.0704, "step": 5290 }, { "epoch": 2.83, "grad_norm": 0.5195946097373962, "learning_rate": 2.888888888888889e-06, "loss": 0.0754, "step": 5300 }, { "epoch": 2.83, "eval_accuracy": 0.8821, "eval_f1_macro": 0.8171083228509614, "eval_f1_micro": 0.8821, "eval_loss": 0.5936337113380432, "eval_runtime": 151.1685, "eval_samples_per_second": 66.151, "eval_steps_per_second": 2.071, "step": 5300 }, { "epoch": 2.83, "grad_norm": 17.216556549072266, "learning_rate": 2.8000000000000003e-06, "loss": 0.0802, "step": 5310 }, { "epoch": 2.84, "grad_norm": 15.105179786682129, "learning_rate": 2.711111111111111e-06, "loss": 0.0701, "step": 5320 }, { "epoch": 2.84, "grad_norm": 3.2868239879608154, "learning_rate": 2.6222222222222225e-06, "loss": 0.0523, "step": 5330 }, { "epoch": 2.85, "grad_norm": 17.145092010498047, "learning_rate": 2.5333333333333334e-06, "loss": 0.1424, "step": 5340 }, { "epoch": 2.85, "grad_norm": 9.835216522216797, "learning_rate": 2.4444444444444447e-06, "loss": 0.0504, "step": 5350 }, { "epoch": 2.86, "grad_norm": 9.431855201721191, "learning_rate": 2.3555555555555555e-06, "loss": 0.076, "step": 5360 }, { "epoch": 2.86, "grad_norm": 0.8578177690505981, "learning_rate": 2.266666666666667e-06, "loss": 0.045, "step": 5370 }, { "epoch": 2.87, "grad_norm": 3.6194913387298584, "learning_rate": 2.1777777777777777e-06, "loss": 0.0414, "step": 5380 }, { "epoch": 2.87, "grad_norm": 10.720399856567383, "learning_rate": 2.088888888888889e-06, "loss": 0.0757, "step": 5390 }, { "epoch": 2.88, "grad_norm": 11.850172996520996, "learning_rate": 2.0000000000000003e-06, "loss": 0.1001, "step": 5400 }, { "epoch": 2.88, "eval_accuracy": 0.8827, "eval_f1_macro": 0.8156503347773745, "eval_f1_micro": 0.8827, "eval_loss": 0.5625789761543274, "eval_runtime": 151.0652, "eval_samples_per_second": 66.197, "eval_steps_per_second": 2.072, "step": 5400 }, { "epoch": 2.89, "grad_norm": 3.0367789268493652, "learning_rate": 1.9111111111111112e-06, "loss": 0.0614, "step": 5410 }, { "epoch": 2.89, "grad_norm": 0.6901952028274536, "learning_rate": 1.8222222222222223e-06, "loss": 0.0359, "step": 5420 }, { "epoch": 2.9, "grad_norm": 6.812145709991455, "learning_rate": 1.7333333333333334e-06, "loss": 0.0217, "step": 5430 }, { "epoch": 2.9, "grad_norm": 9.04457950592041, "learning_rate": 1.6444444444444447e-06, "loss": 0.0469, "step": 5440 }, { "epoch": 2.91, "grad_norm": 21.277233123779297, "learning_rate": 1.5555555555555556e-06, "loss": 0.0893, "step": 5450 }, { "epoch": 2.91, "grad_norm": 9.495359420776367, "learning_rate": 1.4666666666666667e-06, "loss": 0.0607, "step": 5460 }, { "epoch": 2.92, "grad_norm": 9.839810371398926, "learning_rate": 1.3777777777777778e-06, "loss": 0.066, "step": 5470 }, { "epoch": 2.92, "grad_norm": 17.838457107543945, "learning_rate": 1.2888888888888889e-06, "loss": 0.0549, "step": 5480 }, { "epoch": 2.93, "grad_norm": 5.896942138671875, "learning_rate": 1.2000000000000002e-06, "loss": 0.0468, "step": 5490 }, { "epoch": 2.93, "grad_norm": 0.5075032711029053, "learning_rate": 1.1111111111111112e-06, "loss": 0.0311, "step": 5500 }, { "epoch": 2.93, "eval_accuracy": 0.8818, "eval_f1_macro": 0.8151866330745442, "eval_f1_micro": 0.8818, "eval_loss": 0.5690019726753235, "eval_runtime": 151.3749, "eval_samples_per_second": 66.061, "eval_steps_per_second": 2.068, "step": 5500 }, { "epoch": 2.94, "grad_norm": 15.075337409973145, "learning_rate": 1.0222222222222223e-06, "loss": 0.0463, "step": 5510 }, { "epoch": 2.94, "grad_norm": 7.638895511627197, "learning_rate": 9.333333333333334e-07, "loss": 0.0505, "step": 5520 }, { "epoch": 2.95, "grad_norm": 21.062416076660156, "learning_rate": 8.444444444444444e-07, "loss": 0.0888, "step": 5530 }, { "epoch": 2.95, "grad_norm": 11.800668716430664, "learning_rate": 7.555555555555556e-07, "loss": 0.0678, "step": 5540 }, { "epoch": 2.96, "grad_norm": 3.7984507083892822, "learning_rate": 6.666666666666667e-07, "loss": 0.0496, "step": 5550 }, { "epoch": 2.97, "grad_norm": 13.194000244140625, "learning_rate": 5.777777777777778e-07, "loss": 0.0778, "step": 5560 }, { "epoch": 2.97, "grad_norm": 4.690216064453125, "learning_rate": 4.888888888888889e-07, "loss": 0.0592, "step": 5570 }, { "epoch": 2.98, "grad_norm": 0.8871315121650696, "learning_rate": 4.0000000000000003e-07, "loss": 0.0297, "step": 5580 }, { "epoch": 2.98, "grad_norm": 9.130186080932617, "learning_rate": 3.111111111111111e-07, "loss": 0.0527, "step": 5590 }, { "epoch": 2.99, "grad_norm": 1.8399721384048462, "learning_rate": 2.2222222222222224e-07, "loss": 0.03, "step": 5600 }, { "epoch": 2.99, "eval_accuracy": 0.8831, "eval_f1_macro": 0.8171086031295777, "eval_f1_micro": 0.8831, "eval_loss": 0.5688170790672302, "eval_runtime": 151.3507, "eval_samples_per_second": 66.072, "eval_steps_per_second": 2.068, "step": 5600 }, { "epoch": 2.99, "grad_norm": 6.7230143547058105, "learning_rate": 1.3333333333333334e-07, "loss": 0.078, "step": 5610 }, { "epoch": 3.0, "grad_norm": 5.4988532066345215, "learning_rate": 4.444444444444445e-08, "loss": 0.0337, "step": 5620 }, { "epoch": 3.0, "step": 5625, "total_flos": 1.126917998641152e+18, "train_loss": 0.4508466554853651, "train_runtime": 15613.1858, "train_samples_per_second": 11.529, "train_steps_per_second": 0.36 } ], "logging_steps": 10, "max_steps": 5625, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 1.126917998641152e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }