{ "best_metric": 0.15021921694278717, "best_model_checkpoint": "ViT_Flower102_2/checkpoint-1600", "epoch": 10.0, "eval_steps": 100, "global_step": 4490, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 0.0026773272547870874, "learning_rate": 0.00019955456570155904, "loss": 0.0009, "step": 10 }, { "epoch": 0.04, "grad_norm": 0.005304301157593727, "learning_rate": 0.00019910913140311804, "loss": 0.0009, "step": 20 }, { "epoch": 0.07, "grad_norm": 20.306926727294922, "learning_rate": 0.00019866369710467706, "loss": 0.0194, "step": 30 }, { "epoch": 0.09, "grad_norm": 2.321718692779541, "learning_rate": 0.0001982182628062361, "loss": 0.1544, "step": 40 }, { "epoch": 0.11, "grad_norm": 0.021045241504907608, "learning_rate": 0.00019777282850779511, "loss": 0.0701, "step": 50 }, { "epoch": 0.13, "grad_norm": 0.02391223795711994, "learning_rate": 0.00019732739420935414, "loss": 0.1303, "step": 60 }, { "epoch": 0.16, "grad_norm": 0.20434625446796417, "learning_rate": 0.00019688195991091317, "loss": 0.1413, "step": 70 }, { "epoch": 0.18, "grad_norm": 0.2549870014190674, "learning_rate": 0.00019643652561247217, "loss": 0.1047, "step": 80 }, { "epoch": 0.2, "grad_norm": 6.997387886047363, "learning_rate": 0.0001959910913140312, "loss": 0.1234, "step": 90 }, { "epoch": 0.22, "grad_norm": 5.206876277923584, "learning_rate": 0.0001955456570155902, "loss": 0.053, "step": 100 }, { "epoch": 0.22, "eval_accuracy": 0.9235294117647059, "eval_f1": 0.9235294117647059, "eval_loss": 0.3198450803756714, "eval_precision": 0.9235294117647059, "eval_recall": 0.9235294117647059, "eval_runtime": 17.2015, "eval_samples_per_second": 59.297, "eval_steps_per_second": 7.441, "step": 100 }, { "epoch": 0.24, "grad_norm": 6.388282299041748, "learning_rate": 0.00019510022271714922, "loss": 0.1377, "step": 110 }, { "epoch": 0.27, "grad_norm": 5.129618167877197, "learning_rate": 0.00019465478841870825, "loss": 0.0561, "step": 120 }, { "epoch": 0.29, "grad_norm": 8.198286056518555, "learning_rate": 0.00019420935412026727, "loss": 0.0417, "step": 130 }, { "epoch": 0.31, "grad_norm": 7.592113494873047, "learning_rate": 0.0001937639198218263, "loss": 0.0388, "step": 140 }, { "epoch": 0.33, "grad_norm": 2.7412798404693604, "learning_rate": 0.00019331848552338533, "loss": 0.0921, "step": 150 }, { "epoch": 0.36, "grad_norm": 0.12602120637893677, "learning_rate": 0.00019287305122494432, "loss": 0.1905, "step": 160 }, { "epoch": 0.38, "grad_norm": 5.073046684265137, "learning_rate": 0.00019242761692650335, "loss": 0.1331, "step": 170 }, { "epoch": 0.4, "grad_norm": 0.5400639176368713, "learning_rate": 0.00019198218262806238, "loss": 0.0315, "step": 180 }, { "epoch": 0.42, "grad_norm": 11.878341674804688, "learning_rate": 0.00019153674832962138, "loss": 0.2794, "step": 190 }, { "epoch": 0.45, "grad_norm": 0.48547011613845825, "learning_rate": 0.0001910913140311804, "loss": 0.1225, "step": 200 }, { "epoch": 0.45, "eval_accuracy": 0.9166666666666666, "eval_f1": 0.9166666666666666, "eval_loss": 0.40865278244018555, "eval_precision": 0.9166666666666666, "eval_recall": 0.9166666666666666, "eval_runtime": 17.0881, "eval_samples_per_second": 59.691, "eval_steps_per_second": 7.491, "step": 200 }, { "epoch": 0.47, "grad_norm": 0.015034608542919159, "learning_rate": 0.00019064587973273943, "loss": 0.0953, "step": 210 }, { "epoch": 0.49, "grad_norm": 1.665817141532898, "learning_rate": 0.00019020044543429846, "loss": 0.081, "step": 220 }, { "epoch": 0.51, "grad_norm": 1.737998127937317, "learning_rate": 0.00018975501113585748, "loss": 0.037, "step": 230 }, { "epoch": 0.53, "grad_norm": 0.08088938146829605, "learning_rate": 0.00018930957683741648, "loss": 0.2113, "step": 240 }, { "epoch": 0.56, "grad_norm": 6.148426055908203, "learning_rate": 0.0001888641425389755, "loss": 0.0674, "step": 250 }, { "epoch": 0.58, "grad_norm": 0.07585727423429489, "learning_rate": 0.00018841870824053454, "loss": 0.1698, "step": 260 }, { "epoch": 0.6, "grad_norm": 4.297791481018066, "learning_rate": 0.00018797327394209353, "loss": 0.1087, "step": 270 }, { "epoch": 0.62, "grad_norm": 0.039582639932632446, "learning_rate": 0.00018752783964365256, "loss": 0.082, "step": 280 }, { "epoch": 0.65, "grad_norm": 4.647770881652832, "learning_rate": 0.0001870824053452116, "loss": 0.0256, "step": 290 }, { "epoch": 0.67, "grad_norm": 0.05055054649710655, "learning_rate": 0.00018663697104677061, "loss": 0.1985, "step": 300 }, { "epoch": 0.67, "eval_accuracy": 0.9568627450980393, "eval_f1": 0.9568627450980393, "eval_loss": 0.20681221783161163, "eval_precision": 0.9568627450980393, "eval_recall": 0.9568627450980393, "eval_runtime": 17.1131, "eval_samples_per_second": 59.603, "eval_steps_per_second": 7.48, "step": 300 }, { "epoch": 0.69, "grad_norm": 12.765353202819824, "learning_rate": 0.00018619153674832964, "loss": 0.0698, "step": 310 }, { "epoch": 0.71, "grad_norm": 9.829974174499512, "learning_rate": 0.00018574610244988867, "loss": 0.076, "step": 320 }, { "epoch": 0.73, "grad_norm": 0.024047628045082092, "learning_rate": 0.00018530066815144767, "loss": 0.0335, "step": 330 }, { "epoch": 0.76, "grad_norm": 6.1861066818237305, "learning_rate": 0.0001848552338530067, "loss": 0.1085, "step": 340 }, { "epoch": 0.78, "grad_norm": 1.6630871295928955, "learning_rate": 0.00018440979955456572, "loss": 0.1537, "step": 350 }, { "epoch": 0.8, "grad_norm": 0.012870335020124912, "learning_rate": 0.00018396436525612472, "loss": 0.0729, "step": 360 }, { "epoch": 0.82, "grad_norm": 0.9247069954872131, "learning_rate": 0.00018351893095768375, "loss": 0.0293, "step": 370 }, { "epoch": 0.85, "grad_norm": 0.0049523478373885155, "learning_rate": 0.00018307349665924277, "loss": 0.043, "step": 380 }, { "epoch": 0.87, "grad_norm": 6.6150336265563965, "learning_rate": 0.0001826280623608018, "loss": 0.0184, "step": 390 }, { "epoch": 0.89, "grad_norm": 0.16694368422031403, "learning_rate": 0.00018218262806236082, "loss": 0.0804, "step": 400 }, { "epoch": 0.89, "eval_accuracy": 0.9333333333333333, "eval_f1": 0.9333333333333333, "eval_loss": 0.3181270360946655, "eval_precision": 0.9333333333333333, "eval_recall": 0.9333333333333333, "eval_runtime": 16.8894, "eval_samples_per_second": 60.393, "eval_steps_per_second": 7.579, "step": 400 }, { "epoch": 0.91, "grad_norm": 2.3894243240356445, "learning_rate": 0.00018173719376391982, "loss": 0.0611, "step": 410 }, { "epoch": 0.94, "grad_norm": 0.014911720529198647, "learning_rate": 0.00018129175946547885, "loss": 0.0172, "step": 420 }, { "epoch": 0.96, "grad_norm": 0.01411391980946064, "learning_rate": 0.00018084632516703788, "loss": 0.2164, "step": 430 }, { "epoch": 0.98, "grad_norm": 13.27253532409668, "learning_rate": 0.00018040089086859688, "loss": 0.1459, "step": 440 }, { "epoch": 1.0, "grad_norm": 0.028676768764853477, "learning_rate": 0.0001799554565701559, "loss": 0.0702, "step": 450 }, { "epoch": 1.02, "grad_norm": 4.484796047210693, "learning_rate": 0.00017951002227171493, "loss": 0.1319, "step": 460 }, { "epoch": 1.05, "grad_norm": 5.17644739151001, "learning_rate": 0.00017906458797327396, "loss": 0.1276, "step": 470 }, { "epoch": 1.07, "grad_norm": 4.405980110168457, "learning_rate": 0.00017861915367483298, "loss": 0.1718, "step": 480 }, { "epoch": 1.09, "grad_norm": 0.1352098286151886, "learning_rate": 0.000178173719376392, "loss": 0.1796, "step": 490 }, { "epoch": 1.11, "grad_norm": 8.556909561157227, "learning_rate": 0.000177728285077951, "loss": 0.1672, "step": 500 }, { "epoch": 1.11, "eval_accuracy": 0.9274509803921569, "eval_f1": 0.9274509803921569, "eval_loss": 0.35819730162620544, "eval_precision": 0.9274509803921569, "eval_recall": 0.9274509803921569, "eval_runtime": 16.8086, "eval_samples_per_second": 60.683, "eval_steps_per_second": 7.615, "step": 500 }, { "epoch": 1.14, "grad_norm": 0.4172447919845581, "learning_rate": 0.00017728285077951003, "loss": 0.0686, "step": 510 }, { "epoch": 1.16, "grad_norm": 3.2105464935302734, "learning_rate": 0.00017683741648106903, "loss": 0.0347, "step": 520 }, { "epoch": 1.18, "grad_norm": 0.23049691319465637, "learning_rate": 0.00017639198218262806, "loss": 0.1714, "step": 530 }, { "epoch": 1.2, "grad_norm": 0.008479436859488487, "learning_rate": 0.0001759465478841871, "loss": 0.0519, "step": 540 }, { "epoch": 1.22, "grad_norm": 0.11945914477109909, "learning_rate": 0.00017550111358574611, "loss": 0.0858, "step": 550 }, { "epoch": 1.25, "grad_norm": 1.7262502908706665, "learning_rate": 0.00017505567928730514, "loss": 0.0753, "step": 560 }, { "epoch": 1.27, "grad_norm": 0.02038051374256611, "learning_rate": 0.00017461024498886417, "loss": 0.0571, "step": 570 }, { "epoch": 1.29, "grad_norm": 9.348058700561523, "learning_rate": 0.00017416481069042317, "loss": 0.128, "step": 580 }, { "epoch": 1.31, "grad_norm": 0.039238784462213516, "learning_rate": 0.0001737193763919822, "loss": 0.0551, "step": 590 }, { "epoch": 1.34, "grad_norm": 5.897979736328125, "learning_rate": 0.00017327394209354122, "loss": 0.1287, "step": 600 }, { "epoch": 1.34, "eval_accuracy": 0.9450980392156862, "eval_f1": 0.9450980392156862, "eval_loss": 0.27001550793647766, "eval_precision": 0.9450980392156862, "eval_recall": 0.9450980392156862, "eval_runtime": 17.21, "eval_samples_per_second": 59.268, "eval_steps_per_second": 7.438, "step": 600 }, { "epoch": 1.36, "grad_norm": 5.809938907623291, "learning_rate": 0.00017282850779510022, "loss": 0.0894, "step": 610 }, { "epoch": 1.38, "grad_norm": 0.013608383946120739, "learning_rate": 0.00017238307349665924, "loss": 0.0312, "step": 620 }, { "epoch": 1.4, "grad_norm": 0.14918509125709534, "learning_rate": 0.00017193763919821827, "loss": 0.0813, "step": 630 }, { "epoch": 1.43, "grad_norm": 0.09317727386951447, "learning_rate": 0.0001714922048997773, "loss": 0.1021, "step": 640 }, { "epoch": 1.45, "grad_norm": 0.12424630671739578, "learning_rate": 0.00017104677060133632, "loss": 0.0382, "step": 650 }, { "epoch": 1.47, "grad_norm": 0.004964092746376991, "learning_rate": 0.00017060133630289532, "loss": 0.0729, "step": 660 }, { "epoch": 1.49, "grad_norm": 3.553861379623413, "learning_rate": 0.00017015590200445435, "loss": 0.0475, "step": 670 }, { "epoch": 1.51, "grad_norm": 0.0814567431807518, "learning_rate": 0.00016971046770601338, "loss": 0.0424, "step": 680 }, { "epoch": 1.54, "grad_norm": 0.5184776186943054, "learning_rate": 0.00016926503340757238, "loss": 0.0182, "step": 690 }, { "epoch": 1.56, "grad_norm": 0.0049703894183039665, "learning_rate": 0.0001688195991091314, "loss": 0.0147, "step": 700 }, { "epoch": 1.56, "eval_accuracy": 0.9205882352941176, "eval_f1": 0.9205882352941176, "eval_loss": 0.369125634431839, "eval_precision": 0.9205882352941176, "eval_recall": 0.9205882352941176, "eval_runtime": 16.9323, "eval_samples_per_second": 60.24, "eval_steps_per_second": 7.56, "step": 700 }, { "epoch": 1.58, "grad_norm": 0.021510396152734756, "learning_rate": 0.00016837416481069043, "loss": 0.0821, "step": 710 }, { "epoch": 1.6, "grad_norm": 0.06858960539102554, "learning_rate": 0.00016792873051224946, "loss": 0.0053, "step": 720 }, { "epoch": 1.63, "grad_norm": 0.00354547961615026, "learning_rate": 0.00016748329621380848, "loss": 0.027, "step": 730 }, { "epoch": 1.65, "grad_norm": 0.0027330678422003984, "learning_rate": 0.0001670378619153675, "loss": 0.0038, "step": 740 }, { "epoch": 1.67, "grad_norm": 0.0024623360950499773, "learning_rate": 0.0001665924276169265, "loss": 0.0435, "step": 750 }, { "epoch": 1.69, "grad_norm": 0.006295201368629932, "learning_rate": 0.00016614699331848553, "loss": 0.063, "step": 760 }, { "epoch": 1.71, "grad_norm": 0.8971105813980103, "learning_rate": 0.00016570155902004456, "loss": 0.0625, "step": 770 }, { "epoch": 1.74, "grad_norm": 0.004554128274321556, "learning_rate": 0.00016525612472160356, "loss": 0.004, "step": 780 }, { "epoch": 1.76, "grad_norm": 1.1439096927642822, "learning_rate": 0.0001648106904231626, "loss": 0.0959, "step": 790 }, { "epoch": 1.78, "grad_norm": 0.12275100499391556, "learning_rate": 0.0001643652561247216, "loss": 0.0416, "step": 800 }, { "epoch": 1.78, "eval_accuracy": 0.9470588235294117, "eval_f1": 0.9470588235294117, "eval_loss": 0.25350436568260193, "eval_precision": 0.9470588235294117, "eval_recall": 0.9470588235294117, "eval_runtime": 16.7158, "eval_samples_per_second": 61.02, "eval_steps_per_second": 7.657, "step": 800 }, { "epoch": 1.8, "grad_norm": 0.0064841569401323795, "learning_rate": 0.00016391982182628064, "loss": 0.0479, "step": 810 }, { "epoch": 1.83, "grad_norm": 0.006001554429531097, "learning_rate": 0.00016347438752783967, "loss": 0.0624, "step": 820 }, { "epoch": 1.85, "grad_norm": 0.1525709182024002, "learning_rate": 0.00016302895322939867, "loss": 0.0855, "step": 830 }, { "epoch": 1.87, "grad_norm": 0.017199277877807617, "learning_rate": 0.0001625835189309577, "loss": 0.0199, "step": 840 }, { "epoch": 1.89, "grad_norm": 0.40461423993110657, "learning_rate": 0.00016213808463251672, "loss": 0.0613, "step": 850 }, { "epoch": 1.92, "grad_norm": 0.006222237832844257, "learning_rate": 0.00016169265033407572, "loss": 0.0168, "step": 860 }, { "epoch": 1.94, "grad_norm": 0.0056493207812309265, "learning_rate": 0.00016124721603563474, "loss": 0.0247, "step": 870 }, { "epoch": 1.96, "grad_norm": 0.004911198280751705, "learning_rate": 0.00016080178173719377, "loss": 0.074, "step": 880 }, { "epoch": 1.98, "grad_norm": 0.0023081935942173004, "learning_rate": 0.0001603563474387528, "loss": 0.1029, "step": 890 }, { "epoch": 2.0, "grad_norm": 0.010029925964772701, "learning_rate": 0.00015991091314031182, "loss": 0.0211, "step": 900 }, { "epoch": 2.0, "eval_accuracy": 0.9470588235294117, "eval_f1": 0.9470588235294117, "eval_loss": 0.25747954845428467, "eval_precision": 0.9470588235294117, "eval_recall": 0.9470588235294117, "eval_runtime": 16.8113, "eval_samples_per_second": 60.674, "eval_steps_per_second": 7.614, "step": 900 }, { "epoch": 2.03, "grad_norm": 0.003727799979969859, "learning_rate": 0.00015946547884187085, "loss": 0.0126, "step": 910 }, { "epoch": 2.05, "grad_norm": 0.7938788533210754, "learning_rate": 0.00015902004454342985, "loss": 0.0702, "step": 920 }, { "epoch": 2.07, "grad_norm": 0.48882561922073364, "learning_rate": 0.00015857461024498888, "loss": 0.0513, "step": 930 }, { "epoch": 2.09, "grad_norm": 10.071187973022461, "learning_rate": 0.0001581291759465479, "loss": 0.109, "step": 940 }, { "epoch": 2.12, "grad_norm": 0.009675947949290276, "learning_rate": 0.0001576837416481069, "loss": 0.0034, "step": 950 }, { "epoch": 2.14, "grad_norm": 0.004574100486934185, "learning_rate": 0.00015723830734966593, "loss": 0.0018, "step": 960 }, { "epoch": 2.16, "grad_norm": 0.0033382533583790064, "learning_rate": 0.00015679287305122495, "loss": 0.0719, "step": 970 }, { "epoch": 2.18, "grad_norm": 4.464893341064453, "learning_rate": 0.00015634743875278398, "loss": 0.0099, "step": 980 }, { "epoch": 2.2, "grad_norm": 0.004051242955029011, "learning_rate": 0.000155902004454343, "loss": 0.0846, "step": 990 }, { "epoch": 2.23, "grad_norm": 0.047729793936014175, "learning_rate": 0.000155456570155902, "loss": 0.088, "step": 1000 }, { "epoch": 2.23, "eval_accuracy": 0.9529411764705882, "eval_f1": 0.9529411764705882, "eval_loss": 0.19075074791908264, "eval_precision": 0.9529411764705882, "eval_recall": 0.9529411764705882, "eval_runtime": 16.8471, "eval_samples_per_second": 60.544, "eval_steps_per_second": 7.598, "step": 1000 }, { "epoch": 2.25, "grad_norm": 0.008470825850963593, "learning_rate": 0.00015501113585746103, "loss": 0.0022, "step": 1010 }, { "epoch": 2.27, "grad_norm": 2.529259204864502, "learning_rate": 0.00015456570155902006, "loss": 0.0917, "step": 1020 }, { "epoch": 2.29, "grad_norm": 0.3583894670009613, "learning_rate": 0.00015412026726057906, "loss": 0.0793, "step": 1030 }, { "epoch": 2.32, "grad_norm": 0.039421286433935165, "learning_rate": 0.00015367483296213809, "loss": 0.0742, "step": 1040 }, { "epoch": 2.34, "grad_norm": 0.2551974654197693, "learning_rate": 0.0001532293986636971, "loss": 0.0029, "step": 1050 }, { "epoch": 2.36, "grad_norm": 0.3370533883571625, "learning_rate": 0.00015278396436525614, "loss": 0.0563, "step": 1060 }, { "epoch": 2.38, "grad_norm": 1.642697811126709, "learning_rate": 0.00015233853006681517, "loss": 0.0362, "step": 1070 }, { "epoch": 2.41, "grad_norm": 12.125362396240234, "learning_rate": 0.0001518930957683742, "loss": 0.0949, "step": 1080 }, { "epoch": 2.43, "grad_norm": 0.0022143302485346794, "learning_rate": 0.0001514476614699332, "loss": 0.0219, "step": 1090 }, { "epoch": 2.45, "grad_norm": 0.0026613217778503895, "learning_rate": 0.00015100222717149222, "loss": 0.1849, "step": 1100 }, { "epoch": 2.45, "eval_accuracy": 0.9529411764705882, "eval_f1": 0.9529411764705882, "eval_loss": 0.2200697511434555, "eval_precision": 0.9529411764705882, "eval_recall": 0.9529411764705882, "eval_runtime": 16.9063, "eval_samples_per_second": 60.332, "eval_steps_per_second": 7.571, "step": 1100 }, { "epoch": 2.47, "grad_norm": 0.5232903957366943, "learning_rate": 0.00015055679287305122, "loss": 0.0013, "step": 1110 }, { "epoch": 2.49, "grad_norm": 1.5729717016220093, "learning_rate": 0.00015011135857461024, "loss": 0.0433, "step": 1120 }, { "epoch": 2.52, "grad_norm": 0.005640827585011721, "learning_rate": 0.00014966592427616927, "loss": 0.0007, "step": 1130 }, { "epoch": 2.54, "grad_norm": 7.901826858520508, "learning_rate": 0.0001492204899777283, "loss": 0.1131, "step": 1140 }, { "epoch": 2.56, "grad_norm": 0.4438769221305847, "learning_rate": 0.00014877505567928732, "loss": 0.0557, "step": 1150 }, { "epoch": 2.58, "grad_norm": 0.0032986474689096212, "learning_rate": 0.00014832962138084635, "loss": 0.001, "step": 1160 }, { "epoch": 2.61, "grad_norm": 0.09250050783157349, "learning_rate": 0.00014788418708240535, "loss": 0.02, "step": 1170 }, { "epoch": 2.63, "grad_norm": 0.005981163587421179, "learning_rate": 0.00014743875278396438, "loss": 0.0147, "step": 1180 }, { "epoch": 2.65, "grad_norm": 0.02270863950252533, "learning_rate": 0.0001469933184855234, "loss": 0.0246, "step": 1190 }, { "epoch": 2.67, "grad_norm": 0.014892498031258583, "learning_rate": 0.0001465478841870824, "loss": 0.0009, "step": 1200 }, { "epoch": 2.67, "eval_accuracy": 0.9549019607843138, "eval_f1": 0.9549019607843138, "eval_loss": 0.22289611399173737, "eval_precision": 0.9549019607843138, "eval_recall": 0.9549019607843138, "eval_runtime": 16.9051, "eval_samples_per_second": 60.337, "eval_steps_per_second": 7.572, "step": 1200 }, { "epoch": 2.69, "grad_norm": 10.428900718688965, "learning_rate": 0.00014610244988864143, "loss": 0.0115, "step": 1210 }, { "epoch": 2.72, "grad_norm": 0.0370117723941803, "learning_rate": 0.00014565701559020045, "loss": 0.0525, "step": 1220 }, { "epoch": 2.74, "grad_norm": 0.0029497628565877676, "learning_rate": 0.00014521158129175948, "loss": 0.0009, "step": 1230 }, { "epoch": 2.76, "grad_norm": 0.8202322125434875, "learning_rate": 0.0001447661469933185, "loss": 0.051, "step": 1240 }, { "epoch": 2.78, "grad_norm": 0.012352533638477325, "learning_rate": 0.0001443207126948775, "loss": 0.0034, "step": 1250 }, { "epoch": 2.81, "grad_norm": 0.5958288908004761, "learning_rate": 0.00014387527839643653, "loss": 0.0016, "step": 1260 }, { "epoch": 2.83, "grad_norm": 0.01864822395145893, "learning_rate": 0.00014342984409799556, "loss": 0.0113, "step": 1270 }, { "epoch": 2.85, "grad_norm": 0.053810037672519684, "learning_rate": 0.00014298440979955456, "loss": 0.0044, "step": 1280 }, { "epoch": 2.87, "grad_norm": 0.07594209164381027, "learning_rate": 0.00014253897550111359, "loss": 0.0022, "step": 1290 }, { "epoch": 2.9, "grad_norm": 0.003192998468875885, "learning_rate": 0.0001420935412026726, "loss": 0.0599, "step": 1300 }, { "epoch": 2.9, "eval_accuracy": 0.9607843137254902, "eval_f1": 0.9607843137254902, "eval_loss": 0.1780730038881302, "eval_precision": 0.9607843137254902, "eval_recall": 0.9607843137254902, "eval_runtime": 16.8581, "eval_samples_per_second": 60.505, "eval_steps_per_second": 7.593, "step": 1300 }, { "epoch": 2.92, "grad_norm": 0.0019150603329762816, "learning_rate": 0.00014164810690423164, "loss": 0.0136, "step": 1310 }, { "epoch": 2.94, "grad_norm": 0.0017021102830767632, "learning_rate": 0.00014120267260579067, "loss": 0.0004, "step": 1320 }, { "epoch": 2.96, "grad_norm": 0.002855106256902218, "learning_rate": 0.0001407572383073497, "loss": 0.0028, "step": 1330 }, { "epoch": 2.98, "grad_norm": 0.0011817626655101776, "learning_rate": 0.0001403118040089087, "loss": 0.0146, "step": 1340 }, { "epoch": 3.01, "grad_norm": 11.652885437011719, "learning_rate": 0.00013986636971046772, "loss": 0.0155, "step": 1350 }, { "epoch": 3.03, "grad_norm": 0.002146989107131958, "learning_rate": 0.00013942093541202674, "loss": 0.007, "step": 1360 }, { "epoch": 3.05, "grad_norm": 0.0012873058440163732, "learning_rate": 0.00013897550111358574, "loss": 0.0005, "step": 1370 }, { "epoch": 3.07, "grad_norm": 0.001976664876565337, "learning_rate": 0.00013853006681514477, "loss": 0.0067, "step": 1380 }, { "epoch": 3.1, "grad_norm": 0.004611461888998747, "learning_rate": 0.00013808463251670377, "loss": 0.068, "step": 1390 }, { "epoch": 3.12, "grad_norm": 0.05266120657324791, "learning_rate": 0.00013763919821826282, "loss": 0.0004, "step": 1400 }, { "epoch": 3.12, "eval_accuracy": 0.9666666666666667, "eval_f1": 0.9666666666666667, "eval_loss": 0.1750936210155487, "eval_precision": 0.9666666666666667, "eval_recall": 0.9666666666666667, "eval_runtime": 16.8283, "eval_samples_per_second": 60.612, "eval_steps_per_second": 7.606, "step": 1400 }, { "epoch": 3.14, "grad_norm": 0.005359290167689323, "learning_rate": 0.00013719376391982185, "loss": 0.038, "step": 1410 }, { "epoch": 3.16, "grad_norm": 0.0019557911437004805, "learning_rate": 0.00013674832962138085, "loss": 0.0006, "step": 1420 }, { "epoch": 3.18, "grad_norm": 0.002684570848941803, "learning_rate": 0.00013630289532293988, "loss": 0.0179, "step": 1430 }, { "epoch": 3.21, "grad_norm": 2.459765911102295, "learning_rate": 0.0001358574610244989, "loss": 0.0164, "step": 1440 }, { "epoch": 3.23, "grad_norm": 0.010600595735013485, "learning_rate": 0.0001354120267260579, "loss": 0.0008, "step": 1450 }, { "epoch": 3.25, "grad_norm": 0.12665680050849915, "learning_rate": 0.00013496659242761693, "loss": 0.1161, "step": 1460 }, { "epoch": 3.27, "grad_norm": 0.0013257049722597003, "learning_rate": 0.00013452115812917595, "loss": 0.0718, "step": 1470 }, { "epoch": 3.3, "grad_norm": 2.888887882232666, "learning_rate": 0.00013407572383073498, "loss": 0.0028, "step": 1480 }, { "epoch": 3.32, "grad_norm": 0.0022232867777347565, "learning_rate": 0.000133630289532294, "loss": 0.0009, "step": 1490 }, { "epoch": 3.34, "grad_norm": 0.0026971769984811544, "learning_rate": 0.00013318485523385303, "loss": 0.0004, "step": 1500 }, { "epoch": 3.34, "eval_accuracy": 0.9686274509803922, "eval_f1": 0.9686274509803922, "eval_loss": 0.1684277504682541, "eval_precision": 0.9686274509803922, "eval_recall": 0.9686274509803922, "eval_runtime": 16.8277, "eval_samples_per_second": 60.614, "eval_steps_per_second": 7.606, "step": 1500 }, { "epoch": 3.36, "grad_norm": 0.06871479004621506, "learning_rate": 0.00013273942093541203, "loss": 0.0004, "step": 1510 }, { "epoch": 3.39, "grad_norm": 0.010680126026272774, "learning_rate": 0.00013229398663697106, "loss": 0.0623, "step": 1520 }, { "epoch": 3.41, "grad_norm": 0.0024642229545861483, "learning_rate": 0.00013184855233853006, "loss": 0.0005, "step": 1530 }, { "epoch": 3.43, "grad_norm": 0.0014489213936030865, "learning_rate": 0.00013140311804008909, "loss": 0.0004, "step": 1540 }, { "epoch": 3.45, "grad_norm": 0.004346741829067469, "learning_rate": 0.0001309576837416481, "loss": 0.0025, "step": 1550 }, { "epoch": 3.47, "grad_norm": 0.006028163246810436, "learning_rate": 0.0001305122494432071, "loss": 0.0546, "step": 1560 }, { "epoch": 3.5, "grad_norm": 0.0034053712151944637, "learning_rate": 0.00013006681514476616, "loss": 0.028, "step": 1570 }, { "epoch": 3.52, "grad_norm": 0.004035326186567545, "learning_rate": 0.0001296213808463252, "loss": 0.0042, "step": 1580 }, { "epoch": 3.54, "grad_norm": 0.0025597705971449614, "learning_rate": 0.0001291759465478842, "loss": 0.0201, "step": 1590 }, { "epoch": 3.56, "grad_norm": 0.0010411799885332584, "learning_rate": 0.00012873051224944322, "loss": 0.0352, "step": 1600 }, { "epoch": 3.56, "eval_accuracy": 0.9754901960784313, "eval_f1": 0.9754901960784313, "eval_loss": 0.15021921694278717, "eval_precision": 0.9754901960784313, "eval_recall": 0.9754901960784313, "eval_runtime": 16.7579, "eval_samples_per_second": 60.867, "eval_steps_per_second": 7.638, "step": 1600 }, { "epoch": 3.59, "grad_norm": 0.04698515310883522, "learning_rate": 0.00012828507795100224, "loss": 0.0036, "step": 1610 }, { "epoch": 3.61, "grad_norm": 0.013074472546577454, "learning_rate": 0.00012783964365256124, "loss": 0.0626, "step": 1620 }, { "epoch": 3.63, "grad_norm": 0.002302026841789484, "learning_rate": 0.00012739420935412027, "loss": 0.0004, "step": 1630 }, { "epoch": 3.65, "grad_norm": 0.012451753951609135, "learning_rate": 0.0001269487750556793, "loss": 0.0418, "step": 1640 }, { "epoch": 3.67, "grad_norm": 0.010159431956708431, "learning_rate": 0.00012650334075723832, "loss": 0.0008, "step": 1650 }, { "epoch": 3.7, "grad_norm": 0.20833130180835724, "learning_rate": 0.00012605790645879735, "loss": 0.0006, "step": 1660 }, { "epoch": 3.72, "grad_norm": 0.0008345023961737752, "learning_rate": 0.00012561247216035635, "loss": 0.0013, "step": 1670 }, { "epoch": 3.74, "grad_norm": 0.0027376762591302395, "learning_rate": 0.00012516703786191537, "loss": 0.0003, "step": 1680 }, { "epoch": 3.76, "grad_norm": 0.0012037215055897832, "learning_rate": 0.0001247216035634744, "loss": 0.0003, "step": 1690 }, { "epoch": 3.79, "grad_norm": 0.02287732996046543, "learning_rate": 0.0001242761692650334, "loss": 0.0003, "step": 1700 }, { "epoch": 3.79, "eval_accuracy": 0.9745098039215686, "eval_f1": 0.9745098039215686, "eval_loss": 0.15970657765865326, "eval_precision": 0.9745098039215686, "eval_recall": 0.9745098039215686, "eval_runtime": 16.8382, "eval_samples_per_second": 60.576, "eval_steps_per_second": 7.602, "step": 1700 }, { "epoch": 3.81, "grad_norm": 0.0013968138955533504, "learning_rate": 0.00012383073496659243, "loss": 0.0005, "step": 1710 }, { "epoch": 3.83, "grad_norm": 0.003733535995706916, "learning_rate": 0.00012338530066815145, "loss": 0.0003, "step": 1720 }, { "epoch": 3.85, "grad_norm": 0.976151168346405, "learning_rate": 0.00012293986636971045, "loss": 0.001, "step": 1730 }, { "epoch": 3.88, "grad_norm": 0.003409826662391424, "learning_rate": 0.0001224944320712695, "loss": 0.0003, "step": 1740 }, { "epoch": 3.9, "grad_norm": 0.004897921811789274, "learning_rate": 0.00012204899777282852, "loss": 0.0013, "step": 1750 }, { "epoch": 3.92, "grad_norm": 0.0010848743841052055, "learning_rate": 0.00012160356347438753, "loss": 0.0003, "step": 1760 }, { "epoch": 3.94, "grad_norm": 0.0013221738627180457, "learning_rate": 0.00012115812917594656, "loss": 0.0002, "step": 1770 }, { "epoch": 3.96, "grad_norm": 0.09072667360305786, "learning_rate": 0.00012071269487750559, "loss": 0.0029, "step": 1780 }, { "epoch": 3.99, "grad_norm": 0.00137105374597013, "learning_rate": 0.00012026726057906458, "loss": 0.0007, "step": 1790 }, { "epoch": 4.01, "grad_norm": 0.004980257712304592, "learning_rate": 0.00011982182628062361, "loss": 0.0003, "step": 1800 }, { "epoch": 4.01, "eval_accuracy": 0.9558823529411765, "eval_f1": 0.9558823529411765, "eval_loss": 0.25734347105026245, "eval_precision": 0.9558823529411765, "eval_recall": 0.9558823529411765, "eval_runtime": 16.754, "eval_samples_per_second": 60.881, "eval_steps_per_second": 7.64, "step": 1800 }, { "epoch": 4.03, "grad_norm": 0.0013974602334201336, "learning_rate": 0.00011937639198218265, "loss": 0.0015, "step": 1810 }, { "epoch": 4.05, "grad_norm": 0.0027338312938809395, "learning_rate": 0.00011893095768374165, "loss": 0.0028, "step": 1820 }, { "epoch": 4.08, "grad_norm": 0.0018153024138882756, "learning_rate": 0.00011848552338530068, "loss": 0.0003, "step": 1830 }, { "epoch": 4.1, "grad_norm": 0.0008053297642618418, "learning_rate": 0.00011804008908685969, "loss": 0.0145, "step": 1840 }, { "epoch": 4.12, "grad_norm": 0.0016097394982352853, "learning_rate": 0.00011759465478841872, "loss": 0.0003, "step": 1850 }, { "epoch": 4.14, "grad_norm": 0.0011555146193131804, "learning_rate": 0.00011714922048997774, "loss": 0.0007, "step": 1860 }, { "epoch": 4.16, "grad_norm": 0.0013847779482603073, "learning_rate": 0.00011670378619153674, "loss": 0.0005, "step": 1870 }, { "epoch": 4.19, "grad_norm": 0.026971347630023956, "learning_rate": 0.00011625835189309577, "loss": 0.04, "step": 1880 }, { "epoch": 4.21, "grad_norm": 0.004224107600748539, "learning_rate": 0.0001158129175946548, "loss": 0.0004, "step": 1890 }, { "epoch": 4.23, "grad_norm": 0.008265385404229164, "learning_rate": 0.00011536748329621381, "loss": 0.0005, "step": 1900 }, { "epoch": 4.23, "eval_accuracy": 0.9666666666666667, "eval_f1": 0.9666666666666667, "eval_loss": 0.19066497683525085, "eval_precision": 0.9666666666666667, "eval_recall": 0.9666666666666667, "eval_runtime": 16.876, "eval_samples_per_second": 60.441, "eval_steps_per_second": 7.585, "step": 1900 }, { "epoch": 4.25, "grad_norm": 0.0009556623990647495, "learning_rate": 0.00011492204899777283, "loss": 0.0103, "step": 1910 }, { "epoch": 4.28, "grad_norm": 0.0008651684038341045, "learning_rate": 0.00011447661469933186, "loss": 0.0003, "step": 1920 }, { "epoch": 4.3, "grad_norm": 0.0021915507968515158, "learning_rate": 0.00011403118040089087, "loss": 0.0017, "step": 1930 }, { "epoch": 4.32, "grad_norm": 0.993601381778717, "learning_rate": 0.0001135857461024499, "loss": 0.0016, "step": 1940 }, { "epoch": 4.34, "grad_norm": 0.012279433198273182, "learning_rate": 0.00011314031180400893, "loss": 0.0004, "step": 1950 }, { "epoch": 4.37, "grad_norm": 0.0192144475877285, "learning_rate": 0.00011269487750556793, "loss": 0.0211, "step": 1960 }, { "epoch": 4.39, "grad_norm": 0.002850558841601014, "learning_rate": 0.00011224944320712695, "loss": 0.0011, "step": 1970 }, { "epoch": 4.41, "grad_norm": 0.016757028177380562, "learning_rate": 0.00011180400890868597, "loss": 0.0004, "step": 1980 }, { "epoch": 4.43, "grad_norm": 0.009729539044201374, "learning_rate": 0.00011135857461024499, "loss": 0.0005, "step": 1990 }, { "epoch": 4.45, "grad_norm": 0.0012516066199168563, "learning_rate": 0.00011091314031180402, "loss": 0.0741, "step": 2000 }, { "epoch": 4.45, "eval_accuracy": 0.9637254901960784, "eval_f1": 0.9637254901960784, "eval_loss": 0.20377103984355927, "eval_precision": 0.9637254901960784, "eval_recall": 0.9637254901960784, "eval_runtime": 16.9265, "eval_samples_per_second": 60.26, "eval_steps_per_second": 7.562, "step": 2000 }, { "epoch": 4.48, "grad_norm": 0.03771669417619705, "learning_rate": 0.00011046770601336303, "loss": 0.0005, "step": 2010 }, { "epoch": 4.5, "grad_norm": 0.0029582425486296415, "learning_rate": 0.00011002227171492206, "loss": 0.044, "step": 2020 }, { "epoch": 4.52, "grad_norm": 0.0017603106098249555, "learning_rate": 0.00010957683741648108, "loss": 0.0002, "step": 2030 }, { "epoch": 4.54, "grad_norm": 0.006093455944210291, "learning_rate": 0.00010913140311804008, "loss": 0.0004, "step": 2040 }, { "epoch": 4.57, "grad_norm": 0.024271611124277115, "learning_rate": 0.00010868596881959911, "loss": 0.0004, "step": 2050 }, { "epoch": 4.59, "grad_norm": 0.0059431749396026134, "learning_rate": 0.00010824053452115814, "loss": 0.0171, "step": 2060 }, { "epoch": 4.61, "grad_norm": 0.001350950333289802, "learning_rate": 0.00010779510022271715, "loss": 0.0169, "step": 2070 }, { "epoch": 4.63, "grad_norm": 0.11208397895097733, "learning_rate": 0.00010734966592427618, "loss": 0.0009, "step": 2080 }, { "epoch": 4.65, "grad_norm": 0.000902643718291074, "learning_rate": 0.0001069042316258352, "loss": 0.0002, "step": 2090 }, { "epoch": 4.68, "grad_norm": 0.0008040807442739606, "learning_rate": 0.00010645879732739422, "loss": 0.0025, "step": 2100 }, { "epoch": 4.68, "eval_accuracy": 0.9647058823529412, "eval_f1": 0.9647058823529412, "eval_loss": 0.1929028332233429, "eval_precision": 0.9647058823529412, "eval_recall": 0.9647058823529412, "eval_runtime": 17.2457, "eval_samples_per_second": 59.145, "eval_steps_per_second": 7.422, "step": 2100 }, { "epoch": 4.7, "grad_norm": 0.001127121620811522, "learning_rate": 0.00010601336302895324, "loss": 0.0025, "step": 2110 }, { "epoch": 4.72, "grad_norm": 0.0010411780094727874, "learning_rate": 0.00010556792873051224, "loss": 0.0002, "step": 2120 }, { "epoch": 4.74, "grad_norm": 0.0012262547388672829, "learning_rate": 0.00010512249443207127, "loss": 0.0031, "step": 2130 }, { "epoch": 4.77, "grad_norm": 0.06668848544359207, "learning_rate": 0.0001046770601336303, "loss": 0.0011, "step": 2140 }, { "epoch": 4.79, "grad_norm": 11.083710670471191, "learning_rate": 0.00010423162583518931, "loss": 0.1101, "step": 2150 }, { "epoch": 4.81, "grad_norm": 0.0041348133236169815, "learning_rate": 0.00010378619153674833, "loss": 0.0312, "step": 2160 }, { "epoch": 4.83, "grad_norm": 0.0014749247347936034, "learning_rate": 0.00010334075723830736, "loss": 0.002, "step": 2170 }, { "epoch": 4.86, "grad_norm": 0.00816721748560667, "learning_rate": 0.00010289532293986637, "loss": 0.0374, "step": 2180 }, { "epoch": 4.88, "grad_norm": 0.0029881075024604797, "learning_rate": 0.0001024498886414254, "loss": 0.0245, "step": 2190 }, { "epoch": 4.9, "grad_norm": 0.01441921480000019, "learning_rate": 0.00010200445434298443, "loss": 0.0293, "step": 2200 }, { "epoch": 4.9, "eval_accuracy": 0.9607843137254902, "eval_f1": 0.9607843137254902, "eval_loss": 0.17395375669002533, "eval_precision": 0.9607843137254902, "eval_recall": 0.9607843137254902, "eval_runtime": 17.1376, "eval_samples_per_second": 59.518, "eval_steps_per_second": 7.469, "step": 2200 }, { "epoch": 4.92, "grad_norm": 0.005346087273210287, "learning_rate": 0.00010155902004454343, "loss": 0.054, "step": 2210 }, { "epoch": 4.94, "grad_norm": 0.0087255435064435, "learning_rate": 0.00010111358574610245, "loss": 0.0008, "step": 2220 }, { "epoch": 4.97, "grad_norm": 0.0033257934264838696, "learning_rate": 0.00010066815144766148, "loss": 0.0034, "step": 2230 }, { "epoch": 4.99, "grad_norm": 0.0017741642659530044, "learning_rate": 0.00010022271714922049, "loss": 0.0008, "step": 2240 }, { "epoch": 5.01, "grad_norm": 0.03513794392347336, "learning_rate": 9.977728285077952e-05, "loss": 0.0064, "step": 2250 }, { "epoch": 5.03, "grad_norm": 0.0020874643232673407, "learning_rate": 9.933184855233853e-05, "loss": 0.0003, "step": 2260 }, { "epoch": 5.06, "grad_norm": 0.0035891502629965544, "learning_rate": 9.888641425389756e-05, "loss": 0.0489, "step": 2270 }, { "epoch": 5.08, "grad_norm": 0.001030069775879383, "learning_rate": 9.844097995545658e-05, "loss": 0.0011, "step": 2280 }, { "epoch": 5.1, "grad_norm": 0.053950581699609756, "learning_rate": 9.79955456570156e-05, "loss": 0.0016, "step": 2290 }, { "epoch": 5.12, "grad_norm": 0.0023863562382757664, "learning_rate": 9.755011135857461e-05, "loss": 0.0003, "step": 2300 }, { "epoch": 5.12, "eval_accuracy": 0.9568627450980393, "eval_f1": 0.9568627450980393, "eval_loss": 0.25984036922454834, "eval_precision": 0.9568627450980393, "eval_recall": 0.9568627450980393, "eval_runtime": 17.3202, "eval_samples_per_second": 58.891, "eval_steps_per_second": 7.39, "step": 2300 }, { "epoch": 5.14, "grad_norm": 0.054378170520067215, "learning_rate": 9.710467706013364e-05, "loss": 0.0005, "step": 2310 }, { "epoch": 5.17, "grad_norm": 0.003204792272299528, "learning_rate": 9.665924276169266e-05, "loss": 0.0485, "step": 2320 }, { "epoch": 5.19, "grad_norm": 0.001054179621860385, "learning_rate": 9.621380846325168e-05, "loss": 0.0456, "step": 2330 }, { "epoch": 5.21, "grad_norm": 0.0008275459986180067, "learning_rate": 9.576837416481069e-05, "loss": 0.0003, "step": 2340 }, { "epoch": 5.23, "grad_norm": 0.06051745265722275, "learning_rate": 9.532293986636972e-05, "loss": 0.0005, "step": 2350 }, { "epoch": 5.26, "grad_norm": 0.0037743300199508667, "learning_rate": 9.487750556792874e-05, "loss": 0.0034, "step": 2360 }, { "epoch": 5.28, "grad_norm": 0.00043078724411316216, "learning_rate": 9.443207126948775e-05, "loss": 0.0004, "step": 2370 }, { "epoch": 5.3, "grad_norm": 0.001160395797342062, "learning_rate": 9.398663697104677e-05, "loss": 0.0009, "step": 2380 }, { "epoch": 5.32, "grad_norm": 0.0006127849337644875, "learning_rate": 9.35412026726058e-05, "loss": 0.0001, "step": 2390 }, { "epoch": 5.35, "grad_norm": 0.0023824446834623814, "learning_rate": 9.309576837416482e-05, "loss": 0.0037, "step": 2400 }, { "epoch": 5.35, "eval_accuracy": 0.961764705882353, "eval_f1": 0.961764705882353, "eval_loss": 0.17718201875686646, "eval_precision": 0.961764705882353, "eval_recall": 0.961764705882353, "eval_runtime": 17.3617, "eval_samples_per_second": 58.75, "eval_steps_per_second": 7.373, "step": 2400 }, { "epoch": 5.37, "grad_norm": 0.0015472627710551023, "learning_rate": 9.265033407572383e-05, "loss": 0.0132, "step": 2410 }, { "epoch": 5.39, "grad_norm": 0.002203689655289054, "learning_rate": 9.220489977728286e-05, "loss": 0.0003, "step": 2420 }, { "epoch": 5.41, "grad_norm": 0.003569718450307846, "learning_rate": 9.175946547884187e-05, "loss": 0.0006, "step": 2430 }, { "epoch": 5.43, "grad_norm": 0.0020932252518832684, "learning_rate": 9.13140311804009e-05, "loss": 0.0051, "step": 2440 }, { "epoch": 5.46, "grad_norm": 0.0030166106298565865, "learning_rate": 9.086859688195991e-05, "loss": 0.0008, "step": 2450 }, { "epoch": 5.48, "grad_norm": 0.001667293719947338, "learning_rate": 9.042316258351894e-05, "loss": 0.0004, "step": 2460 }, { "epoch": 5.5, "grad_norm": 0.0005573901580646634, "learning_rate": 8.997772828507795e-05, "loss": 0.0097, "step": 2470 }, { "epoch": 5.52, "grad_norm": 0.0013365427730605006, "learning_rate": 8.953229398663698e-05, "loss": 0.0002, "step": 2480 }, { "epoch": 5.55, "grad_norm": 0.0031586128752678633, "learning_rate": 8.9086859688196e-05, "loss": 0.0022, "step": 2490 }, { "epoch": 5.57, "grad_norm": 0.21919859945774078, "learning_rate": 8.864142538975502e-05, "loss": 0.0213, "step": 2500 }, { "epoch": 5.57, "eval_accuracy": 0.9519607843137254, "eval_f1": 0.9519607843137254, "eval_loss": 0.2910812199115753, "eval_precision": 0.9519607843137254, "eval_recall": 0.9519607843137254, "eval_runtime": 17.3785, "eval_samples_per_second": 58.693, "eval_steps_per_second": 7.365, "step": 2500 }, { "epoch": 5.59, "grad_norm": 0.002107376931235194, "learning_rate": 8.819599109131403e-05, "loss": 0.001, "step": 2510 }, { "epoch": 5.61, "grad_norm": 11.032261848449707, "learning_rate": 8.775055679287306e-05, "loss": 0.0331, "step": 2520 }, { "epoch": 5.63, "grad_norm": 0.0010789623484015465, "learning_rate": 8.730512249443208e-05, "loss": 0.0002, "step": 2530 }, { "epoch": 5.66, "grad_norm": 0.0013054576702415943, "learning_rate": 8.68596881959911e-05, "loss": 0.0019, "step": 2540 }, { "epoch": 5.68, "grad_norm": 0.001707877148874104, "learning_rate": 8.641425389755011e-05, "loss": 0.0004, "step": 2550 }, { "epoch": 5.7, "grad_norm": 0.0006996811716817319, "learning_rate": 8.596881959910914e-05, "loss": 0.0172, "step": 2560 }, { "epoch": 5.72, "grad_norm": 0.00302655971609056, "learning_rate": 8.552338530066816e-05, "loss": 0.0461, "step": 2570 }, { "epoch": 5.75, "grad_norm": 0.0005682900664396584, "learning_rate": 8.507795100222718e-05, "loss": 0.0002, "step": 2580 }, { "epoch": 5.77, "grad_norm": 0.0009555260185152292, "learning_rate": 8.463251670378619e-05, "loss": 0.0002, "step": 2590 }, { "epoch": 5.79, "grad_norm": 0.020968729630112648, "learning_rate": 8.418708240534521e-05, "loss": 0.027, "step": 2600 }, { "epoch": 5.79, "eval_accuracy": 0.9519607843137254, "eval_f1": 0.9519607843137254, "eval_loss": 0.25403299927711487, "eval_precision": 0.9519607843137254, "eval_recall": 0.9519607843137254, "eval_runtime": 17.2499, "eval_samples_per_second": 59.131, "eval_steps_per_second": 7.42, "step": 2600 }, { "epoch": 5.81, "grad_norm": 0.10158411413431168, "learning_rate": 8.374164810690424e-05, "loss": 0.0003, "step": 2610 }, { "epoch": 5.84, "grad_norm": 0.016370078548789024, "learning_rate": 8.329621380846325e-05, "loss": 0.0981, "step": 2620 }, { "epoch": 5.86, "grad_norm": 0.0012518821749836206, "learning_rate": 8.285077951002228e-05, "loss": 0.0004, "step": 2630 }, { "epoch": 5.88, "grad_norm": 0.0029411010909825563, "learning_rate": 8.24053452115813e-05, "loss": 0.0007, "step": 2640 }, { "epoch": 5.9, "grad_norm": 0.000937216158490628, "learning_rate": 8.195991091314032e-05, "loss": 0.0177, "step": 2650 }, { "epoch": 5.92, "grad_norm": 0.0034318570978939533, "learning_rate": 8.151447661469933e-05, "loss": 0.0312, "step": 2660 }, { "epoch": 5.95, "grad_norm": 0.0007262133876793087, "learning_rate": 8.106904231625836e-05, "loss": 0.0002, "step": 2670 }, { "epoch": 5.97, "grad_norm": 0.0007804339984431863, "learning_rate": 8.062360801781737e-05, "loss": 0.0002, "step": 2680 }, { "epoch": 5.99, "grad_norm": 0.001342720352113247, "learning_rate": 8.01781737193764e-05, "loss": 0.0002, "step": 2690 }, { "epoch": 6.01, "grad_norm": 0.001933095627464354, "learning_rate": 7.973273942093543e-05, "loss": 0.0155, "step": 2700 }, { "epoch": 6.01, "eval_accuracy": 0.9549019607843138, "eval_f1": 0.9549019607843138, "eval_loss": 0.22524712979793549, "eval_precision": 0.9549019607843138, "eval_recall": 0.9549019607843138, "eval_runtime": 17.4105, "eval_samples_per_second": 58.585, "eval_steps_per_second": 7.352, "step": 2700 }, { "epoch": 6.04, "grad_norm": 0.0011171975638717413, "learning_rate": 7.928730512249444e-05, "loss": 0.0002, "step": 2710 }, { "epoch": 6.06, "grad_norm": 0.001169139752164483, "learning_rate": 7.884187082405345e-05, "loss": 0.0002, "step": 2720 }, { "epoch": 6.08, "grad_norm": 0.002055455232039094, "learning_rate": 7.839643652561248e-05, "loss": 0.0082, "step": 2730 }, { "epoch": 6.1, "grad_norm": 0.05008271709084511, "learning_rate": 7.79510022271715e-05, "loss": 0.0321, "step": 2740 }, { "epoch": 6.12, "grad_norm": 0.000607622554525733, "learning_rate": 7.750556792873052e-05, "loss": 0.0397, "step": 2750 }, { "epoch": 6.15, "grad_norm": 0.000795868574641645, "learning_rate": 7.706013363028953e-05, "loss": 0.0064, "step": 2760 }, { "epoch": 6.17, "grad_norm": 0.010306187905371189, "learning_rate": 7.661469933184856e-05, "loss": 0.004, "step": 2770 }, { "epoch": 6.19, "grad_norm": 0.001497789635322988, "learning_rate": 7.616926503340758e-05, "loss": 0.005, "step": 2780 }, { "epoch": 6.21, "grad_norm": 0.12043255567550659, "learning_rate": 7.57238307349666e-05, "loss": 0.003, "step": 2790 }, { "epoch": 6.24, "grad_norm": 0.0034782905131578445, "learning_rate": 7.527839643652561e-05, "loss": 0.0002, "step": 2800 }, { "epoch": 6.24, "eval_accuracy": 0.9431372549019608, "eval_f1": 0.9431372549019608, "eval_loss": 0.3040062189102173, "eval_precision": 0.9431372549019608, "eval_recall": 0.9431372549019608, "eval_runtime": 17.1818, "eval_samples_per_second": 59.365, "eval_steps_per_second": 7.45, "step": 2800 }, { "epoch": 6.26, "grad_norm": 5.399389266967773, "learning_rate": 7.483296213808464e-05, "loss": 0.0092, "step": 2810 }, { "epoch": 6.28, "grad_norm": 0.0007157580694183707, "learning_rate": 7.438752783964366e-05, "loss": 0.0134, "step": 2820 }, { "epoch": 6.3, "grad_norm": 0.005470567848533392, "learning_rate": 7.394209354120267e-05, "loss": 0.147, "step": 2830 }, { "epoch": 6.33, "grad_norm": 0.01675906591117382, "learning_rate": 7.34966592427617e-05, "loss": 0.0813, "step": 2840 }, { "epoch": 6.35, "grad_norm": 5.728336334228516, "learning_rate": 7.305122494432071e-05, "loss": 0.0275, "step": 2850 }, { "epoch": 6.37, "grad_norm": 0.003522884799167514, "learning_rate": 7.260579064587974e-05, "loss": 0.0109, "step": 2860 }, { "epoch": 6.39, "grad_norm": 0.026970118284225464, "learning_rate": 7.216035634743875e-05, "loss": 0.0004, "step": 2870 }, { "epoch": 6.41, "grad_norm": 0.12984509766101837, "learning_rate": 7.171492204899778e-05, "loss": 0.0007, "step": 2880 }, { "epoch": 6.44, "grad_norm": 0.009168056771159172, "learning_rate": 7.126948775055679e-05, "loss": 0.0006, "step": 2890 }, { "epoch": 6.46, "grad_norm": 0.0009597400785423815, "learning_rate": 7.082405345211582e-05, "loss": 0.011, "step": 2900 }, { "epoch": 6.46, "eval_accuracy": 0.9598039215686275, "eval_f1": 0.9598039215686275, "eval_loss": 0.1923176795244217, "eval_precision": 0.9598039215686275, "eval_recall": 0.9598039215686275, "eval_runtime": 17.225, "eval_samples_per_second": 59.216, "eval_steps_per_second": 7.431, "step": 2900 }, { "epoch": 6.48, "grad_norm": 0.0009739417000673711, "learning_rate": 7.037861915367485e-05, "loss": 0.0014, "step": 2910 }, { "epoch": 6.5, "grad_norm": 0.0022935476154088974, "learning_rate": 6.993318485523386e-05, "loss": 0.0133, "step": 2920 }, { "epoch": 6.53, "grad_norm": 0.0005638069123961031, "learning_rate": 6.948775055679287e-05, "loss": 0.0009, "step": 2930 }, { "epoch": 6.55, "grad_norm": 0.014625852927565575, "learning_rate": 6.904231625835188e-05, "loss": 0.0006, "step": 2940 }, { "epoch": 6.57, "grad_norm": 0.001474756863899529, "learning_rate": 6.859688195991092e-05, "loss": 0.0128, "step": 2950 }, { "epoch": 6.59, "grad_norm": 0.0029620621353387833, "learning_rate": 6.815144766146994e-05, "loss": 0.0002, "step": 2960 }, { "epoch": 6.61, "grad_norm": 0.0016939816996455193, "learning_rate": 6.770601336302895e-05, "loss": 0.0003, "step": 2970 }, { "epoch": 6.64, "grad_norm": 0.0009252108866348863, "learning_rate": 6.726057906458798e-05, "loss": 0.0011, "step": 2980 }, { "epoch": 6.66, "grad_norm": 0.1285027265548706, "learning_rate": 6.6815144766147e-05, "loss": 0.0071, "step": 2990 }, { "epoch": 6.68, "grad_norm": 0.00125114805996418, "learning_rate": 6.636971046770602e-05, "loss": 0.0006, "step": 3000 }, { "epoch": 6.68, "eval_accuracy": 0.9637254901960784, "eval_f1": 0.9637254901960784, "eval_loss": 0.20890936255455017, "eval_precision": 0.9637254901960784, "eval_recall": 0.9637254901960784, "eval_runtime": 17.1679, "eval_samples_per_second": 59.413, "eval_steps_per_second": 7.456, "step": 3000 }, { "epoch": 6.7, "grad_norm": 0.0017677777213975787, "learning_rate": 6.592427616926503e-05, "loss": 0.03, "step": 3010 }, { "epoch": 6.73, "grad_norm": 0.0006067939684726298, "learning_rate": 6.547884187082406e-05, "loss": 0.0003, "step": 3020 }, { "epoch": 6.75, "grad_norm": 0.0010102881351485848, "learning_rate": 6.503340757238308e-05, "loss": 0.0002, "step": 3030 }, { "epoch": 6.77, "grad_norm": 0.0009144017240032554, "learning_rate": 6.45879732739421e-05, "loss": 0.0282, "step": 3040 }, { "epoch": 6.79, "grad_norm": 0.0007274287054315209, "learning_rate": 6.414253897550112e-05, "loss": 0.0002, "step": 3050 }, { "epoch": 6.82, "grad_norm": 0.0034935837611556053, "learning_rate": 6.369710467706013e-05, "loss": 0.0046, "step": 3060 }, { "epoch": 6.84, "grad_norm": 0.004235483705997467, "learning_rate": 6.325167037861916e-05, "loss": 0.0002, "step": 3070 }, { "epoch": 6.86, "grad_norm": 0.0013753636740148067, "learning_rate": 6.280623608017817e-05, "loss": 0.0101, "step": 3080 }, { "epoch": 6.88, "grad_norm": 0.0008035491337068379, "learning_rate": 6.23608017817372e-05, "loss": 0.0002, "step": 3090 }, { "epoch": 6.9, "grad_norm": 0.00173095241189003, "learning_rate": 6.191536748329621e-05, "loss": 0.0002, "step": 3100 }, { "epoch": 6.9, "eval_accuracy": 0.957843137254902, "eval_f1": 0.957843137254902, "eval_loss": 0.22062458097934723, "eval_precision": 0.957843137254902, "eval_recall": 0.957843137254902, "eval_runtime": 17.3063, "eval_samples_per_second": 58.938, "eval_steps_per_second": 7.396, "step": 3100 }, { "epoch": 6.93, "grad_norm": 0.001496517681516707, "learning_rate": 6.146993318485523e-05, "loss": 0.0002, "step": 3110 }, { "epoch": 6.95, "grad_norm": 0.0005152082885615528, "learning_rate": 6.102449888641426e-05, "loss": 0.0002, "step": 3120 }, { "epoch": 6.97, "grad_norm": 0.0005918457172811031, "learning_rate": 6.057906458797328e-05, "loss": 0.0192, "step": 3130 }, { "epoch": 6.99, "grad_norm": 0.000515251827891916, "learning_rate": 6.013363028953229e-05, "loss": 0.0003, "step": 3140 }, { "epoch": 7.02, "grad_norm": 0.00043858023127540946, "learning_rate": 5.9688195991091325e-05, "loss": 0.0105, "step": 3150 }, { "epoch": 7.04, "grad_norm": 0.0018106413772329688, "learning_rate": 5.924276169265034e-05, "loss": 0.0059, "step": 3160 }, { "epoch": 7.06, "grad_norm": 0.000563229201361537, "learning_rate": 5.879732739420936e-05, "loss": 0.0003, "step": 3170 }, { "epoch": 7.08, "grad_norm": 0.001515958341769874, "learning_rate": 5.835189309576837e-05, "loss": 0.0058, "step": 3180 }, { "epoch": 7.1, "grad_norm": 0.0005047390004619956, "learning_rate": 5.79064587973274e-05, "loss": 0.0002, "step": 3190 }, { "epoch": 7.13, "grad_norm": 0.03174121677875519, "learning_rate": 5.746102449888642e-05, "loss": 0.0006, "step": 3200 }, { "epoch": 7.13, "eval_accuracy": 0.9627450980392157, "eval_f1": 0.9627450980392157, "eval_loss": 0.22668223083019257, "eval_precision": 0.9627450980392157, "eval_recall": 0.9627450980392157, "eval_runtime": 16.905, "eval_samples_per_second": 60.337, "eval_steps_per_second": 7.572, "step": 3200 }, { "epoch": 7.15, "grad_norm": 0.0019246222218498588, "learning_rate": 5.701559020044544e-05, "loss": 0.0002, "step": 3210 }, { "epoch": 7.17, "grad_norm": 8.673022270202637, "learning_rate": 5.6570155902004463e-05, "loss": 0.0058, "step": 3220 }, { "epoch": 7.19, "grad_norm": 0.0006804656004533172, "learning_rate": 5.6124721603563476e-05, "loss": 0.0002, "step": 3230 }, { "epoch": 7.22, "grad_norm": 0.0013651803601533175, "learning_rate": 5.5679287305122496e-05, "loss": 0.0007, "step": 3240 }, { "epoch": 7.24, "grad_norm": 0.0014620161382481456, "learning_rate": 5.5233853006681516e-05, "loss": 0.0002, "step": 3250 }, { "epoch": 7.26, "grad_norm": 0.0009020831785164773, "learning_rate": 5.478841870824054e-05, "loss": 0.0003, "step": 3260 }, { "epoch": 7.28, "grad_norm": 0.0010328377829864621, "learning_rate": 5.4342984409799555e-05, "loss": 0.0016, "step": 3270 }, { "epoch": 7.31, "grad_norm": 0.002698230091482401, "learning_rate": 5.3897550111358575e-05, "loss": 0.0002, "step": 3280 }, { "epoch": 7.33, "grad_norm": 0.0025662758853286505, "learning_rate": 5.34521158129176e-05, "loss": 0.0002, "step": 3290 }, { "epoch": 7.35, "grad_norm": 0.0005825618281960487, "learning_rate": 5.300668151447662e-05, "loss": 0.0001, "step": 3300 }, { "epoch": 7.35, "eval_accuracy": 0.9637254901960784, "eval_f1": 0.9637254901960784, "eval_loss": 0.17346832156181335, "eval_precision": 0.9637254901960784, "eval_recall": 0.9637254901960784, "eval_runtime": 17.0715, "eval_samples_per_second": 59.749, "eval_steps_per_second": 7.498, "step": 3300 }, { "epoch": 7.37, "grad_norm": 0.001066404627636075, "learning_rate": 5.2561247216035634e-05, "loss": 0.0001, "step": 3310 }, { "epoch": 7.39, "grad_norm": 0.0007172970799729228, "learning_rate": 5.2115812917594654e-05, "loss": 0.0002, "step": 3320 }, { "epoch": 7.42, "grad_norm": 0.000634915370028466, "learning_rate": 5.167037861915368e-05, "loss": 0.0001, "step": 3330 }, { "epoch": 7.44, "grad_norm": 0.004406619351357222, "learning_rate": 5.12249443207127e-05, "loss": 0.0001, "step": 3340 }, { "epoch": 7.46, "grad_norm": 0.015614562667906284, "learning_rate": 5.077951002227171e-05, "loss": 0.0002, "step": 3350 }, { "epoch": 7.48, "grad_norm": 0.0017906671855598688, "learning_rate": 5.033407572383074e-05, "loss": 0.0003, "step": 3360 }, { "epoch": 7.51, "grad_norm": 0.002051855204626918, "learning_rate": 4.988864142538976e-05, "loss": 0.0003, "step": 3370 }, { "epoch": 7.53, "grad_norm": 0.0007296734838746488, "learning_rate": 4.944320712694878e-05, "loss": 0.0001, "step": 3380 }, { "epoch": 7.55, "grad_norm": 0.0005030659376643598, "learning_rate": 4.89977728285078e-05, "loss": 0.001, "step": 3390 }, { "epoch": 7.57, "grad_norm": 0.00040412909584119916, "learning_rate": 4.855233853006682e-05, "loss": 0.0001, "step": 3400 }, { "epoch": 7.57, "eval_accuracy": 0.9686274509803922, "eval_f1": 0.9686274509803922, "eval_loss": 0.16111387312412262, "eval_precision": 0.9686274509803922, "eval_recall": 0.9686274509803922, "eval_runtime": 16.979, "eval_samples_per_second": 60.074, "eval_steps_per_second": 7.539, "step": 3400 }, { "epoch": 7.59, "grad_norm": 0.0005902125267311931, "learning_rate": 4.810690423162584e-05, "loss": 0.0002, "step": 3410 }, { "epoch": 7.62, "grad_norm": 0.0005297433235682547, "learning_rate": 4.766146993318486e-05, "loss": 0.0002, "step": 3420 }, { "epoch": 7.64, "grad_norm": 0.0005097580142319202, "learning_rate": 4.721603563474388e-05, "loss": 0.0001, "step": 3430 }, { "epoch": 7.66, "grad_norm": 0.0006824088632129133, "learning_rate": 4.67706013363029e-05, "loss": 0.0002, "step": 3440 }, { "epoch": 7.68, "grad_norm": 0.0007693713996559381, "learning_rate": 4.632516703786192e-05, "loss": 0.0002, "step": 3450 }, { "epoch": 7.71, "grad_norm": 0.0010686744935810566, "learning_rate": 4.5879732739420936e-05, "loss": 0.0001, "step": 3460 }, { "epoch": 7.73, "grad_norm": 0.0008867672295309603, "learning_rate": 4.5434298440979956e-05, "loss": 0.0001, "step": 3470 }, { "epoch": 7.75, "grad_norm": 0.029097959399223328, "learning_rate": 4.4988864142538976e-05, "loss": 0.0002, "step": 3480 }, { "epoch": 7.77, "grad_norm": 1.0295618772506714, "learning_rate": 4.4543429844098e-05, "loss": 0.0232, "step": 3490 }, { "epoch": 7.8, "grad_norm": 0.01833498664200306, "learning_rate": 4.4097995545657015e-05, "loss": 0.0003, "step": 3500 }, { "epoch": 7.8, "eval_accuracy": 0.9676470588235294, "eval_f1": 0.9676470588235294, "eval_loss": 0.15838229656219482, "eval_precision": 0.9676470588235294, "eval_recall": 0.9676470588235294, "eval_runtime": 17.1614, "eval_samples_per_second": 59.436, "eval_steps_per_second": 7.459, "step": 3500 }, { "epoch": 7.82, "grad_norm": 0.00047049217391759157, "learning_rate": 4.365256124721604e-05, "loss": 0.0002, "step": 3510 }, { "epoch": 7.84, "grad_norm": 0.0005326452082954347, "learning_rate": 4.3207126948775055e-05, "loss": 0.0001, "step": 3520 }, { "epoch": 7.86, "grad_norm": 0.004704196471720934, "learning_rate": 4.276169265033408e-05, "loss": 0.0001, "step": 3530 }, { "epoch": 7.88, "grad_norm": 0.0015603323699906468, "learning_rate": 4.2316258351893094e-05, "loss": 0.0001, "step": 3540 }, { "epoch": 7.91, "grad_norm": 0.001259263837710023, "learning_rate": 4.187082405345212e-05, "loss": 0.0002, "step": 3550 }, { "epoch": 7.93, "grad_norm": 0.0009968471713364124, "learning_rate": 4.142538975501114e-05, "loss": 0.0173, "step": 3560 }, { "epoch": 7.95, "grad_norm": 0.0011363897938281298, "learning_rate": 4.097995545657016e-05, "loss": 0.0002, "step": 3570 }, { "epoch": 7.97, "grad_norm": 0.000970890570897609, "learning_rate": 4.053452115812918e-05, "loss": 0.0002, "step": 3580 }, { "epoch": 8.0, "grad_norm": 0.0004184432327747345, "learning_rate": 4.00890868596882e-05, "loss": 0.0002, "step": 3590 }, { "epoch": 8.02, "grad_norm": 0.0016488181427121162, "learning_rate": 3.964365256124722e-05, "loss": 0.0001, "step": 3600 }, { "epoch": 8.02, "eval_accuracy": 0.9715686274509804, "eval_f1": 0.9715686274509804, "eval_loss": 0.15909050405025482, "eval_precision": 0.9715686274509804, "eval_recall": 0.9715686274509804, "eval_runtime": 17.2363, "eval_samples_per_second": 59.178, "eval_steps_per_second": 7.426, "step": 3600 }, { "epoch": 8.04, "grad_norm": 0.0007161149405874312, "learning_rate": 3.919821826280624e-05, "loss": 0.0001, "step": 3610 }, { "epoch": 8.06, "grad_norm": 0.025815103203058243, "learning_rate": 3.875278396436526e-05, "loss": 0.0002, "step": 3620 }, { "epoch": 8.08, "grad_norm": 0.6380942463874817, "learning_rate": 3.830734966592428e-05, "loss": 0.0067, "step": 3630 }, { "epoch": 8.11, "grad_norm": 0.003612744389101863, "learning_rate": 3.78619153674833e-05, "loss": 0.0004, "step": 3640 }, { "epoch": 8.13, "grad_norm": 0.002143553225323558, "learning_rate": 3.741648106904232e-05, "loss": 0.0002, "step": 3650 }, { "epoch": 8.15, "grad_norm": 0.001735298428684473, "learning_rate": 3.697104677060134e-05, "loss": 0.0002, "step": 3660 }, { "epoch": 8.17, "grad_norm": 0.00038884536479599774, "learning_rate": 3.652561247216036e-05, "loss": 0.0004, "step": 3670 }, { "epoch": 8.2, "grad_norm": 0.0004352598334662616, "learning_rate": 3.608017817371938e-05, "loss": 0.0001, "step": 3680 }, { "epoch": 8.22, "grad_norm": 0.0030426643788814545, "learning_rate": 3.5634743875278396e-05, "loss": 0.0002, "step": 3690 }, { "epoch": 8.24, "grad_norm": 0.0004016205493826419, "learning_rate": 3.518930957683742e-05, "loss": 0.0005, "step": 3700 }, { "epoch": 8.24, "eval_accuracy": 0.9705882352941176, "eval_f1": 0.9705882352941176, "eval_loss": 0.15964852273464203, "eval_precision": 0.9705882352941176, "eval_recall": 0.9705882352941176, "eval_runtime": 17.4233, "eval_samples_per_second": 58.542, "eval_steps_per_second": 7.346, "step": 3700 }, { "epoch": 8.26, "grad_norm": 0.0010756178526207805, "learning_rate": 3.4743875278396436e-05, "loss": 0.0001, "step": 3710 }, { "epoch": 8.29, "grad_norm": 0.002286019967868924, "learning_rate": 3.429844097995546e-05, "loss": 0.0001, "step": 3720 }, { "epoch": 8.31, "grad_norm": 0.0019246740266680717, "learning_rate": 3.3853006681514475e-05, "loss": 0.0001, "step": 3730 }, { "epoch": 8.33, "grad_norm": 0.0022740724962204695, "learning_rate": 3.34075723830735e-05, "loss": 0.0001, "step": 3740 }, { "epoch": 8.35, "grad_norm": 0.0003697601496241987, "learning_rate": 3.2962138084632515e-05, "loss": 0.0001, "step": 3750 }, { "epoch": 8.37, "grad_norm": 0.00033845697180368006, "learning_rate": 3.251670378619154e-05, "loss": 0.0001, "step": 3760 }, { "epoch": 8.4, "grad_norm": 0.0004189737082924694, "learning_rate": 3.207126948775056e-05, "loss": 0.0001, "step": 3770 }, { "epoch": 8.42, "grad_norm": 0.0008992131915874779, "learning_rate": 3.162583518930958e-05, "loss": 0.0004, "step": 3780 }, { "epoch": 8.44, "grad_norm": 0.0008794433670118451, "learning_rate": 3.11804008908686e-05, "loss": 0.0001, "step": 3790 }, { "epoch": 8.46, "grad_norm": 0.0017194098327308893, "learning_rate": 3.073496659242761e-05, "loss": 0.0002, "step": 3800 }, { "epoch": 8.46, "eval_accuracy": 0.9715686274509804, "eval_f1": 0.9715686274509804, "eval_loss": 0.15634377300739288, "eval_precision": 0.9715686274509804, "eval_recall": 0.9715686274509804, "eval_runtime": 17.0753, "eval_samples_per_second": 59.735, "eval_steps_per_second": 7.496, "step": 3800 }, { "epoch": 8.49, "grad_norm": 0.0009110970422625542, "learning_rate": 3.028953229398664e-05, "loss": 0.0002, "step": 3810 }, { "epoch": 8.51, "grad_norm": 0.0010168278822675347, "learning_rate": 2.9844097995545663e-05, "loss": 0.0001, "step": 3820 }, { "epoch": 8.53, "grad_norm": 0.0005708567332476377, "learning_rate": 2.939866369710468e-05, "loss": 0.0001, "step": 3830 }, { "epoch": 8.55, "grad_norm": 0.0033207752276211977, "learning_rate": 2.89532293986637e-05, "loss": 0.0001, "step": 3840 }, { "epoch": 8.57, "grad_norm": 0.0010962020605802536, "learning_rate": 2.850779510022272e-05, "loss": 0.0001, "step": 3850 }, { "epoch": 8.6, "grad_norm": 0.0008160584839060903, "learning_rate": 2.8062360801781738e-05, "loss": 0.0001, "step": 3860 }, { "epoch": 8.62, "grad_norm": 0.001714337500743568, "learning_rate": 2.7616926503340758e-05, "loss": 0.0001, "step": 3870 }, { "epoch": 8.64, "grad_norm": 0.002063535852357745, "learning_rate": 2.7171492204899778e-05, "loss": 0.0001, "step": 3880 }, { "epoch": 8.66, "grad_norm": 0.0010866498341783881, "learning_rate": 2.67260579064588e-05, "loss": 0.0001, "step": 3890 }, { "epoch": 8.69, "grad_norm": 0.0009104039054363966, "learning_rate": 2.6280623608017817e-05, "loss": 0.0002, "step": 3900 }, { "epoch": 8.69, "eval_accuracy": 0.9715686274509804, "eval_f1": 0.9715686274509804, "eval_loss": 0.15503399074077606, "eval_precision": 0.9715686274509804, "eval_recall": 0.9715686274509804, "eval_runtime": 17.3241, "eval_samples_per_second": 58.877, "eval_steps_per_second": 7.389, "step": 3900 }, { "epoch": 8.71, "grad_norm": 0.0029284367337822914, "learning_rate": 2.583518930957684e-05, "loss": 0.007, "step": 3910 }, { "epoch": 8.73, "grad_norm": 0.0005087658646516502, "learning_rate": 2.5389755011135856e-05, "loss": 0.0001, "step": 3920 }, { "epoch": 8.75, "grad_norm": 0.0005650007515214384, "learning_rate": 2.494432071269488e-05, "loss": 0.0001, "step": 3930 }, { "epoch": 8.78, "grad_norm": 0.0005321349017322063, "learning_rate": 2.44988864142539e-05, "loss": 0.0001, "step": 3940 }, { "epoch": 8.8, "grad_norm": 0.0008177491254173219, "learning_rate": 2.405345211581292e-05, "loss": 0.0001, "step": 3950 }, { "epoch": 8.82, "grad_norm": 0.0005406069685705006, "learning_rate": 2.360801781737194e-05, "loss": 0.0001, "step": 3960 }, { "epoch": 8.84, "grad_norm": 0.000659614393953234, "learning_rate": 2.316258351893096e-05, "loss": 0.0001, "step": 3970 }, { "epoch": 8.86, "grad_norm": 0.0004996512434445322, "learning_rate": 2.2717149220489978e-05, "loss": 0.0001, "step": 3980 }, { "epoch": 8.89, "grad_norm": 0.0002974416420329362, "learning_rate": 2.2271714922049e-05, "loss": 0.0001, "step": 3990 }, { "epoch": 8.91, "grad_norm": 0.0011179678840562701, "learning_rate": 2.182628062360802e-05, "loss": 0.0001, "step": 4000 }, { "epoch": 8.91, "eval_accuracy": 0.9705882352941176, "eval_f1": 0.9705882352941176, "eval_loss": 0.15417079627513885, "eval_precision": 0.9705882352941176, "eval_recall": 0.9705882352941176, "eval_runtime": 17.0729, "eval_samples_per_second": 59.744, "eval_steps_per_second": 7.497, "step": 4000 }, { "epoch": 8.93, "grad_norm": 0.0005180141888558865, "learning_rate": 2.138084632516704e-05, "loss": 0.0001, "step": 4010 }, { "epoch": 8.95, "grad_norm": 0.0005326379905454814, "learning_rate": 2.093541202672606e-05, "loss": 0.0001, "step": 4020 }, { "epoch": 8.98, "grad_norm": 0.0006433409289456904, "learning_rate": 2.048997772828508e-05, "loss": 0.0001, "step": 4030 }, { "epoch": 9.0, "grad_norm": 0.002777345711365342, "learning_rate": 2.00445434298441e-05, "loss": 0.0001, "step": 4040 }, { "epoch": 9.02, "grad_norm": 0.0007074729655869305, "learning_rate": 1.959910913140312e-05, "loss": 0.0001, "step": 4050 }, { "epoch": 9.04, "grad_norm": 0.001536020776256919, "learning_rate": 1.915367483296214e-05, "loss": 0.0001, "step": 4060 }, { "epoch": 9.06, "grad_norm": 0.0009424517047591507, "learning_rate": 1.870824053452116e-05, "loss": 0.0001, "step": 4070 }, { "epoch": 9.09, "grad_norm": 0.3284554183483124, "learning_rate": 1.826280623608018e-05, "loss": 0.0026, "step": 4080 }, { "epoch": 9.11, "grad_norm": 0.00030903713195584714, "learning_rate": 1.7817371937639198e-05, "loss": 0.0001, "step": 4090 }, { "epoch": 9.13, "grad_norm": 0.0005440358072519302, "learning_rate": 1.7371937639198218e-05, "loss": 0.0001, "step": 4100 }, { "epoch": 9.13, "eval_accuracy": 0.9715686274509804, "eval_f1": 0.9715686274509804, "eval_loss": 0.15382429957389832, "eval_precision": 0.9715686274509804, "eval_recall": 0.9715686274509804, "eval_runtime": 16.821, "eval_samples_per_second": 60.638, "eval_steps_per_second": 7.61, "step": 4100 }, { "epoch": 9.15, "grad_norm": 0.0006746925064362586, "learning_rate": 1.6926503340757238e-05, "loss": 0.0001, "step": 4110 }, { "epoch": 9.18, "grad_norm": 0.00037791216163896024, "learning_rate": 1.6481069042316257e-05, "loss": 0.0001, "step": 4120 }, { "epoch": 9.2, "grad_norm": 0.0012918213615193963, "learning_rate": 1.603563474387528e-05, "loss": 0.0001, "step": 4130 }, { "epoch": 9.22, "grad_norm": 0.000723692704923451, "learning_rate": 1.55902004454343e-05, "loss": 0.0001, "step": 4140 }, { "epoch": 9.24, "grad_norm": 0.0006746066501364112, "learning_rate": 1.514476614699332e-05, "loss": 0.0001, "step": 4150 }, { "epoch": 9.27, "grad_norm": 0.000450183724751696, "learning_rate": 1.469933184855234e-05, "loss": 0.0001, "step": 4160 }, { "epoch": 9.29, "grad_norm": 0.0011862111277878284, "learning_rate": 1.425389755011136e-05, "loss": 0.0001, "step": 4170 }, { "epoch": 9.31, "grad_norm": 0.0017749534454196692, "learning_rate": 1.3808463251670379e-05, "loss": 0.0001, "step": 4180 }, { "epoch": 9.33, "grad_norm": 0.0003237236524000764, "learning_rate": 1.33630289532294e-05, "loss": 0.0001, "step": 4190 }, { "epoch": 9.35, "grad_norm": 0.0004740317235700786, "learning_rate": 1.291759465478842e-05, "loss": 0.0001, "step": 4200 }, { "epoch": 9.35, "eval_accuracy": 0.9715686274509804, "eval_f1": 0.9715686274509804, "eval_loss": 0.15357248485088348, "eval_precision": 0.9715686274509804, "eval_recall": 0.9715686274509804, "eval_runtime": 17.097, "eval_samples_per_second": 59.659, "eval_steps_per_second": 7.487, "step": 4200 }, { "epoch": 9.38, "grad_norm": 0.0004047084948979318, "learning_rate": 1.247216035634744e-05, "loss": 0.0001, "step": 4210 }, { "epoch": 9.4, "grad_norm": 0.0028567886911332607, "learning_rate": 1.202672605790646e-05, "loss": 0.0001, "step": 4220 }, { "epoch": 9.42, "grad_norm": 0.0005680415779352188, "learning_rate": 1.158129175946548e-05, "loss": 0.0001, "step": 4230 }, { "epoch": 9.44, "grad_norm": 0.00213377526961267, "learning_rate": 1.11358574610245e-05, "loss": 0.0001, "step": 4240 }, { "epoch": 9.47, "grad_norm": 0.001148115610703826, "learning_rate": 1.069042316258352e-05, "loss": 0.0001, "step": 4250 }, { "epoch": 9.49, "grad_norm": 0.00045941799180582166, "learning_rate": 1.024498886414254e-05, "loss": 0.0001, "step": 4260 }, { "epoch": 9.51, "grad_norm": 0.0024906108155846596, "learning_rate": 9.79955456570156e-06, "loss": 0.0001, "step": 4270 }, { "epoch": 9.53, "grad_norm": 0.0004669167974498123, "learning_rate": 9.35412026726058e-06, "loss": 0.0001, "step": 4280 }, { "epoch": 9.55, "grad_norm": 0.0003813539515249431, "learning_rate": 8.908685968819599e-06, "loss": 0.0001, "step": 4290 }, { "epoch": 9.58, "grad_norm": 0.00048394210170954466, "learning_rate": 8.463251670378619e-06, "loss": 0.0001, "step": 4300 }, { "epoch": 9.58, "eval_accuracy": 0.9715686274509804, "eval_f1": 0.9715686274509804, "eval_loss": 0.15336920320987701, "eval_precision": 0.9715686274509804, "eval_recall": 0.9715686274509804, "eval_runtime": 16.9378, "eval_samples_per_second": 60.22, "eval_steps_per_second": 7.557, "step": 4300 }, { "epoch": 9.6, "grad_norm": 0.001094264443963766, "learning_rate": 8.01781737193764e-06, "loss": 0.0001, "step": 4310 }, { "epoch": 9.62, "grad_norm": 0.0007720951689407229, "learning_rate": 7.57238307349666e-06, "loss": 0.0001, "step": 4320 }, { "epoch": 9.64, "grad_norm": 0.0010363436304032803, "learning_rate": 7.12694877505568e-06, "loss": 0.0001, "step": 4330 }, { "epoch": 9.67, "grad_norm": 0.0006155350711196661, "learning_rate": 6.6815144766147e-06, "loss": 0.0001, "step": 4340 }, { "epoch": 9.69, "grad_norm": 0.002117099007591605, "learning_rate": 6.23608017817372e-06, "loss": 0.0001, "step": 4350 }, { "epoch": 9.71, "grad_norm": 0.00035223804297856987, "learning_rate": 5.79064587973274e-06, "loss": 0.0001, "step": 4360 }, { "epoch": 9.73, "grad_norm": 0.0008616923005320132, "learning_rate": 5.34521158129176e-06, "loss": 0.0001, "step": 4370 }, { "epoch": 9.76, "grad_norm": 0.0003698187065310776, "learning_rate": 4.89977728285078e-06, "loss": 0.0001, "step": 4380 }, { "epoch": 9.78, "grad_norm": 0.0008464885177090764, "learning_rate": 4.4543429844097995e-06, "loss": 0.0001, "step": 4390 }, { "epoch": 9.8, "grad_norm": 0.000521197565831244, "learning_rate": 4.00890868596882e-06, "loss": 0.0001, "step": 4400 }, { "epoch": 9.8, "eval_accuracy": 0.9715686274509804, "eval_f1": 0.9715686274509804, "eval_loss": 0.15329474210739136, "eval_precision": 0.9715686274509804, "eval_recall": 0.9715686274509804, "eval_runtime": 17.0026, "eval_samples_per_second": 59.991, "eval_steps_per_second": 7.528, "step": 4400 }, { "epoch": 9.82, "grad_norm": 0.00078478833893314, "learning_rate": 3.56347438752784e-06, "loss": 0.0001, "step": 4410 }, { "epoch": 9.84, "grad_norm": 0.00044067302951589227, "learning_rate": 3.11804008908686e-06, "loss": 0.0001, "step": 4420 }, { "epoch": 9.87, "grad_norm": 0.0005106424796395004, "learning_rate": 2.67260579064588e-06, "loss": 0.0001, "step": 4430 }, { "epoch": 9.89, "grad_norm": 0.0003234909090679139, "learning_rate": 2.2271714922048998e-06, "loss": 0.0001, "step": 4440 }, { "epoch": 9.91, "grad_norm": 0.0003544181527104229, "learning_rate": 1.78173719376392e-06, "loss": 0.0001, "step": 4450 }, { "epoch": 9.93, "grad_norm": 0.00040398509008809924, "learning_rate": 1.33630289532294e-06, "loss": 0.0001, "step": 4460 }, { "epoch": 9.96, "grad_norm": 0.0009952255059033632, "learning_rate": 8.9086859688196e-07, "loss": 0.0001, "step": 4470 }, { "epoch": 9.98, "grad_norm": 0.0003712301841005683, "learning_rate": 4.4543429844098e-07, "loss": 0.0001, "step": 4480 }, { "epoch": 10.0, "grad_norm": 0.0012604963267222047, "learning_rate": 0.0, "loss": 0.0099, "step": 4490 }, { "epoch": 10.0, "step": 4490, "total_flos": 5.562769847811564e+18, "train_loss": 0.025327244219600944, "train_runtime": 3846.586, "train_samples_per_second": 18.637, "train_steps_per_second": 1.167 } ], "logging_steps": 10, "max_steps": 4490, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "total_flos": 5.562769847811564e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }