{ "best_metric": 0.583153247833252, "best_model_checkpoint": "./vit-base-brain-tumor-detection/checkpoint-1000", "epoch": 20.0, "eval_steps": 100, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "grad_norm": 1.653470516204834, "learning_rate": 0.0001996, "loss": 1.3897, "step": 10 }, { "epoch": 0.08, "grad_norm": 0.5039440989494324, "learning_rate": 0.00019920000000000002, "loss": 1.4048, "step": 20 }, { "epoch": 0.12, "grad_norm": 0.8407747745513916, "learning_rate": 0.0001988, "loss": 1.3912, "step": 30 }, { "epoch": 0.16, "grad_norm": 0.898512601852417, "learning_rate": 0.0001984, "loss": 1.3661, "step": 40 }, { "epoch": 0.2, "grad_norm": 2.638078212738037, "learning_rate": 0.00019800000000000002, "loss": 1.2842, "step": 50 }, { "epoch": 0.24, "grad_norm": 1.7601313591003418, "learning_rate": 0.0001976, "loss": 1.239, "step": 60 }, { "epoch": 0.28, "grad_norm": 5.79738187789917, "learning_rate": 0.0001972, "loss": 1.1859, "step": 70 }, { "epoch": 0.32, "grad_norm": 3.172907829284668, "learning_rate": 0.0001968, "loss": 1.1265, "step": 80 }, { "epoch": 0.36, "grad_norm": 3.0865306854248047, "learning_rate": 0.0001964, "loss": 1.1506, "step": 90 }, { "epoch": 0.4, "grad_norm": 3.093891143798828, "learning_rate": 0.000196, "loss": 0.9535, "step": 100 }, { "epoch": 0.4, "eval_accuracy": 0.618, "eval_loss": 0.8965551853179932, "eval_runtime": 12.9233, "eval_samples_per_second": 77.38, "eval_steps_per_second": 9.672, "step": 100 }, { "epoch": 0.44, "grad_norm": 1.729243516921997, "learning_rate": 0.0001956, "loss": 0.9068, "step": 110 }, { "epoch": 0.48, "grad_norm": 4.636998653411865, "learning_rate": 0.0001952, "loss": 0.8465, "step": 120 }, { "epoch": 0.52, "grad_norm": 1.6259952783584595, "learning_rate": 0.0001948, "loss": 1.0864, "step": 130 }, { "epoch": 0.56, "grad_norm": 2.269352436065674, "learning_rate": 0.0001944, "loss": 1.0165, "step": 140 }, { "epoch": 0.6, "grad_norm": 2.516747236251831, "learning_rate": 0.000194, "loss": 0.8877, "step": 150 }, { "epoch": 0.64, "grad_norm": 3.543081760406494, "learning_rate": 0.00019360000000000002, "loss": 0.9845, "step": 160 }, { "epoch": 0.68, "grad_norm": 2.1874799728393555, "learning_rate": 0.0001932, "loss": 0.8626, "step": 170 }, { "epoch": 0.72, "grad_norm": 2.4686241149902344, "learning_rate": 0.0001928, "loss": 0.8459, "step": 180 }, { "epoch": 0.76, "grad_norm": 3.529902696609497, "learning_rate": 0.00019240000000000001, "loss": 1.0014, "step": 190 }, { "epoch": 0.8, "grad_norm": 6.457658767700195, "learning_rate": 0.000192, "loss": 0.862, "step": 200 }, { "epoch": 0.8, "eval_accuracy": 0.561, "eval_loss": 1.1149431467056274, "eval_runtime": 5.617, "eval_samples_per_second": 178.029, "eval_steps_per_second": 22.254, "step": 200 }, { "epoch": 0.84, "grad_norm": 3.746286153793335, "learning_rate": 0.0001916, "loss": 0.9211, "step": 210 }, { "epoch": 0.88, "grad_norm": 0.9981279373168945, "learning_rate": 0.0001912, "loss": 0.7935, "step": 220 }, { "epoch": 0.92, "grad_norm": 4.057267189025879, "learning_rate": 0.0001908, "loss": 0.9092, "step": 230 }, { "epoch": 0.96, "grad_norm": 1.723970890045166, "learning_rate": 0.0001904, "loss": 0.8029, "step": 240 }, { "epoch": 1.0, "grad_norm": 2.485327959060669, "learning_rate": 0.00019, "loss": 0.8081, "step": 250 }, { "epoch": 1.04, "grad_norm": 1.9523849487304688, "learning_rate": 0.0001896, "loss": 0.6713, "step": 260 }, { "epoch": 1.08, "grad_norm": 1.719793438911438, "learning_rate": 0.0001892, "loss": 0.6921, "step": 270 }, { "epoch": 1.12, "grad_norm": 1.283629059791565, "learning_rate": 0.0001888, "loss": 0.6989, "step": 280 }, { "epoch": 1.16, "grad_norm": 14.220324516296387, "learning_rate": 0.0001884, "loss": 0.9012, "step": 290 }, { "epoch": 1.2, "grad_norm": 1.621273398399353, "learning_rate": 0.000188, "loss": 0.7373, "step": 300 }, { "epoch": 1.2, "eval_accuracy": 0.605, "eval_loss": 0.8542758822441101, "eval_runtime": 5.4409, "eval_samples_per_second": 183.793, "eval_steps_per_second": 22.974, "step": 300 }, { "epoch": 1.24, "grad_norm": 1.4123197793960571, "learning_rate": 0.0001876, "loss": 0.8087, "step": 310 }, { "epoch": 1.28, "grad_norm": 4.211037635803223, "learning_rate": 0.00018720000000000002, "loss": 0.7055, "step": 320 }, { "epoch": 1.32, "grad_norm": 1.4883663654327393, "learning_rate": 0.00018680000000000001, "loss": 0.7988, "step": 330 }, { "epoch": 1.3599999999999999, "grad_norm": 1.515805959701538, "learning_rate": 0.00018640000000000003, "loss": 0.8664, "step": 340 }, { "epoch": 1.4, "grad_norm": 1.3907535076141357, "learning_rate": 0.00018600000000000002, "loss": 0.6773, "step": 350 }, { "epoch": 1.44, "grad_norm": 1.5013153553009033, "learning_rate": 0.0001856, "loss": 0.6876, "step": 360 }, { "epoch": 1.48, "grad_norm": 2.7530360221862793, "learning_rate": 0.00018520000000000003, "loss": 0.5832, "step": 370 }, { "epoch": 1.52, "grad_norm": 2.470029354095459, "learning_rate": 0.00018480000000000002, "loss": 0.7209, "step": 380 }, { "epoch": 1.56, "grad_norm": 1.3378897905349731, "learning_rate": 0.0001844, "loss": 0.6202, "step": 390 }, { "epoch": 1.6, "grad_norm": 2.541882038116455, "learning_rate": 0.00018400000000000003, "loss": 0.6476, "step": 400 }, { "epoch": 1.6, "eval_accuracy": 0.666, "eval_loss": 0.7306695580482483, "eval_runtime": 6.1267, "eval_samples_per_second": 163.221, "eval_steps_per_second": 20.403, "step": 400 }, { "epoch": 1.6400000000000001, "grad_norm": 2.5715789794921875, "learning_rate": 0.00018360000000000002, "loss": 0.5758, "step": 410 }, { "epoch": 1.6800000000000002, "grad_norm": 1.6180702447891235, "learning_rate": 0.0001832, "loss": 0.7008, "step": 420 }, { "epoch": 1.72, "grad_norm": 2.0645174980163574, "learning_rate": 0.00018280000000000003, "loss": 0.7472, "step": 430 }, { "epoch": 1.76, "grad_norm": 6.950054168701172, "learning_rate": 0.00018240000000000002, "loss": 0.9427, "step": 440 }, { "epoch": 1.8, "grad_norm": 2.622056722640991, "learning_rate": 0.000182, "loss": 0.6746, "step": 450 }, { "epoch": 1.8399999999999999, "grad_norm": 1.489776611328125, "learning_rate": 0.00018160000000000002, "loss": 0.7327, "step": 460 }, { "epoch": 1.88, "grad_norm": 1.0776207447052002, "learning_rate": 0.0001812, "loss": 0.5842, "step": 470 }, { "epoch": 1.92, "grad_norm": 2.286405086517334, "learning_rate": 0.0001808, "loss": 0.6962, "step": 480 }, { "epoch": 1.96, "grad_norm": 2.593034267425537, "learning_rate": 0.00018040000000000002, "loss": 0.6214, "step": 490 }, { "epoch": 2.0, "grad_norm": 1.7781627178192139, "learning_rate": 0.00018, "loss": 0.6712, "step": 500 }, { "epoch": 2.0, "eval_accuracy": 0.694, "eval_loss": 0.6953811049461365, "eval_runtime": 5.961, "eval_samples_per_second": 167.757, "eval_steps_per_second": 20.97, "step": 500 }, { "epoch": 2.04, "grad_norm": 1.331834316253662, "learning_rate": 0.0001796, "loss": 0.3866, "step": 510 }, { "epoch": 2.08, "grad_norm": 1.5159146785736084, "learning_rate": 0.00017920000000000002, "loss": 0.5219, "step": 520 }, { "epoch": 2.12, "grad_norm": 3.525988817214966, "learning_rate": 0.0001788, "loss": 0.4692, "step": 530 }, { "epoch": 2.16, "grad_norm": 6.470268249511719, "learning_rate": 0.0001784, "loss": 0.4269, "step": 540 }, { "epoch": 2.2, "grad_norm": 1.4871269464492798, "learning_rate": 0.00017800000000000002, "loss": 0.3844, "step": 550 }, { "epoch": 2.24, "grad_norm": 3.404496669769287, "learning_rate": 0.0001776, "loss": 0.6704, "step": 560 }, { "epoch": 2.2800000000000002, "grad_norm": 1.3112822771072388, "learning_rate": 0.0001772, "loss": 0.6746, "step": 570 }, { "epoch": 2.32, "grad_norm": 5.380651473999023, "learning_rate": 0.00017680000000000001, "loss": 0.7249, "step": 580 }, { "epoch": 2.36, "grad_norm": 1.006377935409546, "learning_rate": 0.0001764, "loss": 0.4615, "step": 590 }, { "epoch": 2.4, "grad_norm": 2.6458024978637695, "learning_rate": 0.00017600000000000002, "loss": 0.4892, "step": 600 }, { "epoch": 2.4, "eval_accuracy": 0.707, "eval_loss": 0.6391066908836365, "eval_runtime": 5.4367, "eval_samples_per_second": 183.934, "eval_steps_per_second": 22.992, "step": 600 }, { "epoch": 2.44, "grad_norm": 3.180305004119873, "learning_rate": 0.0001756, "loss": 0.5289, "step": 610 }, { "epoch": 2.48, "grad_norm": 3.7770426273345947, "learning_rate": 0.0001752, "loss": 0.4695, "step": 620 }, { "epoch": 2.52, "grad_norm": 3.895836591720581, "learning_rate": 0.00017480000000000002, "loss": 0.4387, "step": 630 }, { "epoch": 2.56, "grad_norm": 3.006532907485962, "learning_rate": 0.0001744, "loss": 0.5095, "step": 640 }, { "epoch": 2.6, "grad_norm": 4.034090995788574, "learning_rate": 0.000174, "loss": 0.5297, "step": 650 }, { "epoch": 2.64, "grad_norm": 2.2521889209747314, "learning_rate": 0.00017360000000000002, "loss": 0.4649, "step": 660 }, { "epoch": 2.68, "grad_norm": 2.8466689586639404, "learning_rate": 0.0001732, "loss": 0.4819, "step": 670 }, { "epoch": 2.7199999999999998, "grad_norm": 8.159728050231934, "learning_rate": 0.0001728, "loss": 0.5005, "step": 680 }, { "epoch": 2.76, "grad_norm": 2.4270811080932617, "learning_rate": 0.00017240000000000002, "loss": 0.4146, "step": 690 }, { "epoch": 2.8, "grad_norm": 4.790659427642822, "learning_rate": 0.000172, "loss": 0.5801, "step": 700 }, { "epoch": 2.8, "eval_accuracy": 0.708, "eval_loss": 0.6246920824050903, "eval_runtime": 5.4112, "eval_samples_per_second": 184.802, "eval_steps_per_second": 23.1, "step": 700 }, { "epoch": 2.84, "grad_norm": 2.981046676635742, "learning_rate": 0.0001716, "loss": 0.5311, "step": 710 }, { "epoch": 2.88, "grad_norm": 3.6522932052612305, "learning_rate": 0.00017120000000000001, "loss": 0.4836, "step": 720 }, { "epoch": 2.92, "grad_norm": 2.448866367340088, "learning_rate": 0.0001708, "loss": 0.6052, "step": 730 }, { "epoch": 2.96, "grad_norm": 3.3822741508483887, "learning_rate": 0.0001704, "loss": 0.4695, "step": 740 }, { "epoch": 3.0, "grad_norm": 3.214433431625366, "learning_rate": 0.00017, "loss": 0.6105, "step": 750 }, { "epoch": 3.04, "grad_norm": 1.5671595335006714, "learning_rate": 0.0001696, "loss": 0.4189, "step": 760 }, { "epoch": 3.08, "grad_norm": 3.648240327835083, "learning_rate": 0.0001692, "loss": 0.2745, "step": 770 }, { "epoch": 3.12, "grad_norm": 1.588549017906189, "learning_rate": 0.0001688, "loss": 0.2882, "step": 780 }, { "epoch": 3.16, "grad_norm": 2.041724920272827, "learning_rate": 0.0001684, "loss": 0.2642, "step": 790 }, { "epoch": 3.2, "grad_norm": 4.308396816253662, "learning_rate": 0.000168, "loss": 0.3505, "step": 800 }, { "epoch": 3.2, "eval_accuracy": 0.778, "eval_loss": 0.6055798530578613, "eval_runtime": 6.0689, "eval_samples_per_second": 164.773, "eval_steps_per_second": 20.597, "step": 800 }, { "epoch": 3.24, "grad_norm": 2.717337131500244, "learning_rate": 0.0001676, "loss": 0.2626, "step": 810 }, { "epoch": 3.2800000000000002, "grad_norm": 2.7859904766082764, "learning_rate": 0.0001672, "loss": 0.3826, "step": 820 }, { "epoch": 3.32, "grad_norm": 3.864297389984131, "learning_rate": 0.0001668, "loss": 0.4313, "step": 830 }, { "epoch": 3.36, "grad_norm": 3.782465934753418, "learning_rate": 0.0001664, "loss": 0.3361, "step": 840 }, { "epoch": 3.4, "grad_norm": 3.1425442695617676, "learning_rate": 0.000166, "loss": 0.2759, "step": 850 }, { "epoch": 3.44, "grad_norm": 3.1139609813690186, "learning_rate": 0.0001656, "loss": 0.3949, "step": 860 }, { "epoch": 3.48, "grad_norm": 4.945987224578857, "learning_rate": 0.0001652, "loss": 0.3701, "step": 870 }, { "epoch": 3.52, "grad_norm": 1.7310290336608887, "learning_rate": 0.0001648, "loss": 0.2879, "step": 880 }, { "epoch": 3.56, "grad_norm": 5.650954246520996, "learning_rate": 0.0001644, "loss": 0.4443, "step": 890 }, { "epoch": 3.6, "grad_norm": 2.8908536434173584, "learning_rate": 0.000164, "loss": 0.3503, "step": 900 }, { "epoch": 3.6, "eval_accuracy": 0.743, "eval_loss": 0.6263792514801025, "eval_runtime": 5.7324, "eval_samples_per_second": 174.449, "eval_steps_per_second": 21.806, "step": 900 }, { "epoch": 3.64, "grad_norm": 1.3399735689163208, "learning_rate": 0.0001636, "loss": 0.3594, "step": 910 }, { "epoch": 3.68, "grad_norm": 1.3512450456619263, "learning_rate": 0.0001632, "loss": 0.31, "step": 920 }, { "epoch": 3.7199999999999998, "grad_norm": 2.5052506923675537, "learning_rate": 0.0001628, "loss": 0.3234, "step": 930 }, { "epoch": 3.76, "grad_norm": 7.230278968811035, "learning_rate": 0.00016240000000000002, "loss": 0.3559, "step": 940 }, { "epoch": 3.8, "grad_norm": 1.464324712753296, "learning_rate": 0.000162, "loss": 0.2914, "step": 950 }, { "epoch": 3.84, "grad_norm": 3.140822649002075, "learning_rate": 0.00016160000000000002, "loss": 0.2292, "step": 960 }, { "epoch": 3.88, "grad_norm": 7.812990665435791, "learning_rate": 0.00016120000000000002, "loss": 0.401, "step": 970 }, { "epoch": 3.92, "grad_norm": 7.805127143859863, "learning_rate": 0.0001608, "loss": 0.4106, "step": 980 }, { "epoch": 3.96, "grad_norm": 5.645721435546875, "learning_rate": 0.00016040000000000002, "loss": 0.3844, "step": 990 }, { "epoch": 4.0, "grad_norm": 2.4538416862487793, "learning_rate": 0.00016, "loss": 0.3416, "step": 1000 }, { "epoch": 4.0, "eval_accuracy": 0.785, "eval_loss": 0.583153247833252, "eval_runtime": 5.857, "eval_samples_per_second": 170.736, "eval_steps_per_second": 21.342, "step": 1000 }, { "epoch": 4.04, "grad_norm": 3.2518820762634277, "learning_rate": 0.0001596, "loss": 0.1471, "step": 1010 }, { "epoch": 4.08, "grad_norm": 5.489699840545654, "learning_rate": 0.00015920000000000002, "loss": 0.1807, "step": 1020 }, { "epoch": 4.12, "grad_norm": 0.6528148651123047, "learning_rate": 0.0001588, "loss": 0.189, "step": 1030 }, { "epoch": 4.16, "grad_norm": 5.080958843231201, "learning_rate": 0.00015840000000000003, "loss": 0.246, "step": 1040 }, { "epoch": 4.2, "grad_norm": 2.6048026084899902, "learning_rate": 0.00015800000000000002, "loss": 0.1937, "step": 1050 }, { "epoch": 4.24, "grad_norm": 2.226956844329834, "learning_rate": 0.0001576, "loss": 0.1781, "step": 1060 }, { "epoch": 4.28, "grad_norm": 1.7226955890655518, "learning_rate": 0.00015720000000000003, "loss": 0.2304, "step": 1070 }, { "epoch": 4.32, "grad_norm": 4.231192111968994, "learning_rate": 0.00015680000000000002, "loss": 0.1828, "step": 1080 }, { "epoch": 4.36, "grad_norm": 2.015580177307129, "learning_rate": 0.0001564, "loss": 0.2052, "step": 1090 }, { "epoch": 4.4, "grad_norm": 1.2273974418640137, "learning_rate": 0.00015600000000000002, "loss": 0.1427, "step": 1100 }, { "epoch": 4.4, "eval_accuracy": 0.769, "eval_loss": 0.7296926975250244, "eval_runtime": 5.2856, "eval_samples_per_second": 189.193, "eval_steps_per_second": 23.649, "step": 1100 }, { "epoch": 4.44, "grad_norm": 12.377334594726562, "learning_rate": 0.00015560000000000001, "loss": 0.2476, "step": 1110 }, { "epoch": 4.48, "grad_norm": 5.310695171356201, "learning_rate": 0.0001552, "loss": 0.2956, "step": 1120 }, { "epoch": 4.52, "grad_norm": 1.6223011016845703, "learning_rate": 0.00015480000000000002, "loss": 0.3322, "step": 1130 }, { "epoch": 4.5600000000000005, "grad_norm": 2.9505221843719482, "learning_rate": 0.0001544, "loss": 0.2848, "step": 1140 }, { "epoch": 4.6, "grad_norm": 8.578746795654297, "learning_rate": 0.000154, "loss": 0.1866, "step": 1150 }, { "epoch": 4.64, "grad_norm": 3.874375820159912, "learning_rate": 0.00015360000000000002, "loss": 0.2218, "step": 1160 }, { "epoch": 4.68, "grad_norm": 2.2798233032226562, "learning_rate": 0.0001532, "loss": 0.2767, "step": 1170 }, { "epoch": 4.72, "grad_norm": 1.274636149406433, "learning_rate": 0.0001528, "loss": 0.2295, "step": 1180 }, { "epoch": 4.76, "grad_norm": 4.265562534332275, "learning_rate": 0.00015240000000000002, "loss": 0.2062, "step": 1190 }, { "epoch": 4.8, "grad_norm": 3.9114277362823486, "learning_rate": 0.000152, "loss": 0.1982, "step": 1200 }, { "epoch": 4.8, "eval_accuracy": 0.73, "eval_loss": 0.7761247754096985, "eval_runtime": 5.3281, "eval_samples_per_second": 187.684, "eval_steps_per_second": 23.46, "step": 1200 }, { "epoch": 4.84, "grad_norm": 3.885077953338623, "learning_rate": 0.0001516, "loss": 0.2723, "step": 1210 }, { "epoch": 4.88, "grad_norm": 3.1466147899627686, "learning_rate": 0.00015120000000000002, "loss": 0.1816, "step": 1220 }, { "epoch": 4.92, "grad_norm": 1.5894757509231567, "learning_rate": 0.0001508, "loss": 0.2388, "step": 1230 }, { "epoch": 4.96, "grad_norm": 2.17746901512146, "learning_rate": 0.0001504, "loss": 0.2346, "step": 1240 }, { "epoch": 5.0, "grad_norm": 4.807671546936035, "learning_rate": 0.00015000000000000001, "loss": 0.1019, "step": 1250 }, { "epoch": 5.04, "grad_norm": 0.5939441919326782, "learning_rate": 0.0001496, "loss": 0.2222, "step": 1260 }, { "epoch": 5.08, "grad_norm": 6.009125232696533, "learning_rate": 0.0001492, "loss": 0.1772, "step": 1270 }, { "epoch": 5.12, "grad_norm": 0.48813578486442566, "learning_rate": 0.0001488, "loss": 0.1592, "step": 1280 }, { "epoch": 5.16, "grad_norm": 2.24546480178833, "learning_rate": 0.0001484, "loss": 0.2064, "step": 1290 }, { "epoch": 5.2, "grad_norm": 1.3820618391036987, "learning_rate": 0.000148, "loss": 0.193, "step": 1300 }, { "epoch": 5.2, "eval_accuracy": 0.741, "eval_loss": 0.8467380404472351, "eval_runtime": 6.1473, "eval_samples_per_second": 162.672, "eval_steps_per_second": 20.334, "step": 1300 }, { "epoch": 5.24, "grad_norm": 1.7147108316421509, "learning_rate": 0.0001476, "loss": 0.1436, "step": 1310 }, { "epoch": 5.28, "grad_norm": 0.31574681401252747, "learning_rate": 0.0001472, "loss": 0.1127, "step": 1320 }, { "epoch": 5.32, "grad_norm": 0.05109202861785889, "learning_rate": 0.00014680000000000002, "loss": 0.0844, "step": 1330 }, { "epoch": 5.36, "grad_norm": 3.080343246459961, "learning_rate": 0.0001464, "loss": 0.1685, "step": 1340 }, { "epoch": 5.4, "grad_norm": 0.8458659648895264, "learning_rate": 0.000146, "loss": 0.0856, "step": 1350 }, { "epoch": 5.44, "grad_norm": 2.8861546516418457, "learning_rate": 0.00014560000000000002, "loss": 0.1465, "step": 1360 }, { "epoch": 5.48, "grad_norm": 0.9648165702819824, "learning_rate": 0.0001452, "loss": 0.1534, "step": 1370 }, { "epoch": 5.52, "grad_norm": 0.8869705200195312, "learning_rate": 0.0001448, "loss": 0.1499, "step": 1380 }, { "epoch": 5.5600000000000005, "grad_norm": 5.056468963623047, "learning_rate": 0.0001444, "loss": 0.3252, "step": 1390 }, { "epoch": 5.6, "grad_norm": 4.4490485191345215, "learning_rate": 0.000144, "loss": 0.1831, "step": 1400 }, { "epoch": 5.6, "eval_accuracy": 0.774, "eval_loss": 0.6975213885307312, "eval_runtime": 5.5981, "eval_samples_per_second": 178.633, "eval_steps_per_second": 22.329, "step": 1400 }, { "epoch": 5.64, "grad_norm": 5.352523326873779, "learning_rate": 0.0001436, "loss": 0.1022, "step": 1410 }, { "epoch": 5.68, "grad_norm": 0.33864977955818176, "learning_rate": 0.0001432, "loss": 0.2627, "step": 1420 }, { "epoch": 5.72, "grad_norm": 3.7317004203796387, "learning_rate": 0.0001428, "loss": 0.1334, "step": 1430 }, { "epoch": 5.76, "grad_norm": 3.7961854934692383, "learning_rate": 0.0001424, "loss": 0.1658, "step": 1440 }, { "epoch": 5.8, "grad_norm": 0.13848626613616943, "learning_rate": 0.000142, "loss": 0.069, "step": 1450 }, { "epoch": 5.84, "grad_norm": 4.271240234375, "learning_rate": 0.0001416, "loss": 0.1401, "step": 1460 }, { "epoch": 5.88, "grad_norm": 3.202373504638672, "learning_rate": 0.0001412, "loss": 0.1468, "step": 1470 }, { "epoch": 5.92, "grad_norm": 0.39670008420944214, "learning_rate": 0.0001408, "loss": 0.0742, "step": 1480 }, { "epoch": 5.96, "grad_norm": 3.4956166744232178, "learning_rate": 0.0001404, "loss": 0.0929, "step": 1490 }, { "epoch": 6.0, "grad_norm": 3.5953352451324463, "learning_rate": 0.00014, "loss": 0.2612, "step": 1500 }, { "epoch": 6.0, "eval_accuracy": 0.775, "eval_loss": 0.8718728423118591, "eval_runtime": 5.287, "eval_samples_per_second": 189.142, "eval_steps_per_second": 23.643, "step": 1500 }, { "epoch": 6.04, "grad_norm": 0.334338515996933, "learning_rate": 0.0001396, "loss": 0.0812, "step": 1510 }, { "epoch": 6.08, "grad_norm": 4.4882612228393555, "learning_rate": 0.0001392, "loss": 0.0565, "step": 1520 }, { "epoch": 6.12, "grad_norm": 6.175742149353027, "learning_rate": 0.00013879999999999999, "loss": 0.0966, "step": 1530 }, { "epoch": 6.16, "grad_norm": 3.152432680130005, "learning_rate": 0.0001384, "loss": 0.1777, "step": 1540 }, { "epoch": 6.2, "grad_norm": 0.0891793891787529, "learning_rate": 0.000138, "loss": 0.0406, "step": 1550 }, { "epoch": 6.24, "grad_norm": 11.810345649719238, "learning_rate": 0.00013759999999999998, "loss": 0.0862, "step": 1560 }, { "epoch": 6.28, "grad_norm": 0.12118436396121979, "learning_rate": 0.00013720000000000003, "loss": 0.0366, "step": 1570 }, { "epoch": 6.32, "grad_norm": 12.970296859741211, "learning_rate": 0.00013680000000000002, "loss": 0.0596, "step": 1580 }, { "epoch": 6.36, "grad_norm": 0.05544751510024071, "learning_rate": 0.0001364, "loss": 0.1192, "step": 1590 }, { "epoch": 6.4, "grad_norm": 0.16358701884746552, "learning_rate": 0.00013600000000000003, "loss": 0.102, "step": 1600 }, { "epoch": 6.4, "eval_accuracy": 0.788, "eval_loss": 0.9045199155807495, "eval_runtime": 5.1633, "eval_samples_per_second": 193.676, "eval_steps_per_second": 24.21, "step": 1600 }, { "epoch": 6.44, "grad_norm": 2.016242742538452, "learning_rate": 0.00013560000000000002, "loss": 0.0633, "step": 1610 }, { "epoch": 6.48, "grad_norm": 0.19605639576911926, "learning_rate": 0.0001352, "loss": 0.0223, "step": 1620 }, { "epoch": 6.52, "grad_norm": 0.13813719153404236, "learning_rate": 0.00013480000000000002, "loss": 0.1615, "step": 1630 }, { "epoch": 6.5600000000000005, "grad_norm": 0.18716655671596527, "learning_rate": 0.00013440000000000001, "loss": 0.0248, "step": 1640 }, { "epoch": 6.6, "grad_norm": 0.14301587641239166, "learning_rate": 0.000134, "loss": 0.0507, "step": 1650 }, { "epoch": 6.64, "grad_norm": 0.10028518736362457, "learning_rate": 0.00013360000000000002, "loss": 0.0334, "step": 1660 }, { "epoch": 6.68, "grad_norm": 6.466274261474609, "learning_rate": 0.0001332, "loss": 0.0881, "step": 1670 }, { "epoch": 6.72, "grad_norm": 0.15147142112255096, "learning_rate": 0.0001328, "loss": 0.1162, "step": 1680 }, { "epoch": 6.76, "grad_norm": 0.06184665858745575, "learning_rate": 0.00013240000000000002, "loss": 0.085, "step": 1690 }, { "epoch": 6.8, "grad_norm": 9.578435897827148, "learning_rate": 0.000132, "loss": 0.1029, "step": 1700 }, { "epoch": 6.8, "eval_accuracy": 0.783, "eval_loss": 0.9655129313468933, "eval_runtime": 5.9259, "eval_samples_per_second": 168.75, "eval_steps_per_second": 21.094, "step": 1700 }, { "epoch": 6.84, "grad_norm": 9.377839088439941, "learning_rate": 0.0001316, "loss": 0.0559, "step": 1710 }, { "epoch": 6.88, "grad_norm": 9.196260452270508, "learning_rate": 0.00013120000000000002, "loss": 0.0919, "step": 1720 }, { "epoch": 6.92, "grad_norm": 0.6224649548530579, "learning_rate": 0.0001308, "loss": 0.0328, "step": 1730 }, { "epoch": 6.96, "grad_norm": 0.8349862098693848, "learning_rate": 0.0001304, "loss": 0.0837, "step": 1740 }, { "epoch": 7.0, "grad_norm": 7.744389057159424, "learning_rate": 0.00013000000000000002, "loss": 0.1493, "step": 1750 }, { "epoch": 7.04, "grad_norm": 19.332578659057617, "learning_rate": 0.0001296, "loss": 0.1517, "step": 1760 }, { "epoch": 7.08, "grad_norm": 9.477415084838867, "learning_rate": 0.00012920000000000002, "loss": 0.0612, "step": 1770 }, { "epoch": 7.12, "grad_norm": 0.04479631036520004, "learning_rate": 0.00012880000000000001, "loss": 0.0629, "step": 1780 }, { "epoch": 7.16, "grad_norm": 0.03189521282911301, "learning_rate": 0.0001284, "loss": 0.0544, "step": 1790 }, { "epoch": 7.2, "grad_norm": 0.04392462968826294, "learning_rate": 0.00012800000000000002, "loss": 0.0735, "step": 1800 }, { "epoch": 7.2, "eval_accuracy": 0.78, "eval_loss": 0.9905818700790405, "eval_runtime": 5.871, "eval_samples_per_second": 170.328, "eval_steps_per_second": 21.291, "step": 1800 }, { "epoch": 7.24, "grad_norm": 0.06359164416790009, "learning_rate": 0.0001276, "loss": 0.1629, "step": 1810 }, { "epoch": 7.28, "grad_norm": 1.7004547119140625, "learning_rate": 0.0001272, "loss": 0.1058, "step": 1820 }, { "epoch": 7.32, "grad_norm": 0.6779342293739319, "learning_rate": 0.00012680000000000002, "loss": 0.0404, "step": 1830 }, { "epoch": 7.36, "grad_norm": 0.050379179418087006, "learning_rate": 0.0001264, "loss": 0.0565, "step": 1840 }, { "epoch": 7.4, "grad_norm": 0.23603555560112, "learning_rate": 0.000126, "loss": 0.0316, "step": 1850 }, { "epoch": 7.44, "grad_norm": 0.030859237536787987, "learning_rate": 0.00012560000000000002, "loss": 0.0781, "step": 1860 }, { "epoch": 7.48, "grad_norm": 0.14111267030239105, "learning_rate": 0.0001252, "loss": 0.1127, "step": 1870 }, { "epoch": 7.52, "grad_norm": 5.0431084632873535, "learning_rate": 0.0001248, "loss": 0.0226, "step": 1880 }, { "epoch": 7.5600000000000005, "grad_norm": 0.1641249656677246, "learning_rate": 0.00012440000000000002, "loss": 0.0875, "step": 1890 }, { "epoch": 7.6, "grad_norm": 1.8398263454437256, "learning_rate": 0.000124, "loss": 0.0715, "step": 1900 }, { "epoch": 7.6, "eval_accuracy": 0.787, "eval_loss": 0.8893071413040161, "eval_runtime": 5.6004, "eval_samples_per_second": 178.559, "eval_steps_per_second": 22.32, "step": 1900 }, { "epoch": 7.64, "grad_norm": 11.241652488708496, "learning_rate": 0.0001236, "loss": 0.0418, "step": 1910 }, { "epoch": 7.68, "grad_norm": 1.0991768836975098, "learning_rate": 0.0001232, "loss": 0.0755, "step": 1920 }, { "epoch": 7.72, "grad_norm": 19.279315948486328, "learning_rate": 0.0001228, "loss": 0.2096, "step": 1930 }, { "epoch": 7.76, "grad_norm": 0.7445815801620483, "learning_rate": 0.0001224, "loss": 0.0395, "step": 1940 }, { "epoch": 7.8, "grad_norm": 0.07692953944206238, "learning_rate": 0.000122, "loss": 0.0305, "step": 1950 }, { "epoch": 7.84, "grad_norm": 0.1866171956062317, "learning_rate": 0.0001216, "loss": 0.0379, "step": 1960 }, { "epoch": 7.88, "grad_norm": 11.213932037353516, "learning_rate": 0.0001212, "loss": 0.0625, "step": 1970 }, { "epoch": 7.92, "grad_norm": 0.762454092502594, "learning_rate": 0.0001208, "loss": 0.0336, "step": 1980 }, { "epoch": 7.96, "grad_norm": 9.390318870544434, "learning_rate": 0.0001204, "loss": 0.0813, "step": 1990 }, { "epoch": 8.0, "grad_norm": 0.019455738365650177, "learning_rate": 0.00012, "loss": 0.1254, "step": 2000 }, { "epoch": 8.0, "eval_accuracy": 0.761, "eval_loss": 1.1220637559890747, "eval_runtime": 5.5365, "eval_samples_per_second": 180.619, "eval_steps_per_second": 22.577, "step": 2000 }, { "epoch": 8.04, "grad_norm": 0.1315418779850006, "learning_rate": 0.00011960000000000001, "loss": 0.1103, "step": 2010 }, { "epoch": 8.08, "grad_norm": 0.4731353521347046, "learning_rate": 0.0001192, "loss": 0.0696, "step": 2020 }, { "epoch": 8.12, "grad_norm": 2.56032133102417, "learning_rate": 0.0001188, "loss": 0.0488, "step": 2030 }, { "epoch": 8.16, "grad_norm": 1.2172237634658813, "learning_rate": 0.0001184, "loss": 0.031, "step": 2040 }, { "epoch": 8.2, "grad_norm": 0.04160616546869278, "learning_rate": 0.000118, "loss": 0.0332, "step": 2050 }, { "epoch": 8.24, "grad_norm": 0.02498347871005535, "learning_rate": 0.0001176, "loss": 0.0046, "step": 2060 }, { "epoch": 8.28, "grad_norm": 0.023147333413362503, "learning_rate": 0.0001172, "loss": 0.0354, "step": 2070 }, { "epoch": 8.32, "grad_norm": 24.763408660888672, "learning_rate": 0.00011679999999999999, "loss": 0.0182, "step": 2080 }, { "epoch": 8.36, "grad_norm": 0.1869715005159378, "learning_rate": 0.0001164, "loss": 0.0198, "step": 2090 }, { "epoch": 8.4, "grad_norm": 0.013836881145834923, "learning_rate": 0.000116, "loss": 0.021, "step": 2100 }, { "epoch": 8.4, "eval_accuracy": 0.779, "eval_loss": 1.1648471355438232, "eval_runtime": 5.2413, "eval_samples_per_second": 190.794, "eval_steps_per_second": 23.849, "step": 2100 }, { "epoch": 8.44, "grad_norm": 0.022183042019605637, "learning_rate": 0.00011559999999999999, "loss": 0.0551, "step": 2110 }, { "epoch": 8.48, "grad_norm": 7.832488536834717, "learning_rate": 0.0001152, "loss": 0.1164, "step": 2120 }, { "epoch": 8.52, "grad_norm": 0.02375981956720352, "learning_rate": 0.0001148, "loss": 0.0164, "step": 2130 }, { "epoch": 8.56, "grad_norm": 0.018566014245152473, "learning_rate": 0.0001144, "loss": 0.0155, "step": 2140 }, { "epoch": 8.6, "grad_norm": 2.625162124633789, "learning_rate": 0.00011399999999999999, "loss": 0.0702, "step": 2150 }, { "epoch": 8.64, "grad_norm": 0.022067397832870483, "learning_rate": 0.0001136, "loss": 0.0441, "step": 2160 }, { "epoch": 8.68, "grad_norm": 0.030472321435809135, "learning_rate": 0.0001132, "loss": 0.012, "step": 2170 }, { "epoch": 8.72, "grad_norm": 0.957785964012146, "learning_rate": 0.00011279999999999999, "loss": 0.0151, "step": 2180 }, { "epoch": 8.76, "grad_norm": 3.766643524169922, "learning_rate": 0.00011240000000000002, "loss": 0.0853, "step": 2190 }, { "epoch": 8.8, "grad_norm": 0.32392239570617676, "learning_rate": 0.00011200000000000001, "loss": 0.0133, "step": 2200 }, { "epoch": 8.8, "eval_accuracy": 0.806, "eval_loss": 0.9857252240180969, "eval_runtime": 5.2301, "eval_samples_per_second": 191.201, "eval_steps_per_second": 23.9, "step": 2200 }, { "epoch": 8.84, "grad_norm": 0.021277396008372307, "learning_rate": 0.00011160000000000002, "loss": 0.0314, "step": 2210 }, { "epoch": 8.88, "grad_norm": 13.449085235595703, "learning_rate": 0.00011120000000000002, "loss": 0.0629, "step": 2220 }, { "epoch": 8.92, "grad_norm": 0.02039525657892227, "learning_rate": 0.00011080000000000001, "loss": 0.0358, "step": 2230 }, { "epoch": 8.96, "grad_norm": 0.10957217961549759, "learning_rate": 0.00011040000000000001, "loss": 0.0379, "step": 2240 }, { "epoch": 9.0, "grad_norm": 0.016647523269057274, "learning_rate": 0.00011000000000000002, "loss": 0.0125, "step": 2250 }, { "epoch": 9.04, "grad_norm": 13.669293403625488, "learning_rate": 0.00010960000000000001, "loss": 0.035, "step": 2260 }, { "epoch": 9.08, "grad_norm": 0.012543587014079094, "learning_rate": 0.00010920000000000001, "loss": 0.031, "step": 2270 }, { "epoch": 9.12, "grad_norm": 0.011763609014451504, "learning_rate": 0.00010880000000000002, "loss": 0.0299, "step": 2280 }, { "epoch": 9.16, "grad_norm": 0.0155358100309968, "learning_rate": 0.00010840000000000002, "loss": 0.0339, "step": 2290 }, { "epoch": 9.2, "grad_norm": 0.2667885720729828, "learning_rate": 0.00010800000000000001, "loss": 0.0086, "step": 2300 }, { "epoch": 9.2, "eval_accuracy": 0.799, "eval_loss": 1.036458134651184, "eval_runtime": 6.0166, "eval_samples_per_second": 166.207, "eval_steps_per_second": 20.776, "step": 2300 }, { "epoch": 9.24, "grad_norm": 0.008514340966939926, "learning_rate": 0.00010760000000000001, "loss": 0.0177, "step": 2310 }, { "epoch": 9.28, "grad_norm": 0.033006951212882996, "learning_rate": 0.00010720000000000002, "loss": 0.0041, "step": 2320 }, { "epoch": 9.32, "grad_norm": 17.495553970336914, "learning_rate": 0.00010680000000000001, "loss": 0.0506, "step": 2330 }, { "epoch": 9.36, "grad_norm": 29.127471923828125, "learning_rate": 0.00010640000000000001, "loss": 0.0216, "step": 2340 }, { "epoch": 9.4, "grad_norm": 0.03006252832710743, "learning_rate": 0.00010600000000000002, "loss": 0.0273, "step": 2350 }, { "epoch": 9.44, "grad_norm": 0.021872328594326973, "learning_rate": 0.0001056, "loss": 0.0027, "step": 2360 }, { "epoch": 9.48, "grad_norm": 0.01040344312787056, "learning_rate": 0.00010520000000000001, "loss": 0.018, "step": 2370 }, { "epoch": 9.52, "grad_norm": 9.743282318115234, "learning_rate": 0.00010480000000000001, "loss": 0.1394, "step": 2380 }, { "epoch": 9.56, "grad_norm": 0.012420209124684334, "learning_rate": 0.0001044, "loss": 0.0257, "step": 2390 }, { "epoch": 9.6, "grad_norm": 0.21533189713954926, "learning_rate": 0.00010400000000000001, "loss": 0.0223, "step": 2400 }, { "epoch": 9.6, "eval_accuracy": 0.812, "eval_loss": 0.982612669467926, "eval_runtime": 5.176, "eval_samples_per_second": 193.201, "eval_steps_per_second": 24.15, "step": 2400 }, { "epoch": 9.64, "grad_norm": 0.02012082375586033, "learning_rate": 0.00010360000000000001, "loss": 0.0371, "step": 2410 }, { "epoch": 9.68, "grad_norm": 0.03367308899760246, "learning_rate": 0.0001032, "loss": 0.0053, "step": 2420 }, { "epoch": 9.72, "grad_norm": 0.9374167919158936, "learning_rate": 0.0001028, "loss": 0.058, "step": 2430 }, { "epoch": 9.76, "grad_norm": 0.010282558389008045, "learning_rate": 0.00010240000000000001, "loss": 0.0438, "step": 2440 }, { "epoch": 9.8, "grad_norm": 0.009324116632342339, "learning_rate": 0.00010200000000000001, "loss": 0.0028, "step": 2450 }, { "epoch": 9.84, "grad_norm": 0.009123372845351696, "learning_rate": 0.0001016, "loss": 0.1, "step": 2460 }, { "epoch": 9.88, "grad_norm": 0.010366716422140598, "learning_rate": 0.00010120000000000001, "loss": 0.1172, "step": 2470 }, { "epoch": 9.92, "grad_norm": 3.760880708694458, "learning_rate": 0.00010080000000000001, "loss": 0.005, "step": 2480 }, { "epoch": 9.96, "grad_norm": 0.033989980816841125, "learning_rate": 0.0001004, "loss": 0.0441, "step": 2490 }, { "epoch": 10.0, "grad_norm": 0.008832584135234356, "learning_rate": 0.0001, "loss": 0.0023, "step": 2500 }, { "epoch": 10.0, "eval_accuracy": 0.795, "eval_loss": 1.0696669816970825, "eval_runtime": 5.262, "eval_samples_per_second": 190.041, "eval_steps_per_second": 23.755, "step": 2500 }, { "epoch": 10.04, "grad_norm": 0.06487290561199188, "learning_rate": 9.960000000000001e-05, "loss": 0.0359, "step": 2510 }, { "epoch": 10.08, "grad_norm": 0.06183789297938347, "learning_rate": 9.92e-05, "loss": 0.0019, "step": 2520 }, { "epoch": 10.12, "grad_norm": 0.007254609372466803, "learning_rate": 9.88e-05, "loss": 0.0443, "step": 2530 }, { "epoch": 10.16, "grad_norm": 27.718936920166016, "learning_rate": 9.84e-05, "loss": 0.0429, "step": 2540 }, { "epoch": 10.2, "grad_norm": 0.007617979310452938, "learning_rate": 9.8e-05, "loss": 0.0034, "step": 2550 }, { "epoch": 10.24, "grad_norm": 0.42735064029693604, "learning_rate": 9.76e-05, "loss": 0.028, "step": 2560 }, { "epoch": 10.28, "grad_norm": 0.00830338429659605, "learning_rate": 9.72e-05, "loss": 0.0049, "step": 2570 }, { "epoch": 10.32, "grad_norm": 0.007815328426659107, "learning_rate": 9.680000000000001e-05, "loss": 0.0015, "step": 2580 }, { "epoch": 10.36, "grad_norm": 0.023354820907115936, "learning_rate": 9.64e-05, "loss": 0.0017, "step": 2590 }, { "epoch": 10.4, "grad_norm": 0.02857878804206848, "learning_rate": 9.6e-05, "loss": 0.0021, "step": 2600 }, { "epoch": 10.4, "eval_accuracy": 0.815, "eval_loss": 1.0489550828933716, "eval_runtime": 5.2508, "eval_samples_per_second": 190.447, "eval_steps_per_second": 23.806, "step": 2600 }, { "epoch": 10.44, "grad_norm": 0.020630279555916786, "learning_rate": 9.56e-05, "loss": 0.0337, "step": 2610 }, { "epoch": 10.48, "grad_norm": 0.2516959607601166, "learning_rate": 9.52e-05, "loss": 0.0099, "step": 2620 }, { "epoch": 10.52, "grad_norm": 0.09867813438177109, "learning_rate": 9.48e-05, "loss": 0.0258, "step": 2630 }, { "epoch": 10.56, "grad_norm": 0.005374481901526451, "learning_rate": 9.44e-05, "loss": 0.0014, "step": 2640 }, { "epoch": 10.6, "grad_norm": 0.012418796308338642, "learning_rate": 9.4e-05, "loss": 0.0052, "step": 2650 }, { "epoch": 10.64, "grad_norm": 0.006738652009516954, "learning_rate": 9.360000000000001e-05, "loss": 0.0015, "step": 2660 }, { "epoch": 10.68, "grad_norm": 3.031865358352661, "learning_rate": 9.320000000000002e-05, "loss": 0.0023, "step": 2670 }, { "epoch": 10.72, "grad_norm": 0.007892146706581116, "learning_rate": 9.28e-05, "loss": 0.0014, "step": 2680 }, { "epoch": 10.76, "grad_norm": 0.008096442557871342, "learning_rate": 9.240000000000001e-05, "loss": 0.0321, "step": 2690 }, { "epoch": 10.8, "grad_norm": 10.601773262023926, "learning_rate": 9.200000000000001e-05, "loss": 0.0401, "step": 2700 }, { "epoch": 10.8, "eval_accuracy": 0.8, "eval_loss": 1.1593725681304932, "eval_runtime": 5.7948, "eval_samples_per_second": 172.569, "eval_steps_per_second": 21.571, "step": 2700 }, { "epoch": 10.84, "grad_norm": 0.00845256820321083, "learning_rate": 9.16e-05, "loss": 0.0016, "step": 2710 }, { "epoch": 10.88, "grad_norm": 0.016398636624217033, "learning_rate": 9.120000000000001e-05, "loss": 0.0108, "step": 2720 }, { "epoch": 10.92, "grad_norm": 0.015316460281610489, "learning_rate": 9.080000000000001e-05, "loss": 0.0394, "step": 2730 }, { "epoch": 10.96, "grad_norm": 0.35788559913635254, "learning_rate": 9.04e-05, "loss": 0.0017, "step": 2740 }, { "epoch": 11.0, "grad_norm": 0.011767162941396236, "learning_rate": 9e-05, "loss": 0.0047, "step": 2750 }, { "epoch": 11.04, "grad_norm": 0.007368568331003189, "learning_rate": 8.960000000000001e-05, "loss": 0.0012, "step": 2760 }, { "epoch": 11.08, "grad_norm": 0.011666523292660713, "learning_rate": 8.92e-05, "loss": 0.0014, "step": 2770 }, { "epoch": 11.12, "grad_norm": 0.00521031254902482, "learning_rate": 8.88e-05, "loss": 0.0012, "step": 2780 }, { "epoch": 11.16, "grad_norm": 0.025624191388487816, "learning_rate": 8.840000000000001e-05, "loss": 0.0014, "step": 2790 }, { "epoch": 11.2, "grad_norm": 0.008872034959495068, "learning_rate": 8.800000000000001e-05, "loss": 0.0012, "step": 2800 }, { "epoch": 11.2, "eval_accuracy": 0.817, "eval_loss": 1.0810606479644775, "eval_runtime": 5.7202, "eval_samples_per_second": 174.82, "eval_steps_per_second": 21.853, "step": 2800 }, { "epoch": 11.24, "grad_norm": 0.006208410952240229, "learning_rate": 8.76e-05, "loss": 0.0014, "step": 2810 }, { "epoch": 11.28, "grad_norm": 0.006170503329485655, "learning_rate": 8.72e-05, "loss": 0.0011, "step": 2820 }, { "epoch": 11.32, "grad_norm": 0.005829779896885157, "learning_rate": 8.680000000000001e-05, "loss": 0.0015, "step": 2830 }, { "epoch": 11.36, "grad_norm": 0.0047014919109642506, "learning_rate": 8.64e-05, "loss": 0.001, "step": 2840 }, { "epoch": 11.4, "grad_norm": 0.01678207516670227, "learning_rate": 8.6e-05, "loss": 0.0019, "step": 2850 }, { "epoch": 11.44, "grad_norm": 0.004566165152937174, "learning_rate": 8.560000000000001e-05, "loss": 0.0014, "step": 2860 }, { "epoch": 11.48, "grad_norm": 0.005536119919270277, "learning_rate": 8.52e-05, "loss": 0.001, "step": 2870 }, { "epoch": 11.52, "grad_norm": 0.005134043283760548, "learning_rate": 8.48e-05, "loss": 0.001, "step": 2880 }, { "epoch": 11.56, "grad_norm": 0.005173493642359972, "learning_rate": 8.44e-05, "loss": 0.001, "step": 2890 }, { "epoch": 11.6, "grad_norm": 0.0057559204287827015, "learning_rate": 8.4e-05, "loss": 0.0034, "step": 2900 }, { "epoch": 11.6, "eval_accuracy": 0.825, "eval_loss": 1.0955668687820435, "eval_runtime": 5.4405, "eval_samples_per_second": 183.805, "eval_steps_per_second": 22.976, "step": 2900 }, { "epoch": 11.64, "grad_norm": 0.004013891331851482, "learning_rate": 8.36e-05, "loss": 0.0009, "step": 2910 }, { "epoch": 11.68, "grad_norm": 4.541440486907959, "learning_rate": 8.32e-05, "loss": 0.0744, "step": 2920 }, { "epoch": 11.72, "grad_norm": 0.004702790640294552, "learning_rate": 8.28e-05, "loss": 0.001, "step": 2930 }, { "epoch": 11.76, "grad_norm": 0.004574855323880911, "learning_rate": 8.24e-05, "loss": 0.0028, "step": 2940 }, { "epoch": 11.8, "grad_norm": 0.006892206612974405, "learning_rate": 8.2e-05, "loss": 0.0031, "step": 2950 }, { "epoch": 11.84, "grad_norm": 0.28937533497810364, "learning_rate": 8.16e-05, "loss": 0.002, "step": 2960 }, { "epoch": 11.88, "grad_norm": 0.004892553202807903, "learning_rate": 8.120000000000001e-05, "loss": 0.0009, "step": 2970 }, { "epoch": 11.92, "grad_norm": 0.00817884597927332, "learning_rate": 8.080000000000001e-05, "loss": 0.0012, "step": 2980 }, { "epoch": 11.96, "grad_norm": 0.007845502346754074, "learning_rate": 8.04e-05, "loss": 0.0169, "step": 2990 }, { "epoch": 12.0, "grad_norm": 0.004683198872953653, "learning_rate": 8e-05, "loss": 0.0012, "step": 3000 }, { "epoch": 12.0, "eval_accuracy": 0.808, "eval_loss": 1.200992226600647, "eval_runtime": 5.2017, "eval_samples_per_second": 192.245, "eval_steps_per_second": 24.031, "step": 3000 }, { "epoch": 12.04, "grad_norm": 0.004240355920046568, "learning_rate": 7.960000000000001e-05, "loss": 0.001, "step": 3010 }, { "epoch": 12.08, "grad_norm": 0.04097750037908554, "learning_rate": 7.920000000000001e-05, "loss": 0.001, "step": 3020 }, { "epoch": 12.12, "grad_norm": 0.036695607006549835, "learning_rate": 7.88e-05, "loss": 0.0009, "step": 3030 }, { "epoch": 12.16, "grad_norm": 0.003702098038047552, "learning_rate": 7.840000000000001e-05, "loss": 0.0009, "step": 3040 }, { "epoch": 12.2, "grad_norm": 10.573301315307617, "learning_rate": 7.800000000000001e-05, "loss": 0.0321, "step": 3050 }, { "epoch": 12.24, "grad_norm": 0.004409339744597673, "learning_rate": 7.76e-05, "loss": 0.0008, "step": 3060 }, { "epoch": 12.28, "grad_norm": 0.004956026095896959, "learning_rate": 7.72e-05, "loss": 0.0008, "step": 3070 }, { "epoch": 12.32, "grad_norm": 0.006153183523565531, "learning_rate": 7.680000000000001e-05, "loss": 0.0009, "step": 3080 }, { "epoch": 12.36, "grad_norm": 0.005817431956529617, "learning_rate": 7.64e-05, "loss": 0.0009, "step": 3090 }, { "epoch": 12.4, "grad_norm": 0.004047242924571037, "learning_rate": 7.6e-05, "loss": 0.0011, "step": 3100 }, { "epoch": 12.4, "eval_accuracy": 0.81, "eval_loss": 1.1712357997894287, "eval_runtime": 5.9454, "eval_samples_per_second": 168.198, "eval_steps_per_second": 21.025, "step": 3100 }, { "epoch": 12.44, "grad_norm": 0.05110397934913635, "learning_rate": 7.560000000000001e-05, "loss": 0.0009, "step": 3110 }, { "epoch": 12.48, "grad_norm": 0.005057331640273333, "learning_rate": 7.52e-05, "loss": 0.0008, "step": 3120 }, { "epoch": 12.52, "grad_norm": 0.00844608899205923, "learning_rate": 7.48e-05, "loss": 0.0008, "step": 3130 }, { "epoch": 12.56, "grad_norm": 0.004293734673410654, "learning_rate": 7.44e-05, "loss": 0.0011, "step": 3140 }, { "epoch": 12.6, "grad_norm": 0.004264953080564737, "learning_rate": 7.4e-05, "loss": 0.0008, "step": 3150 }, { "epoch": 12.64, "grad_norm": 0.021664146333932877, "learning_rate": 7.36e-05, "loss": 0.0432, "step": 3160 }, { "epoch": 12.68, "grad_norm": 0.004172308370471001, "learning_rate": 7.32e-05, "loss": 0.0008, "step": 3170 }, { "epoch": 12.72, "grad_norm": 0.006536141969263554, "learning_rate": 7.280000000000001e-05, "loss": 0.0008, "step": 3180 }, { "epoch": 12.76, "grad_norm": 0.0048880744725465775, "learning_rate": 7.24e-05, "loss": 0.0008, "step": 3190 }, { "epoch": 12.8, "grad_norm": 0.004492601379752159, "learning_rate": 7.2e-05, "loss": 0.0092, "step": 3200 }, { "epoch": 12.8, "eval_accuracy": 0.813, "eval_loss": 1.1813948154449463, "eval_runtime": 5.2063, "eval_samples_per_second": 192.074, "eval_steps_per_second": 24.009, "step": 3200 }, { "epoch": 12.84, "grad_norm": 0.003316229674965143, "learning_rate": 7.16e-05, "loss": 0.0008, "step": 3210 }, { "epoch": 12.88, "grad_norm": 0.004760683514177799, "learning_rate": 7.12e-05, "loss": 0.0007, "step": 3220 }, { "epoch": 12.92, "grad_norm": 0.004011666867882013, "learning_rate": 7.08e-05, "loss": 0.0007, "step": 3230 }, { "epoch": 12.96, "grad_norm": 0.004005063325166702, "learning_rate": 7.04e-05, "loss": 0.0362, "step": 3240 }, { "epoch": 13.0, "grad_norm": 0.0050388905219733715, "learning_rate": 7e-05, "loss": 0.0008, "step": 3250 }, { "epoch": 13.04, "grad_norm": 0.01386964414268732, "learning_rate": 6.96e-05, "loss": 0.0009, "step": 3260 }, { "epoch": 13.08, "grad_norm": 0.004433898255228996, "learning_rate": 6.92e-05, "loss": 0.0008, "step": 3270 }, { "epoch": 13.12, "grad_norm": 0.004408930893987417, "learning_rate": 6.879999999999999e-05, "loss": 0.0008, "step": 3280 }, { "epoch": 13.16, "grad_norm": 0.004361864645034075, "learning_rate": 6.840000000000001e-05, "loss": 0.0007, "step": 3290 }, { "epoch": 13.2, "grad_norm": 0.010670163668692112, "learning_rate": 6.800000000000001e-05, "loss": 0.0007, "step": 3300 }, { "epoch": 13.2, "eval_accuracy": 0.818, "eval_loss": 1.1677281856536865, "eval_runtime": 5.3663, "eval_samples_per_second": 186.347, "eval_steps_per_second": 23.293, "step": 3300 }, { "epoch": 13.24, "grad_norm": 0.004130377899855375, "learning_rate": 6.76e-05, "loss": 0.0007, "step": 3310 }, { "epoch": 13.28, "grad_norm": 0.0031264375429600477, "learning_rate": 6.720000000000001e-05, "loss": 0.0007, "step": 3320 }, { "epoch": 13.32, "grad_norm": 0.004546809010207653, "learning_rate": 6.680000000000001e-05, "loss": 0.0007, "step": 3330 }, { "epoch": 13.36, "grad_norm": 0.0035698998253792524, "learning_rate": 6.64e-05, "loss": 0.0007, "step": 3340 }, { "epoch": 13.4, "grad_norm": 0.004544632509350777, "learning_rate": 6.6e-05, "loss": 0.0007, "step": 3350 }, { "epoch": 13.44, "grad_norm": 0.0035377303138375282, "learning_rate": 6.560000000000001e-05, "loss": 0.0006, "step": 3360 }, { "epoch": 13.48, "grad_norm": 0.003488726681098342, "learning_rate": 6.52e-05, "loss": 0.0007, "step": 3370 }, { "epoch": 13.52, "grad_norm": 0.016187245026230812, "learning_rate": 6.48e-05, "loss": 0.0007, "step": 3380 }, { "epoch": 13.56, "grad_norm": 0.0037824937608093023, "learning_rate": 6.440000000000001e-05, "loss": 0.0006, "step": 3390 }, { "epoch": 13.6, "grad_norm": 0.0033098619896918535, "learning_rate": 6.400000000000001e-05, "loss": 0.0007, "step": 3400 }, { "epoch": 13.6, "eval_accuracy": 0.818, "eval_loss": 1.172300934791565, "eval_runtime": 5.2344, "eval_samples_per_second": 191.045, "eval_steps_per_second": 23.881, "step": 3400 }, { "epoch": 13.64, "grad_norm": 0.0036126759368926287, "learning_rate": 6.36e-05, "loss": 0.0007, "step": 3410 }, { "epoch": 13.68, "grad_norm": 0.0037607806734740734, "learning_rate": 6.32e-05, "loss": 0.0006, "step": 3420 }, { "epoch": 13.72, "grad_norm": 0.002861644374206662, "learning_rate": 6.280000000000001e-05, "loss": 0.0006, "step": 3430 }, { "epoch": 13.76, "grad_norm": 0.0034740378614515066, "learning_rate": 6.24e-05, "loss": 0.0006, "step": 3440 }, { "epoch": 13.8, "grad_norm": 0.0031694734934717417, "learning_rate": 6.2e-05, "loss": 0.0006, "step": 3450 }, { "epoch": 13.84, "grad_norm": 0.0038870801217854023, "learning_rate": 6.16e-05, "loss": 0.0006, "step": 3460 }, { "epoch": 13.88, "grad_norm": 0.0030017481185495853, "learning_rate": 6.12e-05, "loss": 0.0006, "step": 3470 }, { "epoch": 13.92, "grad_norm": 0.004931191448122263, "learning_rate": 6.08e-05, "loss": 0.0006, "step": 3480 }, { "epoch": 13.96, "grad_norm": 0.0034845368936657906, "learning_rate": 6.04e-05, "loss": 0.0008, "step": 3490 }, { "epoch": 14.0, "grad_norm": 0.003087324323132634, "learning_rate": 6e-05, "loss": 0.0006, "step": 3500 }, { "epoch": 14.0, "eval_accuracy": 0.821, "eval_loss": 1.1852285861968994, "eval_runtime": 5.8798, "eval_samples_per_second": 170.074, "eval_steps_per_second": 21.259, "step": 3500 }, { "epoch": 14.04, "grad_norm": 0.003496804041787982, "learning_rate": 5.96e-05, "loss": 0.0006, "step": 3510 }, { "epoch": 14.08, "grad_norm": 0.0027559709269553423, "learning_rate": 5.92e-05, "loss": 0.0006, "step": 3520 }, { "epoch": 14.12, "grad_norm": 0.0026562130078673363, "learning_rate": 5.88e-05, "loss": 0.0006, "step": 3530 }, { "epoch": 14.16, "grad_norm": 0.002727353246882558, "learning_rate": 5.8399999999999997e-05, "loss": 0.0006, "step": 3540 }, { "epoch": 14.2, "grad_norm": 0.02825564332306385, "learning_rate": 5.8e-05, "loss": 0.0006, "step": 3550 }, { "epoch": 14.24, "grad_norm": 0.0026983567513525486, "learning_rate": 5.76e-05, "loss": 0.0007, "step": 3560 }, { "epoch": 14.28, "grad_norm": 0.005061229225248098, "learning_rate": 5.72e-05, "loss": 0.0006, "step": 3570 }, { "epoch": 14.32, "grad_norm": 0.002966745523735881, "learning_rate": 5.68e-05, "loss": 0.0005, "step": 3580 }, { "epoch": 14.36, "grad_norm": 0.002947271103039384, "learning_rate": 5.6399999999999995e-05, "loss": 0.0006, "step": 3590 }, { "epoch": 14.4, "grad_norm": 0.0029176445677876472, "learning_rate": 5.6000000000000006e-05, "loss": 0.0005, "step": 3600 }, { "epoch": 14.4, "eval_accuracy": 0.82, "eval_loss": 1.1927635669708252, "eval_runtime": 5.2928, "eval_samples_per_second": 188.936, "eval_steps_per_second": 23.617, "step": 3600 }, { "epoch": 14.44, "grad_norm": 0.0033958940766751766, "learning_rate": 5.560000000000001e-05, "loss": 0.0005, "step": 3610 }, { "epoch": 14.48, "grad_norm": 0.003239579265937209, "learning_rate": 5.520000000000001e-05, "loss": 0.0006, "step": 3620 }, { "epoch": 14.52, "grad_norm": 0.002811953192576766, "learning_rate": 5.4800000000000004e-05, "loss": 0.0005, "step": 3630 }, { "epoch": 14.56, "grad_norm": 0.0023668641224503517, "learning_rate": 5.440000000000001e-05, "loss": 0.0005, "step": 3640 }, { "epoch": 14.6, "grad_norm": 0.0036032095085829496, "learning_rate": 5.4000000000000005e-05, "loss": 0.0005, "step": 3650 }, { "epoch": 14.64, "grad_norm": 0.0026333522982895374, "learning_rate": 5.360000000000001e-05, "loss": 0.0005, "step": 3660 }, { "epoch": 14.68, "grad_norm": 0.0027461478020995855, "learning_rate": 5.3200000000000006e-05, "loss": 0.0005, "step": 3670 }, { "epoch": 14.72, "grad_norm": 0.002880874089896679, "learning_rate": 5.28e-05, "loss": 0.0005, "step": 3680 }, { "epoch": 14.76, "grad_norm": 0.003313728841021657, "learning_rate": 5.2400000000000007e-05, "loss": 0.0005, "step": 3690 }, { "epoch": 14.8, "grad_norm": 0.002891551936045289, "learning_rate": 5.2000000000000004e-05, "loss": 0.0005, "step": 3700 }, { "epoch": 14.8, "eval_accuracy": 0.819, "eval_loss": 1.203009009361267, "eval_runtime": 5.2248, "eval_samples_per_second": 191.394, "eval_steps_per_second": 23.924, "step": 3700 }, { "epoch": 14.84, "grad_norm": 0.002585276961326599, "learning_rate": 5.16e-05, "loss": 0.0005, "step": 3710 }, { "epoch": 14.88, "grad_norm": 0.002555152401328087, "learning_rate": 5.1200000000000004e-05, "loss": 0.0005, "step": 3720 }, { "epoch": 14.92, "grad_norm": 0.0025339240673929453, "learning_rate": 5.08e-05, "loss": 0.0005, "step": 3730 }, { "epoch": 14.96, "grad_norm": 0.0028862394392490387, "learning_rate": 5.0400000000000005e-05, "loss": 0.0005, "step": 3740 }, { "epoch": 15.0, "grad_norm": 0.0030768855940550566, "learning_rate": 5e-05, "loss": 0.0005, "step": 3750 }, { "epoch": 15.04, "grad_norm": 0.0024443198926746845, "learning_rate": 4.96e-05, "loss": 0.0005, "step": 3760 }, { "epoch": 15.08, "grad_norm": 0.0026144583243876696, "learning_rate": 4.92e-05, "loss": 0.0005, "step": 3770 }, { "epoch": 15.12, "grad_norm": 0.002447100356221199, "learning_rate": 4.88e-05, "loss": 0.0005, "step": 3780 }, { "epoch": 15.16, "grad_norm": 0.003170330310240388, "learning_rate": 4.8400000000000004e-05, "loss": 0.0005, "step": 3790 }, { "epoch": 15.2, "grad_norm": 0.0029565359000116587, "learning_rate": 4.8e-05, "loss": 0.0005, "step": 3800 }, { "epoch": 15.2, "eval_accuracy": 0.818, "eval_loss": 1.2092506885528564, "eval_runtime": 5.9208, "eval_samples_per_second": 168.895, "eval_steps_per_second": 21.112, "step": 3800 }, { "epoch": 15.24, "grad_norm": 0.0024090250954031944, "learning_rate": 4.76e-05, "loss": 0.0005, "step": 3810 }, { "epoch": 15.28, "grad_norm": 0.0021153378766030073, "learning_rate": 4.72e-05, "loss": 0.0005, "step": 3820 }, { "epoch": 15.32, "grad_norm": 0.0027159815654158592, "learning_rate": 4.6800000000000006e-05, "loss": 0.0005, "step": 3830 }, { "epoch": 15.36, "grad_norm": 0.0028497877065092325, "learning_rate": 4.64e-05, "loss": 0.0005, "step": 3840 }, { "epoch": 15.4, "grad_norm": 0.0024237327743321657, "learning_rate": 4.600000000000001e-05, "loss": 0.0005, "step": 3850 }, { "epoch": 15.44, "grad_norm": 0.0031945749651640654, "learning_rate": 4.5600000000000004e-05, "loss": 0.0005, "step": 3860 }, { "epoch": 15.48, "grad_norm": 0.0026720704045146704, "learning_rate": 4.52e-05, "loss": 0.0005, "step": 3870 }, { "epoch": 15.52, "grad_norm": 0.0030961642041802406, "learning_rate": 4.4800000000000005e-05, "loss": 0.0005, "step": 3880 }, { "epoch": 15.56, "grad_norm": 0.002183083677664399, "learning_rate": 4.44e-05, "loss": 0.0005, "step": 3890 }, { "epoch": 15.6, "grad_norm": 0.0029423057567328215, "learning_rate": 4.4000000000000006e-05, "loss": 0.0005, "step": 3900 }, { "epoch": 15.6, "eval_accuracy": 0.818, "eval_loss": 1.2159565687179565, "eval_runtime": 5.3023, "eval_samples_per_second": 188.596, "eval_steps_per_second": 23.575, "step": 3900 }, { "epoch": 15.64, "grad_norm": 0.002488673897460103, "learning_rate": 4.36e-05, "loss": 0.0005, "step": 3910 }, { "epoch": 15.68, "grad_norm": 0.0022629755549132824, "learning_rate": 4.32e-05, "loss": 0.0005, "step": 3920 }, { "epoch": 15.72, "grad_norm": 0.0022780627477914095, "learning_rate": 4.2800000000000004e-05, "loss": 0.0004, "step": 3930 }, { "epoch": 15.76, "grad_norm": 0.002429110463708639, "learning_rate": 4.24e-05, "loss": 0.0005, "step": 3940 }, { "epoch": 15.8, "grad_norm": 0.003042061347514391, "learning_rate": 4.2e-05, "loss": 0.0004, "step": 3950 }, { "epoch": 15.84, "grad_norm": 0.0023479724768549204, "learning_rate": 4.16e-05, "loss": 0.0005, "step": 3960 }, { "epoch": 15.88, "grad_norm": 0.0032163311261683702, "learning_rate": 4.12e-05, "loss": 0.0005, "step": 3970 }, { "epoch": 15.92, "grad_norm": 0.0020656727720052004, "learning_rate": 4.08e-05, "loss": 0.0004, "step": 3980 }, { "epoch": 15.96, "grad_norm": 0.002548342105001211, "learning_rate": 4.0400000000000006e-05, "loss": 0.0005, "step": 3990 }, { "epoch": 16.0, "grad_norm": 0.001894648070447147, "learning_rate": 4e-05, "loss": 0.0004, "step": 4000 }, { "epoch": 16.0, "eval_accuracy": 0.819, "eval_loss": 1.2232067584991455, "eval_runtime": 5.3459, "eval_samples_per_second": 187.058, "eval_steps_per_second": 23.382, "step": 4000 }, { "epoch": 16.04, "grad_norm": 0.002135420450940728, "learning_rate": 3.960000000000001e-05, "loss": 0.0004, "step": 4010 }, { "epoch": 16.08, "grad_norm": 0.002859125379472971, "learning_rate": 3.9200000000000004e-05, "loss": 0.0004, "step": 4020 }, { "epoch": 16.12, "grad_norm": 0.002459143288433552, "learning_rate": 3.88e-05, "loss": 0.0004, "step": 4030 }, { "epoch": 16.16, "grad_norm": 0.0026586554013192654, "learning_rate": 3.8400000000000005e-05, "loss": 0.0004, "step": 4040 }, { "epoch": 16.2, "grad_norm": 0.0031422963365912437, "learning_rate": 3.8e-05, "loss": 0.0004, "step": 4050 }, { "epoch": 16.24, "grad_norm": 0.0020834060851484537, "learning_rate": 3.76e-05, "loss": 0.0004, "step": 4060 }, { "epoch": 16.28, "grad_norm": 0.00235546356998384, "learning_rate": 3.72e-05, "loss": 0.0004, "step": 4070 }, { "epoch": 16.32, "grad_norm": 0.0019886652007699013, "learning_rate": 3.68e-05, "loss": 0.0004, "step": 4080 }, { "epoch": 16.36, "grad_norm": 0.0024375116918236017, "learning_rate": 3.6400000000000004e-05, "loss": 0.0004, "step": 4090 }, { "epoch": 16.4, "grad_norm": 0.002063624793663621, "learning_rate": 3.6e-05, "loss": 0.0004, "step": 4100 }, { "epoch": 16.4, "eval_accuracy": 0.819, "eval_loss": 1.2302378416061401, "eval_runtime": 5.3045, "eval_samples_per_second": 188.52, "eval_steps_per_second": 23.565, "step": 4100 }, { "epoch": 16.44, "grad_norm": 0.0025726554449647665, "learning_rate": 3.56e-05, "loss": 0.0004, "step": 4110 }, { "epoch": 16.48, "grad_norm": 0.0025369401555508375, "learning_rate": 3.52e-05, "loss": 0.0004, "step": 4120 }, { "epoch": 16.52, "grad_norm": 0.0020231506787240505, "learning_rate": 3.48e-05, "loss": 0.0004, "step": 4130 }, { "epoch": 16.56, "grad_norm": 0.0021831688936799765, "learning_rate": 3.4399999999999996e-05, "loss": 0.0004, "step": 4140 }, { "epoch": 16.6, "grad_norm": 0.0023912801407277584, "learning_rate": 3.4000000000000007e-05, "loss": 0.0004, "step": 4150 }, { "epoch": 16.64, "grad_norm": 0.0023613148368895054, "learning_rate": 3.3600000000000004e-05, "loss": 0.0004, "step": 4160 }, { "epoch": 16.68, "grad_norm": 0.0024576077703386545, "learning_rate": 3.32e-05, "loss": 0.0004, "step": 4170 }, { "epoch": 16.72, "grad_norm": 0.002091441536322236, "learning_rate": 3.2800000000000004e-05, "loss": 0.0004, "step": 4180 }, { "epoch": 16.76, "grad_norm": 0.0024098132271319628, "learning_rate": 3.24e-05, "loss": 0.0004, "step": 4190 }, { "epoch": 16.8, "grad_norm": 0.0022078922484070063, "learning_rate": 3.2000000000000005e-05, "loss": 0.0004, "step": 4200 }, { "epoch": 16.8, "eval_accuracy": 0.819, "eval_loss": 1.2350249290466309, "eval_runtime": 5.4057, "eval_samples_per_second": 184.99, "eval_steps_per_second": 23.124, "step": 4200 }, { "epoch": 16.84, "grad_norm": 0.0018520988523960114, "learning_rate": 3.16e-05, "loss": 0.0004, "step": 4210 }, { "epoch": 16.88, "grad_norm": 0.00181812874507159, "learning_rate": 3.12e-05, "loss": 0.0004, "step": 4220 }, { "epoch": 16.92, "grad_norm": 0.0023901008535176516, "learning_rate": 3.08e-05, "loss": 0.0004, "step": 4230 }, { "epoch": 16.96, "grad_norm": 0.002264365553855896, "learning_rate": 3.04e-05, "loss": 0.0004, "step": 4240 }, { "epoch": 17.0, "grad_norm": 0.002284231362864375, "learning_rate": 3e-05, "loss": 0.0004, "step": 4250 }, { "epoch": 17.04, "grad_norm": 0.0025141683872789145, "learning_rate": 2.96e-05, "loss": 0.0004, "step": 4260 }, { "epoch": 17.08, "grad_norm": 0.0023226479534059763, "learning_rate": 2.9199999999999998e-05, "loss": 0.0004, "step": 4270 }, { "epoch": 17.12, "grad_norm": 0.002532349433749914, "learning_rate": 2.88e-05, "loss": 0.0004, "step": 4280 }, { "epoch": 17.16, "grad_norm": 0.0018973592668771744, "learning_rate": 2.84e-05, "loss": 0.0004, "step": 4290 }, { "epoch": 17.2, "grad_norm": 0.0026193023659288883, "learning_rate": 2.8000000000000003e-05, "loss": 0.0004, "step": 4300 }, { "epoch": 17.2, "eval_accuracy": 0.82, "eval_loss": 1.2400437593460083, "eval_runtime": 5.314, "eval_samples_per_second": 188.181, "eval_steps_per_second": 23.523, "step": 4300 }, { "epoch": 17.24, "grad_norm": 0.001868457649834454, "learning_rate": 2.7600000000000003e-05, "loss": 0.0004, "step": 4310 }, { "epoch": 17.28, "grad_norm": 0.0027018135879188776, "learning_rate": 2.7200000000000004e-05, "loss": 0.0004, "step": 4320 }, { "epoch": 17.32, "grad_norm": 0.0023467419669032097, "learning_rate": 2.6800000000000004e-05, "loss": 0.0004, "step": 4330 }, { "epoch": 17.36, "grad_norm": 0.002205600030720234, "learning_rate": 2.64e-05, "loss": 0.0004, "step": 4340 }, { "epoch": 17.4, "grad_norm": 0.0021359582897275686, "learning_rate": 2.6000000000000002e-05, "loss": 0.0004, "step": 4350 }, { "epoch": 17.44, "grad_norm": 0.0024898636620491743, "learning_rate": 2.5600000000000002e-05, "loss": 0.0004, "step": 4360 }, { "epoch": 17.48, "grad_norm": 0.0017528502503409982, "learning_rate": 2.5200000000000003e-05, "loss": 0.0004, "step": 4370 }, { "epoch": 17.52, "grad_norm": 0.0022504818625748158, "learning_rate": 2.48e-05, "loss": 0.0004, "step": 4380 }, { "epoch": 17.56, "grad_norm": 0.00209965487010777, "learning_rate": 2.44e-05, "loss": 0.0004, "step": 4390 }, { "epoch": 17.6, "grad_norm": 0.0020423857495188713, "learning_rate": 2.4e-05, "loss": 0.0004, "step": 4400 }, { "epoch": 17.6, "eval_accuracy": 0.821, "eval_loss": 1.2442196607589722, "eval_runtime": 5.4679, "eval_samples_per_second": 182.887, "eval_steps_per_second": 22.861, "step": 4400 }, { "epoch": 17.64, "grad_norm": 0.0022337392438203096, "learning_rate": 2.36e-05, "loss": 0.0004, "step": 4410 }, { "epoch": 17.68, "grad_norm": 0.0018173023127019405, "learning_rate": 2.32e-05, "loss": 0.0004, "step": 4420 }, { "epoch": 17.72, "grad_norm": 0.002295331796631217, "learning_rate": 2.2800000000000002e-05, "loss": 0.0004, "step": 4430 }, { "epoch": 17.76, "grad_norm": 0.001931387116201222, "learning_rate": 2.2400000000000002e-05, "loss": 0.0004, "step": 4440 }, { "epoch": 17.8, "grad_norm": 0.0019796870183199644, "learning_rate": 2.2000000000000003e-05, "loss": 0.0004, "step": 4450 }, { "epoch": 17.84, "grad_norm": 0.0016401337925344706, "learning_rate": 2.16e-05, "loss": 0.0004, "step": 4460 }, { "epoch": 17.88, "grad_norm": 0.001987120369449258, "learning_rate": 2.12e-05, "loss": 0.0004, "step": 4470 }, { "epoch": 17.92, "grad_norm": 0.002337078796699643, "learning_rate": 2.08e-05, "loss": 0.0004, "step": 4480 }, { "epoch": 17.96, "grad_norm": 0.0024157376028597355, "learning_rate": 2.04e-05, "loss": 0.0004, "step": 4490 }, { "epoch": 18.0, "grad_norm": 0.0021758731454610825, "learning_rate": 2e-05, "loss": 0.0004, "step": 4500 }, { "epoch": 18.0, "eval_accuracy": 0.821, "eval_loss": 1.2483272552490234, "eval_runtime": 5.8485, "eval_samples_per_second": 170.983, "eval_steps_per_second": 21.373, "step": 4500 }, { "epoch": 18.04, "grad_norm": 0.002017402555793524, "learning_rate": 1.9600000000000002e-05, "loss": 0.0004, "step": 4510 }, { "epoch": 18.08, "grad_norm": 0.0020504500716924667, "learning_rate": 1.9200000000000003e-05, "loss": 0.0004, "step": 4520 }, { "epoch": 18.12, "grad_norm": 0.0019253026694059372, "learning_rate": 1.88e-05, "loss": 0.0004, "step": 4530 }, { "epoch": 18.16, "grad_norm": 0.0020694080740213394, "learning_rate": 1.84e-05, "loss": 0.0004, "step": 4540 }, { "epoch": 18.2, "grad_norm": 0.00169233581982553, "learning_rate": 1.8e-05, "loss": 0.0004, "step": 4550 }, { "epoch": 18.24, "grad_norm": 0.0019015150610357523, "learning_rate": 1.76e-05, "loss": 0.0004, "step": 4560 }, { "epoch": 18.28, "grad_norm": 0.002108811168000102, "learning_rate": 1.7199999999999998e-05, "loss": 0.0004, "step": 4570 }, { "epoch": 18.32, "grad_norm": 0.0021513872779905796, "learning_rate": 1.6800000000000002e-05, "loss": 0.0004, "step": 4580 }, { "epoch": 18.36, "grad_norm": 0.0019856479484587908, "learning_rate": 1.6400000000000002e-05, "loss": 0.0004, "step": 4590 }, { "epoch": 18.4, "grad_norm": 0.0020635281689465046, "learning_rate": 1.6000000000000003e-05, "loss": 0.0004, "step": 4600 }, { "epoch": 18.4, "eval_accuracy": 0.821, "eval_loss": 1.25178062915802, "eval_runtime": 5.3158, "eval_samples_per_second": 188.119, "eval_steps_per_second": 23.515, "step": 4600 }, { "epoch": 18.44, "grad_norm": 0.0020957705564796925, "learning_rate": 1.56e-05, "loss": 0.0004, "step": 4610 }, { "epoch": 18.48, "grad_norm": 0.002100448589771986, "learning_rate": 1.52e-05, "loss": 0.0004, "step": 4620 }, { "epoch": 18.52, "grad_norm": 0.0017688070656731725, "learning_rate": 1.48e-05, "loss": 0.0004, "step": 4630 }, { "epoch": 18.56, "grad_norm": 0.0018529172521084547, "learning_rate": 1.44e-05, "loss": 0.0004, "step": 4640 }, { "epoch": 18.6, "grad_norm": 0.0016175085911527276, "learning_rate": 1.4000000000000001e-05, "loss": 0.0004, "step": 4650 }, { "epoch": 18.64, "grad_norm": 0.001927876495756209, "learning_rate": 1.3600000000000002e-05, "loss": 0.0004, "step": 4660 }, { "epoch": 18.68, "grad_norm": 0.00210273964330554, "learning_rate": 1.32e-05, "loss": 0.0004, "step": 4670 }, { "epoch": 18.72, "grad_norm": 0.002222416689619422, "learning_rate": 1.2800000000000001e-05, "loss": 0.0004, "step": 4680 }, { "epoch": 18.76, "grad_norm": 0.0018316478235647082, "learning_rate": 1.24e-05, "loss": 0.0004, "step": 4690 }, { "epoch": 18.8, "grad_norm": 0.0019819007720798254, "learning_rate": 1.2e-05, "loss": 0.0004, "step": 4700 }, { "epoch": 18.8, "eval_accuracy": 0.821, "eval_loss": 1.254600167274475, "eval_runtime": 5.2417, "eval_samples_per_second": 190.777, "eval_steps_per_second": 23.847, "step": 4700 }, { "epoch": 18.84, "grad_norm": 0.0016140459338203073, "learning_rate": 1.16e-05, "loss": 0.0003, "step": 4710 }, { "epoch": 18.88, "grad_norm": 0.0018461669096723199, "learning_rate": 1.1200000000000001e-05, "loss": 0.0003, "step": 4720 }, { "epoch": 18.92, "grad_norm": 0.0018047132762148976, "learning_rate": 1.08e-05, "loss": 0.0003, "step": 4730 }, { "epoch": 18.96, "grad_norm": 0.0021749059669673443, "learning_rate": 1.04e-05, "loss": 0.0004, "step": 4740 }, { "epoch": 19.0, "grad_norm": 0.002118439180776477, "learning_rate": 1e-05, "loss": 0.0004, "step": 4750 }, { "epoch": 19.04, "grad_norm": 0.0019508595578372478, "learning_rate": 9.600000000000001e-06, "loss": 0.0004, "step": 4760 }, { "epoch": 19.08, "grad_norm": 0.002842509187757969, "learning_rate": 9.2e-06, "loss": 0.0004, "step": 4770 }, { "epoch": 19.12, "grad_norm": 0.0016302353469654918, "learning_rate": 8.8e-06, "loss": 0.0003, "step": 4780 }, { "epoch": 19.16, "grad_norm": 0.001962054753676057, "learning_rate": 8.400000000000001e-06, "loss": 0.0003, "step": 4790 }, { "epoch": 19.2, "grad_norm": 0.0021732356399297714, "learning_rate": 8.000000000000001e-06, "loss": 0.0004, "step": 4800 }, { "epoch": 19.2, "eval_accuracy": 0.821, "eval_loss": 1.2561498880386353, "eval_runtime": 10.3614, "eval_samples_per_second": 96.512, "eval_steps_per_second": 12.064, "step": 4800 }, { "epoch": 19.24, "grad_norm": 0.0018920012516900897, "learning_rate": 7.6e-06, "loss": 0.0004, "step": 4810 }, { "epoch": 19.28, "grad_norm": 0.002033686963841319, "learning_rate": 7.2e-06, "loss": 0.0003, "step": 4820 }, { "epoch": 19.32, "grad_norm": 0.0019939634948968887, "learning_rate": 6.800000000000001e-06, "loss": 0.0004, "step": 4830 }, { "epoch": 19.36, "grad_norm": 0.0024624329525977373, "learning_rate": 6.4000000000000006e-06, "loss": 0.0004, "step": 4840 }, { "epoch": 19.4, "grad_norm": 0.002102550817653537, "learning_rate": 6e-06, "loss": 0.0004, "step": 4850 }, { "epoch": 19.44, "grad_norm": 0.0017401843797415495, "learning_rate": 5.600000000000001e-06, "loss": 0.0004, "step": 4860 }, { "epoch": 19.48, "grad_norm": 0.0016316601540893316, "learning_rate": 5.2e-06, "loss": 0.0003, "step": 4870 }, { "epoch": 19.52, "grad_norm": 0.0024664264637976885, "learning_rate": 4.800000000000001e-06, "loss": 0.0003, "step": 4880 }, { "epoch": 19.56, "grad_norm": 0.0017682453617453575, "learning_rate": 4.4e-06, "loss": 0.0004, "step": 4890 }, { "epoch": 19.6, "grad_norm": 0.0019737447146326303, "learning_rate": 4.000000000000001e-06, "loss": 0.0004, "step": 4900 }, { "epoch": 19.6, "eval_accuracy": 0.82, "eval_loss": 1.2573806047439575, "eval_runtime": 5.3728, "eval_samples_per_second": 186.124, "eval_steps_per_second": 23.265, "step": 4900 }, { "epoch": 19.64, "grad_norm": 0.002168442588299513, "learning_rate": 3.6e-06, "loss": 0.0003, "step": 4910 }, { "epoch": 19.68, "grad_norm": 0.0016441222978755832, "learning_rate": 3.2000000000000003e-06, "loss": 0.0003, "step": 4920 }, { "epoch": 19.72, "grad_norm": 0.0016924934461712837, "learning_rate": 2.8000000000000003e-06, "loss": 0.0003, "step": 4930 }, { "epoch": 19.76, "grad_norm": 0.0026684573385864496, "learning_rate": 2.4000000000000003e-06, "loss": 0.0004, "step": 4940 }, { "epoch": 19.8, "grad_norm": 0.002037678612396121, "learning_rate": 2.0000000000000003e-06, "loss": 0.0004, "step": 4950 }, { "epoch": 19.84, "grad_norm": 0.0019053890137001872, "learning_rate": 1.6000000000000001e-06, "loss": 0.0004, "step": 4960 }, { "epoch": 19.88, "grad_norm": 0.002319651423022151, "learning_rate": 1.2000000000000002e-06, "loss": 0.0003, "step": 4970 }, { "epoch": 19.92, "grad_norm": 0.0018768333829939365, "learning_rate": 8.000000000000001e-07, "loss": 0.0004, "step": 4980 }, { "epoch": 19.96, "grad_norm": 0.0020151715725660324, "learning_rate": 4.0000000000000003e-07, "loss": 0.0004, "step": 4990 }, { "epoch": 20.0, "grad_norm": 0.0020873937755823135, "learning_rate": 0.0, "loss": 0.0004, "step": 5000 }, { "epoch": 20.0, "eval_accuracy": 0.82, "eval_loss": 1.257745385169983, "eval_runtime": 5.3256, "eval_samples_per_second": 187.773, "eval_steps_per_second": 23.472, "step": 5000 }, { "epoch": 20.0, "step": 5000, "total_flos": 6.19947029495808e+18, "train_loss": 0.16214322472065687, "train_runtime": 2178.5774, "train_samples_per_second": 36.721, "train_steps_per_second": 2.295 } ], "logging_steps": 10, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.19947029495808e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }