{ "best_metric": 0.4731413722038269, "best_model_checkpoint": "./vit-base-beans-demo-v5/checkpoint-2200", "epoch": 4.0, "eval_steps": 100, "global_step": 3568, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 1.2656996250152588, "learning_rate": 0.00019943946188340808, "loss": 1.7641, "step": 10 }, { "epoch": 0.02, "grad_norm": 1.702761173248291, "learning_rate": 0.00019887892376681615, "loss": 1.5342, "step": 20 }, { "epoch": 0.03, "grad_norm": 1.4954216480255127, "learning_rate": 0.00019831838565022422, "loss": 1.2414, "step": 30 }, { "epoch": 0.04, "grad_norm": 1.647926926612854, "learning_rate": 0.0001977578475336323, "loss": 1.1185, "step": 40 }, { "epoch": 0.06, "grad_norm": 2.192228078842163, "learning_rate": 0.00019719730941704039, "loss": 1.1626, "step": 50 }, { "epoch": 0.07, "grad_norm": 2.8315672874450684, "learning_rate": 0.00019663677130044843, "loss": 1.1362, "step": 60 }, { "epoch": 0.08, "grad_norm": 2.4982831478118896, "learning_rate": 0.0001960762331838565, "loss": 1.0978, "step": 70 }, { "epoch": 0.09, "grad_norm": 3.8782095909118652, "learning_rate": 0.0001955156950672646, "loss": 1.0034, "step": 80 }, { "epoch": 0.1, "grad_norm": 2.299464464187622, "learning_rate": 0.00019495515695067267, "loss": 0.9887, "step": 90 }, { "epoch": 0.11, "grad_norm": 2.7226407527923584, "learning_rate": 0.0001943946188340807, "loss": 0.8874, "step": 100 }, { "epoch": 0.11, "eval_accuracy": 0.6571428571428571, "eval_loss": 0.9671773314476013, "eval_runtime": 129.3363, "eval_samples_per_second": 54.934, "eval_steps_per_second": 3.441, "step": 100 }, { "epoch": 0.12, "grad_norm": 1.4987040758132935, "learning_rate": 0.0001938340807174888, "loss": 0.8796, "step": 110 }, { "epoch": 0.13, "grad_norm": 1.5715798139572144, "learning_rate": 0.00019327354260089688, "loss": 0.9511, "step": 120 }, { "epoch": 0.15, "grad_norm": 1.1380341053009033, "learning_rate": 0.00019271300448430495, "loss": 0.8983, "step": 130 }, { "epoch": 0.16, "grad_norm": 2.5433120727539062, "learning_rate": 0.00019215246636771302, "loss": 0.8984, "step": 140 }, { "epoch": 0.17, "grad_norm": 2.033275604248047, "learning_rate": 0.00019159192825112109, "loss": 0.824, "step": 150 }, { "epoch": 0.18, "grad_norm": 2.4264872074127197, "learning_rate": 0.00019103139013452916, "loss": 0.8358, "step": 160 }, { "epoch": 0.19, "grad_norm": 1.856844186782837, "learning_rate": 0.00019047085201793723, "loss": 0.8706, "step": 170 }, { "epoch": 0.2, "grad_norm": 2.4621365070343018, "learning_rate": 0.0001899103139013453, "loss": 0.8521, "step": 180 }, { "epoch": 0.21, "grad_norm": 1.4839106798171997, "learning_rate": 0.00018934977578475337, "loss": 0.8314, "step": 190 }, { "epoch": 0.22, "grad_norm": 2.008270502090454, "learning_rate": 0.00018878923766816143, "loss": 0.7806, "step": 200 }, { "epoch": 0.22, "eval_accuracy": 0.7096410978184378, "eval_loss": 0.8030331134796143, "eval_runtime": 127.3215, "eval_samples_per_second": 55.804, "eval_steps_per_second": 3.495, "step": 200 }, { "epoch": 0.24, "grad_norm": 2.759716272354126, "learning_rate": 0.00018822869955156953, "loss": 0.7414, "step": 210 }, { "epoch": 0.25, "grad_norm": 2.2896201610565186, "learning_rate": 0.00018766816143497757, "loss": 0.7788, "step": 220 }, { "epoch": 0.26, "grad_norm": 1.522684097290039, "learning_rate": 0.00018710762331838564, "loss": 0.7765, "step": 230 }, { "epoch": 0.27, "grad_norm": 2.6483938694000244, "learning_rate": 0.00018654708520179374, "loss": 0.8428, "step": 240 }, { "epoch": 0.28, "grad_norm": 1.5204224586486816, "learning_rate": 0.0001859865470852018, "loss": 0.7756, "step": 250 }, { "epoch": 0.29, "grad_norm": 1.352989912033081, "learning_rate": 0.00018542600896860985, "loss": 0.8085, "step": 260 }, { "epoch": 0.3, "grad_norm": 3.1435911655426025, "learning_rate": 0.00018486547085201795, "loss": 0.8336, "step": 270 }, { "epoch": 0.31, "grad_norm": 1.863233208656311, "learning_rate": 0.00018430493273542602, "loss": 0.7351, "step": 280 }, { "epoch": 0.33, "grad_norm": 1.6672592163085938, "learning_rate": 0.0001837443946188341, "loss": 0.7169, "step": 290 }, { "epoch": 0.34, "grad_norm": 1.852493405342102, "learning_rate": 0.00018318385650224216, "loss": 0.7404, "step": 300 }, { "epoch": 0.34, "eval_accuracy": 0.696551724137931, "eval_loss": 0.8076898455619812, "eval_runtime": 127.361, "eval_samples_per_second": 55.786, "eval_steps_per_second": 3.494, "step": 300 }, { "epoch": 0.35, "grad_norm": 2.066432476043701, "learning_rate": 0.00018262331838565023, "loss": 0.7422, "step": 310 }, { "epoch": 0.36, "grad_norm": 1.5433382987976074, "learning_rate": 0.0001820627802690583, "loss": 0.7509, "step": 320 }, { "epoch": 0.37, "grad_norm": 2.6319210529327393, "learning_rate": 0.00018150224215246637, "loss": 0.7168, "step": 330 }, { "epoch": 0.38, "grad_norm": 1.8435016870498657, "learning_rate": 0.00018094170403587444, "loss": 0.7421, "step": 340 }, { "epoch": 0.39, "grad_norm": 1.946035623550415, "learning_rate": 0.0001803811659192825, "loss": 0.7062, "step": 350 }, { "epoch": 0.4, "grad_norm": 1.926882028579712, "learning_rate": 0.0001798206278026906, "loss": 0.7228, "step": 360 }, { "epoch": 0.41, "grad_norm": 3.0743374824523926, "learning_rate": 0.00017926008968609868, "loss": 0.7722, "step": 370 }, { "epoch": 0.43, "grad_norm": 1.2202589511871338, "learning_rate": 0.00017869955156950672, "loss": 0.7334, "step": 380 }, { "epoch": 0.44, "grad_norm": 1.696330189704895, "learning_rate": 0.00017813901345291482, "loss": 0.7306, "step": 390 }, { "epoch": 0.45, "grad_norm": 2.4689078330993652, "learning_rate": 0.0001775784753363229, "loss": 0.7224, "step": 400 }, { "epoch": 0.45, "eval_accuracy": 0.741027445460943, "eval_loss": 0.6990236043930054, "eval_runtime": 127.2157, "eval_samples_per_second": 55.85, "eval_steps_per_second": 3.498, "step": 400 }, { "epoch": 0.46, "grad_norm": 1.5020831823349, "learning_rate": 0.00017701793721973096, "loss": 0.7291, "step": 410 }, { "epoch": 0.47, "grad_norm": 2.5034234523773193, "learning_rate": 0.00017645739910313903, "loss": 0.6754, "step": 420 }, { "epoch": 0.48, "grad_norm": 1.4598332643508911, "learning_rate": 0.0001758968609865471, "loss": 0.6624, "step": 430 }, { "epoch": 0.49, "grad_norm": 4.073904991149902, "learning_rate": 0.00017533632286995517, "loss": 0.6682, "step": 440 }, { "epoch": 0.5, "grad_norm": 1.7306851148605347, "learning_rate": 0.00017477578475336324, "loss": 0.6515, "step": 450 }, { "epoch": 0.52, "grad_norm": 2.59877347946167, "learning_rate": 0.0001742152466367713, "loss": 0.5476, "step": 460 }, { "epoch": 0.53, "grad_norm": 3.6079037189483643, "learning_rate": 0.00017365470852017938, "loss": 0.7031, "step": 470 }, { "epoch": 0.54, "grad_norm": 3.1861743927001953, "learning_rate": 0.00017309417040358745, "loss": 0.7141, "step": 480 }, { "epoch": 0.55, "grad_norm": 1.9290772676467896, "learning_rate": 0.00017253363228699552, "loss": 0.668, "step": 490 }, { "epoch": 0.56, "grad_norm": 2.201539993286133, "learning_rate": 0.0001719730941704036, "loss": 0.6969, "step": 500 }, { "epoch": 0.56, "eval_accuracy": 0.7362420830401126, "eval_loss": 0.7265785932540894, "eval_runtime": 127.1383, "eval_samples_per_second": 55.884, "eval_steps_per_second": 3.5, "step": 500 }, { "epoch": 0.57, "grad_norm": 1.651583194732666, "learning_rate": 0.00017141255605381166, "loss": 0.6099, "step": 510 }, { "epoch": 0.58, "grad_norm": 1.7216840982437134, "learning_rate": 0.00017085201793721975, "loss": 0.6174, "step": 520 }, { "epoch": 0.59, "grad_norm": 1.8105279207229614, "learning_rate": 0.0001702914798206278, "loss": 0.6444, "step": 530 }, { "epoch": 0.61, "grad_norm": 1.837810754776001, "learning_rate": 0.00016973094170403587, "loss": 0.6607, "step": 540 }, { "epoch": 0.62, "grad_norm": 1.6770997047424316, "learning_rate": 0.00016917040358744396, "loss": 0.6268, "step": 550 }, { "epoch": 0.63, "grad_norm": 1.2837833166122437, "learning_rate": 0.00016860986547085203, "loss": 0.7084, "step": 560 }, { "epoch": 0.64, "grad_norm": 1.0523972511291504, "learning_rate": 0.00016804932735426008, "loss": 0.6271, "step": 570 }, { "epoch": 0.65, "grad_norm": 1.356562852859497, "learning_rate": 0.00016748878923766817, "loss": 0.6242, "step": 580 }, { "epoch": 0.66, "grad_norm": 1.5814656019210815, "learning_rate": 0.00016692825112107624, "loss": 0.564, "step": 590 }, { "epoch": 0.67, "grad_norm": 1.757988691329956, "learning_rate": 0.0001663677130044843, "loss": 0.5929, "step": 600 }, { "epoch": 0.67, "eval_accuracy": 0.7589021815622801, "eval_loss": 0.6735280156135559, "eval_runtime": 126.499, "eval_samples_per_second": 56.166, "eval_steps_per_second": 3.518, "step": 600 }, { "epoch": 0.68, "grad_norm": 1.8649027347564697, "learning_rate": 0.00016580717488789238, "loss": 0.6578, "step": 610 }, { "epoch": 0.7, "grad_norm": 2.1447596549987793, "learning_rate": 0.00016524663677130045, "loss": 0.6224, "step": 620 }, { "epoch": 0.71, "grad_norm": 1.7960020303726196, "learning_rate": 0.00016468609865470852, "loss": 0.6506, "step": 630 }, { "epoch": 0.72, "grad_norm": 2.265737533569336, "learning_rate": 0.00016412556053811662, "loss": 0.6302, "step": 640 }, { "epoch": 0.73, "grad_norm": 1.6668283939361572, "learning_rate": 0.00016356502242152466, "loss": 0.6518, "step": 650 }, { "epoch": 0.74, "grad_norm": 2.8400986194610596, "learning_rate": 0.00016300448430493273, "loss": 0.619, "step": 660 }, { "epoch": 0.75, "grad_norm": 4.842011451721191, "learning_rate": 0.00016244394618834083, "loss": 0.5984, "step": 670 }, { "epoch": 0.76, "grad_norm": 2.0878114700317383, "learning_rate": 0.0001618834080717489, "loss": 0.5451, "step": 680 }, { "epoch": 0.77, "grad_norm": 2.6710331439971924, "learning_rate": 0.00016132286995515694, "loss": 0.6813, "step": 690 }, { "epoch": 0.78, "grad_norm": 2.501116991043091, "learning_rate": 0.00016076233183856504, "loss": 0.5556, "step": 700 }, { "epoch": 0.78, "eval_accuracy": 0.7463757916959888, "eval_loss": 0.6704686284065247, "eval_runtime": 126.4929, "eval_samples_per_second": 56.169, "eval_steps_per_second": 3.518, "step": 700 }, { "epoch": 0.8, "grad_norm": 2.3544445037841797, "learning_rate": 0.0001602017937219731, "loss": 0.6037, "step": 710 }, { "epoch": 0.81, "grad_norm": 1.7408026456832886, "learning_rate": 0.00015964125560538118, "loss": 0.7128, "step": 720 }, { "epoch": 0.82, "grad_norm": 3.1222403049468994, "learning_rate": 0.00015908071748878925, "loss": 0.5976, "step": 730 }, { "epoch": 0.83, "grad_norm": 1.2981232404708862, "learning_rate": 0.00015852017937219732, "loss": 0.5566, "step": 740 }, { "epoch": 0.84, "grad_norm": 0.9470215439796448, "learning_rate": 0.0001579596412556054, "loss": 0.5446, "step": 750 }, { "epoch": 0.85, "grad_norm": 1.5283540487289429, "learning_rate": 0.00015739910313901346, "loss": 0.5127, "step": 760 }, { "epoch": 0.86, "grad_norm": 1.9940904378890991, "learning_rate": 0.00015683856502242153, "loss": 0.5999, "step": 770 }, { "epoch": 0.87, "grad_norm": 2.0824811458587646, "learning_rate": 0.0001562780269058296, "loss": 0.5658, "step": 780 }, { "epoch": 0.89, "grad_norm": 1.9540338516235352, "learning_rate": 0.0001557174887892377, "loss": 0.5329, "step": 790 }, { "epoch": 0.9, "grad_norm": 1.6645208597183228, "learning_rate": 0.00015515695067264574, "loss": 0.5831, "step": 800 }, { "epoch": 0.9, "eval_accuracy": 0.7681914144968333, "eval_loss": 0.6300484538078308, "eval_runtime": 127.6321, "eval_samples_per_second": 55.668, "eval_steps_per_second": 3.487, "step": 800 }, { "epoch": 0.91, "grad_norm": 1.9477753639221191, "learning_rate": 0.0001545964125560538, "loss": 0.5994, "step": 810 }, { "epoch": 0.92, "grad_norm": 1.3609261512756348, "learning_rate": 0.00015403587443946188, "loss": 0.5809, "step": 820 }, { "epoch": 0.93, "grad_norm": 1.7007122039794922, "learning_rate": 0.00015347533632286998, "loss": 0.6124, "step": 830 }, { "epoch": 0.94, "grad_norm": 1.753017783164978, "learning_rate": 0.00015291479820627804, "loss": 0.5809, "step": 840 }, { "epoch": 0.95, "grad_norm": 1.672780156135559, "learning_rate": 0.0001523542600896861, "loss": 0.5536, "step": 850 }, { "epoch": 0.96, "grad_norm": 1.7526847124099731, "learning_rate": 0.00015179372197309418, "loss": 0.6337, "step": 860 }, { "epoch": 0.98, "grad_norm": 1.4697840213775635, "learning_rate": 0.00015123318385650225, "loss": 0.5725, "step": 870 }, { "epoch": 0.99, "grad_norm": 2.619101047515869, "learning_rate": 0.00015067264573991032, "loss": 0.7272, "step": 880 }, { "epoch": 1.0, "grad_norm": 2.2301433086395264, "learning_rate": 0.0001501121076233184, "loss": 0.5315, "step": 890 }, { "epoch": 1.01, "grad_norm": 1.2401838302612305, "learning_rate": 0.00014955156950672646, "loss": 0.3992, "step": 900 }, { "epoch": 1.01, "eval_accuracy": 0.7884588318085856, "eval_loss": 0.5879009366035461, "eval_runtime": 127.0026, "eval_samples_per_second": 55.944, "eval_steps_per_second": 3.504, "step": 900 }, { "epoch": 1.02, "grad_norm": 0.7034109830856323, "learning_rate": 0.00014899103139013453, "loss": 0.3054, "step": 910 }, { "epoch": 1.03, "grad_norm": 1.852588176727295, "learning_rate": 0.0001484304932735426, "loss": 0.4751, "step": 920 }, { "epoch": 1.04, "grad_norm": 2.8306503295898438, "learning_rate": 0.00014786995515695067, "loss": 0.4143, "step": 930 }, { "epoch": 1.05, "grad_norm": 2.520498037338257, "learning_rate": 0.00014730941704035874, "loss": 0.4863, "step": 940 }, { "epoch": 1.07, "grad_norm": 3.4684131145477295, "learning_rate": 0.00014674887892376684, "loss": 0.4911, "step": 950 }, { "epoch": 1.08, "grad_norm": 1.4476227760314941, "learning_rate": 0.00014618834080717488, "loss": 0.4375, "step": 960 }, { "epoch": 1.09, "grad_norm": 2.003276824951172, "learning_rate": 0.00014562780269058295, "loss": 0.4116, "step": 970 }, { "epoch": 1.1, "grad_norm": 2.1394705772399902, "learning_rate": 0.00014506726457399105, "loss": 0.3873, "step": 980 }, { "epoch": 1.11, "grad_norm": 2.4954771995544434, "learning_rate": 0.00014450672645739912, "loss": 0.3475, "step": 990 }, { "epoch": 1.12, "grad_norm": 2.449598550796509, "learning_rate": 0.00014394618834080716, "loss": 0.4661, "step": 1000 }, { "epoch": 1.12, "eval_accuracy": 0.7887403237156931, "eval_loss": 0.5782468914985657, "eval_runtime": 127.3385, "eval_samples_per_second": 55.796, "eval_steps_per_second": 3.495, "step": 1000 }, { "epoch": 1.13, "grad_norm": 2.011767864227295, "learning_rate": 0.00014338565022421526, "loss": 0.3975, "step": 1010 }, { "epoch": 1.14, "grad_norm": 1.4032831192016602, "learning_rate": 0.00014282511210762333, "loss": 0.4123, "step": 1020 }, { "epoch": 1.15, "grad_norm": 1.5600253343582153, "learning_rate": 0.0001422645739910314, "loss": 0.351, "step": 1030 }, { "epoch": 1.17, "grad_norm": 3.0084493160247803, "learning_rate": 0.00014170403587443947, "loss": 0.4234, "step": 1040 }, { "epoch": 1.18, "grad_norm": 1.441857933998108, "learning_rate": 0.00014114349775784754, "loss": 0.4868, "step": 1050 }, { "epoch": 1.19, "grad_norm": 1.2188992500305176, "learning_rate": 0.0001405829596412556, "loss": 0.423, "step": 1060 }, { "epoch": 1.2, "grad_norm": 1.6048247814178467, "learning_rate": 0.0001400224215246637, "loss": 0.4204, "step": 1070 }, { "epoch": 1.21, "grad_norm": 2.915587902069092, "learning_rate": 0.00013946188340807175, "loss": 0.3875, "step": 1080 }, { "epoch": 1.22, "grad_norm": 1.629499912261963, "learning_rate": 0.00013890134529147982, "loss": 0.4025, "step": 1090 }, { "epoch": 1.23, "grad_norm": 2.1824419498443604, "learning_rate": 0.00013834080717488792, "loss": 0.358, "step": 1100 }, { "epoch": 1.23, "eval_accuracy": 0.7942294159042927, "eval_loss": 0.5690400004386902, "eval_runtime": 127.5854, "eval_samples_per_second": 55.688, "eval_steps_per_second": 3.488, "step": 1100 }, { "epoch": 1.24, "grad_norm": 1.4124354124069214, "learning_rate": 0.000137780269058296, "loss": 0.3944, "step": 1110 }, { "epoch": 1.26, "grad_norm": 1.3432316780090332, "learning_rate": 0.00013721973094170403, "loss": 0.4344, "step": 1120 }, { "epoch": 1.27, "grad_norm": 2.4002068042755127, "learning_rate": 0.0001366591928251121, "loss": 0.3457, "step": 1130 }, { "epoch": 1.28, "grad_norm": 4.019514560699463, "learning_rate": 0.0001360986547085202, "loss": 0.4713, "step": 1140 }, { "epoch": 1.29, "grad_norm": 1.3158634901046753, "learning_rate": 0.00013553811659192827, "loss": 0.3754, "step": 1150 }, { "epoch": 1.3, "grad_norm": 3.053358793258667, "learning_rate": 0.0001349775784753363, "loss": 0.4441, "step": 1160 }, { "epoch": 1.31, "grad_norm": 1.6883745193481445, "learning_rate": 0.0001344170403587444, "loss": 0.4562, "step": 1170 }, { "epoch": 1.32, "grad_norm": 2.3927807807922363, "learning_rate": 0.00013385650224215248, "loss": 0.4068, "step": 1180 }, { "epoch": 1.33, "grad_norm": 1.4922747611999512, "learning_rate": 0.00013329596412556055, "loss": 0.4174, "step": 1190 }, { "epoch": 1.35, "grad_norm": 2.156853199005127, "learning_rate": 0.00013273542600896862, "loss": 0.3812, "step": 1200 }, { "epoch": 1.35, "eval_accuracy": 0.8146375791695989, "eval_loss": 0.5108710527420044, "eval_runtime": 127.9489, "eval_samples_per_second": 55.53, "eval_steps_per_second": 3.478, "step": 1200 }, { "epoch": 1.36, "grad_norm": 1.6083571910858154, "learning_rate": 0.00013217488789237669, "loss": 0.3351, "step": 1210 }, { "epoch": 1.37, "grad_norm": 2.889650583267212, "learning_rate": 0.00013161434977578476, "loss": 0.4398, "step": 1220 }, { "epoch": 1.38, "grad_norm": 1.9212812185287476, "learning_rate": 0.00013105381165919283, "loss": 0.3878, "step": 1230 }, { "epoch": 1.39, "grad_norm": 1.9533714056015015, "learning_rate": 0.0001304932735426009, "loss": 0.3447, "step": 1240 }, { "epoch": 1.4, "grad_norm": 3.091277837753296, "learning_rate": 0.00012993273542600897, "loss": 0.3848, "step": 1250 }, { "epoch": 1.41, "grad_norm": 1.8874760866165161, "learning_rate": 0.00012937219730941706, "loss": 0.5295, "step": 1260 }, { "epoch": 1.42, "grad_norm": 2.417236804962158, "learning_rate": 0.0001288116591928251, "loss": 0.4444, "step": 1270 }, { "epoch": 1.43, "grad_norm": 2.4967589378356934, "learning_rate": 0.00012825112107623318, "loss": 0.4944, "step": 1280 }, { "epoch": 1.45, "grad_norm": 3.3610410690307617, "learning_rate": 0.00012769058295964127, "loss": 0.3398, "step": 1290 }, { "epoch": 1.46, "grad_norm": 1.5874662399291992, "learning_rate": 0.00012713004484304934, "loss": 0.3535, "step": 1300 }, { "epoch": 1.46, "eval_accuracy": 0.8147783251231527, "eval_loss": 0.5213413834571838, "eval_runtime": 127.3924, "eval_samples_per_second": 55.773, "eval_steps_per_second": 3.493, "step": 1300 }, { "epoch": 1.47, "grad_norm": 1.2904878854751587, "learning_rate": 0.00012656950672645739, "loss": 0.3868, "step": 1310 }, { "epoch": 1.48, "grad_norm": 2.38608980178833, "learning_rate": 0.00012600896860986548, "loss": 0.4082, "step": 1320 }, { "epoch": 1.49, "grad_norm": 1.6603220701217651, "learning_rate": 0.00012544843049327355, "loss": 0.3222, "step": 1330 }, { "epoch": 1.5, "grad_norm": 1.8661950826644897, "learning_rate": 0.00012488789237668162, "loss": 0.4029, "step": 1340 }, { "epoch": 1.51, "grad_norm": 2.12640643119812, "learning_rate": 0.0001243273542600897, "loss": 0.3691, "step": 1350 }, { "epoch": 1.52, "grad_norm": 2.671631097793579, "learning_rate": 0.00012376681614349776, "loss": 0.3328, "step": 1360 }, { "epoch": 1.54, "grad_norm": 2.103508949279785, "learning_rate": 0.00012320627802690583, "loss": 0.3475, "step": 1370 }, { "epoch": 1.55, "grad_norm": 1.7701743841171265, "learning_rate": 0.00012264573991031393, "loss": 0.4423, "step": 1380 }, { "epoch": 1.56, "grad_norm": 2.38301682472229, "learning_rate": 0.00012208520179372197, "loss": 0.426, "step": 1390 }, { "epoch": 1.57, "grad_norm": 2.7273807525634766, "learning_rate": 0.00012152466367713004, "loss": 0.3901, "step": 1400 }, { "epoch": 1.57, "eval_accuracy": 0.8125263898662913, "eval_loss": 0.5261800289154053, "eval_runtime": 126.367, "eval_samples_per_second": 56.225, "eval_steps_per_second": 3.521, "step": 1400 }, { "epoch": 1.58, "grad_norm": 2.7706730365753174, "learning_rate": 0.00012096412556053814, "loss": 0.4756, "step": 1410 }, { "epoch": 1.59, "grad_norm": 2.9104268550872803, "learning_rate": 0.0001204035874439462, "loss": 0.3625, "step": 1420 }, { "epoch": 1.6, "grad_norm": 1.5548855066299438, "learning_rate": 0.00011984304932735426, "loss": 0.3352, "step": 1430 }, { "epoch": 1.61, "grad_norm": 2.2463669776916504, "learning_rate": 0.00011928251121076232, "loss": 0.3944, "step": 1440 }, { "epoch": 1.63, "grad_norm": 2.0413687229156494, "learning_rate": 0.00011872197309417042, "loss": 0.3637, "step": 1450 }, { "epoch": 1.64, "grad_norm": 2.267987012863159, "learning_rate": 0.00011816143497757847, "loss": 0.3517, "step": 1460 }, { "epoch": 1.65, "grad_norm": 1.90973699092865, "learning_rate": 0.00011760089686098654, "loss": 0.2954, "step": 1470 }, { "epoch": 1.66, "grad_norm": 1.5805819034576416, "learning_rate": 0.00011704035874439463, "loss": 0.3207, "step": 1480 }, { "epoch": 1.67, "grad_norm": 2.012744903564453, "learning_rate": 0.0001164798206278027, "loss": 0.2816, "step": 1490 }, { "epoch": 1.68, "grad_norm": 1.0160799026489258, "learning_rate": 0.00011591928251121075, "loss": 0.3276, "step": 1500 }, { "epoch": 1.68, "eval_accuracy": 0.8081632653061225, "eval_loss": 0.5793688893318176, "eval_runtime": 128.0301, "eval_samples_per_second": 55.495, "eval_steps_per_second": 3.476, "step": 1500 }, { "epoch": 1.69, "grad_norm": 2.1283419132232666, "learning_rate": 0.00011535874439461885, "loss": 0.3697, "step": 1510 }, { "epoch": 1.7, "grad_norm": 2.384875774383545, "learning_rate": 0.00011479820627802691, "loss": 0.4283, "step": 1520 }, { "epoch": 1.72, "grad_norm": 1.6091313362121582, "learning_rate": 0.00011423766816143498, "loss": 0.2917, "step": 1530 }, { "epoch": 1.73, "grad_norm": 2.50388503074646, "learning_rate": 0.00011367713004484306, "loss": 0.3705, "step": 1540 }, { "epoch": 1.74, "grad_norm": 2.4337923526763916, "learning_rate": 0.00011311659192825113, "loss": 0.3724, "step": 1550 }, { "epoch": 1.75, "grad_norm": 1.677294373512268, "learning_rate": 0.00011255605381165919, "loss": 0.3492, "step": 1560 }, { "epoch": 1.76, "grad_norm": 1.634697675704956, "learning_rate": 0.00011199551569506727, "loss": 0.3633, "step": 1570 }, { "epoch": 1.77, "grad_norm": 2.2558953762054443, "learning_rate": 0.00011143497757847534, "loss": 0.3875, "step": 1580 }, { "epoch": 1.78, "grad_norm": 1.5888583660125732, "learning_rate": 0.00011087443946188341, "loss": 0.3323, "step": 1590 }, { "epoch": 1.79, "grad_norm": 2.265427827835083, "learning_rate": 0.0001103139013452915, "loss": 0.3679, "step": 1600 }, { "epoch": 1.79, "eval_accuracy": 0.8115411681914145, "eval_loss": 0.5365468859672546, "eval_runtime": 127.8881, "eval_samples_per_second": 55.556, "eval_steps_per_second": 3.48, "step": 1600 }, { "epoch": 1.8, "grad_norm": 2.279982566833496, "learning_rate": 0.00010975336322869956, "loss": 0.2996, "step": 1610 }, { "epoch": 1.82, "grad_norm": 1.905090093612671, "learning_rate": 0.00010919282511210762, "loss": 0.3268, "step": 1620 }, { "epoch": 1.83, "grad_norm": 4.301815032958984, "learning_rate": 0.0001086322869955157, "loss": 0.3312, "step": 1630 }, { "epoch": 1.84, "grad_norm": 1.8537272214889526, "learning_rate": 0.00010807174887892377, "loss": 0.3654, "step": 1640 }, { "epoch": 1.85, "grad_norm": 1.3462457656860352, "learning_rate": 0.00010751121076233184, "loss": 0.346, "step": 1650 }, { "epoch": 1.86, "grad_norm": 2.4433109760284424, "learning_rate": 0.00010695067264573993, "loss": 0.3563, "step": 1660 }, { "epoch": 1.87, "grad_norm": 1.8175091743469238, "learning_rate": 0.00010639013452914798, "loss": 0.3065, "step": 1670 }, { "epoch": 1.88, "grad_norm": 2.724806547164917, "learning_rate": 0.00010582959641255605, "loss": 0.3949, "step": 1680 }, { "epoch": 1.89, "grad_norm": 1.4651908874511719, "learning_rate": 0.00010526905829596414, "loss": 0.3451, "step": 1690 }, { "epoch": 1.91, "grad_norm": 1.2862894535064697, "learning_rate": 0.0001047085201793722, "loss": 0.3077, "step": 1700 }, { "epoch": 1.91, "eval_accuracy": 0.825615763546798, "eval_loss": 0.503186047077179, "eval_runtime": 127.5407, "eval_samples_per_second": 55.708, "eval_steps_per_second": 3.489, "step": 1700 }, { "epoch": 1.92, "grad_norm": 1.5572246313095093, "learning_rate": 0.00010414798206278026, "loss": 0.3638, "step": 1710 }, { "epoch": 1.93, "grad_norm": 3.091179847717285, "learning_rate": 0.00010358744394618836, "loss": 0.4205, "step": 1720 }, { "epoch": 1.94, "grad_norm": 2.097266435623169, "learning_rate": 0.00010302690582959642, "loss": 0.2865, "step": 1730 }, { "epoch": 1.95, "grad_norm": 3.9726853370666504, "learning_rate": 0.00010246636771300449, "loss": 0.3283, "step": 1740 }, { "epoch": 1.96, "grad_norm": 2.2275006771087646, "learning_rate": 0.00010190582959641257, "loss": 0.2865, "step": 1750 }, { "epoch": 1.97, "grad_norm": 2.571467161178589, "learning_rate": 0.00010134529147982064, "loss": 0.381, "step": 1760 }, { "epoch": 1.98, "grad_norm": 1.8339025974273682, "learning_rate": 0.0001007847533632287, "loss": 0.3399, "step": 1770 }, { "epoch": 2.0, "grad_norm": 1.96084463596344, "learning_rate": 0.00010022421524663677, "loss": 0.4104, "step": 1780 }, { "epoch": 2.01, "grad_norm": 1.5350396633148193, "learning_rate": 9.966367713004485e-05, "loss": 0.2616, "step": 1790 }, { "epoch": 2.02, "grad_norm": 1.11006760597229, "learning_rate": 9.910313901345292e-05, "loss": 0.1593, "step": 1800 }, { "epoch": 2.02, "eval_accuracy": 0.8237860661505981, "eval_loss": 0.4946657121181488, "eval_runtime": 127.5397, "eval_samples_per_second": 55.708, "eval_steps_per_second": 3.489, "step": 1800 }, { "epoch": 2.03, "grad_norm": 1.9963308572769165, "learning_rate": 9.854260089686099e-05, "loss": 0.1422, "step": 1810 }, { "epoch": 2.04, "grad_norm": 1.350595235824585, "learning_rate": 9.798206278026907e-05, "loss": 0.2012, "step": 1820 }, { "epoch": 2.05, "grad_norm": 1.9206656217575073, "learning_rate": 9.742152466367713e-05, "loss": 0.2354, "step": 1830 }, { "epoch": 2.06, "grad_norm": 0.9061004519462585, "learning_rate": 9.686098654708521e-05, "loss": 0.1971, "step": 1840 }, { "epoch": 2.07, "grad_norm": 1.090854287147522, "learning_rate": 9.630044843049327e-05, "loss": 0.2035, "step": 1850 }, { "epoch": 2.09, "grad_norm": 1.798594355583191, "learning_rate": 9.573991031390135e-05, "loss": 0.2362, "step": 1860 }, { "epoch": 2.1, "grad_norm": 0.2856987416744232, "learning_rate": 9.517937219730942e-05, "loss": 0.1448, "step": 1870 }, { "epoch": 2.11, "grad_norm": 1.9062001705169678, "learning_rate": 9.461883408071749e-05, "loss": 0.1019, "step": 1880 }, { "epoch": 2.12, "grad_norm": 1.8595843315124512, "learning_rate": 9.405829596412556e-05, "loss": 0.1844, "step": 1890 }, { "epoch": 2.13, "grad_norm": 1.6841151714324951, "learning_rate": 9.349775784753365e-05, "loss": 0.2495, "step": 1900 }, { "epoch": 2.13, "eval_accuracy": 0.8212526389866291, "eval_loss": 0.5188373327255249, "eval_runtime": 127.5963, "eval_samples_per_second": 55.683, "eval_steps_per_second": 3.488, "step": 1900 }, { "epoch": 2.14, "grad_norm": 2.3805599212646484, "learning_rate": 9.29372197309417e-05, "loss": 0.1725, "step": 1910 }, { "epoch": 2.15, "grad_norm": 1.5507524013519287, "learning_rate": 9.237668161434979e-05, "loss": 0.1951, "step": 1920 }, { "epoch": 2.16, "grad_norm": 4.487265586853027, "learning_rate": 9.181614349775786e-05, "loss": 0.1928, "step": 1930 }, { "epoch": 2.17, "grad_norm": 5.267577171325684, "learning_rate": 9.125560538116593e-05, "loss": 0.2279, "step": 1940 }, { "epoch": 2.19, "grad_norm": 1.9050337076187134, "learning_rate": 9.0695067264574e-05, "loss": 0.1408, "step": 1950 }, { "epoch": 2.2, "grad_norm": 2.7110774517059326, "learning_rate": 9.013452914798208e-05, "loss": 0.1851, "step": 1960 }, { "epoch": 2.21, "grad_norm": 2.4091663360595703, "learning_rate": 8.957399103139014e-05, "loss": 0.1961, "step": 1970 }, { "epoch": 2.22, "grad_norm": 1.0745985507965088, "learning_rate": 8.901345291479822e-05, "loss": 0.1807, "step": 1980 }, { "epoch": 2.23, "grad_norm": 1.907657504081726, "learning_rate": 8.845291479820629e-05, "loss": 0.1656, "step": 1990 }, { "epoch": 2.24, "grad_norm": 1.4196101427078247, "learning_rate": 8.789237668161436e-05, "loss": 0.1604, "step": 2000 }, { "epoch": 2.24, "eval_accuracy": 0.8457424349049965, "eval_loss": 0.47485658526420593, "eval_runtime": 127.5001, "eval_samples_per_second": 55.725, "eval_steps_per_second": 3.49, "step": 2000 }, { "epoch": 2.25, "grad_norm": 0.8819741606712341, "learning_rate": 8.733183856502243e-05, "loss": 0.1541, "step": 2010 }, { "epoch": 2.26, "grad_norm": 2.7008216381073, "learning_rate": 8.67713004484305e-05, "loss": 0.1754, "step": 2020 }, { "epoch": 2.28, "grad_norm": 1.3049780130386353, "learning_rate": 8.621076233183857e-05, "loss": 0.1474, "step": 2030 }, { "epoch": 2.29, "grad_norm": 1.7845088243484497, "learning_rate": 8.565022421524664e-05, "loss": 0.149, "step": 2040 }, { "epoch": 2.3, "grad_norm": 2.162095546722412, "learning_rate": 8.508968609865471e-05, "loss": 0.1711, "step": 2050 }, { "epoch": 2.31, "grad_norm": 2.4429993629455566, "learning_rate": 8.452914798206278e-05, "loss": 0.1278, "step": 2060 }, { "epoch": 2.32, "grad_norm": 4.209596157073975, "learning_rate": 8.396860986547086e-05, "loss": 0.2285, "step": 2070 }, { "epoch": 2.33, "grad_norm": 0.8332444429397583, "learning_rate": 8.340807174887892e-05, "loss": 0.1706, "step": 2080 }, { "epoch": 2.34, "grad_norm": 1.2180029153823853, "learning_rate": 8.2847533632287e-05, "loss": 0.1863, "step": 2090 }, { "epoch": 2.35, "grad_norm": 1.7199923992156982, "learning_rate": 8.228699551569507e-05, "loss": 0.1347, "step": 2100 }, { "epoch": 2.35, "eval_accuracy": 0.8318085855031668, "eval_loss": 0.4878062307834625, "eval_runtime": 127.098, "eval_samples_per_second": 55.902, "eval_steps_per_second": 3.501, "step": 2100 }, { "epoch": 2.37, "grad_norm": 2.4405524730682373, "learning_rate": 8.172645739910314e-05, "loss": 0.1669, "step": 2110 }, { "epoch": 2.38, "grad_norm": 3.2856411933898926, "learning_rate": 8.116591928251121e-05, "loss": 0.1526, "step": 2120 }, { "epoch": 2.39, "grad_norm": 2.642458915710449, "learning_rate": 8.06053811659193e-05, "loss": 0.1866, "step": 2130 }, { "epoch": 2.4, "grad_norm": 1.8131886720657349, "learning_rate": 8.004484304932735e-05, "loss": 0.1413, "step": 2140 }, { "epoch": 2.41, "grad_norm": 2.279311418533325, "learning_rate": 7.948430493273543e-05, "loss": 0.1764, "step": 2150 }, { "epoch": 2.42, "grad_norm": 3.409904956817627, "learning_rate": 7.892376681614349e-05, "loss": 0.1697, "step": 2160 }, { "epoch": 2.43, "grad_norm": 1.6139248609542847, "learning_rate": 7.836322869955157e-05, "loss": 0.2257, "step": 2170 }, { "epoch": 2.44, "grad_norm": 1.9628515243530273, "learning_rate": 7.780269058295964e-05, "loss": 0.1869, "step": 2180 }, { "epoch": 2.46, "grad_norm": 2.0070972442626953, "learning_rate": 7.724215246636771e-05, "loss": 0.2005, "step": 2190 }, { "epoch": 2.47, "grad_norm": 1.0854668617248535, "learning_rate": 7.668161434977578e-05, "loss": 0.1723, "step": 2200 }, { "epoch": 2.47, "eval_accuracy": 0.8441942294159043, "eval_loss": 0.4731413722038269, "eval_runtime": 127.6751, "eval_samples_per_second": 55.649, "eval_steps_per_second": 3.485, "step": 2200 }, { "epoch": 2.48, "grad_norm": 3.4323694705963135, "learning_rate": 7.612107623318387e-05, "loss": 0.1829, "step": 2210 }, { "epoch": 2.49, "grad_norm": 1.0630773305892944, "learning_rate": 7.556053811659192e-05, "loss": 0.1704, "step": 2220 }, { "epoch": 2.5, "grad_norm": 0.900248646736145, "learning_rate": 7.500000000000001e-05, "loss": 0.1428, "step": 2230 }, { "epoch": 2.51, "grad_norm": 0.8738330602645874, "learning_rate": 7.443946188340808e-05, "loss": 0.1403, "step": 2240 }, { "epoch": 2.52, "grad_norm": 2.873507499694824, "learning_rate": 7.387892376681615e-05, "loss": 0.1267, "step": 2250 }, { "epoch": 2.53, "grad_norm": 3.962599515914917, "learning_rate": 7.331838565022422e-05, "loss": 0.1783, "step": 2260 }, { "epoch": 2.54, "grad_norm": 1.1607624292373657, "learning_rate": 7.27578475336323e-05, "loss": 0.1067, "step": 2270 }, { "epoch": 2.56, "grad_norm": 3.1270833015441895, "learning_rate": 7.219730941704036e-05, "loss": 0.1542, "step": 2280 }, { "epoch": 2.57, "grad_norm": 4.381764888763428, "learning_rate": 7.163677130044844e-05, "loss": 0.1032, "step": 2290 }, { "epoch": 2.58, "grad_norm": 4.008007526397705, "learning_rate": 7.107623318385651e-05, "loss": 0.1235, "step": 2300 }, { "epoch": 2.58, "eval_accuracy": 0.8450387051372273, "eval_loss": 0.493280827999115, "eval_runtime": 127.613, "eval_samples_per_second": 55.676, "eval_steps_per_second": 3.487, "step": 2300 }, { "epoch": 2.59, "grad_norm": 2.7960009574890137, "learning_rate": 7.051569506726458e-05, "loss": 0.1695, "step": 2310 }, { "epoch": 2.6, "grad_norm": 3.1904006004333496, "learning_rate": 6.995515695067265e-05, "loss": 0.1388, "step": 2320 }, { "epoch": 2.61, "grad_norm": 3.1949515342712402, "learning_rate": 6.939461883408072e-05, "loss": 0.2264, "step": 2330 }, { "epoch": 2.62, "grad_norm": 2.386139154434204, "learning_rate": 6.883408071748879e-05, "loss": 0.1498, "step": 2340 }, { "epoch": 2.63, "grad_norm": 2.6440839767456055, "learning_rate": 6.827354260089687e-05, "loss": 0.1445, "step": 2350 }, { "epoch": 2.65, "grad_norm": 0.2900611162185669, "learning_rate": 6.771300448430493e-05, "loss": 0.1245, "step": 2360 }, { "epoch": 2.66, "grad_norm": 3.771578073501587, "learning_rate": 6.715246636771301e-05, "loss": 0.1775, "step": 2370 }, { "epoch": 2.67, "grad_norm": 0.2707236707210541, "learning_rate": 6.659192825112108e-05, "loss": 0.1317, "step": 2380 }, { "epoch": 2.68, "grad_norm": 2.4165215492248535, "learning_rate": 6.603139013452915e-05, "loss": 0.1645, "step": 2390 }, { "epoch": 2.69, "grad_norm": 2.579758882522583, "learning_rate": 6.547085201793722e-05, "loss": 0.1752, "step": 2400 }, { "epoch": 2.69, "eval_accuracy": 0.8501055594651654, "eval_loss": 0.47405895590782166, "eval_runtime": 127.5154, "eval_samples_per_second": 55.719, "eval_steps_per_second": 3.49, "step": 2400 }, { "epoch": 2.7, "grad_norm": 2.206603527069092, "learning_rate": 6.491031390134529e-05, "loss": 0.2482, "step": 2410 }, { "epoch": 2.71, "grad_norm": 1.586669921875, "learning_rate": 6.434977578475336e-05, "loss": 0.1746, "step": 2420 }, { "epoch": 2.72, "grad_norm": 2.046320676803589, "learning_rate": 6.378923766816143e-05, "loss": 0.1386, "step": 2430 }, { "epoch": 2.74, "grad_norm": 0.8042988181114197, "learning_rate": 6.322869955156952e-05, "loss": 0.1383, "step": 2440 }, { "epoch": 2.75, "grad_norm": 3.815175771713257, "learning_rate": 6.266816143497759e-05, "loss": 0.1394, "step": 2450 }, { "epoch": 2.76, "grad_norm": 2.830374002456665, "learning_rate": 6.210762331838566e-05, "loss": 0.1511, "step": 2460 }, { "epoch": 2.77, "grad_norm": 2.1348299980163574, "learning_rate": 6.154708520179373e-05, "loss": 0.1295, "step": 2470 }, { "epoch": 2.78, "grad_norm": 4.810758590698242, "learning_rate": 6.0986547085201795e-05, "loss": 0.1757, "step": 2480 }, { "epoch": 2.79, "grad_norm": 1.4163758754730225, "learning_rate": 6.042600896860987e-05, "loss": 0.0962, "step": 2490 }, { "epoch": 2.8, "grad_norm": 2.047985792160034, "learning_rate": 5.9865470852017935e-05, "loss": 0.1421, "step": 2500 }, { "epoch": 2.8, "eval_accuracy": 0.8474313863476425, "eval_loss": 0.4880400598049164, "eval_runtime": 127.7239, "eval_samples_per_second": 55.628, "eval_steps_per_second": 3.484, "step": 2500 }, { "epoch": 2.81, "grad_norm": 1.3623876571655273, "learning_rate": 5.930493273542601e-05, "loss": 0.1455, "step": 2510 }, { "epoch": 2.83, "grad_norm": 1.8722714185714722, "learning_rate": 5.874439461883409e-05, "loss": 0.1755, "step": 2520 }, { "epoch": 2.84, "grad_norm": 2.139150619506836, "learning_rate": 5.818385650224215e-05, "loss": 0.1471, "step": 2530 }, { "epoch": 2.85, "grad_norm": 1.041858434677124, "learning_rate": 5.762331838565023e-05, "loss": 0.1372, "step": 2540 }, { "epoch": 2.86, "grad_norm": 0.6558467149734497, "learning_rate": 5.7062780269058305e-05, "loss": 0.1184, "step": 2550 }, { "epoch": 2.87, "grad_norm": 3.568887233734131, "learning_rate": 5.650224215246637e-05, "loss": 0.1752, "step": 2560 }, { "epoch": 2.88, "grad_norm": 0.7773478627204895, "learning_rate": 5.5941704035874445e-05, "loss": 0.1491, "step": 2570 }, { "epoch": 2.89, "grad_norm": 2.827122688293457, "learning_rate": 5.5381165919282515e-05, "loss": 0.1893, "step": 2580 }, { "epoch": 2.9, "grad_norm": 3.533275842666626, "learning_rate": 5.4820627802690585e-05, "loss": 0.1117, "step": 2590 }, { "epoch": 2.91, "grad_norm": 0.7163364887237549, "learning_rate": 5.426008968609866e-05, "loss": 0.1549, "step": 2600 }, { "epoch": 2.91, "eval_accuracy": 0.8389866291344124, "eval_loss": 0.4745788276195526, "eval_runtime": 127.6611, "eval_samples_per_second": 55.655, "eval_steps_per_second": 3.486, "step": 2600 }, { "epoch": 2.93, "grad_norm": 2.184201717376709, "learning_rate": 5.369955156950673e-05, "loss": 0.1698, "step": 2610 }, { "epoch": 2.94, "grad_norm": 2.9737119674682617, "learning_rate": 5.31390134529148e-05, "loss": 0.1189, "step": 2620 }, { "epoch": 2.95, "grad_norm": 1.0814894437789917, "learning_rate": 5.257847533632287e-05, "loss": 0.132, "step": 2630 }, { "epoch": 2.96, "grad_norm": 0.9624450206756592, "learning_rate": 5.201793721973094e-05, "loss": 0.1032, "step": 2640 }, { "epoch": 2.97, "grad_norm": 2.6256656646728516, "learning_rate": 5.145739910313902e-05, "loss": 0.1592, "step": 2650 }, { "epoch": 2.98, "grad_norm": 3.0557103157043457, "learning_rate": 5.089686098654709e-05, "loss": 0.1327, "step": 2660 }, { "epoch": 2.99, "grad_norm": 2.0262203216552734, "learning_rate": 5.033632286995516e-05, "loss": 0.1487, "step": 2670 }, { "epoch": 3.0, "grad_norm": 0.14867419004440308, "learning_rate": 4.977578475336323e-05, "loss": 0.1005, "step": 2680 }, { "epoch": 3.02, "grad_norm": 0.12747132778167725, "learning_rate": 4.92152466367713e-05, "loss": 0.0543, "step": 2690 }, { "epoch": 3.03, "grad_norm": 0.1839972287416458, "learning_rate": 4.8654708520179374e-05, "loss": 0.0617, "step": 2700 }, { "epoch": 3.03, "eval_accuracy": 0.8496833216045039, "eval_loss": 0.4935864508152008, "eval_runtime": 127.5847, "eval_samples_per_second": 55.688, "eval_steps_per_second": 3.488, "step": 2700 }, { "epoch": 3.04, "grad_norm": 1.8856990337371826, "learning_rate": 4.8094170403587444e-05, "loss": 0.0687, "step": 2710 }, { "epoch": 3.05, "grad_norm": 0.39913854002952576, "learning_rate": 4.7533632286995514e-05, "loss": 0.0553, "step": 2720 }, { "epoch": 3.06, "grad_norm": 0.8423967957496643, "learning_rate": 4.697309417040359e-05, "loss": 0.0797, "step": 2730 }, { "epoch": 3.07, "grad_norm": 0.2557125687599182, "learning_rate": 4.641255605381166e-05, "loss": 0.08, "step": 2740 }, { "epoch": 3.08, "grad_norm": 2.2399497032165527, "learning_rate": 4.585201793721973e-05, "loss": 0.0483, "step": 2750 }, { "epoch": 3.09, "grad_norm": 0.04336220771074295, "learning_rate": 4.52914798206278e-05, "loss": 0.0339, "step": 2760 }, { "epoch": 3.11, "grad_norm": 0.29698485136032104, "learning_rate": 4.473094170403588e-05, "loss": 0.0579, "step": 2770 }, { "epoch": 3.12, "grad_norm": 2.0980567932128906, "learning_rate": 4.417040358744395e-05, "loss": 0.0996, "step": 2780 }, { "epoch": 3.13, "grad_norm": 0.4327409267425537, "learning_rate": 4.360986547085202e-05, "loss": 0.0435, "step": 2790 }, { "epoch": 3.14, "grad_norm": 0.9807620048522949, "learning_rate": 4.3049327354260094e-05, "loss": 0.0835, "step": 2800 }, { "epoch": 3.14, "eval_accuracy": 0.8554539057002111, "eval_loss": 0.4977756142616272, "eval_runtime": 127.7265, "eval_samples_per_second": 55.627, "eval_steps_per_second": 3.484, "step": 2800 }, { "epoch": 3.15, "grad_norm": 1.283894658088684, "learning_rate": 4.2488789237668164e-05, "loss": 0.0397, "step": 2810 }, { "epoch": 3.16, "grad_norm": 0.820012092590332, "learning_rate": 4.1928251121076234e-05, "loss": 0.0601, "step": 2820 }, { "epoch": 3.17, "grad_norm": 1.781630039215088, "learning_rate": 4.1367713004484303e-05, "loss": 0.0452, "step": 2830 }, { "epoch": 3.18, "grad_norm": 0.20719203352928162, "learning_rate": 4.080717488789238e-05, "loss": 0.0401, "step": 2840 }, { "epoch": 3.2, "grad_norm": 2.106254816055298, "learning_rate": 4.024663677130045e-05, "loss": 0.0358, "step": 2850 }, { "epoch": 3.21, "grad_norm": 1.9900000095367432, "learning_rate": 3.968609865470852e-05, "loss": 0.0496, "step": 2860 }, { "epoch": 3.22, "grad_norm": 0.2951858937740326, "learning_rate": 3.91255605381166e-05, "loss": 0.0406, "step": 2870 }, { "epoch": 3.23, "grad_norm": 0.3538978695869446, "learning_rate": 3.8565022421524667e-05, "loss": 0.0481, "step": 2880 }, { "epoch": 3.24, "grad_norm": 0.9406136870384216, "learning_rate": 3.8004484304932737e-05, "loss": 0.0239, "step": 2890 }, { "epoch": 3.25, "grad_norm": 1.0194897651672363, "learning_rate": 3.744394618834081e-05, "loss": 0.0477, "step": 2900 }, { "epoch": 3.25, "eval_accuracy": 0.8586910626319494, "eval_loss": 0.5344606637954712, "eval_runtime": 127.6926, "eval_samples_per_second": 55.641, "eval_steps_per_second": 3.485, "step": 2900 }, { "epoch": 3.26, "grad_norm": 0.11960559338331223, "learning_rate": 3.688340807174888e-05, "loss": 0.0511, "step": 2910 }, { "epoch": 3.27, "grad_norm": 0.57135009765625, "learning_rate": 3.632286995515695e-05, "loss": 0.0119, "step": 2920 }, { "epoch": 3.28, "grad_norm": 0.40805578231811523, "learning_rate": 3.576233183856502e-05, "loss": 0.0735, "step": 2930 }, { "epoch": 3.3, "grad_norm": 3.6458218097686768, "learning_rate": 3.52017937219731e-05, "loss": 0.0713, "step": 2940 }, { "epoch": 3.31, "grad_norm": 0.26397281885147095, "learning_rate": 3.464125560538117e-05, "loss": 0.0248, "step": 2950 }, { "epoch": 3.32, "grad_norm": 0.8868299126625061, "learning_rate": 3.408071748878924e-05, "loss": 0.0734, "step": 2960 }, { "epoch": 3.33, "grad_norm": 0.07102204859256744, "learning_rate": 3.3520179372197316e-05, "loss": 0.0176, "step": 2970 }, { "epoch": 3.34, "grad_norm": 2.244887590408325, "learning_rate": 3.2959641255605386e-05, "loss": 0.0498, "step": 2980 }, { "epoch": 3.35, "grad_norm": 0.5616236925125122, "learning_rate": 3.2399103139013456e-05, "loss": 0.0373, "step": 2990 }, { "epoch": 3.36, "grad_norm": 0.19669972360134125, "learning_rate": 3.1838565022421526e-05, "loss": 0.0287, "step": 3000 }, { "epoch": 3.36, "eval_accuracy": 0.8596762843068262, "eval_loss": 0.5332924723625183, "eval_runtime": 127.3143, "eval_samples_per_second": 55.807, "eval_steps_per_second": 3.495, "step": 3000 }, { "epoch": 3.37, "grad_norm": 1.8116190433502197, "learning_rate": 3.12780269058296e-05, "loss": 0.0203, "step": 3010 }, { "epoch": 3.39, "grad_norm": 4.554844379425049, "learning_rate": 3.071748878923767e-05, "loss": 0.0199, "step": 3020 }, { "epoch": 3.4, "grad_norm": 1.331480622291565, "learning_rate": 3.015695067264574e-05, "loss": 0.0258, "step": 3030 }, { "epoch": 3.41, "grad_norm": 0.2970045208930969, "learning_rate": 2.9596412556053816e-05, "loss": 0.0311, "step": 3040 }, { "epoch": 3.42, "grad_norm": 1.1972055435180664, "learning_rate": 2.9035874439461886e-05, "loss": 0.0559, "step": 3050 }, { "epoch": 3.43, "grad_norm": 3.563384532928467, "learning_rate": 2.8475336322869956e-05, "loss": 0.0457, "step": 3060 }, { "epoch": 3.44, "grad_norm": 1.6148489713668823, "learning_rate": 2.7914798206278025e-05, "loss": 0.0583, "step": 3070 }, { "epoch": 3.45, "grad_norm": 0.8646751642227173, "learning_rate": 2.7354260089686102e-05, "loss": 0.03, "step": 3080 }, { "epoch": 3.46, "grad_norm": 0.4915563762187958, "learning_rate": 2.6793721973094172e-05, "loss": 0.032, "step": 3090 }, { "epoch": 3.48, "grad_norm": 1.7134366035461426, "learning_rate": 2.6233183856502242e-05, "loss": 0.0242, "step": 3100 }, { "epoch": 3.48, "eval_accuracy": 0.8602392681210416, "eval_loss": 0.5433253645896912, "eval_runtime": 128.4379, "eval_samples_per_second": 55.319, "eval_steps_per_second": 3.465, "step": 3100 }, { "epoch": 3.49, "grad_norm": 0.1170569360256195, "learning_rate": 2.567264573991032e-05, "loss": 0.0349, "step": 3110 }, { "epoch": 3.5, "grad_norm": 0.28987565636634827, "learning_rate": 2.511210762331839e-05, "loss": 0.0327, "step": 3120 }, { "epoch": 3.51, "grad_norm": 0.07998673617839813, "learning_rate": 2.455156950672646e-05, "loss": 0.0483, "step": 3130 }, { "epoch": 3.52, "grad_norm": 0.4792230725288391, "learning_rate": 2.3991031390134532e-05, "loss": 0.0271, "step": 3140 }, { "epoch": 3.53, "grad_norm": 3.571005344390869, "learning_rate": 2.3430493273542602e-05, "loss": 0.0474, "step": 3150 }, { "epoch": 3.54, "grad_norm": 0.4508035182952881, "learning_rate": 2.286995515695067e-05, "loss": 0.0487, "step": 3160 }, { "epoch": 3.55, "grad_norm": 0.219608336687088, "learning_rate": 2.2309417040358745e-05, "loss": 0.0206, "step": 3170 }, { "epoch": 3.57, "grad_norm": 0.5323840975761414, "learning_rate": 2.1748878923766815e-05, "loss": 0.0461, "step": 3180 }, { "epoch": 3.58, "grad_norm": 5.022609233856201, "learning_rate": 2.1188340807174888e-05, "loss": 0.0387, "step": 3190 }, { "epoch": 3.59, "grad_norm": 0.12102050334215164, "learning_rate": 2.062780269058296e-05, "loss": 0.0196, "step": 3200 }, { "epoch": 3.59, "eval_accuracy": 0.8584095707248417, "eval_loss": 0.5772469639778137, "eval_runtime": 126.9569, "eval_samples_per_second": 55.964, "eval_steps_per_second": 3.505, "step": 3200 }, { "epoch": 3.6, "grad_norm": 0.7281592488288879, "learning_rate": 2.006726457399103e-05, "loss": 0.029, "step": 3210 }, { "epoch": 3.61, "grad_norm": 3.789141893386841, "learning_rate": 1.9506726457399105e-05, "loss": 0.0747, "step": 3220 }, { "epoch": 3.62, "grad_norm": 0.8820950388908386, "learning_rate": 1.8946188340807175e-05, "loss": 0.0254, "step": 3230 }, { "epoch": 3.63, "grad_norm": 5.991636276245117, "learning_rate": 1.8385650224215248e-05, "loss": 0.0354, "step": 3240 }, { "epoch": 3.64, "grad_norm": 0.16825184226036072, "learning_rate": 1.7825112107623318e-05, "loss": 0.0229, "step": 3250 }, { "epoch": 3.65, "grad_norm": 0.19618666172027588, "learning_rate": 1.726457399103139e-05, "loss": 0.0737, "step": 3260 }, { "epoch": 3.67, "grad_norm": 0.08360274136066437, "learning_rate": 1.6704035874439464e-05, "loss": 0.0341, "step": 3270 }, { "epoch": 3.68, "grad_norm": 0.07565028220415115, "learning_rate": 1.6143497757847534e-05, "loss": 0.0287, "step": 3280 }, { "epoch": 3.69, "grad_norm": 2.6630303859710693, "learning_rate": 1.5582959641255608e-05, "loss": 0.0322, "step": 3290 }, { "epoch": 3.7, "grad_norm": 3.4573700428009033, "learning_rate": 1.5022421524663678e-05, "loss": 0.0297, "step": 3300 }, { "epoch": 3.7, "eval_accuracy": 0.8595355383532723, "eval_loss": 0.5564337372779846, "eval_runtime": 127.7494, "eval_samples_per_second": 55.617, "eval_steps_per_second": 3.483, "step": 3300 }, { "epoch": 3.71, "grad_norm": 2.092428207397461, "learning_rate": 1.4461883408071749e-05, "loss": 0.0486, "step": 3310 }, { "epoch": 3.72, "grad_norm": 4.266282558441162, "learning_rate": 1.3901345291479822e-05, "loss": 0.0191, "step": 3320 }, { "epoch": 3.73, "grad_norm": 0.5256732106208801, "learning_rate": 1.3340807174887892e-05, "loss": 0.0101, "step": 3330 }, { "epoch": 3.74, "grad_norm": 2.7987325191497803, "learning_rate": 1.2780269058295966e-05, "loss": 0.0547, "step": 3340 }, { "epoch": 3.76, "grad_norm": 0.17688162624835968, "learning_rate": 1.2219730941704037e-05, "loss": 0.0221, "step": 3350 }, { "epoch": 3.77, "grad_norm": 1.9824228286743164, "learning_rate": 1.1659192825112109e-05, "loss": 0.0528, "step": 3360 }, { "epoch": 3.78, "grad_norm": 0.06763149797916412, "learning_rate": 1.109865470852018e-05, "loss": 0.0193, "step": 3370 }, { "epoch": 3.79, "grad_norm": 3.0181431770324707, "learning_rate": 1.0538116591928252e-05, "loss": 0.0777, "step": 3380 }, { "epoch": 3.8, "grad_norm": 1.1364251375198364, "learning_rate": 9.977578475336324e-06, "loss": 0.0376, "step": 3390 }, { "epoch": 3.81, "grad_norm": 0.19941140711307526, "learning_rate": 9.417040358744395e-06, "loss": 0.0457, "step": 3400 }, { "epoch": 3.81, "eval_accuracy": 0.8512315270935961, "eval_loss": 0.5806910991668701, "eval_runtime": 128.3848, "eval_samples_per_second": 55.341, "eval_steps_per_second": 3.466, "step": 3400 }, { "epoch": 3.82, "grad_norm": 7.786200523376465, "learning_rate": 8.856502242152467e-06, "loss": 0.0541, "step": 3410 }, { "epoch": 3.83, "grad_norm": 0.42947888374328613, "learning_rate": 8.295964125560539e-06, "loss": 0.0169, "step": 3420 }, { "epoch": 3.85, "grad_norm": 0.17804774641990662, "learning_rate": 7.73542600896861e-06, "loss": 0.0324, "step": 3430 }, { "epoch": 3.86, "grad_norm": 0.34209346771240234, "learning_rate": 7.174887892376682e-06, "loss": 0.0158, "step": 3440 }, { "epoch": 3.87, "grad_norm": 0.16426484286785126, "learning_rate": 6.614349775784753e-06, "loss": 0.0135, "step": 3450 }, { "epoch": 3.88, "grad_norm": 0.19225721061229706, "learning_rate": 6.053811659192826e-06, "loss": 0.0291, "step": 3460 }, { "epoch": 3.89, "grad_norm": 1.250550627708435, "learning_rate": 5.493273542600897e-06, "loss": 0.0214, "step": 3470 }, { "epoch": 3.9, "grad_norm": 0.08164811879396439, "learning_rate": 4.932735426008968e-06, "loss": 0.0259, "step": 3480 }, { "epoch": 3.91, "grad_norm": 0.14926199615001678, "learning_rate": 4.372197309417041e-06, "loss": 0.0299, "step": 3490 }, { "epoch": 3.92, "grad_norm": 0.0725017860531807, "learning_rate": 3.8116591928251122e-06, "loss": 0.016, "step": 3500 }, { "epoch": 3.92, "eval_accuracy": 0.8617874736101337, "eval_loss": 0.5601363778114319, "eval_runtime": 127.8408, "eval_samples_per_second": 55.577, "eval_steps_per_second": 3.481, "step": 3500 }, { "epoch": 3.93, "grad_norm": 1.6513164043426514, "learning_rate": 3.251121076233184e-06, "loss": 0.0165, "step": 3510 }, { "epoch": 3.95, "grad_norm": 0.088756263256073, "learning_rate": 2.690582959641256e-06, "loss": 0.0332, "step": 3520 }, { "epoch": 3.96, "grad_norm": 0.26848122477531433, "learning_rate": 2.1300448430493275e-06, "loss": 0.0232, "step": 3530 }, { "epoch": 3.97, "grad_norm": 0.14724156260490417, "learning_rate": 1.5695067264573993e-06, "loss": 0.02, "step": 3540 }, { "epoch": 3.98, "grad_norm": 1.9452786445617676, "learning_rate": 1.0089686098654709e-06, "loss": 0.0433, "step": 3550 }, { "epoch": 3.99, "grad_norm": 0.16432423889636993, "learning_rate": 4.484304932735426e-07, "loss": 0.0311, "step": 3560 }, { "epoch": 4.0, "step": 3568, "total_flos": 8.839521632856048e+18, "train_loss": 0.3317156129854944, "train_runtime": 7997.833, "train_samples_per_second": 14.262, "train_steps_per_second": 0.446 } ], "logging_steps": 10, "max_steps": 3568, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "total_flos": 8.839521632856048e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }