|
{ |
|
"best_metric": 0.4731413722038269, |
|
"best_model_checkpoint": "./vit-base-beans-demo-v5/checkpoint-2200", |
|
"epoch": 4.0, |
|
"eval_steps": 100, |
|
"global_step": 3568, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.2656996250152588, |
|
"learning_rate": 0.00019943946188340808, |
|
"loss": 1.7641, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.702761173248291, |
|
"learning_rate": 0.00019887892376681615, |
|
"loss": 1.5342, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.4954216480255127, |
|
"learning_rate": 0.00019831838565022422, |
|
"loss": 1.2414, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.647926926612854, |
|
"learning_rate": 0.0001977578475336323, |
|
"loss": 1.1185, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.192228078842163, |
|
"learning_rate": 0.00019719730941704039, |
|
"loss": 1.1626, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.8315672874450684, |
|
"learning_rate": 0.00019663677130044843, |
|
"loss": 1.1362, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.4982831478118896, |
|
"learning_rate": 0.0001960762331838565, |
|
"loss": 1.0978, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.8782095909118652, |
|
"learning_rate": 0.0001955156950672646, |
|
"loss": 1.0034, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.299464464187622, |
|
"learning_rate": 0.00019495515695067267, |
|
"loss": 0.9887, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.7226407527923584, |
|
"learning_rate": 0.0001943946188340807, |
|
"loss": 0.8874, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"eval_accuracy": 0.6571428571428571, |
|
"eval_loss": 0.9671773314476013, |
|
"eval_runtime": 129.3363, |
|
"eval_samples_per_second": 54.934, |
|
"eval_steps_per_second": 3.441, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.4987040758132935, |
|
"learning_rate": 0.0001938340807174888, |
|
"loss": 0.8796, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.5715798139572144, |
|
"learning_rate": 0.00019327354260089688, |
|
"loss": 0.9511, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.1380341053009033, |
|
"learning_rate": 0.00019271300448430495, |
|
"loss": 0.8983, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 2.5433120727539062, |
|
"learning_rate": 0.00019215246636771302, |
|
"loss": 0.8984, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 2.033275604248047, |
|
"learning_rate": 0.00019159192825112109, |
|
"loss": 0.824, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 2.4264872074127197, |
|
"learning_rate": 0.00019103139013452916, |
|
"loss": 0.8358, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.856844186782837, |
|
"learning_rate": 0.00019047085201793723, |
|
"loss": 0.8706, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 2.4621365070343018, |
|
"learning_rate": 0.0001899103139013453, |
|
"loss": 0.8521, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.4839106798171997, |
|
"learning_rate": 0.00018934977578475337, |
|
"loss": 0.8314, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 2.008270502090454, |
|
"learning_rate": 0.00018878923766816143, |
|
"loss": 0.7806, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"eval_accuracy": 0.7096410978184378, |
|
"eval_loss": 0.8030331134796143, |
|
"eval_runtime": 127.3215, |
|
"eval_samples_per_second": 55.804, |
|
"eval_steps_per_second": 3.495, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 2.759716272354126, |
|
"learning_rate": 0.00018822869955156953, |
|
"loss": 0.7414, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 2.2896201610565186, |
|
"learning_rate": 0.00018766816143497757, |
|
"loss": 0.7788, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.522684097290039, |
|
"learning_rate": 0.00018710762331838564, |
|
"loss": 0.7765, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 2.6483938694000244, |
|
"learning_rate": 0.00018654708520179374, |
|
"loss": 0.8428, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.5204224586486816, |
|
"learning_rate": 0.0001859865470852018, |
|
"loss": 0.7756, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.352989912033081, |
|
"learning_rate": 0.00018542600896860985, |
|
"loss": 0.8085, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 3.1435911655426025, |
|
"learning_rate": 0.00018486547085201795, |
|
"loss": 0.8336, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.863233208656311, |
|
"learning_rate": 0.00018430493273542602, |
|
"loss": 0.7351, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.6672592163085938, |
|
"learning_rate": 0.0001837443946188341, |
|
"loss": 0.7169, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.852493405342102, |
|
"learning_rate": 0.00018318385650224216, |
|
"loss": 0.7404, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"eval_accuracy": 0.696551724137931, |
|
"eval_loss": 0.8076898455619812, |
|
"eval_runtime": 127.361, |
|
"eval_samples_per_second": 55.786, |
|
"eval_steps_per_second": 3.494, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 2.066432476043701, |
|
"learning_rate": 0.00018262331838565023, |
|
"loss": 0.7422, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.5433382987976074, |
|
"learning_rate": 0.0001820627802690583, |
|
"loss": 0.7509, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 2.6319210529327393, |
|
"learning_rate": 0.00018150224215246637, |
|
"loss": 0.7168, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.8435016870498657, |
|
"learning_rate": 0.00018094170403587444, |
|
"loss": 0.7421, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.946035623550415, |
|
"learning_rate": 0.0001803811659192825, |
|
"loss": 0.7062, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.926882028579712, |
|
"learning_rate": 0.0001798206278026906, |
|
"loss": 0.7228, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 3.0743374824523926, |
|
"learning_rate": 0.00017926008968609868, |
|
"loss": 0.7722, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.2202589511871338, |
|
"learning_rate": 0.00017869955156950672, |
|
"loss": 0.7334, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.696330189704895, |
|
"learning_rate": 0.00017813901345291482, |
|
"loss": 0.7306, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 2.4689078330993652, |
|
"learning_rate": 0.0001775784753363229, |
|
"loss": 0.7224, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"eval_accuracy": 0.741027445460943, |
|
"eval_loss": 0.6990236043930054, |
|
"eval_runtime": 127.2157, |
|
"eval_samples_per_second": 55.85, |
|
"eval_steps_per_second": 3.498, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.5020831823349, |
|
"learning_rate": 0.00017701793721973096, |
|
"loss": 0.7291, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 2.5034234523773193, |
|
"learning_rate": 0.00017645739910313903, |
|
"loss": 0.6754, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.4598332643508911, |
|
"learning_rate": 0.0001758968609865471, |
|
"loss": 0.6624, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 4.073904991149902, |
|
"learning_rate": 0.00017533632286995517, |
|
"loss": 0.6682, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.7306851148605347, |
|
"learning_rate": 0.00017477578475336324, |
|
"loss": 0.6515, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 2.59877347946167, |
|
"learning_rate": 0.0001742152466367713, |
|
"loss": 0.5476, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 3.6079037189483643, |
|
"learning_rate": 0.00017365470852017938, |
|
"loss": 0.7031, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 3.1861743927001953, |
|
"learning_rate": 0.00017309417040358745, |
|
"loss": 0.7141, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.9290772676467896, |
|
"learning_rate": 0.00017253363228699552, |
|
"loss": 0.668, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 2.201539993286133, |
|
"learning_rate": 0.0001719730941704036, |
|
"loss": 0.6969, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"eval_accuracy": 0.7362420830401126, |
|
"eval_loss": 0.7265785932540894, |
|
"eval_runtime": 127.1383, |
|
"eval_samples_per_second": 55.884, |
|
"eval_steps_per_second": 3.5, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.651583194732666, |
|
"learning_rate": 0.00017141255605381166, |
|
"loss": 0.6099, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.7216840982437134, |
|
"learning_rate": 0.00017085201793721975, |
|
"loss": 0.6174, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.8105279207229614, |
|
"learning_rate": 0.0001702914798206278, |
|
"loss": 0.6444, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.837810754776001, |
|
"learning_rate": 0.00016973094170403587, |
|
"loss": 0.6607, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.6770997047424316, |
|
"learning_rate": 0.00016917040358744396, |
|
"loss": 0.6268, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.2837833166122437, |
|
"learning_rate": 0.00016860986547085203, |
|
"loss": 0.7084, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.0523972511291504, |
|
"learning_rate": 0.00016804932735426008, |
|
"loss": 0.6271, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.356562852859497, |
|
"learning_rate": 0.00016748878923766817, |
|
"loss": 0.6242, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.5814656019210815, |
|
"learning_rate": 0.00016692825112107624, |
|
"loss": 0.564, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.757988691329956, |
|
"learning_rate": 0.0001663677130044843, |
|
"loss": 0.5929, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"eval_accuracy": 0.7589021815622801, |
|
"eval_loss": 0.6735280156135559, |
|
"eval_runtime": 126.499, |
|
"eval_samples_per_second": 56.166, |
|
"eval_steps_per_second": 3.518, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.8649027347564697, |
|
"learning_rate": 0.00016580717488789238, |
|
"loss": 0.6578, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 2.1447596549987793, |
|
"learning_rate": 0.00016524663677130045, |
|
"loss": 0.6224, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.7960020303726196, |
|
"learning_rate": 0.00016468609865470852, |
|
"loss": 0.6506, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 2.265737533569336, |
|
"learning_rate": 0.00016412556053811662, |
|
"loss": 0.6302, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.6668283939361572, |
|
"learning_rate": 0.00016356502242152466, |
|
"loss": 0.6518, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 2.8400986194610596, |
|
"learning_rate": 0.00016300448430493273, |
|
"loss": 0.619, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 4.842011451721191, |
|
"learning_rate": 0.00016244394618834083, |
|
"loss": 0.5984, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 2.0878114700317383, |
|
"learning_rate": 0.0001618834080717489, |
|
"loss": 0.5451, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 2.6710331439971924, |
|
"learning_rate": 0.00016132286995515694, |
|
"loss": 0.6813, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 2.501116991043091, |
|
"learning_rate": 0.00016076233183856504, |
|
"loss": 0.5556, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"eval_accuracy": 0.7463757916959888, |
|
"eval_loss": 0.6704686284065247, |
|
"eval_runtime": 126.4929, |
|
"eval_samples_per_second": 56.169, |
|
"eval_steps_per_second": 3.518, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 2.3544445037841797, |
|
"learning_rate": 0.0001602017937219731, |
|
"loss": 0.6037, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.7408026456832886, |
|
"learning_rate": 0.00015964125560538118, |
|
"loss": 0.7128, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 3.1222403049468994, |
|
"learning_rate": 0.00015908071748878925, |
|
"loss": 0.5976, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.2981232404708862, |
|
"learning_rate": 0.00015852017937219732, |
|
"loss": 0.5566, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.9470215439796448, |
|
"learning_rate": 0.0001579596412556054, |
|
"loss": 0.5446, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.5283540487289429, |
|
"learning_rate": 0.00015739910313901346, |
|
"loss": 0.5127, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.9940904378890991, |
|
"learning_rate": 0.00015683856502242153, |
|
"loss": 0.5999, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 2.0824811458587646, |
|
"learning_rate": 0.0001562780269058296, |
|
"loss": 0.5658, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.9540338516235352, |
|
"learning_rate": 0.0001557174887892377, |
|
"loss": 0.5329, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.6645208597183228, |
|
"learning_rate": 0.00015515695067264574, |
|
"loss": 0.5831, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"eval_accuracy": 0.7681914144968333, |
|
"eval_loss": 0.6300484538078308, |
|
"eval_runtime": 127.6321, |
|
"eval_samples_per_second": 55.668, |
|
"eval_steps_per_second": 3.487, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.9477753639221191, |
|
"learning_rate": 0.0001545964125560538, |
|
"loss": 0.5994, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.3609261512756348, |
|
"learning_rate": 0.00015403587443946188, |
|
"loss": 0.5809, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.7007122039794922, |
|
"learning_rate": 0.00015347533632286998, |
|
"loss": 0.6124, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.753017783164978, |
|
"learning_rate": 0.00015291479820627804, |
|
"loss": 0.5809, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.672780156135559, |
|
"learning_rate": 0.0001523542600896861, |
|
"loss": 0.5536, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.7526847124099731, |
|
"learning_rate": 0.00015179372197309418, |
|
"loss": 0.6337, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.4697840213775635, |
|
"learning_rate": 0.00015123318385650225, |
|
"loss": 0.5725, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 2.619101047515869, |
|
"learning_rate": 0.00015067264573991032, |
|
"loss": 0.7272, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 2.2301433086395264, |
|
"learning_rate": 0.0001501121076233184, |
|
"loss": 0.5315, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 1.2401838302612305, |
|
"learning_rate": 0.00014955156950672646, |
|
"loss": 0.3992, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"eval_accuracy": 0.7884588318085856, |
|
"eval_loss": 0.5879009366035461, |
|
"eval_runtime": 127.0026, |
|
"eval_samples_per_second": 55.944, |
|
"eval_steps_per_second": 3.504, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.7034109830856323, |
|
"learning_rate": 0.00014899103139013453, |
|
"loss": 0.3054, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 1.852588176727295, |
|
"learning_rate": 0.0001484304932735426, |
|
"loss": 0.4751, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 2.8306503295898438, |
|
"learning_rate": 0.00014786995515695067, |
|
"loss": 0.4143, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 2.520498037338257, |
|
"learning_rate": 0.00014730941704035874, |
|
"loss": 0.4863, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 3.4684131145477295, |
|
"learning_rate": 0.00014674887892376684, |
|
"loss": 0.4911, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 1.4476227760314941, |
|
"learning_rate": 0.00014618834080717488, |
|
"loss": 0.4375, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 2.003276824951172, |
|
"learning_rate": 0.00014562780269058295, |
|
"loss": 0.4116, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 2.1394705772399902, |
|
"learning_rate": 0.00014506726457399105, |
|
"loss": 0.3873, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 2.4954771995544434, |
|
"learning_rate": 0.00014450672645739912, |
|
"loss": 0.3475, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 2.449598550796509, |
|
"learning_rate": 0.00014394618834080716, |
|
"loss": 0.4661, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"eval_accuracy": 0.7887403237156931, |
|
"eval_loss": 0.5782468914985657, |
|
"eval_runtime": 127.3385, |
|
"eval_samples_per_second": 55.796, |
|
"eval_steps_per_second": 3.495, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 2.011767864227295, |
|
"learning_rate": 0.00014338565022421526, |
|
"loss": 0.3975, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 1.4032831192016602, |
|
"learning_rate": 0.00014282511210762333, |
|
"loss": 0.4123, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 1.5600253343582153, |
|
"learning_rate": 0.0001422645739910314, |
|
"loss": 0.351, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 3.0084493160247803, |
|
"learning_rate": 0.00014170403587443947, |
|
"loss": 0.4234, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 1.441857933998108, |
|
"learning_rate": 0.00014114349775784754, |
|
"loss": 0.4868, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 1.2188992500305176, |
|
"learning_rate": 0.0001405829596412556, |
|
"loss": 0.423, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 1.6048247814178467, |
|
"learning_rate": 0.0001400224215246637, |
|
"loss": 0.4204, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 2.915587902069092, |
|
"learning_rate": 0.00013946188340807175, |
|
"loss": 0.3875, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 1.629499912261963, |
|
"learning_rate": 0.00013890134529147982, |
|
"loss": 0.4025, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 2.1824419498443604, |
|
"learning_rate": 0.00013834080717488792, |
|
"loss": 0.358, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"eval_accuracy": 0.7942294159042927, |
|
"eval_loss": 0.5690400004386902, |
|
"eval_runtime": 127.5854, |
|
"eval_samples_per_second": 55.688, |
|
"eval_steps_per_second": 3.488, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 1.4124354124069214, |
|
"learning_rate": 0.000137780269058296, |
|
"loss": 0.3944, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 1.3432316780090332, |
|
"learning_rate": 0.00013721973094170403, |
|
"loss": 0.4344, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 2.4002068042755127, |
|
"learning_rate": 0.0001366591928251121, |
|
"loss": 0.3457, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 4.019514560699463, |
|
"learning_rate": 0.0001360986547085202, |
|
"loss": 0.4713, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 1.3158634901046753, |
|
"learning_rate": 0.00013553811659192827, |
|
"loss": 0.3754, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 3.053358793258667, |
|
"learning_rate": 0.0001349775784753363, |
|
"loss": 0.4441, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 1.6883745193481445, |
|
"learning_rate": 0.0001344170403587444, |
|
"loss": 0.4562, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 2.3927807807922363, |
|
"learning_rate": 0.00013385650224215248, |
|
"loss": 0.4068, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 1.4922747611999512, |
|
"learning_rate": 0.00013329596412556055, |
|
"loss": 0.4174, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 2.156853199005127, |
|
"learning_rate": 0.00013273542600896862, |
|
"loss": 0.3812, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"eval_accuracy": 0.8146375791695989, |
|
"eval_loss": 0.5108710527420044, |
|
"eval_runtime": 127.9489, |
|
"eval_samples_per_second": 55.53, |
|
"eval_steps_per_second": 3.478, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 1.6083571910858154, |
|
"learning_rate": 0.00013217488789237669, |
|
"loss": 0.3351, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 2.889650583267212, |
|
"learning_rate": 0.00013161434977578476, |
|
"loss": 0.4398, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 1.9212812185287476, |
|
"learning_rate": 0.00013105381165919283, |
|
"loss": 0.3878, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 1.9533714056015015, |
|
"learning_rate": 0.0001304932735426009, |
|
"loss": 0.3447, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 3.091277837753296, |
|
"learning_rate": 0.00012993273542600897, |
|
"loss": 0.3848, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 1.8874760866165161, |
|
"learning_rate": 0.00012937219730941706, |
|
"loss": 0.5295, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 2.417236804962158, |
|
"learning_rate": 0.0001288116591928251, |
|
"loss": 0.4444, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 2.4967589378356934, |
|
"learning_rate": 0.00012825112107623318, |
|
"loss": 0.4944, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 3.3610410690307617, |
|
"learning_rate": 0.00012769058295964127, |
|
"loss": 0.3398, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 1.5874662399291992, |
|
"learning_rate": 0.00012713004484304934, |
|
"loss": 0.3535, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"eval_accuracy": 0.8147783251231527, |
|
"eval_loss": 0.5213413834571838, |
|
"eval_runtime": 127.3924, |
|
"eval_samples_per_second": 55.773, |
|
"eval_steps_per_second": 3.493, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 1.2904878854751587, |
|
"learning_rate": 0.00012656950672645739, |
|
"loss": 0.3868, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 2.38608980178833, |
|
"learning_rate": 0.00012600896860986548, |
|
"loss": 0.4082, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 1.6603220701217651, |
|
"learning_rate": 0.00012544843049327355, |
|
"loss": 0.3222, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 1.8661950826644897, |
|
"learning_rate": 0.00012488789237668162, |
|
"loss": 0.4029, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 2.12640643119812, |
|
"learning_rate": 0.0001243273542600897, |
|
"loss": 0.3691, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 2.671631097793579, |
|
"learning_rate": 0.00012376681614349776, |
|
"loss": 0.3328, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 2.103508949279785, |
|
"learning_rate": 0.00012320627802690583, |
|
"loss": 0.3475, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 1.7701743841171265, |
|
"learning_rate": 0.00012264573991031393, |
|
"loss": 0.4423, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 2.38301682472229, |
|
"learning_rate": 0.00012208520179372197, |
|
"loss": 0.426, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 2.7273807525634766, |
|
"learning_rate": 0.00012152466367713004, |
|
"loss": 0.3901, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"eval_accuracy": 0.8125263898662913, |
|
"eval_loss": 0.5261800289154053, |
|
"eval_runtime": 126.367, |
|
"eval_samples_per_second": 56.225, |
|
"eval_steps_per_second": 3.521, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 2.7706730365753174, |
|
"learning_rate": 0.00012096412556053814, |
|
"loss": 0.4756, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 2.9104268550872803, |
|
"learning_rate": 0.0001204035874439462, |
|
"loss": 0.3625, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 1.5548855066299438, |
|
"learning_rate": 0.00011984304932735426, |
|
"loss": 0.3352, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 2.2463669776916504, |
|
"learning_rate": 0.00011928251121076232, |
|
"loss": 0.3944, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 2.0413687229156494, |
|
"learning_rate": 0.00011872197309417042, |
|
"loss": 0.3637, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 2.267987012863159, |
|
"learning_rate": 0.00011816143497757847, |
|
"loss": 0.3517, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 1.90973699092865, |
|
"learning_rate": 0.00011760089686098654, |
|
"loss": 0.2954, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 1.5805819034576416, |
|
"learning_rate": 0.00011704035874439463, |
|
"loss": 0.3207, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 2.012744903564453, |
|
"learning_rate": 0.0001164798206278027, |
|
"loss": 0.2816, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 1.0160799026489258, |
|
"learning_rate": 0.00011591928251121075, |
|
"loss": 0.3276, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"eval_accuracy": 0.8081632653061225, |
|
"eval_loss": 0.5793688893318176, |
|
"eval_runtime": 128.0301, |
|
"eval_samples_per_second": 55.495, |
|
"eval_steps_per_second": 3.476, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 2.1283419132232666, |
|
"learning_rate": 0.00011535874439461885, |
|
"loss": 0.3697, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 2.384875774383545, |
|
"learning_rate": 0.00011479820627802691, |
|
"loss": 0.4283, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 1.6091313362121582, |
|
"learning_rate": 0.00011423766816143498, |
|
"loss": 0.2917, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 2.50388503074646, |
|
"learning_rate": 0.00011367713004484306, |
|
"loss": 0.3705, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 2.4337923526763916, |
|
"learning_rate": 0.00011311659192825113, |
|
"loss": 0.3724, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 1.677294373512268, |
|
"learning_rate": 0.00011255605381165919, |
|
"loss": 0.3492, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 1.634697675704956, |
|
"learning_rate": 0.00011199551569506727, |
|
"loss": 0.3633, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 2.2558953762054443, |
|
"learning_rate": 0.00011143497757847534, |
|
"loss": 0.3875, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 1.5888583660125732, |
|
"learning_rate": 0.00011087443946188341, |
|
"loss": 0.3323, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 2.265427827835083, |
|
"learning_rate": 0.0001103139013452915, |
|
"loss": 0.3679, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"eval_accuracy": 0.8115411681914145, |
|
"eval_loss": 0.5365468859672546, |
|
"eval_runtime": 127.8881, |
|
"eval_samples_per_second": 55.556, |
|
"eval_steps_per_second": 3.48, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 2.279982566833496, |
|
"learning_rate": 0.00010975336322869956, |
|
"loss": 0.2996, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 1.905090093612671, |
|
"learning_rate": 0.00010919282511210762, |
|
"loss": 0.3268, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 4.301815032958984, |
|
"learning_rate": 0.0001086322869955157, |
|
"loss": 0.3312, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 1.8537272214889526, |
|
"learning_rate": 0.00010807174887892377, |
|
"loss": 0.3654, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 1.3462457656860352, |
|
"learning_rate": 0.00010751121076233184, |
|
"loss": 0.346, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 2.4433109760284424, |
|
"learning_rate": 0.00010695067264573993, |
|
"loss": 0.3563, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 1.8175091743469238, |
|
"learning_rate": 0.00010639013452914798, |
|
"loss": 0.3065, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 2.724806547164917, |
|
"learning_rate": 0.00010582959641255605, |
|
"loss": 0.3949, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 1.4651908874511719, |
|
"learning_rate": 0.00010526905829596414, |
|
"loss": 0.3451, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 1.2862894535064697, |
|
"learning_rate": 0.0001047085201793722, |
|
"loss": 0.3077, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"eval_accuracy": 0.825615763546798, |
|
"eval_loss": 0.503186047077179, |
|
"eval_runtime": 127.5407, |
|
"eval_samples_per_second": 55.708, |
|
"eval_steps_per_second": 3.489, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 1.5572246313095093, |
|
"learning_rate": 0.00010414798206278026, |
|
"loss": 0.3638, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 3.091179847717285, |
|
"learning_rate": 0.00010358744394618836, |
|
"loss": 0.4205, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 2.097266435623169, |
|
"learning_rate": 0.00010302690582959642, |
|
"loss": 0.2865, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 3.9726853370666504, |
|
"learning_rate": 0.00010246636771300449, |
|
"loss": 0.3283, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 2.2275006771087646, |
|
"learning_rate": 0.00010190582959641257, |
|
"loss": 0.2865, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 2.571467161178589, |
|
"learning_rate": 0.00010134529147982064, |
|
"loss": 0.381, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 1.8339025974273682, |
|
"learning_rate": 0.0001007847533632287, |
|
"loss": 0.3399, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.96084463596344, |
|
"learning_rate": 0.00010022421524663677, |
|
"loss": 0.4104, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 1.5350396633148193, |
|
"learning_rate": 9.966367713004485e-05, |
|
"loss": 0.2616, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 1.11006760597229, |
|
"learning_rate": 9.910313901345292e-05, |
|
"loss": 0.1593, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"eval_accuracy": 0.8237860661505981, |
|
"eval_loss": 0.4946657121181488, |
|
"eval_runtime": 127.5397, |
|
"eval_samples_per_second": 55.708, |
|
"eval_steps_per_second": 3.489, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 1.9963308572769165, |
|
"learning_rate": 9.854260089686099e-05, |
|
"loss": 0.1422, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 1.350595235824585, |
|
"learning_rate": 9.798206278026907e-05, |
|
"loss": 0.2012, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 1.9206656217575073, |
|
"learning_rate": 9.742152466367713e-05, |
|
"loss": 0.2354, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 0.9061004519462585, |
|
"learning_rate": 9.686098654708521e-05, |
|
"loss": 0.1971, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 1.090854287147522, |
|
"learning_rate": 9.630044843049327e-05, |
|
"loss": 0.2035, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 1.798594355583191, |
|
"learning_rate": 9.573991031390135e-05, |
|
"loss": 0.2362, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 0.2856987416744232, |
|
"learning_rate": 9.517937219730942e-05, |
|
"loss": 0.1448, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 1.9062001705169678, |
|
"learning_rate": 9.461883408071749e-05, |
|
"loss": 0.1019, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 1.8595843315124512, |
|
"learning_rate": 9.405829596412556e-05, |
|
"loss": 0.1844, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 1.6841151714324951, |
|
"learning_rate": 9.349775784753365e-05, |
|
"loss": 0.2495, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"eval_accuracy": 0.8212526389866291, |
|
"eval_loss": 0.5188373327255249, |
|
"eval_runtime": 127.5963, |
|
"eval_samples_per_second": 55.683, |
|
"eval_steps_per_second": 3.488, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 2.3805599212646484, |
|
"learning_rate": 9.29372197309417e-05, |
|
"loss": 0.1725, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 1.5507524013519287, |
|
"learning_rate": 9.237668161434979e-05, |
|
"loss": 0.1951, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 4.487265586853027, |
|
"learning_rate": 9.181614349775786e-05, |
|
"loss": 0.1928, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 5.267577171325684, |
|
"learning_rate": 9.125560538116593e-05, |
|
"loss": 0.2279, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 1.9050337076187134, |
|
"learning_rate": 9.0695067264574e-05, |
|
"loss": 0.1408, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 2.7110774517059326, |
|
"learning_rate": 9.013452914798208e-05, |
|
"loss": 0.1851, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 2.4091663360595703, |
|
"learning_rate": 8.957399103139014e-05, |
|
"loss": 0.1961, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 1.0745985507965088, |
|
"learning_rate": 8.901345291479822e-05, |
|
"loss": 0.1807, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 1.907657504081726, |
|
"learning_rate": 8.845291479820629e-05, |
|
"loss": 0.1656, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 1.4196101427078247, |
|
"learning_rate": 8.789237668161436e-05, |
|
"loss": 0.1604, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"eval_accuracy": 0.8457424349049965, |
|
"eval_loss": 0.47485658526420593, |
|
"eval_runtime": 127.5001, |
|
"eval_samples_per_second": 55.725, |
|
"eval_steps_per_second": 3.49, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.8819741606712341, |
|
"learning_rate": 8.733183856502243e-05, |
|
"loss": 0.1541, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 2.7008216381073, |
|
"learning_rate": 8.67713004484305e-05, |
|
"loss": 0.1754, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 1.3049780130386353, |
|
"learning_rate": 8.621076233183857e-05, |
|
"loss": 0.1474, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 1.7845088243484497, |
|
"learning_rate": 8.565022421524664e-05, |
|
"loss": 0.149, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 2.162095546722412, |
|
"learning_rate": 8.508968609865471e-05, |
|
"loss": 0.1711, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 2.4429993629455566, |
|
"learning_rate": 8.452914798206278e-05, |
|
"loss": 0.1278, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 4.209596157073975, |
|
"learning_rate": 8.396860986547086e-05, |
|
"loss": 0.2285, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 0.8332444429397583, |
|
"learning_rate": 8.340807174887892e-05, |
|
"loss": 0.1706, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 1.2180029153823853, |
|
"learning_rate": 8.2847533632287e-05, |
|
"loss": 0.1863, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 1.7199923992156982, |
|
"learning_rate": 8.228699551569507e-05, |
|
"loss": 0.1347, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"eval_accuracy": 0.8318085855031668, |
|
"eval_loss": 0.4878062307834625, |
|
"eval_runtime": 127.098, |
|
"eval_samples_per_second": 55.902, |
|
"eval_steps_per_second": 3.501, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 2.4405524730682373, |
|
"learning_rate": 8.172645739910314e-05, |
|
"loss": 0.1669, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 3.2856411933898926, |
|
"learning_rate": 8.116591928251121e-05, |
|
"loss": 0.1526, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 2.642458915710449, |
|
"learning_rate": 8.06053811659193e-05, |
|
"loss": 0.1866, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 1.8131886720657349, |
|
"learning_rate": 8.004484304932735e-05, |
|
"loss": 0.1413, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 2.279311418533325, |
|
"learning_rate": 7.948430493273543e-05, |
|
"loss": 0.1764, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 3.409904956817627, |
|
"learning_rate": 7.892376681614349e-05, |
|
"loss": 0.1697, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 1.6139248609542847, |
|
"learning_rate": 7.836322869955157e-05, |
|
"loss": 0.2257, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 1.9628515243530273, |
|
"learning_rate": 7.780269058295964e-05, |
|
"loss": 0.1869, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 2.0070972442626953, |
|
"learning_rate": 7.724215246636771e-05, |
|
"loss": 0.2005, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 1.0854668617248535, |
|
"learning_rate": 7.668161434977578e-05, |
|
"loss": 0.1723, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"eval_accuracy": 0.8441942294159043, |
|
"eval_loss": 0.4731413722038269, |
|
"eval_runtime": 127.6751, |
|
"eval_samples_per_second": 55.649, |
|
"eval_steps_per_second": 3.485, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 3.4323694705963135, |
|
"learning_rate": 7.612107623318387e-05, |
|
"loss": 0.1829, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 1.0630773305892944, |
|
"learning_rate": 7.556053811659192e-05, |
|
"loss": 0.1704, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.900248646736145, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.1428, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 0.8738330602645874, |
|
"learning_rate": 7.443946188340808e-05, |
|
"loss": 0.1403, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 2.873507499694824, |
|
"learning_rate": 7.387892376681615e-05, |
|
"loss": 0.1267, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 3.962599515914917, |
|
"learning_rate": 7.331838565022422e-05, |
|
"loss": 0.1783, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 1.1607624292373657, |
|
"learning_rate": 7.27578475336323e-05, |
|
"loss": 0.1067, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 3.1270833015441895, |
|
"learning_rate": 7.219730941704036e-05, |
|
"loss": 0.1542, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 4.381764888763428, |
|
"learning_rate": 7.163677130044844e-05, |
|
"loss": 0.1032, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 4.008007526397705, |
|
"learning_rate": 7.107623318385651e-05, |
|
"loss": 0.1235, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"eval_accuracy": 0.8450387051372273, |
|
"eval_loss": 0.493280827999115, |
|
"eval_runtime": 127.613, |
|
"eval_samples_per_second": 55.676, |
|
"eval_steps_per_second": 3.487, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 2.7960009574890137, |
|
"learning_rate": 7.051569506726458e-05, |
|
"loss": 0.1695, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 3.1904006004333496, |
|
"learning_rate": 6.995515695067265e-05, |
|
"loss": 0.1388, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 3.1949515342712402, |
|
"learning_rate": 6.939461883408072e-05, |
|
"loss": 0.2264, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 2.386139154434204, |
|
"learning_rate": 6.883408071748879e-05, |
|
"loss": 0.1498, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 2.6440839767456055, |
|
"learning_rate": 6.827354260089687e-05, |
|
"loss": 0.1445, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 0.2900611162185669, |
|
"learning_rate": 6.771300448430493e-05, |
|
"loss": 0.1245, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 3.771578073501587, |
|
"learning_rate": 6.715246636771301e-05, |
|
"loss": 0.1775, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 0.2707236707210541, |
|
"learning_rate": 6.659192825112108e-05, |
|
"loss": 0.1317, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 2.4165215492248535, |
|
"learning_rate": 6.603139013452915e-05, |
|
"loss": 0.1645, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 2.579758882522583, |
|
"learning_rate": 6.547085201793722e-05, |
|
"loss": 0.1752, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"eval_accuracy": 0.8501055594651654, |
|
"eval_loss": 0.47405895590782166, |
|
"eval_runtime": 127.5154, |
|
"eval_samples_per_second": 55.719, |
|
"eval_steps_per_second": 3.49, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 2.206603527069092, |
|
"learning_rate": 6.491031390134529e-05, |
|
"loss": 0.2482, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 1.586669921875, |
|
"learning_rate": 6.434977578475336e-05, |
|
"loss": 0.1746, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 2.046320676803589, |
|
"learning_rate": 6.378923766816143e-05, |
|
"loss": 0.1386, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 0.8042988181114197, |
|
"learning_rate": 6.322869955156952e-05, |
|
"loss": 0.1383, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 3.815175771713257, |
|
"learning_rate": 6.266816143497759e-05, |
|
"loss": 0.1394, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 2.830374002456665, |
|
"learning_rate": 6.210762331838566e-05, |
|
"loss": 0.1511, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 2.1348299980163574, |
|
"learning_rate": 6.154708520179373e-05, |
|
"loss": 0.1295, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 4.810758590698242, |
|
"learning_rate": 6.0986547085201795e-05, |
|
"loss": 0.1757, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 1.4163758754730225, |
|
"learning_rate": 6.042600896860987e-05, |
|
"loss": 0.0962, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 2.047985792160034, |
|
"learning_rate": 5.9865470852017935e-05, |
|
"loss": 0.1421, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"eval_accuracy": 0.8474313863476425, |
|
"eval_loss": 0.4880400598049164, |
|
"eval_runtime": 127.7239, |
|
"eval_samples_per_second": 55.628, |
|
"eval_steps_per_second": 3.484, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 1.3623876571655273, |
|
"learning_rate": 5.930493273542601e-05, |
|
"loss": 0.1455, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 1.8722714185714722, |
|
"learning_rate": 5.874439461883409e-05, |
|
"loss": 0.1755, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 2.139150619506836, |
|
"learning_rate": 5.818385650224215e-05, |
|
"loss": 0.1471, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 1.041858434677124, |
|
"learning_rate": 5.762331838565023e-05, |
|
"loss": 0.1372, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 0.6558467149734497, |
|
"learning_rate": 5.7062780269058305e-05, |
|
"loss": 0.1184, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 3.568887233734131, |
|
"learning_rate": 5.650224215246637e-05, |
|
"loss": 0.1752, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 0.7773478627204895, |
|
"learning_rate": 5.5941704035874445e-05, |
|
"loss": 0.1491, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 2.827122688293457, |
|
"learning_rate": 5.5381165919282515e-05, |
|
"loss": 0.1893, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 3.533275842666626, |
|
"learning_rate": 5.4820627802690585e-05, |
|
"loss": 0.1117, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 0.7163364887237549, |
|
"learning_rate": 5.426008968609866e-05, |
|
"loss": 0.1549, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"eval_accuracy": 0.8389866291344124, |
|
"eval_loss": 0.4745788276195526, |
|
"eval_runtime": 127.6611, |
|
"eval_samples_per_second": 55.655, |
|
"eval_steps_per_second": 3.486, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 2.184201717376709, |
|
"learning_rate": 5.369955156950673e-05, |
|
"loss": 0.1698, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 2.9737119674682617, |
|
"learning_rate": 5.31390134529148e-05, |
|
"loss": 0.1189, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 1.0814894437789917, |
|
"learning_rate": 5.257847533632287e-05, |
|
"loss": 0.132, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.9624450206756592, |
|
"learning_rate": 5.201793721973094e-05, |
|
"loss": 0.1032, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 2.6256656646728516, |
|
"learning_rate": 5.145739910313902e-05, |
|
"loss": 0.1592, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 3.0557103157043457, |
|
"learning_rate": 5.089686098654709e-05, |
|
"loss": 0.1327, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 2.0262203216552734, |
|
"learning_rate": 5.033632286995516e-05, |
|
"loss": 0.1487, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.14867419004440308, |
|
"learning_rate": 4.977578475336323e-05, |
|
"loss": 0.1005, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 0.12747132778167725, |
|
"learning_rate": 4.92152466367713e-05, |
|
"loss": 0.0543, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 0.1839972287416458, |
|
"learning_rate": 4.8654708520179374e-05, |
|
"loss": 0.0617, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"eval_accuracy": 0.8496833216045039, |
|
"eval_loss": 0.4935864508152008, |
|
"eval_runtime": 127.5847, |
|
"eval_samples_per_second": 55.688, |
|
"eval_steps_per_second": 3.488, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 1.8856990337371826, |
|
"learning_rate": 4.8094170403587444e-05, |
|
"loss": 0.0687, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 0.39913854002952576, |
|
"learning_rate": 4.7533632286995514e-05, |
|
"loss": 0.0553, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 0.8423967957496643, |
|
"learning_rate": 4.697309417040359e-05, |
|
"loss": 0.0797, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"grad_norm": 0.2557125687599182, |
|
"learning_rate": 4.641255605381166e-05, |
|
"loss": 0.08, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 2.2399497032165527, |
|
"learning_rate": 4.585201793721973e-05, |
|
"loss": 0.0483, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 0.04336220771074295, |
|
"learning_rate": 4.52914798206278e-05, |
|
"loss": 0.0339, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"grad_norm": 0.29698485136032104, |
|
"learning_rate": 4.473094170403588e-05, |
|
"loss": 0.0579, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 2.0980567932128906, |
|
"learning_rate": 4.417040358744395e-05, |
|
"loss": 0.0996, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 0.4327409267425537, |
|
"learning_rate": 4.360986547085202e-05, |
|
"loss": 0.0435, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"grad_norm": 0.9807620048522949, |
|
"learning_rate": 4.3049327354260094e-05, |
|
"loss": 0.0835, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"eval_accuracy": 0.8554539057002111, |
|
"eval_loss": 0.4977756142616272, |
|
"eval_runtime": 127.7265, |
|
"eval_samples_per_second": 55.627, |
|
"eval_steps_per_second": 3.484, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 1.283894658088684, |
|
"learning_rate": 4.2488789237668164e-05, |
|
"loss": 0.0397, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 0.820012092590332, |
|
"learning_rate": 4.1928251121076234e-05, |
|
"loss": 0.0601, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 1.781630039215088, |
|
"learning_rate": 4.1367713004484303e-05, |
|
"loss": 0.0452, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 0.20719203352928162, |
|
"learning_rate": 4.080717488789238e-05, |
|
"loss": 0.0401, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 2.106254816055298, |
|
"learning_rate": 4.024663677130045e-05, |
|
"loss": 0.0358, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 1.9900000095367432, |
|
"learning_rate": 3.968609865470852e-05, |
|
"loss": 0.0496, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 0.2951858937740326, |
|
"learning_rate": 3.91255605381166e-05, |
|
"loss": 0.0406, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 0.3538978695869446, |
|
"learning_rate": 3.8565022421524667e-05, |
|
"loss": 0.0481, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 0.9406136870384216, |
|
"learning_rate": 3.8004484304932737e-05, |
|
"loss": 0.0239, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 1.0194897651672363, |
|
"learning_rate": 3.744394618834081e-05, |
|
"loss": 0.0477, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"eval_accuracy": 0.8586910626319494, |
|
"eval_loss": 0.5344606637954712, |
|
"eval_runtime": 127.6926, |
|
"eval_samples_per_second": 55.641, |
|
"eval_steps_per_second": 3.485, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 0.11960559338331223, |
|
"learning_rate": 3.688340807174888e-05, |
|
"loss": 0.0511, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"grad_norm": 0.57135009765625, |
|
"learning_rate": 3.632286995515695e-05, |
|
"loss": 0.0119, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 0.40805578231811523, |
|
"learning_rate": 3.576233183856502e-05, |
|
"loss": 0.0735, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"grad_norm": 3.6458218097686768, |
|
"learning_rate": 3.52017937219731e-05, |
|
"loss": 0.0713, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 0.26397281885147095, |
|
"learning_rate": 3.464125560538117e-05, |
|
"loss": 0.0248, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 0.8868299126625061, |
|
"learning_rate": 3.408071748878924e-05, |
|
"loss": 0.0734, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 0.07102204859256744, |
|
"learning_rate": 3.3520179372197316e-05, |
|
"loss": 0.0176, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 2.244887590408325, |
|
"learning_rate": 3.2959641255605386e-05, |
|
"loss": 0.0498, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 0.5616236925125122, |
|
"learning_rate": 3.2399103139013456e-05, |
|
"loss": 0.0373, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 0.19669972360134125, |
|
"learning_rate": 3.1838565022421526e-05, |
|
"loss": 0.0287, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"eval_accuracy": 0.8596762843068262, |
|
"eval_loss": 0.5332924723625183, |
|
"eval_runtime": 127.3143, |
|
"eval_samples_per_second": 55.807, |
|
"eval_steps_per_second": 3.495, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 1.8116190433502197, |
|
"learning_rate": 3.12780269058296e-05, |
|
"loss": 0.0203, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"grad_norm": 4.554844379425049, |
|
"learning_rate": 3.071748878923767e-05, |
|
"loss": 0.0199, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 1.331480622291565, |
|
"learning_rate": 3.015695067264574e-05, |
|
"loss": 0.0258, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"grad_norm": 0.2970045208930969, |
|
"learning_rate": 2.9596412556053816e-05, |
|
"loss": 0.0311, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 1.1972055435180664, |
|
"learning_rate": 2.9035874439461886e-05, |
|
"loss": 0.0559, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"grad_norm": 3.563384532928467, |
|
"learning_rate": 2.8475336322869956e-05, |
|
"loss": 0.0457, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 1.6148489713668823, |
|
"learning_rate": 2.7914798206278025e-05, |
|
"loss": 0.0583, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"grad_norm": 0.8646751642227173, |
|
"learning_rate": 2.7354260089686102e-05, |
|
"loss": 0.03, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 0.4915563762187958, |
|
"learning_rate": 2.6793721973094172e-05, |
|
"loss": 0.032, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 1.7134366035461426, |
|
"learning_rate": 2.6233183856502242e-05, |
|
"loss": 0.0242, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"eval_accuracy": 0.8602392681210416, |
|
"eval_loss": 0.5433253645896912, |
|
"eval_runtime": 128.4379, |
|
"eval_samples_per_second": 55.319, |
|
"eval_steps_per_second": 3.465, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"grad_norm": 0.1170569360256195, |
|
"learning_rate": 2.567264573991032e-05, |
|
"loss": 0.0349, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.28987565636634827, |
|
"learning_rate": 2.511210762331839e-05, |
|
"loss": 0.0327, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 0.07998673617839813, |
|
"learning_rate": 2.455156950672646e-05, |
|
"loss": 0.0483, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 0.4792230725288391, |
|
"learning_rate": 2.3991031390134532e-05, |
|
"loss": 0.0271, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"grad_norm": 3.571005344390869, |
|
"learning_rate": 2.3430493273542602e-05, |
|
"loss": 0.0474, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 0.4508035182952881, |
|
"learning_rate": 2.286995515695067e-05, |
|
"loss": 0.0487, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 0.219608336687088, |
|
"learning_rate": 2.2309417040358745e-05, |
|
"loss": 0.0206, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"grad_norm": 0.5323840975761414, |
|
"learning_rate": 2.1748878923766815e-05, |
|
"loss": 0.0461, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 5.022609233856201, |
|
"learning_rate": 2.1188340807174888e-05, |
|
"loss": 0.0387, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"grad_norm": 0.12102050334215164, |
|
"learning_rate": 2.062780269058296e-05, |
|
"loss": 0.0196, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"eval_accuracy": 0.8584095707248417, |
|
"eval_loss": 0.5772469639778137, |
|
"eval_runtime": 126.9569, |
|
"eval_samples_per_second": 55.964, |
|
"eval_steps_per_second": 3.505, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 0.7281592488288879, |
|
"learning_rate": 2.006726457399103e-05, |
|
"loss": 0.029, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"grad_norm": 3.789141893386841, |
|
"learning_rate": 1.9506726457399105e-05, |
|
"loss": 0.0747, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 0.8820950388908386, |
|
"learning_rate": 1.8946188340807175e-05, |
|
"loss": 0.0254, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 5.991636276245117, |
|
"learning_rate": 1.8385650224215248e-05, |
|
"loss": 0.0354, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 0.16825184226036072, |
|
"learning_rate": 1.7825112107623318e-05, |
|
"loss": 0.0229, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 0.19618666172027588, |
|
"learning_rate": 1.726457399103139e-05, |
|
"loss": 0.0737, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 0.08360274136066437, |
|
"learning_rate": 1.6704035874439464e-05, |
|
"loss": 0.0341, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 0.07565028220415115, |
|
"learning_rate": 1.6143497757847534e-05, |
|
"loss": 0.0287, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 2.6630303859710693, |
|
"learning_rate": 1.5582959641255608e-05, |
|
"loss": 0.0322, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 3.4573700428009033, |
|
"learning_rate": 1.5022421524663678e-05, |
|
"loss": 0.0297, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"eval_accuracy": 0.8595355383532723, |
|
"eval_loss": 0.5564337372779846, |
|
"eval_runtime": 127.7494, |
|
"eval_samples_per_second": 55.617, |
|
"eval_steps_per_second": 3.483, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 2.092428207397461, |
|
"learning_rate": 1.4461883408071749e-05, |
|
"loss": 0.0486, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"grad_norm": 4.266282558441162, |
|
"learning_rate": 1.3901345291479822e-05, |
|
"loss": 0.0191, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 0.5256732106208801, |
|
"learning_rate": 1.3340807174887892e-05, |
|
"loss": 0.0101, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 2.7987325191497803, |
|
"learning_rate": 1.2780269058295966e-05, |
|
"loss": 0.0547, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 0.17688162624835968, |
|
"learning_rate": 1.2219730941704037e-05, |
|
"loss": 0.0221, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"grad_norm": 1.9824228286743164, |
|
"learning_rate": 1.1659192825112109e-05, |
|
"loss": 0.0528, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 0.06763149797916412, |
|
"learning_rate": 1.109865470852018e-05, |
|
"loss": 0.0193, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"grad_norm": 3.0181431770324707, |
|
"learning_rate": 1.0538116591928252e-05, |
|
"loss": 0.0777, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 1.1364251375198364, |
|
"learning_rate": 9.977578475336324e-06, |
|
"loss": 0.0376, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 0.19941140711307526, |
|
"learning_rate": 9.417040358744395e-06, |
|
"loss": 0.0457, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"eval_accuracy": 0.8512315270935961, |
|
"eval_loss": 0.5806910991668701, |
|
"eval_runtime": 128.3848, |
|
"eval_samples_per_second": 55.341, |
|
"eval_steps_per_second": 3.466, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 7.786200523376465, |
|
"learning_rate": 8.856502242152467e-06, |
|
"loss": 0.0541, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 0.42947888374328613, |
|
"learning_rate": 8.295964125560539e-06, |
|
"loss": 0.0169, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 0.17804774641990662, |
|
"learning_rate": 7.73542600896861e-06, |
|
"loss": 0.0324, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 3.86, |
|
"grad_norm": 0.34209346771240234, |
|
"learning_rate": 7.174887892376682e-06, |
|
"loss": 0.0158, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 0.16426484286785126, |
|
"learning_rate": 6.614349775784753e-06, |
|
"loss": 0.0135, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 0.19225721061229706, |
|
"learning_rate": 6.053811659192826e-06, |
|
"loss": 0.0291, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 3.89, |
|
"grad_norm": 1.250550627708435, |
|
"learning_rate": 5.493273542600897e-06, |
|
"loss": 0.0214, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 0.08164811879396439, |
|
"learning_rate": 4.932735426008968e-06, |
|
"loss": 0.0259, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 0.14926199615001678, |
|
"learning_rate": 4.372197309417041e-06, |
|
"loss": 0.0299, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 0.0725017860531807, |
|
"learning_rate": 3.8116591928251122e-06, |
|
"loss": 0.016, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"eval_accuracy": 0.8617874736101337, |
|
"eval_loss": 0.5601363778114319, |
|
"eval_runtime": 127.8408, |
|
"eval_samples_per_second": 55.577, |
|
"eval_steps_per_second": 3.481, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 1.6513164043426514, |
|
"learning_rate": 3.251121076233184e-06, |
|
"loss": 0.0165, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 3.95, |
|
"grad_norm": 0.088756263256073, |
|
"learning_rate": 2.690582959641256e-06, |
|
"loss": 0.0332, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 0.26848122477531433, |
|
"learning_rate": 2.1300448430493275e-06, |
|
"loss": 0.0232, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"grad_norm": 0.14724156260490417, |
|
"learning_rate": 1.5695067264573993e-06, |
|
"loss": 0.02, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 1.9452786445617676, |
|
"learning_rate": 1.0089686098654709e-06, |
|
"loss": 0.0433, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"grad_norm": 0.16432423889636993, |
|
"learning_rate": 4.484304932735426e-07, |
|
"loss": 0.0311, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"step": 3568, |
|
"total_flos": 8.839521632856048e+18, |
|
"train_loss": 0.3317156129854944, |
|
"train_runtime": 7997.833, |
|
"train_samples_per_second": 14.262, |
|
"train_steps_per_second": 0.446 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3568, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 100, |
|
"total_flos": 8.839521632856048e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|