microsoft_swinv2-tiny-patch4-window8-256-batch_16_epoch_4_classes_24_final_withAug/trainer_state.json
{ | |
"best_metric": 0.2321375608444214, | |
"best_model_checkpoint": "/kaggle/working/Model/microsoft_swinv2-tiny-patch4-window8-256-batch_16_epoch_4_classes_24_final_withAug/checkpoint-4400", | |
"epoch": 4.0, | |
"eval_steps": 100, | |
"global_step": 4692, | |
"is_hyper_param_search": false, | |
"is_local_process_zero": true, | |
"is_world_process_zero": true, | |
"log_history": [ | |
{ | |
"epoch": 0.01, | |
    "grad_norm": null,
"learning_rate": 0.0002, | |
"loss": 3.2461, | |
"step": 10 | |
}, | |
{ | |
"epoch": 0.02, | |
    "grad_norm": null,
"learning_rate": 0.0002, | |
"loss": 3.2031, | |
"step": 20 | |
}, | |
{ | |
"epoch": 0.03, | |
"grad_norm": 5.058732509613037, | |
"learning_rate": 0.0001999147485080989, | |
"loss": 3.2604, | |
"step": 30 | |
}, | |
{ | |
"epoch": 0.03, | |
"grad_norm": 2.7387874126434326, | |
"learning_rate": 0.00019948849104859336, | |
"loss": 3.1794, | |
"step": 40 | |
}, | |
{ | |
"epoch": 0.04, | |
"grad_norm": 2.7913503646850586, | |
"learning_rate": 0.0001990622335890878, | |
"loss": 2.9532, | |
"step": 50 | |
}, | |
{ | |
"epoch": 0.05, | |
"grad_norm": 4.803112030029297, | |
"learning_rate": 0.00019867860187553282, | |
"loss": 2.692, | |
"step": 60 | |
}, | |
{ | |
"epoch": 0.06, | |
"grad_norm": 5.479024887084961, | |
"learning_rate": 0.0001982523444160273, | |
"loss": 2.4673, | |
"step": 70 | |
}, | |
{ | |
"epoch": 0.07, | |
"grad_norm": 5.984738826751709, | |
"learning_rate": 0.00019782608695652175, | |
"loss": 2.2248, | |
"step": 80 | |
}, | |
{ | |
"epoch": 0.08, | |
"grad_norm": 8.306194305419922, | |
"learning_rate": 0.0001973998294970162, | |
"loss": 1.9695, | |
"step": 90 | |
}, | |
{ | |
"epoch": 0.09, | |
"grad_norm": 6.3569512367248535, | |
"learning_rate": 0.00019697357203751065, | |
"loss": 1.7162, | |
"step": 100 | |
}, | |
{ | |
"epoch": 0.09, | |
"eval_accuracy": 0.7078804347826086, | |
"eval_loss": 1.4225009679794312, | |
"eval_runtime": 26.8467, | |
"eval_samples_per_second": 27.415, | |
"eval_steps_per_second": 3.427, | |
"step": 100 | |
}, | |
{ | |
"epoch": 0.09, | |
"grad_norm": 6.7053914070129395, | |
"learning_rate": 0.00019654731457800512, | |
"loss": 1.5791, | |
"step": 110 | |
}, | |
{ | |
"epoch": 0.1, | |
"grad_norm": 6.75339412689209, | |
"learning_rate": 0.00019616368286445014, | |
"loss": 1.5743, | |
"step": 120 | |
}, | |
{ | |
"epoch": 0.11, | |
"grad_norm": 6.368953704833984, | |
"learning_rate": 0.0001957374254049446, | |
"loss": 1.4221, | |
"step": 130 | |
}, | |
{ | |
"epoch": 0.12, | |
"grad_norm": 7.507160663604736, | |
"learning_rate": 0.00019531116794543904, | |
"loss": 1.4591, | |
"step": 140 | |
}, | |
{ | |
"epoch": 0.13, | |
"grad_norm": 6.467655181884766, | |
"learning_rate": 0.00019488491048593351, | |
"loss": 1.5203, | |
"step": 150 | |
}, | |
{ | |
"epoch": 0.14, | |
"grad_norm": 7.42750883102417, | |
"learning_rate": 0.00019445865302642796, | |
"loss": 1.3534, | |
"step": 160 | |
}, | |
{ | |
"epoch": 0.14, | |
"grad_norm": 8.539589881896973, | |
"learning_rate": 0.00019403239556692244, | |
"loss": 1.517, | |
"step": 170 | |
}, | |
{ | |
"epoch": 0.15, | |
"grad_norm": 8.488027572631836, | |
"learning_rate": 0.0001936061381074169, | |
"loss": 1.3041, | |
"step": 180 | |
}, | |
{ | |
"epoch": 0.16, | |
"grad_norm": 6.249123573303223, | |
"learning_rate": 0.00019317988064791134, | |
"loss": 1.178, | |
"step": 190 | |
}, | |
{ | |
"epoch": 0.17, | |
"grad_norm": 8.70413589477539, | |
"learning_rate": 0.0001927536231884058, | |
"loss": 1.2286, | |
"step": 200 | |
}, | |
{ | |
"epoch": 0.17, | |
"eval_accuracy": 0.7934782608695652, | |
"eval_loss": 0.946118175983429, | |
"eval_runtime": 22.2277, | |
"eval_samples_per_second": 33.112, | |
"eval_steps_per_second": 4.139, | |
"step": 200 | |
}, | |
{ | |
"epoch": 0.18, | |
"grad_norm": 6.648161888122559, | |
"learning_rate": 0.00019232736572890027, | |
"loss": 1.1328, | |
"step": 210 | |
}, | |
{ | |
"epoch": 0.19, | |
"grad_norm": 6.3985419273376465, | |
"learning_rate": 0.00019190110826939472, | |
"loss": 1.3468, | |
"step": 220 | |
}, | |
{ | |
"epoch": 0.2, | |
"grad_norm": 6.855421543121338, | |
"learning_rate": 0.0001914748508098892, | |
"loss": 1.121, | |
"step": 230 | |
}, | |
{ | |
"epoch": 0.2, | |
"grad_norm": 7.411104202270508, | |
"learning_rate": 0.00019104859335038364, | |
"loss": 1.1668, | |
"step": 240 | |
}, | |
{ | |
"epoch": 0.21, | |
"grad_norm": 6.608767986297607, | |
"learning_rate": 0.0001906223358908781, | |
"loss": 1.1276, | |
"step": 250 | |
}, | |
{ | |
"epoch": 0.22, | |
"grad_norm": 7.618008613586426, | |
"learning_rate": 0.00019019607843137254, | |
"loss": 1.0934, | |
"step": 260 | |
}, | |
{ | |
"epoch": 0.23, | |
"grad_norm": 8.76831340789795, | |
"learning_rate": 0.00018976982097186702, | |
"loss": 1.0502, | |
"step": 270 | |
}, | |
{ | |
"epoch": 0.24, | |
"grad_norm": 6.6039323806762695, | |
"learning_rate": 0.00018934356351236147, | |
"loss": 0.9731, | |
"step": 280 | |
}, | |
{ | |
"epoch": 0.25, | |
"grad_norm": 5.2799072265625, | |
"learning_rate": 0.00018891730605285594, | |
"loss": 0.9236, | |
"step": 290 | |
}, | |
{ | |
"epoch": 0.26, | |
"grad_norm": 8.616353988647461, | |
"learning_rate": 0.0001884910485933504, | |
"loss": 1.0323, | |
"step": 300 | |
}, | |
{ | |
"epoch": 0.26, | |
"eval_accuracy": 0.8355978260869565, | |
"eval_loss": 0.7366186380386353, | |
"eval_runtime": 22.3822, | |
"eval_samples_per_second": 32.883, | |
"eval_steps_per_second": 4.11, | |
"step": 300 | |
}, | |
{ | |
"epoch": 0.26, | |
"grad_norm": 6.827239513397217, | |
"learning_rate": 0.00018806479113384484, | |
"loss": 1.0934, | |
"step": 310 | |
}, | |
{ | |
"epoch": 0.27, | |
"grad_norm": 7.527755260467529, | |
"learning_rate": 0.0001876385336743393, | |
"loss": 1.0528, | |
"step": 320 | |
}, | |
{ | |
"epoch": 0.28, | |
"grad_norm": 6.007165431976318, | |
"learning_rate": 0.00018721227621483377, | |
"loss": 0.8891, | |
"step": 330 | |
}, | |
{ | |
"epoch": 0.29, | |
"grad_norm": 8.186945915222168, | |
"learning_rate": 0.00018678601875532822, | |
"loss": 0.965, | |
"step": 340 | |
}, | |
{ | |
"epoch": 0.3, | |
"grad_norm": 6.866254806518555, | |
"learning_rate": 0.0001863597612958227, | |
"loss": 0.8948, | |
"step": 350 | |
}, | |
{ | |
"epoch": 0.31, | |
"grad_norm": 6.580564975738525, | |
"learning_rate": 0.00018593350383631715, | |
"loss": 0.9438, | |
"step": 360 | |
}, | |
{ | |
"epoch": 0.32, | |
"grad_norm": 5.725220680236816, | |
"learning_rate": 0.0001855072463768116, | |
"loss": 0.8196, | |
"step": 370 | |
}, | |
{ | |
"epoch": 0.32, | |
"grad_norm": 7.180054664611816, | |
"learning_rate": 0.00018508098891730605, | |
"loss": 0.8645, | |
"step": 380 | |
}, | |
{ | |
"epoch": 0.33, | |
"grad_norm": 5.196268081665039, | |
"learning_rate": 0.00018469735720375109, | |
"loss": 0.8335, | |
"step": 390 | |
}, | |
{ | |
"epoch": 0.34, | |
"grad_norm": 5.930848121643066, | |
"learning_rate": 0.00018427109974424554, | |
"loss": 0.8678, | |
"step": 400 | |
}, | |
{ | |
"epoch": 0.34, | |
"eval_accuracy": 0.8627717391304348, | |
"eval_loss": 0.6210773587226868, | |
"eval_runtime": 22.1531, | |
"eval_samples_per_second": 33.223, | |
"eval_steps_per_second": 4.153, | |
"step": 400 | |
}, | |
{ | |
"epoch": 0.35, | |
"grad_norm": 7.423695087432861, | |
"learning_rate": 0.00018384484228473998, | |
"loss": 0.7715, | |
"step": 410 | |
}, | |
{ | |
"epoch": 0.36, | |
"grad_norm": 4.49029016494751, | |
"learning_rate": 0.00018341858482523443, | |
"loss": 0.6917, | |
"step": 420 | |
}, | |
{ | |
"epoch": 0.37, | |
"grad_norm": 4.553171157836914, | |
"learning_rate": 0.0001829923273657289, | |
"loss": 0.7546, | |
"step": 430 | |
}, | |
{ | |
"epoch": 0.38, | |
"grad_norm": 5.762471675872803, | |
"learning_rate": 0.00018256606990622336, | |
"loss": 0.876, | |
"step": 440 | |
}, | |
{ | |
"epoch": 0.38, | |
"grad_norm": 7.234831809997559, | |
"learning_rate": 0.00018213981244671784, | |
"loss": 0.819, | |
"step": 450 | |
}, | |
{ | |
"epoch": 0.39, | |
"grad_norm": 6.18890380859375, | |
"learning_rate": 0.0001817135549872123, | |
"loss": 0.8027, | |
"step": 460 | |
}, | |
{ | |
"epoch": 0.4, | |
"grad_norm": 5.732609272003174, | |
"learning_rate": 0.00018128729752770674, | |
"loss": 0.7294, | |
"step": 470 | |
}, | |
{ | |
"epoch": 0.41, | |
"grad_norm": 6.72756290435791, | |
"learning_rate": 0.0001808610400682012, | |
"loss": 0.8894, | |
"step": 480 | |
}, | |
{ | |
"epoch": 0.42, | |
"grad_norm": 4.72413444519043, | |
"learning_rate": 0.00018043478260869566, | |
"loss": 0.7052, | |
"step": 490 | |
}, | |
{ | |
"epoch": 0.43, | |
"grad_norm": 3.8523027896881104, | |
"learning_rate": 0.0001800085251491901, | |
"loss": 0.7849, | |
"step": 500 | |
}, | |
{ | |
"epoch": 0.43, | |
"eval_accuracy": 0.8654891304347826, | |
"eval_loss": 0.5353636741638184, | |
"eval_runtime": 22.0834, | |
"eval_samples_per_second": 33.328, | |
"eval_steps_per_second": 4.166, | |
"step": 500 | |
}, | |
{ | |
"epoch": 0.43, | |
"grad_norm": 5.818722248077393, | |
"learning_rate": 0.0001795822676896846, | |
"loss": 0.6421, | |
"step": 510 | |
}, | |
{ | |
"epoch": 0.44, | |
"grad_norm": 7.893730640411377, | |
"learning_rate": 0.00017915601023017904, | |
"loss": 0.7131, | |
"step": 520 | |
}, | |
{ | |
"epoch": 0.45, | |
"grad_norm": 6.222229957580566, | |
"learning_rate": 0.0001787297527706735, | |
"loss": 0.642, | |
"step": 530 | |
}, | |
{ | |
"epoch": 0.46, | |
"grad_norm": 6.1911139488220215, | |
"learning_rate": 0.00017830349531116794, | |
"loss": 0.7096, | |
"step": 540 | |
}, | |
{ | |
"epoch": 0.47, | |
"grad_norm": 7.124819278717041, | |
"learning_rate": 0.00017787723785166242, | |
"loss": 0.6946, | |
"step": 550 | |
}, | |
{ | |
"epoch": 0.48, | |
"grad_norm": 4.969889163970947, | |
"learning_rate": 0.00017745098039215687, | |
"loss": 0.6995, | |
"step": 560 | |
}, | |
{ | |
"epoch": 0.49, | |
"grad_norm": 4.456979751586914, | |
"learning_rate": 0.00017702472293265134, | |
"loss": 0.6533, | |
"step": 570 | |
}, | |
{ | |
"epoch": 0.49, | |
"grad_norm": 7.239711284637451, | |
"learning_rate": 0.0001765984654731458, | |
"loss": 0.755, | |
"step": 580 | |
}, | |
{ | |
"epoch": 0.5, | |
"grad_norm": 4.915600776672363, | |
"learning_rate": 0.00017617220801364024, | |
"loss": 0.6239, | |
"step": 590 | |
}, | |
{ | |
"epoch": 0.51, | |
"grad_norm": 4.89735221862793, | |
"learning_rate": 0.0001757459505541347, | |
"loss": 0.7105, | |
"step": 600 | |
}, | |
{ | |
"epoch": 0.51, | |
"eval_accuracy": 0.8899456521739131, | |
"eval_loss": 0.47926023602485657, | |
"eval_runtime": 21.8438, | |
"eval_samples_per_second": 33.694, | |
"eval_steps_per_second": 4.212, | |
"step": 600 | |
}, | |
{ | |
"epoch": 0.52, | |
"grad_norm": 4.159371376037598, | |
"learning_rate": 0.00017531969309462917, | |
"loss": 0.5948, | |
"step": 610 | |
}, | |
{ | |
"epoch": 0.53, | |
"grad_norm": 4.996119976043701, | |
"learning_rate": 0.00017489343563512362, | |
"loss": 0.6705, | |
"step": 620 | |
}, | |
{ | |
"epoch": 0.54, | |
"grad_norm": 5.600012302398682, | |
"learning_rate": 0.0001744671781756181, | |
"loss": 0.6742, | |
"step": 630 | |
}, | |
{ | |
"epoch": 0.55, | |
"grad_norm": 8.147639274597168, | |
"learning_rate": 0.00017404092071611254, | |
"loss": 0.7171, | |
"step": 640 | |
}, | |
{ | |
"epoch": 0.55, | |
"grad_norm": 4.924251556396484, | |
"learning_rate": 0.000173614663256607, | |
"loss": 0.6655, | |
"step": 650 | |
}, | |
{ | |
"epoch": 0.56, | |
"grad_norm": 4.968270301818848, | |
"learning_rate": 0.00017318840579710144, | |
"loss": 0.641, | |
"step": 660 | |
}, | |
{ | |
"epoch": 0.57, | |
"grad_norm": 6.600828647613525, | |
"learning_rate": 0.00017276214833759592, | |
"loss": 0.6549, | |
"step": 670 | |
}, | |
{ | |
"epoch": 0.58, | |
"grad_norm": 5.408564567565918, | |
"learning_rate": 0.00017233589087809037, | |
"loss": 0.6431, | |
"step": 680 | |
}, | |
{ | |
"epoch": 0.59, | |
"grad_norm": 6.0379228591918945, | |
"learning_rate": 0.00017190963341858485, | |
"loss": 0.6226, | |
"step": 690 | |
}, | |
{ | |
"epoch": 0.6, | |
"grad_norm": 6.015142917633057, | |
"learning_rate": 0.0001714833759590793, | |
"loss": 0.6198, | |
"step": 700 | |
}, | |
{ | |
"epoch": 0.6, | |
"eval_accuracy": 0.9089673913043478, | |
"eval_loss": 0.43188050389289856, | |
"eval_runtime": 21.8402, | |
"eval_samples_per_second": 33.699, | |
"eval_steps_per_second": 4.212, | |
"step": 700 | |
}, | |
{ | |
"epoch": 0.61, | |
"grad_norm": 5.850244045257568, | |
"learning_rate": 0.00017105711849957375, | |
"loss": 0.5817, | |
"step": 710 | |
}, | |
{ | |
"epoch": 0.61, | |
"grad_norm": 3.7645974159240723, | |
"learning_rate": 0.0001706308610400682, | |
"loss": 0.6146, | |
"step": 720 | |
}, | |
{ | |
"epoch": 0.62, | |
"grad_norm": 4.481291770935059, | |
"learning_rate": 0.00017020460358056267, | |
"loss": 0.6362, | |
"step": 730 | |
}, | |
{ | |
"epoch": 0.63, | |
"grad_norm": 3.906785488128662, | |
"learning_rate": 0.00016977834612105712, | |
"loss": 0.603, | |
"step": 740 | |
}, | |
{ | |
"epoch": 0.64, | |
"grad_norm": 6.198018550872803, | |
"learning_rate": 0.0001693520886615516, | |
"loss": 0.547, | |
"step": 750 | |
}, | |
{ | |
"epoch": 0.65, | |
"grad_norm": 3.281864881515503, | |
"learning_rate": 0.00016892583120204605, | |
"loss": 0.5979, | |
"step": 760 | |
}, | |
{ | |
"epoch": 0.66, | |
"grad_norm": 3.2829701900482178, | |
"learning_rate": 0.0001684995737425405, | |
"loss": 0.6047, | |
"step": 770 | |
}, | |
{ | |
"epoch": 0.66, | |
"grad_norm": 4.538403511047363, | |
"learning_rate": 0.00016807331628303495, | |
"loss": 0.6037, | |
"step": 780 | |
}, | |
{ | |
"epoch": 0.67, | |
"grad_norm": 3.8807320594787598, | |
"learning_rate": 0.00016764705882352942, | |
"loss": 0.4783, | |
"step": 790 | |
}, | |
{ | |
"epoch": 0.68, | |
"grad_norm": 5.805025100708008, | |
"learning_rate": 0.00016722080136402387, | |
"loss": 0.6276, | |
"step": 800 | |
}, | |
{ | |
"epoch": 0.68, | |
"eval_accuracy": 0.8980978260869565, | |
"eval_loss": 0.4021691083908081, | |
"eval_runtime": 22.1281, | |
"eval_samples_per_second": 33.261, | |
"eval_steps_per_second": 4.158, | |
"step": 800 | |
}, | |
{ | |
"epoch": 0.69, | |
"grad_norm": 3.921274185180664, | |
"learning_rate": 0.00016679454390451835, | |
"loss": 0.6383, | |
"step": 810 | |
}, | |
{ | |
"epoch": 0.7, | |
"grad_norm": 8.53153133392334, | |
"learning_rate": 0.0001663682864450128, | |
"loss": 0.6362, | |
"step": 820 | |
}, | |
{ | |
"epoch": 0.71, | |
"grad_norm": 3.754373073577881, | |
"learning_rate": 0.00016594202898550725, | |
"loss": 0.502, | |
"step": 830 | |
}, | |
{ | |
"epoch": 0.72, | |
"grad_norm": 5.654985427856445, | |
"learning_rate": 0.0001655157715260017, | |
"loss": 0.6056, | |
"step": 840 | |
}, | |
{ | |
"epoch": 0.72, | |
"grad_norm": 4.316379070281982, | |
"learning_rate": 0.00016508951406649618, | |
"loss": 0.5892, | |
"step": 850 | |
}, | |
{ | |
"epoch": 0.73, | |
"grad_norm": 5.630128860473633, | |
"learning_rate": 0.00016466325660699063, | |
"loss": 0.6279, | |
"step": 860 | |
}, | |
{ | |
"epoch": 0.74, | |
"grad_norm": 5.5036797523498535, | |
"learning_rate": 0.0001642369991474851, | |
"loss": 0.5664, | |
"step": 870 | |
}, | |
{ | |
"epoch": 0.75, | |
"grad_norm": 3.7804858684539795, | |
"learning_rate": 0.00016381074168797955, | |
"loss": 0.6377, | |
"step": 880 | |
}, | |
{ | |
"epoch": 0.76, | |
"grad_norm": 3.757089138031006, | |
"learning_rate": 0.000163384484228474, | |
"loss": 0.6601, | |
"step": 890 | |
}, | |
{ | |
"epoch": 0.77, | |
"grad_norm": 5.010238170623779, | |
"learning_rate": 0.00016295822676896845, | |
"loss": 0.5411, | |
"step": 900 | |
}, | |
{ | |
"epoch": 0.77, | |
"eval_accuracy": 0.9116847826086957, | |
"eval_loss": 0.38156262040138245, | |
"eval_runtime": 22.137, | |
"eval_samples_per_second": 33.248, | |
"eval_steps_per_second": 4.156, | |
"step": 900 | |
}, | |
{ | |
"epoch": 0.78, | |
"grad_norm": 4.0843987464904785, | |
"learning_rate": 0.00016253196930946293, | |
"loss": 0.5379, | |
"step": 910 | |
}, | |
{ | |
"epoch": 0.78, | |
"grad_norm": 2.8297243118286133, | |
"learning_rate": 0.00016210571184995738, | |
"loss": 0.4184, | |
"step": 920 | |
}, | |
{ | |
"epoch": 0.79, | |
"grad_norm": 5.259457111358643, | |
"learning_rate": 0.00016167945439045185, | |
"loss": 0.4892, | |
"step": 930 | |
}, | |
{ | |
"epoch": 0.8, | |
"grad_norm": 6.436057090759277, | |
"learning_rate": 0.0001612531969309463, | |
"loss": 0.5708, | |
"step": 940 | |
}, | |
{ | |
"epoch": 0.81, | |
"grad_norm": 5.9696736335754395, | |
"learning_rate": 0.00016082693947144075, | |
"loss": 0.5395, | |
"step": 950 | |
}, | |
{ | |
"epoch": 0.82, | |
"grad_norm": 5.614426136016846, | |
"learning_rate": 0.0001604006820119352, | |
"loss": 0.573, | |
"step": 960 | |
}, | |
{ | |
"epoch": 0.83, | |
"grad_norm": 3.9581282138824463, | |
"learning_rate": 0.00015997442455242968, | |
"loss": 0.5813, | |
"step": 970 | |
}, | |
{ | |
"epoch": 0.84, | |
"grad_norm": 4.684549808502197, | |
"learning_rate": 0.00015954816709292413, | |
"loss": 0.5533, | |
"step": 980 | |
}, | |
{ | |
"epoch": 0.84, | |
"grad_norm": 5.443897247314453, | |
"learning_rate": 0.0001591219096334186, | |
"loss": 0.5539, | |
"step": 990 | |
}, | |
{ | |
"epoch": 0.85, | |
"grad_norm": 4.034463882446289, | |
"learning_rate": 0.00015869565217391306, | |
"loss": 0.4984, | |
"step": 1000 | |
}, | |
{ | |
"epoch": 0.85, | |
"eval_accuracy": 0.9021739130434783, | |
"eval_loss": 0.38239946961402893, | |
"eval_runtime": 21.9989, | |
"eval_samples_per_second": 33.456, | |
"eval_steps_per_second": 4.182, | |
"step": 1000 | |
}, | |
{ | |
"epoch": 0.86, | |
"grad_norm": 3.4797463417053223, | |
"learning_rate": 0.0001582693947144075, | |
"loss": 0.5063, | |
"step": 1010 | |
}, | |
{ | |
"epoch": 0.87, | |
"grad_norm": 4.582981586456299, | |
"learning_rate": 0.00015784313725490196, | |
"loss": 0.5875, | |
"step": 1020 | |
}, | |
{ | |
"epoch": 0.88, | |
"grad_norm": 4.804072856903076, | |
"learning_rate": 0.00015741687979539643, | |
"loss": 0.5107, | |
"step": 1030 | |
}, | |
{ | |
"epoch": 0.89, | |
"grad_norm": 4.848588943481445, | |
"learning_rate": 0.00015699062233589088, | |
"loss": 0.5255, | |
"step": 1040 | |
}, | |
{ | |
"epoch": 0.9, | |
"grad_norm": 3.0660464763641357, | |
"learning_rate": 0.00015656436487638536, | |
"loss": 0.4437, | |
"step": 1050 | |
}, | |
{ | |
"epoch": 0.9, | |
"grad_norm": 5.267394065856934, | |
"learning_rate": 0.0001561381074168798, | |
"loss": 0.5443, | |
"step": 1060 | |
}, | |
{ | |
"epoch": 0.91, | |
"grad_norm": 5.676567077636719, | |
"learning_rate": 0.00015571184995737426, | |
"loss": 0.5238, | |
"step": 1070 | |
}, | |
{ | |
"epoch": 0.92, | |
"grad_norm": 4.262234210968018, | |
"learning_rate": 0.0001552855924978687, | |
"loss": 0.5148, | |
"step": 1080 | |
}, | |
{ | |
"epoch": 0.93, | |
"grad_norm": 3.7277231216430664, | |
"learning_rate": 0.00015485933503836318, | |
"loss": 0.4952, | |
"step": 1090 | |
}, | |
{ | |
"epoch": 0.94, | |
"grad_norm": 4.57068395614624, | |
"learning_rate": 0.00015443307757885763, | |
"loss": 0.5665, | |
"step": 1100 | |
}, | |
{ | |
"epoch": 0.94, | |
"eval_accuracy": 0.9211956521739131, | |
"eval_loss": 0.34602978825569153, | |
"eval_runtime": 22.2733, | |
"eval_samples_per_second": 33.044, | |
"eval_steps_per_second": 4.131, | |
"step": 1100 | |
}, | |
{ | |
"epoch": 0.95, | |
"grad_norm": 3.960221767425537, | |
"learning_rate": 0.0001540068201193521, | |
"loss": 0.4565, | |
"step": 1110 | |
}, | |
{ | |
"epoch": 0.95, | |
"grad_norm": 3.6928794384002686, | |
"learning_rate": 0.00015358056265984656, | |
"loss": 0.4239, | |
"step": 1120 | |
}, | |
{ | |
"epoch": 0.96, | |
"grad_norm": 2.560225009918213, | |
"learning_rate": 0.000153154305200341, | |
"loss": 0.5735, | |
"step": 1130 | |
}, | |
{ | |
"epoch": 0.97, | |
"grad_norm": 4.45659065246582, | |
"learning_rate": 0.00015272804774083546, | |
"loss": 0.4299, | |
"step": 1140 | |
}, | |
{ | |
"epoch": 0.98, | |
"grad_norm": 5.458113670349121, | |
"learning_rate": 0.00015230179028132994, | |
"loss": 0.4727, | |
"step": 1150 | |
}, | |
{ | |
"epoch": 0.99, | |
"grad_norm": 4.833206653594971, | |
"learning_rate": 0.0001518755328218244, | |
"loss": 0.4722, | |
"step": 1160 | |
}, | |
{ | |
"epoch": 1.0, | |
"grad_norm": 4.604361057281494, | |
"learning_rate": 0.00015144927536231886, | |
"loss": 0.5225, | |
"step": 1170 | |
}, | |
{ | |
"epoch": 1.01, | |
"grad_norm": 2.7065069675445557, | |
"learning_rate": 0.0001510230179028133, | |
"loss": 0.4548, | |
"step": 1180 | |
}, | |
{ | |
"epoch": 1.01, | |
"grad_norm": 4.440161228179932, | |
"learning_rate": 0.00015059676044330776, | |
"loss": 0.5144, | |
"step": 1190 | |
}, | |
{ | |
"epoch": 1.02, | |
"grad_norm": 4.055997371673584, | |
"learning_rate": 0.0001501705029838022, | |
"loss": 0.5741, | |
"step": 1200 | |
}, | |
{ | |
"epoch": 1.02, | |
"eval_accuracy": 0.9157608695652174, | |
"eval_loss": 0.3336072862148285, | |
"eval_runtime": 21.7333, | |
"eval_samples_per_second": 33.865, | |
"eval_steps_per_second": 4.233, | |
"step": 1200 | |
}, | |
{ | |
"epoch": 1.03, | |
"grad_norm": 3.851508617401123, | |
"learning_rate": 0.0001497442455242967, | |
"loss": 0.5196, | |
"step": 1210 | |
}, | |
{ | |
"epoch": 1.04, | |
"grad_norm": 6.436770915985107, | |
"learning_rate": 0.00014931798806479114, | |
"loss": 0.5433, | |
"step": 1220 | |
}, | |
{ | |
"epoch": 1.05, | |
"grad_norm": 4.73512601852417, | |
"learning_rate": 0.00014889173060528562, | |
"loss": 0.466, | |
"step": 1230 | |
}, | |
{ | |
"epoch": 1.06, | |
"grad_norm": 6.008419036865234, | |
"learning_rate": 0.00014846547314578007, | |
"loss": 0.4656, | |
"step": 1240 | |
}, | |
{ | |
"epoch": 1.07, | |
"grad_norm": 4.851034164428711, | |
"learning_rate": 0.00014803921568627451, | |
"loss": 0.4271, | |
"step": 1250 | |
}, | |
{ | |
"epoch": 1.07, | |
"grad_norm": 4.398035526275635, | |
"learning_rate": 0.00014761295822676896, | |
"loss": 0.4296, | |
"step": 1260 | |
}, | |
{ | |
"epoch": 1.08, | |
"grad_norm": 4.0159454345703125, | |
"learning_rate": 0.00014718670076726344, | |
"loss": 0.4648, | |
"step": 1270 | |
}, | |
{ | |
"epoch": 1.09, | |
"grad_norm": 5.387024879455566, | |
"learning_rate": 0.0001467604433077579, | |
"loss": 0.4818, | |
"step": 1280 | |
}, | |
{ | |
"epoch": 1.1, | |
"grad_norm": 4.121237277984619, | |
"learning_rate": 0.00014633418584825237, | |
"loss": 0.4996, | |
"step": 1290 | |
}, | |
{ | |
"epoch": 1.11, | |
"grad_norm": 2.9251198768615723, | |
"learning_rate": 0.00014590792838874682, | |
"loss": 0.4039, | |
"step": 1300 | |
}, | |
{ | |
"epoch": 1.11, | |
"eval_accuracy": 0.9130434782608695, | |
"eval_loss": 0.32037827372550964, | |
"eval_runtime": 22.2814, | |
"eval_samples_per_second": 33.032, | |
"eval_steps_per_second": 4.129, | |
"step": 1300 | |
}, | |
{ | |
"epoch": 1.12, | |
"grad_norm": 5.795650482177734, | |
"learning_rate": 0.00014548167092924127, | |
"loss": 0.3762, | |
"step": 1310 | |
}, | |
{ | |
"epoch": 1.13, | |
"grad_norm": 3.4435791969299316, | |
"learning_rate": 0.00014505541346973572, | |
"loss": 0.4195, | |
"step": 1320 | |
}, | |
{ | |
"epoch": 1.13, | |
"grad_norm": 5.9583563804626465, | |
"learning_rate": 0.0001446291560102302, | |
"loss": 0.4561, | |
"step": 1330 | |
}, | |
{ | |
"epoch": 1.14, | |
"grad_norm": 4.873754501342773, | |
"learning_rate": 0.00014420289855072464, | |
"loss": 0.4756, | |
"step": 1340 | |
}, | |
{ | |
"epoch": 1.15, | |
"grad_norm": 2.2037878036499023, | |
"learning_rate": 0.00014377664109121912, | |
"loss": 0.4494, | |
"step": 1350 | |
}, | |
{ | |
"epoch": 1.16, | |
"grad_norm": 3.80642032623291, | |
"learning_rate": 0.00014335038363171357, | |
"loss": 0.454, | |
"step": 1360 | |
}, | |
{ | |
"epoch": 1.17, | |
"grad_norm": 3.6881439685821533, | |
"learning_rate": 0.00014292412617220802, | |
"loss": 0.5275, | |
"step": 1370 | |
}, | |
{ | |
"epoch": 1.18, | |
"grad_norm": 3.6306328773498535, | |
"learning_rate": 0.00014249786871270247, | |
"loss": 0.3859, | |
"step": 1380 | |
}, | |
{ | |
"epoch": 1.18, | |
"grad_norm": 2.9194376468658447, | |
"learning_rate": 0.00014207161125319695, | |
"loss": 0.3732, | |
"step": 1390 | |
}, | |
{ | |
"epoch": 1.19, | |
"grad_norm": 6.103586673736572, | |
"learning_rate": 0.0001416453537936914, | |
"loss": 0.4347, | |
"step": 1400 | |
}, | |
{ | |
"epoch": 1.19, | |
"eval_accuracy": 0.9307065217391305, | |
"eval_loss": 0.3037649989128113, | |
"eval_runtime": 21.9092, | |
"eval_samples_per_second": 33.593, | |
"eval_steps_per_second": 4.199, | |
"step": 1400 | |
}, | |
{ | |
"epoch": 1.2, | |
"grad_norm": 3.9999098777770996, | |
"learning_rate": 0.00014121909633418587, | |
"loss": 0.33, | |
"step": 1410 | |
}, | |
{ | |
"epoch": 1.21, | |
"grad_norm": 4.35922384262085, | |
"learning_rate": 0.00014079283887468032, | |
"loss": 0.4913, | |
"step": 1420 | |
}, | |
{ | |
"epoch": 1.22, | |
"grad_norm": 5.2613019943237305, | |
"learning_rate": 0.00014036658141517477, | |
"loss": 0.4595, | |
"step": 1430 | |
}, | |
{ | |
"epoch": 1.23, | |
"grad_norm": 4.525338172912598, | |
"learning_rate": 0.00013994032395566922, | |
"loss": 0.4338, | |
"step": 1440 | |
}, | |
{ | |
"epoch": 1.24, | |
"grad_norm": 3.5422630310058594, | |
"learning_rate": 0.0001395140664961637, | |
"loss": 0.3084, | |
"step": 1450 | |
}, | |
{ | |
"epoch": 1.24, | |
"grad_norm": 4.331848621368408, | |
"learning_rate": 0.00013908780903665815, | |
"loss": 0.4736, | |
"step": 1460 | |
}, | |
{ | |
"epoch": 1.25, | |
"grad_norm": 3.7015225887298584, | |
"learning_rate": 0.00013866155157715262, | |
"loss": 0.4691, | |
"step": 1470 | |
}, | |
{ | |
"epoch": 1.26, | |
"grad_norm": 3.8468117713928223, | |
"learning_rate": 0.00013823529411764707, | |
"loss": 0.3247, | |
"step": 1480 | |
}, | |
{ | |
"epoch": 1.27, | |
"grad_norm": 4.560232639312744, | |
"learning_rate": 0.00013780903665814152, | |
"loss": 0.4162, | |
"step": 1490 | |
}, | |
{ | |
"epoch": 1.28, | |
"grad_norm": 4.275567531585693, | |
"learning_rate": 0.00013738277919863597, | |
"loss": 0.3639, | |
"step": 1500 | |
}, | |
{ | |
"epoch": 1.28, | |
"eval_accuracy": 0.9252717391304348, | |
"eval_loss": 0.2954687476158142, | |
"eval_runtime": 21.3813, | |
"eval_samples_per_second": 34.423, | |
"eval_steps_per_second": 4.303, | |
"step": 1500 | |
}, | |
{ | |
"epoch": 1.29, | |
"grad_norm": 4.1551513671875, | |
"learning_rate": 0.00013695652173913045, | |
"loss": 0.4241, | |
"step": 1510 | |
}, | |
{ | |
"epoch": 1.3, | |
"grad_norm": 3.172189712524414, | |
"learning_rate": 0.0001365302642796249, | |
"loss": 0.5576, | |
"step": 1520 | |
}, | |
{ | |
"epoch": 1.3, | |
"grad_norm": 4.926159858703613, | |
"learning_rate": 0.00013610400682011938, | |
"loss": 0.3687, | |
"step": 1530 | |
}, | |
{ | |
"epoch": 1.31, | |
"grad_norm": 5.438605785369873, | |
"learning_rate": 0.00013567774936061383, | |
"loss": 0.4823, | |
"step": 1540 | |
}, | |
{ | |
"epoch": 1.32, | |
"grad_norm": 4.588318347930908, | |
"learning_rate": 0.00013525149190110828, | |
"loss": 0.3816, | |
"step": 1550 | |
}, | |
{ | |
"epoch": 1.33, | |
"grad_norm": 3.6473512649536133, | |
"learning_rate": 0.00013482523444160273, | |
"loss": 0.4009, | |
"step": 1560 | |
}, | |
{ | |
"epoch": 1.34, | |
"grad_norm": 1.983258605003357, | |
"learning_rate": 0.0001343989769820972, | |
"loss": 0.412, | |
"step": 1570 | |
}, | |
{ | |
"epoch": 1.35, | |
"grad_norm": 4.585762023925781, | |
"learning_rate": 0.00013397271952259165, | |
"loss": 0.4427, | |
"step": 1580 | |
}, | |
{ | |
"epoch": 1.36, | |
"grad_norm": 3.362663984298706, | |
"learning_rate": 0.00013354646206308613, | |
"loss": 0.3946, | |
"step": 1590 | |
}, | |
{ | |
"epoch": 1.36, | |
"grad_norm": 2.671499729156494, | |
"learning_rate": 0.00013312020460358058, | |
"loss": 0.4282, | |
"step": 1600 | |
}, | |
{ | |
"epoch": 1.36, | |
"eval_accuracy": 0.9293478260869565, | |
"eval_loss": 0.29482707381248474, | |
"eval_runtime": 22.1321, | |
"eval_samples_per_second": 33.255, | |
"eval_steps_per_second": 4.157, | |
"step": 1600 | |
}, | |
{ | |
"epoch": 1.37, | |
"grad_norm": 4.811617374420166, | |
"learning_rate": 0.00013269394714407503, | |
"loss": 0.4589, | |
"step": 1610 | |
}, | |
{ | |
"epoch": 1.38, | |
"grad_norm": 3.9043726921081543, | |
"learning_rate": 0.00013226768968456948, | |
"loss": 0.4281, | |
"step": 1620 | |
}, | |
{ | |
"epoch": 1.39, | |
"grad_norm": 3.5376808643341064, | |
"learning_rate": 0.00013184143222506395, | |
"loss": 0.3975, | |
"step": 1630 | |
}, | |
{ | |
"epoch": 1.4, | |
"grad_norm": 4.834260940551758, | |
"learning_rate": 0.0001314151747655584, | |
"loss": 0.4094, | |
"step": 1640 | |
}, | |
{ | |
"epoch": 1.41, | |
"grad_norm": 4.845081806182861, | |
"learning_rate": 0.00013098891730605288, | |
"loss": 0.3806, | |
"step": 1650 | |
}, | |
{ | |
"epoch": 1.42, | |
"grad_norm": 4.995219707489014, | |
"learning_rate": 0.00013056265984654733, | |
"loss": 0.4762, | |
"step": 1660 | |
}, | |
{ | |
"epoch": 1.42, | |
"grad_norm": 4.277749061584473, | |
"learning_rate": 0.00013013640238704178, | |
"loss": 0.4923, | |
"step": 1670 | |
}, | |
{ | |
"epoch": 1.43, | |
"grad_norm": 5.35748815536499, | |
"learning_rate": 0.00012971014492753623, | |
"loss": 0.3878, | |
"step": 1680 | |
}, | |
{ | |
"epoch": 1.44, | |
"grad_norm": 3.3792693614959717, | |
"learning_rate": 0.0001292838874680307, | |
"loss": 0.369, | |
"step": 1690 | |
}, | |
{ | |
"epoch": 1.45, | |
"grad_norm": 4.180275917053223, | |
"learning_rate": 0.00012885763000852516, | |
"loss": 0.4375, | |
"step": 1700 | |
}, | |
{ | |
"epoch": 1.45, | |
"eval_accuracy": 0.9211956521739131, | |
"eval_loss": 0.2868107855319977, | |
"eval_runtime": 22.1903, | |
"eval_samples_per_second": 33.168, | |
"eval_steps_per_second": 4.146, | |
"step": 1700 | |
}, | |
{ | |
"epoch": 1.46, | |
"grad_norm": 2.387194871902466, | |
"learning_rate": 0.00012843137254901963, | |
"loss": 0.4346, | |
"step": 1710 | |
}, | |
{ | |
"epoch": 1.47, | |
"grad_norm": 1.87982177734375, | |
"learning_rate": 0.00012800511508951408, | |
"loss": 0.3212, | |
"step": 1720 | |
}, | |
{ | |
"epoch": 1.47, | |
"grad_norm": 2.5354747772216797, | |
"learning_rate": 0.00012757885763000853, | |
"loss": 0.3389, | |
"step": 1730 | |
}, | |
{ | |
"epoch": 1.48, | |
"grad_norm": 3.975893259048462, | |
"learning_rate": 0.00012715260017050298, | |
"loss": 0.422, | |
"step": 1740 | |
}, | |
{ | |
"epoch": 1.49, | |
"grad_norm": 5.26716423034668, | |
"learning_rate": 0.00012672634271099746, | |
"loss": 0.356, | |
"step": 1750 | |
}, | |
{ | |
"epoch": 1.5, | |
"grad_norm": 3.5218489170074463, | |
"learning_rate": 0.0001263000852514919, | |
"loss": 0.5009, | |
"step": 1760 | |
}, | |
{ | |
"epoch": 1.51, | |
"grad_norm": 3.965714931488037, | |
"learning_rate": 0.00012587382779198638, | |
"loss": 0.4176, | |
"step": 1770 | |
}, | |
{ | |
"epoch": 1.52, | |
"grad_norm": 4.640725612640381, | |
"learning_rate": 0.00012544757033248083, | |
"loss": 0.4312, | |
"step": 1780 | |
}, | |
{ | |
"epoch": 1.53, | |
"grad_norm": 3.008267879486084, | |
"learning_rate": 0.00012502131287297528, | |
"loss": 0.4443, | |
"step": 1790 | |
}, | |
{ | |
"epoch": 1.53, | |
"grad_norm": 2.9221649169921875, | |
"learning_rate": 0.00012459505541346973, | |
"loss": 0.3063, | |
"step": 1800 | |
}, | |
{ | |
"epoch": 1.53, | |
"eval_accuracy": 0.9334239130434783, | |
"eval_loss": 0.28607919812202454, | |
"eval_runtime": 21.4966, | |
"eval_samples_per_second": 34.238, | |
"eval_steps_per_second": 4.28, | |
"step": 1800 | |
}, | |
{ | |
"epoch": 1.54, | |
"grad_norm": 4.676169395446777, | |
"learning_rate": 0.0001241687979539642, | |
"loss": 0.4685, | |
"step": 1810 | |
}, | |
{ | |
"epoch": 1.55, | |
"grad_norm": 4.044462203979492, | |
"learning_rate": 0.00012374254049445866, | |
"loss": 0.4094, | |
"step": 1820 | |
}, | |
{ | |
"epoch": 1.56, | |
"grad_norm": 3.891371726989746, | |
"learning_rate": 0.00012331628303495314, | |
"loss": 0.4552, | |
"step": 1830 | |
}, | |
{ | |
"epoch": 1.57, | |
"grad_norm": 5.24341344833374, | |
"learning_rate": 0.0001228900255754476, | |
"loss": 0.2682, | |
"step": 1840 | |
}, | |
{ | |
"epoch": 1.58, | |
"grad_norm": 5.291731834411621, | |
"learning_rate": 0.00012246376811594204, | |
"loss": 0.4047, | |
"step": 1850 | |
}, | |
{ | |
"epoch": 1.59, | |
"grad_norm": 2.1534364223480225, | |
"learning_rate": 0.0001220375106564365, | |
"loss": 0.2985, | |
"step": 1860 | |
}, | |
{ | |
"epoch": 1.59, | |
"grad_norm": 4.051937580108643, | |
"learning_rate": 0.00012161125319693096, | |
"loss": 0.3872, | |
"step": 1870 | |
}, | |
{ | |
"epoch": 1.6, | |
"grad_norm": 4.822917461395264, | |
"learning_rate": 0.00012118499573742541, | |
"loss": 0.3532, | |
"step": 1880 | |
}, | |
{ | |
"epoch": 1.61, | |
"grad_norm": 2.520695209503174, | |
"learning_rate": 0.00012075873827791986, | |
"loss": 0.3878, | |
"step": 1890 | |
}, | |
{ | |
"epoch": 1.62, | |
"grad_norm": 1.7465733289718628, | |
"learning_rate": 0.00012033248081841433, | |
"loss": 0.3549, | |
"step": 1900 | |
}, | |
{ | |
"epoch": 1.62, | |
"eval_accuracy": 0.9293478260869565, | |
"eval_loss": 0.28257426619529724, | |
"eval_runtime": 22.1114, | |
"eval_samples_per_second": 33.286, | |
"eval_steps_per_second": 4.161, | |
"step": 1900 | |
}, | |
{ | |
"epoch": 1.63, | |
"grad_norm": 4.117008686065674, | |
"learning_rate": 0.00011990622335890877, | |
"loss": 0.345, | |
"step": 1910 | |
}, | |
{ | |
"epoch": 1.64, | |
"grad_norm": 2.7594146728515625, | |
"learning_rate": 0.00011947996589940325, | |
"loss": 0.3515, | |
"step": 1920 | |
}, | |
{ | |
"epoch": 1.65, | |
"grad_norm": 3.63336181640625, | |
"learning_rate": 0.00011905370843989769, | |
"loss": 0.4509, | |
"step": 1930 | |
}, | |
{ | |
"epoch": 1.65, | |
"grad_norm": 2.090906858444214, | |
"learning_rate": 0.00011862745098039216, | |
"loss": 0.4249, | |
"step": 1940 | |
}, | |
{ | |
"epoch": 1.66, | |
"grad_norm": 7.205491542816162, | |
"learning_rate": 0.00011820119352088661, | |
"loss": 0.3791, | |
"step": 1950 | |
}, | |
{ | |
"epoch": 1.67, | |
"grad_norm": 4.450414657592773, | |
"learning_rate": 0.00011777493606138108, | |
"loss": 0.3818, | |
"step": 1960 | |
}, | |
{ | |
"epoch": 1.68, | |
"grad_norm": 3.7030208110809326, | |
"learning_rate": 0.00011734867860187553, | |
"loss": 0.3881, | |
"step": 1970 | |
}, | |
{ | |
"epoch": 1.69, | |
"grad_norm": 3.2162046432495117, | |
"learning_rate": 0.00011692242114237, | |
"loss": 0.4438, | |
"step": 1980 | |
}, | |
{ | |
"epoch": 1.7, | |
"grad_norm": 3.964151382446289, | |
"learning_rate": 0.00011649616368286444, | |
"loss": 0.3491, | |
"step": 1990 | |
}, | |
{ | |
"epoch": 1.71, | |
"grad_norm": 6.592412948608398, | |
"learning_rate": 0.00011606990622335892, | |
"loss": 0.4326, | |
"step": 2000 | |
}, | |
{ | |
"epoch": 1.71, | |
"eval_accuracy": 0.9347826086956522, | |
"eval_loss": 0.26979920268058777, | |
"eval_runtime": 21.9026, | |
"eval_samples_per_second": 33.603, | |
"eval_steps_per_second": 4.2, | |
"step": 2000 | |
}, | |
{ | |
"epoch": 1.71, | |
"grad_norm": 3.8649954795837402, | |
"learning_rate": 0.00011564364876385337, | |
"loss": 0.4652, | |
"step": 2010 | |
}, | |
{ | |
"epoch": 1.72, | |
"grad_norm": 4.012170791625977, | |
"learning_rate": 0.00011521739130434783, | |
"loss": 0.4145, | |
"step": 2020 | |
}, | |
{ | |
"epoch": 1.73, | |
"grad_norm": 3.8874316215515137, | |
"learning_rate": 0.00011479113384484228, | |
"loss": 0.3241, | |
"step": 2030 | |
}, | |
{ | |
"epoch": 1.74, | |
"grad_norm": 3.7281954288482666, | |
"learning_rate": 0.00011436487638533676, | |
"loss": 0.3844, | |
"step": 2040 | |
}, | |
{ | |
"epoch": 1.75, | |
"grad_norm": 3.496342182159424, | |
"learning_rate": 0.00011393861892583119, | |
"loss": 0.4773, | |
"step": 2050 | |
}, | |
{ | |
"epoch": 1.76, | |
"grad_norm": 2.408202886581421, | |
"learning_rate": 0.00011351236146632567, | |
"loss": 0.44, | |
"step": 2060 | |
}, | |
{ | |
"epoch": 1.76, | |
"grad_norm": 1.211472749710083, | |
"learning_rate": 0.00011308610400682012, | |
"loss": 0.2852, | |
"step": 2070 | |
}, | |
{ | |
"epoch": 1.77, | |
"grad_norm": 3.826094627380371, | |
"learning_rate": 0.00011265984654731458, | |
"loss": 0.4446, | |
"step": 2080 | |
}, | |
{ | |
"epoch": 1.78, | |
"grad_norm": 3.4480748176574707, | |
"learning_rate": 0.00011223358908780903, | |
"loss": 0.4111, | |
"step": 2090 | |
}, | |
{ | |
"epoch": 1.79, | |
"grad_norm": 4.725152492523193, | |
"learning_rate": 0.00011180733162830351, | |
"loss": 0.3697, | |
"step": 2100 | |
}, | |
{ | |
"epoch": 1.79, | |
"eval_accuracy": 0.9279891304347826, | |
"eval_loss": 0.2602107524871826, | |
"eval_runtime": 21.9521, | |
"eval_samples_per_second": 33.528, | |
"eval_steps_per_second": 4.191, | |
"step": 2100 | |
}, | |
{ | |
"epoch": 1.8, | |
"grad_norm": 3.1105542182922363, | |
"learning_rate": 0.00011138107416879794, | |
"loss": 0.3935, | |
"step": 2110 | |
}, | |
{ | |
"epoch": 1.81, | |
"grad_norm": 4.203391075134277, | |
"learning_rate": 0.00011095481670929242, | |
"loss": 0.4265, | |
"step": 2120 | |
}, | |
{ | |
"epoch": 1.82, | |
"grad_norm": 3.371384382247925, | |
"learning_rate": 0.00011052855924978687, | |
"loss": 0.3914, | |
"step": 2130 | |
}, | |
{ | |
"epoch": 1.82, | |
"grad_norm": 4.558941841125488, | |
"learning_rate": 0.00011010230179028133, | |
"loss": 0.4196, | |
"step": 2140 | |
}, | |
{ | |
"epoch": 1.83, | |
"grad_norm": 2.8182742595672607, | |
"learning_rate": 0.00010967604433077578, | |
"loss": 0.3749, | |
"step": 2150 | |
}, | |
{ | |
"epoch": 1.84, | |
"grad_norm": 5.837876319885254, | |
"learning_rate": 0.00010924978687127026, | |
"loss": 0.3435, | |
"step": 2160 | |
}, | |
{ | |
"epoch": 1.85, | |
"grad_norm": 4.261623859405518, | |
"learning_rate": 0.0001088235294117647, | |
"loss": 0.377, | |
"step": 2170 | |
}, | |
{ | |
"epoch": 1.86, | |
"grad_norm": 3.3105015754699707, | |
"learning_rate": 0.00010839727195225917, | |
"loss": 0.3968, | |
"step": 2180 | |
}, | |
{ | |
"epoch": 1.87, | |
"grad_norm": 4.884303092956543, | |
"learning_rate": 0.00010797101449275362, | |
"loss": 0.3476, | |
"step": 2190 | |
}, | |
{ | |
"epoch": 1.88, | |
"grad_norm": 2.950490951538086, | |
"learning_rate": 0.00010754475703324809, | |
"loss": 0.3155, | |
"step": 2200 | |
}, | |
{ | |
"epoch": 1.88, | |
"eval_accuracy": 0.936141304347826, | |
"eval_loss": 0.2523466646671295, | |
"eval_runtime": 21.8751, | |
"eval_samples_per_second": 33.646, | |
"eval_steps_per_second": 4.206, | |
"step": 2200 | |
}, | |
{ | |
"epoch": 1.88, | |
"grad_norm": 2.758512258529663, | |
"learning_rate": 0.00010711849957374254, | |
"loss": 0.2808, | |
"step": 2210 | |
}, | |
{ | |
"epoch": 1.89, | |
"grad_norm": 3.4424259662628174, | |
"learning_rate": 0.00010669224211423701, | |
"loss": 0.3434, | |
"step": 2220 | |
}, | |
{ | |
"epoch": 1.9, | |
"grad_norm": 3.334716558456421, | |
"learning_rate": 0.00010626598465473145, | |
"loss": 0.4246, | |
"step": 2230 | |
}, | |
{ | |
"epoch": 1.91, | |
"grad_norm": 3.309183120727539, | |
"learning_rate": 0.00010583972719522593, | |
"loss": 0.3535, | |
"step": 2240 | |
}, | |
{ | |
"epoch": 1.92, | |
"grad_norm": 3.0825111865997314, | |
"learning_rate": 0.00010541346973572037, | |
"loss": 0.4027, | |
"step": 2250 | |
}, | |
{ | |
"epoch": 1.93, | |
"grad_norm": 4.07969856262207, | |
"learning_rate": 0.00010498721227621484, | |
"loss": 0.4758, | |
"step": 2260 | |
}, | |
{ | |
"epoch": 1.94, | |
"grad_norm": 3.2373547554016113, | |
"learning_rate": 0.00010456095481670929, | |
"loss": 0.4168, | |
"step": 2270 | |
}, | |
{ | |
"epoch": 1.94, | |
"grad_norm": 3.9943110942840576, | |
"learning_rate": 0.00010413469735720376, | |
"loss": 0.3319, | |
"step": 2280 | |
}, | |
{ | |
"epoch": 1.95, | |
"grad_norm": 4.852569103240967, | |
"learning_rate": 0.0001037084398976982, | |
"loss": 0.3544, | |
"step": 2290 | |
}, | |
{ | |
"epoch": 1.96, | |
"grad_norm": 2.613586902618408, | |
"learning_rate": 0.00010328218243819268, | |
"loss": 0.3348, | |
"step": 2300 | |
}, | |
{ | |
"epoch": 1.96, | |
"eval_accuracy": 0.9470108695652174, | |
"eval_loss": 0.2506195604801178, | |
"eval_runtime": 22.0934, | |
"eval_samples_per_second": 33.313, | |
"eval_steps_per_second": 4.164, | |
"step": 2300 | |
}, | |
{ | |
"epoch": 1.97, | |
"grad_norm": 3.5085036754608154, | |
"learning_rate": 0.00010285592497868713, | |
"loss": 0.3908, | |
"step": 2310 | |
}, | |
{ | |
"epoch": 1.98, | |
"grad_norm": 2.5577197074890137, | |
"learning_rate": 0.00010242966751918159, | |
"loss": 0.3995, | |
"step": 2320 | |
}, | |
{ | |
"epoch": 1.99, | |
"grad_norm": 3.079270124435425, | |
"learning_rate": 0.00010200341005967604, | |
"loss": 0.2755, | |
"step": 2330 | |
}, | |
{ | |
"epoch": 1.99, | |
"grad_norm": 2.5068159103393555, | |
"learning_rate": 0.00010157715260017052, | |
"loss": 0.4502, | |
"step": 2340 | |
}, | |
{ | |
"epoch": 2.0, | |
"grad_norm": 4.730781555175781, | |
"learning_rate": 0.00010115089514066495, | |
"loss": 0.3456, | |
"step": 2350 | |
}, | |
{ | |
"epoch": 2.01, | |
"grad_norm": 3.066279888153076, | |
"learning_rate": 0.00010072463768115943, | |
"loss": 0.3106, | |
"step": 2360 | |
}, | |
{ | |
"epoch": 2.02, | |
"grad_norm": 5.494949817657471, | |
"learning_rate": 0.00010029838022165388, | |
"loss": 0.4091, | |
"step": 2370 | |
}, | |
{ | |
"epoch": 2.03, | |
"grad_norm": 4.616675853729248, | |
"learning_rate": 9.987212276214834e-05, | |
"loss": 0.349, | |
"step": 2380 | |
}, | |
{ | |
"epoch": 2.04, | |
"grad_norm": 3.0624680519104004, | |
"learning_rate": 9.94458653026428e-05, | |
"loss": 0.3441, | |
"step": 2390 | |
}, | |
{ | |
"epoch": 2.05, | |
"grad_norm": 3.0316741466522217, | |
"learning_rate": 9.901960784313727e-05, | |
"loss": 0.3854, | |
"step": 2400 | |
}, | |
{ | |
"epoch": 2.05, | |
"eval_accuracy": 0.9320652173913043, | |
"eval_loss": 0.25646623969078064, | |
"eval_runtime": 21.873, | |
"eval_samples_per_second": 33.649, | |
"eval_steps_per_second": 4.206, | |
"step": 2400 | |
}, | |
{ | |
"epoch": 2.05, | |
"grad_norm": 3.854029893875122, | |
"learning_rate": 9.859335038363172e-05, | |
"loss": 0.3514, | |
"step": 2410 | |
}, | |
{ | |
"epoch": 2.06, | |
"grad_norm": 4.106563568115234, | |
"learning_rate": 9.816709292412618e-05, | |
"loss": 0.3358, | |
"step": 2420 | |
}, | |
{ | |
"epoch": 2.07, | |
"grad_norm": 3.2283833026885986, | |
"learning_rate": 9.774083546462064e-05, | |
"loss": 0.4073, | |
"step": 2430 | |
}, | |
{ | |
"epoch": 2.08, | |
"grad_norm": 3.7362446784973145, | |
"learning_rate": 9.73145780051151e-05, | |
"loss": 0.3858, | |
"step": 2440 | |
}, | |
{ | |
"epoch": 2.09, | |
"grad_norm": 2.262639284133911, | |
"learning_rate": 9.688832054560956e-05, | |
"loss": 0.3603, | |
"step": 2450 | |
}, | |
{ | |
"epoch": 2.1, | |
"grad_norm": 3.2846786975860596, | |
"learning_rate": 9.646206308610402e-05, | |
"loss": 0.3249, | |
"step": 2460 | |
}, | |
{ | |
"epoch": 2.11, | |
"grad_norm": 3.0911221504211426, | |
"learning_rate": 9.603580562659847e-05, | |
"loss": 0.3276, | |
"step": 2470 | |
}, | |
{ | |
"epoch": 2.11, | |
"grad_norm": 3.0129778385162354, | |
"learning_rate": 9.560954816709293e-05, | |
"loss": 0.3321, | |
"step": 2480 | |
}, | |
{ | |
"epoch": 2.12, | |
"grad_norm": 1.167391061782837, | |
"learning_rate": 9.51832907075874e-05, | |
"loss": 0.3583, | |
"step": 2490 | |
}, | |
{ | |
"epoch": 2.13, | |
"grad_norm": 3.6704165935516357, | |
"learning_rate": 9.475703324808185e-05, | |
"loss": 0.3951, | |
"step": 2500 | |
}, | |
{ | |
"epoch": 2.13, | |
"eval_accuracy": 0.9402173913043478, | |
"eval_loss": 0.24821878969669342, | |
"eval_runtime": 22.0497, | |
"eval_samples_per_second": 33.379, | |
"eval_steps_per_second": 4.172, | |
"step": 2500 | |
}, | |
{ | |
"epoch": 2.14, | |
"grad_norm": 4.554658889770508, | |
"learning_rate": 9.433077578857631e-05, | |
"loss": 0.4089, | |
"step": 2510 | |
}, | |
{ | |
"epoch": 2.15, | |
"grad_norm": 1.6168781518936157, | |
"learning_rate": 9.390451832907077e-05, | |
"loss": 0.381, | |
"step": 2520 | |
}, | |
{ | |
"epoch": 2.16, | |
"grad_norm": 3.4385123252868652, | |
"learning_rate": 9.347826086956522e-05, | |
"loss": 0.3826, | |
"step": 2530 | |
}, | |
{ | |
"epoch": 2.17, | |
"grad_norm": 2.3573708534240723, | |
"learning_rate": 9.305200341005969e-05, | |
"loss": 0.4197, | |
"step": 2540 | |
}, | |
{ | |
"epoch": 2.17, | |
"grad_norm": 3.4988811016082764, | |
"learning_rate": 9.262574595055415e-05, | |
"loss": 0.3862, | |
"step": 2550 | |
}, | |
{ | |
"epoch": 2.18, | |
"grad_norm": 2.5783936977386475, | |
"learning_rate": 9.21994884910486e-05, | |
"loss": 0.4243, | |
"step": 2560 | |
}, | |
{ | |
"epoch": 2.19, | |
"grad_norm": 4.15625, | |
"learning_rate": 9.177323103154306e-05, | |
"loss": 0.4088, | |
"step": 2570 | |
}, | |
{ | |
"epoch": 2.2, | |
"grad_norm": 1.7744431495666504, | |
"learning_rate": 9.134697357203753e-05, | |
"loss": 0.2933, | |
"step": 2580 | |
}, | |
{ | |
"epoch": 2.21, | |
"grad_norm": 3.8392601013183594, | |
"learning_rate": 9.092071611253197e-05, | |
"loss": 0.3433, | |
"step": 2590 | |
}, | |
{ | |
"epoch": 2.22, | |
"grad_norm": 1.8646149635314941, | |
"learning_rate": 9.049445865302644e-05, | |
"loss": 0.3531, | |
"step": 2600 | |
}, | |
{ | |
"epoch": 2.22, | |
"eval_accuracy": 0.9402173913043478, | |
"eval_loss": 0.24554158747196198, | |
"eval_runtime": 22.0249, | |
"eval_samples_per_second": 33.417, | |
"eval_steps_per_second": 4.177, | |
"step": 2600 | |
}, | |
{ | |
"epoch": 2.23, | |
"grad_norm": 2.4835286140441895, | |
"learning_rate": 9.00682011935209e-05, | |
"loss": 0.3631, | |
"step": 2610 | |
}, | |
{ | |
"epoch": 2.23, | |
"grad_norm": 4.589507102966309, | |
"learning_rate": 8.964194373401535e-05, | |
"loss": 0.2747, | |
"step": 2620 | |
}, | |
{ | |
"epoch": 2.24, | |
"grad_norm": 4.478066444396973, | |
"learning_rate": 8.921568627450981e-05, | |
"loss": 0.352, | |
"step": 2630 | |
}, | |
{ | |
"epoch": 2.25, | |
"grad_norm": 3.647719383239746, | |
"learning_rate": 8.878942881500428e-05, | |
"loss": 0.3804, | |
"step": 2640 | |
}, | |
{ | |
"epoch": 2.26, | |
"grad_norm": 3.516533613204956, | |
"learning_rate": 8.836317135549873e-05, | |
"loss": 0.3542, | |
"step": 2650 | |
}, | |
{ | |
"epoch": 2.27, | |
"grad_norm": 4.30831241607666, | |
"learning_rate": 8.793691389599319e-05, | |
"loss": 0.4834, | |
"step": 2660 | |
}, | |
{ | |
"epoch": 2.28, | |
"grad_norm": 3.8890135288238525, | |
"learning_rate": 8.751065643648765e-05, | |
"loss": 0.4297, | |
"step": 2670 | |
}, | |
{ | |
"epoch": 2.28, | |
"grad_norm": 2.649815559387207, | |
"learning_rate": 8.70843989769821e-05, | |
"loss": 0.4457, | |
"step": 2680 | |
}, | |
{ | |
"epoch": 2.29, | |
"grad_norm": 2.874537467956543, | |
"learning_rate": 8.665814151747657e-05, | |
"loss": 0.3606, | |
"step": 2690 | |
}, | |
{ | |
"epoch": 2.3, | |
"grad_norm": 3.6767563819885254, | |
"learning_rate": 8.623188405797103e-05, | |
"loss": 0.3643, | |
"step": 2700 | |
}, | |
{ | |
"epoch": 2.3, | |
"eval_accuracy": 0.9375, | |
"eval_loss": 0.25128769874572754, | |
"eval_runtime": 22.204, | |
"eval_samples_per_second": 33.147, | |
"eval_steps_per_second": 4.143, | |
"step": 2700 | |
}, | |
{ | |
"epoch": 2.31, | |
"grad_norm": 2.8548221588134766, | |
"learning_rate": 8.580562659846548e-05, | |
"loss": 0.4496, | |
"step": 2710 | |
}, | |
{ | |
"epoch": 2.32, | |
"grad_norm": 3.32646107673645, | |
"learning_rate": 8.537936913895993e-05, | |
"loss": 0.4037, | |
"step": 2720 | |
}, | |
{ | |
"epoch": 2.33, | |
"grad_norm": 4.917088508605957, | |
"learning_rate": 8.495311167945439e-05, | |
"loss": 0.3816, | |
"step": 2730 | |
}, | |
{ | |
"epoch": 2.34, | |
"grad_norm": 4.138692378997803, | |
"learning_rate": 8.452685421994884e-05, | |
"loss": 0.4166, | |
"step": 2740 | |
}, | |
{ | |
"epoch": 2.34, | |
"grad_norm": 2.5747594833374023, | |
"learning_rate": 8.41005967604433e-05, | |
"loss": 0.3947, | |
"step": 2750 | |
}, | |
{ | |
"epoch": 2.35, | |
"grad_norm": 3.0434372425079346, | |
"learning_rate": 8.367433930093777e-05, | |
"loss": 0.2987, | |
"step": 2760 | |
}, | |
{ | |
"epoch": 2.36, | |
"grad_norm": 4.922779083251953, | |
"learning_rate": 8.324808184143222e-05, | |
"loss": 0.3555, | |
"step": 2770 | |
}, | |
{ | |
"epoch": 2.37, | |
"grad_norm": 3.4687201976776123, | |
"learning_rate": 8.282182438192668e-05, | |
"loss": 0.3274, | |
"step": 2780 | |
}, | |
{ | |
"epoch": 2.38, | |
"grad_norm": 3.288496494293213, | |
"learning_rate": 8.239556692242114e-05, | |
"loss": 0.3321, | |
"step": 2790 | |
}, | |
{ | |
"epoch": 2.39, | |
"grad_norm": 3.5918378829956055, | |
"learning_rate": 8.19693094629156e-05, | |
"loss": 0.3393, | |
"step": 2800 | |
}, | |
{ | |
"epoch": 2.39, | |
"eval_accuracy": 0.9429347826086957, | |
"eval_loss": 0.24919526278972626, | |
"eval_runtime": 21.7658, | |
"eval_samples_per_second": 33.814, | |
"eval_steps_per_second": 4.227, | |
"step": 2800 | |
}, | |
{ | |
"epoch": 2.4, | |
"grad_norm": 2.7565417289733887, | |
"learning_rate": 8.154305200341006e-05, | |
"loss": 0.3414, | |
"step": 2810 | |
}, | |
{ | |
"epoch": 2.4, | |
"grad_norm": 3.525897979736328, | |
"learning_rate": 8.111679454390452e-05, | |
"loss": 0.4407, | |
"step": 2820 | |
}, | |
{ | |
"epoch": 2.41, | |
"grad_norm": 2.774212121963501, | |
"learning_rate": 8.069053708439897e-05, | |
"loss": 0.3403, | |
"step": 2830 | |
}, | |
{ | |
"epoch": 2.42, | |
"grad_norm": 3.12599515914917, | |
"learning_rate": 8.026427962489343e-05, | |
"loss": 0.3312, | |
"step": 2840 | |
}, | |
{ | |
"epoch": 2.43, | |
"grad_norm": 5.231620788574219, | |
"learning_rate": 7.98380221653879e-05, | |
"loss": 0.3958, | |
"step": 2850 | |
}, | |
{ | |
"epoch": 2.44, | |
"grad_norm": 3.423818588256836, | |
"learning_rate": 7.941176470588235e-05, | |
"loss": 0.3513, | |
"step": 2860 | |
}, | |
{ | |
"epoch": 2.45, | |
"grad_norm": 2.218860387802124, | |
"learning_rate": 7.898550724637681e-05, | |
"loss": 0.3747, | |
"step": 2870 | |
}, | |
{ | |
"epoch": 2.46, | |
"grad_norm": 4.602892875671387, | |
"learning_rate": 7.855924978687127e-05, | |
"loss": 0.4701, | |
"step": 2880 | |
}, | |
{ | |
"epoch": 2.46, | |
"grad_norm": 3.3462207317352295, | |
"learning_rate": 7.813299232736572e-05, | |
"loss": 0.3967, | |
"step": 2890 | |
}, | |
{ | |
"epoch": 2.47, | |
"grad_norm": 2.690305233001709, | |
"learning_rate": 7.770673486786019e-05, | |
"loss": 0.3635, | |
"step": 2900 | |
}, | |
{ | |
"epoch": 2.47, | |
"eval_accuracy": 0.9402173913043478, | |
"eval_loss": 0.2394125610589981, | |
"eval_runtime": 21.6739, | |
"eval_samples_per_second": 33.958, | |
"eval_steps_per_second": 4.245, | |
"step": 2900 | |
}, | |
{ | |
"epoch": 2.48, | |
"grad_norm": 3.6983702182769775, | |
"learning_rate": 7.728047740835465e-05, | |
"loss": 0.3699, | |
"step": 2910 | |
}, | |
{ | |
"epoch": 2.49, | |
"grad_norm": 3.6911110877990723, | |
"learning_rate": 7.68542199488491e-05, | |
"loss": 0.4104, | |
"step": 2920 | |
}, | |
{ | |
"epoch": 2.5, | |
"grad_norm": 2.3493428230285645, | |
"learning_rate": 7.642796248934356e-05, | |
"loss": 0.323, | |
"step": 2930 | |
}, | |
{ | |
"epoch": 2.51, | |
"grad_norm": 3.3958208560943604, | |
"learning_rate": 7.600170502983802e-05, | |
"loss": 0.3458, | |
"step": 2940 | |
}, | |
{ | |
"epoch": 2.51, | |
"grad_norm": 3.3971476554870605, | |
"learning_rate": 7.557544757033247e-05, | |
"loss": 0.3147, | |
"step": 2950 | |
}, | |
{ | |
"epoch": 2.52, | |
"grad_norm": 3.6205222606658936, | |
"learning_rate": 7.514919011082694e-05, | |
"loss": 0.4776, | |
"step": 2960 | |
}, | |
{ | |
"epoch": 2.53, | |
"grad_norm": 2.697317123413086, | |
"learning_rate": 7.47229326513214e-05, | |
"loss": 0.3806, | |
"step": 2970 | |
}, | |
{ | |
"epoch": 2.54, | |
"grad_norm": 4.160821437835693, | |
"learning_rate": 7.429667519181585e-05, | |
"loss": 0.4149, | |
"step": 2980 | |
}, | |
{ | |
"epoch": 2.55, | |
"grad_norm": 4.857287406921387, | |
"learning_rate": 7.387041773231031e-05, | |
"loss": 0.2937, | |
"step": 2990 | |
}, | |
{ | |
"epoch": 2.56, | |
"grad_norm": 3.1837852001190186, | |
"learning_rate": 7.344416027280478e-05, | |
"loss": 0.3624, | |
"step": 3000 | |
}, | |
{ | |
"epoch": 2.56, | |
"eval_accuracy": 0.938858695652174, | |
"eval_loss": 0.24252061545848846, | |
"eval_runtime": 22.0444, | |
"eval_samples_per_second": 33.387, | |
"eval_steps_per_second": 4.173, | |
"step": 3000 | |
}, | |
{ | |
"epoch": 2.57, | |
"grad_norm": 4.424566745758057, | |
"learning_rate": 7.301790281329923e-05, | |
"loss": 0.2947, | |
"step": 3010 | |
}, | |
{ | |
"epoch": 2.57, | |
"grad_norm": 4.352055549621582, | |
"learning_rate": 7.259164535379369e-05, | |
"loss": 0.408, | |
"step": 3020 | |
}, | |
{ | |
"epoch": 2.58, | |
"grad_norm": 4.074793338775635, | |
"learning_rate": 7.216538789428815e-05, | |
"loss": 0.3518, | |
"step": 3030 | |
}, | |
{ | |
"epoch": 2.59, | |
"grad_norm": 2.518249750137329, | |
"learning_rate": 7.17391304347826e-05, | |
"loss": 0.3168, | |
"step": 3040 | |
}, | |
{ | |
"epoch": 2.6, | |
"grad_norm": 3.670358180999756, | |
"learning_rate": 7.131287297527707e-05, | |
"loss": 0.3239, | |
"step": 3050 | |
}, | |
{ | |
"epoch": 2.61, | |
"grad_norm": 3.765688180923462, | |
"learning_rate": 7.088661551577153e-05, | |
"loss": 0.3946, | |
"step": 3060 | |
}, | |
{ | |
"epoch": 2.62, | |
"grad_norm": 3.1018218994140625, | |
"learning_rate": 7.046035805626598e-05, | |
"loss": 0.3853, | |
"step": 3070 | |
}, | |
{ | |
"epoch": 2.63, | |
"grad_norm": 1.9965764284133911, | |
"learning_rate": 7.003410059676044e-05, | |
"loss": 0.3506, | |
"step": 3080 | |
}, | |
{ | |
"epoch": 2.63, | |
"grad_norm": 4.730674743652344, | |
"learning_rate": 6.96078431372549e-05, | |
"loss": 0.3642, | |
"step": 3090 | |
}, | |
{ | |
"epoch": 2.64, | |
"grad_norm": 2.4963278770446777, | |
"learning_rate": 6.918158567774935e-05, | |
"loss": 0.3608, | |
"step": 3100 | |
}, | |
{ | |
"epoch": 2.64, | |
"eval_accuracy": 0.9456521739130435, | |
"eval_loss": 0.2389669418334961, | |
"eval_runtime": 22.1191, | |
"eval_samples_per_second": 33.274, | |
"eval_steps_per_second": 4.159, | |
"step": 3100 | |
}, | |
{ | |
"epoch": 2.65, | |
"grad_norm": 2.049652576446533, | |
"learning_rate": 6.875532821824382e-05, | |
"loss": 0.374, | |
"step": 3110 | |
}, | |
{ | |
"epoch": 2.66, | |
"grad_norm": 3.511080026626587, | |
"learning_rate": 6.832907075873828e-05, | |
"loss": 0.3842, | |
"step": 3120 | |
}, | |
{ | |
"epoch": 2.67, | |
"grad_norm": 4.446420192718506, | |
"learning_rate": 6.790281329923273e-05, | |
"loss": 0.288, | |
"step": 3130 | |
}, | |
{ | |
"epoch": 2.68, | |
"grad_norm": 3.2927162647247314, | |
"learning_rate": 6.74765558397272e-05, | |
"loss": 0.2999, | |
"step": 3140 | |
}, | |
{ | |
"epoch": 2.69, | |
"grad_norm": 3.2927627563476562, | |
"learning_rate": 6.705029838022166e-05, | |
"loss": 0.2792, | |
"step": 3150 | |
}, | |
{ | |
"epoch": 2.69, | |
"grad_norm": 4.26877498626709, | |
"learning_rate": 6.66240409207161e-05, | |
"loss": 0.3553, | |
"step": 3160 | |
}, | |
{ | |
"epoch": 2.7, | |
"grad_norm": 2.3921847343444824, | |
"learning_rate": 6.619778346121057e-05, | |
"loss": 0.3472, | |
"step": 3170 | |
}, | |
{ | |
"epoch": 2.71, | |
"grad_norm": 2.2627205848693848, | |
"learning_rate": 6.577152600170503e-05, | |
"loss": 0.3367, | |
"step": 3180 | |
}, | |
{ | |
"epoch": 2.72, | |
"grad_norm": 4.409063816070557, | |
"learning_rate": 6.534526854219948e-05, | |
"loss": 0.4007, | |
"step": 3190 | |
}, | |
{ | |
"epoch": 2.73, | |
"grad_norm": 2.80055570602417, | |
"learning_rate": 6.491901108269395e-05, | |
"loss": 0.3215, | |
"step": 3200 | |
}, | |
{ | |
"epoch": 2.73, | |
"eval_accuracy": 0.9320652173913043, | |
"eval_loss": 0.2482815384864807, | |
"eval_runtime": 22.058, | |
"eval_samples_per_second": 33.367, | |
"eval_steps_per_second": 4.171, | |
"step": 3200 | |
}, | |
{ | |
"epoch": 2.74, | |
"grad_norm": 2.1410982608795166, | |
"learning_rate": 6.449275362318841e-05, | |
"loss": 0.3288, | |
"step": 3210 | |
}, | |
{ | |
"epoch": 2.75, | |
"grad_norm": 1.486505389213562, | |
"learning_rate": 6.406649616368286e-05, | |
"loss": 0.3833, | |
"step": 3220 | |
}, | |
{ | |
"epoch": 2.75, | |
"grad_norm": 2.9427695274353027, | |
"learning_rate": 6.364023870417732e-05, | |
"loss": 0.3629, | |
"step": 3230 | |
}, | |
{ | |
"epoch": 2.76, | |
"grad_norm": 6.134848594665527, | |
"learning_rate": 6.321398124467179e-05, | |
"loss": 0.3732, | |
"step": 3240 | |
}, | |
{ | |
"epoch": 2.77, | |
"grad_norm": 2.851616144180298, | |
"learning_rate": 6.278772378516623e-05, | |
"loss": 0.3866, | |
"step": 3250 | |
}, | |
{ | |
"epoch": 2.78, | |
"grad_norm": 4.025167465209961, | |
"learning_rate": 6.23614663256607e-05, | |
"loss": 0.3245, | |
"step": 3260 | |
}, | |
{ | |
"epoch": 2.79, | |
"grad_norm": 6.220050811767578, | |
"learning_rate": 6.193520886615516e-05, | |
"loss": 0.3233, | |
"step": 3270 | |
}, | |
{ | |
"epoch": 2.8, | |
"grad_norm": 2.960378408432007, | |
"learning_rate": 6.150895140664961e-05, | |
"loss": 0.3752, | |
"step": 3280 | |
}, | |
{ | |
"epoch": 2.8, | |
"grad_norm": 3.8177359104156494, | |
"learning_rate": 6.108269394714407e-05, | |
"loss": 0.3581, | |
"step": 3290 | |
}, | |
{ | |
"epoch": 2.81, | |
"grad_norm": 3.3736770153045654, | |
"learning_rate": 6.065643648763854e-05, | |
"loss": 0.2971, | |
"step": 3300 | |
}, | |
{ | |
"epoch": 2.81, | |
"eval_accuracy": 0.9402173913043478, | |
"eval_loss": 0.2455403059720993, | |
"eval_runtime": 22.1061, | |
"eval_samples_per_second": 33.294, | |
"eval_steps_per_second": 4.162, | |
"step": 3300 | |
}, | |
{ | |
"epoch": 2.82, | |
"grad_norm": 5.297087669372559, | |
"learning_rate": 6.0230179028132994e-05, | |
"loss": 0.3563, | |
"step": 3310 | |
}, | |
{ | |
"epoch": 2.83, | |
"grad_norm": 2.4093313217163086, | |
"learning_rate": 5.980392156862745e-05, | |
"loss": 0.2741, | |
"step": 3320 | |
}, | |
{ | |
"epoch": 2.84, | |
"grad_norm": 2.5030243396759033, | |
"learning_rate": 5.9377664109121913e-05, | |
"loss": 0.37, | |
"step": 3330 | |
}, | |
{ | |
"epoch": 2.85, | |
"grad_norm": 3.5400922298431396, | |
"learning_rate": 5.895140664961637e-05, | |
"loss": 0.3389, | |
"step": 3340 | |
}, | |
{ | |
"epoch": 2.86, | |
"grad_norm": 2.54128098487854, | |
"learning_rate": 5.8525149190110826e-05, | |
"loss": 0.3459, | |
"step": 3350 | |
}, | |
{ | |
"epoch": 2.86, | |
"grad_norm": 3.479625701904297, | |
"learning_rate": 5.809889173060529e-05, | |
"loss": 0.4011, | |
"step": 3360 | |
}, | |
{ | |
"epoch": 2.87, | |
"grad_norm": 2.196369171142578, | |
"learning_rate": 5.7672634271099746e-05, | |
"loss": 0.4212, | |
"step": 3370 | |
}, | |
{ | |
"epoch": 2.88, | |
"grad_norm": 4.405612468719482, | |
"learning_rate": 5.72463768115942e-05, | |
"loss": 0.4886, | |
"step": 3380 | |
}, | |
{ | |
"epoch": 2.89, | |
"grad_norm": 4.257590293884277, | |
"learning_rate": 5.6820119352088666e-05, | |
"loss": 0.3434, | |
"step": 3390 | |
}, | |
{ | |
"epoch": 2.9, | |
"grad_norm": 5.413020610809326, | |
"learning_rate": 5.639386189258312e-05, | |
"loss": 0.3838, | |
"step": 3400 | |
}, | |
{ | |
"epoch": 2.9, | |
"eval_accuracy": 0.9470108695652174, | |
"eval_loss": 0.23631170392036438, | |
"eval_runtime": 22.1897, | |
"eval_samples_per_second": 33.169, | |
"eval_steps_per_second": 4.146, | |
"step": 3400 | |
}, | |
{ | |
"epoch": 2.91, | |
"grad_norm": 2.7590572834014893, | |
"learning_rate": 5.596760443307758e-05, | |
"loss": 0.3481, | |
"step": 3410 | |
}, | |
{ | |
"epoch": 2.92, | |
"grad_norm": 3.4297289848327637, | |
"learning_rate": 5.554134697357204e-05, | |
"loss": 0.3478, | |
"step": 3420 | |
}, | |
{ | |
"epoch": 2.92, | |
"grad_norm": 3.9188895225524902, | |
"learning_rate": 5.51150895140665e-05, | |
"loss": 0.4562, | |
"step": 3430 | |
}, | |
{ | |
"epoch": 2.93, | |
"grad_norm": 2.648061513900757, | |
"learning_rate": 5.4688832054560955e-05, | |
"loss": 0.2639, | |
"step": 3440 | |
}, | |
{ | |
"epoch": 2.94, | |
"grad_norm": 3.416527509689331, | |
"learning_rate": 5.426257459505542e-05, | |
"loss": 0.3647, | |
"step": 3450 | |
}, | |
{ | |
"epoch": 2.95, | |
"grad_norm": 2.152862310409546, | |
"learning_rate": 5.3836317135549874e-05, | |
"loss": 0.3866, | |
"step": 3460 | |
}, | |
{ | |
"epoch": 2.96, | |
"grad_norm": 3.239790439605713, | |
"learning_rate": 5.341005967604433e-05, | |
"loss": 0.3115, | |
"step": 3470 | |
}, | |
{ | |
"epoch": 2.97, | |
"grad_norm": 4.865437030792236, | |
"learning_rate": 5.2983802216538794e-05, | |
"loss": 0.4108, | |
"step": 3480 | |
}, | |
{ | |
"epoch": 2.98, | |
"grad_norm": 2.4955708980560303, | |
"learning_rate": 5.255754475703325e-05, | |
"loss": 0.3349, | |
"step": 3490 | |
}, | |
{ | |
"epoch": 2.98, | |
"grad_norm": 3.560739755630493, | |
"learning_rate": 5.213128729752771e-05, | |
"loss": 0.3036, | |
"step": 3500 | |
}, | |
{ | |
"epoch": 2.98, | |
"eval_accuracy": 0.9402173913043478, | |
"eval_loss": 0.2422226220369339, | |
"eval_runtime": 22.1146, | |
"eval_samples_per_second": 33.281, | |
"eval_steps_per_second": 4.16, | |
"step": 3500 | |
}, | |
{ | |
"epoch": 2.99, | |
"grad_norm": 2.6449167728424072, | |
"learning_rate": 5.170502983802217e-05, | |
"loss": 0.3553, | |
"step": 3510 | |
}, | |
{ | |
"epoch": 3.0, | |
"grad_norm": 3.7996511459350586, | |
"learning_rate": 5.1278772378516626e-05, | |
"loss": 0.2752, | |
"step": 3520 | |
}, | |
{ | |
"epoch": 3.01, | |
"grad_norm": 2.710324764251709, | |
"learning_rate": 5.085251491901108e-05, | |
"loss": 0.3815, | |
"step": 3530 | |
}, | |
{ | |
"epoch": 3.02, | |
"grad_norm": 3.8157262802124023, | |
"learning_rate": 5.0426257459505546e-05, | |
"loss": 0.3643, | |
"step": 3540 | |
}, | |
{ | |
"epoch": 3.03, | |
"grad_norm": 3.8065927028656006, | |
"learning_rate": 5e-05, | |
"loss": 0.3013, | |
"step": 3550 | |
}, | |
{ | |
"epoch": 3.03, | |
"grad_norm": 3.4787912368774414, | |
"learning_rate": 4.957374254049446e-05, | |
"loss": 0.3826, | |
"step": 3560 | |
}, | |
{ | |
"epoch": 3.04, | |
"grad_norm": 3.8217809200286865, | |
"learning_rate": 4.914748508098892e-05, | |
"loss": 0.3892, | |
"step": 3570 | |
}, | |
{ | |
"epoch": 3.05, | |
"grad_norm": 3.754225015640259, | |
"learning_rate": 4.872122762148338e-05, | |
"loss": 0.258, | |
"step": 3580 | |
}, | |
{ | |
"epoch": 3.06, | |
"grad_norm": 3.32442045211792, | |
"learning_rate": 4.8294970161977835e-05, | |
"loss": 0.2541, | |
"step": 3590 | |
}, | |
{ | |
"epoch": 3.07, | |
"grad_norm": 3.7520785331726074, | |
"learning_rate": 4.78687127024723e-05, | |
"loss": 0.401, | |
"step": 3600 | |
}, | |
{ | |
"epoch": 3.07, | |
"eval_accuracy": 0.9429347826086957, | |
"eval_loss": 0.23978637158870697, | |
"eval_runtime": 21.9597, | |
"eval_samples_per_second": 33.516, | |
"eval_steps_per_second": 4.189, | |
"step": 3600 | |
}, | |
{ | |
"epoch": 3.08, | |
"grad_norm": 2.6882004737854004, | |
"learning_rate": 4.7442455242966755e-05, | |
"loss": 0.4118, | |
"step": 3610 | |
}, | |
{ | |
"epoch": 3.09, | |
"grad_norm": 2.4365506172180176, | |
"learning_rate": 4.701619778346121e-05, | |
"loss": 0.3586, | |
"step": 3620 | |
}, | |
{ | |
"epoch": 3.09, | |
"grad_norm": 1.2782219648361206, | |
"learning_rate": 4.6589940323955674e-05, | |
"loss": 0.3192, | |
"step": 3630 | |
}, | |
{ | |
"epoch": 3.1, | |
"grad_norm": 4.13897180557251, | |
"learning_rate": 4.616368286445013e-05, | |
"loss": 0.3101, | |
"step": 3640 | |
}, | |
{ | |
"epoch": 3.11, | |
"grad_norm": 1.7676732540130615, | |
"learning_rate": 4.573742540494459e-05, | |
"loss": 0.3854, | |
"step": 3650 | |
}, | |
{ | |
"epoch": 3.12, | |
"grad_norm": 3.285656213760376, | |
"learning_rate": 4.531116794543905e-05, | |
"loss": 0.4299, | |
"step": 3660 | |
}, | |
{ | |
"epoch": 3.13, | |
"grad_norm": 2.670168876647949, | |
"learning_rate": 4.488491048593351e-05, | |
"loss": 0.2919, | |
"step": 3670 | |
}, | |
{ | |
"epoch": 3.14, | |
"grad_norm": 3.4926791191101074, | |
"learning_rate": 4.445865302642796e-05, | |
"loss": 0.356, | |
"step": 3680 | |
}, | |
{ | |
"epoch": 3.15, | |
"grad_norm": 1.534806728363037, | |
"learning_rate": 4.4032395566922426e-05, | |
"loss": 0.3157, | |
"step": 3690 | |
}, | |
{ | |
"epoch": 3.15, | |
"grad_norm": 2.911130905151367, | |
"learning_rate": 4.360613810741688e-05, | |
"loss": 0.3458, | |
"step": 3700 | |
}, | |
{ | |
"epoch": 3.15, | |
"eval_accuracy": 0.9429347826086957, | |
"eval_loss": 0.2517460286617279, | |
"eval_runtime": 22.0673, | |
"eval_samples_per_second": 33.353, | |
"eval_steps_per_second": 4.169, | |
"step": 3700 | |
}, | |
{ | |
"epoch": 3.16, | |
"grad_norm": 3.810058116912842, | |
"learning_rate": 4.317988064791134e-05, | |
"loss": 0.351, | |
"step": 3710 | |
}, | |
{ | |
"epoch": 3.17, | |
"grad_norm": 3.799814462661743, | |
"learning_rate": 4.27536231884058e-05, | |
"loss": 0.4715, | |
"step": 3720 | |
}, | |
{ | |
"epoch": 3.18, | |
"grad_norm": 2.787813663482666, | |
"learning_rate": 4.232736572890026e-05, | |
"loss": 0.3037, | |
"step": 3730 | |
}, | |
{ | |
"epoch": 3.19, | |
"grad_norm": 3.3836419582366943, | |
"learning_rate": 4.1901108269394715e-05, | |
"loss": 0.3279, | |
"step": 3740 | |
}, | |
{ | |
"epoch": 3.2, | |
"grad_norm": 3.981520652770996, | |
"learning_rate": 4.147485080988918e-05, | |
"loss": 0.4078, | |
"step": 3750 | |
}, | |
{ | |
"epoch": 3.21, | |
"grad_norm": 2.715733051300049, | |
"learning_rate": 4.1048593350383635e-05, | |
"loss": 0.3375, | |
"step": 3760 | |
}, | |
{ | |
"epoch": 3.21, | |
"grad_norm": 3.2656917572021484, | |
"learning_rate": 4.062233589087809e-05, | |
"loss": 0.3842, | |
"step": 3770 | |
}, | |
{ | |
"epoch": 3.22, | |
"grad_norm": 4.852041721343994, | |
"learning_rate": 4.0196078431372555e-05, | |
"loss": 0.3119, | |
"step": 3780 | |
}, | |
{ | |
"epoch": 3.23, | |
"grad_norm": 3.6040267944335938, | |
"learning_rate": 3.976982097186701e-05, | |
"loss": 0.3623, | |
"step": 3790 | |
}, | |
{ | |
"epoch": 3.24, | |
"grad_norm": 2.597151279449463, | |
"learning_rate": 3.934356351236147e-05, | |
"loss": 0.2908, | |
"step": 3800 | |
}, | |
{ | |
"epoch": 3.24, | |
"eval_accuracy": 0.9456521739130435, | |
"eval_loss": 0.24226675927639008, | |
"eval_runtime": 21.9179, | |
"eval_samples_per_second": 33.58, | |
"eval_steps_per_second": 4.197, | |
"step": 3800 | |
}, | |
{ | |
"epoch": 3.25, | |
"grad_norm": 4.236294269561768, | |
"learning_rate": 3.891730605285593e-05, | |
"loss": 0.3942, | |
"step": 3810 | |
}, | |
{ | |
"epoch": 3.26, | |
"grad_norm": 1.8580868244171143, | |
"learning_rate": 3.849104859335039e-05, | |
"loss": 0.2885, | |
"step": 3820 | |
}, | |
{ | |
"epoch": 3.27, | |
"grad_norm": 2.5142822265625, | |
"learning_rate": 3.8064791133844843e-05, | |
"loss": 0.322, | |
"step": 3830 | |
}, | |
{ | |
"epoch": 3.27, | |
"grad_norm": 3.2371296882629395, | |
"learning_rate": 3.763853367433931e-05, | |
"loss": 0.2543, | |
"step": 3840 | |
}, | |
{ | |
"epoch": 3.28, | |
"grad_norm": 3.4256720542907715, | |
"learning_rate": 3.721227621483376e-05, | |
"loss": 0.3408, | |
"step": 3850 | |
}, | |
{ | |
"epoch": 3.29, | |
"grad_norm": 2.5072007179260254, | |
"learning_rate": 3.678601875532822e-05, | |
"loss": 0.3094, | |
"step": 3860 | |
}, | |
{ | |
"epoch": 3.3, | |
"grad_norm": 3.4915339946746826, | |
"learning_rate": 3.635976129582268e-05, | |
"loss": 0.4899, | |
"step": 3870 | |
}, | |
{ | |
"epoch": 3.31, | |
"grad_norm": 2.5262367725372314, | |
"learning_rate": 3.593350383631714e-05, | |
"loss": 0.377, | |
"step": 3880 | |
}, | |
{ | |
"epoch": 3.32, | |
"grad_norm": 2.8197975158691406, | |
"learning_rate": 3.5507246376811596e-05, | |
"loss": 0.4031, | |
"step": 3890 | |
}, | |
{ | |
"epoch": 3.32, | |
"grad_norm": 1.553293228149414, | |
"learning_rate": 3.508098891730606e-05, | |
"loss": 0.3016, | |
"step": 3900 | |
}, | |
{ | |
"epoch": 3.32, | |
"eval_accuracy": 0.9442934782608695, | |
"eval_loss": 0.24018557369709015, | |
"eval_runtime": 21.8606, | |
"eval_samples_per_second": 33.668, | |
"eval_steps_per_second": 4.208, | |
"step": 3900 | |
}, | |
{ | |
"epoch": 3.33, | |
"grad_norm": 4.228199481964111, | |
"learning_rate": 3.4654731457800515e-05, | |
"loss": 0.3767, | |
"step": 3910 | |
}, | |
{ | |
"epoch": 3.34, | |
"grad_norm": 3.0265350341796875, | |
"learning_rate": 3.422847399829497e-05, | |
"loss": 0.3735, | |
"step": 3920 | |
}, | |
{ | |
"epoch": 3.35, | |
"grad_norm": 5.04302453994751, | |
"learning_rate": 3.3802216538789435e-05, | |
"loss": 0.3223, | |
"step": 3930 | |
}, | |
{ | |
"epoch": 3.36, | |
"grad_norm": 3.406611919403076, | |
"learning_rate": 3.337595907928389e-05, | |
"loss": 0.3978, | |
"step": 3940 | |
}, | |
{ | |
"epoch": 3.37, | |
"grad_norm": 3.3483784198760986, | |
"learning_rate": 3.294970161977835e-05, | |
"loss": 0.2903, | |
"step": 3950 | |
}, | |
{ | |
"epoch": 3.38, | |
"grad_norm": 4.2682905197143555, | |
"learning_rate": 3.252344416027281e-05, | |
"loss": 0.3356, | |
"step": 3960 | |
}, | |
{ | |
"epoch": 3.38, | |
"grad_norm": 1.5132865905761719, | |
"learning_rate": 3.209718670076726e-05, | |
"loss": 0.3522, | |
"step": 3970 | |
}, | |
{ | |
"epoch": 3.39, | |
"grad_norm": 2.6350772380828857, | |
"learning_rate": 3.1670929241261724e-05, | |
"loss": 0.3271, | |
"step": 3980 | |
}, | |
{ | |
"epoch": 3.4, | |
"grad_norm": 2.6944427490234375, | |
"learning_rate": 3.124467178175618e-05, | |
"loss": 0.2867, | |
"step": 3990 | |
}, | |
{ | |
"epoch": 3.41, | |
"grad_norm": 2.3761823177337646, | |
"learning_rate": 3.081841432225064e-05, | |
"loss": 0.2961, | |
"step": 4000 | |
}, | |
{ | |
"epoch": 3.41, | |
"eval_accuracy": 0.9456521739130435, | |
"eval_loss": 0.2413594275712967, | |
"eval_runtime": 22.1329, | |
"eval_samples_per_second": 33.254, | |
"eval_steps_per_second": 4.157, | |
"step": 4000 | |
}, | |
{ | |
"epoch": 3.42, | |
"grad_norm": 3.0776336193084717, | |
"learning_rate": 3.0392156862745097e-05, | |
"loss": 0.333, | |
"step": 4010 | |
}, | |
{ | |
"epoch": 3.43, | |
"grad_norm": 2.033477783203125, | |
"learning_rate": 2.9965899403239556e-05, | |
"loss": 0.3263, | |
"step": 4020 | |
}, | |
{ | |
"epoch": 3.44, | |
"grad_norm": 4.718287467956543, | |
"learning_rate": 2.9539641943734013e-05, | |
"loss": 0.3373, | |
"step": 4030 | |
}, | |
{ | |
"epoch": 3.44, | |
"grad_norm": 5.81247615814209, | |
"learning_rate": 2.9113384484228473e-05, | |
"loss": 0.3212, | |
"step": 4040 | |
}, | |
{ | |
"epoch": 3.45, | |
"grad_norm": 4.8096723556518555, | |
"learning_rate": 2.8687127024722932e-05, | |
"loss": 0.3458, | |
"step": 4050 | |
}, | |
{ | |
"epoch": 3.46, | |
"grad_norm": 3.833155632019043, | |
"learning_rate": 2.826086956521739e-05, | |
"loss": 0.3212, | |
"step": 4060 | |
}, | |
{ | |
"epoch": 3.47, | |
"grad_norm": 2.942125082015991, | |
"learning_rate": 2.783461210571185e-05, | |
"loss": 0.2945, | |
"step": 4070 | |
}, | |
{ | |
"epoch": 3.48, | |
"grad_norm": 5.171932697296143, | |
"learning_rate": 2.740835464620631e-05, | |
"loss": 0.3338, | |
"step": 4080 | |
}, | |
{ | |
"epoch": 3.49, | |
"grad_norm": 4.480559825897217, | |
"learning_rate": 2.6982097186700765e-05, | |
"loss": 0.3755, | |
"step": 4090 | |
}, | |
{ | |
"epoch": 3.5, | |
"grad_norm": 3.524714708328247, | |
"learning_rate": 2.6555839727195225e-05, | |
"loss": 0.3822, | |
"step": 4100 | |
}, | |
{ | |
"epoch": 3.5, | |
"eval_accuracy": 0.9415760869565217, | |
"eval_loss": 0.2412695288658142, | |
"eval_runtime": 22.1034, | |
"eval_samples_per_second": 33.298, | |
"eval_steps_per_second": 4.162, | |
"step": 4100 | |
}, | |
{ | |
"epoch": 3.5, | |
"grad_norm": 2.857635736465454, | |
"learning_rate": 2.6129582267689685e-05, | |
"loss": 0.3709, | |
"step": 4110 | |
}, | |
{ | |
"epoch": 3.51, | |
"grad_norm": 3.15679931640625, | |
"learning_rate": 2.5703324808184144e-05, | |
"loss": 0.2529, | |
"step": 4120 | |
}, | |
{ | |
"epoch": 3.52, | |
"grad_norm": 2.1586387157440186, | |
"learning_rate": 2.52770673486786e-05, | |
"loss": 0.2946, | |
"step": 4130 | |
}, | |
{ | |
"epoch": 3.53, | |
"grad_norm": 3.978802442550659, | |
"learning_rate": 2.4850809889173064e-05, | |
"loss": 0.3867, | |
"step": 4140 | |
}, | |
{ | |
"epoch": 3.54, | |
"grad_norm": 4.345022201538086, | |
"learning_rate": 2.442455242966752e-05, | |
"loss": 0.3664, | |
"step": 4150 | |
}, | |
{ | |
"epoch": 3.55, | |
"grad_norm": 3.7032830715179443, | |
"learning_rate": 2.399829497016198e-05, | |
"loss": 0.3485, | |
"step": 4160 | |
}, | |
{ | |
"epoch": 3.55, | |
"grad_norm": 3.59366774559021, | |
"learning_rate": 2.357203751065644e-05, | |
"loss": 0.3255, | |
"step": 4170 | |
}, | |
{ | |
"epoch": 3.56, | |
"grad_norm": 5.068453311920166, | |
"learning_rate": 2.3145780051150897e-05, | |
"loss": 0.3121, | |
"step": 4180 | |
}, | |
{ | |
"epoch": 3.57, | |
"grad_norm": 5.033252239227295, | |
"learning_rate": 2.2719522591645353e-05, | |
"loss": 0.368, | |
"step": 4190 | |
}, | |
{ | |
"epoch": 3.58, | |
"grad_norm": 2.29129958152771, | |
"learning_rate": 2.2293265132139813e-05, | |
"loss": 0.2596, | |
"step": 4200 | |
}, | |
{ | |
"epoch": 3.58, | |
"eval_accuracy": 0.9456521739130435, | |
"eval_loss": 0.23559102416038513, | |
"eval_runtime": 22.0843, | |
"eval_samples_per_second": 33.327, | |
"eval_steps_per_second": 4.166, | |
"step": 4200 | |
}, | |
{ | |
"epoch": 3.59, | |
"grad_norm": 3.8676092624664307, | |
"learning_rate": 2.1867007672634273e-05, | |
"loss": 0.3853, | |
"step": 4210 | |
}, | |
{ | |
"epoch": 3.6, | |
"grad_norm": 2.257540702819824, | |
"learning_rate": 2.144075021312873e-05, | |
"loss": 0.4, | |
"step": 4220 | |
}, | |
{ | |
"epoch": 3.61, | |
"grad_norm": 4.077911853790283, | |
"learning_rate": 2.101449275362319e-05, | |
"loss": 0.3717, | |
"step": 4230 | |
}, | |
{ | |
"epoch": 3.61, | |
"grad_norm": 3.7997450828552246, | |
"learning_rate": 2.058823529411765e-05, | |
"loss": 0.3895, | |
"step": 4240 | |
}, | |
{ | |
"epoch": 3.62, | |
"grad_norm": 2.1893258094787598, | |
"learning_rate": 2.0161977834612105e-05, | |
"loss": 0.3243, | |
"step": 4250 | |
}, | |
{ | |
"epoch": 3.63, | |
"grad_norm": 2.298306941986084, | |
"learning_rate": 1.9735720375106565e-05, | |
"loss": 0.3166, | |
"step": 4260 | |
}, | |
{ | |
"epoch": 3.64, | |
"grad_norm": 2.121025562286377, | |
"learning_rate": 1.9309462915601025e-05, | |
"loss": 0.2771, | |
"step": 4270 | |
}, | |
{ | |
"epoch": 3.65, | |
"grad_norm": 1.8228780031204224, | |
"learning_rate": 1.888320545609548e-05, | |
"loss": 0.2858, | |
"step": 4280 | |
}, | |
{ | |
"epoch": 3.66, | |
"grad_norm": 4.4333977699279785, | |
"learning_rate": 1.845694799658994e-05, | |
"loss": 0.3386, | |
"step": 4290 | |
}, | |
{ | |
"epoch": 3.67, | |
"grad_norm": 3.167982578277588, | |
"learning_rate": 1.80306905370844e-05, | |
"loss": 0.3064, | |
"step": 4300 | |
}, | |
{ | |
"epoch": 3.67, | |
"eval_accuracy": 0.9497282608695652, | |
"eval_loss": 0.23243670165538788, | |
"eval_runtime": 22.1685, | |
"eval_samples_per_second": 33.2, | |
"eval_steps_per_second": 4.15, | |
"step": 4300 | |
}, | |
{ | |
"epoch": 3.67, | |
"grad_norm": 3.177164316177368, | |
"learning_rate": 1.7604433077578857e-05, | |
"loss": 0.2947, | |
"step": 4310 | |
}, | |
{ | |
"epoch": 3.68, | |
"grad_norm": 4.344371318817139, | |
"learning_rate": 1.7178175618073317e-05, | |
"loss": 0.3845, | |
"step": 4320 | |
}, | |
{ | |
"epoch": 3.69, | |
"grad_norm": 2.364387035369873, | |
"learning_rate": 1.6751918158567777e-05, | |
"loss": 0.3276, | |
"step": 4330 | |
}, | |
{ | |
"epoch": 3.7, | |
"grad_norm": 4.086526870727539, | |
"learning_rate": 1.6325660699062233e-05, | |
"loss": 0.3708, | |
"step": 4340 | |
}, | |
{ | |
"epoch": 3.71, | |
"grad_norm": 4.65876579284668, | |
"learning_rate": 1.5899403239556693e-05, | |
"loss": 0.3258, | |
"step": 4350 | |
}, | |
{ | |
"epoch": 3.72, | |
"grad_norm": 4.176472187042236, | |
"learning_rate": 1.5473145780051153e-05, | |
"loss": 0.2992, | |
"step": 4360 | |
}, | |
{ | |
"epoch": 3.73, | |
"grad_norm": 3.989961624145508, | |
"learning_rate": 1.504688832054561e-05, | |
"loss": 0.3359, | |
"step": 4370 | |
}, | |
{ | |
"epoch": 3.73, | |
"grad_norm": 2.337566614151001, | |
"learning_rate": 1.462063086104007e-05, | |
"loss": 0.3066, | |
"step": 4380 | |
}, | |
{ | |
"epoch": 3.74, | |
"grad_norm": 2.9135842323303223, | |
"learning_rate": 1.4194373401534527e-05, | |
"loss": 0.2812, | |
"step": 4390 | |
}, | |
{ | |
"epoch": 3.75, | |
"grad_norm": 4.203680038452148, | |
"learning_rate": 1.3768115942028985e-05, | |
"loss": 0.3059, | |
"step": 4400 | |
}, | |
{ | |
"epoch": 3.75, | |
"eval_accuracy": 0.9456521739130435, | |
"eval_loss": 0.2321375608444214, | |
"eval_runtime": 21.9466, | |
"eval_samples_per_second": 33.536, | |
"eval_steps_per_second": 4.192, | |
"step": 4400 | |
}, | |
{ | |
"epoch": 3.76, | |
"grad_norm": 4.143373489379883, | |
"learning_rate": 1.3341858482523445e-05, | |
"loss": 0.354, | |
"step": 4410 | |
}, | |
{ | |
"epoch": 3.77, | |
"grad_norm": 2.7329490184783936, | |
"learning_rate": 1.2915601023017903e-05, | |
"loss": 0.3319, | |
"step": 4420 | |
}, | |
{ | |
"epoch": 3.78, | |
"grad_norm": 3.1333062648773193, | |
"learning_rate": 1.2489343563512362e-05, | |
"loss": 0.3421, | |
"step": 4430 | |
}, | |
{ | |
"epoch": 3.79, | |
"grad_norm": 3.976710319519043, | |
"learning_rate": 1.2063086104006821e-05, | |
"loss": 0.3263, | |
"step": 4440 | |
}, | |
{ | |
"epoch": 3.79, | |
"grad_norm": 1.6640021800994873, | |
"learning_rate": 1.163682864450128e-05, | |
"loss": 0.3224, | |
"step": 4450 | |
}, | |
{ | |
"epoch": 3.8, | |
"grad_norm": 2.7301018238067627, | |
"learning_rate": 1.121057118499574e-05, | |
"loss": 0.3694, | |
"step": 4460 | |
}, | |
{ | |
"epoch": 3.81, | |
"grad_norm": 2.9358885288238525, | |
"learning_rate": 1.0784313725490197e-05, | |
"loss": 0.3103, | |
"step": 4470 | |
}, | |
{ | |
"epoch": 3.82, | |
"grad_norm": 4.6855387687683105, | |
"learning_rate": 1.0400682011935209e-05, | |
"loss": 0.3927, | |
"step": 4480 | |
}, | |
{ | |
"epoch": 3.83, | |
"grad_norm": 2.2591495513916016, | |
"learning_rate": 9.974424552429668e-06, | |
"loss": 0.4017, | |
"step": 4490 | |
}, | |
{ | |
"epoch": 3.84, | |
"grad_norm": 2.375493049621582, | |
"learning_rate": 9.548167092924126e-06, | |
"loss": 0.42, | |
"step": 4500 | |
}, | |
{ | |
"epoch": 3.84, | |
"eval_accuracy": 0.9402173913043478, | |
"eval_loss": 0.25556182861328125, | |
"eval_runtime": 22.225, | |
"eval_samples_per_second": 33.116, | |
"eval_steps_per_second": 4.139, | |
"step": 4500 | |
}, | |
{ | |
"epoch": 3.84, | |
"grad_norm": 2.5054142475128174, | |
"learning_rate": 9.121909633418585e-06, | |
"loss": 0.3716, | |
"step": 4510 | |
}, | |
{ | |
"epoch": 3.85, | |
"grad_norm": 3.321662425994873, | |
"learning_rate": 8.695652173913044e-06, | |
"loss": 0.3185, | |
"step": 4520 | |
}, | |
{ | |
"epoch": 3.86, | |
"grad_norm": 2.8269574642181396, | |
"learning_rate": 8.269394714407503e-06, | |
"loss": 0.2985, | |
"step": 4530 | |
}, | |
{ | |
"epoch": 3.87, | |
"grad_norm": 3.426715612411499, | |
"learning_rate": 7.84313725490196e-06, | |
"loss": 0.3068, | |
"step": 4540 | |
}, | |
{ | |
"epoch": 3.88, | |
"grad_norm": 6.179994583129883, | |
"learning_rate": 7.41687979539642e-06, | |
"loss": 0.411, | |
"step": 4550 | |
}, | |
{ | |
"epoch": 3.89, | |
"grad_norm": 3.369870662689209, | |
"learning_rate": 6.990622335890879e-06, | |
"loss": 0.3128, | |
"step": 4560 | |
}, | |
{ | |
"epoch": 3.9, | |
"grad_norm": 4.271053791046143, | |
"learning_rate": 6.564364876385337e-06, | |
"loss": 0.3924, | |
"step": 4570 | |
}, | |
{ | |
"epoch": 3.9, | |
"grad_norm": 3.3079607486724854, | |
"learning_rate": 6.138107416879796e-06, | |
"loss": 0.34, | |
"step": 4580 | |
}, | |
{ | |
"epoch": 3.91, | |
"grad_norm": 6.019475936889648, | |
"learning_rate": 5.711849957374255e-06, | |
"loss": 0.3955, | |
"step": 4590 | |
}, | |
{ | |
"epoch": 3.92, | |
"grad_norm": 3.133575916290283, | |
"learning_rate": 5.285592497868714e-06, | |
"loss": 0.2959, | |
"step": 4600 | |
}, | |
{ | |
"epoch": 3.92, | |
"eval_accuracy": 0.9415760869565217, | |
"eval_loss": 0.24908488988876343, | |
"eval_runtime": 22.0463, | |
"eval_samples_per_second": 33.384, | |
"eval_steps_per_second": 4.173, | |
"step": 4600 | |
}, | |
{ | |
"epoch": 3.93, | |
"grad_norm": 4.321446895599365, | |
"learning_rate": 4.859335038363171e-06, | |
"loss": 0.4081, | |
"step": 4610 | |
}, | |
{ | |
"epoch": 3.94, | |
"grad_norm": 4.398526191711426, | |
"learning_rate": 4.43307757885763e-06, | |
"loss": 0.289, | |
"step": 4620 | |
}, | |
{ | |
"epoch": 3.95, | |
"grad_norm": 3.48781681060791, | |
"learning_rate": 4.006820119352089e-06, | |
"loss": 0.311, | |
"step": 4630 | |
}, | |
{ | |
"epoch": 3.96, | |
"grad_norm": 3.4008705615997314, | |
"learning_rate": 3.5805626598465474e-06, | |
"loss": 0.3046, | |
"step": 4640 | |
}, | |
{ | |
"epoch": 3.96, | |
"grad_norm": 3.372878074645996, | |
"learning_rate": 3.154305200341006e-06, | |
"loss": 0.4128, | |
"step": 4650 | |
}, | |
{ | |
"epoch": 3.97, | |
"grad_norm": 4.755558967590332, | |
"learning_rate": 2.728047740835465e-06, | |
"loss": 0.3619, | |
"step": 4660 | |
}, | |
{ | |
"epoch": 3.98, | |
"grad_norm": 2.060793161392212, | |
"learning_rate": 2.3017902813299235e-06, | |
"loss": 0.2876, | |
"step": 4670 | |
}, | |
{ | |
"epoch": 3.99, | |
"grad_norm": 4.377244472503662, | |
"learning_rate": 1.875532821824382e-06, | |
"loss": 0.3278, | |
"step": 4680 | |
}, | |
{ | |
"epoch": 4.0, | |
"grad_norm": 3.747706651687622, | |
"learning_rate": 1.4492753623188406e-06, | |
"loss": 0.3186, | |
"step": 4690 | |
}, | |
{ | |
"epoch": 4.0, | |
"step": 4692, | |
"total_flos": 1.870424802038661e+18, | |
"train_loss": 0.5094418521630693, | |
"train_runtime": 3760.9881, | |
"train_samples_per_second": 19.953, | |
"train_steps_per_second": 1.248 | |
} | |
], | |
"logging_steps": 10, | |
"max_steps": 4692, | |
"num_input_tokens_seen": 0, | |
"num_train_epochs": 4, | |
"save_steps": 100, | |
"total_flos": 1.870424802038661e+18, | |
"train_batch_size": 16, | |
"trial_name": null, | |
"trial_params": null | |
} | |