|
{
  "best_metric": 0.9789719626168224,
  "best_model_checkpoint": "vit-base-patch16-224-in21k-finetuned-lora-medmnistv2/checkpoint-1870",
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 1870,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05,
      "grad_norm": 0.9260491132736206,
      "learning_rate": 0.004973262032085562,
      "loss": 1.5983,
      "step": 10
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.1307735443115234,
      "learning_rate": 0.004946524064171123,
      "loss": 0.9417,
      "step": 20
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.9537946581840515,
      "learning_rate": 0.004919786096256685,
      "loss": 0.7642,
      "step": 30
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.8597701191902161,
      "learning_rate": 0.004893048128342246,
      "loss": 0.6992,
      "step": 40
    },
    {
      "epoch": 0.27,
      "grad_norm": 1.104675531387329,
      "learning_rate": 0.004866310160427808,
      "loss": 0.627,
      "step": 50
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.846555233001709,
      "learning_rate": 0.004839572192513369,
      "loss": 0.5047,
      "step": 60
    },
    {
      "epoch": 0.37,
      "grad_norm": 1.423182487487793,
      "learning_rate": 0.004812834224598931,
      "loss": 0.5431,
      "step": 70
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.8424627780914307,
      "learning_rate": 0.004786096256684492,
      "loss": 0.5962,
      "step": 80
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.6608781814575195,
      "learning_rate": 0.004759358288770054,
      "loss": 0.4084,
      "step": 90
    },
    {
      "epoch": 0.53,
      "grad_norm": 1.130247712135315,
      "learning_rate": 0.004732620320855615,
      "loss": 0.4932,
      "step": 100
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.6054658889770508,
      "learning_rate": 0.004705882352941177,
      "loss": 0.4684,
      "step": 110
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.8725093603134155,
      "learning_rate": 0.004679144385026738,
      "loss": 0.4429,
      "step": 120
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.6343618035316467,
      "learning_rate": 0.0046524064171123,
      "loss": 0.3952,
      "step": 130
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.9175045490264893,
      "learning_rate": 0.0046256684491978615,
      "loss": 0.4592,
      "step": 140
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.0295114517211914,
      "learning_rate": 0.004598930481283423,
      "loss": 0.4212,
      "step": 150
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.4232007563114166,
      "learning_rate": 0.004572192513368984,
      "loss": 0.4165,
      "step": 160
    },
    {
      "epoch": 0.91,
      "grad_norm": 1.18360435962677,
      "learning_rate": 0.00454812834224599,
      "loss": 0.4245,
      "step": 170
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.7265322804450989,
      "learning_rate": 0.004521390374331551,
      "loss": 0.4059,
      "step": 180
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.9310747663551402,
      "eval_f1": 0.9201346862223367,
      "eval_loss": 0.18775394558906555,
      "eval_precision": 0.913178148427007,
      "eval_recall": 0.9327948208695145,
      "eval_runtime": 9.5068,
      "eval_samples_per_second": 180.082,
      "eval_steps_per_second": 11.255,
      "step": 187
    },
    {
      "epoch": 1.02,
      "grad_norm": 1.179917335510254,
      "learning_rate": 0.004494652406417113,
      "loss": 0.3646,
      "step": 190
    },
    {
      "epoch": 1.07,
      "grad_norm": 1.1189391613006592,
      "learning_rate": 0.004467914438502674,
      "loss": 0.4339,
      "step": 200
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.8059839010238647,
      "learning_rate": 0.004441176470588235,
      "loss": 0.373,
      "step": 210
    },
    {
      "epoch": 1.18,
      "grad_norm": 1.5934990644454956,
      "learning_rate": 0.004414438502673797,
      "loss": 0.4089,
      "step": 220
    },
    {
      "epoch": 1.23,
      "grad_norm": 0.5738559365272522,
      "learning_rate": 0.004387700534759359,
      "loss": 0.4181,
      "step": 230
    },
    {
      "epoch": 1.28,
      "grad_norm": 1.0053284168243408,
      "learning_rate": 0.00436096256684492,
      "loss": 0.354,
      "step": 240
    },
    {
      "epoch": 1.34,
      "grad_norm": 0.6736829280853271,
      "learning_rate": 0.004334224598930481,
      "loss": 0.2862,
      "step": 250
    },
    {
      "epoch": 1.39,
      "grad_norm": 0.7684084177017212,
      "learning_rate": 0.0043074866310160425,
      "loss": 0.3533,
      "step": 260
    },
    {
      "epoch": 1.44,
      "grad_norm": 1.04612135887146,
      "learning_rate": 0.004280748663101605,
      "loss": 0.3654,
      "step": 270
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.7823394536972046,
      "learning_rate": 0.004254010695187166,
      "loss": 0.4385,
      "step": 280
    },
    {
      "epoch": 1.55,
      "grad_norm": 0.9472429752349854,
      "learning_rate": 0.004227272727272727,
      "loss": 0.4417,
      "step": 290
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.889252245426178,
      "learning_rate": 0.004200534759358289,
      "loss": 0.3873,
      "step": 300
    },
    {
      "epoch": 1.66,
      "grad_norm": 0.7252718806266785,
      "learning_rate": 0.00417379679144385,
      "loss": 0.3717,
      "step": 310
    },
    {
      "epoch": 1.71,
      "grad_norm": 0.8687788844108582,
      "learning_rate": 0.004147058823529412,
      "loss": 0.3854,
      "step": 320
    },
    {
      "epoch": 1.76,
      "grad_norm": 0.6197172999382019,
      "learning_rate": 0.004122994652406417,
      "loss": 0.3748,
      "step": 330
    },
    {
      "epoch": 1.82,
      "grad_norm": 0.6506063342094421,
      "learning_rate": 0.004096256684491978,
      "loss": 0.2923,
      "step": 340
    },
    {
      "epoch": 1.87,
      "grad_norm": 0.5267966389656067,
      "learning_rate": 0.00406951871657754,
      "loss": 0.4045,
      "step": 350
    },
    {
      "epoch": 1.93,
      "grad_norm": 1.1251919269561768,
      "learning_rate": 0.004042780748663102,
      "loss": 0.3988,
      "step": 360
    },
    {
      "epoch": 1.98,
      "grad_norm": 1.114890456199646,
      "learning_rate": 0.004016042780748663,
      "loss": 0.3796,
      "step": 370
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.9082943925233645,
      "eval_f1": 0.886066241884805,
      "eval_loss": 0.27294662594795227,
      "eval_precision": 0.9131012141299326,
      "eval_recall": 0.887497540228883,
      "eval_runtime": 9.2331,
      "eval_samples_per_second": 185.419,
      "eval_steps_per_second": 11.589,
      "step": 374
    },
    {
      "epoch": 2.03,
      "grad_norm": 0.7117612361907959,
      "learning_rate": 0.003989304812834224,
      "loss": 0.3724,
      "step": 380
    },
    {
      "epoch": 2.09,
      "grad_norm": 0.9159232974052429,
      "learning_rate": 0.00396524064171123,
      "loss": 0.3155,
      "step": 390
    },
    {
      "epoch": 2.14,
      "grad_norm": 0.6797966957092285,
      "learning_rate": 0.003938502673796792,
      "loss": 0.3531,
      "step": 400
    },
    {
      "epoch": 2.19,
      "grad_norm": 0.912696361541748,
      "learning_rate": 0.003911764705882353,
      "loss": 0.2788,
      "step": 410
    },
    {
      "epoch": 2.25,
      "grad_norm": 1.0336519479751587,
      "learning_rate": 0.0038850267379679144,
      "loss": 0.3692,
      "step": 420
    },
    {
      "epoch": 2.3,
      "grad_norm": 0.8013398051261902,
      "learning_rate": 0.003858288770053476,
      "loss": 0.3561,
      "step": 430
    },
    {
      "epoch": 2.35,
      "grad_norm": 0.6950948238372803,
      "learning_rate": 0.003831550802139038,
      "loss": 0.3295,
      "step": 440
    },
    {
      "epoch": 2.41,
      "grad_norm": 0.7441625595092773,
      "learning_rate": 0.003804812834224599,
      "loss": 0.3285,
      "step": 450
    },
    {
      "epoch": 2.46,
      "grad_norm": 4.745124816894531,
      "learning_rate": 0.0037780748663101605,
      "loss": 0.4162,
      "step": 460
    },
    {
      "epoch": 2.51,
      "grad_norm": 1.3873414993286133,
      "learning_rate": 0.003751336898395722,
      "loss": 0.3424,
      "step": 470
    },
    {
      "epoch": 2.57,
      "grad_norm": 0.7891167402267456,
      "learning_rate": 0.0037272727272727275,
      "loss": 0.3043,
      "step": 480
    },
    {
      "epoch": 2.62,
      "grad_norm": 1.013873815536499,
      "learning_rate": 0.003700534759358289,
      "loss": 0.3754,
      "step": 490
    },
    {
      "epoch": 2.67,
      "grad_norm": 0.9377150535583496,
      "learning_rate": 0.00367379679144385,
      "loss": 0.3675,
      "step": 500
    },
    {
      "epoch": 2.73,
      "grad_norm": 2.7368648052215576,
      "learning_rate": 0.0036470588235294117,
      "loss": 0.2901,
      "step": 510
    },
    {
      "epoch": 2.78,
      "grad_norm": 1.5487793684005737,
      "learning_rate": 0.0036203208556149736,
      "loss": 0.482,
      "step": 520
    },
    {
      "epoch": 2.83,
      "grad_norm": 8.680522918701172,
      "learning_rate": 0.003593582887700535,
      "loss": 0.378,
      "step": 530
    },
    {
      "epoch": 2.89,
      "grad_norm": 1.3777785301208496,
      "learning_rate": 0.0035668449197860962,
      "loss": 0.4919,
      "step": 540
    },
    {
      "epoch": 2.94,
      "grad_norm": 2.1192550659179688,
      "learning_rate": 0.0035401069518716578,
      "loss": 0.3751,
      "step": 550
    },
    {
      "epoch": 2.99,
      "grad_norm": 9.656478881835938,
      "learning_rate": 0.0035133689839572193,
      "loss": 0.424,
      "step": 560
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.866822429906542,
      "eval_f1": 0.8491520459723211,
      "eval_loss": 0.3701097071170807,
      "eval_precision": 0.8797339861417046,
      "eval_recall": 0.8520089249800192,
      "eval_runtime": 9.219,
      "eval_samples_per_second": 185.702,
      "eval_steps_per_second": 11.606,
      "step": 561
    },
    {
      "epoch": 3.05,
      "grad_norm": 1.5421924591064453,
      "learning_rate": 0.0034866310160427804,
      "loss": 0.4643,
      "step": 570
    },
    {
      "epoch": 3.1,
      "grad_norm": 0.9370782375335693,
      "learning_rate": 0.0034598930481283424,
      "loss": 0.4274,
      "step": 580
    },
    {
      "epoch": 3.16,
      "grad_norm": 1.6456141471862793,
      "learning_rate": 0.003433155080213904,
      "loss": 0.3616,
      "step": 590
    },
    {
      "epoch": 3.21,
      "grad_norm": 1.2138258218765259,
      "learning_rate": 0.0034064171122994654,
      "loss": 0.4241,
      "step": 600
    },
    {
      "epoch": 3.26,
      "grad_norm": 0.8959400057792664,
      "learning_rate": 0.0033796791443850265,
      "loss": 0.3392,
      "step": 610
    },
    {
      "epoch": 3.32,
      "grad_norm": 0.8747026324272156,
      "learning_rate": 0.003352941176470588,
      "loss": 0.3533,
      "step": 620
    },
    {
      "epoch": 3.37,
      "grad_norm": 1.7161656618118286,
      "learning_rate": 0.00332620320855615,
      "loss": 0.3407,
      "step": 630
    },
    {
      "epoch": 3.42,
      "grad_norm": 0.9229569435119629,
      "learning_rate": 0.0032994652406417115,
      "loss": 0.3098,
      "step": 640
    },
    {
      "epoch": 3.48,
      "grad_norm": 0.9468969702720642,
      "learning_rate": 0.0032727272727272726,
      "loss": 0.3896,
      "step": 650
    },
    {
      "epoch": 3.53,
      "grad_norm": 1.4430208206176758,
      "learning_rate": 0.003245989304812834,
      "loss": 0.3395,
      "step": 660
    },
    {
      "epoch": 3.58,
      "grad_norm": 1.20052969455719,
      "learning_rate": 0.0032192513368983957,
      "loss": 0.3448,
      "step": 670
    },
    {
      "epoch": 3.64,
      "grad_norm": 1.1726669073104858,
      "learning_rate": 0.0031925133689839577,
      "loss": 0.342,
      "step": 680
    },
    {
      "epoch": 3.69,
      "grad_norm": 0.7881722450256348,
      "learning_rate": 0.0031657754010695188,
      "loss": 0.301,
      "step": 690
    },
    {
      "epoch": 3.74,
      "grad_norm": 0.7960072159767151,
      "learning_rate": 0.0031390374331550803,
      "loss": 0.2633,
      "step": 700
    },
    {
      "epoch": 3.8,
      "grad_norm": 0.964872419834137,
      "learning_rate": 0.003112299465240642,
      "loss": 0.2691,
      "step": 710
    },
    {
      "epoch": 3.85,
      "grad_norm": 0.9894037246704102,
      "learning_rate": 0.003085561497326203,
      "loss": 0.2859,
      "step": 720
    },
    {
      "epoch": 3.9,
      "grad_norm": 1.0027267932891846,
      "learning_rate": 0.003058823529411765,
      "loss": 0.3027,
      "step": 730
    },
    {
      "epoch": 3.96,
      "grad_norm": 1.0325654745101929,
      "learning_rate": 0.0030320855614973264,
      "loss": 0.3141,
      "step": 740
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.9380841121495327,
      "eval_f1": 0.9283105641226367,
      "eval_loss": 0.18485769629478455,
      "eval_precision": 0.9266830676466586,
      "eval_recall": 0.9336478146798447,
      "eval_runtime": 9.3787,
      "eval_samples_per_second": 182.542,
      "eval_steps_per_second": 11.409,
      "step": 748
    },
    {
      "epoch": 4.01,
      "grad_norm": 1.263634443283081,
      "learning_rate": 0.003005347593582888,
      "loss": 0.3592,
      "step": 750
    },
    {
      "epoch": 4.06,
      "grad_norm": 1.8158007860183716,
      "learning_rate": 0.002978609625668449,
      "loss": 0.364,
      "step": 760
    },
    {
      "epoch": 4.12,
      "grad_norm": 0.9459696412086487,
      "learning_rate": 0.0029518716577540106,
      "loss": 0.3587,
      "step": 770
    },
    {
      "epoch": 4.17,
      "grad_norm": 0.7624779343605042,
      "learning_rate": 0.0029251336898395725,
      "loss": 0.304,
      "step": 780
    },
    {
      "epoch": 4.22,
      "grad_norm": 0.8625235557556152,
      "learning_rate": 0.002898395721925134,
      "loss": 0.2726,
      "step": 790
    },
    {
      "epoch": 4.28,
      "grad_norm": 0.962257444858551,
      "learning_rate": 0.002871657754010695,
      "loss": 0.2601,
      "step": 800
    },
    {
      "epoch": 4.33,
      "grad_norm": 0.6333624720573425,
      "learning_rate": 0.0028449197860962567,
      "loss": 0.3448,
      "step": 810
    },
    {
      "epoch": 4.39,
      "grad_norm": 1.3983910083770752,
      "learning_rate": 0.002818181818181818,
      "loss": 0.3202,
      "step": 820
    },
    {
      "epoch": 4.44,
      "grad_norm": 0.6626348495483398,
      "learning_rate": 0.00279144385026738,
      "loss": 0.2529,
      "step": 830
    },
    {
      "epoch": 4.49,
      "grad_norm": 0.8221544027328491,
      "learning_rate": 0.0027647058823529413,
      "loss": 0.2523,
      "step": 840
    },
    {
      "epoch": 4.55,
      "grad_norm": 0.7872591018676758,
      "learning_rate": 0.002737967914438503,
      "loss": 0.2832,
      "step": 850
    },
    {
      "epoch": 4.6,
      "grad_norm": 1.50129234790802,
      "learning_rate": 0.0027112299465240643,
      "loss": 0.2912,
      "step": 860
    },
    {
      "epoch": 4.65,
      "grad_norm": 0.7471727728843689,
      "learning_rate": 0.0026844919786096254,
      "loss": 0.3097,
      "step": 870
    },
    {
      "epoch": 4.71,
      "grad_norm": 0.6078329086303711,
      "learning_rate": 0.002657754010695187,
      "loss": 0.2657,
      "step": 880
    },
    {
      "epoch": 4.76,
      "grad_norm": 0.8674110174179077,
      "learning_rate": 0.002631016042780749,
      "loss": 0.2633,
      "step": 890
    },
    {
      "epoch": 4.81,
      "grad_norm": 0.5421575307846069,
      "learning_rate": 0.0026042780748663104,
      "loss": 0.257,
      "step": 900
    },
    {
      "epoch": 4.87,
      "grad_norm": 1.314867377281189,
      "learning_rate": 0.0025775401069518715,
      "loss": 0.2688,
      "step": 910
    },
    {
      "epoch": 4.92,
      "grad_norm": 0.698221743106842,
      "learning_rate": 0.002550802139037433,
      "loss": 0.2506,
      "step": 920
    },
    {
      "epoch": 4.97,
      "grad_norm": 0.5437451004981995,
      "learning_rate": 0.0025240641711229946,
      "loss": 0.2553,
      "step": 930
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.9643691588785047,
      "eval_f1": 0.9617344813251135,
      "eval_loss": 0.1074606254696846,
      "eval_precision": 0.9630090863077152,
      "eval_recall": 0.9611619604560873,
      "eval_runtime": 9.213,
      "eval_samples_per_second": 185.824,
      "eval_steps_per_second": 11.614,
      "step": 935
    },
    {
      "epoch": 5.03,
      "grad_norm": 0.9639925956726074,
      "learning_rate": 0.002497326203208556,
      "loss": 0.2186,
      "step": 940
    },
    {
      "epoch": 5.08,
      "grad_norm": 1.0346194505691528,
      "learning_rate": 0.0024705882352941176,
      "loss": 0.3163,
      "step": 950
    },
    {
      "epoch": 5.13,
      "grad_norm": 0.9101438522338867,
      "learning_rate": 0.002443850267379679,
      "loss": 0.257,
      "step": 960
    },
    {
      "epoch": 5.19,
      "grad_norm": 0.9387779831886292,
      "learning_rate": 0.0024171122994652407,
      "loss": 0.2745,
      "step": 970
    },
    {
      "epoch": 5.24,
      "grad_norm": 1.3407084941864014,
      "learning_rate": 0.0023903743315508022,
      "loss": 0.2775,
      "step": 980
    },
    {
      "epoch": 5.29,
      "grad_norm": 0.7988283038139343,
      "learning_rate": 0.0023636363636363638,
      "loss": 0.2568,
      "step": 990
    },
    {
      "epoch": 5.35,
      "grad_norm": 0.8980028033256531,
      "learning_rate": 0.0023368983957219253,
      "loss": 0.296,
      "step": 1000
    },
    {
      "epoch": 5.4,
      "grad_norm": 0.8847124576568604,
      "learning_rate": 0.002310160427807487,
      "loss": 0.2525,
      "step": 1010
    },
    {
      "epoch": 5.45,
      "grad_norm": 1.3140696287155151,
      "learning_rate": 0.002283422459893048,
      "loss": 0.2967,
      "step": 1020
    },
    {
      "epoch": 5.51,
      "grad_norm": 0.6774911284446716,
      "learning_rate": 0.00225668449197861,
      "loss": 0.2735,
      "step": 1030
    },
    {
      "epoch": 5.56,
      "grad_norm": 0.9686025977134705,
      "learning_rate": 0.002229946524064171,
      "loss": 0.2415,
      "step": 1040
    },
    {
      "epoch": 5.61,
      "grad_norm": 1.3379433155059814,
      "learning_rate": 0.0022032085561497325,
      "loss": 0.2656,
      "step": 1050
    },
    {
      "epoch": 5.67,
      "grad_norm": 0.6908765435218811,
      "learning_rate": 0.002176470588235294,
      "loss": 0.2532,
      "step": 1060
    },
    {
      "epoch": 5.72,
      "grad_norm": 0.8308853507041931,
      "learning_rate": 0.0021497326203208556,
      "loss": 0.2428,
      "step": 1070
    },
    {
      "epoch": 5.78,
      "grad_norm": 1.2064207792282104,
      "learning_rate": 0.002122994652406417,
      "loss": 0.2989,
      "step": 1080
    },
    {
      "epoch": 5.83,
      "grad_norm": 0.8376064896583557,
      "learning_rate": 0.0020962566844919786,
      "loss": 0.2061,
      "step": 1090
    },
    {
      "epoch": 5.88,
      "grad_norm": 0.9363247156143188,
      "learning_rate": 0.00206951871657754,
      "loss": 0.2447,
      "step": 1100
    },
    {
      "epoch": 5.94,
      "grad_norm": 7.874444007873535,
      "learning_rate": 0.0020427807486631017,
      "loss": 0.2254,
      "step": 1110
    },
    {
      "epoch": 5.99,
      "grad_norm": 0.9535788297653198,
      "learning_rate": 0.002016042780748663,
      "loss": 0.2686,
      "step": 1120
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.9485981308411215,
      "eval_f1": 0.9488981890553403,
      "eval_loss": 0.16793404519557953,
      "eval_precision": 0.9560571498851578,
      "eval_recall": 0.9437216744429628,
      "eval_runtime": 9.2543,
      "eval_samples_per_second": 184.995,
      "eval_steps_per_second": 11.562,
      "step": 1122
    },
    {
      "epoch": 6.04,
      "grad_norm": 0.9278040528297424,
      "learning_rate": 0.0019893048128342247,
      "loss": 0.256,
      "step": 1130
    },
    {
      "epoch": 6.1,
      "grad_norm": 1.0177885293960571,
      "learning_rate": 0.0019625668449197863,
      "loss": 0.2173,
      "step": 1140
    },
    {
      "epoch": 6.15,
      "grad_norm": 0.5898217558860779,
      "learning_rate": 0.0019358288770053476,
      "loss": 0.2257,
      "step": 1150
    },
    {
      "epoch": 6.2,
      "grad_norm": 5.235673904418945,
      "learning_rate": 0.0019090909090909091,
      "loss": 0.2388,
      "step": 1160
    },
    {
      "epoch": 6.26,
      "grad_norm": 1.1271004676818848,
      "learning_rate": 0.0018823529411764706,
      "loss": 0.2544,
      "step": 1170
    },
    {
      "epoch": 6.31,
      "grad_norm": 0.6136900186538696,
      "learning_rate": 0.001855614973262032,
      "loss": 0.2785,
      "step": 1180
    },
    {
      "epoch": 6.36,
      "grad_norm": 0.9343350529670715,
      "learning_rate": 0.0018288770053475937,
      "loss": 0.2304,
      "step": 1190
    },
    {
      "epoch": 6.42,
      "grad_norm": 0.7129714488983154,
      "learning_rate": 0.001802139037433155,
      "loss": 0.1709,
      "step": 1200
    },
    {
      "epoch": 6.47,
      "grad_norm": 0.8645954132080078,
      "learning_rate": 0.0017754010695187168,
      "loss": 0.2099,
      "step": 1210
    },
    {
      "epoch": 6.52,
      "grad_norm": 0.4692780375480652,
      "learning_rate": 0.001748663101604278,
      "loss": 0.1801,
      "step": 1220
    },
    {
      "epoch": 6.58,
      "grad_norm": 1.1131465435028076,
      "learning_rate": 0.0017219251336898396,
      "loss": 0.2187,
      "step": 1230
    },
    {
      "epoch": 6.63,
      "grad_norm": 1.0496641397476196,
      "learning_rate": 0.0016951871657754011,
      "loss": 0.2381,
      "step": 1240
    },
    {
      "epoch": 6.68,
      "grad_norm": 0.7512268424034119,
      "learning_rate": 0.0016684491978609627,
      "loss": 0.2171,
      "step": 1250
    },
    {
      "epoch": 6.74,
      "grad_norm": 0.9206662774085999,
      "learning_rate": 0.0016417112299465242,
      "loss": 0.1716,
      "step": 1260
    },
    {
      "epoch": 6.79,
      "grad_norm": 1.044285535812378,
      "learning_rate": 0.0016149732620320857,
      "loss": 0.1996,
      "step": 1270
    },
    {
      "epoch": 6.84,
      "grad_norm": 1.5523549318313599,
      "learning_rate": 0.001588235294117647,
      "loss": 0.198,
      "step": 1280
    },
    {
      "epoch": 6.9,
      "grad_norm": 0.7654513120651245,
      "learning_rate": 0.0015614973262032088,
      "loss": 0.2341,
      "step": 1290
    },
    {
      "epoch": 6.95,
      "grad_norm": 1.145663857460022,
      "learning_rate": 0.00153475935828877,
      "loss": 0.2556,
      "step": 1300
    },
    {
      "epoch": 7.0,
      "eval_accuracy": 0.9661214953271028,
      "eval_f1": 0.9619479557860847,
      "eval_loss": 0.09340371936559677,
      "eval_precision": 0.9651383824240083,
      "eval_recall": 0.9598949442531882,
      "eval_runtime": 9.0216,
      "eval_samples_per_second": 189.767,
      "eval_steps_per_second": 11.86,
      "step": 1309
    },
    {
      "epoch": 7.01,
      "grad_norm": 0.8554219603538513,
      "learning_rate": 0.0015080213903743314,
      "loss": 0.237,
      "step": 1310
    },
    {
      "epoch": 7.06,
      "grad_norm": 0.7055748701095581,
      "learning_rate": 0.0014812834224598931,
      "loss": 0.2317,
      "step": 1320
    },
    {
      "epoch": 7.11,
      "grad_norm": 1.0891897678375244,
      "learning_rate": 0.0014545454545454545,
      "loss": 0.1723,
      "step": 1330
    },
    {
      "epoch": 7.17,
      "grad_norm": 0.5554465651512146,
      "learning_rate": 0.0014278074866310162,
      "loss": 0.1986,
      "step": 1340
    },
    {
      "epoch": 7.22,
      "grad_norm": 1.0232211351394653,
      "learning_rate": 0.0014010695187165775,
      "loss": 0.2222,
      "step": 1350
    },
    {
      "epoch": 7.27,
      "grad_norm": 0.6204003095626831,
      "learning_rate": 0.001374331550802139,
      "loss": 0.1827,
      "step": 1360
    },
    {
      "epoch": 7.33,
      "grad_norm": 0.7353977560997009,
      "learning_rate": 0.0013475935828877006,
      "loss": 0.1649,
      "step": 1370
    },
    {
      "epoch": 7.38,
      "grad_norm": 0.734186053276062,
      "learning_rate": 0.001320855614973262,
      "loss": 0.194,
      "step": 1380
    },
    {
      "epoch": 7.43,
      "grad_norm": 0.47959616780281067,
      "learning_rate": 0.0012941176470588236,
      "loss": 0.1763,
      "step": 1390
    },
    {
      "epoch": 7.49,
      "grad_norm": 0.6939826607704163,
      "learning_rate": 0.0012673796791443852,
      "loss": 0.2286,
      "step": 1400
    },
    {
      "epoch": 7.54,
      "grad_norm": 0.948558509349823,
      "learning_rate": 0.0012406417112299467,
      "loss": 0.2506,
      "step": 1410
    },
    {
      "epoch": 7.59,
      "grad_norm": 0.8466843962669373,
      "learning_rate": 0.001213903743315508,
      "loss": 0.2175,
      "step": 1420
    },
    {
      "epoch": 7.65,
      "grad_norm": 0.6146303415298462,
      "learning_rate": 0.0011871657754010695,
      "loss": 0.1641,
      "step": 1430
    },
    {
      "epoch": 7.7,
      "grad_norm": 0.8321207761764526,
      "learning_rate": 0.001160427807486631,
      "loss": 0.1903,
      "step": 1440
    },
    {
      "epoch": 7.75,
      "grad_norm": 0.7309682965278625,
      "learning_rate": 0.0011336898395721926,
      "loss": 0.1981,
      "step": 1450
    },
    {
      "epoch": 7.81,
      "grad_norm": 0.5901007652282715,
      "learning_rate": 0.0011069518716577541,
      "loss": 0.2011,
      "step": 1460
    },
    {
      "epoch": 7.86,
      "grad_norm": 0.9141890406608582,
      "learning_rate": 0.0010802139037433154,
      "loss": 0.2735,
      "step": 1470
    },
    {
      "epoch": 7.91,
      "grad_norm": 0.813578724861145,
      "learning_rate": 0.001053475935828877,
      "loss": 0.2093,
      "step": 1480
    },
    {
      "epoch": 7.97,
      "grad_norm": 0.4584049582481384,
      "learning_rate": 0.0010267379679144385,
      "loss": 0.1777,
      "step": 1490
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.969626168224299,
      "eval_f1": 0.9686486797969157,
      "eval_loss": 0.08350867033004761,
      "eval_precision": 0.9696703038283683,
      "eval_recall": 0.9682591946397131,
      "eval_runtime": 9.2254,
      "eval_samples_per_second": 185.574,
      "eval_steps_per_second": 11.598,
      "step": 1496
    },
    {
      "epoch": 8.02,
      "grad_norm": 0.7080217599868774,
      "learning_rate": 0.001,
      "loss": 0.1999,
      "step": 1500
    },
    {
      "epoch": 8.07,
      "grad_norm": 0.9281997084617615,
      "learning_rate": 0.0009732620320855614,
      "loss": 0.1688,
      "step": 1510
    },
    {
      "epoch": 8.13,
      "grad_norm": 0.8174493312835693,
      "learning_rate": 0.000946524064171123,
      "loss": 0.1731,
      "step": 1520
    },
    {
      "epoch": 8.18,
      "grad_norm": 0.6349031925201416,
      "learning_rate": 0.0009197860962566845,
      "loss": 0.1672,
      "step": 1530
    },
    {
      "epoch": 8.24,
      "grad_norm": 0.8174115419387817,
      "learning_rate": 0.000893048128342246,
      "loss": 0.1839,
      "step": 1540
    },
    {
      "epoch": 8.29,
      "grad_norm": 0.6900407671928406,
      "learning_rate": 0.0008663101604278075,
      "loss": 0.2044,
      "step": 1550
    },
    {
      "epoch": 8.34,
      "grad_norm": 0.2948859930038452,
      "learning_rate": 0.000839572192513369,
      "loss": 0.1328,
      "step": 1560
    },
    {
      "epoch": 8.4,
      "grad_norm": 0.7020041942596436,
      "learning_rate": 0.0008128342245989305,
      "loss": 0.1759,
      "step": 1570
    },
    {
      "epoch": 8.45,
      "grad_norm": 1.0418401956558228,
      "learning_rate": 0.000786096256684492,
      "loss": 0.1777,
      "step": 1580
    },
    {
      "epoch": 8.5,
      "grad_norm": 0.7473070025444031,
      "learning_rate": 0.0007593582887700536,
      "loss": 0.1631,
      "step": 1590
    },
    {
      "epoch": 8.56,
      "grad_norm": 0.8006024360656738,
      "learning_rate": 0.000732620320855615,
      "loss": 0.1566,
      "step": 1600
    },
    {
      "epoch": 8.61,
      "grad_norm": 1.0594407320022583,
      "learning_rate": 0.0007058823529411765,
      "loss": 0.184,
      "step": 1610
    },
    {
      "epoch": 8.66,
      "grad_norm": 0.6014285087585449,
      "learning_rate": 0.000679144385026738,
      "loss": 0.1583,
      "step": 1620
    },
    {
      "epoch": 8.72,
      "grad_norm": 0.6736869812011719,
      "learning_rate": 0.0006524064171122996,
      "loss": 0.1468,
      "step": 1630
    },
    {
      "epoch": 8.77,
      "grad_norm": 0.6957813501358032,
      "learning_rate": 0.0006256684491978609,
      "loss": 0.1731,
      "step": 1640
    },
    {
      "epoch": 8.82,
      "grad_norm": 0.5073075294494629,
      "learning_rate": 0.0005989304812834224,
      "loss": 0.176,
      "step": 1650
    },
    {
      "epoch": 8.88,
      "grad_norm": 0.5485414862632751,
      "learning_rate": 0.000572192513368984,
      "loss": 0.1936,
      "step": 1660
    },
    {
      "epoch": 8.93,
      "grad_norm": 0.8590062856674194,
      "learning_rate": 0.0005454545454545455,
      "loss": 0.1795,
      "step": 1670
    },
    {
      "epoch": 8.98,
      "grad_norm": 0.49274083971977234,
      "learning_rate": 0.000518716577540107,
      "loss": 0.1607,
      "step": 1680
    },
    {
      "epoch": 9.0,
      "eval_accuracy": 0.9772196261682243,
      "eval_f1": 0.9758896890562156,
      "eval_loss": 0.07392112910747528,
      "eval_precision": 0.9732910812266744,
      "eval_recall": 0.97920005624388,
      "eval_runtime": 9.2433,
      "eval_samples_per_second": 185.214,
      "eval_steps_per_second": 11.576,
      "step": 1683
    },
    {
      "epoch": 9.04,
      "grad_norm": 0.4997323751449585,
      "learning_rate": 0.0004919786096256684,
      "loss": 0.1352,
      "step": 1690
    },
    {
      "epoch": 9.09,
      "grad_norm": 0.5221167206764221,
      "learning_rate": 0.00046524064171122996,
      "loss": 0.1597,
      "step": 1700
    },
    {
      "epoch": 9.14,
      "grad_norm": 0.6731162071228027,
      "learning_rate": 0.0004385026737967915,
      "loss": 0.1639,
      "step": 1710
    },
    {
      "epoch": 9.2,
      "grad_norm": 0.5156794786453247,
      "learning_rate": 0.00041176470588235296,
      "loss": 0.1667,
      "step": 1720
    },
    {
      "epoch": 9.25,
      "grad_norm": 0.767203152179718,
      "learning_rate": 0.0003850267379679145,
      "loss": 0.1672,
      "step": 1730
    },
    {
      "epoch": 9.3,
      "grad_norm": 0.5664710402488708,
      "learning_rate": 0.0003582887700534759,
      "loss": 0.1428,
      "step": 1740
    },
    {
      "epoch": 9.36,
      "grad_norm": 0.37641459703445435,
      "learning_rate": 0.00033155080213903744,
      "loss": 0.1667,
      "step": 1750
    },
    {
      "epoch": 9.41,
      "grad_norm": 0.5527117252349854,
      "learning_rate": 0.0003048128342245989,
      "loss": 0.1723,
      "step": 1760
    },
    {
      "epoch": 9.47,
      "grad_norm": 0.8746387958526611,
      "learning_rate": 0.00027807486631016044,
      "loss": 0.1596,
      "step": 1770
    },
    {
      "epoch": 9.52,
      "grad_norm": 0.5461722612380981,
      "learning_rate": 0.0002513368983957219,
      "loss": 0.17,
      "step": 1780
    },
    {
      "epoch": 9.57,
      "grad_norm": 0.5201784372329712,
      "learning_rate": 0.00022459893048128345,
      "loss": 0.1268,
      "step": 1790
    },
    {
      "epoch": 9.63,
      "grad_norm": 0.44921737909317017,
      "learning_rate": 0.00019786096256684492,
      "loss": 0.1537,
      "step": 1800
    },
    {
      "epoch": 9.68,
      "grad_norm": 0.6538177728652954,
      "learning_rate": 0.00017112299465240642,
      "loss": 0.1564,
      "step": 1810
    },
    {
      "epoch": 9.73,
      "grad_norm": 0.39654332399368286,
      "learning_rate": 0.00014438502673796793,
      "loss": 0.1196,
      "step": 1820
    },
    {
      "epoch": 9.79,
      "grad_norm": 0.5751528143882751,
      "learning_rate": 0.00011764705882352942,
      "loss": 0.1953,
      "step": 1830
    },
    {
      "epoch": 9.84,
      "grad_norm": 0.7018762826919556,
      "learning_rate": 9.09090909090909e-05,
      "loss": 0.1414,
      "step": 1840
    },
    {
      "epoch": 9.89,
      "grad_norm": 0.8955555558204651,
      "learning_rate": 6.41711229946524e-05,
      "loss": 0.1415,
      "step": 1850
    },
    {
      "epoch": 9.95,
      "grad_norm": 0.29650095105171204,
      "learning_rate": 3.74331550802139e-05,
      "loss": 0.1361,
      "step": 1860
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.6939311623573303,
      "learning_rate": 1.0695187165775402e-05,
      "loss": 0.1898,
      "step": 1870
    },
    {
      "epoch": 10.0,
      "eval_accuracy": 0.9789719626168224,
      "eval_f1": 0.9786328578443323,
      "eval_loss": 0.06271301954984665,
      "eval_precision": 0.9764445771965571,
      "eval_recall": 0.9811556249771411,
      "eval_runtime": 8.9863,
      "eval_samples_per_second": 190.512,
      "eval_steps_per_second": 11.907,
      "step": 1870
    },
    {
      "epoch": 10.0,
      "step": 1870,
      "total_flos": 9.332136680499118e+18,
      "train_loss": 0.29726991015959553,
      "train_runtime": 1395.7704,
      "train_samples_per_second": 85.68,
      "train_steps_per_second": 1.34
    }
  ],
  "logging_steps": 10,
  "max_steps": 1870,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "total_flos": 9.332136680499118e+18,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}